This commit is contained in:
Tilman Kamp 2019-07-30 17:58:55 +02:00
Родитель 8ee9b6f835
Коммит 0e627adc9d
2 изменённых файла: 156 добавлений и 83 удалений

Просмотреть файл

@@ -28,6 +28,10 @@ def main(args):
help='Play audio fragments as they are matched using SoX audio tool')
parser.add_argument('--text-context', type=int, required=False, default=10,
help='Size of textual context for logged statements - default: 10')
parser.add_argument('--start', type=int, required=False, default=0,
help='Start alignment process at given offset of transcribed fragments')
parser.add_argument('--num-samples', type=int, required=False,
help='Number of fragments to align')
audio_group = parser.add_argument_group(title='Audio pre-processing options')
audio_group.add_argument('--audio-vad-aggressiveness', type=int, choices=range(4), required=False,
@@ -40,6 +44,10 @@ def main(args):
stt_group.add_argument('--stt-no-own-lm', action="store_true",
help='Deactivates creation of individual language models per document.' +
'Uses the one from model dir instead.')
stt_group.add_argument('--stt-min-duration', type=int, required=False, default=4,
help='Minimum speech fragment duration in milliseconds to translate (default: 100)')
stt_group.add_argument('--stt-max-duration', type=int, required=False,
help='Maximum speech fragment duration in milliseconds to translate (default: no limit)')
text_group = parser.add_argument_group(title='Text pre-processing options')
text_group.add_argument('--text-keep-dashes', action="store_true",
@@ -54,13 +62,22 @@ def main(args):
help='How many global 3gram match candidates are tested at max (default: 10)')
align_group.add_argument('--align-candidate-threshold', type=float, required=False, default=0.92,
help='Factor for how many 3grams the next candidate should have at least ' +
'compared to its predecessor (default: 0.8)')
align_group.add_argument('--align-no-snap-to-token', action="store_true",
help='Deactivates snapping to similar neighbour tokens ' +
'at the beginning and end of each phrase')
align_group.add_argument('--align-stretch-fraction', type=float, required=False, default=1/3,
help='Fraction of its original length that a phrase could get expanded or shrunken ' +
'to match the original text (default: 0.33)')
'compared to its predecessor (default: 0.92)')
align_group.add_argument('--align-match-score', type=int, required=False, default=100,
help='Matching score for Smith-Waterman alignment (default: 100)')
align_group.add_argument('--align-mismatch-score', type=int, required=False, default=-100,
help='Mismatch score for Smith-Waterman alignment (default: -100)')
align_group.add_argument('--align-gap-score', type=int, required=False, default=-100,
help='Gap score for Smith-Waterman alignment (default: -100)')
align_group.add_argument('--align-no-snap', action="store_true",
help='Deactivates snapping to word boundaries at the beginning and end of each phrase')
align_group.add_argument('--align-snap-radius', type=int, required=False, default=0,
help='How many words to look up to the left and right for snapping to word ' +
'boundaries at the beginning and end of each phrase')
align_group.add_argument('--align-min-length', type=int, required=False, default=4,
help='Minimum STT phrase length to align (default: 4)')
align_group.add_argument('--align-max-length', type=int, required=False,
help='Maximum STT phrase length to align (default: no limit)')
output_group = parser.add_argument_group(title='Output options')
output_group.add_argument('--output-stt', action="store_true",
@@ -69,10 +86,18 @@ def main(args):
help='Writes clean aligned original transcripts to result file')
output_group.add_argument('--output-aligned-raw', action="store_true",
help='Writes raw aligned original transcripts to result file')
output_group.add_argument('--output-min-length', type=int, required=False,
help='Minimum phrase length (default: no limit)')
output_group.add_argument('--output-wer', action="store_true",
help='Writes word error rate (WER) to output')
output_group.add_argument('--output-cer', action="store_true",
help='Writes character error rate (CER) to output')
output_group.add_argument('--output-min-length', type=int, required=False, default=1,
help='Minimum phrase length (default: 1)')
output_group.add_argument('--output-max-length', type=int, required=False,
help='Maximum phrase length (default: no limit)')
output_group.add_argument('--output-min-score', type=float, required=False, default=2.0,
help='Minimum matching score (default: 2.0)')
output_group.add_argument('--output-max-score', type=float, required=False,
help='Maximum matching score (default: no limit)')
for b in ['Min', 'Max']:
for r in ['CER', 'WER']:
output_group.add_argument('--output-' + b.lower() + '-' + r.lower(), type=float, required=False,
@@ -167,8 +192,14 @@ def main(args):
for i, segment in enumerate(segments):
# Run DeepSpeech on the chunk that just completed VAD
segment_buffer, time_start, time_end = segment
time_length = time_end - time_start
if args.stt_min_duration and time_length < args.stt_min_duration:
skip('Audio too short for STT', index)
continue
if args.stt_max_duration and time_length > args.stt_max_duration:
skip('Audio too long for STT', index)
continue
logging.debug("Transcribing segment %002d (from %f to %f)..." % (i, time_start / 1000.0, time_end / 1000.0))
audio = np.frombuffer(segment_buffer, dtype=np.int16)
segment_transcript, segment_inference_time = wavTranscriber.stt(model, audio, sample_rate)
segment_transcript = ' '.join(segment_transcript.split())
@@ -178,7 +209,7 @@ def main(args):
continue
fragments.append({
'time-start': time_start,
'time-length': time_end-time_start,
'time-length': time_length,
'transcript': segment_transcript
})
offset += len(segment_transcript)
@@ -190,68 +221,112 @@ def main(args):
search = FuzzySearch(tc.clean_text,
max_candidates=args.align_max_candidates,
candidate_threshold=args.align_candidate_threshold,
snap_token=not args.align_no_snap_to_token,
stretch_factor=args.align_stretch_fraction)
start = 0
snap_to_word=not args.align_no_snap,
snap_radius=not args.align_snap_radius,
match_score=args.align_match_score,
mismatch_score=args.align_mismatch_score,
gap_score=args.align_gap_score)
result_fragments = []
substitutions = Counter()
statistics = Counter()
for fragment in fragments:
def skip(message, index):
logging.info('Fragment %d: %s' % (index, message))
statistics[message] += 1
end_fragments = (args.start + args.num_samples) if args.num_samples else len(fragments)
fragments = fragments[args.start:end_fragments]
for index, fragment in enumerate(fragments):
time_start = fragment['time-start']
time_length = fragment['time-length']
fragment_transcript = fragment['transcript']
if args.align_min_length and len(fragment_transcript) < args.align_min_length:
skip('Transcript too short for alignment', index)
continue
if args.align_max_length and len(fragment_transcript) > args.align_max_length:
skip('Transcript too long for alignment', index)
continue
match, match_distance, match_substitutions = search.find_best(fragment_transcript)
if match is not None:
substitutions += match_substitutions
fragment_matched = tc.clean_text[match.start:match.end]
score = match_distance/len(fragment_matched)
if match is None:
skip('No match for transcript', index)
continue
substitutions += match_substitutions
fragment_matched = tc.clean_text[match.start:match.end]
if args.output_min_length and len(fragment_matched) < args.output_min_length:
skip('Match too short', index)
continue
if args.output_max_length and len(fragment_matched) > args.output_max_length:
skip('Match too long', index)
continue
score = match_distance/max(len(fragment_matched), len(fragment_transcript))
sample_numbers = ['Score %.2f' % score]
if args.output_min_score and score < args.output_min_score:
skip('Matching score too low', index)
continue
if args.output_max_score and score > args.output_max_score:
skip('Matching score too high', index)
continue
original_start = tc.get_original_offset(match.start)
original_end = tc.get_original_offset(match.end)
result_fragment = {
'time-start': time_start,
'time-length': time_length,
'text-start': original_start,
'text-length': original_end-original_start,
'score': score
}
if args.output_cer or args.output_min_cer or args.output_max_cer:
cer = levenshtein(fragment_transcript, fragment_matched)/len(fragment_matched)
sample_numbers.insert(0, 'CER: %.2f' % cer * 100)
if args.output_cer:
result_fragment['cer'] = cer
if args.output_min_cer and score < args.output_min_cer:
skip('Character error rate (CER) too low', index)
continue
if args.output_max_cer and score > args.output_max_cer:
skip('Character error rate (CER) too high', index)
continue
if args.output_wer or args.output_min_wer or args.output_max_wer:
wer = levenshtein(fragment_transcript.split(), fragment_matched.split())/len(fragment_matched.split())
if (args.output_min_cer and cer * 100.0 < args.output_min_cer) or \
(args.output_max_cer and cer * 100.0 > args.output_max_cer) or \
(args.output_min_wer and wer * 100.0 < args.output_min_wer) or \
(args.output_max_wer and wer * 100.0 > args.output_max_wer) or \
(args.output_min_length and len(fragment_matched) < args.output_min_length) or \
(args.output_max_length and len(fragment_matched) > args.output_max_length):
continue
original_start = tc.get_original_offset(match.start)
original_end = tc.get_original_offset(match.end)
result_fragment = {
'time-start': time_start,
'time-length': time_length,
'text-start': original_start,
'text-length': original_end-original_start,
'score': score,
'cer': cer,
'wer': wer
}
if args.output_stt:
result_fragment['stt'] = fragment_transcript
if args.output_aligned:
result_fragment['aligned'] = fragment_matched
if args.output_aligned_raw:
result_fragment['aligned-raw'] = original_transcript[original_start:original_end]
result_fragments.append(result_fragment)
logging.debug('Sample with WER %.2f CER %.2f Score %f' % (wer * 100, cer * 100, score))
logging.debug('- T: ' + args.text_context * ' ' + '%s' % fragment_transcript)
logging.debug('- O: %s|%s|%s' % (
tc.clean_text[match.start-args.text_context:match.start],
fragment_matched,
tc.clean_text[match.end:match.end+args.text_context]))
start = match.end
if args.play:
subprocess.check_call(['play',
'--no-show-progress',
args.audio,
'trim',
str(time_start/1000.0),
'='+str((time_start + time_length)/1000.0)])
sample_numbers.insert(0, 'WER: %.2f' % wer * 100)
if args.output_wer:
result_fragment['wer'] = wer
if args.output_min_wer and score < args.output_min_wer:
skip('Word error rate (WER) too low', index)
continue
if args.output_max_wer and score > args.output_max_wer:
skip('Word error rate (WER) too high', index)
continue
if args.output_stt:
result_fragment['stt'] = fragment_transcript
if args.output_aligned:
result_fragment['aligned'] = fragment_matched
if args.output_aligned_raw:
result_fragment['aligned-raw'] = original_transcript[original_start:original_end]
result_fragments.append(result_fragment)
logging.debug('Fragment %d aligned with %s' % (index, ' '.join(sample_numbers)))
logging.debug('- T: ' + args.text_context * ' ' + '"%s"' % fragment_transcript)
logging.debug('- O: %s|%s|%s' % (
tc.clean_text[match.start-args.text_context:match.start],
fragment_matched,
tc.clean_text[match.end:match.end+args.text_context]))
start = match.end
if args.play:
subprocess.check_call(['play',
'--no-show-progress',
args.audio,
'trim',
str(time_start / 1000.0),
'='+str((time_start + time_length) / 1000.0)])
with open(args.result, 'w') as result_file:
result_file.write(json.dumps(result_fragments))
logging.info('Aligned %d fragments.' % len(result_fragments))
skipped = len(fragments)-len(result_fragments)
logging.info('Skipped %d fragments (%.2f%%).' % (skipped, skipped*100.0/len(fragments)))
logging.info('Aligned %d fragments' % len(result_fragments))
skipped = len(fragments) - len(result_fragments)
logging.info('Skipped %d fragments (%.2f%%):' % (skipped, skipped * 100.0 / len(fragments)))
for key, number in statistics.most_common():
logging.info(' - %s: %d' % (key, number))
print(substitutions)
if __name__ == '__main__':

Просмотреть файл

@@ -1,29 +1,27 @@
from collections import Counter
from nltk import ngrams
from text import levenshtein, TextRange
from utils import circulate, by_len
import random
from text import TextRange
class FuzzySearch(object):
def __init__(self,
text,
max_candidates=10,
candidate_threshold=0.92,
snap_token=True,
stretch_factor=1/3,
match_score=3,
mismatch_score=0,
delete_score=-4,
insert_score=-4,
snap_to_word=True,
snap_radius=0,
match_score=100,
mismatch_score=-100,
gap_score=-100,
similarities=None):
self.text = text
self.max_candidates = max_candidates
self.candidate_threshold = candidate_threshold
self.snap_token = snap_token
self.snap_to_word = snap_to_word
self.snap_radius = snap_radius
self.match_score = match_score
self.mismatch_score = mismatch_score
self.delete_score = delete_score
self.insert_score = insert_score
self.gap_score = gap_score
self.similarities = similarities
self.ngrams = {}
for i, ngram in enumerate(ngrams(' ' + text + ' ', 3)):
@@ -47,38 +45,38 @@ class FuzzySearch(object):
def sw_align(self, a, b):
n, m = len(a), len(b)
f = [[]] * (n + 1)
# building scoring matrix
f = [[0]] * (n + 1)
for i in range(0, n + 1):
f[i] = [0] * (m + 1)
for i in range(1, n + 1):
f[i][0] = self.insert_score * i
f[i][0] = self.gap_score * i
for j in range(1, m + 1):
f[0][j] = self.delete_score * j
f[0][j] = self.gap_score * j
max_score = 0
start_i, start_j = 0, 0
for i in range(1, n + 1):
for j in range(1, m + 1):
match = f[i - 1][j - 1] + self.similarity(a[i - 1], b[j - 1])
insert = f[i][j - 1] + self.insert_score
delete = f[i - 1][j] + self.delete_score
insert = f[i][j - 1] + self.gap_score
delete = f[i - 1][j] + self.gap_score
score = max(0, match, insert, delete)
f[i][j] = score
if score > max_score:
max_score = score
start_i, start_j = i, j
# backtracking
substitutions = Counter()
i, j = start_i, start_j
while (j > 0 or i > 0) and f[i][j] != 0:
ca, cb = a[i - 1] if i > 0 else ' ', b[j - 1] if j > 0 else ' '
s = self.similarity(ca, cb)
if i > 0 and j > 0 and f[i][j] == (f[i - 1][j - 1] + s):
if ca != cb:
substitutions[FuzzySearch.similarity_key(ca, cb)] += 1
substitutions[FuzzySearch.similarity_key(ca, cb)] += 1
i, j = i - 1, j - 1
elif i > 0 and f[i][j] == (f[i - 1][j] + self.delete_score):
elif i > 0 and f[i][j] == (f[i - 1][j] + self.gap_score):
i -= 1
elif j > 0 and f[i][j] == (f[i][j - 1] + self.insert_score):
elif j > 0 and f[i][j] == (f[i][j - 1] + self.gap_score):
j -= 1
else:
raise Exception('Smith–Waterman failure')