diff --git a/align/align.py b/align/align.py
index 1b4da78..319808f 100644
--- a/align/align.py
+++ b/align/align.py
@@ -4,6 +4,7 @@ import text
 import json
 import logging
 import argparse
+import subprocess
 import os.path as path
 import numpy as np
 import wavTranscriber
@@ -23,6 +24,8 @@ def main(args):
                         help='Path to directory that contains all model files (output_graph, lm, trie and alphabet)')
     parser.add_argument('--loglevel', type=int, required=False,
                         help='Log level (between 0 and 50) - default: 20')
+    parser.add_argument('--play', action="store_true",
+                        help='Plays audio fragments as they are matched using SoX audio tool')
     args = parser.parse_args()
 
     # Debug helpers
@@ -68,8 +71,7 @@ def main(args):
             fragments.append({
                 'time_start': time_start,
                 'time_end': time_end,
-                'transcript': segment_transcript,
-                'offset': offset
+                'transcript': segment_transcript
             })
             offset += len(segment_transcript)
 
@@ -80,18 +82,24 @@ def main(args):
     logging.debug("Loading original transcript from %s..." % args.transcript)
     with open(args.transcript, 'r') as transcript_file:
         original_transcript = transcript_file.read()
-    original_transcript = ' '.join(original_transcript.lower().split())
-    original_transcript = alphabet.filter(original_transcript)
-    ls = text.LevenshteinSearch(original_transcript)
+    tc = text.TextCleaner(original_transcript, alphabet)
+    ls = text.LevenshteinSearch(tc.clean_text)
     start = 0
     for fragment in fragments:
-        logging.debug('STT Transcribed: %s' % fragment['transcript'])
-        match_distance, match_offset, match_len = ls.find_best(fragment['transcript'])
-        if match_offset >= 0:
-            fragment['original'] = original_transcript[match_offset:match_offset+match_len]
-            logging.debug(' Original: %s' % fragment['original'])
+        fragment_transcript = fragment['transcript']
+        match_distance, match_offset, match_len = ls.find_best(fragment_transcript)
+        if match_offset >= 0 and match_distance < 0.2 * len(fragment_transcript):
+            logging.debug('transcribed: %s' % fragment['transcript'])
+            original_start = tc.get_original_offset(match_offset)
+            original_end = tc.get_original_offset(match_offset+match_len)
+            fragment['offset'] = original_start
+            fragment['length'] = original_end-original_start
+            logging.debug(' original: %s' % ' '.join(original_transcript[original_start:original_end].split()))
             start = match_offset+match_len
-
+        if args.play:
+            subprocess.check_call(['play', args.audio, 'trim', str(fragment['time_start']/1000.0), '='+str(fragment['time_end']/1000.0)])
+    with open(args.result, 'w') as result_file:
+        result_file.write(json.dumps(fragments))
 
 if __name__ == '__main__':
     main(sys.argv[1:])
diff --git a/align/text.py b/align/text.py
index affb1b5..5e42543 100644
--- a/align/text.py
+++ b/align/text.py
@@ -1,5 +1,6 @@
 from __future__ import absolute_import, division, print_function
 
+import math
 import codecs
 import logging
 from nltk import ngrams
@@ -28,13 +29,6 @@ class Alphabet(object):
     def has_label(self, string):
         return string in self._str_to_label
 
-    def filter(self, string):
-        new_string = ''
-        for c in string:
-            if self.has_label(c):
-                new_string += c
-        return new_string
-
     def label_from_string(self, string):
         try:
             return self._str_to_label[string]
@@ -59,15 +53,32 @@ class TextCleaner:
 
     def __init__(self, original_text, alphabet, to_lower=True, normalize_space=True):
         self.original_text = original_text
-        clean_text = original_text
-        if to_lower:
-            clean_text = clean_text.lower()
-        if normalize_space:
-            clean_text = ' '.join(clean_text.split())
-        self.clean_text = alphabet.filter(clean_text)
+        prepared_text = original_text.lower() if to_lower else original_text
+        cleaned = []
+        self.positions = []
+        ws = False
+        for position, c in enumerate(prepared_text):
+            if not alphabet.has_label(c):
+                continue
+            if normalize_space and c.isspace():
+                if ws:
+                    continue
+                else:
+                    ws = True
+                    c = ' '
+            else:
+                ws = False
+            cleaned.append(c)
+            self.positions.append(position)
+        self.clean_text = ''.join(cleaned)
 
     def get_original_offset(self, clean_offset):
-        return clean_offset
+        if clean_offset == len(self.positions):
+            return self.positions[-1]+1
+        try:
+            return self.positions[clean_offset]
+        except:
+            print(len(self.positions), clean_offset)
 
 
 class LevenshteinSearch:
@@ -95,18 +106,20 @@ class LevenshteinSearch:
                 windows[window] = (windows[window] + 1) if window in windows else 1
         candidate_windows = sorted(windows.keys(), key=lambda w: windows[w], reverse=True)
         found_best = False
-        best_distance = -1
-        best_offset = -1
-        best_len = -1
-        for window in candidate_windows[0:4]:
-            for offset in range(int((window-0.5)*window_size), int((window+0.5)*window_size)):
-                distance = levenshtein(self.text[offset:offset + len(look_for)], look_for)
-                if not found_best or distance < best_distance:
+        best = (-1, -1, -1)
+        best_virtual_distance = -1
+        for window in candidate_windows[:4]:
+            start_offset = max(start, int((window-0.5)*window_size))
+            stop_offset = min(stop-len(look_for), int((window+0.5)*window_size))
+            for offset in range(start_offset, stop_offset):
+                distance = levenshtein(self.text[offset:offset+len(look_for)], look_for)
+                virtual_distance = distance*(1+math.sqrt(offset-start)/100)
+                # print(virtual_distance)
+                if not found_best or virtual_distance < best_virtual_distance:
                     found_best = True
-                    best_distance = distance
-                    best_offset = offset
-                    best_len = len(look_for)
-        return best_distance, best_offset, best_len
+                    best_virtual_distance = virtual_distance
+                    best = (distance, offset, len(look_for))
+        return best
 
 
 # The following code is from: http://hetland.org/coding/python/levenshtein.py
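
Note on the offset mapping introduced above: the rewritten TextCleaner records, for every character it keeps, that character's index in the raw transcript, so a match offset found in the cleaned text can be translated back into a span of the original file via get_original_offset. Below is a minimal standalone sketch of the same idea; the names clean_with_positions, to_original_span and ALLOWED are illustrative stand-ins, not part of the patch.

# Sketch only: mimics TextCleaner's position bookkeeping with a toy alphabet.
ALLOWED = set("abcdefghijklmnopqrstuvwxyz ")

def clean_with_positions(original):
    # Lower-case, drop characters outside the alphabet, collapse whitespace runs,
    # and remember the original index of every character that is kept.
    cleaned, positions = [], []
    ws = False
    for i, c in enumerate(original.lower()):
        if c not in ALLOWED:
            continue
        if c.isspace():
            if ws:
                continue
            ws = True
            c = ' '
        else:
            ws = False
        cleaned.append(c)
        positions.append(i)
    return ''.join(cleaned), positions

def to_original_span(positions, clean_start, clean_end):
    # Map a [start, end) span in the cleaned text back to the original text,
    # mirroring how align.py uses get_original_offset for both ends of a match.
    start = positions[clean_start]
    end = positions[clean_end] if clean_end < len(positions) else positions[-1] + 1
    return start, end

original = "Hello,   WORLD!  This is a test."
clean, positions = clean_with_positions(original)
start, end = to_original_span(positions, 6, 11)
print(clean[6:11])          # -> "world"
print(original[start:end])  # -> "WORLD!" (the end offset maps to the next kept character)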
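
Note on the scoring change in LevenshteinSearch.find_best: instead of taking the raw minimum edit distance, the patch ranks candidates by a "virtual distance" that inflates the Levenshtein distance with a square-root penalty on how far the candidate lies past the end of the previous match (the start argument), which keeps the alignment moving roughly monotonically through the transcript. A small self-contained illustration with made-up numbers; virtual_distance here is a local helper for demonstration, not an API of the module.

import math

def virtual_distance(distance, offset, start):
    # Same weighting term as in the patch: penalize candidates that start far
    # after `start`, the position where the previous fragment matched.
    return distance * (1 + math.sqrt(offset - start) / 100)

start = 1000                       # hypothetical end of the previous match
for distance, offset in [(5, 1200), (5, 9000)]:
    print(offset, round(virtual_distance(distance, offset, start), 3))
# -> 1200 scores 5.707 while 9000 scores 9.472, so with equal edit distance
#    the candidate nearer the previous match wins.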