This commit is contained in:
Tilman Kamp 2019-07-05 18:17:16 +02:00
Родитель 6e29fa594b
Коммит e11fd4e724
2 изменённых файлов: 57 добавлений и 36 удалений

Просмотреть файл

@ -4,6 +4,7 @@ import text
import json
import logging
import argparse
import subprocess
import os.path as path
import numpy as np
import wavTranscriber
@ -23,6 +24,8 @@ def main(args):
help='Path to directory that contains all model files (output_graph, lm, trie and alphabet)')
parser.add_argument('--loglevel', type=int, required=False,
help='Log level (between 0 and 50) - default: 20')
parser.add_argument('--play', action="store_true",
help='Plays audio fragments as they are matched using SoX audio tool')
args = parser.parse_args()
# Debug helpers
@ -68,8 +71,7 @@ def main(args):
fragments.append({
'time_start': time_start,
'time_end': time_end,
'transcript': segment_transcript,
'offset': offset
'transcript': segment_transcript
})
offset += len(segment_transcript)
@ -80,18 +82,24 @@ def main(args):
logging.debug("Loading original transcript from %s..." % args.transcript)
with open(args.transcript, 'r') as transcript_file:
original_transcript = transcript_file.read()
original_transcript = ' '.join(original_transcript.lower().split())
original_transcript = alphabet.filter(original_transcript)
ls = text.LevenshteinSearch(original_transcript)
tc = text.TextCleaner(original_transcript, alphabet)
ls = text.LevenshteinSearch(tc.clean_text)
start = 0
for fragment in fragments:
logging.debug('STT Transcribed: %s' % fragment['transcript'])
match_distance, match_offset, match_len = ls.find_best(fragment['transcript'])
if match_offset >= 0:
fragment['original'] = original_transcript[match_offset:match_offset+match_len]
logging.debug(' Original: %s' % fragment['original'])
fragment_transcript = fragment['transcript']
match_distance, match_offset, match_len = ls.find_best(fragment_transcript)
if match_offset >= 0 and match_distance < 0.2 * len(fragment_transcript):
logging.debug('transcribed: %s' % fragment['transcript'])
original_start = tc.get_original_offset(match_offset)
original_end = tc.get_original_offset(match_offset+match_len)
fragment['offset'] = original_start
fragment['length'] = original_end-original_start
logging.debug(' original: %s' % ' '.join(original_transcript[original_start:original_end].split()))
start = match_offset+match_len
if args.play:
subprocess.check_call(['play', args.audio, 'trim', str(fragment['time_start']/1000.0), '='+str(fragment['time_end']/1000.0)])
with open(args.result, 'w') as result_file:
result_file.write(json.dumps(fragments))
if __name__ == '__main__':
main(sys.argv[1:])

Просмотреть файл

@ -1,5 +1,6 @@
from __future__ import absolute_import, division, print_function
import math
import codecs
import logging
from nltk import ngrams
@ -28,13 +29,6 @@ class Alphabet(object):
def has_label(self, string):
    """Return True if ``string`` is a key of this alphabet's string-to-label map."""
    known_labels = self._str_to_label
    return string in known_labels
def filter(self, string):
    """Return ``string`` with every character dropped that has no label.

    Characters are tested one at a time with ``self.has_label`` and the
    survivors are kept in their original order.
    """
    # Join once instead of growing a string with += inside the loop.
    return ''.join(c for c in string if self.has_label(c))
def label_from_string(self, string):
try:
return self._str_to_label[string]
@ -59,15 +53,32 @@ class Alphabet(object):
class TextCleaner:
    """Clean a text and remember where every kept character came from.

    Attributes:
        original_text: the text exactly as passed in.
        clean_text: the cleaned text (optionally lower-cased, restricted to
            characters the alphabet has a label for, optionally with runs of
            whitespace collapsed to one space).
        positions: for every character of ``clean_text``, the index in
            ``original_text`` of the character it was produced from.
    """

    def __init__(self, original_text, alphabet, to_lower=True, normalize_space=True):
        """Build ``clean_text`` and the ``positions`` map.

        Args:
            original_text: text to clean.
            alphabet: object providing ``has_label(char)``; characters for
                which it returns a false value are dropped.
            to_lower: lower-case the text before filtering.
            normalize_space: collapse each run of (alphabet-accepted)
                whitespace into a single ``' '``.
        """
        self.original_text = original_text
        self.positions = []
        cleaned = []
        in_whitespace = False
        for position, c in self._prepared_chars(original_text, to_lower):
            if not alphabet.has_label(c):
                # Dropped characters do not interrupt a whitespace run.
                continue
            if normalize_space and c.isspace():
                if in_whitespace:
                    continue
                in_whitespace = True
                c = ' '
            else:
                in_whitespace = False
            cleaned.append(c)
            self.positions.append(position)
        self.clean_text = ''.join(cleaned)

    @staticmethod
    def _prepared_chars(original_text, to_lower):
        """Yield ``(original_index, char)`` pairs of the text to be filtered."""
        if not to_lower:
            return enumerate(original_text)
        lowered = original_text.lower()
        if len(lowered) == len(original_text):
            # Same length means lower-casing kept a 1:1 index alignment.
            return enumerate(lowered)
        # Some characters lower-case to several code points (e.g. U+0130);
        # lower-case per character so every result keeps its original index.
        return ((position, c)
                for position, original_char in enumerate(original_text)
                for c in original_char.lower())

    def get_original_offset(self, clean_offset):
        """Translate an offset into ``clean_text`` to one into ``original_text``.

        Args:
            clean_offset: offset in ``0 .. len(clean_text)``; the upper bound
                is allowed so that exclusive end offsets can be translated.

        Returns:
            The index in ``original_text`` of the character that produced
            ``clean_text[clean_offset]``. For ``clean_offset ==
            len(clean_text)`` it is one past the source index of the last
            kept character, or 0 when no character was kept.

        Raises:
            IndexError: if ``clean_offset`` lies outside ``0 .. len(clean_text)``.
        """
        count = len(self.positions)
        if clean_offset < 0 or clean_offset > count:
            raise IndexError('clean offset %r is outside 0..%d' % (clean_offset, count))
        if clean_offset == count:
            return self.positions[-1] + 1 if self.positions else 0
        return self.positions[clean_offset]
class LevenshteinSearch:
@ -95,18 +106,20 @@ class LevenshteinSearch:
windows[window] = (windows[window] + 1) if window in windows else 1
candidate_windows = sorted(windows.keys(), key=lambda w: windows[w], reverse=True)
found_best = False
best_distance = -1
best_offset = -1
best_len = -1
for window in candidate_windows[0:4]:
for offset in range(int((window-0.5)*window_size), int((window+0.5)*window_size)):
distance = levenshtein(self.text[offset:offset + len(look_for)], look_for)
if not found_best or distance < best_distance:
best = (-1, -1, -1)
best_virtual_distance = -1
for window in candidate_windows[:4]:
start_offset = max(start, int((window-0.5)*window_size))
stop_offset = min(stop-len(look_for), int((window+0.5)*window_size))
for offset in range(start_offset, stop_offset):
distance = levenshtein(self.text[offset:offset+len(look_for)], look_for)
virtual_distance = distance*(1+math.sqrt(offset-start)/100)
# print(virtual_distance)
if not found_best or virtual_distance < best_virtual_distance:
found_best = True
best_distance = distance
best_offset = offset
best_len = len(look_for)
return best_distance, best_offset, best_len
best_virtual_distance = virtual_distance
best = (distance, offset, len(look_for))
return best
# The following code is from: http://hetland.org/coding/python/levenshtein.py