зеркало из https://github.com/mozilla/DSAlign.git
This commit is contained in:
Родитель
6e29fa594b
Коммит
e11fd4e724
|
@ -4,6 +4,7 @@ import text
|
|||
import json
|
||||
import logging
|
||||
import argparse
|
||||
import subprocess
|
||||
import os.path as path
|
||||
import numpy as np
|
||||
import wavTranscriber
|
||||
|
@ -23,6 +24,8 @@ def main(args):
|
|||
help='Path to directory that contains all model files (output_graph, lm, trie and alphabet)')
|
||||
parser.add_argument('--loglevel', type=int, required=False,
|
||||
help='Log level (between 0 and 50) - default: 20')
|
||||
parser.add_argument('--play', action="store_true",
|
||||
help='Plays audio fragments as they are matched using SoX audio tool')
|
||||
args = parser.parse_args()
|
||||
|
||||
# Debug helpers
|
||||
|
@ -68,8 +71,7 @@ def main(args):
|
|||
fragments.append({
|
||||
'time_start': time_start,
|
||||
'time_end': time_end,
|
||||
'transcript': segment_transcript,
|
||||
'offset': offset
|
||||
'transcript': segment_transcript
|
||||
})
|
||||
offset += len(segment_transcript)
|
||||
|
||||
|
@ -80,18 +82,24 @@ def main(args):
|
|||
logging.debug("Loading original transcript from %s..." % args.transcript)
|
||||
with open(args.transcript, 'r') as transcript_file:
|
||||
original_transcript = transcript_file.read()
|
||||
original_transcript = ' '.join(original_transcript.lower().split())
|
||||
original_transcript = alphabet.filter(original_transcript)
|
||||
ls = text.LevenshteinSearch(original_transcript)
|
||||
tc = text.TextCleaner(original_transcript, alphabet)
|
||||
ls = text.LevenshteinSearch(tc.clean_text)
|
||||
start = 0
|
||||
for fragment in fragments:
|
||||
logging.debug('STT Transcribed: %s' % fragment['transcript'])
|
||||
match_distance, match_offset, match_len = ls.find_best(fragment['transcript'])
|
||||
if match_offset >= 0:
|
||||
fragment['original'] = original_transcript[match_offset:match_offset+match_len]
|
||||
logging.debug(' Original: %s' % fragment['original'])
|
||||
fragment_transcript = fragment['transcript']
|
||||
match_distance, match_offset, match_len = ls.find_best(fragment_transcript)
|
||||
if match_offset >= 0 and match_distance < 0.2 * len(fragment_transcript):
|
||||
logging.debug('transcribed: %s' % fragment['transcript'])
|
||||
original_start = tc.get_original_offset(match_offset)
|
||||
original_end = tc.get_original_offset(match_offset+match_len)
|
||||
fragment['offset'] = original_start
|
||||
fragment['length'] = original_end-original_start
|
||||
logging.debug(' original: %s' % ' '.join(original_transcript[original_start:original_end].split()))
|
||||
start = match_offset+match_len
|
||||
|
||||
if args.play:
|
||||
subprocess.check_call(['play', args.audio, 'trim', str(fragment['time_start']/1000.0), '='+str(fragment['time_end']/1000.0)])
|
||||
with open(args.result, 'w') as result_file:
|
||||
result_file.write(json.dumps(fragments))
|
||||
|
||||
if __name__ == '__main__':
|
||||
main(sys.argv[1:])
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import math
|
||||
import codecs
|
||||
import logging
|
||||
from nltk import ngrams
|
||||
|
@ -28,13 +29,6 @@ class Alphabet(object):
|
|||
def has_label(self, string):
|
||||
return string in self._str_to_label
|
||||
|
||||
def filter(self, string):
|
||||
new_string = ''
|
||||
for c in string:
|
||||
if self.has_label(c):
|
||||
new_string += c
|
||||
return new_string
|
||||
|
||||
def label_from_string(self, string):
|
||||
try:
|
||||
return self._str_to_label[string]
|
||||
|
@ -59,15 +53,32 @@ class Alphabet(object):
|
|||
class TextCleaner:
    """Produce a cleaned copy of a text and map cleaned offsets back to it.

    The cleaned text keeps only characters for which ``alphabet.has_label``
    is true. For every kept character the index it had in the (optionally
    lower-cased) original text is recorded in ``self.positions``, so that a
    match found in ``self.clean_text`` can be translated back to a span of
    ``self.original_text``.

    Attributes:
        original_text: the text exactly as passed in.
        clean_text: the filtered (and optionally normalized) text.
        positions: ``positions[i]`` is the source index of ``clean_text[i]``.
    """

    def __init__(self, original_text, alphabet, to_lower=True, normalize_space=True):
        """Clean ``original_text`` and record the source index of each kept character.

        Args:
            original_text: text to clean.
            alphabet: object exposing ``has_label(char)``; characters it
                rejects are dropped.
            to_lower: lower-case the text before filtering.
            normalize_space: turn every whitespace character into a single
                blank and collapse runs of whitespace into one blank.
        """
        self.original_text = original_text
        # NOTE: str.lower() can change the length of a few Unicode characters;
        # recorded positions index the lower-cased text in that case.
        prepared_text = original_text.lower() if to_lower else original_text
        cleaned = []
        self.positions = []
        last_was_space = False
        for position, c in enumerate(prepared_text):
            # Normalize whitespace *before* consulting the alphabet: newlines
            # and tabs are usually not alphabet labels, and dropping them
            # outright would glue the neighbouring words together.
            if normalize_space and c.isspace():
                c = ' '
            if not alphabet.has_label(c):
                # Skipped characters deliberately leave last_was_space alone,
                # so blanks on both sides of e.g. a comma still collapse.
                continue
            if normalize_space and c == ' ':
                if last_was_space:
                    continue
                last_was_space = True
            else:
                last_was_space = False
            cleaned.append(c)
            self.positions.append(position)
        self.clean_text = ''.join(cleaned)

    def get_original_offset(self, clean_offset):
        """Translate an offset into ``clean_text`` to an offset into the source text.

        Args:
            clean_offset: index in ``0..len(clean_text)``; the upper bound is
                allowed so that exclusive end offsets of slices can be mapped.

        Returns:
            The source index of ``clean_text[clean_offset]``, or one past the
            source index of the last kept character when ``clean_offset``
            equals ``len(clean_text)`` (0 if no character was kept).

        Raises:
            IndexError: if ``clean_offset`` lies outside ``0..len(clean_text)``.
        """
        count = len(self.positions)
        if not 0 <= clean_offset <= count:
            raise IndexError(
                'clean offset %d out of range 0..%d' % (clean_offset, count))
        if clean_offset == count:
            return self.positions[-1] + 1 if self.positions else 0
        return self.positions[clean_offset]
|
||||
|
||||
|
||||
class LevenshteinSearch:
|
||||
|
@ -95,18 +106,20 @@ class LevenshteinSearch:
|
|||
windows[window] = (windows[window] + 1) if window in windows else 1
|
||||
candidate_windows = sorted(windows.keys(), key=lambda w: windows[w], reverse=True)
|
||||
found_best = False
|
||||
best_distance = -1
|
||||
best_offset = -1
|
||||
best_len = -1
|
||||
for window in candidate_windows[0:4]:
|
||||
for offset in range(int((window-0.5)*window_size), int((window+0.5)*window_size)):
|
||||
distance = levenshtein(self.text[offset:offset + len(look_for)], look_for)
|
||||
if not found_best or distance < best_distance:
|
||||
best = (-1, -1, -1)
|
||||
best_virtual_distance = -1
|
||||
for window in candidate_windows[:4]:
|
||||
start_offset = max(start, int((window-0.5)*window_size))
|
||||
stop_offset = min(stop-len(look_for), int((window+0.5)*window_size))
|
||||
for offset in range(start_offset, stop_offset):
|
||||
distance = levenshtein(self.text[offset:offset+len(look_for)], look_for)
|
||||
virtual_distance = distance*(1+math.sqrt(offset-start)/100)
|
||||
# print(virtual_distance)
|
||||
if not found_best or virtual_distance < best_virtual_distance:
|
||||
found_best = True
|
||||
best_distance = distance
|
||||
best_offset = offset
|
||||
best_len = len(look_for)
|
||||
return best_distance, best_offset, best_len
|
||||
best_virtual_distance = virtual_distance
|
||||
best = (distance, offset, len(look_for))
|
||||
return best
|
||||
|
||||
|
||||
# The following code is from: http://hetland.org/coding/python/levenshtein.py
|
||||
|
|
Загрузка…
Ссылка в новой задаче