This commit is contained in:
Tilman Kamp 2019-07-30 17:58:55 +02:00
Родитель 8ee9b6f835
Коммит 0e627adc9d
2 изменённых файла: 156 добавлений и 83 удалений

Просмотреть файл

@@ -28,6 +28,10 @@ def main(args):
help='Play audio fragments as they are matched using SoX audio tool')
parser.add_argument('--text-context', type=int, required=False, default=10,
help='Size of textual context for logged statements - default: 10')
parser.add_argument('--start', type=int, required=False, default=0,
help='Start alignment process at given offset of transcribed fragments')
parser.add_argument('--num-samples', type=int, required=False,
help='Number of fragments to align')
audio_group = parser.add_argument_group(title='Audio pre-processing options')
audio_group.add_argument('--audio-vad-aggressiveness', type=int, choices=range(4), required=False,
@@ -40,6 +44,10 @@ def main(args):
stt_group.add_argument('--stt-no-own-lm', action="store_true",
help='Deactivates creation of individual language models per document.' +
'Uses the one from model dir instead.')
stt_group.add_argument('--stt-min-duration', type=int, required=False, default=4,
help='Minimum speech fragment duration in milliseconds to translate (default: 100)')
stt_group.add_argument('--stt-max-duration', type=int, required=False,
help='Maximum speech fragment duration in milliseconds to translate (default: no limit)')
text_group = parser.add_argument_group(title='Text pre-processing options')
text_group.add_argument('--text-keep-dashes', action="store_true",
@@ -54,13 +62,22 @@ def main(args):
help='How many global 3gram match candidates are tested at max (default: 10)')
align_group.add_argument('--align-candidate-threshold', type=float, required=False, default=0.92,
help='Factor for how many 3grams the next candidate should have at least ' +
'compared to its predecessor (default: 0.8)')
align_group.add_argument('--align-no-snap-to-token', action="store_true",
help='Deactivates snapping to similar neighbour tokens ' +
'at the beginning and end of each phrase')
align_group.add_argument('--align-stretch-fraction', type=float, required=False, default=1/3,
help='Fraction of its original length that a phrase could get expanded or shrunken ' +
'to match the original text (default: 0.33)')
'compared to its predecessor (default: 0.92)')
align_group.add_argument('--align-match-score', type=int, required=False, default=100,
help='Matching score for Smith-Waterman alignment (default: 100)')
align_group.add_argument('--align-mismatch-score', type=int, required=False, default=-100,
help='Mismatch score for Smith-Waterman alignment (default: -100)')
align_group.add_argument('--align-gap-score', type=int, required=False, default=-100,
help='Gap score for Smith-Waterman alignment (default: -100)')
align_group.add_argument('--align-no-snap', action="store_true",
help='Deactivates snapping to word boundaries at the beginning and end of each phrase')
align_group.add_argument('--align-snap-radius', type=int, required=False, default=0,
help='How many words to look up to the left and right for snapping to word ' +
'boundaries at the beginning and end of each phrase')
align_group.add_argument('--align-min-length', type=int, required=False, default=4,
help='Minimum STT phrase length to align (default: 4)')
align_group.add_argument('--align-max-length', type=int, required=False,
help='Maximum STT phrase length to align (default: no limit)')
output_group = parser.add_argument_group(title='Output options')
output_group.add_argument('--output-stt', action="store_true",
@@ -69,10 +86,18 @@ def main(args):
help='Writes clean aligned original transcripts to result file')
output_group.add_argument('--output-aligned-raw', action="store_true",
help='Writes raw aligned original transcripts to result file')
output_group.add_argument('--output-min-length', type=int, required=False,
help='Minimum phrase length (default: no limit)')
output_group.add_argument('--output-wer', action="store_true",
help='Writes word error rate (WER) to output')
output_group.add_argument('--output-cer', action="store_true",
help='Writes character error rate (CER) to output')
output_group.add_argument('--output-min-length', type=int, required=False, default=1,
help='Minimum phrase length (default: 1)')
output_group.add_argument('--output-max-length', type=int, required=False,
help='Maximum phrase length (default: no limit)')
output_group.add_argument('--output-min-score', type=float, required=False, default=2.0,
help='Minimum matching score (default: 2.0)')
output_group.add_argument('--output-max-score', type=float, required=False,
help='Maximum matching score (default: no limit)')
for b in ['Min', 'Max']:
for r in ['CER', 'WER']:
output_group.add_argument('--output-' + b.lower() + '-' + r.lower(), type=float, required=False,
@@ -167,8 +192,14 @@ def main(args):
for i, segment in enumerate(segments):
# Run DeepSpeech on the chunk that just completed VAD
segment_buffer, time_start, time_end = segment
time_length = time_end - time_start
if args.stt_min_duration and time_length < args.stt_min_duration:
skip('Audio too short for STT', index)
continue
if args.stt_max_duration and time_length > args.stt_max_duration:
skip('Audio too long for STT', index)
continue
logging.debug("Transcribing segment %002d (from %f to %f)..." % (i, time_start / 1000.0, time_end / 1000.0))
audio = np.frombuffer(segment_buffer, dtype=np.int16)
segment_transcript, segment_inference_time = wavTranscriber.stt(model, audio, sample_rate)
segment_transcript = ' '.join(segment_transcript.split())
@@ -178,7 +209,7 @@ def main(args):
continue
fragments.append({
'time-start': time_start,
'time-length': time_end-time_start,
'time-length': time_length,
'transcript': segment_transcript
})
offset += len(segment_transcript)
@@ -190,68 +221,112 @@ def main(args):
search = FuzzySearch(tc.clean_text,
max_candidates=args.align_max_candidates,
candidate_threshold=args.align_candidate_threshold,
snap_token=not args.align_no_snap_to_token,
stretch_factor=args.align_stretch_fraction)
start = 0
snap_to_word=not args.align_no_snap,
snap_radius=not args.align_snap_radius,
match_score=args.align_match_score,
mismatch_score=args.align_mismatch_score,
gap_score=args.align_gap_score)
result_fragments = []
substitutions = Counter()
statistics = Counter()
for fragment in fragments:
def skip(message, index):
logging.info('Fragment %d: %s' % (index, message))
statistics[message] += 1
end_fragments = (args.start + args.num_samples) if args.num_samples else len(fragments)
fragments = fragments[args.start:end_fragments]
for index, fragment in enumerate(fragments):
time_start = fragment['time-start']
time_length = fragment['time-length']
fragment_transcript = fragment['transcript']
if args.align_min_length and len(fragment_transcript) < args.align_min_length:
skip('Transcript too short for alignment', index)
continue
if args.align_max_length and len(fragment_transcript) > args.align_max_length:
skip('Transcript too long for alignment', index)
continue
match, match_distance, match_substitutions = search.find_best(fragment_transcript)
if match is not None:
substitutions += match_substitutions
fragment_matched = tc.clean_text[match.start:match.end]
score = match_distance/len(fragment_matched)
if match is None:
skip('No match for transcript', index)
continue
substitutions += match_substitutions
fragment_matched = tc.clean_text[match.start:match.end]
if args.output_min_length and len(fragment_matched) < args.output_min_length:
skip('Match too short', index)
continue
if args.output_max_length and len(fragment_matched) > args.output_max_length:
skip('Match too long', index)
continue
score = match_distance/max(len(fragment_matched), len(fragment_transcript))
sample_numbers = ['Score %.2f' % score]
if args.output_min_score and score < args.output_min_score:
skip('Matching score too low', index)
continue
if args.output_max_score and score > args.output_max_score:
skip('Matching score too high', index)
continue
original_start = tc.get_original_offset(match.start)
original_end = tc.get_original_offset(match.end)
result_fragment = {
'time-start': time_start,
'time-length': time_length,
'text-start': original_start,
'text-length': original_end-original_start,
'score': score
}
if args.output_cer or args.output_min_cer or args.output_max_cer:
cer = levenshtein(fragment_transcript, fragment_matched)/len(fragment_matched)
sample_numbers.insert(0, 'CER: %.2f' % cer * 100)
if args.output_cer:
result_fragment['cer'] = cer
if args.output_min_cer and score < args.output_min_cer:
skip('Character error rate (CER) too low', index)
continue
if args.output_max_cer and score > args.output_max_cer:
skip('Character error rate (CER) too high', index)
continue
if args.output_wer or args.output_min_wer or args.output_max_wer:
wer = levenshtein(fragment_transcript.split(), fragment_matched.split())/len(fragment_matched.split())
if (args.output_min_cer and cer * 100.0 < args.output_min_cer) or \
(args.output_max_cer and cer * 100.0 > args.output_max_cer) or \
(args.output_min_wer and wer * 100.0 < args.output_min_wer) or \
(args.output_max_wer and wer * 100.0 > args.output_max_wer) or \
(args.output_min_length and len(fragment_matched) < args.output_min_length) or \
(args.output_max_length and len(fragment_matched) > args.output_max_length):
continue
original_start = tc.get_original_offset(match.start)
original_end = tc.get_original_offset(match.end)
result_fragment = {
'time-start': time_start,
'time-length': time_length,
'text-start': original_start,
'text-length': original_end-original_start,
'score': score,
'cer': cer,
'wer': wer
}
if args.output_stt:
result_fragment['stt'] = fragment_transcript
if args.output_aligned:
result_fragment['aligned'] = fragment_matched
if args.output_aligned_raw:
result_fragment['aligned-raw'] = original_transcript[original_start:original_end]
result_fragments.append(result_fragment)
logging.debug('Sample with WER %.2f CER %.2f Score %f' % (wer * 100, cer * 100, score))
logging.debug('- T: ' + args.text_context * ' ' + '%s' % fragment_transcript)
logging.debug('- O: %s|%s|%s' % (
tc.clean_text[match.start-args.text_context:match.start],
fragment_matched,
tc.clean_text[match.end:match.end+args.text_context]))
start = match.end
if args.play:
subprocess.check_call(['play',
'--no-show-progress',
args.audio,
'trim',
str(time_start/1000.0),
'='+str((time_start + time_length)/1000.0)])
sample_numbers.insert(0, 'WER: %.2f' % wer * 100)
if args.output_wer:
result_fragment['wer'] = wer
if args.output_min_wer and score < args.output_min_wer:
skip('Word error rate (WER) too low', index)
continue
if args.output_max_wer and score > args.output_max_wer:
skip('Word error rate (WER) too high', index)
continue
if args.output_stt:
result_fragment['stt'] = fragment_transcript
if args.output_aligned:
result_fragment['aligned'] = fragment_matched
if args.output_aligned_raw:
result_fragment['aligned-raw'] = original_transcript[original_start:original_end]
result_fragments.append(result_fragment)
logging.debug('Fragment %d aligned with %s' % (index, ' '.join(sample_numbers)))
logging.debug('- T: ' + args.text_context * ' ' + '"%s"' % fragment_transcript)
logging.debug('- O: %s|%s|%s' % (
tc.clean_text[match.start-args.text_context:match.start],
fragment_matched,
tc.clean_text[match.end:match.end+args.text_context]))
start = match.end
if args.play:
subprocess.check_call(['play',
'--no-show-progress',
args.audio,
'trim',
str(time_start / 1000.0),
'='+str((time_start + time_length) / 1000.0)])
with open(args.result, 'w') as result_file:
result_file.write(json.dumps(result_fragments))
logging.info('Aligned %d fragments.' % len(result_fragments))
skipped = len(fragments)-len(result_fragments)
logging.info('Skipped %d fragments (%.2f%%).' % (skipped, skipped*100.0/len(fragments)))
logging.info('Aligned %d fragments' % len(result_fragments))
skipped = len(fragments) - len(result_fragments)
logging.info('Skipped %d fragments (%.2f%%):' % (skipped, skipped * 100.0 / len(fragments)))
for key, number in statistics.most_common():
logging.info(' - %s: %d' % (key, number))
print(substitutions)
if __name__ == '__main__':

Просмотреть файл

@@ -1,29 +1,27 @@
from collections import Counter
from nltk import ngrams
from text import levenshtein, TextRange
from utils import circulate, by_len
import random
from text import TextRange
class FuzzySearch(object):
def __init__(self,
text,
max_candidates=10,
candidate_threshold=0.92,
snap_token=True,
stretch_factor=1/3,
match_score=3,
mismatch_score=0,
delete_score=-4,
insert_score=-4,
snap_to_word=True,
snap_radius=0,
match_score=100,
mismatch_score=-100,
gap_score=-100,
similarities=None):
self.text = text
self.max_candidates = max_candidates
self.candidate_threshold = candidate_threshold
self.snap_token = snap_token
self.snap_to_word = snap_to_word
self.snap_radius = snap_radius
self.match_score = match_score
self.mismatch_score = mismatch_score
self.delete_score = delete_score
self.insert_score = insert_score
self.gap_score = gap_score
self.similarities = similarities
self.ngrams = {}
for i, ngram in enumerate(ngrams(' ' + text + ' ', 3)):
@@ -47,38 +45,38 @@ class FuzzySearch(object):
def sw_align(self, a, b):
n, m = len(a), len(b)
f = [[]] * (n + 1)
# building scoring matrix
f = [[0]] * (n + 1)
for i in range(0, n + 1):
f[i] = [0] * (m + 1)
for i in range(1, n + 1):
f[i][0] = self.insert_score * i
f[i][0] = self.gap_score * i
for j in range(1, m + 1):
f[0][j] = self.delete_score * j
f[0][j] = self.gap_score * j
max_score = 0
start_i, start_j = 0, 0
for i in range(1, n + 1):
for j in range(1, m + 1):
match = f[i - 1][j - 1] + self.similarity(a[i - 1], b[j - 1])
insert = f[i][j - 1] + self.insert_score
delete = f[i - 1][j] + self.delete_score
insert = f[i][j - 1] + self.gap_score
delete = f[i - 1][j] + self.gap_score
score = max(0, match, insert, delete)
f[i][j] = score
if score > max_score:
max_score = score
start_i, start_j = i, j
# backtracking
substitutions = Counter()
i, j = start_i, start_j
while (j > 0 or i > 0) and f[i][j] != 0:
ca, cb = a[i - 1] if i > 0 else ' ', b[j - 1] if j > 0 else ' '
s = self.similarity(ca, cb)
if i > 0 and j > 0 and f[i][j] == (f[i - 1][j - 1] + s):
if ca != cb:
substitutions[FuzzySearch.similarity_key(ca, cb)] += 1
substitutions[FuzzySearch.similarity_key(ca, cb)] += 1
i, j = i - 1, j - 1
elif i > 0 and f[i][j] == (f[i - 1][j] + self.delete_score):
elif i > 0 and f[i][j] == (f[i - 1][j] + self.gap_score):
i -= 1
elif j > 0 and f[i][j] == (f[i][j - 1] + self.insert_score):
elif j > 0 and f[i][j] == (f[i][j - 1] + self.gap_score):
j -= 1
else:
raise Exception('Smith–Waterman failure')