зеркало из https://github.com/mozilla/DSAlign.git
This commit is contained in:
Родитель
8ee9b6f835
Коммит
0e627adc9d
197
align/align.py
197
align/align.py
|
@ -28,6 +28,10 @@ def main(args):
|
|||
help='Play audio fragments as they are matched using SoX audio tool')
|
||||
parser.add_argument('--text-context', type=int, required=False, default=10,
|
||||
help='Size of textual context for logged statements - default: 10')
|
||||
parser.add_argument('--start', type=int, required=False, default=0,
|
||||
help='Start alignment process at given offset of transcribed fragments')
|
||||
parser.add_argument('--num-samples', type=int, required=False,
|
||||
help='Number of fragments to align')
|
||||
|
||||
audio_group = parser.add_argument_group(title='Audio pre-processing options')
|
||||
audio_group.add_argument('--audio-vad-aggressiveness', type=int, choices=range(4), required=False,
|
||||
|
@ -40,6 +44,10 @@ def main(args):
|
|||
stt_group.add_argument('--stt-no-own-lm', action="store_true",
|
||||
help='Deactivates creation of individual language models per document.' +
|
||||
'Uses the one from model dir instead.')
|
||||
stt_group.add_argument('--stt-min-duration', type=int, required=False, default=4,
|
||||
help='Minimum speech fragment duration in milliseconds to translate (default: 100)')
|
||||
stt_group.add_argument('--stt-max-duration', type=int, required=False,
|
||||
help='Maximum speech fragment duration in milliseconds to translate (default: no limit)')
|
||||
|
||||
text_group = parser.add_argument_group(title='Text pre-processing options')
|
||||
text_group.add_argument('--text-keep-dashes', action="store_true",
|
||||
|
@ -54,13 +62,22 @@ def main(args):
|
|||
help='How many global 3gram match candidates are tested at max (default: 10)')
|
||||
align_group.add_argument('--align-candidate-threshold', type=float, required=False, default=0.92,
|
||||
help='Factor for how many 3grams the next candidate should have at least ' +
|
||||
'compared to its predecessor (default: 0.8)')
|
||||
align_group.add_argument('--align-no-snap-to-token', action="store_true",
|
||||
help='Deactivates snapping to similar neighbour tokens ' +
|
||||
'at the beginning and end of each phrase')
|
||||
align_group.add_argument('--align-stretch-fraction', type=float, required=False, default=1/3,
|
||||
help='Fraction of its original length that a phrase could get expanded or shrunken ' +
|
||||
'to match the original text (default: 0.33)')
|
||||
'compared to its predecessor (default: 0.92)')
|
||||
align_group.add_argument('--align-match-score', type=int, required=False, default=100,
|
||||
help='Matching score for Smith-Waterman alignment (default: 100)')
|
||||
align_group.add_argument('--align-mismatch-score', type=int, required=False, default=-100,
|
||||
help='Mismatch score for Smith-Waterman alignment (default: -100)')
|
||||
align_group.add_argument('--align-gap-score', type=int, required=False, default=-100,
|
||||
help='Gap score for Smith-Waterman alignment (default: -100)')
|
||||
align_group.add_argument('--align-no-snap', action="store_true",
|
||||
help='Deactivates snapping to word boundaries at the beginning and end of each phrase')
|
||||
align_group.add_argument('--align-snap-radius', type=int, required=False, default=0,
|
||||
help='How many words to look up to the left and right for snapping to word ' +
|
||||
'boundaries at the beginning and end of each phrase')
|
||||
align_group.add_argument('--align-min-length', type=int, required=False, default=4,
|
||||
help='Minimum STT phrase length to align (default: 4)')
|
||||
align_group.add_argument('--align-max-length', type=int, required=False,
|
||||
help='Maximum STT phrase length to align (default: no limit)')
|
||||
|
||||
output_group = parser.add_argument_group(title='Output options')
|
||||
output_group.add_argument('--output-stt', action="store_true",
|
||||
|
@ -69,10 +86,18 @@ def main(args):
|
|||
help='Writes clean aligned original transcripts to result file')
|
||||
output_group.add_argument('--output-aligned-raw', action="store_true",
|
||||
help='Writes raw aligned original transcripts to result file')
|
||||
output_group.add_argument('--output-min-length', type=int, required=False,
|
||||
help='Minimum phrase length (default: no limit)')
|
||||
output_group.add_argument('--output-wer', action="store_true",
|
||||
help='Writes word error rate (WER) to output')
|
||||
output_group.add_argument('--output-cer', action="store_true",
|
||||
help='Writes character error rate (CER) to output')
|
||||
output_group.add_argument('--output-min-length', type=int, required=False, default=1,
|
||||
help='Minimum phrase length (default: 1)')
|
||||
output_group.add_argument('--output-max-length', type=int, required=False,
|
||||
help='Maximum phrase length (default: no limit)')
|
||||
output_group.add_argument('--output-min-score', type=float, required=False, default=2.0,
|
||||
help='Minimum matching score (default: 2.0)')
|
||||
output_group.add_argument('--output-max-score', type=float, required=False,
|
||||
help='Maximum matching score (default: no limit)')
|
||||
for b in ['Min', 'Max']:
|
||||
for r in ['CER', 'WER']:
|
||||
output_group.add_argument('--output-' + b.lower() + '-' + r.lower(), type=float, required=False,
|
||||
|
@ -167,8 +192,14 @@ def main(args):
|
|||
for i, segment in enumerate(segments):
|
||||
# Run DeepSpeech on the chunk that just completed VAD
|
||||
segment_buffer, time_start, time_end = segment
|
||||
time_length = time_end - time_start
|
||||
if args.stt_min_duration and time_length < args.stt_min_duration:
|
||||
skip('Audio too short for STT', index)
|
||||
continue
|
||||
if args.stt_max_duration and time_length > args.stt_max_duration:
|
||||
skip('Audio too long for STT', index)
|
||||
continue
|
||||
logging.debug("Transcribing segment %002d (from %f to %f)..." % (i, time_start / 1000.0, time_end / 1000.0))
|
||||
|
||||
audio = np.frombuffer(segment_buffer, dtype=np.int16)
|
||||
segment_transcript, segment_inference_time = wavTranscriber.stt(model, audio, sample_rate)
|
||||
segment_transcript = ' '.join(segment_transcript.split())
|
||||
|
@ -178,7 +209,7 @@ def main(args):
|
|||
continue
|
||||
fragments.append({
|
||||
'time-start': time_start,
|
||||
'time-length': time_end-time_start,
|
||||
'time-length': time_length,
|
||||
'transcript': segment_transcript
|
||||
})
|
||||
offset += len(segment_transcript)
|
||||
|
@ -190,68 +221,112 @@ def main(args):
|
|||
search = FuzzySearch(tc.clean_text,
|
||||
max_candidates=args.align_max_candidates,
|
||||
candidate_threshold=args.align_candidate_threshold,
|
||||
snap_token=not args.align_no_snap_to_token,
|
||||
stretch_factor=args.align_stretch_fraction)
|
||||
start = 0
|
||||
snap_to_word=not args.align_no_snap,
|
||||
snap_radius=not args.align_snap_radius,
|
||||
match_score=args.align_match_score,
|
||||
mismatch_score=args.align_mismatch_score,
|
||||
gap_score=args.align_gap_score)
|
||||
result_fragments = []
|
||||
substitutions = Counter()
|
||||
statistics = Counter()
|
||||
|
||||
for fragment in fragments:
|
||||
def skip(message, index):
|
||||
logging.info('Fragment %d: %s' % (index, message))
|
||||
statistics[message] += 1
|
||||
|
||||
end_fragments = (args.start + args.num_samples) if args.num_samples else len(fragments)
|
||||
fragments = fragments[args.start:end_fragments]
|
||||
|
||||
for index, fragment in enumerate(fragments):
|
||||
time_start = fragment['time-start']
|
||||
time_length = fragment['time-length']
|
||||
fragment_transcript = fragment['transcript']
|
||||
if args.align_min_length and len(fragment_transcript) < args.align_min_length:
|
||||
skip('Transcript too short for alignment', index)
|
||||
continue
|
||||
if args.align_max_length and len(fragment_transcript) > args.align_max_length:
|
||||
skip('Transcript too long for alignment', index)
|
||||
continue
|
||||
match, match_distance, match_substitutions = search.find_best(fragment_transcript)
|
||||
if match is not None:
|
||||
substitutions += match_substitutions
|
||||
fragment_matched = tc.clean_text[match.start:match.end]
|
||||
score = match_distance/len(fragment_matched)
|
||||
if match is None:
|
||||
skip('No match for transcript', index)
|
||||
continue
|
||||
substitutions += match_substitutions
|
||||
fragment_matched = tc.clean_text[match.start:match.end]
|
||||
if args.output_min_length and len(fragment_matched) < args.output_min_length:
|
||||
skip('Match too short', index)
|
||||
continue
|
||||
if args.output_max_length and len(fragment_matched) > args.output_max_length:
|
||||
skip('Match too long', index)
|
||||
continue
|
||||
score = match_distance/max(len(fragment_matched), len(fragment_transcript))
|
||||
sample_numbers = ['Score %.2f' % score]
|
||||
if args.output_min_score and score < args.output_min_score:
|
||||
skip('Matching score too low', index)
|
||||
continue
|
||||
if args.output_max_score and score > args.output_max_score:
|
||||
skip('Matching score too high', index)
|
||||
continue
|
||||
original_start = tc.get_original_offset(match.start)
|
||||
original_end = tc.get_original_offset(match.end)
|
||||
result_fragment = {
|
||||
'time-start': time_start,
|
||||
'time-length': time_length,
|
||||
'text-start': original_start,
|
||||
'text-length': original_end-original_start,
|
||||
'score': score
|
||||
}
|
||||
if args.output_cer or args.output_min_cer or args.output_max_cer:
|
||||
cer = levenshtein(fragment_transcript, fragment_matched)/len(fragment_matched)
|
||||
sample_numbers.insert(0, 'CER: %.2f' % cer * 100)
|
||||
if args.output_cer:
|
||||
result_fragment['cer'] = cer
|
||||
if args.output_min_cer and score < args.output_min_cer:
|
||||
skip('Character error rate (CER) too low', index)
|
||||
continue
|
||||
if args.output_max_cer and score > args.output_max_cer:
|
||||
skip('Character error rate (CER) too high', index)
|
||||
continue
|
||||
if args.output_wer or args.output_min_wer or args.output_max_wer:
|
||||
wer = levenshtein(fragment_transcript.split(), fragment_matched.split())/len(fragment_matched.split())
|
||||
if (args.output_min_cer and cer * 100.0 < args.output_min_cer) or \
|
||||
(args.output_max_cer and cer * 100.0 > args.output_max_cer) or \
|
||||
(args.output_min_wer and wer * 100.0 < args.output_min_wer) or \
|
||||
(args.output_max_wer and wer * 100.0 > args.output_max_wer) or \
|
||||
(args.output_min_length and len(fragment_matched) < args.output_min_length) or \
|
||||
(args.output_max_length and len(fragment_matched) > args.output_max_length):
|
||||
continue
|
||||
original_start = tc.get_original_offset(match.start)
|
||||
original_end = tc.get_original_offset(match.end)
|
||||
result_fragment = {
|
||||
'time-start': time_start,
|
||||
'time-length': time_length,
|
||||
'text-start': original_start,
|
||||
'text-length': original_end-original_start,
|
||||
'score': score,
|
||||
'cer': cer,
|
||||
'wer': wer
|
||||
}
|
||||
if args.output_stt:
|
||||
result_fragment['stt'] = fragment_transcript
|
||||
if args.output_aligned:
|
||||
result_fragment['aligned'] = fragment_matched
|
||||
if args.output_aligned_raw:
|
||||
result_fragment['aligned-raw'] = original_transcript[original_start:original_end]
|
||||
result_fragments.append(result_fragment)
|
||||
logging.debug('Sample with WER %.2f CER %.2f Score %f' % (wer * 100, cer * 100, score))
|
||||
logging.debug('- T: ' + args.text_context * ' ' + '%s' % fragment_transcript)
|
||||
logging.debug('- O: %s|%s|%s' % (
|
||||
tc.clean_text[match.start-args.text_context:match.start],
|
||||
fragment_matched,
|
||||
tc.clean_text[match.end:match.end+args.text_context]))
|
||||
start = match.end
|
||||
if args.play:
|
||||
subprocess.check_call(['play',
|
||||
'--no-show-progress',
|
||||
args.audio,
|
||||
'trim',
|
||||
str(time_start/1000.0),
|
||||
'='+str((time_start + time_length)/1000.0)])
|
||||
sample_numbers.insert(0, 'WER: %.2f' % wer * 100)
|
||||
if args.output_wer:
|
||||
result_fragment['wer'] = wer
|
||||
if args.output_min_wer and score < args.output_min_wer:
|
||||
skip('Word error rate (WER) too low', index)
|
||||
continue
|
||||
if args.output_max_wer and score > args.output_max_wer:
|
||||
skip('Word error rate (WER) too high', index)
|
||||
continue
|
||||
if args.output_stt:
|
||||
result_fragment['stt'] = fragment_transcript
|
||||
if args.output_aligned:
|
||||
result_fragment['aligned'] = fragment_matched
|
||||
if args.output_aligned_raw:
|
||||
result_fragment['aligned-raw'] = original_transcript[original_start:original_end]
|
||||
result_fragments.append(result_fragment)
|
||||
logging.debug('Fragment %d aligned with %s' % (index, ' '.join(sample_numbers)))
|
||||
logging.debug('- T: ' + args.text_context * ' ' + '"%s"' % fragment_transcript)
|
||||
logging.debug('- O: %s|%s|%s' % (
|
||||
tc.clean_text[match.start-args.text_context:match.start],
|
||||
fragment_matched,
|
||||
tc.clean_text[match.end:match.end+args.text_context]))
|
||||
start = match.end
|
||||
if args.play:
|
||||
subprocess.check_call(['play',
|
||||
'--no-show-progress',
|
||||
args.audio,
|
||||
'trim',
|
||||
str(time_start / 1000.0),
|
||||
'='+str((time_start + time_length) / 1000.0)])
|
||||
with open(args.result, 'w') as result_file:
|
||||
result_file.write(json.dumps(result_fragments))
|
||||
|
||||
logging.info('Aligned %d fragments.' % len(result_fragments))
|
||||
skipped = len(fragments)-len(result_fragments)
|
||||
logging.info('Skipped %d fragments (%.2f%%).' % (skipped, skipped*100.0/len(fragments)))
|
||||
logging.info('Aligned %d fragments' % len(result_fragments))
|
||||
skipped = len(fragments) - len(result_fragments)
|
||||
logging.info('Skipped %d fragments (%.2f%%):' % (skipped, skipped * 100.0 / len(fragments)))
|
||||
for key, number in statistics.most_common():
|
||||
logging.info(' - %s: %d' % (key, number))
|
||||
print(substitutions)
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
|
@ -1,29 +1,27 @@
|
|||
from collections import Counter
|
||||
from nltk import ngrams
|
||||
from text import levenshtein, TextRange
|
||||
from utils import circulate, by_len
|
||||
import random
|
||||
from text import TextRange
|
||||
|
||||
|
||||
class FuzzySearch(object):
|
||||
def __init__(self,
|
||||
text,
|
||||
max_candidates=10,
|
||||
candidate_threshold=0.92,
|
||||
snap_token=True,
|
||||
stretch_factor=1/3,
|
||||
match_score=3,
|
||||
mismatch_score=0,
|
||||
delete_score=-4,
|
||||
insert_score=-4,
|
||||
snap_to_word=True,
|
||||
snap_radius=0,
|
||||
match_score=100,
|
||||
mismatch_score=-100,
|
||||
gap_score=-100,
|
||||
similarities=None):
|
||||
self.text = text
|
||||
self.max_candidates = max_candidates
|
||||
self.candidate_threshold = candidate_threshold
|
||||
self.snap_token = snap_token
|
||||
self.snap_to_word = snap_to_word
|
||||
self.snap_radius = snap_radius
|
||||
self.match_score = match_score
|
||||
self.mismatch_score = mismatch_score
|
||||
self.delete_score = delete_score
|
||||
self.insert_score = insert_score
|
||||
self.gap_score = gap_score
|
||||
self.similarities = similarities
|
||||
self.ngrams = {}
|
||||
for i, ngram in enumerate(ngrams(' ' + text + ' ', 3)):
|
||||
|
@ -47,38 +45,38 @@ class FuzzySearch(object):
|
|||
|
||||
def sw_align(self, a, b):
|
||||
n, m = len(a), len(b)
|
||||
f = [[]] * (n + 1)
|
||||
# building scoring matrix
|
||||
f = [[0]] * (n + 1)
|
||||
for i in range(0, n + 1):
|
||||
f[i] = [0] * (m + 1)
|
||||
for i in range(1, n + 1):
|
||||
f[i][0] = self.insert_score * i
|
||||
f[i][0] = self.gap_score * i
|
||||
for j in range(1, m + 1):
|
||||
f[0][j] = self.delete_score * j
|
||||
f[0][j] = self.gap_score * j
|
||||
max_score = 0
|
||||
start_i, start_j = 0, 0
|
||||
for i in range(1, n + 1):
|
||||
for j in range(1, m + 1):
|
||||
match = f[i - 1][j - 1] + self.similarity(a[i - 1], b[j - 1])
|
||||
insert = f[i][j - 1] + self.insert_score
|
||||
delete = f[i - 1][j] + self.delete_score
|
||||
insert = f[i][j - 1] + self.gap_score
|
||||
delete = f[i - 1][j] + self.gap_score
|
||||
score = max(0, match, insert, delete)
|
||||
f[i][j] = score
|
||||
if score > max_score:
|
||||
max_score = score
|
||||
start_i, start_j = i, j
|
||||
|
||||
# backtracking
|
||||
substitutions = Counter()
|
||||
i, j = start_i, start_j
|
||||
while (j > 0 or i > 0) and f[i][j] != 0:
|
||||
ca, cb = a[i - 1] if i > 0 else ' ', b[j - 1] if j > 0 else ' '
|
||||
s = self.similarity(ca, cb)
|
||||
if i > 0 and j > 0 and f[i][j] == (f[i - 1][j - 1] + s):
|
||||
if ca != cb:
|
||||
substitutions[FuzzySearch.similarity_key(ca, cb)] += 1
|
||||
substitutions[FuzzySearch.similarity_key(ca, cb)] += 1
|
||||
i, j = i - 1, j - 1
|
||||
elif i > 0 and f[i][j] == (f[i - 1][j] + self.delete_score):
|
||||
elif i > 0 and f[i][j] == (f[i - 1][j] + self.gap_score):
|
||||
i -= 1
|
||||
elif j > 0 and f[i][j] == (f[i][j - 1] + self.insert_score):
|
||||
elif j > 0 and f[i][j] == (f[i][j - 1] + self.gap_score):
|
||||
j -= 1
|
||||
else:
|
||||
raise Exception('Smith–Waterman failure')
|
||||
|
|
Загрузка…
Ссылка в новой задаче