Weighted N-gram similarity integration

This commit is contained in:
Tilman Kamp 2019-08-02 18:24:32 +02:00
Родитель c6efe188a2
Коммит 37b7313d25
2 изменённых файлов: 79 добавлений и 15 удалений

Просмотреть файл

@ -74,6 +74,14 @@ def main(args):
align_group.add_argument('--align-snap-radius', type=int, required=False, default=0,
help='How many words to look up to the left and right for snapping to word ' +
'boundaries at the beginning and end of each phrase')
align_group.add_argument('--align-min-ngram-size', type=int, required=False, default=1,
help='Minimum N-gram size for weighted N-gram similarity during snapping (default: 1)')
align_group.add_argument('--align-max-ngram-size', type=int, required=False, default=3,
help='Maximum N-gram size for weighted N-gram similarity during snapping (default: 3)')
align_group.add_argument('--align-ngram-size-factor', type=int, required=False, default=1,
help='Size weight for weighted N-gram similarity during snapping (default: 1)')
align_group.add_argument('--align-ngram-position-factor', type=int, required=False, default=3,
help='Position weight for weighted N-gram similarity during snapping (default: 3)')
align_group.add_argument('--align-min-length', type=int, required=False, default=4,
help='Minimum STT phrase length to align (default: 4)')
align_group.add_argument('--align-max-length', type=int, required=False,
@ -86,14 +94,22 @@ def main(args):
help='Writes clean aligned original transcripts to result file')
output_group.add_argument('--output-aligned-raw', action="store_true",
help='Writes raw aligned original transcripts to result file')
output_group.add_argument('--output-wng-min-ngram-size', type=int, required=False, default=1,
help='Minimum N-gram size for weighted N-gram similarity filter (default: 1)')
output_group.add_argument('--output-wng-max-ngram-size', type=int, required=False, default=3,
help='Maximum N-gram size for weighted N-gram similarity filter (default: 3)')
output_group.add_argument('--output-wng-ngram-size-factor', type=int, required=False, default=1,
help='Size weight for weighted N-gram similarity filter (default: 1)')
output_group.add_argument('--output-wng-ngram-position-factor', type=int, required=False, default=3,
help='Position weight for weighted N-gram similarity filter (default: 3)')
named_numbers = {
'tlen': ('transcript length', int, None),
'mlen': ('match length', int, None),
'SWS': ('Smith-Waterman score', float, 'From 0.0 (not equal at all) to 100.0+ (pretty equal)'),
'WNG': ('weighted N-gram intersection', float, 'From 0.0 (not equal at all) to 100.0 (totally equal)'),
'CER': ('character error rate', float, 'From 0.0 (no different words) to 100.0+ (total miss)'),
'WER': ('word error rate', float, 'From 0.0 (no wrong characters) to 100.0+ (total miss)')
'tlen': ('transcript length', int, None),
'mlen': ('match length', int, None),
'SWS': ('Smith-Waterman score', float, 'From 0.0 (not equal at all) to 100.0+ (pretty equal)'),
'WNG': ('weighted N-gram similarity', float, 'From 0.0 (not equal at all) to 100.0 (totally equal)'),
'CER': ('character error rate', float, 'From 0.0 (no different words) to 100.0+ (total miss)'),
'WER': ('word error rate', float, 'From 0.0 (no wrong characters) to 100.0+ (total miss)')
}
for short in named_numbers.keys():
@ -225,10 +241,14 @@ def main(args):
max_candidates=args.align_max_candidates,
candidate_threshold=args.align_candidate_threshold,
snap_to_word=not args.align_no_snap,
snap_radius=not args.align_snap_radius,
snap_radius=args.align_snap_radius,
match_score=args.align_match_score,
mismatch_score=args.align_mismatch_score,
gap_score=args.align_gap_score)
gap_score=args.align_gap_score,
min_ngram_size=args.align_min_ngram_size,
max_ngram_size=args.align_max_ngram_size,
size_factor=args.align_ngram_size_factor,
position_factor=args.align_ngram_position_factor)
result_fragments = []
substitutions = Counter()
statistics = Counter()
@ -298,16 +318,22 @@ def main(args):
continue
if should_skip('WNG', index, sample_numbers,
lambda: 100 * similarity(fragment_matched, fragment_transcript)):
lambda: 100 * similarity(fragment_matched,
fragment_transcript,
min_ngram_size=args.output_wng_min_ngram_size,
max_ngram_size=args.output_wng_max_ngram_size,
size_factor=args.output_wng_ngram_size_factor,
position_factor=args.output_wng_ngram_position_factor)):
continue
if should_skip('CER', index, sample_numbers,
lambda: 100 * levenshtein(fragment_transcript, fragment_matched) / len(fragment_matched)):
lambda: 100 * levenshtein(fragment_transcript, fragment_matched) /
len(fragment_matched)):
continue
if should_skip('WER', index, sample_numbers,
lambda: 100 * levenshtein(fragment_transcript.split(),
fragment_matched.split())/len(fragment_matched.split())):
lambda: 100 * levenshtein(fragment_transcript.split(), fragment_matched.split()) /
len(fragment_matched.split())):
continue
result_fragments.append(result_fragment)
@ -334,5 +360,6 @@ def main(args):
for key, number in statistics.most_common():
logging.info(' - %s: %d' % (key, number))
if __name__ == '__main__':
main(sys.argv[1:])

Просмотреть файл

@ -1,5 +1,5 @@
from collections import Counter
from text import TextRange, ngrams
from text import TextRange, ngrams, similarity
class FuzzySearch(object):
@ -9,6 +9,10 @@ class FuzzySearch(object):
candidate_threshold=0.92,
snap_to_word=True,
snap_radius=0,
min_ngram_size=1,
max_ngram_size=3,
size_factor=1,
position_factor=3,
match_score=100,
mismatch_score=-100,
gap_score=-100,
@ -18,6 +22,10 @@ class FuzzySearch(object):
self.candidate_threshold = candidate_threshold
self.snap_to_word = snap_to_word
self.snap_radius = snap_radius
self.min_ngram_size = min_ngram_size
self.max_ngram_size = max_ngram_size
self.size_factor = size_factor
self.position_factor = position_factor
self.match_score = match_score
self.mismatch_score = mismatch_score
self.gap_score = gap_score
@ -80,11 +88,40 @@ class FuzzySearch(object):
raise Exception('Smith–Waterman failure')
return start + j - 1, start + start_j, f[start_i][start_j], substitutions
def snap(self, start, end):
def phrase_similarity(self, a, b, direction):
    """Score phrase *a* against phrase *b* with this instance's weighted
    N-gram similarity settings.

    NOTE(review): delegates to ``text.similarity`` — the exact semantics of
    ``direction`` live there; presumably -1/1 for left/right extension.
    """
    wng_options = dict(direction=direction,
                       min_ngram_size=self.min_ngram_size,
                       max_ngram_size=self.max_ngram_size,
                       size_factor=self.size_factor,
                       position_factor=self.position_factor)
    return similarity(a, b, **wng_options)
def extend(self, target, start_token, direction):
    """Walk up to ``snap_radius`` tokens away from ``start_token`` in the
    given direction and return the token whose text (alone, or merged with
    ``start_token``) is most similar to ``target``.

    NOTE(review): when the merged range scores best, the bare walked token —
    not the merged range — is kept; ``snap`` later re-merges it with the
    opposite boundary, which presumably covers the full span. Confirm.
    """
    best_score = 0
    best_token = start_token
    token = start_token
    for _ in range(self.snap_radius + 1):
        # Evaluate the walked token on its own, then merged with the anchor.
        for candidate in (token, token + start_token):
            score = self.phrase_similarity(candidate.get_text(), target, direction)
            if score > best_score:
                best_score = score
                best_token = token
        token = token.neighbour_token(direction)
    return best_token
def snap(self, look_for, start, end):
    """Snap the raw alignment interval to word boundaries of ``self.text``.

    When ``snap_radius`` > 0, each boundary token may additionally be shifted
    toward a neighbouring token that better matches the first/last word of
    ``look_for`` (weighted N-gram similarity via ``extend``).

    Returns the (start, end) character offsets of the snapped range.
    """
    first = TextRange.token_at(self.text, start)
    if not len(first):
        # `start` landed between words — retry one character to the right.
        first = TextRange.token_at(self.text, start + 1)
    last = TextRange.token_at(self.text, max(0, end - 1))
    if self.snap_radius > 0:
        words = look_for.split(' ')
        first = self.extend(words[0], first, -1)
        last = self.extend(words[-1], last, 1)
    merged = first + last
    return merged.start, merged.end
@ -114,7 +151,7 @@ class FuzzySearch(object):
interval_end = min(stop, int((window + 2) * window_size))
interval_start, interval_end, score, substitutions = self.sw_align(look_for, interval_start, interval_end)
if self.snap_to_word:
interval_start, interval_end = self.snap(interval_start, interval_end)
interval_start, interval_end = self.snap(look_for, interval_start, interval_end)
if score > best_score:
best_interval = TextRange(self.text, interval_start, interval_end)
best_score = score