зеркало из https://github.com/mozilla/DSAlign.git
Weighted N-gram similarity integration
This commit is contained in:
Родитель
c6efe188a2
Коммит
37b7313d25
|
@ -74,6 +74,14 @@ def main(args):
|
|||
align_group.add_argument('--align-snap-radius', type=int, required=False, default=0,
|
||||
help='How many words to look up to the left and right for snapping to word ' +
|
||||
'boundaries at the beginning and end of each phrase')
|
||||
align_group.add_argument('--align-min-ngram-size', type=int, required=False, default=1,
|
||||
help='Minimum N-gram size for weighted N-gram similarity during snapping (default: 1)')
|
||||
align_group.add_argument('--align-max-ngram-size', type=int, required=False, default=3,
|
||||
help='Maximum N-gram size for weighted N-gram similarity during snapping (default: 3)')
|
||||
align_group.add_argument('--align-ngram-size-factor', type=int, required=False, default=1,
|
||||
help='Size weight for weighted N-gram similarity during snapping (default: 1)')
|
||||
align_group.add_argument('--align-ngram-position-factor', type=int, required=False, default=3,
|
||||
help='Position weight for weighted N-gram similarity during snapping (default: 3)')
|
||||
align_group.add_argument('--align-min-length', type=int, required=False, default=4,
|
||||
help='Minimum STT phrase length to align (default: 4)')
|
||||
align_group.add_argument('--align-max-length', type=int, required=False,
|
||||
|
@ -86,14 +94,22 @@ def main(args):
|
|||
help='Writes clean aligned original transcripts to result file')
|
||||
output_group.add_argument('--output-aligned-raw', action="store_true",
|
||||
help='Writes raw aligned original transcripts to result file')
|
||||
output_group.add_argument('--output-wng-min-ngram-size', type=int, required=False, default=1,
|
||||
help='Minimum N-gram size for weighted N-gram similarity filter (default: 1)')
|
||||
output_group.add_argument('--output-wng-max-ngram-size', type=int, required=False, default=3,
|
||||
help='Maximum N-gram size for weighted N-gram similarity filter (default: 3)')
|
||||
output_group.add_argument('--output-wng-ngram-size-factor', type=int, required=False, default=1,
|
||||
help='Size weight for weighted N-gram similarity filter (default: 1)')
|
||||
output_group.add_argument('--output-wng-ngram-position-factor', type=int, required=False, default=3,
|
||||
help='Position weight for weighted N-gram similarity filter (default: 3)')
|
||||
|
||||
named_numbers = {
|
||||
'tlen': ('transcript length', int, None),
|
||||
'mlen': ('match length', int, None),
|
||||
'SWS': ('Smith-Waterman score', float, 'From 0.0 (not equal at all) to 100.0+ (pretty equal)'),
|
||||
'WNG': ('weighted N-gram intersection', float, 'From 0.0 (not equal at all) to 100.0 (totally equal)'),
|
||||
'CER': ('character error rate', float, 'From 0.0 (no different words) to 100.0+ (total miss)'),
|
||||
'WER': ('word error rate', float, 'From 0.0 (no wrong characters) to 100.0+ (total miss)')
|
||||
'tlen': ('transcript length', int, None),
|
||||
'mlen': ('match length', int, None),
|
||||
'SWS': ('Smith-Waterman score', float, 'From 0.0 (not equal at all) to 100.0+ (pretty equal)'),
|
||||
'WNG': ('weighted N-gram similarity', float, 'From 0.0 (not equal at all) to 100.0 (totally equal)'),
|
||||
'CER': ('character error rate', float, 'From 0.0 (no different words) to 100.0+ (total miss)'),
|
||||
'WER': ('word error rate', float, 'From 0.0 (no wrong characters) to 100.0+ (total miss)')
|
||||
}
|
||||
|
||||
for short in named_numbers.keys():
|
||||
|
@ -225,10 +241,14 @@ def main(args):
|
|||
max_candidates=args.align_max_candidates,
|
||||
candidate_threshold=args.align_candidate_threshold,
|
||||
snap_to_word=not args.align_no_snap,
|
||||
snap_radius=not args.align_snap_radius,
|
||||
snap_radius=args.align_snap_radius,
|
||||
match_score=args.align_match_score,
|
||||
mismatch_score=args.align_mismatch_score,
|
||||
gap_score=args.align_gap_score)
|
||||
gap_score=args.align_gap_score,
|
||||
min_ngram_size=args.align_min_ngram_size,
|
||||
max_ngram_size=args.align_max_ngram_size,
|
||||
size_factor=args.align_ngram_size_factor,
|
||||
position_factor=args.align_ngram_position_factor)
|
||||
result_fragments = []
|
||||
substitutions = Counter()
|
||||
statistics = Counter()
|
||||
|
@ -298,16 +318,22 @@ def main(args):
|
|||
continue
|
||||
|
||||
if should_skip('WNG', index, sample_numbers,
|
||||
lambda: 100 * similarity(fragment_matched, fragment_transcript)):
|
||||
lambda: 100 * similarity(fragment_matched,
|
||||
fragment_transcript,
|
||||
min_ngram_size=args.output_wng_min_ngram_size,
|
||||
max_ngram_size=args.output_wng_max_ngram_size,
|
||||
size_factor=args.output_wng_ngram_size_factor,
|
||||
position_factor=args.output_wng_ngram_position_factor)):
|
||||
continue
|
||||
|
||||
if should_skip('CER', index, sample_numbers,
|
||||
lambda: 100 * levenshtein(fragment_transcript, fragment_matched) / len(fragment_matched)):
|
||||
lambda: 100 * levenshtein(fragment_transcript, fragment_matched) /
|
||||
len(fragment_matched)):
|
||||
continue
|
||||
|
||||
if should_skip('WER', index, sample_numbers,
|
||||
lambda: 100 * levenshtein(fragment_transcript.split(),
|
||||
fragment_matched.split())/len(fragment_matched.split())):
|
||||
lambda: 100 * levenshtein(fragment_transcript.split(), fragment_matched.split()) /
|
||||
len(fragment_matched.split())):
|
||||
continue
|
||||
|
||||
result_fragments.append(result_fragment)
|
||||
|
@ -334,5 +360,6 @@ def main(args):
|
|||
for key, number in statistics.most_common():
|
||||
logging.info(' - %s: %d' % (key, number))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main(sys.argv[1:])
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from collections import Counter
|
||||
from text import TextRange, ngrams
|
||||
from text import TextRange, ngrams, similarity
|
||||
|
||||
|
||||
class FuzzySearch(object):
|
||||
|
@ -9,6 +9,10 @@ class FuzzySearch(object):
|
|||
candidate_threshold=0.92,
|
||||
snap_to_word=True,
|
||||
snap_radius=0,
|
||||
min_ngram_size=1,
|
||||
max_ngram_size=3,
|
||||
size_factor=1,
|
||||
position_factor=3,
|
||||
match_score=100,
|
||||
mismatch_score=-100,
|
||||
gap_score=-100,
|
||||
|
@ -18,6 +22,10 @@ class FuzzySearch(object):
|
|||
self.candidate_threshold = candidate_threshold
|
||||
self.snap_to_word = snap_to_word
|
||||
self.snap_radius = snap_radius
|
||||
self.min_ngram_size = min_ngram_size
|
||||
self.max_ngram_size = max_ngram_size
|
||||
self.size_factor = size_factor
|
||||
self.position_factor = position_factor
|
||||
self.match_score = match_score
|
||||
self.mismatch_score = mismatch_score
|
||||
self.gap_score = gap_score
|
||||
|
@ -80,11 +88,40 @@ class FuzzySearch(object):
|
|||
raise Exception('Smith–Waterman failure')
|
||||
return start + j - 1, start + start_j, f[start_i][start_j], substitutions
|
||||
|
||||
def snap(self, start, end):
|
||||
def phrase_similarity(self, a, b, direction):
    """Score how similar two phrases are using this instance's N-gram settings.

    Delegates to ``similarity`` and forwards the N-gram configuration stored
    on the instance (``min_ngram_size``, ``max_ngram_size``, ``size_factor``
    and ``position_factor``).

    :param a: first phrase to compare
    :param b: second phrase to compare
    :param direction: passed through unchanged as ``direction`` to
        ``similarity``; callers in this class pass -1 or 1
    :return: whatever ``similarity`` returns for the two phrases
    """
    ngram_settings = {
        'min_ngram_size': self.min_ngram_size,
        'max_ngram_size': self.max_ngram_size,
        'size_factor': self.size_factor,
        'position_factor': self.position_factor,
    }
    return similarity(a, b, direction=direction, **ngram_settings)
|
||||
|
||||
def extend(self, target, start_token, direction):
    """Pick the token near ``start_token`` that best matches ``target``.

    Starting at ``start_token``, visits ``snap_radius + 1`` tokens by
    repeatedly stepping to ``neighbour_token(direction)``. Each visited token
    is scored twice against ``target``: once by its own text and once by the
    text of the range obtained from ``token + start_token``. A token replaces
    the current best only when one of its scores is strictly greater than the
    best score seen so far (which starts at 0).

    :param target: phrase text the candidate tokens are compared against
    :param start_token: token to start from; also the fallback result when no
        candidate scores above 0
    :param direction: step direction handed to ``neighbour_token`` and to
        ``phrase_similarity``
    :return: the best-scoring token, or ``start_token`` if nothing scored
        above 0
    """
    top_score = 0
    winner = start_token
    candidate = start_token
    for _ in range(self.snap_radius + 1):
        # Score the candidate token on its own.
        own_score = self.phrase_similarity(candidate.get_text(), target, direction)
        if own_score > top_score:
            top_score, winner = own_score, candidate
        # Score the span formed by combining the candidate with the start token.
        merged_score = self.phrase_similarity((candidate + start_token).get_text(), target, direction)
        if merged_score > top_score:
            top_score, winner = merged_score, candidate
        candidate = candidate.neighbour_token(direction)
    return winner
|
||||
|
||||
def snap(self, look_for, start, end):
    """Snap a character interval of ``self.text`` to token boundaries.

    Looks up the token at ``start`` (retrying at ``start + 1`` when that token
    has length 0) and the token at ``end - 1`` (clamped to 0). When
    ``snap_radius`` is positive, both boundary tokens are refined through
    ``extend``: the first space-separated word of ``look_for`` is matched
    to the left (direction -1) and the last word to the right (direction 1).

    :param look_for: phrase being aligned; split on single spaces
    :param start: start offset of the interval within ``self.text``
    :param end: end offset of the interval within ``self.text``
    :return: ``(start, end)`` of the range produced by adding the final start
        and end tokens together
    """
    first_token = TextRange.token_at(self.text, start)
    if len(first_token) == 0:
        # Nothing found at ``start`` itself; retry one position further.
        first_token = TextRange.token_at(self.text, start + 1)
    last_token = TextRange.token_at(self.text, max(0, end - 1))

    if self.snap_radius > 0:
        words = look_for.split(' ')
        first_token = self.extend(words[0], first_token, -1)
        last_token = self.extend(words[-1], last_token, 1)

    snapped = first_token + last_token
    return snapped.start, snapped.end
|
||||
|
||||
|
@ -114,7 +151,7 @@ class FuzzySearch(object):
|
|||
interval_end = min(stop, int((window + 2) * window_size))
|
||||
interval_start, interval_end, score, substitutions = self.sw_align(look_for, interval_start, interval_end)
|
||||
if self.snap_to_word:
|
||||
interval_start, interval_end = self.snap(interval_start, interval_end)
|
||||
interval_start, interval_end = self.snap(look_for, interval_start, interval_end)
|
||||
if score > best_score:
|
||||
best_interval = TextRange(self.text, interval_start, interval_end)
|
||||
best_score = score
|
||||
|
|
Загрузка…
Ссылка в новой задаче