This commit is contained in:
Tilman Kamp 2019-07-26 18:57:03 +02:00
Parent a3ef2b0033
Commit 82451bcc9c
2 changed files: 80 additions and 83 deletions

View file

@ -51,7 +51,7 @@ def main(args):
align_group = parser.add_argument_group(title='Alignment algorithm options')
align_group.add_argument('--align-max-candidates', type=int, required=False, default=10,
help='How many global 3gram match candidates are tested at max (default: 10)')
align_group.add_argument('--align-candidate-threshold', type=float, required=False, default=0.8,
align_group.add_argument('--align-candidate-threshold', type=float, required=False, default=0.5,
help='Factor for how many 3grams the next candidate should have at least ' +
'compared to its predecessor (default: 0.8)')
align_group.add_argument('--align-no-snap-to-token', action="store_true",

View file

@ -1,3 +1,4 @@
from collections import Counter
from nltk import ngrams
from text import levenshtein, TextRange
from utils import circulate, by_len
@ -10,14 +11,20 @@ class FuzzySearch(object):
candidate_threshold=0.8,
snap_token=True,
stretch_factor=1/3,
missed_penalty=1.0):
match_score=2,
mismatch_score=-2,
delete_score=-1,
insert_score=-1,
similarities=None):
self.text = text
self.max_candidates = max_candidates
self.candidate_threshold = candidate_threshold
self.snap_token = snap_token
self.stretch_factor = stretch_factor
self.missed_penalty = missed_penalty
self.word_distances = {}
self.match_score = match_score
self.mismatch_score = mismatch_score
self.delete_score = delete_score
self.insert_score = insert_score
self.similarities = similarities
self.ngrams = {}
for i, ngram in enumerate(ngrams(' ' + text + ' ', 3)):
if ngram in self.ngrams:
@ -26,56 +33,69 @@ class FuzzySearch(object):
ngram_bucket = self.ngrams[ngram] = []
ngram_bucket.append(i)
def get_missed_distance(self, missed):
    """Return the penalized distance for a run of tokens with no counterpart.

    The tokens are joined into one space-separated string, measured against
    the empty string via self.word_distance, and scaled by the configured
    self.missed_penalty factor.
    """
    missed_text = ' '.join(missed)
    return self.missed_penalty * self.word_distance('', missed_text)
@staticmethod
def similarity_key(a, b):
    """Return a canonical, order-independent key for the symbol pair (a, b).

    The smaller symbol is concatenated before the larger one, so
    similarity_key(x, y) == similarity_key(y, x) for any two symbols.
    """
    first, second = sorted((a, b))
    return first + second
def find_best_token_range(self, look_for, look_in, alignment=0):
if len(look_for) == 0:
return 0, (0, 0)
def similarity(self, a, b):
    """Return the alignment score for pairing symbol `a` with symbol `b`.

    A user-supplied similarity table (keyed via FuzzySearch.similarity_key)
    takes precedence; otherwise the configured match/mismatch scores apply.
    """
    key = FuzzySearch.similarity_key(a, b)
    table = self.similarities
    if table and key in table:
        return table[key]
    if a == b:
        return self.match_score
    return self.mismatch_score
if len(look_in) == 0:
return self.get_missed_distance(look_for), (0, 0)
def nwmatch(self, a, b):
n, m = len(a), len(b)
f = [[0] * m] * n
for i in range(n):
f[i][0] = self.insert_score * i # CHECK: correct order of delete and insert scores?
for j in range(m):
f[0][j] = self.delete_score * j
max_score = 0
start_i, start_j = 0, 0
for i in range(1, n):
for j in range(1, m):
match = f[i - 1][j - 1] + self.similarity(a[i], b[j])
delete = f[i - 1][j] + self.delete_score
insert = f[i][j - 1] + self.insert_score
score = max(0, match, insert, delete)
f[i][j] = score
if score > max_score:
max_score = score
start_i, start_j = i, j
dc, i, j = self.find_similar_words(look_for, look_in, alignment=alignment)
substitutions = Counter()
score = 0
match_start, match_len = -1, 0
i, j = start_i, start_j
align_a, align_b = '', ''
while (j > 0 or i > 0) and f[i][j] != 0:
if i > 0 and j > 0 and f[i][j] == (f[i - 1][j - 1] + self.similarity(a[i], b[j])):
align_a = a[i] + align_a
align_b = b[j] + align_b
score += self.similarity(a[i], b[j])
substitutions[FuzzySearch.similarity_key(a[i], b[j])] += 1
i, j = i - 1, j - 1
elif i > 0 and f[i][j] == (f[i - 1][j] + self.delete_score):
print('D')
align_a = a[i] + align_a
align_b = '-' + align_b
score += self.delete_score
i -= 1
elif j > 0 and f[i][j] == (f[i][j - 1] + self.insert_score):
print('I')
align_a = '-' + align_a
align_b = b[j] + align_b
score += self.insert_score
j -= 1
else:
print('Warum?', i, j, self.similarity(a[i], b[j]), f[i-1][j-1], f[i-1][j], f[i][j-1], f[i][j])
look_for_left = look_for[:i]
look_in_left = look_in[:j]
dl, il = self.find_best_token_range(look_for_left, look_in_left, alignment=1 if alignment == 1 else 0)
ls, le = il
ml = look_in_left[le+1:]
dml = self.get_missed_distance(ml)
print(align_a)
print(align_b)
look_for_right = look_for[i+1:]
look_in_right = look_in[j+1:]
dr, ir = self.find_best_token_range(look_for_right, look_in_right, alignment=-1 if alignment == -1 else 0)
rs, re = ir
mr = look_in_right[:rs]
dmr = self.get_missed_distance(mr)
start = ls if le - ls > 0 else j
end = j + 1 + re
print('Start: %d, End: %d' % (start, end))
return (dl + dml + dc + dmr + dr) / (end - start), (start, end)
def find_best_in_interval(self, look_for, start, stop):
tokens_look_for = look_for.split()
look_in_start_token = TextRange.token_at(self.text, start)
if len(look_in_start_token) == 0:
look_in_start_token = look_in_start_token.next_token()
look_in_end_token = TextRange.token_at(self.text, stop)
if len(look_in_end_token) == 0:
look_in_end_token = look_in_end_token.prev_token()
look_in_range = look_in_start_token + look_in_end_token
print('Searching for "%s"' % look_for)
print('Searching in "%s"' % look_in_range.get_text())
tokens_look_in = look_in_range.get_text().split()
distance, token_range = self.find_best_token_range(tokens_look_for, tokens_look_in)
token_start, token_end = token_range
text_start = look_in_range.start + len(''.join(tokens_look_in[:token_start])) + token_start
text_end = text_start + len(' '.join(tokens_look_in[token_start:token_end]))
return distance, TextRange(self.text, text_start, text_end)
return match_start, match_len, score, substitutions
def find_best(self, look_for, start=0, stop=-1):
stop = len(self.text) if stop < 0 else stop
@ -91,43 +111,20 @@ class FuzzySearch(object):
windows[window] = (windows[window] + 1) if window in windows else 1
candidate_windows = sorted(windows.keys(), key=lambda w: windows[w], reverse=True)
best_interval = None
best_distance = -1
best_score = -10000000000
last_window_grams = 0.1
for window in candidate_windows[:self.max_candidates]:
if windows[window] / last_window_grams < self.candidate_threshold:
print('Next candidate window below threshold')
break
last_window_grams = windows[window]
interval_start = max(start, int((window-0.5)*window_size))
interval_stop = min(stop, int((window+1.5)*window_size))
interval_distance, interval = self.find_best_in_interval(look_for, interval_start, interval_stop)
if not best_interval or interval_distance < best_distance:
interval_end = min(stop, int((window+1.5)*window_size))
interval_text = self.text[interval_start:interval_end]
match_start, match_len, score, substitutions = self.nwmatch(look_for, interval_text)
match_start += interval_start
interval = TextRange(self.text, match_start, match_start + match_len)
if score > best_score:
print('new best')
best_interval = interval
best_distance = interval_distance
return best_interval, best_distance
def word_distance(self, a, b):
key = (a, b)
if key in self.word_distances:
return self.word_distances[key]
avg_len = max(len(a), 1)
s = self.word_distances[key] = (levenshtein(a, b) / avg_len)
return s
def find_similar_words(self,
look_for,
look_in,
alignment=0,
distance_threshold=0.10):
lli = len(look_in)
for i, wa in by_len(look_for):
for j, wb in by_len(look_in):
d = self.word_distance(wa, wb)
off = abs(lli//2 - j) if alignment == 0 else ((lli - j) if alignment > 0 else j)
panelty = off / (lli * len(look_for))
if d < distance_threshold + 8 * panelty:
print('Accepted with distance %.2f: "%s" - "%s"' % (d, wa, wb))
return d, i, j
return self.find_similar_words(look_for,
look_in,
distance_threshold=distance_threshold*1.1)
best_score = score
return best_interval, best_score