Mirror of https://github.com/mozilla/DSAlign.git
This commit is contained in:
Parent: a3ef2b0033
Commit: 82451bcc9c
@@ -51,7 +51,7 @@ def main(args):
     align_group = parser.add_argument_group(title='Alignment algorithm options')
     align_group.add_argument('--align-max-candidates', type=int, required=False, default=10,
                              help='How many global 3gram match candidates are tested at max (default: 10)')
-    align_group.add_argument('--align-candidate-threshold', type=float, required=False, default=0.8,
-                             help='Factor for how many 3grams the next candidate should have at least ' +
-                                  'compared to its predecessor (default: 0.8)')
+    align_group.add_argument('--align-candidate-threshold', type=float, required=False, default=0.5,
+                             help='Factor for how many 3grams the next candidate should have at least ' +
+                                  'compared to its predecessor (default: 0.5)')
     align_group.add_argument('--align-no-snap-to-token', action="store_true",
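Review note: lowering --align-candidate-threshold from 0.8 to 0.5 is a relaxation, not a tightening: a candidate window now only needs half as many 3-gram hits as its predecessor to still be tested. A minimal sketch of the pruning rule, assuming hit counts already sorted as in FuzzySearch.find_best below (the helper name and counts are made up):

# Hypothetical illustration of the --align-candidate-threshold pruning rule.
def prune_candidates(window_counts, threshold=0.5, max_candidates=10):
    kept = []
    last = None
    for count in sorted(window_counts, reverse=True)[:max_candidates]:
        # Stop as soon as a window has too few 3-gram hits
        # relative to its predecessor.
        if last is not None and count / last < threshold:
            break
        kept.append(count)
        last = count
    return kept

print(prune_candidates([120, 100, 40]))  # [120, 100]: 40/100 < 0.5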
align/search.py | 161
@@ -1,3 +1,4 @@
+from collections import Counter
 from nltk import ngrams
 from text import levenshtein, TextRange
 from utils import circulate, by_len
@@ -10,14 +11,20 @@ class FuzzySearch(object):
                  candidate_threshold=0.8,
                  snap_token=True,
                  stretch_factor=1/3,
-                 missed_penalty=1.0):
+                 match_score=2,
+                 mismatch_score=-2,
+                 delete_score=-1,
+                 insert_score=-1,
+                 similarities=None):
         self.text = text
         self.max_candidates = max_candidates
         self.candidate_threshold = candidate_threshold
         self.snap_token = snap_token
         self.stretch_factor = stretch_factor
-        self.missed_penalty = missed_penalty
-        self.word_distances = {}
+        self.match_score = match_score
+        self.mismatch_score = mismatch_score
+        self.delete_score = delete_score
+        self.insert_score = insert_score
+        self.similarities = similarities
         self.ngrams = {}
         for i, ngram in enumerate(ngrams(' ' + text + ' ', 3)):
             if ngram in self.ngrams:
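The constructor now takes Needleman-Wunsch-style scores (match/mismatch/delete/insert) plus an optional similarities table keyed by order-normalized character pairs, replacing the Levenshtein-based missed_penalty/word_distances machinery. A standalone sketch of the lookup behavior, with names following the diff; the confusions table is a hypothetical example:

# Order-normalize the pair so 'ab' and 'ba' share one table entry.
def similarity_key(a, b):
    if a > b:
        a, b = b, a
    return a + b

def similarity(a, b, similarities=None, match_score=2, mismatch_score=-2):
    key = similarity_key(a, b)
    if similarities and key in similarities:
        return similarities[key]
    return match_score if a == b else mismatch_score

# Score easily-confused characters as near-matches instead of full mismatches.
confusions = {similarity_key('o', '0'): 1}
print(similarity('o', '0', confusions))  # 1 rather than -2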
@@ -26,56 +33,69 @@ class FuzzySearch(object):
             ngram_bucket = self.ngrams[ngram] = []
         ngram_bucket.append(i)
 
-    def get_missed_distance(self, missed):
-        return self.word_distance('', ' '.join(missed)) * self.missed_penalty
-
-    def find_best_token_range(self, look_for, look_in, alignment=0):
-        if len(look_for) == 0:
-            return 0, (0, 0)
-        if len(look_in) == 0:
-            return self.get_missed_distance(look_for), (0, 0)
-        dc, i, j = self.find_similar_words(look_for, look_in, alignment=alignment)
-        look_for_left = look_for[:i]
-        look_in_left = look_in[:j]
-        dl, il = self.find_best_token_range(look_for_left, look_in_left, alignment=1 if alignment == 1 else 0)
-        ls, le = il
-        ml = look_in_left[le+1:]
-        dml = self.get_missed_distance(ml)
-        look_for_right = look_for[i+1:]
-        look_in_right = look_in[j+1:]
-        dr, ir = self.find_best_token_range(look_for_right, look_in_right, alignment=-1 if alignment == -1 else 0)
-        rs, re = ir
-        mr = look_in_right[:rs]
-        dmr = self.get_missed_distance(mr)
-        start = ls if le - ls > 0 else j
-        end = j + 1 + re
-        print('Start: %d, End: %d' % (start, end))
-        return (dl + dml + dc + dmr + dr) / (end - start), (start, end)
-
-    def find_best_in_interval(self, look_for, start, stop):
-        tokens_look_for = look_for.split()
-        look_in_start_token = TextRange.token_at(self.text, start)
-        if len(look_in_start_token) == 0:
-            look_in_start_token = look_in_start_token.next_token()
-        look_in_end_token = TextRange.token_at(self.text, stop)
-        if len(look_in_end_token) == 0:
-            look_in_end_token = look_in_end_token.prev_token()
-        look_in_range = look_in_start_token + look_in_end_token
-        print('Searching for "%s"' % look_for)
-        print('Searching in "%s"' % look_in_range.get_text())
-        tokens_look_in = look_in_range.get_text().split()
-        distance, token_range = self.find_best_token_range(tokens_look_for, tokens_look_in)
-        token_start, token_end = token_range
-        text_start = look_in_range.start + len(''.join(tokens_look_in[:token_start])) + token_start
-        text_end = text_start + len(' '.join(tokens_look_in[token_start:token_end]))
-        return distance, TextRange(self.text, text_start, text_end)
+    @staticmethod
+    def similarity_key(a, b):
+        if a > b:
+            a, b = b, a
+        return '' + a + b
+
+    def similarity(self, a, b):
+        key = FuzzySearch.similarity_key(a, b)
+        if self.similarities and key in self.similarities:
+            return self.similarities[key]
+        return self.match_score if a == b else self.mismatch_score
+
+    def nwmatch(self, a, b):
+        n, m = len(a), len(b)
+        f = [[0] * m for _ in range(n)]
+        for i in range(n):
+            f[i][0] = self.insert_score * i  # CHECK: correct order of delete and insert scores?
+        for j in range(m):
+            f[0][j] = self.delete_score * j
+        max_score = 0
+        start_i, start_j = 0, 0
+        for i in range(1, n):
+            for j in range(1, m):
+                match = f[i - 1][j - 1] + self.similarity(a[i], b[j])
+                delete = f[i - 1][j] + self.delete_score
+                insert = f[i][j - 1] + self.insert_score
+                score = max(0, match, insert, delete)
+                f[i][j] = score
+                if score > max_score:
+                    max_score = score
+                    start_i, start_j = i, j
+        substitutions = Counter()
+        score = 0
+        match_start, match_len = -1, 0
+        i, j = start_i, start_j
+        align_a, align_b = '', ''
+        while (j > 0 or i > 0) and f[i][j] != 0:
+            if i > 0 and j > 0 and f[i][j] == (f[i - 1][j - 1] + self.similarity(a[i], b[j])):
+                align_a = a[i] + align_a
+                align_b = b[j] + align_b
+                score += self.similarity(a[i], b[j])
+                substitutions[FuzzySearch.similarity_key(a[i], b[j])] += 1
+                i, j = i - 1, j - 1
+            elif i > 0 and f[i][j] == (f[i - 1][j] + self.delete_score):
+                print('D')
+                align_a = a[i] + align_a
+                align_b = '-' + align_b
+                score += self.delete_score
+                i -= 1
+            elif j > 0 and f[i][j] == (f[i][j - 1] + self.insert_score):
+                print('I')
+                align_a = '-' + align_a
+                align_b = b[j] + align_b
+                score += self.insert_score
+                j -= 1
+            else:
+                print('Warum?', i, j, self.similarity(a[i], b[j]), f[i-1][j-1], f[i-1][j], f[i][j-1], f[i][j])
+        print(align_a)
+        print(align_b)
+        return match_start, match_len, score, substitutions
 
     def find_best(self, look_for, start=0, stop=-1):
         stop = len(self.text) if stop < 0 else stop
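Despite the nw prefix, the max(0, ...) clamp and the traceback-until-zero loop make nwmatch a Smith-Waterman-style local alignment rather than Needleman-Wunsch. A self-contained sketch under that reading, using the diff's default scores; unlike the diff, it sizes the matrix (n+1) x (m+1) and derives the match span from the traceback (nwmatch above still returns its (-1, 0) initialization for match_start and match_len):

def local_align(a, b, match=2, mismatch=-2, delete=-1, insert=-1):
    n, m = len(a), len(b)
    # One independent row per position of a, plus the zero boundary row/column.
    f = [[0] * (m + 1) for _ in range(n + 1)]
    best, best_i, best_j = 0, 0, 0
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            sim = match if a[i - 1] == b[j - 1] else mismatch
            f[i][j] = max(0, f[i - 1][j - 1] + sim,
                          f[i - 1][j] + delete,
                          f[i][j - 1] + insert)
            if f[i][j] > best:
                best, best_i, best_j = f[i][j], i, j
    # Trace back from the best cell until the score hits zero.
    i, j = best_i, best_j
    while i > 0 and j > 0 and f[i][j] != 0:
        if f[i][j] == f[i - 1][j - 1] + (match if a[i - 1] == b[j - 1] else mismatch):
            i, j = i - 1, j - 1
        elif f[i][j] == f[i - 1][j] + delete:
            i -= 1
        else:
            j -= 1
    return j, best_j - j, best  # match start in b, match length, score

print(local_align('quick brown fox', 'the quick brwn fox jumped'))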
@@ -91,43 +111,20 @@ class FuzzySearch(object):
             windows[window] = (windows[window] + 1) if window in windows else 1
         candidate_windows = sorted(windows.keys(), key=lambda w: windows[w], reverse=True)
         best_interval = None
-        best_distance = -1
+        best_score = -10000000000
         last_window_grams = 0.1
         for window in candidate_windows[:self.max_candidates]:
             if windows[window] / last_window_grams < self.candidate_threshold:
                 print('Next candidate window below threshold')
                 break
             last_window_grams = windows[window]
             interval_start = max(start, int((window-0.5)*window_size))
-            interval_stop = min(stop, int((window+1.5)*window_size))
-            interval_distance, interval = self.find_best_in_interval(look_for, interval_start, interval_stop)
-            if not best_interval or interval_distance < best_distance:
+            interval_end = min(stop, int((window+1.5)*window_size))
+            interval_text = self.text[interval_start:interval_end]
+            match_start, match_len, score, substitutions = self.nwmatch(look_for, interval_text)
+            match_start += interval_start
+            interval = TextRange(self.text, match_start, match_start + match_len)
+            if score > best_score:
                 print('new best')
                 best_interval = interval
-                best_distance = interval_distance
-        return best_interval, best_distance
-
-    def word_distance(self, a, b):
-        key = (a, b)
-        if key in self.word_distances:
-            return self.word_distances[key]
-        avg_len = max(len(a), 1)
-        s = self.word_distances[key] = (levenshtein(a, b) / avg_len)
-        return s
-
-    def find_similar_words(self,
-                           look_for,
-                           look_in,
-                           alignment=0,
-                           distance_threshold=0.10):
-        lli = len(look_in)
-        for i, wa in by_len(look_for):
-            for j, wb in by_len(look_in):
-                d = self.word_distance(wa, wb)
-                off = abs(lli//2 - j) if alignment == 0 else ((lli - j) if alignment > 0 else j)
-                panelty = off / (lli * len(look_for))
-                if d < distance_threshold + 8 * panelty:
-                    print('Accepted with distance %.2f: "%s" - "%s"' % (d, wa, wb))
-                    return d, i, j
-        return self.find_similar_words(look_for,
-                                       look_in,
-                                       distance_threshold=distance_threshold*1.1)
+                best_score = score
+        return best_interval, best_score
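For context on the loop above: candidate windows are produced by 3-gram voting over the indexed text (the constructor's ngrams loop). A simplified, self-contained sketch of that step; window_size and the character-trigram indexing are assumptions based on the surrounding code:

def candidate_windows(text, look_for, window_size=100):
    # Index every character 3-gram of the text by its position.
    index = {}
    for i in range(len(text) - 2):
        index.setdefault(text[i:i + 3], []).append(i)
    # Each 3-gram of the needle votes for the window containing its occurrences.
    windows = {}
    for i in range(len(look_for) - 2):
        for pos in index.get(look_for[i:i + 3], ()):
            window = pos // window_size
            windows[window] = windows.get(window, 0) + 1
    # Most-voted windows first, as in find_best.
    return sorted(windows, key=lambda w: windows[w], reverse=True)

print(candidate_windows('lorem ipsum dolor sit amet ' * 20, 'dolor sit', window_size=50))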