зеркало из https://github.com/mozilla/DSAlign.git
243 строки
8.9 KiB
Python
243 строки
8.9 KiB
Python
from __future__ import absolute_import, division, print_function
|
|
|
|
import codecs
|
|
from six.moves import range
|
|
from collections import Counter
|
|
from utils import enweight
|
|
|
|
|
|
class Alphabet(object):
|
|
def __init__(self, config_file):
|
|
self._config_file = config_file
|
|
self._label_to_str = []
|
|
self._str_to_label = {}
|
|
self._size = 0
|
|
with codecs.open(config_file, 'r', 'utf-8') as fin:
|
|
for line in fin:
|
|
if line[0:2] == '\\#':
|
|
line = '#\n'
|
|
elif line[0] == '#':
|
|
continue
|
|
self._label_to_str += line[:-1] # remove the line ending
|
|
self._str_to_label[line[:-1]] = self._size
|
|
self._size += 1
|
|
|
|
def string_from_label(self, label):
|
|
return self._label_to_str[label]
|
|
|
|
def has_label(self, string):
|
|
return string in self._str_to_label
|
|
|
|
def label_from_string(self, string):
|
|
try:
|
|
return self._str_to_label[string]
|
|
except KeyError as e:
|
|
raise KeyError(
|
|
'''ERROR: Your transcripts contain characters which do not occur in data/alphabet.txt! Use util/check_characters.py to see what characters are in your {train,dev,test}.csv transcripts, and then add all these to data/alphabet.txt.'''
|
|
).with_traceback(e.__traceback__)
|
|
|
|
def decode(self, labels):
|
|
res = ''
|
|
for label in labels:
|
|
res += self.string_from_label(label)
|
|
return res
|
|
|
|
def size(self):
|
|
return self._size
|
|
|
|
def config_file(self):
|
|
return self._config_file
|
|
|
|
|
|
class TextCleaner(object):
|
|
def __init__(self, alphabet, to_lower=True, normalize_space=True, dashes_to_ws=True):
|
|
self.alphabet = alphabet
|
|
self.to_lower = to_lower
|
|
self.normalize_space = normalize_space
|
|
self.dashes_to_ws = dashes_to_ws
|
|
self.original_text = ''
|
|
self.clean_text = ''
|
|
self.positions = []
|
|
self.meta = []
|
|
|
|
def add_original_text(self, original_text, meta=None):
|
|
if len(self.positions) > 0:
|
|
self.clean_text += ' '
|
|
self.original_text += ' '
|
|
self.positions.append(len(self.original_text) - 1)
|
|
self.meta.append(None)
|
|
ws = True
|
|
else:
|
|
ws = False
|
|
cleaned = []
|
|
prepared_text = original_text.lower() if self.to_lower else original_text
|
|
for position, c in enumerate(prepared_text):
|
|
if self.dashes_to_ws and c == '-' and not self.alphabet.has_label('-'):
|
|
c = ' '
|
|
if self.normalize_space and c.isspace():
|
|
if ws:
|
|
continue
|
|
else:
|
|
ws = True
|
|
c = ' '
|
|
if not self.alphabet.has_label(c):
|
|
continue
|
|
if not c.isspace():
|
|
ws = False
|
|
cleaned.append(c)
|
|
self.positions.append(len(self.original_text) + position)
|
|
self.meta.append(meta)
|
|
self.original_text += original_text
|
|
self.clean_text += ''.join(cleaned)
|
|
|
|
def get_original_offset(self, clean_offset):
|
|
if clean_offset == len(self.positions):
|
|
return self.positions[-1] + 1
|
|
return self.positions[clean_offset]
|
|
|
|
def collect_meta(self, from_clean_offset, to_clean_offset=None):
|
|
if to_clean_offset is None:
|
|
return self.meta[from_clean_offset]
|
|
metas = []
|
|
for meta in self.meta[from_clean_offset:to_clean_offset + 1]:
|
|
if meta is not None and meta not in metas:
|
|
metas.append(meta)
|
|
return metas
|
|
|
|
|
|
class TextRange(object):
|
|
def __init__(self, document, start, end):
|
|
self.document = document
|
|
self.start = start
|
|
self.end = end
|
|
|
|
@staticmethod
|
|
def token_at(text, position):
|
|
start = len(text)
|
|
end = 0
|
|
for step in [-1, 1]:
|
|
pos = position
|
|
while 0 <= pos < len(text) and not text[pos].isspace():
|
|
if pos < start:
|
|
start = pos
|
|
if pos > end:
|
|
end = pos
|
|
pos += step
|
|
return TextRange(text, start, end + 1) if start <= end else TextRange(text, position, position)
|
|
|
|
def neighbour_token(self, direction):
|
|
return TextRange.token_at(self.document, self.start - 2 if direction < 0 else self.end + 1)
|
|
|
|
def next_token(self):
|
|
return self.neighbour_token(1)
|
|
|
|
def prev_token(self):
|
|
return self.neighbour_token(-1)
|
|
|
|
def get_text(self):
|
|
return self.document[self.start:self.end]
|
|
|
|
def __add__(self, other):
|
|
if not self.document == other.document:
|
|
raise Exception("Unable to add token from other string")
|
|
return TextRange(self.document, min(self.start, other.start), max(self.end, other.end))
|
|
|
|
def __eq__(self, other):
|
|
return self.document == other.document and self.start == other.start and self.end == other.end
|
|
|
|
def __len__(self):
|
|
return self.end-self.start
|
|
|
|
|
|
def ngrams(s, size):
|
|
"""
|
|
Lists all appearances of all N-grams of a string from left to right.
|
|
:param s: String to decompose
|
|
:param size: N-gram size
|
|
:return: Produces strings representing all N-grams
|
|
"""
|
|
window = len(s) - size
|
|
if window < 1 or size < 1:
|
|
if window == 0:
|
|
yield s
|
|
return
|
|
for i in range(0, window + 1):
|
|
yield s[i:i + size]
|
|
|
|
|
|
def weighted_ngrams(s, size, direction=0):
|
|
"""
|
|
Lists all appearances of all N-grams of a string from left to right together with a positional weight value.
|
|
The positional weight progresses quadratically.
|
|
:param s: String to decompose
|
|
:param size: N-gram size
|
|
:param direction: Order of assigning positional weights to N-grams:
|
|
direction < 0: Weight of first N-gram is 1.0 and of last one 0.0
|
|
direction > 0: Weight of first N-gram is 0.0 and of last one 1.0
|
|
direction == 0: Weight of center N-gram(s) near or equal 0, weight of first and last N-gram 1.0
|
|
:return: Produces (string, float) tuples representing the N-gram along with its assigned positional weight value
|
|
"""
|
|
return enweight(ngrams(s, size), direction=direction)
|
|
|
|
|
|
def similarity(a, b, direction=0, min_ngram_size=1, max_ngram_size=3, size_factor=1, position_factor=1):
|
|
"""
|
|
Computes similarity value of two strings ranging from 0.0 (completely different) to 1.0 (completely equal).
|
|
Counts intersection of weighted N-gram sets of both strings.
|
|
:param a: String to compare
|
|
:param b: String to compare
|
|
:param direction: Order of equality importance:
|
|
direction < 0: Left ends of strings more important to be similar
|
|
direction > 0: Right ends of strings more important to be similar
|
|
direction == 0: Left and right ends more important to be similar than center parts
|
|
:param min_ngram_size: Minimum N-gram size to take into account
|
|
:param max_ngram_size: Maximum N-gram size to take into account
|
|
:param size_factor: Importance factor of the N-gram size (compared to the positional importance).
|
|
:param position_factor: Importance factor of the N-gram position (compared to the size importance)
|
|
:return: Number between 0.0 (completely different) and 1.0 (completely equal)
|
|
"""
|
|
if len(a) < len(b):
|
|
a, b = b, a
|
|
ca, cb = Counter(), Counter()
|
|
for s, c in [(a, ca), (b, cb)]:
|
|
for size in range(min_ngram_size, max_ngram_size + 1):
|
|
for ng, position_weight in weighted_ngrams(s, size, direction=direction):
|
|
c[ng] += size * size_factor + position_weight * position_weight * position_factor
|
|
score = 0
|
|
for key in set(ca.keys()) & set(cb.keys()):
|
|
score += min(ca[key], cb[key])
|
|
return score / sum(ca.values())
|
|
|
|
|
|
# The following code is from: http://hetland.org/coding/python/levenshtein.py
|
|
|
|
# This is a straightforward implementation of a well-known algorithm, and thus
|
|
# probably shouldn't be covered by copyright to begin with. But in case it is,
|
|
# the author (Magnus Lie Hetland) has, to the extent possible under law,
|
|
# dedicated all copyright and related and neighboring rights to this software
|
|
# to the public domain worldwide, by distributing it under the CC0 license,
|
|
# version 1.0. This software is distributed without any warranty. For more
|
|
# information, see <http://creativecommons.org/publicdomain/zero/1.0>
|
|
|
|
def levenshtein(a, b):
|
|
"""
|
|
Calculates the Levenshtein distance between a and b.
|
|
"""
|
|
n, m = len(a), len(b)
|
|
if n > m:
|
|
# Make sure n <= m, to use O(min(n,m)) space
|
|
a, b = b, a
|
|
n, m = m, n
|
|
|
|
current = list(range(n+1))
|
|
for i in range(1, m+1):
|
|
previous, current = current, [i]+[0]*n
|
|
for j in range(1, n+1):
|
|
add, delete = previous[j]+1, current[j-1]+1
|
|
change = previous[j-1]
|
|
if a[j-1] != b[i-1]:
|
|
change = change + 1
|
|
current[j] = min(add, delete, change)
|
|
|
|
return current[n]
|