DSAlign/align/text.py

243 строки
8.9 KiB
Python
Исходник Постоянная ссылка Обычный вид История

2019-06-26 20:04:37 +03:00
from __future__ import absolute_import, division, print_function
import codecs
from six.moves import range
2019-07-31 19:27:57 +03:00
from collections import Counter
2019-08-05 19:25:16 +03:00
from utils import enweight
2019-08-01 19:50:34 +03:00
2019-07-24 19:36:24 +03:00
2019-06-26 20:04:37 +03:00
class Alphabet(object):
def __init__(self, config_file):
self._config_file = config_file
self._label_to_str = []
self._str_to_label = {}
self._size = 0
with codecs.open(config_file, 'r', 'utf-8') as fin:
for line in fin:
if line[0:2] == '\\#':
line = '#\n'
elif line[0] == '#':
continue
2019-06-28 00:52:45 +03:00
self._label_to_str += line[:-1] # remove the line ending
2019-06-26 20:04:37 +03:00
self._str_to_label[line[:-1]] = self._size
self._size += 1
def string_from_label(self, label):
return self._label_to_str[label]
2019-06-28 00:52:45 +03:00
def has_label(self, string):
return string in self._str_to_label
2019-06-26 20:04:37 +03:00
def label_from_string(self, string):
try:
return self._str_to_label[string]
except KeyError as e:
raise KeyError(
'''ERROR: Your transcripts contain characters which do not occur in data/alphabet.txt! Use util/check_characters.py to see what characters are in your {train,dev,test}.csv transcripts, and then add all these to data/alphabet.txt.'''
).with_traceback(e.__traceback__)
def decode(self, labels):
res = ''
for label in labels:
res += self.string_from_label(label)
return res
def size(self):
return self._size
def config_file(self):
return self._config_file
2019-07-16 22:07:39 +03:00
class TextCleaner(object):
def __init__(self, alphabet, to_lower=True, normalize_space=True, dashes_to_ws=True):
self.alphabet = alphabet
self.to_lower = to_lower
self.normalize_space = normalize_space
self.dashes_to_ws = dashes_to_ws
self.original_text = ''
self.clean_text = ''
2019-07-05 19:17:16 +03:00
self.positions = []
self.meta = []
def add_original_text(self, original_text, meta=None):
if len(self.positions) > 0:
self.clean_text += ' '
self.original_text += ' '
self.positions.append(len(self.original_text) - 1)
self.meta.append(None)
ws = True
else:
ws = False
cleaned = []
prepared_text = original_text.lower() if self.to_lower else original_text
2019-07-05 19:17:16 +03:00
for position, c in enumerate(prepared_text):
if self.dashes_to_ws and c == '-' and not self.alphabet.has_label('-'):
2019-07-09 19:06:27 +03:00
c = ' '
if self.normalize_space and c.isspace():
2019-07-05 19:17:16 +03:00
if ws:
continue
else:
ws = True
c = ' '
if not self.alphabet.has_label(c):
2019-07-09 19:06:27 +03:00
continue
2019-07-11 19:18:02 +03:00
if not c.isspace():
ws = False
2019-07-05 19:17:16 +03:00
cleaned.append(c)
self.positions.append(len(self.original_text) + position)
self.meta.append(meta)
self.original_text += original_text
self.clean_text += ''.join(cleaned)
2019-07-04 19:00:47 +03:00
def get_original_offset(self, clean_offset):
2019-07-05 19:17:16 +03:00
if clean_offset == len(self.positions):
return self.positions[-1] + 1
2019-07-22 19:07:59 +03:00
return self.positions[clean_offset]
2019-07-04 19:00:47 +03:00
def collect_meta(self, from_clean_offset, to_clean_offset=None):
if to_clean_offset is None:
return self.meta[from_clean_offset]
metas = []
for meta in self.meta[from_clean_offset:to_clean_offset + 1]:
if meta is not None and meta not in metas:
metas.append(meta)
return metas
2019-07-04 19:00:47 +03:00
2019-07-16 22:07:39 +03:00
class TextRange(object):
def __init__(self, document, start, end):
self.document = document
self.start = start
self.end = end
2019-07-09 19:06:27 +03:00
2019-07-16 22:07:39 +03:00
@staticmethod
def token_at(text, position):
start = len(text)
end = 0
for step in [-1, 1]:
pos = position
while 0 <= pos < len(text) and not text[pos].isspace():
if pos < start:
start = pos
if pos > end:
end = pos
pos += step
return TextRange(text, start, end + 1) if start <= end else TextRange(text, position, position)
2019-07-09 19:06:27 +03:00
2019-07-16 22:07:39 +03:00
def neighbour_token(self, direction):
return TextRange.token_at(self.document, self.start - 2 if direction < 0 else self.end + 1)
2019-07-11 19:18:02 +03:00
2019-07-16 22:07:39 +03:00
def next_token(self):
return self.neighbour_token(1)
2019-07-11 19:18:02 +03:00
2019-07-16 22:07:39 +03:00
def prev_token(self):
return self.neighbour_token(-1)
2019-07-11 19:18:02 +03:00
2019-07-16 22:07:39 +03:00
def get_text(self):
return self.document[self.start:self.end]
2019-07-11 19:18:02 +03:00
2019-07-16 22:07:39 +03:00
def __add__(self, other):
if not self.document == other.document:
2019-07-22 19:07:59 +03:00
raise Exception("Unable to add token from other string")
2019-07-16 22:07:39 +03:00
return TextRange(self.document, min(self.start, other.start), max(self.end, other.end))
2019-07-09 19:06:27 +03:00
2019-07-16 22:07:39 +03:00
def __eq__(self, other):
return self.document == other.document and self.start == other.start and self.end == other.end
2019-07-09 19:06:27 +03:00
2019-07-16 22:07:39 +03:00
def __len__(self):
return self.end-self.start
2019-07-09 19:06:27 +03:00
2019-08-01 19:50:34 +03:00
def ngrams(s, size):
"""
Lists all appearances of all N-grams of a string from left to right.
:param s: String to decompose
:param size: N-gram size
:return: Produces strings representing all N-grams
"""
window = len(s) - size
if window < 1 or size < 1:
if window == 0:
yield s
2019-12-19 18:50:26 +03:00
return
2019-08-01 19:50:34 +03:00
for i in range(0, window + 1):
yield s[i:i + size]
def weighted_ngrams(s, size, direction=0):
"""
Lists all appearances of all N-grams of a string from left to right together with a positional weight value.
The positional weight progresses quadratically.
:param s: String to decompose
:param size: N-gram size
:param direction: Order of assigning positional weights to N-grams:
direction < 0: Weight of first N-gram is 1.0 and of last one 0.0
direction > 0: Weight of first N-gram is 0.0 and of last one 1.0
direction == 0: Weight of center N-gram(s) near or equal 0, weight of first and last N-gram 1.0
:return: Produces (string, float) tuples representing the N-gram along with its assigned positional weight value
"""
2019-08-05 19:25:16 +03:00
return enweight(ngrams(s, size), direction=direction)
2019-08-01 19:50:34 +03:00
def similarity(a, b, direction=0, min_ngram_size=1, max_ngram_size=3, size_factor=1, position_factor=1):
"""
Computes similarity value of two strings ranging from 0.0 (completely different) to 1.0 (completely equal).
Counts intersection of weighted N-gram sets of both strings.
:param a: String to compare
:param b: String to compare
:param direction: Order of equality importance:
direction < 0: Left ends of strings more important to be similar
direction > 0: Right ends of strings more important to be similar
direction == 0: Left and right ends more important to be similar than center parts
:param min_ngram_size: Minimum N-gram size to take into account
:param max_ngram_size: Maximum N-gram size to take into account
:param size_factor: Importance factor of the N-gram size (compared to the positional importance).
:param position_factor: Importance factor of the N-gram position (compared to the size importance)
:return: Number between 0.0 (completely different) and 1.0 (completely equal)
"""
if len(a) < len(b):
a, b = b, a
2019-07-31 19:27:57 +03:00
ca, cb = Counter(), Counter()
2019-08-01 19:50:34 +03:00
for s, c in [(a, ca), (b, cb)]:
for size in range(min_ngram_size, max_ngram_size + 1):
for ng, position_weight in weighted_ngrams(s, size, direction=direction):
2019-08-09 14:55:05 +03:00
c[ng] += size * size_factor + position_weight * position_weight * position_factor
2019-07-31 19:27:57 +03:00
score = 0
for key in set(ca.keys()) & set(cb.keys()):
2019-08-01 19:50:34 +03:00
score += min(ca[key], cb[key])
return score / sum(ca.values())
2019-07-31 19:27:57 +03:00
2019-06-26 20:04:37 +03:00
# The following code is from: http://hetland.org/coding/python/levenshtein.py
# This is a straightforward implementation of a well-known algorithm, and thus
# probably shouldn't be covered by copyright to begin with. But in case it is,
# the author (Magnus Lie Hetland) has, to the extent possible under law,
# dedicated all copyright and related and neighboring rights to this software
# to the public domain worldwide, by distributing it under the CC0 license,
# version 1.0. This software is distributed without any warranty. For more
# information, see <http://creativecommons.org/publicdomain/zero/1.0>
def levenshtein(a, b):
"""
Calculates the Levenshtein distance between a and b.
"""
n, m = len(a), len(b)
if n > m:
# Make sure n <= m, to use O(min(n,m)) space
a, b = b, a
n, m = m, n
current = list(range(n+1))
for i in range(1, m+1):
previous, current = current, [i]+[0]*n
for j in range(1, n+1):
add, delete = previous[j]+1, current[j-1]+1
change = previous[j-1]
if a[j-1] != b[i-1]:
change = change + 1
current[j] = min(add, delete, change)
return current[n]