from __future__ import absolute_import, division, print_function import codecs from six.moves import range from collections import Counter from utils import enweight class Alphabet(object): def __init__(self, config_file): self._config_file = config_file self._label_to_str = [] self._str_to_label = {} self._size = 0 with codecs.open(config_file, 'r', 'utf-8') as fin: for line in fin: if line[0:2] == '\\#': line = '#\n' elif line[0] == '#': continue self._label_to_str += line[:-1] # remove the line ending self._str_to_label[line[:-1]] = self._size self._size += 1 def string_from_label(self, label): return self._label_to_str[label] def has_label(self, string): return string in self._str_to_label def label_from_string(self, string): try: return self._str_to_label[string] except KeyError as e: raise KeyError( '''ERROR: Your transcripts contain characters which do not occur in data/alphabet.txt! Use util/check_characters.py to see what characters are in your {train,dev,test}.csv transcripts, and then add all these to data/alphabet.txt.''' ).with_traceback(e.__traceback__) def decode(self, labels): res = '' for label in labels: res += self.string_from_label(label) return res def size(self): return self._size def config_file(self): return self._config_file class TextCleaner(object): def __init__(self, alphabet, to_lower=True, normalize_space=True, dashes_to_ws=True): self.alphabet = alphabet self.to_lower = to_lower self.normalize_space = normalize_space self.dashes_to_ws = dashes_to_ws self.original_text = '' self.clean_text = '' self.positions = [] self.meta = [] def add_original_text(self, original_text, meta=None): if len(self.positions) > 0: self.clean_text += ' ' self.original_text += ' ' self.positions.append(len(self.original_text) - 1) self.meta.append(None) ws = True else: ws = False cleaned = [] prepared_text = original_text.lower() if self.to_lower else original_text for position, c in enumerate(prepared_text): if self.dashes_to_ws and c == '-' and not self.alphabet.has_label('-'): c = ' ' if self.normalize_space and c.isspace(): if ws: continue else: ws = True c = ' ' if not self.alphabet.has_label(c): continue if not c.isspace(): ws = False cleaned.append(c) self.positions.append(len(self.original_text) + position) self.meta.append(meta) self.original_text += original_text self.clean_text += ''.join(cleaned) def get_original_offset(self, clean_offset): if clean_offset == len(self.positions): return self.positions[-1] + 1 return self.positions[clean_offset] def collect_meta(self, from_clean_offset, to_clean_offset=None): if to_clean_offset is None: return self.meta[from_clean_offset] metas = [] for meta in self.meta[from_clean_offset:to_clean_offset + 1]: if meta is not None and meta not in metas: metas.append(meta) return metas class TextRange(object): def __init__(self, document, start, end): self.document = document self.start = start self.end = end @staticmethod def token_at(text, position): start = len(text) end = 0 for step in [-1, 1]: pos = position while 0 <= pos < len(text) and not text[pos].isspace(): if pos < start: start = pos if pos > end: end = pos pos += step return TextRange(text, start, end + 1) if start <= end else TextRange(text, position, position) def neighbour_token(self, direction): return TextRange.token_at(self.document, self.start - 2 if direction < 0 else self.end + 1) def next_token(self): return self.neighbour_token(1) def prev_token(self): return self.neighbour_token(-1) def get_text(self): return self.document[self.start:self.end] def __add__(self, other): if not self.document == other.document: raise Exception("Unable to add token from other string") return TextRange(self.document, min(self.start, other.start), max(self.end, other.end)) def __eq__(self, other): return self.document == other.document and self.start == other.start and self.end == other.end def __len__(self): return self.end-self.start def ngrams(s, size): """ Lists all appearances of all N-grams of a string from left to right. :param s: String to decompose :param size: N-gram size :return: Produces strings representing all N-grams """ window = len(s) - size if window < 1 or size < 1: if window == 0: yield s return for i in range(0, window + 1): yield s[i:i + size] def weighted_ngrams(s, size, direction=0): """ Lists all appearances of all N-grams of a string from left to right together with a positional weight value. The positional weight progresses quadratically. :param s: String to decompose :param size: N-gram size :param direction: Order of assigning positional weights to N-grams: direction < 0: Weight of first N-gram is 1.0 and of last one 0.0 direction > 0: Weight of first N-gram is 0.0 and of last one 1.0 direction == 0: Weight of center N-gram(s) near or equal 0, weight of first and last N-gram 1.0 :return: Produces (string, float) tuples representing the N-gram along with its assigned positional weight value """ return enweight(ngrams(s, size), direction=direction) def similarity(a, b, direction=0, min_ngram_size=1, max_ngram_size=3, size_factor=1, position_factor=1): """ Computes similarity value of two strings ranging from 0.0 (completely different) to 1.0 (completely equal). Counts intersection of weighted N-gram sets of both strings. :param a: String to compare :param b: String to compare :param direction: Order of equality importance: direction < 0: Left ends of strings more important to be similar direction > 0: Right ends of strings more important to be similar direction == 0: Left and right ends more important to be similar than center parts :param min_ngram_size: Minimum N-gram size to take into account :param max_ngram_size: Maximum N-gram size to take into account :param size_factor: Importance factor of the N-gram size (compared to the positional importance). :param position_factor: Importance factor of the N-gram position (compared to the size importance) :return: Number between 0.0 (completely different) and 1.0 (completely equal) """ if len(a) < len(b): a, b = b, a ca, cb = Counter(), Counter() for s, c in [(a, ca), (b, cb)]: for size in range(min_ngram_size, max_ngram_size + 1): for ng, position_weight in weighted_ngrams(s, size, direction=direction): c[ng] += size * size_factor + position_weight * position_weight * position_factor score = 0 for key in set(ca.keys()) & set(cb.keys()): score += min(ca[key], cb[key]) return score / sum(ca.values()) # The following code is from: http://hetland.org/coding/python/levenshtein.py # This is a straightforward implementation of a well-known algorithm, and thus # probably shouldn't be covered by copyright to begin with. But in case it is, # the author (Magnus Lie Hetland) has, to the extent possible under law, # dedicated all copyright and related and neighboring rights to this software # to the public domain worldwide, by distributing it under the CC0 license, # version 1.0. This software is distributed without any warranty. For more # information, see def levenshtein(a, b): """ Calculates the Levenshtein distance between a and b. """ n, m = len(a), len(b) if n > m: # Make sure n <= m, to use O(min(n,m)) space a, b = b, a n, m = m, n current = list(range(n+1)) for i in range(1, m+1): previous, current = current, [i]+[0]*n for j in range(1, n+1): add, delete = previous[j]+1, current[j-1]+1 change = previous[j-1] if a[j-1] != b[i-1]: change = change + 1 current[j] = min(add, delete, change) return current[n]