2019-06-26 20:04:37 +03:00
from __future__ import absolute_import , division , print_function
import codecs
from six . moves import range
2019-07-31 19:27:57 +03:00
from collections import Counter
2019-08-05 19:25:16 +03:00
from utils import enweight
2019-08-01 19:50:34 +03:00
2019-07-24 19:36:24 +03:00
2019-06-26 20:04:37 +03:00
class Alphabet(object):
    """Bidirectional mapping between transcript labels (strings) and their
    integer indices, loaded from an alphabet configuration file.

    The file lists one label per line. Lines starting with '#' are comments;
    a line starting with '\\#' denotes the literal '#' label.
    """

    def __init__(self, config_file):
        """Load the alphabet from ``config_file`` (UTF-8, one label per line)."""
        self._config_file = config_file
        self._label_to_str = []
        self._str_to_label = {}
        self._size = 0
        with codecs.open(config_file, 'r', 'utf-8') as fin:
            for line in fin:
                if line[0:2] == '\\#':
                    # escaped comment marker: the label is a literal '#'
                    line = '#\n'
                elif line[0] == '#':
                    continue  # comment line
                # FIX: was `self._label_to_str += line[:-1]`, which extends the
                # list character-by-character and desyncs indices from
                # _str_to_label for any multi-character label; append the whole
                # label instead.
                self._label_to_str.append(line[:-1])  # remove the line ending
                self._str_to_label[line[:-1]] = self._size
                self._size += 1

    def string_from_label(self, label):
        """Return the label string for integer index ``label``."""
        return self._label_to_str[label]

    def has_label(self, string):
        """Return True if ``string`` is a label of this alphabet."""
        return string in self._str_to_label

    def label_from_string(self, string):
        """Return the integer index of label ``string``.

        Raises a KeyError with a user-facing hint if the label is unknown,
        chained onto the original lookup failure's traceback.
        """
        try:
            return self._str_to_label[string]
        except KeyError as e:
            raise KeyError(
                '''ERROR: Your transcripts contain characters which do not occur in data/alphabet.txt! Use util/check_characters.py to see what characters are in your {train,dev,test}.csv transcripts, and then add all these to data/alphabet.txt.'''
            ).with_traceback(e.__traceback__)

    def decode(self, labels):
        """Decode a sequence of integer labels into the transcript string."""
        return ''.join(self.string_from_label(label) for label in labels)

    def size(self):
        """Return the number of labels in the alphabet."""
        return self._size

    def config_file(self):
        """Return the path of the alphabet file this instance was loaded from."""
        return self._config_file
2019-07-16 22:07:39 +03:00
class TextCleaner(object):
    """Incrementally builds a cleaned version of one or more original texts,
    keeping a per-character mapping from the clean text back to offsets in the
    concatenated original text, plus optional per-fragment meta data.
    """

    def __init__(self, alphabet, to_lower=True, normalize_space=True, dashes_to_ws=True):
        """Create a cleaner that keeps only characters known to ``alphabet``.

        :param alphabet: Object exposing ``has_label(char)``
        :param to_lower: Lower-case fragments before cleaning
        :param normalize_space: Collapse whitespace runs into single spaces
        :param dashes_to_ws: Turn '-' into a space when '-' is not in the alphabet
        """
        self.alphabet = alphabet
        self.to_lower = to_lower
        self.normalize_space = normalize_space
        self.dashes_to_ws = dashes_to_ws
        self.original_text = ''
        self.clean_text = ''
        # positions[i] is the offset in original_text of clean_text[i]
        self.positions = []
        # meta[i] is the meta object of the fragment that produced
        # clean_text[i] (None for inserted separator spaces)
        self.meta = []

    def add_original_text(self, original_text, meta=None):
        """Append ``original_text``, extending ``clean_text`` with its cleaned
        form; ``meta`` is recorded for every clean character it produces."""
        if len(self.positions) > 0:
            # separate fragments with a single space in both views
            self.clean_text += ' '
            self.original_text += ' '
            self.positions.append(len(self.original_text) - 1)
            self.meta.append(None)
            last_was_space = True
        else:
            last_was_space = False
        kept = []
        source = original_text.lower() if self.to_lower else original_text
        # original_text is appended only after the loop, so this is invariant
        base = len(self.original_text)
        for offset, ch in enumerate(source):
            if self.dashes_to_ws and ch == '-' and not self.alphabet.has_label('-'):
                ch = ' '
            if self.normalize_space and ch.isspace():
                if last_was_space:
                    continue  # collapse whitespace runs
                last_was_space = True
                ch = ' '
            if not self.alphabet.has_label(ch):
                continue  # drop characters outside the alphabet
            if not ch.isspace():
                last_was_space = False
            kept.append(ch)
            self.positions.append(base + offset)
            self.meta.append(meta)
        self.original_text += original_text
        self.clean_text += ''.join(kept)

    def get_original_offset(self, clean_offset):
        """Map an offset in ``clean_text`` back to an offset in
        ``original_text``; an offset one past the end maps to one past the
        last recorded position."""
        if clean_offset == len(self.positions):
            return self.positions[-1] + 1
        return self.positions[clean_offset]

    def collect_meta(self, from_clean_offset, to_clean_offset=None):
        """Return the meta of one clean offset, or the de-duplicated list of
        non-None metas over the inclusive range [from, to]."""
        if to_clean_offset is None:
            return self.meta[from_clean_offset]
        collected = []
        for item in self.meta[from_clean_offset:to_clean_offset + 1]:
            if item is not None and item not in collected:
                collected.append(item)
        return collected
2019-07-04 19:00:47 +03:00
2019-07-16 22:07:39 +03:00
class TextRange(object):
    """A [start, end) interval within a text document (a plain string)."""

    def __init__(self, document, start, end):
        self.document = document
        self.start = start
        self.end = end

    @staticmethod
    def token_at(text, position):
        """Return the whitespace-delimited token covering ``position`` in
        ``text``, or an empty range at ``position`` if it falls on whitespace
        or outside the text."""
        lo, hi = len(text), 0
        # scan left then right from position over non-whitespace characters
        for step in (-1, 1):
            cursor = position
            while 0 <= cursor < len(text) and not text[cursor].isspace():
                lo = min(lo, cursor)
                hi = max(hi, cursor)
                cursor += step
        if lo <= hi:
            return TextRange(text, lo, hi + 1)
        return TextRange(text, position, position)

    def neighbour_token(self, direction):
        """Token to the left (direction < 0) or right (otherwise) of this one."""
        probe = self.start - 2 if direction < 0 else self.end + 1
        return TextRange.token_at(self.document, probe)

    def next_token(self):
        """Token to the right of this one."""
        return self.neighbour_token(1)

    def prev_token(self):
        """Token to the left of this one."""
        return self.neighbour_token(-1)

    def get_text(self):
        """The substring of the document covered by this range."""
        return self.document[self.start:self.end]

    def __add__(self, other):
        # Only ranges over the very same document can be merged.
        if not self.document == other.document:
            raise Exception('Unable to add token from other string')
        return TextRange(self.document,
                         min(self.start, other.start),
                         max(self.end, other.end))

    def __eq__(self, other):
        return self.document == other.document and \
            self.start == other.start and \
            self.end == other.end

    def __len__(self):
        return self.end - self.start
2019-07-09 19:06:27 +03:00
2019-08-01 19:50:34 +03:00
def ngrams(s, size):
    """
    Lists all appearances of all N-grams of a string from left to right.
    :param s: String to decompose
    :param size: N-gram size
    :return: Produces strings representing all N-grams
    """
    last_start = len(s) - size
    if size < 1 or last_start < 1:
        # degenerate cases: exactly one n-gram (len(s) == size) or none at all
        if last_start == 0:
            yield s
        return
    for start in range(last_start + 1):
        yield s[start:start + size]
def weighted_ngrams(s, size, direction=0):
    """
    Lists all appearances of all N-grams of a string from left to right
    together with a positional weight value.
    The positional weight progresses quadratically.
    :param s: String to decompose
    :param size: N-gram size
    :param direction: Order of assigning positional weights to N-grams:
        direction < 0: Weight of first N-gram is 1.0 and of last one 0.0
        direction > 0: Weight of first N-gram is 0.0 and of last one 1.0
        direction == 0: Weight of center N-gram(s) near or equal 0, weight of first and last N-gram 1.0
    :return: Produces (string, float) tuples representing the N-gram along with
        its assigned positional weight value
    """
    gram_stream = ngrams(s, size)
    return enweight(gram_stream, direction=direction)
2019-08-01 19:50:34 +03:00
def similarity(a, b, direction=0, min_ngram_size=1, max_ngram_size=3, size_factor=1, position_factor=1):
    """
    Computes similarity value of two strings ranging from 0.0 (completely
    different) to 1.0 (completely equal).
    Counts the intersection of weighted N-gram multi-sets of both strings.
    :param a: String to compare
    :param b: String to compare
    :param direction: Order of equality importance:
        direction < 0: Left ends of strings more important to be similar
        direction > 0: Right ends of strings more important to be similar
        direction == 0: Left and right ends more important to be similar than center parts
    :param min_ngram_size: Minimum N-gram size to take into account
    :param max_ngram_size: Maximum N-gram size to take into account
    :param size_factor: Importance factor of the N-gram size (compared to the positional importance)
    :param position_factor: Importance factor of the N-gram position (compared to the size importance)
    :return: Number between 0.0 (completely different) and 1.0 (completely equal)
    """
    # Ensure a is the longer string, so normalization divides by the larger
    # of the two weight totals.
    if len(a) < len(b):
        a, b = b, a
    ca, cb = Counter(), Counter()
    for s, counter in ((a, ca), (b, cb)):
        for size in range(min_ngram_size, max_ngram_size + 1):
            for ng, position_weight in weighted_ngrams(s, size, direction=direction):
                counter[ng] += size * size_factor + position_weight * position_weight * position_factor
    total = sum(ca.values())
    if total == 0:
        # FIX: no N-grams at all (e.g. both strings empty, or shorter than
        # min_ngram_size) previously raised ZeroDivisionError here.
        return 1.0 if a == b else 0.0
    score = sum(min(ca[key], cb[key]) for key in set(ca.keys()) & set(cb.keys()))
    return score / total
2019-07-31 19:27:57 +03:00
2019-06-26 20:04:37 +03:00
# The following code is from: http://hetland.org/coding/python/levenshtein.py
# This is a straightforward implementation of a well-known algorithm, and thus
# probably shouldn't be covered by copyright to begin with. But in case it is,
# the author (Magnus Lie Hetland) has, to the extent possible under law,
# dedicated all copyright and related and neighboring rights to this software
# to the public domain worldwide, by distributing it under the CC0 license,
# version 1.0. This software is distributed without any warranty. For more
# information, see <http://creativecommons.org/publicdomain/zero/1.0>
def levenshtein(a, b):
    """
    Calculates the Levenshtein distance between a and b.
    """
    # Keep a as the shorter string so only O(min(len(a), len(b))) space is used.
    if len(a) > len(b):
        a, b = b, a
    row = list(range(len(a) + 1))
    for i, cb in enumerate(b, 1):
        previous_row, row = row, [i] + [0] * len(a)
        for j, ca in enumerate(a, 1):
            insert_cost = row[j - 1] + 1
            delete_cost = previous_row[j] + 1
            substitute_cost = previous_row[j - 1] + (0 if ca == cb else 1)
            row[j] = min(insert_cost, delete_cost, substitute_cost)
    return row[len(a)]