2019-06-26 20:04:37 +03:00
from __future__ import absolute_import , division , print_function
import codecs
from six . moves import range
2019-07-31 19:27:57 +03:00
from collections import Counter
2019-08-05 19:25:16 +03:00
from utils import enweight
2019-08-01 19:50:34 +03:00
2019-07-24 19:36:24 +03:00
2019-06-26 20:04:37 +03:00
class Alphabet(object):
    """Bidirectional mapping between transcript labels (strings) and their
    integer indices, loaded from an alphabet configuration file.

    The file lists one label per line. Lines starting with '#' are comments;
    a line starting with '\\#' denotes the literal '#' label.
    """

    def __init__(self, config_file):
        """Load the alphabet from ``config_file`` (UTF-8, one label per line)."""
        self._config_file = config_file
        self._label_to_str = []
        self._str_to_label = {}
        self._size = 0
        with codecs.open(config_file, 'r', 'utf-8') as fin:
            for line in fin:
                if line[0:2] == '\\#':
                    # escaped comment marker: the label is a literal '#'
                    line = '#\n'
                elif line[0] == '#':
                    continue  # comment line
                # FIX: was `self._label_to_str += line[:-1]`, which extends the
                # list character-by-character and desyncs indices from
                # _str_to_label for any multi-character label; append the whole
                # label instead.
                self._label_to_str.append(line[:-1])  # remove the line ending
                self._str_to_label[line[:-1]] = self._size
                self._size += 1

    def string_from_label(self, label):
        """Return the label string for integer index ``label``."""
        return self._label_to_str[label]

    def has_label(self, string):
        """Return True if ``string`` is a label of this alphabet."""
        return string in self._str_to_label

    def label_from_string(self, string):
        """Return the integer index of label ``string``.

        Raises a KeyError with a user-facing hint if the label is unknown,
        chained onto the original lookup failure's traceback.
        """
        try:
            return self._str_to_label[string]
        except KeyError as e:
            raise KeyError(
                '''ERROR: Your transcripts contain characters which do not occur in data/alphabet.txt! Use util/check_characters.py to see what characters are in your {train,dev,test}.csv transcripts, and then add all these to data/alphabet.txt.'''
            ).with_traceback(e.__traceback__)

    def decode(self, labels):
        """Decode a sequence of integer labels into the transcript string."""
        return ''.join(self.string_from_label(label) for label in labels)

    def size(self):
        """Return the number of labels in the alphabet."""
        return self._size

    def config_file(self):
        """Return the path of the alphabet file this instance was loaded from."""
        return self._config_file
2019-07-16 22:07:39 +03:00
class TextCleaner(object):
    """Incrementally builds a cleaned version of one or more original texts,
    keeping a per-character mapping from the clean text back to offsets in the
    concatenated original text, plus optional per-fragment meta data.
    """

    def __init__(self, alphabet, to_lower=True, normalize_space=True, dashes_to_ws=True):
        """Create a cleaner that keeps only characters known to ``alphabet``.

        :param alphabet: Object exposing ``has_label(char)``
        :param to_lower: Lower-case fragments before cleaning
        :param normalize_space: Collapse whitespace runs into single spaces
        :param dashes_to_ws: Turn '-' into a space when '-' is not in the alphabet
        """
        self.alphabet = alphabet
        self.to_lower = to_lower
        self.normalize_space = normalize_space
        self.dashes_to_ws = dashes_to_ws
        self.original_text = ''
        self.clean_text = ''
        # positions[i] is the offset in original_text of clean_text[i]
        self.positions = []
        # meta[i] is the meta object of the fragment that produced
        # clean_text[i] (None for inserted separator spaces)
        self.meta = []

    def add_original_text(self, original_text, meta=None):
        """Append ``original_text``, extending ``clean_text`` with its cleaned
        form; ``meta`` is recorded for every clean character it produces."""
        if len(self.positions) > 0:
            # separate fragments with a single space in both views
            self.clean_text += ' '
            self.original_text += ' '
            self.positions.append(len(self.original_text) - 1)
            self.meta.append(None)
            last_was_space = True
        else:
            last_was_space = False
        kept = []
        source = original_text.lower() if self.to_lower else original_text
        # original_text is appended only after the loop, so this is invariant
        base = len(self.original_text)
        for offset, ch in enumerate(source):
            if self.dashes_to_ws and ch == '-' and not self.alphabet.has_label('-'):
                ch = ' '
            if self.normalize_space and ch.isspace():
                if last_was_space:
                    continue  # collapse whitespace runs
                last_was_space = True
                ch = ' '
            if not self.alphabet.has_label(ch):
                continue  # drop characters outside the alphabet
            if not ch.isspace():
                last_was_space = False
            kept.append(ch)
            self.positions.append(base + offset)
            self.meta.append(meta)
        self.original_text += original_text
        self.clean_text += ''.join(kept)

    def get_original_offset(self, clean_offset):
        """Map an offset in ``clean_text`` back to an offset in
        ``original_text``; an offset one past the end maps to one past the
        last recorded position."""
        if clean_offset == len(self.positions):
            return self.positions[-1] + 1
        return self.positions[clean_offset]

    def collect_meta(self, from_clean_offset, to_clean_offset=None):
        """Return the meta of one clean offset, or the de-duplicated list of
        non-None metas over the inclusive range [from, to]."""
        if to_clean_offset is None:
            return self.meta[from_clean_offset]
        collected = []
        for item in self.meta[from_clean_offset:to_clean_offset + 1]:
            if item is not None and item not in collected:
                collected.append(item)
        return collected
2019-07-04 19:00:47 +03:00
2019-07-16 22:07:39 +03:00
class TextRange(object):
    """A [start, end) interval within a text document (a plain string)."""

    def __init__(self, document, start, end):
        self.document = document
        self.start = start
        self.end = end

    @staticmethod
    def token_at(text, position):
        """Return the whitespace-delimited token covering ``position`` in
        ``text``, or an empty range at ``position`` if it falls on whitespace
        or outside the text."""
        lo, hi = len(text), 0
        # scan left then right from position over non-whitespace characters
        for step in (-1, 1):
            cursor = position
            while 0 <= cursor < len(text) and not text[cursor].isspace():
                lo = min(lo, cursor)
                hi = max(hi, cursor)
                cursor += step
        if lo <= hi:
            return TextRange(text, lo, hi + 1)
        return TextRange(text, position, position)

    def neighbour_token(self, direction):
        """Token to the left (direction < 0) or right (otherwise) of this one."""
        probe = self.start - 2 if direction < 0 else self.end + 1
        return TextRange.token_at(self.document, probe)

    def next_token(self):
        """Token to the right of this one."""
        return self.neighbour_token(1)

    def prev_token(self):
        """Token to the left of this one."""
        return self.neighbour_token(-1)

    def get_text(self):
        """The substring of the document covered by this range."""
        return self.document[self.start:self.end]

    def __add__(self, other):
        # Only ranges over the very same document can be merged.
        if not self.document == other.document:
            raise Exception('Unable to add token from other string')
        return TextRange(self.document,
                         min(self.start, other.start),
                         max(self.end, other.end))

    def __eq__(self, other):
        return self.document == other.document and \
            self.start == other.start and \
            self.end == other.end

    def __len__(self):
        return self.end - self.start
2019-07-09 19:06:27 +03:00
2019-08-01 19:50:34 +03:00
def ngrams(s, size):
    """
    Lists all appearances of all N-grams of a string from left to right.
    :param s: String to decompose
    :param size: N-gram size
    :return: Produces strings representing all N-grams
    """
    last_start = len(s) - size
    if size < 1 or last_start < 1:
        # degenerate cases: exactly one n-gram (len(s) == size) or none at all
        if last_start == 0:
            yield s
        return
    for start in range(last_start + 1):
        yield s[start:start + size]
def weighted_ngrams(s, size, direction=0):
    """
    Lists all appearances of all N-grams of a string from left to right
    together with a positional weight value.
    The positional weight progresses quadratically.
    :param s: String to decompose
    :param size: N-gram size
    :param direction: Order of assigning positional weights to N-grams:
        direction < 0: Weight of first N-gram is 1.0 and of last one 0.0
        direction > 0: Weight of first N-gram is 0.0 and of last one 1.0
        direction == 0: Weight of center N-gram(s) near or equal 0, weight of first and last N-gram 1.0
    :return: Produces (string, float) tuples representing the N-gram along with
        its assigned positional weight value
    """
    gram_stream = ngrams(s, size)
    return enweight(gram_stream, direction=direction)
2019-08-01 19:50:34 +03:00
def similarity(a, b, direction=0, min_ngram_size=1, max_ngram_size=3, size_factor=1, position_factor=1):
    """
    Computes similarity value of two strings ranging from 0.0 (completely
    different) to 1.0 (completely equal).
    Counts the intersection of weighted N-gram multi-sets of both strings.
    :param a: String to compare
    :param b: String to compare
    :param direction: Order of equality importance:
        direction < 0: Left ends of strings more important to be similar
        direction > 0: Right ends of strings more important to be similar
        direction == 0: Left and right ends more important to be similar than center parts
    :param min_ngram_size: Minimum N-gram size to take into account
    :param max_ngram_size: Maximum N-gram size to take into account
    :param size_factor: Importance factor of the N-gram size (compared to the positional importance)
    :param position_factor: Importance factor of the N-gram position (compared to the size importance)
    :return: Number between 0.0 (completely different) and 1.0 (completely equal)
    """
    # Ensure a is the longer string, so normalization divides by the larger
    # of the two weight totals.
    if len(a) < len(b):
        a, b = b, a
    ca, cb = Counter(), Counter()
    for s, counter in ((a, ca), (b, cb)):
        for size in range(min_ngram_size, max_ngram_size + 1):
            for ng, position_weight in weighted_ngrams(s, size, direction=direction):
                counter[ng] += size * size_factor + position_weight * position_weight * position_factor
    total = sum(ca.values())
    if total == 0:
        # FIX: no N-grams at all (e.g. both strings empty, or shorter than
        # min_ngram_size) previously raised ZeroDivisionError here.
        return 1.0 if a == b else 0.0
    score = sum(min(ca[key], cb[key]) for key in set(ca.keys()) & set(cb.keys()))
    return score / total
2019-07-31 19:27:57 +03:00
2019-06-26 20:04:37 +03:00
# The following code is from: http://hetland.org/coding/python/levenshtein.py
# This is a straightforward implementation of a well-known algorithm, and thus
# probably shouldn't be covered by copyright to begin with. But in case it is,
# the author (Magnus Lie Hetland) has, to the extent possible under law,
# dedicated all copyright and related and neighboring rights to this software
# to the public domain worldwide, by distributing it under the CC0 license,
# version 1.0. This software is distributed without any warranty. For more
# information, see <http://creativecommons.org/publicdomain/zero/1.0>
def levenshtein(a, b):
    """
    Calculates the Levenshtein distance between a and b.
    """
    # Keep a as the shorter string so only O(min(len(a), len(b))) space is used.
    if len(a) > len(b):
        a, b = b, a
    row = list(range(len(a) + 1))
    for i, cb in enumerate(b, 1):
        previous_row, row = row, [i] + [0] * len(a)
        for j, ca in enumerate(a, 1):
            insert_cost = row[j - 1] + 1
            delete_cost = previous_row[j] + 1
            substitute_cost = previous_row[j - 1] + (0 if ca == cb else 1)
            row[j] = min(insert_cost, delete_cost, substitute_cost)
    return row[len(a)]