Issue #230: Add HuggingFace vocab format to Bert tokenizer

HuggingFace vocab format is newline separated (unlike GPT which is
json). Newline separated is likely to be faster and doesn't require
an external library to parse it. Instead of introducing a json based
format, added support for native HuggingFace newline separated token
format.
This commit is contained in:
shaahji 2022-05-25 10:13:45 -07:00 коммит произвёл shaahji
Родитель 21dc637a6d
Коммит 49548f843d
1 изменённых файлов: 13 добавлений и 14 удалений

Просмотреть файл

@ -16,16 +16,21 @@ def make_custom_op(ctx, op_type, input_names, output_names, container, operator_
outputs=((_dt.INT64, []), (_dt.INT64, []), (_dt.INT64, [])))
def bert_tokenize(ctx, input_names, output_names, container, operator_name=None, **kwargs):
if 'hf_tok' in kwargs:
# TODO: need bert-tokenizer support JSON format
hf_bert_tokenizer = kwargs['hf_tok']
attrs = {'vocab_file': json.dumps(hf_bert_tokenizer.vocab, separators=(',', ':'))}
attrs = {'vocab_file': hf_bert_tokenizer.vocab}
elif 'vocab_file' in kwargs:
attrs = dict(vocab_file=kwargs['vocab_file'])
vocab = None
vocab_file = kwargs['vocab_file']
with open(vocab_file, "r", encoding='utf-8') as vf:
lines = vf.readlines()
vocab = '\n'.join(lines)
if vocab is None:
raise RuntimeError("Cannot load vocabulary file {}!".format(vocab_file))
attrs = dict(vocab_file=vocab)
else:
raise RuntimeError("Need hf_tok/vocab_file parameter to build the tokenizer")
if 'strip_accents' in kwargs:
strip_accents = kwargs['strip_accents']
attrs['strip_accents'] = strip_accents
attrs['strip_accents'] = kwargs['strip_accents']
return make_custom_op(ctx, 'BertTokenizer', input_names,
output_names, container, operator_name=operator_name, **attrs)
@ -67,18 +72,12 @@ class PreHuggingFaceBert(ProcessingTracedModule):
def __init__(self, hf_tok=None, vocab_file=None, do_lower_case=0, strip_accents=1):
super(PreHuggingFaceBert, self).__init__()
if hf_tok is None:
_vocab = None
with open(vocab_file, "r", encoding='utf-8') as vf:
lines = vf.readlines()
_vocab = '\n'.join(lines)
if _vocab is None:
raise RuntimeError("Cannot load vocabulary file {}!".format(vocab_file))
self.onnx_bert_tokenize = create_op_function('BertTokenizer', bert_tokenize,
vocab_file=_vocab,
vocab_file=vocab_file,
do_lower_case=do_lower_case,
strip_accents=strip_accents)
else:
self.onnx_bert_tokenize = create_op_function('BertTokenizer', bert_tokenize, hf_tok=self.hf_tok)
self.onnx_bert_tokenize = create_op_function('BertTokenizer', bert_tokenize, hf_tok=hf_tok)
def forward(self, text):
return self.onnx_bert_tokenize(text)
@ -96,7 +95,7 @@ class PreHuggingFaceGPT2(ProcessingTracedModule):
merges=_get_file_content(merges_file),
padding_length=padding_length)
else:
self.onnx_gpt2_tokenize = create_op_function('GPT2Tokenizer', gpt2_tokenize, hf_tok=self.hf_tok)
self.onnx_gpt2_tokenize = create_op_function('GPT2Tokenizer', gpt2_tokenize, hf_tok=hf_tok)
def forward(self, text):
return self.onnx_gpt2_tokenize(text)