Add Falcon-7b and Falcon-40b tokenizer support (#510)
* Add Falcon-7b and Falcon-40b tokenizer support * fix alignment and add tokenizer file in test/data to speed up compute --------- Co-authored-by: Sayan Shaw <sayanshaw@microsoft.com>
This commit is contained in:
Родитель
d9fa8ea060
Коммит
997e9ee007
|
@ -133,6 +133,8 @@ _PROCESSOR_DICT = {
|
|||
"LlamaTokenizer": TokenOpParam("SentencepieceTokenizer", HFTokenizerConverter.spm_tokenizer,
|
||||
"SentencepieceDecoder", HFTokenizerConverter.spm_decoder,
|
||||
default_inputs={'add_bos': [True]}),
|
||||
"FalconTokenizer": TokenOpParam('ClipTokenizer', HFTokenizerConverter.bpe_tokenizer,
|
||||
'BpeDecoder', HFTokenizerConverter.bpe_decoder, None)
|
||||
}
|
||||
# @formatter:on
|
||||
|
||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -28,6 +28,18 @@ class TestAutoTokenizer(unittest.TestCase):
|
|||
actual_ids = ort_tok([text])[0]
|
||||
np.testing.assert_array_equal(ids[0], actual_ids)
|
||||
|
||||
def test_falcon_tokenizer(self):
    """Check that the exported ONNX tokenizer matches the HF tokenizer's ids."""
    # NOTE(review): stand-in checkpoint — swap in the official Falcon model
    # name once that model is no longer gated on the Hub.
    hf_tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
    sentence = "why don't you teach me some German?"
    expected_ids = hf_tokenizer.encode(sentence, return_tensors="np")

    # Export the pre-processing graph and wrap it as a callable ONNX function.
    onnx_model = gen_processing_models(
        hf_tokenizer, pre_kwargs={"WITH_DEFAULT_INPUTS": True})[0]
    ort_tokenize = OrtPyFunction.from_model(onnx_model)
    produced_ids = ort_tokenize([sentence])[0]
    np.testing.assert_array_equal(expected_ids[0], produced_ids)
|
||||
|
||||
def test_t5_tokenizer(self):
|
||||
tokenizer = AutoTokenizer.from_pretrained("t5-base", model_max_length=512)
|
||||
text = "best hotel in bay area."
|
||||
|
|
Загрузка…
Ссылка в новой задаче