Improve handling of missing `vocab_file` attribute in HFTokenizerConverter (#677)
This commit updates `HFTokenizerConverter` to handle cases where the `hf_tokenizer` object does not have a `vocab_file` attribute.

Changes:
* Uses `getattr` (with a `None` default) to retrieve the `vocab_file` attribute for flexibility
* Stores the retrieved value in a separate local variable `vocab_file` for clarity
* Checks whether `vocab_file` is `None` before checking that the path exists

This ensures the converter works correctly even with tokenizers that don't define a `vocab_file` attribute.
This commit is contained in:
Родитель
29a4b49a5a
Коммит
31f129c779
|
@@ -43,10 +43,11 @@ class HFTokenizerConverter(CustomOpConverter):
|
|||
f"{hf_tokenizer.__name__}: vocab_files_names is not found")
|
||||
|
||||
tokenizer_file = filenames["tokenizer_file"]
|
||||
if (hf_tokenizer.vocab_file is None) or (not os.path.exists(hf_tokenizer.vocab_file)):
|
||||
vocab_file = getattr(hf_tokenizer, "vocab_file", None)
|
||||
if (vocab_file is None) or (not os.path.exists(vocab_file)):
|
||||
model_dir = hf_tokenizer.name_or_path
|
||||
else:
|
||||
model_dir = os.path.dirname(hf_tokenizer.vocab_file)
|
||||
model_dir = os.path.dirname(vocab_file)
|
||||
tokenizer_json = json.load(
|
||||
open(os.path.join(model_dir, tokenizer_file), "r", encoding="utf-8"))
|
||||
# get vocab object from json file
|
||||
|
|
Загрузка…
Ссылка в новой задаче