From da4784a2cc7ff1683eb10d7cd910855691805cd3 Mon Sep 17 00:00:00 2001
From: Wenbing Li <10278425+wenbingl@users.noreply.github.com>
Date: Wed, 1 Jun 2022 10:41:42 -0700
Subject: [PATCH] update the bert end to end example with hftok (#236)

---
 onnxruntime_extensions/pnp/_nlp.py |  5 ++++-
 tutorials/bert_e2e.py              | 13 ++++++-------
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/onnxruntime_extensions/pnp/_nlp.py b/onnxruntime_extensions/pnp/_nlp.py
index f8447910..71df7f27 100644
--- a/onnxruntime_extensions/pnp/_nlp.py
+++ b/onnxruntime_extensions/pnp/_nlp.py
@@ -1,4 +1,5 @@
 import json
+from collections import OrderedDict
 
 from ._base import ProcessingTracedModule, tensor_data_type as _dt
 from ._torchext import create_op_function
@@ -17,7 +18,9 @@ def make_custom_op(ctx, op_type, input_names, output_names, container, operator_
 def bert_tokenize(ctx, input_names, output_names, container, operator_name=None, **kwargs):
     if 'hf_tok' in kwargs:
         hf_bert_tokenizer = kwargs['hf_tok']
-        attrs = {'vocab_file': hf_bert_tokenizer.vocab}
+        ordered_vocab = OrderedDict(sorted(hf_bert_tokenizer.vocab.items(), key=lambda item: int(item[1])))
+        vocab = '\n'.join(ordered_vocab.keys())
+        attrs = dict(vocab_file=vocab)
     elif 'vocab_file' in kwargs:
         vocab = None
         vocab_file = kwargs['vocab_file']
diff --git a/tutorials/bert_e2e.py b/tutorials/bert_e2e.py
index 0e66f34e..e32034e0 100644
--- a/tutorials/bert_e2e.py
+++ b/tutorials/bert_e2e.py
@@ -1,6 +1,5 @@
 import onnx
 import torch
-import onnxruntime_extensions
 
 from pathlib import Path
 from onnxruntime_extensions import pnp, OrtPyFunction
@@ -10,13 +9,13 @@ from transformers.onnx import export, FeaturesManager
 # get an onnx model by converting HuggingFace pretrained model
 model_name = "bert-base-cased"
 model_path = Path("onnx-model/bert-base-cased.onnx")
+tokenizer = AutoTokenizer.from_pretrained(model_name)
 if not model_path.exists():
     if not model_path.parent.exists():
         model_path.parent.mkdir(parents=True, exist_ok=True)
     model = FeaturesManager.get_model_from_feature("default", model_name)
     model_kind, model_onnx_config = FeaturesManager.check_supported_model_or_raise(model, feature="default")
     onnx_config = model_onnx_config(model.config)
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
     export(tokenizer,
            model=model,
            config=onnx_config,
@@ -35,20 +34,20 @@ def mapping_token_output(_1, _2, _3):
 
 
 test_sentence = ["this is a test sentence."]
-ort_tok = pnp.PreHuggingFaceBert(
-    vocab_file=onnxruntime_extensions.get_test_data_file(
-        '../test', 'data', 'bert_basic_cased_vocab.txt'))
+ort_tok = pnp.PreHuggingFaceBert(hf_tok=tokenizer)
 onnx_model = onnx.load_model(str(model_path))
 
+
+augmented_model_name = 'temp_bert_tok_all.onnx'
 # create the final onnx model which includes pre- and post- processing.
 augmented_model = pnp.export(pnp.SequentialProcessingModule(
                              ort_tok, mapping_token_output,
                              onnx_model, post_processing_forward),
                              test_sentence,
                              opset_version=12,
-                             output_path='bert_tok_all.onnx')
+                             output_path=augmented_model_name)
 
 # test the augmented onnx model with raw string input.
-model_func = OrtPyFunction.from_model('bert_tok_all.onnx')
+model_func = OrtPyFunction.from_model(augmented_model_name)
 result = model_func(test_sentence)
 print(result)