onnxruntime-extensions/tutorials/bert_e2e.py


# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import argparse
import os
import shutil
import re
import tempfile
import functools
from pathlib import Path
from onnxruntime_extensions.tools import add_pre_post_processing_to_model as add_ppp
import onnxruntime_extensions
# for tokenizer
import transformers
import numpy as np
import onnxruntime
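
# Example invocation (a sketch; <model_dir> is any directory to write the models to):
#   python bert_e2e.py --model_type csarron/mobilebert-uncased-squad-v2 <model_dir>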


# Avoid loading the model from Hugging Face multiple times; it is time-consuming.
@functools.lru_cache
def get_tokenizer_and_model_from_huggingface(model_name):
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    config = transformers.AutoConfig.from_pretrained(model_name)

    if model_name == "xlm-roberta-base":
        model = transformers.AutoModelForSequenceClassification.from_pretrained(model_name)
        onnx_config = transformers.models.xlm_roberta.XLMRobertaOnnxConfig(config, "sequence-classification")
        text = ("Hello, my dog is cute",)
    elif model_name == "google/mobilebert-uncased":
        model = transformers.MobileBertForNextSentencePrediction.from_pretrained(model_name)
        onnx_config = transformers.models.mobilebert.MobileBertOnnxConfig(config, "default")
        text = ("where is Jim Henson?", "he is at school from where two blocks away")
    elif model_name == "csarron/mobilebert-uncased-squad-v2":
        model = transformers.MobileBertForQuestionAnswering.from_pretrained(model_name)
        onnx_config = transformers.models.mobilebert.MobileBertOnnxConfig(config, "question-answering")
        text = ("Who was Jim Henson?", "Jim Henson was a nice puppet")
    elif model_name == "lordtt13/emo-mobilebert":
        model = transformers.MobileBertForSequenceClassification.from_pretrained(model_name)
        onnx_config = transformers.models.mobilebert.MobileBertOnnxConfig(config, "sequence-classification")
        text = ("Hello, my dog is cute",)
    else:
        raise ValueError(f"{model_name} is not supported yet.")

    return tokenizer, model, onnx_config, text
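
# Example (illustrative): thanks to functools.lru_cache, repeated calls with the same
# model name reuse the downloaded tokenizer/model instead of fetching them again.
# tokenizer, model, onnx_config, text = get_tokenizer_and_model_from_huggingface(
#     "csarron/mobilebert-uncased-squad-v2")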


def export_backbone(model_name: str, bert_onnx_model: Path):
    """
    Export the backbone ONNX model from Hugging Face.

    The exported model usually has the inputs "input_ids", "attention_mask" and
    "token_type_ids", and tensor outputs.
    """
    # fix the seed so we can reproduce the results
    transformers.set_seed(42)
    tokenizer, model, onnx_config, text = get_tokenizer_and_model_from_huggingface(model_name)

    if bert_onnx_model and bert_onnx_model.exists():
        print("Using cached ONNX model, skipping re-exporting the backbone model.")
        return tokenizer, bert_onnx_model, onnx_config

    # the temporary directory and its contents are removed automatically
    with tempfile.TemporaryDirectory() as tmpdir:
        canonized_name = bert_onnx_model.name
        tmp_model_path = Path(tmpdir) / canonized_name
        # export with ONNX opset 16, then copy the result to its final location
        onnx_inputs, onnx_outputs = transformers.onnx.export(tokenizer, model, onnx_config, 16, tmp_model_path)
        shutil.copy(tmp_model_path, bert_onnx_model)

    return tokenizer, bert_onnx_model, onnx_config
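
# Example (illustrative): export just the backbone; the returned path points at the
# cached .onnx file, and the tokenizer/onnx_config are reused by the steps below.
# tokenizer, onnx_path, onnx_config = export_backbone(
#     "lordtt13/emo-mobilebert", Path("lordtt13_emo_mobilebert.onnx"))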


def add_pre_post_processing_to_transformers(model_name: str, input_model_file: Path, output_model_file: Path):
    """Construct the pipeline for an end-to-end model with pre- and post-processing.

    The final model takes text as input and, for tasks such as Q&A, produces its result as text.

    Args:
        model_name (str): Model to export from Hugging Face. Used to infer the tokenizer and the ONNX backbone.
        input_model_file (Path): Where the backbone ONNX model is saved/cached. If the file does not exist,
            the backbone is exported from Hugging Face and written here.
        output_model_file (Path): Where to save the final ONNX model.
    """
    tokenizer, bert_onnx_model, onnx_config = export_backbone(model_name, input_model_file)

    if not hasattr(tokenizer, "vocab_file"):
        # serialize the in-memory vocab so the tokenizer operator can load it from disk
        vocab_file = bert_onnx_model.parent / "vocab.txt"
        import json
        with open(str(vocab_file), 'w') as f:
            f.write(json.dumps(tokenizer.vocab))
    else:
        vocab_file = tokenizer.vocab_file

    tokenizer_type = 'BertTokenizer' if model_name != 'xlm-roberta-base' else 'SentencePieceTokenizer'
    task_type = ('NextSentencePrediction' if model_name == 'google/mobilebert-uncased'
                 else ''.join([i.capitalize() for i in onnx_config.task.split('-')]))
    add_ppp.transformers_and_bert(bert_onnx_model, output_model_file,
                                  vocab_file, tokenizer_type,
                                  task_type,
                                  add_debug_before_postprocessing=True)
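

# With add_debug_before_postprocessing=True above, the final model also exposes the
# backbone's raw output under its original name plus a "_debug" suffix, so it can be
# compared against a plain run of the backbone model.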
def verify_results_for_e2e_model(model_name: str, input_bert_model: Path, output_model_file: Path):
    """
    Args:
        model_name: the Hugging Face model name
        input_bert_model: the backbone ONNX model, either exported from Hugging Face or provided by the user
        output_model_file: the finalized ONNX model to verify
    """
    tokenizer, hg_model, _, text = get_tokenizer_and_model_from_huggingface(model_name)
    encoded_input = tokenizer(*text, return_tensors="pt")
    transformers.set_seed(42)

    # reference run: the backbone model fed by the Hugging Face tokenizer
    session = onnxruntime.InferenceSession(
        str(input_bert_model.resolve(strict=True)), providers=["CPUExecutionProvider"]
    )
    inputs = {key: value.detach().numpy() for key, value in encoded_input.items()}
    output_name_for_verify = session.get_outputs()[0].name
    ref_outputs = session.run([output_name_for_verify], inputs)

    # load the custom-op library so the tokenizer operator can be resolved
    session_options = onnxruntime.SessionOptions()
    session_options.register_custom_ops_library(onnxruntime_extensions.get_library_path())
    session = onnxruntime.InferenceSession(
        str(output_model_file.resolve(strict=True)), session_options, providers=["CPUExecutionProvider"]
    )
    inputs = dict(input_text=np.array([[*text]]))
    real_outputs = session.run([output_name_for_verify + "_debug"], inputs)

    assert np.allclose(
        real_outputs[0], ref_outputs[0], atol=1e-2, rtol=1e-6
    ), f"Results do not match, expected: {ref_outputs[0]}, but got {real_outputs[0]}"
    print("Results match:", real_outputs[0], "\ndiff:", real_outputs[0] - ref_outputs[0])


def main():
    parser = argparse.ArgumentParser(
        os.path.basename(__file__),
        description="""Add pre- and post-processing to a model.

        This tutorial supports updating:
            - MobileBert with different tasks
            - XLM-Roberta with the classification task

        It provides an example of how to add pre/post-processing to a transformer model.
        It can add a tokenizer (SentencePiece/BertTokenizer/HfBertTokenizer) for pre-processing,
        and a classifier/decoder for post-processing.

        Models are exported from Hugging Face by default if an existing ONNX model is not provided.
        NOTE: if you provide an ONNX model, make sure it matches the model_type, as the Hugging Face
        tokenizer for that model type is used for the pre-processing.
        """,
    )

    parser.add_argument(
        "-t",
        "--model_type",
        type=str,
        required=True,
        choices=[
            "xlm-roberta-base",
            "google/mobilebert-uncased",
            "csarron/mobilebert-uncased-squad-v2",
            "lordtt13/emo-mobilebert",
        ],
        help="Model type.",
    )

    parser.add_argument(
        "model_path",
        type=Path,
        help="""The path to an existing ONNX model, or a directory to save a model exported from
        Hugging Face in. The model will be updated to add pre/post-processing, and saved in the
        same location with the suffix '.with_pre_post_processing.onnx'""",
    )

    args = parser.parse_args()
    model_path = args.model_path.resolve(strict=True)
    canonized_name = re.sub(r"[^a-zA-Z0-9]", "_", args.model_type) + ".onnx"
    if model_path.is_dir():
        model_path = model_path / canonized_name

    new_model_path = model_path.with_suffix(".with_pre_post_processing.onnx")
    add_pre_post_processing_to_transformers(args.model_type, model_path, new_model_path)
    verify_results_for_e2e_model(args.model_type, model_path, new_model_path)
    return new_model_path


if __name__ == "__main__":
    main()