Add perf check pipeline (#523)
## Describe your changes Add perf check pipeline. - 4 models supported: - Intel/bert-base-uncased-mrpc - microsoft/deberta-base-mnli - distilbert-base-uncased-finetuned-sst-2-english - roberta-large-mnli - The pipeline will be run at 6:00 am every Friday. - Each metric will be run 10 times and print the average. - You can manually run it by `python run_performance_check.py --model_name bert` - Pipeline link: https://aiinfra.visualstudio.com/Model%20optimization%20Toolkit/_build?definitionId=1265&_a=summary ## Checklist before requesting a review - [ ] Add unit tests for this change. - [ ] Make sure all tests can pass. - [ ] Update documents if necessary. - [ ] Format your code by running `pre-commit run --all-files` - [ ] Is this a user-facing change? If yes, give a description of this change to be included in the release notes. ## (Optional) Issue link
This commit is contained in:
Parent
b178e2f2c8
Commit
5cef4ddcc0
|
@ -0,0 +1,46 @@
|
|||
# Reusable job template for the scheduled model-performance pipeline.
#
# Parameters:
#   name       - caller-supplied job-group label (declared so callers may pass it).
#   model_name - default model alias; normally overridden per matrix entry.
#   pool       - agent pool the job runs on.
#   device     - 'cpu' or 'gpu'; selects the olive extras and the perf target device.
#   windows    - forwarded to `make clean` via the WINDOWS variable.
#   examples   - mapping of matrix entries (each sets model_name) inserted into strategy.matrix.
#
# NOTE: `name`, `windows` and `examples` were referenced/passed but previously
# undeclared; Azure DevOps rejects undeclared template parameters, so all of
# them are declared here.
parameters:
  name: ''
  model_name: ''
  pool: ''
  device: 'cpu'
  windows: False
  examples: {}

jobs:
- job: ${{ parameters.device }}_Model_Performance
  timeoutInMinutes: 300
  pool:
    name: ${{ parameters.pool }}
  strategy:
    matrix:
      # Each entry in `examples` becomes one matrix leg defining $(model_name).
      ${{ insert }}: ${{ parameters.examples }}
  variables:
    WINDOWS: ${{ parameters.windows }}
    runCodesignValidationInjection: false
    device: ${{ parameters.device }}

  steps:
  - task: UsePythonVersion@0
    inputs:
      versionSpec: 3.8
    displayName: Use Python 3.8

  # INSTALL_EXTRAS picks the cpu/gpu extras set matching the target device.
  - script: make install-olive PIPELINE=True INSTALL_EXTRAS=[$(device)]
    displayName: Install Olive

  # $(MODEL_NAME) resolves (case-insensitively) to the matrix leg's model_name.
  - script: make performance PIPELINE=True MODEL_NAME=$(MODEL_NAME) DEVICE=${{ parameters.device }}
    displayName: Run performance comparison

  - task: CredScan@3
    displayName: 'Run CredScan'
    inputs:
      debugMode: false
    continueOnError: true

  - task: ComponentGovernanceComponentDetection@0
    inputs:
      scanType: 'Register'
      verbosity: 'Verbose'
      alertWarningLevel: 'High'
    displayName: Component Detection

  # Always clean the agent workspace, even when earlier steps failed.
  - script: make clean WINDOWS=$(WINDOWS)
    condition: always()
    displayName: Clean remaining artifacts
|
@ -0,0 +1,44 @@
|
|||
# Performance-check pipeline: never triggered by pushes or PRs,
# only by the weekly schedule below.
trigger: none
pr: none

# Run every Friday at 06:00 UTC on main, even when nothing changed.
schedules:
- cron: 0 6 * * 5
  displayName: Scheduled Build
  branches:
    include:
    - main
  always: true

jobs:
# CPU legs: one matrix entry per supported model alias.
- template: job_templates/olive-performance-template.yaml
  parameters:
    name: Linux_CPU_CI
    pool: $(OLIVE_POOL_UBUNTU2004)
    windows: False
    device: cpu
    examples:
      bert:
        model_name: bert
      distilbert:
        model_name: distilbert
      deberta:
        model_name: deberta
      roberta_large:
        model_name: roberta_large

# GPU legs over the same set of models.
- template: job_templates/olive-performance-template.yaml
  parameters:
    name: Linux_GPU_CI
    pool: $(OLIVE_GPU_POOL_UBUNTU2004)
    windows: False
    device: gpu
    examples:
      bert:
        model_name: bert
      distilbert:
        model_name: distilbert
      deberta:
        model_name: deberta
      roberta_large:
        model_name: roberta_large
|
@ -0,0 +1,86 @@
|
|||
{
|
||||
"input_model":{
|
||||
"type": "PyTorchModel",
|
||||
"config": {
|
||||
"hf_config": {
|
||||
"model_name": "Intel/bert-base-uncased-mrpc",
|
||||
"task": "text-classification",
|
||||
"dataset": {
|
||||
"data_name":"glue",
|
||||
"subset": "mrpc",
|
||||
"split": "validation",
|
||||
"input_cols": ["sentence1", "sentence2"],
|
||||
"label_cols": ["label"],
|
||||
"batch_size": 1,
|
||||
"max_samples": 100
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"evaluators": {
|
||||
"common_evaluator": {
|
||||
"metrics":[
|
||||
{
|
||||
"name": "accuracy",
|
||||
"type": "accuracy",
|
||||
"backend": "huggingface_metrics",
|
||||
"sub_types": [
|
||||
{"name": "accuracy", "priority": 1, "goal": {"type": "max-degradation", "value": 0.01}}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "latency",
|
||||
"type": "latency",
|
||||
"sub_types": [
|
||||
{"name": "avg", "priority": 2, "goal": {"type": "percent-min-improvement", "value": 20}}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"passes": {
|
||||
"conversion": {
|
||||
"type": "OnnxConversion",
|
||||
"config": {
|
||||
"target_opset": 13
|
||||
}
|
||||
},
|
||||
"transformers_optimization": {
|
||||
"type": "OrtTransformersOptimization",
|
||||
"disable_search": true,
|
||||
"config": {
|
||||
"model_type": "bert",
|
||||
"num_heads": 12,
|
||||
"hidden_size": 768,
|
||||
"float16": false
|
||||
}
|
||||
},
|
||||
"quantization": {
|
||||
"type": "OnnxQuantization",
|
||||
"config": {
|
||||
"data_config": "__input_model_data_config__"
|
||||
}
|
||||
},
|
||||
"perf_tuning": {
|
||||
"type": "OrtPerfTuning",
|
||||
"config": {
|
||||
"data_config": "__input_model_data_config__"
|
||||
}
|
||||
}
|
||||
},
|
||||
"engine": {
|
||||
"search_strategy": {
|
||||
"execution_order": "joint",
|
||||
"search_algorithm": "tpe",
|
||||
"search_algorithm_config": {
|
||||
"num_samples": 3,
|
||||
"seed": 0
|
||||
}
|
||||
},
|
||||
"clean_cache": true,
|
||||
"evaluator": "common_evaluator",
|
||||
"execution_providers": ["CPUExecutionProvider"],
|
||||
"cache_dir": "cache",
|
||||
"output_dir" : "models/bert_ptq"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,85 @@
|
|||
{
|
||||
"input_model":{
|
||||
"type": "PyTorchModel",
|
||||
"config": {
|
||||
"hf_config": {
|
||||
"model_name": "Intel/bert-base-uncased-mrpc",
|
||||
"task": "text-classification",
|
||||
"dataset": {
|
||||
"data_name":"glue",
|
||||
"subset": "mrpc",
|
||||
"split": "validation",
|
||||
"input_cols": ["sentence1", "sentence2"],
|
||||
"label_cols": ["label"],
|
||||
"batch_size": 1,
|
||||
"max_samples": 100
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"evaluators": {
|
||||
"common_evaluator": {
|
||||
"metrics":[
|
||||
{
|
||||
"name": "accuracy",
|
||||
"type": "accuracy",
|
||||
"backend": "huggingface_metrics",
|
||||
"sub_types": [
|
||||
{"name": "accuracy", "priority": 1, "goal": {"type": "max-degradation", "value": 0.01}}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "latency",
|
||||
"type": "latency",
|
||||
"sub_types": [
|
||||
{"name": "avg", "priority": 2, "goal": {"type": "percent-min-improvement", "value": 20}}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"passes": {
|
||||
"conversion": {
|
||||
"type": "OnnxConversion",
|
||||
"config": {
|
||||
"target_opset": 13
|
||||
}
|
||||
},
|
||||
"transformers_optimization": {
|
||||
"type": "OrtTransformersOptimization",
|
||||
"disable_search": true,
|
||||
"config": {
|
||||
"model_type": "bert",
|
||||
"num_heads": 12,
|
||||
"hidden_size": 768,
|
||||
"float16": true
|
||||
}
|
||||
},
|
||||
"perf_tuning": {
|
||||
"type": "OrtPerfTuning",
|
||||
"config": {
|
||||
"enable_cuda_graph": true,
|
||||
"data_config": "__input_model_data_config__"
|
||||
}
|
||||
}
|
||||
},
|
||||
"pass_flows": [
|
||||
["conversion", "transformers_optimization", "perf_tuning"],
|
||||
["conversion", "perf_tuning"]
|
||||
],
|
||||
"engine": {
|
||||
"search_strategy": {
|
||||
"execution_order": "joint",
|
||||
"search_algorithm": "tpe",
|
||||
"search_algorithm_config": {
|
||||
"num_samples": 3,
|
||||
"seed": 0
|
||||
}
|
||||
},
|
||||
"evaluator": "common_evaluator",
|
||||
"execution_providers": ["CUDAExecutionProvider"],
|
||||
"clean_cache": true,
|
||||
"cache_dir": "cache",
|
||||
"output_dir" : "models/bert_gpu"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,92 @@
|
|||
{
|
||||
"input_model":{
|
||||
"type": "PyTorchModel",
|
||||
"config": {
|
||||
"hf_config": {
|
||||
"model_name": "microsoft/deberta-base-mnli",
|
||||
"task": "text-classification",
|
||||
"dataset": {
|
||||
"data_name":"glue",
|
||||
"subset": "mnli_matched",
|
||||
"split": "validation",
|
||||
"input_cols": ["premise", "hypothesis"],
|
||||
"label_cols": ["label"],
|
||||
"batch_size": 1,
|
||||
"max_samples": 100,
|
||||
"component_kwargs": {
|
||||
"pre_process_data": {
|
||||
"align_labels": true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"evaluators": {
|
||||
"common_evaluator": {
|
||||
"metrics":[
|
||||
{
|
||||
"name": "accuracy",
|
||||
"type": "accuracy",
|
||||
"backend": "huggingface_metrics",
|
||||
"sub_types": [
|
||||
{"name": "accuracy", "priority": 1}
|
||||
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "latency",
|
||||
"type": "latency",
|
||||
"sub_types": [
|
||||
{"name": "avg", "priority": 2}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"passes": {
|
||||
"conversion": {
|
||||
"type": "OnnxConversion",
|
||||
"config": {
|
||||
"target_opset": 13
|
||||
}
|
||||
},
|
||||
"transformers_optimization": {
|
||||
"type": "OrtTransformersOptimization",
|
||||
"disable_search": true,
|
||||
"config": {
|
||||
"model_type": "bert",
|
||||
"num_heads": 12,
|
||||
"hidden_size": 768,
|
||||
"float16": false
|
||||
}
|
||||
},
|
||||
"quantization": {
|
||||
"type": "OnnxQuantization",
|
||||
"config": {
|
||||
"data_config": "__input_model_data_config__"
|
||||
}
|
||||
},
|
||||
"perf_tuning": {
|
||||
"type": "OrtPerfTuning",
|
||||
"config": {
|
||||
"data_config": "__input_model_data_config__"
|
||||
}
|
||||
}
|
||||
},
|
||||
"engine": {
|
||||
"search_strategy": {
|
||||
"execution_order": "joint",
|
||||
"search_algorithm": "tpe",
|
||||
"search_algorithm_config": {
|
||||
"num_samples": 3,
|
||||
"seed": 0
|
||||
}
|
||||
},
|
||||
"clean_cache": true,
|
||||
"evaluator": "common_evaluator",
|
||||
"execution_providers": ["CPUExecutionProvider"],
|
||||
"cache_dir": "cache",
|
||||
"output_dir" : "models/microsoft-deberta"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,91 @@
|
|||
{
|
||||
"input_model":{
|
||||
"type": "PyTorchModel",
|
||||
"config": {
|
||||
"hf_config": {
|
||||
"model_name": "microsoft/deberta-base-mnli",
|
||||
"task": "text-classification",
|
||||
"dataset": {
|
||||
"data_name":"glue",
|
||||
"subset": "mnli_matched",
|
||||
"split": "validation",
|
||||
"input_cols": ["premise", "hypothesis"],
|
||||
"label_cols": ["label"],
|
||||
"batch_size": 1,
|
||||
"max_samples": 100,
|
||||
"component_kwargs": {
|
||||
"pre_process_data": {
|
||||
"align_labels": true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"evaluators": {
|
||||
"common_evaluator": {
|
||||
"metrics":[
|
||||
{
|
||||
"name": "accuracy",
|
||||
"type": "accuracy",
|
||||
"backend": "huggingface_metrics",
|
||||
"sub_types": [
|
||||
{"name": "accuracy", "priority": 1}
|
||||
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "latency",
|
||||
"type": "latency",
|
||||
"sub_types": [
|
||||
{"name": "avg", "priority": 2}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"passes": {
|
||||
"conversion": {
|
||||
"type": "OnnxConversion",
|
||||
"config": {
|
||||
"target_opset": 13
|
||||
}
|
||||
},
|
||||
"transformers_optimization": {
|
||||
"type": "OrtTransformersOptimization",
|
||||
"disable_search": true,
|
||||
"config": {
|
||||
"model_type": "bert",
|
||||
"num_heads": 12,
|
||||
"hidden_size": 768,
|
||||
"float16": true
|
||||
}
|
||||
},
|
||||
"perf_tuning": {
|
||||
"type": "OrtPerfTuning",
|
||||
"config": {
|
||||
"enable_cuda_graph": true,
|
||||
"data_config": "__input_model_data_config__"
|
||||
}
|
||||
}
|
||||
},
|
||||
"pass_flows": [
|
||||
["conversion", "transformers_optimization", "perf_tuning"],
|
||||
["conversion", "perf_tuning"]
|
||||
],
|
||||
"engine": {
|
||||
"search_strategy": {
|
||||
"execution_order": "joint",
|
||||
"search_algorithm": "tpe",
|
||||
"search_algorithm_config": {
|
||||
"num_samples": 3,
|
||||
"seed": 0
|
||||
}
|
||||
},
|
||||
"clean_cache": true,
|
||||
"evaluator": "common_evaluator",
|
||||
"execution_providers": ["CUDAExecutionProvider"],
|
||||
"cache_dir": "cache",
|
||||
"output_dir" : "models/microsoft-deberta_cuda"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,87 @@
|
|||
{
|
||||
"input_model":{
|
||||
"type": "PyTorchModel",
|
||||
"config": {
|
||||
"hf_config": {
|
||||
"model_name": "distilbert-base-uncased-finetuned-sst-2-english",
|
||||
"task": "text-classification",
|
||||
"dataset": {
|
||||
"data_name":"glue",
|
||||
"subset": "sst2",
|
||||
"split": "validation",
|
||||
"input_cols": ["sentence"],
|
||||
"label_cols": ["label"],
|
||||
"batch_size": 1,
|
||||
"max_samples": 100
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
"evaluators": {
|
||||
"common_evaluator": {
|
||||
"metrics":[
|
||||
{
|
||||
"name": "accuracy",
|
||||
"type": "accuracy",
|
||||
"backend": "huggingface_metrics",
|
||||
"sub_types": [
|
||||
{"name": "accuracy", "priority": 1}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "latency",
|
||||
"type": "latency",
|
||||
"sub_types": [
|
||||
{"name": "avg", "priority": 2}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"passes": {
|
||||
"conversion": {
|
||||
"type": "OnnxConversion",
|
||||
"config": {
|
||||
"target_opset": 13
|
||||
}
|
||||
},
|
||||
"transformers_optimization": {
|
||||
"type": "OrtTransformersOptimization",
|
||||
"disable_search": true,
|
||||
"config": {
|
||||
"model_type": "bert",
|
||||
"num_heads": 12,
|
||||
"hidden_size": 768,
|
||||
"float16": false
|
||||
}
|
||||
},
|
||||
"quantization": {
|
||||
"type": "OnnxQuantization",
|
||||
"config": {
|
||||
"data_config": "__input_model_data_config__"
|
||||
}
|
||||
},
|
||||
"perf_tuning": {
|
||||
"type": "OrtPerfTuning",
|
||||
"config": {
|
||||
"data_config": "__input_model_data_config__"
|
||||
}
|
||||
}
|
||||
},
|
||||
"engine": {
|
||||
"search_strategy": {
|
||||
"execution_order": "joint",
|
||||
"search_algorithm": "tpe",
|
||||
"search_algorithm_config": {
|
||||
"num_samples": 3,
|
||||
"seed": 0
|
||||
}
|
||||
},
|
||||
"clean_cache": true,
|
||||
"evaluator": "common_evaluator",
|
||||
"execution_providers": ["CPUExecutionProvider"],
|
||||
"cache_dir": "cache",
|
||||
"output_dir" : "models/distilbert"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,85 @@
|
|||
{
|
||||
"input_model":{
|
||||
"type": "PyTorchModel",
|
||||
"config": {
|
||||
"hf_config": {
|
||||
"model_name": "distilbert-base-uncased-finetuned-sst-2-english",
|
||||
"task": "text-classification",
|
||||
"dataset": {
|
||||
"data_name":"glue",
|
||||
"subset": "sst2",
|
||||
"split": "validation",
|
||||
"input_cols": ["sentence"],
|
||||
"label_cols": ["label"],
|
||||
"batch_size": 1,
|
||||
"max_samples": 100
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"evaluators": {
|
||||
"common_evaluator": {
|
||||
"metrics":[
|
||||
{
|
||||
"name": "accuracy",
|
||||
"type": "accuracy",
|
||||
"backend": "huggingface_metrics",
|
||||
"sub_types": [
|
||||
{"name": "accuracy", "priority": 1, "goal": {"type": "max-degradation", "value": 0.01}}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "latency",
|
||||
"type": "latency",
|
||||
"sub_types": [
|
||||
{"name": "avg", "priority": 2, "goal": {"type": "percent-min-improvement", "value": 20}}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"passes": {
|
||||
"conversion": {
|
||||
"type": "OnnxConversion",
|
||||
"config": {
|
||||
"target_opset": 13
|
||||
}
|
||||
},
|
||||
"transformers_optimization": {
|
||||
"type": "OrtTransformersOptimization",
|
||||
"disable_search": true,
|
||||
"config": {
|
||||
"model_type": "bert",
|
||||
"num_heads": 12,
|
||||
"hidden_size": 768,
|
||||
"float16": true
|
||||
}
|
||||
},
|
||||
"perf_tuning": {
|
||||
"type": "OrtPerfTuning",
|
||||
"config": {
|
||||
"enable_cuda_graph": true,
|
||||
"data_config": "__input_model_data_config__"
|
||||
}
|
||||
}
|
||||
},
|
||||
"pass_flows": [
|
||||
["conversion", "transformers_optimization", "perf_tuning"],
|
||||
["conversion", "perf_tuning"]
|
||||
],
|
||||
"engine": {
|
||||
"search_strategy": {
|
||||
"execution_order": "joint",
|
||||
"search_algorithm": "tpe",
|
||||
"search_algorithm_config": {
|
||||
"num_samples": 3,
|
||||
"seed": 0
|
||||
}
|
||||
},
|
||||
"evaluator": "common_evaluator",
|
||||
"execution_providers": ["CUDAExecutionProvider"],
|
||||
"clean_cache": true,
|
||||
"cache_dir": "cache",
|
||||
"output_dir" : "models/distilbert_cuda"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,26 @@
|
|||
{
|
||||
"input_model":{
|
||||
"type": "ONNXModel",
|
||||
"config": {
|
||||
}
|
||||
},
|
||||
"systems": {
|
||||
"local_system": {
|
||||
"type": "LocalSystem",
|
||||
"config": {
|
||||
"accelerators": ["cpu"]
|
||||
}
|
||||
}
|
||||
},
|
||||
"evaluators": {
|
||||
"common_evaluator": {
|
||||
"metrics":[]
|
||||
}
|
||||
},
|
||||
"engine": {
|
||||
"evaluator": "common_evaluator",
|
||||
"host": "local_system",
|
||||
"target": "local_system",
|
||||
"clean_cache": true
|
||||
}
|
||||
}
|
|
@ -0,0 +1,91 @@
|
|||
{
|
||||
"input_model":{
|
||||
"type": "PyTorchModel",
|
||||
"config": {
|
||||
"hf_config": {
|
||||
"model_name": "roberta-large-mnli",
|
||||
"task": "text-classification",
|
||||
"dataset": {
|
||||
"data_name":"glue",
|
||||
"subset": "mnli_matched",
|
||||
"split": "validation",
|
||||
"input_cols": ["premise", "hypothesis"],
|
||||
"label_cols": ["label"],
|
||||
"batch_size": 1,
|
||||
"max_samples": 100,
|
||||
"component_kwargs": {
|
||||
"pre_process_data": {
|
||||
"align_labels": true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"evaluators": {
|
||||
"common_evaluator": {
|
||||
"metrics":[
|
||||
{
|
||||
"name": "accuracy",
|
||||
"type": "accuracy",
|
||||
"backend": "huggingface_metrics",
|
||||
"sub_types": [
|
||||
{"name": "accuracy", "priority": 1, "goal": {"type": "max-degradation", "value": 0.01}}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "latency",
|
||||
"type": "latency",
|
||||
"sub_types": [
|
||||
{"name": "avg", "priority": 2, "goal": {"type": "percent-min-improvement", "value": 20}}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"passes": {
|
||||
"conversion": {
|
||||
"type": "OnnxConversion",
|
||||
"config": {
|
||||
"target_opset": 13
|
||||
}
|
||||
},
|
||||
"transformers_optimization": {
|
||||
"type": "OrtTransformersOptimization",
|
||||
"disable_search": true,
|
||||
"config": {
|
||||
"model_type": "bert",
|
||||
"num_heads": 12,
|
||||
"hidden_size": 768,
|
||||
"float16": false
|
||||
}
|
||||
},
|
||||
"quantization": {
|
||||
"type": "OnnxQuantization",
|
||||
"config": {
|
||||
"data_config": "__input_model_data_config__"
|
||||
}
|
||||
},
|
||||
"perf_tuning": {
|
||||
"type": "OrtPerfTuning",
|
||||
"config": {
|
||||
"data_config": "__input_model_data_config__"
|
||||
}
|
||||
}
|
||||
},
|
||||
"engine": {
|
||||
"search_strategy": {
|
||||
"execution_order": "joint",
|
||||
"search_algorithm": "tpe",
|
||||
"search_algorithm_config": {
|
||||
"num_samples": 3,
|
||||
"seed": 0
|
||||
}
|
||||
},
|
||||
"clean_cache": true,
|
||||
"evaluator": "common_evaluator",
|
||||
"execution_providers": ["CPUExecutionProvider"],
|
||||
"cache_dir": "cache",
|
||||
"output_dir" : "models/roberta_large"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,90 @@
|
|||
{
|
||||
"input_model":{
|
||||
"type": "PyTorchModel",
|
||||
"config": {
|
||||
"hf_config": {
|
||||
"model_name": "roberta-large-mnli",
|
||||
"task": "text-classification",
|
||||
"dataset": {
|
||||
"data_name":"glue",
|
||||
"subset": "mnli_matched",
|
||||
"split": "validation",
|
||||
"input_cols": ["premise", "hypothesis"],
|
||||
"label_cols": ["label"],
|
||||
"batch_size": 1,
|
||||
"max_samples": 100,
|
||||
"component_kwargs": {
|
||||
"pre_process_data": {
|
||||
"align_labels": true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"evaluators": {
|
||||
"common_evaluator": {
|
||||
"metrics":[
|
||||
{
|
||||
"name": "accuracy",
|
||||
"type": "accuracy",
|
||||
"backend": "huggingface_metrics",
|
||||
"sub_types": [
|
||||
{"name": "accuracy", "priority": 1, "goal": {"type": "max-degradation", "value": 0.01}}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "latency",
|
||||
"type": "latency",
|
||||
"sub_types": [
|
||||
{"name": "avg", "priority": 2, "goal": {"type": "percent-min-improvement", "value": 20}}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"passes": {
|
||||
"conversion": {
|
||||
"type": "OnnxConversion",
|
||||
"config": {
|
||||
"target_opset": 13
|
||||
}
|
||||
},
|
||||
"transformers_optimization": {
|
||||
"type": "OrtTransformersOptimization",
|
||||
"disable_search": true,
|
||||
"config": {
|
||||
"model_type": "bert",
|
||||
"num_heads": 12,
|
||||
"hidden_size": 768,
|
||||
"float16": true
|
||||
}
|
||||
},
|
||||
"perf_tuning": {
|
||||
"type": "OrtPerfTuning",
|
||||
"config": {
|
||||
"enable_cuda_graph": true,
|
||||
"data_config": "__input_model_data_config__"
|
||||
}
|
||||
}
|
||||
},
|
||||
"pass_flows": [
|
||||
["conversion", "transformers_optimization", "perf_tuning"],
|
||||
["conversion", "perf_tuning"]
|
||||
],
|
||||
"engine": {
|
||||
"search_strategy": {
|
||||
"execution_order": "joint",
|
||||
"search_algorithm": "tpe",
|
||||
"search_algorithm_config": {
|
||||
"num_samples": 3,
|
||||
"seed": 0
|
||||
}
|
||||
},
|
||||
"evaluator": "common_evaluator",
|
||||
"execution_providers": ["CUDAExecutionProvider"],
|
||||
"clean_cache": true,
|
||||
"cache_dir": "cache",
|
||||
"output_dir" : "models/roberta_large_cuda"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,9 @@
|
|||
apache_beam
|
||||
datasets
|
||||
evaluate
|
||||
neural-compressor
|
||||
onnxruntime
|
||||
optimum
|
||||
scikit-learn
|
||||
tabulate
|
||||
transformers
|
|
@ -0,0 +1,9 @@
|
|||
apache_beam
|
||||
datasets
|
||||
evaluate
|
||||
neural-compressor
|
||||
onnxruntime-gpu
|
||||
optimum
|
||||
scikit-learn
|
||||
tabulate
|
||||
transformers
|
|
@ -0,0 +1,297 @@
|
|||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
# The Optimum optimization levels are:
|
||||
# O1: basic general optimizations.
|
||||
# O2: basic and extended general optimizations, transformers-specific fusions.
|
||||
# O3: same as O2 with GELU approximation.
|
||||
# O4: same as O3 with mixed precision (fp16, GPU-only, requires --device cuda).
|
||||
|
||||
import argparse
|
||||
import ast
|
||||
import copy
|
||||
import json
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
from optimum.exporters.onnx import main_export
|
||||
from optimum.onnxruntime import ORTOptimizer, ORTQuantizer
|
||||
from optimum.onnxruntime.configuration import AutoOptimizationConfig, AutoQuantizationConfig
|
||||
from tabulate import tabulate
|
||||
|
||||
from olive.data.template import huggingface_data_config_template
|
||||
from olive.workflows import run as olive_run
|
||||
|
||||
# Short model alias (CLI --model_name value) -> Hugging Face Hub model id.
MODEL_NAME_MAP = {
    "bert": "Intel/bert-base-uncased-mrpc",
    "deberta": "microsoft/deberta-base-mnli",
    "distilbert": "distilbert-base-uncased-finetuned-sst-2-english",
    "roberta_large": "roberta-large-mnli",
}

# Alias -> hf_config block (model id, task, evaluation dataset spec).
# Used both to build Olive data configs for the metrics and as the
# input_model hf_config for the hf_pytorch baseline.
MODEL_NAME_TO_CONFIG_MAP = {
    "bert": {
        "model_name": "Intel/bert-base-uncased-mrpc",
        "task": "text-classification",
        "dataset": {
            "data_name": "glue",
            "subset": "mrpc",
            "split": "validation",
            "input_cols": ["sentence1", "sentence2"],
            "label_cols": ["label"],
            "batch_size": 1,
            "max_samples": 100,
        },
    },
    "deberta": {
        "model_name": "microsoft/deberta-base-mnli",
        "task": "text-classification",
        "dataset": {
            "data_name": "glue",
            "subset": "mnli_matched",
            "split": "validation",
            "input_cols": ["premise", "hypothesis"],
            "label_cols": ["label"],
            "batch_size": 1,
            "max_samples": 100,
            # MNLI label ids differ from the model's; align them during preprocessing.
            "component_kwargs": {"pre_process_data": {"align_labels": True}},
        },
    },
    "distilbert": {
        "model_name": "distilbert-base-uncased-finetuned-sst-2-english",
        "task": "text-classification",
        "dataset": {
            "data_name": "glue",
            "subset": "sst2",
            "split": "validation",
            "input_cols": ["sentence"],
            "label_cols": ["label"],
            "batch_size": 1,
            "max_samples": 100,
        },
    },
    "roberta_large": {
        "model_name": "roberta-large-mnli",
        "task": "text-classification",
        "dataset": {
            "data_name": "glue",
            "subset": "mnli_matched",
            "split": "validation",
            "input_cols": ["premise", "hypothesis"],
            "label_cols": ["label"],
            "batch_size": 1,
            "max_samples": 100,
            "component_kwargs": {"pre_process_data": {"align_labels": True}},
        },
    },
}

# Accuracy metric template (huggingface_metrics backend), priority 1:
# allow at most 0.01 absolute accuracy degradation vs. the baseline.
ACC_METRIC = {
    "name": "accuracy",
    "type": "accuracy",
    "backend": "huggingface_metrics",
    "sub_types": [{"name": "accuracy", "priority": 1, "goal": {"type": "max-degradation", "value": 0.01}}],
}

# Average-latency metric template, priority 2:
# target at least a 20% latency improvement vs. the baseline.
LAT_METRIC = {
    "name": "latency",
    "type": "latency",
    "sub_types": [{"name": "avg", "priority": 2, "goal": {"type": "percent-min-improvement", "value": 20}}],
}
||||
|
||||
|
||||
def get_args():
    """Parse command-line arguments for the performance comparison.

    Returns an argparse.Namespace with `model_name`, `device` and `test_num`.
    `--model_name` is required: downstream code indexes MODEL_NAME_MAP with it,
    so leaving it unset would otherwise fail later with a KeyError.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--model_name", type=str, required=True, help="The name of the model to run the perf comparison on"
    )
    parser.add_argument("--device", type=str, default="cpu", help="The device to run the perf comparison on")
    parser.add_argument("--test_num", type=int, default=10, help="The number of times to run the perf comparison")

    args = parser.parse_args()
    return args
|
||||
|
||||
|
||||
def export_onnx(model_name, model_root_path, device="cpu"):
    """Export a Hugging Face model to ONNX under <model_root_path>/onnx.

    Uses optimum's main_export; for GPU runs the export is done with
    device="cuda" so the exported graph matches the evaluation device.
    Returns the ONNX output directory path.
    """
    onnx_model_path = model_root_path / "onnx"
    # Plain if/else instead of a conditional expression used for side effects.
    if device == "cpu":
        main_export(model_name, onnx_model_path)
    else:
        main_export(model_name, onnx_model_path, device="cuda")
    return onnx_model_path
|
||||
|
||||
|
||||
def export_optimum_o1(optimizer, model_root_path):
    """Apply Optimum O1 (basic general optimizations) and save under optimum_o1/."""
    save_dir = model_root_path / "optimum_o1"
    optimizer.optimize(save_dir=save_dir, optimization_config=AutoOptimizationConfig.O1())
|
||||
|
||||
|
||||
def export_optimum_o2(optimizer, model_root_path):
    """Apply Optimum O2 (extended general + transformers fusions) and save under optimum_o2/."""
    save_dir = model_root_path / "optimum_o2"
    optimizer.optimize(save_dir=save_dir, optimization_config=AutoOptimizationConfig.O2())
|
||||
|
||||
|
||||
def export_optimum_o3(optimizer, model_root_path):
    """Apply Optimum O3 (O2 plus GELU approximation) and save under optimum_o3/."""
    save_dir = model_root_path / "optimum_o3"
    optimizer.optimize(save_dir=save_dir, optimization_config=AutoOptimizationConfig.O3())
|
||||
|
||||
|
||||
def export_optimum_o4(optimizer, model_root_path):
    """Apply Optimum O4 (O3 plus fp16 mixed precision, GPU-only) and save under optimum_o4/."""
    save_dir = model_root_path / "optimum_o4"
    optimizer.optimize(save_dir=save_dir, optimization_config=AutoOptimizationConfig.O4())
|
||||
|
||||
|
||||
def export_optimum_dynamic_quantization(onnx_model_path, model_root_path):
    """Dynamically quantize the exported ONNX model and save under optimum_dynamic_quantization/.

    Uses the AVX512-VNNI dynamic (is_static=False), per-tensor
    (per_channel=False) quantization preset.
    """
    save_dir = model_root_path / "optimum_dynamic_quantization"
    dynamic_config = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)
    quantizer = ORTQuantizer.from_pretrained(onnx_model_path)
    quantizer.quantize(
        save_dir=save_dir,
        quantization_config=dynamic_config,
    )
|
||||
|
||||
|
||||
def run_with_config(tool, olive_config, metric_res):
    """Run an Olive workflow and append its metrics to metric_res[tool].

    tool: accumulator key ("olive" or one of the baseline model variants).
    olive_config: dict config (baselines) or a config-file path ("olive" run).
    metric_res: {tool: {metric_name: [values, ...]}}, mutated in place.
    """
    outputs = olive_run(olive_config)
    # The "olive" run returns footprints whose nodes carry the metrics; the
    # baseline runs return the evaluation result directly. Either way,
    # round-trip through str()/literal_eval to obtain a plain dict.
    if tool == "olive":
        first_footprint = list(outputs.values())[0]
        first_node = list(first_footprint.nodes.values())[0]
        raw_metrics = str(first_node.metrics.value)
    else:
        raw_metrics = str(list(outputs.values())[0])
    parsed_metrics = ast.literal_eval(raw_metrics)

    for name, value in parsed_metrics.items():
        metric_res[tool].setdefault(name, []).append(value)
|
||||
|
||||
|
||||
def run_perf_comparison(cur_dir, model_name, device, model_root_path, test_num):
    """Evaluate each baseline export plus the Olive workflow `test_num` times.

    Returns {tool: {metric_name: average_value}} where the tools are the
    baseline model variants plus "olive".
    """
    print(f"Start running perf comparison on {model_name} model {test_num} times...")
    # Baselines common to both devices; device-specific ones appended below.
    model_list = ["hf_pytorch", "pytorch_compile", "onnx", "optimum_o1", "optimum_o2", "optimum_o3"]
    if device == "gpu":
        model_list.append("optimum_o4")  # fp16 preset is GPU-only
    if device == "cpu":
        model_list.append("optimum_dynamic_quantization")  # dynamic quantization targets CPU
    metric_res = {}
    # Shared evaluation-only config; specialized per baseline on each iteration.
    config_json_path = cur_dir / "configs" / "perf.json"
    for optimized_model in model_list:
        metric_res[f"{optimized_model}"] = {}
    metric_res["olive"] = {}
    for i in range(test_num):
        print(f"Start running {i} time...")
        for optimized_model in model_list:
            # Fresh copies each run: data_config is attached to them below.
            accuracy_metric = copy.deepcopy(ACC_METRIC)
            latency_metric = copy.deepcopy(LAT_METRIC)
            print(f"Start evaluating {optimized_model} model")
            # Reload the base config so per-model mutations don't accumulate.
            with open(config_json_path, "r") as fin:
                olive_config = json.load(fin)
            user_script_path = str(cur_dir / "user_scripts" / f"{model_name}.py")
            hf_model_config = MODEL_NAME_TO_CONFIG_MAP[model_name]
            # Point the input model at the artifact produced by this baseline.
            if optimized_model == "onnx":
                olive_config["input_model"]["config"]["model_path"] = str(
                    Path(model_root_path / optimized_model / "model.onnx")
                )
            elif optimized_model == "optimum_dynamic_quantization":
                olive_config["input_model"]["config"]["model_path"] = str(
                    Path(model_root_path / optimized_model / "model_quantized.onnx")
                )
            elif optimized_model in ["optimum_o1", "optimum_o2", "optimum_o3", "optimum_o4"]:
                olive_config["input_model"]["config"]["model_path"] = str(
                    Path(model_root_path / optimized_model / "model_optimized.onnx")
                )
            elif optimized_model == "hf_pytorch":
                # Evaluate the raw HF PyTorch model directly.
                olive_config["input_model"]["type"] = "PyTorchModel"
                hf_config = {"hf_config": hf_model_config}
                olive_config["input_model"]["config"] = hf_config
            elif optimized_model == "pytorch_compile":
                # Load a torch.compile'd model via the per-model user script.
                olive_config["input_model"]["type"] = "PyTorchModel"
                olive_config["input_model"]["config"]["model_script"] = user_script_path
                olive_config["input_model"]["config"]["model_loader"] = "torch_complied_model"

            # Device-dependent settings shared by every baseline.
            olive_config["systems"]["local_system"]["config"]["accelerators"] = (
                ["cpu"] if device == "cpu" else ["gpu"]
            )
            olive_config["engine"]["cache_dir"] = str(Path(model_root_path / optimized_model / "cache"))
            olive_config["engine"]["output_dir"] = str(Path(model_root_path / optimized_model / "output"))
            olive_config["engine"]["execution_providers"] = (
                ["CPUExecutionProvider"] if device == "cpu" else ["CUDAExecutionProvider"]
            )
            # Attach the two metrics and give each a dataset-backed data_config.
            olive_config["evaluators"]["common_evaluator"]["metrics"].append(accuracy_metric)
            olive_config["evaluators"]["common_evaluator"]["metrics"].append(latency_metric)
            olive_config["evaluators"]["common_evaluator"]["metrics"][0][
                "data_config"
            ] = huggingface_data_config_template(
                hf_model_config["model_name"], hf_model_config["task"], **hf_model_config["dataset"]
            )
            olive_config["evaluators"]["common_evaluator"]["metrics"][1][
                "data_config"
            ] = huggingface_data_config_template(
                hf_model_config["model_name"], hf_model_config["task"], **hf_model_config["dataset"]
            )

            run_with_config(optimized_model, olive_config, metric_res)

        # Full Olive workflow: uses the per-model (and per-device) config file.
        olive_config = f"{model_name}.json" if device == "cpu" else f"{model_name}_gpu.json"
        olive_config_path = cur_dir / "configs" / olive_config
        run_with_config("olive", olive_config_path, metric_res)
    print(metric_res)
    # Collapse each sample list to its mean.
    for model, v in metric_res.items():
        for metric_name, metric_value_list in v.items():
            vsum = sum(float(v) for v in metric_value_list)
            metric_res[model][metric_name] = vsum / len(metric_value_list)
    return metric_res
|
||||
|
||||
|
||||
def print_perf_table(metric_res, device):
    """Print a markdown (pipe-format) table comparing per-tool metric results.

    Args:
        metric_res: mapping of tool name -> {metric_name: averaged value}, as
            produced by run_perf_comparison (values are plain floats).
        device: "cpu" or "gpu"; only used to label the first column header.
    """
    # Guard the empty case: next(iter(...)) below would raise StopIteration.
    if not metric_res:
        print(f"No performance results to display for device {device}.")
        return

    # NOTE: the previous str()/replace("'", '"')/ast.literal_eval round-trip was
    # a no-op for the dict-of-float values produced upstream and would break on
    # any value containing a quote character, so the data is used directly.
    first_tool = next(iter(metric_res))
    columns = [f"tool({device})"] + list(metric_res[first_tool].keys())
    rows = [[tool] + list(metrics.values()) for tool, metrics in metric_res.items()]
    print(tabulate(rows, headers=columns, tablefmt="pipe"))
|
||||
|
||||
|
||||
def main():
    """Entry point: export one model through several toolchains, benchmark each
    variant, print hardware info for the run, and show a summary table."""
    cli_args = get_args()
    name = cli_args.model_name
    hf_model_id = MODEL_NAME_MAP[name]
    target_device = cli_args.device
    repeat_count = cli_args.test_num

    script_dir = Path(__file__).absolute().parent
    workdir = script_dir / "run_cache" / name
    workdir.mkdir(parents=True, exist_ok=True)

    # The ONNX export is the common starting point for every optimizer below.
    exported_onnx = export_onnx(hf_model_id, workdir, target_device)

    ort_optimizer = ORTOptimizer.from_pretrained(exported_onnx)

    # Optimum optimization: O1-O3 always run; O4 is GPU-only, while dynamic
    # quantization only applies on CPU.
    for optimum_export in (export_optimum_o1, export_optimum_o2, export_optimum_o3):
        optimum_export(ort_optimizer, workdir)
    if target_device == "gpu":
        export_optimum_o4(ort_optimizer, workdir)
    if target_device == "cpu":
        export_optimum_dynamic_quantization(exported_onnx, workdir)

    results = run_perf_comparison(script_dir, name, target_device, workdir, repeat_count)

    # Dump hardware details so pipeline logs record what the numbers were
    # measured on.
    if target_device == "cpu":
        print(subprocess.check_output(["lscpu"]).decode("utf-8"))
    elif target_device == "gpu":
        print(subprocess.check_output(["nvidia-smi"]).decode("utf-8"))
    print_perf_table(results, target_device)


if __name__ == "__main__":
    main()
|
|
@ -0,0 +1,11 @@
|
|||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# --------------------------------------------------------------------------
|
||||
import torch
|
||||
from transformers import BertForSequenceClassification
|
||||
|
||||
|
||||
def torch_complied_model(model_path):
    """Olive model loader: return a torch.compile-wrapped BERT MRPC classifier.

    model_path is part of the loader signature but unused here; the weights are
    always fetched from the "Intel/bert-base-uncased-mrpc" Hugging Face repo.
    """
    hf_model = BertForSequenceClassification.from_pretrained("Intel/bert-base-uncased-mrpc")
    return torch.compile(hf_model)
|
|
@ -0,0 +1,11 @@
|
|||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# --------------------------------------------------------------------------
|
||||
import torch
|
||||
from transformers import DebertaForSequenceClassification
|
||||
|
||||
|
||||
def torch_complied_model(model_path):
    """Olive model loader: return a torch.compile-wrapped DeBERTa MNLI classifier.

    model_path is part of the loader signature but unused here; the weights are
    always fetched from the "microsoft/deberta-base-mnli" Hugging Face repo.
    """
    hf_model = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base-mnli")
    return torch.compile(hf_model)
|
|
@ -0,0 +1,11 @@
|
|||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# --------------------------------------------------------------------------
|
||||
import torch
|
||||
from transformers import DistilBertForSequenceClassification
|
||||
|
||||
|
||||
def torch_complied_model(model_path):
    """Olive model loader: return a torch.compile-wrapped DistilBERT SST-2 classifier.

    model_path is part of the loader signature but unused here; the weights are
    always fetched from "distilbert-base-uncased-finetuned-sst-2-english".
    """
    hf_model = DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased-finetuned-sst-2-english"
    )
    return torch.compile(hf_model)
|
|
@ -0,0 +1,11 @@
|
|||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# --------------------------------------------------------------------------
|
||||
import torch
|
||||
from transformers import RobertaForSequenceClassification
|
||||
|
||||
|
||||
def torch_complied_model(model_path):
    """Olive model loader: return a torch.compile-wrapped RoBERTa-large MNLI classifier.

    model_path is part of the loader signature but unused here; the weights are
    always fetched from the "roberta-large-mnli" Hugging Face repo.
    """
    hf_model = RobertaForSequenceClassification.from_pretrained("roberta-large-mnli")
    return torch.compile(hf_model)
|
|
@ -11,6 +11,7 @@ examples/**/data
|
|||
examples/**/model
|
||||
test/**/data
|
||||
nc_workspace/
|
||||
run_cache/
|
||||
|
||||
# Onnx dynamo export artifacts
|
||||
*.sarif
|
||||
|
|
8
Makefile
8
Makefile
|
@ -5,6 +5,8 @@ EXAMPLE_FOLDER ?=
|
|||
EXAMPLE_NAME ?=
|
||||
INSTALL_EXTRAS ?=
|
||||
VERSION ?=
|
||||
MODEL_NAME ?=
|
||||
DEVICE ?=
|
||||
ifeq ($(WINDOWS), True)
|
||||
CURRENT_DIR = "$(subst /,\\,${CURDIR})"
|
||||
MKDIR_LOG_CMD = mkdir logs | exit 0
|
||||
|
@ -12,6 +14,7 @@ ifeq ($(WINDOWS), True)
|
|||
TEST_CMD = "scripts\\test.bat"
|
||||
TEST_EXAMPLES_CMD = "scripts\\test_examples.bat"
|
||||
OVERWRITE_VERSION = "python scripts\\overwrite_version.py --version $(VERSION)"
|
||||
PERF_CHECK_CMD = "scripts\\run_performance_check.bat"
|
||||
else
|
||||
CURRENT_DIR = ${CURDIR}
|
||||
MKDIR_LOG_CMD = mkdir -p logs
|
||||
|
@ -19,6 +22,7 @@ else
|
|||
TEST_CMD = bash scripts/test.sh
|
||||
TEST_EXAMPLES_CMD = bash scripts/test_examples.sh
|
||||
OVERWRITE_VERSION = python scripts/overwrite_version.py --version $(VERSION)
|
||||
PERF_CHECK_CMD = bash scripts/run_performance_check.sh
|
||||
endif
|
||||
|
||||
.PHONY: all
|
||||
|
@ -49,6 +53,10 @@ test-examples: logs/
|
|||
test-examples:
|
||||
$(TEST_EXAMPLES_CMD) $(PIPELINE) $(CURRENT_DIR) $(EXAMPLE_FOLDER) $(EXAMPLE_NAME)
|
||||
|
||||
# Run the performance-check pipeline for one model.
# Delegates to the platform-specific wrapper selected earlier in this file
# (PERF_CHECK_CMD: scripts/run_performance_check.sh, or the .bat on Windows),
# passing: <PIPELINE> <repo root> <MODEL_NAME> <DEVICE>.
.PHONY: performance
performance:
	$(PERF_CHECK_CMD) $(PIPELINE) $(CURRENT_DIR) $(MODEL_NAME) $(DEVICE)
|
||||
|
||||
# Remove build products: deletes every git-ignored file (-X), including
# untracked directories (-d), without prompting (-f).
# NOTE: destructive — anything matched by .gitignore (caches, venvs, run_cache/)
# is removed.
.PHONY: clean
clean:
	git clean -dfX
|
||||
|
|
|
@ -0,0 +1,20 @@
|
|||
REM -------------------------------------------------------------------------
REM Copyright (c) Microsoft Corporation. All rights reserved.
REM Licensed under the MIT License.
REM --------------------------------------------------------------------------
@echo off

REM Usage: run_performance_check.bat <PIPELINE> <ROOT_DIR> <MODEL_NAME> <DEVICE>
REM Invoked from the Makefile as:
REM   $(PERF_CHECK_CMD) $(PIPELINE) $(CURRENT_DIR) $(MODEL_NAME) $(DEVICE)
set PIPELINE=%1
REM Bug fix: the second argument is the repository root ($(CURRENT_DIR) from the
REM Makefile), but it was previously stored as INSTALL_DEV_MODE, leaving
REM %ROOT_DIR% undefined in every path below (the .sh counterpart sets ROOT_DIR=$2).
set ROOT_DIR=%2
set MODEL_NAME=%3
set DEVICE=%4

REM In the CI pipeline the dedicated virtual environment must be activated first.
if "%PIPELINE%"=="True" (
    call olive-venv\\Scripts\\activate.bat || goto :error
)

call python -m pip install -r %ROOT_DIR%\\.azure_pipelines\\performance_check\\requirements-%DEVICE%.txt
call python %ROOT_DIR%\\.azure_pipelines\\performance_check\\run_performance_check.py --model_name %MODEL_NAME% --device %DEVICE%

REM clean up
call rmdir /s /q %ROOT_DIR%\\.azure_pipelines\\performance_check\\run_cache
exit /b 0

REM Bug fix: ":error" was referenced by "goto :error" above but never defined,
REM which would abort the script with a "label not found" error instead of
REM exiting with a controlled failure status.
:error
exit /b 1
|
|
@ -0,0 +1,24 @@
|
|||
#!/usr/bin/env bash
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
# Usage: run_performance_check.sh <PIPELINE> <ROOT_DIR> <MODEL_NAME> <DEVICE>
# Invoked from the Makefile as:
#   $(PERF_CHECK_CMD) $(PIPELINE) $(CURRENT_DIR) $(MODEL_NAME) $(DEVICE)

# Bug fix: was "set -eoux pipefail". In a clustered option list "-o" takes its
# argument from the remaining characters of the cluster, so "ux" was consumed
# as the -o option name ("invalid option name") and strict mode was not enabled
# as intended. "-o" must come last so "pipefail" is its argument.
set -euxo pipefail

PIPELINE=$1
ROOT_DIR=$2
MODEL_NAME=$3
DEVICE=$4

echo "$PIPELINE"
# In the CI pipeline the dedicated virtual environment must be activated first.
if [[ "$PIPELINE" == "True" ]]; then
    # Temporarily disable xtrace so the activation internals stay out of the logs.
    set +x
    source olive-venv/bin/activate
    set -x
fi

python -m pip install -r "$ROOT_DIR/.azure_pipelines/performance_check/requirements-$DEVICE.txt"
python "$ROOT_DIR/.azure_pipelines/performance_check/run_performance_check.py" --model_name "$MODEL_NAME" --device "$DEVICE"

# clean up
rm -rf "$ROOT_DIR/.azure_pipelines/performance_check/run_cache"
|
Загрузка…
Ссылка в новой задаче