🎪 Phi2 windows (#970)
## Describe your changes

1. Fix the phi2 readme:
   - `python phi2.py cpu_fp32` -> `python phi2.py --model_type cpu_fp32`
   - onnxruntime 1.17.0 does not support transformer optimization with the `phi` model type; it is supported in the latest nightly build (1.18.0).
2. Provide phi2 optimization on Windows using the legacy ONNX exporter, since the dynamo exporter is not supported on Windows as of now.

Perf tables:

1. cpu_fp32: ![image](https://github.com/microsoft/Olive/assets/13343117/c0c69400-f53d-458c-9d0b-d361cbdba972)
2. cpu_int4: ![image](https://github.com/microsoft/Olive/assets/13343117/3072aabb-095f-4a15-9956-0ce1a84ccf65)
3. cuda_fp16: ![image](https://github.com/microsoft/Olive/assets/13343117/03c1f069-2dba-4815-b49d-49fef8dadc12)
4. cuda_int4: ![image](https://github.com/microsoft/Olive/assets/13343117/cf246d8f-4508-4cbf-afea-a6ee5eb04f06)

## Checklist before requesting a review

- [ ] Add unit tests for this change.
- [ ] Make sure all tests can pass.
- [ ] Update documents if necessary.
- [ ] Lint and apply fixes to your code by running `lintrunner -a`
- [ ] Is this a user-facing change? If yes, give a description of this change to be included in the release notes.

## (Optional) Issue link
Parent: 683ed65a5c
Commit: 6488eed3c8
@@ -6,7 +6,7 @@
 |mistral|[Link](https://github.com/microsoft/Olive/tree/main/examples/mistral)|`CPU`: with Optimum conversion and ONNX Runtime optimizations and Intel® Neural Compressor static quantization for optimized INT8 ONNX model
 |open llama|[Link](https://github.com/microsoft/Olive/tree/main/examples/open_llama)|`GPU`: with Optimum conversion and merging and ONNX Runtime optimizations for optimized ONNX model <br>`GPU`: with SparseGPT and TorchTRT conversion for an optimized PyTorch model with sparsity<br>`GPU`: with PyTorch LoRA/QLoRA/LoftQ for model fine tune<br>`GPU`: with ONNX Runtime QLoRA for model fine tune<br>`AzureML compute`: with Optimum conversion and merging and ONNX Runtime optimizations in AzureML<br>`CPU`: with Optimum conversion and merging and ONNX Runtime optimizations and Intel® Neural Compressor 4-bits weight-only quantization for optimized INT4 ONNX model
 |phi|[Link](https://github.com/microsoft/Olive/tree/main/examples/phi)|`GPU`: with PyTorch QLoRA for model fine tune
-|phi2|[Link](https://github.com/microsoft/Olive/tree/main/examples/phi2)|`CPU`: with ONNX Runtime optimizations
+|phi2|[Link](https://github.com/microsoft/Olive/tree/main/examples/phi2)|`CPU`: with ONNX Runtime optimizations fp32/int4<br>`GPU`: with ONNX Runtime optimizations fp16/int4
 |falcon|[Link](https://github.com/microsoft/Olive/tree/main/examples/falcon)|`GPU`: with ONNX Runtime optimizations for optimized FP16 ONNX model
 |red pajama|[Link](https://github.com/microsoft/Olive/tree/main/examples/red_pajama)|`CPU`: with Optimum conversion and merging and ONNX Runtime optimizations for a single optimized ONNX model
 |bert|[Link](https://github.com/microsoft/Olive/tree/main/examples/bert)|`CPU`: with ONNX Runtime optimizations and quantization for optimized INT8 ONNX model<br>`CPU`: with ONNX Runtime optimizations and Intel® Neural Compressor quantization for optimized INT8 ONNX model<br>`CPU`: with PyTorch QAT Customized Training Loop and ONNX Runtime optimizations for optimized ONNX INT8 model<br>`GPU`: with ONNX Runtime optimizations for CUDA EP<br>`GPU`: with ONNX Runtime optimizations for TRT EP
@@ -5,6 +5,7 @@
 import argparse
 import json
+import platform

 from onnxruntime import __version__ as OrtVersion
 from packaging import version
@@ -35,8 +36,21 @@ def get_args(raw_args):

 def main(raw_args=None):
-    if version.parse(OrtVersion) < version.parse("1.17.0"):
-        raise ValueError("Please use onnxruntime>=1.17.0 for phi2 optimization")
+    # Check if onnxruntime version is supported
+    # in linux, it requires the
+    # 1. model_type as `phi`
+    # 2. "optimization_options": {"attention_op_type": "MultiHeadAttention"}
+    # in windows, it requires the
+    # 1. model_type as `gpt2`
+    # 2. "optimization_options": {"attention_op_type": "MultiHeadAttention"}
+    # and `phi` and `MultiHeadAttention` requires ort-nightly version >= 1.18.0
+    if version.parse(OrtVersion) < version.parse("1.18.0"):
+        raise ValueError(
+            "Please use onnxruntime>=1.18.0 for phi2 optimization in Linux, you can refer to "
+            "https://onnxruntime.ai/docs/install/#inference-install-table-for-all-languages "
+            "for ort-nightly installation. If you are optimizing phi2 model in GPU, only cuda11 "
+            "is supported in onnxruntime>=1.18.0"
+        )

     args = get_args(raw_args)
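Before running the example, the installed ONNX Runtime can be verified with the same `packaging` pattern used above; a minimal sketch, not part of the example itself:

```python
from onnxruntime import __version__ as OrtVersion
from packaging import version

# True when the installed ORT build is new enough for phi2 optimization on Linux.
print(version.parse(OrtVersion) >= version.parse("1.18.0"))
```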
@@ -44,6 +58,14 @@ def main(raw_args=None):
     with open(json_file_template) as f:
         template_json = json.load(f)

+    if platform.system() == "Windows":
+        template_json["passes"]["convert"]["config"]["use_dynamo_exporter"] = False
+        template_json["passes"]["optimize_cpu"]["config"]["model_type"] = "gpt2"
+        template_json["passes"]["optimize_cuda"]["config"]["model_type"] = "gpt2"
+
+    with open("phi2_optimize.json", "w") as f:
+        json.dump(template_json, f, indent=4)
+
     # add pass flows
     model_type = str(args.model_type)
     template_json["pass_flows"] = SUPPORTED_WORKFLOWS[model_type]
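The keys patched above live under the `passes` section of the config template; a hypothetical, trimmed view of that section (only the fields touched by this code, or mentioned in the comments earlier, are shown; the real template contains more):

```python
# Hypothetical excerpt of the phi2 optimization template; real templates contain more fields.
template_json_excerpt = {
    "passes": {
        "convert": {"config": {"use_dynamo_exporter": True}},  # forced to False on Windows
        "optimize_cpu": {
            "config": {
                "model_type": "phi",  # switched to "gpt2" on Windows
                "optimization_options": {"attention_op_type": "MultiHeadAttention"},
            }
        },
        "optimize_cuda": {
            "config": {
                "model_type": "phi",  # switched to "gpt2" on Windows
                "optimization_options": {"attention_op_type": "MultiHeadAttention"},
            }
        },
    },
    "pass_flows": [],  # filled from SUPPORTED_WORKFLOWS[args.model_type]
}
```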
@@ -8,24 +8,27 @@ This folder contains an example of phi2 optimization with Olive workflow.
 * Pytorch>=2.2.0 \
 _The [official website](https://pytorch.org/) offers packages compatible with CUDA 11.8 and 12.1. Please select the appropriate version according to your needs._
 * [ONNXRuntime nightly package](https://onnxruntime.ai/docs/install/#inference-install-table-for-all-languages)
+In Linux, phi2 optimization requires the ONNXRuntime nightly package (>=1.18.0). In Windows, ONNXRuntime>=1.17.0 is recommended.

 ## Usage
 cpu_fp32
 ```bash
-python phi2.py --cpu_fp32
+python phi2.py --model_type cpu_fp32
 ```
 cpu_int4
 ```bash
-python phi2.py --cpu_int4
+python phi2.py --model_type cpu_int4
 ```
 cuda_fp16
 ```bash
-python phi2.py --cuda_fp16
+python phi2.py --model_type cuda_fp16
 ```
 cuda_int4
 ```bash
-python phi2.py --cuda_int4
+python phi2.py --model_type cuda_int4
 ```

 ## Limitations
-TorchDynamo-based ONNX Exporter only supports Linux.
+The latest ONNXRuntime implements specific fusion patterns for better performance, but they only apply to ONNX models produced by the TorchDynamo-based ONNX Exporter, which is only available on Linux.
+On Windows, this example falls back to the default PyTorch ONNX Exporter, which still yields some improvement, but not as much as the TorchDynamo-based ONNX Exporter.
+Therefore, it is recommended to use Linux for phi2 optimization.
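Since `main()` accepts `raw_args`, the README commands can also be driven programmatically; a minimal sketch, assuming it is run from `examples/phi2` so that `phi2.py` is importable:

```python
from phi2 import main

# Run the two CPU workflows back to back; use cuda_fp16/cuda_int4 on a CUDA machine.
for model_type in ("cpu_fp32", "cpu_int4"):
    main(raw_args=["--model_type", model_type])
```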
@@ -1,4 +1,5 @@
 einops
 onnx>=1.15.0
 onnxscript>=0.1.0.dev20240126
 torch>=2.2.0
 transformers>=4.36.2
@@ -1,6 +1,7 @@
 # -------------------------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License.
+import platform
 from itertools import chain
 from typing import TYPE_CHECKING, List, Tuple
@@ -215,8 +216,14 @@ def flatten_past_kv_inputs(past_key_values: List[Tuple[torch.Tensor, torch.Tenso
     past_kv = {}
     # Convert list of past_kv to dict of past_key and past_value
     for i, (past_k, past_v) in enumerate(past_key_values):
-        past_kv[f"past_key_{i}"] = past_k
-        past_kv[f"past_value_{i}"] = past_v
+        if platform.system() == "Windows":
+            # On Windows, the dynamo exporter is not supported yet, so the default exporter is used.
+            # The default exporter names the KV cache inputs past_key_values.{i}.key and past_key_values.{i}.value.
+            past_kv[f"past_key_values.{i}.key"] = past_k
+            past_kv[f"past_key_values.{i}.value"] = past_v
+        elif platform.system() == "Linux":
+            past_kv[f"past_key_{i}"] = past_k
+            past_kv[f"past_value_{i}"] = past_v
     return past_kv
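To make the naming difference concrete, a hedged usage sketch of the function above (tensor shapes are placeholders, and the real signature may take additional arguments that are truncated in the hunk header):

```python
import torch

# Two layers of dummy (key, value) pairs; shapes are arbitrary placeholders.
past_key_values = [(torch.zeros(1, 32, 8, 80), torch.zeros(1, 32, 8, 80)) for _ in range(2)]

flat = flatten_past_kv_inputs(past_key_values)
# Linux (dynamo exporter):   past_key_0, past_value_0, past_key_1, past_value_1
# Windows (legacy exporter): past_key_values.0.key, past_key_values.0.value, ...
print(sorted(flat))
```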
@@ -181,6 +181,11 @@ class OrtTransformersOptimization(Pass):
         attn_op_type = run_config["optimization_options"].get("attention_op_type")

         if attn_op_type:
+            from onnxruntime import __version__ as OrtVersion
+            from packaging import version
+
+            if version.parse(OrtVersion) < version.parse("1.18.0"):
+                raise ValueError("AttentionOpType is only supported in ORT 1.18.0 or later")
             from onnxruntime.transformers.fusion_options import AttentionOpType

             if attn_op_type == "Attention":
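For reference, the option being validated here is the string taken from the Olive pass config; a hedged sketch of how it might map onto the ORT enum (the enum member names are assumed from the strings used in this file and are not verified against a specific ORT release):

```python
from onnxruntime.transformers.fusion_options import AttentionOpType  # requires ORT >= 1.18.0

# Assumed mapping from the Olive config string to the ORT enum member.
ATTN_OP_TYPES = {
    "Attention": AttentionOpType.Attention,
    "MultiHeadAttention": AttentionOpType.MultiHeadAttention,
}
```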