Format all python files under onnxruntime with black and isort (#11324)
Description: Format all Python files under onnxruntime with black and isort. After this is checked in, we can use .git-blame-ignore-revs so that git blame skips the formatting PR. #11315, #11316
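For reference, a minimal sketch of how a repo-wide formatting pass like this is typically applied and then hidden from blame. The exact black/isort flags and configuration used for this PR are not shown on this page, so the commands below are illustrative assumptions, not the project's actual settings:

    # reformat the tree (illustrative; the repository's real black/isort options may differ)
    isort --profile black onnxruntime
    black onnxruntime

    # record the formatting commit so git blame can skip it
    # (the full 40-character hash of this commit goes here; only the abbreviated
    #  fdce4fa6af is shown above, so a placeholder is used)
    echo <full-formatting-commit-hash> >> .git-blame-ignore-revs
    git config blame.ignoreRevsFile .git-blame-ignore-revs

    # or per invocation, without changing config:
    git blame --ignore-revs-file .git-blame-ignore-revs path/to/file.py

Both blame.ignoreRevsFile and --ignore-revs-file are standard git features (git 2.23+); the file simply lists one full commit hash per line.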
This commit is contained in:
Parent
13f86e7d56
Commit
fdce4fa6af
.flake8
|
@ -22,3 +22,4 @@ exclude =
|
|||
./orttraining,
|
||||
# ignore server code for now
|
||||
./server,
|
||||
ignore = W503, E203
|
||||
|
|
|
@ -15,8 +15,10 @@ package_url = None
|
|||
|
||||
registrations = []
|
||||
|
||||
with open(os.path.join(REPO_DIR, 'tools', 'ci_build', 'github', 'linux', 'docker', 'Dockerfile.manylinux2014_cuda11'),
|
||||
mode="r") as f:
|
||||
with open(
|
||||
os.path.join(REPO_DIR, "tools", "ci_build", "github", "linux", "docker", "Dockerfile.manylinux2014_cuda11"),
|
||||
mode="r",
|
||||
) as f:
|
||||
for line in f:
|
||||
if not line.strip():
|
||||
package_name = None
|
||||
|
@ -36,15 +38,12 @@ with open(os.path.join(REPO_DIR, 'tools', 'ci_build', 'github', 'linux', 'docker
|
|||
m = re.match(r"(.+?)_DOWNLOAD_URL=(\S+)", line)
|
||||
if m is not None:
|
||||
package_url = m.group(2)
|
||||
if package_name == 'LIBXCRYPT':
|
||||
package_url = m.group(2) + "/v" + \
|
||||
package_filename + ".tar.gz"
|
||||
elif package_name == 'CMAKE':
|
||||
package_url = m.group(
|
||||
2) + "/v" + package_filename + "/cmake-" + package_filename + ".tar.gz"
|
||||
if package_name == "LIBXCRYPT":
|
||||
package_url = m.group(2) + "/v" + package_filename + ".tar.gz"
|
||||
elif package_name == "CMAKE":
|
||||
package_url = m.group(2) + "/v" + package_filename + "/cmake-" + package_filename + ".tar.gz"
|
||||
else:
|
||||
package_url = m.group(2) + "/" + \
|
||||
package_filename + ".tar.gz"
|
||||
package_url = m.group(2) + "/" + package_filename + ".tar.gz"
|
||||
registration = {
|
||||
"Component": {
|
||||
"Type": "other",
|
||||
|
@ -53,7 +52,7 @@ with open(os.path.join(REPO_DIR, 'tools', 'ci_build', 'github', 'linux', 'docker
|
|||
"Version": package_filename.split("-")[-1],
|
||||
"DownloadUrl": package_url,
|
||||
},
|
||||
"comments": "manylinux dependency"
|
||||
"comments": "manylinux dependency",
|
||||
}
|
||||
}
|
||||
registrations.append(registration)
|
||||
|
@ -67,14 +66,23 @@ def normalize_path_separators(path):
|
|||
|
||||
|
||||
proc = subprocess.run(
|
||||
["git", "submodule", "foreach", "--quiet", "--recursive", "{} {} $toplevel/$sm_path".format(
|
||||
normalize_path_separators(sys.executable),
|
||||
normalize_path_separators(os.path.join(SCRIPT_DIR, "print_submodule_info.py")))],
|
||||
[
|
||||
"git",
|
||||
"submodule",
|
||||
"foreach",
|
||||
"--quiet",
|
||||
"--recursive",
|
||||
"{} {} $toplevel/$sm_path".format(
|
||||
normalize_path_separators(sys.executable),
|
||||
normalize_path_separators(os.path.join(SCRIPT_DIR, "print_submodule_info.py")),
|
||||
),
|
||||
],
|
||||
check=True,
|
||||
cwd=REPO_DIR,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
universal_newlines=True)
|
||||
universal_newlines=True,
|
||||
)
|
||||
|
||||
|
||||
submodule_lines = proc.stdout.splitlines()
|
||||
|
@ -88,7 +96,8 @@ for submodule_line in submodule_lines:
|
|||
"repositoryUrl": url,
|
||||
},
|
||||
"comments": "git submodule at {}".format(
|
||||
normalize_path_separators(os.path.relpath(absolute_path, REPO_DIR)))
|
||||
normalize_path_separators(os.path.relpath(absolute_path, REPO_DIR))
|
||||
),
|
||||
}
|
||||
}
|
||||
registrations.append(registration)
|
||||
|
|
|
@ -10,19 +10,19 @@ assert len(sys.argv) == 2
|
|||
|
||||
path = sys.argv[1]
|
||||
|
||||
proc = subprocess.run(["git", "config", "--get", "remote.origin.url"],
|
||||
check=True,
|
||||
cwd=path,
|
||||
stdout=subprocess.PIPE,
|
||||
universal_newlines=True)
|
||||
proc = subprocess.run(
|
||||
["git", "config", "--get", "remote.origin.url"],
|
||||
check=True,
|
||||
cwd=path,
|
||||
stdout=subprocess.PIPE,
|
||||
universal_newlines=True,
|
||||
)
|
||||
|
||||
url = proc.stdout.strip()
|
||||
|
||||
proc = subprocess.run(["git", "rev-parse", "HEAD"],
|
||||
check=True,
|
||||
cwd=path,
|
||||
stdout=subprocess.PIPE,
|
||||
universal_newlines=True)
|
||||
proc = subprocess.run(
|
||||
["git", "rev-parse", "HEAD"], check=True, cwd=path, stdout=subprocess.PIPE, universal_newlines=True
|
||||
)
|
||||
|
||||
commit = proc.stdout.strip()
|
||||
|
||||
|
|
|
@ -2,28 +2,21 @@
|
|||
# Licensed under the MIT License.
|
||||
|
||||
import onnx
|
||||
from onnx import helper
|
||||
from onnx import TensorProto, helper
|
||||
from onnx.helper import make_opsetid
|
||||
from onnx import TensorProto
|
||||
|
||||
input_info = helper.make_tensor_value_info('input', TensorProto.BFLOAT16, [1, 5])
|
||||
output_info = helper.make_tensor_value_info('output', TensorProto.BFLOAT16, [1, 5])
|
||||
input_info = helper.make_tensor_value_info("input", TensorProto.BFLOAT16, [1, 5])
|
||||
output_info = helper.make_tensor_value_info("output", TensorProto.BFLOAT16, [1, 5])
|
||||
|
||||
# Create a node (NodeProto) - This is based on Pad-11
|
||||
node_def = helper.make_node(
|
||||
'Identity', # node name
|
||||
['input'], # inputs
|
||||
['output'] # outputs
|
||||
)
|
||||
node_def = helper.make_node("Identity", ["input"], ["output"]) # node name # inputs # outputs
|
||||
|
||||
graph_def = helper.make_graph(nodes=[node_def], name='test_types_BLOAT16',
|
||||
inputs=[input_info], outputs=[output_info])
|
||||
graph_def = helper.make_graph(nodes=[node_def], name="test_types_BLOAT16", inputs=[input_info], outputs=[output_info])
|
||||
|
||||
model_def = helper.make_model(graph_def, producer_name='AIInfra',
|
||||
opset_imports=[make_opsetid('', 13)])
|
||||
model_def = helper.make_model(graph_def, producer_name="AIInfra", opset_imports=[make_opsetid("", 13)])
|
||||
|
||||
onnx.checker.check_model(model_def)
|
||||
onnx.helper.strip_doc_string(model_def)
|
||||
final_model = onnx.shape_inference.infer_shapes(model_def)
|
||||
onnx.checker.check_model(final_model)
|
||||
onnx.save(final_model, 'test_types_BFLOAT16.onnx')
|
||||
onnx.save(final_model, "test_types_BFLOAT16.onnx")
|
||||
|
|
|
@ -2,31 +2,28 @@
|
|||
# Licensed under the MIT License.
|
||||
|
||||
import onnx
|
||||
from onnx import helper
|
||||
from onnx import TensorProto, helper
|
||||
from onnx.helper import make_opsetid
|
||||
from onnx import TensorProto
|
||||
|
||||
input_info = helper.make_tensor_value_info('input', TensorProto.FLOAT16, [1, 5])
|
||||
output_info = helper.make_tensor_value_info('output', TensorProto.FLOAT16, [1, 5])
|
||||
input_info = helper.make_tensor_value_info("input", TensorProto.FLOAT16, [1, 5])
|
||||
output_info = helper.make_tensor_value_info("output", TensorProto.FLOAT16, [1, 5])
|
||||
|
||||
# Create a node (NodeProto) - This is based on Pad-11
|
||||
node_def = helper.make_node(
|
||||
'Slice', # node name
|
||||
['input'], # inputs
|
||||
['output'], # outputs
|
||||
"Slice", # node name
|
||||
["input"], # inputs
|
||||
["output"], # outputs
|
||||
axes=[0, 1], # attributes
|
||||
ends=[1, 5],
|
||||
starts=[0, 0]
|
||||
starts=[0, 0],
|
||||
)
|
||||
|
||||
graph_def = helper.make_graph(nodes=[node_def], name='test_input_FLOAT16',
|
||||
inputs=[input_info], outputs=[output_info])
|
||||
graph_def = helper.make_graph(nodes=[node_def], name="test_input_FLOAT16", inputs=[input_info], outputs=[output_info])
|
||||
|
||||
model_def = helper.make_model(graph_def, producer_name='AIInfra',
|
||||
opset_imports=[make_opsetid('', 7)])
|
||||
model_def = helper.make_model(graph_def, producer_name="AIInfra", opset_imports=[make_opsetid("", 7)])
|
||||
|
||||
onnx.checker.check_model(model_def)
|
||||
onnx.helper.strip_doc_string(model_def)
|
||||
final_model = onnx.shape_inference.infer_shapes(model_def)
|
||||
onnx.checker.check_model(final_model)
|
||||
onnx.save(final_model, 'test_types_FLOAT16.onnx')
|
||||
onnx.save(final_model, "test_types_FLOAT16.onnx")
|
||||
|
|
|
@ -6,16 +6,18 @@
|
|||
# Configuration file for the Sphinx documentation builder.
|
||||
|
||||
import os
|
||||
import sys
|
||||
import shutil
|
||||
import sys
|
||||
|
||||
import onnxruntime
|
||||
|
||||
# import recommonmark
|
||||
|
||||
# -- Project information -----------------------------------------------------
|
||||
|
||||
project = 'ONNX Runtime'
|
||||
copyright = '2018-2021, Microsoft'
|
||||
author = 'Microsoft'
|
||||
project = "ONNX Runtime"
|
||||
copyright = "2018-2021, Microsoft"
|
||||
author = "Microsoft"
|
||||
version = onnxruntime.__version__
|
||||
release = version
|
||||
|
||||
|
@ -23,70 +25,72 @@ release = version
|
|||
|
||||
extensions = [
|
||||
"alabaster",
|
||||
'sphinx.ext.intersphinx',
|
||||
'sphinx.ext.imgmath',
|
||||
'sphinx.ext.ifconfig',
|
||||
'sphinx.ext.viewcode',
|
||||
"sphinx.ext.intersphinx",
|
||||
"sphinx.ext.imgmath",
|
||||
"sphinx.ext.ifconfig",
|
||||
"sphinx.ext.viewcode",
|
||||
"sphinx.ext.autodoc",
|
||||
'sphinx.ext.githubpages',
|
||||
"sphinx.ext.githubpages",
|
||||
"sphinx_gallery.gen_gallery",
|
||||
'sphinx.ext.graphviz',
|
||||
"sphinx.ext.graphviz",
|
||||
"pyquickhelper.sphinxext.sphinx_runpython_extension",
|
||||
]
|
||||
|
||||
templates_path = ['_templates']
|
||||
templates_path = ["_templates"]
|
||||
|
||||
source_parsers = {
|
||||
'.md': 'recommonmark.parser.CommonMarkParser',
|
||||
".md": "recommonmark.parser.CommonMarkParser",
|
||||
}
|
||||
|
||||
source_suffix = ['.rst'] # , '.md']
|
||||
source_suffix = [".rst"] # , '.md']
|
||||
|
||||
master_doc = 'index'
|
||||
master_doc = "index"
|
||||
language = "en"
|
||||
exclude_patterns = []
|
||||
pygments_style = 'default'
|
||||
autoclass_content = 'both'
|
||||
pygments_style = "default"
|
||||
autoclass_content = "both"
|
||||
|
||||
# -- Options for HTML output -------------------------------------------------
|
||||
|
||||
html_theme = "alabaster"
|
||||
html_logo = "ONNX_Runtime_icon.png"
|
||||
html_static_path = ['_static']
|
||||
html_static_path = ["_static"]
|
||||
graphviz_output_format = "svg"
|
||||
|
||||
# -- Options for intersphinx extension ---------------------------------------
|
||||
|
||||
# Example configuration for intersphinx: refer to the Python standard library.
|
||||
intersphinx_mapping = {'https://docs.python.org/': None}
|
||||
intersphinx_mapping = {"https://docs.python.org/": None}
|
||||
|
||||
# -- Options for Sphinx Gallery ----------------------------------------------
|
||||
|
||||
sphinx_gallery_conf = {
|
||||
'examples_dirs': 'examples',
|
||||
'gallery_dirs': 'auto_examples',
|
||||
"examples_dirs": "examples",
|
||||
"gallery_dirs": "auto_examples",
|
||||
}
|
||||
|
||||
# -- markdown options -----------------------------------------------------------
|
||||
|
||||
md_image_dest = "media"
|
||||
md_link_replace = {
|
||||
'#onnxruntimesessionoptionsenable-profiling)': '#class-onnxruntimesessionoptions)',
|
||||
"#onnxruntimesessionoptionsenable-profiling)": "#class-onnxruntimesessionoptions)",
|
||||
}
|
||||
|
||||
# -- Setup actions -----------------------------------------------------------
|
||||
|
||||
|
||||
def setup(app):
|
||||
# download examples for the documentation
|
||||
this = os.path.abspath(os.path.dirname(__file__))
|
||||
dest = os.path.join(this, "model.onnx")
|
||||
if not os.path.exists(dest):
|
||||
import urllib.request
|
||||
url = 'https://raw.githubusercontent.com/onnx/onnx/master/onnx/backend/test/data/node/test_sigmoid/model.onnx'
|
||||
|
||||
url = "https://raw.githubusercontent.com/onnx/onnx/master/onnx/backend/test/data/node/test_sigmoid/model.onnx"
|
||||
urllib.request.urlretrieve(url, dest)
|
||||
loc = os.path.split(dest)[-1]
|
||||
if not os.path.exists(loc):
|
||||
import shutil
|
||||
|
||||
shutil.copy(dest, loc)
|
||||
return app
|
||||
|
||||
|
|
|
@ -15,15 +15,16 @@ Let's use the API to compute the prediction
|
|||
of a simple logistic regression model.
|
||||
"""
|
||||
import numpy as np
|
||||
from onnxruntime import datasets
|
||||
from onnxruntime.capi.onnxruntime_pybind11_state import InvalidArgument
|
||||
import onnxruntime.backend as backend
|
||||
from onnx import load
|
||||
|
||||
import onnxruntime.backend as backend
|
||||
|
||||
########################################
|
||||
# The device depends on how the package was compiled,
|
||||
# GPU or CPU.
|
||||
from onnxruntime import get_device
|
||||
from onnxruntime import datasets, get_device
|
||||
from onnxruntime.capi.onnxruntime_pybind11_state import InvalidArgument
|
||||
|
||||
device = get_device()
|
||||
|
||||
name = datasets.get_example("logreg_iris.onnx")
|
||||
|
|
|
@ -15,9 +15,10 @@ It starts by loading the model trained in example
|
|||
trained on *Iris* datasets. The model takes
|
||||
a vector of dimension 2 and returns a class among three.
|
||||
"""
|
||||
import numpy
|
||||
|
||||
import onnxruntime as rt
|
||||
from onnxruntime.capi.onnxruntime_pybind11_state import InvalidArgument
|
||||
import numpy
|
||||
from onnxruntime.datasets import get_example
|
||||
|
||||
example2 = get_example("logreg_iris.onnx")
|
||||
|
@ -37,7 +38,7 @@ try:
|
|||
except Exception as e:
|
||||
print("Unexpected type")
|
||||
print("{0}: {1}".format(type(e), e))
|
||||
|
||||
|
||||
#########################
|
||||
# The model fails to return an output if the name
|
||||
# is misspelled.
|
||||
|
@ -76,12 +77,12 @@ except Exception as e:
|
|||
# dimension is a multiple of the expected input dimension.
|
||||
|
||||
for x in [
|
||||
numpy.array([1.0, 2.0, 3.0, 4.0], dtype=numpy.float32),
|
||||
numpy.array([[1.0, 2.0, 3.0, 4.0]], dtype=numpy.float32),
|
||||
numpy.array([[1.0, 2.0], [3.0, 4.0]], dtype=numpy.float32),
|
||||
numpy.array([1.0, 2.0, 3.0], dtype=numpy.float32),
|
||||
numpy.array([[1.0, 2.0, 3.0]], dtype=numpy.float32),
|
||||
]:
|
||||
numpy.array([1.0, 2.0, 3.0, 4.0], dtype=numpy.float32),
|
||||
numpy.array([[1.0, 2.0, 3.0, 4.0]], dtype=numpy.float32),
|
||||
numpy.array([[1.0, 2.0], [3.0, 4.0]], dtype=numpy.float32),
|
||||
numpy.array([1.0, 2.0, 3.0], dtype=numpy.float32),
|
||||
numpy.array([[1.0, 2.0, 3.0]], dtype=numpy.float32),
|
||||
]:
|
||||
try:
|
||||
r = sess.run([output_name], {input_name: x})
|
||||
print("Shape={0} and predicted labels={1}".format(x.shape, r))
|
||||
|
@ -89,12 +90,12 @@ for x in [
|
|||
print("ERROR with Shape={0} - {1}".format(x.shape, e))
|
||||
|
||||
for x in [
|
||||
numpy.array([1.0, 2.0, 3.0, 4.0], dtype=numpy.float32),
|
||||
numpy.array([[1.0, 2.0, 3.0, 4.0]], dtype=numpy.float32),
|
||||
numpy.array([[1.0, 2.0], [3.0, 4.0]], dtype=numpy.float32),
|
||||
numpy.array([1.0, 2.0, 3.0], dtype=numpy.float32),
|
||||
numpy.array([[1.0, 2.0, 3.0]], dtype=numpy.float32),
|
||||
]:
|
||||
numpy.array([1.0, 2.0, 3.0, 4.0], dtype=numpy.float32),
|
||||
numpy.array([[1.0, 2.0, 3.0, 4.0]], dtype=numpy.float32),
|
||||
numpy.array([[1.0, 2.0], [3.0, 4.0]], dtype=numpy.float32),
|
||||
numpy.array([1.0, 2.0, 3.0], dtype=numpy.float32),
|
||||
numpy.array([[1.0, 2.0, 3.0]], dtype=numpy.float32),
|
||||
]:
|
||||
try:
|
||||
r = sess.run(None, {input_name: x})
|
||||
print("Shape={0} and predicted probabilities={1}".format(x.shape, r[1]))
|
||||
|
@ -106,10 +107,10 @@ for x in [
|
|||
# is higher than expects but produces a warning.
|
||||
|
||||
for x in [
|
||||
numpy.array([[[1.0, 2.0], [3.0, 4.0]]], dtype=numpy.float32),
|
||||
numpy.array([[[1.0, 2.0, 3.0]]], dtype=numpy.float32),
|
||||
numpy.array([[[1.0, 2.0]], [[3.0, 4.0]]], dtype=numpy.float32),
|
||||
]:
|
||||
numpy.array([[[1.0, 2.0], [3.0, 4.0]]], dtype=numpy.float32),
|
||||
numpy.array([[[1.0, 2.0, 3.0]]], dtype=numpy.float32),
|
||||
numpy.array([[[1.0, 2.0]], [[3.0, 4.0]]], dtype=numpy.float32),
|
||||
]:
|
||||
try:
|
||||
r = sess.run([output_name], {input_name: x})
|
||||
print("Shape={0} and predicted labels={1}".format(x.shape, r))
|
||||
|
|
|
@ -21,24 +21,25 @@ The first step consists in retrieving the boston datset.
|
|||
"""
|
||||
import pandas
|
||||
from sklearn.datasets import load_boston
|
||||
|
||||
boston = load_boston()
|
||||
X, y = boston.data, boston.target
|
||||
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y)
|
||||
X_train_dict = pandas.DataFrame(X_train[:,1:]).T.to_dict().values()
|
||||
X_test_dict = pandas.DataFrame(X_test[:,1:]).T.to_dict().values()
|
||||
X_train_dict = pandas.DataFrame(X_train[:, 1:]).T.to_dict().values()
|
||||
X_test_dict = pandas.DataFrame(X_test[:, 1:]).T.to_dict().values()
|
||||
|
||||
####################################
|
||||
# We create a pipeline.
|
||||
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.ensemble import GradientBoostingRegressor
|
||||
from sklearn.feature_extraction import DictVectorizer
|
||||
pipe = make_pipeline(
|
||||
DictVectorizer(sparse=False),
|
||||
GradientBoostingRegressor())
|
||||
|
||||
from sklearn.pipeline import make_pipeline
|
||||
|
||||
pipe = make_pipeline(DictVectorizer(sparse=False), GradientBoostingRegressor())
|
||||
|
||||
pipe.fit(X_train_dict, y_train)
|
||||
|
||||
####################################
|
||||
|
@ -53,15 +54,15 @@ print(r2_score(y_test, pred))
|
|||
# Conversion to ONNX format
|
||||
# +++++++++++++++++++++++++
|
||||
#
|
||||
# We use module
|
||||
# We use module
|
||||
# `sklearn-onnx <https://github.com/onnx/sklearn-onnx>`_
|
||||
# to convert the model into ONNX format.
|
||||
|
||||
from skl2onnx import convert_sklearn
|
||||
from skl2onnx.common.data_types import FloatTensorType, Int64TensorType, DictionaryType, SequenceType
|
||||
from skl2onnx.common.data_types import DictionaryType, FloatTensorType, Int64TensorType, SequenceType
|
||||
|
||||
# initial_type = [('float_input', DictionaryType(Int64TensorType([1]), FloatTensorType([])))]
|
||||
initial_type = [('float_input', DictionaryType(Int64TensorType([1]), FloatTensorType([])))]
|
||||
initial_type = [("float_input", DictionaryType(Int64TensorType([1]), FloatTensorType([])))]
|
||||
onx = convert_sklearn(pipe, initial_types=initial_type)
|
||||
with open("pipeline_vectorize.onnx", "wb") as f:
|
||||
f.write(onx.SerializeToString())
|
||||
|
@ -75,6 +76,7 @@ from onnxruntime.capi.onnxruntime_pybind11_state import InvalidArgument
|
|||
sess = rt.InferenceSession("pipeline_vectorize.onnx", providers=rt.get_available_providers())
|
||||
|
||||
import numpy
|
||||
|
||||
inp, out = sess.get_inputs()[0], sess.get_outputs()[0]
|
||||
print("input name='{}' and shape={} and type={}".format(inp.name, inp.shape, inp.type))
|
||||
print("output name='{}' and shape={} and type={}".format(out.name, out.shape, out.type))
|
||||
|
@ -100,4 +102,3 @@ print(r2_score(pred, pred_onx))
|
|||
#########################
|
||||
# Very similar. *ONNX Runtime* uses floats instead of doubles,
|
||||
# that explains the small discrepencies.
|
||||
|
||||
|
|
|
@ -12,8 +12,9 @@ the output for an input vector. It also shows how to
|
|||
retrieve the definition of its inputs and outputs.
|
||||
"""
|
||||
|
||||
import onnxruntime as rt
|
||||
import numpy
|
||||
|
||||
import onnxruntime as rt
|
||||
from onnxruntime.datasets import get_example
|
||||
|
||||
#########################
|
||||
|
@ -37,7 +38,7 @@ print("input type", input_type)
|
|||
# Let's see the output name and shape.
|
||||
|
||||
output_name = sess.get_outputs()[0].name
|
||||
print("output name", output_name)
|
||||
print("output name", output_name)
|
||||
output_shape = sess.get_outputs()[0].shape
|
||||
print("output shape", output_shape)
|
||||
output_type = sess.get_outputs()[0].type
|
||||
|
@ -47,7 +48,8 @@ print("output type", output_type)
|
|||
# Let's compute its outputs (or predictions if it is a machine learned model).
|
||||
|
||||
import numpy.random
|
||||
x = numpy.random.random((3,4,5))
|
||||
|
||||
x = numpy.random.random((3, 4, 5))
|
||||
x = x.astype(numpy.float32)
|
||||
res = sess.run([output_name], {input_name: x})
|
||||
print(res)
|
||||
|
|
|
@ -15,9 +15,11 @@ logistic regression model trained with
|
|||
"""
|
||||
|
||||
from onnxruntime.datasets import get_example
|
||||
|
||||
example = get_example("logreg_iris.onnx")
|
||||
|
||||
import onnx
|
||||
|
||||
model = onnx.load(example)
|
||||
|
||||
print("doc_string={}".format(model.doc_string))
|
||||
|
@ -32,6 +34,7 @@ print("producer_version={}".format(model.producer_version))
|
|||
# With *ONNX Runtime*:
|
||||
|
||||
import onnxruntime as rt
|
||||
|
||||
sess = rt.InferenceSession(example, providers=rt.get_available_providers())
|
||||
meta = sess.get_modelmeta()
|
||||
|
||||
|
|
|
@ -21,12 +21,14 @@ That's the most simple way.
|
|||
"""
|
||||
|
||||
from onnxruntime.datasets import get_example
|
||||
|
||||
example1 = get_example("mul_1.onnx")
|
||||
|
||||
import onnx
|
||||
|
||||
model = onnx.load(example1) # model is a ModelProto protobuf message
|
||||
|
||||
print(model)
|
||||
print(model)
|
||||
|
||||
|
||||
#################################
|
||||
|
@ -39,31 +41,30 @@ print(model)
|
|||
|
||||
|
||||
from onnx import ModelProto
|
||||
|
||||
model = ModelProto()
|
||||
with open(example1, 'rb') as fid:
|
||||
with open(example1, "rb") as fid:
|
||||
content = fid.read()
|
||||
model.ParseFromString(content)
|
||||
|
||||
###################################
|
||||
# We convert it into a graph.
|
||||
from onnx.tools.net_drawer import GetPydotGraph, GetOpNodeProducer
|
||||
pydot_graph = GetPydotGraph(model.graph, name=model.graph.name, rankdir="LR",
|
||||
node_producer=GetOpNodeProducer("docstring"))
|
||||
from onnx.tools.net_drawer import GetOpNodeProducer, GetPydotGraph
|
||||
|
||||
pydot_graph = GetPydotGraph(
|
||||
model.graph, name=model.graph.name, rankdir="LR", node_producer=GetOpNodeProducer("docstring")
|
||||
)
|
||||
pydot_graph.write_dot("graph.dot")
|
||||
|
||||
#######################################
|
||||
# Then into an image
|
||||
import os
|
||||
os.system('dot -O -Tpng graph.dot')
|
||||
|
||||
os.system("dot -O -Tpng graph.dot")
|
||||
|
||||
################################
|
||||
# Which we display...
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
image = plt.imread("graph.dot.png")
|
||||
plt.imshow(image)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -11,9 +11,10 @@ Profile the execution of a simple model
|
|||
*ONNX Runtime* can profile the execution of the model.
|
||||
This example shows how to interpret the results.
|
||||
"""
|
||||
import onnx
|
||||
import onnxruntime as rt
|
||||
import numpy
|
||||
import onnx
|
||||
|
||||
import onnxruntime as rt
|
||||
from onnxruntime.datasets import get_example
|
||||
|
||||
|
||||
|
@ -27,8 +28,6 @@ def change_ir_version(filename, ir_version=6):
|
|||
return model
|
||||
|
||||
|
||||
|
||||
|
||||
#########################
|
||||
# Let's load a very simple model and compute some prediction.
|
||||
|
||||
|
@ -61,10 +60,9 @@ print(prof_file)
|
|||
# The results are stored un a file in JSON format.
|
||||
# Let's see what it contains.
|
||||
import json
|
||||
|
||||
with open(prof_file, "r") as f:
|
||||
sess_time = json.load(f)
|
||||
import pprint
|
||||
|
||||
pprint.pprint(sess_time)
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -22,16 +22,19 @@ The first step consists in retrieving the iris datset.
|
|||
"""
|
||||
|
||||
from sklearn.datasets import load_iris
|
||||
|
||||
iris = load_iris()
|
||||
X, y = iris.data, iris.target
|
||||
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y)
|
||||
|
||||
####################################
|
||||
# Then we fit a model.
|
||||
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
|
||||
clr = LogisticRegression()
|
||||
clr.fit(X_train, y_train)
|
||||
|
||||
|
@ -47,14 +50,14 @@ print(confusion_matrix(y_test, pred))
|
|||
# Conversion to ONNX format
|
||||
# +++++++++++++++++++++++++
|
||||
#
|
||||
# We use module
|
||||
# We use module
|
||||
# `sklearn-onnx <https://github.com/onnx/sklearn-onnx>`_
|
||||
# to convert the model into ONNX format.
|
||||
|
||||
from skl2onnx import convert_sklearn
|
||||
from skl2onnx.common.data_types import FloatTensorType
|
||||
|
||||
initial_type = [('float_input', FloatTensorType([None, 4]))]
|
||||
initial_type = [("float_input", FloatTensorType([None, 4]))]
|
||||
onx = convert_sklearn(clr, initial_types=initial_type)
|
||||
with open("logreg_iris.onnx", "wb") as f:
|
||||
f.write(onx.SerializeToString())
|
||||
|
@ -64,12 +67,11 @@ with open("logreg_iris.onnx", "wb") as f:
|
|||
# its input and output.
|
||||
|
||||
import onnxruntime as rt
|
||||
|
||||
sess = rt.InferenceSession("logreg_iris.onnx", providers=rt.get_available_providers())
|
||||
|
||||
print("input name='{}' and shape={}".format(
|
||||
sess.get_inputs()[0].name, sess.get_inputs()[0].shape))
|
||||
print("output name='{}' and shape={}".format(
|
||||
sess.get_outputs()[0].name, sess.get_outputs()[0].shape))
|
||||
print("input name='{}' and shape={}".format(sess.get_inputs()[0].name, sess.get_inputs()[0].shape))
|
||||
print("output name='{}' and shape={}".format(sess.get_outputs()[0].name, sess.get_outputs()[0].shape))
|
||||
|
||||
##################################
|
||||
# We compute the predictions.
|
||||
|
@ -78,6 +80,7 @@ input_name = sess.get_inputs()[0].name
|
|||
label_name = sess.get_outputs()[0].name
|
||||
|
||||
import numpy
|
||||
|
||||
pred_onx = sess.run([label_name], {input_name: X_test.astype(numpy.float32)})[0]
|
||||
print(confusion_matrix(pred, pred_onx))
|
||||
|
||||
|
@ -97,18 +100,20 @@ print(prob_sklearn[:3])
|
|||
|
||||
#############################
|
||||
# And then with ONNX Runtime.
|
||||
# The probabilies appear to be
|
||||
# The probabilies appear to be
|
||||
|
||||
prob_name = sess.get_outputs()[1].name
|
||||
prob_rt = sess.run([prob_name], {input_name: X_test.astype(numpy.float32)})[0]
|
||||
|
||||
import pprint
|
||||
|
||||
pprint.pprint(prob_rt[0:3])
|
||||
|
||||
###############################
|
||||
# Let's benchmark.
|
||||
from timeit import Timer
|
||||
|
||||
|
||||
def speed(inst, number=10, repeat=20):
|
||||
timer = Timer(inst, globals=globals())
|
||||
raw = numpy.array(timer.repeat(repeat, number=number))
|
||||
|
@ -117,6 +122,7 @@ def speed(inst, number=10, repeat=20):
|
|||
print("Average %1.3g min=%1.3g max=%1.3g" % (ave, mi, ma))
|
||||
return ave
|
||||
|
||||
|
||||
print("Execution time for clr.predict")
|
||||
speed("clr.predict(X_test)")
|
||||
|
||||
|
@ -128,20 +134,24 @@ speed("sess.run([label_name], {input_name: X_test.astype(numpy.float32)})[0]")
|
|||
# experiences: the model has to do one prediction at a time
|
||||
# as opposed to a batch of prediction.
|
||||
|
||||
|
||||
def loop(X_test, fct, n=None):
|
||||
nrow = X_test.shape[0]
|
||||
if n is None:
|
||||
n = nrow
|
||||
for i in range(0, n):
|
||||
im = i % nrow
|
||||
fct(X_test[im: im+1])
|
||||
fct(X_test[im : im + 1])
|
||||
|
||||
|
||||
print("Execution time for clr.predict")
|
||||
speed("loop(X_test, clr.predict, 100)")
|
||||
|
||||
|
||||
def sess_predict(x):
|
||||
return sess.run([label_name], {input_name: x.astype(numpy.float32)})[0]
|
||||
|
||||
|
||||
print("Execution time for sess_predict")
|
||||
speed("loop(X_test, sess_predict, 100)")
|
||||
|
||||
|
@ -151,14 +161,16 @@ speed("loop(X_test, sess_predict, 100)")
|
|||
print("Execution time for predict_proba")
|
||||
speed("loop(X_test, clr.predict_proba, 100)")
|
||||
|
||||
|
||||
def sess_predict_proba(x):
|
||||
return sess.run([prob_name], {input_name: x.astype(numpy.float32)})[0]
|
||||
|
||||
|
||||
print("Execution time for sess_predict_proba")
|
||||
speed("loop(X_test, sess_predict_proba, 100)")
|
||||
|
||||
#####################################
|
||||
# This second comparison is better as
|
||||
# This second comparison is better as
|
||||
# ONNX Runtime, in this experience,
|
||||
# computes the label and the probabilities
|
||||
# in every case.
|
||||
|
@ -169,10 +181,11 @@ speed("loop(X_test, sess_predict_proba, 100)")
|
|||
#
|
||||
# We first train and save a model in ONNX format.
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
|
||||
rf = RandomForestClassifier()
|
||||
rf.fit(X_train, y_train)
|
||||
|
||||
initial_type = [('float_input', FloatTensorType([1, 4]))]
|
||||
initial_type = [("float_input", FloatTensorType([1, 4]))]
|
||||
onx = convert_sklearn(rf, initial_types=initial_type)
|
||||
with open("rf_iris.onnx", "wb") as f:
|
||||
f.write(onx.SerializeToString())
|
||||
|
@ -182,9 +195,11 @@ with open("rf_iris.onnx", "wb") as f:
|
|||
|
||||
sess = rt.InferenceSession("rf_iris.onnx", providers=rt.get_available_providers())
|
||||
|
||||
|
||||
def sess_predict_proba_rf(x):
|
||||
return sess.run([prob_name], {input_name: x.astype(numpy.float32)})[0]
|
||||
|
||||
|
||||
print("Execution time for predict_proba")
|
||||
speed("loop(X_test, rf.predict_proba, 100)")
|
||||
|
||||
|
@ -196,26 +211,28 @@ speed("loop(X_test, sess_predict_proba_rf, 100)")
|
|||
|
||||
measures = []
|
||||
|
||||
for n_trees in range(5, 51, 5):
|
||||
for n_trees in range(5, 51, 5):
|
||||
print(n_trees)
|
||||
rf = RandomForestClassifier(n_estimators=n_trees)
|
||||
rf.fit(X_train, y_train)
|
||||
initial_type = [('float_input', FloatTensorType([1, 4]))]
|
||||
initial_type = [("float_input", FloatTensorType([1, 4]))]
|
||||
onx = convert_sklearn(rf, initial_types=initial_type)
|
||||
with open("rf_iris_%d.onnx" % n_trees, "wb") as f:
|
||||
f.write(onx.SerializeToString())
|
||||
sess = rt.InferenceSession("rf_iris_%d.onnx" % n_trees, providers=rt.get_available_providers())
|
||||
|
||||
def sess_predict_proba_loop(x):
|
||||
return sess.run([prob_name], {input_name: x.astype(numpy.float32)})[0]
|
||||
|
||||
tsk = speed("loop(X_test, rf.predict_proba, 100)", number=5, repeat=5)
|
||||
trt = speed("loop(X_test, sess_predict_proba_loop, 100)", number=5, repeat=5)
|
||||
measures.append({'n_trees': n_trees, 'sklearn': tsk, 'rt': trt})
|
||||
measures.append({"n_trees": n_trees, "sklearn": tsk, "rt": trt})
|
||||
|
||||
from pandas import DataFrame
|
||||
|
||||
df = DataFrame(measures)
|
||||
ax = df.plot(x="n_trees", y="sklearn", label="scikit-learn", c="blue", logy=True)
|
||||
df.plot(x="n_trees", y="rt", label="onnxruntime",
|
||||
ax=ax, c="green", logy=True)
|
||||
df.plot(x="n_trees", y="rt", label="onnxruntime", ax=ax, c="green", logy=True)
|
||||
ax.set_xlabel("Number of trees")
|
||||
ax.set_ylabel("Prediction time (s)")
|
||||
ax.set_title("Speed comparison between scikit-learn and ONNX Runtime\nFor a random forest on Iris dataset")
|
||||
|
|
|
@ -7,30 +7,28 @@
|
|||
|
||||
# -- Project information -----------------------------------------------------
|
||||
|
||||
project = 'ORTModule'
|
||||
copyright = '2018-2021, Microsoft'
|
||||
author = 'Microsoft'
|
||||
version = '0.1' # TODO: Should use `onnxruntime.__version__` instead?
|
||||
project = "ORTModule"
|
||||
copyright = "2018-2021, Microsoft"
|
||||
author = "Microsoft"
|
||||
version = "0.1" # TODO: Should use `onnxruntime.__version__` instead?
|
||||
release = version
|
||||
|
||||
# -- General configuration ---------------------------------------------------
|
||||
|
||||
extensions = ['sphinx.ext.autodoc',
|
||||
'sphinx.ext.intersphinx'
|
||||
]
|
||||
templates_path = ['_templates']
|
||||
extensions = ["sphinx.ext.autodoc", "sphinx.ext.intersphinx"]
|
||||
templates_path = ["_templates"]
|
||||
exclude_patterns = []
|
||||
autoclass_content = 'both'
|
||||
autoclass_content = "both"
|
||||
|
||||
# -- Options for HTML output -------------------------------------------------
|
||||
|
||||
html_theme = 'sphinx_rtd_theme'
|
||||
html_static_path = ['_static']
|
||||
html_theme = "sphinx_rtd_theme"
|
||||
html_static_path = ["_static"]
|
||||
|
||||
# -- Options for intersphinx extension ---------------------------------------
|
||||
|
||||
intersphinx_mapping = {
|
||||
'python': ('https://docs.python.org/3', None),
|
||||
'numpy': ('https://numpy.org/doc/stable', None),
|
||||
'torch': ('https://pytorch.org/docs/stable/', None),
|
||||
"python": ("https://docs.python.org/3", None),
|
||||
"numpy": ("https://numpy.org/doc/stable", None),
|
||||
"torch": ("https://pytorch.org/docs/stable/", None),
|
||||
}
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
import onnx
|
||||
from onnx import helper
|
||||
from onnx import TensorProto
|
||||
from onnx import TensorProto, helper
|
||||
|
||||
graph = helper.make_graph(
|
||||
[ # nodes
|
||||
|
@ -8,12 +7,13 @@ graph = helper.make_graph(
|
|||
],
|
||||
"SingleAdd", # name
|
||||
[ # inputs
|
||||
helper.make_tensor_value_info('A', TensorProto.FLOAT, [1]),
|
||||
helper.make_tensor_value_info('B', TensorProto.FLOAT, [1]),
|
||||
helper.make_tensor_value_info("A", TensorProto.FLOAT, [1]),
|
||||
helper.make_tensor_value_info("B", TensorProto.FLOAT, [1]),
|
||||
],
|
||||
[ # outputs
|
||||
helper.make_tensor_value_info('C', TensorProto.FLOAT, [1]),
|
||||
])
|
||||
helper.make_tensor_value_info("C", TensorProto.FLOAT, [1]),
|
||||
],
|
||||
)
|
||||
|
||||
model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 12)])
|
||||
onnx.save(model, r'single_add.onnx')
|
||||
onnx.save(model, r"single_add.onnx")
|
||||
|
|
|
@ -20,11 +20,31 @@ __author__ = "Microsoft"
|
|||
# meaningful messages to the user.
|
||||
# the saved exception is raised after device version validation.
|
||||
try:
|
||||
from onnxruntime.capi._pybind_state import get_all_providers, get_available_providers, get_device, set_seed, \
|
||||
RunOptions, SessionOptions, set_default_logger_severity, enable_telemetry_events, disable_telemetry_events, \
|
||||
NodeArg, ModelMetadata, GraphOptimizationLevel, ExecutionMode, ExecutionOrder, SessionIOBinding, \
|
||||
OrtAllocatorType, OrtMemType, OrtArenaCfg, OrtMemoryInfo, create_and_register_allocator, OrtSparseFormat, \
|
||||
set_default_logger_verbosity
|
||||
from onnxruntime.capi._pybind_state import (
|
||||
ExecutionMode,
|
||||
ExecutionOrder,
|
||||
GraphOptimizationLevel,
|
||||
ModelMetadata,
|
||||
NodeArg,
|
||||
OrtAllocatorType,
|
||||
OrtArenaCfg,
|
||||
OrtMemoryInfo,
|
||||
OrtMemType,
|
||||
OrtSparseFormat,
|
||||
RunOptions,
|
||||
SessionIOBinding,
|
||||
SessionOptions,
|
||||
create_and_register_allocator,
|
||||
disable_telemetry_events,
|
||||
enable_telemetry_events,
|
||||
get_all_providers,
|
||||
get_available_providers,
|
||||
get_device,
|
||||
set_default_logger_severity,
|
||||
set_default_logger_verbosity,
|
||||
set_seed,
|
||||
)
|
||||
|
||||
import_capi_exception = None
|
||||
except Exception as e:
|
||||
import_capi_exception = e
|
||||
|
@ -34,9 +54,13 @@ from onnxruntime.capi import onnxruntime_validation
|
|||
if import_capi_exception:
|
||||
raise import_capi_exception
|
||||
|
||||
from onnxruntime.capi.onnxruntime_inference_collection import InferenceSession, IOBinding, OrtValue, SparseTensor, \
|
||||
OrtDevice
|
||||
|
||||
from onnxruntime.capi.onnxruntime_inference_collection import (
|
||||
InferenceSession,
|
||||
IOBinding,
|
||||
OrtDevice,
|
||||
OrtValue,
|
||||
SparseTensor,
|
||||
)
|
||||
from onnxruntime.capi.training import * # noqa: F403
|
||||
|
||||
# TODO: thiagofc: Temporary experimental namespace for new PyTorch front-end
|
||||
|
@ -45,7 +69,8 @@ try:
|
|||
except ImportError:
|
||||
pass
|
||||
|
||||
from onnxruntime.capi.onnxruntime_validation import package_name, version, cuda_version
|
||||
from onnxruntime.capi.onnxruntime_validation import cuda_version, package_name, version
|
||||
|
||||
if version:
|
||||
__version__ = version
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#-------------------------------------------------------------------------
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
#--------------------------------------------------------------------------
|
||||
# --------------------------------------------------------------------------
|
||||
|
|
|
@ -8,14 +8,16 @@ import os
|
|||
import subprocess
|
||||
import sys
|
||||
|
||||
|
||||
def is_windows():
|
||||
return sys.platform.startswith("win")
|
||||
|
||||
|
||||
def gen_md5(filename):
|
||||
if not os.path.exists(filename):
|
||||
return False
|
||||
hash_md5 = hashlib.md5()
|
||||
BLOCKSIZE = 1024*64
|
||||
BLOCKSIZE = 1024 * 64
|
||||
with open(filename, "rb") as f:
|
||||
buf = f.read(BLOCKSIZE)
|
||||
while len(buf) > 0:
|
||||
|
@ -23,54 +25,61 @@ def gen_md5(filename):
|
|||
buf = f.read(BLOCKSIZE)
|
||||
return hash_md5.hexdigest()
|
||||
|
||||
|
||||
def gen_checksum(file_checksum, input_dir):
|
||||
if not file_checksum:
|
||||
return
|
||||
|
||||
name = 'ORTInternal_checksum'
|
||||
with open(os.path.join(input_dir, name + '.cc'), 'w') as checksum_cc:
|
||||
print('#include <stdlib.h>', file=checksum_cc)
|
||||
name = "ORTInternal_checksum"
|
||||
with open(os.path.join(input_dir, name + ".cc"), "w") as checksum_cc:
|
||||
print("#include <stdlib.h>", file=checksum_cc)
|
||||
print('static const char model_checksum[] = "' + file_checksum + '";', file=checksum_cc)
|
||||
print('extern "C"', file=checksum_cc)
|
||||
if is_windows():
|
||||
print('__declspec(dllexport)', file=checksum_cc)
|
||||
print('void _ORTInternal_GetCheckSum(const char*& cs, size_t& len) {', file=checksum_cc)
|
||||
print(' cs = model_checksum; len = sizeof(model_checksum)/sizeof(model_checksum[0]) - 1;', file=checksum_cc)
|
||||
print('}', file=checksum_cc)
|
||||
print("__declspec(dllexport)", file=checksum_cc)
|
||||
print("void _ORTInternal_GetCheckSum(const char*& cs, size_t& len) {", file=checksum_cc)
|
||||
print(" cs = model_checksum; len = sizeof(model_checksum)/sizeof(model_checksum[0]) - 1;", file=checksum_cc)
|
||||
print("}", file=checksum_cc)
|
||||
|
||||
|
||||
def gen_cache_version(input_dir):
|
||||
name = 'ORTInternal_cache_version'
|
||||
with open(os.path.join(input_dir, name + '.cc'), 'w') as cache_version_cc:
|
||||
header_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'NUPHAR_CACHE_VERSION')
|
||||
name = "ORTInternal_cache_version"
|
||||
with open(os.path.join(input_dir, name + ".cc"), "w") as cache_version_cc:
|
||||
header_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), "NUPHAR_CACHE_VERSION")
|
||||
print('#include "{}"'.format(header_file), file=cache_version_cc)
|
||||
print('extern "C"', file=cache_version_cc)
|
||||
if is_windows():
|
||||
print('__declspec(dllexport)', file=cache_version_cc)
|
||||
print('const char* _ORTInternal_GetCacheVersion() {', file=cache_version_cc)
|
||||
print(' return __NUPHAR_CACHE_VERSION__;', file=cache_version_cc)
|
||||
print('}', file=cache_version_cc)
|
||||
print("__declspec(dllexport)", file=cache_version_cc)
|
||||
print("const char* _ORTInternal_GetCacheVersion() {", file=cache_version_cc)
|
||||
print(" return __NUPHAR_CACHE_VERSION__;", file=cache_version_cc)
|
||||
print("}", file=cache_version_cc)
|
||||
|
||||
|
||||
def compile_all_cc(path):
|
||||
for f in os.listdir(path):
|
||||
name, ext = os.path.splitext(f)
|
||||
if ext != '.cc':
|
||||
if ext != ".cc":
|
||||
continue
|
||||
if is_windows():
|
||||
subprocess.run(['cl', '/Fo' + name + '.o', '/c', f], cwd=path, check=True)
|
||||
subprocess.run(["cl", "/Fo" + name + ".o", "/c", f], cwd=path, check=True)
|
||||
else:
|
||||
subprocess.run(['g++', '-std=c++14', '-fPIC', '-o', name + '.o', '-c', f], cwd=path, check=True)
|
||||
subprocess.run(["g++", "-std=c++14", "-fPIC", "-o", name + ".o", "-c", f], cwd=path, check=True)
|
||||
os.remove(os.path.join(path, f))
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser(description="Offline shared lib creation tool.")
|
||||
# Main arguments
|
||||
parser.add_argument('--keep_input', action='store_true', help="Keep input files after created so.")
|
||||
parser.add_argument('--input_dir', help="The input directory that contains obj files.", required=True)
|
||||
parser.add_argument('--output_name', help="The output so file name.", default='jit.so')
|
||||
parser.add_argument('--input_model', help="The input model file name to generate checksum into shared lib.", default=None)
|
||||
parser.add_argument("--keep_input", action="store_true", help="Keep input files after created so.")
|
||||
parser.add_argument("--input_dir", help="The input directory that contains obj files.", required=True)
|
||||
parser.add_argument("--output_name", help="The output so file name.", default="jit.so")
|
||||
parser.add_argument(
|
||||
"--input_model", help="The input model file name to generate checksum into shared lib.", default=None
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_arguments()
|
||||
|
||||
if args.input_model:
|
||||
|
@ -81,8 +90,8 @@ if __name__ == '__main__':
|
|||
|
||||
if is_windows():
|
||||
# create dllmain
|
||||
name = 'ORTInternal_dllmain'
|
||||
with open(os.path.join(args.input_dir, name + '.cc'), 'w') as dllmain_cc:
|
||||
name = "ORTInternal_dllmain"
|
||||
with open(os.path.join(args.input_dir, name + ".cc"), "w") as dllmain_cc:
|
||||
print("#include <windows.h>", file=dllmain_cc)
|
||||
print("BOOL APIENTRY DllMain(HMODULE hModule,", file=dllmain_cc)
|
||||
print(" DWORD ul_reason_for_call,", file=dllmain_cc)
|
||||
|
@ -90,12 +99,20 @@ if __name__ == '__main__':
|
|||
print(" {return TRUE;}", file=dllmain_cc)
|
||||
|
||||
compile_all_cc(args.input_dir)
|
||||
objs = [f for f in os.listdir(args.input_dir) if os.path.isfile(os.path.join(args.input_dir, f)) and '.o' == os.path.splitext(f)[1]]
|
||||
objs = [
|
||||
f
|
||||
for f in os.listdir(args.input_dir)
|
||||
if os.path.isfile(os.path.join(args.input_dir, f)) and ".o" == os.path.splitext(f)[1]
|
||||
]
|
||||
|
||||
if is_windows():
|
||||
subprocess.run(['link', '-dll', '-FORCE:MULTIPLE', '-EXPORT:__tvm_main__', '-out:' + args.output_name, '*.o'], cwd=args.input_dir, check=True)
|
||||
subprocess.run(
|
||||
["link", "-dll", "-FORCE:MULTIPLE", "-EXPORT:__tvm_main__", "-out:" + args.output_name, "*.o"],
|
||||
cwd=args.input_dir,
|
||||
check=True,
|
||||
)
|
||||
else:
|
||||
subprocess.run(['g++', '-shared', '-fPIC', '-o', args.output_name] + objs, cwd=args.input_dir, check=True)
|
||||
subprocess.run(["g++", "-shared", "-fPIC", "-o", args.output_name] + objs, cwd=args.input_dir, check=True)
|
||||
|
||||
if not args.keep_input:
|
||||
for f in objs:
|
||||
|
|
The diff for this file is not shown because it is too large.
|
@ -3,13 +3,16 @@
|
|||
|
||||
# -*- coding: UTF-8 -*-
|
||||
import argparse
|
||||
from enum import Enum
|
||||
import json
|
||||
from enum import Enum
|
||||
|
||||
import numpy as np
|
||||
import onnx
|
||||
from onnx import helper, numpy_helper
|
||||
|
||||
from .node_factory import NodeFactory, ensure_opset
|
||||
|
||||
|
||||
class QuantizeConfig:
|
||||
def __init__(self, signed, reserved_bits, type_bits):
|
||||
self.sign_bit_ = 1 if signed else 0
|
||||
|
@ -18,9 +21,9 @@ class QuantizeConfig:
|
|||
|
||||
@staticmethod
|
||||
def from_dict(qcfg_dict):
|
||||
return QuantizeConfig(1 if qcfg_dict['QuantizationType'] == 'Signed' else 0,
|
||||
qcfg_dict['ReservedBit'],
|
||||
qcfg_dict['QuantizeBit'])
|
||||
return QuantizeConfig(
|
||||
1 if qcfg_dict["QuantizationType"] == "Signed" else 0, qcfg_dict["ReservedBit"], qcfg_dict["QuantizeBit"]
|
||||
)
|
||||
|
||||
def signed(self):
|
||||
return self.sign_bit_ == 1
|
||||
|
@ -47,10 +50,15 @@ class QuantizeConfig:
|
|||
def q_type_bits(self):
|
||||
return self.type_bits_
|
||||
|
||||
def __iter__(self): # need this to make dict for json
|
||||
return iter([('QuantizeBit', self.type_bits_),
|
||||
('QuantizationType', 'Signed' if self.sign_bit_ else 'Unsigned'),
|
||||
('ReservedBit', self.reserved_bits_)])
|
||||
def __iter__(self): # need this to make dict for json
|
||||
return iter(
|
||||
[
|
||||
("QuantizeBit", self.type_bits_),
|
||||
("QuantizationType", "Signed" if self.sign_bit_ else "Unsigned"),
|
||||
("ReservedBit", self.reserved_bits_),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def parse_custom_attributes(in_node):
|
||||
if in_node.doc_string:
|
||||
|
@ -67,40 +75,56 @@ def parse_custom_attributes(in_node):
|
|||
# "ReservedBitOfMatrix":0}}
|
||||
qcfg_str = in_node.doc_string
|
||||
# make sure it's the string we can parse
|
||||
if 'custom_attributes' in qcfg_str:
|
||||
if "custom_attributes" in qcfg_str:
|
||||
# some fixes to make it a valid JSON string, when model keys are not string
|
||||
if qcfg_str[1] == 'c':
|
||||
qcfg_str = qcfg_str.replace('{', '{"')
|
||||
qcfg_str = qcfg_str.replace(',', ',"')
|
||||
qcfg_str = qcfg_str.replace(':', '":')
|
||||
qcfg_str = qcfg_str.replace('{"}', '{}')
|
||||
qcfg = json.loads(qcfg_str)['custom_attributes']
|
||||
if qcfg_str[1] == "c":
|
||||
qcfg_str = qcfg_str.replace("{", '{"')
|
||||
qcfg_str = qcfg_str.replace(",", ',"')
|
||||
qcfg_str = qcfg_str.replace(":", '":')
|
||||
qcfg_str = qcfg_str.replace('{"}', "{}")
|
||||
qcfg = json.loads(qcfg_str)["custom_attributes"]
|
||||
if qcfg:
|
||||
return qcfg
|
||||
return None
|
||||
|
||||
|
||||
def parse_node_description(in_node):
|
||||
if not in_node.doc_string:
|
||||
return None
|
||||
custom_qcfg = parse_custom_attributes(in_node)
|
||||
if custom_qcfg:
|
||||
assert custom_qcfg['IntermediateBit'] == 32
|
||||
assert custom_qcfg['PerRowQuantization']
|
||||
assert custom_qcfg['QuantizeBitOfVector'] == custom_qcfg['QuantizeBitOfMatrix']
|
||||
qbits = custom_qcfg['QuantizeBitOfVector']
|
||||
assert ("Asymmetric" in custom_qcfg['VectorQuantizationType']) == ("Asymmetric" in custom_qcfg['MatrixQuantizationType'])
|
||||
symmetric = 0 if "Asymmetric" in custom_qcfg['VectorQuantizationType'] else 1
|
||||
x_signed = 0 if "Unsigned" in custom_qcfg['VectorQuantizationType'] else 1
|
||||
w_signed = 0 if "Unsigned" in custom_qcfg['MatrixQuantizationType'] else 1
|
||||
x_reserved_bits = custom_qcfg['ReservedBitOfVector']
|
||||
w_reserved_bits = custom_qcfg['ReservedBitOfMatrix']
|
||||
return {'W' : dict(QuantizeConfig(signed=w_signed, reserved_bits=w_reserved_bits, type_bits=qbits)),
|
||||
'X' : dict(QuantizeConfig(signed=x_signed, reserved_bits=x_reserved_bits, type_bits=qbits)),
|
||||
'Symmetric' : symmetric}
|
||||
assert custom_qcfg["IntermediateBit"] == 32
|
||||
assert custom_qcfg["PerRowQuantization"]
|
||||
assert custom_qcfg["QuantizeBitOfVector"] == custom_qcfg["QuantizeBitOfMatrix"]
|
||||
qbits = custom_qcfg["QuantizeBitOfVector"]
|
||||
assert ("Asymmetric" in custom_qcfg["VectorQuantizationType"]) == (
|
||||
"Asymmetric" in custom_qcfg["MatrixQuantizationType"]
|
||||
)
|
||||
symmetric = 0 if "Asymmetric" in custom_qcfg["VectorQuantizationType"] else 1
|
||||
x_signed = 0 if "Unsigned" in custom_qcfg["VectorQuantizationType"] else 1
|
||||
w_signed = 0 if "Unsigned" in custom_qcfg["MatrixQuantizationType"] else 1
|
||||
x_reserved_bits = custom_qcfg["ReservedBitOfVector"]
|
||||
w_reserved_bits = custom_qcfg["ReservedBitOfMatrix"]
|
||||
return {
|
||||
"W": dict(QuantizeConfig(signed=w_signed, reserved_bits=w_reserved_bits, type_bits=qbits)),
|
||||
"X": dict(QuantizeConfig(signed=x_signed, reserved_bits=x_reserved_bits, type_bits=qbits)),
|
||||
"Symmetric": symmetric,
|
||||
}
|
||||
return None
|
||||
|
||||
def quantize_matmul_2d_with_weight(in_node, in_graph, nf, converted_weights, quantized_inputs, qcfg_dict, update_qcfg_dict, default_qcfg, onnx_opset_ver):
|
||||
assert in_node.op_type == 'MatMul'
|
||||
|
||||
def quantize_matmul_2d_with_weight(
|
||||
in_node,
|
||||
in_graph,
|
||||
nf,
|
||||
converted_weights,
|
||||
quantized_inputs,
|
||||
qcfg_dict,
|
||||
update_qcfg_dict,
|
||||
default_qcfg,
|
||||
onnx_opset_ver,
|
||||
):
|
||||
assert in_node.op_type == "MatMul"
|
||||
|
||||
# quantize weight
|
||||
# only handles weight being inputs[1] of MatMul/Gemm node
|
||||
|
@ -108,7 +132,7 @@ def quantize_matmul_2d_with_weight(in_node, in_graph, nf, converted_weights, qua
|
|||
|
||||
# skip if weights shared by other nodes that's not MatMul
|
||||
# TODO: support GEMM op if needed
|
||||
other_nodes = [n for n in in_graph.node if n != in_node and fparam_name in n.input and n.op_type != 'MatMul']
|
||||
other_nodes = [n for n in in_graph.node if n != in_node and fparam_name in n.input and n.op_type != "MatMul"]
|
||||
if other_nodes:
|
||||
return False
|
||||
|
||||
|
@ -119,12 +143,16 @@ def quantize_matmul_2d_with_weight(in_node, in_graph, nf, converted_weights, qua
|
|||
if not node_qcfg:
|
||||
if not update_qcfg_dict and qcfg_dict:
|
||||
# when qcfg_dict is readonly, raise warning if qcfg is not found for this node
|
||||
print("Warning: qcfg is not found for node with output: " + in_node.output[0] + ", fall back to default qcfg.")
|
||||
print(
|
||||
"Warning: qcfg is not found for node with output: "
|
||||
+ in_node.output[0]
|
||||
+ ", fall back to default qcfg."
|
||||
)
|
||||
node_qcfg = default_qcfg
|
||||
|
||||
w_qcfg = QuantizeConfig.from_dict(node_qcfg['W'])
|
||||
x_qcfg = QuantizeConfig.from_dict(node_qcfg['X'])
|
||||
symmetric = node_qcfg['Symmetric']
|
||||
w_qcfg = QuantizeConfig.from_dict(node_qcfg["W"])
|
||||
x_qcfg = QuantizeConfig.from_dict(node_qcfg["X"])
|
||||
symmetric = node_qcfg["Symmetric"]
|
||||
|
||||
# for symmetric quantization, both weight and input should be quantized to signed
|
||||
assert not symmetric or (w_qcfg.signed() and x_qcfg.signed())
|
||||
|
@ -149,32 +177,34 @@ def quantize_matmul_2d_with_weight(in_node, in_graph, nf, converted_weights, qua
|
|||
else:
|
||||
fmin = np.amin(fparam, axis=0)
|
||||
fmax = np.amax(fparam, axis=0)
|
||||
fscale = (fmax - fmin)/(2 if w_qcfg.signed() else 1) # signed would be normalized to [-1, 1], and unsigned to [0, 1]
|
||||
fscale = (fmax - fmin) / (
|
||||
2 if w_qcfg.signed() else 1
|
||||
) # signed would be normalized to [-1, 1], and unsigned to [0, 1]
|
||||
step = fscale / q_range
|
||||
base = (fmax + fmin + step) * 0.5 if w_qcfg.signed() else fmin
|
||||
|
||||
fparam_norm = np.zeros_like(fparam)
|
||||
expand_fscale = np.expand_dims(fscale,0)
|
||||
np.divide((fparam - np.expand_dims(base,0)), expand_fscale, out=fparam_norm, where=expand_fscale!=0)
|
||||
expand_fscale = np.expand_dims(fscale, 0)
|
||||
np.divide((fparam - np.expand_dims(base, 0)), expand_fscale, out=fparam_norm, where=expand_fscale != 0)
|
||||
qparam = np.round(fparam_norm * q_range)
|
||||
qparam = np.clip(qparam, w_qcfg.q_min(), w_qcfg.q_max())
|
||||
qparam_rowsum = np.sum(qparam, axis=0)
|
||||
qparam = qparam.astype(w_qcfg.q_type())
|
||||
|
||||
# create new weights in main graph in case other Scans share via converted_weights
|
||||
nf.make_initializer(step, fparam_name + '_step', in_main_graph=True)
|
||||
nf.make_initializer(qparam, fparam_name + '_qparam', in_main_graph=True)
|
||||
step = fparam_name + '_step'
|
||||
qparam = fparam_name + '_qparam'
|
||||
nf.make_initializer(step, fparam_name + "_step", in_main_graph=True)
|
||||
nf.make_initializer(qparam, fparam_name + "_qparam", in_main_graph=True)
|
||||
step = fparam_name + "_step"
|
||||
qparam = fparam_name + "_qparam"
|
||||
if symmetric:
|
||||
# no need to compute qparam_rowsum and base for symmetric quantization
|
||||
base = None
|
||||
qparam_rowsum = None
|
||||
else:
|
||||
nf.make_initializer(base, fparam_name + '_base', in_main_graph=True)
|
||||
base = fparam_name + '_base'
|
||||
nf.make_initializer(qparam_rowsum, fparam_name + '_qparam_rowsum', in_main_graph=True)
|
||||
qparam_rowsum = fparam_name + '_qparam_rowsum'
|
||||
nf.make_initializer(base, fparam_name + "_base", in_main_graph=True)
|
||||
base = fparam_name + "_base"
|
||||
nf.make_initializer(qparam_rowsum, fparam_name + "_qparam_rowsum", in_main_graph=True)
|
||||
qparam_rowsum = fparam_name + "_qparam_rowsum"
|
||||
converted_weights[fparam_name] = (step, base, qparam_rowsum, qparam, w_qcfg, symmetric)
|
||||
nf.remove_initializer(fparam_name)
|
||||
|
||||
|
@ -183,136 +213,216 @@ def quantize_matmul_2d_with_weight(in_node, in_graph, nf, converted_weights, qua
|
|||
input_dim = nf.get_initializer(qparam).shape[0]
|
||||
X = in_node.input[0]
|
||||
if quantized_inputs is not None:
|
||||
quantized_inputs_key = '{}_{}_{}'.format(X, symmetric, '|'.join(['{}:{}'.format(k,v) for (k, v) in x_qcfg]))
|
||||
quantized_inputs_key = "{}_{}_{}".format(
|
||||
X, symmetric, "|".join(["{}:{}".format(k, v) for (k, v) in x_qcfg])
|
||||
)
|
||||
if quantized_inputs is not None and quantized_inputs_key in quantized_inputs:
|
||||
scale_X, bias_X, Q_X, Q_X_sum_int32 = quantized_inputs[quantized_inputs_key]
|
||||
else:
|
||||
if symmetric:
|
||||
delta_X = nf.make_node('ReduceMax', nf.make_node('Abs', X), {'axes':[-1]}) # keepdims = 1
|
||||
inv_delta_X = nf.make_node('Reciprocal', delta_X)
|
||||
norm_X = nf.make_node('Mul', [X, inv_delta_X])
|
||||
delta_X = nf.make_node("ReduceMax", nf.make_node("Abs", X), {"axes": [-1]}) # keepdims = 1
|
||||
inv_delta_X = nf.make_node("Reciprocal", delta_X)
|
||||
norm_X = nf.make_node("Mul", [X, inv_delta_X])
|
||||
bias_X = None
|
||||
assert x_qcfg.signed()
|
||||
else:
|
||||
reduce_max_X = nf.make_node('ReduceMax', X, {'axes':[-1]}) # keepdims = 1
|
||||
bias_X = nf.make_node('ReduceMin', X, {'axes':[-1]})
|
||||
delta_X = nf.make_node('Sub', [reduce_max_X, bias_X])
|
||||
inv_delta_X = nf.make_node('Reciprocal', delta_X)
|
||||
norm_X = nf.make_node('Mul', [nf.make_node('Sub', [X, bias_X]), inv_delta_X])
|
||||
reduce_max_X = nf.make_node("ReduceMax", X, {"axes": [-1]}) # keepdims = 1
|
||||
bias_X = nf.make_node("ReduceMin", X, {"axes": [-1]})
|
||||
delta_X = nf.make_node("Sub", [reduce_max_X, bias_X])
|
||||
inv_delta_X = nf.make_node("Reciprocal", delta_X)
|
||||
norm_X = nf.make_node("Mul", [nf.make_node("Sub", [X, bias_X]), inv_delta_X])
|
||||
|
||||
scale_X = nf.make_node('Mul', [delta_X, np.asarray(1.0 / x_qcfg.q_range()).astype(np.float32)])
|
||||
Q_Xf = nf.make_node('Mul', [norm_X, np.asarray(x_qcfg.q_range()).astype(np.float32)])
|
||||
Q_Xf = nf.make_node('Add', [Q_Xf, np.asarray(0.5).astype(np.float32)])
|
||||
Q_Xf = nf.make_node('Floor', Q_Xf)
|
||||
scale_X = nf.make_node("Mul", [delta_X, np.asarray(1.0 / x_qcfg.q_range()).astype(np.float32)])
|
||||
Q_Xf = nf.make_node("Mul", [norm_X, np.asarray(x_qcfg.q_range()).astype(np.float32)])
|
||||
Q_Xf = nf.make_node("Add", [Q_Xf, np.asarray(0.5).astype(np.float32)])
|
||||
Q_Xf = nf.make_node("Floor", Q_Xf)
|
||||
if onnx_opset_ver < 11:
|
||||
Q_Xf = nf.make_node('Clip', Q_Xf, {'max':x_qcfg.q_max(), 'min':x_qcfg.q_min()})
|
||||
Q_Xf = nf.make_node("Clip", Q_Xf, {"max": x_qcfg.q_max(), "min": x_qcfg.q_min()})
|
||||
else:
|
||||
# Clip changed min max to inputs in opset 11
|
||||
Q_Xf = nf.make_node('Clip', [Q_Xf, np.asarray(x_qcfg.q_min()).astype(np.float32), np.asarray(x_qcfg.q_max()).astype(np.float32)])
|
||||
Q_X = nf.make_node('Cast', Q_Xf, {'to':int({np.uint8 : onnx.TensorProto.UINT8,
|
||||
np.int8 : onnx.TensorProto.INT8,
|
||||
np.uint16 : onnx.TensorProto.UINT16,
|
||||
np.int16 : onnx.TensorProto.INT16}[x_qcfg.q_type()])})
|
||||
Q_Xf = nf.make_node(
|
||||
"Clip",
|
||||
[
|
||||
Q_Xf,
|
||||
np.asarray(x_qcfg.q_min()).astype(np.float32),
|
||||
np.asarray(x_qcfg.q_max()).astype(np.float32),
|
||||
],
|
||||
)
|
||||
Q_X = nf.make_node(
|
||||
"Cast",
|
||||
Q_Xf,
|
||||
{
|
||||
"to": int(
|
||||
{
|
||||
np.uint8: onnx.TensorProto.UINT8,
|
||||
np.int8: onnx.TensorProto.INT8,
|
||||
np.uint16: onnx.TensorProto.UINT16,
|
||||
np.int16: onnx.TensorProto.INT16,
|
||||
}[x_qcfg.q_type()]
|
||||
)
|
||||
},
|
||||
)
|
||||
|
||||
if symmetric:
|
||||
Q_X_sum_int32 = None
|
||||
else:
|
||||
Q_X_sum_int32 = nf.make_node_with_axes('ReduceSum', nf.make_node('Cast', Q_X, {'to':int(onnx.TensorProto.INT32)}), [-1], onnx_opset_ver)
|
||||
Q_X_sum_int32 = nf.make_node_with_axes(
|
||||
"ReduceSum", nf.make_node("Cast", Q_X, {"to": int(onnx.TensorProto.INT32)}), [-1], onnx_opset_ver
|
||||
)
|
||||
|
||||
if quantized_inputs is not None:
|
||||
quantized_inputs[quantized_inputs_key] = (scale_X, bias_X, Q_X, Q_X_sum_int32)
|
||||
|
||||
# MatMulInteger
|
||||
if x_qcfg.q_type_bits() == 8:
|
||||
Q_Y = nf.make_node('MatMulInteger', [Q_X, qparam])
|
||||
Q_Y = nf.make_node("MatMulInteger", [Q_X, qparam])
|
||||
else:
|
||||
Q_Y = nf.make_node('MatMulInteger16', [Q_X, qparam])
|
||||
Q_Y = nf.make_node("MatMulInteger16", [Q_X, qparam])
|
||||
Q_Y.domain = "com.microsoft"
|
||||
|
||||
# Dequantize
Y = in_node.output[0]
if symmetric:
nf.make_node('Mul',
[nf.make_node('Mul', [step, scale_X]),
nf.make_node('Cast', Q_Y, {'to': int(onnx.TensorProto.FLOAT)})],
output_names=Y)
nf.make_node(
"Mul",
[nf.make_node("Mul", [step, scale_X]), nf.make_node("Cast", Q_Y, {"to": int(onnx.TensorProto.FLOAT)})],
output_names=Y,
)
else:
o0 = nf.make_node('Mul', [nf.make_node('Mul', [step, scale_X]),
nf.make_node('Cast', Q_Y, {'to': int(onnx.TensorProto.FLOAT)})])
o1 = nf.make_node('Mul', [nf.make_node('Mul', [step, bias_X]), qparam_rowsum])
o2 = nf.make_node('Mul', [base, nf.make_node('Mul', [scale_X, nf.make_node('Cast', Q_X_sum_int32, {'to':int(onnx.TensorProto.FLOAT)})])])
o3 = nf.make_node('Mul', [base, nf.make_node('Mul', [bias_X, np.asarray(float(input_dim)).astype(np.float32)])])
nf.make_node('Sum', [o3, o2, o1, o0], output_names=Y)
o0 = nf.make_node(
"Mul",
[nf.make_node("Mul", [step, scale_X]), nf.make_node("Cast", Q_Y, {"to": int(onnx.TensorProto.FLOAT)})],
)
o1 = nf.make_node("Mul", [nf.make_node("Mul", [step, bias_X]), qparam_rowsum])
o2 = nf.make_node(
"Mul",
[
base,
nf.make_node(
"Mul", [scale_X, nf.make_node("Cast", Q_X_sum_int32, {"to": int(onnx.TensorProto.FLOAT)})]
),
],
)
o3 = nf.make_node(
"Mul", [base, nf.make_node("Mul", [bias_X, np.asarray(float(input_dim)).astype(np.float32)])]
)
nf.make_node("Sum", [o3, o2, o1, o0], output_names=Y)

if update_qcfg_dict:
qcfg_dict[in_node.output[0]] = node_qcfg

return True

def upgrade_op(nf, in_n):
|
||||
if in_n.op_type == 'Slice' and len(in_n.input) == 1:
|
||||
if in_n.op_type == "Slice" and len(in_n.input) == 1:
|
||||
# convert opset9 Slice to opset10
|
||||
with nf.scoped_prefix(in_n.name) as scoped_prefix:
|
||||
slice_inputs = [in_n.input[0],
|
||||
np.asarray(NodeFactory.get_attribute(in_n,'starts')).astype(np.int64),
|
||||
np.asarray(NodeFactory.get_attribute(in_n,'ends')).astype(np.int64),
|
||||
np.asarray(NodeFactory.get_attribute(in_n,'axes')).astype(np.int64)]
|
||||
nf.make_node('Slice', slice_inputs, output_names=list(in_n.output))
|
||||
slice_inputs = [
|
||||
in_n.input[0],
|
||||
np.asarray(NodeFactory.get_attribute(in_n, "starts")).astype(np.int64),
|
||||
np.asarray(NodeFactory.get_attribute(in_n, "ends")).astype(np.int64),
|
||||
np.asarray(NodeFactory.get_attribute(in_n, "axes")).astype(np.int64),
|
||||
]
|
||||
nf.make_node("Slice", slice_inputs, output_names=list(in_n.output))
|
||||
return True
|
||||
elif in_n.op_type == 'TopK' and len(in_n.input) == 1:
|
||||
elif in_n.op_type == "TopK" and len(in_n.input) == 1:
|
||||
# convert opset1 TopK to opset10
|
||||
with nf.scoped_prefix(in_n.name) as scoped_prefix:
|
||||
topk_inputs = [in_n.input[0],
|
||||
np.asarray([NodeFactory.get_attribute(in_n,'k')]).astype(np.int64)]
|
||||
nf.make_node('TopK', topk_inputs, {'axis':NodeFactory.get_attribute(in_n,'axis',-1)}, output_names=list(in_n.output))
|
||||
topk_inputs = [in_n.input[0], np.asarray([NodeFactory.get_attribute(in_n, "k")]).astype(np.int64)]
|
||||
nf.make_node(
|
||||
"TopK",
|
||||
topk_inputs,
|
||||
{"axis": NodeFactory.get_attribute(in_n, "axis", -1)},
|
||||
output_names=list(in_n.output),
|
||||
)
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
|
||||
# quantize matmul to MatMulInteger using asymm uint8
|
||||
def convert_matmul_model(input_model, output_model, only_for_scan=False, share_input_quantization=False, preset_str='asymm8_param0_input1', qcfg_json=None, export_qcfg_json=None):
|
||||
preset_qcfgs = {'asymm8_param0_input1' : {'W' : dict(QuantizeConfig(signed=1, reserved_bits=0, type_bits=8)),
|
||||
'X' : dict(QuantizeConfig(signed=0, reserved_bits=1, type_bits=8)),
|
||||
'Symmetric' : 0},
|
||||
'symm16_param3_input3' : {'W' : dict(QuantizeConfig(signed=1, reserved_bits=3, type_bits=16)),
|
||||
'X' : dict(QuantizeConfig(signed=1, reserved_bits=3, type_bits=16)),
|
||||
'Symmetric' : 1}}
|
||||
def convert_matmul_model(
|
||||
input_model,
|
||||
output_model,
|
||||
only_for_scan=False,
|
||||
share_input_quantization=False,
|
||||
preset_str="asymm8_param0_input1",
|
||||
qcfg_json=None,
|
||||
export_qcfg_json=None,
|
||||
):
|
||||
preset_qcfgs = {
|
||||
"asymm8_param0_input1": {
|
||||
"W": dict(QuantizeConfig(signed=1, reserved_bits=0, type_bits=8)),
|
||||
"X": dict(QuantizeConfig(signed=0, reserved_bits=1, type_bits=8)),
|
||||
"Symmetric": 0,
|
||||
},
|
||||
"symm16_param3_input3": {
|
||||
"W": dict(QuantizeConfig(signed=1, reserved_bits=3, type_bits=16)),
|
||||
"X": dict(QuantizeConfig(signed=1, reserved_bits=3, type_bits=16)),
|
||||
"Symmetric": 1,
|
||||
},
|
||||
}
|
||||
default_qcfg = preset_qcfgs[preset_str]
|
||||
in_mp = onnx.load(input_model)
|
||||
|
||||
qcfg_dict = {}
|
||||
if qcfg_json and not export_qcfg_json:
|
||||
with open(qcfg_json, 'r') as f:
|
||||
with open(qcfg_json, "r") as f:
|
||||
qcfg_dict = json.load(f)
|
||||
|
||||
out_mp = onnx.ModelProto()
|
||||
out_mp.CopyFrom(in_mp)
|
||||
out_mp.ir_version = 5 # update ir version to avoid requirement of initializer in graph input
|
||||
onnx_opset_ver = ensure_opset(out_mp, 10) # bump up to ONNX opset 10, which is required for MatMulInteger
|
||||
ensure_opset(out_mp, 1, 'com.microsoft') # add MS domain for MatMulInteger16
|
||||
out_mp.graph.ClearField('node')
|
||||
out_mp.ir_version = 5 # update ir version to avoid requirement of initializer in graph input
|
||||
onnx_opset_ver = ensure_opset(out_mp, 10) # bump up to ONNX opset 10, which is required for MatMulInteger
|
||||
ensure_opset(out_mp, 1, "com.microsoft") # add MS domain for MatMulInteger16
|
||||
out_mp.graph.ClearField("node")
|
||||
nf = NodeFactory(out_mp.graph)
|
||||
converted_weights = {} # remember MatMul weights that have been converted, in case of sharing
|
||||
quantized_inputs = {} if share_input_quantization else None # remember quantized inputs that might be able to share between MatMuls
|
||||
converted_weights = {} # remember MatMul weights that have been converted, in case of sharing
|
||||
quantized_inputs = (
|
||||
{} if share_input_quantization else None
|
||||
) # remember quantized inputs that might be able to share between MatMuls
|
||||
for in_n in in_mp.graph.node:
|
||||
if upgrade_op(nf, in_n):
|
||||
continue
|
||||
|
||||
if in_n.op_type == 'MatMul' and not only_for_scan:
|
||||
if quantize_matmul_2d_with_weight(in_n, in_mp.graph, nf, converted_weights, quantized_inputs, qcfg_dict, export_qcfg_json, default_qcfg, onnx_opset_ver):
|
||||
if in_n.op_type == "MatMul" and not only_for_scan:
|
||||
if quantize_matmul_2d_with_weight(
|
||||
in_n,
|
||||
in_mp.graph,
|
||||
nf,
|
||||
converted_weights,
|
||||
quantized_inputs,
|
||||
qcfg_dict,
|
||||
export_qcfg_json,
|
||||
default_qcfg,
|
||||
onnx_opset_ver,
|
||||
):
|
||||
continue
|
||||
|
||||
out_n = out_mp.graph.node.add()
|
||||
out_n.CopyFrom(in_n)
|
||||
if in_n.op_type == 'Scan' or in_n.op_type == 'Loop':
|
||||
in_subgraph = NodeFactory.get_attribute(in_n, 'body')
|
||||
out_subgraph = NodeFactory.get_attribute(out_n, 'body')
|
||||
out_subgraph.ClearField('node')
|
||||
if in_n.op_type == "Scan" or in_n.op_type == "Loop":
|
||||
in_subgraph = NodeFactory.get_attribute(in_n, "body")
|
||||
out_subgraph = NodeFactory.get_attribute(out_n, "body")
|
||||
out_subgraph.ClearField("node")
|
||||
scan_nf = NodeFactory(out_mp.graph, out_subgraph)
|
||||
subgraph_quantized_inputs = {} if share_input_quantization else None # remember quantized inputs that might be able to share between MatMuls
|
||||
subgraph_quantized_inputs = (
|
||||
{} if share_input_quantization else None
|
||||
) # remember quantized inputs that might be able to share between MatMuls
|
||||
for in_sn in in_subgraph.node:
|
||||
if in_sn.op_type == 'MatMul':
|
||||
if quantize_matmul_2d_with_weight(in_sn, in_subgraph, scan_nf, converted_weights, subgraph_quantized_inputs, qcfg_dict, export_qcfg_json, default_qcfg, onnx_opset_ver):
|
||||
if in_sn.op_type == "MatMul":
|
||||
if quantize_matmul_2d_with_weight(
|
||||
in_sn,
|
||||
in_subgraph,
|
||||
scan_nf,
|
||||
converted_weights,
|
||||
subgraph_quantized_inputs,
|
||||
qcfg_dict,
|
||||
export_qcfg_json,
|
||||
default_qcfg,
|
||||
onnx_opset_ver,
|
||||
):
|
||||
continue
|
||||
|
||||
if upgrade_op(scan_nf, in_sn):
|
||||
|
@ -323,25 +433,55 @@ def convert_matmul_model(input_model, output_model, only_for_scan=False, share_i
|
|||
|
||||
onnx.save(out_mp, output_model)
|
||||
if export_qcfg_json:
|
||||
with open(qcfg_json, 'w') as f:
|
||||
with open(qcfg_json, "w") as f:
|
||||
f.write(json.dumps(qcfg_dict, indent=2))
|
||||
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--input', required=True, help='The input model file')
|
||||
parser.add_argument('--output', required=True, help='The output model file')
|
||||
parser.add_argument('--default_qcfg', help='The preset of quantization of <asymm|symm><qbits>_param<reserve_bit>_input<reserve_bit>', choices=['asymm8_param0_input1', 'symm16_param3_input3'], default='asymm8_param0_input1')
|
||||
parser.add_argument('--qcfg_json', help='The quantization config json file for read or write.', default=None)
|
||||
parser.add_argument('--export_qcfg_json', help='If set, write default quantization config to qcfg_json file.', action='store_true', default=False)
|
||||
parser.add_argument('--only_for_scan', help='If set, apply quantization of MatMul only inside scan', action='store_true', default=False)
|
||||
parser.add_argument('--share_input_quantization', help='If set, allow input quantization to be shared if the same input is used in multiple MatMul', action='store_true', default=False)
|
||||
return parser.parse_args()
|
||||
|
||||
if __name__ == '__main__':
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--input", required=True, help="The input model file")
|
||||
parser.add_argument("--output", required=True, help="The output model file")
|
||||
parser.add_argument(
|
||||
"--default_qcfg",
|
||||
help="The preset of quantization of <asymm|symm><qbits>_param<reserve_bit>_input<reserve_bit>",
|
||||
choices=["asymm8_param0_input1", "symm16_param3_input3"],
|
||||
default="asymm8_param0_input1",
|
||||
)
|
||||
parser.add_argument("--qcfg_json", help="The quantization config json file for read or write.", default=None)
|
||||
parser.add_argument(
|
||||
"--export_qcfg_json",
|
||||
help="If set, write default quantization config to qcfg_json file.",
|
||||
action="store_true",
|
||||
default=False,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--only_for_scan",
|
||||
help="If set, apply quantization of MatMul only inside scan",
|
||||
action="store_true",
|
||||
default=False,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--share_input_quantization",
|
||||
help="If set, allow input quantization to be shared if the same input is used in multiple MatMul",
|
||||
action="store_true",
|
||||
default=False,
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_arguments()
|
||||
print('input model: ' + args.input)
|
||||
print('output model ' + args.output)
|
||||
print('Quantize MatMul to MatMulInteger...')
|
||||
print("input model: " + args.input)
|
||||
print("output model " + args.output)
|
||||
print("Quantize MatMul to MatMulInteger...")
|
||||
assert not args.export_qcfg_json or args.qcfg_json, "--qcfg_json must be specified when --export_qcfg_json is used"
|
||||
convert_matmul_model(args.input, args.output, args.only_for_scan, args.share_input_quantization, args.default_qcfg, args.qcfg_json, args.export_qcfg_json)
|
||||
print('Done!')
|
||||
convert_matmul_model(
|
||||
args.input,
|
||||
args.output,
|
||||
args.only_for_scan,
|
||||
args.share_input_quantization,
|
||||
args.default_qcfg,
|
||||
args.qcfg_json,
|
||||
args.export_qcfg_json,
|
||||
)
|
||||
print("Done!")
|
||||
@ -1,21 +1,25 @@
import numpy as np
from numpy.testing import assert_array_equal
import onnxruntime as ort
import onnx
from onnx import helper
from onnxruntime.nuphar.node_factory import ensure_opset
from onnxruntime.nuphar.model_editor import convert_loop_to_scan_model
from onnxruntime.tools.symbolic_shape_infer import SymbolicShapeInference, get_shape_from_type_proto
import onnxruntime.tools.onnxruntime_test as ort_test
import argparse
import copy
import os

import numpy as np
import onnx
from numpy.testing import assert_array_equal
from onnx import helper

import onnxruntime as ort
import onnxruntime.tools.onnxruntime_test as ort_test
from onnxruntime.nuphar.model_editor import convert_loop_to_scan_model
from onnxruntime.nuphar.node_factory import ensure_opset
from onnxruntime.tools.symbolic_shape_infer import SymbolicShapeInference, get_shape_from_type_proto

def run_shape_inference(input_model, output_model):
|
||||
in_mp = onnx.load(input_model)
|
||||
in_mp = SymbolicShapeInference.infer_shapes(in_mp, auto_merge=True)
|
||||
onnx.save(in_mp, output_model)
|
||||
|
||||
|
||||
# use this function to make a loop op's output as model output.
|
||||
# it helps to debug data issues when edited model outputs do not match the original model.
|
||||
def extract_loop_outputs_as_model_outputs(model):
|
||||
|
@ -29,13 +33,15 @@ def extract_loop_outputs_as_model_outputs(model):
|
|||
break
|
||||
|
||||
for node in model.graph.node:
|
||||
if node.op_type == 'Loop':
|
||||
if node.op_type == "Loop":
|
||||
# for debugging to make scan output as model graph output
|
||||
set_op_output_as_model_output(node, model.graph)
|
||||
|
||||
|
||||
def run_with_ort(model_path, symbolic_dims={}, feeds=None, ort_test_case_dir=None):
|
||||
_, feeds, outputs = ort_test.run_model(model_path, symbolic_dims=symbolic_dims,
|
||||
feeds=feeds, override_initializers=False)
|
||||
_, feeds, outputs = ort_test.run_model(
|
||||
model_path, symbolic_dims=symbolic_dims, feeds=feeds, override_initializers=False
|
||||
)
|
||||
|
||||
if ort_test_case_dir:
|
||||
model = onnx.load(model_path)
|
||||
|
@ -44,61 +50,73 @@ def run_with_ort(model_path, symbolic_dims={}, feeds=None, ort_test_case_dir=Non
|
|||
if not os.path.exists(ort_test_case_dir):
|
||||
os.makedirs(ort_test_case_dir)
|
||||
|
||||
test_data_set_dir = os.path.join(ort_test_case_dir, 'test_data_set_0')
|
||||
test_data_set_dir = os.path.join(ort_test_case_dir, "test_data_set_0")
|
||||
if not os.path.exists(test_data_set_dir):
|
||||
os.makedirs(test_data_set_dir)
|
||||
|
||||
onnx.save(model, os.path.join(ort_test_case_dir, 'model.onnx'))
|
||||
onnx.save(model, os.path.join(ort_test_case_dir, "model.onnx"))
|
||||
for i, (input_name, input) in enumerate(feeds.items()):
|
||||
onnx.save_tensor(onnx.numpy_helper.from_array(input, input_name),
|
||||
os.path.join(test_data_set_dir, 'input_{0}.pb'.format(i)))
|
||||
onnx.save_tensor(
|
||||
onnx.numpy_helper.from_array(input, input_name),
|
||||
os.path.join(test_data_set_dir, "input_{0}.pb".format(i)),
|
||||
)
|
||||
|
||||
output_names = [output.name for output in model.graph.output]
|
||||
output_dict = dict(zip(output_names, outputs))
|
||||
for i, (output_name, output) in enumerate(output_dict.items()):
|
||||
onnx.save_tensor(onnx.numpy_helper.from_array(output, output_name),
|
||||
os.path.join(test_data_set_dir, 'output_{0}.pb'.format(i)))
|
||||
onnx.save_tensor(
|
||||
onnx.numpy_helper.from_array(output, output_name),
|
||||
os.path.join(test_data_set_dir, "output_{0}.pb".format(i)),
|
||||
)
|
||||
|
||||
save_ort_test_case(ort_test_case_dir)
|
||||
|
||||
return feeds, outputs
|
||||
|
||||
|
||||
def validate_with_ort(input_filename, output_filename, symbolic_dims={}):
|
||||
feeds, loop_output = run_with_ort(input_filename, symbolic_dims=symbolic_dims)
|
||||
_, scan_output = run_with_ort(output_filename, symbolic_dims=symbolic_dims, feeds=feeds)
|
||||
|
||||
assert(len(loop_output) == len(scan_output))
|
||||
assert len(loop_output) == len(scan_output)
|
||||
for index in range(0, len(loop_output)):
|
||||
assert_array_equal(loop_output[index], scan_output[index])
|
||||
|
||||
|
||||
def convert_loop_to_scan_and_validate(input_filename, output_filename, symbolic_dims={}):
|
||||
convert_loop_to_scan_model(args.input, args.output)
|
||||
validate_with_ort(args.input, args.output, symbolic_dims=symbolic_dims)
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--tool', help='what to do',
|
||||
choices=['run_shape_inference',
|
||||
'run_with_ort',
|
||||
'validate_with_ort',
|
||||
'convert_loop_to_scan_and_validate'])
|
||||
parser.add_argument(
|
||||
"--tool",
|
||||
help="what to do",
|
||||
choices=["run_shape_inference", "run_with_ort", "validate_with_ort", "convert_loop_to_scan_and_validate"],
|
||||
)
|
||||
|
||||
parser.add_argument('--input', help='The input model file', default=None)
|
||||
parser.add_argument('--output', help='The output model file', default=None)
|
||||
parser.add_argument('--symbolic_dims', default={}, type=lambda s: dict(x.split("=") for x in s.split(",")),
|
||||
help='Comma separated name=value pairs for any symbolic dimensions in the model input. '
|
||||
'e.g. --symbolic_dims batch=1,seqlen=5. '
|
||||
'If not provided, the value of 1 will be used for all symbolic dimensions.')
|
||||
parser.add_argument('--ort_test_case_dir', help='ort test case dir', default=None)
|
||||
parser.add_argument("--input", help="The input model file", default=None)
|
||||
parser.add_argument("--output", help="The output model file", default=None)
|
||||
parser.add_argument(
|
||||
"--symbolic_dims",
|
||||
default={},
|
||||
type=lambda s: dict(x.split("=") for x in s.split(",")),
|
||||
help="Comma separated name=value pairs for any symbolic dimensions in the model input. "
|
||||
"e.g. --symbolic_dims batch=1,seqlen=5. "
|
||||
"If not provided, the value of 1 will be used for all symbolic dimensions.",
|
||||
)
|
||||
parser.add_argument("--ort_test_case_dir", help="ort test case dir", default=None)
|
||||
return parser.parse_args()
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_arguments()
|
||||
if args.tool == 'run_shape_inference':
|
||||
if args.tool == "run_shape_inference":
|
||||
run_shape_inference(args.input, args.output)
|
||||
elif args.tool == 'run_with_ort':
|
||||
elif args.tool == "run_with_ort":
|
||||
run_with_ort(args.input, symbolic_dims=args.symbolic_dims, ort_test_case_dir=args.ort_test_case_dir)
|
||||
elif args.tool == 'validate_with_ort':
|
||||
elif args.tool == "validate_with_ort":
|
||||
validate_with_ort(args.input, args.output, symbolic_dims=args.symbolic_dims)
|
||||
elif args.tool == 'convert_loop_to_scan_and_validate':
|
||||
elif args.tool == "convert_loop_to_scan_and_validate":
|
||||
convert_loop_to_scan_and_validate(args.input, args.output, symbolic_dims=args.symbolic_dims)
|
||||
|
|
|
@ -1,19 +1,22 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import json
|
||||
import re
|
||||
|
||||
# -*- coding: UTF-8 -*-
|
||||
from enum import Enum
|
||||
import json
|
||||
|
||||
import numpy as np
|
||||
import onnx
|
||||
from onnx import helper, numpy_helper
|
||||
import re
|
||||
|
||||
|
||||
class NodeFactory:
|
||||
node_count_ = 0
|
||||
const_count_ = 0
|
||||
|
||||
def __init__(self, main_graph, sub_graph=None, prefix=''):
|
||||
def __init__(self, main_graph, sub_graph=None, prefix=""):
|
||||
self.graph_ = sub_graph if sub_graph else main_graph
|
||||
self.main_graph_ = main_graph
|
||||
self.name_prefix_ = prefix
|
||||
|
@ -91,15 +94,17 @@ class NodeFactory:
|
|||
|
||||
value_info.CopyFrom(helper.make_tensor_value_info(name, data_type, shape))
|
||||
|
||||
def make_initializer(self, ndarray, name='', in_main_graph=False):
|
||||
def make_initializer(self, ndarray, name="", in_main_graph=False):
|
||||
new_initializer = (self.main_graph_ if in_main_graph else self.graph_).initializer.add()
|
||||
new_name = name
|
||||
if len(new_name) == 0:
|
||||
already_existed = True
|
||||
while already_existed:
|
||||
new_name = self.name_prefix_ + '_Const_' + str(NodeFactory.const_count_)
|
||||
new_name = self.name_prefix_ + "_Const_" + str(NodeFactory.const_count_)
|
||||
NodeFactory.const_count_ = NodeFactory.const_count_ + 1
|
||||
already_existed = new_name in [i.name for i in list(self.main_graph_.initializer) + list(self.graph_.initializer)]
|
||||
already_existed = new_name in [
|
||||
i.name for i in list(self.main_graph_.initializer) + list(self.graph_.initializer)
|
||||
]
|
||||
new_initializer.CopyFrom(numpy_helper.from_array(ndarray, new_name))
|
||||
return new_initializer
|
||||
|
||||
|
@ -118,12 +123,12 @@ class NodeFactory:
|
|||
new_initializer = self.make_initializer(i)
|
||||
input_names.append(new_initializer.name)
|
||||
else:
|
||||
assert False # unexpected type in input
|
||||
assert False # unexpected type in input
|
||||
|
||||
if not node:
|
||||
node = self.graph_.node.add()
|
||||
|
||||
name = self.name_prefix_ + op_type + '_' + str(NodeFactory.node_count_)
|
||||
name = self.name_prefix_ + op_type + "_" + str(NodeFactory.node_count_)
|
||||
NodeFactory.node_count_ = NodeFactory.node_count_ + 1
|
||||
|
||||
if not output_names:
|
||||
|
@ -134,9 +139,9 @@ class NodeFactory:
|
|||
|
||||
# Squeeze/Unsqueeze/ReduceSum changed axes to input[1] in opset 13
|
||||
def make_node_with_axes(self, op_type, input, axes, onnx_opset_ver, attributes={}, output_names=None):
|
||||
assert op_type in ['Squeeze', 'Unsqueeze', 'ReduceSum']
|
||||
assert op_type in ["Squeeze", "Unsqueeze", "ReduceSum"]
|
||||
if onnx_opset_ver < 13:
|
||||
attributes.update({'axes':axes})
|
||||
attributes.update({"axes": axes})
|
||||
return self.make_node(op_type, input, attributes=attributes, output_names=output_names)
|
||||
else:
|
||||
axes = np.asarray(axes).astype(np.int64)
|
||||
|
@ -149,13 +154,14 @@ class NodeFactory:
|
|||
# Split changed split to input[1] in opset 13
|
||||
def make_split_node(self, input, split, onnx_opset_ver, attributes, output_names=None):
|
||||
if onnx_opset_ver < 13:
|
||||
attributes.update({'split':split})
|
||||
return self.make_node('Split', input, attributes=attributes, output_names=output_names)
|
||||
attributes.update({"split": split})
|
||||
return self.make_node("Split", input, attributes=attributes, output_names=output_names)
|
||||
else:
|
||||
split = np.asarray(split).astype(np.int64)
|
||||
return self.make_node('Split', [input, split], attributes=attributes, output_names=output_names)
|
||||
return self.make_node("Split", [input, split], attributes=attributes, output_names=output_names)
|
||||
|
||||
def ensure_opset(mp, ver, domains=['onnx', '']):
|
||||
|
||||
def ensure_opset(mp, ver, domains=["onnx", ""]):
|
||||
if type(domains) == str:
|
||||
domains = [domains]
|
||||
assert type(domains) == list
|
||||
|
|
|
@ -4,54 +4,107 @@
|
|||
# -*- coding: UTF-8 -*-
|
||||
import argparse
|
||||
import multiprocessing
|
||||
import numpy as np
|
||||
import onnx
|
||||
# use lines below when building ONNX Runtime from source with --enable_pybind
|
||||
#import sys
|
||||
#sys.path.append(r'X:\Repos\Lotus\build\Windows\Release\Release')
|
||||
#sys.path.append('/repos/Lotus/build/Linux/Release')
|
||||
import onnxruntime
|
||||
from onnx import helper, numpy_helper
|
||||
from onnx import shape_inference
|
||||
from onnx import IR_VERSION
|
||||
import os
|
||||
from timeit import default_timer as timer
|
||||
|
||||
def generate_model(rnn_type, input_dim, hidden_dim, bidirectional, layers, model_name, batch_one=True, has_seq_len=False, onnx_opset_ver=7):
|
||||
import numpy as np
|
||||
import onnx
|
||||
from onnx import IR_VERSION, helper, numpy_helper, shape_inference
|
||||
|
||||
# use lines below when building ONNX Runtime from source with --enable_pybind
|
||||
# import sys
|
||||
# sys.path.append(r'X:\Repos\Lotus\build\Windows\Release\Release')
|
||||
# sys.path.append('/repos/Lotus/build/Linux/Release')
|
||||
import onnxruntime
|
||||
|
||||
|
||||
def generate_model(
|
||||
rnn_type,
|
||||
input_dim,
|
||||
hidden_dim,
|
||||
bidirectional,
|
||||
layers,
|
||||
model_name,
|
||||
batch_one=True,
|
||||
has_seq_len=False,
|
||||
onnx_opset_ver=7,
|
||||
):
|
||||
model = onnx.ModelProto()
|
||||
model.ir_version = IR_VERSION
|
||||
|
||||
|
||||
opset = model.opset_import.add()
|
||||
opset.domain == 'onnx'
|
||||
opset.domain == "onnx"
|
||||
opset.version = onnx_opset_ver
|
||||
num_directions = 2 if bidirectional else 1
|
||||
|
||||
X = 'input'
|
||||
model.graph.input.add().CopyFrom(helper.make_tensor_value_info(X, onnx.TensorProto.FLOAT, ['s', 1 if batch_one else 'b', input_dim]))
|
||||
model.graph.initializer.add().CopyFrom(numpy_helper.from_array(np.asarray([0, 0, -1], dtype=np.int64), 'shape'))
|
||||
X = "input"
|
||||
model.graph.input.add().CopyFrom(
|
||||
helper.make_tensor_value_info(X, onnx.TensorProto.FLOAT, ["s", 1 if batch_one else "b", input_dim])
|
||||
)
|
||||
model.graph.initializer.add().CopyFrom(numpy_helper.from_array(np.asarray([0, 0, -1], dtype=np.int64), "shape"))
|
||||
|
||||
if has_seq_len:
|
||||
seq_len = 'seq_len'
|
||||
model.graph.input.add().CopyFrom(helper.make_tensor_value_info(seq_len, onnx.TensorProto.INT32, [1 if batch_one else 'b',]))
|
||||
seq_len = "seq_len"
|
||||
model.graph.input.add().CopyFrom(
|
||||
helper.make_tensor_value_info(
|
||||
seq_len,
|
||||
onnx.TensorProto.INT32,
|
||||
[
|
||||
1 if batch_one else "b",
|
||||
],
|
||||
)
|
||||
)
|
||||
|
||||
gates = {'lstm':4, 'gru':3, 'rnn':1}[rnn_type]
|
||||
gates = {"lstm": 4, "gru": 3, "rnn": 1}[rnn_type]
|
||||
for i in range(layers):
|
||||
layer_input_dim = (input_dim if i == 0 else hidden_dim * num_directions)
|
||||
model.graph.initializer.add().CopyFrom(numpy_helper.from_array(np.random.rand(num_directions, gates*hidden_dim, layer_input_dim).astype(np.float32), 'W'+str(i)))
|
||||
model.graph.initializer.add().CopyFrom(numpy_helper.from_array(np.random.rand(num_directions, gates*hidden_dim, hidden_dim).astype(np.float32), 'R'+str(i)))
|
||||
model.graph.initializer.add().CopyFrom(numpy_helper.from_array(np.random.rand(num_directions, 2*gates*hidden_dim).astype(np.float32), 'B'+str(i)))
|
||||
layer_inputs = [X, 'W'+str(i), 'R'+str(i), 'B'+str(i)]
|
||||
layer_input_dim = input_dim if i == 0 else hidden_dim * num_directions
|
||||
model.graph.initializer.add().CopyFrom(
|
||||
numpy_helper.from_array(
|
||||
np.random.rand(num_directions, gates * hidden_dim, layer_input_dim).astype(np.float32), "W" + str(i)
|
||||
)
|
||||
)
|
||||
model.graph.initializer.add().CopyFrom(
|
||||
numpy_helper.from_array(
|
||||
np.random.rand(num_directions, gates * hidden_dim, hidden_dim).astype(np.float32), "R" + str(i)
|
||||
)
|
||||
)
|
||||
model.graph.initializer.add().CopyFrom(
|
||||
numpy_helper.from_array(
|
||||
np.random.rand(num_directions, 2 * gates * hidden_dim).astype(np.float32), "B" + str(i)
|
||||
)
|
||||
)
|
||||
layer_inputs = [X, "W" + str(i), "R" + str(i), "B" + str(i)]
|
||||
if has_seq_len:
|
||||
layer_inputs += [seq_len]
|
||||
layer_outputs = ['layer_output_'+str(i)]
|
||||
model.graph.node.add().CopyFrom(helper.make_node(rnn_type.upper(), layer_inputs, layer_outputs, rnn_type+str(i), hidden_size=hidden_dim, direction='bidirectional' if bidirectional else 'forward'))
|
||||
model.graph.node.add().CopyFrom(helper.make_node('Transpose', layer_outputs, ['transposed_output_'+str(i)], 'transpose'+str(i), perm=[0,2,1,3]))
|
||||
model.graph.node.add().CopyFrom(helper.make_node('Reshape', ['transposed_output_'+str(i), 'shape'], ['reshaped_output_'+str(i)], 'reshape'+str(i)))
|
||||
X = 'reshaped_output_'+str(i)
|
||||
model.graph.output.add().CopyFrom(helper.make_tensor_value_info(X, onnx.TensorProto.FLOAT, ['s', 'b', hidden_dim * num_directions]))
|
||||
layer_outputs = ["layer_output_" + str(i)]
|
||||
model.graph.node.add().CopyFrom(
|
||||
helper.make_node(
|
||||
rnn_type.upper(),
|
||||
layer_inputs,
|
||||
layer_outputs,
|
||||
rnn_type + str(i),
|
||||
hidden_size=hidden_dim,
|
||||
direction="bidirectional" if bidirectional else "forward",
|
||||
)
|
||||
)
|
||||
model.graph.node.add().CopyFrom(
|
||||
helper.make_node(
|
||||
"Transpose", layer_outputs, ["transposed_output_" + str(i)], "transpose" + str(i), perm=[0, 2, 1, 3]
|
||||
)
|
||||
)
|
||||
model.graph.node.add().CopyFrom(
|
||||
helper.make_node(
|
||||
"Reshape", ["transposed_output_" + str(i), "shape"], ["reshaped_output_" + str(i)], "reshape" + str(i)
|
||||
)
|
||||
)
|
||||
X = "reshaped_output_" + str(i)
|
||||
model.graph.output.add().CopyFrom(
|
||||
helper.make_tensor_value_info(X, onnx.TensorProto.FLOAT, ["s", "b", hidden_dim * num_directions])
|
||||
)
|
||||
model = shape_inference.infer_shapes(model)
|
||||
onnx.save(model, model_name)
|
||||
|
||||
|
||||
def perf_run(sess, feeds, min_counts=5, min_duration_seconds=10):
|
||||
# warm up
|
||||
sess.run([], feeds)
|
||||
|
@ -70,19 +123,23 @@ def perf_run(sess, feeds, min_counts=5, min_duration_seconds=10):
|
|||
run = False
|
||||
return count, (end - start), per_iter_cost
|
||||
|
||||
|
||||
def top_n_avg(per_iter_cost, n):
|
||||
# following the perf test methodology in [timeit](https://docs.python.org/3/library/timeit.html#timeit.Timer.repeat)
|
||||
per_iter_cost.sort()
|
||||
return sum(per_iter_cost[:n]) * 1000 / n
|
||||
|
||||
|
||||
def get_num_threads():
|
||||
return os.environ['OMP_NUM_THREADS'] if 'OMP_NUM_THREADS' in os.environ else None
|
||||
return os.environ["OMP_NUM_THREADS"] if "OMP_NUM_THREADS" in os.environ else None
|
||||
|
||||
|
||||
def set_num_threads(num_threads):
|
||||
if num_threads:
|
||||
os.environ['OMP_NUM_THREADS'] = str(num_threads)
|
||||
os.environ["OMP_NUM_THREADS"] = str(num_threads)
|
||||
else:
|
||||
del os.environ['OMP_NUM_THREADS']
|
||||
del os.environ["OMP_NUM_THREADS"]
|
||||
|
||||
|
||||
class ScopedSetNumThreads:
|
||||
def __init__(self, num_threads):
|
||||
|
@ -95,117 +152,222 @@ class ScopedSetNumThreads:
|
|||
def __exit__(self, type, value, tb):
|
||||
set_num_threads(self.saved_num_threads_)
|
||||
|
||||
def perf_test(rnn_type, num_threads, input_dim, hidden_dim, bidirectional, layers, seq_len, batch_size, top_n=5, min_duration_seconds=10):
|
||||
model_name = '{}_i{}_h{}_{}_l{}_{}.onnx'.format(rnn_type, input_dim, hidden_dim,
|
||||
'bi' if bidirectional else '',
|
||||
layers,
|
||||
'batched' if batch_size > 1 else 'no_batch')
|
||||
|
||||
def perf_test(
|
||||
rnn_type,
|
||||
num_threads,
|
||||
input_dim,
|
||||
hidden_dim,
|
||||
bidirectional,
|
||||
layers,
|
||||
seq_len,
|
||||
batch_size,
|
||||
top_n=5,
|
||||
min_duration_seconds=10,
|
||||
):
|
||||
model_name = "{}_i{}_h{}_{}_l{}_{}.onnx".format(
|
||||
rnn_type,
|
||||
input_dim,
|
||||
hidden_dim,
|
||||
"bi" if bidirectional else "",
|
||||
layers,
|
||||
"batched" if batch_size > 1 else "no_batch",
|
||||
)
|
||||
|
||||
generate_model(rnn_type, input_dim, hidden_dim, bidirectional, layers, model_name, batch_size == 1)
|
||||
feeds = {'input':np.random.rand(seq_len, batch_size, input_dim).astype(np.float32)}
|
||||
feeds = {"input": np.random.rand(seq_len, batch_size, input_dim).astype(np.float32)}
|
||||
|
||||
# run original model in CPU provider, using all threads
|
||||
# there are some local thread pool inside LSTM/GRU CPU kernel
|
||||
# that cannot be controlled by OMP or intra_op_num_threads
|
||||
sess = onnxruntime.InferenceSession(model_name, providers=['CPUExecutionProvider'])
|
||||
sess = onnxruntime.InferenceSession(model_name, providers=["CPUExecutionProvider"])
|
||||
count, duration, per_iter_cost = perf_run(sess, feeds, min_counts=top_n, min_duration_seconds=min_duration_seconds)
|
||||
avg_rnn = top_n_avg(per_iter_cost, top_n)
|
||||
print('perf_rnn (with default threads) {}: run for {} iterations, top {} avg {:.3f} ms'.format(model_name, count, top_n, avg_rnn))
|
||||
print(
|
||||
"perf_rnn (with default threads) {}: run for {} iterations, top {} avg {:.3f} ms".format(
|
||||
model_name, count, top_n, avg_rnn
|
||||
)
|
||||
)
|
||||
|
||||
# run converted model in Nuphar, using specified threads
|
||||
with ScopedSetNumThreads(num_threads) as scoped_set_num_threads:
|
||||
# run Scan model converted from original in Nuphar
|
||||
from .model_editor import convert_to_scan_model
|
||||
from ..tools.symbolic_shape_infer import SymbolicShapeInference
|
||||
scan_model_name = os.path.splitext(model_name)[0] + '_scan.onnx'
|
||||
from .model_editor import convert_to_scan_model
|
||||
|
||||
scan_model_name = os.path.splitext(model_name)[0] + "_scan.onnx"
|
||||
convert_to_scan_model(model_name, scan_model_name)
|
||||
# note that symbolic shape inference is needed because model has symbolic batch dim, thus init_state is ConstantOfShape
|
||||
onnx.save(SymbolicShapeInference.infer_shapes(onnx.load(scan_model_name)), scan_model_name)
|
||||
sess = onnxruntime.InferenceSession(scan_model_name, providers=onnxruntime.get_available_providers())
|
||||
count, duration, per_iter_cost = perf_run(sess, feeds, min_counts=top_n, min_duration_seconds=min_duration_seconds)
|
||||
count, duration, per_iter_cost = perf_run(
|
||||
sess, feeds, min_counts=top_n, min_duration_seconds=min_duration_seconds
|
||||
)
|
||||
avg_scan = top_n_avg(per_iter_cost, top_n)
|
||||
print('perf_scan (with {} threads) {}: run for {} iterations, top {} avg {:.3f} ms'.format(num_threads, scan_model_name, count, top_n, avg_scan))
|
||||
print(
|
||||
"perf_scan (with {} threads) {}: run for {} iterations, top {} avg {:.3f} ms".format(
|
||||
num_threads, scan_model_name, count, top_n, avg_scan
|
||||
)
|
||||
)
|
||||
|
||||
# quantize Scan model to int8 and run in Nuphar
|
||||
from .model_quantizer import convert_matmul_model
|
||||
int8_model_name = os.path.splitext(model_name)[0] + '_int8.onnx'
|
||||
|
||||
int8_model_name = os.path.splitext(model_name)[0] + "_int8.onnx"
|
||||
convert_matmul_model(scan_model_name, int8_model_name)
|
||||
onnx.save(SymbolicShapeInference.infer_shapes(onnx.load(int8_model_name)), int8_model_name)
|
||||
sess = onnxruntime.InferenceSession(int8_model_name, providers=onnxruntime.get_available_providers())
|
||||
count, duration, per_iter_cost = perf_run(sess, feeds, min_counts=top_n, min_duration_seconds=min_duration_seconds)
|
||||
count, duration, per_iter_cost = perf_run(
|
||||
sess, feeds, min_counts=top_n, min_duration_seconds=min_duration_seconds
|
||||
)
|
||||
avg_int8 = top_n_avg(per_iter_cost, top_n)
|
||||
print('perf_int8 (with {} threads) {}: run for {} iterations, top {} avg {:.3f} ms'.format(num_threads, int8_model_name, count, top_n, avg_int8))
|
||||
print(
|
||||
"perf_int8 (with {} threads) {}: run for {} iterations, top {} avg {:.3f} ms".format(
|
||||
num_threads, int8_model_name, count, top_n, avg_int8
|
||||
)
|
||||
)
|
||||
|
||||
return avg_rnn, avg_scan, avg_int8
|
||||
|
||||
|
||||
def perf_test_auto(auto_file):
|
||||
# generate reports in csv format
|
||||
with open('single_thread_' + auto_file + '.csv', 'w') as f:
|
||||
print('single thread test: unidirection 4-layer lstm/gru/rnn with input_dim=128 batch_size=1', file=f)
|
||||
print('rnn_type,hidden,seq_len,avg_rnn,avg_nuphar_fp,avg_nuphar_int8,speedup_fp,speedup_int8', file=f)
|
||||
for rnn_type in ['lstm', 'gru', 'rnn']:
|
||||
with open("single_thread_" + auto_file + ".csv", "w") as f:
|
||||
print("single thread test: unidirection 4-layer lstm/gru/rnn with input_dim=128 batch_size=1", file=f)
|
||||
print("rnn_type,hidden,seq_len,avg_rnn,avg_nuphar_fp,avg_nuphar_int8,speedup_fp,speedup_int8", file=f)
|
||||
for rnn_type in ["lstm", "gru", "rnn"]:
|
||||
for hidden_dim in [32, 128, 1024, 2048]:
|
||||
for seq_len in [1, 16, 32, 64]:
|
||||
avg_rnn, avg_scan, avg_int8 = perf_test(rnn_type, 1, 128, hidden_dim, False, 4, seq_len, 1)
|
||||
print('{},{},{},{},{},{},{},{}'.format(rnn_type,hidden_dim, seq_len, avg_rnn, avg_scan, avg_int8, avg_rnn/avg_scan, avg_rnn/avg_int8), file=f)
|
||||
print(
|
||||
"{},{},{},{},{},{},{},{}".format(
|
||||
rnn_type,
|
||||
hidden_dim,
|
||||
seq_len,
|
||||
avg_rnn,
|
||||
avg_scan,
|
||||
avg_int8,
|
||||
avg_rnn / avg_scan,
|
||||
avg_rnn / avg_int8,
|
||||
),
|
||||
file=f,
|
||||
)
|
||||
|
||||
with open('multi_thread_' + auto_file + '.csv', 'w') as f:
|
||||
print('multi-thread test: unidirection 4-layer lstm/gru/rnn with input_dim=128 seq_len=32 batch_size=1', file=f)
|
||||
print('rnn_type,threads,hidden,avg_rnn,avg_nuphar_fp,avg_nuphar_int8,speedup_fp,speedup_int8', file=f)
|
||||
for rnn_type in ['lstm', 'gru', 'rnn']:
|
||||
with open("multi_thread_" + auto_file + ".csv", "w") as f:
|
||||
print("multi-thread test: unidirection 4-layer lstm/gru/rnn with input_dim=128 seq_len=32 batch_size=1", file=f)
|
||||
print("rnn_type,threads,hidden,avg_rnn,avg_nuphar_fp,avg_nuphar_int8,speedup_fp,speedup_int8", file=f)
|
||||
for rnn_type in ["lstm", "gru", "rnn"]:
|
||||
for num_threads in [1, 2, 4]:
|
||||
for hidden_dim in [32, 128, 1024, 2048]:
|
||||
avg_rnn, avg_scan, avg_int8 = perf_test(rnn_type, num_threads, 128, hidden_dim, False, 4, seq_len, 1)
|
||||
print('{},{},{},{},{},{},{},{}'.format(rnn_type,num_threads, hidden_dim, avg_rnn, avg_scan, avg_int8, avg_rnn/avg_scan, avg_rnn/avg_int8), file=f)
|
||||
avg_rnn, avg_scan, avg_int8 = perf_test(
|
||||
rnn_type, num_threads, 128, hidden_dim, False, 4, seq_len, 1
|
||||
)
|
||||
print(
|
||||
"{},{},{},{},{},{},{},{}".format(
|
||||
rnn_type,
|
||||
num_threads,
|
||||
hidden_dim,
|
||||
avg_rnn,
|
||||
avg_scan,
|
||||
avg_int8,
|
||||
avg_rnn / avg_scan,
|
||||
avg_rnn / avg_int8,
|
||||
),
|
||||
file=f,
|
||||
)
|
||||
|
||||
with open('batch_single_thread_' + auto_file + '.csv', 'w') as f:
|
||||
print('single thread test: unidirection 4-layer lstm/gru/rnn with input_dim=128 hidden_dim=1024', file=f)
|
||||
print('rnn_type,seq_len,batch_size,avg_rnn,avg_nuphar_fp,avg_nuphar_int8,speedup_fp,speedup_int8', file=f)
|
||||
for rnn_type in ['lstm', 'gru', 'rnn']:
|
||||
with open("batch_single_thread_" + auto_file + ".csv", "w") as f:
|
||||
print("single thread test: unidirection 4-layer lstm/gru/rnn with input_dim=128 hidden_dim=1024", file=f)
|
||||
print("rnn_type,seq_len,batch_size,avg_rnn,avg_nuphar_fp,avg_nuphar_int8,speedup_fp,speedup_int8", file=f)
|
||||
for rnn_type in ["lstm", "gru", "rnn"]:
|
||||
for seq_len in [1, 16, 32, 64]:
|
||||
for batch_size in [1, 4, 16, 64]:
|
||||
avg_rnn, avg_scan, avg_int8 = perf_test(rnn_type, 1, 128, 1024, False, 4, seq_len, batch_size)
|
||||
print('{},{},{},{},{},{},{},{}'.format(rnn_type,seq_len, batch_size, avg_rnn, avg_scan, avg_int8, avg_rnn/avg_scan, avg_rnn/avg_int8), file=f)
|
||||
print(
|
||||
"{},{},{},{},{},{},{},{}".format(
|
||||
rnn_type,
|
||||
seq_len,
|
||||
batch_size,
|
||||
avg_rnn,
|
||||
avg_scan,
|
||||
avg_int8,
|
||||
avg_rnn / avg_scan,
|
||||
avg_rnn / avg_int8,
|
||||
),
|
||||
file=f,
|
||||
)
|
||||
|
||||
with open('batch_multi_thread_' + auto_file + '.csv', 'w') as f:
|
||||
print('batch thread test: unidirection 4-layer lstm/gru/rnn with input_dim=128 hidden_dim=1024 seq_len=32', file=f)
|
||||
print('rnn_type,threads,batch_size,avg_rnn,avg_nuphar_fp,avg_nuphar_int8,speedup_fp,speedup_int8', file=f)
|
||||
for rnn_type in ['lstm', 'gru', 'rnn']:
|
||||
with open("batch_multi_thread_" + auto_file + ".csv", "w") as f:
|
||||
print(
|
||||
"batch thread test: unidirection 4-layer lstm/gru/rnn with input_dim=128 hidden_dim=1024 seq_len=32", file=f
|
||||
)
|
||||
print("rnn_type,threads,batch_size,avg_rnn,avg_nuphar_fp,avg_nuphar_int8,speedup_fp,speedup_int8", file=f)
|
||||
for rnn_type in ["lstm", "gru", "rnn"]:
|
||||
for num_threads in [1, 2, 4]:
|
||||
for batch_size in [1, 4, 16, 64]:
|
||||
avg_rnn, avg_scan, avg_int8 = perf_test(rnn_type, num_threads, 128, 1024, False, 4, 32, batch_size)
|
||||
print('{},{},{},{},{},{},{},{}'.format(rnn_type,num_threads, batch_size, avg_rnn, avg_scan, avg_int8, avg_rnn/avg_scan, avg_rnn/avg_int8), file=f)
|
||||
print(
|
||||
"{},{},{},{},{},{},{},{}".format(
|
||||
rnn_type,
|
||||
num_threads,
|
||||
batch_size,
|
||||
avg_rnn,
|
||||
avg_scan,
|
||||
avg_int8,
|
||||
avg_rnn / avg_scan,
|
||||
avg_rnn / avg_int8,
|
||||
),
|
||||
file=f,
|
||||
)
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--rnn_type', help='Type of rnn, one of lstm/gru/rnn', choices=['lstm', 'gru', 'rnn'], default='lstm')
|
||||
parser.add_argument('--input_dim', help='Input size of lstm/gru/rnn', type=int, default=128)
|
||||
parser.add_argument('--hidden_dim', help='Hidden size of lstm/gru/rnn', type=int, default=1024)
|
||||
parser.add_argument('--bidirectional', help='Use bidirectional', action='store_true', default=False)
|
||||
parser.add_argument('--layers', help='Number of layers', type=int, default=4)
|
||||
parser.add_argument('--seq_len', help='Sequence length', type=int, default=32)
|
||||
parser.add_argument('--batch_size', help='Batch size', type=int, default=1)
|
||||
parser.add_argument('--num_threads', help='Number of MKL threads', type=int, default=multiprocessing.cpu_count())
|
||||
parser.add_argument('--top_n', help='Fastest N samples to compute average time', type=int, default=5)
|
||||
parser.add_argument('--auto', help='Auto_name (usually CPU type) for auto test to generate (batch_)single|multithread_<auto_name>.csv files', default=None)
|
||||
return parser.parse_args()
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--rnn_type", help="Type of rnn, one of lstm/gru/rnn", choices=["lstm", "gru", "rnn"], default="lstm"
|
||||
)
|
||||
parser.add_argument("--input_dim", help="Input size of lstm/gru/rnn", type=int, default=128)
|
||||
parser.add_argument("--hidden_dim", help="Hidden size of lstm/gru/rnn", type=int, default=1024)
|
||||
parser.add_argument("--bidirectional", help="Use bidirectional", action="store_true", default=False)
|
||||
parser.add_argument("--layers", help="Number of layers", type=int, default=4)
|
||||
parser.add_argument("--seq_len", help="Sequence length", type=int, default=32)
|
||||
parser.add_argument("--batch_size", help="Batch size", type=int, default=1)
|
||||
parser.add_argument("--num_threads", help="Number of MKL threads", type=int, default=multiprocessing.cpu_count())
|
||||
parser.add_argument("--top_n", help="Fastest N samples to compute average time", type=int, default=5)
|
||||
parser.add_argument(
|
||||
"--auto",
|
||||
help="Auto_name (usually CPU type) for auto test to generate (batch_)single|multithread_<auto_name>.csv files",
|
||||
default=None,
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_arguments()
|
||||
if args.auto:
|
||||
perf_test_auto(args.auto)
|
||||
else:
|
||||
print('Testing model: ', args.rnn_type.upper())
|
||||
print(' input_dim: ', args.input_dim)
|
||||
print(' hidden_dim: ', args.hidden_dim)
|
||||
print("Testing model: ", args.rnn_type.upper())
|
||||
print(" input_dim: ", args.input_dim)
|
||||
print(" hidden_dim: ", args.hidden_dim)
|
||||
if args.bidirectional:
|
||||
print(' bidirectional')
|
||||
print(' layers: ', args.layers)
|
||||
print(" bidirectional")
|
||||
print(" layers: ", args.layers)
|
||||
cpu_count = multiprocessing.cpu_count()
|
||||
num_threads = max(min(args.num_threads, cpu_count), 1)
|
||||
print('Test setup')
|
||||
print(' cpu_count: ', cpu_count)
|
||||
print(' num_threads: ', num_threads)
|
||||
print(' seq_len: ', args.seq_len)
|
||||
print(' batch_size: ', args.batch_size)
|
||||
perf_test(args.rnn_type, num_threads, args.input_dim, args.hidden_dim, args.bidirectional, args.layers, args.seq_len, args.batch_size, args.top_n)
|
||||
print("Test setup")
|
||||
print(" cpu_count: ", cpu_count)
|
||||
print(" num_threads: ", num_threads)
|
||||
print(" seq_len: ", args.seq_len)
|
||||
print(" batch_size: ", args.batch_size)
|
||||
perf_test(
|
||||
args.rnn_type,
|
||||
num_threads,
|
||||
args.input_dim,
|
||||
args.hidden_dim,
|
||||
args.bidirectional,
|
||||
args.layers,
|
||||
args.seq_len,
|
||||
args.batch_size,
|
||||
args.top_n,
|
||||
)
|
||||
|
|
|
@ -5,15 +5,15 @@
|
|||
"""
|
||||
Implements ONNX's backend API.
|
||||
"""
|
||||
from onnx import ModelProto
|
||||
from onnx import helper
|
||||
from onnx import version
|
||||
from onnx.checker import check_model
|
||||
from onnx.backend.base import Backend
|
||||
from onnxruntime import InferenceSession, SessionOptions, get_device, get_available_providers
|
||||
from onnxruntime.backend.backend_rep import OnnxRuntimeBackendRep
|
||||
import unittest
|
||||
import os
|
||||
import unittest
|
||||
|
||||
from onnx import ModelProto, helper, version
|
||||
from onnx.backend.base import Backend
|
||||
from onnx.checker import check_model
|
||||
|
||||
from onnxruntime import InferenceSession, SessionOptions, get_available_providers, get_device
|
||||
from onnxruntime.backend.backend_rep import OnnxRuntimeBackendRep
|
||||
|
||||
|
||||
class OnnxRuntimeBackend(Backend):
|
||||
|
@ -28,7 +28,7 @@ class OnnxRuntimeBackend(Backend):
|
|||
Note: This is not the official Python API.
|
||||
""" # noqa: E501
|
||||
|
||||
allowReleasedOpsetsOnly = bool(os.getenv('ALLOW_RELEASED_ONNX_OPSET_ONLY', '1') == '1')
|
||||
allowReleasedOpsetsOnly = bool(os.getenv("ALLOW_RELEASED_ONNX_OPSET_ONLY", "1") == "1")
|
||||
|
||||
@classmethod
|
||||
def is_compatible(cls, model, device=None, **kwargs):
|
||||
|
@ -55,22 +55,26 @@ class OnnxRuntimeBackend(Backend):
|
|||
"""
|
||||
if cls.allowReleasedOpsetsOnly:
|
||||
for opset in model.opset_import:
|
||||
domain = opset.domain if opset.domain else 'ai.onnx'
|
||||
domain = opset.domain if opset.domain else "ai.onnx"
|
||||
try:
|
||||
key = (domain, opset.version)
|
||||
if not (key in helper.OP_SET_ID_VERSION_MAP):
|
||||
error_message = ("Skipping this test as only released onnx opsets are supported."
|
||||
"To run this test set env variable ALLOW_RELEASED_ONNX_OPSET_ONLY to 0."
|
||||
" Got Domain '{0}' version '{1}'.".format(domain, opset.version))
|
||||
error_message = (
|
||||
"Skipping this test as only released onnx opsets are supported."
|
||||
"To run this test set env variable ALLOW_RELEASED_ONNX_OPSET_ONLY to 0."
|
||||
" Got Domain '{0}' version '{1}'.".format(domain, opset.version)
|
||||
)
|
||||
return False, error_message
|
||||
except AttributeError:
|
||||
# for some CI pipelines accessing helper.OP_SET_ID_VERSION_MAP
|
||||
# is generating attribute error. TODO investigate the pipelines to
|
||||
# fix this error. Falling back to a simple version check when this error is encountered
|
||||
if (domain == 'ai.onnx' and opset.version > 12) or (domain == 'ai.ommx.ml' and opset.version > 2):
|
||||
error_message = ("Skipping this test as only released onnx opsets are supported."
|
||||
"To run this test set env variable ALLOW_RELEASED_ONNX_OPSET_ONLY to 0."
|
||||
" Got Domain '{0}' version '{1}'.".format(domain, opset.version))
|
||||
if (domain == "ai.onnx" and opset.version > 12) or (domain == "ai.ommx.ml" and opset.version > 2):
|
||||
error_message = (
|
||||
"Skipping this test as only released onnx opsets are supported."
|
||||
"To run this test set env variable ALLOW_RELEASED_ONNX_OPSET_ONLY to 0."
|
||||
" Got Domain '{0}' version '{1}'.".format(domain, opset.version)
|
||||
)
|
||||
return False, error_message
|
||||
return True, ""
|
||||
|
||||
|
@ -80,8 +84,8 @@ class OnnxRuntimeBackend(Backend):
|
|||
Check whether the backend is compiled with particular device support.
|
||||
In particular it's used in the testing suite.
|
||||
"""
|
||||
if device == 'CUDA':
|
||||
device = 'GPU'
|
||||
if device == "CUDA":
|
||||
device = "GPU"
|
||||
return device in get_device()
|
||||
|
||||
@classmethod
|
||||
|
@ -108,7 +112,7 @@ class OnnxRuntimeBackend(Backend):
|
|||
if hasattr(options, k):
|
||||
setattr(options, k, v)
|
||||
|
||||
excluded_providers = os.getenv('ORT_ONNX_BACKEND_EXCLUDE_PROVIDERS', default="").split(',')
|
||||
excluded_providers = os.getenv("ORT_ONNX_BACKEND_EXCLUDE_PROVIDERS", default="").split(",")
|
||||
providers = [x for x in get_available_providers() if (x not in excluded_providers)]
|
||||
|
||||
inf = InferenceSession(model, sess_options=options, providers=providers)
|
||||
|
@ -156,10 +160,10 @@ class OnnxRuntimeBackend(Backend):
|
|||
|
||||
@classmethod
|
||||
def run_node(cls, node, inputs, device=None, outputs_info=None, **kwargs):
|
||||
'''
|
||||
"""
|
||||
This method is not implemented as it is much more efficient
|
||||
to run a whole model than every node independently.
|
||||
'''
|
||||
"""
|
||||
raise NotImplementedError("It is much more efficient to run a whole model than every node independently.")
|
||||
|
||||
|
||||
|
|
|
@ -5,10 +5,12 @@
|
|||
"""
|
||||
Implements ONNX's backend API.
|
||||
"""
|
||||
from onnxruntime import RunOptions
|
||||
from onnx.backend.base import BackendRep
|
||||
from typing import Any, Tuple
|
||||
|
||||
from onnx.backend.base import BackendRep
|
||||
|
||||
from onnxruntime import RunOptions
|
||||
|
||||
|
||||
class OnnxRuntimeBackendRep(BackendRep):
|
||||
"""
|
||||
|
|
|
@ -2,9 +2,9 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# --------------------------------------------------------------------------
|
||||
import warnings
|
||||
import ctypes
|
||||
import sys
|
||||
import warnings
|
||||
|
||||
|
||||
def find_cudart_versions(build_env=False, build_cuda_version=None):
|
||||
|
@ -16,16 +16,16 @@ def find_cudart_versions(build_env=False, build_cuda_version=None):
|
|||
# for the above reason, we need find all versions in the environment and
|
||||
# only give warnings if the expected cuda version is not found.
|
||||
# in onnxruntime build environment, we expected only one Cuda version.
|
||||
if not sys.platform.startswith('linux'):
|
||||
warnings.warn('find_cudart_versions only works on Linux')
|
||||
if not sys.platform.startswith("linux"):
|
||||
warnings.warn("find_cudart_versions only works on Linux")
|
||||
return None
|
||||
|
||||
cudart_possible_versions = {None, build_cuda_version}
|
||||
|
||||
def get_cudart_version(find_cudart_version=None):
|
||||
cudart_lib_filename = 'libcudart.so'
|
||||
cudart_lib_filename = "libcudart.so"
|
||||
if find_cudart_version:
|
||||
cudart_lib_filename = cudart_lib_filename + '.' + find_cudart_version
|
||||
cudart_lib_filename = cudart_lib_filename + "." + find_cudart_version
|
||||
|
||||
try:
|
||||
cudart = ctypes.CDLL(cudart_lib_filename)
|
||||
|
@ -35,14 +35,13 @@ def find_cudart_versions(build_env=False, build_cuda_version=None):
|
|||
status = cudart.cudaRuntimeGetVersion(ctypes.byref(version))
|
||||
if status != 0:
|
||||
return None
|
||||
except: # noqa
|
||||
except: # noqa
|
||||
return None
|
||||
|
||||
return version.value
|
||||
|
||||
# use set to avoid duplications
|
||||
cudart_found_versions = {
|
||||
get_cudart_version(cudart_version) for cudart_version in cudart_possible_versions}
|
||||
cudart_found_versions = {get_cudart_version(cudart_version) for cudart_version in cudart_possible_versions}
|
||||
|
||||
# convert to list and remove None
|
||||
return [ver for ver in cudart_found_versions if ver]
|
||||
|
@ -50,27 +49,42 @@ def find_cudart_versions(build_env=False, build_cuda_version=None):
|
|||
|
||||
def find_cudnn_supported_cuda_versions(build_env=False):
|
||||
# comments in get_cudart_version apply here
|
||||
if not sys.platform.startswith('linux'):
|
||||
warnings.warn('find_cudnn_versions only works on Linux')
|
||||
if not sys.platform.startswith("linux"):
|
||||
warnings.warn("find_cudnn_versions only works on Linux")
|
||||
|
||||
cudnn_possible_versions = {None}
|
||||
if not build_env:
|
||||
# if not in a build environment, there may be more than one installed cudnn.
|
||||
# https://developer.nvidia.com/rdp/cudnn-archive to include all that may support Cuda 10+.
|
||||
cudnn_possible_versions.update({
|
||||
'8.2',
|
||||
'8.1.1', '8.1.0',
|
||||
'8.0.5', '8.0.4', '8.0.3', '8.0.2', '8.0.1',
|
||||
'7.6.5', '7.6.4', '7.6.3', '7.6.2', '7.6.1', '7.6.0',
|
||||
'7.5.1', '7.5.0',
|
||||
'7.4.2', '7.4.1',
|
||||
'7.3.1', '7.3.0',
|
||||
})
|
||||
cudnn_possible_versions.update(
|
||||
{
|
||||
"8.2",
|
||||
"8.1.1",
|
||||
"8.1.0",
|
||||
"8.0.5",
|
||||
"8.0.4",
|
||||
"8.0.3",
|
||||
"8.0.2",
|
||||
"8.0.1",
|
||||
"7.6.5",
|
||||
"7.6.4",
|
||||
"7.6.3",
|
||||
"7.6.2",
|
||||
"7.6.1",
|
||||
"7.6.0",
|
||||
"7.5.1",
|
||||
"7.5.0",
|
||||
"7.4.2",
|
||||
"7.4.1",
|
||||
"7.3.1",
|
||||
"7.3.0",
|
||||
}
|
||||
)
|
||||
|
||||
def get_cudnn_supported_cuda_version(find_cudnn_version=None):
|
||||
cudnn_lib_filename = 'libcudnn.so'
|
||||
cudnn_lib_filename = "libcudnn.so"
|
||||
if find_cudnn_version:
|
||||
cudnn_lib_filename = cudnn_lib_filename + '.' + find_cudnn_version
|
||||
cudnn_lib_filename = cudnn_lib_filename + "." + find_cudnn_version
|
||||
|
||||
# in cudnn.h cudnn version are calculated as:
|
||||
# #define CUDNN_VERSION (CUDNN_MAJOR * 1000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL)
|
||||
|
@ -79,7 +93,7 @@ def find_cudnn_supported_cuda_versions(build_env=False):
|
|||
# cudnn_ver = cudnn.cudnnGetVersion()
|
||||
cuda_ver = cudnn.cudnnGetCudartVersion()
|
||||
return cuda_ver
|
||||
except: # noqa
|
||||
except: # noqa
|
||||
return None
|
||||
|
||||
# use set to avoid duplications
|
||||
|
|
|
@ -12,14 +12,14 @@ from onnxruntime.capi import _pybind_state as C
|
|||
|
||||
def get_ort_device_type(device):
device_type = device if type(device) is str else device.type.lower()
if device_type == 'cuda':
if device_type == "cuda":
return C.OrtDevice.cuda()
elif device_type == 'cpu':
elif device_type == "cpu":
return C.OrtDevice.cpu()
elif device_type == 'ort':
elif device_type == "ort":
return C.get_ort_device(device.index).device_type()
else:
raise Exception('Unsupported device type: ' + device_type)
raise Exception("Unsupported device type: " + device_type)

def check_and_normalize_provider_args(providers, provider_options, available_provider_names):
|
||||
|
@ -52,8 +52,10 @@ def check_and_normalize_provider_args(providers, provider_options, available_pro
|
|||
|
||||
def set_provider_options(name, options):
|
||||
if name not in available_provider_names:
|
||||
warnings.warn("Specified provider '{}' is not in available provider names."
|
||||
"Available providers: '{}'".format(name, ", ".join(available_provider_names)))
|
||||
warnings.warn(
|
||||
"Specified provider '{}' is not in available provider names."
|
||||
"Available providers: '{}'".format(name, ", ".join(available_provider_names))
|
||||
)
|
||||
|
||||
if name in provider_name_to_options:
|
||||
warnings.warn("Duplicate provider '{}' encountered, ignoring.".format(name))
|
||||
|
@ -85,8 +87,12 @@ def check_and_normalize_provider_args(providers, provider_options, available_pro
|
|||
for provider in providers:
|
||||
if isinstance(provider, str):
|
||||
set_provider_options(provider, dict())
|
||||
elif isinstance(provider, tuple) and len(provider) == 2 and \
|
||||
isinstance(provider[0], str) and isinstance(provider[1], dict):
|
||||
elif (
|
||||
isinstance(provider, tuple)
|
||||
and len(provider) == 2
|
||||
and isinstance(provider[0], str)
|
||||
and isinstance(provider[1], dict)
|
||||
):
|
||||
set_provider_options(provider[0], provider[1])
|
||||
else:
|
||||
raise ValueError("'providers' values must be either strings or (string, dict) tuples.")
|
||||
|
@ -98,6 +104,7 @@ class Session:
|
|||
"""
|
||||
This is the main class used to run a model.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
|
||||
# self._sess is managed by the derived class and relies on bindings from C.InferenceSession
|
||||
|
@ -216,6 +223,7 @@ class Session:
|
|||
|
||||
sess.run([output_name], {input_name: x})
|
||||
"""
|
||||
|
||||
def invoke(sess, output_names, input_dict_ort_values, run_options):
|
||||
input_dict = {}
|
||||
for n, v in input_dict_ort_values.items():
|
||||
|
@ -268,10 +276,10 @@ class Session:
|
|||
|
||||
def run_with_iobinding(self, iobinding, run_options=None):
|
||||
"""
|
||||
Compute the predictions.
|
||||
Compute the predictions.
|
||||
|
||||
:param iobinding: the iobinding object that has graph inputs/outputs bind.
|
||||
:param run_options: See :class:`onnxruntime.RunOptions`.
|
||||
:param iobinding: the iobinding object that has graph inputs/outputs bind.
|
||||
:param run_options: See :class:`onnxruntime.RunOptions`.
|
||||
"""
|
||||
self._sess.run_with_iobinding(iobinding._iobinding, run_options)
|
||||
|
||||
|
@ -280,6 +288,7 @@ class InferenceSession(Session):
|
|||
"""
|
||||
This is the main class used to run a model.
|
||||
"""
|
||||
|
||||
def __init__(self, path_or_bytes, sess_options=None, providers=None, provider_options=None, **kwargs):
|
||||
"""
|
||||
:param path_or_bytes: filename or serialized ONNX or ORT format model in a byte string
|
||||
|
@ -326,10 +335,10 @@ class InferenceSession(Session):
|
|||
self._sess_options = sess_options
|
||||
self._sess_options_initial = sess_options
|
||||
self._enable_fallback = True
|
||||
self._read_config_from_model = os.environ.get('ORT_LOAD_CONFIG_FROM_MODEL') == '1'
|
||||
self._read_config_from_model = os.environ.get("ORT_LOAD_CONFIG_FROM_MODEL") == "1"
|
||||
|
||||
# internal parameters that we don't expect to be used in general so aren't documented
|
||||
disabled_optimizers = kwargs['disabled_optimizers'] if 'disabled_optimizers' in kwargs else None
|
||||
disabled_optimizers = kwargs["disabled_optimizers"] if "disabled_optimizers" in kwargs else None
|
||||
|
||||
try:
|
||||
self._create_inference_session(providers, provider_options, disabled_optimizers)
|
||||
|
@ -347,23 +356,25 @@ class InferenceSession(Session):
|
|||
available_providers = C.get_available_providers()
|
||||
|
||||
# Tensorrt can fall back to CUDA. All others fall back to CPU.
|
||||
if 'TensorrtExecutionProvider' in available_providers:
|
||||
self._fallback_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
|
||||
elif 'MIGraphXExecutionProvider' in available_providers:
|
||||
self._fallback_providers = ['ROCMExecutionProvider', 'CPUExecutionProvider']
|
||||
if "TensorrtExecutionProvider" in available_providers:
|
||||
self._fallback_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
|
||||
elif "MIGraphXExecutionProvider" in available_providers:
|
||||
self._fallback_providers = ["ROCMExecutionProvider", "CPUExecutionProvider"]
|
||||
else:
|
||||
self._fallback_providers = ['CPUExecutionProvider']
|
||||
self._fallback_providers = ["CPUExecutionProvider"]
|
||||
|
||||
# validate providers and provider_options before other initialization
|
||||
providers, provider_options = check_and_normalize_provider_args(providers,
|
||||
provider_options,
|
||||
available_providers)
|
||||
providers, provider_options = check_and_normalize_provider_args(
|
||||
providers, provider_options, available_providers
|
||||
)
|
||||
if providers == [] and len(available_providers) > 1:
|
||||
self.disable_fallback()
|
||||
raise ValueError("This ORT build has {} enabled. ".format(available_providers) +
|
||||
"Since ORT 1.9, you are required to explicitly set " +
|
||||
"the providers parameter when instantiating InferenceSession. For example, "
|
||||
"onnxruntime.InferenceSession(..., providers={}, ...)".format(available_providers))
|
||||
raise ValueError(
|
||||
"This ORT build has {} enabled. ".format(available_providers)
|
||||
+ "Since ORT 1.9, you are required to explicitly set "
|
||||
+ "the providers parameter when instantiating InferenceSession. For example, "
|
||||
"onnxruntime.InferenceSession(..., providers={}, ...)".format(available_providers)
|
||||
)
|
||||
|
||||
session_options = self._sess_options if self._sess_options else C.get_default_session_options()
|
||||
if self._model_path:
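As the error message above states, ORT 1.9+ requires an explicit providers argument. A minimal usage sketch (the model path and the CUDA option value are placeholders, not taken from this change):

import onnxruntime as ort

# Providers may be plain names or (name, options) tuples, which is exactly the
# shape check_and_normalize_provider_args accepts.
sess = ort.InferenceSession(
    "model.onnx",
    providers=[
        ("CUDAExecutionProvider", {"device_id": 0}),
        "CPUExecutionProvider",
    ],
)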
|
||||
|
@ -410,19 +421,20 @@ class InferenceSession(Session):
|
|||
|
||||
|
||||
class IOBinding:
|
||||
'''
|
||||
"""
|
||||
This class provides API to bind input/output to a specified device, e.g. GPU.
|
||||
'''
|
||||
"""
|
||||
|
||||
def __init__(self, session):
|
||||
self._iobinding = C.SessionIOBinding(session._sess)
|
||||
self._numpy_obj_references = {}
|
||||
|
||||
def bind_cpu_input(self, name, arr_on_cpu):
|
||||
'''
|
||||
"""
|
||||
bind an input to array on CPU
|
||||
:param name: input name
|
||||
:param arr_on_cpu: input values as a python array on CPU
|
||||
'''
|
||||
"""
|
||||
# Hold a reference to the numpy object as the bound OrtValue is backed
|
||||
# directly by the data buffer of the numpy object and so the numpy object
|
||||
# must be around until this IOBinding instance is around
|
||||
|
@ -430,38 +442,53 @@ class IOBinding:
|
|||
self._iobinding.bind_input(name, arr_on_cpu)
|
||||
|
||||
def bind_input(self, name, device_type, device_id, element_type, shape, buffer_ptr):
|
||||
'''
|
||||
"""
|
||||
:param name: input name
|
||||
:param device_type: e.g. cpu, cuda
|
||||
:param device_id: device id, e.g. 0
|
||||
:param element_type: input element type
|
||||
:param shape: input shape
|
||||
:param buffer_ptr: memory pointer to input data
|
||||
'''
|
||||
self._iobinding.bind_input(name,
|
||||
C.OrtDevice(get_ort_device_type(device_type), C.OrtDevice.default_memory(),
|
||||
device_id),
|
||||
element_type, shape, buffer_ptr)
|
||||
"""
|
||||
self._iobinding.bind_input(
|
||||
name,
|
||||
C.OrtDevice(
|
||||
get_ort_device_type(device_type),
|
||||
C.OrtDevice.default_memory(),
|
||||
device_id,
|
||||
),
|
||||
element_type,
|
||||
shape,
|
||||
buffer_ptr,
|
||||
)
|
||||
|
||||
def bind_ortvalue_input(self, name, ortvalue):
|
||||
'''
|
||||
"""
|
||||
:param name: input name
|
||||
:param ortvalue: OrtValue instance to bind
|
||||
'''
|
||||
"""
|
||||
self._iobinding.bind_ortvalue_input(name, ortvalue._ortvalue)
|
||||
|
||||
def synchronize_inputs(self):
|
||||
self._iobinding.synchronize_inputs()
|
||||
|
||||
def bind_output(self, name, device_type='cpu', device_id=0, element_type=None, shape=None, buffer_ptr=None):
|
||||
'''
|
||||
def bind_output(
|
||||
self,
|
||||
name,
|
||||
device_type="cpu",
|
||||
device_id=0,
|
||||
element_type=None,
|
||||
shape=None,
|
||||
buffer_ptr=None,
|
||||
):
|
||||
"""
|
||||
:param name: output name
|
||||
:param device_type: e.g. cpu, cuda, cpu by default
|
||||
:param device_id: device id, e.g. 0
|
||||
:param element_type: output element type
|
||||
:param shape: output shape
|
||||
:param buffer_ptr: memory pointer to output data
|
||||
'''
|
||||
"""
|
||||
|
||||
# Follow the `if` path when the user has not provided any pre-allocated buffer but still
|
||||
# would like to bind an output to a specific device (e.g. cuda).
|
||||
|
@ -470,32 +497,44 @@ class IOBinding:
|
|||
# in which case ORT will allocate the memory for the user
|
||||
# (2) The output has a dynamic shape and hence the size of the buffer may not be fixed across runs
|
||||
if buffer_ptr is None:
|
||||
self._iobinding.bind_output(name,
|
||||
C.OrtDevice(get_ort_device_type(device_type), C.OrtDevice.default_memory(),
|
||||
device_id))
|
||||
self._iobinding.bind_output(
|
||||
name,
|
||||
C.OrtDevice(
|
||||
get_ort_device_type(device_type),
|
||||
C.OrtDevice.default_memory(),
|
||||
device_id,
|
||||
),
|
||||
)
|
||||
else:
|
||||
if element_type is None or shape is None:
|
||||
raise ValueError("`element_type` and `shape` are to be provided if pre-allocated memory is provided")
|
||||
self._iobinding.bind_output(name,
|
||||
C.OrtDevice(get_ort_device_type(device_type), C.OrtDevice.default_memory(),
|
||||
device_id),
|
||||
element_type, shape, buffer_ptr)
|
||||
self._iobinding.bind_output(
|
||||
name,
|
||||
C.OrtDevice(
|
||||
get_ort_device_type(device_type),
|
||||
C.OrtDevice.default_memory(),
|
||||
device_id,
|
||||
),
|
||||
element_type,
|
||||
shape,
|
||||
buffer_ptr,
|
||||
)
|
||||
|
||||
def bind_ortvalue_output(self, name, ortvalue):
|
||||
'''
|
||||
"""
|
||||
:param name: output name
|
||||
:param ortvalue: OrtValue instance to bind
|
||||
'''
|
||||
"""
|
||||
self._iobinding.bind_ortvalue_output(name, ortvalue._ortvalue)
|
||||
|
||||
def synchronize_outputs(self):
|
||||
self._iobinding.synchronize_outputs()
|
||||
|
||||
def get_outputs(self):
|
||||
'''
|
||||
"""
|
||||
Returns the output OrtValues from the Run() that preceded the call.
|
||||
The data buffer of the obtained OrtValues may not reside on CPU memory
|
||||
'''
|
||||
"""
|
||||
returned_ortvalues = []
|
||||
|
||||
for ortvalue in self._iobinding.get_outputs():
|
||||
|
@ -504,7 +543,7 @@ class IOBinding:
|
|||
return returned_ortvalues
|
||||
|
||||
def copy_outputs_to_cpu(self):
|
||||
'''Copy output contents to CPU (if on another device). No-op if already on the CPU.'''
|
||||
"""Copy output contents to CPU (if on another device). No-op if already on the CPU."""
|
||||
return self._iobinding.copy_outputs_to_cpu()
|
||||
|
||||
def clear_binding_inputs(self):
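A short usage sketch of the IOBinding API reformatted above (model path, input/output names and the input shape are hypothetical):

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])
io_binding = sess.io_binding()

x = np.zeros((1, 3), dtype=np.float32)
io_binding.bind_cpu_input("input", x)        # input stays in the numpy buffer on CPU
io_binding.bind_output("output")             # no buffer_ptr given, so ORT allocates the output
sess.run_with_iobinding(io_binding)
outputs = io_binding.copy_outputs_to_cpu()   # list of numpy arrays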
|
||||
|
@ -515,11 +554,12 @@ class IOBinding:
|
|||
|
||||
|
||||
class OrtValue:
|
||||
'''
|
||||
"""
|
||||
A data structure that supports all ONNX data formats (tensors and non-tensors) that allows users
|
||||
to place the data backing these on a device, for example, on a CUDA supported device.
|
||||
This class provides APIs to construct and deal with OrtValues.
|
||||
'''
|
||||
"""
|
||||
|
||||
def __init__(self, ortvalue, numpy_obj=None):
|
||||
if isinstance(ortvalue, C.OrtValue):
|
||||
self._ortvalue = ortvalue
|
||||
|
@ -528,157 +568,183 @@ class OrtValue:
|
|||
self._numpy_obj = numpy_obj
|
||||
else:
|
||||
# An end user won't hit this error
|
||||
raise ValueError("`Provided ortvalue` needs to be of type " +
|
||||
"`onnxruntime.capi.onnxruntime_pybind11_state.OrtValue`")
|
||||
raise ValueError(
|
||||
"`Provided ortvalue` needs to be of type " + "`onnxruntime.capi.onnxruntime_pybind11_state.OrtValue`"
|
||||
)
|
||||
|
||||
def _get_c_value(self):
|
||||
return self._ortvalue
|
||||
|
||||
@staticmethod
|
||||
def ortvalue_from_numpy(numpy_obj, device_type='cpu', device_id=0):
|
||||
'''
|
||||
def ortvalue_from_numpy(numpy_obj, device_type="cpu", device_id=0):
|
||||
"""
|
||||
Factory method to construct an OrtValue (which holds a Tensor) from a given Numpy object
|
||||
A copy of the data in the Numpy object is held by the OrtValue only if the device is NOT cpu
|
||||
|
||||
:param numpy_obj: The Numpy object to construct the OrtValue from
|
||||
:param device_type: e.g. cpu, cuda, cpu by default
|
||||
:param device_id: device id, e.g. 0
|
||||
'''
|
||||
"""
|
||||
# Hold a reference to the numpy object (if device_type is 'cpu') as the OrtValue
|
||||
# is backed directly by the data buffer of the numpy object and so the numpy object
|
||||
# must be around until this OrtValue instance is around
|
||||
return OrtValue(C.OrtValue.ortvalue_from_numpy(numpy_obj, C.OrtDevice(get_ort_device_type(device_type),
|
||||
C.OrtDevice.default_memory(), device_id)), numpy_obj if device_type.lower() == 'cpu' else None)
|
||||
return OrtValue(
|
||||
C.OrtValue.ortvalue_from_numpy(
|
||||
numpy_obj,
|
||||
C.OrtDevice(
|
||||
get_ort_device_type(device_type),
|
||||
C.OrtDevice.default_memory(),
|
||||
device_id,
|
||||
),
|
||||
),
|
||||
numpy_obj if device_type.lower() == "cpu" else None,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def ortvalue_from_shape_and_type(shape=None, element_type=None, device_type='cpu', device_id=0):
|
||||
'''
|
||||
def ortvalue_from_shape_and_type(shape=None, element_type=None, device_type="cpu", device_id=0):
|
||||
"""
|
||||
Factory method to construct an OrtValue (which holds a Tensor) from given shape and element_type
|
||||
|
||||
:param shape: List of integers indicating the shape of the OrtValue
|
||||
:param element_type: The data type of the elements in the OrtValue (numpy type)
|
||||
:param device_type: e.g. cpu, cuda, cpu by default
|
||||
:param device_id: device id, e.g. 0
|
||||
'''
|
||||
"""
|
||||
if shape is None or element_type is None:
|
||||
raise ValueError("`element_type` and `shape` are to be provided if pre-allocated memory is provided")
|
||||
|
||||
return OrtValue(C.OrtValue.ortvalue_from_shape_and_type(shape, element_type,
|
||||
C.OrtDevice(get_ort_device_type(device_type), C.OrtDevice.default_memory(), device_id)))
|
||||
return OrtValue(
|
||||
C.OrtValue.ortvalue_from_shape_and_type(
|
||||
shape,
|
||||
element_type,
|
||||
C.OrtDevice(
|
||||
get_ort_device_type(device_type),
|
||||
C.OrtDevice.default_memory(),
|
||||
device_id,
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def ort_value_from_sparse_tensor(sparse_tensor):
|
||||
'''
|
||||
"""
|
||||
The function will construct an OrtValue instance from a valid SparseTensor
|
||||
The new instance of OrtValue will assume the ownership of sparse_tensor
|
||||
'''
|
||||
"""
|
||||
return OrtValue(C.OrtValue.ort_value_from_sparse_tensor(sparse_tensor._get_c_tensor()))
|
||||
|
||||
def as_sparse_tensor(self):
|
||||
'''
|
||||
"""
|
||||
The function will return SparseTensor contained in this OrtValue
|
||||
'''
|
||||
"""
|
||||
return SparseTensor(self._ortvalue.as_sparse_tensor())
|
||||
|
||||
def data_ptr(self):
|
||||
'''
|
||||
"""
|
||||
Returns the address of the first element in the OrtValue's data buffer
|
||||
'''
|
||||
"""
|
||||
return self._ortvalue.data_ptr()
|
||||
|
||||
def device_name(self):
|
||||
'''
|
||||
"""
|
||||
Returns the name of the device where the OrtValue's data buffer resides e.g. cpu, cuda
|
||||
'''
|
||||
"""
|
||||
return self._ortvalue.device_name().lower()
|
||||
|
||||
def shape(self):
|
||||
'''
|
||||
"""
|
||||
Returns the shape of the data in the OrtValue
|
||||
'''
|
||||
"""
|
||||
return self._ortvalue.shape()
|
||||
|
||||
def data_type(self):
|
||||
'''
|
||||
"""
|
||||
Returns the data type of the data in the OrtValue
|
||||
'''
|
||||
"""
|
||||
return self._ortvalue.data_type()
|
||||
|
||||
def element_type(self):
|
||||
'''
|
||||
"""
|
||||
Returns the proto type of the data in the OrtValue
|
||||
if the OrtValue is a tensor.
|
||||
'''
|
||||
"""
|
||||
return self._ortvalue.element_type()
|
||||
|
||||
def has_value(self):
|
||||
'''
|
||||
"""
|
||||
Returns True if the OrtValue corresponding to an
|
||||
optional type contains data, else returns False
|
||||
'''
|
||||
"""
|
||||
return self._ortvalue.has_value()
|
||||
|
||||
def is_tensor(self):
|
||||
'''
|
||||
"""
|
||||
Returns True if the OrtValue contains a Tensor, else returns False
|
||||
'''
|
||||
"""
|
||||
return self._ortvalue.is_tensor()
|
||||
|
||||
def is_sparse_tensor(self):
|
||||
'''
|
||||
"""
|
||||
Returns True if the OrtValue contains a SparseTensor, else returns False
|
||||
'''
|
||||
"""
|
||||
return self._ortvalue.is_sparse_tensor()
|
||||
|
||||
def is_tensor_sequence(self):
|
||||
'''
|
||||
"""
|
||||
Returns True if the OrtValue contains a Tensor Sequence, else returns False
|
||||
'''
|
||||
"""
|
||||
return self._ortvalue.is_tensor_sequence()
|
||||
|
||||
def numpy(self):
|
||||
'''
|
||||
"""
|
||||
Returns a Numpy object from the OrtValue.
|
||||
Valid only for OrtValues holding Tensors. Throws for OrtValues holding non-Tensors.
|
||||
Use accessors to gain a reference to non-Tensor objects such as SparseTensor
|
||||
'''
|
||||
"""
|
||||
return self._ortvalue.numpy()
|
||||
|
||||
def update_inplace(self, np_arr):
|
||||
'''
|
||||
"""
|
||||
Update the OrtValue in place with a new Numpy array. The numpy contents
|
||||
are copied over to the device memory backing the OrtValue. It can be used
|
||||
to update the input values for an InferenceSession with CUDA graph
|
||||
enabled or other scenarios where the OrtValue needs to be updated while
|
||||
the memory address can not be changed.
|
||||
'''
|
||||
"""
|
||||
self._ortvalue.update_inplace(np_arr)
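A brief sketch of the OrtValue factory methods touched above (the array contents are illustrative only):

import numpy as np
import onnxruntime as ort

x = np.arange(6, dtype=np.float32).reshape(2, 3)
# On "cpu" the OrtValue is backed directly by the numpy buffer; passing
# device_type="cuda" would instead copy the data onto the GPU.
val = ort.OrtValue.ortvalue_from_numpy(x, device_type="cpu", device_id=0)
print(val.device_name(), val.shape(), val.is_tensor())   # cpu [2, 3] True
y = val.numpy()                                           # back to a numpy array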
|
||||
|
||||
|
||||
class OrtDevice:
|
||||
'''
|
||||
"""
|
||||
A data structure that exposes the underlying C++ OrtDevice
|
||||
'''
|
||||
"""
|
||||
|
||||
def __init__(self, c_ort_device):
|
||||
'''
|
||||
"""
|
||||
Internal constructor
|
||||
'''
|
||||
"""
|
||||
if isinstance(c_ort_device, C.OrtDevice):
|
||||
self._ort_device = c_ort_device
|
||||
else:
|
||||
raise ValueError("`Provided object` needs to be of type " +
|
||||
"`onnxruntime.capi.onnxruntime_pybind11_state.OrtDevice`")
|
||||
raise ValueError(
|
||||
"`Provided object` needs to be of type " + "`onnxruntime.capi.onnxruntime_pybind11_state.OrtDevice`"
|
||||
)
|
||||
|
||||
def _get_c_device(self):
|
||||
'''
|
||||
"""
|
||||
Internal accessor to underlying object
|
||||
'''
|
||||
"""
|
||||
return self._ort_device
|
||||
|
||||
@staticmethod
|
||||
def make(ort_device_name, device_id):
|
||||
return OrtDevice(C.OrtDevice(get_ort_device_type(ort_device_name),
|
||||
C.OrtDevice.default_memory(), device_id))
|
||||
return OrtDevice(
|
||||
C.OrtDevice(
|
||||
get_ort_device_type(ort_device_name),
|
||||
C.OrtDevice.default_memory(),
|
||||
device_id,
|
||||
)
|
||||
)
|
||||
|
||||
def device_id(self):
|
||||
return self._ort_device.device_id()
|
||||
|
@ -688,29 +754,31 @@ class OrtDevice:
|
|||
|
||||
|
||||
class SparseTensor:
|
||||
'''
|
||||
"""
|
||||
A data structure that projects the C++ SparseTensor object
|
||||
The class provides an API to work with the object.
|
||||
Depending on the format, the class may hold more than one buffer.
|
||||
|
||||
'''
|
||||
"""
|
||||
|
||||
def __init__(self, sparse_tensor):
|
||||
'''
|
||||
"""
|
||||
Internal constructor
|
||||
'''
|
||||
"""
|
||||
if isinstance(sparse_tensor, C.SparseTensor):
|
||||
self._tensor = sparse_tensor
|
||||
else:
|
||||
# An end user won't hit this error
|
||||
raise ValueError("`Provided object` needs to be of type " +
|
||||
"`onnxruntime.capi.onnxruntime_pybind11_state.SparseTensor`")
|
||||
raise ValueError(
|
||||
"`Provided object` needs to be of type " + "`onnxruntime.capi.onnxruntime_pybind11_state.SparseTensor`"
|
||||
)
|
||||
|
||||
def _get_c_tensor(self):
|
||||
return self._tensor
|
||||
|
||||
@staticmethod
|
||||
def sparse_coo_from_numpy(dense_shape, values, coo_indices, ort_device):
|
||||
'''
|
||||
"""
|
||||
Factory method to construct a SparseTensor in COO format from given arguments
|
||||
|
||||
:param dense_shape: 1-D numpy array(int64) or a python list that contains a dense_shape of the sparse tensor
|
||||
|
@ -729,13 +797,14 @@ class SparseTensor:
|
|||
on GC. The buffers may reside in any storage either CPU or GPU.
|
||||
For strings and objects, it will create a copy of the arrays in CPU memory as ORT does not support those
|
||||
on other devices and their memory can not be mapped.
|
||||
'''
|
||||
return SparseTensor(C.SparseTensor.sparse_coo_from_numpy(dense_shape, values, coo_indices,
|
||||
ort_device._get_c_device()))
|
||||
"""
|
||||
return SparseTensor(
|
||||
C.SparseTensor.sparse_coo_from_numpy(dense_shape, values, coo_indices, ort_device._get_c_device())
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def sparse_csr_from_numpy(dense_shape, values, inner_indices, outer_indices, ort_device):
|
||||
'''
|
||||
"""
|
||||
Factory method to construct a SparseTensor in CSR format from given arguments
|
||||
|
||||
:param dense_shape: 1-D numpy array(int64) or a python list that contains a dense_shape of the
|
||||
|
@ -754,20 +823,27 @@ class SparseTensor:
|
|||
The buffers may reside in any storage either CPU or GPU.
|
||||
For strings and objects, it will create a copy of the arrays in CPU memory as ORT does not support those
|
||||
on other devices and their memory can not be mapped.
|
||||
'''
|
||||
return SparseTensor(C.SparseTensor.sparse_csr_from_numpy(dense_shape, values, inner_indices, outer_indices,
|
||||
ort_device._get_c_device()))
|
||||
"""
|
||||
return SparseTensor(
|
||||
C.SparseTensor.sparse_csr_from_numpy(
|
||||
dense_shape,
|
||||
values,
|
||||
inner_indices,
|
||||
outer_indices,
|
||||
ort_device._get_c_device(),
|
||||
)
|
||||
)
|
||||
|
||||
def values(self):
|
||||
'''
|
||||
"""
|
||||
The method returns a numpy array that is backed by the native memory
|
||||
if the data type is numeric. Otherwise, the returned numpy array that contains
|
||||
copies of the strings.
|
||||
'''
|
||||
"""
|
||||
return self._tensor.values()
|
||||
|
||||
def as_coo_view(self):
|
||||
'''
|
||||
"""
|
||||
The method will return coo representation of the sparse tensor which will enable
|
||||
querying COO indices. If the instance did not contain COO format, it would throw.
|
||||
You can query coo indices as:
|
||||
|
@ -777,11 +853,11 @@ class SparseTensor:
|
|||
coo_indices = sparse_tensor.as_coo_view().indices()
|
||||
|
||||
which will return a numpy array that is backed by the native memory.
|
||||
'''
|
||||
"""
|
||||
return self._tensor.get_coo_data()
|
||||
|
||||
def as_csrc_view(self):
|
||||
'''
|
||||
"""
|
||||
The method will return CSR(C) representation of the sparse tensor which will enable
|
||||
querying CSR(C) indices. If the instance did not contain CSR(C) format, it would throw.
|
||||
You can query indices as:
|
||||
|
@ -792,11 +868,11 @@ class SparseTensor:
|
|||
outer_indices = sparse_tensor.as_csrc_view().outer()
|
||||
|
||||
returning numpy arrays backed by the native memory.
|
||||
'''
|
||||
"""
|
||||
return self._tensor.get_csrc_data()
|
||||
|
||||
def as_blocksparse_view(self):
|
||||
'''
|
||||
"""
|
||||
The method will return the block sparse representation of the sparse tensor which will enable
|
||||
querying BlockSparse indices. If the instance did not contain BlockSparse format, it would throw.
|
||||
You can query block sparse indices as:
|
||||
|
@ -806,11 +882,11 @@ class SparseTensor:
|
|||
block_sparse_indices = sparse_tensor.as_blocksparse_view().indices()
|
||||
|
||||
which will return a numpy array that is backed by the native memory
|
||||
'''
|
||||
"""
|
||||
return self._tensor.get_blocksparse_data()
|
||||
|
||||
def to_cuda(self, ort_device):
|
||||
'''
|
||||
"""
|
||||
Returns a copy of this instance on the specified cuda device
|
||||
|
||||
:param ort_device: with name 'cuda' and valid gpu device id
|
||||
|
@ -821,29 +897,29 @@ class SparseTensor:
|
|||
- this instance is already on GPU. Cross GPU copy is not supported
|
||||
- CUDA is not present in this build
|
||||
- if the specified device is not valid
|
||||
'''
|
||||
"""
|
||||
return SparseTensor(self._tensor.to_cuda(ort_device._get_c_device()))
|
||||
|
||||
def format(self):
|
||||
'''
|
||||
"""
|
||||
Returns an OrtSparseFormat enumeration
|
||||
'''
|
||||
"""
|
||||
return self._tensor.format
|
||||
|
||||
def dense_shape(self):
|
||||
'''
|
||||
"""
|
||||
Returns a numpy array(int64) containing a dense shape of a sparse tensor
|
||||
'''
|
||||
"""
|
||||
return self._tensor.dense_shape()
|
||||
|
||||
def data_type(self):
|
||||
'''
|
||||
"""
|
||||
Returns a string data type of the data in the OrtValue
|
||||
'''
|
||||
"""
|
||||
return self._tensor.data_type()
|
||||
|
||||
def device_name(self):
|
||||
'''
|
||||
"""
|
||||
Returns the name of the device where the SparseTensor data buffers reside e.g. cpu, cuda
|
||||
'''
|
||||
"""
|
||||
return self._tensor.device_name().lower()
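A sketch of the SparseTensor construction path reformatted above, assuming OrtDevice, OrtValue and SparseTensor are re-exported at the package top level as in recent releases; the 3x3 COO data is made up:

import numpy as np
import onnxruntime as ort

dense_shape = np.array([3, 3], dtype=np.int64)
values = np.array([1.0, 2.0], dtype=np.float32)
coo_indices = np.array([0, 4], dtype=np.int64)   # flattened indices of the non-zeros

cpu_device = ort.OrtDevice.make("cpu", 0)
sparse = ort.SparseTensor.sparse_coo_from_numpy(dense_shape, values, coo_indices, cpu_device)
wrapped = ort.OrtValue.ort_value_from_sparse_tensor(sparse)
print(sparse.device_name(), sparse.dense_shape())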
|
||||
|
|
|
@ -5,34 +5,36 @@
|
|||
"""
|
||||
Check OS requirements for ONNX Runtime Python Bindings.
|
||||
"""
|
||||
import platform
|
||||
import linecache
|
||||
import platform
|
||||
import warnings
|
||||
|
||||
|
||||
def check_distro_info():
|
||||
__my_distro__ = ''
|
||||
__my_distro_ver__ = ''
|
||||
__my_distro__ = ""
|
||||
__my_distro_ver__ = ""
|
||||
__my_system__ = platform.system().lower()
|
||||
|
||||
__OS_RELEASE_FILE__ = '/etc/os-release'
|
||||
__LSB_RELEASE_FILE__ = '/etc/lsb-release'
|
||||
__OS_RELEASE_FILE__ = "/etc/os-release"
|
||||
__LSB_RELEASE_FILE__ = "/etc/lsb-release"
|
||||
|
||||
if __my_system__ == 'windows':
|
||||
if __my_system__ == "windows":
|
||||
__my_distro__ = __my_system__
|
||||
__my_distro_ver__ = platform.release().lower()
|
||||
|
||||
if __my_distro_ver__ != '10':
|
||||
warnings.warn('Unsupported Windows version (%s). ONNX Runtime supports Windows 10 and above, only.' %
|
||||
__my_distro_ver__)
|
||||
elif __my_system__ == 'linux':
|
||||
''' Although the 'platform' python module for getting Distro information works well on standard OS images
|
||||
if __my_distro_ver__ != "10":
|
||||
warnings.warn(
|
||||
"Unsupported Windows version (%s). ONNX Runtime supports Windows 10 and above, only."
|
||||
% __my_distro_ver__
|
||||
)
|
||||
elif __my_system__ == "linux":
|
||||
"""Although the 'platform' python module for getting Distro information works well on standard OS images
|
||||
running on real hardware, it is not accurate when running on Azure VMs, Git Bash, Cygwin, etc.
|
||||
The returned values for release and version are unpredictable for virtualized or emulated environments.
|
||||
/etc/os-release and /etc/lsb_release files, on the other hand, are guaranteed to exist and have standard values
|
||||
in all OSes supported by onnxruntime. The former is the current standard file to check OS info and the latter
|
||||
is its predecessor.
|
||||
'''
|
||||
"""
|
||||
# Newer systems have /etc/os-release with relevant distro info
|
||||
__my_distro__ = linecache.getline(__OS_RELEASE_FILE__, 3)[3:-1]
|
||||
__my_distro_ver__ = linecache.getline(__OS_RELEASE_FILE__, 6)[12:-2]
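The hard-coded offsets above assume a fixed /etc/os-release layout; purely as an illustration (the exact field order differs between distros), with a file whose third line is ID=ubuntu and sixth line is VERSION_ID="20.04":

line3 = 'ID=ubuntu\n'             # what linecache.getline(..., 3) might return
line6 = 'VERSION_ID="20.04"\n'    # what linecache.getline(..., 6) might return
print(line3[3:-1])                # ubuntu
print(line6[12:-2])               # 20.04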
|
||||
|
@ -46,16 +48,18 @@ def check_distro_info():
|
|||
# warn the user ONNX Runtime may not work out of the box
|
||||
__my_distro__ = __my_distro__.lower()
|
||||
__my_distro_ver__ = __my_distro_ver__.lower()
|
||||
elif __my_system__ == 'darwin':
|
||||
elif __my_system__ == "darwin":
|
||||
__my_distro__ = __my_system__
|
||||
__my_distro_ver__ = platform.release().lower()
|
||||
|
||||
if int(__my_distro_ver__.split('.')[0]) < 11:
|
||||
warnings.warn('Unsupported macOS version (%s). ONNX Runtime supports macOS 11.0 or later.' %
|
||||
(__my_distro_ver__))
|
||||
if int(__my_distro_ver__.split(".")[0]) < 11:
|
||||
warnings.warn(
|
||||
"Unsupported macOS version (%s). ONNX Runtime supports macOS 11.0 or later." % (__my_distro_ver__)
|
||||
)
|
||||
else:
|
||||
warnings.warn('Unsupported platform (%s). ONNX Runtime supports Linux, macOS and Windows platforms, only.' %
|
||||
__my_system__)
|
||||
warnings.warn(
|
||||
"Unsupported platform (%s). ONNX Runtime supports Linux, macOS and Windows platforms, only." % __my_system__
|
||||
)
|
||||
|
||||
|
||||
def validate_build_package_info():
|
||||
|
@ -63,7 +67,8 @@ def validate_build_package_info():
|
|||
|
||||
has_ortmodule = False
|
||||
try:
|
||||
from onnxruntime.training.ortmodule import ORTModule # noqa
|
||||
from onnxruntime.training.ortmodule import ORTModule # noqa
|
||||
|
||||
has_ortmodule = True
|
||||
except ImportError:
|
||||
# ORTModule not present
|
||||
|
@ -74,6 +79,7 @@ def validate_build_package_info():
|
|||
# device version validation and raise the exception after.
|
||||
try:
|
||||
from onnxruntime.training.ortmodule._fallback import ORTModuleInitException
|
||||
|
||||
if isinstance(e, ORTModuleInitException):
|
||||
# ORTModule is present but not ready to run yet
|
||||
has_ortmodule = True
|
||||
|
@ -84,19 +90,19 @@ def validate_build_package_info():
|
|||
if not has_ortmodule:
|
||||
import_ortmodule_exception = e
|
||||
|
||||
package_name = ''
|
||||
version = ''
|
||||
cuda_version = ''
|
||||
package_name = ""
|
||||
version = ""
|
||||
cuda_version = ""
|
||||
|
||||
if has_ortmodule:
|
||||
try:
|
||||
# collect onnxruntime package name, version, and cuda version
|
||||
from .build_and_package_info import package_name
|
||||
from .build_and_package_info import __version__ as version
|
||||
from .build_and_package_info import package_name
|
||||
|
||||
try:
|
||||
from .build_and_package_info import cuda_version
|
||||
except: # noqa
|
||||
except: # noqa
|
||||
pass
|
||||
|
||||
if cuda_version:
|
||||
|
@ -104,29 +110,30 @@ def validate_build_package_info():
|
|||
# when the build environment has none or multiple libraries installed
|
||||
try:
|
||||
from .build_and_package_info import cudart_version
|
||||
except: # noqa
|
||||
warnings.warn('WARNING: failed to get cudart_version from onnxruntime build info.')
|
||||
except: # noqa
|
||||
warnings.warn("WARNING: failed to get cudart_version from onnxruntime build info.")
|
||||
cudart_version = None
|
||||
|
||||
def print_build_package_info():
|
||||
warnings.warn('onnxruntime training package info: package_name: %s' % package_name)
|
||||
warnings.warn('onnxruntime training package info: __version__: %s' % version)
|
||||
warnings.warn('onnxruntime training package info: cuda_version: %s' % cuda_version)
|
||||
warnings.warn('onnxruntime build info: cudart_version: %s' % cudart_version)
|
||||
warnings.warn("onnxruntime training package info: package_name: %s" % package_name)
|
||||
warnings.warn("onnxruntime training package info: __version__: %s" % version)
|
||||
warnings.warn("onnxruntime training package info: cuda_version: %s" % cuda_version)
|
||||
warnings.warn("onnxruntime build info: cudart_version: %s" % cudart_version)
|
||||
|
||||
# collect cuda library info from the current environment.
|
||||
from onnxruntime.capi.onnxruntime_collect_build_info import find_cudart_versions
|
||||
|
||||
local_cudart_versions = find_cudart_versions(build_env=False, build_cuda_version=cuda_version)
|
||||
if cudart_version and local_cudart_versions and cudart_version not in local_cudart_versions:
|
||||
print_build_package_info()
|
||||
warnings.warn('WARNING: failed to find cudart version that matches onnxruntime build info')
|
||||
warnings.warn('WARNING: found cudart versions: %s' % local_cudart_versions)
|
||||
warnings.warn("WARNING: failed to find cudart version that matches onnxruntime build info")
|
||||
warnings.warn("WARNING: found cudart versions: %s" % local_cudart_versions)
|
||||
else:
|
||||
# TODO: rcom
|
||||
pass
|
||||
|
||||
except Exception as e: # noqa
|
||||
warnings.warn('WARNING: failed to collect onnxruntime version and build info')
|
||||
except Exception as e: # noqa
|
||||
warnings.warn("WARNING: failed to collect onnxruntime version and build info")
|
||||
print(e)
|
||||
|
||||
if import_ortmodule_exception:
|
||||
|
|
|
@ -9,9 +9,10 @@ import textwrap
|
|||
|
||||
|
||||
def rewrite_target_file(target):
|
||||
with open(target, 'a') as f:
|
||||
f.write(textwrap.dedent(
|
||||
"""
|
||||
with open(target, "a") as f:
|
||||
f.write(
|
||||
textwrap.dedent(
|
||||
"""
|
||||
import warnings
|
||||
|
||||
try:
|
||||
|
@ -33,15 +34,21 @@ def rewrite_target_file(target):
|
|||
f"WARNING: Failed to register python functions to work with TVM EP. More details: {e}"
|
||||
)
|
||||
"""
|
||||
))
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--target_file", type=str, required=True, help="Path to the file to be expanded.")
|
||||
parser.add_argument(
|
||||
"--target_file",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to the file to be expanded.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
rewrite_target_file(args.target_file)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
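For reference, a minimal sketch of the textwrap.dedent append pattern used by rewrite_target_file above (the target path and the appended snippet are placeholders):

import textwrap

snippet = textwrap.dedent(
    """
    import warnings
    warnings.warn("appended by the expansion step")
    """
)
with open("target.py", "a") as f:
    f.write(snippet)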
|
||||
|
|
|
@ -4,17 +4,16 @@
|
|||
# license information.
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
import os
|
||||
import collections
|
||||
import copy
|
||||
import logging
|
||||
import os
|
||||
|
||||
import onnx
|
||||
import tvm
|
||||
from tvm import relay, auto_scheduler
|
||||
from tvm.relay import vm
|
||||
from tvm import auto_scheduler, autotvm, relay
|
||||
from tvm.contrib import graph_executor
|
||||
from tvm import autotvm
|
||||
from tvm.relay import vm
|
||||
|
||||
log = logging.getLogger("tvm_ep")
|
||||
|
||||
|
@ -23,18 +22,20 @@ AUTO_TVM_TYPE = "AutoTVM"
|
|||
|
||||
|
||||
@tvm.register_func("tvm_onnx_import_and_compile")
|
||||
def onnx_compile(model_string,
|
||||
model_path,
|
||||
executor,
|
||||
target,
|
||||
target_host,
|
||||
opt_level,
|
||||
opset,
|
||||
freeze_params,
|
||||
input_shapes,
|
||||
nhwc=False,
|
||||
tuning_logfile="",
|
||||
tuning_type=AUTO_TVM_TYPE):
|
||||
def onnx_compile(
|
||||
model_string,
|
||||
model_path,
|
||||
executor,
|
||||
target,
|
||||
target_host,
|
||||
opt_level,
|
||||
opset,
|
||||
freeze_params,
|
||||
input_shapes,
|
||||
nhwc=False,
|
||||
tuning_logfile="",
|
||||
tuning_type=AUTO_TVM_TYPE,
|
||||
):
|
||||
def get_tvm_executor(irmod, executor, target, params):
|
||||
if executor == "vm":
|
||||
log.info("Build TVM virtual machine")
|
||||
|
@ -47,8 +48,9 @@ def onnx_compile(model_string,
|
|||
log.info("Build TVM graph executor")
|
||||
lib = relay.build(irmod, target=target, params=params)
|
||||
else:
|
||||
log.error("Executor type {} is unsupported. ".format(executor) +
|
||||
"Only \"vm\" and \"graph\" types are supported")
|
||||
log.error(
|
||||
"Executor type {} is unsupported. ".format(executor) + 'Only "vm" and "graph" types are supported'
|
||||
)
|
||||
return None
|
||||
return lib
|
||||
|
||||
|
@ -94,7 +96,7 @@ def onnx_compile(model_string,
|
|||
config={
|
||||
"relay.backend.use_auto_scheduler": True,
|
||||
"relay.FuseOps.max_depth": 30,
|
||||
}
|
||||
},
|
||||
):
|
||||
if nhwc:
|
||||
seq = tvm.transform.Sequential(
|
||||
|
@ -113,8 +115,10 @@ def onnx_compile(model_string,
|
|||
with autotvm.apply_history_best(tuning_logfile):
|
||||
lib = get_tvm_executor(irmod, executor, tvm_target, params)
|
||||
else:
|
||||
log.error("Tuning log type {} is unsupported. ".format(tuning_type) +
|
||||
"Only {} and {} types are supported".format(ANSOR_TYPE, AUTO_TVM_TYPE))
|
||||
log.error(
|
||||
"Tuning log type {} is unsupported. ".format(tuning_type)
|
||||
+ "Only {} and {} types are supported".format(ANSOR_TYPE, AUTO_TVM_TYPE)
|
||||
)
|
||||
return None
|
||||
else:
|
||||
with tvm.transform.PassContext(opt_level=opt_level):
|
||||
|
@ -129,8 +133,10 @@ def onnx_compile(model_string,
|
|||
elif executor == "graph":
|
||||
m = graph_executor.GraphModule(lib["default"](ctx))
|
||||
else:
|
||||
print("ERROR: Executor type {} is unsupported. ".format(executor),
|
||||
"Only \"vm\" and \"graph\" types are supported")
|
||||
print(
|
||||
"ERROR: Executor type {} is unsupported. ".format(executor),
|
||||
'Only "vm" and "graph" types are supported',
|
||||
)
|
||||
return None
|
||||
|
||||
return m.module
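For context, a standalone sketch of the same Relay flow outside the EP callback (model path, input name and shape are hypothetical):

import onnx
import tvm
from tvm import relay
from tvm.contrib import graph_executor

model = onnx.load("model.onnx")
shape_dict = {"input": (1, 3, 224, 224)}
irmod, params = relay.frontend.from_onnx(model, shape=shape_dict, freeze_params=True)
with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(irmod, target="llvm", params=params)
module = graph_executor.GraphModule(lib["default"](tvm.cpu()))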
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
#-------------------------------------------------------------------------
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
#--------------------------------------------------------------------------
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
import argparse
|
||||
from dataclasses import dataclass
|
||||
|
||||
import numpy as np
|
||||
from benchmark import BenchmarkOp, add_arguments
|
||||
|
||||
|
@ -24,12 +25,21 @@ class BenchmarkAttention(BenchmarkOp):
|
|||
|
||||
def create_inputs_outputs(cls, op_param):
|
||||
np.random.seed(0)
|
||||
input_data = np.random.rand(op_param.batch_size, op_param.seq_len, op_param.hidden_size).astype(op_param.data_type)
|
||||
input_data = np.random.rand(op_param.batch_size, op_param.seq_len, op_param.hidden_size).astype(
|
||||
op_param.data_type
|
||||
)
|
||||
weight = np.random.rand(op_param.hidden_size, op_param.length).astype(op_param.data_type)
|
||||
bias = np.random.rand(op_param.length).astype(op_param.data_type)
|
||||
mask_index = np.random.rand(op_param.batch_size, op_param.seq_len).astype(np.int32)
|
||||
output_data = np.random.rand(op_param.batch_size, op_param.seq_len, op_param.hidden_size).astype(op_param.data_type)
|
||||
inputs = {"INPUT": input_data, "WEIGHT": weight, "BIAS": bias, "MASK_INDEX": mask_index}
|
||||
output_data = np.random.rand(op_param.batch_size, op_param.seq_len, op_param.hidden_size).astype(
|
||||
op_param.data_type
|
||||
)
|
||||
inputs = {
|
||||
"INPUT": input_data,
|
||||
"WEIGHT": weight,
|
||||
"BIAS": bias,
|
||||
"MASK_INDEX": mask_index,
|
||||
}
|
||||
outputs = {"return_val": output_data}
|
||||
return inputs, outputs
|
||||
|
||||
|
|
|
@ -1,44 +1,67 @@
|
|||
#-------------------------------------------------------------------------
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
#--------------------------------------------------------------------------
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
import logging
|
||||
import time
|
||||
from abc import ABC, abstractmethod
|
||||
from argparse import ArgumentParser
|
||||
import logging
|
||||
|
||||
import numpy
|
||||
import onnxruntime as ort
|
||||
import time
|
||||
import torch
|
||||
|
||||
import onnxruntime as ort
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def numpy_type(torch_type):
|
||||
type_map = {torch.float32: numpy.float32,
|
||||
torch.float16: numpy.float16,
|
||||
torch.int32: numpy.int32}
|
||||
type_map = {
|
||||
torch.float32: numpy.float32,
|
||||
torch.float16: numpy.float16,
|
||||
torch.int32: numpy.int32,
|
||||
}
|
||||
return type_map[torch_type]
|
||||
|
||||
|
||||
def add_arguments(parser: ArgumentParser):
|
||||
parser.add_argument("--provider", required=False, type=str,
|
||||
choices=["cuda", "rocm", "cpu", None], default=None,
|
||||
help=("Execution provider to use. By default, a "
|
||||
"provider is selected in the priority order "
|
||||
"(cuda|rocm, cpu) depending on availability."))
|
||||
parser.add_argument("--precision", required=False, type=str,
|
||||
choices=["fp16", "fp32"], default="fp16",
|
||||
help="Number format to use")
|
||||
parser.add_argument('--profiling', required=False, type=bool,
|
||||
default=False, help='If enable profiling')
|
||||
parser.add_argument(
|
||||
"--provider",
|
||||
required=False,
|
||||
type=str,
|
||||
choices=["cuda", "rocm", "cpu", None],
|
||||
default=None,
|
||||
help=(
|
||||
"Execution provider to use. By default, a "
|
||||
"provider is selected in the priority order "
|
||||
"(cuda|rocm, cpu) depending on availability."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--precision",
|
||||
required=False,
|
||||
type=str,
|
||||
choices=["fp16", "fp32"],
|
||||
default="fp16",
|
||||
help="Number format to use",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--profiling",
|
||||
required=False,
|
||||
type=bool,
|
||||
default=False,
|
||||
help="If enable profiling",
|
||||
)
|
||||
|
||||
|
||||
def provider_name(name):
|
||||
provider_map = {"cuda": "CUDAExecutionProvider",
|
||||
"rocm": "ROCMExecutionProvider",
|
||||
"cpu": "CPUExecutionProvider"}
|
||||
provider_map = {
|
||||
"cuda": "CUDAExecutionProvider",
|
||||
"rocm": "ROCMExecutionProvider",
|
||||
"cpu": "CPUExecutionProvider",
|
||||
}
|
||||
return provider_map[name]
|
||||
|
||||
|
||||
|
@ -52,8 +75,7 @@ def get_default_provider():
|
|||
|
||||
class Benchmark:
|
||||
def __init__(self, model, inputs, outputs, args):
|
||||
self.provider = (get_default_provider() if args.provider == None
|
||||
else provider_name(args.provider))
|
||||
self.provider = get_default_provider() if args.provider == None else provider_name(args.provider)
|
||||
logger.info(f"Execution provider: {self.provider}")
|
||||
self.profiling = args.profiling
|
||||
self.model = model
|
||||
|
@ -62,43 +84,49 @@ class Benchmark:
|
|||
self.outputs = outputs
|
||||
|
||||
def create_input_output_tensors(self):
|
||||
on_gpu = (self.provider == "CUDAExecutionProvider"
|
||||
or self.provider == "ROCMExecutionProvider")
|
||||
on_gpu = self.provider == "CUDAExecutionProvider" or self.provider == "ROCMExecutionProvider"
|
||||
device = "cuda" if on_gpu else "cpu"
|
||||
input_tensors = {name: torch.from_numpy(array).to(device)
|
||||
for name, array in self.inputs.items()}
|
||||
output_tensors = {name: torch.from_numpy(array).to(device)
|
||||
for name, array in self.outputs.items()}
|
||||
input_tensors = {name: torch.from_numpy(array).to(device) for name, array in self.inputs.items()}
|
||||
output_tensors = {name: torch.from_numpy(array).to(device) for name, array in self.outputs.items()}
|
||||
return input_tensors, output_tensors
|
||||
|
||||
@classmethod
|
||||
def create_io_binding(cls, sess, input_tensors, output_tensors):
|
||||
io_binding = sess.io_binding()
|
||||
for name, tensor in input_tensors.items():
|
||||
io_binding.bind_input(name, tensor.device.type, 0,
|
||||
numpy_type(tensor.dtype), tensor.shape,
|
||||
tensor.data_ptr())
|
||||
io_binding.bind_input(
|
||||
name,
|
||||
tensor.device.type,
|
||||
0,
|
||||
numpy_type(tensor.dtype),
|
||||
tensor.shape,
|
||||
tensor.data_ptr(),
|
||||
)
|
||||
for name, tensor in output_tensors.items():
|
||||
io_binding.bind_output(name, tensor.device.type, 0,
|
||||
numpy_type(tensor.dtype), tensor.shape,
|
||||
tensor.data_ptr())
|
||||
io_binding.bind_output(
|
||||
name,
|
||||
tensor.device.type,
|
||||
0,
|
||||
numpy_type(tensor.dtype),
|
||||
tensor.shape,
|
||||
tensor.data_ptr(),
|
||||
)
|
||||
return io_binding
|
||||
|
||||
def create_session(self):
|
||||
sess_opt = ort.SessionOptions()
|
||||
sess_opt.enable_profiling = self.profiling
|
||||
sess = ort.InferenceSession(self.model, sess_options=sess_opt,
|
||||
providers=[self.provider])
|
||||
sess = ort.InferenceSession(self.model, sess_options=sess_opt, providers=[self.provider])
|
||||
return sess
|
||||
|
||||
def benchmark(self):
|
||||
sess = self.create_session()
|
||||
input_tensors, output_tensors = self.create_input_output_tensors()
|
||||
io_binding = self.create_io_binding(sess, input_tensors, output_tensors)
|
||||
|
||||
|
||||
# warm up
|
||||
for iter in range(10):
|
||||
sess.run_with_iobinding(io_binding)
|
||||
sess.run_with_iobinding(io_binding)
|
||||
|
||||
# measure
|
||||
max_iters = 100
|
||||
|
|
|
@ -1,28 +1,29 @@
|
|||
#-------------------------------------------------------------------------
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
#--------------------------------------------------------------------------
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
import argparse
|
||||
from dataclasses import dataclass
|
||||
|
||||
import numpy as np
|
||||
from benchmark import BenchmarkOp, add_arguments
|
||||
|
||||
|
||||
@dataclass
|
||||
class OpParam:
|
||||
x : int
|
||||
y : int
|
||||
m : int
|
||||
n : int
|
||||
input_data_type : type
|
||||
output_data_type : type
|
||||
x: int
|
||||
y: int
|
||||
m: int
|
||||
n: int
|
||||
input_data_type: type
|
||||
output_data_type: type
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelParam:
|
||||
token_type_ids_dim0 : int
|
||||
input_ids_dim1 : int
|
||||
token_type_ids_dim0: int
|
||||
input_ids_dim1: int
|
||||
|
||||
|
||||
class BenchmarkCast(BenchmarkOp):
|
||||
|
@ -38,9 +39,39 @@ class BenchmarkCast(BenchmarkOp):
|
|||
return inputs, outputs
|
||||
|
||||
def add_model_cases(self, mp, model, input_data_type, output_data_type):
|
||||
self.add_case(OpParam(1, mp.token_type_ids_dim0, mp.input_ids_dim1, 1024, input_data_type, output_data_type), model)
|
||||
self.add_case(OpParam(1, mp.token_type_ids_dim0, mp.input_ids_dim1, 1, input_data_type, output_data_type), model)
|
||||
self.add_case(OpParam(16, mp.token_type_ids_dim0, mp.input_ids_dim1, mp.input_ids_dim1, input_data_type, output_data_type), model)
|
||||
self.add_case(
|
||||
OpParam(
|
||||
1,
|
||||
mp.token_type_ids_dim0,
|
||||
mp.input_ids_dim1,
|
||||
1024,
|
||||
input_data_type,
|
||||
output_data_type,
|
||||
),
|
||||
model,
|
||||
)
|
||||
self.add_case(
|
||||
OpParam(
|
||||
1,
|
||||
mp.token_type_ids_dim0,
|
||||
mp.input_ids_dim1,
|
||||
1,
|
||||
input_data_type,
|
||||
output_data_type,
|
||||
),
|
||||
model,
|
||||
)
|
||||
self.add_case(
|
||||
OpParam(
|
||||
16,
|
||||
mp.token_type_ids_dim0,
|
||||
mp.input_ids_dim1,
|
||||
mp.input_ids_dim1,
|
||||
input_data_type,
|
||||
output_data_type,
|
||||
),
|
||||
model,
|
||||
)
|
||||
|
||||
def create_cases(self):
|
||||
model = "models/cast_fp16tofp32.onnx" if self.args.precision == "fp16" else "models/cast_fp32tofp16.onnx"
|
||||
|
@ -61,7 +92,7 @@ class BenchmarkCast(BenchmarkOp):
|
|||
def case_profile(cls, op_param, time):
|
||||
profile = f"(x y m n input_data_type) = ({op_param.x} {op_param.y} {op_param.m} {op_param.n} {op_param.input_data_type}), {time:7.4f} ms"
|
||||
return profile
|
||||
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
#-------------------------------------------------------------------------
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
#--------------------------------------------------------------------------
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
import argparse
|
||||
from dataclasses import dataclass
|
||||
|
||||
import numpy as np
|
||||
from benchmark import BenchmarkOp, add_arguments
|
||||
|
||||
|
@ -43,7 +44,12 @@ class BenchmarkFastGelu(BenchmarkOp):
|
|||
data_type = np.float16 if self.args.precision == "fp16" else np.float32
|
||||
# bert-large
|
||||
model_param = ModelParam(1, 384, 1024 * 4, data_type)
|
||||
op_param = OpParam(model_param.batch_size, model_param.seq_len, model_param.inter_dim, model_param.data_type)
|
||||
op_param = OpParam(
|
||||
model_param.batch_size,
|
||||
model_param.seq_len,
|
||||
model_param.inter_dim,
|
||||
model_param.data_type,
|
||||
)
|
||||
self.add_case(op_param, model)
|
||||
|
||||
def case_profile(cls, op_param, time):
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
#-------------------------------------------------------------------------
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
#--------------------------------------------------------------------------
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
import argparse
|
||||
from dataclasses import dataclass
|
||||
|
||||
import numpy as np
|
||||
from benchmark import BenchmarkOp, add_arguments
|
||||
|
||||
|
@ -43,10 +44,36 @@ class BenchmarkMatMul(BenchmarkOp):
|
|||
return inputs, outputs
|
||||
|
||||
def add_model_cases(self, mp, model):
|
||||
self.add_case(OpParam(1, mp.batch_size, mp.seq_len, mp.hidden_size, mp.hidden_size, mp.data_type), model)
|
||||
self.add_case(OpParam(1, mp.batch_size, mp.seq_len, mp.inter_dim, mp.hidden_size, mp.data_type), model)
|
||||
self.add_case(OpParam(1, mp.batch_size, mp.seq_len, mp.hidden_size, mp.inter_dim, mp.data_type), model)
|
||||
self.add_case(OpParam(mp.batch_size, mp.num_heads, mp.seq_len, mp.seq_len, int(mp.hidden_size / mp.num_heads), mp.data_type), model)
|
||||
self.add_case(
|
||||
OpParam(
|
||||
1,
|
||||
mp.batch_size,
|
||||
mp.seq_len,
|
||||
mp.hidden_size,
|
||||
mp.hidden_size,
|
||||
mp.data_type,
|
||||
),
|
||||
model,
|
||||
)
|
||||
self.add_case(
|
||||
OpParam(1, mp.batch_size, mp.seq_len, mp.inter_dim, mp.hidden_size, mp.data_type),
|
||||
model,
|
||||
)
|
||||
self.add_case(
|
||||
OpParam(1, mp.batch_size, mp.seq_len, mp.hidden_size, mp.inter_dim, mp.data_type),
|
||||
model,
|
||||
)
|
||||
self.add_case(
|
||||
OpParam(
|
||||
mp.batch_size,
|
||||
mp.num_heads,
|
||||
mp.seq_len,
|
||||
mp.seq_len,
|
||||
int(mp.hidden_size / mp.num_heads),
|
||||
mp.data_type,
|
||||
),
|
||||
model,
|
||||
)
|
||||
|
||||
def create_cases(self):
|
||||
model = "models/matmul_fp16.onnx" if self.args.precision == "fp16" else "models/matmul_fp32.onnx"
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
#-------------------------------------------------------------------------
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
#--------------------------------------------------------------------------
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
import argparse
|
||||
from dataclasses import dataclass
|
||||
|
||||
import numpy as np
|
||||
from benchmark import BenchmarkOp, add_arguments
|
||||
|
||||
|
@ -23,20 +24,32 @@ class BenchmarkSkipLayerNorm(BenchmarkOp):
|
|||
|
||||
def create_inputs_outputs(cls, op_param):
|
||||
np.random.seed(0)
|
||||
input_data = np.random.rand(op_param.batch_size, op_param.seq_len, op_param.hidden_size).astype(op_param.data_type)
|
||||
input_data = np.random.rand(op_param.batch_size, op_param.seq_len, op_param.hidden_size).astype(
|
||||
op_param.data_type
|
||||
)
|
||||
skip = np.random.rand(op_param.batch_size, op_param.seq_len, op_param.hidden_size).astype(op_param.data_type)
|
||||
gamma = np.random.rand(op_param.hidden_size).astype(op_param.data_type)
|
||||
beta = np.random.rand(op_param.hidden_size).astype(op_param.data_type)
|
||||
bias = np.random.rand(op_param.hidden_size).astype(op_param.data_type)
|
||||
output_data = np.random.rand(op_param.batch_size, op_param.seq_len, op_param.hidden_size).astype(op_param.data_type)
|
||||
|
||||
inputs = {"INPUT": input_data, "SKIP": skip, "GAMMA": gamma, "BETA": beta, "BIAS": bias}
|
||||
output_data = np.random.rand(op_param.batch_size, op_param.seq_len, op_param.hidden_size).astype(
|
||||
op_param.data_type
|
||||
)
|
||||
|
||||
inputs = {
|
||||
"INPUT": input_data,
|
||||
"SKIP": skip,
|
||||
"GAMMA": gamma,
|
||||
"BETA": beta,
|
||||
"BIAS": bias,
|
||||
}
|
||||
outputs = {"return_val": output_data}
|
||||
|
||||
|
||||
return inputs, outputs
|
||||
|
||||
def create_cases(self):
|
||||
model = "models/skip_layer_norm_fp16.onnx" if self.args.precision == "fp16" else "models/skip_layer_norm_fp32.onnx"
|
||||
model = (
|
||||
"models/skip_layer_norm_fp16.onnx" if self.args.precision == "fp16" else "models/skip_layer_norm_fp32.onnx"
|
||||
)
|
||||
data_type = np.float16 if self.args.precision == "fp16" else np.float32
|
||||
# bert-large
|
||||
op_param = OpParam(1, 384, 1024, data_type)
|
||||
|
|
|
@ -1,21 +1,23 @@
|
|||
#-------------------------------------------------------------------------
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
#--------------------------------------------------------------------------
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
# An offline standalone script to declassify an ONNX model by randomizing the tensor data in initializers.
|
||||
# The ORT Performance may change especially on generative models.
|
||||
|
||||
import argparse
|
||||
import numpy as np
|
||||
from onnx import onnx_pb, numpy_helper, save_model, load_model
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
from onnx import load_model, numpy_helper, onnx_pb, save_model
|
||||
|
||||
# An experimental small value for differentiating shape data and weights.
|
||||
# The tensor data with larger size can't be shape data.
|
||||
# User may adjust this value as needed.
|
||||
SIZE_THRESHOLD = 10
|
||||
|
||||
|
||||
def graph_iterator(model, func):
|
||||
graph_queue = [model.graph]
|
||||
while graph_queue:
|
||||
|
@ -24,11 +26,11 @@ def graph_iterator(model, func):
|
|||
for node in graph.node:
|
||||
for attr in node.attribute:
|
||||
if attr.type == onnx_pb.AttributeProto.AttributeType.GRAPH:
|
||||
assert (isinstance(attr.g, onnx_pb.GraphProto))
|
||||
assert isinstance(attr.g, onnx_pb.GraphProto)
|
||||
graph_queue.append(attr.g)
|
||||
if attr.type == onnx_pb.AttributeProto.AttributeType.GRAPHS:
|
||||
for g in attr.graphs:
|
||||
assert (isinstance(g, onnx_pb.GraphProto))
|
||||
assert isinstance(g, onnx_pb.GraphProto)
|
||||
graph_queue.append(g)
|
||||
|
||||
|
||||
|
@ -37,54 +39,47 @@ def randomize_graph_initializer(graph):
|
|||
array = numpy_helper.to_array(i_tensor)
|
||||
# TODO: need to find a better way to differentiate shape data and weights.
|
||||
if array.size > SIZE_THRESHOLD:
|
||||
random_array = np.random.uniform(array.min(),
|
||||
array.max(),
|
||||
size=array.shape).astype(
|
||||
array.dtype)
|
||||
random_array = np.random.uniform(array.min(), array.max(), size=array.shape).astype(array.dtype)
|
||||
o_tensor = numpy_helper.from_array(random_array, i_tensor.name)
|
||||
i_tensor.CopyFrom(o_tensor)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Randomize the weights of an ONNX model')
|
||||
parser.add_argument('-m',
|
||||
type=str,
|
||||
required=True,
|
||||
help='input onnx model path')
|
||||
parser.add_argument('-o',
|
||||
type=str,
|
||||
required=True,
|
||||
help='output onnx model path')
|
||||
parser.add_argument("--use_external_data_format",
|
||||
required=False,
|
||||
action="store_true",
|
||||
help="Store or Save in external data format")
|
||||
parser.add_argument("--all_tensors_to_one_file",
|
||||
required=False,
|
||||
action="store_true",
|
||||
help="Save all tensors to one file")
|
||||
parser = argparse.ArgumentParser(description="Randomize the weights of an ONNX model")
|
||||
parser.add_argument("-m", type=str, required=True, help="input onnx model path")
|
||||
parser.add_argument("-o", type=str, required=True, help="output onnx model path")
|
||||
parser.add_argument(
|
||||
"--use_external_data_format",
|
||||
required=False,
|
||||
action="store_true",
|
||||
help="Store or Save in external data format",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--all_tensors_to_one_file",
|
||||
required=False,
|
||||
action="store_true",
|
||||
help="Save all tensors to one file",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
data_path = None
|
||||
if args.use_external_data_format:
|
||||
if Path(args.m).parent == Path(args.o).parent:
|
||||
raise RuntimeError(
|
||||
"Please specify output directory with different parent path to input directory."
|
||||
)
|
||||
raise RuntimeError("Please specify output directory with different parent path to input directory.")
|
||||
if args.all_tensors_to_one_file:
|
||||
data_path = Path(args.o).name + ".data"
|
||||
|
||||
Path(args.o).parent.mkdir(parents=True, exist_ok=True)
|
||||
onnx_model = load_model(args.m,
|
||||
load_external_data=args.use_external_data_format)
|
||||
onnx_model = load_model(args.m, load_external_data=args.use_external_data_format)
|
||||
graph_iterator(onnx_model, randomize_graph_initializer)
|
||||
save_model(onnx_model,
|
||||
args.o,
|
||||
save_as_external_data=args.use_external_data_format,
|
||||
all_tensors_to_one_file=args.all_tensors_to_one_file,
|
||||
location=data_path)
|
||||
save_model(
|
||||
onnx_model,
|
||||
args.o,
|
||||
save_as_external_data=args.use_external_data_format,
|
||||
all_tensors_to_one_file=args.all_tensors_to_one_file,
|
||||
location=data_path,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
@ -1,27 +1,34 @@
|
|||
#-------------------------------------------------------------------------
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
#--------------------------------------------------------------------------
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
import argparse
|
||||
import onnxruntime as onnxrt
|
||||
import numpy as np
|
||||
import os
|
||||
import sys
|
||||
from timeit import default_timer as timer
|
||||
|
||||
float_dict = {'tensor(float16)': 'float16', 'tensor(float)': 'float32', 'tensor(double)': 'float64'}
|
||||
import numpy as np
|
||||
|
||||
import onnxruntime as onnxrt
|
||||
|
||||
float_dict = {
|
||||
"tensor(float16)": "float16",
|
||||
"tensor(float)": "float32",
|
||||
"tensor(double)": "float64",
|
||||
}
|
||||
|
||||
integer_dict = {
|
||||
'tensor(int32)': 'int32',
|
||||
'tensor(int8)': 'int8',
|
||||
'tensor(uint8)': 'uint8',
|
||||
'tensor(int16)': 'int16',
|
||||
'tensor(uint16)': 'uint16',
|
||||
'tensor(int64)': 'int64',
|
||||
'tensor(uint64)': 'uint64'
|
||||
"tensor(int32)": "int32",
|
||||
"tensor(int8)": "int8",
|
||||
"tensor(uint8)": "uint8",
|
||||
"tensor(int16)": "int16",
|
||||
"tensor(uint16)": "uint16",
|
||||
"tensor(int64)": "int64",
|
||||
"tensor(uint64)": "uint64",
|
||||
}
|
||||
|
||||
|
||||
def generate_feeds(sess, symbolic_dims={}):
|
||||
feeds = {}
|
||||
for input_meta in sess.get_inputs():
|
||||
|
@ -43,23 +50,27 @@ def generate_feeds(sess, symbolic_dims={}):
|
|||
if input_meta.type in float_dict:
|
||||
feeds[input_meta.name] = np.random.rand(*shape).astype(float_dict[input_meta.type])
|
||||
elif input_meta.type in integer_dict:
|
||||
feeds[input_meta.name] = np.random.uniform(high=1000,
|
||||
size=tuple(shape)).astype(integer_dict[input_meta.type])
|
||||
elif input_meta.type == 'tensor(bool)':
|
||||
feeds[input_meta.name] = np.random.randint(2, size=tuple(shape)).astype('bool')
|
||||
feeds[input_meta.name] = np.random.uniform(high=1000, size=tuple(shape)).astype(
|
||||
integer_dict[input_meta.type]
|
||||
)
|
||||
elif input_meta.type == "tensor(bool)":
|
||||
feeds[input_meta.name] = np.random.randint(2, size=tuple(shape)).astype("bool")
|
||||
else:
|
||||
print("unsupported input type {} for input {}".format(input_meta.type, input_meta.name))
|
||||
sys.exit(-1)
|
||||
return feeds
|
||||
|
||||
|
||||
# simple test program for loading onnx model, feeding all inputs and running the model num_iters times.
|
||||
def run_model(model_path,
|
||||
num_iters=1,
|
||||
debug=None,
|
||||
profile=None,
|
||||
symbolic_dims={},
|
||||
feeds=None,
|
||||
override_initializers=True):
|
||||
def run_model(
|
||||
model_path,
|
||||
num_iters=1,
|
||||
debug=None,
|
||||
profile=None,
|
||||
symbolic_dims={},
|
||||
feeds=None,
|
||||
override_initializers=True,
|
||||
):
|
||||
if debug:
|
||||
print("Pausing execution ready for debugger to attach to pid: {}".format(os.getpid()))
|
||||
print("Press key to continue.")
|
||||
|
@ -71,7 +82,11 @@ def run_model(model_path,
|
|||
sess_options.enable_profiling = True
|
||||
sess_options.profile_file_prefix = os.path.basename(model_path)
|
||||
|
||||
sess = onnxrt.InferenceSession(model_path, sess_options=sess_options, providers=onnxrt.get_available_providers())
|
||||
sess = onnxrt.InferenceSession(
|
||||
model_path,
|
||||
sess_options=sess_options,
|
||||
providers=onnxrt.get_available_providers(),
|
||||
)
|
||||
meta = sess.get_modelmeta()
|
||||
|
||||
if not feeds:
|
||||
|
@ -86,10 +101,11 @@ def run_model(model_path,
|
|||
if initializer.type in float_dict:
|
||||
feeds[initializer.name] = np.random.rand(*shape).astype(float_dict[initializer.type])
|
||||
elif initializer.type in integer_dict:
|
||||
feeds[initializer.name] = np.random.uniform(high=1000,
|
||||
size=tuple(shape)).astype(integer_dict[initializer.type])
|
||||
elif initializer.type == 'tensor(bool)':
|
||||
feeds[initializer.name] = np.random.randint(2, size=tuple(shape)).astype('bool')
|
||||
feeds[initializer.name] = np.random.uniform(high=1000, size=tuple(shape)).astype(
|
||||
integer_dict[initializer.type]
|
||||
)
|
||||
elif initializer.type == "tensor(bool)":
|
||||
feeds[initializer.name] = np.random.randint(2, size=tuple(shape)).astype("bool")
|
||||
else:
|
||||
print("unsupported initializer type {} for initializer {}".format(initializer.type, initializer.name))
|
||||
sys.exit(-1)
|
||||
|
@ -112,15 +128,29 @@ def run_model(model_path,
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description='Simple ONNX Runtime Test Tool.')
|
||||
parser.add_argument('model_path', help='model path')
|
||||
parser.add_argument('num_iters', nargs='?', type=int, default=1000, help='model run iterations. default=1000')
|
||||
parser.add_argument('--debug', action='store_true', help='pause execution to allow attaching a debugger.')
|
||||
parser.add_argument('--profile', action='store_true', help='enable chrome timeline trace profiling.')
|
||||
parser.add_argument('--symbolic_dims', default={}, type=lambda s: dict(x.split("=") for x in s.split(",")),
|
||||
help='Comma separated name=value pairs for any symbolic dimensions in the model input. '
|
||||
'e.g. --symbolic_dims batch=1,seqlen=5. '
|
||||
'If not provided, the value of 1 will be used for all symbolic dimensions.')
|
||||
parser = argparse.ArgumentParser(description="Simple ONNX Runtime Test Tool.")
|
||||
parser.add_argument("model_path", help="model path")
|
||||
parser.add_argument(
|
||||
"num_iters",
|
||||
nargs="?",
|
||||
type=int,
|
||||
default=1000,
|
||||
help="model run iterations. default=1000",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--debug",
|
||||
action="store_true",
|
||||
help="pause execution to allow attaching a debugger.",
|
||||
)
|
||||
parser.add_argument("--profile", action="store_true", help="enable chrome timeline trace profiling.")
|
||||
parser.add_argument(
|
||||
"--symbolic_dims",
|
||||
default={},
|
||||
type=lambda s: dict(x.split("=") for x in s.split(",")),
|
||||
help="Comma separated name=value pairs for any symbolic dimensions in the model input. "
|
||||
"e.g. --symbolic_dims batch=1,seqlen=5. "
|
||||
"If not provided, the value of 1 will be used for all symbolic dimensions.",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
exit_code, _, _ = run_model(args.model_path, args.num_iters, args.debug, args.profile, args.symbolic_dims)
|
||||
|
|
|
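For reference, the random-feed pattern used by generate_feeds() and run_model() above can be exercised standalone. A hedged sketch, assuming a float32-input model at the placeholder path "model.onnx":

import numpy as np
import onnxruntime as onnxrt

sess = onnxrt.InferenceSession("model.onnx", providers=onnxrt.get_available_providers())
feeds = {}
for input_meta in sess.get_inputs():
    # Use 1 for any symbolic or unknown dimension, as the tool does when --symbolic_dims is not given.
    shape = [d if isinstance(d, int) else 1 for d in input_meta.shape]
    feeds[input_meta.name] = np.random.rand(*shape).astype("float32")
outputs = sess.run(None, feeds)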
@ -12,8 +12,8 @@ try:
|
|||
from torch.onnx import register_custom_op_symbolic
|
||||
except ModuleNotFoundError:
|
||||
raise ModuleNotFoundError(
|
||||
"This module is only useful in combination with PyTorch. "
|
||||
"To install PyTorch see https://pytorch.org/.")
|
||||
"This module is only useful in combination with PyTorch. To install PyTorch see https://pytorch.org/."
|
||||
)
|
||||
import torch.onnx.symbolic_helper as sym_help
|
||||
import torch.onnx.symbolic_registry as sym_registry
|
||||
|
||||
|
@ -44,8 +44,8 @@ def register():
|
|||
# 'reflection' : onnx::Constant[value={2}]
|
||||
mode = sym_help._maybe_get_const(mode, "i")
|
||||
padding_mode = sym_help._maybe_get_const(padding_mode, "i")
|
||||
mode_str = ['bilinear', 'nearest', 'bicubic'][mode]
|
||||
padding_mode_str = ['zeros', 'border', 'reflection'][padding_mode]
|
||||
mode_str = ["bilinear", "nearest", "bicubic"][mode]
|
||||
padding_mode_str = ["zeros", "border", "reflection"][padding_mode]
|
||||
align_corners = int(sym_help._maybe_get_const(align_corners, "b"))
|
||||
|
||||
# From opset v13 onward, the output shape can be specified with
|
||||
|
@ -55,28 +55,36 @@ def register():
|
|||
# output_shape = input_shape[:2] + gird_shape[1:3]
|
||||
# g.op(...).setType(input.type().with_sizes(output_shape))
|
||||
|
||||
return g.op("com.microsoft::GridSample", input, grid,
|
||||
mode_s=mode_str,
|
||||
padding_mode_s=padding_mode_str,
|
||||
align_corners_i=align_corners)
|
||||
return g.op(
|
||||
"com.microsoft::GridSample",
|
||||
input,
|
||||
grid,
|
||||
mode_s=mode_str,
|
||||
padding_mode_s=padding_mode_str,
|
||||
align_corners_i=align_corners,
|
||||
)
|
||||
|
||||
_reg(grid_sampler)
|
||||
|
||||
def inverse(g, self):
|
||||
return g.op("com.microsoft::Inverse", self).setType(self.type())
|
||||
|
||||
_reg(inverse)
|
||||
|
||||
def gelu(g, self):
|
||||
return g.op("com.microsoft::Gelu", self).setType(self.type())
|
||||
|
||||
_reg(gelu)
|
||||
|
||||
def triu(g, self, diagonal):
|
||||
return g.op("com.microsoft::Trilu", self, diagonal, upper_i=1).setType(self.type())
|
||||
|
||||
_reg(triu)
|
||||
|
||||
def tril(g, self, diagonal):
|
||||
return g.op("com.microsoft::Trilu", self, diagonal, upper_i=0).setType(self.type())
|
||||
_reg(tril)
|
||||
|
||||
_reg(tril)
|
||||
|
||||
|
||||
def unregister():
|
||||
|
@ -86,6 +94,5 @@ def unregister():
|
|||
for name in _registered_ops:
|
||||
ns, kind = name.split("::")
|
||||
for version in sym_help._onnx_stable_opsets:
|
||||
if (version >= _OPSET_VERSION and
|
||||
sym_registry.is_registered_op(kind, ns, version)):
|
||||
if version >= _OPSET_VERSION and sym_registry.is_registered_op(kind, ns, version):
|
||||
del sym_registry._registry[(ns, version)][kind]
|
||||
|
|
|
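The register()/unregister() pair above temporarily maps a few ATen ops onto com.microsoft contrib ops during torch.onnx export. A minimal sketch, assuming this module is importable as onnxruntime.tools.pytorch_export_contrib_ops (the exact import path is an assumption, not stated in this diff):

import torch

# Assumed import path for the module shown in this diff.
from onnxruntime.tools import pytorch_export_contrib_ops as contrib_ops


class TinyGridSampleModel(torch.nn.Module):
    # Placeholder model whose grid_sample call exercises the custom GridSample symbolic.
    def forward(self, x, grid):
        return torch.nn.functional.grid_sample(x, grid, align_corners=False)


contrib_ops.register()  # install com.microsoft::GridSample, Inverse, Gelu, Trilu symbolics
x, grid = torch.randn(1, 1, 4, 4), torch.rand(1, 4, 4, 2) * 2 - 1
torch.onnx.export(TinyGridSampleModel(), (x, grid), "grid_sample.onnx", opset_version=12)
contrib_ops.unregister()  # restore the default symbolic registry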
@ -4,10 +4,12 @@
|
|||
|
||||
import flatbuffers
|
||||
from flatbuffers.compat import import_numpy
|
||||
|
||||
np = import_numpy()
|
||||
|
||||
|
||||
class KeyValue(object):
|
||||
__slots__ = ['_tab']
|
||||
__slots__ = ["_tab"]
|
||||
|
||||
@classmethod
|
||||
def GetRootAs(cls, buf, offset=0):
|
||||
|
@ -20,6 +22,7 @@ class KeyValue(object):
|
|||
def GetRootAsKeyValue(cls, buf, offset=0):
|
||||
"""This method is deprecated. Please switch to GetRootAs."""
|
||||
return cls.GetRootAs(buf, offset)
|
||||
|
||||
# KeyValue
|
||||
def Init(self, buf, pos):
|
||||
self._tab = flatbuffers.table.Table(buf, pos)
|
||||
|
@ -38,19 +41,38 @@ class KeyValue(object):
|
|||
return self._tab.String(o + self._tab.Pos)
|
||||
return None
|
||||
|
||||
def Start(builder): builder.StartObject(2)
|
||||
|
||||
def Start(builder):
|
||||
builder.StartObject(2)
|
||||
|
||||
|
||||
def KeyValueStart(builder):
|
||||
"""This method is deprecated. Please switch to Start."""
|
||||
return Start(builder)
|
||||
def AddKey(builder, key): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(key), 0)
|
||||
|
||||
|
||||
def AddKey(builder, key):
|
||||
builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(key), 0)
|
||||
|
||||
|
||||
def KeyValueAddKey(builder, key):
|
||||
"""This method is deprecated. Please switch to AddKey."""
|
||||
return AddKey(builder, key)
|
||||
def AddValue(builder, value): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(value), 0)
|
||||
|
||||
|
||||
def AddValue(builder, value):
|
||||
builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(value), 0)
|
||||
|
||||
|
||||
def KeyValueAddValue(builder, value):
|
||||
"""This method is deprecated. Please switch to AddValue."""
|
||||
return AddValue(builder, value)
|
||||
def End(builder): return builder.EndObject()
|
||||
|
||||
|
||||
def End(builder):
|
||||
return builder.EndObject()
|
||||
|
||||
|
||||
def KeyValueEnd(builder):
|
||||
"""This method is deprecated. Please switch to End."""
|
||||
return End(builder)
|
||||
return End(builder)
|
||||
|
|
|
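The KeyValue helpers above are flatbuffers-generated builders used for calibration table entries. A minimal sketch of building and serializing one entry (the key/value strings here are illustrative):

import flatbuffers

from onnxruntime.quantization.CalTableFlatBuffers import KeyValue

builder = flatbuffers.Builder(0)
# Strings must be created before the table is started.
key = builder.CreateString("conv1_output")
value = builder.CreateString("-0.5 0.5")
KeyValue.Start(builder)
KeyValue.AddKey(builder, key)
KeyValue.AddValue(builder, value)
kv = KeyValue.End(builder)
builder.Finish(kv)
serialized = builder.Output()  # bytes that KeyValue.KeyValue.GetRootAs() can read back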
@ -4,10 +4,12 @@
|
|||
|
||||
import flatbuffers
|
||||
from flatbuffers.compat import import_numpy
|
||||
|
||||
np = import_numpy()
|
||||
|
||||
|
||||
class TrtTable(object):
|
||||
__slots__ = ['_tab']
|
||||
__slots__ = ["_tab"]
|
||||
|
||||
@classmethod
|
||||
def GetRootAs(cls, buf, offset=0):
|
||||
|
@ -20,6 +22,7 @@ class TrtTable(object):
|
|||
def GetRootAsTrtTable(cls, buf, offset=0):
|
||||
"""This method is deprecated. Please switch to GetRootAs."""
|
||||
return cls.GetRootAs(buf, offset)
|
||||
|
||||
# TrtTable
|
||||
def Init(self, buf, pos):
|
||||
self._tab = flatbuffers.table.Table(buf, pos)
|
||||
|
@ -32,6 +35,7 @@ class TrtTable(object):
|
|||
x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4
|
||||
x = self._tab.Indirect(x)
|
||||
from onnxruntime.quantization.CalTableFlatBuffers.KeyValue import KeyValue
|
||||
|
||||
obj = KeyValue()
|
||||
obj.Init(self._tab.Bytes, x)
|
||||
return obj
|
||||
|
@ -49,19 +53,38 @@ class TrtTable(object):
|
|||
o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
|
||||
return o == 0
|
||||
|
||||
def Start(builder): builder.StartObject(1)
|
||||
|
||||
def Start(builder):
|
||||
builder.StartObject(1)
|
||||
|
||||
|
||||
def TrtTableStart(builder):
|
||||
"""This method is deprecated. Please switch to Start."""
|
||||
return Start(builder)
|
||||
def AddDict(builder, dict): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(dict), 0)
|
||||
|
||||
|
||||
def AddDict(builder, dict):
|
||||
builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(dict), 0)
|
||||
|
||||
|
||||
def TrtTableAddDict(builder, dict):
|
||||
"""This method is deprecated. Please switch to AddDict."""
|
||||
return AddDict(builder, dict)
|
||||
def StartDictVector(builder, numElems): return builder.StartVector(4, numElems, 4)
|
||||
|
||||
|
||||
def StartDictVector(builder, numElems):
|
||||
return builder.StartVector(4, numElems, 4)
|
||||
|
||||
|
||||
def TrtTableStartDictVector(builder, numElems):
|
||||
"""This method is deprecated. Please switch to Start."""
|
||||
return StartDictVector(builder, numElems)
|
||||
def End(builder): return builder.EndObject()
|
||||
|
||||
|
||||
def End(builder):
|
||||
return builder.EndObject()
|
||||
|
||||
|
||||
def TrtTableEnd(builder):
|
||||
"""This method is deprecated. Please switch to End."""
|
||||
return End(builder)
|
||||
return End(builder)
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
from .quantize import quantize_static, quantize_dynamic
|
||||
from .quantize import QuantizationMode
|
||||
from .calibrate import CalibrationDataReader, CalibraterBase, MinMaxCalibrater, create_calibrator, CalibrationMethod
|
||||
from .quant_utils import QuantType, QuantFormat, write_calibration_table
|
||||
from .calibrate import CalibraterBase, CalibrationDataReader, CalibrationMethod, MinMaxCalibrater, create_calibrator
|
||||
from .qdq_quantizer import QDQQuantizer
|
||||
from .quant_utils import QuantFormat, QuantType, write_calibration_table
|
||||
from .quantize import QuantizationMode, quantize_dynamic, quantize_static
|
||||
|
|
|
@ -7,27 +7,37 @@
|
|||
# --------------------------------------------------------------------------
|
||||
import abc
|
||||
import itertools
|
||||
import numpy as np
|
||||
import onnxruntime
|
||||
|
||||
import onnx
|
||||
from onnx import helper, TensorProto, ModelProto
|
||||
from onnx import onnx_pb as onnx_proto
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
|
||||
from .quant_utils import QuantType, model_has_infer_metadata, smooth_distribution, apply_plot, load_model, clone_model_with_shape_infer
|
||||
import numpy as np
|
||||
import onnx
|
||||
from onnx import ModelProto, TensorProto, helper
|
||||
from onnx import onnx_pb as onnx_proto
|
||||
|
||||
import onnxruntime
|
||||
|
||||
from .quant_utils import (
|
||||
QuantType,
|
||||
apply_plot,
|
||||
clone_model_with_shape_infer,
|
||||
load_model,
|
||||
model_has_infer_metadata,
|
||||
smooth_distribution,
|
||||
)
|
||||
from .registry import QLinearOpsRegistry
|
||||
|
||||
|
||||
class CalibrationMethod(Enum):
|
||||
MinMax = 0
|
||||
Entropy = 1
|
||||
Percentile = 2
|
||||
|
||||
|
||||
class CalibrationDataReader(metaclass=abc.ABCMeta):
|
||||
@classmethod
|
||||
def __subclasshook__(cls, subclass):
|
||||
return (hasattr(subclass, 'get_next') and callable(subclass.get_next) or NotImplemented)
|
||||
return hasattr(subclass, "get_next") and callable(subclass.get_next) or NotImplemented
|
||||
|
||||
@abc.abstractmethod
|
||||
def get_next(self) -> dict:
|
||||
|
@ -36,14 +46,21 @@ class CalibrationDataReader(metaclass=abc.ABCMeta):
|
|||
|
||||
|
||||
class CalibraterBase:
|
||||
def __init__(self, model, op_types_to_calibrate=[], augmented_model_path='augmented_model.onnx', symmetric=False, use_external_data_format=False):
|
||||
'''
|
||||
def __init__(
|
||||
self,
|
||||
model,
|
||||
op_types_to_calibrate=[],
|
||||
augmented_model_path="augmented_model.onnx",
|
||||
symmetric=False,
|
||||
use_external_data_format=False,
|
||||
):
|
||||
"""
|
||||
:param model: ONNX model to calibrate. It can be a ModelProto or a model path
|
||||
:param op_types_to_calibrate: operator types to calibrate. By default, calibrate all the float32/float16 tensors.
|
||||
:param augmented_model_path: save augmented model to this path.
|
||||
:param symmetric: make range of tensor symmetric (central point is 0).
|
||||
:param use_external_data_format: use external data format to store model which size is >= 2Gb
|
||||
'''
|
||||
"""
|
||||
if isinstance(model, str):
|
||||
self.model = load_model(Path(model), False)
|
||||
elif isinstance(model, Path):
|
||||
|
@ -51,7 +68,7 @@ class CalibraterBase:
|
|||
elif isinstance(model, ModelProto):
|
||||
self.model = model
|
||||
else:
|
||||
raise ValueError('model should be either model path or onnx.ModelProto.')
|
||||
raise ValueError("model should be either model path or onnx.ModelProto.")
|
||||
|
||||
self.op_types_to_calibrate = op_types_to_calibrate
|
||||
self.augmented_model_path = augmented_model_path
|
||||
|
@ -64,33 +81,35 @@ class CalibraterBase:
|
|||
|
||||
# Create InferenceSession
|
||||
self.infer_session = None
|
||||
self.execution_providers = ['CPUExecutionProvider']
|
||||
self.execution_providers = ["CPUExecutionProvider"]
|
||||
self._create_inference_session()
|
||||
|
||||
def set_execution_providers(self, execution_providers=['CPUExecutionProvider']):
|
||||
'''
|
||||
def set_execution_providers(self, execution_providers=["CPUExecutionProvider"]):
|
||||
"""
|
||||
reset the execution providers to execute the collect_data. It triggers to re-creating inference session.
|
||||
'''
|
||||
"""
|
||||
self.execution_providers = execution_providers
|
||||
self._create_inference_session()
|
||||
|
||||
def _create_inference_session(self):
|
||||
'''
|
||||
"""
|
||||
create an OnnxRuntime InferenceSession.
|
||||
'''
|
||||
"""
|
||||
sess_options = onnxruntime.SessionOptions()
|
||||
sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
|
||||
self.infer_session = onnxruntime.InferenceSession(self.augmented_model_path,
|
||||
sess_options=sess_options,
|
||||
providers=self.execution_providers)
|
||||
self.infer_session = onnxruntime.InferenceSession(
|
||||
self.augmented_model_path,
|
||||
sess_options=sess_options,
|
||||
providers=self.execution_providers,
|
||||
)
|
||||
|
||||
def select_tensors_to_calibrate(self, model):
|
||||
'''
|
||||
select all quantization_candidates op type nodes' input/output tensors.
|
||||
"""
|
||||
select all quantization_candidates op type nodes' input/output tensors.
|
||||
returns:
|
||||
tensors (set): set of tensor name.
|
||||
value_infos (dict): tensor name to value info.
|
||||
'''
|
||||
"""
|
||||
value_infos = {vi.name: vi for vi in model.graph.value_info}
|
||||
value_infos.update({ot.name: ot for ot in model.graph.output})
|
||||
value_infos.update({it.name: it for it in model.graph.input})
|
||||
|
@ -104,50 +123,54 @@ class CalibraterBase:
|
|||
for tensor_name in itertools.chain(node.input, node.output):
|
||||
if tensor_name in value_infos.keys():
|
||||
vi = value_infos[tensor_name]
|
||||
if vi.type.HasField('tensor_type') and (
|
||||
vi.type.tensor_type.elem_type in tensor_type_to_calibrate) and (
|
||||
tensor_name not in initializer):
|
||||
if (
|
||||
vi.type.HasField("tensor_type")
|
||||
and (vi.type.tensor_type.elem_type in tensor_type_to_calibrate)
|
||||
and (tensor_name not in initializer)
|
||||
):
|
||||
tensors_to_calibrate.add(tensor_name)
|
||||
|
||||
return tensors_to_calibrate, value_infos
|
||||
|
||||
def get_augment_model(self):
|
||||
'''
|
||||
"""
|
||||
return: augmented onnx model
|
||||
'''
|
||||
"""
|
||||
return self.augment_model
|
||||
|
||||
def augment_graph(self):
|
||||
'''
|
||||
"""
|
||||
abstract method: augment the input model to prepare for collecting data. It will:
|
||||
1. save augmented model to augmented_model_path.
|
||||
2. set the self.augment_model
|
||||
'''
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def collect_data(self, data_reader: CalibrationDataReader):
|
||||
'''
|
||||
"""
|
||||
abstract method: collect the tensors that will be used for range computation. It can be called multiple times.
|
||||
'''
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def compute_range(self, data_reader: CalibrationDataReader):
|
||||
'''
|
||||
"""
|
||||
abstract method: compute the [min, max] range for the tensors to calibrate based on the collected data.
|
||||
'''
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class MinMaxCalibrater(CalibraterBase):
|
||||
def __init__(self,
|
||||
model,
|
||||
op_types_to_calibrate=[],
|
||||
augmented_model_path='augmented_model.onnx',
|
||||
symmetric=False,
|
||||
use_external_data_format=False,
|
||||
moving_average=False,
|
||||
averaging_constant=0.01):
|
||||
'''
|
||||
def __init__(
|
||||
self,
|
||||
model,
|
||||
op_types_to_calibrate=[],
|
||||
augmented_model_path="augmented_model.onnx",
|
||||
symmetric=False,
|
||||
use_external_data_format=False,
|
||||
moving_average=False,
|
||||
averaging_constant=0.01,
|
||||
):
|
||||
"""
|
||||
:param model: ONNX model to calibrate. It can be a ModelProto or a model path
|
||||
:param op_types_to_calibrate: operator types to calibrate. By default, calibrate all the float32/float16 tensors.
|
||||
:param augmented_model_path: save augmented model to this path.
|
||||
|
@ -155,8 +178,14 @@ class MinMaxCalibrater(CalibraterBase):
|
|||
:param use_external_data_format: use external data format to store model which size is >= 2Gb
|
||||
:param moving_average: compute the moving average of the minimum and maximum values instead of the global minimum and maximum.
|
||||
:param averaging_constant: constant smoothing factor to use when computing the moving average.
|
||||
'''
|
||||
super(MinMaxCalibrater, self).__init__(model, op_types_to_calibrate, augmented_model_path, symmetric, use_external_data_format)
|
||||
"""
|
||||
super(MinMaxCalibrater, self).__init__(
|
||||
model,
|
||||
op_types_to_calibrate,
|
||||
augmented_model_path,
|
||||
symmetric,
|
||||
use_external_data_format,
|
||||
)
|
||||
self.intermediate_outputs = []
|
||||
self.calibrate_tensors_range = None
|
||||
self.num_model_outputs = len(self.model.graph.output)
|
||||
|
@ -167,16 +196,16 @@ class MinMaxCalibrater(CalibraterBase):
|
|||
self.averaging_constant = averaging_constant
|
||||
|
||||
def augment_graph(self):
|
||||
'''
|
||||
"""
|
||||
Adds ReduceMin and ReduceMax nodes to all quantization_candidates op type nodes in
|
||||
model and ensures their outputs are stored as part of the graph output
|
||||
:return: augmented ONNX model
|
||||
'''
|
||||
"""
|
||||
model = clone_model_with_shape_infer(self.model)
|
||||
|
||||
added_nodes = []
|
||||
added_outputs = []
|
||||
tensors, value_infos = self.select_tensors_to_calibrate(model)
|
||||
tensors, value_infos = self.select_tensors_to_calibrate(model)
|
||||
|
||||
for tensor in tensors:
|
||||
|
||||
|
@ -193,22 +222,38 @@ class MinMaxCalibrater(CalibraterBase):
|
|||
shape = (1,) if len(dim) == 1 else tuple(1 for i in range(len(dim)))
|
||||
|
||||
# Adding ReduceMin nodes
|
||||
reduce_min_name = tensor + '_ReduceMin'
|
||||
reduce_min_node = onnx.helper.make_node('ReduceMin', [tensor], [tensor + '_ReduceMin'], reduce_min_name, keepdims=keepdims)
|
||||
reduce_min_name = tensor + "_ReduceMin"
|
||||
reduce_min_node = onnx.helper.make_node(
|
||||
"ReduceMin",
|
||||
[tensor],
|
||||
[tensor + "_ReduceMin"],
|
||||
reduce_min_name,
|
||||
keepdims=keepdims,
|
||||
)
|
||||
|
||||
added_nodes.append(reduce_min_node)
|
||||
added_outputs.append(helper.make_tensor_value_info(reduce_min_node.output[0], TensorProto.FLOAT, shape))
|
||||
|
||||
# Adding ReduceMax nodes
|
||||
reduce_max_name = tensor + '_ReduceMax'
|
||||
reduce_max_node = onnx.helper.make_node('ReduceMax', [tensor], [tensor + '_ReduceMax'], reduce_max_name, keepdims=keepdims)
|
||||
reduce_max_name = tensor + "_ReduceMax"
|
||||
reduce_max_node = onnx.helper.make_node(
|
||||
"ReduceMax",
|
||||
[tensor],
|
||||
[tensor + "_ReduceMax"],
|
||||
reduce_max_name,
|
||||
keepdims=keepdims,
|
||||
)
|
||||
|
||||
added_nodes.append(reduce_max_node)
|
||||
added_outputs.append(helper.make_tensor_value_info(reduce_max_node.output[0], TensorProto.FLOAT, shape))
|
||||
|
||||
model.graph.node.extend(added_nodes)
|
||||
model.graph.output.extend(added_outputs)
|
||||
onnx.save(model, self.augmented_model_path, save_as_external_data=self.use_external_data_format)
|
||||
onnx.save(
|
||||
model,
|
||||
self.augmented_model_path,
|
||||
save_as_external_data=self.use_external_data_format,
|
||||
)
|
||||
self.augment_model = model
|
||||
|
||||
def clear_collected_data(self):
|
||||
|
@ -231,7 +276,7 @@ class MinMaxCalibrater(CalibraterBase):
|
|||
if not old_range:
|
||||
return new_range
|
||||
|
||||
for key, value in old_range.items():
|
||||
for key, value in old_range.items():
|
||||
if self.moving_average:
|
||||
min_value = value[0] + self.averaging_constant * (new_range[key][0] - value[0])
|
||||
max_value = value[1] + self.averaging_constant * (new_range[key][1] - value[1])
|
||||
|
@ -243,10 +288,10 @@ class MinMaxCalibrater(CalibraterBase):
|
|||
return new_range
|
||||
|
||||
def compute_range(self):
|
||||
'''
|
||||
"""
|
||||
Compute the min-max range of tensor
|
||||
:return: dictionary mapping: {added node names: (ReduceMin, ReduceMax) pairs }
|
||||
'''
|
||||
"""
|
||||
|
||||
if len(self.intermediate_outputs) == 0:
|
||||
return self.calibrate_tensors_range
|
||||
|
@ -260,21 +305,22 @@ class MinMaxCalibrater(CalibraterBase):
|
|||
for d in output_dicts_list:
|
||||
for k, v in d.items():
|
||||
merged_output_dict.setdefault(k, []).append(v)
|
||||
added_output_names = output_names[self.num_model_outputs:]
|
||||
added_output_names = output_names[self.num_model_outputs :]
|
||||
calibrate_tensor_names = [
|
||||
added_output_names[i].rpartition('_')[0] for i in range(0, len(added_output_names), 2)
|
||||
] #output names
|
||||
added_output_names[i].rpartition("_")[0] for i in range(0, len(added_output_names), 2)
|
||||
] # output names
|
||||
|
||||
merged_added_output_dict = dict(
|
||||
(i, merged_output_dict[i]) for i in merged_output_dict if i not in self.model_original_outputs)
|
||||
(i, merged_output_dict[i]) for i in merged_output_dict if i not in self.model_original_outputs
|
||||
)
|
||||
|
||||
pairs = []
|
||||
for i in range(0, len(added_output_names), 2):
|
||||
min_value = 0
|
||||
max_value = 0
|
||||
if self.moving_average:
|
||||
min_value_array = np.mean(merged_added_output_dict[added_output_names[i]], axis = 0)
|
||||
max_value_array = np.mean(merged_added_output_dict[added_output_names[i + 1]], axis = 0)
|
||||
min_value_array = np.mean(merged_added_output_dict[added_output_names[i]], axis=0)
|
||||
max_value_array = np.mean(merged_added_output_dict[added_output_names[i + 1]], axis=0)
|
||||
else:
|
||||
min_value_array = min(merged_added_output_dict[added_output_names[i]])
|
||||
max_value_array = max(merged_added_output_dict[added_output_names[i + 1]])
|
||||
|
@ -293,22 +339,25 @@ class MinMaxCalibrater(CalibraterBase):
|
|||
if self.calibrate_tensors_range:
|
||||
self.calibrate_tensors_range = self.merge_range(self.calibrate_tensors_range, new_calibrate_tensors_range)
|
||||
else:
|
||||
self.calibrate_tensors_range = new_calibrate_tensors_range
|
||||
self.calibrate_tensors_range = new_calibrate_tensors_range
|
||||
|
||||
return self.calibrate_tensors_range
|
||||
|
||||
|
||||
class HistogramCalibrater(CalibraterBase):
|
||||
def __init__(self,
|
||||
model,
|
||||
op_types_to_calibrate=[],
|
||||
augmented_model_path='augmented_model.onnx',
|
||||
use_external_data_format=False,
|
||||
method='percentile',
|
||||
symmetric=False,
|
||||
num_bins=128,
|
||||
num_quantized_bins=2048,
|
||||
percentile=99.999):
|
||||
'''
|
||||
def __init__(
|
||||
self,
|
||||
model,
|
||||
op_types_to_calibrate=[],
|
||||
augmented_model_path="augmented_model.onnx",
|
||||
use_external_data_format=False,
|
||||
method="percentile",
|
||||
symmetric=False,
|
||||
num_bins=128,
|
||||
num_quantized_bins=2048,
|
||||
percentile=99.999,
|
||||
):
|
||||
"""
|
||||
:param model: ONNX model to calibrate. It can be a ModelProto or a model path
|
||||
:param op_types_to_calibrate: operator types to calibrate. By default, calibrate all the float32/float16 tensors.
|
||||
:param augmented_model_path: save augmented model to this path.
|
||||
|
@ -318,8 +367,10 @@ class HistogramCalibrater(CalibraterBase):
|
|||
:param num_bins: number of bins to create a new histogram for collecting tensor values.
|
||||
:param num_quantized_bins: number of quantized bins. Default 128.
|
||||
:param percentile: A float number between [0, 100]. Default 99.99.
|
||||
'''
|
||||
super(HistogramCalibrater, self).__init__(model, op_types_to_calibrate, augmented_model_path, use_external_data_format)
|
||||
"""
|
||||
super(HistogramCalibrater, self).__init__(
|
||||
model, op_types_to_calibrate, augmented_model_path, use_external_data_format
|
||||
)
|
||||
self.intermediate_outputs = []
|
||||
self.calibrate_tensors_range = None
|
||||
self.num_model_outputs = len(self.model.graph.output)
|
||||
|
@ -332,31 +383,35 @@ class HistogramCalibrater(CalibraterBase):
|
|||
self.percentile = percentile
|
||||
|
||||
def augment_graph(self):
|
||||
'''
|
||||
"""
|
||||
make all quantization_candidates op type nodes as part of the graph output.
|
||||
:return: augmented ONNX model
|
||||
'''
|
||||
"""
|
||||
model = clone_model_with_shape_infer(self.model)
|
||||
|
||||
added_nodes = []
|
||||
added_outputs = []
|
||||
tensors, value_infos = self.select_tensors_to_calibrate(model)
|
||||
tensors, value_infos = self.select_tensors_to_calibrate(model)
|
||||
|
||||
for tensor in tensors:
|
||||
added_outputs.append(value_infos[tensor])
|
||||
|
||||
model.graph.node.extend(added_nodes)
|
||||
model.graph.output.extend(added_outputs)
|
||||
onnx.save(model, self.augmented_model_path, save_as_external_data=self.use_external_data_format)
|
||||
onnx.save(
|
||||
model,
|
||||
self.augmented_model_path,
|
||||
save_as_external_data=self.use_external_data_format,
|
||||
)
|
||||
self.augment_model = model
|
||||
|
||||
def clear_collected_data(self):
|
||||
self.intermediate_outputs = []
|
||||
|
||||
def collect_data(self, data_reader: CalibrationDataReader):
|
||||
'''
|
||||
Entropy Calibrator collects operators' tensors as well as generates tensor histogram for each operator.
|
||||
'''
|
||||
"""
|
||||
Entropy Calibrator collects operators' tensors as well as generates tensor histogram for each operator.
|
||||
"""
|
||||
while True:
|
||||
inputs = data_reader.get_next()
|
||||
if not inputs:
|
||||
|
@ -379,36 +434,41 @@ class HistogramCalibrater(CalibraterBase):
|
|||
clean_merged_dict = dict((i, merged_dict[i]) for i in merged_dict if i not in self.model_original_outputs)
|
||||
|
||||
if not self.collector:
|
||||
self.collector = HistogramCollector(method=self.method,
|
||||
symmetric=self.symmetric,
|
||||
num_bins=self.num_bins,
|
||||
num_quantized_bins=self.num_quantized_bins,
|
||||
percentile=self.percentile)
|
||||
self.collector = HistogramCollector(
|
||||
method=self.method,
|
||||
symmetric=self.symmetric,
|
||||
num_bins=self.num_bins,
|
||||
num_quantized_bins=self.num_quantized_bins,
|
||||
percentile=self.percentile,
|
||||
)
|
||||
self.collector.collect(clean_merged_dict)
|
||||
|
||||
self.clear_collected_data()
|
||||
|
||||
def compute_range(self):
|
||||
'''
|
||||
"""
|
||||
Compute the min-max range of tensor
|
||||
:return: dictionary mapping: {tensor name: (min value, max value)}
|
||||
'''
|
||||
"""
|
||||
if not self.collector:
|
||||
raise ValueError("No collector created and can't generate calibration data.")
|
||||
|
||||
return self.collector.compute_collection_result()
|
||||
|
||||
|
||||
class EntropyCalibrater(HistogramCalibrater):
|
||||
def __init__(self,
|
||||
model,
|
||||
op_types_to_calibrate=[],
|
||||
augmented_model_path='augmented_model.onnx',
|
||||
use_external_data_format=False,
|
||||
method='entropy',
|
||||
symmetric=False,
|
||||
num_bins=128,
|
||||
num_quantized_bins=128):
|
||||
'''
|
||||
def __init__(
|
||||
self,
|
||||
model,
|
||||
op_types_to_calibrate=[],
|
||||
augmented_model_path="augmented_model.onnx",
|
||||
use_external_data_format=False,
|
||||
method="entropy",
|
||||
symmetric=False,
|
||||
num_bins=128,
|
||||
num_quantized_bins=128,
|
||||
):
|
||||
"""
|
||||
:param model: ONNX model to calibrate. It can be a ModelProto or a model path
|
||||
:param op_types_to_calibrate: operator types to calibrate. By default, calibrate all the float32/float16 tensors.
|
||||
:param augmented_model_path: save augmented model to this path.
|
||||
|
@ -417,21 +477,32 @@ class EntropyCalibrater(HistogramCalibrater):
|
|||
:param symmetric: make range of tensor symmetric (central point is 0).
|
||||
:param num_bins: number of bins to create a new histogram for collecting tensor values.
|
||||
:param num_quantized_bins: number of quantized bins. Default 128.
|
||||
'''
|
||||
super(EntropyCalibrater, self).__init__(model, op_types_to_calibrate, augmented_model_path, use_external_data_format,
|
||||
method=method, symmetric=symmetric, num_bins=num_bins, num_quantized_bins=num_quantized_bins)
|
||||
"""
|
||||
super(EntropyCalibrater, self).__init__(
|
||||
model,
|
||||
op_types_to_calibrate,
|
||||
augmented_model_path,
|
||||
use_external_data_format,
|
||||
method=method,
|
||||
symmetric=symmetric,
|
||||
num_bins=num_bins,
|
||||
num_quantized_bins=num_quantized_bins,
|
||||
)
|
||||
|
||||
|
||||
class PercentileCalibrater(HistogramCalibrater):
|
||||
def __init__(self,
|
||||
model,
|
||||
op_types_to_calibrate=[],
|
||||
augmented_model_path='augmented_model.onnx',
|
||||
use_external_data_format=False,
|
||||
method='percentile',
|
||||
symmetric=False,
|
||||
num_bins=2048,
|
||||
percentile=99.999):
|
||||
'''
|
||||
def __init__(
|
||||
self,
|
||||
model,
|
||||
op_types_to_calibrate=[],
|
||||
augmented_model_path="augmented_model.onnx",
|
||||
use_external_data_format=False,
|
||||
method="percentile",
|
||||
symmetric=False,
|
||||
num_bins=2048,
|
||||
percentile=99.999,
|
||||
):
|
||||
"""
|
||||
:param model: ONNX model to calibrate. It can be a ModelProto or a model path
|
||||
:param op_types_to_calibrate: operator types to calibrate. By default, calibrate all the float32/float16 tensors.
|
||||
:param augmented_model_path: save augmented model to this path.
|
||||
|
@ -440,9 +511,18 @@ class PercentileCalibrater(HistogramCalibrater):
|
|||
:param symmetric: make range of tensor symmetric (central point is 0).
|
||||
:param num_quantized_bins: number of quantized bins. Default 128.
|
||||
:param percentile: A float number between [0, 100]. Default 99.99.
|
||||
'''
|
||||
super(PercentileCalibrater, self).__init__(model, op_types_to_calibrate, augmented_model_path, use_external_data_format,
|
||||
method=method, symmetric=symmetric, num_bins=num_bins, percentile=percentile)
|
||||
"""
|
||||
super(PercentileCalibrater, self).__init__(
|
||||
model,
|
||||
op_types_to_calibrate,
|
||||
augmented_model_path,
|
||||
use_external_data_format,
|
||||
method=method,
|
||||
symmetric=symmetric,
|
||||
num_bins=num_bins,
|
||||
percentile=percentile,
|
||||
)
|
||||
|
||||
|
||||
class CalibrationDataCollector(metaclass=abc.ABCMeta):
|
||||
"""
|
||||
|
@ -453,18 +533,19 @@ class CalibrationDataCollector(metaclass=abc.ABCMeta):
|
|||
def collect(self, name_to_arr):
|
||||
"""
|
||||
Generate informative data based on given data.
|
||||
name_to_arr : dict
|
||||
tensor name to NDArray data
|
||||
name_to_arr : dict
|
||||
tensor name to NDArray data
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abc.abstractmethod
|
||||
def compute_collection_result(self):
|
||||
"""
|
||||
Get the optimal result among collection data.
|
||||
Get the optimal result among collection data.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class HistogramCollector(CalibrationDataCollector):
|
||||
"""
|
||||
Collecting histogram for each tensor. Percentile and Entropy method are supported.
|
||||
|
@ -473,12 +554,13 @@ class HistogramCollector(CalibrationDataCollector):
|
|||
ref: https://docs.nvidia.com/deeplearning/tensorrt/pytorch-quantization-toolkit/docs/_modules/
|
||||
pytorch_quantization/calib/histogram.html
|
||||
"""
|
||||
|
||||
def __init__(self, method, symmetric, num_bins, num_quantized_bins, percentile):
|
||||
self.histogram_dict = {}
|
||||
self.method = method
|
||||
self.symmetric = symmetric
|
||||
self.num_bins = num_bins
|
||||
self.num_quantized_bins= num_quantized_bins
|
||||
self.num_quantized_bins = num_quantized_bins
|
||||
self.percentile = percentile
|
||||
|
||||
def get_histogram_dict(self):
|
||||
|
@ -489,24 +571,24 @@ class HistogramCollector(CalibrationDataCollector):
|
|||
|
||||
# TODO: Currently we have different collect() for entropy and percentile method respectively.
|
||||
# Need unified collect in the future.
|
||||
if self.method == 'entropy':
|
||||
if self.method == "entropy":
|
||||
return self.collect_value(name_to_arr)
|
||||
elif self.method == 'percentile':
|
||||
elif self.method == "percentile":
|
||||
if self.symmetric:
|
||||
return self.collect_absolute_value(name_to_arr)
|
||||
else:
|
||||
return self.collect_value(name_to_arr)
|
||||
else:
|
||||
raise ValueError('Only \'entropy\' or \'percentile\' method are supported')
|
||||
raise ValueError("Only 'entropy' or 'percentile' method are supported")
|
||||
|
||||
def collect_absolute_value(self, name_to_arr):
|
||||
'''
|
||||
"""
|
||||
Collect histogram on absolute value
|
||||
'''
|
||||
"""
|
||||
for tensor, data_arr in name_to_arr.items():
|
||||
data_arr = np.asarray(data_arr)
|
||||
data_arr = data_arr.flatten()
|
||||
data_arr = np.absolute(data_arr) # only consider absolute value
|
||||
data_arr = np.absolute(data_arr) # only consider absolute value
|
||||
|
||||
if tensor not in self.histogram_dict:
|
||||
# first time it uses num_bins to compute histogram.
|
||||
|
@ -524,13 +606,13 @@ class HistogramCollector(CalibrationDataCollector):
|
|||
new_bin_edges = np.arange(old_hist_edges[-1] + width, temp_amax + width, width)
|
||||
old_hist_edges = np.hstack((old_hist_edges, new_bin_edges))
|
||||
hist, hist_edges = np.histogram(data_arr, bins=old_hist_edges)
|
||||
hist[:len(old_hist)] += old_hist
|
||||
hist[: len(old_hist)] += old_hist
|
||||
self.histogram_dict[tensor] = (hist, hist_edges)
|
||||
|
||||
def collect_value(self, name_to_arr):
|
||||
'''
|
||||
"""
|
||||
Collect histogram on real value
|
||||
'''
|
||||
"""
|
||||
for tensor, data_arr in name_to_arr.items():
|
||||
data_arr = np.asarray(data_arr)
|
||||
data_arr = data_arr.flatten()
|
||||
|
@ -546,10 +628,18 @@ class HistogramCollector(CalibrationDataCollector):
|
|||
|
||||
if tensor in self.histogram_dict:
|
||||
old_histogram = self.histogram_dict[tensor]
|
||||
self.histogram_dict[tensor] = self.merge_histogram(old_histogram, data_arr, min_value, max_value, threshold)
|
||||
self.histogram_dict[tensor] = self.merge_histogram(
|
||||
old_histogram, data_arr, min_value, max_value, threshold
|
||||
)
|
||||
else:
|
||||
hist, hist_edges = np.histogram(data_arr, self.num_bins, range=(-threshold, threshold))
|
||||
self.histogram_dict[tensor] = (hist, hist_edges, min_value, max_value, threshold)
|
||||
self.histogram_dict[tensor] = (
|
||||
hist,
|
||||
hist_edges,
|
||||
min_value,
|
||||
max_value,
|
||||
threshold,
|
||||
)
|
||||
|
||||
def merge_histogram(self, old_histogram, data_arr, new_min, new_max, new_threshold):
|
||||
|
||||
|
@ -557,7 +647,13 @@ class HistogramCollector(CalibrationDataCollector):
|
|||
|
||||
if new_threshold <= old_threshold:
|
||||
new_hist, _ = np.histogram(data_arr, len(old_hist), range=(-old_threshold, old_threshold))
|
||||
return (new_hist + old_hist, old_hist_edges, min(old_min, new_min), max(old_max, new_max), old_threshold)
|
||||
return (
|
||||
new_hist + old_hist,
|
||||
old_hist_edges,
|
||||
min(old_min, new_min),
|
||||
max(old_max, new_max),
|
||||
old_threshold,
|
||||
)
|
||||
else:
|
||||
if old_threshold == 0:
|
||||
hist, hist_edges = np.histogram(data_arr, len(old_hist), range=(-new_threshold, new_threshold))
|
||||
|
@ -565,24 +661,30 @@ class HistogramCollector(CalibrationDataCollector):
|
|||
else:
|
||||
old_num_bins = len(old_hist)
|
||||
old_stride = 2 * old_threshold / old_num_bins
|
||||
half_increased_bins = int((new_threshold - old_threshold) // old_stride + 1)
|
||||
half_increased_bins = int((new_threshold - old_threshold) // old_stride + 1)
|
||||
new_num_bins = old_num_bins + 2 * half_increased_bins
|
||||
new_threshold = half_increased_bins * old_stride + old_threshold
|
||||
hist, hist_edges = np.histogram(data_arr, new_num_bins, range=(-new_threshold, new_threshold))
|
||||
hist[half_increased_bins:new_num_bins-half_increased_bins] += old_hist
|
||||
return (hist, hist_edges, min(old_min, new_min), max(old_max, new_max), new_threshold)
|
||||
hist[half_increased_bins : new_num_bins - half_increased_bins] += old_hist
|
||||
return (
|
||||
hist,
|
||||
hist_edges,
|
||||
min(old_min, new_min),
|
||||
max(old_max, new_max),
|
||||
new_threshold,
|
||||
)
|
||||
|
||||
def compute_collection_result(self):
|
||||
if not self.histogram_dict or len(self.histogram_dict) == 0:
|
||||
raise ValueError("Histogram has not been collected. Please run collect() first.")
|
||||
print("Finding optimal threshold for each tensor using {} algorithm ...".format(self.method))
|
||||
|
||||
if self.method == 'entropy':
|
||||
if self.method == "entropy":
|
||||
return self.compute_entropy()
|
||||
elif self.method == 'percentile':
|
||||
elif self.method == "percentile":
|
||||
return self.compute_percentile()
|
||||
else:
|
||||
raise ValueError('Only \'entropy\' or \'percentile\' method are supported')
|
||||
raise ValueError("Only 'entropy' or 'percentile' method are supported")
|
||||
|
||||
def compute_percentile(self):
|
||||
if self.percentile < 0 or self.percentile > 100:
|
||||
|
@ -591,7 +693,7 @@ class HistogramCollector(CalibrationDataCollector):
|
|||
histogram_dict = self.histogram_dict
|
||||
percentile = self.percentile
|
||||
|
||||
thresholds_dict = {} # per tensor thresholds
|
||||
thresholds_dict = {} # per tensor thresholds
|
||||
|
||||
print("Number of tensors : {}".format(len(histogram_dict)))
|
||||
print("Number of histogram bins : {}".format(self.num_bins))
|
||||
|
@ -601,15 +703,21 @@ class HistogramCollector(CalibrationDataCollector):
|
|||
hist = histogram[0]
|
||||
hist_edges = histogram[1]
|
||||
total = hist.sum()
|
||||
cdf = np.cumsum(hist/total)
|
||||
cdf = np.cumsum(hist / total)
|
||||
if self.symmetric:
|
||||
idx_right = np.searchsorted(cdf, percentile / 100.0)
|
||||
thresholds_dict[tensor] = (-float(hist_edges[idx_right]), float(hist_edges[idx_right]))
|
||||
thresholds_dict[tensor] = (
|
||||
-float(hist_edges[idx_right]),
|
||||
float(hist_edges[idx_right]),
|
||||
)
|
||||
else:
|
||||
percent_to_cut_one_side = (100.0 - percentile) / 200.0
|
||||
idx_right = np.searchsorted(cdf, 1.0 - percent_to_cut_one_side)
|
||||
idx_left = np.searchsorted(cdf, percent_to_cut_one_side)
|
||||
thresholds_dict[tensor] = (float(hist_edges[idx_left]), float(hist_edges[idx_right]))
|
||||
thresholds_dict[tensor] = (
|
||||
float(hist_edges[idx_left]),
|
||||
float(hist_edges[idx_right]),
|
||||
)
|
||||
|
||||
# Plot histogram for debug only
|
||||
if False:
|
||||
|
@ -621,10 +729,14 @@ class HistogramCollector(CalibrationDataCollector):
|
|||
histogram_dict = self.histogram_dict
|
||||
num_quantized_bins = self.num_quantized_bins
|
||||
|
||||
thresholds_dict = {} # per tensor thresholds
|
||||
thresholds_dict = {} # per tensor thresholds
|
||||
|
||||
print("Number of tensors : {}".format(len(histogram_dict)))
|
||||
print("Number of histogram bins : {} (The number may increase depends on the data it collects)".format(self.num_bins))
|
||||
print(
|
||||
"Number of histogram bins : {} (The number may increase depends on the data it collects)".format(
|
||||
self.num_bins
|
||||
)
|
||||
)
|
||||
print("Number of quantized bins : {}".format(self.num_quantized_bins))
|
||||
|
||||
for tensor, histogram in histogram_dict.items():
|
||||
|
@ -643,17 +755,18 @@ class HistogramCollector(CalibrationDataCollector):
|
|||
`q` is a truncated version of the original distribution.
|
||||
Ref: http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf
|
||||
"""
|
||||
from scipy.stats import entropy
|
||||
import copy
|
||||
|
||||
from scipy.stats import entropy
|
||||
|
||||
hist = histogram[0]
|
||||
hist_edges = histogram[1]
|
||||
num_bins = hist.size
|
||||
zero_bin_index = num_bins // 2
|
||||
num_half_quantized_bin = num_quantized_bins // 2
|
||||
|
||||
|
||||
kl_divergence = np.zeros(zero_bin_index - num_half_quantized_bin + 1)
|
||||
thresholds = [(0, 0) for i in range(kl_divergence.size)]
|
||||
thresholds = [(0, 0) for i in range(kl_divergence.size)]
|
||||
|
||||
# <------------ num bins ---------------->
|
||||
# <--- quantized bins ---->
|
||||
|
@ -670,33 +783,36 @@ class HistogramCollector(CalibrationDataCollector):
|
|||
# start index end index (end of iteration)
|
||||
|
||||
for i in range(num_half_quantized_bin, zero_bin_index + 1, 1):
|
||||
start_index = zero_bin_index - i
|
||||
start_index = zero_bin_index - i
|
||||
end_index = zero_bin_index + i + 1 if (zero_bin_index + i + 1) <= num_bins else num_bins
|
||||
|
||||
thresholds[i - num_half_quantized_bin] = (float(hist_edges[start_index]), float(hist_edges[end_index]))
|
||||
thresholds[i - num_half_quantized_bin] = (
|
||||
float(hist_edges[start_index]),
|
||||
float(hist_edges[end_index]),
|
||||
)
|
||||
|
||||
sliced_distribution = copy.deepcopy(hist[start_index:end_index])
|
||||
|
||||
# reference distribution p
|
||||
p = sliced_distribution.copy() # a copy of np array
|
||||
left_outliers_count = sum(hist[:start_index])
|
||||
p = sliced_distribution.copy() # a copy of np array
|
||||
left_outliers_count = sum(hist[:start_index])
|
||||
right_outliers_count = sum(hist[end_index:])
|
||||
p[0] += left_outliers_count
|
||||
p[-1] += right_outliers_count
|
||||
|
||||
# nonzeros[i] indicates whether p[i] is non-zero
|
||||
nonzeros = (p != 0).astype(np.int64)
|
||||
|
||||
# quantize p.size bins into quantized bins (default 128 bins)
|
||||
|
||||
# quantize p.size bins into quantized bins (default 128 bins)
|
||||
quantized_bins = np.zeros(num_quantized_bins, dtype=np.int64)
|
||||
num_merged_bins = sliced_distribution.size // num_quantized_bins
|
||||
|
||||
# merge bins into quantized bins
|
||||
for index in range(num_quantized_bins):
|
||||
start = index * num_merged_bins
|
||||
start = index * num_merged_bins
|
||||
end = start + num_merged_bins
|
||||
quantized_bins[index] = sum(sliced_distribution[start:end])
|
||||
quantized_bins[-1] += sum(sliced_distribution[num_quantized_bins * num_merged_bins:])
|
||||
quantized_bins[index] = sum(sliced_distribution[start:end])
|
||||
quantized_bins[-1] += sum(sliced_distribution[num_quantized_bins * num_merged_bins :])
|
||||
|
||||
# in order to compare p and q, we need to make length of q equals to length of p
|
||||
# expand quantized bins into p.size bins
|
||||
|
@ -708,63 +824,71 @@ class HistogramCollector(CalibrationDataCollector):
|
|||
norm = sum(nonzeros[start:end])
|
||||
if norm != 0:
|
||||
q[start:end] = float(quantized_bins[index]) / float(norm)
|
||||
|
||||
|
||||
p = smooth_distribution(p)
|
||||
q = smooth_distribution(q)
|
||||
|
||||
if isinstance(q, np.ndarray):
|
||||
kl_divergence[i - num_half_quantized_bin] = entropy(p, q)
|
||||
else:
|
||||
kl_divergence[i - num_half_quantized_bin] = float('inf')
|
||||
kl_divergence[i - num_half_quantized_bin] = float("inf")
|
||||
|
||||
min_kl_divergence_idx = np.argmin(kl_divergence)
|
||||
optimal_threshold = thresholds[min_kl_divergence_idx]
|
||||
optimal_threshold = thresholds[min_kl_divergence_idx]
|
||||
|
||||
return optimal_threshold
|
||||
|
||||
|
||||
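For reference, the quantity minimized in the loop above is the Kullback-Leibler divergence between the reference distribution p and its quantized reconstruction q, KL(p || q) = sum_i p[i] * log(p[i] / q[i]); scipy's entropy() computes exactly this when given two distributions. A toy illustration with made-up values:

import numpy as np
from scipy.stats import entropy

p = np.array([0.1, 0.4, 0.4, 0.1])
q = np.array([0.25, 0.25, 0.25, 0.25])
kl = entropy(p, q)  # sum of p[i] * log(p[i] / q[i]) after normalization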
def create_calibrator(model,
|
||||
op_types_to_calibrate=[],
|
||||
augmented_model_path='augmented_model.onnx',
|
||||
calibrate_method=CalibrationMethod.MinMax,
|
||||
use_external_data_format=False,
|
||||
extra_options={}):
|
||||
def create_calibrator(
|
||||
model,
|
||||
op_types_to_calibrate=[],
|
||||
augmented_model_path="augmented_model.onnx",
|
||||
calibrate_method=CalibrationMethod.MinMax,
|
||||
use_external_data_format=False,
|
||||
extra_options={},
|
||||
):
|
||||
|
||||
if calibrate_method == CalibrationMethod.MinMax:
|
||||
# default settings for min-max algorithm
|
||||
symmetric = False if 'symmetric' not in extra_options else extra_options['symmetric']
|
||||
moving_average = False if 'moving_average' not in extra_options else extra_options['moving_average']
|
||||
averaging_constant = 0.01 if 'averaging_constant' not in extra_options else extra_options['averaging_constant']
|
||||
symmetric = False if "symmetric" not in extra_options else extra_options["symmetric"]
|
||||
moving_average = False if "moving_average" not in extra_options else extra_options["moving_average"]
|
||||
averaging_constant = 0.01 if "averaging_constant" not in extra_options else extra_options["averaging_constant"]
|
||||
return MinMaxCalibrater(
|
||||
model, op_types_to_calibrate, augmented_model_path,
|
||||
model,
|
||||
op_types_to_calibrate,
|
||||
augmented_model_path,
|
||||
use_external_data_format=use_external_data_format,
|
||||
symmetric=symmetric,
|
||||
moving_average=moving_average,
|
||||
averaging_constant=averaging_constant
|
||||
averaging_constant=averaging_constant,
|
||||
)
|
||||
elif calibrate_method == CalibrationMethod.Entropy:
|
||||
# default settings for entropy algorithm
|
||||
num_bins = 128 if 'num_bins' not in extra_options else extra_options['num_bins']
|
||||
num_quantized_bins = 128 if 'num_quantized_bins' not in extra_options else extra_options['num_quantized_bins']
|
||||
symmetric = False if 'symmetric' not in extra_options else extra_options['symmetric']
|
||||
num_bins = 128 if "num_bins" not in extra_options else extra_options["num_bins"]
|
||||
num_quantized_bins = 128 if "num_quantized_bins" not in extra_options else extra_options["num_quantized_bins"]
|
||||
symmetric = False if "symmetric" not in extra_options else extra_options["symmetric"]
|
||||
return EntropyCalibrater(
|
||||
model, op_types_to_calibrate, augmented_model_path,
|
||||
model,
|
||||
op_types_to_calibrate,
|
||||
augmented_model_path,
|
||||
use_external_data_format=use_external_data_format,
|
||||
symmetric=symmetric,
|
||||
num_bins=num_bins,
|
||||
num_quantized_bins=num_quantized_bins
|
||||
num_quantized_bins=num_quantized_bins,
|
||||
)
|
||||
elif calibrate_method == CalibrationMethod.Percentile:
|
||||
# default settings for percentile algorithm
|
||||
num_bins = 2048 if 'num_bins' not in extra_options else extra_options['num_bins']
|
||||
percentile = 99.999 if 'percentile' not in extra_options else extra_options['percentile']
|
||||
symmetric = True if 'symmetric' not in extra_options else extra_options['symmetric']
|
||||
num_bins = 2048 if "num_bins" not in extra_options else extra_options["num_bins"]
|
||||
percentile = 99.999 if "percentile" not in extra_options else extra_options["percentile"]
|
||||
symmetric = True if "symmetric" not in extra_options else extra_options["symmetric"]
|
||||
return PercentileCalibrater(
|
||||
model, op_types_to_calibrate, augmented_model_path,
|
||||
model,
|
||||
op_types_to_calibrate,
|
||||
augmented_model_path,
|
||||
use_external_data_format=use_external_data_format,
|
||||
symmetric=symmetric,
|
||||
num_bins=num_bins,
|
||||
percentile=percentile
|
||||
percentile=percentile,
|
||||
)
|
||||
|
||||
raise ValueError('Unsupported calibration method {}'.format(calibrate_method))
|
||||
raise ValueError("Unsupported calibration method {}".format(calibrate_method))
|
||||
|
|
|
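Putting the pieces together, a typical calibration flow through the factory above could look like the hedged sketch below; the data reader, input name, and model path are placeholders, and the public symbols are those re-exported by onnxruntime.quantization, as shown earlier in the quantization __init__ diff:

import numpy as np

from onnxruntime.quantization import (
    CalibrationDataReader,
    CalibrationMethod,
    create_calibrator,
    write_calibration_table,
)


class RandomDataReader(CalibrationDataReader):
    # Placeholder reader: yields a few random NCHW batches for a single input named "input".
    def __init__(self, num_batches=8):
        self._batches = iter(np.random.rand(num_batches, 1, 3, 224, 224).astype(np.float32))

    def get_next(self):
        batch = next(self._batches, None)
        return None if batch is None else {"input": batch}


calibrator = create_calibrator(
    "model.onnx",  # placeholder model path
    calibrate_method=CalibrationMethod.MinMax,
    extra_options={"symmetric": True},
)
calibrator.collect_data(RandomDataReader())
ranges = calibrator.compute_range()  # {tensor name: (min value, max value)}
write_calibration_table(ranges)  # persist the computed ranges as a calibration table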
@ -1,8 +1,11 @@
|
|||
import onnx
|
||||
import itertools
|
||||
from .quant_utils import find_by_name, attribute_to_kwarg
|
||||
from pathlib import Path
|
||||
|
||||
import onnx
|
||||
|
||||
from .quant_utils import attribute_to_kwarg, find_by_name
|
||||
|
||||
|
||||
class ONNXModel:
|
||||
def __init__(self, model):
|
||||
self.model = model
|
||||
|
@ -121,19 +124,19 @@ class ONNXModel:
|
|||
return output_name_to_node[input]
|
||||
|
||||
def find_node_by_name(self, node_name, new_nodes_list, graph):
|
||||
'''
|
||||
"""
|
||||
Find out if a node exists in a graph or a node is in the
|
||||
new set of nodes created during quantization. Return the node found.
|
||||
'''
|
||||
graph_nodes_list = list(graph.node) #deep copy
|
||||
"""
|
||||
graph_nodes_list = list(graph.node) # deep copy
|
||||
graph_nodes_list.extend(new_nodes_list)
|
||||
node = find_by_name(node_name, graph_nodes_list)
|
||||
return node
|
||||
|
||||
def find_nodes_by_initializer(self, graph, initializer):
|
||||
'''
|
||||
"""
|
||||
Find all nodes with given initializer as an input.
|
||||
'''
|
||||
"""
|
||||
nodes = []
|
||||
for node in graph.node:
|
||||
for node_input in node.input:
|
||||
|
@ -174,19 +177,19 @@ class ONNXModel:
|
|||
kwargs.update(kv)
|
||||
node = onnx.helper.make_node(node.op_type, node.input, node.output, name=node.name, **kwargs)
|
||||
|
||||
if node.op_type == 'Gemm':
|
||||
if node.op_type == "Gemm":
|
||||
alpha = 1.0
|
||||
beta = 1.0
|
||||
transA = 0
|
||||
transB = 0
|
||||
for attr in node.attribute:
|
||||
if attr.name == 'alpha':
|
||||
if attr.name == "alpha":
|
||||
alpha = onnx.helper.get_attribute_value(attr)
|
||||
elif attr.name == 'beta':
|
||||
elif attr.name == "beta":
|
||||
beta = onnx.helper.get_attribute_value(attr)
|
||||
elif attr.name == 'transA':
|
||||
elif attr.name == "transA":
|
||||
transA = onnx.helper.get_attribute_value(attr)
|
||||
elif attr.name == 'transB':
|
||||
elif attr.name == "transB":
|
||||
transB = onnx.helper.get_attribute_value(attr)
|
||||
if alpha == 1.0 and beta == 1.0 and transA == 0:
|
||||
inputB = node.input[1]
|
||||
|
@ -204,25 +207,30 @@ class ONNXModel:
|
|||
break
|
||||
Bs_graph.initializer.extend([B_trans])
|
||||
else:
|
||||
inputB += '_Transposed'
|
||||
transpose_node = onnx.helper.make_node('Transpose',
|
||||
inputs=[node.input[1]],
|
||||
outputs=[inputB],
|
||||
name=node.name + '_Transpose' if node.name != "" else "")
|
||||
inputB += "_Transposed"
|
||||
transpose_node = onnx.helper.make_node(
|
||||
"Transpose",
|
||||
inputs=[node.input[1]],
|
||||
outputs=[inputB],
|
||||
name=node.name + "_Transpose" if node.name != "" else "",
|
||||
)
|
||||
new_nodes.append(transpose_node)
|
||||
|
||||
matmul_node = onnx.helper.make_node(
|
||||
'MatMul',
|
||||
"MatMul",
|
||||
inputs=[node.input[0], inputB],
|
||||
outputs=[node.output[0] + ('_MatMul' if len(node.input) > 2 else '')],
|
||||
name=node.name + '_MatMul' if node.name != "" else "")
|
||||
outputs=[node.output[0] + ("_MatMul" if len(node.input) > 2 else "")],
|
||||
name=node.name + "_MatMul" if node.name != "" else "",
|
||||
)
|
||||
new_nodes.append(matmul_node)
|
||||
|
||||
if len(node.input) > 2:
|
||||
add_node = onnx.helper.make_node('Add',
|
||||
inputs=[node.output[0] + '_MatMul', node.input[2]],
|
||||
outputs=node.output,
|
||||
name=node.name + '_Add' if node.name != "" else "")
|
||||
add_node = onnx.helper.make_node(
|
||||
"Add",
|
||||
inputs=[node.output[0] + "_MatMul", node.input[2]],
|
||||
outputs=node.output,
|
||||
name=node.name + "_Add" if node.name != "" else "",
|
||||
)
|
||||
new_nodes.append(add_node)
|
||||
|
||||
# unsupported
|
||||
|
@ -233,7 +241,7 @@ class ONNXModel:
|
|||
else:
|
||||
new_nodes.append(node)
|
||||
|
||||
graph.ClearField('node')
|
||||
graph.ClearField("node")
|
||||
graph.node.extend(new_nodes)
|
||||
graph_path.pop()
|
||||
return graph
|
||||
|
@ -243,14 +251,16 @@ class ONNXModel:
|
|||
ONNXModel.__replace_gemm_with_matmul(graph_path)
|
||||
|
||||
def save_model_to_file(self, output_path, use_external_data_format=False):
|
||||
'''
|
||||
"""
|
||||
Save model to external data, which is needed for model size > 2GB
|
||||
'''
|
||||
"""
|
||||
self.topological_sort()
|
||||
if use_external_data_format:
|
||||
onnx.external_data_helper.convert_model_to_external_data(self.model,
|
||||
all_tensors_to_one_file=True,
|
||||
location=Path(output_path).name + ".data")
|
||||
onnx.external_data_helper.convert_model_to_external_data(
|
||||
self.model,
|
||||
all_tensors_to_one_file=True,
|
||||
location=Path(output_path).name + ".data",
|
||||
)
|
||||
onnx.save_model(self.model, output_path)
|
||||
|
||||
@staticmethod
|
||||
|
@ -278,12 +288,15 @@ class ONNXModel:
|
|||
def remove_unused_constant(self):
|
||||
input_name_to_nodes = self.input_name_to_nodes()
|
||||
|
||||
#remove unused constant
|
||||
# remove unused constant
|
||||
unused_nodes = []
|
||||
nodes = self.nodes()
|
||||
for node in nodes:
|
||||
if node.op_type == "Constant" and not self.is_graph_output(
|
||||
node.output[0]) and node.output[0] not in input_name_to_nodes:
|
||||
if (
|
||||
node.op_type == "Constant"
|
||||
and not self.is_graph_output(node.output[0])
|
||||
and node.output[0] not in input_name_to_nodes
|
||||
):
|
||||
unused_nodes.append(node)
|
||||
|
||||
self.remove_nodes(unused_nodes)
|
||||
|
@ -308,13 +321,13 @@ class ONNXModel:
# TODO:use OnnxModel.graph_topological_sort(self.model.graph) from transformers.onnx_model
# Currently it breaks Openvino/Linux training gpu pipeline so hold off for 1.8 release
def topological_sort(self):
deps_count = [0]*len(self.nodes()) # dependency count of each node
deps_to_nodes = {} # input to node indice
deps_count = [0] * len(self.nodes()) # dependency count of each node
deps_to_nodes = {} # input to node indice
sorted_nodes = [] # initialize sorted_nodes
for node_idx, node in enumerate(self.nodes()):
# CANNOT use len(node.input) directly because input can be optional
deps_count[node_idx] = sum(1 for _ in node.input if _ )
if deps_count[node_idx] == 0: # Constant doesn't depend on any inputs
deps_count[node_idx] = sum(1 for _ in node.input if _)
if deps_count[node_idx] == 0: # Constant doesn't depend on any inputs
sorted_nodes.append(self.nodes()[node_idx])
continue
@ -353,6 +366,6 @@ class ONNXModel:
end = end + 1
start = start + 1

assert(end == len(self.graph().node)), "Graph is not a DAG"
self.graph().ClearField('node')
self.graph().node.extend(sorted_nodes)
assert end == len(self.graph().node), "Graph is not a DAG"
self.graph().ClearField("node")
self.graph().node.extend(sorted_nodes)
Diff not shown because of its large size.
@ -1,2 +1,2 @@
#from .base_operator import QuantOperatorBase
#from .matmul import MatMulInteger
# from .base_operator import QuantOperatorBase
# from .matmul import MatMulInteger
@ -1,8 +1,9 @@
|
|||
import onnx
|
||||
from onnx import onnx_pb as onnx_proto
|
||||
|
||||
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
|
||||
from .base_operator import QuantOperatorBase
|
||||
from .qdq_base_operator import QDQOperatorBase
|
||||
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
|
||||
from onnx import onnx_pb as onnx_proto
|
||||
|
||||
|
||||
class QLinearActivation(QuantOperatorBase):
|
||||
|
@ -11,7 +12,7 @@ class QLinearActivation(QuantOperatorBase):
|
|||
|
||||
def QuantizeClipRelu(self):
|
||||
node = self.node
|
||||
assert (node.op_type == "Relu" or node.op_type == 'Clip')
|
||||
assert node.op_type == "Relu" or node.op_type == "Clip"
|
||||
|
||||
# When mode is QLinearOps, the output quantization params are calculated based on outputs from
|
||||
# activation nodes, therefore these nodes can be removed from the graph if they follow a quantized op.
|
||||
|
@ -25,22 +26,34 @@ class QLinearActivation(QuantOperatorBase):
|
|||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
if node.op_type == "Relu" or node.op_type == 'Clip':
|
||||
if node.op_type == "Relu" or node.op_type == "Clip":
|
||||
self.QuantizeClipRelu()
|
||||
return
|
||||
|
||||
nnapi_sigmoid_option = 'extra.Sigmoid.nnapi'
|
||||
sigmoid_nnapi_mode = (node.op_type == 'Sigmoid' and
|
||||
nnapi_sigmoid_option in self.quantizer.extra_options and
|
||||
self.quantizer.extra_options[nnapi_sigmoid_option])
|
||||
nnapi_sigmoid_option = "extra.Sigmoid.nnapi"
|
||||
sigmoid_nnapi_mode = (
|
||||
node.op_type == "Sigmoid"
|
||||
and nnapi_sigmoid_option in self.quantizer.extra_options
|
||||
and self.quantizer.extra_options[nnapi_sigmoid_option]
|
||||
)
|
||||
use_scale = 1 / 256.0 if sigmoid_nnapi_mode else None
|
||||
use_zeropoint = 0 if sigmoid_nnapi_mode else None
|
||||
|
||||
# No assert on op_type as it is controlled by registry
|
||||
# only try to quantize when given quantization parameters for it
|
||||
data_found, output_scale_name, output_zp_name, _, _ = \
|
||||
self.quantizer._get_quantization_params(node.output[0], use_scale, use_zeropoint)
|
||||
quantized_input_names, zero_point_names, scale_names, nodes = self.quantizer.quantize_inputs(node, [0])
|
||||
(
|
||||
data_found,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
_,
|
||||
_,
|
||||
) = self.quantizer._get_quantization_params(node.output[0], use_scale, use_zeropoint)
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_inputs(node, [0])
|
||||
if not data_found or quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
|
@ -54,15 +67,29 @@ class QLinearActivation(QuantOperatorBase):
|
|||
kwargs["domain"] = ms_domain
|
||||
|
||||
qlinear_activation_inputs = [
|
||||
quantized_input_names[0], scale_names[0], zero_point_names[0], output_scale_name, output_zp_name
|
||||
quantized_input_names[0],
|
||||
scale_names[0],
|
||||
zero_point_names[0],
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
]
|
||||
|
||||
qlinear_activation_node = onnx.helper.make_node("QLinear" + node.op_type, qlinear_activation_inputs,
|
||||
[qlinear_activation_output], qlinear_activation_name, **kwargs)
|
||||
qlinear_activation_node = onnx.helper.make_node(
|
||||
"QLinear" + node.op_type,
|
||||
qlinear_activation_inputs,
|
||||
[qlinear_activation_output],
|
||||
qlinear_activation_name,
|
||||
**kwargs
|
||||
)
|
||||
|
||||
# Create an entry for this quantized value
|
||||
q_output = QuantizedValue(node.output[0], qlinear_activation_output, output_scale_name, output_zp_name,
|
||||
QuantizedValueType.Input)
|
||||
q_output = QuantizedValue(
|
||||
node.output[0],
|
||||
qlinear_activation_output,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = q_output
|
||||
|
||||
nodes.append(qlinear_activation_node)
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
from .base_operator import QuantOperatorBase
|
||||
|
||||
|
||||
# Use the quantized tensor as input without DQ.
|
||||
class QArgMax(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
|
@ -14,4 +15,4 @@ class QArgMax(QuantOperatorBase):
|
|||
return
|
||||
|
||||
node.input[0] = quantized_input_value.q_name
|
||||
self.quantizer.new_nodes += [node]
|
||||
self.quantizer.new_nodes += [node]
|
||||
|
|
|
@ -1,10 +1,12 @@
|
|||
import onnx
|
||||
from .base_operator import QuantOperatorBase
|
||||
from ..quant_utils import attribute_to_kwarg, ms_domain
|
||||
from onnx import onnx_pb as onnx_proto
|
||||
'''
|
||||
|
||||
from ..quant_utils import attribute_to_kwarg, ms_domain
|
||||
from .base_operator import QuantOperatorBase
|
||||
|
||||
"""
|
||||
Quantize Attention
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
class AttentionQuant(QuantOperatorBase):
|
||||
|
@ -12,23 +14,27 @@ class AttentionQuant(QuantOperatorBase):
|
|||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
'''
|
||||
parameter node: Attention node.
|
||||
parameter new_nodes_list: List of new nodes created before processing this node.
|
||||
return: a list of nodes in topological order that represents quantized Attention node.
|
||||
'''
|
||||
"""
|
||||
parameter node: Attention node.
|
||||
parameter new_nodes_list: List of new nodes created before processing this node.
|
||||
return: a list of nodes in topological order that represents quantized Attention node.
|
||||
"""
|
||||
node = self.node
|
||||
assert (node.op_type == "Attention")
|
||||
assert node.op_type == "Attention"
|
||||
|
||||
# TODO This is a temporary fix to stop exporting QAttention with qkv_hidden_sizes
|
||||
# attribute. This needs to be removed once the QAttention for varied q,k,v sizes
|
||||
# is implemented
|
||||
for attr in node.attribute:
|
||||
if 'qkv_hidden_sizes' == attr.name:
|
||||
if "qkv_hidden_sizes" == attr.name:
|
||||
return super().quantize()
|
||||
|
||||
(quantized_input_names, zero_point_names, scale_names, nodes) = \
|
||||
self.quantizer.quantize_inputs(node, [0, 1], reduce_range=True, op_level_per_channel=True)
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_inputs(node, [0, 1], reduce_range=True, op_level_per_channel=True)
|
||||
if quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
|
|
|
@ -4,13 +4,13 @@ class QuantOperatorBase:
|
|||
self.node = onnx_node
|
||||
|
||||
def quantize(self):
|
||||
'''
|
||||
"""
|
||||
Given a node which does not support quantization, this method checks whether the input to
|
||||
this node is quantized and adds a DequantizeLinear node to dequantize this input back to FP32
|
||||
parameter node: Current node
|
||||
parameter new_nodes_list: List of new nodes created before processing current node
|
||||
return: List of new nodes created
|
||||
'''
|
||||
"""
|
||||
nodes = []
|
||||
for index, node_input in enumerate(self.node.input):
|
||||
dequantize_node = self.quantizer._dequantize_value(node_input)
|
||||
|
@ -18,4 +18,4 @@ class QuantOperatorBase:
|
|||
self.quantizer.new_nodes.append(dequantize_node)
|
||||
|
||||
# Append the original node
|
||||
self.quantizer.new_nodes.append(self.node)
|
||||
self.quantizer.new_nodes.append(self.node)
|
||||
|
|
|
@ -1,8 +1,9 @@
|
|||
import onnx
|
||||
from .base_operator import QuantOperatorBase
|
||||
from ..quant_utils import attribute_to_kwarg, ms_domain, QuantizedValue, QuantizedValueType
|
||||
from onnx import onnx_pb as onnx_proto
|
||||
|
||||
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
|
||||
from .base_operator import QuantOperatorBase
|
||||
|
||||
|
||||
class QLinearBinaryOp(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
|
@ -11,10 +12,19 @@ class QLinearBinaryOp(QuantOperatorBase):
|
|||
def quantize(self):
|
||||
node = self.node
|
||||
|
||||
data_found, output_scale_name, output_zp_name, _, _ = \
|
||||
self.quantizer._get_quantization_params(node.output[0])
|
||||
(quantized_input_names, zero_point_names, scale_names, nodes) = \
|
||||
self.quantizer.quantize_inputs(node, [0, 1], initializer_use_weight_qType=False)
|
||||
(
|
||||
data_found,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
_,
|
||||
_,
|
||||
) = self.quantizer._get_quantization_params(node.output[0])
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_inputs(node, [0, 1], initializer_use_weight_qType=False)
|
||||
if not data_found or quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
|
@ -40,14 +50,23 @@ class QLinearBinaryOp(QuantOperatorBase):
|
|||
qlinear_binary_math_inputs.append(output_scale_name)
|
||||
qlinear_binary_math_inputs.append(output_zp_name)
|
||||
|
||||
qlinear_binary_math_node = onnx.helper.make_node("QLinear" + node.op_type, qlinear_binary_math_inputs,
|
||||
[qlinear_binary_math_output], qlinear_binary_math_name,
|
||||
**kwargs)
|
||||
qlinear_binary_math_node = onnx.helper.make_node(
|
||||
"QLinear" + node.op_type,
|
||||
qlinear_binary_math_inputs,
|
||||
[qlinear_binary_math_output],
|
||||
qlinear_binary_math_name,
|
||||
**kwargs
|
||||
)
|
||||
nodes.append(qlinear_binary_math_node)
|
||||
|
||||
# Create an entry for this quantized value
|
||||
q_output = QuantizedValue(node.output[0], qlinear_binary_math_output, output_scale_name, output_zp_name,
|
||||
QuantizedValueType.Input)
|
||||
q_output = QuantizedValue(
|
||||
node.output[0],
|
||||
qlinear_binary_math_output,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = q_output
|
||||
|
||||
self.quantizer.new_nodes += nodes
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
import onnx
|
||||
|
||||
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
|
||||
from .base_operator import QuantOperatorBase
|
||||
from .qdq_base_operator import QDQOperatorBase
|
||||
from ..quant_utils import QuantizedValue, attribute_to_kwarg, ms_domain, QuantizedValueType
|
||||
|
||||
|
||||
class QLinearConcat(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
|
@ -10,18 +12,31 @@ class QLinearConcat(QuantOperatorBase):
|
|||
def quantize(self):
|
||||
node = self.node
|
||||
|
||||
data_found, output_scale_name, output_zp_name, _, _ = \
|
||||
self.quantizer._get_quantization_params(node.output[0])
|
||||
(q_input_names, zero_point_names, scale_names, nodes) = \
|
||||
self.quantizer.quantize_inputs(node, [*range(0, len(node.input))], initializer_use_weight_qType=False)
|
||||
(
|
||||
data_found,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
_,
|
||||
_,
|
||||
) = self.quantizer._get_quantization_params(node.output[0])
|
||||
(
|
||||
q_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_inputs(node, [*range(0, len(node.input))], initializer_use_weight_qType=False)
|
||||
if not data_found or q_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
# Create an entry for output quantized value
|
||||
quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]
|
||||
quantized_output_value = QuantizedValue(node.output[0], node.output[0] + "_quantized",
|
||||
output_scale_name, output_zp_name,
|
||||
quantized_input_value.value_type)
|
||||
quantized_output_value = QuantizedValue(
|
||||
node.output[0],
|
||||
node.output[0] + "_quantized",
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
quantized_input_value.value_type,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
|
||||
|
||||
kwargs = {}
|
||||
|
@ -33,11 +48,14 @@ class QLinearConcat(QuantOperatorBase):
|
|||
qlconcat_inputs = [output_scale_name, output_zp_name]
|
||||
for i in range(0, len(q_input_names)):
|
||||
qlconcat_inputs.extend([q_input_names[i], scale_names[i], zero_point_names[i]])
|
||||
qlconcat_node = onnx.helper.make_node("QLinearConcat", qlconcat_inputs, [quantized_output_value.q_name], qnode_name, **kwargs)
|
||||
qlconcat_node = onnx.helper.make_node(
|
||||
"QLinearConcat", qlconcat_inputs, [quantized_output_value.q_name], qnode_name, **kwargs
|
||||
)
|
||||
|
||||
self.quantizer.new_nodes += nodes
|
||||
self.quantizer.new_nodes += [qlconcat_node]
|
||||
|
||||
|
||||
class QDQConcat(QDQOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
|
|
@ -1,9 +1,17 @@
|
|||
import onnx
|
||||
import numpy as np
|
||||
import onnx
|
||||
from onnx import onnx_pb as onnx_proto
|
||||
|
||||
from ..quant_utils import (
|
||||
BiasToQuantize,
|
||||
QuantizedValue,
|
||||
QuantizedValueType,
|
||||
attribute_to_kwarg,
|
||||
find_by_name,
|
||||
get_mul_node,
|
||||
)
|
||||
from .base_operator import QuantOperatorBase
|
||||
from .qdq_base_operator import QDQOperatorBase
|
||||
from ..quant_utils import find_by_name, get_mul_node, QuantizedValue, QuantizedValueType, attribute_to_kwarg, BiasToQuantize
|
||||
from onnx import onnx_pb as onnx_proto
|
||||
|
||||
|
||||
class ConvInteger(QuantOperatorBase):
|
||||
|
@ -11,7 +19,7 @@ class ConvInteger(QuantOperatorBase):
|
|||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def add_bias(self, nodes, scaled_output):
|
||||
'''
|
||||
"""
|
||||
Given a node, this function handles bias add by adding a "reshape" node on bias and an "add" node
|
||||
parameter nodes: new nodes would be appended into nodes
|
||||
parameter node: current node (Conv)
|
||||
|
@ -19,7 +27,7 @@ class ConvInteger(QuantOperatorBase):
|
|||
parameter output: output of Conv
|
||||
parameter bias_name: bias of Conv
|
||||
return: the name of output
|
||||
'''
|
||||
"""
|
||||
node = self.node
|
||||
model = self.quantizer.model
|
||||
# Add tensors for the shape to be reshaped to
|
||||
|
@ -29,14 +37,15 @@ class ConvInteger(QuantOperatorBase):
|
|||
|
||||
# Add reshape for correct broadcase
|
||||
output = node.output[0]
|
||||
reshape_input_data = node.input[2] # bias of Conv
|
||||
reshape_input_data = node.input[2] # bias of Conv
|
||||
reshape_input_shape = output + "_bias_reshape_shape"
|
||||
reshape_output = output + "_bias_reshape_output"
|
||||
|
||||
shape = np.ones((len(weight.dims)), dtype=np.int64)
|
||||
shape[1] = -1
|
||||
init_shape = onnx.helper.make_tensor(reshape_input_shape, onnx_proto.TensorProto.INT64, [len(weight.dims)],
|
||||
shape)
|
||||
init_shape = onnx.helper.make_tensor(
|
||||
reshape_input_shape, onnx_proto.TensorProto.INT64, [len(weight.dims)], shape
|
||||
)
|
||||
model.add_initializer(init_shape)
|
||||
|
||||
reshape_node = onnx.helper.make_node("Reshape", [reshape_input_data, reshape_input_shape], [reshape_output])
|
||||
|
@ -48,10 +57,14 @@ class ConvInteger(QuantOperatorBase):
|
|||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert (node.op_type == "Conv")
|
||||
assert node.op_type == "Conv"
|
||||
|
||||
(quantized_input_names, zero_point_names, scale_names, nodes) = \
|
||||
self.quantizer.quantize_inputs(node, [0, 1], reduce_range=self.quantizer.reduce_range)
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_inputs(node, [0, 1], reduce_range=self.quantizer.reduce_range)
|
||||
|
||||
conv_integer_output = node.output[0] + "_output_quantized"
|
||||
conv_integer_name = node.name + "_quant" if node.name != "" else ""
|
||||
|
@ -59,19 +72,24 @@ class ConvInteger(QuantOperatorBase):
|
|||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
kwargs.update(attribute_to_kwarg(attribute))
|
||||
conv_integer_node = onnx.helper.make_node("ConvInteger", quantized_input_names + zero_point_names,
|
||||
[conv_integer_output], conv_integer_name, **kwargs)
|
||||
conv_integer_node = onnx.helper.make_node(
|
||||
"ConvInteger", quantized_input_names + zero_point_names, [conv_integer_output], conv_integer_name, **kwargs
|
||||
)
|
||||
nodes.append(conv_integer_node)
|
||||
|
||||
# Add cast operation to cast convInteger output to float.
|
||||
cast_op_output = conv_integer_output + "_cast_output"
|
||||
cast_node = onnx.helper.make_node("Cast", [conv_integer_output], [cast_op_output],
|
||||
conv_integer_output + "_cast",
|
||||
to=onnx_proto.TensorProto.FLOAT)
|
||||
cast_node = onnx.helper.make_node(
|
||||
"Cast",
|
||||
[conv_integer_output],
|
||||
[cast_op_output],
|
||||
conv_integer_output + "_cast",
|
||||
to=onnx_proto.TensorProto.FLOAT,
|
||||
)
|
||||
nodes.append(cast_node)
|
||||
|
||||
# Add mul operation to multiply scales of two inputs.
|
||||
assert (len(scale_names) == 2)
|
||||
assert len(scale_names) == 2
|
||||
if conv_integer_name != "":
|
||||
scales_mul_op = conv_integer_name + "_scales_mul"
|
||||
else:
|
||||
|
@ -90,7 +108,13 @@ class ConvInteger(QuantOperatorBase):
|
|||
# Add mul operation to multiply mul_scales_op result with output of ConvInteger
|
||||
# and make the output of this node the same as output of original conv node.
|
||||
output_scale_mul_op = conv_integer_name + "_output_scale_mul" if conv_integer_name != "" else ""
|
||||
nodes.append(get_mul_node([cast_op_output, scales_mul_op_output], scaled_output_name, output_scale_mul_op))
|
||||
nodes.append(
|
||||
get_mul_node(
|
||||
[cast_op_output, scales_mul_op_output],
|
||||
scaled_output_name,
|
||||
output_scale_mul_op,
|
||||
)
|
||||
)
|
||||
|
||||
if has_bias:
|
||||
self.add_bias(nodes, scaled_output_name)
|
||||
|
@ -104,22 +128,36 @@ class QLinearConv(QuantOperatorBase):
|
|||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert (node.op_type == "Conv")
|
||||
assert node.op_type == "Conv"
|
||||
|
||||
data_found, output_scale_name, output_zp_name, _, _ = \
|
||||
self.quantizer._get_quantization_params(node.output[0])
|
||||
(
|
||||
data_found,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
_,
|
||||
_,
|
||||
) = self.quantizer._get_quantization_params(node.output[0])
|
||||
|
||||
if self.quantizer.is_input_a_weight(node.input[1]) and self.quantizer.is_per_channel():
|
||||
(quantized_input_names, zero_point_names, scale_names, nodes) = \
|
||||
self.quantizer.quantize_inputs(node, [0], reduce_range=self.quantizer.reduce_range)
|
||||
quant_weight_tuple = self.quantizer.quantize_weight_per_channel(node.input[1], onnx_proto.TensorProto.INT8,
|
||||
0)
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_inputs(node, [0], reduce_range=self.quantizer.reduce_range)
|
||||
quant_weight_tuple = self.quantizer.quantize_weight_per_channel(
|
||||
node.input[1], onnx_proto.TensorProto.INT8, 0
|
||||
)
|
||||
quantized_input_names.append(quant_weight_tuple[0])
|
||||
zero_point_names.append(quant_weight_tuple[1])
|
||||
scale_names.append(quant_weight_tuple[2])
|
||||
else:
|
||||
(quantized_input_names, zero_point_names, scale_names, nodes) = \
|
||||
self.quantizer.quantize_inputs(node, [0, 1], reduce_range=self.quantizer.reduce_range)
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_inputs(node, [0, 1], reduce_range=self.quantizer.reduce_range)
|
||||
|
||||
if not data_found or quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
@ -153,13 +191,19 @@ class QLinearConv(QuantOperatorBase):
|
|||
if bias_present:
|
||||
qlinear_conv_inputs.append(quantized_bias_name)
|
||||
|
||||
qlinear_conv_node = onnx.helper.make_node("QLinearConv", qlinear_conv_inputs, [qlinear_conv_output],
|
||||
qlinear_conv_name, **kwargs)
|
||||
qlinear_conv_node = onnx.helper.make_node(
|
||||
"QLinearConv", qlinear_conv_inputs, [qlinear_conv_output], qlinear_conv_name, **kwargs
|
||||
)
|
||||
nodes.append(qlinear_conv_node)
|
||||
|
||||
# Create an entry for this quantized value
|
||||
q_output = QuantizedValue(node.output[0], qlinear_conv_output, output_scale_name, output_zp_name,
|
||||
QuantizedValueType.Input)
|
||||
q_output = QuantizedValue(
|
||||
node.output[0],
|
||||
qlinear_conv_output,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = q_output
|
||||
|
||||
self.quantizer.new_nodes += nodes
|
||||
|
@ -171,7 +215,7 @@ class QDQConv(QDQOperatorBase):
|
|||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert (node.op_type == "Conv")
|
||||
assert node.op_type == "Conv"
|
||||
|
||||
self.quantizer.quantize_tensor(node.input[0])
|
||||
if not self.disable_qdq_for_node_output:
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
from ..quant_utils import QuantizedValue, QuantizedValueType
|
||||
from .base_operator import QuantOperatorBase
|
||||
from .qdq_base_operator import QDQOperatorBase
|
||||
from ..quant_utils import QuantizedValue, QuantizedValueType
|
||||
|
||||
|
||||
# For operators that support 8bits operations directly, and output could
|
||||
# reuse input[0]'s type, zeropoint, scale; For example,Transpose, Reshape, etc.
|
||||
|
@ -19,9 +20,13 @@ class Direct8BitOp(QuantOperatorBase):
|
|||
self.quantizer.new_nodes += [node]
|
||||
return
|
||||
|
||||
quantized_output_value = QuantizedValue(node.output[0], node.output[0] + "_quantized",
|
||||
quantized_input_value.scale_name, quantized_input_value.zp_name,
|
||||
quantized_input_value.value_type)
|
||||
quantized_output_value = QuantizedValue(
|
||||
node.output[0],
|
||||
node.output[0] + "_quantized",
|
||||
quantized_input_value.scale_name,
|
||||
quantized_input_value.zp_name,
|
||||
quantized_input_value.value_type,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
|
||||
|
||||
node.input[0] = quantized_input_value.q_name
|
||||
|
@ -30,19 +35,27 @@ class Direct8BitOp(QuantOperatorBase):
|
|||
|
||||
else:
|
||||
# Force quantize those ops if possible, use exclude node list if this is not you want
|
||||
if (not self.quantizer.is_valid_quantize_weight(node.input[0])):
|
||||
if not self.quantizer.is_valid_quantize_weight(node.input[0]):
|
||||
super().quantize()
|
||||
return
|
||||
|
||||
(quantized_input_names, zero_point_names, scale_names, nodes) = \
|
||||
self.quantizer.quantize_inputs(node, [0])
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_inputs(node, [0])
|
||||
if quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
# Create an entry for output quantized value
|
||||
quantized_output_value = QuantizedValue(node.output[0], node.output[0] + "_quantized",
|
||||
scale_names[0], zero_point_names[0],
|
||||
QuantizedValueType.Input)
|
||||
quantized_output_value = QuantizedValue(
|
||||
node.output[0],
|
||||
node.output[0] + "_quantized",
|
||||
scale_names[0],
|
||||
zero_point_names[0],
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
|
||||
|
||||
node.input[0] = quantized_input_names[0]
|
||||
|
@ -52,7 +65,6 @@ class Direct8BitOp(QuantOperatorBase):
|
|||
self.quantizer.new_nodes += nodes
|
||||
|
||||
|
||||
|
||||
class QDQDirect8BitOp(QDQOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
|
|
@ -1,28 +1,32 @@
|
|||
import onnx
|
||||
import logging
|
||||
from .base_operator import QuantOperatorBase
|
||||
from ..quant_utils import attribute_to_kwarg, ms_domain
|
||||
|
||||
import onnx
|
||||
from onnx import onnx_pb as onnx_proto
|
||||
|
||||
'''
|
||||
from ..quant_utils import attribute_to_kwarg, ms_domain
|
||||
from .base_operator import QuantOperatorBase
|
||||
|
||||
"""
|
||||
Quantizes the EmbedLayerNorm fused ONNXRuntime Op.
|
||||
|
||||
This Quant operator keeps the input and segment IDs at int32 but will quantize all initializer and
|
||||
weight inputs associated with the node to uint8.
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
class EmbedLayerNormalizationQuant(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert (node.op_type == "EmbedLayerNormalization")
|
||||
assert node.op_type == "EmbedLayerNormalization"
|
||||
|
||||
if len(node.output) > 2:
|
||||
logging.info(f"Quantization is not applied to {node.name} since it has 3 outputs")
|
||||
return super().quantize()
|
||||
|
||||
'''
|
||||
"""
|
||||
Pre-quantization EmbedLayerNorm inputs:
|
||||
[0] input_ids (int32)
|
||||
[1] segment_ids (int32)
|
||||
|
@ -32,15 +36,19 @@ class EmbedLayerNormalizationQuant(QuantOperatorBase):
|
|||
[5] gamma (float32)
|
||||
[6] beta (float32)
|
||||
[7] mask (int32) (optional)
|
||||
'''
|
||||
(quantized_input_names, zero_point_names, scale_names, nodes) = \
|
||||
self.quantizer.quantize_inputs(node, [2, 3, 4, 5, 6])
|
||||
"""
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_inputs(node, [2, 3, 4, 5, 6])
|
||||
if quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
qembed_layer_norm_name = "" if node.name == "" else node.name + "_quant"
|
||||
|
||||
'''
|
||||
"""
|
||||
Quantized Input Tensor List
|
||||
[0] input_ids (int32)
|
||||
[1] segment_ids (int32)
|
||||
|
@ -60,7 +68,7 @@ class EmbedLayerNormalizationQuant(QuantOperatorBase):
|
|||
[15] segment_embedding_zero_point (uint8)
|
||||
[16] gamma_zero_point (uint8)
|
||||
[17] beta_zero_point (uint8)
|
||||
'''
|
||||
"""
|
||||
inputs = []
|
||||
# 'input_ids'
|
||||
inputs.extend([node.input[0]])
|
||||
|
@ -98,8 +106,13 @@ class EmbedLayerNormalizationQuant(QuantOperatorBase):
|
|||
kwargs.update(attribute_to_kwarg(attribute))
|
||||
kwargs["domain"] = ms_domain
|
||||
|
||||
qembed_layer_norm_node = onnx.helper.make_node("QEmbedLayerNormalization", inputs, node.output,
|
||||
qembed_layer_norm_name, **kwargs)
|
||||
qembed_layer_norm_node = onnx.helper.make_node(
|
||||
"QEmbedLayerNormalization",
|
||||
inputs,
|
||||
node.output,
|
||||
qembed_layer_norm_name,
|
||||
**kwargs,
|
||||
)
|
||||
nodes.append(qembed_layer_norm_node)
|
||||
|
||||
self.quantizer.new_nodes += nodes
|
||||
|
|
|
@ -1,10 +1,12 @@
|
|||
import onnx
|
||||
from .base_operator import QuantOperatorBase
|
||||
from ..quant_utils import QuantizedValue, QuantizedValueType
|
||||
from onnx import onnx_pb as onnx_proto
|
||||
'''
|
||||
|
||||
from ..quant_utils import QuantizedValue, QuantizedValueType
|
||||
from .base_operator import QuantOperatorBase
|
||||
|
||||
"""
|
||||
Quantize Gather
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
class GatherQuant(QuantOperatorBase):
|
||||
|
@ -13,21 +15,30 @@ class GatherQuant(QuantOperatorBase):
|
|||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert (node.op_type == "Gather")
|
||||
if (not self.quantizer.is_valid_quantize_weight(node.input[0])):
|
||||
assert node.op_type == "Gather"
|
||||
if not self.quantizer.is_valid_quantize_weight(node.input[0]):
|
||||
super().quantize()
|
||||
return
|
||||
|
||||
(quantized_input_names, zero_point_names, scale_names, nodes) = \
|
||||
self.quantizer.quantize_inputs(node, [0])
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_inputs(node, [0])
|
||||
if quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
gather_new_output = node.output[0] + "_quantized"
|
||||
|
||||
# Create an entry for this quantized value
|
||||
q_output = QuantizedValue(node.output[0], gather_new_output, scale_names[0], zero_point_names[0],
|
||||
QuantizedValueType.Input)
|
||||
q_output = QuantizedValue(
|
||||
node.output[0],
|
||||
gather_new_output,
|
||||
scale_names[0],
|
||||
zero_point_names[0],
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = q_output
|
||||
|
||||
gather_original_output = node.output[0]
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import onnx
|
||||
|
||||
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
|
||||
from .base_operator import QuantOperatorBase
|
||||
from ..quant_utils import attribute_to_kwarg, ms_domain, QuantizedValue, QuantizedValueType
|
||||
|
||||
|
||||
class QGlobalAveragePool(QuantOperatorBase):
|
||||
|
@ -9,7 +10,7 @@ class QGlobalAveragePool(QuantOperatorBase):
|
|||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert (node.op_type == "GlobalAveragePool")
|
||||
assert node.op_type == "GlobalAveragePool"
|
||||
|
||||
# If input to this node is not quantized then keep this node.
|
||||
if node.input[0] not in self.quantizer.quantized_value_map:
|
||||
|
@ -19,13 +20,23 @@ class QGlobalAveragePool(QuantOperatorBase):
|
|||
|
||||
# Create an entry for output quantized value.
|
||||
quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]
|
||||
data_found, output_scale_name_from_parameter, output_zp_name_from_parameter, _, _ = \
|
||||
self.quantizer._get_quantization_params(node.output[0])
|
||||
(
|
||||
data_found,
|
||||
output_scale_name_from_parameter,
|
||||
output_zp_name_from_parameter,
|
||||
_,
|
||||
_,
|
||||
) = self.quantizer._get_quantization_params(node.output[0])
|
||||
# Just use input scale and zp if parameters for output is not specified.
|
||||
output_scale_name = output_scale_name_from_parameter if data_found else quantized_input_value.scale_name
|
||||
output_zp_name = output_zp_name_from_parameter if data_found else quantized_input_value.zp_name
|
||||
quantized_output_value = QuantizedValue(node.output[0], node.output[0] + "_quantized", output_scale_name,
|
||||
output_zp_name, QuantizedValueType.Input)
|
||||
quantized_output_value = QuantizedValue(
|
||||
node.output[0],
|
||||
node.output[0] + "_quantized",
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
|
||||
|
||||
kwargs = {}
|
||||
|
@ -35,8 +46,17 @@ class QGlobalAveragePool(QuantOperatorBase):
|
|||
kwargs["channels_last"] = 0
|
||||
qnode_name = node.name + "_quant" if node.name != "" else ""
|
||||
|
||||
qnode = onnx.helper.make_node("QLinear" + node.op_type, [
|
||||
quantized_input_value.q_name, quantized_input_value.scale_name, quantized_input_value.zp_name,
|
||||
output_scale_name, output_zp_name
|
||||
], [quantized_output_value.q_name], qnode_name, **kwargs)
|
||||
qnode = onnx.helper.make_node(
|
||||
"QLinear" + node.op_type,
|
||||
[
|
||||
quantized_input_value.q_name,
|
||||
quantized_input_value.scale_name,
|
||||
quantized_input_value.zp_name,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
],
|
||||
[quantized_output_value.q_name],
|
||||
qnode_name,
|
||||
**kwargs
|
||||
)
|
||||
self.quantizer.new_nodes += [qnode]
|
||||
|
|
|
@ -1,55 +1,76 @@
|
|||
import onnx
|
||||
import numpy as np
|
||||
import logging
|
||||
|
||||
import numpy as np
|
||||
import onnx
|
||||
from onnx import onnx_pb as onnx_proto
|
||||
|
||||
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, find_by_name, get_mul_node, ms_domain
|
||||
from .base_operator import QuantOperatorBase
|
||||
from .qdq_base_operator import QDQOperatorBase
|
||||
from ..quant_utils import find_by_name, get_mul_node, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
|
||||
from onnx import onnx_pb as onnx_proto
|
||||
|
||||
|
||||
def is_B_transposed(gemm_node):
|
||||
transB_attribute = [attr for attr in gemm_node.attribute if attr.name == 'transB']
|
||||
transB_attribute = [attr for attr in gemm_node.attribute if attr.name == "transB"]
|
||||
if len(transB_attribute):
|
||||
return 0 < onnx.helper.get_attribute_value(transB_attribute[0])
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def get_beta(gemm_node):
|
||||
beta_attribute = [attr for attr in gemm_node.attribute if attr.name == 'beta']
|
||||
beta_attribute = [attr for attr in gemm_node.attribute if attr.name == "beta"]
|
||||
if len(beta_attribute):
|
||||
return onnx.helper.get_attribute_value(beta_attribute[0])
|
||||
|
||||
return 1.0
|
||||
|
||||
|
||||
def set_default_beta(gemm_node):
|
||||
beta_attribute = [attr for attr in gemm_node.attribute if attr.name == 'beta']
|
||||
beta_attribute = [attr for attr in gemm_node.attribute if attr.name == "beta"]
|
||||
if len(beta_attribute):
|
||||
beta_attribute[0].f = 1.0
|
||||
|
||||
return 1.0
|
||||
|
||||
|
||||
class QLinearGemm(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert (node.op_type == "Gemm")
|
||||
assert node.op_type == "Gemm"
|
||||
|
||||
data_found, output_scale_name, output_zp_name, _, _ = \
|
||||
self.quantizer._get_quantization_params(node.output[0])
|
||||
(
|
||||
data_found,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
_,
|
||||
_,
|
||||
) = self.quantizer._get_quantization_params(node.output[0])
|
||||
|
||||
if self.quantizer.is_input_a_weight(node.input[1]) and self.quantizer.is_per_channel():
|
||||
(quantized_input_names, zero_point_names, scale_names, nodes) = \
|
||||
self.quantizer.quantize_inputs(node, [0], reduce_range=self.quantizer.reduce_range)
|
||||
quant_weight_tuple = self.quantizer.quantize_weight_per_channel(node.input[1], onnx_proto.TensorProto.INT8,
|
||||
0 if is_B_transposed(node) else 1)
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_inputs(node, [0], reduce_range=self.quantizer.reduce_range)
|
||||
quant_weight_tuple = self.quantizer.quantize_weight_per_channel(
|
||||
node.input[1],
|
||||
onnx_proto.TensorProto.INT8,
|
||||
0 if is_B_transposed(node) else 1,
|
||||
)
|
||||
quantized_input_names.append(quant_weight_tuple[0])
|
||||
zero_point_names.append(quant_weight_tuple[1])
|
||||
scale_names.append(quant_weight_tuple[2])
|
||||
else:
|
||||
(quantized_input_names, zero_point_names, scale_names, nodes) = \
|
||||
self.quantizer.quantize_inputs(node, [0, 1], reduce_range=self.quantizer.reduce_range)
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_inputs(node, [0, 1], reduce_range=self.quantizer.reduce_range)
|
||||
|
||||
if not data_found or quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
@ -59,7 +80,9 @@ class QLinearGemm(QuantOperatorBase):
|
|||
if not self.quantizer.is_input_a_weight(node.input[2]):
|
||||
return super().quantize()
|
||||
|
||||
quantized_bias_name = self.quantizer.quantize_bias_static(node.input[2], node.input[0], node.input[1], get_beta(self.node))
|
||||
quantized_bias_name = self.quantizer.quantize_bias_static(
|
||||
node.input[2], node.input[0], node.input[1], get_beta(self.node)
|
||||
)
|
||||
|
||||
qgemm_output = node.output[0] + "_quantized"
|
||||
qgemm_name = qgemm_name = node.name + "_quant" if node.name != "" else ""
|
||||
|
@ -77,13 +100,17 @@ class QLinearGemm(QuantOperatorBase):
|
|||
|
||||
qgemm_inputs.extend([quantized_bias_name, output_scale_name, output_zp_name])
|
||||
|
||||
qgemm_node = onnx.helper.make_node("QGemm", qgemm_inputs, [qgemm_output],
|
||||
qgemm_name, **kwargs)
|
||||
qgemm_node = onnx.helper.make_node("QGemm", qgemm_inputs, [qgemm_output], qgemm_name, **kwargs)
|
||||
nodes.append(qgemm_node)
|
||||
|
||||
# Create an entry for this quantized value
|
||||
q_output = QuantizedValue(node.output[0], qgemm_output, output_scale_name, output_zp_name,
|
||||
QuantizedValueType.Input)
|
||||
q_output = QuantizedValue(
|
||||
node.output[0],
|
||||
qgemm_output,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = q_output
|
||||
|
||||
self.quantizer.new_nodes += nodes
|
||||
|
@ -95,7 +122,7 @@ class QDQGemm(QDQOperatorBase):
|
|||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert (node.op_type == "Gemm")
|
||||
assert node.op_type == "Gemm"
|
||||
|
||||
self.quantizer.quantize_tensor(node.input[0])
|
||||
if not self.disable_qdq_for_node_output:
|
||||
|
@ -112,6 +139,7 @@ class QDQGemm(QDQOperatorBase):
|
|||
set_default_beta(self.node)
|
||||
else:
|
||||
logging.warning(
|
||||
"Bias of Gemm node '{}' is not constant. Please exclude this node for better performance."
|
||||
.format(self.node.name))
|
||||
|
||||
"Bias of Gemm node '{}' is not constant. Please exclude this node for better performance.".format(
|
||||
self.node.name
|
||||
)
|
||||
)
|
||||
|
|
|
@ -1,11 +1,13 @@
|
|||
import onnx
|
||||
import numpy
|
||||
from .base_operator import QuantOperatorBase
|
||||
from ..quant_utils import attribute_to_kwarg, ms_domain, QuantType
|
||||
import onnx
|
||||
from onnx import onnx_pb as onnx_proto
|
||||
'''
|
||||
|
||||
from ..quant_utils import QuantType, attribute_to_kwarg, ms_domain
|
||||
from .base_operator import QuantOperatorBase
|
||||
|
||||
"""
|
||||
Quantize LSTM
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
class LSTMQuant(QuantOperatorBase):
|
||||
|
@ -13,16 +15,17 @@ class LSTMQuant(QuantOperatorBase):
|
|||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
'''
|
||||
parameter node: LSTM node.
|
||||
parameter new_nodes_list: List of new nodes created before processing this node.
|
||||
return: a list of nodes in topological order that represents quantized Attention node.
|
||||
'''
|
||||
"""
|
||||
parameter node: LSTM node.
|
||||
parameter new_nodes_list: List of new nodes created before processing this node.
|
||||
return: a list of nodes in topological order that represents quantized Attention node.
|
||||
"""
|
||||
node = self.node
|
||||
assert (node.op_type == "LSTM")
|
||||
assert node.op_type == "LSTM"
|
||||
|
||||
if (not self.quantizer.is_valid_quantize_weight(node.input[1])
|
||||
or not self.quantizer.is_valid_quantize_weight(node.input[2])):
|
||||
if not self.quantizer.is_valid_quantize_weight(node.input[1]) or not self.quantizer.is_valid_quantize_weight(
|
||||
node.input[2]
|
||||
):
|
||||
super().quantize()
|
||||
return
|
||||
|
||||
|
@ -30,7 +33,7 @@ class LSTMQuant(QuantOperatorBase):
|
|||
W = model.get_initializer(node.input[1])
|
||||
R = model.get_initializer(node.input[2])
|
||||
|
||||
if (len(W.dims) != 3 or len(R.dims) != 3):
|
||||
if len(W.dims) != 3 or len(R.dims) != 3:
|
||||
super().quantize()
|
||||
return
|
||||
|
||||
|
@ -43,10 +46,12 @@ class LSTMQuant(QuantOperatorBase):
|
|||
W.dims[0] = W_num_dir * W_4_hidden_size
|
||||
R.dims[0] = R_num_dir * R_4_hidden_size
|
||||
|
||||
quant_input_weight_tuple = self.quantizer.quantize_weight_per_channel(node.input[1],
|
||||
onnx_proto.TensorProto.INT8, 0)
|
||||
quant_recurrent_weight_tuple = self.quantizer.quantize_weight_per_channel(node.input[2],
|
||||
onnx_proto.TensorProto.INT8, 0)
|
||||
quant_input_weight_tuple = self.quantizer.quantize_weight_per_channel(
|
||||
node.input[1], onnx_proto.TensorProto.INT8, 0
|
||||
)
|
||||
quant_recurrent_weight_tuple = self.quantizer.quantize_weight_per_channel(
|
||||
node.input[2], onnx_proto.TensorProto.INT8, 0
|
||||
)
|
||||
|
||||
W_quant_weight = model.get_initializer(quant_input_weight_tuple[0])
|
||||
R_quant_weight = model.get_initializer(quant_recurrent_weight_tuple[0])
|
||||
|
@ -87,10 +92,14 @@ class LSTMQuant(QuantOperatorBase):
|
|||
inputs.extend([node.input[5] if input_len > 5 else ""])
|
||||
inputs.extend([node.input[6] if input_len > 6 else ""])
|
||||
inputs.extend([node.input[7] if input_len > 7 else ""])
|
||||
inputs.extend([
|
||||
quant_input_weight_tuple[2], quant_input_weight_tuple[1], quant_recurrent_weight_tuple[2],
|
||||
quant_recurrent_weight_tuple[1]
|
||||
])
|
||||
inputs.extend(
|
||||
[
|
||||
quant_input_weight_tuple[2],
|
||||
quant_input_weight_tuple[1],
|
||||
quant_recurrent_weight_tuple[2],
|
||||
quant_recurrent_weight_tuple[1],
|
||||
]
|
||||
)
|
||||
|
||||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
|
|
|
@ -1,12 +1,15 @@
|
|||
import onnx
|
||||
import itertools
|
||||
|
||||
import onnx
|
||||
from onnx import onnx_pb as onnx_proto
|
||||
|
||||
from ..quant_utils import QuantizedValue, QuantizedValueType, find_by_name, get_mul_node
|
||||
from .base_operator import QuantOperatorBase
|
||||
from .qdq_base_operator import QDQOperatorBase
|
||||
from ..quant_utils import find_by_name, get_mul_node, QuantizedValue, QuantizedValueType
|
||||
from onnx import onnx_pb as onnx_proto
|
||||
'''
|
||||
|
||||
"""
|
||||
Used when quantize mode is QuantizationMode.IntegerOps.
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
class MatMulInteger(QuantOperatorBase):
|
||||
|
@ -15,28 +18,43 @@ class MatMulInteger(QuantOperatorBase):
|
|||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert (node.op_type == "MatMul")
|
||||
assert node.op_type == "MatMul"
|
||||
|
||||
(quantized_input_names, zero_point_names, scale_names, nodes) = \
|
||||
self.quantizer.quantize_inputs(node, [0, 1], reduce_range=True, op_level_per_channel=True)
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_inputs(node, [0, 1], reduce_range=True, op_level_per_channel=True)
|
||||
|
||||
matmul_integer_output = node.output[0] + "_output_quantized"
|
||||
matmul_integer_name = node.name + "_quant" if node.name != "" else ""
|
||||
matmul_integer_node = onnx.helper.make_node("MatMulInteger", quantized_input_names + zero_point_names,
|
||||
[matmul_integer_output], matmul_integer_name)
|
||||
matmul_integer_node = onnx.helper.make_node(
|
||||
"MatMulInteger",
|
||||
quantized_input_names + zero_point_names,
|
||||
[matmul_integer_output],
|
||||
matmul_integer_name,
|
||||
)
|
||||
nodes.append(matmul_integer_node)
|
||||
|
||||
# Add cast operation to cast matmulInteger output to float.
|
||||
cast_op_output = matmul_integer_output + "_cast_output"
|
||||
cast_node = onnx.helper.make_node("Cast", [matmul_integer_output], [cast_op_output],
|
||||
matmul_integer_output + "_cast",
|
||||
to=onnx_proto.TensorProto.FLOAT)
|
||||
cast_node = onnx.helper.make_node(
|
||||
"Cast",
|
||||
[matmul_integer_output],
|
||||
[cast_op_output],
|
||||
matmul_integer_output + "_cast",
|
||||
to=onnx_proto.TensorProto.FLOAT,
|
||||
)
|
||||
nodes.append(cast_node)
|
||||
|
||||
# Add mul operation to multiply scales of two inputs.
|
||||
assert (len(scale_names) == 2)
|
||||
scales_mul_op = matmul_integer_name + "_scales_mul" if matmul_integer_name != "" else scale_names[
|
||||
0] + "_" + scale_names[1] + "_mul"
|
||||
assert len(scale_names) == 2
|
||||
scales_mul_op = (
|
||||
matmul_integer_name + "_scales_mul"
|
||||
if matmul_integer_name != ""
|
||||
else scale_names[0] + "_" + scale_names[1] + "_mul"
|
||||
)
|
||||
|
||||
scales_mul_node = find_by_name(scales_mul_op, self.quantizer.new_nodes)
|
||||
if scales_mul_node is None:
|
||||
|
@ -50,13 +68,19 @@ class MatMulInteger(QuantOperatorBase):
|
|||
output_scale_mul_op = ""
|
||||
if matmul_integer_name != "":
|
||||
output_scale_mul_op = matmul_integer_name + "_output_scale_mul"
|
||||
nodes.append(get_mul_node([cast_op_output, scales_mul_op_output], node.output[0], output_scale_mul_op))
|
||||
nodes.append(
|
||||
get_mul_node(
|
||||
[cast_op_output, scales_mul_op_output],
|
||||
node.output[0],
|
||||
output_scale_mul_op,
|
||||
)
|
||||
)
|
||||
self.quantizer.new_nodes += nodes
|
||||
|
||||
|
||||
'''
|
||||
"""
|
||||
Used when quantize mode is QuantizationMode.QLinearOps
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
class QLinearMatMul(QuantOperatorBase):
|
||||
|
@ -65,12 +89,21 @@ class QLinearMatMul(QuantOperatorBase):
|
|||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert (node.op_type == "MatMul")
|
||||
assert node.op_type == "MatMul"
|
||||
|
||||
(quantized_input_names, zero_point_names, scale_names, nodes) = \
|
||||
self.quantizer.quantize_inputs(node, [0, 1], reduce_range=True, op_level_per_channel=True)
|
||||
data_found, output_scale_name, output_zp_name, _, _ = \
|
||||
self.quantizer._get_quantization_params(node.output[0])
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_inputs(node, [0, 1], reduce_range=True, op_level_per_channel=True)
|
||||
(
|
||||
data_found,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
_,
|
||||
_,
|
||||
) = self.quantizer._get_quantization_params(node.output[0])
|
||||
if not data_found or quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
|
@ -90,24 +123,34 @@ class QLinearMatMul(QuantOperatorBase):
|
|||
qlinear_matmul_inputs.append(output_scale_name)
|
||||
qlinear_matmul_inputs.append(output_zp_name)
|
||||
|
||||
qlinear_matmul_node = onnx.helper.make_node("QLinearMatMul", qlinear_matmul_inputs, [qlinear_matmul_output],
|
||||
qlinear_matmul_name)
|
||||
qlinear_matmul_node = onnx.helper.make_node(
|
||||
"QLinearMatMul",
|
||||
qlinear_matmul_inputs,
|
||||
[qlinear_matmul_output],
|
||||
qlinear_matmul_name,
|
||||
)
|
||||
nodes.append(qlinear_matmul_node)
|
||||
|
||||
# Create an entry for this quantized value
|
||||
q_output = QuantizedValue(node.output[0], qlinear_matmul_output, output_scale_name, output_zp_name,
|
||||
QuantizedValueType.Input)
|
||||
q_output = QuantizedValue(
|
||||
node.output[0],
|
||||
qlinear_matmul_output,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = q_output
|
||||
|
||||
self.quantizer.new_nodes += nodes
|
||||
|
||||
|
||||
class QDQMatMul(QDQOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert (node.op_type == "MatMul")
|
||||
assert node.op_type == "MatMul"
|
||||
|
||||
if self.disable_qdq_for_node_output:
|
||||
nodes_to_iterate = node.input
|
||||
|
@ -116,7 +159,7 @@ class QDQMatMul(QDQOperatorBase):
|
|||
|
||||
for tensor_name in nodes_to_iterate:
|
||||
# only support per-channel quantization on weight
|
||||
if self.quantizer.is_per_channel() and find_by_name(tensor_name, self.quantizer.model.initializer()) :
|
||||
if self.quantizer.is_per_channel() and find_by_name(tensor_name, self.quantizer.model.initializer()):
|
||||
channel_axis = self.quantizer.qdq_op_type_per_channel_support_to_axis.get(node.op_type, 1)
|
||||
self.quantizer.quantize_tensor_per_channel(tensor_name, channel_axis)
|
||||
else:
|
||||
|
|
|
@ -7,7 +7,7 @@ class QMaxPool(Direct8BitOp):

def quantize(self):
node = self.node
assert (node.op_type == "MaxPool")
assert node.op_type == "MaxPool"

# if version is less than 12, go to normal quantize.
if self.quantizer.opset_version < 12:
@ -24,7 +24,7 @@ class QDQMaxPool(QDQDirect8BitOp):

def quantize(self):
node = self.node
assert (node.op_type == "MaxPool")
assert node.op_type == "MaxPool"

# if version is less than 12, just no change
if self.quantizer.opset_version < 12:
@ -1,7 +1,8 @@
|
|||
import onnx
|
||||
import numpy as np
|
||||
from .base_operator import QuantOperatorBase
|
||||
import onnx
|
||||
|
||||
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, quantize_nparray
|
||||
from .base_operator import QuantOperatorBase
|
||||
|
||||
|
||||
class QPad(QuantOperatorBase):
|
||||
|
@ -10,7 +11,7 @@ class QPad(QuantOperatorBase):
|
|||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert (node.op_type == "Pad")
|
||||
assert node.op_type == "Pad"
|
||||
|
||||
# Only after version 11, it has the optional constant_value
|
||||
# If input[0] is not quantized, do not quanitize this node
|
||||
|
@ -24,7 +25,7 @@ class QPad(QuantOperatorBase):
|
|||
kv = attribute_to_kwarg(attribute)
|
||||
kwargs.update(kv)
|
||||
|
||||
if 'mode' not in kwargs or kwargs['mode'] == b'constant':
|
||||
if "mode" not in kwargs or kwargs["mode"] == b"constant":
|
||||
if len(node.input) > 2: # There is 3rd input 'constant_value'
|
||||
zp_tensor = self.quantizer.model.get_initializer(quantized_input_value.zp_name)
|
||||
scale_tensor = self.quantizer.model.get_initializer(quantized_input_value.scale_name)
|
||||
|
@ -39,29 +40,43 @@ class QPad(QuantOperatorBase):
|
|||
scale_array = onnx.numpy_helper.to_array(scale_tensor)
|
||||
scale_value = scale_array.item() if scale_array.ndim == 0 else scale_array[0]
|
||||
padding_constant_array = onnx.numpy_helper.to_array(padding_constant_initializer)
|
||||
quantized_padding_constant_array = quantize_nparray(self.quantizer.input_qType,
|
||||
padding_constant_array, scale_value, zp_value)
|
||||
quantized_padding_constant_array = quantize_nparray(
|
||||
self.quantizer.input_qType,
|
||||
padding_constant_array,
|
||||
scale_value,
|
||||
zp_value,
|
||||
)
|
||||
quantized_padding_constant_name = node.input[2] + "_quantized"
|
||||
quantized_padding_constant_initializer = onnx.numpy_helper.from_array(
|
||||
quantized_padding_constant_array, quantized_padding_constant_name)
|
||||
quantized_padding_constant_array,
|
||||
quantized_padding_constant_name,
|
||||
)
|
||||
# Suppose this padding constant initializer only used by the node
|
||||
self.quantizer.model.remove_initializer(padding_constant_initializer)
|
||||
self.quantizer.model.add_initializer(quantized_padding_constant_initializer)
|
||||
node.input[2] = quantized_padding_constant_name
|
||||
else:
|
||||
# TODO: check quantize_inputs after sub graph is supported
|
||||
pad_value_qnodes = self.quantizer._get_quantize_input_nodes(node, 2, self.quantizer.input_qType,
|
||||
quantized_input_value.scale_name,
|
||||
quantized_input_value.zp_name)
|
||||
pad_value_qnodes = self.quantizer._get_quantize_input_nodes(
|
||||
node,
|
||||
2,
|
||||
self.quantizer.input_qType,
|
||||
quantized_input_value.scale_name,
|
||||
quantized_input_value.zp_name,
|
||||
)
|
||||
self.quantizer.new_nodes += [pad_value_qnodes]
|
||||
node.input[2] = pad_value_qnodes.output[0]
|
||||
else:
|
||||
node.input.extend([quantized_input_value.zp_name]) # pad zero_point for original zero
|
||||
|
||||
# Create an entry for output quantized value
|
||||
quantized_output_value = QuantizedValue(node.output[0], node.output[0] + "_quantized",
|
||||
quantized_input_value.scale_name, quantized_input_value.zp_name,
|
||||
QuantizedValueType.Input)
|
||||
quantized_output_value = QuantizedValue(
|
||||
node.output[0],
|
||||
node.output[0] + "_quantized",
|
||||
quantized_input_value.scale_name,
|
||||
quantized_input_value.zp_name,
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
|
||||
|
||||
node.input[0] = quantized_input_value.q_name
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
import onnx
|
||||
|
||||
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
|
||||
from .base_operator import QuantOperatorBase
|
||||
from ..quant_utils import attribute_to_kwarg, ms_domain, QuantizedValue, QuantizedValueType
|
||||
|
||||
|
||||
class QLinearPool(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
|
@ -10,11 +12,21 @@ class QLinearPool(QuantOperatorBase):
|
|||
node = self.node
|
||||
|
||||
# only try to quantize when given quantization parameters for it
|
||||
data_found, output_scale_name, output_zp_name, _, _ = \
|
||||
self.quantizer._get_quantization_params(node.output[0])
|
||||
(
|
||||
data_found,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
_,
|
||||
_,
|
||||
) = self.quantizer._get_quantization_params(node.output[0])
|
||||
|
||||
# get quantized input tensor names, quantize input if needed
|
||||
quantized_input_names, input_zero_point_names, input_scale_names, nodes = self.quantizer.quantize_inputs(node, [0])
|
||||
(
|
||||
quantized_input_names,
|
||||
input_zero_point_names,
|
||||
input_scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_inputs(node, [0])
|
||||
|
||||
if not data_found or quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
@ -22,7 +34,12 @@ class QLinearPool(QuantOperatorBase):
|
|||
# Create an entry for output quantized value.
|
||||
qlinear_output_name = node.output[0] + "_quantized"
|
||||
quantized_output_value = QuantizedValue(
|
||||
node.output[0], qlinear_output_name, output_scale_name, output_zp_name, QuantizedValueType.Input)
|
||||
node.output[0],
|
||||
qlinear_output_name,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
|
||||
|
||||
# Create qlinear pool node for given type (AveragePool, etc)
|
||||
|
@ -33,10 +50,17 @@ class QLinearPool(QuantOperatorBase):
|
|||
qlinear_node_name = node.name + "_quant" if node.name != "" else ""
|
||||
qnode = onnx.helper.make_node(
|
||||
"QLinear" + node.op_type,
|
||||
[quantized_input_names[0], input_scale_names[0], input_zero_point_names[0], output_scale_name, output_zp_name],
|
||||
[
|
||||
quantized_input_names[0],
|
||||
input_scale_names[0],
|
||||
input_zero_point_names[0],
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
],
|
||||
[qlinear_output_name],
|
||||
qlinear_node_name,
|
||||
**kwargs)
|
||||
**kwargs
|
||||
)
|
||||
|
||||
# add all newly created nodes
|
||||
nodes.append(qnode)
|
||||
|
|
|
@ -1,14 +1,16 @@
|
|||
import itertools
|
||||
from .base_operator import QuantOperatorBase
|
||||
|
||||
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, quantize_nparray
|
||||
from .base_operator import QuantOperatorBase
|
||||
|
||||
|
||||
class QDQOperatorBase:
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
self.quantizer = onnx_quantizer
|
||||
self.node = onnx_node
|
||||
self.disable_qdq_for_node_output = True if onnx_node.op_type in onnx_quantizer.op_types_to_exclude_output_quantization \
|
||||
else False
|
||||
self.disable_qdq_for_node_output = (
|
||||
True if onnx_node.op_type in onnx_quantizer.op_types_to_exclude_output_quantization else False
|
||||
)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
|
|
|
@ -7,7 +7,7 @@ class QResize(Direct8BitOp):

def quantize(self):
node = self.node
assert (node.op_type == "Resize")
assert node.op_type == "Resize"

# if version is less than 11, go to normal quantize.
if self.quantizer.opset_version < 11:
@ -24,7 +24,7 @@ class QDQResize(QDQDirect8BitOp):
|
|||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert (node.op_type == "Resize")
|
||||
assert node.op_type == "Resize"
|
||||
|
||||
# if version is less than 11, just keep this node
|
||||
if self.quantizer.opset_version < 11:
|
||||
|
|
|
@@ -1,8 +1,9 @@
import onnx
from .base_operator import QuantOperatorBase
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg
from onnx import onnx_pb as onnx_proto

from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg
from .base_operator import QuantOperatorBase


class QSplit(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):

@@ -10,7 +11,12 @@ class QSplit(QuantOperatorBase):

    def quantize(self):
        node = self.node
        quantized_input_names, zero_point_names, scale_names, nodes = self.quantizer.quantize_inputs(node, [0])
        (
            quantized_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_inputs(node, [0])
        if quantized_input_names is None:
            return super().quantize()

@@ -26,14 +32,20 @@ class QSplit(QuantOperatorBase):
        for output_name in node.output:
            quantized_output_name = output_name + "quantized"
            quantized_output_names.append(quantized_output_name)
            q_output = QuantizedValue(output_name, quantized_output_name, scale_names[0], zero_point_names[0],
                                      QuantizedValueType.Input)
            q_output = QuantizedValue(
                output_name,
                quantized_output_name,
                scale_names[0],
                zero_point_names[0],
                QuantizedValueType.Input,
            )
            self.quantizer.quantized_value_map[output_name] = q_output

        if len(node.input) > 1:
            quantized_input_names = quantized_input_names.extend(node.input[1:])
        quantized_node = onnx.helper.make_node(node.op_type, quantized_input_names, quantized_output_names,
                                               quantized_node_name, **kwargs)
        quantized_node = onnx.helper.make_node(
            node.op_type, quantized_input_names, quantized_output_names, quantized_node_name, **kwargs
        )

        nodes.append(quantized_node)
        self.quantizer.new_nodes += nodes
|
||||
|
|
|
@@ -3,33 +3,72 @@
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
import logging
import os
import struct
from pathlib import Path

import numpy as np
import logging

import numpy as np
import onnx
import onnx.numpy_helper
from onnx import onnx_pb as onnx_proto
from onnx import TensorProto
from onnxruntime import SessionOptions, InferenceSession, GraphOptimizationLevel
from onnx import onnx_pb as onnx_proto

from .quant_utils import QuantizationMode, QuantizedValueType, QuantizedInitializer, QuantizedValue
from .quant_utils import find_by_name, get_elem_index, get_mul_node, generate_identified_filename, attribute_to_kwarg, type_to_name, quantize_nparray
from .quant_utils import QuantType, onnx_domain, __producer__, __version__

from .registry import CreateQDQQuantizer
from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions

from .onnx_model import ONNXModel
from .onnx_quantizer import ONNXQuantizer
from .quant_utils import (
    QuantizationMode,
    QuantizedInitializer,
    QuantizedValue,
    QuantizedValueType,
    QuantType,
    __producer__,
    __version__,
    attribute_to_kwarg,
    find_by_name,
    generate_identified_filename,
    get_elem_index,
    get_mul_node,
    onnx_domain,
    quantize_nparray,
    type_to_name,
)
from .registry import CreateQDQQuantizer
|
||||
|
||||
|
||||
class QDQQuantizer(ONNXQuantizer):
|
||||
def __init__(self, model, per_channel, reduce_range, mode, static, weight_qType, input_qType, tensors_range,
|
||||
nodes_to_quantize, nodes_to_exclude, op_types_to_quantize, extra_options={}):
|
||||
ONNXQuantizer.__init__(self, model, per_channel, reduce_range, mode, static, weight_qType, input_qType,
|
||||
tensors_range, nodes_to_quantize, nodes_to_exclude, op_types_to_quantize, extra_options)
|
||||
def __init__(
|
||||
self,
|
||||
model,
|
||||
per_channel,
|
||||
reduce_range,
|
||||
mode,
|
||||
static,
|
||||
weight_qType,
|
||||
input_qType,
|
||||
tensors_range,
|
||||
nodes_to_quantize,
|
||||
nodes_to_exclude,
|
||||
op_types_to_quantize,
|
||||
extra_options={},
|
||||
):
|
||||
ONNXQuantizer.__init__(
|
||||
self,
|
||||
model,
|
||||
per_channel,
|
||||
reduce_range,
|
||||
mode,
|
||||
static,
|
||||
weight_qType,
|
||||
input_qType,
|
||||
tensors_range,
|
||||
nodes_to_quantize,
|
||||
nodes_to_exclude,
|
||||
op_types_to_quantize,
|
||||
extra_options,
|
||||
)
|
||||
self.tensors_to_quantize = []
|
||||
self.tensors_to_quantize_per_channel = []
|
||||
self.bias_to_quantize = []
|
||||
|
@ -40,23 +79,33 @@ class QDQQuantizer(ONNXQuantizer):
|
|||
# because those ops may be followed by nodes that require high resolution inputs.
|
||||
# Adding QDQ for those ops' output may end up with worse accuracy.
|
||||
# So, we don't recommend to add QDQ to node's output under such condition.
|
||||
self.op_types_to_exclude_output_quantization = [] if 'OpTypesToExcludeOutputQuantizatioin' not in extra_options \
|
||||
else extra_options['OpTypesToExcludeOutputQuantizatioin']
|
||||
self.op_types_to_exclude_output_quantization = (
|
||||
[]
|
||||
if "OpTypesToExcludeOutputQuantizatioin" not in extra_options
|
||||
else extra_options["OpTypesToExcludeOutputQuantizatioin"]
|
||||
)
|
||||
|
||||
# We do quantization on Dequantizelinear's input to remove Quantizelinear for weight as an optimization.
|
||||
# In some cases, for example QDQ BERT model for TensorRT, QDQ should always appear as a pair.
|
||||
# Therefore, we need to disable this optimization and add qdq pair to weight.
|
||||
self.add_qdq_pair_to_weight = False if 'AddQDQPairToWeight' not in extra_options \
|
||||
else extra_options['AddQDQPairToWeight']
|
||||
self.add_qdq_pair_to_weight = (
|
||||
False if "AddQDQPairToWeight" not in extra_options else extra_options["AddQDQPairToWeight"]
|
||||
)
|
||||
|
||||
# The default behavior is that multiple nodes can share a QDQ pair as their inputs.
|
||||
# In TRT, QDQ pair can’t be shared between nodes, so it will create dedicated QDQ pairs for each node.
|
||||
self.dedicated_qdq_pair = False if 'DedicatedQDQPair' not in extra_options else extra_options['DedicatedQDQPair']
|
||||
# The default behavior is that multiple nodes can share a QDQ pair as their inputs.
|
||||
# In TRT, QDQ pair can’t be shared between nodes, so it will create dedicated QDQ pairs for each node.
|
||||
self.dedicated_qdq_pair = (
|
||||
False if "DedicatedQDQPair" not in extra_options else extra_options["DedicatedQDQPair"]
|
||||
)
|
||||
if self.dedicated_qdq_pair:
|
||||
self.tensor_to_its_receiving_nodes = {}
|
||||
|
||||
# Let user set channel axis for specific op type and it's effective only when per channel quantization is supported and per_channel is True.
|
||||
self.qdq_op_type_per_channel_support_to_axis = {} if 'QDQOpTypePerChannelSupportToAxis' not in extra_options else extra_options['QDQOpTypePerChannelSupportToAxis']
|
||||
self.qdq_op_type_per_channel_support_to_axis = (
|
||||
{}
|
||||
if "QDQOpTypePerChannelSupportToAxis" not in extra_options
|
||||
else extra_options["QDQOpTypePerChannelSupportToAxis"]
|
||||
)
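
# For illustration only, an extra_options dict that exercises the switches read above
# (the values are arbitrary; "OpTypesToExcludeOutputQuantizatioin" is the option's actual spelling):
example_extra_options = {
    "OpTypesToExcludeOutputQuantizatioin": ["Conv"],
    "AddQDQPairToWeight": True,
    "DedicatedQDQPair": True,
    "QDQOpTypePerChannelSupportToAxis": {"MatMul": 1},
}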
|
||||
|
||||
def quantize_tensor(self, tensor_name):
|
||||
weight = find_by_name(tensor_name, self.model.initializer())
|
||||
|
@ -65,12 +114,14 @@ class QDQQuantizer(ONNXQuantizer):
|
|||
self.tensors_to_quantize.append(tensor_name)
|
||||
elif tensor_name in self.value_infos.keys():
|
||||
vi = self.value_infos[tensor_name]
|
||||
if vi.type.HasField('tensor_type') and vi.type.tensor_type.elem_type == TensorProto.FLOAT:
|
||||
if vi.type.HasField("tensor_type") and vi.type.tensor_type.elem_type == TensorProto.FLOAT:
|
||||
self.tensors_to_quantize.append(tensor_name)
|
||||
else:
|
||||
logging.warning(
|
||||
"failed to infer the type of tensor: {}. Skip to quantize it. Please check if it is expected.".format(
|
||||
tensor_name))
|
||||
tensor_name
|
||||
)
|
||||
)
|
||||
|
||||
def quantize_tensor_per_channel(self, tensor_name, axis):
|
||||
weight = find_by_name(tensor_name, self.model.initializer())
|
||||
|
@ -80,10 +131,12 @@ class QDQQuantizer(ONNXQuantizer):
|
|||
else:
|
||||
logging.warning(
|
||||
"only support per-channel quantization on weight. Quantize tensor: {} with per-tensor instead.".format(
|
||||
tensor_name))
|
||||
tensor_name
|
||||
)
|
||||
)
|
||||
self.quantize_tensor(tensor_name)
|
||||
|
||||
def quantize_bias_tensor(self, bias_name, input_name, weight_name, beta = 1.0):
|
||||
def quantize_bias_tensor(self, bias_name, input_name, weight_name, beta=1.0):
|
||||
weight = find_by_name(bias_name, self.model.initializer())
|
||||
if weight is not None:
|
||||
if weight.data_type == onnx_proto.TensorProto.FLOAT:
|
||||
|
@ -124,9 +177,11 @@ class QDQQuantizer(ONNXQuantizer):
|
|||
return self.model.model
|
||||
|
||||
def try_replacing_upstream_output(self, upstream_output_name, output_name):
|
||||
if output_name in self.quantization_params.keys() and \
|
||||
len(self.model.input_name_to_nodes()[upstream_output_name]) == 1 and \
|
||||
not self.model.is_graph_output(upstream_output_name):
|
||||
if (
|
||||
output_name in self.quantization_params.keys()
|
||||
and len(self.model.input_name_to_nodes()[upstream_output_name]) == 1
|
||||
and not self.model.is_graph_output(upstream_output_name)
|
||||
):
|
||||
self.model.replace_output_of_all_nodes(upstream_output_name, output_name)
|
||||
self.tensors_to_quantize.remove(upstream_output_name)
|
||||
return True
|
||||
|
@ -141,25 +196,34 @@ class QDQQuantizer(ONNXQuantizer):
|
|||
if initializer is not None:
|
||||
|
||||
if self.add_qdq_pair_to_weight:
|
||||
q_weight_name, zp_name, scale_name = self.quantize_weight(initializer,
|
||||
self.weight_qType,
|
||||
keep_float_weight=True)
|
||||
qlinear_node = onnx.helper.make_node("QuantizeLinear", [tensor_name, scale_name, zp_name],
|
||||
[tensor_name + "_QuantizeLinear"],
|
||||
tensor_name + "_QuantizeLinear")
|
||||
dequant_node = onnx.helper.make_node("DequantizeLinear",
|
||||
[tensor_name + "_QuantizeLinear", scale_name, zp_name],
|
||||
[tensor_name + "_DequantizeLinear"],
|
||||
tensor_name + "_DequantizeLinear")
|
||||
q_weight_name, zp_name, scale_name = self.quantize_weight(
|
||||
initializer, self.weight_qType, keep_float_weight=True
|
||||
)
|
||||
qlinear_node = onnx.helper.make_node(
|
||||
"QuantizeLinear",
|
||||
[tensor_name, scale_name, zp_name],
|
||||
[tensor_name + "_QuantizeLinear"],
|
||||
tensor_name + "_QuantizeLinear",
|
||||
)
|
||||
dequant_node = onnx.helper.make_node(
|
||||
"DequantizeLinear",
|
||||
[tensor_name + "_QuantizeLinear", scale_name, zp_name],
|
||||
[tensor_name + "_DequantizeLinear"],
|
||||
tensor_name + "_DequantizeLinear",
|
||||
)
|
||||
self.model.replace_input_of_all_nodes(tensor_name, tensor_name + "_DequantizeLinear")
|
||||
|
||||
self.model.add_nodes([qlinear_node, dequant_node])
|
||||
else:
|
||||
q_weight_name, zp_name, scale_name = self.quantize_weight(initializer, self.weight_qType)
|
||||
inputs = [q_weight_name, scale_name, zp_name]
|
||||
output_name = tensor_name + '_DequantizeLinear'
|
||||
node = onnx.helper.make_node("DequantizeLinear", inputs, [output_name],
|
||||
tensor_name + '_DequantizeLinear')
|
||||
output_name = tensor_name + "_DequantizeLinear"
|
||||
node = onnx.helper.make_node(
|
||||
"DequantizeLinear",
|
||||
inputs,
|
||||
[output_name],
|
||||
tensor_name + "_DequantizeLinear",
|
||||
)
|
||||
self.model.add_node(node)
|
||||
self.model.replace_input_of_all_nodes(tensor_name, tensor_name + "_DequantizeLinear")
|
||||
else:
|
||||
|
@ -168,32 +232,49 @@ class QDQQuantizer(ONNXQuantizer):
|
|||
if data_found == False:
|
||||
raise ValueError(
|
||||
"Quantization parameters are not specified for param {}."
|
||||
"In static mode quantization params for inputs and outputs of nodes to be quantized are required."
|
||||
.format(tensor_name))
|
||||
"In static mode quantization params for inputs and outputs of nodes to be quantized are required.".format(
|
||||
tensor_name
|
||||
)
|
||||
)
|
||||
|
||||
if self.dedicated_qdq_pair and tensor_name in self.tensor_to_its_receiving_nodes and len(self.tensor_to_its_receiving_nodes[tensor_name]) > 1:
|
||||
if (
|
||||
self.dedicated_qdq_pair
|
||||
and tensor_name in self.tensor_to_its_receiving_nodes
|
||||
and len(self.tensor_to_its_receiving_nodes[tensor_name]) > 1
|
||||
):
|
||||
num_dedicated_qdq_pair = len(self.tensor_to_its_receiving_nodes[tensor_name])
|
||||
for i in range(num_dedicated_qdq_pair):
|
||||
postfix = str(i+1)
|
||||
postfix = str(i + 1)
|
||||
q_input = tensor_name
|
||||
q_output = tensor_name + "_QuantizeLinear_" + postfix
|
||||
q_output = tensor_name + "_QuantizeLinear_" + postfix
|
||||
dq_input = q_output
|
||||
dq_output = tensor_name + "_DequantizeLinear_" + postfix
|
||||
quant_node_name = tensor_name + "_QuantizeLinear_" + postfix
|
||||
dequant_node_name = tensor_name + "_DequantizeLinear_" + postfix
|
||||
qlinear_node = onnx.helper.make_node("QuantizeLinear", [q_input, scale_name, zp_name],
|
||||
[q_output], quant_node_name)
|
||||
dequant_node = onnx.helper.make_node("DequantizeLinear",
|
||||
[dq_input, scale_name, zp_name],
|
||||
[dq_output],
|
||||
dequant_node_name)
|
||||
qlinear_node = onnx.helper.make_node(
|
||||
"QuantizeLinear",
|
||||
[q_input, scale_name, zp_name],
|
||||
[q_output],
|
||||
quant_node_name,
|
||||
)
|
||||
dequant_node = onnx.helper.make_node(
|
||||
"DequantizeLinear",
|
||||
[dq_input, scale_name, zp_name],
|
||||
[dq_output],
|
||||
dequant_node_name,
|
||||
)
|
||||
self.model.add_nodes([qlinear_node, dequant_node])
|
||||
|
||||
node = self.tensor_to_its_receiving_nodes[tensor_name][i]
|
||||
self.model.replace_node_input(node, tensor_name, dq_output)
|
||||
|
||||
quantized_value = QuantizedValue(tensor_name, dq_output, scale_name, zp_name,
|
||||
QuantizedValueType.Input)
|
||||
quantized_value = QuantizedValue(
|
||||
tensor_name,
|
||||
dq_output,
|
||||
scale_name,
|
||||
zp_name,
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantized_value_map[tensor_name] = quantized_value
|
||||
else:
|
||||
q_input = tensor_name
|
||||
|
@ -209,16 +290,27 @@ class QDQQuantizer(ONNXQuantizer):
|
|||
|
||||
quant_node_name = tensor_name + "_QuantizeLinear"
|
||||
dequant_node_name = tensor_name + "_DequantizeLinear"
|
||||
qlinear_node = onnx.helper.make_node("QuantizeLinear", [q_input, scale_name, zp_name],
|
||||
[q_output], quant_node_name)
|
||||
dequant_node = onnx.helper.make_node("DequantizeLinear",
|
||||
[dq_input, scale_name, zp_name],
|
||||
[dq_output],
|
||||
dequant_node_name)
|
||||
qlinear_node = onnx.helper.make_node(
|
||||
"QuantizeLinear",
|
||||
[q_input, scale_name, zp_name],
|
||||
[q_output],
|
||||
quant_node_name,
|
||||
)
|
||||
dequant_node = onnx.helper.make_node(
|
||||
"DequantizeLinear",
|
||||
[dq_input, scale_name, zp_name],
|
||||
[dq_output],
|
||||
dequant_node_name,
|
||||
)
|
||||
self.model.add_nodes([qlinear_node, dequant_node])
|
||||
|
||||
quantized_value = QuantizedValue(tensor_name, dq_output, scale_name, zp_name,
|
||||
QuantizedValueType.Input)
|
||||
quantized_value = QuantizedValue(
|
||||
tensor_name,
|
||||
dq_output,
|
||||
scale_name,
|
||||
zp_name,
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantized_value_map[tensor_name] = quantized_value
|
||||
|
||||
def quantize_bias_tensors(self):
|
||||
|
@ -231,13 +323,20 @@ class QDQQuantizer(ONNXQuantizer):
|
|||
quant_value = self.quantized_value_map[bias_name]
|
||||
inputs = [quant_value.q_name, quant_value.scale_name, quant_value.zp_name]
|
||||
if quant_value.axis is not None:
|
||||
dequant_node = onnx.helper.make_node("DequantizeLinear",
|
||||
inputs, [bias_name],
|
||||
bias_name + '_DequantizeLinear',
|
||||
axis=quant_value.axis)
|
||||
dequant_node = onnx.helper.make_node(
|
||||
"DequantizeLinear",
|
||||
inputs,
|
||||
[bias_name],
|
||||
bias_name + "_DequantizeLinear",
|
||||
axis=quant_value.axis,
|
||||
)
|
||||
else:
|
||||
dequant_node = onnx.helper.make_node("DequantizeLinear", inputs, [bias_name],
|
||||
bias_name + '_DequantizeLinear')
|
||||
dequant_node = onnx.helper.make_node(
|
||||
"DequantizeLinear",
|
||||
inputs,
|
||||
[bias_name],
|
||||
bias_name + "_DequantizeLinear",
|
||||
)
|
||||
self.model.add_node(dequant_node)
|
||||
|
||||
def quantize_weights_per_channel(self):
|
||||
|
@ -245,31 +344,44 @@ class QDQQuantizer(ONNXQuantizer):
|
|||
raise ValueError("Per-Channel support with QDQ format requires onnx opset version 13 or above.")
|
||||
for weight_name, axis in self.tensors_to_quantize_per_channel:
|
||||
if self.add_qdq_pair_to_weight:
|
||||
q_name, zp_name, scale_name = self.quantize_weight_per_channel(weight_name, onnx_proto.TensorProto.INT8,
|
||||
axis, keep_float_weight=True)
|
||||
qlinear_node = onnx.helper.make_node("QuantizeLinear", [weight_name, scale_name, zp_name],
|
||||
[weight_name + "_QuantizeLinear"],
|
||||
weight_name + "_QuantizeLinear",
|
||||
axis=axis)
|
||||
dequant_node = onnx.helper.make_node("DequantizeLinear",
|
||||
[weight_name + "_QuantizeLinear", scale_name, zp_name],
|
||||
[weight_name + "_DequantizeLinear"],
|
||||
weight_name + "_DequantizeLinear",
|
||||
axis=axis)
|
||||
q_name, zp_name, scale_name = self.quantize_weight_per_channel(
|
||||
weight_name,
|
||||
onnx_proto.TensorProto.INT8,
|
||||
axis,
|
||||
keep_float_weight=True,
|
||||
)
|
||||
qlinear_node = onnx.helper.make_node(
|
||||
"QuantizeLinear",
|
||||
[weight_name, scale_name, zp_name],
|
||||
[weight_name + "_QuantizeLinear"],
|
||||
weight_name + "_QuantizeLinear",
|
||||
axis=axis,
|
||||
)
|
||||
dequant_node = onnx.helper.make_node(
|
||||
"DequantizeLinear",
|
||||
[weight_name + "_QuantizeLinear", scale_name, zp_name],
|
||||
[weight_name + "_DequantizeLinear"],
|
||||
weight_name + "_DequantizeLinear",
|
||||
axis=axis,
|
||||
)
|
||||
self.model.replace_input_of_all_nodes(weight_name, weight_name + "_DequantizeLinear")
|
||||
|
||||
self.model.add_nodes([qlinear_node, dequant_node])
|
||||
else:
|
||||
#q_name, zp_name, scale_name = self.quantize_weight_per_channel(weight_name, self.weight_qType, axis)
|
||||
q_name, zp_name, scale_name = self.quantize_weight_per_channel(weight_name, onnx_proto.TensorProto.INT8,
|
||||
axis)
|
||||
# q_name, zp_name, scale_name = self.quantize_weight_per_channel(weight_name, self.weight_qType, axis)
|
||||
q_name, zp_name, scale_name = self.quantize_weight_per_channel(
|
||||
weight_name, onnx_proto.TensorProto.INT8, axis
|
||||
)
|
||||
|
||||
inputs = [q_name, scale_name, zp_name]
|
||||
output_name = weight_name + "_DequantizeLinear"
|
||||
node = onnx.helper.make_node("DequantizeLinear",
|
||||
inputs, [output_name],
|
||||
weight_name + '_DequantizeLinear',
|
||||
axis=axis)
|
||||
node = onnx.helper.make_node(
|
||||
"DequantizeLinear",
|
||||
inputs,
|
||||
[output_name],
|
||||
weight_name + "_DequantizeLinear",
|
||||
axis=axis,
|
||||
)
|
||||
self.model.add_node(node)
|
||||
|
||||
# Replace weight_name with output of DequantizeLinear
|
||||
|
|
|
@ -1,14 +1,14 @@
|
|||
import logging
|
||||
import numpy
|
||||
import onnx
|
||||
import tempfile
|
||||
|
||||
from enum import Enum
|
||||
from onnx import onnx_pb as onnx_proto
|
||||
from onnx import external_data_helper
|
||||
from pathlib import Path
|
||||
|
||||
from onnxruntime import SessionOptions, InferenceSession, GraphOptimizationLevel
|
||||
import numpy
|
||||
import onnx
|
||||
from onnx import external_data_helper
|
||||
from onnx import onnx_pb as onnx_proto
|
||||
|
||||
from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions
|
||||
|
||||
__producer__ = "onnx.quantize"
|
||||
__version__ = "0.1.0"
|
||||
|
@ -97,14 +97,17 @@ class QuantFormat(Enum):
|
|||
except KeyError:
|
||||
raise ValueError()
|
||||
|
||||
|
||||
ONNX_TYPE_TO_NP_TYPE = {
|
||||
onnx_proto.TensorProto.INT8: numpy.dtype('int8'),
|
||||
onnx_proto.TensorProto.UINT8: numpy.dtype('uint8')
|
||||
onnx_proto.TensorProto.INT8: numpy.dtype("int8"),
|
||||
onnx_proto.TensorProto.UINT8: numpy.dtype("uint8"),
|
||||
}
|
||||
|
||||
|
||||
def quantize_nparray(qType, arr, scale, zero_point, low=None, high=None):
|
||||
assert qType in ONNX_TYPE_TO_NP_TYPE, \
|
||||
"Unexpected data type {} requested. Only INT8 and UINT8 are supported.".format(qType)
|
||||
assert (
|
||||
qType in ONNX_TYPE_TO_NP_TYPE
|
||||
), "Unexpected data type {} requested. Only INT8 and UINT8 are supported.".format(qType)
|
||||
dtype = ONNX_TYPE_TO_NP_TYPE[qType]
|
||||
cliplow = max(0 if dtype == numpy.uint8 else -127, -127 if low is None else low)
|
||||
cliphigh = min(255 if dtype == numpy.uint8 else 127, 255 if high is None else high)
|
||||
|
@@ -114,10 +117,10 @@ def quantize_nparray(qType, arr, scale, zero_point, low=None, high=None):


def compute_scale_zp(rmin, rmax, qmin, qmax, symmetric=False):
    '''
    Calculate the scale s and zero point z for the quantization relation
    """
    Calculate the scale s and zero point z for the quantization relation
    r = s(q-z), where r are the original values and q are the corresponding
    quantized values.
    quantized values.

    r and z are calculated such that every value within [rmin,rmax] has an
    approximate representation within [qmin,qmax]. In addition, qmin <= z <=

@@ -131,8 +134,8 @@ def compute_scale_zp(rmin, rmax, qmin, qmax, symmetric=False):
    :parameter qmax: maximum value representable by the target quantization data type
    :return: zero and scale [z, s]

    '''

    """

    # Adjust rmin and rmax such that 0 is included in the range. This is
    # required to make sure zero can be represented by the quantization data
    # type (i.e. to make sure qmin <= zero_point <= qmax)

@@ -144,21 +147,21 @@ def compute_scale_zp(rmin, rmax, qmin, qmax, symmetric=False):
        rmin = -absmax
        rmax = +absmax

    scale = (rmax - rmin) / float(qmax-qmin) if rmax!=rmin else 1.0
    zero_point = round(qmin - rmin/scale)
    scale = (rmax - rmin) / float(qmax - qmin) if rmax != rmin else 1.0
    zero_point = round(qmin - rmin / scale)

    return [zero_point, scale]
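
# Worked example with illustrative numbers: asymmetric uint8 over the float range [-1.5, 6.0]
# gives scale = (6.0 - (-1.5)) / (255 - 0) ≈ 0.0294 and zero_point = round(0 - (-1.5) / scale) = 51,
# so the real value 0.0 maps exactly onto the quantized value 51.
zp_example, scale_example = compute_scale_zp(rmin=-1.5, rmax=6.0, qmin=0, qmax=255)
assert zp_example == 51 and abs(scale_example - 7.5 / 255) < 1e-12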
|
||||
|
||||
|
||||
def quantize_data(data, qType, symmetric, reduce_range=False):
|
||||
'''
|
||||
"""
|
||||
:param data: data to quantize
|
||||
:param qType: data type to quantize to. Supported types UINT8 and INT8
|
||||
:param symmetric: whether symmetric quantization is used or not. This is applied to INT8.
|
||||
:return: minimum, maximum, zero point, scale, and quantized weights
|
||||
|
||||
To pack weights, we compute a linear transformation
|
||||
|
||||
|
||||
- when data `type == uint8` mode, from `[rmin, rmax]` -> :math:`[0, 2^{b-1}]` and
|
||||
- when data `type == int8`, from `[-m , m]` -> :math:`[-(2^{b-1}-1), 2^{b-1}-1]` where
|
||||
`m = max(abs(rmin), abs(rmax))`
|
||||
|
@ -166,12 +169,12 @@ def quantize_data(data, qType, symmetric, reduce_range=False):
|
|||
and add necessary intermediate nodes to transform quantized weight to full weight using the equation
|
||||
|
||||
:math:`r = S(q-z)`, where
|
||||
|
||||
|
||||
- *r*: real original value
|
||||
- *q*: quantized value
|
||||
- *S*: scale
|
||||
- *z*: zero point
|
||||
'''
|
||||
"""
|
||||
|
||||
rmin = 0
|
||||
rmax = 0
|
||||
|
@ -188,46 +191,52 @@ def quantize_data(data, qType, symmetric, reduce_range=False):
|
|||
|
||||
return rmin, rmax, zero_point, scale, quantized_data
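
# Usage sketch with made-up weights: rmin/rmax are taken from the data itself, so for the vector
# below the uint8 scale works out to roughly (0.5 - (-0.5)) / 255.
example_weights = numpy.array([-0.5, 0.0, 0.25, 0.5], dtype=numpy.float32)
rmin_ex, rmax_ex, zp_ex, scale_ex, q_ex = quantize_data(example_weights, onnx_proto.TensorProto.UINT8, symmetric=False)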
|
||||
|
||||
|
||||
def get_qmin_qmax_for_qType(qType, reduce_range=False, symmetric=False):
|
||||
'''
|
||||
"""
|
||||
Return qmin and qmax, the minimum and maximum value representable by the given qType
|
||||
:parameter qType: onnx.onnx_pb.TensorProto.UINT8 or onnx.onnx_pb.TensorProto.INT8
|
||||
:return: qmin, qmax
|
||||
'''
|
||||
"""
|
||||
if qType == onnx_proto.TensorProto.UINT8:
|
||||
(qmin, qmax) = (0,127) if reduce_range else (0,255)
|
||||
(qmin, qmax) = (0, 127) if reduce_range else (0, 255)
|
||||
elif qType == onnx_proto.TensorProto.INT8:
|
||||
if symmetric:
|
||||
(qmin, qmax) = (-64,64) if reduce_range else (-127,127)
|
||||
(qmin, qmax) = (-64, 64) if reduce_range else (-127, 127)
|
||||
else:
|
||||
(qmin, qmax) = (-64,64) if reduce_range else (-128,127)
|
||||
(qmin, qmax) = (-64, 64) if reduce_range else (-128, 127)
|
||||
else:
|
||||
raise ValueError("Unexpected data type {} requested. Only INT8 and UINT8 are supported.".format(qType))
|
||||
return qmin, qmax
|
||||
|
||||
|
||||
def get_qrange_for_qType(qType, reduce_range=False, symmetric=False):
|
||||
'''
|
||||
"""
|
||||
Helper function to get the quantization range for a type.
|
||||
parameter qType: quantization type.
|
||||
return: quantization range.
|
||||
'''
|
||||
"""
|
||||
qmin, qmax = get_qmin_qmax_for_qType(qType, reduce_range, symmetric=symmetric)
|
||||
return qmax - qmin
|
||||
return qmax - qmin
|
||||
|
||||
|
||||
class QuantizedInitializer:
|
||||
'''
|
||||
Represents a linearly quantized weight input from ONNX operators
|
||||
'''
|
||||
def __init__(self,
|
||||
name,
|
||||
initializer,
|
||||
rmins,
|
||||
rmaxs,
|
||||
zero_points,
|
||||
scales,
|
||||
data=[],
|
||||
quantized_data=[],
|
||||
axis=None):
|
||||
"""
|
||||
Represents a linearly quantized weight input from ONNX operators
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
name,
|
||||
initializer,
|
||||
rmins,
|
||||
rmaxs,
|
||||
zero_points,
|
||||
scales,
|
||||
data=[],
|
||||
quantized_data=[],
|
||||
axis=None,
|
||||
):
|
||||
self.name = name
|
||||
self.initializer = initializer # TensorProto initializer in ONNX graph
|
||||
self.rmins = rmins # List of minimum range for each axis
|
||||
|
@ -243,16 +252,19 @@ class QuantizedInitializer:
|
|||
|
||||
|
||||
class QuantizedValue:
|
||||
'''
|
||||
"""
|
||||
Represents a linearly quantized value (input/output/initializer)
|
||||
'''
|
||||
def __init__(self,
|
||||
name,
|
||||
new_quantized_name,
|
||||
scale_name,
|
||||
zero_point_name,
|
||||
quantized_value_type,
|
||||
axis=None):
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
name,
|
||||
new_quantized_name,
|
||||
scale_name,
|
||||
zero_point_name,
|
||||
quantized_value_type,
|
||||
axis=None,
|
||||
):
|
||||
self.original_name = name
|
||||
self.q_name = new_quantized_name
|
||||
self.scale_name = scale_name
|
||||
|
@ -262,9 +274,10 @@ class QuantizedValue:
|
|||
|
||||
|
||||
class BiasToQuantize:
|
||||
'''
|
||||
"""
|
||||
Represents a bias to be quantized
|
||||
'''
|
||||
"""
|
||||
|
||||
def __init__(self, bias_name, input_name, weight_name):
|
||||
self.bias_name = bias_name
|
||||
self.input_name = input_name
|
||||
|
@ -272,57 +285,57 @@ class BiasToQuantize:
|
|||
|
||||
|
||||
def attribute_to_kwarg(attribute):
|
||||
'''
|
||||
"""
|
||||
Convert attribute to kwarg format for use with onnx.helper.make_node.
|
||||
:parameter attribute: attribute in AttributeProto format.
|
||||
:return: attribute in {key: value} format.
|
||||
'''
|
||||
if (attribute.type == 0):
|
||||
raise ValueError('attribute {} does not have type specified.'.format(attribute.name))
|
||||
"""
|
||||
if attribute.type == 0:
|
||||
raise ValueError("attribute {} does not have type specified.".format(attribute.name))
|
||||
|
||||
# Based on attribute type definitions from AttributeProto
|
||||
# definition in https://github.com/onnx/onnx/blob/master/onnx/onnx.proto
|
||||
if (attribute.type == 1):
|
||||
if attribute.type == 1:
|
||||
value = attribute.f
|
||||
elif (attribute.type == 2):
|
||||
elif attribute.type == 2:
|
||||
value = attribute.i
|
||||
elif (attribute.type == 3):
|
||||
elif attribute.type == 3:
|
||||
value = attribute.s
|
||||
elif (attribute.type == 4):
|
||||
elif attribute.type == 4:
|
||||
value = attribute.t
|
||||
elif (attribute.type == 5):
|
||||
elif attribute.type == 5:
|
||||
value = attribute.g
|
||||
elif (attribute.type == 6):
|
||||
elif attribute.type == 6:
|
||||
value = attribute.floats
|
||||
elif (attribute.type == 7):
|
||||
elif attribute.type == 7:
|
||||
value = attribute.ints
|
||||
elif (attribute.type == 8):
|
||||
elif attribute.type == 8:
|
||||
value = attribute.strings
|
||||
elif (attribute.type == 9):
|
||||
elif attribute.type == 9:
|
||||
value = attribute.tensors
|
||||
elif (attribute.type == 10):
|
||||
elif attribute.type == 10:
|
||||
value = attribute.graphs
|
||||
else:
|
||||
raise ValueError('attribute {} has unsupported type {}.'.format(attribute.name, attribute.type))
|
||||
raise ValueError("attribute {} has unsupported type {}.".format(attribute.name, attribute.type))
|
||||
|
||||
return {attribute.name: value}
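
# Typical call-site pattern (a sketch; the helper name below is made up, not part of this module):
# rebuild a node with the same attributes under a new name by round-tripping them through
# attribute_to_kwarg and onnx.helper.make_node.
def _remake_node_with_attrs(node, new_name):
    kwargs = {}
    for attribute in node.attribute:
        kwargs.update(attribute_to_kwarg(attribute))
    return onnx.helper.make_node(node.op_type, node.input, node.output, new_name, **kwargs)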
|
||||
|
||||
|
||||
def find_by_name(item_name, item_list):
|
||||
'''
|
||||
"""
|
||||
Helper function to find item by name in a list.
|
||||
parameter item_name: name of the item.
|
||||
parameter item_list: list of items.
|
||||
return: item if found. None otherwise.
|
||||
'''
|
||||
"""
|
||||
items = [item for item in item_list if item.name == item_name]
|
||||
return items[0] if len(items) > 0 else None
|
||||
|
||||
|
||||
def get_elem_index(elem_name, elem_list):
|
||||
'''
|
||||
"""
|
||||
Helper function to return index of an item in a node list
|
||||
'''
|
||||
"""
|
||||
elem_idx = -1
|
||||
for i in range(0, len(elem_list)):
|
||||
if elem_list[i] == elem_name:
|
||||
|
@ -331,50 +344,56 @@ def get_elem_index(elem_name, elem_list):
|
|||
|
||||
|
||||
def get_mul_node(inputs, output, name):
|
||||
'''
|
||||
"""
|
||||
Helper function to create a Mul node.
|
||||
parameter inputs: list of input names.
|
||||
parameter output: output name.
|
||||
parameter name: name of the node.
|
||||
return: Mul node in NodeProto format.
|
||||
'''
|
||||
"""
|
||||
return onnx.helper.make_node("Mul", inputs, [output], name)
|
||||
|
||||
|
||||
def generate_identified_filename(filename: Path, identifier: str) -> Path:
|
||||
'''
|
||||
Helper function to generate an identifiable filepath by concatenating the given identifier as a suffix.
|
||||
'''
|
||||
"""
|
||||
Helper function to generate an identifiable filepath by concatenating the given identifier as a suffix.
|
||||
"""
|
||||
return filename.parent.joinpath(filename.stem + identifier).with_suffix(filename.suffix)
|
||||
|
||||
|
||||
def apply_plot(hist, hist_edges):
|
||||
import sys
|
||||
import numpy
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy
|
||||
|
||||
numpy.set_printoptions(threshold=sys.maxsize)
|
||||
print("Histogram:")
|
||||
print(hist)
|
||||
print("Histogram Edges:")
|
||||
print(hist_edges)
|
||||
plt.stairs(hist, hist_edges, fill=True)
|
||||
plt.xlabel('Tensor value')
|
||||
plt.ylabel('Counts')
|
||||
plt.title('Tensor value V.S. Counts')
|
||||
plt.xlabel("Tensor value")
|
||||
plt.ylabel("Counts")
|
||||
plt.title("Tensor value V.S. Counts")
|
||||
plt.show()
|
||||
|
||||
|
||||
def write_calibration_table(calibration_cache):
|
||||
'''
|
||||
Helper function to write calibration table to files.
|
||||
'''
|
||||
"""
|
||||
Helper function to write calibration table to files.
|
||||
"""
|
||||
|
||||
import json
|
||||
|
||||
import flatbuffers
|
||||
import onnxruntime.quantization.CalTableFlatBuffers.TrtTable as TrtTable
|
||||
|
||||
import onnxruntime.quantization.CalTableFlatBuffers.KeyValue as KeyValue
|
||||
import onnxruntime.quantization.CalTableFlatBuffers.TrtTable as TrtTable
|
||||
|
||||
logging.info("calibration cache: {}".format(calibration_cache))
|
||||
|
||||
with open("calibration.json", 'w') as file:
|
||||
with open("calibration.json", "w") as file:
|
||||
file.write(json.dumps(calibration_cache)) # use `json.loads` to do the reverse
|
||||
|
||||
# Serialize data using FlatBuffers
|
||||
|
@ -406,7 +425,7 @@ def write_calibration_table(calibration_cache):
|
|||
builder.Finish(cal_table)
|
||||
buf = builder.Output()
|
||||
|
||||
with open("calibration.flatbuffers", 'wb') as file:
|
||||
with open("calibration.flatbuffers", "wb") as file:
|
||||
file.write(buf)
|
||||
|
||||
# Deserialize data (for validation)
|
||||
|
@ -419,12 +438,13 @@ def write_calibration_table(calibration_cache):
|
|||
logging.info(key_value.Value())
|
||||
|
||||
# write plain text
|
||||
with open("calibration.cache", 'w') as file:
|
||||
with open("calibration.cache", "w") as file:
|
||||
for key in sorted(calibration_cache.keys()):
|
||||
value = calibration_cache[key]
|
||||
s = key + ' ' + str(max(abs(value[0]), abs(value[1])))
|
||||
s = key + " " + str(max(abs(value[0]), abs(value[1])))
|
||||
file.write(s)
|
||||
file.write('\n')
|
||||
file.write("\n")
|
||||
|
||||
|
||||
def smooth_distribution(p, eps=0.0001):
|
||||
"""Given a discrete distribution (may have not been normalized to 1),
|
||||
|
@ -444,7 +464,11 @@ def smooth_distribution(p, eps=0.0001):
|
|||
# raise ValueError('The discrete probability distribution is malformed. All entries are 0.')
|
||||
return -1
|
||||
eps1 = eps * float(n_zeros) / float(n_nonzeros)
|
||||
assert eps1 < 1.0, 'n_zeros=%d, n_nonzeros=%d, eps1=%f' % (n_zeros, n_nonzeros, eps1)
|
||||
assert eps1 < 1.0, "n_zeros=%d, n_nonzeros=%d, eps1=%f" % (
|
||||
n_zeros,
|
||||
n_nonzeros,
|
||||
eps1,
|
||||
)
|
||||
|
||||
hist = p.astype(np.float32)
|
||||
hist += eps * is_zeros + (-eps1) * is_nonzeros
|
||||
|
@ -452,32 +476,36 @@ def smooth_distribution(p, eps=0.0001):
|
|||
|
||||
return hist
|
||||
|
||||
def model_has_external_data(model_path : Path):
|
||||
|
||||
def model_has_external_data(model_path: Path):
|
||||
model = onnx.load(model_path.as_posix(), load_external_data=False)
|
||||
for intializer in model.graph.initializer:
|
||||
if external_data_helper.uses_external_data(intializer):
|
||||
return True
|
||||
return False
|
||||
|
||||
def optimize_model(model_path : Path, opt_model_path : Path):
|
||||
'''
|
||||
|
||||
def optimize_model(model_path: Path, opt_model_path: Path):
|
||||
"""
|
||||
Generate model that applies graph optimization (constant folding, etc.)
|
||||
parameter model_path: path to the original onnx model
|
||||
parameter opt_model_path: path to the optimized onnx model
|
||||
:return: optimized onnx model
|
||||
'''
|
||||
"""
|
||||
sess_option = SessionOptions()
|
||||
sess_option.optimized_model_filepath = opt_model_path.as_posix()
|
||||
sess_option.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC
|
||||
_ = InferenceSession(model_path.as_posix(), sess_option, providers=['CPUExecutionProvider'])
|
||||
_ = InferenceSession(model_path.as_posix(), sess_option, providers=["CPUExecutionProvider"])
|
||||
|
||||
|
||||
def add_infer_metadata(model):
|
||||
metadata_props = {"onnx.infer": "onnxruntime.quant"}
|
||||
if model.metadata_props:
|
||||
for p in model.metadata_props:
|
||||
metadata_props.update({p.key : p.value})
|
||||
metadata_props.update({p.key: p.value})
|
||||
onnx.helper.set_model_props(model, metadata_props)
|
||||
|
||||
|
||||
def model_has_infer_metadata(model):
|
||||
if model.metadata_props:
|
||||
for p in model.metadata_props:
|
||||
|
@ -485,7 +513,8 @@ def model_has_infer_metadata(model):
|
|||
return True
|
||||
return False
|
||||
|
||||
def load_model_with_shape_infer(model_path : Path):
|
||||
|
||||
def load_model_with_shape_infer(model_path: Path):
|
||||
inferred_model_path = generate_identified_filename(model_path, "-inferred")
|
||||
onnx.shape_inference.infer_shapes_path(str(model_path), str(inferred_model_path))
|
||||
model = onnx.load(inferred_model_path.as_posix())
|
||||
|
@ -493,8 +522,8 @@ def load_model_with_shape_infer(model_path : Path):
|
|||
return model
|
||||
|
||||
|
||||
def load_model(model_path : Path, need_optimize : bool):
|
||||
with tempfile.TemporaryDirectory(prefix='ort.quant.') as quant_tmp_dir:
|
||||
def load_model(model_path: Path, need_optimize: bool):
|
||||
with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir:
|
||||
if need_optimize and not model_has_external_data(model_path):
|
||||
opt_model_path = Path(quant_tmp_dir).joinpath("model.onnx")
|
||||
optimize_model(model_path, opt_model_path)
|
||||
|
@ -504,18 +533,19 @@ def load_model(model_path : Path, need_optimize : bool):
|
|||
add_infer_metadata(model)
|
||||
return model
|
||||
|
||||
|
||||
def save_and_reload_model(model):
|
||||
with tempfile.TemporaryDirectory(prefix='ort.quant.') as quant_tmp_dir:
|
||||
with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir:
|
||||
model_path = Path(quant_tmp_dir).joinpath("model.onnx")
|
||||
onnx.external_data_helper.convert_model_to_external_data(model,
|
||||
all_tensors_to_one_file=True)
|
||||
onnx.external_data_helper.convert_model_to_external_data(model, all_tensors_to_one_file=True)
|
||||
onnx.save_model(model, model_path.as_posix())
|
||||
return load_model(model_path, False)
|
||||
|
||||
|
||||
def clone_model_with_shape_infer(model):
|
||||
if model_has_infer_metadata(model):
|
||||
cloned_model = onnx_proto.ModelProto()
|
||||
cloned_model.CopyFrom(model)
|
||||
else:
|
||||
cloned_model = save_and_reload_model(model)
|
||||
return cloned_model
|
||||
return cloned_model
|
||||
|
|
|
@ -5,52 +5,63 @@
|
|||
# --------------------------------------------------------------------------
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from onnx import onnx_pb as onnx_proto
|
||||
|
||||
from .quant_utils import QuantizationMode, QuantizedValueType, QuantizedInitializer, QuantizedValue
|
||||
from .quant_utils import find_by_name, get_elem_index, get_mul_node, generate_identified_filename, attribute_to_kwarg
|
||||
from .quant_utils import QuantType, QuantFormat
|
||||
from .quant_utils import load_model
|
||||
|
||||
from .registry import QLinearOpsRegistry, IntegerOpsRegistry
|
||||
|
||||
from .calibrate import CalibrationDataReader, CalibrationMethod, create_calibrator
|
||||
from .onnx_model import ONNXModel
|
||||
from .onnx_quantizer import ONNXQuantizer
|
||||
from .qdq_quantizer import QDQQuantizer
|
||||
from .calibrate import CalibrationDataReader, create_calibrator, CalibrationMethod
|
||||
from .quant_utils import (
|
||||
QuantFormat,
|
||||
QuantizationMode,
|
||||
QuantizedInitializer,
|
||||
QuantizedValue,
|
||||
QuantizedValueType,
|
||||
QuantType,
|
||||
attribute_to_kwarg,
|
||||
find_by_name,
|
||||
generate_identified_filename,
|
||||
get_elem_index,
|
||||
get_mul_node,
|
||||
load_model,
|
||||
)
|
||||
from .registry import IntegerOpsRegistry, QLinearOpsRegistry
|
||||
|
||||
|
||||
def check_static_quant_arguments(quant_format : QuantFormat,
|
||||
activation_type : QuantType,
|
||||
weight_type : QuantType):
|
||||
def check_static_quant_arguments(quant_format: QuantFormat, activation_type: QuantType, weight_type: QuantType):
|
||||
if activation_type == QuantType.QInt8 and weight_type == QuantType.QUInt8:
|
||||
raise ValueError("ONNXRuntime quantization doesn't support data format:"
|
||||
"activation_type=QuantType.QInt8, weight_type = QuantType.QUInt8")
|
||||
raise ValueError(
|
||||
"ONNXRuntime quantization doesn't support data format:"
|
||||
"activation_type=QuantType.QInt8, weight_type = QuantType.QUInt8"
|
||||
)
|
||||
|
||||
if activation_type == QuantType.QInt8 and \
|
||||
weight_type == QuantType.QInt8 and \
|
||||
quant_format != QuantFormat.QDQ: \
|
||||
logging.warning("Please use QuantFormat.QDQ for activation type QInt8 and weight type QInt8. "
|
||||
"Or it will lead to bad performance on x64.")
|
||||
if activation_type == QuantType.QInt8 and weight_type == QuantType.QInt8 and quant_format != QuantFormat.QDQ:
|
||||
logging.warning(
|
||||
"Please use QuantFormat.QDQ for activation type QInt8 and weight type QInt8. "
|
||||
"Or it will lead to bad performance on x64."
|
||||
)
|
||||
|
||||
|
||||
def quantize_static(model_input,
|
||||
model_output,
|
||||
calibration_data_reader: CalibrationDataReader,
|
||||
quant_format=QuantFormat.QDQ,
|
||||
op_types_to_quantize=[],
|
||||
per_channel=False,
|
||||
reduce_range=False,
|
||||
activation_type=QuantType.QInt8,
|
||||
weight_type=QuantType.QInt8,
|
||||
nodes_to_quantize=[],
|
||||
nodes_to_exclude=[],
|
||||
optimize_model=True,
|
||||
use_external_data_format=False,
|
||||
calibrate_method=CalibrationMethod.MinMax,
|
||||
extra_options = {}):
|
||||
def quantize_static(
|
||||
model_input,
|
||||
model_output,
|
||||
calibration_data_reader: CalibrationDataReader,
|
||||
quant_format=QuantFormat.QDQ,
|
||||
op_types_to_quantize=[],
|
||||
per_channel=False,
|
||||
reduce_range=False,
|
||||
activation_type=QuantType.QInt8,
|
||||
weight_type=QuantType.QInt8,
|
||||
nodes_to_quantize=[],
|
||||
nodes_to_exclude=[],
|
||||
optimize_model=True,
|
||||
use_external_data_format=False,
|
||||
calibrate_method=CalibrationMethod.MinMax,
|
||||
extra_options={},
|
||||
):
|
||||
|
||||
'''
|
||||
"""
|
||||
Given an onnx model and calibration data reader, create a quantized onnx model and save it into a file
|
||||
|
||||
It is recommended to use QuantFormat.QDQ format from 1.11 with activation_type = QuantType.QInt8 and
|
||||
|
@ -81,9 +92,9 @@ def quantize_static(model_input,
|
|||
List of nodes names to exclude. The nodes in this list will be excluded from quantization
|
||||
when it is not None.
|
||||
:param optimize_model: optimize model before quantization.
|
||||
:param use_external_data_format: option used for large size (>2GB) model. Set to False by default.
|
||||
:param calibrate_method:
|
||||
Current calibration methods supported are MinMax and Entropy.
|
||||
:param use_external_data_format: option used for large size (>2GB) model. Set to False by default.
|
||||
:param calibrate_method:
|
||||
Current calibration methods supported are MinMax and Entropy.
|
||||
Please use CalibrationMethod.MinMax or CalibrationMethod.Entropy as options.
|
||||
:param extra_options:
|
||||
key value pair dictionary for various options in different case. Current used:
|
||||
|
@ -97,13 +108,13 @@ def quantize_static(model_input,
|
|||
always quantize input and so generate quantized output. Also the True behavior
|
||||
could be disabled per node using the nodes_to_exclude.
|
||||
MatMulConstBOnly = True/False: Default is False for static mode. If enabled, only MatMul with const B will be quantized.
|
||||
AddQDQPairToWeight = True/False : Default is False which quantizes floating-point weight and feeds it to
|
||||
solely inserted DeQuantizeLinear node. If True, it remains floating-point weight and
|
||||
AddQDQPairToWeight = True/False : Default is False which quantizes floating-point weight and feeds it to
|
||||
solely inserted DeQuantizeLinear node. If True, it remains floating-point weight and
|
||||
inserts both QuantizeLinear/DeQuantizeLinear nodes to weight.
|
||||
OpTypesToExcludeOutputQuantizatioin = list of op type : Default is []. If any op type is specified, it won't quantize
|
||||
OpTypesToExcludeOutputQuantizatioin = list of op type : Default is []. If any op type is specified, it won't quantize
|
||||
the output of ops with this specific op types.
|
||||
DedicatedQDQPair = True/False : Default is False. When inserting QDQ pair, multiple nodes can share a single QDQ pair as their inputs.
|
||||
If True, it will create identical and dedicated QDQ pair for each node.
|
||||
If True, it will create identical and dedicated QDQ pair for each node.
|
||||
QDQOpTypePerChannelSupportToAxis = dictionary : Default is {}. Set channel axis for specific op type, for example: {'MatMul': 1},
|
||||
and it's effective only when per channel quantization is supported and per_channel is True.
|
||||
If specific op type supports per channel quantization but not explicitly specified with channel axis,
|
||||
|
@ -114,7 +125,7 @@ def quantize_static(model_input,
|
|||
CalibMovingAverageConstant = float : Default is 0.01. Constant smoothing factor to use when computing the moving average of
|
||||
the minimum and maximum values. Effective only when the calibration method selected is
|
||||
MinMax and when CalibMovingAverage is set to True.
|
||||
'''
|
||||
"""
|
||||
|
||||
mode = QuantizationMode.QLinearOps
|
||||
|
||||
|
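
# End-to-end usage sketch for the static path. Everything below is illustrative: the file paths,
# the input name and shape fed by the reader, and the option choices are assumptions, not part of this change.
import numpy
from onnxruntime.quantization import CalibrationDataReader, QuantFormat, QuantType, quantize_static

class _RandomCalibrationReader(CalibrationDataReader):
    # Feeds a handful of random batches; replace with real calibration data in practice.
    def __init__(self, input_name="input", batches=8):
        self._data = iter(
            [{input_name: numpy.random.rand(1, 3, 224, 224).astype(numpy.float32)} for _ in range(batches)]
        )

    def get_next(self):
        return next(self._data, None)

quantize_static(
    "model.onnx",
    "model-quant.onnx",
    _RandomCalibrationReader(),
    quant_format=QuantFormat.QDQ,
    activation_type=QuantType.QInt8,
    weight_type=QuantType.QInt8,
    per_channel=True,
    extra_options={"AddQDQPairToWeight": True},
)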
@ -124,17 +135,19 @@ def quantize_static(model_input,
|
|||
model = load_model(Path(model_input), optimize_model)
|
||||
|
||||
calib_extra_options_keys = [
|
||||
('CalibTensorRangeSymmetric', 'symmetric'),
|
||||
('CalibMovingAverage', 'moving_average'),
|
||||
('CalibMovingAverageConstant', 'averaging_constant')
|
||||
("CalibTensorRangeSymmetric", "symmetric"),
|
||||
("CalibMovingAverage", "moving_average"),
|
||||
("CalibMovingAverageConstant", "averaging_constant"),
|
||||
]
|
||||
calib_extra_options = {key: extra_options.get(name) for (name, key) in calib_extra_options_keys if name in extra_options}
|
||||
calib_extra_options = {
|
||||
key: extra_options.get(name) for (name, key) in calib_extra_options_keys if name in extra_options
|
||||
}
|
||||
calibrator = create_calibrator(
|
||||
model,
|
||||
op_types_to_quantize,
|
||||
calibrate_method=calibrate_method,
|
||||
use_external_data_format=use_external_data_format,
|
||||
extra_options=calib_extra_options
|
||||
extra_options=calib_extra_options,
|
||||
)
|
||||
calibrator.collect_data(calibration_data_reader)
|
||||
tensors_range = calibrator.compute_range()
|
||||
|
@ -154,7 +167,8 @@ def quantize_static(model_input,
|
|||
nodes_to_quantize,
|
||||
nodes_to_exclude,
|
||||
op_types_to_quantize,
|
||||
extra_options)
|
||||
extra_options,
|
||||
)
|
||||
else:
|
||||
quantizer = QDQQuantizer(
|
||||
model,
|
||||
|
@ -168,24 +182,27 @@ def quantize_static(model_input,
|
|||
nodes_to_quantize,
|
||||
nodes_to_exclude,
|
||||
op_types_to_quantize,
|
||||
extra_options)
|
||||
extra_options,
|
||||
)
|
||||
|
||||
quantizer.quantize_model()
|
||||
quantizer.model.save_model_to_file(model_output, use_external_data_format)
|
||||
|
||||
|
||||
def quantize_dynamic(model_input: Path,
|
||||
model_output: Path,
|
||||
op_types_to_quantize=[],
|
||||
per_channel=False,
|
||||
reduce_range=False,
|
||||
weight_type=QuantType.QInt8,
|
||||
nodes_to_quantize=[],
|
||||
nodes_to_exclude=[],
|
||||
optimize_model=True,
|
||||
use_external_data_format=False,
|
||||
extra_options = { }):
|
||||
'''
|
||||
def quantize_dynamic(
|
||||
model_input: Path,
|
||||
model_output: Path,
|
||||
op_types_to_quantize=[],
|
||||
per_channel=False,
|
||||
reduce_range=False,
|
||||
weight_type=QuantType.QInt8,
|
||||
nodes_to_quantize=[],
|
||||
nodes_to_exclude=[],
|
||||
optimize_model=True,
|
||||
use_external_data_format=False,
|
||||
extra_options={},
|
||||
):
|
||||
"""
|
||||
Given an onnx model, create a quantized onnx model and save it into a file
|
||||
:param model_input: file path of model to quantize
|
||||
:param model_output: file path of quantized model
|
||||
|
@ -218,7 +235,7 @@ def quantize_dynamic(model_input: Path,
|
|||
always quantize input and so generate quantized output. Also the True behavior
|
||||
could be disabled per node using the nodes_to_exclude.
|
||||
MatMulConstBOnly = True/False: Default is True for dynamic mode. If enabled, only MatMul with const B will be quantized.
|
||||
'''
|
||||
"""
|
||||
|
||||
mode = QuantizationMode.IntegerOps
|
||||
|
||||
|
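
# Minimal dynamic-quantization sketch (the file paths are placeholders): weights are stored as int8
# and activations are quantized on the fly at run time.
from onnxruntime.quantization import QuantType, quantize_dynamic

quantize_dynamic("model.onnx", "model-quant.onnx", weight_type=QuantType.QInt8)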
@ -227,22 +244,23 @@ def quantize_dynamic(model_input: Path,
|
|||
|
||||
model = load_model(Path(model_input), optimize_model)
|
||||
|
||||
if 'MatMulConstBOnly' not in extra_options:
|
||||
extra_options['MatMulConstBOnly'] = True
|
||||
if "MatMulConstBOnly" not in extra_options:
|
||||
extra_options["MatMulConstBOnly"] = True
|
||||
|
||||
quantizer = ONNXQuantizer(
|
||||
model,
|
||||
per_channel,
|
||||
reduce_range,
|
||||
mode,
|
||||
False, #static
|
||||
False, # static
|
||||
weight_type,
|
||||
QuantType.QUInt8, #dynamic activation only supports uint8
|
||||
QuantType.QUInt8, # dynamic activation only supports uint8
|
||||
None,
|
||||
nodes_to_quantize,
|
||||
nodes_to_exclude,
|
||||
op_types_to_quantize,
|
||||
extra_options)
|
||||
extra_options,
|
||||
)
|
||||
|
||||
quantizer.quantize_model()
|
||||
quantizer.model.save_model_to_file(model_output, use_external_data_format)
|
||||
|
|
|
@ -1,28 +1,28 @@
|
|||
from .quant_utils import QuantizationMode
|
||||
from .operators.activation import QDQRemovableActivation, QLinearActivation
|
||||
from .operators.argmax import QArgMax
|
||||
from .operators.base_operator import QuantOperatorBase
|
||||
from .operators.qdq_base_operator import QDQOperatorBase
|
||||
from .operators.matmul import MatMulInteger, QLinearMatMul, QDQMatMul
|
||||
from .operators.attention import AttentionQuant
|
||||
from .operators.base_operator import QuantOperatorBase
|
||||
from .operators.binary_op import QLinearBinaryOp
|
||||
from .operators.concat import QDQConcat, QLinearConcat
|
||||
from .operators.conv import ConvInteger, QDQConv, QLinearConv
|
||||
from .operators.direct_q8 import Direct8BitOp, QDQDirect8BitOp
|
||||
from .operators.embed_layernorm import EmbedLayerNormalizationQuant
|
||||
from .operators.gather import GatherQuant
|
||||
from .operators.conv import QLinearConv, ConvInteger, QDQConv
|
||||
from .operators.activation import QLinearActivation, QDQRemovableActivation
|
||||
from .operators.binary_op import QLinearBinaryOp
|
||||
from .operators.maxpool import QDQMaxPool, QMaxPool
|
||||
from .operators.gavgpool import QGlobalAveragePool
|
||||
from .operators.gemm import QDQGemm, QLinearGemm
|
||||
from .operators.lstm import LSTMQuant
|
||||
from .operators.split import QSplit
|
||||
from .operators.matmul import MatMulInteger, QDQMatMul, QLinearMatMul
|
||||
from .operators.maxpool import QDQMaxPool, QMaxPool
|
||||
from .operators.pad import QPad
|
||||
from .operators.direct_q8 import Direct8BitOp, QDQDirect8BitOp
|
||||
from .operators.resize import QResize, QDQResize
|
||||
from .operators.pooling import QLinearPool
|
||||
from .operators.concat import QLinearConcat, QDQConcat
|
||||
from .operators.gemm import QLinearGemm, QDQGemm
|
||||
from .operators.qdq_base_operator import QDQOperatorBase
|
||||
from .operators.resize import QDQResize, QResize
|
||||
from .operators.split import QSplit
|
||||
from .quant_utils import QuantizationMode
|
||||
|
||||
CommonOpsRegistry = {
|
||||
"Gather": GatherQuant,
|
||||
"Transpose" : Direct8BitOp,
|
||||
"Transpose": Direct8BitOp,
|
||||
"EmbedLayerNormalization": EmbedLayerNormalizationQuant,
|
||||
}
|
||||
|
||||
|
@ -50,10 +50,10 @@ QLinearOpsRegistry = {
|
|||
"Split": QSplit,
|
||||
"Pad": QPad,
|
||||
"Reshape": Direct8BitOp,
|
||||
"Squeeze" : Direct8BitOp,
|
||||
"Unsqueeze" : Direct8BitOp,
|
||||
"Squeeze": Direct8BitOp,
|
||||
"Unsqueeze": Direct8BitOp,
|
||||
"Resize": QResize,
|
||||
"AveragePool" : QLinearPool,
|
||||
"AveragePool": QLinearPool,
|
||||
"Concat": QLinearConcat,
|
||||
}
|
||||
QLinearOpsRegistry.update(CommonOpsRegistry)
|
||||
|
@ -64,12 +64,12 @@ QDQRegistry = {
|
|||
"Clip": QDQRemovableActivation,
|
||||
"Relu": QDQRemovableActivation,
|
||||
"Reshape": QDQDirect8BitOp,
|
||||
"Transpose" : QDQDirect8BitOp,
|
||||
"Squeeze" : QDQDirect8BitOp,
|
||||
"Unsqueeze" : QDQDirect8BitOp,
|
||||
"Transpose": QDQDirect8BitOp,
|
||||
"Squeeze": QDQDirect8BitOp,
|
||||
"Unsqueeze": QDQDirect8BitOp,
|
||||
"Resize": QDQResize,
|
||||
"MaxPool": QDQMaxPool,
|
||||
"AveragePool" : QDQDirect8BitOp,
|
||||
"AveragePool": QDQDirect8BitOp,
|
||||
"Concat": QDQConcat,
|
||||
"MatMul": QDQMatMul,
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@@ -1,35 +1,49 @@
import os
import csv
import logging
import coloredlogs
import argparse
import copy
import csv
import json
import re
import logging
import os
import pprint
from perf_utils import *
import re

import coloredlogs
from benchmark import *
from perf_utils import *


def write_model_info_to_file(model, path):
    with open(path, 'w') as file:
        file.write(json.dumps(model)) # use `json.loads` to do the reverse
    with open(path, "w") as file:
        file.write(json.dumps(model))  # use `json.loads` to do the reverse
|
||||
|
||||
def get_ep_list(comparison):
|
||||
if comparison == 'acl':
|
||||
|
||||
def get_ep_list(comparison):
|
||||
if comparison == "acl":
|
||||
ep_list = [cpu, acl]
|
||||
else:
|
||||
else:
|
||||
# test with cuda and trt
|
||||
ep_list = [cpu, cuda, trt, standalone_trt, cuda_fp16, trt_fp16, standalone_trt_fp16]
|
||||
ep_list = [
|
||||
cpu,
|
||||
cuda,
|
||||
trt,
|
||||
standalone_trt,
|
||||
cuda_fp16,
|
||||
trt_fp16,
|
||||
standalone_trt_fp16,
|
||||
]
|
||||
return ep_list
|
||||
|
||||
def resolve_trtexec_path(workspace):
|
||||
|
||||
def resolve_trtexec_path(workspace):
|
||||
trtexec_options = get_output(["find", workspace, "-name", "trtexec"])
|
||||
trtexec_path = re.search(r'.*/bin/trtexec', trtexec_options).group(0)
|
||||
trtexec_path = re.search(r".*/bin/trtexec", trtexec_options).group(0)
|
||||
logger.info("using trtexec {}".format(trtexec_path))
|
||||
return trtexec_path
|
||||
|
||||
|
||||
def dict_to_args(dct):
|
||||
return ','.join(["{}={}".format(k, v) for k, v in dct.items()])
|
||||
return ",".join(["{}={}".format(k, v) for k, v in dct.items()])
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_arguments()
|
||||
|
@ -42,7 +56,7 @@ def main():
|
|||
else:
|
||||
ep_list = get_ep_list(args.comparison)
|
||||
|
||||
if standalone_trt in ep_list or standalone_trt_fp16 in ep_list:
|
||||
if standalone_trt in ep_list or standalone_trt_fp16 in ep_list:
|
||||
trtexec = resolve_trtexec_path(args.workspace)
|
||||
|
||||
models = {}
|
||||
|
@ -59,28 +73,35 @@ def main():
|
|||
specs_csv = specs_name + csv_ending
|
||||
|
||||
for model, model_info in models.items():
|
||||
logger.info("\n" + "="*40 + "="*len(model))
|
||||
logger.info("="*20 + model +"="*20)
|
||||
logger.info("="*40 + "="*len(model))
|
||||
logger.info("\n" + "=" * 40 + "=" * len(model))
|
||||
logger.info("=" * 20 + model + "=" * 20)
|
||||
logger.info("=" * 40 + "=" * len(model))
|
||||
|
||||
model_info["model_name"] = model
|
||||
|
||||
model_list_file = os.path.join(os.getcwd(), model +'.json')
|
||||
model_info["model_name"] = model
|
||||
|
||||
model_list_file = os.path.join(os.getcwd(), model + ".json")
|
||||
write_model_info_to_file([model_info], model_list_file)
|
||||
|
||||
for ep in ep_list:
|
||||
|
||||
command = ["python3",
|
||||
"benchmark.py",
|
||||
"-r", args.running_mode,
|
||||
"-m", model_list_file,
|
||||
"-o", args.perf_result_path,
|
||||
"--ep", ep,
|
||||
"--write_test_result", "false"]
|
||||
|
||||
if ep == standalone_trt or ep == standalone_trt_fp16:
|
||||
if args.running_mode == "validate":
|
||||
continue
|
||||
|
||||
command = [
|
||||
"python3",
|
||||
"benchmark.py",
|
||||
"-r",
|
||||
args.running_mode,
|
||||
"-m",
|
||||
model_list_file,
|
||||
"-o",
|
||||
args.perf_result_path,
|
||||
"--ep",
|
||||
ep,
|
||||
"--write_test_result",
|
||||
"false",
|
||||
]
|
||||
|
||||
if ep == standalone_trt or ep == standalone_trt_fp16:
|
||||
if args.running_mode == "validate":
|
||||
continue
|
||||
else:
|
||||
command.extend(["--trtexec", trtexec])
|
||||
|
||||
|
@ -92,20 +113,30 @@ def main():
|
|||
|
||||
if args.running_mode == "validate":
|
||||
command.extend(["--benchmark_metrics_csv", benchmark_metrics_csv])
|
||||
|
||||
|
||||
elif args.running_mode == "benchmark":
|
||||
command.extend(["-t", str(args.test_times),
|
||||
"-o", args.perf_result_path,
|
||||
"--write_test_result", "false",
|
||||
"--benchmark_fail_csv", benchmark_fail_csv,
|
||||
"--benchmark_latency_csv", benchmark_latency_csv,
|
||||
"--benchmark_success_csv", benchmark_success_csv])
|
||||
|
||||
command.extend(
|
||||
[
|
||||
"-t",
|
||||
str(args.test_times),
|
||||
"-o",
|
||||
args.perf_result_path,
|
||||
"--write_test_result",
|
||||
"false",
|
||||
"--benchmark_fail_csv",
|
||||
benchmark_fail_csv,
|
||||
"--benchmark_latency_csv",
|
||||
benchmark_latency_csv,
|
||||
"--benchmark_success_csv",
|
||||
benchmark_success_csv,
|
||||
]
|
||||
)
|
||||
|
||||
p = subprocess.run(command)
|
||||
logger.info(p)
|
||||
|
||||
if p.returncode != 0:
|
||||
error_type = "runtime error"
|
||||
error_type = "runtime error"
|
||||
error_message = "Benchmark script exited with returncode = " + str(p.returncode)
|
||||
logger.error(error_message)
|
||||
update_fail_model_map(model_to_fail_ep, model, ep, error_type, error_message)
|
||||
|
@ -117,6 +148,7 @@ def main():
|
|||
path = os.path.join(os.getcwd(), args.perf_result_path)
|
||||
if not os.path.exists(path):
|
||||
from pathlib import Path
|
||||
|
||||
Path(path).mkdir(parents=True, exist_ok=True)
|
||||
|
||||
if args.running_mode == "validate":
|
||||
|
@ -127,8 +159,8 @@ def main():
|
|||
if os.path.exists(METRICS_FILE):
|
||||
model_to_metrics = read_map_from_file(METRICS_FILE)
|
||||
output_metrics(model_to_metrics, os.path.join(path, benchmark_metrics_csv))
|
||||
logger.info("\nSaved model metrics results to {}".format(benchmark_metrics_csv))
|
||||
|
||||
logger.info("\nSaved model metrics results to {}".format(benchmark_metrics_csv))
|
||||
|
||||
elif args.running_mode == "benchmark":
|
||||
logger.info("\n=========================================")
|
||||
logger.info("======= Models/EPs session creation =======")
|
||||
|
@ -138,8 +170,8 @@ def main():
|
|||
model_to_session = read_map_from_file(SESSION_FILE)
|
||||
pretty_print(pp, model_to_session)
|
||||
output_session_creation(model_to_session, os.path.join(path, benchmark_session_csv))
|
||||
logger.info("\nSaved session creation results to {}".format(benchmark_session_csv))
|
||||
|
||||
logger.info("\nSaved session creation results to {}".format(benchmark_session_csv))
|
||||
|
||||
logger.info("\n=========================================================")
|
||||
logger.info("========== Failing Models/EPs (accumulated) ==============")
|
||||
logger.info("==========================================================")
|
||||
|
@ -148,7 +180,7 @@ def main():
|
|||
model_to_fail_ep = read_map_from_file(FAIL_MODEL_FILE)
|
||||
output_fail(model_to_fail_ep, os.path.join(path, benchmark_fail_csv))
|
||||
logger.info(model_to_fail_ep)
|
||||
logger.info("\nSaved model failing results to {}".format(benchmark_fail_csv))
|
||||
logger.info("\nSaved model failing results to {}".format(benchmark_fail_csv))
|
||||
|
||||
logger.info("\n=======================================================")
|
||||
logger.info("=========== Models/EPs Status (accumulated) ===========")
|
||||
|
@ -163,11 +195,11 @@ def main():
|
|||
model_fail = read_map_from_file(FAIL_MODEL_FILE)
|
||||
is_fail = True
|
||||
model_status = build_status(model_status, model_fail, is_fail)
|
||||
|
||||
|
||||
pretty_print(pp, model_status)
|
||||
|
||||
output_status(model_status, os.path.join(path, benchmark_status_csv))
|
||||
logger.info("\nSaved model status results to {}".format(benchmark_status_csv))
|
||||
|
||||
output_status(model_status, os.path.join(path, benchmark_status_csv))
|
||||
logger.info("\nSaved model status results to {}".format(benchmark_status_csv))
|
||||
|
||||
logger.info("\n=========================================================")
|
||||
logger.info("=========== Models/EPs latency (accumulated) ===========")
|
||||
|
@ -176,11 +208,11 @@ def main():
|
|||
if os.path.exists(LATENCY_FILE):
|
||||
model_to_latency = read_map_from_file(LATENCY_FILE)
|
||||
add_improvement_information(model_to_latency)
|
||||
|
||||
|
||||
pretty_print(pp, model_to_latency)
|
||||
|
||||
|
||||
output_latency(model_to_latency, os.path.join(path, benchmark_latency_csv))
|
||||
logger.info("\nSaved model latency results to {}".format(benchmark_latency_csv))
|
||||
logger.info("\nSaved model latency results to {}".format(benchmark_latency_csv))
|
||||
|
||||
logger.info("\n===========================================")
|
||||
logger.info("=========== System information ===========")
|
||||
|
@ -189,7 +221,8 @@ def main():
|
|||
pretty_print(pp, info)
|
||||
logger.info("\n")
|
||||
output_specs(info, os.path.join(path, specs_csv))
|
||||
logger.info("\nSaved hardware specs to {}".format(specs_csv))
|
||||
logger.info("\nSaved hardware specs to {}".format(specs_csv))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
@ -1,46 +1,62 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
import argparse
|
||||
|
||||
ep_map = {"cpu": "CPU", "cuda":"CUDA","trt": "TRT EP","native": "Standalone TRT"}
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
def parse_arguments():
|
||||
ep_map = {"cpu": "CPU", "cuda": "CUDA", "trt": "TRT EP", "native": "Standalone TRT"}
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
# create parser
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("-p", "--prev", required=True, help="previous csv")
|
||||
parser.add_argument("-c", "--current", required=True, help="current csv")
|
||||
parser.add_argument("-o", "--output_csv", required=True, help="output different csv")
|
||||
parser.add_argument("--ep", required=False, default="trt", choices=["cpu", "cuda", "trt", "native"], help="ep to capture regressions on")
|
||||
parser.add_argument("--tolerance", required=False, default=0, help="allowed tolerance for latency comparison")
|
||||
parser.add_argument(
|
||||
"--ep",
|
||||
required=False,
|
||||
default="trt",
|
||||
choices=["cpu", "cuda", "trt", "native"],
|
||||
help="ep to capture regressions on",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tolerance",
|
||||
required=False,
|
||||
default=0,
|
||||
help="allowed tolerance for latency comparison",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
return args
|
||||
|
||||
def get_table_condition(table, fp, ep, tol):
|
||||
|
||||
def get_table_condition(table, fp, ep, tol):
|
||||
ep = ep_map[ep]
|
||||
col1 = ep + " " + fp + " \nmean (ms)_x"
|
||||
col2 = ep + " " + fp + " \nmean (ms)_y"
|
||||
condition = table[col1] > (table[col2] + tol)
|
||||
return condition
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_arguments()
|
||||
a = pd.read_csv(args.prev)
|
||||
b = pd.read_csv(args.current)
|
||||
|
||||
common = a.merge(b, on=['Model'])
|
||||
|
||||
|
||||
common = a.merge(b, on=["Model"])
|
||||
|
||||
condition_fp32 = get_table_condition(common, "fp32", args.ep, args.tolerance)
|
||||
condition_fp16 = get_table_condition(common, "fp16", args.ep, args.tolerance)
|
||||
|
||||
common['greater'] = np.where((condition_fp32 | condition_fp16), True, False)
|
||||
greater = common[common['greater'] == True].drop(['greater'], axis=1)
|
||||
|
||||
|
||||
common["greater"] = np.where((condition_fp32 | condition_fp16), True, False)
|
||||
greater = common[common["greater"] == True].drop(["greater"], axis=1)
|
||||
|
||||
# arrange columns
|
||||
keys = list(greater.keys().sort_values())
|
||||
keys.insert(0, keys.pop(keys.index('Model')))
|
||||
keys.insert(0, keys.pop(keys.index("Model")))
|
||||
greater = greater[keys]
|
||||
|
||||
|
||||
greater.to_csv(args.output_csv)
|
||||
|
||||
if __name__=='__main__':
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
@ -1,22 +1,30 @@
|
|||
import pandas as pd
|
||||
import argparse
|
||||
|
||||
def parse_arguments():
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
# create parser
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("-p", "--prev", required=True, help="previous csv")
|
||||
parser.add_argument("-c", "--current", required=True, help="current csv")
|
||||
parser.add_argument("-o", "--output_csv", required=True, help="output different csv")
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
return args
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_arguments()
|
||||
a = pd.read_csv(args.prev)
|
||||
b = pd.read_csv(args.current)
|
||||
common = b.merge(a, on=['model','ep','error type','error message'])
|
||||
diff = b.append(common, ignore_index=True).drop_duplicates(['model', 'ep', 'error type', 'error message'], keep=False).loc[:b.index.max()]
|
||||
common = b.merge(a, on=["model", "ep", "error type", "error message"])
|
||||
diff = (
|
||||
b.append(common, ignore_index=True)
|
||||
.drop_duplicates(["model", "ep", "error type", "error message"], keep=False)
|
||||
.loc[: b.index.max()]
|
||||
)
|
||||
diff.to_csv(args.output_csv)
|
||||
|
||||
if __name__=='__main__':
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
@ -1,21 +1,22 @@
|
|||
import subprocess
|
||||
import json
|
||||
import pprint
|
||||
import logging
|
||||
import coloredlogs
|
||||
import pprint
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
debug = False
|
||||
debug_verbose = False
|
||||
import coloredlogs
|
||||
|
||||
# ORT ep names
|
||||
debug = False
|
||||
debug_verbose = False
|
||||
|
||||
# ORT ep names
|
||||
cpu_ep = "CPUExecutionProvider"
|
||||
cuda_ep = "CUDAExecutionProvider"
|
||||
trt_ep = "TensorrtExecutionProvider"
|
||||
acl_ep = "ACLExecutionProvider"
|
||||
|
||||
# provider names
|
||||
# provider names
|
||||
cpu = "ORT-CPUFp32"
|
||||
cuda = "ORT-CUDAFp32"
|
||||
cuda_fp16 = "ORT-CUDAFp16"
|
||||
|
@ -26,56 +27,70 @@ standalone_trt_fp16 = "TRTFp16"
|
|||
acl = "ORT-ACLFp32"
|
||||
|
||||
# table names
|
||||
metrics_name = 'metrics'
|
||||
success_name = 'success'
|
||||
fail_name = 'fail'
|
||||
memory_name = 'memory'
|
||||
latency_name = 'latency'
|
||||
status_name = 'status'
|
||||
latency_over_time_name = 'latency_over_time'
|
||||
specs_name = 'specs'
|
||||
session_name = 'session'
|
||||
metrics_name = "metrics"
|
||||
success_name = "success"
|
||||
fail_name = "fail"
|
||||
memory_name = "memory"
|
||||
latency_name = "latency"
|
||||
status_name = "status"
|
||||
latency_over_time_name = "latency_over_time"
|
||||
specs_name = "specs"
|
||||
session_name = "session"
|
||||
|
||||
# column names
|
||||
model_title = 'Model'
|
||||
group_title = 'Group'
|
||||
# column names
|
||||
model_title = "Model"
|
||||
group_title = "Group"
|
||||
|
||||
# endings
|
||||
# endings
|
||||
second = "_second"
|
||||
csv_ending = '.csv'
|
||||
avg_ending = ' \nmean (ms)'
|
||||
percentile_ending = ' \n90th percentile (ms)'
|
||||
memory_ending = ' \npeak memory usage (MiB)'
|
||||
session_ending = ' \n session creation time (s)'
|
||||
second_session_ending = ' \n second session creation time (s)'
|
||||
csv_ending = ".csv"
|
||||
avg_ending = " \nmean (ms)"
|
||||
percentile_ending = " \n90th percentile (ms)"
|
||||
memory_ending = " \npeak memory usage (MiB)"
|
||||
session_ending = " \n session creation time (s)"
|
||||
second_session_ending = " \n second session creation time (s)"
|
||||
ort_provider_list = [cpu, cuda, trt, cuda_fp16, trt_fp16]
|
||||
provider_list = [cpu, cuda, trt, standalone_trt, cuda_fp16, trt_fp16, standalone_trt_fp16]
|
||||
provider_list = [
|
||||
cpu,
|
||||
cuda,
|
||||
trt,
|
||||
standalone_trt,
|
||||
cuda_fp16,
|
||||
trt_fp16,
|
||||
standalone_trt_fp16,
|
||||
]
|
||||
table_headers = [model_title] + provider_list
|
||||
|
||||
# graph options
|
||||
disable = 'disable'
|
||||
basic = 'basic'
|
||||
extended = 'extended'
|
||||
enable_all = 'all'
|
||||
# graph options
|
||||
disable = "disable"
|
||||
basic = "basic"
|
||||
extended = "extended"
|
||||
enable_all = "all"
|
||||
|
||||
|
||||
def is_standalone(ep):
|
||||
return ep == standalone_trt or ep == standalone_trt_fp16
|
||||
|
||||
|
||||
def get_output(command):
|
||||
p = subprocess.run(command, check=True, stdout=subprocess.PIPE)
|
||||
output = p.stdout.decode("ascii").strip()
|
||||
return output
|
||||
|
||||
def find(regex_string):
|
||||
|
||||
def find(regex_string):
|
||||
import glob
|
||||
|
||||
results = glob.glob(regex_string)
|
||||
results.sort()
|
||||
return results
|
||||
|
||||
|
||||
def pretty_print(pp, json_object):
|
||||
pp.pprint(json_object)
|
||||
sys.stdout.flush()
|
||||
|
||||
|
||||
def parse_single_file(f):
|
||||
|
||||
try:
|
||||
|
@ -86,7 +101,7 @@ def parse_single_file(f):
|
|||
model_run_flag = False
|
||||
first_run_flag = True
|
||||
provider_op_map = {} # ep -> map of operator to duration
|
||||
provider_op_map_first_run = {} # ep -> map of operator to duration
|
||||
provider_op_map_first_run = {} # ep -> map of operator to duration
|
||||
|
||||
for row in data:
|
||||
if not "cat" in row:
|
||||
|
@ -134,20 +149,19 @@ def parse_single_file(f):
|
|||
op_map[row["name"]] = row["dur"]
|
||||
provider_op_map[provider] = op_map
|
||||
|
||||
|
||||
if debug_verbose:
|
||||
pprint._sorted = lambda x:x
|
||||
pprint._sorted = lambda x: x
|
||||
pprint.sorted = lambda x, key=None: x
|
||||
pp = pprint.PrettyPrinter(indent=4)
|
||||
print("------First run ops map (START)------")
|
||||
for key, map in provider_op_map_first_run.items():
|
||||
print(key)
|
||||
print(key)
|
||||
pp.pprint({k: v for k, v in sorted(map.items(), key=lambda item: item[1], reverse=True)})
|
||||
|
||||
print("------First run ops map (END) ------")
|
||||
print("------Second run ops map (START)------")
|
||||
for key, map in provider_op_map.items():
|
||||
print(key)
|
||||
print(key)
|
||||
pp.pprint({k: v for k, v in sorted(map.items(), key=lambda item: item[1], reverse=True)})
|
||||
print("------Second run ops map (END) ------")
|
||||
|
||||
|
@ -156,6 +170,7 @@ def parse_single_file(f):
|
|||
|
||||
return None
|
||||
|
||||
|
||||
def calculate_cuda_op_percentage(cuda_op_map):
|
||||
if not cuda_op_map or len(cuda_op_map) == 0:
|
||||
return 0
|
||||
|
@ -163,14 +178,15 @@ def calculate_cuda_op_percentage(cuda_op_map):
|
|||
cuda_ops = 0
|
||||
cpu_ops = 0
|
||||
for key, value in cuda_op_map.items():
|
||||
if key == 'CUDAExecutionProvider':
|
||||
if key == "CUDAExecutionProvider":
|
||||
cuda_ops += len(value)
|
||||
|
||||
if key == 'CPUExecutionProvider':
|
||||
if key == "CPUExecutionProvider":
|
||||
cpu_ops += len(value)
|
||||
|
||||
return cuda_ops / (cuda_ops + cpu_ops)
|
||||
|
||||
|
||||
##########################################
|
||||
# Return: total ops executed in TRT,
|
||||
# total ops,
|
||||
|
@ -208,6 +224,7 @@ def calculate_trt_op_percentage(trt_op_map, cuda_op_map):
|
|||
|
||||
return ((total_ops - total_cuda_and_cpu_ops), total_ops, ratio_of_ops_in_trt)
|
||||
|
||||
|
||||
def get_total_ops(op_map):
|
||||
total_ops = 0
|
||||
|
||||
|
@ -227,7 +244,11 @@ def calculate_trt_latency_percentage(trt_op_map):
|
|||
# % of TRT execution time
|
||||
total_execution_time = 0
|
||||
total_trt_execution_time = 0
|
||||
for ep in ["TensorrtExecutionProvider", "CUDAExecutionProvider", "CPUExecutionProvider"]:
|
||||
for ep in [
|
||||
"TensorrtExecutionProvider",
|
||||
"CUDAExecutionProvider",
|
||||
"CPUExecutionProvider",
|
||||
]:
|
||||
if ep in trt_op_map:
|
||||
op_map = trt_op_map[ep]
|
||||
|
||||
|
@ -240,8 +261,6 @@ def calculate_trt_latency_percentage(trt_op_map):
|
|||
|
||||
total_execution_time += total_time
|
||||
|
||||
|
||||
|
||||
if total_execution_time == 0:
|
||||
ratio_of_trt_execution_time = 0
|
||||
else:
|
||||
|
@ -257,7 +276,10 @@ def calculate_trt_latency_percentage(trt_op_map):
|
|||
|
||||
def get_profile_metrics(path, profile_already_parsed, logger=None):
|
||||
logger.info("Parsing/Analyzing profiling files in {} ...".format(path))
|
||||
p1 = subprocess.Popen(["find", path, "-name", "onnxruntime_profile*", "-printf", "%T+\t%p\n"], stdout=subprocess.PIPE)
|
||||
p1 = subprocess.Popen(
|
||||
["find", path, "-name", "onnxruntime_profile*", "-printf", "%T+\t%p\n"],
|
||||
stdout=subprocess.PIPE,
|
||||
)
|
||||
p2 = subprocess.Popen(["sort"], stdin=p1.stdout, stdout=subprocess.PIPE)
|
||||
stdout, sterr = p2.communicate()
|
||||
stdout = stdout.decode("ascii").strip()
|
||||
|
@ -266,7 +288,7 @@ def get_profile_metrics(path, profile_already_parsed, logger=None):
|
|||
|
||||
data = []
|
||||
for profile in profiling_files:
|
||||
profile = profile.split('\t')[1]
|
||||
profile = profile.split("\t")[1]
|
||||
if profile in profile_already_parsed:
|
||||
continue
|
||||
profile_already_parsed.add(profile)
|
||||
|
|
|
@ -1,137 +1,171 @@
|
|||
import argparse
|
||||
import sys
|
||||
import os
|
||||
import pandas as pd
|
||||
import sys
|
||||
import time
|
||||
|
||||
import pandas as pd
|
||||
from azure.kusto.data import KustoConnectionStringBuilder
|
||||
from azure.kusto.data.data_format import DataFormat
|
||||
from azure.kusto.data.helpers import dataframe_from_result_table
|
||||
from azure.kusto.ingest import (
|
||||
IngestionProperties,
|
||||
ReportLevel,
|
||||
QueuedIngestClient,
|
||||
)
|
||||
from azure.kusto.data.helpers import dataframe_from_result_table
|
||||
from azure.kusto.ingest import IngestionProperties, QueuedIngestClient, ReportLevel
|
||||
from perf_utils import *
|
||||
|
||||
# database connection strings
|
||||
# database connection strings
|
||||
cluster_ingest = "https://ingest-onnxruntimedashboarddb.southcentralus.kusto.windows.net"
|
||||
database = "ep_perf_dashboard"
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"-r", "--report_folder", help="Path to the local file report", required=True)
|
||||
parser.add_argument(
|
||||
"-c", "--commit_hash", help="Commit id", required=True)
|
||||
parser.add_argument(
|
||||
"-u", "--report_url", help="Report Url", required=True)
|
||||
parser.add_argument(
|
||||
"-t", "--trt_version", help="Tensorrt Version", required=True)
|
||||
parser.add_argument(
|
||||
"-b", "--branch", help="Branch", required=True)
|
||||
parser.add_argument(
|
||||
"-d", "--datetime", help="Commit Datetime", required=True)
|
||||
parser.add_argument("-r", "--report_folder", help="Path to the local file report", required=True)
|
||||
parser.add_argument("-c", "--commit_hash", help="Commit id", required=True)
|
||||
parser.add_argument("-u", "--report_url", help="Report Url", required=True)
|
||||
parser.add_argument("-t", "--trt_version", help="Tensorrt Version", required=True)
|
||||
parser.add_argument("-b", "--branch", help="Branch", required=True)
|
||||
parser.add_argument("-d", "--datetime", help="Commit Datetime", required=True)
|
||||
return parser.parse_args()
|
||||
|
||||
def adjust_columns(table, columns, db_columns, model_group):
|
||||
|
||||
def adjust_columns(table, columns, db_columns, model_group):
|
||||
table = table[columns]
|
||||
table = table.set_axis(db_columns, axis=1)
|
||||
table = table.assign(Group=model_group)
|
||||
return table
|
||||
return table
|
||||
|
||||
|
||||
def get_latency_over_time(commit_hash, report_url, branch, latency_table):
|
||||
if not latency_table.empty:
|
||||
over_time = latency_table
|
||||
over_time = over_time.melt(id_vars=[model_title, group_title], var_name='Ep', value_name='Latency')
|
||||
over_time = over_time.melt(id_vars=[model_title, group_title], var_name="Ep", value_name="Latency")
|
||||
over_time = over_time.assign(CommitId=commit_hash)
|
||||
over_time = over_time.assign(ReportUrl=report_url)
|
||||
over_time = over_time.assign(Branch=branch)
|
||||
over_time = over_time[['CommitId', model_title, 'Ep', 'Latency', 'ReportUrl', group_title, 'Branch']]
|
||||
over_time.fillna('', inplace=True)
|
||||
over_time = over_time[
|
||||
[
|
||||
"CommitId",
|
||||
model_title,
|
||||
"Ep",
|
||||
"Latency",
|
||||
"ReportUrl",
|
||||
group_title,
|
||||
"Branch",
|
||||
]
|
||||
]
|
||||
over_time.fillna("", inplace=True)
|
||||
return over_time
|
||||
|
||||
|
||||
|
||||
def get_failures(fail, model_group):
|
||||
fail_columns = fail.keys()
|
||||
fail_db_columns = [model_title, 'Ep', 'ErrorType', 'ErrorMessage']
|
||||
fail_db_columns = [model_title, "Ep", "ErrorType", "ErrorMessage"]
|
||||
fail = adjust_columns(fail, fail_columns, fail_db_columns, model_group)
|
||||
return fail
|
||||
|
||||
def get_memory(memory, model_group):
|
||||
|
||||
def get_memory(memory, model_group):
|
||||
memory_columns = [model_title]
|
||||
for provider in provider_list:
|
||||
for provider in provider_list:
|
||||
if cpu not in provider:
|
||||
memory_columns.append(provider + memory_ending)
|
||||
memory_db_columns = [model_title, cuda, trt, standalone_trt, cuda_fp16, trt_fp16, standalone_trt_fp16]
|
||||
memory_db_columns = [
|
||||
model_title,
|
||||
cuda,
|
||||
trt,
|
||||
standalone_trt,
|
||||
cuda_fp16,
|
||||
trt_fp16,
|
||||
standalone_trt_fp16,
|
||||
]
|
||||
memory = adjust_columns(memory, memory_columns, memory_db_columns, model_group)
|
||||
return memory
|
||||
|
||||
|
||||
def get_latency(latency, model_group):
|
||||
latency_columns = [model_title]
|
||||
for provider in provider_list:
|
||||
for provider in provider_list:
|
||||
latency_columns.append(provider + avg_ending)
|
||||
latency_db_columns = table_headers
|
||||
latency = adjust_columns(latency, latency_columns, latency_db_columns, model_group)
|
||||
return latency
|
||||
|
||||
|
||||
|
||||
def get_status(status, model_group):
|
||||
status_columns = status.keys()
|
||||
status_db_columns = table_headers
|
||||
status = adjust_columns(status, status_columns, status_db_columns, model_group)
|
||||
return status
|
||||
|
||||
|
||||
def get_specs(specs, branch, commit_id, date_time):
|
||||
init_id = int(specs.tail(1).get('.', 0)) + 1
|
||||
specs_additional = pd.DataFrame({'.': [init_id, init_id + 1, init_id + 2],
|
||||
'Spec': ['Branch', 'CommitId', 'CommitTime'],
|
||||
'Version': [branch, commit_id, date_time]})
|
||||
init_id = int(specs.tail(1).get(".", 0)) + 1
|
||||
specs_additional = pd.DataFrame(
|
||||
{
|
||||
".": [init_id, init_id + 1, init_id + 2],
|
||||
"Spec": ["Branch", "CommitId", "CommitTime"],
|
||||
"Version": [branch, commit_id, date_time],
|
||||
}
|
||||
)
|
||||
|
||||
return pd.concat([specs, specs_additional], ignore_index=True)
|
||||
|
||||
|
||||
def get_session(session, model_group):
|
||||
session_columns = session.keys()
|
||||
session_db_columns = [model_title] + ort_provider_list + [p + second for p in ort_provider_list]
|
||||
session = adjust_columns(session, session_columns, session_db_columns, model_group)
|
||||
return session
|
||||
|
||||
|
||||
def write_table(ingest_client, table, table_name, commit_time, identifier):
|
||||
if table.empty:
|
||||
return
|
||||
table = table.assign(UploadTime=commit_time) # add Commit DateTime
|
||||
table = table.assign(Identifier=identifier) # add Identifier
|
||||
table = table.assign(UploadTime=commit_time) # add Commit DateTime
|
||||
table = table.assign(Identifier=identifier) # add Identifier
|
||||
ingestion_props = IngestionProperties(
|
||||
database=database,
|
||||
table=table_name,
|
||||
data_format=DataFormat.CSV,
|
||||
report_level=ReportLevel.FailuresAndSuccesses
|
||||
database=database,
|
||||
table=table_name,
|
||||
data_format=DataFormat.CSV,
|
||||
report_level=ReportLevel.FailuresAndSuccesses,
|
||||
)
|
||||
# append rows
|
||||
ingest_client.ingest_from_dataframe(table, ingestion_properties=ingestion_props)
|
||||
|
||||
def get_time():
|
||||
|
||||
def get_time():
|
||||
date_time = time.strftime(time_string_format)
|
||||
return date_time
|
||||
|
||||
|
||||
def get_identifier(date_time, commit_id, trt_version, branch):
|
||||
date = date_time.split('T')[0] # extract date only
|
||||
return date + '_' + commit_id + '_' + trt_version + '_' + branch
|
||||
date = date_time.split("T")[0] # extract date only
|
||||
return date + "_" + commit_id + "_" + trt_version + "_" + branch
|
||||
|
||||
|
||||
def main():
|
||||
|
||||
|
||||
args = parse_arguments()
|
||||
|
||||
|
||||
# connect to database
|
||||
kcsb_ingest = KustoConnectionStringBuilder.with_az_cli_authentication(cluster_ingest)
|
||||
ingest_client = QueuedIngestClient(kcsb_ingest)
|
||||
date_time = args.datetime
|
||||
identifier = get_identifier(date_time, args.commit_hash, args.trt_version, args.branch)
|
||||
|
||||
|
||||
try:
|
||||
result_file = args.report_folder
|
||||
|
||||
folders = os.listdir(result_file)
|
||||
os.chdir(result_file)
|
||||
|
||||
tables = [fail_name, memory_name, latency_name, status_name, latency_over_time_name, specs_name, session_name]
|
||||
tables = [
|
||||
fail_name,
|
||||
memory_name,
|
||||
latency_name,
|
||||
status_name,
|
||||
latency_over_time_name,
|
||||
specs_name,
|
||||
session_name,
|
||||
]
|
||||
table_results = {}
|
||||
for table_name in tables:
|
||||
table_results[table_name] = pd.DataFrame()
|
||||
|
@ -142,26 +176,54 @@ def main():
|
|||
for csv in csv_filenames:
|
||||
table = pd.read_csv(csv)
|
||||
if session_name in csv:
|
||||
table_results[session_name] = table_results[session_name].append(get_session(table, model_group), ignore_index=True)
|
||||
table_results[session_name] = table_results[session_name].append(
|
||||
get_session(table, model_group), ignore_index=True
|
||||
)
|
||||
elif specs_name in csv:
|
||||
table_results[specs_name] = table_results[specs_name].append(get_specs(table, args.branch, args.commit_hash, date_time), ignore_index=True)
|
||||
table_results[specs_name] = table_results[specs_name].append(
|
||||
get_specs(table, args.branch, args.commit_hash, date_time),
|
||||
ignore_index=True,
|
||||
)
|
||||
elif fail_name in csv:
|
||||
table_results[fail_name] = table_results[fail_name].append(get_failures(table, model_group), ignore_index=True)
|
||||
table_results[fail_name] = table_results[fail_name].append(
|
||||
get_failures(table, model_group), ignore_index=True
|
||||
)
|
||||
elif latency_name in csv:
|
||||
table_results[memory_name] = table_results[memory_name].append(get_memory(table, model_group), ignore_index=True)
|
||||
table_results[latency_name] = table_results[latency_name].append(get_latency(table, model_group), ignore_index=True)
|
||||
table_results[latency_over_time_name] = table_results[latency_over_time_name].append(get_latency_over_time(args.commit_hash, args.report_url, args.branch, table_results[latency_name]), ignore_index=True)
|
||||
table_results[memory_name] = table_results[memory_name].append(
|
||||
get_memory(table, model_group), ignore_index=True
|
||||
)
|
||||
table_results[latency_name] = table_results[latency_name].append(
|
||||
get_latency(table, model_group), ignore_index=True
|
||||
)
|
||||
table_results[latency_over_time_name] = table_results[latency_over_time_name].append(
|
||||
get_latency_over_time(
|
||||
args.commit_hash,
|
||||
args.report_url,
|
||||
args.branch,
|
||||
table_results[latency_name],
|
||||
),
|
||||
ignore_index=True,
|
||||
)
|
||||
elif status_name in csv:
|
||||
table_results[status_name] = table_results[status_name].append(get_status(table, model_group), ignore_index=True)
|
||||
table_results[status_name] = table_results[status_name].append(
|
||||
get_status(table, model_group), ignore_index=True
|
||||
)
|
||||
os.chdir(result_file)
|
||||
for table in tables:
|
||||
print('writing ' + table + ' to database')
|
||||
db_table_name = 'ep_model_' + table
|
||||
write_table(ingest_client, table_results[table], db_table_name, date_time, identifier)
|
||||
for table in tables:
|
||||
print("writing " + table + " to database")
|
||||
db_table_name = "ep_model_" + table
|
||||
write_table(
|
||||
ingest_client,
|
||||
table_results[table],
|
||||
db_table_name,
|
||||
date_time,
|
||||
identifier,
|
||||
)
|
||||
|
||||
except BaseException as e:
|
||||
except BaseException as e:
|
||||
print(str(e))
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
@ -1,17 +1,21 @@
|
|||
import json
|
||||
import os
|
||||
import wget
|
||||
import tarfile
|
||||
import json
|
||||
|
||||
import wget
|
||||
|
||||
|
||||
def get_tar_file(link):
|
||||
file_name = link.split("/")[-1]
|
||||
return file_name
|
||||
|
||||
|
||||
def create_model_folder(model):
|
||||
os.mkdir(model)
|
||||
|
||||
|
||||
def extract_and_get_files(file_name):
|
||||
model_folder = file_name.replace(".tar.gz", "") + '/'
|
||||
model_folder = file_name.replace(".tar.gz", "") + "/"
|
||||
create_model_folder(model_folder)
|
||||
model_tar = tarfile.open(file_name)
|
||||
model_tar.extractall(model_folder)
|
||||
|
@ -20,21 +24,25 @@ def extract_and_get_files(file_name):
|
|||
model_tar.close()
|
||||
return model_folder, file_list
|
||||
|
||||
|
||||
def download_model(link):
|
||||
file_name = get_tar_file(link)
|
||||
wget.download(link)
|
||||
model_folder, file_list = extract_and_get_files(file_name)
|
||||
return model_folder, file_list
|
||||
|
||||
|
||||
def get_model_path(file_list):
|
||||
for file_name in file_list:
|
||||
if ".onnx" in file_name:
|
||||
return file_name
|
||||
|
||||
def get_test_path(model_path):
|
||||
model_filename = os.path.basename(model_path)
|
||||
|
||||
def get_test_path(model_path):
|
||||
model_filename = os.path.basename(model_path)
|
||||
test_path = model_path.split(model_filename)[0]
|
||||
return test_path
|
||||
return test_path
|
||||
|
||||
|
||||
def create_model_object(model, folder, model_file_path, test_path):
|
||||
model_dict = {}
|
||||
|
@ -44,6 +52,7 @@ def create_model_object(model, folder, model_file_path, test_path):
|
|||
model_dict["test_data_path"] = "./" + test_path
|
||||
return model_dict
|
||||
|
||||
|
||||
def get_model_info(link):
|
||||
model_folder, file_list = download_model(link)
|
||||
model = model_folder[:-1]
|
||||
|
@ -52,20 +61,23 @@ def get_model_info(link):
|
|||
model_info = create_model_object(model, model_folder, model_file_path, test_path)
|
||||
return model_info
|
||||
|
||||
def write_json(models):
|
||||
model_json = json.dumps(models, indent=4)
|
||||
with open('model_list.json', 'w') as fp:
|
||||
|
||||
def write_json(models):
|
||||
model_json = json.dumps(models, indent=4)
|
||||
with open("model_list.json", "w") as fp:
|
||||
fp.write(model_json)
|
||||
|
||||
|
||||
def main():
|
||||
links = []
|
||||
with open('links.txt', 'r') as fh:
|
||||
with open("links.txt", "r") as fh:
|
||||
links = [link.rstrip() for link in fh.readlines()]
|
||||
|
||||
|
||||
model_list = []
|
||||
for link in links:
|
||||
model_list.append(get_model_info(link))
|
||||
write_json(model_list)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
@ -1,13 +1,14 @@
|
|||
#-------------------------------------------------------------------------
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
#--------------------------------------------------------------------------
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
import sys
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), 'models', 'gpt2'))
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "models", "gpt2"))
|
||||
|
||||
import convert_to_onnx
|
||||
|
||||
# added for backward compatibility
|
||||
import gpt2_helper
|
||||
import convert_to_onnx
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
#-------------------------------------------------------------------------
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
#--------------------------------------------------------------------------
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
# Get/Set cpu affinity. Currently only supported on some Unix systems
|
||||
import logging
|
||||
|
@ -10,11 +10,11 @@ import os
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AffinitySetting():
|
||||
class AffinitySetting:
|
||||
def __init__(self):
|
||||
self.pid = os.getpid()
|
||||
self.affinity = None
|
||||
self.is_os_supported = hasattr(os, 'sched_getaffinity') and hasattr(os, 'sched_setaffinity')
|
||||
self.is_os_supported = hasattr(os, "sched_getaffinity") and hasattr(os, "sched_setaffinity")
|
||||
if not self.is_os_supported:
|
||||
logger.warning("Current OS does not support os.get_affinity() and os.set_affinity()")
|
||||
|
||||
|
@ -25,12 +25,16 @@ class AffinitySetting():
|
|||
def set_affinity(self):
|
||||
if self.is_os_supported:
|
||||
current_affinity = os.sched_getaffinity(self.pid)
|
||||
if (self.affinity != current_affinity):
|
||||
logger.warning("Replacing affinity setting %s with %s", str(current_affinity), str(self.affinity))
|
||||
if self.affinity != current_affinity:
|
||||
logger.warning(
|
||||
"Replacing affinity setting %s with %s",
|
||||
str(current_affinity),
|
||||
str(self.affinity),
|
||||
)
|
||||
os.sched_setaffinity(self.pid, self.affinity)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
affi_helper = AffinitySetting()
|
||||
affi_helper.get_affinity()
|
||||
affi_helper.set_affinity()
|
||||
|
|
|
@ -42,24 +42,40 @@
|
|||
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import timeit
|
||||
from datetime import datetime
|
||||
import numpy
|
||||
|
||||
import os
|
||||
import psutil
|
||||
import onnx
|
||||
from enum import Enum
|
||||
from benchmark_helper import (OptimizerInfo, create_onnxruntime_session, Precision, setup_logger, get_latency_result,
|
||||
output_details, output_summary, output_fusion_statistics, inference_ort,
|
||||
inference_ort_with_io_binding, allocateOutputBuffers, ConfigModifier)
|
||||
|
||||
import numpy
|
||||
import onnx
|
||||
import psutil
|
||||
from benchmark_helper import (
|
||||
ConfigModifier,
|
||||
OptimizerInfo,
|
||||
Precision,
|
||||
allocateOutputBuffers,
|
||||
create_onnxruntime_session,
|
||||
get_latency_result,
|
||||
inference_ort,
|
||||
inference_ort_with_io_binding,
|
||||
output_details,
|
||||
output_fusion_statistics,
|
||||
output_summary,
|
||||
setup_logger,
|
||||
)
|
||||
from fusion_options import FusionOptions
|
||||
from onnx_exporter import (
|
||||
create_onnxruntime_input,
|
||||
export_onnx_model_from_pt,
|
||||
export_onnx_model_from_tf,
|
||||
load_pretrained_model,
|
||||
)
|
||||
from quantize_helper import QuantizeHelper
|
||||
from onnx_exporter import create_onnxruntime_input, load_pretrained_model, export_onnx_model_from_pt, export_onnx_model_from_tf
|
||||
|
||||
logger = logging.getLogger('')
|
||||
logger = logging.getLogger("")
|
||||
|
||||
from huggingface_models import MODELS, MODEL_CLASSES
|
||||
from huggingface_models import MODEL_CLASSES, MODELS
|
||||
|
||||
cpu_count = psutil.cpu_count(logical=False)
|
||||
|
||||
|
@ -68,35 +84,60 @@ if "OMP_NUM_THREADS" not in os.environ:
|
|||
os.environ["OMP_NUM_THREADS"] = str(cpu_count)
|
||||
|
||||
import torch
|
||||
from transformers import (AutoConfig, AutoTokenizer, AutoModel, GPT2Model, LxmertConfig)
|
||||
from transformers import AutoConfig, AutoModel, AutoTokenizer, GPT2Model, LxmertConfig
|
||||
|
||||
|
||||
def run_onnxruntime(use_gpu, provider, model_names, model_class, config_modifier, precision, num_threads, batch_sizes,
|
||||
sequence_lengths, repeat_times, input_counts, optimizer_info, validate_onnx, cache_dir, onnx_dir,
|
||||
verbose, overwrite, disable_ort_io_binding, use_raw_attention_mask, model_fusion_statistics,
|
||||
model_source, args):
|
||||
def run_onnxruntime(
|
||||
use_gpu,
|
||||
provider,
|
||||
model_names,
|
||||
model_class,
|
||||
config_modifier,
|
||||
precision,
|
||||
num_threads,
|
||||
batch_sizes,
|
||||
sequence_lengths,
|
||||
repeat_times,
|
||||
input_counts,
|
||||
optimizer_info,
|
||||
validate_onnx,
|
||||
cache_dir,
|
||||
onnx_dir,
|
||||
verbose,
|
||||
overwrite,
|
||||
disable_ort_io_binding,
|
||||
use_raw_attention_mask,
|
||||
model_fusion_statistics,
|
||||
model_source,
|
||||
args,
|
||||
):
|
||||
import onnxruntime
|
||||
|
||||
results = []
|
||||
if (use_gpu and ('CUDAExecutionProvider' not in onnxruntime.get_available_providers())
|
||||
and ('ROCMExecutionProvider' not in onnxruntime.get_available_providers())):
|
||||
if (
|
||||
use_gpu
|
||||
and ("CUDAExecutionProvider" not in onnxruntime.get_available_providers())
|
||||
and ("ROCMExecutionProvider" not in onnxruntime.get_available_providers())
|
||||
):
|
||||
logger.error(
|
||||
"Please install onnxruntime-gpu package instead of onnxruntime, and use a machine with GPU for testing gpu performance."
|
||||
)
|
||||
return results
|
||||
|
||||
warm_up_repeat = 0
|
||||
if provider == 'tensorrt':
|
||||
if provider == "tensorrt":
|
||||
optimizer_info = OptimizerInfo.NOOPT
|
||||
warm_up_repeat = 5
|
||||
if 'TensorrtExecutionProvider' not in onnxruntime.get_available_providers():
|
||||
if "TensorrtExecutionProvider" not in onnxruntime.get_available_providers():
|
||||
logger.error(
|
||||
"Please install onnxruntime-gpu-tensorrt package, and use a machine with GPU for testing gpu performance."
|
||||
)
|
||||
return results
|
||||
|
||||
if optimizer_info == OptimizerInfo.NOOPT:
|
||||
logger.warning(f"OptimizerInfo is set to {optimizer_info}, graph optimizations specified in FusionOptions are not applied.")
|
||||
logger.warning(
|
||||
f"OptimizerInfo is set to {optimizer_info}, graph optimizations specified in FusionOptions are not applied."
|
||||
)
|
||||
|
||||
for model_name in model_names:
|
||||
all_input_names = MODELS[model_name][0]
|
||||
|
@ -108,27 +149,64 @@ def run_onnxruntime(use_gpu, provider, model_names, model_class, config_modifier
|
|||
args.model_type = MODELS[model_name][3]
|
||||
fusion_options = FusionOptions.parse(args)
|
||||
|
||||
if 'pt' in model_source:
|
||||
if "pt" in model_source:
|
||||
with torch.no_grad():
|
||||
onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length = export_onnx_model_from_pt(
|
||||
model_name, MODELS[model_name][1], MODELS[model_name][2], MODELS[model_name][3], model_class,
|
||||
config_modifier, cache_dir, onnx_dir, input_names, use_gpu, precision, optimizer_info,
|
||||
validate_onnx, use_raw_attention_mask, overwrite, model_fusion_statistics, fusion_options)
|
||||
if 'tf' in model_source:
|
||||
onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length = export_onnx_model_from_tf(
|
||||
model_name, MODELS[model_name][1], MODELS[model_name][2], MODELS[model_name][3], model_class,
|
||||
config_modifier, cache_dir, onnx_dir, input_names, use_gpu, precision, optimizer_info,
|
||||
validate_onnx, use_raw_attention_mask, overwrite, model_fusion_statistics, fusion_options)
|
||||
(
|
||||
onnx_model_file,
|
||||
is_valid_onnx_model,
|
||||
vocab_size,
|
||||
max_sequence_length,
|
||||
) = export_onnx_model_from_pt(
|
||||
model_name,
|
||||
MODELS[model_name][1],
|
||||
MODELS[model_name][2],
|
||||
MODELS[model_name][3],
|
||||
model_class,
|
||||
config_modifier,
|
||||
cache_dir,
|
||||
onnx_dir,
|
||||
input_names,
|
||||
use_gpu,
|
||||
precision,
|
||||
optimizer_info,
|
||||
validate_onnx,
|
||||
use_raw_attention_mask,
|
||||
overwrite,
|
||||
model_fusion_statistics,
|
||||
fusion_options,
|
||||
)
|
||||
if "tf" in model_source:
|
||||
(onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length,) = export_onnx_model_from_tf(
|
||||
model_name,
|
||||
MODELS[model_name][1],
|
||||
MODELS[model_name][2],
|
||||
MODELS[model_name][3],
|
||||
model_class,
|
||||
config_modifier,
|
||||
cache_dir,
|
||||
onnx_dir,
|
||||
input_names,
|
||||
use_gpu,
|
||||
precision,
|
||||
optimizer_info,
|
||||
validate_onnx,
|
||||
use_raw_attention_mask,
|
||||
overwrite,
|
||||
model_fusion_statistics,
|
||||
fusion_options,
|
||||
)
|
||||
|
||||
if not is_valid_onnx_model:
|
||||
continue
|
||||
|
||||
ort_session = create_onnxruntime_session(onnx_model_file,
|
||||
use_gpu,
|
||||
provider,
|
||||
enable_all_optimization=True,
|
||||
num_threads=num_threads,
|
||||
verbose=verbose)
|
||||
ort_session = create_onnxruntime_session(
|
||||
onnx_model_file,
|
||||
use_gpu,
|
||||
provider,
|
||||
enable_all_optimization=True,
|
||||
num_threads=num_threads,
|
||||
verbose=verbose,
|
||||
)
|
||||
if ort_session is None:
|
||||
continue
|
||||
|
||||
|
@ -137,8 +215,12 @@ def run_onnxruntime(use_gpu, provider, model_names, model_class, config_modifier
|
|||
device = "cuda" if use_gpu else "cpu"
|
||||
config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir)
|
||||
max_last_state_size = numpy.prod(
|
||||
[max(batch_sizes), max(sequence_lengths),
|
||||
max(vocab_size, config.hidden_size)])
|
||||
[
|
||||
max(batch_sizes),
|
||||
max(sequence_lengths),
|
||||
max(vocab_size, config.hidden_size),
|
||||
]
|
||||
)
|
||||
max_pooler_size = numpy.prod([max(batch_sizes), config.hidden_size])
|
||||
for batch_size in batch_sizes:
|
||||
if batch_size <= 0:
|
||||
|
@ -147,9 +229,15 @@ def run_onnxruntime(use_gpu, provider, model_names, model_class, config_modifier
|
|||
if max_sequence_length is not None and sequence_length > max_sequence_length:
|
||||
continue
|
||||
|
||||
input_value_type = numpy.int64 if 'pt' in model_source else numpy.int32
|
||||
ort_inputs = create_onnxruntime_input(vocab_size, batch_size, sequence_length, input_names, config,
|
||||
input_value_type)
|
||||
input_value_type = numpy.int64 if "pt" in model_source else numpy.int32
|
||||
ort_inputs = create_onnxruntime_input(
|
||||
vocab_size,
|
||||
batch_size,
|
||||
sequence_length,
|
||||
input_names,
|
||||
config,
|
||||
input_value_type,
|
||||
)
|
||||
result_template = {
|
||||
"engine": "onnxruntime",
|
||||
"version": onnxruntime.__version__,
|
||||
|
@ -167,12 +255,19 @@ def run_onnxruntime(use_gpu, provider, model_names, model_class, config_modifier
|
|||
"datetime": str(datetime.now()),
|
||||
}
|
||||
|
||||
logger.info("Run onnxruntime on {} with input shape {}".format(model_name,
|
||||
[batch_size, sequence_length]))
|
||||
logger.info(
|
||||
"Run onnxruntime on {} with input shape {}".format(model_name, [batch_size, sequence_length])
|
||||
)
|
||||
|
||||
if disable_ort_io_binding:
|
||||
result = inference_ort(ort_session, ort_inputs, result_template, repeat_times, batch_size,
|
||||
warm_up_repeat)
|
||||
result = inference_ort(
|
||||
ort_session,
|
||||
ort_inputs,
|
||||
result_template,
|
||||
repeat_times,
|
||||
batch_size,
|
||||
warm_up_repeat,
|
||||
)
|
||||
else:
|
||||
# Get output sizes from a dummy ort run
|
||||
ort_outputs = ort_session.run(ort_output_names, ort_inputs)
|
||||
|
@ -184,19 +279,41 @@ def run_onnxruntime(use_gpu, provider, model_names, model_class, config_modifier
|
|||
else:
|
||||
output_buffer_max_sizes.append(max_last_state_size)
|
||||
|
||||
data_type = numpy.longlong if 'pt' in model_source else numpy.intc
|
||||
result = inference_ort_with_io_binding(ort_session, ort_inputs, result_template, repeat_times,
|
||||
ort_output_names, ort_outputs, output_buffers,
|
||||
output_buffer_max_sizes, batch_size, device, data_type,
|
||||
warm_up_repeat)
|
||||
data_type = numpy.longlong if "pt" in model_source else numpy.intc
|
||||
result = inference_ort_with_io_binding(
|
||||
ort_session,
|
||||
ort_inputs,
|
||||
result_template,
|
||||
repeat_times,
|
||||
ort_output_names,
|
||||
ort_outputs,
|
||||
output_buffers,
|
||||
output_buffer_max_sizes,
|
||||
batch_size,
|
||||
device,
|
||||
data_type,
|
||||
warm_up_repeat,
|
||||
)
|
||||
logger.info(result)
|
||||
results.append(result)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def run_pytorch(use_gpu, model_names, model_class, config_modifier, precision, num_threads, batch_sizes,
|
||||
sequence_lengths, repeat_times, torchscript, cache_dir, verbose):
|
||||
def run_pytorch(
|
||||
use_gpu,
|
||||
model_names,
|
||||
model_class,
|
||||
config_modifier,
|
||||
precision,
|
||||
num_threads,
|
||||
batch_sizes,
|
||||
sequence_lengths,
|
||||
repeat_times,
|
||||
torchscript,
|
||||
cache_dir,
|
||||
verbose,
|
||||
):
|
||||
results = []
|
||||
if use_gpu and not torch.cuda.is_available():
|
||||
logger.error("Please install PyTorch with Cuda, and use a machine with GPU for testing gpu performance.")
|
||||
|
@ -207,11 +324,17 @@ def run_pytorch(use_gpu, model_names, model_class, config_modifier, precision, n
|
|||
for model_name in model_names:
|
||||
config = AutoConfig.from_pretrained(model_name, torchscript=torchscript, cache_dir=cache_dir)
|
||||
config_modifier.modify(config)
|
||||
model = load_pretrained_model(model_name, config=config, cache_dir=cache_dir, custom_model_class=model_class)
|
||||
model = load_pretrained_model(
|
||||
model_name,
|
||||
config=config,
|
||||
cache_dir=cache_dir,
|
||||
custom_model_class=model_class,
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
|
||||
|
||||
max_input_size = tokenizer.max_model_input_sizes[
|
||||
model_name] if model_name in tokenizer.max_model_input_sizes else 1024
|
||||
max_input_size = (
|
||||
tokenizer.max_model_input_sizes[model_name] if model_name in tokenizer.max_model_input_sizes else 1024
|
||||
)
|
||||
|
||||
logger.debug(f"Model {model}")
|
||||
logger.debug(f"Number of parameters {model.num_parameters()}")
|
||||
|
@ -234,11 +357,13 @@ def run_pytorch(use_gpu, model_names, model_class, config_modifier, precision, n
|
|||
continue
|
||||
|
||||
logger.info("Run PyTorch on {} with input shape {}".format(model_name, [batch_size, sequence_length]))
|
||||
input_ids = torch.randint(low=0,
|
||||
high=config.vocab_size - 1,
|
||||
size=(batch_size, sequence_length),
|
||||
dtype=torch.long,
|
||||
device=device)
|
||||
input_ids = torch.randint(
|
||||
low=0,
|
||||
high=config.vocab_size - 1,
|
||||
size=(batch_size, sequence_length),
|
||||
dtype=torch.long,
|
||||
device=device,
|
||||
)
|
||||
try:
|
||||
inference = torch.jit.trace(model, input_ids) if torchscript else model
|
||||
inference(input_ids)
|
||||
|
@ -272,9 +397,10 @@ def run_pytorch(use_gpu, model_names, model_class, config_modifier, precision, n
|
|||
|
||||
|
||||
def run_with_tf_optimizations(do_eager_mode: bool, use_xla: bool):
|
||||
import tensorflow as tf
|
||||
from functools import wraps
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
def run_func(func):
|
||||
@wraps(func)
|
||||
def run_in_eager_mode(*args, **kwargs):
|
||||
|
@ -296,26 +422,38 @@ def run_with_tf_optimizations(do_eager_mode: bool, use_xla: bool):
|
|||
return run_func
|
||||
|
||||
|
||||
def run_tensorflow(use_gpu, model_names, model_class, config_modifier, precision, num_threads, batch_sizes,
|
||||
sequence_lengths, repeat_times, cache_dir, verbose):
|
||||
def run_tensorflow(
|
||||
use_gpu,
|
||||
model_names,
|
||||
model_class,
|
||||
config_modifier,
|
||||
precision,
|
||||
num_threads,
|
||||
batch_sizes,
|
||||
sequence_lengths,
|
||||
repeat_times,
|
||||
cache_dir,
|
||||
verbose,
|
||||
):
|
||||
results = []
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
tf.config.threading.set_intra_op_parallelism_threads(num_threads)
|
||||
|
||||
if not use_gpu:
|
||||
tf.config.set_visible_devices([], 'GPU')
|
||||
tf.config.set_visible_devices([], "GPU")
|
||||
|
||||
if use_gpu and not tf.test.is_built_with_cuda():
|
||||
logger.error("Please install Tensorflow-gpu, and use a machine with GPU for testing gpu performance.")
|
||||
return results
|
||||
|
||||
if use_gpu: # Restrict TensorFlow to only use the first GPU
|
||||
physical_devices = tf.config.list_physical_devices('GPU')
|
||||
physical_devices = tf.config.list_physical_devices("GPU")
|
||||
try:
|
||||
tf.config.set_visible_devices(physical_devices[0], 'GPU')
|
||||
tf.config.set_visible_devices(physical_devices[0], "GPU")
|
||||
tf.config.experimental.set_memory_growth(physical_devices[0], True)
|
||||
tf.distribute.OneDeviceStrategy(device='/gpu:0')
|
||||
tf.distribute.OneDeviceStrategy(device="/gpu:0")
|
||||
except RuntimeError as e:
|
||||
logger.exception(e)
|
||||
|
||||
|
@ -326,16 +464,19 @@ def run_tensorflow(use_gpu, model_names, model_class, config_modifier, precision
|
|||
config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir)
|
||||
config_modifier.modify(config)
|
||||
|
||||
model = load_pretrained_model(model_name,
|
||||
config=config,
|
||||
cache_dir=cache_dir,
|
||||
custom_model_class=model_class,
|
||||
is_tf_model=True)
|
||||
model = load_pretrained_model(
|
||||
model_name,
|
||||
config=config,
|
||||
cache_dir=cache_dir,
|
||||
custom_model_class=model_class,
|
||||
is_tf_model=True,
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
|
||||
|
||||
max_input_size = tokenizer.max_model_input_sizes[
|
||||
model_name] if model_name in tokenizer.max_model_input_sizes else 1024
|
||||
max_input_size = (
|
||||
tokenizer.max_model_input_sizes[model_name] if model_name in tokenizer.max_model_input_sizes else 1024
|
||||
)
|
||||
|
||||
for batch_size in batch_sizes:
|
||||
if batch_size <= 0:
|
||||
|
@ -345,10 +486,12 @@ def run_tensorflow(use_gpu, model_names, model_class, config_modifier, precision
|
|||
if max_input_size is not None and sequence_length > max_input_size:
|
||||
continue
|
||||
|
||||
logger.info("Run Tensorflow on {} with input shape {}".format(model_name,
|
||||
[batch_size, sequence_length]))
|
||||
logger.info(
|
||||
"Run Tensorflow on {} with input shape {}".format(model_name, [batch_size, sequence_length])
|
||||
)
|
||||
|
||||
import random
|
||||
|
||||
rng = random.Random()
|
||||
values = [rng.randint(0, config.vocab_size - 1) for i in range(batch_size * sequence_length)]
|
||||
input_ids = tf.constant(values, shape=(batch_size, sequence_length), dtype=tf.int32)
|
||||
|
@ -367,7 +510,12 @@ def run_tensorflow(use_gpu, model_names, model_class, config_modifier, precision
|
|||
def lxmert_forward():
|
||||
feats = tf.random.normal([1, 1, config.visual_feat_dim])
|
||||
pos = tf.random.normal([1, 1, config.visual_pos_dim])
|
||||
return model(input_ids, visual_feats=feats, visual_pos=pos, training=False)
|
||||
return model(
|
||||
input_ids,
|
||||
visual_feats=feats,
|
||||
visual_pos=pos,
|
||||
training=False,
|
||||
)
|
||||
|
||||
inference = encoder_forward
|
||||
if config.is_encoder_decoder:
|
||||
|
@ -401,6 +549,7 @@ def run_tensorflow(use_gpu, model_names, model_class, config_modifier, precision
|
|||
except RuntimeError as e:
|
||||
logger.exception(e)
|
||||
from numba import cuda
|
||||
|
||||
device = cuda.get_current_device()
|
||||
device.reset()
|
||||
|
||||
|
@ -410,55 +559,73 @@ def run_tensorflow(use_gpu, model_names, model_class, config_modifier, precision
|
|||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument("-m",
|
||||
"--models",
|
||||
required=False,
|
||||
nargs="+",
|
||||
type=str,
|
||||
default=["bert-base-cased", "roberta-base", "gpt2"],
|
||||
choices=list(MODELS.keys()),
|
||||
help="Pre-trained models in the list: " + ", ".join(MODELS.keys()))
|
||||
parser.add_argument(
|
||||
"-m",
|
||||
"--models",
|
||||
required=False,
|
||||
nargs="+",
|
||||
type=str,
|
||||
default=["bert-base-cased", "roberta-base", "gpt2"],
|
||||
choices=list(MODELS.keys()),
|
||||
help="Pre-trained models in the list: " + ", ".join(MODELS.keys()),
|
||||
)
|
||||
|
||||
parser.add_argument("--model_source",
|
||||
required=False,
|
||||
nargs=1,
|
||||
type=str,
|
||||
default='pt',
|
||||
choices=['pt', 'tf'],
|
||||
help="Export onnx from pt or tf")
|
||||
parser.add_argument(
|
||||
"--model_source",
|
||||
required=False,
|
||||
nargs=1,
|
||||
type=str,
|
||||
default="pt",
|
||||
choices=["pt", "tf"],
|
||||
help="Export onnx from pt or tf",
|
||||
)
|
||||
|
||||
parser.add_argument('--model_class',
|
||||
required=False,
|
||||
type=str,
|
||||
default=None,
|
||||
choices=list(MODEL_CLASSES),
|
||||
help='Model type selected in the list: ' + ', '.join(MODEL_CLASSES))
|
||||
parser.add_argument(
|
||||
"--model_class",
|
||||
required=False,
|
||||
type=str,
|
||||
default=None,
|
||||
choices=list(MODEL_CLASSES),
|
||||
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES),
|
||||
)
|
||||
|
||||
parser.add_argument("-e",
|
||||
"--engines",
|
||||
required=False,
|
||||
nargs="+",
|
||||
type=str,
|
||||
default=['onnxruntime'],
|
||||
choices=['onnxruntime', 'torch', 'torchscript', 'tensorflow'],
|
||||
help="Engines to benchmark")
|
||||
parser.add_argument(
|
||||
"-e",
|
||||
"--engines",
|
||||
required=False,
|
||||
nargs="+",
|
||||
type=str,
|
||||
default=["onnxruntime"],
|
||||
choices=["onnxruntime", "torch", "torchscript", "tensorflow"],
|
||||
help="Engines to benchmark",
|
||||
)
|
||||
|
||||
parser.add_argument("-c",
|
||||
"--cache_dir",
|
||||
required=False,
|
||||
type=str,
|
||||
default=os.path.join('.', 'cache_models'),
|
||||
help="Directory to cache pre-trained models")
|
||||
parser.add_argument(
|
||||
"-c",
|
||||
"--cache_dir",
|
||||
required=False,
|
||||
type=str,
|
||||
default=os.path.join(".", "cache_models"),
|
||||
help="Directory to cache pre-trained models",
|
||||
)
|
||||
|
||||
parser.add_argument("--onnx_dir",
|
||||
required=False,
|
||||
type=str,
|
||||
default=os.path.join('.', 'onnx_models'),
|
||||
help="Directory to store onnx models")
|
||||
parser.add_argument(
|
||||
"--onnx_dir",
|
||||
required=False,
|
||||
type=str,
|
||||
default=os.path.join(".", "onnx_models"),
|
||||
help="Directory to store onnx models",
|
||||
)
|
||||
|
||||
parser.add_argument("-g", "--use_gpu", required=False, action="store_true", help="Run on gpu device")
|
||||
|
||||
parser.add_argument("--provider", required=False, type=str, default=None, help="Execution provider to use")
|
||||
parser.add_argument(
|
||||
"--provider",
|
||||
required=False,
|
||||
type=str,
|
||||
default=None,
|
||||
help="Execution provider to use",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"-p",
|
||||
|
@ -466,11 +633,17 @@ def parse_arguments():
|
|||
type=Precision,
|
||||
default=Precision.FLOAT32,
|
||||
choices=list(Precision),
|
||||
help="Precision of model to run. fp32 for full precision, fp16 for half precision, and int8 for quantization")
|
||||
help="Precision of model to run. fp32 for full precision, fp16 for half precision, and int8 for quantization",
|
||||
)
|
||||
|
||||
parser.add_argument("--verbose", required=False, action="store_true", help="Print more information")
|
||||
|
||||
parser.add_argument("--overwrite", required=False, action="store_true", help="Overwrite existing models")
|
||||
parser.add_argument(
|
||||
"--overwrite",
|
||||
required=False,
|
||||
action="store_true",
|
||||
help="Overwrite existing models",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"-o",
|
||||
|
@ -478,54 +651,96 @@ def parse_arguments():
|
|||
type=OptimizerInfo,
|
||||
default=OptimizerInfo.BYSCRIPT,
|
||||
choices=list(OptimizerInfo),
|
||||
help="Optimizer info: Use optimizer.py to optimize onnx model as default. Can also choose from by_ort and no_opt"
|
||||
help="Optimizer info: Use optimizer.py to optimize onnx model as default. Can also choose from by_ort and no_opt",
|
||||
)
|
||||
|
||||
parser.add_argument("-v", "--validate_onnx", required=False, action="store_true", help="Validate ONNX model")
|
||||
parser.add_argument(
|
||||
"-v",
|
||||
"--validate_onnx",
|
||||
required=False,
|
||||
action="store_true",
|
||||
help="Validate ONNX model",
|
||||
)
|
||||
|
||||
parser.add_argument("-f",
|
||||
"--fusion_csv",
|
||||
required=False,
|
||||
default=None,
|
||||
help="CSV file for saving summary results of graph optimization.")
|
||||
parser.add_argument(
|
||||
"-f",
|
||||
"--fusion_csv",
|
||||
required=False,
|
||||
default=None,
|
||||
help="CSV file for saving summary results of graph optimization.",
|
||||
)
|
||||
|
||||
parser.add_argument("-d", "--detail_csv", required=False, default=None, help="CSV file for saving detail results.")
|
||||
parser.add_argument(
|
||||
"-d",
|
||||
"--detail_csv",
|
||||
required=False,
|
||||
default=None,
|
||||
help="CSV file for saving detail results.",
|
||||
)
|
||||
|
||||
parser.add_argument("-r", "--result_csv", required=False, default=None, help="CSV file for saving summary results.")
|
||||
parser.add_argument(
|
||||
"-r",
|
||||
"--result_csv",
|
||||
required=False,
|
||||
default=None,
|
||||
help="CSV file for saving summary results.",
|
||||
)
|
||||
|
||||
parser.add_argument("-i",
|
||||
"--input_counts",
|
||||
required=False,
|
||||
nargs="+",
|
||||
default=[1],
|
||||
type=int,
|
||||
choices=[1, 2, 3],
|
||||
help="Number of ONNX model inputs. Please use 1 for fair comparison with Torch or TorchScript.")
|
||||
parser.add_argument(
|
||||
"-i",
|
||||
"--input_counts",
|
||||
required=False,
|
||||
nargs="+",
|
||||
default=[1],
|
||||
type=int,
|
||||
choices=[1, 2, 3],
|
||||
help="Number of ONNX model inputs. Please use 1 for fair comparison with Torch or TorchScript.",
|
||||
)
|
||||
|
||||
parser.add_argument("-t",
|
||||
"--test_times",
|
||||
required=False,
|
||||
default=100,
|
||||
type=int,
|
||||
help="Number of repeat times to get average inference latency.")
|
||||
parser.add_argument(
|
||||
"-t",
|
||||
"--test_times",
|
||||
required=False,
|
||||
default=100,
|
||||
type=int,
|
||||
help="Number of repeat times to get average inference latency.",
|
||||
)
|
||||
|
||||
parser.add_argument("-b", "--batch_sizes", nargs="+", type=int, default=[1])
|
||||
|
||||
parser.add_argument("-s", "--sequence_lengths", nargs="+", type=int, default=[4, 8, 16, 32, 64, 128, 256])
|
||||
parser.add_argument(
|
||||
"-s",
|
||||
"--sequence_lengths",
|
||||
nargs="+",
|
||||
type=int,
|
||||
default=[4, 8, 16, 32, 64, 128, 256],
|
||||
)
|
||||
|
||||
parser.add_argument('--disable_ort_io_binding',
|
||||
required=False,
|
||||
action='store_true',
|
||||
help='Disable running ONNX Runtime with binded inputs and outputs. ')
|
||||
parser.add_argument(
|
||||
"--disable_ort_io_binding",
|
||||
required=False,
|
||||
action="store_true",
|
||||
help="Disable running ONNX Runtime with binded inputs and outputs. ",
|
||||
)
|
||||
parser.set_defaults(disable_ort_io_binding=False)
|
||||
|
||||
parser.add_argument("-n", "--num_threads", required=False, nargs="+", type=int, default=[0], help="Threads to use")
|
||||
parser.add_argument(
|
||||
"-n",
|
||||
"--num_threads",
|
||||
required=False,
|
||||
nargs="+",
|
||||
type=int,
|
||||
default=[0],
|
||||
help="Threads to use",
|
||||
)
|
||||
|
||||
parser.add_argument("--force_num_layers",
|
||||
required=False,
|
||||
type=int,
|
||||
default=None,
|
||||
help="Manually set the model's layer number")
|
||||
parser.add_argument(
|
||||
"--force_num_layers",
|
||||
required=False,
|
||||
type=int,
|
||||
default=None,
|
||||
help="Manually set the model's layer number",
|
||||
)
|
||||
|
||||
FusionOptions.add_arguments(parser)
|
||||
|
||||
|
@ -573,30 +788,80 @@ def main():
|
|||
logger.warning("--input_counts is not implemented for torch or torchscript engine.")
|
||||
|
||||
if enable_torchscript:
|
||||
results += run_pytorch(args.use_gpu, args.models, args.model_class, config_modifier, args.precision,
|
||||
num_threads, args.batch_sizes, args.sequence_lengths, args.test_times, True,
|
||||
args.cache_dir, args.verbose)
|
||||
results += run_pytorch(
|
||||
args.use_gpu,
|
||||
args.models,
|
||||
args.model_class,
|
||||
config_modifier,
|
||||
args.precision,
|
||||
num_threads,
|
||||
args.batch_sizes,
|
||||
args.sequence_lengths,
|
||||
args.test_times,
|
||||
True,
|
||||
args.cache_dir,
|
||||
args.verbose,
|
||||
)
|
||||
|
||||
if enable_torch:
|
||||
results += run_pytorch(args.use_gpu, args.models, args.model_class, config_modifier, args.precision,
|
||||
num_threads, args.batch_sizes, args.sequence_lengths, args.test_times, False,
|
||||
args.cache_dir, args.verbose)
|
||||
results += run_pytorch(
|
||||
args.use_gpu,
|
||||
args.models,
|
||||
args.model_class,
|
||||
config_modifier,
|
||||
args.precision,
|
||||
num_threads,
|
||||
args.batch_sizes,
|
||||
args.sequence_lengths,
|
||||
args.test_times,
|
||||
False,
|
||||
args.cache_dir,
|
||||
args.verbose,
|
||||
)
|
||||
|
||||
if enable_tensorflow:
|
||||
results += run_tensorflow(args.use_gpu, args.models, args.model_class, config_modifier, args.precision,
|
||||
num_threads, args.batch_sizes, args.sequence_lengths, args.test_times,
|
||||
args.cache_dir, args.verbose)
|
||||
results += run_tensorflow(
|
||||
args.use_gpu,
|
||||
args.models,
|
||||
args.model_class,
|
||||
config_modifier,
|
||||
args.precision,
|
||||
num_threads,
|
||||
args.batch_sizes,
|
||||
args.sequence_lengths,
|
||||
args.test_times,
|
||||
args.cache_dir,
|
||||
args.verbose,
|
||||
)
|
||||
|
||||
model_fusion_statistics = {}
|
||||
if enable_onnxruntime:
|
||||
try:
|
||||
use_raw_attention_mask = True
|
||||
results += run_onnxruntime(args.use_gpu, args.provider, args.models, args.model_class, config_modifier,
|
||||
args.precision, num_threads, args.batch_sizes, args.sequence_lengths,
|
||||
args.test_times, args.input_counts, args.optimizer_info, args.validate_onnx,
|
||||
args.cache_dir, args.onnx_dir, args.verbose, args.overwrite,
|
||||
args.disable_ort_io_binding, use_raw_attention_mask, model_fusion_statistics,
|
||||
args.model_source, args)
|
||||
results += run_onnxruntime(
|
||||
args.use_gpu,
|
||||
args.provider,
|
||||
args.models,
|
||||
args.model_class,
|
||||
config_modifier,
|
||||
args.precision,
|
||||
num_threads,
|
||||
args.batch_sizes,
|
||||
args.sequence_lengths,
|
||||
args.test_times,
|
||||
args.input_counts,
|
||||
args.optimizer_info,
|
||||
args.validate_onnx,
|
||||
args.cache_dir,
|
||||
args.onnx_dir,
|
||||
args.verbose,
|
||||
args.overwrite,
|
||||
args.disable_ort_io_binding,
|
||||
use_raw_attention_mask,
|
||||
model_fusion_statistics,
|
||||
args.model_source,
|
||||
args,
|
||||
)
|
||||
except:
|
||||
logger.error(f"Exception", exc_info=True)
|
||||
|
||||
|
|
|
@ -4,28 +4,29 @@
|
|||
# license information.
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import csv
|
||||
import numpy
|
||||
import time
|
||||
import timeit
|
||||
from datetime import datetime
|
||||
import argparse
|
||||
import logging
|
||||
import coloredlogs
|
||||
import torch
|
||||
import onnx
|
||||
from enum import Enum
|
||||
|
||||
import coloredlogs
|
||||
import numpy
|
||||
import onnx
|
||||
import torch
|
||||
from packaging import version
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Precision(Enum):
|
||||
FLOAT32 = 'fp32'
|
||||
FLOAT16 = 'fp16'
|
||||
INT8 = 'int8'
|
||||
FLOAT32 = "fp32"
|
||||
FLOAT16 = "fp16"
|
||||
INT8 = "int8"
|
||||
|
||||
def __str__(self):
|
||||
return self.value
|
||||
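A quick sketch of the CLI round-trip this enum enables (using only the members defined above):
assert Precision("fp16") is Precision.FLOAT16  # what argparse's type=Precision does with the raw string
assert str(Precision.FLOAT16) == "fp16"  # __str__ keeps the choices rendered as fp32/fp16/int8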
|
@ -34,28 +35,28 @@ class Precision(Enum):
|
|||
class OptimizerInfo(Enum):
|
||||
# no_opt means using the raw ONNX model, but OnnxRuntime might still apply optimization as long as
|
||||
# graph optimization level is not 0 (disable all).
|
||||
NOOPT = 'no_opt'
|
||||
BYORT = 'by_ort'
|
||||
BYSCRIPT = 'by_script'
|
||||
NOOPT = "no_opt"
|
||||
BYORT = "by_ort"
|
||||
BYSCRIPT = "by_script"
|
||||
|
||||
def __str__(self):
|
||||
return self.value
|
||||
|
||||
|
||||
class ConfigModifier():
|
||||
class ConfigModifier:
|
||||
def __init__(self, num_layers):
|
||||
self.num_layers = num_layers
|
||||
|
||||
def modify(self, config):
|
||||
if self.num_layers is None:
|
||||
return
|
||||
if hasattr(config, 'num_hidden_layers'):
|
||||
if hasattr(config, "num_hidden_layers"):
|
||||
config.num_hidden_layers = self.num_layers
|
||||
logger.info(f"Modifying pytorch model's number of hidden layers to: {self.num_layers}")
|
||||
if hasattr(config, 'encoder_layers'):
|
||||
if hasattr(config, "encoder_layers"):
|
||||
config.encoder_layers = self.num_layers
|
||||
logger.info(f"Modifying pytorch model's number of encoder layers to: {self.num_layers}")
|
||||
if hasattr(config, 'decoder_layers '):
|
||||
if hasattr(config, "decoder_layers "):
|
||||
config.decoder_layers = self.num_layers
|
||||
logger.info(f"Modifying pytorch model's number of decoder layers to: {self.num_layers}")
|
||||
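As a rough usage sketch (assuming config is a Hugging Face-style model config exposing the attributes checked above):
config_modifier = ConfigModifier(num_layers=6)
config_modifier.modify(config)  # trims num_hidden_layers / encoder_layers / decoder_layers to 6; no-op when num_layers is None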
|
||||
|
@ -69,16 +70,20 @@ IO_BINDING_DATA_TYPE_MAP = {
|
|||
}
|
||||
|
||||
|
||||
def create_onnxruntime_session(onnx_model_path,
|
||||
use_gpu,
|
||||
provider=None,
|
||||
enable_all_optimization=True,
|
||||
num_threads=-1,
|
||||
enable_profiling=False,
|
||||
verbose=False):
|
||||
def create_onnxruntime_session(
|
||||
onnx_model_path,
|
||||
use_gpu,
|
||||
provider=None,
|
||||
enable_all_optimization=True,
|
||||
num_threads=-1,
|
||||
enable_profiling=False,
|
||||
verbose=False,
|
||||
):
|
||||
session = None
|
||||
try:
|
||||
from onnxruntime import SessionOptions, InferenceSession, GraphOptimizationLevel, __version__ as onnxruntime_version
|
||||
from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions
|
||||
from onnxruntime import __version__ as onnxruntime_version
|
||||
|
||||
sess_options = SessionOptions()
|
||||
|
||||
if enable_all_optimization:
|
||||
|
@ -100,20 +105,28 @@ def create_onnxruntime_session(onnx_model_path,
|
|||
|
||||
logger.debug(f"Create session for onnx model: {onnx_model_path}")
|
||||
if use_gpu:
|
||||
if provider == 'dml':
|
||||
execution_providers = ['DmlExecutionProvider', 'CPUExecutionProvider']
|
||||
elif provider == 'rocm':
|
||||
execution_providers = ['ROCMExecutionProvider', 'CPUExecutionProvider']
|
||||
elif provider == 'migraphx':
|
||||
execution_providers = ['MIGraphXExecutionProvider', 'ROCMExecutionProvider', 'CPUExecutionProvider']
|
||||
elif provider == 'cuda':
|
||||
execution_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
|
||||
elif provider == 'tensorrt':
|
||||
execution_providers = ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
|
||||
if provider == "dml":
|
||||
execution_providers = ["DmlExecutionProvider", "CPUExecutionProvider"]
|
||||
elif provider == "rocm":
|
||||
execution_providers = ["ROCMExecutionProvider", "CPUExecutionProvider"]
|
||||
elif provider == "migraphx":
|
||||
execution_providers = [
|
||||
"MIGraphXExecutionProvider",
|
||||
"ROCMExecutionProvider",
|
||||
"CPUExecutionProvider",
|
||||
]
|
||||
elif provider == "cuda":
|
||||
execution_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
|
||||
elif provider == "tensorrt":
|
||||
execution_providers = [
|
||||
"TensorrtExecutionProvider",
|
||||
"CUDAExecutionProvider",
|
||||
"CPUExecutionProvider",
|
||||
]
|
||||
else:
|
||||
execution_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
|
||||
execution_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
|
||||
else:
|
||||
execution_providers = ['CPUExecutionProvider']
|
||||
execution_providers = ["CPUExecutionProvider"]
|
||||
session = InferenceSession(onnx_model_path, sess_options, providers=execution_providers)
|
||||
except:
|
||||
logger.error(f"Exception", exc_info=True)
|
||||
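For orientation, a minimal sketch of calling this helper; the model path and provider are illustrative, and the helper is assumed to return the created session (None when creation fails):
session = create_onnxruntime_session("bert.onnx", use_gpu=True, provider="cuda", num_threads=4)
if session is not None:
    outputs = session.run(None, ort_inputs)  # ort_inputs: dict mapping input names to numpy arrays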
|
@ -123,9 +136,12 @@ def create_onnxruntime_session(onnx_model_path,
|
|||
|
||||
def setup_logger(verbose=True):
|
||||
if verbose:
|
||||
coloredlogs.install(level='DEBUG', fmt='[%(filename)s:%(lineno)s - %(funcName)20s()] %(message)s')
|
||||
coloredlogs.install(
|
||||
level="DEBUG",
|
||||
fmt="[%(filename)s:%(lineno)s - %(funcName)20s()] %(message)s",
|
||||
)
|
||||
else:
|
||||
coloredlogs.install(fmt='%(message)s')
|
||||
coloredlogs.install(fmt="%(message)s")
|
||||
logging.getLogger("transformers").setLevel(logging.WARNING)
|
||||
|
||||
|
||||
|
@ -137,25 +153,30 @@ def prepare_environment(cache_dir, output_dir, use_gpu, provider=None):
|
|||
os.makedirs(output_dir)
|
||||
|
||||
import onnxruntime
|
||||
|
||||
if use_gpu:
|
||||
if provider == 'dml':
|
||||
assert 'DmlExecutionProvider' in onnxruntime.get_available_providers(
|
||||
if provider == "dml":
|
||||
assert (
|
||||
"DmlExecutionProvider" in onnxruntime.get_available_providers()
|
||||
), "Please install onnxruntime-directml package to test GPU inference."
|
||||
|
||||
else:
|
||||
assert 'CUDAExecutionProvider' in onnxruntime.get_available_providers(
|
||||
assert (
|
||||
"CUDAExecutionProvider" in onnxruntime.get_available_providers()
|
||||
), "Please install onnxruntime-gpu package to test GPU inference."
|
||||
|
||||
import transformers
|
||||
logger.info(f'PyTorch Version:{torch.__version__}')
|
||||
logger.info(f'Transformers Version:{transformers.__version__}')
|
||||
logger.info(f'Onnxruntime Version:{onnxruntime.__version__}')
|
||||
|
||||
logger.info(f"PyTorch Version:{torch.__version__}")
|
||||
logger.info(f"Transformers Version:{transformers.__version__}")
|
||||
logger.info(f"Onnxruntime Version:{onnxruntime.__version__}")
|
||||
|
||||
# Support three major versions of PyTorch and OnnxRuntime, and up to 6 months of transformers.
|
||||
from packaging import version
|
||||
assert version.parse(torch.__version__) >= version.parse('1.5.0')
|
||||
assert version.parse(transformers.__version__) >= version.parse('3.0.0')
|
||||
assert version.parse(onnxruntime.__version__) >= version.parse('1.4.0')
|
||||
|
||||
assert version.parse(torch.__version__) >= version.parse("1.5.0")
|
||||
assert version.parse(transformers.__version__) >= version.parse("3.0.0")
|
||||
assert version.parse(onnxruntime.__version__) >= version.parse("1.4.0")
|
||||
|
||||
|
||||
def get_latency_result(runtimes, batch_size):
|
||||
|
@ -175,12 +196,29 @@ def get_latency_result(runtimes, batch_size):
|
|||
|
||||
|
||||
def output_details(results, csv_filename):
|
||||
with open(csv_filename, mode="a", newline='') as csv_file:
|
||||
with open(csv_filename, mode="a", newline="") as csv_file:
|
||||
column_names = [
|
||||
"engine", "version", "providers", "device", "precision", "optimizer", "io_binding", "model_name", "inputs",
|
||||
"threads", "batch_size", "sequence_length", "custom_layer_num", "datetime", "test_times", "QPS",
|
||||
"average_latency_ms", "latency_variance", "latency_90_percentile", "latency_95_percentile",
|
||||
"latency_99_percentile"
|
||||
"engine",
|
||||
"version",
|
||||
"providers",
|
||||
"device",
|
||||
"precision",
|
||||
"optimizer",
|
||||
"io_binding",
|
||||
"model_name",
|
||||
"inputs",
|
||||
"threads",
|
||||
"batch_size",
|
||||
"sequence_length",
|
||||
"custom_layer_num",
|
||||
"datetime",
|
||||
"test_times",
|
||||
"QPS",
|
||||
"average_latency_ms",
|
||||
"latency_variance",
|
||||
"latency_90_percentile",
|
||||
"latency_95_percentile",
|
||||
"latency_99_percentile",
|
||||
]
|
||||
|
||||
csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
|
||||
|
@ -192,10 +230,19 @@ def output_details(results, csv_filename):
|
|||
|
||||
|
||||
def output_summary(results, csv_filename, args):
|
||||
with open(csv_filename, mode="a", newline='') as csv_file:
|
||||
with open(csv_filename, mode="a", newline="") as csv_file:
|
||||
header_names = [
|
||||
"model_name", "inputs", "custom_layer_num", "engine", "version", "providers", "device", "precision",
|
||||
"optimizer", "io_binding", "threads"
|
||||
"model_name",
|
||||
"inputs",
|
||||
"custom_layer_num",
|
||||
"engine",
|
||||
"version",
|
||||
"providers",
|
||||
"device",
|
||||
"precision",
|
||||
"optimizer",
|
||||
"io_binding",
|
||||
"threads",
|
||||
]
|
||||
data_names = []
|
||||
for batch_size in args.batch_sizes:
|
||||
|
@ -211,9 +258,13 @@ def output_summary(results, csv_filename, args):
|
|||
for threads in args.num_threads:
|
||||
row = {}
|
||||
for result in results:
|
||||
if result["model_name"] == model_name and result["inputs"] == input_count and result[
|
||||
"engine"] == engine_name and result["io_binding"] == io_binding and result[
|
||||
"threads"] == threads:
|
||||
if (
|
||||
result["model_name"] == model_name
|
||||
and result["inputs"] == input_count
|
||||
and result["engine"] == engine_name
|
||||
and result["io_binding"] == io_binding
|
||||
and result["threads"] == threads
|
||||
):
|
||||
headers = {k: v for k, v in result.items() if k in header_names}
|
||||
if not row:
|
||||
row.update(headers)
|
||||
|
@ -232,9 +283,11 @@ def output_summary(results, csv_filename, args):
|
|||
|
||||
def output_fusion_statistics(model_fusion_statistics, csv_filename):
|
||||
from transformers import __version__ as transformers_version
|
||||
with open(csv_filename, mode="a", newline='') as csv_file:
|
||||
|
||||
with open(csv_filename, mode="a", newline="") as csv_file:
|
||||
column_names = ["model_filename", "datetime", "transformers", "torch"] + list(
|
||||
next(iter(model_fusion_statistics.values())).keys())
|
||||
next(iter(model_fusion_statistics.values())).keys()
|
||||
)
|
||||
csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
|
||||
csv_writer.writeheader()
|
||||
for key in model_fusion_statistics.keys():
|
||||
|
@ -256,18 +309,20 @@ def inference_ort(ort_session, ort_inputs, result_template, repeat_times, batch_
|
|||
return result
|
||||
|
||||
|
||||
def inference_ort_with_io_binding(ort_session,
|
||||
ort_inputs,
|
||||
result_template,
|
||||
repeat_times,
|
||||
ort_output_names,
|
||||
ort_outputs,
|
||||
output_buffers,
|
||||
output_buffer_max_sizes,
|
||||
batch_size,
|
||||
device,
|
||||
data_type=numpy.longlong,
|
||||
warm_up_repeat=0):
|
||||
def inference_ort_with_io_binding(
|
||||
ort_session,
|
||||
ort_inputs,
|
||||
result_template,
|
||||
repeat_times,
|
||||
ort_output_names,
|
||||
ort_outputs,
|
||||
output_buffers,
|
||||
output_buffer_max_sizes,
|
||||
batch_size,
|
||||
device,
|
||||
data_type=numpy.longlong,
|
||||
warm_up_repeat=0,
|
||||
):
|
||||
result = {}
|
||||
|
||||
# Bind inputs and outputs to onnxruntime session
|
||||
|
@ -275,18 +330,42 @@ def inference_ort_with_io_binding(ort_session,
|
|||
# Bind inputs to device
|
||||
for name in ort_inputs.keys():
|
||||
np_input = torch.from_numpy(ort_inputs[name]).to(device)
|
||||
input_type = IO_BINDING_DATA_TYPE_MAP[str(ort_inputs[name].dtype)] if str(
|
||||
ort_inputs[name].dtype) in IO_BINDING_DATA_TYPE_MAP else data_type
|
||||
io_binding.bind_input(name, np_input.device.type, 0, input_type, np_input.shape, np_input.data_ptr())
|
||||
input_type = (
|
||||
IO_BINDING_DATA_TYPE_MAP[str(ort_inputs[name].dtype)]
|
||||
if str(ort_inputs[name].dtype) in IO_BINDING_DATA_TYPE_MAP
|
||||
else data_type
|
||||
)
|
||||
io_binding.bind_input(
|
||||
name,
|
||||
np_input.device.type,
|
||||
0,
|
||||
input_type,
|
||||
np_input.shape,
|
||||
np_input.data_ptr(),
|
||||
)
|
||||
# Bind outputs buffers with the sizes needed if not allocated already
|
||||
if len(output_buffers) == 0:
|
||||
allocateOutputBuffers(output_buffers, output_buffer_max_sizes, device)
|
||||
|
||||
for i in range(len(ort_output_names)):
|
||||
io_binding.bind_output(ort_output_names[i], output_buffers[i].device.type, 0, numpy.float32,
|
||||
ort_outputs[i].shape, output_buffers[i].data_ptr())
|
||||
timeit.repeat(lambda: ort_session.run_with_iobinding(io_binding), number=1, repeat=warm_up_repeat) # Dry run
|
||||
runtimes = timeit.repeat(lambda: ort_session.run_with_iobinding(io_binding), number=1, repeat=repeat_times)
|
||||
io_binding.bind_output(
|
||||
ort_output_names[i],
|
||||
output_buffers[i].device.type,
|
||||
0,
|
||||
numpy.float32,
|
||||
ort_outputs[i].shape,
|
||||
output_buffers[i].data_ptr(),
|
||||
)
|
||||
timeit.repeat(
|
||||
lambda: ort_session.run_with_iobinding(io_binding),
|
||||
number=1,
|
||||
repeat=warm_up_repeat,
|
||||
) # Dry run
|
||||
runtimes = timeit.repeat(
|
||||
lambda: ort_session.run_with_iobinding(io_binding),
|
||||
number=1,
|
||||
repeat=repeat_times,
|
||||
)
|
||||
result.update(result_template)
|
||||
result.update({"io_binding": True})
|
||||
result.update(get_latency_result(runtimes, batch_size))
|
||||
|
@ -304,21 +383,23 @@ def allocateOutputBuffers(output_buffers, output_buffer_max_sizes, device):
|
|||
def set_random_seed(seed=123):
|
||||
"""Set random seed manully to get deterministic results"""
|
||||
import random
|
||||
|
||||
random.seed(seed)
|
||||
numpy.random.seed(seed)
|
||||
torch.manual_seed(seed)
|
||||
torch.cuda.manual_seed(seed)
|
||||
torch.cuda.manual_seed_all(seed)
|
||||
#torch.backends.cudnn.enabled = False
|
||||
#torch.backends.cudnn.benchmark = False
|
||||
#torch.backends.cudnn.deterministic = True
|
||||
# torch.backends.cudnn.enabled = False
|
||||
# torch.backends.cudnn.benchmark = False
|
||||
# torch.backends.cudnn.deterministic = True
|
||||
|
||||
|
||||
def measure_memory(is_gpu, func):
|
||||
import os
|
||||
import psutil
|
||||
from time import sleep
|
||||
|
||||
import psutil
|
||||
|
||||
class MemoryMonitor:
|
||||
def __init__(self, keep_measuring=True):
|
||||
self.keep_measuring = keep_measuring
|
||||
|
@ -333,8 +414,16 @@ def measure_memory(is_gpu, func):
|
|||
return max_usage
|
||||
|
||||
def measure_gpu_usage(self):
|
||||
from py3nvml.py3nvml import nvmlInit, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex, \
|
||||
nvmlDeviceGetMemoryInfo, nvmlDeviceGetName, nvmlShutdown, NVMLError
|
||||
from py3nvml.py3nvml import (
|
||||
NVMLError,
|
||||
nvmlDeviceGetCount,
|
||||
nvmlDeviceGetHandleByIndex,
|
||||
nvmlDeviceGetMemoryInfo,
|
||||
nvmlDeviceGetName,
|
||||
nvmlInit,
|
||||
nvmlShutdown,
|
||||
)
|
||||
|
||||
max_gpu_usage = []
|
||||
gpu_name = []
|
||||
try:
|
||||
|
@ -350,11 +439,14 @@ def measure_memory(is_gpu, func):
|
|||
if not self.keep_measuring:
|
||||
break
|
||||
nvmlShutdown()
|
||||
return [{
|
||||
"device_id": i,
|
||||
"name": gpu_name[i],
|
||||
"max_used_MB": max_gpu_usage[i]
|
||||
} for i in range(deviceCount)]
|
||||
return [
|
||||
{
|
||||
"device_id": i,
|
||||
"name": gpu_name[i],
|
||||
"max_used_MB": max_gpu_usage[i],
|
||||
}
|
||||
for i in range(deviceCount)
|
||||
]
|
||||
except NVMLError as error:
|
||||
if not self.silent:
|
||||
self.logger.error("Error fetching GPU information using nvml: %s", error)
|
||||
|
@ -365,6 +457,7 @@ def measure_memory(is_gpu, func):
|
|||
memory_before_test = monitor.measure_gpu_usage() if is_gpu else monitor.measure_cpu_usage()
|
||||
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
with ThreadPoolExecutor() as executor:
|
||||
monitor = MemoryMonitor()
|
||||
mem_thread = executor.submit(monitor.measure_gpu_usage if is_gpu else monitor.measure_cpu_usage)
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
#-------------------------------------------------------------------------
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
#--------------------------------------------------------------------------
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
# This tool measures the inference performance of onnxruntime or onnxruntime-gpu python package on Bert model.
|
||||
|
||||
|
@ -12,22 +12,22 @@
|
|||
# Example command to run test on batch_size 1 and 2 for a model on GPU:
|
||||
# python bert_perf_test.py --model bert.onnx --batch_size 1 2 --sequence_length 128 --use_gpu --samples 1000 --test_times 1
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
import os
|
||||
from pathlib import Path
|
||||
import timeit
|
||||
import statistics
|
||||
import psutil
|
||||
import csv
|
||||
import numpy as np
|
||||
import torch
|
||||
import random
|
||||
from datetime import datetime
|
||||
import multiprocessing
|
||||
from bert_test_data import get_bert_inputs, generate_test_data
|
||||
|
||||
import os
|
||||
import random
|
||||
import statistics
|
||||
import sys
|
||||
import timeit
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import psutil
|
||||
import torch
|
||||
from bert_test_data import generate_test_data, get_bert_inputs
|
||||
|
||||
|
||||
@dataclass
|
||||
|
@ -56,7 +56,7 @@ class ModelSetting:
|
|||
def create_session(model_path, use_gpu, provider, intra_op_num_threads, graph_optimization_level=None):
|
||||
import onnxruntime
|
||||
|
||||
if use_gpu and ('CUDAExecutionProvider' not in onnxruntime.get_available_providers()):
|
||||
if use_gpu and ("CUDAExecutionProvider" not in onnxruntime.get_available_providers()):
|
||||
print(
|
||||
"Warning: Please install onnxruntime-gpu package instead of onnxruntime, and use a machine with GPU for testing gpu performance."
|
||||
)
|
||||
|
@ -65,20 +65,28 @@ def create_session(model_path, use_gpu, provider, intra_op_num_threads, graph_op
|
|||
session = onnxruntime.InferenceSession(model_path)
|
||||
else:
|
||||
if use_gpu:
|
||||
if provider == 'dml':
|
||||
execution_providers = ['DmlExecutionProvider', 'CPUExecutionProvider']
|
||||
elif provider == 'rocm':
|
||||
execution_providers = ['ROCMExecutionProvider', 'CPUExecutionProvider']
|
||||
elif provider == 'migraphx':
|
||||
execution_providers = ['MIGraphXExecutionProvider', 'ROCMExecutionProvider', 'CPUExecutionProvider']
|
||||
elif provider == 'cuda':
|
||||
execution_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
|
||||
elif provider == 'tensorrt':
|
||||
execution_providers = ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
|
||||
if provider == "dml":
|
||||
execution_providers = ["DmlExecutionProvider", "CPUExecutionProvider"]
|
||||
elif provider == "rocm":
|
||||
execution_providers = ["ROCMExecutionProvider", "CPUExecutionProvider"]
|
||||
elif provider == "migraphx":
|
||||
execution_providers = [
|
||||
"MIGraphXExecutionProvider",
|
||||
"ROCMExecutionProvider",
|
||||
"CPUExecutionProvider",
|
||||
]
|
||||
elif provider == "cuda":
|
||||
execution_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
|
||||
elif provider == "tensorrt":
|
||||
execution_providers = [
|
||||
"TensorrtExecutionProvider",
|
||||
"CUDAExecutionProvider",
|
||||
"CPUExecutionProvider",
|
||||
]
|
||||
else:
|
||||
execution_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
|
||||
execution_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
|
||||
else:
|
||||
execution_providers = ['CPUExecutionProvider']
|
||||
execution_providers = ["CPUExecutionProvider"]
|
||||
|
||||
sess_options = onnxruntime.SessionOptions()
|
||||
sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
|
||||
|
@ -102,55 +110,69 @@ def create_session(model_path, use_gpu, provider, intra_op_num_threads, graph_op
|
|||
session = onnxruntime.InferenceSession(model_path, sess_options, providers=execution_providers)
|
||||
|
||||
if use_gpu:
|
||||
if provider == 'dml':
|
||||
assert 'DmlExecutionProvider' in session.get_providers()
|
||||
elif provider == 'rocm':
|
||||
assert 'ROCMExecutionProvider' in session.get_providers()
|
||||
elif provider == 'migraphx':
|
||||
assert 'MIGraphXExecutionProvider' in session.get_providers()
|
||||
assert 'ROCMExecutionProvider' in session.get_providers()
|
||||
elif provider == 'cuda':
|
||||
assert 'CUDAExecutionProvider' in session.get_providers()
|
||||
elif provider == 'tensorrt':
|
||||
assert 'TensorrtExecutionProvider' in session.get_providers()
|
||||
assert 'CUDAExecutionProvider' in session.get_providers()
|
||||
if provider == "dml":
|
||||
assert "DmlExecutionProvider" in session.get_providers()
|
||||
elif provider == "rocm":
|
||||
assert "ROCMExecutionProvider" in session.get_providers()
|
||||
elif provider == "migraphx":
|
||||
assert "MIGraphXExecutionProvider" in session.get_providers()
|
||||
assert "ROCMExecutionProvider" in session.get_providers()
|
||||
elif provider == "cuda":
|
||||
assert "CUDAExecutionProvider" in session.get_providers()
|
||||
elif provider == "tensorrt":
|
||||
assert "TensorrtExecutionProvider" in session.get_providers()
|
||||
assert "CUDAExecutionProvider" in session.get_providers()
|
||||
else:
|
||||
assert 'CUDAExecutionProvider' in session.get_providers()
|
||||
assert "CUDAExecutionProvider" in session.get_providers()
|
||||
else:
|
||||
assert 'CPUExecutionProvider' in session.get_providers()
|
||||
assert "CPUExecutionProvider" in session.get_providers()
|
||||
|
||||
return session
|
||||
|
||||
|
||||
def numpy_type(torch_type):
|
||||
type_map = {torch.float32: np.float32,
|
||||
torch.float16: np.float16,
|
||||
torch.int32: np.int32,
|
||||
torch.int64: np.longlong}
|
||||
type_map = {
|
||||
torch.float32: np.float32,
|
||||
torch.float16: np.float16,
|
||||
torch.int32: np.int32,
|
||||
torch.int64: np.longlong,
|
||||
}
|
||||
return type_map[torch_type]
|
||||
|
||||
|
||||
def create_input_output_tensors(inputs, outputs, device):
|
||||
input_tensors = {name: torch.from_numpy(array).to(device)
|
||||
for name, array in inputs.items()}
|
||||
output_tensors = {name: torch.from_numpy(array).to(device)
|
||||
for name, array in outputs.items()}
|
||||
input_tensors = {name: torch.from_numpy(array).to(device) for name, array in inputs.items()}
|
||||
output_tensors = {name: torch.from_numpy(array).to(device) for name, array in outputs.items()}
|
||||
return input_tensors, output_tensors
|
||||
|
||||
|
||||
def create_io_binding(sess, input_tensors, output_tensors):
|
||||
io_binding = sess.io_binding()
|
||||
for name, tensor in input_tensors.items():
|
||||
io_binding.bind_input(name, tensor.device.type, 0,
|
||||
numpy_type(tensor.dtype), tensor.shape,
|
||||
tensor.data_ptr())
|
||||
io_binding.bind_input(
|
||||
name,
|
||||
tensor.device.type,
|
||||
0,
|
||||
numpy_type(tensor.dtype),
|
||||
tensor.shape,
|
||||
tensor.data_ptr(),
|
||||
)
|
||||
for name, tensor in output_tensors.items():
|
||||
io_binding.bind_output(name, tensor.device.type, 0,
|
||||
numpy_type(tensor.dtype), tensor.shape,
|
||||
tensor.data_ptr())
|
||||
io_binding.bind_output(
|
||||
name,
|
||||
tensor.device.type,
|
||||
0,
|
||||
numpy_type(tensor.dtype),
|
||||
tensor.shape,
|
||||
tensor.data_ptr(),
|
||||
)
|
||||
return io_binding
|
||||
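A minimal sketch of how these helpers compose, assuming session is an onnxruntime InferenceSession and inputs/outputs are dicts of numpy arrays keyed by tensor name:
input_tensors, output_tensors = create_input_output_tensors(inputs, outputs, device="cuda")
io_binding = create_io_binding(session, input_tensors, output_tensors)
session.run_with_iobinding(io_binding)  # results are written into the pre-bound output_tensors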
|
||||
|
||||
def onnxruntime_inference_with_io_binding(session, all_inputs, output_names, test_setting):
|
||||
results = []
|
||||
latency_list = []
|
||||
device = 'cuda' if test_setting.use_gpu else 'cpu'
|
||||
device = "cuda" if test_setting.use_gpu else "cpu"
|
||||
for test_case_id, inputs in enumerate(all_inputs):
|
||||
result = session.run(output_names, inputs)
|
||||
results.append(result)
|
||||
|
@ -171,6 +193,7 @@ def onnxruntime_inference_with_io_binding(session, all_inputs, output_names, tes
|
|||
|
||||
return results, latency_list
|
||||
|
||||
|
||||
def onnxruntime_inference(session, all_inputs, output_names):
|
||||
if len(all_inputs) > 0:
|
||||
# Use a random input as warm up.
|
||||
|
@ -186,19 +209,25 @@ def onnxruntime_inference(session, all_inputs, output_names):
|
|||
latency_list.append(latency)
|
||||
return results, latency_list
|
||||
|
||||
|
||||
def to_string(model_path, session, test_setting):
|
||||
sess_options = session.get_session_options()
|
||||
option = "model={},".format(os.path.basename(model_path))
|
||||
option += "graph_optimization_level={},intra_op_num_threads={},".format(sess_options.graph_optimization_level,
|
||||
sess_options.intra_op_num_threads).replace(
|
||||
'GraphOptimizationLevel.ORT_', '')
|
||||
option += "graph_optimization_level={},intra_op_num_threads={},".format(
|
||||
sess_options.graph_optimization_level, sess_options.intra_op_num_threads
|
||||
).replace("GraphOptimizationLevel.ORT_", "")
|
||||
option += f"batch_size={test_setting.batch_size},sequence_length={test_setting.sequence_length},test_cases={test_setting.test_cases},test_times={test_setting.test_times},use_gpu={test_setting.use_gpu}"
|
||||
return option
|
||||
|
||||
|
||||
def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads):
|
||||
session = create_session(model_setting.model_path, test_setting.use_gpu, test_setting.provider, intra_op_num_threads,
|
||||
model_setting.opt_level)
|
||||
session = create_session(
|
||||
model_setting.model_path,
|
||||
test_setting.use_gpu,
|
||||
test_setting.provider,
|
||||
intra_op_num_threads,
|
||||
model_setting.opt_level,
|
||||
)
|
||||
output_names = [output.name for output in session.get_outputs()]
|
||||
|
||||
key = to_string(model_setting.model_path, session, test_setting)
|
||||
|
@ -211,7 +240,9 @@ def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op
|
|||
all_latency_list = []
|
||||
if test_setting.use_io_binding:
|
||||
for i in range(test_setting.test_times):
|
||||
results, latency_list = onnxruntime_inference_with_io_binding(session, all_inputs, output_names, test_setting)
|
||||
results, latency_list = onnxruntime_inference_with_io_binding(
|
||||
session, all_inputs, output_names, test_setting
|
||||
)
|
||||
all_latency_list.extend(latency_list)
|
||||
else:
|
||||
for i in range(test_setting.test_times):
|
||||
|
@ -229,23 +260,45 @@ def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op
|
|||
latency_99 = np.percentile(latency_ms, 99)
|
||||
throughput = test_setting.batch_size * (1000.0 / average_latency)
|
||||
|
||||
perf_results[key] = (average_latency, latency_50, latency_75, latency_90, latency_95, latency_99, throughput)
|
||||
perf_results[key] = (
|
||||
average_latency,
|
||||
latency_50,
|
||||
latency_75,
|
||||
latency_90,
|
||||
latency_95,
|
||||
latency_99,
|
||||
throughput,
|
||||
)
|
||||
|
||||
print("Average latency = {} ms, Throughput = {} QPS".format(format(average_latency, '.2f'),
|
||||
format(throughput, '.2f')))
|
||||
print(
|
||||
"Average latency = {} ms, Throughput = {} QPS".format(format(average_latency, ".2f"), format(throughput, ".2f"))
|
||||
)
|
||||
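As a worked example of the throughput formula above (numbers illustrative): with batch_size=4 and average_latency=20 ms, throughput = 4 * (1000.0 / 20) = 200 QPS.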
|
||||
|
||||
def launch_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads):
|
||||
process = multiprocessing.Process(target=run_one_test,
|
||||
args=(model_setting, test_setting, perf_results, all_inputs,
|
||||
intra_op_num_threads))
|
||||
process = multiprocessing.Process(
|
||||
target=run_one_test,
|
||||
args=(
|
||||
model_setting,
|
||||
test_setting,
|
||||
perf_results,
|
||||
all_inputs,
|
||||
intra_op_num_threads,
|
||||
),
|
||||
)
|
||||
process.start()
|
||||
process.join()
|
||||
|
||||
|
||||
def run_perf_tests(model_setting, test_setting, perf_results, all_inputs):
|
||||
if (test_setting.intra_op_num_threads is not None):
|
||||
launch_test(model_setting, test_setting, perf_results, all_inputs, test_setting.intra_op_num_threads)
|
||||
if test_setting.intra_op_num_threads is not None:
|
||||
launch_test(
|
||||
model_setting,
|
||||
test_setting,
|
||||
perf_results,
|
||||
all_inputs,
|
||||
test_setting.intra_op_num_threads,
|
||||
)
|
||||
return
|
||||
|
||||
cpu_count = psutil.cpu_count(logical=False)
|
||||
|
@ -262,91 +315,139 @@ def run_perf_tests(model_setting, test_setting, perf_results, all_inputs):
|
|||
|
||||
|
||||
def run_performance(model_setting, test_setting, perf_results):
|
||||
input_ids, segment_ids, input_mask = get_bert_inputs(model_setting.model_path, model_setting.input_ids_name,
|
||||
model_setting.segment_ids_name, model_setting.input_mask_name)
|
||||
input_ids, segment_ids, input_mask = get_bert_inputs(
|
||||
model_setting.model_path,
|
||||
model_setting.input_ids_name,
|
||||
model_setting.segment_ids_name,
|
||||
model_setting.input_mask_name,
|
||||
)
|
||||
|
||||
# Do not generate random mask for performance test.
|
||||
print(
|
||||
f"Generating {test_setting.test_cases} samples for batch_size={test_setting.batch_size} sequence_length={test_setting.sequence_length}"
|
||||
)
|
||||
all_inputs = generate_test_data(test_setting.batch_size,
|
||||
test_setting.sequence_length,
|
||||
test_setting.test_cases,
|
||||
test_setting.seed,
|
||||
test_setting.verbose,
|
||||
input_ids,
|
||||
segment_ids,
|
||||
input_mask,
|
||||
random_mask_length=False)
|
||||
all_inputs = generate_test_data(
|
||||
test_setting.batch_size,
|
||||
test_setting.sequence_length,
|
||||
test_setting.test_cases,
|
||||
test_setting.seed,
|
||||
test_setting.verbose,
|
||||
input_ids,
|
||||
segment_ids,
|
||||
input_mask,
|
||||
random_mask_length=False,
|
||||
)
|
||||
|
||||
run_perf_tests(model_setting, test_setting, perf_results, all_inputs)
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--model', required=True, type=str, help="bert onnx model path")
|
||||
|
||||
parser.add_argument('-b',
|
||||
'--batch_size',
|
||||
required=True,
|
||||
type=int,
|
||||
nargs="+",
|
||||
help="batch size of input. Allow one or multiple values in the range of [1, 128].")
|
||||
|
||||
parser.add_argument('-s', '--sequence_length', required=True, type=int, help="maximum sequence length of input")
|
||||
|
||||
parser.add_argument('--samples', required=False, type=int, default=10, help="number of samples to be generated")
|
||||
|
||||
parser.add_argument('-t',
|
||||
'--test_times',
|
||||
required=False,
|
||||
type=int,
|
||||
default=0,
|
||||
help="number of times to run per sample. By default, the value is 1000 / samples")
|
||||
parser.add_argument("--model", required=True, type=str, help="bert onnx model path")
|
||||
|
||||
parser.add_argument(
|
||||
'--opt_level',
|
||||
"-b",
|
||||
"--batch_size",
|
||||
required=True,
|
||||
type=int,
|
||||
nargs="+",
|
||||
help="batch size of input. Allow one or multiple values in the range of [1, 128].",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"-s",
|
||||
"--sequence_length",
|
||||
required=True,
|
||||
type=int,
|
||||
help="maximum sequence length of input",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--samples",
|
||||
required=False,
|
||||
type=int,
|
||||
default=10,
|
||||
help="number of samples to be generated",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"-t",
|
||||
"--test_times",
|
||||
required=False,
|
||||
type=int,
|
||||
default=0,
|
||||
help="number of times to run per sample. By default, the value is 1000 / samples",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--opt_level",
|
||||
required=False,
|
||||
type=int,
|
||||
choices=[0, 1, 2, 99],
|
||||
default=99,
|
||||
help="onnxruntime optimization level: 0 - disable all, 1 - basic, 2 - extended, 99 - enable all.")
|
||||
help="onnxruntime optimization level: 0 - disable all, 1 - basic, 2 - extended, 99 - enable all.",
|
||||
)
|
||||
|
||||
parser.add_argument('--seed',
|
||||
required=False,
|
||||
type=int,
|
||||
default=3,
|
||||
help="random seed. Use the same seed to make sure test data is same in multiple tests.")
|
||||
parser.add_argument(
|
||||
"--seed",
|
||||
required=False,
|
||||
type=int,
|
||||
default=3,
|
||||
help="random seed. Use the same seed to make sure test data is same in multiple tests.",
|
||||
)
|
||||
|
||||
parser.add_argument('--verbose', required=False, action='store_true', help="print verbose information")
|
||||
parser.add_argument(
|
||||
"--verbose",
|
||||
required=False,
|
||||
action="store_true",
|
||||
help="print verbose information",
|
||||
)
|
||||
parser.set_defaults(verbose=False)
|
||||
|
||||
parser.add_argument('--use_gpu', required=False, action='store_true', help="use GPU")
|
||||
parser.add_argument("--use_gpu", required=False, action="store_true", help="use GPU")
|
||||
parser.set_defaults(use_gpu=False)
|
||||
|
||||
parser.add_argument('--use_io_binding', required=False, action='store_true', help="use io_binding")
|
||||
parser.add_argument("--use_io_binding", required=False, action="store_true", help="use io_binding")
|
||||
parser.set_defaults(use_io_binding=False)
|
||||
|
||||
parser.add_argument("--provider",
|
||||
required=False,
|
||||
type=str,
|
||||
default=None,
|
||||
help="Execution provider to use")
|
||||
parser.add_argument(
|
||||
"--provider",
|
||||
required=False,
|
||||
type=str,
|
||||
default=None,
|
||||
help="Execution provider to use",
|
||||
)
|
||||
|
||||
parser.add_argument('-n',
|
||||
'--intra_op_num_threads',
|
||||
required=False,
|
||||
type=int,
|
||||
default=None,
|
||||
help=">=0, set intra_op_num_threads")
|
||||
parser.add_argument(
|
||||
"-n",
|
||||
"--intra_op_num_threads",
|
||||
required=False,
|
||||
type=int,
|
||||
default=None,
|
||||
help=">=0, set intra_op_num_threads",
|
||||
)
|
||||
|
||||
parser.add_argument('--input_ids_name', required=False, type=str, default=None, help="input name for input ids")
|
||||
parser.add_argument('--segment_ids_name', required=False, type=str, default=None, help="input name for segment ids")
|
||||
parser.add_argument('--input_mask_name',
|
||||
required=False,
|
||||
type=str,
|
||||
default=None,
|
||||
help="input name for attention mask")
|
||||
parser.add_argument(
|
||||
"--input_ids_name",
|
||||
required=False,
|
||||
type=str,
|
||||
default=None,
|
||||
help="input name for input ids",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--segment_ids_name",
|
||||
required=False,
|
||||
type=str,
|
||||
default=None,
|
||||
help="input name for segment ids",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--input_mask_name",
|
||||
required=False,
|
||||
type=str,
|
||||
default=None,
|
||||
help="input name for attention mask",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
@ -365,12 +466,27 @@ def main():
|
|||
if not (min(batch_size_set) >= 1 and max(batch_size_set) <= 128):
|
||||
raise Exception("batch_size not in range [1, 128]")
|
||||
|
||||
model_setting = ModelSetting(args.model, args.input_ids_name, args.segment_ids_name, args.input_mask_name,
|
||||
args.opt_level)
|
||||
model_setting = ModelSetting(
|
||||
args.model,
|
||||
args.input_ids_name,
|
||||
args.segment_ids_name,
|
||||
args.input_mask_name,
|
||||
args.opt_level,
|
||||
)
|
||||
|
||||
for batch_size in batch_size_set:
|
||||
test_setting = TestSetting(batch_size, args.sequence_length, args.samples, args.test_times, args.use_gpu, args.use_io_binding,
|
||||
args.provider, args.intra_op_num_threads, args.seed, args.verbose)
|
||||
test_setting = TestSetting(
|
||||
batch_size,
|
||||
args.sequence_length,
|
||||
args.samples,
|
||||
args.test_times,
|
||||
args.use_gpu,
|
||||
args.use_io_binding,
|
||||
args.provider,
|
||||
args.intra_op_num_threads,
|
||||
args.seed,
|
||||
args.verbose,
|
||||
)
|
||||
|
||||
print("test setting", test_setting)
|
||||
run_performance(model_setting, test_setting, perf_results)
|
||||
|
@ -380,25 +496,33 @@ def main():
|
|||
|
||||
summary_file = os.path.join(
|
||||
Path(args.model).parent,
|
||||
"perf_results_{}_B{}_S{}_{}.txt".format('GPU' if args.use_gpu else 'CPU',
|
||||
"-".join([str(x) for x in sorted(list(batch_size_set))]),
|
||||
args.sequence_length,
|
||||
datetime.now().strftime("%Y%m%d-%H%M%S")))
|
||||
with open(summary_file, 'w+', newline='') as tsv_file:
|
||||
tsv_writer = csv.writer(tsv_file, delimiter='\t', lineterminator='\n')
|
||||
"perf_results_{}_B{}_S{}_{}.txt".format(
|
||||
"GPU" if args.use_gpu else "CPU",
|
||||
"-".join([str(x) for x in sorted(list(batch_size_set))]),
|
||||
args.sequence_length,
|
||||
datetime.now().strftime("%Y%m%d-%H%M%S"),
|
||||
),
|
||||
)
|
||||
with open(summary_file, "w+", newline="") as tsv_file:
|
||||
tsv_writer = csv.writer(tsv_file, delimiter="\t", lineterminator="\n")
|
||||
headers = None
|
||||
for (key, perf_result) in sorted_results:
|
||||
params = key.split(',')
|
||||
params = key.split(",")
|
||||
if headers is None:
|
||||
headers = [
|
||||
"Latency(ms)", "Latency_P50", "Latency_P75", "Latency_P90", "Latency_P95", "Latency_P99",
|
||||
"Throughput(QPS)"
|
||||
"Latency(ms)",
|
||||
"Latency_P50",
|
||||
"Latency_P75",
|
||||
"Latency_P90",
|
||||
"Latency_P95",
|
||||
"Latency_P99",
|
||||
"Throughput(QPS)",
|
||||
]
|
||||
headers.extend([x.split('=')[0] for x in params])
|
||||
headers.extend([x.split("=")[0] for x in params])
|
||||
tsv_writer.writerow(headers)
|
||||
|
||||
values = [format(x, '.2f') for x in perf_result]
|
||||
values.extend([x.split('=')[1] for x in params])
|
||||
values = [format(x, ".2f") for x in perf_result]
|
||||
values.extend([x.split("=")[1] for x in params])
|
||||
tsv_writer.writerow(values)
|
||||
|
||||
print("Test summary is saved to", summary_file)
|
||||
|
|
|
@ -1,24 +1,26 @@
|
|||
#-------------------------------------------------------------------------
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
#--------------------------------------------------------------------------
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
# It is a tool to generate test data for a bert model.
|
||||
# The test data can be used by onnxruntime_perf_test tool to evaluate the inference latency.
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
import numpy as np
|
||||
import os
|
||||
import random
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Tuple, Union
|
||||
from typing import Dict, List, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
from onnx import ModelProto, TensorProto, numpy_helper
|
||||
from onnx_model import OnnxModel
|
||||
|
||||
|
||||
def fake_input_ids_data(input_ids: TensorProto, batch_size: int, sequence_length: int,
|
||||
dictionary_size: int) -> np.ndarray:
|
||||
def fake_input_ids_data(
|
||||
input_ids: TensorProto, batch_size: int, sequence_length: int, dictionary_size: int
|
||||
) -> np.ndarray:
|
||||
"""Create input tensor based on the graph input of input_ids
|
||||
|
||||
Args:
|
||||
|
@ -30,7 +32,11 @@ def fake_input_ids_data(input_ids: TensorProto, batch_size: int, sequence_length
|
|||
Returns:
|
||||
np.ndarray: the input tensor created
|
||||
"""
|
||||
assert input_ids.type.tensor_type.elem_type in [TensorProto.FLOAT, TensorProto.INT32, TensorProto.INT64]
|
||||
assert input_ids.type.tensor_type.elem_type in [
|
||||
TensorProto.FLOAT,
|
||||
TensorProto.INT32,
|
||||
TensorProto.INT64,
|
||||
]
|
||||
|
||||
data = np.random.randint(dictionary_size, size=(batch_size, sequence_length), dtype=np.int32)
|
||||
|
||||
|
@ -43,7 +49,7 @@ def fake_input_ids_data(input_ids: TensorProto, batch_size: int, sequence_length
|
|||
|
||||
|
||||
def fake_segment_ids_data(segment_ids: TensorProto, batch_size: int, sequence_length: int) -> np.ndarray:
|
||||
"""Create input tensor based on the graph input of segment_ids
|
||||
"""Create input tensor based on the graph input of segment_ids
|
||||
|
||||
Args:
|
||||
segment_ids (TensorProto): graph input of the token_type_ids input tensor
|
||||
|
@ -53,7 +59,11 @@ def fake_segment_ids_data(segment_ids: TensorProto, batch_size: int, sequence_le
|
|||
Returns:
|
||||
np.ndarray: the input tensor created
|
||||
"""
|
||||
assert segment_ids.type.tensor_type.elem_type in [TensorProto.FLOAT, TensorProto.INT32, TensorProto.INT64]
|
||||
assert segment_ids.type.tensor_type.elem_type in [
|
||||
TensorProto.FLOAT,
|
||||
TensorProto.INT32,
|
||||
TensorProto.INT64,
|
||||
]
|
||||
|
||||
data = np.zeros((batch_size, sequence_length), dtype=np.int32)
|
||||
|
||||
|
@ -65,8 +75,12 @@ def fake_segment_ids_data(segment_ids: TensorProto, batch_size: int, sequence_le
|
|||
return data
|
||||
|
||||
|
||||
def fake_input_mask_data(input_mask: TensorProto, batch_size: int, sequence_length: int,
|
||||
random_mask_length: bool) -> np.ndarray:
|
||||
def fake_input_mask_data(
|
||||
input_mask: TensorProto,
|
||||
batch_size: int,
|
||||
sequence_length: int,
|
||||
random_mask_length: bool,
|
||||
) -> np.ndarray:
|
||||
"""Create input tensor based on the graph input of segment_ids.
|
||||
|
||||
Args:
|
||||
|
@ -79,13 +93,17 @@ def fake_input_mask_data(input_mask: TensorProto, batch_size: int, sequence_leng
|
|||
np.ndarray: the input tensor created
|
||||
"""
|
||||
|
||||
assert input_mask.type.tensor_type.elem_type in [TensorProto.FLOAT, TensorProto.INT32, TensorProto.INT64]
|
||||
assert input_mask.type.tensor_type.elem_type in [
|
||||
TensorProto.FLOAT,
|
||||
TensorProto.INT32,
|
||||
TensorProto.INT64,
|
||||
]
|
||||
|
||||
if random_mask_length:
|
||||
actual_seq_len = random.randint(int(sequence_length * 2 / 3), sequence_length)
|
||||
data = np.zeros((batch_size, sequence_length), dtype=np.int32)
|
||||
temp = np.ones((batch_size, actual_seq_len), dtype=np.int32)
|
||||
data[:temp.shape[0], :temp.shape[1]] = temp
|
||||
data[: temp.shape[0], : temp.shape[1]] = temp
|
||||
else:
|
||||
data = np.ones((batch_size, sequence_length), dtype=np.int32)
|
||||
|
||||
|
@ -117,14 +135,23 @@ def output_test_data(dir: str, inputs: np.ndarray):
|
|||
index = 0
|
||||
for name, data in inputs.items():
|
||||
tensor = numpy_helper.from_array(data, name)
|
||||
with open(os.path.join(dir, 'input_{}.pb'.format(index)), 'wb') as f:
|
||||
with open(os.path.join(dir, "input_{}.pb".format(index)), "wb") as f:
|
||||
f.write(tensor.SerializeToString())
|
||||
index += 1
|
||||
|
||||
|
||||
def fake_test_data(batch_size: int, sequence_length: int, test_cases: int, dictionary_size: int, verbose: bool,
|
||||
random_seed: int, input_ids: TensorProto, segment_ids: TensorProto, input_mask: TensorProto,
|
||||
random_mask_length: bool):
|
||||
def fake_test_data(
|
||||
batch_size: int,
|
||||
sequence_length: int,
|
||||
test_cases: int,
|
||||
dictionary_size: int,
|
||||
verbose: bool,
|
||||
random_seed: int,
|
||||
input_ids: TensorProto,
|
||||
segment_ids: TensorProto,
|
||||
input_mask: TensorProto,
|
||||
random_mask_length: bool,
|
||||
):
|
||||
"""Create given number of input data for testing
|
||||
|
||||
Args:
|
||||
|
@ -164,9 +191,17 @@ def fake_test_data(batch_size: int, sequence_length: int, test_cases: int, dicti
|
|||
return all_inputs
|
||||
|
||||
|
||||
def generate_test_data(batch_size: int, sequence_length: int, test_cases: int, seed: int, verbose: bool,
|
||||
input_ids: TensorProto, segment_ids: TensorProto, input_mask: TensorProto,
|
||||
random_mask_length: bool):
|
||||
def generate_test_data(
|
||||
batch_size: int,
|
||||
sequence_length: int,
|
||||
test_cases: int,
|
||||
seed: int,
|
||||
verbose: bool,
|
||||
input_ids: TensorProto,
|
||||
segment_ids: TensorProto,
|
||||
input_mask: TensorProto,
|
||||
random_mask_length: bool,
|
||||
):
|
||||
"""Create given number of minput data for testing
|
||||
|
||||
Args:
|
||||
|
@ -184,8 +219,18 @@ def generate_test_data(batch_size: int, sequence_length: int, test_cases: int, s
|
|||
List[Dict[str,numpy.ndarray]]: list of test cases, where each test case is a dictionary with input name as key and a tensor as value
|
||||
"""
|
||||
dictionary_size = 10000
|
||||
all_inputs = fake_test_data(batch_size, sequence_length, test_cases, dictionary_size, verbose, seed, input_ids,
|
||||
segment_ids, input_mask, random_mask_length)
|
||||
all_inputs = fake_test_data(
|
||||
batch_size,
|
||||
sequence_length,
|
||||
test_cases,
|
||||
dictionary_size,
|
||||
verbose,
|
||||
seed,
|
||||
input_ids,
|
||||
segment_ids,
|
||||
input_mask,
|
||||
random_mask_length,
|
||||
)
|
||||
if len(all_inputs) != test_cases:
|
||||
print("Failed to create test data for test.")
|
||||
return all_inputs
|
||||
|
@ -199,16 +244,17 @@ def get_graph_input_from_embed_node(onnx_model, embed_node, input_index):
|
|||
graph_input = onnx_model.find_graph_input(input)
|
||||
if graph_input is None:
|
||||
parent_node = onnx_model.get_parent(embed_node, input_index)
|
||||
if parent_node is not None and parent_node.op_type == 'Cast':
|
||||
if parent_node is not None and parent_node.op_type == "Cast":
|
||||
graph_input = onnx_model.find_graph_input(parent_node.input[0])
|
||||
return graph_input
|
||||
|
||||
|
||||
def find_bert_inputs(onnx_model: OnnxModel,
|
||||
input_ids_name: str = None,
|
||||
segment_ids_name: str = None,
|
||||
input_mask_name: str = None
|
||||
) -> Tuple[Union[None, np.ndarray], Union[None, np.ndarray], Union[None, np.ndarray]]:
|
||||
def find_bert_inputs(
|
||||
onnx_model: OnnxModel,
|
||||
input_ids_name: str = None,
|
||||
segment_ids_name: str = None,
|
||||
input_mask_name: str = None,
|
||||
) -> Tuple[Union[None, np.ndarray], Union[None, np.ndarray], Union[None, np.ndarray]]:
|
||||
"""Find graph inputs for BERT model.
|
||||
First, we will deduce inputs from EmbedLayerNormalization node. If not found, we will guess the meaning of graph inputs based on naming.
|
||||
|
||||
|
@ -254,7 +300,7 @@ def find_bert_inputs(onnx_model: OnnxModel,
|
|||
if len(graph_inputs) != 3:
|
||||
raise ValueError("Expect the graph to have 3 inputs. Got {}".format(len(graph_inputs)))
|
||||
|
||||
embed_nodes = onnx_model.get_nodes_by_op_type('EmbedLayerNormalization')
|
||||
embed_nodes = onnx_model.get_nodes_by_op_type("EmbedLayerNormalization")
|
||||
if len(embed_nodes) == 1:
|
||||
embed_node = embed_nodes[0]
|
||||
input_ids = get_graph_input_from_embed_node(onnx_model, embed_node, 0)
|
||||
|
@ -279,7 +325,9 @@ def find_bert_inputs(onnx_model: OnnxModel,
|
|||
input_name_lower = input.name.lower()
|
||||
if "mask" in input_name_lower: # matches input with name like "attention_mask" or "input_mask"
|
||||
input_mask = input
|
||||
elif "token" in input_name_lower or "segment" in input_name_lower: # matches input with name like "segment_ids" or "token_type_ids"
|
||||
elif (
|
||||
"token" in input_name_lower or "segment" in input_name_lower
|
||||
): # matches input with name like "segment_ids" or "token_type_ids"
|
||||
segment_ids = input
|
||||
else:
|
||||
input_ids = input
|
||||
|
@ -290,10 +338,12 @@ def find_bert_inputs(onnx_model: OnnxModel,
|
|||
raise ValueError("Fail to assign 3 inputs. You might try rename the graph inputs.")
|
||||
|
||||
|
||||
def get_bert_inputs(onnx_file: str,
|
||||
input_ids_name: str = None,
|
||||
segment_ids_name: str = None,
|
||||
input_mask_name: str = None):
|
||||
def get_bert_inputs(
|
||||
onnx_file: str,
|
||||
input_ids_name: str = None,
|
||||
segment_ids_name: str = None,
|
||||
input_mask_name: str = None,
|
||||
):
|
||||
"""Find graph inputs for BERT model.
|
||||
First, we will deduce inputs from EmbedLayerNormalization node. If not found, we will guess the meaning of graph inputs based on naming.
|
||||
|
||||
|
@ -317,54 +367,95 @@ def get_bert_inputs(onnx_file: str,
|
|||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument('--model', required=True, type=str, help="bert onnx model path.")
|
||||
parser.add_argument("--model", required=True, type=str, help="bert onnx model path.")
|
||||
|
||||
parser.add_argument('--output_dir',
|
||||
required=False,
|
||||
type=str,
|
||||
default=None,
|
||||
help="output test data path. Default is current directory.")
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
required=False,
|
||||
type=str,
|
||||
default=None,
|
||||
help="output test data path. Default is current directory.",
|
||||
)
|
||||
|
||||
parser.add_argument('--batch_size', required=False, type=int, default=1, help="batch size of input")
|
||||
parser.add_argument("--batch_size", required=False, type=int, default=1, help="batch size of input")
|
||||
|
||||
parser.add_argument('--sequence_length',
|
||||
required=False,
|
||||
type=int,
|
||||
default=128,
|
||||
help="maximum sequence length of input")
|
||||
parser.add_argument(
|
||||
"--sequence_length",
|
||||
required=False,
|
||||
type=int,
|
||||
default=128,
|
||||
help="maximum sequence length of input",
|
||||
)
|
||||
|
||||
parser.add_argument('--input_ids_name', required=False, type=str, default=None, help="input name for input ids")
|
||||
parser.add_argument('--segment_ids_name', required=False, type=str, default=None, help="input name for segment ids")
|
||||
parser.add_argument('--input_mask_name',
|
||||
required=False,
|
||||
type=str,
|
||||
default=None,
|
||||
help="input name for attention mask")
|
||||
parser.add_argument(
|
||||
"--input_ids_name",
|
||||
required=False,
|
||||
type=str,
|
||||
default=None,
|
||||
help="input name for input ids",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--segment_ids_name",
|
||||
required=False,
|
||||
type=str,
|
||||
default=None,
|
||||
help="input name for segment ids",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--input_mask_name",
|
||||
required=False,
|
||||
type=str,
|
||||
default=None,
|
||||
help="input name for attention mask",
|
||||
)
|
||||
|
||||
parser.add_argument('--samples', required=False, type=int, default=1, help="number of test cases to be generated")
|
||||
parser.add_argument(
|
||||
"--samples",
|
||||
required=False,
|
||||
type=int,
|
||||
default=1,
|
||||
help="number of test cases to be generated",
|
||||
)
|
||||
|
||||
parser.add_argument('--seed', required=False, type=int, default=3, help="random seed")
|
||||
parser.add_argument("--seed", required=False, type=int, default=3, help="random seed")
|
||||
|
||||
parser.add_argument('--verbose', required=False, action='store_true', help="print verbose information")
|
||||
parser.add_argument(
|
||||
"--verbose",
|
||||
required=False,
|
||||
action="store_true",
|
||||
help="print verbose information",
|
||||
)
|
||||
parser.set_defaults(verbose=False)
|
||||
|
||||
parser.add_argument('--only_input_tensors',
|
||||
required=False,
|
||||
action='store_true',
|
||||
help="only save input tensors and no output tensors")
|
||||
parser.add_argument(
|
||||
"--only_input_tensors",
|
||||
required=False,
|
||||
action="store_true",
|
||||
help="only save input tensors and no output tensors",
|
||||
)
|
||||
parser.set_defaults(only_input_tensors=False)
|
||||
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
def create_and_save_test_data(model: str, output_dir: str, batch_size: int, sequence_length: int, test_cases: int,
|
||||
seed: int, verbose: bool, input_ids_name: str, segment_ids_name: str,
|
||||
input_mask_name: str, only_input_tensors: bool):
|
||||
def create_and_save_test_data(
|
||||
model: str,
|
||||
output_dir: str,
|
||||
batch_size: int,
|
||||
sequence_length: int,
|
||||
test_cases: int,
|
||||
seed: int,
|
||||
verbose: bool,
|
||||
input_ids_name: str,
|
||||
segment_ids_name: str,
|
||||
input_mask_name: str,
|
||||
only_input_tensors: bool,
|
||||
):
|
||||
"""Create test data for a model, and save test data to a directory.
|
||||
|
||||
Args:
|
||||
model (str): path of ONNX bert model
|
||||
output_dir (str): output directory
|
||||
batch_size (int): batch size
|
||||
sequence_length (int): sequence length
|
||||
|
@ -378,33 +469,36 @@ def create_and_save_test_data(model: str, output_dir: str, batch_size: int, sequ
|
|||
"""
|
||||
input_ids, segment_ids, input_mask = get_bert_inputs(model, input_ids_name, segment_ids_name, input_mask_name)
|
||||
|
||||
all_inputs = generate_test_data(batch_size,
|
||||
sequence_length,
|
||||
test_cases,
|
||||
seed,
|
||||
verbose,
|
||||
input_ids,
|
||||
segment_ids,
|
||||
input_mask,
|
||||
random_mask_length=False)
|
||||
all_inputs = generate_test_data(
|
||||
batch_size,
|
||||
sequence_length,
|
||||
test_cases,
|
||||
seed,
|
||||
verbose,
|
||||
input_ids,
|
||||
segment_ids,
|
||||
input_mask,
|
||||
random_mask_length=False,
|
||||
)
|
||||
|
||||
for i, inputs in enumerate(all_inputs):
|
||||
dir = os.path.join(output_dir, 'test_data_set_' + str(i))
|
||||
dir = os.path.join(output_dir, "test_data_set_" + str(i))
|
||||
output_test_data(dir, inputs)
|
||||
|
||||
if only_input_tensors:
|
||||
return
|
||||
|
||||
import onnxruntime
|
||||
|
||||
sess = onnxruntime.InferenceSession(model)
|
||||
output_names = [output.name for output in sess.get_outputs()]
|
||||
|
||||
for i, inputs in enumerate(all_inputs):
|
||||
dir = os.path.join(output_dir, 'test_data_set_' + str(i))
|
||||
dir = os.path.join(output_dir, "test_data_set_" + str(i))
|
||||
result = sess.run(output_names, inputs)
|
||||
for i, output_name in enumerate(output_names):
|
||||
tensor_result = numpy_helper.from_array(np.asarray(result[i]), output_names[i])
|
||||
with open(os.path.join(dir, 'output_{}.pb'.format(i)), 'wb') as f:
|
||||
with open(os.path.join(dir, "output_{}.pb".format(i)), "wb") as f:
|
||||
f.write(tensor_result.SerializeToString())
|
||||
|
||||
|
||||
|
@ -424,9 +518,19 @@ def main():
|
|||
else:
|
||||
print("Directory existed. test data files will be overwritten.")
|
||||
|
||||
create_and_save_test_data(args.model, output_dir, args.batch_size, args.sequence_length, args.samples, args.seed,
|
||||
args.verbose, args.input_ids_name, args.segment_ids_name, args.input_mask_name,
|
||||
args.only_input_tensors)
|
||||
create_and_save_test_data(
|
||||
args.model,
|
||||
output_dir,
|
||||
args.batch_size,
|
||||
args.sequence_length,
|
||||
args.samples,
|
||||
args.seed,
|
||||
args.verbose,
|
||||
args.input_ids_name,
|
||||
args.segment_ids_name,
|
||||
args.input_mask_name,
|
||||
args.only_input_tensors,
|
||||
)
|
||||
|
||||
print("Test data is saved to directory:", output_dir)
|
||||
|
||||
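As a hedged usage sketch of the reformatted helper above, the call below shows how create_and_save_test_data could be driven directly from Python instead of the command line; the model and output paths are placeholders, not files from this change.

# Minimal sketch, assuming bert_test_data is importable and "bert.onnx" exists (placeholder path).
from bert_test_data import create_and_save_test_data

create_and_save_test_data(
    model="bert.onnx",             # placeholder: path of the ONNX BERT model
    output_dir="./bert_test_data", # placeholder: test_data_set_* folders are written here
    batch_size=1,
    sequence_length=128,
    test_cases=1,
    seed=3,
    verbose=False,
    input_ids_name=None,           # None lets get_bert_inputs deduce the input names
    segment_ids_name=None,
    input_mask_name=None,
    only_input_tensors=False,      # also run the model and save output_*.pb files
)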
|
|
|
@ -1,27 +1,28 @@
|
|||
#-------------------------------------------------------------------------
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
#--------------------------------------------------------------------------
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
# It is a tool to compare the inference results of the original model and optimized model.
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
import numpy as np
|
||||
import csv
|
||||
import os
|
||||
import random
|
||||
from pathlib import Path
|
||||
import statistics
|
||||
import sys
|
||||
import timeit
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import onnx
|
||||
import onnx.utils
|
||||
import psutil
|
||||
import csv
|
||||
import timeit
|
||||
from datetime import datetime
|
||||
from bert_perf_test import create_session, onnxruntime_inference
|
||||
from bert_test_data import generate_test_data, get_bert_inputs, output_test_data
|
||||
from onnx import ModelProto, TensorProto, numpy_helper
|
||||
from onnx_model import OnnxModel
|
||||
from bert_test_data import get_bert_inputs, generate_test_data, output_test_data
|
||||
from bert_perf_test import create_session, onnxruntime_inference
|
||||
|
||||
|
||||
def run_model(model_path, all_inputs, use_gpu, disable_optimization):
|
||||
|
@ -64,51 +65,75 @@ def compare(baseline_results, treatment_results, verbose, rtol=1e-3, atol=1e-4):
|
|||
print("rel_diff={} abs_diff={}".format(rel_diff, abs_diff))
|
||||
|
||||
if diff_count == 0:
|
||||
print("100% passed for {} random inputs given thresholds (rtol={}, atol={}).".format(
|
||||
len(baseline_results), rtol, atol))
|
||||
print(
|
||||
"100% passed for {} random inputs given thresholds (rtol={}, atol={}).".format(
|
||||
len(baseline_results), rtol, atol
|
||||
)
|
||||
)
|
||||
else:
|
||||
print("WARNING: {} out of {} results NOT passed for thresholds (rtol={}, atol={}).".format(
|
||||
diff_count, len(baseline_results), rtol, atol))
|
||||
print(
|
||||
"WARNING: {} out of {} results NOT passed for thresholds (rtol={}, atol={}).".format(
|
||||
diff_count, len(baseline_results), rtol, atol
|
||||
)
|
||||
)
|
||||
|
||||
print("maximum absolute difference={}".format(max_abs_diff))
|
||||
|
||||
print("maximum relative difference={}".format(max_rel_diff))
|
||||
|
||||
|
||||
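The pass/fail logic above boils down to an element-wise rtol/atol comparison per output; a minimal self-contained sketch of that check (with made-up numbers, not data from this PR) is:

# Illustrative only: mimics the per-output check that compare() performs.
import numpy as np

baseline = np.array([1.0000, 2.0000, 3.0000], dtype=np.float32)
treatment = np.array([1.0005, 2.0010, 3.0000], dtype=np.float32)

abs_diff = np.amax(np.abs(treatment - baseline))
rel_diff = np.amax(np.abs((treatment - baseline) / baseline))
passed = np.allclose(treatment, baseline, rtol=1e-3, atol=1e-4)
print(f"rel_diff={rel_diff} abs_diff={abs_diff} passed={passed}")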
def run_test(baseline_model, optimized_model, output_dir, batch_size, sequence_length, use_gpu, test_cases, seed,
|
||||
verbose, rtol, atol, input_ids_name, segment_ids_name, input_mask_name):
|
||||
def run_test(
|
||||
baseline_model,
|
||||
optimized_model,
|
||||
output_dir,
|
||||
batch_size,
|
||||
sequence_length,
|
||||
use_gpu,
|
||||
test_cases,
|
||||
seed,
|
||||
verbose,
|
||||
rtol,
|
||||
atol,
|
||||
input_ids_name,
|
||||
segment_ids_name,
|
||||
input_mask_name,
|
||||
):
|
||||
|
||||
# Try deduce input names from optimized model.
|
||||
input_ids, segment_ids, input_mask = get_bert_inputs(optimized_model, input_ids_name, segment_ids_name,
|
||||
input_mask_name)
|
||||
input_ids, segment_ids, input_mask = get_bert_inputs(
|
||||
optimized_model, input_ids_name, segment_ids_name, input_mask_name
|
||||
)
|
||||
|
||||
# Use random mask length for accuracy test. It might introduce slight inflation in latency reported in this script.
|
||||
all_inputs = generate_test_data(batch_size,
|
||||
sequence_length,
|
||||
test_cases,
|
||||
seed,
|
||||
verbose,
|
||||
input_ids,
|
||||
segment_ids,
|
||||
input_mask,
|
||||
random_mask_length=True)
|
||||
all_inputs = generate_test_data(
|
||||
batch_size,
|
||||
sequence_length,
|
||||
test_cases,
|
||||
seed,
|
||||
verbose,
|
||||
input_ids,
|
||||
segment_ids,
|
||||
input_mask,
|
||||
random_mask_length=True,
|
||||
)
|
||||
|
||||
baseline_results, baseline_latency, output_names = run_model(baseline_model,
|
||||
all_inputs,
|
||||
use_gpu,
|
||||
disable_optimization=True)
|
||||
baseline_results, baseline_latency, output_names = run_model(
|
||||
baseline_model, all_inputs, use_gpu, disable_optimization=True
|
||||
)
|
||||
if verbose:
|
||||
print("baseline average latency (all optimizations disabled): {} ms".format(
|
||||
statistics.mean(baseline_latency) * 1000))
|
||||
print(
|
||||
"baseline average latency (all optimizations disabled): {} ms".format(
|
||||
statistics.mean(baseline_latency) * 1000
|
||||
)
|
||||
)
|
||||
|
||||
if output_dir is not None:
|
||||
for i, inputs in enumerate(all_inputs):
|
||||
output_test_data(output_dir, i, inputs)
|
||||
|
||||
treatment_results, treatment_latency, treatment_output_names = run_model(optimized_model,
|
||||
all_inputs,
|
||||
use_gpu,
|
||||
disable_optimization=False)
|
||||
treatment_results, treatment_latency, treatment_output_names = run_model(
|
||||
optimized_model, all_inputs, use_gpu, disable_optimization=False
|
||||
)
|
||||
if verbose:
|
||||
print("treatment average latency: {} ms".format(statistics.mean(treatment_latency) * 1000))
|
||||
|
||||
|
@ -118,41 +143,79 @@ def run_test(baseline_model, optimized_model, output_dir, batch_size, sequence_l
|
|||
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--baseline_model', required=True, type=str, help="baseline onnx model path.")
|
||||
parser.add_argument("--baseline_model", required=True, type=str, help="baseline onnx model path.")
|
||||
|
||||
parser.add_argument('--optimized_model',
|
||||
required=True,
|
||||
type=str,
|
||||
default=None,
|
||||
help="path of the optimized model. It shall have same inputs as the baseline model.")
|
||||
parser.add_argument(
|
||||
"--optimized_model",
|
||||
required=True,
|
||||
type=str,
|
||||
default=None,
|
||||
help="path of the optimized model. It shall have same inputs as the baseline model.",
|
||||
)
|
||||
|
||||
parser.add_argument('--output_dir',
|
||||
required=False,
|
||||
type=str,
|
||||
default=None,
|
||||
help="output test data path. If not specified, test data will not be saved.")
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
required=False,
|
||||
type=str,
|
||||
default=None,
|
||||
help="output test data path. If not specified, test data will not be saved.",
|
||||
)
|
||||
|
||||
parser.add_argument('--batch_size', required=True, type=int, help="batch size of input")
|
||||
parser.add_argument("--batch_size", required=True, type=int, help="batch size of input")
|
||||
|
||||
parser.add_argument('--sequence_length', required=True, type=int, help="maximum sequence length of input")
|
||||
parser.add_argument(
|
||||
"--sequence_length",
|
||||
required=True,
|
||||
type=int,
|
||||
help="maximum sequence length of input",
|
||||
)
|
||||
|
||||
parser.add_argument('--rtol', required=False, type=float, default=1e-3, help="relative tolerance")
|
||||
parser.add_argument("--rtol", required=False, type=float, default=1e-3, help="relative tolerance")
|
||||
|
||||
parser.add_argument('--atol', required=False, type=float, default=1e-4, help="absolute tolerance")
|
||||
parser.add_argument("--atol", required=False, type=float, default=1e-4, help="absolute tolerance")
|
||||
|
||||
parser.add_argument('--samples', required=False, type=int, default=100, help="number of test cases to be generated")
|
||||
parser.add_argument(
|
||||
"--samples",
|
||||
required=False,
|
||||
type=int,
|
||||
default=100,
|
||||
help="number of test cases to be generated",
|
||||
)
|
||||
|
||||
parser.add_argument('--seed', required=False, type=int, default=3, help="random seed")
|
||||
parser.add_argument("--seed", required=False, type=int, default=3, help="random seed")
|
||||
|
||||
parser.add_argument('--use_gpu', required=False, action='store_true', help="use GPU")
|
||||
parser.add_argument("--use_gpu", required=False, action="store_true", help="use GPU")
|
||||
parser.set_defaults(use_gpu=False)
|
||||
|
||||
parser.add_argument('--verbose', required=False, action='store_true', help="print verbose information")
|
||||
parser.add_argument(
|
||||
"--verbose",
|
||||
required=False,
|
||||
action="store_true",
|
||||
help="print verbose information",
|
||||
)
|
||||
parser.set_defaults(verbose=False)
|
||||
|
||||
parser.add_argument('--input_ids', required=False, type=str, default=None, help="input name for input ids")
|
||||
parser.add_argument('--segment_ids', required=False, type=str, default=None, help="input name for segment ids")
|
||||
parser.add_argument('--input_mask', required=False, type=str, default=None, help="input name for attention mask")
|
||||
parser.add_argument(
|
||||
"--input_ids",
|
||||
required=False,
|
||||
type=str,
|
||||
default=None,
|
||||
help="input name for input ids",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--segment_ids",
|
||||
required=False,
|
||||
type=str,
|
||||
default=None,
|
||||
help="input name for segment ids",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--input_mask",
|
||||
required=False,
|
||||
type=str,
|
||||
default=None,
|
||||
help="input name for attention mask",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
@ -166,9 +229,22 @@ def main():
|
|||
path = Path(args.output_dir)
|
||||
path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
run_test(args.baseline_model, args.optimized_model, args.output_dir, args.batch_size, args.sequence_length,
|
||||
args.use_gpu, args.samples, args.seed, args.verbose, args.rtol, args.atol, args.input_ids,
|
||||
args.segment_ids, args.input_mask)
|
||||
run_test(
|
||||
args.baseline_model,
|
||||
args.optimized_model,
|
||||
args.output_dir,
|
||||
args.batch_size,
|
||||
args.sequence_length,
|
||||
args.use_gpu,
|
||||
args.samples,
|
||||
args.seed,
|
||||
args.verbose,
|
||||
args.rtol,
|
||||
args.atol,
|
||||
args.input_ids,
|
||||
args.segment_ids,
|
||||
args.input_mask,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
#-------------------------------------------------------------------------
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
#-------------------------------------------------------------------------
|
||||
# -------------------------------------------------------------------------
|
||||
"""
|
||||
This converts GPT2 or T5 model to onnx with beam search operator.
|
||||
|
||||
|
@ -13,161 +13,203 @@ Example 2: convert T5 model with beam search:
|
|||
python convert_beam_search.py -m t5-small --model_type t5 --decoder_onnx ./onnx_models/t5-small_decoder.onnx --encoder_decoder_init_onnx ./onnx_models/t5-small_encoder_decoder_init.onnx --output ./onnx_models/t5_small_beam_search.onnx
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
import onnx
|
||||
import logging
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from onnx import helper
|
||||
import numpy as np
|
||||
from typing import List, Union
|
||||
|
||||
import numpy as np
|
||||
import onnx
|
||||
import torch
|
||||
from benchmark_helper import Precision
|
||||
from onnx import helper
|
||||
from onnx import onnx_pb as onnx_proto
|
||||
from packaging import version
|
||||
from transformers import GPT2Config, T5Config
|
||||
from benchmark_helper import Precision
|
||||
from onnx import onnx_pb as onnx_proto
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), 'models', 'gpt2'))
|
||||
from gpt2_helper import PRETRAINED_GPT2_MODELS
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "models", "gpt2"))
|
||||
from convert_to_onnx import main as convert_gpt2_to_onnx
|
||||
from gpt2_helper import PRETRAINED_GPT2_MODELS
|
||||
|
||||
config: Union[GPT2Config, T5Config] = None
|
||||
|
||||
logger = logging.getLogger('')
|
||||
logger = logging.getLogger("")
|
||||
|
||||
|
||||
def parse_arguments(argv=None):
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument('-m',
|
||||
'--model_name_or_path',
|
||||
required=True,
|
||||
type=str,
|
||||
help='Model path, or pretrained model name in the list: ' + ', '.join(PRETRAINED_GPT2_MODELS))
|
||||
parser.add_argument(
|
||||
"-m",
|
||||
"--model_name_or_path",
|
||||
required=True,
|
||||
type=str,
|
||||
help="Model path, or pretrained model name in the list: " + ", ".join(PRETRAINED_GPT2_MODELS),
|
||||
)
|
||||
|
||||
parser.add_argument('--model_type',
|
||||
required=False,
|
||||
type=str,
|
||||
default="gpt2",
|
||||
choices=["gpt2", "t5"],
|
||||
help='Model type in the list: ' + ', '.join(["gpt2", "t5"]))
|
||||
parser.add_argument(
|
||||
"--model_type",
|
||||
required=False,
|
||||
type=str,
|
||||
default="gpt2",
|
||||
choices=["gpt2", "t5"],
|
||||
help="Model type in the list: " + ", ".join(["gpt2", "t5"]),
|
||||
)
|
||||
|
||||
parser.add_argument('--cache_dir',
|
||||
required=False,
|
||||
type=str,
|
||||
default=os.path.join('.', 'cache_models'),
|
||||
help='Directory to cache pre-trained models')
|
||||
parser.add_argument(
|
||||
"--cache_dir",
|
||||
required=False,
|
||||
type=str,
|
||||
default=os.path.join(".", "cache_models"),
|
||||
help="Directory to cache pre-trained models",
|
||||
)
|
||||
|
||||
parser.add_argument('--decoder_onnx',
|
||||
required=True,
|
||||
type=str,
|
||||
help='Output directory for decoder onnx model, or model path ends with .onnx')
|
||||
parser.add_argument(
|
||||
"--decoder_onnx",
|
||||
required=True,
|
||||
type=str,
|
||||
help="Output directory for decoder onnx model, or model path ends with .onnx",
|
||||
)
|
||||
|
||||
parser.add_argument('--encoder_decoder_init_onnx',
|
||||
required=False,
|
||||
type=str,
|
||||
default="",
|
||||
help='path of ONNX model for encoder and decoder initialization. Required for t5 model type.')
|
||||
parser.add_argument(
|
||||
"--encoder_decoder_init_onnx",
|
||||
required=False,
|
||||
type=str,
|
||||
default="",
|
||||
help="path of ONNX model for encoder and decoder initialization. Required for t5 model type.",
|
||||
)
|
||||
|
||||
parser.add_argument('--output',
|
||||
required=False,
|
||||
type=str,
|
||||
help='Output directory for beam search model, or model path ends with .onnx')
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
required=False,
|
||||
type=str,
|
||||
help="Output directory for beam search model, or model path ends with .onnx",
|
||||
)
|
||||
|
||||
parser.add_argument("-p",
|
||||
"--precision",
|
||||
required=False,
|
||||
type=Precision,
|
||||
default=Precision.FLOAT32,
|
||||
choices=[Precision.FLOAT32, Precision.FLOAT16],
|
||||
help="Precision of model to run. fp32 for full precision, fp16 for half or mixed precision")
|
||||
parser.add_argument(
|
||||
"-p",
|
||||
"--precision",
|
||||
required=False,
|
||||
type=Precision,
|
||||
default=Precision.FLOAT32,
|
||||
choices=[Precision.FLOAT32, Precision.FLOAT16],
|
||||
help="Precision of model to run. fp32 for full precision, fp16 for half or mixed precision",
|
||||
)
|
||||
|
||||
parser.add_argument('--use_gpu', required=False, action='store_true', help="use GPU for inference")
|
||||
parser.add_argument("--use_gpu", required=False, action="store_true", help="use GPU for inference")
|
||||
parser.set_defaults(use_gpu=False)
|
||||
|
||||
parser.add_argument('-e', '--use_external_data_format', required=False, action='store_true')
|
||||
parser.add_argument("-e", "--use_external_data_format", required=False, action="store_true")
|
||||
parser.set_defaults(use_external_data_format=False)
|
||||
|
||||
parser.add_argument('--disable_parity', required=False, action='store_true', help="do not run parity test")
|
||||
parser.add_argument(
|
||||
"--disable_parity",
|
||||
required=False,
|
||||
action="store_true",
|
||||
help="do not run parity test",
|
||||
)
|
||||
parser.set_defaults(disable_parity=False)
|
||||
|
||||
parser.add_argument('--torch_performance', required=False, action='store_true', help="test PyTorch performance")
|
||||
parser.add_argument(
|
||||
"--torch_performance",
|
||||
required=False,
|
||||
action="store_true",
|
||||
help="test PyTorch performance",
|
||||
)
|
||||
parser.set_defaults(torch_performance=False)
|
||||
|
||||
parser.add_argument('--total_runs',
|
||||
required=False,
|
||||
type=int,
|
||||
default=1,
|
||||
help='Number of times of inference for latency measurement')
|
||||
parser.add_argument(
|
||||
"--total_runs",
|
||||
required=False,
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of times of inference for latency measurement",
|
||||
)
|
||||
|
||||
beam_search_group = parser.add_argument_group("beam search options")
|
||||
|
||||
beam_search_group.add_argument('--output_sequences_scores',
|
||||
required=False,
|
||||
action='store_true',
|
||||
help="output sequences scores")
|
||||
beam_search_group.add_argument(
|
||||
"--output_sequences_scores",
|
||||
required=False,
|
||||
action="store_true",
|
||||
help="output sequences scores",
|
||||
)
|
||||
beam_search_group.set_defaults(output_sequences_scores=False)
|
||||
|
||||
beam_search_group.add_argument('--output_token_scores',
|
||||
required=False,
|
||||
action='store_true',
|
||||
help="output token scores")
|
||||
beam_search_group.add_argument(
|
||||
"--output_token_scores",
|
||||
required=False,
|
||||
action="store_true",
|
||||
help="output token scores",
|
||||
)
|
||||
beam_search_group.set_defaults(output_token_scores=False)
|
||||
|
||||
beam_search_group.add_argument('--early_stopping', required=False, action='store_true')
|
||||
beam_search_group.add_argument("--early_stopping", required=False, action="store_true")
|
||||
beam_search_group.set_defaults(early_stopping=False)
|
||||
|
||||
beam_search_group.add_argument('--min_length', type=int, required=False, default=1, help='Min sequence length')
|
||||
beam_search_group.add_argument("--min_length", type=int, required=False, default=1, help="Min sequence length")
|
||||
|
||||
beam_search_group.add_argument('--max_length', type=int, required=False, default=50, help='Max sequence length')
|
||||
|
||||
beam_search_group.add_argument('--no_repeat_ngram_size',
|
||||
type=int,
|
||||
required=False,
|
||||
default=0,
|
||||
help='No repeat ngram size')
|
||||
|
||||
beam_search_group.add_argument('--num_beams', type=int, required=False, default=4, help='Beam size')
|
||||
|
||||
beam_search_group.add_argument('--num_return_sequences',
|
||||
type=int,
|
||||
required=False,
|
||||
default=1,
|
||||
help='Number of return sequences <= num_beams')
|
||||
|
||||
beam_search_group.add_argument('--temperature',
|
||||
type=float,
|
||||
required=False,
|
||||
default=1,
|
||||
help='Softmax temperature for output logits.')
|
||||
|
||||
beam_search_group.add_argument('--length_penalty',
|
||||
type=float,
|
||||
required=False,
|
||||
default=1,
|
||||
help='Positive. >1 to penalize and <1 to encourage short sentences.')
|
||||
|
||||
beam_search_group.add_argument('--repetition_penalty',
|
||||
type=float,
|
||||
required=False,
|
||||
default=1,
|
||||
help='Positive. >1 to penalize and <1 to encourage.')
|
||||
|
||||
beam_search_group.add_argument('--vocab_size',
|
||||
type=int,
|
||||
required=False,
|
||||
default=-1,
|
||||
help="Vocab_size of the underlying model")
|
||||
beam_search_group.add_argument("--max_length", type=int, required=False, default=50, help="Max sequence length")
|
||||
|
||||
beam_search_group.add_argument(
|
||||
'--prefix_vocab_mask',
|
||||
"--no_repeat_ngram_size",
|
||||
type=int,
|
||||
required=False,
|
||||
action='store_true',
|
||||
help="This vocab mask applies only to first iteration, enable if last word in query might need auto complete")
|
||||
default=0,
|
||||
help="No repeat ngram size",
|
||||
)
|
||||
|
||||
beam_search_group.add_argument("--num_beams", type=int, required=False, default=4, help="Beam size")
|
||||
|
||||
beam_search_group.add_argument(
|
||||
"--num_return_sequences",
|
||||
type=int,
|
||||
required=False,
|
||||
default=1,
|
||||
help="Number of return sequence <= num_beams",
|
||||
)
|
||||
|
||||
beam_search_group.add_argument(
|
||||
"--temperature",
|
||||
type=float,
|
||||
required=False,
|
||||
default=1,
|
||||
help="Softmax temperature for output logits.",
|
||||
)
|
||||
|
||||
beam_search_group.add_argument(
|
||||
"--length_penalty",
|
||||
type=float,
|
||||
required=False,
|
||||
default=1,
|
||||
help="Positive. >1 to penalize and <1 to encorage short sentence.",
|
||||
)
|
||||
|
||||
beam_search_group.add_argument(
|
||||
"--repetition_penalty",
|
||||
type=float,
|
||||
required=False,
|
||||
default=1,
|
||||
help="Positive. >1 to penalize and <1 to encorage.",
|
||||
)
|
||||
|
||||
beam_search_group.add_argument(
|
||||
"--vocab_size",
|
||||
type=int,
|
||||
required=False,
|
||||
default=-1,
|
||||
help="Vocab_size of the underlying model",
|
||||
)
|
||||
|
||||
beam_search_group.add_argument(
|
||||
"--prefix_vocab_mask",
|
||||
required=False,
|
||||
action="store_true",
|
||||
help="This vocab mask applies only to first iteration, enable if last word in query might need auto complete",
|
||||
)
|
||||
beam_search_group.set_defaults(prefix_vocab_mask=False)
|
||||
|
||||
args = parser.parse_args(argv)
|
||||
|
@ -180,39 +222,40 @@ def gpt2_to_onnx(args):
|
|||
|
||||
print(f"use convert_to_onnx.py to convert model {model_name} to onnx {args.decoder_onnx} ...")
|
||||
arguments = [
|
||||
'--model_name_or_path',
|
||||
"--model_name_or_path",
|
||||
model_name,
|
||||
'--output',
|
||||
"--output",
|
||||
args.decoder_onnx,
|
||||
'--optimize_onnx',
|
||||
'--precision',
|
||||
'fp32' if args.precision == Precision.FLOAT32 else 'fp16',
|
||||
'--test_runs',
|
||||
'1',
|
||||
'--test_cases',
|
||||
'10',
|
||||
'--use_int32_inputs' # BeamSearch requires int32 for input_ids, position_ids and attention_mask
|
||||
"--optimize_onnx",
|
||||
"--precision",
|
||||
"fp32" if args.precision == Precision.FLOAT32 else "fp16",
|
||||
"--test_runs",
|
||||
"1",
|
||||
"--test_cases",
|
||||
"10",
|
||||
"--use_int32_inputs", # BeamSearch requires to use int32 for input_ids, postion_ids and attention_mask
|
||||
]
|
||||
if args.use_gpu:
|
||||
arguments.append('--use_gpu')
|
||||
arguments.append("--use_gpu")
|
||||
if args.use_external_data_format:
|
||||
arguments.append('--use_external_data_format')
|
||||
arguments.append("--use_external_data_format")
|
||||
|
||||
if args.precision == Precision.FLOAT16:
|
||||
assert args.use_gpu, "fp16 or mixed precision model cannot run on CPU. Please add --use_gpu"
|
||||
# TODO: Use auto mixed precision for fp16 conversion: arguments.append('--auto_mixed_precision')
|
||||
# Need change cuda kernel to support a combination of fp32 logits and fp16 past state.
|
||||
# Currently logits and past state shall be same data type.
|
||||
arguments.extend(['--op_block_list', 'Add', 'LayerNormalization', 'FastGelu'])
|
||||
arguments.extend(["--op_block_list", "Add", "LayerNormalization", "FastGelu"])
|
||||
convert_gpt2_to_onnx(arguments)
|
||||
|
||||
|
||||
def shape_inference(decoder_onnx_path):
|
||||
if version.parse(onnx.__version__) >= version.parse('1.11.0'):
|
||||
if version.parse(onnx.__version__) >= version.parse("1.11.0"):
|
||||
logger.warn("SymbolicShapeInference might fail using onnx version 1.11. Please install 1.10.0 for now.")
|
||||
|
||||
# Run symbolic shape inference to work around an ORT shape inference issue for subgraphs.
|
||||
from onnxruntime.tools.symbolic_shape_infer import SymbolicShapeInference
|
||||
|
||||
out = SymbolicShapeInference.infer_shapes(onnx.load(decoder_onnx_path), auto_merge=True, guess_output_rank=False)
|
||||
if out:
|
||||
# TODO: Use external format if input has extra data.
|
||||
|
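For context, the shape-inference step above can also be run standalone; a hedged sketch follows (the decoder path is a placeholder, and the call mirrors the one used in shape_inference).

# Hedged sketch: run symbolic shape inference on a decoder model and save the result.
import onnx
from onnxruntime.tools.symbolic_shape_infer import SymbolicShapeInference

decoder_path = "gpt2_decoder.onnx"  # placeholder path, not a file from this change
model = onnx.load(decoder_path)
inferred = SymbolicShapeInference.infer_shapes(model, auto_merge=True, guess_output_rank=False)
if inferred:
    onnx.save(inferred, decoder_path.replace(".onnx", "_shaped.onnx"))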
@ -222,12 +265,15 @@ def shape_inference(decoder_onnx_path):
|
|||
|
||||
|
||||
def create_ort_session(model_path, use_gpu):
|
||||
from onnxruntime import SessionOptions, InferenceSession, __version__ as ort_version, GraphOptimizationLevel, get_available_providers
|
||||
from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions
|
||||
from onnxruntime import __version__ as ort_version
|
||||
from onnxruntime import get_available_providers
|
||||
|
||||
sess_options = SessionOptions()
|
||||
sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL
|
||||
execution_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider'] if use_gpu else ['CPUExecutionProvider']
|
||||
execution_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] if use_gpu else ["CPUExecutionProvider"]
|
||||
if use_gpu:
|
||||
if 'CUDAExecutionProvider' not in get_available_providers():
|
||||
if "CUDAExecutionProvider" not in get_available_providers():
|
||||
raise RuntimeError("CUDAExecutionProvider is not avaiable for --use_gpu!")
|
||||
else:
|
||||
print("use CUDAExecutionProvider")
|
||||
|
@ -237,12 +283,12 @@ def create_ort_session(model_path, use_gpu):
|
|||
|
||||
|
||||
def verify_gpt2_subgraph(graph, precision):
|
||||
is_float16 = (Precision.FLOAT16 == precision)
|
||||
is_float16 = Precision.FLOAT16 == precision
|
||||
|
||||
input_count = len(graph.input)
|
||||
layer_count = input_count - 3
|
||||
|
||||
expected_inputs = ['input_ids', 'position_ids', 'attention_mask'] + [f"past_{i}" for i in range(layer_count)]
|
||||
expected_inputs = ["input_ids", "position_ids", "attention_mask"] + [f"past_{i}" for i in range(layer_count)]
|
||||
if len(graph.input) != len(expected_inputs):
|
||||
raise ValueError(f"Number of inputs expected to be {len(expected_inputs)}. Got {len(graph.input)}")
|
||||
|
||||
|
@ -260,7 +306,7 @@ def verify_gpt2_subgraph(graph, precision):
|
|||
)
|
||||
print("Verifying GPT-2 graph inputs: name and data type are good.")
|
||||
|
||||
expected_outputs = ['logits'] + [f"present_{i}" for i in range(layer_count)]
|
||||
expected_outputs = ["logits"] + [f"present_{i}" for i in range(layer_count)]
|
||||
if len(graph.output) != len(expected_outputs):
|
||||
raise ValueError(f"Number of outputs expected to be {len(expected_outputs)}. Got {len(graph.output)}")
|
||||
|
||||
|
@ -327,8 +373,15 @@ def convert_model(args):
|
|||
verify_t5_decoder_subgraph(model.graph, args.precision)
|
||||
|
||||
inputs = [
|
||||
"input_ids", "max_length", "min_length", "num_beams", "num_return_sequences", "temperature", "length_penalty",
|
||||
"repetition_penalty", "vocab_mask"
|
||||
"input_ids",
|
||||
"max_length",
|
||||
"min_length",
|
||||
"num_beams",
|
||||
"num_return_sequences",
|
||||
"temperature",
|
||||
"length_penalty",
|
||||
"repetition_penalty",
|
||||
"vocab_mask",
|
||||
]
|
||||
if args.prefix_vocab_mask:
|
||||
inputs.append("prefix_vocab_mask")
|
||||
|
@ -341,16 +394,23 @@ def convert_model(args):
|
|||
assert args.output_sequences_scores, "--output_token_scores requires --output_sequences_scores"
|
||||
outputs.append("scores")
|
||||
|
||||
node = helper.make_node('BeamSearch', inputs=inputs, outputs=outputs, name=f'BeamSearch_{args.model_type}')
|
||||
node = helper.make_node(
|
||||
"BeamSearch",
|
||||
inputs=inputs,
|
||||
outputs=outputs,
|
||||
name=f"BeamSearch_{args.model_type}",
|
||||
)
|
||||
node.domain = "com.microsoft"
|
||||
node.attribute.extend([
|
||||
helper.make_attribute("eos_token_id", eos_token_id),
|
||||
helper.make_attribute("pad_token_id", pad_token_id),
|
||||
helper.make_attribute("no_repeat_ngram_size", args.no_repeat_ngram_size),
|
||||
helper.make_attribute("early_stopping", 1 if args.early_stopping else 0),
|
||||
helper.make_attribute("model_type", 0 if args.model_type == "gpt2" else 1),
|
||||
helper.make_attribute("decoder", model.graph),
|
||||
])
|
||||
node.attribute.extend(
|
||||
[
|
||||
helper.make_attribute("eos_token_id", eos_token_id),
|
||||
helper.make_attribute("pad_token_id", pad_token_id),
|
||||
helper.make_attribute("no_repeat_ngram_size", args.no_repeat_ngram_size),
|
||||
helper.make_attribute("early_stopping", 1 if args.early_stopping else 0),
|
||||
helper.make_attribute("model_type", 0 if args.model_type == "gpt2" else 1),
|
||||
helper.make_attribute("decoder", model.graph),
|
||||
]
|
||||
)
|
||||
|
||||
if args.model_type == "t5":
|
||||
if enable_shape_inference:
|
||||
|
@ -359,42 +419,59 @@ def convert_model(args):
|
|||
init_model = onnx.load(args.encoder_decoder_init_onnx)
|
||||
init_model.graph.name = f"{args.model_type} encoder decoder init subgraph"
|
||||
verify_t5_encoder_decoder_init_subgraph(init_model.graph, args.precision)
|
||||
node.attribute.extend([
|
||||
helper.make_attribute("encoder_decoder_init", init_model.graph),
|
||||
])
|
||||
node.attribute.extend(
|
||||
[
|
||||
helper.make_attribute("encoder_decoder_init", init_model.graph),
|
||||
]
|
||||
)
|
||||
|
||||
from onnx import TensorProto
|
||||
|
||||
# graph inputs
|
||||
input_ids = helper.make_tensor_value_info('input_ids', TensorProto.INT32, ['batch_size', 'sequence_length'])
|
||||
max_length = helper.make_tensor_value_info('max_length', TensorProto.INT32, [1])
|
||||
min_length = helper.make_tensor_value_info('min_length', TensorProto.INT32, [1])
|
||||
num_beams = helper.make_tensor_value_info('num_beams', TensorProto.INT32, [1])
|
||||
num_return_sequences = helper.make_tensor_value_info('num_return_sequences', TensorProto.INT32, [1])
|
||||
temperature = helper.make_tensor_value_info('temperature', TensorProto.FLOAT, [1])
|
||||
length_penalty = helper.make_tensor_value_info('length_penalty', TensorProto.FLOAT, [1])
|
||||
repetition_penalty = helper.make_tensor_value_info('repetition_penalty', TensorProto.FLOAT, [1])
|
||||
vocab_mask = helper.make_tensor_value_info('vocab_mask', TensorProto.INT32, [vocab_size])
|
||||
input_ids = helper.make_tensor_value_info("input_ids", TensorProto.INT32, ["batch_size", "sequence_length"])
|
||||
max_length = helper.make_tensor_value_info("max_length", TensorProto.INT32, [1])
|
||||
min_length = helper.make_tensor_value_info("min_length", TensorProto.INT32, [1])
|
||||
num_beams = helper.make_tensor_value_info("num_beams", TensorProto.INT32, [1])
|
||||
num_return_sequences = helper.make_tensor_value_info("num_return_sequences", TensorProto.INT32, [1])
|
||||
temperature = helper.make_tensor_value_info("temperature", TensorProto.FLOAT, [1])
|
||||
length_penalty = helper.make_tensor_value_info("length_penalty", TensorProto.FLOAT, [1])
|
||||
repetition_penalty = helper.make_tensor_value_info("repetition_penalty", TensorProto.FLOAT, [1])
|
||||
vocab_mask = helper.make_tensor_value_info("vocab_mask", TensorProto.INT32, [vocab_size])
|
||||
|
||||
graph_inputs = [
|
||||
input_ids, max_length, min_length, num_beams, num_return_sequences, temperature, length_penalty,
|
||||
repetition_penalty, vocab_mask
|
||||
input_ids,
|
||||
max_length,
|
||||
min_length,
|
||||
num_beams,
|
||||
num_return_sequences,
|
||||
temperature,
|
||||
length_penalty,
|
||||
repetition_penalty,
|
||||
vocab_mask,
|
||||
]
|
||||
|
||||
if args.prefix_vocab_mask:
|
||||
prefix_vocab_mask = helper.make_tensor_value_info('prefix_vocab_mask', TensorProto.INT32,
|
||||
['batch_size', vocab_size])
|
||||
prefix_vocab_mask = helper.make_tensor_value_info(
|
||||
"prefix_vocab_mask", TensorProto.INT32, ["batch_size", vocab_size]
|
||||
)
|
||||
graph_inputs.append(prefix_vocab_mask)
|
||||
|
||||
# graph outputs
|
||||
sequences = helper.make_tensor_value_info('sequences', TensorProto.INT32,
|
||||
['batch_size', 'num_return_sequences', 'max_length'])
|
||||
sequences = helper.make_tensor_value_info(
|
||||
"sequences",
|
||||
TensorProto.INT32,
|
||||
["batch_size", "num_return_sequences", "max_length"],
|
||||
)
|
||||
|
||||
sequences_scores = helper.make_tensor_value_info('sequences_scores', TensorProto.FLOAT,
|
||||
['batch_size', 'num_return_sequences'])
|
||||
sequences_scores = helper.make_tensor_value_info(
|
||||
"sequences_scores", TensorProto.FLOAT, ["batch_size", "num_return_sequences"]
|
||||
)
|
||||
|
||||
scores = helper.make_tensor_value_info('scores', TensorProto.FLOAT,
|
||||
['max_length - sequence_length', 'batch_size', 'num_beams', vocab_size])
|
||||
scores = helper.make_tensor_value_info(
|
||||
"scores",
|
||||
TensorProto.FLOAT,
|
||||
["max_length - sequence_length", "batch_size", "num_beams", vocab_size],
|
||||
)
|
||||
|
||||
initializers = []
|
||||
|
||||
|
@ -406,10 +483,20 @@ def convert_model(args):
|
|||
if args.output_token_scores:
|
||||
graph_outputs.append(scores)
|
||||
|
||||
new_graph = helper.make_graph([node], f'{args.model_type}-beam-search', graph_inputs, graph_outputs, initializers)
|
||||
new_graph = helper.make_graph(
|
||||
[node],
|
||||
f"{args.model_type}-beam-search",
|
||||
graph_inputs,
|
||||
graph_outputs,
|
||||
initializers,
|
||||
)
|
||||
|
||||
# Create the model
|
||||
new_model = helper.make_model(new_graph, producer_name='onnxruntime.transformers', opset_imports=model.opset_import)
|
||||
new_model = helper.make_model(
|
||||
new_graph,
|
||||
producer_name="onnxruntime.transformers",
|
||||
opset_imports=model.opset_import,
|
||||
)
|
||||
onnx.save(new_model, args.output)
|
||||
|
||||
|
||||
|
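Once args.output has been saved, the exported graph can be fed directly with the scalar control tensors declared above. A hedged sketch follows; the path, input shapes, and the GPT-2 vocab size are illustrative assumptions, not values from this PR.

# Hedged sketch: run the exported beam-search model with onnxruntime on CPU.
import numpy as np
import onnxruntime

sess = onnxruntime.InferenceSession("gpt2_beam_search.onnx", providers=["CPUExecutionProvider"])
vocab_size = 50257  # GPT-2 vocab size, assumed here for illustration
ort_inputs = {
    "input_ids": np.zeros((1, 8), dtype=np.int32),
    "max_length": np.array([50], dtype=np.int32),
    "min_length": np.array([1], dtype=np.int32),
    "num_beams": np.array([4], dtype=np.int32),
    "num_return_sequences": np.array([1], dtype=np.int32),
    "temperature": np.array([1.0], dtype=np.float32),
    "length_penalty": np.array([1.0], dtype=np.float32),
    "repetition_penalty": np.array([1.0], dtype=np.float32),
    "vocab_mask": np.ones((vocab_size,), dtype=np.int32),
}
sequences = sess.run(None, ort_inputs)[0]
print(sequences.shape)  # (batch_size, num_return_sequences, max_length)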
@ -431,25 +518,28 @@ def test_torch_performance(args, model, input_ids, attention_mask, eos_token_id,
|
|||
torch_latency = []
|
||||
for _ in range(args.total_runs):
|
||||
start = time.time()
|
||||
_ = model.generate(input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
max_length=args.max_length,
|
||||
min_length=args.min_length,
|
||||
num_beams=args.num_beams,
|
||||
early_stopping=args.early_stopping,
|
||||
no_repeat_ngram_size=args.no_repeat_ngram_size,
|
||||
eos_token_id=eos_token_id,
|
||||
pad_token_id=pad_token_id,
|
||||
num_return_sequences=args.num_return_sequences,
|
||||
temperature=args.temperature,
|
||||
length_penalty=args.length_penalty,
|
||||
repetition_penalty=args.repetition_penalty,
|
||||
bad_words_ids=bad_words_ids,
|
||||
return_dict_in_generate=True,
|
||||
output_scores=args.output_sequences_scores or args.output_token_scores)
|
||||
_ = model.generate(
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
max_length=args.max_length,
|
||||
min_length=args.min_length,
|
||||
num_beams=args.num_beams,
|
||||
early_stopping=args.early_stopping,
|
||||
no_repeat_ngram_size=args.no_repeat_ngram_size,
|
||||
eos_token_id=eos_token_id,
|
||||
pad_token_id=pad_token_id,
|
||||
num_return_sequences=args.num_return_sequences,
|
||||
temperature=args.temperature,
|
||||
length_penalty=args.length_penalty,
|
||||
repetition_penalty=args.repetition_penalty,
|
||||
bad_words_ids=bad_words_ids,
|
||||
return_dict_in_generate=True,
|
||||
output_scores=args.output_sequences_scores or args.output_token_scores,
|
||||
)
|
||||
torch_latency.append(time.time() - start)
|
||||
batch_size = input_ids.shape[0]
|
||||
from benchmark_helper import get_latency_result
|
||||
|
||||
return get_latency_result(torch_latency, batch_size)
|
||||
|
||||
|
||||
|
@ -469,21 +559,27 @@ def test_model(args, use_vocab_mask: bool = False, sentences: List[str] = None):
|
|||
print("Skipping parity test as prefix vocab mask is not implemented by Hugging Face")
|
||||
return True
|
||||
|
||||
from transformers import GPT2Tokenizer, GPT2LMHeadModel
|
||||
from transformers import GPT2LMHeadModel, GPT2Tokenizer
|
||||
|
||||
tokenizer = GPT2Tokenizer.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
|
||||
tokenizer.padding_side = "left"
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
|
||||
model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path,
|
||||
cache_dir=args.cache_dir,
|
||||
pad_token_id=tokenizer.eos_token_id)
|
||||
model = GPT2LMHeadModel.from_pretrained(
|
||||
args.model_name_or_path,
|
||||
cache_dir=args.cache_dir,
|
||||
pad_token_id=tokenizer.eos_token_id,
|
||||
)
|
||||
|
||||
# Use different length sentences to test batching
|
||||
if sentences is None:
|
||||
sentences = ["The product is released", "I enjoy walking in the park", "Test best way to invest"]
|
||||
sentences = [
|
||||
"The product is released",
|
||||
"I enjoy walking in the park",
|
||||
"Test best way to invest",
|
||||
]
|
||||
|
||||
inputs = tokenizer(sentences, return_tensors='pt', padding=True)
|
||||
inputs = tokenizer(sentences, return_tensors="pt", padding=True)
|
||||
input_ids = inputs["input_ids"]
|
||||
attention_mask = inputs["attention_mask"]
|
||||
|
||||
|
@ -503,24 +599,26 @@ def test_model(args, use_vocab_mask: bool = False, sentences: List[str] = None):
|
|||
|
||||
torch_decoded_sequences = []
|
||||
if not args.disable_parity:
|
||||
print('-' * 50)
|
||||
print("-" * 50)
|
||||
print("Test PyTorch model and beam search with huggingface transformers...")
|
||||
beam_outputs = model.generate(input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
max_length=args.max_length,
|
||||
min_length=args.min_length,
|
||||
num_beams=args.num_beams,
|
||||
early_stopping=args.early_stopping,
|
||||
no_repeat_ngram_size=args.no_repeat_ngram_size,
|
||||
eos_token_id=eos_token_id,
|
||||
pad_token_id=pad_token_id,
|
||||
num_return_sequences=args.num_return_sequences,
|
||||
temperature=args.temperature,
|
||||
length_penalty=args.length_penalty,
|
||||
repetition_penalty=args.repetition_penalty,
|
||||
bad_words_ids=bad_words_ids,
|
||||
return_dict_in_generate=True,
|
||||
output_scores=args.output_sequences_scores or args.output_token_scores)
|
||||
beam_outputs = model.generate(
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
max_length=args.max_length,
|
||||
min_length=args.min_length,
|
||||
num_beams=args.num_beams,
|
||||
early_stopping=args.early_stopping,
|
||||
no_repeat_ngram_size=args.no_repeat_ngram_size,
|
||||
eos_token_id=eos_token_id,
|
||||
pad_token_id=pad_token_id,
|
||||
num_return_sequences=args.num_return_sequences,
|
||||
temperature=args.temperature,
|
||||
length_penalty=args.length_penalty,
|
||||
repetition_penalty=args.repetition_penalty,
|
||||
bad_words_ids=bad_words_ids,
|
||||
return_dict_in_generate=True,
|
||||
output_scores=args.output_sequences_scores or args.output_token_scores,
|
||||
)
|
||||
print("input_ids", input_ids)
|
||||
print("huggingface transformers outputs:")
|
||||
print("sequences", beam_outputs.sequences)
|
||||
|
@ -533,7 +631,7 @@ def test_model(args, use_vocab_mask: bool = False, sentences: List[str] = None):
|
|||
torch_decoded_sequences.append(decoded_sequence)
|
||||
print("{}: {}".format(i, decoded_sequence))
|
||||
|
||||
print('-' * 50)
|
||||
print("-" * 50)
|
||||
print("Test ONNX model and bream search with onnxruntime...")
|
||||
|
||||
ort_session = create_ort_session(args.output, args.use_gpu)
|
||||
|
@ -552,15 +650,16 @@ def test_model(args, use_vocab_mask: bool = False, sentences: List[str] = None):
|
|||
"temperature": np.array([args.temperature], dtype=np.float32),
|
||||
"length_penalty": np.array([args.length_penalty], dtype=np.float32),
|
||||
"repetition_penalty": np.array([args.repetition_penalty], dtype=np.float32),
|
||||
"vocab_mask": vocab_mask
|
||||
"vocab_mask": vocab_mask,
|
||||
}
|
||||
|
||||
test_data_dir = Path(args.output).parent.as_posix()
|
||||
print("test_data_dir", test_data_dir)
|
||||
from bert_test_data import output_test_data
|
||||
|
||||
all_inputs = [inputs]
|
||||
for i, inputs in enumerate(all_inputs):
|
||||
dir = os.path.join(test_data_dir, 'test_data_set_' + str(i))
|
||||
dir = os.path.join(test_data_dir, "test_data_set_" + str(i))
|
||||
output_test_data(dir, inputs)
|
||||
|
||||
print("inputs", inputs)
|
||||
|
@ -573,6 +672,7 @@ def test_model(args, use_vocab_mask: bool = False, sentences: List[str] = None):
|
|||
latency.append(time.time() - start)
|
||||
batch_size = input_ids.shape[0]
|
||||
from benchmark_helper import get_latency_result
|
||||
|
||||
output = get_latency_result(latency, batch_size)
|
||||
|
||||
print("ORT outputs:")
|
||||
|
@ -604,13 +704,20 @@ def test_model(args, use_vocab_mask: bool = False, sentences: List[str] = None):
|
|||
print(ort_decoded_sequences)
|
||||
print("-" * 50)
|
||||
# Compare the generated text instead of word IDs since ORT pads to max sequence length but Torch does not.
|
||||
is_same = (torch_decoded_sequences == ort_decoded_sequences)
|
||||
is_same = torch_decoded_sequences == ort_decoded_sequences
|
||||
print("Torch and ORT result is ", "same" if is_same else "different")
|
||||
output["parity"] = is_same
|
||||
|
||||
if args.torch_performance:
|
||||
torch_latency_output = test_torch_performance(args, model, input_ids, attention_mask, eos_token_id,
|
||||
pad_token_id, bad_words_ids)
|
||||
torch_latency_output = test_torch_performance(
|
||||
args,
|
||||
model,
|
||||
input_ids,
|
||||
attention_mask,
|
||||
eos_token_id,
|
||||
pad_token_id,
|
||||
bad_words_ids,
|
||||
)
|
||||
print("Torch Latency", torch_latency_output)
|
||||
|
||||
print("ORT", output)
|
||||
|
@ -630,5 +737,5 @@ def main(argv=None, sentences=None):
|
|||
return test_model(args, use_vocab_mask=True, sentences=sentences)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
@ -1,24 +1,56 @@
|
|||
#-------------------------------------------------------------------------
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
#--------------------------------------------------------------------------
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
import glob
|
||||
import os
|
||||
|
||||
import requests
|
||||
|
||||
TFMODELS = {
|
||||
"bert-base-uncased":
|
||||
("bert", "BertConfig", "", "https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip"),
|
||||
"bert-base-cased":
|
||||
("bert", "BertConfig", "", "https://storage.googleapis.com/bert_models/2019_05_30/wwm_cased_L-24_H-1024_A-16.zip"),
|
||||
"bert-large-uncased":
|
||||
("bert", "BertConfig", "", "https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip"),
|
||||
"albert-base": ("albert", "AlbertConfig", "", "https://storage.googleapis.com/albert_models/albert_base_v1.tar.gz"),
|
||||
"albert-large":
|
||||
("albert", "AlbertConfig", "", "https://storage.googleapis.com/albert_models/albert_large_v1.tar.gz"),
|
||||
"gpt-2-117M": ("gpt2", "GPT2Config", "GPT2Model", "https://storage.googleapis.com/gpt-2/models/117M"),
|
||||
"gpt-2-124M": ("gpt2", "GPT2Config", "GPT2Model", "https://storage.googleapis.com/gpt-2/models/124M")
|
||||
"bert-base-uncased": (
|
||||
"bert",
|
||||
"BertConfig",
|
||||
"",
|
||||
"https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip",
|
||||
),
|
||||
"bert-base-cased": (
|
||||
"bert",
|
||||
"BertConfig",
|
||||
"",
|
||||
"https://storage.googleapis.com/bert_models/2019_05_30/wwm_cased_L-24_H-1024_A-16.zip",
|
||||
),
|
||||
"bert-large-uncased": (
|
||||
"bert",
|
||||
"BertConfig",
|
||||
"",
|
||||
"https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip",
|
||||
),
|
||||
"albert-base": (
|
||||
"albert",
|
||||
"AlbertConfig",
|
||||
"",
|
||||
"https://storage.googleapis.com/albert_models/albert_base_v1.tar.gz",
|
||||
),
|
||||
"albert-large": (
|
||||
"albert",
|
||||
"AlbertConfig",
|
||||
"",
|
||||
"https://storage.googleapis.com/albert_models/albert_large_v1.tar.gz",
|
||||
),
|
||||
"gpt-2-117M": (
|
||||
"gpt2",
|
||||
"GPT2Config",
|
||||
"GPT2Model",
|
||||
"https://storage.googleapis.com/gpt-2/models/117M",
|
||||
),
|
||||
"gpt-2-124M": (
|
||||
"gpt2",
|
||||
"GPT2Config",
|
||||
"GPT2Model",
|
||||
"https://storage.googleapis.com/gpt-2/models/124M",
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
|
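Each TFMODELS entry above is a (model family, config class, model class, checkpoint URL) tuple. Below is a hedged sketch of consuming one entry through the helpers defined later in this file (download_tf_checkpoint and tf2pt_pipeline); network access and disk space under ./tf_models are assumed.

# Hedged sketch: fetch one TensorFlow checkpoint and build the matching PyTorch model.
model_name = "bert-base-uncased"                  # any key of TFMODELS
family, config_cls, model_cls, url = TFMODELS[model_name]
print(f"{model_name}: family={family}, config={config_cls}, url={url}")

ckpt_prefix = download_tf_checkpoint(model_name)  # downloads and unpacks the checkpoint
config, model = tf2pt_pipeline(model_name)        # loads the TF weights into a PyTorch model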
@ -26,7 +58,7 @@ def download_compressed_file(tf_ckpt_url, ckpt_dir):
|
|||
r = requests.get(tf_ckpt_url)
|
||||
compressed_file_name = tf_ckpt_url.split("/")[-1]
|
||||
compressed_file_dir = os.path.join(ckpt_dir, compressed_file_name)
|
||||
with open(compressed_file_dir, 'wb') as f:
|
||||
with open(compressed_file_dir, "wb") as f:
|
||||
f.write(r.content)
|
||||
return compressed_file_dir
|
||||
|
||||
|
@ -40,13 +72,14 @@ def get_ckpt_prefix_path(ckpt_dir):
|
|||
if os.path.isfile(sub_folder_dir):
|
||||
sub_folder_dir = ckpt_dir
|
||||
unique_file_name = str(glob.glob(sub_folder_dir + "/*data-00000-of-00001"))
|
||||
prefix = (unique_file_name.rpartition('.')[0]).split("/")[-1]
|
||||
prefix = (unique_file_name.rpartition(".")[0]).split("/")[-1]
|
||||
|
||||
return os.path.join(sub_folder_dir, prefix)
|
||||
|
||||
|
||||
def download_tf_checkpoint(model_name, tf_models_dir="tf_models"):
|
||||
import pathlib
|
||||
|
||||
base_dir = os.path.join(pathlib.Path(__file__).parent.absolute(), tf_models_dir)
|
||||
ckpt_dir = os.path.join(base_dir, model_name)
|
||||
|
||||
|
@ -56,32 +89,40 @@ def download_tf_checkpoint(model_name, tf_models_dir="tf_models"):
|
|||
tf_ckpt_url = TFMODELS[model_name][3]
|
||||
|
||||
import re
|
||||
if (re.search('.zip$', tf_ckpt_url) != None):
|
||||
|
||||
if re.search(".zip$", tf_ckpt_url) != None:
|
||||
zip_dir = download_compressed_file(tf_ckpt_url, ckpt_dir)
|
||||
|
||||
# unzip file
|
||||
import zipfile
|
||||
with zipfile.ZipFile(zip_dir, 'r') as zip_ref:
|
||||
|
||||
with zipfile.ZipFile(zip_dir, "r") as zip_ref:
|
||||
zip_ref.extractall(ckpt_dir)
|
||||
os.remove(zip_dir)
|
||||
|
||||
return get_ckpt_prefix_path(ckpt_dir)
|
||||
|
||||
elif (re.search('.tar.gz$', tf_ckpt_url) != None):
|
||||
elif re.search(".tar.gz$", tf_ckpt_url) != None:
|
||||
tar_dir = download_compressed_file(tf_ckpt_url, ckpt_dir)
|
||||
|
||||
# untar file
|
||||
import tarfile
|
||||
with tarfile.open(tar_dir, 'r') as tar_ref:
|
||||
|
||||
with tarfile.open(tar_dir, "r") as tar_ref:
|
||||
tar_ref.extractall(ckpt_dir)
|
||||
os.remove(tar_dir)
|
||||
|
||||
return get_ckpt_prefix_path(ckpt_dir)
|
||||
|
||||
else:
|
||||
for filename in ['checkpoint', 'model.ckpt.data-00000-of-00001', 'model.ckpt.index', 'model.ckpt.meta']:
|
||||
for filename in [
|
||||
"checkpoint",
|
||||
"model.ckpt.data-00000-of-00001",
|
||||
"model.ckpt.index",
|
||||
"model.ckpt.meta",
|
||||
]:
|
||||
r = requests.get(tf_ckpt_url + "/" + filename)
|
||||
with open(os.path.join(ckpt_dir, filename), 'wb') as f:
|
||||
with open(os.path.join(ckpt_dir, filename), "wb") as f:
|
||||
f.write(r.content)
|
||||
|
||||
return get_ckpt_prefix_path(ckpt_dir)
|
||||
|
@ -92,12 +133,13 @@ def init_pytorch_model(model_name, tf_checkpoint_path):
|
|||
config_module = __import__("transformers", fromlist=[config_name])
|
||||
model_config = getattr(config_module, config_name)
|
||||
|
||||
parent_path = tf_checkpoint_path.rpartition('/')[0]
|
||||
parent_path = tf_checkpoint_path.rpartition("/")[0]
|
||||
config_path = glob.glob(parent_path + "/*config.json")
|
||||
config = model_config() if len(config_path) == 0 else model_config.from_json_file(str(config_path[0]))
|
||||
|
||||
if TFMODELS[model_name][2] == "":
|
||||
from transformers import AutoModelForPreTraining
|
||||
|
||||
init_model = AutoModelForPreTraining.from_config(config)
|
||||
else:
|
||||
model_categroy_name = TFMODELS[model_name][2]
|
||||
|
@ -118,11 +160,15 @@ def convert_tf_checkpoint_to_pytorch(model_name, config, init_model, tf_checkpoi
|
|||
if TFMODELS[model_name][0] != "bert":
|
||||
raise NotImplementedError("Only tf2 checkpoints are supported for the Bert model")
|
||||
from transformers import convert_bert_original_tf2_checkpoint_to_pytorch
|
||||
|
||||
load_tf_weight_func = convert_bert_original_tf2_checkpoint_to_pytorch.load_tf2_weights_in_bert
|
||||
|
||||
# Expect the transformers team to unify the argument order in the future
|
||||
model = load_tf_weight_func(init_model, config, tf_checkpoint_path) if is_tf2 is False else load_tf_weight_func(
|
||||
init_model, tf_checkpoint_path, config)
|
||||
model = (
|
||||
load_tf_weight_func(init_model, config, tf_checkpoint_path)
|
||||
if is_tf2 is False
|
||||
else load_tf_weight_func(init_model, tf_checkpoint_path, config)
|
||||
)
|
||||
model.eval()
|
||||
return model
|
||||
|
||||
|
@ -140,11 +186,13 @@ def tf2pt_pipeline(model_name, is_tf2=False):
|
|||
def tf2pt_pipeline_test():
|
||||
# For test on linux only
|
||||
import logging
|
||||
|
||||
import torch
|
||||
logger = logging.getLogger('')
|
||||
|
||||
logger = logging.getLogger("")
|
||||
for model_name in TFMODELS.keys():
|
||||
config, model = tf2pt_pipeline(model_name)
|
||||
assert (config.model_type is TFMODELS[model_name][0])
|
||||
assert config.model_type is TFMODELS[model_name][0]
|
||||
|
||||
input = torch.randint(low=0, high=config.vocab_size - 1, size=(4, 128), dtype=torch.long)
|
||||
try:
|
||||
|
@ -153,5 +201,5 @@ def tf2pt_pipeline_test():
|
|||
logger.exception(e)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
tf2pt_pipeline_test()
|
||||
|
|
|
@ -1,48 +1,48 @@
|
|||
#-------------------------------------------------------------------------
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
#--------------------------------------------------------------------------
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
# This file is modified from https://github.com/microsoft/onnxconverter-common/blob/master/onnxconverter_common/float16.py
|
||||
# Modifications: keep_io_types can be list of names; convert initializers if needed to preserve precision; add force_fp16_initializers option.
|
||||
|
||||
import itertools
|
||||
import logging
|
||||
from typing import Dict, List
|
||||
|
||||
import numpy as np
|
||||
import onnx
|
||||
from onnx import helper, numpy_helper
|
||||
from onnx import onnx_pb as onnx_proto
|
||||
from typing import List, Dict
|
||||
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _npfloat16_to_int(np_list):
|
||||
'''
|
||||
"""
|
||||
Convert numpy float16 to python int.
|
||||
|
||||
:param np_list: numpy float16 list
|
||||
:return int_list: python int list
|
||||
'''
|
||||
return [int(bin(_.view('H'))[2:].zfill(16), 2) for _ in np_list]
|
||||
"""
|
||||
return [int(bin(_.view("H"))[2:].zfill(16), 2) for _ in np_list]
|
||||
|
||||
|
||||
def convert_np_to_float16(np_array, min_positive_val=5.96e-08, max_finite_val=65504.0):
|
||||
'''
|
||||
"""
|
||||
Convert float32 numpy array to float16 without changing sign or finiteness.
|
||||
Positive values less than min_positive_val are mapped to min_positive_val.
|
||||
Positive finite values greater than max_finite_val are mapped to max_finite_val.
|
||||
Similar for negative values. NaN, 0, inf, and -inf are unchanged.
|
||||
'''
|
||||
"""
|
||||
|
||||
def between(a, b, c):
|
||||
return np.logical_and(a < b, b < c)
|
||||
|
||||
np_array = np.where(between(0, np_array, min_positive_val), min_positive_val, np_array)
|
||||
np_array = np.where(between(-min_positive_val, np_array, 0), -min_positive_val, np_array)
|
||||
np_array = np.where(between(max_finite_val, np_array, float('inf')), max_finite_val, np_array)
|
||||
np_array = np.where(between(float('-inf'), np_array, -max_finite_val), -max_finite_val, np_array)
|
||||
np_array = np.where(between(max_finite_val, np_array, float("inf")), max_finite_val, np_array)
|
||||
np_array = np.where(between(float("-inf"), np_array, -max_finite_val), -max_finite_val, np_array)
|
||||
return np.float16(np_array)
|
||||
|
||||
|
||||
|
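A small worked example of the clamping behavior described in the docstring above (illustrative values only; assumes convert_np_to_float16 from this file is in scope):

import numpy as np

x = np.array([0.0, 1e-9, -1e-9, 1e5, -1e5, np.inf], dtype=np.float32)
y = convert_np_to_float16(x)
# Tiny magnitudes clamp to +/-min_positive_val, large finite magnitudes clamp to
# +/-max_finite_val, while 0 and inf pass through:
# y ~= [0.0, 5.96e-08, -5.96e-08, 65504.0, -65504.0, inf] as float16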
@ -62,7 +62,7 @@ def convert_tensor_float_to_float16(tensor, min_positive_val=5.96e-08, max_finit
|
|||
"""
|
||||
|
||||
if not isinstance(tensor, onnx_proto.TensorProto):
|
||||
raise ValueError('Expected input type is an ONNX TensorProto but got %s' % type(tensor))
|
||||
raise ValueError("Expected input type is an ONNX TensorProto but got %s" % type(tensor))
|
||||
|
||||
if tensor.data_type == onnx_proto.TensorProto.FLOAT:
|
||||
tensor.data_type = onnx_proto.TensorProto.FLOAT16
|
||||
|
@ -75,7 +75,7 @@ def convert_tensor_float_to_float16(tensor, min_positive_val=5.96e-08, max_finit
|
|||
# convert raw_data (bytes type)
|
||||
if tensor.raw_data:
|
||||
# convert n.raw_data to float
|
||||
float32_list = np.fromstring(tensor.raw_data, dtype='float32')
|
||||
float32_list = np.fromstring(tensor.raw_data, dtype="float32")
|
||||
# convert float to float16
|
||||
float16_list = convert_np_to_float16(float32_list, min_positive_val, max_finite_val)
|
||||
# convert float16 to bytes and write back to raw_data
|
||||
|
@ -89,10 +89,33 @@ def make_value_info_from_tensor(tensor):
|
|||
|
||||
|
||||
DEFAULT_OP_BLOCK_LIST = [
|
||||
'ArrayFeatureExtractor', 'Binarizer', 'CastMap', 'CategoryMapper', 'DictVectorizer', 'FeatureVectorizer', 'Imputer',
|
||||
'LabelEncoder', 'LinearClassifier', 'LinearRegressor', 'Normalizer', 'OneHotEncoder', 'SVMClassifier',
|
||||
'SVMRegressor', 'Scaler', 'TreeEnsembleClassifier', 'TreeEnsembleRegressor', 'ZipMap', 'NonMaxSuppression', 'TopK',
|
||||
'RoiAlign', 'Resize', 'Range', 'CumSum', 'Min', 'Max', 'Upsample'
|
||||
"ArrayFeatureExtractor",
|
||||
"Binarizer",
|
||||
"CastMap",
|
||||
"CategoryMapper",
|
||||
"DictVectorizer",
|
||||
"FeatureVectorizer",
|
||||
"Imputer",
|
||||
"LabelEncoder",
|
||||
"LinearClassifier",
|
||||
"LinearRegressor",
|
||||
"Normalizer",
|
||||
"OneHotEncoder",
|
||||
"SVMClassifier",
|
||||
"SVMRegressor",
|
||||
"Scaler",
|
||||
"TreeEnsembleClassifier",
|
||||
"TreeEnsembleRegressor",
|
||||
"ZipMap",
|
||||
"NonMaxSuppression",
|
||||
"TopK",
|
||||
"RoiAlign",
|
||||
"Resize",
|
||||
"Range",
|
||||
"CumSum",
|
||||
"Min",
|
||||
"Max",
|
||||
"Upsample",
|
||||
]
|
||||
|
||||
|
||||
|
@ -111,14 +134,16 @@ class InitializerTracker:
        self.fp16_nodes.append(node)


def convert_float_to_float16(model,
                             min_positive_val=5.96e-08,
                             max_finite_val=65504.0,
                             keep_io_types=False,
                             disable_shape_infer=False,
                             op_block_list=None,
                             node_block_list=None,
                             force_fp16_initializers=False):
def convert_float_to_float16(
    model,
    min_positive_val=5.96e-08,
    max_finite_val=65504.0,
    keep_io_types=False,
    disable_shape_infer=False,
    op_block_list=None,
    node_block_list=None,
    force_fp16_initializers=False,
):
    """Convert model tensor float type in the ONNX ModelProto input to tensor float16.

    Args:
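For orientation, the converter is normally driven on a loaded ModelProto as sketched below. The import path and file names are assumptions for illustration (the module ships with the transformer optimization tools); only the keyword arguments come from the signature above.

import onnx

from onnxruntime.transformers.float16 import convert_float_to_float16  # assumed install path

model = onnx.load("model_fp32.onnx")  # placeholder path
model_fp16 = convert_float_to_float16(
    model,
    keep_io_types=True,        # keep graph inputs/outputs in float32 (Cast nodes are inserted)
    op_block_list=["Resize"],  # ops to keep in float32 (used instead of DEFAULT_OP_BLOCK_LIST when provided)
)
onnx.save(model_fp16, "model_fp16.onnx")
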
@ -139,19 +164,22 @@ def convert_float_to_float16(model,
|
|||
Returns:
|
||||
ModelProto: converted model.
|
||||
"""
|
||||
assert min_positive_val >= 5.96e-08, "invalid min_positive_val. smallest positive float16 value: subnormal 5.96e-08, and normalized 6.104e-05"
|
||||
assert (
|
||||
min_positive_val >= 5.96e-08
|
||||
), "invalid min_positive_val. smallest positive float16 value: subnormal 5.96e-08, and normalized 6.104e-05"
|
||||
assert max_finite_val <= float(np.finfo(np.float16).max), "invalid max_finite_val. largest float16 value: 65504"
|
||||
|
||||
func_infer_shape = None
|
||||
if not disable_shape_infer and onnx.__version__ >= '1.2':
|
||||
if not disable_shape_infer and onnx.__version__ >= "1.2":
|
||||
try:
|
||||
from onnx.shape_inference import infer_shapes
|
||||
|
||||
func_infer_shape = infer_shapes
|
||||
finally:
|
||||
pass
|
||||
|
||||
if not isinstance(model, onnx_proto.ModelProto):
|
||||
raise ValueError('Expected model type is an ONNX ModelProto but got %s' % type(model))
|
||||
raise ValueError("Expected model type is an ONNX ModelProto but got %s" % type(model))
|
||||
|
||||
# create blocklists
|
||||
if op_block_list is None:
|
||||
|
@ -188,34 +216,34 @@ def convert_float_to_float16(model,
|
|||
|
||||
for i, n in enumerate(model.graph.input):
|
||||
if n.name in fp32_inputs:
|
||||
output_name = 'graph_input_cast_' + str(i)
|
||||
output_name = "graph_input_cast_" + str(i)
|
||||
name_mapping[n.name] = output_name
|
||||
graph_io_to_skip.add(n.name)
|
||||
|
||||
node_name = 'graph_input_cast' + str(i)
|
||||
node_name = "graph_input_cast" + str(i)
|
||||
new_value_info = model.graph.value_info.add()
|
||||
new_value_info.CopyFrom(n)
|
||||
new_value_info.name = output_name
|
||||
new_value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16
|
||||
# add Cast node (from tensor(float) to tensor(float16) after graph input
|
||||
new_node = [helper.make_node('Cast', [n.name], [output_name], to=10, name=node_name)]
|
||||
new_node = [helper.make_node("Cast", [n.name], [output_name], to=10, name=node_name)]
|
||||
model.graph.node.extend(new_node)
|
||||
value_info_list.append(new_value_info)
|
||||
io_casts.add(node_name)
|
||||
|
||||
for i, n in enumerate(model.graph.output):
|
||||
if n.name in fp32_outputs:
|
||||
input_name = 'graph_output_cast_' + str(i)
|
||||
input_name = "graph_output_cast_" + str(i)
|
||||
name_mapping[n.name] = input_name
|
||||
graph_io_to_skip.add(n.name)
|
||||
|
||||
node_name = 'graph_output_cast' + str(i)
|
||||
node_name = "graph_output_cast" + str(i)
|
||||
# add Cast node (from tensor(float16) to tensor(float) before graph output
|
||||
new_value_info = model.graph.value_info.add()
|
||||
new_value_info.CopyFrom(n)
|
||||
new_value_info.name = input_name
|
||||
new_value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16
|
||||
new_node = [helper.make_node('Cast', [input_name], [n.name], to=1, name=node_name)]
|
||||
new_node = [helper.make_node("Cast", [input_name], [n.name], to=1, name=node_name)]
|
||||
model.graph.node.extend(new_node)
|
||||
value_info_list.append(new_value_info)
|
||||
io_casts.add(node_name)
|
||||
|
@ -254,9 +282,9 @@ def convert_float_to_float16(model,
|
|||
if is_node_blocked:
|
||||
node_list.append(n)
|
||||
else:
|
||||
if n.op_type == 'Cast':
|
||||
if n.op_type == "Cast":
|
||||
for attr in n.attribute:
|
||||
if attr.name == 'to' and attr.i == 1:
|
||||
if attr.name == "to" and attr.i == 1:
|
||||
attr.i = 10
|
||||
break
|
||||
for attr in n.attribute:
|
||||
|
@ -280,12 +308,12 @@ def convert_float_to_float16(model,
|
|||
if n.name not in graph_io_to_skip:
|
||||
n.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16
|
||||
value_info_list.append(n)
|
||||
if n.type.HasField('sequence_type'):
|
||||
if n.type.HasField("sequence_type"):
|
||||
if n.type.sequence_type.elem_type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT:
|
||||
if n.name not in graph_io_to_skip:
|
||||
n.type.sequence_type.elem_type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16
|
||||
value_info_list.append(n)
|
||||
|
||||
|
||||
queue = next_level
|
||||
|
||||
for key, value in fp32_initializers.items():
|
||||
|
@ -296,7 +324,9 @@ def convert_float_to_float16(model,
|
|||
if value.fp32_nodes and not force_fp16_initializers:
|
||||
logger.info(
|
||||
"initializer is used by both fp32 and fp16 nodes. Consider add these nodes to block list:{}".format(
|
||||
value.fp16_nodes))
|
||||
value.fp16_nodes
|
||||
)
|
||||
)
|
||||
|
||||
# process the nodes in block list that doesn't support tensor(float16)
|
||||
for node in node_list:
|
||||
|
@ -310,12 +340,12 @@ def convert_float_to_float16(model,
|
|||
# create new value_info for current node's new input name
|
||||
new_value_info = model.graph.value_info.add()
|
||||
new_value_info.CopyFrom(value_info)
|
||||
output_name = node.name + '_input_cast_' + str(i)
|
||||
output_name = node.name + "_input_cast_" + str(i)
|
||||
new_value_info.name = output_name
|
||||
new_value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT
|
||||
# add Cast node (from tensor(float16) to tensor(float) before current node
|
||||
node_name = node.name + '_input_cast' + str(i)
|
||||
new_node = [helper.make_node('Cast', [input], [output_name], to=1, name=node_name)]
|
||||
node_name = node.name + "_input_cast" + str(i)
|
||||
new_node = [helper.make_node("Cast", [input], [output_name], to=1, name=node_name)]
|
||||
model.graph.node.extend(new_node)
|
||||
# change current node's input name
|
||||
node.input[i] = output_name
|
||||
|
@ -329,12 +359,12 @@ def convert_float_to_float16(model,
|
|||
# create new value_info for current node's new output
|
||||
new_value_info = model.graph.value_info.add()
|
||||
new_value_info.CopyFrom(value_info)
|
||||
input_name = node.name + '_output_cast_' + str(i)
|
||||
input_name = node.name + "_output_cast_" + str(i)
|
||||
new_value_info.name = input_name
|
||||
new_value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT
|
||||
# add Cast node (from tensor(float) to tensor(float16) after current node
|
||||
node_name = node.name + '_output_cast' + str(i)
|
||||
new_node = [helper.make_node('Cast', [input_name], [output], to=10, name=node_name)]
|
||||
node_name = node.name + "_output_cast" + str(i)
|
||||
new_node = [helper.make_node("Cast", [input_name], [output], to=10, name=node_name)]
|
||||
model.graph.node.extend(new_node)
|
||||
# change current node's input name
|
||||
node.output[i] = input_name
@ -345,15 +375,15 @@ def convert_float_to_float16(model,
def float_to_float16_max_diff(tensor, min_positive_val=5.96e-08, max_finite_val=65504.0):
    """Measure the maximum absolute difference after converting a float tensor to float16."""
    if not isinstance(tensor, onnx_proto.TensorProto):
        raise ValueError('Expected input type is an ONNX TensorProto but got %s' % type(tensor))
        raise ValueError("Expected input type is an ONNX TensorProto but got %s" % type(tensor))
    if tensor.data_type != onnx_proto.TensorProto.FLOAT:
        raise ValueError('Expected tensor data type is float.')
        raise ValueError("Expected tensor data type is float.")

    if tensor.float_data:
        float32_data = np.array(tensor.float_data)

    if tensor.raw_data:
        float32_data = np.fromstring(tensor.raw_data, dtype='float32')
        float32_data = np.fromstring(tensor.raw_data, dtype="float32")

    float16_data = convert_np_to_float16(float32_data, min_positive_val, max_finite_val)
    return np.amax(np.abs(float32_data - np.float32(float16_data)))
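A quick way to gauge the rounding loss for a single initializer with the helper above; importing this file as float16 is an assumption about the local layout, and the tensor here is synthetic.

import numpy as np
from onnx import numpy_helper

from float16 import float_to_float16_max_diff  # assumed local import of this module

w = numpy_helper.from_array(np.random.rand(4, 4).astype(np.float32), name="w")
print("max |fp32 - fp16| for w:", float_to_float16_max_diff(w))
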
@ -1,27 +1,29 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
# --------------------------------------------------------------------------
from enum import Enum
from logging import getLogger
from os import name
from sys import path
import numpy as np
from logging import getLogger
from enum import Enum
from typing import Tuple, Union
from onnx import helper, numpy_helper, TensorProto, NodeProto
from onnx_model import OnnxModel

import numpy as np
from fusion_base import Fusion
from fusion_utils import FusionUtils, NumpyHelper
from fusion_options import AttentionMaskFormat
from fusion_utils import FusionUtils, NumpyHelper
from onnx import NodeProto, TensorProto, helper, numpy_helper
from onnx_model import OnnxModel
from shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto

logger = getLogger(__name__)


class AttentionMask():
class AttentionMask:
    """
    Fuse Attention subgraph into one Attention node.
    """

    def __init__(self, model: OnnxModel):
        self.model = model
        # A lookup table with mask input as key, and mask index output as value
@ -66,11 +68,13 @@ class AttentionMask():
|
|||
return input_name
|
||||
|
||||
# Add a mask processing node to convert attention mask to mask index (1D)
|
||||
output_name = self.model.create_node_name('mask_index')
|
||||
mask_index_node = helper.make_node('ReduceSum',
|
||||
inputs=[input_name],
|
||||
outputs=[output_name],
|
||||
name=self.model.create_node_name('ReduceSum', 'MaskReduceSum'))
|
||||
output_name = self.model.create_node_name("mask_index")
|
||||
mask_index_node = helper.make_node(
|
||||
"ReduceSum",
|
||||
inputs=[input_name],
|
||||
outputs=[output_name],
|
||||
name=self.model.create_node_name("ReduceSum", "MaskReduceSum"),
|
||||
)
|
||||
mask_index_node.attribute.extend([helper.make_attribute("axes", [1]), helper.make_attribute("keepdims", 0)])
|
||||
self.model.add_node(mask_index_node)
|
||||
|
||||
|
@ -82,7 +86,14 @@ class FusionAttention(Fusion):
|
|||
"""
|
||||
Fuse Attention subgraph into one Attention node.
|
||||
"""
|
||||
def __init__(self, model: OnnxModel, hidden_size: int, num_heads: int, attention_mask: AttentionMask):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model: OnnxModel,
|
||||
hidden_size: int,
|
||||
num_heads: int,
|
||||
attention_mask: AttentionMask,
|
||||
):
|
||||
super().__init__(model, "Attention", ["SkipLayerNormalization", "LayerNormalization"])
|
||||
self.hidden_size = hidden_size
|
||||
self.num_heads = num_heads
|
||||
|
@ -93,7 +104,7 @@ class FusionAttention(Fusion):
|
|||
self.hidden_size_warning = True
|
||||
|
||||
def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]:
|
||||
""" Detect num_heads and hidden_size from a reshape node.
|
||||
"""Detect num_heads and hidden_size from a reshape node.
|
||||
|
||||
Args:
|
||||
reshape_q (NodeProto): reshape node for Q
|
||||
|
@ -125,7 +136,8 @@ class FusionAttention(Fusion):
|
|||
if self.hidden_size > 0 and hidden_size != self.hidden_size:
|
||||
if self.hidden_size_warning:
|
||||
logger.warning(
|
||||
f"--hidden_size is {self.hidden_size}. Detected value is {hidden_size}. Using detected value.")
|
||||
f"--hidden_size is {self.hidden_size}. Detected value is {hidden_size}. Using detected value."
|
||||
)
|
||||
self.hidden_size_warning = False # Do not show the warning more than once
|
||||
|
||||
return num_heads, hidden_size
|
||||
|
@ -148,10 +160,22 @@ class FusionAttention(Fusion):
|
|||
|
||||
return add_qk.input[1]
|
||||
|
||||
def create_attention_node(self, mask_index: str, q_matmul: NodeProto, k_matmul: NodeProto, v_matmul: NodeProto,
|
||||
q_add: NodeProto, k_add: NodeProto, v_add: NodeProto, num_heads: int, hidden_size: int,
|
||||
input: str, output: str, add_qk_str: str) -> Union[NodeProto, None]:
|
||||
""" Create an Attention node.
|
||||
def create_attention_node(
|
||||
self,
|
||||
mask_index: str,
|
||||
q_matmul: NodeProto,
|
||||
k_matmul: NodeProto,
|
||||
v_matmul: NodeProto,
|
||||
q_add: NodeProto,
|
||||
k_add: NodeProto,
|
||||
v_add: NodeProto,
|
||||
num_heads: int,
|
||||
hidden_size: int,
|
||||
input: str,
|
||||
output: str,
|
||||
add_qk_str: str,
|
||||
) -> Union[NodeProto, None]:
|
||||
"""Create an Attention node.
|
||||
|
||||
Args:
|
||||
mask_index (str): mask input
|
||||
|
@ -244,27 +268,35 @@ class FusionAttention(Fusion):
|
|||
qkv_bias = np.stack((qb, kb, vb), axis=0)
|
||||
qkv_bias_dim = 3 * q_bias_shape
|
||||
|
||||
attention_node_name = self.model.create_node_name('Attention')
|
||||
attention_node_name = self.model.create_node_name("Attention")
|
||||
|
||||
weight = helper.make_tensor(name=attention_node_name + '_qkv_weight',
|
||||
data_type=TensorProto.FLOAT,
|
||||
dims=[qw_in_size, qkv_weight_dim],
|
||||
vals=qkv_weight.flatten().tolist())
|
||||
weight = helper.make_tensor(
|
||||
name=attention_node_name + "_qkv_weight",
|
||||
data_type=TensorProto.FLOAT,
|
||||
dims=[qw_in_size, qkv_weight_dim],
|
||||
vals=qkv_weight.flatten().tolist(),
|
||||
)
|
||||
|
||||
# Sometimes weights and bias are stored in fp16
|
||||
if q_weight.data_type == 10:
|
||||
weight.CopyFrom(numpy_helper.from_array(NumpyHelper.to_array(weight).astype(np.float16), weight.name))
|
||||
self.model.add_initializer(weight, self.this_graph_name)
|
||||
|
||||
bias = helper.make_tensor(name=attention_node_name + '_qkv_bias',
|
||||
data_type=TensorProto.FLOAT,
|
||||
dims=[qkv_bias_dim],
|
||||
vals=qkv_bias.flatten().tolist())
|
||||
bias = helper.make_tensor(
|
||||
name=attention_node_name + "_qkv_bias",
|
||||
data_type=TensorProto.FLOAT,
|
||||
dims=[qkv_bias_dim],
|
||||
vals=qkv_bias.flatten().tolist(),
|
||||
)
|
||||
if q_bias.data_type == 10:
|
||||
bias.CopyFrom(numpy_helper.from_array(NumpyHelper.to_array(bias).astype(np.float16), bias.name))
|
||||
self.model.add_initializer(bias, self.this_graph_name)
|
||||
|
||||
attention_inputs = [input, attention_node_name + '_qkv_weight', attention_node_name + '_qkv_bias']
|
||||
attention_inputs = [
|
||||
input,
|
||||
attention_node_name + "_qkv_weight",
|
||||
attention_node_name + "_qkv_bias",
|
||||
]
|
||||
if mask_index is not None:
|
||||
attention_inputs.append(mask_index)
|
||||
else:
|
||||
|
@ -274,16 +306,19 @@ class FusionAttention(Fusion):
|
|||
attention_inputs.append("")
|
||||
attention_inputs.append(add_qk_str)
|
||||
|
||||
attention_node = helper.make_node('Attention',
|
||||
inputs=attention_inputs,
|
||||
outputs=[output],
|
||||
name=attention_node_name)
|
||||
attention_node = helper.make_node(
|
||||
"Attention",
|
||||
inputs=attention_inputs,
|
||||
outputs=[output],
|
||||
name=attention_node_name,
|
||||
)
|
||||
attention_node.domain = "com.microsoft"
|
||||
attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])
|
||||
|
||||
if is_qkv_diff_dims:
|
||||
attention_node.attribute.extend(
|
||||
[helper.make_attribute("qkv_hidden_sizes", [qw_out_size, kw_out_size, vw_out_size])])
|
||||
[helper.make_attribute("qkv_hidden_sizes", [qw_out_size, kw_out_size, vw_out_size])]
|
||||
)
|
||||
|
||||
return attention_node
|
||||
|
||||
|
@ -291,23 +326,27 @@ class FusionAttention(Fusion):
|
|||
# Sometimes we can not fuse skiplayernormalization since the add before layernorm has an output that used by nodes outside skiplayernorm
|
||||
# Conceptually we treat add before layernorm as skiplayernorm node since they share the same pattern
|
||||
start_node = normalize_node
|
||||
if normalize_node.op_type == 'LayerNormalization':
|
||||
add_before_layernorm = self.model.match_parent(normalize_node, 'Add', 0)
|
||||
if normalize_node.op_type == "LayerNormalization":
|
||||
add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0)
|
||||
if add_before_layernorm is not None:
|
||||
start_node = add_before_layernorm
|
||||
else:
|
||||
return
|
||||
|
||||
# SkipLayerNormalization has two inputs, and one of them is the root input for attention.
|
||||
qkv_nodes = self.model.match_parent_path(start_node, ['Add', 'MatMul', 'Reshape', 'Transpose', 'MatMul'],
|
||||
[None, None, 0, 0, 0])
|
||||
qkv_nodes = self.model.match_parent_path(
|
||||
start_node,
|
||||
["Add", "MatMul", "Reshape", "Transpose", "MatMul"],
|
||||
[None, None, 0, 0, 0],
|
||||
)
|
||||
einsum_node = None
|
||||
if qkv_nodes is not None:
|
||||
(_, matmul_qkv, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes
|
||||
else:
|
||||
# Match Albert
|
||||
qkv_nodes = self.model.match_parent_path(start_node, ['Add', 'Einsum', 'Transpose', 'MatMul'],
|
||||
[1, None, 0, 0])
|
||||
qkv_nodes = self.model.match_parent_path(
|
||||
start_node, ["Add", "Einsum", "Transpose", "MatMul"], [1, None, 0, 0]
|
||||
)
|
||||
if qkv_nodes is not None:
|
||||
(_, einsum_node, transpose_qkv, matmul_qkv) = qkv_nodes
|
||||
else:
|
||||
|
@ -333,12 +372,12 @@ class FusionAttention(Fusion):
|
|||
| |
|
||||
+---------------------------------------------------------
|
||||
"""
|
||||
mul_before_layernorm = self.model.match_parent(start_node, 'Mul', 0)
|
||||
mul_before_layernorm = self.model.match_parent(start_node, "Mul", 0)
|
||||
if mul_before_layernorm is not None:
|
||||
mul_children = input_name_to_nodes[mul_before_layernorm.output[0]]
|
||||
if mul_children is not None and len(mul_children) == 2:
|
||||
layernorm_node = mul_children[1]
|
||||
if layernorm_node.op_type == 'LayerNormalization':
|
||||
if layernorm_node.op_type == "LayerNormalization":
|
||||
root_input = layernorm_node.output[0]
|
||||
else:
|
||||
return
|
||||
|
@ -346,7 +385,7 @@ class FusionAttention(Fusion):
|
|||
root_input = mul_before_layernorm.output[0]
|
||||
else:
|
||||
return
|
||||
elif normalize_node.op_type == 'LayerNormalization':
|
||||
elif normalize_node.op_type == "LayerNormalization":
|
||||
children = input_name_to_nodes[root_input]
|
||||
for child in children:
|
||||
if child.op_type == "LayerNormalization":
|
||||
|
@ -354,10 +393,10 @@ class FusionAttention(Fusion):
|
|||
|
||||
children = input_name_to_nodes[root_input]
|
||||
children_types = [child.op_type for child in children]
|
||||
if children_types.count('MatMul') != 3:
|
||||
if children_types.count("MatMul") != 3:
|
||||
return
|
||||
|
||||
v_nodes = self.model.match_parent_path(matmul_qkv, ['Transpose', 'Reshape', 'Add', 'MatMul'], [1, 0, 0, None])
|
||||
v_nodes = self.model.match_parent_path(matmul_qkv, ["Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, None])
|
||||
if v_nodes is None:
|
||||
logger.debug("fuse_attention: failed to match v path")
|
||||
return
|
||||
|
@ -366,10 +405,10 @@ class FusionAttention(Fusion):
|
|||
is_distill = False
|
||||
is_distill_add = False
|
||||
qk_paths = {
|
||||
"path1": (['Softmax', 'Add', 'Div', 'MatMul'], [0, 0, None, 0]),
|
||||
"path2": (['Softmax', 'Add', 'Mul', 'MatMul'], [0, 0, None, 0]),
|
||||
"path3": (['Softmax', 'Where', 'MatMul', 'Div'], [0, 0, 2, 0]),
|
||||
"path4": (['Softmax', 'Add', 'Where', 'MatMul'], [0, 0, 0, 2])
|
||||
"path1": (["Softmax", "Add", "Div", "MatMul"], [0, 0, None, 0]),
|
||||
"path2": (["Softmax", "Add", "Mul", "MatMul"], [0, 0, None, 0]),
|
||||
"path3": (["Softmax", "Where", "MatMul", "Div"], [0, 0, 2, 0]),
|
||||
"path4": (["Softmax", "Add", "Where", "MatMul"], [0, 0, 0, 2]),
|
||||
}
|
||||
|
||||
qk_nodes = None
|
||||
|
@ -397,10 +436,13 @@ class FusionAttention(Fusion):
|
|||
else:
|
||||
(_, add_qk, _, matmul_qk) = qk_nodes
|
||||
|
||||
q_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Reshape', 'Add', 'MatMul'], [0, 0, 0, None])
|
||||
q_nodes = self.model.match_parent_path(matmul_qk, ["Transpose", "Reshape", "Add", "MatMul"], [0, 0, 0, None])
|
||||
if q_nodes is None:
|
||||
q_nodes = self.model.match_parent_path(matmul_qk, ['Div', 'Transpose', 'Reshape', 'Add', 'MatMul'],
|
||||
[0, 0, 0, 0, None])
|
||||
q_nodes = self.model.match_parent_path(
|
||||
matmul_qk,
|
||||
["Div", "Transpose", "Reshape", "Add", "MatMul"],
|
||||
[0, 0, 0, 0, None],
|
||||
)
|
||||
if q_nodes is None:
|
||||
logger.debug("fuse_attention: failed to match q path")
|
||||
return
|
||||
|
@ -408,10 +450,13 @@ class FusionAttention(Fusion):
|
|||
add_q = q_nodes[-2]
|
||||
matmul_q = q_nodes[-1]
|
||||
|
||||
k_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Reshape', 'Add', 'MatMul'], [1, 0, 0, None])
|
||||
k_nodes = self.model.match_parent_path(matmul_qk, ["Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, None])
|
||||
if k_nodes is None:
|
||||
k_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Transpose', 'Reshape', 'Add', 'MatMul'],
|
||||
[1, 0, 0, 0, None])
|
||||
k_nodes = self.model.match_parent_path(
|
||||
matmul_qk,
|
||||
["Transpose", "Transpose", "Reshape", "Add", "MatMul"],
|
||||
[1, 0, 0, 0, None],
|
||||
)
|
||||
if k_nodes is None:
|
||||
logger.debug("fuse_attention: failed to match k path")
|
||||
return
|
||||
|
@ -422,15 +467,24 @@ class FusionAttention(Fusion):
|
|||
mask_nodes = None
|
||||
add_qk_str = None
|
||||
if is_distill:
|
||||
_, mask_nodes, _ = self.model.match_parent_paths(where_qk,
|
||||
[(['Expand', 'Reshape', 'Equal'], [0, 0, 0]),
|
||||
(['Equal', 'Unsqueeze', 'Unsqueeze'], [0, 0, 0]),
|
||||
(['Cast', 'Expand', 'Reshape', 'Equal'], [0, 0, 0, 0])],
|
||||
output_name_to_node)
|
||||
_, mask_nodes, _ = self.model.match_parent_paths(
|
||||
where_qk,
|
||||
[
|
||||
(["Expand", "Reshape", "Equal"], [0, 0, 0]),
|
||||
(["Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0]),
|
||||
(["Cast", "Expand", "Reshape", "Equal"], [0, 0, 0, 0]),
|
||||
],
|
||||
output_name_to_node,
|
||||
)
|
||||
elif is_distill_add:
|
||||
_, mask_nodes, _ = self.model.match_parent_paths(
|
||||
where_qk, [(['Cast', 'Equal', 'Unsqueeze', 'Unsqueeze'], [0, 0, 0, 0]),
|
||||
(['Equal', 'Unsqueeze', 'Unsqueeze'], [0, 0, 0])], output_name_to_node)
|
||||
where_qk,
|
||||
[
|
||||
(["Cast", "Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0, 0]),
|
||||
(["Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0]),
|
||||
],
|
||||
output_name_to_node,
|
||||
)
|
||||
if add_qk is not None:
|
||||
add_qk_str = self.get_add_qk_str(add_qk)
|
||||
if add_qk_str is None:
|
||||
|
@ -438,8 +492,16 @@ class FusionAttention(Fusion):
|
|||
return
|
||||
else:
|
||||
_, mask_nodes, _ = self.model.match_parent_paths(
|
||||
add_qk, [(['Mul', 'Sub', 'Cast', 'Unsqueeze', 'Unsqueeze'], [None, 0, 1, 0, 0]),
|
||||
(['Mul', 'Sub', 'Unsqueeze', 'Unsqueeze'], [None, 0, 1, 0])], output_name_to_node)
|
||||
add_qk,
|
||||
[
|
||||
(
|
||||
["Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze"],
|
||||
[None, 0, 1, 0, 0],
|
||||
),
|
||||
(["Mul", "Sub", "Unsqueeze", "Unsqueeze"], [None, 0, 1, 0]),
|
||||
],
|
||||
output_name_to_node,
|
||||
)
|
||||
if mask_nodes is None:
|
||||
logger.debug("fuse_attention: failed to match mask path")
|
||||
return
|
||||
|
@ -452,9 +514,20 @@ class FusionAttention(Fusion):
|
|||
q_num_heads, q_hidden_size = self.get_num_heads_and_hidden_size(reshape_q)
|
||||
# number of heads are same for all the paths, hence to create attention node, we pass the q_num_heads
|
||||
# the input_hidden_size represents the input hidden size, this is used as needed but hidden sizes for Q, K are extracted appropriately
|
||||
new_node = self.create_attention_node(mask_index, matmul_q, matmul_k, matmul_v, add_q, add_k, add_v,
|
||||
q_num_heads, q_hidden_size, root_input, attention_last_node.output[0],
|
||||
add_qk_str)
|
||||
new_node = self.create_attention_node(
|
||||
mask_index,
|
||||
matmul_q,
|
||||
matmul_k,
|
||||
matmul_v,
|
||||
add_q,
|
||||
add_k,
|
||||
add_v,
|
||||
q_num_heads,
|
||||
q_hidden_size,
|
||||
root_input,
|
||||
attention_last_node.output[0],
|
||||
add_qk_str,
|
||||
)
|
||||
if new_node is None:
|
||||
return
|
||||
|
||||
|
@ -464,16 +537,23 @@ class FusionAttention(Fusion):
|
|||
if einsum_node is not None:
|
||||
unique_index = einsum_node.input[0]
|
||||
new_edge = "edge_modified_" + unique_index
|
||||
shape_tensor = helper.make_tensor(name="shape_modified_tensor" + unique_index,
|
||||
data_type=TensorProto.INT64,
|
||||
dims=[4],
|
||||
vals=np.int64([0, 0, q_num_heads,
|
||||
int(q_hidden_size / q_num_heads)]).tobytes(),
|
||||
raw=True)
|
||||
shape_tensor = helper.make_tensor(
|
||||
name="shape_modified_tensor" + unique_index,
|
||||
data_type=TensorProto.INT64,
|
||||
dims=[4],
|
||||
vals=np.int64([0, 0, q_num_heads, int(q_hidden_size / q_num_heads)]).tobytes(),
|
||||
raw=True,
|
||||
)
|
||||
self.model.add_initializer(shape_tensor, self.this_graph_name)
|
||||
self.model.add_node(
|
||||
helper.make_node("Reshape", [attention_last_node.output[0], shape_tensor.name], [new_edge],
|
||||
"reshape_modified_" + unique_index), self.this_graph_name)
|
||||
helper.make_node(
|
||||
"Reshape",
|
||||
[attention_last_node.output[0], shape_tensor.name],
|
||||
[new_edge],
|
||||
"reshape_modified_" + unique_index,
|
||||
),
|
||||
self.this_graph_name,
|
||||
)
|
||||
einsum_node.input[0] = new_edge
|
||||
|
||||
self.nodes_to_remove.extend([attention_last_node, transpose_qkv, matmul_qkv])
|
||||
|
@ -483,5 +563,5 @@ class FusionAttention(Fusion):
|
|||
self.nodes_to_remove.extend(v_nodes)
|
||||
|
||||
# Use prune graph to remove mask nodes since they are shared by all attention nodes.
|
||||
#self.nodes_to_remove.extend(mask_nodes)
|
||||
# self.nodes_to_remove.extend(mask_nodes)
|
||||
self.prune_graph = True
@ -1,21 +1,24 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
# --------------------------------------------------------------------------
from logging import getLogger
from onnx_model import OnnxModel
from typing import Union, List
from typing import List, Union

from onnx import GraphProto
from onnx_model import OnnxModel

logger = getLogger(__name__)


class Fusion:
    def __init__(self,
                 model: OnnxModel,
                 fused_op_type: str,
                 search_op_types: Union[str, List[str]],
                 description: str = None):
    def __init__(
        self,
        model: OnnxModel,
        fused_op_type: str,
        search_op_types: Union[str, List[str]],
        description: str = None,
    ):
        self.search_op_types: List[str] = [search_op_types] if isinstance(search_op_types, str) else search_op_types
        self.fused_op_type: str = fused_op_type
        self.description: str = f"{fused_op_type}({description})" if description else fused_op_type
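The next file in this diff (FusionBiasGelu) shows how the base class is used in practice; condensed into a hypothetical skeleton, a subclass wires up the constructor and a fuse callback roughly as follows (the op and class names here are invented for illustration):

from onnx import helper

from fusion_base import Fusion
from onnx_model import OnnxModel


class FusionExample(Fusion):  # hypothetical subclass, not part of the commit
    def __init__(self, model: OnnxModel):
        super().__init__(model, "FusedOp", "Relu", "example")

    def fuse(self, node, input_name_to_nodes, output_name_to_node):
        # Match a subgraph around `node`, then queue the replacement node.
        fused = helper.make_node(
            "FusedOp",
            inputs=list(node.input),
            outputs=list(node.output),
            name=self.model.create_node_name("FusedOp"),
        )
        fused.domain = "com.microsoft"
        self.nodes_to_remove.append(node)
        self.nodes_to_add.append(fused)
        self.node_name_to_graph_name[fused.name] = self.this_graph_name
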
@ -1,13 +1,14 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
# --------------------------------------------------------------------------

from logging import getLogger
from onnx import helper
from onnx_model import OnnxModel

from fusion_base import Fusion
from fusion_utils import NumpyHelper
from onnx import helper
from onnx_model import OnnxModel

logger = getLogger(__name__)

@ -15,18 +16,18 @@ logger = getLogger(__name__)
|
|||
class FusionBiasGelu(Fusion):
|
||||
def __init__(self, model: OnnxModel, is_fastgelu):
|
||||
if is_fastgelu:
|
||||
super().__init__(model, 'FastGelu', 'FastGelu', 'add bias')
|
||||
super().__init__(model, "FastGelu", "FastGelu", "add bias")
|
||||
else:
|
||||
super().__init__(model, 'BiasGelu', 'Gelu')
|
||||
super().__init__(model, "BiasGelu", "Gelu")
|
||||
|
||||
def fuse(self, node, input_name_to_nodes, output_name_to_node):
|
||||
gelu_op_type = node.op_type
|
||||
fuse_op_type = 'BiasGelu' if gelu_op_type == 'Gelu' else 'FastGelu'
|
||||
fuse_op_type = "BiasGelu" if gelu_op_type == "Gelu" else "FastGelu"
|
||||
|
||||
if len(node.input) != 1:
|
||||
return
|
||||
|
||||
nodes = self.model.match_parent_path(node, ['Add', 'MatMul'], [0, None])
|
||||
nodes = self.model.match_parent_path(node, ["Add", "MatMul"], [0, None])
|
||||
if nodes is None:
|
||||
return
|
||||
(add, matmul) = nodes
|
||||
|
@ -47,16 +48,19 @@ class FusionBiasGelu(Fusion):
|
|||
return
|
||||
|
||||
subgraph_nodes = [node, add]
|
||||
if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [node.output[0]], input_name_to_nodes,
|
||||
output_name_to_node):
|
||||
if not self.model.is_safe_to_fuse_nodes(
|
||||
subgraph_nodes, [node.output[0]], input_name_to_nodes, output_name_to_node
|
||||
):
|
||||
return
|
||||
|
||||
self.nodes_to_remove.extend(subgraph_nodes)
|
||||
|
||||
fused_node = helper.make_node(fuse_op_type,
|
||||
inputs=[matmul.output[0], add.input[bias_index]],
|
||||
outputs=node.output,
|
||||
name=self.model.create_node_name(fuse_op_type, gelu_op_type + "_AddBias_"))
|
||||
fused_node = helper.make_node(
|
||||
fuse_op_type,
|
||||
inputs=[matmul.output[0], add.input[bias_index]],
|
||||
outputs=node.output,
|
||||
name=self.model.create_node_name(fuse_op_type, gelu_op_type + "_AddBias_"),
|
||||
)
|
||||
fused_node.domain = "com.microsoft"
|
||||
self.nodes_to_add.append(fused_node)
|
||||
self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
@ -1,26 +1,32 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
# --------------------------------------------------------------------------

from typing import Dict, List, Tuple, Union
from logging import getLogger
from onnx import helper, TensorProto, NodeProto
from onnx_model import OnnxModel
from typing import Dict, List, Tuple, Union

from fusion_base import Fusion
from fusion_utils import FusionUtils
from onnx import NodeProto, TensorProto, helper
from onnx_model import OnnxModel

logger = getLogger(__name__)


class FusionEmbedLayerNoMask(Fusion):
    """
    Fuse embedding layer into one node (EmbedLayerNormalization).
    It supports the following model types: BERT, DistilBert, ALBert.
    Fuse embedding layer into one node (EmbedLayerNormalization).
    It supports the following model types: BERT, DistilBert, ALBert.
    """
    def __init__(self, model: OnnxModel, description: str = 'no mask'):
        super().__init__(model, "EmbedLayerNormalization", ["LayerNormalization", "SkipLayerNormalization"],
                         description)

    def __init__(self, model: OnnxModel, description: str = "no mask"):
        super().__init__(
            model,
            "EmbedLayerNormalization",
            ["LayerNormalization", "SkipLayerNormalization"],
            description,
        )
        self.utils = FusionUtils(model)
        self.shape_infer_helper = self.model.infer_runtime_shape({}, update=True)
        # The following will be reset in each fuse call of FusionEmbedLayerNormalization
@ -28,18 +34,22 @@ class FusionEmbedLayerNoMask(Fusion):
|
|||
self.embed_node = None
|
||||
|
||||
def match_two_gather(self, add: NodeProto) -> Union[None, Tuple[NodeProto, NodeProto]]:
|
||||
gather_0_path = self.model.match_parent_path(add, ['Gather'], [0])
|
||||
gather_0_path = self.model.match_parent_path(add, ["Gather"], [0])
|
||||
if gather_0_path is None:
|
||||
return None
|
||||
|
||||
gather_1_path = self.model.match_parent_path(add, ['Gather'], [1])
|
||||
gather_1_path = self.model.match_parent_path(add, ["Gather"], [1])
|
||||
if gather_1_path is None:
|
||||
return None
|
||||
|
||||
return gather_0_path[0], gather_1_path[0]
|
||||
|
||||
def check_attention_subgraph(self, layernorm: NodeProto, input_name_to_nodes: Dict[str, List[NodeProto]],
|
||||
is_distil_bert: bool) -> bool:
|
||||
def check_attention_subgraph(
|
||||
self,
|
||||
layernorm: NodeProto,
|
||||
input_name_to_nodes: Dict[str, List[NodeProto]],
|
||||
is_distil_bert: bool,
|
||||
) -> bool:
|
||||
"""Check that LayerNormalization has a child of Attention node or subgraph like Attention.
|
||||
|
||||
Args:
|
||||
|
@ -50,10 +60,9 @@ class FusionEmbedLayerNoMask(Fusion):
|
|||
Returns:
|
||||
bool: whether there is Attention node or subgraph like Attention
|
||||
"""
|
||||
self.attention = self.model.find_first_child_by_type(layernorm,
|
||||
'Attention',
|
||||
input_name_to_nodes,
|
||||
recursive=False)
|
||||
self.attention = self.model.find_first_child_by_type(
|
||||
layernorm, "Attention", input_name_to_nodes, recursive=False
|
||||
)
|
||||
if self.attention is None:
|
||||
# In case user disables attention fusion, check whether subgraph looks like Attention.
|
||||
if layernorm.output[0] not in input_name_to_nodes:
|
||||
|
@ -63,8 +72,11 @@ class FusionEmbedLayerNoMask(Fusion):
|
|||
# For Albert, there is MatMul+Add after embedding layer before attention.
|
||||
if len(children) == 1 and children[0].op_type == "MatMul" and children[0].output[0] in input_name_to_nodes:
|
||||
grandchildren = input_name_to_nodes[children[0].output[0]]
|
||||
if len(grandchildren) == 1 and grandchildren[0].op_type == "Add" and grandchildren[0].output[
|
||||
0] in input_name_to_nodes:
|
||||
if (
|
||||
len(grandchildren) == 1
|
||||
and grandchildren[0].op_type == "Add"
|
||||
and grandchildren[0].output[0] in input_name_to_nodes
|
||||
):
|
||||
nodes = input_name_to_nodes[grandchildren[0].output[0]]
|
||||
for node in nodes:
|
||||
if node.op_type == "Attention":
|
||||
|
@ -77,14 +89,20 @@ class FusionEmbedLayerNoMask(Fusion):
|
|||
# Two Shape nodes might be merged by ORT
|
||||
if is_distil_bert:
|
||||
# SkipLayerNormailization might exist when model has been optimized by ORT first.
|
||||
if children_types != ['MatMul', 'MatMul', 'MatMul', 'Shape', 'SkipLayerNormalization'] and \
|
||||
children_types != ['Add', 'MatMul', 'MatMul', 'MatMul', 'Shape', 'Shape'] and \
|
||||
children_types != ['Add', 'MatMul', 'MatMul', 'MatMul', 'Shape']:
|
||||
if (
|
||||
children_types != ["MatMul", "MatMul", "MatMul", "Shape", "SkipLayerNormalization"]
|
||||
and children_types != ["Add", "MatMul", "MatMul", "MatMul", "Shape", "Shape"]
|
||||
and children_types != ["Add", "MatMul", "MatMul", "MatMul", "Shape"]
|
||||
):
|
||||
logger.debug("No Attention like subgraph in children of LayerNormalization")
|
||||
return False
|
||||
else:
|
||||
if children_types != ['Add', 'MatMul', 'MatMul', 'MatMul'] and \
|
||||
children_types != ['MatMul', 'MatMul', 'MatMul', 'SkipLayerNormalization']:
|
||||
if children_types != ["Add", "MatMul", "MatMul", "MatMul",] and children_types != [
|
||||
"MatMul",
|
||||
"MatMul",
|
||||
"MatMul",
|
||||
"SkipLayerNormalization",
|
||||
]:
|
||||
logger.debug("No Attention like subgraph in children of LayerNormalization")
|
||||
return False
|
||||
return True
|
||||
|
@ -110,9 +128,13 @@ class FusionEmbedLayerNoMask(Fusion):
|
|||
Gather
|
||||
"""
|
||||
# remove after tests pass
|
||||
path1 = self.model.match_parent_path(position_embedding_gather, ['Expand', 'Shape'], [1, 1])
|
||||
path1 = self.model.match_parent_path(position_embedding_gather, ["Expand", "Shape"], [1, 1])
|
||||
if path1 is None:
|
||||
path1 = self.model.match_parent_path(position_embedding_gather, ['Expand', 'Where', 'Reshape', 'Shape'], [1, 1, 2, 0])
|
||||
path1 = self.model.match_parent_path(
|
||||
position_embedding_gather,
|
||||
["Expand", "Where", "Reshape", "Shape"],
|
||||
[1, 1, 2, 0],
|
||||
)
|
||||
if path1 is None:
|
||||
return False
|
||||
|
||||
|
@ -120,14 +142,21 @@ class FusionEmbedLayerNoMask(Fusion):
|
|||
if shape.input[0] != input_ids:
|
||||
return False
|
||||
|
||||
_, path2, _ = self.model.match_parent_paths(expand, [(['Unsqueeze', 'Range', 'Cast', 'Gather', 'Shape'], [0, 0, 1, 0, 0]), \
|
||||
(['Unsqueeze', 'Range', 'Gather', 'Shape'], [0, 0, 1, 0])], output_name_to_node)
|
||||
_, path2, _ = self.model.match_parent_paths(
|
||||
expand,
|
||||
[
|
||||
(["Unsqueeze", "Range", "Cast", "Gather", "Shape"], [0, 0, 1, 0, 0]),
|
||||
(["Unsqueeze", "Range", "Gather", "Shape"], [0, 0, 1, 0]),
|
||||
],
|
||||
output_name_to_node,
|
||||
)
|
||||
if path2 is None:
|
||||
return False
|
||||
|
||||
range_node = path2[1]
|
||||
if not (self.utils.check_node_input_value(range_node, 0, 0)
|
||||
and self.utils.check_node_input_value(range_node, 2, 1)):
|
||||
if not (
|
||||
self.utils.check_node_input_value(range_node, 0, 0) and self.utils.check_node_input_value(range_node, 2, 1)
|
||||
):
|
||||
return False
|
||||
|
||||
gather_node = path2[-2]
|
||||
|
@ -141,19 +170,19 @@ class FusionEmbedLayerNoMask(Fusion):
|
|||
return True
|
||||
|
||||
def match_position_embedding_roberta(self, position_embedding_gather, input_ids, output_name_to_node):
|
||||
""" Match position embedding path from input_ids to Gather for Roberta.
|
||||
"""Match position embedding path from input_ids to Gather for Roberta.
|
||||
|
||||
Roberta Embedding Layer Pattern (* is optional since it might be removed by ORT, ? is the padding word id):
|
||||
Roberta Embedding Layer Pattern (* is optional since it might be removed by ORT, ? is the padding word id):
|
||||
(input_ids) --> Equal(B=?) -- Not -- Cast(to=6) -- CumSum(axis=1) -- Mul -- Cast(to=7) -- Add(B=1) -- Cast(to=7)* --> Gather
|
||||
| ^
|
||||
V |
|
||||
+------------------------------+
|
||||
+------------------------------+
|
||||
|
||||
Roberta new pattern from transformers v4.9:
|
||||
(input_ids) --> Equal(B=?) -- Not -- Cast(to=6) -- CumSum(axis=1) -- Add(B=0) -- Mul -- Cast(to=7) -- Add(B=1) --> Gather
|
||||
| ^
|
||||
V |
|
||||
+-------------------------------------------+
|
||||
+-------------------------------------------+
|
||||
|
||||
start_node = position_embedding_gather
|
||||
start_index = 1
|
||||
|
@ -209,22 +238,30 @@ class FusionEmbedLayerNoMask(Fusion):
|
|||
|
|
||||
LayerNormalization
|
||||
"""
|
||||
path = self.model.match_parent_path(position_embedding_gather, ['Slice', 'Unsqueeze'], [1, 2],
|
||||
output_name_to_node)
|
||||
path = self.model.match_parent_path(
|
||||
position_embedding_gather,
|
||||
["Slice", "Unsqueeze"],
|
||||
[1, 2],
|
||||
output_name_to_node,
|
||||
)
|
||||
if path is None:
|
||||
return False
|
||||
|
||||
slice, unsqueeze = path
|
||||
slice_weight = self.model.get_constant_value(slice.input[0])
|
||||
if not (slice_weight is not None and len(slice_weight.shape) == 2 and slice_weight.shape[0] == 1 \
|
||||
and self.utils.check_node_input_value(slice, 1, [0]) \
|
||||
and self.utils.check_node_input_value(slice, 3, [1]) \
|
||||
and (len(slice.input) == 4 or self.utils.check_node_input_value(slice, 4, [1]))):
|
||||
if not (
|
||||
slice_weight is not None
|
||||
and len(slice_weight.shape) == 2
|
||||
and slice_weight.shape[0] == 1
|
||||
and self.utils.check_node_input_value(slice, 1, [0])
|
||||
and self.utils.check_node_input_value(slice, 3, [1])
|
||||
and (len(slice.input) == 4 or self.utils.check_node_input_value(slice, 4, [1]))
|
||||
):
|
||||
return False
|
||||
|
||||
opset_version = self.model.get_opset_version()
|
||||
if opset_version < 13:
|
||||
if not FusionUtils.check_node_attribute(unsqueeze, 'axes', [0]):
|
||||
if not FusionUtils.check_node_attribute(unsqueeze, "axes", [0]):
|
||||
return False
|
||||
else:
|
||||
if not self.utils.check_node_input_value(unsqueeze, 1, [0]):
|
||||
|
@ -257,7 +294,7 @@ class FusionEmbedLayerNoMask(Fusion):
|
|||
|
||||
# TODO: Support roberta (position starts from 2 instead of 0) in EmbedLayerNormalization kernel
|
||||
# related: https://github.com/huggingface/transformers/issues/10736
|
||||
#if self.match_position_embedding_roberta(position_embedding_gather, input_ids, output_name_to_node):
|
||||
# if self.match_position_embedding_roberta(position_embedding_gather, input_ids, output_name_to_node):
|
||||
# return True
|
||||
|
||||
if self.match_position_embedding_distilbert(position_embedding_gather, input_ids, output_name_to_node):
|
||||
|
@ -266,8 +303,7 @@ class FusionEmbedLayerNoMask(Fusion):
|
|||
return False
|
||||
|
||||
def check_embedding(self, word_embedding_gather, segment_embedding_gather, position_embedding_gather):
|
||||
"""Sanity check of embedding weights, and match hidden_size of weights and shape of inputs.
|
||||
"""
|
||||
"""Sanity check of embedding weights, and match hidden_size of weights and shape of inputs."""
|
||||
input_ids = word_embedding_gather.input[1]
|
||||
segment_ids = segment_embedding_gather.input[1] if segment_embedding_gather else None
|
||||
position_ids = position_embedding_gather.input[1]
|
||||
|
@ -276,17 +312,25 @@ class FusionEmbedLayerNoMask(Fusion):
|
|||
input_ids_shape = self.shape_infer_helper.get_edge_shape(input_ids)
|
||||
position_ids_shape = self.shape_infer_helper.get_edge_shape(position_ids)
|
||||
assert input_ids_shape and position_ids_shape
|
||||
if not (len(input_ids_shape) == 2 and len(position_ids_shape) == 2
|
||||
and input_ids_shape[1] == position_ids_shape[1]):
|
||||
if not (
|
||||
len(input_ids_shape) == 2
|
||||
and len(position_ids_shape) == 2
|
||||
and input_ids_shape[1] == position_ids_shape[1]
|
||||
):
|
||||
logger.info(
|
||||
"Cannot fuse EmbedLayerNormalization: input_ids and position_ids not matched in 2nd dimension: {} vs {}"
|
||||
.format(input_ids_shape, position_ids_shape))
|
||||
"Cannot fuse EmbedLayerNormalization: input_ids and position_ids not matched in 2nd dimension: {} vs {}".format(
|
||||
input_ids_shape, position_ids_shape
|
||||
)
|
||||
)
|
||||
return False
|
||||
|
||||
if segment_ids and not self.shape_infer_helper.compare_shape(input_ids, segment_ids):
|
||||
logger.info(
|
||||
"Cannot fuse EmbedLayerNormalization: input_ids and segment_ids does not have same shape: {} != {}".
|
||||
format(input_ids_shape, self.shape_infer_helper.get_edge_shape(segment_ids)))
|
||||
"Cannot fuse EmbedLayerNormalization: input_ids and segment_ids does not have same shape: {} != {}".format(
|
||||
input_ids_shape,
|
||||
self.shape_infer_helper.get_edge_shape(segment_ids),
|
||||
)
|
||||
)
|
||||
return False
|
||||
|
||||
word_embedding_table = self.model.get_constant_value(word_embedding_gather.input[0])
|
||||
|
@ -295,15 +339,21 @@ class FusionEmbedLayerNoMask(Fusion):
|
|||
return False
|
||||
|
||||
position_embedding_table = self.model.get_constant_value(position_embedding_gather.input[0])
|
||||
if position_embedding_table is None or len(position_embedding_table.shape) != 2 or (
|
||||
word_embedding_table.shape[1] != position_embedding_table.shape[1]):
|
||||
if (
|
||||
position_embedding_table is None
|
||||
or len(position_embedding_table.shape) != 2
|
||||
or (word_embedding_table.shape[1] != position_embedding_table.shape[1])
|
||||
):
|
||||
logger.info("Cannot fuse EmbedLayerNormalization: position embedding table is not expected")
|
||||
return False
|
||||
|
||||
if segment_ids:
|
||||
segment_embedding_table = self.model.get_constant_value(segment_embedding_gather.input[0])
|
||||
if segment_embedding_table is None or len(segment_embedding_table.shape) != 2 or (
|
||||
word_embedding_table.shape[1] != segment_embedding_table.shape[1]):
|
||||
if (
|
||||
segment_embedding_table is None
|
||||
or len(segment_embedding_table.shape) != 2
|
||||
or (word_embedding_table.shape[1] != segment_embedding_table.shape[1])
|
||||
):
|
||||
logger.info("Cannot fuse EmbedLayerNormalization: segment embedding table is not expected")
|
||||
return False
|
||||
|
||||
|
@ -350,9 +400,16 @@ class FusionEmbedLayerNoMask(Fusion):
|
|||
|
||||
return int32_output, input_cast_node
|
||||
|
||||
def create_fused_node(self, input_ids: str, layernorm: NodeProto, word_embedding_gather: NodeProto,
|
||||
position_embedding_gather: NodeProto, segment_embedding_gather: Union[None, NodeProto],
|
||||
position_ids: str = None, embedding_sum_output = False):
|
||||
def create_fused_node(
|
||||
self,
|
||||
input_ids: str,
|
||||
layernorm: NodeProto,
|
||||
word_embedding_gather: NodeProto,
|
||||
position_embedding_gather: NodeProto,
|
||||
segment_embedding_gather: Union[None, NodeProto],
|
||||
position_ids: str = None,
|
||||
embedding_sum_output=False,
|
||||
):
|
||||
"""Create an EmbedLayerNormalization node. Note that segment embedding is optional.
|
||||
|
||||
Args:
|
||||
|
@ -368,7 +425,7 @@ class FusionEmbedLayerNoMask(Fusion):
|
|||
nodes_to_add = []
|
||||
input_ids, _ = self.cast_to_int32(input_ids)
|
||||
|
||||
node_name = self.model.create_node_name('EmbedLayerNormalization')
|
||||
node_name = self.model.create_node_name("EmbedLayerNormalization")
|
||||
|
||||
if layernorm.op_type == "LayerNormalization":
|
||||
gamma = layernorm.input[1]
|
||||
|
@ -382,17 +439,28 @@ class FusionEmbedLayerNoMask(Fusion):
|
|||
segment_ids, _ = self.cast_to_int32(segment_embedding_gather.input[1])
|
||||
|
||||
embed_node_inputs = [
|
||||
input_ids, segment_ids, word_embedding_gather.input[0], position_embedding_gather.input[0],
|
||||
segment_embedding_gather.input[0], gamma, beta
|
||||
input_ids,
|
||||
segment_ids,
|
||||
word_embedding_gather.input[0],
|
||||
position_embedding_gather.input[0],
|
||||
segment_embedding_gather.input[0],
|
||||
gamma,
|
||||
beta,
|
||||
]
|
||||
else: # no segment embedding
|
||||
embed_node_inputs = [
|
||||
input_ids, '', word_embedding_gather.input[0], position_embedding_gather.input[0], '', gamma, beta
|
||||
input_ids,
|
||||
"",
|
||||
word_embedding_gather.input[0],
|
||||
position_embedding_gather.input[0],
|
||||
"",
|
||||
gamma,
|
||||
beta,
|
||||
]
|
||||
|
||||
if position_ids is not None:
|
||||
#Adding an empty input for mask before position_ids
|
||||
embed_node_inputs.append('')
|
||||
# Adding an empty input for mask before position_ids
|
||||
embed_node_inputs.append("")
|
||||
position_ids, _ = self.cast_to_int32(position_ids)
|
||||
embed_node_inputs.append(position_ids)
|
||||
|
||||
|
@ -400,22 +468,24 @@ class FusionEmbedLayerNoMask(Fusion):
|
|||
if embedding_sum_output:
|
||||
embed_node_outputs.append(node_name + "_embedding_sum")
|
||||
|
||||
embed_node = helper.make_node('EmbedLayerNormalization',
|
||||
embed_node_inputs,
|
||||
outputs=embed_node_outputs,
|
||||
name=node_name)
|
||||
embed_node = helper.make_node(
|
||||
"EmbedLayerNormalization",
|
||||
embed_node_inputs,
|
||||
outputs=embed_node_outputs,
|
||||
name=node_name,
|
||||
)
|
||||
|
||||
embed_node.domain = "com.microsoft"
|
||||
|
||||
# Pass attribute "epsilon" from normalize node to EmbedLayerNormalization.
|
||||
for att in layernorm.attribute:
|
||||
if att.name == 'epsilon':
|
||||
if att.name == "epsilon":
|
||||
embed_node.attribute.extend([att])
|
||||
|
||||
# Set default value to 1e-12 if no attribute is found.
|
||||
# OnnxRuntime 1.2.0 or older has no epsilon attribute. The optimized model can only work for 1.3.0 or later.
|
||||
if len(embed_node.attribute) == 0:
|
||||
embed_node.attribute.extend([helper.make_attribute("epsilon", 1.0E-12)])
|
||||
embed_node.attribute.extend([helper.make_attribute("epsilon", 1.0e-12)])
|
||||
|
||||
# Make sure new EmbedLayerNormalization node is the last one in self.nodes_to_add.
|
||||
nodes_to_add.append(embed_node)
|
||||
|
@ -446,7 +516,7 @@ class FusionEmbedLayerNoMask(Fusion):
|
|||
return len(nodes) > 1
|
||||
|
||||
def fuse_gpt2(self, layernorm, add_before_layernorm, input_name_to_nodes, output_name_to_node):
|
||||
#graph checks
|
||||
# graph checks
|
||||
# gpt2 has no segment embedding, subgraph pattern is like
|
||||
# input_ids position_ids
|
||||
# | |
|
||||
|
@ -484,8 +554,15 @@ class FusionEmbedLayerNoMask(Fusion):
|
|||
optional_embedding_sum_output = True
|
||||
|
||||
# make the fused node
|
||||
embed_node = self.create_fused_node(input_ids, layernorm, word_embedding_gather, position_embedding_gather,
|
||||
None, position_ids, optional_embedding_sum_output)
|
||||
embed_node = self.create_fused_node(
|
||||
input_ids,
|
||||
layernorm,
|
||||
word_embedding_gather,
|
||||
position_embedding_gather,
|
||||
None,
|
||||
position_ids,
|
||||
optional_embedding_sum_output,
|
||||
)
|
||||
|
||||
# direct the output to another add too
|
||||
self.model.replace_input_of_all_nodes(layernorm.output[0], embed_node.output[0])
|
||||
|
@ -529,8 +606,9 @@ class FusionEmbedLayerNoMask(Fusion):
|
|||
if not self.check_embedding(word_embedding_gather, None, position_embedding_gather):
|
||||
return False
|
||||
|
||||
embed_node = self.create_fused_node(input_ids, layernorm, word_embedding_gather, position_embedding_gather,
|
||||
None)
|
||||
embed_node = self.create_fused_node(
|
||||
input_ids, layernorm, word_embedding_gather, position_embedding_gather, None
|
||||
)
|
||||
self.finish_fusion(layernorm, embed_node)
|
||||
return True
|
||||
|
||||
|
@ -543,7 +621,7 @@ class FusionEmbedLayerNoMask(Fusion):
|
|||
output_name_to_node (Dict[str, List[NodeProto]]): map from output name to nodes
|
||||
"""
|
||||
|
||||
add_2_gather = self.model.match_parent_path(add_before_layernorm, ['Add'], [0])
|
||||
add_2_gather = self.model.match_parent_path(add_before_layernorm, ["Add"], [0])
|
||||
if add_2_gather is None:
|
||||
return False
|
||||
|
||||
|
@ -558,7 +636,7 @@ class FusionEmbedLayerNoMask(Fusion):
|
|||
if not self.check_attention_subgraph(layernorm, input_name_to_nodes, is_distil_bert=False):
|
||||
return False
|
||||
|
||||
position_embedding_path = self.model.match_parent_path(add_before_layernorm, ['Gather'], [1])
|
||||
position_embedding_path = self.model.match_parent_path(add_before_layernorm, ["Gather"], [1])
|
||||
if position_embedding_path is None:
|
||||
return False
|
||||
|
||||
|
@ -574,14 +652,19 @@ class FusionEmbedLayerNoMask(Fusion):
|
|||
if not self.check_embedding(word_embedding_gather, segment_embedding_gather, position_embedding_gather):
|
||||
return False
|
||||
|
||||
embed_node = self.create_fused_node(input_ids, layernorm, word_embedding_gather, position_embedding_gather,
|
||||
segment_embedding_gather)
|
||||
embed_node = self.create_fused_node(
|
||||
input_ids,
|
||||
layernorm,
|
||||
word_embedding_gather,
|
||||
position_embedding_gather,
|
||||
segment_embedding_gather,
|
||||
)
|
||||
self.finish_fusion(layernorm, embed_node)
|
||||
return True
|
||||
|
||||
def fuse(self, node, input_name_to_nodes, output_name_to_node):
|
||||
if node.op_type == "LayerNormalization":
|
||||
first_add_path = self.model.match_parent_path(node, ['Add'], [0])
|
||||
first_add_path = self.model.match_parent_path(node, ["Add"], [0])
|
||||
if first_add_path is None:
|
||||
return
|
||||
add_before_layernorm = first_add_path[0]
@ -1,12 +1,13 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
from typing import Dict, Optional
# --------------------------------------------------------------------------
from logging import getLogger
from typing import Dict, Optional

from fusion_base import Fusion
from onnx import helper
from onnx_model import OnnxModel
from fusion_base import Fusion

logger = getLogger(__name__)

@ -40,7 +41,7 @@ class FusionFastGelu(Fusion):
|
|||
if tanh_node.output[0] not in input_name_to_nodes:
|
||||
return
|
||||
children = input_name_to_nodes[tanh_node.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != 'Add':
|
||||
if len(children) != 1 or children[0].op_type != "Add":
|
||||
return
|
||||
add_after_tanh = children[0]
|
||||
|
||||
|
@ -50,11 +51,11 @@ class FusionFastGelu(Fusion):
|
|||
if add_after_tanh.output[0] not in input_name_to_nodes:
|
||||
return
|
||||
children = input_name_to_nodes[add_after_tanh.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != 'Mul':
|
||||
if len(children) != 1 or children[0].op_type != "Mul":
|
||||
return
|
||||
mul_after_tanh = children[0]
|
||||
|
||||
mul_half = self.model.match_parent(mul_after_tanh, 'Mul', None, output_name_to_node)
|
||||
mul_half = self.model.match_parent(mul_after_tanh, "Mul", None, output_name_to_node)
|
||||
if mul_half is None:
|
||||
return
|
||||
|
||||
|
@ -64,10 +65,10 @@ class FusionFastGelu(Fusion):
|
|||
|
||||
root_input = mul_half.input[0 if i == 1 else 1]
|
||||
|
||||
#root_node could be None when root_input is graph input
|
||||
# root_node could be None when root_input is graph input
|
||||
root_node = self.model.get_parent(mul_half, 0 if i == 1 else 1, output_name_to_node)
|
||||
|
||||
mul_before_tanh = self.model.match_parent(tanh_node, 'Mul', 0, output_name_to_node)
|
||||
mul_before_tanh = self.model.match_parent(tanh_node, "Mul", 0, output_name_to_node)
|
||||
if mul_before_tanh is None:
|
||||
return
|
||||
|
||||
|
@ -75,15 +76,17 @@ class FusionFastGelu(Fusion):
|
|||
if i < 0:
|
||||
return
|
||||
|
||||
add_before_tanh = self.model.match_parent(mul_before_tanh, 'Add', 0 if i == 1 else 1, output_name_to_node)
|
||||
add_before_tanh = self.model.match_parent(mul_before_tanh, "Add", 0 if i == 1 else 1, output_name_to_node)
|
||||
if add_before_tanh is None:
|
||||
return
|
||||
|
||||
mul_after_pow = self.model.match_parent(add_before_tanh,
|
||||
'Mul',
|
||||
None,
|
||||
output_name_to_node,
|
||||
exclude=[root_node] if root_node else [])
|
||||
mul_after_pow = self.model.match_parent(
|
||||
add_before_tanh,
|
||||
"Mul",
|
||||
None,
|
||||
output_name_to_node,
|
||||
exclude=[root_node] if root_node else [],
|
||||
)
|
||||
if mul_after_pow is None:
|
||||
return
|
||||
|
||||
|
@ -91,7 +94,7 @@ class FusionFastGelu(Fusion):
|
|||
if i < 0:
|
||||
return
|
||||
|
||||
pow = self.model.match_parent(mul_after_pow, 'Pow', 0 if i == 1 else 1, output_name_to_node)
|
||||
pow = self.model.match_parent(mul_after_pow, "Pow", 0 if i == 1 else 1, output_name_to_node)
|
||||
if pow is None:
|
||||
return
|
||||
|
||||
|
@ -102,17 +105,30 @@ class FusionFastGelu(Fusion):
|
|||
return
|
||||
|
||||
subgraph_nodes = [
|
||||
mul_after_tanh, mul_half, add_after_tanh, tanh_node, mul_before_tanh, add_before_tanh, mul_after_pow, pow
|
||||
mul_after_tanh,
|
||||
mul_half,
|
||||
add_after_tanh,
|
||||
tanh_node,
|
||||
mul_before_tanh,
|
||||
add_before_tanh,
|
||||
mul_after_pow,
|
||||
pow,
|
||||
]
|
||||
if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [mul_after_tanh.output[0]], input_name_to_nodes,
|
||||
output_name_to_node):
|
||||
if not self.model.is_safe_to_fuse_nodes(
|
||||
subgraph_nodes,
|
||||
[mul_after_tanh.output[0]],
|
||||
input_name_to_nodes,
|
||||
output_name_to_node,
|
||||
):
|
||||
return
|
||||
|
||||
self.nodes_to_remove.extend(subgraph_nodes)
|
||||
fused_node = helper.make_node('FastGelu',
|
||||
inputs=[root_input],
|
||||
outputs=mul_after_tanh.output,
|
||||
name=self.model.create_node_name('FastGelu'))
|
||||
fused_node = helper.make_node(
|
||||
"FastGelu",
|
||||
inputs=[root_input],
|
||||
outputs=mul_after_tanh.output,
|
||||
name=self.model.create_node_name("FastGelu"),
|
||||
)
|
||||
fused_node.domain = "com.microsoft"
|
||||
self.nodes_to_add.append(fused_node)
|
||||
self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
|
||||
|
@ -134,7 +150,7 @@ class FusionFastGelu(Fusion):
|
|||
if tanh_node.output[0] not in input_name_to_nodes:
|
||||
return
|
||||
children = input_name_to_nodes[tanh_node.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != 'Add':
|
||||
if len(children) != 1 or children[0].op_type != "Add":
|
||||
return
|
||||
add_after_tanh = children[0]
|
||||
|
||||
|
@ -144,7 +160,7 @@ class FusionFastGelu(Fusion):
|
|||
if add_after_tanh.output[0] not in input_name_to_nodes:
|
||||
return
|
||||
children = input_name_to_nodes[add_after_tanh.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != 'Mul':
|
||||
if len(children) != 1 or children[0].op_type != "Mul":
|
||||
return
|
||||
mul_half = children[0]
|
||||
|
||||
|
@ -155,17 +171,19 @@ class FusionFastGelu(Fusion):
|
|||
if mul_half.output[0] not in input_name_to_nodes:
|
||||
return
|
||||
children = input_name_to_nodes[mul_half.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != 'Mul':
|
||||
if len(children) != 1 or children[0].op_type != "Mul":
|
||||
return
|
||||
mul_after_mul_half = children[0]
|
||||
|
||||
root_node = self.model.get_parent(mul_after_mul_half,
|
||||
0 if mul_after_mul_half.input[1] == mul_half.output[0] else 1,
|
||||
output_name_to_node)
|
||||
root_node = self.model.get_parent(
|
||||
mul_after_mul_half,
|
||||
0 if mul_after_mul_half.input[1] == mul_half.output[0] else 1,
|
||||
output_name_to_node,
|
||||
)
|
||||
if root_node is None:
|
||||
return
|
||||
|
||||
mul_before_tanh = self.model.match_parent(tanh_node, 'Mul', 0, output_name_to_node)
|
||||
mul_before_tanh = self.model.match_parent(tanh_node, "Mul", 0, output_name_to_node)
|
||||
if mul_before_tanh is None:
|
||||
return
|
||||
|
||||
|
@ -173,11 +191,11 @@ class FusionFastGelu(Fusion):
|
|||
if i < 0:
|
||||
return
|
||||
|
||||
add_before_tanh = self.model.match_parent(mul_before_tanh, 'Add', 0 if i == 1 else 1, output_name_to_node)
|
||||
add_before_tanh = self.model.match_parent(mul_before_tanh, "Add", 0 if i == 1 else 1, output_name_to_node)
|
||||
if add_before_tanh is None:
|
||||
return
|
||||
|
||||
mul_after_pow = self.model.match_parent(add_before_tanh, 'Mul', None, output_name_to_node, exclude=[root_node])
|
||||
mul_after_pow = self.model.match_parent(add_before_tanh, "Mul", None, output_name_to_node, exclude=[root_node])
|
||||
if mul_after_pow is None:
|
||||
return
|
||||
|
||||
|
@ -185,7 +203,7 @@ class FusionFastGelu(Fusion):
|
|||
if i < 0:
|
||||
return
|
||||
|
||||
pow = self.model.match_parent(mul_after_pow, 'Pow', 0 if i == 1 else 1, output_name_to_node)
|
||||
pow = self.model.match_parent(mul_after_pow, "Pow", 0 if i == 1 else 1, output_name_to_node)
|
||||
if pow is None:
|
||||
return
|
||||
|
||||
|
@ -196,18 +214,30 @@ class FusionFastGelu(Fusion):
|
|||
return
|
||||
|
||||
subgraph_nodes = [
|
||||
mul_after_mul_half, mul_half, add_after_tanh, tanh_node, mul_before_tanh, add_before_tanh, mul_after_pow,
|
||||
pow
|
||||
mul_after_mul_half,
|
||||
mul_half,
|
||||
add_after_tanh,
|
||||
tanh_node,
|
||||
mul_before_tanh,
|
||||
add_before_tanh,
|
||||
mul_after_pow,
|
||||
pow,
|
||||
]
|
||||
if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [mul_after_mul_half.output[0]], input_name_to_nodes,
|
||||
output_name_to_node):
|
||||
if not self.model.is_safe_to_fuse_nodes(
|
||||
subgraph_nodes,
|
||||
[mul_after_mul_half.output[0]],
|
||||
input_name_to_nodes,
|
||||
output_name_to_node,
|
||||
):
|
||||
return
|
||||
|
||||
self.nodes_to_remove.extend(subgraph_nodes)
|
||||
fused_node = helper.make_node('FastGelu',
|
||||
inputs=[root_node.output[0]],
|
||||
outputs=mul_after_mul_half.output,
|
||||
name=self.model.create_node_name('FastGelu'))
|
||||
fused_node = helper.make_node(
|
||||
"FastGelu",
|
||||
inputs=[root_node.output[0]],
|
||||
outputs=mul_after_mul_half.output,
|
||||
name=self.model.create_node_name("FastGelu"),
|
||||
)
|
||||
fused_node.domain = "com.microsoft"
|
||||
self.nodes_to_add.append(fused_node)
|
||||
self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
|
||||
|
@ -215,25 +245,25 @@ class FusionFastGelu(Fusion):
|
|||
|
||||
def fuse_3(self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict) -> Optional[bool]:
|
||||
"""
|
||||
OpenAI's gelu implementation, also used in Megatron:
|
||||
Gelu(x) = x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1.0 + 0.044715 * x * x)))
|
||||
OpenAI's gelu implementation, also used in Megatron:
|
||||
Gelu(x) = x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1.0 + 0.044715 * x * x)))
|
||||
|
||||
Fuse subgraph into a FastGelu node:
|
||||
+------------ Mul (B=0.79788456) -------------------+
|
||||
| |
|
||||
+-------------------------------+ |
|
||||
| | |
|
||||
| v v
|
||||
[root] --> Mul (B=0.044715) --> Mul --> Add(B=1) --> Mul --> Tanh --> Add(B=1) --> Mul-->
|
||||
| ^
|
||||
| |
|
||||
+-----------> Mul (B=0.5) --------------------------------------------------------+
|
||||
"""
|
||||
Fuse subgraph into a FastGelu node:
|
||||
+------------ Mul (B=0.79788456) -------------------+
|
||||
| |
|
||||
+-------------------------------+ |
|
||||
| | |
|
||||
| v v
|
||||
[root] --> Mul (B=0.044715) --> Mul --> Add(B=1) --> Mul --> Tanh --> Add(B=1) --> Mul-->
|
||||
| ^
|
||||
| |
|
||||
+-----------> Mul (B=0.5) --------------------------------------------------------+
|
||||
"""
|
||||
if tanh_node.output[0] not in input_name_to_nodes:
|
||||
return
|
||||
|
||||
children = input_name_to_nodes[tanh_node.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != 'Add':
|
||||
if len(children) != 1 or children[0].op_type != "Add":
|
||||
return
|
||||
add_after_tanh = children[0]
|
||||
|
||||
|
@ -243,11 +273,11 @@ class FusionFastGelu(Fusion):
|
|||
if add_after_tanh.output[0] not in input_name_to_nodes:
|
||||
return
|
||||
children = input_name_to_nodes[add_after_tanh.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != 'Mul':
|
||||
if len(children) != 1 or children[0].op_type != "Mul":
|
||||
return
|
||||
mul_last = children[0]
|
||||
|
||||
mul_half = self.model.match_parent(mul_last, 'Mul', None, output_name_to_node)
|
||||
mul_half = self.model.match_parent(mul_last, "Mul", None, output_name_to_node)
|
||||
if mul_half is None:
|
||||
return
|
||||
|
||||
|
@ -257,18 +287,18 @@ class FusionFastGelu(Fusion):
|
|||
|
||||
root_input = mul_half.input[0 if i == 1 else 1]
|
||||
|
||||
mul_before_tanh = self.model.match_parent(tanh_node, 'Mul', 0, output_name_to_node)
|
||||
mul_before_tanh = self.model.match_parent(tanh_node, "Mul", 0, output_name_to_node)
|
||||
if mul_before_tanh is None:
|
||||
return
|
||||
|
||||
add_1 = self.model.match_parent(mul_before_tanh, 'Add', None, output_name_to_node)
|
||||
add_1 = self.model.match_parent(mul_before_tanh, "Add", None, output_name_to_node)
|
||||
if add_1 is None:
|
||||
return
|
||||
j = self.model.find_constant_input(add_1, 1.0)
|
||||
if j < 0:
|
||||
return
|
||||
|
||||
mul_7978 = self.model.match_parent(mul_before_tanh, 'Mul', None, output_name_to_node)
|
||||
mul_7978 = self.model.match_parent(mul_before_tanh, "Mul", None, output_name_to_node)
|
||||
if mul_7978 is None:
|
||||
return
|
||||
k = self.model.find_constant_input(mul_7978, 0.7978, delta=0.0001)
|
||||
|
@ -277,7 +307,7 @@ class FusionFastGelu(Fusion):
|
|||
if mul_7978.input[0 if k == 1 else 1] != root_input:
|
||||
return
|
||||
|
||||
mul_before_add_1 = self.model.match_parent(add_1, 'Mul', 0 if j == 1 else 1, output_name_to_node)
|
||||
mul_before_add_1 = self.model.match_parent(add_1, "Mul", 0 if j == 1 else 1, output_name_to_node)
|
||||
if mul_before_add_1 is None:
|
||||
return
|
||||
|
||||
|
@ -288,7 +318,7 @@ class FusionFastGelu(Fusion):
|
|||
else:
|
||||
return
|
||||
|
||||
mul_0447 = self.model.match_parent(mul_before_add_1, 'Mul', another, output_name_to_node)
|
||||
mul_0447 = self.model.match_parent(mul_before_add_1, "Mul", another, output_name_to_node)
|
||||
if mul_0447 is None:
|
||||
return
|
||||
m = self.model.find_constant_input(mul_0447, 0.0447, delta=0.0001)
|
||||
|
@ -299,17 +329,31 @@ class FusionFastGelu(Fusion):
|
|||
return
|
||||
|
||||
subgraph_nodes = [
|
||||
mul_0447, mul_before_add_1, add_1, mul_before_tanh, tanh_node, add_after_tanh, mul_7978, mul_half, mul_last
|
||||
mul_0447,
|
||||
mul_before_add_1,
|
||||
add_1,
|
||||
mul_before_tanh,
|
||||
tanh_node,
|
||||
add_after_tanh,
|
||||
mul_7978,
|
||||
mul_half,
|
||||
mul_last,
|
||||
]
|
||||
if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [mul_last.output[0]], input_name_to_nodes,
|
||||
output_name_to_node):
|
||||
if not self.model.is_safe_to_fuse_nodes(
|
||||
subgraph_nodes,
|
||||
[mul_last.output[0]],
|
||||
input_name_to_nodes,
|
||||
output_name_to_node,
|
||||
):
|
||||
return
|
||||
|
||||
self.nodes_to_remove.extend(subgraph_nodes)
|
||||
fused_node = helper.make_node('FastGelu',
|
||||
inputs=[root_input],
|
||||
outputs=mul_last.output,
|
||||
name=self.model.create_node_name('FastGelu'))
|
||||
fused_node = helper.make_node(
|
||||
"FastGelu",
|
||||
inputs=[root_input],
|
||||
outputs=mul_last.output,
|
||||
name=self.model.create_node_name("FastGelu"),
|
||||
)
|
||||
fused_node.domain = "com.microsoft"
|
||||
self.nodes_to_add.append(fused_node)
|
||||
self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
|
||||
|
|
|
@ -1,12 +1,13 @@
|
|||
#-------------------------------------------------------------------------
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
#--------------------------------------------------------------------------
|
||||
from typing import Dict, Optional
|
||||
# --------------------------------------------------------------------------
|
||||
from logging import getLogger
|
||||
from typing import Dict, Optional
|
||||
|
||||
from fusion_base import Fusion
|
||||
from onnx import helper
|
||||
from onnx_model import OnnxModel
|
||||
from fusion_base import Fusion
|
||||
|
||||
logger = getLogger(__name__)
|
||||
|
||||
|
@ -45,7 +46,7 @@ class FusionGelu(Fusion):
|
|||
if erf_node.output[0] not in input_name_to_nodes:
|
||||
return
|
||||
children = input_name_to_nodes[erf_node.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != 'Add':
|
||||
if len(children) != 1 or children[0].op_type != "Add":
|
||||
return
|
||||
add_after_erf = children[0]
|
||||
|
||||
|
@ -55,11 +56,11 @@ class FusionGelu(Fusion):
|
|||
if add_after_erf.output[0] not in input_name_to_nodes:
|
||||
return
|
||||
children = input_name_to_nodes[add_after_erf.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != 'Mul':
|
||||
if len(children) != 1 or children[0].op_type != "Mul":
|
||||
return
|
||||
mul_after_erf = children[0]
|
||||
|
||||
div = self.model.match_parent(erf_node, 'Div', 0, output_name_to_node)
|
||||
div = self.model.match_parent(erf_node, "Div", 0, output_name_to_node)
|
||||
if div is None:
|
||||
return
|
||||
|
||||
|
@ -71,14 +72,14 @@ class FusionGelu(Fusion):
|
|||
another = 1 if mul_after_erf.input[0] == add_after_erf.output[0] else 0
|
||||
if subgraph_input == mul_after_erf.input[another]: # pattern 2
|
||||
children = input_name_to_nodes[mul_after_erf.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != 'Mul':
|
||||
if len(children) != 1 or children[0].op_type != "Mul":
|
||||
return
|
||||
mul_half = children[0]
|
||||
if not self.model.has_constant_input(mul_half, 0.5):
|
||||
return
|
||||
subgraph_output = mul_half.output[0]
|
||||
else: # pattern 1
|
||||
mul_half = self.model.match_parent(mul_after_erf, 'Mul', another, output_name_to_node)
|
||||
mul_half = self.model.match_parent(mul_after_erf, "Mul", another, output_name_to_node)
|
||||
if mul_half is None:
|
||||
return
|
||||
|
||||
|
@ -91,12 +92,13 @@ class FusionGelu(Fusion):
|
|||
subgraph_output = mul_after_erf.output[0]
|
||||
|
||||
subgraph_nodes = [div, erf_node, add_after_erf, mul_after_erf, mul_half]
|
||||
if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [subgraph_output], input_name_to_nodes,
|
||||
output_name_to_node):
|
||||
if not self.model.is_safe_to_fuse_nodes(
|
||||
subgraph_nodes, [subgraph_output], input_name_to_nodes, output_name_to_node
|
||||
):
|
||||
return
|
||||
|
||||
self.nodes_to_remove.extend(subgraph_nodes)
|
||||
fused_node = helper.make_node('Gelu', inputs=[subgraph_input], outputs=[subgraph_output])
|
||||
fused_node = helper.make_node("Gelu", inputs=[subgraph_input], outputs=[subgraph_output])
|
||||
fused_node.domain = "com.microsoft"
|
||||
self.nodes_to_add.append(fused_node)
|
||||
self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
|
||||
|
@ -117,7 +119,7 @@ class FusionGelu(Fusion):
|
|||
if erf_node.output[0] not in input_name_to_nodes:
|
||||
return
|
||||
children = input_name_to_nodes[erf_node.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != 'Add':
|
||||
if len(children) != 1 or children[0].op_type != "Add":
|
||||
return
|
||||
add_after_erf = children[0]
|
||||
|
||||
|
@ -127,7 +129,7 @@ class FusionGelu(Fusion):
|
|||
if add_after_erf.output[0] not in input_name_to_nodes:
|
||||
return
|
||||
children = input_name_to_nodes[add_after_erf.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != 'Mul':
|
||||
if len(children) != 1 or children[0].op_type != "Mul":
|
||||
return
|
||||
mul_after_erf = children[0]
|
||||
|
||||
|
@ -137,17 +139,17 @@ class FusionGelu(Fusion):
|
|||
if mul_after_erf.output[0] not in input_name_to_nodes:
|
||||
return
|
||||
children = input_name_to_nodes[mul_after_erf.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != 'Mul':
|
||||
if len(children) != 1 or children[0].op_type != "Mul":
|
||||
return
|
||||
mul = children[0]
|
||||
|
||||
div = self.model.match_parent(erf_node, 'Div', 0, output_name_to_node)
|
||||
div = self.model.match_parent(erf_node, "Div", 0, output_name_to_node)
|
||||
if div is None:
|
||||
return
|
||||
|
||||
sqrt_node = None
|
||||
if self.model.find_constant_input(div, 1.4142, delta=0.001) != 1:
|
||||
sqrt_node = self.model.match_parent(div, 'Sqrt', 1, output_name_to_node)
|
||||
sqrt_node = self.model.match_parent(div, "Sqrt", 1, output_name_to_node)
|
||||
if sqrt_node is None:
|
||||
return
|
||||
if not self.model.has_constant_input(sqrt_node, 2.0):
|
||||
|
@ -164,12 +166,13 @@ class FusionGelu(Fusion):
|
|||
if sqrt_node:
|
||||
subgraph_nodes.append(sqrt_node)
|
||||
|
||||
if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [mul.output[0]], input_name_to_nodes,
|
||||
output_name_to_node):
|
||||
if not self.model.is_safe_to_fuse_nodes(
|
||||
subgraph_nodes, [mul.output[0]], input_name_to_nodes, output_name_to_node
|
||||
):
|
||||
return
|
||||
|
||||
self.nodes_to_remove.extend(subgraph_nodes)
|
||||
fused_node = helper.make_node('Gelu', inputs=[root_node.output[0]], outputs=[mul.output[0]])
|
||||
fused_node = helper.make_node("Gelu", inputs=[root_node.output[0]], outputs=[mul.output[0]])
|
||||
fused_node.domain = "com.microsoft"
|
||||
self.nodes_to_add.append(fused_node)
|
||||
self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
|
||||
|
@ -191,7 +194,7 @@ class FusionGelu(Fusion):
|
|||
if erf_node.output[0] not in input_name_to_nodes:
|
||||
return
|
||||
children = input_name_to_nodes[erf_node.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != 'Add':
|
||||
if len(children) != 1 or children[0].op_type != "Add":
|
||||
return
|
||||
add_after_erf = children[0]
|
||||
|
||||
|
@ -201,14 +204,14 @@ class FusionGelu(Fusion):
|
|||
if add_after_erf.output[0] not in input_name_to_nodes:
|
||||
return
|
||||
children = input_name_to_nodes[add_after_erf.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != 'Mul':
|
||||
if len(children) != 1 or children[0].op_type != "Mul":
|
||||
return
|
||||
mul_half = children[0]
|
||||
|
||||
if not self.model.has_constant_input(mul_half, 0.5):
|
||||
return
|
||||
|
||||
first_mul = self.model.match_parent(erf_node, 'Mul', 0, output_name_to_node)
|
||||
first_mul = self.model.match_parent(erf_node, "Mul", 0, output_name_to_node)
|
||||
if first_mul is None:
|
||||
return
|
||||
|
||||
|
@ -223,7 +226,7 @@ class FusionGelu(Fusion):
|
|||
if mul_half.output[0] not in input_name_to_nodes:
|
||||
return
|
||||
children = input_name_to_nodes[mul_half.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != 'Mul':
|
||||
if len(children) != 1 or children[0].op_type != "Mul":
|
||||
return
|
||||
last_mul = children[0]
|
||||
|
||||
|
@ -231,12 +234,16 @@ class FusionGelu(Fusion):
|
|||
return
|
||||
|
||||
subgraph_nodes = [first_mul, erf_node, add_after_erf, mul_half, last_mul]
|
||||
if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [last_mul.output[0]], input_name_to_nodes,
|
||||
output_name_to_node):
|
||||
if not self.model.is_safe_to_fuse_nodes(
|
||||
subgraph_nodes,
|
||||
[last_mul.output[0]],
|
||||
input_name_to_nodes,
|
||||
output_name_to_node,
|
||||
):
|
||||
return
|
||||
|
||||
self.nodes_to_remove.extend(subgraph_nodes)
|
||||
fused_node = helper.make_node('Gelu', inputs=[root_node.output[0]], outputs=[last_mul.output[0]])
|
||||
fused_node = helper.make_node("Gelu", inputs=[root_node.output[0]], outputs=[last_mul.output[0]])
|
||||
fused_node.domain = "com.microsoft"
|
||||
self.nodes_to_add.append(fused_node)
|
||||
self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
|
||||
|
|
|
@ -1,23 +1,26 @@
|
|||
#-------------------------------------------------------------------------
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
#--------------------------------------------------------------------------
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
from logging import getLogger
|
||||
|
||||
from fusion_base import Fusion
|
||||
from onnx import helper
|
||||
from onnx_model import OnnxModel
|
||||
from fusion_base import Fusion
|
||||
|
||||
|
||||
class FusionGeluApproximation(Fusion):
|
||||
def __init__(self, model: OnnxModel):
|
||||
super().__init__(model, 'FastGelu', ['Gelu', 'BiasGelu'], 'GeluApproximation')
|
||||
super().__init__(model, "FastGelu", ["Gelu", "BiasGelu"], "GeluApproximation")
|
||||
|
||||
def fuse(self, node, input_name_to_nodes, output_name_to_node):
|
||||
new_node = helper.make_node("FastGelu",
|
||||
inputs=node.input,
|
||||
outputs=node.output,
|
||||
name=self.model.create_node_name("FastGelu", node.op_type + "_Approximation"))
|
||||
new_node = helper.make_node(
|
||||
"FastGelu",
|
||||
inputs=node.input,
|
||||
outputs=node.output,
|
||||
name=self.model.create_node_name("FastGelu", node.op_type + "_Approximation"),
|
||||
)
|
||||
new_node.domain = "com.microsoft"
|
||||
self.nodes_to_remove.append(node)
|
||||
self.nodes_to_add.append(new_node)
|
||||
|
|
|
@ -1,20 +1,21 @@
|
|||
#-------------------------------------------------------------------------
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
#--------------------------------------------------------------------------
|
||||
import numpy as np
|
||||
# --------------------------------------------------------------------------
|
||||
from logging import getLogger
|
||||
from onnx import helper, numpy_helper, TensorProto
|
||||
from onnx_model import OnnxModel
|
||||
|
||||
import numpy as np
|
||||
from fusion_base import Fusion
|
||||
from fusion_utils import FusionUtils
|
||||
from onnx import TensorProto, helper, numpy_helper
|
||||
from onnx_model import OnnxModel
|
||||
|
||||
logger = getLogger(__name__)
|
||||
|
||||
|
||||
class FusionGptAttentionPastBase(Fusion):
|
||||
"""Base class for GPT Attention Fusion with past state
|
||||
"""
|
||||
"""Base class for GPT Attention Fusion with past state"""
|
||||
|
||||
def __init__(self, model: OnnxModel, num_heads: int):
|
||||
super().__init__(model, "Attention", "LayerNormalization", "with past")
|
||||
self.num_heads = num_heads
|
||||
|
@ -41,7 +42,7 @@ class FusionGptAttentionPastBase(Fusion):
|
|||
# |
|
||||
# {present}
|
||||
gather = self.model.get_parent(concat_v, 0, output_name_to_node)
|
||||
if gather.op_type != 'Gather':
|
||||
if gather.op_type != "Gather":
|
||||
logger.debug("match_past_pattern_1: expect Gather for past")
|
||||
return None
|
||||
|
||||
|
@ -51,10 +52,10 @@ class FusionGptAttentionPastBase(Fusion):
|
|||
past = gather.input[0]
|
||||
|
||||
parent = self.model.get_parent(concat_k, 0, output_name_to_node)
|
||||
if parent.op_type == 'Gather':
|
||||
if parent.op_type == "Gather":
|
||||
gather_past_k = parent
|
||||
else:
|
||||
past_k_nodes = self.model.match_parent_path(concat_k, ['Transpose', 'Gather'], [0, 0])
|
||||
past_k_nodes = self.model.match_parent_path(concat_k, ["Transpose", "Gather"], [0, 0])
|
||||
if past_k_nodes is None:
|
||||
logger.debug("match_past_pattern_1: failed match Transpose and Gather")
|
||||
return None
|
||||
|
@ -93,7 +94,7 @@ class FusionGptAttentionPastBase(Fusion):
|
|||
# {present}
|
||||
#
|
||||
squeeze = self.model.get_parent(concat_v, 0, output_name_to_node)
|
||||
if squeeze.op_type != 'Squeeze':
|
||||
if squeeze.op_type != "Squeeze":
|
||||
logger.debug("match_past_pattern_2: expect Squeeze as parent of concat_v")
|
||||
return None
|
||||
|
||||
|
@ -104,11 +105,11 @@ class FusionGptAttentionPastBase(Fusion):
|
|||
|
||||
opset_version = self.model.get_opset_version()
|
||||
if opset_version < 13:
|
||||
if not FusionUtils.check_node_attribute(squeeze, 'axes', [0]):
|
||||
if not FusionUtils.check_node_attribute(squeeze, "axes", [0]):
|
||||
logger.debug("match_past_pattern_2: axes != [0] for Squeeze in past path")
|
||||
return None
|
||||
|
||||
if not FusionUtils.check_node_attribute(split, 'split', [1, 1]):
|
||||
if not FusionUtils.check_node_attribute(split, "split", [1, 1]):
|
||||
logger.debug("match_past_pattern_2: split != [1, 1] for Split in past path")
|
||||
return None
|
||||
else:
|
||||
|
@ -120,12 +121,12 @@ class FusionGptAttentionPastBase(Fusion):
|
|||
logger.debug("match_past_pattern_2: split != [1, 1] for Split in past path")
|
||||
return None
|
||||
|
||||
if not FusionUtils.check_node_attribute(split, 'axis', 0, default_value=0):
|
||||
if not FusionUtils.check_node_attribute(split, "axis", 0, default_value=0):
|
||||
logger.debug("match_past_pattern_2: attribute axis of Split are not expected in past path")
|
||||
return None
|
||||
past = split.input[0]
|
||||
|
||||
past_k_nodes = self.model.match_parent_path(concat_k, ['Squeeze', 'Split'], [0, 0])
|
||||
past_k_nodes = self.model.match_parent_path(concat_k, ["Squeeze", "Split"], [0, 0])
|
||||
if past_k_nodes is None:
|
||||
logger.debug("match_past_pattern_2: failed to match past_k_nodes path")
|
||||
return None
|
||||
|
@ -138,17 +139,15 @@ class FusionGptAttentionPastBase(Fusion):
|
|||
return past
|
||||
|
||||
def match_present(self, concat_v, input_name_to_nodes):
|
||||
unsqueeze_present_v = self.model.find_first_child_by_type(concat_v,
|
||||
'Unsqueeze',
|
||||
input_name_to_nodes,
|
||||
recursive=False)
|
||||
unsqueeze_present_v = self.model.find_first_child_by_type(
|
||||
concat_v, "Unsqueeze", input_name_to_nodes, recursive=False
|
||||
)
|
||||
if not unsqueeze_present_v:
|
||||
logger.info("expect unsqueeze for present")
|
||||
return None
|
||||
concat_present = self.model.find_first_child_by_type(unsqueeze_present_v,
|
||||
'Concat',
|
||||
input_name_to_nodes,
|
||||
recursive=False)
|
||||
concat_present = self.model.find_first_child_by_type(
|
||||
unsqueeze_present_v, "Concat", input_name_to_nodes, recursive=False
|
||||
)
|
||||
if not concat_present:
|
||||
logger.info("expect concat for present")
|
||||
return None
|
||||
|
@ -172,31 +171,50 @@ class FusionGptAttention(FusionGptAttentionPastBase):
|
|||
"""
|
||||
Fuse GPT-2 Attention with past state subgraph into one Attention node.
|
||||
"""
|
||||
|
||||
def __init__(self, model: OnnxModel, num_heads: int):
|
||||
super().__init__(model, num_heads)
|
||||
|
||||
def create_attention_node(self, fc_weight, fc_bias, gemm_qkv, past, present, input, output, mask,
|
||||
is_unidirectional):
|
||||
attention_node_name = self.model.create_node_name('GptAttention')
|
||||
attention_node = helper.make_node('Attention',
|
||||
inputs=[input, fc_weight, fc_bias, mask, past],
|
||||
outputs=[attention_node_name + "_output", present],
|
||||
name=attention_node_name)
|
||||
def create_attention_node(
|
||||
self,
|
||||
fc_weight,
|
||||
fc_bias,
|
||||
gemm_qkv,
|
||||
past,
|
||||
present,
|
||||
input,
|
||||
output,
|
||||
mask,
|
||||
is_unidirectional,
|
||||
):
|
||||
attention_node_name = self.model.create_node_name("GptAttention")
|
||||
attention_node = helper.make_node(
|
||||
"Attention",
|
||||
inputs=[input, fc_weight, fc_bias, mask, past],
|
||||
outputs=[attention_node_name + "_output", present],
|
||||
name=attention_node_name,
|
||||
)
|
||||
attention_node.domain = "com.microsoft"
|
||||
attention_node.attribute.extend([
|
||||
helper.make_attribute("num_heads", self.num_heads),
|
||||
helper.make_attribute("unidirectional", 1 if is_unidirectional else 0)
|
||||
])
|
||||
attention_node.attribute.extend(
|
||||
[
|
||||
helper.make_attribute("num_heads", self.num_heads),
|
||||
helper.make_attribute("unidirectional", 1 if is_unidirectional else 0),
|
||||
]
|
||||
)
|
||||
|
||||
matmul_node = helper.make_node('MatMul',
|
||||
inputs=[attention_node_name + "_output", gemm_qkv.input[1]],
|
||||
outputs=[attention_node_name + "_matmul_output"],
|
||||
name=attention_node_name + "_matmul")
|
||||
matmul_node = helper.make_node(
|
||||
"MatMul",
|
||||
inputs=[attention_node_name + "_output", gemm_qkv.input[1]],
|
||||
outputs=[attention_node_name + "_matmul_output"],
|
||||
name=attention_node_name + "_matmul",
|
||||
)
|
||||
|
||||
add_node = helper.make_node('Add',
|
||||
inputs=[attention_node_name + "_matmul_output", gemm_qkv.input[2]],
|
||||
outputs=[output],
|
||||
name=attention_node_name + "_add")
|
||||
add_node = helper.make_node(
|
||||
"Add",
|
||||
inputs=[attention_node_name + "_matmul_output", gemm_qkv.input[2]],
|
||||
outputs=[output],
|
||||
name=attention_node_name + "_add",
|
||||
)
|
||||
self.nodes_to_add.extend([attention_node, matmul_node, add_node])
|
||||
self.node_name_to_graph_name[attention_node.name] = self.this_graph_name
|
||||
self.node_name_to_graph_name[matmul_node.name] = self.this_graph_name
|
||||
|
@ -208,28 +226,44 @@ class FusionGptAttention(FusionGptAttentionPastBase):
|
|||
return_indice = []
|
||||
qkv_nodes = self.model.match_parent_path(
|
||||
normalize_node,
|
||||
['Add', 'Reshape', 'Gemm', 'Reshape', 'Reshape', 'Transpose', 'MatMul'],
|
||||
[0, None, 0, 0, 0, 0, 0],
|
||||
["Add", "Reshape", "Gemm", "Reshape", "Reshape", "Transpose", "MatMul"],
|
||||
[0, None, 0, 0, 0, 0, 0],
|
||||
output_name_to_node=output_name_to_node,
|
||||
return_indice=return_indice
|
||||
) # yapf: disable
|
||||
return_indice=return_indice,
|
||||
) # yapf: disable
|
||||
if qkv_nodes is None:
|
||||
return
|
||||
(add_qkv, reshape_qkv, gemm_qkv, reshape_1, reshape_2, transpose_qkv, matmul_qkv) = qkv_nodes
|
||||
(
|
||||
add_qkv,
|
||||
reshape_qkv,
|
||||
gemm_qkv,
|
||||
reshape_1,
|
||||
reshape_2,
|
||||
transpose_qkv,
|
||||
matmul_qkv,
|
||||
) = qkv_nodes
|
||||
|
||||
another_input = add_qkv.input[1 - return_indice[0]]
|
||||
|
||||
v_nodes = self.model.match_parent_path(matmul_qkv, ['Concat', 'Transpose', 'Reshape', 'Split'], [1, 1, 0, 0])
|
||||
v_nodes = self.model.match_parent_path(matmul_qkv, ["Concat", "Transpose", "Reshape", "Split"], [1, 1, 0, 0])
|
||||
if v_nodes is None:
|
||||
logger.debug("fuse_attention: failed to match v path")
|
||||
return
|
||||
(concat_v, transpose_v, reshape_v, split_fc) = v_nodes
|
||||
|
||||
fc_nodes = self.model.match_parent_path(split_fc, ['Reshape', 'Gemm', 'Reshape', 'LayerNormalization'],
|
||||
[0, 0, 0, 0], output_name_to_node)
|
||||
fc_nodes = self.model.match_parent_path(
|
||||
split_fc,
|
||||
["Reshape", "Gemm", "Reshape", "LayerNormalization"],
|
||||
[0, 0, 0, 0],
|
||||
output_name_to_node,
|
||||
)
|
||||
if fc_nodes is None:
|
||||
fc_nodes = self.model.match_parent_path(split_fc, ['Add', 'MatMul', 'LayerNormalization'], [0, None, 0],
|
||||
output_name_to_node)
|
||||
fc_nodes = self.model.match_parent_path(
|
||||
split_fc,
|
||||
["Add", "MatMul", "LayerNormalization"],
|
||||
[0, None, 0],
|
||||
output_name_to_node,
|
||||
)
|
||||
if fc_nodes is None:
|
||||
logger.debug("fuse_attention: failed to match fc path")
|
||||
return
|
||||
|
@ -250,13 +284,25 @@ class FusionGptAttention(FusionGptAttentionPastBase):
|
|||
slice_mask = None
|
||||
input_mask_nodes = None
|
||||
concat_k_to_match = None
|
||||
qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Sub', 'Mul', 'Div', 'MatMul'], [0, 0, 0, 0, 0])
|
||||
qk_nodes = self.model.match_parent_path(matmul_qkv, ["Softmax", "Sub", "Mul", "Div", "MatMul"], [0, 0, 0, 0, 0])
|
||||
if qk_nodes is not None:
|
||||
(softmax_qk, sub_qk, mul_qk, div_qk, matmul_qk) = qk_nodes
|
||||
mask_nodes = self.model.match_parent_path(
|
||||
sub_qk,
|
||||
['Mul', 'Sub', 'Slice', 'Slice', 'Unsqueeze', 'Sub', 'Squeeze', 'Slice', 'Shape', 'Div'],
|
||||
[1, 0, 1, 0, 1, 0, 0, 0, 0, 0]) # yapf: disable
|
||||
[
|
||||
"Mul",
|
||||
"Sub",
|
||||
"Slice",
|
||||
"Slice",
|
||||
"Unsqueeze",
|
||||
"Sub",
|
||||
"Squeeze",
|
||||
"Slice",
|
||||
"Shape",
|
||||
"Div",
|
||||
],
|
||||
[1, 0, 1, 0, 1, 0, 0, 0, 0, 0],
|
||||
) # yapf: disable
|
||||
if mask_nodes is None:
|
||||
logger.debug("fuse_attention: failed to match unidirectional mask path")
|
||||
return
|
||||
|
@ -269,8 +315,13 @@ class FusionGptAttention(FusionGptAttentionPastBase):
|
|||
else:
|
||||
# New pattern for gpt2 from PyTorch 1.5.0 and Transformers 2.9.0.
|
||||
i, qk_nodes, _ = self.model.match_parent_paths(
|
||||
matmul_qkv, [(['Softmax', 'Where', 'Div', 'MatMul'], [0, 0, 1, 0]),
|
||||
(['Softmax', 'Add', 'Where', 'Div', 'MatMul'], [0, 0, None, 1, 0])], output_name_to_node)
|
||||
matmul_qkv,
|
||||
[
|
||||
(["Softmax", "Where", "Div", "MatMul"], [0, 0, 1, 0]),
|
||||
(["Softmax", "Add", "Where", "Div", "MatMul"], [0, 0, None, 1, 0]),
|
||||
],
|
||||
output_name_to_node,
|
||||
)
|
||||
if qk_nodes is None:
|
||||
logger.debug("fuse_attention: failed to match qk nodes")
|
||||
return
|
||||
|
@ -284,20 +335,40 @@ class FusionGptAttention(FusionGptAttentionPastBase):
|
|||
_, input_mask_nodes, _ = self.model.match_parent_paths(
|
||||
add_qk,
|
||||
[
|
||||
(['Mul', 'Sub', 'Cast', 'Unsqueeze', 'Unsqueeze', 'Reshape'], [None, 0, 1, 0, 0, 0]),
|
||||
(['Mul', 'Sub', 'Unsqueeze', 'Unsqueeze', 'Reshape'], [None, 0, 1, 0, 0]),
|
||||
(['Mul', 'Sub', 'Unsqueeze', 'Unsqueeze'], [None, 0, 1, 0]), # useless cast and reshape are removed.
|
||||
(
|
||||
["Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze", "Reshape"],
|
||||
[None, 0, 1, 0, 0, 0],
|
||||
),
|
||||
(
|
||||
["Mul", "Sub", "Unsqueeze", "Unsqueeze", "Reshape"],
|
||||
[None, 0, 1, 0, 0],
|
||||
),
|
||||
(
|
||||
["Mul", "Sub", "Unsqueeze", "Unsqueeze"],
|
||||
[None, 0, 1, 0],
|
||||
), # useless cast and reshape are removed.
|
||||
],
|
||||
output_name_to_node) # yapf: disable
|
||||
output_name_to_node,
|
||||
) # yapf: disable
|
||||
if input_mask_nodes is None:
|
||||
logger.debug("fuse_attention: failed to match input attention mask path")
|
||||
return
|
||||
|
||||
mask_nodes = self.model.match_parent_path(
|
||||
where_qk,
|
||||
['Cast', 'Slice', 'Slice', 'Unsqueeze', 'Sub', 'Squeeze', 'Slice', 'Shape'],
|
||||
[ 0, 0, 0, 1, 0, 0, 0, 0],
|
||||
output_name_to_node) # yapf: disable
|
||||
[
|
||||
"Cast",
|
||||
"Slice",
|
||||
"Slice",
|
||||
"Unsqueeze",
|
||||
"Sub",
|
||||
"Squeeze",
|
||||
"Slice",
|
||||
"Shape",
|
||||
],
|
||||
[0, 0, 0, 1, 0, 0, 0, 0],
|
||||
output_name_to_node,
|
||||
) # yapf: disable
|
||||
if mask_nodes is None:
|
||||
# TODO: match mask path for GPT2LMHeadModel_BeamSearchStep.
|
||||
logger.debug("fuse_attention: failed to match mask path")
|
||||
|
@ -318,8 +389,9 @@ class FusionGptAttention(FusionGptAttentionPastBase):
|
|||
|
||||
# Validate that the mask data is either lower triangular (unidirectional) or all ones
|
||||
mask_data = numpy_helper.to_array(self.model.get_initializer(slice_mask.input[0]))
|
||||
if not (len(mask_data.shape) == 4 and mask_data.shape[:2] == (1, 1)
|
||||
and mask_data.shape[2] == mask_data.shape[3]):
|
||||
if not (
|
||||
len(mask_data.shape) == 4 and mask_data.shape[:2] == (1, 1) and mask_data.shape[2] == mask_data.shape[3]
|
||||
):
|
||||
logger.debug("fuse_attention: skip since mask shape is not 1x1xWxW")
|
||||
return
|
||||
if np.allclose(mask_data, np.ones_like(mask_data)):
|
||||
|
@ -328,7 +400,7 @@ class FusionGptAttention(FusionGptAttentionPastBase):
|
|||
logger.debug("fuse_attention: skip since mask is neither lower triangular nor ones")
|
||||
return
|
||||
|
||||
q_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Reshape', 'Split'], [0, 0, 0])
|
||||
q_nodes = self.model.match_parent_path(matmul_qk, ["Transpose", "Reshape", "Split"], [0, 0, 0])
|
||||
if q_nodes is None:
|
||||
logger.debug("fuse_attention: failed to match q path")
|
||||
return
|
||||
|
@ -337,11 +409,14 @@ class FusionGptAttention(FusionGptAttentionPastBase):
|
|||
logger.debug("fuse_attention: skip since split_fc != split_q")
|
||||
return
|
||||
|
||||
k_nodes = self.model.match_parent_path(matmul_qk, ['Concat', 'Transpose', 'Reshape', 'Split'], [1, 1, 0, 0])
|
||||
k_nodes = self.model.match_parent_path(matmul_qk, ["Concat", "Transpose", "Reshape", "Split"], [1, 1, 0, 0])
|
||||
if k_nodes is None:
|
||||
# This pattern is from pytorch 1.7.1 and transformers 4.6.1
|
||||
k_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Concat', 'Transpose', 'Reshape', 'Split'],
|
||||
[1, 0, 1, 0, 0])
|
||||
k_nodes = self.model.match_parent_path(
|
||||
matmul_qk,
|
||||
["Transpose", "Concat", "Transpose", "Reshape", "Split"],
|
||||
[1, 0, 1, 0, 0],
|
||||
)
|
||||
if k_nodes is None:
|
||||
logger.debug("fuse_attention: failed to match k path")
|
||||
return
|
||||
|
@ -357,14 +432,15 @@ class FusionGptAttention(FusionGptAttentionPastBase):
|
|||
logger.debug("fuse_attention: skip since concat_k != concat_k_to_match")
|
||||
return
|
||||
|
||||
attention_mask_input_name = ''
|
||||
attention_mask_input_name = ""
|
||||
if input_mask_nodes is not None:
|
||||
input_name = input_mask_nodes[-1].input[0]
|
||||
attention_mask_input_name = self.cast_attention_mask(input_name)
|
||||
|
||||
# Match past and present paths
|
||||
past = self.match_past_pattern_1(concat_k, concat_v, output_name_to_node) or \
|
||||
self.match_past_pattern_2(concat_k, concat_v, output_name_to_node)
|
||||
past = self.match_past_pattern_1(concat_k, concat_v, output_name_to_node) or self.match_past_pattern_2(
|
||||
concat_k, concat_v, output_name_to_node
|
||||
)
|
||||
if past is None:
|
||||
logger.info("fuse_attention: failed to match past path")
|
||||
return
|
||||
|
@ -380,8 +456,17 @@ class FusionGptAttention(FusionGptAttentionPastBase):
|
|||
logger.info("expect present to be graph output")
|
||||
return
|
||||
|
||||
self.create_attention_node(fc_weight, fc_bias, gemm_qkv, past, present, layernorm_before_attention.output[0],
|
||||
reshape_qkv.output[0], attention_mask_input_name, is_unidirectional)
|
||||
self.create_attention_node(
|
||||
fc_weight,
|
||||
fc_bias,
|
||||
gemm_qkv,
|
||||
past,
|
||||
present,
|
||||
layernorm_before_attention.output[0],
|
||||
reshape_qkv.output[0],
|
||||
attention_mask_input_name,
|
||||
is_unidirectional,
|
||||
)
|
||||
|
||||
# we rely on prune_graph() to clean old subgraph nodes:
|
||||
# qk_nodes + q_nodes + k_nodes + v_nodes + mask_nodes + [reshape_qkv, transpose_qkv, matmul_qkv]
|
||||
|
|
|
@ -1,14 +1,15 @@
|
|||
#-------------------------------------------------------------------------
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
#--------------------------------------------------------------------------
|
||||
import numpy as np
|
||||
# --------------------------------------------------------------------------
|
||||
from logging import getLogger
|
||||
from onnx import helper, numpy_helper, TensorProto
|
||||
from onnx_model import OnnxModel
|
||||
|
||||
import numpy as np
|
||||
from fusion_base import Fusion
|
||||
from fusion_utils import FusionUtils
|
||||
from fusion_gpt_attention import FusionGptAttentionPastBase
|
||||
from fusion_utils import FusionUtils
|
||||
from onnx import TensorProto, helper, numpy_helper
|
||||
from onnx_model import OnnxModel
|
||||
|
||||
logger = getLogger(__name__)
|
||||
|
||||
|
@ -21,24 +22,43 @@ class FusionGptAttentionMegatron(FusionGptAttentionPastBase):
|
|||
"""
|
||||
Fuse GPT-2 Attention with past state subgraph from Megatron into one Attention node.
|
||||
"""
|
||||
|
||||
def __init__(self, model: OnnxModel, num_heads: int):
|
||||
super().__init__(model, num_heads)
|
||||
|
||||
def fuse_attention_node(self, matmul_before_split, add_before_split, past, present, input, reshape_qkv, mask):
|
||||
attention_node_name = self.model.create_node_name('GptAttention')
|
||||
def fuse_attention_node(
|
||||
self,
|
||||
matmul_before_split,
|
||||
add_before_split,
|
||||
past,
|
||||
present,
|
||||
input,
|
||||
reshape_qkv,
|
||||
mask,
|
||||
):
|
||||
attention_node_name = self.model.create_node_name("GptAttention")
|
||||
int32_mask = self.cast_attention_mask(mask)
|
||||
output = reshape_qkv.output[0]
|
||||
i = 1 if (add_before_split.input[0] == matmul_before_split.output[0]) else 0
|
||||
attention_node = helper.make_node(
|
||||
'Attention',
|
||||
inputs=[input, matmul_before_split.input[1], add_before_split.input[i], int32_mask, past],
|
||||
"Attention",
|
||||
inputs=[
|
||||
input,
|
||||
matmul_before_split.input[1],
|
||||
add_before_split.input[i],
|
||||
int32_mask,
|
||||
past,
|
||||
],
|
||||
outputs=[output, present],
|
||||
name=attention_node_name)
|
||||
name=attention_node_name,
|
||||
)
|
||||
attention_node.domain = "com.microsoft"
|
||||
attention_node.attribute.extend([
|
||||
helper.make_attribute("num_heads", self.num_heads),
|
||||
helper.make_attribute("unidirectional", 0) # unidirectional shall not be ON for 4D attention mask
|
||||
])
|
||||
attention_node.attribute.extend(
|
||||
[
|
||||
helper.make_attribute("num_heads", self.num_heads),
|
||||
helper.make_attribute("unidirectional", 0), # unidirectional shall not be ON for 4D attention mask
|
||||
]
|
||||
)
|
||||
|
||||
nodes_to_add = [attention_node]
|
||||
self.nodes_to_add.extend(nodes_to_add)
|
||||
|
@ -53,9 +73,8 @@ class FusionGptAttentionMegatron(FusionGptAttentionPastBase):
|
|||
|
||||
def match_mask(self, sub_qk, mul_qk, matmul_qk, layernorm_before_attention):
|
||||
mask_nodes = self.model.match_parent_path(
|
||||
sub_qk,
|
||||
['Mul', 'Sub', 'Slice', 'Slice'],
|
||||
[1, 0, 1, 0]) # yapf: disable
|
||||
sub_qk, ["Mul", "Sub", "Slice", "Slice"], [1, 0, 1, 0]
|
||||
) # yapf: disable
|
||||
if mask_nodes is None:
|
||||
logger.debug("fuse_attention: failed to match unidirectional mask path")
|
||||
return None
|
||||
|
@ -97,27 +116,34 @@ class FusionGptAttentionMegatron(FusionGptAttentionPastBase):
|
|||
logger.debug("fuse_attention failed: slice_mask input 4 (steps) is not constant [1]")
|
||||
return None
|
||||
|
||||
last_slice_path = self.model.match_parent_path(last_slice_mask, ['Unsqueeze', 'Gather', 'Shape', 'MatMul'],
|
||||
[2, 0, 0, 0])
|
||||
last_slice_path = self.model.match_parent_path(
|
||||
last_slice_mask, ["Unsqueeze", "Gather", "Shape", "MatMul"], [2, 0, 0, 0]
|
||||
)
|
||||
if last_slice_path is None or last_slice_path[-1] != matmul_qk:
|
||||
logger.debug("fuse_attention: failed to match last slice path")
|
||||
return None
|
||||
|
||||
first_slice_path = self.model.match_parent_path(slice_mask, ['Unsqueeze', 'Gather', 'Shape', 'MatMul'],
|
||||
[2, 0, 0, 0])
|
||||
first_slice_path = self.model.match_parent_path(
|
||||
slice_mask, ["Unsqueeze", "Gather", "Shape", "MatMul"], [2, 0, 0, 0]
|
||||
)
|
||||
if first_slice_path is None or first_slice_path[-1] != matmul_qk:
|
||||
logger.debug("fuse_attention: failed to match first slice path")
|
||||
return None
|
||||
|
||||
first_slice_sub = self.model.match_parent_path(slice_mask, ['Unsqueeze', 'Sub', 'Gather', 'Shape', 'MatMul'],
|
||||
[1, 0, 0, 0, 0])
|
||||
first_slice_sub = self.model.match_parent_path(
|
||||
slice_mask,
|
||||
["Unsqueeze", "Sub", "Gather", "Shape", "MatMul"],
|
||||
[1, 0, 0, 0, 0],
|
||||
)
|
||||
if first_slice_sub is None or first_slice_sub[-1] != matmul_qk:
|
||||
logger.debug("fuse_attention: failed to match last slice sub path")
|
||||
return None
|
||||
|
||||
first_slice_sub_1 = self.model.match_parent_path(slice_mask,
|
||||
['Unsqueeze', 'Sub', 'Gather', 'Shape', 'LayerNormalization'],
|
||||
[1, 0, 1, 0, 0])
|
||||
first_slice_sub_1 = self.model.match_parent_path(
|
||||
slice_mask,
|
||||
["Unsqueeze", "Sub", "Gather", "Shape", "LayerNormalization"],
|
||||
[1, 0, 1, 0, 0],
|
||||
)
|
||||
if first_slice_sub_1 is None or first_slice_sub_1[-1] != layernorm_before_attention:
|
||||
logger.debug("fuse_attention: failed to match last slice sub path 1")
|
||||
return None
|
||||
|
@ -130,30 +156,53 @@ class FusionGptAttentionMegatron(FusionGptAttentionPastBase):
|
|||
|
||||
qkv_nodes = self.model.match_parent_path(
|
||||
normalize_node,
|
||||
['Add', 'Add', 'MatMul', 'Reshape', 'Transpose', 'MatMul'],
|
||||
[ 0, 1, None, 0, 0, 0],
|
||||
["Add", "Add", "MatMul", "Reshape", "Transpose", "MatMul"],
|
||||
[0, 1, None, 0, 0, 0],
|
||||
output_name_to_node=output_name_to_node,
|
||||
) # yapf: disable
|
||||
) # yapf: disable
|
||||
if qkv_nodes is None:
|
||||
return
|
||||
(add_skip, add_after_attention, matmul_after_attention, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes
|
||||
(
|
||||
add_skip,
|
||||
add_after_attention,
|
||||
matmul_after_attention,
|
||||
reshape_qkv,
|
||||
transpose_qkv,
|
||||
matmul_qkv,
|
||||
) = qkv_nodes
|
||||
|
||||
skip_input = add_skip.input[0]
|
||||
|
||||
v_nodes = self.model.match_parent_path(
|
||||
matmul_qkv,
|
||||
['Concat', 'Transpose', 'Reshape', 'Split', 'Add', 'MatMul', 'LayerNormalization'],
|
||||
[1, 1, 0, 0, 0, None, 0]) # yapf: disable
|
||||
[
|
||||
"Concat",
|
||||
"Transpose",
|
||||
"Reshape",
|
||||
"Split",
|
||||
"Add",
|
||||
"MatMul",
|
||||
"LayerNormalization",
|
||||
],
|
||||
[1, 1, 0, 0, 0, None, 0],
|
||||
) # yapf: disable
|
||||
if v_nodes is None:
|
||||
logger.debug("fuse_attention: failed to match v path")
|
||||
return
|
||||
(concat_v, transpose_v, reshape_v, split_v, add_before_split, matmul_before_split,
|
||||
layernorm_before_attention) = v_nodes
|
||||
(
|
||||
concat_v,
|
||||
transpose_v,
|
||||
reshape_v,
|
||||
split_v,
|
||||
add_before_split,
|
||||
matmul_before_split,
|
||||
layernorm_before_attention,
|
||||
) = v_nodes
|
||||
if skip_input != layernorm_before_attention.input[0]:
|
||||
logger.debug("fuse_attention: skip_input != layernorm_before_attention.input[0]")
|
||||
return
|
||||
|
||||
qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Sub', 'Mul', 'MatMul'], [0, 0, 0, 0])
|
||||
qk_nodes = self.model.match_parent_path(matmul_qkv, ["Softmax", "Sub", "Mul", "MatMul"], [0, 0, 0, 0])
|
||||
if qk_nodes is None:
|
||||
logger.debug("fuse_attention: failed to match qk path")
|
||||
return None
|
||||
|
@ -164,7 +213,7 @@ class FusionGptAttentionMegatron(FusionGptAttentionPastBase):
|
|||
|
||||
attention_mask = self.match_mask(sub_qk, mul_qk, matmul_qk, layernorm_before_attention)
|
||||
|
||||
q_nodes = self.model.match_parent_path(matmul_qk, ['Div', 'Transpose', 'Reshape', 'Split'], [0, 0, 0, 0])
|
||||
q_nodes = self.model.match_parent_path(matmul_qk, ["Div", "Transpose", "Reshape", "Split"], [0, 0, 0, 0])
|
||||
if q_nodes is None:
|
||||
logger.debug("fuse_attention: failed to match q path")
|
||||
return
|
||||
|
@ -173,9 +222,11 @@ class FusionGptAttentionMegatron(FusionGptAttentionPastBase):
|
|||
logger.debug("fuse_attention: skip since split_v != split_q")
|
||||
return
|
||||
|
||||
k_nodes = self.model.match_parent_path(matmul_qk,
|
||||
['Div', 'Transpose', 'Concat', 'Transpose', 'Reshape', 'Split'],
|
||||
[1, 0, 0, 1, 0, 0])
|
||||
k_nodes = self.model.match_parent_path(
|
||||
matmul_qk,
|
||||
["Div", "Transpose", "Concat", "Transpose", "Reshape", "Split"],
|
||||
[1, 0, 0, 1, 0, 0],
|
||||
)
|
||||
if k_nodes is None:
|
||||
logger.debug("fuse_attention: failed to match k path")
|
||||
return
|
||||
|
@ -185,8 +236,14 @@ class FusionGptAttentionMegatron(FusionGptAttentionPastBase):
|
|||
return
|
||||
|
||||
i, value = self.model.get_constant_input(reshape_k)
|
||||
if not (isinstance(value, np.ndarray) and list(value.shape) == [4] and value[0] == 0 and value[1] == 0
|
||||
and value[2] > 0 and value[3] > 0):
|
||||
if not (
|
||||
isinstance(value, np.ndarray)
|
||||
and list(value.shape) == [4]
|
||||
and value[0] == 0
|
||||
and value[1] == 0
|
||||
and value[2] > 0
|
||||
and value[3] > 0
|
||||
):
|
||||
logger.debug("fuse_attention: reshape constant input is not [0, 0, N, H]")
|
||||
return
|
||||
|
||||
|
@ -224,5 +281,12 @@ class FusionGptAttentionMegatron(FusionGptAttentionPastBase):
|
|||
logger.info("fuse_attention: expect present to be graph output")
|
||||
return
|
||||
|
||||
self.fuse_attention_node(matmul_before_split, add_before_split, past, present,
|
||||
layernorm_before_attention.output[0], reshape_qkv, attention_mask)
|
||||
self.fuse_attention_node(
|
||||
matmul_before_split,
|
||||
add_before_split,
|
||||
past,
|
||||
present,
|
||||
layernorm_before_attention.output[0],
|
||||
reshape_qkv,
|
||||
attention_mask,
|
||||
)
|
||||
|
|
|
@ -1,13 +1,14 @@
|
|||
#-------------------------------------------------------------------------
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
#--------------------------------------------------------------------------
|
||||
import numpy as np
|
||||
# --------------------------------------------------------------------------
|
||||
from logging import getLogger
|
||||
from onnx import helper, numpy_helper, TensorProto
|
||||
from onnx_model import OnnxModel
|
||||
|
||||
import numpy as np
|
||||
from fusion_base import Fusion
|
||||
from fusion_utils import FusionUtils
|
||||
from onnx import TensorProto, helper, numpy_helper
|
||||
from onnx_model import OnnxModel
|
||||
|
||||
logger = getLogger(__name__)
|
||||
|
||||
|
@ -17,31 +18,41 @@ class FusionGptAttentionNoPast(Fusion):
|
|||
Fuse GPT-2 Attention without past state into one Attention node.
|
||||
This does not support attention_mask graph input right now.
|
||||
"""
|
||||
|
||||
def __init__(self, model: OnnxModel, num_heads: int):
|
||||
super().__init__(model, "Attention", "LayerNormalization", "without past")
|
||||
# TODO: detect num_heads from graph like FusionAttention
|
||||
self.num_heads = num_heads
|
||||
|
||||
def create_attention_node(self, gemm, gemm_qkv, input, output):
|
||||
attention_node_name = self.model.create_node_name('Attention')
|
||||
attention_node = helper.make_node('Attention',
|
||||
inputs=[input, gemm.input[1], gemm.input[2]],
|
||||
outputs=[attention_node_name + "_output"],
|
||||
name=attention_node_name)
|
||||
attention_node_name = self.model.create_node_name("Attention")
|
||||
attention_node = helper.make_node(
|
||||
"Attention",
|
||||
inputs=[input, gemm.input[1], gemm.input[2]],
|
||||
outputs=[attention_node_name + "_output"],
|
||||
name=attention_node_name,
|
||||
)
|
||||
attention_node.domain = "com.microsoft"
|
||||
attention_node.attribute.extend(
|
||||
[helper.make_attribute("num_heads", self.num_heads),
|
||||
helper.make_attribute("unidirectional", 1)])
|
||||
[
|
||||
helper.make_attribute("num_heads", self.num_heads),
|
||||
helper.make_attribute("unidirectional", 1),
|
||||
]
|
||||
)
|
||||
|
||||
matmul_node = helper.make_node('MatMul',
|
||||
inputs=[attention_node_name + "_output", gemm_qkv.input[1]],
|
||||
outputs=[attention_node_name + "_matmul_output"],
|
||||
name=attention_node_name + "_matmul")
|
||||
matmul_node = helper.make_node(
|
||||
"MatMul",
|
||||
inputs=[attention_node_name + "_output", gemm_qkv.input[1]],
|
||||
outputs=[attention_node_name + "_matmul_output"],
|
||||
name=attention_node_name + "_matmul",
|
||||
)
|
||||
|
||||
add_node = helper.make_node('Add',
|
||||
inputs=[attention_node_name + "_matmul_output", gemm_qkv.input[2]],
|
||||
outputs=[output],
|
||||
name=attention_node_name + "_add")
|
||||
add_node = helper.make_node(
|
||||
"Add",
|
||||
inputs=[attention_node_name + "_matmul_output", gemm_qkv.input[2]],
|
||||
outputs=[output],
|
||||
name=attention_node_name + "_add",
|
||||
)
|
||||
|
||||
self.nodes_to_add.extend([attention_node, matmul_node, add_node])
|
||||
self.node_name_to_graph_name[attention_node.name] = self.this_graph_name
|
||||
|
@ -52,29 +63,45 @@ class FusionGptAttentionNoPast(Fusion):
|
|||
return_indice = []
|
||||
qkv_nodes = self.model.match_parent_path(
|
||||
normalize_node,
|
||||
['Add', 'Reshape', 'Gemm', 'Reshape', 'Reshape', 'Transpose', 'MatMul'],
|
||||
["Add", "Reshape", "Gemm", "Reshape", "Reshape", "Transpose", "MatMul"],
|
||||
[0, None, 0, 0, 0, 0, 0],
|
||||
output_name_to_node=output_name_to_node,
|
||||
return_indice=return_indice
|
||||
) # yapf: disable
|
||||
return_indice=return_indice,
|
||||
) # yapf: disable
|
||||
if qkv_nodes is None:
|
||||
return
|
||||
(add_qkv, reshape_qkv, gemm_qkv, reshape_1, reshape_2, transpose_qkv, matmul_qkv) = qkv_nodes
|
||||
(
|
||||
add_qkv,
|
||||
reshape_qkv,
|
||||
gemm_qkv,
|
||||
reshape_1,
|
||||
reshape_2,
|
||||
transpose_qkv,
|
||||
matmul_qkv,
|
||||
) = qkv_nodes
|
||||
|
||||
another_input = add_qkv.input[1 - return_indice[0]]
|
||||
|
||||
v_nodes = self.model.match_parent_path(
|
||||
matmul_qkv,
|
||||
['Transpose', 'Reshape', 'Split', 'Reshape', 'Gemm', 'Reshape'],
|
||||
[1, 0, 0, 0, 0, 0]) # yapf: disable
|
||||
["Transpose", "Reshape", "Split", "Reshape", "Gemm", "Reshape"],
|
||||
[1, 0, 0, 0, 0, 0],
|
||||
) # yapf: disable
|
||||
if v_nodes is None:
|
||||
logger.debug("fuse_attention: failed to match v path")
|
||||
return
|
||||
(transpose_v, reshape_v, split_v, reshape_after_gemm, gemm, reshape_before_gemm) = v_nodes
|
||||
(
|
||||
transpose_v,
|
||||
reshape_v,
|
||||
split_v,
|
||||
reshape_after_gemm,
|
||||
gemm,
|
||||
reshape_before_gemm,
|
||||
) = v_nodes
|
||||
|
||||
layernorm_before_attention = self.model.get_parent(reshape_before_gemm, 0, output_name_to_node)
|
||||
if layernorm_before_attention is None or layernorm_before_attention.op_type != 'LayerNormalization':
|
||||
if layernorm_before_attention.op_type != 'Add':
|
||||
if layernorm_before_attention is None or layernorm_before_attention.op_type != "LayerNormalization":
|
||||
if layernorm_before_attention.op_type != "Add":
|
||||
logger.debug(f"failed to get layernorm before gemm. Got {layernorm_before_attention.op_type}")
|
||||
return
|
||||
|
||||
|
@ -84,13 +111,25 @@ class FusionGptAttentionNoPast(Fusion):
|
|||
logger.debug("Add and LayerNormalization shall have one same input")
|
||||
return
|
||||
|
||||
qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Sub', 'Mul', 'Div', 'MatMul'], [0, 0, 0, 0, 0])
|
||||
qk_nodes = self.model.match_parent_path(matmul_qkv, ["Softmax", "Sub", "Mul", "Div", "MatMul"], [0, 0, 0, 0, 0])
|
||||
if qk_nodes is not None:
|
||||
(softmax_qk, sub_qk, mul_qk, div_qk, matmul_qk) = qk_nodes
|
||||
mask_nodes = self.model.match_parent_path(
|
||||
sub_qk,
|
||||
['Mul', 'Sub', 'Slice', 'Slice', 'Unsqueeze', 'Sub', 'Squeeze', 'Slice', 'Shape', 'Div'],
|
||||
[1, 0, 1, 0, 1, 0, 0, 0, 0, 0]) # yapf: disable
|
||||
[
|
||||
"Mul",
|
||||
"Sub",
|
||||
"Slice",
|
||||
"Slice",
|
||||
"Unsqueeze",
|
||||
"Sub",
|
||||
"Squeeze",
|
||||
"Slice",
|
||||
"Shape",
|
||||
"Div",
|
||||
],
|
||||
[1, 0, 1, 0, 1, 0, 0, 0, 0, 0],
|
||||
) # yapf: disable
|
||||
if mask_nodes is None:
|
||||
logger.debug("fuse_attention: failed to match mask path")
|
||||
return
|
||||
|
@ -101,13 +140,24 @@ class FusionGptAttentionNoPast(Fusion):
|
|||
return
|
||||
else:
|
||||
# New pattern for gpt2 from PyTorch 1.5.0 and Transformers 2.9.0.
|
||||
qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Where', 'Div', 'MatMul'], [0, 0, 1, 0])
|
||||
qk_nodes = self.model.match_parent_path(matmul_qkv, ["Softmax", "Where", "Div", "MatMul"], [0, 0, 1, 0])
|
||||
if qk_nodes is not None:
|
||||
(softmax_qk, where_qk, div_qk, matmul_qk) = qk_nodes
|
||||
mask_nodes = self.model.match_parent_path(
|
||||
where_qk,
|
||||
['Cast', 'Slice', 'Slice', 'Unsqueeze', 'Sub', 'Squeeze', 'Slice', 'Shape', 'Div'],
|
||||
[ 0, 0, 0, 1, 0, 0, 0, 0, 0]) # yapf: disable
|
||||
[
|
||||
"Cast",
|
||||
"Slice",
|
||||
"Slice",
|
||||
"Unsqueeze",
|
||||
"Sub",
|
||||
"Squeeze",
|
||||
"Slice",
|
||||
"Shape",
|
||||
"Div",
|
||||
],
|
||||
[0, 0, 0, 1, 0, 0, 0, 0, 0],
|
||||
) # yapf: disable
|
||||
if mask_nodes is None:
|
||||
logger.debug("fuse_attention: failed to match mask path")
|
||||
return
|
||||
|
@ -118,16 +168,20 @@ class FusionGptAttentionNoPast(Fusion):
|
|||
return
|
||||
else:
|
||||
# match openai-gpt
|
||||
qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Add', 'Mul', 'Div', 'MatMul'],
|
||||
[0, 0, 0, 0, 0])
|
||||
qk_nodes = self.model.match_parent_path(
|
||||
matmul_qkv,
|
||||
["Softmax", "Add", "Mul", "Div", "MatMul"],
|
||||
[0, 0, 0, 0, 0],
|
||||
)
|
||||
if qk_nodes is None:
|
||||
logger.debug("fuse_attention: failed to match qk path")
|
||||
return
|
||||
(softmax_qk, add_qk, mul_qk, div_qk, matmul_qk) = qk_nodes
|
||||
mask_nodes = self.model.match_parent_path(
|
||||
mul_qk,
|
||||
['Slice', 'Slice', 'Unsqueeze', 'Squeeze', 'Slice', 'Shape', 'Div'],
|
||||
[ 1, 0, 2, 0, 0, 0, 0]) # yapf: disable
|
||||
["Slice", "Slice", "Unsqueeze", "Squeeze", "Slice", "Shape", "Div"],
|
||||
[1, 0, 2, 0, 0, 0, 0],
|
||||
) # yapf: disable
|
||||
if mask_nodes is None:
|
||||
logger.debug("fuse_attention: failed to match mask path")
|
||||
return
|
||||
|
@ -137,7 +191,7 @@ class FusionGptAttentionNoPast(Fusion):
|
|||
logger.debug("fuse_attention: skip since div_qk != div_mask")
|
||||
return
|
||||
|
||||
q_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Reshape', 'Split'], [0, 0, 0])
|
||||
q_nodes = self.model.match_parent_path(matmul_qk, ["Transpose", "Reshape", "Split"], [0, 0, 0])
|
||||
if q_nodes is None:
|
||||
logger.debug("fuse_attention: failed to match q path")
|
||||
return
|
||||
|
@ -146,7 +200,7 @@ class FusionGptAttentionNoPast(Fusion):
|
|||
logger.debug("fuse_attention: skip since split_v != split_q")
|
||||
return
|
||||
|
||||
k_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Reshape', 'Split'], [1, 0, 0])
|
||||
k_nodes = self.model.match_parent_path(matmul_qk, ["Transpose", "Reshape", "Split"], [1, 0, 0])
|
||||
if k_nodes is None:
|
||||
logger.debug("fuse_attention: failed to match k path")
|
||||
return
|
||||
|
|
|
@ -1,12 +1,13 @@
|
|||
#-------------------------------------------------------------------------
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
#--------------------------------------------------------------------------
|
||||
from typing import Dict
|
||||
# --------------------------------------------------------------------------
|
||||
from logging import getLogger
|
||||
from typing import Dict
|
||||
|
||||
from fusion_base import Fusion
|
||||
from onnx import helper
|
||||
from onnx_model import OnnxModel
|
||||
from fusion_base import Fusion
|
||||
|
||||
logger = getLogger(__name__)
|
||||
|
||||
|
@ -43,24 +44,32 @@ class FusionLayerNormalization(Fusion):
|
|||
|
||||
root_input = node.input[0]
|
||||
|
||||
if children[0].op_type != 'Sub' or children[0].input[0] != root_input:
|
||||
if children[0].op_type != "Sub" or children[0].input[0] != root_input:
|
||||
return
|
||||
|
||||
if len(children) == 2:
|
||||
if children[1].op_type != 'Sub' or children[1].input[0] != root_input:
|
||||
if children[1].op_type != "Sub" or children[1].input[0] != root_input:
|
||||
return
|
||||
|
||||
div_node = None
|
||||
for child in children:
|
||||
div_node = self.model.find_first_child_by_type(child, 'Div', input_name_to_nodes, recursive=False)
|
||||
div_node = self.model.find_first_child_by_type(child, "Div", input_name_to_nodes, recursive=False)
|
||||
if div_node is not None:
|
||||
break
|
||||
if div_node is None:
|
||||
return
|
||||
|
||||
path_id, parent_nodes, _ = self.model.match_parent_paths(
|
||||
div_node, [(['Sqrt', 'Add', 'ReduceMean', 'Pow', 'Sub'], [1, 0, 0, 0, 0]),
|
||||
(['Sqrt', 'Add', 'ReduceMean', 'Pow', 'Cast', 'Sub'], [1, 0, 0, 0, 0, 0])], output_name_to_node)
|
||||
div_node,
|
||||
[
|
||||
(["Sqrt", "Add", "ReduceMean", "Pow", "Sub"], [1, 0, 0, 0, 0]),
|
||||
(
|
||||
["Sqrt", "Add", "ReduceMean", "Pow", "Cast", "Sub"],
|
||||
[1, 0, 0, 0, 0, 0],
|
||||
),
|
||||
],
|
||||
output_name_to_node,
|
||||
)
|
||||
if path_id < 0:
|
||||
return
|
||||
|
||||
|
@ -70,7 +79,7 @@ class FusionLayerNormalization(Fusion):
|
|||
|
||||
second_add_node = parent_nodes[1]
|
||||
i, add_weight = self.model.get_constant_input(second_add_node)
|
||||
if add_weight is None or add_weight <= 0 or add_weight > 1.0E-4:
|
||||
if add_weight is None or add_weight <= 0 or add_weight > 1.0e-4:
|
||||
logger.warning(f"epsilon value is not expeced: {add_weight}")
|
||||
return
|
||||
|
||||
|
@ -79,11 +88,11 @@ class FusionLayerNormalization(Fusion):
|
|||
return
|
||||
|
||||
mul_node = input_name_to_nodes[div_node.output[0]][0]
|
||||
if mul_node.op_type != 'Mul':
|
||||
if mul_node.op_type != "Mul":
|
||||
return
|
||||
|
||||
last_add_node = input_name_to_nodes[mul_node.output[0]][0]
|
||||
if last_add_node.op_type != 'Add':
|
||||
if last_add_node.op_type != "Add":
|
||||
return
|
||||
|
||||
subgraph_nodes = [node]
|
||||
|
@ -91,8 +100,12 @@ class FusionLayerNormalization(Fusion):
|
|||
subgraph_nodes.extend(parent_nodes[:-1])
|
||||
|
||||
subgraph_nodes.extend([last_add_node, mul_node, div_node])
|
||||
if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, last_add_node.output, input_name_to_nodes,
|
||||
output_name_to_node):
|
||||
if not self.model.is_safe_to_fuse_nodes(
|
||||
subgraph_nodes,
|
||||
last_add_node.output,
|
||||
input_name_to_nodes,
|
||||
output_name_to_node,
|
||||
):
|
||||
logger.debug(f"It is not safe to fuse LayerNormalization node. Skip")
|
||||
return
|
||||
|
||||
|
@ -106,11 +119,12 @@ class FusionLayerNormalization(Fusion):
|
|||
|
||||
self.nodes_to_remove.extend(subgraph_nodes)
|
||||
|
||||
normalize_node = helper.make_node('LayerNormalization',
|
||||
inputs=[node.input[0], weight_input, bias_input],
|
||||
outputs=[last_add_node.output[0]],
|
||||
name=self.model.create_node_name("LayerNormalization",
|
||||
name_prefix="LayerNorm"))
|
||||
normalize_node = helper.make_node(
|
||||
"LayerNormalization",
|
||||
inputs=[node.input[0], weight_input, bias_input],
|
||||
outputs=[last_add_node.output[0]],
|
||||
name=self.model.create_node_name("LayerNormalization", name_prefix="LayerNorm"),
|
||||
)
|
||||
normalize_node.attribute.extend([helper.make_attribute("epsilon", float(add_weight))])
|
||||
self.nodes_to_add.append(normalize_node)
|
||||
self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name
|
||||
|
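Aside: both LayerNormalization fusions in this file rely on the multi-candidate variant, match_parent_paths, which tries each (op-type list, input-index list) pair in order and reports which candidate matched. A minimal sketch of that contract, restating only the behaviour visible in the hunks above; the function and variable names are placeholders:

def find_variance_path(model, div_node, output_name_to_node):
    # Two candidate patterns: the plain chain, and the same chain with an extra Cast.
    path_id, parent_nodes, _ = model.match_parent_paths(
        div_node,
        [
            (["Sqrt", "Add", "ReduceMean", "Pow", "Sub"], [1, 0, 0, 0, 0]),
            (["Sqrt", "Add", "ReduceMean", "Pow", "Cast", "Sub"], [1, 0, 0, 0, 0, 0]),
        ],
        output_name_to_node,
    )
    if path_id < 0:
        return None  # neither candidate matched; the caller returns without fusing
    # parent_nodes runs from the immediate parent down to the deepest ancestor of the matched path.
    return path_id, parent_nodes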
@ -122,28 +136,58 @@ class FusionLayerNormalizationTF(Fusion):
|
|||
|
||||
def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
|
||||
"""
|
||||
Layer Norm from Tensorflow model(using keras2onnx or tf2onnx):
|
||||
+------------------------------------+
|
||||
| |
|
||||
| |
|
||||
(Cast_1) |
|
||||
| |
|
||||
| v (B) (B) (A)
|
||||
Add --> (Cast_1) --> ReduceMean --> Sub --> Mul --> ReduceMean --> (Cast_3) --> Add --> Sqrt --> Reciprocol --> Mul --> Mul --> Sub --> Add
|
||||
| | | ^ ^
|
||||
| | | | |
|
||||
| +--------------------------------------------------(Cast_2)-------------------------------|-------+ |
|
||||
| v |
|
||||
+---------------------------------------------------------------------------------------------------------------> Mul--------------------+
|
||||
Layer Norm from Tensorflow model(using keras2onnx or tf2onnx):
|
||||
+------------------------------------+
|
||||
| |
|
||||
| |
|
||||
(Cast_1) |
|
||||
| |
|
||||
| v (B) (B) (A)
|
||||
Add --> (Cast_1) --> ReduceMean --> Sub --> Mul --> ReduceMean --> (Cast_3) --> Add --> Sqrt --> Reciprocol --> Mul --> Mul --> Sub --> Add
|
||||
| | | ^ ^
|
||||
| | | | |
|
||||
| +--------------------------------------------------(Cast_2)-------------------------------|-------+ |
|
||||
| v |
|
||||
+---------------------------------------------------------------------------------------------------------------> Mul--------------------+
|
||||
"""
|
||||
return_indice = []
|
||||
_, parent_nodes, return_indice = self.model.match_parent_paths(
|
||||
node,
|
||||
[(['Sub', 'Mul', 'Mul', 'Reciprocal', 'Sqrt', 'Add', 'ReduceMean', 'Mul', 'Sub', 'ReduceMean'],
|
||||
[ 1, 1, None, 0, 0, 0, None, 0, 0, None]),
|
||||
(['Sub', 'Mul', 'Mul', 'Reciprocal', 'Sqrt', 'Add', 'Cast', 'ReduceMean', 'Mul', 'Sub', 'ReduceMean'],
|
||||
[ 1, 1, None, 0, 0, 0, 0, None, 0, 0, None])],
|
||||
output_name_to_node) # yapf: disable
|
||||
[
|
||||
(
|
||||
[
|
||||
"Sub",
|
||||
"Mul",
|
||||
"Mul",
|
||||
"Reciprocal",
|
||||
"Sqrt",
|
||||
"Add",
|
||||
"ReduceMean",
|
||||
"Mul",
|
||||
"Sub",
|
||||
"ReduceMean",
|
||||
],
|
||||
[1, 1, None, 0, 0, 0, None, 0, 0, None],
|
||||
),
|
||||
(
|
||||
[
|
||||
"Sub",
|
||||
"Mul",
|
||||
"Mul",
|
||||
"Reciprocal",
|
||||
"Sqrt",
|
||||
"Add",
|
||||
"Cast",
|
||||
"ReduceMean",
|
||||
"Mul",
|
||||
"Sub",
|
||||
"ReduceMean",
|
||||
],
|
||||
[1, 1, None, 0, 0, 0, 0, None, 0, 0, None],
|
||||
),
|
||||
],
|
||||
output_name_to_node,
|
||||
) # yapf: disable
|
||||
|
||||
if parent_nodes is None:
|
||||
return
|
||||
|
@ -153,38 +197,50 @@ class FusionLayerNormalizationTF(Fusion):
|
|||
logger.debug("return indice is exepected in [0, 1], but got {return_indice}")
|
||||
return
|
||||
|
||||
sub_node_0, mul_node_0, mul_node_1, reciprocol_node, sqrt_node, add_node_0 = parent_nodes[:6]
|
||||
(
|
||||
sub_node_0,
|
||||
mul_node_0,
|
||||
mul_node_1,
|
||||
reciprocol_node,
|
||||
sqrt_node,
|
||||
add_node_0,
|
||||
) = parent_nodes[:6]
|
||||
reduce_mean_node_0, mul_node_2, sub_node_1, reduce_mean_node_1 = parent_nodes[-4:]
|
||||
|
||||
cast_node_3 = None
|
||||
if len(parent_nodes) == 11:
|
||||
cast_node_3 = parent_nodes[6]
|
||||
assert (cast_node_3.op_type == 'Cast')
|
||||
assert cast_node_3.op_type == "Cast"
|
||||
|
||||
mul_node_3 = self.model.match_parent(node, 'Mul', 0, output_name_to_node)
|
||||
mul_node_3 = self.model.match_parent(node, "Mul", 0, output_name_to_node)
|
||||
if mul_node_3 is None:
|
||||
logger.debug("mul_node_3 not found")
|
||||
return
|
||||
|
||||
node_before_reduce = self.model.get_parent(reduce_mean_node_1, 0, output_name_to_node)
|
||||
root_node = node_before_reduce if cast_node_3 is None else self.model.get_parent(
|
||||
node_before_reduce, 0, output_name_to_node)
|
||||
root_node = (
|
||||
node_before_reduce
|
||||
if cast_node_3 is None
|
||||
else self.model.get_parent(node_before_reduce, 0, output_name_to_node)
|
||||
)
|
||||
if root_node is None:
|
||||
logger.debug("root node is none")
|
||||
return
|
||||
|
||||
i, epsilon = self.model.get_constant_input(add_node_0)
|
||||
if epsilon is None or epsilon <= 0 or (epsilon > 1.0E-5 and cast_node_3 is None):
|
||||
if epsilon is None or epsilon <= 0 or (epsilon > 1.0e-5 and cast_node_3 is None):
|
||||
logger.debug("epsilon is not matched")
|
||||
return
|
||||
|
||||
if cast_node_3 is None and (reduce_mean_node_1.input[0] not in mul_node_3.input
|
||||
or reduce_mean_node_1.input[0] not in sub_node_1.input):
|
||||
if cast_node_3 is None and (
|
||||
reduce_mean_node_1.input[0] not in mul_node_3.input or reduce_mean_node_1.input[0] not in sub_node_1.input
|
||||
):
|
||||
logger.debug("reduce_mean_node_1 and mul_node_3 shall link from root node")
|
||||
return
|
||||
|
||||
if cast_node_3 is not None and (node_before_reduce.input[0] not in mul_node_3.input
|
||||
or reduce_mean_node_1.input[0] not in sub_node_1.input):
|
||||
if cast_node_3 is not None and (
|
||||
node_before_reduce.input[0] not in mul_node_3.input or reduce_mean_node_1.input[0] not in sub_node_1.input
|
||||
):
|
||||
logger.debug("reduce_mean_node_1 and mul_node_3 shall link from root node")
|
||||
return
|
||||
|
||||
|
@ -193,19 +249,33 @@ class FusionLayerNormalizationTF(Fusion):
|
|||
return
|
||||
|
||||
subgraph_nodes = [
|
||||
node, sub_node_0, mul_node_0, mul_node_1, reciprocol_node, sqrt_node, add_node_0, reduce_mean_node_0,
|
||||
mul_node_2, sub_node_1, reduce_mean_node_1, mul_node_3
|
||||
node,
|
||||
sub_node_0,
|
||||
mul_node_0,
|
||||
mul_node_1,
|
||||
reciprocol_node,
|
||||
sqrt_node,
|
||||
add_node_0,
|
||||
reduce_mean_node_0,
|
||||
mul_node_2,
|
||||
sub_node_1,
|
||||
reduce_mean_node_1,
|
||||
mul_node_3,
|
||||
]
|
||||
|
||||
if cast_node_3 is not None:
|
||||
cast_node_2 = self.model.match_parent(mul_node_0, 'Cast', 0, output_name_to_node)
|
||||
cast_node_2 = self.model.match_parent(mul_node_0, "Cast", 0, output_name_to_node)
|
||||
if cast_node_2 is None:
|
||||
logger.debug("cast_node_2 not found")
|
||||
return
|
||||
subgraph_nodes.extend([node_before_reduce, cast_node_2, cast_node_3])
|
||||
|
||||
if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, node.output, self.model.input_name_to_nodes(),
|
||||
self.model.output_name_to_node()):
|
||||
if not self.model.is_safe_to_fuse_nodes(
|
||||
subgraph_nodes,
|
||||
node.output,
|
||||
self.model.input_name_to_nodes(),
|
||||
self.model.output_name_to_node(),
|
||||
):
|
||||
logger.debug("not safe to fuse layer normalization")
|
||||
return
|
||||
|
||||
|
@ -214,11 +284,13 @@ class FusionLayerNormalizationTF(Fusion):
|
|||
weight_input = mul_node_1.input[1]
|
||||
bias_input = sub_node_0.input[0]
|
||||
|
||||
#TODO: add epsilon attribute
|
||||
fused_node = helper.make_node('LayerNormalization',
|
||||
inputs=[mul_node_3.input[0], weight_input, bias_input],
|
||||
outputs=[node.output[0]],
|
||||
name=self.model.create_node_name("LayerNormalization", name_prefix="LayerNorm"))
|
||||
# TODO: add epsilon attribute
|
||||
fused_node = helper.make_node(
|
||||
"LayerNormalization",
|
||||
inputs=[mul_node_3.input[0], weight_input, bias_input],
|
||||
outputs=[node.output[0]],
|
||||
name=self.model.create_node_name("LayerNormalization", name_prefix="LayerNorm"),
|
||||
)
|
||||
fused_node.attribute.extend([helper.make_attribute("epsilon", float(epsilon))])
|
||||
self.nodes_to_add.append(fused_node)
|
||||
self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
|
||||
|
|
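For context, these Fusion subclasses are normally driven from a small script rather than called directly. The sketch below shows one plausible driver; it assumes (not shown in this diff) that fusion_base.Fusion exposes an apply() method that visits every graph node and calls fuse(), and that OnnxModel provides save_model_to_file(). The model paths are placeholders.

import onnx
from fusion_layernorm import FusionLayerNormalization  # module shown in this diff
from onnx_model import OnnxModel  # helper from onnxruntime/python/tools/transformers

model = OnnxModel(onnx.load("input.onnx"))  # "input.onnx" is a placeholder path
fusion = FusionLayerNormalization(model)
fusion.apply()  # assumed base-class entry point: scan nodes, call fuse(), commit adds/removes
model.save_model_to_file("fused.onnx")  # assumed OnnxModel convenience saver; "fused.onnx" is a placeholder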
Some files were not shown because too many files changed in this diff.