Format all python files under onnxruntime with black and isort (#11324)

Description: Format all python files under onnxruntime with black and isort.
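(Illustration only, not part of the commit: a repository-wide pass like this is typically reproduced by running the two tools from the repo root. The flags below are assumptions; the repository's own tool configuration would apply.)

    # Hypothetical reproduction of the formatting pass; the options are assumptions.
    import subprocess

    subprocess.run(["isort", "."], check=True)  # group and sort imports
    subprocess.run(["black", "."], check=True)  # apply black code style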

After checking in, we can use .git-blame-ignore-revs to ignore the formatting PR in git blame.
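(Illustration only: a minimal sketch of how the blame-ignore mechanism is usually wired up once such a commit lands. The config key and file name are standard git features; the file contents described in the comment are an assumption based on this commit's abbreviated hash.)

    # Sketch: .git-blame-ignore-revs lists full 40-character commit hashes to skip,
    # one per line -- here it would hold the full hash abbreviated below as fdce4fa6af.
    import subprocess

    subprocess.run(
        ["git", "config", "blame.ignoreRevsFile", ".git-blame-ignore-revs"],
        check=True,
    )

With that in place, git blame on a reformatted file attributes lines to their original authors rather than to the formatting commit.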

#11315, #11316
Justin Chu 2022-04-26 09:35:16 -07:00 committed by GitHub
Parent 13f86e7d56
Commit fdce4fa6af
No key matching this signature was found
GPG key ID: 4AEE18F83AFDEB23
526 changed files: 54228 additions and 36170 deletions


@ -22,3 +22,4 @@ exclude =
./orttraining,
# ignore server code for now
./server,
ignore = W503, E203


@ -15,8 +15,10 @@ package_url = None
registrations = []
with open(os.path.join(REPO_DIR, 'tools', 'ci_build', 'github', 'linux', 'docker', 'Dockerfile.manylinux2014_cuda11'),
mode="r") as f:
with open(
os.path.join(REPO_DIR, "tools", "ci_build", "github", "linux", "docker", "Dockerfile.manylinux2014_cuda11"),
mode="r",
) as f:
for line in f:
if not line.strip():
package_name = None
@ -36,15 +38,12 @@ with open(os.path.join(REPO_DIR, 'tools', 'ci_build', 'github', 'linux', 'docker
m = re.match(r"(.+?)_DOWNLOAD_URL=(\S+)", line)
if m is not None:
package_url = m.group(2)
if package_name == 'LIBXCRYPT':
package_url = m.group(2) + "/v" + \
package_filename + ".tar.gz"
elif package_name == 'CMAKE':
package_url = m.group(
2) + "/v" + package_filename + "/cmake-" + package_filename + ".tar.gz"
if package_name == "LIBXCRYPT":
package_url = m.group(2) + "/v" + package_filename + ".tar.gz"
elif package_name == "CMAKE":
package_url = m.group(2) + "/v" + package_filename + "/cmake-" + package_filename + ".tar.gz"
else:
package_url = m.group(2) + "/" + \
package_filename + ".tar.gz"
package_url = m.group(2) + "/" + package_filename + ".tar.gz"
registration = {
"Component": {
"Type": "other",
@ -53,7 +52,7 @@ with open(os.path.join(REPO_DIR, 'tools', 'ci_build', 'github', 'linux', 'docker
"Version": package_filename.split("-")[-1],
"DownloadUrl": package_url,
},
"comments": "manylinux dependency"
"comments": "manylinux dependency",
}
}
registrations.append(registration)
@ -67,14 +66,23 @@ def normalize_path_separators(path):
proc = subprocess.run(
["git", "submodule", "foreach", "--quiet", "--recursive", "{} {} $toplevel/$sm_path".format(
normalize_path_separators(sys.executable),
normalize_path_separators(os.path.join(SCRIPT_DIR, "print_submodule_info.py")))],
[
"git",
"submodule",
"foreach",
"--quiet",
"--recursive",
"{} {} $toplevel/$sm_path".format(
normalize_path_separators(sys.executable),
normalize_path_separators(os.path.join(SCRIPT_DIR, "print_submodule_info.py")),
),
],
check=True,
cwd=REPO_DIR,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
universal_newlines=True)
universal_newlines=True,
)
submodule_lines = proc.stdout.splitlines()
@ -88,7 +96,8 @@ for submodule_line in submodule_lines:
"repositoryUrl": url,
},
"comments": "git submodule at {}".format(
normalize_path_separators(os.path.relpath(absolute_path, REPO_DIR)))
normalize_path_separators(os.path.relpath(absolute_path, REPO_DIR))
),
}
}
registrations.append(registration)


@ -10,19 +10,19 @@ assert len(sys.argv) == 2
path = sys.argv[1]
proc = subprocess.run(["git", "config", "--get", "remote.origin.url"],
check=True,
cwd=path,
stdout=subprocess.PIPE,
universal_newlines=True)
proc = subprocess.run(
["git", "config", "--get", "remote.origin.url"],
check=True,
cwd=path,
stdout=subprocess.PIPE,
universal_newlines=True,
)
url = proc.stdout.strip()
proc = subprocess.run(["git", "rev-parse", "HEAD"],
check=True,
cwd=path,
stdout=subprocess.PIPE,
universal_newlines=True)
proc = subprocess.run(
["git", "rev-parse", "HEAD"], check=True, cwd=path, stdout=subprocess.PIPE, universal_newlines=True
)
commit = proc.stdout.strip()

csharp/testdata/test_input_BFLOAT16.py (vendored, 21 lines changed)

@ -2,28 +2,21 @@
# Licensed under the MIT License.
import onnx
from onnx import helper
from onnx import TensorProto, helper
from onnx.helper import make_opsetid
from onnx import TensorProto
input_info = helper.make_tensor_value_info('input', TensorProto.BFLOAT16, [1, 5])
output_info = helper.make_tensor_value_info('output', TensorProto.BFLOAT16, [1, 5])
input_info = helper.make_tensor_value_info("input", TensorProto.BFLOAT16, [1, 5])
output_info = helper.make_tensor_value_info("output", TensorProto.BFLOAT16, [1, 5])
# Create a node (NodeProto) - This is based on Pad-11
node_def = helper.make_node(
'Identity', # node name
['input'], # inputs
['output'] # outputs
)
node_def = helper.make_node("Identity", ["input"], ["output"]) # node name # inputs # outputs
graph_def = helper.make_graph(nodes=[node_def], name='test_types_BLOAT16',
inputs=[input_info], outputs=[output_info])
graph_def = helper.make_graph(nodes=[node_def], name="test_types_BLOAT16", inputs=[input_info], outputs=[output_info])
model_def = helper.make_model(graph_def, producer_name='AIInfra',
opset_imports=[make_opsetid('', 13)])
model_def = helper.make_model(graph_def, producer_name="AIInfra", opset_imports=[make_opsetid("", 13)])
onnx.checker.check_model(model_def)
onnx.helper.strip_doc_string(model_def)
final_model = onnx.shape_inference.infer_shapes(model_def)
onnx.checker.check_model(final_model)
onnx.save(final_model, 'test_types_BFLOAT16.onnx')
onnx.save(final_model, "test_types_BFLOAT16.onnx")

csharp/testdata/test_input_FLOAT16.py (vendored, 23 lines changed)

@ -2,31 +2,28 @@
# Licensed under the MIT License.
import onnx
from onnx import helper
from onnx import TensorProto, helper
from onnx.helper import make_opsetid
from onnx import TensorProto
input_info = helper.make_tensor_value_info('input', TensorProto.FLOAT16, [1, 5])
output_info = helper.make_tensor_value_info('output', TensorProto.FLOAT16, [1, 5])
input_info = helper.make_tensor_value_info("input", TensorProto.FLOAT16, [1, 5])
output_info = helper.make_tensor_value_info("output", TensorProto.FLOAT16, [1, 5])
# Create a node (NodeProto) - This is based on Pad-11
node_def = helper.make_node(
'Slice', # node name
['input'], # inputs
['output'], # outputs
"Slice", # node name
["input"], # inputs
["output"], # outputs
axes=[0, 1], # attributes
ends=[1, 5],
starts=[0, 0]
starts=[0, 0],
)
graph_def = helper.make_graph(nodes=[node_def], name='test_input_FLOAT16',
inputs=[input_info], outputs=[output_info])
graph_def = helper.make_graph(nodes=[node_def], name="test_input_FLOAT16", inputs=[input_info], outputs=[output_info])
model_def = helper.make_model(graph_def, producer_name='AIInfra',
opset_imports=[make_opsetid('', 7)])
model_def = helper.make_model(graph_def, producer_name="AIInfra", opset_imports=[make_opsetid("", 7)])
onnx.checker.check_model(model_def)
onnx.helper.strip_doc_string(model_def)
final_model = onnx.shape_inference.infer_shapes(model_def)
onnx.checker.check_model(final_model)
onnx.save(final_model, 'test_types_FLOAT16.onnx')
onnx.save(final_model, "test_types_FLOAT16.onnx")


@ -6,16 +6,18 @@
# Configuration file for the Sphinx documentation builder.
import os
import sys
import shutil
import sys
import onnxruntime
# import recommonmark
# -- Project information -----------------------------------------------------
project = 'ONNX Runtime'
copyright = '2018-2021, Microsoft'
author = 'Microsoft'
project = "ONNX Runtime"
copyright = "2018-2021, Microsoft"
author = "Microsoft"
version = onnxruntime.__version__
release = version
@ -23,70 +25,72 @@ release = version
extensions = [
"alabaster",
'sphinx.ext.intersphinx',
'sphinx.ext.imgmath',
'sphinx.ext.ifconfig',
'sphinx.ext.viewcode',
"sphinx.ext.intersphinx",
"sphinx.ext.imgmath",
"sphinx.ext.ifconfig",
"sphinx.ext.viewcode",
"sphinx.ext.autodoc",
'sphinx.ext.githubpages',
"sphinx.ext.githubpages",
"sphinx_gallery.gen_gallery",
'sphinx.ext.graphviz',
"sphinx.ext.graphviz",
"pyquickhelper.sphinxext.sphinx_runpython_extension",
]
templates_path = ['_templates']
templates_path = ["_templates"]
source_parsers = {
'.md': 'recommonmark.parser.CommonMarkParser',
".md": "recommonmark.parser.CommonMarkParser",
}
source_suffix = ['.rst'] # , '.md']
source_suffix = [".rst"] # , '.md']
master_doc = 'index'
master_doc = "index"
language = "en"
exclude_patterns = []
pygments_style = 'default'
autoclass_content = 'both'
pygments_style = "default"
autoclass_content = "both"
# -- Options for HTML output -------------------------------------------------
html_theme = "alabaster"
html_logo = "ONNX_Runtime_icon.png"
html_static_path = ['_static']
html_static_path = ["_static"]
graphviz_output_format = "svg"
# -- Options for intersphinx extension ---------------------------------------
# Example configuration for intersphinx: refer to the Python standard library.
intersphinx_mapping = {'https://docs.python.org/': None}
intersphinx_mapping = {"https://docs.python.org/": None}
# -- Options for Sphinx Gallery ----------------------------------------------
sphinx_gallery_conf = {
'examples_dirs': 'examples',
'gallery_dirs': 'auto_examples',
"examples_dirs": "examples",
"gallery_dirs": "auto_examples",
}
# -- markdown options -----------------------------------------------------------
md_image_dest = "media"
md_link_replace = {
'#onnxruntimesessionoptionsenable-profiling)': '#class-onnxruntimesessionoptions)',
"#onnxruntimesessionoptionsenable-profiling)": "#class-onnxruntimesessionoptions)",
}
# -- Setup actions -----------------------------------------------------------
def setup(app):
# download examples for the documentation
this = os.path.abspath(os.path.dirname(__file__))
dest = os.path.join(this, "model.onnx")
if not os.path.exists(dest):
import urllib.request
url = 'https://raw.githubusercontent.com/onnx/onnx/master/onnx/backend/test/data/node/test_sigmoid/model.onnx'
url = "https://raw.githubusercontent.com/onnx/onnx/master/onnx/backend/test/data/node/test_sigmoid/model.onnx"
urllib.request.urlretrieve(url, dest)
loc = os.path.split(dest)[-1]
if not os.path.exists(loc):
import shutil
shutil.copy(dest, loc)
return app


@ -15,15 +15,16 @@ Let's use the API to compute the prediction
of a simple logistic regression model.
"""
import numpy as np
from onnxruntime import datasets
from onnxruntime.capi.onnxruntime_pybind11_state import InvalidArgument
import onnxruntime.backend as backend
from onnx import load
import onnxruntime.backend as backend
########################################
# The device depends on how the package was compiled,
# GPU or CPU.
from onnxruntime import get_device
from onnxruntime import datasets, get_device
from onnxruntime.capi.onnxruntime_pybind11_state import InvalidArgument
device = get_device()
name = datasets.get_example("logreg_iris.onnx")


@ -15,9 +15,10 @@ It starts by loading the model trained in example
trained on *Iris* datasets. The model takes
a vector of dimension 2 and returns a class among three.
"""
import numpy
import onnxruntime as rt
from onnxruntime.capi.onnxruntime_pybind11_state import InvalidArgument
import numpy
from onnxruntime.datasets import get_example
example2 = get_example("logreg_iris.onnx")
@ -37,7 +38,7 @@ try:
except Exception as e:
print("Unexpected type")
print("{0}: {1}".format(type(e), e))
#########################
# The model fails to return an output if the name
# is misspelled.
@ -76,12 +77,12 @@ except Exception as e:
# dimension is a multiple of the expected input dimension.
for x in [
numpy.array([1.0, 2.0, 3.0, 4.0], dtype=numpy.float32),
numpy.array([[1.0, 2.0, 3.0, 4.0]], dtype=numpy.float32),
numpy.array([[1.0, 2.0], [3.0, 4.0]], dtype=numpy.float32),
numpy.array([1.0, 2.0, 3.0], dtype=numpy.float32),
numpy.array([[1.0, 2.0, 3.0]], dtype=numpy.float32),
]:
numpy.array([1.0, 2.0, 3.0, 4.0], dtype=numpy.float32),
numpy.array([[1.0, 2.0, 3.0, 4.0]], dtype=numpy.float32),
numpy.array([[1.0, 2.0], [3.0, 4.0]], dtype=numpy.float32),
numpy.array([1.0, 2.0, 3.0], dtype=numpy.float32),
numpy.array([[1.0, 2.0, 3.0]], dtype=numpy.float32),
]:
try:
r = sess.run([output_name], {input_name: x})
print("Shape={0} and predicted labels={1}".format(x.shape, r))
@ -89,12 +90,12 @@ for x in [
print("ERROR with Shape={0} - {1}".format(x.shape, e))
for x in [
numpy.array([1.0, 2.0, 3.0, 4.0], dtype=numpy.float32),
numpy.array([[1.0, 2.0, 3.0, 4.0]], dtype=numpy.float32),
numpy.array([[1.0, 2.0], [3.0, 4.0]], dtype=numpy.float32),
numpy.array([1.0, 2.0, 3.0], dtype=numpy.float32),
numpy.array([[1.0, 2.0, 3.0]], dtype=numpy.float32),
]:
numpy.array([1.0, 2.0, 3.0, 4.0], dtype=numpy.float32),
numpy.array([[1.0, 2.0, 3.0, 4.0]], dtype=numpy.float32),
numpy.array([[1.0, 2.0], [3.0, 4.0]], dtype=numpy.float32),
numpy.array([1.0, 2.0, 3.0], dtype=numpy.float32),
numpy.array([[1.0, 2.0, 3.0]], dtype=numpy.float32),
]:
try:
r = sess.run(None, {input_name: x})
print("Shape={0} and predicted probabilities={1}".format(x.shape, r[1]))
@ -106,10 +107,10 @@ for x in [
# is higher than expects but produces a warning.
for x in [
numpy.array([[[1.0, 2.0], [3.0, 4.0]]], dtype=numpy.float32),
numpy.array([[[1.0, 2.0, 3.0]]], dtype=numpy.float32),
numpy.array([[[1.0, 2.0]], [[3.0, 4.0]]], dtype=numpy.float32),
]:
numpy.array([[[1.0, 2.0], [3.0, 4.0]]], dtype=numpy.float32),
numpy.array([[[1.0, 2.0, 3.0]]], dtype=numpy.float32),
numpy.array([[[1.0, 2.0]], [[3.0, 4.0]]], dtype=numpy.float32),
]:
try:
r = sess.run([output_name], {input_name: x})
print("Shape={0} and predicted labels={1}".format(x.shape, r))


@ -21,24 +21,25 @@ The first step consists in retrieving the boston datset.
"""
import pandas
from sklearn.datasets import load_boston
boston = load_boston()
X, y = boston.data, boston.target
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train_dict = pandas.DataFrame(X_train[:,1:]).T.to_dict().values()
X_test_dict = pandas.DataFrame(X_test[:,1:]).T.to_dict().values()
X_train_dict = pandas.DataFrame(X_train[:, 1:]).T.to_dict().values()
X_test_dict = pandas.DataFrame(X_test[:, 1:]).T.to_dict().values()
####################################
# We create a pipeline.
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_extraction import DictVectorizer
pipe = make_pipeline(
DictVectorizer(sparse=False),
GradientBoostingRegressor())
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(DictVectorizer(sparse=False), GradientBoostingRegressor())
pipe.fit(X_train_dict, y_train)
####################################
@ -53,15 +54,15 @@ print(r2_score(y_test, pred))
# Conversion to ONNX format
# +++++++++++++++++++++++++
#
# We use module
# We use module
# `sklearn-onnx <https://github.com/onnx/sklearn-onnx>`_
# to convert the model into ONNX format.
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType, Int64TensorType, DictionaryType, SequenceType
from skl2onnx.common.data_types import DictionaryType, FloatTensorType, Int64TensorType, SequenceType
# initial_type = [('float_input', DictionaryType(Int64TensorType([1]), FloatTensorType([])))]
initial_type = [('float_input', DictionaryType(Int64TensorType([1]), FloatTensorType([])))]
initial_type = [("float_input", DictionaryType(Int64TensorType([1]), FloatTensorType([])))]
onx = convert_sklearn(pipe, initial_types=initial_type)
with open("pipeline_vectorize.onnx", "wb") as f:
f.write(onx.SerializeToString())
@ -75,6 +76,7 @@ from onnxruntime.capi.onnxruntime_pybind11_state import InvalidArgument
sess = rt.InferenceSession("pipeline_vectorize.onnx", providers=rt.get_available_providers())
import numpy
inp, out = sess.get_inputs()[0], sess.get_outputs()[0]
print("input name='{}' and shape={} and type={}".format(inp.name, inp.shape, inp.type))
print("output name='{}' and shape={} and type={}".format(out.name, out.shape, out.type))
@ -100,4 +102,3 @@ print(r2_score(pred, pred_onx))
#########################
# Very similar. *ONNX Runtime* uses floats instead of doubles,
# that explains the small discrepencies.


@ -12,8 +12,9 @@ the output for an input vector. It also shows how to
retrieve the definition of its inputs and outputs.
"""
import onnxruntime as rt
import numpy
import onnxruntime as rt
from onnxruntime.datasets import get_example
#########################
@ -37,7 +38,7 @@ print("input type", input_type)
# Let's see the output name and shape.
output_name = sess.get_outputs()[0].name
print("output name", output_name)
print("output name", output_name)
output_shape = sess.get_outputs()[0].shape
print("output shape", output_shape)
output_type = sess.get_outputs()[0].type
@ -47,7 +48,8 @@ print("output type", output_type)
# Let's compute its outputs (or predictions if it is a machine learned model).
import numpy.random
x = numpy.random.random((3,4,5))
x = numpy.random.random((3, 4, 5))
x = x.astype(numpy.float32)
res = sess.run([output_name], {input_name: x})
print(res)


@ -15,9 +15,11 @@ logistic regression model trained with
"""
from onnxruntime.datasets import get_example
example = get_example("logreg_iris.onnx")
import onnx
model = onnx.load(example)
print("doc_string={}".format(model.doc_string))
@ -32,6 +34,7 @@ print("producer_version={}".format(model.producer_version))
# With *ONNX Runtime*:
import onnxruntime as rt
sess = rt.InferenceSession(example, providers=rt.get_available_providers())
meta = sess.get_modelmeta()


@ -21,12 +21,14 @@ That's the most simple way.
"""
from onnxruntime.datasets import get_example
example1 = get_example("mul_1.onnx")
import onnx
model = onnx.load(example1) # model is a ModelProto protobuf message
print(model)
print(model)
#################################
@ -39,31 +41,30 @@ print(model)
from onnx import ModelProto
model = ModelProto()
with open(example1, 'rb') as fid:
with open(example1, "rb") as fid:
content = fid.read()
model.ParseFromString(content)
###################################
# We convert it into a graph.
from onnx.tools.net_drawer import GetPydotGraph, GetOpNodeProducer
pydot_graph = GetPydotGraph(model.graph, name=model.graph.name, rankdir="LR",
node_producer=GetOpNodeProducer("docstring"))
from onnx.tools.net_drawer import GetOpNodeProducer, GetPydotGraph
pydot_graph = GetPydotGraph(
model.graph, name=model.graph.name, rankdir="LR", node_producer=GetOpNodeProducer("docstring")
)
pydot_graph.write_dot("graph.dot")
#######################################
# Then into an image
import os
os.system('dot -O -Tpng graph.dot')
os.system("dot -O -Tpng graph.dot")
################################
# Which we display...
import matplotlib.pyplot as plt
image = plt.imread("graph.dot.png")
plt.imshow(image)


@ -11,9 +11,10 @@ Profile the execution of a simple model
*ONNX Runtime* can profile the execution of the model.
This example shows how to interpret the results.
"""
import onnx
import onnxruntime as rt
import numpy
import onnx
import onnxruntime as rt
from onnxruntime.datasets import get_example
@ -27,8 +28,6 @@ def change_ir_version(filename, ir_version=6):
return model
#########################
# Let's load a very simple model and compute some prediction.
@ -61,10 +60,9 @@ print(prof_file)
# The results are stored un a file in JSON format.
# Let's see what it contains.
import json
with open(prof_file, "r") as f:
sess_time = json.load(f)
import pprint
pprint.pprint(sess_time)


@ -22,16 +22,19 @@ The first step consists in retrieving the iris datset.
"""
from sklearn.datasets import load_iris
iris = load_iris()
X, y = iris.data, iris.target
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)
####################################
# Then we fit a model.
from sklearn.linear_model import LogisticRegression
clr = LogisticRegression()
clr.fit(X_train, y_train)
@ -47,14 +50,14 @@ print(confusion_matrix(y_test, pred))
# Conversion to ONNX format
# +++++++++++++++++++++++++
#
# We use module
# We use module
# `sklearn-onnx <https://github.com/onnx/sklearn-onnx>`_
# to convert the model into ONNX format.
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
initial_type = [('float_input', FloatTensorType([None, 4]))]
initial_type = [("float_input", FloatTensorType([None, 4]))]
onx = convert_sklearn(clr, initial_types=initial_type)
with open("logreg_iris.onnx", "wb") as f:
f.write(onx.SerializeToString())
@ -64,12 +67,11 @@ with open("logreg_iris.onnx", "wb") as f:
# its input and output.
import onnxruntime as rt
sess = rt.InferenceSession("logreg_iris.onnx", providers=rt.get_available_providers())
print("input name='{}' and shape={}".format(
sess.get_inputs()[0].name, sess.get_inputs()[0].shape))
print("output name='{}' and shape={}".format(
sess.get_outputs()[0].name, sess.get_outputs()[0].shape))
print("input name='{}' and shape={}".format(sess.get_inputs()[0].name, sess.get_inputs()[0].shape))
print("output name='{}' and shape={}".format(sess.get_outputs()[0].name, sess.get_outputs()[0].shape))
##################################
# We compute the predictions.
@ -78,6 +80,7 @@ input_name = sess.get_inputs()[0].name
label_name = sess.get_outputs()[0].name
import numpy
pred_onx = sess.run([label_name], {input_name: X_test.astype(numpy.float32)})[0]
print(confusion_matrix(pred, pred_onx))
@ -97,18 +100,20 @@ print(prob_sklearn[:3])
#############################
# And then with ONNX Runtime.
# The probabilies appear to be
# The probabilies appear to be
prob_name = sess.get_outputs()[1].name
prob_rt = sess.run([prob_name], {input_name: X_test.astype(numpy.float32)})[0]
import pprint
pprint.pprint(prob_rt[0:3])
###############################
# Let's benchmark.
from timeit import Timer
def speed(inst, number=10, repeat=20):
timer = Timer(inst, globals=globals())
raw = numpy.array(timer.repeat(repeat, number=number))
@ -117,6 +122,7 @@ def speed(inst, number=10, repeat=20):
print("Average %1.3g min=%1.3g max=%1.3g" % (ave, mi, ma))
return ave
print("Execution time for clr.predict")
speed("clr.predict(X_test)")
@ -128,20 +134,24 @@ speed("sess.run([label_name], {input_name: X_test.astype(numpy.float32)})[0]")
# experiences: the model has to do one prediction at a time
# as opposed to a batch of prediction.
def loop(X_test, fct, n=None):
nrow = X_test.shape[0]
if n is None:
n = nrow
for i in range(0, n):
im = i % nrow
fct(X_test[im: im+1])
fct(X_test[im : im + 1])
print("Execution time for clr.predict")
speed("loop(X_test, clr.predict, 100)")
def sess_predict(x):
return sess.run([label_name], {input_name: x.astype(numpy.float32)})[0]
print("Execution time for sess_predict")
speed("loop(X_test, sess_predict, 100)")
@ -151,14 +161,16 @@ speed("loop(X_test, sess_predict, 100)")
print("Execution time for predict_proba")
speed("loop(X_test, clr.predict_proba, 100)")
def sess_predict_proba(x):
return sess.run([prob_name], {input_name: x.astype(numpy.float32)})[0]
print("Execution time for sess_predict_proba")
speed("loop(X_test, sess_predict_proba, 100)")
#####################################
# This second comparison is better as
# This second comparison is better as
# ONNX Runtime, in this experience,
# computes the label and the probabilities
# in every case.
@ -169,10 +181,11 @@ speed("loop(X_test, sess_predict_proba, 100)")
#
# We first train and save a model in ONNX format.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
initial_type = [('float_input', FloatTensorType([1, 4]))]
initial_type = [("float_input", FloatTensorType([1, 4]))]
onx = convert_sklearn(rf, initial_types=initial_type)
with open("rf_iris.onnx", "wb") as f:
f.write(onx.SerializeToString())
@ -182,9 +195,11 @@ with open("rf_iris.onnx", "wb") as f:
sess = rt.InferenceSession("rf_iris.onnx", providers=rt.get_available_providers())
def sess_predict_proba_rf(x):
return sess.run([prob_name], {input_name: x.astype(numpy.float32)})[0]
print("Execution time for predict_proba")
speed("loop(X_test, rf.predict_proba, 100)")
@ -196,26 +211,28 @@ speed("loop(X_test, sess_predict_proba_rf, 100)")
measures = []
for n_trees in range(5, 51, 5):
for n_trees in range(5, 51, 5):
print(n_trees)
rf = RandomForestClassifier(n_estimators=n_trees)
rf.fit(X_train, y_train)
initial_type = [('float_input', FloatTensorType([1, 4]))]
initial_type = [("float_input", FloatTensorType([1, 4]))]
onx = convert_sklearn(rf, initial_types=initial_type)
with open("rf_iris_%d.onnx" % n_trees, "wb") as f:
f.write(onx.SerializeToString())
sess = rt.InferenceSession("rf_iris_%d.onnx" % n_trees, providers=rt.get_available_providers())
def sess_predict_proba_loop(x):
return sess.run([prob_name], {input_name: x.astype(numpy.float32)})[0]
tsk = speed("loop(X_test, rf.predict_proba, 100)", number=5, repeat=5)
trt = speed("loop(X_test, sess_predict_proba_loop, 100)", number=5, repeat=5)
measures.append({'n_trees': n_trees, 'sklearn': tsk, 'rt': trt})
measures.append({"n_trees": n_trees, "sklearn": tsk, "rt": trt})
from pandas import DataFrame
df = DataFrame(measures)
ax = df.plot(x="n_trees", y="sklearn", label="scikit-learn", c="blue", logy=True)
df.plot(x="n_trees", y="rt", label="onnxruntime",
ax=ax, c="green", logy=True)
df.plot(x="n_trees", y="rt", label="onnxruntime", ax=ax, c="green", logy=True)
ax.set_xlabel("Number of trees")
ax.set_ylabel("Prediction time (s)")
ax.set_title("Speed comparison between scikit-learn and ONNX Runtime\nFor a random forest on Iris dataset")


@ -7,30 +7,28 @@
# -- Project information -----------------------------------------------------
project = 'ORTModule'
copyright = '2018-2021, Microsoft'
author = 'Microsoft'
version = '0.1' # TODO: Should use `onnxruntime.__version__` instead?
project = "ORTModule"
copyright = "2018-2021, Microsoft"
author = "Microsoft"
version = "0.1" # TODO: Should use `onnxruntime.__version__` instead?
release = version
# -- General configuration ---------------------------------------------------
extensions = ['sphinx.ext.autodoc',
'sphinx.ext.intersphinx'
]
templates_path = ['_templates']
extensions = ["sphinx.ext.autodoc", "sphinx.ext.intersphinx"]
templates_path = ["_templates"]
exclude_patterns = []
autoclass_content = 'both'
autoclass_content = "both"
# -- Options for HTML output -------------------------------------------------
html_theme = 'sphinx_rtd_theme'
html_static_path = ['_static']
html_theme = "sphinx_rtd_theme"
html_static_path = ["_static"]
# -- Options for intersphinx extension ---------------------------------------
intersphinx_mapping = {
'python': ('https://docs.python.org/3', None),
'numpy': ('https://numpy.org/doc/stable', None),
'torch': ('https://pytorch.org/docs/stable/', None),
"python": ("https://docs.python.org/3", None),
"numpy": ("https://numpy.org/doc/stable", None),
"torch": ("https://pytorch.org/docs/stable/", None),
}

objectivec/test/testdata/single_add_gen.py (vendored, 14 lines changed)

@ -1,6 +1,5 @@
import onnx
from onnx import helper
from onnx import TensorProto
from onnx import TensorProto, helper
graph = helper.make_graph(
[ # nodes
@ -8,12 +7,13 @@ graph = helper.make_graph(
],
"SingleAdd", # name
[ # inputs
helper.make_tensor_value_info('A', TensorProto.FLOAT, [1]),
helper.make_tensor_value_info('B', TensorProto.FLOAT, [1]),
helper.make_tensor_value_info("A", TensorProto.FLOAT, [1]),
helper.make_tensor_value_info("B", TensorProto.FLOAT, [1]),
],
[ # outputs
helper.make_tensor_value_info('C', TensorProto.FLOAT, [1]),
])
helper.make_tensor_value_info("C", TensorProto.FLOAT, [1]),
],
)
model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 12)])
onnx.save(model, r'single_add.onnx')
onnx.save(model, r"single_add.onnx")


@ -20,11 +20,31 @@ __author__ = "Microsoft"
# meaningful messages to the user.
# the saved exception is raised after device version validation.
try:
from onnxruntime.capi._pybind_state import get_all_providers, get_available_providers, get_device, set_seed, \
RunOptions, SessionOptions, set_default_logger_severity, enable_telemetry_events, disable_telemetry_events, \
NodeArg, ModelMetadata, GraphOptimizationLevel, ExecutionMode, ExecutionOrder, SessionIOBinding, \
OrtAllocatorType, OrtMemType, OrtArenaCfg, OrtMemoryInfo, create_and_register_allocator, OrtSparseFormat, \
set_default_logger_verbosity
from onnxruntime.capi._pybind_state import (
ExecutionMode,
ExecutionOrder,
GraphOptimizationLevel,
ModelMetadata,
NodeArg,
OrtAllocatorType,
OrtArenaCfg,
OrtMemoryInfo,
OrtMemType,
OrtSparseFormat,
RunOptions,
SessionIOBinding,
SessionOptions,
create_and_register_allocator,
disable_telemetry_events,
enable_telemetry_events,
get_all_providers,
get_available_providers,
get_device,
set_default_logger_severity,
set_default_logger_verbosity,
set_seed,
)
import_capi_exception = None
except Exception as e:
import_capi_exception = e
@ -34,9 +54,13 @@ from onnxruntime.capi import onnxruntime_validation
if import_capi_exception:
raise import_capi_exception
from onnxruntime.capi.onnxruntime_inference_collection import InferenceSession, IOBinding, OrtValue, SparseTensor, \
OrtDevice
from onnxruntime.capi.onnxruntime_inference_collection import (
InferenceSession,
IOBinding,
OrtDevice,
OrtValue,
SparseTensor,
)
from onnxruntime.capi.training import * # noqa: F403
# TODO: thiagofc: Temporary experimental namespace for new PyTorch front-end
@ -45,7 +69,8 @@ try:
except ImportError:
pass
from onnxruntime.capi.onnxruntime_validation import package_name, version, cuda_version
from onnxruntime.capi.onnxruntime_validation import cuda_version, package_name, version
if version:
__version__ = version


@ -1,4 +1,4 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
# --------------------------------------------------------------------------


@ -8,14 +8,16 @@ import os
import subprocess
import sys
def is_windows():
return sys.platform.startswith("win")
def gen_md5(filename):
if not os.path.exists(filename):
return False
hash_md5 = hashlib.md5()
BLOCKSIZE = 1024*64
BLOCKSIZE = 1024 * 64
with open(filename, "rb") as f:
buf = f.read(BLOCKSIZE)
while len(buf) > 0:
@ -23,54 +25,61 @@ def gen_md5(filename):
buf = f.read(BLOCKSIZE)
return hash_md5.hexdigest()
def gen_checksum(file_checksum, input_dir):
if not file_checksum:
return
name = 'ORTInternal_checksum'
with open(os.path.join(input_dir, name + '.cc'), 'w') as checksum_cc:
print('#include <stdlib.h>', file=checksum_cc)
name = "ORTInternal_checksum"
with open(os.path.join(input_dir, name + ".cc"), "w") as checksum_cc:
print("#include <stdlib.h>", file=checksum_cc)
print('static const char model_checksum[] = "' + file_checksum + '";', file=checksum_cc)
print('extern "C"', file=checksum_cc)
if is_windows():
print('__declspec(dllexport)', file=checksum_cc)
print('void _ORTInternal_GetCheckSum(const char*& cs, size_t& len) {', file=checksum_cc)
print(' cs = model_checksum; len = sizeof(model_checksum)/sizeof(model_checksum[0]) - 1;', file=checksum_cc)
print('}', file=checksum_cc)
print("__declspec(dllexport)", file=checksum_cc)
print("void _ORTInternal_GetCheckSum(const char*& cs, size_t& len) {", file=checksum_cc)
print(" cs = model_checksum; len = sizeof(model_checksum)/sizeof(model_checksum[0]) - 1;", file=checksum_cc)
print("}", file=checksum_cc)
def gen_cache_version(input_dir):
name = 'ORTInternal_cache_version'
with open(os.path.join(input_dir, name + '.cc'), 'w') as cache_version_cc:
header_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'NUPHAR_CACHE_VERSION')
name = "ORTInternal_cache_version"
with open(os.path.join(input_dir, name + ".cc"), "w") as cache_version_cc:
header_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), "NUPHAR_CACHE_VERSION")
print('#include "{}"'.format(header_file), file=cache_version_cc)
print('extern "C"', file=cache_version_cc)
if is_windows():
print('__declspec(dllexport)', file=cache_version_cc)
print('const char* _ORTInternal_GetCacheVersion() {', file=cache_version_cc)
print(' return __NUPHAR_CACHE_VERSION__;', file=cache_version_cc)
print('}', file=cache_version_cc)
print("__declspec(dllexport)", file=cache_version_cc)
print("const char* _ORTInternal_GetCacheVersion() {", file=cache_version_cc)
print(" return __NUPHAR_CACHE_VERSION__;", file=cache_version_cc)
print("}", file=cache_version_cc)
def compile_all_cc(path):
for f in os.listdir(path):
name, ext = os.path.splitext(f)
if ext != '.cc':
if ext != ".cc":
continue
if is_windows():
subprocess.run(['cl', '/Fo' + name + '.o', '/c', f], cwd=path, check=True)
subprocess.run(["cl", "/Fo" + name + ".o", "/c", f], cwd=path, check=True)
else:
subprocess.run(['g++', '-std=c++14', '-fPIC', '-o', name + '.o', '-c', f], cwd=path, check=True)
subprocess.run(["g++", "-std=c++14", "-fPIC", "-o", name + ".o", "-c", f], cwd=path, check=True)
os.remove(os.path.join(path, f))
def parse_arguments():
parser = argparse.ArgumentParser(description="Offline shared lib creation tool.")
# Main arguments
parser.add_argument('--keep_input', action='store_true', help="Keep input files after created so.")
parser.add_argument('--input_dir', help="The input directory that contains obj files.", required=True)
parser.add_argument('--output_name', help="The output so file name.", default='jit.so')
parser.add_argument('--input_model', help="The input model file name to generate checksum into shared lib.", default=None)
parser.add_argument("--keep_input", action="store_true", help="Keep input files after created so.")
parser.add_argument("--input_dir", help="The input directory that contains obj files.", required=True)
parser.add_argument("--output_name", help="The output so file name.", default="jit.so")
parser.add_argument(
"--input_model", help="The input model file name to generate checksum into shared lib.", default=None
)
return parser.parse_args()
if __name__ == '__main__':
if __name__ == "__main__":
args = parse_arguments()
if args.input_model:
@ -81,8 +90,8 @@ if __name__ == '__main__':
if is_windows():
# create dllmain
name = 'ORTInternal_dllmain'
with open(os.path.join(args.input_dir, name + '.cc'), 'w') as dllmain_cc:
name = "ORTInternal_dllmain"
with open(os.path.join(args.input_dir, name + ".cc"), "w") as dllmain_cc:
print("#include <windows.h>", file=dllmain_cc)
print("BOOL APIENTRY DllMain(HMODULE hModule,", file=dllmain_cc)
print(" DWORD ul_reason_for_call,", file=dllmain_cc)
@ -90,12 +99,20 @@ if __name__ == '__main__':
print(" {return TRUE;}", file=dllmain_cc)
compile_all_cc(args.input_dir)
objs = [f for f in os.listdir(args.input_dir) if os.path.isfile(os.path.join(args.input_dir, f)) and '.o' == os.path.splitext(f)[1]]
objs = [
f
for f in os.listdir(args.input_dir)
if os.path.isfile(os.path.join(args.input_dir, f)) and ".o" == os.path.splitext(f)[1]
]
if is_windows():
subprocess.run(['link', '-dll', '-FORCE:MULTIPLE', '-EXPORT:__tvm_main__', '-out:' + args.output_name, '*.o'], cwd=args.input_dir, check=True)
subprocess.run(
["link", "-dll", "-FORCE:MULTIPLE", "-EXPORT:__tvm_main__", "-out:" + args.output_name, "*.o"],
cwd=args.input_dir,
check=True,
)
else:
subprocess.run(['g++', '-shared', '-fPIC', '-o', args.output_name] + objs, cwd=args.input_dir, check=True)
subprocess.run(["g++", "-shared", "-fPIC", "-o", args.output_name] + objs, cwd=args.input_dir, check=True)
if not args.keep_input:
for f in objs:

The diff for this file is not shown because of its large size.


@ -3,13 +3,16 @@
# -*- coding: UTF-8 -*-
import argparse
from enum import Enum
import json
from enum import Enum
import numpy as np
import onnx
from onnx import helper, numpy_helper
from .node_factory import NodeFactory, ensure_opset
class QuantizeConfig:
def __init__(self, signed, reserved_bits, type_bits):
self.sign_bit_ = 1 if signed else 0
@ -18,9 +21,9 @@ class QuantizeConfig:
@staticmethod
def from_dict(qcfg_dict):
return QuantizeConfig(1 if qcfg_dict['QuantizationType'] == 'Signed' else 0,
qcfg_dict['ReservedBit'],
qcfg_dict['QuantizeBit'])
return QuantizeConfig(
1 if qcfg_dict["QuantizationType"] == "Signed" else 0, qcfg_dict["ReservedBit"], qcfg_dict["QuantizeBit"]
)
def signed(self):
return self.sign_bit_ == 1
@ -47,10 +50,15 @@ class QuantizeConfig:
def q_type_bits(self):
return self.type_bits_
def __iter__(self): # need this to make dict for json
return iter([('QuantizeBit', self.type_bits_),
('QuantizationType', 'Signed' if self.sign_bit_ else 'Unsigned'),
('ReservedBit', self.reserved_bits_)])
def __iter__(self): # need this to make dict for json
return iter(
[
("QuantizeBit", self.type_bits_),
("QuantizationType", "Signed" if self.sign_bit_ else "Unsigned"),
("ReservedBit", self.reserved_bits_),
]
)
def parse_custom_attributes(in_node):
if in_node.doc_string:
@ -67,40 +75,56 @@ def parse_custom_attributes(in_node):
# "ReservedBitOfMatrix":0}}
qcfg_str = in_node.doc_string
# make sure it's the string we can parse
if 'custom_attributes' in qcfg_str:
if "custom_attributes" in qcfg_str:
# some fixes to make it a valid JSON string, when model keys are not string
if qcfg_str[1] == 'c':
qcfg_str = qcfg_str.replace('{', '{"')
qcfg_str = qcfg_str.replace(',', ',"')
qcfg_str = qcfg_str.replace(':', '":')
qcfg_str = qcfg_str.replace('{"}', '{}')
qcfg = json.loads(qcfg_str)['custom_attributes']
if qcfg_str[1] == "c":
qcfg_str = qcfg_str.replace("{", '{"')
qcfg_str = qcfg_str.replace(",", ',"')
qcfg_str = qcfg_str.replace(":", '":')
qcfg_str = qcfg_str.replace('{"}', "{}")
qcfg = json.loads(qcfg_str)["custom_attributes"]
if qcfg:
return qcfg
return None
def parse_node_description(in_node):
if not in_node.doc_string:
return None
custom_qcfg = parse_custom_attributes(in_node)
if custom_qcfg:
assert custom_qcfg['IntermediateBit'] == 32
assert custom_qcfg['PerRowQuantization']
assert custom_qcfg['QuantizeBitOfVector'] == custom_qcfg['QuantizeBitOfMatrix']
qbits = custom_qcfg['QuantizeBitOfVector']
assert ("Asymmetric" in custom_qcfg['VectorQuantizationType']) == ("Asymmetric" in custom_qcfg['MatrixQuantizationType'])
symmetric = 0 if "Asymmetric" in custom_qcfg['VectorQuantizationType'] else 1
x_signed = 0 if "Unsigned" in custom_qcfg['VectorQuantizationType'] else 1
w_signed = 0 if "Unsigned" in custom_qcfg['MatrixQuantizationType'] else 1
x_reserved_bits = custom_qcfg['ReservedBitOfVector']
w_reserved_bits = custom_qcfg['ReservedBitOfMatrix']
return {'W' : dict(QuantizeConfig(signed=w_signed, reserved_bits=w_reserved_bits, type_bits=qbits)),
'X' : dict(QuantizeConfig(signed=x_signed, reserved_bits=x_reserved_bits, type_bits=qbits)),
'Symmetric' : symmetric}
assert custom_qcfg["IntermediateBit"] == 32
assert custom_qcfg["PerRowQuantization"]
assert custom_qcfg["QuantizeBitOfVector"] == custom_qcfg["QuantizeBitOfMatrix"]
qbits = custom_qcfg["QuantizeBitOfVector"]
assert ("Asymmetric" in custom_qcfg["VectorQuantizationType"]) == (
"Asymmetric" in custom_qcfg["MatrixQuantizationType"]
)
symmetric = 0 if "Asymmetric" in custom_qcfg["VectorQuantizationType"] else 1
x_signed = 0 if "Unsigned" in custom_qcfg["VectorQuantizationType"] else 1
w_signed = 0 if "Unsigned" in custom_qcfg["MatrixQuantizationType"] else 1
x_reserved_bits = custom_qcfg["ReservedBitOfVector"]
w_reserved_bits = custom_qcfg["ReservedBitOfMatrix"]
return {
"W": dict(QuantizeConfig(signed=w_signed, reserved_bits=w_reserved_bits, type_bits=qbits)),
"X": dict(QuantizeConfig(signed=x_signed, reserved_bits=x_reserved_bits, type_bits=qbits)),
"Symmetric": symmetric,
}
return None
def quantize_matmul_2d_with_weight(in_node, in_graph, nf, converted_weights, quantized_inputs, qcfg_dict, update_qcfg_dict, default_qcfg, onnx_opset_ver):
assert in_node.op_type == 'MatMul'
def quantize_matmul_2d_with_weight(
in_node,
in_graph,
nf,
converted_weights,
quantized_inputs,
qcfg_dict,
update_qcfg_dict,
default_qcfg,
onnx_opset_ver,
):
assert in_node.op_type == "MatMul"
# quantize weight
# only handles weight being inputs[1] of MatMul/Gemm node
@ -108,7 +132,7 @@ def quantize_matmul_2d_with_weight(in_node, in_graph, nf, converted_weights, qua
# skip if weights shared by other nodes that's not MatMul
# TODO: support GEMM op if needed
other_nodes = [n for n in in_graph.node if n != in_node and fparam_name in n.input and n.op_type != 'MatMul']
other_nodes = [n for n in in_graph.node if n != in_node and fparam_name in n.input and n.op_type != "MatMul"]
if other_nodes:
return False
@ -119,12 +143,16 @@ def quantize_matmul_2d_with_weight(in_node, in_graph, nf, converted_weights, qua
if not node_qcfg:
if not update_qcfg_dict and qcfg_dict:
# when qcfg_dict is readonly, raise warning if qcfg is not found for this node
print("Warning: qcfg is not found for node with output: " + in_node.output[0] + ", fall back to default qcfg.")
print(
"Warning: qcfg is not found for node with output: "
+ in_node.output[0]
+ ", fall back to default qcfg."
)
node_qcfg = default_qcfg
w_qcfg = QuantizeConfig.from_dict(node_qcfg['W'])
x_qcfg = QuantizeConfig.from_dict(node_qcfg['X'])
symmetric = node_qcfg['Symmetric']
w_qcfg = QuantizeConfig.from_dict(node_qcfg["W"])
x_qcfg = QuantizeConfig.from_dict(node_qcfg["X"])
symmetric = node_qcfg["Symmetric"]
# for symmetric quantization, both weight and input should be quantized to signed
assert not symmetric or (w_qcfg.signed() and x_qcfg.signed())
@ -149,32 +177,34 @@ def quantize_matmul_2d_with_weight(in_node, in_graph, nf, converted_weights, qua
else:
fmin = np.amin(fparam, axis=0)
fmax = np.amax(fparam, axis=0)
fscale = (fmax - fmin)/(2 if w_qcfg.signed() else 1) # signed would be normalized to [-1, 1], and unsigned to [0, 1]
fscale = (fmax - fmin) / (
2 if w_qcfg.signed() else 1
) # signed would be normalized to [-1, 1], and unsigned to [0, 1]
step = fscale / q_range
base = (fmax + fmin + step) * 0.5 if w_qcfg.signed() else fmin
fparam_norm = np.zeros_like(fparam)
expand_fscale = np.expand_dims(fscale,0)
np.divide((fparam - np.expand_dims(base,0)), expand_fscale, out=fparam_norm, where=expand_fscale!=0)
expand_fscale = np.expand_dims(fscale, 0)
np.divide((fparam - np.expand_dims(base, 0)), expand_fscale, out=fparam_norm, where=expand_fscale != 0)
qparam = np.round(fparam_norm * q_range)
qparam = np.clip(qparam, w_qcfg.q_min(), w_qcfg.q_max())
qparam_rowsum = np.sum(qparam, axis=0)
qparam = qparam.astype(w_qcfg.q_type())
# create new weights in main graph in case other Scans share via converted_weights
nf.make_initializer(step, fparam_name + '_step', in_main_graph=True)
nf.make_initializer(qparam, fparam_name + '_qparam', in_main_graph=True)
step = fparam_name + '_step'
qparam = fparam_name + '_qparam'
nf.make_initializer(step, fparam_name + "_step", in_main_graph=True)
nf.make_initializer(qparam, fparam_name + "_qparam", in_main_graph=True)
step = fparam_name + "_step"
qparam = fparam_name + "_qparam"
if symmetric:
# no need to compute qparam_rowsum and base for symmetric quantization
base = None
qparam_rowsum = None
else:
nf.make_initializer(base, fparam_name + '_base', in_main_graph=True)
base = fparam_name + '_base'
nf.make_initializer(qparam_rowsum, fparam_name + '_qparam_rowsum', in_main_graph=True)
qparam_rowsum = fparam_name + '_qparam_rowsum'
nf.make_initializer(base, fparam_name + "_base", in_main_graph=True)
base = fparam_name + "_base"
nf.make_initializer(qparam_rowsum, fparam_name + "_qparam_rowsum", in_main_graph=True)
qparam_rowsum = fparam_name + "_qparam_rowsum"
converted_weights[fparam_name] = (step, base, qparam_rowsum, qparam, w_qcfg, symmetric)
nf.remove_initializer(fparam_name)
@ -183,136 +213,216 @@ def quantize_matmul_2d_with_weight(in_node, in_graph, nf, converted_weights, qua
input_dim = nf.get_initializer(qparam).shape[0]
X = in_node.input[0]
if quantized_inputs is not None:
quantized_inputs_key = '{}_{}_{}'.format(X, symmetric, '|'.join(['{}:{}'.format(k,v) for (k, v) in x_qcfg]))
quantized_inputs_key = "{}_{}_{}".format(
X, symmetric, "|".join(["{}:{}".format(k, v) for (k, v) in x_qcfg])
)
if quantized_inputs is not None and quantized_inputs_key in quantized_inputs:
scale_X, bias_X, Q_X, Q_X_sum_int32 = quantized_inputs[quantized_inputs_key]
else:
if symmetric:
delta_X = nf.make_node('ReduceMax', nf.make_node('Abs', X), {'axes':[-1]}) # keepdims = 1
inv_delta_X = nf.make_node('Reciprocal', delta_X)
norm_X = nf.make_node('Mul', [X, inv_delta_X])
delta_X = nf.make_node("ReduceMax", nf.make_node("Abs", X), {"axes": [-1]}) # keepdims = 1
inv_delta_X = nf.make_node("Reciprocal", delta_X)
norm_X = nf.make_node("Mul", [X, inv_delta_X])
bias_X = None
assert x_qcfg.signed()
else:
reduce_max_X = nf.make_node('ReduceMax', X, {'axes':[-1]}) # keepdims = 1
bias_X = nf.make_node('ReduceMin', X, {'axes':[-1]})
delta_X = nf.make_node('Sub', [reduce_max_X, bias_X])
inv_delta_X = nf.make_node('Reciprocal', delta_X)
norm_X = nf.make_node('Mul', [nf.make_node('Sub', [X, bias_X]), inv_delta_X])
reduce_max_X = nf.make_node("ReduceMax", X, {"axes": [-1]}) # keepdims = 1
bias_X = nf.make_node("ReduceMin", X, {"axes": [-1]})
delta_X = nf.make_node("Sub", [reduce_max_X, bias_X])
inv_delta_X = nf.make_node("Reciprocal", delta_X)
norm_X = nf.make_node("Mul", [nf.make_node("Sub", [X, bias_X]), inv_delta_X])
scale_X = nf.make_node('Mul', [delta_X, np.asarray(1.0 / x_qcfg.q_range()).astype(np.float32)])
Q_Xf = nf.make_node('Mul', [norm_X, np.asarray(x_qcfg.q_range()).astype(np.float32)])
Q_Xf = nf.make_node('Add', [Q_Xf, np.asarray(0.5).astype(np.float32)])
Q_Xf = nf.make_node('Floor', Q_Xf)
scale_X = nf.make_node("Mul", [delta_X, np.asarray(1.0 / x_qcfg.q_range()).astype(np.float32)])
Q_Xf = nf.make_node("Mul", [norm_X, np.asarray(x_qcfg.q_range()).astype(np.float32)])
Q_Xf = nf.make_node("Add", [Q_Xf, np.asarray(0.5).astype(np.float32)])
Q_Xf = nf.make_node("Floor", Q_Xf)
if onnx_opset_ver < 11:
Q_Xf = nf.make_node('Clip', Q_Xf, {'max':x_qcfg.q_max(), 'min':x_qcfg.q_min()})
Q_Xf = nf.make_node("Clip", Q_Xf, {"max": x_qcfg.q_max(), "min": x_qcfg.q_min()})
else:
# Clip changed min max to inputs in opset 11
Q_Xf = nf.make_node('Clip', [Q_Xf, np.asarray(x_qcfg.q_min()).astype(np.float32), np.asarray(x_qcfg.q_max()).astype(np.float32)])
Q_X = nf.make_node('Cast', Q_Xf, {'to':int({np.uint8 : onnx.TensorProto.UINT8,
np.int8 : onnx.TensorProto.INT8,
np.uint16 : onnx.TensorProto.UINT16,
np.int16 : onnx.TensorProto.INT16}[x_qcfg.q_type()])})
Q_Xf = nf.make_node(
"Clip",
[
Q_Xf,
np.asarray(x_qcfg.q_min()).astype(np.float32),
np.asarray(x_qcfg.q_max()).astype(np.float32),
],
)
Q_X = nf.make_node(
"Cast",
Q_Xf,
{
"to": int(
{
np.uint8: onnx.TensorProto.UINT8,
np.int8: onnx.TensorProto.INT8,
np.uint16: onnx.TensorProto.UINT16,
np.int16: onnx.TensorProto.INT16,
}[x_qcfg.q_type()]
)
},
)
if symmetric:
Q_X_sum_int32 = None
else:
Q_X_sum_int32 = nf.make_node_with_axes('ReduceSum', nf.make_node('Cast', Q_X, {'to':int(onnx.TensorProto.INT32)}), [-1], onnx_opset_ver)
Q_X_sum_int32 = nf.make_node_with_axes(
"ReduceSum", nf.make_node("Cast", Q_X, {"to": int(onnx.TensorProto.INT32)}), [-1], onnx_opset_ver
)
if quantized_inputs is not None:
quantized_inputs[quantized_inputs_key] = (scale_X, bias_X, Q_X, Q_X_sum_int32)
# MatMulInteger
if x_qcfg.q_type_bits() == 8:
Q_Y = nf.make_node('MatMulInteger', [Q_X, qparam])
Q_Y = nf.make_node("MatMulInteger", [Q_X, qparam])
else:
Q_Y = nf.make_node('MatMulInteger16', [Q_X, qparam])
Q_Y = nf.make_node("MatMulInteger16", [Q_X, qparam])
Q_Y.domain = "com.microsoft"
# Dequantize
Y = in_node.output[0]
if symmetric:
nf.make_node('Mul',
[nf.make_node('Mul', [step, scale_X]),
nf.make_node('Cast', Q_Y, {'to': int(onnx.TensorProto.FLOAT)})],
output_names=Y)
nf.make_node(
"Mul",
[nf.make_node("Mul", [step, scale_X]), nf.make_node("Cast", Q_Y, {"to": int(onnx.TensorProto.FLOAT)})],
output_names=Y,
)
else:
o0 = nf.make_node('Mul', [nf.make_node('Mul', [step, scale_X]),
nf.make_node('Cast', Q_Y, {'to': int(onnx.TensorProto.FLOAT)})])
o1 = nf.make_node('Mul', [nf.make_node('Mul', [step, bias_X]), qparam_rowsum])
o2 = nf.make_node('Mul', [base, nf.make_node('Mul', [scale_X, nf.make_node('Cast', Q_X_sum_int32, {'to':int(onnx.TensorProto.FLOAT)})])])
o3 = nf.make_node('Mul', [base, nf.make_node('Mul', [bias_X, np.asarray(float(input_dim)).astype(np.float32)])])
nf.make_node('Sum', [o3, o2, o1, o0], output_names=Y)
o0 = nf.make_node(
"Mul",
[nf.make_node("Mul", [step, scale_X]), nf.make_node("Cast", Q_Y, {"to": int(onnx.TensorProto.FLOAT)})],
)
o1 = nf.make_node("Mul", [nf.make_node("Mul", [step, bias_X]), qparam_rowsum])
o2 = nf.make_node(
"Mul",
[
base,
nf.make_node(
"Mul", [scale_X, nf.make_node("Cast", Q_X_sum_int32, {"to": int(onnx.TensorProto.FLOAT)})]
),
],
)
o3 = nf.make_node(
"Mul", [base, nf.make_node("Mul", [bias_X, np.asarray(float(input_dim)).astype(np.float32)])]
)
nf.make_node("Sum", [o3, o2, o1, o0], output_names=Y)
if update_qcfg_dict:
qcfg_dict[in_node.output[0]] = node_qcfg
return True
def upgrade_op(nf, in_n):
if in_n.op_type == 'Slice' and len(in_n.input) == 1:
if in_n.op_type == "Slice" and len(in_n.input) == 1:
# convert opset9 Slice to opset10
with nf.scoped_prefix(in_n.name) as scoped_prefix:
slice_inputs = [in_n.input[0],
np.asarray(NodeFactory.get_attribute(in_n,'starts')).astype(np.int64),
np.asarray(NodeFactory.get_attribute(in_n,'ends')).astype(np.int64),
np.asarray(NodeFactory.get_attribute(in_n,'axes')).astype(np.int64)]
nf.make_node('Slice', slice_inputs, output_names=list(in_n.output))
slice_inputs = [
in_n.input[0],
np.asarray(NodeFactory.get_attribute(in_n, "starts")).astype(np.int64),
np.asarray(NodeFactory.get_attribute(in_n, "ends")).astype(np.int64),
np.asarray(NodeFactory.get_attribute(in_n, "axes")).astype(np.int64),
]
nf.make_node("Slice", slice_inputs, output_names=list(in_n.output))
return True
elif in_n.op_type == 'TopK' and len(in_n.input) == 1:
elif in_n.op_type == "TopK" and len(in_n.input) == 1:
# convert opset1 TopK to opset10
with nf.scoped_prefix(in_n.name) as scoped_prefix:
topk_inputs = [in_n.input[0],
np.asarray([NodeFactory.get_attribute(in_n,'k')]).astype(np.int64)]
nf.make_node('TopK', topk_inputs, {'axis':NodeFactory.get_attribute(in_n,'axis',-1)}, output_names=list(in_n.output))
topk_inputs = [in_n.input[0], np.asarray([NodeFactory.get_attribute(in_n, "k")]).astype(np.int64)]
nf.make_node(
"TopK",
topk_inputs,
{"axis": NodeFactory.get_attribute(in_n, "axis", -1)},
output_names=list(in_n.output),
)
return True
else:
return False
# quantize matmul to MatMulInteger using asymm uint8
def convert_matmul_model(input_model, output_model, only_for_scan=False, share_input_quantization=False, preset_str='asymm8_param0_input1', qcfg_json=None, export_qcfg_json=None):
preset_qcfgs = {'asymm8_param0_input1' : {'W' : dict(QuantizeConfig(signed=1, reserved_bits=0, type_bits=8)),
'X' : dict(QuantizeConfig(signed=0, reserved_bits=1, type_bits=8)),
'Symmetric' : 0},
'symm16_param3_input3' : {'W' : dict(QuantizeConfig(signed=1, reserved_bits=3, type_bits=16)),
'X' : dict(QuantizeConfig(signed=1, reserved_bits=3, type_bits=16)),
'Symmetric' : 1}}
def convert_matmul_model(
input_model,
output_model,
only_for_scan=False,
share_input_quantization=False,
preset_str="asymm8_param0_input1",
qcfg_json=None,
export_qcfg_json=None,
):
preset_qcfgs = {
"asymm8_param0_input1": {
"W": dict(QuantizeConfig(signed=1, reserved_bits=0, type_bits=8)),
"X": dict(QuantizeConfig(signed=0, reserved_bits=1, type_bits=8)),
"Symmetric": 0,
},
"symm16_param3_input3": {
"W": dict(QuantizeConfig(signed=1, reserved_bits=3, type_bits=16)),
"X": dict(QuantizeConfig(signed=1, reserved_bits=3, type_bits=16)),
"Symmetric": 1,
},
}
default_qcfg = preset_qcfgs[preset_str]
in_mp = onnx.load(input_model)
qcfg_dict = {}
if qcfg_json and not export_qcfg_json:
with open(qcfg_json, 'r') as f:
with open(qcfg_json, "r") as f:
qcfg_dict = json.load(f)
out_mp = onnx.ModelProto()
out_mp.CopyFrom(in_mp)
out_mp.ir_version = 5 # update ir version to avoid requirement of initializer in graph input
onnx_opset_ver = ensure_opset(out_mp, 10) # bump up to ONNX opset 10, which is required for MatMulInteger
ensure_opset(out_mp, 1, 'com.microsoft') # add MS domain for MatMulInteger16
out_mp.graph.ClearField('node')
out_mp.ir_version = 5 # update ir version to avoid requirement of initializer in graph input
onnx_opset_ver = ensure_opset(out_mp, 10) # bump up to ONNX opset 10, which is required for MatMulInteger
ensure_opset(out_mp, 1, "com.microsoft") # add MS domain for MatMulInteger16
out_mp.graph.ClearField("node")
nf = NodeFactory(out_mp.graph)
converted_weights = {} # remember MatMul weights that have been converted, in case of sharing
quantized_inputs = {} if share_input_quantization else None # remember quantized inputs that might be able to share between MatMuls
converted_weights = {} # remember MatMul weights that have been converted, in case of sharing
quantized_inputs = (
{} if share_input_quantization else None
) # remember quantized inputs that might be able to share between MatMuls
for in_n in in_mp.graph.node:
if upgrade_op(nf, in_n):
continue
if in_n.op_type == 'MatMul' and not only_for_scan:
if quantize_matmul_2d_with_weight(in_n, in_mp.graph, nf, converted_weights, quantized_inputs, qcfg_dict, export_qcfg_json, default_qcfg, onnx_opset_ver):
if in_n.op_type == "MatMul" and not only_for_scan:
if quantize_matmul_2d_with_weight(
in_n,
in_mp.graph,
nf,
converted_weights,
quantized_inputs,
qcfg_dict,
export_qcfg_json,
default_qcfg,
onnx_opset_ver,
):
continue
out_n = out_mp.graph.node.add()
out_n.CopyFrom(in_n)
if in_n.op_type == 'Scan' or in_n.op_type == 'Loop':
in_subgraph = NodeFactory.get_attribute(in_n, 'body')
out_subgraph = NodeFactory.get_attribute(out_n, 'body')
out_subgraph.ClearField('node')
if in_n.op_type == "Scan" or in_n.op_type == "Loop":
in_subgraph = NodeFactory.get_attribute(in_n, "body")
out_subgraph = NodeFactory.get_attribute(out_n, "body")
out_subgraph.ClearField("node")
scan_nf = NodeFactory(out_mp.graph, out_subgraph)
subgraph_quantized_inputs = {} if share_input_quantization else None # remember quantized inputs that might be able to share between MatMuls
subgraph_quantized_inputs = (
{} if share_input_quantization else None
) # remember quantized inputs that might be able to share between MatMuls
for in_sn in in_subgraph.node:
if in_sn.op_type == 'MatMul':
if quantize_matmul_2d_with_weight(in_sn, in_subgraph, scan_nf, converted_weights, subgraph_quantized_inputs, qcfg_dict, export_qcfg_json, default_qcfg, onnx_opset_ver):
if in_sn.op_type == "MatMul":
if quantize_matmul_2d_with_weight(
in_sn,
in_subgraph,
scan_nf,
converted_weights,
subgraph_quantized_inputs,
qcfg_dict,
export_qcfg_json,
default_qcfg,
onnx_opset_ver,
):
continue
if upgrade_op(scan_nf, in_sn):
@ -323,25 +433,55 @@ def convert_matmul_model(input_model, output_model, only_for_scan=False, share_i
onnx.save(out_mp, output_model)
if export_qcfg_json:
with open(qcfg_json, 'w') as f:
with open(qcfg_json, "w") as f:
f.write(json.dumps(qcfg_dict, indent=2))
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('--input', required=True, help='The input model file')
parser.add_argument('--output', required=True, help='The output model file')
parser.add_argument('--default_qcfg', help='The preset of quantization of <asymm|symm><qbits>_param<reserve_bit>_input<reserve_bit>', choices=['asymm8_param0_input1', 'symm16_param3_input3'], default='asymm8_param0_input1')
parser.add_argument('--qcfg_json', help='The quantization config json file for read or write.', default=None)
parser.add_argument('--export_qcfg_json', help='If set, write default quantization config to qcfg_json file.', action='store_true', default=False)
parser.add_argument('--only_for_scan', help='If set, apply quantization of MatMul only inside scan', action='store_true', default=False)
parser.add_argument('--share_input_quantization', help='If set, allow input quantization to be shared if the same input is used in multiple MatMul', action='store_true', default=False)
return parser.parse_args()
if __name__ == '__main__':
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument("--input", required=True, help="The input model file")
parser.add_argument("--output", required=True, help="The output model file")
parser.add_argument(
"--default_qcfg",
help="The preset of quantization of <asymm|symm><qbits>_param<reserve_bit>_input<reserve_bit>",
choices=["asymm8_param0_input1", "symm16_param3_input3"],
default="asymm8_param0_input1",
)
parser.add_argument("--qcfg_json", help="The quantization config json file for read or write.", default=None)
parser.add_argument(
"--export_qcfg_json",
help="If set, write default quantization config to qcfg_json file.",
action="store_true",
default=False,
)
parser.add_argument(
"--only_for_scan",
help="If set, apply quantization of MatMul only inside scan",
action="store_true",
default=False,
)
parser.add_argument(
"--share_input_quantization",
help="If set, allow input quantization to be shared if the same input is used in multiple MatMul",
action="store_true",
default=False,
)
return parser.parse_args()
if __name__ == "__main__":
args = parse_arguments()
print('input model: ' + args.input)
print('output model ' + args.output)
print('Quantize MatMul to MatMulInteger...')
print("input model: " + args.input)
print("output model " + args.output)
print("Quantize MatMul to MatMulInteger...")
assert not args.export_qcfg_json or args.qcfg_json, "--qcfg_json must be specified when --export_qcfg_json is used"
convert_matmul_model(args.input, args.output, args.only_for_scan, args.share_input_quantization, args.default_qcfg, args.qcfg_json, args.export_qcfg_json)
print('Done!')
convert_matmul_model(
args.input,
args.output,
args.only_for_scan,
args.share_input_quantization,
args.default_qcfg,
args.qcfg_json,
args.export_qcfg_json,
)
print("Done!")

Просмотреть файл

@ -1,21 +1,25 @@
import numpy as np
from numpy.testing import assert_array_equal
import onnxruntime as ort
import onnx
from onnx import helper
from onnxruntime.nuphar.node_factory import ensure_opset
from onnxruntime.nuphar.model_editor import convert_loop_to_scan_model
from onnxruntime.tools.symbolic_shape_infer import SymbolicShapeInference, get_shape_from_type_proto
import onnxruntime.tools.onnxruntime_test as ort_test
import argparse
import copy
import os
import numpy as np
import onnx
from numpy.testing import assert_array_equal
from onnx import helper
import onnxruntime as ort
import onnxruntime.tools.onnxruntime_test as ort_test
from onnxruntime.nuphar.model_editor import convert_loop_to_scan_model
from onnxruntime.nuphar.node_factory import ensure_opset
from onnxruntime.tools.symbolic_shape_infer import SymbolicShapeInference, get_shape_from_type_proto
def run_shape_inference(input_model, output_model):
in_mp = onnx.load(input_model)
in_mp = SymbolicShapeInference.infer_shapes(in_mp, auto_merge=True)
onnx.save(in_mp, output_model)
# use this function to expose a loop op's output as a model output.
# it helps to debug data issues when edited model outputs do not match the original model.
def extract_loop_outputs_as_model_outputs(model):
@ -29,13 +33,15 @@ def extract_loop_outputs_as_model_outputs(model):
break
for node in model.graph.node:
if node.op_type == 'Loop':
if node.op_type == "Loop":
# for debugging to make scan output as model graph output
set_op_output_as_model_output(node, model.graph)
def run_with_ort(model_path, symbolic_dims={}, feeds=None, ort_test_case_dir=None):
_, feeds, outputs = ort_test.run_model(model_path, symbolic_dims=symbolic_dims,
feeds=feeds, override_initializers=False)
_, feeds, outputs = ort_test.run_model(
model_path, symbolic_dims=symbolic_dims, feeds=feeds, override_initializers=False
)
if ort_test_case_dir:
model = onnx.load(model_path)
@ -44,61 +50,73 @@ def run_with_ort(model_path, symbolic_dims={}, feeds=None, ort_test_case_dir=Non
if not os.path.exists(ort_test_case_dir):
os.makedirs(ort_test_case_dir)
test_data_set_dir = os.path.join(ort_test_case_dir, 'test_data_set_0')
test_data_set_dir = os.path.join(ort_test_case_dir, "test_data_set_0")
if not os.path.exists(test_data_set_dir):
os.makedirs(test_data_set_dir)
onnx.save(model, os.path.join(ort_test_case_dir, 'model.onnx'))
onnx.save(model, os.path.join(ort_test_case_dir, "model.onnx"))
for i, (input_name, input) in enumerate(feeds.items()):
onnx.save_tensor(onnx.numpy_helper.from_array(input, input_name),
os.path.join(test_data_set_dir, 'input_{0}.pb'.format(i)))
onnx.save_tensor(
onnx.numpy_helper.from_array(input, input_name),
os.path.join(test_data_set_dir, "input_{0}.pb".format(i)),
)
output_names = [output.name for output in model.graph.output]
output_dict = dict(zip(output_names, outputs))
for i, (output_name, output) in enumerate(output_dict.items()):
onnx.save_tensor(onnx.numpy_helper.from_array(output, output_name),
os.path.join(test_data_set_dir, 'output_{0}.pb'.format(i)))
onnx.save_tensor(
onnx.numpy_helper.from_array(output, output_name),
os.path.join(test_data_set_dir, "output_{0}.pb".format(i)),
)
save_ort_test_case(ort_test_case_dir)
return feeds, outputs
def validate_with_ort(input_filename, output_filename, symbolic_dims={}):
feeds, loop_output = run_with_ort(input_filename, symbolic_dims=symbolic_dims)
_, scan_output = run_with_ort(output_filename, symbolic_dims=symbolic_dims, feeds=feeds)
assert(len(loop_output) == len(scan_output))
assert len(loop_output) == len(scan_output)
for index in range(0, len(loop_output)):
assert_array_equal(loop_output[index], scan_output[index])
def convert_loop_to_scan_and_validate(input_filename, output_filename, symbolic_dims={}):
convert_loop_to_scan_model(input_filename, output_filename)
validate_with_ort(input_filename, output_filename, symbolic_dims=symbolic_dims)
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('--tool', help='what to do',
choices=['run_shape_inference',
'run_with_ort',
'validate_with_ort',
'convert_loop_to_scan_and_validate'])
parser.add_argument(
"--tool",
help="what to do",
choices=["run_shape_inference", "run_with_ort", "validate_with_ort", "convert_loop_to_scan_and_validate"],
)
parser.add_argument('--input', help='The input model file', default=None)
parser.add_argument('--output', help='The output model file', default=None)
parser.add_argument('--symbolic_dims', default={}, type=lambda s: dict(x.split("=") for x in s.split(",")),
help='Comma separated name=value pairs for any symbolic dimensions in the model input. '
'e.g. --symbolic_dims batch=1,seqlen=5. '
'If not provided, the value of 1 will be used for all symbolic dimensions.')
parser.add_argument('--ort_test_case_dir', help='ort test case dir', default=None)
parser.add_argument("--input", help="The input model file", default=None)
parser.add_argument("--output", help="The output model file", default=None)
parser.add_argument(
"--symbolic_dims",
default={},
type=lambda s: dict(x.split("=") for x in s.split(",")),
help="Comma separated name=value pairs for any symbolic dimensions in the model input. "
"e.g. --symbolic_dims batch=1,seqlen=5. "
"If not provided, the value of 1 will be used for all symbolic dimensions.",
)
parser.add_argument("--ort_test_case_dir", help="ort test case dir", default=None)
return parser.parse_args()
if __name__ == '__main__':
if __name__ == "__main__":
args = parse_arguments()
if args.tool == 'run_shape_inference':
if args.tool == "run_shape_inference":
run_shape_inference(args.input, args.output)
elif args.tool == 'run_with_ort':
elif args.tool == "run_with_ort":
run_with_ort(args.input, symbolic_dims=args.symbolic_dims, ort_test_case_dir=args.ort_test_case_dir)
elif args.tool == 'validate_with_ort':
elif args.tool == "validate_with_ort":
validate_with_ort(args.input, args.output, symbolic_dims=args.symbolic_dims)
elif args.tool == 'convert_loop_to_scan_and_validate':
elif args.tool == "convert_loop_to_scan_and_validate":
convert_loop_to_scan_and_validate(args.input, args.output, symbolic_dims=args.symbolic_dims)

Просмотреть файл

@ -1,19 +1,22 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import json
import re
# -*- coding: UTF-8 -*-
from enum import Enum
import json
import numpy as np
import onnx
from onnx import helper, numpy_helper
import re
class NodeFactory:
node_count_ = 0
const_count_ = 0
def __init__(self, main_graph, sub_graph=None, prefix=''):
def __init__(self, main_graph, sub_graph=None, prefix=""):
self.graph_ = sub_graph if sub_graph else main_graph
self.main_graph_ = main_graph
self.name_prefix_ = prefix
@ -91,15 +94,17 @@ class NodeFactory:
value_info.CopyFrom(helper.make_tensor_value_info(name, data_type, shape))
def make_initializer(self, ndarray, name='', in_main_graph=False):
def make_initializer(self, ndarray, name="", in_main_graph=False):
new_initializer = (self.main_graph_ if in_main_graph else self.graph_).initializer.add()
new_name = name
if len(new_name) == 0:
already_existed = True
while already_existed:
new_name = self.name_prefix_ + '_Const_' + str(NodeFactory.const_count_)
new_name = self.name_prefix_ + "_Const_" + str(NodeFactory.const_count_)
NodeFactory.const_count_ = NodeFactory.const_count_ + 1
already_existed = new_name in [i.name for i in list(self.main_graph_.initializer) + list(self.graph_.initializer)]
already_existed = new_name in [
i.name for i in list(self.main_graph_.initializer) + list(self.graph_.initializer)
]
new_initializer.CopyFrom(numpy_helper.from_array(ndarray, new_name))
return new_initializer
@ -118,12 +123,12 @@ class NodeFactory:
new_initializer = self.make_initializer(i)
input_names.append(new_initializer.name)
else:
assert False # unexpected type in input
assert False # unexpected type in input
if not node:
node = self.graph_.node.add()
name = self.name_prefix_ + op_type + '_' + str(NodeFactory.node_count_)
name = self.name_prefix_ + op_type + "_" + str(NodeFactory.node_count_)
NodeFactory.node_count_ = NodeFactory.node_count_ + 1
if not output_names:
@ -134,9 +139,9 @@ class NodeFactory:
# Squeeze/Unsqueeze/ReduceSum changed axes to input[1] in opset 13
def make_node_with_axes(self, op_type, input, axes, onnx_opset_ver, attributes={}, output_names=None):
assert op_type in ['Squeeze', 'Unsqueeze', 'ReduceSum']
assert op_type in ["Squeeze", "Unsqueeze", "ReduceSum"]
if onnx_opset_ver < 13:
attributes.update({'axes':axes})
attributes.update({"axes": axes})
return self.make_node(op_type, input, attributes=attributes, output_names=output_names)
else:
axes = np.asarray(axes).astype(np.int64)
@ -149,13 +154,14 @@ class NodeFactory:
# Split changed split to input[1] in opset 13
def make_split_node(self, input, split, onnx_opset_ver, attributes, output_names=None):
if onnx_opset_ver < 13:
attributes.update({'split':split})
return self.make_node('Split', input, attributes=attributes, output_names=output_names)
attributes.update({"split": split})
return self.make_node("Split", input, attributes=attributes, output_names=output_names)
else:
split = np.asarray(split).astype(np.int64)
return self.make_node('Split', [input, split], attributes=attributes, output_names=output_names)
return self.make_node("Split", [input, split], attributes=attributes, output_names=output_names)
def ensure_opset(mp, ver, domains=['onnx', '']):
def ensure_opset(mp, ver, domains=["onnx", ""]):
if type(domains) == str:
domains = [domains]
assert type(domains) == list

Просмотреть файл

@ -4,54 +4,107 @@
# -*- coding: UTF-8 -*-
import argparse
import multiprocessing
import numpy as np
import onnx
# use lines below when building ONNX Runtime from source with --enable_pybind
#import sys
#sys.path.append(r'X:\Repos\Lotus\build\Windows\Release\Release')
#sys.path.append('/repos/Lotus/build/Linux/Release')
import onnxruntime
from onnx import helper, numpy_helper
from onnx import shape_inference
from onnx import IR_VERSION
import os
from timeit import default_timer as timer
def generate_model(rnn_type, input_dim, hidden_dim, bidirectional, layers, model_name, batch_one=True, has_seq_len=False, onnx_opset_ver=7):
import numpy as np
import onnx
from onnx import IR_VERSION, helper, numpy_helper, shape_inference
# use lines below when building ONNX Runtime from source with --enable_pybind
# import sys
# sys.path.append(r'X:\Repos\Lotus\build\Windows\Release\Release')
# sys.path.append('/repos/Lotus/build/Linux/Release')
import onnxruntime
def generate_model(
rnn_type,
input_dim,
hidden_dim,
bidirectional,
layers,
model_name,
batch_one=True,
has_seq_len=False,
onnx_opset_ver=7,
):
model = onnx.ModelProto()
model.ir_version = IR_VERSION
opset = model.opset_import.add()
opset.domain == 'onnx'
opset.domain == "onnx"
opset.version = onnx_opset_ver
num_directions = 2 if bidirectional else 1
X = 'input'
model.graph.input.add().CopyFrom(helper.make_tensor_value_info(X, onnx.TensorProto.FLOAT, ['s', 1 if batch_one else 'b', input_dim]))
model.graph.initializer.add().CopyFrom(numpy_helper.from_array(np.asarray([0, 0, -1], dtype=np.int64), 'shape'))
X = "input"
model.graph.input.add().CopyFrom(
helper.make_tensor_value_info(X, onnx.TensorProto.FLOAT, ["s", 1 if batch_one else "b", input_dim])
)
model.graph.initializer.add().CopyFrom(numpy_helper.from_array(np.asarray([0, 0, -1], dtype=np.int64), "shape"))
if has_seq_len:
seq_len = 'seq_len'
model.graph.input.add().CopyFrom(helper.make_tensor_value_info(seq_len, onnx.TensorProto.INT32, [1 if batch_one else 'b',]))
seq_len = "seq_len"
model.graph.input.add().CopyFrom(
helper.make_tensor_value_info(
seq_len,
onnx.TensorProto.INT32,
[
1 if batch_one else "b",
],
)
)
gates = {'lstm':4, 'gru':3, 'rnn':1}[rnn_type]
gates = {"lstm": 4, "gru": 3, "rnn": 1}[rnn_type]
for i in range(layers):
layer_input_dim = (input_dim if i == 0 else hidden_dim * num_directions)
model.graph.initializer.add().CopyFrom(numpy_helper.from_array(np.random.rand(num_directions, gates*hidden_dim, layer_input_dim).astype(np.float32), 'W'+str(i)))
model.graph.initializer.add().CopyFrom(numpy_helper.from_array(np.random.rand(num_directions, gates*hidden_dim, hidden_dim).astype(np.float32), 'R'+str(i)))
model.graph.initializer.add().CopyFrom(numpy_helper.from_array(np.random.rand(num_directions, 2*gates*hidden_dim).astype(np.float32), 'B'+str(i)))
layer_inputs = [X, 'W'+str(i), 'R'+str(i), 'B'+str(i)]
layer_input_dim = input_dim if i == 0 else hidden_dim * num_directions
model.graph.initializer.add().CopyFrom(
numpy_helper.from_array(
np.random.rand(num_directions, gates * hidden_dim, layer_input_dim).astype(np.float32), "W" + str(i)
)
)
model.graph.initializer.add().CopyFrom(
numpy_helper.from_array(
np.random.rand(num_directions, gates * hidden_dim, hidden_dim).astype(np.float32), "R" + str(i)
)
)
model.graph.initializer.add().CopyFrom(
numpy_helper.from_array(
np.random.rand(num_directions, 2 * gates * hidden_dim).astype(np.float32), "B" + str(i)
)
)
layer_inputs = [X, "W" + str(i), "R" + str(i), "B" + str(i)]
if has_seq_len:
layer_inputs += [seq_len]
layer_outputs = ['layer_output_'+str(i)]
model.graph.node.add().CopyFrom(helper.make_node(rnn_type.upper(), layer_inputs, layer_outputs, rnn_type+str(i), hidden_size=hidden_dim, direction='bidirectional' if bidirectional else 'forward'))
model.graph.node.add().CopyFrom(helper.make_node('Transpose', layer_outputs, ['transposed_output_'+str(i)], 'transpose'+str(i), perm=[0,2,1,3]))
model.graph.node.add().CopyFrom(helper.make_node('Reshape', ['transposed_output_'+str(i), 'shape'], ['reshaped_output_'+str(i)], 'reshape'+str(i)))
X = 'reshaped_output_'+str(i)
model.graph.output.add().CopyFrom(helper.make_tensor_value_info(X, onnx.TensorProto.FLOAT, ['s', 'b', hidden_dim * num_directions]))
layer_outputs = ["layer_output_" + str(i)]
model.graph.node.add().CopyFrom(
helper.make_node(
rnn_type.upper(),
layer_inputs,
layer_outputs,
rnn_type + str(i),
hidden_size=hidden_dim,
direction="bidirectional" if bidirectional else "forward",
)
)
model.graph.node.add().CopyFrom(
helper.make_node(
"Transpose", layer_outputs, ["transposed_output_" + str(i)], "transpose" + str(i), perm=[0, 2, 1, 3]
)
)
model.graph.node.add().CopyFrom(
helper.make_node(
"Reshape", ["transposed_output_" + str(i), "shape"], ["reshaped_output_" + str(i)], "reshape" + str(i)
)
)
X = "reshaped_output_" + str(i)
model.graph.output.add().CopyFrom(
helper.make_tensor_value_info(X, onnx.TensorProto.FLOAT, ["s", "b", hidden_dim * num_directions])
)
model = shape_inference.infer_shapes(model)
onnx.save(model, model_name)
def perf_run(sess, feeds, min_counts=5, min_duration_seconds=10):
# warm up
sess.run([], feeds)
@ -70,19 +123,23 @@ def perf_run(sess, feeds, min_counts=5, min_duration_seconds=10):
run = False
return count, (end - start), per_iter_cost
def top_n_avg(per_iter_cost, n):
# following the perf test methodology in [timeit](https://docs.python.org/3/library/timeit.html#timeit.Timer.repeat)
per_iter_cost.sort()
return sum(per_iter_cost[:n]) * 1000 / n
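# Worked example with hypothetical timings: for per-iteration costs (in seconds)
# [0.003, 0.001, 0.002, 0.005] and n=2, the two fastest iterations are 0.001 and
# 0.002, so top_n_avg returns (0.001 + 0.002) * 1000 / 2 = 1.5 ms.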
def get_num_threads():
return os.environ['OMP_NUM_THREADS'] if 'OMP_NUM_THREADS' in os.environ else None
return os.environ["OMP_NUM_THREADS"] if "OMP_NUM_THREADS" in os.environ else None
def set_num_threads(num_threads):
if num_threads:
os.environ['OMP_NUM_THREADS'] = str(num_threads)
os.environ["OMP_NUM_THREADS"] = str(num_threads)
else:
del os.environ['OMP_NUM_THREADS']
del os.environ["OMP_NUM_THREADS"]
class ScopedSetNumThreads:
def __init__(self, num_threads):
@ -95,117 +152,222 @@ class ScopedSetNumThreads:
def __exit__(self, type, value, tb):
set_num_threads(self.saved_num_threads_)
def perf_test(rnn_type, num_threads, input_dim, hidden_dim, bidirectional, layers, seq_len, batch_size, top_n=5, min_duration_seconds=10):
model_name = '{}_i{}_h{}_{}_l{}_{}.onnx'.format(rnn_type, input_dim, hidden_dim,
'bi' if bidirectional else '',
layers,
'batched' if batch_size > 1 else 'no_batch')
def perf_test(
rnn_type,
num_threads,
input_dim,
hidden_dim,
bidirectional,
layers,
seq_len,
batch_size,
top_n=5,
min_duration_seconds=10,
):
model_name = "{}_i{}_h{}_{}_l{}_{}.onnx".format(
rnn_type,
input_dim,
hidden_dim,
"bi" if bidirectional else "",
layers,
"batched" if batch_size > 1 else "no_batch",
)
generate_model(rnn_type, input_dim, hidden_dim, bidirectional, layers, model_name, batch_size == 1)
feeds = {'input':np.random.rand(seq_len, batch_size, input_dim).astype(np.float32)}
feeds = {"input": np.random.rand(seq_len, batch_size, input_dim).astype(np.float32)}
# run original model in CPU provider, using all threads
# there are some local thread pool inside LSTM/GRU CPU kernel
# that cannot be controlled by OMP or intra_op_num_threads
sess = onnxruntime.InferenceSession(model_name, providers=['CPUExecutionProvider'])
sess = onnxruntime.InferenceSession(model_name, providers=["CPUExecutionProvider"])
count, duration, per_iter_cost = perf_run(sess, feeds, min_counts=top_n, min_duration_seconds=min_duration_seconds)
avg_rnn = top_n_avg(per_iter_cost, top_n)
print('perf_rnn (with default threads) {}: run for {} iterations, top {} avg {:.3f} ms'.format(model_name, count, top_n, avg_rnn))
print(
"perf_rnn (with default threads) {}: run for {} iterations, top {} avg {:.3f} ms".format(
model_name, count, top_n, avg_rnn
)
)
# run converted model in Nuphar, using specified threads
with ScopedSetNumThreads(num_threads) as scoped_set_num_threads:
# run Scan model converted from original in Nuphar
from .model_editor import convert_to_scan_model
from ..tools.symbolic_shape_infer import SymbolicShapeInference
scan_model_name = os.path.splitext(model_name)[0] + '_scan.onnx'
from .model_editor import convert_to_scan_model
scan_model_name = os.path.splitext(model_name)[0] + "_scan.onnx"
convert_to_scan_model(model_name, scan_model_name)
# note that symbolic shape inference is needed because model has symbolic batch dim, thus init_state is ConstantOfShape
onnx.save(SymbolicShapeInference.infer_shapes(onnx.load(scan_model_name)), scan_model_name)
sess = onnxruntime.InferenceSession(scan_model_name, providers=onnxruntime.get_available_providers())
count, duration, per_iter_cost = perf_run(sess, feeds, min_counts=top_n, min_duration_seconds=min_duration_seconds)
count, duration, per_iter_cost = perf_run(
sess, feeds, min_counts=top_n, min_duration_seconds=min_duration_seconds
)
avg_scan = top_n_avg(per_iter_cost, top_n)
print('perf_scan (with {} threads) {}: run for {} iterations, top {} avg {:.3f} ms'.format(num_threads, scan_model_name, count, top_n, avg_scan))
print(
"perf_scan (with {} threads) {}: run for {} iterations, top {} avg {:.3f} ms".format(
num_threads, scan_model_name, count, top_n, avg_scan
)
)
# quantize Scan model to int8 and run in Nuphar
from .model_quantizer import convert_matmul_model
int8_model_name = os.path.splitext(model_name)[0] + '_int8.onnx'
int8_model_name = os.path.splitext(model_name)[0] + "_int8.onnx"
convert_matmul_model(scan_model_name, int8_model_name)
onnx.save(SymbolicShapeInference.infer_shapes(onnx.load(int8_model_name)), int8_model_name)
sess = onnxruntime.InferenceSession(int8_model_name, providers=onnxruntime.get_available_providers())
count, duration, per_iter_cost = perf_run(sess, feeds, min_counts=top_n, min_duration_seconds=min_duration_seconds)
count, duration, per_iter_cost = perf_run(
sess, feeds, min_counts=top_n, min_duration_seconds=min_duration_seconds
)
avg_int8 = top_n_avg(per_iter_cost, top_n)
print('perf_int8 (with {} threads) {}: run for {} iterations, top {} avg {:.3f} ms'.format(num_threads, int8_model_name, count, top_n, avg_int8))
print(
"perf_int8 (with {} threads) {}: run for {} iterations, top {} avg {:.3f} ms".format(
num_threads, int8_model_name, count, top_n, avg_int8
)
)
return avg_rnn, avg_scan, avg_int8
def perf_test_auto(auto_file):
# generate reports in csv format
with open('single_thread_' + auto_file + '.csv', 'w') as f:
print('single thread test: unidirection 4-layer lstm/gru/rnn with input_dim=128 batch_size=1', file=f)
print('rnn_type,hidden,seq_len,avg_rnn,avg_nuphar_fp,avg_nuphar_int8,speedup_fp,speedup_int8', file=f)
for rnn_type in ['lstm', 'gru', 'rnn']:
with open("single_thread_" + auto_file + ".csv", "w") as f:
print("single thread test: unidirection 4-layer lstm/gru/rnn with input_dim=128 batch_size=1", file=f)
print("rnn_type,hidden,seq_len,avg_rnn,avg_nuphar_fp,avg_nuphar_int8,speedup_fp,speedup_int8", file=f)
for rnn_type in ["lstm", "gru", "rnn"]:
for hidden_dim in [32, 128, 1024, 2048]:
for seq_len in [1, 16, 32, 64]:
avg_rnn, avg_scan, avg_int8 = perf_test(rnn_type, 1, 128, hidden_dim, False, 4, seq_len, 1)
print('{},{},{},{},{},{},{},{}'.format(rnn_type,hidden_dim, seq_len, avg_rnn, avg_scan, avg_int8, avg_rnn/avg_scan, avg_rnn/avg_int8), file=f)
print(
"{},{},{},{},{},{},{},{}".format(
rnn_type,
hidden_dim,
seq_len,
avg_rnn,
avg_scan,
avg_int8,
avg_rnn / avg_scan,
avg_rnn / avg_int8,
),
file=f,
)
with open('multi_thread_' + auto_file + '.csv', 'w') as f:
print('multi-thread test: unidirection 4-layer lstm/gru/rnn with input_dim=128 seq_len=32 batch_size=1', file=f)
print('rnn_type,threads,hidden,avg_rnn,avg_nuphar_fp,avg_nuphar_int8,speedup_fp,speedup_int8', file=f)
for rnn_type in ['lstm', 'gru', 'rnn']:
with open("multi_thread_" + auto_file + ".csv", "w") as f:
print("multi-thread test: unidirection 4-layer lstm/gru/rnn with input_dim=128 seq_len=32 batch_size=1", file=f)
print("rnn_type,threads,hidden,avg_rnn,avg_nuphar_fp,avg_nuphar_int8,speedup_fp,speedup_int8", file=f)
for rnn_type in ["lstm", "gru", "rnn"]:
for num_threads in [1, 2, 4]:
for hidden_dim in [32, 128, 1024, 2048]:
avg_rnn, avg_scan, avg_int8 = perf_test(rnn_type, num_threads, 128, hidden_dim, False, 4, 32, 1)
print('{},{},{},{},{},{},{},{}'.format(rnn_type,num_threads, hidden_dim, avg_rnn, avg_scan, avg_int8, avg_rnn/avg_scan, avg_rnn/avg_int8), file=f)
avg_rnn, avg_scan, avg_int8 = perf_test(
rnn_type, num_threads, 128, hidden_dim, False, 4, 32, 1
)
print(
"{},{},{},{},{},{},{},{}".format(
rnn_type,
num_threads,
hidden_dim,
avg_rnn,
avg_scan,
avg_int8,
avg_rnn / avg_scan,
avg_rnn / avg_int8,
),
file=f,
)
with open('batch_single_thread_' + auto_file + '.csv', 'w') as f:
print('single thread test: unidirection 4-layer lstm/gru/rnn with input_dim=128 hidden_dim=1024', file=f)
print('rnn_type,seq_len,batch_size,avg_rnn,avg_nuphar_fp,avg_nuphar_int8,speedup_fp,speedup_int8', file=f)
for rnn_type in ['lstm', 'gru', 'rnn']:
with open("batch_single_thread_" + auto_file + ".csv", "w") as f:
print("single thread test: unidirection 4-layer lstm/gru/rnn with input_dim=128 hidden_dim=1024", file=f)
print("rnn_type,seq_len,batch_size,avg_rnn,avg_nuphar_fp,avg_nuphar_int8,speedup_fp,speedup_int8", file=f)
for rnn_type in ["lstm", "gru", "rnn"]:
for seq_len in [1, 16, 32, 64]:
for batch_size in [1, 4, 16, 64]:
avg_rnn, avg_scan, avg_int8 = perf_test(rnn_type, 1, 128, 1024, False, 4, seq_len, batch_size)
print('{},{},{},{},{},{},{},{}'.format(rnn_type,seq_len, batch_size, avg_rnn, avg_scan, avg_int8, avg_rnn/avg_scan, avg_rnn/avg_int8), file=f)
print(
"{},{},{},{},{},{},{},{}".format(
rnn_type,
seq_len,
batch_size,
avg_rnn,
avg_scan,
avg_int8,
avg_rnn / avg_scan,
avg_rnn / avg_int8,
),
file=f,
)
with open('batch_multi_thread_' + auto_file + '.csv', 'w') as f:
print('batch thread test: unidirection 4-layer lstm/gru/rnn with input_dim=128 hidden_dim=1024 seq_len=32', file=f)
print('rnn_type,threads,batch_size,avg_rnn,avg_nuphar_fp,avg_nuphar_int8,speedup_fp,speedup_int8', file=f)
for rnn_type in ['lstm', 'gru', 'rnn']:
with open("batch_multi_thread_" + auto_file + ".csv", "w") as f:
print(
"batch thread test: unidirection 4-layer lstm/gru/rnn with input_dim=128 hidden_dim=1024 seq_len=32", file=f
)
print("rnn_type,threads,batch_size,avg_rnn,avg_nuphar_fp,avg_nuphar_int8,speedup_fp,speedup_int8", file=f)
for rnn_type in ["lstm", "gru", "rnn"]:
for num_threads in [1, 2, 4]:
for batch_size in [1, 4, 16, 64]:
avg_rnn, avg_scan, avg_int8 = perf_test(rnn_type, num_threads, 128, 1024, False, 4, 32, batch_size)
print('{},{},{},{},{},{},{},{}'.format(rnn_type,num_threads, batch_size, avg_rnn, avg_scan, avg_int8, avg_rnn/avg_scan, avg_rnn/avg_int8), file=f)
print(
"{},{},{},{},{},{},{},{}".format(
rnn_type,
num_threads,
batch_size,
avg_rnn,
avg_scan,
avg_int8,
avg_rnn / avg_scan,
avg_rnn / avg_int8,
),
file=f,
)
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('--rnn_type', help='Type of rnn, one of lstm/gru/rnn', choices=['lstm', 'gru', 'rnn'], default='lstm')
parser.add_argument('--input_dim', help='Input size of lstm/gru/rnn', type=int, default=128)
parser.add_argument('--hidden_dim', help='Hidden size of lstm/gru/rnn', type=int, default=1024)
parser.add_argument('--bidirectional', help='Use bidirectional', action='store_true', default=False)
parser.add_argument('--layers', help='Number of layers', type=int, default=4)
parser.add_argument('--seq_len', help='Sequence length', type=int, default=32)
parser.add_argument('--batch_size', help='Batch size', type=int, default=1)
parser.add_argument('--num_threads', help='Number of MKL threads', type=int, default=multiprocessing.cpu_count())
parser.add_argument('--top_n', help='Fastest N samples to compute average time', type=int, default=5)
parser.add_argument('--auto', help='Auto_name (usually CPU type) for auto test to generate (batch_)single|multithread_<auto_name>.csv files', default=None)
return parser.parse_args()
parser = argparse.ArgumentParser()
parser.add_argument(
"--rnn_type", help="Type of rnn, one of lstm/gru/rnn", choices=["lstm", "gru", "rnn"], default="lstm"
)
parser.add_argument("--input_dim", help="Input size of lstm/gru/rnn", type=int, default=128)
parser.add_argument("--hidden_dim", help="Hidden size of lstm/gru/rnn", type=int, default=1024)
parser.add_argument("--bidirectional", help="Use bidirectional", action="store_true", default=False)
parser.add_argument("--layers", help="Number of layers", type=int, default=4)
parser.add_argument("--seq_len", help="Sequence length", type=int, default=32)
parser.add_argument("--batch_size", help="Batch size", type=int, default=1)
parser.add_argument("--num_threads", help="Number of MKL threads", type=int, default=multiprocessing.cpu_count())
parser.add_argument("--top_n", help="Fastest N samples to compute average time", type=int, default=5)
parser.add_argument(
"--auto",
help="Auto_name (usually CPU type) for auto test to generate (batch_)single|multithread_<auto_name>.csv files",
default=None,
)
return parser.parse_args()
if __name__ == '__main__':
if __name__ == "__main__":
args = parse_arguments()
if args.auto:
perf_test_auto(args.auto)
else:
print('Testing model: ', args.rnn_type.upper())
print(' input_dim: ', args.input_dim)
print(' hidden_dim: ', args.hidden_dim)
print("Testing model: ", args.rnn_type.upper())
print(" input_dim: ", args.input_dim)
print(" hidden_dim: ", args.hidden_dim)
if args.bidirectional:
print(' bidirectional')
print(' layers: ', args.layers)
print(" bidirectional")
print(" layers: ", args.layers)
cpu_count = multiprocessing.cpu_count()
num_threads = max(min(args.num_threads, cpu_count), 1)
print('Test setup')
print(' cpu_count: ', cpu_count)
print(' num_threads: ', num_threads)
print(' seq_len: ', args.seq_len)
print(' batch_size: ', args.batch_size)
perf_test(args.rnn_type, num_threads, args.input_dim, args.hidden_dim, args.bidirectional, args.layers, args.seq_len, args.batch_size, args.top_n)
print("Test setup")
print(" cpu_count: ", cpu_count)
print(" num_threads: ", num_threads)
print(" seq_len: ", args.seq_len)
print(" batch_size: ", args.batch_size)
perf_test(
args.rnn_type,
num_threads,
args.input_dim,
args.hidden_dim,
args.bidirectional,
args.layers,
args.seq_len,
args.batch_size,
args.top_n,
)

Просмотреть файл

@ -5,15 +5,15 @@
"""
Implements ONNX's backend API.
"""
from onnx import ModelProto
from onnx import helper
from onnx import version
from onnx.checker import check_model
from onnx.backend.base import Backend
from onnxruntime import InferenceSession, SessionOptions, get_device, get_available_providers
from onnxruntime.backend.backend_rep import OnnxRuntimeBackendRep
import unittest
import os
import unittest
from onnx import ModelProto, helper, version
from onnx.backend.base import Backend
from onnx.checker import check_model
from onnxruntime import InferenceSession, SessionOptions, get_available_providers, get_device
from onnxruntime.backend.backend_rep import OnnxRuntimeBackendRep
class OnnxRuntimeBackend(Backend):
@ -28,7 +28,7 @@ class OnnxRuntimeBackend(Backend):
Note: This is not the official Python API.
""" # noqa: E501
allowReleasedOpsetsOnly = bool(os.getenv('ALLOW_RELEASED_ONNX_OPSET_ONLY', '1') == '1')
allowReleasedOpsetsOnly = bool(os.getenv("ALLOW_RELEASED_ONNX_OPSET_ONLY", "1") == "1")
@classmethod
def is_compatible(cls, model, device=None, **kwargs):
@ -55,22 +55,26 @@ class OnnxRuntimeBackend(Backend):
"""
if cls.allowReleasedOpsetsOnly:
for opset in model.opset_import:
domain = opset.domain if opset.domain else 'ai.onnx'
domain = opset.domain if opset.domain else "ai.onnx"
try:
key = (domain, opset.version)
if not (key in helper.OP_SET_ID_VERSION_MAP):
error_message = ("Skipping this test as only released onnx opsets are supported."
"To run this test set env variable ALLOW_RELEASED_ONNX_OPSET_ONLY to 0."
" Got Domain '{0}' version '{1}'.".format(domain, opset.version))
error_message = (
"Skipping this test as only released onnx opsets are supported."
"To run this test set env variable ALLOW_RELEASED_ONNX_OPSET_ONLY to 0."
" Got Domain '{0}' version '{1}'.".format(domain, opset.version)
)
return False, error_message
except AttributeError:
# for some CI pipelines accessing helper.OP_SET_ID_VERSION_MAP
# is generating attribute error. TODO investigate the pipelines to
# fix this error. Falling back to a simple version check when this error is encountered
if (domain == 'ai.onnx' and opset.version > 12) or (domain == 'ai.onnx.ml' and opset.version > 2):
error_message = ("Skipping this test as only released onnx opsets are supported."
"To run this test set env variable ALLOW_RELEASED_ONNX_OPSET_ONLY to 0."
" Got Domain '{0}' version '{1}'.".format(domain, opset.version))
if (domain == "ai.onnx" and opset.version > 12) or (domain == "ai.ommx.ml" and opset.version > 2):
error_message = (
"Skipping this test as only released onnx opsets are supported."
"To run this test set env variable ALLOW_RELEASED_ONNX_OPSET_ONLY to 0."
" Got Domain '{0}' version '{1}'.".format(domain, opset.version)
)
return False, error_message
return True, ""
@ -80,8 +84,8 @@ class OnnxRuntimeBackend(Backend):
Check whether the backend is compiled with particular device support.
In particular it's used in the testing suite.
"""
if device == 'CUDA':
device = 'GPU'
if device == "CUDA":
device = "GPU"
return device in get_device()
@classmethod
@ -108,7 +112,7 @@ class OnnxRuntimeBackend(Backend):
if hasattr(options, k):
setattr(options, k, v)
excluded_providers = os.getenv('ORT_ONNX_BACKEND_EXCLUDE_PROVIDERS', default="").split(',')
excluded_providers = os.getenv("ORT_ONNX_BACKEND_EXCLUDE_PROVIDERS", default="").split(",")
providers = [x for x in get_available_providers() if (x not in excluded_providers)]
inf = InferenceSession(model, sess_options=options, providers=providers)
@ -156,10 +160,10 @@ class OnnxRuntimeBackend(Backend):
@classmethod
def run_node(cls, node, inputs, device=None, outputs_info=None, **kwargs):
'''
"""
This method is not implemented as it is much more efficient
to run a whole model than every node independently.
'''
"""
raise NotImplementedError("It is much more efficient to run a whole model than every node independently.")

Просмотреть файл

@ -5,10 +5,12 @@
"""
Implements ONNX's backend API.
"""
from onnxruntime import RunOptions
from onnx.backend.base import BackendRep
from typing import Any, Tuple
from onnx.backend.base import BackendRep
from onnxruntime import RunOptions
class OnnxRuntimeBackendRep(BackendRep):
"""

Просмотреть файл

@ -2,9 +2,9 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
import warnings
import ctypes
import sys
import warnings
def find_cudart_versions(build_env=False, build_cuda_version=None):
@ -16,16 +16,16 @@ def find_cudart_versions(build_env=False, build_cuda_version=None):
# for the above reason, we need to find all versions in the environment and
# only give warnings if the expected CUDA version is not found.
# in the onnxruntime build environment, we expect only one CUDA version.
if not sys.platform.startswith('linux'):
warnings.warn('find_cudart_versions only works on Linux')
if not sys.platform.startswith("linux"):
warnings.warn("find_cudart_versions only works on Linux")
return None
cudart_possible_versions = {None, build_cuda_version}
def get_cudart_version(find_cudart_version=None):
cudart_lib_filename = 'libcudart.so'
cudart_lib_filename = "libcudart.so"
if find_cudart_version:
cudart_lib_filename = cudart_lib_filename + '.' + find_cudart_version
cudart_lib_filename = cudart_lib_filename + "." + find_cudart_version
try:
cudart = ctypes.CDLL(cudart_lib_filename)
@ -35,14 +35,13 @@ def find_cudart_versions(build_env=False, build_cuda_version=None):
status = cudart.cudaRuntimeGetVersion(ctypes.byref(version))
if status != 0:
return None
except: # noqa
except: # noqa
return None
return version.value
# use set to avoid duplications
cudart_found_versions = {
get_cudart_version(cudart_version) for cudart_version in cudart_possible_versions}
cudart_found_versions = {get_cudart_version(cudart_version) for cudart_version in cudart_possible_versions}
# convert to list and remove None
return [ver for ver in cudart_found_versions if ver]
@ -50,27 +49,42 @@ def find_cudart_versions(build_env=False, build_cuda_version=None):
def find_cudnn_supported_cuda_versions(build_env=False):
# comments in get_cudart_version apply here
if not sys.platform.startswith('linux'):
warnings.warn('find_cudnn_versions only works on Linux')
if not sys.platform.startswith("linux"):
warnings.warn("find_cudnn_versions only works on Linux")
cudnn_possible_versions = {None}
if not build_env:
# if not in a build environment, there may be more than one installed cudnn.
# https://developer.nvidia.com/rdp/cudnn-archive to include all that may support Cuda 10+.
cudnn_possible_versions.update({
'8.2',
'8.1.1', '8.1.0',
'8.0.5', '8.0.4', '8.0.3', '8.0.2', '8.0.1',
'7.6.5', '7.6.4', '7.6.3', '7.6.2', '7.6.1', '7.6.0',
'7.5.1', '7.5.0',
'7.4.2', '7.4.1',
'7.3.1', '7.3.0',
})
cudnn_possible_versions.update(
{
"8.2",
"8.1.1",
"8.1.0",
"8.0.5",
"8.0.4",
"8.0.3",
"8.0.2",
"8.0.1",
"7.6.5",
"7.6.4",
"7.6.3",
"7.6.2",
"7.6.1",
"7.6.0",
"7.5.1",
"7.5.0",
"7.4.2",
"7.4.1",
"7.3.1",
"7.3.0",
}
)
def get_cudnn_supported_cuda_version(find_cudnn_version=None):
cudnn_lib_filename = 'libcudnn.so'
cudnn_lib_filename = "libcudnn.so"
if find_cudnn_version:
cudnn_lib_filename = cudnn_lib_filename + '.' + find_cudnn_version
cudnn_lib_filename = cudnn_lib_filename + "." + find_cudnn_version
# in cudnn.h cudnn version are calculated as:
# #define CUDNN_VERSION (CUDNN_MAJOR * 1000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL)
@ -79,7 +93,7 @@ def find_cudnn_supported_cuda_versions(build_env=False):
# cudnn_ver = cudnn.cudnnGetVersion()
cuda_ver = cudnn.cudnnGetCudartVersion()
return cuda_ver
except: # noqa
except: # noqa
return None
# use set to avoid duplications

Просмотреть файл

@ -12,14 +12,14 @@ from onnxruntime.capi import _pybind_state as C
def get_ort_device_type(device):
device_type = device if type(device) is str else device.type.lower()
if device_type == 'cuda':
if device_type == "cuda":
return C.OrtDevice.cuda()
elif device_type == 'cpu':
elif device_type == "cpu":
return C.OrtDevice.cpu()
elif device_type == 'ort':
elif device_type == "ort":
return C.get_ort_device(device.index).device_type()
else:
raise Exception('Unsupported device type: ' + device_type)
raise Exception("Unsupported device type: " + device_type)
def check_and_normalize_provider_args(providers, provider_options, available_provider_names):
@ -52,8 +52,10 @@ def check_and_normalize_provider_args(providers, provider_options, available_pro
def set_provider_options(name, options):
if name not in available_provider_names:
warnings.warn("Specified provider '{}' is not in available provider names."
"Available providers: '{}'".format(name, ", ".join(available_provider_names)))
warnings.warn(
"Specified provider '{}' is not in available provider names."
"Available providers: '{}'".format(name, ", ".join(available_provider_names))
)
if name in provider_name_to_options:
warnings.warn("Duplicate provider '{}' encountered, ignoring.".format(name))
@ -85,8 +87,12 @@ def check_and_normalize_provider_args(providers, provider_options, available_pro
for provider in providers:
if isinstance(provider, str):
set_provider_options(provider, dict())
elif isinstance(provider, tuple) and len(provider) == 2 and \
isinstance(provider[0], str) and isinstance(provider[1], dict):
elif (
isinstance(provider, tuple)
and len(provider) == 2
and isinstance(provider[0], str)
and isinstance(provider[1], dict)
):
set_provider_options(provider[0], provider[1])
else:
raise ValueError("'providers' values must be either strings or (string, dict) tuples.")
@ -98,6 +104,7 @@ class Session:
"""
This is the main class used to run a model.
"""
def __init__(self):
# self._sess is managed by the derived class and relies on bindings from C.InferenceSession
@ -216,6 +223,7 @@ class Session:
sess.run([output_name], {input_name: x})
"""
def invoke(sess, output_names, input_dict_ort_values, run_options):
input_dict = {}
for n, v in input_dict_ort_values.items():
@ -268,10 +276,10 @@ class Session:
def run_with_iobinding(self, iobinding, run_options=None):
"""
Compute the predictions.
Compute the predictions.
:param iobinding: the iobinding object that has graph inputs/outputs bind.
:param run_options: See :class:`onnxruntime.RunOptions`.
:param iobinding: the iobinding object that has graph inputs/outputs bind.
:param run_options: See :class:`onnxruntime.RunOptions`.
"""
self._sess.run_with_iobinding(iobinding._iobinding, run_options)
@ -280,6 +288,7 @@ class InferenceSession(Session):
"""
This is the main class used to run a model.
"""
def __init__(self, path_or_bytes, sess_options=None, providers=None, provider_options=None, **kwargs):
"""
:param path_or_bytes: filename or serialized ONNX or ORT format model in a byte string
@ -326,10 +335,10 @@ class InferenceSession(Session):
self._sess_options = sess_options
self._sess_options_initial = sess_options
self._enable_fallback = True
self._read_config_from_model = os.environ.get('ORT_LOAD_CONFIG_FROM_MODEL') == '1'
self._read_config_from_model = os.environ.get("ORT_LOAD_CONFIG_FROM_MODEL") == "1"
# internal parameters that we don't expect to be used in general so aren't documented
disabled_optimizers = kwargs['disabled_optimizers'] if 'disabled_optimizers' in kwargs else None
disabled_optimizers = kwargs["disabled_optimizers"] if "disabled_optimizers" in kwargs else None
try:
self._create_inference_session(providers, provider_options, disabled_optimizers)
@ -347,23 +356,25 @@ class InferenceSession(Session):
available_providers = C.get_available_providers()
# Tensorrt can fall back to CUDA. All others fall back to CPU.
if 'TensorrtExecutionProvider' in available_providers:
self._fallback_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
elif 'MIGraphXExecutionProvider' in available_providers:
self._fallback_providers = ['ROCMExecutionProvider', 'CPUExecutionProvider']
if "TensorrtExecutionProvider" in available_providers:
self._fallback_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
elif "MIGraphXExecutionProvider" in available_providers:
self._fallback_providers = ["ROCMExecutionProvider", "CPUExecutionProvider"]
else:
self._fallback_providers = ['CPUExecutionProvider']
self._fallback_providers = ["CPUExecutionProvider"]
# validate providers and provider_options before other initialization
providers, provider_options = check_and_normalize_provider_args(providers,
provider_options,
available_providers)
providers, provider_options = check_and_normalize_provider_args(
providers, provider_options, available_providers
)
if providers == [] and len(available_providers) > 1:
self.disable_fallback()
raise ValueError("This ORT build has {} enabled. ".format(available_providers) +
"Since ORT 1.9, you are required to explicitly set " +
"the providers parameter when instantiating InferenceSession. For example, "
"onnxruntime.InferenceSession(..., providers={}, ...)".format(available_providers))
raise ValueError(
"This ORT build has {} enabled. ".format(available_providers)
+ "Since ORT 1.9, you are required to explicitly set "
+ "the providers parameter when instantiating InferenceSession. For example, "
"onnxruntime.InferenceSession(..., providers={}, ...)".format(available_providers)
)
session_options = self._sess_options if self._sess_options else C.get_default_session_options()
if self._model_path:
@ -410,19 +421,20 @@ class InferenceSession(Session):
class IOBinding:
'''
"""
This class provides an API to bind input/output to a specified device, e.g. GPU.
'''
"""
def __init__(self, session):
self._iobinding = C.SessionIOBinding(session._sess)
self._numpy_obj_references = {}
def bind_cpu_input(self, name, arr_on_cpu):
'''
"""
bind an input to array on CPU
:param name: input name
:param arr_on_cpu: input values as a python array on CPU
'''
"""
# Hold a reference to the numpy object as the bound OrtValue is backed
# directly by the data buffer of the numpy object and so the numpy object
# must be around until this IOBinding instance is around
@ -430,38 +442,53 @@ class IOBinding:
self._iobinding.bind_input(name, arr_on_cpu)
def bind_input(self, name, device_type, device_id, element_type, shape, buffer_ptr):
'''
"""
:param name: input name
:param device_type: e.g. cpu, cuda
:param device_id: device id, e.g. 0
:param element_type: input element type
:param shape: input shape
:param buffer_ptr: memory pointer to input data
'''
self._iobinding.bind_input(name,
C.OrtDevice(get_ort_device_type(device_type), C.OrtDevice.default_memory(),
device_id),
element_type, shape, buffer_ptr)
"""
self._iobinding.bind_input(
name,
C.OrtDevice(
get_ort_device_type(device_type),
C.OrtDevice.default_memory(),
device_id,
),
element_type,
shape,
buffer_ptr,
)
def bind_ortvalue_input(self, name, ortvalue):
'''
"""
:param name: input name
:param ortvalue: OrtValue instance to bind
'''
"""
self._iobinding.bind_ortvalue_input(name, ortvalue._ortvalue)
def synchronize_inputs(self):
self._iobinding.synchronize_inputs()
def bind_output(self, name, device_type='cpu', device_id=0, element_type=None, shape=None, buffer_ptr=None):
'''
def bind_output(
self,
name,
device_type="cpu",
device_id=0,
element_type=None,
shape=None,
buffer_ptr=None,
):
"""
:param name: output name
:param device_type: e.g. cpu, cuda, cpu by default
:param device_id: device id, e.g. 0
:param element_type: output element type
:param shape: output shape
:param buffer_ptr: memory pointer to output data
'''
"""
# Follow the `if` path when the user has not provided any pre-allocated buffer but still
# would like to bind an output to a specific device (e.g. cuda).
@ -470,32 +497,44 @@ class IOBinding:
# in which case ORT will allocate the memory for the user
# (2) The output has a dynamic shape and hence the size of the buffer may not be fixed across runs
if buffer_ptr is None:
self._iobinding.bind_output(name,
C.OrtDevice(get_ort_device_type(device_type), C.OrtDevice.default_memory(),
device_id))
self._iobinding.bind_output(
name,
C.OrtDevice(
get_ort_device_type(device_type),
C.OrtDevice.default_memory(),
device_id,
),
)
else:
if element_type is None or shape is None:
raise ValueError("`element_type` and `shape` are to be provided if pre-allocated memory is provided")
self._iobinding.bind_output(name,
C.OrtDevice(get_ort_device_type(device_type), C.OrtDevice.default_memory(),
device_id),
element_type, shape, buffer_ptr)
self._iobinding.bind_output(
name,
C.OrtDevice(
get_ort_device_type(device_type),
C.OrtDevice.default_memory(),
device_id,
),
element_type,
shape,
buffer_ptr,
)
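# Illustrative sketch (assumed session and input array names): without a pre-allocated
# buffer ORT allocates the output on the requested device; passing element_type, shape
# and buffer_ptr binds a caller-owned buffer instead.
#
#   io_binding = session.io_binding()
#   io_binding.bind_cpu_input("input", input_array)
#   io_binding.bind_output("output", device_type="cuda", device_id=0)  # ORT allocates
#   session.run_with_iobinding(io_binding)
#   result = io_binding.copy_outputs_to_cpu()[0]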
def bind_ortvalue_output(self, name, ortvalue):
'''
"""
:param name: output name
:param ortvalue: OrtValue instance to bind
'''
"""
self._iobinding.bind_ortvalue_output(name, ortvalue._ortvalue)
def synchronize_outputs(self):
self._iobinding.synchronize_outputs()
def get_outputs(self):
'''
"""
Returns the output OrtValues from the Run() that preceded the call.
The data buffer of the obtained OrtValues may not reside on CPU memory
'''
"""
returned_ortvalues = []
for ortvalue in self._iobinding.get_outputs():
@ -504,7 +543,7 @@ class IOBinding:
return returned_ortvalues
def copy_outputs_to_cpu(self):
'''Copy output contents to CPU (if on another device). No-op if already on the CPU.'''
"""Copy output contents to CPU (if on another device). No-op if already on the CPU."""
return self._iobinding.copy_outputs_to_cpu()
def clear_binding_inputs(self):
@ -515,11 +554,12 @@ class IOBinding:
class OrtValue:
'''
"""
A data structure that supports all ONNX data formats (tensors and non-tensors) and allows users
to place the data backing these on a device, for example, on a CUDA supported device.
This class provides APIs to construct and deal with OrtValues.
'''
"""
def __init__(self, ortvalue, numpy_obj=None):
if isinstance(ortvalue, C.OrtValue):
self._ortvalue = ortvalue
@ -528,157 +568,183 @@ class OrtValue:
self._numpy_obj = numpy_obj
else:
# An end user won't hit this error
raise ValueError("`Provided ortvalue` needs to be of type " +
"`onnxruntime.capi.onnxruntime_pybind11_state.OrtValue`")
raise ValueError(
"`Provided ortvalue` needs to be of type " + "`onnxruntime.capi.onnxruntime_pybind11_state.OrtValue`"
)
def _get_c_value(self):
return self._ortvalue
@staticmethod
def ortvalue_from_numpy(numpy_obj, device_type='cpu', device_id=0):
'''
def ortvalue_from_numpy(numpy_obj, device_type="cpu", device_id=0):
"""
Factory method to construct an OrtValue (which holds a Tensor) from a given Numpy object
A copy of the data in the Numpy object is held by the OrtValue only if the device is NOT cpu
:param numpy_obj: The Numpy object to construct the OrtValue from
:param device_type: e.g. cpu, cuda, cpu by default
:param device_id: device id, e.g. 0
'''
"""
# Hold a reference to the numpy object (if device_type is 'cpu') as the OrtValue
# is backed directly by the data buffer of the numpy object and so the numpy object
# must be around until this OrtValue instance is around
return OrtValue(C.OrtValue.ortvalue_from_numpy(numpy_obj, C.OrtDevice(get_ort_device_type(device_type),
C.OrtDevice.default_memory(), device_id)), numpy_obj if device_type.lower() == 'cpu' else None)
return OrtValue(
C.OrtValue.ortvalue_from_numpy(
numpy_obj,
C.OrtDevice(
get_ort_device_type(device_type),
C.OrtDevice.default_memory(),
device_id,
),
),
numpy_obj if device_type.lower() == "cpu" else None,
)
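# Illustrative sketch (assumed numpy array): constructing OrtValues on CPU and, when the
# build has CUDA support, on a GPU device.
#
#   x = np.ones((3, 224, 224), dtype=np.float32)
#   cpu_value = OrtValue.ortvalue_from_numpy(x)             # backed by x's own buffer
#   gpu_value = OrtValue.ortvalue_from_numpy(x, "cuda", 0)  # data copied to device 0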
@staticmethod
def ortvalue_from_shape_and_type(shape=None, element_type=None, device_type='cpu', device_id=0):
'''
def ortvalue_from_shape_and_type(shape=None, element_type=None, device_type="cpu", device_id=0):
"""
Factory method to construct an OrtValue (which holds a Tensor) from given shape and element_type
:param shape: List of integers indicating the shape of the OrtValue
:param element_type: The data type of the elements in the OrtValue (numpy type)
:param device_type: e.g. cpu, cuda, cpu by default
:param device_id: device id, e.g. 0
'''
"""
if shape is None or element_type is None:
raise ValueError("`element_type` and `shape` are to be provided if pre-allocated memory is provided")
return OrtValue(C.OrtValue.ortvalue_from_shape_and_type(shape, element_type,
C.OrtDevice(get_ort_device_type(device_type), C.OrtDevice.default_memory(), device_id)))
return OrtValue(
C.OrtValue.ortvalue_from_shape_and_type(
shape,
element_type,
C.OrtDevice(
get_ort_device_type(device_type),
C.OrtDevice.default_memory(),
device_id,
),
)
)
@staticmethod
def ort_value_from_sparse_tensor(sparse_tensor):
'''
"""
The function will construct an OrtValue instance from a valid SparseTensor
The new instance of OrtValue will assume the ownership of sparse_tensor
'''
"""
return OrtValue(C.OrtValue.ort_value_from_sparse_tensor(sparse_tensor._get_c_tensor()))
def as_sparse_tensor(self):
'''
"""
The function will return SparseTensor contained in this OrtValue
'''
"""
return SparseTensor(self._ortvalue.as_sparse_tensor())
def data_ptr(self):
'''
"""
Returns the address of the first element in the OrtValue's data buffer
'''
"""
return self._ortvalue.data_ptr()
def device_name(self):
'''
"""
Returns the name of the device where the OrtValue's data buffer resides e.g. cpu, cuda
'''
"""
return self._ortvalue.device_name().lower()
def shape(self):
'''
"""
Returns the shape of the data in the OrtValue
'''
"""
return self._ortvalue.shape()
def data_type(self):
'''
"""
Returns the data type of the data in the OrtValue
'''
"""
return self._ortvalue.data_type()
def element_type(self):
'''
"""
Returns the proto type of the data in the OrtValue
if the OrtValue is a tensor.
'''
"""
return self._ortvalue.element_type()
def has_value(self):
'''
"""
Returns True if the OrtValue corresponding to an
optional type contains data, else returns False
'''
"""
return self._ortvalue.has_value()
def is_tensor(self):
'''
"""
Returns True if the OrtValue contains a Tensor, else returns False
'''
"""
return self._ortvalue.is_tensor()
def is_sparse_tensor(self):
'''
"""
Returns True if the OrtValue contains a SparseTensor, else returns False
'''
"""
return self._ortvalue.is_sparse_tensor()
def is_tensor_sequence(self):
'''
"""
Returns True if the OrtValue contains a Tensor Sequence, else returns False
'''
"""
return self._ortvalue.is_tensor_sequence()
def numpy(self):
'''
"""
Returns a Numpy object from the OrtValue.
Valid only for OrtValues holding Tensors. Throws for OrtValues holding non-Tensors.
Use accessors to gain a reference to non-Tensor objects such as SparseTensor
'''
"""
return self._ortvalue.numpy()
def update_inplace(self, np_arr):
'''
"""
Update the OrtValue in place with a new Numpy array. The numpy contents
are copied over to the device memory backing the OrtValue. It can be used
to update the input values for an InferenceSession with CUDA graph
enabled or other scenarios where the OrtValue needs to be updated while
the memory address can not be changed.
'''
"""
self._ortvalue.update_inplace(np_arr)
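# Illustrative sketch (assumed array of matching shape and dtype): refreshing a bound,
# device-backed OrtValue between runs without changing its memory address, e.g. when a
# CUDA graph has captured that address.
#
#   gpu_value.update_inplace(next_batch)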
class OrtDevice:
'''
"""
A data structure that exposes the underlying C++ OrtDevice
'''
"""
def __init__(self, c_ort_device):
'''
"""
Internal constructor
'''
"""
if isinstance(c_ort_device, C.OrtDevice):
self._ort_device = c_ort_device
else:
raise ValueError("`Provided object` needs to be of type " +
"`onnxruntime.capi.onnxruntime_pybind11_state.OrtDevice`")
raise ValueError(
"`Provided object` needs to be of type " + "`onnxruntime.capi.onnxruntime_pybind11_state.OrtDevice`"
)
def _get_c_device(self):
'''
"""
Internal accessor to underlying object
'''
"""
return self._ort_device
@staticmethod
def make(ort_device_name, device_id):
return OrtDevice(C.OrtDevice(get_ort_device_type(ort_device_name),
C.OrtDevice.default_memory(), device_id))
return OrtDevice(
C.OrtDevice(
get_ort_device_type(ort_device_name),
C.OrtDevice.default_memory(),
device_id,
)
)
def device_id(self):
return self._ort_device.device_id()
@ -688,29 +754,31 @@ class OrtDevice:
class SparseTensor:
'''
"""
A data structure that projects the C++ SparseTensor object.
The class provides an API to work with the object.
Depending on the format, the class may hold more than one buffer.
'''
"""
def __init__(self, sparse_tensor):
'''
"""
Internal constructor
'''
"""
if isinstance(sparse_tensor, C.SparseTensor):
self._tensor = sparse_tensor
else:
# An end user won't hit this error
raise ValueError("`Provided object` needs to be of type " +
"`onnxruntime.capi.onnxruntime_pybind11_state.SparseTensor`")
raise ValueError(
"`Provided object` needs to be of type " + "`onnxruntime.capi.onnxruntime_pybind11_state.SparseTensor`"
)
def _get_c_tensor(self):
return self._tensor
@staticmethod
def sparse_coo_from_numpy(dense_shape, values, coo_indices, ort_device):
'''
"""
Factory method to construct a SparseTensor in COO format from given arguments
:param dense_shape: 1-D numpy array(int64) or a python list that contains a dense_shape of the sparse tensor
@ -729,13 +797,14 @@ class SparseTensor:
on GC. The buffers may reside in any storage either CPU or GPU.
For strings and objects, it will create a copy of the arrays in CPU memory as ORT does not support those
on other devices and their memory can not be mapped.
'''
return SparseTensor(C.SparseTensor.sparse_coo_from_numpy(dense_shape, values, coo_indices,
ort_device._get_c_device()))
"""
return SparseTensor(
C.SparseTensor.sparse_coo_from_numpy(dense_shape, values, coo_indices, ort_device._get_c_device())
)
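# Illustrative sketch (hypothetical 3x3 matrix with two non-zeros on CPU); the flat COO
# index layout used below is an assumption, with one linear index per value.
#
#   dense_shape = np.array([3, 3], dtype=np.int64)
#   values = np.array([1.0, 2.0], dtype=np.float32)
#   coo_indices = np.array([0, 4], dtype=np.int64)  # linear indices of the non-zeros
#   cpu_device = OrtDevice.make("cpu", 0)
#   sparse = SparseTensor.sparse_coo_from_numpy(dense_shape, values, coo_indices, cpu_device)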
@staticmethod
def sparse_csr_from_numpy(dense_shape, values, inner_indices, outer_indices, ort_device):
'''
"""
Factory method to construct a SparseTensor in CSR format from given arguments
:param dense_shape: 1-D numpy array(int64) or a python list that contains a dense_shape of the
@ -754,20 +823,27 @@ class SparseTensor:
The buffers may reside in any storage either CPU or GPU.
For strings and objects, it will create a copy of the arrays in CPU memory as ORT does not support those
on other devices and their memory can not be mapped.
'''
return SparseTensor(C.SparseTensor.sparse_csr_from_numpy(dense_shape, values, inner_indices, outer_indices,
ort_device._get_c_device()))
"""
return SparseTensor(
C.SparseTensor.sparse_csr_from_numpy(
dense_shape,
values,
inner_indices,
outer_indices,
ort_device._get_c_device(),
)
)
def values(self):
'''
"""
The method returns a numpy array that is backed by the native memory
if the data type is numeric. Otherwise, the returned numpy array contains
copies of the strings.
'''
"""
return self._tensor.values()
def as_coo_view(self):
'''
"""
The method will return coo representation of the sparse tensor which will enable
querying COO indices. If the instance did not contain COO format, it would throw.
You can query coo indices as:
@ -777,11 +853,11 @@ class SparseTensor:
coo_indices = sparse_tensor.as_coo_view().indices()
which will return a numpy array that is backed by the native memory.
'''
"""
return self._tensor.get_coo_data()
def as_csrc_view(self):
'''
"""
The method will return CSR(C) representation of the sparse tensor which will enable
querying CSR(C) indices. If the instance did not contain CSR(C) format, it would throw.
You can query indices as:
@ -792,11 +868,11 @@ class SparseTensor:
outer_indices = sparse_tensor.as_csrc_view().outer()
returning numpy arrays backed by the native memory.
'''
"""
return self._tensor.get_csrc_data()
def as_blocksparse_view(self):
'''
"""
The method will return BlockSparse representation of the sparse tensor which will enable
querying BlockSparse indices. If the instance did not contain BlockSparse format, it would throw.
You can query BlockSparse indices as:
@ -806,11 +882,11 @@ class SparseTensor:
block_sparse_indices = sparse_tensor.as_blocksparse_view().indices()
which will return a numpy array that is backed by the native memory
'''
"""
return self._tensor.get_blocksparse_data()
def to_cuda(self, ort_device):
'''
"""
Returns a copy of this instance on the specified cuda device
:param ort_device: with name 'cuda' and valid gpu device id
@ -821,29 +897,29 @@ class SparseTensor:
- this instance is already on GPU. Cross GPU copy is not supported
- CUDA is not present in this build
- if the specified device is not valid
'''
"""
return SparseTensor(self._tensor.to_cuda(ort_device._get_c_device()))
def format(self):
'''
"""
Returns an OrtSparseFormat enumeration
'''
"""
return self._tensor.format
def dense_shape(self):
'''
"""
Returns a numpy array(int64) containing a dense shape of a sparse tensor
'''
"""
return self._tensor.dense_shape()
def data_type(self):
'''
"""
Returns a string data type of the data in the OrtValue
'''
"""
return self._tensor.data_type()
def device_name(self):
'''
"""
Returns the name of the device where the SparseTensor data buffers reside e.g. cpu, cuda
'''
"""
return self._tensor.device_name().lower()
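
For reference, a minimal usage sketch of the COO factory and accessors above, assuming the wrappers are exposed at the top level as onnxruntime.SparseTensor and onnxruntime.OrtDevice (the shape and values are made up):

import numpy as np
import onnxruntime as ort

# A 3x3 dense matrix with two non-zero entries, expressed in COO format.
dense_shape = np.array([3, 3], dtype=np.int64)
values = np.array([1.0, 2.0], dtype=np.float32)
coo_indices = np.array([2, 5], dtype=np.int64)  # flattened (linear) indices of the non-zeros

cpu_device = ort.OrtDevice.make("cpu", 0)
sparse = ort.SparseTensor.sparse_coo_from_numpy(dense_shape, values, coo_indices, cpu_device)

print(sparse.dense_shape())            # [3 3]
print(sparse.values())                 # [1. 2.]
print(sparse.as_coo_view().indices())  # [2 5]

On a CUDA-enabled build, to_cuda(ort_device) would return a copy of the same tensor on the given GPU device, as described in the docstrings above.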


@ -5,34 +5,36 @@
"""
Check OS requirements for ONNX Runtime Python Bindings.
"""
import platform
import linecache
import platform
import warnings
def check_distro_info():
__my_distro__ = ''
__my_distro_ver__ = ''
__my_distro__ = ""
__my_distro_ver__ = ""
__my_system__ = platform.system().lower()
__OS_RELEASE_FILE__ = '/etc/os-release'
__LSB_RELEASE_FILE__ = '/etc/lsb-release'
__OS_RELEASE_FILE__ = "/etc/os-release"
__LSB_RELEASE_FILE__ = "/etc/lsb-release"
if __my_system__ == 'windows':
if __my_system__ == "windows":
__my_distro__ = __my_system__
__my_distro_ver__ = platform.release().lower()
if __my_distro_ver__ != '10':
warnings.warn('Unsupported Windows version (%s). ONNX Runtime supports Windows 10 and above, only.' %
__my_distro_ver__)
elif __my_system__ == 'linux':
''' Although the 'platform' python module for getting Distro information works well on standard OS images
if __my_distro_ver__ != "10":
warnings.warn(
"Unsupported Windows version (%s). ONNX Runtime supports Windows 10 and above, only."
% __my_distro_ver__
)
elif __my_system__ == "linux":
"""Although the 'platform' python module for getting Distro information works well on standard OS images
running on real hardware, it is not accurate when running on Azure VMs, Git Bash, Cygwin, etc.
The returned values for release and version are unpredictable for virtualized or emulated environments.
/etc/os-release and /etc/lsb_release files, on the other hand, are guaranteed to exist and have standard values
in all OSes supported by onnxruntime. The former is the current standard file to check OS info and the latter
is its predecessor.
'''
"""
# Newer systems have /etc/os-release with relevant distro info
__my_distro__ = linecache.getline(__OS_RELEASE_FILE__, 3)[3:-1]
__my_distro_ver__ = linecache.getline(__OS_RELEASE_FILE__, 6)[12:-2]
@ -46,16 +48,18 @@ def check_distro_info():
# warn the user ONNX Runtime may not work out of the box
__my_distro__ = __my_distro__.lower()
__my_distro_ver__ = __my_distro_ver__.lower()
elif __my_system__ == 'darwin':
elif __my_system__ == "darwin":
__my_distro__ = __my_system__
__my_distro_ver__ = platform.release().lower()
if int(__my_distro_ver__.split('.')[0]) < 11:
warnings.warn('Unsupported macOS version (%s). ONNX Runtime supports macOS 11.0 or later.' %
(__my_distro_ver__))
if int(__my_distro_ver__.split(".")[0]) < 11:
warnings.warn(
"Unsupported macOS version (%s). ONNX Runtime supports macOS 11.0 or later." % (__my_distro_ver__)
)
else:
warnings.warn('Unsupported platform (%s). ONNX Runtime supports Linux, macOS and Windows platforms, only.' %
__my_system__)
warnings.warn(
"Unsupported platform (%s). ONNX Runtime supports Linux, macOS and Windows platforms, only." % __my_system__
)
def validate_build_package_info():
@ -63,7 +67,8 @@ def validate_build_package_info():
has_ortmodule = False
try:
from onnxruntime.training.ortmodule import ORTModule # noqa
from onnxruntime.training.ortmodule import ORTModule # noqa
has_ortmodule = True
except ImportError:
# ORTModule not present
@ -74,6 +79,7 @@ def validate_build_package_info():
# device version validation and raise the exception after.
try:
from onnxruntime.training.ortmodule._fallback import ORTModuleInitException
if isinstance(e, ORTModuleInitException):
# ORTModule is present but not ready to run yet
has_ortmodule = True
@ -84,19 +90,19 @@ def validate_build_package_info():
if not has_ortmodule:
import_ortmodule_exception = e
package_name = ''
version = ''
cuda_version = ''
package_name = ""
version = ""
cuda_version = ""
if has_ortmodule:
try:
# collect onnxruntime package name, version, and cuda version
from .build_and_package_info import package_name
from .build_and_package_info import __version__ as version
from .build_and_package_info import package_name
try:
from .build_and_package_info import cuda_version
except: # noqa
except: # noqa
pass
if cuda_version:
@ -104,29 +110,30 @@ def validate_build_package_info():
# when the build environment has none or multiple libraries installed
try:
from .build_and_package_info import cudart_version
except: # noqa
warnings.warn('WARNING: failed to get cudart_version from onnxruntime build info.')
except: # noqa
warnings.warn("WARNING: failed to get cudart_version from onnxruntime build info.")
cudart_version = None
def print_build_package_info():
warnings.warn('onnxruntime training package info: package_name: %s' % package_name)
warnings.warn('onnxruntime training package info: __version__: %s' % version)
warnings.warn('onnxruntime training package info: cuda_version: %s' % cuda_version)
warnings.warn('onnxruntime build info: cudart_version: %s' % cudart_version)
warnings.warn("onnxruntime training package info: package_name: %s" % package_name)
warnings.warn("onnxruntime training package info: __version__: %s" % version)
warnings.warn("onnxruntime training package info: cuda_version: %s" % cuda_version)
warnings.warn("onnxruntime build info: cudart_version: %s" % cudart_version)
# collect cuda library info from the current environment.
from onnxruntime.capi.onnxruntime_collect_build_info import find_cudart_versions
local_cudart_versions = find_cudart_versions(build_env=False, build_cuda_version=cuda_version)
if cudart_version and local_cudart_versions and cudart_version not in local_cudart_versions:
print_build_package_info()
warnings.warn('WARNING: failed to find cudart version that matches onnxruntime build info')
warnings.warn('WARNING: found cudart versions: %s' % local_cudart_versions)
warnings.warn("WARNING: failed to find cudart version that matches onnxruntime build info")
warnings.warn("WARNING: found cudart versions: %s" % local_cudart_versions)
else:
# TODO: rcom
pass
except Exception as e: # noqa
warnings.warn('WARNING: failed to collect onnxruntime version and build info')
except Exception as e: # noqa
warnings.warn("WARNING: failed to collect onnxruntime version and build info")
print(e)
if import_ortmodule_exception:


@ -9,9 +9,10 @@ import textwrap
def rewrite_target_file(target):
with open(target, 'a') as f:
f.write(textwrap.dedent(
"""
with open(target, "a") as f:
f.write(
textwrap.dedent(
"""
import warnings
try:
@ -33,15 +34,21 @@ def rewrite_target_file(target):
f"WARNING: Failed to register python functions to work with TVM EP. More details: {e}"
)
"""
))
)
)
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--target_file", type=str, required=True, help="Path to the file to be expanded.")
parser.add_argument(
"--target_file",
type=str,
required=True,
help="Path to the file to be expanded.",
)
args = parser.parse_args()
rewrite_target_file(args.target_file)
if __name__ == '__main__':
if __name__ == "__main__":
main()


@ -4,17 +4,16 @@
# license information.
# --------------------------------------------------------------------------
import os
import collections
import copy
import logging
import os
import onnx
import tvm
from tvm import relay, auto_scheduler
from tvm.relay import vm
from tvm import auto_scheduler, autotvm, relay
from tvm.contrib import graph_executor
from tvm import autotvm
from tvm.relay import vm
log = logging.getLogger("tvm_ep")
@ -23,18 +22,20 @@ AUTO_TVM_TYPE = "AutoTVM"
@tvm.register_func("tvm_onnx_import_and_compile")
def onnx_compile(model_string,
model_path,
executor,
target,
target_host,
opt_level,
opset,
freeze_params,
input_shapes,
nhwc=False,
tuning_logfile="",
tuning_type=AUTO_TVM_TYPE):
def onnx_compile(
model_string,
model_path,
executor,
target,
target_host,
opt_level,
opset,
freeze_params,
input_shapes,
nhwc=False,
tuning_logfile="",
tuning_type=AUTO_TVM_TYPE,
):
def get_tvm_executor(irmod, executor, target, params):
if executor == "vm":
log.info("Build TVM virtual machine")
@ -47,8 +48,9 @@ def onnx_compile(model_string,
log.info("Build TVM graph executor")
lib = relay.build(irmod, target=target, params=params)
else:
log.error("Executor type {} is unsupported. ".format(executor) +
"Only \"vm\" and \"graph\" types are supported")
log.error(
"Executor type {} is unsupported. ".format(executor) + 'Only "vm" and "graph" types are supported'
)
return None
return lib
@ -94,7 +96,7 @@ def onnx_compile(model_string,
config={
"relay.backend.use_auto_scheduler": True,
"relay.FuseOps.max_depth": 30,
}
},
):
if nhwc:
seq = tvm.transform.Sequential(
@ -113,8 +115,10 @@ def onnx_compile(model_string,
with autotvm.apply_history_best(tuning_logfile):
lib = get_tvm_executor(irmod, executor, tvm_target, params)
else:
log.error("Tuning log type {} is unsupported. ".format(tuning_type) +
"Only {} and {} types are supported".format(ANSOR_TYPE, AUTO_TVM_TYPE))
log.error(
"Tuning log type {} is unsupported. ".format(tuning_type)
+ "Only {} and {} types are supported".format(ANSOR_TYPE, AUTO_TVM_TYPE)
)
return None
else:
with tvm.transform.PassContext(opt_level=opt_level):
@ -129,8 +133,10 @@ def onnx_compile(model_string,
elif executor == "graph":
m = graph_executor.GraphModule(lib["default"](ctx))
else:
print("ERROR: Executor type {} is unsupported. ".format(executor),
"Only \"vm\" and \"graph\" types are supported")
print(
"ERROR: Executor type {} is unsupported. ".format(executor),
'Only "vm" and "graph" types are supported',
)
return None
return m.module
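
For context, the compile-and-run flow above corresponds roughly to the following standalone sketch built only on stock TVM Relay APIs; the model path, input name, and shape are placeholders:

import numpy as np
import onnx
import tvm
from tvm import relay
from tvm.contrib import graph_executor

model = onnx.load("model.onnx")           # placeholder model
shape_dict = {"input": (1, 3, 224, 224)}  # placeholder input name/shape

# Import the ONNX graph into Relay and build a graph executor for CPU.
irmod, params = relay.frontend.from_onnx(model, shape_dict, freeze_params=True)
with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(irmod, target="llvm", params=params)

dev = tvm.cpu(0)
m = graph_executor.GraphModule(lib["default"](dev))
m.set_input("input", tvm.nd.array(np.random.rand(1, 3, 224, 224).astype("float32"), dev))
m.run()
out = m.get_output(0).numpy()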


@ -1,10 +1,11 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
# --------------------------------------------------------------------------
import argparse
from dataclasses import dataclass
import numpy as np
from benchmark import BenchmarkOp, add_arguments
@ -24,12 +25,21 @@ class BenchmarkAttention(BenchmarkOp):
def create_inputs_outputs(cls, op_param):
np.random.seed(0)
input_data = np.random.rand(op_param.batch_size, op_param.seq_len, op_param.hidden_size).astype(op_param.data_type)
input_data = np.random.rand(op_param.batch_size, op_param.seq_len, op_param.hidden_size).astype(
op_param.data_type
)
weight = np.random.rand(op_param.hidden_size, op_param.length).astype(op_param.data_type)
bias = np.random.rand(op_param.length).astype(op_param.data_type)
mask_index = np.random.rand(op_param.batch_size, op_param.seq_len).astype(np.int32)
output_data = np.random.rand(op_param.batch_size, op_param.seq_len, op_param.hidden_size).astype(op_param.data_type)
inputs = {"INPUT": input_data, "WEIGHT": weight, "BIAS": bias, "MASK_INDEX": mask_index}
output_data = np.random.rand(op_param.batch_size, op_param.seq_len, op_param.hidden_size).astype(
op_param.data_type
)
inputs = {
"INPUT": input_data,
"WEIGHT": weight,
"BIAS": bias,
"MASK_INDEX": mask_index,
}
outputs = {"return_val": output_data}
return inputs, outputs


@ -1,44 +1,67 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
# --------------------------------------------------------------------------
import logging
import time
from abc import ABC, abstractmethod
from argparse import ArgumentParser
import logging
import numpy
import onnxruntime as ort
import time
import torch
import onnxruntime as ort
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def numpy_type(torch_type):
type_map = {torch.float32: numpy.float32,
torch.float16: numpy.float16,
torch.int32: numpy.int32}
type_map = {
torch.float32: numpy.float32,
torch.float16: numpy.float16,
torch.int32: numpy.int32,
}
return type_map[torch_type]
def add_arguments(parser: ArgumentParser):
parser.add_argument("--provider", required=False, type=str,
choices=["cuda", "rocm", "cpu", None], default=None,
help=("Execution provider to use. By default, a "
"provider is selected in the priority order "
"(cuda|rocm, cpu) depending on availability."))
parser.add_argument("--precision", required=False, type=str,
choices=["fp16", "fp32"], default="fp16",
help="Number format to use")
parser.add_argument('--profiling', required=False, type=bool,
default=False, help='If enable profiling')
parser.add_argument(
"--provider",
required=False,
type=str,
choices=["cuda", "rocm", "cpu", None],
default=None,
help=(
"Execution provider to use. By default, a "
"provider is selected in the priority order "
"(cuda|rocm, cpu) depending on availability."
),
)
parser.add_argument(
"--precision",
required=False,
type=str,
choices=["fp16", "fp32"],
default="fp16",
help="Number format to use",
)
parser.add_argument(
"--profiling",
required=False,
type=bool,
default=False,
help="If enable profiling",
)
def provider_name(name):
provider_map = {"cuda": "CUDAExecutionProvider",
"rocm": "ROCMExecutionProvider",
"cpu": "CPUExecutionProvider"}
provider_map = {
"cuda": "CUDAExecutionProvider",
"rocm": "ROCMExecutionProvider",
"cpu": "CPUExecutionProvider",
}
return provider_map[name]
@ -52,8 +75,7 @@ def get_default_provider():
class Benchmark:
def __init__(self, model, inputs, outputs, args):
self.provider = (get_default_provider() if args.provider == None
else provider_name(args.provider))
self.provider = get_default_provider() if args.provider == None else provider_name(args.provider)
logger.info(f"Execution provider: {self.provider}")
self.profiling = args.profiling
self.model = model
@ -62,43 +84,49 @@ class Benchmark:
self.outputs = outputs
def create_input_output_tensors(self):
on_gpu = (self.provider == "CUDAExecutionProvider"
or self.provider == "ROCMExecutionProvider")
on_gpu = self.provider == "CUDAExecutionProvider" or self.provider == "ROCMExecutionProvider"
device = "cuda" if on_gpu else "cpu"
input_tensors = {name: torch.from_numpy(array).to(device)
for name, array in self.inputs.items()}
output_tensors = {name: torch.from_numpy(array).to(device)
for name, array in self.outputs.items()}
input_tensors = {name: torch.from_numpy(array).to(device) for name, array in self.inputs.items()}
output_tensors = {name: torch.from_numpy(array).to(device) for name, array in self.outputs.items()}
return input_tensors, output_tensors
@classmethod
def create_io_binding(cls, sess, input_tensors, output_tensors):
io_binding = sess.io_binding()
for name, tensor in input_tensors.items():
io_binding.bind_input(name, tensor.device.type, 0,
numpy_type(tensor.dtype), tensor.shape,
tensor.data_ptr())
io_binding.bind_input(
name,
tensor.device.type,
0,
numpy_type(tensor.dtype),
tensor.shape,
tensor.data_ptr(),
)
for name, tensor in output_tensors.items():
io_binding.bind_output(name, tensor.device.type, 0,
numpy_type(tensor.dtype), tensor.shape,
tensor.data_ptr())
io_binding.bind_output(
name,
tensor.device.type,
0,
numpy_type(tensor.dtype),
tensor.shape,
tensor.data_ptr(),
)
return io_binding
def create_session(self):
sess_opt = ort.SessionOptions()
sess_opt.enable_profiling = self.profiling
sess = ort.InferenceSession(self.model, sess_options=sess_opt,
providers=[self.provider])
sess = ort.InferenceSession(self.model, sess_options=sess_opt, providers=[self.provider])
return sess
def benchmark(self):
sess = self.create_session()
input_tensors, output_tensors = self.create_input_output_tensors()
io_binding = self.create_io_binding(sess, input_tensors, output_tensors)
# warm up
for iter in range(10):
sess.run_with_iobinding(io_binding)
sess.run_with_iobinding(io_binding)
# measure
max_iters = 100
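
The I/O binding pattern above boils down to a few calls; a minimal self-contained sketch on CPU, where "model.onnx" and the input/output names are placeholders:

import numpy as np
import torch
import onnxruntime as ort

sess = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])  # placeholder model

x = torch.from_numpy(np.random.rand(1, 3).astype(np.float32))  # input buffer
y = torch.empty((1, 2), dtype=torch.float32)                   # preallocated output buffer

io_binding = sess.io_binding()
io_binding.bind_input("input", x.device.type, 0, np.float32, tuple(x.shape), x.data_ptr())
io_binding.bind_output("output", y.device.type, 0, np.float32, tuple(y.shape), y.data_ptr())

sess.run_with_iobinding(io_binding)  # results are written directly into y

Binding both sides once and reusing the binding across iterations is what lets the benchmark above time kernel execution without per-run tensor copies.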


@ -1,28 +1,29 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
# --------------------------------------------------------------------------
import argparse
from dataclasses import dataclass
import numpy as np
from benchmark import BenchmarkOp, add_arguments
@dataclass
class OpParam:
x : int
y : int
m : int
n : int
input_data_type : type
output_data_type : type
x: int
y: int
m: int
n: int
input_data_type: type
output_data_type: type
@dataclass
class ModelParam:
token_type_ids_dim0 : int
input_ids_dim1 : int
token_type_ids_dim0: int
input_ids_dim1: int
class BenchmarkCast(BenchmarkOp):
@ -38,9 +39,39 @@ class BenchmarkCast(BenchmarkOp):
return inputs, outputs
def add_model_cases(self, mp, model, input_data_type, output_data_type):
self.add_case(OpParam(1, mp.token_type_ids_dim0, mp.input_ids_dim1, 1024, input_data_type, output_data_type), model)
self.add_case(OpParam(1, mp.token_type_ids_dim0, mp.input_ids_dim1, 1, input_data_type, output_data_type), model)
self.add_case(OpParam(16, mp.token_type_ids_dim0, mp.input_ids_dim1, mp.input_ids_dim1, input_data_type, output_data_type), model)
self.add_case(
OpParam(
1,
mp.token_type_ids_dim0,
mp.input_ids_dim1,
1024,
input_data_type,
output_data_type,
),
model,
)
self.add_case(
OpParam(
1,
mp.token_type_ids_dim0,
mp.input_ids_dim1,
1,
input_data_type,
output_data_type,
),
model,
)
self.add_case(
OpParam(
16,
mp.token_type_ids_dim0,
mp.input_ids_dim1,
mp.input_ids_dim1,
input_data_type,
output_data_type,
),
model,
)
def create_cases(self):
model = "models/cast_fp16tofp32.onnx" if self.args.precision == "fp16" else "models/cast_fp32tofp16.onnx"
@ -61,7 +92,7 @@ class BenchmarkCast(BenchmarkOp):
def case_profile(cls, op_param, time):
profile = f"(x y m n input_data_type) = ({op_param.x} {op_param.y} {op_param.m} {op_param.n} {op_param.input_data_type}), {time:7.4f} ms"
return profile
def main():
parser = argparse.ArgumentParser()


@ -1,10 +1,11 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
# --------------------------------------------------------------------------
import argparse
from dataclasses import dataclass
import numpy as np
from benchmark import BenchmarkOp, add_arguments
@ -43,7 +44,12 @@ class BenchmarkFastGelu(BenchmarkOp):
data_type = np.float16 if self.args.precision == "fp16" else np.float32
# bert-large
model_param = ModelParam(1, 384, 1024 * 4, data_type)
op_param = OpParam(model_param.batch_size, model_param.seq_len, model_param.inter_dim, model_param.data_type)
op_param = OpParam(
model_param.batch_size,
model_param.seq_len,
model_param.inter_dim,
model_param.data_type,
)
self.add_case(op_param, model)
def case_profile(cls, op_param, time):


@ -1,10 +1,11 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
# --------------------------------------------------------------------------
import argparse
from dataclasses import dataclass
import numpy as np
from benchmark import BenchmarkOp, add_arguments
@ -43,10 +44,36 @@ class BenchmarkMatMul(BenchmarkOp):
return inputs, outputs
def add_model_cases(self, mp, model):
self.add_case(OpParam(1, mp.batch_size, mp.seq_len, mp.hidden_size, mp.hidden_size, mp.data_type), model)
self.add_case(OpParam(1, mp.batch_size, mp.seq_len, mp.inter_dim, mp.hidden_size, mp.data_type), model)
self.add_case(OpParam(1, mp.batch_size, mp.seq_len, mp.hidden_size, mp.inter_dim, mp.data_type), model)
self.add_case(OpParam(mp.batch_size, mp.num_heads, mp.seq_len, mp.seq_len, int(mp.hidden_size / mp.num_heads), mp.data_type), model)
self.add_case(
OpParam(
1,
mp.batch_size,
mp.seq_len,
mp.hidden_size,
mp.hidden_size,
mp.data_type,
),
model,
)
self.add_case(
OpParam(1, mp.batch_size, mp.seq_len, mp.inter_dim, mp.hidden_size, mp.data_type),
model,
)
self.add_case(
OpParam(1, mp.batch_size, mp.seq_len, mp.hidden_size, mp.inter_dim, mp.data_type),
model,
)
self.add_case(
OpParam(
mp.batch_size,
mp.num_heads,
mp.seq_len,
mp.seq_len,
int(mp.hidden_size / mp.num_heads),
mp.data_type,
),
model,
)
def create_cases(self):
model = "models/matmul_fp16.onnx" if self.args.precision == "fp16" else "models/matmul_fp32.onnx"


@ -1,10 +1,11 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
# --------------------------------------------------------------------------
import argparse
from dataclasses import dataclass
import numpy as np
from benchmark import BenchmarkOp, add_arguments
@ -23,20 +24,32 @@ class BenchmarkSkipLayerNorm(BenchmarkOp):
def create_inputs_outputs(cls, op_param):
np.random.seed(0)
input_data = np.random.rand(op_param.batch_size, op_param.seq_len, op_param.hidden_size).astype(op_param.data_type)
input_data = np.random.rand(op_param.batch_size, op_param.seq_len, op_param.hidden_size).astype(
op_param.data_type
)
skip = np.random.rand(op_param.batch_size, op_param.seq_len, op_param.hidden_size).astype(op_param.data_type)
gamma = np.random.rand(op_param.hidden_size).astype(op_param.data_type)
beta = np.random.rand(op_param.hidden_size).astype(op_param.data_type)
bias = np.random.rand(op_param.hidden_size).astype(op_param.data_type)
output_data = np.random.rand(op_param.batch_size, op_param.seq_len, op_param.hidden_size).astype(op_param.data_type)
inputs = {"INPUT": input_data, "SKIP": skip, "GAMMA": gamma, "BETA": beta, "BIAS": bias}
output_data = np.random.rand(op_param.batch_size, op_param.seq_len, op_param.hidden_size).astype(
op_param.data_type
)
inputs = {
"INPUT": input_data,
"SKIP": skip,
"GAMMA": gamma,
"BETA": beta,
"BIAS": bias,
}
outputs = {"return_val": output_data}
return inputs, outputs
def create_cases(self):
model = "models/skip_layer_norm_fp16.onnx" if self.args.precision == "fp16" else "models/skip_layer_norm_fp32.onnx"
model = (
"models/skip_layer_norm_fp16.onnx" if self.args.precision == "fp16" else "models/skip_layer_norm_fp32.onnx"
)
data_type = np.float16 if self.args.precision == "fp16" else np.float32
# bert-large
op_param = OpParam(1, 384, 1024, data_type)


@ -1,21 +1,23 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
# --------------------------------------------------------------------------
# An offline standalone script to declassify an ONNX model by randomizing the tensor data in initializers.
# The ORT Performance may change especially on generative models.
import argparse
import numpy as np
from onnx import onnx_pb, numpy_helper, save_model, load_model
from pathlib import Path
import numpy as np
from onnx import load_model, numpy_helper, onnx_pb, save_model
# An experimental small value for differentiating shape data and weights.
# The tensor data with larger size can't be shape data.
# User may adjust this value as needed.
SIZE_THRESHOLD = 10
def graph_iterator(model, func):
graph_queue = [model.graph]
while graph_queue:
@ -24,11 +26,11 @@ def graph_iterator(model, func):
for node in graph.node:
for attr in node.attribute:
if attr.type == onnx_pb.AttributeProto.AttributeType.GRAPH:
assert (isinstance(attr.g, onnx_pb.GraphProto))
assert isinstance(attr.g, onnx_pb.GraphProto)
graph_queue.append(attr.g)
if attr.type == onnx_pb.AttributeProto.AttributeType.GRAPHS:
for g in attr.graphs:
assert (isinstance(g, onnx_pb.GraphProto))
assert isinstance(g, onnx_pb.GraphProto)
graph_queue.append(g)
@ -37,54 +39,47 @@ def randomize_graph_initializer(graph):
array = numpy_helper.to_array(i_tensor)
# TODO: need to find a better way to differentiate shape data and weights.
if array.size > SIZE_THRESHOLD:
random_array = np.random.uniform(array.min(),
array.max(),
size=array.shape).astype(
array.dtype)
random_array = np.random.uniform(array.min(), array.max(), size=array.shape).astype(array.dtype)
o_tensor = numpy_helper.from_array(random_array, i_tensor.name)
i_tensor.CopyFrom(o_tensor)
def main():
parser = argparse.ArgumentParser(
description='Randomize the weights of an ONNX model')
parser.add_argument('-m',
type=str,
required=True,
help='input onnx model path')
parser.add_argument('-o',
type=str,
required=True,
help='output onnx model path')
parser.add_argument("--use_external_data_format",
required=False,
action="store_true",
help="Store or Save in external data format")
parser.add_argument("--all_tensors_to_one_file",
required=False,
action="store_true",
help="Save all tensors to one file")
parser = argparse.ArgumentParser(description="Randomize the weights of an ONNX model")
parser.add_argument("-m", type=str, required=True, help="input onnx model path")
parser.add_argument("-o", type=str, required=True, help="output onnx model path")
parser.add_argument(
"--use_external_data_format",
required=False,
action="store_true",
help="Store or Save in external data format",
)
parser.add_argument(
"--all_tensors_to_one_file",
required=False,
action="store_true",
help="Save all tensors to one file",
)
args = parser.parse_args()
data_path = None
if args.use_external_data_format:
if Path(args.m).parent == Path(args.o).parent:
raise RuntimeError(
"Please specify output directory with different parent path to input directory."
)
raise RuntimeError("Please specify output directory with different parent path to input directory.")
if args.all_tensors_to_one_file:
data_path = Path(args.o).name + ".data"
Path(args.o).parent.mkdir(parents=True, exist_ok=True)
onnx_model = load_model(args.m,
load_external_data=args.use_external_data_format)
onnx_model = load_model(args.m, load_external_data=args.use_external_data_format)
graph_iterator(onnx_model, randomize_graph_initializer)
save_model(onnx_model,
args.o,
save_as_external_data=args.use_external_data_format,
all_tensors_to_one_file=args.all_tensors_to_one_file,
location=data_path)
save_model(
onnx_model,
args.o,
save_as_external_data=args.use_external_data_format,
all_tensors_to_one_file=args.all_tensors_to_one_file,
location=data_path,
)
if __name__ == '__main__':
if __name__ == "__main__":
main()
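
For a model without subgraphs, the randomization above reduces to a few onnx/numpy calls; a minimal sketch with placeholder file names:

import numpy as np
from onnx import load_model, numpy_helper, save_model

model = load_model("input.onnx")  # placeholder path
for initializer in model.graph.initializer:
    array = numpy_helper.to_array(initializer)
    if array.size > 10:  # skip small tensors that may actually be shape data
        random_array = np.random.uniform(array.min(), array.max(), size=array.shape).astype(array.dtype)
        initializer.CopyFrom(numpy_helper.from_array(random_array, initializer.name))
save_model(model, "randomized.onnx")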


@ -1,27 +1,34 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
# --------------------------------------------------------------------------
import argparse
import onnxruntime as onnxrt
import numpy as np
import os
import sys
from timeit import default_timer as timer
float_dict = {'tensor(float16)': 'float16', 'tensor(float)': 'float32', 'tensor(double)': 'float64'}
import numpy as np
import onnxruntime as onnxrt
float_dict = {
"tensor(float16)": "float16",
"tensor(float)": "float32",
"tensor(double)": "float64",
}
integer_dict = {
'tensor(int32)': 'int32',
'tensor(int8)': 'int8',
'tensor(uint8)': 'uint8',
'tensor(int16)': 'int16',
'tensor(uint16)': 'uint16',
'tensor(int64)': 'int64',
'tensor(uint64)': 'uint64'
"tensor(int32)": "int32",
"tensor(int8)": "int8",
"tensor(uint8)": "uint8",
"tensor(int16)": "int16",
"tensor(uint16)": "uint16",
"tensor(int64)": "int64",
"tensor(uint64)": "uint64",
}
def generate_feeds(sess, symbolic_dims={}):
feeds = {}
for input_meta in sess.get_inputs():
@ -43,23 +50,27 @@ def generate_feeds(sess, symbolic_dims={}):
if input_meta.type in float_dict:
feeds[input_meta.name] = np.random.rand(*shape).astype(float_dict[input_meta.type])
elif input_meta.type in integer_dict:
feeds[input_meta.name] = np.random.uniform(high=1000,
size=tuple(shape)).astype(integer_dict[input_meta.type])
elif input_meta.type == 'tensor(bool)':
feeds[input_meta.name] = np.random.randint(2, size=tuple(shape)).astype('bool')
feeds[input_meta.name] = np.random.uniform(high=1000, size=tuple(shape)).astype(
integer_dict[input_meta.type]
)
elif input_meta.type == "tensor(bool)":
feeds[input_meta.name] = np.random.randint(2, size=tuple(shape)).astype("bool")
else:
print("unsupported input type {} for input {}".format(input_meta.type, input_meta.name))
sys.exit(-1)
return feeds
# simple test program for loading onnx model, feeding all inputs and running the model num_iters times.
def run_model(model_path,
num_iters=1,
debug=None,
profile=None,
symbolic_dims={},
feeds=None,
override_initializers=True):
def run_model(
model_path,
num_iters=1,
debug=None,
profile=None,
symbolic_dims={},
feeds=None,
override_initializers=True,
):
if debug:
print("Pausing execution ready for debugger to attach to pid: {}".format(os.getpid()))
print("Press key to continue.")
@ -71,7 +82,11 @@ def run_model(model_path,
sess_options.enable_profiling = True
sess_options.profile_file_prefix = os.path.basename(model_path)
sess = onnxrt.InferenceSession(model_path, sess_options=sess_options, providers=onnxrt.get_available_providers())
sess = onnxrt.InferenceSession(
model_path,
sess_options=sess_options,
providers=onnxrt.get_available_providers(),
)
meta = sess.get_modelmeta()
if not feeds:
@ -86,10 +101,11 @@ def run_model(model_path,
if initializer.type in float_dict:
feeds[initializer.name] = np.random.rand(*shape).astype(float_dict[initializer.type])
elif initializer.type in integer_dict:
feeds[initializer.name] = np.random.uniform(high=1000,
size=tuple(shape)).astype(integer_dict[initializer.type])
elif initializer.type == 'tensor(bool)':
feeds[initializer.name] = np.random.randint(2, size=tuple(shape)).astype('bool')
feeds[initializer.name] = np.random.uniform(high=1000, size=tuple(shape)).astype(
integer_dict[initializer.type]
)
elif initializer.type == "tensor(bool)":
feeds[initializer.name] = np.random.randint(2, size=tuple(shape)).astype("bool")
else:
print("unsupported initializer type {} for initializer {}".format(initializer.type, initializer.name))
sys.exit(-1)
@ -112,15 +128,29 @@ def run_model(model_path,
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Simple ONNX Runtime Test Tool.')
parser.add_argument('model_path', help='model path')
parser.add_argument('num_iters', nargs='?', type=int, default=1000, help='model run iterations. default=1000')
parser.add_argument('--debug', action='store_true', help='pause execution to allow attaching a debugger.')
parser.add_argument('--profile', action='store_true', help='enable chrome timeline trace profiling.')
parser.add_argument('--symbolic_dims', default={}, type=lambda s: dict(x.split("=") for x in s.split(",")),
help='Comma separated name=value pairs for any symbolic dimensions in the model input. '
'e.g. --symbolic_dims batch=1,seqlen=5. '
'If not provided, the value of 1 will be used for all symbolic dimensions.')
parser = argparse.ArgumentParser(description="Simple ONNX Runtime Test Tool.")
parser.add_argument("model_path", help="model path")
parser.add_argument(
"num_iters",
nargs="?",
type=int,
default=1000,
help="model run iterations. default=1000",
)
parser.add_argument(
"--debug",
action="store_true",
help="pause execution to allow attaching a debugger.",
)
parser.add_argument("--profile", action="store_true", help="enable chrome timeline trace profiling.")
parser.add_argument(
"--symbolic_dims",
default={},
type=lambda s: dict(x.split("=") for x in s.split(",")),
help="Comma separated name=value pairs for any symbolic dimensions in the model input. "
"e.g. --symbolic_dims batch=1,seqlen=5. "
"If not provided, the value of 1 will be used for all symbolic dimensions.",
)
args = parser.parse_args()
exit_code, _, _ = run_model(args.model_path, args.num_iters, args.debug, args.profile, args.symbolic_dims)
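
Per iteration the tool above simply runs the session on randomly generated feeds; the essence as a standalone sketch, with a placeholder model and float32 inputs assumed:

import numpy as np
import onnxruntime as onnxrt

sess = onnxrt.InferenceSession("model.onnx", providers=onnxrt.get_available_providers())

feeds = {}
for input_meta in sess.get_inputs():
    # Replace symbolic/unknown dimensions with 1, mirroring the tool's default.
    shape = [d if isinstance(d, int) else 1 for d in input_meta.shape]
    feeds[input_meta.name] = np.random.rand(*shape).astype(np.float32)

outputs = sess.run(None, feeds)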


@ -12,8 +12,8 @@ try:
from torch.onnx import register_custom_op_symbolic
except ModuleNotFoundError:
raise ModuleNotFoundError(
"This module is only useful in combination with PyTorch. "
"To install PyTorch see https://pytorch.org/.")
"This module is only useful in combination with PyTorch. To install PyTorch see https://pytorch.org/."
)
import torch.onnx.symbolic_helper as sym_help
import torch.onnx.symbolic_registry as sym_registry
@ -44,8 +44,8 @@ def register():
# 'reflection' : onnx::Constant[value={2}]
mode = sym_help._maybe_get_const(mode, "i")
padding_mode = sym_help._maybe_get_const(padding_mode, "i")
mode_str = ['bilinear', 'nearest', 'bicubic'][mode]
padding_mode_str = ['zeros', 'border', 'reflection'][padding_mode]
mode_str = ["bilinear", "nearest", "bicubic"][mode]
padding_mode_str = ["zeros", "border", "reflection"][padding_mode]
align_corners = int(sym_help._maybe_get_const(align_corners, "b"))
# From opset v13 onward, the output shape can be specified with
@ -55,28 +55,36 @@ def register():
# output_shape = input_shape[:2] + grid_shape[1:3]
# g.op(...).setType(input.type().with_sizes(output_shape))
return g.op("com.microsoft::GridSample", input, grid,
mode_s=mode_str,
padding_mode_s=padding_mode_str,
align_corners_i=align_corners)
return g.op(
"com.microsoft::GridSample",
input,
grid,
mode_s=mode_str,
padding_mode_s=padding_mode_str,
align_corners_i=align_corners,
)
_reg(grid_sampler)
def inverse(g, self):
return g.op("com.microsoft::Inverse", self).setType(self.type())
_reg(inverse)
def gelu(g, self):
return g.op("com.microsoft::Gelu", self).setType(self.type())
_reg(gelu)
def triu(g, self, diagonal):
return g.op("com.microsoft::Trilu", self, diagonal, upper_i=1).setType(self.type())
_reg(triu)
def tril(g, self, diagonal):
return g.op("com.microsoft::Trilu", self, diagonal, upper_i=0).setType(self.type())
_reg(tril)
_reg(tril)
def unregister():
@ -86,6 +94,5 @@ def unregister():
for name in _registered_ops:
ns, kind = name.split("::")
for version in sym_help._onnx_stable_opsets:
if (version >= _OPSET_VERSION and
sym_registry.is_registered_op(kind, ns, version)):
if version >= _OPSET_VERSION and sym_registry.is_registered_op(kind, ns, version):
del sym_registry._registry[(ns, version)][kind]
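
A sketch of how these symbolics are typically enabled before export; the module path onnxruntime.tools.pytorch_export_contrib_ops is assumed here, and the example needs a PyTorch version that still ships torch.onnx.symbolic_registry:

import torch
from onnxruntime.tools import pytorch_export_contrib_ops

pytorch_export_contrib_ops.register()  # map inverse/gelu/triu/... onto com.microsoft contrib ops

class InverseModel(torch.nn.Module):
    def forward(self, x):
        return torch.inverse(x)

torch.onnx.export(InverseModel(), torch.randn(3, 3), "inverse.onnx", opset_version=12)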


@ -4,10 +4,12 @@
import flatbuffers
from flatbuffers.compat import import_numpy
np = import_numpy()
class KeyValue(object):
__slots__ = ['_tab']
__slots__ = ["_tab"]
@classmethod
def GetRootAs(cls, buf, offset=0):
@ -20,6 +22,7 @@ class KeyValue(object):
def GetRootAsKeyValue(cls, buf, offset=0):
"""This method is deprecated. Please switch to GetRootAs."""
return cls.GetRootAs(buf, offset)
# KeyValue
def Init(self, buf, pos):
self._tab = flatbuffers.table.Table(buf, pos)
@ -38,19 +41,38 @@ class KeyValue(object):
return self._tab.String(o + self._tab.Pos)
return None
def Start(builder): builder.StartObject(2)
def Start(builder):
builder.StartObject(2)
def KeyValueStart(builder):
"""This method is deprecated. Please switch to Start."""
return Start(builder)
def AddKey(builder, key): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(key), 0)
def AddKey(builder, key):
builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(key), 0)
def KeyValueAddKey(builder, key):
"""This method is deprecated. Please switch to AddKey."""
return AddKey(builder, key)
def AddValue(builder, value): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(value), 0)
def AddValue(builder, value):
builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(value), 0)
def KeyValueAddValue(builder, value):
"""This method is deprecated. Please switch to AddValue."""
return AddValue(builder, value)
def End(builder): return builder.EndObject()
def End(builder):
return builder.EndObject()
def KeyValueEnd(builder):
"""This method is deprecated. Please switch to End."""
return End(builder)
return End(builder)


@ -4,10 +4,12 @@
import flatbuffers
from flatbuffers.compat import import_numpy
np = import_numpy()
class TrtTable(object):
__slots__ = ['_tab']
__slots__ = ["_tab"]
@classmethod
def GetRootAs(cls, buf, offset=0):
@ -20,6 +22,7 @@ class TrtTable(object):
def GetRootAsTrtTable(cls, buf, offset=0):
"""This method is deprecated. Please switch to GetRootAs."""
return cls.GetRootAs(buf, offset)
# TrtTable
def Init(self, buf, pos):
self._tab = flatbuffers.table.Table(buf, pos)
@ -32,6 +35,7 @@ class TrtTable(object):
x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4
x = self._tab.Indirect(x)
from onnxruntime.quantization.CalTableFlatBuffers.KeyValue import KeyValue
obj = KeyValue()
obj.Init(self._tab.Bytes, x)
return obj
@ -49,19 +53,38 @@ class TrtTable(object):
o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
return o == 0
def Start(builder): builder.StartObject(1)
def Start(builder):
builder.StartObject(1)
def TrtTableStart(builder):
"""This method is deprecated. Please switch to Start."""
return Start(builder)
def AddDict(builder, dict): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(dict), 0)
def AddDict(builder, dict):
builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(dict), 0)
def TrtTableAddDict(builder, dict):
"""This method is deprecated. Please switch to AddDict."""
return AddDict(builder, dict)
def StartDictVector(builder, numElems): return builder.StartVector(4, numElems, 4)
def StartDictVector(builder, numElems):
return builder.StartVector(4, numElems, 4)
def TrtTableStartDictVector(builder, numElems):
"""This method is deprecated. Please switch to Start."""
return StartDictVector(builder, numElems)
def End(builder): return builder.EndObject()
def End(builder):
return builder.EndObject()
def TrtTableEnd(builder):
"""This method is deprecated. Please switch to End."""
return End(builder)
return End(builder)
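
These generated helpers back write_calibration_table; a small sketch of reading the resulting table, assuming the default output file name calibration.flatbuffers and the usual flatbuffers-generated accessors (DictLength, Key, Value):

from onnxruntime.quantization.CalTableFlatBuffers.TrtTable import TrtTable

with open("calibration.flatbuffers", "rb") as f:  # file produced by write_calibration_table
    table = TrtTable.GetRootAs(f.read(), 0)

for i in range(table.DictLength()):
    entry = table.Dict(i)
    print(entry.Key().decode(), "->", entry.Value().decode())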


@ -1,5 +1,4 @@
from .quantize import quantize_static, quantize_dynamic
from .quantize import QuantizationMode
from .calibrate import CalibrationDataReader, CalibraterBase, MinMaxCalibrater, create_calibrator, CalibrationMethod
from .quant_utils import QuantType, QuantFormat, write_calibration_table
from .calibrate import CalibraterBase, CalibrationDataReader, CalibrationMethod, MinMaxCalibrater, create_calibrator
from .qdq_quantizer import QDQQuantizer
from .quant_utils import QuantFormat, QuantType, write_calibration_table
from .quantize import QuantizationMode, quantize_dynamic, quantize_static
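
These exports cover the usual post-training quantization entry points; a minimal static-quantization sketch, with placeholder model paths and input name:

import numpy as np
from onnxruntime.quantization import CalibrationDataReader, QuantFormat, QuantType, quantize_static

class RandomDataReader(CalibrationDataReader):
    """Feeds a handful of random batches to the calibrator."""

    def __init__(self, input_name, shape, count=8):
        self._data = iter([{input_name: np.random.rand(*shape).astype(np.float32)} for _ in range(count)])

    def get_next(self):
        return next(self._data, None)

quantize_static(
    "model_fp32.onnx",  # placeholder input model
    "model_int8.onnx",  # placeholder output model
    RandomDataReader("input", (1, 3, 224, 224)),
    quant_format=QuantFormat.QDQ,
    activation_type=QuantType.QInt8,
    weight_type=QuantType.QInt8,
)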


@ -7,27 +7,37 @@
# --------------------------------------------------------------------------
import abc
import itertools
import numpy as np
import onnxruntime
import onnx
from onnx import helper, TensorProto, ModelProto
from onnx import onnx_pb as onnx_proto
from enum import Enum
from pathlib import Path
from .quant_utils import QuantType, model_has_infer_metadata, smooth_distribution, apply_plot, load_model, clone_model_with_shape_infer
import numpy as np
import onnx
from onnx import ModelProto, TensorProto, helper
from onnx import onnx_pb as onnx_proto
import onnxruntime
from .quant_utils import (
QuantType,
apply_plot,
clone_model_with_shape_infer,
load_model,
model_has_infer_metadata,
smooth_distribution,
)
from .registry import QLinearOpsRegistry
class CalibrationMethod(Enum):
MinMax = 0
Entropy = 1
Percentile = 2
class CalibrationDataReader(metaclass=abc.ABCMeta):
@classmethod
def __subclasshook__(cls, subclass):
return (hasattr(subclass, 'get_next') and callable(subclass.get_next) or NotImplemented)
return hasattr(subclass, "get_next") and callable(subclass.get_next) or NotImplemented
@abc.abstractmethod
def get_next(self) -> dict:
@ -36,14 +46,21 @@ class CalibrationDataReader(metaclass=abc.ABCMeta):
class CalibraterBase:
def __init__(self, model, op_types_to_calibrate=[], augmented_model_path='augmented_model.onnx', symmetric=False, use_external_data_format=False):
'''
def __init__(
self,
model,
op_types_to_calibrate=[],
augmented_model_path="augmented_model.onnx",
symmetric=False,
use_external_data_format=False,
):
"""
:param model: ONNX model to calibrate. It can be a ModelProto or a model path
:param op_types_to_calibrate: operator types to calibrate. By default, calibrate all the float32/float16 tensors.
:param augmented_model_path: save augmented model to this path.
:param symmetric: make range of tensor symmetric (central point is 0).
:param use_external_data_format: use external data format to store model which size is >= 2Gb
'''
"""
if isinstance(model, str):
self.model = load_model(Path(model), False)
elif isinstance(model, Path):
@ -51,7 +68,7 @@ class CalibraterBase:
elif isinstance(model, ModelProto):
self.model = model
else:
raise ValueError('model should be either model path or onnx.ModelProto.')
raise ValueError("model should be either model path or onnx.ModelProto.")
self.op_types_to_calibrate = op_types_to_calibrate
self.augmented_model_path = augmented_model_path
@ -64,33 +81,35 @@ class CalibraterBase:
# Create InferenceSession
self.infer_session = None
self.execution_providers = ['CPUExecutionProvider']
self.execution_providers = ["CPUExecutionProvider"]
self._create_inference_session()
def set_execution_providers(self, execution_providers=['CPUExecutionProvider']):
'''
def set_execution_providers(self, execution_providers=["CPUExecutionProvider"]):
"""
reset the execution providers to execute the collect_data. It triggers to re-creating inference session.
'''
"""
self.execution_providers = execution_providers
self._create_inference_session()
def _create_inference_session(self):
'''
"""
create an OnnxRuntime InferenceSession.
'''
"""
sess_options = onnxruntime.SessionOptions()
sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
self.infer_session = onnxruntime.InferenceSession(self.augmented_model_path,
sess_options=sess_options,
providers=self.execution_providers)
self.infer_session = onnxruntime.InferenceSession(
self.augmented_model_path,
sess_options=sess_options,
providers=self.execution_providers,
)
def select_tensors_to_calibrate(self, model):
'''
select all quantization_candidates op type nodes' input/output tensors.
"""
select all quantization_candidates op type nodes' input/output tensors.
returns:
tensors (set): set of tensor name.
value_infos (dict): tensor name to value info.
'''
"""
value_infos = {vi.name: vi for vi in model.graph.value_info}
value_infos.update({ot.name: ot for ot in model.graph.output})
value_infos.update({it.name: it for it in model.graph.input})
@ -104,50 +123,54 @@ class CalibraterBase:
for tensor_name in itertools.chain(node.input, node.output):
if tensor_name in value_infos.keys():
vi = value_infos[tensor_name]
if vi.type.HasField('tensor_type') and (
vi.type.tensor_type.elem_type in tensor_type_to_calibrate) and (
tensor_name not in initializer):
if (
vi.type.HasField("tensor_type")
and (vi.type.tensor_type.elem_type in tensor_type_to_calibrate)
and (tensor_name not in initializer)
):
tensors_to_calibrate.add(tensor_name)
return tensors_to_calibrate, value_infos
def get_augment_model(self):
'''
"""
return: augmented onnx model
'''
"""
return self.augment_model
def augment_graph(self):
'''
"""
abstract method: augment the input model to prepare for collecting data. It will:
1. save augmented model to augmented_model_path.
2. set the self.augment_model
'''
"""
raise NotImplementedError
def collect_data(self, data_reader: CalibrationDataReader):
'''
"""
abstract method: collect the tensors that will be used for range computation. It can be called multiple times.
'''
"""
raise NotImplementedError
def compute_range(self, data_reader: CalibrationDataReader):
'''
"""
abstract method: compute the [min, max] range for the tensors to calibrate based on the collected data.
'''
"""
raise NotImplementedError
class MinMaxCalibrater(CalibraterBase):
def __init__(self,
model,
op_types_to_calibrate=[],
augmented_model_path='augmented_model.onnx',
symmetric=False,
use_external_data_format=False,
moving_average=False,
averaging_constant=0.01):
'''
def __init__(
self,
model,
op_types_to_calibrate=[],
augmented_model_path="augmented_model.onnx",
symmetric=False,
use_external_data_format=False,
moving_average=False,
averaging_constant=0.01,
):
"""
:param model: ONNX model to calibrate. It can be a ModelProto or a model path
:param op_types_to_calibrate: operator types to calibrate. By default, calibrate all the float32/float16 tensors.
:param augmented_model_path: save augmented model to this path.
@ -155,8 +178,14 @@ class MinMaxCalibrater(CalibraterBase):
:param use_external_data_format: use external data format to store model which size is >= 2Gb
:param moving_average: compute the moving average of the minimum and maximum values instead of the global minimum and maximum.
:param averaging_constant: constant smoothing factor to use when computing the moving average.
'''
super(MinMaxCalibrater, self).__init__(model, op_types_to_calibrate, augmented_model_path, symmetric, use_external_data_format)
"""
super(MinMaxCalibrater, self).__init__(
model,
op_types_to_calibrate,
augmented_model_path,
symmetric,
use_external_data_format,
)
self.intermediate_outputs = []
self.calibrate_tensors_range = None
self.num_model_outputs = len(self.model.graph.output)
@ -167,16 +196,16 @@ class MinMaxCalibrater(CalibraterBase):
self.averaging_constant = averaging_constant
def augment_graph(self):
'''
"""
Adds ReduceMin and ReduceMax nodes to all quantization_candidates op type nodes in
model and ensures their outputs are stored as part of the graph output
:return: augmented ONNX model
'''
"""
model = clone_model_with_shape_infer(self.model)
added_nodes = []
added_outputs = []
tensors, value_infos = self.select_tensors_to_calibrate(model)
tensors, value_infos = self.select_tensors_to_calibrate(model)
for tensor in tensors:
@ -193,22 +222,38 @@ class MinMaxCalibrater(CalibraterBase):
shape = (1,) if len(dim) == 1 else tuple(1 for i in range(len(dim)))
# Adding ReduceMin nodes
reduce_min_name = tensor + '_ReduceMin'
reduce_min_node = onnx.helper.make_node('ReduceMin', [tensor], [tensor + '_ReduceMin'], reduce_min_name, keepdims=keepdims)
reduce_min_name = tensor + "_ReduceMin"
reduce_min_node = onnx.helper.make_node(
"ReduceMin",
[tensor],
[tensor + "_ReduceMin"],
reduce_min_name,
keepdims=keepdims,
)
added_nodes.append(reduce_min_node)
added_outputs.append(helper.make_tensor_value_info(reduce_min_node.output[0], TensorProto.FLOAT, shape))
# Adding ReduceMax nodes
reduce_max_name = tensor + '_ReduceMax'
reduce_max_node = onnx.helper.make_node('ReduceMax', [tensor], [tensor + '_ReduceMax'], reduce_max_name, keepdims=keepdims)
reduce_max_name = tensor + "_ReduceMax"
reduce_max_node = onnx.helper.make_node(
"ReduceMax",
[tensor],
[tensor + "_ReduceMax"],
reduce_max_name,
keepdims=keepdims,
)
added_nodes.append(reduce_max_node)
added_outputs.append(helper.make_tensor_value_info(reduce_max_node.output[0], TensorProto.FLOAT, shape))
model.graph.node.extend(added_nodes)
model.graph.output.extend(added_outputs)
onnx.save(model, self.augmented_model_path, save_as_external_data=self.use_external_data_format)
onnx.save(
model,
self.augmented_model_path,
save_as_external_data=self.use_external_data_format,
)
self.augment_model = model
def clear_collected_data(self):
@ -231,7 +276,7 @@ class MinMaxCalibrater(CalibraterBase):
if not old_range:
return new_range
for key, value in old_range.items():
for key, value in old_range.items():
if self.moving_average:
min_value = value[0] + self.averaging_constant * (new_range[key][0] - value[0])
max_value = value[1] + self.averaging_constant * (new_range[key][1] - value[1])
@ -243,10 +288,10 @@ class MinMaxCalibrater(CalibraterBase):
return new_range
def compute_range(self):
'''
"""
Compute the min-max range of tensor
:return: dictionary mapping: {added node names: (ReduceMin, ReduceMax) pairs }
'''
"""
if len(self.intermediate_outputs) == 0:
return self.calibrate_tensors_range
@ -260,21 +305,22 @@ class MinMaxCalibrater(CalibraterBase):
for d in output_dicts_list:
for k, v in d.items():
merged_output_dict.setdefault(k, []).append(v)
added_output_names = output_names[self.num_model_outputs:]
added_output_names = output_names[self.num_model_outputs :]
calibrate_tensor_names = [
added_output_names[i].rpartition('_')[0] for i in range(0, len(added_output_names), 2)
] #output names
added_output_names[i].rpartition("_")[0] for i in range(0, len(added_output_names), 2)
] # output names
merged_added_output_dict = dict(
(i, merged_output_dict[i]) for i in merged_output_dict if i not in self.model_original_outputs)
(i, merged_output_dict[i]) for i in merged_output_dict if i not in self.model_original_outputs
)
pairs = []
for i in range(0, len(added_output_names), 2):
min_value = 0
max_value = 0
if self.moving_average:
min_value_array = np.mean(merged_added_output_dict[added_output_names[i]], axis = 0)
max_value_array = np.mean(merged_added_output_dict[added_output_names[i + 1]], axis = 0)
min_value_array = np.mean(merged_added_output_dict[added_output_names[i]], axis=0)
max_value_array = np.mean(merged_added_output_dict[added_output_names[i + 1]], axis=0)
else:
min_value_array = min(merged_added_output_dict[added_output_names[i]])
max_value_array = max(merged_added_output_dict[added_output_names[i + 1]])
@ -293,22 +339,25 @@ class MinMaxCalibrater(CalibraterBase):
if self.calibrate_tensors_range:
self.calibrate_tensors_range = self.merge_range(self.calibrate_tensors_range, new_calibrate_tensors_range)
else:
self.calibrate_tensors_range = new_calibrate_tensors_range
self.calibrate_tensors_range = new_calibrate_tensors_range
return self.calibrate_tensors_range
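
Driving a calibrator directly, instead of through quantize_static, follows the collect_data/compute_range contract above; a minimal sketch with a placeholder model, where the calibrate_method keyword of create_calibrator is an assumption:

import numpy as np
from onnxruntime.quantization import CalibrationDataReader, CalibrationMethod, create_calibrator

class RandomDataReader(CalibrationDataReader):
    def __init__(self, input_name, shape, count=8):
        self._data = iter([{input_name: np.random.rand(*shape).astype(np.float32)} for _ in range(count)])

    def get_next(self):
        return next(self._data, None)

calibrator = create_calibrator("model_fp32.onnx", calibrate_method=CalibrationMethod.MinMax)  # placeholder model
calibrator.collect_data(RandomDataReader("input", (1, 3, 224, 224)))
tensor_ranges = calibrator.compute_range()  # {tensor name: (min value, max value)}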
class HistogramCalibrater(CalibraterBase):
def __init__(self,
model,
op_types_to_calibrate=[],
augmented_model_path='augmented_model.onnx',
use_external_data_format=False,
method='percentile',
symmetric=False,
num_bins=128,
num_quantized_bins=2048,
percentile=99.999):
'''
def __init__(
self,
model,
op_types_to_calibrate=[],
augmented_model_path="augmented_model.onnx",
use_external_data_format=False,
method="percentile",
symmetric=False,
num_bins=128,
num_quantized_bins=2048,
percentile=99.999,
):
"""
:param model: ONNX model to calibrate. It can be a ModelProto or a model path
:param op_types_to_calibrate: operator types to calibrate. By default, calibrate all the float32/float16 tensors.
:param augmented_model_path: save augmented model to this path.
@ -318,8 +367,10 @@ class HistogramCalibrater(CalibraterBase):
:param num_bins: number of bins to create a new histogram for collecting tensor values.
:param num_quantized_bins: number of quantized bins. Default 128.
:param percentile: A float number between [0, 100]. Default 99.99.
'''
super(HistogramCalibrater, self).__init__(model, op_types_to_calibrate, augmented_model_path, use_external_data_format)
"""
super(HistogramCalibrater, self).__init__(
model, op_types_to_calibrate, augmented_model_path, use_external_data_format
)
self.intermediate_outputs = []
self.calibrate_tensors_range = None
self.num_model_outputs = len(self.model.graph.output)
@ -332,31 +383,35 @@ class HistogramCalibrater(CalibraterBase):
self.percentile = percentile
def augment_graph(self):
'''
"""
make all quantization_candidates op type nodes as part of the graph output.
:return: augmented ONNX model
'''
"""
model = clone_model_with_shape_infer(self.model)
added_nodes = []
added_outputs = []
tensors, value_infos = self.select_tensors_to_calibrate(model)
tensors, value_infos = self.select_tensors_to_calibrate(model)
for tensor in tensors:
added_outputs.append(value_infos[tensor])
model.graph.node.extend(added_nodes)
model.graph.output.extend(added_outputs)
onnx.save(model, self.augmented_model_path, save_as_external_data=self.use_external_data_format)
onnx.save(
model,
self.augmented_model_path,
save_as_external_data=self.use_external_data_format,
)
self.augment_model = model
def clear_collected_data(self):
self.intermediate_outputs = []
def collect_data(self, data_reader: CalibrationDataReader):
'''
Entropy Calibrator collects operators' tensors as well as generates tensor histogram for each operator.
'''
"""
Entropy Calibrator collects operators' tensors as well as generates tensor histogram for each operator.
"""
while True:
inputs = data_reader.get_next()
if not inputs:
@ -379,36 +434,41 @@ class HistogramCalibrater(CalibraterBase):
clean_merged_dict = dict((i, merged_dict[i]) for i in merged_dict if i not in self.model_original_outputs)
if not self.collector:
self.collector = HistogramCollector(method=self.method,
symmetric=self.symmetric,
num_bins=self.num_bins,
num_quantized_bins=self.num_quantized_bins,
percentile=self.percentile)
self.collector = HistogramCollector(
method=self.method,
symmetric=self.symmetric,
num_bins=self.num_bins,
num_quantized_bins=self.num_quantized_bins,
percentile=self.percentile,
)
self.collector.collect(clean_merged_dict)
self.clear_collected_data()
def compute_range(self):
'''
"""
Compute the min-max range of tensor
:return: dictionary mapping: {tensor name: (min value, max value)}
'''
"""
if not self.collector:
raise ValueError("No collector created and can't generate calibration data.")
return self.collector.compute_collection_result()
class EntropyCalibrater(HistogramCalibrater):
def __init__(self,
model,
op_types_to_calibrate=[],
augmented_model_path='augmented_model.onnx',
use_external_data_format=False,
method='entropy',
symmetric=False,
num_bins=128,
num_quantized_bins=128):
'''
def __init__(
self,
model,
op_types_to_calibrate=[],
augmented_model_path="augmented_model.onnx",
use_external_data_format=False,
method="entropy",
symmetric=False,
num_bins=128,
num_quantized_bins=128,
):
"""
:param model: ONNX model to calibrate. It can be a ModelProto or a model path
:param op_types_to_calibrate: operator types to calibrate. By default, calibrate all the float32/float16 tensors.
:param augmented_model_path: save augmented model to this path.
@ -417,21 +477,32 @@ class EntropyCalibrater(HistogramCalibrater):
:param symmetric: make range of tensor symmetric (central point is 0).
:param num_bins: number of bins to create a new histogram for collecting tensor values.
:param num_quantized_bins: number of quantized bins. Default 128.
'''
super(EntropyCalibrater, self).__init__(model, op_types_to_calibrate, augmented_model_path, use_external_data_format,
method=method, symmetric=symmetric, num_bins=num_bins, num_quantized_bins=num_quantized_bins)
"""
super(EntropyCalibrater, self).__init__(
model,
op_types_to_calibrate,
augmented_model_path,
use_external_data_format,
method=method,
symmetric=symmetric,
num_bins=num_bins,
num_quantized_bins=num_quantized_bins,
)
class PercentileCalibrater(HistogramCalibrater):
def __init__(self,
model,
op_types_to_calibrate=[],
augmented_model_path='augmented_model.onnx',
use_external_data_format=False,
method='percentile',
symmetric=False,
num_bins=2048,
percentile=99.999):
'''
def __init__(
self,
model,
op_types_to_calibrate=[],
augmented_model_path="augmented_model.onnx",
use_external_data_format=False,
method="percentile",
symmetric=False,
num_bins=2048,
percentile=99.999,
):
"""
:param model: ONNX model to calibrate. It can be a ModelProto or a model path
:param op_types_to_calibrate: operator types to calibrate. By default, calibrate all the float32/float16 tensors.
:param augmented_model_path: save augmented model to this path.
@ -440,9 +511,18 @@ class PercentileCalibrater(HistogramCalibrater):
:param symmetric: make range of tensor symmetric (central point is 0).
:param num_quantized_bins: number of quantized bins. Default 128.
:param percentile: A float number between [0, 100]. Default 99.99.
'''
super(PercentileCalibrater, self).__init__(model, op_types_to_calibrate, augmented_model_path, use_external_data_format,
method=method, symmetric=symmetric, num_bins=num_bins, percentile=percentile)
"""
super(PercentileCalibrater, self).__init__(
model,
op_types_to_calibrate,
augmented_model_path,
use_external_data_format,
method=method,
symmetric=symmetric,
num_bins=num_bins,
percentile=percentile,
)
class CalibrationDataCollector(metaclass=abc.ABCMeta):
"""
@ -453,18 +533,19 @@ class CalibrationDataCollector(metaclass=abc.ABCMeta):
def collect(self, name_to_arr):
"""
Generate informative data based on given data.
name_to_arr : dict
tensor name to NDArray data
name_to_arr : dict
tensor name to NDArray data
"""
raise NotImplementedError
@abc.abstractmethod
def compute_collection_result(self):
"""
Get the optimal result among collection data.
Get the optimal result among collection data.
"""
raise NotImplementedError
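For orientation, a minimal sketch of a custom collector built against this abstract interface; the class name and the running min/max logic are illustrative only, not part of this PR:

import numpy as np

class MinMaxCollector(CalibrationDataCollector):  # hypothetical example subclass
    def __init__(self):
        self.ranges = {}  # tensor name -> (min, max)

    def collect(self, name_to_arr):
        # Keep a running min/max for every tensor seen so far.
        for name, arr in name_to_arr.items():
            arr = np.asarray(arr)
            lo, hi = float(arr.min()), float(arr.max())
            if name in self.ranges:
                old_lo, old_hi = self.ranges[name]
                lo, hi = min(lo, old_lo), max(hi, old_hi)
            self.ranges[name] = (lo, hi)

    def compute_collection_result(self):
        if not self.ranges:
            raise ValueError("No data collected yet.")
        return self.ranges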
class HistogramCollector(CalibrationDataCollector):
"""
Collecting histogram for each tensor. Percentile and Entropy method are supported.
@ -473,12 +554,13 @@ class HistogramCollector(CalibrationDataCollector):
ref: https://docs.nvidia.com/deeplearning/tensorrt/pytorch-quantization-toolkit/docs/_modules/
pytorch_quantization/calib/histogram.html
"""
def __init__(self, method, symmetric, num_bins, num_quantized_bins, percentile):
self.histogram_dict = {}
self.method = method
self.symmetric = symmetric
self.num_bins = num_bins
self.num_quantized_bins= num_quantized_bins
self.num_quantized_bins = num_quantized_bins
self.percentile = percentile
def get_histogram_dict(self):
@ -489,24 +571,24 @@ class HistogramCollector(CalibrationDataCollector):
# TODO: Currently we have different collect() for entropy and percentile method respectively.
# Need unified collect in the future.
if self.method == 'entropy':
if self.method == "entropy":
return self.collect_value(name_to_arr)
elif self.method == 'percentile':
elif self.method == "percentile":
if self.symmetric:
return self.collect_absolute_value(name_to_arr)
else:
return self.collect_value(name_to_arr)
else:
raise ValueError('Only \'entropy\' or \'percentile\' method are supported')
raise ValueError("Only 'entropy' or 'percentile' method are supported")
def collect_absolute_value(self, name_to_arr):
'''
"""
Collect histogram on absolute value
'''
"""
for tensor, data_arr in name_to_arr.items():
data_arr = np.asarray(data_arr)
data_arr = data_arr.flatten()
data_arr = np.absolute(data_arr) # only consider absolute value
data_arr = np.absolute(data_arr) # only consider absolute value
if tensor not in self.histogram_dict:
# first time it uses num_bins to compute histogram.
@ -524,13 +606,13 @@ class HistogramCollector(CalibrationDataCollector):
new_bin_edges = np.arange(old_hist_edges[-1] + width, temp_amax + width, width)
old_hist_edges = np.hstack((old_hist_edges, new_bin_edges))
hist, hist_edges = np.histogram(data_arr, bins=old_hist_edges)
hist[:len(old_hist)] += old_hist
hist[: len(old_hist)] += old_hist
self.histogram_dict[tensor] = (hist, hist_edges)
def collect_value(self, name_to_arr):
'''
"""
Collect histogram on real value
'''
"""
for tensor, data_arr in name_to_arr.items():
data_arr = np.asarray(data_arr)
data_arr = data_arr.flatten()
@ -546,10 +628,18 @@ class HistogramCollector(CalibrationDataCollector):
if tensor in self.histogram_dict:
old_histogram = self.histogram_dict[tensor]
self.histogram_dict[tensor] = self.merge_histogram(old_histogram, data_arr, min_value, max_value, threshold)
self.histogram_dict[tensor] = self.merge_histogram(
old_histogram, data_arr, min_value, max_value, threshold
)
else:
hist, hist_edges = np.histogram(data_arr, self.num_bins, range=(-threshold, threshold))
self.histogram_dict[tensor] = (hist, hist_edges, min_value, max_value, threshold)
self.histogram_dict[tensor] = (
hist,
hist_edges,
min_value,
max_value,
threshold,
)
def merge_histogram(self, old_histogram, data_arr, new_min, new_max, new_threshold):
@ -557,7 +647,13 @@ class HistogramCollector(CalibrationDataCollector):
if new_threshold <= old_threshold:
new_hist, _ = np.histogram(data_arr, len(old_hist), range=(-old_threshold, old_threshold))
return (new_hist + old_hist, old_hist_edges, min(old_min, new_min), max(old_max, new_max), old_threshold)
return (
new_hist + old_hist,
old_hist_edges,
min(old_min, new_min),
max(old_max, new_max),
old_threshold,
)
else:
if old_threshold == 0:
hist, hist_edges = np.histogram(data_arr, len(old_hist), range=(-new_threshold, new_threshold))
@ -565,24 +661,30 @@ class HistogramCollector(CalibrationDataCollector):
else:
old_num_bins = len(old_hist)
old_stride = 2 * old_threshold / old_num_bins
half_increased_bins = int((new_threshold - old_threshold) // old_stride + 1)
half_increased_bins = int((new_threshold - old_threshold) // old_stride + 1)
new_num_bins = old_num_bins + 2 * half_increased_bins
new_threshold = half_increased_bins * old_stride + old_threshold
hist, hist_edges = np.histogram(data_arr, new_num_bins, range=(-new_threshold, new_threshold))
hist[half_increased_bins:new_num_bins-half_increased_bins] += old_hist
return (hist, hist_edges, min(old_min, new_min), max(old_max, new_max), new_threshold)
hist[half_increased_bins : new_num_bins - half_increased_bins] += old_hist
return (
hist,
hist_edges,
min(old_min, new_min),
max(old_max, new_max),
new_threshold,
)
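A standalone numpy sketch of the bin-expansion branch above; the values are illustrative, and the real method also threads min/max and the threshold through the returned tuple:

import numpy as np

old_hist, old_hist_edges = np.histogram([0.5, -0.5, 1.0], bins=4, range=(-2.0, 2.0))
old_threshold = 2.0
old_stride = 2 * old_threshold / len(old_hist)                                 # 1.0
new_data = np.array([3.3, -3.0], dtype=np.float32)
new_threshold = float(np.abs(new_data).max())                                  # 3.3, exceeds the old range
half_increased_bins = int((new_threshold - old_threshold) // old_stride + 1)   # 2
new_num_bins = len(old_hist) + 2 * half_increased_bins                         # 8
new_threshold = half_increased_bins * old_stride + old_threshold               # snapped to 4.0
hist, hist_edges = np.histogram(new_data, new_num_bins, range=(-new_threshold, new_threshold))
hist[half_increased_bins : new_num_bins - half_increased_bins] += old_hist     # old counts land in the middle slice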
def compute_collection_result(self):
if not self.histogram_dict or len(self.histogram_dict) == 0:
raise ValueError("Histogram has not been collected. Please run collect() first.")
print("Finding optimal threshold for each tensor using {} algorithm ...".format(self.method))
if self.method == 'entropy':
if self.method == "entropy":
return self.compute_entropy()
elif self.method == 'percentile':
elif self.method == "percentile":
return self.compute_percentile()
else:
raise ValueError('Only \'entropy\' or \'percentile\' method are supported')
raise ValueError("Only 'entropy' or 'percentile' method are supported")
def compute_percentile(self):
if self.percentile < 0 or self.percentile > 100:
@ -591,7 +693,7 @@ class HistogramCollector(CalibrationDataCollector):
histogram_dict = self.histogram_dict
percentile = self.percentile
thresholds_dict = {} # per tensor thresholds
thresholds_dict = {} # per tensor thresholds
print("Number of tensors : {}".format(len(histogram_dict)))
print("Number of histogram bins : {}".format(self.num_bins))
@ -601,15 +703,21 @@ class HistogramCollector(CalibrationDataCollector):
hist = histogram[0]
hist_edges = histogram[1]
total = hist.sum()
cdf = np.cumsum(hist/total)
cdf = np.cumsum(hist / total)
if self.symmetric:
idx_right = np.searchsorted(cdf, percentile / 100.0)
thresholds_dict[tensor] = (-float(hist_edges[idx_right]), float(hist_edges[idx_right]))
thresholds_dict[tensor] = (
-float(hist_edges[idx_right]),
float(hist_edges[idx_right]),
)
else:
percent_to_cut_one_side = (100.0 - percentile) / 200.0
idx_right = np.searchsorted(cdf, 1.0 - percent_to_cut_one_side)
idx_left = np.searchsorted(cdf, percent_to_cut_one_side)
thresholds_dict[tensor] = (float(hist_edges[idx_left]), float(hist_edges[idx_right]))
thresholds_dict[tensor] = (
float(hist_edges[idx_left]),
float(hist_edges[idx_right]),
)
# Plot histogram for debug only
if False:
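A small numpy sketch of the symmetric percentile lookup performed in this loop, run on a synthetic absolute-value histogram:

import numpy as np

data = np.abs(np.random.randn(10000)).astype(np.float32)
hist, hist_edges = np.histogram(data, bins=2048)
cdf = np.cumsum(hist / hist.sum())
idx_right = np.searchsorted(cdf, 99.999 / 100.0)
threshold = (-float(hist_edges[idx_right]), float(hist_edges[idx_right]))
print(threshold)  # symmetric range that covers ~99.999% of the collected values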
@ -621,10 +729,14 @@ class HistogramCollector(CalibrationDataCollector):
histogram_dict = self.histogram_dict
num_quantized_bins = self.num_quantized_bins
thresholds_dict = {} # per tensor thresholds
thresholds_dict = {} # per tensor thresholds
print("Number of tensors : {}".format(len(histogram_dict)))
print("Number of histogram bins : {} (The number may increase depends on the data it collects)".format(self.num_bins))
print(
"Number of histogram bins : {} (The number may increase depends on the data it collects)".format(
self.num_bins
)
)
print("Number of quantized bins : {}".format(self.num_quantized_bins))
for tensor, histogram in histogram_dict.items():
@ -643,17 +755,18 @@ class HistogramCollector(CalibrationDataCollector):
`q` is a truncated version of the original distribution.
Ref: http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf
"""
from scipy.stats import entropy
import copy
from scipy.stats import entropy
hist = histogram[0]
hist_edges = histogram[1]
num_bins = hist.size
zero_bin_index = num_bins // 2
num_half_quantized_bin = num_quantized_bins // 2
kl_divergence = np.zeros(zero_bin_index - num_half_quantized_bin + 1)
thresholds = [(0, 0) for i in range(kl_divergence.size)]
thresholds = [(0, 0) for i in range(kl_divergence.size)]
# <------------ num bins ---------------->
# <--- quantized bins ---->
@ -670,33 +783,36 @@ class HistogramCollector(CalibrationDataCollector):
# start index end index (end of iteration)
for i in range(num_half_quantized_bin, zero_bin_index + 1, 1):
start_index = zero_bin_index - i
start_index = zero_bin_index - i
end_index = zero_bin_index + i + 1 if (zero_bin_index + i + 1) <= num_bins else num_bins
thresholds[i - num_half_quantized_bin] = (float(hist_edges[start_index]), float(hist_edges[end_index]))
thresholds[i - num_half_quantized_bin] = (
float(hist_edges[start_index]),
float(hist_edges[end_index]),
)
sliced_distribution = copy.deepcopy(hist[start_index:end_index])
# reference distribution p
p = sliced_distribution.copy() # a copy of np array
left_outliers_count = sum(hist[:start_index])
p = sliced_distribution.copy() # a copy of np array
left_outliers_count = sum(hist[:start_index])
right_outliers_count = sum(hist[end_index:])
p[0] += left_outliers_count
p[-1] += right_outliers_count
# nonzeros[i] indicates whether p[i] is non-zero
nonzeros = (p != 0).astype(np.int64)
# quantize p.size bins into quantized bins (default 128 bins)
# quantize p.size bins into quantized bins (default 128 bins)
quantized_bins = np.zeros(num_quantized_bins, dtype=np.int64)
num_merged_bins = sliced_distribution.size // num_quantized_bins
# merge bins into quantized bins
for index in range(num_quantized_bins):
start = index * num_merged_bins
start = index * num_merged_bins
end = start + num_merged_bins
quantized_bins[index] = sum(sliced_distribution[start:end])
quantized_bins[-1] += sum(sliced_distribution[num_quantized_bins * num_merged_bins:])
quantized_bins[index] = sum(sliced_distribution[start:end])
quantized_bins[-1] += sum(sliced_distribution[num_quantized_bins * num_merged_bins :])
# in order to compare p and q, we need to make length of q equals to length of p
# expand quantized bins into p.size bins
@ -708,63 +824,71 @@ class HistogramCollector(CalibrationDataCollector):
norm = sum(nonzeros[start:end])
if norm != 0:
q[start:end] = float(quantized_bins[index]) / float(norm)
p = smooth_distribution(p)
q = smooth_distribution(q)
if isinstance(q, np.ndarray):
kl_divergence[i - num_half_quantized_bin] = entropy(p, q)
else:
kl_divergence[i - num_half_quantized_bin] = float('inf')
kl_divergence[i - num_half_quantized_bin] = float("inf")
min_kl_divergence_idx = np.argmin(kl_divergence)
optimal_threshold = thresholds[min_kl_divergence_idx]
optimal_threshold = thresholds[min_kl_divergence_idx]
return optimal_threshold
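The comparison at the heart of the loop above boils down to scipy.stats.entropy on the smoothed distributions. A toy, self-contained sketch that uses explicit normalization in place of the quant_utils smooth_distribution helper:

import numpy as np
from scipy.stats import entropy

p = np.array([10, 0, 5, 3, 0, 2], dtype=np.float64)  # reference (sliced) histogram
q = np.array([8, 1, 4, 4, 1, 2], dtype=np.float64)   # quantized-then-expanded histogram
eps = 1e-4
p = (p + eps) / (p + eps).sum()                       # crude smoothing so no bin is exactly zero
q = (q + eps) / (q + eps).sum()
print(entropy(p, q))                                  # KL(p || q); the loop keeps the threshold that minimizes this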
def create_calibrator(model,
op_types_to_calibrate=[],
augmented_model_path='augmented_model.onnx',
calibrate_method=CalibrationMethod.MinMax,
use_external_data_format=False,
extra_options={}):
def create_calibrator(
model,
op_types_to_calibrate=[],
augmented_model_path="augmented_model.onnx",
calibrate_method=CalibrationMethod.MinMax,
use_external_data_format=False,
extra_options={},
):
if calibrate_method == CalibrationMethod.MinMax:
# default settings for min-max algorithm
symmetric = False if 'symmetric' not in extra_options else extra_options['symmetric']
moving_average = False if 'moving_average' not in extra_options else extra_options['moving_average']
averaging_constant = 0.01 if 'averaging_constant' not in extra_options else extra_options['averaging_constant']
symmetric = False if "symmetric" not in extra_options else extra_options["symmetric"]
moving_average = False if "moving_average" not in extra_options else extra_options["moving_average"]
averaging_constant = 0.01 if "averaging_constant" not in extra_options else extra_options["averaging_constant"]
return MinMaxCalibrater(
model, op_types_to_calibrate, augmented_model_path,
model,
op_types_to_calibrate,
augmented_model_path,
use_external_data_format=use_external_data_format,
symmetric=symmetric,
moving_average=moving_average,
averaging_constant=averaging_constant
averaging_constant=averaging_constant,
)
elif calibrate_method == CalibrationMethod.Entropy:
# default settings for entropy algorithm
num_bins = 128 if 'num_bins' not in extra_options else extra_options['num_bins']
num_quantized_bins = 128 if 'num_quantized_bins' not in extra_options else extra_options['num_quantized_bins']
symmetric = False if 'symmetric' not in extra_options else extra_options['symmetric']
num_bins = 128 if "num_bins" not in extra_options else extra_options["num_bins"]
num_quantized_bins = 128 if "num_quantized_bins" not in extra_options else extra_options["num_quantized_bins"]
symmetric = False if "symmetric" not in extra_options else extra_options["symmetric"]
return EntropyCalibrater(
model, op_types_to_calibrate, augmented_model_path,
model,
op_types_to_calibrate,
augmented_model_path,
use_external_data_format=use_external_data_format,
symmetric=symmetric,
num_bins=num_bins,
num_quantized_bins=num_quantized_bins
num_quantized_bins=num_quantized_bins,
)
elif calibrate_method == CalibrationMethod.Percentile:
# default settings for percentile algorithm
num_bins = 2048 if 'num_bins' not in extra_options else extra_options['num_bins']
percentile = 99.999 if 'percentile' not in extra_options else extra_options['percentile']
symmetric = True if 'symmetric' not in extra_options else extra_options['symmetric']
num_bins = 2048 if "num_bins" not in extra_options else extra_options["num_bins"]
percentile = 99.999 if "percentile" not in extra_options else extra_options["percentile"]
symmetric = True if "symmetric" not in extra_options else extra_options["symmetric"]
return PercentileCalibrater(
model, op_types_to_calibrate, augmented_model_path,
model,
op_types_to_calibrate,
augmented_model_path,
use_external_data_format=use_external_data_format,
symmetric=symmetric,
num_bins=num_bins,
percentile=percentile
percentile=percentile,
)
raise ValueError('Unsupported calibration method {}'.format(calibrate_method))
raise ValueError("Unsupported calibration method {}".format(calibrate_method))

View file

@ -1,8 +1,11 @@
import onnx
import itertools
from .quant_utils import find_by_name, attribute_to_kwarg
from pathlib import Path
import onnx
from .quant_utils import attribute_to_kwarg, find_by_name
class ONNXModel:
def __init__(self, model):
self.model = model
@ -121,19 +124,19 @@ class ONNXModel:
return output_name_to_node[input]
def find_node_by_name(self, node_name, new_nodes_list, graph):
'''
"""
Find out if a node exists in a graph or a node is in the
new set of nodes created during quantization. Return the node found.
'''
graph_nodes_list = list(graph.node) #deep copy
"""
graph_nodes_list = list(graph.node) # deep copy
graph_nodes_list.extend(new_nodes_list)
node = find_by_name(node_name, graph_nodes_list)
return node
def find_nodes_by_initializer(self, graph, initializer):
'''
"""
Find all nodes with given initializer as an input.
'''
"""
nodes = []
for node in graph.node:
for node_input in node.input:
@ -174,19 +177,19 @@ class ONNXModel:
kwargs.update(kv)
node = onnx.helper.make_node(node.op_type, node.input, node.output, name=node.name, **kwargs)
if node.op_type == 'Gemm':
if node.op_type == "Gemm":
alpha = 1.0
beta = 1.0
transA = 0
transB = 0
for attr in node.attribute:
if attr.name == 'alpha':
if attr.name == "alpha":
alpha = onnx.helper.get_attribute_value(attr)
elif attr.name == 'beta':
elif attr.name == "beta":
beta = onnx.helper.get_attribute_value(attr)
elif attr.name == 'transA':
elif attr.name == "transA":
transA = onnx.helper.get_attribute_value(attr)
elif attr.name == 'transB':
elif attr.name == "transB":
transB = onnx.helper.get_attribute_value(attr)
if alpha == 1.0 and beta == 1.0 and transA == 0:
inputB = node.input[1]
@ -204,25 +207,30 @@ class ONNXModel:
break
Bs_graph.initializer.extend([B_trans])
else:
inputB += '_Transposed'
transpose_node = onnx.helper.make_node('Transpose',
inputs=[node.input[1]],
outputs=[inputB],
name=node.name + '_Transpose' if node.name != "" else "")
inputB += "_Transposed"
transpose_node = onnx.helper.make_node(
"Transpose",
inputs=[node.input[1]],
outputs=[inputB],
name=node.name + "_Transpose" if node.name != "" else "",
)
new_nodes.append(transpose_node)
matmul_node = onnx.helper.make_node(
'MatMul',
"MatMul",
inputs=[node.input[0], inputB],
outputs=[node.output[0] + ('_MatMul' if len(node.input) > 2 else '')],
name=node.name + '_MatMul' if node.name != "" else "")
outputs=[node.output[0] + ("_MatMul" if len(node.input) > 2 else "")],
name=node.name + "_MatMul" if node.name != "" else "",
)
new_nodes.append(matmul_node)
if len(node.input) > 2:
add_node = onnx.helper.make_node('Add',
inputs=[node.output[0] + '_MatMul', node.input[2]],
outputs=node.output,
name=node.name + '_Add' if node.name != "" else "")
add_node = onnx.helper.make_node(
"Add",
inputs=[node.output[0] + "_MatMul", node.input[2]],
outputs=node.output,
name=node.name + "_Add" if node.name != "" else "",
)
new_nodes.append(add_node)
# unsupported
@ -233,7 +241,7 @@ class ONNXModel:
else:
new_nodes.append(node)
graph.ClearField('node')
graph.ClearField("node")
graph.node.extend(new_nodes)
graph_path.pop()
return graph
@ -243,14 +251,16 @@ class ONNXModel:
ONNXModel.__replace_gemm_with_matmul(graph_path)
def save_model_to_file(self, output_path, use_external_data_format=False):
'''
"""
Save model to external data, which is needed for model size > 2GB
'''
"""
self.topological_sort()
if use_external_data_format:
onnx.external_data_helper.convert_model_to_external_data(self.model,
all_tensors_to_one_file=True,
location=Path(output_path).name + ".data")
onnx.external_data_helper.convert_model_to_external_data(
self.model,
all_tensors_to_one_file=True,
location=Path(output_path).name + ".data",
)
onnx.save_model(self.model, output_path)
@staticmethod
@ -278,12 +288,15 @@ class ONNXModel:
def remove_unused_constant(self):
input_name_to_nodes = self.input_name_to_nodes()
#remove unused constant
# remove unused constant
unused_nodes = []
nodes = self.nodes()
for node in nodes:
if node.op_type == "Constant" and not self.is_graph_output(
node.output[0]) and node.output[0] not in input_name_to_nodes:
if (
node.op_type == "Constant"
and not self.is_graph_output(node.output[0])
and node.output[0] not in input_name_to_nodes
):
unused_nodes.append(node)
self.remove_nodes(unused_nodes)
@ -308,13 +321,13 @@ class ONNXModel:
# TODO:use OnnxModel.graph_topological_sort(self.model.graph) from transformers.onnx_model
# Currently it breaks Openvino/Linux training gpu pipeline so hold off for 1.8 release
def topological_sort(self):
deps_count = [0]*len(self.nodes()) # dependency count of each node
deps_to_nodes = {} # input to node indices
deps_count = [0] * len(self.nodes()) # dependency count of each node
deps_to_nodes = {}  # input to node indices
sorted_nodes = [] # initialize sorted_nodes
for node_idx, node in enumerate(self.nodes()):
# CANNOT use len(node.input) directly because input can be optional
deps_count[node_idx] = sum(1 for _ in node.input if _ )
if deps_count[node_idx] == 0: # Constant doesn't depend on any inputs
deps_count[node_idx] = sum(1 for _ in node.input if _)
if deps_count[node_idx] == 0: # Constant doesn't depend on any inputs
sorted_nodes.append(self.nodes()[node_idx])
continue
@ -353,6 +366,6 @@ class ONNXModel:
end = end + 1
start = start + 1
assert(end == len(self.graph().node)), "Graph is not a DAG"
self.graph().ClearField('node')
self.graph().node.extend(sorted_nodes)
assert end == len(self.graph().node), "Graph is not a DAG"
self.graph().ClearField("node")
self.graph().node.extend(sorted_nodes)
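A minimal usage sketch of the helper class in this file; the paths are placeholders and replace_gemm_with_matmul is assumed to be the public wrapper named in the hunk header:

import onnx

m = ONNXModel(onnx.load("model.onnx"))  # hypothetical input path
m.replace_gemm_with_matmul()  # Gemm -> (Transpose) + MatMul + Add when alpha == beta == 1.0 and transA == 0
m.remove_unused_constant()
m.save_model_to_file("model_prepared.onnx")  # topological_sort() runs inside save_model_to_file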

Diff between files is not shown because of its large size. Load diff

View file

@ -1,2 +1,2 @@
#from .base_operator import QuantOperatorBase
#from .matmul import MatMulInteger
# from .base_operator import QuantOperatorBase
# from .matmul import MatMulInteger

View file

@ -1,8 +1,9 @@
import onnx
from onnx import onnx_pb as onnx_proto
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from onnx import onnx_pb as onnx_proto
class QLinearActivation(QuantOperatorBase):
@ -11,7 +12,7 @@ class QLinearActivation(QuantOperatorBase):
def QuantizeClipRelu(self):
node = self.node
assert (node.op_type == "Relu" or node.op_type == 'Clip')
assert node.op_type == "Relu" or node.op_type == "Clip"
# When mode is QLinearOps, the output quantization params are calculated based on outputs from
# activation nodes, therefore these nodes can be removed from the graph if they follow a quantized op.
@ -25,22 +26,34 @@ class QLinearActivation(QuantOperatorBase):
def quantize(self):
node = self.node
if node.op_type == "Relu" or node.op_type == 'Clip':
if node.op_type == "Relu" or node.op_type == "Clip":
self.QuantizeClipRelu()
return
nnapi_sigmoid_option = 'extra.Sigmoid.nnapi'
sigmoid_nnapi_mode = (node.op_type == 'Sigmoid' and
nnapi_sigmoid_option in self.quantizer.extra_options and
self.quantizer.extra_options[nnapi_sigmoid_option])
nnapi_sigmoid_option = "extra.Sigmoid.nnapi"
sigmoid_nnapi_mode = (
node.op_type == "Sigmoid"
and nnapi_sigmoid_option in self.quantizer.extra_options
and self.quantizer.extra_options[nnapi_sigmoid_option]
)
use_scale = 1 / 256.0 if sigmoid_nnapi_mode else None
use_zeropoint = 0 if sigmoid_nnapi_mode else None
# No assert on op_type as it is controlled by registry
# only try to quantize when given quantization parameters for it
data_found, output_scale_name, output_zp_name, _, _ = \
self.quantizer._get_quantization_params(node.output[0], use_scale, use_zeropoint)
quantized_input_names, zero_point_names, scale_names, nodes = self.quantizer.quantize_inputs(node, [0])
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0], use_scale, use_zeropoint)
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_inputs(node, [0])
if not data_found or quantized_input_names is None:
return super().quantize()
@ -54,15 +67,29 @@ class QLinearActivation(QuantOperatorBase):
kwargs["domain"] = ms_domain
qlinear_activation_inputs = [
quantized_input_names[0], scale_names[0], zero_point_names[0], output_scale_name, output_zp_name
quantized_input_names[0],
scale_names[0],
zero_point_names[0],
output_scale_name,
output_zp_name,
]
qlinear_activation_node = onnx.helper.make_node("QLinear" + node.op_type, qlinear_activation_inputs,
[qlinear_activation_output], qlinear_activation_name, **kwargs)
qlinear_activation_node = onnx.helper.make_node(
"QLinear" + node.op_type,
qlinear_activation_inputs,
[qlinear_activation_output],
qlinear_activation_name,
**kwargs
)
# Create an entry for this quantized value
q_output = QuantizedValue(node.output[0], qlinear_activation_output, output_scale_name, output_zp_name,
QuantizedValueType.Input)
q_output = QuantizedValue(
node.output[0],
qlinear_activation_output,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
nodes.append(qlinear_activation_node)
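A back-of-the-envelope check of the fixed parameters behind the 'extra.Sigmoid.nnapi' option; the premise that NNAPI wants scale = 1/256 and zero_point = 0 for sigmoid outputs is taken from the code above, not verified here:

import numpy as np

scale, zero_point = 1 / 256.0, 0
y = np.array([0.0, 0.5, 0.996], dtype=np.float32)  # sigmoid outputs live in [0, 1)
q = np.clip(np.round(y / scale) + zero_point, 0, 255).astype(np.uint8)
print(q)  # [  0 128 255]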

View file

@ -1,5 +1,6 @@
from .base_operator import QuantOperatorBase
# Use the quantized tensor as input without DQ.
class QArgMax(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
@ -14,4 +15,4 @@ class QArgMax(QuantOperatorBase):
return
node.input[0] = quantized_input_value.q_name
self.quantizer.new_nodes += [node]
self.quantizer.new_nodes += [node]

View file

@ -1,10 +1,12 @@
import onnx
from .base_operator import QuantOperatorBase
from ..quant_utils import attribute_to_kwarg, ms_domain
from onnx import onnx_pb as onnx_proto
'''
from ..quant_utils import attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
"""
Quantize Attention
'''
"""
class AttentionQuant(QuantOperatorBase):
@ -12,23 +14,27 @@ class AttentionQuant(QuantOperatorBase):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
'''
parameter node: Attention node.
parameter new_nodes_list: List of new nodes created before processing this node.
return: a list of nodes in topological order that represents quantized Attention node.
'''
"""
parameter node: Attention node.
parameter new_nodes_list: List of new nodes created before processing this node.
return: a list of nodes in topological order that represents quantized Attention node.
"""
node = self.node
assert (node.op_type == "Attention")
assert node.op_type == "Attention"
# TODO This is a temporary fix to stop exporting QAttention with qkv_hidden_sizes
# attribute. This needs to be removed once the QAttention for varied q,k,v sizes
# is implemented
for attr in node.attribute:
if 'qkv_hidden_sizes' == attr.name:
if "qkv_hidden_sizes" == attr.name:
return super().quantize()
(quantized_input_names, zero_point_names, scale_names, nodes) = \
self.quantizer.quantize_inputs(node, [0, 1], reduce_range=True, op_level_per_channel=True)
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_inputs(node, [0, 1], reduce_range=True, op_level_per_channel=True)
if quantized_input_names is None:
return super().quantize()

View file

@ -4,13 +4,13 @@ class QuantOperatorBase:
self.node = onnx_node
def quantize(self):
'''
"""
Given a node which does not support quantization, this method checks whether the input to
this node is quantized and adds a DequantizeLinear node to dequantize this input back to FP32
parameter node: Current node
parameter new_nodes_list: List of new nodes created before processing current node
return: List of new nodes created
'''
"""
nodes = []
for index, node_input in enumerate(self.node.input):
dequantize_node = self.quantizer._dequantize_value(node_input)
@ -18,4 +18,4 @@ class QuantOperatorBase:
self.quantizer.new_nodes.append(dequantize_node)
# Append the original node
self.quantizer.new_nodes.append(self.node)
self.quantizer.new_nodes.append(self.node)
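The fallback relies on standard DequantizeLinear semantics; a short numpy sketch of that mapping with illustrative values:

import numpy as np

q = np.array([0, 128, 255], dtype=np.uint8)
scale, zero_point = 0.02, 128
x = (q.astype(np.float32) - zero_point) * scale  # what the inserted DequantizeLinear node recovers
print(x)  # [-2.56  0.    2.54]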

View file

@ -1,8 +1,9 @@
import onnx
from .base_operator import QuantOperatorBase
from ..quant_utils import attribute_to_kwarg, ms_domain, QuantizedValue, QuantizedValueType
from onnx import onnx_pb as onnx_proto
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
class QLinearBinaryOp(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
@ -11,10 +12,19 @@ class QLinearBinaryOp(QuantOperatorBase):
def quantize(self):
node = self.node
data_found, output_scale_name, output_zp_name, _, _ = \
self.quantizer._get_quantization_params(node.output[0])
(quantized_input_names, zero_point_names, scale_names, nodes) = \
self.quantizer.quantize_inputs(node, [0, 1], initializer_use_weight_qType=False)
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_inputs(node, [0, 1], initializer_use_weight_qType=False)
if not data_found or quantized_input_names is None:
return super().quantize()
@ -40,14 +50,23 @@ class QLinearBinaryOp(QuantOperatorBase):
qlinear_binary_math_inputs.append(output_scale_name)
qlinear_binary_math_inputs.append(output_zp_name)
qlinear_binary_math_node = onnx.helper.make_node("QLinear" + node.op_type, qlinear_binary_math_inputs,
[qlinear_binary_math_output], qlinear_binary_math_name,
**kwargs)
qlinear_binary_math_node = onnx.helper.make_node(
"QLinear" + node.op_type,
qlinear_binary_math_inputs,
[qlinear_binary_math_output],
qlinear_binary_math_name,
**kwargs
)
nodes.append(qlinear_binary_math_node)
# Create an entry for this quantized value
q_output = QuantizedValue(node.output[0], qlinear_binary_math_output, output_scale_name, output_zp_name,
QuantizedValueType.Input)
q_output = QuantizedValue(
node.output[0],
qlinear_binary_math_output,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
self.quantizer.new_nodes += nodes

View file

@ -1,7 +1,9 @@
import onnx
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
from ..quant_utils import QuantizedValue, attribute_to_kwarg, ms_domain, QuantizedValueType
class QLinearConcat(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
@ -10,18 +12,31 @@ class QLinearConcat(QuantOperatorBase):
def quantize(self):
node = self.node
data_found, output_scale_name, output_zp_name, _, _ = \
self.quantizer._get_quantization_params(node.output[0])
(q_input_names, zero_point_names, scale_names, nodes) = \
self.quantizer.quantize_inputs(node, [*range(0, len(node.input))], initializer_use_weight_qType=False)
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
(
q_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_inputs(node, [*range(0, len(node.input))], initializer_use_weight_qType=False)
if not data_found or q_input_names is None:
return super().quantize()
# Create an entry for output quantized value
quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]
quantized_output_value = QuantizedValue(node.output[0], node.output[0] + "_quantized",
output_scale_name, output_zp_name,
quantized_input_value.value_type)
quantized_output_value = QuantizedValue(
node.output[0],
node.output[0] + "_quantized",
output_scale_name,
output_zp_name,
quantized_input_value.value_type,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
kwargs = {}
@ -33,11 +48,14 @@ class QLinearConcat(QuantOperatorBase):
qlconcat_inputs = [output_scale_name, output_zp_name]
for i in range(0, len(q_input_names)):
qlconcat_inputs.extend([q_input_names[i], scale_names[i], zero_point_names[i]])
qlconcat_node = onnx.helper.make_node("QLinearConcat", qlconcat_inputs, [quantized_output_value.q_name], qnode_name, **kwargs)
qlconcat_node = onnx.helper.make_node(
"QLinearConcat", qlconcat_inputs, [quantized_output_value.q_name], qnode_name, **kwargs
)
self.quantizer.new_nodes += nodes
self.quantizer.new_nodes += [qlconcat_node]
class QDQConcat(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)

View file

@ -1,9 +1,17 @@
import onnx
import numpy as np
import onnx
from onnx import onnx_pb as onnx_proto
from ..quant_utils import (
BiasToQuantize,
QuantizedValue,
QuantizedValueType,
attribute_to_kwarg,
find_by_name,
get_mul_node,
)
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
from ..quant_utils import find_by_name, get_mul_node, QuantizedValue, QuantizedValueType, attribute_to_kwarg, BiasToQuantize
from onnx import onnx_pb as onnx_proto
class ConvInteger(QuantOperatorBase):
@ -11,7 +19,7 @@ class ConvInteger(QuantOperatorBase):
super().__init__(onnx_quantizer, onnx_node)
def add_bias(self, nodes, scaled_output):
'''
"""
Given a node, this function handles bias add by adding a "reshape" node on bias and an "add" node
parameter nodes: new nodes would be appended into nodes
parameter node: current node (Conv)
@ -19,7 +27,7 @@ class ConvInteger(QuantOperatorBase):
parameter output: output of Conv
parameter bias_name: bias of Conv
return: the name of output
'''
"""
node = self.node
model = self.quantizer.model
# Add tensors for the shape to be reshaped to
@ -29,14 +37,15 @@ class ConvInteger(QuantOperatorBase):
# Add reshape for correct broadcast
output = node.output[0]
reshape_input_data = node.input[2] # bias of Conv
reshape_input_data = node.input[2] # bias of Conv
reshape_input_shape = output + "_bias_reshape_shape"
reshape_output = output + "_bias_reshape_output"
shape = np.ones((len(weight.dims)), dtype=np.int64)
shape[1] = -1
init_shape = onnx.helper.make_tensor(reshape_input_shape, onnx_proto.TensorProto.INT64, [len(weight.dims)],
shape)
init_shape = onnx.helper.make_tensor(
reshape_input_shape, onnx_proto.TensorProto.INT64, [len(weight.dims)], shape
)
model.add_initializer(init_shape)
reshape_node = onnx.helper.make_node("Reshape", [reshape_input_data, reshape_input_shape], [reshape_output])
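A numpy sketch of the reshape target built above, for a rank-4 Conv weight; shapes are illustrative:

import numpy as np

weight_rank = 4  # e.g. an OIHW Conv weight
shape = np.ones(weight_rank, dtype=np.int64)
shape[1] = -1  # -> [1, -1, 1, 1]
bias = np.arange(3, dtype=np.float32)  # per-channel bias of length C = 3
print(bias.reshape(shape).shape)  # (1, 3, 1, 1), broadcastable over the NCHW Conv output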
@ -48,10 +57,14 @@ class ConvInteger(QuantOperatorBase):
def quantize(self):
node = self.node
assert (node.op_type == "Conv")
assert node.op_type == "Conv"
(quantized_input_names, zero_point_names, scale_names, nodes) = \
self.quantizer.quantize_inputs(node, [0, 1], reduce_range=self.quantizer.reduce_range)
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_inputs(node, [0, 1], reduce_range=self.quantizer.reduce_range)
conv_integer_output = node.output[0] + "_output_quantized"
conv_integer_name = node.name + "_quant" if node.name != "" else ""
@ -59,19 +72,24 @@ class ConvInteger(QuantOperatorBase):
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
conv_integer_node = onnx.helper.make_node("ConvInteger", quantized_input_names + zero_point_names,
[conv_integer_output], conv_integer_name, **kwargs)
conv_integer_node = onnx.helper.make_node(
"ConvInteger", quantized_input_names + zero_point_names, [conv_integer_output], conv_integer_name, **kwargs
)
nodes.append(conv_integer_node)
# Add cast operation to cast convInteger output to float.
cast_op_output = conv_integer_output + "_cast_output"
cast_node = onnx.helper.make_node("Cast", [conv_integer_output], [cast_op_output],
conv_integer_output + "_cast",
to=onnx_proto.TensorProto.FLOAT)
cast_node = onnx.helper.make_node(
"Cast",
[conv_integer_output],
[cast_op_output],
conv_integer_output + "_cast",
to=onnx_proto.TensorProto.FLOAT,
)
nodes.append(cast_node)
# Add mul operation to multiply scales of two inputs.
assert (len(scale_names) == 2)
assert len(scale_names) == 2
if conv_integer_name != "":
scales_mul_op = conv_integer_name + "_scales_mul"
else:
@ -90,7 +108,13 @@ class ConvInteger(QuantOperatorBase):
# Add mul operation to multiply mul_scales_op result with output of ConvInteger
# and make the output of this node the same as output of original conv node.
output_scale_mul_op = conv_integer_name + "_output_scale_mul" if conv_integer_name != "" else ""
nodes.append(get_mul_node([cast_op_output, scales_mul_op_output], scaled_output_name, output_scale_mul_op))
nodes.append(
get_mul_node(
[cast_op_output, scales_mul_op_output],
scaled_output_name,
output_scale_mul_op,
)
)
if has_bias:
self.add_bias(nodes, scaled_output_name)
@ -104,22 +128,36 @@ class QLinearConv(QuantOperatorBase):
def quantize(self):
node = self.node
assert (node.op_type == "Conv")
assert node.op_type == "Conv"
data_found, output_scale_name, output_zp_name, _, _ = \
self.quantizer._get_quantization_params(node.output[0])
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
if self.quantizer.is_input_a_weight(node.input[1]) and self.quantizer.is_per_channel():
(quantized_input_names, zero_point_names, scale_names, nodes) = \
self.quantizer.quantize_inputs(node, [0], reduce_range=self.quantizer.reduce_range)
quant_weight_tuple = self.quantizer.quantize_weight_per_channel(node.input[1], onnx_proto.TensorProto.INT8,
0)
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_inputs(node, [0], reduce_range=self.quantizer.reduce_range)
quant_weight_tuple = self.quantizer.quantize_weight_per_channel(
node.input[1], onnx_proto.TensorProto.INT8, 0
)
quantized_input_names.append(quant_weight_tuple[0])
zero_point_names.append(quant_weight_tuple[1])
scale_names.append(quant_weight_tuple[2])
else:
(quantized_input_names, zero_point_names, scale_names, nodes) = \
self.quantizer.quantize_inputs(node, [0, 1], reduce_range=self.quantizer.reduce_range)
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_inputs(node, [0, 1], reduce_range=self.quantizer.reduce_range)
if not data_found or quantized_input_names is None:
return super().quantize()
@ -153,13 +191,19 @@ class QLinearConv(QuantOperatorBase):
if bias_present:
qlinear_conv_inputs.append(quantized_bias_name)
qlinear_conv_node = onnx.helper.make_node("QLinearConv", qlinear_conv_inputs, [qlinear_conv_output],
qlinear_conv_name, **kwargs)
qlinear_conv_node = onnx.helper.make_node(
"QLinearConv", qlinear_conv_inputs, [qlinear_conv_output], qlinear_conv_name, **kwargs
)
nodes.append(qlinear_conv_node)
# Create an entry for this quantized value
q_output = QuantizedValue(node.output[0], qlinear_conv_output, output_scale_name, output_zp_name,
QuantizedValueType.Input)
q_output = QuantizedValue(
node.output[0],
qlinear_conv_output,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
self.quantizer.new_nodes += nodes
@ -171,7 +215,7 @@ class QDQConv(QDQOperatorBase):
def quantize(self):
node = self.node
assert (node.op_type == "Conv")
assert node.op_type == "Conv"
self.quantizer.quantize_tensor(node.input[0])
if not self.disable_qdq_for_node_output:

View file

@ -1,6 +1,7 @@
from ..quant_utils import QuantizedValue, QuantizedValueType
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
from ..quant_utils import QuantizedValue, QuantizedValueType
# For operators that support 8bits operations directly, and output could
# reuse input[0]'s type, zeropoint, scale; For example,Transpose, Reshape, etc.
@ -19,9 +20,13 @@ class Direct8BitOp(QuantOperatorBase):
self.quantizer.new_nodes += [node]
return
quantized_output_value = QuantizedValue(node.output[0], node.output[0] + "_quantized",
quantized_input_value.scale_name, quantized_input_value.zp_name,
quantized_input_value.value_type)
quantized_output_value = QuantizedValue(
node.output[0],
node.output[0] + "_quantized",
quantized_input_value.scale_name,
quantized_input_value.zp_name,
quantized_input_value.value_type,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
node.input[0] = quantized_input_value.q_name
@ -30,19 +35,27 @@ class Direct8BitOp(QuantOperatorBase):
else:
# Force quantize those ops if possible, use exclude node list if this is not you want
if (not self.quantizer.is_valid_quantize_weight(node.input[0])):
if not self.quantizer.is_valid_quantize_weight(node.input[0]):
super().quantize()
return
(quantized_input_names, zero_point_names, scale_names, nodes) = \
self.quantizer.quantize_inputs(node, [0])
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_inputs(node, [0])
if quantized_input_names is None:
return super().quantize()
# Create an entry for output quantized value
quantized_output_value = QuantizedValue(node.output[0], node.output[0] + "_quantized",
scale_names[0], zero_point_names[0],
QuantizedValueType.Input)
quantized_output_value = QuantizedValue(
node.output[0],
node.output[0] + "_quantized",
scale_names[0],
zero_point_names[0],
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
node.input[0] = quantized_input_names[0]
@ -52,7 +65,6 @@ class Direct8BitOp(QuantOperatorBase):
self.quantizer.new_nodes += nodes
class QDQDirect8BitOp(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)

View file

@ -1,28 +1,32 @@
import onnx
import logging
from .base_operator import QuantOperatorBase
from ..quant_utils import attribute_to_kwarg, ms_domain
import onnx
from onnx import onnx_pb as onnx_proto
'''
from ..quant_utils import attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
"""
Quantizes the EmbedLayerNorm fused ONNXRuntime Op.
This Quant operator keeps the input and segment IDs at int32 but will quantize all initializer and
weight inputs associated with the node to uint8.
'''
"""
class EmbedLayerNormalizationQuant(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert (node.op_type == "EmbedLayerNormalization")
assert node.op_type == "EmbedLayerNormalization"
if len(node.output) > 2:
logging.info(f"Quantization is not applied to {node.name} since it has 3 outputs")
return super().quantize()
'''
"""
Pre-quantization EmbedLayerNorm inputs:
[0] input_ids (int32)
[1] segment_ids (int32)
@ -32,15 +36,19 @@ class EmbedLayerNormalizationQuant(QuantOperatorBase):
[5] gamma (float32)
[6] beta (float32)
[7] mask (int32) (optional)
'''
(quantized_input_names, zero_point_names, scale_names, nodes) = \
self.quantizer.quantize_inputs(node, [2, 3, 4, 5, 6])
"""
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_inputs(node, [2, 3, 4, 5, 6])
if quantized_input_names is None:
return super().quantize()
qembed_layer_norm_name = "" if node.name == "" else node.name + "_quant"
'''
"""
Quantized Input Tensor List
[0] input_ids (int32)
[1] segment_ids (int32)
@ -60,7 +68,7 @@ class EmbedLayerNormalizationQuant(QuantOperatorBase):
[15] segment_embedding_zero_point (uint8)
[16] gamma_zero_point (uint8)
[17] beta_zero_point (uint8)
'''
"""
inputs = []
# 'input_ids'
inputs.extend([node.input[0]])
@ -98,8 +106,13 @@ class EmbedLayerNormalizationQuant(QuantOperatorBase):
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
qembed_layer_norm_node = onnx.helper.make_node("QEmbedLayerNormalization", inputs, node.output,
qembed_layer_norm_name, **kwargs)
qembed_layer_norm_node = onnx.helper.make_node(
"QEmbedLayerNormalization",
inputs,
node.output,
qembed_layer_norm_name,
**kwargs,
)
nodes.append(qembed_layer_norm_node)
self.quantizer.new_nodes += nodes

View file

@ -1,10 +1,12 @@
import onnx
from .base_operator import QuantOperatorBase
from ..quant_utils import QuantizedValue, QuantizedValueType
from onnx import onnx_pb as onnx_proto
'''
from ..quant_utils import QuantizedValue, QuantizedValueType
from .base_operator import QuantOperatorBase
"""
Quantize Gather
'''
"""
class GatherQuant(QuantOperatorBase):
@ -13,21 +15,30 @@ class GatherQuant(QuantOperatorBase):
def quantize(self):
node = self.node
assert (node.op_type == "Gather")
if (not self.quantizer.is_valid_quantize_weight(node.input[0])):
assert node.op_type == "Gather"
if not self.quantizer.is_valid_quantize_weight(node.input[0]):
super().quantize()
return
(quantized_input_names, zero_point_names, scale_names, nodes) = \
self.quantizer.quantize_inputs(node, [0])
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_inputs(node, [0])
if quantized_input_names is None:
return super().quantize()
gather_new_output = node.output[0] + "_quantized"
# Create an entry for this quantized value
q_output = QuantizedValue(node.output[0], gather_new_output, scale_names[0], zero_point_names[0],
QuantizedValueType.Input)
q_output = QuantizedValue(
node.output[0],
gather_new_output,
scale_names[0],
zero_point_names[0],
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
gather_original_output = node.output[0]

View file

@ -1,6 +1,7 @@
import onnx
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
from ..quant_utils import attribute_to_kwarg, ms_domain, QuantizedValue, QuantizedValueType
class QGlobalAveragePool(QuantOperatorBase):
@ -9,7 +10,7 @@ class QGlobalAveragePool(QuantOperatorBase):
def quantize(self):
node = self.node
assert (node.op_type == "GlobalAveragePool")
assert node.op_type == "GlobalAveragePool"
# If input to this node is not quantized then keep this node.
if node.input[0] not in self.quantizer.quantized_value_map:
@ -19,13 +20,23 @@ class QGlobalAveragePool(QuantOperatorBase):
# Create an entry for output quantized value.
quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]
data_found, output_scale_name_from_parameter, output_zp_name_from_parameter, _, _ = \
self.quantizer._get_quantization_params(node.output[0])
(
data_found,
output_scale_name_from_parameter,
output_zp_name_from_parameter,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
# Just use input scale and zp if parameters for output is not specified.
output_scale_name = output_scale_name_from_parameter if data_found else quantized_input_value.scale_name
output_zp_name = output_zp_name_from_parameter if data_found else quantized_input_value.zp_name
quantized_output_value = QuantizedValue(node.output[0], node.output[0] + "_quantized", output_scale_name,
output_zp_name, QuantizedValueType.Input)
quantized_output_value = QuantizedValue(
node.output[0],
node.output[0] + "_quantized",
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
kwargs = {}
@ -35,8 +46,17 @@ class QGlobalAveragePool(QuantOperatorBase):
kwargs["channels_last"] = 0
qnode_name = node.name + "_quant" if node.name != "" else ""
qnode = onnx.helper.make_node("QLinear" + node.op_type, [
quantized_input_value.q_name, quantized_input_value.scale_name, quantized_input_value.zp_name,
output_scale_name, output_zp_name
], [quantized_output_value.q_name], qnode_name, **kwargs)
qnode = onnx.helper.make_node(
"QLinear" + node.op_type,
[
quantized_input_value.q_name,
quantized_input_value.scale_name,
quantized_input_value.zp_name,
output_scale_name,
output_zp_name,
],
[quantized_output_value.q_name],
qnode_name,
**kwargs
)
self.quantizer.new_nodes += [qnode]

View file

@ -1,55 +1,76 @@
import onnx
import numpy as np
import logging
import numpy as np
import onnx
from onnx import onnx_pb as onnx_proto
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, find_by_name, get_mul_node, ms_domain
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
from ..quant_utils import find_by_name, get_mul_node, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from onnx import onnx_pb as onnx_proto
def is_B_transposed(gemm_node):
transB_attribute = [attr for attr in gemm_node.attribute if attr.name == 'transB']
transB_attribute = [attr for attr in gemm_node.attribute if attr.name == "transB"]
if len(transB_attribute):
return 0 < onnx.helper.get_attribute_value(transB_attribute[0])
return False
def get_beta(gemm_node):
beta_attribute = [attr for attr in gemm_node.attribute if attr.name == 'beta']
beta_attribute = [attr for attr in gemm_node.attribute if attr.name == "beta"]
if len(beta_attribute):
return onnx.helper.get_attribute_value(beta_attribute[0])
return 1.0
def set_default_beta(gemm_node):
beta_attribute = [attr for attr in gemm_node.attribute if attr.name == 'beta']
beta_attribute = [attr for attr in gemm_node.attribute if attr.name == "beta"]
if len(beta_attribute):
beta_attribute[0].f = 1.0
return 1.0
class QLinearGemm(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert (node.op_type == "Gemm")
assert node.op_type == "Gemm"
data_found, output_scale_name, output_zp_name, _, _ = \
self.quantizer._get_quantization_params(node.output[0])
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
if self.quantizer.is_input_a_weight(node.input[1]) and self.quantizer.is_per_channel():
(quantized_input_names, zero_point_names, scale_names, nodes) = \
self.quantizer.quantize_inputs(node, [0], reduce_range=self.quantizer.reduce_range)
quant_weight_tuple = self.quantizer.quantize_weight_per_channel(node.input[1], onnx_proto.TensorProto.INT8,
0 if is_B_transposed(node) else 1)
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_inputs(node, [0], reduce_range=self.quantizer.reduce_range)
quant_weight_tuple = self.quantizer.quantize_weight_per_channel(
node.input[1],
onnx_proto.TensorProto.INT8,
0 if is_B_transposed(node) else 1,
)
quantized_input_names.append(quant_weight_tuple[0])
zero_point_names.append(quant_weight_tuple[1])
scale_names.append(quant_weight_tuple[2])
else:
(quantized_input_names, zero_point_names, scale_names, nodes) = \
self.quantizer.quantize_inputs(node, [0, 1], reduce_range=self.quantizer.reduce_range)
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_inputs(node, [0, 1], reduce_range=self.quantizer.reduce_range)
if not data_found or quantized_input_names is None:
return super().quantize()
@ -59,7 +80,9 @@ class QLinearGemm(QuantOperatorBase):
if not self.quantizer.is_input_a_weight(node.input[2]):
return super().quantize()
quantized_bias_name = self.quantizer.quantize_bias_static(node.input[2], node.input[0], node.input[1], get_beta(self.node))
quantized_bias_name = self.quantizer.quantize_bias_static(
node.input[2], node.input[0], node.input[1], get_beta(self.node)
)
qgemm_output = node.output[0] + "_quantized"
qgemm_name = node.name + "_quant" if node.name != "" else ""
@ -77,13 +100,17 @@ class QLinearGemm(QuantOperatorBase):
qgemm_inputs.extend([quantized_bias_name, output_scale_name, output_zp_name])
qgemm_node = onnx.helper.make_node("QGemm", qgemm_inputs, [qgemm_output],
qgemm_name, **kwargs)
qgemm_node = onnx.helper.make_node("QGemm", qgemm_inputs, [qgemm_output], qgemm_name, **kwargs)
nodes.append(qgemm_node)
# Create an entry for this quantized value
q_output = QuantizedValue(node.output[0], qgemm_output, output_scale_name, output_zp_name,
QuantizedValueType.Input)
q_output = QuantizedValue(
node.output[0],
qgemm_output,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
self.quantizer.new_nodes += nodes
@ -95,7 +122,7 @@ class QDQGemm(QDQOperatorBase):
def quantize(self):
node = self.node
assert (node.op_type == "Gemm")
assert node.op_type == "Gemm"
self.quantizer.quantize_tensor(node.input[0])
if not self.disable_qdq_for_node_output:
@ -112,6 +139,7 @@ class QDQGemm(QDQOperatorBase):
set_default_beta(self.node)
else:
logging.warning(
"Bias of Gemm node '{}' is not constant. Please exclude this node for better performance."
.format(self.node.name))
"Bias of Gemm node '{}' is not constant. Please exclude this node for better performance.".format(
self.node.name
)
)
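A quick sketch of the two attribute helpers defined at the top of this file, using nothing beyond onnx.helper:

import onnx

gemm = onnx.helper.make_node("Gemm", ["A", "B", "C"], ["Y"], transB=1)
print(is_B_transposed(gemm))  # True: a transB attribute is present and > 0
print(get_beta(gemm))  # 1.0: no beta attribute, so the default is returned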

View file

@ -1,11 +1,13 @@
import onnx
import numpy
from .base_operator import QuantOperatorBase
from ..quant_utils import attribute_to_kwarg, ms_domain, QuantType
import onnx
from onnx import onnx_pb as onnx_proto
'''
from ..quant_utils import QuantType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
"""
Quantize LSTM
'''
"""
class LSTMQuant(QuantOperatorBase):
@ -13,16 +15,17 @@ class LSTMQuant(QuantOperatorBase):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
'''
parameter node: LSTM node.
parameter new_nodes_list: List of new nodes created before processing this node.
return: a list of nodes in topological order that represents quantized LSTM node.
'''
"""
parameter node: LSTM node.
parameter new_nodes_list: List of new nodes created before processing this node.
return: a list of nodes in topological order that represents quantized LSTM node.
"""
node = self.node
assert (node.op_type == "LSTM")
assert node.op_type == "LSTM"
if (not self.quantizer.is_valid_quantize_weight(node.input[1])
or not self.quantizer.is_valid_quantize_weight(node.input[2])):
if not self.quantizer.is_valid_quantize_weight(node.input[1]) or not self.quantizer.is_valid_quantize_weight(
node.input[2]
):
super().quantize()
return
@ -30,7 +33,7 @@ class LSTMQuant(QuantOperatorBase):
W = model.get_initializer(node.input[1])
R = model.get_initializer(node.input[2])
if (len(W.dims) != 3 or len(R.dims) != 3):
if len(W.dims) != 3 or len(R.dims) != 3:
super().quantize()
return
@ -43,10 +46,12 @@ class LSTMQuant(QuantOperatorBase):
W.dims[0] = W_num_dir * W_4_hidden_size
R.dims[0] = R_num_dir * R_4_hidden_size
quant_input_weight_tuple = self.quantizer.quantize_weight_per_channel(node.input[1],
onnx_proto.TensorProto.INT8, 0)
quant_recurrent_weight_tuple = self.quantizer.quantize_weight_per_channel(node.input[2],
onnx_proto.TensorProto.INT8, 0)
quant_input_weight_tuple = self.quantizer.quantize_weight_per_channel(
node.input[1], onnx_proto.TensorProto.INT8, 0
)
quant_recurrent_weight_tuple = self.quantizer.quantize_weight_per_channel(
node.input[2], onnx_proto.TensorProto.INT8, 0
)
W_quant_weight = model.get_initializer(quant_input_weight_tuple[0])
R_quant_weight = model.get_initializer(quant_recurrent_weight_tuple[0])
@ -87,10 +92,14 @@ class LSTMQuant(QuantOperatorBase):
inputs.extend([node.input[5] if input_len > 5 else ""])
inputs.extend([node.input[6] if input_len > 6 else ""])
inputs.extend([node.input[7] if input_len > 7 else ""])
inputs.extend([
quant_input_weight_tuple[2], quant_input_weight_tuple[1], quant_recurrent_weight_tuple[2],
quant_recurrent_weight_tuple[1]
])
inputs.extend(
[
quant_input_weight_tuple[2],
quant_input_weight_tuple[1],
quant_recurrent_weight_tuple[2],
quant_recurrent_weight_tuple[1],
]
)
kwargs = {}
for attribute in node.attribute:

View file

@ -1,12 +1,15 @@
import onnx
import itertools
import onnx
from onnx import onnx_pb as onnx_proto
from ..quant_utils import QuantizedValue, QuantizedValueType, find_by_name, get_mul_node
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
from ..quant_utils import find_by_name, get_mul_node, QuantizedValue, QuantizedValueType
from onnx import onnx_pb as onnx_proto
'''
"""
Used when quantize mode is QuantizationMode.IntegerOps.
'''
"""
class MatMulInteger(QuantOperatorBase):
@ -15,28 +18,43 @@ class MatMulInteger(QuantOperatorBase):
def quantize(self):
node = self.node
assert (node.op_type == "MatMul")
assert node.op_type == "MatMul"
(quantized_input_names, zero_point_names, scale_names, nodes) = \
self.quantizer.quantize_inputs(node, [0, 1], reduce_range=True, op_level_per_channel=True)
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_inputs(node, [0, 1], reduce_range=True, op_level_per_channel=True)
matmul_integer_output = node.output[0] + "_output_quantized"
matmul_integer_name = node.name + "_quant" if node.name != "" else ""
matmul_integer_node = onnx.helper.make_node("MatMulInteger", quantized_input_names + zero_point_names,
[matmul_integer_output], matmul_integer_name)
matmul_integer_node = onnx.helper.make_node(
"MatMulInteger",
quantized_input_names + zero_point_names,
[matmul_integer_output],
matmul_integer_name,
)
nodes.append(matmul_integer_node)
# Add cast operation to cast matmulInteger output to float.
cast_op_output = matmul_integer_output + "_cast_output"
cast_node = onnx.helper.make_node("Cast", [matmul_integer_output], [cast_op_output],
matmul_integer_output + "_cast",
to=onnx_proto.TensorProto.FLOAT)
cast_node = onnx.helper.make_node(
"Cast",
[matmul_integer_output],
[cast_op_output],
matmul_integer_output + "_cast",
to=onnx_proto.TensorProto.FLOAT,
)
nodes.append(cast_node)
# Add mul operation to multiply scales of two inputs.
assert (len(scale_names) == 2)
scales_mul_op = matmul_integer_name + "_scales_mul" if matmul_integer_name != "" else scale_names[
0] + "_" + scale_names[1] + "_mul"
assert len(scale_names) == 2
scales_mul_op = (
matmul_integer_name + "_scales_mul"
if matmul_integer_name != ""
else scale_names[0] + "_" + scale_names[1] + "_mul"
)
scales_mul_node = find_by_name(scales_mul_op, self.quantizer.new_nodes)
if scales_mul_node is None:
@ -50,13 +68,19 @@ class MatMulInteger(QuantOperatorBase):
output_scale_mul_op = ""
if matmul_integer_name != "":
output_scale_mul_op = matmul_integer_name + "_output_scale_mul"
nodes.append(get_mul_node([cast_op_output, scales_mul_op_output], node.output[0], output_scale_mul_op))
nodes.append(
get_mul_node(
[cast_op_output, scales_mul_op_output],
node.output[0],
output_scale_mul_op,
)
)
self.quantizer.new_nodes += nodes
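What the MatMulInteger + Cast + Mul pattern computes, as a numpy sanity check; zero points are assumed to be 0 to keep the sketch short:

import numpy as np

a = np.random.rand(2, 3).astype(np.float32)
b = np.random.rand(3, 4).astype(np.float32)
sa, sb = a.max() / 255.0, b.max() / 255.0  # per-tensor scales, zero_point = 0 assumed
qa, qb = np.round(a / sa).astype(np.int32), np.round(b / sb).astype(np.int32)
approx = (qa @ qb).astype(np.float32) * (sa * sb)  # Cast(MatMulInteger(...)) * (scale_a * scale_b)
print(np.abs(approx - a @ b).max())  # small quantization error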
'''
"""
Used when quantize mode is QuantizationMode.QLinearOps
'''
"""
class QLinearMatMul(QuantOperatorBase):
@ -65,12 +89,21 @@ class QLinearMatMul(QuantOperatorBase):
def quantize(self):
node = self.node
assert (node.op_type == "MatMul")
assert node.op_type == "MatMul"
(quantized_input_names, zero_point_names, scale_names, nodes) = \
self.quantizer.quantize_inputs(node, [0, 1], reduce_range=True, op_level_per_channel=True)
data_found, output_scale_name, output_zp_name, _, _ = \
self.quantizer._get_quantization_params(node.output[0])
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_inputs(node, [0, 1], reduce_range=True, op_level_per_channel=True)
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
if not data_found or quantized_input_names is None:
return super().quantize()
@ -90,24 +123,34 @@ class QLinearMatMul(QuantOperatorBase):
qlinear_matmul_inputs.append(output_scale_name)
qlinear_matmul_inputs.append(output_zp_name)
qlinear_matmul_node = onnx.helper.make_node("QLinearMatMul", qlinear_matmul_inputs, [qlinear_matmul_output],
qlinear_matmul_name)
qlinear_matmul_node = onnx.helper.make_node(
"QLinearMatMul",
qlinear_matmul_inputs,
[qlinear_matmul_output],
qlinear_matmul_name,
)
nodes.append(qlinear_matmul_node)
# Create an entry for this quantized value
q_output = QuantizedValue(node.output[0], qlinear_matmul_output, output_scale_name, output_zp_name,
QuantizedValueType.Input)
q_output = QuantizedValue(
node.output[0],
qlinear_matmul_output,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
self.quantizer.new_nodes += nodes
class QDQMatMul(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert (node.op_type == "MatMul")
assert node.op_type == "MatMul"
if self.disable_qdq_for_node_output:
nodes_to_iterate = node.input
@ -116,7 +159,7 @@ class QDQMatMul(QDQOperatorBase):
for tensor_name in nodes_to_iterate:
# only support per-channel quantization on weight
if self.quantizer.is_per_channel() and find_by_name(tensor_name, self.quantizer.model.initializer()) :
if self.quantizer.is_per_channel() and find_by_name(tensor_name, self.quantizer.model.initializer()):
channel_axis = self.quantizer.qdq_op_type_per_channel_support_to_axis.get(node.op_type, 1)
self.quantizer.quantize_tensor_per_channel(tensor_name, channel_axis)
else:

View file

@ -7,7 +7,7 @@ class QMaxPool(Direct8BitOp):
def quantize(self):
node = self.node
assert (node.op_type == "MaxPool")
assert node.op_type == "MaxPool"
# if version is less than 12, go to normal quantize.
if self.quantizer.opset_version < 12:
@ -24,7 +24,7 @@ class QDQMaxPool(QDQDirect8BitOp):
def quantize(self):
node = self.node
assert (node.op_type == "MaxPool")
assert node.op_type == "MaxPool"
# if version is less than 12, just no change
if self.quantizer.opset_version < 12:

View file

@ -1,7 +1,8 @@
import onnx
import numpy as np
from .base_operator import QuantOperatorBase
import onnx
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, quantize_nparray
from .base_operator import QuantOperatorBase
class QPad(QuantOperatorBase):
@ -10,7 +11,7 @@ class QPad(QuantOperatorBase):
def quantize(self):
node = self.node
assert (node.op_type == "Pad")
assert node.op_type == "Pad"
# Only after version 11, it has the optional constant_value
# If input[0] is not quantized, do not quanitize this node
@ -24,7 +25,7 @@ class QPad(QuantOperatorBase):
kv = attribute_to_kwarg(attribute)
kwargs.update(kv)
if 'mode' not in kwargs or kwargs['mode'] == b'constant':
if "mode" not in kwargs or kwargs["mode"] == b"constant":
if len(node.input) > 2: # There is 3rd input 'constant_value'
zp_tensor = self.quantizer.model.get_initializer(quantized_input_value.zp_name)
scale_tensor = self.quantizer.model.get_initializer(quantized_input_value.scale_name)
@ -39,29 +40,43 @@ class QPad(QuantOperatorBase):
scale_array = onnx.numpy_helper.to_array(scale_tensor)
scale_value = scale_array.item() if scale_array.ndim == 0 else scale_array[0]
padding_constant_array = onnx.numpy_helper.to_array(padding_constant_initializer)
quantized_padding_constant_array = quantize_nparray(self.quantizer.input_qType,
padding_constant_array, scale_value, zp_value)
quantized_padding_constant_array = quantize_nparray(
self.quantizer.input_qType,
padding_constant_array,
scale_value,
zp_value,
)
quantized_padding_constant_name = node.input[2] + "_quantized"
quantized_padding_constant_initializer = onnx.numpy_helper.from_array(
quantized_padding_constant_array, quantized_padding_constant_name)
quantized_padding_constant_array,
quantized_padding_constant_name,
)
# Suppose this padding constant initializer only used by the node
self.quantizer.model.remove_initializer(padding_constant_initializer)
self.quantizer.model.add_initializer(quantized_padding_constant_initializer)
node.input[2] = quantized_padding_constant_name
else:
# TODO: check quantize_inputs after sub graph is supported
pad_value_qnodes = self.quantizer._get_quantize_input_nodes(node, 2, self.quantizer.input_qType,
quantized_input_value.scale_name,
quantized_input_value.zp_name)
pad_value_qnodes = self.quantizer._get_quantize_input_nodes(
node,
2,
self.quantizer.input_qType,
quantized_input_value.scale_name,
quantized_input_value.zp_name,
)
self.quantizer.new_nodes += [pad_value_qnodes]
node.input[2] = pad_value_qnodes.output[0]
else:
node.input.extend([quantized_input_value.zp_name]) # pad zero_point for original zero
# Create an entry for output quantized value
quantized_output_value = QuantizedValue(node.output[0], node.output[0] + "_quantized",
quantized_input_value.scale_name, quantized_input_value.zp_name,
QuantizedValueType.Input)
quantized_output_value = QuantizedValue(
node.output[0],
node.output[0] + "_quantized",
quantized_input_value.scale_name,
quantized_input_value.zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
node.input[0] = quantized_input_value.q_name
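
The constant_value branch above maps the float pad value into the same quantized domain as the data input, reusing the input's scale and zero point. A rough sketch of that mapping, assuming the usual affine formula q = round(r / scale) + zero_point; the numbers are invented:

import numpy as np

scale, zero_point = 0.05, 128        # hypothetical params of the quantized input
pad_value = np.float32(1.0)          # float constant_value on the Pad node

q_pad = np.clip(np.round(pad_value / scale) + zero_point, 0, 255).astype(np.uint8)
print(q_pad)   # 148, the value stored in the new "<input[2]>_quantized" initializer
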

View file

@ -1,6 +1,8 @@
import onnx
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
from ..quant_utils import attribute_to_kwarg, ms_domain, QuantizedValue, QuantizedValueType
class QLinearPool(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
@ -10,11 +12,21 @@ class QLinearPool(QuantOperatorBase):
node = self.node
# only try to quantize when given quantization parameters for it
data_found, output_scale_name, output_zp_name, _, _ = \
self.quantizer._get_quantization_params(node.output[0])
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
# get quantized input tensor names, quantize input if needed
quantized_input_names, input_zero_point_names, input_scale_names, nodes = self.quantizer.quantize_inputs(node, [0])
(
quantized_input_names,
input_zero_point_names,
input_scale_names,
nodes,
) = self.quantizer.quantize_inputs(node, [0])
if not data_found or quantized_input_names is None:
return super().quantize()
@ -22,7 +34,12 @@ class QLinearPool(QuantOperatorBase):
# Create an entry for output quantized value.
qlinear_output_name = node.output[0] + "_quantized"
quantized_output_value = QuantizedValue(
node.output[0], qlinear_output_name, output_scale_name, output_zp_name, QuantizedValueType.Input)
node.output[0],
qlinear_output_name,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
# Create qlinear pool node for given type (AveragePool, etc)
@ -33,10 +50,17 @@ class QLinearPool(QuantOperatorBase):
qlinear_node_name = node.name + "_quant" if node.name != "" else ""
qnode = onnx.helper.make_node(
"QLinear" + node.op_type,
[quantized_input_names[0], input_scale_names[0], input_zero_point_names[0], output_scale_name, output_zp_name],
[
quantized_input_names[0],
input_scale_names[0],
input_zero_point_names[0],
output_scale_name,
output_zp_name,
],
[qlinear_output_name],
qlinear_node_name,
**kwargs)
**kwargs
)
# add all newly created nodes
nodes.append(qnode)

View file

@ -1,14 +1,16 @@
import itertools
from .base_operator import QuantOperatorBase
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, quantize_nparray
from .base_operator import QuantOperatorBase
class QDQOperatorBase:
def __init__(self, onnx_quantizer, onnx_node):
self.quantizer = onnx_quantizer
self.node = onnx_node
self.disable_qdq_for_node_output = True if onnx_node.op_type in onnx_quantizer.op_types_to_exclude_output_quantization \
else False
self.disable_qdq_for_node_output = (
True if onnx_node.op_type in onnx_quantizer.op_types_to_exclude_output_quantization else False
)
def quantize(self):
node = self.node

View file

@ -7,7 +7,7 @@ class QResize(Direct8BitOp):
def quantize(self):
node = self.node
assert (node.op_type == "Resize")
assert node.op_type == "Resize"
# if version is less than 11, go to normal quantize.
if self.quantizer.opset_version < 11:
@ -24,7 +24,7 @@ class QDQResize(QDQDirect8BitOp):
def quantize(self):
node = self.node
assert (node.op_type == "Resize")
assert node.op_type == "Resize"
# if version is less than 11, just keep this node
if self.quantizer.opset_version < 11:

View file

@ -1,8 +1,9 @@
import onnx
from .base_operator import QuantOperatorBase
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg
from onnx import onnx_pb as onnx_proto
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg
from .base_operator import QuantOperatorBase
class QSplit(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
@ -10,7 +11,12 @@ class QSplit(QuantOperatorBase):
def quantize(self):
node = self.node
quantized_input_names, zero_point_names, scale_names, nodes = self.quantizer.quantize_inputs(node, [0])
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_inputs(node, [0])
if quantized_input_names is None:
return super().quantize()
@ -26,14 +32,20 @@ class QSplit(QuantOperatorBase):
for output_name in node.output:
quantized_output_name = output_name + "quantized"
quantized_output_names.append(quantized_output_name)
q_output = QuantizedValue(output_name, quantized_output_name, scale_names[0], zero_point_names[0],
QuantizedValueType.Input)
q_output = QuantizedValue(
output_name,
quantized_output_name,
scale_names[0],
zero_point_names[0],
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[output_name] = q_output
if len(node.input) > 1:
quantized_input_names = quantized_input_names.extend(node.input[1:])
quantized_node = onnx.helper.make_node(node.op_type, quantized_input_names, quantized_output_names,
quantized_node_name, **kwargs)
quantized_node = onnx.helper.make_node(
node.op_type, quantized_input_names, quantized_output_names, quantized_node_name, **kwargs
)
nodes.append(quantized_node)
self.quantizer.new_nodes += nodes

View file

@ -3,33 +3,72 @@
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
import logging
import os
import struct
from pathlib import Path
import numpy as np
import logging
import numpy as np
import onnx
import onnx.numpy_helper
from onnx import onnx_pb as onnx_proto
from onnx import TensorProto
from onnxruntime import SessionOptions, InferenceSession, GraphOptimizationLevel
from onnx import onnx_pb as onnx_proto
from .quant_utils import QuantizationMode, QuantizedValueType, QuantizedInitializer, QuantizedValue
from .quant_utils import find_by_name, get_elem_index, get_mul_node, generate_identified_filename, attribute_to_kwarg, type_to_name, quantize_nparray
from .quant_utils import QuantType, onnx_domain, __producer__, __version__
from .registry import CreateQDQQuantizer
from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions
from .onnx_model import ONNXModel
from .onnx_quantizer import ONNXQuantizer
from .quant_utils import (
QuantizationMode,
QuantizedInitializer,
QuantizedValue,
QuantizedValueType,
QuantType,
__producer__,
__version__,
attribute_to_kwarg,
find_by_name,
generate_identified_filename,
get_elem_index,
get_mul_node,
onnx_domain,
quantize_nparray,
type_to_name,
)
from .registry import CreateQDQQuantizer
class QDQQuantizer(ONNXQuantizer):
def __init__(self, model, per_channel, reduce_range, mode, static, weight_qType, input_qType, tensors_range,
nodes_to_quantize, nodes_to_exclude, op_types_to_quantize, extra_options={}):
ONNXQuantizer.__init__(self, model, per_channel, reduce_range, mode, static, weight_qType, input_qType,
tensors_range, nodes_to_quantize, nodes_to_exclude, op_types_to_quantize, extra_options)
def __init__(
self,
model,
per_channel,
reduce_range,
mode,
static,
weight_qType,
input_qType,
tensors_range,
nodes_to_quantize,
nodes_to_exclude,
op_types_to_quantize,
extra_options={},
):
ONNXQuantizer.__init__(
self,
model,
per_channel,
reduce_range,
mode,
static,
weight_qType,
input_qType,
tensors_range,
nodes_to_quantize,
nodes_to_exclude,
op_types_to_quantize,
extra_options,
)
self.tensors_to_quantize = []
self.tensors_to_quantize_per_channel = []
self.bias_to_quantize = []
@ -40,23 +79,33 @@ class QDQQuantizer(ONNXQuantizer):
# because those ops may be followed by nodes that require high resolution inputs.
# Adding QDQ for those ops' output may end up with worse accuracy.
# So, we don't recommend to add QDQ to node's output under such condition.
self.op_types_to_exclude_output_quantization = [] if 'OpTypesToExcludeOutputQuantizatioin' not in extra_options \
else extra_options['OpTypesToExcludeOutputQuantizatioin']
self.op_types_to_exclude_output_quantization = (
[]
if "OpTypesToExcludeOutputQuantizatioin" not in extra_options
else extra_options["OpTypesToExcludeOutputQuantizatioin"]
)
# We do quantization on Dequantizelinear's input to remove Quantizelinear for weight as an optimization.
# In some cases, for example QDQ BERT model for TensorRT, QDQ should always appear as a pair.
# Therefore, we need to disable this optimization and add qdq pair to weight.
self.add_qdq_pair_to_weight = False if 'AddQDQPairToWeight' not in extra_options \
else extra_options['AddQDQPairToWeight']
self.add_qdq_pair_to_weight = (
False if "AddQDQPairToWeight" not in extra_options else extra_options["AddQDQPairToWeight"]
)
# The default behavior is that multiple nodes can share a QDQ pair as their inputs.
# In TRT, QDQ pair cant be shared between nodes, so it will create dedicated QDQ pairs for each node.
self.dedicated_qdq_pair = False if 'DedicatedQDQPair' not in extra_options else extra_options['DedicatedQDQPair']
# The default behavior is that multiple nodes can share a QDQ pair as their inputs.
# In TRT, QDQ pair cant be shared between nodes, so it will create dedicated QDQ pairs for each node.
self.dedicated_qdq_pair = (
False if "DedicatedQDQPair" not in extra_options else extra_options["DedicatedQDQPair"]
)
if self.dedicated_qdq_pair:
self.tensor_to_its_receiving_nodes = {}
# Let user set channel axis for specific op type and it's effective only when per channel quantization is supported and per_channel is True.
self.qdq_op_type_per_channel_support_to_axis = {} if 'QDQOpTypePerChannelSupportToAxis' not in extra_options else extra_options['QDQOpTypePerChannelSupportToAxis']
self.qdq_op_type_per_channel_support_to_axis = (
{}
if "QDQOpTypePerChannelSupportToAxis" not in extra_options
else extra_options["QDQOpTypePerChannelSupportToAxis"]
)
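
All four options read above are plain keys of the extra_options dict handed to the quantizer. For reference, a caller might supply something like the following; the values only illustrate the shapes each option expects, and the key spellings match the code as it stands:

extra_options = {
    "OpTypesToExcludeOutputQuantizatioin": ["Conv", "MatMul"],  # skip QDQ on these ops' outputs
    "AddQDQPairToWeight": True,            # keep float weights, add a Q/DQ pair around them
    "DedicatedQDQPair": True,              # one Q/DQ pair per consuming node (e.g. for TensorRT)
    "QDQOpTypePerChannelSupportToAxis": {"MatMul": 1},          # channel axis override per op type
}
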
def quantize_tensor(self, tensor_name):
weight = find_by_name(tensor_name, self.model.initializer())
@ -65,12 +114,14 @@ class QDQQuantizer(ONNXQuantizer):
self.tensors_to_quantize.append(tensor_name)
elif tensor_name in self.value_infos.keys():
vi = self.value_infos[tensor_name]
if vi.type.HasField('tensor_type') and vi.type.tensor_type.elem_type == TensorProto.FLOAT:
if vi.type.HasField("tensor_type") and vi.type.tensor_type.elem_type == TensorProto.FLOAT:
self.tensors_to_quantize.append(tensor_name)
else:
logging.warning(
"failed to infer the type of tensor: {}. Skip to quantize it. Please check if it is expected.".format(
tensor_name))
tensor_name
)
)
def quantize_tensor_per_channel(self, tensor_name, axis):
weight = find_by_name(tensor_name, self.model.initializer())
@ -80,10 +131,12 @@ class QDQQuantizer(ONNXQuantizer):
else:
logging.warning(
"only support per-channel quantization on weight. Quantize tensor: {} with per-tensor instead.".format(
tensor_name))
tensor_name
)
)
self.quantize_tensor(tensor_name)
def quantize_bias_tensor(self, bias_name, input_name, weight_name, beta = 1.0):
def quantize_bias_tensor(self, bias_name, input_name, weight_name, beta=1.0):
weight = find_by_name(bias_name, self.model.initializer())
if weight is not None:
if weight.data_type == onnx_proto.TensorProto.FLOAT:
@ -124,9 +177,11 @@ class QDQQuantizer(ONNXQuantizer):
return self.model.model
def try_replacing_upstream_output(self, upstream_output_name, output_name):
if output_name in self.quantization_params.keys() and \
len(self.model.input_name_to_nodes()[upstream_output_name]) == 1 and \
not self.model.is_graph_output(upstream_output_name):
if (
output_name in self.quantization_params.keys()
and len(self.model.input_name_to_nodes()[upstream_output_name]) == 1
and not self.model.is_graph_output(upstream_output_name)
):
self.model.replace_output_of_all_nodes(upstream_output_name, output_name)
self.tensors_to_quantize.remove(upstream_output_name)
return True
@ -141,25 +196,34 @@ class QDQQuantizer(ONNXQuantizer):
if initializer is not None:
if self.add_qdq_pair_to_weight:
q_weight_name, zp_name, scale_name = self.quantize_weight(initializer,
self.weight_qType,
keep_float_weight=True)
qlinear_node = onnx.helper.make_node("QuantizeLinear", [tensor_name, scale_name, zp_name],
[tensor_name + "_QuantizeLinear"],
tensor_name + "_QuantizeLinear")
dequant_node = onnx.helper.make_node("DequantizeLinear",
[tensor_name + "_QuantizeLinear", scale_name, zp_name],
[tensor_name + "_DequantizeLinear"],
tensor_name + "_DequantizeLinear")
q_weight_name, zp_name, scale_name = self.quantize_weight(
initializer, self.weight_qType, keep_float_weight=True
)
qlinear_node = onnx.helper.make_node(
"QuantizeLinear",
[tensor_name, scale_name, zp_name],
[tensor_name + "_QuantizeLinear"],
tensor_name + "_QuantizeLinear",
)
dequant_node = onnx.helper.make_node(
"DequantizeLinear",
[tensor_name + "_QuantizeLinear", scale_name, zp_name],
[tensor_name + "_DequantizeLinear"],
tensor_name + "_DequantizeLinear",
)
self.model.replace_input_of_all_nodes(tensor_name, tensor_name + "_DequantizeLinear")
self.model.add_nodes([qlinear_node, dequant_node])
else:
q_weight_name, zp_name, scale_name = self.quantize_weight(initializer, self.weight_qType)
inputs = [q_weight_name, scale_name, zp_name]
output_name = tensor_name + '_DequantizeLinear'
node = onnx.helper.make_node("DequantizeLinear", inputs, [output_name],
tensor_name + '_DequantizeLinear')
output_name = tensor_name + "_DequantizeLinear"
node = onnx.helper.make_node(
"DequantizeLinear",
inputs,
[output_name],
tensor_name + "_DequantizeLinear",
)
self.model.add_node(node)
self.model.replace_input_of_all_nodes(tensor_name, tensor_name + "_DequantizeLinear")
else:
@ -168,32 +232,49 @@ class QDQQuantizer(ONNXQuantizer):
if data_found == False:
raise ValueError(
"Quantization parameters are not specified for param {}."
"In static mode quantization params for inputs and outputs of nodes to be quantized are required."
.format(tensor_name))
"In static mode quantization params for inputs and outputs of nodes to be quantized are required.".format(
tensor_name
)
)
if self.dedicated_qdq_pair and tensor_name in self.tensor_to_its_receiving_nodes and len(self.tensor_to_its_receiving_nodes[tensor_name]) > 1:
if (
self.dedicated_qdq_pair
and tensor_name in self.tensor_to_its_receiving_nodes
and len(self.tensor_to_its_receiving_nodes[tensor_name]) > 1
):
num_dedicated_qdq_pair = len(self.tensor_to_its_receiving_nodes[tensor_name])
for i in range(num_dedicated_qdq_pair):
postfix = str(i+1)
postfix = str(i + 1)
q_input = tensor_name
q_output = tensor_name + "_QuantizeLinear_" + postfix
q_output = tensor_name + "_QuantizeLinear_" + postfix
dq_input = q_output
dq_output = tensor_name + "_DequantizeLinear_" + postfix
quant_node_name = tensor_name + "_QuantizeLinear_" + postfix
dequant_node_name = tensor_name + "_DequantizeLinear_" + postfix
qlinear_node = onnx.helper.make_node("QuantizeLinear", [q_input, scale_name, zp_name],
[q_output], quant_node_name)
dequant_node = onnx.helper.make_node("DequantizeLinear",
[dq_input, scale_name, zp_name],
[dq_output],
dequant_node_name)
qlinear_node = onnx.helper.make_node(
"QuantizeLinear",
[q_input, scale_name, zp_name],
[q_output],
quant_node_name,
)
dequant_node = onnx.helper.make_node(
"DequantizeLinear",
[dq_input, scale_name, zp_name],
[dq_output],
dequant_node_name,
)
self.model.add_nodes([qlinear_node, dequant_node])
node = self.tensor_to_its_receiving_nodes[tensor_name][i]
self.model.replace_node_input(node, tensor_name, dq_output)
quantized_value = QuantizedValue(tensor_name, dq_output, scale_name, zp_name,
QuantizedValueType.Input)
quantized_value = QuantizedValue(
tensor_name,
dq_output,
scale_name,
zp_name,
QuantizedValueType.Input,
)
self.quantized_value_map[tensor_name] = quantized_value
else:
q_input = tensor_name
@ -209,16 +290,27 @@ class QDQQuantizer(ONNXQuantizer):
quant_node_name = tensor_name + "_QuantizeLinear"
dequant_node_name = tensor_name + "_DequantizeLinear"
qlinear_node = onnx.helper.make_node("QuantizeLinear", [q_input, scale_name, zp_name],
[q_output], quant_node_name)
dequant_node = onnx.helper.make_node("DequantizeLinear",
[dq_input, scale_name, zp_name],
[dq_output],
dequant_node_name)
qlinear_node = onnx.helper.make_node(
"QuantizeLinear",
[q_input, scale_name, zp_name],
[q_output],
quant_node_name,
)
dequant_node = onnx.helper.make_node(
"DequantizeLinear",
[dq_input, scale_name, zp_name],
[dq_output],
dequant_node_name,
)
self.model.add_nodes([qlinear_node, dequant_node])
quantized_value = QuantizedValue(tensor_name, dq_output, scale_name, zp_name,
QuantizedValueType.Input)
quantized_value = QuantizedValue(
tensor_name,
dq_output,
scale_name,
zp_name,
QuantizedValueType.Input,
)
self.quantized_value_map[tensor_name] = quantized_value
def quantize_bias_tensors(self):
@ -231,13 +323,20 @@ class QDQQuantizer(ONNXQuantizer):
quant_value = self.quantized_value_map[bias_name]
inputs = [quant_value.q_name, quant_value.scale_name, quant_value.zp_name]
if quant_value.axis is not None:
dequant_node = onnx.helper.make_node("DequantizeLinear",
inputs, [bias_name],
bias_name + '_DequantizeLinear',
axis=quant_value.axis)
dequant_node = onnx.helper.make_node(
"DequantizeLinear",
inputs,
[bias_name],
bias_name + "_DequantizeLinear",
axis=quant_value.axis,
)
else:
dequant_node = onnx.helper.make_node("DequantizeLinear", inputs, [bias_name],
bias_name + '_DequantizeLinear')
dequant_node = onnx.helper.make_node(
"DequantizeLinear",
inputs,
[bias_name],
bias_name + "_DequantizeLinear",
)
self.model.add_node(dequant_node)
def quantize_weights_per_channel(self):
@ -245,31 +344,44 @@ class QDQQuantizer(ONNXQuantizer):
raise ValueError("Per-Channel support with QDQ format requires onnx opset version 13 or above.")
for weight_name, axis in self.tensors_to_quantize_per_channel:
if self.add_qdq_pair_to_weight:
q_name, zp_name, scale_name = self.quantize_weight_per_channel(weight_name, onnx_proto.TensorProto.INT8,
axis, keep_float_weight=True)
qlinear_node = onnx.helper.make_node("QuantizeLinear", [weight_name, scale_name, zp_name],
[weight_name + "_QuantizeLinear"],
weight_name + "_QuantizeLinear",
axis=axis)
dequant_node = onnx.helper.make_node("DequantizeLinear",
[weight_name + "_QuantizeLinear", scale_name, zp_name],
[weight_name + "_DequantizeLinear"],
weight_name + "_DequantizeLinear",
axis=axis)
q_name, zp_name, scale_name = self.quantize_weight_per_channel(
weight_name,
onnx_proto.TensorProto.INT8,
axis,
keep_float_weight=True,
)
qlinear_node = onnx.helper.make_node(
"QuantizeLinear",
[weight_name, scale_name, zp_name],
[weight_name + "_QuantizeLinear"],
weight_name + "_QuantizeLinear",
axis=axis,
)
dequant_node = onnx.helper.make_node(
"DequantizeLinear",
[weight_name + "_QuantizeLinear", scale_name, zp_name],
[weight_name + "_DequantizeLinear"],
weight_name + "_DequantizeLinear",
axis=axis,
)
self.model.replace_input_of_all_nodes(weight_name, weight_name + "_DequantizeLinear")
self.model.add_nodes([qlinear_node, dequant_node])
else:
#q_name, zp_name, scale_name = self.quantize_weight_per_channel(weight_name, self.weight_qType, axis)
q_name, zp_name, scale_name = self.quantize_weight_per_channel(weight_name, onnx_proto.TensorProto.INT8,
axis)
# q_name, zp_name, scale_name = self.quantize_weight_per_channel(weight_name, self.weight_qType, axis)
q_name, zp_name, scale_name = self.quantize_weight_per_channel(
weight_name, onnx_proto.TensorProto.INT8, axis
)
inputs = [q_name, scale_name, zp_name]
output_name = weight_name + "_DequantizeLinear"
node = onnx.helper.make_node("DequantizeLinear",
inputs, [output_name],
weight_name + '_DequantizeLinear',
axis=axis)
node = onnx.helper.make_node(
"DequantizeLinear",
inputs,
[output_name],
weight_name + "_DequantizeLinear",
axis=axis,
)
self.model.add_node(node)
# Replace weight_name with output of DequantizeLinear

View file

@ -1,14 +1,14 @@
import logging
import numpy
import onnx
import tempfile
from enum import Enum
from onnx import onnx_pb as onnx_proto
from onnx import external_data_helper
from pathlib import Path
from onnxruntime import SessionOptions, InferenceSession, GraphOptimizationLevel
import numpy
import onnx
from onnx import external_data_helper
from onnx import onnx_pb as onnx_proto
from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions
__producer__ = "onnx.quantize"
__version__ = "0.1.0"
@ -97,14 +97,17 @@ class QuantFormat(Enum):
except KeyError:
raise ValueError()
ONNX_TYPE_TO_NP_TYPE = {
onnx_proto.TensorProto.INT8: numpy.dtype('int8'),
onnx_proto.TensorProto.UINT8: numpy.dtype('uint8')
onnx_proto.TensorProto.INT8: numpy.dtype("int8"),
onnx_proto.TensorProto.UINT8: numpy.dtype("uint8"),
}
def quantize_nparray(qType, arr, scale, zero_point, low=None, high=None):
assert qType in ONNX_TYPE_TO_NP_TYPE, \
"Unexpected data type {} requested. Only INT8 and UINT8 are supported.".format(qType)
assert (
qType in ONNX_TYPE_TO_NP_TYPE
), "Unexpected data type {} requested. Only INT8 and UINT8 are supported.".format(qType)
dtype = ONNX_TYPE_TO_NP_TYPE[qType]
cliplow = max(0 if dtype == numpy.uint8 else -127, -127 if low is None else low)
cliphigh = min(255 if dtype == numpy.uint8 else 127, 255 if high is None else high)
@ -114,10 +117,10 @@ def quantize_nparray(qType, arr, scale, zero_point, low=None, high=None):
def compute_scale_zp(rmin, rmax, qmin, qmax, symmetric=False):
'''
Calculate the scale s and zero point z for the quantization relation
"""
Calculate the scale s and zero point z for the quantization relation
r = s(q-z), where r are the original values and q are the corresponding
quantized values.
quantized values.
r and z are calculated such that every value within [rmin,rmax] has an
approximate representation within [qmin,qmax]. In addition, qmin <= z <=
@ -131,8 +134,8 @@ def compute_scale_zp(rmin, rmax, qmin, qmax, symmetric=False):
:parameter qmax: maximum value representable by the target quantization data type
:return: zero and scale [z, s]
'''
"""
# Adjust rmin and rmax such that 0 is included in the range. This is
# required to make sure zero can be represented by the quantization data
# type (i.e. to make sure qmin <= zero_point <= qmax)
@ -144,21 +147,21 @@ def compute_scale_zp(rmin, rmax, qmin, qmax, symmetric=False):
rmin = -absmax
rmax = +absmax
scale = (rmax - rmin) / float(qmax-qmin) if rmax!=rmin else 1.0
zero_point = round(qmin - rmin/scale)
scale = (rmax - rmin) / float(qmax - qmin) if rmax != rmin else 1.0
zero_point = round(qmin - rmin / scale)
return [zero_point, scale]
def quantize_data(data, qType, symmetric, reduce_range=False):
'''
"""
:param data: data to quantize
:param qType: data type to quantize to. Supported types UINT8 and INT8
:param symmetric: whether symmetric quantization is used or not. This is applied to INT8.
:return: minimum, maximum, zero point, scale, and quantized weights
To pack weights, we compute a linear transformation
- when data `type == uint8` mode, from `[rmin, rmax]` -> :math:`[0, 2^{b-1}]` and
- when data `type == int8`, from `[-m , m]` -> :math:`[-(2^{b-1}-1), 2^{b-1}-1]` where
`m = max(abs(rmin), abs(rmax))`
@ -166,12 +169,12 @@ def quantize_data(data, qType, symmetric, reduce_range=False):
and add necessary intermediate nodes to trasnform quantized weight to full weight using the equation
:math:`r = S(q-z)`, where
- *r*: real original value
- *q*: quantized value
- *S*: scale
- *z*: zero point
'''
"""
rmin = 0
rmax = 0
@ -188,46 +191,52 @@ def quantize_data(data, qType, symmetric, reduce_range=False):
return rmin, rmax, zero_point, scale, quantized_data
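
A worked instance of the r = s(q - z) relation documented above, using the same formulas as compute_scale_zp in the non-symmetric uint8 case; the range values are made up:

rmin, rmax = -1.5, 3.0      # observed float range
qmin, qmax = 0, 255         # uint8 target range

scale = (rmax - rmin) / float(qmax - qmin) if rmax != rmin else 1.0
zero_point = round(qmin - rmin / scale)
print(zero_point, scale)    # 85, ~0.01765: q == 85 maps back to r == 0.0

r = 1.0
q = round(r / scale) + zero_point    # 142
print(scale * (q - zero_point))      # ~1.0059, round-trip error below one scale step
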
def get_qmin_qmax_for_qType(qType, reduce_range=False, symmetric=False):
'''
"""
Return qmin and qmax, the minimum and maximum value representable by the given qType
:parameter qType: onnx.onnx_pb.TensorProto.UINT8 or onnx.onnx_pb.TensorProto.UINT8
:return: qmin, qmax
'''
"""
if qType == onnx_proto.TensorProto.UINT8:
(qmin, qmax) = (0,127) if reduce_range else (0,255)
(qmin, qmax) = (0, 127) if reduce_range else (0, 255)
elif qType == onnx_proto.TensorProto.INT8:
if symmetric:
(qmin, qmax) = (-64,64) if reduce_range else (-127,127)
(qmin, qmax) = (-64, 64) if reduce_range else (-127, 127)
else:
(qmin, qmax) = (-64,64) if reduce_range else (-128,127)
(qmin, qmax) = (-64, 64) if reduce_range else (-128, 127)
else:
raise ValueError("Unexpected data type {} requested. Only INT8 and UINT8 are supported.".format(qType))
return qmin, qmax
def get_qrange_for_qType(qType, reduce_range=False, symmetric=False):
'''
"""
Helper function to get the quantization range for a type.
parameter qType: quantization type.
return: quantization range.
'''
"""
qmin, qmax = get_qmin_qmax_for_qType(qType, reduce_range, symmetric=symmetric)
return qmax - qmin
return qmax - qmin
class QuantizedInitializer:
'''
Represents a linearly quantized weight input from ONNX operators
'''
def __init__(self,
name,
initializer,
rmins,
rmaxs,
zero_points,
scales,
data=[],
quantized_data=[],
axis=None):
"""
Represents a linearly quantized weight input from ONNX operators
"""
def __init__(
self,
name,
initializer,
rmins,
rmaxs,
zero_points,
scales,
data=[],
quantized_data=[],
axis=None,
):
self.name = name
self.initializer = initializer # TensorProto initializer in ONNX graph
self.rmins = rmins # List of minimum range for each axis
@ -243,16 +252,19 @@ class QuantizedInitializer:
class QuantizedValue:
'''
"""
Represents a linearly quantized value (input\output\intializer)
'''
def __init__(self,
name,
new_quantized_name,
scale_name,
zero_point_name,
quantized_value_type,
axis=None):
"""
def __init__(
self,
name,
new_quantized_name,
scale_name,
zero_point_name,
quantized_value_type,
axis=None,
):
self.original_name = name
self.q_name = new_quantized_name
self.scale_name = scale_name
@ -262,9 +274,10 @@ class QuantizedValue:
class BiasToQuantize:
'''
"""
Represents a bias to be quantized
'''
"""
def __init__(self, bias_name, input_name, weight_name):
self.bias_name = bias_name
self.input_name = input_name
@ -272,57 +285,57 @@ class BiasToQuantize:
def attribute_to_kwarg(attribute):
'''
"""
Convert attribute to kwarg format for use with onnx.helper.make_node.
:parameter attribute: attribute in AttributeProto format.
:return: attribute in {key: value} format.
'''
if (attribute.type == 0):
raise ValueError('attribute {} does not have type specified.'.format(attribute.name))
"""
if attribute.type == 0:
raise ValueError("attribute {} does not have type specified.".format(attribute.name))
# Based on attribute type definitions from AttributeProto
# definition in https://github.com/onnx/onnx/blob/master/onnx/onnx.proto
if (attribute.type == 1):
if attribute.type == 1:
value = attribute.f
elif (attribute.type == 2):
elif attribute.type == 2:
value = attribute.i
elif (attribute.type == 3):
elif attribute.type == 3:
value = attribute.s
elif (attribute.type == 4):
elif attribute.type == 4:
value = attribute.t
elif (attribute.type == 5):
elif attribute.type == 5:
value = attribute.g
elif (attribute.type == 6):
elif attribute.type == 6:
value = attribute.floats
elif (attribute.type == 7):
elif attribute.type == 7:
value = attribute.ints
elif (attribute.type == 8):
elif attribute.type == 8:
value = attribute.strings
elif (attribute.type == 9):
elif attribute.type == 9:
value = attribute.tensors
elif (attribute.type == 10):
elif attribute.type == 10:
value = attribute.graphs
else:
raise ValueError('attribute {} has unsupported type {}.'.format(attribute.name, attribute.type))
raise ValueError("attribute {} has unsupported type {}.".format(attribute.name, attribute.type))
return {attribute.name: value}
def find_by_name(item_name, item_list):
'''
"""
Helper function to find item by name in a list.
parameter item_name: name of the item.
parameter item_list: list of items.
return: item if found. None otherwise.
'''
"""
items = [item for item in item_list if item.name == item_name]
return items[0] if len(items) > 0 else None
def get_elem_index(elem_name, elem_list):
'''
"""
Helper function to return index of an item in a node list
'''
"""
elem_idx = -1
for i in range(0, len(elem_list)):
if elem_list[i] == elem_name:
@ -331,50 +344,56 @@ def get_elem_index(elem_name, elem_list):
def get_mul_node(inputs, output, name):
'''
"""
Helper function to create a Mul node.
parameter inputs: list of input names.
parameter output: output name.
parameter name: name of the node.
return: Mul node in NodeProto format.
'''
"""
return onnx.helper.make_node("Mul", inputs, [output], name)
def generate_identified_filename(filename: Path, identifier: str) -> Path:
'''
Helper function to generate a identifiable filepath by concatenating the given identifier as a suffix.
'''
"""
Helper function to generate a identifiable filepath by concatenating the given identifier as a suffix.
"""
return filename.parent.joinpath(filename.stem + identifier).with_suffix(filename.suffix)
def apply_plot(hist, hist_edges):
import sys
import numpy
import matplotlib.pyplot as plt
import numpy
numpy.set_printoptions(threshold=sys.maxsize)
print("Histogram:")
print(hist)
print("Histogram Edges:")
print(hist_edges)
plt.stairs(hist, hist_edges, fill=True)
plt.xlabel('Tensor value')
plt.ylabel('Counts')
plt.title('Tensor value V.S. Counts')
plt.xlabel("Tensor value")
plt.ylabel("Counts")
plt.title("Tensor value V.S. Counts")
plt.show()
def write_calibration_table(calibration_cache):
'''
Helper function to write calibration table to files.
'''
"""
Helper function to write calibration table to files.
"""
import json
import flatbuffers
import onnxruntime.quantization.CalTableFlatBuffers.TrtTable as TrtTable
import onnxruntime.quantization.CalTableFlatBuffers.KeyValue as KeyValue
import onnxruntime.quantization.CalTableFlatBuffers.TrtTable as TrtTable
logging.info("calibration cache: {}".format(calibration_cache))
with open("calibration.json", 'w') as file:
with open("calibration.json", "w") as file:
file.write(json.dumps(calibration_cache)) # use `json.loads` to do the reverse
# Serialize data using FlatBuffers
@ -406,7 +425,7 @@ def write_calibration_table(calibration_cache):
builder.Finish(cal_table)
buf = builder.Output()
with open("calibration.flatbuffers", 'wb') as file:
with open("calibration.flatbuffers", "wb") as file:
file.write(buf)
# Deserialize data (for validation)
@ -419,12 +438,13 @@ def write_calibration_table(calibration_cache):
logging.info(key_value.Value())
# write plain text
with open("calibration.cache", 'w') as file:
with open("calibration.cache", "w") as file:
for key in sorted(calibration_cache.keys()):
value = calibration_cache[key]
s = key + ' ' + str(max(abs(value[0]), abs(value[1])))
s = key + " " + str(max(abs(value[0]), abs(value[1])))
file.write(s)
file.write('\n')
file.write("\n")
def smooth_distribution(p, eps=0.0001):
"""Given a discrete distribution (may have not been normalized to 1),
@ -444,7 +464,11 @@ def smooth_distribution(p, eps=0.0001):
# raise ValueError('The discrete probability distribution is malformed. All entries are 0.')
return -1
eps1 = eps * float(n_zeros) / float(n_nonzeros)
assert eps1 < 1.0, 'n_zeros=%d, n_nonzeros=%d, eps1=%f' % (n_zeros, n_nonzeros, eps1)
assert eps1 < 1.0, "n_zeros=%d, n_nonzeros=%d, eps1=%f" % (
n_zeros,
n_nonzeros,
eps1,
)
hist = p.astype(np.float32)
hist += eps * is_zeros + (-eps1) * is_nonzeros
@ -452,32 +476,36 @@ def smooth_distribution(p, eps=0.0001):
return hist
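
smooth_distribution adds a small eps of mass to every empty bin and removes the balancing eps1 from each non-empty bin, so no entry is exactly zero while the total count is preserved. A tiny self-contained check of that bookkeeping:

import numpy as np

p = np.array([0, 4, 0, 6], dtype=np.float32)
eps = 0.0001

is_zeros = (p == 0).astype(np.float32)
is_nonzeros = (p != 0).astype(np.float32)
n_zeros = int(is_zeros.sum())
n_nonzeros = p.size - n_zeros

eps1 = eps * float(n_zeros) / float(n_nonzeros)    # 0.0001 here
hist = p + eps * is_zeros + (-eps1) * is_nonzeros

print(hist)         # approximately [1e-04, 3.9999, 1e-04, 5.9999]
print(hist.sum())   # still 10.0 up to float32 rounding
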
def model_has_external_data(model_path : Path):
def model_has_external_data(model_path: Path):
model = onnx.load(model_path.as_posix(), load_external_data=False)
for intializer in model.graph.initializer:
if external_data_helper.uses_external_data(intializer):
return True
return False
def optimize_model(model_path : Path, opt_model_path : Path):
'''
def optimize_model(model_path: Path, opt_model_path: Path):
"""
Generate model that applies graph optimization (constant folding, etc.)
parameter model_path: path to the original onnx model
parameter opt_model_path: path to the optimized onnx model
:return: optimized onnx model
'''
"""
sess_option = SessionOptions()
sess_option.optimized_model_filepath = opt_model_path.as_posix()
sess_option.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC
_ = InferenceSession(model_path.as_posix(), sess_option, providers=['CPUExecutionProvider'])
_ = InferenceSession(model_path.as_posix(), sess_option, providers=["CPUExecutionProvider"])
def add_infer_metadata(model):
metadata_props = {"onnx.infer": "onnxruntime.quant"}
if model.metadata_props:
for p in model.metadata_props:
metadata_props.update({p.key : p.value})
metadata_props.update({p.key: p.value})
onnx.helper.set_model_props(model, metadata_props)
def model_has_infer_metadata(model):
if model.metadata_props:
for p in model.metadata_props:
@ -485,7 +513,8 @@ def model_has_infer_metadata(model):
return True
return False
def load_model_with_shape_infer(model_path : Path):
def load_model_with_shape_infer(model_path: Path):
inferred_model_path = generate_identified_filename(model_path, "-inferred")
onnx.shape_inference.infer_shapes_path(str(model_path), str(inferred_model_path))
model = onnx.load(inferred_model_path.as_posix())
@ -493,8 +522,8 @@ def load_model_with_shape_infer(model_path : Path):
return model
def load_model(model_path : Path, need_optimize : bool):
with tempfile.TemporaryDirectory(prefix='ort.quant.') as quant_tmp_dir:
def load_model(model_path: Path, need_optimize: bool):
with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir:
if need_optimize and not model_has_external_data(model_path):
opt_model_path = Path(quant_tmp_dir).joinpath("model.onnx")
optimize_model(model_path, opt_model_path)
@ -504,18 +533,19 @@ def load_model(model_path : Path, need_optimize : bool):
add_infer_metadata(model)
return model
def save_and_reload_model(model):
with tempfile.TemporaryDirectory(prefix='ort.quant.') as quant_tmp_dir:
with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir:
model_path = Path(quant_tmp_dir).joinpath("model.onnx")
onnx.external_data_helper.convert_model_to_external_data(model,
all_tensors_to_one_file=True)
onnx.external_data_helper.convert_model_to_external_data(model, all_tensors_to_one_file=True)
onnx.save_model(model, model_path.as_posix())
return load_model(model_path, False)
def clone_model_with_shape_infer(model):
if model_has_infer_metadata(model):
cloned_model = onnx_proto.ModelProto()
cloned_model.CopyFrom(model)
else:
cloned_model = save_and_reload_model(model)
return cloned_model
return cloned_model

View file

@ -5,52 +5,63 @@
# --------------------------------------------------------------------------
import logging
from pathlib import Path
from onnx import onnx_pb as onnx_proto
from .quant_utils import QuantizationMode, QuantizedValueType, QuantizedInitializer, QuantizedValue
from .quant_utils import find_by_name, get_elem_index, get_mul_node, generate_identified_filename, attribute_to_kwarg
from .quant_utils import QuantType, QuantFormat
from .quant_utils import load_model
from .registry import QLinearOpsRegistry, IntegerOpsRegistry
from .calibrate import CalibrationDataReader, CalibrationMethod, create_calibrator
from .onnx_model import ONNXModel
from .onnx_quantizer import ONNXQuantizer
from .qdq_quantizer import QDQQuantizer
from .calibrate import CalibrationDataReader, create_calibrator, CalibrationMethod
from .quant_utils import (
QuantFormat,
QuantizationMode,
QuantizedInitializer,
QuantizedValue,
QuantizedValueType,
QuantType,
attribute_to_kwarg,
find_by_name,
generate_identified_filename,
get_elem_index,
get_mul_node,
load_model,
)
from .registry import IntegerOpsRegistry, QLinearOpsRegistry
def check_static_quant_arguments(quant_format : QuantFormat,
activation_type : QuantType,
weight_type : QuantType):
def check_static_quant_arguments(quant_format: QuantFormat, activation_type: QuantType, weight_type: QuantType):
if activation_type == QuantType.QInt8 and weight_type == QuantType.QUInt8:
raise ValueError("ONNXRuntime quantization doesn't support data format:"
"activation_type=QuantType.QInt8, weight_type = QuantType.QUInt8")
raise ValueError(
"ONNXRuntime quantization doesn't support data format:"
"activation_type=QuantType.QInt8, weight_type = QuantType.QUInt8"
)
if activation_type == QuantType.QInt8 and \
weight_type == QuantType.QInt8 and \
quant_format != QuantFormat.QDQ: \
logging.warning("Please use QuantFormat.QDQ for activation type QInt8 and weight type QInt8. "
"Or it will lead to bad performance on x64.")
if activation_type == QuantType.QInt8 and weight_type == QuantType.QInt8 and quant_format != QuantFormat.QDQ:
logging.warning(
"Please use QuantFormat.QDQ for activation type QInt8 and weight type QInt8. "
"Or it will lead to bad performance on x64."
)
def quantize_static(model_input,
model_output,
calibration_data_reader: CalibrationDataReader,
quant_format=QuantFormat.QDQ,
op_types_to_quantize=[],
per_channel=False,
reduce_range=False,
activation_type=QuantType.QInt8,
weight_type=QuantType.QInt8,
nodes_to_quantize=[],
nodes_to_exclude=[],
optimize_model=True,
use_external_data_format=False,
calibrate_method=CalibrationMethod.MinMax,
extra_options = {}):
def quantize_static(
model_input,
model_output,
calibration_data_reader: CalibrationDataReader,
quant_format=QuantFormat.QDQ,
op_types_to_quantize=[],
per_channel=False,
reduce_range=False,
activation_type=QuantType.QInt8,
weight_type=QuantType.QInt8,
nodes_to_quantize=[],
nodes_to_exclude=[],
optimize_model=True,
use_external_data_format=False,
calibrate_method=CalibrationMethod.MinMax,
extra_options={},
):
'''
"""
Given an onnx model and calibration data reader, create a quantized onnx model and save it into a file
It is recommended to use QuantFormat.QDQ format from 1.11 with activation_type = QuantType.QInt8 and
@ -81,9 +92,9 @@ def quantize_static(model_input,
List of nodes names to exclude. The nodes in this list will be excluded from quantization
when it is not None.
:param optimize_model: optimize model before quantization.
:param use_external_data_format: option used for large size (>2GB) model. Set to False by default.
:param calibrate_method:
Current calibration methods supported are MinMax and Entropy.
:param use_external_data_format: option used for large size (>2GB) model. Set to False by default.
:param calibrate_method:
Current calibration methods supported are MinMax and Entropy.
Please use CalibrationMethod.MinMax or CalibrationMethod.Entropy as options.
:param extra_options:
key value pair dictionary for various options in different case. Current used:
@ -97,13 +108,13 @@ def quantize_static(model_input,
always quantize input and so generate quantized output. Also the True behavior
could be disabled per node using the nodes_to_exclude.
MatMulConstBOnly = True/False: Default is False for static mode. If enabled, only MatMul with const B will be quantized.
AddQDQPairToWeight = True/False : Default is False which quantizes floating-point weight and feeds it to
soley inserted DeQuantizeLinear node. If True, it remains floating-point weight and
AddQDQPairToWeight = True/False : Default is False which quantizes floating-point weight and feeds it to
soley inserted DeQuantizeLinear node. If True, it remains floating-point weight and
inserts both QuantizeLinear/DeQuantizeLinear nodes to weight.
OpTypesToExcludeOutputQuantizatioin = list of op type : Default is []. If any op type is specified, it won't quantize
OpTypesToExcludeOutputQuantizatioin = list of op type : Default is []. If any op type is specified, it won't quantize
the output of ops with this specific op types.
DedicatedQDQPair = True/False : Default is False. When inserting QDQ pair, multiple nodes can share a single QDQ pair as their inputs.
If True, it will create identical and dedicated QDQ pair for each node.
If True, it will create identical and dedicated QDQ pair for each node.
QDQOpTypePerChannelSupportToAxis = dictionary : Default is {}. Set channel axis for specific op type, for example: {'MatMul': 1},
and it's effective only when per channel quantization is supported and per_channel is True.
If specific op type supports per channel quantization but not explicitly specified with channel axis,
@ -114,7 +125,7 @@ def quantize_static(model_input,
CalibMovingAverageConstant = float : Default is 0.01. Constant smoothing factor to use when computing the moving average of
the minimum and maximum values. Effective only when the calibration method selected is
MinMax and when CalibMovingAverage is set to True.
'''
"""
mode = QuantizationMode.QLinearOps
@ -124,17 +135,19 @@ def quantize_static(model_input,
model = load_model(Path(model_input), optimize_model)
calib_extra_options_keys = [
('CalibTensorRangeSymmetric', 'symmetric'),
('CalibMovingAverage', 'moving_average'),
('CalibMovingAverageConstant', 'averaging_constant')
("CalibTensorRangeSymmetric", "symmetric"),
("CalibMovingAverage", "moving_average"),
("CalibMovingAverageConstant", "averaging_constant"),
]
calib_extra_options = {key: extra_options.get(name) for (name, key) in calib_extra_options_keys if name in extra_options}
calib_extra_options = {
key: extra_options.get(name) for (name, key) in calib_extra_options_keys if name in extra_options
}
calibrator = create_calibrator(
model,
op_types_to_quantize,
calibrate_method=calibrate_method,
use_external_data_format=use_external_data_format,
extra_options=calib_extra_options
extra_options=calib_extra_options,
)
calibrator.collect_data(calibration_data_reader)
tensors_range = calibrator.compute_range()
@ -154,7 +167,8 @@ def quantize_static(model_input,
nodes_to_quantize,
nodes_to_exclude,
op_types_to_quantize,
extra_options)
extra_options,
)
else:
quantizer = QDQQuantizer(
model,
@ -168,24 +182,27 @@ def quantize_static(model_input,
nodes_to_quantize,
nodes_to_exclude,
op_types_to_quantize,
extra_options)
extra_options,
)
quantizer.quantize_model()
quantizer.model.save_model_to_file(model_output, use_external_data_format)
def quantize_dynamic(model_input: Path,
model_output: Path,
op_types_to_quantize=[],
per_channel=False,
reduce_range=False,
weight_type=QuantType.QInt8,
nodes_to_quantize=[],
nodes_to_exclude=[],
optimize_model=True,
use_external_data_format=False,
extra_options = { }):
'''
def quantize_dynamic(
model_input: Path,
model_output: Path,
op_types_to_quantize=[],
per_channel=False,
reduce_range=False,
weight_type=QuantType.QInt8,
nodes_to_quantize=[],
nodes_to_exclude=[],
optimize_model=True,
use_external_data_format=False,
extra_options={},
):
"""
Given an onnx model, create a quantized onnx model and save it into a file
:param model_input: file path of model to quantize
:param model_output: file path of quantized model
@ -218,7 +235,7 @@ def quantize_dynamic(model_input: Path,
always quantize input and so generate quantized output. Also the True behavior
could be disabled per node using the nodes_to_exclude.
MatMulConstBOnly = True/False: Default is True for dynamic mode. If enabled, only MatMul with const B will be quantized.
'''
"""
mode = QuantizationMode.IntegerOps
@ -227,22 +244,23 @@ def quantize_dynamic(model_input: Path,
model = load_model(Path(model_input), optimize_model)
if 'MatMulConstBOnly' not in extra_options:
extra_options['MatMulConstBOnly'] = True
if "MatMulConstBOnly" not in extra_options:
extra_options["MatMulConstBOnly"] = True
quantizer = ONNXQuantizer(
model,
per_channel,
reduce_range,
mode,
False, #static
False, # static
weight_type,
QuantType.QUInt8, #dynamic activation only supports uint8
QuantType.QUInt8, # dynamic activation only supports uint8
None,
nodes_to_quantize,
nodes_to_exclude,
op_types_to_quantize,
extra_options)
extra_options,
)
quantizer.quantize_model()
quantizer.model.save_model_to_file(model_output, use_external_data_format)
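
These two entry points keep their public call pattern; black only reflows the signatures and docstrings. A minimal, hypothetical invocation is sketched below; the model paths, the input name, and the input shape are placeholders, not anything from this PR:

import numpy as np
from onnxruntime.quantization import CalibrationDataReader, QuantType, quantize_dynamic, quantize_static

# Dynamic quantization needs no calibration data.
quantize_dynamic("model.onnx", "model.dynamic.quant.onnx", weight_type=QuantType.QInt8)

# Static quantization needs a reader yielding representative input feeds.
class ToyDataReader(CalibrationDataReader):
    def __init__(self, feeds):
        self._iter = iter(feeds)

    def get_next(self):
        return next(self._iter, None)

feeds = [{"input": np.zeros((1, 3, 224, 224), dtype=np.float32)} for _ in range(8)]
quantize_static(
    "model.onnx",
    "model.static.quant.onnx",
    ToyDataReader(feeds),
    activation_type=QuantType.QInt8,
    weight_type=QuantType.QInt8,
)
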

View file

@ -1,28 +1,28 @@
from .quant_utils import QuantizationMode
from .operators.activation import QDQRemovableActivation, QLinearActivation
from .operators.argmax import QArgMax
from .operators.base_operator import QuantOperatorBase
from .operators.qdq_base_operator import QDQOperatorBase
from .operators.matmul import MatMulInteger, QLinearMatMul, QDQMatMul
from .operators.attention import AttentionQuant
from .operators.base_operator import QuantOperatorBase
from .operators.binary_op import QLinearBinaryOp
from .operators.concat import QDQConcat, QLinearConcat
from .operators.conv import ConvInteger, QDQConv, QLinearConv
from .operators.direct_q8 import Direct8BitOp, QDQDirect8BitOp
from .operators.embed_layernorm import EmbedLayerNormalizationQuant
from .operators.gather import GatherQuant
from .operators.conv import QLinearConv, ConvInteger, QDQConv
from .operators.activation import QLinearActivation, QDQRemovableActivation
from .operators.binary_op import QLinearBinaryOp
from .operators.maxpool import QDQMaxPool, QMaxPool
from .operators.gavgpool import QGlobalAveragePool
from .operators.gemm import QDQGemm, QLinearGemm
from .operators.lstm import LSTMQuant
from .operators.split import QSplit
from .operators.matmul import MatMulInteger, QDQMatMul, QLinearMatMul
from .operators.maxpool import QDQMaxPool, QMaxPool
from .operators.pad import QPad
from .operators.direct_q8 import Direct8BitOp, QDQDirect8BitOp
from .operators.resize import QResize, QDQResize
from .operators.pooling import QLinearPool
from .operators.concat import QLinearConcat, QDQConcat
from .operators.gemm import QLinearGemm, QDQGemm
from .operators.qdq_base_operator import QDQOperatorBase
from .operators.resize import QDQResize, QResize
from .operators.split import QSplit
from .quant_utils import QuantizationMode
CommonOpsRegistry = {
"Gather": GatherQuant,
"Transpose" : Direct8BitOp,
"Transpose": Direct8BitOp,
"EmbedLayerNormalization": EmbedLayerNormalizationQuant,
}
@ -50,10 +50,10 @@ QLinearOpsRegistry = {
"Split": QSplit,
"Pad": QPad,
"Reshape": Direct8BitOp,
"Squeeze" : Direct8BitOp,
"Unsqueeze" : Direct8BitOp,
"Squeeze": Direct8BitOp,
"Unsqueeze": Direct8BitOp,
"Resize": QResize,
"AveragePool" : QLinearPool,
"AveragePool": QLinearPool,
"Concat": QLinearConcat,
}
QLinearOpsRegistry.update(CommonOpsRegistry)
@ -64,12 +64,12 @@ QDQRegistry = {
"Clip": QDQRemovableActivation,
"Relu": QDQRemovableActivation,
"Reshape": QDQDirect8BitOp,
"Transpose" : QDQDirect8BitOp,
"Squeeze" : QDQDirect8BitOp,
"Unsqueeze" : QDQDirect8BitOp,
"Transpose": QDQDirect8BitOp,
"Squeeze": QDQDirect8BitOp,
"Unsqueeze": QDQDirect8BitOp,
"Resize": QDQResize,
"MaxPool": QDQMaxPool,
"AveragePool" : QDQDirect8BitOp,
"AveragePool": QDQDirect8BitOp,
"Concat": QDQConcat,
"MatMul": QDQMatMul,
}
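
The registries above are ordinary dicts keyed by ONNX op type, so the quantizers can pick an operator class with a plain lookup. A hypothetical dispatch helper; the fallback to QuantOperatorBase is an assumption about unlisted ops, and the module paths are inferred from the relative imports shown in this file:

from onnxruntime.quantization.operators.base_operator import QuantOperatorBase
from onnxruntime.quantization.registry import QLinearOpsRegistry

def create_op_quantizer(onnx_quantizer, node):
    # Fall back to the generic base class when the op type is not registered.
    op_cls = QLinearOpsRegistry.get(node.op_type, QuantOperatorBase)
    return op_cls(onnx_quantizer, node)
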

Diff not shown because the file is too large.

Diff not shown because the file is too large.

View file

@ -1,35 +1,49 @@
import os
import csv
import logging
import coloredlogs
import argparse
import copy
import csv
import json
import re
import logging
import os
import pprint
from perf_utils import *
import re
import coloredlogs
from benchmark import *
from perf_utils import *
def write_model_info_to_file(model, path):
with open(path, 'w') as file:
file.write(json.dumps(model)) # use `json.loads` to do the reverse
with open(path, "w") as file:
file.write(json.dumps(model)) # use `json.loads` to do the reverse
def get_ep_list(comparison):
if comparison == 'acl':
def get_ep_list(comparison):
if comparison == "acl":
ep_list = [cpu, acl]
else:
else:
# test with cuda and trt
ep_list = [cpu, cuda, trt, standalone_trt, cuda_fp16, trt_fp16, standalone_trt_fp16]
ep_list = [
cpu,
cuda,
trt,
standalone_trt,
cuda_fp16,
trt_fp16,
standalone_trt_fp16,
]
return ep_list
def resolve_trtexec_path(workspace):
def resolve_trtexec_path(workspace):
trtexec_options = get_output(["find", workspace, "-name", "trtexec"])
trtexec_path = re.search(r'.*/bin/trtexec', trtexec_options).group(0)
trtexec_path = re.search(r".*/bin/trtexec", trtexec_options).group(0)
logger.info("using trtexec {}".format(trtexec_path))
return trtexec_path
def dict_to_args(dct):
return ','.join(["{}={}".format(k, v) for k, v in dct.items()])
return ",".join(["{}={}".format(k, v) for k, v in dct.items()])
def main():
args = parse_arguments()
@ -42,7 +56,7 @@ def main():
else:
ep_list = get_ep_list(args.comparison)
if standalone_trt in ep_list or standalone_trt_fp16 in ep_list:
if standalone_trt in ep_list or standalone_trt_fp16 in ep_list:
trtexec = resolve_trtexec_path(args.workspace)
models = {}
@ -59,28 +73,35 @@ def main():
specs_csv = specs_name + csv_ending
for model, model_info in models.items():
logger.info("\n" + "="*40 + "="*len(model))
logger.info("="*20 + model +"="*20)
logger.info("="*40 + "="*len(model))
logger.info("\n" + "=" * 40 + "=" * len(model))
logger.info("=" * 20 + model + "=" * 20)
logger.info("=" * 40 + "=" * len(model))
model_info["model_name"] = model
model_list_file = os.path.join(os.getcwd(), model +'.json')
model_info["model_name"] = model
model_list_file = os.path.join(os.getcwd(), model + ".json")
write_model_info_to_file([model_info], model_list_file)
for ep in ep_list:
command = ["python3",
"benchmark.py",
"-r", args.running_mode,
"-m", model_list_file,
"-o", args.perf_result_path,
"--ep", ep,
"--write_test_result", "false"]
if ep == standalone_trt or ep == standalone_trt_fp16:
if args.running_mode == "validate":
continue
command = [
"python3",
"benchmark.py",
"-r",
args.running_mode,
"-m",
model_list_file,
"-o",
args.perf_result_path,
"--ep",
ep,
"--write_test_result",
"false",
]
if ep == standalone_trt or ep == standalone_trt_fp16:
if args.running_mode == "validate":
continue
else:
command.extend(["--trtexec", trtexec])
@ -92,20 +113,30 @@ def main():
if args.running_mode == "validate":
command.extend(["--benchmark_metrics_csv", benchmark_metrics_csv])
elif args.running_mode == "benchmark":
command.extend(["-t", str(args.test_times),
"-o", args.perf_result_path,
"--write_test_result", "false",
"--benchmark_fail_csv", benchmark_fail_csv,
"--benchmark_latency_csv", benchmark_latency_csv,
"--benchmark_success_csv", benchmark_success_csv])
command.extend(
[
"-t",
str(args.test_times),
"-o",
args.perf_result_path,
"--write_test_result",
"false",
"--benchmark_fail_csv",
benchmark_fail_csv,
"--benchmark_latency_csv",
benchmark_latency_csv,
"--benchmark_success_csv",
benchmark_success_csv,
]
)
p = subprocess.run(command)
logger.info(p)
if p.returncode != 0:
error_type = "runtime error"
error_type = "runtime error"
error_message = "Benchmark script exited with returncode = " + str(p.returncode)
logger.error(error_message)
update_fail_model_map(model_to_fail_ep, model, ep, error_type, error_message)
@ -117,6 +148,7 @@ def main():
path = os.path.join(os.getcwd(), args.perf_result_path)
if not os.path.exists(path):
from pathlib import Path
Path(path).mkdir(parents=True, exist_ok=True)
if args.running_mode == "validate":
@ -127,8 +159,8 @@ def main():
if os.path.exists(METRICS_FILE):
model_to_metrics = read_map_from_file(METRICS_FILE)
output_metrics(model_to_metrics, os.path.join(path, benchmark_metrics_csv))
logger.info("\nSaved model metrics results to {}".format(benchmark_metrics_csv))
logger.info("\nSaved model metrics results to {}".format(benchmark_metrics_csv))
elif args.running_mode == "benchmark":
logger.info("\n=========================================")
logger.info("======= Models/EPs session creation =======")
@ -138,8 +170,8 @@ def main():
model_to_session = read_map_from_file(SESSION_FILE)
pretty_print(pp, model_to_session)
output_session_creation(model_to_session, os.path.join(path, benchmark_session_csv))
logger.info("\nSaved session creation results to {}".format(benchmark_session_csv))
logger.info("\nSaved session creation results to {}".format(benchmark_session_csv))
logger.info("\n=========================================================")
logger.info("========== Failing Models/EPs (accumulated) ==============")
logger.info("==========================================================")
@ -148,7 +180,7 @@ def main():
model_to_fail_ep = read_map_from_file(FAIL_MODEL_FILE)
output_fail(model_to_fail_ep, os.path.join(path, benchmark_fail_csv))
logger.info(model_to_fail_ep)
logger.info("\nSaved model failing results to {}".format(benchmark_fail_csv))
logger.info("\nSaved model failing results to {}".format(benchmark_fail_csv))
logger.info("\n=======================================================")
logger.info("=========== Models/EPs Status (accumulated) ===========")
@ -163,11 +195,11 @@ def main():
model_fail = read_map_from_file(FAIL_MODEL_FILE)
is_fail = True
model_status = build_status(model_status, model_fail, is_fail)
pretty_print(pp, model_status)
output_status(model_status, os.path.join(path, benchmark_status_csv))
logger.info("\nSaved model status results to {}".format(benchmark_status_csv))
output_status(model_status, os.path.join(path, benchmark_status_csv))
logger.info("\nSaved model status results to {}".format(benchmark_status_csv))
logger.info("\n=========================================================")
logger.info("=========== Models/EPs latency (accumulated) ===========")
@ -176,11 +208,11 @@ def main():
if os.path.exists(LATENCY_FILE):
model_to_latency = read_map_from_file(LATENCY_FILE)
add_improvement_information(model_to_latency)
pretty_print(pp, model_to_latency)
output_latency(model_to_latency, os.path.join(path, benchmark_latency_csv))
logger.info("\nSaved model latency results to {}".format(benchmark_latency_csv))
logger.info("\nSaved model latency results to {}".format(benchmark_latency_csv))
logger.info("\n===========================================")
logger.info("=========== System information ===========")
@ -189,7 +221,8 @@ def main():
pretty_print(pp, info)
logger.info("\n")
output_specs(info, os.path.join(path, specs_csv))
logger.info("\nSaved hardware specs to {}".format(specs_csv))
logger.info("\nSaved hardware specs to {}".format(specs_csv))
if __name__ == "__main__":
main()


@ -1,46 +1,62 @@
import pandas as pd
import numpy as np
import argparse
ep_map = {"cpu": "CPU", "cuda":"CUDA","trt": "TRT EP","native": "Standalone TRT"}
import numpy as np
import pandas as pd
def parse_arguments():
ep_map = {"cpu": "CPU", "cuda": "CUDA", "trt": "TRT EP", "native": "Standalone TRT"}
def parse_arguments():
# create parser
parser = argparse.ArgumentParser()
parser.add_argument("-p", "--prev", required=True, help="previous csv")
parser.add_argument("-c", "--current", required=True, help="current csv")
parser.add_argument("-o", "--output_csv", required=True, help="output different csv")
parser.add_argument("--ep", required=False, default="trt", choices=["cpu", "cuda", "trt", "native"], help="ep to capture regressions on")
parser.add_argument("--tolerance", required=False, default=0, help="allowed tolerance for latency comparison")
parser.add_argument(
"--ep",
required=False,
default="trt",
choices=["cpu", "cuda", "trt", "native"],
help="ep to capture regressions on",
)
parser.add_argument(
"--tolerance",
required=False,
default=0,
help="allowed tolerance for latency comparison",
)
args = parser.parse_args()
return args
return args
def get_table_condition(table, fp, ep, tol):
def get_table_condition(table, fp, ep, tol):
ep = ep_map[ep]
col1 = ep + " " + fp + " \nmean (ms)_x"
col2 = ep + " " + fp + " \nmean (ms)_y"
condition = table[col1] > (table[col2] + tol)
return condition
def main():
args = parse_arguments()
a = pd.read_csv(args.prev)
b = pd.read_csv(args.current)
common = a.merge(b, on=['Model'])
common = a.merge(b, on=["Model"])
condition_fp32 = get_table_condition(common, "fp32", args.ep, args.tolerance)
condition_fp16 = get_table_condition(common, "fp16", args.ep, args.tolerance)
common['greater'] = np.where((condition_fp32 | condition_fp16), True, False)
greater = common[common['greater'] == True].drop(['greater'], axis=1)
common["greater"] = np.where((condition_fp32 | condition_fp16), True, False)
greater = common[common["greater"] == True].drop(["greater"], axis=1)
# arrange columns
keys = list(greater.keys().sort_values())
keys.insert(0, keys.pop(keys.index('Model')))
keys.insert(0, keys.pop(keys.index("Model")))
greater = greater[keys]
greater.to_csv(args.output_csv)
if __name__=='__main__':
if __name__ == "__main__":
main()
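A note on the comparison logic above: pandas appends the default "_x"/"_y" suffixes to columns shared by the two merged CSVs, which is why get_table_condition builds column names ending in "_x" (previous) and "_y" (current). A minimal sketch of that mechanic, with an illustrative column name and made-up numbers:

import pandas as pd

# Toy stand-ins for the previous and current latency reports.
prev = pd.DataFrame({"Model": ["m1", "m2"], "TRT fp32 mean (ms)": [10.0, 20.0]})
curr = pd.DataFrame({"Model": ["m1", "m2"], "TRT fp32 mean (ms)": [12.0, 15.0]})

common = prev.merge(curr, on=["Model"])  # shared columns become "..._x" (prev) and "..._y" (curr)
tolerance = 1.0
# Mirrors get_table_condition: flag rows where the previous mean exceeds the current mean plus tolerance.
condition = common["TRT fp32 mean (ms)_x"] > (common["TRT fp32 mean (ms)_y"] + tolerance)
print(common[condition]["Model"].tolist())  # ["m2"]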


@ -1,22 +1,30 @@
import pandas as pd
import argparse
def parse_arguments():
import pandas as pd
def parse_arguments():
# create parser
parser = argparse.ArgumentParser()
parser.add_argument("-p", "--prev", required=True, help="previous csv")
parser.add_argument("-c", "--current", required=True, help="current csv")
parser.add_argument("-o", "--output_csv", required=True, help="output different csv")
args = parser.parse_args()
return args
return args
def main():
args = parse_arguments()
a = pd.read_csv(args.prev)
b = pd.read_csv(args.current)
common = b.merge(a, on=['model','ep','error type','error message'])
diff = b.append(common, ignore_index=True).drop_duplicates(['model', 'ep', 'error type', 'error message'], keep=False).loc[:b.index.max()]
common = b.merge(a, on=["model", "ep", "error type", "error message"])
diff = (
b.append(common, ignore_index=True)
.drop_duplicates(["model", "ep", "error type", "error message"], keep=False)
.loc[: b.index.max()]
)
diff.to_csv(args.output_csv)
if __name__=='__main__':
if __name__ == "__main__":
main()
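The diff above is an append/drop_duplicates anti-join: rows of the current failure CSV that also occur in the previous one appear twice after appending and are removed by keep=False, so only new failures survive the trailing .loc slice. A small sketch of the same idea with made-up rows; pd.concat stands in for the now-deprecated DataFrame.append:

import pandas as pd

prev = pd.DataFrame(
    {"model": ["m1"], "ep": ["trt"], "error type": ["runtime error"], "error message": ["boom"]}
)
curr = pd.DataFrame(
    {
        "model": ["m1", "m2"],
        "ep": ["trt", "cuda"],
        "error type": ["runtime error", "runtime error"],
        "error message": ["boom", "oom"],
    }
)

common = curr.merge(prev, on=["model", "ep", "error type", "error message"])
new_failures = (
    pd.concat([curr, common], ignore_index=True)
    .drop_duplicates(["model", "ep", "error type", "error message"], keep=False)
    .loc[: curr.index.max()]
)
print(new_failures)  # only the m2 / cuda row remains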


@ -1,21 +1,22 @@
import subprocess
import json
import pprint
import logging
import coloredlogs
import pprint
import re
import subprocess
import sys
debug = False
debug_verbose = False
import coloredlogs
# ORT ep names
debug = False
debug_verbose = False
# ORT ep names
cpu_ep = "CPUExecutionProvider"
cuda_ep = "CUDAExecutionProvider"
trt_ep = "TensorrtExecutionProvider"
acl_ep = "ACLExecutionProvider"
# provider names
# provider names
cpu = "ORT-CPUFp32"
cuda = "ORT-CUDAFp32"
cuda_fp16 = "ORT-CUDAFp16"
@ -26,56 +27,70 @@ standalone_trt_fp16 = "TRTFp16"
acl = "ORT-ACLFp32"
# table names
metrics_name = 'metrics'
success_name = 'success'
fail_name = 'fail'
memory_name = 'memory'
latency_name = 'latency'
status_name = 'status'
latency_over_time_name = 'latency_over_time'
specs_name = 'specs'
session_name = 'session'
metrics_name = "metrics"
success_name = "success"
fail_name = "fail"
memory_name = "memory"
latency_name = "latency"
status_name = "status"
latency_over_time_name = "latency_over_time"
specs_name = "specs"
session_name = "session"
# column names
model_title = 'Model'
group_title = 'Group'
# column names
model_title = "Model"
group_title = "Group"
# endings
# endings
second = "_second"
csv_ending = '.csv'
avg_ending = ' \nmean (ms)'
percentile_ending = ' \n90th percentile (ms)'
memory_ending = ' \npeak memory usage (MiB)'
session_ending = ' \n session creation time (s)'
second_session_ending = ' \n second session creation time (s)'
csv_ending = ".csv"
avg_ending = " \nmean (ms)"
percentile_ending = " \n90th percentile (ms)"
memory_ending = " \npeak memory usage (MiB)"
session_ending = " \n session creation time (s)"
second_session_ending = " \n second session creation time (s)"
ort_provider_list = [cpu, cuda, trt, cuda_fp16, trt_fp16]
provider_list = [cpu, cuda, trt, standalone_trt, cuda_fp16, trt_fp16, standalone_trt_fp16]
provider_list = [
cpu,
cuda,
trt,
standalone_trt,
cuda_fp16,
trt_fp16,
standalone_trt_fp16,
]
table_headers = [model_title] + provider_list
# graph options
disable = 'disable'
basic = 'basic'
extended = 'extended'
enable_all = 'all'
# graph options
disable = "disable"
basic = "basic"
extended = "extended"
enable_all = "all"
def is_standalone(ep):
return ep == standalone_trt or ep == standalone_trt_fp16
def get_output(command):
p = subprocess.run(command, check=True, stdout=subprocess.PIPE)
output = p.stdout.decode("ascii").strip()
return output
def find(regex_string):
def find(regex_string):
import glob
results = glob.glob(regex_string)
results.sort()
return results
def pretty_print(pp, json_object):
pp.pprint(json_object)
sys.stdout.flush()
def parse_single_file(f):
try:
@ -86,7 +101,7 @@ def parse_single_file(f):
model_run_flag = False
first_run_flag = True
provider_op_map = {} # ep -> map of operator to duration
provider_op_map_first_run = {} # ep -> map of operator to duration
provider_op_map_first_run = {} # ep -> map of operator to duration
for row in data:
if not "cat" in row:
@ -134,20 +149,19 @@ def parse_single_file(f):
op_map[row["name"]] = row["dur"]
provider_op_map[provider] = op_map
if debug_verbose:
pprint._sorted = lambda x:x
pprint._sorted = lambda x: x
pprint.sorted = lambda x, key=None: x
pp = pprint.PrettyPrinter(indent=4)
print("------First run ops map (START)------")
for key, map in provider_op_map_first_run.items():
print(key)
print(key)
pp.pprint({k: v for k, v in sorted(map.items(), key=lambda item: item[1], reverse=True)})
print("------First run ops map (END) ------")
print("------Second run ops map (START)------")
for key, map in provider_op_map.items():
print(key)
print(key)
pp.pprint({k: v for k, v in sorted(map.items(), key=lambda item: item[1], reverse=True)})
print("------Second run ops map (END) ------")
@ -156,6 +170,7 @@ def parse_single_file(f):
return None
def calculate_cuda_op_percentage(cuda_op_map):
if not cuda_op_map or len(cuda_op_map) == 0:
return 0
@ -163,14 +178,15 @@ def calculate_cuda_op_percentage(cuda_op_map):
cuda_ops = 0
cpu_ops = 0
for key, value in cuda_op_map.items():
if key == 'CUDAExecutionProvider':
if key == "CUDAExecutionProvider":
cuda_ops += len(value)
if key == 'CPUExecutionProvider':
if key == "CPUExecutionProvider":
cpu_ops += len(value)
return cuda_ops / (cuda_ops + cpu_ops)
##########################################
# Return: total ops executed in TRT,
# total ops,
@ -208,6 +224,7 @@ def calculate_trt_op_percentage(trt_op_map, cuda_op_map):
return ((total_ops - total_cuda_and_cpu_ops), total_ops, ratio_of_ops_in_trt)
def get_total_ops(op_map):
total_ops = 0
@ -227,7 +244,11 @@ def calculate_trt_latency_percentage(trt_op_map):
# % of TRT execution time
total_execution_time = 0
total_trt_execution_time = 0
for ep in ["TensorrtExecutionProvider", "CUDAExecutionProvider", "CPUExecutionProvider"]:
for ep in [
"TensorrtExecutionProvider",
"CUDAExecutionProvider",
"CPUExecutionProvider",
]:
if ep in trt_op_map:
op_map = trt_op_map[ep]
@ -240,8 +261,6 @@ def calculate_trt_latency_percentage(trt_op_map):
total_execution_time += total_time
if total_execution_time == 0:
ratio_of_trt_execution_time = 0
else:
@ -257,7 +276,10 @@ def calculate_trt_latency_percentage(trt_op_map):
def get_profile_metrics(path, profile_already_parsed, logger=None):
logger.info("Parsing/Analyzing profiling files in {} ...".format(path))
p1 = subprocess.Popen(["find", path, "-name", "onnxruntime_profile*", "-printf", "%T+\t%p\n"], stdout=subprocess.PIPE)
p1 = subprocess.Popen(
["find", path, "-name", "onnxruntime_profile*", "-printf", "%T+\t%p\n"],
stdout=subprocess.PIPE,
)
p2 = subprocess.Popen(["sort"], stdin=p1.stdout, stdout=subprocess.PIPE)
stdout, sterr = p2.communicate()
stdout = stdout.decode("ascii").strip()
@ -266,7 +288,7 @@ def get_profile_metrics(path, profile_already_parsed, logger=None):
data = []
for profile in profiling_files:
profile = profile.split('\t')[1]
profile = profile.split("\t")[1]
if profile in profile_already_parsed:
continue
profile_already_parsed.add(profile)


@ -1,137 +1,171 @@
import argparse
import sys
import os
import pandas as pd
import sys
import time
import pandas as pd
from azure.kusto.data import KustoConnectionStringBuilder
from azure.kusto.data.data_format import DataFormat
from azure.kusto.data.helpers import dataframe_from_result_table
from azure.kusto.ingest import (
IngestionProperties,
ReportLevel,
QueuedIngestClient,
)
from azure.kusto.data.helpers import dataframe_from_result_table
from azure.kusto.ingest import IngestionProperties, QueuedIngestClient, ReportLevel
from perf_utils import *
# database connection strings
# database connection strings
cluster_ingest = "https://ingest-onnxruntimedashboarddb.southcentralus.kusto.windows.net"
database = "ep_perf_dashboard"
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument(
"-r", "--report_folder", help="Path to the local file report", required=True)
parser.add_argument(
"-c", "--commit_hash", help="Commit id", required=True)
parser.add_argument(
"-u", "--report_url", help="Report Url", required=True)
parser.add_argument(
"-t", "--trt_version", help="Tensorrt Version", required=True)
parser.add_argument(
"-b", "--branch", help="Branch", required=True)
parser.add_argument(
"-d", "--datetime", help="Commit Datetime", required=True)
parser.add_argument("-r", "--report_folder", help="Path to the local file report", required=True)
parser.add_argument("-c", "--commit_hash", help="Commit id", required=True)
parser.add_argument("-u", "--report_url", help="Report Url", required=True)
parser.add_argument("-t", "--trt_version", help="Tensorrt Version", required=True)
parser.add_argument("-b", "--branch", help="Branch", required=True)
parser.add_argument("-d", "--datetime", help="Commit Datetime", required=True)
return parser.parse_args()
def adjust_columns(table, columns, db_columns, model_group):
def adjust_columns(table, columns, db_columns, model_group):
table = table[columns]
table = table.set_axis(db_columns, axis=1)
table = table.assign(Group=model_group)
return table
return table
def get_latency_over_time(commit_hash, report_url, branch, latency_table):
if not latency_table.empty:
over_time = latency_table
over_time = over_time.melt(id_vars=[model_title, group_title], var_name='Ep', value_name='Latency')
over_time = over_time.melt(id_vars=[model_title, group_title], var_name="Ep", value_name="Latency")
over_time = over_time.assign(CommitId=commit_hash)
over_time = over_time.assign(ReportUrl=report_url)
over_time = over_time.assign(Branch=branch)
over_time = over_time[['CommitId', model_title, 'Ep', 'Latency', 'ReportUrl', group_title, 'Branch']]
over_time.fillna('', inplace=True)
over_time = over_time[
[
"CommitId",
model_title,
"Ep",
"Latency",
"ReportUrl",
group_title,
"Branch",
]
]
over_time.fillna("", inplace=True)
return over_time
def get_failures(fail, model_group):
fail_columns = fail.keys()
fail_db_columns = [model_title, 'Ep', 'ErrorType', 'ErrorMessage']
fail_db_columns = [model_title, "Ep", "ErrorType", "ErrorMessage"]
fail = adjust_columns(fail, fail_columns, fail_db_columns, model_group)
return fail
def get_memory(memory, model_group):
def get_memory(memory, model_group):
memory_columns = [model_title]
for provider in provider_list:
for provider in provider_list:
if cpu not in provider:
memory_columns.append(provider + memory_ending)
memory_db_columns = [model_title, cuda, trt, standalone_trt, cuda_fp16, trt_fp16, standalone_trt_fp16]
memory_db_columns = [
model_title,
cuda,
trt,
standalone_trt,
cuda_fp16,
trt_fp16,
standalone_trt_fp16,
]
memory = adjust_columns(memory, memory_columns, memory_db_columns, model_group)
return memory
def get_latency(latency, model_group):
latency_columns = [model_title]
for provider in provider_list:
for provider in provider_list:
latency_columns.append(provider + avg_ending)
latency_db_columns = table_headers
latency = adjust_columns(latency, latency_columns, latency_db_columns, model_group)
return latency
def get_status(status, model_group):
status_columns = status.keys()
status_db_columns = table_headers
status = adjust_columns(status, status_columns, status_db_columns, model_group)
return status
def get_specs(specs, branch, commit_id, date_time):
init_id = int(specs.tail(1).get('.', 0)) + 1
specs_additional = pd.DataFrame({'.': [init_id, init_id + 1, init_id + 2],
'Spec': ['Branch', 'CommitId', 'CommitTime'],
'Version': [branch, commit_id, date_time]})
init_id = int(specs.tail(1).get(".", 0)) + 1
specs_additional = pd.DataFrame(
{
".": [init_id, init_id + 1, init_id + 2],
"Spec": ["Branch", "CommitId", "CommitTime"],
"Version": [branch, commit_id, date_time],
}
)
return pd.concat([specs, specs_additional], ignore_index=True)
def get_session(session, model_group):
session_columns = session.keys()
session_db_columns = [model_title] + ort_provider_list + [p + second for p in ort_provider_list]
session = adjust_columns(session, session_columns, session_db_columns, model_group)
return session
def write_table(ingest_client, table, table_name, commit_time, identifier):
if table.empty:
return
table = table.assign(UploadTime=commit_time) # add Commit DateTime
table = table.assign(Identifier=identifier) # add Identifier
table = table.assign(UploadTime=commit_time) # add Commit DateTime
table = table.assign(Identifier=identifier) # add Identifier
ingestion_props = IngestionProperties(
database=database,
table=table_name,
data_format=DataFormat.CSV,
report_level=ReportLevel.FailuresAndSuccesses
database=database,
table=table_name,
data_format=DataFormat.CSV,
report_level=ReportLevel.FailuresAndSuccesses,
)
# append rows
ingest_client.ingest_from_dataframe(table, ingestion_properties=ingestion_props)
def get_time():
def get_time():
date_time = time.strftime(time_string_format)
return date_time
def get_identifier(date_time, commit_id, trt_version, branch):
date = date_time.split('T')[0] # extract date only
return date + '_' + commit_id + '_' + trt_version + '_' + branch
date = date_time.split("T")[0] # extract date only
return date + "_" + commit_id + "_" + trt_version + "_" + branch
def main():
args = parse_arguments()
# connect to database
kcsb_ingest = KustoConnectionStringBuilder.with_az_cli_authentication(cluster_ingest)
ingest_client = QueuedIngestClient(kcsb_ingest)
date_time = args.datetime
identifier = get_identifier(date_time, args.commit_hash, args.trt_version, args.branch)
try:
result_file = args.report_folder
folders = os.listdir(result_file)
os.chdir(result_file)
tables = [fail_name, memory_name, latency_name, status_name, latency_over_time_name, specs_name, session_name]
tables = [
fail_name,
memory_name,
latency_name,
status_name,
latency_over_time_name,
specs_name,
session_name,
]
table_results = {}
for table_name in tables:
table_results[table_name] = pd.DataFrame()
@ -142,26 +176,54 @@ def main():
for csv in csv_filenames:
table = pd.read_csv(csv)
if session_name in csv:
table_results[session_name] = table_results[session_name].append(get_session(table, model_group), ignore_index=True)
table_results[session_name] = table_results[session_name].append(
get_session(table, model_group), ignore_index=True
)
elif specs_name in csv:
table_results[specs_name] = table_results[specs_name].append(get_specs(table, args.branch, args.commit_hash, date_time), ignore_index=True)
table_results[specs_name] = table_results[specs_name].append(
get_specs(table, args.branch, args.commit_hash, date_time),
ignore_index=True,
)
elif fail_name in csv:
table_results[fail_name] = table_results[fail_name].append(get_failures(table, model_group), ignore_index=True)
table_results[fail_name] = table_results[fail_name].append(
get_failures(table, model_group), ignore_index=True
)
elif latency_name in csv:
table_results[memory_name] = table_results[memory_name].append(get_memory(table, model_group), ignore_index=True)
table_results[latency_name] = table_results[latency_name].append(get_latency(table, model_group), ignore_index=True)
table_results[latency_over_time_name] = table_results[latency_over_time_name].append(get_latency_over_time(args.commit_hash, args.report_url, args.branch, table_results[latency_name]), ignore_index=True)
table_results[memory_name] = table_results[memory_name].append(
get_memory(table, model_group), ignore_index=True
)
table_results[latency_name] = table_results[latency_name].append(
get_latency(table, model_group), ignore_index=True
)
table_results[latency_over_time_name] = table_results[latency_over_time_name].append(
get_latency_over_time(
args.commit_hash,
args.report_url,
args.branch,
table_results[latency_name],
),
ignore_index=True,
)
elif status_name in csv:
table_results[status_name] = table_results[status_name].append(get_status(table, model_group), ignore_index=True)
table_results[status_name] = table_results[status_name].append(
get_status(table, model_group), ignore_index=True
)
os.chdir(result_file)
for table in tables:
print('writing ' + table + ' to database')
db_table_name = 'ep_model_' + table
write_table(ingest_client, table_results[table], db_table_name, date_time, identifier)
for table in tables:
print("writing " + table + " to database")
db_table_name = "ep_model_" + table
write_table(
ingest_client,
table_results[table],
db_table_name,
date_time,
identifier,
)
except BaseException as e:
except BaseException as e:
print(str(e))
sys.exit(1)
if __name__ == "__main__":
main()


@ -1,17 +1,21 @@
import json
import os
import wget
import tarfile
import json
import wget
def get_tar_file(link):
file_name = link.split("/")[-1]
return file_name
def create_model_folder(model):
os.mkdir(model)
def extract_and_get_files(file_name):
model_folder = file_name.replace(".tar.gz", "") + '/'
model_folder = file_name.replace(".tar.gz", "") + "/"
create_model_folder(model_folder)
model_tar = tarfile.open(file_name)
model_tar.extractall(model_folder)
@ -20,21 +24,25 @@ def extract_and_get_files(file_name):
model_tar.close()
return model_folder, file_list
def download_model(link):
file_name = get_tar_file(link)
wget.download(link)
model_folder, file_list = extract_and_get_files(file_name)
return model_folder, file_list
def get_model_path(file_list):
for file_name in file_list:
if ".onnx" in file_name:
return file_name
def get_test_path(model_path):
model_filename = os.path.basename(model_path)
def get_test_path(model_path):
model_filename = os.path.basename(model_path)
test_path = model_path.split(model_filename)[0]
return test_path
return test_path
def create_model_object(model, folder, model_file_path, test_path):
model_dict = {}
@ -44,6 +52,7 @@ def create_model_object(model, folder, model_file_path, test_path):
model_dict["test_data_path"] = "./" + test_path
return model_dict
def get_model_info(link):
model_folder, file_list = download_model(link)
model = model_folder[:-1]
@ -52,20 +61,23 @@ def get_model_info(link):
model_info = create_model_object(model, model_folder, model_file_path, test_path)
return model_info
def write_json(models):
model_json = json.dumps(models, indent=4)
with open('model_list.json', 'w') as fp:
def write_json(models):
model_json = json.dumps(models, indent=4)
with open("model_list.json", "w") as fp:
fp.write(model_json)
def main():
links = []
with open('links.txt', 'r') as fh:
with open("links.txt", "r") as fh:
links = [link.rstrip() for link in fh.readlines()]
model_list = []
for link in links:
model_list.append(get_model_info(link))
write_json(model_list)
if __name__ == "__main__":
main()


@ -1,13 +1,14 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
# --------------------------------------------------------------------------
import sys
import os
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), 'models', 'gpt2'))
sys.path.append(os.path.join(os.path.dirname(__file__), "models", "gpt2"))
import convert_to_onnx
# added for backward compatible
import gpt2_helper
import convert_to_onnx


@ -1,7 +1,7 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
# --------------------------------------------------------------------------
# Get/Set cpu affinity. Currently only support part of Unix system
import logging
@ -10,11 +10,11 @@ import os
logger = logging.getLogger(__name__)
class AffinitySetting():
class AffinitySetting:
def __init__(self):
self.pid = os.getpid()
self.affinity = None
self.is_os_supported = hasattr(os, 'sched_getaffinity') and hasattr(os, 'sched_setaffinity')
self.is_os_supported = hasattr(os, "sched_getaffinity") and hasattr(os, "sched_setaffinity")
if not self.is_os_supported:
logger.warning("Current OS does not support os.get_affinity() and os.set_affinity()")
@ -25,12 +25,16 @@ class AffinitySetting():
def set_affinity(self):
if self.is_os_supported:
current_affinity = os.sched_getaffinity(self.pid)
if (self.affinity != current_affinity):
logger.warning("Replacing affinity setting %s with %s", str(current_affinity), str(self.affinity))
if self.affinity != current_affinity:
logger.warning(
"Replacing affinity setting %s with %s",
str(current_affinity),
str(self.affinity),
)
os.sched_setaffinity(self.pid, self.affinity)
if __name__ == '__main__':
if __name__ == "__main__":
affi_helper = AffinitySetting()
affi_helper.get_affinity()
affi_helper.set_affinity()


@ -42,24 +42,40 @@
import argparse
import logging
import os
import timeit
from datetime import datetime
import numpy
import os
import psutil
import onnx
from enum import Enum
from benchmark_helper import (OptimizerInfo, create_onnxruntime_session, Precision, setup_logger, get_latency_result,
output_details, output_summary, output_fusion_statistics, inference_ort,
inference_ort_with_io_binding, allocateOutputBuffers, ConfigModifier)
import numpy
import onnx
import psutil
from benchmark_helper import (
ConfigModifier,
OptimizerInfo,
Precision,
allocateOutputBuffers,
create_onnxruntime_session,
get_latency_result,
inference_ort,
inference_ort_with_io_binding,
output_details,
output_fusion_statistics,
output_summary,
setup_logger,
)
from fusion_options import FusionOptions
from onnx_exporter import (
create_onnxruntime_input,
export_onnx_model_from_pt,
export_onnx_model_from_tf,
load_pretrained_model,
)
from quantize_helper import QuantizeHelper
from onnx_exporter import create_onnxruntime_input, load_pretrained_model, export_onnx_model_from_pt, export_onnx_model_from_tf
logger = logging.getLogger('')
logger = logging.getLogger("")
from huggingface_models import MODELS, MODEL_CLASSES
from huggingface_models import MODEL_CLASSES, MODELS
cpu_count = psutil.cpu_count(logical=False)
@ -68,35 +84,60 @@ if "OMP_NUM_THREADS" not in os.environ:
os.environ["OMP_NUM_THREADS"] = str(cpu_count)
import torch
from transformers import (AutoConfig, AutoTokenizer, AutoModel, GPT2Model, LxmertConfig)
from transformers import AutoConfig, AutoModel, AutoTokenizer, GPT2Model, LxmertConfig
def run_onnxruntime(use_gpu, provider, model_names, model_class, config_modifier, precision, num_threads, batch_sizes,
sequence_lengths, repeat_times, input_counts, optimizer_info, validate_onnx, cache_dir, onnx_dir,
verbose, overwrite, disable_ort_io_binding, use_raw_attention_mask, model_fusion_statistics,
model_source, args):
def run_onnxruntime(
use_gpu,
provider,
model_names,
model_class,
config_modifier,
precision,
num_threads,
batch_sizes,
sequence_lengths,
repeat_times,
input_counts,
optimizer_info,
validate_onnx,
cache_dir,
onnx_dir,
verbose,
overwrite,
disable_ort_io_binding,
use_raw_attention_mask,
model_fusion_statistics,
model_source,
args,
):
import onnxruntime
results = []
if (use_gpu and ('CUDAExecutionProvider' not in onnxruntime.get_available_providers())
and ('ROCMExecutionProvider' not in onnxruntime.get_available_providers())):
if (
use_gpu
and ("CUDAExecutionProvider" not in onnxruntime.get_available_providers())
and ("ROCMExecutionProvider" not in onnxruntime.get_available_providers())
):
logger.error(
"Please install onnxruntime-gpu package instead of onnxruntime, and use a machine with GPU for testing gpu performance."
)
return results
warm_up_repeat = 0
if provider == 'tensorrt':
if provider == "tensorrt":
optimizer_info = OptimizerInfo.NOOPT
warm_up_repeat = 5
if 'TensorrtExecutionProvider' not in onnxruntime.get_available_providers():
if "TensorrtExecutionProvider" not in onnxruntime.get_available_providers():
logger.error(
"Please install onnxruntime-gpu-tensorrt package, and use a machine with GPU for testing gpu performance."
)
return results
if optimizer_info == OptimizerInfo.NOOPT:
logger.warning(f"OptimizerInfo is set to {optimizer_info}, graph optimizations specified in FusionOptions are not applied.")
logger.warning(
f"OptimizerInfo is set to {optimizer_info}, graph optimizations specified in FusionOptions are not applied."
)
for model_name in model_names:
all_input_names = MODELS[model_name][0]
@ -108,27 +149,64 @@ def run_onnxruntime(use_gpu, provider, model_names, model_class, config_modifier
args.model_type = MODELS[model_name][3]
fusion_options = FusionOptions.parse(args)
if 'pt' in model_source:
if "pt" in model_source:
with torch.no_grad():
onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length = export_onnx_model_from_pt(
model_name, MODELS[model_name][1], MODELS[model_name][2], MODELS[model_name][3], model_class,
config_modifier, cache_dir, onnx_dir, input_names, use_gpu, precision, optimizer_info,
validate_onnx, use_raw_attention_mask, overwrite, model_fusion_statistics, fusion_options)
if 'tf' in model_source:
onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length = export_onnx_model_from_tf(
model_name, MODELS[model_name][1], MODELS[model_name][2], MODELS[model_name][3], model_class,
config_modifier, cache_dir, onnx_dir, input_names, use_gpu, precision, optimizer_info,
validate_onnx, use_raw_attention_mask, overwrite, model_fusion_statistics, fusion_options)
(
onnx_model_file,
is_valid_onnx_model,
vocab_size,
max_sequence_length,
) = export_onnx_model_from_pt(
model_name,
MODELS[model_name][1],
MODELS[model_name][2],
MODELS[model_name][3],
model_class,
config_modifier,
cache_dir,
onnx_dir,
input_names,
use_gpu,
precision,
optimizer_info,
validate_onnx,
use_raw_attention_mask,
overwrite,
model_fusion_statistics,
fusion_options,
)
if "tf" in model_source:
(onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length,) = export_onnx_model_from_tf(
model_name,
MODELS[model_name][1],
MODELS[model_name][2],
MODELS[model_name][3],
model_class,
config_modifier,
cache_dir,
onnx_dir,
input_names,
use_gpu,
precision,
optimizer_info,
validate_onnx,
use_raw_attention_mask,
overwrite,
model_fusion_statistics,
fusion_options,
)
if not is_valid_onnx_model:
continue
ort_session = create_onnxruntime_session(onnx_model_file,
use_gpu,
provider,
enable_all_optimization=True,
num_threads=num_threads,
verbose=verbose)
ort_session = create_onnxruntime_session(
onnx_model_file,
use_gpu,
provider,
enable_all_optimization=True,
num_threads=num_threads,
verbose=verbose,
)
if ort_session is None:
continue
@ -137,8 +215,12 @@ def run_onnxruntime(use_gpu, provider, model_names, model_class, config_modifier
device = "cuda" if use_gpu else "cpu"
config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir)
max_last_state_size = numpy.prod(
[max(batch_sizes), max(sequence_lengths),
max(vocab_size, config.hidden_size)])
[
max(batch_sizes),
max(sequence_lengths),
max(vocab_size, config.hidden_size),
]
)
max_pooler_size = numpy.prod([max(batch_sizes), config.hidden_size])
for batch_size in batch_sizes:
if batch_size <= 0:
@ -147,9 +229,15 @@ def run_onnxruntime(use_gpu, provider, model_names, model_class, config_modifier
if max_sequence_length is not None and sequence_length > max_sequence_length:
continue
input_value_type = numpy.int64 if 'pt' in model_source else numpy.int32
ort_inputs = create_onnxruntime_input(vocab_size, batch_size, sequence_length, input_names, config,
input_value_type)
input_value_type = numpy.int64 if "pt" in model_source else numpy.int32
ort_inputs = create_onnxruntime_input(
vocab_size,
batch_size,
sequence_length,
input_names,
config,
input_value_type,
)
result_template = {
"engine": "onnxruntime",
"version": onnxruntime.__version__,
@ -167,12 +255,19 @@ def run_onnxruntime(use_gpu, provider, model_names, model_class, config_modifier
"datetime": str(datetime.now()),
}
logger.info("Run onnxruntime on {} with input shape {}".format(model_name,
[batch_size, sequence_length]))
logger.info(
"Run onnxruntime on {} with input shape {}".format(model_name, [batch_size, sequence_length])
)
if disable_ort_io_binding:
result = inference_ort(ort_session, ort_inputs, result_template, repeat_times, batch_size,
warm_up_repeat)
result = inference_ort(
ort_session,
ort_inputs,
result_template,
repeat_times,
batch_size,
warm_up_repeat,
)
else:
# Get output sizes from a dummy ort run
ort_outputs = ort_session.run(ort_output_names, ort_inputs)
@ -184,19 +279,41 @@ def run_onnxruntime(use_gpu, provider, model_names, model_class, config_modifier
else:
output_buffer_max_sizes.append(max_last_state_size)
data_type = numpy.longlong if 'pt' in model_source else numpy.intc
result = inference_ort_with_io_binding(ort_session, ort_inputs, result_template, repeat_times,
ort_output_names, ort_outputs, output_buffers,
output_buffer_max_sizes, batch_size, device, data_type,
warm_up_repeat)
data_type = numpy.longlong if "pt" in model_source else numpy.intc
result = inference_ort_with_io_binding(
ort_session,
ort_inputs,
result_template,
repeat_times,
ort_output_names,
ort_outputs,
output_buffers,
output_buffer_max_sizes,
batch_size,
device,
data_type,
warm_up_repeat,
)
logger.info(result)
results.append(result)
return results
def run_pytorch(use_gpu, model_names, model_class, config_modifier, precision, num_threads, batch_sizes,
sequence_lengths, repeat_times, torchscript, cache_dir, verbose):
def run_pytorch(
use_gpu,
model_names,
model_class,
config_modifier,
precision,
num_threads,
batch_sizes,
sequence_lengths,
repeat_times,
torchscript,
cache_dir,
verbose,
):
results = []
if use_gpu and not torch.cuda.is_available():
logger.error("Please install PyTorch with Cuda, and use a machine with GPU for testing gpu performance.")
@ -207,11 +324,17 @@ def run_pytorch(use_gpu, model_names, model_class, config_modifier, precision, n
for model_name in model_names:
config = AutoConfig.from_pretrained(model_name, torchscript=torchscript, cache_dir=cache_dir)
config_modifier.modify(config)
model = load_pretrained_model(model_name, config=config, cache_dir=cache_dir, custom_model_class=model_class)
model = load_pretrained_model(
model_name,
config=config,
cache_dir=cache_dir,
custom_model_class=model_class,
)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
max_input_size = tokenizer.max_model_input_sizes[
model_name] if model_name in tokenizer.max_model_input_sizes else 1024
max_input_size = (
tokenizer.max_model_input_sizes[model_name] if model_name in tokenizer.max_model_input_sizes else 1024
)
logger.debug(f"Model {model}")
logger.debug(f"Number of parameters {model.num_parameters()}")
@ -234,11 +357,13 @@ def run_pytorch(use_gpu, model_names, model_class, config_modifier, precision, n
continue
logger.info("Run PyTorch on {} with input shape {}".format(model_name, [batch_size, sequence_length]))
input_ids = torch.randint(low=0,
high=config.vocab_size - 1,
size=(batch_size, sequence_length),
dtype=torch.long,
device=device)
input_ids = torch.randint(
low=0,
high=config.vocab_size - 1,
size=(batch_size, sequence_length),
dtype=torch.long,
device=device,
)
try:
inference = torch.jit.trace(model, input_ids) if torchscript else model
inference(input_ids)
@ -272,9 +397,10 @@ def run_pytorch(use_gpu, model_names, model_class, config_modifier, precision, n
def run_with_tf_optimizations(do_eager_mode: bool, use_xla: bool):
import tensorflow as tf
from functools import wraps
import tensorflow as tf
def run_func(func):
@wraps(func)
def run_in_eager_mode(*args, **kwargs):
@ -296,26 +422,38 @@ def run_with_tf_optimizations(do_eager_mode: bool, use_xla: bool):
return run_func
def run_tensorflow(use_gpu, model_names, model_class, config_modifier, precision, num_threads, batch_sizes,
sequence_lengths, repeat_times, cache_dir, verbose):
def run_tensorflow(
use_gpu,
model_names,
model_class,
config_modifier,
precision,
num_threads,
batch_sizes,
sequence_lengths,
repeat_times,
cache_dir,
verbose,
):
results = []
import tensorflow as tf
tf.config.threading.set_intra_op_parallelism_threads(num_threads)
if not use_gpu:
tf.config.set_visible_devices([], 'GPU')
tf.config.set_visible_devices([], "GPU")
if use_gpu and not tf.test.is_built_with_cuda():
logger.error("Please install Tensorflow-gpu, and use a machine with GPU for testing gpu performance.")
return results
if use_gpu: # Restrict TensorFlow to only use the first GPU
physical_devices = tf.config.list_physical_devices('GPU')
physical_devices = tf.config.list_physical_devices("GPU")
try:
tf.config.set_visible_devices(physical_devices[0], 'GPU')
tf.config.set_visible_devices(physical_devices[0], "GPU")
tf.config.experimental.set_memory_growth(physical_devices[0], True)
tf.distribute.OneDeviceStrategy(device='/gpu:0')
tf.distribute.OneDeviceStrategy(device="/gpu:0")
except RuntimeError as e:
logger.exception(e)
@ -326,16 +464,19 @@ def run_tensorflow(use_gpu, model_names, model_class, config_modifier, precision
config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir)
config_modifier.modify(config)
model = load_pretrained_model(model_name,
config=config,
cache_dir=cache_dir,
custom_model_class=model_class,
is_tf_model=True)
model = load_pretrained_model(
model_name,
config=config,
cache_dir=cache_dir,
custom_model_class=model_class,
is_tf_model=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
max_input_size = tokenizer.max_model_input_sizes[
model_name] if model_name in tokenizer.max_model_input_sizes else 1024
max_input_size = (
tokenizer.max_model_input_sizes[model_name] if model_name in tokenizer.max_model_input_sizes else 1024
)
for batch_size in batch_sizes:
if batch_size <= 0:
@ -345,10 +486,12 @@ def run_tensorflow(use_gpu, model_names, model_class, config_modifier, precision
if max_input_size is not None and sequence_length > max_input_size:
continue
logger.info("Run Tensorflow on {} with input shape {}".format(model_name,
[batch_size, sequence_length]))
logger.info(
"Run Tensorflow on {} with input shape {}".format(model_name, [batch_size, sequence_length])
)
import random
rng = random.Random()
values = [rng.randint(0, config.vocab_size - 1) for i in range(batch_size * sequence_length)]
input_ids = tf.constant(values, shape=(batch_size, sequence_length), dtype=tf.int32)
@ -367,7 +510,12 @@ def run_tensorflow(use_gpu, model_names, model_class, config_modifier, precision
def lxmert_forward():
feats = tf.random.normal([1, 1, config.visual_feat_dim])
pos = tf.random.normal([1, 1, config.visual_pos_dim])
return model(input_ids, visual_feats=feats, visual_pos=pos, training=False)
return model(
input_ids,
visual_feats=feats,
visual_pos=pos,
training=False,
)
inference = encoder_forward
if config.is_encoder_decoder:
@ -401,6 +549,7 @@ def run_tensorflow(use_gpu, model_names, model_class, config_modifier, precision
except RuntimeError as e:
logger.exception(e)
from numba import cuda
device = cuda.get_current_device()
device.reset()
@ -410,55 +559,73 @@ def run_tensorflow(use_gpu, model_names, model_class, config_modifier, precision
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument("-m",
"--models",
required=False,
nargs="+",
type=str,
default=["bert-base-cased", "roberta-base", "gpt2"],
choices=list(MODELS.keys()),
help="Pre-trained models in the list: " + ", ".join(MODELS.keys()))
parser.add_argument(
"-m",
"--models",
required=False,
nargs="+",
type=str,
default=["bert-base-cased", "roberta-base", "gpt2"],
choices=list(MODELS.keys()),
help="Pre-trained models in the list: " + ", ".join(MODELS.keys()),
)
parser.add_argument("--model_source",
required=False,
nargs=1,
type=str,
default='pt',
choices=['pt', 'tf'],
help="Export onnx from pt or tf")
parser.add_argument(
"--model_source",
required=False,
nargs=1,
type=str,
default="pt",
choices=["pt", "tf"],
help="Export onnx from pt or tf",
)
parser.add_argument('--model_class',
required=False,
type=str,
default=None,
choices=list(MODEL_CLASSES),
help='Model type selected in the list: ' + ', '.join(MODEL_CLASSES))
parser.add_argument(
"--model_class",
required=False,
type=str,
default=None,
choices=list(MODEL_CLASSES),
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES),
)
parser.add_argument("-e",
"--engines",
required=False,
nargs="+",
type=str,
default=['onnxruntime'],
choices=['onnxruntime', 'torch', 'torchscript', 'tensorflow'],
help="Engines to benchmark")
parser.add_argument(
"-e",
"--engines",
required=False,
nargs="+",
type=str,
default=["onnxruntime"],
choices=["onnxruntime", "torch", "torchscript", "tensorflow"],
help="Engines to benchmark",
)
parser.add_argument("-c",
"--cache_dir",
required=False,
type=str,
default=os.path.join('.', 'cache_models'),
help="Directory to cache pre-trained models")
parser.add_argument(
"-c",
"--cache_dir",
required=False,
type=str,
default=os.path.join(".", "cache_models"),
help="Directory to cache pre-trained models",
)
parser.add_argument("--onnx_dir",
required=False,
type=str,
default=os.path.join('.', 'onnx_models'),
help="Directory to store onnx models")
parser.add_argument(
"--onnx_dir",
required=False,
type=str,
default=os.path.join(".", "onnx_models"),
help="Directory to store onnx models",
)
parser.add_argument("-g", "--use_gpu", required=False, action="store_true", help="Run on gpu device")
parser.add_argument("--provider", required=False, type=str, default=None, help="Execution provider to use")
parser.add_argument(
"--provider",
required=False,
type=str,
default=None,
help="Execution provider to use",
)
parser.add_argument(
"-p",
@ -466,11 +633,17 @@ def parse_arguments():
type=Precision,
default=Precision.FLOAT32,
choices=list(Precision),
help="Precision of model to run. fp32 for full precision, fp16 for half precision, and int8 for quantization")
help="Precision of model to run. fp32 for full precision, fp16 for half precision, and int8 for quantization",
)
parser.add_argument("--verbose", required=False, action="store_true", help="Print more information")
parser.add_argument("--overwrite", required=False, action="store_true", help="Overwrite existing models")
parser.add_argument(
"--overwrite",
required=False,
action="store_true",
help="Overwrite existing models",
)
parser.add_argument(
"-o",
@ -478,54 +651,96 @@ def parse_arguments():
type=OptimizerInfo,
default=OptimizerInfo.BYSCRIPT,
choices=list(OptimizerInfo),
help="Optimizer info: Use optimizer.py to optimize onnx model as default. Can also choose from by_ort and no_opt"
help="Optimizer info: Use optimizer.py to optimize onnx model as default. Can also choose from by_ort and no_opt",
)
parser.add_argument("-v", "--validate_onnx", required=False, action="store_true", help="Validate ONNX model")
parser.add_argument(
"-v",
"--validate_onnx",
required=False,
action="store_true",
help="Validate ONNX model",
)
parser.add_argument("-f",
"--fusion_csv",
required=False,
default=None,
help="CSV file for saving summary results of graph optimization.")
parser.add_argument(
"-f",
"--fusion_csv",
required=False,
default=None,
help="CSV file for saving summary results of graph optimization.",
)
parser.add_argument("-d", "--detail_csv", required=False, default=None, help="CSV file for saving detail results.")
parser.add_argument(
"-d",
"--detail_csv",
required=False,
default=None,
help="CSV file for saving detail results.",
)
parser.add_argument("-r", "--result_csv", required=False, default=None, help="CSV file for saving summary results.")
parser.add_argument(
"-r",
"--result_csv",
required=False,
default=None,
help="CSV file for saving summary results.",
)
parser.add_argument("-i",
"--input_counts",
required=False,
nargs="+",
default=[1],
type=int,
choices=[1, 2, 3],
help="Number of ONNX model inputs. Please use 1 for fair comparison with Torch or TorchScript.")
parser.add_argument(
"-i",
"--input_counts",
required=False,
nargs="+",
default=[1],
type=int,
choices=[1, 2, 3],
help="Number of ONNX model inputs. Please use 1 for fair comparison with Torch or TorchScript.",
)
parser.add_argument("-t",
"--test_times",
required=False,
default=100,
type=int,
help="Number of repeat times to get average inference latency.")
parser.add_argument(
"-t",
"--test_times",
required=False,
default=100,
type=int,
help="Number of repeat times to get average inference latency.",
)
parser.add_argument("-b", "--batch_sizes", nargs="+", type=int, default=[1])
parser.add_argument("-s", "--sequence_lengths", nargs="+", type=int, default=[4, 8, 16, 32, 64, 128, 256])
parser.add_argument(
"-s",
"--sequence_lengths",
nargs="+",
type=int,
default=[4, 8, 16, 32, 64, 128, 256],
)
parser.add_argument('--disable_ort_io_binding',
required=False,
action='store_true',
help='Disable running ONNX Runtime with binded inputs and outputs. ')
parser.add_argument(
"--disable_ort_io_binding",
required=False,
action="store_true",
help="Disable running ONNX Runtime with binded inputs and outputs. ",
)
parser.set_defaults(disable_ort_io_binding=False)
parser.add_argument("-n", "--num_threads", required=False, nargs="+", type=int, default=[0], help="Threads to use")
parser.add_argument(
"-n",
"--num_threads",
required=False,
nargs="+",
type=int,
default=[0],
help="Threads to use",
)
parser.add_argument("--force_num_layers",
required=False,
type=int,
default=None,
help="Manually set the model's layer number")
parser.add_argument(
"--force_num_layers",
required=False,
type=int,
default=None,
help="Manually set the model's layer number",
)
FusionOptions.add_arguments(parser)
@ -573,30 +788,80 @@ def main():
logger.warning("--input_counts is not implemented for torch or torchscript engine.")
if enable_torchscript:
results += run_pytorch(args.use_gpu, args.models, args.model_class, config_modifier, args.precision,
num_threads, args.batch_sizes, args.sequence_lengths, args.test_times, True,
args.cache_dir, args.verbose)
results += run_pytorch(
args.use_gpu,
args.models,
args.model_class,
config_modifier,
args.precision,
num_threads,
args.batch_sizes,
args.sequence_lengths,
args.test_times,
True,
args.cache_dir,
args.verbose,
)
if enable_torch:
results += run_pytorch(args.use_gpu, args.models, args.model_class, config_modifier, args.precision,
num_threads, args.batch_sizes, args.sequence_lengths, args.test_times, False,
args.cache_dir, args.verbose)
results += run_pytorch(
args.use_gpu,
args.models,
args.model_class,
config_modifier,
args.precision,
num_threads,
args.batch_sizes,
args.sequence_lengths,
args.test_times,
False,
args.cache_dir,
args.verbose,
)
if enable_tensorflow:
results += run_tensorflow(args.use_gpu, args.models, args.model_class, config_modifier, args.precision,
num_threads, args.batch_sizes, args.sequence_lengths, args.test_times,
args.cache_dir, args.verbose)
results += run_tensorflow(
args.use_gpu,
args.models,
args.model_class,
config_modifier,
args.precision,
num_threads,
args.batch_sizes,
args.sequence_lengths,
args.test_times,
args.cache_dir,
args.verbose,
)
model_fusion_statistics = {}
if enable_onnxruntime:
try:
use_raw_attention_mask = True
results += run_onnxruntime(args.use_gpu, args.provider, args.models, args.model_class, config_modifier,
args.precision, num_threads, args.batch_sizes, args.sequence_lengths,
args.test_times, args.input_counts, args.optimizer_info, args.validate_onnx,
args.cache_dir, args.onnx_dir, args.verbose, args.overwrite,
args.disable_ort_io_binding, use_raw_attention_mask, model_fusion_statistics,
args.model_source, args)
results += run_onnxruntime(
args.use_gpu,
args.provider,
args.models,
args.model_class,
config_modifier,
args.precision,
num_threads,
args.batch_sizes,
args.sequence_lengths,
args.test_times,
args.input_counts,
args.optimizer_info,
args.validate_onnx,
args.cache_dir,
args.onnx_dir,
args.verbose,
args.overwrite,
args.disable_ort_io_binding,
use_raw_attention_mask,
model_fusion_statistics,
args.model_source,
args,
)
except:
logger.error(f"Exception", exc_info=True)


@ -4,28 +4,29 @@
# license information.
# --------------------------------------------------------------------------
import argparse
import csv
import logging
import os
import sys
import csv
import numpy
import time
import timeit
from datetime import datetime
import argparse
import logging
import coloredlogs
import torch
import onnx
from enum import Enum
import coloredlogs
import numpy
import onnx
import torch
from packaging import version
logger = logging.getLogger(__name__)
class Precision(Enum):
FLOAT32 = 'fp32'
FLOAT16 = 'fp16'
INT8 = 'int8'
FLOAT32 = "fp32"
FLOAT16 = "fp16"
INT8 = "int8"
def __str__(self):
return self.value
@ -34,28 +35,28 @@ class Precision(Enum):
class OptimizerInfo(Enum):
# no_opt means using the raw ONNX model, but OnnxRuntime might still apply optimization as long as
# graph optimization level is not 0 (disable all).
NOOPT = 'no_opt'
BYORT = 'by_ort'
BYSCRIPT = 'by_script'
NOOPT = "no_opt"
BYORT = "by_ort"
BYSCRIPT = "by_script"
def __str__(self):
return self.value
class ConfigModifier():
class ConfigModifier:
def __init__(self, num_layers):
self.num_layers = num_layers
def modify(self, config):
if self.num_layers is None:
return
if hasattr(config, 'num_hidden_layers'):
if hasattr(config, "num_hidden_layers"):
config.num_hidden_layers = self.num_layers
logger.info(f"Modifying pytorch model's number of hidden layers to: {self.num_layers}")
if hasattr(config, 'encoder_layers'):
if hasattr(config, "encoder_layers"):
config.encoder_layers = self.num_layers
logger.info(f"Modifying pytorch model's number of encoder layers to: {self.num_layers}")
if hasattr(config, 'decoder_layers '):
if hasattr(config, "decoder_layers "):
config.decoder_layers = self.num_layers
logger.info(f"Modifying pytorch model's number of decoder layers to: {self.num_layers}")
@ -69,16 +70,20 @@ IO_BINDING_DATA_TYPE_MAP = {
}
def create_onnxruntime_session(onnx_model_path,
use_gpu,
provider=None,
enable_all_optimization=True,
num_threads=-1,
enable_profiling=False,
verbose=False):
def create_onnxruntime_session(
onnx_model_path,
use_gpu,
provider=None,
enable_all_optimization=True,
num_threads=-1,
enable_profiling=False,
verbose=False,
):
session = None
try:
from onnxruntime import SessionOptions, InferenceSession, GraphOptimizationLevel, __version__ as onnxruntime_version
from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions
from onnxruntime import __version__ as onnxruntime_version
sess_options = SessionOptions()
if enable_all_optimization:
@ -100,20 +105,28 @@ def create_onnxruntime_session(onnx_model_path,
logger.debug(f"Create session for onnx model: {onnx_model_path}")
if use_gpu:
if provider == 'dml':
execution_providers = ['DmlExecutionProvider', 'CPUExecutionProvider']
elif provider == 'rocm':
execution_providers = ['ROCMExecutionProvider', 'CPUExecutionProvider']
elif provider == 'migraphx':
execution_providers = ['MIGraphXExecutionProvider', 'ROCMExecutionProvider', 'CPUExecutionProvider']
elif provider == 'cuda':
execution_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
elif provider == 'tensorrt':
execution_providers = ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
if provider == "dml":
execution_providers = ["DmlExecutionProvider", "CPUExecutionProvider"]
elif provider == "rocm":
execution_providers = ["ROCMExecutionProvider", "CPUExecutionProvider"]
elif provider == "migraphx":
execution_providers = [
"MIGraphXExecutionProvider",
"ROCMExecutionProvider",
"CPUExecutionProvider",
]
elif provider == "cuda":
execution_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
elif provider == "tensorrt":
execution_providers = [
"TensorrtExecutionProvider",
"CUDAExecutionProvider",
"CPUExecutionProvider",
]
else:
execution_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
execution_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
else:
execution_providers = ['CPUExecutionProvider']
execution_providers = ["CPUExecutionProvider"]
session = InferenceSession(onnx_model_path, sess_options, providers=execution_providers)
except:
logger.error(f"Exception", exc_info=True)
@ -123,9 +136,12 @@ def create_onnxruntime_session(onnx_model_path,
def setup_logger(verbose=True):
if verbose:
coloredlogs.install(level='DEBUG', fmt='[%(filename)s:%(lineno)s - %(funcName)20s()] %(message)s')
coloredlogs.install(
level="DEBUG",
fmt="[%(filename)s:%(lineno)s - %(funcName)20s()] %(message)s",
)
else:
coloredlogs.install(fmt='%(message)s')
coloredlogs.install(fmt="%(message)s")
logging.getLogger("transformers").setLevel(logging.WARNING)
@ -137,25 +153,30 @@ def prepare_environment(cache_dir, output_dir, use_gpu, provider=None):
os.makedirs(output_dir)
import onnxruntime
if use_gpu:
if provider == 'dml':
assert 'DmlExecutionProvider' in onnxruntime.get_available_providers(
if provider == "dml":
assert (
"DmlExecutionProvider" in onnxruntime.get_available_providers()
), "Please install onnxruntime-directml package to test GPU inference."
else:
assert 'CUDAExecutionProvider' in onnxruntime.get_available_providers(
assert (
"CUDAExecutionProvider" in onnxruntime.get_available_providers()
), "Please install onnxruntime-gpu package to test GPU inference."
import transformers
logger.info(f'PyTorch Version:{torch.__version__}')
logger.info(f'Transformers Version:{transformers.__version__}')
logger.info(f'Onnxruntime Version:{onnxruntime.__version__}')
logger.info(f"PyTorch Version:{torch.__version__}")
logger.info(f"Transformers Version:{transformers.__version__}")
logger.info(f"Onnxruntime Version:{onnxruntime.__version__}")
# Support three major versions of PyTorch and OnnxRuntime, and up to 6 months of transformers.
from packaging import version
assert version.parse(torch.__version__) >= version.parse('1.5.0')
assert version.parse(transformers.__version__) >= version.parse('3.0.0')
assert version.parse(onnxruntime.__version__) >= version.parse('1.4.0')
assert version.parse(torch.__version__) >= version.parse("1.5.0")
assert version.parse(transformers.__version__) >= version.parse("3.0.0")
assert version.parse(onnxruntime.__version__) >= version.parse("1.4.0")
def get_latency_result(runtimes, batch_size):
@ -175,12 +196,29 @@ def get_latency_result(runtimes, batch_size):
def output_details(results, csv_filename):
with open(csv_filename, mode="a", newline='') as csv_file:
with open(csv_filename, mode="a", newline="") as csv_file:
column_names = [
"engine", "version", "providers", "device", "precision", "optimizer", "io_binding", "model_name", "inputs",
"threads", "batch_size", "sequence_length", "custom_layer_num", "datetime", "test_times", "QPS",
"average_latency_ms", "latency_variance", "latency_90_percentile", "latency_95_percentile",
"latency_99_percentile"
"engine",
"version",
"providers",
"device",
"precision",
"optimizer",
"io_binding",
"model_name",
"inputs",
"threads",
"batch_size",
"sequence_length",
"custom_layer_num",
"datetime",
"test_times",
"QPS",
"average_latency_ms",
"latency_variance",
"latency_90_percentile",
"latency_95_percentile",
"latency_99_percentile",
]
csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
@ -192,10 +230,19 @@ def output_details(results, csv_filename):
def output_summary(results, csv_filename, args):
with open(csv_filename, mode="a", newline='') as csv_file:
with open(csv_filename, mode="a", newline="") as csv_file:
header_names = [
"model_name", "inputs", "custom_layer_num", "engine", "version", "providers", "device", "precision",
"optimizer", "io_binding", "threads"
"model_name",
"inputs",
"custom_layer_num",
"engine",
"version",
"providers",
"device",
"precision",
"optimizer",
"io_binding",
"threads",
]
data_names = []
for batch_size in args.batch_sizes:
@ -211,9 +258,13 @@ def output_summary(results, csv_filename, args):
for threads in args.num_threads:
row = {}
for result in results:
if result["model_name"] == model_name and result["inputs"] == input_count and result[
"engine"] == engine_name and result["io_binding"] == io_binding and result[
"threads"] == threads:
if (
result["model_name"] == model_name
and result["inputs"] == input_count
and result["engine"] == engine_name
and result["io_binding"] == io_binding
and result["threads"] == threads
):
headers = {k: v for k, v in result.items() if k in header_names}
if not row:
row.update(headers)
@ -232,9 +283,11 @@ def output_summary(results, csv_filename, args):
def output_fusion_statistics(model_fusion_statistics, csv_filename):
from transformers import __version__ as transformers_version
with open(csv_filename, mode="a", newline='') as csv_file:
with open(csv_filename, mode="a", newline="") as csv_file:
column_names = ["model_filename", "datetime", "transformers", "torch"] + list(
next(iter(model_fusion_statistics.values())).keys())
next(iter(model_fusion_statistics.values())).keys()
)
csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
csv_writer.writeheader()
for key in model_fusion_statistics.keys():
@ -256,18 +309,20 @@ def inference_ort(ort_session, ort_inputs, result_template, repeat_times, batch_
return result
def inference_ort_with_io_binding(ort_session,
ort_inputs,
result_template,
repeat_times,
ort_output_names,
ort_outputs,
output_buffers,
output_buffer_max_sizes,
batch_size,
device,
data_type=numpy.longlong,
warm_up_repeat=0):
def inference_ort_with_io_binding(
ort_session,
ort_inputs,
result_template,
repeat_times,
ort_output_names,
ort_outputs,
output_buffers,
output_buffer_max_sizes,
batch_size,
device,
data_type=numpy.longlong,
warm_up_repeat=0,
):
result = {}
# Bind inputs and outputs to onnxruntime session
@ -275,18 +330,42 @@ def inference_ort_with_io_binding(ort_session,
# Bind inputs to device
for name in ort_inputs.keys():
np_input = torch.from_numpy(ort_inputs[name]).to(device)
input_type = IO_BINDING_DATA_TYPE_MAP[str(ort_inputs[name].dtype)] if str(
ort_inputs[name].dtype) in IO_BINDING_DATA_TYPE_MAP else data_type
io_binding.bind_input(name, np_input.device.type, 0, input_type, np_input.shape, np_input.data_ptr())
input_type = (
IO_BINDING_DATA_TYPE_MAP[str(ort_inputs[name].dtype)]
if str(ort_inputs[name].dtype) in IO_BINDING_DATA_TYPE_MAP
else data_type
)
io_binding.bind_input(
name,
np_input.device.type,
0,
input_type,
np_input.shape,
np_input.data_ptr(),
)
# Bind outputs buffers with the sizes needed if not allocated already
if len(output_buffers) == 0:
allocateOutputBuffers(output_buffers, output_buffer_max_sizes, device)
for i in range(len(ort_output_names)):
io_binding.bind_output(ort_output_names[i], output_buffers[i].device.type, 0, numpy.float32,
ort_outputs[i].shape, output_buffers[i].data_ptr())
timeit.repeat(lambda: ort_session.run_with_iobinding(io_binding), number=1, repeat=warm_up_repeat) # Dry run
runtimes = timeit.repeat(lambda: ort_session.run_with_iobinding(io_binding), number=1, repeat=repeat_times)
io_binding.bind_output(
ort_output_names[i],
output_buffers[i].device.type,
0,
numpy.float32,
ort_outputs[i].shape,
output_buffers[i].data_ptr(),
)
timeit.repeat(
lambda: ort_session.run_with_iobinding(io_binding),
number=1,
repeat=warm_up_repeat,
) # Dry run
runtimes = timeit.repeat(
lambda: ort_session.run_with_iobinding(io_binding),
number=1,
repeat=repeat_times,
)
result.update(result_template)
result.update({"io_binding": True})
result.update(get_latency_result(runtimes, batch_size))
@ -304,21 +383,23 @@ def allocateOutputBuffers(output_buffers, output_buffer_max_sizes, device):
def set_random_seed(seed=123):
"""Set random seed manully to get deterministic results"""
import random
random.seed(seed)
numpy.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
#torch.backends.cudnn.enabled = False
#torch.backends.cudnn.benchmark = False
#torch.backends.cudnn.deterministic = True
# torch.backends.cudnn.enabled = False
# torch.backends.cudnn.benchmark = False
# torch.backends.cudnn.deterministic = True
def measure_memory(is_gpu, func):
import os
import psutil
from time import sleep
import psutil
class MemoryMonitor:
def __init__(self, keep_measuring=True):
self.keep_measuring = keep_measuring
@ -333,8 +414,16 @@ def measure_memory(is_gpu, func):
return max_usage
def measure_gpu_usage(self):
from py3nvml.py3nvml import nvmlInit, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex, \
nvmlDeviceGetMemoryInfo, nvmlDeviceGetName, nvmlShutdown, NVMLError
from py3nvml.py3nvml import (
NVMLError,
nvmlDeviceGetCount,
nvmlDeviceGetHandleByIndex,
nvmlDeviceGetMemoryInfo,
nvmlDeviceGetName,
nvmlInit,
nvmlShutdown,
)
max_gpu_usage = []
gpu_name = []
try:
@ -350,11 +439,14 @@ def measure_memory(is_gpu, func):
if not self.keep_measuring:
break
nvmlShutdown()
return [{
"device_id": i,
"name": gpu_name[i],
"max_used_MB": max_gpu_usage[i]
} for i in range(deviceCount)]
return [
{
"device_id": i,
"name": gpu_name[i],
"max_used_MB": max_gpu_usage[i],
}
for i in range(deviceCount)
]
except NVMLError as error:
if not self.silent:
self.logger.error("Error fetching GPU information using nvml: %s", error)
@ -365,6 +457,7 @@ def measure_memory(is_gpu, func):
memory_before_test = monitor.measure_gpu_usage() if is_gpu else monitor.measure_cpu_usage()
from concurrent.futures import ThreadPoolExecutor
with ThreadPoolExecutor() as executor:
monitor = MemoryMonitor()
mem_thread = executor.submit(monitor.measure_gpu_usage if is_gpu else monitor.measure_cpu_usage)
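# Usage sketch (illustrative, not part of this change): measure_memory samples memory on a background
# thread via psutil (CPU) or py3nvml (GPU) while the given callable runs, e.g.
#     peak = measure_memory(is_gpu=True, func=lambda: session.run(None, ort_inputs))
# where `session` and `ort_inputs` are assumed to exist in the caller's scope.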

View file

@ -1,7 +1,7 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
# --------------------------------------------------------------------------
# This tool measures the inference performance of the onnxruntime or onnxruntime-gpu python package on a BERT model.
@ -12,22 +12,22 @@
# Example command to run test on batch_size 1 and 2 for a model on GPU:
# python bert_perf_test.py --model bert.onnx --batch_size 1 2 --sequence_length 128 --use_gpu --samples 1000 --test_times 1
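# Another illustrative command (model path and values are placeholders; all flags are defined in parse_arguments below),
# combining GPU execution with IO binding and a fixed intra-op thread count:
# python bert_perf_test.py --model bert.onnx --batch_size 1 --sequence_length 128 --samples 100 --use_gpu --use_io_binding --provider cuda --intra_op_num_threads 8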
import sys
import argparse
import os
from pathlib import Path
import timeit
import statistics
import psutil
import csv
import numpy as np
import torch
import random
from datetime import datetime
import multiprocessing
from bert_test_data import get_bert_inputs, generate_test_data
import os
import random
import statistics
import sys
import timeit
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
import numpy as np
import psutil
import torch
from bert_test_data import generate_test_data, get_bert_inputs
@dataclass
@ -56,7 +56,7 @@ class ModelSetting:
def create_session(model_path, use_gpu, provider, intra_op_num_threads, graph_optimization_level=None):
import onnxruntime
if use_gpu and ('CUDAExecutionProvider' not in onnxruntime.get_available_providers()):
if use_gpu and ("CUDAExecutionProvider" not in onnxruntime.get_available_providers()):
print(
"Warning: Please install onnxruntime-gpu package instead of onnxruntime, and use a machine with GPU for testing gpu performance."
)
@ -65,20 +65,28 @@ def create_session(model_path, use_gpu, provider, intra_op_num_threads, graph_op
session = onnxruntime.InferenceSession(model_path)
else:
if use_gpu:
if provider == 'dml':
execution_providers = ['DmlExecutionProvider', 'CPUExecutionProvider']
elif provider == 'rocm':
execution_providers = ['ROCMExecutionProvider', 'CPUExecutionProvider']
elif provider == 'migraphx':
execution_providers = ['MIGraphXExecutionProvider', 'ROCMExecutionProvider', 'CPUExecutionProvider']
elif provider == 'cuda':
execution_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
elif provider == 'tensorrt':
execution_providers = ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
if provider == "dml":
execution_providers = ["DmlExecutionProvider", "CPUExecutionProvider"]
elif provider == "rocm":
execution_providers = ["ROCMExecutionProvider", "CPUExecutionProvider"]
elif provider == "migraphx":
execution_providers = [
"MIGraphXExecutionProvider",
"ROCMExecutionProvider",
"CPUExecutionProvider",
]
elif provider == "cuda":
execution_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
elif provider == "tensorrt":
execution_providers = [
"TensorrtExecutionProvider",
"CUDAExecutionProvider",
"CPUExecutionProvider",
]
else:
execution_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
execution_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
else:
execution_providers = ['CPUExecutionProvider']
execution_providers = ["CPUExecutionProvider"]
sess_options = onnxruntime.SessionOptions()
sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
@ -102,55 +110,69 @@ def create_session(model_path, use_gpu, provider, intra_op_num_threads, graph_op
session = onnxruntime.InferenceSession(model_path, sess_options, providers=execution_providers)
if use_gpu:
if provider == 'dml':
assert 'DmlExecutionProvider' in session.get_providers()
elif provider == 'rocm':
assert 'ROCMExecutionProvider' in session.get_providers()
elif provider == 'migraphx':
assert 'MIGraphXExecutionProvider' in session.get_providers()
assert 'ROCMExecutionProvider' in session.get_providers()
elif provider == 'cuda':
assert 'CUDAExecutionProvider' in session.get_providers()
elif provider == 'tensorrt':
assert 'TensorrtExecutionProvider' in session.get_providers()
assert 'CUDAExecutionProvider' in session.get_providers()
if provider == "dml":
assert "DmlExecutionProvider" in session.get_providers()
elif provider == "rocm":
assert "ROCMExecutionProvider" in session.get_providers()
elif provider == "migraphx":
assert "MIGraphXExecutionProvider" in session.get_providers()
assert "ROCMExecutionProvider" in session.get_providers()
elif provider == "cuda":
assert "CUDAExecutionProvider" in session.get_providers()
elif provider == "tensorrt":
assert "TensorrtExecutionProvider" in session.get_providers()
assert "CUDAExecutionProvider" in session.get_providers()
else:
assert 'CUDAExecutionProvider' in session.get_providers()
assert "CUDAExecutionProvider" in session.get_providers()
else:
assert 'CPUExecutionProvider' in session.get_providers()
assert "CPUExecutionProvider" in session.get_providers()
return session
def numpy_type(torch_type):
type_map = {torch.float32: np.float32,
torch.float16: np.float16,
torch.int32: np.int32,
torch.int64: np.longlong}
type_map = {
torch.float32: np.float32,
torch.float16: np.float16,
torch.int32: np.int32,
torch.int64: np.longlong,
}
return type_map[torch_type]
def create_input_output_tensors(inputs, outputs, device):
input_tensors = {name: torch.from_numpy(array).to(device)
for name, array in inputs.items()}
output_tensors = {name: torch.from_numpy(array).to(device)
for name, array in outputs.items()}
input_tensors = {name: torch.from_numpy(array).to(device) for name, array in inputs.items()}
output_tensors = {name: torch.from_numpy(array).to(device) for name, array in outputs.items()}
return input_tensors, output_tensors
def create_io_binding(sess, input_tensors, output_tensors):
io_binding = sess.io_binding()
for name, tensor in input_tensors.items():
io_binding.bind_input(name, tensor.device.type, 0,
numpy_type(tensor.dtype), tensor.shape,
tensor.data_ptr())
io_binding.bind_input(
name,
tensor.device.type,
0,
numpy_type(tensor.dtype),
tensor.shape,
tensor.data_ptr(),
)
for name, tensor in output_tensors.items():
io_binding.bind_output(name, tensor.device.type, 0,
numpy_type(tensor.dtype), tensor.shape,
tensor.data_ptr())
io_binding.bind_output(
name,
tensor.device.type,
0,
numpy_type(tensor.dtype),
tensor.shape,
tensor.data_ptr(),
)
return io_binding
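# A minimal sketch (illustrative, not part of this change) of how the two helpers above are combined;
# the wrapper name and its arguments are assumptions.
def run_with_binding(session, inputs, outputs, device="cuda"):
    """Bind numpy inputs/outputs to torch tensors on `device` and run the session through IO binding."""
    input_tensors, output_tensors = create_input_output_tensors(inputs, outputs, device)
    io_binding = create_io_binding(session, input_tensors, output_tensors)
    # Results are written into the pre-bound output tensors.
    session.run_with_iobinding(io_binding)
    return output_tensors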
def onnxruntime_inference_with_io_binding(session, all_inputs, output_names, test_setting):
results = []
latency_list = []
device = 'cuda' if test_setting.use_gpu else 'cpu'
device = "cuda" if test_setting.use_gpu else "cpu"
for test_case_id, inputs in enumerate(all_inputs):
result = session.run(output_names, inputs)
results.append(result)
@ -171,6 +193,7 @@ def onnxruntime_inference_with_io_binding(session, all_inputs, output_names, tes
return results, latency_list
def onnxruntime_inference(session, all_inputs, output_names):
if len(all_inputs) > 0:
# Use a random input as warm up.
@ -186,19 +209,25 @@ def onnxruntime_inference(session, all_inputs, output_names):
latency_list.append(latency)
return results, latency_list
def to_string(model_path, session, test_setting):
sess_options = session.get_session_options()
option = "model={},".format(os.path.basename(model_path))
option += "graph_optimization_level={},intra_op_num_threads={},".format(sess_options.graph_optimization_level,
sess_options.intra_op_num_threads).replace(
'GraphOptimizationLevel.ORT_', '')
option += "graph_optimization_level={},intra_op_num_threads={},".format(
sess_options.graph_optimization_level, sess_options.intra_op_num_threads
).replace("GraphOptimizationLevel.ORT_", "")
option += f"batch_size={test_setting.batch_size},sequence_length={test_setting.sequence_length},test_cases={test_setting.test_cases},test_times={test_setting.test_times},use_gpu={test_setting.use_gpu}"
return option
def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads):
session = create_session(model_setting.model_path, test_setting.use_gpu, test_setting.provider, intra_op_num_threads,
model_setting.opt_level)
session = create_session(
model_setting.model_path,
test_setting.use_gpu,
test_setting.provider,
intra_op_num_threads,
model_setting.opt_level,
)
output_names = [output.name for output in session.get_outputs()]
key = to_string(model_setting.model_path, session, test_setting)
@ -211,7 +240,9 @@ def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op
all_latency_list = []
if test_setting.use_io_binding:
for i in range(test_setting.test_times):
results, latency_list = onnxruntime_inference_with_io_binding(session, all_inputs, output_names, test_setting)
results, latency_list = onnxruntime_inference_with_io_binding(
session, all_inputs, output_names, test_setting
)
all_latency_list.extend(latency_list)
else:
for i in range(test_setting.test_times):
@ -229,23 +260,45 @@ def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op
latency_99 = np.percentile(latency_ms, 99)
throughput = test_setting.batch_size * (1000.0 / average_latency)
perf_results[key] = (average_latency, latency_50, latency_75, latency_90, latency_95, latency_99, throughput)
perf_results[key] = (
average_latency,
latency_50,
latency_75,
latency_90,
latency_95,
latency_99,
throughput,
)
print("Average latency = {} ms, Throughput = {} QPS".format(format(average_latency, '.2f'),
format(throughput, '.2f')))
print(
"Average latency = {} ms, Throughput = {} QPS".format(format(average_latency, ".2f"), format(throughput, ".2f"))
)
def launch_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads):
process = multiprocessing.Process(target=run_one_test,
args=(model_setting, test_setting, perf_results, all_inputs,
intra_op_num_threads))
process = multiprocessing.Process(
target=run_one_test,
args=(
model_setting,
test_setting,
perf_results,
all_inputs,
intra_op_num_threads,
),
)
process.start()
process.join()
def run_perf_tests(model_setting, test_setting, perf_results, all_inputs):
if (test_setting.intra_op_num_threads is not None):
launch_test(model_setting, test_setting, perf_results, all_inputs, test_setting.intra_op_num_threads)
if test_setting.intra_op_num_threads is not None:
launch_test(
model_setting,
test_setting,
perf_results,
all_inputs,
test_setting.intra_op_num_threads,
)
return
cpu_count = psutil.cpu_count(logical=False)
@ -262,91 +315,139 @@ def run_perf_tests(model_setting, test_setting, perf_results, all_inputs):
def run_performance(model_setting, test_setting, perf_results):
input_ids, segment_ids, input_mask = get_bert_inputs(model_setting.model_path, model_setting.input_ids_name,
model_setting.segment_ids_name, model_setting.input_mask_name)
input_ids, segment_ids, input_mask = get_bert_inputs(
model_setting.model_path,
model_setting.input_ids_name,
model_setting.segment_ids_name,
model_setting.input_mask_name,
)
# Do not generate random mask for performance test.
print(
f"Generating {test_setting.test_cases} samples for batch_size={test_setting.batch_size} sequence_length={test_setting.sequence_length}"
)
all_inputs = generate_test_data(test_setting.batch_size,
test_setting.sequence_length,
test_setting.test_cases,
test_setting.seed,
test_setting.verbose,
input_ids,
segment_ids,
input_mask,
random_mask_length=False)
all_inputs = generate_test_data(
test_setting.batch_size,
test_setting.sequence_length,
test_setting.test_cases,
test_setting.seed,
test_setting.verbose,
input_ids,
segment_ids,
input_mask,
random_mask_length=False,
)
run_perf_tests(model_setting, test_setting, perf_results, all_inputs)
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('--model', required=True, type=str, help="bert onnx model path")
parser.add_argument('-b',
'--batch_size',
required=True,
type=int,
nargs="+",
help="batch size of input. Allow one or multiple values in the range of [1, 128].")
parser.add_argument('-s', '--sequence_length', required=True, type=int, help="maximum sequence length of input")
parser.add_argument('--samples', required=False, type=int, default=10, help="number of samples to be generated")
parser.add_argument('-t',
'--test_times',
required=False,
type=int,
default=0,
help="number of times to run per sample. By default, the value is 1000 / samples")
parser.add_argument("--model", required=True, type=str, help="bert onnx model path")
parser.add_argument(
'--opt_level',
"-b",
"--batch_size",
required=True,
type=int,
nargs="+",
help="batch size of input. Allow one or multiple values in the range of [1, 128].",
)
parser.add_argument(
"-s",
"--sequence_length",
required=True,
type=int,
help="maximum sequence length of input",
)
parser.add_argument(
"--samples",
required=False,
type=int,
default=10,
help="number of samples to be generated",
)
parser.add_argument(
"-t",
"--test_times",
required=False,
type=int,
default=0,
help="number of times to run per sample. By default, the value is 1000 / samples",
)
parser.add_argument(
"--opt_level",
required=False,
type=int,
choices=[0, 1, 2, 99],
default=99,
help="onnxruntime optimization level: 0 - disable all, 1 - basic, 2 - extended, 99 - enable all.")
help="onnxruntime optimization level: 0 - disable all, 1 - basic, 2 - extended, 99 - enable all.",
)
parser.add_argument('--seed',
required=False,
type=int,
default=3,
help="random seed. Use the same seed to make sure test data is same in multiple tests.")
parser.add_argument(
"--seed",
required=False,
type=int,
default=3,
help="random seed. Use the same seed to make sure test data is same in multiple tests.",
)
parser.add_argument('--verbose', required=False, action='store_true', help="print verbose information")
parser.add_argument(
"--verbose",
required=False,
action="store_true",
help="print verbose information",
)
parser.set_defaults(verbose=False)
parser.add_argument('--use_gpu', required=False, action='store_true', help="use GPU")
parser.add_argument("--use_gpu", required=False, action="store_true", help="use GPU")
parser.set_defaults(use_gpu=False)
parser.add_argument('--use_io_binding', required=False, action='store_true', help="use io_binding")
parser.add_argument("--use_io_binding", required=False, action="store_true", help="use io_binding")
parser.set_defaults(use_io_binding=False)
parser.add_argument("--provider",
required=False,
type=str,
default=None,
help="Execution provider to use")
parser.add_argument(
"--provider",
required=False,
type=str,
default=None,
help="Execution provider to use",
)
parser.add_argument('-n',
'--intra_op_num_threads',
required=False,
type=int,
default=None,
help=">=0, set intra_op_num_threads")
parser.add_argument(
"-n",
"--intra_op_num_threads",
required=False,
type=int,
default=None,
help=">=0, set intra_op_num_threads",
)
parser.add_argument('--input_ids_name', required=False, type=str, default=None, help="input name for input ids")
parser.add_argument('--segment_ids_name', required=False, type=str, default=None, help="input name for segment ids")
parser.add_argument('--input_mask_name',
required=False,
type=str,
default=None,
help="input name for attention mask")
parser.add_argument(
"--input_ids_name",
required=False,
type=str,
default=None,
help="input name for input ids",
)
parser.add_argument(
"--segment_ids_name",
required=False,
type=str,
default=None,
help="input name for segment ids",
)
parser.add_argument(
"--input_mask_name",
required=False,
type=str,
default=None,
help="input name for attention mask",
)
args = parser.parse_args()
return args
@ -365,12 +466,27 @@ def main():
if not (min(batch_size_set) >= 1 and max(batch_size_set) <= 128):
raise Exception("batch_size not in range [1, 128]")
model_setting = ModelSetting(args.model, args.input_ids_name, args.segment_ids_name, args.input_mask_name,
args.opt_level)
model_setting = ModelSetting(
args.model,
args.input_ids_name,
args.segment_ids_name,
args.input_mask_name,
args.opt_level,
)
for batch_size in batch_size_set:
test_setting = TestSetting(batch_size, args.sequence_length, args.samples, args.test_times, args.use_gpu, args.use_io_binding,
args.provider, args.intra_op_num_threads, args.seed, args.verbose)
test_setting = TestSetting(
batch_size,
args.sequence_length,
args.samples,
args.test_times,
args.use_gpu,
args.use_io_binding,
args.provider,
args.intra_op_num_threads,
args.seed,
args.verbose,
)
print("test setting", test_setting)
run_performance(model_setting, test_setting, perf_results)
@ -380,25 +496,33 @@ def main():
summary_file = os.path.join(
Path(args.model).parent,
"perf_results_{}_B{}_S{}_{}.txt".format('GPU' if args.use_gpu else 'CPU',
"-".join([str(x) for x in sorted(list(batch_size_set))]),
args.sequence_length,
datetime.now().strftime("%Y%m%d-%H%M%S")))
with open(summary_file, 'w+', newline='') as tsv_file:
tsv_writer = csv.writer(tsv_file, delimiter='\t', lineterminator='\n')
"perf_results_{}_B{}_S{}_{}.txt".format(
"GPU" if args.use_gpu else "CPU",
"-".join([str(x) for x in sorted(list(batch_size_set))]),
args.sequence_length,
datetime.now().strftime("%Y%m%d-%H%M%S"),
),
)
with open(summary_file, "w+", newline="") as tsv_file:
tsv_writer = csv.writer(tsv_file, delimiter="\t", lineterminator="\n")
headers = None
for (key, perf_result) in sorted_results:
params = key.split(',')
params = key.split(",")
if headers is None:
headers = [
"Latency(ms)", "Latency_P50", "Latency_P75", "Latency_P90", "Latency_P95", "Latency_P99",
"Throughput(QPS)"
"Latency(ms)",
"Latency_P50",
"Latency_P75",
"Latency_P90",
"Latency_P95",
"Latency_P99",
"Throughput(QPS)",
]
headers.extend([x.split('=')[0] for x in params])
headers.extend([x.split("=")[0] for x in params])
tsv_writer.writerow(headers)
values = [format(x, '.2f') for x in perf_result]
values.extend([x.split('=')[1] for x in params])
values = [format(x, ".2f") for x in perf_result]
values.extend([x.split("=")[1] for x in params])
tsv_writer.writerow(values)
print("Test summary is saved to", summary_file)

View file

@ -1,24 +1,26 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
# --------------------------------------------------------------------------
# It is a tool to generate test data for a bert model.
# The test data can be used by onnxruntime_perf_test tool to evaluate the inference latency.
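# Illustrative command (model path and values are placeholders; the flags are defined in parse_arguments below):
# python bert_test_data.py --model bert.onnx --output_dir ./test_data --batch_size 1 --sequence_length 128 --samples 10 --seed 3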
import sys
import argparse
import numpy as np
import os
import random
import sys
from pathlib import Path
from typing import List, Dict, Tuple, Union
from typing import Dict, List, Tuple, Union
import numpy as np
from onnx import ModelProto, TensorProto, numpy_helper
from onnx_model import OnnxModel
def fake_input_ids_data(input_ids: TensorProto, batch_size: int, sequence_length: int,
dictionary_size: int) -> np.ndarray:
def fake_input_ids_data(
input_ids: TensorProto, batch_size: int, sequence_length: int, dictionary_size: int
) -> np.ndarray:
"""Create input tensor based on the graph input of input_ids
Args:
@ -30,7 +32,11 @@ def fake_input_ids_data(input_ids: TensorProto, batch_size: int, sequence_length
Returns:
np.ndarray: the input tensor created
"""
assert input_ids.type.tensor_type.elem_type in [TensorProto.FLOAT, TensorProto.INT32, TensorProto.INT64]
assert input_ids.type.tensor_type.elem_type in [
TensorProto.FLOAT,
TensorProto.INT32,
TensorProto.INT64,
]
data = np.random.randint(dictionary_size, size=(batch_size, sequence_length), dtype=np.int32)
@ -43,7 +49,7 @@ def fake_input_ids_data(input_ids: TensorProto, batch_size: int, sequence_length
def fake_segment_ids_data(segment_ids: TensorProto, batch_size: int, sequence_length: int) -> np.ndarray:
"""Create input tensor based on the graph input of segment_ids
"""Create input tensor based on the graph input of segment_ids
Args:
segment_ids (TensorProto): graph input of the token_type_ids input tensor
@ -53,7 +59,11 @@ def fake_segment_ids_data(segment_ids: TensorProto, batch_size: int, sequence_le
Returns:
np.ndarray: the input tensor created
"""
assert segment_ids.type.tensor_type.elem_type in [TensorProto.FLOAT, TensorProto.INT32, TensorProto.INT64]
assert segment_ids.type.tensor_type.elem_type in [
TensorProto.FLOAT,
TensorProto.INT32,
TensorProto.INT64,
]
data = np.zeros((batch_size, sequence_length), dtype=np.int32)
@ -65,8 +75,12 @@ def fake_segment_ids_data(segment_ids: TensorProto, batch_size: int, sequence_le
return data
def fake_input_mask_data(input_mask: TensorProto, batch_size: int, sequence_length: int,
random_mask_length: bool) -> np.ndarray:
def fake_input_mask_data(
input_mask: TensorProto,
batch_size: int,
sequence_length: int,
random_mask_length: bool,
) -> np.ndarray:
"""Create input tensor based on the graph input of segment_ids.
Args:
@ -79,13 +93,17 @@ def fake_input_mask_data(input_mask: TensorProto, batch_size: int, sequence_leng
np.ndarray: the input tensor created
"""
assert input_mask.type.tensor_type.elem_type in [TensorProto.FLOAT, TensorProto.INT32, TensorProto.INT64]
assert input_mask.type.tensor_type.elem_type in [
TensorProto.FLOAT,
TensorProto.INT32,
TensorProto.INT64,
]
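# For illustration: with sequence_length=128 and random_mask_length=True, actual_seq_len is drawn from
# random.randint(int(128 * 2 / 3), 128) = random.randint(85, 128), so between 85 and 128 positions of each
# row are set to one; with random_mask_length=False the whole (batch_size, sequence_length) mask is ones.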
if random_mask_length:
actual_seq_len = random.randint(int(sequence_length * 2 / 3), sequence_length)
data = np.zeros((batch_size, sequence_length), dtype=np.int32)
temp = np.ones((batch_size, actual_seq_len), dtype=np.int32)
data[:temp.shape[0], :temp.shape[1]] = temp
data[: temp.shape[0], : temp.shape[1]] = temp
else:
data = np.ones((batch_size, sequence_length), dtype=np.int32)
@ -117,14 +135,23 @@ def output_test_data(dir: str, inputs: np.ndarray):
index = 0
for name, data in inputs.items():
tensor = numpy_helper.from_array(data, name)
with open(os.path.join(dir, 'input_{}.pb'.format(index)), 'wb') as f:
with open(os.path.join(dir, "input_{}.pb".format(index)), "wb") as f:
f.write(tensor.SerializeToString())
index += 1
def fake_test_data(batch_size: int, sequence_length: int, test_cases: int, dictionary_size: int, verbose: bool,
random_seed: int, input_ids: TensorProto, segment_ids: TensorProto, input_mask: TensorProto,
random_mask_length: bool):
def fake_test_data(
batch_size: int,
sequence_length: int,
test_cases: int,
dictionary_size: int,
verbose: bool,
random_seed: int,
input_ids: TensorProto,
segment_ids: TensorProto,
input_mask: TensorProto,
random_mask_length: bool,
):
"""Create given number of input data for testing
Args:
@ -164,9 +191,17 @@ def fake_test_data(batch_size: int, sequence_length: int, test_cases: int, dicti
return all_inputs
def generate_test_data(batch_size: int, sequence_length: int, test_cases: int, seed: int, verbose: bool,
input_ids: TensorProto, segment_ids: TensorProto, input_mask: TensorProto,
random_mask_length: bool):
def generate_test_data(
batch_size: int,
sequence_length: int,
test_cases: int,
seed: int,
verbose: bool,
input_ids: TensorProto,
segment_ids: TensorProto,
input_mask: TensorProto,
random_mask_length: bool,
):
"""Create given number of minput data for testing
Args:
@ -184,8 +219,18 @@ def generate_test_data(batch_size: int, sequence_length: int, test_cases: int, s
List[Dict[str,numpy.ndarray]]: list of test cases, where each test case is a dictionary with input name as key and a tensor as value
"""
dictionary_size = 10000
all_inputs = fake_test_data(batch_size, sequence_length, test_cases, dictionary_size, verbose, seed, input_ids,
segment_ids, input_mask, random_mask_length)
all_inputs = fake_test_data(
batch_size,
sequence_length,
test_cases,
dictionary_size,
verbose,
seed,
input_ids,
segment_ids,
input_mask,
random_mask_length,
)
if len(all_inputs) != test_cases:
print("Failed to create test data for test.")
return all_inputs
@ -199,16 +244,17 @@ def get_graph_input_from_embed_node(onnx_model, embed_node, input_index):
graph_input = onnx_model.find_graph_input(input)
if graph_input is None:
parent_node = onnx_model.get_parent(embed_node, input_index)
if parent_node is not None and parent_node.op_type == 'Cast':
if parent_node is not None and parent_node.op_type == "Cast":
graph_input = onnx_model.find_graph_input(parent_node.input[0])
return graph_input
def find_bert_inputs(onnx_model: OnnxModel,
input_ids_name: str = None,
segment_ids_name: str = None,
input_mask_name: str = None
) -> Tuple[Union[None, np.ndarray], Union[None, np.ndarray], Union[None, np.ndarray]]:
def find_bert_inputs(
onnx_model: OnnxModel,
input_ids_name: str = None,
segment_ids_name: str = None,
input_mask_name: str = None,
) -> Tuple[Union[None, np.ndarray], Union[None, np.ndarray], Union[None, np.ndarray]]:
"""Find graph inputs for BERT model.
First, we will deduce inputs from EmbedLayerNormalization node. If not found, we will guess the meaning of graph inputs based on naming.
@ -254,7 +300,7 @@ def find_bert_inputs(onnx_model: OnnxModel,
if len(graph_inputs) != 3:
raise ValueError("Expect the graph to have 3 inputs. Got {}".format(len(graph_inputs)))
embed_nodes = onnx_model.get_nodes_by_op_type('EmbedLayerNormalization')
embed_nodes = onnx_model.get_nodes_by_op_type("EmbedLayerNormalization")
if len(embed_nodes) == 1:
embed_node = embed_nodes[0]
input_ids = get_graph_input_from_embed_node(onnx_model, embed_node, 0)
@ -279,7 +325,9 @@ def find_bert_inputs(onnx_model: OnnxModel,
input_name_lower = input.name.lower()
if "mask" in input_name_lower: # matches input with name like "attention_mask" or "input_mask"
input_mask = input
elif "token" in input_name_lower or "segment" in input_name_lower: # matches input with name like "segment_ids" or "token_type_ids"
elif (
"token" in input_name_lower or "segment" in input_name_lower
): # matches input with name like "segment_ids" or "token_type_ids"
segment_ids = input
else:
input_ids = input
@ -290,10 +338,12 @@ def find_bert_inputs(onnx_model: OnnxModel,
raise ValueError("Fail to assign 3 inputs. You might try rename the graph inputs.")
def get_bert_inputs(onnx_file: str,
input_ids_name: str = None,
segment_ids_name: str = None,
input_mask_name: str = None):
def get_bert_inputs(
onnx_file: str,
input_ids_name: str = None,
segment_ids_name: str = None,
input_mask_name: str = None,
):
"""Find graph inputs for BERT model.
First, we will deduce inputs from EmbedLayerNormalization node. If not found, we will guess the meaning of graph inputs based on naming.
@ -317,54 +367,95 @@ def get_bert_inputs(onnx_file: str,
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('--model', required=True, type=str, help="bert onnx model path.")
parser.add_argument("--model", required=True, type=str, help="bert onnx model path.")
parser.add_argument('--output_dir',
required=False,
type=str,
default=None,
help="output test data path. Default is current directory.")
parser.add_argument(
"--output_dir",
required=False,
type=str,
default=None,
help="output test data path. Default is current directory.",
)
parser.add_argument('--batch_size', required=False, type=int, default=1, help="batch size of input")
parser.add_argument("--batch_size", required=False, type=int, default=1, help="batch size of input")
parser.add_argument('--sequence_length',
required=False,
type=int,
default=128,
help="maximum sequence length of input")
parser.add_argument(
"--sequence_length",
required=False,
type=int,
default=128,
help="maximum sequence length of input",
)
parser.add_argument('--input_ids_name', required=False, type=str, default=None, help="input name for input ids")
parser.add_argument('--segment_ids_name', required=False, type=str, default=None, help="input name for segment ids")
parser.add_argument('--input_mask_name',
required=False,
type=str,
default=None,
help="input name for attention mask")
parser.add_argument(
"--input_ids_name",
required=False,
type=str,
default=None,
help="input name for input ids",
)
parser.add_argument(
"--segment_ids_name",
required=False,
type=str,
default=None,
help="input name for segment ids",
)
parser.add_argument(
"--input_mask_name",
required=False,
type=str,
default=None,
help="input name for attention mask",
)
parser.add_argument('--samples', required=False, type=int, default=1, help="number of test cases to be generated")
parser.add_argument(
"--samples",
required=False,
type=int,
default=1,
help="number of test cases to be generated",
)
parser.add_argument('--seed', required=False, type=int, default=3, help="random seed")
parser.add_argument("--seed", required=False, type=int, default=3, help="random seed")
parser.add_argument('--verbose', required=False, action='store_true', help="print verbose information")
parser.add_argument(
"--verbose",
required=False,
action="store_true",
help="print verbose information",
)
parser.set_defaults(verbose=False)
parser.add_argument('--only_input_tensors',
required=False,
action='store_true',
help="only save input tensors and no output tensors")
parser.add_argument(
"--only_input_tensors",
required=False,
action="store_true",
help="only save input tensors and no output tensors",
)
parser.set_defaults(only_input_tensors=False)
args = parser.parse_args()
return args
def create_and_save_test_data(model: str, output_dir: str, batch_size: int, sequence_length: int, test_cases: int,
seed: int, verbose: bool, input_ids_name: str, segment_ids_name: str,
input_mask_name: str, only_input_tensors: bool):
def create_and_save_test_data(
model: str,
output_dir: str,
batch_size: int,
sequence_length: int,
test_cases: int,
seed: int,
verbose: bool,
input_ids_name: str,
segment_ids_name: str,
input_mask_name: str,
only_input_tensors: bool,
):
"""Create test data for a model, and save test data to a directory.
Args:
model (str): path of ONNX bert model
output_dir (str): output directory
batch_size (int): batch size
sequence_length (int): sequence length
@ -378,33 +469,36 @@ def create_and_save_test_data(model: str, output_dir: str, batch_size: int, sequ
"""
input_ids, segment_ids, input_mask = get_bert_inputs(model, input_ids_name, segment_ids_name, input_mask_name)
all_inputs = generate_test_data(batch_size,
sequence_length,
test_cases,
seed,
verbose,
input_ids,
segment_ids,
input_mask,
random_mask_length=False)
all_inputs = generate_test_data(
batch_size,
sequence_length,
test_cases,
seed,
verbose,
input_ids,
segment_ids,
input_mask,
random_mask_length=False,
)
for i, inputs in enumerate(all_inputs):
dir = os.path.join(output_dir, 'test_data_set_' + str(i))
dir = os.path.join(output_dir, "test_data_set_" + str(i))
output_test_data(dir, inputs)
if only_input_tensors:
return
import onnxruntime
sess = onnxruntime.InferenceSession(model)
output_names = [output.name for output in sess.get_outputs()]
for i, inputs in enumerate(all_inputs):
dir = os.path.join(output_dir, 'test_data_set_' + str(i))
dir = os.path.join(output_dir, "test_data_set_" + str(i))
result = sess.run(output_names, inputs)
for i, output_name in enumerate(output_names):
tensor_result = numpy_helper.from_array(np.asarray(result[i]), output_names[i])
with open(os.path.join(dir, 'output_{}.pb'.format(i)), 'wb') as f:
with open(os.path.join(dir, "output_{}.pb".format(i)), "wb") as f:
f.write(tensor_result.SerializeToString())
@ -424,9 +518,19 @@ def main():
else:
print("Directory existed. test data files will be overwritten.")
create_and_save_test_data(args.model, output_dir, args.batch_size, args.sequence_length, args.samples, args.seed,
args.verbose, args.input_ids_name, args.segment_ids_name, args.input_mask_name,
args.only_input_tensors)
create_and_save_test_data(
args.model,
output_dir,
args.batch_size,
args.sequence_length,
args.samples,
args.seed,
args.verbose,
args.input_ids_name,
args.segment_ids_name,
args.input_mask_name,
args.only_input_tensors,
)
print("Test data is saved to directory:", output_dir)

View file

@ -1,27 +1,28 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
# --------------------------------------------------------------------------
# It is a tool to compare the inference results of the original model and optimized model.
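# Illustrative command (the script and model file names are assumptions, since the diff view omits file paths;
# the flags are defined in parse_arguments below):
# python compare_bert_results.py --baseline_model bert.onnx --optimized_model bert_opt.onnx --batch_size 1 --sequence_length 128 --samples 100 --rtol 1e-3 --atol 1e-4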
import sys
import argparse
import numpy as np
import csv
import os
import random
from pathlib import Path
import statistics
import sys
import timeit
from datetime import datetime
from pathlib import Path
import numpy as np
import onnx
import onnx.utils
import psutil
import csv
import timeit
from datetime import datetime
from bert_perf_test import create_session, onnxruntime_inference
from bert_test_data import generate_test_data, get_bert_inputs, output_test_data
from onnx import ModelProto, TensorProto, numpy_helper
from onnx_model import OnnxModel
from bert_test_data import get_bert_inputs, generate_test_data, output_test_data
from bert_perf_test import create_session, onnxruntime_inference
def run_model(model_path, all_inputs, use_gpu, disable_optimization):
@ -64,51 +65,75 @@ def compare(baseline_results, treatment_results, verbose, rtol=1e-3, atol=1e-4):
print("rel_diff={} abs_diff={}".format(rel_diff, abs_diff))
if diff_count == 0:
print("100% passed for {} random inputs given thresholds (rtol={}, atol={}).".format(
len(baseline_results), rtol, atol))
print(
"100% passed for {} random inputs given thresholds (rtol={}, atol={}).".format(
len(baseline_results), rtol, atol
)
)
else:
print("WARNING: {} out of {} results NOT passed for thresholds (rtol={}, atol={}).".format(
diff_count, len(baseline_results), rtol, atol))
print(
"WARNING: {} out of {} results NOT passed for thresholds (rtol={}, atol={}).".format(
diff_count, len(baseline_results), rtol, atol
)
)
print("maximum absolute difference={}".format(max_abs_diff))
print("maximum relative difference={}".format(max_rel_diff))
def run_test(baseline_model, optimized_model, output_dir, batch_size, sequence_length, use_gpu, test_cases, seed,
verbose, rtol, atol, input_ids_name, segment_ids_name, input_mask_name):
def run_test(
baseline_model,
optimized_model,
output_dir,
batch_size,
sequence_length,
use_gpu,
test_cases,
seed,
verbose,
rtol,
atol,
input_ids_name,
segment_ids_name,
input_mask_name,
):
# Try deduce input names from optimized model.
input_ids, segment_ids, input_mask = get_bert_inputs(optimized_model, input_ids_name, segment_ids_name,
input_mask_name)
input_ids, segment_ids, input_mask = get_bert_inputs(
optimized_model, input_ids_name, segment_ids_name, input_mask_name
)
# Use random mask length for accuracy test. It might introduce slight inflation in latency reported in this script.
all_inputs = generate_test_data(batch_size,
sequence_length,
test_cases,
seed,
verbose,
input_ids,
segment_ids,
input_mask,
random_mask_length=True)
all_inputs = generate_test_data(
batch_size,
sequence_length,
test_cases,
seed,
verbose,
input_ids,
segment_ids,
input_mask,
random_mask_length=True,
)
baseline_results, baseline_latency, output_names = run_model(baseline_model,
all_inputs,
use_gpu,
disable_optimization=True)
baseline_results, baseline_latency, output_names = run_model(
baseline_model, all_inputs, use_gpu, disable_optimization=True
)
if verbose:
print("baseline average latency (all optimizations disabled): {} ms".format(
statistics.mean(baseline_latency) * 1000))
print(
"baseline average latency (all optimizations disabled): {} ms".format(
statistics.mean(baseline_latency) * 1000
)
)
if output_dir is not None:
for i, inputs in enumerate(all_inputs):
output_test_data(output_dir, i, inputs)
treatment_results, treatment_latency, treatment_output_names = run_model(optimized_model,
all_inputs,
use_gpu,
disable_optimization=False)
treatment_results, treatment_latency, treatment_output_names = run_model(
optimized_model, all_inputs, use_gpu, disable_optimization=False
)
if verbose:
print("treatment average latency: {} ms".format(statistics.mean(treatment_latency) * 1000))
@ -118,41 +143,79 @@ def run_test(baseline_model, optimized_model, output_dir, batch_size, sequence_l
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('--baseline_model', required=True, type=str, help="baseline onnx model path.")
parser.add_argument("--baseline_model", required=True, type=str, help="baseline onnx model path.")
parser.add_argument('--optimized_model',
required=True,
type=str,
default=None,
help="path of the optimized model. It shall have same inputs as the baseline model.")
parser.add_argument(
"--optimized_model",
required=True,
type=str,
default=None,
help="path of the optimized model. It shall have same inputs as the baseline model.",
)
parser.add_argument('--output_dir',
required=False,
type=str,
default=None,
help="output test data path. If not specified, test data will not be saved.")
parser.add_argument(
"--output_dir",
required=False,
type=str,
default=None,
help="output test data path. If not specified, test data will not be saved.",
)
parser.add_argument('--batch_size', required=True, type=int, help="batch size of input")
parser.add_argument("--batch_size", required=True, type=int, help="batch size of input")
parser.add_argument('--sequence_length', required=True, type=int, help="maximum sequence length of input")
parser.add_argument(
"--sequence_length",
required=True,
type=int,
help="maximum sequence length of input",
)
parser.add_argument('--rtol', required=False, type=float, default=1e-3, help="relative tolerance")
parser.add_argument("--rtol", required=False, type=float, default=1e-3, help="relative tolerance")
parser.add_argument('--atol', required=False, type=float, default=1e-4, help="absolute tolerance")
parser.add_argument("--atol", required=False, type=float, default=1e-4, help="absolute tolerance")
parser.add_argument('--samples', required=False, type=int, default=100, help="number of test cases to be generated")
parser.add_argument(
"--samples",
required=False,
type=int,
default=100,
help="number of test cases to be generated",
)
parser.add_argument('--seed', required=False, type=int, default=3, help="random seed")
parser.add_argument("--seed", required=False, type=int, default=3, help="random seed")
parser.add_argument('--use_gpu', required=False, action='store_true', help="use GPU")
parser.add_argument("--use_gpu", required=False, action="store_true", help="use GPU")
parser.set_defaults(use_gpu=False)
parser.add_argument('--verbose', required=False, action='store_true', help="print verbose information")
parser.add_argument(
"--verbose",
required=False,
action="store_true",
help="print verbose information",
)
parser.set_defaults(verbose=False)
parser.add_argument('--input_ids', required=False, type=str, default=None, help="input name for input ids")
parser.add_argument('--segment_ids', required=False, type=str, default=None, help="input name for segment ids")
parser.add_argument('--input_mask', required=False, type=str, default=None, help="input name for attention mask")
parser.add_argument(
"--input_ids",
required=False,
type=str,
default=None,
help="input name for input ids",
)
parser.add_argument(
"--segment_ids",
required=False,
type=str,
default=None,
help="input name for segment ids",
)
parser.add_argument(
"--input_mask",
required=False,
type=str,
default=None,
help="input name for attention mask",
)
args = parser.parse_args()
return args
@ -166,9 +229,22 @@ def main():
path = Path(args.output_dir)
path.mkdir(parents=True, exist_ok=True)
run_test(args.baseline_model, args.optimized_model, args.output_dir, args.batch_size, args.sequence_length,
args.use_gpu, args.samples, args.seed, args.verbose, args.rtol, args.atol, args.input_ids,
args.segment_ids, args.input_mask)
run_test(
args.baseline_model,
args.optimized_model,
args.output_dir,
args.batch_size,
args.sequence_length,
args.use_gpu,
args.samples,
args.seed,
args.verbose,
args.rtol,
args.atol,
args.input_ids,
args.segment_ids,
args.input_mask,
)
if __name__ == "__main__":

View file

@ -1,7 +1,7 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
"""
This converts GPT2 or T5 model to onnx with beam search operator.
@ -13,161 +13,203 @@ Example 2: convert T5 model with beam search:
python convert_beam_search.py -m t5-small --model_type t5 --decoder_onnx ./onnx_models/t5-small_decoder.onnx --encoder_decoder_init_onnx ./onnx_models/t5-small_encoder_decoder_init.onnx --output ./onnx_models/t5_small_beam_search.onnx
"""
import os
import time
import onnx
import logging
import argparse
import logging
import os
import sys
import time
from pathlib import Path
from onnx import helper
import numpy as np
from typing import List, Union
import numpy as np
import onnx
import torch
from benchmark_helper import Precision
from onnx import helper
from onnx import onnx_pb as onnx_proto
from packaging import version
from transformers import GPT2Config, T5Config
from benchmark_helper import Precision
from onnx import onnx_pb as onnx_proto
import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), 'models', 'gpt2'))
from gpt2_helper import PRETRAINED_GPT2_MODELS
sys.path.append(os.path.join(os.path.dirname(__file__), "models", "gpt2"))
from convert_to_onnx import main as convert_gpt2_to_onnx
from gpt2_helper import PRETRAINED_GPT2_MODELS
config: Union[GPT2Config, T5Config] = None
logger = logging.getLogger('')
logger = logging.getLogger("")
def parse_arguments(argv=None):
parser = argparse.ArgumentParser()
parser.add_argument('-m',
'--model_name_or_path',
required=True,
type=str,
help='Model path, or pretrained model name in the list: ' + ', '.join(PRETRAINED_GPT2_MODELS))
parser.add_argument(
"-m",
"--model_name_or_path",
required=True,
type=str,
help="Model path, or pretrained model name in the list: " + ", ".join(PRETRAINED_GPT2_MODELS),
)
parser.add_argument('--model_type',
required=False,
type=str,
default="gpt2",
choices=["gpt2", "t5"],
help='Model type in the list: ' + ', '.join(["gpt2", "t5"]))
parser.add_argument(
"--model_type",
required=False,
type=str,
default="gpt2",
choices=["gpt2", "t5"],
help="Model type in the list: " + ", ".join(["gpt2", "t5"]),
)
parser.add_argument('--cache_dir',
required=False,
type=str,
default=os.path.join('.', 'cache_models'),
help='Directory to cache pre-trained models')
parser.add_argument(
"--cache_dir",
required=False,
type=str,
default=os.path.join(".", "cache_models"),
help="Directory to cache pre-trained models",
)
parser.add_argument('--decoder_onnx',
required=True,
type=str,
help='Output directory for decoder onnx model, or model path ends with .onnx')
parser.add_argument(
"--decoder_onnx",
required=True,
type=str,
help="Output directory for decoder onnx model, or model path ends with .onnx",
)
parser.add_argument('--encoder_decoder_init_onnx',
required=False,
type=str,
default="",
help='path of ONNX model for encoder and decoder initialization. Required for t5 model type.')
parser.add_argument(
"--encoder_decoder_init_onnx",
required=False,
type=str,
default="",
help="path of ONNX model for encoder and decoder initialization. Required for t5 model type.",
)
parser.add_argument('--output',
required=False,
type=str,
help='Output directory for beam search model, or model path ends with .onnx')
parser.add_argument(
"--output",
required=False,
type=str,
help="Output directory for beam search model, or model path ends with .onnx",
)
parser.add_argument("-p",
"--precision",
required=False,
type=Precision,
default=Precision.FLOAT32,
choices=[Precision.FLOAT32, Precision.FLOAT16],
help="Precision of model to run. fp32 for full precision, fp16 for half or mixed precision")
parser.add_argument(
"-p",
"--precision",
required=False,
type=Precision,
default=Precision.FLOAT32,
choices=[Precision.FLOAT32, Precision.FLOAT16],
help="Precision of model to run. fp32 for full precision, fp16 for half or mixed precision",
)
parser.add_argument('--use_gpu', required=False, action='store_true', help="use GPU for inference")
parser.add_argument("--use_gpu", required=False, action="store_true", help="use GPU for inference")
parser.set_defaults(use_gpu=False)
parser.add_argument('-e', '--use_external_data_format', required=False, action='store_true')
parser.add_argument("-e", "--use_external_data_format", required=False, action="store_true")
parser.set_defaults(use_external_data_format=False)
parser.add_argument('--disable_parity', required=False, action='store_true', help="do not run parity test")
parser.add_argument(
"--disable_parity",
required=False,
action="store_true",
help="do not run parity test",
)
parser.set_defaults(disable_parity=False)
parser.add_argument('--torch_performance', required=False, action='store_true', help="test PyTorch performance")
parser.add_argument(
"--torch_performance",
required=False,
action="store_true",
help="test PyTorch performance",
)
parser.set_defaults(torch_performance=False)
parser.add_argument('--total_runs',
required=False,
type=int,
default=1,
help='Number of times of inference for latency measurement')
parser.add_argument(
"--total_runs",
required=False,
type=int,
default=1,
help="Number of times of inference for latency measurement",
)
beam_search_group = parser.add_argument_group("beam search options")
beam_search_group.add_argument('--output_sequences_scores',
required=False,
action='store_true',
help="output sequences scores")
beam_search_group.add_argument(
"--output_sequences_scores",
required=False,
action="store_true",
help="output sequences scores",
)
beam_search_group.set_defaults(output_sequences_scores=False)
beam_search_group.add_argument('--output_token_scores',
required=False,
action='store_true',
help="output token scores")
beam_search_group.add_argument(
"--output_token_scores",
required=False,
action="store_true",
help="output token scores",
)
beam_search_group.set_defaults(output_token_scores=False)
beam_search_group.add_argument('--early_stopping', required=False, action='store_true')
beam_search_group.add_argument("--early_stopping", required=False, action="store_true")
beam_search_group.set_defaults(early_stopping=False)
beam_search_group.add_argument('--min_length', type=int, required=False, default=1, help='Min sequence length')
beam_search_group.add_argument("--min_length", type=int, required=False, default=1, help="Min sequence length")
beam_search_group.add_argument('--max_length', type=int, required=False, default=50, help='Max sequence length')
beam_search_group.add_argument('--no_repeat_ngram_size',
type=int,
required=False,
default=0,
help='No repeat ngram size')
beam_search_group.add_argument('--num_beams', type=int, required=False, default=4, help='Beam size')
beam_search_group.add_argument('--num_return_sequences',
type=int,
required=False,
default=1,
help='Number of return sequence <= num_beams')
beam_search_group.add_argument('--temperature',
type=float,
required=False,
default=1,
help='Softmax temperature for output logits.')
beam_search_group.add_argument('--length_penalty',
type=float,
required=False,
default=1,
help='Positive. >1 to penalize and <1 to encourage short sentences.')
beam_search_group.add_argument('--repetition_penalty',
type=float,
required=False,
default=1,
help='Positive. >1 to penalize and <1 to encourage.')
beam_search_group.add_argument('--vocab_size',
type=int,
required=False,
default=-1,
help="Vocab_size of the underlying model")
beam_search_group.add_argument("--max_length", type=int, required=False, default=50, help="Max sequence length")
beam_search_group.add_argument(
'--prefix_vocab_mask',
"--no_repeat_ngram_size",
type=int,
required=False,
action='store_true',
help="This vocab mask applies only to first iteration, enable if last word in query might need auto complete")
default=0,
help="No repeat ngram size",
)
beam_search_group.add_argument("--num_beams", type=int, required=False, default=4, help="Beam size")
beam_search_group.add_argument(
"--num_return_sequences",
type=int,
required=False,
default=1,
help="Number of return sequence <= num_beams",
)
beam_search_group.add_argument(
"--temperature",
type=float,
required=False,
default=1,
help="Softmax temperature for output logits.",
)
beam_search_group.add_argument(
"--length_penalty",
type=float,
required=False,
default=1,
help="Positive. >1 to penalize and <1 to encorage short sentence.",
)
beam_search_group.add_argument(
"--repetition_penalty",
type=float,
required=False,
default=1,
help="Positive. >1 to penalize and <1 to encorage.",
)
beam_search_group.add_argument(
"--vocab_size",
type=int,
required=False,
default=-1,
help="Vocab_size of the underlying model",
)
beam_search_group.add_argument(
"--prefix_vocab_mask",
required=False,
action="store_true",
help="This vocab mask applies only to first iteration, enable if last word in query might need auto complete",
)
beam_search_group.set_defaults(prefix_vocab_mask=False)
args = parser.parse_args(argv)
@ -180,39 +222,40 @@ def gpt2_to_onnx(args):
print(f"use convert_to_onnx.py to convert model {model_name} to onnx {args.decoder_onnx} ...")
arguments = [
'--model_name_or_path',
"--model_name_or_path",
model_name,
'--output',
"--output",
args.decoder_onnx,
'--optimize_onnx',
'--precision',
'fp32' if args.precision == Precision.FLOAT32 else 'fp16',
'--test_runs',
'1',
'--test_cases',
'10',
'--use_int32_inputs'  # BeamSearch requires int32 for input_ids, position_ids and attention_mask
"--optimize_onnx",
"--precision",
"fp32" if args.precision == Precision.FLOAT32 else "fp16",
"--test_runs",
"1",
"--test_cases",
"10",
"--use_int32_inputs", # BeamSearch requires to use int32 for input_ids, postion_ids and attention_mask
]
if args.use_gpu:
arguments.append('--use_gpu')
arguments.append("--use_gpu")
if args.use_external_data_format:
arguments.append('--use_external_data_format')
arguments.append("--use_external_data_format")
if args.precision == Precision.FLOAT16:
assert args.use_gpu, "fp16 or mixed precision model cannot run on CPU. Please add --use_gpu"
# TODO: Use auto mixed precision for fp16 conversion: arguments.append('--auto_mixed_precision')
# Need change cuda kernel to support a combination of fp32 logits and fp16 past state.
# Currently logits and past state shall be same data type.
arguments.extend(['--op_block_list', 'Add', 'LayerNormalization', 'FastGelu'])
arguments.extend(["--op_block_list", "Add", "LayerNormalization", "FastGelu"])
convert_gpt2_to_onnx(arguments)
def shape_inference(decoder_onnx_path):
if version.parse(onnx.__version__) >= version.parse('1.11.0'):
if version.parse(onnx.__version__) >= version.parse("1.11.0"):
logger.warn("SymbolicShapeInference might fail using onnx version 1.11. Please install 1.10.0 for now.")
# Run symbolic shape inference to work around the ORT shape inference issue for subgraphs.
from onnxruntime.tools.symbolic_shape_infer import SymbolicShapeInference
out = SymbolicShapeInference.infer_shapes(onnx.load(decoder_onnx_path), auto_merge=True, guess_output_rank=False)
if out:
# TODO: Use external format if input has extra data.
@ -222,12 +265,15 @@ def shape_inference(decoder_onnx_path):
def create_ort_session(model_path, use_gpu):
from onnxruntime import SessionOptions, InferenceSession, __version__ as ort_version, GraphOptimizationLevel, get_available_providers
from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions
from onnxruntime import __version__ as ort_version
from onnxruntime import get_available_providers
sess_options = SessionOptions()
sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL
execution_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider'] if use_gpu else ['CPUExecutionProvider']
execution_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] if use_gpu else ["CPUExecutionProvider"]
if use_gpu:
if 'CUDAExecutionProvider' not in get_available_providers():
if "CUDAExecutionProvider" not in get_available_providers():
raise RuntimeError("CUDAExecutionProvider is not avaiable for --use_gpu!")
else:
print("use CUDAExecutionProvider")
@ -237,12 +283,12 @@ def create_ort_session(model_path, use_gpu):
def verify_gpt2_subgraph(graph, precision):
is_float16 = (Precision.FLOAT16 == precision)
is_float16 = Precision.FLOAT16 == precision
input_count = len(graph.input)
layer_count = input_count - 3
expected_inputs = ['input_ids', 'position_ids', 'attention_mask'] + [f"past_{i}" for i in range(layer_count)]
expected_inputs = ["input_ids", "position_ids", "attention_mask"] + [f"past_{i}" for i in range(layer_count)]
if len(graph.input) != len(expected_inputs):
raise ValueError(f"Number of inputs expected to be {len(expected_inputs)}. Got {len(graph.input)}")
@ -260,7 +306,7 @@ def verify_gpt2_subgraph(graph, precision):
)
print("Verifying GPT-2 graph inputs: name and data type are good.")
expected_outputs = ['logits'] + [f"present_{i}" for i in range(layer_count)]
expected_outputs = ["logits"] + [f"present_{i}" for i in range(layer_count)]
if len(graph.output) != len(expected_outputs):
raise ValueError(f"Number of outputs expected to be {len(expected_outputs)}. Got {len(graph.output)}")
@ -327,8 +373,15 @@ def convert_model(args):
verify_t5_decoder_subgraph(model.graph, args.precision)
inputs = [
"input_ids", "max_length", "min_length", "num_beams", "num_return_sequences", "temperature", "length_penalty",
"repetition_penalty", "vocab_mask"
"input_ids",
"max_length",
"min_length",
"num_beams",
"num_return_sequences",
"temperature",
"length_penalty",
"repetition_penalty",
"vocab_mask",
]
if args.prefix_vocab_mask:
inputs.append("prefix_vocab_mask")
@ -341,16 +394,23 @@ def convert_model(args):
assert args.output_sequences_scores, "--output_token_scores requires --output_sequences_scores"
outputs.append("scores")
node = helper.make_node('BeamSearch', inputs=inputs, outputs=outputs, name=f'BeamSearch_{args.model_type}')
node = helper.make_node(
"BeamSearch",
inputs=inputs,
outputs=outputs,
name=f"BeamSearch_{args.model_type}",
)
node.domain = "com.microsoft"
node.attribute.extend([
helper.make_attribute("eos_token_id", eos_token_id),
helper.make_attribute("pad_token_id", pad_token_id),
helper.make_attribute("no_repeat_ngram_size", args.no_repeat_ngram_size),
helper.make_attribute("early_stopping", 1 if args.early_stopping else 0),
helper.make_attribute("model_type", 0 if args.model_type == "gpt2" else 1),
helper.make_attribute("decoder", model.graph),
])
node.attribute.extend(
[
helper.make_attribute("eos_token_id", eos_token_id),
helper.make_attribute("pad_token_id", pad_token_id),
helper.make_attribute("no_repeat_ngram_size", args.no_repeat_ngram_size),
helper.make_attribute("early_stopping", 1 if args.early_stopping else 0),
helper.make_attribute("model_type", 0 if args.model_type == "gpt2" else 1),
helper.make_attribute("decoder", model.graph),
]
)
if args.model_type == "t5":
if enable_shape_inference:
@ -359,42 +419,59 @@ def convert_model(args):
init_model = onnx.load(args.encoder_decoder_init_onnx)
init_model.graph.name = f"{args.model_type} encoder decoder init subgraph"
verify_t5_encoder_decoder_init_subgraph(init_model.graph, args.precision)
node.attribute.extend([
helper.make_attribute("encoder_decoder_init", init_model.graph),
])
node.attribute.extend(
[
helper.make_attribute("encoder_decoder_init", init_model.graph),
]
)
from onnx import TensorProto
# graph inputs
input_ids = helper.make_tensor_value_info('input_ids', TensorProto.INT32, ['batch_size', 'sequence_length'])
max_length = helper.make_tensor_value_info('max_length', TensorProto.INT32, [1])
min_length = helper.make_tensor_value_info('min_length', TensorProto.INT32, [1])
num_beams = helper.make_tensor_value_info('num_beams', TensorProto.INT32, [1])
num_return_sequences = helper.make_tensor_value_info('num_return_sequences', TensorProto.INT32, [1])
temperature = helper.make_tensor_value_info('temperature', TensorProto.FLOAT, [1])
length_penalty = helper.make_tensor_value_info('length_penalty', TensorProto.FLOAT, [1])
repetition_penalty = helper.make_tensor_value_info('repetition_penalty', TensorProto.FLOAT, [1])
vocab_mask = helper.make_tensor_value_info('vocab_mask', TensorProto.INT32, [vocab_size])
input_ids = helper.make_tensor_value_info("input_ids", TensorProto.INT32, ["batch_size", "sequence_length"])
max_length = helper.make_tensor_value_info("max_length", TensorProto.INT32, [1])
min_length = helper.make_tensor_value_info("min_length", TensorProto.INT32, [1])
num_beams = helper.make_tensor_value_info("num_beams", TensorProto.INT32, [1])
num_return_sequences = helper.make_tensor_value_info("num_return_sequences", TensorProto.INT32, [1])
temperature = helper.make_tensor_value_info("temperature", TensorProto.FLOAT, [1])
length_penalty = helper.make_tensor_value_info("length_penalty", TensorProto.FLOAT, [1])
repetition_penalty = helper.make_tensor_value_info("repetition_penalty", TensorProto.FLOAT, [1])
vocab_mask = helper.make_tensor_value_info("vocab_mask", TensorProto.INT32, [vocab_size])
graph_inputs = [
input_ids, max_length, min_length, num_beams, num_return_sequences, temperature, length_penalty,
repetition_penalty, vocab_mask
input_ids,
max_length,
min_length,
num_beams,
num_return_sequences,
temperature,
length_penalty,
repetition_penalty,
vocab_mask,
]
if args.prefix_vocab_mask:
prefix_vocab_mask = helper.make_tensor_value_info('prefix_vocab_mask', TensorProto.INT32,
['batch_size', vocab_size])
prefix_vocab_mask = helper.make_tensor_value_info(
"prefix_vocab_mask", TensorProto.INT32, ["batch_size", vocab_size]
)
graph_inputs.append(prefix_vocab_mask)
# graph outputs
sequences = helper.make_tensor_value_info('sequences', TensorProto.INT32,
['batch_size', 'num_return_sequences', 'max_length'])
sequences = helper.make_tensor_value_info(
"sequences",
TensorProto.INT32,
["batch_size", "num_return_sequences", "max_length"],
)
sequences_scores = helper.make_tensor_value_info('sequences_scores', TensorProto.FLOAT,
['batch_size', 'num_return_sequences'])
sequences_scores = helper.make_tensor_value_info(
"sequences_scores", TensorProto.FLOAT, ["batch_size", "num_return_sequences"]
)
scores = helper.make_tensor_value_info('scores', TensorProto.FLOAT,
['max_length - sequence_length', 'batch_size', 'num_beams', vocab_size])
scores = helper.make_tensor_value_info(
"scores",
TensorProto.FLOAT,
["max_length - sequence_length", "batch_size", "num_beams", vocab_size],
)
initializers = []
@ -406,10 +483,20 @@ def convert_model(args):
if args.output_token_scores:
graph_outputs.append(scores)
new_graph = helper.make_graph([node], f'{args.model_type}-beam-search', graph_inputs, graph_outputs, initializers)
new_graph = helper.make_graph(
[node],
f"{args.model_type}-beam-search",
graph_inputs,
graph_outputs,
initializers,
)
# Create the model
new_model = helper.make_model(new_graph, producer_name='onnxruntime.transformers', opset_imports=model.opset_import)
new_model = helper.make_model(
new_graph,
producer_name="onnxruntime.transformers",
opset_imports=model.opset_import,
)
onnx.save(new_model, args.output)
@ -431,25 +518,28 @@ def test_torch_performance(args, model, input_ids, attention_mask, eos_token_id,
torch_latency = []
for _ in range(args.total_runs):
start = time.time()
_ = model.generate(input_ids=input_ids,
attention_mask=attention_mask,
max_length=args.max_length,
min_length=args.min_length,
num_beams=args.num_beams,
early_stopping=args.early_stopping,
no_repeat_ngram_size=args.no_repeat_ngram_size,
eos_token_id=eos_token_id,
pad_token_id=pad_token_id,
num_return_sequences=args.num_return_sequences,
temperature=args.temperature,
length_penalty=args.length_penalty,
repetition_penalty=args.repetition_penalty,
bad_words_ids=bad_words_ids,
return_dict_in_generate=True,
output_scores=args.output_sequences_scores or args.output_token_scores)
_ = model.generate(
input_ids=input_ids,
attention_mask=attention_mask,
max_length=args.max_length,
min_length=args.min_length,
num_beams=args.num_beams,
early_stopping=args.early_stopping,
no_repeat_ngram_size=args.no_repeat_ngram_size,
eos_token_id=eos_token_id,
pad_token_id=pad_token_id,
num_return_sequences=args.num_return_sequences,
temperature=args.temperature,
length_penalty=args.length_penalty,
repetition_penalty=args.repetition_penalty,
bad_words_ids=bad_words_ids,
return_dict_in_generate=True,
output_scores=args.output_sequences_scores or args.output_token_scores,
)
torch_latency.append(time.time() - start)
batch_size = input_ids.shape[0]
from benchmark_helper import get_latency_result
return get_latency_result(torch_latency, batch_size)
@ -469,21 +559,27 @@ def test_model(args, use_vocab_mask: bool = False, sentences: List[str] = None):
print("Skipping parity test as prefix vocab mask is not implemented by Hugging Face")
return True
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import GPT2LMHeadModel, GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path,
cache_dir=args.cache_dir,
pad_token_id=tokenizer.eos_token_id)
model = GPT2LMHeadModel.from_pretrained(
args.model_name_or_path,
cache_dir=args.cache_dir,
pad_token_id=tokenizer.eos_token_id,
)
# Use different length sentences to test batching
if sentences is None:
sentences = ["The product is released", "I enjoy walking in the park", "Test best way to invest"]
sentences = [
"The product is released",
"I enjoy walking in the park",
"Test best way to invest",
]
inputs = tokenizer(sentences, return_tensors='pt', padding=True)
inputs = tokenizer(sentences, return_tensors="pt", padding=True)
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]
@ -503,24 +599,26 @@ def test_model(args, use_vocab_mask: bool = False, sentences: List[str] = None):
torch_decoded_sequences = []
if not args.disable_parity:
print('-' * 50)
print("-" * 50)
print("Test PyTorch model and beam search with huggingface transformers...")
beam_outputs = model.generate(input_ids=input_ids,
attention_mask=attention_mask,
max_length=args.max_length,
min_length=args.min_length,
num_beams=args.num_beams,
early_stopping=args.early_stopping,
no_repeat_ngram_size=args.no_repeat_ngram_size,
eos_token_id=eos_token_id,
pad_token_id=pad_token_id,
num_return_sequences=args.num_return_sequences,
temperature=args.temperature,
length_penalty=args.length_penalty,
repetition_penalty=args.repetition_penalty,
bad_words_ids=bad_words_ids,
return_dict_in_generate=True,
output_scores=args.output_sequences_scores or args.output_token_scores)
beam_outputs = model.generate(
input_ids=input_ids,
attention_mask=attention_mask,
max_length=args.max_length,
min_length=args.min_length,
num_beams=args.num_beams,
early_stopping=args.early_stopping,
no_repeat_ngram_size=args.no_repeat_ngram_size,
eos_token_id=eos_token_id,
pad_token_id=pad_token_id,
num_return_sequences=args.num_return_sequences,
temperature=args.temperature,
length_penalty=args.length_penalty,
repetition_penalty=args.repetition_penalty,
bad_words_ids=bad_words_ids,
return_dict_in_generate=True,
output_scores=args.output_sequences_scores or args.output_token_scores,
)
print("input_ids", input_ids)
print("huggingface transformers outputs:")
print("sequences", beam_outputs.sequences)
@ -533,7 +631,7 @@ def test_model(args, use_vocab_mask: bool = False, sentences: List[str] = None):
torch_decoded_sequences.append(decoded_sequence)
print("{}: {}".format(i, decoded_sequence))
print('-' * 50)
print("-" * 50)
print("Test ONNX model and bream search with onnxruntime...")
ort_session = create_ort_session(args.output, args.use_gpu)
@ -552,15 +650,16 @@ def test_model(args, use_vocab_mask: bool = False, sentences: List[str] = None):
"temperature": np.array([args.temperature], dtype=np.float32),
"length_penalty": np.array([args.length_penalty], dtype=np.float32),
"repetition_penalty": np.array([args.repetition_penalty], dtype=np.float32),
"vocab_mask": vocab_mask
"vocab_mask": vocab_mask,
}
test_data_dir = Path(args.output).parent.as_posix()
print("test_data_dir", test_data_dir)
from bert_test_data import output_test_data
all_inputs = [inputs]
for i, inputs in enumerate(all_inputs):
dir = os.path.join(test_data_dir, 'test_data_set_' + str(i))
dir = os.path.join(test_data_dir, "test_data_set_" + str(i))
output_test_data(dir, inputs)
print("inputs", inputs)
@ -573,6 +672,7 @@ def test_model(args, use_vocab_mask: bool = False, sentences: List[str] = None):
latency.append(time.time() - start)
batch_size = input_ids.shape[0]
from benchmark_helper import get_latency_result
output = get_latency_result(latency, batch_size)
print("ORT outputs:")
@ -604,13 +704,20 @@ def test_model(args, use_vocab_mask: bool = False, sentences: List[str] = None):
print(ort_decoded_sequences)
print("-" * 50)
# Compare the generated text instead of word IDs since ORT pads to max sequence length but Torch does not.
is_same = (torch_decoded_sequences == ort_decoded_sequences)
is_same = torch_decoded_sequences == ort_decoded_sequences
print("Torch and ORT result is ", "same" if is_same else "different")
output["parity"] = is_same
if args.torch_performance:
torch_latency_output = test_torch_performance(args, model, input_ids, attention_mask, eos_token_id,
pad_token_id, bad_words_ids)
torch_latency_output = test_torch_performance(
args,
model,
input_ids,
attention_mask,
eos_token_id,
pad_token_id,
bad_words_ids,
)
print("Torch Latency", torch_latency_output)
print("ORT", output)
@ -630,5 +737,5 @@ def main(argv=None, sentences=None):
return test_model(args, use_vocab_mask=True, sentences=sentences)
if __name__ == '__main__':
if __name__ == "__main__":
main()
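
As a usage sketch (not part of the diff), the pipeline above can be driven directly from Python by calling main(); the flag spellings below are assumptions inferred from the args.* attributes referenced in the code, not verified against the parser definitions.

# Hypothetical invocation; "--model_name_or_path" and "--output" are inferred flag names.
main(
    argv=[
        "--model_name_or_path", "gpt2",
        "--output", "gpt2_beam_search.onnx",
    ],
    sentences=["The product is released", "I enjoy walking in the park"],
)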

View file

@ -1,24 +1,56 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
# --------------------------------------------------------------------------
import glob
import os
import requests
TFMODELS = {
"bert-base-uncased":
("bert", "BertConfig", "", "https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip"),
"bert-base-cased":
("bert", "BertConfig", "", "https://storage.googleapis.com/bert_models/2019_05_30/wwm_cased_L-24_H-1024_A-16.zip"),
"bert-large-uncased":
("bert", "BertConfig", "", "https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip"),
"albert-base": ("albert", "AlbertConfig", "", "https://storage.googleapis.com/albert_models/albert_base_v1.tar.gz"),
"albert-large":
("albert", "AlbertConfig", "", "https://storage.googleapis.com/albert_models/albert_large_v1.tar.gz"),
"gpt-2-117M": ("gpt2", "GPT2Config", "GPT2Model", "https://storage.googleapis.com/gpt-2/models/117M"),
"gpt-2-124M": ("gpt2", "GPT2Config", "GPT2Model", "https://storage.googleapis.com/gpt-2/models/124M")
"bert-base-uncased": (
"bert",
"BertConfig",
"",
"https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip",
),
"bert-base-cased": (
"bert",
"BertConfig",
"",
"https://storage.googleapis.com/bert_models/2019_05_30/wwm_cased_L-24_H-1024_A-16.zip",
),
"bert-large-uncased": (
"bert",
"BertConfig",
"",
"https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip",
),
"albert-base": (
"albert",
"AlbertConfig",
"",
"https://storage.googleapis.com/albert_models/albert_base_v1.tar.gz",
),
"albert-large": (
"albert",
"AlbertConfig",
"",
"https://storage.googleapis.com/albert_models/albert_large_v1.tar.gz",
),
"gpt-2-117M": (
"gpt2",
"GPT2Config",
"GPT2Model",
"https://storage.googleapis.com/gpt-2/models/117M",
),
"gpt-2-124M": (
"gpt2",
"GPT2Config",
"GPT2Model",
"https://storage.googleapis.com/gpt-2/models/124M",
),
}
@ -26,7 +58,7 @@ def download_compressed_file(tf_ckpt_url, ckpt_dir):
r = requests.get(tf_ckpt_url)
compressed_file_name = tf_ckpt_url.split("/")[-1]
compressed_file_dir = os.path.join(ckpt_dir, compressed_file_name)
with open(compressed_file_dir, 'wb') as f:
with open(compressed_file_dir, "wb") as f:
f.write(r.content)
return compressed_file_dir
@ -40,13 +72,14 @@ def get_ckpt_prefix_path(ckpt_dir):
if os.path.isfile(sub_folder_dir):
sub_folder_dir = ckpt_dir
unique_file_name = str(glob.glob(sub_folder_dir + "/*data-00000-of-00001"))
prefix = (unique_file_name.rpartition('.')[0]).split("/")[-1]
prefix = (unique_file_name.rpartition(".")[0]).split("/")[-1]
return os.path.join(sub_folder_dir, prefix)
def download_tf_checkpoint(model_name, tf_models_dir="tf_models"):
import pathlib
base_dir = os.path.join(pathlib.Path(__file__).parent.absolute(), tf_models_dir)
ckpt_dir = os.path.join(base_dir, model_name)
@ -56,32 +89,40 @@ def download_tf_checkpoint(model_name, tf_models_dir="tf_models"):
tf_ckpt_url = TFMODELS[model_name][3]
import re
if (re.search('.zip$', tf_ckpt_url) != None):
if re.search(".zip$", tf_ckpt_url) != None:
zip_dir = download_compressed_file(tf_ckpt_url, ckpt_dir)
# unzip file
import zipfile
with zipfile.ZipFile(zip_dir, 'r') as zip_ref:
with zipfile.ZipFile(zip_dir, "r") as zip_ref:
zip_ref.extractall(ckpt_dir)
os.remove(zip_dir)
return get_ckpt_prefix_path(ckpt_dir)
elif (re.search('.tar.gz$', tf_ckpt_url) != None):
elif re.search(".tar.gz$", tf_ckpt_url) != None:
tar_dir = download_compressed_file(tf_ckpt_url, ckpt_dir)
# untar file
import tarfile
with tarfile.open(tar_dir, 'r') as tar_ref:
with tarfile.open(tar_dir, "r") as tar_ref:
tar_ref.extractall(ckpt_dir)
os.remove(tar_dir)
return get_ckpt_prefix_path(ckpt_dir)
else:
for filename in ['checkpoint', 'model.ckpt.data-00000-of-00001', 'model.ckpt.index', 'model.ckpt.meta']:
for filename in [
"checkpoint",
"model.ckpt.data-00000-of-00001",
"model.ckpt.index",
"model.ckpt.meta",
]:
r = requests.get(tf_ckpt_url + "/" + filename)
with open(os.path.join(ckpt_dir, filename), 'wb') as f:
with open(os.path.join(ckpt_dir, filename), "wb") as f:
f.write(r.content)
return get_ckpt_prefix_path(ckpt_dir)
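
For reference, a minimal sketch of calling download_tf_checkpoint with one of the TFMODELS keys; the extracted layout under ./tf_models is an assumption that depends on the archive contents.

# Illustrative only: downloads and unpacks the checkpoint, then returns the
# checkpoint prefix path that init_pytorch_model consumes.
ckpt_prefix = download_tf_checkpoint("bert-base-uncased")
print(ckpt_prefix)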
@ -92,12 +133,13 @@ def init_pytorch_model(model_name, tf_checkpoint_path):
config_module = __import__("transformers", fromlist=[config_name])
model_config = getattr(config_module, config_name)
parent_path = tf_checkpoint_path.rpartition('/')[0]
parent_path = tf_checkpoint_path.rpartition("/")[0]
config_path = glob.glob(parent_path + "/*config.json")
config = model_config() if len(config_path) == 0 else model_config.from_json_file(str(config_path[0]))
if TFMODELS[model_name][2] == "":
from transformers import AutoModelForPreTraining
init_model = AutoModelForPreTraining.from_config(config)
else:
model_categroy_name = TFMODELS[model_name][2]
@ -118,11 +160,15 @@ def convert_tf_checkpoint_to_pytorch(model_name, config, init_model, tf_checkpoi
if TFMODELS[model_name][0] != "bert":
raise NotImplementedError("Only support tf2 ckeckpoint for Bert model")
from transformers import convert_bert_original_tf2_checkpoint_to_pytorch
load_tf_weight_func = convert_bert_original_tf2_checkpoint_to_pytorch.load_tf2_weights_in_bert
# Expect the transformers team to unify the signature order in the future
model = load_tf_weight_func(init_model, config, tf_checkpoint_path) if is_tf2 is False else load_tf_weight_func(
init_model, tf_checkpoint_path, config)
model = (
load_tf_weight_func(init_model, config, tf_checkpoint_path)
if is_tf2 is False
else load_tf_weight_func(init_model, tf_checkpoint_path, config)
)
model.eval()
return model
@ -140,11 +186,13 @@ def tf2pt_pipeline(model_name, is_tf2=False):
def tf2pt_pipeline_test():
# For testing on Linux only
import logging
import torch
logger = logging.getLogger('')
logger = logging.getLogger("")
for model_name in TFMODELS.keys():
config, model = tf2pt_pipeline(model_name)
assert (config.model_type is TFMODELS[model_name][0])
assert config.model_type is TFMODELS[model_name][0]
input = torch.randint(low=0, high=config.vocab_size - 1, size=(4, 128), dtype=torch.long)
try:
@ -153,5 +201,5 @@ def tf2pt_pipeline_test():
logger.exception(e)
if __name__ == '__main__':
if __name__ == "__main__":
tf2pt_pipeline_test()

View file

@ -1,48 +1,48 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
# --------------------------------------------------------------------------
# This file is modified from https://github.com/microsoft/onnxconverter-common/blob/master/onnxconverter_common/float16.py
# Modifications: keep_io_types can be a list of names; convert initializers if needed to preserve precision; add force_fp16_initializers option.
import itertools
import logging
from typing import Dict, List
import numpy as np
import onnx
from onnx import helper, numpy_helper
from onnx import onnx_pb as onnx_proto
from typing import List, Dict
import logging
logger = logging.getLogger(__name__)
def _npfloat16_to_int(np_list):
'''
"""
Convert numpy float16 to python int.
:param np_list: numpy float16 list
:return int_list: python int list
'''
return [int(bin(_.view('H'))[2:].zfill(16), 2) for _ in np_list]
"""
return [int(bin(_.view("H"))[2:].zfill(16), 2) for _ in np_list]
def convert_np_to_float16(np_array, min_positive_val=5.96e-08, max_finite_val=65504.0):
'''
"""
Convert float32 numpy array to float16 without changing sign or finiteness.
Positive values less than min_positive_val are mapped to min_positive_val.
Positive finite values greater than max_finite_val are mapped to max_finite_val.
Similar for negative values. NaN, 0, inf, and -inf are unchanged.
'''
"""
def between(a, b, c):
return np.logical_and(a < b, b < c)
np_array = np.where(between(0, np_array, min_positive_val), min_positive_val, np_array)
np_array = np.where(between(-min_positive_val, np_array, 0), -min_positive_val, np_array)
np_array = np.where(between(max_finite_val, np_array, float('inf')), max_finite_val, np_array)
np_array = np.where(between(float('-inf'), np_array, -max_finite_val), -max_finite_val, np_array)
np_array = np.where(between(max_finite_val, np_array, float("inf")), max_finite_val, np_array)
np_array = np.where(between(float("-inf"), np_array, -max_finite_val), -max_finite_val, np_array)
return np.float16(np_array)
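
A quick NumPy illustration of the clamping behavior described in the docstring (a sketch, not part of the diff):

import numpy as np

# Tiny positives are raised to min_positive_val, magnitudes beyond the fp16 range are
# clamped to +/-max_finite_val, and 0 / inf pass through unchanged.
x = np.array([0.0, 1e-10, 1e5, -1e5, np.inf], dtype=np.float32)
y = convert_np_to_float16(x)
# y is float16, approximately [0.0, 5.96e-08, 65504.0, -65504.0, inf]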
@ -62,7 +62,7 @@ def convert_tensor_float_to_float16(tensor, min_positive_val=5.96e-08, max_finit
"""
if not isinstance(tensor, onnx_proto.TensorProto):
raise ValueError('Expected input type is an ONNX TensorProto but got %s' % type(tensor))
raise ValueError("Expected input type is an ONNX TensorProto but got %s" % type(tensor))
if tensor.data_type == onnx_proto.TensorProto.FLOAT:
tensor.data_type = onnx_proto.TensorProto.FLOAT16
@ -75,7 +75,7 @@ def convert_tensor_float_to_float16(tensor, min_positive_val=5.96e-08, max_finit
# convert raw_data (bytes type)
if tensor.raw_data:
# convert n.raw_data to float
float32_list = np.fromstring(tensor.raw_data, dtype='float32')
float32_list = np.fromstring(tensor.raw_data, dtype="float32")
# convert float to float16
float16_list = convert_np_to_float16(float32_list, min_positive_val, max_finite_val)
# convert float16 to bytes and write back to raw_data
@ -89,10 +89,33 @@ def make_value_info_from_tensor(tensor):
DEFAULT_OP_BLOCK_LIST = [
'ArrayFeatureExtractor', 'Binarizer', 'CastMap', 'CategoryMapper', 'DictVectorizer', 'FeatureVectorizer', 'Imputer',
'LabelEncoder', 'LinearClassifier', 'LinearRegressor', 'Normalizer', 'OneHotEncoder', 'SVMClassifier',
'SVMRegressor', 'Scaler', 'TreeEnsembleClassifier', 'TreeEnsembleRegressor', 'ZipMap', 'NonMaxSuppression', 'TopK',
'RoiAlign', 'Resize', 'Range', 'CumSum', 'Min', 'Max', 'Upsample'
"ArrayFeatureExtractor",
"Binarizer",
"CastMap",
"CategoryMapper",
"DictVectorizer",
"FeatureVectorizer",
"Imputer",
"LabelEncoder",
"LinearClassifier",
"LinearRegressor",
"Normalizer",
"OneHotEncoder",
"SVMClassifier",
"SVMRegressor",
"Scaler",
"TreeEnsembleClassifier",
"TreeEnsembleRegressor",
"ZipMap",
"NonMaxSuppression",
"TopK",
"RoiAlign",
"Resize",
"Range",
"CumSum",
"Min",
"Max",
"Upsample",
]
@ -111,14 +134,16 @@ class InitializerTracker:
self.fp16_nodes.append(node)
def convert_float_to_float16(model,
min_positive_val=5.96e-08,
max_finite_val=65504.0,
keep_io_types=False,
disable_shape_infer=False,
op_block_list=None,
node_block_list=None,
force_fp16_initializers=False):
def convert_float_to_float16(
model,
min_positive_val=5.96e-08,
max_finite_val=65504.0,
keep_io_types=False,
disable_shape_infer=False,
op_block_list=None,
node_block_list=None,
force_fp16_initializers=False,
):
"""Convert model tensor float type in the ONNX ModelProto input to tensor float16.
Args:
@ -139,19 +164,22 @@ def convert_float_to_float16(model,
Returns:
ModelProto: converted model.
"""
assert min_positive_val >= 5.96e-08, "invalid min_positive_val. smallest positive float16 value: subnormal 5.96e-08, and normalized 6.104e-05"
assert (
min_positive_val >= 5.96e-08
), "invalid min_positive_val. smallest positive float16 value: subnormal 5.96e-08, and normalized 6.104e-05"
assert max_finite_val <= float(np.finfo(np.float16).max), "invalid max_finite_val. largest float16 value: 65504"
func_infer_shape = None
if not disable_shape_infer and onnx.__version__ >= '1.2':
if not disable_shape_infer and onnx.__version__ >= "1.2":
try:
from onnx.shape_inference import infer_shapes
func_infer_shape = infer_shapes
finally:
pass
if not isinstance(model, onnx_proto.ModelProto):
raise ValueError('Expected model type is an ONNX ModelProto but got %s' % type(model))
raise ValueError("Expected model type is an ONNX ModelProto but got %s" % type(model))
# create blocklists
if op_block_list is None:
@ -188,34 +216,34 @@ def convert_float_to_float16(model,
for i, n in enumerate(model.graph.input):
if n.name in fp32_inputs:
output_name = 'graph_input_cast_' + str(i)
output_name = "graph_input_cast_" + str(i)
name_mapping[n.name] = output_name
graph_io_to_skip.add(n.name)
node_name = 'graph_input_cast' + str(i)
node_name = "graph_input_cast" + str(i)
new_value_info = model.graph.value_info.add()
new_value_info.CopyFrom(n)
new_value_info.name = output_name
new_value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16
# add Cast node (from tensor(float) to tensor(float16)) after graph input
new_node = [helper.make_node('Cast', [n.name], [output_name], to=10, name=node_name)]
new_node = [helper.make_node("Cast", [n.name], [output_name], to=10, name=node_name)]
model.graph.node.extend(new_node)
value_info_list.append(new_value_info)
io_casts.add(node_name)
for i, n in enumerate(model.graph.output):
if n.name in fp32_outputs:
input_name = 'graph_output_cast_' + str(i)
input_name = "graph_output_cast_" + str(i)
name_mapping[n.name] = input_name
graph_io_to_skip.add(n.name)
node_name = 'graph_output_cast' + str(i)
node_name = "graph_output_cast" + str(i)
# add Cast node (from tensor(float16) to tensor(float)) before graph output
new_value_info = model.graph.value_info.add()
new_value_info.CopyFrom(n)
new_value_info.name = input_name
new_value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16
new_node = [helper.make_node('Cast', [input_name], [n.name], to=1, name=node_name)]
new_node = [helper.make_node("Cast", [input_name], [n.name], to=1, name=node_name)]
model.graph.node.extend(new_node)
value_info_list.append(new_value_info)
io_casts.add(node_name)
@ -254,9 +282,9 @@ def convert_float_to_float16(model,
if is_node_blocked:
node_list.append(n)
else:
if n.op_type == 'Cast':
if n.op_type == "Cast":
for attr in n.attribute:
if attr.name == 'to' and attr.i == 1:
if attr.name == "to" and attr.i == 1:
attr.i = 10
break
for attr in n.attribute:
@ -280,12 +308,12 @@ def convert_float_to_float16(model,
if n.name not in graph_io_to_skip:
n.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16
value_info_list.append(n)
if n.type.HasField('sequence_type'):
if n.type.HasField("sequence_type"):
if n.type.sequence_type.elem_type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT:
if n.name not in graph_io_to_skip:
n.type.sequence_type.elem_type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16
value_info_list.append(n)
queue = next_level
for key, value in fp32_initializers.items():
@ -296,7 +324,9 @@ def convert_float_to_float16(model,
if value.fp32_nodes and not force_fp16_initializers:
logger.info(
"initializer is used by both fp32 and fp16 nodes. Consider add these nodes to block list:{}".format(
value.fp16_nodes))
value.fp16_nodes
)
)
# process the nodes in block list that doesn't support tensor(float16)
for node in node_list:
@ -310,12 +340,12 @@ def convert_float_to_float16(model,
# create new value_info for current node's new input name
new_value_info = model.graph.value_info.add()
new_value_info.CopyFrom(value_info)
output_name = node.name + '_input_cast_' + str(i)
output_name = node.name + "_input_cast_" + str(i)
new_value_info.name = output_name
new_value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT
# add Cast node (from tensor(float16) to tensor(float)) before current node
node_name = node.name + '_input_cast' + str(i)
new_node = [helper.make_node('Cast', [input], [output_name], to=1, name=node_name)]
node_name = node.name + "_input_cast" + str(i)
new_node = [helper.make_node("Cast", [input], [output_name], to=1, name=node_name)]
model.graph.node.extend(new_node)
# change current node's input name
node.input[i] = output_name
@ -329,12 +359,12 @@ def convert_float_to_float16(model,
# create new value_info for current node's new output
new_value_info = model.graph.value_info.add()
new_value_info.CopyFrom(value_info)
input_name = node.name + '_output_cast_' + str(i)
input_name = node.name + "_output_cast_" + str(i)
new_value_info.name = input_name
new_value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT
# add Cast node (from tensor(float) to tensor(float16)) after current node
node_name = node.name + '_output_cast' + str(i)
new_node = [helper.make_node('Cast', [input_name], [output], to=10, name=node_name)]
node_name = node.name + "_output_cast" + str(i)
new_node = [helper.make_node("Cast", [input_name], [output], to=10, name=node_name)]
model.graph.node.extend(new_node)
# change current node's output name
node.output[i] = input_name
@ -345,15 +375,15 @@ def convert_float_to_float16(model,
def float_to_float16_max_diff(tensor, min_positive_val=5.96e-08, max_finite_val=65504.0):
"""Measure the maximum absolute difference after converting a float tensor to float16."""
if not isinstance(tensor, onnx_proto.TensorProto):
raise ValueError('Expected input type is an ONNX TensorProto but got %s' % type(tensor))
raise ValueError("Expected input type is an ONNX TensorProto but got %s" % type(tensor))
if tensor.data_type != onnx_proto.TensorProto.FLOAT:
raise ValueError('Expected tensor data type is float.')
raise ValueError("Expected tensor data type is float.")
if tensor.float_data:
float32_data = np.array(tensor.float_data)
if tensor.raw_data:
float32_data = np.fromstring(tensor.raw_data, dtype='float32')
float32_data = np.fromstring(tensor.raw_data, dtype="float32")
float16_data = convert_np_to_float16(float32_data, min_positive_val, max_finite_val)
return np.amax(np.abs(float32_data - np.float32(float16_data)))
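
A small sketch of exercising float_to_float16_max_diff on a synthetic initializer (illustrative only; the tensor name is arbitrary):

import numpy as np
from onnx import numpy_helper

# Build a float32 TensorProto and measure the worst-case error introduced by fp16 conversion.
weights = np.random.uniform(-1.0, 1.0, size=(4, 8)).astype(np.float32)
tensor = numpy_helper.from_array(weights, name="example_weight")
print("max abs diff after fp16 conversion:", float_to_float16_max_diff(tensor))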

View file

@ -1,27 +1,29 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
# --------------------------------------------------------------------------
from enum import Enum
from logging import getLogger
from os import name
from sys import path
import numpy as np
from logging import getLogger
from enum import Enum
from typing import Tuple, Union
from onnx import helper, numpy_helper, TensorProto, NodeProto
from onnx_model import OnnxModel
import numpy as np
from fusion_base import Fusion
from fusion_utils import FusionUtils, NumpyHelper
from fusion_options import AttentionMaskFormat
from fusion_utils import FusionUtils, NumpyHelper
from onnx import NodeProto, TensorProto, helper, numpy_helper
from onnx_model import OnnxModel
from shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto
logger = getLogger(__name__)
class AttentionMask():
class AttentionMask:
"""
Helper for attention masks: converts an attention mask into a mask index shared by Attention nodes.
"""
def __init__(self, model: OnnxModel):
self.model = model
# A lookup table with mask input as key, and mask index output as value
@ -66,11 +68,13 @@ class AttentionMask():
return input_name
# Add a mask processing node to convert attention mask to mask index (1D)
output_name = self.model.create_node_name('mask_index')
mask_index_node = helper.make_node('ReduceSum',
inputs=[input_name],
outputs=[output_name],
name=self.model.create_node_name('ReduceSum', 'MaskReduceSum'))
output_name = self.model.create_node_name("mask_index")
mask_index_node = helper.make_node(
"ReduceSum",
inputs=[input_name],
outputs=[output_name],
name=self.model.create_node_name("ReduceSum", "MaskReduceSum"),
)
mask_index_node.attribute.extend([helper.make_attribute("axes", [1]), helper.make_attribute("keepdims", 0)])
self.model.add_node(mask_index_node)
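
To make the inserted mask-index node concrete, here is a NumPy sketch of the same computation (not part of the diff): ReduceSum over axis 1 with keepdims=0 collapses a [batch, seq_len] attention mask into a [batch] vector of valid-token counts, which is the 1D mask index form the Attention op accepts.

import numpy as np

attention_mask = np.array([[1, 1, 1, 0], [1, 1, 0, 0]], dtype=np.int32)
mask_index = attention_mask.sum(axis=1)  # equivalent of ReduceSum(axes=[1], keepdims=0) -> [3, 2]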
@ -82,7 +86,14 @@ class FusionAttention(Fusion):
"""
Fuse Attention subgraph into one Attention node.
"""
def __init__(self, model: OnnxModel, hidden_size: int, num_heads: int, attention_mask: AttentionMask):
def __init__(
self,
model: OnnxModel,
hidden_size: int,
num_heads: int,
attention_mask: AttentionMask,
):
super().__init__(model, "Attention", ["SkipLayerNormalization", "LayerNormalization"])
self.hidden_size = hidden_size
self.num_heads = num_heads
@ -93,7 +104,7 @@ class FusionAttention(Fusion):
self.hidden_size_warning = True
def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]:
""" Detect num_heads and hidden_size from a reshape node.
"""Detect num_heads and hidden_size from a reshape node.
Args:
reshape_q (NodeProto): reshape node for Q
@ -125,7 +136,8 @@ class FusionAttention(Fusion):
if self.hidden_size > 0 and hidden_size != self.hidden_size:
if self.hidden_size_warning:
logger.warning(
f"--hidden_size is {self.hidden_size}. Detected value is {hidden_size}. Using detected value.")
f"--hidden_size is {self.hidden_size}. Detected value is {hidden_size}. Using detected value."
)
self.hidden_size_warning = False # Do not show the warning more than once
return num_heads, hidden_size
@ -148,10 +160,22 @@ class FusionAttention(Fusion):
return add_qk.input[1]
def create_attention_node(self, mask_index: str, q_matmul: NodeProto, k_matmul: NodeProto, v_matmul: NodeProto,
q_add: NodeProto, k_add: NodeProto, v_add: NodeProto, num_heads: int, hidden_size: int,
input: str, output: str, add_qk_str: str) -> Union[NodeProto, None]:
""" Create an Attention node.
def create_attention_node(
self,
mask_index: str,
q_matmul: NodeProto,
k_matmul: NodeProto,
v_matmul: NodeProto,
q_add: NodeProto,
k_add: NodeProto,
v_add: NodeProto,
num_heads: int,
hidden_size: int,
input: str,
output: str,
add_qk_str: str,
) -> Union[NodeProto, None]:
"""Create an Attention node.
Args:
mask_index (str): mask input
@ -244,27 +268,35 @@ class FusionAttention(Fusion):
qkv_bias = np.stack((qb, kb, vb), axis=0)
qkv_bias_dim = 3 * q_bias_shape
attention_node_name = self.model.create_node_name('Attention')
attention_node_name = self.model.create_node_name("Attention")
weight = helper.make_tensor(name=attention_node_name + '_qkv_weight',
data_type=TensorProto.FLOAT,
dims=[qw_in_size, qkv_weight_dim],
vals=qkv_weight.flatten().tolist())
weight = helper.make_tensor(
name=attention_node_name + "_qkv_weight",
data_type=TensorProto.FLOAT,
dims=[qw_in_size, qkv_weight_dim],
vals=qkv_weight.flatten().tolist(),
)
# Sometimes weights and bias are stored in fp16
if q_weight.data_type == 10:
weight.CopyFrom(numpy_helper.from_array(NumpyHelper.to_array(weight).astype(np.float16), weight.name))
self.model.add_initializer(weight, self.this_graph_name)
bias = helper.make_tensor(name=attention_node_name + '_qkv_bias',
data_type=TensorProto.FLOAT,
dims=[qkv_bias_dim],
vals=qkv_bias.flatten().tolist())
bias = helper.make_tensor(
name=attention_node_name + "_qkv_bias",
data_type=TensorProto.FLOAT,
dims=[qkv_bias_dim],
vals=qkv_bias.flatten().tolist(),
)
if q_bias.data_type == 10:
bias.CopyFrom(numpy_helper.from_array(NumpyHelper.to_array(bias).astype(np.float16), bias.name))
self.model.add_initializer(bias, self.this_graph_name)
attention_inputs = [input, attention_node_name + '_qkv_weight', attention_node_name + '_qkv_bias']
attention_inputs = [
input,
attention_node_name + "_qkv_weight",
attention_node_name + "_qkv_bias",
]
if mask_index is not None:
attention_inputs.append(mask_index)
else:
@ -274,16 +306,19 @@ class FusionAttention(Fusion):
attention_inputs.append("")
attention_inputs.append(add_qk_str)
attention_node = helper.make_node('Attention',
inputs=attention_inputs,
outputs=[output],
name=attention_node_name)
attention_node = helper.make_node(
"Attention",
inputs=attention_inputs,
outputs=[output],
name=attention_node_name,
)
attention_node.domain = "com.microsoft"
attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])
if is_qkv_diff_dims:
attention_node.attribute.extend(
[helper.make_attribute("qkv_hidden_sizes", [qw_out_size, kw_out_size, vw_out_size])])
[helper.make_attribute("qkv_hidden_sizes", [qw_out_size, kw_out_size, vw_out_size])]
)
return attention_node
@ -291,23 +326,27 @@ class FusionAttention(Fusion):
# Sometimes we cannot fuse SkipLayerNormalization since the Add before LayerNorm has an output that is used by nodes outside the SkipLayerNorm
# Conceptually we treat the Add before LayerNorm as a SkipLayerNorm node since they share the same pattern
start_node = normalize_node
if normalize_node.op_type == 'LayerNormalization':
add_before_layernorm = self.model.match_parent(normalize_node, 'Add', 0)
if normalize_node.op_type == "LayerNormalization":
add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0)
if add_before_layernorm is not None:
start_node = add_before_layernorm
else:
return
# SkipLayerNormalization has two inputs, and one of them is the root input for attention.
qkv_nodes = self.model.match_parent_path(start_node, ['Add', 'MatMul', 'Reshape', 'Transpose', 'MatMul'],
[None, None, 0, 0, 0])
qkv_nodes = self.model.match_parent_path(
start_node,
["Add", "MatMul", "Reshape", "Transpose", "MatMul"],
[None, None, 0, 0, 0],
)
einsum_node = None
if qkv_nodes is not None:
(_, matmul_qkv, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes
else:
# Match Albert
qkv_nodes = self.model.match_parent_path(start_node, ['Add', 'Einsum', 'Transpose', 'MatMul'],
[1, None, 0, 0])
qkv_nodes = self.model.match_parent_path(
start_node, ["Add", "Einsum", "Transpose", "MatMul"], [1, None, 0, 0]
)
if qkv_nodes is not None:
(_, einsum_node, transpose_qkv, matmul_qkv) = qkv_nodes
else:
@ -333,12 +372,12 @@ class FusionAttention(Fusion):
| |
+---------------------------------------------------------
"""
mul_before_layernorm = self.model.match_parent(start_node, 'Mul', 0)
mul_before_layernorm = self.model.match_parent(start_node, "Mul", 0)
if mul_before_layernorm is not None:
mul_children = input_name_to_nodes[mul_before_layernorm.output[0]]
if mul_children is not None and len(mul_children) == 2:
layernorm_node = mul_children[1]
if layernorm_node.op_type == 'LayerNormalization':
if layernorm_node.op_type == "LayerNormalization":
root_input = layernorm_node.output[0]
else:
return
@ -346,7 +385,7 @@ class FusionAttention(Fusion):
root_input = mul_before_layernorm.output[0]
else:
return
elif normalize_node.op_type == 'LayerNormalization':
elif normalize_node.op_type == "LayerNormalization":
children = input_name_to_nodes[root_input]
for child in children:
if child.op_type == "LayerNormalization":
@ -354,10 +393,10 @@ class FusionAttention(Fusion):
children = input_name_to_nodes[root_input]
children_types = [child.op_type for child in children]
if children_types.count('MatMul') != 3:
if children_types.count("MatMul") != 3:
return
v_nodes = self.model.match_parent_path(matmul_qkv, ['Transpose', 'Reshape', 'Add', 'MatMul'], [1, 0, 0, None])
v_nodes = self.model.match_parent_path(matmul_qkv, ["Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, None])
if v_nodes is None:
logger.debug("fuse_attention: failed to match v path")
return
@ -366,10 +405,10 @@ class FusionAttention(Fusion):
is_distill = False
is_distill_add = False
qk_paths = {
"path1": (['Softmax', 'Add', 'Div', 'MatMul'], [0, 0, None, 0]),
"path2": (['Softmax', 'Add', 'Mul', 'MatMul'], [0, 0, None, 0]),
"path3": (['Softmax', 'Where', 'MatMul', 'Div'], [0, 0, 2, 0]),
"path4": (['Softmax', 'Add', 'Where', 'MatMul'], [0, 0, 0, 2])
"path1": (["Softmax", "Add", "Div", "MatMul"], [0, 0, None, 0]),
"path2": (["Softmax", "Add", "Mul", "MatMul"], [0, 0, None, 0]),
"path3": (["Softmax", "Where", "MatMul", "Div"], [0, 0, 2, 0]),
"path4": (["Softmax", "Add", "Where", "MatMul"], [0, 0, 0, 2]),
}
qk_nodes = None
@ -397,10 +436,13 @@ class FusionAttention(Fusion):
else:
(_, add_qk, _, matmul_qk) = qk_nodes
q_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Reshape', 'Add', 'MatMul'], [0, 0, 0, None])
q_nodes = self.model.match_parent_path(matmul_qk, ["Transpose", "Reshape", "Add", "MatMul"], [0, 0, 0, None])
if q_nodes is None:
q_nodes = self.model.match_parent_path(matmul_qk, ['Div', 'Transpose', 'Reshape', 'Add', 'MatMul'],
[0, 0, 0, 0, None])
q_nodes = self.model.match_parent_path(
matmul_qk,
["Div", "Transpose", "Reshape", "Add", "MatMul"],
[0, 0, 0, 0, None],
)
if q_nodes is None:
logger.debug("fuse_attention: failed to match q path")
return
@ -408,10 +450,13 @@ class FusionAttention(Fusion):
add_q = q_nodes[-2]
matmul_q = q_nodes[-1]
k_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Reshape', 'Add', 'MatMul'], [1, 0, 0, None])
k_nodes = self.model.match_parent_path(matmul_qk, ["Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, None])
if k_nodes is None:
k_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Transpose', 'Reshape', 'Add', 'MatMul'],
[1, 0, 0, 0, None])
k_nodes = self.model.match_parent_path(
matmul_qk,
["Transpose", "Transpose", "Reshape", "Add", "MatMul"],
[1, 0, 0, 0, None],
)
if k_nodes is None:
logger.debug("fuse_attention: failed to match k path")
return
@ -422,15 +467,24 @@ class FusionAttention(Fusion):
mask_nodes = None
add_qk_str = None
if is_distill:
_, mask_nodes, _ = self.model.match_parent_paths(where_qk,
[(['Expand', 'Reshape', 'Equal'], [0, 0, 0]),
(['Equal', 'Unsqueeze', 'Unsqueeze'], [0, 0, 0]),
(['Cast', 'Expand', 'Reshape', 'Equal'], [0, 0, 0, 0])],
output_name_to_node)
_, mask_nodes, _ = self.model.match_parent_paths(
where_qk,
[
(["Expand", "Reshape", "Equal"], [0, 0, 0]),
(["Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0]),
(["Cast", "Expand", "Reshape", "Equal"], [0, 0, 0, 0]),
],
output_name_to_node,
)
elif is_distill_add:
_, mask_nodes, _ = self.model.match_parent_paths(
where_qk, [(['Cast', 'Equal', 'Unsqueeze', 'Unsqueeze'], [0, 0, 0, 0]),
(['Equal', 'Unsqueeze', 'Unsqueeze'], [0, 0, 0])], output_name_to_node)
where_qk,
[
(["Cast", "Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0, 0]),
(["Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0]),
],
output_name_to_node,
)
if add_qk is not None:
add_qk_str = self.get_add_qk_str(add_qk)
if add_qk_str is None:
@ -438,8 +492,16 @@ class FusionAttention(Fusion):
return
else:
_, mask_nodes, _ = self.model.match_parent_paths(
add_qk, [(['Mul', 'Sub', 'Cast', 'Unsqueeze', 'Unsqueeze'], [None, 0, 1, 0, 0]),
(['Mul', 'Sub', 'Unsqueeze', 'Unsqueeze'], [None, 0, 1, 0])], output_name_to_node)
add_qk,
[
(
["Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze"],
[None, 0, 1, 0, 0],
),
(["Mul", "Sub", "Unsqueeze", "Unsqueeze"], [None, 0, 1, 0]),
],
output_name_to_node,
)
if mask_nodes is None:
logger.debug("fuse_attention: failed to match mask path")
return
@ -452,9 +514,20 @@ class FusionAttention(Fusion):
q_num_heads, q_hidden_size = self.get_num_heads_and_hidden_size(reshape_q)
# The number of heads is the same for all the paths, so we pass q_num_heads when creating the attention node.
# input_hidden_size represents the input hidden size; it is used as needed, but the hidden sizes for Q and K are extracted appropriately.
new_node = self.create_attention_node(mask_index, matmul_q, matmul_k, matmul_v, add_q, add_k, add_v,
q_num_heads, q_hidden_size, root_input, attention_last_node.output[0],
add_qk_str)
new_node = self.create_attention_node(
mask_index,
matmul_q,
matmul_k,
matmul_v,
add_q,
add_k,
add_v,
q_num_heads,
q_hidden_size,
root_input,
attention_last_node.output[0],
add_qk_str,
)
if new_node is None:
return
@ -464,16 +537,23 @@ class FusionAttention(Fusion):
if einsum_node is not None:
unique_index = einsum_node.input[0]
new_edge = "edge_modified_" + unique_index
shape_tensor = helper.make_tensor(name="shape_modified_tensor" + unique_index,
data_type=TensorProto.INT64,
dims=[4],
vals=np.int64([0, 0, q_num_heads,
int(q_hidden_size / q_num_heads)]).tobytes(),
raw=True)
shape_tensor = helper.make_tensor(
name="shape_modified_tensor" + unique_index,
data_type=TensorProto.INT64,
dims=[4],
vals=np.int64([0, 0, q_num_heads, int(q_hidden_size / q_num_heads)]).tobytes(),
raw=True,
)
self.model.add_initializer(shape_tensor, self.this_graph_name)
self.model.add_node(
helper.make_node("Reshape", [attention_last_node.output[0], shape_tensor.name], [new_edge],
"reshape_modified_" + unique_index), self.this_graph_name)
helper.make_node(
"Reshape",
[attention_last_node.output[0], shape_tensor.name],
[new_edge],
"reshape_modified_" + unique_index,
),
self.this_graph_name,
)
einsum_node.input[0] = new_edge
self.nodes_to_remove.extend([attention_last_node, transpose_qkv, matmul_qkv])
@ -483,5 +563,5 @@ class FusionAttention(Fusion):
self.nodes_to_remove.extend(v_nodes)
# Use prune graph to remove mask nodes since they are shared by all attention nodes.
#self.nodes_to_remove.extend(mask_nodes)
# self.nodes_to_remove.extend(mask_nodes)
self.prune_graph = True

View file

@ -1,21 +1,24 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
# --------------------------------------------------------------------------
from logging import getLogger
from onnx_model import OnnxModel
from typing import Union, List
from typing import List, Union
from onnx import GraphProto
from onnx_model import OnnxModel
logger = getLogger(__name__)
class Fusion:
def __init__(self,
model: OnnxModel,
fused_op_type: str,
search_op_types: Union[str, List[str]],
description: str = None):
def __init__(
self,
model: OnnxModel,
fused_op_type: str,
search_op_types: Union[str, List[str]],
description: str = None,
):
self.search_op_types: List[str] = [search_op_types] if isinstance(search_op_types, str) else search_op_types
self.fused_op_type: str = fused_op_type
self.description: str = f"{fused_op_type}({description})" if description else fused_op_type

View file

@ -1,13 +1,14 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
# --------------------------------------------------------------------------
from logging import getLogger
from onnx import helper
from onnx_model import OnnxModel
from fusion_base import Fusion
from fusion_utils import NumpyHelper
from onnx import helper
from onnx_model import OnnxModel
logger = getLogger(__name__)
@ -15,18 +16,18 @@ logger = getLogger(__name__)
class FusionBiasGelu(Fusion):
def __init__(self, model: OnnxModel, is_fastgelu):
if is_fastgelu:
super().__init__(model, 'FastGelu', 'FastGelu', 'add bias')
super().__init__(model, "FastGelu", "FastGelu", "add bias")
else:
super().__init__(model, 'BiasGelu', 'Gelu')
super().__init__(model, "BiasGelu", "Gelu")
def fuse(self, node, input_name_to_nodes, output_name_to_node):
gelu_op_type = node.op_type
fuse_op_type = 'BiasGelu' if gelu_op_type == 'Gelu' else 'FastGelu'
fuse_op_type = "BiasGelu" if gelu_op_type == "Gelu" else "FastGelu"
if len(node.input) != 1:
return
nodes = self.model.match_parent_path(node, ['Add', 'MatMul'], [0, None])
nodes = self.model.match_parent_path(node, ["Add", "MatMul"], [0, None])
if nodes is None:
return
(add, matmul) = nodes
@ -47,16 +48,19 @@ class FusionBiasGelu(Fusion):
return
subgraph_nodes = [node, add]
if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [node.output[0]], input_name_to_nodes,
output_name_to_node):
if not self.model.is_safe_to_fuse_nodes(
subgraph_nodes, [node.output[0]], input_name_to_nodes, output_name_to_node
):
return
self.nodes_to_remove.extend(subgraph_nodes)
fused_node = helper.make_node(fuse_op_type,
inputs=[matmul.output[0], add.input[bias_index]],
outputs=node.output,
name=self.model.create_node_name(fuse_op_type, gelu_op_type + "_AddBias_"))
fused_node = helper.make_node(
fuse_op_type,
inputs=[matmul.output[0], add.input[bias_index]],
outputs=node.output,
name=self.model.create_node_name(fuse_op_type, gelu_op_type + "_AddBias_"),
)
fused_node.domain = "com.microsoft"
self.nodes_to_add.append(fused_node)
self.node_name_to_graph_name[fused_node.name] = self.this_graph_name

View file

@ -1,26 +1,32 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
# --------------------------------------------------------------------------
from typing import Dict, List, Tuple, Union
from logging import getLogger
from onnx import helper, TensorProto, NodeProto
from onnx_model import OnnxModel
from typing import Dict, List, Tuple, Union
from fusion_base import Fusion
from fusion_utils import FusionUtils
from onnx import NodeProto, TensorProto, helper
from onnx_model import OnnxModel
logger = getLogger(__name__)
class FusionEmbedLayerNoMask(Fusion):
"""
Fuse embedding layer into one node (EmbedLayerNormalization).
It supports the following model types: BERT, DistilBert, ALBert.
Fuse embedding layer into one node (EmbedLayerNormalization).
It supports the following model types: BERT, DistilBert, ALBert.
"""
def __init__(self, model: OnnxModel, description: str = 'no mask'):
super().__init__(model, "EmbedLayerNormalization", ["LayerNormalization", "SkipLayerNormalization"],
description)
def __init__(self, model: OnnxModel, description: str = "no mask"):
super().__init__(
model,
"EmbedLayerNormalization",
["LayerNormalization", "SkipLayerNormalization"],
description,
)
self.utils = FusionUtils(model)
self.shape_infer_helper = self.model.infer_runtime_shape({}, update=True)
# The following will be reset in each fuse call of FusionEmbedLayerNormalization
@ -28,18 +34,22 @@ class FusionEmbedLayerNoMask(Fusion):
self.embed_node = None
def match_two_gather(self, add: NodeProto) -> Union[None, Tuple[NodeProto, NodeProto]]:
gather_0_path = self.model.match_parent_path(add, ['Gather'], [0])
gather_0_path = self.model.match_parent_path(add, ["Gather"], [0])
if gather_0_path is None:
return None
gather_1_path = self.model.match_parent_path(add, ['Gather'], [1])
gather_1_path = self.model.match_parent_path(add, ["Gather"], [1])
if gather_1_path is None:
return None
return gather_0_path[0], gather_1_path[0]
def check_attention_subgraph(self, layernorm: NodeProto, input_name_to_nodes: Dict[str, List[NodeProto]],
is_distil_bert: bool) -> bool:
def check_attention_subgraph(
self,
layernorm: NodeProto,
input_name_to_nodes: Dict[str, List[NodeProto]],
is_distil_bert: bool,
) -> bool:
"""Check that LayerNormalization has a child of Attention node or subgraph like Attention.
Args:
@ -50,10 +60,9 @@ class FusionEmbedLayerNoMask(Fusion):
Returns:
bool: whether there is Attention node or subgraph like Attention
"""
self.attention = self.model.find_first_child_by_type(layernorm,
'Attention',
input_name_to_nodes,
recursive=False)
self.attention = self.model.find_first_child_by_type(
layernorm, "Attention", input_name_to_nodes, recursive=False
)
if self.attention is None:
# In case user disables attention fusion, check whether subgraph looks like Attention.
if layernorm.output[0] not in input_name_to_nodes:
@ -63,8 +72,11 @@ class FusionEmbedLayerNoMask(Fusion):
# For Albert, there is MatMul+Add after embedding layer before attention.
if len(children) == 1 and children[0].op_type == "MatMul" and children[0].output[0] in input_name_to_nodes:
grandchildren = input_name_to_nodes[children[0].output[0]]
if len(grandchildren) == 1 and grandchildren[0].op_type == "Add" and grandchildren[0].output[
0] in input_name_to_nodes:
if (
len(grandchildren) == 1
and grandchildren[0].op_type == "Add"
and grandchildren[0].output[0] in input_name_to_nodes
):
nodes = input_name_to_nodes[grandchildren[0].output[0]]
for node in nodes:
if node.op_type == "Attention":
@ -77,14 +89,20 @@ class FusionEmbedLayerNoMask(Fusion):
# Two Shape nodes might be merged by ORT
if is_distil_bert:
# SkipLayerNormalization might exist when the model has been optimized by ORT first.
if children_types != ['MatMul', 'MatMul', 'MatMul', 'Shape', 'SkipLayerNormalization'] and \
children_types != ['Add', 'MatMul', 'MatMul', 'MatMul', 'Shape', 'Shape'] and \
children_types != ['Add', 'MatMul', 'MatMul', 'MatMul', 'Shape']:
if (
children_types != ["MatMul", "MatMul", "MatMul", "Shape", "SkipLayerNormalization"]
and children_types != ["Add", "MatMul", "MatMul", "MatMul", "Shape", "Shape"]
and children_types != ["Add", "MatMul", "MatMul", "MatMul", "Shape"]
):
logger.debug("No Attention like subgraph in children of LayerNormalization")
return False
else:
if children_types != ['Add', 'MatMul', 'MatMul', 'MatMul'] and \
children_types != ['MatMul', 'MatMul', 'MatMul', 'SkipLayerNormalization']:
if children_types != ["Add", "MatMul", "MatMul", "MatMul",] and children_types != [
"MatMul",
"MatMul",
"MatMul",
"SkipLayerNormalization",
]:
logger.debug("No Attention like subgraph in children of LayerNormalization")
return False
return True
@ -110,9 +128,13 @@ class FusionEmbedLayerNoMask(Fusion):
Gather
"""
# remove after tests pass
path1 = self.model.match_parent_path(position_embedding_gather, ['Expand', 'Shape'], [1, 1])
path1 = self.model.match_parent_path(position_embedding_gather, ["Expand", "Shape"], [1, 1])
if path1 is None:
path1 = self.model.match_parent_path(position_embedding_gather, ['Expand', 'Where', 'Reshape', 'Shape'], [1, 1, 2, 0])
path1 = self.model.match_parent_path(
position_embedding_gather,
["Expand", "Where", "Reshape", "Shape"],
[1, 1, 2, 0],
)
if path1 is None:
return False
@ -120,14 +142,21 @@ class FusionEmbedLayerNoMask(Fusion):
if shape.input[0] != input_ids:
return False
_, path2, _ = self.model.match_parent_paths(expand, [(['Unsqueeze', 'Range', 'Cast', 'Gather', 'Shape'], [0, 0, 1, 0, 0]), \
(['Unsqueeze', 'Range', 'Gather', 'Shape'], [0, 0, 1, 0])], output_name_to_node)
_, path2, _ = self.model.match_parent_paths(
expand,
[
(["Unsqueeze", "Range", "Cast", "Gather", "Shape"], [0, 0, 1, 0, 0]),
(["Unsqueeze", "Range", "Gather", "Shape"], [0, 0, 1, 0]),
],
output_name_to_node,
)
if path2 is None:
return False
range_node = path2[1]
if not (self.utils.check_node_input_value(range_node, 0, 0)
and self.utils.check_node_input_value(range_node, 2, 1)):
if not (
self.utils.check_node_input_value(range_node, 0, 0) and self.utils.check_node_input_value(range_node, 2, 1)
):
return False
gather_node = path2[-2]
@ -141,19 +170,19 @@ class FusionEmbedLayerNoMask(Fusion):
return True
def match_position_embedding_roberta(self, position_embedding_gather, input_ids, output_name_to_node):
""" Match position embedding path from input_ids to Gather for Roberta.
"""Match position embedding path from input_ids to Gather for Roberta.
Roberta Embedding Layer Pattern (* is optional since it might be removed by ORT, ? is the padding word id):
Roberta Embedding Layer Pattern (* is optional since it might be removed by ORT, ? is the padding word id):
(input_ids) --> Equal(B=?) -- Not -- Cast(to=6) -- CumSum(axis=1) -- Mul -- Cast(to=7) -- Add(B=1) -- Cast(to=7)* --> Gather
| ^
V |
+------------------------------+
+------------------------------+
Roberta new pattern from transformers v4.9:
(input_ids) --> Equal(B=?) -- Not -- Cast(to=6) -- CumSum(axis=1) -- Add(B=0) -- Mul -- Cast(to=7) -- Add(B=1) --> Gather
| ^
V |
+-------------------------------------------+
+-------------------------------------------+
start_node = position_embedding_gather
start_index = 1
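# A minimal numpy sketch of the position ids that the CumSum-based Roberta pattern above
# produces, assuming the usual padding_idx of 1 (illustrative only, not the fusion code):
import numpy as np

def roberta_position_ids(input_ids: np.ndarray, padding_idx: int = 1) -> np.ndarray:
    # Equal(B=padding_idx) -> Not -> Cast: 1 for real tokens, 0 for padding
    mask = (input_ids != padding_idx).astype(np.int64)
    # CumSum(axis=1) -> Mul(mask) -> Add(B=padding_idx): padded positions stay at padding_idx
    return np.cumsum(mask, axis=1) * mask + padding_idx

# e.g. roberta_position_ids(np.array([[0, 5, 7, 1, 1]])) -> [[2, 3, 4, 1, 1]]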
@ -209,22 +238,30 @@ class FusionEmbedLayerNoMask(Fusion):
|
LayerNormalization
"""
path = self.model.match_parent_path(position_embedding_gather, ['Slice', 'Unsqueeze'], [1, 2],
output_name_to_node)
path = self.model.match_parent_path(
position_embedding_gather,
["Slice", "Unsqueeze"],
[1, 2],
output_name_to_node,
)
if path is None:
return False
slice, unsqueeze = path
slice_weight = self.model.get_constant_value(slice.input[0])
if not (slice_weight is not None and len(slice_weight.shape) == 2 and slice_weight.shape[0] == 1 \
and self.utils.check_node_input_value(slice, 1, [0]) \
and self.utils.check_node_input_value(slice, 3, [1]) \
and (len(slice.input) == 4 or self.utils.check_node_input_value(slice, 4, [1]))):
if not (
slice_weight is not None
and len(slice_weight.shape) == 2
and slice_weight.shape[0] == 1
and self.utils.check_node_input_value(slice, 1, [0])
and self.utils.check_node_input_value(slice, 3, [1])
and (len(slice.input) == 4 or self.utils.check_node_input_value(slice, 4, [1]))
):
return False
opset_version = self.model.get_opset_version()
if opset_version < 13:
if not FusionUtils.check_node_attribute(unsqueeze, 'axes', [0]):
if not FusionUtils.check_node_attribute(unsqueeze, "axes", [0]):
return False
else:
if not self.utils.check_node_input_value(unsqueeze, 1, [0]):
@ -257,7 +294,7 @@ class FusionEmbedLayerNoMask(Fusion):
# TODO: Support roberta (position starts from 2 instead of 0) in EmbedLayerNormalization kernel
# related: https://github.com/huggingface/transformers/issues/10736
#if self.match_position_embedding_roberta(position_embedding_gather, input_ids, output_name_to_node):
# if self.match_position_embedding_roberta(position_embedding_gather, input_ids, output_name_to_node):
# return True
if self.match_position_embedding_distilbert(position_embedding_gather, input_ids, output_name_to_node):
@ -266,8 +303,7 @@ class FusionEmbedLayerNoMask(Fusion):
return False
def check_embedding(self, word_embedding_gather, segment_embedding_gather, position_embedding_gather):
"""Sanity check of embedding weights, and match hidden_size of weights and shape of inputs.
"""
"""Sanity check of embedding weights, and match hidden_size of weights and shape of inputs."""
input_ids = word_embedding_gather.input[1]
segment_ids = segment_embedding_gather.input[1] if segment_embedding_gather else None
position_ids = position_embedding_gather.input[1]
@ -276,17 +312,25 @@ class FusionEmbedLayerNoMask(Fusion):
input_ids_shape = self.shape_infer_helper.get_edge_shape(input_ids)
position_ids_shape = self.shape_infer_helper.get_edge_shape(position_ids)
assert input_ids_shape and position_ids_shape
if not (len(input_ids_shape) == 2 and len(position_ids_shape) == 2
and input_ids_shape[1] == position_ids_shape[1]):
if not (
len(input_ids_shape) == 2
and len(position_ids_shape) == 2
and input_ids_shape[1] == position_ids_shape[1]
):
logger.info(
"Cannot fuse EmbedLayerNormalization: input_ids and position_ids not matched in 2nd dimension: {} vs {}"
.format(input_ids_shape, position_ids_shape))
"Cannot fuse EmbedLayerNormalization: input_ids and position_ids not matched in 2nd dimension: {} vs {}".format(
input_ids_shape, position_ids_shape
)
)
return False
if segment_ids and not self.shape_infer_helper.compare_shape(input_ids, segment_ids):
logger.info(
"Cannot fuse EmbedLayerNormalization: input_ids and segment_ids does not have same shape: {} != {}".
format(input_ids_shape, self.shape_infer_helper.get_edge_shape(segment_ids)))
"Cannot fuse EmbedLayerNormalization: input_ids and segment_ids does not have same shape: {} != {}".format(
input_ids_shape,
self.shape_infer_helper.get_edge_shape(segment_ids),
)
)
return False
word_embedding_table = self.model.get_constant_value(word_embedding_gather.input[0])
@ -295,15 +339,21 @@ class FusionEmbedLayerNoMask(Fusion):
return False
position_embedding_table = self.model.get_constant_value(position_embedding_gather.input[0])
if position_embedding_table is None or len(position_embedding_table.shape) != 2 or (
word_embedding_table.shape[1] != position_embedding_table.shape[1]):
if (
position_embedding_table is None
or len(position_embedding_table.shape) != 2
or (word_embedding_table.shape[1] != position_embedding_table.shape[1])
):
logger.info("Cannot fuse EmbedLayerNormalization: position embedding table is not expected")
return False
if segment_ids:
segment_embedding_table = self.model.get_constant_value(segment_embedding_gather.input[0])
if segment_embedding_table is None or len(segment_embedding_table.shape) != 2 or (
word_embedding_table.shape[1] != segment_embedding_table.shape[1]):
if (
segment_embedding_table is None
or len(segment_embedding_table.shape) != 2
or (word_embedding_table.shape[1] != segment_embedding_table.shape[1])
):
logger.info("Cannot fuse EmbedLayerNormalization: segment embedding table is not expected")
return False
@ -350,9 +400,16 @@ class FusionEmbedLayerNoMask(Fusion):
return int32_output, input_cast_node
def create_fused_node(self, input_ids: str, layernorm: NodeProto, word_embedding_gather: NodeProto,
position_embedding_gather: NodeProto, segment_embedding_gather: Union[None, NodeProto],
position_ids: str = None, embedding_sum_output = False):
def create_fused_node(
self,
input_ids: str,
layernorm: NodeProto,
word_embedding_gather: NodeProto,
position_embedding_gather: NodeProto,
segment_embedding_gather: Union[None, NodeProto],
position_ids: str = None,
embedding_sum_output=False,
):
"""Create an EmbedLayerNormalization node. Note that segment embedding is optional.
Args:
@ -368,7 +425,7 @@ class FusionEmbedLayerNoMask(Fusion):
nodes_to_add = []
input_ids, _ = self.cast_to_int32(input_ids)
node_name = self.model.create_node_name('EmbedLayerNormalization')
node_name = self.model.create_node_name("EmbedLayerNormalization")
if layernorm.op_type == "LayerNormalization":
gamma = layernorm.input[1]
@ -382,17 +439,28 @@ class FusionEmbedLayerNoMask(Fusion):
segment_ids, _ = self.cast_to_int32(segment_embedding_gather.input[1])
embed_node_inputs = [
input_ids, segment_ids, word_embedding_gather.input[0], position_embedding_gather.input[0],
segment_embedding_gather.input[0], gamma, beta
input_ids,
segment_ids,
word_embedding_gather.input[0],
position_embedding_gather.input[0],
segment_embedding_gather.input[0],
gamma,
beta,
]
else: # no segment embedding
embed_node_inputs = [
input_ids, '', word_embedding_gather.input[0], position_embedding_gather.input[0], '', gamma, beta
input_ids,
"",
word_embedding_gather.input[0],
position_embedding_gather.input[0],
"",
gamma,
beta,
]
if position_ids is not None:
#Adding an empty input for mask before position_ids
embed_node_inputs.append('')
# Adding an empty input for mask before position_ids
embed_node_inputs.append("")
position_ids, _ = self.cast_to_int32(position_ids)
embed_node_inputs.append(position_ids)
@ -400,22 +468,24 @@ class FusionEmbedLayerNoMask(Fusion):
if embedding_sum_output:
embed_node_outputs.append(node_name + "_embedding_sum")
embed_node = helper.make_node('EmbedLayerNormalization',
embed_node_inputs,
outputs=embed_node_outputs,
name=node_name)
embed_node = helper.make_node(
"EmbedLayerNormalization",
embed_node_inputs,
outputs=embed_node_outputs,
name=node_name,
)
embed_node.domain = "com.microsoft"
# Pass attribute "epsilon" from normalize node to EmbedLayerNormalization.
for att in layernorm.attribute:
if att.name == 'epsilon':
if att.name == "epsilon":
embed_node.attribute.extend([att])
# Set default value to 1e-12 if no attribute is found.
# OnnxRuntime 1.2.0 or older has no epsilon attribute. The optimized model can only work for 1.3.0 or later.
if len(embed_node.attribute) == 0:
embed_node.attribute.extend([helper.make_attribute("epsilon", 1.0E-12)])
embed_node.attribute.extend([helper.make_attribute("epsilon", 1.0e-12)])
# Make sure new EmbedLayerNormalization node is the last one in self.nodes_to_add.
nodes_to_add.append(embed_node)
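# A rough numpy sketch of what the fused EmbedLayerNormalization node computes (segment
# embedding optional, the mask_index output ignored here; names are illustrative):
import numpy as np

def embed_layer_norm(input_ids, word_emb, pos_emb, gamma, beta, seg_ids=None, seg_emb=None, eps=1e-12):
    hidden = word_emb[input_ids] + pos_emb[np.arange(input_ids.shape[1])]
    if seg_emb is not None:
        hidden = hidden + seg_emb[seg_ids]
    mean = hidden.mean(axis=-1, keepdims=True)
    var = hidden.var(axis=-1, keepdims=True)
    return (hidden - mean) / np.sqrt(var + eps) * gamma + beta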
@ -446,7 +516,7 @@ class FusionEmbedLayerNoMask(Fusion):
return len(nodes) > 1
def fuse_gpt2(self, layernorm, add_before_layernorm, input_name_to_nodes, output_name_to_node):
#graph checks
# graph checks
# gpt2 has no segment embedding, subgraph pattern is like
# input_ids position_ids
# | |
@ -484,8 +554,15 @@ class FusionEmbedLayerNoMask(Fusion):
optional_embedding_sum_output = True
# make the fused node
embed_node = self.create_fused_node(input_ids, layernorm, word_embedding_gather, position_embedding_gather,
None, position_ids, optional_embedding_sum_output)
embed_node = self.create_fused_node(
input_ids,
layernorm,
word_embedding_gather,
position_embedding_gather,
None,
position_ids,
optional_embedding_sum_output,
)
# direct the output to another add too
self.model.replace_input_of_all_nodes(layernorm.output[0], embed_node.output[0])
@ -529,8 +606,9 @@ class FusionEmbedLayerNoMask(Fusion):
if not self.check_embedding(word_embedding_gather, None, position_embedding_gather):
return False
embed_node = self.create_fused_node(input_ids, layernorm, word_embedding_gather, position_embedding_gather,
None)
embed_node = self.create_fused_node(
input_ids, layernorm, word_embedding_gather, position_embedding_gather, None
)
self.finish_fusion(layernorm, embed_node)
return True
@ -543,7 +621,7 @@ class FusionEmbedLayerNoMask(Fusion):
output_name_to_node (Dict[str, List[NodeProto]]): map from output name to nodes
"""
add_2_gather = self.model.match_parent_path(add_before_layernorm, ['Add'], [0])
add_2_gather = self.model.match_parent_path(add_before_layernorm, ["Add"], [0])
if add_2_gather is None:
return False
@ -558,7 +636,7 @@ class FusionEmbedLayerNoMask(Fusion):
if not self.check_attention_subgraph(layernorm, input_name_to_nodes, is_distil_bert=False):
return False
position_embedding_path = self.model.match_parent_path(add_before_layernorm, ['Gather'], [1])
position_embedding_path = self.model.match_parent_path(add_before_layernorm, ["Gather"], [1])
if position_embedding_path is None:
return False
@ -574,14 +652,19 @@ class FusionEmbedLayerNoMask(Fusion):
if not self.check_embedding(word_embedding_gather, segment_embedding_gather, position_embedding_gather):
return False
embed_node = self.create_fused_node(input_ids, layernorm, word_embedding_gather, position_embedding_gather,
segment_embedding_gather)
embed_node = self.create_fused_node(
input_ids,
layernorm,
word_embedding_gather,
position_embedding_gather,
segment_embedding_gather,
)
self.finish_fusion(layernorm, embed_node)
return True
def fuse(self, node, input_name_to_nodes, output_name_to_node):
if node.op_type == "LayerNormalization":
first_add_path = self.model.match_parent_path(node, ['Add'], [0])
first_add_path = self.model.match_parent_path(node, ["Add"], [0])
if first_add_path is None:
return
add_before_layernorm = first_add_path[0]

View file

@ -1,12 +1,13 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
from typing import Dict, Optional
# --------------------------------------------------------------------------
from logging import getLogger
from typing import Dict, Optional
from fusion_base import Fusion
from onnx import helper
from onnx_model import OnnxModel
from fusion_base import Fusion
logger = getLogger(__name__)
@ -40,7 +41,7 @@ class FusionFastGelu(Fusion):
if tanh_node.output[0] not in input_name_to_nodes:
return
children = input_name_to_nodes[tanh_node.output[0]]
if len(children) != 1 or children[0].op_type != 'Add':
if len(children) != 1 or children[0].op_type != "Add":
return
add_after_tanh = children[0]
@ -50,11 +51,11 @@ class FusionFastGelu(Fusion):
if add_after_tanh.output[0] not in input_name_to_nodes:
return
children = input_name_to_nodes[add_after_tanh.output[0]]
if len(children) != 1 or children[0].op_type != 'Mul':
if len(children) != 1 or children[0].op_type != "Mul":
return
mul_after_tanh = children[0]
mul_half = self.model.match_parent(mul_after_tanh, 'Mul', None, output_name_to_node)
mul_half = self.model.match_parent(mul_after_tanh, "Mul", None, output_name_to_node)
if mul_half is None:
return
@ -64,10 +65,10 @@ class FusionFastGelu(Fusion):
root_input = mul_half.input[0 if i == 1 else 1]
#root_node could be None when root_input is graph input
# root_node could be None when root_input is graph input
root_node = self.model.get_parent(mul_half, 0 if i == 1 else 1, output_name_to_node)
mul_before_tanh = self.model.match_parent(tanh_node, 'Mul', 0, output_name_to_node)
mul_before_tanh = self.model.match_parent(tanh_node, "Mul", 0, output_name_to_node)
if mul_before_tanh is None:
return
@ -75,15 +76,17 @@ class FusionFastGelu(Fusion):
if i < 0:
return
add_before_tanh = self.model.match_parent(mul_before_tanh, 'Add', 0 if i == 1 else 1, output_name_to_node)
add_before_tanh = self.model.match_parent(mul_before_tanh, "Add", 0 if i == 1 else 1, output_name_to_node)
if add_before_tanh is None:
return
mul_after_pow = self.model.match_parent(add_before_tanh,
'Mul',
None,
output_name_to_node,
exclude=[root_node] if root_node else [])
mul_after_pow = self.model.match_parent(
add_before_tanh,
"Mul",
None,
output_name_to_node,
exclude=[root_node] if root_node else [],
)
if mul_after_pow is None:
return
@ -91,7 +94,7 @@ class FusionFastGelu(Fusion):
if i < 0:
return
pow = self.model.match_parent(mul_after_pow, 'Pow', 0 if i == 1 else 1, output_name_to_node)
pow = self.model.match_parent(mul_after_pow, "Pow", 0 if i == 1 else 1, output_name_to_node)
if pow is None:
return
@ -102,17 +105,30 @@ class FusionFastGelu(Fusion):
return
subgraph_nodes = [
mul_after_tanh, mul_half, add_after_tanh, tanh_node, mul_before_tanh, add_before_tanh, mul_after_pow, pow
mul_after_tanh,
mul_half,
add_after_tanh,
tanh_node,
mul_before_tanh,
add_before_tanh,
mul_after_pow,
pow,
]
if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [mul_after_tanh.output[0]], input_name_to_nodes,
output_name_to_node):
if not self.model.is_safe_to_fuse_nodes(
subgraph_nodes,
[mul_after_tanh.output[0]],
input_name_to_nodes,
output_name_to_node,
):
return
self.nodes_to_remove.extend(subgraph_nodes)
fused_node = helper.make_node('FastGelu',
inputs=[root_input],
outputs=mul_after_tanh.output,
name=self.model.create_node_name('FastGelu'))
fused_node = helper.make_node(
"FastGelu",
inputs=[root_input],
outputs=mul_after_tanh.output,
name=self.model.create_node_name("FastGelu"),
)
fused_node.domain = "com.microsoft"
self.nodes_to_add.append(fused_node)
self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
@ -134,7 +150,7 @@ class FusionFastGelu(Fusion):
if tanh_node.output[0] not in input_name_to_nodes:
return
children = input_name_to_nodes[tanh_node.output[0]]
if len(children) != 1 or children[0].op_type != 'Add':
if len(children) != 1 or children[0].op_type != "Add":
return
add_after_tanh = children[0]
@ -144,7 +160,7 @@ class FusionFastGelu(Fusion):
if add_after_tanh.output[0] not in input_name_to_nodes:
return
children = input_name_to_nodes[add_after_tanh.output[0]]
if len(children) != 1 or children[0].op_type != 'Mul':
if len(children) != 1 or children[0].op_type != "Mul":
return
mul_half = children[0]
@ -155,17 +171,19 @@ class FusionFastGelu(Fusion):
if mul_half.output[0] not in input_name_to_nodes:
return
children = input_name_to_nodes[mul_half.output[0]]
if len(children) != 1 or children[0].op_type != 'Mul':
if len(children) != 1 or children[0].op_type != "Mul":
return
mul_after_mul_half = children[0]
root_node = self.model.get_parent(mul_after_mul_half,
0 if mul_after_mul_half.input[1] == mul_half.output[0] else 1,
output_name_to_node)
root_node = self.model.get_parent(
mul_after_mul_half,
0 if mul_after_mul_half.input[1] == mul_half.output[0] else 1,
output_name_to_node,
)
if root_node is None:
return
mul_before_tanh = self.model.match_parent(tanh_node, 'Mul', 0, output_name_to_node)
mul_before_tanh = self.model.match_parent(tanh_node, "Mul", 0, output_name_to_node)
if mul_before_tanh is None:
return
@ -173,11 +191,11 @@ class FusionFastGelu(Fusion):
if i < 0:
return
add_before_tanh = self.model.match_parent(mul_before_tanh, 'Add', 0 if i == 1 else 1, output_name_to_node)
add_before_tanh = self.model.match_parent(mul_before_tanh, "Add", 0 if i == 1 else 1, output_name_to_node)
if add_before_tanh is None:
return
mul_after_pow = self.model.match_parent(add_before_tanh, 'Mul', None, output_name_to_node, exclude=[root_node])
mul_after_pow = self.model.match_parent(add_before_tanh, "Mul", None, output_name_to_node, exclude=[root_node])
if mul_after_pow is None:
return
@ -185,7 +203,7 @@ class FusionFastGelu(Fusion):
if i < 0:
return
pow = self.model.match_parent(mul_after_pow, 'Pow', 0 if i == 1 else 1, output_name_to_node)
pow = self.model.match_parent(mul_after_pow, "Pow", 0 if i == 1 else 1, output_name_to_node)
if pow is None:
return
@ -196,18 +214,30 @@ class FusionFastGelu(Fusion):
return
subgraph_nodes = [
mul_after_mul_half, mul_half, add_after_tanh, tanh_node, mul_before_tanh, add_before_tanh, mul_after_pow,
pow
mul_after_mul_half,
mul_half,
add_after_tanh,
tanh_node,
mul_before_tanh,
add_before_tanh,
mul_after_pow,
pow,
]
if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [mul_after_mul_half.output[0]], input_name_to_nodes,
output_name_to_node):
if not self.model.is_safe_to_fuse_nodes(
subgraph_nodes,
[mul_after_mul_half.output[0]],
input_name_to_nodes,
output_name_to_node,
):
return
self.nodes_to_remove.extend(subgraph_nodes)
fused_node = helper.make_node('FastGelu',
inputs=[root_node.output[0]],
outputs=mul_after_mul_half.output,
name=self.model.create_node_name('FastGelu'))
fused_node = helper.make_node(
"FastGelu",
inputs=[root_node.output[0]],
outputs=mul_after_mul_half.output,
name=self.model.create_node_name("FastGelu"),
)
fused_node.domain = "com.microsoft"
self.nodes_to_add.append(fused_node)
self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
@ -215,25 +245,25 @@ class FusionFastGelu(Fusion):
def fuse_3(self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict) -> Optional[bool]:
"""
OpenAI's gelu implementation, also used in Megatron:
Gelu(x) = x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1.0 + 0.044715 * x * x)))
OpenAI's gelu implementation, also used in Megatron:
Gelu(x) = x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1.0 + 0.044715 * x * x)))
Fuse subgraph into a FastGelu node:
+------------ Mul (B=0.79788456) -------------------+
| |
+-------------------------------+ |
| | |
| v v
[root] --> Mul (B=0.044715) --> Mul --> Add(B=1) --> Mul --> Tanh --> Add(B=1) --> Mul-->
| ^
| |
+-----------> Mul (B=0.5) --------------------------------------------------------+
"""
Fuse subgraph into a FastGelu node:
+------------ Mul (B=0.79788456) -------------------+
| |
+-------------------------------+ |
| | |
| v v
[root] --> Mul (B=0.044715) --> Mul --> Add(B=1) --> Mul --> Tanh --> Add(B=1) --> Mul-->
| ^
| |
+-----------> Mul (B=0.5) --------------------------------------------------------+
"""
if tanh_node.output[0] not in input_name_to_nodes:
return
children = input_name_to_nodes[tanh_node.output[0]]
if len(children) != 1 or children[0].op_type != 'Add':
if len(children) != 1 or children[0].op_type != "Add":
return
add_after_tanh = children[0]
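# A one-line numpy sketch of the tanh approximation from the docstring above, which is
# the function the fused FastGelu node is expected to evaluate:
import numpy as np

def fast_gelu(x: np.ndarray) -> np.ndarray:
    return x * 0.5 * (1.0 + np.tanh(0.79788456 * x * (1.0 + 0.044715 * x * x)))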
@ -243,11 +273,11 @@ class FusionFastGelu(Fusion):
if add_after_tanh.output[0] not in input_name_to_nodes:
return
children = input_name_to_nodes[add_after_tanh.output[0]]
if len(children) != 1 or children[0].op_type != 'Mul':
if len(children) != 1 or children[0].op_type != "Mul":
return
mul_last = children[0]
mul_half = self.model.match_parent(mul_last, 'Mul', None, output_name_to_node)
mul_half = self.model.match_parent(mul_last, "Mul", None, output_name_to_node)
if mul_half is None:
return
@ -257,18 +287,18 @@ class FusionFastGelu(Fusion):
root_input = mul_half.input[0 if i == 1 else 1]
mul_before_tanh = self.model.match_parent(tanh_node, 'Mul', 0, output_name_to_node)
mul_before_tanh = self.model.match_parent(tanh_node, "Mul", 0, output_name_to_node)
if mul_before_tanh is None:
return
add_1 = self.model.match_parent(mul_before_tanh, 'Add', None, output_name_to_node)
add_1 = self.model.match_parent(mul_before_tanh, "Add", None, output_name_to_node)
if add_1 is None:
return
j = self.model.find_constant_input(add_1, 1.0)
if j < 0:
return
mul_7978 = self.model.match_parent(mul_before_tanh, 'Mul', None, output_name_to_node)
mul_7978 = self.model.match_parent(mul_before_tanh, "Mul", None, output_name_to_node)
if mul_7978 is None:
return
k = self.model.find_constant_input(mul_7978, 0.7978, delta=0.0001)
@ -277,7 +307,7 @@ class FusionFastGelu(Fusion):
if mul_7978.input[0 if k == 1 else 1] != root_input:
return
mul_before_add_1 = self.model.match_parent(add_1, 'Mul', 0 if j == 1 else 1, output_name_to_node)
mul_before_add_1 = self.model.match_parent(add_1, "Mul", 0 if j == 1 else 1, output_name_to_node)
if mul_before_add_1 is None:
return
@ -288,7 +318,7 @@ class FusionFastGelu(Fusion):
else:
return
mul_0447 = self.model.match_parent(mul_before_add_1, 'Mul', another, output_name_to_node)
mul_0447 = self.model.match_parent(mul_before_add_1, "Mul", another, output_name_to_node)
if mul_0447 is None:
return
m = self.model.find_constant_input(mul_0447, 0.0447, delta=0.0001)
@ -299,17 +329,31 @@ class FusionFastGelu(Fusion):
return
subgraph_nodes = [
mul_0447, mul_before_add_1, add_1, mul_before_tanh, tanh_node, add_after_tanh, mul_7978, mul_half, mul_last
mul_0447,
mul_before_add_1,
add_1,
mul_before_tanh,
tanh_node,
add_after_tanh,
mul_7978,
mul_half,
mul_last,
]
if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [mul_last.output[0]], input_name_to_nodes,
output_name_to_node):
if not self.model.is_safe_to_fuse_nodes(
subgraph_nodes,
[mul_last.output[0]],
input_name_to_nodes,
output_name_to_node,
):
return
self.nodes_to_remove.extend(subgraph_nodes)
fused_node = helper.make_node('FastGelu',
inputs=[root_input],
outputs=mul_last.output,
name=self.model.create_node_name('FastGelu'))
fused_node = helper.make_node(
"FastGelu",
inputs=[root_input],
outputs=mul_last.output,
name=self.model.create_node_name("FastGelu"),
)
fused_node.domain = "com.microsoft"
self.nodes_to_add.append(fused_node)
self.node_name_to_graph_name[fused_node.name] = self.this_graph_name

View file

@ -1,12 +1,13 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
from typing import Dict, Optional
# --------------------------------------------------------------------------
from logging import getLogger
from typing import Dict, Optional
from fusion_base import Fusion
from onnx import helper
from onnx_model import OnnxModel
from fusion_base import Fusion
logger = getLogger(__name__)
@ -45,7 +46,7 @@ class FusionGelu(Fusion):
if erf_node.output[0] not in input_name_to_nodes:
return
children = input_name_to_nodes[erf_node.output[0]]
if len(children) != 1 or children[0].op_type != 'Add':
if len(children) != 1 or children[0].op_type != "Add":
return
add_after_erf = children[0]
@ -55,11 +56,11 @@ class FusionGelu(Fusion):
if add_after_erf.output[0] not in input_name_to_nodes:
return
children = input_name_to_nodes[add_after_erf.output[0]]
if len(children) != 1 or children[0].op_type != 'Mul':
if len(children) != 1 or children[0].op_type != "Mul":
return
mul_after_erf = children[0]
div = self.model.match_parent(erf_node, 'Div', 0, output_name_to_node)
div = self.model.match_parent(erf_node, "Div", 0, output_name_to_node)
if div is None:
return
@ -71,14 +72,14 @@ class FusionGelu(Fusion):
another = 1 if mul_after_erf.input[0] == add_after_erf.output[0] else 0
if subgraph_input == mul_after_erf.input[another]: # pattern 2
children = input_name_to_nodes[mul_after_erf.output[0]]
if len(children) != 1 or children[0].op_type != 'Mul':
if len(children) != 1 or children[0].op_type != "Mul":
return
mul_half = children[0]
if not self.model.has_constant_input(mul_half, 0.5):
return
subgraph_output = mul_half.output[0]
else: # pattern 1
mul_half = self.model.match_parent(mul_after_erf, 'Mul', another, output_name_to_node)
mul_half = self.model.match_parent(mul_after_erf, "Mul", another, output_name_to_node)
if mul_half is None:
return
@ -91,12 +92,13 @@ class FusionGelu(Fusion):
subgraph_output = mul_after_erf.output[0]
subgraph_nodes = [div, erf_node, add_after_erf, mul_after_erf, mul_half]
if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [subgraph_output], input_name_to_nodes,
output_name_to_node):
if not self.model.is_safe_to_fuse_nodes(
subgraph_nodes, [subgraph_output], input_name_to_nodes, output_name_to_node
):
return
self.nodes_to_remove.extend(subgraph_nodes)
fused_node = helper.make_node('Gelu', inputs=[subgraph_input], outputs=[subgraph_output])
fused_node = helper.make_node("Gelu", inputs=[subgraph_input], outputs=[subgraph_output])
fused_node.domain = "com.microsoft"
self.nodes_to_add.append(fused_node)
self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
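# For reference, a scalar sketch of the exact erf-based Gelu that the matched
# Div/Erf/Add(1)/Mul/Mul(0.5) subgraph implements (the division is by sqrt(2) ~ 1.4142):
from math import erf, sqrt

def gelu(x: float) -> float:
    return 0.5 * x * (1.0 + erf(x / sqrt(2.0)))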
@ -117,7 +119,7 @@ class FusionGelu(Fusion):
if erf_node.output[0] not in input_name_to_nodes:
return
children = input_name_to_nodes[erf_node.output[0]]
if len(children) != 1 or children[0].op_type != 'Add':
if len(children) != 1 or children[0].op_type != "Add":
return
add_after_erf = children[0]
@ -127,7 +129,7 @@ class FusionGelu(Fusion):
if add_after_erf.output[0] not in input_name_to_nodes:
return
children = input_name_to_nodes[add_after_erf.output[0]]
if len(children) != 1 or children[0].op_type != 'Mul':
if len(children) != 1 or children[0].op_type != "Mul":
return
mul_after_erf = children[0]
@ -137,17 +139,17 @@ class FusionGelu(Fusion):
if mul_after_erf.output[0] not in input_name_to_nodes:
return
children = input_name_to_nodes[mul_after_erf.output[0]]
if len(children) != 1 or children[0].op_type != 'Mul':
if len(children) != 1 or children[0].op_type != "Mul":
return
mul = children[0]
div = self.model.match_parent(erf_node, 'Div', 0, output_name_to_node)
div = self.model.match_parent(erf_node, "Div", 0, output_name_to_node)
if div is None:
return
sqrt_node = None
if self.model.find_constant_input(div, 1.4142, delta=0.001) != 1:
sqrt_node = self.model.match_parent(div, 'Sqrt', 1, output_name_to_node)
sqrt_node = self.model.match_parent(div, "Sqrt", 1, output_name_to_node)
if sqrt_node is None:
return
if not self.model.has_constant_input(sqrt_node, 2.0):
@ -164,12 +166,13 @@ class FusionGelu(Fusion):
if sqrt_node:
subgraph_nodes.append(sqrt_node)
if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [mul.output[0]], input_name_to_nodes,
output_name_to_node):
if not self.model.is_safe_to_fuse_nodes(
subgraph_nodes, [mul.output[0]], input_name_to_nodes, output_name_to_node
):
return
self.nodes_to_remove.extend(subgraph_nodes)
fused_node = helper.make_node('Gelu', inputs=[root_node.output[0]], outputs=[mul.output[0]])
fused_node = helper.make_node("Gelu", inputs=[root_node.output[0]], outputs=[mul.output[0]])
fused_node.domain = "com.microsoft"
self.nodes_to_add.append(fused_node)
self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
@ -191,7 +194,7 @@ class FusionGelu(Fusion):
if erf_node.output[0] not in input_name_to_nodes:
return
children = input_name_to_nodes[erf_node.output[0]]
if len(children) != 1 or children[0].op_type != 'Add':
if len(children) != 1 or children[0].op_type != "Add":
return
add_after_erf = children[0]
@ -201,14 +204,14 @@ class FusionGelu(Fusion):
if add_after_erf.output[0] not in input_name_to_nodes:
return
children = input_name_to_nodes[add_after_erf.output[0]]
if len(children) != 1 or children[0].op_type != 'Mul':
if len(children) != 1 or children[0].op_type != "Mul":
return
mul_half = children[0]
if not self.model.has_constant_input(mul_half, 0.5):
return
first_mul = self.model.match_parent(erf_node, 'Mul', 0, output_name_to_node)
first_mul = self.model.match_parent(erf_node, "Mul", 0, output_name_to_node)
if first_mul is None:
return
@ -223,7 +226,7 @@ class FusionGelu(Fusion):
if mul_half.output[0] not in input_name_to_nodes:
return
children = input_name_to_nodes[mul_half.output[0]]
if len(children) != 1 or children[0].op_type != 'Mul':
if len(children) != 1 or children[0].op_type != "Mul":
return
last_mul = children[0]
@ -231,12 +234,16 @@ class FusionGelu(Fusion):
return
subgraph_nodes = [first_mul, erf_node, add_after_erf, mul_half, last_mul]
if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [last_mul.output[0]], input_name_to_nodes,
output_name_to_node):
if not self.model.is_safe_to_fuse_nodes(
subgraph_nodes,
[last_mul.output[0]],
input_name_to_nodes,
output_name_to_node,
):
return
self.nodes_to_remove.extend(subgraph_nodes)
fused_node = helper.make_node('Gelu', inputs=[root_node.output[0]], outputs=[last_mul.output[0]])
fused_node = helper.make_node("Gelu", inputs=[root_node.output[0]], outputs=[last_mul.output[0]])
fused_node.domain = "com.microsoft"
self.nodes_to_add.append(fused_node)
self.node_name_to_graph_name[fused_node.name] = self.this_graph_name

View file

@ -1,23 +1,26 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
# --------------------------------------------------------------------------
from logging import getLogger
from fusion_base import Fusion
from onnx import helper
from onnx_model import OnnxModel
from fusion_base import Fusion
class FusionGeluApproximation(Fusion):
def __init__(self, model: OnnxModel):
super().__init__(model, 'FastGelu', ['Gelu', 'BiasGelu'], 'GeluApproximation')
super().__init__(model, "FastGelu", ["Gelu", "BiasGelu"], "GeluApproximation")
def fuse(self, node, input_name_to_nodes, output_name_to_node):
new_node = helper.make_node("FastGelu",
inputs=node.input,
outputs=node.output,
name=self.model.create_node_name("FastGelu", node.op_type + "_Approximation"))
new_node = helper.make_node(
"FastGelu",
inputs=node.input,
outputs=node.output,
name=self.model.create_node_name("FastGelu", node.op_type + "_Approximation"),
)
new_node.domain = "com.microsoft"
self.nodes_to_remove.append(node)
self.nodes_to_add.append(new_node)

View file

@ -1,20 +1,21 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
import numpy as np
# --------------------------------------------------------------------------
from logging import getLogger
from onnx import helper, numpy_helper, TensorProto
from onnx_model import OnnxModel
import numpy as np
from fusion_base import Fusion
from fusion_utils import FusionUtils
from onnx import TensorProto, helper, numpy_helper
from onnx_model import OnnxModel
logger = getLogger(__name__)
class FusionGptAttentionPastBase(Fusion):
"""Base class for GPT Attention Fusion with past state
"""
"""Base class for GPT Attention Fusion with past state"""
def __init__(self, model: OnnxModel, num_heads: int):
super().__init__(model, "Attention", "LayerNormalization", "with past")
self.num_heads = num_heads
@ -41,7 +42,7 @@ class FusionGptAttentionPastBase(Fusion):
# |
# {present}
gather = self.model.get_parent(concat_v, 0, output_name_to_node)
if gather.op_type != 'Gather':
if gather.op_type != "Gather":
logger.debug("match_past_pattern_1: expect Gather for past")
return None
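# A loose sketch of the past/present layout these patterns look for; the shapes are an
# assumption (past stacking the cached key and value as (2, batch, heads, seq, head_size)):
import numpy as np

def append_past(past: np.ndarray, new_k: np.ndarray, new_v: np.ndarray) -> np.ndarray:
    past_k, past_v = past[0], past[1]             # the Gather(indices=0/1) branches above
    k = np.concatenate([past_k, new_k], axis=-2)  # Concat along the sequence axis
    v = np.concatenate([past_v, new_v], axis=-2)
    return np.stack([k, v])                       # Unsqueeze + Concat -> present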
@ -51,10 +52,10 @@ class FusionGptAttentionPastBase(Fusion):
past = gather.input[0]
parent = self.model.get_parent(concat_k, 0, output_name_to_node)
if parent.op_type == 'Gather':
if parent.op_type == "Gather":
gather_past_k = parent
else:
past_k_nodes = self.model.match_parent_path(concat_k, ['Transpose', 'Gather'], [0, 0])
past_k_nodes = self.model.match_parent_path(concat_k, ["Transpose", "Gather"], [0, 0])
if past_k_nodes is None:
logger.debug("match_past_pattern_1: failed match Transpose and Gather")
return None
@ -93,7 +94,7 @@ class FusionGptAttentionPastBase(Fusion):
# {present}
#
squeeze = self.model.get_parent(concat_v, 0, output_name_to_node)
if squeeze.op_type != 'Squeeze':
if squeeze.op_type != "Squeeze":
logger.debug("match_past_pattern_2: expect Squeeze as parent of concat_v")
return None
@ -104,11 +105,11 @@ class FusionGptAttentionPastBase(Fusion):
opset_version = self.model.get_opset_version()
if opset_version < 13:
if not FusionUtils.check_node_attribute(squeeze, 'axes', [0]):
if not FusionUtils.check_node_attribute(squeeze, "axes", [0]):
logger.debug("match_past_pattern_2: axes != [0] for Squeeze in past path")
return None
if not FusionUtils.check_node_attribute(split, 'split', [1, 1]):
if not FusionUtils.check_node_attribute(split, "split", [1, 1]):
logger.debug("match_past_pattern_2: split != [1, 1] for Split in past path")
return None
else:
@ -120,12 +121,12 @@ class FusionGptAttentionPastBase(Fusion):
logger.debug("match_past_pattern_2: split != [1, 1] for Split in past path")
return None
if not FusionUtils.check_node_attribute(split, 'axis', 0, default_value=0):
if not FusionUtils.check_node_attribute(split, "axis", 0, default_value=0):
logger.debug("match_past_pattern_2: attribute axis of Split are not expected in past path")
return None
past = split.input[0]
past_k_nodes = self.model.match_parent_path(concat_k, ['Squeeze', 'Split'], [0, 0])
past_k_nodes = self.model.match_parent_path(concat_k, ["Squeeze", "Split"], [0, 0])
if past_k_nodes is None:
logger.debug("match_past_pattern_2: failed to match past_k_nodes path")
return None
@ -138,17 +139,15 @@ class FusionGptAttentionPastBase(Fusion):
return past
def match_present(self, concat_v, input_name_to_nodes):
unsqueeze_present_v = self.model.find_first_child_by_type(concat_v,
'Unsqueeze',
input_name_to_nodes,
recursive=False)
unsqueeze_present_v = self.model.find_first_child_by_type(
concat_v, "Unsqueeze", input_name_to_nodes, recursive=False
)
if not unsqueeze_present_v:
logger.info("expect unsqueeze for present")
return None
concat_present = self.model.find_first_child_by_type(unsqueeze_present_v,
'Concat',
input_name_to_nodes,
recursive=False)
concat_present = self.model.find_first_child_by_type(
unsqueeze_present_v, "Concat", input_name_to_nodes, recursive=False
)
if not concat_present:
logger.info("expect concat for present")
return None
@ -172,31 +171,50 @@ class FusionGptAttention(FusionGptAttentionPastBase):
"""
Fuse GPT-2 Attention with past state subgraph into one Attention node.
"""
def __init__(self, model: OnnxModel, num_heads: int):
super().__init__(model, num_heads)
def create_attention_node(self, fc_weight, fc_bias, gemm_qkv, past, present, input, output, mask,
is_unidirectional):
attention_node_name = self.model.create_node_name('GptAttention')
attention_node = helper.make_node('Attention',
inputs=[input, fc_weight, fc_bias, mask, past],
outputs=[attention_node_name + "_output", present],
name=attention_node_name)
def create_attention_node(
self,
fc_weight,
fc_bias,
gemm_qkv,
past,
present,
input,
output,
mask,
is_unidirectional,
):
attention_node_name = self.model.create_node_name("GptAttention")
attention_node = helper.make_node(
"Attention",
inputs=[input, fc_weight, fc_bias, mask, past],
outputs=[attention_node_name + "_output", present],
name=attention_node_name,
)
attention_node.domain = "com.microsoft"
attention_node.attribute.extend([
helper.make_attribute("num_heads", self.num_heads),
helper.make_attribute("unidirectional", 1 if is_unidirectional else 0)
])
attention_node.attribute.extend(
[
helper.make_attribute("num_heads", self.num_heads),
helper.make_attribute("unidirectional", 1 if is_unidirectional else 0),
]
)
matmul_node = helper.make_node('MatMul',
inputs=[attention_node_name + "_output", gemm_qkv.input[1]],
outputs=[attention_node_name + "_matmul_output"],
name=attention_node_name + "_matmul")
matmul_node = helper.make_node(
"MatMul",
inputs=[attention_node_name + "_output", gemm_qkv.input[1]],
outputs=[attention_node_name + "_matmul_output"],
name=attention_node_name + "_matmul",
)
add_node = helper.make_node('Add',
inputs=[attention_node_name + "_matmul_output", gemm_qkv.input[2]],
outputs=[output],
name=attention_node_name + "_add")
add_node = helper.make_node(
"Add",
inputs=[attention_node_name + "_matmul_output", gemm_qkv.input[2]],
outputs=[output],
name=attention_node_name + "_add",
)
self.nodes_to_add.extend([attention_node, matmul_node, add_node])
self.node_name_to_graph_name[attention_node.name] = self.this_graph_name
self.node_name_to_graph_name[matmul_node.name] = self.this_graph_name
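# A loose, self-contained numpy sketch of what the three nodes created above stand in for:
# the Attention contrib op absorbs the fused QKV projection plus softmax attention, and the
# trailing MatMul + Add keep the original output projection. Shapes, the causal mask, and
# the weight names are illustrative assumptions, not taken from this file.
import numpy as np

def fused_gpt2_attention_block(x, w_qkv, b_qkv, w_proj, b_proj, num_heads):
    b, s, h = x.shape
    qkv = x @ w_qkv + b_qkv                                         # folded into the Attention op
    q, k, v = np.split(qkv, 3, axis=-1)

    def split_heads(t):
        return t.reshape(b, s, num_heads, h // num_heads).transpose(0, 2, 1, 3)

    q, k, v = split_heads(q), split_heads(k), split_heads(v)
    scores = q @ k.transpose(0, 1, 3, 2) / np.sqrt(h // num_heads)
    scores = np.where(np.tril(np.ones((s, s))) == 1, scores, -1e9)  # unidirectional mask
    probs = np.exp(scores - scores.max(axis=-1, keepdims=True))
    probs = probs / probs.sum(axis=-1, keepdims=True)
    attn = (probs @ v).transpose(0, 2, 1, 3).reshape(b, s, h)
    return attn @ w_proj + b_proj                                   # the MatMul + Add nodes above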
@ -208,28 +226,44 @@ class FusionGptAttention(FusionGptAttentionPastBase):
return_indice = []
qkv_nodes = self.model.match_parent_path(
normalize_node,
['Add', 'Reshape', 'Gemm', 'Reshape', 'Reshape', 'Transpose', 'MatMul'],
[0, None, 0, 0, 0, 0, 0],
["Add", "Reshape", "Gemm", "Reshape", "Reshape", "Transpose", "MatMul"],
[0, None, 0, 0, 0, 0, 0],
output_name_to_node=output_name_to_node,
return_indice=return_indice
) # yapf: disable
return_indice=return_indice,
) # yapf: disable
if qkv_nodes is None:
return
(add_qkv, reshape_qkv, gemm_qkv, reshape_1, reshape_2, transpose_qkv, matmul_qkv) = qkv_nodes
(
add_qkv,
reshape_qkv,
gemm_qkv,
reshape_1,
reshape_2,
transpose_qkv,
matmul_qkv,
) = qkv_nodes
another_input = add_qkv.input[1 - return_indice[0]]
v_nodes = self.model.match_parent_path(matmul_qkv, ['Concat', 'Transpose', 'Reshape', 'Split'], [1, 1, 0, 0])
v_nodes = self.model.match_parent_path(matmul_qkv, ["Concat", "Transpose", "Reshape", "Split"], [1, 1, 0, 0])
if v_nodes is None:
logger.debug("fuse_attention: failed to match v path")
return
(concat_v, transpose_v, reshape_v, split_fc) = v_nodes
fc_nodes = self.model.match_parent_path(split_fc, ['Reshape', 'Gemm', 'Reshape', 'LayerNormalization'],
[0, 0, 0, 0], output_name_to_node)
fc_nodes = self.model.match_parent_path(
split_fc,
["Reshape", "Gemm", "Reshape", "LayerNormalization"],
[0, 0, 0, 0],
output_name_to_node,
)
if fc_nodes is None:
fc_nodes = self.model.match_parent_path(split_fc, ['Add', 'MatMul', 'LayerNormalization'], [0, None, 0],
output_name_to_node)
fc_nodes = self.model.match_parent_path(
split_fc,
["Add", "MatMul", "LayerNormalization"],
[0, None, 0],
output_name_to_node,
)
if fc_nodes is None:
logger.debug("fuse_attention: failed to match fc path")
return
@ -250,13 +284,25 @@ class FusionGptAttention(FusionGptAttentionPastBase):
slice_mask = None
input_mask_nodes = None
concat_k_to_match = None
qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Sub', 'Mul', 'Div', 'MatMul'], [0, 0, 0, 0, 0])
qk_nodes = self.model.match_parent_path(matmul_qkv, ["Softmax", "Sub", "Mul", "Div", "MatMul"], [0, 0, 0, 0, 0])
if qk_nodes is not None:
(softmax_qk, sub_qk, mul_qk, div_qk, matmul_qk) = qk_nodes
mask_nodes = self.model.match_parent_path(
sub_qk,
['Mul', 'Sub', 'Slice', 'Slice', 'Unsqueeze', 'Sub', 'Squeeze', 'Slice', 'Shape', 'Div'],
[1, 0, 1, 0, 1, 0, 0, 0, 0, 0]) # yapf: disable
[
"Mul",
"Sub",
"Slice",
"Slice",
"Unsqueeze",
"Sub",
"Squeeze",
"Slice",
"Shape",
"Div",
],
[1, 0, 1, 0, 1, 0, 0, 0, 0, 0],
) # yapf: disable
if mask_nodes is None:
logger.debug("fuse_attention: failed to match unidirectional mask path")
return
@ -269,8 +315,13 @@ class FusionGptAttention(FusionGptAttentionPastBase):
else:
# New pattern for gpt2 from PyTorch 1.5.0 and Transformers 2.9.0.
i, qk_nodes, _ = self.model.match_parent_paths(
matmul_qkv, [(['Softmax', 'Where', 'Div', 'MatMul'], [0, 0, 1, 0]),
(['Softmax', 'Add', 'Where', 'Div', 'MatMul'], [0, 0, None, 1, 0])], output_name_to_node)
matmul_qkv,
[
(["Softmax", "Where", "Div", "MatMul"], [0, 0, 1, 0]),
(["Softmax", "Add", "Where", "Div", "MatMul"], [0, 0, None, 1, 0]),
],
output_name_to_node,
)
if qk_nodes is None:
logger.debug("fuse_attention: failed to match qk nodes")
return
@ -284,20 +335,40 @@ class FusionGptAttention(FusionGptAttentionPastBase):
_, input_mask_nodes, _ = self.model.match_parent_paths(
add_qk,
[
(['Mul', 'Sub', 'Cast', 'Unsqueeze', 'Unsqueeze', 'Reshape'], [None, 0, 1, 0, 0, 0]),
(['Mul', 'Sub', 'Unsqueeze', 'Unsqueeze', 'Reshape'], [None, 0, 1, 0, 0]),
(['Mul', 'Sub', 'Unsqueeze', 'Unsqueeze'], [None, 0, 1, 0]), # useless cast and reshape are removed.
(
["Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze", "Reshape"],
[None, 0, 1, 0, 0, 0],
),
(
["Mul", "Sub", "Unsqueeze", "Unsqueeze", "Reshape"],
[None, 0, 1, 0, 0],
),
(
["Mul", "Sub", "Unsqueeze", "Unsqueeze"],
[None, 0, 1, 0],
), # useless cast and reshape are removed.
],
output_name_to_node) # yapf: disable
output_name_to_node,
) # yapf: disable
if input_mask_nodes is None:
logger.debug("fuse_attention: failed to match input attention mask path")
return
mask_nodes = self.model.match_parent_path(
where_qk,
['Cast', 'Slice', 'Slice', 'Unsqueeze', 'Sub', 'Squeeze', 'Slice', 'Shape'],
[ 0, 0, 0, 1, 0, 0, 0, 0],
output_name_to_node) # yapf: disable
[
"Cast",
"Slice",
"Slice",
"Unsqueeze",
"Sub",
"Squeeze",
"Slice",
"Shape",
],
[0, 0, 0, 1, 0, 0, 0, 0],
output_name_to_node,
) # yapf: disable
if mask_nodes is None:
# TODO: match mask path for GPT2LMHeadModel_BeamSearchStep.
logger.debug("fuse_attention: failed to match mask path")
@ -318,8 +389,9 @@ class FusionGptAttention(FusionGptAttentionPastBase):
# Validate that the mask data is either lower triangular (unidirectional) or all ones
mask_data = numpy_helper.to_array(self.model.get_initializer(slice_mask.input[0]))
if not (len(mask_data.shape) == 4 and mask_data.shape[:2] == (1, 1)
and mask_data.shape[2] == mask_data.shape[3]):
if not (
len(mask_data.shape) == 4 and mask_data.shape[:2] == (1, 1) and mask_data.shape[2] == mask_data.shape[3]
):
logger.debug("fuse_attention: skip since mask shape is not 1x1xWxW")
return
if np.allclose(mask_data, np.ones_like(mask_data)):
@ -328,7 +400,7 @@ class FusionGptAttention(FusionGptAttentionPastBase):
logger.debug("fuse_attention: skip since mask is neither lower triangular nor ones")
return
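# A small illustration of the mask validation above: the constant baked into the GPT-2
# graph is expected to be either all ones or the usual causal (lower-triangular) buffer;
# the 1x1x8x8 shape below is illustrative.
import numpy as np

mask_data = np.tril(np.ones((1, 1, 8, 8), dtype=np.float32))
assert not np.allclose(mask_data, np.ones_like(mask_data))         # not the "all ones" case
assert np.allclose(mask_data, np.tril(np.ones_like(mask_data)))    # lower triangular -> unidirectional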
q_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Reshape', 'Split'], [0, 0, 0])
q_nodes = self.model.match_parent_path(matmul_qk, ["Transpose", "Reshape", "Split"], [0, 0, 0])
if q_nodes is None:
logger.debug("fuse_attention: failed to match q path")
return
@ -337,11 +409,14 @@ class FusionGptAttention(FusionGptAttentionPastBase):
logger.debug("fuse_attention: skip since split_fc != split_q")
return
k_nodes = self.model.match_parent_path(matmul_qk, ['Concat', 'Transpose', 'Reshape', 'Split'], [1, 1, 0, 0])
k_nodes = self.model.match_parent_path(matmul_qk, ["Concat", "Transpose", "Reshape", "Split"], [1, 1, 0, 0])
if k_nodes is None:
# This pattern is from pytorch 1.7.1 and transformers 4.6.1
k_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Concat', 'Transpose', 'Reshape', 'Split'],
[1, 0, 1, 0, 0])
k_nodes = self.model.match_parent_path(
matmul_qk,
["Transpose", "Concat", "Transpose", "Reshape", "Split"],
[1, 0, 1, 0, 0],
)
if k_nodes is None:
logger.debug("fuse_attention: failed to match k path")
return
@ -357,14 +432,15 @@ class FusionGptAttention(FusionGptAttentionPastBase):
logger.debug("fuse_attention: skip since concat_k != concat_k_to_match")
return
attention_mask_input_name = ''
attention_mask_input_name = ""
if input_mask_nodes is not None:
input_name = input_mask_nodes[-1].input[0]
attention_mask_input_name = self.cast_attention_mask(input_name)
# Match past and present paths
past = self.match_past_pattern_1(concat_k, concat_v, output_name_to_node) or \
self.match_past_pattern_2(concat_k, concat_v, output_name_to_node)
past = self.match_past_pattern_1(concat_k, concat_v, output_name_to_node) or self.match_past_pattern_2(
concat_k, concat_v, output_name_to_node
)
if past is None:
logger.info("fuse_attention: failed to match past path")
return
@ -380,8 +456,17 @@ class FusionGptAttention(FusionGptAttentionPastBase):
logger.info("expect present to be graph output")
return
self.create_attention_node(fc_weight, fc_bias, gemm_qkv, past, present, layernorm_before_attention.output[0],
reshape_qkv.output[0], attention_mask_input_name, is_unidirectional)
self.create_attention_node(
fc_weight,
fc_bias,
gemm_qkv,
past,
present,
layernorm_before_attention.output[0],
reshape_qkv.output[0],
attention_mask_input_name,
is_unidirectional,
)
# we rely on prune_graph() to clean old subgraph nodes:
# qk_nodes + q_nodes + k_nodes + v_nodes + mask_nodes + [reshape_qkv, transpose_qkv, matmul_qkv]

View file

@ -1,14 +1,15 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
import numpy as np
# --------------------------------------------------------------------------
from logging import getLogger
from onnx import helper, numpy_helper, TensorProto
from onnx_model import OnnxModel
import numpy as np
from fusion_base import Fusion
from fusion_utils import FusionUtils
from fusion_gpt_attention import FusionGptAttentionPastBase
from fusion_utils import FusionUtils
from onnx import TensorProto, helper, numpy_helper
from onnx_model import OnnxModel
logger = getLogger(__name__)
@ -21,24 +22,43 @@ class FusionGptAttentionMegatron(FusionGptAttentionPastBase):
"""
Fuse GPT-2 Attention with past state subgraph from Megatron into one Attention node.
"""
def __init__(self, model: OnnxModel, num_heads: int):
super().__init__(model, num_heads)
def fuse_attention_node(self, matmul_before_split, add_before_split, past, present, input, reshape_qkv, mask):
attention_node_name = self.model.create_node_name('GptAttention')
def fuse_attention_node(
self,
matmul_before_split,
add_before_split,
past,
present,
input,
reshape_qkv,
mask,
):
attention_node_name = self.model.create_node_name("GptAttention")
int32_mask = self.cast_attention_mask(mask)
output = reshape_qkv.output[0]
i = 1 if (add_before_split.input[0] == matmul_before_split.output[0]) else 0
attention_node = helper.make_node(
'Attention',
inputs=[input, matmul_before_split.input[1], add_before_split.input[i], int32_mask, past],
"Attention",
inputs=[
input,
matmul_before_split.input[1],
add_before_split.input[i],
int32_mask,
past,
],
outputs=[output, present],
name=attention_node_name)
name=attention_node_name,
)
attention_node.domain = "com.microsoft"
attention_node.attribute.extend([
helper.make_attribute("num_heads", self.num_heads),
helper.make_attribute("unidirectional", 0) # unidirectional shall not be ON for 4D attention mask
])
attention_node.attribute.extend(
[
helper.make_attribute("num_heads", self.num_heads),
helper.make_attribute("unidirectional", 0), # unidirectional shall not be ON for 4D attention mask
]
)
nodes_to_add = [attention_node]
self.nodes_to_add.extend(nodes_to_add)
@ -53,9 +73,8 @@ class FusionGptAttentionMegatron(FusionGptAttentionPastBase):
def match_mask(self, sub_qk, mul_qk, matmul_qk, layernorm_before_attention):
mask_nodes = self.model.match_parent_path(
sub_qk,
['Mul', 'Sub', 'Slice', 'Slice'],
[1, 0, 1, 0]) # yapf: disable
sub_qk, ["Mul", "Sub", "Slice", "Slice"], [1, 0, 1, 0]
) # yapf: disable
if mask_nodes is None:
logger.debug("fuse_attention: failed to match unidirectional mask path")
return None
@ -97,27 +116,34 @@ class FusionGptAttentionMegatron(FusionGptAttentionPastBase):
logger.debug("fuse_attention failed: slice_mask input 4 (steps) is not constant [1]")
return None
last_slice_path = self.model.match_parent_path(last_slice_mask, ['Unsqueeze', 'Gather', 'Shape', 'MatMul'],
[2, 0, 0, 0])
last_slice_path = self.model.match_parent_path(
last_slice_mask, ["Unsqueeze", "Gather", "Shape", "MatMul"], [2, 0, 0, 0]
)
if last_slice_path is None or last_slice_path[-1] != matmul_qk:
logger.debug("fuse_attention: failed to match last slice path")
return None
first_slice_path = self.model.match_parent_path(slice_mask, ['Unsqueeze', 'Gather', 'Shape', 'MatMul'],
[2, 0, 0, 0])
first_slice_path = self.model.match_parent_path(
slice_mask, ["Unsqueeze", "Gather", "Shape", "MatMul"], [2, 0, 0, 0]
)
if first_slice_path is None or first_slice_path[-1] != matmul_qk:
logger.debug("fuse_attention: failed to match first slice path")
return None
first_slice_sub = self.model.match_parent_path(slice_mask, ['Unsqueeze', 'Sub', 'Gather', 'Shape', 'MatMul'],
[1, 0, 0, 0, 0])
first_slice_sub = self.model.match_parent_path(
slice_mask,
["Unsqueeze", "Sub", "Gather", "Shape", "MatMul"],
[1, 0, 0, 0, 0],
)
if first_slice_sub is None or first_slice_sub[-1] != matmul_qk:
logger.debug("fuse_attention: failed to match last slice sub path")
return None
first_slice_sub_1 = self.model.match_parent_path(slice_mask,
['Unsqueeze', 'Sub', 'Gather', 'Shape', 'LayerNormalization'],
[1, 0, 1, 0, 0])
first_slice_sub_1 = self.model.match_parent_path(
slice_mask,
["Unsqueeze", "Sub", "Gather", "Shape", "LayerNormalization"],
[1, 0, 1, 0, 0],
)
if first_slice_sub_1 is None or first_slice_sub_1[-1] != layernorm_before_attention:
logger.debug("fuse_attention: failed to match last slice sub path 1")
return None
@ -130,30 +156,53 @@ class FusionGptAttentionMegatron(FusionGptAttentionPastBase):
qkv_nodes = self.model.match_parent_path(
normalize_node,
['Add', 'Add', 'MatMul', 'Reshape', 'Transpose', 'MatMul'],
[ 0, 1, None, 0, 0, 0],
["Add", "Add", "MatMul", "Reshape", "Transpose", "MatMul"],
[0, 1, None, 0, 0, 0],
output_name_to_node=output_name_to_node,
) # yapf: disable
) # yapf: disable
if qkv_nodes is None:
return
(add_skip, add_after_attention, matmul_after_attention, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes
(
add_skip,
add_after_attention,
matmul_after_attention,
reshape_qkv,
transpose_qkv,
matmul_qkv,
) = qkv_nodes
skip_input = add_skip.input[0]
v_nodes = self.model.match_parent_path(
matmul_qkv,
['Concat', 'Transpose', 'Reshape', 'Split', 'Add', 'MatMul', 'LayerNormalization'],
[1, 1, 0, 0, 0, None, 0]) # yapf: disable
[
"Concat",
"Transpose",
"Reshape",
"Split",
"Add",
"MatMul",
"LayerNormalization",
],
[1, 1, 0, 0, 0, None, 0],
) # yapf: disable
if v_nodes is None:
logger.debug("fuse_attention: failed to match v path")
return
(concat_v, transpose_v, reshape_v, split_v, add_before_split, matmul_before_split,
layernorm_before_attention) = v_nodes
(
concat_v,
transpose_v,
reshape_v,
split_v,
add_before_split,
matmul_before_split,
layernorm_before_attention,
) = v_nodes
if skip_input != layernorm_before_attention.input[0]:
logger.debug("fuse_attention: skip_input != layernorm_before_attention.input[0]")
return
qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Sub', 'Mul', 'MatMul'], [0, 0, 0, 0])
qk_nodes = self.model.match_parent_path(matmul_qkv, ["Softmax", "Sub", "Mul", "MatMul"], [0, 0, 0, 0])
if qk_nodes is None:
logger.debug("fuse_attention: failed to match qk path")
return None
@ -164,7 +213,7 @@ class FusionGptAttentionMegatron(FusionGptAttentionPastBase):
attention_mask = self.match_mask(sub_qk, mul_qk, matmul_qk, layernorm_before_attention)
q_nodes = self.model.match_parent_path(matmul_qk, ['Div', 'Transpose', 'Reshape', 'Split'], [0, 0, 0, 0])
q_nodes = self.model.match_parent_path(matmul_qk, ["Div", "Transpose", "Reshape", "Split"], [0, 0, 0, 0])
if q_nodes is None:
logger.debug("fuse_attention: failed to match q path")
return
@ -173,9 +222,11 @@ class FusionGptAttentionMegatron(FusionGptAttentionPastBase):
logger.debug("fuse_attention: skip since split_v != split_q")
return
k_nodes = self.model.match_parent_path(matmul_qk,
['Div', 'Transpose', 'Concat', 'Transpose', 'Reshape', 'Split'],
[1, 0, 0, 1, 0, 0])
k_nodes = self.model.match_parent_path(
matmul_qk,
["Div", "Transpose", "Concat", "Transpose", "Reshape", "Split"],
[1, 0, 0, 1, 0, 0],
)
if k_nodes is None:
logger.debug("fuse_attention: failed to match k path")
return
@ -185,8 +236,14 @@ class FusionGptAttentionMegatron(FusionGptAttentionPastBase):
return
i, value = self.model.get_constant_input(reshape_k)
if not (isinstance(value, np.ndarray) and list(value.shape) == [4] and value[0] == 0 and value[1] == 0
and value[2] > 0 and value[3] > 0):
if not (
isinstance(value, np.ndarray)
and list(value.shape) == [4]
and value[0] == 0
and value[1] == 0
and value[2] > 0
and value[3] > 0
):
logger.debug("fuse_attention: reshape constant input is not [0, 0, N, H]")
return
@ -224,5 +281,12 @@ class FusionGptAttentionMegatron(FusionGptAttentionPastBase):
logger.info("fuse_attention: expect present to be graph output")
return
self.fuse_attention_node(matmul_before_split, add_before_split, past, present,
layernorm_before_attention.output[0], reshape_qkv, attention_mask)
self.fuse_attention_node(
matmul_before_split,
add_before_split,
past,
present,
layernorm_before_attention.output[0],
reshape_qkv,
attention_mask,
)

View file

@ -1,13 +1,14 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
import numpy as np
# --------------------------------------------------------------------------
from logging import getLogger
from onnx import helper, numpy_helper, TensorProto
from onnx_model import OnnxModel
import numpy as np
from fusion_base import Fusion
from fusion_utils import FusionUtils
from onnx import TensorProto, helper, numpy_helper
from onnx_model import OnnxModel
logger = getLogger(__name__)
@ -17,31 +18,41 @@ class FusionGptAttentionNoPast(Fusion):
Fuse GPT-2 Attention without past state into one Attention node.
This does not support attention_mask graph input right now.
"""
def __init__(self, model: OnnxModel, num_heads: int):
super().__init__(model, "Attention", "LayerNormalization", "without past")
# TODO: detect num_heads from graph like FusionAttention
self.num_heads = num_heads
def create_attention_node(self, gemm, gemm_qkv, input, output):
attention_node_name = self.model.create_node_name('Attention')
attention_node = helper.make_node('Attention',
inputs=[input, gemm.input[1], gemm.input[2]],
outputs=[attention_node_name + "_output"],
name=attention_node_name)
attention_node_name = self.model.create_node_name("Attention")
attention_node = helper.make_node(
"Attention",
inputs=[input, gemm.input[1], gemm.input[2]],
outputs=[attention_node_name + "_output"],
name=attention_node_name,
)
attention_node.domain = "com.microsoft"
attention_node.attribute.extend(
[helper.make_attribute("num_heads", self.num_heads),
helper.make_attribute("unidirectional", 1)])
[
helper.make_attribute("num_heads", self.num_heads),
helper.make_attribute("unidirectional", 1),
]
)
matmul_node = helper.make_node('MatMul',
inputs=[attention_node_name + "_output", gemm_qkv.input[1]],
outputs=[attention_node_name + "_matmul_output"],
name=attention_node_name + "_matmul")
matmul_node = helper.make_node(
"MatMul",
inputs=[attention_node_name + "_output", gemm_qkv.input[1]],
outputs=[attention_node_name + "_matmul_output"],
name=attention_node_name + "_matmul",
)
add_node = helper.make_node('Add',
inputs=[attention_node_name + "_matmul_output", gemm_qkv.input[2]],
outputs=[output],
name=attention_node_name + "_add")
add_node = helper.make_node(
"Add",
inputs=[attention_node_name + "_matmul_output", gemm_qkv.input[2]],
outputs=[output],
name=attention_node_name + "_add",
)
self.nodes_to_add.extend([attention_node, matmul_node, add_node])
self.node_name_to_graph_name[attention_node.name] = self.this_graph_name
@ -52,29 +63,45 @@ class FusionGptAttentionNoPast(Fusion):
return_indice = []
qkv_nodes = self.model.match_parent_path(
normalize_node,
['Add', 'Reshape', 'Gemm', 'Reshape', 'Reshape', 'Transpose', 'MatMul'],
["Add", "Reshape", "Gemm", "Reshape", "Reshape", "Transpose", "MatMul"],
[0, None, 0, 0, 0, 0, 0],
output_name_to_node=output_name_to_node,
return_indice=return_indice
) # yapf: disable
return_indice=return_indice,
) # yapf: disable
if qkv_nodes is None:
return
(add_qkv, reshape_qkv, gemm_qkv, reshape_1, reshape_2, transpose_qkv, matmul_qkv) = qkv_nodes
(
add_qkv,
reshape_qkv,
gemm_qkv,
reshape_1,
reshape_2,
transpose_qkv,
matmul_qkv,
) = qkv_nodes
another_input = add_qkv.input[1 - return_indice[0]]
v_nodes = self.model.match_parent_path(
matmul_qkv,
['Transpose', 'Reshape', 'Split', 'Reshape', 'Gemm', 'Reshape'],
[1, 0, 0, 0, 0, 0]) # yapf: disable
["Transpose", "Reshape", "Split", "Reshape", "Gemm", "Reshape"],
[1, 0, 0, 0, 0, 0],
) # yapf: disable
if v_nodes is None:
logger.debug("fuse_attention: failed to match v path")
return
(transpose_v, reshape_v, split_v, reshape_after_gemm, gemm, reshape_before_gemm) = v_nodes
(
transpose_v,
reshape_v,
split_v,
reshape_after_gemm,
gemm,
reshape_before_gemm,
) = v_nodes
layernorm_before_attention = self.model.get_parent(reshape_before_gemm, 0, output_name_to_node)
if layernorm_before_attention is None or layernorm_before_attention.op_type != 'LayerNormalization':
if layernorm_before_attention.op_type != 'Add':
if layernorm_before_attention is None or layernorm_before_attention.op_type != "LayerNormalization":
if layernorm_before_attention.op_type != "Add":
logger.debug(f"failed to get layernorm before gemm. Got {layernorm_before_attention.op_type}")
return
@ -84,13 +111,25 @@ class FusionGptAttentionNoPast(Fusion):
logger.debug("Add and LayerNormalization shall have one same input")
return
qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Sub', 'Mul', 'Div', 'MatMul'], [0, 0, 0, 0, 0])
qk_nodes = self.model.match_parent_path(matmul_qkv, ["Softmax", "Sub", "Mul", "Div", "MatMul"], [0, 0, 0, 0, 0])
if qk_nodes is not None:
(softmax_qk, sub_qk, mul_qk, div_qk, matmul_qk) = qk_nodes
mask_nodes = self.model.match_parent_path(
sub_qk,
['Mul', 'Sub', 'Slice', 'Slice', 'Unsqueeze', 'Sub', 'Squeeze', 'Slice', 'Shape', 'Div'],
[1, 0, 1, 0, 1, 0, 0, 0, 0, 0]) # yapf: disable
[
"Mul",
"Sub",
"Slice",
"Slice",
"Unsqueeze",
"Sub",
"Squeeze",
"Slice",
"Shape",
"Div",
],
[1, 0, 1, 0, 1, 0, 0, 0, 0, 0],
) # yapf: disable
if mask_nodes is None:
logger.debug("fuse_attention: failed to match mask path")
return
@ -101,13 +140,24 @@ class FusionGptAttentionNoPast(Fusion):
return
else:
# New pattern for gpt2 from PyTorch 1.5.0 and Transformers 2.9.0.
qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Where', 'Div', 'MatMul'], [0, 0, 1, 0])
qk_nodes = self.model.match_parent_path(matmul_qkv, ["Softmax", "Where", "Div", "MatMul"], [0, 0, 1, 0])
if qk_nodes is not None:
(softmax_qk, where_qk, div_qk, matmul_qk) = qk_nodes
mask_nodes = self.model.match_parent_path(
where_qk,
['Cast', 'Slice', 'Slice', 'Unsqueeze', 'Sub', 'Squeeze', 'Slice', 'Shape', 'Div'],
[ 0, 0, 0, 1, 0, 0, 0, 0, 0]) # yapf: disable
[
"Cast",
"Slice",
"Slice",
"Unsqueeze",
"Sub",
"Squeeze",
"Slice",
"Shape",
"Div",
],
[0, 0, 0, 1, 0, 0, 0, 0, 0],
) # yapf: disable
if mask_nodes is None:
logger.debug("fuse_attention: failed to match mask path")
return
@ -118,16 +168,20 @@ class FusionGptAttentionNoPast(Fusion):
return
else:
# match openai-gpt
qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Add', 'Mul', 'Div', 'MatMul'],
[0, 0, 0, 0, 0])
qk_nodes = self.model.match_parent_path(
matmul_qkv,
["Softmax", "Add", "Mul", "Div", "MatMul"],
[0, 0, 0, 0, 0],
)
if qk_nodes is None:
logger.debug("fuse_attention: failed to match qk path")
return
(softmax_qk, add_qk, mul_qk, div_qk, matmul_qk) = qk_nodes
mask_nodes = self.model.match_parent_path(
mul_qk,
['Slice', 'Slice', 'Unsqueeze', 'Squeeze', 'Slice', 'Shape', 'Div'],
[ 1, 0, 2, 0, 0, 0, 0]) # yapf: disable
["Slice", "Slice", "Unsqueeze", "Squeeze", "Slice", "Shape", "Div"],
[1, 0, 2, 0, 0, 0, 0],
) # yapf: disable
if mask_nodes is None:
logger.debug("fuse_attention: failed to match mask path")
return
@ -137,7 +191,7 @@ class FusionGptAttentionNoPast(Fusion):
logger.debug("fuse_attention: skip since div_qk != div_mask")
return
q_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Reshape', 'Split'], [0, 0, 0])
q_nodes = self.model.match_parent_path(matmul_qk, ["Transpose", "Reshape", "Split"], [0, 0, 0])
if q_nodes is None:
logger.debug("fuse_attention: failed to match q path")
return
@ -146,7 +200,7 @@ class FusionGptAttentionNoPast(Fusion):
logger.debug("fuse_attention: skip since split_v != split_q")
return
k_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Reshape', 'Split'], [1, 0, 0])
k_nodes = self.model.match_parent_path(matmul_qk, ["Transpose", "Reshape", "Split"], [1, 0, 0])
if k_nodes is None:
logger.debug("fuse_attention: failed to match k path")
return
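
The matching in this file leans on OnnxModel.match_parent_path, which walks upward from a node, following the given input index at each step and requiring the producer to have the given op type (None meaning any input position may match). A simplified re-implementation of the idea, for illustration only; the real helper in onnx_model.py also supports return_indice, multiple candidate paths and other options.

def match_parent_path(node, op_types, input_indices, output_name_to_node):
    """Return the chain of parent nodes matching op_types, or None if any step fails.

    output_name_to_node maps a tensor name to the node that produces it.
    """
    matched = []
    current = node
    for op_type, input_index in zip(op_types, input_indices):
        parent = None
        if input_index is None:
            # None lets any input position match, mirroring the call sites above.
            for name in current.input:
                candidate = output_name_to_node.get(name)
                if candidate is not None and candidate.op_type == op_type:
                    parent = candidate
                    break
        elif input_index < len(current.input):
            candidate = output_name_to_node.get(current.input[input_index])
            if candidate is not None and candidate.op_type == op_type:
                parent = candidate
        if parent is None:
            return None
        matched.append(parent)
        current = parent
    return matched

# For example, match_parent_path(matmul_qk, ["Transpose", "Reshape", "Split"],
# [0, 0, 0], output_name_to_node) returns the q chain matched above, or None on mismatch.
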

View file

@ -1,12 +1,13 @@
#-------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
from typing import Dict
# --------------------------------------------------------------------------
from logging import getLogger
from typing import Dict
from fusion_base import Fusion
from onnx import helper
from onnx_model import OnnxModel
from fusion_base import Fusion
logger = getLogger(__name__)
@ -43,24 +44,32 @@ class FusionLayerNormalization(Fusion):
root_input = node.input[0]
if children[0].op_type != 'Sub' or children[0].input[0] != root_input:
if children[0].op_type != "Sub" or children[0].input[0] != root_input:
return
if len(children) == 2:
if children[1].op_type != 'Sub' or children[1].input[0] != root_input:
if children[1].op_type != "Sub" or children[1].input[0] != root_input:
return
div_node = None
for child in children:
div_node = self.model.find_first_child_by_type(child, 'Div', input_name_to_nodes, recursive=False)
div_node = self.model.find_first_child_by_type(child, "Div", input_name_to_nodes, recursive=False)
if div_node is not None:
break
if div_node is None:
return
path_id, parent_nodes, _ = self.model.match_parent_paths(
div_node, [(['Sqrt', 'Add', 'ReduceMean', 'Pow', 'Sub'], [1, 0, 0, 0, 0]),
(['Sqrt', 'Add', 'ReduceMean', 'Pow', 'Cast', 'Sub'], [1, 0, 0, 0, 0, 0])], output_name_to_node)
div_node,
[
(["Sqrt", "Add", "ReduceMean", "Pow", "Sub"], [1, 0, 0, 0, 0]),
(
["Sqrt", "Add", "ReduceMean", "Pow", "Cast", "Sub"],
[1, 0, 0, 0, 0, 0],
),
],
output_name_to_node,
)
if path_id < 0:
return
@ -70,7 +79,7 @@ class FusionLayerNormalization(Fusion):
second_add_node = parent_nodes[1]
i, add_weight = self.model.get_constant_input(second_add_node)
if add_weight is None or add_weight <= 0 or add_weight > 1.0E-4:
if add_weight is None or add_weight <= 0 or add_weight > 1.0e-4:
logger.warning(f"epsilon value is not expeced: {add_weight}")
return
@ -79,11 +88,11 @@ class FusionLayerNormalization(Fusion):
return
mul_node = input_name_to_nodes[div_node.output[0]][0]
if mul_node.op_type != 'Mul':
if mul_node.op_type != "Mul":
return
last_add_node = input_name_to_nodes[mul_node.output[0]][0]
if last_add_node.op_type != 'Add':
if last_add_node.op_type != "Add":
return
subgraph_nodes = [node]
@ -91,8 +100,12 @@ class FusionLayerNormalization(Fusion):
subgraph_nodes.extend(parent_nodes[:-1])
subgraph_nodes.extend([last_add_node, mul_node, div_node])
if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, last_add_node.output, input_name_to_nodes,
output_name_to_node):
if not self.model.is_safe_to_fuse_nodes(
subgraph_nodes,
last_add_node.output,
input_name_to_nodes,
output_name_to_node,
):
logger.debug(f"It is not safe to fuse LayerNormalization node. Skip")
return
@ -106,11 +119,12 @@ class FusionLayerNormalization(Fusion):
self.nodes_to_remove.extend(subgraph_nodes)
normalize_node = helper.make_node('LayerNormalization',
inputs=[node.input[0], weight_input, bias_input],
outputs=[last_add_node.output[0]],
name=self.model.create_node_name("LayerNormalization",
name_prefix="LayerNorm"))
normalize_node = helper.make_node(
"LayerNormalization",
inputs=[node.input[0], weight_input, bias_input],
outputs=[last_add_node.output[0]],
name=self.model.create_node_name("LayerNormalization", name_prefix="LayerNorm"),
)
normalize_node.attribute.extend([helper.make_attribute("epsilon", float(add_weight))])
self.nodes_to_add.append(normalize_node)
self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name
@ -122,28 +136,58 @@ class FusionLayerNormalizationTF(Fusion):
def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
"""
Layer Norm from Tensorflow model(using keras2onnx or tf2onnx):
+------------------------------------+
| |
| |
(Cast_1) |
| |
| v (B) (B) (A)
Add --> (Cast_1) --> ReduceMean --> Sub --> Mul --> ReduceMean --> (Cast_3) --> Add --> Sqrt --> Reciprocol --> Mul --> Mul --> Sub --> Add
| | | ^ ^
| | | | |
| +--------------------------------------------------(Cast_2)-------------------------------|-------+ |
| v |
+---------------------------------------------------------------------------------------------------------------> Mul--------------------+
Layer Norm from Tensorflow model(using keras2onnx or tf2onnx):
+------------------------------------+
| |
| |
(Cast_1) |
| |
| v (B) (B) (A)
Add --> (Cast_1) --> ReduceMean --> Sub --> Mul --> ReduceMean --> (Cast_3) --> Add --> Sqrt --> Reciprocol --> Mul --> Mul --> Sub --> Add
| | | ^ ^
| | | | |
| +--------------------------------------------------(Cast_2)-------------------------------|-------+ |
| v |
+---------------------------------------------------------------------------------------------------------------> Mul--------------------+
"""
return_indice = []
_, parent_nodes, return_indice = self.model.match_parent_paths(
node,
[(['Sub', 'Mul', 'Mul', 'Reciprocal', 'Sqrt', 'Add', 'ReduceMean', 'Mul', 'Sub', 'ReduceMean'],
[ 1, 1, None, 0, 0, 0, None, 0, 0, None]),
(['Sub', 'Mul', 'Mul', 'Reciprocal', 'Sqrt', 'Add', 'Cast', 'ReduceMean', 'Mul', 'Sub', 'ReduceMean'],
[ 1, 1, None, 0, 0, 0, 0, None, 0, 0, None])],
output_name_to_node) # yapf: disable
[
(
[
"Sub",
"Mul",
"Mul",
"Reciprocal",
"Sqrt",
"Add",
"ReduceMean",
"Mul",
"Sub",
"ReduceMean",
],
[1, 1, None, 0, 0, 0, None, 0, 0, None],
),
(
[
"Sub",
"Mul",
"Mul",
"Reciprocal",
"Sqrt",
"Add",
"Cast",
"ReduceMean",
"Mul",
"Sub",
"ReduceMean",
],
[1, 1, None, 0, 0, 0, 0, None, 0, 0, None],
),
],
output_name_to_node,
) # yapf: disable
if parent_nodes is None:
return
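
The docstring diagram above is layer normalization spelled out as elementary ops; the single fused LayerNormalization node computes the same thing directly. A minimal numpy reference of that computation, where epsilon, weight and bias correspond to the constant on add_node_0, mul_node_1.input[1] and sub_node_0.input[0] identified further down in this fusion (shapes below are illustrative):

import numpy as np

def layer_norm_reference(x, weight, bias, epsilon=1e-5):
    # Normalize over the last axis to zero mean / unit variance, then scale and shift.
    mean = x.mean(axis=-1, keepdims=True)
    variance = ((x - mean) ** 2).mean(axis=-1, keepdims=True)
    return (x - mean) / np.sqrt(variance + epsilon) * weight + bias

# The TF export reaches the same result via Reciprocal(Sqrt(variance + epsilon)) followed by
# Mul/Sub, which is why the whole subgraph can be collapsed into one node.
x = np.random.randn(2, 8, 16).astype(np.float32)  # (batch, sequence, hidden)
y = layer_norm_reference(x, np.ones(16, dtype=np.float32), np.zeros(16, dtype=np.float32))
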
@ -153,38 +197,50 @@ class FusionLayerNormalizationTF(Fusion):
logger.debug("return indice is exepected in [0, 1], but got {return_indice}")
return
sub_node_0, mul_node_0, mul_node_1, reciprocol_node, sqrt_node, add_node_0 = parent_nodes[:6]
(
sub_node_0,
mul_node_0,
mul_node_1,
reciprocol_node,
sqrt_node,
add_node_0,
) = parent_nodes[:6]
reduce_mean_node_0, mul_node_2, sub_node_1, reduce_mean_node_1 = parent_nodes[-4:]
cast_node_3 = None
if len(parent_nodes) == 11:
cast_node_3 = parent_nodes[6]
assert (cast_node_3.op_type == 'Cast')
assert cast_node_3.op_type == "Cast"
mul_node_3 = self.model.match_parent(node, 'Mul', 0, output_name_to_node)
mul_node_3 = self.model.match_parent(node, "Mul", 0, output_name_to_node)
if mul_node_3 is None:
logger.debug("mul_node_3 not found")
return
node_before_reduce = self.model.get_parent(reduce_mean_node_1, 0, output_name_to_node)
root_node = node_before_reduce if cast_node_3 is None else self.model.get_parent(
node_before_reduce, 0, output_name_to_node)
root_node = (
node_before_reduce
if cast_node_3 is None
else self.model.get_parent(node_before_reduce, 0, output_name_to_node)
)
if root_node is None:
logger.debug("root node is none")
return
i, epsilon = self.model.get_constant_input(add_node_0)
if epsilon is None or epsilon <= 0 or (epsilon > 1.0E-5 and cast_node_3 is None):
if epsilon is None or epsilon <= 0 or (epsilon > 1.0e-5 and cast_node_3 is None):
logger.debug("epsilon is not matched")
return
if cast_node_3 is None and (reduce_mean_node_1.input[0] not in mul_node_3.input
or reduce_mean_node_1.input[0] not in sub_node_1.input):
if cast_node_3 is None and (
reduce_mean_node_1.input[0] not in mul_node_3.input or reduce_mean_node_1.input[0] not in sub_node_1.input
):
logger.debug("reduce_mean_node_1 and mul_node_3 shall link from root node")
return
if cast_node_3 is not None and (node_before_reduce.input[0] not in mul_node_3.input
or reduce_mean_node_1.input[0] not in sub_node_1.input):
if cast_node_3 is not None and (
node_before_reduce.input[0] not in mul_node_3.input or reduce_mean_node_1.input[0] not in sub_node_1.input
):
logger.debug("reduce_mean_node_1 and mul_node_3 shall link from root node")
return
@ -193,19 +249,33 @@ class FusionLayerNormalizationTF(Fusion):
return
subgraph_nodes = [
node, sub_node_0, mul_node_0, mul_node_1, reciprocol_node, sqrt_node, add_node_0, reduce_mean_node_0,
mul_node_2, sub_node_1, reduce_mean_node_1, mul_node_3
node,
sub_node_0,
mul_node_0,
mul_node_1,
reciprocol_node,
sqrt_node,
add_node_0,
reduce_mean_node_0,
mul_node_2,
sub_node_1,
reduce_mean_node_1,
mul_node_3,
]
if cast_node_3 is not None:
cast_node_2 = self.model.match_parent(mul_node_0, 'Cast', 0, output_name_to_node)
cast_node_2 = self.model.match_parent(mul_node_0, "Cast", 0, output_name_to_node)
if cast_node_2 is None:
logger.debug("cast_node_2 not found")
return
subgraph_nodes.extend([node_before_reduce, cast_node_2, cast_node_3])
if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, node.output, self.model.input_name_to_nodes(),
self.model.output_name_to_node()):
if not self.model.is_safe_to_fuse_nodes(
subgraph_nodes,
node.output,
self.model.input_name_to_nodes(),
self.model.output_name_to_node(),
):
logger.debug("not safe to fuse layer normalization")
return
@ -214,11 +284,13 @@ class FusionLayerNormalizationTF(Fusion):
weight_input = mul_node_1.input[1]
bias_input = sub_node_0.input[0]
#TODO: add epsilon attribute
fused_node = helper.make_node('LayerNormalization',
inputs=[mul_node_3.input[0], weight_input, bias_input],
outputs=[node.output[0]],
name=self.model.create_node_name("LayerNormalization", name_prefix="LayerNorm"))
# TODO: add epsilon attribute
fused_node = helper.make_node(
"LayerNormalization",
inputs=[mul_node_3.input[0], weight_input, bias_input],
outputs=[node.output[0]],
name=self.model.create_node_name("LayerNormalization", name_prefix="LayerNorm"),
)
fused_node.attribute.extend([helper.make_attribute("epsilon", float(epsilon))])
self.nodes_to_add.append(fused_node)
self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
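
After a rewrite like this it is worth confirming numerically that the fused model still matches the original graph. A minimal check using onnxruntime's Python API; the model paths, input feed and tolerances are placeholders, not part of this change.

import numpy as np
import onnxruntime as ort

def outputs_match(original_path, fused_path, feed, rtol=1e-3, atol=1e-3):
    # Run both graphs on identical inputs and compare every output tensor.
    original = ort.InferenceSession(original_path).run(None, feed)
    fused = ort.InferenceSession(fused_path).run(None, feed)
    return all(np.allclose(a, b, rtol=rtol, atol=atol) for a, b in zip(original, fused))
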

Some files were not shown because too many files changed in this diff.