* DNNL: move files to rename them

* DNNL name change

* Azure pipeline updated

* disable ceil/dilation and enable Opset 10

* disable ceil/dilation tests in Python

* mlperf_ssd_resnet34_1200 disabled
Authored by Sreekanth Yalachigere on 2019-12-03 07:34:23 -08:00; committed by George Wu
Parent 3d627362a0
Commit 31ea11a696
82 changed files: 1208 additions and 1202 deletions

View file

@ -96,7 +96,7 @@ The complete list of build options can be found by running `./build.sh (or .\bui
**Execution Providers**
* [NVIDIA CUDA](#CUDA)
* [NVIDIA TensorRT](#TensorRT)
* [Intel MKL-DNN/MKL-ML](#MKLDNN-and-MKLML)
* [Intel DNNL/MKL-ML](#DNNL-and-MKLML)
* [Intel nGraph](#nGraph)
* [Intel OpenVINO](#openvino)
* [Android NNAPI](#Android)
@ -203,15 +203,12 @@ Dockerfile instructions are available [here](./dockerfiles#tensorrt)
---
### MKLDNN and MKLML
See more information on MKL-DNN and MKL-ML [here](./docs/execution_providers/MKL-DNN-ExecutionProvider.md).
### DNNL and MKLML
See more information on DNNL and MKL-ML [here](./docs/execution_providers/DNNL-ExecutionProvider.md).
#### Build Instructions
##### Linux
MKL-DNN: `./build.sh --use_mkldnn`
MKL-DNN built with dependency on MKL small libraries: `./build.sh --use_mkldnn --use_mklml`
DNNL: `./build.sh --use_dnnl`
---

View file

@ -58,7 +58,7 @@ Currently ONNX Runtime supports the following accelerators:
* NVIDIA CUDA
* Intel MKL-ML
* [Intel MKL-DNN](./docs/execution_providers/MKL-DNN-ExecutionProvider.md) - [subgraph optimization](./docs/execution_providers/MKL-DNN-Subgraphs.md)
* [Intel DNNL](./docs/execution_providers/MKL-DNN-ExecutionProvider.md) - [subgraph optimization](./docs/execution_providers/MKL-DNN-Subgraphs.md)
* [Intel nGraph](./docs/execution_providers/nGraph-ExecutionProvider.md)
* [NVIDIA TensorRT](./docs/execution_providers/TensorRT-ExecutionProvider.md)
* [Intel OpenVINO](./docs/execution_providers/OpenVINO-ExecutionProvider.md)

View file

@ -610,7 +610,7 @@ Exhibit B - "Incompatible With Secondary Licenses" Notice
_____
intel/mkl-dnn
intel/dnnl
Copyright 2016-2018 Intel Corporation

View file

@ -52,8 +52,8 @@ option(onnxruntime_USE_OPENVINO "Build with OpenVINO support" OFF)
option(onnxruntime_USE_NSYNC "Build with NSYNC support. This option only takes effect on Linux" OFF)
option(onnxruntime_USE_EIGEN_FOR_BLAS "Use eign for blas" ON)
option(onnxruntime_USE_NNAPI "Build with DNNLibrary for Android NNAPI support" OFF)
option(onnxruntime_USE_MKLDNN "Build with MKL-DNN support" OFF)
option(onnxruntime_USE_MKLML "Build MKL-DNN with MKL-ML binary dependency" OFF)
option(onnxruntime_USE_DNNL "Build with DNNL support" OFF)
option(onnxruntime_USE_MKLML "Build DNNL with MKL-ML binary dependency" OFF)
option(onnxruntime_USE_GEMMLOWP "Build with gemmlowp for quantized gemm" OFF)
option(onnxruntime_USE_AUTOML "Build AutoML support" ON)
option(onnxruntime_USE_NGRAPH "Build with nGraph support" OFF)
@ -327,7 +327,7 @@ set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -DGSL_UNEN
include(eigen)
#onnxruntime_EXTERNAL_LIBRARIES could contain onnx, onnx_proto,libprotobuf, cuda/cudnn, jemalloc,
# mkldnn/mklml, openblas, onnxruntime_codegen_tvm, tvm, nnvm_compiler and pthread
# dnnl/mklml, openblas, onnxruntime_codegen_tvm, tvm, nnvm_compiler and pthread
# pthread is always at the last
set(onnxruntime_EXTERNAL_LIBRARIES onnx onnx_proto protobuf::libprotobuf re2)
@ -346,8 +346,8 @@ if (onnxruntime_USE_ACL)
endif()
# MKLML
if (onnxruntime_USE_MKLDNN OR onnxruntime_USE_MKLML)
include(mkldnn)
if (onnxruntime_USE_DNNL OR onnxruntime_USE_MKLML)
include(dnnl)
endif()
# TVM
@ -537,10 +537,10 @@ if (onnxruntime_USE_MKLML)
link_directories(${MKLML_LIB_DIR})
endif()
if (onnxruntime_USE_MKLDNN)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES mkldnn)
list(APPEND onnxruntime_EXTERNAL_DEPENDENCIES project_mkldnn)
link_directories(${MKLDNN_LIB_DIR})
if (onnxruntime_USE_DNNL)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES dnnl)
list(APPEND onnxruntime_EXTERNAL_DEPENDENCIES project_dnnl)
link_directories(${DNNL_LIB_DIR})
endif()
if (onnxruntime_USE_NGRAPH)

cmake/external/dnnl.cmake (vendored, new file, 86 changed lines)
View file

@ -0,0 +1,86 @@
include (ExternalProject)
set(DNNL_URL https://github.com/intel/mkl-dnn.git)
# If DNNL_TAG is updated, check if MKLML_VERSION and platform.cmake.patch need to be updated.
set(DNNL_TAG v1.1.1)
set(MKLML_VERSION 2019.0.5.20190502)
if(WIN32)
set(MKLML_OS_VERSION_STR "win")
set(MKLML_FILE_EXTENSION "zip")
set(DNNL_SHARED_LIB dnnl.dll)
set(DNNL_IMPORT_LIB dnnl.lib)
if(onnxruntime_USE_MKLML)
# Windows-only updated MKLML binary which contains fix for thread cleanup hang.
set(MKLML_VERSION 2020.0.20190813)
set(MKLML_SHARED_LIB mklml.dll)
set(MKLML_IMPORT_LIB mklml.lib)
set(IOMP5MD_SHARED_LIB libiomp5md.dll)
set(IOMP5MD_IMPORT_LIB libiomp5md.lib)
endif()
else()
set(MKLML_FILE_EXTENSION "tgz")
if (APPLE)
set(DNNL_SHARED_LIB libdnnl.1.dylib)
set(MKLML_OS_VERSION_STR "mac")
else()
set(DNNL_SHARED_LIB libdnnl.so.1)
set(MKLML_OS_VERSION_STR "lnx")
endif()
if(onnxruntime_USE_MKLML)
set(MKLML_SHARED_LIB libmklml_intel.so)
set(IOMP5MD_SHARED_LIB libiomp5.so)
endif()
endif()
if (onnxruntime_USE_MKLML)
set(MKLDNN_VERSION_SHORT v0.20)
set(MKLML_URL https://github.com/intel/mkl-dnn/releases/download/${MKLDNN_VERSION_SHORT}/mklml_${MKLML_OS_VERSION_STR}_${MKLML_VERSION}.${MKLML_FILE_EXTENSION})
ExternalProject_Add(project_mklml
PREFIX mklml
URL ${MKLML_URL}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
UPDATE_COMMAND ""
INSTALL_COMMAND "" )
set(MKML_DIR ${CMAKE_CURRENT_BINARY_DIR}/mklml/src/project_mklml)
set(MKLML_INCLUDE_DIR "${MKML_DIR}/include")
set(MKLML_LIB_DIR "${MKML_DIR}/lib")
link_directories(${MKLML_LIB_DIR})
endif()
if (onnxruntime_USE_DNNL)
set(DNNL_SOURCE ${CMAKE_CURRENT_BINARY_DIR}/dnnl/src/dnnl/src)
set(DNNL_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/dnnl/install)
set(DNNL_LIB_DIR ${DNNL_INSTALL}/${CMAKE_INSTALL_LIBDIR})
if(WIN32)
set(DNNL_DLL_PATH ${DNNL_INSTALL}/${CMAKE_INSTALL_BINDIR}/${DNNL_SHARED_LIB})
else()
set(DNNL_DLL_PATH ${DNNL_LIB_DIR}/${DNNL_SHARED_LIB})
endif()
set(DNNL_INCLUDE_DIR ${DNNL_INSTALL}/include)
set(DNNL_CMAKE_EXTRA_ARGS)
# set(DNNL_PATCH_COMMAND git apply ${CMAKE_SOURCE_DIR}/patches/mkldnn/constexpr.patch)
# discard prior changes due to patching in mkldnn source to unblock incremental builds.
# set(MKLDNN_PATCH_DISCARD_COMMAND cd ${DNNL_SOURCE} && git checkout -- .)
# if(NOT onnxruntime_BUILD_FOR_NATIVE_MACHINE)
# pre-v1.0
# list(APPEND DNNL_CMAKE_EXTRA_ARGS "-DARCH_OPT_FLAGS=")
# v1.0
# list(APPEND DNNL_CMAKE_EXTRA_ARGS "-DDNNL_ARCH_OPT_FLAGS=")
# endif()
ExternalProject_Add(project_dnnl
PREFIX dnnl
GIT_REPOSITORY ${DNNL_URL}
GIT_TAG ${DNNL_TAG}
# PATCH_COMMAND ${MKLDNN_PATCH_DISCARD_COMMAND} COMMAND ${DNNL_PATCH_COMMAND}
SOURCE_DIR ${DNNL_SOURCE}
CMAKE_ARGS -DDNNL_PRODUCT_BUILD_MODE=OFF -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${DNNL_INSTALL} -DMKLROOT=${MKML_DIR}
)
link_directories(${DNNL_LIB_DIR})
#if (onnxruntime_USE_MKLML)
# add_dependencies(project_dnnl project_mklml)
#endif()
endif()

cmake/external/mkldnn.cmake (vendored, deleted, 86 changed lines)
View file

@ -1,86 +0,0 @@
include (ExternalProject)
set(MKLDNN_URL https://github.com/intel/mkl-dnn.git)
# If MKLDNN_TAG is updated, check if MKLML_VERSION and platform.cmake.patch need to be updated.
set(MKLDNN_TAG v1.0.2)
set(MKLML_VERSION 2019.0.5.20190502)
if(WIN32)
set(MKLML_OS_VERSION_STR "win")
set(MKLML_FILE_EXTENSION "zip")
set(MKLDNN_SHARED_LIB mkldnn.dll)
set(MKLDNN_IMPORT_LIB mkldnn.lib)
if(onnxruntime_USE_MKLML)
# Windows-only updated MKLML binary which contains fix for thread cleanup hang.
set(MKLML_VERSION 2020.0.20190813)
set(MKLML_SHARED_LIB mklml.dll)
set(MKLML_IMPORT_LIB mklml.lib)
set(IOMP5MD_SHARED_LIB libiomp5md.dll)
set(IOMP5MD_IMPORT_LIB libiomp5md.lib)
endif()
else()
set(MKLML_FILE_EXTENSION "tgz")
if (APPLE)
set(MKLDNN_SHARED_LIB libmkldnn.1.dylib)
set(MKLML_OS_VERSION_STR "mac")
else()
set(MKLDNN_SHARED_LIB libmkldnn.so.1)
set(MKLML_OS_VERSION_STR "lnx")
endif()
if(onnxruntime_USE_MKLML)
set(MKLML_SHARED_LIB libmklml_intel.so)
set(IOMP5MD_SHARED_LIB libiomp5.so)
endif()
endif()
if (onnxruntime_USE_MKLML)
set(MKLDNN_VERSION_SHORT v0.20)
set(MKLML_URL https://github.com/intel/mkl-dnn/releases/download/${MKLDNN_VERSION_SHORT}/mklml_${MKLML_OS_VERSION_STR}_${MKLML_VERSION}.${MKLML_FILE_EXTENSION})
ExternalProject_Add(project_mklml
PREFIX mklml
URL ${MKLML_URL}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
UPDATE_COMMAND ""
INSTALL_COMMAND "" )
set(MKML_DIR ${CMAKE_CURRENT_BINARY_DIR}/mklml/src/project_mklml)
set(MKLML_INCLUDE_DIR "${MKML_DIR}/include")
set(MKLML_LIB_DIR "${MKML_DIR}/lib")
link_directories(${MKLML_LIB_DIR})
endif()
if (onnxruntime_USE_MKLDNN)
set(MKLDNN_SOURCE ${CMAKE_CURRENT_BINARY_DIR}/mkl-dnn/src/mkl-dnn/src)
set(MKLDNN_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/mkl-dnn/install)
set(MKLDNN_LIB_DIR ${MKLDNN_INSTALL}/${CMAKE_INSTALL_LIBDIR})
if(WIN32)
set(MKLDNN_DLL_PATH ${MKLDNN_INSTALL}/${CMAKE_INSTALL_BINDIR}/${MKLDNN_SHARED_LIB})
else()
set(MKLDNN_DLL_PATH ${MKLDNN_LIB_DIR}/${MKLDNN_SHARED_LIB})
endif()
set(MKLDNN_INCLUDE_DIR ${MKLDNN_INSTALL}/include)
set(MKLDNN_CMAKE_EXTRA_ARGS)
set(MKLDNN_PATCH_COMMAND git apply ${CMAKE_SOURCE_DIR}/patches/mkldnn/constexpr.patch)
# discard prior changes due to patching in mkldnn source to unblock incremental builds.
set(MKLDNN_PATCH_DISCARD_COMMAND cd ${MKLDNN_SOURCE} && git checkout -- .)
if(NOT onnxruntime_BUILD_FOR_NATIVE_MACHINE)
# pre-v1.0
list(APPEND MKLDNN_CMAKE_EXTRA_ARGS "-DARCH_OPT_FLAGS=")
# v1.0
list(APPEND MKLDNN_CMAKE_EXTRA_ARGS "-DMKLDNN_ARCH_OPT_FLAGS=")
endif()
ExternalProject_Add(project_mkldnn
PREFIX mkl-dnn
GIT_REPOSITORY ${MKLDNN_URL}
GIT_TAG ${MKLDNN_TAG}
PATCH_COMMAND ${MKLDNN_PATCH_DISCARD_COMMAND} COMMAND ${MKLDNN_PATCH_COMMAND}
SOURCE_DIR ${MKLDNN_SOURCE}
CMAKE_ARGS -DMKLDNN_PRODUCT_BUILD_MODE=OFF -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL} -DMKLROOT=${MKML_DIR} ${MKLDNN_CMAKE_EXTRA_ARGS}
)
link_directories(${MKLDNN_LIB_DIR})
if (onnxruntime_USE_MKLML)
add_dependencies(project_mkldnn project_mklml)
endif()
endif()

View file

@ -58,7 +58,7 @@ target_link_libraries(onnxruntime PRIVATE
onnxruntime_session
${onnxruntime_libs}
${PROVIDERS_CUDA}
${PROVIDERS_MKLDNN}
${PROVIDERS_DNNL}
${PROVIDERS_NGRAPH}
${PROVIDERS_NNAPI}
${PROVIDERS_TENSORRT}

View file

@ -14,8 +14,8 @@ if (onnxruntime_USE_CUDA)
STRING(APPEND CSHARP_PREPROCESSOR_DEFINES "USE_CUDA,")
endif()
if (onnxruntime_USE_MKLDNN)
STRING(APPEND CSHARP_PREPROCESSOR_DEFINES "USE_MKLDNN,")
if (onnxruntime_USE_DNNL)
STRING(APPEND CSHARP_PREPROCESSOR_DEFINES "USE_DNNL,")
endif()
if (onnxruntime_USE_TENSORRT)

View file

@ -40,9 +40,9 @@ file(GLOB onnxruntime_providers_common_srcs CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/core/providers/*.cc"
)
if(onnxruntime_USE_MKLDNN)
set(PROVIDERS_MKLDNN onnxruntime_providers_mkldnn)
list(APPEND ONNXRUNTIME_PROVIDER_NAMES mkldnn)
if(onnxruntime_USE_DNNL)
set(PROVIDERS_DNNL onnxruntime_providers_dnnl)
list(APPEND ONNXRUNTIME_PROVIDER_NAMES dnnl)
endif()
if(onnxruntime_USE_NGRAPH)
set(PROVIDERS_NGRAPH onnxruntime_providers_ngraph)
@ -177,20 +177,20 @@ if (onnxruntime_USE_CUDA)
endif()
endif()
if (onnxruntime_USE_MKLDNN)
file(GLOB_RECURSE onnxruntime_providers_mkldnn_cc_srcs CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/core/providers/mkldnn/*.h"
"${ONNXRUNTIME_ROOT}/core/providers/mkldnn/*.cc"
if (onnxruntime_USE_DNNL)
file(GLOB_RECURSE onnxruntime_providers_dnnl_cc_srcs CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/core/providers/dnnl/*.h"
"${ONNXRUNTIME_ROOT}/core/providers/dnnl/*.cc"
)
source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_mkldnn_cc_srcs})
add_library(onnxruntime_providers_mkldnn ${onnxruntime_providers_mkldnn_cc_srcs})
onnxruntime_add_include_to_target(onnxruntime_providers_mkldnn onnxruntime_common onnxruntime_framework onnx onnx_proto protobuf::libprotobuf)
add_dependencies(onnxruntime_providers_mkldnn ${onnxruntime_EXTERNAL_DEPENDENCIES})
set_target_properties(onnxruntime_providers_mkldnn PROPERTIES FOLDER "ONNXRuntime")
target_include_directories(onnxruntime_providers_mkldnn PRIVATE ${ONNXRUNTIME_ROOT} ${eigen_INCLUDE_DIRS} ${MKLDNN_INCLUDE_DIR})
install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/mkldnn DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers)
set_target_properties(onnxruntime_providers_mkldnn PROPERTIES LINKER_LANGUAGE CXX)
source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_dnnl_cc_srcs})
add_library(onnxruntime_providers_dnnl ${onnxruntime_providers_dnnl_cc_srcs})
onnxruntime_add_include_to_target(onnxruntime_providers_dnnl onnxruntime_common onnxruntime_framework onnx onnx_proto protobuf::libprotobuf)
add_dependencies(onnxruntime_providers_dnnl ${onnxruntime_EXTERNAL_DEPENDENCIES})
set_target_properties(onnxruntime_providers_dnnl PROPERTIES FOLDER "ONNXRuntime")
target_include_directories(onnxruntime_providers_dnnl PRIVATE ${ONNXRUNTIME_ROOT} ${eigen_INCLUDE_DIRS} ${DNNL_INCLUDE_DIR})
install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/dnnl DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers)
set_target_properties(onnxruntime_providers_dnnl PROPERTIES LINKER_LANGUAGE CXX)
endif()
if (onnxruntime_USE_TENSORRT)

View file

@ -50,8 +50,8 @@ if(onnxruntime_PYBIND_EXPORT_OPSCHEMA)
target_compile_definitions(onnxruntime_pybind11_state PRIVATE onnxruntime_PYBIND_EXPORT_OPSCHEMA)
endif()
if (onnxruntime_USE_MKLDNN)
target_compile_definitions(onnxruntime_pybind11_state PRIVATE USE_MKLDNN=1)
if (onnxruntime_USE_DNNL)
target_compile_definitions(onnxruntime_pybind11_state PRIVATE USE_DNNL=1)
endif()
target_include_directories(onnxruntime_pybind11_state PRIVATE ${ONNXRUNTIME_ROOT} ${PYTHON_INCLUDE_DIR} ${NUMPY_INCLUDE_DIR})
@ -68,7 +68,7 @@ set(onnxruntime_pybind11_state_libs
onnxruntime_session
${onnxruntime_libs}
${PROVIDERS_CUDA}
${PROVIDERS_MKLDNN}
${PROVIDERS_DNNL}
${PROVIDERS_TENSORRT}
${PROVIDERS_NGRAPH}
${PROVIDERS_OPENVINO}
@ -198,10 +198,10 @@ add_custom_command(
$<TARGET_FILE_DIR:${test_data_target}>
)
if (onnxruntime_USE_MKLDNN)
if (onnxruntime_USE_DNNL)
add_custom_command(
TARGET onnxruntime_pybind11_state POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_DLL_PATH}
COMMAND ${CMAKE_COMMAND} -E copy ${DNNL_DLL_PATH}
$<TARGET_FILE_DIR:${test_data_target}>/onnxruntime/capi/
)
endif()

View file

@ -202,8 +202,8 @@ if(onnxruntime_USE_CUDA)
list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_cuda)
endif()
if(onnxruntime_USE_MKLDNN)
list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_mkldnn)
if(onnxruntime_USE_DNNL)
list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_dnnl)
endif()
if(onnxruntime_USE_NGRAPH)
@ -260,7 +260,7 @@ set(ONNXRUNTIME_TEST_LIBS
${ONNXRUNTIME_INTEROP_TEST_LIBS}
${onnxruntime_libs}
${PROVIDERS_CUDA}
${PROVIDERS_MKLDNN}
${PROVIDERS_DNNL}
${PROVIDERS_TENSORRT}
${PROVIDERS_NGRAPH}
${PROVIDERS_OPENVINO}
@ -314,8 +314,8 @@ onnxruntime_add_include_to_target(onnxruntime_test_utils_for_framework onnxrunti
if (onnxruntime_USE_FULL_PROTOBUF)
target_compile_definitions(onnxruntime_test_utils_for_framework PRIVATE USE_FULL_PROTOBUF=1)
endif()
if (onnxruntime_USE_MKLDNN)
target_compile_definitions(onnxruntime_test_utils_for_framework PUBLIC USE_MKLDNN=1)
if (onnxruntime_USE_DNNL)
target_compile_definitions(onnxruntime_test_utils_for_framework PUBLIC USE_DNNL=1)
endif()
add_dependencies(onnxruntime_test_utils_for_framework ${onnxruntime_EXTERNAL_DEPENDENCIES})
target_include_directories(onnxruntime_test_utils_for_framework PUBLIC "${TEST_SRC_DIR}/util/include" PRIVATE ${eigen_INCLUDE_DIRS} ${ONNXRUNTIME_ROOT})
@ -329,8 +329,8 @@ onnxruntime_add_include_to_target(onnxruntime_test_utils onnxruntime_framework g
if (onnxruntime_USE_FULL_PROTOBUF)
target_compile_definitions(onnxruntime_test_utils PRIVATE USE_FULL_PROTOBUF=1)
endif()
if (onnxruntime_USE_MKLDNN)
target_compile_definitions(onnxruntime_test_utils PUBLIC USE_MKLDNN=1)
if (onnxruntime_USE_DNNL)
target_compile_definitions(onnxruntime_test_utils PUBLIC USE_DNNL=1)
endif()
add_dependencies(onnxruntime_test_utils ${onnxruntime_EXTERNAL_DEPENDENCIES})
target_include_directories(onnxruntime_test_utils PUBLIC "${TEST_SRC_DIR}/util/include" PRIVATE ${eigen_INCLUDE_DIRS} ${ONNXRUNTIME_ROOT})
@ -440,11 +440,11 @@ add_custom_command(
${TEST_DATA_SRC}
${TEST_DATA_DES})
if(WIN32)
if (onnxruntime_USE_MKLDNN)
list(APPEND onnx_test_libs mkldnn)
if (onnxruntime_USE_DNNL)
list(APPEND onnx_test_libs dnnl)
add_custom_command(
TARGET ${test_data_target} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_DLL_PATH} $<TARGET_FILE_DIR:${test_data_target}>
COMMAND ${CMAKE_COMMAND} -E copy ${DNNL_DLL_PATH} $<TARGET_FILE_DIR:${test_data_target}>
)
endif()
if (onnxruntime_USE_MKLML)

View file

@ -18,7 +18,7 @@
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
<Visible>false</Visible>
</None>
<None Include="$(NativeBuildOutputDir)\mkldnn.dll" Condition="Exists('$(NativeBuildOutputDir)\mkldnn.dll')">
<None Include="$(NativeBuildOutputDir)\dnnl.dll" Condition="Exists('$(NativeBuildOutputDir)\dnnl.dll')">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
<Visible>false</Visible>
</None>

View file

@ -79,8 +79,8 @@
CopyToOutputDirectory="Always"
Visible="false"
/>
<None Include="$(NativeBuildOutputDir)\mkldnn.dll"
Condition="Exists('$(NativeBuildOutputDir)\mkldnn.dll')"
<None Include="$(NativeBuildOutputDir)\dnnl.dll"
Condition="Exists('$(NativeBuildOutputDir)\dnnl.dll')"
PackagePath="\runtimes\win-$(TargetArchitecture)\native"
Pack="true"
CopyToOutputDirectory="Always"

View file

@ -412,9 +412,9 @@ namespace Microsoft.ML.OnnxRuntime
[DllImport(nativeLib, CharSet = charSet)]
public static extern IntPtr /*(OrtStatus*)*/ OrtSessionOptionsAppendExecutionProvider_CPU(IntPtr /*(OrtSessionOptions*) */ options, int use_arena);
#if USE_MKLDNN
#if USE_DNNL
[DllImport(nativeLib, CharSet = charSet)]
public static extern IntPtr /*(OrtStatus*)*/ OrtSessionOptionsAppendExecutionProvider_Mkldnn(IntPtr /*(OrtSessionOptions*) */ options, int use_arena);
public static extern IntPtr /*(OrtStatus*)*/ OrtSessionOptionsAppendExecutionProvider_Dnnl(IntPtr /*(OrtSessionOptions*) */ options, int use_arena);
#endif
#if USE_CUDA

View file

@ -81,10 +81,10 @@ namespace Microsoft.ML.OnnxRuntime
NativeApiStatus.VerifySuccess(NativeMethods.OrtSessionOptionsAppendExecutionProvider_CPU(_nativePtr, useArena));
}
#if USE_MKLDNN
public void AppendExecutionProvider_Mkldnn(int useArena)
#if USE_DNNL
public void AppendExecutionProvider_Dnnl(int useArena)
{
NativeApiStatus.VerifySuccess(NativeMethods.OrtSessionOptionsAppendExecutionProvider_Mkldnn(_nativePtr, useArena));
NativeApiStatus.VerifySuccess(NativeMethods.OrtSessionOptionsAppendExecutionProvider_Dnnl(_nativePtr, useArena));
}
#endif

View file

@ -31,10 +31,10 @@
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Visible>false</Visible>
</None>
<None Include="$(MSBuildThisFileDirectory)..\..\runtimes\win-x64\native\mkldnn.dll"
<None Include="$(MSBuildThisFileDirectory)..\..\runtimes\win-x64\native\dnnl.dll"
Condition="'$(PlatformTarget)' == 'x64' AND
Exists('$(MSBuildThisFileDirectory)..\..\runtimes\win-x64\native\mkldnn.dll')">
<Link>mkldnn.dll</Link>
Exists('$(MSBuildThisFileDirectory)..\..\runtimes\win-x64\native\dnnl.dll')">
<Link>dnnl.dll</Link>
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Visible>false</Visible>
</None>
@ -58,10 +58,10 @@
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Visible>false</Visible>
</None>
<None Include="$(MSBuildThisFileDirectory)..\..\runtimes\win-x86\native\mkldnn.dll"
<None Include="$(MSBuildThisFileDirectory)..\..\runtimes\win-x86\native\dnnl.dll"
Condition="'$(PlatformTarget)' == 'x86' AND
Exists('$(MSBuildThisFileDirectory)..\..\runtimes\win-x86\native\mkldnn.dll')">
<Link>mkldnn.dll</Link>
Exists('$(MSBuildThisFileDirectory)..\..\runtimes\win-x86\native\dnnl.dll')">
<Link>dnnl.dll</Link>
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Visible>false</Visible>
</None>

View file

@ -78,8 +78,8 @@ namespace Microsoft.ML.OnnxRuntime.Tests
Assert.Throws<OnnxRuntimeException>(() => { opt.GraphOptimizationLevel = (GraphOptimizationLevel)10; });
opt.AppendExecutionProvider_CPU(1);
#if USE_MKLDNN
opt.AppendExecutionProvider_Mkldnn(0);
#if USE_DNNL
opt.AppendExecutionProvider_Dnnl(0);
#endif
#if USE_CUDA
opt.AppendExecutionProvider_CUDA(0);
@ -1106,8 +1106,8 @@ namespace Microsoft.ML.OnnxRuntime.Tests
var entryPointNames = new[]{
"OrtGetApiBase",
"OrtSessionOptionsAppendExecutionProvider_CPU"
#if USE_MKLDNN
,"OrtSessionOptionsAppendExecutionProvider_Mkldnn"
#if USE_DNNL
,"OrtSessionOptionsAppendExecutionProvider_Dnnl"
#endif
#if USE_CUDA
,"OrtSessionOptionsAppendExecutionProvider_CUDA"

View file

@ -19,7 +19,7 @@
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
<Visible>false</Visible>
</None>
<None Include="$(NativeBuildOutputDir)\mkldnn.dll" Condition="Exists('$(NativeBuildOutputDir)\mkldnn.dll')">
<None Include="$(NativeBuildOutputDir)\dnnl.dll" Condition="Exists('$(NativeBuildOutputDir)\dnnl.dll')">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
<Visible>false</Visible>
</None>

View file

@ -27,14 +27,14 @@ ENV PATH="/usr/local/go/bin:${PATH}"
WORKDIR /
RUN mkdir -p /onnxruntime/build
RUN python3 /onnxruntime/tools/ci_build/build.py --build_dir /onnxruntime/build --config Release --use_mkldnn --build_server --parallel --cmake_extra_defines ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER)
RUN python3 /onnxruntime/tools/ci_build/build.py --build_dir /onnxruntime/build --config Release --use_dnnl --build_server --parallel --cmake_extra_defines ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER)
FROM ubuntu:16.04 AS minimal
WORKDIR /onnxruntime/server/
COPY --from=build /onnxruntime/build/Release/onnxruntime_server /onnxruntime/server/
COPY --from=build /onnxruntime/build/Release/libonnxruntime.so.* /lib/
COPY --from=build /onnxruntime/build/Release/mkl-dnn/install/lib/libmkldnn.so* /lib/
COPY --from=build /onnxruntime/build/Release/mkl-dnn/install/lib/libdnnl.so* /lib/
RUN apt-get update && apt-get install -y libgomp1
ENTRYPOINT ["/onnxruntime/server/onnxruntime_server"]

View file

@ -27,7 +27,7 @@ ENV PATH="/usr/local/go/bin:${PATH}"
WORKDIR /
RUN mkdir -p /onnxruntime/build
RUN python3 /onnxruntime/tools/ci_build/build.py --build_dir /onnxruntime/build --config Release --use_mkldnn --use_mklml \
RUN python3 /onnxruntime/tools/ci_build/build.py --build_dir /onnxruntime/build --config Release --use_dnnl --use_mklml \
--build_server --parallel --cmake_extra_defines ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER)
@ -36,7 +36,7 @@ WORKDIR /onnxruntime/server/
RUN apt-get update && apt-get install -y libgomp1
COPY --from=build /onnxruntime/build/Release/onnxruntime_server /onnxruntime/server/
COPY --from=build /onnxruntime/build/Release/libonnxruntime.so.* /lib/
COPY --from=build /onnxruntime/build/Release/mkl-dnn/install/lib/libmkldnn.so* /lib/
COPY --from=build /onnxruntime/build/Release/mkl-dnn/install/lib/libdnnl.so* /lib/
COPY --from=build /onnxruntime/build/Release/mklml/src/project_mklml/lib/*.so* /lib/
ENTRYPOINT ["/onnxruntime/server/onnxruntime_server"]

View file

@ -15,8 +15,8 @@ Examples:
Provider](../onnxruntime/core/providers/cpu/cpu_execution_provider.h)
* [CUDA Execution
Provider](../onnxruntime/core/providers/cuda/cuda_execution_provider.h)
* [MKL-DNN Execution
Provider](../onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.h)
* [DNNL Execution
Provider](../onnxruntime/core/providers/dnnl/dnnl_execution_provider.h)
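For orientation, below is a minimal sketch of the shape such a provider class takes, pieced together from the DNNL provider headers and sources touched by this commit. The class name is invented, the include path is an assumption, and method bodies are omitted, so treat it as a rough outline rather than a template.

```c++
// Outline of an execution provider class, modeled on the DNNL provider in this
// commit. "MyExecutionProvider" is a placeholder name; the include path is assumed.
#pragma once
#include "core/framework/execution_provider.h"

namespace onnxruntime {

class MyExecutionProvider : public IExecutionProvider {
 public:
  MyExecutionProvider() : IExecutionProvider{"MyExecutionProvider"} {}

  // Kernels this provider implements directly.
  std::shared_ptr<KernelRegistry> GetKernelRegistry() const override;

  // Claim the nodes / subgraphs this provider can run.
  std::vector<std::unique_ptr<ComputeCapability>> GetCapability(
      const GraphViewer& graph_viewer,
      const std::vector<const KernelRegistry*>& kernel_registries) const override;

  // Turn fused nodes into compute functions (used by subgraph-style providers such as DNNL).
  common::Status Compile(const std::vector<Node*>& fused_nodes,
                         std::vector<NodeComputeInfo>& node_compute_funcs) override;
};

}  // namespace onnxruntime
```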
# Using the execution provider

View file

@ -5,7 +5,7 @@
* Creating an InferenceSession from an on-disk model file and a set of SessionOptions.
* Registering customized loggers.
* Registering customized allocators.
* Registering predefined providers and set the priority order. ONNXRuntime has a set of predefined execution providers, like CUDA, MKLDNN. User can register providers to their InferenceSession. The order of registration indicates the preference order as well.
* Registering predefined providers and setting the priority order. ONNXRuntime has a set of predefined execution providers, like CUDA and DNNL. Users can register providers to their InferenceSession. The order of registration indicates the preference order as well (a sketch follows this list).
* Running a model with inputs. These inputs must be in CPU memory, not GPU. If the model has multiple outputs, user can specify which outputs they want.
* Converting an in-memory ONNX Tensor encoded in protobuf format to a pointer that can be used as model input.
* Setting the thread pool size for each session.
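As a rough illustration of the session-setup steps listed above, here is a minimal sketch that registers the DNNL execution provider ahead of the default CPU provider through the C API. The include paths, the ORT_TSTR/model path, and the error handling are assumptions that may differ by version; only OrtSessionOptionsAppendExecutionProvider_Dnnl is taken from this commit.

```c++
// Sketch: prefer DNNL, fall back to the default CPU provider. Error statuses
// are only checked for non-null here; a real program would read and release them.
#include "onnxruntime_c_api.h"       // assumed include path
#include "dnnl_provider_factory.h"   // assumed; declares OrtSessionOptionsAppendExecutionProvider_Dnnl

int main() {
  const OrtApi* api = OrtGetApiBase()->GetApi(ORT_API_VERSION);

  OrtEnv* env = nullptr;
  OrtSessionOptions* so = nullptr;
  OrtSession* session = nullptr;

  if (api->CreateEnv(ORT_LOGGING_LEVEL_WARNING, "dnnl-demo", &env)) return 1;
  if (api->CreateSessionOptions(&so)) return 1;

  // Registration order is the preference order: DNNL first, CPU as fallback.
  if (OrtSessionOptionsAppendExecutionProvider_Dnnl(so, /*use_arena=*/1)) return 1;

  // "model.onnx" is a placeholder path.
  if (api->CreateSession(env, ORT_TSTR("model.onnx"), so, &session)) return 1;

  // ... create input OrtValues (in CPU memory) and call api->Run(...) ...

  api->ReleaseSession(session);
  api->ReleaseSessionOptions(so);
  api->ReleaseEnv(env);
  return 0;
}
```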

View file

@ -35,7 +35,7 @@ Options:
-r [repeat]: Specifies the number of times to repeat
-v: verbose
-n [test_case_name]: Specifies a single test case to run.
-e [EXECUTION_PROVIDER]: EXECUTION_PROVIDER could be 'cpu', 'cuda', 'mkldnn' or 'tensorrt'. Default: 'cpu'.
-e [EXECUTION_PROVIDER]: EXECUTION_PROVIDER could be 'cpu', 'cuda', 'dnnl' or 'tensorrt'. Default: 'cpu'.
-x: Use parallel executor, default (without -x): sequential executor.
-h: help

View file

@ -20,7 +20,7 @@ Official Python packages on Pypi only support the default CPU (MLAS) and default
For example:
`MKLDNN: ./build.sh --config RelWithDebInfo --use_mkldnn --build_wheel --parallel`
`DNNL: ./build.sh --config RelWithDebInfo --use_dnnl --build_wheel --parallel`
` CUDA: ./build.sh --config RelWithDebInfo --use_cuda --build_wheel --parallel`
@ -30,11 +30,11 @@ Official release (nuget package) supports default (MLAS) and MKL-ML for CPU, and
For example:
`MKLDNN: ./build.sh --config RelWithDebInfo --use_mkldnn --build_csharp --parallel`
`DNNL: ./build.sh --config RelWithDebInfo --use_dnnl --build_csharp --parallel`
`CUDA: ./build.sh --config RelWithDebInfo --use_cuda --build_csharp --parallel`
In order to use MKLDNN, nGraph, CUDA, or TensorRT execution provider, you need to call the C API OrtSessionOptionsAppendExecutionProvider. Here is an example for the CUDA execution provider:
In order to use the DNNL, nGraph, CUDA, or TensorRT execution provider, you need to call the C API OrtSessionOptionsAppendExecutionProvider. Here is an example for the CUDA execution provider:
C API Example:
```c

View file

@ -450,7 +450,7 @@
| |
## Operators implemented by MKLDNNExecutionProvider
## Operators implemented by DNNLExecutionProvider
| Op Name | Parameters | OpSet Version | Types Supported |
|---------|------------|---------------|-----------------|

View file

@ -78,7 +78,7 @@ To achieve the best performance on a growing set of compute targets across cloud
|Supported|Future|
|---|---|
|MLAS (Microsoft Linear Algebra Subprograms)|Android NN API (in progress)|
|Intel MKL-DNN / MKL-ML|ARM Compute Library (community contribution by NXP, in progress)|
|Intel DNNL / MKL-ML|ARM Compute Library (community contribution by NXP, in progress)|
|Intel nGraph| |
|NVIDIA CUDA| |
|NVIDIA TensorRT| |

View file

@ -20,7 +20,7 @@ alias cmake="/usr/bin/cmake -DCMAKE_TOOLCHAIN_FILE=$OECORE_NATIVE_SYSROOT/usr/sh
Configure ONNX Runtime with ACL support:
```
cmake ../onnxruntime-arm-upstream/cmake -DONNX_CUSTOM_PROTOC_EXECUTABLE=/usr/bin/protoc -Donnxruntime_RUN_ONNX_TESTS=OFF -Donnxruntime_GENERATE_TEST_REPORTS=ON -Donnxruntime_DEV_MODE=ON -DPYTHON_EXECUTABLE=/usr/bin/python3 -Donnxruntime_USE_CUDA=OFF -Donnxruntime_USE_NSYNC=OFF -Donnxruntime_CUDNN_HOME= -Donnxruntime_USE_JEMALLOC=OFF -Donnxruntime_ENABLE_PYTHON=OFF -Donnxruntime_BUILD_CSHARP=OFF -Donnxruntime_BUILD_SHARED_LIB=ON -Donnxruntime_USE_EIGEN_FOR_BLAS=ON -Donnxruntime_USE_OPENBLAS=OFF -Donnxruntime_USE_ACL=ON -Donnxruntime_USE_MKLDNN=OFF -Donnxruntime_USE_MKLML=OFF -Donnxruntime_USE_OPENMP=ON -Donnxruntime_USE_TVM=OFF -Donnxruntime_USE_LLVM=OFF -Donnxruntime_ENABLE_MICROSOFT_INTERNAL=OFF -Donnxruntime_USE_BRAINSLICE=OFF -Donnxruntime_USE_NUPHAR=OFF -Donnxruntime_USE_EIGEN_THREADPOOL=OFF -Donnxruntime_BUILD_UNIT_TESTS=ON -DCMAKE_BUILD_TYPE=RelWithDebInfo
cmake ../onnxruntime-arm-upstream/cmake -DONNX_CUSTOM_PROTOC_EXECUTABLE=/usr/bin/protoc -Donnxruntime_RUN_ONNX_TESTS=OFF -Donnxruntime_GENERATE_TEST_REPORTS=ON -Donnxruntime_DEV_MODE=ON -DPYTHON_EXECUTABLE=/usr/bin/python3 -Donnxruntime_USE_CUDA=OFF -Donnxruntime_USE_NSYNC=OFF -Donnxruntime_CUDNN_HOME= -Donnxruntime_USE_JEMALLOC=OFF -Donnxruntime_ENABLE_PYTHON=OFF -Donnxruntime_BUILD_CSHARP=OFF -Donnxruntime_BUILD_SHARED_LIB=ON -Donnxruntime_USE_EIGEN_FOR_BLAS=ON -Donnxruntime_USE_OPENBLAS=OFF -Donnxruntime_USE_ACL=ON -Donnxruntime_USE_DNNL=OFF -Donnxruntime_USE_MKLML=OFF -Donnxruntime_USE_OPENMP=ON -Donnxruntime_USE_TVM=OFF -Donnxruntime_USE_LLVM=OFF -Donnxruntime_ENABLE_MICROSOFT_INTERNAL=OFF -Donnxruntime_USE_BRAINSLICE=OFF -Donnxruntime_USE_NUPHAR=OFF -Donnxruntime_USE_EIGEN_THREADPOOL=OFF -Donnxruntime_BUILD_UNIT_TESTS=ON -DCMAKE_BUILD_TYPE=RelWithDebInfo
```
Build ONNX Runtime library, test and performance application:

View file

@ -1,13 +1,13 @@
# MKL-DNN Execution Provider
# DNNL Execution Provider
Intel® Math Kernel Library for Deep Neural Networks (Intel® MKL-DNN) is an open-source performance library for deep-learning applications. The library accelerates deep-learning applications and frameworks on Intel® architecture and Intel® Processor Graphics Architecture. Intel MKL-DNN contains vectorized and threaded building blocks that you can use to implement deep neural networks (DNN) with C and C++ interfaces. For more, please see the MKL-DNN documentation on (https://intel.github.io/mkl-dnn/).
Intel® Math Kernel Library for Deep Neural Networks (Intel® DNNL) is an open-source performance library for deep-learning applications. The library accelerates deep-learning applications and frameworks on Intel® architecture and Intel® Processor Graphics Architecture. Intel DNNL contains vectorized and threaded building blocks that you can use to implement deep neural networks (DNN) with C and C++ interfaces. For more, please see the DNNL documentation on (https://intel.github.io/mkl-dnn/).
Intel and Microsoft have developed MKL-DNN Execution Provider (EP) for ONNX Runtime to accelerate performance of ONNX Runtime using Intel® Math Kernel Library for Deep Neural Networks (Intel® MKL-DNN) optimized primitives.
Intel and Microsoft have developed DNNL Execution Provider (EP) for ONNX Runtime to accelerate performance of ONNX Runtime using Intel® Math Kernel Library for Deep Neural Networks (Intel® DNNL) optimized primitives.
For information on how MKL-DNN optimizes subgraphs, see [Subgraph Optimization](./MKL-DNN-Subgraphs.md)
For information on how DNNL optimizes subgraphs, see [Subgraph Optimization](./MKL-DNN-Subgraphs.md)
## Build
For build instructions, please see the [BUILD page](../../BUILD.md#mkldnn-and-mklml).
For build instructions, please see the [BUILD page](../../BUILD.md#dnnl-and-mklml).
## Supported OS
* Ubuntu 16.04
@ -17,18 +17,18 @@ For build instructions, please see the [BUILD page](../../BUILD.md#mkldnn-and-mk
## Supported backend
* CPU
## Using the MKL-DNN Execution Provider
## Using the DNNL Execution Provider
### C/C++
The MKLDNNExecutionProvider execution provider needs to be registered with ONNX Runtime to enable in the inference session.
The DNNLExecutionProvider needs to be registered with ONNX Runtime to be enabled in the inference session.
```
InferenceSession session_object{so};
session_object.RegisterExecutionProvider(std::make_unique<::onnxruntime:: MKLDNNExecutionProvider >());
session_object.RegisterExecutionProvider(std::make_unique<::onnxruntime:: DNNLExecutionProvider >());
status = session_object.Load(model_file_name);
```
The C API details are [here](../C_API.md#c-api).
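For completeness, here is a slightly fuller sketch of the same in-tree C++ registration, written against the DNNLExecutionProviderInfo and DNNLExecutionProvider types introduced by this rename. The include paths, the helper function, and the exact InferenceSession constructor are assumptions rather than documented public API.

```c++
// Sketch only: mirrors the snippet above, but passes an explicit provider info
// struct and finishes session setup. Header paths and names are assumptions.
#include <string>
#include "core/common/common.h"
#include "core/session/inference_session.h"
#include "core/providers/dnnl/dnnl_execution_provider.h"

onnxruntime::common::Status RunWithDnnl(const std::string& model_file_name) {
  onnxruntime::SessionOptions so;
  onnxruntime::InferenceSession session_object{so};

  // use_arena = true matches the create_arena default in DNNLExecutionProviderInfo.
  onnxruntime::DNNLExecutionProviderInfo dnnl_info(/*use_arena=*/true);
  ORT_RETURN_IF_ERROR(session_object.RegisterExecutionProvider(
      onnxruntime::make_unique<onnxruntime::DNNLExecutionProvider>(dnnl_info)));

  ORT_RETURN_IF_ERROR(session_object.Load(model_file_name));
  ORT_RETURN_IF_ERROR(session_object.Initialize());
  return onnxruntime::common::Status::OK();
}
```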
### Python
When using the python wheel from the ONNX Runtime built with MKL-DNN execution provider, it will be automatically prioritized over the CPU execution provider. Python APIs details are [here](https://aka.ms/onnxruntime-python).
When using the Python wheel from an ONNX Runtime build that includes the DNNL execution provider, it will be automatically prioritized over the CPU execution provider. Python API details are [here](https://aka.ms/onnxruntime-python).
## Performance Tuning
For performance tuning, please see guidance on this page: [ONNX Runtime Perf Tuning](../ONNX_Runtime_Perf_Tuning.md)

View file

@ -1,25 +1,25 @@
# Subgraph Optimization
MKL-DNN uses blocked layout (example: nhwc with channels blocked by 16 – nChw16c) to take advantage of vector operations using AVX512. To get best performance, we avoid reorders (example. Nchw16c to nchw) and propagate blocked layout to next primitive.
DNNL uses a blocked layout (example: nchw with channels blocked by 16, i.e. nChw16c) to take advantage of vector operations using AVX512. To get the best performance, we avoid reorders (for example, nChw16c to nchw) and propagate the blocked layout to the next primitive.
Subgraph optimization achieves this in the following steps.
1. Parses the ONNX Runtime graph and creates an Internal Representation of the subgraph.
2. Subgraph Operator (MklDnnFunKernel) iterates through MKL-DNN nodes and creates a vector MKL-DNN Kernels
3. Compute Function of MklDnnFunKernel iterates and binds data to MKL-DNN primitives in the vector and submits vector for execution.
2. The Subgraph Operator (DnnlFuncKernel) iterates through the DNNL nodes and creates a vector of DNNL kernels.
3. The Compute function of DnnlFuncKernel iterates over the vector, binds data to the DNNL primitives, and submits the vector for execution.
## Subgraph (IR) Internal Representation
MklDnnExecutionProvicer::GetCapability() parses ONNX model graph and creates IR (Internal Representation) of subgraphs of MKL-DNN operators.
Each subgraph contains a vector MklDnnNodes, inputs, outputs and attributes for all its MklDnnNodes. There can be attributes of same name. So, we prefix attribute names with Node name and its index.
DNNLExecutionProvider::GetCapability() parses the ONNX model graph and creates an IR (Internal Representation) of subgraphs of DNNL operators.
Each subgraph contains a vector of DnnlNodes, plus the inputs, outputs, and attributes for all its DnnlNodes. There can be attributes of the same name, so we prefix attribute names with the node name and its index.
Unique id for subgraph is set as an attribute.
MklDnnNode has an index to its inputs and outputs and pointer to its parent nodes. MklDnnNode directly reads blocked memory from its parent to avoid data reordering.
A DnnlNode has an index to its inputs and outputs and a pointer to its parent nodes. A DnnlNode directly reads blocked memory from its parent to avoid data reordering.
<p align="left"><img src="images/mkl-dnn_node.png" /></p>
## Subgraph Classes
Primitive like MklDnnConv, MklDnnPool, etc are derived from MklDnnKernel base class.
Primitives like DnnlConv, DnnlPool, etc. are derived from the DnnlKernel base class.
The following UML diagram captures Subgraph classes.
@ -28,30 +28,30 @@ The following UML diagram captures Subgraph classes.
## Subgraph Execution
MklDnnExecutionProvicer::Compute() function creates MklDnnFuncKernel and call its Compute Function.
The DNNLExecutionProvider::Compute() function creates a DnnlFuncKernel and calls its Compute function.
MklDnnFuncKernel::Compute function creates SubgraphPrimitve pool and add the object to a map.
The DnnlFuncKernel::Compute function creates a SubgraphPrimitve pool and adds the object to a map.
SubgraphPrimitve constructor calls the following member functions
```
SubgraphPrimitve::CreatePrimitives()
for (auto& mklnode : mklnodes) {
if (mklnode.name == "Conv") {
kernel.reset(new MklDnnConv());
kernel.reset(new DnnlConv());
kernels.push_back(kernel);
} else if (mklnode.name == "BatchNormalization-Relu") {
kernel.reset(new MklDnnBatchNorm());
kernel.reset(new DnnlBatchNorm());
context_.kernels.push_back(kernel);
} else if (mklnode.name == "MaxPool") {
kernel.reset(new MklDnnPool());
kernel.reset(new DnnlPool());
context_.kernels.push_back(kernel);
}
.
.
.
```
In CreatePrimitives method, we iterate MklDnnNodes and creates MklDnnKernel objects and add MKL-DNN primitive to a vector. It also reads attributes. This is done only once, at first iteration.
In the CreatePrimitives method, we iterate over the DnnlNodes, create DnnlKernel objects, and add the DNNL primitives to a vector. It also reads attributes. This is done only once, at the first iteration.
```
SubgraphPrimitve::Compute()
@ -61,5 +61,5 @@ SubgraphPrimitve::Compute()
stream->submit(net);
```
In SubgraphPrimitve::Compute() method, we iterate thru MklDnn Kernels and bind input data. Then we submit the vector of Primitives to MKL-DNN stream.
In the SubgraphPrimitve::Compute() method, we iterate through the DNNL kernels and bind input data. Then we submit the vector of primitives to the DNNL stream.
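To make the two-phase flow above concrete, here is a small self-contained sketch of the pattern only; KernelSketch, Bind, Run, and the node list are invented for illustration and are not the actual DnnlKernel/DnnlFuncKernel interfaces. Kernels are created once by dispatching on node names, and every later Compute call merely binds fresh data and executes the already-prepared primitives.

```c++
// Illustrative pattern only (names invented): build kernels once, then bind
// and execute them on each Compute call.
#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>

struct KernelSketch {                          // stand-in for a DnnlKernel-like base
  virtual ~KernelSketch() = default;
  virtual void Bind(const float* input) = 0;   // bind this iteration's input data
  virtual void Run() = 0;                      // execute the prepared primitive
};

struct ConvSketch : KernelSketch {
  void Bind(const float*) override {}
  void Run() override { std::cout << "run Conv primitive\n"; }
};

struct PoolSketch : KernelSketch {
  void Bind(const float*) override {}
  void Run() override { std::cout << "run MaxPool primitive\n"; }
};

int main() {
  // "CreatePrimitives" phase: dispatch on node names once and keep the kernels.
  const std::map<std::string, std::function<std::unique_ptr<KernelSketch>()>> factory = {
      {"Conv", [] { return std::make_unique<ConvSketch>(); }},
      {"MaxPool", [] { return std::make_unique<PoolSketch>(); }},
  };
  const std::vector<std::string> subgraph_nodes = {"Conv", "MaxPool"};  // placeholder subgraph
  std::vector<std::unique_ptr<KernelSketch>> kernels;
  for (const auto& name : subgraph_nodes) kernels.push_back(factory.at(name)());

  // "Compute" phase: bind inputs and execute, reusing the kernels built above.
  const float dummy_input[1] = {0.0f};
  for (auto& k : kernels) { k->Bind(dummy_input); k->Run(); }
  return 0;
}
```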

View file

@ -24,7 +24,7 @@ constexpr const char* kMSDmlDomain = "com.microsoft.dml";
constexpr const char* kNGraphDomain = "com.intel.ai";
constexpr const char* kCpuExecutionProvider = "CPUExecutionProvider";
constexpr const char* kCudaExecutionProvider = "CUDAExecutionProvider";
constexpr const char* kMklDnnExecutionProvider = "MKLDNNExecutionProvider";
constexpr const char* kDnnlExecutionProvider = "DnnlExecutionProvider";
constexpr const char* kNGraphExecutionProvider = "NGRAPHExecutionProvider";
constexpr const char* kOpenVINOExecutionProvider = "OpenVINOExecutionProvider";
constexpr const char* kNupharExecutionProvider = "NupharExecutionProvider";

View file

@ -10,7 +10,7 @@ extern "C" {
/**
* \param use_arena zero: false. non-zero: true.
*/
ORT_API_STATUS(OrtSessionOptionsAppendExecutionProvider_Mkldnn, _In_ OrtSessionOptions* options, int use_arena);
ORT_API_STATUS(OrtSessionOptionsAppendExecutionProvider_Dnnl, _In_ OrtSessionOptions* options, int use_arena);
#ifdef __cplusplus
}

View file

@ -97,7 +97,7 @@ AllocatorPtr GetAllocator(const SessionState& session_state, const OrtMemoryInfo
bool ProviderIsCpuBased(const std::string& provider_type) {
return provider_type == onnxruntime::kCpuExecutionProvider ||
provider_type == onnxruntime::kMklDnnExecutionProvider ||
provider_type == onnxruntime::kDnnlExecutionProvider ||
provider_type == onnxruntime::kNGraphExecutionProvider ||
provider_type == onnxruntime::kNupharExecutionProvider ||
provider_type == onnxruntime::kOpenVINOExecutionProvider ||

View file

@ -70,7 +70,7 @@ static const onnx::TensorProto* GetInitializer(const Graph& graph, const std::st
common::Status MemcpyTransformer::ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const {
for (auto& provider : provider_types_) {
if (provider != onnxruntime::kCpuExecutionProvider &&
provider != onnxruntime::kMklDnnExecutionProvider &&
provider != onnxruntime::kDnnlExecutionProvider &&
provider != onnxruntime::kNGraphExecutionProvider &&
provider != onnxruntime::kNupharExecutionProvider &&
provider != onnxruntime::kOpenVINOExecutionProvider &&

View file

@ -3,28 +3,28 @@
#pragma once
#include "core/common/common.h"
#include "mkldnn.hpp"
#include "dnnl.hpp"
#include <unordered_map>
#include <list>
namespace onnxruntime {
namespace mkl_dnn {
namespace ort_dnnl {
template <typename T>
static mkldnn::memory::data_type MklDnnType();
static dnnl::memory::data_type DnnnType();
// Add more types here as needed.
template <>
mkldnn::memory::data_type MklDnnType<float>() {
return mkldnn::memory::data_type::f32;
dnnl::memory::data_type DnnnType<float>() {
return dnnl::memory::data_type::f32;
}
static mkldnn::engine& GetEngine() {
static mkldnn::engine cpu_engine = mkldnn::engine(mkldnn::engine::kind::cpu, 0);
static dnnl::engine& GetEngine() {
static dnnl::engine cpu_engine = dnnl::engine(dnnl::engine::kind::cpu, 0);
return cpu_engine;
}
static void AddDimsToKey(std::string& key, const mkldnn::memory::dims& dims) {
static void AddDimsToKey(std::string& key, const dnnl::memory::dims& dims) {
key.append(1, '#');
for (size_t i = 0; i < dims.size(); i++) {
key.append(std::to_string(dims[i]));
@ -69,5 +69,5 @@ class PrimitivePool {
return map;
}
};
} // namespace mkl_dnn
} // namespace ort_dnnl
} // namespace onnxruntime

View file

@ -8,22 +8,22 @@
#include "core/framework/allocator.h"
#include "core/framework/compute_capability.h"
#include "core/framework/kernel_registry.h"
#include "core/providers/mkldnn/subgraph/mkldnn_func_kernel.h"
#include "mkldnn_execution_provider.h"
#include "mkldnn_fwd.h"
#include "core/providers/dnnl/subgraph/dnnl_func_kernel.h"
#include "dnnl_execution_provider.h"
#include "dnnl_fwd.h"
namespace onnxruntime {
constexpr const char* MKLDNN = "MklDnn";
constexpr const char* MKLDNN_CPU = "MklDnnCpu";
constexpr const char* DNNL = "Dnnl";
constexpr const char* DNNL_CPU = "DnnlCpu";
MKLDNNExecutionProvider::MKLDNNExecutionProvider(const MKLDNNExecutionProviderInfo& info)
: IExecutionProvider{onnxruntime::kMklDnnExecutionProvider} {
DNNLExecutionProvider::DNNLExecutionProvider(const DNNLExecutionProviderInfo& info)
: IExecutionProvider{onnxruntime::kDnnlExecutionProvider} {
DeviceAllocatorRegistrationInfo default_memory_info({OrtMemTypeDefault,
[](int) { return onnxruntime::make_unique<CPUAllocator>(onnxruntime::make_unique<OrtMemoryInfo>(MKLDNN, OrtAllocatorType::OrtDeviceAllocator)); }, std::numeric_limits<size_t>::max()});
[](int) { return onnxruntime::make_unique<CPUAllocator>(onnxruntime::make_unique<OrtMemoryInfo>(DNNL, OrtAllocatorType::OrtDeviceAllocator)); }, std::numeric_limits<size_t>::max()});
DeviceAllocatorRegistrationInfo cpu_memory_info({OrtMemTypeCPUOutput,
[](int) { return onnxruntime::make_unique<CPUAllocator>(onnxruntime::make_unique<OrtMemoryInfo>(MKLDNN_CPU, OrtAllocatorType::OrtDeviceAllocator, OrtDevice(), 0, OrtMemTypeCPUOutput)); }, std::numeric_limits<size_t>::max()});
[](int) { return onnxruntime::make_unique<CPUAllocator>(onnxruntime::make_unique<OrtMemoryInfo>(DNNL_CPU, OrtAllocatorType::OrtDeviceAllocator, OrtDevice(), 0, OrtMemTypeCPUOutput)); }, std::numeric_limits<size_t>::max()});
if (info.create_arena) {
InsertAllocator(CreateAllocator(default_memory_info));
@ -38,15 +38,15 @@ MKLDNNExecutionProvider::MKLDNNExecutionProvider(const MKLDNNExecutionProviderIn
}
} // namespace onnxruntime
MKLDNNExecutionProvider::~MKLDNNExecutionProvider() {
DNNLExecutionProvider::~DNNLExecutionProvider() {
}
namespace mkl_dnn {
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kMklDnnExecutionProvider, kOnnxDomain, 7, Gemm);
namespace ort_dnnl {
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kDnnlExecutionProvider, kOnnxDomain, 7, Gemm);
void RegisterMKLDNNKernels(KernelRegistry& kernel_registry) {
void RegisterDNNLKernels(KernelRegistry& kernel_registry) {
static const BuildKernelCreateInfoFn function_table[] = {
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kMklDnnExecutionProvider, kOnnxDomain, 7, Gemm)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kDnnlExecutionProvider, kOnnxDomain, 7, Gemm)>,
};
for (auto& function_table_entry : function_table) {
@ -54,25 +54,23 @@ void RegisterMKLDNNKernels(KernelRegistry& kernel_registry) {
}
}
std::shared_ptr<KernelRegistry> GetMklDnnKernelRegistry() {
std::shared_ptr<KernelRegistry> GetDnnlKernelRegistry() {
std::shared_ptr<KernelRegistry> kernel_registry = std::make_shared<KernelRegistry>();
RegisterMKLDNNKernels(*kernel_registry);
RegisterDNNLKernels(*kernel_registry);
return kernel_registry;
}
} // namespace mkl_dnn
} // namespace ort_dnnl
std::shared_ptr<KernelRegistry> MKLDNNExecutionProvider::GetKernelRegistry() const {
static std::shared_ptr<KernelRegistry> kernel_registry = onnxruntime::mkl_dnn::GetMklDnnKernelRegistry();
std::shared_ptr<KernelRegistry> DNNLExecutionProvider::GetKernelRegistry() const {
static std::shared_ptr<KernelRegistry> kernel_registry = onnxruntime::ort_dnnl::GetDnnlKernelRegistry();
return kernel_registry;
}
bool MKLDNNExecutionProvider::UseSubgraph(const onnxruntime::GraphViewer& graph_viewer) const {
// switch between mkldnn-vanilla and mkldnn-subgraph implementation using
// MKLDNN_SUBGRAPH environment variable
bool DNNLExecutionProvider::UseSubgraph(const onnxruntime::GraphViewer& graph_viewer) const {
bool use_subgraph = true;
bool FP16_graph = false;
bool mkldnn_nodes_in_the_graph = false;
bool dnnl_nodes_in_the_graph = false;
int max_node_index = graph_viewer.MaxNodeIndex();
for (auto node_index = 0; node_index < max_node_index; node_index++) {
@ -92,18 +90,18 @@ bool MKLDNNExecutionProvider::UseSubgraph(const onnxruntime::GraphViewer& graph_
continue;
}
auto op_it = mkldnn_ops_.find(node->OpType());
if (op_it != mkldnn_ops_.end()) {
mkldnn_nodes_in_the_graph = true;
auto op_it = dnnl_ops_.find(node->OpType());
if (op_it != dnnl_ops_.end()) {
dnnl_nodes_in_the_graph = true;
break;
}
}
if (FP16_graph || !mkldnn_nodes_in_the_graph) {
if (FP16_graph || !dnnl_nodes_in_the_graph) {
// FP16 not supported yet.
use_subgraph = false;
} else {
const char* env = getenv("ORT_MKLDNN_SUBGRAPH");
const char* env = getenv("ORT_DNNL_SUBGRAPH");
if (env != nullptr) {
if (atoi(env) == 0) {
use_subgraph = false;
@ -113,9 +111,9 @@ bool MKLDNNExecutionProvider::UseSubgraph(const onnxruntime::GraphViewer& graph_
return use_subgraph;
}
void MKLDNNExecutionProvider::CreateOrUpdateMklDnnNode(const Node* node,
std::shared_ptr<mkl_dnn::Subgraph>& subgraph_ptr,
mkl_dnn::Subgraph::SubgraphVariables& sub_var,
void DNNLExecutionProvider::CreateOrUpdateDnnlNode(const Node* node,
std::shared_ptr<ort_dnnl::Subgraph>& subgraph_ptr,
ort_dnnl::Subgraph::SubgraphVariables& sub_var,
bool fused,
std::map<std::string, size_t>& output_to_source_node_map,
NodeAttributes& subgraph_attributes) const {
@ -123,29 +121,29 @@ void MKLDNNExecutionProvider::CreateOrUpdateMklDnnNode(const Node* node,
sub_var.outputs.push_back(node->OutputDefs()[0]->Name());
if (!fused) {
mkl_dnn::MklDnnNode mkldnn_node;
mkldnn_node.name = node->OpType();
mkldnn_node.num_inputs = static_cast<int>(node->InputDefs().size());
mkldnn_node.input_start_index = static_cast<int>(sub_var.inputs.size()) - 1;
mkldnn_node.node_index = static_cast<int>(subgraph_ptr->mkldnn_nodes.size()) + 1;
ort_dnnl::DnnlNode dnnl_node;
dnnl_node.name = node->OpType();
dnnl_node.num_inputs = static_cast<int>(node->InputDefs().size());
dnnl_node.input_start_index = static_cast<int>(sub_var.inputs.size()) - 1;
dnnl_node.node_index = static_cast<int>(subgraph_ptr->dnnl_nodes.size()) + 1;
const auto& node_outputs = node->OutputDefs();
mkldnn_node.output_name = node_outputs[0]->Name();
dnnl_node.output_name = node_outputs[0]->Name();
if (node->OpType() == "Conv") {
mkldnn_node.weight_name = node->InputDefs()[1]->Name();
dnnl_node.weight_name = node->InputDefs()[1]->Name();
}
for (size_t i = 0; i < node_inputs.size(); i++) {
auto iter = output_to_source_node_map.find(node_inputs[i]->Name());
if (iter != output_to_source_node_map.end())
mkldnn_node.parent_nodes.push_back(iter->second);
dnnl_node.parent_nodes.push_back(iter->second);
}
subgraph_ptr->mkldnn_nodes.push_back(mkldnn_node);
output_to_source_node_map.insert(std::make_pair(node_outputs[0]->Name(), subgraph_ptr->mkldnn_nodes.size() - 1));
subgraph_ptr->dnnl_nodes.push_back(dnnl_node);
output_to_source_node_map.insert(std::make_pair(node_outputs[0]->Name(), subgraph_ptr->dnnl_nodes.size() - 1));
} else {
subgraph_ptr->mkldnn_nodes.back().num_inputs += static_cast<int>(node->InputDefs().size() - 1);
subgraph_ptr->dnnl_nodes.back().num_inputs += static_cast<int>(node->InputDefs().size() - 1);
const auto& node_outputs = node->OutputDefs();
output_to_source_node_map.erase(subgraph_ptr->mkldnn_nodes.back().output_name);
subgraph_ptr->mkldnn_nodes.back().output_name = node_outputs[0]->Name();
output_to_source_node_map.insert(std::make_pair(node_outputs[0]->Name(), subgraph_ptr->mkldnn_nodes.size() - 1));
output_to_source_node_map.erase(subgraph_ptr->dnnl_nodes.back().output_name);
subgraph_ptr->dnnl_nodes.back().output_name = node_outputs[0]->Name();
output_to_source_node_map.insert(std::make_pair(node_outputs[0]->Name(), subgraph_ptr->dnnl_nodes.size() - 1));
}
// Add inputs which are not in the outputs vector.
@ -163,7 +161,7 @@ void MKLDNNExecutionProvider::CreateOrUpdateMklDnnNode(const Node* node,
NodeAttributes attributes = node->GetAttributes();
if (attributes.size() > 0) {
size_t index = subgraph_ptr->mkldnn_nodes.size();
size_t index = subgraph_ptr->dnnl_nodes.size();
std::string op_name;
if (fused) {
for (auto iter = node->InputNodesBegin(); iter != node->InputNodesEnd(); ++iter) {
@ -181,29 +179,27 @@ void MKLDNNExecutionProvider::CreateOrUpdateMklDnnNode(const Node* node,
}
}
std::vector<std::unique_ptr<ComputeCapability>> MKLDNNExecutionProvider::GetCapability(
std::vector<std::unique_ptr<ComputeCapability>> DNNLExecutionProvider::GetCapability(
const onnxruntime::GraphViewer& graph_viewer,
const std::vector<const KernelRegistry*>& kernel_registries) const {
ORT_UNUSED_PARAMETER(kernel_registries);
// temporary switch to toggle between mkldnn-vanilla and mkldnn-subgraph implementation using
// ORT_MKLDNN_SUBGRAPH environment variable
if (UseSubgraph(graph_viewer) == false) {
return IExecutionProvider::GetCapability(graph_viewer, kernel_registries);
}
LOGS_DEFAULT(INFO) << "Using MKL-DNN Subgraph";
LOGS_DEFAULT(INFO) << "Using DNNL Subgraph";
// use sub-graph implementation
std::vector<std::unique_ptr<ComputeCapability>> result;
mkl_dnn::Subgraph::SubgraphVariables sub_var;
std::shared_ptr<mkl_dnn::Subgraph> subgraph_ptr;
ort_dnnl::Subgraph::SubgraphVariables sub_var;
std::shared_ptr<ort_dnnl::Subgraph> subgraph_ptr;
// We need graph name make PrimitivePool keys unique.
// There are several identical graphs in Model zoo and only differ in
// few attribute values. GetGraphName return graph-name + first-node-output name
std::string graph_name = GetGraphName(graph_viewer);
subgraph_ptr = onnxruntime::make_unique<mkl_dnn::Subgraph>(
mkl_dnn::Subgraph(graph_name));
subgraph_ptr = onnxruntime::make_unique<ort_dnnl::Subgraph>(
ort_dnnl::Subgraph(graph_name));
// output name to node index map. Using it to find sub-graph end nodes
// if output of a node is not an input to any node in a sub-graph is end node
@ -220,43 +216,43 @@ std::vector<std::unique_ptr<ComputeCapability>> MKLDNNExecutionProvider::GetCapa
if (IsDimensionSupported(node) == false) {
node_index++;
if (subgraph_ptr->mkldnn_nodes.size() > 0) {
if (subgraph_ptr->dnnl_nodes.size() > 0) {
CreateMetaDef(graph_viewer, subgraph_attributes, subgraph_ptr, sub_var, result);
subgraph_ptr = std::make_shared<mkl_dnn::Subgraph>(mkl_dnn::Subgraph(graph_name));
subgraph_ptr = std::make_shared<ort_dnnl::Subgraph>(ort_dnnl::Subgraph(graph_name));
subgraph_attributes.clear();
output_to_source_node_map.clear();
}
continue;
}
auto op_it = mkldnn_ops_.find(node->OpType());
if (op_it != mkldnn_ops_.end()) {
auto op_it = dnnl_ops_.find(node->OpType());
if (op_it != dnnl_ops_.end()) {
sub_var.subgraph_node_indexes.push_back(node->Index());
// can we fuse (at mkldnn level) nodes?
// can we fuse (at Dnnl level) nodes?
bool fused = false;
if (sub_var.subgraph_node_indexes.size() > 1 && node->OpType() == "BatchNormalization") {
if (subgraph_ptr->mkldnn_nodes.back().name == "Conv") {
subgraph_ptr->mkldnn_nodes.back().name += "-BatchNormalization";
if (subgraph_ptr->dnnl_nodes.back().name == "Conv") {
subgraph_ptr->dnnl_nodes.back().name += "-BatchNormalization";
fused = true;
}
}
if (sub_var.subgraph_node_indexes.size() > 1 && node->OpType() == "Relu") {
if (subgraph_ptr->mkldnn_nodes.back().name == "Conv-BatchNormalization" || subgraph_ptr->mkldnn_nodes.back().name == "BatchNormalization" || subgraph_ptr->mkldnn_nodes.back().name == "Conv") {
subgraph_ptr->mkldnn_nodes.back().name += "-Relu";
if (subgraph_ptr->dnnl_nodes.back().name == "Conv-BatchNormalization" || subgraph_ptr->dnnl_nodes.back().name == "BatchNormalization" || subgraph_ptr->dnnl_nodes.back().name == "Conv") {
subgraph_ptr->dnnl_nodes.back().name += "-Relu";
fused = true;
}
}
// Create MklDnn node:
// Create Dnnl node:
// Update inputs, outputs and parent nodes
// Collect attributes and modify the key to make it unique
CreateOrUpdateMklDnnNode(node, subgraph_ptr, sub_var, fused, output_to_source_node_map, subgraph_attributes);
CreateOrUpdateDnnlNode(node, subgraph_ptr, sub_var, fused, output_to_source_node_map, subgraph_attributes);
auto temp_index = node_index + 1;
if (temp_index < graph_viewer.MaxNodeIndex()) {
if (!sub_var.subgraph_node_indexes.empty()) {
// if next node is mkldnn node and if it's input is not output of current node
// if next node is Dnnl node and if it's input is not output of current node
// if next node input is output of any of the nodes in sub-graph continue
// else
// break and create sub-graph
@ -265,8 +261,8 @@ std::vector<std::unique_ptr<ComputeCapability>> MKLDNNExecutionProvider::GetCapa
temp_index++;
next_node = graph_viewer.GetNode(temp_index);
}
auto sub_it = mkldnn_ops_.find(next_node->OpType());
if (sub_it != mkldnn_ops_.end()) {
auto sub_it = dnnl_ops_.find(next_node->OpType());
if (sub_it != dnnl_ops_.end()) {
const auto& next_node_inputs = next_node->InputDefs();
bool input_from_subgraph = true;
size_t inputs_count = 1;
@ -282,7 +278,7 @@ std::vector<std::unique_ptr<ComputeCapability>> MKLDNNExecutionProvider::GetCapa
if (input_from_subgraph == false) {
CreateMetaDef(graph_viewer, subgraph_attributes, subgraph_ptr, sub_var, result);
subgraph_attributes.clear();
subgraph_ptr = std::make_shared<mkl_dnn::Subgraph>(mkl_dnn::Subgraph(graph_name));
subgraph_ptr = std::make_shared<ort_dnnl::Subgraph>(ort_dnnl::Subgraph(graph_name));
output_to_source_node_map.clear();
}
}
@ -290,8 +286,8 @@ std::vector<std::unique_ptr<ComputeCapability>> MKLDNNExecutionProvider::GetCapa
if (!sub_var.subgraph_node_indexes.empty()) {
if (node->GetOutputEdgesCount() > 1) {
// If current node has branches
// iterate and see if all nodes are mkldnn ops OR
// it ends in node with same number of input edges (mkldnn node or cpu node)
// iterate and see if all nodes are Dnnl ops OR
// it ends in node with same number of input edges (Dnnl node or cpu node)
// create sub-graph
bool create_subgraph = false;
bool break_loop = false;
@ -303,14 +299,14 @@ std::vector<std::unique_ptr<ComputeCapability>> MKLDNNExecutionProvider::GetCapa
next_node = graph_viewer.GetNode(temp_index);
}
if (next_node->GetInputEdgesCount() == node->GetOutputEdgesCount()) {
// if all nodes in the branch loop are mkldnn nodes
// if all nodes in the branch loop are Dnnl nodes
// then continue with adding nodes to sub-graph
break_loop = true;
}
// inner nodes. if inner nodes are not mkldnn nodes
// inner nodes. if inner nodes are not Dnnl nodes
// create subgraph (inception v2)
auto sub_it = mkldnn_ops_.find(next_node->OpType());
if (sub_it == mkldnn_ops_.end()) {
auto sub_it = dnnl_ops_.find(next_node->OpType());
if (sub_it == dnnl_ops_.end()) {
// break and create a sub-graph
break_loop = true;
create_subgraph = true;
@ -322,7 +318,7 @@ std::vector<std::unique_ptr<ComputeCapability>> MKLDNNExecutionProvider::GetCapa
}
if (create_subgraph) {
CreateMetaDef(graph_viewer, subgraph_attributes, subgraph_ptr, sub_var, result);
subgraph_ptr = std::make_shared<mkl_dnn::Subgraph>(mkl_dnn::Subgraph(graph_name));
subgraph_ptr = std::make_shared<ort_dnnl::Subgraph>(ort_dnnl::Subgraph(graph_name));
subgraph_attributes.clear();
output_to_source_node_map.clear();
}
@ -332,7 +328,7 @@ std::vector<std::unique_ptr<ComputeCapability>> MKLDNNExecutionProvider::GetCapa
} else {
if (!sub_var.subgraph_node_indexes.empty()) {
CreateMetaDef(graph_viewer, subgraph_attributes, subgraph_ptr, sub_var, result);
subgraph_ptr = std::make_shared<mkl_dnn::Subgraph>(mkl_dnn::Subgraph(graph_name));
subgraph_ptr = std::make_shared<ort_dnnl::Subgraph>(ort_dnnl::Subgraph(graph_name));
subgraph_attributes.clear();
output_to_source_node_map.clear();
}
@ -341,17 +337,17 @@ std::vector<std::unique_ptr<ComputeCapability>> MKLDNNExecutionProvider::GetCapa
} // graph_viewer node iterator ends
if (!sub_var.subgraph_node_indexes.empty()) {
CreateMetaDef(graph_viewer, subgraph_attributes, subgraph_ptr, sub_var, result);
subgraph_ptr = std::make_shared<mkl_dnn::Subgraph>(mkl_dnn::Subgraph(graph_name));
subgraph_ptr = std::make_shared<ort_dnnl::Subgraph>(ort_dnnl::Subgraph(graph_name));
subgraph_attributes.clear();
output_to_source_node_map.clear();
}
return result;
}
void MKLDNNExecutionProvider::CreateMetaDef(const onnxruntime::GraphViewer& graph_viewer,
void DNNLExecutionProvider::CreateMetaDef(const onnxruntime::GraphViewer& graph_viewer,
const NodeAttributes& subgraph_attributes,
std::shared_ptr<mkl_dnn::Subgraph>& subgraph_ptr,
mkl_dnn::Subgraph::SubgraphVariables& sub_var,
std::shared_ptr<ort_dnnl::Subgraph>& subgraph_ptr,
ort_dnnl::Subgraph::SubgraphVariables& sub_var,
std::vector<std::unique_ptr<ComputeCapability>>& result) const {
std::string graph_fused_nodes;
std::string node_list;
@ -376,7 +372,7 @@ void MKLDNNExecutionProvider::CreateMetaDef(const onnxruntime::GraphViewer& grap
auto meta_def = onnxruntime::make_unique<::onnxruntime::IndexedSubGraph::MetaDef>();
meta_def->attributes["initializers"] = initializers;
meta_def->name = "MkldnnCustomOp" + std::to_string(subgraph_index_);
meta_def->name = "DnnlCustomOp" + std::to_string(subgraph_index_);
meta_def->domain = kMSDomain;
meta_def->since_version = 1;
meta_def->status = ONNX_NAMESPACE::EXPERIMENTAL;
@ -384,7 +380,7 @@ void MKLDNNExecutionProvider::CreateMetaDef(const onnxruntime::GraphViewer& grap
meta_def->attributes.insert(subgraph_attributes.begin(), subgraph_attributes.end());
// Find the end nodes
for (auto& mklnode : subgraph_ptr->mkldnn_nodes) {
for (auto& mklnode : subgraph_ptr->dnnl_nodes) {
auto itr = std::find(sub_var.outputs_as_input_other_node.begin(),
sub_var.outputs_as_input_other_node.end(), mklnode.output_name);
if (itr == sub_var.outputs_as_input_other_node.end()) {
@ -407,25 +403,25 @@ void MKLDNNExecutionProvider::CreateMetaDef(const onnxruntime::GraphViewer& grap
sub_var.Reset();
}
Status MKLDNNExecutionProvider::Compile(const std::vector<onnxruntime::Node*>& fused_nodes,
Status DNNLExecutionProvider::Compile(const std::vector<onnxruntime::Node*>& fused_nodes,
std::vector<NodeComputeInfo>& node_compute_funcs) {
for (const auto* fused_node : fused_nodes) {
auto attributes = fused_node->GetAttributes();
NodeComputeInfo compute_info;
compute_info.create_state_func = [=](ComputeContext* context, FunctionState* state) {
auto* p = new onnxruntime::mkl_dnn::MkldnnFuncKernel<float>(context, attributes, this);
auto* p = new onnxruntime::ort_dnnl::DnnlFuncKernel<float>(context, attributes, this);
*state = p;
return 0;
};
compute_info.release_state_func = [](FunctionState state) {
if (state)
delete static_cast<onnxruntime::mkl_dnn::MkldnnFuncKernel<float>*>(state);
delete static_cast<onnxruntime::ort_dnnl::DnnlFuncKernel<float>*>(state);
};
compute_info.compute_func = [](FunctionState state, const OrtCustomOpApi* api, OrtKernelContext* context) {
onnxruntime::mkl_dnn::MkldnnFuncKernel<float>* custom_op = reinterpret_cast<mkl_dnn::MkldnnFuncKernel<float>*>(state);
onnxruntime::ort_dnnl::DnnlFuncKernel<float>* custom_op = reinterpret_cast<ort_dnnl::DnnlFuncKernel<float>*>(state);
return custom_op->Compute(api, context);
};

View file

@ -12,32 +12,32 @@
#include "core/graph/constants.h"
#include "core/framework/allocatormgr.h"
#include "core/framework/execution_provider.h"
#include "core/providers/mkldnn/subgraph/subgraph.h"
#include "core/providers/dnnl/subgraph/subgraph.h"
namespace mkldnn {
namespace dnnl {
struct memory;
};
namespace onnxruntime {
// Information needed to construct MKL-DNN execution providers.
struct MKLDNNExecutionProviderInfo {
// Information needed to construct DNNL execution providers.
struct DNNLExecutionProviderInfo {
bool create_arena{true};
explicit MKLDNNExecutionProviderInfo(bool use_arena)
explicit DNNLExecutionProviderInfo(bool use_arena)
: create_arena(use_arena) {}
MKLDNNExecutionProviderInfo() = default;
DNNLExecutionProviderInfo() = default;
};
// Logical device representation.
class MKLDNNExecutionProvider : public IExecutionProvider {
class DNNLExecutionProvider : public IExecutionProvider {
public:
explicit MKLDNNExecutionProvider(const MKLDNNExecutionProviderInfo& info);
virtual ~MKLDNNExecutionProvider();
explicit DNNLExecutionProvider(const DNNLExecutionProviderInfo& info);
virtual ~DNNLExecutionProvider();
virtual std::shared_ptr<KernelRegistry> GetKernelRegistry() const override;
std::shared_ptr<mkldnn::memory> GetWeightsMemoryBuffer(const std::string& weight_key) {
std::shared_ptr<dnnl::memory> GetWeightsMemoryBuffer(const std::string& weight_key) {
auto iter = weights_mem_map_.find(weight_key);
if (iter != weights_mem_map_.end())
return iter->second;
@ -45,7 +45,7 @@ class MKLDNNExecutionProvider : public IExecutionProvider {
}
void SetWeightsMemoryBuffer(const std::string& weight_key,
const std::shared_ptr<mkldnn::memory>& filter_dst_mem) {
const std::shared_ptr<dnnl::memory>& filter_dst_mem) {
weights_mem_map_.insert(std::make_pair(weight_key, filter_dst_mem));
}
@ -58,7 +58,7 @@ class MKLDNNExecutionProvider : public IExecutionProvider {
reordered_buffers_.push_back(std::move(buffer));
}
std::shared_ptr<mkldnn::memory> GetBiasMemoryBuffer(const std::string& key) {
std::shared_ptr<dnnl::memory> GetBiasMemoryBuffer(const std::string& key) {
auto iter = bias_mem_map_.find(key);
if (iter != bias_mem_map_.end())
return iter->second;
@ -67,7 +67,7 @@ class MKLDNNExecutionProvider : public IExecutionProvider {
// Conv+BatchNorm fusion. Save scaled bias memory.
void SetBiasMemoryBuffer(const std::string& key,
const std::shared_ptr<mkldnn::memory>& bias_mem) {
const std::shared_ptr<dnnl::memory>& bias_mem) {
bias_mem_map_.insert(std::make_pair(key, bias_mem));
}
@ -84,14 +84,14 @@ class MKLDNNExecutionProvider : public IExecutionProvider {
std::vector<NodeComputeInfo>& node_compute_funcs) override;
private:
// mkldnn weights(filer data) memory blocks from first iteration
// dnnl weights (filter data) memory blocks from first iteration
// saved by weights name
std::unordered_map<std::string, std::shared_ptr<mkldnn::memory>> weights_mem_map_;
std::unordered_map<std::string, std::shared_ptr<dnnl::memory>> weights_mem_map_;
// Save reordered memory buffers in list so that memory is not freed.
std::vector<IAllocatorUniquePtr<void>> reordered_buffers_;
// conv+batchnorm fusion. normalized bias memory blocks from first iteration
std::unordered_map<std::string, std::shared_ptr<mkldnn::memory>> bias_mem_map_;
std::unordered_map<std::string, std::shared_ptr<dnnl::memory>> bias_mem_map_;
// Conv+BatchNorm fusion bias memory buffer.
std::vector<IAllocatorUniquePtr<void>> biass_buffers_;
OrtMutex mutex_;
@ -123,7 +123,7 @@ class MKLDNNExecutionProvider : public IExecutionProvider {
bool UseSubgraph(const onnxruntime::GraphViewer& graph_viewer) const;
// Some dimensions are not supported by MKL-DNN
// Some dimensions are not supported by DNNL
// example: Pool with NumDimensions <= 3 is not supported
// Fall back to CPU implementation
bool IsDimensionSupported(const Node* node) const {
@ -140,43 +140,40 @@ class MKLDNNExecutionProvider : public IExecutionProvider {
supported = false;
}
if (node->Op()->SinceVersion() == 10)
supported = false;
if (node->OutputDefs().size() > 1)
supported = false;
}
return supported;
}
void CreateOrUpdateMklDnnNode(const Node* node,
std::shared_ptr<mkl_dnn::Subgraph>& subgraph_ptr,
mkl_dnn::Subgraph::SubgraphVariables& sub_var,
void CreateOrUpdateDnnlNode(const Node* node,
std::shared_ptr<ort_dnnl::Subgraph>& subgraph_ptr,
ort_dnnl::Subgraph::SubgraphVariables& sub_var,
bool fused,
std::map<std::string, size_t>& output_to_source_node_map,
NodeAttributes& subgraph_attributes) const;
// Create MklDnn node, update inputs, outputs and parent nodes
// Create Dnnl node, update inputs, outputs and parent nodes
// collect attributes
void CreateMetaDef(const onnxruntime::GraphViewer& graph_viewer,
const NodeAttributes& subgraph_attributes,
std::shared_ptr<mkl_dnn::Subgraph>& subgraph_ptr,
mkl_dnn::Subgraph::SubgraphVariables& sub_var,
std::shared_ptr<ort_dnnl::Subgraph>& subgraph_ptr,
ort_dnnl::Subgraph::SubgraphVariables& sub_var,
std::vector<std::unique_ptr<ComputeCapability>>& result) const;
public:
const std::shared_ptr<mkl_dnn::Subgraph> GetMklDnnSubgraph(const std::string& subgraph_id) {
const std::shared_ptr<ort_dnnl::Subgraph> GetDnnlSubgraph(const std::string& subgraph_id) {
return mkl_subgraphs_[subgraph_id];
}
private:
mutable int subgraph_index_ = 0;
// supported MklDnn Operators
std::set<std::string> mkldnn_ops_ = {"Conv", "BatchNormalization", "Relu", "Sum",
// supported Dnnl Operators
std::set<std::string> dnnl_ops_ = {"Conv", "BatchNormalization", "Relu", "Sum",
"AveragePool", "GlobalMaxPool", "GlobalAveragePool", "MaxPool", "LRN"};
mutable std::unordered_map<std::string, std::shared_ptr<mkl_dnn::Subgraph>> mkl_subgraphs_;
mutable std::unordered_map<std::string, std::shared_ptr<ort_dnnl::Subgraph>> mkl_subgraphs_;
};
} // namespace onnxruntime

View file

@ -4,7 +4,7 @@
#pragma once
namespace onnxruntime {
namespace mkl_dnn {
namespace ort_dnnl {
template <typename T>
KernelCreateInfo BuildKernelCreateInfo();
}

View file

@ -0,0 +1,38 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/dnnl/dnnl_provider_factory.h"
#include <atomic>
#include "dnnl_execution_provider.h"
#include "core/session/abi_session_options_impl.h"
using namespace onnxruntime;
namespace onnxruntime {
struct DnnlProviderFactory : IExecutionProviderFactory {
DnnlProviderFactory(bool create_arena) : create_arena_(create_arena) {}
~DnnlProviderFactory() override {}
std::unique_ptr<IExecutionProvider> CreateProvider() override;
private:
bool create_arena_;
};
std::unique_ptr<IExecutionProvider> DnnlProviderFactory::CreateProvider() {
DNNLExecutionProviderInfo info;
info.create_arena = create_arena_;
return onnxruntime::make_unique<DNNLExecutionProvider>(info);
}
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Dnnl(int device_id) {
return std::make_shared<onnxruntime::DnnlProviderFactory>(device_id);
//TODO: This is apparently a bug. The constructor parameter is the create-arena flag, not the device-id
}
} // namespace onnxruntime
ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProvider_Dnnl, _In_ OrtSessionOptions* options, int use_arena) {
options->provider_factories.push_back(onnxruntime::CreateExecutionProviderFactory_Dnnl(use_arena));
return nullptr;
}
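For reference, the entry point declared above is how applications opt into the renamed provider. A minimal usage sketch, assuming the usual ORT C++ wrapper headers; the include paths and "model.onnx" are illustrative placeholders, not part of this change:

// Usage sketch (illustrative, not part of this commit): registering the DNNL
// execution provider through the C entry point declared above.
#include "core/session/onnxruntime_cxx_api.h"           // include paths assumed
#include "core/providers/dnnl/dnnl_provider_factory.h"

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "dnnl_demo");
  Ort::SessionOptions so;
  // Despite the parameter name in CreateExecutionProviderFactory_Dnnl, the
  // integer passed here is the use_arena flag (see the TODO above).
  OrtStatus* status = OrtSessionOptionsAppendExecutionProvider_Dnnl(so, /*use_arena=*/1);
  if (status != nullptr) return 1;                       // nullptr means success
  Ort::Session session(env, "model.onnx", so);           // "model.onnx" is a placeholder path
  return 0;
}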

View file

@ -4,18 +4,18 @@
#include "gemm.h"
#include "core/providers/cpu/math/gemm_helper.h"
#include "core/util/math_cpuonly.h"
#include "mkldnn.h"
#include "mkldnn.hpp"
#include "core/providers/mkldnn/mkldnn_fwd.h"
#include "dnnl.h"
#include "dnnl.hpp"
#include "core/providers/dnnl/dnnl_fwd.h"
namespace onnxruntime {
namespace mkl_dnn {
namespace ort_dnnl {
ONNX_OPERATOR_KERNEL_EX(
Gemm,
kOnnxDomain,
7,
kMklDnnExecutionProvider,
kDnnlExecutionProvider,
KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
Gemm<float>);
@ -29,9 +29,9 @@ Status Gemm<float>::Compute(OpKernelContext* ctx) const {
if (!helper.State().IsOK())
return helper.State();
mkldnn::memory::dim M = gsl::narrow_cast<int>(helper.M());
mkldnn::memory::dim N = gsl::narrow_cast<int>(helper.N());
mkldnn::memory::dim K = gsl::narrow_cast<int>(helper.K());
dnnl::memory::dim M = gsl::narrow_cast<int>(helper.M());
dnnl::memory::dim N = gsl::narrow_cast<int>(helper.N());
dnnl::memory::dim K = gsl::narrow_cast<int>(helper.K());
auto Y = ctx->Output(0, TensorShape({M, N}));
if (M <= 0)
@ -81,19 +81,19 @@ Status Gemm<float>::Compute(OpKernelContext* ctx) const {
}
}
// mkldnn_sgemm expects row major matrices, so no need to swap the operands A and B
auto status = mkldnn_sgemm(trans_A_ ? 'T' : 'N',
// dnnl_sgemm expects row major matrices, so no need to swap the operands A and B
auto status = dnnl_sgemm(trans_A_ ? 'T' : 'N',
trans_B_ ? 'T' : 'N',
M, N, K,
alpha_, X->template Data<float>() , trans_A_ ? M : K,
W->template Data<float>(), trans_B_ ? K : N,
beta_, Y->template MutableData<float>(), N);
if (status == mkldnn_success) {
if (status == dnnl_success) {
return Status::OK();
} else {
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "mkldnn_sgemm failed with status: ", status);
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "DNNL_sgemm failed with status: ", status);
}
}
} // namespace mkl_dnn
} // namespace ort_dnnl
} // namespace onnxruntime
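The Compute method above forwards directly to the renamed C function. A self-contained sketch of the same dnnl_sgemm call pattern on a small row-major product follows; sgemm_demo and its values are illustrative only:

#include <array>
#include "dnnl.h"   // renamed C header; include path may differ per build

// Illustrative sketch: C = alpha * A * B + beta * C for a row-major 2x3 * 3x2.
inline dnnl_status_t sgemm_demo() {
  std::array<float, 6> A = {1, 2, 3, 4, 5, 6};   // 2x3, lda = 3
  std::array<float, 6> B = {1, 0, 0, 1, 1, 1};   // 3x2, ldb = 2
  std::array<float, 4> C = {0, 0, 0, 0};         // 2x2 result, ldc = 2
  // On success C holds {4, 5, 10, 11}.
  return dnnl_sgemm('N', 'N', /*M=*/2, /*N=*/2, /*K=*/3,
                    1.0f, A.data(), 3,
                    B.data(), 2,
                    0.0f, C.data(), 2);
}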

View file

@ -5,7 +5,7 @@
#include "core/framework/op_kernel.h"
namespace onnxruntime {
namespace mkl_dnn {
namespace ort_dnnl {
template <typename T>
class Gemm final : public OpKernel {
public:
@ -29,5 +29,5 @@ class Gemm final : public OpKernel {
float alpha_;
float beta_;
};
} // namespace mkl_dnn
} // namespace ort_dnnl
} // namespace onnxruntime

View file

@ -5,29 +5,29 @@
#include "core/util/math.h"
#include "core/util/math_cpuonly.h"
#include "core/framework/op_kernel.h"
#include "core/providers/mkldnn/mkldnn_fwd.h"
#include "core/providers/mkldnn/mkldnn_execution_provider.h"
#include "core/providers/mkldnn/subgraph/mkldnn_kernel.h"
#include "core/providers/dnnl/dnnl_fwd.h"
#include "core/providers/dnnl/dnnl_execution_provider.h"
#include "core/providers/dnnl/subgraph/dnnl_kernel.h"
namespace onnxruntime {
namespace mkl_dnn {
namespace ort_dnnl {
template <typename T>
class MklDnnRelu : public MklDnnKernel {
class DnnlRelu : public DnnlKernel {
public:
MklDnnRelu(const MklDnnNode& node,
MKLDNNExecutionProvider* provider,
DnnlRelu(const DnnlNode& node,
DNNLExecutionProvider* provider,
const NodeAttributes& attributes,
const std::string attributes_prefix = "") : MklDnnKernel(node, provider) {
const std::string attributes_prefix = "") : DnnlKernel(node, provider) {
ORT_UNUSED_PARAMETER(attributes);
ORT_UNUSED_PARAMETER(attributes_prefix);
}
void CreatePrimitives(const OrtCustomOpApi* api,
OrtKernelContext* context,
mkldnn::engine& cpu_engine,
std::vector<mkldnn::primitive>& net,
std::vector<std::unordered_map<int, mkldnn::memory>>& net_args) {
dnnl::engine& cpu_engine,
std::vector<dnnl::primitive>& net,
std::vector<std::unordered_map<int, dnnl::memory>>& net_args) {
Ort::CustomOpApi ort{*api};
int input_index = mklnode_ptr_->input_start_index < 0 ? 0 : mklnode_ptr_->input_start_index;
@ -41,7 +41,7 @@ class MklDnnRelu : public MklDnnKernel {
auto xshape = tensor_shape.data();
auto xdim = tensor_shape.size();
mkldnn::memory::dims dims(xdim);
dnnl::memory::dims dims(xdim);
ort_source_format_ = GetSourceFormat(static_cast<int>(xdim));
@ -52,19 +52,19 @@ class MklDnnRelu : public MklDnnKernel {
return;
}
mkldnn::memory::dims src_dims(
dnnl::memory::dims src_dims(
x_shape.GetDims().begin(), x_shape.GetDims().end());
ort_source_desc_ = mkldnn::memory::desc(
{src_dims}, MklDnnType<T>(), ort_source_format_);
ort_source_desc_ = dnnl::memory::desc(
{src_dims}, DnnnType<T>(), ort_source_format_);
source_desc_ = ort_source_desc_;
src_md_ = onnxruntime::make_unique<mkldnn::memory::desc>(
mkldnn::memory::desc({src_dims}, MklDnnType<T>(), ort_source_format_));
src_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory({{src_dims}, MklDnnType<T>(), ort_source_format_}, cpu_engine, nullptr));
src_md_ = onnxruntime::make_unique<dnnl::memory::desc>(
dnnl::memory::desc({src_dims}, DnnnType<T>(), ort_source_format_));
src_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory({{src_dims}, DnnnType<T>(), ort_source_format_}, cpu_engine, nullptr));
} else {
src_md_ = onnxruntime::make_unique<mkldnn::memory::desc>(
mkldnn::memory::desc(parents_[0].get()->primitive_dst_desc_));
src_md_ = onnxruntime::make_unique<dnnl::memory::desc>(
dnnl::memory::desc(parents_[0].get()->primitive_dst_desc_));
src_mem_ = parents_[0].get()->primitive_dst_mem_;
x_shape = parents_[0].get()->primitive_dst_shape_;
ort_source_format_ = parents_[0].get()->ort_source_format_;
@ -74,12 +74,12 @@ class MklDnnRelu : public MklDnnKernel {
primitive_dst_shape_ = TensorShape(x_shape);
mkldnn::memory::dims dst_dims_mkl(primitive_dst_shape_.GetDims().begin(), primitive_dst_shape_.GetDims().end());
mkldnn::algorithm algo = mkldnn::algorithm::eltwise_relu;
fwd_desc_ = onnxruntime::make_unique<mkldnn::eltwise_forward::desc>(
mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward_inference, algo, *src_md_, 0));
relu_fwd_pd_ = onnxruntime::make_unique<mkldnn::eltwise_forward::primitive_desc>(
mkldnn::eltwise_forward::primitive_desc(*fwd_desc_, cpu_engine));
dnnl::memory::dims dst_dims_mkl(primitive_dst_shape_.GetDims().begin(), primitive_dst_shape_.GetDims().end());
dnnl::algorithm algo = dnnl::algorithm::eltwise_relu;
fwd_desc_ = onnxruntime::make_unique<dnnl::eltwise_forward::desc>(
dnnl::eltwise_forward::desc(dnnl::prop_kind::forward_inference, algo, *src_md_, 0));
relu_fwd_pd_ = onnxruntime::make_unique<dnnl::eltwise_forward::primitive_desc>(
dnnl::eltwise_forward::primitive_desc(*fwd_desc_, cpu_engine));
primitive_src_desc_ = relu_fwd_pd_.get()->src_desc();
primitive_dst_desc_ = relu_fwd_pd_.get()->dst_desc();
@ -89,29 +89,29 @@ class MklDnnRelu : public MklDnnKernel {
if (primitive_dst_desc_ != ort_source_desc_) {
// reorder needed. Use primitive output as input to reorder and
// allocate buffer for reorder output, final output of this subgraph
primitive_dst_mem_ = std::make_shared<mkldnn::memory>(mkldnn::memory(relu_fwd_pd_.get()->dst_desc(), cpu_engine));
primitive_dst_mem_ = std::make_shared<dnnl::memory>(dnnl::memory(relu_fwd_pd_.get()->dst_desc(), cpu_engine));
} else {
// Last node but re-order not needed. Allocate buffer to output of this node
primitive_dst_mem_ = std::make_shared<mkldnn::memory>(mkldnn::memory(relu_fwd_pd_.get()->dst_desc(), cpu_engine, nullptr));
primitive_dst_mem_ = std::make_shared<dnnl::memory>(dnnl::memory(relu_fwd_pd_.get()->dst_desc(), cpu_engine, nullptr));
}
} else {
// Intermediate node. Use mkldnn kernel internal memory for output and
// Intermediate node. Use dnnl kernel internal memory for output and
// use this as input to next node.
primitive_dst_mem_ = std::make_shared<mkldnn::memory>(mkldnn::memory(relu_fwd_pd_.get()->dst_desc(), cpu_engine));
primitive_dst_mem_ = std::make_shared<dnnl::memory>(dnnl::memory(relu_fwd_pd_.get()->dst_desc(), cpu_engine));
}
relu_fwd_ = onnxruntime::make_unique<mkldnn::eltwise_forward>(
mkldnn::eltwise_forward(*relu_fwd_pd_));
relu_fwd_ = onnxruntime::make_unique<dnnl::eltwise_forward>(
dnnl::eltwise_forward(*relu_fwd_pd_));
net.push_back(*relu_fwd_);
net_args.push_back({{MKLDNN_ARG_SRC, *src_mem_},
{MKLDNN_ARG_DST, *primitive_dst_mem_}});
net_args.push_back({{DNNL_ARG_SRC, *src_mem_},
{DNNL_ARG_DST, *primitive_dst_mem_}});
if (mklnode_ptr_->output_index >= 0) {
// one of the end nodes. Allocate output buffer memory and
// reorder is necessary
mkldnn::memory::data_type t = MklDnnType<T>();
dnnl::memory::data_type t = DnnnType<T>();
InitDstReorderOutput(cpu_engine, t, net, net_args);
}
}
@ -147,13 +147,13 @@ class MklDnnRelu : public MklDnnKernel {
}
private:
std::shared_ptr<mkldnn::memory> src_mem_;
std::shared_ptr<dnnl::memory> src_mem_;
std::unique_ptr<mkldnn::eltwise_forward::desc> fwd_desc_;
std::unique_ptr<mkldnn::eltwise_forward::primitive_desc> relu_fwd_pd_;
std::unique_ptr<mkldnn::primitive> relu_fwd_;
std::unique_ptr<dnnl::eltwise_forward::desc> fwd_desc_;
std::unique_ptr<dnnl::eltwise_forward::primitive_desc> relu_fwd_pd_;
std::unique_ptr<dnnl::primitive> relu_fwd_;
std::unique_ptr<mkldnn::memory::desc> src_md_;
std::unique_ptr<dnnl::memory::desc> src_md_;
};
} // namespace mkl_dnn
} // namespace ort_dnnl
} // namespace onnxruntime
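All of these subgraph kernels follow the same DNNL 1.x lifecycle: build an operation desc, turn it into a primitive_desc on an engine, instantiate the primitive, then execute it with an argument map keyed by DNNL_ARG_*. A self-contained sketch of that flow for a standalone ReLU looks roughly like the following; the 1x8 shape and library-allocated buffers are illustrative assumptions:

#include <unordered_map>
#include "dnnl.hpp"   // renamed C++ header; include path may differ per build

// Illustrative sketch of the desc -> primitive_desc -> primitive -> execute
// flow the kernels above implement, for a standalone 1x8 ReLU.
inline void relu_demo() {
  dnnl::engine eng(dnnl::engine::kind::cpu, 0);
  dnnl::stream strm(eng);

  dnnl::memory::desc md({1, 8}, dnnl::memory::data_type::f32,
                        dnnl::memory::format_tag::nc);
  dnnl::memory src(md, eng);   // library-allocated buffers for the demo
  dnnl::memory dst(md, eng);

  auto desc = dnnl::eltwise_forward::desc(dnnl::prop_kind::forward_inference,
                                          dnnl::algorithm::eltwise_relu, md, 0.f);
  auto pd = dnnl::eltwise_forward::primitive_desc(desc, eng);
  auto relu = dnnl::eltwise_forward(pd);

  relu.execute(strm, {{DNNL_ARG_SRC, src}, {DNNL_ARG_DST, dst}});
  strm.wait();
}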

View file

@ -3,14 +3,14 @@
#pragma once
#include "core/framework/op_kernel.h"
#include "core/providers/mkldnn/mkldnn_fwd.h"
#include "core/providers/mkldnn/mkldnn_execution_provider.h"
#include "core/providers/mkldnn/subgraph/mkldnn_kernel.h"
#include "core/providers/mkldnn/memcpy_s.h"
#include "core/providers/dnnl/dnnl_fwd.h"
#include "core/providers/dnnl/dnnl_execution_provider.h"
#include "core/providers/dnnl/subgraph/dnnl_kernel.h"
#include "core/providers/dnnl/memcpy_s.h"
#include "core/util/math.h"
namespace onnxruntime {
namespace mkl_dnn {
namespace ort_dnnl {
class BatchNormHelper {
public:
@ -80,12 +80,12 @@ class BatchNormHelper {
};
template <typename T>
class MklDnnBatchNorm : public MklDnnKernel {
class DnnlBatchNorm : public DnnlKernel {
public:
explicit MklDnnBatchNorm(const MklDnnNode& node,
MKLDNNExecutionProvider* provider,
explicit DnnlBatchNorm(const DnnlNode& node,
DNNLExecutionProvider* provider,
const NodeAttributes& attributes,
const std::string attributes_prefix = "") : MklDnnKernel(node, provider) {
const std::string attributes_prefix = "") : DnnlKernel(node, provider) {
ReadAttributes(attributes, attributes_prefix);
}
void ReadAttributes(const NodeAttributes& attributes,
@ -99,9 +99,9 @@ class MklDnnBatchNorm : public MklDnnKernel {
void CreatePrimitives(const OrtCustomOpApi* api,
OrtKernelContext* context,
mkldnn::engine& cpu_engine,
std::vector<mkldnn::primitive>& net,
std::vector<std::unordered_map<int, mkldnn::memory>>& net_args) override {
dnnl::engine& cpu_engine,
std::vector<dnnl::primitive>& net,
std::vector<std::unordered_map<int, dnnl::memory>>& net_args) override {
Ort::CustomOpApi ort{*api};
int input_index = mklnode_ptr_->input_start_index < 0 ? 0 : mklnode_ptr_->input_start_index;
@ -113,22 +113,22 @@ class MklDnnBatchNorm : public MklDnnKernel {
ort.ReleaseTensorTypeAndShapeInfo(tensor_info);
auto xshape = tensor_shape.data();
auto xdim = tensor_shape.size();
mkldnn::memory::dims dims(xdim);
dnnl::memory::dims dims(xdim);
ort_source_format_ = GetSourceFormat(static_cast<int>(xdim));
x_shape = TensorShape(xshape, xdim);
mkldnn::memory::dims src_dims(
dnnl::memory::dims src_dims(
x_shape.GetDims().begin(), x_shape.GetDims().end());
ort_source_desc_ = mkldnn::memory::desc(
{src_dims}, MklDnnType<T>(), ort_source_format_);
ort_source_desc_ = dnnl::memory::desc(
{src_dims}, DnnnType<T>(), ort_source_format_);
source_desc_ = ort_source_desc_;
src_md_ = onnxruntime::make_unique<mkldnn::memory::desc>(
mkldnn::memory::desc({src_dims}, MklDnnType<T>(), ort_source_format_));
src_md_ = onnxruntime::make_unique<dnnl::memory::desc>(
dnnl::memory::desc({src_dims}, DnnnType<T>(), ort_source_format_));
} else {
src_md_ = onnxruntime::make_unique<mkldnn::memory::desc>(
mkldnn::memory::desc(parents_[0].get()->primitive_dst_desc_));
src_md_ = onnxruntime::make_unique<dnnl::memory::desc>(
dnnl::memory::desc(parents_[0].get()->primitive_dst_desc_));
x_shape = parents_[0].get()->primitive_dst_shape_;
ort_source_format_ = parents_[0].get()->ort_source_format_;
ort_source_desc_ = parents_[0].get()->ort_source_desc_;
@ -138,7 +138,7 @@ class MklDnnBatchNorm : public MklDnnKernel {
int num_dimensions = static_cast<int>(x_shape.NumDimensions());
if (num_dimensions == 3) {
primitive_created_status_ = ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"1D BatchNormalization is not supported in MKLDNN.");
"1D BatchNormalization is not supported in DNNL.");
return;
}
@ -182,105 +182,105 @@ class MklDnnBatchNorm : public MklDnnKernel {
return;
}
mkldnn::memory::dims src_dims_mkl(
dnnl::memory::dims src_dims_mkl(
x_shape.GetDims().begin(), x_shape.GetDims().end());
mkldnn::memory::dims scale_dims_mkl(
dnnl::memory::dims scale_dims_mkl(
scale_shape.GetDims().begin(), scale_shape.GetDims().end());
mkldnn::memory::dims b_dims_mkl(
dnnl::memory::dims b_dims_mkl(
b_shape.GetDims().begin(), b_shape.GetDims().end());
mkldnn::memory::dims mean_dims_mkl(
dnnl::memory::dims mean_dims_mkl(
mean_shape.GetDims().begin(), mean_shape.GetDims().end());
mkldnn::memory::dims var_dims_mkl(
dnnl::memory::dims var_dims_mkl(
var_shape.GetDims().begin(), var_shape.GetDims().end());
mkldnn::memory::dims dst_dims_mkl(
dnnl::memory::dims dst_dims_mkl(
primitive_dst_shape_.GetDims().begin(), primitive_dst_shape_.GetDims().end());
scale_shift_md_ = onnxruntime::make_unique<mkldnn::memory::desc>(
mkldnn::memory::desc({2, scale_dims_mkl[0]}, MklDnnType<T>(), mkldnn::memory::format_tag::nc));
mean_md_ = onnxruntime::make_unique<mkldnn::memory::desc>(
mkldnn::memory::desc({mean_dims_mkl}, MklDnnType<T>(), mkldnn::memory::format_tag::x));
var_md_ = onnxruntime::make_unique<mkldnn::memory::desc>(
mkldnn::memory::desc({var_dims_mkl}, MklDnnType<T>(), mkldnn::memory::format_tag::x));
primitive_dst_md_ = onnxruntime::make_unique<mkldnn::memory::desc>(
mkldnn::memory::desc({dst_dims_mkl}, MklDnnType<T>(), mkldnn::memory::format_tag::any));
scale_shift_md_ = onnxruntime::make_unique<dnnl::memory::desc>(
dnnl::memory::desc({2, scale_dims_mkl[0]}, DnnnType<T>(), dnnl::memory::format_tag::nc));
mean_md_ = onnxruntime::make_unique<dnnl::memory::desc>(
dnnl::memory::desc({mean_dims_mkl}, DnnnType<T>(), dnnl::memory::format_tag::x));
var_md_ = onnxruntime::make_unique<dnnl::memory::desc>(
dnnl::memory::desc({var_dims_mkl}, DnnnType<T>(), dnnl::memory::format_tag::x));
primitive_dst_md_ = onnxruntime::make_unique<dnnl::memory::desc>(
dnnl::memory::desc({dst_dims_mkl}, DnnnType<T>(), dnnl::memory::format_tag::any));
// scale_shift_mem will allocate 2*C*sizeof(float) buffer
//
scale_shift_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory({*scale_shift_md_, cpu_engine}));
scale_shift_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory({*scale_shift_md_, cpu_engine}));
mean_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(*mean_md_, cpu_engine, nullptr));
var_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(*var_md_, cpu_engine, nullptr));
mean_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(*mean_md_, cpu_engine, nullptr));
var_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(*var_md_, cpu_engine, nullptr));
batchnorm_fwd_ = onnxruntime::make_unique<mkldnn::batch_normalization_forward::desc>(
mkldnn::batch_normalization_forward::desc(
mkldnn::prop_kind::forward_inference, *src_md_, epsilon_,
mkldnn::normalization_flags::use_scale_shift |
mkldnn::normalization_flags::use_global_stats));
batchnorm_fwd_ = onnxruntime::make_unique<dnnl::batch_normalization_forward::desc>(
dnnl::batch_normalization_forward::desc(
dnnl::prop_kind::forward_inference, *src_md_, epsilon_,
dnnl::normalization_flags::use_scale_shift |
dnnl::normalization_flags::use_global_stats));
if (fuse_relu_) {
mkldnn::primitive_attr attr;
// attr.set_int_output_round_mode(mkldnn::round_mode::round_nearest);
dnnl::primitive_attr attr;
// attr.set_int_output_round_mode(dnnl::round_mode::round_nearest);
// Execute RELU as Fuse PostOps
const float ops_scale = 1.f;
const float ops_alpha = 0.f; // relu negative slope
const float ops_beta = 0.f;
mkldnn::post_ops ops;
ops.append_eltwise(ops_scale, mkldnn::algorithm::eltwise_relu, ops_alpha, ops_beta);
dnnl::post_ops ops;
ops.append_eltwise(ops_scale, dnnl::algorithm::eltwise_relu, ops_alpha, ops_beta);
attr.set_post_ops(ops);
batchnorm_fwd_pd_ = onnxruntime::make_unique<mkldnn::batch_normalization_forward::primitive_desc>(
mkldnn::batch_normalization_forward::primitive_desc(*batchnorm_fwd_, attr, cpu_engine));
batchnorm_fwd_pd_ = onnxruntime::make_unique<dnnl::batch_normalization_forward::primitive_desc>(
dnnl::batch_normalization_forward::primitive_desc(*batchnorm_fwd_, attr, cpu_engine));
} else {
batchnorm_fwd_pd_ = onnxruntime::make_unique<mkldnn::batch_normalization_forward::primitive_desc>(
mkldnn::batch_normalization_forward::primitive_desc(
batchnorm_fwd_pd_ = onnxruntime::make_unique<dnnl::batch_normalization_forward::primitive_desc>(
dnnl::batch_normalization_forward::primitive_desc(
*batchnorm_fwd_, cpu_engine));
}
// out format of this kernel
primitive_dst_desc_ = static_cast<mkldnn::memory::desc>(
primitive_dst_desc_ = static_cast<dnnl::memory::desc>(
batchnorm_fwd_pd_.get()->dst_desc());
primitive_src_desc_ = static_cast<mkldnn::memory::desc>(
primitive_src_desc_ = static_cast<dnnl::memory::desc>(
batchnorm_fwd_pd_.get()->dst_desc());
if (mklnode_ptr_->parent_nodes.empty()) {
src_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(batchnorm_fwd_pd_.get()->src_desc(), cpu_engine, nullptr));
src_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(batchnorm_fwd_pd_.get()->src_desc(), cpu_engine, nullptr));
} else {
src_mem_ = parents_[0].get()->primitive_dst_mem_;
}
if (mklnode_ptr_->output_index >= 0) {
// Use mkldnn's internal output buffer
// Use Dnnl's internal output buffer
if (primitive_dst_desc_ != ort_source_desc_) {
primitive_dst_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(batchnorm_fwd_pd_->dst_desc(), cpu_engine));
primitive_dst_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(batchnorm_fwd_pd_->dst_desc(), cpu_engine));
} else {
primitive_dst_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(batchnorm_fwd_pd_->dst_desc(), cpu_engine, nullptr));
primitive_dst_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(batchnorm_fwd_pd_->dst_desc(), cpu_engine, nullptr));
}
} else {
// last node of sub-graph. need to allocate memory for output_tensor
primitive_dst_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(batchnorm_fwd_pd_->dst_desc(), cpu_engine));
primitive_dst_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(batchnorm_fwd_pd_->dst_desc(), cpu_engine));
}
auto bn = mkldnn::batch_normalization_forward(
auto bn = dnnl::batch_normalization_forward(
*batchnorm_fwd_pd_);
net.push_back(bn);
net_args.push_back({{MKLDNN_ARG_SRC, *src_mem_},
{MKLDNN_ARG_MEAN, *mean_mem_},
{MKLDNN_ARG_VARIANCE, *var_mem_},
{MKLDNN_ARG_SCALE_SHIFT, *scale_shift_mem_},
{MKLDNN_ARG_DST, *primitive_dst_mem_}});
net_args.push_back({{DNNL_ARG_SRC, *src_mem_},
{DNNL_ARG_MEAN, *mean_mem_},
{DNNL_ARG_VARIANCE, *var_mem_},
{DNNL_ARG_SCALE_SHIFT, *scale_shift_mem_},
{DNNL_ARG_DST, *primitive_dst_mem_}});
// Allocate dst buffer if reorder is necessary
if (mklnode_ptr_->output_index >= 0) {
// one of the end nodes. Allocate output buffer memory and
// reorder is necessary
mkldnn::memory::data_type t = MklDnnType<T>();
dnnl::memory::data_type t = DnnnType<T>();
InitDstReorderOutput(cpu_engine, t, net, net_args);
}
}
@ -289,7 +289,7 @@ class MklDnnBatchNorm : public MklDnnKernel {
Ort::CustomOpApi ort{*api};
int input_index = mklnode_ptr_->input_start_index < 0 ? 0 : mklnode_ptr_->input_start_index;
// abort as MKLDNN cannot execute this. but
// abort as DNNL cannot execute this. but
// ORT tries to delete the output_tensor buffer data, so allocate memory so that it can be deleted
// fix for test_averagepool_1d_default node test
ORT_RETURN_IF_ERROR(primitive_created_status_);
@ -316,7 +316,7 @@ class MklDnnBatchNorm : public MklDnnKernel {
auto sdim = tensor_shape.size();
TensorShape scale_shape(sshape, sdim);
mkldnn::memory::dims scale_dims_mkl(
dnnl::memory::dims scale_dims_mkl(
scale_shape.GetDims().begin(), scale_shape.GetDims().end());
mean_mem_->set_data_handle(static_cast<void*>(const_cast<T*>(mean_data)));
@ -346,23 +346,23 @@ class MklDnnBatchNorm : public MklDnnKernel {
}
private:
std::shared_ptr<mkldnn::memory> src_mem_;
std::unique_ptr<mkldnn::memory> scale_shift_mem_;
std::unique_ptr<mkldnn::memory> mean_mem_;
std::unique_ptr<mkldnn::memory> var_mem_;
std::unique_ptr<mkldnn::memory> dst_mem_;
std::shared_ptr<dnnl::memory> src_mem_;
std::unique_ptr<dnnl::memory> scale_shift_mem_;
std::unique_ptr<dnnl::memory> mean_mem_;
std::unique_ptr<dnnl::memory> var_mem_;
std::unique_ptr<dnnl::memory> dst_mem_;
std::unique_ptr<mkldnn::memory::desc> src_md_;
std::unique_ptr<mkldnn::memory::desc> scale_shift_md_;
std::unique_ptr<mkldnn::memory::desc> mean_md_;
std::unique_ptr<mkldnn::memory::desc> var_md_;
std::unique_ptr<mkldnn::memory::desc> dst_md_;
std::unique_ptr<dnnl::memory::desc> src_md_;
std::unique_ptr<dnnl::memory::desc> scale_shift_md_;
std::unique_ptr<dnnl::memory::desc> mean_md_;
std::unique_ptr<dnnl::memory::desc> var_md_;
std::unique_ptr<dnnl::memory::desc> dst_md_;
std::unique_ptr<mkldnn::batch_normalization_forward::desc> batchnorm_fwd_;
std::unique_ptr<mkldnn::batch_normalization_forward::primitive_desc> batchnorm_fwd_pd_;
std::unique_ptr<dnnl::batch_normalization_forward::desc> batchnorm_fwd_;
std::unique_ptr<dnnl::batch_normalization_forward::primitive_desc> batchnorm_fwd_pd_;
protected:
float epsilon_ = 1e-5f;
};
} // namespace mkl_dnn
} // namespace ort_dnnl
} // namespace onnxruntime
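The scale_shift memory above is a 2 x C buffer in nc layout; with dnnl::normalization_flags::use_scale_shift, DNNL reads the first C values as scale (gamma) and the next C as shift (beta). A hedged sketch of that packing is below; pack_scale_shift is a hypothetical helper name, not a function in this file:

#include <cstring>
#include "dnnl.hpp"   // include path may differ per build

// Hypothetical helper: pack gamma/beta into the 2 x C scale_shift buffer that
// dnnl::normalization_flags::use_scale_shift expects (row 0 = scale, row 1 = shift).
inline void pack_scale_shift(const float* gamma, const float* beta, size_t C,
                             dnnl::memory& scale_shift_mem) {
  auto* dst = static_cast<float*>(scale_shift_mem.get_data_handle());
  std::memcpy(dst, gamma, C * sizeof(float));
  std::memcpy(dst + C, beta, C * sizeof(float));
}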

View file

@ -2,16 +2,16 @@
// Licensed under the MIT License.
#pragma once
#include "mkldnn_types.h"
#include "dnnl_types.h"
#include "core/framework/op_kernel.h"
#include "core/providers/mkldnn/mkldnn_fwd.h"
#include "core/providers/dnnl/dnnl_fwd.h"
#include "core/providers/cpu/nn/autopad_type.h"
#include "core/providers/mkldnn/mkldnn_execution_provider.h"
#include "core/providers/mkldnn/subgraph/mkldnn_kernel.h"
#include "core/providers/dnnl/dnnl_execution_provider.h"
#include "core/providers/dnnl/subgraph/dnnl_kernel.h"
#include "core/util/math.h"
namespace onnxruntime {
namespace mkl_dnn {
namespace ort_dnnl {
// helper function
template <bool ForceSymmetricAutoPadding>
@ -61,22 +61,22 @@ Status ComputePadAndOutputShape(
}
template <typename T>
class MklDnnConv : public MklDnnKernel {
class DnnlConv : public DnnlKernel {
public:
MklDnnConv(const MklDnnNode& node,
MKLDNNExecutionProvider* provider,
DnnlConv(const DnnlNode& node,
DNNLExecutionProvider* provider,
const NodeAttributes& attributes,
const std::string attributes_prefix = "") : MklDnnKernel(node, provider) {
const std::string attributes_prefix = "") : DnnlKernel(node, provider) {
ReadAttributes(attributes, attributes_prefix);
}
void CreatePrimitives(const OrtCustomOpApi* api,
OrtKernelContext* context,
mkldnn::engine& cpu_engine,
std::vector<mkldnn::primitive>& net,
std::vector<std::unordered_map<int, mkldnn::memory>>& net_args) override {
dnnl::engine& cpu_engine,
std::vector<dnnl::primitive>& net,
std::vector<std::unordered_map<int, dnnl::memory>>& net_args) override {
Ort::CustomOpApi ort{*api};
stream_ = onnxruntime::make_unique<mkldnn::stream>(mkldnn::stream(cpu_engine));
stream_ = onnxruntime::make_unique<dnnl::stream>(dnnl::stream(cpu_engine));
int input_index = mklnode_ptr_->input_start_index < 0 ? 0 : mklnode_ptr_->input_start_index;
const OrtValue* winput_tensor = ort.KernelContext_GetInput(context, input_index + 1);
@ -98,17 +98,17 @@ class MklDnnConv : public MklDnnKernel {
ort.ReleaseTensorTypeAndShapeInfo(tensor_info);
auto xshape = tensor_shape.data();
auto xdim = tensor_shape.size();
ort_source_format_ = mkldnn::memory::format_tag::any;
ort_source_format_ = dnnl::memory::format_tag::any;
x_shape = TensorShape(xshape, xdim);
} else {
// get the output of previous node (mkldnn block propagation).
// get the output of previous node (Dnnl block propagation).
// TODO Sourcenode will set src of this node.
x_shape = parents_[0].get()->primitive_dst_shape_;
ort_source_format_ = parents_[0].get()->ort_source_format_;
ort_source_desc_ = parents_[0].get()->ort_source_desc_;
source_desc_ = parents_[0].get()->primitive_dst_desc_;
mkldnn::memory::dims src_dims_mkl(x_shape.GetDims().begin(), x_shape.GetDims().end());
dnnl::memory::dims src_dims_mkl(x_shape.GetDims().begin(), x_shape.GetDims().end());
}
primitive_created_status_ = ValidateInputShape(x_shape, w_shape);
@ -166,11 +166,11 @@ class MklDnnConv : public MklDnnKernel {
TensorShape y_shape(y_dims);
primitive_dst_shape_ = TensorShape(y_dims);
TensorShape output_shape = y_shape.Slice(2);
mkldnn::memory::dims dst_dims_mkl(y_dims.begin(), y_dims.end());
primitive_dst_md_ = onnxruntime::make_unique<mkldnn::memory::desc>(
mkldnn::memory::desc({dst_dims_mkl}, MklDnnType<T>(), mkldnn::memory::format_tag::any));
dnnl::memory::dims dst_dims_mkl(y_dims.begin(), y_dims.end());
primitive_dst_md_ = onnxruntime::make_unique<dnnl::memory::desc>(
dnnl::memory::desc({dst_dims_mkl}, DnnnType<T>(), dnnl::memory::format_tag::any));
mkldnn::memory::dims filter_dims_mkl;
dnnl::memory::dims filter_dims_mkl;
if (group_mkl == 1) {
filter_dims_mkl.assign(w_shape.GetDims().begin(), w_shape.GetDims().end());
} else {
@ -178,16 +178,16 @@ class MklDnnConv : public MklDnnKernel {
static_cast<int>(w_shape[0] / group_mkl)});
filter_dims_mkl.insert(filter_dims_mkl.end(), w_shape.GetDims().begin() + 1, w_shape.GetDims().end());
}
mkldnn::memory::dims strides_mkl(strides.begin(), strides.end());
mkldnn::memory::dims dilations_mkl(dilations.begin(), dilations.end());
// mkldnn dilations start from 0 so we need to subtract 1 from each dim.
dnnl::memory::dims strides_mkl(strides.begin(), strides.end());
dnnl::memory::dims dilations_mkl(dilations.begin(), dilations.end());
// Dnnl dilations start from 0 so we need to subtract 1 from each dim.
for (size_t dim = 0; dim < kernel_rank; dim++) {
dilations_mkl[dim] -= 1;
}
mkldnn::memory::dims padding_left_mkl(pads.begin(), pads.begin() + kernel_rank);
mkldnn::memory::dims padding_right_mkl(pads.begin() + kernel_rank, pads.end());
mkldnn::memory::dims bias_dims_mkl;
dnnl::memory::dims padding_left_mkl(pads.begin(), pads.begin() + kernel_rank);
dnnl::memory::dims padding_right_mkl(pads.begin() + kernel_rank, pads.end());
dnnl::memory::dims bias_dims_mkl;
if (mklnode_ptr_->num_inputs == 3) {
const OrtValue* binput_tensor = ort.KernelContext_GetInput(context, input_index + 2);
auto btensor_info = ort.GetTensorTypeAndShape(binput_tensor);
@ -199,165 +199,165 @@ class MklDnnConv : public MklDnnKernel {
bias_dims_mkl.assign(b_shape.GetDims().begin(), b_shape.GetDims().end());
}
auto src_format = mkldnn::memory::format_tag::any;
auto src_format = dnnl::memory::format_tag::any;
if (kernel_rank == 1) {
src_format = mkldnn::memory::format_tag::ncw;
src_format = dnnl::memory::format_tag::ncw;
if (group_mkl == 1) {
filter_format_ = mkldnn::memory::format_tag::oiw;
filter_format_ = dnnl::memory::format_tag::oiw;
} else {
filter_format_ = mkldnn::memory::format_tag::goiw;
filter_format_ = dnnl::memory::format_tag::goiw;
}
} else if (kernel_rank == 2) {
src_format = mkldnn::memory::format_tag::nchw;
src_format = dnnl::memory::format_tag::nchw;
if (group_mkl == 1) {
filter_format_ = mkldnn::memory::format_tag::oihw;
filter_format_ = dnnl::memory::format_tag::oihw;
} else {
filter_format_ = mkldnn::memory::format_tag::goihw;
filter_format_ = dnnl::memory::format_tag::goihw;
}
} else {
src_format = mkldnn::memory::format_tag::ncdhw;
src_format = dnnl::memory::format_tag::ncdhw;
if (group_mkl == 1) {
filter_format_ = mkldnn::memory::format_tag::oidhw;
filter_format_ = dnnl::memory::format_tag::oidhw;
} else {
filter_format_ = mkldnn::memory::format_tag::goidhw;
filter_format_ = dnnl::memory::format_tag::goidhw;
}
}
mkldnn::memory::dims src_dims_mkl(x_shape.GetDims().begin(), x_shape.GetDims().end());
dnnl::memory::dims src_dims_mkl(x_shape.GetDims().begin(), x_shape.GetDims().end());
if (mklnode_ptr_->parent_nodes.empty()) {
ort_source_format_ = src_format;
ort_source_desc_ = mkldnn::memory::desc({src_dims_mkl}, MklDnnType<T>(), src_format);
source_desc_ = mkldnn::memory::desc({src_dims_mkl}, MklDnnType<T>(), src_format);
ort_source_desc_ = dnnl::memory::desc({src_dims_mkl}, DnnnType<T>(), src_format);
source_desc_ = dnnl::memory::desc({src_dims_mkl}, DnnnType<T>(), src_format);
}
src_md_ = onnxruntime::make_unique<mkldnn::memory::desc>(
mkldnn::memory::desc({src_dims_mkl}, MklDnnType<T>(), mkldnn::memory::format_tag::any));
src_md_ = onnxruntime::make_unique<dnnl::memory::desc>(
dnnl::memory::desc({src_dims_mkl}, DnnnType<T>(), dnnl::memory::format_tag::any));
// Set the memory descriptors to format::any to allow MKLDNN to decide what the optimal memory layout should be
// Set the memory descriptors to format::any to allow DNNL to decide what the optimal memory layout should be
// for the computation given the input
filter_md_ = onnxruntime::make_unique<mkldnn::memory::desc>(
mkldnn::memory::desc({filter_dims_mkl}, MklDnnType<T>(), mkldnn::memory::format_tag::any));
filter_md_ = onnxruntime::make_unique<dnnl::memory::desc>(
dnnl::memory::desc({filter_dims_mkl}, DnnnType<T>(), dnnl::memory::format_tag::any));
if (!bias_dims_mkl.empty())
bias_md_ = onnxruntime::make_unique<mkldnn::memory::desc>(
mkldnn::memory::desc({bias_dims_mkl}, MklDnnType<T>(), mkldnn::memory::format_tag::any));
bias_md_ = onnxruntime::make_unique<dnnl::memory::desc>(
dnnl::memory::desc({bias_dims_mkl}, DnnnType<T>(), dnnl::memory::format_tag::any));
mkldnn::memory::dims conv_zero_padding = {0, 0};
dnnl::memory::dims conv_zero_padding = {0, 0};
if (!bias_dims_mkl.empty()) {
fwd_desc_ = onnxruntime::make_unique<mkldnn::convolution_forward::desc>(
mkldnn::convolution_forward::desc(
mkldnn::prop_kind::forward_inference, mkldnn::algorithm::convolution_direct, *src_md_,
fwd_desc_ = onnxruntime::make_unique<dnnl::convolution_forward::desc>(
dnnl::convolution_forward::desc(
dnnl::prop_kind::forward_inference, dnnl::algorithm::convolution_direct, *src_md_,
*filter_md_, *bias_md_, *primitive_dst_md_,
strides_mkl, dilations_mkl, padding_left_mkl,
padding_right_mkl));
} else {
fwd_desc_ = onnxruntime::make_unique<mkldnn::convolution_forward::desc>(
mkldnn::convolution_forward::desc(
mkldnn::prop_kind::forward_inference, mkldnn::algorithm::convolution_direct, *src_md_,
fwd_desc_ = onnxruntime::make_unique<dnnl::convolution_forward::desc>(
dnnl::convolution_forward::desc(
dnnl::prop_kind::forward_inference, dnnl::algorithm::convolution_direct, *src_md_,
*filter_md_, *primitive_dst_md_, strides_mkl,
dilations_mkl, padding_left_mkl, padding_right_mkl));
}
if (fuse_relu_) {
mkldnn::primitive_attr attr;
// attr.set_int_output_round_mode(mkldnn::round_mode::round_nearest);
dnnl::primitive_attr attr;
// attr.set_int_output_round_mode(dnnl::round_mode::round_nearest);
// Execute RELU as Fuse PostOps
const float ops_scale = 1.f;
const float ops_alpha = 0.f; // relu negative slope
const float ops_beta = 0.f;
mkldnn::post_ops ops;
ops.append_eltwise(ops_scale, mkldnn::algorithm::eltwise_relu, ops_alpha, ops_beta);
dnnl::post_ops ops;
ops.append_eltwise(ops_scale, dnnl::algorithm::eltwise_relu, ops_alpha, ops_beta);
attr.set_post_ops(ops);
conv_fwd_pd_ = onnxruntime::make_unique<mkldnn::convolution_forward::primitive_desc>(
mkldnn::convolution_forward::primitive_desc(*fwd_desc_, attr, cpu_engine));
conv_fwd_pd_ = onnxruntime::make_unique<dnnl::convolution_forward::primitive_desc>(
dnnl::convolution_forward::primitive_desc(*fwd_desc_, attr, cpu_engine));
} else {
conv_fwd_pd_ = onnxruntime::make_unique<mkldnn::convolution_forward::primitive_desc>(
mkldnn::convolution_forward::primitive_desc(*fwd_desc_, cpu_engine));
conv_fwd_pd_ = onnxruntime::make_unique<dnnl::convolution_forward::primitive_desc>(
dnnl::convolution_forward::primitive_desc(*fwd_desc_, cpu_engine));
}
primitive_src_desc_ = static_cast<mkldnn::memory::desc>(
primitive_src_desc_ = static_cast<dnnl::memory::desc>(
conv_fwd_pd_.get()->src_desc());
filter_desc_ = static_cast<mkldnn::memory::desc>(
filter_desc_ = static_cast<dnnl::memory::desc>(
conv_fwd_pd_.get()->weights_desc());
primitive_dst_desc_ = static_cast<mkldnn::memory::desc>(
primitive_dst_desc_ = static_cast<dnnl::memory::desc>(
conv_fwd_pd_.get()->dst_desc());
src_size_ = conv_fwd_pd_.get()->src_desc().get_size();
filter_size_ = conv_fwd_pd_.get()->weights_desc().get_size();
dst_size_ = conv_fwd_pd_.get()->dst_desc().get_size();
filter_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(conv_fwd_pd_.get()->weights_desc(), cpu_engine, nullptr));
filter_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(conv_fwd_pd_.get()->weights_desc(), cpu_engine, nullptr));
if (primitive_src_desc_ != source_desc_) {
mkldnn::memory::dims src_dims(x_shape.GetDims().begin(), x_shape.GetDims().end());
auto pd = mkldnn::memory::desc({{src_dims}, MklDnnType<T>(), ort_source_format_});
dnnl::memory::dims src_dims(x_shape.GetDims().begin(), x_shape.GetDims().end());
auto pd = dnnl::memory::desc({{src_dims}, DnnnType<T>(), ort_source_format_});
if (mklnode_ptr_->parent_nodes.empty())
src_mem_from_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(pd, cpu_engine, nullptr));
src_mem_from_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(pd, cpu_engine, nullptr));
else
src_mem_from_ = parents_[0].get()->primitive_dst_mem_;
src_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(conv_fwd_pd_->src_desc(), cpu_engine, nullptr));
net.push_back(mkldnn::reorder(*src_mem_from_, *src_mem_));
net_args.push_back({{MKLDNN_ARG_FROM, *src_mem_from_},
{MKLDNN_ARG_TO, *src_mem_}});
src_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(conv_fwd_pd_->src_desc(), cpu_engine, nullptr));
net.push_back(dnnl::reorder(*src_mem_from_, *src_mem_));
net_args.push_back({{DNNL_ARG_FROM, *src_mem_from_},
{DNNL_ARG_TO, *src_mem_}});
} else {
if (mklnode_ptr_->parent_nodes.empty()) {
src_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(conv_fwd_pd_->src_desc(), cpu_engine, nullptr));
src_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(conv_fwd_pd_->src_desc(), cpu_engine, nullptr));
} else {
src_mem_ = parents_[0].get()->primitive_dst_mem_;
}
}
if (mklnode_ptr_->output_index >= 0) {
// Use mkldnn's internal output buffer
// Use Dnnl's internal output buffer
if (primitive_dst_desc_ != ort_source_desc_) {
primitive_dst_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(conv_fwd_pd_.get()->dst_desc(), cpu_engine));
primitive_dst_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(conv_fwd_pd_.get()->dst_desc(), cpu_engine));
} else {
primitive_dst_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(conv_fwd_pd_.get()->dst_desc(), cpu_engine, nullptr));
primitive_dst_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(conv_fwd_pd_.get()->dst_desc(), cpu_engine, nullptr));
}
} else {
// last node of sub-graph. need to allocate memory for output_tensor
primitive_dst_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(conv_fwd_pd_.get()->dst_desc(), cpu_engine));
primitive_dst_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(conv_fwd_pd_.get()->dst_desc(), cpu_engine));
}
if (!bias_dims_mkl.empty()) {
bias_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(conv_fwd_pd_.get()->bias_desc(), cpu_engine, nullptr));
conv_fwd_ = onnxruntime::make_unique<mkldnn::convolution_forward>(
mkldnn::convolution_forward(*conv_fwd_pd_));
bias_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(conv_fwd_pd_.get()->bias_desc(), cpu_engine, nullptr));
conv_fwd_ = onnxruntime::make_unique<dnnl::convolution_forward>(
dnnl::convolution_forward(*conv_fwd_pd_));
net.push_back(*conv_fwd_);
net_args.push_back({{MKLDNN_ARG_SRC, *src_mem_},
{MKLDNN_ARG_WEIGHTS, *filter_mem_},
{MKLDNN_ARG_BIAS, *bias_mem_},
{MKLDNN_ARG_DST, *primitive_dst_mem_}});
net_args.push_back({{DNNL_ARG_SRC, *src_mem_},
{DNNL_ARG_WEIGHTS, *filter_mem_},
{DNNL_ARG_BIAS, *bias_mem_},
{DNNL_ARG_DST, *primitive_dst_mem_}});
} else {
conv_fwd_ = onnxruntime::make_unique<mkldnn::convolution_forward>(
mkldnn::convolution_forward(*conv_fwd_pd_));
conv_fwd_ = onnxruntime::make_unique<dnnl::convolution_forward>(
dnnl::convolution_forward(*conv_fwd_pd_));
net.push_back(*conv_fwd_);
net_args.push_back({{MKLDNN_ARG_SRC, *src_mem_},
{MKLDNN_ARG_WEIGHTS, *filter_mem_},
{MKLDNN_ARG_DST, *primitive_dst_mem_}});
net_args.push_back({{DNNL_ARG_SRC, *src_mem_},
{DNNL_ARG_WEIGHTS, *filter_mem_},
{DNNL_ARG_DST, *primitive_dst_mem_}});
}
if (mklnode_ptr_->output_index >= 0) {
// one of the end nodes. Allocate output buffer memory and
// reorder is necessary
mkldnn::memory::data_type t = MklDnnType<T>();
dnnl::memory::data_type t = DnnnType<T>();
InitDstReorderOutput(cpu_engine, t, net, net_args);
}
}
virtual void ReorderWeights(const OrtCustomOpApi* api, OrtKernelContext* context, mkldnn::engine& cpu_engine) override {
virtual void ReorderWeights(const OrtCustomOpApi* api, OrtKernelContext* context, dnnl::engine& cpu_engine) override {
Ort::CustomOpApi ort{*api};
int input_index = mklnode_ptr_->input_start_index < 0 ? 0 : mklnode_ptr_->input_start_index;
@ -373,7 +373,7 @@ class MklDnnConv : public MklDnnKernel {
const int group_mkl = static_cast<int>(group_);
mkldnn::memory::dims filter_dims_mkl;
dnnl::memory::dims filter_dims_mkl;
if (group_mkl == 1) {
filter_dims_mkl.assign(W.GetDims().begin(), W.GetDims().end());
} else {
@ -385,16 +385,16 @@ class MklDnnConv : public MklDnnKernel {
{
// lock to make sure reordering is done only once
std::lock_guard<OrtMutex> lock(provider_->GetMutex());
std::shared_ptr<mkldnn::memory> filter_dst_mem = provider_->GetWeightsMemoryBuffer(mklnode_ptr_->weight_name);
std::shared_ptr<dnnl::memory> filter_dst_mem = provider_->GetWeightsMemoryBuffer(mklnode_ptr_->weight_name);
if (filter_dst_mem == nullptr) {
mkldnn::memory src = mkldnn::memory({{filter_dims_mkl}, MklDnnType<T>(), filter_format_}, cpu_engine, (void*)filter_data);
dnnl::memory src = dnnl::memory({{filter_dims_mkl}, DnnnType<T>(), filter_format_}, cpu_engine, (void*)filter_data);
IAllocatorUniquePtr<void> filter_reorder_buffer =
IAllocator::MakeUniquePtr<void>(alloc_, filter_size_);
filter_dst_mem = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(conv_fwd_pd_->weights_desc(), cpu_engine, filter_reorder_buffer.get()));
filter_dst_mem = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(conv_fwd_pd_->weights_desc(), cpu_engine, filter_reorder_buffer.get()));
mkldnn::reorder(src, *filter_dst_mem)
dnnl::reorder(src, *filter_dst_mem)
.execute(cpu_engine, src, *filter_dst_mem);
provider_->SaveAllocatedMemory(std::move(filter_reorder_buffer));
@ -418,7 +418,7 @@ class MklDnnConv : public MklDnnKernel {
const OrtValue* binput_tensor = ort.KernelContext_GetInput(context, input_index + 2);
bias_data = const_cast<T*>(ort.GetTensorData<T>(binput_tensor));
}
std::shared_ptr<mkldnn::memory> filter_dst_mem = provider_->GetWeightsMemoryBuffer(mklnode_ptr_->weight_name);
std::shared_ptr<dnnl::memory> filter_dst_mem = provider_->GetWeightsMemoryBuffer(mklnode_ptr_->weight_name);
if (filter_dst_mem == nullptr) {
ReorderWeights(api, context, GetEngine());
filter_dst_mem = provider_->GetWeightsMemoryBuffer(mklnode_ptr_->weight_name);
@ -527,28 +527,28 @@ class MklDnnConv : public MklDnnKernel {
}
private:
mkldnn::memory::desc filter_desc_;
mkldnn::memory::format_tag filter_format_;
dnnl::memory::desc filter_desc_;
dnnl::memory::format_tag filter_format_;
std::shared_ptr<mkldnn::memory> src_mem_from_;
std::unique_ptr<mkldnn::memory> src_mem_to_;
std::shared_ptr<dnnl::memory> src_mem_from_;
std::unique_ptr<dnnl::memory> src_mem_to_;
size_t src_size_;
size_t filter_size_;
size_t dst_size_;
std::shared_ptr<mkldnn::memory> src_mem_;
std::unique_ptr<mkldnn::memory> filter_mem_;
std::unique_ptr<mkldnn::memory> bias_mem_;
std::shared_ptr<dnnl::memory> src_mem_;
std::unique_ptr<dnnl::memory> filter_mem_;
std::unique_ptr<dnnl::memory> bias_mem_;
std::unique_ptr<mkldnn::convolution_forward::desc> fwd_desc_;
std::unique_ptr<dnnl::convolution_forward::desc> fwd_desc_;
std::unique_ptr<mkldnn::memory::desc> src_md_;
std::unique_ptr<mkldnn::memory::desc> filter_md_;
std::unique_ptr<mkldnn::memory::desc> bias_md_;
std::unique_ptr<dnnl::memory::desc> src_md_;
std::unique_ptr<dnnl::memory::desc> filter_md_;
std::unique_ptr<dnnl::memory::desc> bias_md_;
std::unique_ptr<mkldnn::convolution_forward::primitive_desc> conv_fwd_pd_;
std::unique_ptr<mkldnn::primitive> conv_fwd_;
std::unique_ptr<dnnl::convolution_forward::primitive_desc> conv_fwd_pd_;
std::unique_ptr<dnnl::primitive> conv_fwd_;
private:
IAllocatorUniquePtr<void> src_reorder_buffer_;
@ -636,7 +636,7 @@ class MklDnnConv : public MklDnnKernel {
}
private:
std::unique_ptr<mkldnn::stream> stream_;
std::unique_ptr<dnnl::stream> stream_;
std::vector<int64_t> kernel_shape_; // must use ComputeKernelShape(...), instead of kernel_shape_
AutoPadType auto_pad_;
int64_t group_;
@ -647,5 +647,5 @@ class MklDnnConv : public MklDnnKernel {
std::string activation_;
float alpha_;
};
} // namespace mkl_dnn
} // namespace ort_dnnl
} // namespace onnxruntime
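Both convolution kernels convert ONNX dilations (which start at 1) to DNNL's zero-based convention before building the convolution desc, as the "subtract 1" loops above do. A standalone version of that conversion is sketched here; to_dnnl_dilations is a hypothetical helper name:

#include <cstdint>
#include <vector>
#include "dnnl.hpp"   // include path may differ per build

// Hypothetical helper: ONNX dilation d >= 1 maps to DNNL dilation d - 1.
// The effective kernel extent is unchanged: (k - 1) * d + 1, e.g. k = 3, d = 2 -> 5.
inline dnnl::memory::dims to_dnnl_dilations(const std::vector<int64_t>& onnx_dilations) {
  dnnl::memory::dims out(onnx_dilations.begin(), onnx_dilations.end());
  for (auto& d : out) d -= 1;   // ONNX d = 2 becomes DNNL 1
  return out;
}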

View file

@ -2,34 +2,34 @@
// Licensed under the MIT License.
#pragma once
#include "mkldnn_types.h"
#include "dnnl_types.h"
#include "core/framework/op_kernel.h"
#include "core/providers/mkldnn/mkldnn_fwd.h"
#include "core/providers/dnnl/dnnl_fwd.h"
#include "core/providers/cpu/nn/autopad_type.h"
#include "core/providers/mkldnn/mkldnn_execution_provider.h"
#include "core/providers/mkldnn/subgraph/mkldnn_kernel.h"
#include "core/providers/dnnl/dnnl_execution_provider.h"
#include "core/providers/dnnl/subgraph/dnnl_kernel.h"
#include "core/util/math.h"
namespace onnxruntime {
namespace mkl_dnn {
namespace ort_dnnl {
template <typename T>
class MklDnnConvBatchNorm : public MklDnnKernel {
class DnnlConvBatchNorm : public DnnlKernel {
public:
MklDnnConvBatchNorm(const MklDnnNode& node,
MKLDNNExecutionProvider* provider,
DnnlConvBatchNorm(const DnnlNode& node,
DNNLExecutionProvider* provider,
const NodeAttributes& attributes,
const std::string attributes_prefix = "") : MklDnnKernel(node, provider) {
const std::string attributes_prefix = "") : DnnlKernel(node, provider) {
ReadAttributes(attributes, attributes_prefix);
}
void CreatePrimitives(const OrtCustomOpApi* api,
OrtKernelContext* context,
mkldnn::engine& cpu_engine,
std::vector<mkldnn::primitive>& net,
std::vector<std::unordered_map<int, mkldnn::memory>>& net_args) override {
dnnl::engine& cpu_engine,
std::vector<dnnl::primitive>& net,
std::vector<std::unordered_map<int, dnnl::memory>>& net_args) override {
Ort::CustomOpApi ort{*api};
stream_ = onnxruntime::make_unique<mkldnn::stream>(mkldnn::stream(cpu_engine));
stream_ = onnxruntime::make_unique<dnnl::stream>(dnnl::stream(cpu_engine));
int input_index = mklnode_ptr_->input_start_index < 0 ? 0 : mklnode_ptr_->input_start_index;
const OrtValue* winput_tensor = ort.KernelContext_GetInput(context, input_index + 1);
auto wtensor_info = ort.GetTensorTypeAndShape(winput_tensor);
@ -50,10 +50,10 @@ class MklDnnConvBatchNorm : public MklDnnKernel {
ort.ReleaseTensorTypeAndShapeInfo(tensor_info);
auto xshape = tensor_shape.data();
auto xdim = tensor_shape.size();
ort_source_format_ = mkldnn::memory::format_tag::any;
ort_source_format_ = dnnl::memory::format_tag::any;
x_shape = TensorShape(xshape, xdim);
} else {
// get the output of previous node (mkldnn block propagation).
// get the output of previous node (Dnnl block propagation).
// TODO Sourcenode will set src of this node.
x_shape = parents_[0].get()->primitive_dst_shape_;
ort_source_format_ = parents_[0].get()->ort_source_format_;
@ -116,11 +116,11 @@ class MklDnnConvBatchNorm : public MklDnnKernel {
TensorShape y_shape(y_dims);
primitive_dst_shape_ = TensorShape(y_dims);
TensorShape output_shape = y_shape.Slice(2);
mkldnn::memory::dims dst_dims_mkl(y_dims.begin(), y_dims.end());
primitive_dst_md_ = onnxruntime::make_unique<mkldnn::memory::desc>(
mkldnn::memory::desc({dst_dims_mkl}, MklDnnType<T>(), mkldnn::memory::format_tag::any));
dnnl::memory::dims dst_dims_mkl(y_dims.begin(), y_dims.end());
primitive_dst_md_ = onnxruntime::make_unique<dnnl::memory::desc>(
dnnl::memory::desc({dst_dims_mkl}, DnnnType<T>(), dnnl::memory::format_tag::any));
mkldnn::memory::dims filter_dims_mkl;
dnnl::memory::dims filter_dims_mkl;
if (group_mkl == 1) {
filter_dims_mkl.assign(w_shape.GetDims().begin(), w_shape.GetDims().end());
} else {
@ -128,16 +128,16 @@ class MklDnnConvBatchNorm : public MklDnnKernel {
static_cast<int>(w_shape[0] / group_mkl)});
filter_dims_mkl.insert(filter_dims_mkl.end(), w_shape.GetDims().begin() + 1, w_shape.GetDims().end());
}
mkldnn::memory::dims strides_mkl(strides.begin(), strides.end());
mkldnn::memory::dims dilations_mkl(dilations.begin(), dilations.end());
// mkldnn dilations start from 0 so we need to subtract 1 from each dim.
dnnl::memory::dims strides_mkl(strides.begin(), strides.end());
dnnl::memory::dims dilations_mkl(dilations.begin(), dilations.end());
// Dnnl dilations start from 0 so we need to subtract 1 from each dim.
for (size_t dim = 0; dim < kernel_rank; dim++) {
dilations_mkl[dim] -= 1;
}
mkldnn::memory::dims padding_left_mkl(pads.begin(), pads.begin() + kernel_rank);
mkldnn::memory::dims padding_right_mkl(pads.begin() + kernel_rank, pads.end());
mkldnn::memory::dims bias_dims_mkl;
dnnl::memory::dims padding_left_mkl(pads.begin(), pads.begin() + kernel_rank);
dnnl::memory::dims padding_right_mkl(pads.begin() + kernel_rank, pads.end());
dnnl::memory::dims bias_dims_mkl;
int batchNormIndix = (mklnode_ptr_->num_inputs == 7) ? input_index + 3 : input_index + 2;
if (mklnode_ptr_->num_inputs == 7) {
@ -160,143 +160,143 @@ class MklDnnConvBatchNorm : public MklDnnKernel {
bias_dims_mkl.assign(b_shape.GetDims().begin(), b_shape.GetDims().end());
}
auto src_format = mkldnn::memory::format_tag::any;
auto src_format = dnnl::memory::format_tag::any;
if (kernel_rank == 1) {
src_format = mkldnn::memory::format_tag::ncw;
src_format = dnnl::memory::format_tag::ncw;
if (group_mkl == 1) {
filter_format_ = mkldnn::memory::format_tag::oiw;
filter_format_ = dnnl::memory::format_tag::oiw;
} else {
filter_format_ = mkldnn::memory::format_tag::goiw;
filter_format_ = dnnl::memory::format_tag::goiw;
}
} else if (kernel_rank == 2) {
src_format = mkldnn::memory::format_tag::nchw;
src_format = dnnl::memory::format_tag::nchw;
if (group_mkl == 1) {
filter_format_ = mkldnn::memory::format_tag::oihw;
filter_format_ = dnnl::memory::format_tag::oihw;
} else {
filter_format_ = mkldnn::memory::format_tag::goihw;
filter_format_ = dnnl::memory::format_tag::goihw;
}
} else {
src_format = mkldnn::memory::format_tag::ncdhw;
src_format = dnnl::memory::format_tag::ncdhw;
if (group_mkl == 1) {
filter_format_ = mkldnn::memory::format_tag::oidhw;
filter_format_ = dnnl::memory::format_tag::oidhw;
} else {
filter_format_ = mkldnn::memory::format_tag::goidhw;
filter_format_ = dnnl::memory::format_tag::goidhw;
}
}
mkldnn::memory::dims src_dims_mkl(x_shape.GetDims().begin(), x_shape.GetDims().end());
dnnl::memory::dims src_dims_mkl(x_shape.GetDims().begin(), x_shape.GetDims().end());
if (mklnode_ptr_->parent_nodes.empty()) {
ort_source_format_ = src_format;
ort_source_desc_ = mkldnn::memory::desc({src_dims_mkl}, MklDnnType<T>(), src_format);
source_desc_ = mkldnn::memory::desc({src_dims_mkl}, MklDnnType<T>(), src_format);
ort_source_desc_ = dnnl::memory::desc({src_dims_mkl}, DnnnType<T>(), src_format);
source_desc_ = dnnl::memory::desc({src_dims_mkl}, DnnnType<T>(), src_format);
}
src_md_ = onnxruntime::make_unique<mkldnn::memory::desc>(
mkldnn::memory::desc({src_dims_mkl}, MklDnnType<T>(), mkldnn::memory::format_tag::any));
src_md_ = onnxruntime::make_unique<dnnl::memory::desc>(
dnnl::memory::desc({src_dims_mkl}, DnnnType<T>(), dnnl::memory::format_tag::any));
// Set the memory descriptors to format::any to allow MKLDNN to decide what the optimal memory layout should be
// Set the memory descriptors to format::any to allow DNNL to decide what the optimal memory layout should be
// for the computation given the input
filter_md_ = onnxruntime::make_unique<mkldnn::memory::desc>(
mkldnn::memory::desc({filter_dims_mkl}, MklDnnType<T>(), mkldnn::memory::format_tag::any));
bias_md_ = onnxruntime::make_unique<mkldnn::memory::desc>(
mkldnn::memory::desc({bias_dims_mkl}, MklDnnType<T>(), mkldnn::memory::format_tag::any));
filter_md_ = onnxruntime::make_unique<dnnl::memory::desc>(
dnnl::memory::desc({filter_dims_mkl}, DnnnType<T>(), dnnl::memory::format_tag::any));
bias_md_ = onnxruntime::make_unique<dnnl::memory::desc>(
dnnl::memory::desc({bias_dims_mkl}, DnnnType<T>(), dnnl::memory::format_tag::any));
mkldnn::memory::dims conv_zero_padding = {0, 0};
dnnl::memory::dims conv_zero_padding = {0, 0};
fwd_desc_ = onnxruntime::make_unique<mkldnn::convolution_forward::desc>(
mkldnn::convolution_forward::desc(
mkldnn::prop_kind::forward_inference, mkldnn::algorithm::convolution_direct, *src_md_,
fwd_desc_ = onnxruntime::make_unique<dnnl::convolution_forward::desc>(
dnnl::convolution_forward::desc(
dnnl::prop_kind::forward_inference, dnnl::algorithm::convolution_direct, *src_md_,
*filter_md_, *bias_md_, *primitive_dst_md_,
strides_mkl, dilations_mkl, padding_left_mkl,
padding_right_mkl));
if (fuse_relu_) {
mkldnn::primitive_attr attr;
// attr.set_int_output_round_mode(mkldnn::round_mode::round_nearest);
dnnl::primitive_attr attr;
// attr.set_int_output_round_mode(dnnl::round_mode::round_nearest);
// Execute RELU as Fuse PostOps
const float ops_scale = 1.f;
const float ops_alpha = 0.f; // relu negative slope
const float ops_beta = 0.f;
mkldnn::post_ops ops;
ops.append_eltwise(ops_scale, mkldnn::algorithm::eltwise_relu, ops_alpha, ops_beta);
dnnl::post_ops ops;
ops.append_eltwise(ops_scale, dnnl::algorithm::eltwise_relu, ops_alpha, ops_beta);
attr.set_post_ops(ops);
conv_fwd_pd_ = onnxruntime::make_unique<mkldnn::convolution_forward::primitive_desc>(
mkldnn::convolution_forward::primitive_desc(*fwd_desc_, attr, cpu_engine));
conv_fwd_pd_ = onnxruntime::make_unique<dnnl::convolution_forward::primitive_desc>(
dnnl::convolution_forward::primitive_desc(*fwd_desc_, attr, cpu_engine));
} else {
conv_fwd_pd_ = onnxruntime::make_unique<mkldnn::convolution_forward::primitive_desc>(
mkldnn::convolution_forward::primitive_desc(*fwd_desc_, cpu_engine));
conv_fwd_pd_ = onnxruntime::make_unique<dnnl::convolution_forward::primitive_desc>(
dnnl::convolution_forward::primitive_desc(*fwd_desc_, cpu_engine));
}
primitive_src_desc_ = static_cast<mkldnn::memory::desc>(
primitive_src_desc_ = static_cast<dnnl::memory::desc>(
conv_fwd_pd_.get()->src_desc());
filter_desc_ = static_cast<mkldnn::memory::desc>(
filter_desc_ = static_cast<dnnl::memory::desc>(
conv_fwd_pd_.get()->weights_desc());
primitive_dst_desc_ = static_cast<mkldnn::memory::desc>(
primitive_dst_desc_ = static_cast<dnnl::memory::desc>(
conv_fwd_pd_.get()->dst_desc());
src_size_ = conv_fwd_pd_.get()->src_desc().get_size();
filter_size_ = conv_fwd_pd_.get()->weights_desc().get_size();
dst_size_ = conv_fwd_pd_.get()->dst_desc().get_size();
filter_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(conv_fwd_pd_.get()->weights_desc(), cpu_engine, nullptr));
filter_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(conv_fwd_pd_.get()->weights_desc(), cpu_engine, nullptr));
if (primitive_src_desc_ != source_desc_) {
mkldnn::memory::dims src_dims(x_shape.GetDims().begin(), x_shape.GetDims().end());
auto pd = mkldnn::memory::desc({{src_dims}, MklDnnType<T>(), ort_source_format_});
dnnl::memory::dims src_dims(x_shape.GetDims().begin(), x_shape.GetDims().end());
auto pd = dnnl::memory::desc({{src_dims}, DnnnType<T>(), ort_source_format_});
if (mklnode_ptr_->parent_nodes.empty())
src_mem_from_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(pd, cpu_engine, nullptr));
src_mem_from_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(pd, cpu_engine, nullptr));
else
src_mem_from_ = parents_[0].get()->primitive_dst_mem_;
src_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(conv_fwd_pd_->src_desc(), cpu_engine, nullptr));
net.push_back(mkldnn::reorder(*src_mem_from_, *src_mem_));
net_args.push_back({{MKLDNN_ARG_FROM, *src_mem_from_},
{MKLDNN_ARG_TO, *src_mem_}});
src_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(conv_fwd_pd_->src_desc(), cpu_engine, nullptr));
net.push_back(dnnl::reorder(*src_mem_from_, *src_mem_));
net_args.push_back({{DNNL_ARG_FROM, *src_mem_from_},
{DNNL_ARG_TO, *src_mem_}});
} else {
if (mklnode_ptr_->parent_nodes.empty()) {
src_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(conv_fwd_pd_->src_desc(), cpu_engine, nullptr));
src_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(conv_fwd_pd_->src_desc(), cpu_engine, nullptr));
} else {
src_mem_ = parents_[0].get()->primitive_dst_mem_;
}
}
if (mklnode_ptr_->output_index >= 0) {
// Use mkldnn's internal output buffer
// Use Dnnl's internal output buffer
if (primitive_dst_desc_ != ort_source_desc_) {
primitive_dst_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(conv_fwd_pd_.get()->dst_desc(), cpu_engine));
primitive_dst_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(conv_fwd_pd_.get()->dst_desc(), cpu_engine));
} else {
primitive_dst_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(conv_fwd_pd_.get()->dst_desc(), cpu_engine, nullptr));
primitive_dst_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(conv_fwd_pd_.get()->dst_desc(), cpu_engine, nullptr));
}
} else {
// last node of sub-graph. need to allocate memory for output_tensor
primitive_dst_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(conv_fwd_pd_.get()->dst_desc(), cpu_engine));
primitive_dst_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(conv_fwd_pd_.get()->dst_desc(), cpu_engine));
}
bias_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(conv_fwd_pd_.get()->bias_desc(), cpu_engine, nullptr));
conv_fwd_ = onnxruntime::make_unique<mkldnn::convolution_forward>(
mkldnn::convolution_forward(*conv_fwd_pd_));
bias_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(conv_fwd_pd_.get()->bias_desc(), cpu_engine, nullptr));
conv_fwd_ = onnxruntime::make_unique<dnnl::convolution_forward>(
dnnl::convolution_forward(*conv_fwd_pd_));
net.push_back(*conv_fwd_);
net_args.push_back({{MKLDNN_ARG_SRC, *src_mem_},
{MKLDNN_ARG_WEIGHTS, *filter_mem_},
{MKLDNN_ARG_BIAS, *bias_mem_},
{MKLDNN_ARG_DST, *primitive_dst_mem_}});
net_args.push_back({{DNNL_ARG_SRC, *src_mem_},
{DNNL_ARG_WEIGHTS, *filter_mem_},
{DNNL_ARG_BIAS, *bias_mem_},
{DNNL_ARG_DST, *primitive_dst_mem_}});
if (mklnode_ptr_->output_index >= 0) {
// one of the end nodes. Allocate output buffer memory and
// reorder is necessary
mkldnn::memory::data_type t = MklDnnType<T>();
dnnl::memory::data_type t = DnnnType<T>();
InitDstReorderOutput(cpu_engine, t, net, net_args);
}
}
@ -357,7 +357,7 @@ class MklDnnConvBatchNorm : public MklDnnKernel {
}
}
virtual void ReorderWeights(const OrtCustomOpApi* api, OrtKernelContext* context, mkldnn::engine& cpu_engine) override {
virtual void ReorderWeights(const OrtCustomOpApi* api, OrtKernelContext* context, dnnl::engine& cpu_engine) override {
Ort::CustomOpApi ort{*api};
int input_index = mklnode_ptr_->input_start_index < 0 ? 0 : mklnode_ptr_->input_start_index;
const OrtValue* input_tensor = ort.KernelContext_GetInput(context, input_index + 1);
@ -369,7 +369,7 @@ class MklDnnConvBatchNorm : public MklDnnKernel {
TensorShape W(xshape, xdim);
const int group_mkl = static_cast<int>(group_);
mkldnn::memory::dims filter_dims_mkl;
dnnl::memory::dims filter_dims_mkl;
if (group_mkl == 1) {
filter_dims_mkl.assign(W.GetDims().begin(), W.GetDims().end());
} else {
@ -393,29 +393,29 @@ class MklDnnConvBatchNorm : public MklDnnKernel {
{
// lock to make sure reordering is done only once
std::lock_guard<OrtMutex> lock(provider_->GetMutex());
std::shared_ptr<mkldnn::memory> filter_dst_mem = provider_->GetWeightsMemoryBuffer(mklnode_ptr_->weight_name);
std::shared_ptr<dnnl::memory> filter_dst_mem = provider_->GetWeightsMemoryBuffer(mklnode_ptr_->weight_name);
if (filter_dst_mem == nullptr) {
mkldnn::memory src = mkldnn::memory({{filter_dims_mkl}, MklDnnType<T>(), filter_format_}, cpu_engine, (void*)weights_scaled_by_axis.data());
dnnl::memory src = dnnl::memory({{filter_dims_mkl}, DnnnType<T>(), filter_format_}, cpu_engine, (void*)weights_scaled_by_axis.data());
IAllocatorUniquePtr<void> filter_reorder_buffer =
IAllocator::MakeUniquePtr<void>(alloc_, filter_size_);
filter_dst_mem = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(conv_fwd_pd_->weights_desc(), cpu_engine, filter_reorder_buffer.get()));
filter_dst_mem = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(conv_fwd_pd_->weights_desc(), cpu_engine, filter_reorder_buffer.get()));
mkldnn::reorder(src, *filter_dst_mem)
dnnl::reorder(src, *filter_dst_mem)
.execute(cpu_engine, src, *filter_dst_mem);
provider_->SaveAllocatedMemory(std::move(filter_reorder_buffer));
provider_->SetWeightsMemoryBuffer(mklnode_ptr_->weight_name, filter_dst_mem);
}
std::shared_ptr<mkldnn::memory> bias_mem = provider_->GetBiasMemoryBuffer(mklnode_ptr_->weight_name);
std::shared_ptr<dnnl::memory> bias_mem = provider_->GetBiasMemoryBuffer(mklnode_ptr_->weight_name);
if (bias_mem == nullptr) {
auto bias_size = conv_fwd_pd_.get()->bias_desc().get_size();
IAllocatorUniquePtr<void> bias_buffer =
IAllocator::MakeUniquePtr<void>(alloc_, bias_size);
bias_mem = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(conv_fwd_pd_->bias_desc(), cpu_engine, bias_buffer.get()));
bias_mem = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(conv_fwd_pd_->bias_desc(), cpu_engine, bias_buffer.get()));
float* bias_buffer_data = static_cast<float*>(bias_buffer.get());
if (mklnode_ptr_->num_inputs == 7) {
const OrtValue* conv_bias_tensor = ort.KernelContext_GetInput(context, input_index + 2);
@ -460,7 +460,7 @@ class MklDnnConvBatchNorm : public MklDnnKernel {
const OrtValue* winput_tensor = ort.KernelContext_GetInput(context, input_index + 1);
const T* filter_data = const_cast<T*>(ort.GetTensorData<T>(winput_tensor));
std::shared_ptr<mkldnn::memory> filter_dst_mem = provider_->GetWeightsMemoryBuffer(mklnode_ptr_->weight_name);
std::shared_ptr<dnnl::memory> filter_dst_mem = provider_->GetWeightsMemoryBuffer(mklnode_ptr_->weight_name);
if (filter_dst_mem == nullptr) {
ReorderWeights(api, context, GetEngine());
filter_dst_mem = provider_->GetWeightsMemoryBuffer(mklnode_ptr_->weight_name);
@ -468,7 +468,7 @@ class MklDnnConvBatchNorm : public MklDnnKernel {
filter_data = static_cast<T*>(filter_dst_mem->get_data_handle());
filter_mem_->set_data_handle(static_cast<void*>(const_cast<T*>(filter_data)));
std::shared_ptr<mkldnn::memory> bias_mem = provider_->GetBiasMemoryBuffer(mklnode_ptr_->weight_name);
std::shared_ptr<dnnl::memory> bias_mem = provider_->GetBiasMemoryBuffer(mklnode_ptr_->weight_name);
const T* bias_data = static_cast<T*>(bias_mem->get_data_handle());
bias_mem_->set_data_handle(static_cast<void*>(const_cast<T*>(bias_data)));
@ -574,28 +574,28 @@ class MklDnnConvBatchNorm : public MklDnnKernel {
}
private:
mkldnn::memory::desc filter_desc_;
mkldnn::memory::format_tag filter_format_;
dnnl::memory::desc filter_desc_;
dnnl::memory::format_tag filter_format_;
std::shared_ptr<mkldnn::memory> src_mem_from_;
std::unique_ptr<mkldnn::memory> src_mem_to_;
std::shared_ptr<dnnl::memory> src_mem_from_;
std::unique_ptr<dnnl::memory> src_mem_to_;
size_t src_size_;
size_t filter_size_;
size_t dst_size_;
std::shared_ptr<mkldnn::memory> src_mem_;
std::unique_ptr<mkldnn::memory> filter_mem_;
std::unique_ptr<mkldnn::memory> bias_mem_;
std::shared_ptr<dnnl::memory> src_mem_;
std::unique_ptr<dnnl::memory> filter_mem_;
std::unique_ptr<dnnl::memory> bias_mem_;
std::unique_ptr<mkldnn::convolution_forward::desc> fwd_desc_;
std::unique_ptr<dnnl::convolution_forward::desc> fwd_desc_;
std::unique_ptr<mkldnn::memory::desc> src_md_;
std::unique_ptr<mkldnn::memory::desc> filter_md_;
std::unique_ptr<mkldnn::memory::desc> bias_md_;
std::unique_ptr<dnnl::memory::desc> src_md_;
std::unique_ptr<dnnl::memory::desc> filter_md_;
std::unique_ptr<dnnl::memory::desc> bias_md_;
std::unique_ptr<mkldnn::convolution_forward::primitive_desc> conv_fwd_pd_;
std::unique_ptr<mkldnn::primitive> conv_fwd_;
std::unique_ptr<dnnl::convolution_forward::primitive_desc> conv_fwd_pd_;
std::unique_ptr<dnnl::primitive> conv_fwd_;
private:
IAllocatorUniquePtr<void> src_reorder_buffer_;
@ -683,7 +683,7 @@ class MklDnnConvBatchNorm : public MklDnnKernel {
}
private:
std::unique_ptr<mkldnn::stream> stream_;
std::unique_ptr<dnnl::stream> stream_;
std::vector<int64_t> kernel_shape_; // must use ComputeKernelShape(...), instead of kernel_shape_
AutoPadType auto_pad_;
int64_t group_;
@ -695,5 +695,5 @@ class MklDnnConvBatchNorm : public MklDnnKernel {
float alpha_;
float epsilon_ = 1e-5f; // attribute of fused batchnorm.
};
} // namespace mkl_dnn
} // namespace ort_dnnl
} // namespace onnxruntime
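The DnnlConvBatchNorm kernel above folds the batch-norm scale and shift into the convolution weights and bias (see ReorderWeights), leaves every memory descriptor at format_tag::any so DNNL can choose the layout, and attaches ReLU as an eltwise post-op when fuse_relu_ is set. A minimal standalone sketch of that construction, assuming the DNNL 1.x API used throughout this change and purely illustrative shapes:

// Standalone sketch: convolution with a fused ReLU post-op, descriptors left
// at format_tag::any so DNNL picks the layouts (shapes are made up).
#include "dnnl.hpp"

int main() {
  dnnl::engine cpu_engine(dnnl::engine::kind::cpu, 0);
  dnnl::stream stream(cpu_engine);

  dnnl::memory::dims src_dims = {1, 3, 224, 224};   // NCHW
  dnnl::memory::dims wei_dims = {16, 3, 3, 3};      // OIHW
  dnnl::memory::dims bia_dims = {16};
  dnnl::memory::dims dst_dims = {1, 16, 222, 222};  // 3x3 kernel, no padding

  auto any = dnnl::memory::format_tag::any;
  auto f32 = dnnl::memory::data_type::f32;
  dnnl::memory::desc src_md(src_dims, f32, any);
  dnnl::memory::desc wei_md(wei_dims, f32, any);
  dnnl::memory::desc bia_md(bia_dims, f32, any);
  dnnl::memory::desc dst_md(dst_dims, f32, any);

  dnnl::convolution_forward::desc fwd_desc(
      dnnl::prop_kind::forward_inference, dnnl::algorithm::convolution_direct,
      src_md, wei_md, bia_md, dst_md,
      /*strides*/ {1, 1}, /*dilations*/ {0, 0},
      /*padding_l*/ {0, 0}, /*padding_r*/ {0, 0});

  // Fuse ReLU the same way the kernel does: an eltwise post-op on the conv.
  dnnl::post_ops ops;
  ops.append_eltwise(1.0f, dnnl::algorithm::eltwise_relu, 0.0f, 0.0f);
  dnnl::primitive_attr attr;
  attr.set_post_ops(ops);

  dnnl::convolution_forward::primitive_desc pd(fwd_desc, attr, cpu_engine);

  // The primitive_desc reports whatever (possibly blocked) layouts DNNL chose.
  dnnl::memory src_mem(pd.src_desc(), cpu_engine);
  dnnl::memory wei_mem(pd.weights_desc(), cpu_engine);
  dnnl::memory bia_mem(pd.bias_desc(), cpu_engine);
  dnnl::memory dst_mem(pd.dst_desc(), cpu_engine);

  dnnl::convolution_forward(pd).execute(stream, {{DNNL_ARG_SRC, src_mem},
                                                 {DNNL_ARG_WEIGHTS, wei_mem},
                                                 {DNNL_ARG_BIAS, bia_mem},
                                                 {DNNL_ARG_DST, dst_mem}});
  stream.wait();
  return 0;
}

Because the descriptors start as format_tag::any, pd.src_desc() and pd.dst_desc() may name blocked layouts; the kernel bridges those with the reorders set up above and in InitDstReorderOutput.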


@ -4,21 +4,21 @@
#pragma warning(disable : 4505) //Unreferenced local function has been removed
#endif
#include "mkldnn_func_kernel.h"
#include "dnnl_func_kernel.h"
#include "core/common/exceptions.h"
#include "core/session/onnxruntime_cxx_api.h"
#include "core/providers/mkldnn/mkldnn_common.h"
#include "core/providers/mkldnn/subgraph/mkldnn_conv.h"
#include "core/providers/mkldnn/subgraph/mkldnn_batchnorm.h"
#include "core/providers/mkldnn/subgraph/mkldnn_conv_batchnorm.h"
#include "core/providers/mkldnn/subgraph/mkldnn_activations.h"
#include "core/providers/mkldnn/subgraph/mkldnn_pool.h"
#include "core/providers/mkldnn/subgraph/mkldnn_sum.h"
#include "core/providers/mkldnn/subgraph/mkldnn_lrn.h"
#include "core/providers/dnnl/dnnl_common.h"
#include "core/providers/dnnl/subgraph/dnnl_conv.h"
#include "core/providers/dnnl/subgraph/dnnl_batchnorm.h"
#include "core/providers/dnnl/subgraph/dnnl_conv_batchnorm.h"
#include "core/providers/dnnl/subgraph/dnnl_activations.h"
#include "core/providers/dnnl/subgraph/dnnl_pool.h"
#include "core/providers/dnnl/subgraph/dnnl_sum.h"
#include "core/providers/dnnl/subgraph/dnnl_lrn.h"
#include "core/session/onnxruntime_cxx_api.h"
namespace onnxruntime {
namespace mkl_dnn {
namespace ort_dnnl {
namespace {
template <typename T>
@ -28,7 +28,7 @@ class SubgraphPrimitive : public PrimitiveBase {
OrtKernelContext* context,
const SubgraphParams& params)
: cpu_engine_(GetEngine()) {
context_.stream = onnxruntime::make_unique<mkldnn::stream>(mkldnn::stream(cpu_engine_));
context_.stream = onnxruntime::make_unique<dnnl::stream>(dnnl::stream(cpu_engine_));
if (context_.net.size() == 0) {
CreateKernels(params);
@ -59,124 +59,124 @@ class SubgraphPrimitive : public PrimitiveBase {
private:
void CreateKernels(const SubgraphParams& params) {
for (const auto& mkldnn_node : params.subgraph->mkldnn_nodes) {
if (mkldnn_node.name == "Conv") {
for (const auto& dnnl_node : params.subgraph->dnnl_nodes) {
if (dnnl_node.name == "Conv") {
std::ostringstream os;
os << "Conv-" << mkldnn_node.node_index << "-";
std::shared_ptr<MklDnnConv<T>> kernel;
kernel = std::make_shared<MklDnnConv<T>>(mkldnn_node, params.provider, params.attributes, os.str());
for (auto index : mkldnn_node.parent_nodes) {
os << "Conv-" << dnnl_node.node_index << "-";
std::shared_ptr<DnnlConv<T>> kernel;
kernel = std::make_shared<DnnlConv<T>>(dnnl_node, params.provider, params.attributes, os.str());
for (auto index : dnnl_node.parent_nodes) {
kernel->parents_.push_back(context_.kernels[index]);
}
context_.kernels.push_back(kernel);
} else if (mkldnn_node.name == "Conv-Relu") {
} else if (dnnl_node.name == "Conv-Relu") {
std::ostringstream os;
os << "Conv-" << mkldnn_node.node_index << "-";
std::shared_ptr<MklDnnConv<T>> kernel;
kernel = std::make_shared<MklDnnConv<T>>(mkldnn_node, params.provider, params.attributes, os.str());
os << "Conv-" << dnnl_node.node_index << "-";
std::shared_ptr<DnnlConv<T>> kernel;
kernel = std::make_shared<DnnlConv<T>>(dnnl_node, params.provider, params.attributes, os.str());
kernel->fuse_relu_ = true;
for (auto index : mkldnn_node.parent_nodes) {
for (auto index : dnnl_node.parent_nodes) {
kernel->parents_.push_back(context_.kernels[index]);
}
context_.kernels.push_back(kernel);
} else if (mkldnn_node.name == "Relu") {
} else if (dnnl_node.name == "Relu") {
std::ostringstream os;
os << "Relu-" << mkldnn_node.node_index << "-";
std::shared_ptr<MklDnnRelu<T>> kernel;
kernel = std::make_shared<MklDnnRelu<T>>(mkldnn_node, params.provider, params.attributes, os.str());
for (auto index : mkldnn_node.parent_nodes) {
os << "Relu-" << dnnl_node.node_index << "-";
std::shared_ptr<DnnlRelu<T>> kernel;
kernel = std::make_shared<DnnlRelu<T>>(dnnl_node, params.provider, params.attributes, os.str());
for (auto index : dnnl_node.parent_nodes) {
kernel->parents_.push_back(context_.kernels[index]);
}
context_.kernels.push_back(kernel);
} else if (mkldnn_node.name == "BatchNormalization") {
} else if (dnnl_node.name == "BatchNormalization") {
std::ostringstream os;
os << "BatchNormalization-" << mkldnn_node.node_index << "-";
std::shared_ptr<MklDnnBatchNorm<T>> kernel;
kernel = std::make_shared<MklDnnBatchNorm<T>>(mkldnn_node, params.provider, params.attributes, os.str());
for (auto index : mkldnn_node.parent_nodes) {
os << "BatchNormalization-" << dnnl_node.node_index << "-";
std::shared_ptr<DnnlBatchNorm<T>> kernel;
kernel = std::make_shared<DnnlBatchNorm<T>>(dnnl_node, params.provider, params.attributes, os.str());
for (auto index : dnnl_node.parent_nodes) {
kernel->parents_.push_back(context_.kernels[index]);
}
context_.kernels.push_back(kernel);
} else if (mkldnn_node.name == "BatchNormalization-Relu") {
} else if (dnnl_node.name == "BatchNormalization-Relu") {
std::ostringstream os;
os << "BatchNormalization-" << mkldnn_node.node_index << "-";
std::shared_ptr<MklDnnBatchNorm<T>> kernel;
kernel = std::make_shared<MklDnnBatchNorm<T>>(mkldnn_node, params.provider, params.attributes, os.str());
os << "BatchNormalization-" << dnnl_node.node_index << "-";
std::shared_ptr<DnnlBatchNorm<T>> kernel;
kernel = std::make_shared<DnnlBatchNorm<T>>(dnnl_node, params.provider, params.attributes, os.str());
kernel->fuse_relu_ = true;
for (auto index : mkldnn_node.parent_nodes) {
for (auto index : dnnl_node.parent_nodes) {
kernel->parents_.push_back(context_.kernels[index]);
}
context_.kernels.push_back(kernel);
} else if (mkldnn_node.name == "Conv-BatchNormalization") {
} else if (dnnl_node.name == "Conv-BatchNormalization") {
std::ostringstream os;
os << "Conv-" << mkldnn_node.node_index << "-";
std::shared_ptr<MklDnnConvBatchNorm<T>> kernel;
kernel = std::make_shared<MklDnnConvBatchNorm<T>>(mkldnn_node, params.provider, params.attributes, os.str());
for (auto index : mkldnn_node.parent_nodes) {
os << "Conv-" << dnnl_node.node_index << "-";
std::shared_ptr<DnnlConvBatchNorm<T>> kernel;
kernel = std::make_shared<DnnlConvBatchNorm<T>>(dnnl_node, params.provider, params.attributes, os.str());
for (auto index : dnnl_node.parent_nodes) {
kernel->parents_.push_back(context_.kernels[index]);
}
context_.kernels.push_back(kernel);
} else if (mkldnn_node.name == "Conv-BatchNormalization-Relu") {
} else if (dnnl_node.name == "Conv-BatchNormalization-Relu") {
std::ostringstream os;
os << "Conv-" << mkldnn_node.node_index << "-";
std::shared_ptr<MklDnnConvBatchNorm<T>> kernel;
kernel = std::make_shared<MklDnnConvBatchNorm<T>>(mkldnn_node, params.provider, params.attributes, os.str());
os << "Conv-" << dnnl_node.node_index << "-";
std::shared_ptr<DnnlConvBatchNorm<T>> kernel;
kernel = std::make_shared<DnnlConvBatchNorm<T>>(dnnl_node, params.provider, params.attributes, os.str());
kernel->fuse_relu_ = true;
for (auto index : mkldnn_node.parent_nodes) {
for (auto index : dnnl_node.parent_nodes) {
kernel->parents_.push_back(context_.kernels[index]);
}
context_.kernels.push_back(kernel);
} else if (mkldnn_node.name == "MaxPool") {
} else if (dnnl_node.name == "MaxPool") {
std::ostringstream os;
os << "MaxPool-" << mkldnn_node.node_index << "-";
std::shared_ptr<MklDnnPool<T>> kernel;
kernel = std::make_shared<MklDnnPool<T>>(mkldnn_node, params.provider, params.attributes, os.str());
for (auto index : mkldnn_node.parent_nodes) {
os << "MaxPool-" << dnnl_node.node_index << "-";
std::shared_ptr<DnnlPool<T>> kernel;
kernel = std::make_shared<DnnlPool<T>>(dnnl_node, params.provider, params.attributes, os.str());
for (auto index : dnnl_node.parent_nodes) {
kernel->parents_.push_back(context_.kernels[index]);
}
context_.kernels.push_back(kernel);
} else if (mkldnn_node.name == "GlobalMaxPool") {
} else if (dnnl_node.name == "GlobalMaxPool") {
std::ostringstream os;
os << "GlobalMaxPool-" << mkldnn_node.node_index << "-";
std::shared_ptr<MklDnnPool<T>> kernel;
kernel = std::make_shared<MklDnnPool<T>>(mkldnn_node, params.provider, params.attributes, os.str());
for (auto index : mkldnn_node.parent_nodes) {
os << "GlobalMaxPool-" << dnnl_node.node_index << "-";
std::shared_ptr<DnnlPool<T>> kernel;
kernel = std::make_shared<DnnlPool<T>>(dnnl_node, params.provider, params.attributes, os.str());
for (auto index : dnnl_node.parent_nodes) {
kernel->parents_.push_back(context_.kernels[index]);
}
context_.kernels.push_back(kernel);
} else if (mkldnn_node.name == "AveragePool") {
} else if (dnnl_node.name == "AveragePool") {
std::ostringstream os;
os << "AveragePool-" << mkldnn_node.node_index << "-";
std::shared_ptr<MklDnnPool<T>> kernel;
kernel = std::make_shared<MklDnnPool<T>>(mkldnn_node, params.provider, params.attributes, os.str());
for (auto index : mkldnn_node.parent_nodes) {
os << "AveragePool-" << dnnl_node.node_index << "-";
std::shared_ptr<DnnlPool<T>> kernel;
kernel = std::make_shared<DnnlPool<T>>(dnnl_node, params.provider, params.attributes, os.str());
for (auto index : dnnl_node.parent_nodes) {
kernel->parents_.push_back(context_.kernels[index]);
}
context_.kernels.push_back(kernel);
} else if (mkldnn_node.name == "GlobalAveragePool") {
} else if (dnnl_node.name == "GlobalAveragePool") {
std::ostringstream os;
os << "GlobalAveragePool-" << mkldnn_node.node_index << "-";
std::shared_ptr<MklDnnPool<T>> kernel;
kernel = std::make_shared<MklDnnPool<T>>(mkldnn_node, params.provider, params.attributes, os.str());
for (auto index : mkldnn_node.parent_nodes) {
os << "GlobalAveragePool-" << dnnl_node.node_index << "-";
std::shared_ptr<DnnlPool<T>> kernel;
kernel = std::make_shared<DnnlPool<T>>(dnnl_node, params.provider, params.attributes, os.str());
for (auto index : dnnl_node.parent_nodes) {
kernel->parents_.push_back(context_.kernels[index]);
}
context_.kernels.push_back(kernel);
} else if (mkldnn_node.name == "LRN") {
} else if (dnnl_node.name == "LRN") {
std::ostringstream os;
os << "LRN-" << mkldnn_node.node_index << "-";
std::shared_ptr<MklDnnLrn<T>> kernel;
kernel = std::make_shared<MklDnnLrn<T>>(mkldnn_node, params.provider, params.attributes, os.str());
for (auto index : mkldnn_node.parent_nodes) {
os << "LRN-" << dnnl_node.node_index << "-";
std::shared_ptr<DnnlLrn<T>> kernel;
kernel = std::make_shared<DnnlLrn<T>>(dnnl_node, params.provider, params.attributes, os.str());
for (auto index : dnnl_node.parent_nodes) {
kernel->parents_.push_back(context_.kernels[index]);
}
context_.kernels.push_back(kernel);
} else if (mkldnn_node.name == "Sum") {
} else if (dnnl_node.name == "Sum") {
std::ostringstream os;
os << "Sum-" << mkldnn_node.node_index << "-";
std::shared_ptr<MklDnnSum<T>> kernel;
kernel = std::make_shared<MklDnnSum<T>>(mkldnn_node, params.provider, params.attributes, os.str());
for (auto index : mkldnn_node.parent_nodes) {
os << "Sum-" << dnnl_node.node_index << "-";
std::shared_ptr<DnnlSum<T>> kernel;
kernel = std::make_shared<DnnlSum<T>>(dnnl_node, params.provider, params.attributes, os.str());
for (auto index : dnnl_node.parent_nodes) {
kernel->parents_.push_back(context_.kernels[index]);
}
context_.kernels.push_back(kernel);
@ -185,16 +185,16 @@ class SubgraphPrimitive : public PrimitiveBase {
}
struct SubgraphContext {
std::unique_ptr<mkldnn::stream> stream;
std::vector<mkldnn::primitive> net;
std::vector<std::unordered_map<int, mkldnn::memory>> net_args;
std::vector<std::shared_ptr<MklDnnKernel>> kernels;
std::unique_ptr<dnnl::stream> stream;
std::vector<dnnl::primitive> net;
std::vector<std::unordered_map<int, dnnl::memory>> net_args;
std::vector<std::shared_ptr<DnnlKernel>> kernels;
SubgraphContext() : stream(nullptr) {}
};
void Initialize(const OrtCustomOpApi* api, OrtKernelContext* context) {
// Propagate mkldnn block format
// Propagate Dnnl block format
// dst format of current node to src format of next node
for (auto& kernel : context_.kernels) {
kernel->CreatePrimitives(api, context, cpu_engine_, context_.net, context_.net_args);
@ -205,10 +205,10 @@ class SubgraphPrimitive : public PrimitiveBase {
}
SubgraphContext context_;
mkldnn::engine& cpu_engine_;
dnnl::engine& cpu_engine_;
};
// Pool which allows for reuse of MKLDNN Conv primitives which are expensive to instantiate.
// Pool which allows for reuse of DNNL Conv primitives which are expensive to instantiate.
// To address thread safety, the primitives are stored in a map on thread local storage.
template <typename T>
class SubgraphPrimitivePool : public PrimitivePool<T> {
@ -218,7 +218,7 @@ class SubgraphPrimitivePool : public PrimitivePool<T> {
const SubgraphParams& params) {
Ort::CustomOpApi ort{*api};
std::string dims_str;
for (auto i = 0; i < params.subgraph->mkldnn_nodes[0].num_inputs; i++) {
for (auto i = 0; i < params.subgraph->dnnl_nodes[0].num_inputs; i++) {
const OrtValue* input_tensor = ort.KernelContext_GetInput(context, i);
auto tensor_info = ort.GetTensorTypeAndShape(input_tensor);
auto tensor_shape = ort.GetTensorShape(tensor_info);
@ -227,7 +227,7 @@ class SubgraphPrimitivePool : public PrimitivePool<T> {
auto dim = tensor_shape.size();
TensorShape x_shape(shape, dim);
mkldnn::memory::dims src_dims(x_shape.GetDims().begin(), x_shape.GetDims().end());
dnnl::memory::dims src_dims(x_shape.GetDims().begin(), x_shape.GetDims().end());
AddDimsToKey(dims_str, src_dims);
}
@ -254,20 +254,20 @@ class SubgraphPrimitivePool : public PrimitivePool<T> {
} // namespace
template <typename T>
Status MkldnnFuncKernel<T>::Compute(const OrtCustomOpApi* api, OrtKernelContext* context) const {
Status DnnlFuncKernel<T>::Compute(const OrtCustomOpApi* api, OrtKernelContext* context) const {
Status status;
try {
SubgraphPrimitive<T>* primitive = SubgraphPrimitivePool<T>::Get(api, context, params_);
primitive->UpdateProvider(params_);
status = primitive->Compute(api, context);
} catch (const mkldnn::error& e) {
} catch (const dnnl::error& e) {
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Status: ", e.status,
", message: ", e.what());
}
return status;
}
template class MkldnnFuncKernel<float>;
template class DnnlFuncKernel<float>;
} // namespace mkl_dnn
} // namespace ort_dnnl
} // namespace onnxruntime
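SubgraphPrimitive wires the per-node kernels together (each child reuses its parent's primitive_dst_mem_ as its source), and SubgraphPrimitivePool avoids rebuilding these expensive objects by caching them per thread, keyed on the subgraph plus the concrete input dimensions. A hedged sketch of that caching pattern, with illustrative names (PrimitiveCache, BuildKey) rather than the ORT classes:

// Sketch of the pattern only: a per-thread map keyed by subgraph id + input
// dims, so the lookup path needs no locking.
#include <cstdint>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

template <typename Primitive>
class PrimitiveCache {
 public:
  // Returns the cached primitive for `key`, building one on first use.
  template <typename Factory>
  static Primitive* GetOrCreate(const std::string& key, Factory make) {
    thread_local std::unordered_map<std::string, std::unique_ptr<Primitive>> cache;
    auto it = cache.find(key);
    if (it == cache.end()) {
      it = cache.emplace(key, make()).first;
    }
    return it->second.get();
  }
};

// Key construction in the spirit of AddDimsToKey: append every input's dims,
// since a different shape generally requires different DNNL primitives.
inline std::string BuildKey(const std::string& subgraph_id,
                            const std::vector<std::vector<int64_t>>& input_dims) {
  std::string key = subgraph_id;
  for (const auto& dims : input_dims) {
    key += '#';
    for (int64_t d : dims) key += std::to_string(d) + '_';
  }
  return key;
}

Keeping the map in thread-local storage trades memory (one primitive per thread per shape) for a lock-free lookup, which is the trade-off the SubgraphPrimitivePool comment describes.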


@ -3,18 +3,18 @@
#pragma once
#include "core/graph/onnx_protobuf.h"
#include "core/providers/mkldnn/mkldnn_execution_provider.h"
#include "core/providers/dnnl/dnnl_execution_provider.h"
#include "core/session/onnxruntime_c_api.h"
#include "core/framework/func_api.h"
#include "mkldnn_kernel.h"
#include "dnnl_kernel.h"
namespace onnxruntime {
namespace mkl_dnn {
namespace ort_dnnl {
namespace {
struct SubgraphParams {
NodeAttributes attributes;
MKLDNNExecutionProvider* provider;
DNNLExecutionProvider* provider;
std::shared_ptr<Subgraph> subgraph;
std::string subgraph_id;
std::string subgraph_key;
@ -24,11 +24,11 @@ struct SubgraphParams {
} // namespace
template <typename T>
class MkldnnFuncKernel {
class DnnlFuncKernel {
public:
explicit MkldnnFuncKernel(const ComputeContext* context,
explicit DnnlFuncKernel(const ComputeContext* context,
const NodeAttributes& attributes,
MKLDNNExecutionProvider* provider) {
DNNLExecutionProvider* provider) {
ORT_UNUSED_PARAMETER(context);
params_.provider = provider;
@ -37,28 +37,28 @@ class MkldnnFuncKernel {
auto sub_it = attributes.find("subgraph_id");
if (sub_it->second.type() == ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_STRING) {
params_.subgraph_id = sub_it->second.s();
params_.subgraph = provider->GetMklDnnSubgraph(params_.subgraph_id);
params_.subgraph = provider->GetDnnlSubgraph(params_.subgraph_id);
std::ostringstream key_os;
key_os << params_.subgraph->graph_name << "_" << params_.subgraph_id << "-";
key_os << params_.subgraph->mkldnn_nodes.back().ToString() << "-";
key_os << params_.subgraph->mkldnn_nodes.back().output_name;
key_os << params_.subgraph->dnnl_nodes.back().ToString() << "-";
key_os << params_.subgraph->dnnl_nodes.back().output_name;
if (params_.subgraph->mkldnn_nodes[0].name == "Conv") {
if (params_.subgraph->dnnl_nodes[0].name == "Conv") {
std::ostringstream os;
os << "Conv-" << params_.subgraph->mkldnn_nodes[0].node_index << "-";
os << "Conv-" << params_.subgraph->dnnl_nodes[0].node_index << "-";
key_os << GetConvAttributeKey(attributes, os.str());
}
if (params_.subgraph->mkldnn_nodes[0].name == "LRN") {
if (params_.subgraph->dnnl_nodes[0].name == "LRN") {
std::ostringstream os;
os << "LRN-" << params_.subgraph->mkldnn_nodes[0].node_index << "-";
os << "LRN-" << params_.subgraph->dnnl_nodes[0].node_index << "-";
key_os << GetLrnAttributeKey(attributes, os.str());
}
if (params_.subgraph->mkldnn_nodes[0].name.find("Pool") != std::string::npos) {
if (params_.subgraph->dnnl_nodes[0].name.find("Pool") != std::string::npos) {
std::ostringstream os;
os << params_.subgraph->mkldnn_nodes[0].name << "-" << params_.subgraph->mkldnn_nodes[0].node_index << "-";
os << params_.subgraph->dnnl_nodes[0].name << "-" << params_.subgraph->dnnl_nodes[0].node_index << "-";
key_os << GetPoolAttributesKey(attributes, os.str());
}
@ -231,5 +231,5 @@ class MkldnnFuncKernel {
private:
SubgraphParams params_;
};
} // namespace mkl_dnn
} // namespace ort_dnnl
} // namespace onnxruntime


@ -0,0 +1,61 @@
// Copyright(C) 2019 Intel Corporation
// Licensed under the MIT License
#include "dnnl_kernel.h"
namespace onnxruntime {
namespace ort_dnnl {
void DnnlKernel::InitDstReorderOutput(dnnl::engine& cpu_engine,
dnnl::memory::data_type& data_type,
std::vector<dnnl::primitive>& net,
std::vector<std::unordered_map<int, dnnl::memory>>& net_args) {
// Allocate dst buffer if reorder is necessary
if (primitive_dst_desc_ != ort_source_desc_) {
// reorder to ONNXRuntime format
dnnl::memory::dims dst_dims_mkl(
primitive_dst_shape_.GetDims().begin(), primitive_dst_shape_.GetDims().end());
dnnl::memory::desc dst_des = dnnl::memory::desc(dst_dims_mkl,
data_type, ort_source_format_);
reorder_dst_mem_to_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(dst_des, cpu_engine));
net.push_back(dnnl::reorder(*primitive_dst_mem_, *reorder_dst_mem_to_));
net_args.push_back({{DNNL_ARG_FROM, *primitive_dst_mem_},
{DNNL_ARG_TO, *reorder_dst_mem_to_}});
}
}
dnnl::memory::format_tag DnnlKernel::GetSourceFormat(int dim_size) {
dnnl::memory::format_tag source_format = dnnl::memory::format_tag::any;
switch (dim_size) {
case 1: {
source_format = dnnl::memory::format_tag::x;
break;
}
case 2: {
source_format = dnnl::memory::format_tag::nc;
break;
}
case 3: {
source_format = dnnl::memory::format_tag::ntc;
break;
}
case 4: {
source_format = dnnl::memory::format_tag::nchw;
break;
}
case 5: {
source_format = dnnl::memory::format_tag::ncdhw;
break;
}
default: {
source_format = dnnl::memory::format_tag::any;
break;
}
}
return source_format;
}
} // namespace ort_dnnl
} // namespace onnxruntime
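The new dnnl_kernel.cc adds the one reorder a subgraph output may still need: when the layout a primitive chose (primitive_dst_desc_) differs from the plain ONNX Runtime layout (ort_source_desc_), InitDstReorderOutput appends a dnnl::reorder to the net. A minimal standalone sketch of that reorder, assuming DNNL 1.x and an nChw8c blocked source chosen purely for illustration:

// Copy data from a blocked layout picked by DNNL back into plain nchw.
#include "dnnl.hpp"

void reorder_to_nchw_example() {
  dnnl::engine cpu_engine(dnnl::engine::kind::cpu, 0);
  dnnl::stream stream(cpu_engine);

  dnnl::memory::dims dims = {1, 8, 16, 16};
  dnnl::memory::desc blocked_md(dims, dnnl::memory::data_type::f32,
                                dnnl::memory::format_tag::nChw8c);
  dnnl::memory::desc nchw_md(dims, dnnl::memory::data_type::f32,
                             dnnl::memory::format_tag::nchw);

  dnnl::memory blocked_mem(blocked_md, cpu_engine);  // plays primitive_dst_mem_
  dnnl::memory nchw_mem(nchw_md, cpu_engine);        // plays reorder_dst_mem_to_

  // Equivalent to pushing dnnl::reorder plus DNNL_ARG_FROM/TO onto the net,
  // but executed eagerly here.
  dnnl::reorder(blocked_mem, nchw_mem)
      .execute(stream, {{DNNL_ARG_FROM, blocked_mem}, {DNNL_ARG_TO, nchw_mem}});
  stream.wait();
}

When the two descriptors already match, InitDstReorderOutput adds nothing and the primitive writes straight into the buffer ONNX Runtime provides.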


@ -6,41 +6,41 @@
#pragma warning(disable : 4244)
#endif
#include "mkldnn.hpp"
#include "dnnl.hpp"
#include "core/common/cpuid_info.h"
#include "core/session/onnxruntime_cxx_api.h"
#include "core/providers/mkldnn/subgraph/subgraph.h"
#include "core/providers/mkldnn/mkldnn_execution_provider.h"
#include "core/providers/dnnl/subgraph/subgraph.h"
#include "core/providers/dnnl/dnnl_execution_provider.h"
namespace onnxruntime {
namespace mkl_dnn {
namespace ort_dnnl {
class MklDnnKernel {
class DnnlKernel {
public:
MklDnnKernel(const MklDnnNode& node,
MKLDNNExecutionProvider* provider) {
DnnlKernel(const DnnlNode& node,
DNNLExecutionProvider* provider) {
name_ = node.name;
mklnode_ptr_ = std::make_shared<MklDnnNode>(node);
mklnode_ptr_ = std::make_shared<DnnlNode>(node);
provider_ = provider;
alloc_ = provider_->GetAllocator(0, OrtMemTypeDefault);
}
virtual ~MklDnnKernel(){};
virtual ~DnnlKernel(){};
virtual void CreatePrimitives(const OrtCustomOpApi* api,
OrtKernelContext* context,
mkldnn::engine& cpu_engine,
std::vector<mkldnn::primitive>& net,
std::vector<std::unordered_map<int, mkldnn::memory>>& net_args) = 0;
dnnl::engine& cpu_engine,
std::vector<dnnl::primitive>& net,
std::vector<std::unordered_map<int, dnnl::memory>>& net_args) = 0;
virtual void ReorderWeights(const OrtCustomOpApi* api, OrtKernelContext* context, mkldnn::engine& cpu_engine) {
virtual void ReorderWeights(const OrtCustomOpApi* api, OrtKernelContext* context, dnnl::engine& cpu_engine) {
ORT_UNUSED_PARAMETER(api);
ORT_UNUSED_PARAMETER(context);
ORT_UNUSED_PARAMETER(cpu_engine);
}
virtual void SetProvider(MKLDNNExecutionProvider* provider) {
virtual void SetProvider(DNNLExecutionProvider* provider) {
provider_ = provider;
}
MKLDNNExecutionProvider* GetProvider() {
DNNLExecutionProvider* GetProvider() {
return provider_;
}
@ -79,43 +79,43 @@ class MklDnnKernel {
return Status::OK();
}
void InitDstReorderOutput(mkldnn::engine& cpu_engine,
mkldnn::memory::data_type& data_type,
std::vector<mkldnn::primitive>& net,
std::vector<std::unordered_map<int, mkldnn::memory>>& net_args);
void InitDstReorderOutput(dnnl::engine& cpu_engine,
dnnl::memory::data_type& data_type,
std::vector<dnnl::primitive>& net,
std::vector<std::unordered_map<int, dnnl::memory>>& net_args);
mkldnn::memory::format_tag GetSourceFormat(int dim_size);
dnnl::memory::format_tag GetSourceFormat(int dim_size);
public:
std::string name_;
std::vector<std::shared_ptr<MklDnnKernel>> parents_;
std::vector<std::shared_ptr<DnnlKernel>> parents_;
bool fuse_relu_ = false;
bool fuse_sum_ = false;
std::shared_ptr<mkldnn::memory> primitive_dst_mem_;
std::unique_ptr<mkldnn::memory::desc> primitive_dst_md_;
std::shared_ptr<dnnl::memory> primitive_dst_mem_;
std::unique_ptr<dnnl::memory::desc> primitive_dst_md_;
TensorShape primitive_dst_shape_;
mkldnn::memory::desc primitive_dst_desc_;
dnnl::memory::desc primitive_dst_desc_;
// ONNX Runtime format
mkldnn::memory::format_tag ort_source_format_;
mkldnn::memory::desc ort_source_desc_;
dnnl::memory::format_tag ort_source_format_;
dnnl::memory::desc ort_source_desc_;
// input format.
// It can be ORT format (nchw) or blocked memory format from parent node
// mkldnn::memory::format_tag source_format_ = mkldnn::memory::format_tag::any;
mkldnn::memory::desc source_desc_ = mkldnn::memory::desc();
// dnnl::memory::format_tag source_format_ = dnnl::memory::format_tag::any;
dnnl::memory::desc source_desc_ = dnnl::memory::desc();
Status primitive_created_status_;
protected:
// Pointer to MklNode of subgraph IR
std::shared_ptr<MklDnnNode> mklnode_ptr_;
std::shared_ptr<DnnlNode> mklnode_ptr_;
// input format expected by primitive object
mkldnn::memory::desc primitive_src_desc_;
dnnl::memory::desc primitive_src_desc_;
// memory used for reorders
std::unique_ptr<mkldnn::memory> reorder_dst_mem_to_;
std::unique_ptr<dnnl::memory> reorder_dst_mem_to_;
AllocatorPtr alloc_;
MKLDNNExecutionProvider* provider_;
DNNLExecutionProvider* provider_;
};
} // namespace mkl_dnn
} // namespace ort_dnnl
} // namespace onnxruntime


@ -5,28 +5,28 @@
#include "core/util/math.h"
#include "core/util/math_cpuonly.h"
#include "core/framework/op_kernel.h"
#include "core/providers/mkldnn/mkldnn_fwd.h"
#include "core/providers/mkldnn/mkldnn_execution_provider.h"
#include "core/providers/mkldnn/subgraph/mkldnn_kernel.h"
#include "core/providers/dnnl/dnnl_fwd.h"
#include "core/providers/dnnl/dnnl_execution_provider.h"
#include "core/providers/dnnl/subgraph/dnnl_kernel.h"
namespace onnxruntime {
namespace mkl_dnn {
namespace ort_dnnl {
template <typename T>
class MklDnnLrn : public MklDnnKernel {
class DnnlLrn : public DnnlKernel {
public:
MklDnnLrn(const MklDnnNode& node,
MKLDNNExecutionProvider* provider,
DnnlLrn(const DnnlNode& node,
DNNLExecutionProvider* provider,
const NodeAttributes& attributes,
const std::string attributes_prefix = "") : MklDnnKernel(node, provider) {
const std::string attributes_prefix = "") : DnnlKernel(node, provider) {
ReadAttributes(attributes, attributes_prefix);
}
void CreatePrimitives(const OrtCustomOpApi* api,
OrtKernelContext* context,
mkldnn::engine& cpu_engine,
std::vector<mkldnn::primitive>& net,
std::vector<std::unordered_map<int, mkldnn::memory>>& net_args) override {
dnnl::engine& cpu_engine,
std::vector<dnnl::primitive>& net,
std::vector<std::unordered_map<int, dnnl::memory>>& net_args) override {
Ort::CustomOpApi ort{*api};
int input_index = mklnode_ptr_->input_start_index < 0 ? 0 : mklnode_ptr_->input_start_index;
@ -42,18 +42,18 @@ class MklDnnLrn : public MklDnnKernel {
ort_source_format_ = GetSourceFormat(static_cast<int>(xdim));
x_shape = TensorShape(xshape, xdim);
mkldnn::memory::dims src_dims(
dnnl::memory::dims src_dims(
x_shape.GetDims().begin(), x_shape.GetDims().end());
ort_source_desc_ = mkldnn::memory::desc(
{src_dims}, MklDnnType<T>(), ort_source_format_);
src_md_ = onnxruntime::make_unique<mkldnn::memory::desc>(
mkldnn::memory::desc({src_dims}, MklDnnType<T>(), ort_source_format_));
src_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(*src_md_, cpu_engine, nullptr));
ort_source_desc_ = dnnl::memory::desc(
{src_dims}, DnnnType<T>(), ort_source_format_);
src_md_ = onnxruntime::make_unique<dnnl::memory::desc>(
dnnl::memory::desc({src_dims}, DnnnType<T>(), ort_source_format_));
src_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(*src_md_, cpu_engine, nullptr));
} else {
src_md_ = onnxruntime::make_unique<mkldnn::memory::desc>(
mkldnn::memory::desc(parents_[0].get()->primitive_dst_desc_));
src_md_ = onnxruntime::make_unique<dnnl::memory::desc>(
dnnl::memory::desc(parents_[0].get()->primitive_dst_desc_));
src_mem_ = parents_[0].get()->primitive_dst_mem_;
x_shape = parents_[0].get()->primitive_dst_shape_;
ort_source_format_ = parents_[0].get()->ort_source_format_;
@ -63,13 +63,13 @@ class MklDnnLrn : public MklDnnKernel {
primitive_dst_shape_ = TensorShape(x_shape);
mkldnn::algorithm algo = mkldnn::algorithm::lrn_across_channels;
fwd_desc_ = onnxruntime::make_unique<mkldnn::lrn_forward::desc>(
mkldnn::lrn_forward::desc(mkldnn::prop_kind::forward_scoring, algo, *src_md_,
dnnl::algorithm algo = dnnl::algorithm::lrn_across_channels;
fwd_desc_ = onnxruntime::make_unique<dnnl::lrn_forward::desc>(
dnnl::lrn_forward::desc(dnnl::prop_kind::forward_scoring, algo, *src_md_,
size_, alpha_, beta_, bias_));
fwd_primitive_desc_ = onnxruntime::make_unique<mkldnn::lrn_forward::primitive_desc>(
mkldnn::lrn_forward::primitive_desc(*fwd_desc_, cpu_engine));
fwd_primitive_desc_ = onnxruntime::make_unique<dnnl::lrn_forward::primitive_desc>(
dnnl::lrn_forward::primitive_desc(*fwd_desc_, cpu_engine));
primitive_src_desc_ = fwd_primitive_desc_.get()->src_desc();
primitive_dst_desc_ = fwd_primitive_desc_.get()->dst_desc();
@ -79,30 +79,30 @@ class MklDnnLrn : public MklDnnKernel {
if (primitive_dst_desc_ != ort_source_desc_) {
// reorder needed. Use primitive output as input to reorder and
// allocate buffer for reorder output, final output of this subgraph
primitive_dst_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(fwd_primitive_desc_.get()->dst_desc(), cpu_engine));
primitive_dst_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(fwd_primitive_desc_.get()->dst_desc(), cpu_engine));
} else {
// Last node but re-order not needed. Allocate buffer to output of this node
primitive_dst_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(fwd_primitive_desc_.get()->dst_desc(), cpu_engine, nullptr));
primitive_dst_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(fwd_primitive_desc_.get()->dst_desc(), cpu_engine, nullptr));
}
} else {
// Intermediate node. Use mkldnn kernel internal memory for output and
// Intermediate node. Use Dnnl kernel internal memory for output and
// use this as input to next node.
primitive_dst_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(fwd_primitive_desc_.get()->dst_desc(), cpu_engine));
primitive_dst_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(fwd_primitive_desc_.get()->dst_desc(), cpu_engine));
}
lrn_fwd_ = onnxruntime::make_unique<mkldnn::lrn_forward>(
mkldnn::lrn_forward(*fwd_primitive_desc_));
lrn_fwd_ = onnxruntime::make_unique<dnnl::lrn_forward>(
dnnl::lrn_forward(*fwd_primitive_desc_));
net.push_back(*lrn_fwd_);
net_args.push_back({{MKLDNN_ARG_SRC, *src_mem_},
{MKLDNN_ARG_DST, *primitive_dst_mem_}});
net_args.push_back({{DNNL_ARG_SRC, *src_mem_},
{DNNL_ARG_DST, *primitive_dst_mem_}});
if (mklnode_ptr_->output_index >= 0) {
// one of the end nodes. Allocate output buffer memory and
// reorder is necessary
mkldnn::memory::data_type t = MklDnnType<T>();
dnnl::memory::data_type t = DnnnType<T>();
InitDstReorderOutput(cpu_engine, t, net, net_args);
}
}
@ -172,13 +172,13 @@ class MklDnnLrn : public MklDnnKernel {
int size_ = 0;
private:
std::shared_ptr<mkldnn::memory> src_mem_;
std::shared_ptr<dnnl::memory> src_mem_;
std::unique_ptr<mkldnn::lrn_forward::desc> fwd_desc_;
std::unique_ptr<mkldnn::lrn_forward::primitive_desc> fwd_primitive_desc_;
std::unique_ptr<mkldnn::primitive> lrn_fwd_;
std::unique_ptr<dnnl::lrn_forward::desc> fwd_desc_;
std::unique_ptr<dnnl::lrn_forward::primitive_desc> fwd_primitive_desc_;
std::unique_ptr<dnnl::primitive> lrn_fwd_;
std::unique_ptr<mkldnn::memory::desc> src_md_;
std::unique_ptr<dnnl::memory::desc> src_md_;
};
} // namespace mkl_dnn
} // namespace ort_dnnl
} // namespace onnxruntime
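DnnlLrn maps the ONNX LRN attributes (size, alpha, beta, bias) directly onto dnnl::lrn_forward with lrn_across_channels. A hedged standalone sketch with made-up attribute values in place of the ones read from the node:

#include "dnnl.hpp"

void lrn_example() {
  dnnl::engine cpu_engine(dnnl::engine::kind::cpu, 0);
  dnnl::stream stream(cpu_engine);

  dnnl::memory::dims dims = {1, 16, 28, 28};
  dnnl::memory::desc src_md(dims, dnnl::memory::data_type::f32,
                            dnnl::memory::format_tag::nchw);

  const int local_size = 5;                           // ONNX "size"
  const float alpha = 1e-4f, beta = 0.75f, k = 1.0f;  // ONNX alpha/beta/bias

  dnnl::lrn_forward::desc desc(dnnl::prop_kind::forward_scoring,
                               dnnl::algorithm::lrn_across_channels,
                               src_md, local_size, alpha, beta, k);
  dnnl::lrn_forward::primitive_desc pd(desc, cpu_engine);

  dnnl::memory src_mem(src_md, cpu_engine);
  dnnl::memory dst_mem(pd.dst_desc(), cpu_engine);

  dnnl::lrn_forward(pd).execute(stream, {{DNNL_ARG_SRC, src_mem},
                                         {DNNL_ARG_DST, dst_mem}});
  stream.wait();
}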


@ -2,29 +2,29 @@
// Licensed under the MIT License.
#pragma once
#include "core/providers/mkldnn/mkldnn_fwd.h"
#include "core/providers/dnnl/dnnl_fwd.h"
#include "core/providers/cpu/nn/autopad_type.h"
#include "core/providers/mkldnn/mkldnn_execution_provider.h"
#include "core/providers/mkldnn/subgraph/mkldnn_kernel.h"
#include "core/providers/dnnl/dnnl_execution_provider.h"
#include "core/providers/dnnl/subgraph/dnnl_kernel.h"
#include "core/util/math.h"
namespace onnxruntime {
namespace mkl_dnn {
namespace ort_dnnl {
template <typename T>
class MklDnnPool : public MklDnnKernel {
class DnnlPool : public DnnlKernel {
public:
MklDnnPool(const MklDnnNode& node,
MKLDNNExecutionProvider* provider,
DnnlPool(const DnnlNode& node,
DNNLExecutionProvider* provider,
const NodeAttributes& attributes,
const std::string attributes_prefix = "") : MklDnnKernel(node, provider) {
const std::string attributes_prefix = "") : DnnlKernel(node, provider) {
op_name_ = node.name;
ReadAttributes(attributes, attributes_prefix);
}
void CreatePrimitives(const OrtCustomOpApi* api,
OrtKernelContext* context,
mkldnn::engine& cpu_engine, std::vector<mkldnn::primitive>& net,
std::vector<std::unordered_map<int, mkldnn::memory>>& net_args) override {
dnnl::engine& cpu_engine, std::vector<dnnl::primitive>& net,
std::vector<std::unordered_map<int, dnnl::memory>>& net_args) override {
Ort::CustomOpApi ort{*api};
int input_index = mklnode_ptr_->input_start_index < 0 ? 0 : mklnode_ptr_->input_start_index;
@ -36,26 +36,26 @@ class MklDnnPool : public MklDnnKernel {
auto xshape = tensor_shape.data();
auto xdim = tensor_shape.size();
mkldnn::memory::dims dims(xdim);
dnnl::memory::dims dims(xdim);
x_shape_ = TensorShape(xshape, xdim);
mkldnn::memory::dims src_dims_mkl(x_shape_.GetDims().begin(), x_shape_.GetDims().end());
dnnl::memory::dims src_dims_mkl(x_shape_.GetDims().begin(), x_shape_.GetDims().end());
ort_source_format_ = GetSourceFormat(static_cast<int>(xdim));
// ort_source_desc is the format of ONNX Runtime tensor format
ort_source_desc_ = mkldnn::memory::desc({src_dims_mkl}, MklDnnType<T>(), ort_source_format_);
ort_source_desc_ = dnnl::memory::desc({src_dims_mkl}, DnnnType<T>(), ort_source_format_);
// source_desc is propagating format. input to this op.
source_desc_ = mkldnn::memory::desc({src_dims_mkl}, MklDnnType<T>(), ort_source_format_);
source_desc_ = dnnl::memory::desc({src_dims_mkl}, DnnnType<T>(), ort_source_format_);
// reorder for better performance
mkldnn::memory::format_tag src_format = GetAVXFormat(src_dims_mkl);
src_md_ = onnxruntime::make_unique<mkldnn::memory::desc>(
mkldnn::memory::desc({src_dims_mkl}, MklDnnType<T>(), src_format));
dnnl::memory::format_tag src_format = GetAVXFormat(src_dims_mkl);
src_md_ = onnxruntime::make_unique<dnnl::memory::desc>(
dnnl::memory::desc({src_dims_mkl}, DnnnType<T>(), src_format));
} else {
// get the output of previous node (mkldnn block propagation).
// get the output of previous node (Dnnl block propagation).
// TODO Sourcenode will set src of this node.
x_shape_ = parents_[0].get()->primitive_dst_shape_;
source_desc_ = parents_[0].get()->primitive_dst_desc_;
mkldnn::memory::dims src_dims_mkl(x_shape_.GetDims().begin(), x_shape_.GetDims().end());
dnnl::memory::dims src_dims_mkl(x_shape_.GetDims().begin(), x_shape_.GetDims().end());
ort_source_format_ = parents_[0].get()->ort_source_format_;
ort_source_desc_ = parents_[0].get()->ort_source_desc_;
@ -63,12 +63,12 @@ class MklDnnPool : public MklDnnKernel {
if (source_desc_ == ort_source_desc_) {
// reorder for better performance
mkldnn::memory::format_tag fmt = GetAVXFormat(src_dims_mkl);
src_md_ = onnxruntime::make_unique<mkldnn::memory::desc>(
mkldnn::memory::desc({src_dims_mkl}, MklDnnType<T>(), fmt));
dnnl::memory::format_tag fmt = GetAVXFormat(src_dims_mkl);
src_md_ = onnxruntime::make_unique<dnnl::memory::desc>(
dnnl::memory::desc({src_dims_mkl}, DnnnType<T>(), fmt));
} else {
src_md_ = onnxruntime::make_unique<mkldnn::memory::desc>(
mkldnn::memory::desc(parents_[0].get()->primitive_dst_mem_->get_desc()));
src_md_ = onnxruntime::make_unique<dnnl::memory::desc>(
dnnl::memory::desc(parents_[0].get()->primitive_dst_mem_->get_desc()));
}
}
@ -78,7 +78,7 @@ class MklDnnPool : public MklDnnKernel {
if (x_shape_.NumDimensions() <= 3) {
primitive_created_status_ = ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"1D Pooling is not supported by MKLDNN.");
"1D Pooling is not supported by DNNL.");
}
if (global_pooling_) {
@ -87,35 +87,35 @@ class MklDnnPool : public MklDnnKernel {
strides_.assign(kernel_shape_.size(), 1);
}
mkldnn::memory::dims dst_dims_mkl(y_dims.begin(), y_dims.end());
mkldnn::memory::dims kernel_mkl(kernel_shape_.begin(), kernel_shape_.end());
mkldnn::memory::dims strides_mkl(strides_.begin(), strides_.end());
mkldnn::memory::dims padding_left_mkl(pads_.begin(), pads_.begin() + (pads_.size() / 2));
mkldnn::memory::dims padding_right_mkl(pads_.begin() + (pads_.size() / 2), pads_.end());
dnnl::memory::dims dst_dims_mkl(y_dims.begin(), y_dims.end());
dnnl::memory::dims kernel_mkl(kernel_shape_.begin(), kernel_shape_.end());
dnnl::memory::dims strides_mkl(strides_.begin(), strides_.end());
dnnl::memory::dims padding_left_mkl(pads_.begin(), pads_.begin() + (pads_.size() / 2));
dnnl::memory::dims padding_right_mkl(pads_.begin() + (pads_.size() / 2), pads_.end());
primitive_dst_md_ = onnxruntime::make_unique<mkldnn::memory::desc>(
mkldnn::memory::desc({dst_dims_mkl}, MklDnnType<T>(), mkldnn::memory::format_tag::any));
primitive_dst_md_ = onnxruntime::make_unique<dnnl::memory::desc>(
dnnl::memory::desc({dst_dims_mkl}, DnnnType<T>(), dnnl::memory::format_tag::any));
mkldnn::algorithm algo = mkldnn::algorithm::pooling_max;
dnnl::algorithm algo = dnnl::algorithm::pooling_max;
if (op_name_ == "AveragePool" || op_name_ == "GlobalAveragePool") {
algo = mkldnn::algorithm::pooling_avg_exclude_padding;
algo = dnnl::algorithm::pooling_avg_exclude_padding;
if (count_include_pad_) {
algo = mkldnn::algorithm::pooling_avg_include_padding;
algo = dnnl::algorithm::pooling_avg_include_padding;
}
}
fwd_desc_ = onnxruntime::make_unique<mkldnn::pooling_forward::desc>(
mkldnn::pooling_forward::desc(mkldnn::prop_kind::forward_inference, algo,
fwd_desc_ = onnxruntime::make_unique<dnnl::pooling_forward::desc>(
dnnl::pooling_forward::desc(dnnl::prop_kind::forward_inference, algo,
*src_md_, *primitive_dst_md_,
strides_mkl, kernel_mkl,
padding_left_mkl, padding_right_mkl));
fwd_primitive_desc_ = onnxruntime::make_unique<mkldnn::pooling_forward::primitive_desc>(
mkldnn::pooling_forward::primitive_desc(*fwd_desc_, cpu_engine));
fwd_primitive_desc_ = onnxruntime::make_unique<dnnl::pooling_forward::primitive_desc>(
dnnl::pooling_forward::primitive_desc(*fwd_desc_, cpu_engine));
if (mklnode_ptr_->parent_nodes.empty()) {
// Sub-graph's first node. Read input from input buffer
src_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(fwd_primitive_desc_.get()->src_desc(), cpu_engine, nullptr));
src_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(fwd_primitive_desc_.get()->src_desc(), cpu_engine, nullptr));
} else {
// Sub-graph's inner node. set input to parent's output
src_mem_ = parents_[0].get()->primitive_dst_mem_;
@ -129,24 +129,24 @@ class MklDnnPool : public MklDnnKernel {
// reorder source memory for best performance (AVX512);
if (primitive_src_desc_ != source_desc_) {
mkldnn::memory::dims src_dims(x_shape_.GetDims().begin(), x_shape_.GetDims().end());
auto pd = mkldnn::memory::desc(source_desc_);
dnnl::memory::dims src_dims(x_shape_.GetDims().begin(), x_shape_.GetDims().end());
auto pd = dnnl::memory::desc(source_desc_);
if (mklnode_ptr_->parent_nodes.empty())
src_mem_from_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(pd, cpu_engine, nullptr));
src_mem_from_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(pd, cpu_engine, nullptr));
else
src_mem_from_ = parents_[0].get()->primitive_dst_mem_;
src_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(fwd_primitive_desc_->src_desc(), cpu_engine, nullptr));
net.push_back(mkldnn::reorder(*src_mem_from_, *src_mem_));
net_args.push_back({{MKLDNN_ARG_FROM, *src_mem_from_},
{MKLDNN_ARG_TO, *src_mem_}});
src_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(fwd_primitive_desc_->src_desc(), cpu_engine, nullptr));
net.push_back(dnnl::reorder(*src_mem_from_, *src_mem_));
net_args.push_back({{DNNL_ARG_FROM, *src_mem_from_},
{DNNL_ARG_TO, *src_mem_}});
} else {
if (mklnode_ptr_->parent_nodes.empty()) {
src_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(fwd_primitive_desc_->src_desc(), cpu_engine, nullptr));
src_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(fwd_primitive_desc_->src_desc(), cpu_engine, nullptr));
} else {
src_mem_ = parents_[0].get()->primitive_dst_mem_;
}
@ -157,29 +157,29 @@ class MklDnnPool : public MklDnnKernel {
if (primitive_dst_desc_ != ort_source_desc_) {
// reorder needed. Use primitive output as input to reorder and
// allocate buffer for reorder output, final output of this subgraph
primitive_dst_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(fwd_primitive_desc_.get()->dst_desc(), cpu_engine));
primitive_dst_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(fwd_primitive_desc_.get()->dst_desc(), cpu_engine));
} else {
// Last node but re-order not needed. Allocate buffer to output of this node
primitive_dst_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(fwd_primitive_desc_.get()->dst_desc(), cpu_engine, nullptr));
primitive_dst_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(fwd_primitive_desc_.get()->dst_desc(), cpu_engine, nullptr));
}
} else {
// Intermediate node. Use mkldnn kernel internal memory for output and
// Intermediate node. Use Dnnl kernel internal memory for output and
// use this as input to next node.
primitive_dst_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(fwd_primitive_desc_.get()->dst_desc(), cpu_engine));
primitive_dst_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(fwd_primitive_desc_.get()->dst_desc(), cpu_engine));
}
pool_fwd_ = onnxruntime::make_unique<mkldnn::pooling_forward>(
mkldnn::pooling_forward(*fwd_primitive_desc_));
pool_fwd_ = onnxruntime::make_unique<dnnl::pooling_forward>(
dnnl::pooling_forward(*fwd_primitive_desc_));
net.push_back(*pool_fwd_);
net_args.push_back({{MKLDNN_ARG_SRC, *src_mem_},
{MKLDNN_ARG_DST, *primitive_dst_mem_}});
net_args.push_back({{DNNL_ARG_SRC, *src_mem_},
{DNNL_ARG_DST, *primitive_dst_mem_}});
if (mklnode_ptr_->output_index >= 0) {
// one of the end nodes. Allocate output buffer memory and
// reorder is necessary
mkldnn::memory::data_type t = MklDnnType<T>();
dnnl::memory::data_type t = DnnnType<T>();
InitDstReorderOutput(cpu_engine, t, net, net_args);
}
}
@ -298,29 +298,29 @@ class MklDnnPool : public MklDnnKernel {
size_t src_size_;
size_t dst_size_;
std::shared_ptr<mkldnn::memory> src_mem_;
std::shared_ptr<dnnl::memory> src_mem_;
std::unique_ptr<mkldnn::pooling_forward::desc> fwd_desc_;
std::unique_ptr<mkldnn::memory::desc> src_md_;
std::unique_ptr<mkldnn::pooling_forward::primitive_desc> fwd_primitive_desc_;
std::unique_ptr<mkldnn::primitive> pool_fwd_;
std::unique_ptr<dnnl::pooling_forward::desc> fwd_desc_;
std::unique_ptr<dnnl::memory::desc> src_md_;
std::unique_ptr<dnnl::pooling_forward::primitive_desc> fwd_primitive_desc_;
std::unique_ptr<dnnl::primitive> pool_fwd_;
std::shared_ptr<mkldnn::memory> src_mem_from_;
std::unique_ptr<mkldnn::memory> src_mem_to_;
std::shared_ptr<dnnl::memory> src_mem_from_;
std::unique_ptr<dnnl::memory> src_mem_to_;
std::unique_ptr<mkldnn::memory> dst_mem_from_;
std::unique_ptr<mkldnn::memory> dst_mem_to_;
std::unique_ptr<dnnl::memory> dst_mem_from_;
std::unique_ptr<dnnl::memory> dst_mem_to_;
private:
mkldnn::memory::format_tag GetAVXFormat(const mkldnn::memory::dims& src_dims_mkl) {
dnnl::memory::format_tag GetAVXFormat(const dnnl::memory::dims& src_dims_mkl) {
bool is_2D = src_dims_mkl.size() == 4 ? true : false;
mkldnn::memory::format_tag fmt = mkldnn::memory::format_tag::any;
dnnl::memory::format_tag fmt = dnnl::memory::format_tag::any;
if (CPUIDInfo::GetCPUIDInfo().HasAVX512f()) {
fmt = is_2D ? mkldnn::memory::format_tag::nChw16c : mkldnn::memory::format_tag::nCdhw16c;
fmt = is_2D ? dnnl::memory::format_tag::nChw16c : dnnl::memory::format_tag::nCdhw16c;
} else if (CPUIDInfo::GetCPUIDInfo().HasAVX2() && (src_dims_mkl[1] % 8 == 0)) {
fmt = is_2D ? mkldnn::memory::format_tag::nChw8c : mkldnn::memory::format_tag::ncdhw;
fmt = is_2D ? dnnl::memory::format_tag::nChw8c : dnnl::memory::format_tag::ncdhw;
} else {
fmt = is_2D ? mkldnn::memory::format_tag::nchw : mkldnn::memory::format_tag::ncdhw;
fmt = is_2D ? dnnl::memory::format_tag::nchw : dnnl::memory::format_tag::ncdhw;
}
return fmt;
}
@ -413,5 +413,5 @@ class MklDnnPool : public MklDnnKernel {
TensorShape x_shape_;
};
} // namespace mkl_dnn
} // namespace ort_dnnl
} // namespace onnxruntime
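DnnlPool picks the pooling algorithm from the op name (pooling_max, or pooling_avg_include/exclude_padding depending on count_include_pad) and, through GetAVXFormat, prefers the blocked nChw16c / nChw8c source layouts on AVX-512 / AVX2 machines. A hedged sketch of the primitive construction with illustrative shapes and a plain nchw source:

#include "dnnl.hpp"

void average_pool_example(bool count_include_pad) {
  dnnl::engine cpu_engine(dnnl::engine::kind::cpu, 0);
  dnnl::stream stream(cpu_engine);

  dnnl::memory::dims src_dims = {1, 8, 32, 32};
  dnnl::memory::dims dst_dims = {1, 8, 16, 16};  // 2x2 kernel, stride 2, no pad
  dnnl::memory::desc src_md(src_dims, dnnl::memory::data_type::f32,
                            dnnl::memory::format_tag::nchw);
  dnnl::memory::desc dst_md(dst_dims, dnnl::memory::data_type::f32,
                            dnnl::memory::format_tag::any);

  dnnl::algorithm algo = count_include_pad
                             ? dnnl::algorithm::pooling_avg_include_padding
                             : dnnl::algorithm::pooling_avg_exclude_padding;

  dnnl::pooling_forward::desc desc(dnnl::prop_kind::forward_inference, algo,
                                   src_md, dst_md,
                                   /*strides*/ {2, 2}, /*kernel*/ {2, 2},
                                   /*padding_l*/ {0, 0}, /*padding_r*/ {0, 0});
  dnnl::pooling_forward::primitive_desc pd(desc, cpu_engine);

  dnnl::memory src_mem(src_md, cpu_engine);
  dnnl::memory dst_mem(pd.dst_desc(), cpu_engine);

  dnnl::pooling_forward(pd).execute(stream, {{DNNL_ARG_SRC, src_mem},
                                             {DNNL_ARG_DST, dst_mem}});
  stream.wait();
}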


@ -3,29 +3,29 @@
#pragma once
#include "core/framework/op_kernel.h"
#include "core/providers/mkldnn/mkldnn_fwd.h"
#include "core/providers/mkldnn/mkldnn_common.h"
#include "core/providers/mkldnn/subgraph/mkldnn_kernel.h"
#include "core/providers/dnnl/dnnl_fwd.h"
#include "core/providers/dnnl/dnnl_common.h"
#include "core/providers/dnnl/subgraph/dnnl_kernel.h"
#include "core/util/math.h"
namespace onnxruntime {
namespace mkl_dnn {
namespace ort_dnnl {
template <typename T>
class MklDnnSum : public MklDnnKernel {
class DnnlSum : public DnnlKernel {
public:
explicit MklDnnSum(const MklDnnNode& node,
MKLDNNExecutionProvider* provider,
explicit DnnlSum(const DnnlNode& node,
DNNLExecutionProvider* provider,
const NodeAttributes& attributes,
const std::string attributes_prefix = "") : MklDnnKernel(node, provider) {
const std::string attributes_prefix = "") : DnnlKernel(node, provider) {
ReadAttributes(attributes, attributes_prefix);
}
void CreatePrimitives(const OrtCustomOpApi* api,
OrtKernelContext* context,
mkldnn::engine& cpu_engine,
std::vector<mkldnn::primitive>& net,
std::vector<std::unordered_map<int, mkldnn::memory>>& net_args) override {
dnnl::engine& cpu_engine,
std::vector<dnnl::primitive>& net,
std::vector<std::unordered_map<int, dnnl::memory>>& net_args) override {
Ort::CustomOpApi ort{*api};
int num_inputs = mklnode_ptr_->num_inputs;
int input_index = mklnode_ptr_->input_start_index < 0 ? 0 : mklnode_ptr_->input_start_index;
@ -43,10 +43,10 @@ class MklDnnSum : public MklDnnKernel {
ort_source_format_ = GetSourceFormat(static_cast<int>(xdim));
x_shape = TensorShape(xshape, xdim);
mkldnn::memory::dims src_dims(
dnnl::memory::dims src_dims(
x_shape.GetDims().begin(), x_shape.GetDims().end());
ort_source_desc_ = mkldnn::memory::desc(
{src_dims}, MklDnnType<T>(), ort_source_format_);
ort_source_desc_ = dnnl::memory::desc(
{src_dims}, DnnnType<T>(), ort_source_format_);
source_desc_ = ort_source_desc_;
} else {
x_shape = parents_[0].get()->primitive_dst_shape_;
@ -56,7 +56,7 @@ class MklDnnSum : public MklDnnKernel {
}
primitive_dst_shape_ = TensorShape(x_shape);
mkldnn::memory::dims dst_dims_mkl(
dnnl::memory::dims dst_dims_mkl(
primitive_dst_shape_.GetDims().begin(), primitive_dst_shape_.GetDims().end());
for (int i = 0; i < num_inputs; i++) {
@ -69,19 +69,19 @@ class MklDnnSum : public MklDnnKernel {
ort.ReleaseTensorTypeAndShapeInfo(tensor_info);
auto xshape = tensor_shape.data();
auto xdim = tensor_shape.size();
mkldnn::memory::dims dims(xdim);
dnnl::memory::dims dims(xdim);
x_shape1 = TensorShape(xshape, xdim);
mkldnn::memory::dims src_dims(
dnnl::memory::dims src_dims(
x_shape1.GetDims().begin(), x_shape1.GetDims().end());
auto mpd = mkldnn::memory::desc({src_dims}, MklDnnType<T>(), ort_source_format_);
auto src_memory = mkldnn::memory(mpd, cpu_engine, nullptr);
auto mpd = dnnl::memory::desc({src_dims}, DnnnType<T>(), ort_source_format_);
auto src_memory = dnnl::memory(mpd, cpu_engine, nullptr);
srcs_pd_.push_back(mpd);
srcs_memory_.push_back(src_memory);
coeff.push_back(1.0);
} else {
x_shape = parents_[0].get()->primitive_dst_shape_;
auto mpd = mkldnn::memory::desc(parents_[i].get()->primitive_dst_desc_);
auto mpd = dnnl::memory::desc(parents_[i].get()->primitive_dst_desc_);
auto src_memory = *parents_[i].get()->primitive_dst_mem_;
srcs_pd_.push_back(mpd);
srcs_memory_.push_back(src_memory);
@ -89,44 +89,44 @@ class MklDnnSum : public MklDnnKernel {
}
}
primitive_dst_md_ = onnxruntime::make_unique<mkldnn::memory::desc>(
mkldnn::memory::desc({dst_dims_mkl}, MklDnnType<T>(), mkldnn::memory::format_tag::any));
sum_pd_ = onnxruntime::make_unique<mkldnn::sum::primitive_desc>(
mkldnn::sum::primitive_desc(*primitive_dst_md_, coeff, srcs_pd_, cpu_engine));
primitive_dst_md_ = onnxruntime::make_unique<dnnl::memory::desc>(
dnnl::memory::desc({dst_dims_mkl}, DnnnType<T>(), dnnl::memory::format_tag::any));
sum_pd_ = onnxruntime::make_unique<dnnl::sum::primitive_desc>(
dnnl::sum::primitive_desc(*primitive_dst_md_, coeff, srcs_pd_, cpu_engine));
if (mklnode_ptr_->output_index >= 0) {
// last node of sub-graph. need to allocate memory for output_tensor
if (primitive_dst_desc_ != ort_source_desc_) {
// reorder needed. Use primitive output as input to reorder and
// allocate buffer for reorder output, final output of this subgraph
primitive_dst_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(sum_pd_->dst_desc(), cpu_engine));
primitive_dst_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(sum_pd_->dst_desc(), cpu_engine));
} else {
// Last node but re-order not needed. Allocate buffer to output of this node
primitive_dst_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(sum_pd_->dst_desc(), cpu_engine, nullptr));
primitive_dst_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(sum_pd_->dst_desc(), cpu_engine, nullptr));
}
} else {
// Intermediate node. Use mkldnn kernel internal memory for output and
// Intermediate node. Use Dnnl kernel internal memory for output and
// use this as input to next node.
primitive_dst_mem_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(sum_pd_->dst_desc(), cpu_engine));
primitive_dst_mem_ = onnxruntime::make_unique<dnnl::memory>(
dnnl::memory(sum_pd_->dst_desc(), cpu_engine));
}
primitive_dst_desc_ = sum_pd_->dst_desc();
auto c = mkldnn::sum(*sum_pd_);
auto c = dnnl::sum(*sum_pd_);
net.push_back(c);
std::unordered_map<int, mkldnn::memory> args{
{MKLDNN_ARG_DST, *primitive_dst_mem_}};
std::unordered_map<int, dnnl::memory> args{
{DNNL_ARG_DST, *primitive_dst_mem_}};
for (int i = 0; i < (int)num_inputs; i++) {
args.insert({MKLDNN_ARG_MULTIPLE_SRC + i, srcs_memory_[i]});
args.insert({DNNL_ARG_MULTIPLE_SRC + i, srcs_memory_[i]});
}
net_args.push_back(args);
if (mklnode_ptr_->output_index >= 0) {
// One of the end nodes. Allocate output buffer memory and
// reorder if necessary.
mkldnn::memory::data_type t = MklDnnType<T>();
dnnl::memory::data_type t = DnnnType<T>();
InitDstReorderOutput(cpu_engine, t, net, net_args);
}
}
@ -162,14 +162,14 @@ class MklDnnSum : public MklDnnKernel {
}
private:
std::unique_ptr<mkldnn::memory::desc> src_md_;
std::vector<mkldnn::memory::desc> src_mds_;
std::vector<mkldnn::memory> srcs_memory_;
std::unique_ptr<dnnl::memory::desc> src_md_;
std::vector<dnnl::memory::desc> src_mds_;
std::vector<dnnl::memory> srcs_memory_;
std::vector<mkldnn::memory::desc> srcs_pd_;
std::unique_ptr<mkldnn::memory::desc> src_mpd_;
std::unique_ptr<mkldnn::memory::desc> dst_pd_;
std::unique_ptr<mkldnn::sum::primitive_desc> sum_pd_;
std::vector<dnnl::memory::desc> srcs_pd_;
std::unique_ptr<dnnl::memory::desc> src_mpd_;
std::unique_ptr<dnnl::memory::desc> dst_pd_;
std::unique_ptr<dnnl::sum::primitive_desc> sum_pd_;
};
} // namespace mkl_dnn
} // namespace ort_dnnl
} // namespace onnxruntime
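
The DnnlSum kernel above accumulates all of its inputs with a single dnnl::sum primitive, scaling each input by 1.0. A minimal standalone sketch of that primitive outside the kernel, assuming the DNNL 1.x C++ API (shapes and variable names are illustrative):

```cpp
#include <vector>
#include "dnnl.hpp"

int main() {
  dnnl::engine eng(dnnl::engine::kind::cpu, 0);
  dnnl::stream s(eng);

  // Two f32 NCHW inputs of the same (illustrative) shape.
  dnnl::memory::dims dims = {1, 8, 4, 4};
  auto src_md = dnnl::memory::desc(dims, dnnl::memory::data_type::f32,
                                   dnnl::memory::format_tag::nchw);
  auto src0 = dnnl::memory(src_md, eng);
  auto src1 = dnnl::memory(src_md, eng);

  // Coefficient 1.0 per input, as in the kernel above; let DNNL pick the
  // destination layout via format_tag::any, then query it from the
  // primitive descriptor.
  std::vector<float> scales = {1.0f, 1.0f};
  std::vector<dnnl::memory::desc> srcs = {src_md, src_md};
  auto dst_md = dnnl::memory::desc(dims, dnnl::memory::data_type::f32,
                                   dnnl::memory::format_tag::any);
  auto sum_pd = dnnl::sum::primitive_desc(dst_md, scales, srcs, eng);
  auto dst = dnnl::memory(sum_pd.dst_desc(), eng);

  dnnl::sum(sum_pd).execute(s, {{DNNL_ARG_MULTIPLE_SRC + 0, src0},
                                {DNNL_ARG_MULTIPLE_SRC + 1, src1},
                                {DNNL_ARG_DST, dst}});
  s.wait();
  return 0;
}
```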


@ -10,9 +10,9 @@
#include "core/graph/graph.h"
namespace onnxruntime {
namespace mkl_dnn {
namespace ort_dnnl {
struct MklDnnNode {
struct DnnlNode {
std::string name;
int node_index = -1;
int input_start_index = -1; // start index in inputs()
@ -63,7 +63,7 @@ struct Subgraph {
std::string graph_name;
std::string subgraph_id;
std::vector<MklDnnNode> mkldnn_nodes;
std::vector<DnnlNode> dnnl_nodes;
};
} // namespace mkl_dnn
} // namespace ort_dnnl
} // namespace onnxruntime

Просмотреть файл

@ -0,0 +1 @@
OrtSessionOptionsAppendExecutionProvider_Dnnl
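
This newly exported symbol is the C-API entry point an application uses to enable the renamed provider. A minimal usage sketch through the C++ wrapper, assuming a Linux build with --use_dnnl (the model path and logger id are illustrative):

```cpp
#include "core/providers/dnnl/dnnl_provider_factory.h"
#include "onnxruntime_cxx_api.h"

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "dnnl_example");
  Ort::SessionOptions session_options;

  // Append the DNNL execution provider; the second argument enables the
  // CPU memory arena, matching the calls elsewhere in this change.
  Ort::ThrowOnError(
      OrtSessionOptionsAppendExecutionProvider_Dnnl(session_options, 1));

  // Nodes the DNNL EP does not claim fall back to the default CPU EP.
  Ort::Session session(env, "model.onnx", session_options);
  return 0;
}
```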


@ -1,38 +0,0 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/mkldnn/mkldnn_provider_factory.h"
#include <atomic>
#include "mkldnn_execution_provider.h"
#include "core/session/abi_session_options_impl.h"
using namespace onnxruntime;
namespace onnxruntime {
struct MkldnnProviderFactory : IExecutionProviderFactory {
MkldnnProviderFactory(bool create_arena) : create_arena_(create_arena) {}
~MkldnnProviderFactory() override {}
std::unique_ptr<IExecutionProvider> CreateProvider() override;
private:
bool create_arena_;
};
std::unique_ptr<IExecutionProvider> MkldnnProviderFactory::CreateProvider() {
MKLDNNExecutionProviderInfo info;
info.create_arena = create_arena_;
return onnxruntime::make_unique<MKLDNNExecutionProvider>(info);
}
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Mkldnn(int device_id) {
return std::make_shared<onnxruntime::MkldnnProviderFactory>(device_id);
//TODO: This is apparently a bug. The constructor parameter is create-arena-flag, not the device-id
}
} // namespace onnxruntime
ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProvider_Mkldnn, _In_ OrtSessionOptions* options, int use_arena) {
options->provider_factories.push_back(onnxruntime::CreateExecutionProviderFactory_Mkldnn(use_arena));
return nullptr;
}
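
The factory deleted above is re-created under core/providers/dnnl/ as part of this rename; that file is not reproduced here, but a sketch inferred purely from the deleted code (class and info-struct names are assumed to follow the same Mkldnn-to-Dnnl renaming, and the actual file in the commit may differ) would look roughly like:

```cpp
// Hypothetical reconstruction of the renamed factory, inferred from the
// deleted mkldnn_provider_factory.cc above.
#include "core/providers/dnnl/dnnl_provider_factory.h"
#include <atomic>
#include "dnnl_execution_provider.h"
#include "core/session/abi_session_options_impl.h"

namespace onnxruntime {

struct DnnlProviderFactory : IExecutionProviderFactory {
  explicit DnnlProviderFactory(bool create_arena) : create_arena_(create_arena) {}
  ~DnnlProviderFactory() override {}

  std::unique_ptr<IExecutionProvider> CreateProvider() override {
    DNNLExecutionProviderInfo info;  // assumed rename of MKLDNNExecutionProviderInfo
    info.create_arena = create_arena_;
    return onnxruntime::make_unique<DNNLExecutionProvider>(info);
  }

 private:
  bool create_arena_;
};

std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Dnnl(int use_arena) {
  return std::make_shared<DnnlProviderFactory>(use_arena != 0);
}

}  // namespace onnxruntime

ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProvider_Dnnl,
                    _In_ OrtSessionOptions* options, int use_arena) {
  options->provider_factories.push_back(
      onnxruntime::CreateExecutionProviderFactory_Dnnl(use_arena));
  return nullptr;
}
```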


@ -1,61 +0,0 @@
// Copyright(C) 2019 Intel Corporation
// Licensed under the MIT License
#include "mkldnn_kernel.h"
namespace onnxruntime {
namespace mkl_dnn {
void MklDnnKernel::InitDstReorderOutput(mkldnn::engine& cpu_engine,
mkldnn::memory::data_type& data_type,
std::vector<mkldnn::primitive>& net,
std::vector<std::unordered_map<int, mkldnn::memory>>& net_args) {
// Allocate dst buffer if reorder is necessary
if (primitive_dst_desc_ != ort_source_desc_) {
// reorder to ONNXRuntime format
mkldnn::memory::dims dst_dims_mkl(
primitive_dst_shape_.GetDims().begin(), primitive_dst_shape_.GetDims().end());
mkldnn::memory::desc dst_des = mkldnn::memory::desc(dst_dims_mkl,
data_type, ort_source_format_);
reorder_dst_mem_to_ = onnxruntime::make_unique<mkldnn::memory>(
mkldnn::memory(dst_des, cpu_engine));
net.push_back(mkldnn::reorder(*primitive_dst_mem_, *reorder_dst_mem_to_));
net_args.push_back({{MKLDNN_ARG_FROM, *primitive_dst_mem_},
{MKLDNN_ARG_TO, *reorder_dst_mem_to_}});
}
}
mkldnn::memory::format_tag MklDnnKernel::GetSourceFormat(int dim_size) {
mkldnn::memory::format_tag source_format = mkldnn::memory::format_tag::any;
switch (dim_size) {
case 1: {
source_format = mkldnn::memory::format_tag::x;
break;
}
case 2: {
source_format = mkldnn::memory::format_tag::nc;
break;
}
case 3: {
source_format = mkldnn::memory::format_tag::ntc;
break;
}
case 4: {
source_format = mkldnn::memory::format_tag::nchw;
break;
}
case 5: {
source_format = mkldnn::memory::format_tag::ncdhw;
break;
}
default: {
source_format = mkldnn::memory::format_tag::any;
break;
}
}
return source_format;
}
} // namespace mkl_dnn
} // namespace onnxruntime
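
InitDstReorderOutput above appends a dnnl::reorder that converts a primitive's (possibly blocked) output back into the plain layout ONNX Runtime expects, using the same FROM/TO argument pattern seen in the deleted code. A standalone sketch of that reorder step, assuming the DNNL 1.x C++ API (dimensions are illustrative):

```cpp
#include "dnnl.hpp"

int main() {
  dnnl::engine eng(dnnl::engine::kind::cpu, 0);
  dnnl::stream s(eng);

  // Source in a blocked layout (as a DNNL primitive might produce),
  // destination in the plain NCHW layout ONNX Runtime expects.
  dnnl::memory::dims dims = {1, 16, 8, 8};
  auto blocked_md = dnnl::memory::desc(dims, dnnl::memory::data_type::f32,
                                       dnnl::memory::format_tag::nChw16c);
  auto nchw_md = dnnl::memory::desc(dims, dnnl::memory::data_type::f32,
                                    dnnl::memory::format_tag::nchw);
  auto src = dnnl::memory(blocked_md, eng);
  auto dst = dnnl::memory(nchw_md, eng);

  // The deleted kernel queues this primitive into `net` with
  // {DNNL_ARG_FROM, DNNL_ARG_TO} arguments; executed directly it is simply:
  dnnl::reorder(src, dst).execute(s, src, dst);
  s.wait();
  return 0;
}
```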


@ -1 +0,0 @@
OrtSessionOptionsAppendExecutionProvider_Mkldnn


@ -27,11 +27,11 @@
#define BACKEND_OPENMP ""
#endif
#if USE_MKLDNN
#define BACKEND_MKLDNN "-MKL-DNN"
#include "core/providers/mkldnn/mkldnn_execution_provider.h"
#if USE_DNNL
#define BACKEND_DNNL "-DNNL"
#include "core/providers/dnnl/dnnl_execution_provider.h"
#else
#define BACKEND_MKLDNN ""
#define BACKEND_DNNL ""
#endif
#if USE_MKLML
@ -78,7 +78,7 @@
#define BACKEND_OPENBLAS ""
#endif
#define BACKEND_DEVICE BACKEND_PROC BACKEND_MKLDNN BACKEND_MKLML BACKEND_NGRAPH BACKEND_OPENVINO BACKEND_NUPHAR BACKEND_OPENBLAS
#define BACKEND_DEVICE BACKEND_PROC BACKEND_DNNL BACKEND_MKLML BACKEND_NGRAPH BACKEND_OPENVINO BACKEND_NUPHAR BACKEND_OPENBLAS
#include "core/session/onnxruntime_cxx_api.h"
#include "core/providers/providers.h"
#include "core/providers/cpu/cpu_execution_provider.h"
@ -90,8 +90,8 @@
#ifdef USE_TENSORRT
#include "core/providers/tensorrt/tensorrt_provider_factory.h"
#endif
#ifdef USE_MKLDNN
#include "core/providers/mkldnn/mkldnn_provider_factory.h"
#ifdef USE_DNNL
#include "core/providers/dnnl/dnnl_provider_factory.h"
#endif
#ifdef USE_NGRAPH
#include "core/providers/ngraph/ngraph_provider_factory.h"
@ -111,7 +111,7 @@ namespace onnxruntime {
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_CPU(int use_arena);
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_CUDA(int device_id);
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Tensorrt(int device_id);
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Mkldnn(int use_arena);
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Dnnl(int use_arena);
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_NGraph(const char* ng_backend_type);
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_OpenVINO(const char* device);
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Nuphar(bool, const char*);
@ -256,7 +256,7 @@ inline void RegisterExecutionProvider(InferenceSession* sess, onnxruntime::IExec
// ordered by default priority. highest to lowest.
const std::vector<std::string>& GetAllProviders() {
static std::vector<std::string> all_providers = {kTensorrtExecutionProvider, kCudaExecutionProvider, kMklDnnExecutionProvider,
static std::vector<std::string> all_providers = {kTensorrtExecutionProvider, kCudaExecutionProvider, kDnnlExecutionProvider,
kNGraphExecutionProvider, kOpenVINOExecutionProvider, kNupharExecutionProvider,
kBrainSliceExecutionProvider, kCpuExecutionProvider};
return all_providers;
@ -271,8 +271,8 @@ const std::vector<std::string>& GetAvailableProviders() {
#ifdef USE_CUDA
available_providers.push_back(kCudaExecutionProvider);
#endif
#ifdef USE_MKLDNN
available_providers.push_back(kMklDnnExecutionProvider);
#ifdef USE_DNNL
available_providers.push_back(kDnnlExecutionProvider);
#endif
#ifdef USE_NGRAPH
available_providers.push_back(kNGraphExecutionProvider);
@ -305,9 +305,9 @@ void RegisterExecutionProviders(InferenceSession* sess, const std::vector<std::s
// device id??
RegisterExecutionProvider(sess, *onnxruntime::CreateExecutionProviderFactory_CUDA(0));
#endif
} else if (type == kMklDnnExecutionProvider) {
#ifdef USE_MKLDNN
RegisterExecutionProvider(sess, *onnxruntime::CreateExecutionProviderFactory_Mkldnn(sess->GetSessionOptions().enable_cpu_mem_arena));
} else if (type == kDnnlExecutionProvider) {
#ifdef USE_DNNL
RegisterExecutionProvider(sess, *onnxruntime::CreateExecutionProviderFactory_Dnnl(sess->GetSessionOptions().enable_cpu_mem_arena));
#endif
} else if (type == kNGraphExecutionProvider) {
#if USE_NGRAPH
@ -382,7 +382,7 @@ void addGlobalMethods(py::module& m) {
"get_all_opkernel_def", []() -> const std::vector<onnxruntime::KernelDef> {
std::vector<onnxruntime::KernelDef> result;
// default logger is needed to create the MklDNNExecutionProvider
// default logger is needed to create the DNNLExecutionProvider
std::string default_logger_id{"DefaultLogger"};
std::unique_ptr<onnxruntime::logging::LoggingManager> default_logging_manager =
onnxruntime::make_unique<LoggingManager>(
@ -398,8 +398,8 @@ void addGlobalMethods(py::module& m) {
#ifdef USE_CUDA
onnxruntime::CreateExecutionProviderFactory_CUDA(0),
#endif
#ifdef USE_MKLDNN
onnxruntime::CreateExecutionProviderFactory_Mkldnn(1),
#ifdef USE_DNNL
onnxruntime::CreateExecutionProviderFactory_Dnnl(1),
#endif
#ifdef USE_NGRAPH
onnxruntime::CreateExecutionProviderFactory_NGraph("CPU"),


@ -5,9 +5,9 @@
#include "environment.h"
#include "core/session/onnxruntime_cxx_api.h"
#ifdef USE_MKLDNN
#ifdef USE_DNNL
#include "core/providers/mkldnn/mkldnn_provider_factory.h"
#include "core/providers/dnnl/dnnl_provider_factory.h"
#endif
@ -67,8 +67,8 @@ ServerEnvironment::ServerEnvironment(OrtLoggingLevel severity, spdlog::sinks_ini
}
void ServerEnvironment::RegisterExecutionProviders(){
#ifdef USE_MKLDNN
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Mkldnn(options_, 1));
#ifdef USE_DNNL
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Dnnl(options_, 1));
#endif
#ifdef USE_NGRAPH


@ -4,7 +4,7 @@ Options:
-c [runs]: Specifies the number of Session::Run() to invoke simultaneously for each model.
-n [test_case_name]: Specifies a single test case to run.
-p [PLANNER_TYPE]: PLANNER_TYPE could be 'seq' or 'simple'. Default: 'simple'.
-e [EXECUTION_PROVIDER]: EXECUTION_PROVIDER could be 'cpu', 'cuda', 'mkldnn', 'tensorrt', 'ngraph', 'nuphar' or 'acl'. Default: 'cpu'.
-e [EXECUTION_PROVIDER]: EXECUTION_PROVIDER could be 'cpu', 'cuda', 'dnnl', 'tensorrt', 'ngraph', 'nuphar' or 'acl'. Default: 'cpu'.
-h: help
The debug version of this program depends on dbghelp.dll. Please make sure it's in your PATH.


@ -36,7 +36,7 @@ void usage() {
"\t-r [repeat]: Specifies the number of times to repeat\n"
"\t-v: verbose\n"
"\t-n [test_case_name]: Specifies a single test case to run.\n"
"\t-e [EXECUTION_PROVIDER]: EXECUTION_PROVIDER could be 'cpu', 'cuda', 'mkldnn', 'tensorrt', 'ngraph', "
"\t-e [EXECUTION_PROVIDER]: EXECUTION_PROVIDER could be 'cpu', 'cuda', 'dnnl', 'tensorrt', 'ngraph', "
"'openvino', 'nuphar' or 'acl'. "
"Default: 'cpu'.\n"
"\t-x: Use parallel executor, default (without -x): sequential executor.\n"
@ -92,7 +92,7 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
int repeat_count = 1;
int p_models = GetNumCpuCores();
bool enable_cuda = false;
bool enable_mkl = false;
bool enable_dnnl = false;
bool enable_ngraph = false;
bool enable_nuphar = false;
bool enable_tensorrt = false;
@ -151,8 +151,8 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
// do nothing
} else if (!CompareCString(optarg, ORT_TSTR("cuda"))) {
enable_cuda = true;
} else if (!CompareCString(optarg, ORT_TSTR("mkldnn"))) {
enable_mkl = true;
} else if (!CompareCString(optarg, ORT_TSTR("dnnl"))) {
enable_dnnl = true;
} else if (!CompareCString(optarg, ORT_TSTR("ngraph"))) {
enable_ngraph = true;
} else if (!CompareCString(optarg, ORT_TSTR("nuphar"))) {
@ -304,11 +304,11 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
return -1;
#endif
}
if (enable_mkl) {
#ifdef USE_MKLDNN
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Mkldnn(sf, enable_cpu_mem_arena ? 1 : 0));
if (enable_dnnl) {
#ifdef USE_DNNL
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Dnnl(sf, enable_cpu_mem_arena ? 1 : 0));
#else
fprintf(stderr, "MKL-DNN is not supported in this build");
fprintf(stderr, "DNNL is not supported in this build");
return -1;
#endif
}
@ -356,7 +356,11 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
static const char* cuda_flaky_tests[] = {"fp16_inception_v1", "fp16_shufflenet", "fp16_tiny_yolov2"};
static const char* dml_disabled_tests[] = {"mlperf_ssd_resnet34_1200", "mlperf_ssd_mobilenet_300", "mask_rcnn_keras", "mask_rcnn", "faster_rcnn"};
static const char* dnnl_disabled_tests[] = {"test_densenet121", "test_resnet18v2", "test_resnet34v2", "test_resnet50v2", "test_resnet101v2",
"test_resnet101v2", "test_vgg19", "tf_inception_resnet_v2", "tf_inception_v1", "tf_inception_v3", "tf_inception_v4", "tf_mobilenet_v1_1.0_224",
"tf_mobilenet_v2_1.0_224", "tf_mobilenet_v2_1.4_224", "tf_nasnet_large", "tf_pnasnet_large", "tf_resnet_v1_50", "tf_resnet_v1_101", "tf_resnet_v1_101",
"tf_resnet_v2_101", "tf_resnet_v2_152"};
std::unordered_set<std::string> all_disabled_tests;
if (enable_cuda) {
all_disabled_tests.insert(std::begin(cuda_flaky_tests), std::end(cuda_flaky_tests));
@ -364,6 +368,11 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
if (enable_dml) {
all_disabled_tests.insert(std::begin(dml_disabled_tests), std::end(dml_disabled_tests));
}
if (enable_dnnl) {
// These models run, but their tests are disabled to keep memory utilization low
// This will be removed after LRU implementation
all_disabled_tests.insert(std::begin(dnnl_disabled_tests), std::end(dnnl_disabled_tests));
}
#if defined(__amd64__) || defined(_M_AMD64)
#else
//out of memory
@ -465,14 +474,18 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
broken_tests.insert({"candy", "Results mismatch: 2 of 150528"});
#endif
#ifdef USE_MKLDNN
#ifdef USE_DNNL
broken_tests.insert({"tf_mobilenet_v2_1.0_224", "result mismatch"});
broken_tests.insert({"tf_mobilenet_v2_1.4_224", "result mismatch"});
broken_tests.insert({"tf_mobilenet_v1_1.0_224", "result mismatch"});
broken_tests.insert({"mobilenetv2-1.0", "result mismatch"});
broken_tests.insert({"candy", "result mismatch"});
broken_tests.insert({"range_float_type_positive_delta_expanded", "get unknown exception from MKLDNN EP"});
broken_tests.insert({"range_int32_type_negative_delta_expanded", "get unknown exception from MKLDNN EP"});
broken_tests.insert({"range_float_type_positive_delta_expanded", "get unknown exception from DNNL EP"});
broken_tests.insert({"range_int32_type_negative_delta_expanded", "get unknown exception from DNNL EP"});
broken_tests.insert({"averagepool_2d_ceil", "maxpool ceiling not supported"});
broken_tests.insert({"maxpool_2d_ceil", "maxpool ceiling not supported"});
broken_tests.insert({"maxpool_2d_dilations", "maxpool dilations not supported"});
broken_tests.insert({"mlperf_ssd_resnet34_1200", "test pass on dev box but fails on CI build"});
#endif
#ifdef USE_OPENVINO


@ -14,7 +14,7 @@ Options:
-c: [parallel runs]: Specifies the (max) number of runs to invoke simultaneously. Default:1.
-e: [cpu|cuda|mkldnn|tensorrt|ngraph|openvino|nuphar|acl]: Specifies the execution provider 'cpu','cuda','mkldnn','tensorrt', 'ngraph', 'openvino', 'nuphar' or 'acl'. Default is 'cpu'.
-e: [cpu|cuda|dnnl|tensorrt|ngraph|openvino|nuphar|acl]: Specifies the execution provider 'cpu','cuda','dnnl','tensorrt', 'ngraph', 'openvino', 'nuphar' or 'acl'. Default is 'cpu'.
-m: [test_mode]: Specifies the test mode. Value could be 'duration' or 'times'. Provide 'duration' to run the test for a fixed duration, and 'times' to repeat it a certain number of times. Default:'duration'.


@ -32,7 +32,7 @@ namespace perftest {
"\t-M: Disable memory pattern.\n"
"\t-A: Disable memory arena\n"
"\t-c [parallel runs]: Specifies the (max) number of runs to invoke simultaneously. Default:1.\n"
"\t-e [cpu|cuda|mkldnn|tensorrt|ngraph|openvino|nuphar|dml|acl]: Specifies the provider 'cpu','cuda','mkldnn','tensorrt', "
"\t-e [cpu|cuda|dnnl|tensorrt|ngraph|openvino|nuphar|dml|acl]: Specifies the provider 'cpu','cuda','dnnl','tensorrt', "
"'ngraph', 'openvino', 'nuphar', 'dml' or 'acl'. "
"Default:'cpu'.\n"
"\t-b [tf|ort]: backend to use. Default:ort\n"
@ -41,7 +41,7 @@ namespace perftest {
"\t-p [profile_file]: Specifies the profile name to enable profiling and dump the profile data to the file.\n"
"\t-s: Show statistics result, like P75, P90.\n"
"\t-v: Show verbose information.\n"
"\t-x [intra_op_num_threads]: Sets the number of threads used to parallelize the execution within nodes, A value of 0 means ORT will pick a default. Must >=0. If OpenMP is enabled, this configuration will be ignored.\n"
"\t-x [intra_op_num_threads]: Sets the number of threads used to parallelize the execution within nodes, A value of 0 means ORT will pick a default. Must >=0.\n"
"\t-y [inter_op_num_threads]: Sets the number of threads used to parallelize the execution of the graph (across nodes), A value of 0 means ORT will pick a default. Must >=0.\n"
"\t-P: Use parallel executor instead of sequential executor.\n"
"\t-o [optimization level]: Default is 1. Valid values are 0 (disable), 1 (basic), 2 (extended), 99 (all).\n"
@ -80,8 +80,8 @@ namespace perftest {
test_config.machine_config.provider_type_name = onnxruntime::kCpuExecutionProvider;
} else if (!CompareCString(optarg, ORT_TSTR("cuda"))) {
test_config.machine_config.provider_type_name = onnxruntime::kCudaExecutionProvider;
} else if (!CompareCString(optarg, ORT_TSTR("mkldnn"))) {
test_config.machine_config.provider_type_name = onnxruntime::kMklDnnExecutionProvider;
} else if (!CompareCString(optarg, ORT_TSTR("dnnl"))) {
test_config.machine_config.provider_type_name = onnxruntime::kDnnlExecutionProvider;
} else if (!CompareCString(optarg, ORT_TSTR("ngraph"))) {
test_config.machine_config.provider_type_name = onnxruntime::kNGraphExecutionProvider;
} else if (!CompareCString(optarg, ORT_TSTR("brainslice"))) {

Просмотреть файл

@ -30,11 +30,11 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
: rand_engine_(rd()), input_names_(m->GetInputCount()), input_length_(m->GetInputCount()) {
Ort::SessionOptions session_options;
const std::string& provider_name = performance_test_config.machine_config.provider_type_name;
if (provider_name == onnxruntime::kMklDnnExecutionProvider) {
#ifdef USE_MKLDNN
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Mkldnn(session_options, performance_test_config.run_config.enable_cpu_mem_arena ? 1 : 0));
if (provider_name == onnxruntime::kDnnlExecutionProvider) {
#ifdef USE_DNNL
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Dnnl(session_options, performance_test_config.run_config.enable_cpu_mem_arena ? 1 : 0));
#else
ORT_THROW("MKL-DNN is not supported in this build\n");
ORT_THROW("DNNL is not supported in this build\n");
#endif
} else if (provider_name == onnxruntime::kNGraphExecutionProvider) {
#ifdef USE_NGRAPH


@ -216,7 +216,7 @@ bool PerformanceRunner::Initialize() {
test_case_.reset(CreateOnnxTestCase(narrow_model_name, test_model_info_, 0.0, 0.0));
// TODO: Place input tensor on cpu memory if mkldnn provider type to avoid CopyTensor logic in CopyInputAcrossDevices
// TODO: Place input tensor on cpu memory if dnnl provider type to avoid CopyTensor logic in CopyInputAcrossDevices
size_t test_data_count = test_case_->GetDataCount();
if (test_data_count == 0) {
std::cout << "there is no test data for model " << test_case_->GetTestCaseName() << std::endl;


@ -235,7 +235,7 @@ TEST(GemmOpTest, GemmEmptyTensor) {
test.AddInput<float>("C", {3}, std::vector<float>(3, 1.0f));
test.AddOutput<float>("Y", {0, 3},
{});
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kMklDnnExecutionProvider}); //TensorRT: doesn't support dynamic shape yet
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kDnnlExecutionProvider}); //TensorRT: doesn't support dynamic shape yet
}
TEST(GemmOpTest, GemmNoBiasOpset11) {


@ -162,7 +162,7 @@ static void MaxPool_8_WithIndexTest(bool has_index, int64_t storage_order = 0) {
storage_order == 0 ? test.AddOutput<int64_t>("Indices", expected_dims, expected_indices_row)
: test.AddOutput<int64_t>("Indices", expected_dims, expected_indices_col);
}
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kMklDnnExecutionProvider, kTensorrtExecutionProvider, kAclExecutionProvider});
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kDnnlExecutionProvider, kTensorrtExecutionProvider, kAclExecutionProvider});
}
TEST(PoolTest, MaxPool_8_With_Index) {


@ -95,7 +95,7 @@ TEST(GatherOpTest, Gather_invalid_index_gpu) {
0.0f, 0.0f, 0.0f, 0.0f});
//On GPU, just set the value to 0 instead of reporting an error. Exclude all other providers.
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCpuExecutionProvider, kMklDnnExecutionProvider, kNupharExecutionProvider, kTensorrtExecutionProvider});
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCpuExecutionProvider, kDnnlExecutionProvider, kNupharExecutionProvider, kTensorrtExecutionProvider});
}
#endif


@ -460,7 +460,7 @@ void OpTester::ExecuteModel(Model& model, InferenceSession& session_object, Expe
} else {
if (expect_result == ExpectResult::kExpectFailure) {
// Disable expected_failure_string checks for MKL-DNN and nGraph EP's
if (provider_type != kMklDnnExecutionProvider && provider_type != kNGraphExecutionProvider) {
if (provider_type != kDnnlExecutionProvider && provider_type != kNGraphExecutionProvider) {
EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr(expected_failure_string));
}
} else {
@ -569,7 +569,7 @@ void OpTester::Run(SessionOptions so, // Take the SessionOptions by value (i.e.
static const std::string all_provider_types[] = {
kCpuExecutionProvider,
kCudaExecutionProvider,
kMklDnnExecutionProvider,
kDnnlExecutionProvider,
kNGraphExecutionProvider,
kNupharExecutionProvider,
kBrainSliceExecutionProvider,
@ -622,8 +622,8 @@ void OpTester::Run(SessionOptions so, // Take the SessionOptions by value (i.e.
execution_provider = DefaultCpuExecutionProvider();
else if (provider_type == onnxruntime::kCudaExecutionProvider)
execution_provider = DefaultCudaExecutionProvider();
else if (provider_type == onnxruntime::kMklDnnExecutionProvider)
execution_provider = DefaultMkldnnExecutionProvider();
else if (provider_type == onnxruntime::kDnnlExecutionProvider)
execution_provider = DefaultDnnlExecutionProvider();
else if (provider_type == onnxruntime::kNGraphExecutionProvider)
execution_provider = DefaultNGraphExecutionProvider();
else if (provider_type == onnxruntime::kNupharExecutionProvider)


@ -139,9 +139,12 @@ def create_backend_test(testname=None):
'test_edge_pad_cpu',
'test_reflect_pad_cpu']
if c2.supports_device('MKL-DNN'):
if c2.supports_device('DNNL'):
current_failing_tests += ['^test_range_float_type_positive_delta_expanded_cpu',
'^test_range_int32_type_negative_delta_expanded_cpu']
'^test_range_int32_type_negative_delta_expanded_cpu',
'^test_averagepool_2d_ceil_cpu',
'^test_maxpool_2d_ceil_cpu',
'^test_maxpool_2d_dilations_cpu']
if c2.supports_device('OPENVINO_GPU_FP32') or c2.supports_device('OPENVINO_GPU_FP16'):
current_failing_tests.append('^test_div_cpu*')


@ -75,9 +75,9 @@ void TestInference(Ort::Env& env, T model_uri,
return;
#endif
} else if (provider_type == 2) {
#ifdef USE_MKLDNN
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Mkldnn(session_options, 1));
std::cout << "Running simple inference with mkldnn provider" << std::endl;
#ifdef USE_DNNL
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Dnnl(session_options, 1));
std::cout << "Running simple inference with dnnl provider" << std::endl;
#else
return;
#endif


@ -10,7 +10,7 @@ namespace onnxruntime {
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_CPU(int use_arena);
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_CUDA(int device_id);
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Mkldnn(int use_arena);
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Dnnl(int use_arena);
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_NGraph(const char* ng_backend_type);
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Nuphar(bool, const char*);
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_BrainSlice(uint32_t ip, int, int, bool, const char*, const char*, const char*);
@ -49,9 +49,9 @@ std::unique_ptr<IExecutionProvider> DefaultCudaExecutionProvider() {
#endif
}
std::unique_ptr<IExecutionProvider> DefaultMkldnnExecutionProvider(bool enable_arena) {
#ifdef USE_MKLDNN
return CreateExecutionProviderFactory_Mkldnn(enable_arena ? 1 : 0)->CreateProvider();
std::unique_ptr<IExecutionProvider> DefaultDnnlExecutionProvider(bool enable_arena) {
#ifdef USE_DNNL
return CreateExecutionProviderFactory_Dnnl(enable_arena ? 1 : 0)->CreateProvider();
#else
ORT_UNUSED_PARAMETER(enable_arena);
return nullptr;


@ -9,7 +9,7 @@ namespace test {
// unique_ptr providers with default values for session registration
std::unique_ptr<IExecutionProvider> DefaultCpuExecutionProvider(bool enable_arena = true);
std::unique_ptr<IExecutionProvider> DefaultCudaExecutionProvider();
std::unique_ptr<IExecutionProvider> DefaultMkldnnExecutionProvider(bool enable_arena = true);
std::unique_ptr<IExecutionProvider> DefaultDnnlExecutionProvider(bool enable_arena = true);
std::unique_ptr<IExecutionProvider> DefaultNGraphExecutionProvider();
std::unique_ptr<IExecutionProvider> DefaultNupharExecutionProvider(bool allow_unaligned_buffers = true);
std::unique_ptr<IExecutionProvider> DefaultBrainSliceExecutionProvider();


@ -7,8 +7,8 @@
#ifdef USE_CUDA
#include "core/providers/cuda/cuda_provider_factory.h"
#endif
#ifdef USE_MKLDNN
#include "core/providers/mkldnn/mkldnn_provider_factory.h"
#ifdef USE_DNNL
#include "core/providers/dnnl/dnnl_provider_factory.h"
#endif
#ifdef USE_NGRAPH
#include "core/providers/ngraph/ngraph_provider_factory.h"


@ -16,7 +16,7 @@ endif()
option(onnxruntime_USE_CUDA "Build with CUDA support" OFF)
option(onnxruntime_USE_OPENVINO "Build with OpenVINO support" OFF)
option(onnxruntime_USE_NNAPI "Build with DNNLibrary for Android NNAPI support" OFF)
option(onnxruntime_USE_MKLDNN "Build with MKL-DNN support" OFF)
option(onnxruntime_USE_DNNL "Build with DNNL support" OFF)
option(onnxruntime_USE_NGRAPH "Build with nGraph support" OFF)
option(onnxruntime_USE_NUPHAR "Build with Nuphar" OFF)
option(onnxruntime_USE_BRAINSLICE "Build with BrainSlice" OFF)
@ -64,8 +64,8 @@ endif()
if(onnxruntime_USE_NNAPI)
add_definitions(-DUSE_NNAPI)
endif()
if(onnxruntime_USE_MKLDNN)
add_definitions(-DUSE_MKLDNN)
if(onnxruntime_USE_DNNL)
add_definitions(-DUSE_DNNL)
endif()
if(onnxruntime_USE_NGRAPH)
add_definitions(-DUSE_NGRAPH)


@ -7,8 +7,8 @@
#ifdef USE_CUDA
#include "onnxruntime/core/providers/cuda/cuda_provider_factory.h"
#endif
#ifdef USE_MKLDNN
#include "onnxruntime/core/providers/mkldnn/mkldnn_provider_factory.h"
#ifdef USE_DNNL
#include "onnxruntime/core/providers/dnnl/dnnl_provider_factory.h"
#endif
#ifdef USE_NGRAPH
#include "onnxruntime/core/providers/ngraph/ngraph_provider_factory.h"


@ -117,9 +117,9 @@ except ImportError as error:
# Additional binaries
if platform.system() == 'Linux':
libs = ['onnxruntime_pybind11_state.so', 'libmkldnn.so.1', 'libmklml_intel.so', 'libiomp5.so', 'mimalloc.so']
libs = ['onnxruntime_pybind11_state.so', 'libdnnl.so.1', 'libmklml_intel.so', 'libiomp5.so', 'mimalloc.so']
# nGraph Libs
libs.extend(['libngraph.so', 'libcodegen.so', 'libcpu_backend.so', 'libmkldnn.so', 'libtbb_debug.so', 'libtbb_debug.so.2', 'libtbb.so', 'libtbb.so.2'])
libs.extend(['libngraph.so', 'libcodegen.so', 'libcpu_backend.so', 'libdnnl.so', 'libtbb_debug.so', 'libtbb_debug.so.2', 'libtbb.so', 'libtbb.so.2'])
# Nuphar Libs
libs.extend(['libtvm.so.0.5.1'])
# Openvino Libs
@ -127,11 +127,11 @@ if platform.system() == 'Linux':
if nightly_build:
libs.extend(['libonnxruntime_pywrapper.so'])
elif platform.system() == "Darwin":
libs = ['onnxruntime_pybind11_state.so', 'libmkldnn.1.dylib', 'mimalloc.so'] # TODO add libmklml and libiomp5 later.
libs = ['onnxruntime_pybind11_state.so', 'libdnnl.1.dylib', 'mimalloc.so'] # TODO add libmklml and libiomp5 later.
if nightly_build:
libs.extend(['libonnxruntime_pywrapper.dylib'])
else:
libs = ['onnxruntime_pybind11_state.pyd', 'mkldnn.dll', 'mklml.dll', 'libiomp5md.dll']
libs = ['onnxruntime_pybind11_state.pyd', 'dnnl.dll', 'mklml.dll', 'libiomp5md.dll']
libs.extend(['ngraph.dll', 'cpu_backend.dll', 'tbb.dll', 'mimalloc-override.dll', 'mimalloc-redirect.dll', 'mimalloc-redirect32.dll'])
# Nuphar Libs
libs.extend(['tvm.dll'])


@ -124,7 +124,7 @@ Use the individual flags to only run the specified stages.
parser.add_argument("--use_jemalloc", action='store_true', help="Use jemalloc.")
parser.add_argument("--use_mimalloc", action='store_true', help="Use mimalloc.")
parser.add_argument("--use_openblas", action='store_true', help="Build with OpenBLAS.")
parser.add_argument("--use_mkldnn", action='store_true', help="Build with MKLDNN.")
parser.add_argument("--use_dnnl", action='store_true', help="Build with DNNL.")
parser.add_argument("--use_mklml", action='store_true', help="Build with MKLML.")
parser.add_argument("--use_gemmlowp", action='store_true', help="Build with gemmlowp for quantized gemm.")
parser.add_argument("--use_automl", action='store_true', help="Build with AutoML support.")
@ -295,7 +295,7 @@ def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home, cudnn_home
"-Donnxruntime_BUILD_SHARED_LIB=" + ("ON" if args.build_shared_lib or args.build_server else "OFF"),
"-Donnxruntime_USE_EIGEN_FOR_BLAS=" + ("OFF" if args.use_openblas else "ON"),
"-Donnxruntime_USE_OPENBLAS=" + ("ON" if args.use_openblas else "OFF"),
"-Donnxruntime_USE_MKLDNN=" + ("ON" if args.use_mkldnn else "OFF"),
"-Donnxruntime_USE_DNNL=" + ("ON" if args.use_dnnl else "OFF"),
"-Donnxruntime_USE_MKLML=" + ("ON" if args.use_mklml else "OFF"),
"-Donnxruntime_USE_GEMMLOWP=" + ("ON" if args.use_gemmlowp else "OFF"),
"-Donnxruntime_USE_NGRAPH=" + ("ON" if args.use_ngraph else "OFF"),
@ -620,8 +620,8 @@ def run_onnx_tests(build_dir, configs, onnx_test_data_dir, provider, enable_mult
run_subprocess([exe,'-x'] + cmd, cwd=cwd)
# mkldnn temporary function for running onnx tests and model tests separately.
def mkldnn_run_onnx_tests(build_dir, configs, onnx_test_data_dir):
# dnnl temporary function for running onnx tests and model tests separately.
def dnnl_run_onnx_tests(build_dir, configs, onnx_test_data_dir):
for config in configs:
cwd = get_config_build_dir(build_dir, config)
if is_windows():
@ -630,7 +630,7 @@ def mkldnn_run_onnx_tests(build_dir, configs, onnx_test_data_dir):
else:
exe = os.path.join(cwd, 'onnx_test_runner')
model_dir = os.path.join(build_dir, "models")
cmd_base = ['-e', 'mkldnn', '-c', '1', '-j', '1']
cmd_base = ['-e', 'dnnl', '-c', '1', '-j', '1']
if os.path.exists(onnx_test_data_dir):
onnxdata_cmd = cmd_base + [onnx_test_data_dir]
# /data/onnx
@ -818,7 +818,7 @@ def generate_documentation(source_dir, build_dir, configs):
print('git diff returned non-zero error code')
if len(docdiff) > 0:
# Show a warning instead of throwing an exception, because it is dependent on build configuration for including execution providers
log.warning('The updated opkernel document file '+str(opkernel_doc_path)+' is different from the checked in version. Consider regenerating the file with CPU, MKLDNN and CUDA providers enabled.')
log.warning('The updated opkernel document file '+str(opkernel_doc_path)+' is different from the checked in version. Consider regenerating the file with CPU, DNNL and CUDA providers enabled.')
log.debug('diff:\n'+str(docdiff))
docdiff = ''
@ -973,9 +973,9 @@ def main():
if args.use_dml:
run_onnx_tests(build_dir, configs, onnx_test_data_dir, 'dml', args.enable_multi_device_test, False, 1)
#It could run out of memory because of memory leak
#if args.use_mkldnn:
# mkldnn_run_onnx_tests(build_dir, configs, onnx_test_data_dir)
# Run the DNNL ONNX tests; some models are disabled to keep memory utilization under control
if args.use_dnnl:
dnnl_run_onnx_tests(build_dir, configs, onnx_test_data_dir)
run_onnx_tests(build_dir, configs, onnx_test_data_dir, None, args.enable_multi_device_test, False,
1 if args.x86 or platform.system() == 'Darwin' else 0,


@ -3,7 +3,7 @@ jobs:
parameters:
AgentPool : 'Linux-CPU'
JobName: 'Linux_CI_Dev'
BuildCommand: 'tools/ci_build/github/linux/run_dockerbuild.sh -o ubuntu16.04 -d cpu -r $(Build.BinariesDirectory) -x "--use_mklml --use_llvm --use_nuphar --use_mkldnn --use_tvm --use_automl --build_wheel --enable_language_interop_ops"'
BuildCommand: 'tools/ci_build/github/linux/run_dockerbuild.sh -o ubuntu16.04 -d cpu -r $(Build.BinariesDirectory) -x "--use_mklml --use_llvm --use_nuphar --use_dnnl --use_tvm --use_automl --build_wheel --enable_language_interop_ops"'
DoNugetPack: 'false'
ArtifactName: 'drop-linux'
TimeoutInMinutes: 120


@ -30,7 +30,7 @@ jobs:
displayName: 'Generate cmake config'
inputs:
scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
arguments: '--config $(BuildConfig) --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 16 2019" --build_wheel --use_automl --use_mkldnn --use_openmp --build_shared_lib --enable_onnx_tests'
arguments: '--config $(BuildConfig) --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 16 2019" --build_wheel --use_automl --use_dnnl --use_openmp --build_shared_lib --enable_onnx_tests'
workingDirectory: '$(Build.BinariesDirectory)'
- task: VSBuild@1
@ -112,7 +112,7 @@ jobs:
set /p WHEEL_FILENAME=<wheel_filename_file
del wheel_filename_file
python.exe -m pip install -q --upgrade %WHEEL_FILENAME%
python $(Build.SourcesDirectory)\tools\ci_build\build.py --config $(BuildConfig) --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 16 2019" --use_mkldnn --build_wheel --enable_onnx_tests
python $(Build.SourcesDirectory)\tools\ci_build\build.py --config $(BuildConfig) --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 16 2019" --use_dnnl --build_wheel --enable_onnx_tests
workingDirectory: '$(Build.BinariesDirectory)\$(BuildConfig)\$(BuildConfig)'
displayName: 'Run tests'
@ -123,4 +123,4 @@ jobs:
condition: succeeded()
- template: templates/clean-agent-build-directory-step.yml


@ -5,7 +5,7 @@ jobs:
AgentDemands: 'Has19H1WinSDK'
DoDebugBuild: 'true'
DoCompliance: 'false'
BuildCommand: '$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --enable_pybind --use_mkldnn --use_dml --build_shared_lib --build_csharp --enable_onnx_tests --use_cuda --cuda_version=10.0 --cuda_home="C:\local\cuda_10.0.130_win10" --cudnn_home="C:\local\cudnn-10.0-windows10-x64-v7.3.1.20\cuda" --gen_doc'
BuildCommand: '$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --enable_pybind --use_dnnl --use_dml --build_shared_lib --build_csharp --enable_onnx_tests --use_cuda --cuda_version=10.0 --cuda_home="C:\local\cuda_10.0.130_win10" --cudnn_home="C:\local\cudnn-10.0-windows10-x64-v7.3.1.20\cuda" --gen_doc'
JobName: 'Windows_CI_GPU_Dev'
DoNugetPack: 'false'
NuPackScript : ''


@ -4,7 +4,7 @@ jobs:
AgentPool : 'Win-GPU-CUDA10'
DoDebugBuild: 'true'
DoCompliance: 'false'
BuildCommand: '$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --enable_pybind --use_mkldnn --use_tensorrt --tensorrt_home="C:\local\TensorRT-6.0.1.5" --build_shared_lib --build_csharp --enable_onnx_tests --use_cuda --cuda_version=10.0 --cuda_home="C:\local\cuda_10.0.130_win10_trt6015dll" --cudnn_home="C:\local\cudnn-10.0-windows10-x64-v7.3.1.20\cuda" --gen_doc'
BuildCommand: '$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --enable_pybind --use_dnnl --use_tensorrt --tensorrt_home="C:\local\TensorRT-6.0.1.5" --build_shared_lib --build_csharp --enable_onnx_tests --use_cuda --cuda_version=10.0 --cuda_home="C:\local\cuda_10.0.130_win10_trt6015dll" --cudnn_home="C:\local\cudnn-10.0-windows10-x64-v7.3.1.20\cuda" --gen_doc'
JobName: 'Windows_CI_GPU_Dev'
DoNugetPack: 'false'
NuPackScript : ''