Fix typos according to reviewdog report. (#21335)
### Description Fix typos based on reviewdog report but with some exceptions/corrections.
This commit is contained in:
Родитель
4e75605eec
Коммит
5b9369e93c
|
@ -1,4 +1,4 @@
|
|||
# This sets the default behaviour, overriding core.autocrlf
|
||||
# This sets the default behavior, overriding core.autocrlf
|
||||
* text=auto
|
||||
|
||||
# All source files should have unix line-endings in the repository,
|
||||
|
|
|
@ -4820,7 +4820,7 @@ SOFTWARE.
|
|||
|
||||
----------------------------------------------------------------------------
|
||||
|
||||
This is the MIT/Expat Licence. For more information see:
|
||||
This is the MIT/Expat License. For more information see:
|
||||
|
||||
1. http://www.opensource.org/licenses/mit-license.php
|
||||
|
||||
|
|
|
@ -150,7 +150,7 @@ endif()
|
|||
|
||||
if(CMAKE_SYSTEM_NAME STREQUAL "Android" AND onnxruntime_MINIMAL_BUILD)
|
||||
# target onnxruntime is a shared library, the dummy __cxa_demangle is only attached to it to avoid
|
||||
# affecting downstream ort library users with the behaviour of dummy __cxa_demangle. So the dummy
|
||||
# affecting downstream ort library users with the behavior of dummy __cxa_demangle. So the dummy
|
||||
# __cxa_demangle must not be exposed to libonnxruntime_common.a. It works as when the linker is
|
||||
# creating the DSO, our dummy __cxa_demangle always comes before libc++abi.a so the
|
||||
# __cxa_demangle in libc++abi.a is discarded, thus, huge binary size reduction.
|
||||
|
|
|
@ -44,7 +44,7 @@ index c23746e7f..bc326c8b5 100644
|
|||
find_package(HIP REQUIRED)
|
||||
# Override HIP version in config.h, if necessary.
|
||||
@@ -269,12 +248,6 @@ if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH )
|
||||
message(STATUS "CK_HIP_VERSION_PATCH overriden with ${CK_OVERRIDE_HIP_VERSION_PATCH}")
|
||||
message(STATUS "CK_HIP_VERSION_PATCH overridden with ${CK_OVERRIDE_HIP_VERSION_PATCH}")
|
||||
endif()
|
||||
message(STATUS "Build with HIP ${HIP_VERSION}")
|
||||
-link_libraries(hip::device)
|
||||
|
|
|
@ -39,7 +39,7 @@ Event {{name.0.value}}
|
|||
Operator {{name.0.value}}
|
||||
{{/inOperator}}
|
||||
{{#inEii}}
|
||||
Explict Interface Implementation {{name.0.value}}
|
||||
Explicit Interface Implementation {{name.0.value}}
|
||||
{{/inEii}}
|
||||
{{#inVariable}}
|
||||
Variable {{name.0.value}}
|
||||
|
|
|
@ -32,7 +32,7 @@
|
|||
docker run -it onnxruntime-source
|
||||
```
|
||||
|
||||
The docker file supports both x86_64 and ARM64(aarch64). You may use docker's "--platform" parameter to explictly specify which CPU architecture you want to build. For example:
|
||||
The docker file supports both x86_64 and ARM64(aarch64). You may use docker's "--platform" parameter to explicitly specify which CPU architecture you want to build. For example:
|
||||
|
||||
```bash
|
||||
docker build --platform linux/arm64/v8 -f Dockerfile.source
|
||||
|
@ -274,7 +274,7 @@ Note: You may add --use_tensorrt and --tensorrt_home options if you wish to use
|
|||
Note: Resulting Docker image will have ONNX Runtime installed in /usr, and ONNX Runtime wheel copied to /onnxruntime directory.
|
||||
Nothing else from ONNX Runtime source tree will be copied/installed to the image.
|
||||
|
||||
Note: When running the container you built in Docker, please either use 'nvidia-docker' command instead of 'docker', or use Docker command-line options to make sure NVIDIA runtime will be used and appropiate files mounted from host. Otherwise, CUDA libraries won't be found. You can also [set NVIDIA runtime as default in Docker](https://github.com/dusty-nv/jetson-containers#docker-default-runtime).
|
||||
Note: When running the container you built in Docker, please either use 'nvidia-docker' command instead of 'docker', or use Docker command-line options to make sure NVIDIA runtime will be used and appropriate files mounted from host. Otherwise, CUDA libraries won't be found. You can also [set NVIDIA runtime as default in Docker](https://github.com/dusty-nv/jetson-containers#docker-default-runtime).
|
||||
|
||||
## MIGraphX
|
||||
**Ubuntu 20.04, ROCm6.0, MIGraphX**
|
||||
|
|
|
@ -64,7 +64,7 @@
|
|||
"If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, please follow the [Azure ML configuration notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) to set up your environment.\n",
|
||||
"\n",
|
||||
"### Install additional packages needed for this Notebook\n",
|
||||
"You need to install the popular plotting library matplotlib, the image manipulation library opencv, and the onnx library in the conda environment where Azure Maching Learning SDK is installed.\n",
|
||||
"You need to install the popular plotting library matplotlib, the image manipulation library opencv, and the onnx library in the conda environment where Azure Machine Learning SDK is installed.\n",
|
||||
"\n",
|
||||
"```\n",
|
||||
"(myenv) $ pip install matplotlib onnx opencv-python\n",
|
||||
|
@ -79,7 +79,7 @@
|
|||
"source": [
|
||||
"## 1. Obtain a model from the ONNX Model Zoo\n",
|
||||
"\n",
|
||||
"For more information on the Facial Emotion Recognition (FER+) model, you can explore the notebook explaning how to deploy [FER+ with ONNX Runtime on an ACI Instance](onnx-inference-facial-expression-recognition-deploy.ipynb)."
|
||||
"For more information on the Facial Emotion Recognition (FER+) model, you can explore the notebook explaining how to deploy [FER+ with ONNX Runtime on an ACI Instance](onnx-inference-facial-expression-recognition-deploy.ipynb)."
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
|
@ -1129,7 +1129,7 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter
|
|||
//
|
||||
// Ensure that the ThreadPoolParallelSection has sufficient workers to
|
||||
// execute a loop with degree of parallelism n. We track the number
|
||||
// of workers already avaiable to the parallel section, prior to
|
||||
// of workers already available to the parallel section, prior to
|
||||
// submitting tasks to the work queues to make up the total.
|
||||
//
|
||||
// Each worker will call in to worker_fn(idx) with a per-worker thread
|
||||
|
|
|
@ -53,7 +53,8 @@ struct CudaContext : public CustomOpContext {
|
|||
cudnn_conv_use_max_workspace = FetchResource<bool>(kernel_ctx, CudaResource::cudnn_conv_use_max_workspace_t);
|
||||
|
||||
cudnn_conv1d_pad_to_nc1d = FetchResource<bool>(kernel_ctx, CudaResource::cudnn_conv1d_pad_to_nc1d_t);
|
||||
enable_skip_layer_norm_strict_mode = FetchResource<bool>(kernel_ctx, CudaResource::enable_skip_layer_norm_strict_mode_t);
|
||||
enable_skip_layer_norm_strict_mode = FetchResource<bool>(
|
||||
kernel_ctx, CudaResource::enable_skip_layer_norm_strict_mode_t);
|
||||
prefer_nhwc = FetchResource<bool>(kernel_ctx, CudaResource::prefer_nhwc_t);
|
||||
use_tf32 = FetchResource<bool>(kernel_ctx, CudaResource::use_tf32_t);
|
||||
}
|
||||
|
@ -61,13 +62,16 @@ struct CudaContext : public CustomOpContext {
|
|||
template <typename T>
|
||||
T FetchResource(const OrtKernelContext& kernel_ctx, CudaResource resource_type) {
|
||||
if constexpr (sizeof(T) > sizeof(void*)) {
|
||||
ORT_CXX_API_THROW("void* is not large enough to hold resource type: " + std::to_string(resource_type), OrtErrorCode::ORT_INVALID_ARGUMENT);
|
||||
ORT_CXX_API_THROW("void* is not large enough to hold resource type: " + std::to_string(resource_type),
|
||||
OrtErrorCode::ORT_INVALID_ARGUMENT);
|
||||
}
|
||||
const auto& ort_api = Ort::GetApi();
|
||||
void* resource = {};
|
||||
OrtStatus* status = ort_api.KernelContext_GetResource(&kernel_ctx, ORT_CUDA_RESOUCE_VERSION, resource_type, &resource);
|
||||
OrtStatus* status = ort_api.KernelContext_GetResource(
|
||||
&kernel_ctx, ORT_CUDA_RESOURCE_VERSION, resource_type, &resource);
|
||||
if (status) {
|
||||
ORT_CXX_API_THROW("Failed to fetch cuda ep resource, resouce type: " + std::to_string(resource_type), OrtErrorCode::ORT_RUNTIME_EXCEPTION);
|
||||
ORT_CXX_API_THROW("Failed to fetch cuda ep resource, resource type: " + std::to_string(resource_type),
|
||||
OrtErrorCode::ORT_RUNTIME_EXCEPTION);
|
||||
}
|
||||
T t = {};
|
||||
memcpy(&t, &resource, sizeof(T));
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
|
||||
#include "core/providers/resource.h"
|
||||
|
||||
#define ORT_CUDA_RESOUCE_VERSION 3
|
||||
#define ORT_CUDA_RESOURCE_VERSION 3
|
||||
|
||||
enum CudaResource : int {
|
||||
cuda_stream_t = cuda_resource_offset, // 10000
|
||||
|
|
|
@ -23,21 +23,24 @@ struct RocmContext : public CustomOpContext {
|
|||
void* resource = {};
|
||||
OrtStatus* status = nullptr;
|
||||
|
||||
status = ort_api.KernelContext_GetResource(&kernel_ctx, ORT_ROCM_RESOUCE_VERSION, RocmResource::hip_stream_t, &resource);
|
||||
status = ort_api.KernelContext_GetResource(
|
||||
&kernel_ctx, ORT_ROCM_RESOURCE_VERSION, RocmResource::hip_stream_t, &resource);
|
||||
if (status) {
|
||||
ORT_CXX_API_THROW("failed to fetch hip stream", OrtErrorCode::ORT_RUNTIME_EXCEPTION);
|
||||
}
|
||||
hip_stream = reinterpret_cast<hipStream_t>(resource);
|
||||
|
||||
resource = {};
|
||||
status = ort_api.KernelContext_GetResource(&kernel_ctx, ORT_ROCM_RESOUCE_VERSION, RocmResource::miopen_handle_t, &resource);
|
||||
status = ort_api.KernelContext_GetResource(
|
||||
&kernel_ctx, ORT_ROCM_RESOURCE_VERSION, RocmResource::miopen_handle_t, &resource);
|
||||
if (status) {
|
||||
ORT_CXX_API_THROW("failed to fetch miopen handle", OrtErrorCode::ORT_RUNTIME_EXCEPTION);
|
||||
}
|
||||
miopen_handle = reinterpret_cast<miopenHandle_t>(resource);
|
||||
|
||||
resource = {};
|
||||
status = ort_api.KernelContext_GetResource(&kernel_ctx, ORT_ROCM_RESOUCE_VERSION, RocmResource::rocblas_handle_t, &resource);
|
||||
status = ort_api.KernelContext_GetResource(
|
||||
&kernel_ctx, ORT_ROCM_RESOURCE_VERSION, RocmResource::rocblas_handle_t, &resource);
|
||||
if (status) {
|
||||
ORT_CXX_API_THROW("failed to fetch rocblas handle", OrtErrorCode::ORT_RUNTIME_EXCEPTION);
|
||||
}
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
|
||||
#include "core/providers/resource.h"
|
||||
|
||||
#define ORT_ROCM_RESOUCE_VERSION 1
|
||||
#define ORT_ROCM_RESOURCE_VERSION 1
|
||||
|
||||
enum RocmResource : int {
|
||||
hip_stream_t = rocm_resource_offset,
|
||||
|
|
|
@ -473,13 +473,13 @@ typedef struct OrtCUDAProviderOptions {
|
|||
|
||||
/** \brief Enable TunableOp for using.
|
||||
* Set it to 1/0 to enable/disable TunableOp. Otherwise, it is disabled by default.
|
||||
* This option can be overriden by environment variable ORT_CUDA_TUNABLE_OP_ENABLE.
|
||||
* This option can be overridden by environment variable ORT_CUDA_TUNABLE_OP_ENABLE.
|
||||
*/
|
||||
int tunable_op_enable;
|
||||
|
||||
/** \brief Enable TunableOp for tuning.
|
||||
* Set it to 1/0 to enable/disable TunableOp tuning. Otherwise, it is disabled by default.
|
||||
* This option can be overriden by environment variable ORT_CUDA_TUNABLE_OP_TUNING_ENABLE.
|
||||
* This option can be overridden by environment variable ORT_CUDA_TUNABLE_OP_TUNING_ENABLE.
|
||||
*/
|
||||
int tunable_op_tuning_enable;
|
||||
|
||||
|
@ -562,13 +562,13 @@ typedef struct OrtROCMProviderOptions {
|
|||
|
||||
/** \brief Enable TunableOp for using.
|
||||
* Set it to 1/0 to enable/disable TunableOp. Otherwise, it is disabled by default.
|
||||
* This option can be overriden by environment variable ORT_ROCM_TUNABLE_OP_ENABLE.
|
||||
* This option can be overridden by environment variable ORT_ROCM_TUNABLE_OP_ENABLE.
|
||||
*/
|
||||
int tunable_op_enable;
|
||||
|
||||
/** \brief Enable TunableOp for tuning.
|
||||
* Set it to 1/0 to enable/disable TunableOp tuning. Otherwise, it is disabled by default.
|
||||
* This option can be overriden by environment variable ORT_ROCM_TUNABLE_OP_TUNING_ENABLE.
|
||||
* This option can be overridden by environment variable ORT_ROCM_TUNABLE_OP_TUNING_ENABLE.
|
||||
*/
|
||||
int tunable_op_tuning_enable;
|
||||
|
||||
|
@ -2798,7 +2798,7 @@ struct OrtApi {
|
|||
* "initial_growth_chunk_size_bytes": (Possible) Size of the second allocation in the arena.
|
||||
* Only relevant if arena strategy is `kNextPowerOfTwo`. Use -1 to allow ORT to choose the default.
|
||||
* "max_power_of_two_extend_bytes": The maximum extend size if arena strategy is `kNextPowerOfTwo`.
|
||||
* It is not an allocation limit, it is only a limit for extention when requested byte is less than the limit.
|
||||
* It is not an allocation limit, it is only a limit for extension when requested byte is less than the limit.
|
||||
* When requested bytes is more than the limit, allocator will still return as requested.
|
||||
* Use -1 to allow ORT to choose the default 1GB for max_power_of_two_extend_bytes.
|
||||
* Ultimately, the allocation size is determined by the allocation memory request.
|
||||
|
@ -4467,13 +4467,14 @@ struct OrtApi {
|
|||
* E.g. a cuda stream or a cublas handle
|
||||
*
|
||||
* \param context - Kernel context
|
||||
* \param resouce_version - Version of the resource
|
||||
* \param resource_version - Version of the resource
|
||||
* \param resource_id - Type of resource
|
||||
* \param resource - A pointer to returned resource
|
||||
*
|
||||
* \since Version 1.16.
|
||||
*/
|
||||
ORT_API2_STATUS(KernelContext_GetResource, _In_ const OrtKernelContext* context, _In_ int resouce_version, _In_ int resource_id, _Outptr_ void** resource);
|
||||
ORT_API2_STATUS(KernelContext_GetResource, _In_ const OrtKernelContext* context, _In_ int resource_version,
|
||||
_In_ int resource_id, _Outptr_ void** resource);
|
||||
|
||||
/** \brief Set user logging function
|
||||
*
|
||||
|
@ -4528,10 +4529,10 @@ struct OrtApi {
|
|||
ORT_API2_STATUS(ShapeInferContext_GetAttribute, _In_ const OrtShapeInferContext* context, _In_ const char* attr_name, _Outptr_ const OrtOpAttr** attr);
|
||||
|
||||
/**
|
||||
* Set type and shape info of an ouput
|
||||
* Set type and shape info of an output
|
||||
*
|
||||
* \param[in] context
|
||||
* \param[in] index The index of the ouput
|
||||
* \param[in] index The index of the output
|
||||
* \param[out] info Type shape info of the output
|
||||
*
|
||||
* \since Version 1.17.
|
||||
|
|
|
@ -403,7 +403,7 @@ using Variadic = TensorArray;
|
|||
Note:
|
||||
OrtLiteCustomOp inherits from OrtCustomOp to bridge between a custom func/struct and ort core.
|
||||
The lifetime of an OrtLiteCustomOp instance is managed by customer code, not ort, so:
|
||||
1. DO NOT cast OrtLiteCustomOp to OrtCustomOp and release since there is no virtual destructor in the hierachy.
|
||||
1. DO NOT cast OrtLiteCustomOp to OrtCustomOp and release since there is no virtual destructor in the hierarchy.
|
||||
2. OrtLiteCustomFunc and OrtLiteCustomStruct, as two sub-structs, can be released in form of OrtLiteCustomOp since all members are kept in the OrtLiteCustomOp,
|
||||
hence memory could still be recycled properly.
|
||||
Further, OrtCustomOp is a c struct bearing no v-table, so offspring structs are by design to be of zero virtual functions to maintain cast safety.
|
||||
|
|
|
@ -54,7 +54,7 @@ java {
|
|||
targetCompatibility = JavaVersion.VERSION_1_8
|
||||
}
|
||||
|
||||
// This jar tasks serves as a CMAKE signalling
|
||||
// This jar task serves as a CMAKE signaling
|
||||
// mechanism. The jar will be overwritten by allJar task
|
||||
jar {
|
||||
}
|
||||
|
|
|
@ -438,7 +438,7 @@ final class OnnxRuntime {
|
|||
/**
|
||||
* Extracts the providers array from the C API, converts it into an EnumSet.
|
||||
*
|
||||
* <p>Throws IllegalArgumentException if a provider isn't recognised (note this exception should
|
||||
* <p>Throws IllegalArgumentException if a provider isn't recognized (note this exception should
|
||||
* only happen during development of ONNX Runtime, if it happens at any other point, file an issue
|
||||
* on <a href="https://github.com/microsoft/onnxruntime">GitHub</a>).
|
||||
*
|
||||
|
|
|
@ -3,5 +3,5 @@
|
|||
* Licensed under the MIT License.
|
||||
*/
|
||||
|
||||
/** Classes for controlling the behaviour of ONNX Runtime Execution Providers. */
|
||||
/** Classes for controlling the behavior of ONNX Runtime Execution Providers. */
|
||||
package ai.onnxruntime.providers;
|
||||
|
|
|
@ -242,7 +242,7 @@ public class ScoreMNIST {
|
|||
/**
|
||||
* Find the maximum probability and return its index.
|
||||
*
|
||||
* @param probabilities The probabilites.
|
||||
* @param probabilities The probabilities.
|
||||
* @return The index of the max.
|
||||
*/
|
||||
public static int pred(float[] probabilities) {
|
||||
|
|
|
@ -1234,7 +1234,7 @@ export class CoordsGlslLib extends GlslLib {
|
|||
}
|
||||
|
||||
/**
|
||||
* This is the main function to map from the given texture coordiantes (s,t)
|
||||
* This is the main function to map from the given texture coordinates (s,t)
|
||||
* to logical indices for the output
|
||||
* There will only be one single variation of this
|
||||
* Also see coordsToOffset and offsetToIndices for input-specific versions
|
||||
|
|
|
@ -85,7 +85,7 @@ function getOutOfBoundsCondition(rank: number, shape: readonly number[], dims: s
|
|||
}
|
||||
|
||||
/**
|
||||
* code snippet to sample input texture with output coordiantes
|
||||
* code snippet to sample input texture with output coordinates
|
||||
*/
|
||||
function getOutput(shape: readonly number[], dims: string[]): string {
|
||||
const rank = shape.length;
|
||||
|
|
|
@ -19,7 +19,7 @@ using onnxruntime::rnn::detail::Direction;
|
|||
using onnxruntime::rnn::detail::MakeDirection;
|
||||
|
||||
// The class represents DeepCPU implementation of a long short term memory (LSTM) plus a Bahdanau Attention wrapper.
|
||||
// The equivilent python usage could be checked int the corresponding op test directory, attention_lstm_data_gen.py.
|
||||
// The equivalent python usage could be checked in the corresponding op test directory, attention_lstm_data_gen.py.
|
||||
// Also please note that detail implementation re-used lot of code from current ONNXRuntime LSTM operator, refactor
|
||||
// is needed in future if this is become part of ONNX.
|
||||
class DeepCpuAttnLstmOp final : public OpKernel {
|
||||
|
|
|
@ -152,7 +152,7 @@ Status Sample(AllocatorPtr& allocator,
|
|||
1,
|
||||
generator,
|
||||
*sampled_idx));
|
||||
// TODO: update presense_mask()
|
||||
// TODO: update presence_mask()
|
||||
#ifdef DEBUG_GENERATION
|
||||
dumper->Print("sampled_idx", *sampled_idx);
|
||||
#endif
|
||||
|
|
|
@ -159,7 +159,7 @@ std::unique_ptr<ComputeCapability> ToCapacity(const onnxruntime::GraphViewer& gr
|
|||
ORT_THROW_IF_ERROR(node.ForEachWithIndex(node.ImplicitInputDefs(), process_input_fn));
|
||||
|
||||
// Handle outputs
|
||||
// two cases are considerd as outputs
|
||||
// two cases are considered as outputs
|
||||
// 1. Output NodeArg is not used by any Node
|
||||
// 2. Output NodeArg is used by at least one Node out of this subgraph.
|
||||
// Note a NodeArg can be used by Nodes in and out of the subgraph at the same time.
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
|
||||
#define MTI_ASSERT(condition) \
|
||||
if (!(condition)) { \
|
||||
std::string error_msg = "Not satsified: " #condition \
|
||||
std::string error_msg = "Not satisfied: " #condition \
|
||||
": line " + \
|
||||
std::to_string(__LINE__) + \
|
||||
" in file " + std::string(__FILE__) + "\n"; \
|
||||
|
|
|
@ -74,7 +74,7 @@ bool ShouldTryVectorization(
|
|||
// Check the schedule of tensor
|
||||
// If it is not scheduled, try to vectorize it.
|
||||
// Note TryVectorization has to be used with compute_root.
|
||||
// Therefore, there is a safty check of tensor's schedule
|
||||
// Therefore, there is a safety check of tensor's schedule
|
||||
bool TryVectorization(
|
||||
const tvm::Tensor& tensor,
|
||||
int64_t natural_vector_size,
|
||||
|
@ -124,7 +124,7 @@ bool TryVectorization(
|
|||
// Check the schedule of tensor
|
||||
// If it is not scheduled, try to add compute_inline on it.
|
||||
// Note TryInlineSchedule cannot be used with compute_root.
|
||||
// Therefore, there is a safty check of tensor's schedule.
|
||||
// Therefore, there is a safety check of tensor's schedule.
|
||||
bool TryInlineSchedule(
|
||||
const tvm::Tensor& tensor,
|
||||
ScheduleContext& ctx) {
|
||||
|
|
|
@ -34,7 +34,7 @@ bool ShouldTryVectorization(
|
|||
// Check the schedule of tensor
|
||||
// If it is not scheduled, try to vectorize it.
|
||||
// Note TryVectorization has to be used with compute_root.
|
||||
// Therefore, there is a safty check of tensor's schedule
|
||||
// Therefore, there is a safety check of tensor's schedule
|
||||
bool TryVectorization(
|
||||
const tvm::Tensor& tensor,
|
||||
int64_t natural_vector_size,
|
||||
|
@ -43,7 +43,7 @@ bool TryVectorization(
|
|||
// Check the schedule of tensor
|
||||
// If it is not scheduled, try to add compute_inline on it.
|
||||
// Note TryInlineSchedule cannot be used with compute_root.
|
||||
// Therefore, there is a safty check of tensor's schedule.
|
||||
// Therefore, there is a safety check of tensor's schedule.
|
||||
bool TryInlineSchedule(
|
||||
const tvm::Tensor& tensor,
|
||||
ScheduleContext& ctx);
|
||||
|
|
|
@ -39,7 +39,7 @@ void TVMScheduleBuilder::DumpAllSchedulers() const {
|
|||
|
||||
d->ForEach([&stream](const std::string& key, Scheduler* op) {
|
||||
stream << "Key " << key
|
||||
<< ", Creater " << op->Name() << std::endl;
|
||||
<< ", Creator " << op->Name() << std::endl;
|
||||
});
|
||||
|
||||
++count;
|
||||
|
|
|
@ -13,7 +13,7 @@ namespace tvm_codegen {
|
|||
|
||||
using CoordTransFunc = std::function<tvm::Array<tvm::Expr>(const tvm::Array<tvm::Expr>&)>;
|
||||
|
||||
// WeightLayout is data layout trasnformer for weight/initializer
|
||||
// WeightLayout is data layout transformer for weight/initializer
|
||||
class WeightLayout {
|
||||
public:
|
||||
// Static function to return unique string as a key
|
||||
|
|
|
@ -56,7 +56,7 @@ LoggingManager* LoggingManager::GetDefaultInstance() {
|
|||
return static_cast<LoggingManager*>(DefaultLoggerManagerInstance().load());
|
||||
}
|
||||
|
||||
// GSL_SUPRESS(i.22) is broken. Ignore the warnings for the static local variables that are trivial
|
||||
// GSL_SUPPRESS(i.22) is broken. Ignore the warnings for the static local variables that are trivial
|
||||
// and should not have any destruction order issues via pragmas instead.
|
||||
// https://developercommunity.visualstudio.com/content/problem/249706/gslsuppress-does-not-work-for-i22-c-core-guideline.html
|
||||
#ifdef _MSC_VER
|
||||
|
|
|
@ -70,7 +70,7 @@ std::string Status::ToString() const {
|
|||
return result;
|
||||
}
|
||||
|
||||
// GSL_SUPRESS(i.22) is broken. Ignore the warnings for the static local variables that are trivial
|
||||
// GSL_SUPPRESS(i.22) is broken. Ignore the warnings for the static local variables that are trivial
|
||||
// and should not have any destruction order issues via pragmas instead.
|
||||
// https://developercommunity.visualstudio.com/content/problem/249706/gslsuppress-does-not-work-for-i22-c-core-guideline.html
|
||||
#ifdef _MSC_VER
|
||||
|
|
|
@ -1073,7 +1073,7 @@ class PlannerImpl {
|
|||
|
||||
#ifdef ORT_ENABLE_STREAM
|
||||
// assume we already have a baseline reuse plan (no memory reuse at all)
|
||||
// this funciton will optimize the plan by building a reuse plan with stream safety.
|
||||
// this function will optimize the plan by building a reuse plan with stream safety.
|
||||
Status OptimizeReusePlanForMultiStream() {
|
||||
InlinedHashMap<NodeIndex, int> dependent_counter;
|
||||
for (const auto& it : dependence_graph_) {
|
||||
|
@ -2012,7 +2012,7 @@ class PlannerImpl {
|
|||
for (auto* output : node->OutputDefs()) {
|
||||
if (output->Exists()) {
|
||||
if (std::find(it->InputDefs().begin(), it->InputDefs().end(), output) != it->InputDefs().end()) {
|
||||
output_consumed_in_subgraph = false; // output direclty consumed in current graph
|
||||
output_consumed_in_subgraph = false; // output directly consumed in current graph
|
||||
OrtValueIndex output_arg_idx;
|
||||
ORT_THROW_IF_ERROR(ort_value_name_idx_map_.GetIdx(output->Name(), output_arg_idx));
|
||||
// there are two cases we need notification:
|
||||
|
|
|
@ -53,7 +53,7 @@ class SequentialPlannerContext : public ISequentialPlannerContext {
|
|||
public:
|
||||
SequentialPlannerContext(ExecutionMode execution_mode, ExecutionOrder execution_order, bool enable_memory_reuse)
|
||||
: execution_mode_(execution_mode),
|
||||
exection_order_(execution_order),
|
||||
execution_order_(execution_order),
|
||||
enable_memory_reuse_(enable_memory_reuse) {
|
||||
}
|
||||
|
||||
|
@ -63,13 +63,13 @@ class SequentialPlannerContext : public ISequentialPlannerContext {
|
|||
|
||||
bool IsParallelExecutionEnabled() const override { return execution_mode_ == ExecutionMode::ORT_PARALLEL; }
|
||||
|
||||
ExecutionOrder GetExecutionOrder() const override { return exection_order_; }
|
||||
ExecutionOrder GetExecutionOrder() const override { return execution_order_; }
|
||||
|
||||
bool GetEnableMemoryReuse() const override { return enable_memory_reuse_; }
|
||||
|
||||
private:
|
||||
ExecutionMode execution_mode_ = ExecutionMode::ORT_SEQUENTIAL;
|
||||
ExecutionOrder exection_order_ = ExecutionOrder::DEFAULT;
|
||||
ExecutionOrder execution_order_ = ExecutionOrder::DEFAULT;
|
||||
bool enable_memory_reuse_ = true;
|
||||
};
|
||||
|
||||
|
|
|
@ -93,7 +93,8 @@ class DeviceStreamCollectionImpl {
|
|||
const AllocatorMap& allocators_;
|
||||
bool is_main_graph_ = false;
|
||||
// This is used in ExecutionFrame when memory pattern is enabled, to allocate the peak size memory
|
||||
// labelled this stream in the current thread, instead of the default stream which will be used in all the threads (thus caused thread safe issue)
|
||||
// labeled this stream in the current thread, instead of the default stream which will be used in all the threads
|
||||
// (thus caused thread safe issue)
|
||||
std::unique_ptr<Stream> root_stream_;
|
||||
OrtDevice root_stream_device_;
|
||||
void ReleaseSingleStreamBuffers();
|
||||
|
|
|
@ -167,7 +167,7 @@ class ExecutionFrame final : public IExecutionFrame {
|
|||
}
|
||||
|
||||
// This function try retrieve the inferred shapes for the given NodeArg index.
|
||||
// If the retrival is sucessful, this function returns true and false otherwise.
|
||||
// If the retrieval is successful, this function returns true and false otherwise.
|
||||
bool TryGetInferredShape(int index, TensorShape& shape) const override;
|
||||
|
||||
#if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE)
|
||||
|
|
|
@ -50,7 +50,7 @@ PartialGraphExecutionState::~PartialGraphExecutionState() {
|
|||
DeviceStreamCollection* PartialGraphExecutionState::GetDeviceStreamCollection(const SessionState& session_state) {
|
||||
if (device_stream_collection_ == nullptr) {
|
||||
device_stream_collection_ = session_state.AcquireDeviceStreamCollection();
|
||||
// the life-time of partial graph execution state is in-consistant with session,
|
||||
// the life-time of partial graph execution state is inconsistent with session,
|
||||
// so we can't make sure it is safe to return the device stream collection to
|
||||
// session when deconstruct partial graph execution state.
|
||||
// so let's always delete the stream collections.
|
||||
|
|
|
@ -106,7 +106,7 @@ struct SequentialExecutionPlan : public ExecutionPlanBase {
|
|||
// types of steps:
|
||||
// 1. Kernel Launch
|
||||
// 2. Activate notification
|
||||
// 3. Wait on a notificaiton
|
||||
// 3. Wait on a notification
|
||||
class ExecutionStep {
|
||||
public:
|
||||
ExecutionStep(NodeIndex node_index) : node_index_(node_index) {}
|
||||
|
@ -122,7 +122,7 @@ struct SequentialExecutionPlan : public ExecutionPlanBase {
|
|||
protected:
|
||||
NodeIndex node_index_;
|
||||
};
|
||||
// LogicStream is a sequence of execution steps that can be executed independetly.
|
||||
// LogicStream is a sequence of execution steps that can be executed independently.
|
||||
// The steps within a sequence are executed in order, and happened on the same device.
|
||||
struct LogicStream {
|
||||
std::vector<std::unique_ptr<ExecutionStep>> steps_;
|
||||
|
@ -160,7 +160,7 @@ struct SequentialExecutionPlan : public ExecutionPlanBase {
|
|||
std::vector<size_t> notification_owners;
|
||||
// key: notification index.
|
||||
// value: {stream_idx, step_idx}
|
||||
// giving a notificaiton, we used this map to figure out what is the downstream steps it need to trigger.
|
||||
// given a notification, we use this map to figure out which downstream steps it needs to trigger.
|
||||
InlinedHashMap<onnxruntime::NotificationIndex, std::vector<std::pair<size_t, size_t>>> downstream_map;
|
||||
|
||||
size_t num_barriers{0};
|
||||
|
|
|
@ -442,7 +442,7 @@ onnxruntime::Status ExecuteKernel(StreamExecutionContext& ctx,
|
|||
if (p_kernel->KernelDef().OpName() == "YieldOp") {
|
||||
// Do not execute YieldOp (it is a no-op anyway).
|
||||
// Decrement the reference count of tensors that are not needed beyond this point.
|
||||
// REVEIW(codemzs): The current model assumes the intermediate tensors that are exported
|
||||
// REVIEW(codemzs): The current model assumes the intermediate tensors that are exported
|
||||
// as graph outputs are owned by ORT, the risk of caller freeing the tensor or manipulating tensor
|
||||
// memory lingers while the tensor is used downstream after the export.
|
||||
ctx.RecycleNodeInputs(idx);
|
||||
|
|
|
@ -62,7 +62,7 @@ enum class ExecutionPriority : int {
|
|||
|
||||
struct FreeDimensionOverride {
|
||||
std::string dim_identifier;
|
||||
FreeDimensionOverrideType dim_identifer_type;
|
||||
FreeDimensionOverrideType dim_identifier_type;
|
||||
int64_t dim_value;
|
||||
};
|
||||
|
||||
|
|
|
@ -22,9 +22,9 @@ using namespace ::onnxruntime::common;
|
|||
|
||||
namespace onnxruntime {
|
||||
#ifdef ORT_ENABLE_STREAM
|
||||
static inline std::string GetWaitKey(const OrtDevice::DeviceType notificaiton_device_type,
|
||||
static inline std::string GetWaitKey(const OrtDevice::DeviceType notification_device_type,
|
||||
const OrtDevice::DeviceType executor_device_type) {
|
||||
return std::to_string(notificaiton_device_type) + ":" + std::to_string(executor_device_type);
|
||||
return std::to_string(notification_device_type) + ":" + std::to_string(executor_device_type);
|
||||
}
|
||||
|
||||
class StreamCommandHandleRegistryImpl : public IStreamCommandHandleRegistry {
|
||||
|
|
|
@ -551,7 +551,7 @@ Status SparseTensor::Copy(const IDataTransfer& data_transfer, SparseTensor& dst_
|
|||
}
|
||||
|
||||
if (Values().Shape().Size() > 0) {
|
||||
// This instance may either have a contigious buffer which we can copy in one shot
|
||||
// This instance may either have a contiguous buffer which we can copy in one shot
|
||||
// or it can point to users buffers, in which case we have to copy each buffer individually
|
||||
// strings can not be memcpyed albeit always on CPU.
|
||||
if (p_data_ != nullptr) {
|
||||
|
@ -569,7 +569,7 @@ Status SparseTensor::Copy(const IDataTransfer& data_transfer, SparseTensor& dst_
|
|||
ORT_RETURN_IF_ERROR(data_transfer.CopyTensor(src, dst));
|
||||
}
|
||||
} else {
|
||||
// non-contiguos buffer
|
||||
// non-contiguous buffer
|
||||
if (is_string) {
|
||||
CopyStrings(Values(), result_values);
|
||||
} else {
|
||||
|
|
|
@ -151,7 +151,7 @@ common::Status GetExtDataFromTensorProto(const Env& env, const std::filesystem::
|
|||
// the data location is external. i.e. it does not load the external data.
|
||||
// However if AttributeProto contains SparseTensorProto then it converts the data into dense tensor proto
|
||||
// (including loading external data when applicable).
|
||||
// model_path is used for contructing full path for external_data
|
||||
// model_path is used for constructing full path for external_data
|
||||
// tensor_name specifies the name for the new TensorProto
|
||||
common::Status ConstantNodeProtoToTensorProto(const ONNX_NAMESPACE::NodeProto& node,
|
||||
const std::filesystem::path& model_path,
|
||||
|
@ -165,7 +165,7 @@ common::Status ConstantNodeProtoToTensorProto(const ONNX_NAMESPACE::NodeProto& n
|
|||
// Convert a SparseTensorProto to a dense TensorProto
|
||||
// If the SparseTensorProto contains external data then it loads the data and converts to dense tensor proto
|
||||
// The resulting TensorProto will contain the data as raw data.
|
||||
// model_path is used for contructing full path for external_data
|
||||
// model_path is used for constructing full path for external_data
|
||||
common::Status SparseTensorProtoToDenseTensorProto(const ONNX_NAMESPACE::SparseTensorProto& sparse,
|
||||
const std::filesystem::path& model_path,
|
||||
ONNX_NAMESPACE::TensorProto& dense);
|
||||
|
@ -174,7 +174,7 @@ common::Status SparseTensorProtoToDenseTensorProto(const ONNX_NAMESPACE::SparseT
|
|||
// Convert a TensorProto to a SparseTensorProto
|
||||
// If the tensorproto contains external data then it loads the data and converts to sparse tensor
|
||||
// The resulting SparseTensorProto will contain the data as raw data
|
||||
// model_path is used for contructing full path for external_data
|
||||
// model_path is used for constructing full path for external_data
|
||||
common::Status DenseTensorToSparseTensorProto(const ONNX_NAMESPACE::TensorProto& dense,
|
||||
const std::filesystem::path& model_path,
|
||||
ONNX_NAMESPACE::SparseTensorProto& sparse);
|
||||
|
|
|
@ -47,7 +47,7 @@ void ConstructStrings(void* p_data, int64_t elements);
|
|||
|
||||
/// <summary>
|
||||
/// Destroy std::string objects in the contiguous chunk of memory
|
||||
/// by explicitely invoking ~string();
|
||||
/// by explicitly invoking ~string();
|
||||
/// </summary>
|
||||
/// <param name="p_data"></param>
|
||||
/// <param name="elements"></param>
|
||||
|
|
|
@ -37,12 +37,12 @@ void weightsMinuEight2Half(uint32_t const &weights,
|
|||
//
|
||||
// For element 0, 1, 4, 5, we have 0x000?000?, set the high bits
|
||||
// to 0x6400, essentially we set the exponent bits to 25, effective
|
||||
// exp = 25 - 15 = 10, with explicity hight bit, the value is
|
||||
// exp = 25 - 15 = 10, with the explicit high bit, the value is
|
||||
// 2^10 + q_w.
|
||||
//
|
||||
// Similarly for element 2, 3, 6, 7, we have 0x00?000?0, set the
|
||||
// high bits to 0x5400, essentially we set the exponent bits to 21,
|
||||
// effective exp = 21 - 15 = 6, with explicity hight bit, the value
|
||||
// effective exp = 21 - 15 = 6, with the explicit high bit, the value
|
||||
// is 2^6 + q_w.
|
||||
//
|
||||
// 1.125 instruction per weight, 9 instructions in total.
|
||||
|
@ -86,12 +86,12 @@ void weights2Half([[maybe_unused]] uint32_t const &weights,
|
|||
//
|
||||
// For element 0, 1, 4, 5, we have 0x000?000?, set the high bits
|
||||
// to 0x6400, essentially we set the exponent bits to 25, effective
|
||||
// exp = 25 - 15 = 10, with explicity hight bit, the value is
|
||||
// exp = 25 - 15 = 10, with the explicit high bit, the value is
|
||||
// 2^10 + q_w.
|
||||
//
|
||||
// Similarly for element 2, 3, 6, 7, we have 0x00?000?0, set the
|
||||
// high bits to 0x5400, essentially we set the exponent bits to 21,
|
||||
// effective exp = 21 - 15 = 6, with explicity hight bit, the value
|
||||
// effective exp = 21 - 15 = 6, with the explicit high bit, the value
|
||||
// is 2^6 + q_w.
|
||||
//
|
||||
// 1.125 instruction per weight, 9 instructions in total.
|
||||
|
|
|
@ -61,7 +61,7 @@ Routine Description:
|
|||
|
||||
This implementation supports sampling a portion of the convolution
|
||||
patches. This avoids the need to allocate very large buffers to store
|
||||
all of the convolution patches at once, when the underyling GEMM
|
||||
all of the convolution patches at once, when the underlying GEMM
|
||||
implementation will already break up the operation into panels. Multiple
|
||||
threads can also be used to process different portions of the image.
|
||||
|
||||
|
@ -267,7 +267,7 @@ Routine Description:
|
|||
|
||||
This implementation supports sampling a portion of the convolution
|
||||
patches. This avoids the need to allocate very large buffers to store
|
||||
all of the convolution patches at once, when the underyling GEMM
|
||||
all of the convolution patches at once, when the underlying GEMM
|
||||
implementation will already break up the operation into panels. Multiple
|
||||
threads can also be used to process different portions of the image.
|
||||
|
||||
|
|
|
@ -1118,8 +1118,8 @@ bool CheckNodesInPathV(const Graph& graph, const Node& reshape, const Node& tran
|
|||
head_size = v_reshape_shape[3];
|
||||
|
||||
// Check reshape for attention output has shape input (0, 0, -1) or (0, 0, N*H)
|
||||
// In DistilBert, the reshape after qkv paths can not be fused during reshape fusion, so we do not have the correspondig
|
||||
// initializer. We need to get the shape information from the input of concat.
|
||||
// In DistilBert, the reshape after qkv paths can not be fused during reshape fusion, so we do not have the
|
||||
// corresponding initializer. We need to get the shape information from the input of concat.
|
||||
InlinedVector<int64_t> reshape_shape;
|
||||
if (!optimizer_utils::AppendTensorFromInitializer(graph, *(reshape.InputDefs()[1]), reshape_shape)) {
|
||||
if (CheckDistilBertReshapeShape(graph, reshape, hidden_size, record_node_idx, logger)) {
|
||||
|
|
|
@ -22,9 +22,9 @@ FreeDimensionOverrideTransformer::FreeDimensionOverrideTransformer(gsl::span<con
|
|||
: GraphTransformer("FreeDimensionOverrideTransformer") {
|
||||
for (const auto& o : overrides_to_apply) {
|
||||
// Convert to lowercase to perform case-insensitive comparisons later
|
||||
if (o.dim_identifer_type == FreeDimensionOverrideType::Denotation) {
|
||||
if (o.dim_identifier_type == FreeDimensionOverrideType::Denotation) {
|
||||
dimension_override_by_denotation_.emplace(ToLower(o.dim_identifier), o.dim_value);
|
||||
} else if (o.dim_identifer_type == FreeDimensionOverrideType::Name) {
|
||||
} else if (o.dim_identifier_type == FreeDimensionOverrideType::Name) {
|
||||
dimension_override_by_name_.emplace(o.dim_identifier, o.dim_value);
|
||||
} else {
|
||||
ORT_THROW("Invalid free dimension override.");
|
||||
|
|
|
@ -284,10 +284,12 @@ class RemoveDuplicateCastTransformer : public GraphTransformer {
|
|||
private:
|
||||
static bool UnsafeCast(DataType src_type, DataType dst_type, const Node& node) {
|
||||
// This is not a complete cast optimisation pass, and is more conservative than it could be.
|
||||
// For instance, certain integral -> floating point casts could be optimised but this is left to an explicit cast optimisation pass.
|
||||
// For instance, certain integral -> floating point casts could be optimized but
|
||||
// this is left to an explicit cast optimisation pass.
|
||||
|
||||
// The comparison with "InsertedPrecisionFreeCast_" reflects cast nodes that are inserted by InsertCastTransformer.
|
||||
// Such casts should not be considered as loss of precision - the inserted upcasts (f16 -> f32) and downcasts (f32 -> f16) are inserted to support kernels when on a CPU EP without F16 support.
|
||||
// Such casts should not be considered as loss of precision - the inserted upcasts (f16 -> f32) and
|
||||
// downcasts (f32 -> f16) are inserted to support kernels when on a CPU EP without F16 support.
|
||||
auto src_type_group = GetTypeGroup(src_type);
|
||||
auto dst_type_group = GetTypeGroup(dst_type);
|
||||
if (Unknown == src_type_group || Unknown == dst_type_group) {
|
||||
|
|
|
@ -1258,7 +1258,7 @@ static int EstimateTransposeValueCost(const api::GraphRef& graph, std::string_vi
|
|||
std::unique_ptr<api::NodeRef> producer_node = graph.GetNodeProducingOutput(input);
|
||||
|
||||
if (producer_node != nullptr) {
|
||||
// this handles cancelling out a Transpose or Squeeze added to a shared initializer that was updated
|
||||
// this handles canceling out a Transpose or Squeeze added to a shared initializer that was updated
|
||||
// by TransposeInputImpl Case 1 or UnsqueezeInput Case 1.
|
||||
// - if a shared initializer is not broadcast, we have <updated initializer> -> Transpose -> DQ
|
||||
// - if a shared initializer is broadcast, we have <updated initializer> -> Transpose -> Squeeze -> DQ and need
|
||||
|
@ -1992,7 +1992,7 @@ static bool HandleTile(HandlerArgs& args) {
|
|||
|
||||
constexpr HandlerInfo tile_handler = {&FirstInput, &HandleTile};
|
||||
|
||||
// Helper to remove cancelling Transpose -> Transpose or
|
||||
// Helper to remove canceling Transpose -> Transpose or
|
||||
// Transpose -> Reshape nodes.
|
||||
static void RemoveCancelingTransposeNodes(HandlerArgs& args) {
|
||||
// Input to 1st transpose
|
||||
|
|
|
@ -118,7 +118,7 @@ Status BatchNorm<T>::Compute(OpKernelContext* context) const {
|
|||
ACLImportMemory(tbatch_norm.b->allocator(), (void*)b_data, B->Shape().Size() * 4);
|
||||
ACLImportMemory(tbatch_norm.scale->allocator(), (void*)scale_data, S->Shape().Size() * 4);
|
||||
|
||||
// allocate space for input tensor to accomodate paddings and strides
|
||||
// allocate space for input tensor to accommodate paddings and strides
|
||||
tbatch_norm.in->allocator()->allocate();
|
||||
|
||||
tbatch_norm.layer = std::move(layer);
|
||||
|
|
|
@ -121,7 +121,7 @@ ACLNEPool PoolOperation(onnxruntime::OpKernelContext* context,
|
|||
layer->configure(tpool.in.get(), tpool.out.get(), pool_info);
|
||||
}
|
||||
|
||||
// allocate space for input tensor to accomodate paddings and strides
|
||||
// allocate space for input tensor to accommodate paddings and strides
|
||||
tpool.in->allocator()->allocate();
|
||||
|
||||
tpool.layer = std::move(layer);
|
||||
|
|
|
@ -56,7 +56,7 @@ Status Relu<T>::Compute(OpKernelContext* context) const {
|
|||
armnn::TensorInfo outputTensorInfo(outputShape, armnn::DataType::Float32);
|
||||
activation->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);
|
||||
|
||||
// Optimise ArmNN network
|
||||
// Optimize ArmNN network
|
||||
armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*myNetwork, {armnn::Compute::CpuAcc}, Relu::run->GetDeviceSpec());
|
||||
|
||||
if (optNet == nullptr) {
|
||||
|
|
|
@ -130,7 +130,7 @@ class Gemm : public onnxruntime::Gemm<T> {
|
|||
armnn::TensorInfo outputTensorInfo(outputShape, armnn::DataType::Float32);
|
||||
fc_armnn->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);
|
||||
|
||||
// Optimise ArmNN network
|
||||
// Optimize ArmNN network
|
||||
armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*myNetwork, {armnn::Compute::CpuAcc}, Gemm::run->GetDeviceSpec());
|
||||
|
||||
if (optNet == nullptr) {
|
||||
|
|
|
@ -89,7 +89,7 @@ Status BatchNorm<T>::Compute(OpKernelContext* context) const {
|
|||
armnn::TensorInfo outputTensorInfo(outputShape, armnn::DataType::Float32);
|
||||
layer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);
|
||||
|
||||
// Optimise ArmNN network
|
||||
// Optimize ArmNN network
|
||||
armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*myNetwork, {armnn::Compute::CpuAcc}, BatchNorm::run->GetDeviceSpec());
|
||||
|
||||
if (optNet == nullptr) {
|
||||
|
|
|
@ -266,7 +266,7 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
|
|||
activation->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);
|
||||
}
|
||||
|
||||
// Optimise ArmNN network
|
||||
// Optimize ArmNN network
|
||||
armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*myNetwork, {armnn::Compute::CpuAcc}, Conv::run->GetDeviceSpec());
|
||||
|
||||
if (optNet == nullptr) {
|
||||
|
|
|
@ -161,7 +161,7 @@ Status Pool<T, PoolType>::Compute(OpKernelContext* context) const {
|
|||
armnn::TensorInfo outputTensorInfo(outputShape, armnn::DataType::Float32);
|
||||
pool_armnn->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);
|
||||
|
||||
// Optimise ArmNN network
|
||||
// Optimize ArmNN network
|
||||
armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*myNetwork, {armnn::Compute::CpuAcc}, Pool::run->GetDeviceSpec());
|
||||
|
||||
if (optNet == nullptr) {
|
||||
|
@ -250,7 +250,7 @@ Status MaxPoolV8<T>::Compute(OpKernelContext* context) const {
|
|||
armnn::TensorInfo outputTensorInfo(outputShape, armnn::DataType::Float32);
|
||||
pool_armnn->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);
|
||||
|
||||
// Optimise ArmNN network
|
||||
// Optimize ArmNN network
|
||||
armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*myNetwork, {armnn::Compute::CpuAcc}, MaxPoolV8::run->GetDeviceSpec());
|
||||
|
||||
if (optNet == nullptr) {
|
||||
|
|
|
@ -290,9 +290,9 @@ std::unique_ptr<Tensor> Transpose(const Tensor& input, const TensorShape& input_
|
|||
// and it will de-allocate the memory for this intermediate tensor when it goes out of scope
|
||||
std::unique_ptr<Tensor> output = std::make_unique<Tensor>(input.DataType(), output_dims, allocator);
|
||||
|
||||
TensorShape overriden_shape(input_shape_override);
|
||||
TensorShape overridden_shape(input_shape_override);
|
||||
|
||||
auto status = device_transpose_func(permutation, input, *output, &overriden_shape, einsum_cuda_assets);
|
||||
auto status = device_transpose_func(permutation, input, *output, &overridden_shape, einsum_cuda_assets);
|
||||
|
||||
if (!status.IsOK()) {
|
||||
ORT_THROW(ONNXRUNTIME, FAIL, "Einsum op: Transpose failed: ", status.ErrorMessage());
|
||||
|
|
|
@ -209,7 +209,7 @@ std::unique_ptr<Tensor> EinsumTypedComputeProcessor<T>::PairwiseOperandProcess(c
|
|||
if (current_left && IsTransposeReshapeForEinsum(left_permutation,
|
||||
current_left->Shape().GetDims(),
|
||||
reshaped_dims)) {
|
||||
// This can be done because curent_* tensors (if they exist) and output tensors are
|
||||
// This can be done because current_* tensors (if they exist) and output tensors are
|
||||
// intermediate tensors and cannot be input tensors to the Einsum node itself
|
||||
// (which are immutable).
|
||||
// Covered by ExplicitEinsumAsTensorContractionReshapeLeft.
|
||||
|
|
|
@ -135,7 +135,7 @@ static void PreCalcForBilinearInterpolate(const int64_t height, const int64_t wi
|
|||
T w3 = ly * hx;
|
||||
T w4 = ly * lx;
|
||||
|
||||
// save weights and indeces
|
||||
// save weights and indices
|
||||
PreCalc<T> pc;
|
||||
pc.pos1 = y_low * width + x_low;
|
||||
pc.pos2 = y_low * width + x_high;
|
||||
|
|
|
@ -317,7 +317,7 @@ Status SequenceConstruct::Compute(OpKernelContext* context) const {
|
|||
const auto* X = context->Input<Tensor>(input_idx);
|
||||
if (input_idx > 0 && X->DataType() != first_dtype) {
|
||||
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
|
||||
"Violation of the requirment that all input tensors must have the same data type.");
|
||||
"Violation of the requirement that all input tensors must have the same data type.");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -51,7 +51,7 @@ ONNX_OPERATOR_SET_SCHEMA(
|
|||
1,
|
||||
"indices",
|
||||
"A 1-D INT64 tensor "
|
||||
"containing indices of 'Y' elements' first occurance in 'X'. "
|
||||
"containing indices of 'Y' elements' first occurrence in 'X'. "
|
||||
"When 'axis' is provided, it contains indices to subtensors in input 'X' on the 'axis'. "
|
||||
"When 'axis' is not provided, it contains indices to values in the flattened input tensor. ",
|
||||
"tensor(int64)",
|
||||
|
|
|
@ -60,7 +60,7 @@ void* CUDAExternalAllocator::Alloc(size_t size) {
|
|||
if (size > 0) {
|
||||
p = alloc_(size);
|
||||
|
||||
// review(codemzs): ORT_ENFORCE does not seem appropiate.
|
||||
// review(codemzs): ORT_ENFORCE does not seem appropriate.
|
||||
ORT_ENFORCE(p != nullptr);
|
||||
}
|
||||
|
||||
|
|
|
@ -179,7 +179,7 @@ Status CudaStream::CleanUpOnRunEnd() {
|
|||
}
|
||||
|
||||
void* CudaStream::GetResource(int version, int id) const {
|
||||
ORT_ENFORCE(version <= ORT_CUDA_RESOUCE_VERSION, "resource version unsupported!");
|
||||
ORT_ENFORCE(version <= ORT_CUDA_RESOURCE_VERSION, "resource version unsupported!");
|
||||
void* resource{};
|
||||
switch (id) {
|
||||
case CudaResource::cuda_stream_t:
|
||||
|
|
|
@ -30,7 +30,7 @@ dim3 SoftMax_getBlockSize(int ILP, uint64_t dim_size) {
|
|||
uint64_t max_block_size = std::min(dim_size / ILP, static_cast<uint64_t>(max_threads));
|
||||
|
||||
// In the vectorized case we want to trade off allowing more of the buffers to be accessed
|
||||
// in a vectorized way against wanting a larger block size to get better utilisation.
|
||||
// in a vectorized way against wanting a larger block size to get better utilization.
|
||||
// In general with ILP you can have (ILP-1)/ILP of the buffer accessed vectorised, at the risk
|
||||
// of having a very small block size. We choose to keep >= 1/2 of the buffer vectorised while
|
||||
// allowing a larger block size.
|
||||
|
|
|
@ -15,7 +15,7 @@ namespace onnxruntime {
|
|||
namespace cuda {
|
||||
|
||||
// Op Set 11 for Conv only update document to clarify default dilations and strides value.
|
||||
// which are already convered by op set 11 cpu versoin, so simply add declaration.
|
||||
// which are already covered by op set 11 cpu version, so simply add declaration.
|
||||
#define REGISTER_KERNEL_TYPED(T, DOMAIN, NHWC) \
|
||||
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
|
||||
Conv, \
|
||||
|
@ -269,7 +269,7 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
|
|||
// especially for EXHAUSTIVE algo search which may result in a better algo selection.
|
||||
// ORTModule uses different algo search options (HEURISTIC, and use max workspace size) compared to
|
||||
// inference build (EXHAUSTIVE, 32M workspace size). We observed better perf when we pad input shape
|
||||
// [N,C,D] to [N,C,1,D], expecially on A100, and especially for ConvGrad.
|
||||
// [N,C,D] to [N,C,1,D], especially on A100, and especially for ConvGrad.
|
||||
// PyTorch also pads to [N,C,1,D]. For inference build, we still pad it to [N, C, D, 1] as this seems
|
||||
// to be the sweet spot for all algo search options: EXHAUSTIVE, HEURISTIC, and DEFAULT.
|
||||
// See PR #7348 and #7702 for more context.
|
||||
|
|
|
@ -20,7 +20,7 @@
|
|||
|
||||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
||||
|
||||
template <typename T>
|
||||
__device__ T bilinear_interpolate(
|
||||
const T* bottom_data,
|
||||
|
@ -73,8 +73,8 @@ __device__ T bilinear_interpolate(
|
|||
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
|
||||
|
||||
T val = is_mode_avg
|
||||
? (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4) // mode Avg
|
||||
: max(max(max(w1 * v1, w2 * v2), w3 * v3), w4 * v4); // mode Max
|
||||
? (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4) // mode Avg
|
||||
: max(max(max(w1 * v1, w2 * v2), w3 * v3), w4 * v4); // mode Max
|
||||
|
||||
return val;
|
||||
}
|
||||
|
@ -116,7 +116,7 @@ __global__ void RoIAlignForward(
|
|||
|
||||
T roi_width = roi_end_w - roi_start_w;
|
||||
T roi_height = roi_end_h - roi_start_h;
|
||||
if (!half_pixel) { // backward compatiblity
|
||||
if (!half_pixel) { // backward compatibility
|
||||
// Force malformed ROIs to be 1x1
|
||||
roi_width = max(roi_width, (T)1.);
|
||||
roi_height = max(roi_height, (T)1.);
|
||||
|
@ -129,29 +129,29 @@ __global__ void RoIAlignForward(
|
|||
|
||||
// We use roi_bin_grid to sample the grid and mimic integral
|
||||
int roi_bin_grid_h = (sampling_ratio > 0)
|
||||
? sampling_ratio
|
||||
: _Ceil(roi_height / pooled_height); // e.g., = 2
|
||||
? sampling_ratio
|
||||
: _Ceil(roi_height / pooled_height); // e.g., = 2
|
||||
int roi_bin_grid_w =
|
||||
(sampling_ratio > 0) ? sampling_ratio : _Ceil(roi_width / pooled_width);
|
||||
|
||||
// We do average (integral) pooling inside a bin
|
||||
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
|
||||
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
|
||||
|
||||
T output_val = 0.;
|
||||
bool max_flag = false;
|
||||
for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
|
||||
for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
|
||||
{
|
||||
const T y = roi_start_h + ph * bin_size_h +
|
||||
static_cast<T>(iy + .5f) * bin_size_h /
|
||||
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
|
||||
static_cast<T>(iy + .5f) * bin_size_h /
|
||||
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
|
||||
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
|
||||
const T x = roi_start_w + pw * bin_size_w +
|
||||
static_cast<T>(ix + .5f) * bin_size_w /
|
||||
static_cast<T>(roi_bin_grid_w);
|
||||
static_cast<T>(ix + .5f) * bin_size_w /
|
||||
static_cast<T>(roi_bin_grid_w);
|
||||
|
||||
T val = bilinear_interpolate(
|
||||
offset_bottom_data, height, width, y, x, is_mode_avg, index);
|
||||
|
||||
|
||||
if (is_mode_avg) {
|
||||
output_val += val;
|
||||
} else {
|
||||
|
@ -174,24 +174,24 @@ __global__ void RoIAlignForward(
|
|||
|
||||
template <typename T>
|
||||
void RoiAlignImpl(
|
||||
cudaStream_t stream,
|
||||
const int64_t nthreads,
|
||||
const T* bottom_data,
|
||||
const T spatial_scale,
|
||||
const int64_t channels,
|
||||
const int64_t height,
|
||||
const int64_t width,
|
||||
const int64_t pooled_height,
|
||||
const int64_t pooled_width,
|
||||
const int64_t sampling_ratio,
|
||||
const T* bottom_rois,
|
||||
int64_t roi_cols,
|
||||
T* top_data,
|
||||
const bool is_mode_avg,
|
||||
const bool half_pixel,
|
||||
const int64_t* batch_indices_ptr) {
|
||||
int blocksPerGrid = (int)(ceil(static_cast<float>(nthreads) / GridDim::maxThreadsPerBlock));
|
||||
RoIAlignForward<T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
|
||||
cudaStream_t stream,
|
||||
const int64_t nthreads,
|
||||
const T* bottom_data,
|
||||
const T spatial_scale,
|
||||
const int64_t channels,
|
||||
const int64_t height,
|
||||
const int64_t width,
|
||||
const int64_t pooled_height,
|
||||
const int64_t pooled_width,
|
||||
const int64_t sampling_ratio,
|
||||
const T* bottom_rois,
|
||||
int64_t roi_cols,
|
||||
T* top_data,
|
||||
const bool is_mode_avg,
|
||||
const bool half_pixel,
|
||||
const int64_t* batch_indices_ptr) {
|
||||
int blocksPerGrid = (int)(ceil(static_cast<float>(nthreads) / GridDim::maxThreadsPerBlock));
|
||||
RoIAlignForward<T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
|
||||
nthreads,
|
||||
bottom_data,
|
||||
spatial_scale,
|
||||
|
@ -206,30 +206,30 @@ void RoiAlignImpl(
|
|||
top_data,
|
||||
is_mode_avg,
|
||||
half_pixel,
|
||||
batch_indices_ptr);
|
||||
batch_indices_ptr);
|
||||
}
|
||||
|
||||
#define SPECIALIZED_IMPL(T) \
|
||||
template void RoiAlignImpl<T>( \
|
||||
cudaStream_t stream, \
|
||||
const int64_t nthreads, \
|
||||
const T* bottom_data, \
|
||||
const T spatial_scale, \
|
||||
const int64_t channels, \
|
||||
const int64_t height, \
|
||||
const int64_t width, \
|
||||
const int64_t pooled_height, \
|
||||
const int64_t pooled_width, \
|
||||
const int64_t sampling_ratio, \
|
||||
const T* bottom_rois, \
|
||||
int64_t roi_cols, \
|
||||
T* top_data, \
|
||||
const bool is_mode_avg, \
|
||||
const bool half_pixel, \
|
||||
const int64_t* batch_indices_ptr);
|
||||
#define SPECIALIZED_IMPL(T) \
|
||||
template void RoiAlignImpl<T>( \
|
||||
cudaStream_t stream, \
|
||||
const int64_t nthreads, \
|
||||
const T* bottom_data, \
|
||||
const T spatial_scale, \
|
||||
const int64_t channels, \
|
||||
const int64_t height, \
|
||||
const int64_t width, \
|
||||
const int64_t pooled_height, \
|
||||
const int64_t pooled_width, \
|
||||
const int64_t sampling_ratio, \
|
||||
const T* bottom_rois, \
|
||||
int64_t roi_cols, \
|
||||
T* top_data, \
|
||||
const bool is_mode_avg, \
|
||||
const bool half_pixel, \
|
||||
const int64_t* batch_indices_ptr);
|
||||
|
||||
SPECIALIZED_IMPL(float)
|
||||
SPECIALIZED_IMPL(double)
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace onnxruntime
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace onnxruntime
|
||||
|
|
|
@ -115,7 +115,7 @@ Status ReduceKernel<allow_multi_axes>::ReduceKernelShared(
|
|||
CUDNN_RETURN_IF_ERROR(cudnnGetReductionIndicesSize(cudnn_handle, reduce_desc, input_tensor, output_tensor, &indices_bytes));
|
||||
auto indices_cuda = GetScratchBuffer<uint32_t>(indices_bytes, stream);
|
||||
|
||||
// need to allocate a separate buffer for ArgMin/ArgMax comparsion output
|
||||
// need to allocate a separate buffer for ArgMin/ArgMax comparison output
|
||||
auto output_count = output_shape.Size();
|
||||
|
||||
if (ReduceTensorIndices == CUDNN_REDUCE_TENSOR_NO_INDICES) {
|
||||
|
|
|
@ -234,15 +234,15 @@ __global__ void _ResizeNearestKernel(
|
|||
|
||||
int output_index = static_cast<int>(id);
|
||||
int input_index = 0;
|
||||
int extrapolation_occured = 0;
|
||||
int extrapolation_occurred = 0;
|
||||
for (int axis = 0; axis < rank; ++axis) {
|
||||
int dim = 0;
|
||||
output_div_pitches[axis].divmod(output_index, dim, output_index);
|
||||
const NearestMappingInfo& mi = dims_mapping[prefix_dim_sum[axis] + dim];
|
||||
extrapolation_occured += mi.extrapolate_;
|
||||
extrapolation_occurred += mi.extrapolate_;
|
||||
input_index += input_strides[axis] * mi.origin_;
|
||||
}
|
||||
output_data[id] = extrapolation_occured ? extrapolation_value : input_data[input_index];
|
||||
output_data[id] = extrapolation_occurred ? extrapolation_value : input_data[input_index];
|
||||
}
|
||||
|
||||
struct LinearMappingInfo {
|
||||
|
|
|
@ -145,7 +145,7 @@ bool CanDoTranspose4DParallelizeMultipleElementsPerThreadInInnermostDim(const cu
|
|||
(input_dims[3] % num_elements_per_thread) == 0 &&
|
||||
input_dims[1] <= prop.maxGridSize[1] &&
|
||||
input_dims[0] <= prop.maxGridSize[2]) {
|
||||
// There are 2 constrains when luanching the kernels
|
||||
// There are 2 constraints when launching the kernels
|
||||
// 1. block_size_x * block_size_y <= prop.maxThreadsPerBlock
|
||||
// 2. block_size_y * num_block_ext >= input_dims[2]
|
||||
int64_t block_size_x = input_dims[3] / num_elements_per_thread;
|
||||
|
@ -261,7 +261,7 @@ bool CanDoTranspose4DParallelizeOneElementPerThread(const cudaDeviceProp& prop,
|
|||
if (input_dims[3] <= prop.maxThreadsPerBlock &&
|
||||
input_dims[1] <= prop.maxGridSize[1] &&
|
||||
input_dims[0] <= prop.maxGridSize[2]) {
|
||||
// There are 2 constrains when luanching the kernels
|
||||
// There are 2 constraints when launching the kernels
|
||||
// 1. block_size_x * block_size_y <= prop.maxThreadsPerBlock
|
||||
// 2. block_size_y * num_block_ext >= input_dims[2]
|
||||
int64_t block_size_x = input_dims[3];
|
||||
|
|
|
@ -44,7 +44,7 @@ public:
|
|||
|
||||
// At this point, we have manipulated input/output shapes and strides and
|
||||
// we do not care about actual input shapes present in the model (.onnx file).
|
||||
// Create the TensorDesc with the manipulated input shapes becuase we don't want incorrect
|
||||
// Create the TensorDesc with the manipulated input shapes because we don't want incorrect
|
||||
// broadcasting to happen inside TensorDesc constructor.
|
||||
std::vector<std::optional<uint32_t>> inputIndices = { 0, 1, std::nullopt };
|
||||
gsl::span<const uint32_t> inputShapes[2] = {sizesA, sizesB};
|
||||
|
|
|
@ -344,7 +344,7 @@ Status DnnlExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fuse
|
|||
auto input_tensor = ctx.GetInput(i);
|
||||
auto tensor_info = input_tensor.GetTensorTypeAndShapeInfo();
|
||||
auto shape = tensor_info.GetShape();
|
||||
// dnnl expectes non-const data
|
||||
// dnnl expects non-const data
|
||||
void* inputBuffer = const_cast<void*>(input_tensor.GetTensorRawData());
|
||||
inputs.emplace(
|
||||
input_name,
|
||||
|
|
|
@ -431,7 +431,7 @@ bool DnnlMatMulIntegerNodeCapability::IsDimensionSupported(const Node* node, con
|
|||
}
|
||||
}
|
||||
|
||||
// if shape nullptr, not enough information to reject it. attempt to run it (no gaurantee)
|
||||
// if shape nullptr, not enough information to reject it. attempt to run it (no guarantee)
|
||||
if (node_inputs[0]->Shape() == nullptr || node_inputs[1]->Shape() == nullptr) {
|
||||
return true;
|
||||
}
|
||||
|
@ -465,7 +465,7 @@ bool DnnlSumNodeCapability::Supported(const Node* node, const GraphViewer& graph
|
|||
}
|
||||
|
||||
// OneDNN version of Sum does not support Numpy style broadcasting.
|
||||
// If the dimentions of all inputs do not match return false
|
||||
// If the dimensions of all inputs do not match return false
|
||||
bool DnnlSumNodeCapability::IsDimensionSupported(const Node* node) const {
|
||||
auto node_inputs = node->InputDefs();
|
||||
// find first non-null shape
|
||||
|
@ -615,7 +615,7 @@ bool DnnlReshapeNodeCapability::Supported(const Node* node, const GraphViewer& g
|
|||
}
|
||||
bool DnnlReshapeNodeCapability::IsDimensionSupported(const Node* node) const {
|
||||
auto node_inputs = node->InputDefs();
|
||||
// We can not reshape a one dimentional tensor to a scalar output
|
||||
// We can not reshape a one dimensional tensor to a scalar output
|
||||
if (node_inputs[1]->Shape() != nullptr &&
|
||||
node_inputs[1]->Shape()->dim_size() == 1 &&
|
||||
node_inputs[1]->Shape()->dim(0).dim_value() == 0) {
|
||||
|
|
|
@ -32,9 +32,9 @@ class DnnlConv {
|
|||
|
||||
private:
|
||||
/*
|
||||
* Return the infered padding.
|
||||
* Return the inferred padding.
|
||||
*
|
||||
* The padding will be based on the specified padding or will infered based on the
|
||||
* The padding will be based on the specified padding or will be inferred based on the
|
||||
* Onnx 'auto_pad' attributes.
|
||||
*
|
||||
* This will return the padding in the format specified in the Onnx specification.
|
||||
|
@ -47,9 +47,9 @@ class DnnlConv {
|
|||
const dnnl::memory::dims& dilations,
|
||||
const std::vector<int64_t>& kernel_shape,
|
||||
const dnnl::memory::dims& strides);
|
||||
/* Get the padding left values from the infered pads */
|
||||
/* Get the padding left values from the inferred pads */
|
||||
dnnl::memory::dims GetPaddingLeft(const std::vector<int64_t>& onnx_padding, ConvShape shape);
|
||||
/* Get the padding right values from the infered pads */
|
||||
/* Get the padding right values from the inferred pads */
|
||||
dnnl::memory::dims GetPaddingRight(const std::vector<int64_t>& onnx_padding, ConvShape shape);
|
||||
|
||||
/*
|
||||
|
|
|
@ -40,7 +40,7 @@ ConvGrad: (According to OnnxRuntime discovered using code inspection and Onnx do
|
|||
|
||||
Attributes (auto_pad, dilations, group, kernel_shape, pads, and strides) should be the same as the forward pass Conv operator
|
||||
|
||||
To acheive Everything specified in the OnnxRuntime ConvGrad we must use both:
|
||||
To achieve everything specified in the OnnxRuntime ConvGrad we must use both:
|
||||
1) dnnl::convolution_backward_data - used to calculate (dX) diff_src
|
||||
2) dnnl::convolution_backward_weights - used to calculate (dW) diff_weights and (dB) diff_bias
|
||||
*/
|
||||
|
|
|
@ -39,9 +39,9 @@ class DnnlConvGrad {
|
|||
std::vector<int64_t> GetKernelShape(DnnlNode& node);
|
||||
/* Get the 'pads' attribute */
|
||||
dnnl::memory::dims GetPads(DnnlNode& node, ConvShape shape);
|
||||
/* Get the padding left values from the infered pads */
|
||||
/* Get the padding left values from the inferred pads */
|
||||
dnnl::memory::dims GetPaddingLeft(const std::vector<int64_t>& onnx_padding, ConvShape shape);
|
||||
/* Get the padding right values from the infered pads */
|
||||
/* Get the padding right values from the inferred pads */
|
||||
dnnl::memory::dims GetPaddingRight(const std::vector<int64_t>& onnx_padding, ConvShape shape);
|
||||
/*
|
||||
* Get the 'dilations' attribute.
|
||||
|
|
|
@ -68,7 +68,7 @@ void DnnlDequantizeLinear::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode&
|
|||
auto dst_md = dnnl::memory::desc(x_md.get_dims(), node.Output(OUT_Y).Type(), dnnl::memory::format_tag::any);
|
||||
dnnl::memory dst_mem;
|
||||
|
||||
// If zero point exists and we are NOT dequantizing int32, then substract zp from x and scale
|
||||
// If zero point exists and we are NOT dequantizing int32, then subtract zp from x and scale
|
||||
if (isZeroPointUseful && (x_mem.get_desc().get_data_type() != dnnl::memory::data_type::s32)) {
|
||||
// Get Zero point
|
||||
auto x_zp_mem = sp.GetMemory(node.Input(IN_X_ZERO_POINT));
|
||||
|
|
|
@ -126,7 +126,7 @@ void DnnlMatMul::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
|
|||
}
|
||||
|
||||
// The reorder from above will get the memory in the right order. The next few lines will create a memory and memory descriptor
|
||||
// that will have the correct dimentions and correct memory::format
|
||||
// that will have the correct dimensions and correct memory::format
|
||||
transposedA_md = dnnl::memory::desc(transposedA_dims, node.Input(IN_A).Type(), sp.GetDnnlFormat(transposedA_dims.size()));
|
||||
transposedA_mem = dnnl::memory(transposedA_md, eng, nullptr);
|
||||
void* handle = intermediateA_mem.get_data_handle();
|
||||
|
@ -146,7 +146,7 @@ void DnnlMatMul::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
|
|||
}
|
||||
|
||||
// The reorder from above will get the memory in the right order. The next few lines will create a memory and memory descriptor
|
||||
// that will have the correct dimentions and correct memory::format
|
||||
// that will have the correct dimensions and correct memory::format
|
||||
transposedB_md = dnnl::memory::desc(transposedB_dims, node.Input(IN_B).Type(), sp.GetDnnlFormat(transposedB_dims.size()));
|
||||
transposedB_mem = dnnl::memory(transposedB_md, eng, nullptr);
|
||||
void* handle = intermediateB_mem.get_data_handle();
|
||||
|
@ -193,8 +193,8 @@ void DnnlMatMul::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
|
|||
create a post op binary with possible unsqueezing in order to make sure onednn properly broadcast
|
||||
current limitation
|
||||
1. is no unsqueeze for matmul output as it is not exposed due to post op fusion
|
||||
2. the third input has to be reordered to plain format (eg, no memory format propogation if the third input is internal to subgraph)
|
||||
3. adding 1s to front (unsqueeze/expand) in logical dims would possibly fail if physcial layout is not plain format
|
||||
2. the third input has to be reordered to plain format (eg, no memory format propagation if the third input is internal to subgraph)
|
||||
3. adding 1s to front (unsqueeze/expand) in logical dims would possibly fail if physical layout is not plain format
|
||||
*/
|
||||
dnnl::primitive_attr attr;
|
||||
if (has_postop_fusion) {
|
||||
|
|
|
@ -135,16 +135,16 @@ void DnnlReduce::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
|
|||
* shape reduction. For this reason we have code paths that are taken if the source dimensions and
|
||||
* destination dimensions are equal that will not call the reduction op.
|
||||
*
|
||||
* "ReduceLogSum" is equivelent to Log(ReduceSum(input))
|
||||
* "ReduceLogSum" is equivalent to Log(ReduceSum(input))
|
||||
* - if the reduction op is called then the eltwise_log post op will added to the reduction primitive.
|
||||
* - if the reduction op is not called then the eltwise_log primitive is added as its own primitive
|
||||
* - NOTE "ReduceLogSum" follows the code flow of "All other reduce ops" with the exception of the added
|
||||
* post op and an extra check if src_dims == dest_dims.
|
||||
* "ReduceLogSumExp" is equivelent to Log(ReduceSum(Exp(input)))
|
||||
* "ReduceLogSumExp" is equivalent to Log(ReduceSum(Exp(input)))
|
||||
* - if the reduction op is called then the eltwise_exp primitive is added before the reduction op
|
||||
* the eletwise_log post op will be added to the reduction primitive
|
||||
* - if the reduction op is not called then the input is not modified since Log(Exp(input) == input
|
||||
* "ReduceSumSquare" is equivelent to ReduceSum(Square(input))
|
||||
* "ReduceSumSquare" is equivalent to ReduceSum(Square(input))
|
||||
* - the eltwise_square primitive is added before the reduction op
|
||||
* - if the source and destination dimensions are not equal the reduction op is called
|
||||
* All other reduce ops
|
||||
|
@ -298,7 +298,7 @@ void DnnlReduce::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
|
|||
dnnl::memory squeeze_mem = dnnl::memory(squeeze_md, dnnl_engine, nullptr);
|
||||
// if the src and dst dims are equal then we will have a valid data handle here.
|
||||
// Otherwise we must get the data handle at runtime using the AddReshape function.
|
||||
// reading the data handle directy is more efficent if is it possible.
|
||||
// reading the data handle directly is more efficient if is it possible.
|
||||
if (!src_and_dst_dims_equal) {
|
||||
squeeze_mem.set_data_handle(reduce_dst_mem.get_data_handle());
|
||||
} else {
|
||||
|
|
|
@ -65,7 +65,7 @@ class DnnlSubgraphPrimitive {
|
|||
dnnl::memory::desc GetOutputInfo(std::string name);
|
||||
bool IsScalarOutput(const std::string& name);
|
||||
bool IsDynamic();
|
||||
// All Scalar inputs are automatically converterted to a one dimentional tensor when used in OneDNN
|
||||
// All Scalar inputs are automatically converterted to a one dimensional tensor when used in OneDNN
|
||||
// If the input being a scalar affects the operator this function can be used to determine if the
|
||||
// original input from ORT was a scalar.
|
||||
bool IsScalar(const DnnlTensor& tensor);
|
||||
|
|
|
@ -56,7 +56,8 @@ void DnnlTranspose::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
|
|||
strides_inverse.push_back(strides[ndata_dims - i - 1]);
|
||||
}
|
||||
|
||||
// Memory descriptor describes the memory reorder but will not have the correct output dimentions or the correct dnnl::memory::format
|
||||
// Memory descriptor describes the memory reorder but will not have the correct output dimensions
|
||||
// or the correct dnnl::memory::format
|
||||
dnnl::memory::desc intermediate_md = dnnl::memory::desc(data_dims, node.Input(IN_DATA).Type(), strides);
|
||||
dnnl::memory intermediate_mem = dnnl::memory(intermediate_md, dnnl_engine);
|
||||
|
||||
|
@ -65,7 +66,7 @@ void DnnlTranspose::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
|
|||
{DNNL_ARG_TO, intermediate_mem}});
|
||||
|
||||
// The reorder from above will get the memory in the right order. The next few lines will create a memory and memory descriptor
|
||||
// that will have the correct dimentions and correct memory::format
|
||||
// that will have the correct dimensions and correct memory::format
|
||||
dnnl::memory::desc transposed_md = dnnl::memory::desc(transposed_dims, node.Input(IN_DATA).Type(), sp.GetDnnlFormat(data_dims.size()));
|
||||
dnnl::memory transposed_mem = dnnl::memory(transposed_md, dnnl_engine, nullptr);
|
||||
void* handle = intermediate_mem.get_data_handle();
|
||||
|
|
|
@ -42,7 +42,7 @@ void* MIGraphXExternalAllocator::Alloc(size_t size) {
|
|||
if (size > 0) {
|
||||
p = alloc_(size);
|
||||
|
||||
// review(codemzs): ORT_ENFORCE does not seem appropiate.
|
||||
// review(codemzs): ORT_ENFORCE does not seem appropriate.
|
||||
ORT_ENFORCE(p != nullptr);
|
||||
}
|
||||
|
||||
|
|
|
@ -123,7 +123,7 @@ Status MIGraphXStream::CleanUpOnRunEnd() {
|
|||
}
|
||||
|
||||
void* MIGraphXStream::GetResource(int version, int id) const {
|
||||
ORT_ENFORCE(version <= ORT_ROCM_RESOUCE_VERSION, "resource version unsupported!");
|
||||
ORT_ENFORCE(version <= ORT_ROCM_RESOURCE_VERSION, "resource version unsupported!");
|
||||
void* resource{};
|
||||
switch (id) {
|
||||
case RocmResource::hip_stream_t:
|
||||
|
|
|
@ -228,7 +228,7 @@ const NnApi LoadNnApi() {
|
|||
nnapi.ASharedMemory_create = getASharedMemory_create();
|
||||
#else
|
||||
// Mock ASharedMemory_create only if libneuralnetworks.so was successfully
|
||||
// loaded. This ensures identical behaviour on platforms which use this
|
||||
// loaded. This ensures identical behavior on platforms which use this
|
||||
// implementation, but don't have libneuralnetworks.so library, and
|
||||
// platforms which use nnapi_implementation_disabled.cc stub.
|
||||
if (libneuralnetworks != nullptr) {
|
||||
|
|
|
@ -28,7 +28,7 @@ constexpr const char* RKNPU = "Rknpu";
|
|||
struct RknpuFuncState {
|
||||
std::string uniq_input_shape;
|
||||
|
||||
std::unique_ptr<rk::nn::Exection> exector;
|
||||
std::unique_ptr<rk::nn::Execution> exector;
|
||||
ONNX_NAMESPACE::ModelProto model_proto;
|
||||
std::unordered_map<std::string, int> input_map;
|
||||
std::unordered_map<std::string, int> output_map;
|
||||
|
@ -282,7 +282,7 @@ common::Status RknpuExecutionProvider::Compile(const std::vector<FusedNodeAndGra
|
|||
std::unique_ptr<RknpuFuncState> p =
|
||||
std::make_unique<RknpuFuncState>();
|
||||
rk::nn::Graph* graph = new rk::nn::Graph();
|
||||
*p = {"", std::unique_ptr<rk::nn::Exection>(new rk::nn::Exection(graph)),
|
||||
*p = {"", std::unique_ptr<rk::nn::Execution>(new rk::nn::Execution(graph)),
|
||||
model_proto_[context->node_name], input_info_[context->node_name],
|
||||
output_info_[context->node_name],
|
||||
std::vector<int>{}, std::vector<int>{}};
|
||||
|
|
|
@ -12,7 +12,7 @@ namespace onnxruntime {
|
|||
namespace rocm {
|
||||
|
||||
// Op Set 11 for Conv only update document to clearify default dilations and strides value.
|
||||
// which are already convered by op set 11 cpu versoin, so simply add declaration.
|
||||
// which are already convered by op set 11 cpu version, so simply add declaration.
|
||||
#define REGISTER_KERNEL_TYPED(T) \
|
||||
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
|
||||
Conv, \
|
||||
|
|
|
@ -226,7 +226,7 @@ Status ReduceKernel<allow_multi_axes>::ReduceKernelShared(
|
|||
MIOPEN_RETURN_IF_ERROR(miopenGetReductionIndicesSize(miopen_handle, reduce_desc, input_tensor, output_tensor, &indices_bytes));
|
||||
auto indices_rocm = GetScratchBuffer<uint32_t>(indices_bytes, stream);
|
||||
|
||||
// need to allocate a separate buffer for ArgMin/ArgMax comparsion output
|
||||
// need to allocate a separate buffer for ArgMin/ArgMax comparison output
|
||||
auto output_count = output_shape.Size();
|
||||
|
||||
if (ReduceTensorIndices == MIOPEN_REDUCE_TENSOR_NO_INDICES) {
|
||||
|
|
|
@ -60,7 +60,7 @@ void* ROCMExternalAllocator::Alloc(size_t size) {
|
|||
if (size > 0) {
|
||||
p = alloc_(size);
|
||||
|
||||
// review(codemzs): ORT_ENFORCE does not seem appropiate.
|
||||
// review(codemzs): ORT_ENFORCE does not seem appropriate.
|
||||
ORT_ENFORCE(p != nullptr);
|
||||
}
|
||||
|
||||
|
|
|
@ -140,7 +140,7 @@ Status RocmStream::CleanUpOnRunEnd() {
|
|||
}
|
||||
|
||||
void* RocmStream::GetResource(int version, int id) const {
|
||||
ORT_ENFORCE(version <= ORT_ROCM_RESOUCE_VERSION, "resource version unsupported!");
|
||||
ORT_ENFORCE(version <= ORT_ROCM_RESOURCE_VERSION, "resource version unsupported!");
|
||||
void* resource{};
|
||||
switch (id) {
|
||||
case RocmResource::hip_stream_t:
|
||||
|
|
|
@ -329,7 +329,7 @@ common::Status WebNNExecutionProvider::Compile(const std::vector<FusedNodeAndGra
|
|||
node_compute_funcs.push_back(compute_info);
|
||||
}
|
||||
|
||||
// Explictly release the WebNN builder to free memory.
|
||||
// Explicitly release the WebNN builder to free memory.
|
||||
wnn_builder_ = emscripten::val::undefined();
|
||||
|
||||
return Status::OK();
|
||||
|
|
|
@ -29,7 +29,7 @@ common::Status IOBinding::BindInput(const std::string& name, const OrtValue& ml_
|
|||
// It may copy the data instead of copying the pointer.
|
||||
// When OrtValue is empty, the pointer is copied. When it is not
|
||||
// (if feeds_[index] is not for example),
|
||||
// CopyOneInputAcrossDevices has a different behaviour.
|
||||
// CopyOneInputAcrossDevices has a different behavior.
|
||||
ORT_RETURN_IF_ERROR(utils::CopyOneInputAcrossDevices(session_state_, name, ml_value, new_mlvalue));
|
||||
add_or_replace(new_mlvalue);
|
||||
} else {
|
||||
|
|
|
@ -2827,7 +2827,7 @@ std::pair<common::Status, const InputDefList*> InferenceSession::GetOverridableI
|
|||
}
|
||||
}
|
||||
|
||||
// returns a list of initializers that can be overriden.
|
||||
// returns a list of initializers that can be overridden.
|
||||
return std::make_pair(common::Status::OK(), &model_->MainGraph().GetOverridableInitializers());
|
||||
}
|
||||
|
||||
|
|
|
@ -386,7 +386,7 @@ class InferenceSession {
|
|||
* @param run_options run options.
|
||||
* @param mutable_feeds inputs owned by client code and will be released as long as the feeds be set in session states.
|
||||
* Then the feeds will purely managed in the session states.
|
||||
* @param fetches outputs produced after the executin of this function.
|
||||
* @param fetches outputs produced after the execution of this function.
|
||||
* @param state State of the graph needed to resume partial graph run.
|
||||
* @param feeds_fetches_manager Contains feed/fetches name to internal indices mapping and information for device
|
||||
* copy/checks.
|
||||
|
|
|
@ -552,7 +552,7 @@ struct BlockedQuantizeLinear<float, TOut, 2> {
|
|||
std::ptrdiff_t N, const std::ptrdiff_t quant_block_size,
|
||||
const std::ptrdiff_t thread_block_size, bool saturate) {
|
||||
ORT_UNUSED_PARAMETER(saturate);
|
||||
// to avoid a byte being writen from mutiple threads, use 2 * N as thread block
|
||||
// to avoid a byte being written from mutiple threads, use 2 * N as thread block
|
||||
ORT_UNUSED_PARAMETER(thread_block_size);
|
||||
constexpr auto low = static_cast<int32_t>(TOut::min_val);
|
||||
constexpr auto high = static_cast<int32_t>(TOut::max_val);
|
||||
|
@ -637,7 +637,7 @@ struct BlockedQuantizeLinear<float, TOut, 2> {
|
|||
ORT_UNUSED_PARAMETER(saturate);
|
||||
constexpr auto low = static_cast<int32_t>(TOut::min_val);
|
||||
constexpr auto high = static_cast<int32_t>(TOut::max_val);
|
||||
// to avoid a byte being writen from mutiple threads, use 2 * K as thread block
|
||||
// to avoid a byte being written from mutiple threads, use 2 * K as thread block
|
||||
auto size_thread_block = 2 * K;
|
||||
auto quant_block_num_K = (K + quant_block_size - 1) / quant_block_size;
|
||||
auto num_thread_block = (M + 1) / 2;
|
||||
|
@ -697,7 +697,7 @@ struct BlockedQuantizeLinear<MLFloat16, TOut, 2> {
|
|||
std::ptrdiff_t N, const std::ptrdiff_t quant_block_size,
|
||||
const std::ptrdiff_t thread_block_size, bool saturate) {
|
||||
ORT_UNUSED_PARAMETER(saturate);
|
||||
// to avoid a byte being writen from mutiple threads, use 2 * N as thread block
|
||||
// to avoid a byte being written from mutiple threads, use 2 * N as thread block
|
||||
ORT_UNUSED_PARAMETER(thread_block_size);
|
||||
constexpr auto low = static_cast<int32_t>(TOut::min_val);
|
||||
constexpr auto high = static_cast<int32_t>(TOut::max_val);
|
||||
|
@ -786,7 +786,7 @@ struct BlockedQuantizeLinear<MLFloat16, TOut, 2> {
|
|||
ORT_UNUSED_PARAMETER(saturate);
|
||||
constexpr auto low = static_cast<int32_t>(TOut::min_val);
|
||||
constexpr auto high = static_cast<int32_t>(TOut::max_val);
|
||||
// to avoid a byte being writen from mutiple threads, use 2 * K as thread block
|
||||
// to avoid a byte being written from mutiple threads, use 2 * K as thread block
|
||||
auto size_thread_block = 2 * K;
|
||||
auto quant_block_num_K = (K + quant_block_size - 1) / quant_block_size;
|
||||
auto num_thread_block = (M + 1) / 2;
|
||||
|
|
|
@ -15,7 +15,7 @@ void addGlobalSchemaFunctions(pybind11::module& m) {
|
|||
"get_all_operator_schema", []() -> const std::vector<ONNX_NAMESPACE::OpSchema> {
|
||||
return ONNX_NAMESPACE::OpSchemaRegistry::get_all_schemas_with_history();
|
||||
},
|
||||
"Return a vector of OpSchema all registed operators");
|
||||
"Return a vector of OpSchema all registered operators");
|
||||
m.def(
|
||||
"get_all_opkernel_def", []() -> const std::vector<onnxruntime::KernelDef> {
|
||||
std::vector<onnxruntime::KernelDef> result;
|
||||
|
|
|
@ -41,7 +41,7 @@ struct MakeDType {
|
|||
|
||||
/// <summary>
|
||||
/// The function creates a numpy array that points to
|
||||
/// data stored within the corresponing tensor. Parent object
|
||||
/// data stored within the corresponding tensor. Parent object
|
||||
/// holds a reference to the object that owns the data so it
|
||||
/// does not disappear.
|
||||
/// </summary>
|
||||
|
@ -396,7 +396,7 @@ void addSparseTensorMethods(pybind11::module& m) {
|
|||
})
|
||||
// pybind apparently has a bug with returning enums from def_property_readonly or methods
|
||||
// returning a method object instead of the enumeration value
|
||||
// so we are using def_property and throw on a potential modificaiton
|
||||
// so we are using def_property and throw on a potential modification
|
||||
.def_property(
|
||||
"format", [](const PySparseTensor* py_tensor) -> OrtSparseFormat {
|
||||
const SparseTensor& tensor = py_tensor->Instance();
|
||||
|
|
|
@ -152,7 +152,7 @@ void AsyncCallback(void* user_data, OrtValue** outputs, size_t num_outputs, OrtS
|
|||
} else {
|
||||
// acquire GIL to safely:
|
||||
// 1) invoke python callback
|
||||
// 2) create, manipulate, and destory python objects
|
||||
// 2) create, manipulate, and destroy python objects
|
||||
py::gil_scoped_acquire acquire;
|
||||
invoke_callback();
|
||||
}
|
||||
|
@ -946,7 +946,7 @@ std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance(
|
|||
provider_options_map);
|
||||
|
||||
// This variable is never initialized because the APIs by which it should be initialized are deprecated,
|
||||
// however they still exist are are in-use. Neverthless, it is used to return CUDAAllocator,
|
||||
// however they still exist are are in-use. Nevertheless, it is used to return CUDAAllocator,
|
||||
// hence we must try to initialize it here if we can since FromProviderOptions might contain
|
||||
// external CUDA allocator.
|
||||
external_allocator_info = info.external_allocator_info;
|
||||
|
@ -973,14 +973,17 @@ std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance(
|
|||
const ROCMExecutionProviderInfo info = GetRocmExecutionProviderInfo(rocm_provider_info,
|
||||
provider_options_map);
|
||||
|
||||
// This variable is never initialized because the APIs by which is it should be initialized are deprecated, however they still
|
||||
// exist are are in-use. Neverthless, it is used to return ROCMAllocator, hence we must try to initialize it here if we can
|
||||
// since FromProviderOptions might contain external ROCM allocator.
|
||||
// This variable is never initialized because the APIs by which is it should be initialized are deprecated,
|
||||
// however they still exist and are in-use. Nevertheless, it is used to return ROCMAllocator, hence we must
|
||||
// try to initialize it here if we can since FromProviderOptions might contain external ROCM allocator.
|
||||
external_allocator_info = info.external_allocator_info;
|
||||
return rocm_provider_info->CreateExecutionProviderFactory(info)->CreateProvider();
|
||||
} else {
|
||||
if (!Env::Default().GetEnvironmentVar("ROCM_PATH").empty()) {
|
||||
ORT_THROW("ROCM_PATH is set but ROCM wasn't able to be loaded. Please install the correct version of ROCM and MIOpen as mentioned in the GPU requirements page, make sure they're in the PATH, and that your GPU is supported.");
|
||||
ORT_THROW(
|
||||
"ROCM_PATH is set but ROCM wasn't able to be loaded. Please install the correct version "
|
||||
"of ROCM and MIOpen as mentioned in the GPU requirements page, make sure they're in the PATH, "
|
||||
"and that your GPU is supported.");
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -1389,7 +1392,8 @@ void addGlobalMethods(py::module& m) {
|
|||
LogDeprecationWarning("set_openvino_device", "OpenVINO execution provider option \"device_type\"");
|
||||
openvino_device_type = device_type;
|
||||
},
|
||||
"Set the prefered OpenVINO device type to be used. If left unset, the device type selected during build time will be used.");
|
||||
"Set the preferred OpenVINO device type to be used. If left unset, "
|
||||
"the device type selected during build time will be used.");
|
||||
// TODO remove deprecated global config
|
||||
m.def(
|
||||
"get_openvino_device", []() -> std::string {
|
||||
|
|
|
@ -812,7 +812,7 @@ class HistogramCollector(CalibrationDataCollector):
|
|||
hist_edges = hist_edges.astype(data_arr_np.dtype)
|
||||
assert (
|
||||
data_arr_np.dtype != np.float64
|
||||
), "only float32 or float16 is supported, every constant must be explicetly typed"
|
||||
), "only float32 or float16 is supported, every constant must be explicitly typed"
|
||||
self.histogram_dict[tensor] = (hist, hist_edges, min_value, max_value)
|
||||
else:
|
||||
old_histogram = self.histogram_dict[tensor]
|
||||
|
@ -834,7 +834,7 @@ class HistogramCollector(CalibrationDataCollector):
|
|||
hist[: len(old_hist)] += old_hist
|
||||
assert (
|
||||
data_arr_np.dtype != np.float64
|
||||
), "only float32 or float16 is supported, every constant must be explicetly typed"
|
||||
), "only float32 or float16 is supported, every constant must be explicitly typed"
|
||||
self.histogram_dict[tensor] = (hist, hist_edges, min(old_min, min_value), max(old_max, max_value))
|
||||
|
||||
def collect_value(self, name_to_arr):
|
||||
|
|
|
@ -13,7 +13,7 @@ class Direct8BitOp(QuantOperatorBase):
|
|||
node = self.node
|
||||
|
||||
if not self.quantizer.force_quantize_no_input_check:
|
||||
# Keep backward compatiblity
|
||||
# Keep backward compatibility
|
||||
# Quantize when input[0] is quantized already. Otherwise keep it.
|
||||
quantized_input_value = self.quantizer.find_quantized_value(node.input[0])
|
||||
if quantized_input_value is None:
|
||||
|
|
|
@ -357,7 +357,7 @@ def quantize_data(
|
|||
- when data `type == int8`, from `[-m , m]` -> :math:`[-(2^{b-1}-1), 2^{b-1}-1]` where
|
||||
`m = max(abs(rmin), abs(rmax))`
|
||||
|
||||
and add necessary intermediate nodes to trasnform quantized weight to full weight using the equation
|
||||
and add necessary intermediate nodes to transform quantized weight to full weight using the equation
|
||||
|
||||
:math:`r = S(q-z)`, where
|
||||
|
||||
|
|
|
@ -29,7 +29,7 @@ Models not in the list may only be partially optimized or not optimized at all.
|
|||
- **hidden_size**: (*default: 768*)
|
||||
BERT-base and BERT-large has 768 and 1024 hidden nodes respectively.
|
||||
- **input_int32**: (*optional*)
|
||||
Exported model ususally uses int64 tensor as input. If this flag is specified, int32 tensors will be used as input, and it could avoid un-necessary Cast nodes and get better performance.
|
||||
Exported model usually uses int64 tensor as input. If this flag is specified, int32 tensors will be used as input, and it could avoid un-necessary Cast nodes and get better performance.
|
||||
- **float16**: (*optional*)
|
||||
By default, model uses float32 in computation. If this flag is specified, half-precision float will be used. This option is recommended for NVidia GPU with Tensor Core like V100 and T4. For older GPUs, float32 is likely faster.
|
||||
- **use_gpu**: (*optional*)
|
||||
|
|
|
@ -930,7 +930,7 @@ def main():
|
|||
|
||||
if len(results) == 0:
|
||||
if args.batch_sizes != [0]:
|
||||
logger.warning("No any result avaiable.")
|
||||
logger.warning("No any result available.")
|
||||
return
|
||||
|
||||
csv_filename = args.detail_csv or f"benchmark_detail_{time_stamp}.csv"
|
||||
|
|
Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше
Загрузка…
Ссылка в новой задаче