Update ruff and clang-format versions (#21479)

ruff -> 0.5.4
clang-format -> 18

Parent: eb9b377306
Commit: c203d89958
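Illustrative only, not part of this commit: a minimal sketch of checking that the pinned formatter versions are the ones in use and re-running them over a checkout. The commands assume both tools are installed on PATH; the repository's actual lint configuration is not shown in this diff.

# Hypothetical local helper, not from this commit. Targets and paths are assumptions.
import subprocess

def run(cmd: list[str]) -> str:
    return subprocess.run(cmd, check=True, capture_output=True, text=True).stdout

assert "0.5.4" in run(["ruff", "--version"])               # ruff -> 0.5.4
assert "version 18" in run(["clang-format", "--version"])  # clang-format -> 18

subprocess.run(["ruff", "check", "--fix", "."], check=True)  # apply lint autofixes
subprocess.run(["ruff", "format", "."], check=True)          # reformat Python sources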
@@ -73,7 +73,7 @@ def add_github_dep(name, parsed_url):
        return
    # Make a REST call to convert to tag to a git commit
    url = f"https://api.github.com/repos/{org_name}/{repo_name}/git/refs/tags/{tag}"
-   print("requesting %s ..." % url)
+   print(f"requesting {url} ...")
    res = requests.get(url, auth=(args.username, args.token))
    response_json = res.json()
    tag_object = response_json["object"]

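Illustrative only, not from the diff: the two print calls above produce identical output; ruff's pyupgrade-derived rule UP031 flags the printf-style form, and the newer code uses an f-string instead.

# Both lines print the same text; the URL value here is just an example.
url = "https://api.github.com/repos/org/repo/git/refs/tags/v1.0"
print("requesting %s ..." % url)   # printf-style formatting (flagged by UP031)
print(f"requesting {url} ...")     # f-string form used after this change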
@@ -19,7 +19,7 @@ def check_all_delegates_have_unmanaged_function_pointer_attribute(file: pathlib.
    line_num = 0
    with open(str(file.resolve(strict=True))) as f:
        prev_line = ""
-       for line in f.readlines():
+       for line in f:
            line_num += 1

            # strip so it's easier to deal with commented out lines.

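Illustrative only, not from the diff: iterating the file object directly yields the same lines as readlines(), but streams them one at a time instead of materializing the whole file as a list first.

# Self-contained sketch with a hypothetical scratch file.
from pathlib import Path

path = Path("example.txt")
path.write_text("a\nb\nc\n")

with path.open() as f:
    eager = f.readlines()            # whole file loaded into a list at once

with path.open() as f:
    lazy = [line for line in f]      # the file object itself yields lines lazily

assert eager == lazy == ["a\n", "b\n", "c\n"]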
@@ -17,13 +17,13 @@ namespace onnxruntime {

class NotImplementedException : public std::logic_error {
 public:
-  explicit NotImplementedException(const char* _Message = "Function not yet implemented") noexcept : std::logic_error(_Message){};
-  explicit NotImplementedException(const std::string& _Message = "Function not yet implemented") noexcept : std::logic_error(_Message){};
+  explicit NotImplementedException(const char* _Message = "Function not yet implemented") noexcept : std::logic_error(_Message) {};
+  explicit NotImplementedException(const std::string& _Message = "Function not yet implemented") noexcept : std::logic_error(_Message) {};
};

class TypeMismatchException : public std::logic_error {
 public:
-  TypeMismatchException() noexcept : logic_error("Type mismatch"){};
+  TypeMismatchException() noexcept : logic_error("Type mismatch") {};
};

class OnnxRuntimeException : public std::exception {

@@ -32,7 +32,7 @@ class Stream {
    return {};
  };
  // block the host thread until all the tasks in the stream finished.
-  virtual void Flush(){};
+  virtual void Flush() {};
  // The framework may reuse the stream instance for multiple iterations.
  // This is the API that provide a chance to let the device stream cleanup
  // resource at the end of a iteration.

@@ -76,6 +76,6 @@ class Barrier {
// Multiple threads can wait on the same Notification object,
// but only one caller must call Notify() on the object.
struct Notification : Barrier {
-  Notification() : Barrier(1){};
+  Notification() : Barrier(1) {};
};
}  // namespace onnxruntime

@@ -219,18 +219,18 @@ class ThreadPoolProfiler {
    WAIT_REVOKE,
    MAX_EVENT
  };
-  ThreadPoolProfiler(int, const CHAR_TYPE*){};
+  ThreadPoolProfiler(int, const CHAR_TYPE*) {};
  ~ThreadPoolProfiler() = default;
  ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(ThreadPoolProfiler);
-  void Start(){};
+  void Start() {};
  std::string Stop() { return "not available for minimal build"; }
-  void LogStart(){};
-  void LogEnd(ThreadPoolEvent){};
-  void LogEndAndStart(ThreadPoolEvent){};
-  void LogStartAndCoreAndBlock(std::ptrdiff_t){};
-  void LogCoreAndBlock(std::ptrdiff_t){};
-  void LogThreadId(int){};
-  void LogRun(int){};
+  void LogStart() {};
+  void LogEnd(ThreadPoolEvent) {};
+  void LogEndAndStart(ThreadPoolEvent) {};
+  void LogStartAndCoreAndBlock(std::ptrdiff_t) {};
+  void LogCoreAndBlock(std::ptrdiff_t) {};
+  void LogThreadId(int) {};
+  void LogRun(int) {};
  std::string DumpChildThreadStat() { return {}; }
};
#else

@@ -6,5 +6,5 @@
// CustomOpContext defines an interface allowing a custom op to access ep-specific resources.
struct CustomOpContext {
  CustomOpContext() = default;
-  virtual ~CustomOpContext(){};
+  virtual ~CustomOpContext() {};
};

@@ -24,9 +24,9 @@ namespace Experimental {

struct Session : Ort::Session {
  Session(Env& env, std::basic_string<ORTCHAR_T>& model_path, SessionOptions& options)
-      : Ort::Session(env, model_path.data(), options){};
+      : Ort::Session(env, model_path.data(), options) {};
  Session(Env& env, void* model_data, size_t model_data_length, SessionOptions& options)
-      : Ort::Session(env, model_data, model_data_length, options){};
+      : Ort::Session(env, model_data, model_data_length, options) {};

  // overloaded Run() with sensible defaults
  std::vector<Ort::Value> Run(const std::vector<std::string>& input_names,

@@ -52,7 +52,7 @@ struct Session : Ort::Session {

struct Value : Ort::Value {
  Value(OrtValue* p)
-      : Ort::Value(p){};
+      : Ort::Value(p) {};

  template <typename T>
  static Ort::Value CreateTensor(T* p_data, size_t p_data_element_count, const std::vector<int64_t>& shape);

@@ -2175,8 +2175,8 @@ struct Op : detail::Base<OrtOp> {
/// </summary>
struct ShapeInferContext {
  struct SymbolicInteger {
-    SymbolicInteger(int64_t i) : i_(i), is_int_(true){};
-    SymbolicInteger(const char* s) : s_(s), is_int_(false){};
+    SymbolicInteger(int64_t i) : i_(i), is_int_(true) {};
+    SymbolicInteger(const char* s) : s_(s), is_int_(false) {};
    SymbolicInteger(const SymbolicInteger&) = default;
    SymbolicInteger(SymbolicInteger&&) = default;

@@ -29,7 +29,7 @@ class ArgBase {
  ArgBase(OrtKernelContext* ctx,
          size_t indice,
          bool is_input) : ctx_(ctx), indice_(indice), is_input_(is_input) {}
-  virtual ~ArgBase(){};
+  virtual ~ArgBase() {};

 protected:
  struct KernelContext ctx_;

@@ -267,83 +267,83 @@ Status RegisterQuantizationKernels(KernelRegistry& kernel_registry) {

Status RegisterCpuContribKernels(KernelRegistry& kernel_registry) {
  static const BuildKernelCreateInfoFn function_table[] = {
      BuildKernelCreateInfo<void>,  // default entry to avoid the list become empty after ops-reducing
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, SampleOp)>,

      // add more kernels here
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, GridSample)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, Attention)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, BeamSearch)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, WhisperBeamSearch)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, EmbedLayerNormalization)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, ExpandDims)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, FusedConv)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, FusedGemm)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, GreedySearch)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, MultiHeadAttention)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, GroupQueryAttention)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, SparseAttention)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, RotaryEmbedding)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, Sampling)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, AttnLSTM)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, string, Tokenizer)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, Range)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, WordConvEmbedding)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, GatherND)>,
#if !defined(DISABLE_SPARSE_TENSORS)
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, SparseToDenseMatMul)>,
#endif
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, MurmurHash3)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, TransposeMatMul)>,  // backward compatibility
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, FusedMatMul)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, MatMulNBits)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, MatMulBnb4)>,
#ifndef ORT_MINIMAL_BUILD
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, MatMulFpQ4)>,
#endif
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, MaxpoolWithMask)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, Pad)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, Unique)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, ConvTransposeWithDynamicPads)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, CropAndResize)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, CDist)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, double, CDist)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, BiasGelu)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, Gelu)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, FastGelu)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, NGramRepeatBlock)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, BifurcationDetector)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, QuickGelu)>,
      // These ops were experimental ops in onnx domain which have been removed now. We add them here as
      // contrib ops to main backward compatibility
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, Affine)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, Crop)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, DynamicSlice)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, ImageScaler)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, 8, MeanVarianceNormalization)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, ParametricSoftplus)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, ScaledTanh)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, 9, ThresholdedRelu)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, Scale)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, 16, float, LayerNormalization)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, 16, double, LayerNormalization)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, float, SimplifiedLayerNormalization)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, double, SimplifiedLayerNormalization)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, SkipLayerNormalization)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, double, SkipLayerNormalization)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, SkipSimplifiedLayerNormalization)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, double, SkipSimplifiedLayerNormalization)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, Inverse)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, Trilu)>,

#ifdef ENABLE_ATEN
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kPytorchAtenDomain, 1, ATen)>,
#endif

#ifdef ENABLE_TRAINING_OPS
      // Should remove the shrunken_gather include from ENABLE_TRAINING_OPS once 1). compute optimizer is enabled for inference or
      // 2). this is needed by inference for other purpose.
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, ShrunkenGather)>,
#endif
  };

@@ -173,7 +173,7 @@ void CropAndResizeForward(const TensorShape& output_shape,
            }
          }
        }  // for pw
      }  // for ph
    },
    0);  // for n
}

@@ -17,7 +17,7 @@ struct Alibi {
  const int max_seqlen_k, max_seqlen_q;

  __forceinline__ __device__ Alibi(const float alibi_slope, const int max_seqlen_k, const int max_seqlen_q)
-      : alibi_slope(alibi_slope), max_seqlen_k(max_seqlen_k), max_seqlen_q(max_seqlen_q){};
+      : alibi_slope(alibi_slope), max_seqlen_k(max_seqlen_k), max_seqlen_q(max_seqlen_q) {};

  template <typename Engine, typename Layout>
  __forceinline__ __device__ void apply_alibi(Tensor<Engine, Layout>& tensor,

@@ -116,7 +116,7 @@ struct Mask {
  __forceinline__ __device__ Mask(const int max_seqlen_k, const int max_seqlen_q,
                                  const int window_size_left, const int window_size_right,
                                  const float alibi_slope = 0.f)
-      : max_seqlen_k(max_seqlen_k), max_seqlen_q(max_seqlen_q), window_size_left(window_size_left), window_size_right(window_size_right), alibi_slope(!Has_alibi ? 0.0 : alibi_slope){};
+      : max_seqlen_k(max_seqlen_k), max_seqlen_q(max_seqlen_q), window_size_left(window_size_left), window_size_right(window_size_right), alibi_slope(!Has_alibi ? 0.0 : alibi_slope) {};

  // Causal_mask: whether this particular iteration needs causal masking
  template <bool Causal_mask = false, bool Is_even_MN = true, typename Engine, typename Layout>

@@ -121,7 +121,7 @@ struct Softmax {
  using TensorT = decltype(make_tensor<float>(Shape<Int<kNRows>>{}));
  TensorT row_max, row_sum;

-  __forceinline__ __device__ Softmax(){};
+  __forceinline__ __device__ Softmax() {};

  template <bool Is_first, bool Check_inf = false, typename Tensor0, typename Tensor1>
  __forceinline__ __device__ void softmax_rescale_o(Tensor0& acc_s, Tensor1& acc_o, float softmax_scale_log2) {

@@ -231,206 +231,206 @@ KernelCreateInfo BuildKernelCreateInfo<void>() {

Status RegisterCudaContribKernels(KernelRegistry& kernel_registry) {
  static const BuildKernelCreateInfoFn function_table[] = {
      BuildKernelCreateInfo<void>,  // default entry to avoid the list become empty after ops-reducing
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, GridSample)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, FastGelu)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, FastGelu)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, Gelu)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, double, Gelu)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, Gelu)>,
      BuildKernelCreateInfo<CUDA_MS_OP_CLASS_NAME(1, BiasGelu)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, BiasSplitGelu)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, BiasSplitGelu)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, BiasAdd)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, BiasAdd)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, QuickGelu)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, double, QuickGelu)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, QuickGelu)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, TransposeMatMul)>,  // backward compatibility
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, double, TransposeMatMul)>,  // backward compatibility
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, TransposeMatMul)>,  // backward compatibility
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, FusedMatMul)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, double, FusedMatMul)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, FusedMatMul)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, RelativePositionBias)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, RelativePositionBias)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, GatedRelativePositionBias)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, GatedRelativePositionBias)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, RemovePadding)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, RemovePadding)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, RestorePadding)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, RestorePadding)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, Rfft)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, double, Rfft)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, Rfft)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, Irfft)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, double, Irfft)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, Irfft)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, ComplexMul)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, ComplexMul)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, ComplexMulConj)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, ComplexMulConj)>,
      BuildKernelCreateInfo<CUDA_MS_OP_CLASS_NAME(1, NGramRepeatBlock)>,

      // These ops were experimental ops in onnx domain which have been removed now. We add them here as
      // contrib ops to maintain backward compatibility
      BuildKernelCreateInfo<CUDA_ONNX_OP_TYPED_CLASS_NAME(1, float, Affine)>,
      BuildKernelCreateInfo<CUDA_ONNX_OP_TYPED_CLASS_NAME(1, double, Affine)>,
      BuildKernelCreateInfo<CUDA_ONNX_OP_TYPED_CLASS_NAME(1, MLFloat16, Affine)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, Attention)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, Attention)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, PackedAttention)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, PackedAttention)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, PackedMultiHeadAttention)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, PackedMultiHeadAttention)>,
      BuildKernelCreateInfo<CUDA_MS_OP_CLASS_NAME(1, BeamSearch)>,
      BuildKernelCreateInfo<CUDA_MS_OP_CLASS_NAME(1, WhisperBeamSearch)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, ConvTransposeWithDynamicPads)>,
      BuildKernelCreateInfo<CUDA_ONNX_OP_TYPED_CLASS_NAME(1, float, Crop)>,
      BuildKernelCreateInfo<CUDA_ONNX_OP_TYPED_CLASS_NAME(1, double, Crop)>,
      BuildKernelCreateInfo<CUDA_ONNX_OP_TYPED_CLASS_NAME(1, MLFloat16, Crop)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, MoE)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, MoE)>,
      BuildKernelCreateInfo<CUDA_MS_OP_CLASS_NAME(1, QMoE)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, MultiHeadAttention)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, MultiHeadAttention)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, GroupQueryAttention)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, BFloat16, GroupQueryAttention)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, DecoderAttention)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, DecoderAttention)>,
      BuildKernelCreateInfo<CUDA_ONNX_OP_TYPED_CLASS_NAME(1, int32_t, DynamicSlice)>,
      BuildKernelCreateInfo<CUDA_ONNX_OP_TYPED_CLASS_NAME(1, int64_t, DynamicSlice)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, EmbedLayerNormalization)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, EmbedLayerNormalization)>,
      BuildKernelCreateInfo<CUDA_MS_OP_CLASS_NAME(1, GreedySearch)>,
      BuildKernelCreateInfo<CUDA_MS_OP_CLASS_NAME(1, GroupNorm)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, NhwcConv)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, NhwcConv)>,
      BuildKernelCreateInfo<CUDA_ONNX_OP_TYPED_CLASS_NAME(1, float, ImageScaler)>,
      BuildKernelCreateInfo<CUDA_ONNX_OP_TYPED_CLASS_NAME(1, double, ImageScaler)>,
      BuildKernelCreateInfo<CUDA_ONNX_OP_TYPED_CLASS_NAME(1, MLFloat16, ImageScaler)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, LongformerAttention)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, LongformerAttention)>,
      BuildKernelCreateInfo<CUDA_ONNX_OP_TYPED_CLASS_NAME(1, float, ParametricSoftplus)>,
      BuildKernelCreateInfo<CUDA_ONNX_OP_TYPED_CLASS_NAME(1, double, ParametricSoftplus)>,
      BuildKernelCreateInfo<CUDA_ONNX_OP_TYPED_CLASS_NAME(1, MLFloat16, ParametricSoftplus)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, RotaryEmbedding)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, RotaryEmbedding)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, BFloat16, RotaryEmbedding)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, GemmaRotaryEmbedding)>,
      BuildKernelCreateInfo<CUDA_MS_OP_CLASS_NAME(1, Sampling)>,
      BuildKernelCreateInfo<CUDA_ONNX_OP_TYPED_CLASS_NAME(1, float, ScaledTanh)>,
      BuildKernelCreateInfo<CUDA_ONNX_OP_TYPED_CLASS_NAME(1, double, ScaledTanh)>,
      BuildKernelCreateInfo<CUDA_ONNX_OP_TYPED_CLASS_NAME(1, MLFloat16, ScaledTanh)>,
      BuildKernelCreateInfo<CUDA_MS_OP_CLASS_NAME(1, SkipGroupNorm)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, SkipLayerNormalization)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, SkipLayerNormalization)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, SkipSimplifiedLayerNormalization)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, SkipSimplifiedLayerNormalization)>,
      BuildKernelCreateInfo<CUDA_ONNX_OP_TYPED_CLASS_NAME(1, float, ThresholdedRelu)>,
      BuildKernelCreateInfo<CUDA_ONNX_OP_TYPED_CLASS_NAME(1, double, ThresholdedRelu)>,
      BuildKernelCreateInfo<CUDA_ONNX_OP_TYPED_CLASS_NAME(1, MLFloat16, ThresholdedRelu)>,
      BuildKernelCreateInfo<CUDA_ONNX_OP_VERSIONED_TYPED_CLASS_NAME(1, 16, float_float_float, LayerNormalization)>,
      BuildKernelCreateInfo<CUDA_ONNX_OP_VERSIONED_TYPED_CLASS_NAME(1, 16, double_double_double, LayerNormalization)>,
      BuildKernelCreateInfo<CUDA_ONNX_OP_VERSIONED_TYPED_CLASS_NAME(1, 16, MLFloat16_float_MLFloat16, LayerNormalization)>,
      BuildKernelCreateInfo<CUDA_ONNX_OP_VERSIONED_TYPED_CLASS_NAME(1, 16, float_float_MLFloat16, LayerNormalization)>,
      BuildKernelCreateInfo<CUDA_ONNX_OP_VERSIONED_TYPED_CLASS_NAME(1, 16, MLFloat16_float_float, LayerNormalization)>,
      BuildKernelCreateInfo<CUDA_ONNX_OP_VERSIONED_TYPED_CLASS_NAME(1, 16, BFloat16_float_BFloat16, LayerNormalization)>,
      BuildKernelCreateInfo<CUDA_ONNX_OP_TYPED_CLASS_NAME(1, float_float_float, SimplifiedLayerNormalization)>,
      BuildKernelCreateInfo<CUDA_ONNX_OP_TYPED_CLASS_NAME(1, double_double_double, SimplifiedLayerNormalization)>,
      BuildKernelCreateInfo<CUDA_ONNX_OP_TYPED_CLASS_NAME(1, MLFloat16_float_MLFloat16, SimplifiedLayerNormalization)>,
      BuildKernelCreateInfo<CUDA_ONNX_OP_TYPED_CLASS_NAME(1, float_float_MLFloat16, SimplifiedLayerNormalization)>,
      BuildKernelCreateInfo<CUDA_ONNX_OP_TYPED_CLASS_NAME(1, MLFloat16_float_float, SimplifiedLayerNormalization)>,
      BuildKernelCreateInfo<CUDA_ONNX_OP_TYPED_CLASS_NAME(1, BFloat16_float_BFloat16, SimplifiedLayerNormalization)>,
      BuildKernelCreateInfo<CUDA_MS_OP_CLASS_NAME(1, Inverse)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, MatMulNBits)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, MatMulNBits)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, BFloat16, MatMulBnb4)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, MatMulBnb4)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, MatMulBnb4)>,
      BuildKernelCreateInfo<CUDA_MS_OP_CLASS_NAME(1, BiasSoftmax)>,
      BuildKernelCreateInfo<CUDA_MS_OP_CLASS_NAME(1, BiasDropout)>,
      BuildKernelCreateInfo<CUDA_MS_OP_CLASS_NAME(1, BitmaskDropout)>,
      BuildKernelCreateInfo<CUDA_MS_OP_CLASS_NAME(1, BitmaskBiasDropout)>,

      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, int8_t_MLFloat16, QuantizeLinear)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, uint8_t_MLFloat16, QuantizeLinear)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, int8_t_MLFloat16, DequantizeLinear)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, uint8_t_MLFloat16, DequantizeLinear)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float_int8_t, QAttention)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16_int8_t, QAttention)>,
      BuildKernelCreateInfo<CUDA_MS_OP_CLASS_NAME(1, UnfoldTensor)>,
      BuildKernelCreateInfo<CUDA_MS_OP_CLASS_NAME(1, DynamicTimeWarping)>,
      BuildKernelCreateInfo<CUDA_MS_OP_CLASS_NAME(1, Trilu)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, BFloat16, FastGelu)>,
      // TransposedMatMul is still here for backward compatibility
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, BFloat16, TransposeMatMul)>,  // backward compatibility
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, BFloat16, FusedMatMul)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, FusedConv)>,
      BuildKernelCreateInfo<CUDA_MS_OP_CLASS_NAME(1, QOrderedMatMul)>,
      BuildKernelCreateInfo<CUDA_MS_OP_CLASS_NAME(1, QOrderedLayerNormalization)>,
      BuildKernelCreateInfo<CUDA_MS_OP_CLASS_NAME(1, QOrderedGelu)>,
      BuildKernelCreateInfo<CUDA_MS_OP_CLASS_NAME(1, QuantizeWithOrder)>,
      BuildKernelCreateInfo<CUDA_MS_OP_CLASS_NAME(1, DequantizeWithOrder)>,
      BuildKernelCreateInfo<CUDA_MS_OP_CLASS_NAME(1, QOrderedAttention)>,
      BuildKernelCreateInfo<CUDA_MS_OP_CLASS_NAME(1, QOrderedLongformerAttention)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, DecoderMaskedSelfAttention)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, DecoderMaskedSelfAttention)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, DecoderMaskedMultiHeadAttention)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, DecoderMaskedMultiHeadAttention)>,
      BuildKernelCreateInfo<CUDA_MS_OP_CLASS_NAME(1, GemmFloat8)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, SparseAttention)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, BFloat16, SparseAttention)>,

#ifdef ENABLE_ATEN
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kPytorchAtenDomain, 1, ATen)>,
#endif

#ifdef ENABLE_TRAINING_OPS
      // Should remove the shrunken_gather include from ENABLE_TRAINING_OPS once
      // 1). compute optimizer is enabled for inference or
      // 2). this is needed by inference for other purpose.
      BuildKernelCreateInfo<CUDA_MS_OP_CLASS_NAME(1, ShrunkenGather)>,
#endif

#if defined(ORT_USE_NCCL)
      BuildKernelCreateInfo<CUDA_MS_OP_CLASS_NAME(1, AllReduce)>,
      BuildKernelCreateInfo<CUDA_MS_OP_CLASS_NAME(1, AllGather)>,
      BuildKernelCreateInfo<CUDA_MS_OP_CLASS_NAME(1, AllToAll)>,

      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, ShardedMoE)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, ShardedMoE)>,

      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, DistributedMatMul)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, DistributedMatMul)>,

      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, DistributedSlice)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, DistributedSlice)>,

      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, int64_t, DistributedReshape)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, DistributedReshape)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, DistributedReshape)>,

      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, int64_t, DistributedExpand)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, DistributedExpand)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, DistributedExpand)>,

      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, DistributedReduceSum)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, DistributedReduceSum)>,

      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, DistributedReduceMax)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, DistributedReduceMax)>,

      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, DistributedReduceMean)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, DistributedReduceMean)>,

      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, int64_t, DistributedUnsqueeze)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, DistributedUnsqueeze)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, DistributedUnsqueeze)>,

      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, int64_t, DistributedSqueeze)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float, DistributedSqueeze)>,
      BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, DistributedSqueeze)>,
#endif
  };

@@ -20,7 +20,7 @@ class ExLibLoader {
  virtual ~ExLibLoader();

 protected:
-  virtual void PreUnloadLibrary(void* /*handle*/){};
+  virtual void PreUnloadLibrary(void* /*handle*/) {};

  std::map<std::string, void*> dso_name_data_map_;

@@ -2665,10 +2665,10 @@ ONNX_MS_OPERATOR_SET_SCHEMA(CropAndResize, 1,

#if !defined(DISABLE_FLOAT8_TYPES)
#define GEMM_FLOAT8_TYPES \
-  { "tensor(float8e4m3fn)", "tensor(float8e5m2)", "tensor(float16)", "tensor(bfloat16)", "tensor(float)" }
+  {"tensor(float8e4m3fn)", "tensor(float8e5m2)", "tensor(float16)", "tensor(bfloat16)", "tensor(float)"}
#else
#define GEMM_FLOAT8_TYPES \
-  { "tensor(float16)", "tensor(bfloat16)", "tensor(float)" }
+  {"tensor(float16)", "tensor(bfloat16)", "tensor(float)"}
#endif

ONNX_MS_OPERATOR_SET_SCHEMA(GemmFloat8, 1,

@ -86,7 +86,7 @@ class TensorRef {
|
|||
/// <returns>Flattened tensor data in bytes</returns>
|
||||
virtual std::vector<uint8_t> Data() const = 0;
|
||||
|
||||
virtual ~TensorRef(){};
|
||||
virtual ~TensorRef() {};
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
|
@ -131,7 +131,7 @@ class ValueInfoRef {
|
|||
/// <param name="axes">Indices of dimensions to add. Indices are relative to final shape.</param>
|
||||
virtual void UnsqueezeDims(const std::vector<int64_t>& axes) = 0;
|
||||
|
||||
virtual ~ValueInfoRef(){};
|
||||
virtual ~ValueInfoRef() {};
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
|
@ -248,7 +248,7 @@ class NodeRef {
|
|||
/// <returns>Id</returns>
|
||||
virtual int64_t Id() const = 0;
|
||||
|
||||
virtual ~NodeRef(){};
|
||||
virtual ~NodeRef() {};
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
|
@ -449,7 +449,7 @@ class GraphRef {
|
|||
/// <returns>True if output of the Graph.</returns>
|
||||
virtual bool IsGraphOutput(std::string_view name) const = 0;
|
||||
|
||||
virtual ~GraphRef(){};
|
||||
virtual ~GraphRef() {};
|
||||
};
|
||||
|
||||
} // namespace api
|
||||
|
|
|
@ -228,11 +228,9 @@ inline std::basic_string<PATH_CHAR_TYPE> GetLastComponent(const std::basic_strin
|
|||
typename std::basic_string<PATH_CHAR_TYPE>::size_type pos = input.length();
|
||||
PATH_CHAR_TYPE sep = GetPathSep<PATH_CHAR_TYPE>();
|
||||
// remove trailing backslash
|
||||
for (; pos > 1 && input[pos - 1] == sep; --pos)
|
||||
;
|
||||
for (; pos > 1 && input[pos - 1] == sep; --pos);
|
||||
input.resize(pos);
|
||||
for (; pos != 0 && input[pos - 1] != sep; --pos)
|
||||
;
|
||||
for (; pos != 0 && input[pos - 1] != sep; --pos);
|
||||
return input.substr(pos);
|
||||
}
|
||||
|
||||
|
|
|
@ -502,7 +502,7 @@ Status GetMLMultiArrayCopyInfo(const MLMultiArray* _Nonnull array,
|
|||
class Execution {
|
||||
public:
|
||||
Execution(const std::string& path, const logging::Logger& logger, uint32_t coreml_flags);
|
||||
~Execution(){};
|
||||
~Execution() {};
|
||||
|
||||
Status LoadModel();
|
||||
Status Predict(const std::unordered_map<std::string, OnnxTensorData>& inputs,
|
||||
|
|
The diff for this file is not shown because it is too large.
|
@ -41,16 +41,16 @@ TreeEnsembleClassifier<T>::TreeEnsembleClassifier(const OpKernelInfo& info) : Op
|
|||
|
||||
template <typename T>
|
||||
Status TreeEnsembleClassifier<T>::GetRemovableAttributes(InlinedVector<std::string>& removable_attributes) const {
|
||||
InlinedVector<std::string> names {
|
||||
"base_values", "nodes_falsenodeids", "nodes_featureids", "nodes_hitrates",
|
||||
"nodes_missing_value_tracks_true", "nodes_modes", "nodes_nodeids", "nodes_treeids",
|
||||
"nodes_truenodeids", "nodes_values", "class_ids", "class_treeids", "class_nodeids",
|
||||
"class_weights", "classlabels_strings",
|
||||
"classlabels_int64s"
|
||||
InlinedVector<std::string> names{
|
||||
"base_values", "nodes_falsenodeids", "nodes_featureids", "nodes_hitrates",
|
||||
"nodes_missing_value_tracks_true", "nodes_modes", "nodes_nodeids", "nodes_treeids",
|
||||
"nodes_truenodeids", "nodes_values", "class_ids", "class_treeids", "class_nodeids",
|
||||
"class_weights", "classlabels_strings",
|
||||
"classlabels_int64s"
|
||||
#if !defined(ORT_MINIMAL_BUILD)
|
||||
"base_values_as_tensor",
|
||||
"nodes_hitrates_as_tensor", "nodes_values_as_tensor",
|
||||
"class_weights_as_tensor"
|
||||
"base_values_as_tensor",
|
||||
"nodes_hitrates_as_tensor", "nodes_values_as_tensor",
|
||||
"class_weights_as_tensor"
|
||||
#endif
|
||||
};
|
||||
removable_attributes.swap(names);
|
||||
|
|
|
@ -48,16 +48,16 @@ TreeEnsembleRegressor<T>::TreeEnsembleRegressor(const OpKernelInfo& info) : OpKe
|
|||
|
||||
template <typename T>
|
||||
Status TreeEnsembleRegressor<T>::GetRemovableAttributes(InlinedVector<std::string>& removable_attributes) const {
|
||||
InlinedVector<std::string> names {
|
||||
"base_values", "nodes_falsenodeids", "nodes_featureids", "nodes_hitrates",
|
||||
"nodes_missing_value_tracks_true", "nodes_modes", "nodes_nodeids", "nodes_treeids",
|
||||
"nodes_truenodeids", "nodes_values",
|
||||
"target_ids", "target_treeids", "target_nodeids",
|
||||
"target_weights"
|
||||
InlinedVector<std::string> names{
|
||||
"base_values", "nodes_falsenodeids", "nodes_featureids", "nodes_hitrates",
|
||||
"nodes_missing_value_tracks_true", "nodes_modes", "nodes_nodeids", "nodes_treeids",
|
||||
"nodes_truenodeids", "nodes_values",
|
||||
"target_ids", "target_treeids", "target_nodeids",
|
||||
"target_weights"
|
||||
#if !defined(ORT_MINIMAL_BUILD)
|
||||
"base_values_as_tensor",
|
||||
"nodes_hitrates_as_tensor", "nodes_values_as_tensor",
|
||||
"class_weights_as_tensor"
|
||||
"base_values_as_tensor",
|
||||
"nodes_hitrates_as_tensor", "nodes_values_as_tensor",
|
||||
"class_weights_as_tensor"
|
||||
#endif
|
||||
};
|
||||
removable_attributes.swap(names);
|
||||
|
|
|
@ -195,8 +195,8 @@ Status NonMaxSuppression::Compute(OpKernelContext* ctx) const {
|
|||
}
|
||||
sorted_boxes.pop();
|
||||
} // while
|
||||
} // for class_index
|
||||
} // for batch_index
|
||||
} // for class_index
|
||||
} // for batch_index
|
||||
|
||||
constexpr auto last_dim = 3;
|
||||
const auto num_selected = selected_indices.size();
|
||||
|
|
|
@ -251,9 +251,9 @@ void RoiAlignForward(const TensorShape& output_shape, const T* bottom_data, floa
|
|||
|
||||
top_data[index] = output_val;
|
||||
} // for pw
|
||||
} // for ph
|
||||
} // for c
|
||||
} // for n
|
||||
} // for ph
|
||||
} // for c
|
||||
} // for n
|
||||
});
|
||||
}
|
||||
} // namespace
|
||||
|
|
|
@ -128,7 +128,7 @@ Status Expand<T>::Compute(OpKernelContext* context) const {
|
|||
memcpy(output_data + output_offset, input_data + input_offset, onnxruntime::narrow<size_t>(copy_byte));
|
||||
output_offsets[onnxruntime::narrow<size_t>(i)] = output_offset;
|
||||
} // for i
|
||||
}; // distribute_fn
|
||||
}; // distribute_fn
|
||||
|
||||
auto per_thread_tasks =
|
||||
distribute_count / concurrency::ThreadPool::DegreeOfParallelism(context->GetOperatorThreadPool());
|
||||
|
@ -169,9 +169,9 @@ Status Expand<T>::Compute(OpKernelContext* context) const {
|
|||
copy_byte >>= 1;
|
||||
}
|
||||
} // while
|
||||
} // if
|
||||
} // for
|
||||
}; // copy_fn
|
||||
} // if
|
||||
} // for
|
||||
}; // copy_fn
|
||||
if (per_thread_tasks > 20) {
|
||||
concurrency::ThreadPool::TryParallelFor(
|
||||
context->GetOperatorThreadPool(),
|
||||
|
@ -181,7 +181,7 @@ Status Expand<T>::Compute(OpKernelContext* context) const {
|
|||
} else {
|
||||
copy_fn(0, onnxruntime::narrow<std::ptrdiff_t>(distribute_count));
|
||||
} // else
|
||||
} // for
|
||||
} // for
|
||||
return Status::OK();
|
||||
} // Expand::compute
|
||||
|
||||
|
|
The diff for this file is not shown because it is too large.
|
@ -18,7 +18,7 @@ constexpr CudaGraphAnnotation_t kCudaGraphAnnotationSkip = -1;
|
|||
constexpr CudaGraphAnnotation_t kCudaGraphAnnotationDefault = 0;
|
||||
|
||||
struct CudaGraphSet {
|
||||
CudaGraphSet(){};
|
||||
CudaGraphSet() {};
|
||||
~CudaGraphSet();
|
||||
|
||||
void Clear();
|
||||
|
@ -31,7 +31,7 @@ struct CudaGraphSet {
|
|||
};
|
||||
|
||||
struct CUDAGraphManager {
|
||||
CUDAGraphManager(){};
|
||||
CUDAGraphManager() {};
|
||||
CUDAGraphManager(cudaStream_t stream);
|
||||
~CUDAGraphManager();
|
||||
|
||||
|
|
|
@ -33,7 +33,7 @@ class CudaProfiler final : public EpProfiler {
|
|||
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(CudaProfiler);
|
||||
~CudaProfiler() {}
|
||||
bool StartProfiling(TimePoint) override { return true; }
|
||||
void EndProfiling(TimePoint, Events&) override{};
|
||||
void EndProfiling(TimePoint, Events&) override {};
|
||||
void Start(uint64_t) override{};
|
||||
void Stop(uint64_t) override{};
|
||||
};
|
||||
|
|
|
@ -18,7 +18,7 @@ namespace cuda {
|
|||
template <typename T, bool NHWC>
|
||||
class ConvTranspose : public CudaKernel {
|
||||
public:
|
||||
ConvTranspose(const OpKernelInfo& info) : CudaKernel(info), conv_transpose_attrs_(info){};
|
||||
ConvTranspose(const OpKernelInfo& info) : CudaKernel(info), conv_transpose_attrs_(info) {};
|
||||
Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
|
||||
bool& is_packed, [[maybe_unused]] PrePackedWeights* prepacked_weights) override;
|
||||
Status ComputeInternal(OpKernelContext* context) const override;
|
||||
|
|
|
@ -45,7 +45,7 @@ enum class Color : uint32_t {
|
|||
class RangeCreatorBase {
|
||||
public:
|
||||
RangeCreatorBase(const std::string message, const Color color)
|
||||
: message_(message), color_(color), is_begin_called_(false), is_end_called_(false){};
|
||||
: message_(message), color_(color), is_begin_called_(false), is_end_called_(false) {};
|
||||
|
||||
// Check if Begin and End are both called.
|
||||
// It's pointless if not all of them are called.
|
||||
|
@ -100,7 +100,7 @@ class RangeCreatorBase {
|
|||
class NvtxRangeCreator final : public RangeCreatorBase {
|
||||
public:
|
||||
NvtxRangeCreator(const std::string message, const Color color)
|
||||
: RangeCreatorBase(message, color){};
|
||||
: RangeCreatorBase(message, color) {};
|
||||
|
||||
void BeginImpl() override;
|
||||
void EndImpl() override;
|
||||
|
@ -114,7 +114,7 @@ class NvtxRangeCreator final : public RangeCreatorBase {
|
|||
class NvtxNestedRangeCreator final : public RangeCreatorBase {
|
||||
public:
|
||||
NvtxNestedRangeCreator(const std::string message, const Color color)
|
||||
: RangeCreatorBase(message, color){};
|
||||
: RangeCreatorBase(message, color) {};
|
||||
|
||||
void BeginImpl() override;
|
||||
void EndImpl() override;
|
||||
|
@ -123,7 +123,7 @@ class NvtxNestedRangeCreator final : public RangeCreatorBase {
|
|||
class NvtxMarkerCreator final {
|
||||
public:
|
||||
NvtxMarkerCreator(const std::string message, const Color color)
|
||||
: message_(message), color_(color){};
|
||||
: message_(message), color_(color) {};
|
||||
void Mark();
|
||||
|
||||
private:
|
||||
|
|
|
@ -35,7 +35,7 @@ enum class BroadcastIndexType : int32_t {
|
|||
template <typename T>
|
||||
class IConstantBuffer {
|
||||
public:
|
||||
virtual ~IConstantBuffer(){};
|
||||
virtual ~IConstantBuffer() {};
|
||||
virtual const T* GetBuffer(cudaStream_t stream, size_t count) = 0;
|
||||
};
|
||||
|
||||
|
|
|
@ -13,23 +13,23 @@ const std::vector<MLDataType>& CastOpTypeConstraints() {
|
|||
// Must be done as a local static for a shared provider, to avoid the prefast warning:
|
||||
// Global initializer calls a non-constexpr function 'onnxruntime::DataTypeImpl::GetTensorType<onnxruntime::MLFloat16>'
|
||||
// In a shared provider, GetTensorType is a function call into Onnxruntime and isn't constexpr
|
||||
static std::vector<MLDataType> types {
|
||||
DataTypeImpl::GetTensorType<MLFloat16>(),
|
||||
DataTypeImpl::GetTensorType<BFloat16>(),
|
||||
DataTypeImpl::GetTensorType<float>(),
|
||||
DataTypeImpl::GetTensorType<double>(),
|
||||
DataTypeImpl::GetTensorType<int8_t>(),
|
||||
DataTypeImpl::GetTensorType<int16_t>(),
|
||||
DataTypeImpl::GetTensorType<int32_t>(),
|
||||
DataTypeImpl::GetTensorType<int64_t>(),
|
||||
DataTypeImpl::GetTensorType<uint8_t>(),
|
||||
DataTypeImpl::GetTensorType<uint16_t>(),
|
||||
DataTypeImpl::GetTensorType<uint32_t>(),
|
||||
DataTypeImpl::GetTensorType<uint64_t>(),
|
||||
DataTypeImpl::GetTensorType<bool>()
|
||||
static std::vector<MLDataType> types{
|
||||
DataTypeImpl::GetTensorType<MLFloat16>(),
|
||||
DataTypeImpl::GetTensorType<BFloat16>(),
|
||||
DataTypeImpl::GetTensorType<float>(),
|
||||
DataTypeImpl::GetTensorType<double>(),
|
||||
DataTypeImpl::GetTensorType<int8_t>(),
|
||||
DataTypeImpl::GetTensorType<int16_t>(),
|
||||
DataTypeImpl::GetTensorType<int32_t>(),
|
||||
DataTypeImpl::GetTensorType<int64_t>(),
|
||||
DataTypeImpl::GetTensorType<uint8_t>(),
|
||||
DataTypeImpl::GetTensorType<uint16_t>(),
|
||||
DataTypeImpl::GetTensorType<uint32_t>(),
|
||||
DataTypeImpl::GetTensorType<uint64_t>(),
|
||||
DataTypeImpl::GetTensorType<bool>()
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
,
|
||||
DataTypeImpl::GetTensorType<Float8E4M3FN>(), DataTypeImpl::GetTensorType<Float8E5M2>()
|
||||
,
|
||||
DataTypeImpl::GetTensorType<Float8E4M3FN>(), DataTypeImpl::GetTensorType<Float8E5M2>()
|
||||
#endif
|
||||
};
|
||||
return types;
|
||||
|
|
|
@ -42,7 +42,7 @@ enum ORT_DataType : int {
|
|||
*/
|
||||
class DnnlNodeCapability {
|
||||
public:
|
||||
virtual ~DnnlNodeCapability(){};
|
||||
virtual ~DnnlNodeCapability() {};
|
||||
/**
|
||||
* virtual function expected to be implemented for different node
|
||||
* types.
|
||||
|
|
|
@ -18,7 +18,7 @@ class DnnlNode;
|
|||
class DnnlNodeArg {
|
||||
public:
|
||||
DnnlNodeArg(DnnlNode* node, size_t index, bool is_output)
|
||||
: node_(node), index_(index), is_output_(is_output){};
|
||||
: node_(node), index_(index), is_output_(is_output) {};
|
||||
DnnlNodeArg() = default;
|
||||
DnnlNode* GetNode() { return node_; };
|
||||
size_t GetIndex() { return index_; };
|
||||
|
|
|
@ -15,7 +15,7 @@ class JsCPUAllocator : public CPUAllocator {
|
|||
: CPUAllocator(
|
||||
OrtMemoryInfo("JsCPUAllocator", OrtAllocatorType::OrtDeviceAllocator,
|
||||
OrtDevice(OrtDevice::CPU, OrtDevice::MemType::DEFAULT, 0),
|
||||
0, OrtMemTypeCPU)){};
|
||||
0, OrtMemTypeCPU)) {};
|
||||
};
|
||||
|
||||
class JsCustomAllocator : public IAllocator {
|
||||
|
|
|
@ -11,8 +11,8 @@ namespace js {
|
|||
|
||||
class DataTransfer : public IDataTransfer {
|
||||
public:
|
||||
DataTransfer(){};
|
||||
~DataTransfer(){};
|
||||
DataTransfer() {};
|
||||
~DataTransfer() {};
|
||||
|
||||
bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override;
|
||||
|
||||
|
|
|
@ -125,7 +125,7 @@ Status ExpandOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
|
|||
default:
|
||||
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported.");
|
||||
} // switch
|
||||
} // if-else
|
||||
} // if-else
|
||||
|
||||
const std::string& output_name = node_unit.Outputs()[0].node_arg.Name();
|
||||
std::string shape_input_name(input_name + "_" + output_name);
|
||||
|
|
|
@ -163,7 +163,7 @@ Status ProcessConstantValue(QnnModelWrapper& qnn_model_wrapper,
|
|||
default:
|
||||
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported.");
|
||||
} // switch
|
||||
} // if-else
|
||||
} // if-else
|
||||
|
||||
QnnParamWrapper constant_value_param(node_unit.Index(),
|
||||
node_unit.Name(),
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
#include "core/providers/qnn/builder/qnn_model_wrapper.h"
|
||||
|
||||
#define ALIGN_PTR_UP(ptr, align, type) \
|
||||
reinterpret_cast<type>((reinterpret_cast<std::uintptr_t>(ptr) + (align)-1) & ~((align)-1))
|
||||
reinterpret_cast<type>((reinterpret_cast<std::uintptr_t>(ptr) + (align) - 1) & ~((align) - 1))
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace qnn {
|
||||
|
|
|
@ -34,7 +34,7 @@ class RocmProfiler final : public EpProfiler {
|
|||
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(RocmProfiler);
|
||||
~RocmProfiler() {}
|
||||
bool StartProfiling(TimePoint) override { return true; }
|
||||
void EndProfiling(TimePoint, Events&) override{};
|
||||
void EndProfiling(TimePoint, Events&) override {};
|
||||
void Start(uint64_t) override{};
|
||||
void Stop(uint64_t) override{};
|
||||
};
|
||||
|
|
|
@ -24,10 +24,10 @@ struct Provider {
|
|||
virtual ProviderOptions GetProviderOptions(const void* /*provider options struct*/) { return {}; }
|
||||
|
||||
// Update provider options from key-value string configuration
|
||||
virtual void UpdateProviderOptions(void* /*provider options to be configured*/, const ProviderOptions& /*key-value string provider options*/){};
|
||||
virtual void UpdateProviderOptions(void* /*provider options to be configured*/, const ProviderOptions& /*key-value string provider options*/) {};
|
||||
|
||||
// Get provider specific custom op domain list. Provider has the resposibility to release OrtCustomOpDomain instances it creates.
|
||||
virtual void GetCustomOpDomainList(IExecutionProviderFactory* /*pointer to factory instance*/, std::vector<OrtCustomOpDomain*>& /*provider custom op domain list*/){};
|
||||
virtual void GetCustomOpDomainList(IExecutionProviderFactory* /*pointer to factory instance*/, std::vector<OrtCustomOpDomain*>& /*provider custom op domain list*/) {};
|
||||
|
||||
virtual void Initialize() = 0; // Called right after loading the shared library, if this throws any errors Shutdown() will be called and the library unloaded
|
||||
virtual void Shutdown() = 0; // Called right before unloading the shared library
|
||||
|
|
|
@ -24,8 +24,8 @@ struct TensorRTCustomKernel {
|
|||
: compute_stream_(compute_stream) {
|
||||
}
|
||||
|
||||
void Compute(OrtKernelContext* /*context*/){
|
||||
// The implementation is in TensorRT plugin. No need to implement it here.
|
||||
void Compute(OrtKernelContext* /*context*/) {
|
||||
// The implementation is in TensorRT plugin. No need to implement it here.
|
||||
};
|
||||
|
||||
private:
|
||||
|
|
|
@ -46,7 +46,7 @@ struct VitisAI_Provider : Provider {
|
|||
}
|
||||
};
|
||||
// Get provider specific custom op domain list. Provider has the resposibility to release OrtCustomOpDomain instances it creates.
|
||||
void GetCustomOpDomainList(IExecutionProviderFactory*, std::vector<OrtCustomOpDomain*>&) override{};
|
||||
void GetCustomOpDomainList(IExecutionProviderFactory*, std::vector<OrtCustomOpDomain*>&) override {};
|
||||
// Called right after loading the shared library, if this throws any errors Shutdown() will be called and the library unloaded
|
||||
void Initialize() override { initialize_vitisai_ep(); }
|
||||
// Called right before unloading the shared library
|
||||
|
|
|
@ -47,7 +47,7 @@ namespace npu {
|
|||
std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs, \
|
||||
const NodeUnit& node_unit) override { \
|
||||
LOGS_DEFAULT(INFO) << "Creating " << #onnx_op_type << " Op"; \
|
||||
auto op = graph_ep->GetGraph() -> CreateOperation<tim::vx::ops::vsinpu_op_kind>(); \
|
||||
auto op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::vsinpu_op_kind>(); \
|
||||
(*op).BindInputs(inputs).BindOutputs(outputs); \
|
||||
return true; \
|
||||
; \
|
||||
|
|
|
@ -60,10 +60,9 @@ using createIOpBuildItemFunc = std::function<std::unique_ptr<IOpBuilder>()>;
|
|||
using OpBuildItemType = std::map<std::string, std::unique_ptr<IOpBuilder>>;
|
||||
|
||||
static const std::map<std::string, createIOpBuildItemFunc> reg = {
|
||||
#define REGISTER_OP_BUILDER(ONNX_NODE_TYPE, BUILDER_TYPE) \
|
||||
{ \
|
||||
ONNX_NODE_TYPE, [] { return std::make_unique<BUILDER_TYPE>(); } \
|
||||
}
|
||||
#define REGISTER_OP_BUILDER(ONNX_NODE_TYPE, BUILDER_TYPE) \
|
||||
{ \
|
||||
ONNX_NODE_TYPE, [] { return std::make_unique<BUILDER_TYPE>(); }}
|
||||
|
||||
REGISTER_OP_BUILDER("Add", AddOpBuilder),
|
||||
REGISTER_OP_BUILDER("Sub", SubOpBuilder),
|
||||
|
|
|
@ -155,11 +155,7 @@ void addIoBindingMethods(pybind11::module& m) {
|
|||
.def("clear_binding_outputs", [](SessionIOBinding* io_binding) -> void {
|
||||
io_binding->Get()->ClearOutputs();
|
||||
})
|
||||
.def(
|
||||
"get_outputs", [](const SessionIOBinding* io_binding) -> const std::vector<OrtValue>& {
|
||||
return io_binding->Get()->GetOutputs();
|
||||
},
|
||||
py::return_value_policy::reference_internal)
|
||||
.def("get_outputs", [](const SessionIOBinding* io_binding) -> const std::vector<OrtValue>& { return io_binding->Get()->GetOutputs(); }, py::return_value_policy::reference_internal)
|
||||
.def("copy_outputs_to_cpu", [](const SessionIOBinding* io_binding) -> py::list {
|
||||
const std::vector<OrtValue>& outputs = io_binding->Get()->GetOutputs();
|
||||
|
||||
|
@ -180,8 +176,7 @@ void addIoBindingMethods(pybind11::module& m) {
|
|||
}
|
||||
++pos;
|
||||
}
|
||||
return result;
|
||||
});
|
||||
return result; });
|
||||
}
|
||||
|
||||
} // namespace python
|
||||
|
|
|
@ -226,7 +226,7 @@ void addOrtValueMethods(pybind11::module& m) {
|
|||
|
||||
ORT_THROW("Only OrtValues that are Tensors/SparseTensors are currently supported");
|
||||
#else
|
||||
ORT_THROW("Only OrtValues that are Tensors are supported in this build");
|
||||
ORT_THROW("Only OrtValues that are Tensors are supported in this build");
|
||||
#endif
|
||||
})
|
||||
.def("shape", [](const OrtValue* ort_value) -> py::list {
|
||||
|
@ -275,26 +275,15 @@ void addOrtValueMethods(pybind11::module& m) {
|
|||
|
||||
return *ONNX_NAMESPACE::Utils::DataTypeUtils::ToType(*type_proto);
|
||||
})
|
||||
.def(
|
||||
"element_type", [](const OrtValue* ort_value) -> int32_t {
|
||||
return GetTensorProtoType(*ort_value);
|
||||
},
|
||||
"Returns an integer equal to the ONNX tensor proto type of the tensor or sequence. "
|
||||
"This integer is one type defined by ONNX TensorProto_DataType "
|
||||
"(such as onnx.TensorProto.FLOAT)."
|
||||
"Raises an exception in any other case.")
|
||||
.def("has_value", [](const OrtValue* ort_value) -> bool {
|
||||
return ort_value->IsAllocated();
|
||||
})
|
||||
.def("is_tensor", [](const OrtValue* ort_value) -> bool {
|
||||
return ort_value->IsTensor();
|
||||
})
|
||||
.def("is_sparse_tensor", [](const OrtValue* ort_value) -> bool {
|
||||
return ort_value->IsSparseTensor();
|
||||
})
|
||||
.def("is_tensor_sequence", [](const OrtValue* ort_value) -> bool {
|
||||
return ort_value->IsTensorSequence();
|
||||
})
|
||||
.def("element_type", [](const OrtValue* ort_value) -> int32_t { return GetTensorProtoType(*ort_value); },
|
||||
"Returns an integer equal to the ONNX tensor proto type of the tensor or sequence. "
|
||||
"This integer is one type defined by ONNX TensorProto_DataType "
|
||||
"(such as onnx.TensorProto.FLOAT)."
|
||||
"Raises an exception in any other case.")
|
||||
.def("has_value", [](const OrtValue* ort_value) -> bool { return ort_value->IsAllocated(); })
|
||||
.def("is_tensor", [](const OrtValue* ort_value) -> bool { return ort_value->IsTensor(); })
|
||||
.def("is_sparse_tensor", [](const OrtValue* ort_value) -> bool { return ort_value->IsSparseTensor(); })
|
||||
.def("is_tensor_sequence", [](const OrtValue* ort_value) -> bool { return ort_value->IsTensorSequence(); })
|
||||
// Converts Tensor into a numpy array
|
||||
.def("numpy", [](const OrtValue* ml_value) -> py::object {
|
||||
ORT_ENFORCE(ml_value->IsTensor(), "Only OrtValues that are Tensors are convertible to Numpy objects");
|
||||
|
@ -310,37 +299,22 @@ void addOrtValueMethods(pybind11::module& m) {
|
|||
#else
|
||||
py::object obj = GetPyObjFromTensor(*ml_value, nullptr, nullptr);
|
||||
#endif
|
||||
return obj;
|
||||
})
|
||||
return obj; })
|
||||
#ifdef ENABLE_TRAINING
|
||||
.def(
|
||||
"to_dlpack", [](OrtValue* ort_value) -> py::object {
|
||||
return py::reinterpret_steal<py::object>(ToDlpack(*ort_value));
|
||||
},
|
||||
"Returns a DLPack representing the tensor. This method does not copy the pointer shape, "
|
||||
"instead, it copies the pointer value. The OrtValue must be persist until the dlpack structure "
|
||||
"is consumed.")
|
||||
.def_static(
|
||||
"from_dlpack", [](py::object data, bool is_bool_tensor) {
|
||||
return FromDlpack(data.ptr(), is_bool_tensor);
|
||||
},
|
||||
py::arg("data"), py::arg("is_bool_tensor") = false, "Converts a tensor from a external library into an OrtValue by means of the __dlpack__ protocol.")
|
||||
.def(
|
||||
"__dlpack__", [](OrtValue* ort_value, py::object /* stream */) -> py::object {
|
||||
return py::reinterpret_steal<py::object>(ToDlpack(*ort_value));
|
||||
},
|
||||
py::arg("stream") = py::none(),
|
||||
"Returns a DLPack representing the tensor (part of __dlpack__ protocol). "
|
||||
"This method does not copy the pointer shape, instead, it copies the pointer value. "
|
||||
"The OrtValue must persist until the dlpack structure is consumed.")
|
||||
.def(
|
||||
"__dlpack_device__", [](const OrtValue* ort_value) -> py::tuple {
|
||||
.def("to_dlpack", [](OrtValue* ort_value) -> py::object { return py::reinterpret_steal<py::object>(ToDlpack(*ort_value)); },
|
||||
"Returns a DLPack representing the tensor. This method does not copy the pointer shape, "
|
||||
"instead, it copies the pointer value. The OrtValue must be persist until the dlpack structure "
|
||||
"is consumed.")
|
||||
.def_static("from_dlpack", [](py::object data, bool is_bool_tensor) { return FromDlpack(data.ptr(), is_bool_tensor); }, py::arg("data"), py::arg("is_bool_tensor") = false, "Converts a tensor from a external library into an OrtValue by means of the __dlpack__ protocol.")
|
||||
.def("__dlpack__", [](OrtValue* ort_value, py::object /* stream */) -> py::object { return py::reinterpret_steal<py::object>(ToDlpack(*ort_value)); }, py::arg("stream") = py::none(),
|
||||
"Returns a DLPack representing the tensor (part of __dlpack__ protocol). "
|
||||
"This method does not copy the pointer shape, instead, it copies the pointer value. "
|
||||
"The OrtValue must persist until the dlpack structure is consumed.")
|
||||
.def("__dlpack_device__", [](const OrtValue* ort_value) -> py::tuple {
|
||||
ORT_ENFORCE(ort_value->IsTensor(), "Only tensor type OrtValues are supported");
|
||||
const onnxruntime::Tensor& tensor = ort_value->Get<Tensor>();
|
||||
DLDevice device = onnxruntime::dlpack::GetDlpackDevice(*ort_value, tensor.Location().device.Id());
|
||||
return py::make_tuple(static_cast<int>(device.device_type), device.device_id);
|
||||
},
|
||||
"Returns a tuple of integers, (device, device index) (part of __dlpack__ protocol).")
|
||||
return py::make_tuple(static_cast<int>(device.device_type), device.device_id); }, "Returns a tuple of integers, (device, device index) (part of __dlpack__ protocol).")
|
||||
#endif
|
||||
;
|
||||
|
||||
|
@ -350,13 +324,8 @@ void addOrtValueMethods(pybind11::module& m) {
|
|||
v->push_back(ortvalue);
|
||||
})
|
||||
#ifdef ENABLE_TRAINING
|
||||
.def(
|
||||
"push_back", [](std::vector<OrtValue>* v, py::object dlpack_tensor, const bool is_bool_tensor) {
|
||||
v->push_back(FromDlpack(dlpack_tensor.ptr(), is_bool_tensor));
|
||||
},
|
||||
"Add a new OrtValue after being ownership was transferred from the DLPack structure.", py::arg("dlpack_tensor"), py::arg("is_bool_tensor") = false)
|
||||
.def(
|
||||
"push_back_batch", [](std::vector<OrtValue>* v, std::vector<py::object>& torch_tensors, std::vector<int64_t>& data_ptrs, std::vector<py::object>& element_types, const std::vector<std::vector<int64_t>>& shapes, const std::vector<OrtDevice>& devices) {
|
||||
.def("push_back", [](std::vector<OrtValue>* v, py::object dlpack_tensor, const bool is_bool_tensor) { v->push_back(FromDlpack(dlpack_tensor.ptr(), is_bool_tensor)); }, "Add a new OrtValue after being ownership was transferred from the DLPack structure.", py::arg("dlpack_tensor"), py::arg("is_bool_tensor") = false)
|
||||
.def("push_back_batch", [](std::vector<OrtValue>* v, std::vector<py::object>& torch_tensors, std::vector<int64_t>& data_ptrs, std::vector<py::object>& element_types, const std::vector<std::vector<int64_t>>& shapes, const std::vector<OrtDevice>& devices) {
|
||||
for (size_t i = 0; i < torch_tensors.size(); ++i) {
|
||||
py::object& element_type = element_types.at(i);
|
||||
const std::vector<int64_t>& shape = shapes.at(i);
|
||||
|
@ -377,52 +346,36 @@ void addOrtValueMethods(pybind11::module& m) {
|
|||
OrtValue ml_value;
|
||||
Tensor::InitOrtValue(ml_type, gsl::make_span(shape), reinterpret_cast<void*>(data_ptr), info, ml_value);
|
||||
v->push_back(ml_value);
|
||||
}
|
||||
},
|
||||
"Add a batch of OrtValue's by wrapping PyTorch tensors.")
|
||||
} }, "Add a batch of OrtValue's by wrapping PyTorch tensors.")
|
||||
#endif
|
||||
.def("reserve", [](std::vector<OrtValue>* v, const size_t len) { v->reserve(len); })
|
||||
.def("shrink_to_fit", [](std::vector<OrtValue>* v) { v->shrink_to_fit(); })
|
||||
.def("__len__", [](const std::vector<OrtValue>& v) { return v.size(); })
|
||||
.def(
|
||||
"__iter__", [](const std::vector<OrtValue>& v) {
|
||||
return py::make_iterator(v.cbegin(), v.cend());
|
||||
},
|
||||
py::keep_alive<0, 1>())
|
||||
.def("__getitem__", [](const std::vector<OrtValue>& v, const size_t idx) {
|
||||
return v.at(idx);
|
||||
})
|
||||
.def(
|
||||
"bool_tensor_indices", [](std::vector<OrtValue>* v) -> std::vector<int64_t> {
|
||||
.def("__iter__", [](const std::vector<OrtValue>& v) { return py::make_iterator(v.cbegin(), v.cend()); }, py::keep_alive<0, 1>())
|
||||
.def("__getitem__", [](const std::vector<OrtValue>& v, const size_t idx) { return v.at(idx); })
|
||||
.def("bool_tensor_indices", [](std::vector<OrtValue>* v) -> std::vector<int64_t> {
|
||||
std::vector<int64_t> indices;
|
||||
for (size_t i = 0; i < v->size(); ++i) {
|
||||
if (GetTensorProtoType((*v)[i]) == ONNX_NAMESPACE::TensorProto_DataType_BOOL) {
|
||||
indices.push_back(static_cast<int64_t>(i));
|
||||
}
|
||||
}
|
||||
return indices;
|
||||
},
|
||||
"Returns the indices of every boolean tensor in this vector of OrtValue. "
|
||||
"In case of a boolean tensor, method to_dlpacks returns a uint8 tensor instead of a boolean tensor. "
|
||||
"If torch consumes the dlpack structure, `.to(torch.bool)` must be applied to the torch tensor "
|
||||
"to get a boolean tensor.")
|
||||
return indices; },
|
||||
"Returns the indices of every boolean tensor in this vector of OrtValue. "
|
||||
"In case of a boolean tensor, method to_dlpacks returns a uint8 tensor instead of a boolean tensor. "
|
||||
"If torch consumes the dlpack structure, `.to(torch.bool)` must be applied to the torch tensor "
|
||||
"to get a boolean tensor.")
|
||||
#ifdef ENABLE_TRAINING
|
||||
.def("dlpack_at", [](std::vector<OrtValue>* v, const size_t idx) {
|
||||
return py::reinterpret_steal<py::object>(ToDlpack(v->at(idx)));
|
||||
})
|
||||
.def("dlpack_at", [](std::vector<OrtValue>* v, const size_t idx) { return py::reinterpret_steal<py::object>(ToDlpack(v->at(idx))); })
|
||||
#endif
|
||||
.def(
|
||||
"element_type_at", [](std::vector<OrtValue>* v, const size_t idx) -> int32_t {
|
||||
return GetTensorProtoType(v->at(idx));
|
||||
},
|
||||
"Returns an integer equal to the ONNX proto type of the tensor at position i. "
|
||||
"This integer is one type defined by ONNX TensorProto_DataType "
|
||||
"(such as onnx.TensorProto.FLOAT)."
|
||||
"Raises an exception in any other case.",
|
||||
py::arg("idx"))
|
||||
.def("element_type_at", [](std::vector<OrtValue>* v, const size_t idx) -> int32_t { return GetTensorProtoType(v->at(idx)); },
|
||||
"Returns an integer equal to the ONNX proto type of the tensor at position i. "
|
||||
"This integer is one type defined by ONNX TensorProto_DataType "
|
||||
"(such as onnx.TensorProto.FLOAT)."
|
||||
"Raises an exception in any other case.",
|
||||
py::arg("idx"))
|
||||
#ifdef ENABLE_TRAINING
|
||||
.def(
|
||||
"to_dlpacks", [](const std::vector<OrtValue>& v, py::object to_tensor) -> py::list {
|
||||
.def("to_dlpacks", [](const std::vector<OrtValue>& v, py::object to_tensor) -> py::list {
|
||||
if (v.size() == 0)
|
||||
return py::list();
|
||||
|
||||
|
@ -469,9 +422,8 @@ void addOrtValueMethods(pybind11::module& m) {
|
|||
Py_DECREF(capsule);
|
||||
}
|
||||
}
|
||||
return list_dlpacks;
|
||||
},
|
||||
R"pbdoc(Converts all OrtValue into tensors through DLPack protocol, the method creates
|
||||
return list_dlpacks; },
|
||||
R"pbdoc(Converts all OrtValue into tensors through DLPack protocol, the method creates
|
||||
a DLPack structure for every tensors, then calls python function `to_tensor` to a new object
|
||||
consuming the DLPack structure or return a list of capsule if this function is None.
|
||||
|
||||
|
@ -488,7 +440,7 @@ It creates many tensors acquiring ownership of existing OrtValue.
|
|||
This method saves one object creation and an C++ allocation
|
||||
for every transferred tensor.
|
||||
)pbdoc",
|
||||
py::arg("to_tensor"))
|
||||
py::arg("to_tensor"))
|
||||
#endif
|
||||
;
|
||||
|
||||
|
|
|
@ -397,8 +397,7 @@ void addSparseTensorMethods(pybind11::module& m) {
|
|||
// pybind apparently has a bug with returning enums from def_property_readonly or methods
|
||||
// returning a method object instead of the enumeration value
|
||||
// so we are using def_property and throw on a potential modification
|
||||
.def_property(
|
||||
"format", [](const PySparseTensor* py_tensor) -> OrtSparseFormat {
|
||||
.def_property("format", [](const PySparseTensor* py_tensor) -> OrtSparseFormat {
|
||||
const SparseTensor& tensor = py_tensor->Instance();
|
||||
auto retval = OrtSparseFormat::ORT_SPARSE_UNDEFINED;
|
||||
switch (tensor.Format()) {
|
||||
|
|
|
@ -1425,7 +1425,7 @@ void addGlobalMethods(py::module& m) {
|
|||
ORT_UNUSED_PARAMETER(algo);
|
||||
ORT_THROW("set_cudnn_conv_algo_search is not supported in ROCM");
|
||||
#else
|
||||
cudnn_conv_algo_search = algo;
|
||||
cudnn_conv_algo_search = algo;
|
||||
#endif
|
||||
});
|
||||
// TODO remove deprecated global config
|
||||
|
@ -1436,7 +1436,7 @@ void addGlobalMethods(py::module& m) {
|
|||
ORT_UNUSED_PARAMETER(use_single_stream);
|
||||
ORT_THROW("set_do_copy_in_default_stream is not supported in ROCM");
|
||||
#else
|
||||
do_copy_in_default_stream = use_single_stream;
|
||||
do_copy_in_default_stream = use_single_stream;
|
||||
#endif
|
||||
});
|
||||
// TODO remove deprecated global config
|
||||
|
@ -1801,10 +1801,10 @@ Applies to session load, initialization, etc. Default is 0.)pbdoc")
|
|||
}
|
||||
ORT_THROW_IF_ERROR(options->value.AddExternalInitializers(names_ptrs, values_ptrs));
|
||||
#else
|
||||
ORT_UNUSED_PARAMETER(options);
|
||||
ORT_UNUSED_PARAMETER(names);
|
||||
ORT_UNUSED_PARAMETER(ort_values);
|
||||
ORT_THROW("External initializers are not supported in this build.");
|
||||
ORT_UNUSED_PARAMETER(options);
|
||||
ORT_UNUSED_PARAMETER(names);
|
||||
ORT_UNUSED_PARAMETER(ort_values);
|
||||
ORT_THROW("External initializers are not supported in this build.");
|
||||
#endif
|
||||
});
|
||||
|
||||
|
@ -1866,8 +1866,7 @@ including arg name, arg type (contains both type and shape).)pbdoc")
|
|||
return *(na.Type());
|
||||
},
|
||||
"node type")
|
||||
.def(
|
||||
"__str__", [](const onnxruntime::NodeArg& na) -> std::string {
|
||||
.def("__str__", [](const onnxruntime::NodeArg& na) -> std::string {
|
||||
std::ostringstream res;
|
||||
res << "NodeArg(name='" << na.Name() << "', type='" << *(na.Type()) << "', shape=";
|
||||
auto shape = na.Shape();
|
||||
|
@ -1893,11 +1892,8 @@ including arg name, arg type (contains both type and shape).)pbdoc")
|
|||
}
|
||||
res << ")";
|
||||
|
||||
return std::string(res.str());
|
||||
},
|
||||
"converts the node into a readable string")
|
||||
.def_property_readonly(
|
||||
"shape", [](const onnxruntime::NodeArg& na) -> std::vector<py::object> {
|
||||
return std::string(res.str()); }, "converts the node into a readable string")
|
||||
.def_property_readonly("shape", [](const onnxruntime::NodeArg& na) -> std::vector<py::object> {
|
||||
auto shape = na.Shape();
|
||||
std::vector<py::object> arr;
|
||||
if (shape == nullptr || shape->dim_size() == 0) {
|
||||
|
@ -1914,9 +1910,7 @@ including arg name, arg type (contains both type and shape).)pbdoc")
|
|||
arr[i] = py::none();
|
||||
}
|
||||
}
|
||||
return arr;
|
||||
},
|
||||
"node shape (assuming the node holds a tensor)");
|
||||
return arr; }, "node shape (assuming the node holds a tensor)");
|
||||
|
||||
py::class_<SessionObjectInitializer> sessionObjectInitializer(m, "SessionObjectInitializer");
|
||||
py::class_<PyInferenceSession>(m, "InferenceSession", R"pbdoc(This is the main class used to run a model.)pbdoc")
|
||||
|
@ -2107,51 +2101,28 @@ including arg name, arg type (contains both type and shape).)pbdoc")
|
|||
.def_property_readonly("get_profiling_start_time_ns", [](const PyInferenceSession* sess) -> uint64_t {
|
||||
return sess->GetSessionHandle()->GetProfiling().GetStartTimeNs();
|
||||
})
|
||||
.def(
|
||||
"get_providers", [](const PyInferenceSession* sess) -> const std::vector<std::string>& {
|
||||
return sess->GetSessionHandle()->GetRegisteredProviderTypes();
|
||||
},
|
||||
py::return_value_policy::reference_internal)
|
||||
.def(
|
||||
"get_provider_options", [](const PyInferenceSession* sess) -> const ProviderOptionsMap& {
|
||||
return sess->GetSessionHandle()->GetAllProviderOptions();
|
||||
},
|
||||
py::return_value_policy::reference_internal)
|
||||
.def_property_readonly(
|
||||
"session_options", [](const PyInferenceSession* sess) -> PySessionOptions* {
|
||||
.def("get_providers", [](const PyInferenceSession* sess) -> const std::vector<std::string>& { return sess->GetSessionHandle()->GetRegisteredProviderTypes(); }, py::return_value_policy::reference_internal)
|
||||
.def("get_provider_options", [](const PyInferenceSession* sess) -> const ProviderOptionsMap& { return sess->GetSessionHandle()->GetAllProviderOptions(); }, py::return_value_policy::reference_internal)
|
||||
.def_property_readonly("session_options", [](const PyInferenceSession* sess) -> PySessionOptions* {
|
||||
auto session_options = std::make_unique<PySessionOptions>();
|
||||
session_options->value = sess->GetSessionHandle()->GetSessionOptions();
|
||||
return session_options.release();
|
||||
},
|
||||
py::return_value_policy::take_ownership)
|
||||
.def_property_readonly(
|
||||
"inputs_meta", [](const PyInferenceSession* sess) -> const std::vector<const onnxruntime::NodeArg*>& {
|
||||
return session_options.release(); }, py::return_value_policy::take_ownership)
|
||||
.def_property_readonly("inputs_meta", [](const PyInferenceSession* sess) -> const std::vector<const onnxruntime::NodeArg*>& {
|
||||
auto res = sess->GetSessionHandle()->GetModelInputs();
|
||||
OrtPybindThrowIfError(res.first);
|
||||
return *(res.second);
|
||||
},
|
||||
py::return_value_policy::reference_internal)
|
||||
.def_property_readonly(
|
||||
"outputs_meta", [](const PyInferenceSession* sess) -> const std::vector<const onnxruntime::NodeArg*>& {
|
||||
return *(res.second); }, py::return_value_policy::reference_internal)
|
||||
.def_property_readonly("outputs_meta", [](const PyInferenceSession* sess) -> const std::vector<const onnxruntime::NodeArg*>& {
|
||||
auto res = sess->GetSessionHandle()->GetModelOutputs();
|
||||
OrtPybindThrowIfError(res.first);
|
||||
return *(res.second);
|
||||
},
|
||||
py::return_value_policy::reference_internal)
|
||||
.def_property_readonly(
|
||||
"overridable_initializers", [](const PyInferenceSession* sess) -> const std::vector<const onnxruntime::NodeArg*>& {
|
||||
return *(res.second); }, py::return_value_policy::reference_internal)
|
||||
.def_property_readonly("overridable_initializers", [](const PyInferenceSession* sess) -> const std::vector<const onnxruntime::NodeArg*>& {
|
||||
auto res = sess->GetSessionHandle()->GetOverridableInitializers();
|
||||
OrtPybindThrowIfError(res.first);
|
||||
return *(res.second);
|
||||
},
|
||||
py::return_value_policy::reference_internal)
|
||||
.def_property_readonly(
|
||||
"model_meta", [](const PyInferenceSession* sess) -> const onnxruntime::ModelMetadata& {
|
||||
return *(res.second); }, py::return_value_policy::reference_internal)
|
||||
.def_property_readonly("model_meta", [](const PyInferenceSession* sess) -> const onnxruntime::ModelMetadata& {
|
||||
auto res = sess->GetSessionHandle()->GetModelMetadata();
|
||||
OrtPybindThrowIfError(res.first);
|
||||
return *(res.second);
|
||||
},
|
||||
py::return_value_policy::reference_internal)
|
||||
return *(res.second); }, py::return_value_policy::reference_internal)
|
||||
.def("run_with_iobinding", [](PyInferenceSession* sess, SessionIOBinding& io_binding, RunOptions* run_options = nullptr) -> void {
|
||||
Status status;
|
||||
// release GIL to allow multiple python threads to invoke Run() in parallel.
|
||||
|
@ -2161,8 +2132,7 @@ including arg name, arg type (contains both type and shape).)pbdoc")
|
|||
else
|
||||
status = sess->GetSessionHandle()->Run(*run_options, *io_binding.Get());
|
||||
if (!status.IsOK())
|
||||
throw std::runtime_error("Error in execution: " + status.ErrorMessage());
|
||||
})
|
||||
throw std::runtime_error("Error in execution: " + status.ErrorMessage()); })
|
||||
.def("get_tuning_results", [](PyInferenceSession* sess) -> py::list {
|
||||
#if !defined(ORT_MINIMAL_BUILD)
|
||||
auto results = sess->GetSessionHandle()->GetTuningResults();
|
||||
|
@ -2177,8 +2147,8 @@ including arg name, arg type (contains both type and shape).)pbdoc")
|
|||
|
||||
return ret;
|
||||
#else
|
||||
ORT_UNUSED_PARAMETER(sess);
|
||||
ORT_THROW("TunableOp and get_tuning_results are not supported in this build.");
|
||||
ORT_UNUSED_PARAMETER(sess);
|
||||
ORT_THROW("TunableOp and get_tuning_results are not supported in this build.");
|
||||
#endif
|
||||
})
|
||||
.def("set_tuning_results", [](PyInferenceSession* sess, py::list results, bool error_on_invalid) -> void {
|
||||
|
@ -2209,10 +2179,10 @@ including arg name, arg type (contains both type and shape).)pbdoc")
|
|||
throw std::runtime_error("Error in execution: " + status.ErrorMessage());
|
||||
}
|
||||
#else
|
||||
ORT_UNUSED_PARAMETER(sess);
|
||||
ORT_UNUSED_PARAMETER(results);
|
||||
ORT_UNUSED_PARAMETER(error_on_invalid);
|
||||
ORT_THROW("TunableOp and set_tuning_results are not supported in this build.");
|
||||
ORT_UNUSED_PARAMETER(sess);
|
||||
ORT_UNUSED_PARAMETER(results);
|
||||
ORT_UNUSED_PARAMETER(error_on_invalid);
|
||||
ORT_THROW("TunableOp and set_tuning_results are not supported in this build.");
|
||||
#endif
|
||||
});
|
||||
|
||||
|
|
|
@ -24,8 +24,7 @@ def check_distro_info():
|
|||
|
||||
if __my_distro_ver__ not in ["10", "11"]:
|
||||
warnings.warn(
|
||||
"Unsupported Windows version (%s). ONNX Runtime supports Windows 10 and above, only."
|
||||
% __my_distro_ver__
|
||||
f"Unsupported Windows version ({__my_distro_ver__}). ONNX Runtime supports Windows 10 and above, only."
|
||||
)
|
||||
elif __my_system__ == "linux":
|
||||
"""Although the 'platform' python module for getting Distro information works well on standard OS images
|
||||
|
@ -54,11 +53,11 @@ def check_distro_info():
|
|||
|
||||
if int(__my_distro_ver__.split(".")[0]) < 11:
|
||||
warnings.warn(
|
||||
"Unsupported macOS version (%s). ONNX Runtime supports macOS 11.0 or later." % (__my_distro_ver__)
|
||||
f"Unsupported macOS version ({__my_distro_ver__}). ONNX Runtime supports macOS 11.0 or later."
|
||||
)
|
||||
else:
|
||||
warnings.warn(
|
||||
"Unsupported platform (%s). ONNX Runtime supports Linux, macOS and Windows platforms, only." % __my_system__
|
||||
f"Unsupported platform ({__my_system__}). ONNX Runtime supports Linux, macOS and Windows platforms, only."
|
||||
)
|
||||
|
||||
|
||||
|
@ -115,10 +114,10 @@ def validate_build_package_info():
|
|||
cudart_version = None
|
||||
|
||||
def print_build_package_info():
|
||||
warnings.warn("onnxruntime training package info: package_name: %s" % package_name)
|
||||
warnings.warn("onnxruntime training package info: __version__: %s" % version)
|
||||
warnings.warn("onnxruntime training package info: cuda_version: %s" % cuda_version)
|
||||
warnings.warn("onnxruntime build info: cudart_version: %s" % cudart_version)
|
||||
warnings.warn(f"onnxruntime training package info: package_name: {package_name}")
|
||||
warnings.warn(f"onnxruntime training package info: __version__: {version}")
|
||||
warnings.warn(f"onnxruntime training package info: cuda_version: {cuda_version}")
|
||||
warnings.warn(f"onnxruntime build info: cudart_version: {cudart_version}")
|
||||
|
||||
# collection cuda library info from current environment.
|
||||
from onnxruntime.capi.onnxruntime_collect_build_info import find_cudart_versions
|
||||
|
@ -127,7 +126,7 @@ def validate_build_package_info():
|
|||
if cudart_version and local_cudart_versions and cudart_version not in local_cudart_versions:
|
||||
print_build_package_info()
|
||||
warnings.warn("WARNING: failed to find cudart version that matches onnxruntime build info")
|
||||
warnings.warn("WARNING: found cudart versions: %s" % local_cudart_versions)
|
||||
warnings.warn(f"WARNING: found cudart versions: {local_cudart_versions}")
|
||||
else:
|
||||
# TODO: rcom
|
||||
pass
|
||||
|
|
|
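Not part of the commit: a minimal, self-contained sketch of the printf-style-to-f-string conversion that the Python hunks above settle on; the version string below is invented for illustration.

    version = "1.18.0"  # illustrative value, not taken from the diff
    # Old style: printf-style interpolation.
    msg_old = "onnxruntime training package info: __version__: %s" % version
    # New style: f-string, as the lint cleanup above prefers.
    msg_new = f"onnxruntime training package info: __version__: {version}"
    assert msg_old == msg_new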
@ -22,7 +22,7 @@ _registered_ops: typing.AbstractSet[str] = set()
|
|||
|
||||
|
||||
def _reg(symbolic_fn: typing.Callable):
|
||||
name = "::%s" % symbolic_fn.__name__
|
||||
name = f"::{symbolic_fn.__name__}"
|
||||
torch.onnx.register_custom_op_symbolic(name, symbolic_fn, _OPSET_VERSION)
|
||||
_registered_ops.add(name)
|
||||
|
||||
|
|
|
@ -1076,7 +1076,7 @@ class HistogramCollector(CalibrationDataCollector):
|
|||
|
||||
for i in range(num_half_quantized_bin, zero_bin_index + 1, 1):
|
||||
start_index = zero_bin_index - i
|
||||
end_index = zero_bin_index + i + 1 if (zero_bin_index + i + 1) <= num_bins else num_bins
|
||||
end_index = min(zero_bin_index + i + 1, num_bins)
|
||||
|
||||
thresholds[i - num_half_quantized_bin] = (hist_edges[start_index], hist_edges[end_index])
|
||||
|
||||
|
|
|
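A quick, standalone check (not part of the commit, values are arbitrary) that the min() rewrite in the histogram hunk above is equivalent to the original conditional expression:

    num_bins = 10  # arbitrary example size
    for zero_bin_index in range(num_bins):
        for i in range(num_bins):
            old = zero_bin_index + i + 1 if (zero_bin_index + i + 1) <= num_bins else num_bins
            new = min(zero_bin_index + i + 1, num_bins)
            assert old == new  # both clamp the end index at num_bins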
@ -24,7 +24,7 @@ def get_attribute(node, attr_name, default_value=None):
|
|||
|
||||
|
||||
def get_dim_from_proto(dim):
|
||||
return getattr(dim, dim.WhichOneof("value")) if type(dim.WhichOneof("value")) is str else None # noqa: E721
|
||||
return getattr(dim, dim.WhichOneof("value")) if type(dim.WhichOneof("value")) is str else None
|
||||
|
||||
|
||||
def is_sequence(type_proto):
|
||||
|
@ -92,19 +92,19 @@ def get_opset(mp, domain=None):
|
|||
|
||||
|
||||
def as_scalar(x):
|
||||
if type(x) == list: # noqa: E721
|
||||
if type(x) is list:
|
||||
assert len(x) == 1
|
||||
return x[0]
|
||||
elif type(x) == np.ndarray:
|
||||
elif type(x) is np.ndarray:
|
||||
return x.item()
|
||||
else:
|
||||
return x
|
||||
|
||||
|
||||
def as_list(x, keep_none):
|
||||
if type(x) == list: # noqa: E721
|
||||
if type(x) is list:
|
||||
return x
|
||||
elif type(x) == np.ndarray:
|
||||
elif type(x) is np.ndarray:
|
||||
return list(x)
|
||||
elif keep_none and x is None:
|
||||
return None
|
||||
|
@ -113,7 +113,7 @@ def as_list(x, keep_none):
|
|||
|
||||
|
||||
def sympy_reduce_product(x):
|
||||
if type(x) == list: # noqa: E721
|
||||
if type(x) is list:
|
||||
value = sympy.Integer(1)
|
||||
for v in x:
|
||||
value = value * v
|
||||
|
@ -258,7 +258,7 @@ class SymbolicShapeInference:
|
|||
self.prefix_ = prefix
|
||||
|
||||
def _add_suggested_merge(self, symbols, apply=False):
|
||||
assert all([(type(s) == str and s in self.symbolic_dims_) or is_literal(s) for s in symbols]) # noqa: E721
|
||||
assert all([(type(s) is str and s in self.symbolic_dims_) or is_literal(s) for s in symbols])
|
||||
symbols = set(symbols)
|
||||
for k, v in self.suggested_merge_.items():
|
||||
if k in symbols:
|
||||
|
@ -278,7 +278,7 @@ class SymbolicShapeInference:
|
|||
break
|
||||
if map_to is None:
|
||||
for s in symbols:
|
||||
if type(self.symbolic_dims_[s]) == sympy.Symbol:
|
||||
if type(self.symbolic_dims_[s]) is sympy.Symbol:
|
||||
map_to = s
|
||||
break
|
||||
# when nothing to map to, use the shorter one
|
||||
|
@ -328,7 +328,7 @@ class SymbolicShapeInference:
|
|||
)
|
||||
|
||||
def _merge_symbols(self, dims):
|
||||
if not all([type(d) == str for d in dims]): # noqa: E721
|
||||
if not all([type(d) is str for d in dims]):
|
||||
if self.auto_merge_:
|
||||
unique_dims = list(set(dims))
|
||||
is_int = [is_literal(d) for d in unique_dims]
|
||||
|
@ -408,7 +408,7 @@ class SymbolicShapeInference:
|
|||
def _get_sympy_shape(self, node, idx):
|
||||
sympy_shape = []
|
||||
for d in self._get_shape(node, idx):
|
||||
if type(d) == str: # noqa: E721
|
||||
if type(d) is str:
|
||||
sympy_shape.append(
|
||||
self.symbolic_dims_[d]
|
||||
if d in self.symbolic_dims_
|
||||
|
@ -590,7 +590,7 @@ class SymbolicShapeInference:
|
|||
# for new symbolic dims from subgraph output, add to main graph symbolic dims
|
||||
subgraph_shapes = [get_shape_from_value_info(o) for o in symbolic_shape_inference.out_mp_.graph.output]
|
||||
subgraph_new_symbolic_dims = {
|
||||
d for s in subgraph_shapes if s for d in s if type(d) == str and d not in self.symbolic_dims_ # noqa: E721
|
||||
d for s in subgraph_shapes if s for d in s if type(d) is str and d not in self.symbolic_dims_
|
||||
}
|
||||
new_dims = {}
|
||||
for d in subgraph_new_symbolic_dims:
|
||||
|
@ -610,7 +610,7 @@ class SymbolicShapeInference:
|
|||
if all([v is not None for v in values]):
|
||||
# some shape compute is in floating point, cast to int for sympy
|
||||
for i, v in enumerate(values):
|
||||
if type(v) != np.ndarray:
|
||||
if type(v) is not np.ndarray:
|
||||
continue
|
||||
if len(v.shape) > 1:
|
||||
new_v = None # ignore value for rank > 1
|
||||
|
@ -924,7 +924,7 @@ class SymbolicShapeInference:
|
|||
if all([d == dims[0] for d in dims]):
|
||||
continue
|
||||
merged = self._merge_symbols(dims)
|
||||
if type(merged) == str: # noqa: E721
|
||||
if type(merged) is str:
|
||||
sympy_shape[d] = self.symbolic_dims_[merged] if merged else None
|
||||
else:
|
||||
sympy_shape[d] = merged
|
||||
|
@ -1060,7 +1060,7 @@ class SymbolicShapeInference:
|
|||
dim = shape[-i]
|
||||
if letter not in letter_to_dim:
|
||||
letter_to_dim[letter] = dim
|
||||
elif type(dim) != sympy.Symbol:
|
||||
elif type(dim) is not sympy.Symbol:
|
||||
letter_to_dim[letter] = dim
|
||||
num_operands = num_operands + 1
|
||||
|
||||
|
@ -1127,8 +1127,8 @@ class SymbolicShapeInference:
|
|||
idx = self._try_get_value(node, 1)
|
||||
if idx is not None:
|
||||
data = self.sympy_data_[node.input[0]]
|
||||
if type(data) == list: # noqa: E721
|
||||
if type(idx) == np.ndarray and len(idx.shape) == 1:
|
||||
if type(data) is list:
|
||||
if type(idx) is np.ndarray and len(idx.shape) == 1:
|
||||
self.sympy_data_[node.output[0]] = [data[int(i)] for i in idx]
|
||||
else:
|
||||
self.sympy_data_[node.output[0]] = data[int(idx)]
|
||||
|
@ -1530,7 +1530,7 @@ class SymbolicShapeInference:
|
|||
new_shape = input_shape[:2]
|
||||
output_size = self._try_get_value(node, 1)
|
||||
if output_size is not None:
|
||||
new_shape += [dim_size.item() if type(dim_size) == np.int64 else dim_size for dim_size in output_size]
|
||||
new_shape += [dim_size.item() if type(dim_size) is np.int64 else dim_size for dim_size in output_size]
|
||||
else:
|
||||
rank = len(input_shape)
|
||||
new_shape += [str(self._new_symbolic_dim_from_output(node, 0, i)) for i in range(2, rank)]
|
||||
|
@ -1645,7 +1645,7 @@ class SymbolicShapeInference:
|
|||
deferred_dim_idx = -1
|
||||
non_deferred_size = 1
|
||||
for i, d in enumerate(shape_value):
|
||||
if type(d) == sympy.Symbol:
|
||||
if type(d) is sympy.Symbol:
|
||||
new_sympy_shape.append(d)
|
||||
elif d == 0:
|
||||
new_sympy_shape.append(input_sympy_shape[i])
|
||||
|
@ -1940,7 +1940,7 @@ class SymbolicShapeInference:
|
|||
# handle sympy_data if needed, for slice in shape computation
|
||||
if (
|
||||
node.input[0] in self.sympy_data_
|
||||
and [0] == axes
|
||||
and axes == [0]
|
||||
and starts is not None
|
||||
and len(starts) == 1
|
||||
and ends is not None
|
||||
|
@ -1949,8 +1949,8 @@ class SymbolicShapeInference:
|
|||
and len(steps) == 1
|
||||
):
|
||||
input_sympy_data = self.sympy_data_[node.input[0]]
|
||||
if type(input_sympy_data) == list or ( # noqa: E721
|
||||
type(input_sympy_data) == np.array and len(input_sympy_data.shape) == 1
|
||||
if type(input_sympy_data) is list or (
|
||||
type(input_sympy_data) is np.array and len(input_sympy_data.shape) == 1
|
||||
):
|
||||
self.sympy_data_[node.output[0]] = input_sympy_data[starts[0] : ends[0] : steps[0]]
|
||||
|
||||
|
@ -2616,7 +2616,7 @@ class SymbolicShapeInference:
|
|||
# some models use None for symbolic dim in input, replace it with a string
|
||||
input_dims[i_dim].dim_param = str(self._new_symbolic_dim(i.name, i_dim))
|
||||
|
||||
self.input_symbols_.update([d for d in input_shape if type(d) == str]) # noqa: E721
|
||||
self.input_symbols_.update([d for d in input_shape if type(d) is str])
|
||||
|
||||
for s in self.input_symbols_:
|
||||
if s in self.suggested_merge_:
|
||||
|
|
|
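As a side note, not part of the commit: a small sketch of the exact-type comparison style (type(x) is T rather than type(x) == T) that the shape-inference hunks above standardize on; the classes are invented for the example.

    class Wrapper(list):
        pass

    x = Wrapper()
    # Identity comparison matches only the exact type, never a parent or subclass.
    assert type(x) is Wrapper
    assert type(x) is not list
    # isinstance() is the different, subclass-aware check.
    assert isinstance(x, list)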
@ -925,8 +925,8 @@ def find_model_path(path):
|
|||
|
||||
logger.info(target_model_path)
|
||||
if len(target_model_path) > 1:
|
||||
logger.error("We expect to find only one model in " + path) # noqa: G003
|
||||
raise
|
||||
logger.error("We expect to find only one model in %s", path)
|
||||
raise RuntimeError
|
||||
|
||||
return target_model_path[0]
|
||||
|
||||
|
@ -1007,7 +1007,7 @@ def parse_models_info_from_file(root_dir, path, models):
|
|||
models[row["model_name"]] = {}
|
||||
else:
|
||||
logger.error("Model name must be provided in models_info.json")
|
||||
raise
|
||||
raise RuntimeError
|
||||
|
||||
model = models[row["model_name"]]
|
||||
|
||||
|
@ -1018,19 +1018,19 @@ def parse_models_info_from_file(root_dir, path, models):
|
|||
model["working_directory"] = os.path.join(root_working_directory, row["working_directory"])
|
||||
else:
|
||||
logger.error("Model path must be provided in models_info.json")
|
||||
raise
|
||||
raise RuntimeError
|
||||
|
||||
if "model_path" in row:
|
||||
model["model_path"] = row["model_path"]
|
||||
else:
|
||||
logger.error("Model path must be provided in models_info.json")
|
||||
raise
|
||||
raise RuntimeError
|
||||
|
||||
if "test_data_path" in row:
|
||||
model["test_data_path"] = row["test_data_path"]
|
||||
else:
|
||||
logger.error("Test data path must be provided in models_info.json")
|
||||
raise
|
||||
raise RuntimeError
|
||||
|
||||
if "model_path_fp16" in row:
|
||||
model["model_path_fp16"] = row["model_path_fp16"]
|
||||
|
|
|
@ -234,7 +234,7 @@ def calculate_trt_op_percentage(trt_op_map, cuda_op_map):
|
|||
|
||||
if total_ops == 0:
|
||||
print("Error ...")
|
||||
raise
|
||||
raise RuntimeError
|
||||
|
||||
if len(trt_op_map) == 0:
|
||||
total_cuda_and_cpu_ops = total_ops
|
||||
|
|
|
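A brief aside, not part of the commit, on why the bare raise statements above are replaced with an explicit exception; the message text is illustrative only.

    def bad():
        # A bare raise with no active exception fails with
        # "RuntimeError: No active exception to re-raise".
        raise

    def good(path):
        # Raising a concrete exception type carries the intended error message.
        raise RuntimeError(f"expected to find only one model in {path}")

    try:
        bad()
    except RuntimeError as exc:
        print(exc)  # prints: No active exception to re-raise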
@ -71,7 +71,7 @@ def write_json(models):
|
|||
def main():
|
||||
links = []
|
||||
with open("links.txt") as fh:
|
||||
links = [link.rstrip() for link in fh.readlines()]
|
||||
links = [link.rstrip() for link in fh]
|
||||
|
||||
model_list = []
|
||||
for link in links:
|
||||
|
|
|
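Also not part of the commit: a tiny sketch of the file-iteration idiom adopted above, using an in-memory file so it runs standalone.

    import io

    fh = io.StringIO("https://example.com/a\nhttps://example.com/b\n")
    # Iterating the file object yields lines lazily; readlines() would build the full list first.
    links = [link.rstrip() for link in fh]
    assert links == ["https://example.com/a", "https://example.com/b"]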
@ -802,7 +802,7 @@ def main():
|
|||
try:
|
||||
os.mkdir(args.cache_dir)
|
||||
except OSError:
|
||||
logger.error("Creation of the directory %s failed" % args.cache_dir) # noqa: G002
|
||||
logger.error("Creation of the directory %s failed", args.cache_dir)
|
||||
|
||||
enable_torch = "torch" in args.engines
|
||||
enable_torch2 = "torch2" in args.engines
|
||||
|
|
|
@ -168,11 +168,11 @@ def output_test_data(directory: str, inputs: Dict[str, np.ndarray]):
|
|||
try:
|
||||
os.mkdir(directory)
|
||||
except OSError:
|
||||
print("Creation of the directory %s failed" % directory)
|
||||
print(f"Creation of the directory {directory} failed")
|
||||
else:
|
||||
print("Successfully created the directory %s " % directory)
|
||||
print(f"Successfully created the directory {directory} ")
|
||||
else:
|
||||
print("Warning: directory %s existed. Files will be overwritten." % directory)
|
||||
print(f"Warning: directory {directory} existed. Files will be overwritten.")
|
||||
|
||||
for index, (name, data) in enumerate(inputs.items()):
|
||||
tensor = numpy_helper.from_array(data, name)
|
||||
|
|
|
@ -672,7 +672,7 @@ class FusionAttention(Fusion):
|
|||
q_matmul, k_matmul, v_matmul, q_add, k_add, v_add, num_heads
|
||||
)
|
||||
mha_inputs.extend([q_slice.output[0], k_slice.output[0], v_slice.output[0]])
|
||||
elif type(k_matmul) == NodeProto and type(v_matmul) == NodeProto:
|
||||
elif type(k_matmul) is NodeProto and type(v_matmul) is NodeProto:
|
||||
if self.disable_multi_head_attention_bias:
|
||||
mha_inputs.extend([q_add.output[0], k_matmul.output[0], v_add.output[0]])
|
||||
else:
|
||||
|
|
|
@ -159,7 +159,7 @@ class FusionUtils:
|
|||
tensor (TensorProto): transposed tensor
|
||||
"""
|
||||
if not isinstance(tensor, onnx_proto.TensorProto):
|
||||
raise ValueError("Expected input type is an ONNX TensorProto but got %s" % type(tensor))
|
||||
raise ValueError(f"Expected input type is an ONNX TensorProto but got {type(tensor)}")
|
||||
|
||||
if len(tensor.dims) != 2 or tensor.data_type != onnx_proto.TensorProto.INT8:
|
||||
raise ValueError("Only INT8 2-D tensors can be transposed")
|
||||
|
|
|
@@ -205,5 +205,5 @@ def export_encoder(args):
        no_repeat_ngram_size=no_repeat_ngram_size,
    )
    time_cost = time.time() - start_time
-    print("--- %s seconds ---" % (time_cost))
+    print(f"--- {time_cost} seconds ---")
    print(tokenizer.decode(pred_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False))

@@ -266,5 +266,5 @@ def export_decoder(args):
        use_cache=True,
    )
    time_cost = time.time() - start_time
-    print("--- %s seconds ---" % (time_cost))
+    print(f"--- {time_cost} seconds ---")
    print(tokenizer.decode(pred_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False))
@@ -49,7 +49,7 @@ def run_inference(args):
        no_repeat_ngram_size=no_repeat_ngram_size,
    )
    time_cost = time.time() - start_time
-    print("--- %s seconds ---" % (time_cost))
+    print(f"--- {time_cost} seconds ---")
    for j in range(batch_num):
        for i in range(beam):
            print(

@@ -81,7 +81,7 @@ def run_inference(args):
    start_time = time.time()
    out = sess.run(None, ort_inputs)
    time_cost = time.time() - start_time
-    print("--- %s seconds ---" % (time_cost))
+    print(f"--- {time_cost} seconds ---")
    for j in range(batch_num):
        for i in range(beam):
            print(
@@ -117,7 +117,7 @@ class EngineBuilder:
            model_name = model_name + "_" + "_".join(self.pipeline_info.controlnet)

        if hash_source:
-            model_name += "_" + hashlib.md5("\t".join(hash_source).encode("utf-8")).digest().hex()[:8]
+            model_name += "_" + hashlib.md5("\t".join(hash_source).encode("utf-8")).hexdigest()[:8]

        # TODO: When we support original VAE, we shall save custom VAE to another directory.
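hexdigest() returns the same hex string that digest().hex() builds in two steps, so the shorter form above is a pure readability change. A quick check:

    import hashlib

    h = hashlib.md5("\t".join(["a", "b"]).encode("utf-8"))
    assert h.hexdigest()[:8] == h.digest().hex()[:8]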
@@ -459,9 +459,9 @@ class StableDiffusionPipeline:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + guidance * (noise_pred_text - noise_pred_uncond)

-                if type(self.scheduler) == UniPCMultistepScheduler:
+                if type(self.scheduler) is UniPCMultistepScheduler:
                    latents = self.scheduler.step(noise_pred, timestep, latents, return_dict=False)[0]
-                elif type(self.scheduler) == LCMScheduler:
+                elif type(self.scheduler) is LCMScheduler:
                    latents = self.scheduler.step(noise_pred, timestep, latents, generator=self.generator)[0]
                else:
                    latents = self.scheduler.step(noise_pred, latents, step_offset + step_index, timestep)
@@ -1883,7 +1883,7 @@ TEST_F(PlannerTest, ParaPlanCreation) {
          ORT_ENFORCE(main_graph_ort_value_index_map.GetName(per_value_plan.reused_buffer, reused).IsOK());
          reuse_pairs.erase(reused);
        }  // if
      }    // for
    }      // for
    ASSERT_TRUE(reuse_pairs.empty());
  }
@@ -14,7 +14,7 @@ class OrtValueArray {
 public:
  ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(OrtValueArray);
  // n must be non-negative
-  OrtValueArray(int n) : values(static_cast<size_t>(n), nullptr){};
+  OrtValueArray(int n) : values(static_cast<size_t>(n), nullptr) {};
  ~OrtValueArray() {
    for (OrtValue* v : values) {
      if (v != nullptr) Ort::GetApi().ReleaseValue(v);
@@ -27,7 +27,7 @@ class Allocs : public IExecutionProvider {
  std::shared_ptr<CPUAllocator> alloc = std::make_shared<CPUAllocator>();

 public:
-  Allocs() : IExecutionProvider("fake"){};
+  Allocs() : IExecutionProvider("fake") {};
  AllocatorPtr GetAllocator(OrtMemType) const {
    return alloc;
  }
@@ -401,7 +401,7 @@ void QDQTransformerGemmTests(bool has_output_q, bool has_bias, bool beta_not_one
  auto check_binary_op_graph = [&](InferenceSessionWrapper& session) {
    auto op_to_count = CountOpsInGraph(session.GetGraph());
    const QDQOpKeys qdq_keys = GetQDQOpKeys(use_contrib_qdq);
-    if ((!has_output_q || std::is_same_v<Input1Type, OutputType>)&&(!has_bias || (std::is_same_v<BiasType, int32_t> && !beta_not_one)) &&
+    if ((!has_output_q || std::is_same_v<Input1Type, OutputType>) && (!has_bias || (std::is_same_v<BiasType, int32_t> && !beta_not_one)) &&
        (std::is_same_v<Input1Type, uint8_t> || std::is_same_v<Input2Type, int8_t>)) {
      EXPECT_EQ(op_to_count["com.microsoft.QGemm"], 1);
      EXPECT_EQ(op_to_count["Gemm"], 0);

@@ -786,7 +786,7 @@ void QDQTransformerGemmTests(bool has_output_q, bool has_bias, bool beta_not_one
  auto check_binary_op_graph = [&](InferenceSessionWrapper& session) {
    auto op_to_count = CountOpsInGraph(session.GetGraph());
    const QDQOpKeys qdq_keys = GetQDQOpKeys(use_contrib_qdq);
-    if ((!has_output_q || std::is_same_v<Input1Type, OutputType>)&&(!has_bias || (std::is_same_v<BiasType, int32_t> && !beta_not_one)) &&
+    if ((!has_output_q || std::is_same_v<Input1Type, OutputType>) && (!has_bias || (std::is_same_v<BiasType, int32_t> && !beta_not_one)) &&
        (std::is_same_v<Input1Type, uint8_t> || std::is_same_v<Input2Type, int8_t>)) {
      EXPECT_EQ(op_to_count["com.microsoft.QGemm"], 1);
      EXPECT_EQ(op_to_count["Gemm"], 0);
@@ -40,13 +40,13 @@ def TestReduction(op, data, axes, keepdims):  # noqa: N802


def PrintResult(op, axes, keepdims, res):  # noqa: N802
-    print(' {"%s",' % op)
+    print(f' {{"{op}",')
    print("OpAttributesResult(")
    print(" // ReductionAttribute")
    print(" {")
    print(" // axes_")
    print("{", end="")
-    print(*axes, sep=", ", end="") if axes else print("")
+    print(*axes, sep=", ", end="") if axes else print()
    print("},")
    print(" // keep_dims_")
    print(keepdims, ",")

@@ -60,7 +60,7 @@ def PrintResult(op, axes, keepdims, res):  # noqa: N802
    print(" // expected values")
    print("{", end="")
    for i in range(res.size):
-        print("%5.6ff," % res.item(i))
+        print(f"{res.item(i):5.6f}f,")

    print("})},")
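The format spec carries over unchanged between the two styles: "%5.6f" and "{value:5.6f}" render the same digits, with the trailing "f" printed literally for the generated C++ float literals. For example:

    value = 3.14159265
    print("%5.6ff," % value)    # 3.141593f,
    print(f"{value:5.6f}f,")    # 3.141593f,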
@@ -130,7 +130,7 @@ if __name__ == "__main__":
        print("{")
        for i in range(input_data.size):
            print(
-                "%5.6ff," % input_data.item(i),
+                f"{input_data.item(i):5.6f}f,",
            )
        print("},")
        print("// input_dims")
@@ -66,13 +66,13 @@ static void RunAllOpsetAllDomainPadTests(
    bool pads_is_initializer;
    bool value_is_initializer;
  };
-  const std::vector<TestParams> all_test_params {
-      {false, false},
+  const std::vector<TestParams> all_test_params{
+      {false, false},
#if (defined(USE_NNAPI) && defined(__ANDROID__)) || (defined(USE_COREML) && defined(__APPLE__))
-      // only enable when building NNAPI EP on Android or building CoreML EP for Apple environment
-      // test runs out of memory in QEMU aarch64 environment, so don't enable otherwise
-      // TODO try to enable when we move from QEMU to arm64 CI machines
-      {true, true},
+      // only enable when building NNAPI EP on Android or building CoreML EP for Apple environment
+      // test runs out of memory in QEMU aarch64 environment, so don't enable otherwise
+      // TODO try to enable when we move from QEMU to arm64 CI machines
+      {true, true},
#endif
  };
  for (const auto& test_params : all_test_params) {
@@ -835,14 +835,14 @@ TEST_F(QnnHTPBackendTests, HTPGraphFinalizationOptimizationModes) {

// Test that models run with various SoC model values
TEST_F(QnnHTPBackendTests, HTPSocModels) {
-  constexpr std::array<const char*, 3> soc_models = { "",   // No explicit SoC model specified
-                                                      "0",  // "Unknown"
+  constexpr std::array<const char*, 3> soc_models = {"",   // No explicit SoC model specified
+                                                     "0",  // "Unknown"
#if defined(_M_ARM64)
-                                                      "37" };  // SC8280X
+                                                     "37"};  // SC8280X
#elif defined(__linux__)
-                                                      "30" };  // SM8350
+                                                     "30"};  // SM8350
#else
-                                                      "" };
+                                                     ""};
#endif

  for (auto soc_model : soc_models) {
@@ -76,7 +76,7 @@ def apply_filters(filters, category):
    opset_version = f"opset{onnx.defs.onnx_opset_version()}"
    validated_filters = []
    for f in filters[category]:
-        if type(f) is list:  # noqa: E721
+        if type(f) is list:
            opset_regex = f[0]
            filter_regex = f[1]
            opset_match = re.match(opset_regex, opset_version)
@@ -486,9 +486,6 @@ class ApplyRotaryEmbKV(torch.autograd.Function):
        return dkv, None, None, None, None


-apply_rotary_emb_kv_ = ApplyRotaryEmbKV.apply
-
-
def apply_rotary_emb_kv_(
    kv,
    cos,
@@ -343,9 +343,9 @@ def generate_test_data(
    try:
        os.mkdir(path)
    except OSError:
-        print("Creation of the directory %s failed" % path)
+        print(f"Creation of the directory {path} failed")
    else:
-        print("Successfully created the directory %s " % path)
+        print(f"Successfully created the directory {path} ")

    if input_tensor_only:
        return

@@ -452,9 +452,9 @@ def generate_test_data(
    try:
        os.mkdir(path)
    except OSError:
-        print("Creation of the directory %s failed" % path)
+        print(f"Creation of the directory {path} failed")
    else:
-        print("Successfully created the directory %s " % path)
+        print(f"Successfully created the directory {path} ")

    sess_options = onnxruntime.SessionOptions()
    sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
@@ -381,9 +381,9 @@ struct StandaloneCustomOp : Ort::CustomOpBase<StandaloneCustomOp, StandaloneCust
/////////////// structures to test multi-kernls-single-schema ///////////////

struct MulTopKernelFloat {
-  MulTopKernelFloat(const OrtKernelInfo*){};
+  MulTopKernelFloat(const OrtKernelInfo*) {};
  ~MulTopKernelFloat() = default;
-  void Compute(OrtKernelContext*){};
+  void Compute(OrtKernelContext*) {};
};

struct MulTopOpFloat : Ort::CustomOpBase<MulTopOpFloat, MulTopKernelFloat> {

@@ -397,9 +397,9 @@ struct MulTopOpFloat : Ort::CustomOpBase<MulTopOpFloat, MulTopKernelFloat> {
};

struct MulTopKernelInt32 {
-  MulTopKernelInt32(const OrtKernelInfo*){};
+  MulTopKernelInt32(const OrtKernelInfo*) {};
  ~MulTopKernelInt32() = default;
-  void Compute(OrtKernelContext*){};
+  void Compute(OrtKernelContext*) {};
};

struct MulTopOpInt32 : Ort::CustomOpBase<MulTopOpInt32, MulTopKernelInt32> {

@@ -413,9 +413,9 @@ struct MulTopOpInt32 : Ort::CustomOpBase<MulTopOpInt32, MulTopKernelInt32> {
};

struct MulTopKernelDouble {
-  MulTopKernelDouble(const OrtKernelInfo*){};
+  MulTopKernelDouble(const OrtKernelInfo*) {};
  ~MulTopKernelDouble() = default;
-  void Compute(OrtKernelContext*){};
+  void Compute(OrtKernelContext*) {};
};

// MulTopOpDouble and MulTopOpFloat has input count mismatch

@@ -430,9 +430,9 @@ struct MulTopOpDouble : Ort::CustomOpBase<MulTopOpDouble, MulTopKernelDouble> {
};

struct MulTopKernelInt16 {
-  MulTopKernelInt16(const OrtKernelInfo*){};
+  MulTopKernelInt16(const OrtKernelInfo*) {};
  ~MulTopKernelInt16() = default;
-  void Compute(OrtKernelContext*){};
+  void Compute(OrtKernelContext*) {};
};

// MulTopOpInt16 and MulTopOpFloat has output count mismatch

@@ -448,9 +448,9 @@ struct MulTopOpInt16 : Ort::CustomOpBase<MulTopOpInt16, MulTopKernelInt16> {

// MulTopKernelFloat16 and MulTopOpFloat has input characteristic mismatch
struct MulTopKernelFloat16 {
-  MulTopKernelFloat16(const OrtKernelInfo*){};
+  MulTopKernelFloat16(const OrtKernelInfo*) {};
  ~MulTopKernelFloat16() = default;
-  void Compute(OrtKernelContext*){};
+  void Compute(OrtKernelContext*) {};
};

struct MulTopOpFloat16 : Ort::CustomOpBase<MulTopOpFloat16, MulTopKernelFloat16> {
@@ -48,10 +48,10 @@ def Save(dir, func, feed, outputs):  # noqa: N802
            if actual_input_name.startswith(cntk_name):
                cntk_to_actual_names[cntk_name] = actual_input_name

-    if type(feed) is not dict:  # noqa: E721
+    if type(feed) is not dict:
        feed = {func.arguments[0]: feed}

-    if type(outputs) is not dict:  # noqa: E721
+    if type(outputs) is not dict:
        outputs = {func.outputs[0]: outputs}

    test_data_dir = os.path.join(dir, data_dir)
@@ -35,8 +35,7 @@ void AdasumMPI::InitializeVHDDReductionComms(WorkerGroupType worker_group) {
  int nearest_power_2 = 1;
  int log_size;
  for (nearest_power_2 = 1, log_size = 0; (nearest_power_2 << 1) <= size;
-       nearest_power_2 = (nearest_power_2 << 1), log_size++)
-    ;
+       nearest_power_2 = (nearest_power_2 << 1), log_size++);
  int shift_val;
  int level;
  reduction_comms_ = std::make_unique<std::vector<MPI_Comm>>();
@@ -247,7 +247,7 @@ struct PipelineWorkerState {

struct PipelineWorkerPool {
  PipelineWorkerPool() = default;
-  PipelineWorkerPool(size_t num_workers) : workers(num_workers), worker_states(num_workers){};
+  PipelineWorkerPool(size_t num_workers) : workers(num_workers), worker_states(num_workers) {};
  void Join(size_t worker_id);
  void JoinAll();
@@ -102,7 +102,7 @@ class OrtTorchFunctionPool final {
  void UnRegisterFunctions();

 private:
-  OrtTorchFunctionPool(){};
+  OrtTorchFunctionPool() {};
  ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(OrtTorchFunctionPool);

  void UnRegisterGlobalFunctions();
@@ -13,7 +13,7 @@
// See https://docs.python.org/3/c-api/init.html#non-python-created-threads for details.
class GilGuard {
 public:
-  GilGuard() : state_(PyGILState_Ensure()){};
+  GilGuard() : state_(PyGILState_Ensure()) {};
  ~GilGuard() { PyGILState_Release(state_); };

 private:
@@ -95,8 +95,8 @@ class TorchProxy {
      std::vector<int64_t>& bw_output_to_input_alias_map);

 private:
-  TorchProxy(){};
-  ~TorchProxy(){};
+  TorchProxy() {};
+  ~TorchProxy() {};

  ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(TorchProxy);
@@ -33,7 +33,7 @@ struct OpDef {
  OpDef(const std::string& type, const std::string& domain = kOnnxDomain, const int opset_version = 9)
      : type(type),
        domain(domain),
-        opset_version(opset_version){};
+        opset_version(opset_version) {};

  std::string type;
  std::string domain;

@@ -52,7 +52,7 @@ struct NodeDef {
        output_args(output_args),
        attributes(attributes),
        name(name),
-        priority(priority){};
+        priority(priority) {};

  NodeDef(const std::string& op_type,
          const std::vector<ArgDef>& input_args,

@@ -64,7 +64,7 @@ struct NodeDef {
        output_args(output_args),
        attributes(attributes),
        name(name),
-        priority(priority){};
+        priority(priority) {};

  NodeDef(const OpDef& op_def,
          const std::vector<ArgDef>& input_args,
@@ -21,7 +21,7 @@ struct LossFunctionInfo {

struct ILossFunction {
  virtual GraphAugmenter::GraphDefs operator()(const Graph& graph, const LossFunctionInfo& loss_func_info) = 0;
-  virtual ~ILossFunction(){};
+  virtual ~ILossFunction() {};
};

TypeProto* GetSparseTypeProto(const NodeArg* input_arg,
@@ -887,7 +887,7 @@ struct PipelineStageNodeGroup {
  // the consumer nodes of a particular initializer can be more than one, so we need a vector to store those
  // nodes.
  std::vector<Node*> nodes;
-  PipelineStageNodeGroup(const size_t stage, std::vector<Node*>& node_group) : stage_id(stage), nodes(std::move(node_group)){};
+  PipelineStageNodeGroup(const size_t stage, std::vector<Node*>& node_group) : stage_id(stage), nodes(std::move(node_group)) {};
};

// This function passes through the given initializer across stages specified in node_groups[i].stage_id.
@@ -21,7 +21,7 @@ struct OpInfo {
         const size_t output_count = 1) : op_type(op_type),
                                          supported_versions(supported_versions),
                                          domain(domain),
-                                         output_count(output_count){};
+                                         output_count(output_count) {};

  std::string op_type;
  std::initializer_list<OperatorSetVersion> supported_versions;

@@ -53,7 +53,7 @@ const OpInfo where_info = OpInfo("Where", opset_v9);
struct NodeInfo {
  NodeInfo(const std::vector<OpInfo>& op_infos,
           const bool required = true) : op_infos(op_infos),
-                                        required(required){};
+                                        required(required) {};

  std::vector<OpInfo> op_infos;
  bool required;
@@ -46,7 +46,7 @@ class TrainingSession : public InferenceSession {

  TrainingSession(const SessionOptions& session_options, const Environment& env)
      : InferenceSession(session_options, env), is_mixed_precision_enabled_(false) {}
-  virtual ~TrainingSession(){};
+  virtual ~TrainingSession() {};

  /**
   * The training configuration options.

@@ -215,11 +215,11 @@ class TrainingSession : public InferenceSession {
    // If the edge is unique, i.e. only have one consumer node, or all the edges
    // with the same node_arg_name needs to be cut, specify the node_arg_name
    // suffices.
-    CutEdge(std::string edge) : node_arg_name(edge){};
+    CutEdge(std::string edge) : node_arg_name(edge) {};
    // If the edges with same node_arg_name belongs to different cut, i.e. some of its
    // consumer node belongs to one partition, and some belongs to another, specify
    // the consumer node names which you want to perform the cut on.
-    CutEdge(std::string edge, std::vector<std::string> nodes) : node_arg_name(edge), consumer_nodes(nodes){};
+    CutEdge(std::string edge, std::vector<std::string> nodes) : node_arg_name(edge), consumer_nodes(nodes) {};
  };
  // CutInfo is a group of CutEdges that describes a specific cut that composed of splitting those edges.
  typedef std::vector<CutEdge> CutInfo;
@@ -60,7 +60,7 @@ class DynamicSettings {
  }

 private:
-  DynamicSettings() : onnx_fusion_status_(true){};
+  DynamicSettings() : onnx_fusion_status_(true) {};
  bool onnx_fusion_status_;
};
@@ -861,8 +861,7 @@ int main(int argc, char* argv[]) {
  OrtParameters ort_params{};
  RETURN_IF_FAIL(ParseArguments(argc, argv, params, ort_params));
  bool keep_looping = params.debug_break;
-  while (keep_looping)
-    ;
+  while (keep_looping);

  // setup logger, be noted: LOGS_DEFAULT must be after logging manager initialization.
  string default_logger_id{"Default"};
@@ -86,36 +86,36 @@ int main(int argc, char* argv[]) {
  // setup onnxruntime env
  std::vector<FreeDimensionOverride> overrides = {};
  SessionOptions so = {
-      ExecutionMode::ORT_SEQUENTIAL,  // execution_mode
-      ExecutionOrder::DEFAULT,  // execution_order
-      false,  // enable_profiling
-      ORT_TSTR(""),  // optimized_model_filepath
-      true,  // enable_mem_pattern
-      true,  // enable_mem_reuse
-      true,  // enable_cpu_mem_arena
-      ORT_TSTR("onnxruntime_profile_"),  // profile_file_prefix
-      "",  // session_logid
-      -1,  // session_log_severity_level
-      0,  // session_log_verbosity_level
-      5,  // max_num_graph_transformation_steps
-      TransformerLevel::Level1,  // graph_optimization_level
-      {},  // intra_op_param
-      {},  // inter_op_param
-      overrides,  // free_dimension_overrides
-      true,  // use_per_session_threads
-      true,  // thread_pool_allow_spinning
-      false,  // use_deterministic_compute
-      {},  // session_configurations
-      {},  // initializers_to_share_map
+      ExecutionMode::ORT_SEQUENTIAL,  // execution_mode
+      ExecutionOrder::DEFAULT,  // execution_order
+      false,  // enable_profiling
+      ORT_TSTR(""),  // optimized_model_filepath
+      true,  // enable_mem_pattern
+      true,  // enable_mem_reuse
+      true,  // enable_cpu_mem_arena
+      ORT_TSTR("onnxruntime_profile_"),  // profile_file_prefix
+      "",  // session_logid
+      -1,  // session_log_severity_level
+      0,  // session_log_verbosity_level
+      5,  // max_num_graph_transformation_steps
+      TransformerLevel::Level1,  // graph_optimization_level
+      {},  // intra_op_param
+      {},  // inter_op_param
+      overrides,  // free_dimension_overrides
+      true,  // use_per_session_threads
+      true,  // thread_pool_allow_spinning
+      false,  // use_deterministic_compute
+      {},  // session_configurations
+      {},  // initializers_to_share_map
#if !defined(ORT_MINIMAL_BUILD) && !defined(DISABLE_EXTERNAL_INITIALIZERS)
-      {},  // external_initializers
-      {},  // external_initializer_files
+      {},  // external_initializers
+      {},  // external_initializer_files
#endif
-      nullptr,  // custom_create_thread_fn
-      nullptr,  // custom_thread_creation_options
-      nullptr,  // custom_join_thread_fn
+      nullptr,  // custom_create_thread_fn
+      nullptr,  // custom_thread_creation_options
+      nullptr,  // custom_join_thread_fn
#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS)
-      {},  // custom_op_libs
+      {},  // custom_op_libs
#endif
  };
@@ -98,7 +98,7 @@ class RandomDataSet : public DataSet {
      : DataSet(tensor_names),
        num_samples_(num_samples),
        tensor_shapes_(tensor_shapes),
-        tensor_types_(tensor_types){};
+        tensor_types_(tensor_types) {};

  virtual ~RandomDataSet() {}

@@ -189,7 +189,7 @@ class LossScaler {
        min_loss_scale_(min_loss_scale),
        max_loss_scale_(max_loss_scale),
        loss_scale_(loss_scale),
-        stable_steps_(0){};
+        stable_steps_(0) {};

  std::string GetLossScaleInputName() const { return loss_scale_input_name_; }
@@ -319,7 +319,7 @@ void addObjectMethodsForTraining(py::module& m) {
    auto& pool = onnxruntime::language_interop_ops::torch::OrtTorchFunctionPool::GetInstance();
    pool.RegisterForwardRunner(function_address);
#else
-    ORT_UNUSED_PARAMETER(obj);
+    ORT_UNUSED_PARAMETER(obj);
#endif
  });
  m.def("register_backward_runner", [](py::object obj) -> void {

@@ -328,7 +328,7 @@ void addObjectMethodsForTraining(py::module& m) {
    auto& pool = onnxruntime::language_interop_ops::torch::OrtTorchFunctionPool::GetInstance();
    pool.RegisterBackwardRunner(function_address);
#else
-    ORT_UNUSED_PARAMETER(obj);
+    ORT_UNUSED_PARAMETER(obj);
#endif
  });
  m.def("register_torch_autograd_function", [](std::string function_full_qual_name, py::object obj) -> void {

@@ -336,8 +336,8 @@ void addObjectMethodsForTraining(py::module& m) {
    auto& pool = onnxruntime::language_interop_ops::torch::OrtTorchFunctionPool::GetInstance();
    pool.RegisterTorchAutogradFunction(function_full_qual_name, obj.ptr());
#else
-    ORT_UNUSED_PARAMETER(function_full_qual_name);
-    ORT_UNUSED_PARAMETER(obj);
+    ORT_UNUSED_PARAMETER(function_full_qual_name);
+    ORT_UNUSED_PARAMETER(obj);
#endif
  });
  m.def("register_shape_inference_function", [](std::string function_full_qual_name, py::object obj) -> void {

@@ -345,8 +345,8 @@ void addObjectMethodsForTraining(py::module& m) {
    auto& pool = onnxruntime::language_interop_ops::torch::OrtTorchFunctionPool::GetInstance();
    pool.RegisterShapeInferenceFunction(function_full_qual_name, obj.ptr());
#else
-    ORT_UNUSED_PARAMETER(function_full_qual_name);
-    ORT_UNUSED_PARAMETER(obj);
+    ORT_UNUSED_PARAMETER(function_full_qual_name);
+    ORT_UNUSED_PARAMETER(obj);
#endif
  });
  m.def("get_shape_inference_function", [](std::string function_full_qual_name) -> py::object {

@@ -368,8 +368,8 @@ void addObjectMethodsForTraining(py::module& m) {
    auto& pool = onnxruntime::language_interop_ops::torch::OrtTorchFunctionPool::GetInstance();
    pool.RegisterInputAliasFunction(function_full_qual_name, obj.ptr());
#else
-    ORT_UNUSED_PARAMETER(function_full_qual_name);
-    ORT_UNUSED_PARAMETER(obj);
+    ORT_UNUSED_PARAMETER(function_full_qual_name);
+    ORT_UNUSED_PARAMETER(obj);
#endif
  });
  m.def("register_miscellaneous_const_input", [](py::object obj) -> void {

@@ -377,7 +377,7 @@ void addObjectMethodsForTraining(py::module& m) {
    auto& pool = onnxruntime::language_interop_ops::torch::OrtTorchFunctionPool::GetInstance();
    pool.RegisterMiscellaneousConstInput(obj.ptr());
#else
-    ORT_UNUSED_PARAMETER(obj);
+    ORT_UNUSED_PARAMETER(obj);
#endif
  });
  m.def("unregister_python_functions", []() -> void {

@@ -391,14 +391,14 @@ void addObjectMethodsForTraining(py::module& m) {
#ifdef ENABLE_TRAINING_TORCH_INTEROP
    return true;
#else
-    return false;
+    return false;
#endif
  });
  m.def("is_triton_enabled", []() -> bool {
#ifdef ENABLE_TRITON
    return true;
#else
-    return false;
+    return false;
#endif
  });
#ifdef ENABLE_TRITON

@@ -1036,7 +1036,7 @@ void addObjectMethodsForTraining(py::module& m) {
#ifdef __linux__
    return true;
#else
-    return false;
+    return false;
#endif
  });
#endif
@@ -372,7 +372,7 @@ def _gen_bmm_module(
) -> Tuple[str, ModuleType]:
    func_name = gen_unique_name("bmm")
    kwargs = _mm_configs(dtype, m, n, k, trans_a, trans_b, alpha, func_name)
-    batch = batch_a if batch_a >= batch_b else batch_b
+    batch = max(batch_a, batch_b)
    kwargs["stride_aq"] = m * k if batch_a == batch else 0
    kwargs["stride_bq"] = k * n if batch_b == batch else 0
    kwargs["batch"] = batch
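max(batch_a, batch_b) is equivalent to the conditional expression it replaces and states the intent directly:

    batch_a, batch_b = 4, 1  # hypothetical batch sizes
    assert max(batch_a, batch_b) == (batch_a if batch_a >= batch_b else batch_b)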
@@ -74,7 +74,7 @@ def _ortvalues_to_torch_tensor(
        return tuple(C.to_aten_ort_device_tensor(ov) for ov in ortvalues)

    if not isinstance(ortvalues, C.OrtValueVector):
-        raise TypeError("ortvalues must be an instance of OrtValueVector not %r." % type(ortvalues))
+        raise TypeError(f"ortvalues must be an instance of OrtValueVector not {type(ortvalues)!r}.")

    res: List[torch.Tensor] = ortvalues.to_dlpacks(_from_dlpack)
    bool_indices = ortvalues.bool_tensor_indices()
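The !r conversion in the new f-string matches the old %r placeholder: both interpolate repr() of the value, for example:

    ortvalues = [1, 2, 3]  # stand-in object, not an OrtValueVector
    print("ortvalues must be an instance of OrtValueVector not %r." % type(ortvalues))
    print(f"ortvalues must be an instance of OrtValueVector not {type(ortvalues)!r}.")
    # both print: ortvalues must be an instance of OrtValueVector not <class 'list'>.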
@@ -58,8 +58,8 @@ class PyNodeSharedPointerPool {
  }

 private:
-  PyNodeSharedPointerPool(){};
-  ~PyNodeSharedPointerPool(){};
+  PyNodeSharedPointerPool() {};
+  ~PyNodeSharedPointerPool() {};

  PyNodeSharedPointerPool(const PyNodeSharedPointerPool&) = delete;
  PyNodeSharedPointerPool& operator=(const PyNodeSharedPointerPool&) = delete;
@@ -159,7 +159,7 @@ struct PipelineStageNodeGroup {
  // the consumer nodes of a particular initializer can be more than one, so we need a vector to store those
  // nodes.
  std::vector<Node*> nodes;
-  PipelineStageNodeGroup(const size_t stage, std::vector<Node*>& node_group) : stage_id(stage), nodes(std::move(node_group)){};
+  PipelineStageNodeGroup(const size_t stage, std::vector<Node*>& node_group) : stage_id(stage), nodes(std::move(node_group)) {};
};

// This function passes through the given initializer across stages specified in node_groups[i].stage_id.
Some files were not shown because too many files changed in this diff.