From 6d7a8659136f446d3db249dde61217b32b3ad2e6 Mon Sep 17 00:00:00 2001
From: Zuwei Zhao <4123666+Zuwei-Zhao@users.noreply.github.com>
Date: Wed, 8 Sep 2021 19:21:40 -0500
Subject: [PATCH] Disable c++ exceptions in onnxruntime-extensions. (#143)

* Disable c++ exceptions in onnxruntime-extensions.

* Remove cxx flags for extensions.

* Remove redundant lines.

Co-authored-by: Zuwei Zhao
---
 cmake/noexcep_ops.cmake                       |  6 +++++-
 operators/math/segment_sum.cc                 | 14 +++++++-------
 .../text/re2_strings/string_regex_replace.cc  | 10 +++++-----
 .../text/re2_strings/string_regex_split.cc    | 14 +++++++-------
 operators/tokenizer/gpt2_tokenizer.cc         | 18 +++++++++---------
 operators/tokenizer/wordpiece_tokenizer.cc    |  2 +-
 tools/gen_selectedops.py                      | 10 ++++------
 7 files changed, 38 insertions(+), 36 deletions(-)

diff --git a/cmake/noexcep_ops.cmake b/cmake/noexcep_ops.cmake
index df8f013c..f316d32f 100644
--- a/cmake/noexcep_ops.cmake
+++ b/cmake/noexcep_ops.cmake
@@ -1,12 +1,16 @@
 # If the operator needs cpp exceptions support, write down its name here
 if (OCOS_ENABLE_GPT2_TOKENIZER)
+  # gpt2 tokenizer depends on the nlohmann_json in onnxruntime, which is too old to build with exceptions disabled.
+  # This limit can be removed once nlohmann_json is updated in onnxruntime.
   message(FATAL_ERROR "GPT2_TOKENIZER operator needs c++ exceptions support")
 endif()
 if (OCOS_ENABLE_WORDPIECE_TOKENIZER)
+  # wordpiece tokenizer depends on the nlohmann_json in onnxruntime, which is too old to build with exceptions disabled.
+  # This limit can be removed once nlohmann_json is updated in onnxruntime.
   message(FATAL_ERROR "WORDPIECE_TOKENIZER operator needs c++ exceptions support")
 endif()
 if (OCOS_ENABLE_BLINGFIRE)
-  message(FATAL_ERROR "BLINGFIRE operator needs c++ exceptions support")
+  message(STATUS "BLINGFIRE operator needs c++ exceptions support; exceptions will be enabled by default!")
 endif()
 if (OCOS_ENABLE_SPM_TOKENIZER)
   message(FATAL_ERROR "SPM_TOKENIZER operator needs c++ exceptions support")
diff --git a/operators/math/segment_sum.cc b/operators/math/segment_sum.cc
index b81e5126..6b5f7c8d 100644
--- a/operators/math/segment_sum.cc
+++ b/operators/math/segment_sum.cc
@@ -15,13 +15,13 @@ void KernelSegmentSum_Compute(Ort::CustomOpApi& ort_, OrtKernelContext* context)
   OrtTensorDimensions dim_data(ort_, data);
   OrtTensorDimensions dim_seg(ort_, segment_ids);
   if (dim_data.size() == 0 || dim_seg.size() == 0)
-    throw std::runtime_error("Both inputs cannot be empty.");
+    ORT_CXX_API_THROW("Both inputs cannot be empty.", ORT_INVALID_ARGUMENT);
   if (dim_seg.size() != 1)
-    throw std::runtime_error("segment_ids must a single tensor");
+    ORT_CXX_API_THROW("segment_ids must be a single tensor", ORT_INVALID_ARGUMENT);
   if (dim_data[0] != dim_seg[0])
-    throw std::runtime_error(MakeString(
+    ORT_CXX_API_THROW(MakeString(
         "First dimensions of data and segment_ids should be the same, data shape: ", dim_data.GetDims(),
-        " segment_ids shape: ", dim_seg.GetDims()));
+        " segment_ids shape: ", dim_seg.GetDims()), ORT_INVALID_ARGUMENT);
 
   int64_t last_seg = p_segment_ids[dim_seg[0] - 1];
   OrtTensorDimensions dim_out = dim_data;
@@ -42,9 +42,9 @@ void KernelSegmentSum_Compute(Ort::CustomOpApi& ort_, OrtKernelContext* context)
   const int64_t* p_seg = p_segment_ids;
   for (; begin != end; ++p_seg) {
     if ((p_seg != p_segment_ids) && (*p_seg != *(p_seg - 1)) && (*p_seg != *(p_seg - 1) + 1))
-      throw std::runtime_error(MakeString("segment_ids must be increasing but found ",
-                                          *(p_seg - 1), " and ", *p_seg, " at position ",
-                                          std::distance(p_segment_ids, p_seg), "."));
+      ORT_CXX_API_THROW(MakeString("segment_ids must be increasing but found ",
+                                   *(p_seg - 1), " and ", *p_seg, " at position ",
+                                   std::distance(p_segment_ids, p_seg), "."), ORT_INVALID_ARGUMENT);
     p_out = p_output + *p_seg * in_stride;
     p_out_end = p_out + in_stride;
     for (; p_out != p_out_end; ++p_out, ++begin)
@@ -86,6 +86,6 @@ ONNXTensorElementDataType CustomOpSegmentSum::GetInputType(size_t index) const {
     case 1:
       return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
     default:
-      throw std::runtime_error("Operator SegmentSum has 2 inputs.");
+      ORT_CXX_API_THROW("Operator SegmentSum has 2 inputs.", ORT_INVALID_ARGUMENT);
   }
 };
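Note: the replacement pattern above only works in a no-exceptions build because onnxruntime's C++ header defines ORT_CXX_API_THROW one way when exceptions are available and another way when they are not. The following is a minimal, self-contained sketch of that dual-mode idea; the DEMO_* names and the NO_EXCEPTIONS_DEMO switch are stand-ins, and the real macro in onnxruntime's headers may differ in detail.

    // Sketch only: a throw macro that degrades to report-and-abort when
    // C++ exceptions are compiled out (e.g. built with -fno-exceptions).
    #include <cstdio>
    #include <cstdlib>
    #include <string>

    enum DemoErrorCode { DEMO_INVALID_ARGUMENT = 2 };  // stand-in for OrtErrorCode

    #ifdef NO_EXCEPTIONS_DEMO
    // No-exceptions build: report the error and terminate instead of unwinding.
    #define DEMO_THROW(msg, code)                                        \
      do {                                                               \
        std::fprintf(stderr, "error %d: %s\n", static_cast<int>(code),   \
                     std::string(msg).c_str());                          \
        std::abort();                                                    \
      } while (false)
    #else
    // Normal build: behave like the throw statements this patch replaces.
    #include <stdexcept>
    #define DEMO_THROW(msg, code) throw std::runtime_error(msg)
    #endif

    int main() {
      // Either mode ends the program here: abort, or an uncaught exception.
      DEMO_THROW("segment_ids must be a single tensor", DEMO_INVALID_ARGUMENT);
    }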
diff --git a/operators/text/re2_strings/string_regex_replace.cc b/operators/text/re2_strings/string_regex_replace.cc
index 59b98171..a1ba640f 100644
--- a/operators/text/re2_strings/string_regex_replace.cc
+++ b/operators/text/re2_strings/string_regex_replace.cc
@@ -27,15 +27,15 @@ void KernelStringRegexReplace::Compute(OrtKernelContext* context) {
   OrtTensorDimensions pattern_dimensions(ort_, pattern);
   OrtTensorDimensions rewrite_dimensions(ort_, rewrite);
   if (pattern_dimensions.size() != 1 || pattern_dimensions[0] != 1)
-    throw std::runtime_error(MakeString(
+    ORT_CXX_API_THROW(MakeString(
         "pattern (second input) must contain only one element. It has ",
-        pattern_dimensions.size(), " dimensions."));
+        pattern_dimensions.size(), " dimensions."), ORT_INVALID_ARGUMENT);
   if (rewrite_dimensions.size() != 1 || rewrite_dimensions[0] != 1)
-    throw std::runtime_error(MakeString(
+    ORT_CXX_API_THROW(MakeString(
         "rewrite (third input) must contain only one element. It has ",
-        rewrite_dimensions.size(), " dimensions."));
+        rewrite_dimensions.size(), " dimensions."), ORT_INVALID_ARGUMENT);
   if (str_pattern[0].empty())
-    throw std::runtime_error("pattern (second input) cannot be empty.");
+    ORT_CXX_API_THROW("pattern (second input) cannot be empty.", ORT_INVALID_ARGUMENT);
 
   // Setup output
   OrtTensorDimensions dimensions(ort_, input);
diff --git a/operators/text/re2_strings/string_regex_split.cc b/operators/text/re2_strings/string_regex_split.cc
index 544a8cc8..ae0ad83b 100644
--- a/operators/text/re2_strings/string_regex_split.cc
+++ b/operators/text/re2_strings/string_regex_split.cc
@@ -24,15 +24,15 @@ void KernelStringRegexSplitWithOffsets::Compute(OrtKernelContext* context) {
   // Verifications
   OrtTensorDimensions keep_pattern_dimensions(ort_, keep_pattern);
   if (str_pattern.size() != 1)
-    throw std::runtime_error(MakeString(
+    ORT_CXX_API_THROW(MakeString(
         "pattern (second input) must contain only one element. It has ",
-        str_pattern.size(), " values."));
+        str_pattern.size(), " values."), ORT_INVALID_ARGUMENT);
   if (str_keep_pattern.size() > 1)
-    throw std::runtime_error(MakeString(
+    ORT_CXX_API_THROW(MakeString(
         "Third input must contain only one element. It has ",
-        str_keep_pattern.size(), " values."));
+        str_keep_pattern.size(), " values."), ORT_INVALID_ARGUMENT);
   if (str_pattern[0].empty())
-    throw std::runtime_error("Splitting pattern cannot be empty.");
+    ORT_CXX_API_THROW("Splitting pattern cannot be empty.", ORT_INVALID_ARGUMENT);
 
   OrtTensorDimensions dimensions(ort_, input);
   bool include_delimiter = (str_keep_pattern.size() == 1) && (!str_keep_pattern[0].empty());
@@ -106,7 +106,7 @@ ONNXTensorElementDataType CustomOpStringRegexSplitWithOffsets::GetOutputType(siz
     case 3:
       return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
     default:
-      throw std::runtime_error(MakeString(
-          "StringRegexSplitWithOffsets has 4 outputs but index is ", index, "."));
+      ORT_CXX_API_THROW(MakeString(
+          "StringRegexSplitWithOffsets has 4 outputs but index is ", index, "."), ORT_INVALID_ARGUMENT);
   }
 };
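Both re2 kernels repeat the same precondition: a pattern input must be a 1-D tensor holding exactly one non-empty string. A hedged sketch of that check factored into a helper follows; CheckScalarStringInput is a hypothetical name (not in the repo), and the repo's MakeString and ORT_CXX_API_THROW are assumed to be in scope.

    #include <cstdint>
    #include <string>
    #include <vector>

    // Hypothetical helper showing the validation shape shared by the
    // regex kernels above: a 1-D, single-element, non-empty string input.
    void CheckScalarStringInput(const std::vector<int64_t>& dims,
                                const std::vector<std::string>& values,
                                const char* name) {
      // The input must be a 1-D tensor with exactly one element...
      if (dims.size() != 1 || dims[0] != 1)
        ORT_CXX_API_THROW(MakeString(name, " must contain only one element. It has ",
                                     dims.size(), " dimensions."), ORT_INVALID_ARGUMENT);
      // ...and that element must be a non-empty string.
      if (values.empty() || values[0].empty())
        ORT_CXX_API_THROW(MakeString(name, " cannot be empty."), ORT_INVALID_ARGUMENT);
    }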
diff --git a/operators/tokenizer/gpt2_tokenizer.cc b/operators/tokenizer/gpt2_tokenizer.cc
index 9c17bb12..a9d70a5b 100644
--- a/operators/tokenizer/gpt2_tokenizer.cc
+++ b/operators/tokenizer/gpt2_tokenizer.cc
@@ -29,7 +29,7 @@ class SpecialTokenMap {
     auto it = token_map_.find(p_str);
     if (it != token_map_.end()) {
       if (it->second != p_id) {
-        throw std::runtime_error("Duplicate special tokens");
+        ORT_CXX_API_THROW("Duplicate special tokens.", ORT_INVALID_ARGUMENT);
       }
     } else {
       token_map_[p_str] = p_id;
@@ -84,7 +84,7 @@ class SpecialTokenMap {
     SpecialTokenInfo(ustring p_str, int p_id)
         : str(std::move(p_str)), id(p_id) {
       if (str.empty()) {
-        throw std::runtime_error("Empty special token.");
+        ORT_CXX_API_THROW("Empty special token.", ORT_INVALID_ARGUMENT);
      }
    }
  };
@@ -147,7 +147,7 @@ class VocabData {
       if ((line[0] == '#') && (index == 0)) continue;
       auto pos = line.find(' ');
       if (pos == std::string::npos) {
-        throw std::runtime_error("Cannot know how to parse line: " + line);
+        ORT_CXX_API_THROW("Cannot parse line: " + line, ORT_INVALID_ARGUMENT);
       }
       std::string w1 = line.substr(0, pos);
       std::string w2 = line.substr(pos + 1);
@@ -231,14 +231,14 @@ class VocabData {
   int TokenToID(const std::string& input) const {
     auto it = vocab_map_.find(input);
     if (it == vocab_map_.end()) {
-      throw std::runtime_error("Token not found: " + input);
+      ORT_CXX_API_THROW("Token not found: " + input, ORT_INVALID_ARGUMENT);
     }
     return it->second;
   }
 
   const std::string& IdToToken(int id) const {
     if ((id < 0) || (id >= id2token_map_.size())) {
-      throw std::runtime_error("Invalid ID: " + std::to_string(id));
+      ORT_CXX_API_THROW("Invalid ID: " + std::to_string(id), ORT_INVALID_ARGUMENT);
     }
     return id2token_map_[id];
   }
@@ -247,7 +247,7 @@ class VocabData {
   int GetVocabIndex(const std::string& str) {
     auto it = vocab_map_.find(str);
     if (it == vocab_map_.end()) {
-      throw std::runtime_error("Cannot find word in vocabulary: " + str);
+      ORT_CXX_API_THROW("Cannot find word in vocabulary: " + str, ORT_INVALID_ARGUMENT);
     }
     return it->second;
   }
@@ -467,12 +467,12 @@ KernelBpeTokenizer::KernelBpeTokenizer(OrtApi api, const OrtKernelInfo* info)
     : BaseKernel(api, info) {
   std::string vocab = ort_.KernelInfoGetAttribute(info, "vocab");
   if (vocab.empty()) {
-    throw std::runtime_error("vocabulary shouldn't be empty.");
+    ORT_CXX_API_THROW("vocabulary shouldn't be empty.", ORT_INVALID_ARGUMENT);
   }
 
   std::string merges = ort_.KernelInfoGetAttribute(info, "merges");
   if (merges.empty()) {
-    throw std::runtime_error("merges shouldn't be empty.");
+    ORT_CXX_API_THROW("merges shouldn't be empty.", ORT_INVALID_ARGUMENT);
   }
 
   if (!TryToGetAttribute("padding_length", padding_length_)) {
@@ -480,7 +480,7 @@ KernelBpeTokenizer::KernelBpeTokenizer(OrtApi api, const OrtKernelInfo* info)
   }
 
   if (padding_length_ != -1 && padding_length_ <= 0) {
-    throw std::runtime_error("padding_length should be more than 0 or equal -1");
+    ORT_CXX_API_THROW("padding_length should be greater than 0 or equal to -1", ORT_INVALID_ARGUMENT);
   }
 
   std::stringstream vocabu_stream(vocab);
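The tokenizer edits follow the same mechanical rule: every lookup or attribute check that used to throw std::runtime_error now routes through ORT_CXX_API_THROW, so a no-exceptions build fails fast with the same message. A reduced, self-contained sketch of the two vocabulary lookups from VocabData is shown below; the member names are copied from the diff, the surrounding class is elided, and ORT_CXX_API_THROW / ORT_INVALID_ARGUMENT are assumed to be in scope.

    #include <cstddef>
    #include <string>
    #include <unordered_map>
    #include <vector>

    // Reduced sketch of VocabData's lookups as changed in this patch.
    struct VocabSketch {
      std::unordered_map<std::string, int> vocab_map_;
      std::vector<std::string> id2token_map_;

      int TokenToID(const std::string& token) const {
        auto it = vocab_map_.find(token);
        if (it == vocab_map_.end())
          ORT_CXX_API_THROW("Token not found: " + token, ORT_INVALID_ARGUMENT);
        return it->second;
      }

      const std::string& IdToToken(int id) const {
        // The cast sidesteps the signed/unsigned comparison in the original check.
        if (id < 0 || static_cast<std::size_t>(id) >= id2token_map_.size())
          ORT_CXX_API_THROW("Invalid ID: " + std::to_string(id), ORT_INVALID_ARGUMENT);
        return id2token_map_[id];
      }
    };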
diff --git a/operators/tokenizer/wordpiece_tokenizer.cc b/operators/tokenizer/wordpiece_tokenizer.cc
index ae0eba0e..4822535b 100644
--- a/operators/tokenizer/wordpiece_tokenizer.cc
+++ b/operators/tokenizer/wordpiece_tokenizer.cc
@@ -200,4 +200,4 @@ ONNXTensorElementDataType CustomOpWordpieceTokenizer::GetOutputType(size_t index
     default:
       ORT_CXX_API_THROW(MakeString("[WordpieceTokenizer] Unexpected output index ", index), ORT_INVALID_ARGUMENT);
   }
-};
\ No newline at end of file
+};
diff --git a/tools/gen_selectedops.py b/tools/gen_selectedops.py
index 85124dd1..c5c5fe8f 100644
--- a/tools/gen_selectedops.py
+++ b/tools/gen_selectedops.py
@@ -4,19 +4,17 @@ import sys
 
 OPMAP_TO_CMAKE_FLAGS = {'BlingFireSentenceBreaker': 'OCOS_ENABLE_BLINGFIRE',
                         'GPT2Tokenizer': 'OCOS_ENABLE_GPT2_TOKENIZER',
                         'WordpieceTokenizer': 'OCOS_ENABLE_WORDPIECE_TOKENIZER',
-                        # Currently use one option for all string operators because their binary sizes are not large.
-                        # Would probably split to more options like tokenizers in the future.
-                        'StringRegexReplace': 'OCOS_ENABLE_RE2_REGEX',
-                        'StringRegexSplitWithOffsets': 'OCOS_ENABLE_RE2_REGEX',
                         'StringConcat': 'OCOS_ENABLE_TF_STRING',
+                        'StringECMARegexReplace': 'OCOS_ENABLE_TF_STRING',
+                        'StringECMARegexSplitWithOffsets': 'OCOS_ENABLE_TF_STRING',
                         'StringEqual': 'OCOS_ENABLE_TF_STRING',
                         'StringToHashBucket': 'OCOS_ENABLE_TF_STRING',
                         'StringToHashBucketFast': 'OCOS_ENABLE_TF_STRING',
                         'StringJoin': 'OCOS_ENABLE_TF_STRING',
                         'StringLength': 'OCOS_ENABLE_TF_STRING',
                         'StringLower': 'OCOS_ENABLE_TF_STRING',
-                        'StringECMARegexReplace': 'OCOS_ENABLE_TF_STRING',
-                        'StringECMARegexSplitWithOffsets': 'OCOS_ENABLE_TF_STRING',
+                        'StringRegexReplace': 'OCOS_ENABLE_RE2_REGEX',
+                        'StringRegexSplitWithOffsets': 'OCOS_ENABLE_RE2_REGEX',
                         'StringSplit': 'OCOS_ENABLE_TF_STRING',
                         'StringToVector': 'OCOS_ENABLE_TF_STRING',
                         'StringUpper': 'OCOS_ENABLE_TF_STRING',
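For reference, the OCOS_ENABLE_* values emitted from this table end up as build options and, in turn, compile definitions, so a deselected operator family drops out of the binary entirely. The snippet below is an illustrative sketch of that gating only, not the repo's actual registration code.

    #include <cstdio>

    // Illustrative only: an OCOS_ENABLE_* compile definition decides
    // whether an operator family is built in at all.
    int main() {
    #ifdef OCOS_ENABLE_RE2_REGEX
      std::puts("re2 string ops (StringRegexReplace, StringRegexSplitWithOffsets) built in");
    #else
      std::puts("re2 string ops compiled out");
    #endif
    #ifdef OCOS_ENABLE_GPT2_TOKENIZER
      std::puts("GPT2Tokenizer built in (requires c++ exceptions, per noexcep_ops.cmake)");
    #endif
      return 0;
    }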