Fix the code security issue and prepare the 0.5 C++ release. (#274)

* Fix the code security issue and prepare the 0.5 C++ release. * more fixes * use vswhere to locate vsdevcmd
2022-08-02 10:09:35 -07:00 · 2022-08-02 10:09:35 -07:00 · 5320af1eea
--- a/.az/mshost.yaml
+++ b/.az/mshost.yaml
@ -223,6 +223,20 @@ jobs:
        displayName: Unpack ONNXRuntime package.

      - script: |
+          @echo off
+          set vswherepath="%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe"
+          for /f "usebackq delims=" %%i in (`%vswherepath% -latest -property installationPath`) do (
+            if exist "%%i\Common7\Tools\vsdevcmd.bat" (
+              set vsdevcmd="%%i\Common7\Tools\vsdevcmd.bat"
+            )
+          )
+
+          @echo %vsdevcmd% will be used as the VC compiler
+          @echo ##vso[task.setvariable variable=vsdevcmd]%vsdevcmd%
+        displayName: 'locate vsdevcmd via vswhere'
+
+      - script: |
+          call $(vsdevcmd)
          call .\build.bat -DONNXRUNTIME_LIB_DIR=.\onnxruntime-win-x64-$(ort.version)\lib -DOCOS_ENABLE_CTEST=ON
        displayName: build the customop library with onnxruntime

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -10,8 +10,8 @@ endif()

 set(CPACK_PACKAGE_NAME "onnxruntime_extensions")
 set(CPACK_PACKAGE_VERSION_MAJOR "0")
-set(CPACK_PACKAGE_VERSION_MINOR "3")
-set(CPACK_PACKAGE_VERSION_PATCH "1")
+set(CPACK_PACKAGE_VERSION_MINOR "5")
+set(CPACK_PACKAGE_VERSION_PATCH "0")
 set(VERSION ${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})


@ -244,7 +244,7 @@ if (OCOS_ENABLE_TF_STRING)
  target_include_directories(ocos_operators PUBLIC
    ${googlere2_SOURCE_DIR}
    ${farmhash_SOURCE_DIR}/src)
-  list(APPEND OCOS_COMPILE_DEFINITIONS ENABLE_TF_STRING NOMINMAX FARMHASH_NO_BUILTIN_EXPECT)
+  list(APPEND OCOS_COMPILE_DEFINITIONS ENABLE_TF_STRING NOMINMAX FARMHASH_NO_BUILTIN_EXPECT FARMHASH_DEBUG=0)
  list(APPEND ocos_libraries re2)
 endif()

--- a/build.bat
+++ b/build.bat
@ -1,14 +1,8 @@
@ECHO OFF
-ECHO Copy this file to mybuild.bat and make any changes you deem necessary
 SETLOCAL ENABLEDELAYEDEXPANSION
 IF DEFINED VSINSTALLDIR GOTO :VSDEV_CMD
-set VCVARS="NOT/EXISTED"
-FOR %%I in (Enterprise Professional Community BuildTools^
-  ) DO IF EXIST "%ProgramFiles(x86)%\Microsoft Visual Studio\2019\%%I\VC\Auxiliary\Build\vcvars64.bat" (
-       SET VCVARS="%ProgramFiles(x86)%\Microsoft Visual Studio\2019\%%I\VC\Auxiliary\Build\vcvars64.bat" )
+IF NOT DEFINED VCVARS GOTO :NOT_FOUND

-IF NOT EXIST %VCVARS% GOTO :NOT_FOUND
-ECHO Found %VCVARS%
 CALL %VCVARS%

 :VSDEV_CMD
@ -18,15 +12,15 @@ set GENERATOR="Visual Studio 17 2022"

 :START_BUILD
 mkdir .\out\Windows\ 2>NUL
-cmake -G %GENERATOR% -A x64 %* -B out\Windows -S .
+"%VSINSTALLDIR%Common7\IDE\CommonExtensions\Microsoft\CMake\CMake\bin\cmake.exe" -G %GENERATOR% -A x64 %* -B out\Windows -S .
 IF %ERRORLEVEL% NEQ 0 EXIT /B %ERRORLEVEL%
 cmake --build out\Windows --config RelWithDebInfo
 IF %ERRORLEVEL% NEQ 0 EXIT /B %ERRORLEVEL%
 GOTO :EOF

 :NOT_FOUND
-ECHO "No Microsoft Visual Studio 2019 installation found!"
-ECHO "  Or not run from Developer Command Prompt for VS 2022"
+ECHO "No Microsoft Visual Studio installation found!"
+ECHO "  Please run build from Developer Command Prompt"
 EXIT /B 1

 ENDLOCAL
--- a/includes/onnxruntime/onnxruntime_cxx_api.h
+++ b/includes/onnxruntime/onnxruntime_cxx_api.h
@ -466,8 +466,8 @@ struct MemoryAllocation {
  ~MemoryAllocation();
  MemoryAllocation(const MemoryAllocation&) = delete;
  MemoryAllocation& operator=(const MemoryAllocation&) = delete;
-  MemoryAllocation(MemoryAllocation&&);
-  MemoryAllocation& operator=(MemoryAllocation&&);
+  MemoryAllocation(MemoryAllocation&&) noexcept;
+  MemoryAllocation& operator=(MemoryAllocation&&) noexcept;

  void* get() { return p_; }
  size_t size() const { return size_; }
--- a/includes/onnxruntime/onnxruntime_cxx_inline.h
+++ b/includes/onnxruntime/onnxruntime_cxx_inline.h
@ -64,11 +64,11 @@ inline MemoryAllocation::~MemoryAllocation() {
  }
 }

-inline MemoryAllocation::MemoryAllocation(MemoryAllocation&& o) : allocator_(nullptr), p_(nullptr), size_(0) {
+inline MemoryAllocation::MemoryAllocation(MemoryAllocation&& o) noexcept : allocator_(nullptr), p_(nullptr), size_(0) {
  *this = std::move(o);
 }

-inline MemoryAllocation& MemoryAllocation::operator=(MemoryAllocation&& o) {
+inline MemoryAllocation& MemoryAllocation::operator=(MemoryAllocation&& o) noexcept {
  OrtAllocator* alloc = nullptr;
  void* p = nullptr;
  size_t sz = 0;
--- a/onnxruntime_extensions/_version.py
+++ b/onnxruntime_extensions/_version.py
@ -3,4 +3,4 @@
 # license information.
 ###############################################################################

-__version__ = "0.4.2"
+__version__ = "0.5.0"
--- a/operators/string_utils.cc
+++ b/operators/string_utils.cc
@ -174,7 +174,7 @@ uint64_t Hash64(const char* data, size_t n, uint64_t seed) {
      h ^= ByteAs64(data[2]) << 16;
    case 2:
      h ^= ByteAs64(data[1]) << 8;
-    case 1:
+    default: // case 1: make some code analyzer be happier.
      h ^= ByteAs64(data[0]);
      h *= m;
  }
--- a/operators/text/op_equal_impl.hpp
+++ b/operators/text/op_equal_impl.hpp
@ -37,7 +37,7 @@ class BroadcastIteratorRight {
  }

  struct BroadcastIteratorRightState {
-    const BroadcastIteratorRight<T1, T2, T3>* parent;
+    const BroadcastIteratorRight<T1, T2, T3>* parent = nullptr;
    std::vector<int64_t> index1;
    const T1* p1;
    const T1* end_;
--- a/operators/text/string_ecmaregex_split.hpp
+++ b/operators/text/string_ecmaregex_split.hpp
@ -30,7 +30,7 @@ void ECMARegexSplitImpl(const std::string& input, const std::regex& pattern,
                        std::vector<std::string_view>& tokens,
                        std::vector<T>& begin_offsets,
                        std::vector<T>& end_offsets) {
-  int prev_pos = 0;
+  size_t prev_pos = 0;
  for (auto it = std::sregex_iterator(input.begin(), input.end(), pattern); it != std::sregex_iterator(); it++) {
    int cur_pos = it->position();
    int matched_length = it->length();
--- a/operators/text/string_lower.cc
+++ b/operators/text/string_lower.cc
@ -17,7 +17,7 @@ void KernelStringLower::Compute(OrtKernelContext* context) {
  GetTensorMutableDataString(api_, ort_, context, input_X, X);

  for (int64_t i = 0; i < (int64_t)X.size(); ++i) {
-    std::transform(X[i].begin(), X[i].end(), X[i].begin(), ToLower);
+    std::transform(X[i].begin(), X[i].end(), X[i].begin(), [](char c) {return static_cast<char>(ToLower(c));});
  }

  OrtTensorDimensions dimensions(ort_, input_X);
--- a/operators/tokenizer/bert_tokenizer.cc
+++ b/operators/tokenizer/bert_tokenizer.cc
@ -97,8 +97,8 @@ void WordpieceTokenizer::GreedySearch(const ustring& token, std::vector<ustring>
    return;
  }

-  int start = 0;
-  int end = -1;
+  size_t start = 0;
+  size_t end = 0;
  ustring substr;
  for (; start < token.size();) {
    end = token.size();
@ -146,12 +146,12 @@ void TruncateStrategy::Truncate(std::vector<int64_t>& ids1, std::vector<int64_t>
    case TruncateStrategyType::LONGEST_FROM_BACK:

      if ((ids1_keep_len > half_max_len) && (ids2_keep_len > half_max_len)) {
-        ids1_keep_len = max_len - half_max_len;
+        ids1_keep_len = static_cast<size_t>(max_len) - half_max_len;
        ids2_keep_len = half_max_len;
      } else if (ids2_keep_len > ids1_keep_len) {
-        ids2_keep_len = max_len - ids1_keep_len;
+        ids2_keep_len = static_cast<size_t>(max_len) - ids1_keep_len;
      } else {
-        ids1_keep_len = max_len - ids2_keep_len;
+        ids1_keep_len = static_cast<size_t>(max_len) - ids2_keep_len;
      }

      if (strategy_ == TruncateStrategyType::LONGEST_FIRST) {
@ -179,6 +179,7 @@ BertTokenizer::BertTokenizer(
    const std::string& truncation_strategy)
    : do_basic_tokenize_(do_basic_tokenize), max_length_(max_len)
    , truncate_(std::make_unique<TruncateStrategy>(truncation_strategy)) {
+
  vocab_ = std::make_shared<BertTokenizerVocab>(vocab);

  if (do_basic_tokenize) {
--- a/operators/tokenizer/bert_tokenizer_decoder.hpp
+++ b/operators/tokenizer/bert_tokenizer_decoder.hpp
@ -19,11 +19,11 @@ class BertTokenizerDecoder {

 private:
  std::string unk_token_;
-  int32_t unk_token_id_;
-  int32_t sep_token_id_;
-  int32_t pad_token_id_;
-  int32_t cls_token_id_;
-  int32_t mask_token_id_;
+  int32_t unk_token_id_ = -1;
+  int32_t sep_token_id_ = -1;
+  int32_t pad_token_id_ = -1;
+  int32_t cls_token_id_ = -1;
+  int32_t mask_token_id_ = -1;
  std::string suffix_indicator_;
  std::vector<std::string_view> vocab_;
  std::string raw_vocab_;
--- a/operators/tokenizer/gpt2_tokenizer.cc
+++ b/operators/tokenizer/gpt2_tokenizer.cc
@ -535,7 +535,7 @@ std::vector<int64_t> KernelBpeTokenizer::Tokenize(const ustring& input, int64_t
    }
  }

-  return std::move(res);
+  return res;
 }

 void KernelBpeTokenizer::Compute(OrtKernelContext* context) {
--- a/operators/tokenizer/wordpiece_tokenizer.cc
+++ b/operators/tokenizer/wordpiece_tokenizer.cc
@ -28,8 +28,8 @@ void KernelWordpieceTokenizer_Split(const std::u32string& suffix_indicator,
                                    const std::u32string& text,
                                    std::vector<std::u32string>& words) {
  ustring space(" ");
-  int pos = 0;
-  int last = 0;
+  size_t pos = 0;
+  size_t last = 0;
  words.clear();
  for (; pos < text.size(); ++pos) {
    if (text[pos] == space[0]) {
@ -57,7 +57,7 @@ void KernelWordpieceTokenizer_Tokenizer(const std::unordered_map<std::u32string,
  std::vector<std::u32string> words;
  bool is_bad;
  bool no_existing_rows = n_existing_rows == 0;
-  int start, end;
+  size_t start = 0, end = 0;
  std::u32string substr;
  int64_t cur_substr;
  tokens.clear();
--- a/pyop/pykernel.h
+++ b/pyop/pykernel.h
@ -9,7 +9,7 @@

 struct PyCustomOpDef {
  std::string op_type;
-  uint64_t obj_id;
+  uint64_t obj_id = 0;
  std::vector<int> input_types;
  std::vector<int> output_types;
  std::vector<std::string> attrs;
@ -88,7 +88,7 @@ struct PyCustomOpFactory : Ort::CustomOpBase<PyCustomOpFactory, PyCustomOpKernel
    return static_cast<ONNXTensorElementDataType>(opdef_->output_types[idx]);
  }

-  const PyCustomOpDef* opdef_;
+  const PyCustomOpDef* opdef_ = nullptr;
  std::string op_type_;
  std::string op_domain_;
 };