A tutorial on building ort-extensions from source as a static library (#703)
* Tutorial on building from source as a static library
* Update the test flag control
* Add the tutorial
Parent: 3b889fc42f
Commit: 8645a846fb
@@ -45,9 +45,14 @@ set(CMAKE_CXX_EXTENSIONS OFF)
 include(CheckCXXCompilerFlag)
 include(CheckLanguage)
 
+set(_ORTX_STANDALONE_PROJECT OFF)
+if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
+  set(_ORTX_STANDALONE_PROJECT ON)
+endif()
+
 option(CC_OPTIMIZE "Allow compiler optimizations, Set to OFF to disable" ON)
 option(OCOS_ENABLE_PYTHON "Enable Python component building, (deprecated)" OFF)
-option(OCOS_ENABLE_CTEST "Enable C++ test" ON)
+option(OCOS_ENABLE_CTEST "Enable C++ test" ${_ORTX_STANDALONE_PROJECT})
 option(OCOS_ENABLE_CPP_EXCEPTIONS "Enable C++ Exception" ON)
 option(OCOS_ENABLE_TF_STRING "Enable String Operator Set" ON)
 option(OCOS_ENABLE_RE2_REGEX "Enable StringRegexReplace and StringRegexSplit" ON)
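The effect of this change: when onnxruntime-extensions is consumed as a subproject (for example via FetchContent or add_subdirectory), CMAKE_CURRENT_SOURCE_DIR differs from CMAKE_SOURCE_DIR, so _ORTX_STANDALONE_PROJECT stays OFF and OCOS_ENABLE_CTEST now defaults to off; a standalone checkout keeps building the C++ tests by default.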
@@ -877,6 +882,7 @@ if(OCOS_BUILD_APPLE_FRAMEWORK)
   endif()
 endif()
 
+if (_ORTX_STANDALONE_PROJECT)
 # clean up the requirements.txt files from 3rd party project folder to suppress the code security false alarms
 file(GLOB_RECURSE NO_USE_FILES ${CMAKE_BINARY_DIR}/_deps/*requirements.txt)
 message(STATUS "Found the following requirements.txt: ${NO_USE_FILES}")

@@ -887,6 +893,7 @@ endforeach()
 
 # Run CPack to generate the NuGet package
 include(CPack)
+endif()
 
 if(OCOS_ENABLE_CTEST)
   include(ext_tests)
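The same new guard wraps the requirements.txt cleanup and the CPack/NuGet packaging step, so those run only in standalone builds; a project that embeds the library does not inherit CPack targets from it.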
@@ -114,7 +114,7 @@ class BpeModel {
 
     id2token_map_.resize(vocab_map_.size());
     for (const auto& [t, i] : vocab_map_) {
-      if (i > static_cast<uint32_t>(std::numeric_limits<int32_t>::max())) {
+      if (i > static_cast<uint32_t>((std::numeric_limits<int32_t>::max)())) {
         continue;  // safe purpose.
       }
       if (i > id2token_map_.size()) {

@@ -183,7 +183,7 @@ class BpeModel {
 
     id2token_map_.resize(vocab_map_.size());
     for (const auto& [t, i] : vocab_map_) {
-      if (i > static_cast<uint32_t>(std::numeric_limits<int32_t>::max())) {
+      if (i > static_cast<uint32_t>((std::numeric_limits<int32_t>::max)())) {
         continue;  // safe purpose.
       }
       if (i > id2token_map_.size()) {

@@ -256,7 +256,7 @@ class BpeModel {
   void PerformBPE(std::list<std::pair<uint32_t, uint32_t>>& vals) const {
     while (vals.size() >= 2) {
       auto pos_it = vals.end();
-      uint32_t minval = std::numeric_limits<uint32_t>::max();
+      uint32_t minval = (std::numeric_limits<uint32_t>::max)();
       uint32_t ori_id1 = 0, ori_id2 = 0;
       uint32_t aim_id = 0;
       int token_length = 0;

@@ -355,7 +355,7 @@ class BpeModel {
   std::unordered_map<std::string, uint32_t> vocab_map_;
   std::vector<std::string> id2token_map_;
 
-  uint32_t unk_id_ = std::numeric_limits<uint32_t>::max();
+  uint32_t unk_id_ = (std::numeric_limits<uint32_t>::max)();
   bpe::SpecialTokenMap special_tokens_;
   TrieTree<char32_t> added_tokens_;
 };
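These four edits apply one pattern: the call std::numeric_limits<T>::max() becomes (std::numeric_limits<T>::max)(). The commit does not state the motivation, but the usual reason is compatibility with translation units that include <windows.h>, which defines a function-like max macro unless NOMINMAX is set; a function-like macro expands only when its name is immediately followed by '(', so parenthesizing the callee blocks the expansion. A minimal sketch of the clash (the macro below stands in for the one from <windows.h>):

#include <cstdint>
#include <iostream>
#include <limits>

// Stand-in for the function-like macro that <windows.h> defines by default.
#define max(a, b) (((a) > (b)) ? (a) : (b))

int main() {
  // int32_t bad = std::numeric_limits<int32_t>::max();  // fails to compile:
  //                                                     // `max(` triggers the macro
  int32_t big = (std::numeric_limits<int32_t>::max)();   // parentheses block expansion
  std::cout << big << std::endl;                         // prints 2147483647
  return 0;
}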
@@ -0,0 +1,19 @@
+cmake_minimum_required(VERSION 3.25)
+
+project(ortx_api_test)
+
+set(CMAKE_CXX_STANDARD 17)
+include(FetchContent)
+
+FetchContent_Declare(
+  ortx
+  GIT_REPOSITORY https://github.com/microsoft/onnxruntime-extensions.git
+  GIT_TAG a7043c56e4f19c4bf11642d390f7b502f80a34ba)
+
+set(OCOS_BUILD_PRESET token_api_only)
+FetchContent_MakeAvailable(ortx)
+
+file(GLOB_RECURSE SOURCES "src/*.cc")
+add_executable(ortx_api_test ${SOURCES})
+target_link_libraries(ortx_api_test onnxruntime_extensions)
+target_include_directories(ortx_api_test PRIVATE ${ortx_SOURCE_DIR}/include)
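Two details in this consumer CMakeLists are easy to miss: OCOS_BUILD_PRESET is set before FetchContent_MakeAvailable(ortx) so the value is already visible when the subproject is configured, and GIT_TAG pins an exact commit so the fetched sources stay reproducible. Because the library is configured as a subdirectory here, the _ORTX_STANDALONE_PROJECT check above evaluates to OFF, so its tests and packaging steps are skipped.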
@@ -0,0 +1,3 @@
+# Running the Demo
+
+To run this demo, you'll need a developer tool such as Visual Studio Code, or a command-line environment with CMake support, to configure the project. Once configured, compile the C++ target `ortx_api_test` to build the test program, and then run it.
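For a plain command-line build, a typical invocation (assuming a default generator) is `cmake -S . -B build` to configure and `cmake --build build` to compile; the first configure takes a while because FetchContent clones and builds onnxruntime-extensions at the pinned commit.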
@@ -0,0 +1,80 @@
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "ortx_tokenizer.h"
+
+extError_t tokenize_text(const OrtxTokenizer *tokenizer,
+                         const char *text, std::string &decoded_text, std::vector<extTokenId_t> &ids)
+{
+  OrtxTokenId2DArray *tok_2d_output = NULL;
+  const char *tok_input[] = {text};
+  extError_t err = OrtxTokenize(tokenizer, tok_input, 1, &tok_2d_output);
+  if (err != kOrtxOK)
+  {
+    return err;
+  }
+
+  size_t length = 0;
+  const extTokenId_t *token_ids = NULL;
+  OrtxTokenId2DArrayGetItem(tok_2d_output, 0, &token_ids, &length);
+
+  OrtxStringArray *detok_output = NULL;
+  err = OrtxDetokenize1D(tokenizer, token_ids, length, &detok_output);
+  if (err != kOrtxOK)
+  {
+    ORTX_DISPOSE(tok_2d_output);
+    return err;
+  }
+  ids.insert(ids.end(), token_ids, token_ids + length);
+
+  const char *decoded_str = NULL;
+  OrtxStringArrayGetItem(detok_output, 0, &decoded_str);
+  decoded_text = decoded_str;
+
+  ORTX_DISPOSE(tok_2d_output);
+  ORTX_DISPOSE(detok_output);
+  return kOrtxOK;
+}
+
+int main()
+{
+  int ver = OrtxGetAPIVersion();
+  std::cout << "Ortx API version: " << ver << std::endl;
+  OrtxTokenizer *tokenizer = NULL;
+
+  std::cout << "Please specify the tokenizer model file path (like <root>/test/data/llama2)" << std::endl;
+  std::string model_path;
+  std::cin >> model_path;
+
+  extError_t err = OrtxCreateTokenizer(&tokenizer, model_path.c_str());
+  if (err != kOrtxOK)
+  {
+    std::cerr << "Failed to create tokenizer" << std::endl;
+    return 1;
+  }
+
+  const char *input = "How many hours does it take a man to eat a Helicopter?";
+  std::string decoded_text;
+  std::vector<extTokenId_t> ids;
+  err = tokenize_text(tokenizer, input, decoded_text, ids);
+  if (err != kOrtxOK)
+  {
+    std::cerr << "Failed to tokenize text" << std::endl;
+    return 1;
+  }
+
+  std::cout << "Input : " << input << std::endl;
+  // output the token ids
+  std::cout << "Token IDs: ";
+  for (const auto &id : ids)
+  {
+    std::cout << id << " ";
+  }
+  std::cout << std::endl;
+
+  std::cout << "Decoded: " << decoded_text << std::endl;
+
+  OrtxDisposeOnly(tokenizer); // Clean up the tokenizer
+  return 0;
+}
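The demo exercises the full round trip of the tokenizer C API: OrtxCreateTokenizer loads the model from a directory such as <root>/test/data/llama2, OrtxTokenize returns a 2-D id array that is read back with OrtxTokenId2DArrayGetItem, and OrtxDetokenize1D turns the ids back into text retrieved via OrtxStringArrayGetItem. Note the explicit ownership discipline: each intermediate array is released with ORTX_DISPOSE, and the tokenizer itself with OrtxDisposeOnly.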