A tutorial on building ort-extensions from source as a static library (#703)

* Add a tutorial on building from source as a static library

* Update the test flag control

* Add the tutorial
Wenbing Li 2024-05-01 13:46:27 -07:00 committed by GitHub
Parent 3b889fc42f
Commit 8645a846fb
No key found matching this signature
GPG key ID: B5690EEEBB952194
5 changed files with 114 additions and 5 deletions

View file

@@ -45,9 +45,14 @@ set(CMAKE_CXX_EXTENSIONS OFF)
include(CheckCXXCompilerFlag)
include(CheckLanguage)

+set(_ORTX_STANDALONE_PROJECT OFF)
+if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
+  set(_ORTX_STANDALONE_PROJECT ON)
+endif()
+
option(CC_OPTIMIZE "Allow compiler optimizations, Set to OFF to disable" ON)
option(OCOS_ENABLE_PYTHON "Enable Python component building, (deprecated)" OFF)
-option(OCOS_ENABLE_CTEST "Enable C++ test" ON)
+option(OCOS_ENABLE_CTEST "Enable C++ test" ${_ORTX_STANDALONE_PROJECT})
option(OCOS_ENABLE_CPP_EXCEPTIONS "Enable C++ Exception" ON)
option(OCOS_ENABLE_TF_STRING "Enable String Operator Set" ON)
option(OCOS_ENABLE_RE2_REGEX "Enable StringRegexReplace and StringRegexSplit" ON)
@@ -877,6 +882,7 @@ if(OCOS_BUILD_APPLE_FRAMEWORK)
  endif()
endif()

+if (_ORTX_STANDALONE_PROJECT)
# clean up the requirements.txt files from 3rd party project folder to suppress the code security false alarms
file(GLOB_RECURSE NO_USE_FILES ${CMAKE_BINARY_DIR}/_deps/*requirements.txt)
message(STATUS "Found the following requirements.txt: ${NO_USE_FILES}")

@@ -887,6 +893,7 @@ endforeach()

# Run CPack to generate the NuGet package
include(CPack)
+endif()

if(OCOS_ENABLE_CTEST)
  include(ext_tests)
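For reference, the check above is the standard CMake idiom for detecting a top-level build: `CMAKE_CURRENT_SOURCE_DIR` equals `CMAKE_SOURCE_DIR` only when this `CMakeLists.txt` is the root of the build tree, so the C++ tests now default to ON for a standalone checkout and OFF when the project is pulled in via `add_subdirectory` or `FetchContent`. A minimal sketch of the same idiom in an unrelated project (all names here are hypothetical):

```cmake
cmake_minimum_required(VERSION 3.25)
project(my_lib)

# ON only when this file is the root CMakeLists.txt of the build.
set(_MY_LIB_STANDALONE OFF)
if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
  set(_MY_LIB_STANDALONE ON)
endif()

# Tests build by default only in a standalone checkout; a parent project
# that consumes my_lib via FetchContent gets them OFF unless it opts in.
option(MY_LIB_ENABLE_TESTS "Enable C++ tests" ${_MY_LIB_STANDALONE})
if (MY_LIB_ENABLE_TESTS)
  enable_testing()
  add_subdirectory(test)  # hypothetical test directory
endif()
```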

View file

@@ -114,7 +114,7 @@ class BpeModel {
    id2token_map_.resize(vocab_map_.size());
    for (const auto& [t, i] : vocab_map_) {
-      if (i > static_cast<uint32_t>(std::numeric_limits<int32_t>::max())) {
+      if (i > static_cast<uint32_t>((std::numeric_limits<int32_t>::max)())) {
        continue;  // safe purpose.
      }
      if (i > id2token_map_.size()) {
@@ -183,7 +183,7 @@ class BpeModel {
    id2token_map_.resize(vocab_map_.size());
    for (const auto& [t, i] : vocab_map_) {
-      if (i > static_cast<uint32_t>(std::numeric_limits<int32_t>::max())) {
+      if (i > static_cast<uint32_t>((std::numeric_limits<int32_t>::max)())) {
        continue;  // safe purpose.
      }
      if (i > id2token_map_.size()) {
@@ -256,7 +256,7 @@ class BpeModel {
  void PerformBPE(std::list<std::pair<uint32_t, uint32_t>>& vals) const {
    while (vals.size() >= 2) {
      auto pos_it = vals.end();
-      uint32_t minval = std::numeric_limits<uint32_t>::max();
+      uint32_t minval = (std::numeric_limits<uint32_t>::max)();
      uint32_t ori_id1 = 0, ori_id2 = 0;
      uint32_t aim_id = 0;
      int token_length = 0;
@@ -355,7 +355,7 @@ class BpeModel {
  std::unordered_map<std::string, uint32_t> vocab_map_;
  std::vector<std::string> id2token_map_;
-  uint32_t unk_id_ = std::numeric_limits<uint32_t>::max();
+  uint32_t unk_id_ = (std::numeric_limits<uint32_t>::max)();
  bpe::SpecialTokenMap special_tokens_;
  TrieTree<char32_t> added_tokens_;
};
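The only change in this file is small but deliberate: wrapping `std::numeric_limits<T>::max` in parentheses stops the preprocessor from treating it as a call to the function-like `max` macro that `<windows.h>` defines when `NOMINMAX` is not set, which matters once these headers are compiled inside arbitrary consumer projects. A self-contained sketch of the failure mode, using a stand-in macro instead of the real Windows header:

```cpp
#include <cstdint>
#include <iostream>
#include <limits>

// Stand-in for the macro <windows.h> defines when NOMINMAX is absent.
#define max(a, b) (((a) > (b)) ? (a) : (b))

int main()
{
  // std::numeric_limits<int32_t>::max() would fail to compile here: the
  // preprocessor sees `max(` and expects the two-argument macro.
  // The extra parentheses keep `max` from being parsed as a macro call.
  int32_t big = (std::numeric_limits<int32_t>::max)();
  std::cout << big << std::endl;  // prints 2147483647
  return 0;
}
```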

View file

@@ -0,0 +1,19 @@
cmake_minimum_required(VERSION 3.25)
project(ortx_api_test)

set(CMAKE_CXX_STANDARD 17)

include(FetchContent)
FetchContent_Declare(
  ortx
  GIT_REPOSITORY https://github.com/microsoft/onnxruntime-extensions.git
  GIT_TAG a7043c56e4f19c4bf11642d390f7b502f80a34ba)

set(OCOS_BUILD_PRESET token_api_only)
FetchContent_MakeAvailable(ortx)

file(GLOB_RECURSE SOURCES "src/*.cc")
add_executable(ortx_api_test ${SOURCES})
target_link_libraries(ortx_api_test onnxruntime_extensions)
target_include_directories(ortx_api_test PRIVATE ${ortx_SOURCE_DIR}/include)
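Note that `file(GLOB_RECURSE SOURCES "src/*.cc")` assumes the demo sources live in a `src/` folder next to this `CMakeLists.txt`, so the project layout would look roughly like this (file names are illustrative):

```
ortx_api_test/
├── CMakeLists.txt    # the file above
└── src/
    └── main.cc       # the test program shown below
```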

View file

@@ -0,0 +1,3 @@
# Running the Demo
To run this demo, you need a development environment that supports CMake, such as Visual Studio Code or the CMake command-line tools, to configure the project. Once configured, build the C++ target `ortx_api_test` and run the resulting test program.
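With command-line CMake, the steps might look like this (the binary path varies by generator; multi-config generators such as Visual Studio place it under a per-configuration subdirectory):

```sh
cmake -S . -B build      # configure; the first run fetches onnxruntime-extensions
cmake --build build      # compile the ortx_api_test target
./build/ortx_api_test    # run, then enter a tokenizer path such as <root>/test/data/llama2
```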

View file

@@ -0,0 +1,80 @@
#include <iostream>
#include <string>
#include <vector>

#include "ortx_tokenizer.h"

// Tokenize a single string, then detokenize the resulting ids, returning
// both the token ids and the round-tripped text.
extError_t tokenize_text(const OrtxTokenizer *tokenizer,
                         const char *text, std::string &decoded_text, std::vector<extTokenId_t> &ids)
{
  OrtxTokenId2DArray *tok_2d_output = NULL;
  const char *tok_input[] = {text};
  extError_t err = OrtxTokenize(tokenizer, tok_input, 1, &tok_2d_output);
  if (err != kOrtxOK)
  {
    return err;
  }

  // The 2-D array holds one row of token ids per input string; fetch row 0.
  size_t length = 0;
  const extTokenId_t *token_ids = NULL;
  OrtxTokenId2DArrayGetItem(tok_2d_output, 0, &token_ids, &length);

  OrtxStringArray *detok_output = NULL;
  err = OrtxDetokenize1D(tokenizer, token_ids, length, &detok_output);
  if (err != kOrtxOK)
  {
    ORTX_DISPOSE(tok_2d_output);
    return err;
  }
  ids.insert(ids.end(), token_ids, token_ids + length);

  const char *decoded_str = NULL;
  OrtxStringArrayGetItem(detok_output, 0, &decoded_str);
  decoded_text = decoded_str;

  ORTX_DISPOSE(tok_2d_output);
  ORTX_DISPOSE(detok_output);
  return kOrtxOK;
}

int main()
{
  int ver = OrtxGetAPIVersion();
  std::cout << "Ortx API version: " << ver << std::endl;

  OrtxTokenizer *tokenizer = NULL;
  std::cout << "Please specify the tokenizer model file path (like <root>/test/data/llama2)" << std::endl;
  std::string model_path;
  std::cin >> model_path;

  extError_t err = OrtxCreateTokenizer(&tokenizer, model_path.c_str());
  if (err != kOrtxOK)
  {
    std::cerr << "Failed to create tokenizer" << std::endl;
    return 1;
  }

  const char *input = "How many hours does it take a man to eat a Helicopter?";
  std::string decoded_text;
  std::vector<extTokenId_t> ids;
  err = tokenize_text(tokenizer, input, decoded_text, ids);
  if (err != kOrtxOK)
  {
    std::cerr << "Failed to tokenize text" << std::endl;
    return 1;
  }

  std::cout << "Input : " << input << std::endl;

  // output the token ids
  std::cout << "Token IDs: ";
  for (const auto &id : ids)
  {
    std::cout << id << " ";
  }
  std::cout << std::endl;

  std::cout << "Decoded: " << decoded_text << std::endl;

  OrtxDisposeOnly(tokenizer); // Clean up the tokenizer
  return 0;
}
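Because `OrtxTokenize` takes an array of strings plus a count, the same calls extend to batch input. A sketch built only from the functions already used above, assuming (as the single-input case suggests) one row of ids per input string; error checks are trimmed for brevity:

```cpp
// Hypothetical batch variant of the tokenize step in main().
const char *batch[] = {"Hello world", "How many hours does it take a man to eat a Helicopter?"};
OrtxTokenId2DArray *batch_output = NULL;
if (OrtxTokenize(tokenizer, batch, 2, &batch_output) == kOrtxOK)
{
  for (size_t row = 0; row < 2; ++row)
  {
    size_t len = 0;
    const extTokenId_t *row_ids = NULL;
    OrtxTokenId2DArrayGetItem(batch_output, row, &row_ids, &len); // ids for batch[row]
    std::cout << "sequence " << row << ": " << len << " tokens" << std::endl;
  }
  ORTX_DISPOSE(batch_output);
}
```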