A tutorial on building ort-extensions from source as a static library (#703)
* Tutorial on building from source as a static library
* Update the test flag control
* Add the tutorial
Parent: 3b889fc42f
Commit: 8645a846fb
@@ -45,9 +45,14 @@ set(CMAKE_CXX_EXTENSIONS OFF)
 include(CheckCXXCompilerFlag)
 include(CheckLanguage)
 
+set(_ORTX_STANDALONE_PROJECT OFF)
+if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
+  set(_ORTX_STANDALONE_PROJECT ON)
+endif()
+
 option(CC_OPTIMIZE "Allow compiler optimizations, Set to OFF to disable" ON)
 option(OCOS_ENABLE_PYTHON "Enable Python component building, (deprecated)" OFF)
-option(OCOS_ENABLE_CTEST "Enable C++ test" ON)
+option(OCOS_ENABLE_CTEST "Enable C++ test" ${_ORTX_STANDALONE_PROJECT})
 option(OCOS_ENABLE_CPP_EXCEPTIONS "Enable C++ Exception" ON)
 option(OCOS_ENABLE_TF_STRING "Enable String Operator Set" ON)
 option(OCOS_ENABLE_RE2_REGEX "Enable StringRegexReplace and StringRegexSplit" ON)
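The effect of this change: when onnxruntime-extensions is consumed as a subproject (for example via FetchContent or add_subdirectory), CMAKE_CURRENT_SOURCE_DIR differs from CMAKE_SOURCE_DIR, so _ORTX_STANDALONE_PROJECT stays OFF and OCOS_ENABLE_CTEST now defaults to off; a standalone checkout keeps building the C++ tests by default.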
@@ -877,6 +882,7 @@ if(OCOS_BUILD_APPLE_FRAMEWORK)
   endif()
 endif()
 
+if (_ORTX_STANDALONE_PROJECT)
 # clean up the requirements.txt files from 3rd party project folder to suppress the code security false alarms
 file(GLOB_RECURSE NO_USE_FILES ${CMAKE_BINARY_DIR}/_deps/*requirements.txt)
 message(STATUS "Found the following requirements.txt: ${NO_USE_FILES}")

@@ -887,6 +893,7 @@ endforeach()
 
 # Run CPack to generate the NuGet package
 include(CPack)
+endif()
 
 if(OCOS_ENABLE_CTEST)
   include(ext_tests)
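The same new guard wraps the requirements.txt cleanup and the CPack/NuGet packaging step, so those run only in standalone builds; a project that embeds the library does not inherit CPack targets from it.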
@@ -114,7 +114,7 @@ class BpeModel {
 
     id2token_map_.resize(vocab_map_.size());
     for (const auto& [t, i] : vocab_map_) {
-      if (i > static_cast<uint32_t>(std::numeric_limits<int32_t>::max())) {
+      if (i > static_cast<uint32_t>((std::numeric_limits<int32_t>::max)())) {
         continue;  // safe purpose.
       }
       if (i > id2token_map_.size()) {

@@ -183,7 +183,7 @@ class BpeModel {
 
     id2token_map_.resize(vocab_map_.size());
     for (const auto& [t, i] : vocab_map_) {
-      if (i > static_cast<uint32_t>(std::numeric_limits<int32_t>::max())) {
+      if (i > static_cast<uint32_t>((std::numeric_limits<int32_t>::max)())) {
         continue;  // safe purpose.
       }
       if (i > id2token_map_.size()) {

@@ -256,7 +256,7 @@ class BpeModel {
   void PerformBPE(std::list<std::pair<uint32_t, uint32_t>>& vals) const {
     while (vals.size() >= 2) {
       auto pos_it = vals.end();
-      uint32_t minval = std::numeric_limits<uint32_t>::max();
+      uint32_t minval = (std::numeric_limits<uint32_t>::max)();
       uint32_t ori_id1 = 0, ori_id2 = 0;
       uint32_t aim_id = 0;
       int token_length = 0;

@@ -355,7 +355,7 @@ class BpeModel {
   std::unordered_map<std::string, uint32_t> vocab_map_;
   std::vector<std::string> id2token_map_;
 
-  uint32_t unk_id_ = std::numeric_limits<uint32_t>::max();
+  uint32_t unk_id_ = (std::numeric_limits<uint32_t>::max)();
   bpe::SpecialTokenMap special_tokens_;
   TrieTree<char32_t> added_tokens_;
 };
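These four edits apply one pattern: the call std::numeric_limits<T>::max() becomes (std::numeric_limits<T>::max)(). The commit does not state the motivation, but the usual reason is compatibility with translation units that include <windows.h>, which defines a function-like max macro unless NOMINMAX is set; a function-like macro expands only when its name is immediately followed by '(', so parenthesizing the callee blocks the expansion. A minimal sketch of the clash (the macro below stands in for the one from <windows.h>):

#include <cstdint>
#include <iostream>
#include <limits>

// Stand-in for the function-like macro that <windows.h> defines by default.
#define max(a, b) (((a) > (b)) ? (a) : (b))

int main() {
  // int32_t bad = std::numeric_limits<int32_t>::max();  // fails to compile:
  //                                                     // `max(` triggers the macro
  int32_t big = (std::numeric_limits<int32_t>::max)();   // parentheses block expansion
  std::cout << big << std::endl;                         // prints 2147483647
  return 0;
}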
@@ -0,0 +1,19 @@
+cmake_minimum_required(VERSION 3.25)
+
+project(ortx_api_test)
+
+set(CMAKE_CXX_STANDARD 17)
+include(FetchContent)
+
+FetchContent_Declare(
+  ortx
+  GIT_REPOSITORY https://github.com/microsoft/onnxruntime-extensions.git
+  GIT_TAG a7043c56e4f19c4bf11642d390f7b502f80a34ba)
+
+set(OCOS_BUILD_PRESET token_api_only)
+FetchContent_MakeAvailable(ortx)
+
+file(GLOB_RECURSE SOURCES "src/*.cc")
+add_executable(ortx_api_test ${SOURCES})
+target_link_libraries(ortx_api_test onnxruntime_extensions)
+target_include_directories(ortx_api_test PRIVATE ${ortx_SOURCE_DIR}/include)
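Two details in this consumer CMakeLists are easy to miss: OCOS_BUILD_PRESET is set before FetchContent_MakeAvailable(ortx) so the value is already visible when the subproject is configured, and GIT_TAG pins an exact commit so the fetched sources stay reproducible. Because the library is configured as a subdirectory here, the _ORTX_STANDALONE_PROJECT check above evaluates to OFF, so its tests and packaging steps are skipped.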
@@ -0,0 +1,3 @@
+# Running the Demo
+
+To run this demo, you'll need a developer tool such as Visual Studio Code, or a command-line environment with CMake support, to configure the project. Once configured, compile the C++ target `ortx_api_test` to build the test program, and then run it.
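For a plain command-line build, a typical invocation (assuming a default generator) is `cmake -S . -B build` to configure and `cmake --build build` to compile; the first configure takes a while because FetchContent clones and builds onnxruntime-extensions at the pinned commit.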
@@ -0,0 +1,80 @@
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "ortx_tokenizer.h"
+
+extError_t tokenize_text(const OrtxTokenizer *tokenizer,
+                         const char *text, std::string &decoded_text, std::vector<extTokenId_t> &ids)
+{
+  OrtxTokenId2DArray *tok_2d_output = NULL;
+  const char *tok_input[] = {text};
+  extError_t err = OrtxTokenize(tokenizer, tok_input, 1, &tok_2d_output);
+  if (err != kOrtxOK)
+  {
+    return err;
+  }
+
+  size_t length = 0;
+  const extTokenId_t *token_ids = NULL;
+  OrtxTokenId2DArrayGetItem(tok_2d_output, 0, &token_ids, &length);
+
+  OrtxStringArray *detok_output = NULL;
+  err = OrtxDetokenize1D(tokenizer, token_ids, length, &detok_output);
+  if (err != kOrtxOK)
+  {
+    ORTX_DISPOSE(tok_2d_output);
+    return err;
+  }
+  ids.insert(ids.end(), token_ids, token_ids + length);
+
+  const char *decoded_str = NULL;
+  OrtxStringArrayGetItem(detok_output, 0, &decoded_str);
+  decoded_text = decoded_str;
+
+  ORTX_DISPOSE(tok_2d_output);
+  ORTX_DISPOSE(detok_output);
+  return kOrtxOK;
+}
+
+int main()
+{
+  int ver = OrtxGetAPIVersion();
+  std::cout << "Ortx API version: " << ver << std::endl;
+  OrtxTokenizer *tokenizer = NULL;
+
+  std::cout << "Please specify the tokenizer model file path (like <root>/test/data/llama2)" << std::endl;
+  std::string model_path;
+  std::cin >> model_path;
+
+  extError_t err = OrtxCreateTokenizer(&tokenizer, model_path.c_str());
+  if (err != kOrtxOK)
+  {
+    std::cerr << "Failed to create tokenizer" << std::endl;
+    return 1;
+  }
+
+  const char *input = "How many hours does it take a man to eat a Helicopter?";
+  std::string decoded_text;
+  std::vector<extTokenId_t> ids;
+  err = tokenize_text(tokenizer, input, decoded_text, ids);
+  if (err != kOrtxOK)
+  {
+    std::cerr << "Failed to tokenize text" << std::endl;
+    return 1;
+  }
+
+  std::cout << "Input : " << input << std::endl;
+  // output the token ids
+  std::cout << "Token IDs: ";
+  for (const auto &id : ids)
+  {
+    std::cout << id << " ";
+  }
+  std::cout << std::endl;
+
+  std::cout << "Decoded: " << decoded_text << std::endl;
+
+  OrtxDisposeOnly(tokenizer); // Clean up the tokenizer
+  return 0;
+}
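The demo exercises the full round trip of the tokenizer C API: OrtxCreateTokenizer loads the model from a directory such as <root>/test/data/llama2, OrtxTokenize returns a 2-D id array that is read back with OrtxTokenId2DArrayGetItem, and OrtxDetokenize1D turns the ids back into text retrieved via OrtxStringArrayGetItem. Note the explicit ownership discipline: each intermediate array is released with ORTX_DISPOSE, and the tokenizer itself with OrtxDisposeOnly.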