Support the Unigram tokenizer kind from sentencepiece library (#811)
* initial commit
* UGM vocab loading works
* tests pass
* fix unit test on win32
* finish the parity check
* code refinement
* code refinement for review
This commit is contained in:
Parent 0d5d19f67b
Commit 176c1d0138
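For orientation before the diff: a minimal, illustrative sketch of how the Unigram (UGM) tokenizer added here is loaded and invoked, pieced together from the SpmUgmTokenizer, TokenJsonConfig and ortc::Tensor interfaces that appear in the changes below. The directory path and input string are placeholders, error handling is abbreviated, and the snippet assumes `using namespace ort_extensions;`.

// Illustrative sketch only, not part of this commit.
bpe::TokenJsonConfig cfg;
OrtxStatus status = cfg.Load("path/to/model_dir");   // resolves tokenizer_config.json / tokenizer.json

SpmUgmTokenizer ugm;
if (status.IsOk()) {
  status = ugm.Load(cfg);                             // builds vocab, scores and the precompiled charsmap
}

ortc::Tensor<std::string> input(std::vector<std::string>{"Hello world"});
ortc::Tensor<int64_t> ids(&CppAllocator::Instance()); // output token ids
if (status.IsOk()) {
  status = ugm.Compute(input, ids);
}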
@@ -693,7 +693,7 @@ if(OCOS_ENABLE_BLINGFIRE)
endif()

if(OCOS_ENABLE_GPT2_TOKENIZER OR OCOS_ENABLE_WORDPIECE_TOKENIZER)
target_include_directories(ocos_operators PUBLIC ${nlohmann_json_SOURCE_DIR}/single_include)
target_include_directories(ocos_operators PUBLIC ${nlohmann_json_SOURCE_DIR}/include)
endif()

# If building a shared library we can't throw an internal exception type across the library boundary as the type
@@ -11,14 +11,6 @@
#include <Windows.h>
#endif

#define ORTX_RETURN_IF_ERROR(expr) \
do { \
auto _status = (expr); \
if (_status != nullptr) { \
return _status; \
} \
} while (0)

template <typename T>
bool TryParseStringWithClassicLocale(std::string_view str, T& value) {
if constexpr (std::is_integral<T>::value && std::is_unsigned<T>::value) {

@@ -111,8 +103,3 @@ T ParseEnvironmentVariableWithDefault(const std::string& name, const T& default_

return default_value;
}

inline bool IsScalarOr1ElementVector(size_t num_dimensions, int64_t shape_size) {
if (num_dimensions == 0 || (num_dimensions == 1 && shape_size == 1)) return true;
return false;
}
@@ -12,19 +12,20 @@
#include <string>
#include <fstream>

namespace ort_extensions {

class path {
public:
path() = default;
path(const std::string& path) : path_(path) {
explicit path(const std::string& path) : path_(path) {
#ifdef _WIN32
w_path_ = to_wstring();
#endif // _WIN32
};

#ifdef _WIN32
path(const std::wstring& wpath) {
explicit path(const std::wstring& wpath) {
int size_needed = WideCharToMultiByte(CP_UTF8, 0, wpath.c_str(), -1, nullptr, 0, nullptr, nullptr);
std::string utf8_str(size_needed, 0);
WideCharToMultiByte(CP_UTF8, 0, wpath.c_str(), -1, &utf8_str[0], size_needed, nullptr, nullptr);

@@ -53,31 +54,65 @@ class path {
return path_;
}

path join(const std::string& path) const {
return path_ + separator + path;
path join(const std::string& str) const {
return path(path_ + separator + str);
}

path operator/(const std::string& path) const {
return join(path);
path operator/(const std::string& str) const {
return join(str);
}

path operator/(const path& path) {
return join(path.path_);
}

bool is_regular_file() const {
auto info = get_stat();
return (info.st_mode & S_IFREG) != 0;
}

bool is_directory() const {
auto info = get_stat();
return (info.st_mode & S_IFDIR) != 0;
}

std::string extension() const {
return path_.substr(path_.find_last_of('.'));
}

std::string parent_path() const {
std::string sep = {separator};
#ifdef _WIN32
sep += "/";
#endif // _WIN32
auto pos = path_.find_last_of(sep);
if (pos == std::string::npos) {
return "";
}
return path_.substr(0, pos);
}

#ifdef _WIN32
struct _stat64 get_stat() const {
struct _stat64 info;
if (_wstat64(w_path_.c_str(), &info) != 0) {
return false;
return {};
}
return info;
}
#else
struct stat get_stat() const {
struct stat info;
if (stat(path_.c_str(), &info) != 0) {
return false;
return {};
}
return info;
}
#endif // _WIN32
return (info.st_mode & S_IFDIR) != 0;

bool exists() const {
auto info = get_stat();
return (info.st_mode & S_IFMT) != 0;
}

private:

@@ -95,3 +130,5 @@ class path {
};

} // namespace ort_extensions

namespace ortx = ort_extensions;
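The minimal ort_extensions::path helper above stands in for std::filesystem in this codebase. A short usage sketch, illustrative only, exercising the member functions shown in the diff ("models/llama" is a placeholder directory):

// Illustrative only, not part of the change.
ortx::path dir("models/llama");
ortx::path vocab = dir / "tokenizer.json";  // operator/ joins with the platform separator
if (vocab.exists() && !vocab.is_directory()) {
  std::ifstream ifs = vocab.open();         // open() returns a std::ifstream
}
std::string ext = vocab.extension();        // ".json"
std::string parent = vocab.parent_path();   // "models/llama"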
@@ -1,10 +1,8 @@
FetchContent_Declare(nlohmann_json
GIT_REPOSITORY https://github.com/nlohmann/json.git
GIT_TAG v3.10.5
SOURCE_SUBDIR single_include
SOURCE_SUBDIR not_set
)

set(JSON_BuildTests OFF CACHE INTERNAL "")

add_compile_definitions(JSON_HAS_CPP_17=1)
FetchContent_MakeAvailable(nlohmann_json)
|
@ -464,10 +464,10 @@ class Tensor<std::string> : public TensorBase {
|
|||
void SetStringOutput(const std::vector<const char*>& ss, const std::vector<int64_t>& dims) {
|
||||
storage_->SetStringOutput(ss, dims);
|
||||
}
|
||||
const Span<std::string>& AsSpan() {
|
||||
const Span<std::string>& AsSpan() const {
|
||||
ORTX_CXX_API_THROW("span for TensorT of string not implemented", ORT_RUNTIME_EXCEPTION);
|
||||
}
|
||||
const std::string& AsScalar() {
|
||||
const std::string& AsScalar() const {
|
||||
auto& ss = storage_->Data();
|
||||
if (ss.size() != 1) {
|
||||
ORTX_CXX_API_THROW("to get a scalar, shape must be {1}, actual shape: " + Shape2Str(), ORT_RUNTIME_EXCEPTION);
|
||||
|
@ -542,10 +542,10 @@ class Tensor<std::string_view> : public TensorBase {
|
|||
void SetStringOutput(const std::vector<const char*>& ss, const std::vector<int64_t>& dims) {
|
||||
storage_->SetStringOutput(ss, dims);
|
||||
}
|
||||
const Span<std::string_view>& AsSpan() {
|
||||
const Span<std::string_view>& AsSpan() const {
|
||||
ORTX_CXX_API_THROW("span for TensorT of string not implemented", ORT_RUNTIME_EXCEPTION);
|
||||
}
|
||||
const std::string_view& AsScalar() {
|
||||
const std::string_view& AsScalar() const {
|
||||
auto& ss = storage_->Data();
|
||||
if (ss.size() != 1) {
|
||||
ORTX_CXX_API_THROW("to get a scalar, shape must be {1}, actual shape: " + Shape2Str(), ORT_RUNTIME_EXCEPTION);
|
||||
|
|
|
@ -26,7 +26,3 @@ ORT_API_CALL GetActiveOrtAPIVersion();
|
|||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_C_API)
|
||||
#include "ortx_tokenizer.h"
|
||||
#endif // ENABLE_C_API
|
||||
|
|
|
@@ -7,7 +7,6 @@

// OrtW: ONNX Runtime C ABI Wrapper
namespace OrtW {

struct CustomOpApi {
CustomOpApi(const OrtApi& api) : api_(api) {}

@@ -371,3 +370,16 @@ struct OrtTensorDimensions : std::vector<int64_t> {
return size() == 1;
}
};

inline bool IsScalarOr1ElementVector(size_t num_dimensions, int64_t shape_size) {
if (num_dimensions == 0 || (num_dimensions == 1 && shape_size == 1)) return true;
return false;
}

#define ORTX_RETURN_IF_ERROR(expr) \
do { \
auto _status = (expr); \
if (_status != nullptr) { \
return _status; \
} \
} while (0)
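ORTX_RETURN_IF_ERROR is relocated into this header (it was removed from the common header in an earlier hunk). A hypothetical usage sketch of the early-return pattern it enables; InitA and InitB are made-up placeholders for any calls returning an OrtStatusPtr (nullptr on success):

// Hypothetical example only.
OrtStatusPtr SetupKernel(const OrtApi& api, const OrtKernelInfo& info) {
  ORTX_RETURN_IF_ERROR(InitA(api, info));  // propagates the status immediately if non-null
  ORTX_RETURN_IF_ERROR(InitB(api, info));
  return nullptr;                          // success
}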
@ -4,7 +4,6 @@
|
|||
#pragma once
|
||||
#include "ocos.h"
|
||||
#include "add_mul_impl.cuh"
|
||||
#include "ortx_common.h"
|
||||
|
||||
namespace contrib {
|
||||
|
||||
|
|
|
@ -4,7 +4,6 @@
|
|||
#pragma once
|
||||
#include "ocos.h"
|
||||
#include "mul_sigmoid_impl.cuh"
|
||||
#include "ortx_common.h"
|
||||
|
||||
namespace contrib {
|
||||
|
||||
|
|
|
@ -4,7 +4,6 @@
|
|||
#pragma once
|
||||
#include "ocos.h"
|
||||
#include "negxplus1_impl.cuh"
|
||||
#include "ortx_common.h"
|
||||
|
||||
namespace contrib {
|
||||
|
||||
|
|
|
@ -4,7 +4,6 @@
|
|||
#pragma once
|
||||
#include "ocos.h"
|
||||
#include "replace_zero_impl.cuh"
|
||||
#include "ortx_common.h"
|
||||
|
||||
namespace contrib {
|
||||
|
||||
|
|
|
@ -4,7 +4,6 @@
|
|||
#pragma once
|
||||
#include "ocos.h"
|
||||
#include "transpose_cast_impl.cuh"
|
||||
#include "ortx_common.h"
|
||||
|
||||
namespace contrib {
|
||||
|
||||
|
|
|
@ -8,7 +8,6 @@
|
|||
#include <opencv2/imgcodecs.hpp>
|
||||
|
||||
#include "ocos.h"
|
||||
#include "ortx_common.h"
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
|
|
|
@ -3,10 +3,6 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include "ocos.h"
|
||||
#include "ustring.h"
|
||||
#include "narrow.h"
|
||||
#include "ortx_common.h"
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <locale>
|
||||
|
@ -17,6 +13,10 @@
|
|||
#include <algorithm>
|
||||
#include <sstream>
|
||||
|
||||
#include "ustring.h"
|
||||
#include "narrow.h"
|
||||
#include "tokjson_types.h"
|
||||
|
||||
struct KernelBpeDecoder {
|
||||
public:
|
||||
virtual ~KernelBpeDecoder() = default;
|
||||
|
@ -115,8 +115,6 @@ struct KernelBpeDecoder {
|
|||
arr_vocab_.shrink_to_fit();
|
||||
}
|
||||
|
||||
const std::string spm_underscore{"\xe2\x96\x81"};
|
||||
|
||||
static bool IsSpmByteWord(std::string_view word) {
|
||||
return word.size() == 6 && word[0] == '<' && word[1] == '0' && word[2] == 'x' && word[5] == '>';
|
||||
}
|
||||
|
@ -146,7 +144,7 @@ struct KernelBpeDecoder {
|
|||
std::copy(ids_dim.begin(), ids_dim.begin() + ids_dim.size() - 1, output_dim.begin());
|
||||
}
|
||||
|
||||
bool spm_mode = byte_decoder_.count(ustring(spm_underscore)[0]) > 0;
|
||||
bool spm_mode = byte_decoder_.count(ustring(ort_extensions::spm_escaped_space)[0]) > 0;
|
||||
|
||||
size_t seq_len = ids_dim.back();
|
||||
size_t string_batch = ids.NumberOfElement() / seq_len;
|
||||
|
@ -180,7 +178,7 @@ struct KernelBpeDecoder {
|
|||
char token = {static_cast<char>(strtol(buf, NULL, 16))};
|
||||
decoded_token.push_back(token);
|
||||
} else {
|
||||
decoded_token.append(ReplaceAll(piece, spm_underscore, " "));
|
||||
decoded_token.append(ReplaceAll(piece, std::string(ort_extensions::spm_escaped_space), " "));
|
||||
}
|
||||
} else {
|
||||
// the common bpe case
|
||||
|
|
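The decoder branch above handles two SentencePiece conventions: byte-fallback tokens spelled like "<0x20>" and the escaped space "\xe2\x96\x81" (U+2581, now named ort_extensions::spm_escaped_space). A small standalone sketch of that piece-to-text step, illustrative only and not the kernel itself:

#include <cstdlib>
#include <string>

// Decode one SentencePiece piece: "<0xNN>" becomes the raw byte 0xNN,
// otherwise every escaped space marker becomes an ordinary ' '.
std::string DecodeSpmPiece(const std::string& piece) {
  const std::string escaped_space = "\xe2\x96\x81";
  if (piece.size() == 6 && piece[0] == '<' && piece[1] == '0' && piece[2] == 'x' && piece[5] == '>') {
    char buf[3] = {piece[3], piece[4], 0};  // e.g. "20" from "<0x20>"
    return std::string(1, static_cast<char>(strtol(buf, nullptr, 16)));
  }
  std::string out = piece;
  for (size_t pos = out.find(escaped_space); pos != std::string::npos; pos = out.find(escaped_space)) {
    out.replace(pos, escaped_space.size(), " ");
  }
  return out;
}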
|
@ -7,12 +7,14 @@
|
|||
#include "file_sys.h"
|
||||
#include "nlohmann/json.hpp"
|
||||
|
||||
#include "bpe_types.h"
|
||||
#include "tokjson_types.h"
|
||||
|
||||
namespace ort_extensions::bpe {
|
||||
|
||||
class TokenJsonConfig final {
|
||||
public:
|
||||
static constexpr const char* kDefaultVocabFile = "tokenizer.json";
|
||||
|
||||
TokenJsonConfig() {}
|
||||
~TokenJsonConfig() {}
|
||||
using json = nlohmann::json;
|
||||
|
@ -24,36 +26,59 @@ class TokenJsonConfig final {
|
|||
return OrtxStatus(kOrtxErrorInvalidArgument, "json_path is empty.");
|
||||
}
|
||||
|
||||
auto file_path = path(json_path) / "tokenizer_config.json";
|
||||
std::ifstream ifs = file_path.open();
|
||||
if (!ifs.is_open()) {
|
||||
return OrtxStatus(kOrtxErrorInvalidFile, "Failed to open a json file: " + file_path.string());
|
||||
ortx::path tok_dir(json_path);
|
||||
ortx::path vocab_path(json_path);
|
||||
ortx::path tok_path_obj(json_path);
|
||||
if (tok_path_obj.is_directory()) {
|
||||
vocab_path = tok_dir / kDefaultVocabFile;
|
||||
} else {
|
||||
if (!tok_path_obj.exists()) {
|
||||
return OrtxStatus(kOrtxErrorInvalidFile, "Invalid file: " + tok_path_obj.string());
|
||||
}
|
||||
|
||||
tok_dir = ortx::path(tok_path_obj.parent_path());
|
||||
}
|
||||
|
||||
auto vocab_file_path = path(json_path) / "tokenizer.json";
|
||||
vocab_path_ = vocab_file_path.string();
|
||||
std::ifstream vocab_fs = vocab_file_path.open();
|
||||
if (!vocab_fs.is_open()) {
|
||||
// No tokenizer.json file present; search for tokenizer module file
|
||||
auto module_file_path = path(json_path) / "tokenizer_module.json";
|
||||
module_path_ = module_file_path.string();
|
||||
std::ifstream tok_module_ifs = module_file_path.open();
|
||||
if (!tok_module_ifs.is_open()) {
|
||||
return OrtxStatus(kOrtxErrorInvalidFile, "No tokenizer.json or tokenizer_module.json file found.");
|
||||
auto config_path = tok_dir / "tokenizer_config.json";
|
||||
std::ifstream ifs = config_path.open();
|
||||
if (!ifs.is_open()) {
|
||||
return OrtxStatus(kOrtxErrorInvalidFile, "Failed to open a json file: " + config_path.string());
|
||||
}
|
||||
|
||||
nlohmann::json json_config = nlohmann::json::parse(ifs);
|
||||
auto module_cfg = tok_dir / "tokenizer_module.json";
|
||||
if (module_cfg.exists()) {
|
||||
module_path_ = module_cfg.string();
|
||||
std::ifstream module_ifs = module_cfg.open();
|
||||
nlohmann::json module_config = nlohmann::json::parse(module_ifs);
|
||||
json_config.update(module_config);
|
||||
}
|
||||
|
||||
model_max_length_ = json_config.value("model_max_length", 1e+30);
|
||||
std::string tiktoken_file = json_config.value("tiktoken_file", "");
|
||||
if (!tiktoken_file.empty()) {
|
||||
auto tktok_path = tok_dir / tiktoken_file;
|
||||
if (tktok_path.exists()) {
|
||||
vocab_path_ = tktok_path.string();
|
||||
} else {
|
||||
nlohmann::json tok_module_json_config = nlohmann::json::parse(tok_module_ifs);
|
||||
auto tiktoken_path = tok_module_json_config.value("tiktoken_file", "");
|
||||
vocab_file_path = path(json_path) / tiktoken_path.c_str();
|
||||
vocab_path_ = vocab_file_path.string();
|
||||
return OrtxStatus(kOrtxErrorInvalidFile, "Invalid file: " + tiktoken_file);
|
||||
}
|
||||
} else {
|
||||
if (ortx::path(vocab_path).exists()) {
|
||||
vocab_path_ = vocab_path.string();
|
||||
} else {
|
||||
return OrtxStatus(kOrtxErrorInvalidFile, "Invalid file: " + vocab_path.string());
|
||||
}
|
||||
}
|
||||
nlohmann::json json_config = nlohmann::json::parse(ifs);
|
||||
|
||||
tokenizer_class_ = json_config.value("tokenizer_class", "");
|
||||
if (tokenizer_class_.empty()) {
|
||||
return {};
|
||||
}
|
||||
|
||||
add_bos_token_ = json_config.value("add_bos_token", false);
|
||||
add_eos_token_ = json_config.value("add_eos_token", false);
|
||||
clean_up_tokenization_spaces_ = json_config.value("clean_up_tokenization_spaces", false);
|
||||
model_max_length_ = json_config.value("model_max_length", 1e+30);
|
||||
|
||||
tokenizer_class_ = json_config.value("tokenizer_class", "");
|
||||
|
||||
auto tok_iter = json_config.find("bos_token");
|
||||
if (tok_iter != json_config.end() && !tok_iter->is_null()) {
|
||||
|
@ -73,18 +98,12 @@ class TokenJsonConfig final {
|
|||
pad_token_ = json_config.value("pad_token", "");
|
||||
}
|
||||
|
||||
if (tokenizer_class_.empty()) {
|
||||
return OrtxStatus(kOrtxErrorCorruptData, "Failed to get tokenizer class.");
|
||||
}
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
const std::string& GetVocabDataFile() const { return vocab_path_; }
|
||||
|
||||
const std::string& GetTikTokenModuleFile() const {
|
||||
return module_path_;
|
||||
}
|
||||
const std::string& GetTikTokenModuleFile() const { return module_path_; }
|
||||
|
||||
public:
|
||||
bool add_bos_token_{};
|
|
@ -1,11 +1,10 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "ortx_common.h"
|
||||
#include "file_sys.h"
|
||||
|
||||
#include "bpe_kernels.h"
|
||||
#include "bpe_json.hpp"
|
||||
#include "bpe_jsoncfg.hpp"
|
||||
#include "bpe_tokenizer.hpp"
|
||||
|
||||
#include "base64.h"
|
||||
|
@ -660,151 +659,7 @@ struct VectorEqual {
|
|||
}
|
||||
};
|
||||
|
||||
OrtxStatus JsonFastTokenizer::Load(const ort_extensions::bpe::TokenJsonConfig& config) {
|
||||
std::string voc_file = config.GetVocabDataFile();
|
||||
std::ifstream ifs = path(voc_file).open();
|
||||
if (!ifs.is_open()) {
|
||||
return OrtxStatus(kOrtxErrorInvalidFile, "Failed to open json file: " + voc_file);
|
||||
}
|
||||
|
||||
// consider to use SAX parser for large json file
|
||||
nlohmann::json tok_json;
|
||||
std::ifstream module_ifs;
|
||||
|
||||
// Following vocab and merges only used for tiktoken case but accessed outside scope below
|
||||
std::unordered_map<std::string, uint32_t> vocab;
|
||||
std::vector<std::pair<std::string, std::string>> merges;
|
||||
|
||||
if (tiktoken_){
|
||||
std::string module_file = config.GetTikTokenModuleFile();
|
||||
|
||||
module_ifs = path(module_file).open();
|
||||
if (!module_ifs.is_open()) {
|
||||
return OrtxStatus(kOrtxErrorInvalidFile, "Failed to open module file: " + module_file);
|
||||
}
|
||||
|
||||
std::unordered_map<std::vector<uint8_t>, uint32_t, VectorHash, VectorEqual> bpe_ranks;
|
||||
|
||||
std::string line;
|
||||
while (std::getline(ifs, line)) {
|
||||
if (!line.empty()) {
|
||||
std::istringstream lineStream(line);
|
||||
std::string token;
|
||||
uint32_t rank;
|
||||
while (lineStream >> token >> rank) {
|
||||
// Decode base64 token and convert rank to int
|
||||
std::vector<uint8_t> decoded_token;
|
||||
base64_decode(token, decoded_token);
|
||||
// Store bpe token and rank
|
||||
bpe_ranks[decoded_token] = rank;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::tuple<std::vector<uint8_t>, std::vector<uint8_t>, uint32_t>> byte_merges;
|
||||
|
||||
bbpe_tokenizer_ = std::make_unique<BpeModel>();
|
||||
|
||||
for (const auto& item : bpe_ranks) {
|
||||
std::vector<uint8_t> token = item.first;
|
||||
uint32_t rank = item.second;
|
||||
vocab[JsonFastTokenizer::TokenBytesToString(token)] = rank;
|
||||
|
||||
if (token.size() == 1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
std::vector<std::tuple<std::vector<uint8_t>, std::vector<uint8_t>, uint32_t>> local;
|
||||
for (size_t index = 1; index < token.size(); index++) {
|
||||
std::vector<uint8_t> piece_l(token.begin(), token.begin() + index);
|
||||
std::vector<uint8_t> piece_r(token.begin() + index, token.end());
|
||||
if (bpe_ranks.count(piece_l) && bpe_ranks.count(piece_r)) {
|
||||
local.emplace_back(piece_l, piece_r, rank);
|
||||
}
|
||||
}
|
||||
|
||||
auto compare_bpe_tuples = [&](const std::tuple<std::vector<uint8_t>, std::vector<uint8_t>, uint32_t>& a,
|
||||
const std::tuple<std::vector<uint8_t>, std::vector<uint8_t>, uint32_t>& b) {
|
||||
// Compare comparator based on the ranks in bpe_ranks
|
||||
return bpe_ranks[std::get<0>(a)] < bpe_ranks[std::get<0>(b)] ||
|
||||
(bpe_ranks[std::get<0>(a)] == bpe_ranks[std::get<0>(b)] && bpe_ranks[std::get<1>(a)] < bpe_ranks[std::get<1>(b)]);
|
||||
};
|
||||
|
||||
std::sort(local.begin(), local.end(), compare_bpe_tuples);
|
||||
|
||||
byte_merges.insert(byte_merges.end(), local.begin(), local.end());
|
||||
}
|
||||
|
||||
// Custom comparator that compares the third element of the tuples
|
||||
auto compare_merge_tuples = [&](const std::tuple<std::vector<uint8_t>, std::vector<uint8_t>, uint32_t>& a,
|
||||
const std::tuple<std::vector<uint8_t>, std::vector<uint8_t>, uint32_t>& b) {
|
||||
return std::get<2>(a) < std::get<2>(b);
|
||||
};
|
||||
|
||||
std::sort(byte_merges.begin(), byte_merges.end(), compare_merge_tuples);
|
||||
|
||||
// Populate merges
|
||||
for (auto& val : byte_merges) {
|
||||
merges.push_back({JsonFastTokenizer::TokenBytesToString(std::get<0>(val)), JsonFastTokenizer::TokenBytesToString(std::get<1>(val))});
|
||||
}
|
||||
}
|
||||
|
||||
const char token_sub[] = "Tokenizer";
|
||||
model_name_ = config.tokenizer_class_.substr(0, config.tokenizer_class_.find(token_sub));
|
||||
json_conf_.name_ = model_name_.c_str();
|
||||
json_conf_.bos_token_ = config.bos_token_.c_str();
|
||||
json_conf_.eos_token_ = config.eos_token_.c_str();
|
||||
json_conf_.unk_token_ = config.unk_token_.c_str();
|
||||
json_conf_.pad_token_ = config.pad_token_.c_str();
|
||||
|
||||
// re-bind the configuration object
|
||||
bpe_conf_ = json_conf_;
|
||||
|
||||
OrtxStatus status;
|
||||
if (tiktoken_){
|
||||
status = bbpe_tokenizer_->Load(vocab,
|
||||
merges,
|
||||
bpe_conf_.get().GetSpecialTokens().c_str(),
|
||||
false);
|
||||
|
||||
module_ifs >> tok_json;
|
||||
} else {
|
||||
ifs >> tok_json;
|
||||
// doesn't work for json with nested objects
|
||||
// auto decoders_node = tok_json.find("/decoder/decoders"_json_pointer);
|
||||
bool has_decoders_node = false;
|
||||
auto decoders_node = tok_json.end();
|
||||
auto decoder_node = tok_json.find("decoder");
|
||||
if (decoder_node != tok_json.end()) {
|
||||
decoders_node = decoder_node->find("decoders");
|
||||
if (decoders_node != decoder_node->end()) {
|
||||
has_decoders_node = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (has_decoders_node && decoders_node->is_array()) {
|
||||
for(auto step = decoders_node->begin(); step != decoders_node->end(); ++step) {
|
||||
std::string type = step->value("type", "");
|
||||
if (type == "Replace") {
|
||||
std::string target = step->value("/pattern/String"_json_pointer, "");
|
||||
if (target == "\xe2\x96\x81") {
|
||||
json_conf_.spm_model_ = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
auto model_node = tok_json.find("model");
|
||||
if (model_node == tok_json.end()) {
|
||||
return OrtxStatus(kOrtxErrorCorruptData, "Failed to get model node from tokenizer.json");
|
||||
}
|
||||
|
||||
bbpe_tokenizer_ = std::make_unique<BpeModel>();
|
||||
status = bbpe_tokenizer_->Load(*model_node,
|
||||
bpe_conf_.get().GetSpecialTokens().c_str(),
|
||||
bpe_conf_.get().spm_model_);
|
||||
}
|
||||
|
||||
OrtxStatus JsonFastTokenizer::LoadAddedTokens(const json& tok_json, const ort_extensions::bpe::TokenJsonConfig& config) {
|
||||
auto added_tokens = tok_json.find("added_tokens");
|
||||
if (added_tokens != tok_json.end()) {
|
||||
for (const auto& token : *added_tokens) {
|
||||
|
@ -829,18 +684,30 @@ OrtxStatus JsonFastTokenizer::Load(const ort_extensions::bpe::TokenJsonConfig& c
|
|||
}
|
||||
}
|
||||
|
||||
if (!status.IsOk()) {
|
||||
return status;
|
||||
}
|
||||
return bbpe_tokenizer_->LoadAddedTokens(added_tokens_);
|
||||
}
|
||||
|
||||
status = bbpe_tokenizer_->LoadAddedTokens(added_tokens_);
|
||||
if (!status.IsOk()) {
|
||||
return status;
|
||||
// Helper methods (to be added to the class declaration)
|
||||
bool JsonFastTokenizer::CheckForSpmModel(const json& tok_json) {
|
||||
auto decoder_node = tok_json.find("decoder");
|
||||
if (decoder_node != tok_json.end()) {
|
||||
auto decoders_node = decoder_node->find("decoders");
|
||||
if (decoders_node != decoder_node->end() && decoders_node->is_array()) {
|
||||
for (const auto& step : *decoders_node) {
|
||||
std::string type = step.value("type", "");
|
||||
if (type == "Replace") {
|
||||
std::string target = step.value("/pattern/String"_json_pointer, "");
|
||||
if (target == spm_escaped_space) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
add_bos_token_ = config.add_bos_token_;
|
||||
add_eos_token_ = config.add_eos_token_;
|
||||
// add_bos_token is default as false, we need to check post_processor json to see if it is true
|
||||
void JsonFastTokenizer::UpdateTokenAdditionFlags(const json& tok_json, const ort_extensions::bpe::TokenJsonConfig& config) {
|
||||
if (!config.add_bos_token_ && !config.bos_token_.empty()) {
|
||||
auto post_processor = tok_json.find("post_processor");
|
||||
if (post_processor != tok_json.end()) {
|
||||
|
@ -853,6 +720,167 @@ OrtxStatus JsonFastTokenizer::Load(const ort_extensions::bpe::TokenJsonConfig& c
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
OrtxStatus JsonFastTokenizer::Load(const ort_extensions::bpe::TokenJsonConfig& config) {
|
||||
std::string voc_file = config.GetVocabDataFile();
|
||||
std::ifstream ifs = path(voc_file).open();
|
||||
if (!ifs.is_open()) {
|
||||
return OrtxStatus(kOrtxErrorInvalidFile, "Failed to open json file: " + voc_file);
|
||||
}
|
||||
|
||||
nlohmann::json tok_json;
|
||||
ifs >> tok_json;
|
||||
|
||||
const char token_sub[] = "Tokenizer";
|
||||
model_name_ = config.tokenizer_class_.substr(0, config.tokenizer_class_.find(token_sub));
|
||||
json_conf_.name_ = model_name_.c_str();
|
||||
json_conf_.bos_token_ = config.bos_token_.c_str();
|
||||
json_conf_.eos_token_ = config.eos_token_.c_str();
|
||||
json_conf_.unk_token_ = config.unk_token_.c_str();
|
||||
json_conf_.pad_token_ = config.pad_token_.c_str();
|
||||
|
||||
// re-bind the configuration object
|
||||
bpe_conf_ = json_conf_;
|
||||
|
||||
// Check for SPM model
|
||||
json_conf_.spm_model_ = CheckForSpmModel(tok_json);
|
||||
|
||||
auto model_node = tok_json.find("model");
|
||||
if (model_node == tok_json.end()) {
|
||||
return OrtxStatus(kOrtxErrorCorruptData, "Failed to get model node from tokenizer.json");
|
||||
}
|
||||
|
||||
bbpe_tokenizer_ = std::make_unique<BpeModel>();
|
||||
OrtxStatus status = bbpe_tokenizer_->Load(*model_node,
|
||||
bpe_conf_.get().GetSpecialTokens().c_str(),
|
||||
bpe_conf_.get().spm_model_);
|
||||
if (!status.IsOk()) {
|
||||
return status;
|
||||
}
|
||||
|
||||
status = LoadAddedTokens(tok_json, config);
|
||||
if (!status.IsOk()) {
|
||||
return status;
|
||||
}
|
||||
|
||||
add_bos_token_ = config.add_bos_token_;
|
||||
add_eos_token_ = config.add_eos_token_;
|
||||
UpdateTokenAdditionFlags(tok_json, config);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
OrtxStatus JsonFastTokenizer::LoadTikTokenBase64(const ort_extensions::bpe::TokenJsonConfig& config) {
|
||||
std::string voc_file = config.GetVocabDataFile();
|
||||
std::ifstream ifs = path(voc_file).open();
|
||||
if (!ifs.is_open()) {
|
||||
return OrtxStatus(kOrtxErrorInvalidFile, "Failed to open json file: " + voc_file);
|
||||
}
|
||||
|
||||
std::unordered_map<std::string, uint32_t> vocab;
|
||||
std::vector<std::pair<std::string, std::string>> merges;
|
||||
std::unordered_map<std::vector<uint8_t>, uint32_t, VectorHash, VectorEqual> bpe_ranks;
|
||||
|
||||
std::string line;
|
||||
while (std::getline(ifs, line)) {
|
||||
if (!line.empty()) {
|
||||
std::istringstream lineStream(line);
|
||||
std::string token;
|
||||
uint32_t rank;
|
||||
while (lineStream >> token >> rank) {
|
||||
// Decode base64 token and convert rank to int
|
||||
std::vector<uint8_t> decoded_token;
|
||||
base64_decode(token, decoded_token);
|
||||
// Store bpe token and rank
|
||||
bpe_ranks[decoded_token] = rank;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::tuple<std::vector<uint8_t>, std::vector<uint8_t>, uint32_t>> byte_merges;
|
||||
|
||||
bbpe_tokenizer_ = std::make_unique<BpeModel>();
|
||||
|
||||
for (const auto& item : bpe_ranks) {
|
||||
std::vector<uint8_t> token = item.first;
|
||||
uint32_t rank = item.second;
|
||||
vocab[JsonFastTokenizer::TokenBytesToString(token)] = rank;
|
||||
|
||||
if (token.size() == 1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
std::vector<std::tuple<std::vector<uint8_t>, std::vector<uint8_t>, uint32_t>> local;
|
||||
for (size_t index = 1; index < token.size(); index++) {
|
||||
std::vector<uint8_t> piece_l(token.begin(), token.begin() + index);
|
||||
std::vector<uint8_t> piece_r(token.begin() + index, token.end());
|
||||
if (bpe_ranks.count(piece_l) && bpe_ranks.count(piece_r)) {
|
||||
local.emplace_back(piece_l, piece_r, rank);
|
||||
}
|
||||
}
|
||||
|
||||
auto compare_bpe_tuples = [&](const std::tuple<std::vector<uint8_t>, std::vector<uint8_t>, uint32_t>& a,
|
||||
const std::tuple<std::vector<uint8_t>, std::vector<uint8_t>, uint32_t>& b) {
|
||||
// Compare comparator based on the ranks in bpe_ranks
|
||||
return bpe_ranks[std::get<0>(a)] < bpe_ranks[std::get<0>(b)] ||
|
||||
(bpe_ranks[std::get<0>(a)] == bpe_ranks[std::get<0>(b)] && bpe_ranks[std::get<1>(a)] < bpe_ranks[std::get<1>(b)]);
|
||||
};
|
||||
|
||||
std::sort(local.begin(), local.end(), compare_bpe_tuples);
|
||||
|
||||
byte_merges.insert(byte_merges.end(), local.begin(), local.end());
|
||||
}
|
||||
|
||||
// Custom comparator that compares the third element of the tuples
|
||||
auto compare_merge_tuples = [&](const std::tuple<std::vector<uint8_t>, std::vector<uint8_t>, uint32_t>& a,
|
||||
const std::tuple<std::vector<uint8_t>, std::vector<uint8_t>, uint32_t>& b) {
|
||||
return std::get<2>(a) < std::get<2>(b);
|
||||
};
|
||||
|
||||
std::sort(byte_merges.begin(), byte_merges.end(), compare_merge_tuples);
|
||||
|
||||
// Populate merges
|
||||
for (auto& val : byte_merges) {
|
||||
merges.push_back({JsonFastTokenizer::TokenBytesToString(std::get<0>(val)), JsonFastTokenizer::TokenBytesToString(std::get<1>(val))});
|
||||
}
|
||||
|
||||
const char token_sub[] = "Tokenizer";
|
||||
model_name_ = config.tokenizer_class_.substr(0, config.tokenizer_class_.find(token_sub));
|
||||
json_conf_.name_ = model_name_.c_str();
|
||||
json_conf_.bos_token_ = config.bos_token_.c_str();
|
||||
json_conf_.eos_token_ = config.eos_token_.c_str();
|
||||
json_conf_.unk_token_ = config.unk_token_.c_str();
|
||||
json_conf_.pad_token_ = config.pad_token_.c_str();
|
||||
|
||||
// re-bind the configuration object
|
||||
bpe_conf_ = json_conf_;
|
||||
|
||||
OrtxStatus status = bbpe_tokenizer_->Load(vocab,
|
||||
merges,
|
||||
bpe_conf_.get().GetSpecialTokens().c_str(),
|
||||
false);
|
||||
|
||||
if (!status.IsOk()) {
|
||||
return status;
|
||||
}
|
||||
|
||||
std::string module_file = config.GetTikTokenModuleFile();
|
||||
std::ifstream module_ifs = path(module_file).open();
|
||||
if (!module_ifs.is_open()) {
|
||||
return OrtxStatus(kOrtxErrorInvalidFile, "Failed to open module file: " + module_file);
|
||||
}
|
||||
|
||||
nlohmann::json tok_json;
|
||||
module_ifs >> tok_json;
|
||||
status = LoadAddedTokens(tok_json, config);
|
||||
if (!status.IsOk()) {
|
||||
return status;
|
||||
}
|
||||
|
||||
add_bos_token_ = config.add_bos_token_;
|
||||
add_eos_token_ = config.add_eos_token_;
|
||||
UpdateTokenAdditionFlags(tok_json, config);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
|
|
@ -3,15 +3,18 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include "ocos.h"
|
||||
#include "ustring.h"
|
||||
|
||||
#include <list>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <functional>
|
||||
|
||||
#include "bpe_types.h"
|
||||
#include "ortx_tokenizer.h"
|
||||
#include "ext_status.h"
|
||||
#include "op_def_struct.h"
|
||||
#include "nlohmann/json_fwd.hpp"
|
||||
#include "tokjson_types.h"
|
||||
#include "ustring.h"
|
||||
|
||||
|
||||
struct BpeModelConf {
|
||||
const char* name_{"GPT2"}; // this name may be overridden by the tokenizer's attribute.
|
||||
|
@ -25,6 +28,7 @@ struct BpeModelConf {
|
|||
};
|
||||
|
||||
struct KernelBpeTokenizer {
|
||||
using json = nlohmann::json;
|
||||
KernelBpeTokenizer(const BpeModelConf& conf);
|
||||
OrtStatusPtr OnModelAttach(const OrtApi& api, const OrtKernelInfo& info);
|
||||
|
||||
|
@ -113,10 +117,11 @@ class JsonFastTokenizer : public KernelBpeTokenizer {
|
|||
public:
|
||||
JsonFastTokenizer();
|
||||
OrtxStatus Load(const ort_extensions::bpe::TokenJsonConfig& config);
|
||||
OrtxStatus LoadTikTokenBase64(const ort_extensions::bpe::TokenJsonConfig& config);
|
||||
OrtxStatus Compute(const ortc::Tensor<std::string>& input,
|
||||
ortc::Tensor<int64_t>& tokenize_output,
|
||||
std::optional<ortc::Tensor<int64_t>*> attention_mask,
|
||||
std::optional<ortc::Tensor<int64_t>*> offset_mapping) const;
|
||||
std::optional<ortc::Tensor<int64_t>*> attention_mask = std::nullopt,
|
||||
std::optional<ortc::Tensor<int64_t>*> offset_mapping = std::nullopt) const;
|
||||
|
||||
public:
|
||||
const auto& GetAddedTokens() const { return added_tokens_; }
|
||||
|
@ -126,6 +131,10 @@ class JsonFastTokenizer : public KernelBpeTokenizer {
|
|||
|
||||
private:
|
||||
std::string TokenBytesToString(std::vector<uint8_t>& bytes);
|
||||
// template functions to avoid including the huge json header file
|
||||
bool CheckForSpmModel(const json& tok_json);
|
||||
void UpdateTokenAdditionFlags(const json& tok_json, const ort_extensions::bpe::TokenJsonConfig& config);
|
||||
OrtxStatus LoadAddedTokens(const json& tok_json, const ort_extensions::bpe::TokenJsonConfig& config);
|
||||
|
||||
BpeModelConf json_conf_;
|
||||
std::vector<ort_extensions::bpe::AddedToken> added_tokens_;
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
|
||||
#include "bpe_kernels.h"
|
||||
#include "bpe_decoder.hpp"
|
||||
#include "bpe_json.hpp"
|
||||
#include "bpe_jsoncfg.hpp"
|
||||
#include "bpe_tokenizer.hpp"
|
||||
|
||||
namespace ort_extensions {
|
||||
|
@ -112,7 +112,7 @@ class BpeStreamingDecoder : public KernelBpeDecoder {
|
|||
char buf[3] = {piece[3], piece[4], 0}; // something like <0x20>
|
||||
token = {static_cast<char>(strtol(buf, NULL, 16))};
|
||||
} else {
|
||||
token = ReplaceAll(piece, spm_underscore, " ");
|
||||
token = ReplaceAll(piece, std::string(ort_extensions::spm_escaped_space), " ");
|
||||
}
|
||||
|
||||
if (!token.empty() && token[0] == ' ' && f_special_last && add_dummy_prefix_) {
|
||||
|
|
|
@ -18,7 +18,7 @@
|
|||
#include "nlohmann/json.hpp"
|
||||
#include "bpe_utils.hpp"
|
||||
#include "trietree.hpp"
|
||||
#include "bpe_types.h"
|
||||
#include "tokjson_types.h"
|
||||
|
||||
namespace ort_extensions {
|
||||
|
||||
|
|
|
@ -532,7 +532,7 @@ class TokenWithRegularExp {
|
|||
|
||||
try {
|
||||
return RegexMatchSTD(ustring(regex_expr));
|
||||
} catch (const std::exception& ex) {
|
||||
} catch (const std::exception& /* ex */) {
|
||||
std::string part1 = "Regex '";
|
||||
std::string part2 = "' not supported!";
|
||||
throw std::runtime_error(part1 + regex_expr + part2);
|
||||
|
|
|
@ -4,7 +4,6 @@
|
|||
#pragma once
|
||||
|
||||
#include "ocos.h"
|
||||
#include "ortx_common.h"
|
||||
#include "string_utils.h"
|
||||
#include "string_tensor.h"
|
||||
#include "sentencepiece_processor.h"
|
||||
|
|
|
@ -8,7 +8,6 @@
|
|||
#include "string_tensor.h"
|
||||
#include "base64.h"
|
||||
#include "narrow.h"
|
||||
#include "ortx_common.h"
|
||||
|
||||
OrtStatusPtr KernelSentencepieceTokenizer::OnModelAttach(const OrtApi& api, const OrtKernelInfo& info) {
|
||||
std::string model_as_string;
|
||||
|
|
|
@@ -24,4 +24,6 @@ struct AddedToken final {
class TokenJsonConfig; // forward declaration

} // namespace bpe

constexpr std::string_view spm_escaped_space = "\xE2\x96\x81";
} // namespace ort_extensions
@ -2,7 +2,6 @@
|
|||
// Licensed under the MIT License.
|
||||
#pragma once
|
||||
#include "ocos.h"
|
||||
#include "ortx_common.h"
|
||||
#include "narrow.h"
|
||||
|
||||
#include <vector>
|
||||
|
|
|
@@ -12,12 +12,13 @@

namespace ort_extensions {

template <typename CharT, typename ValueT = int>
template <typename CharT, typename ValueT = int, int invalid_id = -1>
class TrieTree {
public:
static constexpr int kMaxTokenLength_ = 128;
static constexpr ValueT kInvalidId_ = static_cast<ValueT>(invalid_id);

TrieTree(CharT ch = 0, ValueT invalid_id = -1) : ch_(ch), invalid_id_(invalid_id) {}
TrieTree(CharT ch = 0) : ch_(ch), value_(std::nullopt) {}

void Add(const std::basic_string<CharT>& key, int idx = 0,
const std::optional<ValueT>& value = std::nullopt) noexcept {

@@ -36,11 +37,16 @@ class TrieTree {
}
}

const TrieTree* Find(CharT ch) const {
auto it = to_.find(ch);
return it != to_.end() ? it->second.get() : nullptr;
}

ValueT FindLongest(const std::basic_string<CharT>& key, size_t& idx) const noexcept {
const TrieTree* u = this;
CharT ch = key[idx];

ValueT tok_id = invalid_id_;
ValueT tok_id = invalid_id;
size_t idx_end = idx;
while (u->to_.count(ch)) {
u = u->to_.at(ch).get();

@@ -70,7 +76,7 @@ class TrieTree {
auto ch = input[tok_idx];
size_t tok_len = 0;
size_t idx_end = tok_idx;
ValueT tok_id = invalid_id_;
ValueT tok_id = invalid_id;

// try to match a longest token
while (u->to_.count(ch)) {

@@ -89,7 +95,7 @@ class TrieTree {
}

tok_idx += 1;
if (tok_id == invalid_id_) {
if (tok_id == invalid_id) {
if (tok_idx < input.length()) {
continue;
} else {

@@ -102,9 +108,9 @@ class TrieTree {
tok_len = idx_end - token_begin_idx;
if (token_begin_idx > seg_idx || tok_len == 0) {
tokens.emplace_back(std::basic_string_view<CharT>(input.data() + seg_idx, token_begin_idx - seg_idx),
invalid_id_);
invalid_id);
}
if (tok_id != invalid_id_) {
if (tok_id != invalid_id) {
tokens.emplace_back(std::basic_string_view<CharT>(input.data() + token_begin_idx, tok_len), tok_id);
tok_idx = idx_end;
}

@@ -116,11 +122,18 @@ class TrieTree {
return 0;
}

bool HasValue() const {
return value_.has_value();
}

ValueT Value() const {
return value_.value();
}

private:
std::map<CharT, std::unique_ptr<TrieTree>> to_;
std::unordered_map<CharT, std::unique_ptr<TrieTree>> to_;
std::optional<ValueT> value_;
const CharT ch_;
const ValueT invalid_id_;
};

} // namespace ort_extensions
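A small sketch of the reworked TrieTree interface above (the invalid id is now a template parameter and node payloads live in a std::optional). Illustrative values only; the expected results follow from the longest-match semantics shown in the diff:

// Illustrative only.
using CharTrie = ort_extensions::TrieTree<char, int, -1>;

CharTrie trie;
trie.Add("ab", 0, 10);   // key "ab"  -> id 10
trie.Add("abc", 0, 11);  // key "abc" -> id 11

size_t pos = 0;          // in: start offset, out: end of the longest match
int id = trie.FindLongest(std::string("abcd"), pos);  // expected: id == 11, pos just past "abc"
// id stays -1 (the invalid id / kInvalidId_) when no prefix of the input is in the trie.

const CharTrie* node = trie.Find('a');  // walk one edge at a time
if (node != nullptr && node->HasValue()) { /* node->Value() */ }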
@ -0,0 +1,457 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
// The implementation is inspired by llama.cpp ugm tokenizer and huggingface FastTokenizer
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <list>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <cfloat>
|
||||
#include <functional>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "ortx_tokenizer.h"
|
||||
#include "ext_status.h"
|
||||
#include "op_def_struct.h"
|
||||
#include "base64.h"
|
||||
#include "ustring.h"
|
||||
#include "nlohmann/json.hpp"
|
||||
#include "trietree.hpp"
|
||||
#include "bpe_jsoncfg.hpp"
|
||||
|
||||
namespace ort_extensions {
|
||||
|
||||
struct SpmUgmTokenizer {
|
||||
using json = nlohmann::json;
|
||||
using VocabTrieTree = ort_extensions::TrieTree<char, extTokenId_t, -1>;
|
||||
using Vocab = std::unordered_map<std::string, std::tuple<extTokenId_t, double>>;
|
||||
|
||||
SpmUgmTokenizer() = default;
|
||||
|
||||
OrtxStatus LoadSpecialTokens(const json& token_json) {
|
||||
auto special_tokens = token_json.find("added_tokens");
|
||||
if (special_tokens != token_json.end()) {
|
||||
for (const auto& token : special_tokens->items()) {
|
||||
auto id = token.value()["id"].get<extTokenId_t>();
|
||||
bool is_special = token.value()["special"].get<bool>();
|
||||
if (is_special) {
|
||||
special_token_ids_.insert(id);
|
||||
}
|
||||
auto word = token.value()["content"].get<std::string>();
|
||||
user_defined_token_matcher_.Add(word, 0, id);
|
||||
}
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
OrtxStatus LoadCharsMap(const json& j_vocab) {
|
||||
auto normalizer = j_vocab.find("normalizer");
|
||||
if (normalizer != j_vocab.end()) {
|
||||
auto iter = normalizer->find("precompiled_charsmap");
|
||||
if (iter != normalizer->end()) {
|
||||
auto charsmap = iter->get<std::string>();
|
||||
if (!base64_decode(charsmap, charsmap_data_)) {
|
||||
return OrtxStatus(extError_t::kOrtxErrorCorruptData, "Failed to decode charsmap.");
|
||||
}
|
||||
|
||||
// std::cout << "charsmap size: " << charsmap_data_.size() << std::endl;
|
||||
// for (size_t i = 0; i < charsmap_data_.size() && i < 100; ++i) {
|
||||
// std::cout << int(charsmap_data_[i]) << " ";
|
||||
// }
|
||||
|
||||
size_t charsmap_offset = 0;
|
||||
|
||||
// First four bytes of precompiled_charsmap contains length of binary
|
||||
// blob containing XOR-compressed compact double array (XCDA) entries
|
||||
uint32_t xcda_blob_size = *(const uint32_t*)&charsmap_data_[0];
|
||||
charsmap_offset += sizeof(xcda_blob_size);
|
||||
if (xcda_blob_size + charsmap_offset >= charsmap_data_.size()) {
|
||||
return OrtxStatus(extError_t::kOrtxErrorCorruptData, "Index out of array bounds in precompiled charsmap!");
|
||||
}
|
||||
|
||||
// Next xcda_blob_size bytes contain entries of XOR-compressed compact
|
||||
// double array (XCDA). Each entry is bit-packed into a 32-bit integer.
|
||||
xcda_array_ = (const uint32_t*)&charsmap_data_[charsmap_offset];
|
||||
xcda_array_size_ = xcda_blob_size / sizeof(uint32_t);
|
||||
charsmap_offset += xcda_blob_size;
|
||||
|
||||
// Remaining bytes of precompiled charsmap contain null-terminated
|
||||
// replacement strings for prefixes matched by the XCDA.
|
||||
prefix_replacements_ = reinterpret_cast<const char*>(&charsmap_data_[charsmap_offset]);
|
||||
prefix_replacements_size_ = charsmap_data_.size() - charsmap_offset;
|
||||
}
|
||||
}
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
OrtxStatus LoadConfig(const json& config) {
|
||||
auto pretokenizer_node = config.find("pretokenizer");
|
||||
if (pretokenizer_node != config.end()) {
|
||||
auto pretokenizers_node = pretokenizer_node->find("pretokenizers");
|
||||
if (pretokenizers_node != pretokenizer_node->end()) {
|
||||
for (const auto& pretokenizer : pretokenizers_node->items()) {
|
||||
if (pretokenizer.value().contains("type")) {
|
||||
auto type = pretokenizer.value()["type"].get<std::string>();
|
||||
if (type == "Metaspace") {
|
||||
tokenizer_escape_whitespaces_ = true;
|
||||
}
|
||||
}
|
||||
if (pretokenizer.value().contains("add_prefix_space")) {
|
||||
tokenizer_add_space_prefix_ = pretokenizer.value()["add_prefix_space"].get<bool>();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
OrtxStatus Load(const bpe::TokenJsonConfig& config) {
|
||||
ortx::path vocab_path(config.GetVocabDataFile());
|
||||
if (!vocab_path.exists()) {
|
||||
return OrtxStatus(extError_t::kOrtxErrorInvalidArgument, "Vocabulary file does not exist.");
|
||||
}
|
||||
|
||||
auto ifs = vocab_path.open();
|
||||
if (!ifs.is_open()) {
|
||||
return OrtxStatus(extError_t::kOrtxErrorInvalidArgument, "Failed to open vocabulary file.");
|
||||
}
|
||||
|
||||
nlohmann::json j_vocab = json::parse(ifs, nullptr, false, true);
|
||||
if (j_vocab.is_discarded()) {
|
||||
return OrtxStatus(extError_t::kOrtxErrorInvalidArgument, "Failed to parse vocabulary file.");
|
||||
}
|
||||
|
||||
OrtxStatus status = LoadConfig(j_vocab);
|
||||
if (!status.IsOk()) {
|
||||
return status;
|
||||
}
|
||||
|
||||
status = LoadCharsMap(j_vocab);
|
||||
if (!status.IsOk()) {
|
||||
return status;
|
||||
}
|
||||
|
||||
status = LoadSpecialTokens(j_vocab);
|
||||
if (!status.IsOk()) {
|
||||
return status;
|
||||
}
|
||||
|
||||
auto model_node = j_vocab.find("model");
|
||||
if (model_node == j_vocab.end()) {
|
||||
return OrtxStatus(extError_t::kOrtxErrorInvalidArgument, "Model node not found in vocabulary file.");
|
||||
}
|
||||
|
||||
auto unk_id_iter = model_node->find("unk_id");
|
||||
if (unk_id_iter != model_node->end()) {
|
||||
special_unk_id_ = unk_id_iter->get<extTokenId_t>();
|
||||
}
|
||||
|
||||
auto vocab_node = model_node->find("vocab");
|
||||
if (vocab_node == model_node->end()) {
|
||||
return OrtxStatus(extError_t::kOrtxErrorInvalidArgument, "Vocabulary not found in model node.");
|
||||
}
|
||||
|
||||
extTokenId_t id = 0;
|
||||
for (const auto& entry : vocab_node->items()) {
|
||||
auto tkn = entry.value()[0].get<std::string>();
|
||||
auto score = entry.value()[1].get<double>();
|
||||
vocab_[tkn] = std::make_tuple(id++, score);
|
||||
}
|
||||
|
||||
scores_.resize(id);
|
||||
double min_score = std::numeric_limits<double>::max();
|
||||
for (const auto& entry : vocab_) {
|
||||
scores_[std::get<0>(entry.second)] = std::get<1>(entry.second);
|
||||
token_matcher_.Add(entry.first, 0, std::get<0>(entry.second));
|
||||
min_score = std::min<double>(min_score, std::get<1>(entry.second));
|
||||
}
|
||||
|
||||
unknown_token_score_ = min_score - unknown_token_score_penalty_;
|
||||
return status;
|
||||
}
|
||||
|
||||
extTokenId_t GetTokenId(const std::string& token) const {
|
||||
auto iter = vocab_.find(token);
|
||||
if (iter == vocab_.end()) {
|
||||
return special_unk_id_;
|
||||
}
|
||||
return std::get<0>(iter->second);
|
||||
}
|
||||
|
||||
OrtxStatus Compute(const ortc::Tensor<std::string>& input, ortc::Tensor<int64_t>& tokenize_output) const {
|
||||
if (input.Shape().size() != 1) {
|
||||
return OrtxStatus(extError_t::kOrtxErrorInvalidArgument, "Input tensor must have rank 1.");
|
||||
}
|
||||
|
||||
std::string normalized;
|
||||
Normalize(input.AsScalar(), &normalized);
|
||||
size_t input_len = normalized.size();
|
||||
if (input_len == 0) {
|
||||
return {};
|
||||
}
|
||||
|
||||
std::vector<struct BestTokenization> tokenization_results(input_len + 1, {0, -DBL_MAX, special_unk_id_});
|
||||
tokenization_results[0] = {0, 0, special_unk_id_};
|
||||
|
||||
for (size_t input_offset = 0; input_offset < input_len;) {
|
||||
size_t prefix_offset = input_offset;
|
||||
size_t n_utf8_code_units = std::min<size_t>(ustring::UTF8Len(normalized[input_offset]), input_len - input_offset);
|
||||
|
||||
bool single_codepoint_token_found = false;
|
||||
const struct BestTokenization& current_best = tokenization_results[input_offset];
|
||||
auto node = token_matcher_.Find(normalized[prefix_offset++]);
|
||||
|
||||
while (prefix_offset <= input_len && node != NULL) {
|
||||
if (node->HasValue()) {
|
||||
if (prefix_offset - input_offset == n_utf8_code_units) {
|
||||
single_codepoint_token_found = true;
|
||||
}
|
||||
extTokenId_t token_id = node->Value();
|
||||
const auto& token_data = scores_[token_id];
|
||||
|
||||
const double token_score = special_token_ids_.count(token_id) > 0 ? 0.0 : token_data;
|
||||
const double challenger_score = current_best.score_sum + token_score;
|
||||
struct BestTokenization& current_champ = tokenization_results[prefix_offset];
|
||||
if (challenger_score > current_champ.score_sum) {
|
||||
struct BestTokenization challenger = {input_offset, (float)challenger_score, token_id};
|
||||
current_champ = challenger;
|
||||
}
|
||||
}
|
||||
node = node->Find(normalized[prefix_offset++]);
|
||||
}
|
||||
|
||||
if (!single_codepoint_token_found) {
|
||||
const double challenger_score = current_best.score_sum + unknown_token_score_;
|
||||
prefix_offset = input_offset + n_utf8_code_units;
|
||||
struct BestTokenization& current_champ = tokenization_results[prefix_offset];
|
||||
if (challenger_score > current_champ.score_sum) {
|
||||
struct BestTokenization challenger = {input_offset, (float)challenger_score, special_unk_id_};
|
||||
current_champ = challenger;
|
||||
}
|
||||
}
|
||||
|
||||
input_offset += n_utf8_code_units;
|
||||
}
|
||||
|
||||
std::vector<extTokenId_t> output;
|
||||
output.reserve(input_len);
|
||||
bool is_prev_unknown = false;
|
||||
for (struct BestTokenization& tokenization = tokenization_results[input_len];;
|
||||
tokenization = tokenization_results[tokenization.input_offset]) {
|
||||
bool is_unknown = tokenization.token_id == special_unk_id_;
|
||||
if (!(is_prev_unknown && is_unknown)) {
|
||||
output.push_back(tokenization.token_id);
|
||||
}
|
||||
if (tokenization.input_offset == 0) {
|
||||
break;
|
||||
}
|
||||
is_prev_unknown = is_unknown;
|
||||
}
|
||||
|
||||
bool add_bos = GetTokenId(bos_token_) != special_unk_id_;
|
||||
bool add_eos = GetTokenId(eos_token_) != special_unk_id_;
|
||||
auto output_size = static_cast<int64_t>(output.size());
|
||||
int64_t* id_output = tokenize_output.Allocate({output_size + add_bos + add_eos});
|
||||
if (add_bos) {
|
||||
*id_output = GetTokenId(bos_token_);
|
||||
id_output++;
|
||||
}
|
||||
std::transform(output.begin(), output.end(), id_output, [](extTokenId_t id) { return static_cast<int64_t>(id); });
|
||||
std::reverse(id_output, id_output + output_size);
|
||||
if (add_eos) {
|
||||
*(id_output + output_size) = GetTokenId(eos_token_);
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
private:
|
||||
struct NormalizationResult {
|
||||
const char* normalized;
|
||||
size_t normalized_len;
|
||||
size_t consumed_input;
|
||||
};
|
||||
|
||||
void Normalize(const std::string& input, std::string* normalized) const {
|
||||
normalized->clear();
|
||||
normalized->reserve(input.size() * 3);
|
||||
|
||||
const std::string space = tokenizer_escape_whitespaces_ ? std::string(spm_escaped_space) : " ";
|
||||
|
||||
bool shall_prepend_space = !tokenizer_treat_whitespace_as_suffix_ && tokenizer_add_space_prefix_;
|
||||
bool shall_append_space = tokenizer_treat_whitespace_as_suffix_ && tokenizer_add_space_prefix_;
|
||||
bool shall_merge_spaces = tokenizer_remove_extra_whitespaces_;
|
||||
|
||||
bool is_space_prepended = false;
|
||||
bool processing_non_ws = false;
|
||||
|
||||
size_t input_len = input.size();
|
||||
|
||||
for (size_t input_offset = 0; input_offset < input_len;) {
|
||||
auto norm_res = NormalizePrefix(input, input_offset);
|
||||
for (size_t i = 0; i < norm_res.normalized_len; i++) {
|
||||
char c = norm_res.normalized[i];
|
||||
if (c != ' ') {
|
||||
if (!processing_non_ws) {
|
||||
processing_non_ws = true;
|
||||
if ((shall_prepend_space && !is_space_prepended) || shall_merge_spaces) {
|
||||
normalized->append(space);
|
||||
is_space_prepended = true;
|
||||
}
|
||||
}
|
||||
normalized->push_back(c);
|
||||
} else {
|
||||
if (processing_non_ws) {
|
||||
processing_non_ws = false;
|
||||
}
|
||||
if (!shall_merge_spaces) {
|
||||
normalized->append(space);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
input_offset += norm_res.consumed_input;
|
||||
}
|
||||
|
||||
if (shall_append_space) {
|
||||
normalized->append(space);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* This structure is a view wrapper for XOR-compressed double array (XCDA)
|
||||
* See Shunsuke Kanda (2018). Space- and Time-Efficient String Dictionaries.
|
||||
* Each bit-packed entry contains:
|
||||
* - BASE array value in bits 10-30
|
||||
* - LCHECK array value in bits 0-7
|
||||
* - LEAF array value in bit 9
|
||||
* Entries containing indexes of replacement sequences have set bit 31
|
||||
*/
|
||||
struct XcdaArrayView {
|
||||
public:
|
||||
XcdaArrayView(const uint32_t* xcda_array, size_t xcda_array_size)
|
||||
: xcda_array_(xcda_array), xcda_array_size_(xcda_array_size) {}
|
||||
uint32_t GetBase(size_t index) {
|
||||
uint32_t packed_node = GetNode(index);
|
||||
return (packed_node >> 10) << ((packed_node & (1U << 9)) >> 6);
|
||||
}
|
||||
uint32_t GetLcheck(size_t index) {
|
||||
uint32_t packed_node = GetNode(index);
|
||||
return packed_node & ((1U << 31) | 0xff);
|
||||
}
|
||||
bool IsLeaf(size_t index) {
|
||||
uint32_t packed_node = GetNode(index);
|
||||
return (packed_node >> 8) & 1;
|
||||
}
|
||||
uint32_t GetValue(size_t index) {
|
||||
uint32_t packed_node = GetNode(index);
|
||||
return packed_node & ((1U << 31) - 1);
|
||||
}
|
||||
|
||||
private:
|
||||
uint32_t GetNode(size_t index) {
|
||||
if (index > xcda_array_size_) {
|
||||
ORTX_CXX_API_THROW("[UgmTok]Index out of array bounds in XCDA array!", ORT_RUNTIME_EXCEPTION);
|
||||
}
|
||||
return xcda_array_[index];
|
||||
}
|
||||
const uint32_t* xcda_array_;
|
||||
size_t xcda_array_size_;
|
||||
};
|
||||
|
||||
struct NormalizationResult NormalizePrefix(const std::string& input, size_t input_offset) const {
|
||||
if (input_offset == input.size()) {
|
||||
return {&input[input_offset], 0, 0};
|
||||
}
|
||||
|
||||
std::string prefix = input.substr(input_offset);
|
||||
size_t prefix_off = 0;
|
||||
auto user_defined_token_match = user_defined_token_matcher_.FindLongest(prefix, prefix_off);
|
||||
if (user_defined_token_match != user_defined_token_matcher_.kInvalidId_) {
|
||||
return {&input[input_offset], prefix_off + input_offset, prefix_off + input_offset};
|
||||
}
|
||||
|
||||
size_t longest_prefix_length = 0;
|
||||
size_t longest_prefix_offset = 0;
|
||||
|
||||
if (xcda_array_size_ > 0) {
|
||||
XcdaArrayView xcda_view(xcda_array_, xcda_array_size_);
|
||||
|
||||
uint32_t node_index = 0;
|
||||
node_index = xcda_view.GetBase(node_index);
|
||||
for (size_t prefix_offset = input_offset; prefix_offset < input.size(); prefix_offset++) {
|
||||
unsigned char c = input[prefix_offset];
|
||||
if (c == 0) {
|
||||
break;
|
||||
}
|
||||
node_index ^= c;
|
||||
if (xcda_view.GetLcheck(node_index) != c) {
|
||||
break;
|
||||
}
|
||||
bool is_leaf = xcda_view.IsLeaf(node_index);
|
||||
node_index ^= xcda_view.GetBase(node_index);
|
||||
if (is_leaf) {
|
||||
longest_prefix_length = prefix_offset - input_offset + 1;
|
||||
longest_prefix_offset = xcda_view.GetValue(node_index);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (longest_prefix_length > 0) {
|
||||
if (longest_prefix_offset >= prefix_replacements_size_) {
|
||||
ORTX_CXX_API_THROW("[UgmTok]Index out of array bounds in precompiled charsmap!", ORT_RUNTIME_EXCEPTION);
|
||||
}
|
||||
const char* prefix_replacement = &prefix_replacements_[longest_prefix_offset];
|
||||
return {prefix_replacement, strlen(prefix_replacement), longest_prefix_length};
|
||||
} else {
|
||||
// if yes, return this sequence unmodified
|
||||
size_t prefix_offset = input_offset + ustring::UTF8Len(input[input_offset]);
|
||||
if (prefix_offset <= input.size()) {
|
||||
return {&input[input_offset], prefix_offset - input_offset, prefix_offset - input_offset};
|
||||
} else {
|
||||
return {"\xEF\xBF\xBD", 3, 1};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// escaped space symbol - U+2581 (Lower One Eighth Block)
|
||||
static constexpr double unknown_token_score_penalty_ = 10.0;
|
||||
|
||||
std::vector<uint8_t> charsmap_data_;
|
||||
const char* prefix_replacements_ = NULL;
|
||||
size_t prefix_replacements_size_ = 0;
|
||||
|
||||
const uint32_t* xcda_array_ = NULL;
|
||||
size_t xcda_array_size_ = 0;
|
||||
|
||||
VocabTrieTree user_defined_token_matcher_;
|
||||
|
||||
struct BestTokenization {
|
||||
size_t input_offset;
|
||||
double score_sum;
|
||||
extTokenId_t token_id;
|
||||
};
|
||||
|
||||
extTokenId_t special_unk_id_ = -1;
|
||||
double unknown_token_score_;
|
||||
|
||||
Vocab vocab_;
|
||||
std::vector<double> scores_;
|
||||
std::set<extTokenId_t> special_token_ids_;
|
||||
VocabTrieTree token_matcher_;
|
||||
|
||||
public:
|
||||
bool tokenizer_escape_whitespaces_ = true;
|
||||
bool tokenizer_treat_whitespace_as_suffix_ = false;
|
||||
bool tokenizer_add_space_prefix_ = true;
|
||||
bool tokenizer_remove_extra_whitespaces_ = true;
|
||||
std::string bos_token_ = "<s>";
|
||||
std::string eos_token_ = "</s>";
|
||||
std::string pad_token_ = "<pad>";
|
||||
std::string unk_token_ = "<unk>";
|
||||
};
|
||||
|
||||
} // namespace ort_extensions
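SpmUgmTokenizer::Compute above fills tokenization_results with a forward dynamic program: for each input position it keeps the best-scoring way of reaching that position, then walks the chain backwards to emit token ids. A self-contained toy version of the same idea over a plain score map, for illustration only (byte-based, no normalization, charsmap or trie, unlike the class above):

#include <limits>
#include <string>
#include <unordered_map>
#include <vector>

// Toy Unigram segmentation: pick the split of `text` whose pieces maximize the
// summed scores in `scores`. Unknown single characters get a large penalty
// instead of failing, mirroring the unknown-token score in the real code.
std::vector<std::string> ToyUnigramSegment(
    const std::string& text, const std::unordered_map<std::string, double>& scores) {
  const double kUnkScore = -100.0;
  const size_t n = text.size();
  std::vector<double> best(n + 1, -std::numeric_limits<double>::infinity());
  std::vector<size_t> prev(n + 1, 0);  // start offset of the piece ending at i
  best[0] = 0.0;

  for (size_t end = 1; end <= n; ++end) {
    for (size_t start = 0; start < end; ++start) {
      auto it = scores.find(text.substr(start, end - start));
      double piece_score = (it != scores.end())
                               ? it->second
                               : (end - start == 1 ? kUnkScore
                                                   : -std::numeric_limits<double>::infinity());
      double candidate = best[start] + piece_score;
      if (candidate > best[end]) {
        best[end] = candidate;
        prev[end] = start;
      }
    }
  }

  std::vector<std::string> pieces;
  for (size_t end = n; end > 0; end = prev[end]) {
    pieces.insert(pieces.begin(), text.substr(prev[end], end - prev[end]));
  }
  return pieces;
}

The production code differs mainly in that it walks the vocabulary trie instead of looking up every substring, applies the precompiled character map during normalization, and collapses consecutive unknown pieces.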
|
|
@ -1,41 +1,51 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "bpe_kernels.h"
|
||||
#include "bpe_tokenizer.hpp"
|
||||
#include "bpe_decoder.hpp"
|
||||
#include "ugm_kernels.hpp"
|
||||
|
||||
#include "tokenizer_impl.h"
|
||||
|
||||
using namespace ort_extensions;
|
||||
|
||||
TokenizerImpl::TokenizerImpl() : OrtxObjectImpl(extObjectKind_t::kOrtxKindTokenizer) {};
|
||||
namespace ort_extensions {
|
||||
|
||||
TokenizerImpl::TokenizerImpl()
|
||||
: OrtxObjectImpl(extObjectKind_t::kOrtxKindTokenizer) {};
|
||||
TokenizerImpl::~TokenizerImpl() {};
|
||||
|
||||
OrtxStatus TokenizerImpl::Load(const std::string& dir) {
|
||||
OrtxStatus TokenizerImpl::Load(const std::string& tok_path) {
|
||||
tok_config_ = std::make_shared<ort_extensions::bpe::TokenJsonConfig>();
|
||||
auto status = tok_config_->Load(dir);
|
||||
auto status = tok_config_->Load(tok_path);
|
||||
if (!status.IsOk()) {
|
||||
return status;
|
||||
}
|
||||
|
||||
auto vocab_file_path = path(dir) / "tokenizer.json";
|
||||
std::ifstream vocab_fs = vocab_file_path.open();
|
||||
|
||||
tokenizer_ = std::make_unique<JsonFastTokenizer>();
|
||||
if (!vocab_fs.is_open()) {
|
||||
// No tokenizer.json file present; use TikToken tokenizer
|
||||
tokenizer_->tiktoken_ = true;
|
||||
|
||||
// load the tokenizer from a config
|
||||
status = tokenizer_->Load(*tok_config_);
|
||||
} else {
|
||||
// load the tokenizer from a config
|
||||
status = tokenizer_->Load(*tok_config_);
|
||||
|
||||
if (tok_config_->tokenizer_class_.empty()) {
|
||||
auto tokenizer = std::make_unique<SpmUgmTokenizer>();
|
||||
status = tokenizer->Load(*tok_config_);
|
||||
if (status.IsOk()) {
|
||||
detokenizer_ = std::make_unique<BpeStreamingDecoder>();
|
||||
status = detokenizer_->Load(tok_config_, *tokenizer_);
|
||||
tokenizer_ = std::move(tokenizer);
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
auto vocab_file_path = ortx::path(tok_config_->GetVocabDataFile());
|
||||
auto tokenizer = std::make_unique<JsonFastTokenizer>();
|
||||
// vocab file is checked in TokenJsonConfig::Load
|
||||
auto fx_load = vocab_file_path.extension() == ".json"?
|
||||
&JsonFastTokenizer::Load: &JsonFastTokenizer::LoadTikTokenBase64;
|
||||
status = (tokenizer.get()->*fx_load)(*tok_config_);
|
||||
|
||||
if (status.IsOk()) {
|
||||
detokenizer_ = std::make_unique<BpeStreamingDecoder>();
|
||||
status = detokenizer_->Load(tok_config_, *tokenizer);
|
||||
}
|
||||
|
||||
if (status.IsOk()) {
|
||||
tokenizer_ = std::move(tokenizer);
|
||||
}
|
||||
|
||||
return status;
|
||||
|
@@ -46,7 +56,10 @@ OrtxStatus TokenizerImpl::BatchEncode(const std::vector<std::string_view>& input
  for (const auto& s : input) {
    ortc::Tensor<int64_t> ts_output(&CppAllocator::Instance());
    ortc::Tensor<std::string> ts_input = ortc::Tensor<std::string>(std::vector<std::string>{std::string(s)});
    OrtxStatus status = tokenizer_->Compute(ts_input, ts_output, std::nullopt, std::nullopt);

    OrtxStatus status = std::visit([&](auto& tokenizer) {
      return tokenizer->Compute(ts_input, ts_output);
    }, tokenizer_);

    if (!status.IsOk()) {
      return status;
@@ -90,7 +103,7 @@ static std::map<std::string, std::string> LANGUAGES = {
    {"ro", "romanian"}, {"da", "danish"}, {"hu", "hungarian"}, {"ta", "tamil"}, {"no", "norwegian"},
    {"th", "thai"}, {"ur", "urdu"}, {"hr", "croatian"}, {"bg", "bulgarian"}, {"lt", "lithuanian"},
    {"la", "latin"}, {"mi", "maori"}, {"ml", "malayalam"}, {"cy", "welsh"}, {"sk", "slovak"},
    {"te", "telugu"}, {"fa", "persian"}, {"lv", "latvian"}, {"bn", "bengali"}, {"sr", "serbian"},
    {"te", "telugu"}, {"fa", "persian"}, {"lv", "latvian"}, {"bn", "bangla"}, {"sr", "serbian"},
    {"az", "azerbaijani"}, {"sl", "slovenian"}, {"kn", "kannada"}, {"et", "estonian"}, {"mk", "macedonian"},
    {"br", "breton"}, {"eu", "basque"}, {"is", "icelandic"}, {"hy", "armenian"}, {"ne", "nepali"},
    {"mn", "mongolian"}, {"bs", "bosnian"}, {"kk", "kazakh"}, {"sq", "albanian"}, {"sw", "swahili"},
@@ -105,14 +118,14 @@ static std::map<std::string, std::string> LANGUAGES = {

OrtxStatus TokenizerImpl::GetDecoderPromptIds(size_t batch_size, const char* lang, const char* task, int no_timestamps,
                                              std::vector<std::vector<extTokenId_t>>& t_ids) const {
  if (tokenizer_ == nullptr) {
  // since it is only supported by the Whisper model, which is BPE only.
  if (!std::holds_alternative<bpe_tokenizer_t>(tokenizer_)) {
    return OrtxStatus(kOrtxErrorInvalidArgument, "Tokenizer is not loaded");
  }
  // since it was only supported by Whisper model, should we check it here?

  auto translate_token_id = tokenizer_->GetTokenId("<|translate|>");
  auto transcribe_token_id = tokenizer_->GetTokenId("<|transcribe|>");
  auto notimestamps_token_id = tokenizer_->GetTokenId("<|notimestamps|>");
  auto translate_token_id = std::get<bpe_tokenizer_t>(tokenizer_)->GetTokenId("<|translate|>");
  auto transcribe_token_id = std::get<bpe_tokenizer_t>(tokenizer_)->GetTokenId("<|transcribe|>");
  auto notimestamps_token_id = std::get<bpe_tokenizer_t>(tokenizer_)->GetTokenId("<|notimestamps|>");
  std::vector<extTokenId_t> ids;
  ids.reserve(4);
  if (lang != nullptr) {

@@ -122,7 +135,7 @@ OrtxStatus TokenizerImpl::GetDecoderPromptIds(size_t batch_size, const char* lan
    }

    std::string lang_token = "<|" + lang_str->first + "|>";
    ids.push_back(tokenizer_->GetTokenId(lang_token));
    ids.push_back(std::get<bpe_tokenizer_t>(tokenizer_)->GetTokenId(lang_token));
  }

  if (task != nullptr) {

@@ -142,3 +155,5 @@ OrtxStatus TokenizerImpl::GetDecoderPromptIds(size_t batch_size, const char* lan
  t_ids.resize(batch_size, ids);
  return {};
}

}  // namespace ort_extensions
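Note: `GetDecoderPromptIds` builds one Whisper decoder prompt from the language, task, and timestamp special tokens and then replicates it for every batch row via `t_ids.resize(batch_size, ids)`. A rough sketch of that assembly is below; the helper name and the placeholder token ids are assumptions for illustration, not values taken from this commit.

#include <cstdint>
#include <vector>

using extTokenId_t = uint32_t;  // stand-in for the real extTokenId_t typedef

// Hypothetical helper: builds one decoder prompt and copies it batch_size times.
// Token ids are placeholders; real values come from the loaded BPE vocabulary.
std::vector<std::vector<extTokenId_t>> BuildDecoderPrompts(size_t batch_size,
                                                           extTokenId_t lang_id,
                                                           extTokenId_t task_id,
                                                           bool no_timestamps,
                                                           extTokenId_t notimestamps_id) {
  std::vector<extTokenId_t> ids;
  ids.reserve(4);
  ids.push_back(lang_id);            // e.g. <|en|>
  ids.push_back(task_id);            // <|translate|> or <|transcribe|>
  if (no_timestamps) {
    ids.push_back(notimestamps_id);  // <|notimestamps|>
  }
  return std::vector<std::vector<extTokenId_t>>(batch_size, ids);
}

int main() {
  // Placeholder ids purely for illustration.
  auto prompts = BuildDecoderPrompts(/*batch_size=*/2, /*lang_id=*/50259,
                                     /*task_id=*/50359, /*no_timestamps=*/true,
                                     /*notimestamps_id=*/50363);
  return prompts.size() == 2 && prompts[0].size() == 3 ? 0 : 1;
}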
@@ -3,9 +3,11 @@

#pragma once

#include "ortx_tokenizer.h"
#include <variant>

#include "bpe_kernels.h"
#include "bpe_json.hpp"
#include "ugm_kernels.hpp"
#include "bpe_jsoncfg.hpp"
#include "bpe_streaming.hpp"
#include "c_api_utils.hpp"


@@ -17,7 +19,7 @@ class TokenizerImpl : public OrtxObjectImpl {
  virtual ~TokenizerImpl();

 public:
  OrtxStatus Load(const std::string& dir);
  OrtxStatus Load(const std::string& tok_path);

  OrtxStatus Tokenize(const std::vector<std::string_view>& input, std::vector<std::vector<extTokenId_t>>& t_ids) const {
    return BatchEncode(input, t_ids);

@@ -28,7 +30,7 @@ class TokenizerImpl : public OrtxObjectImpl {
  }

  OrtxStatus Token2Id(const std::string& token, extTokenId_t& id) const {
    id = tokenizer_->GetTokenId(token);
    id = std::visit([&](auto& tokenizer) { return tokenizer->GetTokenId(token); }, tokenizer_);
    return {};
  }


@@ -55,9 +57,11 @@ class TokenizerImpl : public OrtxObjectImpl {
                 std::vector<std::vector<extTokenId_t>>& t_ids) const;

 private:
  std::string tokenizer_dir_;
  using bpe_tokenizer_t = std::unique_ptr<JsonFastTokenizer>;
  using ugm_tokenizer_t = std::unique_ptr<SpmUgmTokenizer>;
  std::variant<bpe_tokenizer_t, ugm_tokenizer_t> tokenizer_;

  std::shared_ptr<ort_extensions::bpe::TokenJsonConfig> tok_config_;
  std::unique_ptr<JsonFastTokenizer> tokenizer_;
  std::unique_ptr<BpeStreamingDecoder> detokenizer_;
};
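Note: with `tokenizer_` now a `std::variant` over the BPE and Unigram implementations, callers either dispatch uniformly with `std::visit` or check the active alternative with `std::holds_alternative` / `std::get`, as the tokenizer_impl.cc changes above do. A small self-contained sketch of the same pattern with stand-in types:

#include <iostream>
#include <memory>
#include <string>
#include <variant>

// Stand-ins for JsonFastTokenizer / SpmUgmTokenizer, for illustration only.
struct FakeBpeTokenizer { int GetTokenId(const std::string&) const { return 1; } };
struct FakeUgmTokenizer { int GetTokenId(const std::string&) const { return 2; } };

using bpe_t = std::unique_ptr<FakeBpeTokenizer>;
using ugm_t = std::unique_ptr<FakeUgmTokenizer>;

int main() {
  std::variant<bpe_t, ugm_t> tokenizer = std::make_unique<FakeUgmTokenizer>();

  // Uniform dispatch: works for whichever alternative is held.
  int id = std::visit([](const auto& tok) { return tok->GetTokenId("<s>"); }, tokenizer);
  std::cout << "id = " << id << '\n';

  // Type-specific path: only valid when the BPE alternative is active,
  // mirroring the holds_alternative check in GetDecoderPromptIds.
  if (std::holds_alternative<bpe_t>(tokenizer)) {
    std::cout << std::get<bpe_t>(tokenizer)->GetTokenId("<|translate|>") << '\n';
  } else {
    std::cout << "not a BPE tokenizer\n";
  }
}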
@@ -1,16 +0,0 @@
{
  "added_tokens_decoder": {},
  "auto_map": {
    "AutoTokenizer": [
      "tokenization_phi3_small.Phi3SmallTokenizer",
      "tokenization_phi3_small.Phi3SmallTokenizer"
    ]
  },
  "bos_token": "<|endoftext|>",
  "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
  "clean_up_tokenization_spaces": true,
  "eos_token": "<|endoftext|>",
  "model_max_length": 8192,
  "pad_token": "<|endoftext|>",
  "tokenizer_class": "Phi3SmallTokenizer"
}
File diffs hidden because one or more lines are too long.
@@ -0,0 +1 @@
{"model_max_length": 512}
@@ -0,0 +1,4 @@
{
  "bos_token": "<|endoftext|>",
  "eos_token": "<|endoftext|>"
}
File diff not shown because of its large size.
@ -0,0 +1,782 @@
|
|||
{
|
||||
"added_tokens_decoder": {
|
||||
"100256": {
|
||||
"content": "<|dummy_id_2|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100257": {
|
||||
"content": "<|endoftext|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100258": {
|
||||
"content": "<|fim_prefix|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100259": {
|
||||
"content": "<|fim_middle|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100260": {
|
||||
"content": "<|fim_suffix|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100261": {
|
||||
"content": "<|system|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100262": {
|
||||
"content": "<|user|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100263": {
|
||||
"content": "<|assistant|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100264": {
|
||||
"content": "<|dummy_id_0|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100265": {
|
||||
"content": "<|dummy_id_1|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100266": {
|
||||
"content": "<|end|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100267": {
|
||||
"content": "<|dummy_id_3|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100268": {
|
||||
"content": "<|dummy_id_4|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100269": {
|
||||
"content": "<|dummy_id_5|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100270": {
|
||||
"content": "<|dummy_id_6|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100271": {
|
||||
"content": "<|dummy_id_7|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100272": {
|
||||
"content": "<|dummy_id_8|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100273": {
|
||||
"content": "<|dummy_id_9|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100274": {
|
||||
"content": "<|dummy_id_10|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100275": {
|
||||
"content": "<|dummy_id_11|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100276": {
|
||||
"content": "<|endofprompt|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100277": {
|
||||
"content": "<|dummy_id_12|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100278": {
|
||||
"content": "<|dummy_id_13|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100279": {
|
||||
"content": "<|dummy_id_14|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100280": {
|
||||
"content": "<|dummy_id_15|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100281": {
|
||||
"content": "<|dummy_id_16|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100282": {
|
||||
"content": "<|dummy_id_17|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100283": {
|
||||
"content": "<|dummy_id_18|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100284": {
|
||||
"content": "<|dummy_id_19|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100285": {
|
||||
"content": "<|dummy_id_20|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100286": {
|
||||
"content": "<|dummy_id_21|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100287": {
|
||||
"content": "<|dummy_id_22|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100288": {
|
||||
"content": "<|dummy_id_23|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100289": {
|
||||
"content": "<|dummy_id_24|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100290": {
|
||||
"content": "<|dummy_id_25|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100291": {
|
||||
"content": "<|dummy_id_26|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100292": {
|
||||
"content": "<|dummy_id_27|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100293": {
|
||||
"content": "<|dummy_id_28|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100294": {
|
||||
"content": "<|dummy_id_29|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100295": {
|
||||
"content": "<|dummy_id_30|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100296": {
|
||||
"content": "<|dummy_id_31|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100297": {
|
||||
"content": "<|dummy_id_32|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100298": {
|
||||
"content": "<|dummy_id_33|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100299": {
|
||||
"content": "<|dummy_id_34|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100300": {
|
||||
"content": "<|dummy_id_35|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100301": {
|
||||
"content": "<|dummy_id_36|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100302": {
|
||||
"content": "<|dummy_id_37|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100303": {
|
||||
"content": "<|dummy_id_38|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100304": {
|
||||
"content": "<|dummy_id_39|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100305": {
|
||||
"content": "<|dummy_id_40|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100306": {
|
||||
"content": "<|dummy_id_41|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100307": {
|
||||
"content": "<|dummy_id_42|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100308": {
|
||||
"content": "<|dummy_id_43|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100309": {
|
||||
"content": "<|dummy_id_44|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100310": {
|
||||
"content": "<|dummy_id_45|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100311": {
|
||||
"content": "<|dummy_id_46|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100312": {
|
||||
"content": "<|dummy_id_47|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100313": {
|
||||
"content": "<|dummy_id_48|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100314": {
|
||||
"content": "<|dummy_id_49|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100315": {
|
||||
"content": "<|dummy_id_50|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100316": {
|
||||
"content": "<|dummy_id_51|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100317": {
|
||||
"content": "<|dummy_id_52|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100318": {
|
||||
"content": "<|dummy_id_53|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100319": {
|
||||
"content": "<|dummy_id_54|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100320": {
|
||||
"content": "<|dummy_id_55|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100321": {
|
||||
"content": "<|dummy_id_56|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100322": {
|
||||
"content": "<|dummy_id_57|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100323": {
|
||||
"content": "<|dummy_id_58|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100324": {
|
||||
"content": "<|dummy_id_59|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100325": {
|
||||
"content": "<|dummy_id_60|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100326": {
|
||||
"content": "<|dummy_id_61|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100327": {
|
||||
"content": "<|dummy_id_62|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100328": {
|
||||
"content": "<|dummy_id_63|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100329": {
|
||||
"content": "<|dummy_id_64|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100330": {
|
||||
"content": "<|dummy_id_65|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100331": {
|
||||
"content": "<|dummy_id_66|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100332": {
|
||||
"content": "<|dummy_id_67|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100333": {
|
||||
"content": "<|dummy_id_68|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100334": {
|
||||
"content": "<|dummy_id_69|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100335": {
|
||||
"content": "<|dummy_id_70|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100336": {
|
||||
"content": "<|dummy_id_71|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100337": {
|
||||
"content": "<|dummy_id_72|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100338": {
|
||||
"content": "<|dummy_id_73|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100339": {
|
||||
"content": "<|dummy_id_74|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100340": {
|
||||
"content": "<|dummy_id_75|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100341": {
|
||||
"content": "<|dummy_id_76|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100342": {
|
||||
"content": "<|dummy_id_77|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100343": {
|
||||
"content": "<|dummy_id_78|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100344": {
|
||||
"content": "<|dummy_id_79|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100345": {
|
||||
"content": "<|dummy_id_80|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100346": {
|
||||
"content": "<|dummy_id_81|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100347": {
|
||||
"content": "<|dummy_id_82|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100348": {
|
||||
"content": "<|dummy_id_83|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100349": {
|
||||
"content": "<|dummy_id_84|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100350": {
|
||||
"content": "<|dummy_id_85|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100351": {
|
||||
"content": "<|dummy_id_86|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
}
|
||||
},
|
||||
"bos_token": "<|endoftext|>",
|
||||
"chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}",
|
||||
"clean_up_tokenization_spaces": true,
|
||||
"eos_token": "<|endoftext|>",
|
||||
"model_input_names": [
|
||||
"input_ids",
|
||||
"attention_mask"
|
||||
],
|
||||
"model_max_length": 1000000000000000019884624838656,
|
||||
"tokenizer_class": "PreTrainedTokenizerFast"
|
||||
}
|
|
@@ -0,0 +1,4 @@
{
  "bos_token": "<|endoftext|>",
  "eos_token": "<|endoftext|>"
}
File diff not shown because of its large size.
@ -0,0 +1,782 @@
|
|||
{
|
||||
"added_tokens_decoder": {
|
||||
"100256": {
|
||||
"content": "<|dummy_id_2|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100257": {
|
||||
"content": "<|endoftext|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100258": {
|
||||
"content": "<|fim_prefix|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100259": {
|
||||
"content": "<|fim_middle|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100260": {
|
||||
"content": "<|fim_suffix|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100261": {
|
||||
"content": "<|system|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100262": {
|
||||
"content": "<|user|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100263": {
|
||||
"content": "<|assistant|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100264": {
|
||||
"content": "<|dummy_id_0|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100265": {
|
||||
"content": "<|dummy_id_1|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100266": {
|
||||
"content": "<|end|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100267": {
|
||||
"content": "<|dummy_id_3|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100268": {
|
||||
"content": "<|dummy_id_4|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100269": {
|
||||
"content": "<|dummy_id_5|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100270": {
|
||||
"content": "<|dummy_id_6|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100271": {
|
||||
"content": "<|dummy_id_7|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100272": {
|
||||
"content": "<|dummy_id_8|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100273": {
|
||||
"content": "<|dummy_id_9|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100274": {
|
||||
"content": "<|dummy_id_10|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100275": {
|
||||
"content": "<|dummy_id_11|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100276": {
|
||||
"content": "<|endofprompt|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100277": {
|
||||
"content": "<|dummy_id_12|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100278": {
|
||||
"content": "<|dummy_id_13|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100279": {
|
||||
"content": "<|dummy_id_14|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100280": {
|
||||
"content": "<|dummy_id_15|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100281": {
|
||||
"content": "<|dummy_id_16|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100282": {
|
||||
"content": "<|dummy_id_17|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100283": {
|
||||
"content": "<|dummy_id_18|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100284": {
|
||||
"content": "<|dummy_id_19|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100285": {
|
||||
"content": "<|dummy_id_20|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100286": {
|
||||
"content": "<|dummy_id_21|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100287": {
|
||||
"content": "<|dummy_id_22|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100288": {
|
||||
"content": "<|dummy_id_23|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100289": {
|
||||
"content": "<|dummy_id_24|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100290": {
|
||||
"content": "<|dummy_id_25|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100291": {
|
||||
"content": "<|dummy_id_26|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100292": {
|
||||
"content": "<|dummy_id_27|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100293": {
|
||||
"content": "<|dummy_id_28|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100294": {
|
||||
"content": "<|dummy_id_29|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100295": {
|
||||
"content": "<|dummy_id_30|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100296": {
|
||||
"content": "<|dummy_id_31|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100297": {
|
||||
"content": "<|dummy_id_32|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100298": {
|
||||
"content": "<|dummy_id_33|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100299": {
|
||||
"content": "<|dummy_id_34|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100300": {
|
||||
"content": "<|dummy_id_35|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100301": {
|
||||
"content": "<|dummy_id_36|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100302": {
|
||||
"content": "<|dummy_id_37|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100303": {
|
||||
"content": "<|dummy_id_38|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100304": {
|
||||
"content": "<|dummy_id_39|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100305": {
|
||||
"content": "<|dummy_id_40|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100306": {
|
||||
"content": "<|dummy_id_41|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100307": {
|
||||
"content": "<|dummy_id_42|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100308": {
|
||||
"content": "<|dummy_id_43|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100309": {
|
||||
"content": "<|dummy_id_44|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100310": {
|
||||
"content": "<|dummy_id_45|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100311": {
|
||||
"content": "<|dummy_id_46|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100312": {
|
||||
"content": "<|dummy_id_47|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100313": {
|
||||
"content": "<|dummy_id_48|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100314": {
|
||||
"content": "<|dummy_id_49|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100315": {
|
||||
"content": "<|dummy_id_50|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100316": {
|
||||
"content": "<|dummy_id_51|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100317": {
|
||||
"content": "<|dummy_id_52|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100318": {
|
||||
"content": "<|dummy_id_53|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100319": {
|
||||
"content": "<|dummy_id_54|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100320": {
|
||||
"content": "<|dummy_id_55|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100321": {
|
||||
"content": "<|dummy_id_56|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100322": {
|
||||
"content": "<|dummy_id_57|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100323": {
|
||||
"content": "<|dummy_id_58|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100324": {
|
||||
"content": "<|dummy_id_59|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100325": {
|
||||
"content": "<|dummy_id_60|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100326": {
|
||||
"content": "<|dummy_id_61|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100327": {
|
||||
"content": "<|dummy_id_62|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100328": {
|
||||
"content": "<|dummy_id_63|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100329": {
|
||||
"content": "<|dummy_id_64|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100330": {
|
||||
"content": "<|dummy_id_65|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100331": {
|
||||
"content": "<|dummy_id_66|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100332": {
|
||||
"content": "<|dummy_id_67|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100333": {
|
||||
"content": "<|dummy_id_68|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100334": {
|
||||
"content": "<|dummy_id_69|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100335": {
|
||||
"content": "<|dummy_id_70|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100336": {
|
||||
"content": "<|dummy_id_71|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100337": {
|
||||
"content": "<|dummy_id_72|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100338": {
|
||||
"content": "<|dummy_id_73|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100339": {
|
||||
"content": "<|dummy_id_74|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100340": {
|
||||
"content": "<|dummy_id_75|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100341": {
|
||||
"content": "<|dummy_id_76|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100342": {
|
||||
"content": "<|dummy_id_77|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100343": {
|
||||
"content": "<|dummy_id_78|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100344": {
|
||||
"content": "<|dummy_id_79|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100345": {
|
||||
"content": "<|dummy_id_80|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100346": {
|
||||
"content": "<|dummy_id_81|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100347": {
|
||||
"content": "<|dummy_id_82|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100348": {
|
||||
"content": "<|dummy_id_83|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100349": {
|
||||
"content": "<|dummy_id_84|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100350": {
|
||||
"content": "<|dummy_id_85|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"100351": {
|
||||
"content": "<|dummy_id_86|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
}
|
||||
},
|
||||
"bos_token": "<|endoftext|>",
|
||||
"chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}",
|
||||
"clean_up_tokenization_spaces": true,
|
||||
"eos_token": "<|endoftext|>",
|
||||
"model_input_names": [
|
||||
"input_ids",
|
||||
"attention_mask"
|
||||
],
|
||||
"model_max_length": 1000000000000000019884624838656,
|
||||
"tokenizer_class": "PreTrainedTokenizerFast"
|
||||
}
|
|
@@ -74,4 +74,4 @@
  "special": true
}
]
}
}
@@ -51,10 +51,10 @@ TEST(CApiTest, StreamApiTest) {
  for (size_t i = 0; i < sizeof(token_ids) / sizeof(token_ids[0]); i++) {
    const char* token = NULL;
    err = OrtxDetokenizeCached(tokenizer, detok_cache, token_ids[i], &token);
    EXPECT_EQ(err, kOrtxOK);
#ifdef _DEBUG
    std::cout << token;
#endif
    EXPECT_EQ(err, kOrtxOK);
  }

#ifdef _DEBUG
@@ -136,13 +136,9 @@ TEST(OrtxTokenizerTest, TicTokenTokenizer) {
  EXPECT_EQ(out_text[0], input[0]);
}

TEST(OrtxTokenizerTest, Phi3_S_Tokenizer) {
  if (!std::filesystem::exists("data2/phi-3-small")) {
    GTEST_SKIP() << "Skip test as extra test data is not deployed.";
  }

TEST(OrtxTokenizerTest, Phi3_Small_Hf_Tokenizer) {
  auto tokenizer = std::make_unique<ort_extensions::TokenizerImpl>();
  auto status = tokenizer->Load("data2/phi-3-small");
  auto status = tokenizer->Load("data/tokenizer/phi-3-small-cvt");
  if (!status.IsOk()) {
    std::cout << status.ToString() << std::endl;
  }
@@ -171,7 +167,7 @@ TEST(OrtxTokenizerTest, Phi3_S_Tokenizer) {

TEST(OrtxTokenizerTest, Phi3_Small_Tokenizer) {
  auto tokenizer = std::make_unique<ort_extensions::TokenizerImpl>();
  auto status = tokenizer->Load("data/phi-3-small");
  auto status = tokenizer->Load("data/tokenizer/phi-3-small");
  if (!status.IsOk()) {
    std::cout << status.ToString() << std::endl;
  }
@@ -399,7 +395,9 @@ TEST(OrtxTokenizerStreamTest, Phi3Tokenizer) {
  EXPECT_TRUE(tokenizer != nullptr);

  std::vector<std::string_view> input = {
      R"(こんにちは。データ分析にはいくつかのステップがあります。まずは目的を明確にします。次に、データを収集し、クリーニングを行います。その後、データを構造化し、その後、データを分析します。これらのステップを実行することで、データを有意的に分析することができます。)"};
      R"(こんにちは。データ分析にはいくつかのステップがあります。まずは目的を明確にします。次に、データを収集し、クリーニングを行います。)"
      R"(その後、データを構造化し、その後、データを分析します。これらのステップを実行することで、データを有意的に分析することができます。)"
  };
  std::vector<std::vector<extTokenId_t>> token_ids;
  status = tokenizer->Tokenize(input, token_ids);
  EXPECT_TRUE(status.IsOk());
@@ -446,3 +444,24 @@ TEST(OrtxTokenizerTest, WhisperTokenizer) {
  EXPECT_EQ(err, kOrtxOK);
  EXPECT_EQ(sot_id, 50258);
}

TEST(OrtxTokenizerTest, SpmUgmTokenizer) {
  // test the xlm-roberta-base Unigram (SPM UGM) tokenizer, loaded natively instead of through the sentencepiece wrapper.
  OrtxObjectPtr<OrtxTokenizer> tokenizer(OrtxCreateTokenizer, "data/tokenizer/fairseq/xlm-roberta-base");
  EXPECT_EQ(tokenizer.Code(), kOrtxOK);

  const char* input[] = {"I like walking my cute dog\n and\x17 then, 生活的真谛是 \t\t\t\t \n\n61"};
  OrtxObjectPtr<OrtxTokenId2DArray> token_ids;
  OrtxTokenize(tokenizer.get(), input, 1, ort_extensions::ptr(token_ids));
  EXPECT_EQ(token_ids.Code(), kOrtxOK);

  size_t length = 0;
  const extTokenId_t* ids = nullptr;
  OrtxTokenId2DArrayGetItem(token_ids.get(), 0, &ids, &length);
  std::vector<extTokenId_t> ids_vec(ids, ids + length);

  // the expected ids were generated with:
  // AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
  EXPECT_EQ(ids_vec, std::vector<extTokenId_t>({
      0, 87, 1884, 122395, 759, 99942, 10269, 136, 7068, 4, 6, 62668, 5364, 245875, 354, 11716, 2}));
}