Optimize the tokenizer for efficiency (#797)

* optimize the tokenizer for efficiency

* fix the unit test failures.

* fix the api test case failures

* removed the unused code.

* More test case fixes

* One more fix

* fix macOS build issues

* refine the test

* add more diagnostic info.

* fix unit test in CI Linux

* fix the pp_api test failure
Wenbing Li 2024-08-27 18:57:50 -07:00 committed by GitHub
Parent 2d044adbf9
Commit 2d02a687be
No key found matching this signature
GPG key ID: B5690EEEBB952194
11 changed files: 91 additions and 78 deletions

View File

@ -198,14 +198,14 @@ stages:
- bash: |
set -e -x -u
./build.sh -DOCOS_ENABLE_C_API=ON
cd out/Linux
cd out/Linux/RelWithDebInfo
ctest -C RelWithDebInfo --output-on-failure
displayName: Build ort-extensions with API enabled and run tests
- bash: |
set -e -x -u
./build.sh -DOCOS_BUILD_PRESET=token_api_only -DOCOS_BUILD_SHARED_LIB=OFF
cd out/Linux
cd out/Linux/RelWithDebInfo
ctest -C RelWithDebInfo --output-on-failure
displayName: Build ort-extensions with tokenizer API only enabled and run tests

View File

@ -16,7 +16,7 @@ The package contains all custom operators and some Python scripts to manipulate
- no-azure: disable AzureOp kernel build in Python package.
- no-opencv: disable operators based on OpenCV in build.
- cc-debug: generate debug info for extensions binaries and disable C/C++ compiler optimization.
- pp_api: enable pre-processing C ABI Python wrapper, `from onnxruntime_extensions.pp_api import *`
- pp-api: enable pre-processing C ABI Python wrapper, `from onnxruntime_extensions.pp_api import *`
- cuda-archs: specify the CUDA architectures (e.g. 70, 85); multiple values can be combined with semicolons. The default is the nvidia-smi output for GPU-0
- ort\_pkg\_dir: specify the ONNXRuntime package directory the extension project depends on. This is helpful if you want to use a recent ONNXRuntime function that has not yet appeared in an official build
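The `pp-api` option above gates the pre-processing C ABI wrapper; note that the wrapper's own ImportError (see the pp_api.py change below) still spells the flag as --ortx-user-option=pp_api. A minimal way to check whether a locally built package has it enabled, as a sketch that is not part of the commit:

# Probe an installed build for the pre-processing wrapper (sketch).
try:
    from onnxruntime_extensions import pp_api  # noqa: F401
    print("pp_api wrapper is available")
except ImportError as err:
    # Raised when the package was built without the pp-api / pp_api option.
    print("pp_api wrapper not built:", err)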

View File

@ -48,8 +48,9 @@ class HFTokenizerConverter(CustomOpConverter):
model_dir = hf_tokenizer.name_or_path
else:
model_dir = os.path.dirname(vocab_file)
tokenizer_json = json.load(
open(os.path.join(model_dir, tokenizer_file), "r", encoding="utf-8"))
f = open(os.path.join(model_dir, tokenizer_file), "r", encoding="utf-8")
tokenizer_json = json.load(f)
f.close()
# get vocab object from json file
vocab = tokenizer_json.get("model", {}).get("vocab", {})
sorted_merges = tokenizer_json.get("model", {}).get("merges", [])
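The committed change above opens and closes the tokenizer JSON explicitly; an equivalent, more defensive variant (a sketch, not what the commit does) would use a context manager so the handle is also released if json.load raises:

import json
import os

model_dir = "."                     # placeholder; the converter derives this from the HF tokenizer
tokenizer_file = "tokenizer.json"   # placeholder for the tokenizer file name

# Load the tokenizer JSON with a context manager (sketch).
with open(os.path.join(model_dir, tokenizer_file), "r", encoding="utf-8") as f:
    tokenizer_json = json.load(f)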

View File

@ -7,7 +7,7 @@ import os
from . import _extensions_pydll as _C
if not hasattr(_C, "delete_object"):
raise ImportError(
"onnxruntime_extensions is not built with pre-processing C API"
"onnxruntime_extensions is not built with pre-processing C API\n"
"To enable it, please build the package with --ortx-user-option=pp_api")
create_processor = _C.create_processor
@ -24,6 +24,7 @@ delete_object = _C.delete_object
class Tokenizer:
def __init__(self, tokenizer_dir):
self.tokenizer = None
if os.path.isdir(tokenizer_dir):
self.tokenizer = create_tokenizer(tokenizer_dir)
else:
@ -41,7 +42,8 @@ class Tokenizer:
f"Downloaded HF file '{resolved_full_file}' cannot be found")
if (os.path.dirname(resolved_full_file) != os.path.dirname(resolved_config_file)):
raise FileNotFoundError(
f"Downloaded HF files '{resolved_full_file}' and '{resolved_config_file}' are not in the same directory")
f"Downloaded HF files '{resolved_full_file}' "
f"and '{resolved_config_file}' are not in the same directory")
tokenizer_dir = os.path.dirname(resolved_full_file)
self.tokenizer = create_tokenizer(tokenizer_dir)
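Initializing self.tokenizer to None up front presumably keeps the attribute defined even if one of the later branches raises. Constructing the wrapper from either source then looks like this (a sketch; the model id is only an illustration):

from onnxruntime_extensions.pp_api import Tokenizer

# A local folder containing the tokenizer files takes the isdir branch.
tok_local = Tokenizer("/path/to/tokenizer_dir")

# Anything else goes through the Hugging Face download path shown above.
tok_hub = Tokenizer("microsoft/Phi-3-mini-128k-instruct")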

View File

@ -106,6 +106,7 @@ ustring RemoveConsecutiveSpaces(const ustring& input) {
KernelBpeTokenizer::KernelBpeTokenizer(const BpeModelConf& conf)
: bpe_conf_(conf) {
model_name_ = conf.name_ == nullptr ? "" : conf.name_;
CreateUnicodeByteEncoder();
};
OrtStatusPtr KernelBpeTokenizer::OnModelAttach(const OrtApi& api, const OrtKernelInfo& info) {
@ -175,12 +176,28 @@ uint32_t KernelBpeTokenizer::GetTokenId(const std::string& token) const {
return bbpe_tokenizer_->GetTokenId(token);
}
/*
Read more here: https://github.com/huggingface/transformers/blob/60bb571e993b7d73257fb64044726b569fef9403/src/transformers/convert_slow_tokenizer.py#L1454
Note: this is similar to the BPE CreateByteEncoder, however for decoding the .tiktoken bytes
we need to store the strings rather than their IDs, and thereby need a separate map.
*/
void KernelBpeTokenizer::CreateUnicodeByteEncoder() {
char32_t index = 256;
for (char32_t i = 0; i < 256; ++i) {
if ((i >= 0 && i < 33) || (i >= 127 && i < 161) || (i == 173)) {
unicode_byte_encoder_[i] = ustring::EncodeUTF8Char(index++);
} else {
unicode_byte_encoder_[i] = ustring::EncodeUTF8Char(i);
}
}
}
std::vector<int64_t> KernelBpeTokenizer::Tokenize(ustring& input,
int64_t max_length,
bool compute_offset_mapping,
std::list<OffsetMappingType>& offset_map) const {
std::vector<int64_t> res;
std::list<std::pair<uint32_t, uint32_t>> byte_list;
bool clean_up_spaces = false;
if (ModelName() == kModel_CLIP) {
@ -191,10 +208,10 @@ std::vector<int64_t> KernelBpeTokenizer::Tokenize(ustring& input,
text = text.strip()
*/
ustring str = RemoveConsecutiveSpaces(input);
if (IsUnicodeSpace(str.front())) {
if (!str.empty() && IsUnicodeSpace(str.front())) {
str.erase(str.begin());
}
if (IsUnicodeSpace(str.back())) {
if (!str.empty() && IsUnicodeSpace(str.back())) {
str.pop_back();
}
// remove newlines as CLIP ignores them (treats them as whitespace which is then cleaned)
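The added !str.empty() guards keep front()/back() from being called on a string that becomes empty after the whitespace collapse. The intent, roughly, in Python terms (a sketch, not the kernel code):

def clip_clean(text: str) -> str:
    # Collapse runs of whitespace and drop leading/trailing whitespace; this is
    # a harmless no-op on an empty string, which is what the new guards ensure.
    return " ".join(text.split())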
@ -274,24 +291,43 @@ std::vector<int64_t> KernelBpeTokenizer::Tokenize(ustring& input,
}
}
// Get byte encodings prior to performing BPE
byte_list.clear();
std::list<std::pair<uint32_t, uint32_t>> byte_list;
std::string token_bytes;
token_bytes.reserve(utf8_token.size() * 2);
size_t token_len = utf8_token.length();
size_t end_diff = 0;
if (clean_up_spaces) {
// Whitespace clean
utf8_token.erase(std::remove(utf8_token.begin(), utf8_token.end(), U' '), utf8_token.end());
token_len = utf8_token.length() - 1;
}
for (int i = 0; i < utf8_token.length(); i++) {
if (i == utf8_token.length() - 1) {
std::string boundary(1, utf8_token[i]);
byte_list.push_back(std::make_pair(bbpe_tokenizer_->GetTokenId(boundary + "</w>"), 1));
} else {
byte_list.push_back(std::make_pair(bbpe_tokenizer_->ByteEncoder()[static_cast<unsigned char>(utf8_token[i])], 1));
}
for (size_t i = 0; i < token_len; i++) {
token_bytes += unicode_byte_encoder_[static_cast<unsigned char>(utf8_token[i])];
}
if (clean_up_spaces) {
end_diff = token_bytes.length();
if (!utf8_token.empty()) {
token_bytes += unicode_byte_encoder_[static_cast<unsigned char>(utf8_token.back())];
token_bytes += "</w>";
}
end_diff = token_bytes.length() - end_diff;
}
auto id = bbpe_tokenizer_->GetTokenId(token_bytes);
if (id != bpe::kInvalidTokenId) {
byte_list.push_back(std::make_pair(id, ort_extensions::narrow<uint32_t>(utf8_token.size())));
} else {
for (char& cp : utf8_token) {
byte_list.push_back(std::make_pair(bbpe_tokenizer_->ByteEncoder()[static_cast<unsigned char>(cp)], 1));
token_len = token_bytes.length();
for (size_t i = 0; i < token_len - end_diff; /* i++ */) {
size_t j = ustring::UTF8Len(token_bytes[i]);
byte_list.push_back(std::make_pair(bbpe_tokenizer_->GetTokenId(token_bytes.substr(i, j)), ort_extensions::narrow<uint32_t>(j)));
i += j;
}
if (end_diff > 0) {
byte_list.push_back(std::make_pair(
bbpe_tokenizer_->GetTokenId(token_bytes.substr(token_len - end_diff, end_diff)), ort_extensions::narrow<uint32_t>(end_diff)));
}
}
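The hunk above is the core of the speed-up: each token's UTF-8 bytes are first mapped through the GPT-2 style byte-to-unicode table built by CreateUnicodeByteEncoder, the whole mapped string is looked up once in the vocabulary, and only on a miss does the code fall back to per-character lookups (plus the </w> suffix handling when clean_up_spaces is set for CLIP). A rough Python sketch of the same idea, assuming a plain dict vocabulary (vocab and token are placeholders):

def build_byte_to_unicode():
    # Printable bytes map to themselves; the rest are shifted to 256 and up,
    # mirroring CreateUnicodeByteEncoder above (and HF convert_slow_tokenizer).
    table, shifted = {}, 256
    for b in range(256):
        if b < 33 or 127 <= b < 161 or b == 173:
            table[b] = chr(shifted)
            shifted += 1
        else:
            table[b] = chr(b)
    return table

BYTE_TO_UNICODE = build_byte_to_unicode()

def encode_token(token: str, vocab: dict) -> list:
    # Map the token's raw UTF-8 bytes into the alphabet the vocabulary uses.
    mapped = "".join(BYTE_TO_UNICODE[b] for b in token.encode("utf-8"))
    tid = vocab.get(mapped)
    if tid is not None:
        return [tid]                      # fast path: one lookup per token
    return [vocab[ch] for ch in mapped]   # slow path: one lookup per character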
@ -343,7 +379,6 @@ std::vector<int64_t> KernelBpeTokenizer::SpmTokenize(ustring& input,
bool compute_offset_mapping,
std::list<OffsetMappingType>& offset_map) const {
std::vector<int64_t> res;
std::list<std::pair<uint32_t, uint32_t>> byte_list;
// Add BOS token to result
res.push_back(bos_token_id_);
@ -379,7 +414,7 @@ std::vector<int64_t> KernelBpeTokenizer::SpmTokenize(ustring& input,
}
// Get byte encodings prior to performing BPE
byte_list.clear();
std::list<std::pair<uint32_t, uint32_t>> byte_list;
while (res.size() < max_length && char_pos < ustr.length()) {
auto chr = ustr[char_pos];
@ -559,23 +594,6 @@ SpmTokenizer::SpmTokenizer()
JsonFastTokenizer::JsonFastTokenizer() : KernelBpeTokenizer(kGPT2Configuration) {}
/*
Read more here: https://github.com/huggingface/transformers/blob/60bb571e993b7d73257fb64044726b569fef9403/src/transformers/convert_slow_tokenizer.py#L1454
Note: this is similar to the BPE CreateByteEncoder, however for decoding the .tiktoken bytes
we need to store the strings rather than their IDs, and thereby need a separate map.
*/
void JsonFastTokenizer::CreateUnicodeByteEncoder() {
char32_t index = 256;
for (char32_t i = 0; i < 256; ++i) {
if ((i >= 0 && i < 33) || (i >= 127 && i < 161) || (i == 173)) {
unicode_byte_encoder_[i] = ustring::EncodeUTF8Char(index++);
} else {
unicode_byte_encoder_[i] = ustring::EncodeUTF8Char(i);
}
}
}
std::string JsonFastTokenizer::TokenBytesToString(std::vector<uint8_t>& bytes) {
std::string result;
for (auto c : bytes) {
@ -647,7 +665,6 @@ OrtxStatus JsonFastTokenizer::Load(const ort_extensions::bpe::TokenJsonConfig& c
std::vector<std::tuple<std::vector<uint8_t>, std::vector<uint8_t>, uint32_t>> byte_merges;
bbpe_tokenizer_ = std::make_unique<BpeModel>();
JsonFastTokenizer::CreateUnicodeByteEncoder();
for (const auto& item : bpe_ranks) {
std::vector<uint8_t> token = item.first;
@ -714,13 +731,19 @@ OrtxStatus JsonFastTokenizer::Load(const ort_extensions::bpe::TokenJsonConfig& c
module_ifs >> tok_json;
} else {
ifs >> tok_json;
// doesn't work for json with nested objects
// auto decoders_node = tok_json.find("/decoder/decoders"_json_pointer);
auto decoders_node = tok_json.find("decoder");
if (decoders_node != tok_json.end()) {
decoders_node = decoders_node->find("decoders");
bool has_decoders_node = false;
auto decoders_node = tok_json.end();
auto decoder_node = tok_json.find("decoder");
if (decoder_node != tok_json.end()) {
decoders_node = decoder_node->find("decoders");
if (decoders_node != decoder_node->end()) {
has_decoders_node = true;
}
}
if (decoders_node->is_array()) {
if (has_decoders_node && decoders_node->is_array()) {
for(auto step = decoders_node->begin(); step != decoders_node->end(); ++step) {
std::string type = step->value("type", "");
if (type == "Replace") {
@ -742,7 +765,6 @@ OrtxStatus JsonFastTokenizer::Load(const ort_extensions::bpe::TokenJsonConfig& c
bpe_conf_.get().GetSpecialTokens().c_str(),
bpe_conf_.get().spm_model_);
}
auto added_tokens = tok_json.find("added_tokens");
if (added_tokens != tok_json.end()) {
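The reworked JSON handling above only dereferences decoders_node after both the decoder object and its nested decoders array have been found; as the retained comment notes, the json_pointer lookup did not work for nested objects. In Python terms the guard amounts to the following sketch (tok_json stands in for the parsed tokenizer.json):

tok_json = {}  # placeholder for the parsed tokenizer.json contents
decoders = tok_json.get("decoder", {}).get("decoders")
if isinstance(decoders, list):
    for step in decoders:
        if step.get("type") == "Replace":
            pass  # handle the Replace decoder step, as in the C++ loop above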

View File

@ -48,6 +48,8 @@ struct KernelBpeTokenizer {
bool compute_offset_mapping,
std::list<OffsetMappingType>& offset_map) const;
void CreateUnicodeByteEncoder();
protected:
std::reference_wrapper<BpeModelConf const> bpe_conf_;
std::string model_name_;
@ -60,6 +62,7 @@ struct KernelBpeTokenizer {
std::optional<bool> add_bos_token_;
std::optional<bool> add_eos_token_;
std::string unicode_byte_encoder_[256] = {};
};
struct GPT2Tokenizer : KernelBpeTokenizer {
@ -122,10 +125,8 @@ class JsonFastTokenizer : public KernelBpeTokenizer {
bool tiktoken_ = false;
private:
void CreateUnicodeByteEncoder();
std::string TokenBytesToString(std::vector<uint8_t>& bytes);
BpeModelConf json_conf_;
std::vector<ort_extensions::bpe::AddedToken> added_tokens_;
std::string unicode_byte_encoder_[256] = {};
};

View File

@ -225,6 +225,7 @@ class BpeStreamingDecoder : public KernelBpeDecoder {
ptrdiff_t z = ustring::ValidateUTF8(text);
if (z <= 0) {
text = text.substr(0, -z);
text += "\ufffd"; // bad utf-8 string
}
decoded_strings.emplace_back(std::move(text));
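When ValidateUTF8 reports a bad suffix, the decoder now keeps the valid prefix (substr(0, -z)) and appends U+FFFD instead of returning a broken byte sequence. A rough Python analogue (a sketch, not the C++ implementation):

def decode_utf8_lossy_tail(raw: bytes) -> str:
    # Keep the longest valid UTF-8 prefix and mark the truncated tail with the
    # replacement character, mirroring the new fallback above.
    try:
        return raw.decode("utf-8")
    except UnicodeDecodeError as err:
        return raw[:err.start].decode("utf-8") + "\ufffd"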

View File

@ -61,8 +61,6 @@ class BpeModel {
if (spm_converted) {
UpdateSpmByteToken(vocab_map_);
} else {
CreateByteEncoder();
}
uint32_t index = 0;
@ -142,8 +140,6 @@ class BpeModel {
if (spm_converted) {
UpdateSpmByteToken(vocab_map_);
} else {
CreateByteEncoder();
}
uint32_t index = 0;
@ -196,8 +192,6 @@ class BpeModel {
if (spm_converted) {
UpdateSpmByteToken(vocab_map_);
} else {
CreateByteEncoder();
}
uint32_t index = 0;
@ -336,8 +330,6 @@ class BpeModel {
}
}
const auto& ByteEncoder() const { return byte_encoder_; }
uint32_t GetTokenId(const std::string& key) const {
auto it = vocab_map_.find(key);
if (it != vocab_map_.end()) {
@ -370,27 +362,10 @@ class BpeModel {
return (static_cast<uint64_t>(i1) << 32) | (i0 & 0xFFFFFFFFLL);
}
void CreateByteEncoder() {
char32_t index = 256;
for (char32_t i = 0; i < 256; ++i) {
/*
bs = (
list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
)
*/
if ((i >= 0 && i < 33) || (i >= 127 && i < 161) || (i == 173)) {
byte_encoder_[i] = GetTokenId(ustring::EncodeUTF8Char(index++));
} else {
byte_encoder_[i] = GetTokenId(ustring::EncodeUTF8Char(i));
}
}
}
private:
std::string end_of_word_suffix_;
std::map<uint64_t, BpeNode> bpe_rank_;
uint32_t byte_encoder_[256] = {};
std::unordered_map<std::string, uint32_t> vocab_map_;
std::vector<std::string> id2token_map_;

View File

@ -72,7 +72,7 @@ OrtxStatus TokenizerImpl::BatchDecode(const std::vector<span<extTokenId_t const>
if (!status.IsOk()) {
return status;
}
t_text.emplace_back(ts_output.AsScalar());
t_text.push_back(ts_output.AsScalar());
}
return {};
}

View File

@ -290,6 +290,16 @@ TEST(OrtxTokenizerTest, CodeGenTokenizer) {
EXPECT_TRUE(status.IsOk());
// std::cout << out_text[0] << std::endl;
EXPECT_EQ(out_text[0], input[0]);
// 252 and the following ids cannot be decoded as a valid utf-8 string
std::vector<extTokenId_t> invalid_token_ids_span = {14675, 8466, 705, 252, 538, 5374, 82, 329, 4554};
std::vector<std::string> out_text1;
status = tokenizer->Detokenize({ort_extensions::span<const extTokenId_t>(invalid_token_ids_span)}, out_text1);
EXPECT_TRUE(status.IsOk());
EXPECT_EQ(out_text1.size(), 1);
std::string out_text_ref = out_text1.back();
std::cout << out_text_ref << std::endl;
EXPECT_EQ(out_text_ref.substr(out_text_ref.length() - 3, 3), "\ufffd");
}
TEST(OrtxTokenizerStreamTest, CodeGenTokenizer) {

View File

@ -32,7 +32,7 @@ class TestAutoTokenizer(unittest.TestCase):
def test_phi_3_mini(self):
tokenizer = AutoTokenizer.from_pretrained(
"microsoft/Phi-3-mini-128k-instruct", use_fast=True)
"microsoft/Phi-3-mini-128k-instruct", use_fast=True, add_bos_token=True, add_eos_token=False)
text = ["what are you? \n 给 weiss ich, über was los ist \n",
"@? \n was los ist \n",
"Qué dijiste? \n über 给 ば was los ist im Mannschaft ц \n",
@ -44,9 +44,10 @@ class TestAutoTokenizer(unittest.TestCase):
expected_ids = tokenizer.encode(text[n], return_tensors="np")
try:
np.testing.assert_array_equal(
expected_ids[0], actual_ids[n][1:expected_ids.shape[1] + 1])
# skip the padding tokens in the ort output
expected_ids[0], actual_ids[n][:expected_ids.shape[1]])
except AssertionError:
print("index is ", n)
print("the failed sentence index is ", n)
raise