Fix ::tolower error when locale is not 'C' (#174)

* Add test and implement ToLower

* Fix locale

* Fix locale

Co-authored-by: Ze Tao <zetao@microsoft.com>
Mojimi 2021-10-21 11:59:29 +08:00 committed by GitHub
Parent 448518534c
Commit 46d096f1af
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
10 changed files: 58232 additions and 14 deletions
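
Background on the bug: `std::tolower` from `<cctype>` is locale-sensitive, and its argument must be representable as `unsigned char` (or be EOF); the tokenizers were passing raw `char32_t` code points to it. Under the "C" locale many runtimes happen to tolerate that, but once the global locale changes the call is undefined behavior. A minimal sketch of the failure mode (illustrative only, not part of this commit):

```cpp
#include <cctype>
#include <clocale>

int main() {
  std::setlocale(LC_ALL, "");  // any locale other than "C"

  char32_t c = U'再';  // code point 0x518D, far outside unsigned char

  // Undefined behavior: std::tolower's argument must fit in unsigned
  // char or be EOF. Some CRTs return out-of-range values unchanged in
  // the "C" locale, but after the locale switch MSVC's debug runtime
  // can assert here and other builds can read past the case table.
  int r = std::tolower(static_cast<int>(c));
  (void)r;
  return 0;
}
```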

View File

@@ -98,10 +98,23 @@ bool IsAccent(char32_t c)
return c >= 0x300 && c <= 0x36F;
}
// only supports Latin for now
char32_t ToLower(char32_t c) {
  if ((c >= 'A') && (c <= 'Z')) {
    return c + 'a' - 'A';
  }
  // Latin-1 Supplement À..Þ lowercases at a +0x20 offset; exclude ×
  // (U+00D7), which falls in this range but has no lowercase form.
  if ((c >= U'À') && (c <= U'Þ') && (c != U'×')) {
    return c + U'à' - U'À';
  }
  return c;
}
char32_t StripAccent(char32_t c)
{
// "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
-const char* tr = "AAAAAAÆCEEEEIIIIÐNOOOOO×ØUUUUYÞßaaaaaaæceeeeiiiiðnooooo÷øuuuuyþy";
+const char32_t* tr = U"AAAAAAÆCEEEEIIIIÐNOOOOO×ØUUUUYÞßaaaaaaæceeeeiiiiðnooooo÷øuuuuyþy";
if (c < 192 || c > 255) {
return c;
}
@@ -177,4 +190,5 @@ uint64_t Hash64Fast(const char* data, size_t n) {
return static_cast<int64_t>(util::Fingerprint64(data, n));
}
#endif // ENABLE_TF_STRING
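
For reference, the new ToLower lowercases ASCII and the Latin-1 Supplement block À..Þ (both at a +0x20 offset) and passes everything else through. A quick sanity check, assuming the function above is linked in:

```cpp
#include <cassert>

char32_t ToLower(char32_t c);  // from the string-utils file above

int main() {
  assert(ToLower(U'A') == U'a');    // ASCII: +0x20
  assert(ToLower(U'À') == U'à');    // Latin-1 Supplement: same offset
  assert(ToLower(U'×') == U'×');    // multiplication sign has no lowercase form
  assert(ToLower(U'ß') == U'ß');    // already lowercase, unchanged
  assert(ToLower(U'再') == U'再');  // non-Latin code points pass through
  return 0;
}
```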

View File

@@ -65,6 +65,8 @@ bool IsPunct(char32_t c);
bool IsControl(char32_t c);
char32_t ToLower(char32_t c);
char32_t StripAccent(char32_t c);
uint64_t Hash64(const char* data, size_t n, uint64_t seed);

View File

@@ -17,7 +17,7 @@ void KernelStringLower::Compute(OrtKernelContext* context) {
GetTensorMutableDataString(api_, ort_, context, input_X, X);
for (int64_t i = 0; i < (int64_t)X.size(); ++i) {
-std::transform(X[i].begin(), X[i].end(), X[i].begin(), ::tolower);
+std::transform(X[i].begin(), X[i].end(), X[i].begin(), ToLower);
}
OrtTensorDimensions dimensions(ort_, input_X);
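
Note that `X[i]` is a UTF-8 `std::string`, so `ToLower` now runs per byte rather than per code point. Where `char` is signed (the common default), bytes at or above 0x80 convert to huge `char32_t` values that miss both ranges, so this kernel lowercases ASCII and leaves multibyte UTF-8 sequences intact; full Latin-1 handling happens where `ToLower` sees decoded code points, as in BasicTokenizer below. A sketch of the per-byte behavior, assuming signed `char`:

```cpp
#include <algorithm>
#include <string>

char32_t ToLower(char32_t c);  // from string-utils

int main() {
  std::string s = "H\xC3\xA9llo";  // the UTF-8 bytes of "Héllo"
  // Same call shape as the kernel. With signed char, 0xC3 and 0xA9 are
  // negative, convert to very large char32_t values, and fall through.
  std::transform(s.begin(), s.end(), s.begin(), ToLower);
  // s == "h\xC3\xA9llo": only the ASCII 'H' changed.
  return 0;
}
```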

View File

@@ -38,7 +38,7 @@ std::vector<ustring> BasicTokenizer::Tokenize(ustring text) {
if (do_lower_case_) {
for (auto& c : text) {
-c = ::tolower(c);
+c = ToLower(c);
}
}
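
Here, by contrast, `text` has already been decoded into a `ustring` of `char32_t` code points, so the Latin-1 branch takes effect and the old `::tolower(c)` call, undefined for such values, is gone. Roughly, assuming only the `ToLower` above (with `std::u32string` standing in for `ustring`):

```cpp
#include <string>

char32_t ToLower(char32_t c);  // from string-utils

int main() {
  std::u32string text = U"Àpple";  // stand-in for the decoded ustring
  for (auto& c : text) {
    c = ToLower(c);  // locale-independent, safe for any code point
  }
  // text == U"àpple"
  return 0;
}
```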

View File

@@ -276,16 +276,11 @@ void KernelBertTokenizer::Compute(OrtKernelContext* context) {
std::vector<int64_t> input_ids;
std::vector<int64_t> token_type_ids;
-if (input_data.size() == 1 || input_data[1].empty()) {
+if (input_data.size() == 1) {
std::vector<int64_t> encode = tokenizer_->Encode(tokenizer_->Tokenize(ustring(input_data[0])));
truncate_->Truncate(encode, (max_length_ > 0 && max_length_ <= 2) ? 0 : max_length_ - 2);
input_ids = tokenizer_->AddSpecialToken(encode);
token_type_ids = tokenizer_->GenerateTypeId(encode);
-} else if (input_data[0].empty()) {
-std::vector<int64_t> encode = tokenizer_->Encode(tokenizer_->Tokenize(ustring(input_data[1])));
-truncate_->Truncate(encode, (max_length_ > 0 && max_length_ <= 2) ? 0 : max_length_ - 2);
-input_ids = tokenizer_->AddSpecialToken(encode);
-token_type_ids = tokenizer_->GenerateTypeId(encode);
} else {
std::vector<int64_t> encode1 = tokenizer_->Encode(tokenizer_->Tokenize(ustring(input_data[0])));
std::vector<int64_t> encode2 = tokenizer_->Encode(tokenizer_->Tokenize(ustring(input_data[1])));
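
The empty-string special cases are removed because the pair branch already covers them: an empty side contributes no tokens between the separators. The expectations added to the tests below encode exactly that:

```cpp
// For inputs {"a", ""} the pair branch produces
//   input_ids      = [CLS] 'a' [SEP] [SEP] = {101, 170, 102, 102}
//   token_type_ids =   0    0    0     1   = {0, 0, 0, 1}
// and for {"", "a"}
//   input_ids      = [CLS] [SEP] 'a' [SEP] = {101, 102, 170, 102}
//   token_type_ids =   0     0    1    1   = {0, 0, 1, 1}
```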

View File

File diff suppressed because it is too large.

View File

@@ -8,7 +8,7 @@
#include "text/string_lower.hpp"
-TEST(utils, test_string_lower) {
+TEST(string_operator, test_string_lower) {
auto ort_env = std::make_unique<Ort::Env>(ORT_LOGGING_LEVEL_WARNING, "Default");
std::vector<TestValue> inputs(1);
@@ -32,7 +32,7 @@ TEST(utils, test_string_lower) {
}
-TEST(utils, test_regex_split_with_offsets) {
+TEST(string_operator, test_regex_split_with_offsets) {
auto ort_env = std::make_unique<Ort::Env>(ORT_LOGGING_LEVEL_WARNING, "Default");
std::vector<TestValue> inputs(1);

View File

@@ -2,11 +2,12 @@
// Licensed under the MIT License.
#include <filesystem>
+#include <locale>
#include "gtest/gtest.h"
#include "ocos.h"
#include "test_kernel.hpp"
-TEST(utils, test_bert_tokenizer) {
+TEST(tokenizer_operators, test_bert_tokenizer) {
auto ort_env = std::make_unique<Ort::Env>(ORT_LOGGING_LEVEL_WARNING, "Default");
std::vector<TestValue> inputs(1);
@@ -35,10 +36,9 @@ TEST(utils, test_bert_tokenizer) {
model_path = model_path.parent_path();
model_path /= "..";
model_path /= "data";
model_path /= "test_bert_tokenizer1.onnx";
model_path /= "test_bert_tokenizer.onnx";
TestInference(*ort_env, model_path.c_str(), inputs, outputs, GetLibraryPath());
inputs[0].name = "text";
inputs[0].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING;
inputs[0].dims = {1};
@@ -80,4 +80,193 @@ TEST(utils, test_bert_tokenizer) {
outputs[2].dims = {71};
outputs[2].values_int64 = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
TestInference(*ort_env, model_path.c_str(), inputs, outputs, GetLibraryPath());
inputs[0].name = "text";
inputs[0].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING;
inputs[0].dims = {1};
inputs[0].values_string = {""};
outputs[0].name = "input_ids";
outputs[0].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
outputs[0].dims = {2};
outputs[0].values_int64 = {101, 102};
outputs[1].name = "token_type_ids";
outputs[1].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
outputs[1].dims = {2};
outputs[1].values_int64 = {0, 0};
outputs[2].name = "attention_mask";
outputs[2].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
outputs[2].dims = {2};
outputs[2].values_int64 = {1, 1};
TestInference(*ort_env, model_path.c_str(), inputs, outputs, GetLibraryPath());
inputs[0].name = "text";
inputs[0].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING;
inputs[0].dims = {2};
inputs[0].values_string = {"M1 Pro and M1 Max scale the amazing M1 architecture to new heights — and for the first time, they bring a system on a chip (SoC) architecture to a pro notebook.",
"Both have more CPU cores, more GPU cores, and more unified memory than M1. Along with a powerful Neural Engine for supercharged machine learning and upgraded media engines with ProRes support, M1 Pro and M1 Max allow pros to do things they never could before."};
outputs[0].name = "input_ids";
outputs[0].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
outputs[0].dims = {99};
outputs[0].values_int64 = {101, 26528, 5096, 1105, 26528, 3405, 3418, 1103, 6929, 26528, 4220, 1106, 1207, 16291, 100, 1105, 1111, 1103, 1148, 1159, 117, 1152, 2498, 170, 1449, 1113, 170, 11451, 113, 1573, 1658, 114, 4220, 1106, 170, 5250, 17189, 119, 102, 2695, 1138, 1167, 18701, 4160, 1116, 117, 1167, 15175, 2591, 4160, 1116, 117, 1105, 1167, 13943, 2962, 1190, 26528, 119, 6364, 1114, 170, 3110, 151, 8816, 1348, 13451, 1111, 7688, 23131, 3395, 3776, 1105, 9554, 2394, 4540, 1114, 5096, 2069, 1279, 1619, 117, 26528, 5096, 1105, 26528, 3405, 2621, 5250, 1116, 1106, 1202, 1614, 1152, 1309, 1180, 1196, 119, 102};
outputs[1].name = "token_type_ids";
outputs[1].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
outputs[1].dims = {99};
outputs[1].values_int64 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
outputs[2].name = "attention_mask";
outputs[2].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
outputs[2].dims = {99};
outputs[2].values_int64 = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
TestInference(*ort_env, model_path.c_str(), inputs, outputs, GetLibraryPath());
inputs[0].name = "text";
inputs[0].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING;
inputs[0].dims = {2};
inputs[0].values_string = {"a", ""};
outputs[0].name = "input_ids";
outputs[0].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
outputs[0].dims = {4};
outputs[0].values_int64 = {101, 170, 102, 102};
outputs[1].name = "token_type_ids";
outputs[1].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
outputs[1].dims = {4};
outputs[1].values_int64 = {0, 0, 0, 1};
outputs[2].name = "attention_mask";
outputs[2].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
outputs[2].dims = {4};
outputs[2].values_int64 = {1, 1, 1, 1};
TestInference(*ort_env, model_path.c_str(), inputs, outputs, GetLibraryPath());
inputs[0].name = "text";
inputs[0].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING;
inputs[0].dims = {1};
inputs[0].values_string = {""};
outputs[0].name = "input_ids";
outputs[0].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
outputs[0].dims = {2};
outputs[0].values_int64 = {101, 102};
outputs[1].name = "token_type_ids";
outputs[1].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
outputs[1].dims = {2};
outputs[1].values_int64 = {0, 0};
outputs[2].name = "attention_mask";
outputs[2].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
outputs[2].dims = {2};
outputs[2].values_int64 = {1, 1};
TestInference(*ort_env, model_path.c_str(), inputs, outputs, GetLibraryPath());
inputs[0].name = "text";
inputs[0].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING;
inputs[0].dims = {2};
inputs[0].values_string = {"M1 Pro and M1 Max scale the amazing M1 architecture to new heights — and for the first time, they bring a system on a chip (SoC) architecture to a pro notebook.",
"Both have more CPU cores, more GPU cores, and more unified memory than M1. Along with a powerful Neural Engine for supercharged machine learning and upgraded media engines with ProRes support, M1 Pro and M1 Max allow pros to do things they never could before."};
outputs[0].name = "input_ids";
outputs[0].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
outputs[0].dims = {99};
outputs[0].values_int64 = {101, 26528, 5096, 1105, 26528, 3405, 3418, 1103, 6929, 26528, 4220, 1106, 1207, 16291, 100, 1105, 1111, 1103, 1148, 1159, 117, 1152, 2498, 170, 1449, 1113, 170, 11451, 113, 1573, 1658, 114, 4220, 1106, 170, 5250, 17189, 119, 102, 2695, 1138, 1167, 18701, 4160, 1116, 117, 1167, 15175, 2591, 4160, 1116, 117, 1105, 1167, 13943, 2962, 1190, 26528, 119, 6364, 1114, 170, 3110, 151, 8816, 1348, 13451, 1111, 7688, 23131, 3395, 3776, 1105, 9554, 2394, 4540, 1114, 5096, 2069, 1279, 1619, 117, 26528, 5096, 1105, 26528, 3405, 2621, 5250, 1116, 1106, 1202, 1614, 1152, 1309, 1180, 1196, 119, 102};
outputs[1].name = "token_type_ids";
outputs[1].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
outputs[1].dims = {99};
outputs[1].values_int64 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
outputs[2].name = "attention_mask";
outputs[2].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
outputs[2].dims = {99};
outputs[2].values_int64 = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
TestInference(*ort_env, model_path.c_str(), inputs, outputs, GetLibraryPath());
inputs[0].name = "text";
inputs[0].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING;
inputs[0].dims = {2};
inputs[0].values_string = {"", "a"};
outputs[0].name = "input_ids";
outputs[0].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
outputs[0].dims = {4};
outputs[0].values_int64 = {101, 102, 170, 102};
outputs[1].name = "token_type_ids";
outputs[1].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
outputs[1].dims = {4};
outputs[1].values_int64 = {0, 0, 1, 1};
outputs[2].name = "attention_mask";
outputs[2].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
outputs[2].dims = {4};
outputs[2].values_int64 = {1, 1, 1, 1};
TestInference(*ort_env, model_path.c_str(), inputs, outputs, GetLibraryPath());
}
TEST(tokenizer_operators, test_bert_tokenizer_scalar) {
auto ort_env = std::make_unique<Ort::Env>(ORT_LOGGING_LEVEL_WARNING, "Default");
std::vector<TestValue> inputs(1);
inputs[0].name = "text";
inputs[0].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING;
inputs[0].dims = {};
inputs[0].values_string = {"We look forward to welcoming you to our stores. Whether you shop in a store or shop online, our Specialists can help you buy the products you love."};
std::vector<TestValue> outputs(3);
outputs[0].name = "input_ids";
outputs[0].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
outputs[0].dims = {33};
outputs[0].values_int64 = {101, 1195, 1440, 1977, 1106, 20028, 1128, 1106, 1412, 4822, 119, 2480, 1128, 4130, 1107, 170, 2984, 1137, 4130, 3294, 117, 1412, 18137, 1169, 1494, 1128, 4417, 1103, 2982, 1128, 1567, 119, 102};
outputs[1].name = "token_type_ids";
outputs[1].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
outputs[1].dims = {33};
outputs[1].values_int64 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
outputs[2].name = "attention_mask";
outputs[2].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
outputs[2].dims = {33};
outputs[2].values_int64 = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
std::filesystem::path model_path = __FILE__;
model_path = model_path.parent_path();
model_path /= "..";
model_path /= "data";
model_path /= "test_bert_tokenizer_scalar.onnx";
TestInference(*ort_env, model_path.c_str(), inputs, outputs, GetLibraryPath());
// switch the global locale from the default "C" to the user-preferred system locale
std::locale::global(std::locale(""));
inputs[0].name = "text";
inputs[0].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING;
inputs[0].dims = {};
inputs[0].values_string = {
"再见我的爱\n"
"I wanna say goodbye\n"
"再见我的过去\n"
"I want a new life"};
outputs[0].name = "input_ids";
outputs[0].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
outputs[0].dims = {22};
outputs[0].values_int64 = {101, 100, 100, 100, 100, 100, 178, 16445, 1474, 12903, 100, 100, 100, 100, 100, 100, 178, 1328, 170, 1207, 1297, 102};
outputs[1].name = "token_type_ids";
outputs[1].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
outputs[1].dims = {22};
outputs[1].values_int64 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
outputs[2].name = "attention_mask";
outputs[2].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
outputs[2].dims = {22};
outputs[2].values_int64 = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
TestInference(*ort_env, model_path.c_str(), inputs, outputs, GetLibraryPath());
}
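
One caveat with `std::locale::global`: it changes the locale for the whole process, so tests that run after this one inherit the system locale. A sketch of an RAII guard that would scope the change (`ScopedLocale` is a hypothetical helper, not part of this commit):

```cpp
#include <locale>

// Hypothetical helper, not in this commit.
class ScopedLocale {
 public:
  explicit ScopedLocale(const std::locale& loc)
      : previous_(std::locale::global(loc)) {}  // global() returns the old locale
  ~ScopedLocale() { std::locale::global(previous_); }

 private:
  std::locale previous_;
};

// Usage inside a test body:
//   ScopedLocale guard{std::locale("")};  // system locale for this scope only
```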

View File

@@ -40,6 +40,14 @@ def _is_punctuation(char):
    return False

def _is_upper_case(char):
    cat = unicodedata.category(char)
    return cat == "Lu" and char.lower() != char

def find_expect_char_in_range(judge_fun, start, end):
    result = []
    for c in range(start, end):