Add native test for bert tokenizer (#173)
* add native test for bert tokenizer
* add python test
* fix unicode category

Co-authored-by: Ze Tao <zetao@microsoft.com>

Parent: 70aa18e14e
Commit: 448518534c

@@ -35,6 +35,7 @@ out/
 .setuptools-cmake-build/
 onnxruntime-*-*-*/
 temp_*.onnx
+test/data/*.py

 # Compiled Dynamic libraries
 *.so

@@ -20,7 +20,7 @@ class CustomOp:
     def get_inputs(cls): return None

     @classmethod
-    def get_output(cls): return None
+    def get_outputs(cls): return None

     @classmethod
     def serialize_attr(cls, attrs):

@@ -44,6 +44,53 @@ bool IsCJK(char32_t c) {
          || (c >= 0x2F800 && c <= 0x2FA1F);
 }

+// Generated by tools/generate_unicode_category_table.py
+bool IsSpace(char32_t c) {
+  if (c == 13||c == 32||c == 160||c == 8239||c == 8287||c == 12288) {
+    return true;
+  }
+
+  if ((c >= 9 && c <= 10)||(c >= 8192 && c <= 8202)) {
+    return true;
+  }
+
+  return false;
+}
+
+// Generated by tools/generate_unicode_category_table.py
+bool IsPunct(char32_t c) {
+  if (c == 161||c == 167||c == 171||c == 187||c == 191||c == 894||c == 903||c == 12336||c == 12349) {
+    return true;
+  }
+
+  if ((c >= 33 && c <= 47)||(c >= 58 && c <= 64)||(c >= 91 && c <= 96)||(c >= 123 && c <= 126)
+      ||(c >= 182 && c <= 183)||(c >= 8208 && c <= 8231)||(c >= 8240 && c <= 8259)
+      ||(c >= 8261 && c <= 8273)||(c >= 8275 && c <= 8286)||(c >= 12289 && c <= 12291)
+      ||(c >= 12296 && c <= 12305)||(c >= 12308 && c <= 12319)) {
+    return true;
+  }
+
+  return false;
+}
+
+// Generated by tools/generate_unicode_category_table.py
+bool IsControl(char32_t c) {
+  if (c == 173||c == 907||c == 909||c == 930||c == 11930||c == 173790||c == 195102) {
+    return true;
+  }
+
+  if ((c >= 0 && c <= 8)||(c >= 11 && c <= 12)||(c >= 14 && c <= 31)||(c >= 128 && c <= 159)
+      ||(c >= 888 && c <= 889)||(c >= 896 && c <= 899)||(c >= 8203 && c <= 8207)
+      ||(c >= 8234 && c <= 8238)||(c >= 8288 && c <= 8302)||(c >= 12020 && c <= 12030)
+      ||(c >= 40957 && c <= 40958)||(c >= 64110 && c <= 64111)||(c >= 64218 && c <= 64254)
+      ||(c >= 177973 && c <= 177982)||(c >= 178206 && c <= 178207)||(c >= 183970 && c <= 183982)) {
+    return true;
+  }
+
+  return false;
+}
+
 bool IsAccent(char32_t c)
 {
   // only support part of accent

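The predicates above are plain table lookups generated from Python's unicodedata categories. As a quick illustration (not part of the commit), here is a minimal sketch of their expected behavior; the forward declarations stand in for the project's string-utility header, and the program is assumed to link against the library that defines them:

    #include <cassert>

    // Forward declarations in place of the real header, whose path is not shown in this diff.
    bool IsSpace(char32_t c);
    bool IsPunct(char32_t c);
    bool IsControl(char32_t c);

    int main() {
      assert(IsSpace(U' '));      // 32 is listed as a single whitespace code point
      assert(IsSpace(U'\t'));     // 9 falls in the 9..10 whitespace range
      assert(!IsControl(U'\t'));  // tab is treated as whitespace, not as a control char
      assert(IsPunct(U','));      // 44 falls in the 33..47 punctuation range
      assert(IsPunct(0x2019));    // 8217 falls in the 8208..8231 range
      assert(IsControl(0x01));    // 1 falls in the 0..8 control range
      return 0;
    }
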
@@ -59,6 +59,12 @@ bool IsCJK(char32_t c);

 bool IsAccent(char32_t c);

+bool IsSpace(char32_t c);
+
+bool IsPunct(char32_t c);
+
+bool IsControl(char32_t c);
+
 char32_t StripAccent(char32_t c);

 uint64_t Hash64(const char* data, size_t n, uint64_t seed);

@@ -54,22 +54,20 @@ std::vector<ustring> BasicTokenizer::Tokenize(ustring text) {
     }

     // 0x2019 unicode is not punctuation in some Linux platform,
-    // to be consistent, take it as punctatuation always.
-    if (tokenize_punctuation_ && (::iswpunct(c) || c == wint_t(0x2019))) {
+    // to be consistent, take it as punctuation.
+    if (tokenize_punctuation_ && IsPunct(c)) {
       push_current_token_and_clear();
       push_single_char_and_clear(c);
       continue;
     }

     // split by space
-    if (::iswspace(c)) {
+    if (IsSpace(c)) {
       push_current_token_and_clear();
       continue;
     }

-    // iscntrl will judge \t\f\n\r as control char
-    // but it has been filter by isspace(c)
-    if (remove_control_chars_ && ::iswcntrl(c)) {
+    if (remove_control_chars_ && IsControl(c)) {
       continue;
     }

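The switch from ::iswpunct / ::iswspace / ::iswcntrl to the generated predicates removes the locale dependence that the 0x2019 comment refers to. A minimal sketch (not part of the commit) of that dependence; the locale name is only an example and may not be installed on a given system:

    #include <clocale>
    #include <cstdio>
    #include <cwctype>

    int main() {
      // In the default "C" locale many platforms report U+2019 as non-punctuation.
      std::printf("C locale:     iswpunct(U+2019) = %d\n", ::iswpunct(0x2019) != 0);
      // After switching locales the answer can change, which is why the tokenizer
      // now uses the fixed, table-based IsPunct instead.
      std::setlocale(LC_CTYPE, "en_US.UTF-8");
      std::printf("UTF-8 locale: iswpunct(U+2019) = %d\n", ::iswpunct(0x2019) != 0);
      return 0;
    }
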
Binary file not shown.
Binary file not shown.

@@ -0,0 +1,83 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include <filesystem>
+#include "gtest/gtest.h"
+#include "ocos.h"
+#include "test_kernel.hpp"
+
+TEST(utils, test_bert_tokenizer) {
+  auto ort_env = std::make_unique<Ort::Env>(ORT_LOGGING_LEVEL_WARNING, "Default");
+
+  std::vector<TestValue> inputs(1);
+  inputs[0].name = "text";
+  inputs[0].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING;
+  inputs[0].dims = {1};
+  inputs[0].values_string = {"We look forward to welcoming you to our stores. Whether you shop in a store or shop online, our Specialists can help you buy the products you love."};
+
+  std::vector<TestValue> outputs(3);
+  outputs[0].name = "input_ids";
+  outputs[0].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
+  outputs[0].dims = {34};
+  outputs[0].values_int64 = {101, 1284, 1440, 1977, 1106, 20028, 1128, 1106, 1412, 4822, 119, 13197, 1128, 4130, 1107, 170, 2984, 1137, 4130, 3294, 117, 1412, 25607, 1116, 1169, 1494, 1128, 4417, 1103, 2982, 1128, 1567, 119, 102};
+
+  outputs[1].name = "token_type_ids";
+  outputs[1].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
+  outputs[1].dims = {34};
+  outputs[1].values_int64 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+  outputs[2].name = "attention_mask";
+  outputs[2].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
+  outputs[2].dims = {34};
+  outputs[2].values_int64 = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+
+  std::filesystem::path model_path = __FILE__;
+  model_path = model_path.parent_path();
+  model_path /= "..";
+  model_path /= "data";
+  model_path /= "test_bert_tokenizer1.onnx";
+  TestInference(*ort_env, model_path.c_str(), inputs, outputs, GetLibraryPath());
+
+  inputs[0].name = "text";
+  inputs[0].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING;
+  inputs[0].dims = {1};
+  inputs[0].values_string = {"本想好好的伤感 想放任 但是没泪痕"};
+
+  outputs[0].name = "input_ids";
+  outputs[0].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
+  outputs[0].dims = {17};
+  outputs[0].values_int64 = {101, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 102};
+
+  outputs[1].name = "token_type_ids";
+  outputs[1].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
+  outputs[1].dims = {17};
+  outputs[1].values_int64 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+  outputs[2].name = "attention_mask";
+  outputs[2].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
+  outputs[2].dims = {17};
+  outputs[2].values_int64 = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+  TestInference(*ort_env, model_path.c_str(), inputs, outputs, GetLibraryPath());
+
+  inputs[0].name = "text";
+  inputs[0].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING;
+  inputs[0].dims = {1};
+  inputs[0].values_string = {"ÀÁÂÃÄÅÇÈÉÊËÌÍÎÑÒÓÔÕÖÚÜ\t䗓𨖷虴𨀐辘𧄋脟𩑢𡗶镇伢𧎼䪱轚榶𢑌㺽𤨡!#$%&(Tom@microsoft.com)*+,-./:;<=>?@[\\]^_`{|}~"};
+
+  outputs[0].name = "input_ids";
+  outputs[0].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
+  outputs[0].dims = {71};
+  outputs[0].values_int64 = {101, 13807, 11189, 8101, 27073, 27073, 12738, 11607, 2346, 2346, 2346, 2346, 2346, 2591, 2591, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 106, 108, 109, 110, 111, 113, 2545, 137, 17599, 7301, 4964, 119, 3254, 114, 115, 116, 117, 118, 119, 120, 131, 132, 133, 134, 135, 136, 137, 164, 165, 166, 167, 168, 169, 196, 197, 198, 199, 102};
+
+  outputs[1].name = "token_type_ids";
+  outputs[1].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
+  outputs[1].dims = {71};
+  outputs[1].values_int64 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+  outputs[2].name = "attention_mask";
+  outputs[2].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
+  outputs[2].dims = {71};
+  outputs[2].values_int64 = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+  TestInference(*ort_env, model_path.c_str(), inputs, outputs, GetLibraryPath());
+}

@@ -4,24 +4,27 @@ import numpy as np
 import transformers
 from onnxruntime_extensions import PyOrtFunction, BertTokenizer

-bert_cased_tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-cased')
-bert_uncased_tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

 def _get_test_data_file(*sub_dirs):
     test_dir = Path(__file__).parent
     return str(test_dir.joinpath(*sub_dirs))


+bert_cased_tokenizer = transformers.BertTokenizer(_get_test_data_file('data', 'bert_basic_cased_vocab.txt'), False,
+                                                  strip_accents=True)
+

 def _run_basic_case(input, vocab_path):
-    t2stc = PyOrtFunction.from_customop(BertTokenizer, vocab_file=vocab_path, do_lower_case=0)
+    t2stc = PyOrtFunction.from_customop(BertTokenizer, vocab_file=vocab_path, do_lower_case=0, strip_accents=1)
     result = t2stc([input])
     expect_result = bert_cased_tokenizer.encode_plus(input)
     np.testing.assert_array_equal(result[0], expect_result['input_ids'])
     np.testing.assert_array_equal(result[1], expect_result['token_type_ids'])
     np.testing.assert_array_equal(result[2], expect_result['attention_mask'])


 def _run_combined_case(input, vocab_path):
-    t2stc = PyOrtFunction.from_customop(BertTokenizer, vocab_file=vocab_path, do_lower_case=0)
+    t2stc = PyOrtFunction.from_customop(BertTokenizer, vocab_file=vocab_path, do_lower_case=0, strip_accents=1)
     result = t2stc(input)
     expect_result = bert_cased_tokenizer.encode_plus(input[0], input[1])
     np.testing.assert_array_equal(result[0], expect_result['input_ids'])

@@ -34,17 +37,19 @@ class TestBertTokenizer(unittest.TestCase):
     def test_text_to_case1(self):
         _run_basic_case(input="Input 'text' must not be empty.",
                         vocab_path=_get_test_data_file('data', 'bert_basic_cased_vocab.txt'))
+        _run_basic_case(
+            input="ÀÁÂÃÄÅÇÈÉÊËÌÍÎÑÒÓÔÕÖÚÜ\t䗓𨖷虴𨀐辘𧄋脟𩑢𡗶镇伢𧎼䪱轚榶𢑌㺽𤨡!#$%&(Tom@microsoft.com)*+,-./:;<=>?@[\\]^_`{|}~",
+            vocab_path=_get_test_data_file('data', 'bert_basic_cased_vocab.txt'))
         _run_basic_case(input="网易云音乐", vocab_path=_get_test_data_file('data', 'bert_basic_cased_vocab.txt'))
+        _run_basic_case(input="本想好好的伤感 想放任 但是没泪痕", vocab_path=_get_test_data_file('data', 'bert_basic_cased_vocab.txt'))
         _run_basic_case(input="网 易 云 音 乐",
                         vocab_path=_get_test_data_file('data', 'bert_basic_cased_vocab.txt'))
         _run_basic_case(input="cat is playing toys",
                         vocab_path=_get_test_data_file('data', 'bert_basic_cased_vocab.txt'))
         _run_basic_case(input="cat isnot playing toyssss",
                         vocab_path=_get_test_data_file('data', 'bert_basic_cased_vocab.txt'))
-        _run_basic_case(input="cat isnot playing toyssss",
+        _run_combined_case(["网 易 云 音 乐", "cat isnot playing toyssss"],
                         vocab_path=_get_test_data_file('data', 'bert_basic_cased_vocab.txt'))
-        _run_combined_case(["网 易 云 音 乐", "cat isnot playing toyssss"], vocab_path=_get_test_data_file('data', 'bert_basic_cased_vocab.txt'))


 if __name__ == "__main__":

@@ -0,0 +1,144 @@
+import unicodedata
+
+
+def _is_whitespace(char):
+    """Checks whether `char` is a whitespace character."""
+    # \t, \n, and \r are technically control characters but we treat them
+    # as whitespace since they are generally considered as such.
+    if char == " " or char == "\t" or char == "\n" or char == "\r":
+        return True
+    cat = unicodedata.category(char)
+    if cat == "Zs":
+        return True
+    return False
+
+
+def _is_control(char):
+    """Checks whether `char` is a control character."""
+    # These are technically control characters but we count them as whitespace
+    # characters.
+    if char == "\t" or char == "\n" or char == "\r":
+        return False
+    cat = unicodedata.category(char)
+    if cat.startswith("C"):
+        return True
+    return False
+
+
+def _is_punctuation(char):
+    """Checks whether `char` is a punctuation character."""
+    cp = ord(char)
+    # We treat all non-letter/number ASCII as punctuation.
+    # Characters such as "^", "$", and "`" are not in the Unicode
+    # Punctuation class but we treat them as punctuation anyways, for
+    # consistency.
+    if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
+        return True
+    cat = unicodedata.category(char)
+    if cat.startswith("P"):
+        return True
+    return False
+
+
+def find_expect_char_in_range(judge_fun, start, end):
+    result = []
+    for c in range(start, end):
+        if judge_fun(chr(c)):
+            result.append(c)
+    return result
+
+
+def find_ranges(nums):
+    nums = sorted(set(nums))
+    gaps = [[s, e] for s, e in zip(nums, nums[1:]) if s + 1 < e]
+    edges = iter(nums[:1] + sum(gaps, []) + nums[-1:])
+    return list(zip(edges, edges))
+
+
+def find_expect_category(category_func):
+    expect_category_set = []
+
+    # ASCII
+    expect_category_set += find_expect_char_in_range(category_func, 0, 0x7F)
+
+    # C1 Controls and Latin-1 Supplement
+    expect_category_set += find_expect_char_in_range(category_func, 0x80, 0xFF)
+
+    # Latin Extended-A
+    expect_category_set += find_expect_char_in_range(category_func, 0x100, 0x17F)
+
+    # Latin Extended-B
+    expect_category_set += find_expect_char_in_range(category_func, 0x180, 0x24F)
+
+    # IPA Extensions
+    expect_category_set += find_expect_char_in_range(category_func, 0x250, 0x2AF)
+
+    # Spacing Modifier Letters
+    expect_category_set += find_expect_char_in_range(category_func, 0x2B0, 0x2FF)
+
+    # Combining Diacritical Marks
+    expect_category_set += find_expect_char_in_range(category_func, 0x300, 0x36F)
+
+    # Greek/Coptic
+    expect_category_set += find_expect_char_in_range(category_func, 0x370, 0x3FF)
+
+    # Cyrillic and Cyrillic Supplement
+    expect_category_set += find_expect_char_in_range(category_func, 0x400, 0x52F)
+
+    # General Punctuation
+    expect_category_set += find_expect_char_in_range(category_func, 0x2000, 0x206F)
+
+    # CJK Radicals Supplement
+    expect_category_set += find_expect_char_in_range(category_func, 0x2E80, 0x2EFF)
+
+    # CJK Symbols and Punctuation
+    expect_category_set += find_expect_char_in_range(category_func, 0x3000, 0x303F)
+
+    # CJK
+    expect_category_set += find_expect_char_in_range(category_func, 0x4E00, 0x9FFF)
+    expect_category_set += find_expect_char_in_range(category_func, 0x3400, 0x4DBF)
+    expect_category_set += find_expect_char_in_range(category_func, 0x20000, 0x2A6DF)
+    expect_category_set += find_expect_char_in_range(category_func, 0x2A700, 0x2B73F)
+    expect_category_set += find_expect_char_in_range(category_func, 0x2B740, 0x2CEAF)
+    expect_category_set += find_expect_char_in_range(category_func, 0xF900, 0xFAFF)
+    expect_category_set += find_expect_char_in_range(category_func, 0x2F800, 0x2FA1F)
+
+    return find_ranges(expect_category_set)
+
+
+def print_range(ranges):
+    single_set = []
+    pair_set = []
+    for r in ranges:
+        start, end = r
+        if start == end:
+            single_set.append(start)
+        else:
+            pair_set.append(r)
+
+    output = "if ("
+    for i in range(len(single_set)):
+        if i != 0:
+            output += "||"
+        output += f"c == {single_set[i]}"
+    output += ") {\n return true;\n}\n\n"
+
+    output += "if ("
+    for i in range(len(pair_set)):
+        if i != 0:
+            output += "||"
+        start, end = pair_set[i]
+        output += f"(c >= {start} && c <= {end})"
+    output += ") {\n return true;\n}\n\nreturn false;\n"
+    print(output)
+
+
+print("\nis_whitespace:")
+print_range(find_expect_category(_is_whitespace))
+
+print("\nis_punctuation:")
+print_range(find_expect_category(_is_punctuation))
+
+print("\nis_control:")
+print_range(find_expect_category(_is_control))