Ignore inputs missing from vocab in CLIPTokenizer (#462)

* Ignore unknown inputs in CLIPTokenizer
* add whitespace clean and unknown token handling
* fix const issue
* small updates
* add single whitespace test case

---------

Co-authored-by: Sayan Shaw <sayanshaw@microsoft.com>
2023-06-01 19:40:43 -07:00 · 2023-06-01 19:40:43 -07:00 · 6aaf2920bf
--- a/operators/tokenizer/bpe_tokenizer.hpp
+++ b/operators/tokenizer/bpe_tokenizer.hpp
@ -15,6 +15,7 @@
 #include "string_tensor.h"

 #include <iostream>
+#include <utility>

 // Note: the following logic comes from CPython: unicodetype_db.h (_PyUnicode_IsWhitespace)
 inline bool IsUnicodeSpace(char32_t ch) {
@ -54,7 +55,29 @@ inline bool IsUnicodeSpace(char32_t ch) {
 }

 inline bool IsEmptyUString(const ustring& str) {
-  return std::all_of(str.begin(), str.end(), [](char32_t ch) { return IsUnicodeSpace(ch); });
+  // Exactly one ASCII space counts as non-empty; every other all-whitespace string, "" included, is empty.
+  if (str == ustring(" ")) {
+    return false;
+  }
+  return std::all_of(str.begin(), str.end(), [](char32_t c) { return IsUnicodeSpace(c); });
+}
+
+inline bool BothSpaces(char32_t lhs, char32_t rhs) {  // true only for the same whitespace character twice
+    return IsUnicodeSpace(lhs) && (lhs == rhs);
+}
+
+// Replaces every occurrence of `search` in `str` with `replace`; an empty `search` returns `str` unchanged.
+inline ustring ReplaceString(ustring str, const ustring& search, const ustring& replace) {
+    if (search.empty()) return str;  // find("") matches at every position, so the loop would never end
+    for (size_t pos = 0; (pos = str.find(search, pos)) != ustring::npos; pos += replace.length()) {
+        str.replace(pos, search.length(), replace);  // resume after the inserted text, never inside it
+    }
+    return str;
+}
+
+inline void WhiteSpaceClean(ustring& str) {  // in place: '\n' becomes ' ', then repeated whitespace is squeezed
+    std::replace(str.begin(), str.end(), U'\n', U' ');  // one-for-one swap; no temporary strings or copies
+    str.erase(std::unique(str.begin(), str.end(), BothSpaces), str.end());  // keep the first of each run
 }

 class SpecialTokenMap {
@ -269,9 +292,14 @@ class VocabData {
    return special_tokens_.SplitBySpecialTokens(input);
  }

+  // Looks up key in vocab_map_ and returns its mapped value, or unk_id_ when the key is absent
  int GetEncoding(const std::string& key) {
    auto it = vocab_map_.find(key);
-    return it->second;
+    // find() yields end() for an unknown key; that iterator must not be dereferenced.
+    if (it == vocab_map_.end()) {
+      return unk_id_;
+    }
+    return it->second;
  }

  size_t VocabSize() const { return vocab_map_.size(); }
--- a/operators/tokenizer/clip_tokenizer.cc
+++ b/operators/tokenizer/clip_tokenizer.cc
@ -34,6 +34,8 @@ KernelClipBpeTokenizer::KernelClipBpeTokenizer(const OrtApi& api, const OrtKerne
 std::vector<int64_t> KernelClipBpeTokenizer::Tokenize(ustring& input, int64_t max_length, std::list<OffsetMappingType>& offset_map) {
  std::vector<int64_t> res;

+  WhiteSpaceClean(input);
+
  if (IsEmptyUString(input)) {
    return res;
  }
--- a/test/test_cliptok.py
+++ b/test/test_cliptok.py
@ -1,4 +1,4 @@
-import unittest
+import unittest
 import numpy as np
 import numpy.lib.recfunctions as nlr
 import onnxruntime as _ort
@ -112,6 +112,13 @@ class TestCLIPTokenizer(unittest.TestCase):
        self._run_tokenizer(["9 8 7 6 5 4 3 2 1 0"])
        self._run_tokenizer(["9 8 7 - 6 5 4 - 3 2 1 0"])
        self._run_tokenizer(["One Microsoft Way, Redmond, WA"])
+        self._run_tokenizer(["I can feel the magic, \n can you?"])
+        self._run_tokenizer(["\n Hey Cortana"])
+        self._run_tokenizer(["lower newer \n"])
+        self._run_tokenizer(["\n"])
+        self._run_tokenizer(["Testing multiple      sequences       of spaces"])
+        self._run_tokenizer(["      in the beginning and the end.      "])
+        self._run_tokenizer([" "])

    def test_converter(self):
        fn_tokenizer = PyOrtFunction.from_customop("CLIPTokenizer",