Ignore inputs missing from vocab in CLIPTokenizer (#462)

* Ignore unknown inputs in CLIPTokenizer
* add whitespace clean and unknown token handling
* fix const issue
* small updates
* add single whitespace test case

---------

Co-authored-by: Sayan Shaw <sayanshaw@microsoft.com>
Parent: 507358545d
Commit: 6aaf2920bf
@@ -15,6 +15,7 @@
 #include "string_tensor.h"
 
 #include <iostream>
 #include <utility>
 
 // Note: the following logic comes from CPython: unicodetype_db.h (_PyUnicode_IsWhitespace)
 inline bool IsUnicodeSpace(char32_t ch) {
@@ -54,7 +55,29 @@ inline bool IsUnicodeSpace(char32_t ch) {
 }
 
 inline bool IsEmptyUString(const ustring& str) {
-  return std::all_of(str.begin(), str.end(), [](char32_t ch) { return IsUnicodeSpace(ch); });
+  if (str == ustring(" ")) {
+    return false;
+  } else {
+    return std::all_of(str.begin(), str.end(), [](char32_t ch) { return IsUnicodeSpace(ch); });
+  }
 }
+
+inline bool BothSpaces(char32_t lhs, char32_t rhs) {
+  return (lhs == rhs) && IsUnicodeSpace(lhs);
+}
+
+inline ustring ReplaceString(ustring str, const ustring& search, const ustring& replace) {
+  size_t pos = 0;
+  while ((pos = str.find(search, pos)) != ustring::npos) {
+    str.replace(pos, search.length(), replace);
+    pos += replace.length();
+  }
+  return str;
+}
+
+inline void WhiteSpaceClean(ustring& str) {
+  str = ReplaceString(str, ustring("\n"), ustring(" "));
+  str.erase(std::unique(str.begin(), str.end(), BothSpaces), str.end());
+}
 
 class SpecialTokenMap {
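Aside: in Python terms, the new helpers behave roughly like the sketch below. The names mirror the C++ ones, and str.isspace is only an approximation of the CPython _PyUnicode_IsWhitespace table the header comment points at.

    def is_unicode_space(ch: str) -> bool:
        # rough stand-in for CPython's _PyUnicode_IsWhitespace table
        return ch.isspace()

    def whitespace_clean(text: str) -> str:
        # "\n" -> " ", then collapse runs of the *same* whitespace character,
        # mirroring ReplaceString followed by std::unique with BothSpaces
        text = text.replace("\n", " ")
        out = []
        for ch in text:
            if out and ch == out[-1] and is_unicode_space(ch):
                continue
            out.append(ch)
        return "".join(out)

    def is_empty_ustring(text: str) -> bool:
        # a lone space is deliberately *not* empty, so " " still gets tokenized
        if text == " ":
            return False
        return all(is_unicode_space(ch) for ch in text)

    print(repr(whitespace_clean("lower  newer \n")))        # 'lower newer '
    print(is_empty_ustring(" "), is_empty_ustring("\t\t"))  # False True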
@@ -269,9 +292,14 @@ class VocabData {
     return special_tokens_.SplitBySpecialTokens(input);
   }
 
+  // Returns token if key was found in vocab, and unk_id_ otherwise
   int GetEncoding(const std::string& key) {
     auto it = vocab_map_.find(key);
-    return it->second;
+    if (it != end(vocab_map_)) {
+      return it->second;
+    } else {
+      return unk_id_;
+    }
   }
 
   size_t VocabSize() const { return vocab_map_.size(); }
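The old GetEncoding body dereferenced the iterator unconditionally, which is undefined behavior whenever the key is missing from vocab_map_; the fix is the standard lookup-with-default pattern. In Python it collapses to dict.get (the toy vocab and ids below are made up for illustration):

    def get_encoding(vocab_map: dict, key: str, unk_id: int) -> int:
        # unknown inputs map to the unk id instead of touching a missing entry
        return vocab_map.get(key, unk_id)

    print(get_encoding({"hello</w>": 0, "world</w>": 1}, "bazinga</w>", 99))  # 99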
@@ -34,6 +34,8 @@ KernelClipBpeTokenizer::KernelClipBpeTokenizer(const OrtApi& api, const OrtKerne
 std::vector<int64_t> KernelClipBpeTokenizer::Tokenize(ustring& input, int64_t max_length, std::list<OffsetMappingType>& offset_map) {
   std::vector<int64_t> res;
 
+  WhiteSpaceClean(input);
+
   if (IsEmptyUString(input)) {
     return res;
   }
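Note the ordering: WhiteSpaceClean runs before the emptiness check, so an input of "\n" first becomes " ", which IsEmptyUString deliberately reports as non-empty; a lone newline therefore still produces tokens rather than an empty result. A self-contained sketch of that control flow, with encode standing in for the real BPE stage:

    import re

    def tokenize(text: str, encode=lambda t: ["<ids for %r>" % t]):
        text = re.sub(r"(\s)\1+", r"\1", text.replace("\n", " "))  # WhiteSpaceClean
        if text != " " and all(c.isspace() for c in text):         # IsEmptyUString
            return []
        return encode(text)

    print(tokenize("\n"))    # encodes " " instead of returning []
    print(tokenize("\t\t"))  # []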
@@ -1,4 +1,4 @@
-import unittest
+import unittest
 import numpy as np
 import numpy.lib.recfunctions as nlr
 import onnxruntime as _ort
@@ -112,6 +112,13 @@ class TestCLIPTokenizer(unittest.TestCase):
         self._run_tokenizer(["9 8 7 6 5 4 3 2 1 0"])
         self._run_tokenizer(["9 8 7 - 6 5 4 - 3 2 1 0"])
         self._run_tokenizer(["One Microsoft Way, Redmond, WA"])
+        self._run_tokenizer(["I can feel the magic, \n can you?"])
+        self._run_tokenizer(["\n Hey Cortana"])
+        self._run_tokenizer(["lower newer \n"])
+        self._run_tokenizer(["\n"])
+        self._run_tokenizer(["Testing multiple sequences of spaces"])
+        self._run_tokenizer([" in the beginning and the end. "])
+        self._run_tokenizer([" "])
 
     def test_converter(self):
         fn_tokenizer = PyOrtFunction.from_customop("CLIPTokenizer",
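For spot-checking the new whitespace cases by hand, the HuggingFace CLIPTokenizer, which applies a similar whitespace normalization, can serve as a reference, much as _run_tokenizer presumably does internally; the checkpoint name below is an assumption, not something this diff specifies:

    from transformers import CLIPTokenizer

    tok = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")  # assumed checkpoint
    for text in ["\n", " ", "lower newer \n", " in the beginning and the end. "]:
        print(repr(text), "->", tok(text)["input_ids"])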