Do not allow a token to include the NUL character (U+0000).

This commit is contained in:
Taku Kudo 2018-08-14 17:59:46 +09:00
Родитель db23e20f7a
Коммит bd5fb57ef1
2 изменённых файла: 6 добавлений и 0 удалений

Просмотреть файл

@@ -90,6 +90,9 @@ bool TrainerInterface::IsValidSentencePiece(
if (*it == kUNKChar) { // UNK must not be included if (*it == kUNKChar) { // UNK must not be included
return false; return false;
} }
if (*it == 0x0000) { // NULL is not allowed for Darts (TRIE).
return false;
}
// kUPPBoundaryChar is included when split_by_upp_for_training is true. // kUPPBoundaryChar is included when split_by_upp_for_training is true.
if (*it == kUPPBoundaryChar) { if (*it == kUPPBoundaryChar) {
return false; return false;

Просмотреть файл

@@ -58,6 +58,9 @@ TEST(TrainerInterfaceTest, IsValidSentencePieceTest) {
EXPECT_FALSE(IsValid("$ABC")); EXPECT_FALSE(IsValid("$ABC"));
EXPECT_FALSE(IsValid("ab\tbc")); // "\t" is UPP boundary. EXPECT_FALSE(IsValid("ab\tbc")); // "\t" is UPP boundary.
EXPECT_FALSE(IsValid("ab cd")); EXPECT_FALSE(IsValid("ab cd"));
EXPECT_FALSE(IsValid("ab\0"));
EXPECT_FALSE(IsValid("\0\0"));
EXPECT_FALSE(IsValid("\0"));
trainer_spec.set_split_by_whitespace(false); trainer_spec.set_split_by_whitespace(false);
EXPECT_TRUE(IsValid(WS)); EXPECT_TRUE(IsValid(WS));