Do not allow tokens to include the NUL character (U+0000).
This commit is contained in:
Родитель
db23e20f7a
Коммит
bd5fb57ef1
|
@ -90,6 +90,9 @@ bool TrainerInterface::IsValidSentencePiece(
|
||||||
if (*it == kUNKChar) { // UNK must not be included
|
if (*it == kUNKChar) { // UNK must not be included
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
if (*it == 0x0000) { // NULL is not allowed for Darts (TRIE).
|
||||||
|
return false;
|
||||||
|
}
|
||||||
// kUPPBoundaryChar is included when split_by_upp_for_training is true.
|
// kUPPBoundaryChar is included when split_by_upp_for_training is true.
|
||||||
if (*it == kUPPBoundaryChar) {
|
if (*it == kUPPBoundaryChar) {
|
||||||
return false;
|
return false;
|
||||||
|
|
|
@ -58,6 +58,9 @@ TEST(TrainerInterfaceTest, IsValidSentencePieceTest) {
|
||||||
EXPECT_FALSE(IsValid("$ABC"));
|
EXPECT_FALSE(IsValid("$ABC"));
|
||||||
EXPECT_FALSE(IsValid("ab\tbc")); // "\t" is UPP boundary.
|
EXPECT_FALSE(IsValid("ab\tbc")); // "\t" is UPP boundary.
|
||||||
EXPECT_FALSE(IsValid("ab cd"));
|
EXPECT_FALSE(IsValid("ab cd"));
|
||||||
|
EXPECT_FALSE(IsValid("ab\0"));
|
||||||
|
EXPECT_FALSE(IsValid("\0\0"));
|
||||||
|
EXPECT_FALSE(IsValid("\0"));
|
||||||
|
|
||||||
trainer_spec.set_split_by_whitespace(false);
|
trainer_spec.set_split_by_whitespace(false);
|
||||||
EXPECT_TRUE(IsValid(WS));
|
EXPECT_TRUE(IsValid(WS));
|
||||||
|
|
Загрузка…
Ссылка в новой задаче