Do not allow tokens to include the NULL character.
This commit is contained in:
Родитель
db23e20f7a
Коммит
bd5fb57ef1
|
@ -90,6 +90,9 @@ bool TrainerInterface::IsValidSentencePiece(
|
|||
if (*it == kUNKChar) { // UNK must not be included
|
||||
return false;
|
||||
}
|
||||
if (*it == 0x0000) { // NULL is not allowed for Darts (TRIE).
|
||||
return false;
|
||||
}
|
||||
// kUPPBoundaryChar is included when split_by_upp_for_training is true.
|
||||
if (*it == kUPPBoundaryChar) {
|
||||
return false;
|
||||
|
|
|
@ -58,6 +58,9 @@ TEST(TrainerInterfaceTest, IsValidSentencePieceTest) {
|
|||
EXPECT_FALSE(IsValid("$ABC"));
|
||||
EXPECT_FALSE(IsValid("ab\tbc")); // "\t" is UPP boundary.
|
||||
EXPECT_FALSE(IsValid("ab cd"));
|
||||
EXPECT_FALSE(IsValid("ab\0"));
|
||||
EXPECT_FALSE(IsValid("\0\0"));
|
||||
EXPECT_FALSE(IsValid("\0"));
|
||||
|
||||
trainer_spec.set_split_by_whitespace(false);
|
||||
EXPECT_TRUE(IsValid(WS));
|
||||
|
|
Загрузка…
Ссылка в новой задаче