From bd5fb57ef1bee00abcc0897dd194dfb3302856c9 Mon Sep 17 00:00:00 2001 From: Taku Kudo Date: Tue, 14 Aug 2018 17:59:46 +0900 Subject: [PATCH] Do not allow the token to include NULL character. --- src/trainer_interface.cc | 3 +++ src/trainer_interface_test.cc | 3 +++ 2 files changed, 6 insertions(+) diff --git a/src/trainer_interface.cc b/src/trainer_interface.cc index aa3a0e0..b222024 100644 --- a/src/trainer_interface.cc +++ b/src/trainer_interface.cc @@ -90,6 +90,9 @@ bool TrainerInterface::IsValidSentencePiece( if (*it == kUNKChar) { // UNK must not be included return false; } + if (*it == 0x0000) { // NULL is not allowed for Darts (TRIE). + return false; + } // kUPPBoundaryChar is included when split_by_upp_for_training is true. if (*it == kUPPBoundaryChar) { return false; diff --git a/src/trainer_interface_test.cc b/src/trainer_interface_test.cc index 03e038f..78ac772 100644 --- a/src/trainer_interface_test.cc +++ b/src/trainer_interface_test.cc @@ -58,6 +58,9 @@ TEST(TrainerInterfaceTest, IsValidSentencePieceTest) { EXPECT_FALSE(IsValid("$ABC")); EXPECT_FALSE(IsValid("ab\tbc")); // "\t" is UPP boundary. EXPECT_FALSE(IsValid("ab cd")); + EXPECT_FALSE(IsValid("ab\0")); + EXPECT_FALSE(IsValid("\0\0")); + EXPECT_FALSE(IsValid("\0")); trainer_spec.set_split_by_whitespace(false); EXPECT_TRUE(IsValid(WS));