Add tests that use a Japanese model to check Unicode encoding.
This commit is contained in:
Родитель
75695aa1e4
Коммит
42e6b4d4e3
|
@ -0,0 +1,2 @@
|
|||
/*.so
|
||||
/build
|
|
@ -10,6 +10,8 @@ class TestSentencepieceProcessor(unittest.TestCase):
|
|||
def setUp(self):
    # Build one processor per model file and store it on the test case:
    # ``sp_`` for test_model.model, ``jasp_`` for the Japanese model.
    # Each processor is attached before its Load() result is asserted.
    model_files = (
        ('sp_', 'test/test_model.model'),
        ('jasp_', 'test/test_ja_model.model'),
    )
    for attr_name, model_path in model_files:
        processor = spm.SentencePieceProcessor()
        setattr(self, attr_name, processor)
        self.assertTrue(processor.Load(model_path))
|
||||
|
||||
def test_load(self):
|
||||
self.assertEqual(1000, self.sp_.GetPieceSize())
|
||||
|
@ -33,6 +35,28 @@ class TestSentencepieceProcessor(unittest.TestCase):
|
|||
self.assertEqual(text, self.sp_.DecodePieces(pieces2))
|
||||
self.assertEqual(text, self.sp_.DecodeIds(ids))
|
||||
|
||||
def test_ja_load(self):
    # Checks the Japanese model's piece count and its id <-> piece mapping.
    ja = self.jasp_
    self.assertEqual(8000, ja.GetPieceSize())

    # The first three ids are expected to be these symbols, in this order.
    reserved = ('<unk>', '<s>', '</s>')
    for expected_id, symbol in enumerate(reserved):
        self.assertEqual(expected_id, ja.PieceToId(symbol))
    for piece_id, expected_symbol in enumerate(reserved):
        self.assertEqual(expected_symbol, ja.IdToPiece(piece_id))

    # Every id must map to a piece that maps back to the same id.
    for piece_id in range(ja.GetPieceSize()):
        recovered = ja.IdToPiece(piece_id)
        self.assertEqual(piece_id, ja.PieceToId(recovered))
|
||||
|
||||
def test_ja_roundtrip(self):
    # Encodes a Japanese sentence three ways and checks that each decode
    # call returns the original text.
    sentence = '清水寺は京都にある。'
    ja = self.jasp_

    id_seq = ja.EncodeAsIds(sentence)
    explicit_pieces = ja.EncodeAsPieces(sentence)
    default_pieces = ja.Encode(sentence)

    # Encode() must give the same result as EncodeAsPieces().
    self.assertEqual(explicit_pieces, default_pieces)

    self.assertEqual(sentence, ja.Decode(explicit_pieces))
    self.assertEqual(sentence, ja.DecodePieces(default_pieces))
    self.assertEqual(sentence, ja.DecodeIds(id_seq))
|
||||
|
||||
def suite():
|
||||
suite = unittest.TestSuite()
|
||||
suite.addTests(unittest.makeSuite(TestSentencepieceProcessor))
|
||||
|
|
Двоичный файл не отображается.
Загрузка…
Ссылка в новой задаче