From 017ba49151280c3c8ba7bf00a19978647403a12b Mon Sep 17 00:00:00 2001 From: Taku Kudo Date: Wed, 28 Feb 2018 20:56:07 +0900 Subject: [PATCH] Updated document --- README.md | 19 ++++++++++++++++++- python/README.md | 41 ++++++++++++++++++++++++++++++++++++++--- src/spm_encode_main.cc | 8 ++++---- 3 files changed, 60 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 3ffc50c..c41b21b 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,18 @@ special symbol. Tokenized sequences do not preserve the necessary information to * (en) Hello world. → [Hello] [World] [.] \(A space between Hello and World\) * (ja) こんにちは世界。 → [こんにちは] [世界] [。] \(No space between こんにちは and 世界\) -## Required packages +## Python module +SentencePiece provides a Python wrapper that supports both SentencePiece training and segmentation. +For the Linux (x64) environment, you can install the Python binary package of SentencePiece with: + +``` +% pip install sentencepiece +``` + +For more details, see [Python module](python/README.md). + + +## Required packages (C++) The following tools and libraries are required to build SentencePiece: * GNU autotools (autoconf automake libtool) @@ -131,6 +142,12 @@ Use `--extra_options` flag to insert the BOS/EOS markers or reverse the input se % spm_encode --extra_options=reverse:bos:eos (reverse input and add <s> and </s>) ``` +SentencePiece supports nbest segmentation and segmentation sampling with `--output_format=(nbest|sample)_(piece|id)` flags. +``` +% spm_encode --model=<model_file> --output_format=sample_piece --nbest_size=-1 --alpha=0.5 < input > output +% spm_encode --model=<model_file> --output_format=nbest_id --nbest_size=10 < input > output +``` + ## Decode sentence pieces/ids into raw text ``` % spm_decode --model=<model_file> --input_format=piece < input > output diff --git a/python/README.md b/python/README.md index af52d88..766f61b 100644 --- a/python/README.md +++ b/python/README.md @@ -2,19 +2,18 @@ Python wrapper for SentencePiece with SWIG. 
This module wraps sentencepiece::SentencePieceProcessor class with the following modifications: * Encode and Decode methods are re-defined as EncodeAsIds, EncodeAsPieces, DecodeIds and DecodePieces respectively. +* Support model training with SentencePieceTrainer.Train method. * SentencePieceText proto is not supported. * Added __len__ and __getitem__ methods. len(obj) and obj[key] returns vocab size and vocab id respectively. ## Build and Install SentencePiece -You need to install SentencePiece before installing this python wrapper. - You can simply use pip command to install SentencePiece python module. ``` % pip install sentencepiece ``` -To install the wrapper manually, try the following commands: +To build and install the wrapper manually, you need to install SentencePiece C++ in advance, and then try the following commands: ``` % python setup.py build % sudo python setup.py install @@ -27,6 +26,7 @@ If you don’t have write permission to the global site-packages directory or do ## Usage +### Segmentation ``` % python >>> import sentencepiece as spm @@ -39,6 +39,21 @@ True [284, 47, 11, 4, 15, 400] >>> sp.DecodePieces(['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'est']) 'This is a test' +>>> sp.NBestEncode("This is a test", 5) +[['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'est'], ['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 'te', 'st'], ['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 'te', 's', 't'], ['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 'st'], ['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'es', 't']] +>>> for x in range(10): +... sp.SampleEncode("This is a test", -1, 0.1) +... 
+['\xe2\x96\x81', 'T', 'h', 'i', 's', '\xe2\x96\x81', 'is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 's', 't'] +['\xe2\x96\x81T', 'h', 'is', '\xe2\x96\x81is', '\xe2\x96\x81', 'a', '\xe2\x96\x81', 't', 'est'] +['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81', 'a', '\xe2\x96\x81', 't', 'e', 'st'] +['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 'st'] +['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 's', 't'] +['\xe2\x96\x81T', 'h', 'is', '\xe2\x96\x81', 'i', 's', '\xe2\x96\x81a', '\xe2\x96\x81', 'te', 's', 't'] +['\xe2\x96\x81This', '\xe2\x96\x81', 'is', '\xe2\x96\x81a', '\xe2\x96\x81', 'te', 's', 't'] +['\xe2\x96\x81This', '\xe2\x96\x81', 'i', 's', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 'st'] +['\xe2\x96\x81This', '\xe2\x96\x81', 'is', '\xe2\x96\x81', 'a', '\xe2\x96\x81', 't', 'e', 'st'] +['\xe2\x96\x81This', '\xe2\x96\x81', 'i', 's', '\xe2\x96\x81', 'a', '\xe2\x96\x81', 'te', 's', 't'] >>> sp.DecodeIds([284, 47, 11, 4, 15, 400]) 'This is a test' >>> sp.GetPieceSize() @@ -53,6 +68,26 @@ True 2 ``` +### Model Training +Training is performed by passing parameters of [spm_train](https://github.com/google/sentencepiece#train-sentencepiece-model) to the SentencePieceTrainer.Train() function. + +``` +>>> import sentencepiece as spm +>>> spm.SentencePieceTrainer.Train('--input=test/botchan.txt --model_prefix=m --vocab_size=1000') +unigram_model_trainer.cc(494) LOG(INFO) Starts training with : input: "test/botchan.txt" model_prefix: "m" model_type: UNIGRAM ..snip.. 
+unigram_model_trainer.cc(529) LOG(INFO) EM sub_iter=0 size=1239 obj=10.4055 num_tokens=36256 num_tokens/piece=29.2623 +unigram_model_trainer.cc(529) LOG(INFO) EM sub_iter=1 size=1239 obj=10.3187 num_tokens=36256 num_tokens/piece=29.2623 +unigram_model_trainer.cc(529) LOG(INFO) EM sub_iter=0 size=1100 obj=10.5285 num_tokens=37633 num_tokens/piece=34.2118 +unigram_model_trainer.cc(529) LOG(INFO) EM sub_iter=1 size=1100 obj=10.4973 num_tokens=37630 num_tokens/piece=34.2091 +trainer_interface.cc(284) LOG(INFO) Saving model: m.model +trainer_interface.cc(293) LOG(INFO) Saving vocabs: m.vocab +>>> +``` + ## Python2/3 String/Unicode compatibility Sentencepiece python wrapper accepts both Unicode string and legacy byte string. The output string type is determined by the input string type. diff --git a/src/spm_encode_main.cc b/src/spm_encode_main.cc index aad9723..82905b4 100644 --- a/src/spm_encode_main.cc +++ b/src/spm_encode_main.cc @@ -27,7 +27,7 @@ DEFINE_string(output, "", "output filename"); DEFINE_string(extra_options, "", "':' separated encoder extra options, e.g., \"reverse:bos:eos\""); DEFINE_int32(nbest_size, 10, "NBest size"); -DEFINE_double(theta, 0.5, "Smoothing parameter for sampling mode."); +DEFINE_double(alpha, 0.5, "Smoothing parameter for sampling mode."); int main(int argc, char *argv[]) { std::vector rest_args; @@ -71,17 +71,17 @@ int main(int argc, char *argv[]) { }; } else if (FLAGS_output_format == "sample_piece") { process = [&](const std::string &line) { - sp.SampleEncode(line, FLAGS_nbest_size, FLAGS_theta, &sps); + sp.SampleEncode(line, FLAGS_nbest_size, FLAGS_alpha, &sps); output.WriteLine(sentencepiece::string_util::Join(sps, " ")); }; } else if (FLAGS_output_format == "sample_id") { process = [&](const std::string &line) { - sp.SampleEncode(line, FLAGS_nbest_size, FLAGS_theta, &ids); + sp.SampleEncode(line, FLAGS_nbest_size, FLAGS_alpha, &ids); output.WriteLine(sentencepiece::string_util::Join(ids, " ")); }; } else if (FLAGS_output_format 
== "sample_proto") { process = [&](const std::string &line) { - sp.SampleEncode(line, FLAGS_nbest_size, FLAGS_theta, &spt); + sp.SampleEncode(line, FLAGS_nbest_size, FLAGS_alpha, &spt); output.WriteLine(spt.Utf8DebugString()); }; } else if (FLAGS_output_format == "nbest_piece") {