From 017ba49151280c3c8ba7bf00a19978647403a12b Mon Sep 17 00:00:00 2001 From: Taku Kudo Date: Wed, 28 Feb 2018 20:56:07 +0900 Subject: [PATCH] Updated document --- README.md | 19 ++++++++++++++++++- python/README.md | 41 ++++++++++++++++++++++++++++++++++++++--- src/spm_encode_main.cc | 8 ++++---- 3 files changed, 60 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 3ffc50c..c41b21b 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,18 @@ special symbol. Tokenized sequences do not preserve the necessary information to * (en) Hello world. → [Hello] [World] [.] \(A space between Hello and World\) * (ja) こんにちは世界。 → [こんにちは] [世界] [。] \(No space between こんにちは and 世界\) -## Required packages +## Python module +SentencePiece provides a Python wrapper that supports both SentencePiece training and segmentation. +For the Linux (x64) environment, you can install the Python binary package of SentencePiece with: + +``` +% pip install sentencepiece +``` + +For more details, see [Python module](python/README.md). + + +## Required packages (C++) The following tools and libraries are required to build SentencePiece: * GNU autotools (autoconf automake libtool) @@ -131,6 +142,12 @@ Use `--extra_options` flag to insert the BOS/EOS markers or reverse the input se % spm_encode --extra_options=reverse:bos:eos (reverse input and add <s> and </s>) ``` +SentencePiece supports nbest segmentation and segmentation sampling with `--output_format=(nbest|sample)_(piece|id)` flags. +``` +% spm_encode --model=<model_file> --output_format=sample_piece --nbest_size=-1 --alpha=0.5 < input > output +% spm_encode --model=<model_file> --output_format=nbest_id --nbest_size=10 < input > output +``` + ## Decode sentence pieces/ids into raw text ``` % spm_decode --model=<model_file> --input_format=piece < input > output diff --git a/python/README.md b/python/README.md index af52d88..766f61b 100644 --- a/python/README.md +++ b/python/README.md @@ -2,19 +2,18 @@ Python wrapper for SentencePiece with SWIG. 
This module wraps sentencepiece::SentencePieceProcessor class with the following modifications: * Encode and Decode methods are re-defined as EncodeAsIds, EncodeAsPieces, DecodeIds and DecodePieces respectively. +* Support model training with SentencePieceTrainer.Train method. * SentencePieceText proto is not supported. * Added __len__ and __getitem__ methods. len(obj) and obj[key] returns vocab size and vocab id respectively. ## Build and Install SentencePiece -You need to install SentencePiece before installing this python wrapper. - You can simply use pip command to install SentencePiece python module. ``` % pip install sentencepiece ``` -To install the wrapper manually, try the following commands: +To build and install the wrapper manually, you need to install SentencePiece C++ in advance, and then try the following commands: ``` % python setup.py build % sudo python setup.py install @@ -27,6 +26,7 @@ If you don’t have write permission to the global site-packages directory or do ## Usage +### Segmentation ``` % python >>> import sentencepiece as spm @@ -39,6 +39,21 @@ True [284, 47, 11, 4, 15, 400] >>> sp.DecodePieces(['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'est']) 'This is a test' +>>> sp.NBestEncode("This is a test", 5) +[['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'est'], ['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 'te', 'st'], ['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 'te', 's', 't'], ['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 'st'], ['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'es', 't']] +>>> for x in range(10): +... sp.SampleEncode("This is a test", -1, 0.1) +... 
+['\xe2\x96\x81', 'T', 'h', 'i', 's', '\xe2\x96\x81', 'is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 's', 't'] +['\xe2\x96\x81T', 'h', 'is', '\xe2\x96\x81is', '\xe2\x96\x81', 'a', '\xe2\x96\x81', 't', 'est'] +['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81', 'a', '\xe2\x96\x81', 't', 'e', 'st'] +['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 'st'] +['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 's', 't'] +['\xe2\x96\x81T', 'h', 'is', '\xe2\x96\x81', 'i', 's', '\xe2\x96\x81a', '\xe2\x96\x81', 'te', 's', 't'] +['\xe2\x96\x81This', '\xe2\x96\x81', 'is', '\xe2\x96\x81a', '\xe2\x96\x81', 'te', 's', 't'] +['\xe2\x96\x81This', '\xe2\x96\x81', 'i', 's', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 'st'] +['\xe2\x96\x81This', '\xe2\x96\x81', 'is', '\xe2\x96\x81', 'a', '\xe2\x96\x81', 't', 'e', 'st'] +['\xe2\x96\x81This', '\xe2\x96\x81', 'i', 's', '\xe2\x96\x81', 'a', '\xe2\x96\x81', 'te', 's', 't'] >>> sp.DecodeIds([284, 47, 11, 4, 15, 400]) 'This is a test' >>> sp.GetPieceSize() @@ -53,6 +68,26 @@ True 2 ``` +### Model Training +Training is performed by passing parameters of [spm_train](https://github.com/google/sentencepiece#train-sentencepiece-model) to the SentencePieceTrainer.Train() function. + +``` +>>> import sentencepiece as spm +>>> spm.SentencePieceTrainer.Train('--input=test/botchan.txt --model_prefix=m --vocab_size=1000') +unigram_model_trainer.cc(494) LOG(INFO) Starts training with : input: "test/botchan.txt" model_prefix: "m" model_type: UNIGRAM ..snip.. 
+unigram_model_trainer.cc(529) LOG(INFO) EM sub_iter=0 size=1239 obj=10.4055 num_tokens=36256 num_tokens/piece=29.2623 +unigram_model_trainer.cc(529) LOG(INFO) EM sub_iter=1 size=1239 obj=10.3187 num_tokens=36256 num_tokens/piece=29.2623 +unigram_model_trainer.cc(529) LOG(INFO) EM sub_iter=0 size=1100 obj=10.5285 num_tokens=37633 num_tokens/piece=34.2118 +unigram_model_trainer.cc(529) LOG(INFO) EM sub_iter=1 size=1100 obj=10.4973 num_tokens=37630 num_tokens/piece=34.2091 +trainer_interface.cc(284) LOG(INFO) Saving model: m.model +trainer_interface.cc(293) LOG(INFO) Saving vocabs: m.vocab +>>> +``` + ## Python2/3 String/Unicode compatibility Sentencepiece python wrapper accepts both Unicode string and legacy byte string. The output string type is determined by the input string type. diff --git a/src/spm_encode_main.cc b/src/spm_encode_main.cc index aad9723..82905b4 100644 --- a/src/spm_encode_main.cc +++ b/src/spm_encode_main.cc @@ -27,7 +27,7 @@ DEFINE_string(output, "", "output filename"); DEFINE_string(extra_options, "", "':' separated encoder extra options, e.g., \"reverse:bos:eos\""); DEFINE_int32(nbest_size, 10, "NBest size"); -DEFINE_double(theta, 0.5, "Smoothing parameter for sampling mode."); +DEFINE_double(alpha, 0.5, "Smoothing parameter for sampling mode."); int main(int argc, char *argv[]) { std::vector rest_args; @@ -71,17 +71,17 @@ int main(int argc, char *argv[]) { }; } else if (FLAGS_output_format == "sample_piece") { process = [&](const std::string &line) { - sp.SampleEncode(line, FLAGS_nbest_size, FLAGS_theta, &sps); + sp.SampleEncode(line, FLAGS_nbest_size, FLAGS_alpha, &sps); output.WriteLine(sentencepiece::string_util::Join(sps, " ")); }; } else if (FLAGS_output_format == "sample_id") { process = [&](const std::string &line) { - sp.SampleEncode(line, FLAGS_nbest_size, FLAGS_theta, &ids); + sp.SampleEncode(line, FLAGS_nbest_size, FLAGS_alpha, &ids); output.WriteLine(sentencepiece::string_util::Join(ids, " ")); }; } else if (FLAGS_output_format 
== "sample_proto") { process = [&](const std::string &line) { - sp.SampleEncode(line, FLAGS_nbest_size, FLAGS_theta, &spt); + sp.SampleEncode(line, FLAGS_nbest_size, FLAGS_alpha, &spt); output.WriteLine(spt.Utf8DebugString()); }; } else if (FLAGS_output_format == "nbest_piece") {