This commit is contained in:
Taku Kudo 2018-02-28 20:56:07 +09:00
Parent 45b4527117
Commit 017ba49151
3 changed files with 60 additions and 8 deletions

View file

@@ -67,7 +67,18 @@ special symbol. Tokenized sequences do not preserve the necessary information to
* (en) Hello world. → [Hello] [World] [.] \(A space between Hello and World\)
* (ja) こんにちは世界。 → [こんにちは] [世界] [。] \(No space between こんにちは and 世界\)
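SentencePiece avoids this ambiguity by escaping whitespace with a special symbol (▁), so decoding is a lossless round trip. A minimal sketch of this property with the Python wrapper, assuming a trained model file `test.model` (the file name is illustrative):
```
>>> import sentencepiece as spm
>>> sp = spm.SentencePieceProcessor()
>>> sp.Load('test.model')
True
>>> sp.DecodePieces(sp.EncodeAsPieces('Hello world.')) == 'Hello world.'
True
```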
## Python module
SentencePiece provides a Python wrapper that supports both SentencePiece training and segmentation.
For a Linux (x64) environment, you can install the Python binary package of SentencePiece with:
```
% pip install sentencepiece
```
For more detail, see the [Python module](python/README.md) documentation.
## Required packages (C++)
The following tools and libraries are required to build SentencePiece:
* GNU autotools (autoconf automake libtool)
@@ -131,6 +142,12 @@ Use `--extra_options` flag to insert the BOS/EOS markers or reverse the input se
% spm_encode --extra_options=reverse:bos:eos (reverse input and add <s> and </s>)
```
SentencePiece supports nbest segmentation and segmentation sampling with the `--output_format=(nbest|sample)_(piece|id)` flags.
```
% spm_encode --model=<model_file> --output_format=sample_piece --nbest_size=-1 --alpha=0.5 < input > output
% spm_encode --model=<model_file> --output_format=nbest_id --nbest_size=10 < input > output
```
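As a rough reading of these flags: `--nbest_size` bounds the candidate set, with `-1` conventionally meaning sampling from all segmentation candidates, and `--alpha` smooths the sampling distribution, with smaller values giving more diverse samples. The same functionality is exposed in the Python wrapper; a minimal sketch, assuming a trained model file `m.model` (the file name is illustrative):
```
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load('m.model')  # any trained model file works here

# One random segmentation; nbest_size=-1 samples from all candidates.
print(sp.SampleEncode('This is a test', -1, 0.5))

# The ten best segmentations by model score.
print(sp.NBestEncode('This is a test', 10))
```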
## Decode sentence pieces/ids into raw text
```
% spm_decode --model=<model_file> --input_format=piece < input > output

View file

@@ -2,19 +2,18 @@
Python wrapper for SentencePiece with SWIG. This module wraps the sentencepiece::SentencePieceProcessor class with the following modifications:
* Encode and Decode methods are re-defined as EncodeAsIds, EncodeAsPieces, DecodeIds and DecodePieces respectively.
* Supports model training with the SentencePieceTrainer.Train method.
* SentencePieceText proto is not supported.
* Added __len__ and __getitem__ methods. len(obj) and obj[key] return the vocab size and the vocab id respectively, as in the sketch below.
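A short sketch of the added dunder methods, assuming a trained model file `m.model`; the printed values are illustrative:
```
>>> import sentencepiece as spm
>>> sp = spm.SentencePieceProcessor()
>>> sp.Load('m.model')
True
>>> len(sp)                 # vocab size, equivalent to sp.GetPieceSize()
1000
>>> sp['\xe2\x96\x81This']  # piece-to-id lookup
284
```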
## Build and Install SentencePiece
You can simply use the pip command to install the SentencePiece python module.
```
% pip install sentencepiece
```
To build and install the wrapper manually, you need to install SentencePiece C++ in advance, and then try the following commands:
```
% python setup.py build
% sudo python setup.py install
@@ -27,6 +26,7 @@ If you don't have write permission to the global site-packages directory or do
## Usage
### Segmentation
```
% python
>>> import sentencepiece as spm
@@ -39,6 +39,21 @@ True
[284, 47, 11, 4, 15, 400]
>>> sp.DecodePieces(['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'est'])
'This is a test'
>>> sp.NBestEncode("This is a test", 5)
[['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'est'], ['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 'te', 'st'], ['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 'te', 's', 't'], ['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 'st'], ['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'es', 't']]
>>> for x in range(10):
... sp.SampleEncode("This is a test", -1, 0.1)
...
['\xe2\x96\x81', 'T', 'h', 'i', 's', '\xe2\x96\x81', 'is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 's', 't']
['\xe2\x96\x81T', 'h', 'is', '\xe2\x96\x81is', '\xe2\x96\x81', 'a', '\xe2\x96\x81', 't', 'est']
['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81', 'a', '\xe2\x96\x81', 't', 'e', 'st']
['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 'st']
['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 's', 't']
['\xe2\x96\x81T', 'h', 'is', '\xe2\x96\x81', 'i', 's', '\xe2\x96\x81a', '\xe2\x96\x81', 'te', 's', 't']
['\xe2\x96\x81This', '\xe2\x96\x81', 'is', '\xe2\x96\x81a', '\xe2\x96\x81', 'te', 's', 't']
['\xe2\x96\x81This', '\xe2\x96\x81', 'i', 's', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 'st']
['\xe2\x96\x81This', '\xe2\x96\x81', 'is', '\xe2\x96\x81', 'a', '\xe2\x96\x81', 't', 'e', 'st']
['\xe2\x96\x81This', '\xe2\x96\x81', 'i', 's', '\xe2\x96\x81', 'a', '\xe2\x96\x81', 'te', 's', 't']
>>> sp.DecodeIds([284, 47, 11, 4, 15, 400])
'This is a test'
>>> sp.GetPieceSize()
@@ -53,6 +68,26 @@ True
2
```
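The transcript above can be condensed into a small script. A minimal sketch using only the methods shown, assuming a trained model file `test.model` (the file name is illustrative):
```
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load('test.model')

text = 'This is a test'
pieces = sp.EncodeAsPieces(text)  # subword strings
ids = sp.EncodeAsIds(text)        # vocabulary ids

# Both decode paths restore the original text exactly.
assert sp.DecodePieces(pieces) == text
assert sp.DecodeIds(ids) == text
```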
### Model Training
Training is performed by passing the parameters of [spm_train](https://github.com/google/sentencepiece#train-sentencepiece-model) to the SentencePieceTrainer.Train() function.
```
>>> import sentencepiece as spm
>>> spm.SentencePieceTrainer.Train('--input=test/botchan.txt --model_prefix=m --vocab_size=1000')
unigram_model_trainer.cc(494) LOG(INFO) Starts training with :
input: "test/botchan.txt"
model_prefix: "m"
model_type: UNIGRAM
..snip..
unigram_model_trainer.cc(529) LOG(INFO) EM sub_iter=0 size=1239 obj=10.4055 num_tokens=36256 num_tokens/piece=29.2623
unigram_model_trainer.cc(529) LOG(INFO) EM sub_iter=1 size=1239 obj=10.3187 num_tokens=36256 num_tokens/piece=29.2623
unigram_model_trainer.cc(529) LOG(INFO) EM sub_iter=0 size=1100 obj=10.5285 num_tokens=37633 num_tokens/piece=34.2118
unigram_model_trainer.cc(529) LOG(INFO) EM sub_iter=1 size=1100 obj=10.4973 num_tokens=37630 num_tokens/piece=34.2091
trainer_interface.cc(284) LOG(INFO) Saving model: m.model
trainer_interface.cc(293) LOG(INFO) Saving vocabs: m.vocab
>>>
```
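The trained model can then be loaded and used right away. A short sketch continuing the example above (`m.model` is the file written by the Train() call):
```
import sentencepiece as spm

# Training writes m.model and m.vocab as shown in the log above.
spm.SentencePieceTrainer.Train(
    '--input=test/botchan.txt --model_prefix=m --vocab_size=1000')

# Load the freshly written model and use it for segmentation.
sp = spm.SentencePieceProcessor()
sp.Load('m.model')
print(sp.EncodeAsPieces('This is a test'))
```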
## Python2/3 String/Unicode compatibility
The SentencePiece python wrapper accepts both Unicode strings and legacy byte strings.
The output string type is determined by the input string type.
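A minimal sketch of this behavior under Python 2, assuming a trained model file `test.model`; the type assertions are the point, not the particular segmentation:
```
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load('test.model')  # illustrative model file name

# Legacy byte-string input yields byte-string (str) pieces.
assert all(isinstance(p, str) for p in sp.EncodeAsPieces('This is a test'))

# Unicode input yields unicode pieces.
assert all(isinstance(p, unicode) for p in sp.EncodeAsPieces(u'This is a test'))
```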

View file

@@ -27,7 +27,7 @@ DEFINE_string(output, "", "output filename");
DEFINE_string(extra_options, "",
              "':' separated encoder extra options, e.g., \"reverse:bos:eos\"");
DEFINE_int32(nbest_size, 10, "NBest size");
DEFINE_double(alpha, 0.5, "Smoothing parameter for sampling mode.");
int main(int argc, char *argv[]) {
  std::vector<std::string> rest_args;
@@ -71,17 +71,17 @@ int main(int argc, char *argv[]) {
    };
  } else if (FLAGS_output_format == "sample_piece") {
    process = [&](const std::string &line) {
      sp.SampleEncode(line, FLAGS_nbest_size, FLAGS_alpha, &sps);
      output.WriteLine(sentencepiece::string_util::Join(sps, " "));
    };
  } else if (FLAGS_output_format == "sample_id") {
    process = [&](const std::string &line) {
      sp.SampleEncode(line, FLAGS_nbest_size, FLAGS_alpha, &ids);
      output.WriteLine(sentencepiece::string_util::Join(ids, " "));
    };
  } else if (FLAGS_output_format == "sample_proto") {
    process = [&](const std::string &line) {
      sp.SampleEncode(line, FLAGS_nbest_size, FLAGS_alpha, &spt);
      output.WriteLine(spt.Utf8DebugString());
    };
  } else if (FLAGS_output_format == "nbest_piece") {