Updated document
This commit is contained in:
Родитель
45b4527117
Коммит
017ba49151
19
README.md
19
README.md
|
@ -67,7 +67,18 @@ special symbol. Tokenized sequences do not preserve the necessary information to
|
||||||
* (en) Hello world. → [Hello] [World] [.] \(A space between Hello and World\)
|
* (en) Hello world. → [Hello] [World] [.] \(A space between Hello and World\)
|
||||||
* (ja) こんにちは世界。 → [こんにちは] [世界] [。] \(No space between こんにちは and 世界\)
|
* (ja) こんにちは世界。 → [こんにちは] [世界] [。] \(No space between こんにちは and 世界\)
|
||||||
|
|
||||||
## Required packages
|
## Python module
|
||||||
|
SentencePiece provides a Python wrapper that supports both SentencePiece training and segmentation.
|
||||||
|
For Linux (x64) environments, you can install the Python binary package of SentencePiece with:
|
||||||
|
|
||||||
|
```
|
||||||
|
% pip install sentencepiece
|
||||||
|
```
|
||||||
|
|
||||||
|
For more details, see [Python module](python/README.md).
|
||||||
|
|
||||||
|
|
||||||
|
## Required packages (C++)
|
||||||
The following tools and libraries are required to build SentencePiece:
|
The following tools and libraries are required to build SentencePiece:
|
||||||
|
|
||||||
* GNU autotools (autoconf automake libtool)
|
* GNU autotools (autoconf automake libtool)
|
||||||
|
@ -131,6 +142,12 @@ Use `--extra_options` flag to insert the BOS/EOS markers or reverse the input se
|
||||||
% spm_encode --extra_options=reverse:bos:eos (reverse input and add <s> and </s>)
|
% spm_encode --extra_options=reverse:bos:eos (reverse input and add <s> and </s>)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
SentencePiece supports nbest segmentation and segmentation sampling with the `--output_format=(nbest|sample)_(piece|id)` flag.
|
||||||
|
```
|
||||||
|
% spm_encode --model=<model_file> --output_format=sample_piece --nbest_size=-1 --alpha=0.5 < input > output
|
||||||
|
% spm_encode --model=<model_file> --output_format=nbest_id --nbest_size=10 < input > output
|
||||||
|
```
|
||||||
|
|
||||||
## Decode sentence pieces/ids into raw text
|
## Decode sentence pieces/ids into raw text
|
||||||
```
|
```
|
||||||
% spm_decode --model=<model_file> --input_format=piece < input > output
|
% spm_decode --model=<model_file> --input_format=piece < input > output
|
||||||
|
|
|
@ -2,19 +2,18 @@
|
||||||
|
|
||||||
Python wrapper for SentencePiece with SWIG. This module wraps sentencepiece::SentencePieceProcessor class with the following modifications:
|
Python wrapper for SentencePiece with SWIG. This module wraps sentencepiece::SentencePieceProcessor class with the following modifications:
|
||||||
* Encode and Decode methods are re-defined as EncodeAsIds, EncodeAsPieces, DecodeIds and DecodePieces respectively.
|
* Encode and Decode methods are re-defined as EncodeAsIds, EncodeAsPieces, DecodeIds and DecodePieces respectively.
|
||||||
|
* Support model training with SentencePieceTrainer.Train method.
|
||||||
* SentencePieceText proto is not supported.
|
* SentencePieceText proto is not supported.
|
||||||
* Added __len__ and __getitem__ methods. len(obj) and obj[key] return vocab size and vocab id respectively.
|
* Added __len__ and __getitem__ methods. len(obj) and obj[key] return vocab size and vocab id respectively.
|
||||||
|
|
||||||
## Build and Install SentencePiece
|
## Build and Install SentencePiece
|
||||||
You need to install SentencePiece before installing this python wrapper.
|
|
||||||
|
|
||||||
You can simply use the pip command to install the SentencePiece python module.
|
You can simply use the pip command to install the SentencePiece python module.
|
||||||
|
|
||||||
```
|
```
|
||||||
% pip install sentencepiece
|
% pip install sentencepiece
|
||||||
```
|
```
|
||||||
|
|
||||||
To install the wrapper manually, try the following commands:
|
To build and install the wrapper manually, you need to install SentencePiece C++ in advance, and then try the following commands:
|
||||||
```
|
```
|
||||||
% python setup.py build
|
% python setup.py build
|
||||||
% sudo python setup.py install
|
% sudo python setup.py install
|
||||||
|
@ -27,6 +26,7 @@ If you don’t have write permission to the global site-packages directory or do
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
|
### Segmentation
|
||||||
```
|
```
|
||||||
% python
|
% python
|
||||||
>>> import sentencepiece as spm
|
>>> import sentencepiece as spm
|
||||||
|
@ -39,6 +39,21 @@ True
|
||||||
[284, 47, 11, 4, 15, 400]
|
[284, 47, 11, 4, 15, 400]
|
||||||
>>> sp.DecodePieces(['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'est'])
|
>>> sp.DecodePieces(['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'est'])
|
||||||
'This is a test'
|
'This is a test'
|
||||||
|
>>> sp.NBestEncode("This is a test", 5)
|
||||||
|
[['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'est'], ['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 'te', 'st'], ['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 'te', 's', 't'], ['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 'st'], ['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'es', 't']]
|
||||||
|
>>> for x in range(10):
|
||||||
|
... sp.SampleEncode("This is a test", -1, 0.1)
|
||||||
|
...
|
||||||
|
['\xe2\x96\x81', 'T', 'h', 'i', 's', '\xe2\x96\x81', 'is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 's', 't']
|
||||||
|
['\xe2\x96\x81T', 'h', 'is', '\xe2\x96\x81is', '\xe2\x96\x81', 'a', '\xe2\x96\x81', 't', 'est']
|
||||||
|
['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81', 'a', '\xe2\x96\x81', 't', 'e', 'st']
|
||||||
|
['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 'st']
|
||||||
|
['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 's', 't']
|
||||||
|
['\xe2\x96\x81T', 'h', 'is', '\xe2\x96\x81', 'i', 's', '\xe2\x96\x81a', '\xe2\x96\x81', 'te', 's', 't']
|
||||||
|
['\xe2\x96\x81This', '\xe2\x96\x81', 'is', '\xe2\x96\x81a', '\xe2\x96\x81', 'te', 's', 't']
|
||||||
|
['\xe2\x96\x81This', '\xe2\x96\x81', 'i', 's', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'e', 'st']
|
||||||
|
['\xe2\x96\x81This', '\xe2\x96\x81', 'is', '\xe2\x96\x81', 'a', '\xe2\x96\x81', 't', 'e', 'st']
|
||||||
|
['\xe2\x96\x81This', '\xe2\x96\x81', 'i', 's', '\xe2\x96\x81', 'a', '\xe2\x96\x81', 'te', 's', 't']
|
||||||
>>> sp.DecodeIds([284, 47, 11, 4, 15, 400])
|
>>> sp.DecodeIds([284, 47, 11, 4, 15, 400])
|
||||||
'This is a test'
|
'This is a test'
|
||||||
>>> sp.GetPieceSize()
|
>>> sp.GetPieceSize()
|
||||||
|
@ -53,6 +68,26 @@ True
|
||||||
2
|
2
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Model Training
|
||||||
|
Training is performed by passing parameters of [spm_train](https://github.com/google/sentencepiece#train-sentencepiece-model) to the SentencePieceTrainer.Train() function.
|
||||||
|
|
||||||
|
```
|
||||||
|
>>> import sentencepiece as spm
|
||||||
|
>>> spm.SentencePieceTrainer.Train('--input=test/botchan.txt --model_prefix=m --vocab_size=1000')
|
||||||
|
unigram_model_trainer.cc(494) LOG(INFO) Starts training with :
|
||||||
|
input: "test/botchan.txt"
|
||||||
|
model_prefix: "m"
|
||||||
|
model_type: UNIGRAM
|
||||||
|
..snip..
|
||||||
|
unigram_model_trainer.cc(529) LOG(INFO) EM sub_iter=0 size=1239 obj=10.4055 num_tokens=36256 num_tokens/piece=29.2623
|
||||||
|
unigram_model_trainer.cc(529) LOG(INFO) EM sub_iter=1 size=1239 obj=10.3187 num_tokens=36256 num_tokens/piece=29.2623
|
||||||
|
unigram_model_trainer.cc(529) LOG(INFO) EM sub_iter=0 size=1100 obj=10.5285 num_tokens=37633 num_tokens/piece=34.2118
|
||||||
|
unigram_model_trainer.cc(529) LOG(INFO) EM sub_iter=1 size=1100 obj=10.4973 num_tokens=37630 num_tokens/piece=34.2091
|
||||||
|
trainer_interface.cc(284) LOG(INFO) Saving model: m.model
|
||||||
|
trainer_interface.cc(293) LOG(INFO) Saving vocabs: m.vocab
|
||||||
|
>>>
|
||||||
|
```
|
||||||
|
|
||||||
## Python2/3 String/Unicode compatibility
|
## Python2/3 String/Unicode compatibility
|
||||||
The SentencePiece Python wrapper accepts both Unicode strings and legacy byte strings.
|
The SentencePiece Python wrapper accepts both Unicode strings and legacy byte strings.
|
||||||
The output string type is determined by the input string type.
|
The output string type is determined by the input string type.
|
||||||
|
|
|
@ -27,7 +27,7 @@ DEFINE_string(output, "", "output filename");
|
||||||
DEFINE_string(extra_options, "",
|
DEFINE_string(extra_options, "",
|
||||||
"':' separated encoder extra options, e.g., \"reverse:bos:eos\"");
|
"':' separated encoder extra options, e.g., \"reverse:bos:eos\"");
|
||||||
DEFINE_int32(nbest_size, 10, "NBest size");
|
DEFINE_int32(nbest_size, 10, "NBest size");
|
||||||
DEFINE_double(theta, 0.5, "Smoothing parameter for sampling mode.");
|
DEFINE_double(alpha, 0.5, "Smoothing parameter for sampling mode.");
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
std::vector<std::string> rest_args;
|
std::vector<std::string> rest_args;
|
||||||
|
@ -71,17 +71,17 @@ int main(int argc, char *argv[]) {
|
||||||
};
|
};
|
||||||
} else if (FLAGS_output_format == "sample_piece") {
|
} else if (FLAGS_output_format == "sample_piece") {
|
||||||
process = [&](const std::string &line) {
|
process = [&](const std::string &line) {
|
||||||
sp.SampleEncode(line, FLAGS_nbest_size, FLAGS_theta, &sps);
|
sp.SampleEncode(line, FLAGS_nbest_size, FLAGS_alpha, &sps);
|
||||||
output.WriteLine(sentencepiece::string_util::Join(sps, " "));
|
output.WriteLine(sentencepiece::string_util::Join(sps, " "));
|
||||||
};
|
};
|
||||||
} else if (FLAGS_output_format == "sample_id") {
|
} else if (FLAGS_output_format == "sample_id") {
|
||||||
process = [&](const std::string &line) {
|
process = [&](const std::string &line) {
|
||||||
sp.SampleEncode(line, FLAGS_nbest_size, FLAGS_theta, &ids);
|
sp.SampleEncode(line, FLAGS_nbest_size, FLAGS_alpha, &ids);
|
||||||
output.WriteLine(sentencepiece::string_util::Join(ids, " "));
|
output.WriteLine(sentencepiece::string_util::Join(ids, " "));
|
||||||
};
|
};
|
||||||
} else if (FLAGS_output_format == "sample_proto") {
|
} else if (FLAGS_output_format == "sample_proto") {
|
||||||
process = [&](const std::string &line) {
|
process = [&](const std::string &line) {
|
||||||
sp.SampleEncode(line, FLAGS_nbest_size, FLAGS_theta, &spt);
|
sp.SampleEncode(line, FLAGS_nbest_size, FLAGS_alpha, &spt);
|
||||||
output.WriteLine(spt.Utf8DebugString());
|
output.WriteLine(spt.Utf8DebugString());
|
||||||
};
|
};
|
||||||
} else if (FLAGS_output_format == "nbest_piece") {
|
} else if (FLAGS_output_format == "nbest_piece") {
|
||||||
|
|
Загрузка…
Ссылка в новой задаче