From 266f274da0b1c706114bebf15dff2404d2f00907 Mon Sep 17 00:00:00 2001 From: resec Date: Thu, 20 Jul 2017 10:54:12 +0800 Subject: [PATCH] Added C++ tag for all code blocks --- doc/api.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/api.md b/doc/api.md index 68f9e7c..861ea28 100644 --- a/doc/api.md +++ b/doc/api.md @@ -4,7 +4,7 @@ To start working with the SentencePiece model, you will want to include the `sentencepiece_processor.h` header file. Then instantiate sentencepiece::SentencePieceProcessor class and calls `Load`or `LoadOrDie` method to load the model with file path or std::istream. -``` +```C++ #include <sentencepiece_processor.h> sentencepiece::SentencePieceProcessor processor; @@ -17,7 +17,7 @@ processor.LoadOrDie("//path/to/model.model"); ## Tokenize text (preprocessing) Calls `SentencePieceProcessor::Encode` method to tokenize text. -``` +```C++ std::vector<std::string> pieces; processor.Encode("This is a test.", &pieces); for (const std::string &token : pieces) { @@ -27,7 +27,7 @@ for (const std::string &token : pieces) { You will obtain the sequence of vocab ids as follows: -``` +```C++ std::vector<int> ids; processor.Encode("This is a test.", &ids); for (const int id : ids) { @@ -38,7 +38,7 @@ for (const int id : ids) { ## Detokenize text (postprocessing) Calls `SentencePieceProcessor::Decode` method to detokenize a sequence of pieces or ids into a text. Basically it is guaranteed that the detoknization is an inverse operation of Encode, i.e., `Decode(Encode(Normalize(input))) == Normalize(input)`. -``` +```C++ std::vector<std::string> pieces = { "▁This", "▁is", "▁a", "▁", "te", "st", "." }; // sequence of pieces std::string text processor.Decode(pieces, &text); @@ -52,7 +52,7 @@ std::cout << text << std::endl; ## SentencePieceText proto You will want to use `SentencePieceText` class to obtain the pieces and ids at the same time. This proto also encodes a utf8-byte offset of each piece over user input or detokenized text.
-``` +```C++ #include <sentencepiece.pb.h> sentencepiece::SentencePieceText spt; @@ -80,7 +80,7 @@ for (const auto &piece : spt.pieces()) { ## Vocabulary management You will want to use the following methods to obtain ids from/to pieces. -``` +```C++ processor.GetPieceSize(); // returns the size of vocabs. processor.PieceToId("foo"); // returns the vocab id of "foo" processor.IdToPiece(10); // returns the string representation of id 10. @@ -91,7 +91,7 @@ processor.IsControl(10); // returns true if the given id is a control token. ## Extra Options Use `SetEncodeExtraOptions` and `SetDecodeExtraOptions` methods to set extra options for encoding and decoding respectively. These methods need to be called just after `Load/LoadOrDie` methods. -``` +```C++ processor.SetEncodeExtraOptions("bos:eos"); // add <s> and </s>. processor.SetEncodeExtraOptions("reverse:bos:eos"); // reverse the input and then add <s> and </s>.