Strengthen the Annotation class: Handle empty sentences and tests (#85)
* Changing Annotation to adhere to [begin, end) * Stronger unit tests on sentences + num words, num sentences * Hotfix with empty string view from EOS * No more absolving empty-sentence; Added tests now defined behaviour * Uncommenting important section in unit test * Ensure empty string view default, beginning at end so marker points * Further strengthen and comment unit-tests, mark exactly where empty sentence is happening * Review comments: Dummy sentence + docs - What should be a simple fast accessor is turning into compute. Normally the way to deal with this, for better or worse, is to put 0 at the beginning of sentenceEndIds_. (Putting 0 at the beginning of sentenceEndIds_) - Indices into what? Mentioned to be flatByteRanges_. * Documentation updates * More changes to docs Co-authored-by: abhi-agg <66322306+abhi-agg@users.noreply.github.com>
This commit is contained in:
Родитель
b345b0e035
Коммит
3daa024eb3
|
@ -11,63 +11,210 @@ TEST_CASE("Test Annotation API with random sentences") {
|
|||
/// which sentence went in where and try to use accessor methods on
|
||||
/// AnnotatedText to check if what we have as ground-truth by construction is
|
||||
/// consistent with what is returned.
|
||||
size_t sentences = 20;
|
||||
size_t sentences = 500;
|
||||
size_t maxWords = 40;
|
||||
|
||||
// Set in case needed to see output. The output is in lines of #sentences +
|
||||
// header, which can be split and compared for easy understanding. The ideal
|
||||
// way to inspect what is going wrong is to redirect output and use to split
|
||||
// the different stages by sentences + 1 lines and check the diff.
|
||||
bool debug{false};
|
||||
|
||||
std::mt19937 randomIntGen_;
|
||||
randomIntGen_.seed(42);
|
||||
|
||||
AnnotatedText testAnnotation;
|
||||
std::vector<std::vector<ByteRange>> sentenceWords;
|
||||
std::vector<ByteRange> Words;
|
||||
AnnotatedText testAnnotation; // This is the container we add through API and
|
||||
// check if the access is correct.
|
||||
|
||||
// External book-keeping so we have ground truths. Each element represents a
|
||||
// sentence.
|
||||
|
||||
// word byte ranges - for testAnnotation.word(sId, wId)
|
||||
std::vector<std::vector<ByteRange>> groundTruthWords;
|
||||
// sentence byte ranges - for testAnnotation.sentence(sId)
|
||||
std::vector<ByteRange> groundTruthSentences;
|
||||
|
||||
// Prepare the text and construct ByteRanges as intended for sentences and
|
||||
// words. The ByteRanges we construct here are expected to be the
|
||||
// ground-truths for words and sentences. The string being constructed is like
|
||||
// as follows:
|
||||
//
|
||||
// 0-0 0-1 0-2 0-3
|
||||
// 1-0 1-1 1-2 1-3 1-4
|
||||
// 2-0 2-1
|
||||
//
|
||||
// 4-0 4-1 4-2 4-3
|
||||
//
|
||||
// Words are separated by space units.
|
||||
//
|
||||
// Below, we accumulate the text with intended structure as above, and
|
||||
// ground-truth tables populated to be aware of the ByteRanges where they are
|
||||
// meant to be.
|
||||
if (debug) {
|
||||
std::cout << "Preparing text and ground truth-tables" << std::endl;
|
||||
}
|
||||
for (size_t idx = 0; idx < sentences; idx++) {
|
||||
if (idx != 0)
|
||||
testAnnotation.text += "\n";
|
||||
|
||||
Words.clear();
|
||||
size_t words = randomIntGen_() % maxWords + 1;
|
||||
Words.reserve(words);
|
||||
for (size_t idw = 0; idw < words; idw++) {
|
||||
size_t before = testAnnotation.text.size();
|
||||
// The number of words can be zero; we need to support empty sentences as well.
|
||||
size_t numWords = randomIntGen_() % maxWords;
|
||||
|
||||
std::vector<ByteRange> wordByteRanges;
|
||||
wordByteRanges.reserve(numWords);
|
||||
|
||||
// For empty sentence, we expect it to be empty and marked in position where
|
||||
// the existing string is if needed to be pointed out.
|
||||
size_t before = testAnnotation.text.size() - 1;
|
||||
size_t sentenceBegin{before}, sentenceEnd{before};
|
||||
|
||||
for (size_t idw = 0; idw < numWords; idw++) {
|
||||
if (idw != 0) {
|
||||
testAnnotation.text += " ";
|
||||
if (debug) {
|
||||
std::cout << " ";
|
||||
}
|
||||
}
|
||||
|
||||
// Get new beginning, accounting for space above.
|
||||
before = testAnnotation.text.size();
|
||||
|
||||
// Add the word
|
||||
std::string word = std::to_string(idx) + "-" + std::to_string(idw);
|
||||
testAnnotation.text += word;
|
||||
if (idw != 0)
|
||||
testAnnotation.text += " ";
|
||||
Words.push_back((ByteRange){before, before + word.size() - 1});
|
||||
}
|
||||
// std::cout << std::endl;
|
||||
|
||||
sentenceWords.push_back(Words);
|
||||
// Do math, before, before + new-word's size.
|
||||
wordByteRanges.push_back((ByteRange){before, before + word.size()});
|
||||
|
||||
if (debug) {
|
||||
std::cout << word;
|
||||
}
|
||||
|
||||
if (idw == 0) {
|
||||
sentenceBegin = before;
|
||||
}
|
||||
if (idw == numWords - 1) {
|
||||
sentenceEnd = before + word.size();
|
||||
}
|
||||
}
|
||||
if (debug) {
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
groundTruthWords.push_back(wordByteRanges);
|
||||
groundTruthSentences.push_back((ByteRange){sentenceBegin, sentenceEnd});
|
||||
}
|
||||
|
||||
// std::cout << "Inserting words:" << std::endl;
|
||||
std::vector<std::vector<marian::string_view>> byteRanges;
|
||||
for (auto &sentence : sentenceWords) {
|
||||
// We prepare string_views now with the known ByteRanges and use the
|
||||
// string_view based AnnotatedText.addSentence(...) API to add sentences to
|
||||
// transparently convert from string_views to ByteRanges, rebasing/working out
|
||||
// the math underneath.
|
||||
|
||||
if (debug) {
|
||||
std::cout << "Inserting words onto container and save ground-truth-table:"
|
||||
<< std::endl;
|
||||
}
|
||||
|
||||
std::vector<std::vector<marian::string_view>> wordStringViews;
|
||||
for (auto &sentence : groundTruthWords) {
|
||||
std::vector<marian::string_view> wordByteRanges;
|
||||
bool first{true};
|
||||
for (auto &word : sentence) {
|
||||
marian::string_view wordView(&testAnnotation.text[word.begin],
|
||||
word.end - word.begin);
|
||||
word.size());
|
||||
wordByteRanges.push_back(wordView);
|
||||
// std::cout << std::string(wordView) << " ";
|
||||
if (debug) {
|
||||
if (first) {
|
||||
first = false;
|
||||
} else {
|
||||
std::cout << " ";
|
||||
}
|
||||
std::cout << std::string(wordView);
|
||||
}
|
||||
}
|
||||
testAnnotation.addSentence(wordByteRanges);
|
||||
byteRanges.push_back(wordByteRanges);
|
||||
// std::cout << std::endl;
|
||||
wordStringViews.push_back(wordByteRanges);
|
||||
if (debug) {
|
||||
std::cout << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// std::cout << "From container: " << std::endl;
|
||||
for (int idx = 0; idx < sentenceWords.size(); idx++) {
|
||||
for (int idw = 0; idw < sentenceWords[idx].size(); idw++) {
|
||||
ByteRange expected = sentenceWords[idx][idw];
|
||||
if (debug) {
|
||||
std::cout
|
||||
<< "Inserting sentences onto container and save ground-truth-table"
|
||||
<< std::endl;
|
||||
}
|
||||
std::vector<marian::string_view> sentenceStringViews;
|
||||
for (auto &sentenceByteRange : groundTruthSentences) {
|
||||
char *data = &(testAnnotation.text[sentenceByteRange.begin]);
|
||||
marian::string_view sentenceView(data, sentenceByteRange.size());
|
||||
sentenceStringViews.push_back(sentenceView);
|
||||
|
||||
if (debug) {
|
||||
std::cout << sentenceView << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// Access from the sentence(sentenceIdx) API and confirm that the ground truth
|
||||
// we expect is same as what comes out of the container.
|
||||
if (debug) {
|
||||
std::cout << "From container: Sentences" << std::endl;
|
||||
}
|
||||
for (int idx = 0; idx < groundTruthSentences.size(); idx++) {
|
||||
ByteRange expected = groundTruthSentences[idx];
|
||||
ByteRange obtained = testAnnotation.sentenceAsByteRange(idx);
|
||||
if (debug) {
|
||||
std::cout << std::string(testAnnotation.sentence(idx)) << std::endl;
|
||||
}
|
||||
CHECK(expected.begin == obtained.begin);
|
||||
CHECK(expected.end == obtained.end);
|
||||
std::string expected_string = std::string(sentenceStringViews[idx]);
|
||||
std::string obtained_string = std::string(testAnnotation.sentence(idx));
|
||||
CHECK(expected_string == obtained_string);
|
||||
}
|
||||
|
||||
/// Access the word(sentenceIdx, wordIdx) API and confirm what we hold as
|
||||
/// expected words are the same as those obtained from the container.
|
||||
if (debug) {
|
||||
std::cout << "From container: Words" << std::endl;
|
||||
}
|
||||
|
||||
CHECK(groundTruthWords.size() == testAnnotation.numSentences());
|
||||
for (int idx = 0; idx < groundTruthWords.size(); idx++) {
|
||||
CHECK(groundTruthWords[idx].size() == testAnnotation.numWords(idx));
|
||||
}
|
||||
|
||||
for (int idx = 0; idx < groundTruthWords.size(); idx++) {
|
||||
for (int idw = 0; idw < groundTruthWords[idx].size(); idw++) {
|
||||
ByteRange expected = groundTruthWords[idx][idw];
|
||||
ByteRange obtained = testAnnotation.wordAsByteRange(idx, idw);
|
||||
// std::cout << std::string(testAnnotation.word(idx, idw)) << " ";
|
||||
if (debug) {
|
||||
std::cout << std::string(testAnnotation.word(idx, idw)) << " ";
|
||||
}
|
||||
CHECK(expected.begin == obtained.begin);
|
||||
CHECK(expected.end == obtained.end);
|
||||
|
||||
std::string expected_string = std::string(byteRanges[idx][idw]);
|
||||
CHECK(expected_string == std::string(testAnnotation.word(idx, idw)));
|
||||
std::string expected_string = std::string(wordStringViews[idx][idw]);
|
||||
std::string obtained_string = std::string(testAnnotation.word(idx, idw));
|
||||
CHECK(expected_string == obtained_string);
|
||||
}
|
||||
if (debug) {
|
||||
std::cout << std::endl;
|
||||
}
|
||||
// std::cout << std::endl;
|
||||
}
|
||||
|
||||
// Try inserting an empty Sentence. This is ensuring we check for empty
|
||||
// Sentence if the random test above does not cover it for some reason.
|
||||
int emptySentenceIdx = sentences;
|
||||
std::vector<marian::string_view> emptySentence;
|
||||
testAnnotation.addSentence(emptySentence);
|
||||
|
||||
// There are no words.
|
||||
CHECK(testAnnotation.numWords(emptySentenceIdx) == 0);
|
||||
|
||||
// Empty sentence expected at output.
|
||||
std::string expectedEmptyString = "";
|
||||
marian::string_view emptyView = testAnnotation.sentence(emptySentenceIdx);
|
||||
std::string obtainedString = std::string(emptyView.data(), emptyView.size());
|
||||
CHECK(expectedEmptyString == obtainedString);
|
||||
}
|
||||
|
|
|
@ -6,48 +6,44 @@ namespace marian {
|
|||
namespace bergamot {
|
||||
|
||||
void Annotation::addSentence(std::vector<ByteRange> &sentence) {
|
||||
size_t size = flatByteRanges_.size();
|
||||
flatByteRanges_.insert(std::end(flatByteRanges_), std::begin(sentence),
|
||||
std::end(sentence));
|
||||
sentenceBeginIds_.push_back(size);
|
||||
size_t size = flatByteRanges_.size();
|
||||
sentenceEndIds_.push_back(size);
|
||||
}
|
||||
|
||||
size_t Annotation::numWords(size_t sentenceIdx) const {
|
||||
auto terminals = sentenceTerminalIds(sentenceIdx);
|
||||
return terminals.second - terminals.first + 1;
|
||||
}
|
||||
|
||||
std::pair<size_t, size_t>
|
||||
Annotation::sentenceTerminalIds(size_t sentenceIdx) const {
|
||||
size_t bosId, eosId;
|
||||
bosId = sentenceBeginIds_[sentenceIdx];
|
||||
eosId = sentenceIdx + 1 < numSentences()
|
||||
? sentenceBeginIds_[sentenceIdx + 1] - 1
|
||||
: flatByteRanges_.size() - 1;
|
||||
|
||||
// Out of bound checks.
|
||||
assert(bosId < flatByteRanges_.size());
|
||||
assert(eosId < flatByteRanges_.size());
|
||||
return std::make_pair(bosId, eosId);
|
||||
}
|
||||
|
||||
std::pair<ByteRange, ByteRange>
|
||||
Annotation::sentenceTerminals(size_t sentenceIdx) const {
|
||||
auto terminals = sentenceTerminalIds(sentenceIdx);
|
||||
return std::make_pair(flatByteRanges_[terminals.first],
|
||||
flatByteRanges_[terminals.second]);
|
||||
bosId = sentenceEndIds_[sentenceIdx]; // Half interval, so;
|
||||
eosId = sentenceEndIds_[sentenceIdx + 1];
|
||||
// Difference between eosId and bosId is the number of words.
|
||||
return eosId - bosId;
|
||||
}
|
||||
|
||||
ByteRange Annotation::sentence(size_t sentenceIdx) const {
|
||||
auto terminals = sentenceTerminals(sentenceIdx);
|
||||
return (ByteRange){terminals.first.begin, terminals.second.end};
|
||||
size_t bosId, eosId;
|
||||
bosId = sentenceEndIds_[sentenceIdx]; // Half interval, so;
|
||||
eosId = sentenceEndIds_[sentenceIdx + 1];
|
||||
ByteRange sentenceByteRange;
|
||||
|
||||
if (bosId == eosId) {
|
||||
// We have an empty sentence. However, we want to be able to point where in
|
||||
// target this happened through the ranges. We are looking for the end of
|
||||
// the flatByteRange and non-empty sentence before this happened and
|
||||
// construct empty string-view equivalent ByteRange.
|
||||
ByteRange eos = flatByteRanges_[eosId - 1];
|
||||
sentenceByteRange = (ByteRange){eos.end, eos.end};
|
||||
} else {
|
||||
ByteRange bos = flatByteRanges_[bosId];
|
||||
ByteRange eos = flatByteRanges_[eosId - 1];
|
||||
sentenceByteRange = (ByteRange){bos.begin, eos.end};
|
||||
}
|
||||
return sentenceByteRange;
|
||||
}
|
||||
|
||||
ByteRange Annotation::word(size_t sentenceIdx, size_t wordIdx) const {
|
||||
size_t offset = sentenceBeginIds_[sentenceIdx];
|
||||
// auto terminals = sentenceTerminals(sentenceIdx);
|
||||
// assert(offset + wordIdx <= terminals.second);
|
||||
return flatByteRanges_[offset + wordIdx];
|
||||
size_t bosOffset = sentenceEndIds_[sentenceIdx];
|
||||
return flatByteRanges_[bosOffset + wordIdx];
|
||||
}
|
||||
|
||||
string_view AnnotatedText::word(size_t sentenceIdx, size_t wordIdx) const {
|
||||
|
|
|
@ -19,51 +19,82 @@ struct ByteRange {
|
|||
|
||||
/// An Annotation is a collection of ByteRanges used to denote ancillary
|
||||
/// information of sentences and words on a text of string. Annotation is meant
|
||||
/// for consumption on platforms where string_view creates problems (eg: exports
|
||||
/// through WASM). See AnnotatedText for cases where this is a non-issue.
|
||||
/// for consumption on platforms where `string_view` creates problems (eg:
|
||||
/// exports through WASM) conveniently rebasing them as required into
|
||||
/// ByteRanges. See AnnotatedText for cases where this is a non-issue.
|
||||
///
|
||||
/// **Usage**
|
||||
///
|
||||
/// To ensure rebasing is consistent during creation and updates, use
|
||||
/// `Annotation` best through `AnnotatedText`, which also holds the reference
|
||||
/// string and can work with `string_views`.
|
||||
///
|
||||
/// If used separately, it is on the user to ensure the reference string
|
||||
/// is the same as what the Annotation refers to. For best results, an instance
|
||||
/// is expected to be read only in this mode of operation.
|
||||
///
|
||||
/// **Idea**
|
||||
///
|
||||
/// Annotation is intended to be the same structure conceptually as below,
|
||||
/// except the `std::vector<std::vector<ByteRange>>` hammered into a flat
|
||||
/// structure to avoid multiple reallocs keeping efficiency in mind. This is
|
||||
/// achieved by having markers of where sentence ends in the flat container
|
||||
/// storing word ByteRanges.
|
||||
///
|
||||
/// ```cpp
|
||||
/// typedef ByteRange Word;
|
||||
/// // std::vector<ByteRange>, a single sentence
|
||||
/// typedef std::vector<Word> Sentence;
|
||||
/// std::vector<std::vector<ByteRange>> // multiple sentences
|
||||
/// typedef std::vector<Sentence> Annotation;
|
||||
///
|
||||
/// Annotation example;
|
||||
/// ```
|
||||
/// This structure exists to provide a consistent API to access the nested
|
||||
/// sentences of varying lengths, which occur in source-text processed into
|
||||
/// multiple sentences, and target-text translated from source as multiple
|
||||
/// sentences, both composed of (sub)-words, providing a List[List] like access
|
||||
/// while storing it in a compact and efficient manner.
|
||||
class Annotation {
|
||||
public:
|
||||
/// Annotation is constructed empty. See addSentence to populate it with
|
||||
/// Annotation is constructed empty. See `addSentence()` to populate it with
|
||||
/// annotations.
|
||||
Annotation() {}
|
||||
Annotation() {
|
||||
// The -1-th sentence ends at 0.
|
||||
sentenceEndIds_.push_back(0);
|
||||
}
|
||||
|
||||
/// Returns the number of sentences annotated in a text.
|
||||
size_t numSentences() const { return sentenceBeginIds_.size(); }
|
||||
size_t numSentences() const { return sentenceEndIds_.size() - 1; }
|
||||
|
||||
/// Returns number of words in the sentece identified by sentenceIdx.
|
||||
/// Returns number of words in the sentence identified by `sentenceIdx`.
|
||||
size_t numWords(size_t sentenceIdx) const;
|
||||
|
||||
/// Adds a sentences from vector<ByteRange> representation, internally doing
|
||||
/// Adds a sentence from `vector<ByteRange>` representation, internally doing
|
||||
/// extra book-keeping for the sentence terminal markings. Sentences are
|
||||
/// expected to be added in order as they occur in text.
|
||||
void addSentence(std::vector<ByteRange> &sentence);
|
||||
|
||||
/// Returns a ByteRange representing wordIdx in sentenceIdx
|
||||
/// Returns a ByteRange representing `wordIdx` in sentence indexed by
|
||||
/// `sentenceIdx`. `wordIdx` follows 0-based indexing, and should be less than
|
||||
/// `.numWords()` for `sentenceIdx` for defined behaviour.
|
||||
ByteRange word(size_t sentenceIdx, size_t wordIdx) const;
|
||||
|
||||
/// Returns a ByteRange representing sentence corresponding to sentenceIdx.
|
||||
/// Returns a ByteRange representing sentence corresponding to `sentenceIdx`.
|
||||
/// `sentenceIdx` follows 0-based indexing, and behaviour is defined only when
|
||||
/// less than `.numSentences()`.
|
||||
ByteRange sentence(size_t sentenceIdx) const;
|
||||
|
||||
private:
|
||||
/// A flat storage for ByteRanges. Composed of word ByteRanges, extra
|
||||
/// information in sentenceBeginIds_ to denote sentence boundary markers as
|
||||
/// information in sentenceEndIds_ to denote sentence boundary markers as
|
||||
/// indices.
|
||||
std::vector<ByteRange> flatByteRanges_;
|
||||
|
||||
/// Stores indices where sentences begin
|
||||
std::vector<size_t> sentenceBeginIds_;
|
||||
|
||||
/// Returns ByteRanges corresponding to beginning and end words of sentence
|
||||
/// corresponding to sentenceIdx. This is useful in using the information to
|
||||
/// construct a ByteRange of a sentence taking the begin from the first and
|
||||
/// end from the second.
|
||||
std::pair<ByteRange, ByteRange> sentenceTerminals(size_t sentenceIdx) const;
|
||||
|
||||
/// Returns indices of terminal (word) ByteRanges in sentenceIds_ of a
|
||||
/// sentence corresponding to sentenceIdx. The distance can be used to compute
|
||||
/// number of words in a sentence (numWords) and also to construct the
|
||||
/// terminal ByteRanges (sentenceTerminals).
|
||||
std::pair<size_t, size_t> sentenceTerminalIds(size_t sentenceIdx) const;
|
||||
/// Stores indices onto flatByteRanges_ of where sentences end (not inclusive,
|
||||
/// aligned with C++ half interval notions). There is a 0 marker to simplify
|
||||
/// sources, indicating where the -1-th sentence ends.
|
||||
std::vector<size_t> sentenceEndIds_;
|
||||
};
|
||||
|
||||
/// AnnotatedText is effectively std::string text + Annotation, providing the
|
||||
|
|
Загрузка…
Ссылка в новой задаче