Strengthen the Annotation class: Handle empty sentences and tests (#85)

* Changing Annotation to adhere to [begin, end)
* Stronger unit tests on sentences + num words, num sentences
* Hotfix with empty string view from EOS
* No more absolving empty-sentence; Added tests now defined behaviour
* Uncommenting important section in unit test
* Ensure empty string view default, beginning at end so marker points
* Further strengthen and comment unit-tests, mark exactly where empty sentence is happening
* Review comments: Dummy sentence + docs
  - What should be a simple fast accessor is turning into compute. Normally the way to deal with this, for better or worse, is to put 0 at the beginning of sentenceEndIds_. (Putting 0 at the beginning of sentenceEndIds_)
  - Indices into what? Mentioned to be flatByteRanges_.
* Documentation updates
* More changes to docs

Co-authored-by: abhi-agg <66322306+abhi-agg@users.noreply.github.com>
2021-04-12 17:05:23 +01:00 · 2021-04-12 17:05:23 +01:00 · 3daa024eb3
--- a/src/tests/annotation_tests.cpp
+++ b/src/tests/annotation_tests.cpp
@ -11,63 +11,210 @@ TEST_CASE("Test Annotation API with random sentences") {
  /// which sentence went in where and try to use accessor methods on
  /// AnnotatedText to check if what we have as ground-truth by construction is
  /// consistent with what is returned.
-  size_t sentences = 20;
+  size_t sentences = 500;
  size_t maxWords = 40;

+  // Set to true to see debug output. Each stage prints a header line followed
+  // by one line per sentence, so stages can be split apart and compared. The
+  // ideal way to inspect what is going wrong is to redirect the output, split
+  // it into stages of (sentences + 1) lines each, and check the diff.
+  bool debug{false};
+
  std::mt19937 randomIntGen_;
  randomIntGen_.seed(42);

-  AnnotatedText testAnnotation;
-  std::vector<std::vector<ByteRange>> sentenceWords;
-  std::vector<ByteRange> Words;
+  AnnotatedText testAnnotation; // This is the container we add to through the
+                                // API and check that access is correct.

+  // External book-keeping so we have ground truths. Each element represents a
+  // sentence.
+
+  // word byte ranges - for testAnnotation.word(sId, wId)
+  std::vector<std::vector<ByteRange>> groundTruthWords;
+  // sentence byte ranges - for testAnnotation.sentence(sId)
+  std::vector<ByteRange> groundTruthSentences;
+
+  // Prepare the text and construct ByteRanges as intended for sentences and
+  // words. The ByteRanges we construct here are expected to be the
+  // ground-truths for words and sentences. The string being constructed looks
+  // as follows:
+  //
+  //     0-0 0-1 0-2 0-3
+  //     1-0 1-1 1-2 1-3 1-4
+  //     2-0 2-1
+  //
+  //     4-0 4-1 4-2 4-3
+  //
+  // Words are separated by space units.
+  //
+  // Below, we accumulate the text with intended structure as above, and
+  // ground-truth tables populated to be aware of the ByteRanges where they are
+  // meant to be.
+  if (debug) {
+    std::cout << "Preparing text and ground truth-tables" << std::endl;
+  }
  for (size_t idx = 0; idx < sentences; idx++) {
    if (idx != 0)
      testAnnotation.text += "\n";

-    Words.clear();
-    size_t words = randomIntGen_() % maxWords + 1;
-    Words.reserve(words);
-    for (size_t idw = 0; idw < words; idw++) {
-      size_t before = testAnnotation.text.size();
+    // Words can be zero, we need to support empty word sentences as well.
+    size_t numWords = randomIntGen_() % maxWords;
+
+    std::vector<ByteRange> wordByteRanges;
+    wordByteRanges.reserve(numWords);
+
+    // An empty sentence is expected as an empty range anchored at the tail of
+    // the text so far. NOTE(review): size() - 1 wraps if text is still empty.
+    size_t before = testAnnotation.text.size() - 1;
+    size_t sentenceBegin{before}, sentenceEnd{before};
+
+    for (size_t idw = 0; idw < numWords; idw++) {
+      if (idw != 0) {
+        testAnnotation.text += " ";
+        if (debug) {
+          std::cout << " ";
+        }
+      }
+
+      // Get new beginning, accounting for space above.
+      before = testAnnotation.text.size();
+
+      // Add the word
      std::string word = std::to_string(idx) + "-" + std::to_string(idw);
      testAnnotation.text += word;
-      if (idw != 0)
-        testAnnotation.text += " ";
-      Words.push_back((ByteRange){before, before + word.size() - 1});
-    }
-    // std::cout << std::endl;

-    sentenceWords.push_back(Words);
+      // The word spans [before, before + word.size()).
+      wordByteRanges.push_back((ByteRange){before, before + word.size()});
+
+      if (debug) {
+        std::cout << word;
+      }
+
+      if (idw == 0) {
+        sentenceBegin = before;
+      }
+      if (idw == numWords - 1) {
+        sentenceEnd = before + word.size();
+      }
+    }
+    if (debug) {
+      std::cout << std::endl;
+    }
+
+    groundTruthWords.push_back(wordByteRanges);
+    groundTruthSentences.push_back((ByteRange){sentenceBegin, sentenceEnd});
  }

-  // std::cout << "Inserting words:" << std::endl;
-  std::vector<std::vector<marian::string_view>> byteRanges;
-  for (auto &sentence : sentenceWords) {
+  // We prepare string_views now with the known ByteRanges and use the
+  // string_view based AnnotatedText.addSentence(...) API to add sentences to
+  // transparently convert from string_views to ByteRanges, rebasing/working out
+  // the math underneath.
+
+  if (debug) {
+    std::cout << "Inserting words onto container and save ground-truth-table:"
+              << std::endl;
+  }
+
+  std::vector<std::vector<marian::string_view>> wordStringViews;
+  for (auto &sentence : groundTruthWords) {
    std::vector<marian::string_view> wordByteRanges;
+    bool first{true};
    for (auto &word : sentence) {
      marian::string_view wordView(&testAnnotation.text[word.begin],
-                                   word.end - word.begin);
+                                   word.size());
      wordByteRanges.push_back(wordView);
-      // std::cout << std::string(wordView) << " ";
+      if (debug) {
+        if (first) {
+          first = false;
+        } else {
+          std::cout << " ";
+        }
+        std::cout << std::string(wordView);
+      }
    }
    testAnnotation.addSentence(wordByteRanges);
-    byteRanges.push_back(wordByteRanges);
-    // std::cout << std::endl;
+    wordStringViews.push_back(wordByteRanges);
+    if (debug) {
+      std::cout << std::endl;
+    }
  }

-  // std::cout << "From container: " << std::endl;
-  for (int idx = 0; idx < sentenceWords.size(); idx++) {
-    for (int idw = 0; idw < sentenceWords[idx].size(); idw++) {
-      ByteRange expected = sentenceWords[idx][idw];
+  if (debug) {
+    std::cout
+        << "Inserting sentences onto container and save ground-truth-table"
+        << std::endl;
+  }
+  std::vector<marian::string_view> sentenceStringViews;
+  for (auto &sentenceByteRange : groundTruthSentences) {
+    char *data = &(testAnnotation.text[sentenceByteRange.begin]);
+    marian::string_view sentenceView(data, sentenceByteRange.size());
+    sentenceStringViews.push_back(sentenceView);
+
+    if (debug) {
+      std::cout << sentenceView << std::endl;
+    }
+  }
+
+  // Access from the sentence(sentenceIdx) API and confirm that the ground truth
+  // we expect is same as what comes out of the container.
+  if (debug) {
+    std::cout << "From container: Sentences" << std::endl;
+  }
+  for (int idx = 0; idx < groundTruthSentences.size(); idx++) {
+    ByteRange expected = groundTruthSentences[idx];
+    ByteRange obtained = testAnnotation.sentenceAsByteRange(idx);
+    if (debug) {
+      std::cout << std::string(testAnnotation.sentence(idx)) << std::endl;
+    }
+    CHECK(expected.begin == obtained.begin);
+    CHECK(expected.end == obtained.end);
+    std::string expected_string = std::string(sentenceStringViews[idx]);
+    std::string obtained_string = std::string(testAnnotation.sentence(idx));
+    CHECK(expected_string == obtained_string);
+  }
+
+  /// Access the word(sentenceIdx, wordIdx) API and confirm what we hold as
+  /// expected words are the same as those obtained from the container.
+  if (debug) {
+    std::cout << "From container: Words" << std::endl;
+  }
+
+  CHECK(groundTruthWords.size() == testAnnotation.numSentences());
+  for (int idx = 0; idx < groundTruthWords.size(); idx++) {
+    CHECK(groundTruthWords[idx].size() == testAnnotation.numWords(idx));
+  }
+
+  for (int idx = 0; idx < groundTruthWords.size(); idx++) {
+    for (int idw = 0; idw < groundTruthWords[idx].size(); idw++) {
+      ByteRange expected = groundTruthWords[idx][idw];
      ByteRange obtained = testAnnotation.wordAsByteRange(idx, idw);
-      // std::cout << std::string(testAnnotation.word(idx, idw)) << " ";
+      if (debug) {
+        std::cout << std::string(testAnnotation.word(idx, idw)) << " ";
+      }
      CHECK(expected.begin == obtained.begin);
      CHECK(expected.end == obtained.end);

-      std::string expected_string = std::string(byteRanges[idx][idw]);
-      CHECK(expected_string == std::string(testAnnotation.word(idx, idw)));
+      std::string expected_string = std::string(wordStringViews[idx][idw]);
+      std::string obtained_string = std::string(testAnnotation.word(idx, idw));
+      CHECK(expected_string == obtained_string);
+    }
+    if (debug) {
+      std::cout << std::endl;
    }
-    // std::cout << std::endl;
  }
+
+  // Try inserting an empty Sentence. This is ensuring we check for empty
+  // Sentence if the random test above does not cover it for some reason.
+  int emptySentenceIdx = sentences;
+  std::vector<marian::string_view> emptySentence;
+  testAnnotation.addSentence(emptySentence);
+
+  // There are no words.
+  CHECK(testAnnotation.numWords(emptySentenceIdx) == 0);
+
+  // Empty sentence expected at output.
+  std::string expectedEmptyString = "";
+  marian::string_view emptyView = testAnnotation.sentence(emptySentenceIdx);
+  std::string obtainedString = std::string(emptyView.data(), emptyView.size());
+  CHECK(expectedEmptyString == obtainedString);
 }
--- a/src/translator/sentence_ranges.cpp
+++ b/src/translator/sentence_ranges.cpp
@ -6,48 +6,44 @@ namespace marian {
 namespace bergamot {

 void Annotation::addSentence(std::vector<ByteRange> &sentence) {
-  size_t size = flatByteRanges_.size();
  flatByteRanges_.insert(std::end(flatByteRanges_), std::begin(sentence),
                         std::end(sentence));
-  sentenceBeginIds_.push_back(size);
+  size_t size = flatByteRanges_.size();
+  sentenceEndIds_.push_back(size);
 }

 size_t Annotation::numWords(size_t sentenceIdx) const {
-  auto terminals = sentenceTerminalIds(sentenceIdx);
-  return terminals.second - terminals.first + 1;
-}
-
-std::pair<size_t, size_t>
-Annotation::sentenceTerminalIds(size_t sentenceIdx) const {
  size_t bosId, eosId;
-  bosId = sentenceBeginIds_[sentenceIdx];
-  eosId = sentenceIdx + 1 < numSentences()
-              ? sentenceBeginIds_[sentenceIdx + 1] - 1
-              : flatByteRanges_.size() - 1;
-
-  // Out of bound checks.
-  assert(bosId < flatByteRanges_.size());
-  assert(eosId < flatByteRanges_.size());
-  return std::make_pair(bosId, eosId);
-}
-
-std::pair<ByteRange, ByteRange>
-Annotation::sentenceTerminals(size_t sentenceIdx) const {
-  auto terminals = sentenceTerminalIds(sentenceIdx);
-  return std::make_pair(flatByteRanges_[terminals.first],
-                        flatByteRanges_[terminals.second]);
+  bosId = sentenceEndIds_[sentenceIdx]; // Half-open: previous end is our begin.
+  eosId = sentenceEndIds_[sentenceIdx + 1];
+  // Difference between eosId and bosId is the number of words.
+  return eosId - bosId;
 }

 ByteRange Annotation::sentence(size_t sentenceIdx) const {
-  auto terminals = sentenceTerminals(sentenceIdx);
-  return (ByteRange){terminals.first.begin, terminals.second.end};
+  size_t bosId, eosId;
+  bosId = sentenceEndIds_[sentenceIdx]; // Half-open: previous end is our begin.
+  eosId = sentenceEndIds_[sentenceIdx + 1];
+  ByteRange sentenceByteRange;
+
+  if (bosId == eosId) {
+    // Empty sentence: return an empty ByteRange placed at the end of the last
+    // word stored before it, so callers can still locate where it occurred.
+    // NOTE(review): if no word precedes it (eosId == 0), eosId - 1 wraps
+    // around and the lookup below is out of bounds.
+    ByteRange eos = flatByteRanges_[eosId - 1];
+    sentenceByteRange = (ByteRange){eos.end, eos.end};
+  } else {
+    ByteRange bos = flatByteRanges_[bosId];
+    ByteRange eos = flatByteRanges_[eosId - 1];
+    sentenceByteRange = (ByteRange){bos.begin, eos.end};
+  }
+  return sentenceByteRange;
 }

 ByteRange Annotation::word(size_t sentenceIdx, size_t wordIdx) const {
-  size_t offset = sentenceBeginIds_[sentenceIdx];
-  // auto terminals = sentenceTerminals(sentenceIdx);
-  // assert(offset + wordIdx <= terminals.second);
-  return flatByteRanges_[offset + wordIdx];
+  size_t bosOffset = sentenceEndIds_[sentenceIdx];
+  return flatByteRanges_[bosOffset + wordIdx];
 }

 string_view AnnotatedText::word(size_t sentenceIdx, size_t wordIdx) const {
--- a/src/translator/sentence_ranges.h
+++ b/src/translator/sentence_ranges.h
@ -19,51 +19,82 @@ struct ByteRange {

 /// An Annotation is a collection of ByteRanges used to denote ancillary
 /// information of sentences and words on a text of string. Annotation is meant
-/// for consumption on platforms where string_view creates problems (eg: exports
-/// through WASM). See AnnotatedText for cases where this is a non-issue.
+/// for consumption on platforms where `string_view` creates problems (eg:
+/// exports through WASM) conveniently rebasing them as required into
+/// ByteRanges. See AnnotatedText for cases where this is a non-issue.
+///
+/// **Usage**
+///
+/// To ensure rebasing is consistent during creation and updates, use
+/// `Annotation` best through `AnnotatedText`, which also holds the reference
+/// string and can work with `string_views`.
+///
+/// If used separately, it is on the user to ensure the reference string
+/// is the same as what the Annotation refers to. For best results, an instance
+/// is expected to be read only in this mode of operation.
+///
+/// **Idea**
+///
+/// Annotation is intended to be the same structure conceptually as below,
+/// except the `std::vector<std::vector<ByteRange>>` hammered into a flat
+/// structure to avoid multiple reallocs keeping efficiency in mind. This is
+/// achieved by having markers of where sentence ends in the flat container
+/// storing word ByteRanges.
+///
+/// ```cpp
+/// typedef ByteRange Word;
+/// // std::vector<ByteRange>, a single sentence
+/// typedef std::vector<Word> Sentence;
+/// // std::vector<std::vector<ByteRange>>, multiple sentences
+/// typedef std::vector<Sentence> Annotation;
+///
+/// Annotation example;
+/// ```
+/// This structure exists to provide a consistent API to access the nested
+/// sentences of varying lengths, which occur in source-text processed into
+/// multiple sentences, and target-text translated from source as multiple
+/// sentences, both composed of (sub)-words, providing a List[List] like access
+/// while storing it in a compact and efficient manner.
 class Annotation {
 public:
-  /// Annotation is constructed empty. See addSentence to populate it with
+  /// Annotation is constructed empty. See `addSentence()` to populate it with
  /// annotations.
-  Annotation() {}
+  Annotation() {
+    // The -1-th sentence ends at 0.
+    sentenceEndIds_.push_back(0);
+  }

  /// Returns the number of sentences annotated in a text.
-  size_t numSentences() const { return sentenceBeginIds_.size(); }
+  size_t numSentences() const { return sentenceEndIds_.size() - 1; }

-  /// Returns number of words in the sentece identified by sentenceIdx.
+  /// Returns number of words in the sentence identified by `sentenceIdx`.
  size_t numWords(size_t sentenceIdx) const;

-  /// Adds a sentences from vector<ByteRange> representation, internally doing
+  /// Adds a sentence from `vector<ByteRange>` representation, internally doing
  /// extra book-keeping for the sentence terminal markings. Sentences are
  /// expected to be added in order as they occur in text.
  void addSentence(std::vector<ByteRange> &sentence);

-  /// Returns a ByteRange representing wordIdx in sentenceIdx
+  /// Returns a ByteRange representing `wordIdx` in sentence indexed by
+  /// `sentenceIdx`. `wordIdx` follows 0-based indexing, and should be less than
+  /// `.numWords()` for `sentenceIdx` for defined behaviour.
  ByteRange word(size_t sentenceIdx, size_t wordIdx) const;

-  /// Returns a ByteRange representing sentence corresponding to sentenceIdx.
+  /// Returns a ByteRange representing sentence corresponding to `sentenceIdx`.
+  /// `sentenceIdx` follows 0-based indexing, and behaviour is defined only when
+  /// less than `.numSentences()`.
  ByteRange sentence(size_t sentenceIdx) const;

 private:
  /// A flat storage for ByteRanges. Composed of word ByteRanges, extra
-  /// information in sentenceBeginIds_ to denote sentence boundary markers as
+  /// information in sentenceEndIds_ to denote sentence boundary markers as
  /// indices.
  std::vector<ByteRange> flatByteRanges_;

-  /// Stores indices where sentences begin
-  std::vector<size_t> sentenceBeginIds_;
-
-  /// Returns ByteRanges corresponding to beginning and end words of sentence
-  /// corresponding to sentenceIdx. This is useful in using the information to
-  /// construct a ByteRange of a sentence taking the begin from the first and
-  /// end from the second.
-  std::pair<ByteRange, ByteRange> sentenceTerminals(size_t sentenceIdx) const;
-
-  /// Returns indices of terminal (word) ByteRanges in sentenceIds_ of a
-  /// sentence corresponding to sentenceIdx. The distance can be used to compute
-  /// number of words in a sentence (numWords) and also to construct the
-  /// terminal ByteRanges (sentenceTerminals).
-  std::pair<size_t, size_t> sentenceTerminalIds(size_t sentenceIdx) const;
+  /// Stores indices onto flatByteRanges_ of where sentences end (not inclusive,
+  /// aligned with C++ half interval notions). There is a 0 marker to simplify
+  /// sources, indicating where the -1-th sentence ends.
+  std::vector<size_t> sentenceEndIds_;
 };

 /// AnnotatedText is effectively std::string text + Annotation, providing the