Bug 1728708 Part 4 - Simplify WordBreaker::Next() and make it recognize the end of text as a word break opportunity. r=jfkthame

A UAX29-compatible word breaker (like ICU4C) treats the end of text as a word break opportunity (rule WB2 [1]), but currently the lwbrk word breaker doesn't. The motivation of this patch is to make `WordBreaker::Next()` closer to a UAX29-compatible one (at least for English text), and to see if the callers need to change. This should make the future integration of the ICU4X segmenter easier. The only caller of WordBreaker::Next() is ClusterIterator's constructor. This patch shouldn't change its behavior because we've already manually assigned a word break point at the end of the line when `aContext` is empty and `aDirection` is -1. This patch generalizes it to all conditions. Also, update TestPrintWordWithBreak() so that the result string makes more sense. [1] https://www.unicode.org/reports/tr29/#WB2 Differential Revision: https://phabricator.services.mozilla.com/D124304
2021-09-08 04:19:38 +00:00 · 2021-09-08 04:19:38 +00:00 · 57b867e7ff
--- a/intl/lwbrk/WordBreaker.cpp
+++ b/intl/lwbrk/WordBreaker.cpp
@ -181,39 +181,37 @@ WordRange WordBreaker::FindWord(const char16_t* aText, uint32_t aTextLen,
 }

 int32_t WordBreaker::Next(const char16_t* aText, uint32_t aLen, uint32_t aPos) {
-  WordBreakClass c1, c2;
-  uint32_t cur = aPos;
-  if (cur == aLen) {
+  MOZ_ASSERT(aText);
+
+  if (aPos >= aLen) {
    return NS_WORDBREAKER_NEED_MORE_TEXT;
  }
-  c1 = GetClass(aText[cur]);

-  for (cur++; cur < aLen; cur++) {
-    c2 = GetClass(aText[cur]);
-    if (c2 != c1) {
+  const WordBreakClass posClass = GetClass(aText[aPos]);
+  uint32_t nextBreakPos;
+  for (nextBreakPos = aPos + 1; nextBreakPos < aLen; ++nextBreakPos) {
+    if (posClass != GetClass(aText[nextBreakPos])) {
      break;
    }
  }

-  if (kWbClassScriptioContinua == c1) {
-    // we pass the whole text segment to the complex word breaker to find a
-    // shorter answer
+  if (kWbClassScriptioContinua == posClass) {
+    // Hand only the same-class run [aPos, nextBreakPos) to the complex word
+    // breaker to find a shorter answer; never read aText[nextBreakPos].
+    const char16_t* segStart = aText + aPos;
+    const uint32_t segLen = nextBreakPos - aPos;
    AutoTArray<uint8_t, 256> breakBefore;
-    breakBefore.SetLength(aLen - aPos);
-    NS_GetComplexLineBreaks(aText + aPos, aLen - aPos, breakBefore.Elements());
-    uint32_t i = 1;
-    while (i < cur - aPos && !breakBefore[i]) {
-      i++;
-    }
-    if (i < cur - aPos) {
-      return aPos + i;
+    breakBefore.SetLength(segLen);
+    NS_GetComplexLineBreaks(segStart, segLen, breakBefore.Elements());
+
+    for (uint32_t i = aPos + 1; i < nextBreakPos; ++i) {
+      if (breakBefore[i - aPos]) {
+        nextBreakPos = i;
+        break;
+      }
    }
  }

-  if (cur == aLen) {
-    return NS_WORDBREAKER_NEED_MORE_TEXT;
-  }
-
-  MOZ_ASSERT(cur != aPos);
-  return cur;
+  MOZ_ASSERT(nextBreakPos != aPos);
+  return nextBreakPos;
 }
--- a/intl/lwbrk/WordBreaker.h
+++ b/intl/lwbrk/WordBreaker.h
@ -28,6 +28,12 @@ class WordBreaker {
                      const char16_t* aText2, uint32_t aTextLen2);
  WordRange FindWord(const char16_t* aText1, uint32_t aTextLen1,
                     uint32_t aOffset);
+
+  // Find the next word break opportunity starting from aPos + 1. It can return
+  // aLen if there's no break opportunity between [aPos + 1, aLen - 1].
+  //
+  // If aPos is already at the end of aText or beyond, i.e. aPos >= aLen, return
+  // NS_WORDBREAKER_NEED_MORE_TEXT.
  int32_t Next(const char16_t* aText, uint32_t aLen, uint32_t aPos);

 private:
--- a/intl/lwbrk/gtest/TestBreak.cpp
+++ b/intl/lwbrk/gtest/TestBreak.cpp
@ -19,6 +19,17 @@ using mozilla::ArrayLength;

 // Turn off clang-format to align the ruler comments to the test strings.

+// clang-format off
+static char teng0[] =
+  //           1         2         3         4         5         6         7
+  // 01234567890123456789012345678901234567890123456789012345678901234567890123456789
+    "hello world";
+// clang-format on
+
+static uint32_t lexp0[] = {5, 11};
+
+static uint32_t wexp0[] = {5, 6, 11};
+
 // clang-format off
 static char teng1[] =
  //           1         2         3         4         5         6         7
@ -29,9 +40,9 @@ static char teng1[] =
 static uint32_t lexp1[] = {4,  7,  9,  14, 17, 34, 39, 40, 41,
                           42, 49, 54, 62, 64, 67, 69, 73};

-static uint32_t wexp1[] = {4,  5,  7,  8,  9,  10, 14, 15, 17, 18, 22,
-                           23, 33, 34, 35, 39, 43, 48, 49, 50, 54, 55,
-                           56, 57, 62, 63, 64, 65, 67, 68, 69, 70, 72};
+static uint32_t wexp1[] = {4,  5,  7,  8,  9,  10, 14, 15, 17, 18, 22, 23,
+                           33, 34, 35, 39, 43, 48, 49, 50, 54, 55, 56, 57,
+                           62, 63, 64, 65, 67, 68, 69, 70, 72, 73};

 // clang-format off
 static char teng2[] =
@ -43,7 +54,7 @@ static char teng2[] =
 static uint32_t lexp2[] = {17, 22, 23, 30, 44};

 static uint32_t wexp2[] = {4,  12, 13, 14, 15, 16, 17, 18, 22,
-                           24, 29, 30, 31, 32, 37, 38, 43};
+                           24, 29, 30, 31, 32, 37, 38, 43, 44};

 // clang-format off
 static char teng3[] =
@ -55,7 +66,7 @@ static char teng3[] =
 static uint32_t lexp3[] = {4, 6, 11, 14, 25, 27, 32, 42};

 static uint32_t wexp3[] = {2,  3,  4,  5,  6,  7,  11, 12, 14, 15,
-                           19, 20, 25, 26, 27, 28, 32, 33, 38};
+                           19, 20, 25, 26, 27, 28, 32, 33, 38, 42};

 static char ruler1[] =
    "          1         2         3         4         5         6         7  ";
@ -141,6 +152,7 @@ TEST(LineBreak, LineBreaker)

  ASSERT_TRUE(t);

+  ASSERT_TRUE(TestASCIILB(t, teng0, lexp0, ArrayLength(lexp0)));
  ASSERT_TRUE(TestASCIILB(t, teng1, lexp1, ArrayLength(lexp1)));
  ASSERT_TRUE(TestASCIILB(t, teng2, lexp2, ArrayLength(lexp2)));
  ASSERT_TRUE(TestASCIILB(t, teng3, lexp3, ArrayLength(lexp3)));
@ -151,6 +163,7 @@ TEST(LineBreak, WordBreaker)
  RefPtr<mozilla::intl::WordBreaker> t = mozilla::intl::WordBreaker::Create();
  ASSERT_TRUE(t);

+  ASSERT_TRUE(TestASCIIWB(t, teng0, wexp0, ArrayLength(wexp0)));
  ASSERT_TRUE(TestASCIIWB(t, teng1, wexp1, ArrayLength(wexp1)));
  ASSERT_TRUE(TestASCIIWB(t, teng2, wexp2, ArrayLength(wexp2)));
  ASSERT_TRUE(TestASCIIWB(t, teng3, wexp3, ArrayLength(wexp3)));
@ -169,6 +182,8 @@ void TestPrintWordWithBreak() {
  uint32_t numOfFragment = sizeof(wb) / sizeof(char*);
  RefPtr<mozilla::intl::WordBreaker> wbk = mozilla::intl::WordBreaker::Create();

+  // This test generates the result string by appending '^' at every word
+  // break opportunity except the one at the end of the text.
  nsAutoString result;

  for (uint32_t i = 0; i < numOfFragment; i++) {
@ -179,13 +194,17 @@ void TestPrintWordWithBreak() {
    uint32_t start = 0;
    while (cur != NS_WORDBREAKER_NEED_MORE_TEXT) {
      result.Append(Substring(fragText, start, cur - start));
-      result.Append('^');
+
+      // Append '^' only if cur is within the fragText. We'll check the word
+      // break opportunity between fragText and nextFragText using
+      // BreakInBetween() below.
+      if (cur < static_cast<int32_t>(fragText.Length())) {
+        result.Append('^');
+      }
      start = (cur >= 0 ? cur : cur - start);
      cur = wbk->Next(fragText.get(), fragText.Length(), cur);
    }

-    result.Append(Substring(fragText, fragText.Length() - start));
-
    if (i != numOfFragment - 1) {
      NS_ConvertASCIItoUTF16 nextFragText(wb[i + 1]);

@ -198,7 +217,7 @@ void TestPrintWordWithBreak() {
      fragText.Assign(nextFragText);
    }
  }
-  ASSERT_STREQ("is^   ^is^ ^a^ ^  is a intzation^ ^work^ation work.",
+  ASSERT_STREQ("This^   ^is^ ^a^ ^internationalization^ ^work^.",
               NS_ConvertUTF16toUTF8(result).get());
 }

@ -276,6 +295,13 @@ void TestNextWordBreakWithComplexLanguage() {
  ASSERT_TRUE(true);
 }

+void TestNextWordBreakWithEmptyString() {
+  RefPtr<mozilla::intl::WordBreaker> wbk = mozilla::intl::WordBreaker::Create();
+  const char16_t empty[] = u"";  // A zero-length array is ill-formed C++.
+  ASSERT_EQ(NS_WORDBREAKER_NEED_MORE_TEXT, wbk->Next(empty, 0, 0));
+  ASSERT_EQ(NS_WORDBREAKER_NEED_MORE_TEXT, wbk->Next(empty, 0, 1));
+}
+
 TEST(LineBreak, WordBreakUsage)
 {
  TestPrintWordWithBreak();
--- a/layout/generic/nsTextFrame.cpp
+++ b/layout/generic/nsTextFrame.cpp
@ -8120,10 +8120,11 @@ ClusterIterator::ClusterIterator(nsTextFrame* aTextFrame, int32_t aPosition,

  int32_t textOffset = aTextFrame->GetContentOffset();
  int32_t textLen = aTextFrame->GetContentLength();
-  // XXX(Bug 1631371) Check if this should use a fallible operation as it
-  // pretended earlier.
+
+  // Allocate an extra element to record the word break at the end of the line
+  // or text run in mWordBreaks[textLen].
  mWordBreaks.AppendElements(textLen + 1);
-  memset(mWordBreaks.Elements(), false, (textLen + 1) * sizeof(bool));
+  PodZero(mWordBreaks.Elements(), textLen + 1);
  int32_t textStart;
  if (aDirection > 0) {
    if (aContext.IsEmpty()) {
@ -8152,6 +8153,10 @@ ClusterIterator::ClusterIterator(nsTextFrame* aTextFrame, int32_t aPosition,
    }
    mWordBreaks[nextWord - textStart] = true;
  }
+
+  MOZ_ASSERT(
+      textStart + textLen != int32_t(aContext.Length()) || mWordBreaks[textLen],
+      "There should be a word break at the end of a line or text run!");
 }

 nsIFrame::FrameSearchResult nsTextFrame::PeekOffsetWord(