From 3b3a1815ead706acf4ef883afd2f514ba53d4c57 Mon Sep 17 00:00:00 2001
From: James Teh <jteh@mozilla.com>
Date: Tue, 4 Jun 2024 20:36:42 +0000
Subject: [PATCH] Bug 855184 part 1: Add BOUNDARY_CLUSTER so a11y can query
 grapheme clusters, AKA user-perceived characters. r=eeejay

Most OS APIs want a cluster when they ask for a "character", except ATK.
Rather than altering BOUNDARY_CHAR, I added a new BOUNDARY_CLUSTER.
Besides being less risky and causing less churn, adding a new constant keeps BOUNDARY_CHAR available for internal cases where we want to move a TextLeafPoint by character; e.g. to explicitly move to the next/previous Accessible or to move to the next/previous character in an abstract way without worrying about Accessible boundaries.
Calculating clusters is more expensive, so it doesn't make sense to move by cluster in those cases.

Differential Revision: https://phabricator.services.mozilla.com/D212517
---
 accessible/base/TextLeafRange.cpp             | 59 +++++++++++++++++++
 accessible/base/TextLeafRange.h               |  3 +
 accessible/interfaces/nsIAccessibleText.idl   |  6 ++
 accessible/tests/browser/text/browser_text.js | 28 +++++++++
 accessible/tests/mochitest/text.js            |  1 +
 5 files changed, 97 insertions(+)

diff --git a/accessible/base/TextLeafRange.cpp b/accessible/base/TextLeafRange.cpp
index 41d84a770c74..a1cbbd763995 100644
--- a/accessible/base/TextLeafRange.cpp
+++ b/accessible/base/TextLeafRange.cpp
@@ -1126,6 +1126,9 @@ TextLeafPoint TextLeafPoint::FindBoundary(AccessibleTextBoundary aBoundaryType,
         boundary = searchFrom.FindParagraphSameAcc(aDirection, includeOrigin,
                                                    ignoreListItemMarker);
         break;
+      case nsIAccessibleText::BOUNDARY_CLUSTER:
+        boundary = searchFrom.FindClusterSameAcc(aDirection, includeOrigin);
+        break;
       default:
         MOZ_ASSERT_UNREACHABLE();
         break;
@@ -1369,6 +1372,62 @@ TextLeafPoint TextLeafPoint::FindParagraphSameAcc(
   return TextLeafPoint();
 }
 
+TextLeafPoint TextLeafPoint::FindClusterSameAcc(nsDirection aDirection,
+                                                bool aIncludeOrigin) const {
+  // Finds a grapheme cluster boundary within mAcc only. Clusters which cross
+  // nodes are unsupported; editor doesn't seem to fully support them either.
+  if (aIncludeOrigin && mOffset == 0) {
+    // Since we don't cross nodes, offset 0 always begins a cluster.
+    return *this;
+  }
+  if (aDirection == eDirPrevious) {
+    if (mOffset == 0) {
+      // We can't go back any further.
+      return TextLeafPoint();
+    }
+    if (!aIncludeOrigin && mOffset == 1) {
+      // Since we don't cross nodes, offset 0 always begins a cluster. We can't
+      // take this fast path if aIncludeOrigin is true because offset 1 might
+      // start a cluster, but we don't know that yet.
+      return TextLeafPoint(mAcc, 0);
+    }
+  }
+  nsAutoString text;
+  mAcc->AppendTextTo(text);
+  if (text.IsEmpty()) {
+    return TextLeafPoint();  // No text, so no cluster boundary to find.
+  }
+  if (aDirection == eDirNext &&
+      mOffset == static_cast<int32_t>(text.Length())) {
+    return TextLeafPoint();  // Already at the end of the text.
+  }
+  // There is GraphemeClusterBreakReverseIteratorUtf16, but it "doesn't
+  // handle conjoining Jamo and emoji". Therefore, we must use
+  // GraphemeClusterBreakIteratorUtf16 even when moving backward.
+  // GraphemeClusterBreakIteratorUtf16::Seek() always starts from the beginning
+  // and repeatedly calls Next(), regardless of the seek offset. The best we
+  // can do is call Next() until we find the offset we need.
+  intl::GraphemeClusterBreakIteratorUtf16 iter(text);
+  // Since we don't cross nodes, offset 0 always begins a cluster.
+  int32_t prevCluster = 0;
+  while (Maybe<uint32_t> next = iter.Next()) {
+    int32_t cluster = static_cast<int32_t>(*next);
+    if (aIncludeOrigin && cluster == mOffset) {
+      return *this;
+    }
+    if (aDirection == eDirPrevious) {
+      if (cluster >= mOffset) {
+        return TextLeafPoint(mAcc, prevCluster);
+      }
+      prevCluster = cluster;
+    } else if (cluster > mOffset) {
+      MOZ_ASSERT(aDirection == eDirNext);
+      return TextLeafPoint(mAcc, cluster);
+    }
+  }
+  return TextLeafPoint();  // No boundary found in this Accessible.
+}
+
 bool TextLeafPoint::IsInSpellingError() const {
   if (LocalAccessible* acc = mAcc->AsLocal()) {
     auto domRanges = FindDOMSpellingErrors(acc, mOffset, mOffset + 1);
diff --git a/accessible/base/TextLeafRange.h b/accessible/base/TextLeafRange.h
index 23fea2ecfba2..1df0693eadc3 100644
--- a/accessible/base/TextLeafRange.h
+++ b/accessible/base/TextLeafRange.h
@@ -228,6 +228,9 @@ class TextLeafPoint final {
                                      bool aIncludeOrigin,
                                      bool aIgnoreListItemMarker = false) const;
 
+  TextLeafPoint FindClusterSameAcc(nsDirection aDirection,
+                                   bool aIncludeOrigin) const;
+
   bool IsInSpellingError() const;
 
   /**
diff --git a/accessible/interfaces/nsIAccessibleText.idl b/accessible/interfaces/nsIAccessibleText.idl
index 5bd125c30467..1a7d19753747 100644
--- a/accessible/interfaces/nsIAccessibleText.idl
+++ b/accessible/interfaces/nsIAccessibleText.idl
@@ -22,6 +22,8 @@ interface nsIAccessibleText : nsISupports
   const int32_t TEXT_OFFSET_END_OF_TEXT = -1;
   const int32_t TEXT_OFFSET_CARET       = -2;
 
+  // A single Unicode character. For a user-perceived character, see
+  // BOUNDARY_CLUSTER.
   const AccessibleTextBoundary BOUNDARY_CHAR = 0;
   const AccessibleTextBoundary BOUNDARY_WORD_START = 1;
   const AccessibleTextBoundary BOUNDARY_WORD_END = 2;
@@ -30,6 +32,10 @@ interface nsIAccessibleText : nsISupports
   const AccessibleTextBoundary BOUNDARY_LINE_START = 5;
   const AccessibleTextBoundary BOUNDARY_LINE_END = 6;
   const AccessibleTextBoundary BOUNDARY_PARAGRAPH = 7;
+  // A grapheme cluster, AKA user-perceived character. This might consist of
+  // multiple Unicode characters, but a user will perceive this as a single
+  // character and it is treated as such by the caret, selection, etc.
+  const AccessibleTextBoundary BOUNDARY_CLUSTER = 8;
 
   /**
    * The current current caret offset.
diff --git a/accessible/tests/browser/text/browser_text.js b/accessible/tests/browser/text/browser_text.js
index ce1d19bc5d9f..8cc22b4aa44a 100644
--- a/accessible/tests/browser/text/browser_text.js
+++ b/accessible/tests/browser/text/browser_text.js
@@ -333,3 +333,31 @@ addAccessibleTask(
     remoteIframe: true,
   }
 );
+
+/**
+ * Test BOUNDARY_CLUSTER offsets, contrasting them with BOUNDARY_CHAR.
+ */
+addAccessibleTask(
+  `<p id="clusters">À2🤦‍♂️🤦🏼‍♂️5x͇͕̦̍͂͒7È</p>`,
+  async function testCluster(browser, docAcc) {
+    const clusters = findAccessibleChildByID(docAcc, "clusters");
+    testCharacterCount(clusters, 26);
+    testTextAtOffset(clusters, BOUNDARY_CLUSTER, [
+      [0, 1, "À", 0, 2],
+      [2, 2, "2", 2, 3],
+      [3, 7, "🤦‍♂️", 3, 8],
+      [8, 14, "🤦🏼‍♂️", 8, 15],
+      [15, 15, "5", 15, 16],
+      [16, 22, "x͇͕̦̍͂͒", 16, 23],
+      [23, 23, "7", 23, 24],
+      [24, 25, "È", 24, 26],
+      [26, 26, "", 26, 26],
+    ]);
+    // Ensure that BOUNDARY_CHAR returns single Unicode characters.
+    testTextAtOffset(clusters, BOUNDARY_CHAR, [
+      [0, 0, "A", 0, 1],
+      [1, 1, "̀", 1, 2],
+    ]);
+  },
+  { chrome: true, topLevel: true }
+);
diff --git a/accessible/tests/mochitest/text.js b/accessible/tests/mochitest/text.js
index 6fe2a00b83e6..5fad7d5ebbb5 100644
--- a/accessible/tests/mochitest/text.js
+++ b/accessible/tests/mochitest/text.js
@@ -9,6 +9,7 @@ const BOUNDARY_WORD_END = nsIAccessibleText.BOUNDARY_WORD_END;
 const BOUNDARY_LINE_START = nsIAccessibleText.BOUNDARY_LINE_START;
 const BOUNDARY_LINE_END = nsIAccessibleText.BOUNDARY_LINE_END;
 const BOUNDARY_PARAGRAPH = nsIAccessibleText.BOUNDARY_PARAGRAPH;
+const BOUNDARY_CLUSTER = nsIAccessibleText.BOUNDARY_CLUSTER;
 
 const kTextEndOffset = nsIAccessibleText.TEXT_OFFSET_END_OF_TEXT;
 const kCaretOffset = nsIAccessibleText.TEXT_OFFSET_CARET;