Bug 1649187 - Use a fallback table to strip diacritics from non-decomposable characters. r=jfkthame

Implement the design suggested at https://bugzilla.mozilla.org/show_bug.cgi?id=1652910#c5.

Differential Revision: https://phabricator.services.mozilla.com/D106674
2021-03-07 16:17:41 +00:00 · 2021-03-07 16:17:41 +00:00 · 0686831376
--- a/browser/components/extensions/test/xpcshell/test_ext_bookmarks.js
+++ b/browser/components/extensions/test/xpcshell/test_ext_bookmarks.js
@ -495,7 +495,7 @@ add_task(async function test_bookmarks() {
        // test bookmarks.search
        return Promise.all([
          browser.bookmarks.create({
-            title: "MØzillä",
+            title: "Μοζιλλας",
            url: "http://møzîllä.örg/",
          }),
          browser.bookmarks.create({
@ -567,7 +567,7 @@ add_task(async function test_bookmarks() {
          results[0].id,
          bookmarkGuids.unfiledGuid,
          0,
-          "MØzillä",
+          "Μοζιλλας",
          "http://xn--mzll-ooa1dud.xn--rg-eka/",
          results[0].dateAdded
        );
@ -908,7 +908,7 @@ add_task(async function test_bookmarks() {
        );

        // is case-insensitive for non-ascii
-        return browser.bookmarks.search("MøZILLÄ");
+        return browser.bookmarks.search("ΜοΖΙΛΛΑς");
      })
      .then(results => {
        browser.test.assertEq(
@ -917,7 +917,7 @@ add_task(async function test_bookmarks() {
          "Expected number of results returned for non-ascii search"
        );
        browser.test.assertEq(
-          "MØzillä",
+          "Μοζιλλας",
          results[0].title,
          "Bookmark has the expected title"
        );
--- a/intl/unicharutil/util/base_chars.py
+++ b/intl/unicharutil/util/base_chars.py
@ -0,0 +1,183 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+from __future__ import absolute_import, print_function
+
+import re
+from collections import namedtuple
+from unicodedata import category, combining, normalize
+
+UNICODE_LIMIT = 0x110000
+UNICODE_BMP_LIMIT = 0x10000
+
+UNICODE_COMBINING_CLASS_NOT_REORDERED = 0
+UNICODE_COMBINING_CLASS_KANA_VOICING = 8
+UNICODE_COMBINING_CLASS_VIRAMA = 9
+
+BaseCharMapping = namedtuple("BaseCharMapping", ("char", "base_char"))
+BaseCharMappingBlock = namedtuple("BaseCharMappingBlock", ("first", "last", "offset"))
+
+
+def is_in_bmp(char):
+    """Return True if `char` lies below UNICODE_BMP_LIMIT.
+
+    `char` is a one-character string; the comparison is made on its code
+    point as returned by ord().
+    """
+    codepoint = ord(char)
+    return codepoint < UNICODE_BMP_LIMIT
+
+
+# Keep this function in sync with IsCombiningDiacritic in nsUnicodeProperties.h.
+def is_combining_diacritic(char):
+    """Return True if `char` is a combining mark treated as a diacritic.
+
+    The decision depends only on the canonical combining class of `char`:
+    every class counts as a diacritic except the ones listed below.
+    """
+    # The bare numbers have no named constant in this file; they mirror the
+    # `cc != 91 && cc != 129 && cc != 130 && cc != 132` checks in the C++
+    # IsCombiningDiacritic.
+    non_diacritic_classes = (
+        UNICODE_COMBINING_CLASS_NOT_REORDERED,
+        UNICODE_COMBINING_CLASS_KANA_VOICING,
+        UNICODE_COMBINING_CLASS_VIRAMA,
+        91,
+        129,
+        130,
+        132,
+    )
+    combining_class = combining(char)
+    return combining_class not in non_diacritic_classes
+
+
+# Keep this function in sync with IsMathSymbol in nsUnicodeProperties.h.
+def is_math_symbol(char):
+    """Return True if the Unicode general category of `char` is "Sm"."""
+    general_category = category(char)
+    return general_category == "Sm"
+
+
+def crosses_bmp(char, base_char):
+    """Return True if the mapping `char` → `base_char` has to be skipped.
+
+    A mapping is skipped when exactly one of the two characters is in the
+    BMP, or when both are outside it. In the second case a warning is printed
+    as well. Only mappings that stay entirely inside the BMP return False.
+    """
+    char_in_bmp = is_in_bmp(char)
+    if char_in_bmp != is_in_bmp(base_char):
+        # Mappings that would change the length of a UTF-16 string are not
+        # currently supported.
+        return True
+    if char_in_bmp:
+        return False
+    # Currently there are no mappings we care about outside of the basic
+    # multilingual plane. However, if such a mapping is added to Unicode in
+    # the future, this warning will appear at build time.
+    print("Warning: Skipping {:#06x} → {:#06x}".format(ord(char), ord(base_char)))
+    print(
+        "base_chars.py and nsUnicodeProperties.cpp need to be rewritten to "
+        "use uint32_t instead of uint16_t."
+    )
+    return True
+
+
+def main(header, fallback_table):
+    """Generate the C lookup tables that map characters to base characters.
+
+    Mappings come from two sources: the first character of the NFD
+    decomposition of every BMP character whose second decomposed character is
+    a combining diacritic, and single-character replacements read from
+    `fallback_table`. Fallback entries override decomposition entries for the
+    same character.
+
+    Args:
+        header: Writable text file object that receives the generated
+            `BaseCharMappingBlock` struct and the `BASE_CHAR_MAPPING_LIST`,
+            `BASE_CHAR_MAPPING_BLOCKS` and `BASE_CHAR_MAPPING_BLOCK_INDEX`
+            arrays.
+        fallback_table: Path of a UTF-8 text file with lines of the form
+            "<char> → <replacement> ;" (moz.build passes ICU's
+            Latin_ASCII.txt). Lines that do not match are ignored.
+    """
+    mappings = {}
+
+    # Glean mappings from decompositions
+
+    for codepoint in range(UNICODE_BMP_LIMIT):
+        char = chr(codepoint)
+        if is_combining_diacritic(char) or is_math_symbol(char):
+            continue
+        decomposition = normalize("NFD", char)
+        if len(decomposition) < 2:
+            continue
+        base_char = decomposition[0]
+        if crosses_bmp(char, base_char):
+            continue
+        next_char = decomposition[1]
+        if not is_combining_diacritic(next_char):
+            # Hangul syllables decompose but do not actually have diacritics.
+            # This also excludes decompositions with the Japanese marks U+3099
+            # and U+309A (COMBINING KATAKANA-HIRAGANA [SEMI-]VOICED SOUND
+            # MARK), which we should not ignore for searching (bug 1624244).
+            continue
+        mappings[char] = base_char
+
+    # Add mappings from the ASCII fallback table
+
+    # The context manager closes the table as soon as parsing is done instead
+    # of leaving the file handle to the garbage collector.
+    with open(fallback_table, encoding="UTF-8") as table:
+        for line in table:
+            m = re.match("^(.) → (.+?) ;", line)
+            if not m:
+                continue
+            char = m.group(1)
+            decomposition = m.group(2)
+            # Strip the quoting ('x') and escaping (\x) around a replacement.
+            if len(decomposition) >= 3:
+                if decomposition.startswith("'") and decomposition.endswith("'"):
+                    decomposition = decomposition[1:-1]
+            if len(decomposition) >= 2:
+                if decomposition.startswith("\\"):
+                    decomposition = decomposition[1:]
+            if len(decomposition) > 1:
+                # Multi-character replacements cannot be stored in the table.
+                continue
+            if crosses_bmp(char, decomposition):
+                continue
+            mappings[char] = decomposition
+
+    # Organize mappings into contiguous blocks
+
+    # Each block covers the mapped range of one 256-code-point page (all
+    # entries share the same high byte). Gaps inside a page are filled with
+    # identity mappings so that a block can be indexed by `low byte - first`.
+    mappings = sorted([BaseCharMapping(ord(k), ord(v)) for k, v in mappings.items()])
+    blocks = []
+    i = 0
+    # Iterate up to and including the final entry: stopping at
+    # len(mappings) - 1 would drop a last mapping that sits alone in its page.
+    # The inner loops guard every mappings[i + 1] access themselves.
+    while i < len(mappings):
+        offset = i
+        first = mappings[i].char & 0xFF
+        while (
+            i < len(mappings) - 1 and mappings[i].char >> 8 == mappings[i + 1].char >> 8
+        ):
+            while (
+                i < len(mappings) - 1
+                and mappings[i].char >> 8 == mappings[i + 1].char >> 8
+                and mappings[i + 1].char - mappings[i].char > 1
+            ):
+                char = mappings[i].char + 1
+                mappings.insert(i + 1, BaseCharMapping(char, char))
+                i += 1
+            i += 1
+        last = mappings[i].char & 0xFF
+        blocks.append(BaseCharMappingBlock(first, last, offset))
+        i += 1
+
+    # Map every high byte (0x00-0xFF) to the index of its block. 255 marks a
+    # page without a block; GetNaked() tests for it as 0xff.
+    indexes = []
+    for i, block in enumerate(blocks):
+        while len(indexes) < mappings[block.offset].char >> 8:
+            indexes.append(255)
+        indexes.append(i)
+    while len(indexes) < 256:
+        indexes.append(255)
+
+    # Write the mappings to a C header file
+
+    header.write("struct BaseCharMappingBlock {\n")
+    header.write("  uint8_t mFirst;\n")
+    header.write("  uint8_t mLast;\n")
+    header.write("  uint16_t mMappingStartOffset;\n")
+    header.write("};\n")
+    header.write("\n")
+    header.write("static const uint16_t BASE_CHAR_MAPPING_LIST[] = {\n")
+    for char, base_char in mappings:
+        header.write(
+            "  /* {:#06x}".format(char) + " */ " + "{:#06x}".format(base_char) + ","
+        )
+        if char != base_char:
+            # Gap-filling identity entries get no explanatory comment.
+            header.write(" /* " + chr(char) + " → " + chr(base_char) + " */")
+        header.write("\n")
+    header.write("};\n")
+    header.write("\n")
+    header.write(
+        "static const struct BaseCharMappingBlock BASE_CHAR_MAPPING_BLOCKS[] = {\n"
+    )
+    for block in blocks:
+        header.write(
+            "  {"
+            + "{:#04x}".format(block.first)
+            + ", "
+            + "{:#04x}".format(block.last)
+            + ", "
+            + str(block.offset).rjust(4)
+            + "}, // "
+            + "{:#04x}".format(mappings[block.offset].char >> 8)
+            + "xx\n"
+        )
+    header.write("};\n")
+    header.write("\n")
+    header.write("static const uint8_t BASE_CHAR_MAPPING_BLOCK_INDEX[256] = {\n")
+    for i, index in enumerate(indexes):
+        header.write(
+            "  " + str(index).rjust(3) + ", // " + "{:#04x}".format(i) + "xx\n"
+        )
+    header.write("};\n")
--- a/intl/unicharutil/util/moz.build
+++ b/intl/unicharutil/util/moz.build
@ -25,4 +25,10 @@ UNIFIED_SOURCES += [
    "nsUnicodeProperties.cpp",
 ]

+GeneratedFile(
+    "BaseChars.h",
+    script="base_chars.py",
+    inputs=["../../icu/source/data/translit/Latin_ASCII.txt"],
+)
+
 FINAL_LIBRARY = "xul"
--- a/intl/unicharutil/util/nsUnicodeProperties.cpp
+++ b/intl/unicharutil/util/nsUnicodeProperties.cpp
@ -11,8 +11,7 @@
 #include "mozilla/HashTable.h"
 #include "nsCharTraits.h"

-#include "unicode/uchar.h"
-#include "unicode/unorm2.h"
+#include "BaseChars.h"

 #define UNICODE_BMP_LIMIT 0x10000
 #define UNICODE_LIMIT 0x110000
@ -310,76 +309,20 @@ uint32_t CountGraphemeClusters(const char16_t* aText, uint32_t aLength) {
 }

 uint32_t GetNaked(uint32_t aCh) {
-  using namespace mozilla;
-
-  static const UNormalizer2* normalizer;
-  static HashMap<uint32_t, uint32_t> nakedCharCache;
-
-  NS_ASSERTION(!IsCombiningDiacritic(aCh),
-               "This character needs to be skipped");
-
-  HashMap<uint32_t, uint32_t>::Ptr entry = nakedCharCache.lookup(aCh);
-  if (entry.found()) {
-    return entry->value();
-  }
-
-  UErrorCode error = U_ZERO_ERROR;
-  if (!normalizer) {
-    normalizer = unorm2_getNFDInstance(&error);
-    if (U_FAILURE(error)) {
-      return aCh;
-    }
-  }
-
-  static const size_t MAX_DECOMPOSITION_SIZE = 16;
-  UChar decomposition[MAX_DECOMPOSITION_SIZE];
-  UChar* combiners;
-  int32_t decompositionLen;
-  uint32_t baseChar, nextChar;
-  decompositionLen = unorm2_getDecomposition(normalizer, aCh, decomposition,
-                                             MAX_DECOMPOSITION_SIZE, &error);
-  if (decompositionLen < 1) {
-    // The character does not decompose.
+  MOZ_ASSERT(!IsCombiningDiacritic(aCh), "This character needs to be skipped");
+  if (!IS_IN_BMP(aCh)) {
    return aCh;
  }
-
-  if (NS_IS_HIGH_SURROGATE(decomposition[0])) {
-    baseChar = SURROGATE_TO_UCS4(decomposition[0], decomposition[1]);
-    combiners = decomposition + 2;
-  } else {
-    baseChar = decomposition[0];
-    combiners = decomposition + 1;
+  uint8_t index = BASE_CHAR_MAPPING_BLOCK_INDEX[aCh >> 8];
+  if (index == 0xff) {
+    return aCh;
  }
-
-  if (IS_IN_BMP(baseChar) != IS_IN_BMP(aCh)) {
-    // Mappings that would change the length of a UTF-16 string are not
-    // currently supported.
-    baseChar = aCh;
-    goto cache;
+  const BaseCharMappingBlock& block = BASE_CHAR_MAPPING_BLOCKS[index];
+  uint8_t lo = aCh & 0xff;
+  if (lo < block.mFirst || lo > block.mLast) {
+    return aCh;
  }
-
-  if (decompositionLen > 1) {
-    if (NS_IS_HIGH_SURROGATE(combiners[0])) {
-      nextChar = SURROGATE_TO_UCS4(combiners[0], combiners[1]);
-    } else {
-      nextChar = combiners[0];
-    }
-    if (!IsCombiningDiacritic(nextChar)) {
-      // Hangul syllables decompose but do not actually have diacritics.
-      // This also excludes decompositions with the Japanese marks U+3099 and
-      // U+309A (COMBINING KATAKANA-HIRAGANA [SEMI-]VOICED SOUND MARK), which
-      // we should not ignore for searching (bug 1624244).
-      baseChar = aCh;
-    }
-  }
-
-cache:
-  if (!nakedCharCache.putNew(aCh, baseChar)) {
-    // We're out of memory, so delete the cache to free some up.
-    nakedCharCache.clearAndCompact();
-  }
-
-  return baseChar;
+  return BASE_CHAR_MAPPING_LIST[block.mMappingStartOffset + lo - block.mFirst];
 }

 }  // end namespace unicode
--- a/intl/unicharutil/util/nsUnicodeProperties.h
+++ b/intl/unicharutil/util/nsUnicodeProperties.h
@ -249,10 +249,18 @@ uint32_t CountGraphemeClusters(const char16_t* aText, uint32_t aLength);
 //   3099;COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK;Mn;8;NSM
 //   309A;COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK;Mn;8;NSM
 // which users report should not be ignored (bug 1624244).
+// Keep this function in sync with is_combining_diacritic in base_chars.py.
 inline bool IsCombiningDiacritic(uint32_t aCh) {
  uint8_t cc = u_getCombiningClass(aCh);
  return cc != HB_UNICODE_COMBINING_CLASS_NOT_REORDERED &&
-         cc != HB_UNICODE_COMBINING_CLASS_KANA_VOICING;
+         cc != HB_UNICODE_COMBINING_CLASS_KANA_VOICING &&
+         cc != HB_UNICODE_COMBINING_CLASS_VIRAMA && cc != 91 && cc != 129 &&
+         cc != 130 && cc != 132;
+}
+
+// Keep this function in sync with is_math_symbol in base_chars.py.
+inline bool IsMathSymbol(uint32_t aCh) {
+  return u_charType(aCh) == U_MATH_SYMBOL;
 }

 // Remove diacritics from a character
--- a/toolkit/components/find/nsFind.cpp
+++ b/toolkit/components/find/nsFind.cpp
@ -40,13 +40,6 @@ using namespace mozilla::unicode;
 // Yikes!  Casting a char to unichar can fill with ones!
 #define CHAR_TO_UNICHAR(c) ((char16_t)(unsigned char)c)

-#define CH_QUOTE ((char16_t)0x22)
-#define CH_APOSTROPHE ((char16_t)0x27)
-#define CH_LEFT_SINGLE_QUOTE ((char16_t)0x2018)
-#define CH_RIGHT_SINGLE_QUOTE ((char16_t)0x2019)
-#define CH_LEFT_DOUBLE_QUOTE ((char16_t)0x201C)
-#define CH_RIGHT_DOUBLE_QUOTE ((char16_t)0x201D)
-
 #define CH_SHY ((char16_t)0xAD)

 // nsFind::Find casts CH_SHY to char before calling StripChars
@ -780,7 +773,8 @@ nsFind::Find(const nsAString& aPatText, nsRange* aSearchRange,
    // diacritics, don't leave c set to a combining diacritical mark. (patc is
    // already guaranteed to not be a combining diacritical mark.)
    c = (t2b ? DecodeChar(t2b, &findex) : CHAR_TO_UNICHAR(t1b[findex]));
-    if (!mMatchDiacritics && IsCombiningDiacritic(c)) {
+    if (!mMatchDiacritics && IsCombiningDiacritic(c) &&
+        !IsMathSymbol(prevChar)) {
      continue;
    }
    patc = DecodeChar(patStr, &pindex);
@ -822,32 +816,6 @@ nsFind::Find(const nsAString& aPatText, nsRange* aSearchRange,
      continue;
    }

-    if (!mCaseSensitive) {
-      switch (c) {
-        // treat curly and straight quotes as identical
-        case CH_LEFT_SINGLE_QUOTE:
-        case CH_RIGHT_SINGLE_QUOTE:
-          c = CH_APOSTROPHE;
-          break;
-        case CH_LEFT_DOUBLE_QUOTE:
-        case CH_RIGHT_DOUBLE_QUOTE:
-          c = CH_QUOTE;
-          break;
-      }
-
-      switch (patc) {
-        // treat curly and straight quotes as identical
-        case CH_LEFT_SINGLE_QUOTE:
-        case CH_RIGHT_SINGLE_QUOTE:
-          patc = CH_APOSTROPHE;
-          break;
-        case CH_LEFT_DOUBLE_QUOTE:
-        case CH_RIGHT_DOUBLE_QUOTE:
-          patc = CH_QUOTE;
-          break;
-      }
-    }
-
    if (pindex != (mFindBackward ? patLen : 0) && c != patc && !inWhitespace) {
      // A non-matching '\n' between CJ characters is ignored
      if (c == '\n' && t2b && IS_CJ_CHAR(prevCharInMatch)) {
--- a/toolkit/components/windowcreator/test/test_nsFind.html
+++ b/toolkit/components/windowcreator/test/test_nsFind.html
@ -7,6 +7,7 @@ https://bugzilla.mozilla.org/show_bug.cgi?id=812837
 https://bugzilla.mozilla.org/show_bug.cgi?id=969980
 https://bugzilla.mozilla.org/show_bug.cgi?id=1589786
 https://bugzilla.mozilla.org/show_bug.cgi?id=1611568
+https://bugzilla.mozilla.org/show_bug.cgi?id=1649187
 -->
 <head>
  <meta charset="UTF-8">
@ -63,14 +64,30 @@ async function runTests() {
  retRange = rf.Find(searchValue, searchRange, startPt, endPt);
  ok(!retRange, "\"" + searchValue + "\" found (not caseSensitive)");

+  searchValue = "కె";
+  retRange = rf.Find(searchValue, searchRange, startPt, endPt);
+  ok(!retRange, "\"" + searchValue + "\" found (not caseSensitive)");
+
  searchValue = "istanbul";
  retRange = rf.Find(searchValue, searchRange, startPt, endPt);
  ok(retRange, "\"" + searchValue + "\" not found (not caseSensitive)");

+  searchValue = "wroclaw";
+  retRange = rf.Find(searchValue, searchRange, startPt, endPt);
+  ok(retRange, "\"" + searchValue + "\" not found (not caseSensitive)");
+
+  searchValue = "goteborg";
+  retRange = rf.Find(searchValue, searchRange, startPt, endPt);
+  ok(retRange, "\"" + searchValue + "\" not found (not caseSensitive)");
+
  searchValue = "degrees k";
  retRange = rf.Find(searchValue, searchRange, startPt, endPt);
  ok(retRange, "\"" + searchValue + "\" not found (not caseSensitive)");

+  searchValue = "≠";
+  retRange = rf.Find(searchValue, searchRange, startPt, endPt);
+  ok(!retRange, "\"" + searchValue + "\" found (not caseSensitive)");
+
  searchValue = "guahe";
  retRange = rf.Find(searchValue, searchRange, startPt, endPt);
  ok(retRange, "\"" + searchValue + "\" not found (not caseSensitive)");
@ -138,7 +155,7 @@ async function runTests() {

  // Curly quotes and straight quotes should match.

-  rf.caseSensitive = false;
+  rf.matchDiacritics = false;
  rf.findBackwards = false;

  function find(node, value) {
@ -218,7 +235,7 @@ async function runTests() {
  assertNotFound(quotes, "\u201Cdoesn't\u201D");

  // Curly quotes and straight quotes should not match.
-  rf.caseSensitive = true;
+  rf.matchDiacritics = true;

  assertFound(quotes, "\"straight\"");
  assertNotFound(quotes, "\u201Cstraight\u201D");
@ -301,8 +318,12 @@ async function runTests() {
 <p id="nullcharsinjected"></p>
 <p id="greek">ΛΌΓΟΣ</p>
 <p id="korean">위</p>
+<p id="telugu">కై</p>
 <p id="turkish">İstanbul</p>
+<p id="polish">Wrocław</p>
+<p id="norwegian">Gøteborg</p>
 <p id="kelvin">degrees &#x212A;</p>
+<p id="math">=</p>
 <p id="guarani">G̃uahe</p>
 <p id="deseret">𐐐𐐯𐑊𐐬 𐐶𐐯𐑉𐑊𐐼!</p>
 <div id="content" style="display: none">