Bug 1726570 - Accelerate nsFind by precomputing a const SharedBitSet for IsCombiningDiacritic. r=emilio

No user-visible change to behavior, except that searching a huge document becomes slightly quicker. Differential Revision: https://phabricator.services.mozilla.com/D123114
2021-08-23 14:17:54 +00:00 · 2021-08-23 14:17:54 +00:00 · 72e566334e
--- a/intl/unicharutil/util/is_combining_diacritic.py
+++ b/intl/unicharutil/util/is_combining_diacritic.py
@ -0,0 +1,98 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+from unicodedata import combining
+
+# Exclusive upper bound of the code point range that main() scans.
+UNICODE_LIMIT = 0x110000
+
+# Named combining-class values that is_combining_diacritic() compares the
+# result of unicodedata.combining() against.
+UNICODE_COMBINING_CLASS_NOT_REORDERED = 0
+UNICODE_COMBINING_CLASS_KANA_VOICING = 8
+UNICODE_COMBINING_CLASS_VIRAMA = 9
+
+
+# Keep this function in sync with IsCombiningDiacritic in nsUnicodeProperties.h.
+def is_combining_diacritic(char):
+    return combining(char) not in (
+        UNICODE_COMBINING_CLASS_NOT_REORDERED,
+        UNICODE_COMBINING_CLASS_KANA_VOICING,
+        UNICODE_COMBINING_CLASS_VIRAMA,
+        91,
+        129,
+        130,
+        132,
+    )
+
+
+# See gfxFontUtils.h for the SharedBitSet that we're creating a const instance of here.
+# BLOCK_SIZE is the number of bytes in one block of the bitset; at 8 bits per
+# byte, each block covers BLOCK_SIZE_BITS (256) consecutive characters.
+BLOCK_SIZE = 32
+BLOCK_SIZE_BITS = BLOCK_SIZE * 8
+
+
+def main(header):
+    blockIndex = []
+    blocks = []
+
+    # Figure out the contents of each 256-char block, and see if it is unique
+    # or can share an already-allocated block.
+    block = [0] * BLOCK_SIZE
+    byte = 0
+    bit = 0x01
+    for char in range(UNICODE_LIMIT):
+        if is_combining_diacritic(chr(char)):
+            block[byte] |= bit
+        bit <<= 1
+        if bit == 0x100:
+            bit = 0x01
+            byte += 1
+        if byte == BLOCK_SIZE:
+            found = False
+            for b in range(len(blocks)):
+                if block == blocks[b]:
+                    blockIndex.append(b)
+                    found = True
+                    break
+            if not found:
+                blockIndex.append(len(blocks))
+                blocks.append(block)
+            byte = 0
+            block = [0] * BLOCK_SIZE
+
+    # Strip trailing empty blocks from the index.
+    while blockIndex[len(blockIndex) - 1] == 0:
+        del blockIndex[len(blockIndex) - 1]
+
+    # Write the SharedBitSet as data in a C++ header file.
+    header.write("/* !GENERATED DATA -- DO NOT EDIT! */\n")
+    header.write("/* (see is_combining_diacritic.py) */\n")
+    header.write("\n")
+    header.write("#include \"gfxFontUtils.h\"\n")
+    header.write("\n")
+
+    header.write("typedef struct {\n")
+    header.write("  uint16_t mBlockIndexCount;\n")
+    header.write("  uint16_t mBlockCount;\n")
+    header.write("  uint16_t mBlockIndex[" + str(len(blockIndex)) + "];\n")
+    header.write("  uint8_t mBlockData[" + str(len(blocks) * BLOCK_SIZE) + "];\n")
+    header.write("} CombiningDiacriticsBitset_t;\n")
+    header.write("\n")
+
+    header.write("static const CombiningDiacriticsBitset_t COMBINING_DIACRITICS_BITSET_DATA = {\n")
+    header.write("  " + str(len(blockIndex)) + ",\n")
+    header.write("  " + str(len(blocks)) + ",\n")
+    header.write("  {\n")
+    for b in blockIndex:
+        header.write("    " + str(b) + ",\n")
+    header.write("  },\n")
+    header.write("  {\n")
+    for b in blocks:
+        header.write("    ")
+        for i in b:
+            header.write(str(i) + ",")
+        header.write("\n")
+    header.write("  },\n")
+    header.write("};\n")
+    header.write("\n")
+    header.write("static const SharedBitSet* sCombiningDiacriticsSet =\n")
+    header.write("    reinterpret_cast<const SharedBitSet*>(&COMBINING_DIACRITICS_BITSET_DATA);\n")
+    header.write("\n")
--- a/intl/unicharutil/util/moz.build
+++ b/intl/unicharutil/util/moz.build
@ -25,10 +25,18 @@ UNIFIED_SOURCES += [
    "nsUnicodeProperties.cpp",
 ]

+include("/ipc/chromium/chromium-config.mozbuild")
+
 GeneratedFile(
    "BaseChars.h",
    script="base_chars.py",
    inputs=["../../icu/source/data/translit/Latin_ASCII.txt"],
 )

+# IsCombiningDiacritic.h is written by is_combining_diacritic.py, which is
+# given no input files.
+# NOTE(review): force=True presumably makes the build re-run the script every
+# time instead of only when inputs change -- confirm against the mozbuild docs.
+GeneratedFile(
+    "IsCombiningDiacritic.h",
+    script="is_combining_diacritic.py",
+    force=True
+)
+
 FINAL_LIBRARY = "xul"
--- a/intl/unicharutil/util/nsUnicodeProperties.cpp
+++ b/intl/unicharutil/util/nsUnicodeProperties.cpp
@ -12,6 +12,7 @@
 #include "nsCharTraits.h"

 #include "BaseChars.h"
+#include "IsCombiningDiacritic.h"

 #define UNICODE_BMP_LIMIT 0x10000
 #define UNICODE_LIMIT 0x110000
@ -326,6 +327,10 @@ uint32_t GetNaked(uint32_t aCh) {
         BASE_CHAR_MAPPING_LIST[block.mMappingStartOffset + lo - block.mFirst];
 }

+// Tests aCh against the precomputed bitset that is_combining_diacritic.py
+// writes into the generated IsCombiningDiacritic.h.
+bool IsCombiningDiacritic(uint32_t aCh) {
+  return sCombiningDiacriticsSet->test(aCh);
+}
+
 }  // end namespace unicode

 }  // end namespace mozilla
--- a/intl/unicharutil/util/nsUnicodeProperties.h
+++ b/intl/unicharutil/util/nsUnicodeProperties.h
@ -249,14 +249,13 @@ uint32_t CountGraphemeClusters(const char16_t* aText, uint32_t aLength);
 //   3099;COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK;Mn;8;NSM
 //   309A;COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK;Mn;8;NSM
 // which users report should not be ignored (bug 1624244).
-// Keep this function in sync with is_combining_diacritic in base_chars.py.
-inline bool IsCombiningDiacritic(uint32_t aCh) {
-  uint8_t cc = u_getCombiningClass(aCh);
-  return cc != HB_UNICODE_COMBINING_CLASS_NOT_REORDERED &&
-         cc != HB_UNICODE_COMBINING_CLASS_KANA_VOICING &&
-         cc != HB_UNICODE_COMBINING_CLASS_VIRAMA && cc != 91 && cc != 129 &&
-         cc != 130 && cc != 132;
-}
+// See is_combining_diacritic in base_chars.py and is_combining_diacritic.py.
+//
+// TODO: once ICU4X is integrated (replacing ICU4C) as the source of Unicode
+// properties, re-evaluate whether building the static bitset is worthwhile
+// or if we can revert to simply getting the combining class and comparing
+// to the values we care about at runtime.
+bool IsCombiningDiacritic(uint32_t aCh);

 // Keep this function in sync with is_math_symbol in base_chars.py.
 inline bool IsMathOrMusicSymbol(uint32_t aCh) {