diff --git a/intl/unicharutil/util/is_combining_diacritic.py b/intl/unicharutil/util/is_combining_diacritic.py
new file mode 100644
index 000000000000..a8dcec107879
--- /dev/null
+++ b/intl/unicharutil/util/is_combining_diacritic.py
@@ -0,0 +1,100 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+from unicodedata import combining
+
+UNICODE_LIMIT = 0x110000
+
+UNICODE_COMBINING_CLASS_NOT_REORDERED = 0
+UNICODE_COMBINING_CLASS_KANA_VOICING = 8
+UNICODE_COMBINING_CLASS_VIRAMA = 9
+
+
+# Keep this function in sync with is_combining_diacritic in base_chars.py.
+# IsCombiningDiacritic (nsUnicodeProperties.h) tests the bitset built from it.
+def is_combining_diacritic(char):
+    """Return True unless char's combining class is one of the excluded ones.
+
+    char is a one-character string, as required by unicodedata.combining().
+    """
+    return combining(char) not in (
+        UNICODE_COMBINING_CLASS_NOT_REORDERED,
+        UNICODE_COMBINING_CLASS_KANA_VOICING,
+        UNICODE_COMBINING_CLASS_VIRAMA,
+        91,
+        129,
+        130,
+        132,
+    )
+
+
+# See gfxFontUtils.h for the SharedBitSet that we're creating a const instance of here.
+BLOCK_SIZE = 32
+BLOCK_SIZE_BITS = BLOCK_SIZE * 8
+
+
+def main(header):
+    """Write the combining-diacritics bitset to header as a C++ header file.
+
+    header is the output stream; only its write() method is used.
+    """
+    blockIndex = []
+    blocks = []
+
+    # Figure out the contents of each 256-char block, and see if it is unique
+    # or can share an already-allocated block.
+    for first in range(0, UNICODE_LIMIT, BLOCK_SIZE_BITS):
+        block = [0] * BLOCK_SIZE
+        for offset in range(BLOCK_SIZE_BITS):
+            if is_combining_diacritic(chr(first + offset)):
+                # Bit N of byte M stands for code point first + M * 8 + N.
+                block[offset >> 3] |= 1 << (offset & 7)
+        try:
+            blockIndex.append(blocks.index(block))
+        except ValueError:
+            blockIndex.append(len(blocks))
+            blocks.append(block)
+
+    # Strip trailing empty blocks from the index. This relies on blocks[0]
+    # being the all-zero block, i.e. on none of the first 256 code points
+    # being a combining diacritic.
+    while blockIndex and blockIndex[-1] == 0:
+        blockIndex.pop()
+
+    # Write the SharedBitSet as data in a C++ header file.
+    header.write("/* !GENERATED DATA -- DO NOT EDIT! */\n")
+    header.write("/* (see is_combining_diacritic.py) */\n")
+    header.write("\n")
+    header.write("#include \"gfxFontUtils.h\"\n")
+    header.write("\n")
+
+    header.write("typedef struct {\n")
+    header.write(" uint16_t mBlockIndexCount;\n")
+    header.write(" uint16_t mBlockCount;\n")
+    header.write(" uint16_t mBlockIndex[" + str(len(blockIndex)) + "];\n")
+    header.write(" uint8_t mBlockData[" + str(len(blocks) * BLOCK_SIZE) + "];\n")
+    header.write("} CombiningDiacriticsBitset_t;\n")
+    header.write("\n")
+
+    header.write("static const CombiningDiacriticsBitset_t COMBINING_DIACRITICS_BITSET_DATA = {\n")
+    header.write(" " + str(len(blockIndex)) + ",\n")
+    header.write(" " + str(len(blocks)) + ",\n")
+    header.write(" {\n")
+    for b in blockIndex:
+        header.write(" " + str(b) + ",\n")
+    header.write(" },\n")
+    header.write(" {\n")
+    for b in blocks:
+        header.write(" ")
+        for i in b:
+            header.write(str(i) + ",")
+        header.write("\n")
+    header.write(" },\n")
+    header.write("};\n")
+    header.write("\n")
+    header.write("static const SharedBitSet* sCombiningDiacriticsSet =\n")
+    header.write(
+        " reinterpret_cast<const SharedBitSet*>(&COMBINING_DIACRITICS_BITSET_DATA);\n"
+    )
+    header.write("\n")
diff --git a/intl/unicharutil/util/moz.build b/intl/unicharutil/util/moz.build
index 897bfad92a8c..b08d66256534 100644
--- a/intl/unicharutil/util/moz.build
+++ b/intl/unicharutil/util/moz.build
@@ -25,10 +25,18 @@ UNIFIED_SOURCES += [
     "nsUnicodeProperties.cpp",
 ]
 
+include("/ipc/chromium/chromium-config.mozbuild")
+
 GeneratedFile(
     "BaseChars.h",
     script="base_chars.py",
     inputs=["../../icu/source/data/translit/Latin_ASCII.txt"],
 )
 
+GeneratedFile(
+    "IsCombiningDiacritic.h",
+    script="is_combining_diacritic.py",
+    force=True,
+)
+
 FINAL_LIBRARY = "xul"
diff --git a/intl/unicharutil/util/nsUnicodeProperties.cpp b/intl/unicharutil/util/nsUnicodeProperties.cpp
index c1ff1b2d23b6..942f65b2da41 100644
--- a/intl/unicharutil/util/nsUnicodeProperties.cpp
+++ b/intl/unicharutil/util/nsUnicodeProperties.cpp
@@ -12,6 +12,7 @@
 #include "nsCharTraits.h"
 
 #include "BaseChars.h"
+#include "IsCombiningDiacritic.h"
 
 #define UNICODE_BMP_LIMIT 0x10000
 #define UNICODE_LIMIT 0x110000
@@ -326,6 +327,10 @@ uint32_t GetNaked(uint32_t aCh) {
          BASE_CHAR_MAPPING_LIST[block.mMappingStartOffset + lo - block.mFirst];
 }
 
+bool IsCombiningDiacritic(uint32_t aCh) {
+  return sCombiningDiacriticsSet->test(aCh);
+}
+
 }  // end namespace unicode
 
 }  // end namespace mozilla
diff --git a/intl/unicharutil/util/nsUnicodeProperties.h b/intl/unicharutil/util/nsUnicodeProperties.h
index 6e7c489d372b..07d26ca02948 100644
--- a/intl/unicharutil/util/nsUnicodeProperties.h
+++ b/intl/unicharutil/util/nsUnicodeProperties.h
@@ -249,14 +249,13 @@ uint32_t CountGraphemeClusters(const char16_t* aText, uint32_t aLength);
 // 3099;COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK;Mn;8;NSM
 // 309A;COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK;Mn;8;NSM
 // which users report should not be ignored (bug 1624244).
-// Keep this function in sync with is_combining_diacritic in base_chars.py.
-inline bool IsCombiningDiacritic(uint32_t aCh) {
-  uint8_t cc = u_getCombiningClass(aCh);
-  return cc != HB_UNICODE_COMBINING_CLASS_NOT_REORDERED &&
-         cc != HB_UNICODE_COMBINING_CLASS_KANA_VOICING &&
-         cc != HB_UNICODE_COMBINING_CLASS_VIRAMA && cc != 91 && cc != 129 &&
-         cc != 130 && cc != 132;
-}
+// See is_combining_diacritic in base_chars.py and is_combining_diacritic.py.
+//
+// TODO: once ICU4X is integrated (replacing ICU4C) as the source of Unicode
+// properties, re-evaluate whether building the static bitset is worthwhile
+// or if we can revert to simply getting the combining class and comparing
+// to the values we care about at runtime.
+bool IsCombiningDiacritic(uint32_t aCh);
 
 // Keep this function in sync with is_math_symbol in base_chars.py.
 inline bool IsMathOrMusicSymbol(uint32_t aCh) {