зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1726570 - Accelerate nsFind by precomputing a const SharedBitSet for IsCombiningDiacritic. r=emilio
No user-visible change to behavior, except that searching a huge document becomes slightly quicker. Differential Revision: https://phabricator.services.mozilla.com/D123114
This commit is contained in:
Родитель
35ae073003
Коммит
72e566334e
|
@ -0,0 +1,98 @@
|
|||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
from unicodedata import combining
|
||||
|
||||
# Exclusive upper bound of the code points scanned by main().
UNICODE_LIMIT = 0x110000

# Canonical combining class values (as returned by unicodedata.combining)
# that is_combining_diacritic() does not treat as diacritics.
UNICODE_COMBINING_CLASS_NOT_REORDERED = 0
UNICODE_COMBINING_CLASS_KANA_VOICING = 8
UNICODE_COMBINING_CLASS_VIRAMA = 9
|
||||
|
||||
|
||||
# Keep this function in sync with IsCombiningDiacritic in nsUnicodeProperties.h.
|
||||
def is_combining_diacritic(char):
    """Return whether *char* counts as a combining diacritic.

    *char* is a one-character string.  The result is True unless the
    character's canonical combining class is one of the excluded values:
    not-reordered, kana voicing, virama, or the numeric classes 91, 129,
    130 and 132.
    """
    combining_class = combining(char)

    # The named classes are excluded first ...
    if combining_class in (
        UNICODE_COMBINING_CLASS_NOT_REORDERED,
        UNICODE_COMBINING_CLASS_KANA_VOICING,
        UNICODE_COMBINING_CLASS_VIRAMA,
    ):
        return False

    # ... then the remaining excluded classes, which have no named constant.
    return combining_class not in (91, 129, 130, 132)
|
||||
|
||||
|
||||
# See gfxFontUtils.h for the SharedBitSet that we're creating a const instance of here.
|
||||
# Number of bytes of bit data stored per block.
BLOCK_SIZE = 32
# Number of bits, i.e. code points, covered by one block (32 * 8 = 256).
BLOCK_SIZE_BITS = BLOCK_SIZE * 8
|
||||
|
||||
|
||||
def _build_bitset_blocks():
    """Compute the de-duplicated bit blocks for the scanned code points.

    Returns a ``(block_index, blocks)`` pair.  ``blocks`` holds every distinct
    BLOCK_SIZE-byte block once, in order of first appearance.  ``block_index``
    has one entry per run of BLOCK_SIZE_BITS code points, giving the position
    in ``blocks`` of that run's data.
    """
    block_index = []
    blocks = []
    # Maps a block's contents to its position in ``blocks`` so that a repeated
    # block is found with one lookup instead of a scan over all earlier blocks.
    known_blocks = {}

    # Only whole blocks are produced; a trailing partial block is ignored.
    for block_number in range(UNICODE_LIMIT // BLOCK_SIZE_BITS):
        block_start = block_number * BLOCK_SIZE_BITS
        block = [0] * BLOCK_SIZE
        for offset in range(BLOCK_SIZE_BITS):
            if is_combining_diacritic(chr(block_start + offset)):
                # Bit N of byte M stands for code point block_start + 8*M + N.
                block[offset >> 3] |= 1 << (offset & 7)

        key = tuple(block)
        position = known_blocks.get(key)
        if position is None:
            position = len(blocks)
            known_blocks[key] = position
            blocks.append(block)
        block_index.append(position)

    return block_index, blocks


def main(header):
    """Write the combining-diacritics SharedBitSet data as a C++ header.

    *header* is a writable text stream; the complete generated header is
    written to it with a single ``write`` call.
    """
    # Figure out the contents of each 256-char block, sharing identical blocks.
    block_index, blocks = _build_bitset_blocks()

    # Strip trailing empty blocks from the index.  Compare against a real
    # empty block rather than assuming it was allocated at position 0, and
    # stop cleanly if the index becomes empty.
    empty_block = [0] * BLOCK_SIZE
    while block_index and blocks[block_index[-1]] == empty_block:
        block_index.pop()

    # Assemble the SharedBitSet as data in a C++ header file.
    lines = [
        "/* !GENERATED DATA -- DO NOT EDIT! */\n",
        "/* (see is_combining_diacritic.py) */\n",
        "\n",
        '#include "gfxFontUtils.h"\n',
        "\n",
        "typedef struct {\n",
        " uint16_t mBlockIndexCount;\n",
        " uint16_t mBlockCount;\n",
        " uint16_t mBlockIndex[" + str(len(block_index)) + "];\n",
        " uint8_t mBlockData[" + str(len(blocks) * BLOCK_SIZE) + "];\n",
        "} CombiningDiacriticsBitset_t;\n",
        "\n",
        "static const CombiningDiacriticsBitset_t COMBINING_DIACRITICS_BITSET_DATA = {\n",
        " " + str(len(block_index)) + ",\n",
        " " + str(len(blocks)) + ",\n",
        " {\n",
    ]
    lines.extend(" " + str(position) + ",\n" for position in block_index)
    lines.append(" },\n")
    lines.append(" {\n")
    # One output line per block, each byte followed by a comma.
    lines.extend(
        " " + "".join(str(value) + "," for value in block) + "\n"
        for block in blocks
    )
    lines.append(" },\n")
    lines.append("};\n")
    lines.append("\n")
    lines.append("static const SharedBitSet* sCombiningDiacriticsSet =\n")
    lines.append(
        " reinterpret_cast<const SharedBitSet*>(&COMBINING_DIACRITICS_BITSET_DATA);\n"
    )
    lines.append("\n")

    header.write("".join(lines))
|
|
@ -25,10 +25,18 @@ UNIFIED_SOURCES += [
|
|||
"nsUnicodeProperties.cpp",
|
||||
]
|
||||
|
||||
include("/ipc/chromium/chromium-config.mozbuild")
|
||||
|
||||
# Generate BaseChars.h by running base_chars.py on ICU's Latin_ASCII
# transliteration data.
GeneratedFile(
    "BaseChars.h",
    script="base_chars.py",
    inputs=["../../icu/source/data/translit/Latin_ASCII.txt"],
)

# Generate IsCombiningDiacritic.h by running is_combining_diacritic.py.
# NOTE(review): force=True presumably makes the build rerun the script every
# time, since it declares no inputs -- confirm against the mozbuild docs.
GeneratedFile(
    "IsCombiningDiacritic.h",
    script="is_combining_diacritic.py",
    force=True
)
|
||||
|
||||
FINAL_LIBRARY = "xul"
|
||||
|
|
|
@ -12,6 +12,7 @@
|
|||
#include "nsCharTraits.h"
|
||||
|
||||
#include "BaseChars.h"
|
||||
#include "IsCombiningDiacritic.h"
|
||||
|
||||
#define UNICODE_BMP_LIMIT 0x10000
|
||||
#define UNICODE_LIMIT 0x110000
|
||||
|
@ -326,6 +327,10 @@ uint32_t GetNaked(uint32_t aCh) {
|
|||
BASE_CHAR_MAPPING_LIST[block.mMappingStartOffset + lo - block.mFirst];
|
||||
}
|
||||
|
||||
// Returns whether aCh is set in sCombiningDiacriticsSet, the precomputed
// bitset declared in the generated header IsCombiningDiacritic.h.
bool IsCombiningDiacritic(uint32_t aCh) {
  return sCombiningDiacriticsSet->test(aCh);
}
|
||||
|
||||
} // end namespace unicode
|
||||
|
||||
} // end namespace mozilla
|
||||
|
|
|
@ -249,14 +249,13 @@ uint32_t CountGraphemeClusters(const char16_t* aText, uint32_t aLength);
|
|||
// 3099;COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK;Mn;8;NSM
|
||||
// 309A;COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK;Mn;8;NSM
|
||||
// which users report should not be ignored (bug 1624244).
|
||||
// Keep this function in sync with is_combining_diacritic in base_chars.py.
|
||||
inline bool IsCombiningDiacritic(uint32_t aCh) {
  // Look up the combining class of aCh at runtime.
  uint8_t cc = u_getCombiningClass(aCh);
  // Every class except the ones excluded below counts as a diacritic.
  return cc != HB_UNICODE_COMBINING_CLASS_NOT_REORDERED &&
         cc != HB_UNICODE_COMBINING_CLASS_KANA_VOICING &&
         cc != HB_UNICODE_COMBINING_CLASS_VIRAMA && cc != 91 && cc != 129 &&
         cc != 130 && cc != 132;
}
|
||||
// See is_combining_diacritic in base_chars.py and is_combining_diacritic.py.
|
||||
//
|
||||
// TODO: once ICU4X is integrated (replacing ICU4C) as the source of Unicode
|
||||
// properties, re-evaluate whether building the static bitset is worthwhile
|
||||
// or if we can revert to simply getting the combining class and comparing
|
||||
// to the values we care about at runtime.
|
||||
bool IsCombiningDiacritic(uint32_t aCh);
|
||||
|
||||
// Keep this function in sync with is_math_symbol in base_chars.py.
|
||||
inline bool IsMathOrMusicSymbol(uint32_t aCh) {
|
||||
|
|
Загрузка…
Ссылка в новой задаче