Bug 1649187 - Use a fallback table to strip diacritics from non-decomposable characters. r=jfkthame

Implement the design suggested at
https://bugzilla.mozilla.org/show_bug.cgi?id=1652910#c5

Differential Revision: https://phabricator.services.mozilla.com/D106674
This commit is contained in:
Alex Henrie 2021-03-07 16:17:41 +00:00
Родитель f7a790dd78
Коммит 0686831376
7 изменённых файлов: 238 добавлений и 109 удалений

Просмотреть файл

@ -495,7 +495,7 @@ add_task(async function test_bookmarks() {
// test bookmarks.search
return Promise.all([
browser.bookmarks.create({
title: "MØzillä",
title: "Μοζιλλας",
url: "http://møzîllä.örg/",
}),
browser.bookmarks.create({
@ -567,7 +567,7 @@ add_task(async function test_bookmarks() {
results[0].id,
bookmarkGuids.unfiledGuid,
0,
"MØzillä",
"Μοζιλλας",
"http://xn--mzll-ooa1dud.xn--rg-eka/",
results[0].dateAdded
);
@ -908,7 +908,7 @@ add_task(async function test_bookmarks() {
);
// is case-insensitive for non-ascii
return browser.bookmarks.search("MøZILLÄ");
return browser.bookmarks.search("ΜοΖΙΛΛΑς");
})
.then(results => {
browser.test.assertEq(
@ -917,7 +917,7 @@ add_task(async function test_bookmarks() {
"Expected number of results returned for non-ascii search"
);
browser.test.assertEq(
"MØzillä",
"Μοζιλλας",
results[0].title,
"Bookmark has the expected title"
);

Просмотреть файл

@ -0,0 +1,183 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
from __future__ import absolute_import, print_function
import re
from collections import namedtuple
from unicodedata import category, combining, normalize
# Code-point limits. The Basic Multilingual Plane (BMP) is the first block of
# 0x10000 code points; the whole Unicode code space spans 17 such planes.
UNICODE_BMP_LIMIT = 1 << 16
UNICODE_LIMIT = 17 * UNICODE_BMP_LIMIT

# Canonical combining class values that is_combining_diacritic() excludes by
# name.
UNICODE_COMBINING_CLASS_NOT_REORDERED = 0
UNICODE_COMBINING_CLASS_KANA_VOICING = 8
UNICODE_COMBINING_CLASS_VIRAMA = 9

# One entry of the generated mapping list: a code point and the code point of
# the base character it maps to.
BaseCharMapping = namedtuple("BaseCharMapping", "char base_char")

# One contiguous run of mappings: the low bytes of its first and last code
# points, and the index of its first entry in the mapping list.
BaseCharMappingBlock = namedtuple("BaseCharMappingBlock", "first last offset")
def is_in_bmp(char):
    """Return True if the one-character string *char* is below UNICODE_BMP_LIMIT.

    In other words, report whether the character lies in the Basic
    Multilingual Plane.
    """
    code_point = ord(char)
    return code_point < UNICODE_BMP_LIMIT
# Keep this function in sync with IsCombiningDiacritic in nsUnicodeProperties.h.
def is_combining_diacritic(char):
    """Return True if *char* has a combining class that counts as a diacritic.

    A character counts as a combining diacritic when its canonical combining
    class is anything other than the excluded classes listed below.
    """
    non_diacritic_classes = (
        UNICODE_COMBINING_CLASS_NOT_REORDERED,
        UNICODE_COMBINING_CLASS_KANA_VOICING,
        UNICODE_COMBINING_CLASS_VIRAMA,
        # NOTE(review): unnamed fixed-position classes; presumably the
        # Telugu (91) and Tibetan (129, 130, 132) vowel-sign classes --
        # confirm against the Unicode Character Database.
        91,
        129,
        130,
        132,
    )
    combining_class = combining(char)
    if combining_class in non_diacritic_classes:
        return False
    return True
# Keep this function in sync with IsMathSymbol in nsUnicodeProperties.h.
def is_math_symbol(char):
    """Return True if the general category of *char* is "Sm" (math symbol)."""
    general_category = category(char)
    return "Sm" == general_category
def crosses_bmp(char, base_char):
    """Return True if the mapping char -> base_char must be skipped.

    A mapping is skipped when exactly one of the two characters is in the
    basic multilingual plane, or when both are outside it. In the latter case
    a warning naming both code points is printed so the unsupported mapping
    is visible at build time.

    Both arguments are one-character strings.
    """
    char_in_bmp = is_in_bmp(char)
    if char_in_bmp != is_in_bmp(base_char):
        # Mappings that would change the length of a UTF-16 string are not
        # currently supported.
        return True
    if char_in_bmp:
        return False
    # Currently there are no mappings we care about outside of the basic
    # multilingual plane. However, if such a mapping is added to Unicode in
    # the future, this warning will appear at build time.
    # The two code points are separated by an ASCII arrow so that they can be
    # told apart in the log regardless of the console encoding.
    print(
        "Warning: Skipping "
        + "{:#06x}".format(ord(char))
        + " -> "
        + "{:#06x}".format(ord(base_char))
    )
    print(
        "base_chars.py and nsUnicodeProperties.cpp need to be rewritten to "
        "use uint32_t instead of uint16_t."
    )
    return True
# Value stored in BASE_CHAR_MAPPING_BLOCK_INDEX for a page that has no block.
# A real block index must never be equal to it.
_NO_BLOCK_INDEX = 255


def _mappings_from_decompositions():
    """Return {char: base_char} gleaned from NFD decompositions in the BMP."""
    mappings = {}
    for code_point in range(UNICODE_BMP_LIMIT):
        char = chr(code_point)
        if is_combining_diacritic(char) or is_math_symbol(char):
            continue
        decomposition = normalize("NFD", char)
        if len(decomposition) < 2:
            # The character does not decompose.
            continue
        base_char = decomposition[0]
        if crosses_bmp(char, base_char):
            continue
        if not is_combining_diacritic(decomposition[1]):
            # Hangul syllables decompose but do not actually have diacritics.
            # This also excludes decompositions with the Japanese marks U+3099
            # and U+309A (COMBINING KATAKANA-HIRAGANA [SEMI-]VOICED SOUND
            # MARK), which we should not ignore for searching (bug 1624244).
            continue
        mappings[char] = base_char
    return mappings


def _mappings_from_fallback_table(fallback_table):
    """Return {char: base_char} parsed from the rule file at *fallback_table*.

    Only lines of the form "X → Y ;" are used, where X is a single character
    and Y, after removing surrounding single quotes and one leading backslash,
    is also a single character. Later lines override earlier ones.
    """
    rule = re.compile(r"^(.) → (.+?) ;")
    mappings = {}
    # The context manager guarantees the table is closed once it is parsed.
    with open(fallback_table, encoding="UTF-8") as table:
        for line in table:
            m = rule.match(line)
            if not m:
                continue
            char = m.group(1)
            decomposition = m.group(2)
            # Strip the quoting around a quoted replacement ...
            if len(decomposition) >= 3:
                if decomposition.startswith("'") and decomposition.endswith("'"):
                    decomposition = decomposition[1:-1]
            # ... and the backslash in front of an escaped replacement.
            if len(decomposition) >= 2:
                if decomposition.startswith("\\"):
                    decomposition = decomposition[1:]
            if len(decomposition) > 1:
                # Multi-character replacements cannot be stored in the table.
                continue
            if crosses_bmp(char, decomposition):
                continue
            mappings[char] = decomposition
    return mappings


def _build_blocks(mappings):
    """Turn {char: base_char} into a dense mapping list plus its blocks.

    Mappings are grouped by page (the high byte of the code point). Every
    page that has at least one mapping yields one BaseCharMappingBlock that
    covers the range from its lowest to its highest mapped code point; gaps
    inside that range are filled with identity mappings so that entries can
    be addressed by position. A page holding a single mapping gets a block
    too, including when it is the last page.

    Returns (mapping_list, blocks), both in ascending code point order.
    """
    pages = {}
    for char, base_char in mappings.items():
        code_point = ord(char)
        pages.setdefault(code_point >> 8, {})[code_point] = ord(base_char)

    mapping_list = []
    blocks = []
    for page in sorted(pages):
        known = pages[page]
        first = min(known)
        last = max(known)
        blocks.append(
            BaseCharMappingBlock(first & 0xFF, last & 0xFF, len(mapping_list))
        )
        mapping_list.extend(
            BaseCharMapping(code_point, known.get(code_point, code_point))
            for code_point in range(first, last + 1)
        )
    return mapping_list, blocks


def _build_block_index(mapping_list, blocks):
    """Return the 256-entry page -> block index table.

    Pages without a block hold _NO_BLOCK_INDEX.

    Raises:
        ValueError: if there are so many blocks that a block index would be
            indistinguishable from _NO_BLOCK_INDEX.
    """
    if len(blocks) > _NO_BLOCK_INDEX:
        raise ValueError(
            "Too many base character mapping blocks ({}) to index with "
            "uint8_t and a reserved value of {}".format(
                len(blocks), _NO_BLOCK_INDEX
            )
        )
    indexes = [_NO_BLOCK_INDEX] * 256
    for i, block in enumerate(blocks):
        indexes[mapping_list[block.offset].char >> 8] = i
    return indexes


def _write_header(header, mapping_list, blocks, indexes):
    """Write the mapping list, the blocks and the page index as C tables."""
    header.write("struct BaseCharMappingBlock {\n")
    header.write(" uint8_t mFirst;\n")
    header.write(" uint8_t mLast;\n")
    header.write(" uint16_t mMappingStartOffset;\n")
    header.write("};\n")
    header.write("\n")
    header.write("static const uint16_t BASE_CHAR_MAPPING_LIST[] = {\n")
    for char, base_char in mapping_list:
        header.write(
            " /* {:#06x}".format(char) + " */ " + "{:#06x}".format(base_char) + ","
        )
        if char != base_char:
            # Show the actual characters, except for identity filler entries.
            header.write(" /* " + chr(char) + " -> " + chr(base_char) + " */")
        header.write("\n")
    header.write("};\n")
    header.write("\n")
    header.write(
        "static const struct BaseCharMappingBlock BASE_CHAR_MAPPING_BLOCKS[] = {\n"
    )
    for block in blocks:
        header.write(
            " {"
            + "{:#04x}".format(block.first)
            + ", "
            + "{:#04x}".format(block.last)
            + ", "
            + str(block.offset).rjust(4)
            + "}, // "
            + "{:#04x}".format(mapping_list[block.offset].char >> 8)
            + "xx\n"
        )
    header.write("};\n")
    header.write("\n")
    header.write("static const uint8_t BASE_CHAR_MAPPING_BLOCK_INDEX[256] = {\n")
    for i, index in enumerate(indexes):
        header.write(
            " " + str(index).rjust(3) + ", // " + "{:#04x}".format(i) + "xx\n"
        )
    header.write("};\n")


def main(header, fallback_table):
    """Generate the C header that maps characters to their base characters.

    Args:
        header: writable text stream that receives the generated C code.
        fallback_table: path of a UTF-8 rule file with lines of the form
            "X → Y ;" providing mappings for characters that have no
            usable decomposition. Its entries take precedence over the
            mappings gleaned from decompositions.

    Raises:
        ValueError: if the mappings need more blocks than the generated
            uint8_t index can address.
    """
    # Glean mappings from decompositions
    mappings = _mappings_from_decompositions()
    # Add mappings from the ASCII fallback table
    mappings.update(_mappings_from_fallback_table(fallback_table))
    # Organize mappings into contiguous blocks
    mapping_list, blocks = _build_blocks(mappings)
    indexes = _build_block_index(mapping_list, blocks)
    # Write the mappings to a C header file
    _write_header(header, mapping_list, blocks, indexes)

Просмотреть файл

@ -25,4 +25,10 @@ UNIFIED_SOURCES += [
"nsUnicodeProperties.cpp",
]
GeneratedFile(
"BaseChars.h",
script="base_chars.py",
inputs=["../../icu/source/data/translit/Latin_ASCII.txt"],
)
FINAL_LIBRARY = "xul"

Просмотреть файл

@ -11,8 +11,7 @@
#include "mozilla/HashTable.h"
#include "nsCharTraits.h"
#include "unicode/uchar.h"
#include "unicode/unorm2.h"
#include "BaseChars.h"
#define UNICODE_BMP_LIMIT 0x10000
#define UNICODE_LIMIT 0x110000
@ -310,76 +309,20 @@ uint32_t CountGraphemeClusters(const char16_t* aText, uint32_t aLength) {
}
uint32_t GetNaked(uint32_t aCh) {
using namespace mozilla;
static const UNormalizer2* normalizer;
static HashMap<uint32_t, uint32_t> nakedCharCache;
NS_ASSERTION(!IsCombiningDiacritic(aCh),
"This character needs to be skipped");
HashMap<uint32_t, uint32_t>::Ptr entry = nakedCharCache.lookup(aCh);
if (entry.found()) {
return entry->value();
}
UErrorCode error = U_ZERO_ERROR;
if (!normalizer) {
normalizer = unorm2_getNFDInstance(&error);
if (U_FAILURE(error)) {
return aCh;
}
}
static const size_t MAX_DECOMPOSITION_SIZE = 16;
UChar decomposition[MAX_DECOMPOSITION_SIZE];
UChar* combiners;
int32_t decompositionLen;
uint32_t baseChar, nextChar;
decompositionLen = unorm2_getDecomposition(normalizer, aCh, decomposition,
MAX_DECOMPOSITION_SIZE, &error);
if (decompositionLen < 1) {
// The character does not decompose.
MOZ_ASSERT(!IsCombiningDiacritic(aCh), "This character needs to be skipped");
if (!IS_IN_BMP(aCh)) {
return aCh;
}
if (NS_IS_HIGH_SURROGATE(decomposition[0])) {
baseChar = SURROGATE_TO_UCS4(decomposition[0], decomposition[1]);
combiners = decomposition + 2;
} else {
baseChar = decomposition[0];
combiners = decomposition + 1;
uint8_t index = BASE_CHAR_MAPPING_BLOCK_INDEX[aCh >> 8];
if (index == 0xff) {
return aCh;
}
if (IS_IN_BMP(baseChar) != IS_IN_BMP(aCh)) {
// Mappings that would change the length of a UTF-16 string are not
// currently supported.
baseChar = aCh;
goto cache;
const BaseCharMappingBlock& block = BASE_CHAR_MAPPING_BLOCKS[index];
uint8_t lo = aCh & 0xff;
if (lo < block.mFirst || lo > block.mLast) {
return aCh;
}
if (decompositionLen > 1) {
if (NS_IS_HIGH_SURROGATE(combiners[0])) {
nextChar = SURROGATE_TO_UCS4(combiners[0], combiners[1]);
} else {
nextChar = combiners[0];
}
if (!IsCombiningDiacritic(nextChar)) {
// Hangul syllables decompose but do not actually have diacritics.
// This also excludes decompositions with the Japanese marks U+3099 and
// U+309A (COMBINING KATAKANA-HIRAGANA [SEMI-]VOICED SOUND MARK), which
// we should not ignore for searching (bug 1624244).
baseChar = aCh;
}
}
cache:
if (!nakedCharCache.putNew(aCh, baseChar)) {
// We're out of memory, so delete the cache to free some up.
nakedCharCache.clearAndCompact();
}
return baseChar;
return BASE_CHAR_MAPPING_LIST[block.mMappingStartOffset + lo - block.mFirst];
}
} // end namespace unicode

Просмотреть файл

@ -249,10 +249,18 @@ uint32_t CountGraphemeClusters(const char16_t* aText, uint32_t aLength);
// 3099;COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK;Mn;8;NSM
// 309A;COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK;Mn;8;NSM
// which users report should not be ignored (bug 1624244).
// Keep this function in sync with is_combining_diacritic in base_chars.py.
inline bool IsCombiningDiacritic(uint32_t aCh) {
uint8_t cc = u_getCombiningClass(aCh);
return cc != HB_UNICODE_COMBINING_CLASS_NOT_REORDERED &&
cc != HB_UNICODE_COMBINING_CLASS_KANA_VOICING;
cc != HB_UNICODE_COMBINING_CLASS_KANA_VOICING &&
cc != HB_UNICODE_COMBINING_CLASS_VIRAMA && cc != 91 && cc != 129 &&
cc != 130 && cc != 132;
}
// Keep this function in sync with is_math_symbol in base_chars.py.
// Reports whether u_charType classifies aCh as U_MATH_SYMBOL.
inline bool IsMathSymbol(uint32_t aCh) {
  const auto generalCategory = u_charType(aCh);
  return generalCategory == U_MATH_SYMBOL;
}
// Remove diacritics from a character

Просмотреть файл

@ -40,13 +40,6 @@ using namespace mozilla::unicode;
// Yikes! Casting a char to unichar can fill with ones!
#define CHAR_TO_UNICHAR(c) ((char16_t)(unsigned char)c)
#define CH_QUOTE ((char16_t)0x22)
#define CH_APOSTROPHE ((char16_t)0x27)
#define CH_LEFT_SINGLE_QUOTE ((char16_t)0x2018)
#define CH_RIGHT_SINGLE_QUOTE ((char16_t)0x2019)
#define CH_LEFT_DOUBLE_QUOTE ((char16_t)0x201C)
#define CH_RIGHT_DOUBLE_QUOTE ((char16_t)0x201D)
#define CH_SHY ((char16_t)0xAD)
// nsFind::Find casts CH_SHY to char before calling StripChars
@ -780,7 +773,8 @@ nsFind::Find(const nsAString& aPatText, nsRange* aSearchRange,
// diacritics, don't leave c set to a combining diacritical mark. (patc is
// already guaranteed to not be a combining diacritical mark.)
c = (t2b ? DecodeChar(t2b, &findex) : CHAR_TO_UNICHAR(t1b[findex]));
if (!mMatchDiacritics && IsCombiningDiacritic(c)) {
if (!mMatchDiacritics && IsCombiningDiacritic(c) &&
!IsMathSymbol(prevChar)) {
continue;
}
patc = DecodeChar(patStr, &pindex);
@ -822,32 +816,6 @@ nsFind::Find(const nsAString& aPatText, nsRange* aSearchRange,
continue;
}
if (!mCaseSensitive) {
switch (c) {
// treat curly and straight quotes as identical
case CH_LEFT_SINGLE_QUOTE:
case CH_RIGHT_SINGLE_QUOTE:
c = CH_APOSTROPHE;
break;
case CH_LEFT_DOUBLE_QUOTE:
case CH_RIGHT_DOUBLE_QUOTE:
c = CH_QUOTE;
break;
}
switch (patc) {
// treat curly and straight quotes as identical
case CH_LEFT_SINGLE_QUOTE:
case CH_RIGHT_SINGLE_QUOTE:
patc = CH_APOSTROPHE;
break;
case CH_LEFT_DOUBLE_QUOTE:
case CH_RIGHT_DOUBLE_QUOTE:
patc = CH_QUOTE;
break;
}
}
if (pindex != (mFindBackward ? patLen : 0) && c != patc && !inWhitespace) {
// A non-matching '\n' between CJ characters is ignored
if (c == '\n' && t2b && IS_CJ_CHAR(prevCharInMatch)) {

Просмотреть файл

@ -7,6 +7,7 @@ https://bugzilla.mozilla.org/show_bug.cgi?id=812837
https://bugzilla.mozilla.org/show_bug.cgi?id=969980
https://bugzilla.mozilla.org/show_bug.cgi?id=1589786
https://bugzilla.mozilla.org/show_bug.cgi?id=1611568
https://bugzilla.mozilla.org/show_bug.cgi?id=1649187
-->
<head>
<meta charset="UTF-8">
@ -63,14 +64,30 @@ async function runTests() {
retRange = rf.Find(searchValue, searchRange, startPt, endPt);
ok(!retRange, "\"" + searchValue + "\" found (not caseSensitive)");
searchValue = "కె";
retRange = rf.Find(searchValue, searchRange, startPt, endPt);
ok(!retRange, "\"" + searchValue + "\" found (not caseSensitive)");
searchValue = "istanbul";
retRange = rf.Find(searchValue, searchRange, startPt, endPt);
ok(retRange, "\"" + searchValue + "\" not found (not caseSensitive)");
searchValue = "wroclaw";
retRange = rf.Find(searchValue, searchRange, startPt, endPt);
ok(retRange, "\"" + searchValue + "\" not found (not caseSensitive)");
searchValue = "goteborg";
retRange = rf.Find(searchValue, searchRange, startPt, endPt);
ok(retRange, "\"" + searchValue + "\" not found (not caseSensitive)");
searchValue = "degrees k";
retRange = rf.Find(searchValue, searchRange, startPt, endPt);
ok(retRange, "\"" + searchValue + "\" not found (not caseSensitive)");
searchValue = "≠";
retRange = rf.Find(searchValue, searchRange, startPt, endPt);
ok(!retRange, "\"" + searchValue + "\" found (not caseSensitive)");
searchValue = "guahe";
retRange = rf.Find(searchValue, searchRange, startPt, endPt);
ok(retRange, "\"" + searchValue + "\" not found (not caseSensitive)");
@ -138,7 +155,7 @@ async function runTests() {
// Curly quotes and straight quotes should match.
rf.caseSensitive = false;
rf.matchDiacritics = false;
rf.findBackwards = false;
function find(node, value) {
@ -218,7 +235,7 @@ async function runTests() {
assertNotFound(quotes, "\u201Cdoesn't\u201D");
// Curly quotes and straight quotes should not match.
rf.caseSensitive = true;
rf.matchDiacritics = true;
assertFound(quotes, "\"straight\"");
assertNotFound(quotes, "\u201Cstraight\u201D");
@ -301,8 +318,12 @@ async function runTests() {
<p id="nullcharsinjected"></p>
<p id="greek">ΛΌΓΟΣ</p>
<p id="korean"></p>
<p id="telugu">కై</p>
<p id="turkish">İstanbul</p>
<p id="polish">Wrocław</p>
<p id="norwegian">Gøteborg</p>
<p id="kelvin">degrees &#x212A;</p>
<p id="math">=</p>
<p id="guarani">G̃uahe</p>
<p id="deseret">𐐐𐐯𐑊𐐬 𐐶𐐯𐑉𐑊𐐼!</p>
<div id="content" style="display: none">