зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1649187 - Use a fallback table to strip diacritics from non-decomposable characters. r=jfkthame
Implement the design suggested at https://bugzilla.mozilla.org/show_bug.cgi?id=1652910#c5 Differential Revision: https://phabricator.services.mozilla.com/D106674
This commit is contained in:
Родитель
f7a790dd78
Коммит
0686831376
|
@ -495,7 +495,7 @@ add_task(async function test_bookmarks() {
|
|||
// test bookmarks.search
|
||||
return Promise.all([
|
||||
browser.bookmarks.create({
|
||||
title: "MØzillä",
|
||||
title: "Μοζιλλας",
|
||||
url: "http://møzîllä.örg/",
|
||||
}),
|
||||
browser.bookmarks.create({
|
||||
|
@ -567,7 +567,7 @@ add_task(async function test_bookmarks() {
|
|||
results[0].id,
|
||||
bookmarkGuids.unfiledGuid,
|
||||
0,
|
||||
"MØzillä",
|
||||
"Μοζιλλας",
|
||||
"http://xn--mzll-ooa1dud.xn--rg-eka/",
|
||||
results[0].dateAdded
|
||||
);
|
||||
|
@ -908,7 +908,7 @@ add_task(async function test_bookmarks() {
|
|||
);
|
||||
|
||||
// is case-insensitive for non-ascii
|
||||
return browser.bookmarks.search("MøZILLÄ");
|
||||
return browser.bookmarks.search("ΜοΖΙΛΛΑς");
|
||||
})
|
||||
.then(results => {
|
||||
browser.test.assertEq(
|
||||
|
@ -917,7 +917,7 @@ add_task(async function test_bookmarks() {
|
|||
"Expected number of results returned for non-ascii search"
|
||||
);
|
||||
browser.test.assertEq(
|
||||
"MØzillä",
|
||||
"Μοζιλλας",
|
||||
results[0].title,
|
||||
"Bookmark has the expected title"
|
||||
);
|
||||
|
|
|
@ -0,0 +1,183 @@
|
|||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
from __future__ import absolute_import, print_function
|
||||
|
||||
import re
|
||||
from collections import namedtuple
|
||||
from unicodedata import category, combining, normalize
|
||||
|
||||
# Exclusive upper bound of the Basic Multilingual Plane (code points that fit
# in a single UTF-16 code unit).
UNICODE_BMP_LIMIT = 1 << 16
# Exclusive upper bound of the whole Unicode code space: 0x11 planes of
# UNICODE_BMP_LIMIT code points each, i.e. 0x110000.
UNICODE_LIMIT = 0x11 * UNICODE_BMP_LIMIT

# Canonical combining class values that is_combining_diacritic() must NOT
# treat as diacritics.
UNICODE_COMBINING_CLASS_NOT_REORDERED = 0
UNICODE_COMBINING_CLASS_KANA_VOICING = 8
UNICODE_COMBINING_CLASS_VIRAMA = 9

# One entry of the generated mapping list: the code point of a character and
# the code point of the base character it is replaced with.
BaseCharMapping = namedtuple("BaseCharMapping", "char base_char")

# One run of consecutive mapping-list entries that share the same high byte:
# `first` / `last` are the low bytes of the first and last code point in the
# run, `offset` is the index of the run's first entry in the mapping list.
BaseCharMappingBlock = namedtuple("BaseCharMappingBlock", "first last offset")
|
||||
|
||||
|
||||
def is_in_bmp(char):
    """Return True if the single character `char` lies below UNICODE_BMP_LIMIT."""
    code_point = ord(char)
    return code_point < UNICODE_BMP_LIMIT
|
||||
|
||||
|
||||
# Keep this function in sync with IsCombiningDiacritic in nsUnicodeProperties.h.
|
||||
def is_combining_diacritic(char):
    """Return True if `char` is a combining mark that counts as a diacritic.

    A character counts as a diacritic when its canonical combining class is
    anything other than the excluded classes listed below.
    """
    excluded_classes = (
        UNICODE_COMBINING_CLASS_NOT_REORDERED,
        UNICODE_COMBINING_CLASS_KANA_VOICING,
        UNICODE_COMBINING_CLASS_VIRAMA,
        # NOTE(review): the remaining values are unnamed fixed-position
        # classes; per UnicodeData they appear to be a Telugu length mark (91)
        # and Tibetan vowel signs (129, 130, 132) -- confirm before renaming.
        91,
        129,
        130,
        132,
    )
    combining_class = combining(char)
    return combining_class not in excluded_classes
|
||||
|
||||
|
||||
# Keep this function in sync with IsMathSymbol in nsUnicodeProperties.h.
|
||||
def is_math_symbol(char):
    """Return True if the general category of `char` is "Sm" (math symbol)."""
    general_category = category(char)
    return general_category == "Sm"
|
||||
|
||||
|
||||
def crosses_bmp(char, base_char):
    """Return True if the mapping `char` -> `base_char` must be skipped.

    A mapping is usable (False is returned) only when both characters are
    inside the basic multilingual plane. When both are outside it, a warning
    is printed as well, because the generated tables only hold 16-bit values.
    """
    source_in_bmp = is_in_bmp(char)
    target_in_bmp = is_in_bmp(base_char)

    if source_in_bmp and target_in_bmp:
        return False

    if source_in_bmp != target_in_bmp:
        # Mappings that would change the length of a UTF-16 string are not
        # currently supported.
        return True

    # Both characters are outside of the basic multilingual plane. Currently
    # there are no such mappings we care about. However, if one is added to
    # Unicode in the future, this warning will appear at build time.
    source_hex = "{:#06x}".format(ord(char))
    target_hex = "{:#06x}".format(ord(base_char))
    print("Warning: Skipping " + source_hex + " → " + target_hex)
    print(
        "base_chars.py and nsUnicodeProperties.cpp need to be rewritten to "
        "use uint32_t instead of uint16_t."
    )
    return True
|
||||
|
||||
|
||||
def main(header, fallback_table):
    """Generate the C lookup tables that map characters to their base character.

    Args:
        header: Writable text stream that receives the generated C header.
        fallback_table: Path of a UTF-8 text file whose "X → Y ;" lines supply
            base characters in addition to those found through canonical
            decomposition.

    Raises:
        ValueError: If there are too many blocks for the 8-bit block index.
    """
    mappings = _mappings_from_decompositions()
    # Applied second, so a fallback entry overrides a decomposition entry for
    # the same character.
    mappings.update(_mappings_from_fallback_table(fallback_table))

    mapping_list, blocks = _build_blocks(mappings)
    indexes = _build_block_index(mapping_list, blocks)
    _write_header(header, mapping_list, blocks, indexes)


def _mappings_from_decompositions():
    """Return {char: base_char} gleaned from NFD decompositions in the BMP."""
    mappings = {}
    for code_point in range(UNICODE_BMP_LIMIT):
        char = chr(code_point)
        if is_combining_diacritic(char) or is_math_symbol(char):
            continue
        decomposition = normalize("NFD", char)
        if len(decomposition) < 2:
            continue
        base_char = decomposition[0]
        if crosses_bmp(char, base_char):
            continue
        next_char = decomposition[1]
        if not is_combining_diacritic(next_char):
            # Hangul syllables decompose but do not actually have diacritics.
            # This also excludes decompositions with the Japanese marks U+3099
            # and U+309A (COMBINING KATAKANA-HIRAGANA [SEMI-]VOICED SOUND
            # MARK), which we should not ignore for searching (bug 1624244).
            continue
        mappings[char] = base_char
    return mappings


def _mappings_from_fallback_table(fallback_table):
    """Return {char: base_char} parsed from the "X → Y ;" lines of a file."""
    rule = re.compile("^(.) → (.+?) ;")
    mappings = {}
    # The context manager closes the table even if parsing raises.
    with open(fallback_table, encoding="UTF-8") as table:
        for line in table:
            m = rule.match(line)
            if not m:
                continue
            char = m.group(1)
            replacement = m.group(2)
            # Unwrap a replacement written as a quoted literal: 'x'.
            if (
                len(replacement) >= 3
                and replacement.startswith("'")
                and replacement.endswith("'")
            ):
                replacement = replacement[1:-1]
            # Unwrap a replacement written with a leading backslash: \x.
            if len(replacement) >= 2 and replacement.startswith("\\"):
                replacement = replacement[1:]
            # Only one-to-one character mappings can be stored in the table.
            if len(replacement) > 1:
                continue
            if crosses_bmp(char, replacement):
                continue
            mappings[char] = replacement
    return mappings


def _build_blocks(mappings):
    """Lay the mappings out as one gap-free run per 256-code-point page.

    Returns a `(mapping_list, blocks)` pair. `mapping_list` is sorted by code
    point; between two mappings on the same page every missing code point is
    filled with an identity entry, so that an entry can be found by adding the
    low byte's distance from `block.first` to `block.offset`.
    """
    mapping_list = []
    block_starts = []
    for code_point, base in sorted((ord(k), ord(v)) for k, v in mappings.items()):
        if mapping_list and mapping_list[-1].char >> 8 == code_point >> 8:
            # Same page as the previous entry: pad the gap with identities.
            mapping_list.extend(
                BaseCharMapping(gap, gap)
                for gap in range(mapping_list[-1].char + 1, code_point)
            )
        else:
            block_starts.append(len(mapping_list))
        mapping_list.append(BaseCharMapping(code_point, base))

    # Every page gets a block, including a final page that holds only a
    # single mapping.
    block_ends = block_starts[1:] + [len(mapping_list)]
    blocks = [
        BaseCharMappingBlock(
            mapping_list[start].char & 0xFF, mapping_list[end - 1].char & 0xFF, start
        )
        for start, end in zip(block_starts, block_ends)
    ]
    return mapping_list, blocks


def _build_block_index(mapping_list, blocks):
    """Return a 256-entry list mapping each page to its block index, or 255."""
    no_block = 255
    if len(blocks) > no_block:
        # Block number 255 would be indistinguishable from "no block".
        raise ValueError(
            "too many mapping blocks for an 8-bit index: {}".format(len(blocks))
        )
    indexes = [no_block] * 256
    for i, block in enumerate(blocks):
        indexes[mapping_list[block.offset].char >> 8] = i
    return indexes


def _write_header(header, mapping_list, blocks, indexes):
    """Write the mapping list, the blocks and the block index as C arrays."""
    header.write("struct BaseCharMappingBlock {\n")
    header.write(" uint8_t mFirst;\n")
    header.write(" uint8_t mLast;\n")
    header.write(" uint16_t mMappingStartOffset;\n")
    header.write("};\n")
    header.write("\n")
    header.write("static const uint16_t BASE_CHAR_MAPPING_LIST[] = {\n")
    for char, base_char in mapping_list:
        header.write(
            " /* {:#06x}".format(char) + " */ " + "{:#06x}".format(base_char) + ","
        )
        # Identity entries are gap fillers; only real mappings are annotated.
        if char != base_char:
            header.write(" /* " + chr(char) + " → " + chr(base_char) + " */")
        header.write("\n")
    header.write("};\n")
    header.write("\n")
    header.write(
        "static const struct BaseCharMappingBlock BASE_CHAR_MAPPING_BLOCKS[] = {\n"
    )
    for block in blocks:
        header.write(
            " {"
            + "{:#04x}".format(block.first)
            + ", "
            + "{:#04x}".format(block.last)
            + ", "
            + str(block.offset).rjust(4)
            + "}, // "
            + "{:#04x}".format(mapping_list[block.offset].char >> 8)
            + "xx\n"
        )
    header.write("};\n")
    header.write("\n")
    header.write("static const uint8_t BASE_CHAR_MAPPING_BLOCK_INDEX[256] = {\n")
    for i, index in enumerate(indexes):
        header.write(
            " " + str(index).rjust(3) + ", // " + "{:#04x}".format(i) + "xx\n"
        )
    header.write("};\n")
|
|
@ -25,4 +25,10 @@ UNIFIED_SOURCES += [
|
|||
"nsUnicodeProperties.cpp",
|
||||
]
|
||||
|
||||
GeneratedFile(
|
||||
"BaseChars.h",
|
||||
script="base_chars.py",
|
||||
inputs=["../../icu/source/data/translit/Latin_ASCII.txt"],
|
||||
)
|
||||
|
||||
FINAL_LIBRARY = "xul"
|
||||
|
|
|
@ -11,8 +11,7 @@
|
|||
#include "mozilla/HashTable.h"
|
||||
#include "nsCharTraits.h"
|
||||
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/unorm2.h"
|
||||
#include "BaseChars.h"
|
||||
|
||||
#define UNICODE_BMP_LIMIT 0x10000
|
||||
#define UNICODE_LIMIT 0x110000
|
||||
|
@ -310,76 +309,20 @@ uint32_t CountGraphemeClusters(const char16_t* aText, uint32_t aLength) {
|
|||
}
|
||||
|
||||
uint32_t GetNaked(uint32_t aCh) {
|
||||
using namespace mozilla;
|
||||
|
||||
static const UNormalizer2* normalizer;
|
||||
static HashMap<uint32_t, uint32_t> nakedCharCache;
|
||||
|
||||
NS_ASSERTION(!IsCombiningDiacritic(aCh),
|
||||
"This character needs to be skipped");
|
||||
|
||||
HashMap<uint32_t, uint32_t>::Ptr entry = nakedCharCache.lookup(aCh);
|
||||
if (entry.found()) {
|
||||
return entry->value();
|
||||
}
|
||||
|
||||
UErrorCode error = U_ZERO_ERROR;
|
||||
if (!normalizer) {
|
||||
normalizer = unorm2_getNFDInstance(&error);
|
||||
if (U_FAILURE(error)) {
|
||||
return aCh;
|
||||
}
|
||||
}
|
||||
|
||||
static const size_t MAX_DECOMPOSITION_SIZE = 16;
|
||||
UChar decomposition[MAX_DECOMPOSITION_SIZE];
|
||||
UChar* combiners;
|
||||
int32_t decompositionLen;
|
||||
uint32_t baseChar, nextChar;
|
||||
decompositionLen = unorm2_getDecomposition(normalizer, aCh, decomposition,
|
||||
MAX_DECOMPOSITION_SIZE, &error);
|
||||
if (decompositionLen < 1) {
|
||||
// The character does not decompose.
|
||||
MOZ_ASSERT(!IsCombiningDiacritic(aCh), "This character needs to be skipped");
|
||||
if (!IS_IN_BMP(aCh)) {
|
||||
return aCh;
|
||||
}
|
||||
|
||||
if (NS_IS_HIGH_SURROGATE(decomposition[0])) {
|
||||
baseChar = SURROGATE_TO_UCS4(decomposition[0], decomposition[1]);
|
||||
combiners = decomposition + 2;
|
||||
} else {
|
||||
baseChar = decomposition[0];
|
||||
combiners = decomposition + 1;
|
||||
uint8_t index = BASE_CHAR_MAPPING_BLOCK_INDEX[aCh >> 8];
|
||||
if (index == 0xff) {
|
||||
return aCh;
|
||||
}
|
||||
|
||||
if (IS_IN_BMP(baseChar) != IS_IN_BMP(aCh)) {
|
||||
// Mappings that would change the length of a UTF-16 string are not
|
||||
// currently supported.
|
||||
baseChar = aCh;
|
||||
goto cache;
|
||||
const BaseCharMappingBlock& block = BASE_CHAR_MAPPING_BLOCKS[index];
|
||||
uint8_t lo = aCh & 0xff;
|
||||
if (lo < block.mFirst || lo > block.mLast) {
|
||||
return aCh;
|
||||
}
|
||||
|
||||
if (decompositionLen > 1) {
|
||||
if (NS_IS_HIGH_SURROGATE(combiners[0])) {
|
||||
nextChar = SURROGATE_TO_UCS4(combiners[0], combiners[1]);
|
||||
} else {
|
||||
nextChar = combiners[0];
|
||||
}
|
||||
if (!IsCombiningDiacritic(nextChar)) {
|
||||
// Hangul syllables decompose but do not actually have diacritics.
|
||||
// This also excludes decompositions with the Japanese marks U+3099 and
|
||||
// U+309A (COMBINING KATAKANA-HIRAGANA [SEMI-]VOICED SOUND MARK), which
|
||||
// we should not ignore for searching (bug 1624244).
|
||||
baseChar = aCh;
|
||||
}
|
||||
}
|
||||
|
||||
cache:
|
||||
if (!nakedCharCache.putNew(aCh, baseChar)) {
|
||||
// We're out of memory, so delete the cache to free some up.
|
||||
nakedCharCache.clearAndCompact();
|
||||
}
|
||||
|
||||
return baseChar;
|
||||
return BASE_CHAR_MAPPING_LIST[block.mMappingStartOffset + lo - block.mFirst];
|
||||
}
|
||||
|
||||
} // end namespace unicode
|
||||
|
|
|
@ -249,10 +249,18 @@ uint32_t CountGraphemeClusters(const char16_t* aText, uint32_t aLength);
|
|||
// 3099;COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK;Mn;8;NSM
|
||||
// 309A;COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK;Mn;8;NSM
|
||||
// which users report should not be ignored (bug 1624244).
|
||||
// Keep this function in sync with is_combining_diacritic in base_chars.py.
|
||||
inline bool IsCombiningDiacritic(uint32_t aCh) {
|
||||
uint8_t cc = u_getCombiningClass(aCh);
|
||||
return cc != HB_UNICODE_COMBINING_CLASS_NOT_REORDERED &&
|
||||
cc != HB_UNICODE_COMBINING_CLASS_KANA_VOICING;
|
||||
cc != HB_UNICODE_COMBINING_CLASS_KANA_VOICING &&
|
||||
cc != HB_UNICODE_COMBINING_CLASS_VIRAMA && cc != 91 && cc != 129 &&
|
||||
cc != 130 && cc != 132;
|
||||
}
|
||||
|
||||
// Keep this function in sync with is_math_symbol in base_chars.py.
|
||||
inline bool IsMathSymbol(uint32_t aCh) {
|
||||
return u_charType(aCh) == U_MATH_SYMBOL;
|
||||
}
|
||||
|
||||
// Remove diacritics from a character
|
||||
|
|
|
@ -40,13 +40,6 @@ using namespace mozilla::unicode;
|
|||
// Yikes! Casting a char to unichar can fill with ones!
|
||||
#define CHAR_TO_UNICHAR(c) ((char16_t)(unsigned char)c)
|
||||
|
||||
#define CH_QUOTE ((char16_t)0x22)
|
||||
#define CH_APOSTROPHE ((char16_t)0x27)
|
||||
#define CH_LEFT_SINGLE_QUOTE ((char16_t)0x2018)
|
||||
#define CH_RIGHT_SINGLE_QUOTE ((char16_t)0x2019)
|
||||
#define CH_LEFT_DOUBLE_QUOTE ((char16_t)0x201C)
|
||||
#define CH_RIGHT_DOUBLE_QUOTE ((char16_t)0x201D)
|
||||
|
||||
#define CH_SHY ((char16_t)0xAD)
|
||||
|
||||
// nsFind::Find casts CH_SHY to char before calling StripChars
|
||||
|
@ -780,7 +773,8 @@ nsFind::Find(const nsAString& aPatText, nsRange* aSearchRange,
|
|||
// diacritics, don't leave c set to a combining diacritical mark. (patc is
|
||||
// already guaranteed to not be a combining diacritical mark.)
|
||||
c = (t2b ? DecodeChar(t2b, &findex) : CHAR_TO_UNICHAR(t1b[findex]));
|
||||
if (!mMatchDiacritics && IsCombiningDiacritic(c)) {
|
||||
if (!mMatchDiacritics && IsCombiningDiacritic(c) &&
|
||||
!IsMathSymbol(prevChar)) {
|
||||
continue;
|
||||
}
|
||||
patc = DecodeChar(patStr, &pindex);
|
||||
|
@ -822,32 +816,6 @@ nsFind::Find(const nsAString& aPatText, nsRange* aSearchRange,
|
|||
continue;
|
||||
}
|
||||
|
||||
if (!mCaseSensitive) {
|
||||
switch (c) {
|
||||
// treat curly and straight quotes as identical
|
||||
case CH_LEFT_SINGLE_QUOTE:
|
||||
case CH_RIGHT_SINGLE_QUOTE:
|
||||
c = CH_APOSTROPHE;
|
||||
break;
|
||||
case CH_LEFT_DOUBLE_QUOTE:
|
||||
case CH_RIGHT_DOUBLE_QUOTE:
|
||||
c = CH_QUOTE;
|
||||
break;
|
||||
}
|
||||
|
||||
switch (patc) {
|
||||
// treat curly and straight quotes as identical
|
||||
case CH_LEFT_SINGLE_QUOTE:
|
||||
case CH_RIGHT_SINGLE_QUOTE:
|
||||
patc = CH_APOSTROPHE;
|
||||
break;
|
||||
case CH_LEFT_DOUBLE_QUOTE:
|
||||
case CH_RIGHT_DOUBLE_QUOTE:
|
||||
patc = CH_QUOTE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (pindex != (mFindBackward ? patLen : 0) && c != patc && !inWhitespace) {
|
||||
// A non-matching '\n' between CJ characters is ignored
|
||||
if (c == '\n' && t2b && IS_CJ_CHAR(prevCharInMatch)) {
|
||||
|
|
|
@ -7,6 +7,7 @@ https://bugzilla.mozilla.org/show_bug.cgi?id=812837
|
|||
https://bugzilla.mozilla.org/show_bug.cgi?id=969980
|
||||
https://bugzilla.mozilla.org/show_bug.cgi?id=1589786
|
||||
https://bugzilla.mozilla.org/show_bug.cgi?id=1611568
|
||||
https://bugzilla.mozilla.org/show_bug.cgi?id=1649187
|
||||
-->
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
|
@ -63,14 +64,30 @@ async function runTests() {
|
|||
retRange = rf.Find(searchValue, searchRange, startPt, endPt);
|
||||
ok(!retRange, "\"" + searchValue + "\" found (not caseSensitive)");
|
||||
|
||||
searchValue = "కె";
|
||||
retRange = rf.Find(searchValue, searchRange, startPt, endPt);
|
||||
ok(!retRange, "\"" + searchValue + "\" found (not caseSensitive)");
|
||||
|
||||
searchValue = "istanbul";
|
||||
retRange = rf.Find(searchValue, searchRange, startPt, endPt);
|
||||
ok(retRange, "\"" + searchValue + "\" not found (not caseSensitive)");
|
||||
|
||||
searchValue = "wroclaw";
|
||||
retRange = rf.Find(searchValue, searchRange, startPt, endPt);
|
||||
ok(retRange, "\"" + searchValue + "\" not found (not caseSensitive)");
|
||||
|
||||
searchValue = "goteborg";
|
||||
retRange = rf.Find(searchValue, searchRange, startPt, endPt);
|
||||
ok(retRange, "\"" + searchValue + "\" not found (not caseSensitive)");
|
||||
|
||||
searchValue = "degrees k";
|
||||
retRange = rf.Find(searchValue, searchRange, startPt, endPt);
|
||||
ok(retRange, "\"" + searchValue + "\" not found (not caseSensitive)");
|
||||
|
||||
searchValue = "≠";
|
||||
retRange = rf.Find(searchValue, searchRange, startPt, endPt);
|
||||
ok(!retRange, "\"" + searchValue + "\" found (not caseSensitive)");
|
||||
|
||||
searchValue = "guahe";
|
||||
retRange = rf.Find(searchValue, searchRange, startPt, endPt);
|
||||
ok(retRange, "\"" + searchValue + "\" not found (not caseSensitive)");
|
||||
|
@ -138,7 +155,7 @@ async function runTests() {
|
|||
|
||||
// Curly quotes and straight quotes should match.
|
||||
|
||||
rf.caseSensitive = false;
|
||||
rf.matchDiacritics = false;
|
||||
rf.findBackwards = false;
|
||||
|
||||
function find(node, value) {
|
||||
|
@ -218,7 +235,7 @@ async function runTests() {
|
|||
assertNotFound(quotes, "\u201Cdoesn't\u201D");
|
||||
|
||||
// Curly quotes and straight quotes should not match.
|
||||
rf.caseSensitive = true;
|
||||
rf.matchDiacritics = true;
|
||||
|
||||
assertFound(quotes, "\"straight\"");
|
||||
assertNotFound(quotes, "\u201Cstraight\u201D");
|
||||
|
@ -301,8 +318,12 @@ async function runTests() {
|
|||
<p id="nullcharsinjected"></p>
|
||||
<p id="greek">ΛΌΓΟΣ</p>
|
||||
<p id="korean">위</p>
|
||||
<p id="telugu">కై</p>
|
||||
<p id="turkish">İstanbul</p>
|
||||
<p id="polish">Wrocław</p>
|
||||
<p id="norwegian">Gøteborg</p>
|
||||
<p id="kelvin">degrees K</p>
|
||||
<p id="math">=</p>
|
||||
<p id="guarani">G̃uahe</p>
|
||||
<p id="deseret">𐐐𐐯𐑊𐐬 𐐶𐐯𐑉𐑊𐐼!</p>
|
||||
<div id="content" style="display: none">
|
||||
|
|
Загрузка…
Ссылка в новой задаче