Bug 1281448 - part 1+2 - Update character property table generator script for Unicode 9 (in particular, security/xidmodifications.txt is replaced by security/IdentifierStatus.txt and IdentifierType.txt), and adjust APIs to fit the new identifier-type property model; update the generated data files. r=m_kato

This commit is contained in:
Jonathan Kew 2016-11-14 09:23:49 +00:00
Родитель 40ea799982
Коммит b809e13f8d
7 изменённых файлов: 1438 добавлений и 1380 удалений

Просмотреть файл

@ -724,10 +724,9 @@ gfxShapedText::SetGlyphs(uint32_t aIndex, CompressedGlyph aGlyph,
#define ZWNJ 0x200C
#define ZWJ 0x200D
static inline bool
IsDefaultIgnorable(uint32_t aChar)
IsIgnorable(uint32_t aChar)
{
return GetIdentifierModification(aChar) == XIDMOD_DEFAULT_IGNORABLE ||
aChar == ZWNJ || aChar == ZWJ;
return (IsDefaultIgnorable(aChar)) || aChar == ZWNJ || aChar == ZWJ;
}
void
@ -743,7 +742,7 @@ gfxShapedText::SetMissingGlyph(uint32_t aIndex, uint32_t aChar, gfxFont *aFont)
DetailedGlyph *details = AllocateDetailedGlyphs(aIndex, 1);
details->mGlyphID = aChar;
if (IsDefaultIgnorable(aChar)) {
if (IsIgnorable(aChar)) {
// Setting advance width to zero will prevent drawing the hexbox
details->mAdvance = 0;
} else {
@ -761,7 +760,7 @@ gfxShapedText::SetMissingGlyph(uint32_t aIndex, uint32_t aChar, gfxFont *aFont)
bool
gfxShapedText::FilterIfIgnorable(uint32_t aIndex, uint32_t aCh)
{
if (IsDefaultIgnorable(aCh)) {
if (IsIgnorable(aCh)) {
// There are a few default-ignorables of Letter category (currently,
// just the Hangul filler characters) that we'd better not discard
// if they're followed by additional characters in the same cluster.

Просмотреть файл

@ -23,19 +23,21 @@
# - HangulSyllableType.txt
# - LineBreak.txt
# - EastAsianWidth.txt
# - DerivedCoreProperties.txt
# - ReadMe.txt (to record version/date of the UCD)
# - Unihan_Variants.txt (from Unihan.zip)
# though this may change if we find a need for additional properties.
#
# The Unicode data files listed above should be together in one directory.
#
# We also require the file
# http://www.unicode.org/Public/security/latest/xidmodifications.txt
# This file should be in a sub-directory "security" immediately below the
# We also require the files
# http://www.unicode.org/Public/security/latest/IdentifierStatus.txt
# http://www.unicode.org/Public/security/latest/IdentifierType.txt
# These files should be in a sub-directory "security" immediately below the
# directory containing the other Unicode data files.
#
# We also require the latest data file for UTR50, currently revision-13:
# http://www.unicode.org/Public/vertical/revision-13/VerticalOrientation-13.txt
# We also require the latest data file for UTR50, currently revision-15:
# http://www.unicode.org/Public/vertical/revision-15/VerticalOrientation-15.txt
# This file should be in a sub-directory "vertical" immediately below the
# directory containing the other Unicode data files.
#
@ -140,20 +142,35 @@ sub readIcuHeader
die "didn't find ICU script codes\n" if $sc == -1;
my %xidmodCode = (
'Recommended' => 0,
'Inclusion' => 1,
'Uncommon_Use' => 2,
'Technical' => 3,
'Obsolete' => 4,
'Aspirational' => 5,
'Limited_Use' => 6,
'Exclusion' => 7,
'Not_XID' => 8,
'Not_NFKC' => 9,
'Default_Ignorable' => 10,
'Deprecated' => 11,
'not-chars' => 12
# Identifier types that can appear in IdentifierType.txt.
#
# The numeric values here are not stored in the generated tables; this hash
# exists only so that every type name read from IdentifierType.txt can be
# checked against a known list. Only the %mappedIdType values, which are the
# ones used by nsIDNService::isLabelSafe, are recorded. In practice it would
# be sufficient to read only the last value in IdentifierType.txt, but
# verifying every name means a future update that introduces a new type
# produces a warning, so we can decide whether it needs to be handled.
my %idType = (
    Not_Character     => 0,
    Recommended       => 1,
    Inclusion         => 2,
    Uncommon_Use      => 3,
    Technical         => 4,
    Obsolete          => 5,
    Aspirational      => 6,
    Limited_Use       => 7,
    Exclusion         => 8,
    Not_XID           => 9,
    Not_NFKC          => 10,
    Default_Ignorable => 11,
    Deprecated        => 12,
);
# Values written to the generated tables. They must stay in step with the
# IdentifierType enum in nsUnicodeProperties.h.
my %mappedIdType = (
    Restricted   => 0,
    Allowed      => 1,
    # Used for Aspirational characters that are not excluded by another
    # attribute.
    Aspirational => 2,
);
my %bidicategoryCode = (
@ -229,7 +246,10 @@ my %lineBreakCode = ( # ordering matches ICU's ULineBreak enum
"CP" => 36,
"CJ" => 37,
"HL" => 38,
"RI" => 39
"RI" => 39,
"EB" => 40,
"EM" => 41,
"ZWJ" => 42
);
my %eastAsianWidthCode = (
@ -249,7 +269,7 @@ my @mirror;
my @pairedBracketType;
my @hangul;
my @casemap;
my @xidmod;
my @idtype;
my @numericvalue;
my @hanVariant;
my @bidicategory;
@ -258,13 +278,14 @@ my @fullWidthInverse;
my @verticalOrientation;
my @lineBreak;
my @eastAsianWidthFWH;
my @defaultIgnorable;
for (my $i = 0; $i < 0x110000; ++$i) {
$script[$i] = $scriptCode{"UNKNOWN"};
$category[$i] = $catCode{"UNASSIGNED"};
$combining[$i] = 0;
$pairedBracketType[$i] = 0;
$casemap[$i] = 0;
$xidmod[$i] = $xidmodCode{"not-chars"};
$idtype[$i] = $mappedIdType{'Restricted'};
$numericvalue[$i] = -1;
$hanVariant[$i] = 0;
$bidicategory[$i] = $bidicategoryCode{"L"};
@ -273,6 +294,7 @@ for (my $i = 0; $i < 0x110000; ++$i) {
$verticalOrientation[$i] = 1; # default for unlisted codepoints is 'R'
$lineBreak[$i] = $lineBreakCode{"XX"};
$eastAsianWidthFWH[$i] = 0;
$defaultIgnorable[$i] = 0;
}
# blocks where the default for bidi category is not L
@ -557,25 +579,67 @@ while (<FH>) {
}
close FH;
# read xidmodifications.txt
open FH, "< $UNICODE/security/xidmodifications.txt" or die "can't open UCD file xidmodifications.txt\n";
# read DerivedCoreProperties.txt (for Default-Ignorables)
#
# The leading lines of the file, up to and including the one containing
# "Date:", are copied into @versionInfo. Every later line that assigns
# Default_Ignorable_Code_Point to a code point or a start..end range sets
# the corresponding entries of @defaultIgnorable to 1.
open FH, "< $UNICODE/DerivedCoreProperties.txt" or die "can't open UCD file DerivedCoreProperties.txt\n";
push @versionInfo, "";
while (my $header = <FH>) {
    chomp $header;
    push @versionInfo, $header;
    last if $header =~ /Date:/;
}
while (my $entry = <FH>) {
    # Discard the trailing comment before looking for the property name.
    $entry =~ s/#.*//;
    next unless $entry =~ m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*Default_Ignorable_Code_Point/;
    my ($first, $last) = ($1, $2);
    my $lo = hex("0x$first");
    # A line without a ".." suffix describes a single code point.
    my $hi = defined($last) ? hex("0x$last") : $lo;
    foreach my $cp ($lo .. $hi) {
        $defaultIgnorable[$cp] = 1;
    }
}
close FH;
# read IdentifierStatus.txt
open FH, "< $UNICODE/security/IdentifierStatus.txt" or die "can't open UCD file IdentifierStatus.txt\n";
push @versionInfo, "";
while (<FH>) {
chomp;
unless (/\xef\xbb\xbf/) {
push @versionInfo, $_;
}
last if /Generated:/;
s/\xef\xbb\xbf//;
push @versionInfo, $_;
last if /Date:/;
}
while (<FH>) {
if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+[^ ]+\s+;\s+([^ ]+)/) {
my $xidmod = $3;
warn "unknown Identifier Modification $xidmod" unless exists $xidmodCode{$xidmod};
$xidmod = $xidmodCode{$xidmod};
if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+Allowed/) {
my $start = hex "0x$1";
my $end = (defined $2) ? hex "0x$2" : $start;
for (my $i = $start; $i <= $end; ++$i) {
$xidmod[$i] = $xidmod;
$idtype[$i] = $mappedIdType{'Allowed'};
}
}
}
close FH;
# read IdentifierType.txt, to find Aspirational characters
#
# The leading lines of the file, up to and including the one containing
# "Date:", are copied into @versionInfo (with any UTF-8 byte-order mark
# removed). Each data line then lists a code point or start..end range and
# one or more identifier types. Every type name is checked against %idType
# so that unknown names produce a warning. Code points that are Aspirational
# and not also Exclusion, Not_XID or Not_NFKC are recorded in @idtype as
# $mappedIdType{'Aspirational'}.
open FH, "< $UNICODE/security/IdentifierType.txt" or die "can't open UCD file IdentifierType.txt\n";
push @versionInfo, "";
while (<FH>) {
    chomp;
    s/\xef\xbb\xbf//;
    push @versionInfo, $_;
    last if /Date:/;
}
while (<FH>) {
    # The ".." range suffix occurs at most once, so it is optional (?) rather
    # than repeatable (*).
    if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))?\s+;\s+([^#]+)/) {
        my ($first, $last, $types) = ($1, $2, $3);
        # $types still carries the padding before the "#" comment, or the
        # newline when the line has no comment. split ' ' skips leading
        # whitespace and splits on any run of whitespace, so neither produces
        # an empty or newline-terminated token that would be reported as an
        # unknown type.
        foreach my $type (split ' ', $types) {
            warn "unknown Identifier Type $type" unless exists $idType{$type};
        }
        my $start = hex("0x$first");
        my $end = defined($last) ? hex("0x$last") : $start;
        if ($types =~ /Aspirational/ and (not $types =~ /Exclusion|Not_XID|Not_NFKC/)) {
            for (my $i = $start; $i <= $end; ++$i) {
                $idtype[$i] = $mappedIdType{'Aspirational'};
            }
        }
    }
}
@ -617,8 +681,8 @@ while (<FH>) {
}
close FH;
# read VerticalOrientation-13.txt
open FH, "< $UNICODE/vertical/VerticalOrientation-13.txt" or die "can't open UTR50 data file VerticalOrientation-13.txt\n";
# read VerticalOrientation-15.txt
open FH, "< $UNICODE/vertical/VerticalOrientation-15.txt" or die "can't open UTR50 data file VerticalOrientation-15.txt\n";
push @versionInfo, "";
while (<FH>) {
chomp;
@ -738,14 +802,15 @@ sub sprintCharProps2_short
{
my $usv = shift;
return sprintf("{%d,%d},",
$verticalOrientation[$usv], $xidmod[$usv]);
$verticalOrientation[$usv], $idtype[$usv]);
}
$type = q|
struct nsCharProps2 {
// Currently only 6 bits are defined here, so 2 more could be added without
// affecting the storage requirements for this struct.
// Currently only 4 bits are defined here, so 4 more could be added without
// affecting the storage requirements for this struct. Or we could pack two
// records per byte, at the cost of a slightly more complex accessor.
unsigned char mVertOrient:2;
unsigned char mXidmod:4;
unsigned char mIdType:2;
};
|;
&genTables("#if ENABLE_INTL_API", "#endif",
@ -754,23 +819,31 @@ struct nsCharProps2 {
sub sprintCharProps2_full
{
my $usv = shift;
return sprintf("{%d,%d,%d,%d,%d,%d,%d,%d,%d},",
return sprintf("{%d,%d,%d,%d,%d,%d,%d,%d,%d,%d},",
$script[$usv], $pairedBracketType[$usv],
$eastAsianWidthFWH[$usv], $category[$usv],
$bidicategory[$usv], $xidmod[$usv], $numericvalue[$usv],
$verticalOrientation[$usv], $lineBreak[$usv]);
$idtype[$usv], $defaultIgnorable[$usv], $bidicategory[$usv],
$verticalOrientation[$usv], $lineBreak[$usv],
$numericvalue[$usv]);
}
$type = q|
// This struct currently requires 5 bytes. We try to ensure that whole-byte
// fields will not straddle byte boundaries, to optimize access to them.
struct nsCharProps2 {
unsigned char mScriptCode:8;
// -- byte boundary --
unsigned char mPairedBracketType:2;
unsigned char mEastAsianWidthFWH:1;
unsigned char mCategory:5;
// -- byte boundary --
unsigned char mIdType:2;
unsigned char mDefaultIgnorable:1;
unsigned char mBidiCategory:5;
unsigned char mXidmod:4;
signed char mNumericValue:5;
// -- byte boundary --
unsigned char mVertOrient:2;
unsigned char mLineBreak; // only 6 bits actually needed
unsigned char mLineBreak:6;
// -- byte boundary --
signed char mNumericValue; // only 5 bits are actually needed here
};
|;
&genTables("#if !ENABLE_INTL_API", "#endif",

Просмотреть файл

@ -56,16 +56,18 @@ GetCharProps2(uint32_t aCh)
static const nsCharProps2 undefined = {
#if ENABLE_INTL_API
VERTICAL_ORIENTATION_R,
XIDMOD_NOT_CHARS
0 // IdentifierType
#else
uint8_t(Script::UNKNOWN),
PAIRED_BRACKET_TYPE_NONE,
0, // EastAsianWidthFWH
HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED,
0, // IdentifierType
0, // DefaultIgnorable
eCharType_LeftToRight,
XIDMOD_NOT_CHARS,
-1, // Numeric Value
VERTICAL_ORIENTATION_R
VERTICAL_ORIENTATION_R,
0, // LineBreak
-1 // Numeric Value
#endif
};
return undefined;
@ -300,6 +302,7 @@ bool IsEastAsianWidthFWH(uint32_t aCh)
{
return GetCharProps2(aCh).mEastAsianWidthFWH;
}
#endif
#define DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(prefix_) \

Просмотреть файл

@ -40,20 +40,13 @@ enum PairedBracketType {
PAIRED_BRACKET_TYPE_CLOSE = 2
};
enum XidmodType {
XIDMOD_RECOMMENDED,
XIDMOD_INCLUSION,
XIDMOD_UNCOMMON_USE,
XIDMOD_TECHNICAL,
XIDMOD_OBSOLETE,
XIDMOD_ASPIRATIONAL,
XIDMOD_LIMITED_USE,
XIDMOD_EXCLUSION,
XIDMOD_NOT_XID,
XIDMOD_NOT_NFKC,
XIDMOD_DEFAULT_IGNORABLE,
XIDMOD_DEPRECATED,
XIDMOD_NOT_CHARS
/* Flags for Unicode security IdentifierType.txt attributes. Only a subset
of these are currently checked by Gecko, so we only define flags for the
ones we need.
The numeric values correspond to the %mappedIdType table in the character
property table generator script, which writes them into the mIdType field
of nsCharProps2. */
enum IdentifierType {
// Default for every code point that the generator does not mark otherwise.
IDTYPE_RESTRICTED = 0,
// Code points listed as Allowed in security/IdentifierStatus.txt.
IDTYPE_ALLOWED = 1,
// Code points listed as Aspirational in security/IdentifierType.txt that
// are not also Exclusion, Not_XID or Not_NFKC.
IDTYPE_ASPIRATIONAL = 2,
};
#if ENABLE_INTL_API // ICU is available, so simply forward to its API
@ -172,6 +165,12 @@ IsEastAsianWidthFWH(uint32_t aCh)
return false;
}
// Return whether the char is default-ignorable, by querying ICU's
// UCHAR_DEFAULT_IGNORABLE_CODE_POINT binary property.
inline bool
IsDefaultIgnorable(uint32_t aCh)
{
return u_hasBinaryProperty(aCh, UCHAR_DEFAULT_IGNORABLE_CODE_POINT);
}
#else // not ENABLE_INTL_API
// Return whether the char has a mirrored-pair counterpart.
@ -211,6 +210,12 @@ uint32_t GetTitlecaseForAll(uint32_t aCh); // maps both UC and LC to titlecase
// Return whether the char has EastAsianWidth class F or W or H.
bool IsEastAsianWidthFWH(uint32_t aCh);
// Return whether the char is default-ignorable, as recorded in the
// mDefaultIgnorable bit of its nsCharProps2 record.
inline bool IsDefaultIgnorable(uint32_t aCh)
{
return GetCharProps2(aCh).mDefaultIgnorable;
}
#endif // !ENABLE_INTL_API
// returns the simplified Gen Category as defined in nsIUGenCategory
@ -222,8 +227,8 @@ inline VerticalOrientation GetVerticalOrientation(uint32_t aCh) {
return VerticalOrientation(GetCharProps2(aCh).mVertOrient);
}
inline XidmodType GetIdentifierModification(uint32_t aCh) {
return XidmodType(GetCharProps2(aCh).mXidmod);
inline IdentifierType GetIdentifierType(uint32_t aCh) {
return IdentifierType(GetCharProps2(aCh).mIdType);
}
uint32_t GetFullWidth(uint32_t aCh);

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -11,13 +11,13 @@
*/
/*
* Created on Wed Oct 26 09:12:45 2016 from UCD data files with version info:
* Created on Fri Nov 11 17:42:07 2016 from UCD data files with version info:
*
# Date: 2015-06-16, 20:24:00 GMT [KW]
#
# Unicode Character Database
# Copyright (c) 1991-2015 Unicode, Inc.
# Date: 2016-06-20, 14:59:00 GMT [KW]
# © 2016 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
# For documentation, see the following:
@ -25,41 +25,44 @@
# UAX #38, "Unicode Han Database (Unihan)"
# UAX #44, "Unicode Character Database."
#
# The UAXes can be accessed at http://www.unicode.org/versions/Unicode8.0.0/
# The UAXes can be accessed at http://www.unicode.org/versions/Unicode9.0.0/
This directory contains the final data files
for the Unicode Character Database, for Version 8.0.0 of the Unicode
Standard.
for the Unicode Character Database, for Version 9.0.0 of the Unicode Standard.
# Scripts-9.0.0.txt
# Date: 2016-06-01, 10:34:37 GMT
# Scripts-8.0.0.txt
# Date: 2015-03-11, 22:29:42 GMT [MD]
# BidiMirroring-9.0.0.txt
# Date: 2016-01-21, 22:00:00 GMT [KW, LI]
# BidiMirroring-8.0.0.txt
# Date: 2015-01-20, 18:30:00 GMT [KW, LI]
# BidiBrackets-9.0.0.txt
# Date: 2016-06-07, 22:30:00 GMT [AG, LI, KW]
# BidiBrackets-8.0.0.txt
# Date: 2015-01-20, 19:00:00 GMT [AG, LI, KW]
# HangulSyllableType-9.0.0.txt
# Date: 2016-03-02, 18:55:01 GMT
# HangulSyllableType-8.0.0.txt
# Date: 2014-12-16, 23:07:45 GMT [MD]
# LineBreak-9.0.0.txt
# Date: 2016-05-26, 01:00:00 GMT [KW, LI]
# LineBreak-8.0.0.txt
# Date: 2015-02-13, 09:15:00 GMT [KW, LI]
# EastAsianWidth-9.0.0.txt
# Date: 2016-05-27, 17:00:00 GMT [KW, LI]
# EastAsianWidth-8.0.0.txt
# Date: 2015-02-10, 21:00:00 GMT [KW, LI]
# DerivedCoreProperties-9.0.0.txt
# Date: 2016-06-01, 10:34:24 GMT
# File: xidmodifications.txt
# Version: 8.0.0
# Generated: 2015-05-17, 03:09:04 GMT
# IdentifierStatus.txt
# Date: 2016-06-16, 13:41:30 GMT
# IdentifierType.txt
# Date: 2016-06-16, 13:41:30 GMT
#
# Unihan_Variants.txt
# Date: 2015-04-30 18:38:20 GMT [JHJ]
# Date: 2016-06-01 07:01:48 GMT [JHJ]
# VerticalOrientation-13.txt
# Date: 2014-09-03, 17:30:00 GMT [EM, KI, LI]
# VerticalOrientation-15.txt
# Date: 2015-11-16, 20:00:00 GMT [EM, KI, LI]
*
* * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
@ -83,26 +86,34 @@ struct nsCharProps1 {
#if ENABLE_INTL_API
struct nsCharProps2 {
// Currently only 6 bits are defined here, so 2 more could be added without
// affecting the storage requirements for this struct.
// Currently only 4 bits are defined here, so 4 more could be added without
// affecting the storage requirements for this struct. Or we could pack two
// records per byte, at the cost of a slightly more complex accessor.
unsigned char mVertOrient:2;
unsigned char mXidmod:4;
unsigned char mIdType:2;
};
#endif
#if !ENABLE_INTL_API
// This struct currently requires 5 bytes. We try to ensure that whole-byte
// fields will not straddle byte boundaries, to optimize access to them.
struct nsCharProps2 {
unsigned char mScriptCode:8;
// -- byte boundary --
unsigned char mPairedBracketType:2;
unsigned char mEastAsianWidthFWH:1;
unsigned char mCategory:5;
// -- byte boundary --
unsigned char mIdType:2;
unsigned char mDefaultIgnorable:1;
unsigned char mBidiCategory:5;
unsigned char mXidmod:4;
signed char mNumericValue:5;
// -- byte boundary --
unsigned char mVertOrient:2;
unsigned char mLineBreak; // only 6 bits actually needed
unsigned char mLineBreak:6;
// -- byte boundary --
signed char mNumericValue; // only 5 bits are actually needed here
};
#endif
@ -279,8 +290,16 @@ enum class Script {
MULTANI = 164,
PAU_CIN_HAU = 165,
SIDDHAM = 166,
ADLAM = 167,
BHAIKSUKI = 168,
MARCHEN = 169,
NEWA = 170,
OSAGE = 171,
HAN_WITH_BOPOMOFO = 172,
JAMO = 173,
SYMBOLS_EMOJI = 174,
NUM_SCRIPT_CODES = 167,
NUM_SCRIPT_CODES = 175,
INVALID = -1
};

Просмотреть файл

@ -814,12 +814,11 @@ bool nsIDNService::isLabelSafe(const nsAString &label)
}
// Check for restricted characters; aspirational scripts are permitted
XidmodType xm = GetIdentifierModification(ch);
if (xm != XIDMOD_RECOMMENDED &&
xm != XIDMOD_INCLUSION &&
xm != XIDMOD_ASPIRATIONAL) {
IdentifierType idType = GetIdentifierType(ch);
if (idType == IDTYPE_RESTRICTED) {
return false;
}
MOZ_ASSERT(idType == IDTYPE_ALLOWED || idType == IDTYPE_ASPIRATIONAL);
// Check for mixed script
Script script = GetScriptCode(ch);