Bug 1281448 - part 1+2 - Update character property table generator script for Unicode 9 (in particular, security/xidmodifications.txt is replaced by security/IdentifierStatus.txt and IdentifierType.txt), and adjust APIs to fit the new identifier-type property model; update the generated data files. r=m_kato

2016-11-14 09:23:49 +00:00 · 2016-11-14 09:23:49 +00:00 · b809e13f8d
--- a/gfx/thebes/gfxFont.cpp
+++ b/gfx/thebes/gfxFont.cpp
@ -724,10 +724,9 @@ gfxShapedText::SetGlyphs(uint32_t aIndex, CompressedGlyph aGlyph,
 #define ZWNJ 0x200C
 #define ZWJ  0x200D
 static inline bool
-IsDefaultIgnorable(uint32_t aChar)
+IsIgnorable(uint32_t aChar)
 {
-    return GetIdentifierModification(aChar) == XIDMOD_DEFAULT_IGNORABLE ||
-           aChar == ZWNJ || aChar == ZWJ;
+    return (IsDefaultIgnorable(aChar)) || aChar == ZWNJ || aChar == ZWJ;
 }

 void
@ -743,7 +742,7 @@ gfxShapedText::SetMissingGlyph(uint32_t aIndex, uint32_t aChar, gfxFont *aFont)
    DetailedGlyph *details = AllocateDetailedGlyphs(aIndex, 1);

    details->mGlyphID = aChar;
-    if (IsDefaultIgnorable(aChar)) {
+    if (IsIgnorable(aChar)) {
        // Setting advance width to zero will prevent drawing the hexbox
        details->mAdvance = 0;
    } else {
@ -761,7 +760,7 @@ gfxShapedText::SetMissingGlyph(uint32_t aIndex, uint32_t aChar, gfxFont *aFont)
 bool
 gfxShapedText::FilterIfIgnorable(uint32_t aIndex, uint32_t aCh)
 {
-    if (IsDefaultIgnorable(aCh)) {
+    if (IsIgnorable(aCh)) {
        // There are a few default-ignorables of Letter category (currently,
        // just the Hangul filler characters) that we'd better not discard
        // if they're followed by additional characters in the same cluster.
--- a/intl/unicharutil/tools/genUnicodePropertyData.pl
+++ b/intl/unicharutil/tools/genUnicodePropertyData.pl
@ -23,19 +23,21 @@
 #       - HangulSyllableType.txt
 #       - LineBreak.txt
 #       - EastAsianWidth.txt
+#       - DerivedCoreProperties.txt
 #       - ReadMe.txt (to record version/date of the UCD)
 #       - Unihan_Variants.txt (from Unihan.zip)
 #     though this may change if we find a need for additional properties.
 #
 #     The Unicode data files listed above should be together in one directory.
 #
-#     We also require the file
-#        http://www.unicode.org/Public/security/latest/xidmodifications.txt
-#     This file should be in a sub-directory "security" immediately below the
+#     We also require the files
+#        http://www.unicode.org/Public/security/latest/IdentifierStatus.txt
+#        http://www.unicode.org/Public/security/latest/IdentifierType.txt
+#     These files should be in a sub-directory "security" immediately below the
 #        directory containing the other Unicode data files.
 #
-#     We also require the latest data file for UTR50, currently revision-13:
-#        http://www.unicode.org/Public/vertical/revision-13/VerticalOrientation-13.txt
+#     We also require the latest data file for UTR50, currently revision-15:
+#        http://www.unicode.org/Public/vertical/revision-15/VerticalOrientation-15.txt
 #     This file should be in a sub-directory "vertical" immediately below the
 #        directory containing the other Unicode data files.
 #
@ -140,20 +142,35 @@ sub readIcuHeader

 die "didn't find ICU script codes\n" if $sc == -1;

-my %xidmodCode = (
-'Recommended'       => 0,
-'Inclusion'         => 1,
-'Uncommon_Use'      => 2,
-'Technical'         => 3,
-'Obsolete'          => 4,
-'Aspirational'      => 5,
-'Limited_Use'       => 6,
-'Exclusion'         => 7,
-'Not_XID'           => 8,
-'Not_NFKC'          => 9,
-'Default_Ignorable' => 10,
-'Deprecated'        => 11,
-'not-chars'         => 12
+# We don't currently store these values; %idType is used only to check that
+# properties listed in the IdentifierType.txt file are recognized. We record
+# only the %mappedIdType values that are used by nsIDNService::isLabelSafe.
+# In practice, it would be sufficient for us to read only the last value in
+# IdentifierType.txt, but we check that all values are known so that we'll get
+# a warning if future updates introduce new ones, and can consider whether
+# they need to be taken into account.
+my %idType = (
+  "Not_Character"     => 0,
+  "Recommended"       => 1,
+  "Inclusion"         => 2,
+  "Uncommon_Use"      => 3,
+  "Technical"         => 4,
+  "Obsolete"          => 5,
+  "Aspirational"      => 6,
+  "Limited_Use"       => 7,
+  "Exclusion"         => 8,
+  "Not_XID"           => 9,
+  "Not_NFKC"          => 10,
+  "Default_Ignorable" => 11,
+  "Deprecated"        => 12
+);
+
+# These match the IdentifierType enum in nsUnicodeProperties.h.
+my %mappedIdType = (
+  "Restricted"   => 0,
+  "Allowed"      => 1,
+  "Aspirational" => 2 # for Aspirational characters that are not excluded
+                      # by another attribute.
 );

 my %bidicategoryCode = (
@ -229,7 +246,10 @@ my %lineBreakCode = ( # ordering matches ICU's ULineBreak enum
  "CP" => 36,
  "CJ" => 37,
  "HL" => 38,
-  "RI" => 39
+  "RI" => 39,
+  "EB" => 40,
+  "EM" => 41,
+  "ZWJ" => 42
 );

 my %eastAsianWidthCode = (
@ -249,7 +269,7 @@ my @mirror;
 my @pairedBracketType;
 my @hangul;
 my @casemap;
-my @xidmod;
+my @idtype;
 my @numericvalue;
 my @hanVariant;
 my @bidicategory;
@ -258,13 +278,14 @@ my @fullWidthInverse;
 my @verticalOrientation;
 my @lineBreak;
 my @eastAsianWidthFWH;
+my @defaultIgnorable;
 for (my $i = 0; $i < 0x110000; ++$i) {
    $script[$i] = $scriptCode{"UNKNOWN"};
    $category[$i] = $catCode{"UNASSIGNED"};
    $combining[$i] = 0;
    $pairedBracketType[$i] = 0;
    $casemap[$i] = 0;
-    $xidmod[$i] = $xidmodCode{"not-chars"};
+    $idtype[$i] = $mappedIdType{'Restricted'};
    $numericvalue[$i] = -1;
    $hanVariant[$i] = 0;
    $bidicategory[$i] = $bidicategoryCode{"L"};
@ -273,6 +294,7 @@ for (my $i = 0; $i < 0x110000; ++$i) {
    $verticalOrientation[$i] = 1; # default for unlisted codepoints is 'R'
    $lineBreak[$i] = $lineBreakCode{"XX"};
    $eastAsianWidthFWH[$i] = 0;
+    $defaultIgnorable[$i] = 0;
 }

 # blocks where the default for bidi category is not L
@ -557,25 +579,67 @@ while (<FH>) {
 }
 close FH;

-# read xidmodifications.txt
-open FH, "< $UNICODE/security/xidmodifications.txt" or die "can't open UCD file xidmodifications.txt\n";
+# read DerivedCoreProperties.txt (for Default-Ignorables)
+open FH, "< $UNICODE/DerivedCoreProperties.txt" or die "can't open UCD file DerivedCoreProperties.txt\n";
+push @versionInfo, "";
+while (<FH>) {
+    chomp;
+    push @versionInfo, $_;
+    last if /Date:/;
+}
+while (<FH>) {
+    s/#.*//;
+    if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*Default_Ignorable_Code_Point/) {
+        my $start = hex "0x$1";
+        my $end = (defined $2) ? hex "0x$2" : $start;
+        for (my $i = $start; $i <= $end; ++$i) {
+            $defaultIgnorable[$i] = 1;
+        }
+    }
+}
+close FH;
+
+# read IdentifierStatus.txt
+open FH, "< $UNICODE/security/IdentifierStatus.txt" or die "can't open UCD file IdentifierStatus.txt\n";
 push @versionInfo, "";
 while (<FH>) {
  chomp;
-  unless (/\xef\xbb\xbf/) {
-    push @versionInfo, $_;
-  }
-  last if /Generated:/;
+  s/\xef\xbb\xbf//;
+  push @versionInfo, $_;
+  last if /Date:/;
 }
 while (<FH>) {
-  if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+[^ ]+\s+;\s+([^ ]+)/) {
-    my $xidmod = $3;
-    warn "unknown Identifier Modification $xidmod" unless exists $xidmodCode{$xidmod};
-    $xidmod = $xidmodCode{$xidmod};
+  if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+Allowed/) {
    my $start = hex "0x$1";
    my $end = (defined $2) ? hex "0x$2" : $start;
    for (my $i = $start; $i <= $end; ++$i) {
-      $xidmod[$i] = $xidmod;
+      $idtype[$i] = $mappedIdType{'Allowed'};
+    }
+  }
+}
+close FH;
+
+# read IdentifierType.txt, to find Aspirational characters
+open FH, "< $UNICODE/security/IdentifierType.txt" or die "can't open UCD file IdentifierType.txt\n";
+push @versionInfo, "";
+while (<FH>) {
+  chomp;
+  s/\xef\xbb\xbf//;
+  push @versionInfo, $_;
+  last if /Date:/;
+}
+while (<FH>) {
+  if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+([^#]+)/) {
+    my $idtype = $3;
+    foreach (split(/ /, $idtype)) {
+      warn "unknown Identifier Type $_" unless exists $idType{$_};
+    }
+    my $start = hex "0x$1";
+    my $end = (defined $2) ? hex "0x$2" : $start;
+    if ($idtype =~ /Aspirational/ and (not $idtype =~ /Exclusion|Not_XID|Not_NFKC/)) {
+      for (my $i = $start; $i <= $end; ++$i) {
+        $idtype[$i] = $mappedIdType{'Aspirational'};
+      }
    }
  }
 }
@ -617,8 +681,8 @@ while (<FH>) {
 }
 close FH;

-# read VerticalOrientation-13.txt
-open FH, "< $UNICODE/vertical/VerticalOrientation-13.txt" or die "can't open UTR50 data file VerticalOrientation-13.txt\n";
+# read VerticalOrientation-15.txt
+open FH, "< $UNICODE/vertical/VerticalOrientation-15.txt" or die "can't open UTR50 data file VerticalOrientation-15.txt\n";
 push @versionInfo, "";
 while (<FH>) {
    chomp;
@ -738,14 +802,15 @@ sub sprintCharProps2_short
 {
  my $usv = shift;
  return sprintf("{%d,%d},",
-                 $verticalOrientation[$usv], $xidmod[$usv]);
+                 $verticalOrientation[$usv], $idtype[$usv]);
 }
 $type = q|
 struct nsCharProps2 {
-  // Currently only 6 bits are defined here, so 2 more could be added without
-  // affecting the storage requirements for this struct.
+  // Currently only 4 bits are defined here, so 4 more could be added without
+  // affecting the storage requirements for this struct. Or we could pack two
+  // records per byte, at the cost of a slightly more complex accessor.
  unsigned char mVertOrient:2;
-  unsigned char mXidmod:4;
+  unsigned char mIdType:2;
 };
 |;
 &genTables("#if ENABLE_INTL_API", "#endif",
@ -754,23 +819,31 @@ struct nsCharProps2 {
 sub sprintCharProps2_full
 {
  my $usv = shift;
-  return sprintf("{%d,%d,%d,%d,%d,%d,%d,%d,%d},",
+  return sprintf("{%d,%d,%d,%d,%d,%d,%d,%d,%d,%d},",
                 $script[$usv], $pairedBracketType[$usv],
                 $eastAsianWidthFWH[$usv], $category[$usv],
-                 $bidicategory[$usv], $xidmod[$usv], $numericvalue[$usv],
-                 $verticalOrientation[$usv], $lineBreak[$usv]);
+                 $idtype[$usv], $defaultIgnorable[$usv], $bidicategory[$usv],
+                 $verticalOrientation[$usv], $lineBreak[$usv],
+                 $numericvalue[$usv]);
 }
 $type = q|
+// This struct currently requires 5 bytes. We try to ensure that whole-byte
+// fields will not straddle byte boundaries, to optimize access to them.
 struct nsCharProps2 {
  unsigned char mScriptCode:8;
+  // -- byte boundary --
  unsigned char mPairedBracketType:2;
  unsigned char mEastAsianWidthFWH:1;
  unsigned char mCategory:5;
+  // -- byte boundary --
+  unsigned char mIdType:2;
+  unsigned char mDefaultIgnorable:1;
  unsigned char mBidiCategory:5;
-  unsigned char mXidmod:4;
-  signed char   mNumericValue:5;
+  // -- byte boundary --
  unsigned char mVertOrient:2;
-  unsigned char mLineBreak; // only 6 bits actually needed
+  unsigned char mLineBreak:6;
+  // -- byte boundary --
+  signed char   mNumericValue; // only 5 bits are actually needed here
 };
 |;
 &genTables("#if !ENABLE_INTL_API", "#endif",
--- a/intl/unicharutil/util/nsUnicodeProperties.cpp
+++ b/intl/unicharutil/util/nsUnicodeProperties.cpp
@ -56,16 +56,18 @@ GetCharProps2(uint32_t aCh)
    static const nsCharProps2 undefined = {
 #if ENABLE_INTL_API
        VERTICAL_ORIENTATION_R,
-        XIDMOD_NOT_CHARS
+        0 // IdentifierType
 #else
        uint8_t(Script::UNKNOWN),
        PAIRED_BRACKET_TYPE_NONE,
        0, // EastAsianWidthFWH
        HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED,
+        0, // IdentifierType
+        0, // DefaultIgnorable
        eCharType_LeftToRight,
-        XIDMOD_NOT_CHARS,
-        -1, // Numeric Value
-        VERTICAL_ORIENTATION_R
+        VERTICAL_ORIENTATION_R,
+        0, // LineBreak
+        -1 // Numeric Value
 #endif
    };
    return undefined;
@ -300,6 +302,7 @@ bool IsEastAsianWidthFWH(uint32_t aCh)
 {
    return GetCharProps2(aCh).mEastAsianWidthFWH;
 }
+
 #endif

 #define DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(prefix_) \
--- a/intl/unicharutil/util/nsUnicodeProperties.h
+++ b/intl/unicharutil/util/nsUnicodeProperties.h
@ -40,20 +40,13 @@ enum PairedBracketType {
  PAIRED_BRACKET_TYPE_CLOSE = 2
 };

-enum XidmodType {
-  XIDMOD_RECOMMENDED,
-  XIDMOD_INCLUSION,
-  XIDMOD_UNCOMMON_USE,
-  XIDMOD_TECHNICAL,
-  XIDMOD_OBSOLETE,
-  XIDMOD_ASPIRATIONAL,
-  XIDMOD_LIMITED_USE,
-  XIDMOD_EXCLUSION,
-  XIDMOD_NOT_XID,
-  XIDMOD_NOT_NFKC,
-  XIDMOD_DEFAULT_IGNORABLE,
-  XIDMOD_DEPRECATED,
-  XIDMOD_NOT_CHARS
+/* Simplified identifier classes derived from the Unicode security data files
+   IdentifierStatus.txt and IdentifierType.txt. Gecko currently checks only a
+   subset of those attributes, so we only define values for the ones we need. */
+enum IdentifierType {
+  IDTYPE_RESTRICTED = 0,
+  IDTYPE_ALLOWED = 1,
+  IDTYPE_ASPIRATIONAL = 2,
 };

 #if ENABLE_INTL_API // ICU is available, so simply forward to its API
@ -172,6 +165,12 @@ IsEastAsianWidthFWH(uint32_t aCh)
  return false;
 }

+inline bool
+IsDefaultIgnorable(uint32_t aCh)
+{
+  return u_hasBinaryProperty(aCh, UCHAR_DEFAULT_IGNORABLE_CODE_POINT);
+}
+
 #else // not ENABLE_INTL_API

 // Return whether the char has a mirrored-pair counterpart.
@ -211,6 +210,12 @@ uint32_t GetTitlecaseForAll(uint32_t aCh); // maps both UC and LC to titlecase
 // Return whether the char has EastAsianWidth class F or W or H.
 bool IsEastAsianWidthFWH(uint32_t aCh);

+// Return whether the char has the Default_Ignorable_Code_Point property.
+inline bool IsDefaultIgnorable(uint32_t aCh)
+{
+  return GetCharProps2(aCh).mDefaultIgnorable;
+}
+
 #endif // !ENABLE_INTL_API

 // returns the simplified Gen Category as defined in nsIUGenCategory
@ -222,8 +227,8 @@ inline VerticalOrientation GetVerticalOrientation(uint32_t aCh) {
  return VerticalOrientation(GetCharProps2(aCh).mVertOrient);
 }

-inline XidmodType GetIdentifierModification(uint32_t aCh) {
-  return XidmodType(GetCharProps2(aCh).mXidmod);
+inline IdentifierType GetIdentifierType(uint32_t aCh) {
+  return IdentifierType(GetCharProps2(aCh).mIdType);
 }

 uint32_t GetFullWidth(uint32_t aCh);
--- a/intl/unicharutil/util/nsUnicodePropertyData.cpp
+++ b/intl/unicharutil/util/nsUnicodePropertyData.cpp
--- a/intl/unicharutil/util/nsUnicodeScriptCodes.h
+++ b/intl/unicharutil/util/nsUnicodeScriptCodes.h
@ -11,13 +11,13 @@
 */

 /*
- * Created on Wed Oct 26 09:12:45 2016 from UCD data files with version info:
+ * Created on Fri Nov 11 17:42:07 2016 from UCD data files with version info:
 *

-# Date: 2015-06-16, 20:24:00 GMT [KW]
-#
 # Unicode Character Database
-# Copyright (c) 1991-2015 Unicode, Inc.
+# Date: 2016-06-20, 14:59:00 GMT [KW]
+# © 2016 Unicode®, Inc.
+# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
 # For terms of use, see http://www.unicode.org/terms_of_use.html
 #
 # For documentation, see the following:
@ -25,41 +25,44 @@
 # UAX #38, "Unicode Han Database (Unihan)"
 # UAX #44, "Unicode Character Database."
 #
-# The UAXes can be accessed at http://www.unicode.org/versions/Unicode8.0.0/
+# The UAXes can be accessed at http://www.unicode.org/versions/Unicode9.0.0/

 This directory contains the final data files
-for the Unicode Character Database, for Version 8.0.0 of the Unicode
-Standard.
+for the Unicode Character Database, for Version 9.0.0 of the Unicode Standard.

+# Scripts-9.0.0.txt
+# Date: 2016-06-01, 10:34:37 GMT

-# Scripts-8.0.0.txt
-# Date: 2015-03-11, 22:29:42 GMT [MD]
+# BidiMirroring-9.0.0.txt
+# Date: 2016-01-21, 22:00:00 GMT [KW, LI]

-# BidiMirroring-8.0.0.txt
-# Date: 2015-01-20, 18:30:00 GMT [KW, LI]
+# BidiBrackets-9.0.0.txt
+# Date: 2016-06-07, 22:30:00 GMT [AG, LI, KW]

-# BidiBrackets-8.0.0.txt
-# Date: 2015-01-20, 19:00:00 GMT [AG, LI, KW]
+# HangulSyllableType-9.0.0.txt
+# Date: 2016-03-02, 18:55:01 GMT

-# HangulSyllableType-8.0.0.txt
-# Date: 2014-12-16, 23:07:45 GMT [MD]
+# LineBreak-9.0.0.txt
+# Date: 2016-05-26, 01:00:00 GMT [KW, LI]

-# LineBreak-8.0.0.txt
-# Date: 2015-02-13, 09:15:00 GMT [KW, LI]
+# EastAsianWidth-9.0.0.txt
+# Date: 2016-05-27, 17:00:00 GMT [KW, LI]

-# EastAsianWidth-8.0.0.txt
-# Date: 2015-02-10, 21:00:00 GMT [KW, LI]
+# DerivedCoreProperties-9.0.0.txt
+# Date: 2016-06-01, 10:34:24 GMT

-# File: xidmodifications.txt
-# Version: 8.0.0
-# Generated: 2015-05-17, 03:09:04 GMT
+# IdentifierStatus.txt
+# Date: 2016-06-16, 13:41:30 GMT
+
+# IdentifierType.txt
+# Date: 2016-06-16, 13:41:30 GMT

 #
 # Unihan_Variants.txt
-# Date: 2015-04-30 18:38:20 GMT [JHJ]
+# Date: 2016-06-01 07:01:48 GMT [JHJ]

-# VerticalOrientation-13.txt
-# Date: 2014-09-03, 17:30:00 GMT [EM, KI, LI]
+# VerticalOrientation-15.txt
+# Date: 2015-11-16, 20:00:00 GMT [EM, KI, LI]

 *
 * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
@ -83,26 +86,34 @@ struct nsCharProps1 {
 #if ENABLE_INTL_API

 struct nsCharProps2 {
-  // Currently only 6 bits are defined here, so 2 more could be added without
-  // affecting the storage requirements for this struct.
+  // Currently only 4 bits are defined here, so 4 more could be added without
+  // affecting the storage requirements for this struct. Or we could pack two
+  // records per byte, at the cost of a slightly more complex accessor.
  unsigned char mVertOrient:2;
-  unsigned char mXidmod:4;
+  unsigned char mIdType:2;
 };

 #endif

 #if !ENABLE_INTL_API

+// This struct currently requires 5 bytes. We try to ensure that whole-byte
+// fields will not straddle byte boundaries, to optimize access to them.
 struct nsCharProps2 {
  unsigned char mScriptCode:8;
+  // -- byte boundary --
  unsigned char mPairedBracketType:2;
  unsigned char mEastAsianWidthFWH:1;
  unsigned char mCategory:5;
+  // -- byte boundary --
+  unsigned char mIdType:2;
+  unsigned char mDefaultIgnorable:1;
  unsigned char mBidiCategory:5;
-  unsigned char mXidmod:4;
-  signed char   mNumericValue:5;
+  // -- byte boundary --
  unsigned char mVertOrient:2;
-  unsigned char mLineBreak; // only 6 bits actually needed
+  unsigned char mLineBreak:6;
+  // -- byte boundary --
+  signed char   mNumericValue; // only 5 bits are actually needed here
 };

 #endif
@ -279,8 +290,16 @@ enum class Script {
  MULTANI = 164,
  PAU_CIN_HAU = 165,
  SIDDHAM = 166,
+  ADLAM = 167,
+  BHAIKSUKI = 168,
+  MARCHEN = 169,
+  NEWA = 170,
+  OSAGE = 171,
+  HAN_WITH_BOPOMOFO = 172,
+  JAMO = 173,
+  SYMBOLS_EMOJI = 174,

-  NUM_SCRIPT_CODES = 167,
+  NUM_SCRIPT_CODES = 175,

  INVALID = -1
 };
--- a/netwerk/dns/nsIDNService.cpp
+++ b/netwerk/dns/nsIDNService.cpp
@ -814,12 +814,11 @@ bool nsIDNService::isLabelSafe(const nsAString &label)
    }

    // Check for restricted characters; aspirational scripts are permitted
-    XidmodType xm = GetIdentifierModification(ch);
-    if (xm != XIDMOD_RECOMMENDED &&
-        xm != XIDMOD_INCLUSION &&
-        xm != XIDMOD_ASPIRATIONAL) {
+    IdentifierType idType = GetIdentifierType(ch);
+    if (idType == IDTYPE_RESTRICTED) {
      return false;
    }
+    MOZ_ASSERT(idType == IDTYPE_ALLOWED || idType == IDTYPE_ASPIRATIONAL);

    // Check for mixed script
    Script script = GetScriptCode(ch);