Bug 724538 - When ICU is available in the build, replace most of nsCharProps2 fields with ICU property accessors. r=emk

2016-01-13 15:45:22 +00:00 · 2016-01-13 15:45:22 +00:00 · 06f42574aa
--- a/gfx/thebes/gfxFontconfigFonts.cpp
+++ b/gfx/thebes/gfxFontconfigFonts.cpp
@ -1625,9 +1625,14 @@ gfxPangoFontGroup::FindFontForChar(uint32_t aCh, uint32_t aPrevCh,
        nextFont = 1;
    }

-    // Pango, GLib, and Thebes (but not harfbuzz!) all happen to use the same
-    // script codes, so we can just cast the value here.
-    const PangoScript script = static_cast<PangoScript>(aRunScript);
+    // Our MOZ_SCRIPT_* codes may not match the PangoScript enumeration values
+    // (if we're using ICU's codes), so convert by mapping through ISO 15924 tag.
+    // Note that PangoScript is defined to be compatible with GUnicodeScript:
+    // https://developer.gnome.org/pango/stable/pango-Scripts-and-Languages.html#PangoScript
+    const hb_tag_t scriptTag = GetScriptTagForCode(aRunScript);
+    const PangoScript script =
+      static_cast<PangoScript>(g_unicode_script_from_iso15924(scriptTag));
+
    // Might be nice to call pango_language_includes_script only once for the
    // run rather than for each character.
    PangoLanguage *scriptLang;
@ -1654,19 +1659,6 @@ gfxPangoFontGroup::FindFontForChar(uint32_t aCh, uint32_t aPrevCh,
    return nullptr;
 }

-// Sanity-check: spot-check a few constants to confirm that Thebes and
-// Pango script codes really do match
-#define CHECK_SCRIPT_CODE(script) \
-    PR_STATIC_ASSERT(int32_t(MOZ_SCRIPT_##script) == \
-                     int32_t(PANGO_SCRIPT_##script))
-
-CHECK_SCRIPT_CODE(COMMON);
-CHECK_SCRIPT_CODE(INHERITED);
-CHECK_SCRIPT_CODE(ARABIC);
-CHECK_SCRIPT_CODE(LATIN);
-CHECK_SCRIPT_CODE(UNKNOWN);
-CHECK_SCRIPT_CODE(NKO);
-
 /**
 ** gfxFcFont
 **/
--- a/gfx/thebes/gfxScriptItemizer.cpp
+++ b/gfx/thebes/gfxScriptItemizer.cpp
@ -158,19 +158,11 @@ gfxScriptItemizer::Next(uint32_t& aRunStart, uint32_t& aRunLimit,
            }
        }

-        // Get the nsCharProps2 record for the current character,
-        // so we can read the script and (if needed) the gen category
-        // without needing to do two multi-level lookups.
-        // NOTE that this means we're relying on an implementation detail
-        // of the nsUnicodeProperties tables, and might have to revise this
-        // if the nsCharProps records used there are modified in future.
-        const nsCharProps2& charProps = GetCharProps2(ch);
-
        // Initialize gc to UNASSIGNED; we'll only set it to the true GC
        // if the character has script=COMMON, otherwise we don't care.
        uint8_t gc = HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED;

-        sc = charProps.mScriptCode;
+        sc = GetScriptCode(ch);
        if (sc == MOZ_SCRIPT_COMMON) {
            /*
             * Paired character handling:
@ -183,7 +175,7 @@ gfxScriptItemizer::Next(uint32_t& aRunStart, uint32_t& aRunLimit,
             * We only do this if the script is COMMON; for chars with
             * specific script assignments, we just use them as-is.
             */
-            gc = charProps.mCategory;
+            gc = GetGeneralCategory(ch);
            if (gc == HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION) {
                uint32_t endPairChar = mozilla::unicode::GetMirroredChar(ch);
                if (endPairChar != ch) {
--- a/intl/unicharutil/tools/genUnicodePropertyData.pl
+++ b/intl/unicharutil/tools/genUnicodePropertyData.pl
@ -40,8 +40,9 @@
 #
 # (2) Run this tool using a command line of the form
 #
-#         perl genUnicodePropertyData.pl \
-#                 /path/to/harfbuzz/src  \
+#         perl genUnicodePropertyData.pl      \
+#                 /path/to/harfbuzz/src       \
+#                 /path/to/icu/common/unicode \
 #                 /path/to/UCD-directory
 #
 #     This will generate (or overwrite!) the files
@ -54,15 +55,17 @@
 use strict;
 use List::Util qw(first);

-if ($#ARGV != 1) {
+if ($#ARGV != 2) {
    print <<__EOT;
 # Run this tool using a command line of the form
 #
-#     perl genUnicodePropertyData.pl \\
-#             /path/to/harfbuzz/src  \\
+#     perl genUnicodePropertyData.pl      \\
+#             /path/to/harfbuzz/src       \\
+#             /path/to/icu/common/unicode \\
 #             /path/to/UCD-directory
 #
 # where harfbuzz/src is the directory containing harfbuzz .cc and .hh files,
+# icu/common/unicode is the directory containing ICU 'common' public headers,
 # and UCD-directory is a directory containing the current Unicode Character
 # Database files (UnicodeData.txt, etc), available from
 # http://www.unicode.org/Public/UNIDATA/, with additional resources as
@ -78,190 +81,20 @@ __EOT
    exit 0;
 }

-# load HB_Script and HB_Category constants
+my $HARFBUZZ = $ARGV[0];
+my $ICU = $ARGV[1];
+my $UNICODE = $ARGV[2];

-# NOTE that HB_SCRIPT_* constants are now "tag" values, NOT sequentially-allocated
-# script codes as used by Glib/Pango/etc.
-# We therefore define a set of MOZ_SCRIPT_* constants that are script _codes_
-# compatible with those libraries, and map these to HB_SCRIPT_* _tags_ as needed.
+# load HB_Category constants

-# CHECK that this matches Pango source (as found for example at 
-# http://git.gnome.org/browse/pango/tree/pango/pango-script.h)
-# for as many codes as that defines (currently up through Unicode 5.1)
-# and the GLib enumeration
-# http://developer.gnome.org/glib/2.30/glib-Unicode-Manipulation.html#GUnicodeScript
-# (currently defined up through Unicode 6.0).
-# Constants beyond these may be regarded as unstable for now, but we don't actually
-# depend on the specific values.
-my %scriptCode = (
-  INVALID => -1,
-  COMMON => 0,
-  INHERITED => 1,
-  ARABIC => 2,
-  ARMENIAN => 3,
-  BENGALI => 4,
-  BOPOMOFO => 5,
-  CHEROKEE => 6,
-  COPTIC => 7,
-  CYRILLIC => 8,
-  DESERET => 9,
-  DEVANAGARI => 10,
-  ETHIOPIC => 11,
-  GEORGIAN => 12,
-  GOTHIC => 13,
-  GREEK => 14,
-  GUJARATI => 15,
-  GURMUKHI => 16,
-  HAN => 17,
-  HANGUL => 18,
-  HEBREW => 19,
-  HIRAGANA => 20,
-  KANNADA => 21,
-  KATAKANA => 22,
-  KHMER => 23,
-  LAO => 24,
-  LATIN => 25,
-  MALAYALAM => 26,
-  MONGOLIAN => 27,
-  MYANMAR => 28,
-  OGHAM => 29,
-  OLD_ITALIC => 30,
-  ORIYA => 31,
-  RUNIC => 32,
-  SINHALA => 33,
-  SYRIAC => 34,
-  TAMIL => 35,
-  TELUGU => 36,
-  THAANA => 37,
-  THAI => 38,
-  TIBETAN => 39,
-  CANADIAN_ABORIGINAL => 40,
-  YI => 41,
-  TAGALOG => 42,
-  HANUNOO => 43,
-  BUHID => 44,
-  TAGBANWA => 45,
-# unicode 4.0 additions
-  BRAILLE => 46,
-  CYPRIOT => 47,
-  LIMBU => 48,
-  OSMANYA => 49,
-  SHAVIAN => 50,
-  LINEAR_B => 51,
-  TAI_LE => 52,
-  UGARITIC => 53,
-# unicode 4.1 additions
-  NEW_TAI_LUE => 54,
-  BUGINESE => 55,
-  GLAGOLITIC => 56,
-  TIFINAGH => 57,
-  SYLOTI_NAGRI => 58,
-  OLD_PERSIAN => 59,
-  KHAROSHTHI => 60,
-# unicode 5.0 additions
-  UNKNOWN => 61,
-  BALINESE => 62,
-  CUNEIFORM => 63,
-  PHOENICIAN => 64,
-  PHAGS_PA => 65,
-  NKO => 66,
-# unicode 5.1 additions
-  KAYAH_LI => 67,
-  LEPCHA => 68,
-  REJANG => 69,
-  SUNDANESE => 70,
-  SAURASHTRA => 71,
-  CHAM => 72,
-  OL_CHIKI => 73,
-  VAI => 74,
-  CARIAN => 75,
-  LYCIAN => 76,
-  LYDIAN => 77,
-# unicode 5.2 additions
-  AVESTAN => 78,
-  BAMUM => 79,
-  EGYPTIAN_HIEROGLYPHS => 80,
-  IMPERIAL_ARAMAIC => 81,
-  INSCRIPTIONAL_PAHLAVI => 82,
-  INSCRIPTIONAL_PARTHIAN => 83,
-  JAVANESE => 84,
-  KAITHI => 85,
-  LISU => 86,
-  MEETEI_MAYEK => 87,
-  OLD_SOUTH_ARABIAN => 88,
-  OLD_TURKIC => 89,
-  SAMARITAN => 90,
-  TAI_THAM => 91,
-  TAI_VIET => 92,
-# unicode 6.0 additions
-  BATAK => 93,
-  BRAHMI => 94,
-  MANDAIC => 95,
-# unicode 6.1 additions
-  CHAKMA => 96,
-  MEROITIC_CURSIVE => 97,
-  MEROITIC_HIEROGLYPHS => 98,
-  MIAO => 99,
-  SHARADA => 100,
-  SORA_SOMPENG => 101,
-  TAKRI => 102,
-# unicode 7.0 additions
-  BASSA_VAH => 103,
-  CAUCASIAN_ALBANIAN => 104,
-  DUPLOYAN => 105,
-  ELBASAN => 106,
-  GRANTHA => 107,
-  KHOJKI => 108,
-  KHUDAWADI => 109,
-  LINEAR_A => 110,
-  MAHAJANI => 111,
-  MANICHAEAN => 112,
-  MENDE_KIKAKUI => 113,
-  MODI => 114,
-  MRO => 115,
-  NABATAEAN => 116,
-  OLD_NORTH_ARABIAN => 117,
-  OLD_PERMIC => 118,
-  PAHAWH_HMONG => 119,
-  PALMYRENE => 120,
-  PAU_CIN_HAU => 121,
-  PSALTER_PAHLAVI => 122,
-  SIDDHAM => 123,
-  TIRHUTA => 124,
-  WARANG_CITI => 125,
-# unicode 8.0 additions
-  AHOM => 126,
-  ANATOLIAN_HIEROGLYPHS => 127,
-  HATRAN => 128,
-  MULTANI => 129,
-  OLD_HUNGARIAN => 130,
-  SIGNWRITING => 131,
-
-# additional "script" code, not from Unicode (but matches ISO 15924's Zmth tag)
-  MATHEMATICAL_NOTATION => 132,
-);
-
-my $sc = -1;
 my $cc = -1;
 my %catCode;
-my @scriptCodeToTag;
-my @scriptCodeToName;

 sub readHarfBuzzHeader
 {
    my $file = shift;
-    open FH, "< $ARGV[0]/$file" or die "can't open harfbuzz header $ARGV[0]/$file\n";
+    open FH, "< $HARFBUZZ/$file" or die "can't open harfbuzz header $HARFBUZZ/$file\n";
    while (<FH>) {
-        s/CANADIAN_SYLLABICS/CANADIAN_ABORIGINAL/; # harfbuzz and unicode disagree on this name :(
-        if (m/HB_SCRIPT_([A-Z_]+)\s*=\s*HB_TAG\s*\(('.','.','.','.')\)\s*,/) {
-            unless (exists $scriptCode{$1}) {
-                warn "unknown script name $1 found in $file\n";
-                next;
-            }
-            $sc = $scriptCode{$1};
-            $scriptCodeToTag[$sc] = $2;
-            $scriptCodeToName[$sc] = $1;
-        }
        if (m/HB_UNICODE_GENERAL_CATEGORY_([A-Z_]+)/) {
            $cc++;
            $catCode{$1} = $cc;
@ -270,16 +103,40 @@ sub readHarfBuzzHeader
    close FH;
 }

-&readHarfBuzzHeader("hb-common.h");
 &readHarfBuzzHeader("hb-unicode.h");

-die "didn't find HarfBuzz script codes\n" if $sc == -1;
 die "didn't find HarfBuzz category codes\n" if $cc == -1;

-# Additional code not present in HarfBuzz headers:
-$sc = $scriptCode{"MATHEMATICAL_NOTATION"};
-$scriptCodeToTag[$sc] = "'Z','m','t','h'";
-$scriptCodeToName[$sc] = "MATHEMATICAL_NOTATION";
+my %scriptCode;
+my @scriptCodeToTag;
+my @scriptCodeToName;
+
+my $sc = -1;
+
+sub readIcuHeader
+{
+    my $file = shift;
+    open FH, "< $ICU/$file" or die "can't open ICU header $ICU/$file\n";
+    while (<FH>) {
+        # adjust for ICU vs UCD naming discrepancies
+        s/LANNA/TAI_THAM/;
+        s/MEITEI_MAYEK/MEETEI_MAYEK/;
+        s/ORKHON/OLD_TURKIC/;
+        s/MENDE/MENDE_KIKAKUI/;
+        s/SIGN_WRITING/SIGNWRITING/;
+        if (m|USCRIPT_([A-Z_]+)\s*=\s*([0-9]+),\s*/\*\s*([A-Z][a-z]{3})\s*\*/|) {
+            $sc = $2;
+            $scriptCode{$1} = $sc;
+            $scriptCodeToTag[$sc] = $3;
+            $scriptCodeToName[$sc] = $1;
+        }
+    }
+    close FH;
+}
+
+&readIcuHeader("uscript.h");
+
+die "didn't find ICU script codes\n" if $sc == -1;

 my %xidmodCode = (
 'Recommended'       => 0,
@ -317,9 +174,9 @@ my %bidicategoryCode = (
  "PDF" => "16", # Pop Directional Format
  "NSM" => "17", # Non-Spacing Mark
  "BN"  => "18", # Boundary Neutral
-  "LRI" => "19", # Left-to-Right Isolate
-  "RLI" => "20", # Right-to-left Isolate
-  "FSI" => "21", # First Strong Isolate
+  "FSI" => "19", # First Strong Isolate
+  "LRI" => "20", # Left-to-Right Isolate
+  "RLI" => "21", # Right-to-left Isolate
  "PDI" => "22"  # Pop Direcitonal Isolate
 );

@ -404,7 +261,7 @@ my %ucd2hb = (

 # read ReadMe.txt
 my @versionInfo;
-open FH, "< $ARGV[1]/ReadMe.txt" or die "can't open Unicode ReadMe.txt file\n";
+open FH, "< $UNICODE/ReadMe.txt" or die "can't open Unicode ReadMe.txt file\n";
 while (<FH>) {
    chomp;
    push @versionInfo, $_;
@ -418,7 +275,7 @@ my $kLowerToUpper = 0x10000000;
 my $kCaseMapCharMask = 0x001fffff;

 # read UnicodeData.txt
-open FH, "< $ARGV[1]/UnicodeData.txt" or die "can't open UCD file UnicodeData.txt\n";
+open FH, "< $UNICODE/UnicodeData.txt" or die "can't open UCD file UnicodeData.txt\n";
 while (<FH>) {
    chomp;
    my @fields = split /;/;
@ -490,7 +347,7 @@ while (<FH>) {
 close FH;

 # read Scripts.txt
-open FH, "< $ARGV[1]/Scripts.txt" or die "can't open UCD file Scripts.txt\n";
+open FH, "< $UNICODE/Scripts.txt" or die "can't open UCD file Scripts.txt\n";
 push @versionInfo, "";
 while (<FH>) {
    chomp;
@ -500,8 +357,8 @@ while (<FH>) {
 while (<FH>) {
    if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+([^ ]+)/) {
        my $script = uc($3);
-        warn "unknown script $script" unless exists $scriptCode{$script};
+        warn "unknown ICU script $script" unless exists $scriptCode{$script};
        $script = $scriptCode{$script};
        my $start = hex "0x$1";
        my $end = (defined $2) ? hex "0x$2" : $start;
        for (my $i = $start; $i <= $end; ++$i) {
@ -515,7 +372,7 @@ close FH;
 my @offsets = ();
 push @offsets, 0;

-open FH, "< $ARGV[1]/BidiMirroring.txt" or die "can't open UCD file BidiMirroring.txt\n";
+open FH, "< $UNICODE/BidiMirroring.txt" or die "can't open UCD file BidiMirroring.txt\n";
 push @versionInfo, "";
 while (<FH>) {
    chomp;
@ -543,7 +400,7 @@ my %pairedBracketTypeCode = (
  'O' => 1,
  'C' => 2
 );
-open FH, "< $ARGV[1]/BidiBrackets.txt" or die "can't open UCD file BidiBrackets.txt\n";
+open FH, "< $UNICODE/BidiBrackets.txt" or die "can't open UCD file BidiBrackets.txt\n";
 push @versionInfo, "";
 while (<FH>) {
    chomp;
@ -570,7 +427,7 @@ my %hangulType = (
  'LV'  => 0x03,
  'LVT' => 0x07
 );
-open FH, "< $ARGV[1]/HangulSyllableType.txt" or die "can't open UCD file HangulSyllableType.txt\n";
+open FH, "< $UNICODE/HangulSyllableType.txt" or die "can't open UCD file HangulSyllableType.txt\n";
 push @versionInfo, "";
 while (<FH>) {
    chomp;
@ -593,7 +450,7 @@ while (<FH>) {
 close FH;

 # read xidmodifications.txt
-open FH, "< $ARGV[1]/security/xidmodifications.txt" or die "can't open UCD file xidmodifications.txt\n";
+open FH, "< $UNICODE/security/xidmodifications.txt" or die "can't open UCD file xidmodifications.txt\n";
 push @versionInfo, "";
 while (<FH>) {
  chomp;
@ -616,7 +473,7 @@ while (<FH>) {
 }
 close FH;

-open FH, "< $ARGV[1]/Unihan_Variants.txt" or die "can't open UCD file Unihan_Variants.txt (from Unihan.zip)\n";
+open FH, "< $UNICODE/Unihan_Variants.txt" or die "can't open UCD file Unihan_Variants.txt (from Unihan.zip)\n";
 push @versionInfo, "";
 while (<FH>) {
  chomp;
@ -653,7 +510,7 @@ while (<FH>) {
 close FH;

 # read VerticalOrientation-13.txt
-open FH, "< $ARGV[1]/vertical/VerticalOrientation-13.txt" or die "can't open UTR50 data file VerticalOrientation-13.txt\n";
+open FH, "< $UNICODE/vertical/VerticalOrientation-13.txt" or die "can't open UTR50 data file VerticalOrientation-13.txt\n";
 push @versionInfo, "";
 while (<FH>) {
    chomp;
@ -732,21 +589,25 @@ $versionInfo

 __END

+print DATA_TABLES "#if !ENABLE_INTL_API\n";
 print DATA_TABLES "static const uint32_t sScriptCodeToTag[] = {\n";
 for (my $i = 0; $i < scalar @scriptCodeToTag; ++$i) {
-  printf DATA_TABLES "  HB_TAG(%s)", $scriptCodeToTag[$i];
+  printf DATA_TABLES "  HB_TAG('%c','%c','%c','%c')", unpack('cccc', $scriptCodeToTag[$i]);
  print DATA_TABLES $i < $#scriptCodeToTag ? ",\n" : "\n";
 }
-print DATA_TABLES "};\n\n";
+print DATA_TABLES "};\n";
+print DATA_TABLES "#endif\n\n";

 our $totalData = 0;

+print DATA_TABLES "#if !ENABLE_INTL_API\n";
 print DATA_TABLES "static const int16_t sMirrorOffsets[] = {\n";
 for (my $i = 0; $i < scalar @offsets; ++$i) {
    printf DATA_TABLES "  $offsets[$i]";
    print DATA_TABLES $i < $#offsets ? ",\n" : "\n";
 }
-print DATA_TABLES "};\n\n";
+print DATA_TABLES "};\n";
+print DATA_TABLES "#endif\n\n";

 print HEADER "#pragma pack(1)\n\n";

@ -762,11 +623,26 @@ struct nsCharProps1 {
  unsigned char mCombiningClass:8;
 };
 /;
-print DATA_TABLES "#ifndef ENABLE_INTL_API\n";
-&genTables("CharProp1", $type, "nsCharProps1", 11, 5, \&sprintCharProps1, 1, 2, 1);
-print DATA_TABLES "#endif\n\n";
+&genTables("#if !ENABLE_INTL_API", "#endif",
+           "CharProp1", $type, "nsCharProps1", 11, 5, \&sprintCharProps1, 1, 2, 1);

-sub sprintCharProps2
+sub sprintCharProps2_short
+{
+  my $usv = shift;
+  return sprintf("{%d,%d,%d},",
+                 $pairedBracketType[$usv], $verticalOrientation[$usv], $xidmod[$usv]);
+}
+$type = q/
+struct nsCharProps2 {
+  unsigned char mPairedBracketType:2;
+  unsigned char mVertOrient:2;
+  unsigned char mXidmod:4;
+};
+/;
+&genTables("#if ENABLE_INTL_API", "#endif",
+           "CharProp2", $type, "nsCharProps2", 9, 7, \&sprintCharProps2_short, 16, 1, 1);
+
+sub sprintCharProps2_full
 {
  my $usv = shift;
  return sprintf("{%d,%d,%d,%d,%d,%d,%d},",
@ -785,7 +661,8 @@ struct nsCharProps2 {
  unsigned char mVertOrient:2;
 };
 |;
-&genTables("CharProp2", $type, "nsCharProps2", 11, 5, \&sprintCharProps2, 16, 4, 1);
+&genTables("#if !ENABLE_INTL_API", "#endif",
+           "CharProp2", $type, "nsCharProps2", 11, 5, \&sprintCharProps2_full, 16, 4, 1);

 print HEADER "#pragma pack()\n\n";

@ -800,21 +677,22 @@ sub sprintHanVariants
  }
  return sprintf("0x%02x,", $val);
 }
-&genTables("HanVariant", "", "uint8_t", 9, 7, \&sprintHanVariants, 2, 1, 4);
+## Han Variant data currently unused but may be needed in future, see bug 857481
+## &genTables("", "", "HanVariant", "", "uint8_t", 9, 7, \&sprintHanVariants, 2, 1, 4);

 sub sprintFullWidth
 {
  my $usv = shift;
  return sprintf("0x%04x,", $fullWidth[$usv]);
 }
-&genTables("FullWidth", "", "uint16_t", 10, 6, \&sprintFullWidth, 0, 2, 1);
+&genTables("", "", "FullWidth", "", "uint16_t", 10, 6, \&sprintFullWidth, 0, 2, 1);

 sub sprintCasemap
 {
  my $usv = shift;
  return sprintf("0x%08x,", $casemap[$usv]);
 }
-&genTables("CaseMap", "", "uint32_t", 11, 5, \&sprintCasemap, 1, 4, 1);
+&genTables("", "", "CaseMap", "", "uint32_t", 11, 5, \&sprintCasemap, 1, 4, 1);

 print STDERR "Total data = $totalData\n";

@ -826,8 +704,16 @@ printf DATA_TABLES "const uint32_t kCaseMapCharMask = 0x%08x;\n\n", $kCaseMapCha

 sub genTables
 {
-  my ($prefix, $typedef, $type, $indexBits, $charBits, $func, $maxPlane, $bytesPerEntry, $charsPerEntry) = @_;
+  my ($guardBegin, $guardEnd,
+      $prefix, $typedef, $type, $indexBits, $charBits, $func, $maxPlane, $bytesPerEntry, $charsPerEntry) = @_;

+  if ($typedef ne '') {
+    print HEADER "$guardBegin\n";
+    print HEADER "$typedef\n";
+    print HEADER "$guardEnd\n\n";
+  }
+
+  print DATA_TABLES "\n$guardBegin\n";
  print DATA_TABLES "#define k${prefix}MaxPlane  $maxPlane\n";
  print DATA_TABLES "#define k${prefix}IndexBits $indexBits\n";
  print DATA_TABLES "#define k${prefix}CharBits  $charBits\n";
@ -888,8 +774,6 @@ sub genTables
  }
  print DATA_TABLES "};\n\n";

-  print HEADER "$typedef\n\n" if $typedef ne '';
-
  my $pageLen = $charsPerPage / $charsPerEntry;
  print DATA_TABLES "static const $type s${prefix}Values[$chCount][$pageLen] = {\n";
  for (my $i = 0; $i < scalar @char; ++$i) {
@ -897,7 +781,8 @@ sub genTables
    print DATA_TABLES $char[$i];
    print DATA_TABLES $i < $#char ? "},\n" : "}\n";
  }
-  print DATA_TABLES "};\n\n";
+  print DATA_TABLES "};\n";
+  print DATA_TABLES "$guardEnd\n";

  my $dataSize = $pmCount * $indexLen * $pmBits/8 +
                 $chCount * $pageLen * $bytesPerEntry + 
--- a/intl/unicharutil/util/nsBidiUtils.h
+++ b/intl/unicharutil/util/nsBidiUtils.h
@ -14,7 +14,8 @@
    *  for the detailed definition of the following categories
    *
    *  The values here must match the equivalents in %bidicategorycode in
-    *  mozilla/intl/unicharutil/tools/genUnicodePropertyData.pl
+    *  mozilla/intl/unicharutil/tools/genUnicodePropertyData.pl,
+    *  and must also match the values used by ICU's UCharDirection.
    */

 enum nsCharType   { 
@ -37,9 +38,9 @@ enum nsCharType   {
  eCharType_PopDirectionalFormat     = 16,
  eCharType_DirNonSpacingMark        = 17,
  eCharType_BoundaryNeutral          = 18,
-  eCharType_LeftToRightIsolate       = 19,
-  eCharType_RightToLeftIsolate       = 20,
-  eCharType_FirstStrongIsolate       = 21,
+  eCharType_FirstStrongIsolate       = 19,
+  eCharType_LeftToRightIsolate       = 20,
+  eCharType_RightToLeftIsolate       = 21,
  eCharType_PopDirectionalIsolate    = 22,
  eCharType_CharTypeCount
 };
--- a/intl/unicharutil/util/nsUnicodeProperties.cpp
+++ b/intl/unicharutil/util/nsUnicodeProperties.cpp
@ -11,12 +11,12 @@

 #if ENABLE_INTL_API
 #include "unicode/uchar.h"
+#include "unicode/uscript.h"
 #endif

 #define UNICODE_BMP_LIMIT 0x10000
 #define UNICODE_LIMIT     0x110000

-
 #ifndef ENABLE_INTL_API
 static const nsCharProps1&
 GetCharProps1(uint32_t aCh)
@ -56,14 +56,21 @@ GetCharProps2(uint32_t aCh)

    NS_NOTREACHED("Getting CharProps for codepoint outside Unicode range");
    // Default values for unassigned
+    using namespace mozilla::unicode;
    static const nsCharProps2 undefined = {
-        MOZ_SCRIPT_UNKNOWN,                      // Script code
-        0,                                       // East Asian Width
-        HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED,  // General Category
-        eCharType_LeftToRight,                   // Bidi Category
-        mozilla::unicode::XIDMOD_NOT_CHARS,      // Xidmod
-        -1,                                      // Numeric Value
-        mozilla::unicode::HVT_NotHan             // Han variant
+#if ENABLE_INTL_API
+        PAIRED_BRACKET_TYPE_NONE,
+        VERTICAL_ORIENTATION_R,
+        XIDMOD_NOT_CHARS
+#else
+        MOZ_SCRIPT_UNKNOWN,
+        PAIRED_BRACKET_TYPE_NONE,
+        HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED,
+        eCharType_LeftToRight,
+        XIDMOD_NOT_CHARS,
+        -1, // Numeric Value
+        VERTICAL_ORIENTATION_R
+#endif
    };
    return undefined;
 }
@ -93,7 +100,7 @@ to provide the most compact storage, depending on the distribution
 of values.
 */

-nsIUGenCategory::nsUGenCategory sDetailedToGeneralCategory[] = {
+const nsIUGenCategory::nsUGenCategory sDetailedToGeneralCategory[] = {
  /*
   * The order here corresponds to the HB_UNICODE_GENERAL_CATEGORY_* constants
   * of the hb_unicode_general_category_t enum in gfx/harfbuzz/src/hb-unicode.h.
@ -130,6 +137,69 @@ nsIUGenCategory::nsUGenCategory sDetailedToGeneralCategory[] = {
  /* SPACE_SEPARATOR */     nsIUGenCategory::kSeparator
 };

+#if ENABLE_INTL_API
+const hb_unicode_general_category_t sICUtoHBcategory[U_CHAR_CATEGORY_COUNT] = {
+  HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED, // U_GENERAL_OTHER_TYPES = 0,
+  HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER, // U_UPPERCASE_LETTER = 1,
+  HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER, // U_LOWERCASE_LETTER = 2,
+  HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER, // U_TITLECASE_LETTER = 3,
+  HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER, // U_MODIFIER_LETTER = 4,
+  HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER, // U_OTHER_LETTER = 5,
+  HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK, // U_NON_SPACING_MARK = 6,
+  HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK, // U_ENCLOSING_MARK = 7,
+  HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK, // U_COMBINING_SPACING_MARK = 8,
+  HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER, // U_DECIMAL_DIGIT_NUMBER = 9,
+  HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER, // U_LETTER_NUMBER = 10,
+  HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER, // U_OTHER_NUMBER = 11,
+  HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR, // U_SPACE_SEPARATOR = 12,
+  HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR, // U_LINE_SEPARATOR = 13,
+  HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR, // U_PARAGRAPH_SEPARATOR = 14,
+  HB_UNICODE_GENERAL_CATEGORY_CONTROL, // U_CONTROL_CHAR = 15,
+  HB_UNICODE_GENERAL_CATEGORY_FORMAT, // U_FORMAT_CHAR = 16,
+  HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE, // U_PRIVATE_USE_CHAR = 17,
+  HB_UNICODE_GENERAL_CATEGORY_SURROGATE, // U_SURROGATE = 18,
+  HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION, // U_DASH_PUNCTUATION = 19,
+  HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION, // U_START_PUNCTUATION = 20,
+  HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION, // U_END_PUNCTUATION = 21,
+  HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION, // U_CONNECTOR_PUNCTUATION = 22,
+  HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION, // U_OTHER_PUNCTUATION = 23,
+  HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL, // U_MATH_SYMBOL = 24,
+  HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL, // U_CURRENCY_SYMBOL = 25,
+  HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL, // U_MODIFIER_SYMBOL = 26,
+  HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL, // U_OTHER_SYMBOL = 27,
+  HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION, // U_INITIAL_PUNCTUATION = 28,
+  HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION, // U_FINAL_PUNCTUATION = 29,
+};
+#endif
+
+uint8_t GetGeneralCategory(uint32_t aCh) {
+#if ENABLE_INTL_API
+  return sICUtoHBcategory[u_charType(aCh)];
+#else
+  return GetCharProps2(aCh).mCategory;
+#endif
+}
+
+nsCharType GetBidiCat(uint32_t aCh) {
+#if ENABLE_INTL_API
+  return nsCharType(u_charDirection(aCh));
+#else
+  return nsCharType(GetCharProps2(aCh).mBidiCategory);
+#endif
+}
+
+int8_t GetNumericValue(uint32_t aCh) {
+#if ENABLE_INTL_API
+  UNumericType type =
+    UNumericType(u_getIntPropertyValue(aCh, UCHAR_NUMERIC_TYPE));
+  return type == U_NT_DECIMAL || type == U_NT_DIGIT
+         ? int8_t(u_getNumericValue(aCh))
+         : -1;  
+#else
+  return GetCharProps2(aCh).mNumericValue;
+#endif
+}
+
 uint32_t
 GetMirroredChar(uint32_t aCh)
 {
@ -160,14 +230,30 @@ GetCombiningClass(uint32_t aCh)
 #endif
 }

+uint8_t
+GetScriptCode(uint32_t aCh)
+{
+#if ENABLE_INTL_API
+    UErrorCode err = U_ZERO_ERROR;
+    return uscript_getScript(aCh, &err);
+#else
+    return GetCharProps2(aCh).mScriptCode;
+#endif
+}
+
 uint32_t
 GetScriptTagForCode(int32_t aScriptCode)
 {
+#if ENABLE_INTL_API
+    const char* tag = uscript_getShortName(UScriptCode(aScriptCode));
+    return tag ? HB_TAG(tag[0], tag[1], tag[2], tag[3]) : 0; // null: invalid code
+#else
    // this will safely return 0 for negative script codes, too :)
-    if (uint32_t(aScriptCode) > ArrayLength(sScriptCodeToTag)) {
+    if (uint32_t(aScriptCode) >= ArrayLength(sScriptCodeToTag)) {
        return 0;
    }
    return sScriptCodeToTag[aScriptCode];
+#endif
 }

 PairedBracketType GetPairedBracketType(uint32_t aCh)
@ -254,6 +340,7 @@ GetTitlecaseForAll(uint32_t aCh)
    return aCh;
 }

+#if 0 // currently unused - bug 857481
 HanVariantType
 GetHanVariant(uint32_t aCh)
 {
@ -272,6 +359,7 @@ GetHanVariant(uint32_t aCh)
    // extract the appropriate 2-bit field from the value
    return HanVariantType((v >> ((aCh & 3) * 2)) & 3);
 }
+#endif

 uint32_t
 GetFullWidth(uint32_t aCh)
--- a/intl/unicharutil/util/nsUnicodeProperties.h
+++ b/intl/unicharutil/util/nsUnicodeProperties.h
@ -16,7 +16,7 @@ namespace mozilla {

 namespace unicode {

-extern nsIUGenCategory::nsUGenCategory sDetailedToGeneralCategory[];
+extern const nsIUGenCategory::nsUGenCategory sDetailedToGeneralCategory[];

 // Return whether the char has a mirrored-pair counterpart.
 uint32_t GetMirroredChar(uint32_t aCh);
@ -26,25 +26,19 @@ bool HasMirroredChar(uint32_t aChr);
 uint8_t GetCombiningClass(uint32_t aCh);

 // returns the detailed General Category in terms of HB_UNICODE_* values
-inline uint8_t GetGeneralCategory(uint32_t aCh) {
-  return GetCharProps2(aCh).mCategory;
-}
+uint8_t GetGeneralCategory(uint32_t aCh);

 // returns the simplified Gen Category as defined in nsIUGenCategory
 inline nsIUGenCategory::nsUGenCategory GetGenCategory(uint32_t aCh) {
  return sDetailedToGeneralCategory[GetGeneralCategory(aCh)];
 }

-inline uint8_t GetScriptCode(uint32_t aCh) {
-  return GetCharProps2(aCh).mScriptCode;
-}
+nsCharType GetBidiCat(uint32_t aCh);
+
+uint8_t GetScriptCode(uint32_t aCh);

 uint32_t GetScriptTagForCode(int32_t aScriptCode);

-inline nsCharType GetBidiCat(uint32_t aCh) {
-  return nsCharType(GetCharProps2(aCh).mBidiCategory);
-}
-
 /* This MUST match the values assigned by genUnicodePropertyData.pl! */
 enum VerticalOrientation {
  VERTICAL_ORIENTATION_U  = 0,
@ -93,10 +87,9 @@ inline XidmodType GetIdentifierModification(uint32_t aCh) {
 * To restrict to decimal digits, the caller should also check whether
 * GetGeneralCategory returns HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER
 */
-inline int8_t GetNumericValue(uint32_t aCh) {
-  return GetCharProps2(aCh).mNumericValue;
-}
+int8_t GetNumericValue(uint32_t aCh);

+#if 0 // currently unused - bug 857481
 enum HanVariantType {
  HVT_NotHan = 0x0,
  HVT_SimplifiedOnly = 0x1,
@ -105,6 +98,7 @@ enum HanVariantType {
 };

 HanVariantType GetHanVariant(uint32_t aCh);
+#endif

 uint32_t GetFullWidth(uint32_t aCh);