diff --git a/content/base/public/nsLineBreaker.h b/content/base/public/nsLineBreaker.h index 9fbaa8dbf68..a9503a588d6 100644 --- a/content/base/public/nsLineBreaker.h +++ b/content/base/public/nsLineBreaker.h @@ -40,6 +40,7 @@ #include "nsString.h" #include "nsTArray.h" +#include "nsILineBreaker.h" class nsIAtom; @@ -81,10 +82,7 @@ public: nsLineBreaker(); ~nsLineBreaker(); - static inline PRBool IsSpace(PRUnichar u) - { - return u == 0x0020 || u == 0x200b/*ZWSP*/ || u == '\n' || u == '\t'; - } + static inline PRBool IsSpace(PRUnichar u) { return NS_IsSpace(u); } static inline PRBool IsComplexASCIIChar(PRUnichar u) { diff --git a/intl/lwbrk/public/nsILineBreaker.h b/intl/lwbrk/public/nsILineBreaker.h index 94726ec521f..5430962379e 100644 --- a/intl/lwbrk/public/nsILineBreaker.h +++ b/intl/lwbrk/public/nsILineBreaker.h @@ -72,4 +72,18 @@ public: NS_DEFINE_STATIC_IID_ACCESSOR(nsILineBreaker, NS_ILINEBREAKER_IID) +static inline PRBool +NS_IsSpace(PRUnichar u) +{ + return u == 0x0020 || // SPACE + u == 0x0009 || // CHARACTER TABULATION + u == 0x000D || // CARRIAGE RETURN + (0x2000 <= u && u <= 0x2006) || // EN QUAD, EM QUAD, EN SPACE, + // EM SPACE, THREE-PER-EM SPACE, + // FOUR-PER-SPACE, SIX-PER-EM SPACE, + (0x2008 <= u && u <= 0x200B) || // PUNCTUATION SPACE, THIN SPACE, + // HAIR SPACE, ZERO WIDTH SPACE + u == 0x3000; // IDEOGRAPHIC SPACE +} + #endif /* nsILineBreaker_h__ */ diff --git a/intl/lwbrk/src/jisx4501class.h b/intl/lwbrk/src/jisx4501class.h index 7f7d7d9e813..e3032db9b21 100644 --- a/intl/lwbrk/src/jisx4501class.h +++ b/intl/lwbrk/src/jisx4501class.h @@ -43,104 +43,104 @@ static const PRUint32 gLBClass00[32] = { 0x55555555, // U+0008 - U+000F 0x55555555, // U+0010 - U+0017 0x55555555, // U+0018 - U+001F -0x88108815, // U+0020 - U+0027 -0x11118810, // U+0028 - U+002F +0x7AABAAA5, // U+0020 - U+0027 +0x7A7AAAA9, // U+0028 - U+002F 0x66666666, // U+0030 - U+0037 -0x11101866, // U+0038 - U+003F -0x88888888, // U+0040 - U+0047 -0x88888888, // U+0048 - U+004F 
-0x88888888, // U+0050 - U+0057 -0x88100888, // U+0058 - U+005F -0x88888888, // U+0060 - U+0067 -0x88888888, // U+0068 - U+006F -0x88888888, // U+0070 - U+0077 -0x88180888, // U+0078 - U+007F -0x88888888, // U+0080 - U+0087 -0x88888888, // U+0088 - U+008F -0x88888888, // U+0090 - U+0097 -0x88888888, // U+0098 - U+009F -0x88383488, // U+00A0 - U+00A7 -0x88888888, // U+00A8 - U+00AF -0x88888881, // U+00B0 - U+00B7 -0x88888888, // U+00B8 - U+00BF -0x88888888, // U+00C0 - U+00C7 -0x88888888, // U+00C8 - U+00CF -0x88888888, // U+00D0 - U+00D7 -0x88888888, // U+00D8 - U+00DF -0x88888888, // U+00E0 - U+00E7 -0x88888888, // U+00E8 - U+00EF -0x88888888, // U+00F0 - U+00F7 -0x88888888, // U+00F8 - U+00FF +0x1AA9AA66, // U+0038 - U+003F +0x77777777, // U+0040 - U+0047 +0x77777777, // U+0048 - U+004F +0x77777777, // U+0050 - U+0057 +0x77AA9777, // U+0058 - U+005F +0x77777777, // U+0060 - U+0067 +0x77777777, // U+0068 - U+006F +0x77777777, // U+0070 - U+0077 +0x7AAA9777, // U+0078 - U+007F +0x77777777, // U+0080 - U+0087 +0x77777777, // U+0088 - U+008F +0x77777777, // U+0090 - U+0097 +0x77777777, // U+0098 - U+009F +0xAA9A9AAB, // U+00A0 - U+00A7 +0x77A9A77A, // U+00A8 - U+00AF +0xAAAAAAAA, // U+00B0 - U+00B7 +0xAAAAAAAA, // U+00B8 - U+00BF +0x77777777, // U+00C0 - U+00C7 +0x77777777, // U+00C8 - U+00CF +0x77777777, // U+00D0 - U+00D7 +0x77777777, // U+00D8 - U+00DF +0x77777777, // U+00E0 - U+00E7 +0x77777777, // U+00E8 - U+00EF +0xA7777777, // U+00F0 - U+00F7 +0x77777777, // U+00F8 - U+00FF }; static const PRUint32 gLBClass20[32] = { -0x55555555, // U+2000 - U+2007 -0x88885555, // U+2008 - U+200F -0x88828888, // U+2010 - U+2017 -0x88888888, // U+2018 - U+201F -0x81118888, // U+2020 - U+2027 -0x88888888, // U+2028 - U+202F -0x88884444, // U+2030 - U+2037 -0x88815888, // U+2038 - U+203F -0x88818888, // U+2040 - U+2047 -0x88888888, // U+2048 - U+204F -0x88888888, // U+2050 - U+2057 -0x88888888, // U+2058 - U+205F -0x88888888, // U+2060 - U+2067 -0x88888888, // U+2068 - U+206F 
-0x88888888, // U+2070 - U+2077 -0x88888888, // U+2078 - U+207F -0x88888888, // U+2080 - U+2087 -0x88888888, // U+2088 - U+208F -0x88888888, // U+2090 - U+2097 -0x88888888, // U+2098 - U+209F -0x88888888, // U+20A0 - U+20A7 -0x88888888, // U+20A8 - U+20AF -0x88888888, // U+20B0 - U+20B7 -0x88888888, // U+20B8 - U+20BF -0x88888888, // U+20C0 - U+20C7 -0x88888888, // U+20C8 - U+20CF -0x88888888, // U+20D0 - U+20D7 -0x88888888, // U+20D8 - U+20DF -0x88888888, // U+20E0 - U+20E7 -0x88888888, // U+20E8 - U+20EF -0x88888888, // U+20F0 - U+20F7 -0x88888888, // U+20F8 - U+20FF +0xB5555555, // U+2000 - U+2007 +0x77775555, // U+2008 - U+200F +0x777211B1, // U+2010 - U+2017 +0x77777777, // U+2018 - U+201F +0xA2227777, // U+2020 - U+2027 +0xB7777777, // U+2028 - U+202F +0x77744444, // U+2030 - U+2037 +0x7A115107, // U+2038 - U+203F +0x11017777, // U+2040 - U+2047 +0x77777711, // U+2048 - U+204F +0x77777777, // U+2050 - U+2057 +0x77777777, // U+2058 - U+205F +0x77777777, // U+2060 - U+2067 +0x77777777, // U+2068 - U+206F +0x77777777, // U+2070 - U+2077 +0x77777777, // U+2078 - U+207F +0x77777777, // U+2080 - U+2087 +0x77777777, // U+2088 - U+208F +0x77777777, // U+2090 - U+2097 +0x77777777, // U+2098 - U+209F +0x77777777, // U+20A0 - U+20A7 +0x77777777, // U+20A8 - U+20AF +0x77777777, // U+20B0 - U+20B7 +0x77777777, // U+20B8 - U+20BF +0x77777777, // U+20C0 - U+20C7 +0x77777777, // U+20C8 - U+20CF +0x77777777, // U+20D0 - U+20D7 +0x77777777, // U+20D8 - U+20DF +0x77777777, // U+20E0 - U+20E7 +0x77777777, // U+20E8 - U+20EF +0x77777777, // U+20F0 - U+20F7 +0x77777777, // U+20F8 - U+20FF }; static const PRUint32 gLBClass21[32] = { -0x88888888, // U+2100 - U+2107 -0x88888888, // U+2108 - U+210F -0x83888888, // U+2110 - U+2117 -0x88888888, // U+2118 - U+211F -0x87888888, // U+2120 - U+2127 -0x88888888, // U+2128 - U+212F -0x88888888, // U+2130 - U+2137 -0x88888888, // U+2138 - U+213F -0x88888888, // U+2140 - U+2147 -0x88888888, // U+2148 - U+214F -0x88888888, // U+2150 - U+2157 
-0x88888888, // U+2158 - U+215F +0x77777777, // U+2100 - U+2107 +0x77777777, // U+2108 - U+210F +0x73777777, // U+2110 - U+2117 +0x77777777, // U+2118 - U+211F +0x77777777, // U+2120 - U+2127 +0x77777777, // U+2128 - U+212F +0x77777777, // U+2130 - U+2137 +0x77777777, // U+2138 - U+213F +0x77777777, // U+2140 - U+2147 +0x77777777, // U+2148 - U+214F +0x77777777, // U+2150 - U+2157 +0x77777777, // U+2158 - U+215F 0x55555555, // U+2160 - U+2167 0x55555555, // U+2168 - U+216F 0x55555555, // U+2170 - U+2177 0x55555555, // U+2178 - U+217F -0x88888888, // U+2180 - U+2187 -0x88888888, // U+2188 - U+218F -0x88888888, // U+2190 - U+2197 -0x88888888, // U+2198 - U+219F -0x88888888, // U+21A0 - U+21A7 -0x88888888, // U+21A8 - U+21AF -0x88888888, // U+21B0 - U+21B7 -0x88888888, // U+21B8 - U+21BF -0x88888888, // U+21C0 - U+21C7 -0x88888888, // U+21C8 - U+21CF -0x88888888, // U+21D0 - U+21D7 -0x88888888, // U+21D8 - U+21DF -0x88888888, // U+21E0 - U+21E7 -0x88888888, // U+21E8 - U+21EF -0x88888888, // U+21F0 - U+21F7 -0x88888888, // U+21F8 - U+21FF +0x77777777, // U+2180 - U+2187 +0x77777777, // U+2188 - U+218F +0x77777777, // U+2190 - U+2197 +0x77777777, // U+2198 - U+219F +0x77777777, // U+21A0 - U+21A7 +0x77777777, // U+21A8 - U+21AF +0x77777777, // U+21B0 - U+21B7 +0x77777777, // U+21B8 - U+21BF +0x77777777, // U+21C0 - U+21C7 +0x77777777, // U+21C8 - U+21CF +0x77777777, // U+21D0 - U+21D7 +0x77777777, // U+21D8 - U+21DF +0x77777777, // U+21E0 - U+21E7 +0x77777777, // U+21E8 - U+21EF +0x77777777, // U+21F0 - U+21F7 +0x77777777, // U+21F8 - U+21FF }; static const PRUint32 gLBClass30[32] = { @@ -179,37 +179,37 @@ static const PRUint32 gLBClass30[32] = { }; static const PRUint32 gLBClass0E[32] = { -0x99999999, // U+0E00 - U+0E07 -0x99999999, // U+0E08 - U+0E0F -0x99999999, // U+0E10 - U+0E17 -0x99999999, // U+0E18 - U+0E1F -0x99999999, // U+0E20 - U+0E27 -0x19999999, // U+0E28 - U+0E2F -0x99999999, // U+0E30 - U+0E37 -0x09999999, // U+0E38 - U+0E3F -0x91999999, // U+0E40 - 
U+0E47 -0x89999999, // U+0E48 - U+0E4F +0x88888888, // U+0E00 - U+0E07 +0x88888888, // U+0E08 - U+0E0F +0x88888888, // U+0E10 - U+0E17 +0x88888888, // U+0E18 - U+0E1F +0x88888888, // U+0E20 - U+0E27 +0x18888888, // U+0E28 - U+0E2F +0x88888888, // U+0E30 - U+0E37 +0x08888888, // U+0E38 - U+0E3F +0x81888888, // U+0E40 - U+0E47 +0x78888888, // U+0E48 - U+0E4F 0x66666666, // U+0E50 - U+0E57 -0x99991166, // U+0E58 - U+0E5F -0x99999999, // U+0E60 - U+0E67 -0x99999999, // U+0E68 - U+0E6F -0x99999999, // U+0E70 - U+0E77 -0x99999999, // U+0E78 - U+0E7F -0x99999999, // U+0E80 - U+0E87 -0x99999999, // U+0E88 - U+0E8F -0x99999999, // U+0E90 - U+0E97 -0x99999999, // U+0E98 - U+0E9F -0x99999999, // U+0EA0 - U+0EA7 -0x19999999, // U+0EA8 - U+0EAF -0x99999999, // U+0EB0 - U+0EB7 -0x99999999, // U+0EB8 - U+0EBF -0x91999999, // U+0EC0 - U+0EC7 -0x99999999, // U+0EC8 - U+0ECF +0x88881166, // U+0E58 - U+0E5F +0x88888888, // U+0E60 - U+0E67 +0x88888888, // U+0E68 - U+0E6F +0x88888888, // U+0E70 - U+0E77 +0x88888888, // U+0E78 - U+0E7F +0x88888888, // U+0E80 - U+0E87 +0x88888888, // U+0E88 - U+0E8F +0x88888888, // U+0E90 - U+0E97 +0x88888888, // U+0E98 - U+0E9F +0x88888888, // U+0EA0 - U+0EA7 +0x18888888, // U+0EA8 - U+0EAF +0x88888888, // U+0EB0 - U+0EB7 +0x88888888, // U+0EB8 - U+0EBF +0x81888888, // U+0EC0 - U+0EC7 +0x88888888, // U+0EC8 - U+0ECF 0x66666666, // U+0ED0 - U+0ED7 -0x99999966, // U+0ED8 - U+0EDF -0x99999999, // U+0EE0 - U+0EE7 -0x99999999, // U+0EE8 - U+0EEF -0x99999999, // U+0EF0 - U+0EF7 -0x99999999, // U+0EF8 - U+0EFF +0x88888866, // U+0ED8 - U+0EDF +0x88888888, // U+0EE0 - U+0EE7 +0x88888888, // U+0EE8 - U+0EEF +0x88888888, // U+0EF0 - U+0EF7 +0x88888888, // U+0EF8 - U+0EFF }; diff --git a/intl/lwbrk/src/nsJISx4501LineBreaker.cpp b/intl/lwbrk/src/nsJISx4501LineBreaker.cpp index 013e3774d2c..b1fd3ea6fba 100644 --- a/intl/lwbrk/src/nsJISx4501LineBreaker.cpp +++ b/intl/lwbrk/src/nsJISx4501LineBreaker.cpp @@ -66,78 +66,97 @@ 4 X X X X X X 5 X X X X X X 6 X X X X X X - 7 
X X X X X X X - 8 X X X X X X E + 7 X X X X X X X + 8 X X X X X X E 9 X X X X X X 10 X X X X X X 11 X X X X X X - 12 X X X X X X + 12 X X X X X X 13 X X X X X X X 14 X X X X X X X - 15 X X X X X X X X X + 15 X X X X X X X X X 16 X X X X X X X X - 17 X X X X X E - 18 X X X X X X X X X + 17 X X X X X E + 18 X X X X X X X X X 19 X E E E E E X X X X X X X X X X X X E X E E 20 X X X X X E * Same Char # Other Char - + X Cannot Break + The classes mean: + 1: Open parenthesis + 2: Close parenthesis + 3: Prohibit a line break before + 4: Punctuation for sentence end (except Full stop, e.g., "!" and "?") + 5: Middle dot (e.g., U+30FB KATAKANA MIDDLE DOT) + 6: Full stop + 7: Non-breakable between same characters + 8: Prefix (e.g., "$", "NO.") + 9: Postfix (e.g., "%") + 10: Ideographic space + 11: Hiragana + 12: Japanese characters (except class 11) + 13: Subscript + 14: Ruby + 15: Numeric + 16: Alphabet + 17: Space for Western language + 18: Western characters (except class 17) + 19: Split line note (Warichu) begin quote + 20: Split line note (Warichu) end quote + 2. 
Simplified by remove the class which we do not care - However, since we do not care about class 13(Subscript), 14(Ruby), - 19(split line note begin quote), and 20(split line note end quote) - we can simplify this par table into the following + However, since we do not care about class 13(Subscript), 14(Ruby), + 16 (Aphabet), 19(split line note begin quote), and 20(split line note end + quote) we can simplify this par table into the following Class of Leading Class of Trailing Char Class - Char + Char - 1 2 3 4 5 6 7 8 9 10 11 12 15 16 17 18 - - 1 X X X X X X X X X X X X X X X X - 2 X X X X X - 3 X X X X X + 1 2 3 4 5 6 7 8 9 10 11 12 15 17 18 + + 1 X X X X X X X X X X X X X X X + 2 X X X X X + 3 X X X X X 4 X X X X X - 5 X X X X X - 6 X X X X X - 7 X X X X X X - 8 X X X X X X - 9 X X X X X - 10 X X X X X - 11 X X X X X - 12 X X X X X - 15 X X X X X X X X - 16 X X X X X X X - 17 X X X X X - 18 X X X X X X X X + 5 X X X X X + 6 X X X X X + 7 X X X X X X + 8 X X X X X X + 9 X X X X X + 10 X X X X X + 11 X X X X X + 12 X X X X X + 15 X X X X X X X X + 17 X X X X X + 18 X X X X X X X 3. Simplified by merged classes - After the 2 simplification, the pair table have some duplication + After the 2 simplification, the pair table have some duplication a. class 2, 3, 4, 5, 6, are the same- we can merged them b. class 10, 11, 12, 17 are the same- we can merged them Class of Leading Class of Trailing Char Class - Char + Char - 1 [a] 7 8 9 [b]15 16 18 - - 1 X X X X X X X X X - [a] X - 7 X X - 8 X X - 9 X - [b] X - 15 X X X X - 16 X X X - 18 X X X X + 1 [a] 7 8 9 [b]15 18 + 1 X X X X X X X X + [a] X + 7 X X + 8 X X + 9 X + [b] X + 15 X X X X + 18 X X X 4. 
We add COMPLEX characters and make it breakable w/ all ther class @@ -145,41 +164,173 @@ Class of Leading Class of Trailing Char Class - Char + Char + + 1 [a] 7 8 9 [b]15 18 COMPLEX + + 1 X X X X X X X X X + [a] X + 7 X X + 8 X X + 9 X + [b] X + 15 X X X X + 18 X X X + COMPLEX X T - 1 [a] 7 8 9 [b]15 16 18 COMPLEX - - 1 X X X X X X X X X X - [a] X - 7 X X - 8 X X - 9 X - [b] X - 15 X X X X - 16 X X X - 18 X X X X - COMPLEX X T - T : need special handling - 5. Now we use one bit to encode weather it is breakable, and use 2 bytes + + 5. However, we need two special class for some punctuations/parentheses, + theirs breaking rules like character class (18), see bug 389056. + And also we need character like punctuation that is same behavior with 18, + but the characters are not letters of all languages. (e.g., '_') + [c]. Based on open parenthesis class (1), but it is not breakable after + character class (18) or numeric class (15). + [d]. Based on close parenthesis (or punctuation) class (2), but it is not + breakable before character class (18) or numeric class (15). + + Class of + Leading Class of Trailing Char Class + Char + + 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] + + 1 X X X X X X X X X X X + [a] X X X + 7 X X + 8 X X + 9 X + [b] X X + 15 X X X X X X + 18 X X X X X + COMPLEX X T + [c] X X X X X X X X X X X + [d] X X X X + + + 6. And Unicode has "NON-BREAK" characters. The lines should be broken around + them. But in JIS X 4051, such class is not, therefore, we create [e]. + + Class of + Leading Class of Trailing Char Class + Char + + 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] [e] + + 1 X X X X X X X X X X X X + [a] X X X X + 7 X X X + 8 X X X + 9 X X + [b] X X X + 15 X X X X X X X + 18 X X X X X X + COMPLEX X T X + [c] X X X X X X X X X X X X + [d] X X X X X + [e] X X X X X X X X X X X X + + + 7. 
Now we use one bit to encode weather it is breakable, and use 2 bytes for one row, then the bit table will look like: 18 <- 1 - - 1 0000 0011 1111 1111 = 0x03FF - [a] 0000 0000 0000 0010 = 0x0002 - 7 0000 0000 0000 0110 = 0x0006 - 8 0000 0000 0100 0010 = 0x0042 - 9 0000 0000 0000 0010 = 0x0002 - [b] 0000 0000 0000 0010 = 0x0002 - 15 0000 0001 0101 0010 = 0x0152 - 16 0000 0001 1000 0010 = 0x0182 - 18 0000 0001 1100 0010 = 0x01C2 - COMPLEX 0000 0010 0000 0010 = 0x0202 - 5. Now we map the class to number - + 1 0000 1111 1111 1111 = 0x0FFF + [a] 0000 1110 0000 0010 = 0x0E02 + 7 0000 1000 0000 0110 = 0x0806 + 8 0000 1000 0100 0010 = 0x0842 + 9 0000 1000 0000 0010 = 0x0802 + [b] 0000 1100 0000 0010 = 0x0C02 + 15 0000 1110 1101 0010 = 0x0ED2 + 18 0000 1110 1100 0010 = 0x0EC2 + COMPLEX 0000 1001 0000 0010 = 0x0902 + [c] 0000 1111 1111 1111 = 0x0FFF + [d] 0000 1100 1100 0010 = 0x0CC2 + [e] 0000 1111 1111 1111 = 0x0FFF +*/ + +#define MAX_CLASSES 12 + +static const PRUint16 gPair[MAX_CLASSES] = { + 0x0FFF, + 0x0E02, + 0x0806, + 0x0842, + 0x0802, + 0x0C02, + 0x0ED2, + 0x0EC2, + 0x0902, + 0x0FFF, + 0x0CC2, + 0x0FFF +}; + + +/* + + 8. And if the character is not enough far from word start, word end and + another break point, we should not break in non-CJK languages. + I.e., Don't break around 15, 18, [c] and [d], but don't change + that if they are related to [b]. 
+
+                    Class of
+                    Leading       Class of Trailing Char Class
+                    Char
+
+            1  [a]  7   8   9  [b] 15  18  COMPLEX  [c] [d] [e]
+
+         1  X   X   X   X   X   X   X   X     X      X   X   X
+       [a]      X                   X   X            X   X   X
+         7      X   X               X   X            X   X   X
+         8      X                   X   X            X   X   X
+         9      X                   X   X            X   X   X
+       [b]      X                                        X   X
+        15  X   X   X   X   X       X   X     X      X   X   X
+        18  X   X   X   X   X       X   X     X      X   X   X
+   COMPLEX      X                   X   X     T      X   X   X
+       [c]  X   X   X   X   X   X   X   X     X      X   X   X
+       [d]  X   X   X   X   X       X   X     X      X   X   X
+       [e]  X   X   X   X   X   X   X   X     X      X   X   X
+
+                  18 <- 1
+
+         1  0000 1111 1111 1111 = 0x0FFF
+       [a]  0000 1110 1100 0010 = 0x0EC2
+         7  0000 1110 1100 0110 = 0x0EC6
+         8  0000 1110 1100 0010 = 0x0EC2
+         9  0000 1110 1100 0010 = 0x0EC2
+       [b]  0000 1100 0000 0010 = 0x0C02
+        15  0000 1111 1101 1111 = 0x0FDF
+        18  0000 1111 1101 1111 = 0x0FDF
+   COMPLEX  0000 1111 1100 0010 = 0x0FC2
+       [c]  0000 1111 1111 1111 = 0x0FFF
+       [d]  0000 1111 1101 1111 = 0x0FDF
+       [e]  0000 1111 1111 1111 = 0x0FFF
+*/
+
+// Conservative pair table, indexed by the class of the leading character.
+// Bit N of an entry is set when a break is prohibited before a trailing
+// character of class N; GetPairConservative() allows a break only when the
+// bit is clear.  Each entry must match the corresponding bit row in the
+// comment above.
+static const PRUint16 gPairConservative[MAX_CLASSES] = {
+  0x0FFF,  // 1
+  0x0EC2,  // [a]
+  0x0EC6,  // 7
+  0x0EC2,  // 8
+  0x0EC2,  // 9
+  0x0C02,  // [b]
+  0x0FDF,  // 15
+  0x0FDF,  // 18
+  0x0FC2,  // COMPLEX
+  0x0FFF,  // [c]
+  0x0FDF,  // [d] (was 0x0EDF, which contradicted the table and bit row above)
+  0x0FFF   // [e]
+};
+
+
+/*
+
+   9.
Now we map the class to number + 0: 1 1: [a]- 2, 3, 4, 5, 6 2: 7 @@ -187,27 +338,59 @@ 4: 9 5: [b]- 10, 11, 12, 17 6: 15 - 7: 16 - 8: 18 - 9: COMPLEX + 7: 18 + 8: COMPLEX + 9: [c] + A: [d] + B: [e] + + and they mean: + 0: Open parenthesis + 1: Punctuation that prohibits break before + 2: Non-breakable between same classes + 3: Prefix + 4: Postfix + 5: Breakable character (Spaces and Most Japanese characters) + 6: Numeric + 7: Characters + 8: Need special handling characters (E.g., Thai) + 9: Open parentheses like Character (See bug 389056) + A: Close parenthese (or punctuations) like Character (See bug 389056) + B: Non breakable (See bug 390920) */ -#define MAX_CLASSES 10 +#define CLASS_NONE PR_INT8_MAX -static const PRUint16 gPair[MAX_CLASSES] = { - 0x03FF, - 0x0002, - 0x0006, - 0x0042, - 0x0002, - 0x0002, - 0x0152, - 0x0182, - 0x01C2, - 0x0202 -}; +#define CLASS_OPEN 0x00 +#define CLASS_CLOSE 0x01 +#define CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS 0x02 +#define CLASS_PREFIX 0x03 +#define CLASS_POSTFFIX 0x04 +#define CLASS_BREAKABLE 0x05 +#define CLASS_NUMERIC 0x06 +#define CLASS_CHARACTER 0x07 +#define CLASS_COMPLEX 0x08 +#define CLASS_OPEN_LIKE_CHARACTER 0x09 +#define CLASS_CLOSE_LIKE_CHARACTER 0x0A +#define CLASS_NON_BREAKABLE 0x0B +#define U_NULL PRUnichar(0x0000) +#define U_SLASH PRUnichar('/') +#define U_SPACE PRUnichar(' ') +#define U_HYPHEN PRUnichar('-') +#define U_EQUAL PRUnichar('=') +#define U_PERCENT PRUnichar('%') +#define U_AMPERSAND PRUnichar('&') +#define U_BACKSLASH PRUnichar('\\') + +#define NEED_CONTEXTUAL_ANALYSIS(c) (IS_HYPHEN(c) || \ + (c) == U_SLASH || \ + (c) == U_PERCENT || \ + (c) == U_AMPERSAND || \ + (c) == U_BACKSLASH) + +#define IS_ASCII_DIGIT(u) (0x0030 <= (u) && (u) <= 0x0039) static inline int GETCLASSFROMTABLE(const PRUint32* t, PRUint16 l) @@ -215,10 +398,6 @@ GETCLASSFROMTABLE(const PRUint32* t, PRUint16 l) return ((((t)[(l>>3)]) >> ((l & 0x0007)<<2)) & 0x000f); } -#define CLASS_COMPLEX 9 - - - static inline int 
IS_HALFWIDTH_IN_JISx4051_CLASS3(PRUnichar u) { @@ -240,54 +419,48 @@ IS_COMPLEX(PRUnichar u) return (0x0e01 <= (u) && (u) <= 0x0e5b); } -static inline int -IS_SPACE(PRUnichar u) +static inline PRBool +IS_NONBREAKABLE_SPACE(PRUnichar u) { - return ((u) == 0x0020 || (u) == 0x0009 || (u) == 0x000a || (u) == 0x000d || (u)==0x200b); + return u == 0x00A0 || u == 0x2007; // NO-BREAK SPACE, FIGURE SPACE } -static PRInt8 GetClass(PRUnichar u) +static inline PRBool +IS_HYPHEN(PRUnichar u) +{ + return (u == U_HYPHEN || + u == 0x058A || // ARMENIAN HYPHEN + u == 0x2010 || // HYPHEN + u == 0x2012); // FIGURE DASH +} + +static PRInt8 +GetClass(PRUnichar u) { PRUint16 h = u & 0xFF00; PRUint16 l = u & 0x00ff; PRInt8 c; - + // Handle 3 range table first - if( 0x0000 == h) - { + if (0x0000 == h) { c = GETCLASSFROMTABLE(gLBClass00, l); - } - else if( 0x0E00 == h) - { + } else if (0x0E00 == h) { c = GETCLASSFROMTABLE(gLBClass0E, l); - } - else if( 0x2000 == h) - { + } else if (0x2000 == h) { c = GETCLASSFROMTABLE(gLBClass20, l); - } - else if( 0x2100 == h) - { + } else if (0x2100 == h) { c = GETCLASSFROMTABLE(gLBClass21, l); - } - else if( 0x3000 == h) - { + } else if (0x3000 == h) { c = GETCLASSFROMTABLE(gLBClass30, l); - } - else if ( ( ( 0x3200 <= u) && ( u <= 0xA4CF) ) || // CJK and Yi - ( ( 0xAC00 <= h) && ( h <= 0xD7FF) ) || // Hangul - ( ( 0xf900 <= h) && ( h <= 0xfaff) ) - ) - { - c = 5; // CJK character, Han, and Han Compatability - } - else if( 0xff00 == h) - { - if( l < 0x0060) // Fullwidth ASCII variant - { + } else if (((0x3200 <= u) && (u <= 0xA4CF)) || // CJK and Yi + ((0xAC00 <= h) && (h <= 0xD7FF)) || // Hangul + ((0xf900 <= h) && (h <= 0xfaff))) { + c = CLASS_BREAKABLE; // CJK character, Han, and Han Compatability + } else if (0xff00 == h) { + if (l < 0x0060) { // Fullwidth ASCII variant c = GETCLASSFROMTABLE(gLBClass00, (l+0x20)); } else if (l < 0x00a0) { - switch (l) - { + switch (l) { case 0x61: c = GetClass(0x3002); break; case 0x62: c = GetClass(0x300c); break; 
case 0x63: c = GetClass(0x300d); break; @@ -296,53 +469,77 @@ static PRInt8 GetClass(PRUnichar u) case 0x9e: c = GetClass(0x309b); break; case 0x9f: c = GetClass(0x309c); break; default: - if(IS_HALFWIDTH_IN_JISx4051_CLASS3(u)) - c = 1; // jis x4051 class 3 + if (IS_HALFWIDTH_IN_JISx4051_CLASS3(u)) + c = CLASS_CLOSE; // jis x4051 class 3 else - c = 5; // jis x4051 class 11 + c = CLASS_BREAKABLE; // jis x4051 class 11 break; } - // Halfwidth Katakana variants - } else if( l < 0x00e0) { - c = 8; // Halfwidth Hangul variants - } else if( l < 0x00f0) { - static PRUnichar NarrowFFEx[16] = - { + // Halfwidth Katakana variants + } else if (l < 0x00e0) { + c = CLASS_CHARACTER; // Halfwidth Hangul variants + } else if (l < 0x00f0) { + static PRUnichar NarrowFFEx[16] = { 0x00A2, 0x00A3, 0x00AC, 0x00AF, 0x00A6, 0x00A5, 0x20A9, 0x0000, 0x2502, 0x2190, 0x2191, 0x2192, 0x2193, 0x25A0, 0x25CB, 0x0000 }; c = GetClass(NarrowFFEx[l - 0x00e0]); } else { - c = 8; + c = CLASS_CHARACTER; } - } - else if( 0x3100 == h) { - if ( l <= 0xbf) { // Hangul Compatibility Jamo, Bopomofo, Kanbun - // XXX: This is per UAX #14, but UAX #14 may change - // the line breaking rules about Kanbun and Bopomofo. - c = 5; + } else if (0x3100 == h) { + if (l <= 0xbf) { // Hangul Compatibility Jamo, Bopomofo, Kanbun + // XXX: This is per UAX #14, but UAX #14 may change + // the line breaking rules about Kanbun and Bopomofo. 
+ c = CLASS_BREAKABLE; + } else if (l >= 0xf0) { // Katakana small letters for Ainu + c = CLASS_CLOSE; + } else { // unassigned + c = CLASS_CHARACTER; } - else if ( l >= 0xf0) - { // Katakana small letters for Ainu - c = 1; - } - else // unassigned - { - c = 8; - } - } - else { - c = 8; // others + } else if (0x0300 == h) { + if (0x4F == l || (0x5C <= l && l <= 0x62)) + c = CLASS_NON_BREAKABLE; + else + c = CLASS_CHARACTER; + } else if (0x0500 == h) { + // ARMENIAN HYPHEN (for "Breaking Hyphens" of UAX#14) + if (l == 0x8A) + c = GETCLASSFROMTABLE(gLBClass00, PRUint16(U_HYPHEN)); + else + c = CLASS_CHARACTER; + } else if (0x0F00 == h) { + if (0x08 == l || 0x0C == l || 0x12 == l) + c = CLASS_NON_BREAKABLE; + else + c = CLASS_CHARACTER; + } else if (0x1800 == h) { + if (0x0E == l) + c = CLASS_NON_BREAKABLE; + else + c = CLASS_CHARACTER; + } else { + c = CLASS_CHARACTER; // others } return c; } -static PRBool GetPair(PRInt8 c1, PRInt8 c2) +static PRBool +GetPair(PRInt8 c1, PRInt8 c2) { - NS_ASSERTION( c1 < MAX_CLASSES ,"illegal classes 1"); - NS_ASSERTION( c2 < MAX_CLASSES ,"illegal classes 2"); + NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1"); + NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2"); - return (0 == ((gPair[c1] >> c2 ) & 0x0001)); + return (0 == ((gPair[c1] >> c2) & 0x0001)); +} + +static PRBool +GetPairConservative(PRInt8 c1, PRInt8 c2) +{ + NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1"); + NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2"); + + return (0 == ((gPairConservative[c1] >> c2) & 0x0001)); } nsJISx4051LineBreaker::nsJISx4051LineBreaker() @@ -355,81 +552,204 @@ nsJISx4051LineBreaker::~nsJISx4051LineBreaker() NS_IMPL_ISUPPORTS1(nsJISx4051LineBreaker, nsILineBreaker) -#define U_PERIOD PRUnichar('.') -#define U_COMMA PRUnichar(',') -#define U_SEMICOLON PRUnichar(';') -#define U_SLASH PRUnichar('/') -#define U_SPACE PRUnichar(' ') -#define U_HYPHEN PRUnichar('-') -#define U_EQUAL PRUnichar('=') -#define U_NULL PRUnichar(0x0000) -#define 
U_RIGHT_SINGLE_QUOTATION_MARK PRUnichar(0x2019) -#define NEED_CONTEXTUAL_ANALYSIS(c) ((c) == U_PERIOD || \ - (c) == U_COMMA || \ - (c) == U_SEMICOLON || \ - (c) == U_SLASH || \ - (c) == U_HYPHEN || \ - (c) == U_EQUAL || \ - (c) == U_RIGHT_SINGLE_QUOTATION_MARK) -#define NUMERIC_CLASS 6 // JIS x4051 class 15 is now map to simplified class 6 -#define CHARACTER_CLASS 8 // JIS x4051 class 18 is now map to simplified class 8 -#define IS_ASCII_DIGIT(u) (0x0030 <= (u) && (u) <= 0x0039) +class ContextState { +public: + ContextState(const PRUnichar* aText, PRUint32 aLength) { + mUniText = aText; + mText = nsnull; + mLength = aLength; + Init(); + } -static PRInt8 ContextualAnalysis( - PRUnichar prev, PRUnichar cur, PRUnichar next) + ContextState(const PRUint8* aText, PRUint32 aLength) { + mUniText = nsnull; + mText = aText; + mLength = aLength; + Init(); + } + + PRUint32 Length() { return mLength; } + PRUint32 Index() { return mIndex; } + + PRUnichar GetCharAt(PRUint32 aIndex) { + NS_ASSERTION(0 <= aIndex && aIndex < mLength, "Out of range!"); + return mUniText ? mUniText[aIndex] : PRUnichar(mText[aIndex]); + } + + void AdvanceIndexTo(PRUint32 aIndex) { + NS_ASSERTION(mIndex <= aIndex, "the index cannot decrease."); + NS_ASSERTION(aIndex < mLength, "out of range"); + mIndex = aIndex; + } + + void NotifyBreakBefore() { mLastBreakIndex = mIndex; } + +// A word of western language should not be broken. But even if the word has +// only ASCII characters, non-natural context words should be broken, e.g., +// URL and file path. For protecting the natural words, we should use +// conservative breaking rules at following conditions: +// 1. at near the start of word +// 2. at near the end of word +// 3. at near the latest broken point +// CONSERVATIVE_BREAK_RANGE define the 'near' in characters. 
+#define CONSERVATIVE_BREAK_RANGE 6 + + PRBool UseConservativeBreaking(PRUint32 aOffset = 0) { + if (mHasCJKChar) + return PR_FALSE; + PRUint32 index = mIndex + aOffset; + PRBool result = (index < CONSERVATIVE_BREAK_RANGE || + mLength - index < CONSERVATIVE_BREAK_RANGE || + index - mLastBreakIndex < CONSERVATIVE_BREAK_RANGE); + if (result || !mHasNonbreakableSpace) + return result; + + // This text has no-breakable space, we need to check whether the index + // is near it. + + // Note that index is always larger than CONSERVATIVE_BREAK_RANGE here. + for (PRUint32 i = index - 1; index - CONSERVATIVE_BREAK_RANGE < i; --i) { + if (IS_NONBREAKABLE_SPACE(GetCharAt(i))) + return PR_TRUE; + if (i == 0) + break; + } + // Note that index is always less than mLength - CONSERVATIVE_BREAK_RANGE. + for (PRUint32 i = index + 1; i < index + CONSERVATIVE_BREAK_RANGE; ++i) { + if (IS_NONBREAKABLE_SPACE(GetCharAt(i))) + return PR_TRUE; + } + return PR_FALSE; + } + + PRBool HasCharacterAlready(PRUnichar aCh) { + // Be careful for the index being unsigned. + if (mIndex == 0) + return PR_FALSE; + for (PRUint32 i = mIndex - 1; 0 < i; --i) { + if (GetCharAt(i) == aCh) + return PR_TRUE; + if (i == 0) + break; + } + return PR_FALSE; + } + + PRUnichar GetPreviousNonHyphenCharacter() { + NS_ASSERTION(IS_HYPHEN(GetCharAt(mIndex)), + "current character isn't hyphen"); + // Be careful for the index being unsigned. 
+ if (mIndex == 0) + return PR_FALSE; + for (PRUint32 i = mIndex - 1; 0 < i; --i) { + PRUnichar ch = GetCharAt(i); + if (!IS_HYPHEN(ch)) + return ch; + if (i == 0) + break; + } + return U_NULL; + } + +private: + void Init() { + mIndex = 0; + mLastBreakIndex = 0; + mHasCJKChar = 0; + mHasNonbreakableSpace = 0; + + for (PRUint32 i = 0; i < mLength; ++i) { + PRUnichar u = GetCharAt(i); + if (!mHasNonbreakableSpace && IS_NONBREAKABLE_SPACE(u)) + mHasNonbreakableSpace = 1; + else if (mUniText && !mHasCJKChar && IS_CJK_CHAR(u)) + mHasCJKChar = 1; + } + } + + const PRUnichar* mUniText; + const PRUint8* mText; + + PRUint32 mIndex; + PRUint32 mLength; // length of text + PRUint32 mLastBreakIndex; + PRPackedBool mHasCJKChar; // if the text has CJK character, this is true. + PRPackedBool mHasNonbreakableSpace; // if the text has no-breakable space, + // this is true. +}; + +static PRInt8 +ContextualAnalysis(PRUnichar prev, PRUnichar cur, PRUnichar next, + ContextState &aState) { - if(U_COMMA == cur || U_SEMICOLON == cur) - { - if((IS_ASCII_DIGIT(prev) || prev == U_NULL) && IS_ASCII_DIGIT(next)) - return NUMERIC_CLASS; - } - else if(U_PERIOD == cur) - { - if((IS_ASCII_DIGIT(prev) || prev == U_SPACE || prev == U_NULL) && - IS_ASCII_DIGIT(next)) - return NUMERIC_CLASS; + // Don't return CLASS_OPEN/CLASS_CLOSE if aState.UseJISX4051 is FALSE. - // By assigning a full stop character class only when it's followed by - // class 6 (numeric), 7, and 8 (character). Note that class 9 (Thai) - // doesn't matter, either way, we prevent lines from breaking around - // full stop in those cases while still allowing it to end a line when - // followed by CJK characters. With an additional condition of it being - // preceded by class 0 or class > 5, we make sure that it does not - // start a line (see bug 164759). - PRUint8 pc = prev != U_NULL ? 
GetClass(prev) : CHARACTER_CLASS; - if((pc > 5 || pc == 0) && GetClass(next) > 5) - return CHARACTER_CLASS; - } - else if(U_SLASH == cur || U_HYPHEN == cur || U_EQUAL == cur) - { - // if slash is a first character, don't break at this point (e.g., "/root") - if (U_SLASH == cur && prev == U_NULL) - return CHARACTER_CLASS; - if (IS_ASCII_DIGIT(next)) - return NUMERIC_CLASS; - } - else if(U_RIGHT_SINGLE_QUOTATION_MARK == cur) - { - // somehow people use this as ' in "it's" sometimes... - if(U_SPACE != next) - return CHARACTER_CLASS; - } - return GetClass(cur); + if (IS_HYPHEN(cur)) { + // If next character is hyphen, we don't need to break between them. + if (IS_HYPHEN(next)) + return CLASS_CHARACTER; + // If prev and next characters are numeric, it may be in Math context. + // So, we should not break here. + PRBool prevIsNum = IS_ASCII_DIGIT(prev); + PRBool nextIsNum = IS_ASCII_DIGIT(next); + if (prevIsNum && nextIsNum) + return CLASS_NUMERIC; + // If one side is numeric and the other is a character, or if both sides are + // characters, the hyphen should be breakable. + if (!aState.UseConservativeBreaking(1)) { + PRUnichar prevOfHyphen = aState.GetPreviousNonHyphenCharacter(); + if (prevOfHyphen && next) { + PRBool prevIsChar = !NEED_CONTEXTUAL_ANALYSIS(prevOfHyphen) && + GetClass(prevOfHyphen) == CLASS_CHARACTER; + PRBool nextIsChar = !NEED_CONTEXTUAL_ANALYSIS(next) && + GetClass(next) == CLASS_CHARACTER; + if ((prevIsNum || prevIsChar) && (nextIsNum || nextIsChar)) + return CLASS_CLOSE; + } + } + } else if (cur == U_SLASH || cur == U_BACKSLASH) { + // If this is immediately after same char, we should not break here. + if (prev == cur) + return CLASS_CHARACTER; + // If this text has two or more (BACK)SLASHs, this may be file path or URL. + if (!aState.UseConservativeBreaking() && + aState.HasCharacterAlready(cur)) + return CLASS_OPEN; + } else if (cur == U_PERCENT) { + // If this is a part of the param of URL, we should break before. 
+ if (!aState.UseConservativeBreaking()) { + if (aState.Index() >= 3 && + aState.GetCharAt(aState.Index() - 3) == U_PERCENT) + return CLASS_OPEN; + if (aState.Index() + 3 < aState.Length() && + aState.GetCharAt(aState.Index() + 3) == U_PERCENT) + return CLASS_OPEN; + } + } else if (cur == U_AMPERSAND) { + // If this may be a separator of params of URL, we should break after. + if (!aState.UseConservativeBreaking(1) && + aState.HasCharacterAlready(U_EQUAL)) + return CLASS_CLOSE; + } else { + NS_ERROR("Forgot to handle the current character!"); + } + return GetClass(cur); } -PRInt32 nsJISx4051LineBreaker::WordMove( - const PRUnichar* aText, PRUint32 aLen, PRUint32 aPos, PRInt8 aDirection) +PRInt32 +nsJISx4051LineBreaker::WordMove(const PRUnichar* aText, PRUint32 aLen, + PRUint32 aPos, PRInt8 aDirection) { PRBool textNeedsJISx4051 = PR_FALSE; PRInt32 begin, end; - for (begin = aPos; begin > 0 && !IS_SPACE(aText[begin - 1]); --begin) { + for (begin = aPos; begin > 0 && !NS_IsSpace(aText[begin - 1]); --begin) { if (IS_CJK_CHAR(aText[begin]) || IS_COMPLEX(aText[begin])) { textNeedsJISx4051 = PR_TRUE; } } - for (end = aPos + 1; end < PRInt32(aLen) && !IS_SPACE(aText[end]); ++end) { + for (end = aPos + 1; end < PRInt32(aLen) && !NS_IsSpace(aText[end]); ++end) { if (IS_CJK_CHAR(aText[end]) || IS_COMPLEX(aText[end])) { textNeedsJISx4051 = PR_TRUE; } @@ -458,8 +778,9 @@ PRInt32 nsJISx4051LineBreaker::WordMove( return ret; } -PRInt32 nsJISx4051LineBreaker::Next( - const PRUnichar* aText, PRUint32 aLen, PRUint32 aPos) +PRInt32 +nsJISx4051LineBreaker::Next(const PRUnichar* aText, PRUint32 aLen, + PRUint32 aPos) { NS_ASSERTION(aText, "aText shouldn't be null"); NS_ASSERTION(aLen > aPos, "Illegal value (length > position)"); @@ -468,8 +789,9 @@ PRInt32 nsJISx4051LineBreaker::Next( return nextPos < PRInt32(aLen) ? 
nextPos : NS_LINEBREAKER_NEED_MORE_TEXT; } -PRInt32 nsJISx4051LineBreaker::Prev( - const PRUnichar* aText, PRUint32 aLen, PRUint32 aPos) +PRInt32 +nsJISx4051LineBreaker::Prev(const PRUnichar* aText, PRUint32 aLen, + PRUint32 aPos) { NS_ASSERTION(aText, "aText shouldn't be null"); NS_ASSERTION(aLen >= aPos, "Illegal value (length >= position)"); @@ -483,16 +805,19 @@ nsJISx4051LineBreaker::GetJISx4051Breaks(const PRUnichar* aChars, PRUint32 aLeng PRPackedBool* aBreakBefore) { PRUint32 cur; - PRInt8 lastClass = -1; + PRInt8 lastClass = CLASS_NONE; + ContextState state(aChars, aLength); for (cur = 0; cur < aLength; ++cur) { PRUnichar ch = aChars[cur]; PRInt8 cl; + state.AdvanceIndexTo(cur); if (NEED_CONTEXTUAL_ANALYSIS(ch)) { cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL, ch, - cur + 1 < aLength ? aChars[cur + 1] : U_NULL); + cur + 1 < aLength ? aChars[cur + 1] : U_NULL, + state); } else { cl = GetClass(ch); } @@ -501,11 +826,16 @@ nsJISx4051LineBreaker::GetJISx4051Breaks(const PRUnichar* aChars, PRUint32 aLeng if (cur > 0) { NS_ASSERTION(CLASS_COMPLEX != lastClass || CLASS_COMPLEX != cl, "Loop should have prevented adjacent complex chars here"); - allowBreak = GetPair(lastClass, cl); + if (state.UseConservativeBreaking()) + allowBreak = GetPairConservative(lastClass, cl); + else + allowBreak = GetPair(lastClass, cl); } else { allowBreak = PR_FALSE; } aBreakBefore[cur] = allowBreak; + if (allowBreak) + state.NotifyBreakBefore(); lastClass = cl; if (CLASS_COMPLEX == cl) { PRUint32 end = cur + 1; @@ -530,27 +860,35 @@ nsJISx4051LineBreaker::GetJISx4051Breaks(const PRUint8* aChars, PRUint32 aLength PRPackedBool* aBreakBefore) { PRUint32 cur; - PRInt8 lastClass = -1; + PRInt8 lastClass = CLASS_NONE; + ContextState state(aChars, aLength); for (cur = 0; cur < aLength; ++cur) { PRUnichar ch = aChars[cur]; PRInt8 cl; + state.AdvanceIndexTo(cur); if (NEED_CONTEXTUAL_ANALYSIS(ch)) { cl = ContextualAnalysis(cur > 0 ? 
aChars[cur - 1] : U_NULL, ch, - cur + 1 < aLength ? aChars[cur + 1] : U_NULL); + cur + 1 < aLength ? aChars[cur + 1] : U_NULL, + state); } else { cl = GetClass(ch); } PRBool allowBreak; if (cur > 0) { - allowBreak = GetPair(lastClass, cl); + if (state.UseConservativeBreaking()) + allowBreak = GetPairConservative(lastClass, cl); + else + allowBreak = GetPair(lastClass, cl); } else { allowBreak = PR_FALSE; } aBreakBefore[cur] = allowBreak; + if (allowBreak) + state.NotifyBreakBefore(); lastClass = cl; } } diff --git a/intl/lwbrk/tools/anzx4501.html b/intl/lwbrk/tools/anzx4501.html index 85b0e7689c4..44da8ee3917 100644 --- a/intl/lwbrk/tools/anzx4501.html +++ b/intl/lwbrk/tools/anzx4501.html @@ -53,11 +53,10 @@ Analysis of JIS X 4051 to Unicode General Category Mapping -14 -3 - -17 +12 +1 +13 @@ -78,24 +77,25 @@ Analysis of JIS X 4051 to Unicode General Category Mapping 1 -13 -2 +11 1 + + 01_[a] 31 2 -32 -6 +28 +3 -71 +64 @@ -112,17 +112,17 @@ Analysis of JIS X 4051 to Unicode General Category Mapping -2 -14 - - -16 - - -2 -3 +4 +12 1 +11 + + +2 +1 + + @@ -131,9 +131,48 @@ Analysis of JIS X 4051 to Unicode General Category Mapping +4 + + +4 + + + + + + + + + + + + + + + + 1 + +3 + + + + + + + + + +03_8 + + + + + +1 + 1 @@ -151,7 +190,6 @@ Analysis of JIS X 4051 to Unicode General Category Mapping -1 @@ -161,44 +199,6 @@ Analysis of JIS X 4051 to Unicode General Category Mapping - - - - -03_8 - - - - - -3 - -3 - - - - - - - - - - - - - - - - - - - - - - -2 - - 1 @@ -209,8 +209,8 @@ Analysis of JIS X 4051 to Unicode General Category Mapping -4 -1 +5 + 5 @@ -233,9 +233,9 @@ Analysis of JIS X 4051 to Unicode General Category Mapping -4 +5 + -1 @@ -245,20 +245,20 @@ Analysis of JIS X 4051 to Unicode General Category Mapping 05_[b] 33 -154 +153 -53 +33 2 -305 -13 -560 +5 +12 +238 32 1 -154 +153 @@ -266,7 +266,7 @@ Analysis of JIS X 4051 to Unicode General Category Mapping 33 -20 + @@ -277,10 +277,10 @@ Analysis of JIS X 4051 to Unicode General Category Mapping -305 +5 -13 
+12 06_15 @@ -321,85 +321,46 @@ Analysis of JIS X 4051 to Unicode General Category Mapping -07_16 - -1 - - - - - -1 - - - - - - - - -1 - - - - - - - - - - - - - - - - - - - - - -08_18 -10 -659 -4 -130 -56 -941 -2 -1802 - -10 - - -368 -1 -4 - -286 - - -4 +07_18 +19 +157 +33 +54 +125 3 -127 -3 +391 + +19 + + +67 5 -3 4 -6 + +81 + + + + +3 30 -5 -12 -10 -273 -646 -1 -1 +4 +2 +2 +2 +4 +36 +4 +3 +23 +99 +1 +1 +1 -09_COMPLEX +08_COMPLEX @@ -438,6 +399,123 @@ Analysis of JIS X 4051 to Unicode General Category Mapping +09_[c] + + + + +3 +4 + +7 + + + + + + + + + + + + + + + + + + + + + +3 +2 + +2 + + + + + +0A_[d] +1 +2 + +6 +21 +16 + +46 + +1 + + +2 + + + + + + + + + +6 + + +3 +1 +1 +16 + +2 +3 +7 +4 + + + + +0B_[e] + + + + +1 +1 +3 +5 + + + + + + + + + + + + + + + + +1 + + + + + +1 + + + + + +3 + X @@ -487,74 +565,26 @@ Analysis of JIS X 4051 to Unicode General Category Mapping 04_9 05_[b] 06_15 -07_16 -08_18 -09_COMPLEX +07_18 +08_COMPLEX +09_[c] +0A_[d] +0B_[e] X 00 -6 -14 -2 1 + + + 33 10 +126 -156 - - - -01 - - - - - - - - -128 - - - -02 - - - - - - - - -89 - - - -03 - - - - - - - - -76 - - - -04 - - - - - - - - -226 - +7 +44 +2 0E @@ -565,22 +595,26 @@ Analysis of JIS X 4051 to Unicode General Category Mapping 20 - 1 + + + 20 +2 +11 +4 5 -1 +12 -4 -13 +101 -86 - +2 +3 21 @@ -591,88 +625,12 @@ Analysis of JIS X 4051 to Unicode General Category Mapping 32 -1 -162 - - - -22 +163 - - - -242 - - - -23 - - - - - - - - -1 - - - -24 - - - - - - - - -139 - - - -25 - - - - - - - - -230 - - - -26 - - - - - - - - -106 - - - -27 - - - - - - - - -160 - - 30 10 @@ -686,43 +644,6 @@ Analysis of JIS X 4051 to Unicode General Category Mapping - -32 - - - - - -132 - - - - - - -33 - - - - - -188 - - - - - - -4E - - - - - -256 - - - diff --git a/intl/lwbrk/tools/anzx4501.pl b/intl/lwbrk/tools/anzx4501.pl index 92a25f0de08..fa4d0349046 100644 --- a/intl/lwbrk/tools/anzx4501.pl +++ b/intl/lwbrk/tools/anzx4501.pl @@ -396,11 +396,11 @@ printf "[%s || %s]\n", $r, $def; } print HEADER 
"};\n\n"; } -printarray("00", "8"); -printarray("20", "8"); -printarray("21", "8"); +printarray("00", "7"); +printarray("20", "7"); +printarray("21", "7"); printarray("30", "5"); -printarray("0E", "9"); +printarray("0E", "8"); #print %rangecount; diff --git a/intl/lwbrk/tools/jisx4501class.txt b/intl/lwbrk/tools/jisx4501class.txt index 9a6095069dc..81e1c29706b 100644 --- a/intl/lwbrk/tools/jisx4501class.txt +++ b/intl/lwbrk/tools/jisx4501class.txt @@ -1,7 +1,85 @@ -0028;;1 -002F;;2 -005B;;1 -007B;;1 +0000;001f;17 +0020;;17 +0024;;24 +0027;;18 +0028;;22 +002D;;18 +002F;;18 +0021;002F;23 +0030;0039;15 +003C;;22 +003F;;4 +003A;003F;23 +0040;;18 +0041;005A;18 +005B;;22 +005E;;18 +005F;;18 +005B;005F;23 +0060;;18 +0061;007A;18 +007B;;22 +007B;007E;23 +00A0;;24 +00A3;;22 +00A5;;22 +00A9;;18 +00AA;;18 +00AC;;22 +00AE;;18 +00AF;;18 +00A1;00BF;23 +00B0;;18 +00F7;;23 +00C0;00FF;18 +0E3F;;1 +0E2F;;4 +0E46;;4 +0E5A;0E5B;4 +0E50;0E59;15 +0E4F;;18 +0EAF;;4 +0EC6;;4 +0ED0;0ED9;15 +2007;;24 +2000;200B;17 +200C;200F;18 +2010;;2 +2011;;24 +2012;2013;2 +2014;;7 +2015;;18 +2016;2017;18 +2018;201F;18 +2020;2023;18 +2024;2026;7 +2027;;23 +2028;202E;18 +202F;;24 +2030;2034;9 +2035;2038;18 +2039;;1 +203A;;2 +203B;;12 +203C;203D;3 +203E;;23 +203F;2043;18 +2044;;3 +2045;;1 +2046;;2 +2047;2049;3 +204A;2063;18 +206A;206F;18 +2070;2071;18 +2074;208E;18 +2090;2094;18 +2116;;8 +2160;217F;12 +2190;21EA;a12 +2126;;18 +2100;2138;18 +2153;2182;18 +2190;21EA;18 3008;;1 300A;;1 300C;;1 @@ -12,10 +90,6 @@ 3018;;1 301A;;1 301D;;1 -0029;;2 -002C;;2 -005D;;2 -007D;;2 3001;;2 3009;;2 300B;;2 @@ -28,8 +102,6 @@ 301B;;2 301E;;2 301F;;2 -203C;;3 -2044;;3 301C;;3 3041;;3 3043;;3 @@ -58,65 +130,11 @@ 30FC;;3 30FD;;3 30FE;;3 -0021;;4 -003F;;4 -003A;;18 -003B;;5 30FB;;5 -002E;;6 3002;;6 -2014;;7 -2024;;2 -2025;;2 -2026;;2 -0024;;1 -005C;;1 -00A3;;8 -00A5;;8 -2116;;8 -0025;;2 -00A2;;9 -00B0;;2 -2030;;9 -2031;;9 -2032;;9 -2033;;9 3000;;10 3042;3094;11 3099;309E;3 -002B;;18 -002D;;2 -003C;;1 -003D;;2 -003E;;2 
-00A7;;18 -00A9;;18 -00AE;;18 -00B1;;18 -00B6;;18 -00D7;;18 -00F7;;18 -203B;;12 -2160;217F;12 -2190;21EA;a12 -2460;24EA;a12 -2500;257F;a12 -2580;2595;a12 -25A0;25EF;a12 -2600;2613;a12 -261A;266F;a12 -2701;2704;a12 -2706;2709;a12 -270C;2727;a12 -2729;274B;a12 -274D;;a12 -274F;2752;a12 -2756;;a12 -2758;275E;a12 -2761;2767;a12 -2776;2794;a12 -2798;27AF;a12 -27B1;27BE;a12 3003;;12 3004;;12 3006;;12 @@ -126,76 +144,3 @@ 3020;;12 3036;;12 30A2;30FA;12 -3220;3243;12 -3280;32B0;12 -32D0;32FE;12 -3300;3357;12 -3371;3376;12 -3380;33DD;12 -4E00;9F45;12 -0030;0039;15 -2126;;16 -0020;;17 -0000;001f;17 -0021;007E;18 -00A1;00FF;18 -0100;017F;18 -0250;02A8;18 -0374;0375;18 -037A;;18 -037E;;18 -0384;038A;18 -038C;;18 -038E;03A1;18 -03A3;03CE;18 -0401;040C;18 -040E;044F;18 -0451;045C;18 -045E;047F;18 -0480;0486;18 -0480;0486;18 -0490;04C4;18 -04C7;04C8;18 -04CB;04CC;18 -04D0;04EB;18 -04EE;04F5;18 -04F8;04F9;18 -2000;200B;17 -200C;202E;18 -2030;2046;18 -2070;;18 -2074;208E;18 -20A0;20AA;18 -2100;2138;18 -2153;2182;18 -2190;21EA;18 -2200;227F;18 -2280;22F1;18 -2312;;18 -2460;24EA;18 -2500;257F;18 -2580;2595;18 -25A0;25EF;18 -2600;2613;18 -261A;266F;18 -2701;2704;18 -2706;2709;18 -270C;2727;18 -2729;274B;18 -274D;;18 -274F;2752;18 -2756;;18 -2758;275E;18 -2761;2767;18 -2776;2794;18 -2798;27AF;18 -27B1;27BE;18 -0E3F;;1 -0E2F;;4 -0E46;;4 -0E5A;0E5B;4 -0E50;0E59;15 -0E4F;;18 -0EAF;;4 -0EC6;;4 -0ED0;0ED9;15 diff --git a/intl/lwbrk/tools/jisx4501simp.txt b/intl/lwbrk/tools/jisx4501simp.txt index 294f0e6efda..e12a7fd8059 100644 --- a/intl/lwbrk/tools/jisx4501simp.txt +++ b/intl/lwbrk/tools/jisx4501simp.txt @@ -13,9 +13,12 @@ 13;X 14;X 15;06_15 -16;07_16 +16;X 17;05_[b] -18;08_18 +18;07_18 19;X 20;X -21;09_COMPLEX +21;08_COMPLEX +22;09_[c] +23;0A_[d] +24;0B_[e] diff --git a/intl/lwbrk/tools/spec_table.html b/intl/lwbrk/tools/spec_table.html index 03260b240f3..3a05d8b9588 100755 --- a/intl/lwbrk/tools/spec_table.html +++ b/intl/lwbrk/tools/spec_table.html @@ -21,99 +21,102 @@ td {

This is a specification table for line breaking.

-

The value 'A' means the line breakable After the character, and 'B' means Before. 'BA' means Before and After.

-

(C) which is the tail of the browser name means Character. (N) means Numeric. -This means that they are around the character. E.g., "a$a" is a testcase for (C), "0$0" is a testcase for (N).

+

In the IE7 and Opera9 columns, 'A' means that the line is breakable After the character, 'B' means Before, and 'BA' means both Before and After.

+

The suffix (C) on the IE7 and Opera9 column names means Character, and (N) means Numeric. +It indicates what kind of characters surrounded the tested character. E.g., "a$a" is a testcase for (C), and "0$0" is a testcase for (N).

+

Gecko does not break lines in most Western-language contexts. But for file paths, URLs and very long words connected by hyphens, +some characters might be breakable. They are marked 'breakable' in the table. However, they are not always breakable; +whether they are depends on the context within the word.

- + - + - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + - - - - - - + + + + + + - + - - - - - + + + + + - + - - - - + + + + - - - - - - - - - - - - - - + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + - + - +
characterGecko(C)Gecko(N)IE7(C)IE7(N)Opera9.2(C)Opera9.2(N)
characterGeckoIE7(C)IE7(N)Opera9.2(C)Opera9.2(N)
characterGecko(C)Gecko(N)IE7(C)IE7(N)Opera9.2(C)Opera9.2(N)
characterGeckoIE7(C)IE7(N)Opera9.2(C)Opera9.2(N)
0x21!AAAA
0x22"
0x23#
0x24$BBB
0x25%AAAA
0x26&
0x27'
0x28(BBBB
0x29)AAAA
0x2A*
0x2B+
0x2C,A
0x2D-ABABAAA
0x2E.
0x2F/AAA
0x21!AA
0x22"
0x23#
0x24$B
0x25%breakableAA
0x26&breakable
0x27'
0x28(BB
0x29)AA
0x2A*
0x2B+
0x2C,
0x2D-breakableBABAAA
0x2E.
0x2F/breakableAA
0x3A:
0x3B;A
0x3C<BB
0x3D=A
0x3E>AA
0x3F?AAAA
0x3A:
0x3B;breakable
0x3C<
0x3D=
0x3E>
0x3F?AA
0x40@
0x40@
0x5B[BBBB
0x5C\BBB
0x5D]AAAA
0x5E^
0x5F_
0x5B[BB
0x5C\breakableB
0x5D]AA
0x5E^
0x5F_
0x60`
0x60`
0x7B{BBBB
0x7C|AA
0x7D}AAAA
0x7E~
0x7B{BB
0x7C|AA
0x7D}AA
0x7E~
0xA1¡
0xA2¢BAAAA
0xA3£BABB
0xA4¤
0xA5¥BABB
0xA6¦
0xA7§
0xA8¨
0xA9©
0xAAª
0xAB«
0xAC¬
0xAE®
0xAF¯
0xA1¡
0xA2¢AA
0xA3£B
0xA4¤
0xA5¥B
0xA6¦
0xA7§
0xA8¨
0xA9©
0xAAª
0xAB«
0xAC¬
0xAE®
0xAF¯
0xB0°AAAA
0xB1±
0xB2²
0xB3³
0xB4´BB
0xB5µ
0xB6
0xB7·
0xB8¸
0xB9¹
0xBAº
0xBB»
0xBC¼
0xBD½
0xBE¾
0xBF¿
0xB0°AA
0xB1±
0xB2²
0xB3³
0xB4´BB
0xB5µ
0xB6
0xB7·
0xB8¸
0xB9¹
0xBAº
0xBB»
0xBC¼
0xBD½
0xBE¾
0xBF¿
0xD7×
0xD7×
0xF7÷
0xF7÷