From a13479d95487de73a8d084f12a255470f6251931 Mon Sep 17 00:00:00 2001 From: "masayuki@d-toybox.com" Date: Thu, 12 Jul 2007 08:52:51 -0700 Subject: [PATCH] Bug 255990 Characters below U+0100 are not subject to line-breaking rules at all r+sr=roc --- content/base/public/nsLineBreaker.h | 11 +- content/base/src/nsLineBreaker.cpp | 27 +++- intl/lwbrk/public/nsILineBreaker.h | 8 +- intl/lwbrk/src/jisx4501class.h | 24 ++-- intl/lwbrk/src/nsJISx4501LineBreaker.cpp | 91 +++++++++--- intl/lwbrk/src/nsJISx4501LineBreaker.h | 2 + intl/lwbrk/tools/anzx4501.html | 170 +++++++++++------------ intl/lwbrk/tools/jisx4501class.txt | 39 +++--- 8 files changed, 221 insertions(+), 151 deletions(-) diff --git a/content/base/public/nsLineBreaker.h b/content/base/public/nsLineBreaker.h index 063ebc04674..44990ea8531 100644 --- a/content/base/public/nsLineBreaker.h +++ b/content/base/public/nsLineBreaker.h @@ -86,9 +86,18 @@ public: return u == 0x0020 || u == 0x200b/*ZWSP*/ || u == '\n' || u == '\t'; } + static inline PRBool IsComplexASCIIChar(PRUnichar u) + { + return !((0x0030 <= u && u <= 0x0039) || + (0x0041 <= u && u <= 0x005A) || + (0x0061 <= u && u <= 0x007A)); + } + static inline PRBool IsComplexChar(PRUnichar u) { - return (0x1100 <= u && u <= 0x11ff) || + return IsComplexASCIIChar(u) || + (0x1100 <= u && u <= 0x11ff) || + (0x2000 <= u && u <= 0x21ff) || (0x2e80 <= u && u <= 0xd7ff) || (0xf900 <= u && u <= 0xfaff) || (0xff00 <= u && u <= 0xffef); diff --git a/content/base/src/nsLineBreaker.cpp b/content/base/src/nsLineBreaker.cpp index 94714012d9d..9b68add3d6d 100644 --- a/content/base/src/nsLineBreaker.cpp +++ b/content/base/src/nsLineBreaker.cpp @@ -214,6 +214,10 @@ nsLineBreaker::AppendText(nsIAtom* aLangGroup, const PRUint8* aText, PRUint32 aL while (offset < aLength && !IsSpace(aText[offset])) { mCurrentWord.AppendElement(aText[offset]); + if (!mCurrentWordContainsComplexChar && + IsComplexASCIIChar(aText[offset])) { + mCurrentWordContainsComplexChar = PR_TRUE; + } ++offset; } @@ -249,6 +253,7 @@ nsLineBreaker::AppendText(nsIAtom* aLangGroup, const PRUint8* aText, PRUint32 aL } } PRUint32 wordStart = offset; + PRBool wordHasComplexChar = PR_FALSE; for (;;) { PRUint8 ch = aText[offset]; @@ -261,17 +266,31 @@ nsLineBreaker::AppendText(nsIAtom* aLangGroup, const PRUint8* aText, PRUint32 aL mAfterSpace = isSpace; if (isSpace) { - // The current word can't have any complex characters inside it - // because this is 8-bit text, so just ignore it + if (offset > wordStart && wordHasComplexChar) { + if (aFlags & BREAK_ALLOW_INSIDE) { + // Save current start-of-word state because GetJISx4051Breaks will + // set it to false + PRPackedBool currentStart = breakState[wordStart]; + nsContentUtils::LineBreaker()-> + GetJISx4051Breaks(aText + wordStart, offset - wordStart, + breakState.Elements() + wordStart); + breakState[wordStart] = currentStart; + } + wordHasComplexChar = PR_FALSE; + } + ++offset; if (offset >= aLength) break; wordStart = offset; } else { + if (!wordHasComplexChar && IsComplexASCIIChar(ch)) { + wordHasComplexChar = PR_TRUE; + } ++offset; if (offset >= aLength) { // Save this word - mCurrentWordContainsComplexChar = PR_FALSE; + mCurrentWordContainsComplexChar = wordHasComplexChar; PRUint32 len = offset - wordStart; PRUnichar* elems = mCurrentWord.AppendElements(len); if (!elems) @@ -285,8 +304,6 @@ nsLineBreaker::AppendText(nsIAtom* aLangGroup, const PRUint8* aText, PRUint32 aL offset = wordStart + 1; break; } - // We can't break inside words in 8-bit text (no complex characters), so - // there is no need to do anything else to handle words } } diff --git a/intl/lwbrk/public/nsILineBreaker.h b/intl/lwbrk/public/nsILineBreaker.h index 51e9a10d9ed..cd2a48d3b91 100644 --- a/intl/lwbrk/public/nsILineBreaker.h +++ b/intl/lwbrk/public/nsILineBreaker.h @@ -43,10 +43,10 @@ #define NS_LINEBREAKER_NEED_MORE_TEXT -1 -// {c3d9f25f-7cea-4a76-a08f-05c431353448} +// {C9C5938E-70EF-4db2-ADEE-E7B2CCFBBEE6} #define NS_ILINEBREAKER_IID \ -{ 0xc3d9f25f, 0x7cea, 0x4a76, \ - { 0xa0, 0x8f, 0x05, 0xc4, 0x31, 0x35, 0x34, 0x48 } } +{ 0xc9c5938e, 0x70ef, 0x4db2, \ + { 0xad, 0xee, 0xe7, 0xb2, 0xcc, 0xfb, 0xbe, 0xe6 } } class nsILineBreaker : public nsISupports { @@ -70,6 +70,8 @@ public: // output array. virtual void GetJISx4051Breaks(const PRUnichar* aText, PRUint32 aLength, PRPackedBool* aBreakBefore) = 0; + virtual void GetJISx4051Breaks(const PRUint8* aText, PRUint32 aLength, + PRPackedBool* aBreakBefore) = 0; }; NS_DEFINE_STATIC_IID_ACCESSOR(nsILineBreaker, NS_ILINEBREAKER_IID) diff --git a/intl/lwbrk/src/jisx4501class.h b/intl/lwbrk/src/jisx4501class.h index 470671d9855..a8f49372613 100644 --- a/intl/lwbrk/src/jisx4501class.h +++ b/intl/lwbrk/src/jisx4501class.h @@ -22,8 +22,8 @@ * Contributor(s): * * Alternatively, the contents of this file may be used under the terms of - * either of the GNU General Public License Version 2 or later (the "GPL"), - * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), * in which case the provisions of the GPL or the LGPL are applicable instead * of those above. If you wish to allow use of your version of this file only * under the terms of either the GPL or the LGPL, and not to allow others to @@ -43,14 +43,14 @@ static const PRUint32 gLBClass00[32] = { 0x55555555, // U+0008 - U+000F 0x55555555, // U+0010 - U+0017 0x55555555, // U+0018 - U+001F -0x88438815, // U+0020 - U+0027 -0x81515810, // U+0028 - U+002F +0x88108815, // U+0020 - U+0027 +0x11118810, // U+0028 - U+002F 0x66666666, // U+0030 - U+0037 -0x11501166, // U+0038 - U+003F +0x11101866, // U+0038 - U+003F 0x88888888, // U+0040 - U+0047 0x88888888, // U+0048 - U+004F 0x88888888, // U+0050 - U+0057 -0x88130888, // U+0058 - U+005F +0x88100888, // U+0058 - U+005F 0x88888888, // U+0060 - U+0067 0x88888888, // U+0068 - U+006F 0x88888888, // U+0070 - U+0077 @@ -59,17 +59,17 @@ static const PRUint32 gLBClass00[32] = { 0x88888888, // U+0088 - U+008F 0x88888888, // U+0090 - U+0097 0x88888888, // U+0098 - U+009F -0x58383488, // U+00A0 - U+00A7 -0x85888858, // U+00A8 - U+00AF -0x85888854, // U+00B0 - U+00B7 +0x88383488, // U+00A0 - U+00A7 +0x88888888, // U+00A8 - U+00AF +0x88888881, // U+00B0 - U+00B7 0x88888888, // U+00B8 - U+00BF 0x88888888, // U+00C0 - U+00C7 0x88888888, // U+00C8 - U+00CF -0x58888888, // U+00D0 - U+00D7 +0x88888888, // U+00D0 - U+00D7 0x88888888, // U+00D8 - U+00DF 0x88888888, // U+00E0 - U+00E7 0x88888888, // U+00E8 - U+00EF -0x58888888, // U+00F0 - U+00F7 +0x88888888, // U+00F0 - U+00F7 0x88888888, // U+00F8 - U+00FF }; @@ -77,7 +77,7 @@ static const PRUint32 gLBClass20[32] = { 0x55555555, // U+2000 - U+2007 0x88885555, // U+2008 - U+200F 0x88828888, // U+2010 - U+2017 -0x01100110, // U+2018 - U+201F +0x88888888, // U+2018 - U+201F 0x81118888, // U+2020 - U+2027 0x88888888, // U+2028 - U+202F 0x88884444, // U+2030 - U+2037 diff --git a/intl/lwbrk/src/nsJISx4501LineBreaker.cpp b/intl/lwbrk/src/nsJISx4501LineBreaker.cpp index ff41b103878..bce7c4edfc8 100644 --- a/intl/lwbrk/src/nsJISx4501LineBreaker.cpp +++ b/intl/lwbrk/src/nsJISx4501LineBreaker.cpp @@ -349,12 +349,21 @@ nsJISx4051LineBreaker::~nsJISx4051LineBreaker() NS_IMPL_ISUPPORTS1(nsJISx4051LineBreaker, nsILineBreaker) -#define U_PERIOD ((PRUnichar) '.') -#define U_COMMA ((PRUnichar) ',') -#define U_SPACE ((PRUnichar) ' ') -#define U_RIGHT_SINGLE_QUOTATION_MARK ((PRUnichar) 0x2019) +#define U_PERIOD PRUnichar('.') +#define U_COMMA PRUnichar(',') +#define U_SEMICOLON PRUnichar(';') +#define U_SLASH PRUnichar('/') +#define U_SPACE PRUnichar(' ') +#define U_HYPHEN PRUnichar('-') +#define U_EQUAL PRUnichar('=') +#define U_NULL PRUnichar(0x0000) +#define U_RIGHT_SINGLE_QUOTATION_MARK PRUnichar(0x2019) #define NEED_CONTEXTUAL_ANALYSIS(c) ((c) == U_PERIOD || \ (c) == U_COMMA || \ + (c) == U_SEMICOLON || \ + (c) == U_SLASH || \ + (c) == U_HYPHEN || \ + (c) == U_EQUAL || \ (c) == U_RIGHT_SINGLE_QUOTATION_MARK) #define NUMERIC_CLASS 6 // JIS x4051 class 15 is now map to simplified class 6 #define CHARACTER_CLASS 8 // JIS x4051 class 18 is now map to simplified class 8 @@ -363,17 +372,17 @@ NS_IMPL_ISUPPORTS1(nsJISx4051LineBreaker, nsILineBreaker) static PRInt8 ContextualAnalysis( PRUnichar prev, PRUnichar cur, PRUnichar next) { - if(U_COMMA == cur) + if(U_COMMA == cur || U_SEMICOLON == cur) { - if(IS_ASCII_DIGIT (prev) && IS_ASCII_DIGIT (next)) + if((IS_ASCII_DIGIT(prev) || prev == U_NULL) && IS_ASCII_DIGIT(next)) return NUMERIC_CLASS; } else if(U_PERIOD == cur) { - if((IS_ASCII_DIGIT (prev) || (0x0020 == prev)) && - IS_ASCII_DIGIT (next)) + if((IS_ASCII_DIGIT(prev) || prev == U_SPACE || prev == U_NULL) && + IS_ASCII_DIGIT(next)) return NUMERIC_CLASS; - + // By assigning a full stop character class only when it's followed by // class 6 (numeric), 7, and 8 (character). Note that class 9 (Thai) // doesn't matter, either way, we prevent lines from breaking around @@ -381,10 +390,18 @@ static PRInt8 ContextualAnalysis( // followed by CJK characters. With an additional condition of it being // preceded by class 0 or class > 5, we make sure that it does not // start a line (see bug 164759). - PRUint8 pc = GetClass(prev); + PRUint8 pc = prev != U_NULL ? GetClass(prev) : CHARACTER_CLASS; if((pc > 5 || pc == 0) && GetClass(next) > 5) return CHARACTER_CLASS; } + else if(U_SLASH == cur || U_HYPHEN == cur || U_EQUAL == cur) + { + // if slash is a first character, don't break at this point (e.g., "/root") + if (U_SLASH == cur && prev == U_NULL) + return CHARACTER_CLASS; + if (IS_ASCII_DIGIT(next)) + return NUMERIC_CLASS; + } else if(U_RIGHT_SINGLE_QUOTATION_MARK == cur) { // somehow people use this as ' in "it's" sometimes... @@ -433,7 +450,7 @@ ROUTE_CJK_BETWEEN: PRInt8 c1, c2; if(NEED_CONTEXTUAL_ANALYSIS(aText1[aTextLen1-1])) - c1 = ContextualAnalysis((aTextLen1>1)?aText1[aTextLen1-2]:0, + c1 = ContextualAnalysis((aTextLen1>1)?aText1[aTextLen1-2]:U_NULL, aText1[aTextLen1-1], aText2[0]); else @@ -442,7 +459,7 @@ ROUTE_CJK_BETWEEN: if(NEED_CONTEXTUAL_ANALYSIS(aText2[0])) c2 = ContextualAnalysis(aText1[aTextLen1-1], aText2[0], - (aTextLen2>1)?aText2[1]:0); + (aTextLen2>1)?aText2[1]:U_NULL); else c2 = GetClass(aText2[0]); @@ -481,9 +498,9 @@ ROUTE_CJK_NEXT: cur = aPos; if(NEED_CONTEXTUAL_ANALYSIS(aText[cur])) { - c1 = ContextualAnalysis((cur>0)?aText[cur-1]:0, + c1 = ContextualAnalysis((cur>0)?aText[cur-1]:U_NULL, aText[cur], - (cur<(aLen-1)) ?aText[cur+1]:0); + (cur<(aLen-1)) ?aText[cur+1]:U_NULL); } else { c1 = GetClass(aText[cur]); } @@ -495,9 +512,9 @@ ROUTE_CJK_NEXT: { if(NEED_CONTEXTUAL_ANALYSIS(aText[cur])) { - c2 = ContextualAnalysis((cur>0)?aText[cur-1]:0, + c2 = ContextualAnalysis((cur>0)?aText[cur-1]:U_NULL, aText[cur], - (cur<(aLen-1)) ?aText[cur+1]:0); + (cur<(aLen-1)) ?aText[cur+1]:U_NULL); } else { c2 = GetClass(aText[cur]); } @@ -537,9 +554,9 @@ ROUTE_CJK_PREV: PRInt8 c1, c2; if(NEED_CONTEXTUAL_ANALYSIS(aText[cur-1])) { - c2 = ContextualAnalysis(((cur-1)>0)?aText[cur-2]:0, + c2 = ContextualAnalysis(((cur-1)>0)?aText[cur-2]:U_NULL, aText[cur-1], - (cur0)?aText[cur-2]:0, + c1 = ContextualAnalysis(((cur-1)>0)?aText[cur-2]:U_NULL, aText[cur-1], - (cur 0 ? aChars[cur - 1] : 0, + cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL, ch, - cur + 1 < aLength ? aChars[cur + 1] : 0); + cur + 1 < aLength ? aChars[cur + 1] : U_NULL); } else { cl = GetClass(ch); } @@ -599,3 +616,33 @@ nsJISx4051LineBreaker::GetJISx4051Breaks(const PRUnichar* aChars, PRUint32 aLeng lastClass = cl; } } + +void +nsJISx4051LineBreaker::GetJISx4051Breaks(const PRUint8* aChars, PRUint32 aLength, + PRPackedBool* aBreakBefore) +{ + PRUint32 cur; + PRInt8 lastClass = -1; + + for (cur = 0; cur < aLength; ++cur) { + PRUnichar ch = aChars[cur]; + PRInt8 cl; + + if (NEED_CONTEXTUAL_ANALYSIS(ch)) { + cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL, + ch, + cur + 1 < aLength ? aChars[cur + 1] : U_NULL); + } else { + cl = GetClass(ch); + } + + PRBool allowBreak; + if (cur > 0) { + allowBreak = GetPair(lastClass, cl); + } else { + allowBreak = PR_FALSE; + } + aBreakBefore[cur] = allowBreak; + lastClass = cl; + } +} diff --git a/intl/lwbrk/src/nsJISx4501LineBreaker.h b/intl/lwbrk/src/nsJISx4501LineBreaker.h index 51cdc250c43..764c9087b37 100644 --- a/intl/lwbrk/src/nsJISx4501LineBreaker.h +++ b/intl/lwbrk/src/nsJISx4501LineBreaker.h @@ -57,6 +57,8 @@ public: virtual void GetJISx4051Breaks(const PRUnichar* aText, PRUint32 aLength, PRPackedBool* aBreakBefore); + virtual void GetJISx4051Breaks(const PRUint8* aText, PRUint32 aLength, + PRPackedBool* aBreakBefore); }; #endif /* nsJISx4501LineBreaker_h__ */ diff --git a/intl/lwbrk/tools/anzx4501.html b/intl/lwbrk/tools/anzx4501.html index f416346d52c..327ecc19c6d 100644 --- a/intl/lwbrk/tools/anzx4501.html +++ b/intl/lwbrk/tools/anzx4501.html @@ -53,10 +53,10 @@ Analysis of JIS X 4051 to Unicode General Category Mapping -17 +14 +2 - -17 +16 @@ -76,12 +76,12 @@ Analysis of JIS X 4051 to Unicode General Category Mapping -4 +1 13 +1 - - +1 @@ -92,10 +92,10 @@ Analysis of JIS X 4051 to Unicode General Category Mapping 27 2 -29 -3 +30 +6 -61 +65 @@ -111,30 +111,30 @@ Analysis of JIS X 4051 to Unicode General Category Mapping -1 -1 + +2 +14 + + 14 -2 -9 -2 2 +3 1 - 02_7 -4 +1 -4 +1 @@ -155,7 +155,7 @@ Analysis of JIS X 4051 to Unicode General Category Mapping -3 + @@ -171,33 +171,33 @@ Analysis of JIS X 4051 to Unicode General Category Mapping -4 - -4 - - - - - - - - - - - - - - - - - - - - - - 3 +3 + + + + + + + + + + + + + + + + + + + + + + +2 + 1 @@ -209,10 +209,10 @@ Analysis of JIS X 4051 to Unicode General Category Mapping -5 -2 +4 +1 -7 +5 @@ -233,27 +233,27 @@ Analysis of JIS X 4051 to Unicode General Category Mapping -5 +4 1 -1 + 05_[b] -32 +33 154 53 -3 -316 2 +305 +13 560 32 - +1 @@ -268,7 +268,7 @@ Analysis of JIS X 4051 to Unicode General Category Mapping 33 20 -1 + @@ -276,11 +276,11 @@ Analysis of JIS X 4051 to Unicode General Category Mapping -7 -309 + +305 -2 +13 06_15 @@ -361,23 +361,23 @@ Analysis of JIS X 4051 to Unicode General Category Mapping 08_18 -9 -661 +10 +660 4 130 -51 -929 -14 -1798 +55 +940 +2 +1801 -9 +10 -370 +367 1 5 -285 +287 4 @@ -385,19 +385,19 @@ Analysis of JIS X 4051 to Unicode General Category Mapping 3 127 3 +5 +3 +4 6 -3 -2 -2 -32 -3 +29 +5 12 -8 -258 -651 +10 +273 +645 1 1 -12 + 09_nbsp @@ -493,15 +493,15 @@ Analysis of JIS X 4051 to Unicode General Category Mapping X 00 -3 -9 +6 +14 -3 -3 -45 +2 +1 +33 10 -149 +156 @@ -558,15 +558,15 @@ Analysis of JIS X 4051 to Unicode General Category Mapping 20 -4 -6 -4 -4 +5 1 +4 +13 -90 + +86 diff --git a/intl/lwbrk/tools/jisx4501class.txt b/intl/lwbrk/tools/jisx4501class.txt index 3a7125cea8a..e0de2707f40 100644 --- a/intl/lwbrk/tools/jisx4501class.txt +++ b/intl/lwbrk/tools/jisx4501class.txt @@ -1,10 +1,7 @@ 0028;;1 +002F;;2 005B;;1 007B;;1 -2018;;1 -201B;;1 -201C;;1 -201F;;1 3008;;1 300A;;1 300C;;1 @@ -19,10 +16,6 @@ 002C;;2 005D;;2 007D;;2 -2019;;2 -201A;;2 -201D;;2 -201E;;2 3001;;2 3009;;2 300B;;2 @@ -67,7 +60,7 @@ 30FE;;3 0021;;4 003F;;4 -003A;;5 +003A;;18 003B;;5 30FB;;5 002E;;6 @@ -76,14 +69,14 @@ 2024;;2 2025;;2 2026;;2 -0024;;8 -005C;;8 +0024;;1 +005C;;1 00A3;;8 00A5;;8 2116;;8 -0025;;9 +0025;;2 00A2;;9 -00B0;;9 +00B0;;2 2030;;9 2031;;9 2032;;9 @@ -91,18 +84,18 @@ 3000;;10 3042;3094;11 3099;309E;3 -002B;;12 -002D;;12 +002B;;18 +002D;;2 003C;;1 -003D;;12 +003D;;2 003E;;2 -00A7;;12 -00A9;;12 -00AE;;12 -00B1;;12 -00B6;;12 -00D7;;12 -00F7;;12 +00A7;;18 +00A9;;18 +00AE;;18 +00B1;;18 +00B6;;18 +00D7;;18 +00F7;;18 203B;;12 2160;217F;12 2190;21EA;a12