From 4ca1c88ab76c1469518d6583f0e53880698a6e94 Mon Sep 17 00:00:00 2001 From: "masayuki%d-toybox.com" Date: Thu, 13 Jul 2006 17:42:39 +0000 Subject: [PATCH] Bug 255990 Characters below U+0100 are not subject to line-breaking rules at all. the patch based on jshin's patch. r=jshin, sr=roc --- intl/lwbrk/public/nsILineBreaker.h | 10 +- intl/lwbrk/src/jisx4501class.h | 2 +- intl/lwbrk/src/nsJISx4501LineBreaker.cpp | 130 +++++++++-------------- intl/lwbrk/src/nsJISx4501LineBreaker.h | 3 + intl/lwbrk/tools/jisx4501class.txt | 1 + layout/generic/nsTextTransformer.cpp | 9 +- 6 files changed, 73 insertions(+), 82 deletions(-) diff --git a/intl/lwbrk/public/nsILineBreaker.h b/intl/lwbrk/public/nsILineBreaker.h index 85078823ae3..2c1a8658f43 100644 --- a/intl/lwbrk/public/nsILineBreaker.h +++ b/intl/lwbrk/public/nsILineBreaker.h @@ -43,10 +43,10 @@ #define NS_LINEBREAKER_NEED_MORE_TEXT -1 -// {E86B3375-BF89-11d2-B3AF-00805F8A6670} +// {7509772F-770C-44e8-AAFA-8032E5A35370} #define NS_ILINEBREAKER_IID \ -{ 0xe86b3375, 0xbf89, 0x11d2, \ - { 0xb3, 0xaf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } } +{ 0x7509772f, 0x770c, 0x44e8, \ + { 0xaa, 0xfa, 0x80, 0x32, 0xe5, 0xa3, 0x53, 0x70 } } class nsILineBreaker : public nsISupports @@ -57,6 +57,10 @@ public: const PRUnichar* aText2 , PRUint32 aTextLen2) = 0; + virtual PRBool CanBreakBetweenLatin1(PRUnichar aChar1, + PRUnichar aChar2) = 0; + + virtual PRInt32 Next( const PRUnichar* aText, PRUint32 aLen, PRUint32 aPos) = 0; diff --git a/intl/lwbrk/src/jisx4501class.h b/intl/lwbrk/src/jisx4501class.h index 470671d9855..a82b098d18e 100644 --- a/intl/lwbrk/src/jisx4501class.h +++ b/intl/lwbrk/src/jisx4501class.h @@ -44,7 +44,7 @@ static const PRUint32 gLBClass00[32] = { 0x55555555, // U+0010 - U+0017 0x55555555, // U+0018 - U+001F 0x88438815, // U+0020 - U+0027 -0x81515810, // U+0028 - U+002F +0x11515810, // U+0028 - U+002F 0x66666666, // U+0030 - U+0037 0x11501166, // U+0038 - U+003F 0x88888888, // U+0040 - U+0047 diff --git a/intl/lwbrk/src/nsJISx4501LineBreaker.cpp b/intl/lwbrk/src/nsJISx4501LineBreaker.cpp index 15cf6505d10..e82fd552663 100644 --- a/intl/lwbrk/src/nsJISx4501LineBreaker.cpp +++ b/intl/lwbrk/src/nsJISx4501LineBreaker.cpp @@ -350,12 +350,19 @@ nsJISx4051LineBreaker::~nsJISx4051LineBreaker() NS_IMPL_ISUPPORTS1(nsJISx4051LineBreaker, nsILineBreaker) -#define U_PERIOD ((PRUnichar) '.') -#define U_COMMA ((PRUnichar) ',') -#define U_SPACE ((PRUnichar) ' ') -#define U_RIGHT_SINGLE_QUOTATION_MARK ((PRUnichar) 0x2019) +#define U_PERIOD PRUnichar('.') +#define U_COMMA PRUnichar(',') +#define U_COLON PRUnichar(':') +#define U_SEMICOLON PRUnichar(';') +#define U_SLASH PRUnichar('/') +#define U_SPACE PRUnichar(' ') +#define U_NULL PRUnichar(0x0000) +#define U_RIGHT_SINGLE_QUOTATION_MARK PRUnichar(0x2019) #define NEED_CONTEXTUAL_ANALYSIS(c) ((c) == U_PERIOD || \ (c) == U_COMMA || \ + (c) == U_COLON || \ + (c) == U_SEMICOLON || \ + (c) == U_SLASH || \ (c) == U_RIGHT_SINGLE_QUOTATION_MARK) #define NUMERIC_CLASS 6 // JIS x4051 class 15 is now map to simplified class 6 #define CHARACTER_CLASS 8 // JIS x4051 class 18 is now map to simplified class 8 @@ -365,17 +372,17 @@ PRInt8 nsJISx4051LineBreaker::ContextualAnalysis( PRUnichar prev, PRUnichar cur, PRUnichar next ) { - if(U_COMMA == cur) + if(U_COMMA == cur || U_COLON == cur || U_SEMICOLON == cur) { - if(IS_ASCII_DIGIT (prev) && IS_ASCII_DIGIT (next)) + if((IS_ASCII_DIGIT(prev) || prev == U_NULL) && IS_ASCII_DIGIT(next)) return NUMERIC_CLASS; } else if(U_PERIOD == cur) { - if((IS_ASCII_DIGIT (prev) || (0x0020 == prev)) && - IS_ASCII_DIGIT (next)) + if((IS_ASCII_DIGIT(prev) || prev == U_SPACE || prev == U_NULL) && + IS_ASCII_DIGIT(next)) return NUMERIC_CLASS; - + // By assigning a full stop character class only when it's followed by // class 6 (numeric), 7, and 8 (character). Note that class 9 (Thai) // doesn't matter, either way, we prevent lines from breaking around @@ -387,6 +394,12 @@ PRInt8 nsJISx4051LineBreaker::ContextualAnalysis( if((pc > 5 || pc == 0) && GetClass(next) > 5) return CHARACTER_CLASS; } + else if(U_SLASH == cur) + { + // We don't need to check prev character. Because SLASH breaks only after. + if (IS_ASCII_DIGIT(next)) + return NUMERIC_CLASS; + } else if(U_RIGHT_SINGLE_QUOTATION_MARK == cur) { // somehow people use this as ' in "it's" sometimes... @@ -396,6 +409,25 @@ PRInt8 nsJISx4051LineBreaker::ContextualAnalysis( return this->GetClass(cur); } +PRBool nsJISx4051LineBreaker::CanBreakBetweenLatin1(PRUnichar aChar1, + PRUnichar aChar2) +{ + NS_ASSERTION(aChar1 < 256 && aChar2 < 256, "invalid input"); + + PRInt8 c1, c2; + if(NEED_CONTEXTUAL_ANALYSIS(aChar1)) + c1 = this->ContextualAnalysis(U_NULL, aChar1, aChar2); + else + c1 = this->GetClass(aChar1); + + if(NEED_CONTEXTUAL_ANALYSIS(aChar2)) + c2 = this->ContextualAnalysis(aChar1, aChar2, U_NULL); + else + c2 = this->GetClass(aChar2); + + return GetPair(c1, c2); +} + PRBool nsJISx4051LineBreaker::BreakInBetween( const PRUnichar* aText1 , PRUint32 aTextLen1, @@ -408,34 +440,9 @@ PRBool nsJISx4051LineBreaker::BreakInBetween( return PR_FALSE; } - //search for CJK characters until a space is found. - //if CJK char is found before space, use 4051, otherwise western - PRInt32 cur; - - for (cur= aTextLen1-1; cur>=0; cur--) - { - if (IS_SPACE(aText1[cur])) - break; - if (IS_CJK_CHAR(aText1[cur])) - goto ROUTE_CJK_BETWEEN; - } - - for (cur= 0; cur < (PRInt32)aTextLen2; cur++) - { - if (IS_SPACE(aText2[cur])) - break; - if (IS_CJK_CHAR(aText2[cur])) - goto ROUTE_CJK_BETWEEN; - } - - //now apply western rule. - return IS_SPACE(aText1[aTextLen1-1]) || IS_SPACE(aText2[0]); - -ROUTE_CJK_BETWEEN: - PRInt8 c1, c2; if(NEED_CONTEXTUAL_ANALYSIS(aText1[aTextLen1-1])) - c1 = this->ContextualAnalysis((aTextLen1>1)?aText1[aTextLen1-2]:0, + c1 = this->ContextualAnalysis((aTextLen1>1)?aText1[aTextLen1-2]:U_NULL, aText1[aTextLen1-1], aText2[0]); else @@ -444,7 +451,7 @@ ROUTE_CJK_BETWEEN: if(NEED_CONTEXTUAL_ANALYSIS(aText2[0])) c2 = this->ContextualAnalysis(aText1[aTextLen1-1], aText2[0], - (aTextLen2>1)?aText2[1]:0); + (aTextLen2>1)?aText2[1]:U_NULL); else c2 = this->GetClass(aText2[0]); @@ -466,26 +473,13 @@ PRInt32 nsJISx4051LineBreaker::Next( NS_ASSERTION(aText, "aText shouldn't be null"); NS_ASSERTION(aLen > aPos, "Illegal value (length > position)"); - //forward check for CJK characters until a space is found. - //if CJK char is found before space, use 4051, otherwise western - PRUint32 cur; - for (cur = aPos; cur < aLen; ++cur) - { - if (IS_SPACE(aText[cur])) - return cur; - if (IS_CJK_CHAR(aText[cur])) - goto ROUTE_CJK_NEXT; - } - return NS_LINEBREAKER_NEED_MORE_TEXT; // Need more text - -ROUTE_CJK_NEXT: PRInt8 c1, c2; - cur = aPos; + PRUint32 cur = aPos; if(NEED_CONTEXTUAL_ANALYSIS(aText[cur])) { - c1 = this->ContextualAnalysis((cur>0)?aText[cur-1]:0, + c1 = this->ContextualAnalysis((cur>0)?aText[cur-1]:U_NULL, aText[cur], - (cur<(aLen-1)) ?aText[cur+1]:0); + (cur<(aLen-1)) ?aText[cur+1]:U_NULL); } else { c1 = this->GetClass(aText[cur]); } @@ -497,9 +491,9 @@ ROUTE_CJK_NEXT: { if(NEED_CONTEXTUAL_ANALYSIS(aText[cur])) { - c2= this->ContextualAnalysis((cur>0)?aText[cur-1]:0, + c2= this->ContextualAnalysis((cur>0)?aText[cur-1]:U_NULL, aText[cur], - (cur<(aLen-1)) ?aText[cur+1]:0); + (cur<(aLen-1)) ?aText[cur+1]:U_NULL); } else { c2 = this->GetClass(aText[cur]); } @@ -517,31 +511,13 @@ PRInt32 nsJISx4051LineBreaker::Prev( { NS_ASSERTION(aText, "aText shouldn't be null"); - //backward check for CJK characters until a space is found. - //if CJK char is found before space, use 4051, otherwise western - PRUint32 cur; - for (cur = aPos - 1; cur > 0; --cur) - { - if (IS_SPACE(aText[cur])) - { - if (cur != aPos - 1) // XXXldb Why? - ++cur; - return cur; - } - if (IS_CJK_CHAR(aText[cur])) - goto ROUTE_CJK_PREV; - } - - return NS_LINEBREAKER_NEED_MORE_TEXT; // Need more text - -ROUTE_CJK_PREV: - cur = aPos; + PRUint32 cur = aPos; PRInt8 c1, c2; if(NEED_CONTEXTUAL_ANALYSIS(aText[cur-1])) { - c2 = this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:0, + c2 = this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:U_NULL, aText[cur-1], - (curGetClass(aText[cur-1]); } @@ -553,9 +529,9 @@ ROUTE_CJK_PREV: { if(NEED_CONTEXTUAL_ANALYSIS(aText[cur-1])) { - c1= this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:0, + c1= this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:U_NULL, aText[cur-1], - (curGetClass(aText[cur-1]); } diff --git a/intl/lwbrk/src/nsJISx4501LineBreaker.h b/intl/lwbrk/src/nsJISx4501LineBreaker.h index 6ad374f200e..b1a58d5c550 100644 --- a/intl/lwbrk/src/nsJISx4501LineBreaker.h +++ b/intl/lwbrk/src/nsJISx4501LineBreaker.h @@ -48,6 +48,9 @@ public: nsJISx4051LineBreaker(); virtual ~nsJISx4051LineBreaker(); + PRBool CanBreakBetweenLatin1(PRUnichar aChar1, + PRUnichar aChar2); + PRBool BreakInBetween( const PRUnichar* aText1 , PRUint32 aTextLen1, const PRUnichar* aText2 , PRUint32 aTextLen2); diff --git a/intl/lwbrk/tools/jisx4501class.txt b/intl/lwbrk/tools/jisx4501class.txt index 3a7125cea8a..c94d0d9a955 100644 --- a/intl/lwbrk/tools/jisx4501class.txt +++ b/intl/lwbrk/tools/jisx4501class.txt @@ -1,4 +1,5 @@ 0028;;1 +002F;;2 005B;;1 007B;;1 2018;;1 diff --git a/layout/generic/nsTextTransformer.cpp b/layout/generic/nsTextTransformer.cpp index fc17f13b81f..07ef3cb1f8d 100644 --- a/layout/generic/nsTextTransformer.cpp +++ b/layout/generic/nsTextTransformer.cpp @@ -348,8 +348,11 @@ nsTextTransformer::ScanNormalAsciiText_F(PRInt32* aWordLen, bp2 += mBufferPos; } + PRUnichar prevCh; + PRUnichar ch = 0; for (; offset < fragLen; offset++) { - unsigned char ch = *cp++; + prevCh = (ch == ' ') ? CH_NBSP : ch; + ch = *cp++; if (XP_IS_SPACE(ch)) { break; } @@ -357,6 +360,10 @@ nsTextTransformer::ScanNormalAsciiText_F(PRInt32* aWordLen, ch = ' '; *aWasTransformed = PR_TRUE; } + else if (offset != mOffset && + nsContentUtils::LineBreaker()->CanBreakBetweenLatin1(prevCh, ch)) { + break; + } else if (IS_DISCARDED(ch)) { // Strip discarded characters from the transformed output continue;