Bug 255990: Characters below U+0100 are not subject to line-breaking rules at all. Patch based on jshin's patch. r=jshin, sr=roc

This commit is contained in:
masayuki%d-toybox.com 2006-07-13 17:42:39 +00:00
Родитель 634828ca74
Коммит 421034d9e5
6 изменённых файлов: 73 добавлений и 82 удалений

Просмотреть файл

@ -43,10 +43,10 @@
#define NS_LINEBREAKER_NEED_MORE_TEXT -1
// {E86B3375-BF89-11d2-B3AF-00805F8A6670}
// {7509772F-770C-44e8-AAFA-8032E5A35370}
#define NS_ILINEBREAKER_IID \
{ 0xe86b3375, 0xbf89, 0x11d2, \
{ 0xb3, 0xaf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } }
{ 0x7509772f, 0x770c, 0x44e8, \
{ 0xaa, 0xfa, 0x80, 0x32, 0xe5, 0xa3, 0x53, 0x70 } }
class nsILineBreaker : public nsISupports
@ -57,6 +57,10 @@ public:
const PRUnichar* aText2 ,
PRUint32 aTextLen2) = 0;
virtual PRBool CanBreakBetweenLatin1(PRUnichar aChar1,
PRUnichar aChar2) = 0;
virtual PRInt32 Next( const PRUnichar* aText, PRUint32 aLen,
PRUint32 aPos) = 0;

Просмотреть файл

@ -44,7 +44,7 @@ static const PRUint32 gLBClass00[32] = {
0x55555555, // U+0010 - U+0017
0x55555555, // U+0018 - U+001F
0x88438815, // U+0020 - U+0027
0x81515810, // U+0028 - U+002F
0x11515810, // U+0028 - U+002F
0x66666666, // U+0030 - U+0037
0x11501166, // U+0038 - U+003F
0x88888888, // U+0040 - U+0047

Просмотреть файл

@ -350,12 +350,19 @@ nsJISx4051LineBreaker::~nsJISx4051LineBreaker()
NS_IMPL_ISUPPORTS1(nsJISx4051LineBreaker, nsILineBreaker)
#define U_PERIOD ((PRUnichar) '.')
#define U_COMMA ((PRUnichar) ',')
#define U_SPACE ((PRUnichar) ' ')
#define U_RIGHT_SINGLE_QUOTATION_MARK ((PRUnichar) 0x2019)
#define U_PERIOD PRUnichar('.')
#define U_COMMA PRUnichar(',')
#define U_COLON PRUnichar(':')
#define U_SEMICOLON PRUnichar(';')
#define U_SLASH PRUnichar('/')
#define U_SPACE PRUnichar(' ')
#define U_NULL PRUnichar(0x0000)
#define U_RIGHT_SINGLE_QUOTATION_MARK PRUnichar(0x2019)
// True when c is one of the characters whose line-break class is not fixed:
// period, comma, colon, semicolon, slash, or U+2019 (right single quotation
// mark). Callers route such characters through ContextualAnalysis() instead
// of GetClass().
#define NEED_CONTEXTUAL_ANALYSIS(c) ((c) == U_PERIOD || \
(c) == U_COMMA || \
(c) == U_COLON || \
(c) == U_SEMICOLON || \
(c) == U_SLASH || \
(c) == U_RIGHT_SINGLE_QUOTATION_MARK)
#define NUMERIC_CLASS 6 // JIS x4051 class 15 is now mapped to simplified class 6
#define CHARACTER_CLASS 8 // JIS x4051 class 18 is now mapped to simplified class 8
@ -365,17 +372,17 @@ PRInt8 nsJISx4051LineBreaker::ContextualAnalysis(
PRUnichar prev, PRUnichar cur, PRUnichar next
)
{
if(U_COMMA == cur)
if(U_COMMA == cur || U_COLON == cur || U_SEMICOLON == cur)
{
if(IS_ASCII_DIGIT (prev) && IS_ASCII_DIGIT (next))
if((IS_ASCII_DIGIT(prev) || prev == U_NULL) && IS_ASCII_DIGIT(next))
return NUMERIC_CLASS;
}
else if(U_PERIOD == cur)
{
if((IS_ASCII_DIGIT (prev) || (0x0020 == prev)) &&
IS_ASCII_DIGIT (next))
if((IS_ASCII_DIGIT(prev) || prev == U_SPACE || prev == U_NULL) &&
IS_ASCII_DIGIT(next))
return NUMERIC_CLASS;
// By assigning a full stop character class only when it's followed by
// class 6 (numeric), 7, and 8 (character). Note that class 9 (Thai)
// doesn't matter, either way, we prevent lines from breaking around
@ -387,6 +394,12 @@ PRInt8 nsJISx4051LineBreaker::ContextualAnalysis(
if((pc > 5 || pc == 0) && GetClass(next) > 5)
return CHARACTER_CLASS;
}
else if(U_SLASH == cur)
{
// We don't need to check prev character. Because SLASH breaks only after.
if (IS_ASCII_DIGIT(next))
return NUMERIC_CLASS;
}
else if(U_RIGHT_SINGLE_QUOTATION_MARK == cur)
{
// somehow people use this as ' in "it's" sometimes...
@ -396,6 +409,25 @@ PRInt8 nsJISx4051LineBreaker::ContextualAnalysis(
return this->GetClass(cur);
}
// Classifies two adjacent characters, both expected to be below U+0100, and
// returns GetPair() for the resulting pair of classes.
//
// Each character gets its class from ContextualAnalysis() when
// NEED_CONTEXTUAL_ANALYSIS() selects it, and from GetClass() otherwise.
// Only the two characters are available here, so U_NULL is passed as the
// character preceding aChar1 and as the character following aChar2.
PRBool nsJISx4051LineBreaker::CanBreakBetweenLatin1(PRUnichar aChar1,
                                                    PRUnichar aChar2)
{
  NS_ASSERTION(aChar1 < 256 && aChar2 < 256, "invalid input");

  // Class of the left-hand character; its only known neighbour is aChar2.
  const PRInt8 leftClass = NEED_CONTEXTUAL_ANALYSIS(aChar1)
    ? this->ContextualAnalysis(U_NULL, aChar1, aChar2)
    : this->GetClass(aChar1);

  // Class of the right-hand character; its only known neighbour is aChar1.
  const PRInt8 rightClass = NEED_CONTEXTUAL_ANALYSIS(aChar2)
    ? this->ContextualAnalysis(aChar1, aChar2, U_NULL)
    : this->GetClass(aChar2);

  return GetPair(leftClass, rightClass);
}
PRBool nsJISx4051LineBreaker::BreakInBetween(
const PRUnichar* aText1 , PRUint32 aTextLen1,
@ -408,34 +440,9 @@ PRBool nsJISx4051LineBreaker::BreakInBetween(
return PR_FALSE;
}
//search for CJK characters until a space is found.
//if CJK char is found before space, use 4051, otherwise western
PRInt32 cur;
for (cur= aTextLen1-1; cur>=0; cur--)
{
if (IS_SPACE(aText1[cur]))
break;
if (IS_CJK_CHAR(aText1[cur]))
goto ROUTE_CJK_BETWEEN;
}
for (cur= 0; cur < (PRInt32)aTextLen2; cur++)
{
if (IS_SPACE(aText2[cur]))
break;
if (IS_CJK_CHAR(aText2[cur]))
goto ROUTE_CJK_BETWEEN;
}
//now apply western rule.
return IS_SPACE(aText1[aTextLen1-1]) || IS_SPACE(aText2[0]);
ROUTE_CJK_BETWEEN:
PRInt8 c1, c2;
if(NEED_CONTEXTUAL_ANALYSIS(aText1[aTextLen1-1]))
c1 = this->ContextualAnalysis((aTextLen1>1)?aText1[aTextLen1-2]:0,
c1 = this->ContextualAnalysis((aTextLen1>1)?aText1[aTextLen1-2]:U_NULL,
aText1[aTextLen1-1],
aText2[0]);
else
@ -444,7 +451,7 @@ ROUTE_CJK_BETWEEN:
if(NEED_CONTEXTUAL_ANALYSIS(aText2[0]))
c2 = this->ContextualAnalysis(aText1[aTextLen1-1],
aText2[0],
(aTextLen2>1)?aText2[1]:0);
(aTextLen2>1)?aText2[1]:U_NULL);
else
c2 = this->GetClass(aText2[0]);
@ -466,26 +473,13 @@ PRInt32 nsJISx4051LineBreaker::Next(
NS_ASSERTION(aText, "aText shouldn't be null");
NS_ASSERTION(aLen > aPos, "Illegal value (length > position)");
//forward check for CJK characters until a space is found.
//if CJK char is found before space, use 4051, otherwise western
PRUint32 cur;
for (cur = aPos; cur < aLen; ++cur)
{
if (IS_SPACE(aText[cur]))
return cur;
if (IS_CJK_CHAR(aText[cur]))
goto ROUTE_CJK_NEXT;
}
return NS_LINEBREAKER_NEED_MORE_TEXT; // Need more text
ROUTE_CJK_NEXT:
PRInt8 c1, c2;
cur = aPos;
PRUint32 cur = aPos;
if(NEED_CONTEXTUAL_ANALYSIS(aText[cur]))
{
c1 = this->ContextualAnalysis((cur>0)?aText[cur-1]:0,
c1 = this->ContextualAnalysis((cur>0)?aText[cur-1]:U_NULL,
aText[cur],
(cur<(aLen-1)) ?aText[cur+1]:0);
(cur<(aLen-1)) ?aText[cur+1]:U_NULL);
} else {
c1 = this->GetClass(aText[cur]);
}
@ -497,9 +491,9 @@ ROUTE_CJK_NEXT:
{
if(NEED_CONTEXTUAL_ANALYSIS(aText[cur]))
{
c2= this->ContextualAnalysis((cur>0)?aText[cur-1]:0,
c2= this->ContextualAnalysis((cur>0)?aText[cur-1]:U_NULL,
aText[cur],
(cur<(aLen-1)) ?aText[cur+1]:0);
(cur<(aLen-1)) ?aText[cur+1]:U_NULL);
} else {
c2 = this->GetClass(aText[cur]);
}
@ -517,31 +511,13 @@ PRInt32 nsJISx4051LineBreaker::Prev(
{
NS_ASSERTION(aText, "aText shouldn't be null");
//backward check for CJK characters until a space is found.
//if CJK char is found before space, use 4051, otherwise western
PRUint32 cur;
for (cur = aPos - 1; cur > 0; --cur)
{
if (IS_SPACE(aText[cur]))
{
if (cur != aPos - 1) // XXXldb Why?
++cur;
return cur;
}
if (IS_CJK_CHAR(aText[cur]))
goto ROUTE_CJK_PREV;
}
return NS_LINEBREAKER_NEED_MORE_TEXT; // Need more text
ROUTE_CJK_PREV:
cur = aPos;
PRUint32 cur = aPos;
PRInt8 c1, c2;
if(NEED_CONTEXTUAL_ANALYSIS(aText[cur-1]))
{
c2 = this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:0,
c2 = this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:U_NULL,
aText[cur-1],
(cur<aLen) ?aText[cur]:0);
(cur<aLen) ?aText[cur]:U_NULL);
} else {
c2 = this->GetClass(aText[cur-1]);
}
@ -553,9 +529,9 @@ ROUTE_CJK_PREV:
{
if(NEED_CONTEXTUAL_ANALYSIS(aText[cur-1]))
{
c1= this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:0,
c1= this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:U_NULL,
aText[cur-1],
(cur<aLen) ?aText[cur]:0);
(cur<aLen) ?aText[cur]:U_NULL);
} else {
c1 = this->GetClass(aText[cur-1]);
}

Просмотреть файл

@ -48,6 +48,9 @@ public:
nsJISx4051LineBreaker();
virtual ~nsJISx4051LineBreaker();
PRBool CanBreakBetweenLatin1(PRUnichar aChar1,
PRUnichar aChar2);
PRBool BreakInBetween( const PRUnichar* aText1 , PRUint32 aTextLen1,
const PRUnichar* aText2 , PRUint32 aTextLen2);

Просмотреть файл

@ -1,4 +1,5 @@
0028;;1
002F;;2
005B;;1
007B;;1
2018;;1

Просмотреть файл

@ -348,8 +348,11 @@ nsTextTransformer::ScanNormalAsciiText_F(PRInt32* aWordLen,
bp2 += mBufferPos;
}
PRUnichar prevCh;
PRUnichar ch = 0;
for (; offset < fragLen; offset++) {
unsigned char ch = *cp++;
prevCh = (ch == ' ') ? CH_NBSP : ch;
ch = *cp++;
if (XP_IS_SPACE(ch)) {
break;
}
@ -357,6 +360,10 @@ nsTextTransformer::ScanNormalAsciiText_F(PRInt32* aWordLen,
ch = ' ';
*aWasTransformed = PR_TRUE;
}
else if (offset != mOffset &&
nsContentUtils::LineBreaker()->CanBreakBetweenLatin1(prevCh, ch)) {
break;
}
else if (IS_DISCARDED(ch)) {
// Strip discarded characters from the transformed output
continue;