Bug 255990 Characters below U+0100 are not subject to line-breaking rules at all. The patch is based on jshin's patch. r=jshin, sr=roc

2006-07-13 17:42:39 +00:00 · 2006-07-13 17:42:39 +00:00 · 4ca1c88ab7
--- a/intl/lwbrk/public/nsILineBreaker.h
+++ b/intl/lwbrk/public/nsILineBreaker.h
@ -43,10 +43,10 @@

 #define NS_LINEBREAKER_NEED_MORE_TEXT -1

-// {E86B3375-BF89-11d2-B3AF-00805F8A6670}
+// {7509772F-770C-44e8-AAFA-8032E5A35370}
 #define NS_ILINEBREAKER_IID \
-{ 0xe86b3375, 0xbf89, 0x11d2, \
-    { 0xb3, 0xaf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } }
+{ 0x7509772f, 0x770c, 0x44e8, \
+    { 0xaa, 0xfa, 0x80, 0x32, 0xe5, 0xa3, 0x53, 0x70 } }


 class nsILineBreaker : public nsISupports
@ -57,6 +57,10 @@ public:
                                 const PRUnichar* aText2 , 
                                 PRUint32 aTextLen2) = 0;

+  virtual PRBool CanBreakBetweenLatin1(PRUnichar aChar1,
+                                       PRUnichar aChar2) = 0; 
+
+
  virtual PRInt32 Next( const PRUnichar* aText, PRUint32 aLen, 
                        PRUint32 aPos) = 0;

--- a/intl/lwbrk/src/jisx4501class.h
+++ b/intl/lwbrk/src/jisx4501class.h
@ -44,7 +44,7 @@ static const PRUint32 gLBClass00[32] = {
 0x55555555, // U+0010 - U+0017
 0x55555555, // U+0018 - U+001F
 0x88438815, // U+0020 - U+0027
-0x81515810, // U+0028 - U+002F
+0x11515810, // U+0028 - U+002F
 0x66666666, // U+0030 - U+0037
 0x11501166, // U+0038 - U+003F
 0x88888888, // U+0040 - U+0047
--- a/intl/lwbrk/src/nsJISx4501LineBreaker.cpp
+++ b/intl/lwbrk/src/nsJISx4501LineBreaker.cpp
@ -350,12 +350,19 @@ nsJISx4051LineBreaker::~nsJISx4051LineBreaker()

 NS_IMPL_ISUPPORTS1(nsJISx4051LineBreaker, nsILineBreaker)

-#define U_PERIOD ((PRUnichar) '.')
-#define U_COMMA ((PRUnichar) ',')
-#define U_SPACE ((PRUnichar) ' ')
-#define U_RIGHT_SINGLE_QUOTATION_MARK ((PRUnichar) 0x2019)
+#define U_PERIOD    PRUnichar('.')
+#define U_COMMA     PRUnichar(',')
+#define U_COLON     PRUnichar(':')
+#define U_SEMICOLON PRUnichar(';')
+#define U_SLASH     PRUnichar('/')
+#define U_SPACE     PRUnichar(' ')
+#define U_NULL      PRUnichar(0x0000)
+#define U_RIGHT_SINGLE_QUOTATION_MARK PRUnichar(0x2019)
 #define NEED_CONTEXTUAL_ANALYSIS(c) ((c) == U_PERIOD || \
                                     (c) == U_COMMA || \
+                                     (c) == U_COLON || \
+                                     (c) == U_SEMICOLON || \
+                                     (c) == U_SLASH || \
                                     (c) == U_RIGHT_SINGLE_QUOTATION_MARK)
 #define NUMERIC_CLASS  6 // JIS x4051 class 15 is now map to simplified class 6
 #define CHARACTER_CLASS  8 // JIS x4051 class 18 is now map to simplified class 8
@ -365,17 +372,17 @@ PRInt8  nsJISx4051LineBreaker::ContextualAnalysis(
  PRUnichar prev, PRUnichar cur, PRUnichar next
 )
 {
-   if(U_COMMA == cur)
+   if(U_COMMA == cur || U_COLON == cur || U_SEMICOLON == cur)
   {
-     if(IS_ASCII_DIGIT (prev) && IS_ASCII_DIGIT (next))
+     if((IS_ASCII_DIGIT(prev) || prev == U_NULL) && IS_ASCII_DIGIT(next))
       return NUMERIC_CLASS;
   }
   else if(U_PERIOD == cur)
   {
-     if((IS_ASCII_DIGIT (prev) || (0x0020 == prev)) && 
-         IS_ASCII_DIGIT (next))
+     if((IS_ASCII_DIGIT(prev) || prev == U_SPACE || prev == U_NULL) &&
+        IS_ASCII_DIGIT(next))
       return NUMERIC_CLASS;
- 
+
     // By assigning a full stop  character class only when it's followed by
     // class 6 (numeric), 7, and 8 (character). Note that class 9 (Thai) 
     // doesn't matter, either way, we prevent lines from breaking around 
@ -387,6 +394,12 @@ PRInt8  nsJISx4051LineBreaker::ContextualAnalysis(
     if((pc > 5 || pc == 0)  && GetClass(next) > 5)
       return CHARACTER_CLASS;
   }
+   else if(U_SLASH == cur)
+   {
+     // We don't need to check the prev character, because SLASH breaks only after.
+     if (IS_ASCII_DIGIT(next))
+       return NUMERIC_CLASS;
+   }
   else if(U_RIGHT_SINGLE_QUOTATION_MARK == cur)
   {
     // somehow people use this as ' in "it's" sometimes...
@ -396,6 +409,25 @@ PRInt8  nsJISx4051LineBreaker::ContextualAnalysis(
   return this->GetClass(cur);
 }

+PRBool nsJISx4051LineBreaker::CanBreakBetweenLatin1(PRUnichar aChar1,
+                                                    PRUnichar aChar2)
+{ // Classifies aChar1 and aChar2, then returns GetPair() for the two classes.
+  NS_ASSERTION(aChar1 < 256 && aChar2 < 256, "invalid input");
+  // Only the two characters are given, so U_NULL is passed as the missing neighbor.
+  PRInt8 c1, c2;
+  if(NEED_CONTEXTUAL_ANALYSIS(aChar1))
+    c1 = this->ContextualAnalysis(U_NULL, aChar1, aChar2);
+  else 
+    c1 = this->GetClass(aChar1); 
+  // Same for aChar2: aChar1 is its preceding character, the following one is U_NULL.
+  if(NEED_CONTEXTUAL_ANALYSIS(aChar2))
+    c2 = this->ContextualAnalysis(aChar1, aChar2, U_NULL); 
+  else 
+    c2 = this->GetClass(aChar2); 
+  // NOTE(review): presumably GetPair() is true when a break is allowed -- confirm.
+  return GetPair(c1, c2);
+}
+

 PRBool nsJISx4051LineBreaker::BreakInBetween(
  const PRUnichar* aText1 , PRUint32 aTextLen1,
@ -408,34 +440,9 @@ PRBool nsJISx4051LineBreaker::BreakInBetween(
     return PR_FALSE;
  }

-  //search for CJK characters until a space is found. 
-  //if CJK char is found before space, use 4051, otherwise western
-  PRInt32 cur;
-
-  for (cur= aTextLen1-1; cur>=0; cur--)
-  {
-    if (IS_SPACE(aText1[cur]))
-      break;
-    if (IS_CJK_CHAR(aText1[cur]))
-      goto ROUTE_CJK_BETWEEN;
-  }
-
-  for (cur= 0; cur < (PRInt32)aTextLen2; cur++)
-  {
-    if (IS_SPACE(aText2[cur]))
-      break;
-    if (IS_CJK_CHAR(aText2[cur]))
-      goto ROUTE_CJK_BETWEEN;
-  }
-
-  //now apply western rule.
-  return IS_SPACE(aText1[aTextLen1-1]) || IS_SPACE(aText2[0]);
-
-ROUTE_CJK_BETWEEN:
-
  PRInt8 c1, c2;
  if(NEED_CONTEXTUAL_ANALYSIS(aText1[aTextLen1-1]))
-    c1 = this->ContextualAnalysis((aTextLen1>1)?aText1[aTextLen1-2]:0,
+    c1 = this->ContextualAnalysis((aTextLen1>1)?aText1[aTextLen1-2]:U_NULL,
                                  aText1[aTextLen1-1],
                                  aText2[0]);
  else 
@ -444,7 +451,7 @@ ROUTE_CJK_BETWEEN:
  if(NEED_CONTEXTUAL_ANALYSIS(aText2[0]))
    c2 = this->ContextualAnalysis(aText1[aTextLen1-1],
                                  aText2[0],
-                                  (aTextLen2>1)?aText2[1]:0);
+                                  (aTextLen2>1)?aText2[1]:U_NULL);
  else 
    c2 = this->GetClass(aText2[0]);

@ -466,26 +473,13 @@ PRInt32 nsJISx4051LineBreaker::Next(
  NS_ASSERTION(aText, "aText shouldn't be null");
  NS_ASSERTION(aLen > aPos, "Illegal value (length > position)");

-  //forward check for CJK characters until a space is found. 
-  //if CJK char is found before space, use 4051, otherwise western
-  PRUint32 cur;
-  for (cur = aPos; cur < aLen; ++cur)
-  {
-    if (IS_SPACE(aText[cur]))
-      return cur;
-    if (IS_CJK_CHAR(aText[cur]))
-      goto ROUTE_CJK_NEXT;
-  }
-  return NS_LINEBREAKER_NEED_MORE_TEXT; // Need more text
-
-ROUTE_CJK_NEXT:
  PRInt8 c1, c2;
-  cur = aPos;
+  PRUint32 cur = aPos;
  if(NEED_CONTEXTUAL_ANALYSIS(aText[cur]))
  {
-    c1 = this->ContextualAnalysis((cur>0)?aText[cur-1]:0,
+    c1 = this->ContextualAnalysis((cur>0)?aText[cur-1]:U_NULL,
                                  aText[cur],
-                                  (cur<(aLen-1)) ?aText[cur+1]:0);
+                                  (cur<(aLen-1)) ?aText[cur+1]:U_NULL);
  } else  {
    c1 = this->GetClass(aText[cur]);
  }
@ -497,9 +491,9 @@ ROUTE_CJK_NEXT:
  {
     if(NEED_CONTEXTUAL_ANALYSIS(aText[cur]))
     {
-       c2= this->ContextualAnalysis((cur>0)?aText[cur-1]:0,
+       c2= this->ContextualAnalysis((cur>0)?aText[cur-1]:U_NULL,
                                  aText[cur],
-                                  (cur<(aLen-1)) ?aText[cur+1]:0);
+                                  (cur<(aLen-1)) ?aText[cur+1]:U_NULL);
     } else {
       c2 = this->GetClass(aText[cur]);
     }
@ -517,31 +511,13 @@ PRInt32 nsJISx4051LineBreaker::Prev(
 {
  NS_ASSERTION(aText, "aText shouldn't be null");

-  //backward check for CJK characters until a space is found. 
-  //if CJK char is found before space, use 4051, otherwise western
-  PRUint32 cur;
-  for (cur = aPos - 1; cur > 0; --cur)
-  {
-    if (IS_SPACE(aText[cur]))
-    {
-      if (cur != aPos - 1) // XXXldb Why?
-        ++cur;
-      return cur;
-    }
-    if (IS_CJK_CHAR(aText[cur]))
-      goto ROUTE_CJK_PREV;
-  }
-
-  return NS_LINEBREAKER_NEED_MORE_TEXT; // Need more text
-
-ROUTE_CJK_PREV:
-  cur = aPos;
+  PRUint32 cur = aPos;
  PRInt8 c1, c2;
  if(NEED_CONTEXTUAL_ANALYSIS(aText[cur-1]))
  {
-    c2 = this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:0,
+    c2 = this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:U_NULL,
                                  aText[cur-1],
-                                  (cur<aLen) ?aText[cur]:0);
+                                  (cur<aLen) ?aText[cur]:U_NULL);
  } else  {
    c2 = this->GetClass(aText[cur-1]);
  }
@ -553,9 +529,9 @@ ROUTE_CJK_PREV:
  {
     if(NEED_CONTEXTUAL_ANALYSIS(aText[cur-1]))
     {
-       c1= this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:0,
+       c1= this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:U_NULL,
                                  aText[cur-1],
-                                  (cur<aLen) ?aText[cur]:0);
+                                  (cur<aLen) ?aText[cur]:U_NULL);
     } else {
       c1 = this->GetClass(aText[cur-1]);
     }
--- a/intl/lwbrk/src/nsJISx4501LineBreaker.h
+++ b/intl/lwbrk/src/nsJISx4501LineBreaker.h
@ -48,6 +48,9 @@ public:
  nsJISx4051LineBreaker();
  virtual ~nsJISx4051LineBreaker();

+  PRBool CanBreakBetweenLatin1(PRUnichar aChar1,
+                               PRUnichar aChar2);
+
  PRBool BreakInBetween( const PRUnichar* aText1 , PRUint32 aTextLen1,
                         const PRUnichar* aText2 , PRUint32 aTextLen2);

--- a/intl/lwbrk/tools/jisx4501class.txt
+++ b/intl/lwbrk/tools/jisx4501class.txt
@ -1,4 +1,5 @@
 0028;;1
+002F;;2
 005B;;1
 007B;;1
 2018;;1
--- a/layout/generic/nsTextTransformer.cpp
+++ b/layout/generic/nsTextTransformer.cpp
@ -348,8 +348,11 @@ nsTextTransformer::ScanNormalAsciiText_F(PRInt32* aWordLen,
    bp2 += mBufferPos;
  }

+  PRUnichar prevCh;
+  PRUnichar ch = 0;
  for (; offset < fragLen; offset++) {
-    unsigned char ch = *cp++;
+    prevCh = (ch == ' ') ? CH_NBSP : ch;
+    ch = *cp++;
    if (XP_IS_SPACE(ch)) {
      break;
    }
@ -357,6 +360,10 @@ nsTextTransformer::ScanNormalAsciiText_F(PRInt32* aWordLen,
      ch = ' ';
      *aWasTransformed = PR_TRUE;
    }
+    else if (offset != mOffset &&
+             nsContentUtils::LineBreaker()->CanBreakBetweenLatin1(prevCh, ch)) {
+      break;
+    }
    else if (IS_DISCARDED(ch)) {
      // Strip discarded characters from the transformed output
      continue;