From 4ca1c88ab76c1469518d6583f0e53880698a6e94 Mon Sep 17 00:00:00 2001
From: "masayuki%d-toybox.com" <masayuki%d-toybox.com>
Date: Thu, 13 Jul 2006 17:42:39 +0000
Subject: [PATCH] Bug 255990 Characters below U+0100 are not subject to
 line-breaking rules at all. This patch is based on jshin's patch. r=jshin, sr=roc

---
 intl/lwbrk/public/nsILineBreaker.h       |  10 +-
 intl/lwbrk/src/jisx4501class.h           |   2 +-
 intl/lwbrk/src/nsJISx4501LineBreaker.cpp | 130 +++++++++--------------
 intl/lwbrk/src/nsJISx4501LineBreaker.h   |   3 +
 intl/lwbrk/tools/jisx4501class.txt       |   1 +
 layout/generic/nsTextTransformer.cpp     |   9 +-
 6 files changed, 73 insertions(+), 82 deletions(-)

diff --git a/intl/lwbrk/public/nsILineBreaker.h b/intl/lwbrk/public/nsILineBreaker.h
index 85078823ae3..2c1a8658f43 100644
--- a/intl/lwbrk/public/nsILineBreaker.h
+++ b/intl/lwbrk/public/nsILineBreaker.h
@@ -43,10 +43,10 @@
 
 #define NS_LINEBREAKER_NEED_MORE_TEXT -1
 
-// {E86B3375-BF89-11d2-B3AF-00805F8A6670}
+// {7509772F-770C-44e8-AAFA-8032E5A35370}
 #define NS_ILINEBREAKER_IID \
-{ 0xe86b3375, 0xbf89, 0x11d2, \
-    { 0xb3, 0xaf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } }
+{ 0x7509772f, 0x770c, 0x44e8, \
+    { 0xaa, 0xfa, 0x80, 0x32, 0xe5, 0xa3, 0x53, 0x70 } }
 
 
 class nsILineBreaker : public nsISupports
@@ -57,6 +57,10 @@ public:
                                  const PRUnichar* aText2 , 
                                  PRUint32 aTextLen2) = 0;
 
+  virtual PRBool CanBreakBetweenLatin1(PRUnichar aChar1,
+                                       PRUnichar aChar2) = 0; 
+
+
   virtual PRInt32 Next( const PRUnichar* aText, PRUint32 aLen, 
                         PRUint32 aPos) = 0;
 
diff --git a/intl/lwbrk/src/jisx4501class.h b/intl/lwbrk/src/jisx4501class.h
index 470671d9855..a82b098d18e 100644
--- a/intl/lwbrk/src/jisx4501class.h
+++ b/intl/lwbrk/src/jisx4501class.h
@@ -44,7 +44,7 @@ static const PRUint32 gLBClass00[32] = {
 0x55555555, // U+0010 - U+0017
 0x55555555, // U+0018 - U+001F
 0x88438815, // U+0020 - U+0027
-0x81515810, // U+0028 - U+002F
+0x11515810, // U+0028 - U+002F
 0x66666666, // U+0030 - U+0037
 0x11501166, // U+0038 - U+003F
 0x88888888, // U+0040 - U+0047
diff --git a/intl/lwbrk/src/nsJISx4501LineBreaker.cpp b/intl/lwbrk/src/nsJISx4501LineBreaker.cpp
index 15cf6505d10..e82fd552663 100644
--- a/intl/lwbrk/src/nsJISx4501LineBreaker.cpp
+++ b/intl/lwbrk/src/nsJISx4501LineBreaker.cpp
@@ -350,12 +350,19 @@ nsJISx4051LineBreaker::~nsJISx4051LineBreaker()
 
 NS_IMPL_ISUPPORTS1(nsJISx4051LineBreaker, nsILineBreaker)
 
-#define U_PERIOD ((PRUnichar) '.')
-#define U_COMMA ((PRUnichar) ',')
-#define U_SPACE ((PRUnichar) ' ')
-#define U_RIGHT_SINGLE_QUOTATION_MARK ((PRUnichar) 0x2019)
+#define U_PERIOD    PRUnichar('.')
+#define U_COMMA     PRUnichar(',')
+#define U_COLON     PRUnichar(':')
+#define U_SEMICOLON PRUnichar(';')
+#define U_SLASH     PRUnichar('/')
+#define U_SPACE     PRUnichar(' ')
+#define U_NULL      PRUnichar(0x0000)
+#define U_RIGHT_SINGLE_QUOTATION_MARK PRUnichar(0x2019)
 #define NEED_CONTEXTUAL_ANALYSIS(c) ((c) == U_PERIOD || \
                                      (c) == U_COMMA || \
+                                     (c) == U_COLON || \
+                                     (c) == U_SEMICOLON || \
+                                     (c) == U_SLASH || \
                                      (c) == U_RIGHT_SINGLE_QUOTATION_MARK)
 #define NUMERIC_CLASS  6 // JIS x4051 class 15 is now map to simplified class 6
 #define CHARACTER_CLASS  8 // JIS x4051 class 18 is now map to simplified class 8
@@ -365,17 +372,17 @@ PRInt8  nsJISx4051LineBreaker::ContextualAnalysis(
   PRUnichar prev, PRUnichar cur, PRUnichar next
 )
 {
-   if(U_COMMA == cur)
+   if(U_COMMA == cur || U_COLON == cur || U_SEMICOLON == cur)
    {
-     if(IS_ASCII_DIGIT (prev) && IS_ASCII_DIGIT (next))
+     if((IS_ASCII_DIGIT(prev) || prev == U_NULL) && IS_ASCII_DIGIT(next))
        return NUMERIC_CLASS;
    }
    else if(U_PERIOD == cur)
    {
-     if((IS_ASCII_DIGIT (prev) || (0x0020 == prev)) && 
-         IS_ASCII_DIGIT (next))
+     if((IS_ASCII_DIGIT(prev) || prev == U_SPACE || prev == U_NULL) &&
+        IS_ASCII_DIGIT(next))
        return NUMERIC_CLASS;
- 
+
      // By assigning a full stop  character class only when it's followed by
      // class 6 (numeric), 7, and 8 (character). Note that class 9 (Thai) 
      // doesn't matter, either way, we prevent lines from breaking around 
@@ -387,6 +394,12 @@ PRInt8  nsJISx4051LineBreaker::ContextualAnalysis(
      if((pc > 5 || pc == 0)  && GetClass(next) > 5)
        return CHARACTER_CLASS;
    }
+   else if(U_SLASH == cur)
+   {
+     // We don't need to check the prev character, because a break can occur only after SLASH.
+     if (IS_ASCII_DIGIT(next))
+       return NUMERIC_CLASS;
+   }
    else if(U_RIGHT_SINGLE_QUOTATION_MARK == cur)
    {
      // somehow people use this as ' in "it's" sometimes...
@@ -396,6 +409,25 @@ PRInt8  nsJISx4051LineBreaker::ContextualAnalysis(
    return this->GetClass(cur);
 }
 
+PRBool nsJISx4051LineBreaker::CanBreakBetweenLatin1(PRUnichar aChar1,
+                                                    PRUnichar aChar2)
+{ // Classifies two adjacent characters (both expected to be < 256) and returns the GetPair() result.
+  NS_ASSERTION(aChar1 < 256 && aChar2 < 256, "invalid input");
+  // Class of aChar1: only two characters are supplied, so U_NULL stands in for its prev.
+  PRInt8 c1, c2;
+  if(NEED_CONTEXTUAL_ANALYSIS(aChar1))
+    c1 = this->ContextualAnalysis(U_NULL, aChar1, aChar2);
+  else 
+    c1 = this->GetClass(aChar1); 
+  // Class of aChar2: aChar1 is its prev, and U_NULL stands in for the unknown next character.
+  if(NEED_CONTEXTUAL_ANALYSIS(aChar2))
+    c2 = this->ContextualAnalysis(aChar1, aChar2, U_NULL); 
+  else 
+    c2 = this->GetClass(aChar2); 
+  // NOTE(review): GetPair is defined outside this hunk; presumably non-zero means a break is allowed - confirm.
+  return GetPair(c1, c2);
+}
+
 
 PRBool nsJISx4051LineBreaker::BreakInBetween(
   const PRUnichar* aText1 , PRUint32 aTextLen1,
@@ -408,34 +440,9 @@ PRBool nsJISx4051LineBreaker::BreakInBetween(
      return PR_FALSE;
   }
 
-  //search for CJK characters until a space is found. 
-  //if CJK char is found before space, use 4051, otherwise western
-  PRInt32 cur;
-
-  for (cur= aTextLen1-1; cur>=0; cur--)
-  {
-    if (IS_SPACE(aText1[cur]))
-      break;
-    if (IS_CJK_CHAR(aText1[cur]))
-      goto ROUTE_CJK_BETWEEN;
-  }
-
-  for (cur= 0; cur < (PRInt32)aTextLen2; cur++)
-  {
-    if (IS_SPACE(aText2[cur]))
-      break;
-    if (IS_CJK_CHAR(aText2[cur]))
-      goto ROUTE_CJK_BETWEEN;
-  }
-
-  //now apply western rule.
-  return IS_SPACE(aText1[aTextLen1-1]) || IS_SPACE(aText2[0]);
-
-ROUTE_CJK_BETWEEN:
-
   PRInt8 c1, c2;
   if(NEED_CONTEXTUAL_ANALYSIS(aText1[aTextLen1-1]))
-    c1 = this->ContextualAnalysis((aTextLen1>1)?aText1[aTextLen1-2]:0,
+    c1 = this->ContextualAnalysis((aTextLen1>1)?aText1[aTextLen1-2]:U_NULL,
                                   aText1[aTextLen1-1],
                                   aText2[0]);
   else 
@@ -444,7 +451,7 @@ ROUTE_CJK_BETWEEN:
   if(NEED_CONTEXTUAL_ANALYSIS(aText2[0]))
     c2 = this->ContextualAnalysis(aText1[aTextLen1-1],
                                   aText2[0],
-                                  (aTextLen2>1)?aText2[1]:0);
+                                  (aTextLen2>1)?aText2[1]:U_NULL);
   else 
     c2 = this->GetClass(aText2[0]);
 
@@ -466,26 +473,13 @@ PRInt32 nsJISx4051LineBreaker::Next(
   NS_ASSERTION(aText, "aText shouldn't be null");
   NS_ASSERTION(aLen > aPos, "Illegal value (length > position)");
 
-  //forward check for CJK characters until a space is found. 
-  //if CJK char is found before space, use 4051, otherwise western
-  PRUint32 cur;
-  for (cur = aPos; cur < aLen; ++cur)
-  {
-    if (IS_SPACE(aText[cur]))
-      return cur;
-    if (IS_CJK_CHAR(aText[cur]))
-      goto ROUTE_CJK_NEXT;
-  }
-  return NS_LINEBREAKER_NEED_MORE_TEXT; // Need more text
-
-ROUTE_CJK_NEXT:
   PRInt8 c1, c2;
-  cur = aPos;
+  PRUint32 cur = aPos;
   if(NEED_CONTEXTUAL_ANALYSIS(aText[cur]))
   {
-    c1 = this->ContextualAnalysis((cur>0)?aText[cur-1]:0,
+    c1 = this->ContextualAnalysis((cur>0)?aText[cur-1]:U_NULL,
                                   aText[cur],
-                                  (cur<(aLen-1)) ?aText[cur+1]:0);
+                                  (cur<(aLen-1)) ?aText[cur+1]:U_NULL);
   } else  {
     c1 = this->GetClass(aText[cur]);
   }
@@ -497,9 +491,9 @@ ROUTE_CJK_NEXT:
   {
      if(NEED_CONTEXTUAL_ANALYSIS(aText[cur]))
      {
-       c2= this->ContextualAnalysis((cur>0)?aText[cur-1]:0,
+       c2= this->ContextualAnalysis((cur>0)?aText[cur-1]:U_NULL,
                                   aText[cur],
-                                  (cur<(aLen-1)) ?aText[cur+1]:0);
+                                  (cur<(aLen-1)) ?aText[cur+1]:U_NULL);
      } else {
        c2 = this->GetClass(aText[cur]);
      }
@@ -517,31 +511,13 @@ PRInt32 nsJISx4051LineBreaker::Prev(
 {
   NS_ASSERTION(aText, "aText shouldn't be null");
 
-  //backward check for CJK characters until a space is found. 
-  //if CJK char is found before space, use 4051, otherwise western
-  PRUint32 cur;
-  for (cur = aPos - 1; cur > 0; --cur)
-  {
-    if (IS_SPACE(aText[cur]))
-    {
-      if (cur != aPos - 1) // XXXldb Why?
-        ++cur;
-      return cur;
-    }
-    if (IS_CJK_CHAR(aText[cur]))
-      goto ROUTE_CJK_PREV;
-  }
-
-  return NS_LINEBREAKER_NEED_MORE_TEXT; // Need more text
-
-ROUTE_CJK_PREV:
-  cur = aPos;
+  PRUint32 cur = aPos;
   PRInt8 c1, c2;
   if(NEED_CONTEXTUAL_ANALYSIS(aText[cur-1]))
   {
-    c2 = this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:0,
+    c2 = this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:U_NULL,
                                   aText[cur-1],
-                                  (cur<aLen) ?aText[cur]:0);
+                                  (cur<aLen) ?aText[cur]:U_NULL);
   } else  {
     c2 = this->GetClass(aText[cur-1]);
   }
@@ -553,9 +529,9 @@ ROUTE_CJK_PREV:
   {
      if(NEED_CONTEXTUAL_ANALYSIS(aText[cur-1]))
      {
-       c1= this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:0,
+       c1= this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:U_NULL,
                                   aText[cur-1],
-                                  (cur<aLen) ?aText[cur]:0);
+                                  (cur<aLen) ?aText[cur]:U_NULL);
      } else {
        c1 = this->GetClass(aText[cur-1]);
      }
diff --git a/intl/lwbrk/src/nsJISx4501LineBreaker.h b/intl/lwbrk/src/nsJISx4501LineBreaker.h
index 6ad374f200e..b1a58d5c550 100644
--- a/intl/lwbrk/src/nsJISx4501LineBreaker.h
+++ b/intl/lwbrk/src/nsJISx4501LineBreaker.h
@@ -48,6 +48,9 @@ public:
   nsJISx4051LineBreaker();
   virtual ~nsJISx4051LineBreaker();
 
+  PRBool CanBreakBetweenLatin1(PRUnichar aChar1,
+                               PRUnichar aChar2);
+
   PRBool BreakInBetween( const PRUnichar* aText1 , PRUint32 aTextLen1,
                          const PRUnichar* aText2 , PRUint32 aTextLen2);
 
diff --git a/intl/lwbrk/tools/jisx4501class.txt b/intl/lwbrk/tools/jisx4501class.txt
index 3a7125cea8a..c94d0d9a955 100644
--- a/intl/lwbrk/tools/jisx4501class.txt
+++ b/intl/lwbrk/tools/jisx4501class.txt
@@ -1,4 +1,5 @@
 0028;;1
+002F;;2
 005B;;1
 007B;;1
 2018;;1
diff --git a/layout/generic/nsTextTransformer.cpp b/layout/generic/nsTextTransformer.cpp
index fc17f13b81f..07ef3cb1f8d 100644
--- a/layout/generic/nsTextTransformer.cpp
+++ b/layout/generic/nsTextTransformer.cpp
@@ -348,8 +348,11 @@ nsTextTransformer::ScanNormalAsciiText_F(PRInt32* aWordLen,
     bp2 += mBufferPos;
   }
 
+  PRUnichar prevCh;
+  PRUnichar ch = 0;
   for (; offset < fragLen; offset++) {
-    unsigned char ch = *cp++;
+    prevCh = (ch == ' ') ? CH_NBSP : ch;
+    ch = *cp++;
     if (XP_IS_SPACE(ch)) {
       break;
     }
@@ -357,6 +360,10 @@ nsTextTransformer::ScanNormalAsciiText_F(PRInt32* aWordLen,
       ch = ' ';
       *aWasTransformed = PR_TRUE;
     }
+    else if (offset != mOffset &&
+             nsContentUtils::LineBreaker()->CanBreakBetweenLatin1(prevCh, ch)) {
+      break;
+    }
     else if (IS_DISCARDED(ch)) {
       // Strip discarded characters from the transformed output
       continue;