Bug 494099, HZ-GB-2312 converter reads beyond input buffer and omits characters at block boundaries. r=VYV03354@nifty.ne.jp

2009-06-16 00:13:28 -07:00 · 2009-06-16 00:13:28 -07:00 · 306b8cb3e6
--- a/intl/uconv/tests/unit/test_bug90411.js
+++ b/intl/uconv/tests/unit/test_bug90411.js
@ -0,0 +1,67 @@
+/* Test case for bug 90411
+ *
+ * Uses nsIConverterInputStream to decode GB_HK test.
+ *
+ * Sample text is: 
+ * 问他谁是傻瓜了5分钟。但是，他谁不要求仍然是一个傻瓜永远
+ * 我听见 我忘记; 我看见 我记住; 我做 我了解。
+ */
+
+const sample = "~{NJK{K-JGI59OAK~}5~{7VVS!#5+JG#,K{K-2;R*GsHTH;JGR;8vI59OS@T6!#~} ~{NRL}<{~} ~{NRM|<G~}; ~{NR?4<{~} ~{NR<GW!~}; ~{NRWv~} ~{NRAK=b!#~}";
+
+const expected = "\u95EE\u4ED6\u8C01\u662F\u50BB\u74DC\u4E865\u5206\u949F\u3002\u4F46\u662F\uFF0C\u4ED6\u8C01\u4E0D\u8981\u6C42\u4ECD\u7136\u662F\u4E00\u4E2A\u50BB\u74DC\u6C38\u8FDC\u3002 \u6211\u542C\u89C1 \u6211\u5FD8\u8BB0; \u6211\u770B\u89C1 \u6211\u8BB0\u4F4F; \u6211\u505A \u6211\u4E86\u89E3\u3002"; 
+
+const charset="HZ-GB-2312";
+
+// Decodes `sample` through an nsIConverterInputStream created with the given
+// bufferLength and checks that the decoded text equals `expected`.
+function testCase(bufferLength)
+{
+  var dataURI = "data:text/plain;charset=" + charset + "," + sample;
+
+  var IOService = Components.Constructor("@mozilla.org/network/io-service;1",
+                                         "nsIIOService");
+  var ConverterInputStream =
+      Components.Constructor("@mozilla.org/intl/converter-input-stream;1",
+                             "nsIConverterInputStream", "init");
+
+  var ios = new IOService();
+  var channel = ios.newChannel(dataURI, "", null);
+  var testInputStream = channel.open();
+  var testConverter = new ConverterInputStream(testInputStream, charset,
+                                               bufferLength, 0xFFFD);
+
+  if (!(testConverter instanceof
+        Components.interfaces.nsIUnicharLineInputStream))
+      throw "not line input stream";
+
+  var outStr = "";
+  var more;
+  do {
+      // read one line into line.value; `more` turns false at end of input
+      var line = {};
+      more = testConverter.readLine(line);
+      outStr += line.value;
+  } while (more);
+
+  if (outStr != expected) {
+    dump("Failed with bufferLength = " + bufferLength + "\n");
+    if (outStr.length == expected.length) {
+      // declare the index locally so it does not leak as an implicit global
+      for (var i = 0; i < outStr.length; ++i) {
+        if (outStr.charCodeAt(i) != expected.charCodeAt(i)) {
+          dump(i + ": " + outStr.charCodeAt(i).toString(16) + " != " + expected.charCodeAt(i).toString(16) + "\n");
+        }
+      }
+    }
+  }
+
+  // escape the strings before comparing for better readability
+  do_check_eq(escape(outStr), escape(expected));
+}
+
+function run_test()
+{
+  // run the same decode check with two adjacent converter buffer lengths
+  [32, 33].forEach(function (len) { testCase(len); });
+}
--- a/intl/uconv/ucvcn/nsHZToUnicode.cpp
+++ b/intl/uconv/ucvcn/nsHZToUnicode.cpp
@ -68,19 +68,23 @@
 //----------------------------------------------------------------------
 // Subclassing of nsTablesDecoderSupport class [implementation]

-#define HZ_STATE_GB		1
-#define HZ_STATE_ASCII	2
-#define HZ_STATE_TILD	3
+#define HZ_STATE_GB     1
+#define HZ_STATE_ASCII  2
+#define HZ_STATE_ODD_BYTE_FLAG 0x80
 #define HZLEAD1 '~'
 #define HZLEAD2 '{'
 #define HZLEAD3 '}'
 #define HZLEAD4 '\n'
+#define HZ_ODD_BYTE_STATE (mHZState & (HZ_STATE_ODD_BYTE_FLAG))
+#define HZ_ENCODING_STATE (mHZState & ~(HZ_STATE_ODD_BYTE_FLAG))

 nsHZToUnicode::nsHZToUnicode() : nsBufferDecoderSupport(1)
 {
-  mHZState = HZ_STATE_ASCII;	// per HZ spec, default to ASCII state 
+  mHZState = HZ_STATE_ASCII;    // per HZ spec, default to ASCII state 
  mRunLength = 0;
+  mOddByte = 0;
 }
+
 //Overwriting the ConvertNoBuff() in nsUCvCnSupport.cpp.
 NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
  const char* aSrc, 
@ -91,114 +95,98 @@ NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
  PRInt32 i=0;
  PRInt32 iSrcLength = *aSrcLength;
  PRInt32 iDestlen = 0;
-  PRUint8 ch1, ch2;
-  nsresult res = NS_OK;
  *aSrcLength=0;
-  for (i=0;i<iSrcLength;i++)
-  {
-    if ( iDestlen >= (*aDestLength) )
-    {
+  nsresult res = NS_OK;
+  char oddByte = mOddByte;
+
+  for (i=0; i<iSrcLength; i++) {
+    if (iDestlen >= (*aDestLength)) {
      res = NS_OK_UDEC_MOREOUTPUT;
      break;
    }
-    if ( *aSrc & 0x80 ) // if it is a 8-bit byte
-    {
-      if (UINT8_IN_RANGE(0x81, aSrc[0], 0xFE) &&
-          UINT8_IN_RANGE(0x40, aSrc[1], 0xFE)) {
-        // The source is a 8-bit GBCode
-        *aDest = mUtil.GBKCharToUnicode(aSrc[0], aSrc[1]);
+
+    char srcByte = *aSrc++;
+    (*aSrcLength)++;
+    if (!HZ_ODD_BYTE_STATE) {
+      if (srcByte & 0x80 || srcByte == HZLEAD1 || HZ_ENCODING_STATE == HZ_STATE_GB) { 
+        oddByte = srcByte;
+        mHZState |= HZ_STATE_ODD_BYTE_FLAG;
      } else {
-        *aDest = UCS2_NO_MAPPING;
+        *aDest++ = CAST_CHAR_TO_UNICHAR(srcByte);
+        iDestlen++;
      }
-      aSrc += 2;
-      i++;
-      iDestlen++;
-      aDest++;
-      *aSrcLength = i+1;
-      continue;
-    }
-    // otherwise, it is a 7-bit byte 
-    // The source will be an ASCII or a 7-bit HZ code depending on ch1
-    ch1 = *aSrc;
-    ch2	= *(aSrc+1);
-    if (ch1 == HZLEAD1 )  // if it is lead by '~'
-    {
-      switch (ch2)
-      {
-        case HZLEAD2: 
-          // we got a '~{'
-          // we are switching to HZ state
-          mHZState = HZ_STATE_GB;
-          mRunLength = 0;
-          aSrc += 2;
-          i++;
-          break;
-        case HZLEAD3: 
-          // we got a '~}'
-          // we are switching to ASCII state
-          mHZState = HZ_STATE_ASCII;
-          aSrc += 2;
-          i++;
-          if (mRunLength == 0) {
-            *aDest = UCS2_NO_MAPPING;
+    } else {
+      if (oddByte & 0x80) { // if it is a 8-bit byte
+        if (UINT8_IN_RANGE(0x81, oddByte, 0xFE) &&
+            UINT8_IN_RANGE(0x40, srcByte, 0xFE)) {
+          // The source is a 8-bit GBCode
+          *aDest++ = mUtil.GBKCharToUnicode(oddByte, srcByte);
+        } else {
+          *aDest++ = UCS2_NO_MAPPING;
+        }
+        iDestlen++;
+      // otherwise, it is a 7-bit byte 
+      // The source will be an ASCII or a 7-bit HZ code depending on oddByte
+      } else if (oddByte == HZLEAD1) { // if it is lead by '~'
+        switch (srcByte) {
+          case HZLEAD2: 
+            // we got a '~{'
+            // we are switching to HZ state
+            mHZState = HZ_STATE_GB | HZ_ODD_BYTE_STATE;
+            mRunLength = 0;
+            break;
+
+          case HZLEAD3: 
+            // we got a '~}'
+            // we are switching to ASCII state
+            mHZState = HZ_STATE_ASCII | HZ_ODD_BYTE_STATE;
+            if (mRunLength == 0) {
+              *aDest++ = UCS2_NO_MAPPING;
+              iDestlen++;
+            }
+            mRunLength = 0;
+            break;
+
+          case HZLEAD1: 
+            // we got a '~~', process like an ASCII, but no state change
+            *aDest++ = CAST_CHAR_TO_UNICHAR(srcByte);
            iDestlen++;
-            aDest++;
-          }
-          mRunLength = 0;
-          break;
-        case HZLEAD1: 
-          // we got a '~~', process like an ASCII, but no state change
-          aSrc++;
-          *aDest = CAST_CHAR_TO_UNICHAR(*aSrc);
-          aSrc++;
-          i++;
-          iDestlen++;
-          aDest++;
-          mRunLength++;
-          break;
-        case HZLEAD4:	
-          // we got a "~\n", it means maintain double byte mode cross lines, ignore the '~' itself
-          //  mHZState = HZ_STATE_GB; 
-          // I find that "~\n" should interpreted as line continuation without mode change
-          // It should not be interpreted as line continuation with double byte mode on
-          aSrc++;
-          break;
-        default:
-          // undefined ESC sequence '~X' are ignored since this is a illegal combination 
-          aSrc += 2;
-          *aDest = UCS2_NO_MAPPING;
-          iDestlen++;
-          aDest++;
-          break;
-      };
-      continue;// go for next loop
-    }
-    // ch1 != '~'
-    switch (mHZState)
-    {
-      case HZ_STATE_GB:
-        // the following chars are HZ
-        *aDest = mUtil.GBKCharToUnicode(aSrc[0]|0x80, aSrc[1]|0x80);
-        aSrc += 2;
-        i++;
-        iDestlen++;
-        aDest++;
+            mRunLength++;
+            break;
+
+          case HZLEAD4:   
+            // we got a "~\n", it means maintain double byte mode across lines,
+            // ignore the '~' itself
+            //  mHZState = HZ_STATE_GB; 
+            // I find that "~\n" should be interpreted as line continuation
+            // without mode change
+            // It should not be interpreted as line continuation with double
+            // byte mode on
+            break;
+
+          default:
+            // an undefined ESC sequence '~X' is an illegal combination, so
+            // it is replaced with UCS2_NO_MAPPING
+            *aDest++ = UCS2_NO_MAPPING;
+            iDestlen++;
+            break;
+        }
+      } else if (HZ_ENCODING_STATE == HZ_STATE_GB) {
+        *aDest++ = mUtil.GBKCharToUnicode(oddByte|0x80, srcByte|0x80);
        mRunLength++;
-        break;
-      case HZ_STATE_ASCII:
-      default:
-        // default behavior also like an ASCII
-        // when the source is an ASCII
-        *aDest = CAST_CHAR_TO_UNICHAR(*aSrc);
-        aSrc++;
        iDestlen++;
-        aDest++;
-        break;
+      } else {
+        NS_NOTREACHED("2-byte sequence that we don't know how to handle");
+        *aDest++ = UCS2_NO_MAPPING;
+        iDestlen++;
+      }
+      oddByte = 0;
+      mHZState &= ~HZ_STATE_ODD_BYTE_FLAG;
    }
-    *aSrcLength = i+1;
-  }// for loop
+  } // for loop
+  mOddByte = HZ_ODD_BYTE_STATE ? oddByte : 0;
  *aDestLength = iDestlen;
-  return NS_OK;
+  return res;
 }


--- a/intl/uconv/ucvcn/nsHZToUnicode.h
+++ b/intl/uconv/ucvcn/nsHZToUnicode.h
@ -71,6 +71,7 @@ protected:
 private:
  PRInt16 mHZState;
  PRUint32 mRunLength; // length of a run of 8-bit GB-encoded characters
+  char mOddByte; // first byte of a multi-byte sequence from a previous buffer

 };