Don't split a surrogate pair over two output buffers. Bug 600974, r=emk, a=blocking

2010-10-20 09:11:16 -07:00 · 2010-10-20 09:11:16 -07:00 · 3c47aaf992
--- a/intl/uconv/ucvcn/nsGBKToUnicode.cpp
+++ b/intl/uconv/ucvcn/nsGBKToUnicode.cpp
@@ -197,8 +197,7 @@ NS_IMETHODIMP nsGBKToUnicode::ConvertNoBuff(const char* aSrc,
               *aDest = UCS2_NO_MAPPING;
           } else {
              // let's try supplement mapping
-             NS_ASSERTION(( (iDestlen+1) <= (*aDestLength) ), "no enouth output memory");
-             if ( (iDestlen+1) <= (*aDestLength) )
+             if ( (iDestlen+1) < (*aDestLength) )
             {
               if(DecodeToSurrogate(aSrc, aDest))
               {
@@ -209,7 +208,13 @@ NS_IMETHODIMP nsGBKToUnicode::ConvertNoBuff(const char* aSrc,
                 *aDest = UCS2_NO_MAPPING;
              }
             } else {
-               *aDest = UCS2_NO_MAPPING;
+               if (*aDestLength < 2) {
+                 NS_ERROR("insufficient space in output buffer");
+                 *aDest = UCS2_NO_MAPPING;
+               } else {
+                 rv = NS_OK_UDEC_MOREOUTPUT;
+                 break;
+               }
             }
           }
        } else {
--- a/intl/uconv/ucvlatin/nsUCS2BEToUnicode.cpp
+++ b/intl/uconv/ucvlatin/nsUCS2BEToUnicode.cpp
@@ -42,14 +42,16 @@
 #include <string.h>
 #include "prtypes.h"

-#define STATE_NORMAL          0
-#define STATE_HALF_CODE_POINT 1
-#define STATE_FIRST_CALL      2
-#define STATE_FOUND_BOM       3
+#define STATE_NORMAL             0
+#define STATE_HALF_CODE_POINT    1
+#define STATE_FIRST_CALL         2
+#define STATE_FOUND_BOM          3
+#define STATE_ODD_SURROGATE_PAIR 4

 static nsresult
 UTF16ConvertToUnicode(PRUint8& aState, PRUint8& aOddByte,
-                      PRUnichar& aOddHighSurrogate, const char * aSrc,
+                      PRUnichar& aOddHighSurrogate, PRUnichar& aOddLowSurrogate,
+                      const char * aSrc,
                      PRInt32 * aSrcLength, PRUnichar * aDest,
                      PRInt32 * aDestLength,
                      PRBool aSwapBytes)
@@ -59,32 +61,51 @@ UTF16ConvertToUnicode(PRUint8& aState, PRUint8& aOddByte,
  PRUnichar* dest = aDest;
  PRUnichar* destEnd = aDest + *aDestLength;

-  if(STATE_FOUND_BOM == aState) // caller found a BOM
-  {
-    if (*aSrcLength < 2)
-      return NS_ERROR_ILLEGAL_INPUT;
-    src+=2;
-    aState = STATE_NORMAL;
-  } else if(STATE_FIRST_CALL == aState) { // first time called
-    if (*aSrcLength < 2)
-      return NS_ERROR_ILLEGAL_INPUT;
-
-    // Eliminate BOM (0xFEFF). Note that different endian case is taken care of
-    // in |Convert| of LE and BE converters. Here, we only have to
-    // deal with the same endian case. That is, 0xFFFE (byte-swapped BOM) is
-    // illegal.
-    if(0xFEFF == *((PRUnichar*)src)) {
+  switch(aState) {
+    case STATE_FOUND_BOM:
+      if (*aSrcLength < 2)
+        return NS_ERROR_ILLEGAL_INPUT;
      src+=2;
-    } else if(0xFFFE == *((PRUnichar*)src)) {
-      *aSrcLength=0;
-      *aDestLength=0;
-      return NS_ERROR_ILLEGAL_INPUT;
-    }  
-    aState = STATE_NORMAL;
+      aState = STATE_NORMAL;
+      break;
+
+    case STATE_FIRST_CALL: // first time called
+      if (*aSrcLength < 2)
+        return NS_ERROR_ILLEGAL_INPUT;
+
+      // Eliminate BOM (0xFEFF). Note that different endian case is taken care
+      // of in |Convert| of LE and BE converters. Here, we only have to
+      // deal with the same endian case. That is, 0xFFFE (byte-swapped BOM) is
+      // illegal.
+      if(0xFEFF == *((PRUnichar*)src)) {
+        src+=2;
+      } else if(0xFFFE == *((PRUnichar*)src)) {
+        *aSrcLength=0;
+        *aDestLength=0;
+        return NS_ERROR_ILLEGAL_INPUT;
+      }  
+      aState = STATE_NORMAL;
+      break;
+
+    case STATE_ODD_SURROGATE_PAIR:
+      if (*aDestLength < 2)
+        *dest++ = UCS2_REPLACEMENT_CHAR;
+      else {
+        *dest++ = aOddHighSurrogate;
+        *dest++ = aOddLowSurrogate;
+        aOddHighSurrogate = aOddLowSurrogate = 0;
+        aState = STATE_NORMAL;
+      }
+      break;
+
+    case STATE_NORMAL:
+    case STATE_HALF_CODE_POINT:
+    default:
+      break;
  }

  if (src == srcEnd) {
-    *aDestLength = 0;
+    *aDestLength = dest - aDest;
    return NS_OK;
  }

@@ -140,17 +161,19 @@ have_codepoint:
      oddHighSurrogate = u;
    }
    else /* if (NS_IS_LOW_SURROGATE(u)) */ {
-      if (oddHighSurrogate) {
-        if (dest == destEnd - 1) {
-          *dest++ = UCS2_REPLACEMENT_CHAR;
+      if (oddHighSurrogate && *aDestLength > 1) {
+        if (dest + 1 >= destEnd) {
+          aOddLowSurrogate = u;
+          aOddHighSurrogate = oddHighSurrogate;
+          aState = STATE_ODD_SURROGATE_PAIR;
          goto error;
        }
        *dest++ = oddHighSurrogate;
        *dest++ = u;
-        oddHighSurrogate = 0;
      } else {
        *dest++ = UCS2_REPLACEMENT_CHAR;
      }
+      oddHighSurrogate = 0;
    }
  }
  if (src != srcEnd) {
@@ -177,6 +200,7 @@ nsUTF16ToUnicodeBase::Reset()
  mState = STATE_FIRST_CALL;
  mOddByte = 0;
  mOddHighSurrogate = 0;
+  mOddLowSurrogate = 0;
  return NS_OK;
 }

@@ -185,9 +209,11 @@ nsUTF16ToUnicodeBase::GetMaxLength(const char * aSrc, PRInt32 aSrcLength,
                                   PRInt32 * aDestLength)
 {
  // the left-over data of the previous run have to be taken into account.
-  *aDestLength = (aSrcLength +
-                    ((STATE_HALF_CODE_POINT == mState) ? 1 : 0)) / 2 +
-                 ((mOddHighSurrogate != 0) ? 1 : 0);
+  *aDestLength = (aSrcLength + ((STATE_HALF_CODE_POINT == mState) ? 1 : 0)) / 2;
+  if (mOddHighSurrogate)
+    (*aDestLength)++;
+  if (mOddLowSurrogate)
+    (*aDestLength)++;
  return NS_OK;
 }

@@ -216,6 +242,7 @@ nsUTF16BEToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLength,
 #endif

  nsresult rv = UTF16ConvertToUnicode(mState, mOddByte, mOddHighSurrogate,
+                                      mOddLowSurrogate,
                                      aSrc, aSrcLength, aDest, aDestLength,
 #ifdef IS_LITTLE_ENDIAN
                                      PR_TRUE
@@ -250,6 +277,7 @@ nsUTF16LEToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLength,
 #endif
    
  nsresult rv = UTF16ConvertToUnicode(mState, mOddByte, mOddHighSurrogate,
+                                      mOddLowSurrogate,
                                      aSrc, aSrcLength, aDest, aDestLength,
 #ifdef IS_BIG_ENDIAN
                                      PR_TRUE
@@ -308,6 +336,7 @@ nsUTF16ToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLength,
    }
    
    nsresult rv = UTF16ConvertToUnicode(mState, mOddByte, mOddHighSurrogate,
+                                        mOddLowSurrogate,
                                        aSrc, aSrcLength, aDest, aDestLength,
 #ifdef IS_BIG_ENDIAN
                                        (mEndian == kLittleEndian)
--- a/intl/uconv/ucvlatin/nsUCS2BEToUnicode.h
+++ b/intl/uconv/ucvlatin/nsUCS2BEToUnicode.h
@@ -62,6 +62,8 @@ protected:
  PRUint8 mOddByte;
  // to store an odd high surrogate left over between runs
  PRUnichar mOddHighSurrogate;
+  // to store an odd low surrogate left over between runs
+  PRUnichar mOddLowSurrogate;
 };

 // UTF-16 big endian