Bug 746900 - Implement "Best Practices for Using U+FFFD" from the Unicode standard. r=smontagu

2012-11-26 20:38:19 -05:00 · 2012-11-26 20:38:19 -05:00 · cd71c36381
--- a/intl/uconv/src/nsUTF8ToUnicode.cpp
+++ b/intl/uconv/src/nsUTF8ToUnicode.cpp
@ -214,52 +214,38 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc,
    mFirst = false;

  for (in = aSrc; ((in < inend) && (out < outend)); ++in) {
+    uint8_t c = *in;
    if (0 == mState) {
      // When mState is zero we expect either a US-ASCII character or a
      // multi-octet sequence.
-      if (0 == (0x80 & (*in))) {
+      if (c < 0x80) {  // 00..7F
        int32_t max_loops = NS_MIN(inend - in, outend - out);
        Convert_ascii_run(in, out, max_loops);
        --in; // match the rest of the cases
        mBytes = 1;
-      } else if (0xC0 == (0xE0 & (*in))) {
+      } else if (c < 0xC2) {  // C0/C1
+        // Overlong 2 octet sequence
+        res = NS_ERROR_ILLEGAL_INPUT;
+        break;
+      } else if (c < 0xE0) {  // C2..DF
        // First octet of 2 octet sequence
-        mUcs4 = (uint32_t)(*in);
+        mUcs4 = c;
        mUcs4 = (mUcs4 & 0x1F) << 6;
        mState = 1;
        mBytes = 2;
-      } else if (0xE0 == (0xF0 & (*in))) {
+      } else if (c < 0xF0) {  // E0..EF
        // First octet of 3 octet sequence
-        mUcs4 = (uint32_t)(*in);
+        mUcs4 = c;
        mUcs4 = (mUcs4 & 0x0F) << 12;
        mState = 2;
        mBytes = 3;
-      } else if (0xF0 == (0xF8 & (*in))) {
+      } else if (c < 0xF5) {  // F0..F4
        // First octet of 4 octet sequence
-        mUcs4 = (uint32_t)(*in);
+        mUcs4 = c;
        mUcs4 = (mUcs4 & 0x07) << 18;
        mState = 3;
        mBytes = 4;
-      } else if (0xF8 == (0xFC & (*in))) {
-        /* First octet of 5 octet sequence.
-         *
-         * This is illegal because the encoded codepoint must be either
-         * (a) not the shortest form or
-         * (b) outside the Unicode range of 0-0x10FFFF.
-         * Rather than trying to resynchronize, we will carry on until the end
-         * of the sequence and let the later error handling code catch it.
-         */
-        mUcs4 = (uint32_t)(*in);
-        mUcs4 = (mUcs4 & 0x03) << 24;
-        mState = 4;
-        mBytes = 5;
-      } else if (0xFC == (0xFE & (*in))) {
-        // First octet of 6 octet sequence, see comments for 5 octet sequence.
-        mUcs4 = (uint32_t)(*in);
-        mUcs4 = (mUcs4 & 1) << 30;
-        mState = 5;
-        mBytes = 6;
-      } else {
+      } else {  // F5..FF
        /* Current octet is neither in the US-ASCII range nor a legal first
         * octet of a multi-octet sequence.
         *
@ -272,32 +258,34 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc,
    } else {
      // When mState is non-zero, we expect a continuation of the multi-octet
      // sequence
-      if (0x80 == (0xC0 & (*in))) {
+      if (0x80 == (0xC0 & c)) {
+        if (mState > 1) {
+          // If we are here, all possibilities are:
+          // mState == 2 && mBytes == 3 ||
+          // mState == 2 && mBytes == 4 ||
+          // mState == 3 && mBytes == 4
+          if (mBytes == 3 && (!mUcs4 && c < 0xA0 ||  // E0 80..9F
+                              mUcs4 == 0xD000 && c > 0x9F) ||  // ED A0..BF
+              mState == 3 && (!mUcs4 && c < 0x90 ||  // F0 80..8F
+                              mUcs4 == 0x100000 && c > 0x8F)) {  // F4 90..BF
+            // illegal sequences or sequences converted into illegal ranges.
+            in--;
+            res = NS_ERROR_ILLEGAL_INPUT;
+            break;
+          }
+        }
+
        // Legal continuation.
        uint32_t shift = (mState - 1) * 6;
-        uint32_t tmp = *in;
+        uint32_t tmp = c;
        tmp = (tmp & 0x0000003FL) << shift;
        mUcs4 |= tmp;

        if (0 == --mState) {
          /* End of the multi-octet sequence. mUcs4 now contains the final
           * Unicode codepoint to be output
-           *
-           * Check for illegal sequences and codepoints.
           */

-          // From Unicode 3.1, non-shortest form is illegal
-          if (((2 == mBytes) && (mUcs4 < 0x0080)) ||
-              ((3 == mBytes) && (mUcs4 < 0x0800)) ||
-              ((4 == mBytes) && (mUcs4 < 0x10000)) ||
-              (4 < mBytes) ||
-              // From Unicode 3.2, surrogate characters are illegal
-              ((mUcs4 & 0xFFFFF800) == 0xD800) ||
-              // Codepoints outside the Unicode range are illegal
-              (mUcs4 > 0x10FFFF)) {
-            res = NS_ERROR_ILLEGAL_INPUT;
-            break;
-          }
          if (mUcs4 > 0xFFFF) {
            // mUcs4 is in the range 0x10000 - 0x10FFFF. Output a UTF-16 pair
            if (out + 2 > outend) {
@ -320,7 +308,7 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc,
          mFirst = false;
        }
      } else {
-        /* ((0xC0 & (*in) != 0x80) && (mState != 0))
+        /* (((0xC0 & c) != 0x80) && (mState != 0))
         * 
         * Incomplete multi-octet sequence. Unconsume this
         * octet and return an error condition. Caller is responsible
--- a/intl/uconv/src/nsUnicodeToUTF8.cpp
+++ b/intl/uconv/src/nsUnicodeToUTF8.cpp
@ -50,9 +50,9 @@ NS_IMETHODIMP nsUnicodeToUTF8::Convert(const PRUnichar * aSrc,
      return NS_OK_UENC_MOREOUTPUT;
    }
    if (*src < (PRUnichar)0xdc00 || *src > (PRUnichar)0xdfff) { //not a pair
-      *dest++ = (char)0xe0 | (mHighSurrogate >> 12);
-      *dest++ = (char)0x80 | ((mHighSurrogate >> 6) & 0x003f);
-      *dest++ = (char)0x80 | (mHighSurrogate & 0x003f);
+      *dest++ = (char)0xef; //replacement character
+      *dest++ = (char)0xbf;
+      *dest++ = (char)0xbd;
      destLen -= 3;
    } else { 
      n = ((mHighSurrogate - (PRUnichar)0xd800) << 10) + 
@ -79,7 +79,17 @@ NS_IMETHODIMP nsUnicodeToUTF8::Convert(const PRUnichar * aSrc,
      *dest++ = (char)0xc0 | (*src >> 6);
      *dest++ = (char)0x80 | (*src & 0x003f);
      destLen -= 2;
-    } else if (*src >= (PRUnichar)0xD800 && *src < (PRUnichar)0xDC00) {
+    } else if (*src >= (PRUnichar)0xd800 && *src <= (PRUnichar)0xdfff) {
+      if (*src >= (PRUnichar)0xdc00) { //not a pair
+        if (destLen < 3)
+          goto error_more_output;
+        *dest++ = (char)0xef; //replacement character
+        *dest++ = (char)0xbf;
+        *dest++ = (char)0xbd;
+        destLen -= 3;
+        ++src;
+        continue;
+      }
      if ((src+1) >= srcEnd) {
        //we need another surrogate to complete this unicode char
        mHighSurrogate = *src;
@ -90,9 +100,9 @@ NS_IMETHODIMP nsUnicodeToUTF8::Convert(const PRUnichar * aSrc,
      if (destLen < 4)
        goto error_more_output;
      if (*(src+1) < (PRUnichar)0xdc00 || *(src+1) > 0xdfff) { //not a pair
-        *dest++ = (char)0xe0 | (*src >> 12);
-        *dest++ = (char)0x80 | ((*src >> 6) & 0x003f);
-        *dest++ = (char)0x80 | (*src & 0x003f);
+        *dest++ = (char)0xef; //replacement character
+        *dest++ = (char)0xbf;
+        *dest++ = (char)0xbd;
        destLen -= 3;
      } else {
        n = ((*src - (PRUnichar)0xd800) << 10) + (*(src+1) - (PRUnichar)0xdc00) + (uint32_t)0x10000;
@ -133,9 +143,9 @@ NS_IMETHODIMP nsUnicodeToUTF8::Finish(char * aDest, int32_t * aDestLength)
      *aDestLength = 0;
      return NS_OK_UENC_MOREOUTPUT;
    }
-    *dest++ = (char)0xe0 | (mHighSurrogate >> 12);
-    *dest++ = (char)0x80 | ((mHighSurrogate >> 6) & 0x003f);
-    *dest++ = (char)0x80 | (mHighSurrogate & 0x003f);
+    *dest++ = (char)0xef; //replacement character
+    *dest++ = (char)0xbf;
+    *dest++ = (char)0xbd;
    mHighSurrogate = 0;
    *aDestLength = 3;
    return NS_OK;
--- a/intl/uconv/tests/unit/test_utf8_illegals.js
+++ b/intl/uconv/tests/unit/test_utf8_illegals.js
@ -3,30 +3,92 @@
 const Cc = Components.Constructor;
 const Ci = Components.interfaces;

-const inStrings1 = new Array("%c0%af",              // long forms of 0x2F
-                             "%e0%80%af",
-                             "%f0%80%80%af",
-                             "%f8%80%80%80%af",
-                             "%fc%80%80%80%80%af",
-                                                    // lone surrogates
+const tests = [
+{ inStrings: ["%80",                 // Illegal or incomplete sequences
+              "%8f",
+              "%90",
+              "%9f",
+              "%a0",
+              "%bf",
+              "%c0",
+              "%c1",
+              "%c2",
+              "%df",
+              "%e0",
+              "%e0%a0",
+              "%e0%bf",
+              "%ed%80",
+              "%ed%9f",
+              "%ef",
+              "%ef%bf",
+              "%f0",
+              "%f0%90",
+              "%f0%90%80",
+              "%f0%90%bf",
+              "%f0%bf",
+              "%f0%bf%80",
+              "%f0%bf%bf",
+              "%f4",
+              "%f4%80",
+              "%f4%80%80",
+              "%f4%80%bf",
+              "%f4%8f",
+              "%f4%8f%80",
+              "%f4%8f%bf",
+              "%f5",
+              "%f7",
+              "%f8",
+              "%fb",
+              "%fc",
+              "%fd"],
+  expected: "ABC\ufffdXYZ" },
+
+{ inStrings: ["%c0%af",              // Illegal bytes in 2-octet
+              "%c1%af"],             //  sequences
+  expected: "ABC\ufffd\ufffdXYZ" },
+
+{ inStrings: ["%e0%80%80",           // Illegal bytes in 3-octet
+              "%e0%80%af",           //  sequences
+              "%e0%9f%bf",
+                                     // lone surrogates
              "%ed%a0%80",           // D800
              "%ed%ad%bf",           // DB7F
              "%ed%ae%80",           // DB80
              "%ed%af%bf",           // DBFF
              "%ed%b0%80",           // DC00
              "%ed%be%80",           // DF80
-                             "%ed%bf%bf");          // DFFF
-const expected1 = "ABC\ufffdXYZ";
+              "%ed%bf%bf"],          // DFFF
+  expected: "ABC\ufffd\ufffd\ufffdXYZ" },
+
+{ inStrings: ["%f0%80%80%80",        // Illegal bytes in 4-octet
+              "%f0%80%80%af",        //  sequences
+              "%f0%8f%bf%bf",
+              "%f4%90%80%80",
+              "%f4%bf%bf%bf",
+              "%f5%80%80%80",
+              "%f7%bf%bf%bf"],
+  expected: "ABC\ufffd\ufffd\ufffd\ufffdXYZ" },
+
+{ inStrings: ["%f8%80%80%80%80",     // Illegal bytes in 5-octet
+              "%f8%80%80%80%af",     //  sequences
+              "%fb%bf%bf%bf%bf"],
+  expected: "ABC\ufffd\ufffd\ufffd\ufffd\ufffdXYZ" },
+
                                     // Surrogate pairs
-const inStrings2 = new Array("%ed%a0%80%ed%b0%80",  // D800 DC00
+{ inStrings: ["%ed%a0%80%ed%b0%80",  // D800 DC00
              "%ed%a0%80%ed%bf%bf",  // D800 DFFF
              "%ed%ad%bf%ed%b0%80",  // DB7F DC00
              "%ed%ad%bf%ed%bf%bf",  // DB7F DFFF
              "%ed%ae%80%ed%b0%80",  // DB80 DC00
              "%ed%ae%80%ed%bf%bf",  // DB80 DFFF
              "%ed%af%bf%ed%b0%80",  // DBFF DC00
-                             "%ed%ad%bf%ed%bf%bf"); // DBFF DFFF
-const expected2 = "ABC\ufffd\ufffdXYZ";
+              "%ed%af%bf%ed%bf%bf",  // DBFF DFFF
+              "%fc%80%80%80%80%80",  // Illegal bytes in 6-octet
+              "%fc%80%80%80%80%af",  //  sequences
+              "%fd%bf%bf%bf%bf%bf"],
+  expected: "ABC\ufffd\ufffd\ufffd\ufffd\ufffd\ufffdXYZ" },
+];
+

 function testCaseInputStream(inStr, expected)
 {
@ -66,12 +128,9 @@ function testCaseInputStream(inStr, expected)
 }

 function run_test() {
-    for (var i = 0; i < inStrings1.length; ++i) {
-	var inStr = inStrings1[i];
-	testCaseInputStream(inStr, expected1);
+  for (var t of tests) {
+    for (var inStr of t.inStrings) {
+      testCaseInputStream(inStr, t.expected);
    }
-    for (var i = 0; i < inStrings2.length; ++i) {
-	var inStr = inStrings2[i];
-	testCaseInputStream(inStr, expected2);
  }
 }