Fixing bug 395651. Make our UTF-8 and UTF-16 comparison code etc more robust when dealing with invalid UTF-16 data. r+sr=jonas@sicking.cc, a=jst@mozilla.org/jonas@sicking.cc

2007-09-14 16:09:49 -07:00 · 2007-09-14 16:09:49 -07:00 · 6132ff04ba
--- a/xpcom/string/public/nsUTF8Utils.h
+++ b/xpcom/string/public/nsUTF8Utils.h
@@ -293,13 +293,18 @@ public:
      }
    else if (NS_IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF
      {
-        if (*buffer == end)
+        if (p == end)
          {
-            NS_ERROR("Unexpected end of buffer after high surrogate");
+            // Found a high surrogate at the end of the buffer. Flag this
+            // as an error and return the Unicode replacement
+            // character 0xFFFD.
+
+            NS_WARNING("Unexpected end of buffer after high surrogate");
+
            if (err)
              *err = PR_TRUE;
-
-            return 0;
+            *buffer = p;
+            return 0xFFFD;
          }

        // D800- DBFF - High Surrogate
@@ -319,15 +324,31 @@ public:
          }
        else
          {
-            NS_ERROR("got a High Surrogate but no low surrogate");
-            // output nothing.
+            // Found a high surrogate followed by something other than
+            // a low surrogate. Flag this as an error and return the
+            // Unicode replacement character 0xFFFD.
+
+            NS_WARNING("got a High Surrogate but no low surrogate");
+
+            if (err)
+              *err = PR_TRUE;
+            *buffer = p;
+            return 0xFFFD;
          }
      }
    else // U+DC00 - U+DFFF
      {
        // DC00- DFFF - Low Surrogate
-        NS_ERROR("got a low Surrogate but no high surrogate");
-        // output nothing.
+
+        // Found a low surrogate w/o a preceding high surrogate. Flag
+        // this as an error and return the Unicode replacement
+        // character 0xFFFD.
+
+        NS_WARNING("got a low Surrogate but no high surrogate");
+        if (err)
+          *err = PR_TRUE;
+        *buffer = p;
+        return 0xFFFD;
      }

    if (err)
@@ -359,10 +380,15 @@ public:
      {
        if (iter == end)
          {
+            // Found a high surrogate at the end of the buffer. Flag this
+            // as an error and return the Unicode replacement
+            // character 0xFFFD.
+
+            NS_WARNING("Unexpected end of buffer after high surrogate");
+
            if (err)
              *err = PR_TRUE;
-
-            return 0;
+            return 0xFFFD;
          }

        // D800- DBFF - High Surrogate
@@ -381,15 +407,30 @@ public:
          }
        else
          {
-            NS_ERROR("got a High Surrogate but no low surrogate");
-            // output nothing.
+            // Found a high surrogate followed by something other than
+            // a low surrogate. Flag this as an error and return the
+            // Unicode replacement character 0xFFFD.
+
+            NS_WARNING("got a High Surrogate but no low surrogate");
+
+            if (err)
+              *err = PR_TRUE;
+            return 0xFFFD;
          }
      }
    else // U+DC00 - U+DFFF
      {
        // DC00- DFFF - Low Surrogate
-        NS_ERROR("got a low Surrogate but no high surrogate");
-        // output nothing.
+
+        // Found a low surrogate w/o a preceding high surrogate. Flag
+        // this as an error and return the Unicode replacement
+        // character 0xFFFD.
+
+        NS_WARNING("got a low Surrogate but no high surrogate");
+
+        if (err)
+          *err = PR_TRUE;
+        return 0xFFFD;
      }

    if (err)
--- a/xpcom/string/src/nsReadableUtils.cpp
+++ b/xpcom/string/src/nsReadableUtils.cpp
@@ -1158,11 +1158,20 @@ CompareUTF8toUTF16(const nsASingleFragmentCString& aUTF8String,
            if (err)
              return PR_INT32_MIN;

-            PRUint32 c16_32 = UTF16CharEnumerator::NextChar(&u16, u16end,
-                                                            &err);
-            if (err)
-              return PR_INT32_MIN;
-
+            PRUint32 c16_32 = UTF16CharEnumerator::NextChar(&u16, u16end);
+            // The above UTF16CharEnumerator::NextChar() call can
+            // fail, but if it does so for any reason other than having
+            // no data to look at (which can't happen here), it returns
+            // the Unicode replacement character 0xFFFD for the invalid
+            // data it was fed. Ignore that error and treat invalid
+            // UTF16 as 0xFFFD.
+            //
+            // This matches what our UTF16 to UTF8 conversion code
+            // does, and thus a UTF8 string that came from an invalid
+            // UTF16 string will compare equal to the invalid UTF16
+            // string it came from. The same is true for any other UTF16
+            // string that differs only in the invalid part of the string.
+            
            if (c8_32 != c16_32)
              return c8_32 < c16_32 ? -1 : 1;
          }