Bug 575175: Make the HTML5 parser deal with Unicode decoder signaling -1 consumed bytes. r=smontagu a=blocking

2010-07-26 12:11:09 -07:00 · 2010-07-26 12:11:09 -07:00 · f2738cb6f4
--- a/intl/uconv/public/nsIUnicodeDecoder.h
+++ b/intl/uconv/public/nsIUnicodeDecoder.h
@ -125,8 +125,9 @@ public:
   * encountered, like a format error, the converter stop and return error.
   * However, we should keep in mind that we need to be lax in decoding. When
   * a decoding error is returned to the caller, it is the caller's
-   * responsibility to advance over the bad byte and reset the decoder before
-   * trying to call the decoder again.
+   * responsibility to advance over the bad byte (unless aSrcLength is -1, in
+   * which case the caller should call the decoder again from offset 0) and to
+   * reset the decoder before trying to call the decoder again.
   *
   * Converter required behavior:
   * In this order: when output space is full - return right away. When input
@ -137,7 +138,9 @@ public:
   *
   * @param aSrc        [IN] the source data buffer
   * @param aSrcLength  [IN/OUT] the length of source data buffer; after
-   *                    conversion will contain the number of bytes read
+   *                    conversion will contain the number of bytes read, or
+   *                    -1 on error, meaning that the caller should reset the
+   *                    decoder and then re-push the same buffer
   * @param aDest       [OUT] the destination data buffer
   * @param aDestLength [IN/OUT] the length of the destination data buffer;
   *                    after conversion will contain the number of Unicode
--- a/intl/uconv/src/nsUTF8ToUnicode.cpp
+++ b/intl/uconv/src/nsUTF8ToUnicode.cpp
@ -65,7 +65,7 @@ nsUTF8ToUnicode::nsUTF8ToUnicode()
 * However, there is an edge case where the output can be longer than the
 *  input: if the previous buffer ended with an incomplete multi-byte
 *  sequence and this buffer does not begin with a valid continuation
- *  byte, we will return NS_ERROR_UNEXPECTED and the caller may insert a
+ *  byte, we will return NS_ERROR_ILLEGAL_INPUT and the caller may insert a
 *  replacement character in the output buffer which corresponds to no
 *  character in the input buffer. So in the worst case the destination
 *  will need to be one code unit longer than the source.
@ -341,7 +341,7 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc,
         * Return an error condition. Caller is responsible for flushing and
         * refilling the buffer and resetting state.
         */
-        res = NS_ERROR_UNEXPECTED;
+        res = NS_ERROR_ILLEGAL_INPUT;
        break;
      }
    } else {
@ -370,7 +370,7 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc,
              ((mUcs4 & 0xFFFFF800) == 0xD800) ||
              // Codepoints outside the Unicode range are illegal
              (mUcs4 > 0x10FFFF)) {
-            res = NS_ERROR_UNEXPECTED;
+            res = NS_ERROR_ILLEGAL_INPUT;
            break;
          }
          if (mUcs4 > 0xFFFF) {
@ -396,7 +396,7 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc,
         * for flushing and refilling the buffer and resetting state.
         */
        in--;
-        res = NS_ERROR_UNEXPECTED;
+        res = NS_ERROR_ILLEGAL_INPUT;
        break;
      }
    }
--- a/parser/html/nsHtml5StreamParser.cpp
+++ b/parser/html/nsHtml5StreamParser.cpp
@ -476,7 +476,7 @@ nsHtml5StreamParser::WriteStreamBytes(const PRUint8* aFromSegment,
  if (mLastBuffer->getEnd() == NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE) {
    mLastBuffer = (mLastBuffer->next = new nsHtml5UTF16Buffer(NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE));
  }
-  PRUint32 totalByteCount = 0;
+  PRInt32 totalByteCount = 0;
  for (;;) {
    PRInt32 end = mLastBuffer->getEnd();
    PRInt32 byteCount = aCount - totalByteCount;
@ -491,19 +491,31 @@ nsHtml5StreamParser::WriteStreamBytes(const PRUint8* aFromSegment,
    totalByteCount += byteCount;
    aFromSegment += byteCount;

-    NS_ASSERTION(mLastBuffer->getEnd() <= NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE, "The Unicode decoder wrote too much data.");
+    NS_ASSERTION(end <= NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE,
+        "The Unicode decoder wrote too much data.");
+    NS_ASSERTION(byteCount >= -1, "The decoder consumed fewer than -1 bytes.");
+    NS_ASSERTION(byteCount > 0 || NS_FAILED(convResult),
+        "The decoder consumed too few bytes but did not signal an error.");

    if (NS_FAILED(convResult)) {
+      // Using the more generic NS_FAILED test above in case there are still
+      // decoders around that don't use NS_ERROR_ILLEGAL_INPUT properly.
+      NS_ASSERTION(convResult == NS_ERROR_ILLEGAL_INPUT,
+          "The decoder signaled an error other than NS_ERROR_ILLEGAL_INPUT.");
+
      // There's an illegal byte in the input. It's now the responsibility
      // of this calling code to output a U+FFFD REPLACEMENT CHARACTER and
      // reset the decoder.

-      NS_ASSERTION(totalByteCount < aCount,
-                   "The decoder signaled an error but consumed all input.");
-      if (totalByteCount < aCount) {
+      if (totalByteCount < (PRInt32)aCount) {
        // advance over the bad byte
        ++totalByteCount;
        ++aFromSegment;
+      } else {
+        NS_NOTREACHED("The decoder signaled an error but consumed all input.");
+        // Recovering from this situation in case there are still broken
+        // decoders, since nsScanner had recovery code, too.
+        totalByteCount = (PRInt32)aCount;
      }

      // Emit the REPLACEMENT CHARACTER
@ -515,16 +527,18 @@ nsHtml5StreamParser::WriteStreamBytes(const PRUint8* aFromSegment,
      }

      mUnicodeDecoder->Reset();
-      if (totalByteCount == aCount) {
-        *aWriteCount = totalByteCount;
+      if (totalByteCount == (PRInt32)aCount) {
+        *aWriteCount = (PRUint32)totalByteCount;
        return NS_OK;
      }
    } else if (convResult == NS_PARTIAL_MORE_OUTPUT) {
      mLastBuffer = mLastBuffer->next = new nsHtml5UTF16Buffer(NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE);
-      NS_ASSERTION(totalByteCount < aCount, "The Unicode decoder has consumed too many bytes.");
+      NS_ASSERTION(totalByteCount < (PRInt32)aCount,
+          "The Unicode decoder consumed too many bytes.");
    } else {
-      NS_ASSERTION(totalByteCount == aCount, "The Unicode decoder consumed the wrong number of bytes.");
-      *aWriteCount = totalByteCount;
+      NS_ASSERTION(totalByteCount == (PRInt32)aCount,
+          "The Unicode decoder consumed the wrong number of bytes.");
+      *aWriteCount = (PRUint32)totalByteCount;
      return NS_OK;
    }
  }