Bug 716579 - Let a BOM override HTTP-level charset in the HTML and XML parsers. r=smaug.

2012-11-06 13:57:51 +02:00 · 2012-11-06 13:57:51 +02:00 · 4038858f3f
--- a/parser/html/nsHtml5StreamParser.cpp
+++ b/parser/html/nsHtml5StreamParser.cpp
@ -24,6 +24,7 @@
 #include "expat.h"
 #include "nsINestedURI.h"
 #include "nsCharsetSource.h"
+#include "nsIWyciwygChannel.h"

 using namespace mozilla;

@ -495,8 +496,8 @@ nsHtml5StreamParser::FinalizeSniffing(const uint8_t* aFromSegment, // can be nul
                                      uint32_t aCountToSniffingLimit)
 {
  NS_ASSERTION(IsParserThread(), "Wrong thread!");
-  NS_ASSERTION(mCharsetSource < kCharsetFromMetaTag,
-      "Should not finalize sniffing when already confident.");
+  NS_ASSERTION(mCharsetSource < kCharsetFromParentForced,
+      "Should not finalize sniffing when using forced charset.");
  if (mMode == VIEW_SOURCE_XML) {
    static const XML_Memory_Handling_Suite memsuite =
      {
@ -634,6 +635,11 @@ nsHtml5StreamParser::SniffStreamBytes(const uint8_t* aFromSegment,
  NS_ASSERTION(IsParserThread(), "Wrong thread!");
  nsresult rv = NS_OK;
  uint32_t writeCount;
+
+  // mCharset and mCharsetSource potentially have come from channel or higher
+  // by now. If we find a BOM, SetupDecodingFromBom() will overwrite them.
+  // If we don't find a BOM, the previously set values of mCharset and
+  // mCharsetSource are not modified by the BOM sniffing here.
  for (uint32_t i = 0; i < aCount && mBomState != BOM_SNIFFING_OVER; i++) {
    switch (mBomState) {
      case BOM_SNIFFING_NOT_STARTED:
@ -701,8 +707,36 @@ nsHtml5StreamParser::SniffStreamBytes(const uint8_t* aFromSegment,
        break;
    }
  }
-  // if we get here, there either was no BOM or the BOM sniffing isn't complete yet
+  // if we get here, there either was no BOM or the BOM sniffing isn't complete
+  // yet
  
+  if (mBomState == BOM_SNIFFING_OVER &&
+    mCharsetSource >= kCharsetFromChannel) {
+    // There was no BOM and the charset came from channel or higher. mCharset
+    // still contains the charset from the channel or higher as set by an
+    // earlier call to SetDocumentCharset(), since we didn't find a BOM and
+    // overwrite mCharset.
+    nsCOMPtr<nsICharsetConverterManager> convManager =
+      do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID);
+    convManager->GetUnicodeDecoder(mCharset.get(),
+                                   getter_AddRefs(mUnicodeDecoder));
+    if (mUnicodeDecoder) {
+      mUnicodeDecoder->SetInputErrorBehavior(
+          nsIUnicodeDecoder::kOnError_Recover);
+      mFeedChardet = false;
+      mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
+      mMetaScanner = nullptr;
+      return WriteSniffingBufferAndCurrentSegment(aFromSegment,
+                                                  aCount,
+                                                  aWriteCount);
+    } else {
+      // nsHTMLDocument is supposed to make sure this does not happen. Let's
+      // deal with this anyway, since who knows how kCharsetFromOtherComponent
+      // is used.
+      mCharsetSource = kCharsetFromWeakDocTypeDefault;
+    }
+  }
+
  if (!mMetaScanner && (mMode == NORMAL ||
                        mMode == VIEW_SOURCE_HTML ||
                        mMode == LOAD_AS_DATA)) {
@ -963,7 +997,13 @@ nsHtml5StreamParser::OnStartRequest(nsIRequest* aRequest, nsISupports* aContext)
    mFeedChardet = false;
  }
  
-  if (mCharsetSource <= kCharsetFromMetaPrescan) {
+  nsCOMPtr<nsIWyciwygChannel> wyciwygChannel(do_QueryInterface(mRequest));
+  if (wyciwygChannel) {
+    mReparseForbidden = true;
+    mFeedChardet = false;
+    // If we are reloading a document.open()ed doc, fall through to converter
+    // instantiation here and avoid BOM sniffing.
+  } else if (mCharsetSource < kCharsetFromParentForced) {
    // we aren't ready to commit to an encoding yet
    // leave converter uninstantiated for now
    return NS_OK;
--- a/parser/htmlparser/src/nsParser.cpp
+++ b/parser/htmlparser/src/nsParser.cpp
@ -41,6 +41,7 @@
 #include "mozilla/Mutex.h"
 #include "nsParserConstants.h"
 #include "nsCharsetSource.h"
+#include "nsContentUtils.h"

 using namespace mozilla;

@ -1250,8 +1251,7 @@ nsParser::Parse(nsIURI* aURL,
    }
    NS_ConvertUTF8toUTF16 theName(spec);

-    nsScanner* theScanner = new nsScanner(theName, false, mCharset,
-                                          mCharsetSource);
+    nsScanner* theScanner = new nsScanner(theName, false);
    CParserContext* pc = new CParserContext(mParserContext, theScanner, aKey,
                                            mCommand, aListener);
    if (pc && theScanner) {
@ -1311,7 +1311,7 @@ nsParser::Parse(const nsAString& aSourceBuffer,
    if (!pc) {
      // Only make a new context if we don't have one, OR if we do, but has a
      // different context key.
-      nsScanner* theScanner = new nsScanner(mUnusedInput, mCharset, mCharsetSource);
+      nsScanner* theScanner = new nsScanner(mUnusedInput);
      NS_ENSURE_TRUE(theScanner, NS_ERROR_OUT_OF_MEMORY);

      eAutoDetectResult theStatus = eUnknownDetect;
@ -1674,11 +1674,6 @@ nsParser::OnStartRequest(nsIRequest *request, nsISupports* aContext)
 }


-#define UTF16_BOM "UTF-16"
-#define UTF16_BE "UTF-16BE"
-#define UTF16_LE "UTF-16LE"
-#define UTF8 "UTF-8"
-
 static inline bool IsSecondMarker(unsigned char aChar)
 {
  switch (aChar) {
@ -1693,146 +1688,87 @@ static inline bool IsSecondMarker(unsigned char aChar)
 }

 static bool
-DetectByteOrderMark(const unsigned char* aBytes, int32_t aLen,
-                    nsCString& oCharset, int32_t& oCharsetSource)
+ExtractCharsetFromXmlDeclaration(const unsigned char* aBytes, int32_t aLen,
+                                 nsCString& oCharset)
 {
- oCharsetSource= kCharsetFromAutoDetection;
- oCharset.Truncate();
- // See http://www.w3.org/TR/2000/REC-xml-20001006#sec-guessing
- // for details
- // Also, MS Win2K notepad now generate 3 bytes BOM in UTF8 as UTF8 signature
- // We need to check that
- // UCS2 BOM FEFF = UTF8 EF BB BF
- switch(aBytes[0])
-	 {
-   case 0x00:
-     if((0x3C==aBytes[1]) && (0x00==aBytes[2])) {
-        // 00 3C 00
-        if(IsSecondMarker(aBytes[3])) {
-           // 00 3C 00 SM UTF-16,  big-endian, no Byte Order Mark 
-           oCharset.Assign(UTF16_BE); 
-           oCharsetSource = kCharsetFromByteOrderMark;
-        } 
-     }
-   break;
-   case 0x3C:
-     if(0x00==aBytes[1] && (0x00==aBytes[3])) {
-        // 3C 00 XX 00
-        if(IsSecondMarker(aBytes[2])) {
-           // 3C 00 SM 00 UTF-16,  little-endian, no Byte Order Mark 
-           oCharset.Assign(UTF16_LE); 
-           oCharsetSource = kCharsetFromByteOrderMark;
-        } 
-     // For html, meta tag detector is invoked before this so that we have 
-     // to deal only with XML here.
-     } else if(                     (0x3F==aBytes[1]) &&
-               (0x78==aBytes[2]) && (0x6D==aBytes[3]) &&
-               (0 == PL_strncmp("<?xml", (char*)aBytes, 5 ))) {
-       // 3C 3F 78 6D
-       // ASCII characters are in their normal positions, so we can safely
-       // deal with the XML declaration in the old C way
-       // The shortest string so far (strlen==5):
-       // <?xml
-       int32_t i;
-       bool versionFound = false, encodingFound = false;
-       for (i=6; i < aLen && !encodingFound; ++i) {
-         // end of XML declaration?
-         if ((((char*)aBytes)[i] == '?') && 
-           ((i+1) < aLen) &&
-           (((char*)aBytes)[i+1] == '>')) {
-           break;
-         }
-         // Version is required.
-         if (!versionFound) {
-           // Want to avoid string comparisons, hence looking for 'n'
-           // and only if found check the string leading to it. Not
-           // foolproof, but fast.
-           // The shortest string allowed before this is  (strlen==13):
-           // <?xml version
-           if ((((char*)aBytes)[i] == 'n') &&
-             (i >= 12) && 
-             (0 == PL_strncmp("versio", (char*)(aBytes+i-6), 6 ))) {
-             // Fast forward through version
-             char q = 0;
-             for (++i; i < aLen; ++i) {
-               char qi = ((char*)aBytes)[i];
-               if (qi == '\'' || qi == '"') {
-                 if (q && q == qi) {
-                   //  ending quote
-                   versionFound = true;
-                   break;
-                 } else {
-                   // Starting quote
-                   q = qi;
-                 }
-               }
-             }
-           }
-         } else {
-           // encoding must follow version
-           // Want to avoid string comparisons, hence looking for 'g'
-           // and only if found check the string leading to it. Not
-           // foolproof, but fast.
-           // The shortest allowed string before this (strlen==26):
-           // <?xml version="1" encoding
-           if ((((char*)aBytes)[i] == 'g') &&
-             (i >= 25) && 
-             (0 == PL_strncmp("encodin", (char*)(aBytes+i-7), 7 ))) {
-             int32_t encStart = 0;
-             char q = 0;
-             for (++i; i < aLen; ++i) {
-               char qi = ((char*)aBytes)[i];
-               if (qi == '\'' || qi == '"') {
-                 if (q && q == qi) {
-                   int32_t count = i - encStart;
-                   // encoding value is invalid if it is UTF-16
-                   if (count > 0 && 
-                     (0 != PL_strcmp("UTF-16", (char*)(aBytes+encStart)))) {
-                     oCharset.Assign((char*)(aBytes+encStart),count);
-                     oCharsetSource = kCharsetFromMetaTag;
-                   }
-                   encodingFound = true;
-                   break;
-                 } else {
-                   encStart = i+1;
-                   q = qi;
-                 }
-               }
-             }
-           }
-         } // if (!versionFound)
-       } // for
-     }
-   break;
-   case 0xEF:  
-     if((0xBB==aBytes[1]) && (0xBF==aBytes[2])) {
-        // EF BB BF
-        // Win2K UTF-8 BOM
-        oCharset.Assign(UTF8); 
-        oCharsetSource= kCharsetFromByteOrderMark;
-     }
-   break;
-   case 0xFE:
-     if(0xFF==aBytes[1]) {
-        // FE FF UTF-16, big-endian 
-        oCharset.Assign(UTF16_BOM); 
-        oCharsetSource= kCharsetFromByteOrderMark;
-     }
-   break;
-   case 0xFF:
-     if(0xFE==aBytes[1]) {
-       // FF FE
-       // UTF-16, little-endian 
-       oCharset.Assign(UTF16_BOM); 
-       oCharsetSource= kCharsetFromByteOrderMark;
-     }
-   break;
-   // case 0x4C: if((0x6F==aBytes[1]) && ((0xA7==aBytes[2] && (0x94==aBytes[3])) {
-   //   We do not care EBCIDIC here....
-   // }
-   // break;
- }  // switch
- return !oCharset.IsEmpty();
+  // This code is rather pointless to have. Might as well reuse expat as
+  // seen in nsHtml5StreamParser. -- hsivonen
+  oCharset.Truncate();
+  if ((aLen >= 5) &&
+      ('<' == aBytes[0]) &&
+      ('?' == aBytes[1]) &&
+      ('x' == aBytes[2]) &&
+      ('m' == aBytes[3]) &&
+      ('l' == aBytes[4])) {
+    int32_t i;
+    bool versionFound = false, encodingFound = false;
+    for (i = 6; i < aLen && !encodingFound; ++i) {
+      // end of XML declaration?
+      if ((((char*) aBytes)[i] == '?') &&
+          ((i + 1) < aLen) &&
+          (((char*) aBytes)[i + 1] == '>')) {
+        break;
+      }
+      // Version is required.
+      if (!versionFound) {
+        // Want to avoid string comparisons, hence looking for 'n'
+        // and only if found check the string leading to it. Not
+        // foolproof, but fast.
+        // The shortest string allowed before this is  (strlen==13):
+        // <?xml version
+        if ((((char*) aBytes)[i] == 'n') &&
+            (i >= 12) &&
+            (0 == PL_strncmp("versio", (char*) (aBytes + i - 6), 6))) {
+          // Fast forward through version
+          char q = 0;
+          for (++i; i < aLen; ++i) {
+            char qi = ((char*) aBytes)[i];
+            if (qi == '\'' || qi == '"') {
+              if (q && q == qi) {
+                //  ending quote
+                versionFound = true;
+                break;
+              } else {
+                // Starting quote
+                q = qi;
+              }
+            }
+          }
+        }
+      } else {
+        // encoding must follow version
+        // Want to avoid string comparisons, hence looking for 'g'
+        // and only if found check the string leading to it. Not
+        // foolproof, but fast.
+        // The shortest allowed string before this (strlen==26):
+        // <?xml version="1" encoding
+        if ((((char*) aBytes)[i] == 'g') && (i >= 25) && (0 == PL_strncmp(
+            "encodin", (char*) (aBytes + i - 7), 7))) {
+          int32_t encStart = 0;
+          char q = 0;
+          for (++i; i < aLen; ++i) {
+            char qi = ((char*) aBytes)[i];
+            if (qi == '\'' || qi == '"') {
+              if (q && q == qi) {
+                int32_t count = i - encStart;
+                // encoding value is invalid if it is UTF-16
+                if (count > 0 && (0 != PL_strcmp("UTF-16",
+                    (char*) (aBytes + encStart)))) {
+                  oCharset.Assign((char*) (aBytes + encStart), count);
+                }
+                encodingFound = true;
+                break;
+              } else {
+                encStart = i + 1;
+                q = qi;
+              }
+            }
+          }
+        }
+      } // if (!versionFound)
+    } // for
+  }
+  return !oCharset.IsEmpty();
 }

 inline const char
@ -1843,131 +1779,6 @@ GetNextChar(nsACString::const_iterator& aStart,
  return (++aStart != aEnd) ? *aStart : '\0';
 }

-bool
-nsParser::DetectMetaTag(const char* aBytes,
-                        int32_t aLen,
-                        nsCString& aCharset,
-                        int32_t& aCharsetSource)
-{
-  aCharsetSource= kCharsetFromMetaTag;
-  aCharset.SetLength(0);
-
-  // XXX Only look inside HTML documents for now. For XML
-  // documents we should be looking inside the XMLDecl.
-  if (!mParserContext->mMimeType.EqualsLiteral(TEXT_HTML)) {
-    return false;
-  }
-
-  // Fast and loose parsing to determine if we have a complete
-  // META tag in this block, looking upto 2k into it.
-  const nsASingleFragmentCString& str =
-      Substring(aBytes, aBytes + NS_MIN(aLen, 2048));
-  // XXXldb Should be const_char_iterator when FindInReadable supports it.
-  nsACString::const_iterator begin, end;
-
-  str.BeginReading(begin);
-  str.EndReading(end);
-  nsACString::const_iterator currPos(begin);
-  nsACString::const_iterator tokEnd;
-  nsACString::const_iterator tagEnd(begin);
-
-  while (currPos != end) {
-    if (!FindCharInReadable('<', currPos, end))
-      break; // no tag found in this buffer
-
-    if (GetNextChar(currPos, end) == '!') {
-      if (GetNextChar(currPos, end) != '-' ||
-          GetNextChar(currPos, end) != '-') {
-        // If we only see a <! not followed by --, just skip to the next >.
-        if (!FindCharInReadable('>', currPos, end)) {
-          return false; // No more tags to follow.
-        }
-
-        // Continue searching for a meta tag following this "comment".
-        ++currPos;
-        continue;
-      }
-
-      // Found MDO ( <!-- ). Now search for MDC ( --[*s]> )
-      bool foundMDC = false;
-      bool foundMatch = false;
-      while (!foundMDC) {
-        if (GetNextChar(currPos, end) == '-' &&
-            GetNextChar(currPos, end) == '-') {
-          foundMatch = !foundMatch; // toggle until we've matching "--"
-        } else if (currPos == end) {
-          return false; // Couldn't find --[*s]> in this buffer
-        } else if (foundMatch && *currPos == '>') {
-          foundMDC = true; // found comment end delimiter.
-          ++currPos;
-        }
-      }
-      continue; // continue searching for META tag.
-    }
-
-    // Find the end of the tag, break if incomplete
-    tagEnd = currPos;
-    if (!FindCharInReadable('>', tagEnd, end))
-      break;
-
-    // If this is not a META tag, continue to next loop
-    if ( (*currPos != 'm' && *currPos != 'M') ||
-         (*(++currPos) != 'e' && *currPos != 'E') ||
-         (*(++currPos) != 't' && *currPos != 'T') ||
-         (*(++currPos) != 'a' && *currPos != 'A') ||
-         !nsCRT::IsAsciiSpace(*(++currPos))) {
-      currPos = tagEnd;
-      continue;
-    }
-
-    // If could not find "charset" in this tag, skip this tag and try next
-    tokEnd = tagEnd;
-    if (!CaseInsensitiveFindInReadable(NS_LITERAL_CSTRING("CHARSET"),
-                                       currPos, tokEnd)) {
-      currPos = tagEnd;
-      continue;
-    }
-    currPos = tokEnd;
-
-    // skip spaces before '='
-    while (*currPos == kSpace || *currPos == kNewLine ||
-           *currPos == kCR || *currPos == kTab) {
-      ++currPos;
-    }
-    // skip '='
-    if (*currPos != '=') {
-      currPos = tagEnd;
-      continue;
-    }
-    ++currPos;
-    // skip spaces after '='
-    while (*currPos == kSpace || *currPos == kNewLine ||
-           *currPos == kCR || *currPos == kTab) {
-      ++currPos;
-    }
-
-    // skip open quote
-    if (*currPos == '\'' || *currPos == '\"')
-      ++currPos;
-
-    // find the end of charset string
-    tokEnd = currPos;
-    while (*tokEnd != '\'' && *tokEnd != '\"' && tokEnd != tagEnd)
-      ++tokEnd;
-
-    // return true if we successfully got something for charset
-    if (currPos != tokEnd) {
-      aCharset.Assign(currPos.get(), tokEnd.get() - currPos.get());
-      return true;
-    }
-
-    // Nothing specified as charset, continue next loop
-    currPos = tagEnd;
-  }
-
-  return false;
-}
-
 static NS_METHOD
 NoOpParserWriteFunc(nsIInputStream* in,
                void* closure,
@ -2003,7 +1814,8 @@ ParserWriteFunc(nsIInputStream* in,
 {
  nsresult result;
  ParserWriteStruct* pws = static_cast<ParserWriteStruct*>(closure);
-  const char* buf = fromRawSegment;
+  const unsigned char* buf =
+    reinterpret_cast<const unsigned char*> (fromRawSegment);
  uint32_t theNumRead = count;

  if (!pws) {
@ -2011,47 +1823,37 @@ ParserWriteFunc(nsIInputStream* in,
  }

  if (pws->mNeedCharsetCheck) {
-    int32_t guessSource;
-    nsAutoCString guess;
-    nsAutoCString preferred;
-
    pws->mNeedCharsetCheck = false;
-    if (pws->mParser->DetectMetaTag(buf, theNumRead, guess, guessSource) ||
-        ((count >= 4) &&
-         DetectByteOrderMark((const unsigned char*)buf,
-                             theNumRead, guess, guessSource))) {
-      result = nsCharsetAlias::GetPreferred(guess, preferred);
-      // Only continue if it's a recognized charset and not
-      // one of a designated set that we ignore.
-      if (NS_SUCCEEDED(result) &&
-          ((kCharsetFromByteOrderMark == guessSource) ||
-           (!preferred.EqualsLiteral("UTF-16") &&
-            !preferred.EqualsLiteral("UTF-16BE") &&
-            !preferred.EqualsLiteral("UTF-16LE")))) {
-        guess = preferred;
-        pws->mParser->SetDocumentCharset(guess, guessSource);
-        pws->mParser->SetSinkCharset(preferred);
-        nsCOMPtr<nsICachingChannel> channel(do_QueryInterface(pws->mRequest));
-        if (channel) {
-          nsCOMPtr<nsISupports> cacheToken;
-          channel->GetCacheToken(getter_AddRefs(cacheToken));
-          if (cacheToken) {
-            nsCOMPtr<nsICacheEntryDescriptor> cacheDescriptor(do_QueryInterface(cacheToken));
-            if (cacheDescriptor) {
-#ifdef DEBUG
-              nsresult rv =
-#endif
-                cacheDescriptor->SetMetaDataElement("charset",
-                                                    guess.get());
-              NS_ASSERTION(NS_SUCCEEDED(rv),"cannot SetMetaDataElement");
-            }
-          }
+    int32_t source;
+    nsAutoCString preferred;
+    nsAutoCString maybePrefer;
+    pws->mParser->GetDocumentCharset(preferred, source);
+
+    // This code was bogus when I found it. It expects the BOM or the XML
+    // declaration to be entirely in the first network buffer. -- hsivonen
+    if (nsContentUtils::CheckForBOM(buf, count, maybePrefer)) {
+      // The decoder will swallow the BOM. The UTF-16 will re-sniff for
+      // endianness. The value of preferred is now either "UTF-8" or "UTF-16".
+      preferred.Assign(maybePrefer);
+      source = kCharsetFromByteOrderMark;
+    } else if (source < kCharsetFromChannel) {
+      nsAutoCString declCharset;
+
+      if (ExtractCharsetFromXmlDeclaration(buf, count, declCharset)) {
+        nsresult rv = nsCharsetAlias::GetPreferred(declCharset, maybePrefer);
+        if (NS_SUCCEEDED(rv)) {
+          preferred.Assign(maybePrefer);
+          source = kCharsetFromMetaTag;
        }
      }
    }
+
+    pws->mParser->SetDocumentCharset(preferred, source);
+    pws->mParser->SetSinkCharset(preferred);
+
  }

-  result = pws->mScanner->Append(buf, theNumRead, pws->mRequest);
+  result = pws->mScanner->Append(fromRawSegment, theNumRead, pws->mRequest);
  if (NS_SUCCEEDED(result)) {
    *writeCount = count;
  }
@ -2103,8 +1905,7 @@ nsParser::OnDataAvailable(nsIRequest *request, nsISupports* aContext,

    uint32_t totalRead;
    ParserWriteStruct pws;
-    pws.mNeedCharsetCheck =
-      (0 == sourceOffset) && (mCharsetSource < kCharsetFromMetaTag);
+    pws.mNeedCharsetCheck = true;
    pws.mParser = this;
    pws.mScanner = theContext->mScanner;
    pws.mRequest = request;
--- a/parser/htmlparser/src/nsParser.h
+++ b/parser/htmlparser/src/nsParser.h
@ -246,15 +246,6 @@ class nsParser : public nsIParser,
     */
    virtual nsIStreamListener* GetStreamListener();

-    /** 
-     * Detects the existence of a META tag with charset information in 
-     * the given buffer.
-     */
-    bool DetectMetaTag(const char* aBytes, 
-                         int32_t aLen, 
-                         nsCString& oCharset, 
-                         int32_t& oCharsetSource);
-
    void SetSinkCharset(nsACString& aCharset);

    /**
--- a/parser/htmlparser/src/nsScanner.cpp
+++ b/parser/htmlparser/src/nsScanner.cpp
@ -57,8 +57,7 @@ const int   kBufsize=64;
 *  @param   aMode represents the parser mode (nav, other)
 *  @return  
 */
-nsScanner::nsScanner(const nsAString& anHTMLString, const nsACString& aCharset,
-                     int32_t aSource)
+nsScanner::nsScanner(const nsAString& anHTMLString)
 {
  MOZ_COUNT_CTOR(nsScanner);

@ -84,13 +83,8 @@ nsScanner::nsScanner(const nsAString& anHTMLString, const nsACString& aCharset,
 *  Use this constructor if you want i/o to be based on strings 
 *  the scanner receives. If you pass a null filename, you
 *  can still provide data to the scanner via append.
- *
- *  @update  gess 5/12/98
- *  @param   aFilename --
- *  @return  
 */
-nsScanner::nsScanner(nsString& aFilename,bool aCreateStream,
-                     const nsACString& aCharset, int32_t aSource)
+nsScanner::nsScanner(nsString& aFilename, bool aCreateStream)
  : mFilename(aFilename)
 {
  MOZ_COUNT_CTOR(nsScanner);
@ -115,7 +109,8 @@ nsScanner::nsScanner(nsString& aFilename,bool aCreateStream,
  mCharsetSource = kCharsetUninitialized;
  mHasInvalidCharacter = false;
  mReplacementCharacter = PRUnichar(0x0);
-  SetDocumentCharset(aCharset, aSource);
+  // XML defaults to UTF-8 and about:blank is UTF-8, too.
+  SetDocumentCharset(NS_LITERAL_CSTRING("UTF-8"), kCharsetFromDocTypeDefault);
 }

 nsresult nsScanner::SetDocumentCharset(const nsACString& aCharset , int32_t aSource)
@ -130,6 +125,7 @@ nsresult nsScanner::SetDocumentCharset(const nsACString& aCharset , int32_t aSou
    res = nsCharsetAlias::Equals(aCharset, mCharset, &same);
    if(NS_SUCCEEDED(res) && same)
    {
+      mCharsetSource = aSource;
      return NS_OK; // no difference, don't change it
    }
  }
@ -137,16 +133,9 @@ nsresult nsScanner::SetDocumentCharset(const nsACString& aCharset , int32_t aSou
  // different, need to change it
  nsCString charsetName;
  res = nsCharsetAlias::GetPreferred(aCharset, charsetName);
+  MOZ_ASSERT(NS_SUCCEEDED(res), "Should never call with a bogus aCharset.");

-  if(NS_FAILED(res) && (mCharsetSource == kCharsetUninitialized))
-  {
-     // failed - unknown alias , fallback to ISO-8859-1
-    mCharset.AssignLiteral("ISO-8859-1");
-  }
-  else
-  {
-    mCharset.Assign(charsetName);
-  }
+  mCharset.Assign(charsetName);

  mCharsetSource = aSource;

--- a/parser/htmlparser/src/nsScanner.h
+++ b/parser/htmlparser/src/nsScanner.h
@ -42,29 +42,15 @@ class nsScanner {
  public:

      /**
-       *  Use this constructor if you want i/o to be based on 
-       *  a single string you hand in during construction.
-       *  This short cut was added for Javascript.
-       *
-       *  @update  ftang 3/02/99
-       *  @param   aCharset charset
-       *  @param   aCharsetSource - where the charset info came from 
-       *  @param   aMode represents the parser mode (nav, other)
-       *  @return  
+       *  Use this constructor for the XML fragment parsing case
       */
-      nsScanner(const nsAString& anHTMLString, const nsACString& aCharset, int32_t aSource);
+      nsScanner(const nsAString& anHTMLString);

      /**
       *  Use this constructor if you want i/o to be based on 
       *  a file (therefore a stream) or just data you provide via Append().
-       *
-       *  @update  ftang 3/02/99
-       *  @param   aCharset charset
-       *  @param   aCharsetSource - where the charset info came from 
-       *  @param   aMode represents the parser mode (nav, other)
-       *  @return  
       */
-      nsScanner(nsString& aFilename,bool aCreateStream, const nsACString& aCharset, int32_t aSource);
+      nsScanner(nsString& aFilename, bool aCreateStream);

      ~nsScanner();

--- a/parser/htmlparser/tests/mochitest/Makefile.in
+++ b/parser/htmlparser/tests/mochitest/Makefile.in
@ -75,6 +75,15 @@ MOCHITEST_FILES =	parser_datreader.js \
 		test_viewsource.html \
 		test_bug715112.html \
 		test_bug715739.html \
+		test_bug716579.html \
+		file_bug716579-8.html \
+		file_bug716579-8.html^headers^ \
+		file_bug716579-16.html \
+		file_bug716579-16.html^headers^ \
+		file_bug716579-8.xhtml \
+		file_bug716579-8.xhtml^headers^ \
+		file_bug716579-16.xhtml \
+		file_bug716579-16.xhtml^headers^ \
 		test_bug717180.html \
 		file_bug717180.html \
 		$(NULL)
--- a/parser/htmlparser/tests/mochitest/file_bug716579-16.html
+++ b/parser/htmlparser/tests/mochitest/file_bug716579-16.html
--- a/parser/htmlparser/tests/mochitest/file_bug716579-16.html^headers^
+++ b/parser/htmlparser/tests/mochitest/file_bug716579-16.html^headers^
@ -0,0 +1 @@
+Content-Type: text/html; charset=windows-874
--- a/parser/htmlparser/tests/mochitest/file_bug716579-16.xhtml
+++ b/parser/htmlparser/tests/mochitest/file_bug716579-16.xhtml
--- a/parser/htmlparser/tests/mochitest/file_bug716579-16.xhtml^headers^
+++ b/parser/htmlparser/tests/mochitest/file_bug716579-16.xhtml^headers^
@ -0,0 +1 @@
+Content-Type: application/xhtml+xml; charset=windows-874
--- a/parser/htmlparser/tests/mochitest/file_bug716579-8.html
+++ b/parser/htmlparser/tests/mochitest/file_bug716579-8.html
@ -0,0 +1,3 @@
+<script>
+parent.html8 = "€";
+</script>
--- a/parser/htmlparser/tests/mochitest/file_bug716579-8.html^headers^
+++ b/parser/htmlparser/tests/mochitest/file_bug716579-8.html^headers^
@ -0,0 +1 @@
+Content-Type: text/html; charset=windows-874
--- a/parser/htmlparser/tests/mochitest/file_bug716579-8.xhtml
+++ b/parser/htmlparser/tests/mochitest/file_bug716579-8.xhtml
@ -0,0 +1,7 @@
+<html xmlns="http://www.w3.org/1999/xhtml">
+<body>
+<script>
+parent.xml8 = "€";
+</script>
+</body>
+</html>
--- a/parser/htmlparser/tests/mochitest/file_bug716579-8.xhtml^headers^
+++ b/parser/htmlparser/tests/mochitest/file_bug716579-8.xhtml^headers^
@ -0,0 +1 @@
+Content-Type: application/xhtml+xml; charset=windows-874
--- a/parser/htmlparser/tests/mochitest/test_bug716579.html
+++ b/parser/htmlparser/tests/mochitest/test_bug716579.html
@ -0,0 +1,44 @@
+<!DOCTYPE HTML>
+<html>
+<!--
+https://bugzilla.mozilla.org/show_bug.cgi?id=716579
+-->
+<head>
+  <meta charset="windows-1251">
+  <title>Test for Bug 716579</title>
+  <script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
+  <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
+</head>
+<body>
+<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=716579">Mozilla Bug 716579</a>
+<p id="display"></p>
+<pre id="test">
+<script type="application/javascript">
+
+/** Test for Bug 716579 **/
+
+var html8 = "FAIL";
+var html16 = "FAIL";
+var xml8 = "FAIL";
+var xml16 = "FAIL"; 
+
+SimpleTest.waitForExplicitFinish();
+
+window.onload = function() {
+  is(html8, "\u20AC", "HTML UTF-8 failed.");
+  is(html16, "\u20AC", "HTML UTF-16 failed.");
+  is(xml8, "\u20AC", "XML UTF-8 failed.");
+  is(xml16, "\u20AC", "XML UTF-16 failed.");
+  SimpleTest.finish();
+};
+
+</script>
+</pre>
+<div id="content" style="display: none">
+<iframe src="file_bug716579-8.html"></iframe>  
+<iframe src="file_bug716579-16.html"></iframe>  
+<iframe src="file_bug716579-8.xhtml"></iframe>  
+<iframe src="file_bug716579-16.xhtml"></iframe>  
+</div>
+</body>
+</html>
--- a/parser/nsCharsetSource.h
+++ b/parser/nsCharsetSource.h
@ -17,9 +17,9 @@
 #define kCharsetFromMetaPrescan         8 // this one and smaller: HTML5 Tentative
 #define kCharsetFromMetaTag             9 // this one and greater: HTML5 Confident
 #define kCharsetFromIrreversibleAutoDetection 10
-#define kCharsetFromByteOrderMark      11
-#define kCharsetFromChannel            12
-#define kCharsetFromOtherComponent     13
+#define kCharsetFromChannel            11
+#define kCharsetFromOtherComponent     12
+#define kCharsetFromByteOrderMark      13
 // Levels below here will be forced onto childframes too
 #define kCharsetFromParentForced       14
 #define kCharsetFromUserForced         15
				`@ -0,0 +1 @@`
				`Content-Type: text/html; charset=windows-874`
				`@ -0,0 +1 @@`
				`Content-Type: application/xhtml+xml; charset=windows-874`