Fix for bug 81253. We now sniff upto the first 2k of the first buffer of any HTML stream, looking for a META tag with charset information. If charset information is found, we use it for unicode conversion. This deals with the bulk of cases where we used to do a reload based on charset information in the document. In the worst case (if charset information exists but isn't found during sniffing) we fall back to the reload case. This fix improves initial page load performance for pages with a charset. Degradation in performance for pages loaded out of the cache is still being investigated. r=harishd, sr=waterson

2001-06-29 22:56:58 +00:00 · 2001-06-29 22:56:58 +00:00 · 9a358b6df4
--- a/content/base/src/nsPlainTextSerializer.h
+++ b/content/base/src/nsPlainTextSerializer.h
@ -83,6 +83,7 @@ public:
  NS_IMETHOD AddProcessingInstruction(const nsIParserNode& aNode) { return NS_OK; }
  NS_IMETHOD AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode=0) { return NS_OK; }
  NS_IMETHOD FlushPendingNotifications() { return NS_OK; }
+  NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset) { return NS_OK; }

  // nsIHTMLContentSink
  NS_IMETHOD SetTitle(const nsString& aValue) { return NS_OK; }
--- a/content/html/document/src/nsHTMLContentSink.cpp
+++ b/content/html/document/src/nsHTMLContentSink.cpp
@ -212,6 +212,7 @@ public:
  NS_IMETHOD AddLeaf(const nsIParserNode& aNode);
  NS_IMETHOD NotifyError(const nsParserError* aError);
  NS_IMETHOD FlushPendingNotifications();
+  NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset);
  NS_IMETHOD AddComment(const nsIParserNode& aNode);
  NS_IMETHOD AddProcessingInstruction(const nsIParserNode& aNode);
  NS_IMETHOD AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode=0);
@ -5034,7 +5035,6 @@ HTMLContentSink::ProcessSTYLETag(const nsIParserNode& aNode)
      title.CompressWhitespace();

      element->GetAttribute(kNameSpaceID_HTML, nsHTMLAtoms::type, type);
-
      element->GetAttribute(kNameSpaceID_HTML, nsHTMLAtoms::media, media);
      media.ToLowerCase(); // HTML4.0 spec is inconsistent, make it case INSENSITIVE

@ -5119,6 +5119,16 @@ HTMLContentSink::FlushPendingNotifications()
  return result;
 }

+NS_IMETHODIMP 
+HTMLContentSink::SetDocumentCharset(nsAWritableString& aCharset)
+{
+  if (mDocument) {
+    return mDocument->SetDocumentCharacterSet(aCharset);
+  }
+  
+  return NS_OK;
+}
+
 NS_IMETHODIMP
 HTMLContentSink::DoFragment(PRBool aFlag) 
 {
--- a/content/html/document/src/nsHTMLDocument.cpp
+++ b/content/html/document/src/nsHTMLDocument.cpp
@ -702,6 +702,7 @@ nsHTMLDocument::StartDocumentLoad(const char* aCommand,
            }
          }
    }
+
    nsresult rv_detect = NS_OK;
    if(! gInitDetector)
    {
@ -732,6 +733,22 @@ nsHTMLDocument::StartDocumentLoad(const char* aCommand,
    nsXPIDLCString urlSpec;
    aURL->GetSpec(getter_Copies(urlSpec));

+    if (cacheDescriptor && urlSpec)
+    {
+      if (kCharsetFromCache > charsetSource) 
+      {
+        nsXPIDLCString cachedCharset;
+        rv = cacheDescriptor->GetMetaDataElement("charset",
+                                                 getter_Copies(cachedCharset));
+        if (NS_SUCCEEDED(rv) && PL_strlen(cachedCharset) > 0)
+        {
+          charset.AssignWithConversion(cachedCharset);
+          charsetSource = kCharsetFromCache;
+        }
+      }    
+      rv = NS_OK;
+    }
+
    if (scheme && nsCRT::strcasecmp("about", scheme) && (kCharsetFromBookmarks > charsetSource))
    {
      nsCOMPtr<nsIRDFDataSource>  datasource;
@ -757,22 +774,6 @@ nsHTMLDocument::StartDocumentLoad(const char* aCommand,
       }
    } 

-    if (cacheDescriptor && urlSpec)
-    {
-      if (kCharsetFromCache > charsetSource) 
-      {
-        nsXPIDLCString cachedCharset;
-        rv = cacheDescriptor->GetMetaDataElement("charset",
-                                                 getter_Copies(cachedCharset));
-        if (NS_SUCCEEDED(rv) && PL_strlen(cachedCharset) > 0)
-        {
-          charset.AssignWithConversion(cachedCharset);
-          charsetSource = kCharsetFromCache;
-        }
-      }    
-      rv = NS_OK;
-    }
-
    if (kCharsetFromParentFrame > charsetSource) {
      if (dcInfo) {
        nsCOMPtr<nsIAtom> csAtom;
--- a/content/html/document/src/nsHTMLFragmentContentSink.cpp
+++ b/content/html/document/src/nsHTMLFragmentContentSink.cpp
@ -96,6 +96,7 @@ public:
  NS_IMETHOD OpenMap(const nsIParserNode& aNode);
  NS_IMETHOD CloseMap(const nsIParserNode& aNode);
  NS_IMETHOD FlushPendingNotifications() { return NS_OK; }
+  NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset) { return NS_OK; }
  NS_IMETHOD WillProcessTokens(void) { return NS_OK; }
  NS_IMETHOD DidProcessTokens(void) { return NS_OK; }
  NS_IMETHOD WillProcessAToken(void) { return NS_OK; }
--- a/content/xml/document/src/nsXMLContentSink.cpp
+++ b/content/xml/document/src/nsXMLContentSink.cpp
@ -1375,6 +1375,16 @@ nsXMLContentSink::AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode)
  return doc->AppendChild(docType, getter_AddRefs(tmpNode));
 }

+NS_IMETHODIMP 
+nsXMLContentSink::SetDocumentCharset(nsAWritableString& aCharset)
+{
+  if (mDocument) {
+    return mDocument->SetDocumentCharacterSet(aCharset);
+  }
+  
+  return NS_OK;
+}
+
 nsresult
 nsXMLContentSink::FlushText(PRBool aCreateTextNode, PRBool* aDidFlush)
 {
--- a/content/xml/document/src/nsXMLContentSink.h
+++ b/content/xml/document/src/nsXMLContentSink.h
@ -96,6 +96,7 @@ public:
  NS_IMETHOD NotifyError(const nsParserError* aError);
  NS_IMETHOD AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode=0);
  NS_IMETHOD FlushPendingNotifications() { return NS_OK; }
+  NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset);

  // nsIXMLContentSink
  NS_IMETHOD AddXMLDecl(const nsIParserNode& aNode);  
--- a/content/xul/document/src/nsXULContentSink.cpp
+++ b/content/xul/document/src/nsXULContentSink.cpp
@ -130,6 +130,7 @@ public:
    NS_IMETHOD NotifyError(const nsParserError* aError);
    NS_IMETHOD AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode=0);
    NS_IMETHOD FlushPendingNotifications() { return NS_OK; }
+    NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset);

    // nsIXMLContentSink
    NS_IMETHOD AddXMLDecl(const nsIParserNode& aNode);    
@ -954,6 +955,17 @@ XULContentSinkImpl::AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode)
    return NS_OK;
 }

+NS_IMETHODIMP 
+XULContentSinkImpl::SetDocumentCharset(nsAWritableString& aCharset)
+{
+    nsCOMPtr<nsIDocument> doc = do_QueryReferent(mDocument);
+    if (doc) {
+        return doc->SetDocumentCharacterSet(aCharset);
+    }
+  
+    return NS_OK;
+}
+

 NS_IMETHODIMP 
 XULContentSinkImpl::AddCharacterData(const nsIParserNode& aNode)
--- a/htmlparser/robot/nsRobotSink.cpp
+++ b/htmlparser/robot/nsRobotSink.cpp
@ -86,6 +86,7 @@ public:
  NS_IMETHOD WillResume(void) { return NS_OK; }
  NS_IMETHOD SetParser(nsIParser* aParser) { return NS_OK; }
  NS_IMETHOD FlushPendingNotifications() { return NS_OK; }
+  NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset) { return NS_OK; }
  NS_IMETHOD WillProcessTokens(void) { return NS_OK; }
  NS_IMETHOD DidProcessTokens(void) { return NS_OK; }
  NS_IMETHOD WillProcessAToken(void) { return NS_OK; }
--- a/htmlparser/src/nsHTMLNullSink.cpp
+++ b/htmlparser/src/nsHTMLNullSink.cpp
@ -55,6 +55,7 @@ public:
  NS_IMETHOD AddComment(const nsIParserNode& aNode);
  NS_IMETHOD AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode=0);
  NS_IMETHOD FlushPendingNotifications() {return NS_OK; }
+  NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset) {return NS_OK;}

  // nsIHTMLContentSink
  NS_IMETHOD SetTitle(const nsString& aValue);
--- a/htmlparser/src/nsIContentSink.h
+++ b/htmlparser/src/nsIContentSink.h
@ -168,6 +168,12 @@ public:
   * is in sync with the state of the sink.
   */
  NS_IMETHOD FlushPendingNotifications()=0;
+
+  /**
+   * Set the document character set. This should be passed on to the
+   * document itself.
+   */
+  NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset)=0;
 };

 #endif /* nsIContentSink_h___ */
--- a/htmlparser/src/nsIParser.h
+++ b/htmlparser/src/nsIParser.h
@ -87,12 +87,12 @@ typedef enum {
   kCharsetFromWeakDocTypeDefault,
   kCharsetFromUserDefault ,
   kCharsetFromDocTypeDefault,
-   kCharsetFromCache,
   kCharsetFromParentFrame,
   kCharsetFromBookmarks,
   kCharsetFromAutoDetection,
   kCharsetFromMetaTag,
   kCharsetFromByteOrderMark,
+   kCharsetFromCache,
   kCharsetFromHTTPHeader,
   kCharsetFromUserForced,
   kCharsetFromOtherComponent,
--- a/htmlparser/src/nsLoggingSink.h
+++ b/htmlparser/src/nsLoggingSink.h
@ -57,6 +57,7 @@ public:
  NS_IMETHOD AddProcessingInstruction(const nsIParserNode& aNode);
  NS_IMETHOD AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode=0);
  NS_IMETHOD FlushPendingNotifications() { return NS_OK; }
+  NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset) { return NS_OK; }

  // nsIHTMLContentSink
  NS_IMETHOD SetTitle(const nsString& aValue);
--- a/htmlparser/src/nsParser.cpp
+++ b/htmlparser/src/nsParser.cpp
@ -39,12 +39,16 @@
 #include "nsViewSourceHTML.h" 
 #include "nsIStringStream.h"
 #include "nsIChannel.h"
+#include "nsICachingChannel.h"
+#include "nsICacheEntryDescriptor.h"
+#include "nsICharsetAlias.h"
 #include "nsIProgressEventSink.h"
 #include "nsIInputStream.h"
 #include "CNavDTD.h"
 #include "COtherDTD.h"
 #include "prenv.h" 
 #include "nsParserCIID.h"
+#include "nsReadableUtils.h"
 #include "nsCOMPtr.h"
 #include "nsIEventQueue.h"
 #include "nsIEventQueueService.h"
@ -543,6 +547,13 @@ void nsParser::SetDocumentCharset(nsString& aCharset, nsCharsetSource aCharsetSo
     mParserContext->mScanner->SetDocumentCharset(aCharset, aCharsetSource);
 }

+void nsParser::SetSinkCharset(nsAWritableString& aCharset)
+{
+  if (mSink) {
+    mSink->SetDocumentCharset(aCharset);
+  }
+}
+
 /**
 *  This method gets called in order to set the content
 *  sink for this parser to dump nodes to.
@ -613,14 +624,14 @@ nsDTDMode nsParser::GetParseMode(void){
 }


-
+template <class CharT>
 class CWordTokenizer {
 public:
-  CWordTokenizer(nsString& aString,PRInt32 aStartOffset,PRInt32 aMaxOffset) {
+  CWordTokenizer(const CharT* aBuffer,PRInt32 aStartOffset,PRInt32 aMaxOffset) {
    mLength=0;
    mOffset=aStartOffset;
    mMaxOffset=aMaxOffset;
-    mBuffer=aString.GetUnicode();
+    mBuffer=aBuffer;
    mEndBuffer=mBuffer+mMaxOffset;
  }

@ -633,25 +644,33 @@ public:
  // Returns offset of nth word, or -1 (if out of words).
  //********************************************************************************
  
-  PRInt32 GetNextWord() {
+  PRInt32 GetNextWord(PRBool aSkipQuotes=PR_FALSE) {

-    const PRUnichar *cp=mBuffer+mOffset+mLength;  //skip last word
+    const CharT *cp=mBuffer+mOffset+mLength;  //skip last word

    mLength=0;  //reset this
    mOffset=-1; //reset this        

    //now skip whitespace...

-    PRUnichar target=0;
+    CharT target=0;
    PRBool    done=PR_FALSE;

    while((!done) && (cp++<mEndBuffer)) {
      switch(*cp) {
        case kSpace:  case kNewLine:
        case kCR:     case kTab:
+        case kEqual:
          continue;

        case kQuote:
+          target=*cp;
+          if (aSkipQuotes) {
+            cp++;
+          }
+          done=PR_TRUE;
+          break;
+
        case kMinus:
          target=*cp;
          done=PR_TRUE;
@ -665,7 +684,7 @@ public:

    if(cp<mEndBuffer) {  

-      const PRUnichar *firstcp=cp; //hang onto this...      
+      const CharT *firstcp=cp; //hang onto this...      
      PRInt32 theDashCount=2;

      cp++; //just skip first letter to simplify processing...
@ -693,7 +712,8 @@ public:
             (kGreaterThan==*cp) ||
             (kQuote==*cp) ||
             (kCR==*cp) ||
-             (kTab==*cp)) {
+             (kTab==*cp) || 
+             (kEqual == *cp)) {
            break;
          }
        }
@ -707,11 +727,15 @@ public:
    return mOffset;
  }

+  PRInt32 GetLength() const {
+    return mLength;
+  }
+
  PRInt32     mOffset;
  PRInt32     mMaxOffset;
  PRInt32     mLength;
-  const PRUnichar*  mBuffer;
-  const PRUnichar*  mEndBuffer;
+  const CharT*  mBuffer;
+  const CharT*  mEndBuffer;
 };


@ -848,7 +872,7 @@ void DetermineParseMode(nsString& aBuffer,nsDTDMode& aParseMode,eParserDocType&
  if((kNotFound!=theGTPos) && (kNotFound!=theLTPos)) {  

    const PRUnichar*  theBuffer=aBuffer.GetUnicode();
-    CWordTokenizer theTokenizer(aBuffer,theLTPos,theGTPos);
+    CWordTokenizer<PRUnichar> theTokenizer(theBuffer,theLTPos,theGTPos);
    theOffset=theTokenizer.GetNextWord();  //try to find ?xml, !doctype, etc...

    if((kNotFound!=theOffset) && (kNotFound!=theDocTypePos)) {
@ -2297,7 +2321,7 @@ nsresult nsParser::OnStartRequest(nsIRequest *request, nsISupports* aContext) {
 #define UCS4_3412 "X-ISO-10646-UCS-4-3412"
 #define UTF8 "UTF-8"

-static PRBool detectByteOrderMark(const unsigned char* aBytes, PRInt32 aLen, nsString& oCharset, nsCharsetSource& oCharsetSource) {
+static PRBool DetectByteOrderMark(const unsigned char* aBytes, PRInt32 aLen, nsString& oCharset, nsCharsetSource& oCharsetSource) {
 oCharsetSource= kCharsetFromAutoDetection;
 oCharset.SetLength(0);
 // see http://www.w3.org/TR/1998/REC-xml-19980210#sec-oCharseting
@ -2407,11 +2431,121 @@ static PRBool detectByteOrderMark(const unsigned char* aBytes, PRInt32 aLen, nsS
 return oCharset.Length() > 0;
 }

+static const char kHTTPEquivStr[] = "http-equiv";
+static const PRInt32 kHTTPEquivStrLen = sizeof(kHTTPEquivStr)-1;
+static const char kContentTypeStr[] = "Content-Type";
+static const PRInt32 kContentTypeStrLen = sizeof(kContentTypeStr)-1;
+static const char kContentStr[] = "content";
+static const PRInt32 kContentStrLen = sizeof(kContentStr)-1;
+static const char kCharsetStr[] = "charset";
+static const PRInt32 kCharsetStrLen = sizeof(kCharsetStr)-1;
+
+PRBool 
+nsParser::DetectMetaTag(const char* aBytes, 
+                        PRInt32 aLen, 
+                        nsString& aCharset, 
+                        nsCharsetSource& aCharsetSource) 
+{
+  PRBool foundContentType = PR_FALSE;
+  aCharsetSource= kCharsetFromMetaTag;
+  aCharset.SetLength(0);
+
+  // XXX Only look inside HTML documents for now. For XML
+  // documents we should be looking inside the XMLDecl.
+  if (!mParserContext->mMimeType.Equals(NS_ConvertASCIItoUCS2(kHTMLTextContentType))) {
+    return PR_FALSE;
+  }
+
+  // Fast and loose parsing to determine if we have a complete
+  // META tag in this block, looking upto 2k into it.
+  nsDependentCString str(aBytes, PR_MIN(aLen, 2048));
+  nsReadingIterator<char> begin, end;
+  
+  str.BeginReading(begin);
+  str.EndReading(end);
+  nsReadingIterator<char> tagStart(begin);
+  nsReadingIterator<char> tagEnd(end);
+  
+  do {
+    // Find the string META and make sure it's not right at the beginning
+    if (CaseInsensitiveFindInReadable(NS_LITERAL_CSTRING("META"), tagStart, tagEnd) &&
+        (tagStart != begin)) {
+      // Back up one to confirm that this is a tag
+      if (*--tagStart == '<') {
+        const char* attrStart = tagEnd.get();
+        const char* attrEnd;
+        
+        // Find the end of the tag
+        FindInReadable(NS_LITERAL_CSTRING(">"), tagEnd, end);
+        attrEnd = tagEnd.get();
+        
+        CWordTokenizer<char> tokenizer(attrStart, 0, attrEnd-attrStart);
+        PRInt32 offset;
+        
+        // Start looking at the attributes
+        while ((offset = tokenizer.GetNextWord()) != kNotFound) {
+          // We need to have a HTTP-EQUIV attribute whose value is 
+          // "Content-Type"
+          if ((tokenizer.GetLength() >= kHTTPEquivStrLen) &&
+              (nsCRT::strncasecmp(attrStart+offset, 
+                                 kHTTPEquivStr, kHTTPEquivStrLen) == 0)) {
+            if (((offset = tokenizer.GetNextWord(PR_TRUE)) != kNotFound) &&
+                (tokenizer.GetLength() >= kContentTypeStrLen) &&
+                (nsCRT::strncasecmp(attrStart+offset, 
+                                    kContentTypeStr, kContentTypeStrLen) == 0)) {
+              foundContentType = PR_TRUE;
+            }
+            else {
+              break;
+            }
+          }
+          // And a CONTENT attribute
+          else if ((tokenizer.GetLength() >= kContentStrLen) &&
+                   (nsCRT::strncasecmp(attrStart+offset, 
+                                      kContentStr, kContentStrLen) == 0)) {
+            // The next word is the value which itself needs to be parsed
+            if ((offset = tokenizer.GetNextWord(PR_TRUE)) != kNotFound) {
+              const char* contentStart = attrStart+offset;
+              CWordTokenizer<char> contentTokenizer(contentStart, 0, 
+                                                    tokenizer.GetLength());
+
+              // Read the content type
+              if (contentTokenizer.GetNextWord() != kNotFound) {
+                // Now see if we have a charset
+                if (((offset = contentTokenizer.GetNextWord()) != kNotFound) &&
+                    (contentTokenizer.GetLength() >= kCharsetStrLen) &&
+                    (nsCRT::strncasecmp(contentStart+offset,
+                                        kCharsetStr, kCharsetStrLen) == 0)) {
+                  // The next word is the charset
+                  if ((offset = contentTokenizer.GetNextWord()) != kNotFound) {
+                    aCharset.Assign(NS_ConvertASCIItoUCS2(contentStart+offset, 
+                                                          contentTokenizer.GetLength()));
+                  }
+                }
+              }
+            }
+          }
+        }
+
+        if (foundContentType && (aCharset.Length() > 0)) {
+          return PR_TRUE;
+        }
+      }
+    }  
+   
+    tagStart = tagEnd;
+    tagEnd = end;
+  } while (tagStart != end);
+  
+  return PR_FALSE;
+}
+
 typedef struct {
-  PRBool mNeedCheckFirst4Bytes;
+  PRBool mNeedCharsetCheck;
  nsParser* mParser;
  nsIParserFilter* mParserFilter;
  nsScanner* mScanner;
+  nsIRequest* mRequest;
 } ParserWriteStruct;

 /*
@ -2437,19 +2571,40 @@ ParserWriteFunc(nsIInputStream* in,
    return NS_ERROR_FAILURE;
  }

-  if(pws->mNeedCheckFirst4Bytes && (count >= 4)) { 
+  if(pws->mNeedCharsetCheck) { 
    nsCharsetSource guessSource; 
-    nsAutoString guess; 
+    nsAutoString guess, preferred; 
  
-    pws->mNeedCheckFirst4Bytes = PR_FALSE; 
-    if(detectByteOrderMark((const unsigned char*)buf, 
-                           theNumRead, guess, guessSource)) 
-      { 
+    pws->mNeedCharsetCheck = PR_FALSE; 
+    if(pws->mParser->DetectMetaTag(buf, theNumRead,
+                                   guess, guessSource) ||
+       ((count >= 4) && 
+        DetectByteOrderMark((const unsigned char*)buf, 
+                            theNumRead, guess, guessSource))) { 
 #ifdef DEBUG_XMLENCODING 
-        printf("xmlencoding detect- %s\n", guess.ToNewCString()); 
+      printf("xmlencoding detect- %s\n", guess.ToNewCString()); 
 #endif 
-        pws->mParser->SetDocumentCharset(guess, guessSource); 
-      } 
+      nsCOMPtr<nsICharsetAlias> alias(do_GetService(NS_CHARSETALIAS_CONTRACTID));
+      result = alias->GetPreferred(guess, preferred);
+      if (NS_SUCCEEDED(result)) {
+        guess.Assign(preferred);
+      }
+      pws->mParser->SetDocumentCharset(guess, guessSource); 
+      pws->mParser->SetSinkCharset(guess);
+      nsCOMPtr<nsICachingChannel> channel(do_QueryInterface(pws->mRequest));
+      if (channel) {
+        nsCOMPtr<nsISupports> cacheToken;
+        channel->GetCacheToken(getter_AddRefs(cacheToken));
+        if (cacheToken) {
+          nsCOMPtr<nsICacheEntryDescriptor> cacheDescriptor(do_QueryInterface(cacheToken));
+          if (cacheDescriptor) {
+            nsresult rv = cacheDescriptor->SetMetaDataElement("charset",
+                                                              NS_ConvertUCS2toUTF8(guess).get());
+            NS_ASSERTION(NS_SUCCEEDED(rv),"cannot SetMetaDataElement");
+          }
+        }
+      }
+    } 
  } 

  if(pws->mParserFilter) 
@ -2503,11 +2658,12 @@ NS_PRECONDITION(((eOnStart==mParserContext->mStreamListenerState)||(eOnDataAvail

    PRUint32 totalRead;
    ParserWriteStruct pws;
-    pws.mNeedCheckFirst4Bytes = 
-              ((0 == sourceOffset) && (mCharsetSource<kCharsetFromAutoDetection)); 
+    pws.mNeedCharsetCheck = 
+              ((0 == sourceOffset) && (mCharsetSource<kCharsetFromMetaTag)); 
    pws.mParser = this;
    pws.mParserFilter = mParserFilter;
    pws.mScanner = theContext->mScanner;
+    pws.mRequest = request;

    result = pIStream->ReadSegments(ParserWriteFunc, (void*)&pws, aLength, &totalRead);
    if (NS_FAILED(result)) {
--- a/htmlparser/src/nsParser.h
+++ b/htmlparser/src/nsParser.h
@ -328,6 +328,17 @@ class nsParser : public nsIParser,
                                   const nsString* aMimeType=nsnull, 
                                   nsDTDMode aDTDMode=eDTDMode_unknown);

+    /** 
+     * Detects the existence of a META tag with charset information in 
+     * the given buffer.
+     */
+    PRBool DetectMetaTag(const char* aBytes, 
+                         PRInt32 aLen, 
+                         nsString& oCharset, 
+                         nsCharsetSource& oCharsetSource);
+
+    void SetSinkCharset(nsAWritableString& aCharset);
+
    /**
     *  Removes continue parsing events
     *  @update  kmcclusk 5/18/98
--- a/parser/htmlparser/robot/nsRobotSink.cpp
+++ b/parser/htmlparser/robot/nsRobotSink.cpp
@ -86,6 +86,7 @@ public:
  NS_IMETHOD WillResume(void) { return NS_OK; }
  NS_IMETHOD SetParser(nsIParser* aParser) { return NS_OK; }
  NS_IMETHOD FlushPendingNotifications() { return NS_OK; }
+  NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset) { return NS_OK; }
  NS_IMETHOD WillProcessTokens(void) { return NS_OK; }
  NS_IMETHOD DidProcessTokens(void) { return NS_OK; }
  NS_IMETHOD WillProcessAToken(void) { return NS_OK; }
--- a/parser/htmlparser/src/nsHTMLNullSink.cpp
+++ b/parser/htmlparser/src/nsHTMLNullSink.cpp
@ -55,6 +55,7 @@ public:
  NS_IMETHOD AddComment(const nsIParserNode& aNode);
  NS_IMETHOD AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode=0);
  NS_IMETHOD FlushPendingNotifications() {return NS_OK; }
+  NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset) {return NS_OK;}

  // nsIHTMLContentSink
  NS_IMETHOD SetTitle(const nsString& aValue);
--- a/parser/htmlparser/src/nsIContentSink.h
+++ b/parser/htmlparser/src/nsIContentSink.h
@ -168,6 +168,12 @@ public:
   * is in sync with the state of the sink.
   */
  NS_IMETHOD FlushPendingNotifications()=0;
+
+  /**
+   * Set the document character set. This should be passed on to the
+   * document itself.
+   */
+  NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset)=0;
 };

 #endif /* nsIContentSink_h___ */
--- a/parser/htmlparser/src/nsIParser.h
+++ b/parser/htmlparser/src/nsIParser.h
@ -87,12 +87,12 @@ typedef enum {
   kCharsetFromWeakDocTypeDefault,
   kCharsetFromUserDefault ,
   kCharsetFromDocTypeDefault,
-   kCharsetFromCache,
   kCharsetFromParentFrame,
   kCharsetFromBookmarks,
   kCharsetFromAutoDetection,
   kCharsetFromMetaTag,
   kCharsetFromByteOrderMark,
+   kCharsetFromCache,
   kCharsetFromHTTPHeader,
   kCharsetFromUserForced,
   kCharsetFromOtherComponent,
--- a/parser/htmlparser/src/nsLoggingSink.h
+++ b/parser/htmlparser/src/nsLoggingSink.h
@ -57,6 +57,7 @@ public:
  NS_IMETHOD AddProcessingInstruction(const nsIParserNode& aNode);
  NS_IMETHOD AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode=0);
  NS_IMETHOD FlushPendingNotifications() { return NS_OK; }
+  NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset) { return NS_OK; }

  // nsIHTMLContentSink
  NS_IMETHOD SetTitle(const nsString& aValue);
--- a/parser/htmlparser/src/nsParser.cpp
+++ b/parser/htmlparser/src/nsParser.cpp
@ -39,12 +39,16 @@
 #include "nsViewSourceHTML.h" 
 #include "nsIStringStream.h"
 #include "nsIChannel.h"
+#include "nsICachingChannel.h"
+#include "nsICacheEntryDescriptor.h"
+#include "nsICharsetAlias.h"
 #include "nsIProgressEventSink.h"
 #include "nsIInputStream.h"
 #include "CNavDTD.h"
 #include "COtherDTD.h"
 #include "prenv.h" 
 #include "nsParserCIID.h"
+#include "nsReadableUtils.h"
 #include "nsCOMPtr.h"
 #include "nsIEventQueue.h"
 #include "nsIEventQueueService.h"
@ -543,6 +547,13 @@ void nsParser::SetDocumentCharset(nsString& aCharset, nsCharsetSource aCharsetSo
     mParserContext->mScanner->SetDocumentCharset(aCharset, aCharsetSource);
 }

+void nsParser::SetSinkCharset(nsAWritableString& aCharset)
+{
+  if (mSink) {
+    mSink->SetDocumentCharset(aCharset);
+  }
+}
+
 /**
 *  This method gets called in order to set the content
 *  sink for this parser to dump nodes to.
@ -613,14 +624,14 @@ nsDTDMode nsParser::GetParseMode(void){
 }


-
+template <class CharT>
 class CWordTokenizer {
 public:
-  CWordTokenizer(nsString& aString,PRInt32 aStartOffset,PRInt32 aMaxOffset) {
+  CWordTokenizer(const CharT* aBuffer,PRInt32 aStartOffset,PRInt32 aMaxOffset) {
    mLength=0;
    mOffset=aStartOffset;
    mMaxOffset=aMaxOffset;
-    mBuffer=aString.GetUnicode();
+    mBuffer=aBuffer;
    mEndBuffer=mBuffer+mMaxOffset;
  }

@ -633,25 +644,33 @@ public:
  // Returns offset of nth word, or -1 (if out of words).
  //********************************************************************************
  
-  PRInt32 GetNextWord() {
+  PRInt32 GetNextWord(PRBool aSkipQuotes=PR_FALSE) {

-    const PRUnichar *cp=mBuffer+mOffset+mLength;  //skip last word
+    const CharT *cp=mBuffer+mOffset+mLength;  //skip last word

    mLength=0;  //reset this
    mOffset=-1; //reset this        

    //now skip whitespace...

-    PRUnichar target=0;
+    CharT target=0;
    PRBool    done=PR_FALSE;

    while((!done) && (cp++<mEndBuffer)) {
      switch(*cp) {
        case kSpace:  case kNewLine:
        case kCR:     case kTab:
+        case kEqual:
          continue;

        case kQuote:
+          target=*cp;
+          if (aSkipQuotes) {
+            cp++;
+          }
+          done=PR_TRUE;
+          break;
+
        case kMinus:
          target=*cp;
          done=PR_TRUE;
@ -665,7 +684,7 @@ public:

    if(cp<mEndBuffer) {  

-      const PRUnichar *firstcp=cp; //hang onto this...      
+      const CharT *firstcp=cp; //hang onto this...      
      PRInt32 theDashCount=2;

      cp++; //just skip first letter to simplify processing...
@ -693,7 +712,8 @@ public:
             (kGreaterThan==*cp) ||
             (kQuote==*cp) ||
             (kCR==*cp) ||
-             (kTab==*cp)) {
+             (kTab==*cp) || 
+             (kEqual == *cp)) {
            break;
          }
        }
@ -707,11 +727,15 @@ public:
    return mOffset;
  }

+  PRInt32 GetLength() const {
+    return mLength;
+  }
+
  PRInt32     mOffset;
  PRInt32     mMaxOffset;
  PRInt32     mLength;
-  const PRUnichar*  mBuffer;
-  const PRUnichar*  mEndBuffer;
+  const CharT*  mBuffer;
+  const CharT*  mEndBuffer;
 };


@ -848,7 +872,7 @@ void DetermineParseMode(nsString& aBuffer,nsDTDMode& aParseMode,eParserDocType&
  if((kNotFound!=theGTPos) && (kNotFound!=theLTPos)) {  

    const PRUnichar*  theBuffer=aBuffer.GetUnicode();
-    CWordTokenizer theTokenizer(aBuffer,theLTPos,theGTPos);
+    CWordTokenizer<PRUnichar> theTokenizer(theBuffer,theLTPos,theGTPos);
    theOffset=theTokenizer.GetNextWord();  //try to find ?xml, !doctype, etc...

    if((kNotFound!=theOffset) && (kNotFound!=theDocTypePos)) {
@ -2297,7 +2321,7 @@ nsresult nsParser::OnStartRequest(nsIRequest *request, nsISupports* aContext) {
 #define UCS4_3412 "X-ISO-10646-UCS-4-3412"
 #define UTF8 "UTF-8"

-static PRBool detectByteOrderMark(const unsigned char* aBytes, PRInt32 aLen, nsString& oCharset, nsCharsetSource& oCharsetSource) {
+static PRBool DetectByteOrderMark(const unsigned char* aBytes, PRInt32 aLen, nsString& oCharset, nsCharsetSource& oCharsetSource) {
 oCharsetSource= kCharsetFromAutoDetection;
 oCharset.SetLength(0);
 // see http://www.w3.org/TR/1998/REC-xml-19980210#sec-oCharseting
@ -2407,11 +2431,121 @@ static PRBool detectByteOrderMark(const unsigned char* aBytes, PRInt32 aLen, nsS
 return oCharset.Length() > 0;
 }

+static const char kHTTPEquivStr[] = "http-equiv";
+static const PRInt32 kHTTPEquivStrLen = sizeof(kHTTPEquivStr)-1;
+static const char kContentTypeStr[] = "Content-Type";
+static const PRInt32 kContentTypeStrLen = sizeof(kContentTypeStr)-1;
+static const char kContentStr[] = "content";
+static const PRInt32 kContentStrLen = sizeof(kContentStr)-1;
+static const char kCharsetStr[] = "charset";
+static const PRInt32 kCharsetStrLen = sizeof(kCharsetStr)-1;
+
+PRBool 
+nsParser::DetectMetaTag(const char* aBytes, 
+                        PRInt32 aLen, 
+                        nsString& aCharset, 
+                        nsCharsetSource& aCharsetSource) 
+{
+  PRBool foundContentType = PR_FALSE;
+  aCharsetSource= kCharsetFromMetaTag;
+  aCharset.SetLength(0);
+
+  // XXX Only look inside HTML documents for now. For XML
+  // documents we should be looking inside the XMLDecl.
+  if (!mParserContext->mMimeType.Equals(NS_ConvertASCIItoUCS2(kHTMLTextContentType))) {
+    return PR_FALSE;
+  }
+
+  // Fast and loose parsing to determine if we have a complete
+  // META tag in this block, looking upto 2k into it.
+  nsDependentCString str(aBytes, PR_MIN(aLen, 2048));
+  nsReadingIterator<char> begin, end;
+  
+  str.BeginReading(begin);
+  str.EndReading(end);
+  nsReadingIterator<char> tagStart(begin);
+  nsReadingIterator<char> tagEnd(end);
+  
+  do {
+    // Find the string META and make sure it's not right at the beginning
+    if (CaseInsensitiveFindInReadable(NS_LITERAL_CSTRING("META"), tagStart, tagEnd) &&
+        (tagStart != begin)) {
+      // Back up one to confirm that this is a tag
+      if (*--tagStart == '<') {
+        const char* attrStart = tagEnd.get();
+        const char* attrEnd;
+        
+        // Find the end of the tag
+        FindInReadable(NS_LITERAL_CSTRING(">"), tagEnd, end);
+        attrEnd = tagEnd.get();
+        
+        CWordTokenizer<char> tokenizer(attrStart, 0, attrEnd-attrStart);
+        PRInt32 offset;
+        
+        // Start looking at the attributes
+        while ((offset = tokenizer.GetNextWord()) != kNotFound) {
+          // We need to have a HTTP-EQUIV attribute whose value is 
+          // "Content-Type"
+          if ((tokenizer.GetLength() >= kHTTPEquivStrLen) &&
+              (nsCRT::strncasecmp(attrStart+offset, 
+                                 kHTTPEquivStr, kHTTPEquivStrLen) == 0)) {
+            if (((offset = tokenizer.GetNextWord(PR_TRUE)) != kNotFound) &&
+                (tokenizer.GetLength() >= kContentTypeStrLen) &&
+                (nsCRT::strncasecmp(attrStart+offset, 
+                                    kContentTypeStr, kContentTypeStrLen) == 0)) {
+              foundContentType = PR_TRUE;
+            }
+            else {
+              break;
+            }
+          }
+          // And a CONTENT attribute
+          else if ((tokenizer.GetLength() >= kContentStrLen) &&
+                   (nsCRT::strncasecmp(attrStart+offset, 
+                                      kContentStr, kContentStrLen) == 0)) {
+            // The next word is the value which itself needs to be parsed
+            if ((offset = tokenizer.GetNextWord(PR_TRUE)) != kNotFound) {
+              const char* contentStart = attrStart+offset;
+              CWordTokenizer<char> contentTokenizer(contentStart, 0, 
+                                                    tokenizer.GetLength());
+
+              // Read the content type
+              if (contentTokenizer.GetNextWord() != kNotFound) {
+                // Now see if we have a charset
+                if (((offset = contentTokenizer.GetNextWord()) != kNotFound) &&
+                    (contentTokenizer.GetLength() >= kCharsetStrLen) &&
+                    (nsCRT::strncasecmp(contentStart+offset,
+                                        kCharsetStr, kCharsetStrLen) == 0)) {
+                  // The next word is the charset
+                  if ((offset = contentTokenizer.GetNextWord()) != kNotFound) {
+                    aCharset.Assign(NS_ConvertASCIItoUCS2(contentStart+offset, 
+                                                          contentTokenizer.GetLength()));
+                  }
+                }
+              }
+            }
+          }
+        }
+
+        if (foundContentType && (aCharset.Length() > 0)) {
+          return PR_TRUE;
+        }
+      }
+    }  
+   
+    tagStart = tagEnd;
+    tagEnd = end;
+  } while (tagStart != end);
+  
+  return PR_FALSE;
+}
+
 typedef struct {
-  PRBool mNeedCheckFirst4Bytes;
+  PRBool mNeedCharsetCheck;
  nsParser* mParser;
  nsIParserFilter* mParserFilter;
  nsScanner* mScanner;
+  nsIRequest* mRequest;
 } ParserWriteStruct;

 /*
@ -2437,19 +2571,40 @@ ParserWriteFunc(nsIInputStream* in,
    return NS_ERROR_FAILURE;
  }

-  if(pws->mNeedCheckFirst4Bytes && (count >= 4)) { 
+  if(pws->mNeedCharsetCheck) { 
    nsCharsetSource guessSource; 
-    nsAutoString guess; 
+    nsAutoString guess, preferred; 
  
-    pws->mNeedCheckFirst4Bytes = PR_FALSE; 
-    if(detectByteOrderMark((const unsigned char*)buf, 
-                           theNumRead, guess, guessSource)) 
-      { 
+    pws->mNeedCharsetCheck = PR_FALSE; 
+    if(pws->mParser->DetectMetaTag(buf, theNumRead,
+                                   guess, guessSource) ||
+       ((count >= 4) && 
+        DetectByteOrderMark((const unsigned char*)buf, 
+                            theNumRead, guess, guessSource))) { 
 #ifdef DEBUG_XMLENCODING 
-        printf("xmlencoding detect- %s\n", guess.ToNewCString()); 
+      printf("xmlencoding detect- %s\n", guess.ToNewCString()); 
 #endif 
-        pws->mParser->SetDocumentCharset(guess, guessSource); 
-      } 
+      nsCOMPtr<nsICharsetAlias> alias(do_GetService(NS_CHARSETALIAS_CONTRACTID));
+      result = alias->GetPreferred(guess, preferred);
+      if (NS_SUCCEEDED(result)) {
+        guess.Assign(preferred);
+      }
+      pws->mParser->SetDocumentCharset(guess, guessSource); 
+      pws->mParser->SetSinkCharset(guess);
+      nsCOMPtr<nsICachingChannel> channel(do_QueryInterface(pws->mRequest));
+      if (channel) {
+        nsCOMPtr<nsISupports> cacheToken;
+        channel->GetCacheToken(getter_AddRefs(cacheToken));
+        if (cacheToken) {
+          nsCOMPtr<nsICacheEntryDescriptor> cacheDescriptor(do_QueryInterface(cacheToken));
+          if (cacheDescriptor) {
+            nsresult rv = cacheDescriptor->SetMetaDataElement("charset",
+                                                              NS_ConvertUCS2toUTF8(guess).get());
+            NS_ASSERTION(NS_SUCCEEDED(rv),"cannot SetMetaDataElement");
+          }
+        }
+      }
+    } 
  } 

  if(pws->mParserFilter) 
@ -2503,11 +2658,12 @@ NS_PRECONDITION(((eOnStart==mParserContext->mStreamListenerState)||(eOnDataAvail

    PRUint32 totalRead;
    ParserWriteStruct pws;
-    pws.mNeedCheckFirst4Bytes = 
-              ((0 == sourceOffset) && (mCharsetSource<kCharsetFromAutoDetection)); 
+    pws.mNeedCharsetCheck = 
+              ((0 == sourceOffset) && (mCharsetSource<kCharsetFromMetaTag)); 
    pws.mParser = this;
    pws.mParserFilter = mParserFilter;
    pws.mScanner = theContext->mScanner;
+    pws.mRequest = request;

    result = pIStream->ReadSegments(ParserWriteFunc, (void*)&pws, aLength, &totalRead);
    if (NS_FAILED(result)) {
--- a/parser/htmlparser/src/nsParser.h
+++ b/parser/htmlparser/src/nsParser.h
@ -328,6 +328,17 @@ class nsParser : public nsIParser,
                                   const nsString* aMimeType=nsnull, 
                                   nsDTDMode aDTDMode=eDTDMode_unknown);

+    /** 
+     * Detects the existence of a META tag with charset information in 
+     * the given buffer.
+     */
+    PRBool DetectMetaTag(const char* aBytes, 
+                         PRInt32 aLen, 
+                         nsString& oCharset, 
+                         nsCharsetSource& oCharsetSource);
+
+    void SetSinkCharset(nsAWritableString& aCharset);
+
    /**
     *  Removes continue parsing events
     *  @update  kmcclusk 5/18/98
--- a/rdf/base/src/nsRDFContentSink.cpp
+++ b/rdf/base/src/nsRDFContentSink.cpp
@ -168,6 +168,7 @@ public:
    NS_IMETHOD NotifyError(const nsParserError* aError);
    NS_IMETHOD AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode=0);
    NS_IMETHOD FlushPendingNotifications() { return NS_OK; }
+    NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset) { return NS_OK; }

    // nsIXMLContentSink
    NS_IMETHOD AddXMLDecl(const nsIParserNode& aNode);