зеркало из https://github.com/mozilla/pjs.git
Fix for bug 81253. We now sniff up to the first 2k of the first buffer of any HTML stream, looking for a META tag with charset information. If charset information is found, we use it for unicode conversion. This deals with the bulk of cases where we used to do a reload based on charset information in the document. In the worst case (if charset information exists but isn't found during sniffing) we fall back to the reload case. This fix improves initial page load performance for pages with a charset. Degradation in performance for pages loaded out of the cache is still being investigated. r=harishd, sr=waterson
This commit is contained in:
Родитель
2c8c49c99c
Коммит
59a48b8d8a
|
@ -83,6 +83,7 @@ public:
|
|||
NS_IMETHOD AddProcessingInstruction(const nsIParserNode& aNode) { return NS_OK; }
|
||||
NS_IMETHOD AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode=0) { return NS_OK; }
|
||||
NS_IMETHOD FlushPendingNotifications() { return NS_OK; }
|
||||
NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset) { return NS_OK; }
|
||||
|
||||
// nsIHTMLContentSink
|
||||
NS_IMETHOD SetTitle(const nsString& aValue) { return NS_OK; }
|
||||
|
|
|
@ -212,6 +212,7 @@ public:
|
|||
NS_IMETHOD AddLeaf(const nsIParserNode& aNode);
|
||||
NS_IMETHOD NotifyError(const nsParserError* aError);
|
||||
NS_IMETHOD FlushPendingNotifications();
|
||||
NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset);
|
||||
NS_IMETHOD AddComment(const nsIParserNode& aNode);
|
||||
NS_IMETHOD AddProcessingInstruction(const nsIParserNode& aNode);
|
||||
NS_IMETHOD AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode=0);
|
||||
|
@ -5034,7 +5035,6 @@ HTMLContentSink::ProcessSTYLETag(const nsIParserNode& aNode)
|
|||
title.CompressWhitespace();
|
||||
|
||||
element->GetAttribute(kNameSpaceID_HTML, nsHTMLAtoms::type, type);
|
||||
|
||||
element->GetAttribute(kNameSpaceID_HTML, nsHTMLAtoms::media, media);
|
||||
media.ToLowerCase(); // HTML4.0 spec is inconsistent, make it case INSENSITIVE
|
||||
|
||||
|
@ -5119,6 +5119,16 @@ HTMLContentSink::FlushPendingNotifications()
|
|||
return result;
|
||||
}
|
||||
|
||||
/**
 * Forwards the charset to the document this sink is building.
 *
 * @param aCharset  charset name passed on to SetDocumentCharacterSet()
 * @return the document's result, or NS_OK when the sink has no document
 */
NS_IMETHODIMP
HTMLContentSink::SetDocumentCharset(nsAWritableString& aCharset)
{
  // Without a document there is nothing to update; this is not an error.
  if (!mDocument) {
    return NS_OK;
  }

  return mDocument->SetDocumentCharacterSet(aCharset);
}
|
||||
|
||||
NS_IMETHODIMP
|
||||
HTMLContentSink::DoFragment(PRBool aFlag)
|
||||
{
|
||||
|
|
|
@ -702,6 +702,7 @@ nsHTMLDocument::StartDocumentLoad(const char* aCommand,
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
nsresult rv_detect = NS_OK;
|
||||
if(! gInitDetector)
|
||||
{
|
||||
|
@ -732,6 +733,22 @@ nsHTMLDocument::StartDocumentLoad(const char* aCommand,
|
|||
nsXPIDLCString urlSpec;
|
||||
aURL->GetSpec(getter_Copies(urlSpec));
|
||||
|
||||
if (cacheDescriptor && urlSpec)
|
||||
{
|
||||
if (kCharsetFromCache > charsetSource)
|
||||
{
|
||||
nsXPIDLCString cachedCharset;
|
||||
rv = cacheDescriptor->GetMetaDataElement("charset",
|
||||
getter_Copies(cachedCharset));
|
||||
if (NS_SUCCEEDED(rv) && PL_strlen(cachedCharset) > 0)
|
||||
{
|
||||
charset.AssignWithConversion(cachedCharset);
|
||||
charsetSource = kCharsetFromCache;
|
||||
}
|
||||
}
|
||||
rv = NS_OK;
|
||||
}
|
||||
|
||||
if (scheme && nsCRT::strcasecmp("about", scheme) && (kCharsetFromBookmarks > charsetSource))
|
||||
{
|
||||
nsCOMPtr<nsIRDFDataSource> datasource;
|
||||
|
@ -757,22 +774,6 @@ nsHTMLDocument::StartDocumentLoad(const char* aCommand,
|
|||
}
|
||||
}
|
||||
|
||||
if (cacheDescriptor && urlSpec)
|
||||
{
|
||||
if (kCharsetFromCache > charsetSource)
|
||||
{
|
||||
nsXPIDLCString cachedCharset;
|
||||
rv = cacheDescriptor->GetMetaDataElement("charset",
|
||||
getter_Copies(cachedCharset));
|
||||
if (NS_SUCCEEDED(rv) && PL_strlen(cachedCharset) > 0)
|
||||
{
|
||||
charset.AssignWithConversion(cachedCharset);
|
||||
charsetSource = kCharsetFromCache;
|
||||
}
|
||||
}
|
||||
rv = NS_OK;
|
||||
}
|
||||
|
||||
if (kCharsetFromParentFrame > charsetSource) {
|
||||
if (dcInfo) {
|
||||
nsCOMPtr<nsIAtom> csAtom;
|
||||
|
|
|
@ -96,6 +96,7 @@ public:
|
|||
NS_IMETHOD OpenMap(const nsIParserNode& aNode);
|
||||
NS_IMETHOD CloseMap(const nsIParserNode& aNode);
|
||||
NS_IMETHOD FlushPendingNotifications() { return NS_OK; }
|
||||
NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset) { return NS_OK; }
|
||||
NS_IMETHOD WillProcessTokens(void) { return NS_OK; }
|
||||
NS_IMETHOD DidProcessTokens(void) { return NS_OK; }
|
||||
NS_IMETHOD WillProcessAToken(void) { return NS_OK; }
|
||||
|
|
|
@ -1375,6 +1375,16 @@ nsXMLContentSink::AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode)
|
|||
return doc->AppendChild(docType, getter_AddRefs(tmpNode));
|
||||
}
|
||||
|
||||
/**
 * Forwards the charset to the document this sink is building.
 *
 * @param aCharset  charset name passed on to SetDocumentCharacterSet()
 * @return the document's result, or NS_OK when the sink has no document
 */
NS_IMETHODIMP
nsXMLContentSink::SetDocumentCharset(nsAWritableString& aCharset)
{
  // Without a document there is nothing to update; this is not an error.
  if (!mDocument) {
    return NS_OK;
  }

  return mDocument->SetDocumentCharacterSet(aCharset);
}
|
||||
|
||||
nsresult
|
||||
nsXMLContentSink::FlushText(PRBool aCreateTextNode, PRBool* aDidFlush)
|
||||
{
|
||||
|
|
|
@ -96,6 +96,7 @@ public:
|
|||
NS_IMETHOD NotifyError(const nsParserError* aError);
|
||||
NS_IMETHOD AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode=0);
|
||||
NS_IMETHOD FlushPendingNotifications() { return NS_OK; }
|
||||
NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset);
|
||||
|
||||
// nsIXMLContentSink
|
||||
NS_IMETHOD AddXMLDecl(const nsIParserNode& aNode);
|
||||
|
|
|
@ -130,6 +130,7 @@ public:
|
|||
NS_IMETHOD NotifyError(const nsParserError* aError);
|
||||
NS_IMETHOD AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode=0);
|
||||
NS_IMETHOD FlushPendingNotifications() { return NS_OK; }
|
||||
NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset);
|
||||
|
||||
// nsIXMLContentSink
|
||||
NS_IMETHOD AddXMLDecl(const nsIParserNode& aNode);
|
||||
|
@ -954,6 +955,17 @@ XULContentSinkImpl::AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode)
|
|||
return NS_OK;
|
||||
}
|
||||
|
||||
/**
 * Forwards the charset to the document referenced by mDocument.
 *
 * @param aCharset  charset name passed on to SetDocumentCharacterSet()
 * @return the document's result, or NS_OK when the referent could not
 *         be obtained
 */
NS_IMETHODIMP
XULContentSinkImpl::SetDocumentCharset(nsAWritableString& aCharset)
{
  // mDocument is resolved through do_QueryReferent(); the result may be null.
  nsCOMPtr<nsIDocument> doc = do_QueryReferent(mDocument);
  if (!doc) {
    return NS_OK;
  }

  return doc->SetDocumentCharacterSet(aCharset);
}
|
||||
|
||||
|
||||
NS_IMETHODIMP
|
||||
XULContentSinkImpl::AddCharacterData(const nsIParserNode& aNode)
|
||||
|
|
|
@ -86,6 +86,7 @@ public:
|
|||
NS_IMETHOD WillResume(void) { return NS_OK; }
|
||||
NS_IMETHOD SetParser(nsIParser* aParser) { return NS_OK; }
|
||||
NS_IMETHOD FlushPendingNotifications() { return NS_OK; }
|
||||
NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset) { return NS_OK; }
|
||||
NS_IMETHOD WillProcessTokens(void) { return NS_OK; }
|
||||
NS_IMETHOD DidProcessTokens(void) { return NS_OK; }
|
||||
NS_IMETHOD WillProcessAToken(void) { return NS_OK; }
|
||||
|
|
|
@ -55,6 +55,7 @@ public:
|
|||
NS_IMETHOD AddComment(const nsIParserNode& aNode);
|
||||
NS_IMETHOD AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode=0);
|
||||
NS_IMETHOD FlushPendingNotifications() {return NS_OK; }
|
||||
NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset) {return NS_OK;}
|
||||
|
||||
// nsIHTMLContentSink
|
||||
NS_IMETHOD SetTitle(const nsString& aValue);
|
||||
|
|
|
@ -1,173 +0,0 @@
|
|||
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-
|
||||
*
|
||||
* The contents of this file are subject to the Netscape Public
|
||||
* License Version 1.1 (the "License"); you may not use this file
|
||||
* except in compliance with the License. You may obtain a copy of
|
||||
* the License at http://www.mozilla.org/NPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS
|
||||
* IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
|
||||
* implied. See the License for the specific language governing
|
||||
* rights and limitations under the License.
|
||||
*
|
||||
* The Original Code is Mozilla Communicator client code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is Netscape Communications
|
||||
* Corporation. Portions created by Netscape are
|
||||
* Copyright (C) 1998 Netscape Communications Corporation. All
|
||||
* Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
*/
|
||||
#ifndef nsIContentSink_h___
|
||||
#define nsIContentSink_h___
|
||||
|
||||
/**
|
||||
* MODULE NOTES:
|
||||
* @update gess 4/1/98
|
||||
*
|
||||
* This file declares the concrete IContentSink interface.
|
||||
* This pure virtual interface is used as the "glue" that connects the parsing
|
||||
* process to the content model construction process.
|
||||
*
|
||||
* The icontentsink interface is a very lightweight wrapper that represents the
|
||||
* content-sink model building process. There is another one that you may care
|
||||
* about more, which is the IHTMLContentSink interface. (See that file for details).
|
||||
*/
|
||||
|
||||
#include "nsIParserNode.h"
|
||||
#include "nsISupports.h"
|
||||
#include "nsParserError.h"
|
||||
|
||||
class nsIParser;
|
||||
|
||||
#define NS_ICONTENT_SINK_IID \
|
||||
{ 0xa6cf9052, 0x15b3, 0x11d2,{0x93, 0x2e, 0x00, 0x80, 0x5f, 0x8a, 0xdd, 0x32}}
|
||||
|
||||
// The base value for the content ID counter.
|
||||
// Values greater than or equal to this base value are used
|
||||
// by each of the content sinks to assign unique values
|
||||
// to the content objects created by them.
|
||||
#define NS_CONTENT_ID_COUNTER_BASE 10000
|
||||
|
||||
class nsIContentSink : public nsISupports {
|
||||
public:
|
||||
|
||||
NS_DEFINE_STATIC_IID_ACCESSOR(NS_ICONTENT_SINK_IID)
|
||||
|
||||
/**
|
||||
* This method gets called when the parser begins the process
|
||||
* of building the content model via the content sink.
|
||||
*
|
||||
* @update 5/7/98 gess
|
||||
*/
|
||||
NS_IMETHOD WillBuildModel(void)=0;
|
||||
|
||||
/**
|
||||
* This method gets called when the parser concludes the process
|
||||
* of building the content model via the content sink.
|
||||
*
|
||||
* @param aQualityLevel describes how well formed the doc was.
|
||||
* 0=GOOD; 1=FAIR; 2=POOR;
|
||||
* @update 5/7/98 gess
|
||||
*/
|
||||
NS_IMETHOD DidBuildModel(PRInt32 aQualityLevel)=0;
|
||||
|
||||
/**
|
||||
* This method gets called when the parser gets i/o blocked,
|
||||
* and wants to notify the sink that it may be a while before
|
||||
* more data is available.
|
||||
*
|
||||
* @update 5/7/98 gess
|
||||
*/
|
||||
NS_IMETHOD WillInterrupt(void)=0;
|
||||
|
||||
/**
|
||||
* This method gets called when the parser i/o gets unblocked,
|
||||
* and we're about to start dumping content again to the sink.
|
||||
*
|
||||
* @update 5/7/98 gess
|
||||
*/
|
||||
NS_IMETHOD WillResume(void)=0;
|
||||
|
||||
/**
|
||||
* This method gets called by the parser so that the content
|
||||
* sink can retain a reference to the parser. The expectation
|
||||
* is that the content sink will drop the reference when it
|
||||
* gets the DidBuildModel notification i.e. when parsing is done.
|
||||
*/
|
||||
NS_IMETHOD SetParser(nsIParser* aParser)=0;
|
||||
|
||||
/**
|
||||
* This method is used to open a generic container in the sink.
|
||||
*
|
||||
* @update 4/1/98 gess
|
||||
* @param nsIParserNode reference to parser node interface
|
||||
*/
|
||||
NS_IMETHOD OpenContainer(const nsIParserNode& aNode) = 0;
|
||||
|
||||
/**
|
||||
* This method gets called by the parser when a close
|
||||
* container tag has been consumed and needs to be closed.
|
||||
*
|
||||
* @update 4/1/98 gess
|
||||
* @param nsIParserNode reference to parser node interface
|
||||
*/
|
||||
NS_IMETHOD CloseContainer(const nsIParserNode& aNode) = 0;
|
||||
|
||||
/**
|
||||
* This gets called by the parser when you want to add
|
||||
* a leaf node to the current container in the content
|
||||
* model.
|
||||
*
|
||||
* @update 4/1/98 gess
|
||||
* @param nsIParserNode reference to parser node interface
|
||||
*/
|
||||
NS_IMETHOD AddLeaf(const nsIParserNode& aNode) = 0;
|
||||
|
||||
/**
|
||||
* This gets called by the parser when you want to add
|
||||
* a leaf node to the current container in the content
|
||||
* model.
|
||||
*
|
||||
* @update 4/1/98 gess
|
||||
* @param nsIParserNode reference to parser node interface
|
||||
*/
|
||||
NS_IMETHOD AddComment(const nsIParserNode& aNode) = 0;
|
||||
|
||||
/**
|
||||
* This gets called by the parser when you want to add
|
||||
* a leaf node to the current container in the content
|
||||
* model.
|
||||
*
|
||||
* @update 4/1/98 gess
|
||||
* @param nsIParserNode reference to parser node interface
|
||||
*/
|
||||
NS_IMETHOD AddProcessingInstruction(const nsIParserNode& aNode) = 0;
|
||||
|
||||
/**
|
||||
* This method is called by the parser when it encounters
|
||||
* a document type declaration.
|
||||
*
|
||||
* XXX Should the parser also parse the internal subset?
|
||||
*
|
||||
* @param nsIParserNode reference to parser node interface
|
||||
*/
|
||||
NS_IMETHOD AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode)=0;
|
||||
|
||||
/**
|
||||
* This gets called by the parser if it hits an unrecoverable
|
||||
* error (in XML, if the document is not well-formed or valid).
|
||||
*
|
||||
* @param aErrorResult the error code
|
||||
*/
|
||||
NS_IMETHOD NotifyError(const nsParserError* aError)=0;
|
||||
|
||||
/**
|
||||
* Flush all pending notifications so that the content model
|
||||
* is in sync with the state of the sink.
|
||||
*/
|
||||
NS_IMETHOD FlushPendingNotifications()=0;
|
||||
};
|
||||
|
||||
#endif /* nsIContentSink_h___ */
|
|
@ -87,12 +87,12 @@ typedef enum {
|
|||
kCharsetFromWeakDocTypeDefault,
|
||||
kCharsetFromUserDefault ,
|
||||
kCharsetFromDocTypeDefault,
|
||||
kCharsetFromCache,
|
||||
kCharsetFromParentFrame,
|
||||
kCharsetFromBookmarks,
|
||||
kCharsetFromAutoDetection,
|
||||
kCharsetFromMetaTag,
|
||||
kCharsetFromByteOrderMark,
|
||||
kCharsetFromCache,
|
||||
kCharsetFromHTTPHeader,
|
||||
kCharsetFromUserForced,
|
||||
kCharsetFromOtherComponent,
|
||||
|
|
|
@ -57,6 +57,7 @@ public:
|
|||
NS_IMETHOD AddProcessingInstruction(const nsIParserNode& aNode);
|
||||
NS_IMETHOD AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode=0);
|
||||
NS_IMETHOD FlushPendingNotifications() { return NS_OK; }
|
||||
NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset) { return NS_OK; }
|
||||
|
||||
// nsIHTMLContentSink
|
||||
NS_IMETHOD SetTitle(const nsString& aValue);
|
||||
|
|
|
@ -39,12 +39,16 @@
|
|||
#include "nsViewSourceHTML.h"
|
||||
#include "nsIStringStream.h"
|
||||
#include "nsIChannel.h"
|
||||
#include "nsICachingChannel.h"
|
||||
#include "nsICacheEntryDescriptor.h"
|
||||
#include "nsICharsetAlias.h"
|
||||
#include "nsIProgressEventSink.h"
|
||||
#include "nsIInputStream.h"
|
||||
#include "CNavDTD.h"
|
||||
#include "COtherDTD.h"
|
||||
#include "prenv.h"
|
||||
#include "nsParserCIID.h"
|
||||
#include "nsReadableUtils.h"
|
||||
#include "nsCOMPtr.h"
|
||||
#include "nsIEventQueue.h"
|
||||
#include "nsIEventQueueService.h"
|
||||
|
@ -543,6 +547,13 @@ void nsParser::SetDocumentCharset(nsString& aCharset, nsCharsetSource aCharsetSo
|
|||
mParserContext->mScanner->SetDocumentCharset(aCharset, aCharsetSource);
|
||||
}
|
||||
|
||||
/**
 * Hands the given charset to the content sink, if one is attached.
 *
 * @param aCharset  charset name passed to the sink's SetDocumentCharset()
 */
void nsParser::SetSinkCharset(nsAWritableString& aCharset)
{
  // No sink attached: nothing to notify.
  if (!mSink) {
    return;
  }

  mSink->SetDocumentCharset(aCharset);
}
|
||||
|
||||
/**
|
||||
* This method gets called in order to set the content
|
||||
* sink for this parser to dump nodes to.
|
||||
|
@ -613,14 +624,14 @@ nsDTDMode nsParser::GetParseMode(void){
|
|||
}
|
||||
|
||||
|
||||
|
||||
template <class CharT>
|
||||
class CWordTokenizer {
|
||||
public:
|
||||
CWordTokenizer(nsString& aString,PRInt32 aStartOffset,PRInt32 aMaxOffset) {
|
||||
CWordTokenizer(const CharT* aBuffer,PRInt32 aStartOffset,PRInt32 aMaxOffset) {
|
||||
mLength=0;
|
||||
mOffset=aStartOffset;
|
||||
mMaxOffset=aMaxOffset;
|
||||
mBuffer=aString.GetUnicode();
|
||||
mBuffer=aBuffer;
|
||||
mEndBuffer=mBuffer+mMaxOffset;
|
||||
}
|
||||
|
||||
|
@ -633,25 +644,33 @@ public:
|
|||
// Returns offset of nth word, or -1 (if out of words).
|
||||
//********************************************************************************
|
||||
|
||||
PRInt32 GetNextWord() {
|
||||
PRInt32 GetNextWord(PRBool aSkipQuotes=PR_FALSE) {
|
||||
|
||||
const PRUnichar *cp=mBuffer+mOffset+mLength; //skip last word
|
||||
const CharT *cp=mBuffer+mOffset+mLength; //skip last word
|
||||
|
||||
mLength=0; //reset this
|
||||
mOffset=-1; //reset this
|
||||
|
||||
//now skip whitespace...
|
||||
|
||||
PRUnichar target=0;
|
||||
CharT target=0;
|
||||
PRBool done=PR_FALSE;
|
||||
|
||||
while((!done) && (cp++<mEndBuffer)) {
|
||||
switch(*cp) {
|
||||
case kSpace: case kNewLine:
|
||||
case kCR: case kTab:
|
||||
case kEqual:
|
||||
continue;
|
||||
|
||||
case kQuote:
|
||||
target=*cp;
|
||||
if (aSkipQuotes) {
|
||||
cp++;
|
||||
}
|
||||
done=PR_TRUE;
|
||||
break;
|
||||
|
||||
case kMinus:
|
||||
target=*cp;
|
||||
done=PR_TRUE;
|
||||
|
@ -665,7 +684,7 @@ public:
|
|||
|
||||
if(cp<mEndBuffer) {
|
||||
|
||||
const PRUnichar *firstcp=cp; //hang onto this...
|
||||
const CharT *firstcp=cp; //hang onto this...
|
||||
PRInt32 theDashCount=2;
|
||||
|
||||
cp++; //just skip first letter to simplify processing...
|
||||
|
@ -693,7 +712,8 @@ public:
|
|||
(kGreaterThan==*cp) ||
|
||||
(kQuote==*cp) ||
|
||||
(kCR==*cp) ||
|
||||
(kTab==*cp)) {
|
||||
(kTab==*cp) ||
|
||||
(kEqual == *cp)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -707,11 +727,15 @@ public:
|
|||
return mOffset;
|
||||
}
|
||||
|
||||
PRInt32 GetLength() const {
|
||||
return mLength;
|
||||
}
|
||||
|
||||
PRInt32 mOffset;
|
||||
PRInt32 mMaxOffset;
|
||||
PRInt32 mLength;
|
||||
const PRUnichar* mBuffer;
|
||||
const PRUnichar* mEndBuffer;
|
||||
const CharT* mBuffer;
|
||||
const CharT* mEndBuffer;
|
||||
};
|
||||
|
||||
|
||||
|
@ -848,7 +872,7 @@ void DetermineParseMode(nsString& aBuffer,nsDTDMode& aParseMode,eParserDocType&
|
|||
if((kNotFound!=theGTPos) && (kNotFound!=theLTPos)) {
|
||||
|
||||
const PRUnichar* theBuffer=aBuffer.GetUnicode();
|
||||
CWordTokenizer theTokenizer(aBuffer,theLTPos,theGTPos);
|
||||
CWordTokenizer<PRUnichar> theTokenizer(theBuffer,theLTPos,theGTPos);
|
||||
theOffset=theTokenizer.GetNextWord(); //try to find ?xml, !doctype, etc...
|
||||
|
||||
if((kNotFound!=theOffset) && (kNotFound!=theDocTypePos)) {
|
||||
|
@ -2297,7 +2321,7 @@ nsresult nsParser::OnStartRequest(nsIRequest *request, nsISupports* aContext) {
|
|||
#define UCS4_3412 "X-ISO-10646-UCS-4-3412"
|
||||
#define UTF8 "UTF-8"
|
||||
|
||||
static PRBool detectByteOrderMark(const unsigned char* aBytes, PRInt32 aLen, nsString& oCharset, nsCharsetSource& oCharsetSource) {
|
||||
static PRBool DetectByteOrderMark(const unsigned char* aBytes, PRInt32 aLen, nsString& oCharset, nsCharsetSource& oCharsetSource) {
|
||||
oCharsetSource= kCharsetFromAutoDetection;
|
||||
oCharset.SetLength(0);
|
||||
// see http://www.w3.org/TR/1998/REC-xml-19980210#sec-oCharseting
|
||||
|
@ -2407,11 +2431,121 @@ static PRBool detectByteOrderMark(const unsigned char* aBytes, PRInt32 aLen, nsS
|
|||
return oCharset.Length() > 0;
|
||||
}
|
||||
|
||||
static const char kHTTPEquivStr[] = "http-equiv";
|
||||
static const PRInt32 kHTTPEquivStrLen = sizeof(kHTTPEquivStr)-1;
|
||||
static const char kContentTypeStr[] = "Content-Type";
|
||||
static const PRInt32 kContentTypeStrLen = sizeof(kContentTypeStr)-1;
|
||||
static const char kContentStr[] = "content";
|
||||
static const PRInt32 kContentStrLen = sizeof(kContentStr)-1;
|
||||
static const char kCharsetStr[] = "charset";
|
||||
static const PRInt32 kCharsetStrLen = sizeof(kCharsetStr)-1;
|
||||
|
||||
/**
 * Sniffs the start of an HTML stream for a tag of the form
 *   <META HTTP-EQUIV="Content-Type" CONTENT="...; charset=...">
 * and extracts the charset it names.
 *
 * Only the first 2k of the buffer is examined, and only when the parser
 * context's MIME type equals kHTMLTextContentType.
 *
 * @param aBytes          raw bytes from the stream
 * @param aLen            number of bytes available in aBytes
 * @param aCharset        out: charset named by the META tag; empty
 *                        whenever PR_FALSE is returned
 * @param aCharsetSource  out: always set to kCharsetFromMetaTag
 * @return PR_TRUE if a complete META tag carrying both
 *         http-equiv=Content-Type and a charset was found
 */
PRBool
nsParser::DetectMetaTag(const char* aBytes,
                        PRInt32 aLen,
                        nsString& aCharset,
                        nsCharsetSource& aCharsetSource)
{
  aCharsetSource= kCharsetFromMetaTag;
  aCharset.SetLength(0);

  // Nothing to sniff in a missing or empty buffer.
  if (!aBytes || (aLen <= 0)) {
    return PR_FALSE;
  }

  // XXX Only look inside HTML documents for now. For XML
  // documents we should be looking inside the XMLDecl.
  if (!mParserContext->mMimeType.Equals(NS_ConvertASCIItoUCS2(kHTMLTextContentType))) {
    return PR_FALSE;
  }

  // Fast and loose parsing to determine if we have a complete
  // META tag in this block, looking up to 2k into it.
  nsDependentCString str(aBytes, PR_MIN(aLen, 2048));
  nsReadingIterator<char> begin, end;

  str.BeginReading(begin);
  str.EndReading(end);
  nsReadingIterator<char> tagStart(begin);
  nsReadingIterator<char> tagEnd(end);

  while (tagStart != end) {
    // On success the find call narrows tagStart/tagEnd to the match.
    if (!CaseInsensitiveFindInReadable(NS_LITERAL_CSTRING("META"), tagStart, tagEnd)) {
      break;
    }

    // A match right at the beginning cannot be preceded by '<'.
    if (tagStart != begin) {
      // Back up one (on a copy) to confirm that this is a tag
      nsReadingIterator<char> tagOpen(tagStart);
      if (*--tagOpen == '<') {
        const char* attrStart = tagEnd.get();

        // Find the end of the tag. The find functions overwrite BOTH
        // iterators with the match range, so search with scratch copies;
        // passing |end| itself would shrink the scan window and stop us
        // from ever looking at a second META tag.
        nsReadingIterator<char> closeStart(tagEnd);
        nsReadingIterator<char> closeEnd(end);
        if (!FindInReadable(NS_LITERAL_CSTRING(">"), closeStart, closeEnd)) {
          // The tag is cut off by the end of the sniffed window. Don't
          // guess from a partial tag (the charset name could be
          // truncated); the caller falls back to its other detection.
          break;
        }
        tagEnd = closeStart;
        const char* attrEnd = tagEnd.get();

        // Results are per tag: attributes of different META tags must
        // not be combined.
        PRBool foundContentType = PR_FALSE;
        aCharset.SetLength(0);

        CWordTokenizer<char> tokenizer(attrStart, 0, attrEnd-attrStart);
        PRInt32 offset;

        // Start looking at the attributes
        while ((offset = tokenizer.GetNextWord()) != kNotFound) {
          // We need to have a HTTP-EQUIV attribute whose value is
          // "Content-Type"
          if ((tokenizer.GetLength() >= kHTTPEquivStrLen) &&
              (nsCRT::strncasecmp(attrStart+offset,
                                  kHTTPEquivStr, kHTTPEquivStrLen) == 0)) {
            if (((offset = tokenizer.GetNextWord(PR_TRUE)) != kNotFound) &&
                (tokenizer.GetLength() >= kContentTypeStrLen) &&
                (nsCRT::strncasecmp(attrStart+offset,
                                    kContentTypeStr, kContentTypeStrLen) == 0)) {
              foundContentType = PR_TRUE;
            }
            else {
              // Some other http-equiv: this tag is of no interest.
              break;
            }
          }
          // And a CONTENT attribute
          else if ((tokenizer.GetLength() >= kContentStrLen) &&
                   (nsCRT::strncasecmp(attrStart+offset,
                                       kContentStr, kContentStrLen) == 0)) {
            // The next word is the value which itself needs to be parsed
            if ((offset = tokenizer.GetNextWord(PR_TRUE)) != kNotFound) {
              const char* contentStart = attrStart+offset;
              CWordTokenizer<char> contentTokenizer(contentStart, 0,
                                                    tokenizer.GetLength());

              // Read the content type
              if (contentTokenizer.GetNextWord() != kNotFound) {
                // Now see if we have a charset
                if (((offset = contentTokenizer.GetNextWord()) != kNotFound) &&
                    (contentTokenizer.GetLength() >= kCharsetStrLen) &&
                    (nsCRT::strncasecmp(contentStart+offset,
                                        kCharsetStr, kCharsetStrLen) == 0)) {
                  // The next word is the charset
                  if ((offset = contentTokenizer.GetNextWord()) != kNotFound) {
                    aCharset.Assign(NS_ConvertASCIItoUCS2(contentStart+offset,
                                                          contentTokenizer.GetLength()));
                  }
                }
              }
            }
          }
        }

        if (foundContentType && (aCharset.Length() > 0)) {
          return PR_TRUE;
        }
      }
    }

    // Resume the search after this match (or after the tag's '>').
    tagStart = tagEnd;
    tagEnd = end;
  }

  // Don't hand back a charset picked up from a tag that didn't qualify.
  aCharset.SetLength(0);
  return PR_FALSE;
}
|
||||
|
||||
typedef struct {
|
||||
PRBool mNeedCheckFirst4Bytes;
|
||||
PRBool mNeedCharsetCheck;
|
||||
nsParser* mParser;
|
||||
nsIParserFilter* mParserFilter;
|
||||
nsScanner* mScanner;
|
||||
nsIRequest* mRequest;
|
||||
} ParserWriteStruct;
|
||||
|
||||
/*
|
||||
|
@ -2437,19 +2571,40 @@ ParserWriteFunc(nsIInputStream* in,
|
|||
return NS_ERROR_FAILURE;
|
||||
}
|
||||
|
||||
if(pws->mNeedCheckFirst4Bytes && (count >= 4)) {
|
||||
if(pws->mNeedCharsetCheck) {
|
||||
nsCharsetSource guessSource;
|
||||
nsAutoString guess;
|
||||
nsAutoString guess, preferred;
|
||||
|
||||
pws->mNeedCheckFirst4Bytes = PR_FALSE;
|
||||
if(detectByteOrderMark((const unsigned char*)buf,
|
||||
theNumRead, guess, guessSource))
|
||||
{
|
||||
pws->mNeedCharsetCheck = PR_FALSE;
|
||||
if(pws->mParser->DetectMetaTag(buf, theNumRead,
|
||||
guess, guessSource) ||
|
||||
((count >= 4) &&
|
||||
DetectByteOrderMark((const unsigned char*)buf,
|
||||
theNumRead, guess, guessSource))) {
|
||||
#ifdef DEBUG_XMLENCODING
|
||||
printf("xmlencoding detect- %s\n", guess.ToNewCString());
|
||||
printf("xmlencoding detect- %s\n", guess.ToNewCString());
|
||||
#endif
|
||||
pws->mParser->SetDocumentCharset(guess, guessSource);
|
||||
}
|
||||
nsCOMPtr<nsICharsetAlias> alias(do_GetService(NS_CHARSETALIAS_CONTRACTID));
|
||||
result = alias->GetPreferred(guess, preferred);
|
||||
if (NS_SUCCEEDED(result)) {
|
||||
guess.Assign(preferred);
|
||||
}
|
||||
pws->mParser->SetDocumentCharset(guess, guessSource);
|
||||
pws->mParser->SetSinkCharset(guess);
|
||||
nsCOMPtr<nsICachingChannel> channel(do_QueryInterface(pws->mRequest));
|
||||
if (channel) {
|
||||
nsCOMPtr<nsISupports> cacheToken;
|
||||
channel->GetCacheToken(getter_AddRefs(cacheToken));
|
||||
if (cacheToken) {
|
||||
nsCOMPtr<nsICacheEntryDescriptor> cacheDescriptor(do_QueryInterface(cacheToken));
|
||||
if (cacheDescriptor) {
|
||||
nsresult rv = cacheDescriptor->SetMetaDataElement("charset",
|
||||
NS_ConvertUCS2toUTF8(guess).get());
|
||||
NS_ASSERTION(NS_SUCCEEDED(rv),"cannot SetMetaDataElement");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(pws->mParserFilter)
|
||||
|
@ -2503,11 +2658,12 @@ NS_PRECONDITION(((eOnStart==mParserContext->mStreamListenerState)||(eOnDataAvail
|
|||
|
||||
PRUint32 totalRead;
|
||||
ParserWriteStruct pws;
|
||||
pws.mNeedCheckFirst4Bytes =
|
||||
((0 == sourceOffset) && (mCharsetSource<kCharsetFromAutoDetection));
|
||||
pws.mNeedCharsetCheck =
|
||||
((0 == sourceOffset) && (mCharsetSource<kCharsetFromMetaTag));
|
||||
pws.mParser = this;
|
||||
pws.mParserFilter = mParserFilter;
|
||||
pws.mScanner = theContext->mScanner;
|
||||
pws.mRequest = request;
|
||||
|
||||
result = pIStream->ReadSegments(ParserWriteFunc, (void*)&pws, aLength, &totalRead);
|
||||
if (NS_FAILED(result)) {
|
||||
|
|
|
@ -328,6 +328,17 @@ class nsParser : public nsIParser,
|
|||
const nsString* aMimeType=nsnull,
|
||||
nsDTDMode aDTDMode=eDTDMode_unknown);
|
||||
|
||||
/**
|
||||
* Detects the existence of a META tag with charset information in
|
||||
* the given buffer.
|
||||
*/
|
||||
PRBool DetectMetaTag(const char* aBytes,
|
||||
PRInt32 aLen,
|
||||
nsString& oCharset,
|
||||
nsCharsetSource& oCharsetSource);
|
||||
|
||||
void SetSinkCharset(nsAWritableString& aCharset);
|
||||
|
||||
/**
|
||||
* Removes continue parsing events
|
||||
* @update kmcclusk 5/18/98
|
||||
|
|
|
@ -86,6 +86,7 @@ public:
|
|||
NS_IMETHOD WillResume(void) { return NS_OK; }
|
||||
NS_IMETHOD SetParser(nsIParser* aParser) { return NS_OK; }
|
||||
NS_IMETHOD FlushPendingNotifications() { return NS_OK; }
|
||||
NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset) { return NS_OK; }
|
||||
NS_IMETHOD WillProcessTokens(void) { return NS_OK; }
|
||||
NS_IMETHOD DidProcessTokens(void) { return NS_OK; }
|
||||
NS_IMETHOD WillProcessAToken(void) { return NS_OK; }
|
||||
|
|
|
@ -55,6 +55,7 @@ public:
|
|||
NS_IMETHOD AddComment(const nsIParserNode& aNode);
|
||||
NS_IMETHOD AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode=0);
|
||||
NS_IMETHOD FlushPendingNotifications() {return NS_OK; }
|
||||
NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset) {return NS_OK;}
|
||||
|
||||
// nsIHTMLContentSink
|
||||
NS_IMETHOD SetTitle(const nsString& aValue);
|
||||
|
|
|
@ -1,173 +0,0 @@
|
|||
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-
|
||||
*
|
||||
* The contents of this file are subject to the Netscape Public
|
||||
* License Version 1.1 (the "License"); you may not use this file
|
||||
* except in compliance with the License. You may obtain a copy of
|
||||
* the License at http://www.mozilla.org/NPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS
|
||||
* IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
|
||||
* implied. See the License for the specific language governing
|
||||
* rights and limitations under the License.
|
||||
*
|
||||
* The Original Code is Mozilla Communicator client code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is Netscape Communications
|
||||
* Corporation. Portions created by Netscape are
|
||||
* Copyright (C) 1998 Netscape Communications Corporation. All
|
||||
* Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
*/
|
||||
#ifndef nsIContentSink_h___
|
||||
#define nsIContentSink_h___
|
||||
|
||||
/**
|
||||
* MODULE NOTES:
|
||||
* @update gess 4/1/98
|
||||
*
|
||||
* This file declares the concrete IContentSink interface.
|
||||
* This pure virtual interface is used as the "glue" that connects the parsing
|
||||
* process to the content model construction process.
|
||||
*
|
||||
* The icontentsink interface is a very lightweight wrapper that represents the
|
||||
* content-sink model building process. There is another one that you may care
|
||||
* about more, which is the IHTMLContentSink interface. (See that file for details).
|
||||
*/
|
||||
|
||||
#include "nsIParserNode.h"
|
||||
#include "nsISupports.h"
|
||||
#include "nsParserError.h"
|
||||
|
||||
class nsIParser;
|
||||
|
||||
#define NS_ICONTENT_SINK_IID \
|
||||
{ 0xa6cf9052, 0x15b3, 0x11d2,{0x93, 0x2e, 0x00, 0x80, 0x5f, 0x8a, 0xdd, 0x32}}
|
||||
|
||||
// The base value for the content ID counter.
|
||||
// Values greater than or equal to this base value are used
|
||||
// by each of the content sinks to assign unique values
|
||||
// to the content objects created by them.
|
||||
#define NS_CONTENT_ID_COUNTER_BASE 10000
|
||||
|
||||
class nsIContentSink : public nsISupports {
|
||||
public:
|
||||
|
||||
NS_DEFINE_STATIC_IID_ACCESSOR(NS_ICONTENT_SINK_IID)
|
||||
|
||||
/**
|
||||
* This method gets called when the parser begins the process
|
||||
* of building the content model via the content sink.
|
||||
*
|
||||
* @update 5/7/98 gess
|
||||
*/
|
||||
NS_IMETHOD WillBuildModel(void)=0;
|
||||
|
||||
/**
|
||||
* This method gets called when the parser concludes the process
|
||||
* of building the content model via the content sink.
|
||||
*
|
||||
* @param aQualityLevel describes how well formed the doc was.
|
||||
* 0=GOOD; 1=FAIR; 2=POOR;
|
||||
* @update 5/7/98 gess
|
||||
*/
|
||||
NS_IMETHOD DidBuildModel(PRInt32 aQualityLevel)=0;
|
||||
|
||||
/**
|
||||
* This method gets called when the parser gets i/o blocked,
|
||||
* and wants to notify the sink that it may be a while before
|
||||
* more data is available.
|
||||
*
|
||||
* @update 5/7/98 gess
|
||||
*/
|
||||
NS_IMETHOD WillInterrupt(void)=0;
|
||||
|
||||
/**
|
||||
* This method gets called when the parser i/o gets unblocked,
|
||||
* and we're about to start dumping content again to the sink.
|
||||
*
|
||||
* @update 5/7/98 gess
|
||||
*/
|
||||
NS_IMETHOD WillResume(void)=0;
|
||||
|
||||
/**
|
||||
* This method gets called by the parser so that the content
|
||||
* sink can retain a reference to the parser. The expectation
|
||||
* is that the content sink will drop the reference when it
|
||||
* gets the DidBuildModel notification i.e. when parsing is done.
|
||||
*/
|
||||
NS_IMETHOD SetParser(nsIParser* aParser)=0;
|
||||
|
||||
/**
|
||||
* This method is used to open a generic container in the sink.
|
||||
*
|
||||
* @update 4/1/98 gess
|
||||
* @param nsIParserNode reference to parser node interface
|
||||
*/
|
||||
NS_IMETHOD OpenContainer(const nsIParserNode& aNode) = 0;
|
||||
|
||||
/**
|
||||
* This method gets called by the parser when a close
|
||||
* container tag has been consumed and needs to be closed.
|
||||
*
|
||||
* @update 4/1/98 gess
|
||||
* @param nsIParserNode reference to parser node interface
|
||||
*/
|
||||
NS_IMETHOD CloseContainer(const nsIParserNode& aNode) = 0;
|
||||
|
||||
/**
|
||||
* This gets called by the parser when you want to add
|
||||
* a leaf node to the current container in the content
|
||||
* model.
|
||||
*
|
||||
* @update 4/1/98 gess
|
||||
* @param nsIParserNode reference to parser node interface
|
||||
*/
|
||||
NS_IMETHOD AddLeaf(const nsIParserNode& aNode) = 0;
|
||||
|
||||
/**
|
||||
* This gets called by the parser when you want to add
|
||||
* a leaf node to the current container in the content
|
||||
* model.
|
||||
*
|
||||
* @update 4/1/98 gess
|
||||
* @param nsIParserNode reference to parser node interface
|
||||
*/
|
||||
NS_IMETHOD AddComment(const nsIParserNode& aNode) = 0;
|
||||
|
||||
/**
|
||||
* This gets called by the parser when you want to add
|
||||
* a leaf node to the current container in the content
|
||||
* model.
|
||||
*
|
||||
* @update 4/1/98 gess
|
||||
* @param nsIParserNode reference to parser node interface
|
||||
*/
|
||||
NS_IMETHOD AddProcessingInstruction(const nsIParserNode& aNode) = 0;
|
||||
|
||||
/**
|
||||
* This method is called by the parser when it encounters
|
||||
* a document type declaration.
|
||||
*
|
||||
* XXX Should the parser also parse the internal subset?
|
||||
*
|
||||
* @param nsIParserNode reference to parser node interface
|
||||
*/
|
||||
NS_IMETHOD AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode)=0;
|
||||
|
||||
/**
|
||||
* This gets called by the parser if it hits an unrecoverable
|
||||
* error (in XML, if the document is not well-formed or valid).
|
||||
*
|
||||
* @param aErrorResult the error code
|
||||
*/
|
||||
NS_IMETHOD NotifyError(const nsParserError* aError)=0;
|
||||
|
||||
/**
|
||||
* Flush all pending notifications so that the content model
|
||||
* is in sync with the state of the sink.
|
||||
*/
|
||||
NS_IMETHOD FlushPendingNotifications()=0;
|
||||
};
|
||||
|
||||
#endif /* nsIContentSink_h___ */
|
|
@ -87,12 +87,12 @@ typedef enum {
|
|||
kCharsetFromWeakDocTypeDefault,
|
||||
kCharsetFromUserDefault ,
|
||||
kCharsetFromDocTypeDefault,
|
||||
kCharsetFromCache,
|
||||
kCharsetFromParentFrame,
|
||||
kCharsetFromBookmarks,
|
||||
kCharsetFromAutoDetection,
|
||||
kCharsetFromMetaTag,
|
||||
kCharsetFromByteOrderMark,
|
||||
kCharsetFromCache,
|
||||
kCharsetFromHTTPHeader,
|
||||
kCharsetFromUserForced,
|
||||
kCharsetFromOtherComponent,
|
||||
|
|
|
@ -57,6 +57,7 @@ public:
|
|||
NS_IMETHOD AddProcessingInstruction(const nsIParserNode& aNode);
|
||||
NS_IMETHOD AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode=0);
|
||||
NS_IMETHOD FlushPendingNotifications() { return NS_OK; }
|
||||
// No-op implementation: aCharset is not used and NS_OK is always returned.
NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset) { return NS_OK; }
|
||||
|
||||
// nsIHTMLContentSink
|
||||
NS_IMETHOD SetTitle(const nsString& aValue);
|
||||
|
|
|
@ -39,12 +39,16 @@
|
|||
#include "nsViewSourceHTML.h"
|
||||
#include "nsIStringStream.h"
|
||||
#include "nsIChannel.h"
|
||||
#include "nsICachingChannel.h"
|
||||
#include "nsICacheEntryDescriptor.h"
|
||||
#include "nsICharsetAlias.h"
|
||||
#include "nsIProgressEventSink.h"
|
||||
#include "nsIInputStream.h"
|
||||
#include "CNavDTD.h"
|
||||
#include "COtherDTD.h"
|
||||
#include "prenv.h"
|
||||
#include "nsParserCIID.h"
|
||||
#include "nsReadableUtils.h"
|
||||
#include "nsCOMPtr.h"
|
||||
#include "nsIEventQueue.h"
|
||||
#include "nsIEventQueueService.h"
|
||||
|
@ -543,6 +547,13 @@ void nsParser::SetDocumentCharset(nsString& aCharset, nsCharsetSource aCharsetSo
|
|||
mParserContext->mScanner->SetDocumentCharset(aCharset, aCharsetSource);
|
||||
}
|
||||
|
||||
/**
 * Hands the given charset to the content sink attached to this parser.
 * When no sink is attached the call has no effect.
 *
 * @param aCharset charset name passed through to the sink's
 *                 SetDocumentCharset()
 */
void nsParser::SetSinkCharset(nsAWritableString& aCharset)
{
  if (!mSink) {
    return;  // no sink to notify
  }

  mSink->SetDocumentCharset(aCharset);
}
|
||||
|
||||
/**
|
||||
* This method gets called in order to set the content
|
||||
* sink for this parser to dump nodes to.
|
||||
|
@ -613,14 +624,14 @@ nsDTDMode nsParser::GetParseMode(void){
|
|||
}
|
||||
|
||||
|
||||
|
||||
template <class CharT>
|
||||
class CWordTokenizer {
|
||||
public:
|
||||
CWordTokenizer(nsString& aString,PRInt32 aStartOffset,PRInt32 aMaxOffset) {
|
||||
CWordTokenizer(const CharT* aBuffer,PRInt32 aStartOffset,PRInt32 aMaxOffset) {
|
||||
mLength=0;
|
||||
mOffset=aStartOffset;
|
||||
mMaxOffset=aMaxOffset;
|
||||
mBuffer=aString.GetUnicode();
|
||||
mBuffer=aBuffer;
|
||||
mEndBuffer=mBuffer+mMaxOffset;
|
||||
}
|
||||
|
||||
|
@ -633,25 +644,33 @@ public:
|
|||
// Returns offset of nth word, or -1 (if out of words).
|
||||
//********************************************************************************
|
||||
|
||||
PRInt32 GetNextWord() {
|
||||
PRInt32 GetNextWord(PRBool aSkipQuotes=PR_FALSE) {
|
||||
|
||||
const PRUnichar *cp=mBuffer+mOffset+mLength; //skip last word
|
||||
const CharT *cp=mBuffer+mOffset+mLength; //skip last word
|
||||
|
||||
mLength=0; //reset this
|
||||
mOffset=-1; //reset this
|
||||
|
||||
//now skip whitespace...
|
||||
|
||||
PRUnichar target=0;
|
||||
CharT target=0;
|
||||
PRBool done=PR_FALSE;
|
||||
|
||||
while((!done) && (cp++<mEndBuffer)) {
|
||||
switch(*cp) {
|
||||
case kSpace: case kNewLine:
|
||||
case kCR: case kTab:
|
||||
case kEqual:
|
||||
continue;
|
||||
|
||||
case kQuote:
|
||||
target=*cp;
|
||||
if (aSkipQuotes) {
|
||||
cp++;
|
||||
}
|
||||
done=PR_TRUE;
|
||||
break;
|
||||
|
||||
case kMinus:
|
||||
target=*cp;
|
||||
done=PR_TRUE;
|
||||
|
@ -665,7 +684,7 @@ public:
|
|||
|
||||
if(cp<mEndBuffer) {
|
||||
|
||||
const PRUnichar *firstcp=cp; //hang onto this...
|
||||
const CharT *firstcp=cp; //hang onto this...
|
||||
PRInt32 theDashCount=2;
|
||||
|
||||
cp++; //just skip first letter to simplify processing...
|
||||
|
@ -693,7 +712,8 @@ public:
|
|||
(kGreaterThan==*cp) ||
|
||||
(kQuote==*cp) ||
|
||||
(kCR==*cp) ||
|
||||
(kTab==*cp)) {
|
||||
(kTab==*cp) ||
|
||||
(kEqual == *cp)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -707,11 +727,15 @@ public:
|
|||
return mOffset;
|
||||
}
|
||||
|
||||
PRInt32 GetLength() const {
|
||||
return mLength;
|
||||
}
|
||||
|
||||
PRInt32 mOffset;
|
||||
PRInt32 mMaxOffset;
|
||||
PRInt32 mLength;
|
||||
const PRUnichar* mBuffer;
|
||||
const PRUnichar* mEndBuffer;
|
||||
const CharT* mBuffer;
|
||||
const CharT* mEndBuffer;
|
||||
};
|
||||
|
||||
|
||||
|
@ -848,7 +872,7 @@ void DetermineParseMode(nsString& aBuffer,nsDTDMode& aParseMode,eParserDocType&
|
|||
if((kNotFound!=theGTPos) && (kNotFound!=theLTPos)) {
|
||||
|
||||
const PRUnichar* theBuffer=aBuffer.GetUnicode();
|
||||
CWordTokenizer theTokenizer(aBuffer,theLTPos,theGTPos);
|
||||
CWordTokenizer<PRUnichar> theTokenizer(theBuffer,theLTPos,theGTPos);
|
||||
theOffset=theTokenizer.GetNextWord(); //try to find ?xml, !doctype, etc...
|
||||
|
||||
if((kNotFound!=theOffset) && (kNotFound!=theDocTypePos)) {
|
||||
|
@ -2297,7 +2321,7 @@ nsresult nsParser::OnStartRequest(nsIRequest *request, nsISupports* aContext) {
|
|||
#define UCS4_3412 "X-ISO-10646-UCS-4-3412"
|
||||
#define UTF8 "UTF-8"
|
||||
|
||||
static PRBool detectByteOrderMark(const unsigned char* aBytes, PRInt32 aLen, nsString& oCharset, nsCharsetSource& oCharsetSource) {
|
||||
static PRBool DetectByteOrderMark(const unsigned char* aBytes, PRInt32 aLen, nsString& oCharset, nsCharsetSource& oCharsetSource) {
|
||||
oCharsetSource= kCharsetFromAutoDetection;
|
||||
oCharset.SetLength(0);
|
||||
// see http://www.w3.org/TR/1998/REC-xml-19980210#sec-oCharseting
|
||||
|
@ -2407,11 +2431,121 @@ static PRBool detectByteOrderMark(const unsigned char* aBytes, PRInt32 aLen, nsS
|
|||
return oCharset.Length() > 0;
|
||||
}
|
||||
|
||||
static const char kHTTPEquivStr[] = "http-equiv";
|
||||
static const PRInt32 kHTTPEquivStrLen = sizeof(kHTTPEquivStr)-1;
|
||||
static const char kContentTypeStr[] = "Content-Type";
|
||||
static const PRInt32 kContentTypeStrLen = sizeof(kContentTypeStr)-1;
|
||||
static const char kContentStr[] = "content";
|
||||
static const PRInt32 kContentStrLen = sizeof(kContentStr)-1;
|
||||
static const char kCharsetStr[] = "charset";
|
||||
static const PRInt32 kCharsetStrLen = sizeof(kCharsetStr)-1;
|
||||
|
||||
/**
 * Sniffs the beginning of a raw byte buffer for an HTML tag of the form
 *   <META HTTP-EQUIV="Content-Type" CONTENT="type; charset=NAME">
 * and extracts NAME.
 *
 * Only documents whose MIME type equals kHTMLTextContentType are examined,
 * and at most the first 2048 bytes of the buffer are looked at. The parsing
 * is deliberately fast and loose (word tokenizing, no real tag parsing).
 *
 * @param aBytes         raw, not yet converted bytes of the stream
 * @param aLen           number of bytes available in aBytes
 * @param aCharset       cleared on entry; on success holds the charset name
 *                       found in the CONTENT attribute
 * @param aCharsetSource always set to kCharsetFromMetaTag
 * @return PR_TRUE if a single META tag supplied both an HTTP-EQUIV attribute
 *         with value "Content-Type" and a non-empty charset, PR_FALSE
 *         otherwise
 */
PRBool
nsParser::DetectMetaTag(const char* aBytes,
                        PRInt32 aLen,
                        nsString& aCharset,
                        nsCharsetSource& aCharsetSource)
{
  PRBool foundContentType = PR_FALSE;
  aCharsetSource= kCharsetFromMetaTag;
  aCharset.SetLength(0);

  // XXX Only look inside HTML documents for now. For XML
  // documents we should be looking inside the XMLDecl.
  if (!mParserContext->mMimeType.Equals(NS_ConvertASCIItoUCS2(kHTMLTextContentType))) {
    return PR_FALSE;
  }

  // Fast and loose parsing to determine if we have a complete
  // META tag in this block, looking up to 2k into it.
  nsDependentCString str(aBytes, PR_MIN(aLen, 2048));
  nsReadingIterator<char> begin, end;

  str.BeginReading(begin);
  str.EndReading(end);
  nsReadingIterator<char> tagStart(begin);
  nsReadingIterator<char> tagEnd(end);

  do {
    // Find the string META and make sure it's not right at the beginning
    if (CaseInsensitiveFindInReadable(NS_LITERAL_CSTRING("META"), tagStart, tagEnd) &&
        (tagStart != begin)) {
      // Back up one to confirm that this is a tag
      if (*--tagStart == '<') {
        const char* attrStart = tagEnd.get();
        const char* attrEnd;

        // The results describe a single tag, so start from a clean slate;
        // otherwise an HTTP-EQUIV from one META tag could be combined with
        // a charset taken from a different one.
        foundContentType = PR_FALSE;
        aCharset.SetLength(0);

        // Find the end of the tag. The find routines reposition BOTH
        // iterators they are given, so search with a scratch copy of |end|.
        // Passing |end| itself would shrink the buffer sentinel to just past
        // this '>' and no later META tag would ever be examined.
        nsReadingIterator<char> closeEnd(end);
        FindInReadable(NS_LITERAL_CSTRING(">"), tagEnd, closeEnd);
        attrEnd = tagEnd.get();

        CWordTokenizer<char> tokenizer(attrStart, 0, attrEnd-attrStart);
        PRInt32 offset;

        // Start looking at the attributes
        while ((offset = tokenizer.GetNextWord()) != kNotFound) {
          // We need to have a HTTP-EQUIV attribute whose value is
          // "Content-Type"
          if ((tokenizer.GetLength() >= kHTTPEquivStrLen) &&
              (nsCRT::strncasecmp(attrStart+offset,
                                  kHTTPEquivStr, kHTTPEquivStrLen) == 0)) {
            if (((offset = tokenizer.GetNextWord(PR_TRUE)) != kNotFound) &&
                (tokenizer.GetLength() >= kContentTypeStrLen) &&
                (nsCRT::strncasecmp(attrStart+offset,
                                    kContentTypeStr, kContentTypeStrLen) == 0)) {
              foundContentType = PR_TRUE;
            }
            else {
              // HTTP-EQUIV names something else; this tag is of no interest
              break;
            }
          }
          // And a CONTENT attribute
          else if ((tokenizer.GetLength() >= kContentStrLen) &&
                   (nsCRT::strncasecmp(attrStart+offset,
                                       kContentStr, kContentStrLen) == 0)) {
            // The next word is the value which itself needs to be parsed
            if ((offset = tokenizer.GetNextWord(PR_TRUE)) != kNotFound) {
              const char* contentStart = attrStart+offset;
              CWordTokenizer<char> contentTokenizer(contentStart, 0,
                                                    tokenizer.GetLength());

              // Read the content type
              if (contentTokenizer.GetNextWord() != kNotFound) {
                // Now see if we have a charset
                if (((offset = contentTokenizer.GetNextWord()) != kNotFound) &&
                    (contentTokenizer.GetLength() >= kCharsetStrLen) &&
                    (nsCRT::strncasecmp(contentStart+offset,
                                        kCharsetStr, kCharsetStrLen) == 0)) {
                  // The next word is the charset
                  if ((offset = contentTokenizer.GetNextWord()) != kNotFound) {
                    aCharset.Assign(NS_ConvertASCIItoUCS2(contentStart+offset,
                                                          contentTokenizer.GetLength()));
                  }
                }
              }
            }
          }
        }

        if (foundContentType && (aCharset.Length() > 0)) {
          return PR_TRUE;
        }
      }
    }

    // Resume the search right after the portion just examined; when nothing
    // was found the find call has already moved tagStart/tagEnd to the end.
    tagStart = tagEnd;
    tagEnd = end;
  } while (tagStart != end);

  return PR_FALSE;
}
|
||||
|
||||
typedef struct {
|
||||
PRBool mNeedCheckFirst4Bytes;
|
||||
PRBool mNeedCharsetCheck;
|
||||
nsParser* mParser;
|
||||
nsIParserFilter* mParserFilter;
|
||||
nsScanner* mScanner;
|
||||
nsIRequest* mRequest;
|
||||
} ParserWriteStruct;
|
||||
|
||||
/*
|
||||
|
@ -2437,19 +2571,40 @@ ParserWriteFunc(nsIInputStream* in,
|
|||
return NS_ERROR_FAILURE;
|
||||
}
|
||||
|
||||
if(pws->mNeedCheckFirst4Bytes && (count >= 4)) {
|
||||
if(pws->mNeedCharsetCheck) {
|
||||
nsCharsetSource guessSource;
|
||||
nsAutoString guess;
|
||||
nsAutoString guess, preferred;
|
||||
|
||||
pws->mNeedCheckFirst4Bytes = PR_FALSE;
|
||||
if(detectByteOrderMark((const unsigned char*)buf,
|
||||
theNumRead, guess, guessSource))
|
||||
{
|
||||
pws->mNeedCharsetCheck = PR_FALSE;
|
||||
if(pws->mParser->DetectMetaTag(buf, theNumRead,
|
||||
guess, guessSource) ||
|
||||
((count >= 4) &&
|
||||
DetectByteOrderMark((const unsigned char*)buf,
|
||||
theNumRead, guess, guessSource))) {
|
||||
#ifdef DEBUG_XMLENCODING
|
||||
printf("xmlencoding detect- %s\n", guess.ToNewCString());
|
||||
printf("xmlencoding detect- %s\n", guess.ToNewCString());
|
||||
#endif
|
||||
pws->mParser->SetDocumentCharset(guess, guessSource);
|
||||
}
|
||||
nsCOMPtr<nsICharsetAlias> alias(do_GetService(NS_CHARSETALIAS_CONTRACTID));
|
||||
result = alias->GetPreferred(guess, preferred);
|
||||
if (NS_SUCCEEDED(result)) {
|
||||
guess.Assign(preferred);
|
||||
}
|
||||
pws->mParser->SetDocumentCharset(guess, guessSource);
|
||||
pws->mParser->SetSinkCharset(guess);
|
||||
nsCOMPtr<nsICachingChannel> channel(do_QueryInterface(pws->mRequest));
|
||||
if (channel) {
|
||||
nsCOMPtr<nsISupports> cacheToken;
|
||||
channel->GetCacheToken(getter_AddRefs(cacheToken));
|
||||
if (cacheToken) {
|
||||
nsCOMPtr<nsICacheEntryDescriptor> cacheDescriptor(do_QueryInterface(cacheToken));
|
||||
if (cacheDescriptor) {
|
||||
nsresult rv = cacheDescriptor->SetMetaDataElement("charset",
|
||||
NS_ConvertUCS2toUTF8(guess).get());
|
||||
NS_ASSERTION(NS_SUCCEEDED(rv),"cannot SetMetaDataElement");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(pws->mParserFilter)
|
||||
|
@ -2503,11 +2658,12 @@ NS_PRECONDITION(((eOnStart==mParserContext->mStreamListenerState)||(eOnDataAvail
|
|||
|
||||
PRUint32 totalRead;
|
||||
ParserWriteStruct pws;
|
||||
pws.mNeedCheckFirst4Bytes =
|
||||
((0 == sourceOffset) && (mCharsetSource<kCharsetFromAutoDetection));
|
||||
pws.mNeedCharsetCheck =
|
||||
((0 == sourceOffset) && (mCharsetSource<kCharsetFromMetaTag));
|
||||
pws.mParser = this;
|
||||
pws.mParserFilter = mParserFilter;
|
||||
pws.mScanner = theContext->mScanner;
|
||||
pws.mRequest = request;
|
||||
|
||||
result = pIStream->ReadSegments(ParserWriteFunc, (void*)&pws, aLength, &totalRead);
|
||||
if (NS_FAILED(result)) {
|
||||
|
|
|
@ -328,6 +328,17 @@ class nsParser : public nsIParser,
|
|||
const nsString* aMimeType=nsnull,
|
||||
nsDTDMode aDTDMode=eDTDMode_unknown);
|
||||
|
||||
/**
|
||||
* Detects the existence of a META tag with charset information in
|
||||
* the given buffer.
|
||||
*
* Only HTML documents are examined, and at most the first 2048 bytes
* of the buffer are scanned.
*
* @param aBytes         raw bytes to scan
* @param aLen           number of bytes in aBytes
* @param oCharset       cleared on entry; receives the charset name
*                       when one is found
* @param oCharsetSource always set to kCharsetFromMetaTag
* @return PR_TRUE if both an HTTP-EQUIV "Content-Type" attribute and
*         a non-empty charset were found, PR_FALSE otherwise
*/
|
||||
PRBool DetectMetaTag(const char* aBytes,
|
||||
PRInt32 aLen,
|
||||
nsString& oCharset,
|
||||
nsCharsetSource& oCharsetSource);
|
||||
|
||||
/**
* Forwards the given charset to the content sink via its
* SetDocumentCharset() method; does nothing when no sink is set.
*/
void SetSinkCharset(nsAWritableString& aCharset);
|
||||
|
||||
/**
|
||||
* Removes continue parsing events
|
||||
* @update kmcclusk 5/18/98
|
||||
|
|
|
@ -168,6 +168,7 @@ public:
|
|||
NS_IMETHOD NotifyError(const nsParserError* aError);
|
||||
NS_IMETHOD AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode=0);
|
||||
NS_IMETHOD FlushPendingNotifications() { return NS_OK; }
|
||||
// No-op implementation: aCharset is not used and NS_OK is always returned.
NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset) { return NS_OK; }
|
||||
|
||||
// nsIXMLContentSink
|
||||
NS_IMETHOD AddXMLDecl(const nsIParserNode& aNode);
|
||||
|
|
Загрузка…
Ссылка в новой задаче