Fix for bug 81253. We now sniff up to the first 2k of the first buffer of any HTML stream, looking for a META tag with charset information. If charset information is found, we use it for Unicode conversion. This deals with the bulk of cases where we used to do a reload based on charset information in the document. In the worst case (if charset information exists but isn't found during sniffing) we fall back to the reload case. This fix improves initial page load performance for pages with a charset. Degradation in performance for pages loaded out of the cache is still being investigated. r=harishd, sr=waterson

This commit is contained in:
vidur%netscape.com 2001-06-29 22:56:58 +00:00
Родитель d626846739
Коммит 9a358b6df4
22 изменённых файлов: 456 добавлений и 67 удалений

Просмотреть файл

@ -83,6 +83,7 @@ public:
NS_IMETHOD AddProcessingInstruction(const nsIParserNode& aNode) { return NS_OK; }
NS_IMETHOD AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode=0) { return NS_OK; }
NS_IMETHOD FlushPendingNotifications() { return NS_OK; }
NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset) { return NS_OK; }
// nsIHTMLContentSink
NS_IMETHOD SetTitle(const nsString& aValue) { return NS_OK; }

Просмотреть файл

@ -212,6 +212,7 @@ public:
NS_IMETHOD AddLeaf(const nsIParserNode& aNode);
NS_IMETHOD NotifyError(const nsParserError* aError);
NS_IMETHOD FlushPendingNotifications();
NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset);
NS_IMETHOD AddComment(const nsIParserNode& aNode);
NS_IMETHOD AddProcessingInstruction(const nsIParserNode& aNode);
NS_IMETHOD AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode=0);
@ -5034,7 +5035,6 @@ HTMLContentSink::ProcessSTYLETag(const nsIParserNode& aNode)
title.CompressWhitespace();
element->GetAttribute(kNameSpaceID_HTML, nsHTMLAtoms::type, type);
element->GetAttribute(kNameSpaceID_HTML, nsHTMLAtoms::media, media);
media.ToLowerCase(); // HTML4.0 spec is inconsistent, make it case INSENSITIVE
@ -5119,6 +5119,16 @@ HTMLContentSink::FlushPendingNotifications()
return result;
}
/**
 * Hands the given character set name to the document owned by this sink.
 *
 * @param aCharset charset name passed through to the document
 * @return the result of the document's SetDocumentCharacterSet(), or
 *         NS_OK when the sink has no document
 */
NS_IMETHODIMP
HTMLContentSink::SetDocumentCharset(nsAWritableString& aCharset)
{
  // Without a document there is nothing to update; report success.
  if (!mDocument) {
    return NS_OK;
  }

  return mDocument->SetDocumentCharacterSet(aCharset);
}
NS_IMETHODIMP
HTMLContentSink::DoFragment(PRBool aFlag)
{

Просмотреть файл

@ -702,6 +702,7 @@ nsHTMLDocument::StartDocumentLoad(const char* aCommand,
}
}
}
nsresult rv_detect = NS_OK;
if(! gInitDetector)
{
@ -732,6 +733,22 @@ nsHTMLDocument::StartDocumentLoad(const char* aCommand,
nsXPIDLCString urlSpec;
aURL->GetSpec(getter_Copies(urlSpec));
if (cacheDescriptor && urlSpec)
{
if (kCharsetFromCache > charsetSource)
{
nsXPIDLCString cachedCharset;
rv = cacheDescriptor->GetMetaDataElement("charset",
getter_Copies(cachedCharset));
if (NS_SUCCEEDED(rv) && PL_strlen(cachedCharset) > 0)
{
charset.AssignWithConversion(cachedCharset);
charsetSource = kCharsetFromCache;
}
}
rv = NS_OK;
}
if (scheme && nsCRT::strcasecmp("about", scheme) && (kCharsetFromBookmarks > charsetSource))
{
nsCOMPtr<nsIRDFDataSource> datasource;
@ -757,22 +774,6 @@ nsHTMLDocument::StartDocumentLoad(const char* aCommand,
}
}
if (cacheDescriptor && urlSpec)
{
if (kCharsetFromCache > charsetSource)
{
nsXPIDLCString cachedCharset;
rv = cacheDescriptor->GetMetaDataElement("charset",
getter_Copies(cachedCharset));
if (NS_SUCCEEDED(rv) && PL_strlen(cachedCharset) > 0)
{
charset.AssignWithConversion(cachedCharset);
charsetSource = kCharsetFromCache;
}
}
rv = NS_OK;
}
if (kCharsetFromParentFrame > charsetSource) {
if (dcInfo) {
nsCOMPtr<nsIAtom> csAtom;

Просмотреть файл

@ -96,6 +96,7 @@ public:
NS_IMETHOD OpenMap(const nsIParserNode& aNode);
NS_IMETHOD CloseMap(const nsIParserNode& aNode);
NS_IMETHOD FlushPendingNotifications() { return NS_OK; }
NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset) { return NS_OK; }
NS_IMETHOD WillProcessTokens(void) { return NS_OK; }
NS_IMETHOD DidProcessTokens(void) { return NS_OK; }
NS_IMETHOD WillProcessAToken(void) { return NS_OK; }

Просмотреть файл

@ -1375,6 +1375,16 @@ nsXMLContentSink::AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode)
return doc->AppendChild(docType, getter_AddRefs(tmpNode));
}
/**
 * Hands the given character set name to the document owned by this sink.
 *
 * @param aCharset charset name passed through to the document
 * @return the result of the document's SetDocumentCharacterSet(), or
 *         NS_OK when the sink has no document
 */
NS_IMETHODIMP
nsXMLContentSink::SetDocumentCharset(nsAWritableString& aCharset)
{
  // Without a document there is nothing to update; report success.
  if (!mDocument) {
    return NS_OK;
  }

  return mDocument->SetDocumentCharacterSet(aCharset);
}
nsresult
nsXMLContentSink::FlushText(PRBool aCreateTextNode, PRBool* aDidFlush)
{

Просмотреть файл

@ -96,6 +96,7 @@ public:
NS_IMETHOD NotifyError(const nsParserError* aError);
NS_IMETHOD AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode=0);
NS_IMETHOD FlushPendingNotifications() { return NS_OK; }
NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset);
// nsIXMLContentSink
NS_IMETHOD AddXMLDecl(const nsIParserNode& aNode);

Просмотреть файл

@ -130,6 +130,7 @@ public:
NS_IMETHOD NotifyError(const nsParserError* aError);
NS_IMETHOD AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode=0);
NS_IMETHOD FlushPendingNotifications() { return NS_OK; }
NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset);
// nsIXMLContentSink
NS_IMETHOD AddXMLDecl(const nsIParserNode& aNode);
@ -954,6 +955,17 @@ XULContentSinkImpl::AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode)
return NS_OK;
}
/**
 * Hands the given character set name to the document this sink refers to.
 * mDocument is resolved through do_QueryReferent(), so the document may no
 * longer be available; in that case the call is a successful no-op.
 *
 * @param aCharset charset name passed through to the document
 * @return the result of the document's SetDocumentCharacterSet(), or
 *         NS_OK when the document could not be resolved
 */
NS_IMETHODIMP
XULContentSinkImpl::SetDocumentCharset(nsAWritableString& aCharset)
{
  nsCOMPtr<nsIDocument> targetDoc = do_QueryReferent(mDocument);
  if (!targetDoc) {
    return NS_OK;
  }

  return targetDoc->SetDocumentCharacterSet(aCharset);
}
NS_IMETHODIMP
XULContentSinkImpl::AddCharacterData(const nsIParserNode& aNode)

Просмотреть файл

@ -86,6 +86,7 @@ public:
NS_IMETHOD WillResume(void) { return NS_OK; }
NS_IMETHOD SetParser(nsIParser* aParser) { return NS_OK; }
NS_IMETHOD FlushPendingNotifications() { return NS_OK; }
NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset) { return NS_OK; }
NS_IMETHOD WillProcessTokens(void) { return NS_OK; }
NS_IMETHOD DidProcessTokens(void) { return NS_OK; }
NS_IMETHOD WillProcessAToken(void) { return NS_OK; }

Просмотреть файл

@ -55,6 +55,7 @@ public:
NS_IMETHOD AddComment(const nsIParserNode& aNode);
NS_IMETHOD AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode=0);
NS_IMETHOD FlushPendingNotifications() {return NS_OK; }
NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset) {return NS_OK;}
// nsIHTMLContentSink
NS_IMETHOD SetTitle(const nsString& aValue);

Просмотреть файл

@ -168,6 +168,12 @@ public:
* is in sync with the state of the sink.
*/
NS_IMETHOD FlushPendingNotifications()=0;
/**
* Set the document character set. This should be passed on to the
* document itself.
*/
NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset)=0;
};
#endif /* nsIContentSink_h___ */

Просмотреть файл

@ -87,12 +87,12 @@ typedef enum {
kCharsetFromWeakDocTypeDefault,
kCharsetFromUserDefault ,
kCharsetFromDocTypeDefault,
kCharsetFromCache,
kCharsetFromParentFrame,
kCharsetFromBookmarks,
kCharsetFromAutoDetection,
kCharsetFromMetaTag,
kCharsetFromByteOrderMark,
kCharsetFromCache,
kCharsetFromHTTPHeader,
kCharsetFromUserForced,
kCharsetFromOtherComponent,

Просмотреть файл

@ -57,6 +57,7 @@ public:
NS_IMETHOD AddProcessingInstruction(const nsIParserNode& aNode);
NS_IMETHOD AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode=0);
NS_IMETHOD FlushPendingNotifications() { return NS_OK; }
NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset) { return NS_OK; }
// nsIHTMLContentSink
NS_IMETHOD SetTitle(const nsString& aValue);

Просмотреть файл

@ -39,12 +39,16 @@
#include "nsViewSourceHTML.h"
#include "nsIStringStream.h"
#include "nsIChannel.h"
#include "nsICachingChannel.h"
#include "nsICacheEntryDescriptor.h"
#include "nsICharsetAlias.h"
#include "nsIProgressEventSink.h"
#include "nsIInputStream.h"
#include "CNavDTD.h"
#include "COtherDTD.h"
#include "prenv.h"
#include "nsParserCIID.h"
#include "nsReadableUtils.h"
#include "nsCOMPtr.h"
#include "nsIEventQueue.h"
#include "nsIEventQueueService.h"
@ -543,6 +547,13 @@ void nsParser::SetDocumentCharset(nsString& aCharset, nsCharsetSource aCharsetSo
mParserContext->mScanner->SetDocumentCharset(aCharset, aCharsetSource);
}
/**
 * Forwards a charset name to the content sink, if one is attached.
 * Any result returned by the sink is ignored.
 *
 * @param aCharset charset name handed to the sink's SetDocumentCharset()
 */
void nsParser::SetSinkCharset(nsAWritableString& aCharset)
{
  if (!mSink) {
    return;  // no sink attached, nothing to notify
  }

  mSink->SetDocumentCharset(aCharset);
}
/**
* This method gets called in order to set the content
* sink for this parser to dump nodes to.
@ -613,14 +624,14 @@ nsDTDMode nsParser::GetParseMode(void){
}
template <class CharT>
class CWordTokenizer {
public:
CWordTokenizer(nsString& aString,PRInt32 aStartOffset,PRInt32 aMaxOffset) {
CWordTokenizer(const CharT* aBuffer,PRInt32 aStartOffset,PRInt32 aMaxOffset) {
mLength=0;
mOffset=aStartOffset;
mMaxOffset=aMaxOffset;
mBuffer=aString.GetUnicode();
mBuffer=aBuffer;
mEndBuffer=mBuffer+mMaxOffset;
}
@ -633,25 +644,33 @@ public:
// Returns offset of nth word, or -1 (if out of words).
//********************************************************************************
PRInt32 GetNextWord() {
PRInt32 GetNextWord(PRBool aSkipQuotes=PR_FALSE) {
const PRUnichar *cp=mBuffer+mOffset+mLength; //skip last word
const CharT *cp=mBuffer+mOffset+mLength; //skip last word
mLength=0; //reset this
mOffset=-1; //reset this
//now skip whitespace...
PRUnichar target=0;
CharT target=0;
PRBool done=PR_FALSE;
while((!done) && (cp++<mEndBuffer)) {
switch(*cp) {
case kSpace: case kNewLine:
case kCR: case kTab:
case kEqual:
continue;
case kQuote:
target=*cp;
if (aSkipQuotes) {
cp++;
}
done=PR_TRUE;
break;
case kMinus:
target=*cp;
done=PR_TRUE;
@ -665,7 +684,7 @@ public:
if(cp<mEndBuffer) {
const PRUnichar *firstcp=cp; //hang onto this...
const CharT *firstcp=cp; //hang onto this...
PRInt32 theDashCount=2;
cp++; //just skip first letter to simplify processing...
@ -693,7 +712,8 @@ public:
(kGreaterThan==*cp) ||
(kQuote==*cp) ||
(kCR==*cp) ||
(kTab==*cp)) {
(kTab==*cp) ||
(kEqual == *cp)) {
break;
}
}
@ -707,11 +727,15 @@ public:
return mOffset;
}
PRInt32 GetLength() const {
return mLength;
}
PRInt32 mOffset;
PRInt32 mMaxOffset;
PRInt32 mLength;
const PRUnichar* mBuffer;
const PRUnichar* mEndBuffer;
const CharT* mBuffer;
const CharT* mEndBuffer;
};
@ -848,7 +872,7 @@ void DetermineParseMode(nsString& aBuffer,nsDTDMode& aParseMode,eParserDocType&
if((kNotFound!=theGTPos) && (kNotFound!=theLTPos)) {
const PRUnichar* theBuffer=aBuffer.GetUnicode();
CWordTokenizer theTokenizer(aBuffer,theLTPos,theGTPos);
CWordTokenizer<PRUnichar> theTokenizer(theBuffer,theLTPos,theGTPos);
theOffset=theTokenizer.GetNextWord(); //try to find ?xml, !doctype, etc...
if((kNotFound!=theOffset) && (kNotFound!=theDocTypePos)) {
@ -2297,7 +2321,7 @@ nsresult nsParser::OnStartRequest(nsIRequest *request, nsISupports* aContext) {
#define UCS4_3412 "X-ISO-10646-UCS-4-3412"
#define UTF8 "UTF-8"
static PRBool detectByteOrderMark(const unsigned char* aBytes, PRInt32 aLen, nsString& oCharset, nsCharsetSource& oCharsetSource) {
static PRBool DetectByteOrderMark(const unsigned char* aBytes, PRInt32 aLen, nsString& oCharset, nsCharsetSource& oCharsetSource) {
oCharsetSource= kCharsetFromAutoDetection;
oCharset.SetLength(0);
// see http://www.w3.org/TR/1998/REC-xml-19980210#sec-oCharseting
@ -2407,11 +2431,121 @@ static PRBool detectByteOrderMark(const unsigned char* aBytes, PRInt32 aLen, nsS
return oCharset.Length() > 0;
}
static const char kHTTPEquivStr[] = "http-equiv";
static const PRInt32 kHTTPEquivStrLen = sizeof(kHTTPEquivStr)-1;
static const char kContentTypeStr[] = "Content-Type";
static const PRInt32 kContentTypeStrLen = sizeof(kContentTypeStr)-1;
static const char kContentStr[] = "content";
static const PRInt32 kContentStrLen = sizeof(kContentStr)-1;
static const char kCharsetStr[] = "charset";
static const PRInt32 kCharsetStrLen = sizeof(kCharsetStr)-1;
/**
 * Sniffs the beginning of a raw byte buffer for an HTML tag of the form
 * <META HTTP-EQUIV="Content-Type" CONTENT="type; charset=name">.
 *
 * Parsing is deliberately fast and loose: the buffer is searched for the
 * text "META" preceded by '<', and the attributes up to the next '>' are
 * split into words with CWordTokenizer.
 *
 * @param aBytes         raw bytes of the stream
 * @param aLen           number of bytes in aBytes; at most the first 2048
 *                       are examined
 * @param aCharset       [out] the charset name on success; empty otherwise
 * @param aCharsetSource [out] always set to kCharsetFromMetaTag
 * @return PR_TRUE if a single META tag supplied both the Content-Type
 *         http-equiv and a non-empty charset, PR_FALSE otherwise
 */
PRBool
nsParser::DetectMetaTag(const char* aBytes,
                        PRInt32 aLen,
                        nsString& aCharset,
                        nsCharsetSource& aCharsetSource)
{
  aCharsetSource = kCharsetFromMetaTag;
  aCharset.SetLength(0);

  // XXX Only look inside HTML documents for now. For XML
  // documents we should be looking inside the XMLDecl.
  if (!mParserContext->mMimeType.Equals(NS_ConvertASCIItoUCS2(kHTMLTextContentType))) {
    return PR_FALSE;
  }

  // Fast and loose parsing to determine if we have a complete
  // META tag in this block, looking up to 2k into it.
  nsDependentCString str(aBytes, PR_MIN(aLen, 2048));
  nsReadingIterator<char> begin, end;

  str.BeginReading(begin);
  str.EndReading(end);
  nsReadingIterator<char> tagStart(begin);
  nsReadingIterator<char> tagEnd(end);

  do {
    // Find the string META and make sure it's not right at the beginning
    if (CaseInsensitiveFindInReadable(NS_LITERAL_CSTRING("META"), tagStart, tagEnd) &&
        (tagStart != begin)) {
      // Back up one to confirm that this is a tag
      if (*--tagStart == '<') {
        // Judge every tag on its own: an http-equiv seen in one META tag
        // must not be combined with a charset found in a different one.
        PRBool foundContentType = PR_FALSE;
        aCharset.SetLength(0);

        const char* attrStart = tagEnd.get();
        const char* attrEnd;

        // Find the end of the tag. FindInReadable() rewrites both of its
        // iterator arguments to bracket the match, so search with a scratch
        // copy of |end|; passing |end| itself would shrink the buffer to the
        // first '>' and hide every later META tag from the outer loop.
        nsReadingIterator<char> closeEnd(end);
        FindInReadable(NS_LITERAL_CSTRING(">"), tagEnd, closeEnd);
        attrEnd = tagEnd.get();

        CWordTokenizer<char> tokenizer(attrStart, 0, attrEnd-attrStart);
        PRInt32 offset;

        // Start looking at the attributes
        while ((offset = tokenizer.GetNextWord()) != kNotFound) {
          // We need to have a HTTP-EQUIV attribute whose value is
          // "Content-Type"
          if ((tokenizer.GetLength() >= kHTTPEquivStrLen) &&
              (nsCRT::strncasecmp(attrStart+offset,
                                  kHTTPEquivStr, kHTTPEquivStrLen) == 0)) {
            if (((offset = tokenizer.GetNextWord(PR_TRUE)) != kNotFound) &&
                (tokenizer.GetLength() >= kContentTypeStrLen) &&
                (nsCRT::strncasecmp(attrStart+offset,
                                    kContentTypeStr, kContentTypeStrLen) == 0)) {
              foundContentType = PR_TRUE;
            }
            else {
              // Some other http-equiv: this tag cannot carry the charset.
              break;
            }
          }
          // And a CONTENT attribute
          else if ((tokenizer.GetLength() >= kContentStrLen) &&
                   (nsCRT::strncasecmp(attrStart+offset,
                                       kContentStr, kContentStrLen) == 0)) {
            // The next word is the value which itself needs to be parsed
            if ((offset = tokenizer.GetNextWord(PR_TRUE)) != kNotFound) {
              const char* contentStart = attrStart+offset;
              CWordTokenizer<char> contentTokenizer(contentStart, 0,
                                                    tokenizer.GetLength());

              // Read the content type
              if (contentTokenizer.GetNextWord() != kNotFound) {
                // Now see if we have a charset
                if (((offset = contentTokenizer.GetNextWord()) != kNotFound) &&
                    (contentTokenizer.GetLength() >= kCharsetStrLen) &&
                    (nsCRT::strncasecmp(contentStart+offset,
                                        kCharsetStr, kCharsetStrLen) == 0)) {
                  // The next word is the charset
                  if ((offset = contentTokenizer.GetNextWord()) != kNotFound) {
                    aCharset.Assign(NS_ConvertASCIItoUCS2(contentStart+offset,
                                                          contentTokenizer.GetLength()));
                  }
                }
              }
            }
          }
        }

        if (foundContentType && (aCharset.Length() > 0)) {
          return PR_TRUE;
        }
      }
    }

    // Resume the search after what was just examined, over the full
    // remainder of the sniffed range.
    tagStart = tagEnd;
    tagEnd = end;
  } while (tagStart != end);

  // Don't leave a charset behind that no Content-Type tag vouched for.
  aCharset.SetLength(0);
  return PR_FALSE;
}
// State handed to ParserWriteFunc as the closure argument of
// nsIInputStream::ReadSegments() (see the OnDataAvailable code that fills
// in a |pws| instance before calling ReadSegments).
typedef struct {
// Set when reading starts at offset 0 and the current charset source is
// weaker than kCharsetFromAutoDetection.
// NOTE(review): this looks like the pre-change flag that mNeedCharsetCheck
// replaces; both appear because this listing shows old and new diff lines.
PRBool mNeedCheckFirst4Bytes;
// Set when reading starts at offset 0 and the current charset source is
// weaker than kCharsetFromMetaTag; ParserWriteFunc clears it once it has
// run its META tag / byte order mark detection.
PRBool mNeedCharsetCheck;
// Parser used by ParserWriteFunc for DetectMetaTag(), SetDocumentCharset()
// and SetSinkCharset().
nsParser* mParser;
// Optional filter; ParserWriteFunc tests it for null before use.
nsIParserFilter* mParserFilter;
// Scanner of the current parser context (assigned from theContext->mScanner).
nsScanner* mScanner;
// The request being read; ParserWriteFunc queries it for nsICachingChannel
// in order to record the detected charset in the cache entry's metadata.
nsIRequest* mRequest;
} ParserWriteStruct;
/*
@ -2437,19 +2571,40 @@ ParserWriteFunc(nsIInputStream* in,
return NS_ERROR_FAILURE;
}
if(pws->mNeedCheckFirst4Bytes && (count >= 4)) {
if(pws->mNeedCharsetCheck) {
nsCharsetSource guessSource;
nsAutoString guess;
nsAutoString guess, preferred;
pws->mNeedCheckFirst4Bytes = PR_FALSE;
if(detectByteOrderMark((const unsigned char*)buf,
theNumRead, guess, guessSource))
{
pws->mNeedCharsetCheck = PR_FALSE;
if(pws->mParser->DetectMetaTag(buf, theNumRead,
guess, guessSource) ||
((count >= 4) &&
DetectByteOrderMark((const unsigned char*)buf,
theNumRead, guess, guessSource))) {
#ifdef DEBUG_XMLENCODING
printf("xmlencoding detect- %s\n", guess.ToNewCString());
printf("xmlencoding detect- %s\n", guess.ToNewCString());
#endif
pws->mParser->SetDocumentCharset(guess, guessSource);
}
nsCOMPtr<nsICharsetAlias> alias(do_GetService(NS_CHARSETALIAS_CONTRACTID));
result = alias->GetPreferred(guess, preferred);
if (NS_SUCCEEDED(result)) {
guess.Assign(preferred);
}
pws->mParser->SetDocumentCharset(guess, guessSource);
pws->mParser->SetSinkCharset(guess);
nsCOMPtr<nsICachingChannel> channel(do_QueryInterface(pws->mRequest));
if (channel) {
nsCOMPtr<nsISupports> cacheToken;
channel->GetCacheToken(getter_AddRefs(cacheToken));
if (cacheToken) {
nsCOMPtr<nsICacheEntryDescriptor> cacheDescriptor(do_QueryInterface(cacheToken));
if (cacheDescriptor) {
nsresult rv = cacheDescriptor->SetMetaDataElement("charset",
NS_ConvertUCS2toUTF8(guess).get());
NS_ASSERTION(NS_SUCCEEDED(rv),"cannot SetMetaDataElement");
}
}
}
}
}
if(pws->mParserFilter)
@ -2503,11 +2658,12 @@ NS_PRECONDITION(((eOnStart==mParserContext->mStreamListenerState)||(eOnDataAvail
PRUint32 totalRead;
ParserWriteStruct pws;
pws.mNeedCheckFirst4Bytes =
((0 == sourceOffset) && (mCharsetSource<kCharsetFromAutoDetection));
pws.mNeedCharsetCheck =
((0 == sourceOffset) && (mCharsetSource<kCharsetFromMetaTag));
pws.mParser = this;
pws.mParserFilter = mParserFilter;
pws.mScanner = theContext->mScanner;
pws.mRequest = request;
result = pIStream->ReadSegments(ParserWriteFunc, (void*)&pws, aLength, &totalRead);
if (NS_FAILED(result)) {

Просмотреть файл

@ -328,6 +328,17 @@ class nsParser : public nsIParser,
const nsString* aMimeType=nsnull,
nsDTDMode aDTDMode=eDTDMode_unknown);
/**
* Detects the existence of a META tag with charset information in
* the given buffer.
*/
PRBool DetectMetaTag(const char* aBytes,
PRInt32 aLen,
nsString& oCharset,
nsCharsetSource& oCharsetSource);
void SetSinkCharset(nsAWritableString& aCharset);
/**
* Removes continue parsing events
* @update kmcclusk 5/18/98

Просмотреть файл

@ -86,6 +86,7 @@ public:
NS_IMETHOD WillResume(void) { return NS_OK; }
NS_IMETHOD SetParser(nsIParser* aParser) { return NS_OK; }
NS_IMETHOD FlushPendingNotifications() { return NS_OK; }
NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset) { return NS_OK; }
NS_IMETHOD WillProcessTokens(void) { return NS_OK; }
NS_IMETHOD DidProcessTokens(void) { return NS_OK; }
NS_IMETHOD WillProcessAToken(void) { return NS_OK; }

Просмотреть файл

@ -55,6 +55,7 @@ public:
NS_IMETHOD AddComment(const nsIParserNode& aNode);
NS_IMETHOD AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode=0);
NS_IMETHOD FlushPendingNotifications() {return NS_OK; }
NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset) {return NS_OK;}
// nsIHTMLContentSink
NS_IMETHOD SetTitle(const nsString& aValue);

Просмотреть файл

@ -168,6 +168,12 @@ public:
* is in sync with the state of the sink.
*/
NS_IMETHOD FlushPendingNotifications()=0;
/**
* Set the document character set. This should be passed on to the
* document itself.
*/
NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset)=0;
};
#endif /* nsIContentSink_h___ */

Просмотреть файл

@ -87,12 +87,12 @@ typedef enum {
kCharsetFromWeakDocTypeDefault,
kCharsetFromUserDefault ,
kCharsetFromDocTypeDefault,
kCharsetFromCache,
kCharsetFromParentFrame,
kCharsetFromBookmarks,
kCharsetFromAutoDetection,
kCharsetFromMetaTag,
kCharsetFromByteOrderMark,
kCharsetFromCache,
kCharsetFromHTTPHeader,
kCharsetFromUserForced,
kCharsetFromOtherComponent,

Просмотреть файл

@ -57,6 +57,7 @@ public:
NS_IMETHOD AddProcessingInstruction(const nsIParserNode& aNode);
NS_IMETHOD AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode=0);
NS_IMETHOD FlushPendingNotifications() { return NS_OK; }
NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset) { return NS_OK; }
// nsIHTMLContentSink
NS_IMETHOD SetTitle(const nsString& aValue);

Просмотреть файл

@ -39,12 +39,16 @@
#include "nsViewSourceHTML.h"
#include "nsIStringStream.h"
#include "nsIChannel.h"
#include "nsICachingChannel.h"
#include "nsICacheEntryDescriptor.h"
#include "nsICharsetAlias.h"
#include "nsIProgressEventSink.h"
#include "nsIInputStream.h"
#include "CNavDTD.h"
#include "COtherDTD.h"
#include "prenv.h"
#include "nsParserCIID.h"
#include "nsReadableUtils.h"
#include "nsCOMPtr.h"
#include "nsIEventQueue.h"
#include "nsIEventQueueService.h"
@ -543,6 +547,13 @@ void nsParser::SetDocumentCharset(nsString& aCharset, nsCharsetSource aCharsetSo
mParserContext->mScanner->SetDocumentCharset(aCharset, aCharsetSource);
}
/**
 * Forwards a charset name to the content sink, if one is attached.
 * Any result returned by the sink is ignored.
 *
 * @param aCharset charset name handed to the sink's SetDocumentCharset()
 */
void nsParser::SetSinkCharset(nsAWritableString& aCharset)
{
  if (!mSink) {
    return;  // no sink attached, nothing to notify
  }

  mSink->SetDocumentCharset(aCharset);
}
/**
* This method gets called in order to set the content
* sink for this parser to dump nodes to.
@ -613,14 +624,14 @@ nsDTDMode nsParser::GetParseMode(void){
}
template <class CharT>
class CWordTokenizer {
public:
CWordTokenizer(nsString& aString,PRInt32 aStartOffset,PRInt32 aMaxOffset) {
CWordTokenizer(const CharT* aBuffer,PRInt32 aStartOffset,PRInt32 aMaxOffset) {
mLength=0;
mOffset=aStartOffset;
mMaxOffset=aMaxOffset;
mBuffer=aString.GetUnicode();
mBuffer=aBuffer;
mEndBuffer=mBuffer+mMaxOffset;
}
@ -633,25 +644,33 @@ public:
// Returns offset of nth word, or -1 (if out of words).
//********************************************************************************
PRInt32 GetNextWord() {
PRInt32 GetNextWord(PRBool aSkipQuotes=PR_FALSE) {
const PRUnichar *cp=mBuffer+mOffset+mLength; //skip last word
const CharT *cp=mBuffer+mOffset+mLength; //skip last word
mLength=0; //reset this
mOffset=-1; //reset this
//now skip whitespace...
PRUnichar target=0;
CharT target=0;
PRBool done=PR_FALSE;
while((!done) && (cp++<mEndBuffer)) {
switch(*cp) {
case kSpace: case kNewLine:
case kCR: case kTab:
case kEqual:
continue;
case kQuote:
target=*cp;
if (aSkipQuotes) {
cp++;
}
done=PR_TRUE;
break;
case kMinus:
target=*cp;
done=PR_TRUE;
@ -665,7 +684,7 @@ public:
if(cp<mEndBuffer) {
const PRUnichar *firstcp=cp; //hang onto this...
const CharT *firstcp=cp; //hang onto this...
PRInt32 theDashCount=2;
cp++; //just skip first letter to simplify processing...
@ -693,7 +712,8 @@ public:
(kGreaterThan==*cp) ||
(kQuote==*cp) ||
(kCR==*cp) ||
(kTab==*cp)) {
(kTab==*cp) ||
(kEqual == *cp)) {
break;
}
}
@ -707,11 +727,15 @@ public:
return mOffset;
}
PRInt32 GetLength() const {
return mLength;
}
PRInt32 mOffset;
PRInt32 mMaxOffset;
PRInt32 mLength;
const PRUnichar* mBuffer;
const PRUnichar* mEndBuffer;
const CharT* mBuffer;
const CharT* mEndBuffer;
};
@ -848,7 +872,7 @@ void DetermineParseMode(nsString& aBuffer,nsDTDMode& aParseMode,eParserDocType&
if((kNotFound!=theGTPos) && (kNotFound!=theLTPos)) {
const PRUnichar* theBuffer=aBuffer.GetUnicode();
CWordTokenizer theTokenizer(aBuffer,theLTPos,theGTPos);
CWordTokenizer<PRUnichar> theTokenizer(theBuffer,theLTPos,theGTPos);
theOffset=theTokenizer.GetNextWord(); //try to find ?xml, !doctype, etc...
if((kNotFound!=theOffset) && (kNotFound!=theDocTypePos)) {
@ -2297,7 +2321,7 @@ nsresult nsParser::OnStartRequest(nsIRequest *request, nsISupports* aContext) {
#define UCS4_3412 "X-ISO-10646-UCS-4-3412"
#define UTF8 "UTF-8"
static PRBool detectByteOrderMark(const unsigned char* aBytes, PRInt32 aLen, nsString& oCharset, nsCharsetSource& oCharsetSource) {
static PRBool DetectByteOrderMark(const unsigned char* aBytes, PRInt32 aLen, nsString& oCharset, nsCharsetSource& oCharsetSource) {
oCharsetSource= kCharsetFromAutoDetection;
oCharset.SetLength(0);
// see http://www.w3.org/TR/1998/REC-xml-19980210#sec-oCharseting
@ -2407,11 +2431,121 @@ static PRBool detectByteOrderMark(const unsigned char* aBytes, PRInt32 aLen, nsS
return oCharset.Length() > 0;
}
static const char kHTTPEquivStr[] = "http-equiv";
static const PRInt32 kHTTPEquivStrLen = sizeof(kHTTPEquivStr)-1;
static const char kContentTypeStr[] = "Content-Type";
static const PRInt32 kContentTypeStrLen = sizeof(kContentTypeStr)-1;
static const char kContentStr[] = "content";
static const PRInt32 kContentStrLen = sizeof(kContentStr)-1;
static const char kCharsetStr[] = "charset";
static const PRInt32 kCharsetStrLen = sizeof(kCharsetStr)-1;
/**
 * Sniffs the beginning of a raw byte buffer for an HTML tag of the form
 * <META HTTP-EQUIV="Content-Type" CONTENT="type; charset=name">.
 *
 * Parsing is deliberately fast and loose: the buffer is searched for the
 * text "META" preceded by '<', and the attributes up to the next '>' are
 * split into words with CWordTokenizer.
 *
 * @param aBytes         raw bytes of the stream
 * @param aLen           number of bytes in aBytes; at most the first 2048
 *                       are examined
 * @param aCharset       [out] the charset name on success; empty otherwise
 * @param aCharsetSource [out] always set to kCharsetFromMetaTag
 * @return PR_TRUE if a single META tag supplied both the Content-Type
 *         http-equiv and a non-empty charset, PR_FALSE otherwise
 */
PRBool
nsParser::DetectMetaTag(const char* aBytes,
                        PRInt32 aLen,
                        nsString& aCharset,
                        nsCharsetSource& aCharsetSource)
{
  aCharsetSource = kCharsetFromMetaTag;
  aCharset.SetLength(0);

  // XXX Only look inside HTML documents for now. For XML
  // documents we should be looking inside the XMLDecl.
  if (!mParserContext->mMimeType.Equals(NS_ConvertASCIItoUCS2(kHTMLTextContentType))) {
    return PR_FALSE;
  }

  // Fast and loose parsing to determine if we have a complete
  // META tag in this block, looking up to 2k into it.
  nsDependentCString str(aBytes, PR_MIN(aLen, 2048));
  nsReadingIterator<char> begin, end;

  str.BeginReading(begin);
  str.EndReading(end);
  nsReadingIterator<char> tagStart(begin);
  nsReadingIterator<char> tagEnd(end);

  do {
    // Find the string META and make sure it's not right at the beginning
    if (CaseInsensitiveFindInReadable(NS_LITERAL_CSTRING("META"), tagStart, tagEnd) &&
        (tagStart != begin)) {
      // Back up one to confirm that this is a tag
      if (*--tagStart == '<') {
        // Judge every tag on its own: an http-equiv seen in one META tag
        // must not be combined with a charset found in a different one.
        PRBool foundContentType = PR_FALSE;
        aCharset.SetLength(0);

        const char* attrStart = tagEnd.get();
        const char* attrEnd;

        // Find the end of the tag. FindInReadable() rewrites both of its
        // iterator arguments to bracket the match, so search with a scratch
        // copy of |end|; passing |end| itself would shrink the buffer to the
        // first '>' and hide every later META tag from the outer loop.
        nsReadingIterator<char> closeEnd(end);
        FindInReadable(NS_LITERAL_CSTRING(">"), tagEnd, closeEnd);
        attrEnd = tagEnd.get();

        CWordTokenizer<char> tokenizer(attrStart, 0, attrEnd-attrStart);
        PRInt32 offset;

        // Start looking at the attributes
        while ((offset = tokenizer.GetNextWord()) != kNotFound) {
          // We need to have a HTTP-EQUIV attribute whose value is
          // "Content-Type"
          if ((tokenizer.GetLength() >= kHTTPEquivStrLen) &&
              (nsCRT::strncasecmp(attrStart+offset,
                                  kHTTPEquivStr, kHTTPEquivStrLen) == 0)) {
            if (((offset = tokenizer.GetNextWord(PR_TRUE)) != kNotFound) &&
                (tokenizer.GetLength() >= kContentTypeStrLen) &&
                (nsCRT::strncasecmp(attrStart+offset,
                                    kContentTypeStr, kContentTypeStrLen) == 0)) {
              foundContentType = PR_TRUE;
            }
            else {
              // Some other http-equiv: this tag cannot carry the charset.
              break;
            }
          }
          // And a CONTENT attribute
          else if ((tokenizer.GetLength() >= kContentStrLen) &&
                   (nsCRT::strncasecmp(attrStart+offset,
                                       kContentStr, kContentStrLen) == 0)) {
            // The next word is the value which itself needs to be parsed
            if ((offset = tokenizer.GetNextWord(PR_TRUE)) != kNotFound) {
              const char* contentStart = attrStart+offset;
              CWordTokenizer<char> contentTokenizer(contentStart, 0,
                                                    tokenizer.GetLength());

              // Read the content type
              if (contentTokenizer.GetNextWord() != kNotFound) {
                // Now see if we have a charset
                if (((offset = contentTokenizer.GetNextWord()) != kNotFound) &&
                    (contentTokenizer.GetLength() >= kCharsetStrLen) &&
                    (nsCRT::strncasecmp(contentStart+offset,
                                        kCharsetStr, kCharsetStrLen) == 0)) {
                  // The next word is the charset
                  if ((offset = contentTokenizer.GetNextWord()) != kNotFound) {
                    aCharset.Assign(NS_ConvertASCIItoUCS2(contentStart+offset,
                                                          contentTokenizer.GetLength()));
                  }
                }
              }
            }
          }
        }

        if (foundContentType && (aCharset.Length() > 0)) {
          return PR_TRUE;
        }
      }
    }

    // Resume the search after what was just examined, over the full
    // remainder of the sniffed range.
    tagStart = tagEnd;
    tagEnd = end;
  } while (tagStart != end);

  // Don't leave a charset behind that no Content-Type tag vouched for.
  aCharset.SetLength(0);
  return PR_FALSE;
}
// State handed to ParserWriteFunc as the closure argument of
// nsIInputStream::ReadSegments() (see the OnDataAvailable code that fills
// in a |pws| instance before calling ReadSegments).
typedef struct {
// Set when reading starts at offset 0 and the current charset source is
// weaker than kCharsetFromAutoDetection.
// NOTE(review): this looks like the pre-change flag that mNeedCharsetCheck
// replaces; both appear because this listing shows old and new diff lines.
PRBool mNeedCheckFirst4Bytes;
// Set when reading starts at offset 0 and the current charset source is
// weaker than kCharsetFromMetaTag; ParserWriteFunc clears it once it has
// run its META tag / byte order mark detection.
PRBool mNeedCharsetCheck;
// Parser used by ParserWriteFunc for DetectMetaTag(), SetDocumentCharset()
// and SetSinkCharset().
nsParser* mParser;
// Optional filter; ParserWriteFunc tests it for null before use.
nsIParserFilter* mParserFilter;
// Scanner of the current parser context (assigned from theContext->mScanner).
nsScanner* mScanner;
// The request being read; ParserWriteFunc queries it for nsICachingChannel
// in order to record the detected charset in the cache entry's metadata.
nsIRequest* mRequest;
} ParserWriteStruct;
/*
@ -2437,19 +2571,40 @@ ParserWriteFunc(nsIInputStream* in,
return NS_ERROR_FAILURE;
}
if(pws->mNeedCheckFirst4Bytes && (count >= 4)) {
if(pws->mNeedCharsetCheck) {
nsCharsetSource guessSource;
nsAutoString guess;
nsAutoString guess, preferred;
pws->mNeedCheckFirst4Bytes = PR_FALSE;
if(detectByteOrderMark((const unsigned char*)buf,
theNumRead, guess, guessSource))
{
pws->mNeedCharsetCheck = PR_FALSE;
if(pws->mParser->DetectMetaTag(buf, theNumRead,
guess, guessSource) ||
((count >= 4) &&
DetectByteOrderMark((const unsigned char*)buf,
theNumRead, guess, guessSource))) {
#ifdef DEBUG_XMLENCODING
printf("xmlencoding detect- %s\n", guess.ToNewCString());
printf("xmlencoding detect- %s\n", guess.ToNewCString());
#endif
pws->mParser->SetDocumentCharset(guess, guessSource);
}
nsCOMPtr<nsICharsetAlias> alias(do_GetService(NS_CHARSETALIAS_CONTRACTID));
result = alias->GetPreferred(guess, preferred);
if (NS_SUCCEEDED(result)) {
guess.Assign(preferred);
}
pws->mParser->SetDocumentCharset(guess, guessSource);
pws->mParser->SetSinkCharset(guess);
nsCOMPtr<nsICachingChannel> channel(do_QueryInterface(pws->mRequest));
if (channel) {
nsCOMPtr<nsISupports> cacheToken;
channel->GetCacheToken(getter_AddRefs(cacheToken));
if (cacheToken) {
nsCOMPtr<nsICacheEntryDescriptor> cacheDescriptor(do_QueryInterface(cacheToken));
if (cacheDescriptor) {
nsresult rv = cacheDescriptor->SetMetaDataElement("charset",
NS_ConvertUCS2toUTF8(guess).get());
NS_ASSERTION(NS_SUCCEEDED(rv),"cannot SetMetaDataElement");
}
}
}
}
}
if(pws->mParserFilter)
@ -2503,11 +2658,12 @@ NS_PRECONDITION(((eOnStart==mParserContext->mStreamListenerState)||(eOnDataAvail
PRUint32 totalRead;
ParserWriteStruct pws;
pws.mNeedCheckFirst4Bytes =
((0 == sourceOffset) && (mCharsetSource<kCharsetFromAutoDetection));
pws.mNeedCharsetCheck =
((0 == sourceOffset) && (mCharsetSource<kCharsetFromMetaTag));
pws.mParser = this;
pws.mParserFilter = mParserFilter;
pws.mScanner = theContext->mScanner;
pws.mRequest = request;
result = pIStream->ReadSegments(ParserWriteFunc, (void*)&pws, aLength, &totalRead);
if (NS_FAILED(result)) {

Просмотреть файл

@ -328,6 +328,17 @@ class nsParser : public nsIParser,
const nsString* aMimeType=nsnull,
nsDTDMode aDTDMode=eDTDMode_unknown);
/**
* Detects the existence of a META tag with charset information in
* the given buffer.
*/
PRBool DetectMetaTag(const char* aBytes,
PRInt32 aLen,
nsString& oCharset,
nsCharsetSource& oCharsetSource);
void SetSinkCharset(nsAWritableString& aCharset);
/**
* Removes continue parsing events
* @update kmcclusk 5/18/98

Просмотреть файл

@ -168,6 +168,7 @@ public:
NS_IMETHOD NotifyError(const nsParserError* aError);
NS_IMETHOD AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode=0);
NS_IMETHOD FlushPendingNotifications() { return NS_OK; }
NS_IMETHOD SetDocumentCharset(nsAWritableString& aCharset) { return NS_OK; }
// nsIXMLContentSink
NS_IMETHOD AddXMLDecl(const nsIParserNode& aNode);