зеркало из https://github.com/mozilla/gecko-dev.git
Bug 716579 - Let a BOM override HTTP-level charset in the HTML and XML parsers. r=smaug.
This commit is contained in:
Родитель
d3ba646030
Коммит
4038858f3f
|
@ -24,6 +24,7 @@
|
|||
#include "expat.h"
|
||||
#include "nsINestedURI.h"
|
||||
#include "nsCharsetSource.h"
|
||||
#include "nsIWyciwygChannel.h"
|
||||
|
||||
using namespace mozilla;
|
||||
|
||||
|
@ -495,8 +496,8 @@ nsHtml5StreamParser::FinalizeSniffing(const uint8_t* aFromSegment, // can be nul
|
|||
uint32_t aCountToSniffingLimit)
|
||||
{
|
||||
NS_ASSERTION(IsParserThread(), "Wrong thread!");
|
||||
NS_ASSERTION(mCharsetSource < kCharsetFromMetaTag,
|
||||
"Should not finalize sniffing when already confident.");
|
||||
NS_ASSERTION(mCharsetSource < kCharsetFromParentForced,
|
||||
"Should not finalize sniffing when using forced charset.");
|
||||
if (mMode == VIEW_SOURCE_XML) {
|
||||
static const XML_Memory_Handling_Suite memsuite =
|
||||
{
|
||||
|
@ -634,6 +635,11 @@ nsHtml5StreamParser::SniffStreamBytes(const uint8_t* aFromSegment,
|
|||
NS_ASSERTION(IsParserThread(), "Wrong thread!");
|
||||
nsresult rv = NS_OK;
|
||||
uint32_t writeCount;
|
||||
|
||||
// mCharset and mCharsetSource potentially have come from channel or higher
|
||||
// by now. If we find a BOM, SetupDecodingFromBom() will overwrite them.
|
||||
// If we don't find a BOM, the previously set values of mCharset and
|
||||
// mCharsetSource are not modified by the BOM sniffing here.
|
||||
for (uint32_t i = 0; i < aCount && mBomState != BOM_SNIFFING_OVER; i++) {
|
||||
switch (mBomState) {
|
||||
case BOM_SNIFFING_NOT_STARTED:
|
||||
|
@ -701,8 +707,36 @@ nsHtml5StreamParser::SniffStreamBytes(const uint8_t* aFromSegment,
|
|||
break;
|
||||
}
|
||||
}
|
||||
// if we get here, there either was no BOM or the BOM sniffing isn't complete yet
|
||||
// if we get here, there either was no BOM or the BOM sniffing isn't complete
|
||||
// yet
|
||||
|
||||
if (mBomState == BOM_SNIFFING_OVER &&
|
||||
mCharsetSource >= kCharsetFromChannel) {
|
||||
// There was no BOM and the charset came from channel or higher. mCharset
|
||||
// still contains the charset from the channel or higher as set by an
|
||||
// earlier call to SetDocumentCharset(), since we didn't find a BOM and
|
||||
// overwrite mCharset.
|
||||
nsCOMPtr<nsICharsetConverterManager> convManager =
|
||||
do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID);
|
||||
convManager->GetUnicodeDecoder(mCharset.get(),
|
||||
getter_AddRefs(mUnicodeDecoder));
|
||||
if (mUnicodeDecoder) {
|
||||
mUnicodeDecoder->SetInputErrorBehavior(
|
||||
nsIUnicodeDecoder::kOnError_Recover);
|
||||
mFeedChardet = false;
|
||||
mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
|
||||
mMetaScanner = nullptr;
|
||||
return WriteSniffingBufferAndCurrentSegment(aFromSegment,
|
||||
aCount,
|
||||
aWriteCount);
|
||||
} else {
|
||||
// nsHTMLDocument is supposed to make sure this does not happen. Let's
|
||||
// deal with this anyway, since who knows how kCharsetFromOtherComponent
|
||||
// is used.
|
||||
mCharsetSource = kCharsetFromWeakDocTypeDefault;
|
||||
}
|
||||
}
|
||||
|
||||
if (!mMetaScanner && (mMode == NORMAL ||
|
||||
mMode == VIEW_SOURCE_HTML ||
|
||||
mMode == LOAD_AS_DATA)) {
|
||||
|
@ -963,7 +997,13 @@ nsHtml5StreamParser::OnStartRequest(nsIRequest* aRequest, nsISupports* aContext)
|
|||
mFeedChardet = false;
|
||||
}
|
||||
|
||||
if (mCharsetSource <= kCharsetFromMetaPrescan) {
|
||||
nsCOMPtr<nsIWyciwygChannel> wyciwygChannel(do_QueryInterface(mRequest));
|
||||
if (wyciwygChannel) {
|
||||
mReparseForbidden = true;
|
||||
mFeedChardet = false;
|
||||
// If we are reloading a document.open()ed doc, fall through to converter
|
||||
// instantiation here and avoid BOM sniffing.
|
||||
} else if (mCharsetSource < kCharsetFromParentForced) {
|
||||
// we aren't ready to commit to an encoding yet
|
||||
// leave converter uninstantiated for now
|
||||
return NS_OK;
|
||||
|
|
|
@ -41,6 +41,7 @@
|
|||
#include "mozilla/Mutex.h"
|
||||
#include "nsParserConstants.h"
|
||||
#include "nsCharsetSource.h"
|
||||
#include "nsContentUtils.h"
|
||||
|
||||
using namespace mozilla;
|
||||
|
||||
|
@ -1250,8 +1251,7 @@ nsParser::Parse(nsIURI* aURL,
|
|||
}
|
||||
NS_ConvertUTF8toUTF16 theName(spec);
|
||||
|
||||
nsScanner* theScanner = new nsScanner(theName, false, mCharset,
|
||||
mCharsetSource);
|
||||
nsScanner* theScanner = new nsScanner(theName, false);
|
||||
CParserContext* pc = new CParserContext(mParserContext, theScanner, aKey,
|
||||
mCommand, aListener);
|
||||
if (pc && theScanner) {
|
||||
|
@ -1311,7 +1311,7 @@ nsParser::Parse(const nsAString& aSourceBuffer,
|
|||
if (!pc) {
|
||||
// Only make a new context if we don't have one, OR if we do, but has a
|
||||
// different context key.
|
||||
nsScanner* theScanner = new nsScanner(mUnusedInput, mCharset, mCharsetSource);
|
||||
nsScanner* theScanner = new nsScanner(mUnusedInput);
|
||||
NS_ENSURE_TRUE(theScanner, NS_ERROR_OUT_OF_MEMORY);
|
||||
|
||||
eAutoDetectResult theStatus = eUnknownDetect;
|
||||
|
@ -1674,11 +1674,6 @@ nsParser::OnStartRequest(nsIRequest *request, nsISupports* aContext)
|
|||
}
|
||||
|
||||
|
||||
#define UTF16_BOM "UTF-16"
|
||||
#define UTF16_BE "UTF-16BE"
|
||||
#define UTF16_LE "UTF-16LE"
|
||||
#define UTF8 "UTF-8"
|
||||
|
||||
static inline bool IsSecondMarker(unsigned char aChar)
|
||||
{
|
||||
switch (aChar) {
|
||||
|
@ -1693,146 +1688,87 @@ static inline bool IsSecondMarker(unsigned char aChar)
|
|||
}
|
||||
|
||||
static bool
|
||||
DetectByteOrderMark(const unsigned char* aBytes, int32_t aLen,
|
||||
nsCString& oCharset, int32_t& oCharsetSource)
|
||||
ExtractCharsetFromXmlDeclaration(const unsigned char* aBytes, int32_t aLen,
|
||||
nsCString& oCharset)
|
||||
{
|
||||
oCharsetSource= kCharsetFromAutoDetection;
|
||||
oCharset.Truncate();
|
||||
// See http://www.w3.org/TR/2000/REC-xml-20001006#sec-guessing
|
||||
// for details
|
||||
// Also, MS Win2K notepad now generate 3 bytes BOM in UTF8 as UTF8 signature
|
||||
// We need to check that
|
||||
// UCS2 BOM FEFF = UTF8 EF BB BF
|
||||
switch(aBytes[0])
|
||||
{
|
||||
case 0x00:
|
||||
if((0x3C==aBytes[1]) && (0x00==aBytes[2])) {
|
||||
// 00 3C 00
|
||||
if(IsSecondMarker(aBytes[3])) {
|
||||
// 00 3C 00 SM UTF-16, big-endian, no Byte Order Mark
|
||||
oCharset.Assign(UTF16_BE);
|
||||
oCharsetSource = kCharsetFromByteOrderMark;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 0x3C:
|
||||
if(0x00==aBytes[1] && (0x00==aBytes[3])) {
|
||||
// 3C 00 XX 00
|
||||
if(IsSecondMarker(aBytes[2])) {
|
||||
// 3C 00 SM 00 UTF-16, little-endian, no Byte Order Mark
|
||||
oCharset.Assign(UTF16_LE);
|
||||
oCharsetSource = kCharsetFromByteOrderMark;
|
||||
}
|
||||
// For html, meta tag detector is invoked before this so that we have
|
||||
// to deal only with XML here.
|
||||
} else if( (0x3F==aBytes[1]) &&
|
||||
(0x78==aBytes[2]) && (0x6D==aBytes[3]) &&
|
||||
(0 == PL_strncmp("<?xml", (char*)aBytes, 5 ))) {
|
||||
// 3C 3F 78 6D
|
||||
// ASCII characters are in their normal positions, so we can safely
|
||||
// deal with the XML declaration in the old C way
|
||||
// The shortest string so far (strlen==5):
|
||||
// <?xml
|
||||
int32_t i;
|
||||
bool versionFound = false, encodingFound = false;
|
||||
for (i=6; i < aLen && !encodingFound; ++i) {
|
||||
// end of XML declaration?
|
||||
if ((((char*)aBytes)[i] == '?') &&
|
||||
((i+1) < aLen) &&
|
||||
(((char*)aBytes)[i+1] == '>')) {
|
||||
break;
|
||||
}
|
||||
// Version is required.
|
||||
if (!versionFound) {
|
||||
// Want to avoid string comparisons, hence looking for 'n'
|
||||
// and only if found check the string leading to it. Not
|
||||
// foolproof, but fast.
|
||||
// The shortest string allowed before this is (strlen==13):
|
||||
// <?xml version
|
||||
if ((((char*)aBytes)[i] == 'n') &&
|
||||
(i >= 12) &&
|
||||
(0 == PL_strncmp("versio", (char*)(aBytes+i-6), 6 ))) {
|
||||
// Fast forward through version
|
||||
char q = 0;
|
||||
for (++i; i < aLen; ++i) {
|
||||
char qi = ((char*)aBytes)[i];
|
||||
if (qi == '\'' || qi == '"') {
|
||||
if (q && q == qi) {
|
||||
// ending quote
|
||||
versionFound = true;
|
||||
break;
|
||||
} else {
|
||||
// Starting quote
|
||||
q = qi;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// encoding must follow version
|
||||
// Want to avoid string comparisons, hence looking for 'g'
|
||||
// and only if found check the string leading to it. Not
|
||||
// foolproof, but fast.
|
||||
// The shortest allowed string before this (strlen==26):
|
||||
// <?xml version="1" encoding
|
||||
if ((((char*)aBytes)[i] == 'g') &&
|
||||
(i >= 25) &&
|
||||
(0 == PL_strncmp("encodin", (char*)(aBytes+i-7), 7 ))) {
|
||||
int32_t encStart = 0;
|
||||
char q = 0;
|
||||
for (++i; i < aLen; ++i) {
|
||||
char qi = ((char*)aBytes)[i];
|
||||
if (qi == '\'' || qi == '"') {
|
||||
if (q && q == qi) {
|
||||
int32_t count = i - encStart;
|
||||
// encoding value is invalid if it is UTF-16
|
||||
if (count > 0 &&
|
||||
(0 != PL_strcmp("UTF-16", (char*)(aBytes+encStart)))) {
|
||||
oCharset.Assign((char*)(aBytes+encStart),count);
|
||||
oCharsetSource = kCharsetFromMetaTag;
|
||||
}
|
||||
encodingFound = true;
|
||||
break;
|
||||
} else {
|
||||
encStart = i+1;
|
||||
q = qi;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} // if (!versionFound)
|
||||
} // for
|
||||
}
|
||||
break;
|
||||
case 0xEF:
|
||||
if((0xBB==aBytes[1]) && (0xBF==aBytes[2])) {
|
||||
// EF BB BF
|
||||
// Win2K UTF-8 BOM
|
||||
oCharset.Assign(UTF8);
|
||||
oCharsetSource= kCharsetFromByteOrderMark;
|
||||
}
|
||||
break;
|
||||
case 0xFE:
|
||||
if(0xFF==aBytes[1]) {
|
||||
// FE FF UTF-16, big-endian
|
||||
oCharset.Assign(UTF16_BOM);
|
||||
oCharsetSource= kCharsetFromByteOrderMark;
|
||||
}
|
||||
break;
|
||||
case 0xFF:
|
||||
if(0xFE==aBytes[1]) {
|
||||
// FF FE
|
||||
// UTF-16, little-endian
|
||||
oCharset.Assign(UTF16_BOM);
|
||||
oCharsetSource= kCharsetFromByteOrderMark;
|
||||
}
|
||||
break;
|
||||
// case 0x4C: if((0x6F==aBytes[1]) && ((0xA7==aBytes[2] && (0x94==aBytes[3])) {
|
||||
// We do not care EBCIDIC here....
|
||||
// }
|
||||
// break;
|
||||
} // switch
|
||||
return !oCharset.IsEmpty();
|
||||
// This code is rather pointless to have. Might as well reuse expat as
|
||||
// seen in nsHtml5StreamParser. -- hsivonen
|
||||
oCharset.Truncate();
|
||||
if ((aLen >= 5) &&
|
||||
('<' == aBytes[0]) &&
|
||||
('?' == aBytes[1]) &&
|
||||
('x' == aBytes[2]) &&
|
||||
('m' == aBytes[3]) &&
|
||||
('l' == aBytes[4])) {
|
||||
int32_t i;
|
||||
bool versionFound = false, encodingFound = false;
|
||||
for (i = 6; i < aLen && !encodingFound; ++i) {
|
||||
// end of XML declaration?
|
||||
if ((((char*) aBytes)[i] == '?') &&
|
||||
((i + 1) < aLen) &&
|
||||
(((char*) aBytes)[i + 1] == '>')) {
|
||||
break;
|
||||
}
|
||||
// Version is required.
|
||||
if (!versionFound) {
|
||||
// Want to avoid string comparisons, hence looking for 'n'
|
||||
// and only if found check the string leading to it. Not
|
||||
// foolproof, but fast.
|
||||
// The shortest string allowed before this is (strlen==13):
|
||||
// <?xml version
|
||||
if ((((char*) aBytes)[i] == 'n') &&
|
||||
(i >= 12) &&
|
||||
(0 == PL_strncmp("versio", (char*) (aBytes + i - 6), 6))) {
|
||||
// Fast forward through version
|
||||
char q = 0;
|
||||
for (++i; i < aLen; ++i) {
|
||||
char qi = ((char*) aBytes)[i];
|
||||
if (qi == '\'' || qi == '"') {
|
||||
if (q && q == qi) {
|
||||
// ending quote
|
||||
versionFound = true;
|
||||
break;
|
||||
} else {
|
||||
// Starting quote
|
||||
q = qi;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// encoding must follow version
|
||||
// Want to avoid string comparisons, hence looking for 'g'
|
||||
// and only if found check the string leading to it. Not
|
||||
// foolproof, but fast.
|
||||
// The shortest allowed string before this (strlen==26):
|
||||
// <?xml version="1" encoding
|
||||
if ((((char*) aBytes)[i] == 'g') && (i >= 25) && (0 == PL_strncmp(
|
||||
"encodin", (char*) (aBytes + i - 7), 7))) {
|
||||
int32_t encStart = 0;
|
||||
char q = 0;
|
||||
for (++i; i < aLen; ++i) {
|
||||
char qi = ((char*) aBytes)[i];
|
||||
if (qi == '\'' || qi == '"') {
|
||||
if (q && q == qi) {
|
||||
int32_t count = i - encStart;
|
||||
// encoding value is invalid if it is UTF-16
|
||||
if (count > 0 && (0 != PL_strcmp("UTF-16",
|
||||
(char*) (aBytes + encStart)))) {
|
||||
oCharset.Assign((char*) (aBytes + encStart), count);
|
||||
}
|
||||
encodingFound = true;
|
||||
break;
|
||||
} else {
|
||||
encStart = i + 1;
|
||||
q = qi;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} // if (!versionFound)
|
||||
} // for
|
||||
}
|
||||
return !oCharset.IsEmpty();
|
||||
}
|
||||
|
||||
inline const char
|
||||
|
@ -1843,131 +1779,6 @@ GetNextChar(nsACString::const_iterator& aStart,
|
|||
return (++aStart != aEnd) ? *aStart : '\0';
|
||||
}
|
||||
|
||||
bool
|
||||
nsParser::DetectMetaTag(const char* aBytes,
|
||||
int32_t aLen,
|
||||
nsCString& aCharset,
|
||||
int32_t& aCharsetSource)
|
||||
{
|
||||
aCharsetSource= kCharsetFromMetaTag;
|
||||
aCharset.SetLength(0);
|
||||
|
||||
// XXX Only look inside HTML documents for now. For XML
|
||||
// documents we should be looking inside the XMLDecl.
|
||||
if (!mParserContext->mMimeType.EqualsLiteral(TEXT_HTML)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Fast and loose parsing to determine if we have a complete
|
||||
// META tag in this block, looking upto 2k into it.
|
||||
const nsASingleFragmentCString& str =
|
||||
Substring(aBytes, aBytes + NS_MIN(aLen, 2048));
|
||||
// XXXldb Should be const_char_iterator when FindInReadable supports it.
|
||||
nsACString::const_iterator begin, end;
|
||||
|
||||
str.BeginReading(begin);
|
||||
str.EndReading(end);
|
||||
nsACString::const_iterator currPos(begin);
|
||||
nsACString::const_iterator tokEnd;
|
||||
nsACString::const_iterator tagEnd(begin);
|
||||
|
||||
while (currPos != end) {
|
||||
if (!FindCharInReadable('<', currPos, end))
|
||||
break; // no tag found in this buffer
|
||||
|
||||
if (GetNextChar(currPos, end) == '!') {
|
||||
if (GetNextChar(currPos, end) != '-' ||
|
||||
GetNextChar(currPos, end) != '-') {
|
||||
// If we only see a <! not followed by --, just skip to the next >.
|
||||
if (!FindCharInReadable('>', currPos, end)) {
|
||||
return false; // No more tags to follow.
|
||||
}
|
||||
|
||||
// Continue searching for a meta tag following this "comment".
|
||||
++currPos;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Found MDO ( <!-- ). Now search for MDC ( --[*s]> )
|
||||
bool foundMDC = false;
|
||||
bool foundMatch = false;
|
||||
while (!foundMDC) {
|
||||
if (GetNextChar(currPos, end) == '-' &&
|
||||
GetNextChar(currPos, end) == '-') {
|
||||
foundMatch = !foundMatch; // toggle until we've matching "--"
|
||||
} else if (currPos == end) {
|
||||
return false; // Couldn't find --[*s]> in this buffer
|
||||
} else if (foundMatch && *currPos == '>') {
|
||||
foundMDC = true; // found comment end delimiter.
|
||||
++currPos;
|
||||
}
|
||||
}
|
||||
continue; // continue searching for META tag.
|
||||
}
|
||||
|
||||
// Find the end of the tag, break if incomplete
|
||||
tagEnd = currPos;
|
||||
if (!FindCharInReadable('>', tagEnd, end))
|
||||
break;
|
||||
|
||||
// If this is not a META tag, continue to next loop
|
||||
if ( (*currPos != 'm' && *currPos != 'M') ||
|
||||
(*(++currPos) != 'e' && *currPos != 'E') ||
|
||||
(*(++currPos) != 't' && *currPos != 'T') ||
|
||||
(*(++currPos) != 'a' && *currPos != 'A') ||
|
||||
!nsCRT::IsAsciiSpace(*(++currPos))) {
|
||||
currPos = tagEnd;
|
||||
continue;
|
||||
}
|
||||
|
||||
// If could not find "charset" in this tag, skip this tag and try next
|
||||
tokEnd = tagEnd;
|
||||
if (!CaseInsensitiveFindInReadable(NS_LITERAL_CSTRING("CHARSET"),
|
||||
currPos, tokEnd)) {
|
||||
currPos = tagEnd;
|
||||
continue;
|
||||
}
|
||||
currPos = tokEnd;
|
||||
|
||||
// skip spaces before '='
|
||||
while (*currPos == kSpace || *currPos == kNewLine ||
|
||||
*currPos == kCR || *currPos == kTab) {
|
||||
++currPos;
|
||||
}
|
||||
// skip '='
|
||||
if (*currPos != '=') {
|
||||
currPos = tagEnd;
|
||||
continue;
|
||||
}
|
||||
++currPos;
|
||||
// skip spaces after '='
|
||||
while (*currPos == kSpace || *currPos == kNewLine ||
|
||||
*currPos == kCR || *currPos == kTab) {
|
||||
++currPos;
|
||||
}
|
||||
|
||||
// skip open quote
|
||||
if (*currPos == '\'' || *currPos == '\"')
|
||||
++currPos;
|
||||
|
||||
// find the end of charset string
|
||||
tokEnd = currPos;
|
||||
while (*tokEnd != '\'' && *tokEnd != '\"' && tokEnd != tagEnd)
|
||||
++tokEnd;
|
||||
|
||||
// return true if we successfully got something for charset
|
||||
if (currPos != tokEnd) {
|
||||
aCharset.Assign(currPos.get(), tokEnd.get() - currPos.get());
|
||||
return true;
|
||||
}
|
||||
|
||||
// Nothing specified as charset, continue next loop
|
||||
currPos = tagEnd;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static NS_METHOD
|
||||
NoOpParserWriteFunc(nsIInputStream* in,
|
||||
void* closure,
|
||||
|
@ -2003,7 +1814,8 @@ ParserWriteFunc(nsIInputStream* in,
|
|||
{
|
||||
nsresult result;
|
||||
ParserWriteStruct* pws = static_cast<ParserWriteStruct*>(closure);
|
||||
const char* buf = fromRawSegment;
|
||||
const unsigned char* buf =
|
||||
reinterpret_cast<const unsigned char*> (fromRawSegment);
|
||||
uint32_t theNumRead = count;
|
||||
|
||||
if (!pws) {
|
||||
|
@ -2011,47 +1823,37 @@ ParserWriteFunc(nsIInputStream* in,
|
|||
}
|
||||
|
||||
if (pws->mNeedCharsetCheck) {
|
||||
int32_t guessSource;
|
||||
nsAutoCString guess;
|
||||
nsAutoCString preferred;
|
||||
|
||||
pws->mNeedCharsetCheck = false;
|
||||
if (pws->mParser->DetectMetaTag(buf, theNumRead, guess, guessSource) ||
|
||||
((count >= 4) &&
|
||||
DetectByteOrderMark((const unsigned char*)buf,
|
||||
theNumRead, guess, guessSource))) {
|
||||
result = nsCharsetAlias::GetPreferred(guess, preferred);
|
||||
// Only continue if it's a recognized charset and not
|
||||
// one of a designated set that we ignore.
|
||||
if (NS_SUCCEEDED(result) &&
|
||||
((kCharsetFromByteOrderMark == guessSource) ||
|
||||
(!preferred.EqualsLiteral("UTF-16") &&
|
||||
!preferred.EqualsLiteral("UTF-16BE") &&
|
||||
!preferred.EqualsLiteral("UTF-16LE")))) {
|
||||
guess = preferred;
|
||||
pws->mParser->SetDocumentCharset(guess, guessSource);
|
||||
pws->mParser->SetSinkCharset(preferred);
|
||||
nsCOMPtr<nsICachingChannel> channel(do_QueryInterface(pws->mRequest));
|
||||
if (channel) {
|
||||
nsCOMPtr<nsISupports> cacheToken;
|
||||
channel->GetCacheToken(getter_AddRefs(cacheToken));
|
||||
if (cacheToken) {
|
||||
nsCOMPtr<nsICacheEntryDescriptor> cacheDescriptor(do_QueryInterface(cacheToken));
|
||||
if (cacheDescriptor) {
|
||||
#ifdef DEBUG
|
||||
nsresult rv =
|
||||
#endif
|
||||
cacheDescriptor->SetMetaDataElement("charset",
|
||||
guess.get());
|
||||
NS_ASSERTION(NS_SUCCEEDED(rv),"cannot SetMetaDataElement");
|
||||
}
|
||||
}
|
||||
int32_t source;
|
||||
nsAutoCString preferred;
|
||||
nsAutoCString maybePrefer;
|
||||
pws->mParser->GetDocumentCharset(preferred, source);
|
||||
|
||||
// This code was bogus when I found it. It expects the BOM or the XML
|
||||
// declaration to be entirely in the first network buffer. -- hsivonen
|
||||
if (nsContentUtils::CheckForBOM(buf, count, maybePrefer)) {
|
||||
// The decoder will swallow the BOM. The UTF-16 decoder will re-sniff for
|
||||
// endianness. The value of preferred is now either "UTF-8" or "UTF-16".
|
||||
preferred.Assign(maybePrefer);
|
||||
source = kCharsetFromByteOrderMark;
|
||||
} else if (source < kCharsetFromChannel) {
|
||||
nsAutoCString declCharset;
|
||||
|
||||
if (ExtractCharsetFromXmlDeclaration(buf, count, declCharset)) {
|
||||
nsresult rv = nsCharsetAlias::GetPreferred(declCharset, maybePrefer);
|
||||
if (NS_SUCCEEDED(rv)) {
|
||||
preferred.Assign(maybePrefer);
|
||||
source = kCharsetFromMetaTag;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pws->mParser->SetDocumentCharset(preferred, source);
|
||||
pws->mParser->SetSinkCharset(preferred);
|
||||
|
||||
}
|
||||
|
||||
result = pws->mScanner->Append(buf, theNumRead, pws->mRequest);
|
||||
result = pws->mScanner->Append(fromRawSegment, theNumRead, pws->mRequest);
|
||||
if (NS_SUCCEEDED(result)) {
|
||||
*writeCount = count;
|
||||
}
|
||||
|
@ -2103,8 +1905,7 @@ nsParser::OnDataAvailable(nsIRequest *request, nsISupports* aContext,
|
|||
|
||||
uint32_t totalRead;
|
||||
ParserWriteStruct pws;
|
||||
pws.mNeedCharsetCheck =
|
||||
(0 == sourceOffset) && (mCharsetSource < kCharsetFromMetaTag);
|
||||
pws.mNeedCharsetCheck = true;
|
||||
pws.mParser = this;
|
||||
pws.mScanner = theContext->mScanner;
|
||||
pws.mRequest = request;
|
||||
|
|
|
@ -246,15 +246,6 @@ class nsParser : public nsIParser,
|
|||
*/
|
||||
virtual nsIStreamListener* GetStreamListener();
|
||||
|
||||
/**
|
||||
* Detects the existence of a META tag with charset information in
|
||||
* the given buffer.
|
||||
*/
|
||||
bool DetectMetaTag(const char* aBytes,
|
||||
int32_t aLen,
|
||||
nsCString& oCharset,
|
||||
int32_t& oCharsetSource);
|
||||
|
||||
void SetSinkCharset(nsACString& aCharset);
|
||||
|
||||
/**
|
||||
|
|
|
@ -57,8 +57,7 @@ const int kBufsize=64;
|
|||
* @param aMode represents the parser mode (nav, other)
|
||||
* @return
|
||||
*/
|
||||
nsScanner::nsScanner(const nsAString& anHTMLString, const nsACString& aCharset,
|
||||
int32_t aSource)
|
||||
nsScanner::nsScanner(const nsAString& anHTMLString)
|
||||
{
|
||||
MOZ_COUNT_CTOR(nsScanner);
|
||||
|
||||
|
@ -84,13 +83,8 @@ nsScanner::nsScanner(const nsAString& anHTMLString, const nsACString& aCharset,
|
|||
* Use this constructor if you want i/o to be based on strings
|
||||
* the scanner receives. If you pass a null filename, you
|
||||
* can still provide data to the scanner via append.
|
||||
*
|
||||
* @update gess 5/12/98
|
||||
* @param aFilename --
|
||||
* @return
|
||||
*/
|
||||
nsScanner::nsScanner(nsString& aFilename,bool aCreateStream,
|
||||
const nsACString& aCharset, int32_t aSource)
|
||||
nsScanner::nsScanner(nsString& aFilename, bool aCreateStream)
|
||||
: mFilename(aFilename)
|
||||
{
|
||||
MOZ_COUNT_CTOR(nsScanner);
|
||||
|
@ -115,7 +109,8 @@ nsScanner::nsScanner(nsString& aFilename,bool aCreateStream,
|
|||
mCharsetSource = kCharsetUninitialized;
|
||||
mHasInvalidCharacter = false;
|
||||
mReplacementCharacter = PRUnichar(0x0);
|
||||
SetDocumentCharset(aCharset, aSource);
|
||||
// XML defaults to UTF-8 and about:blank is UTF-8, too.
|
||||
SetDocumentCharset(NS_LITERAL_CSTRING("UTF-8"), kCharsetFromDocTypeDefault);
|
||||
}
|
||||
|
||||
nsresult nsScanner::SetDocumentCharset(const nsACString& aCharset , int32_t aSource)
|
||||
|
@ -130,6 +125,7 @@ nsresult nsScanner::SetDocumentCharset(const nsACString& aCharset , int32_t aSou
|
|||
res = nsCharsetAlias::Equals(aCharset, mCharset, &same);
|
||||
if(NS_SUCCEEDED(res) && same)
|
||||
{
|
||||
mCharsetSource = aSource;
|
||||
return NS_OK; // no difference, don't change it
|
||||
}
|
||||
}
|
||||
|
@ -137,16 +133,9 @@ nsresult nsScanner::SetDocumentCharset(const nsACString& aCharset , int32_t aSou
|
|||
// different, need to change it
|
||||
nsCString charsetName;
|
||||
res = nsCharsetAlias::GetPreferred(aCharset, charsetName);
|
||||
MOZ_ASSERT(NS_SUCCEEDED(res), "Should never call with a bogus aCharset.");
|
||||
|
||||
if(NS_FAILED(res) && (mCharsetSource == kCharsetUninitialized))
|
||||
{
|
||||
// failed - unknown alias , fallback to ISO-8859-1
|
||||
mCharset.AssignLiteral("ISO-8859-1");
|
||||
}
|
||||
else
|
||||
{
|
||||
mCharset.Assign(charsetName);
|
||||
}
|
||||
mCharset.Assign(charsetName);
|
||||
|
||||
mCharsetSource = aSource;
|
||||
|
||||
|
|
|
@ -42,29 +42,15 @@ class nsScanner {
|
|||
public:
|
||||
|
||||
/**
|
||||
* Use this constructor if you want i/o to be based on
|
||||
* a single string you hand in during construction.
|
||||
* This short cut was added for Javascript.
|
||||
*
|
||||
* @update ftang 3/02/99
|
||||
* @param aCharset charset
|
||||
* @param aCharsetSource - where the charset info came from
|
||||
* @param aMode represents the parser mode (nav, other)
|
||||
* @return
|
||||
* Use this constructor for the XML fragment parsing case
|
||||
*/
|
||||
nsScanner(const nsAString& anHTMLString, const nsACString& aCharset, int32_t aSource);
|
||||
nsScanner(const nsAString& anHTMLString);
|
||||
|
||||
/**
|
||||
* Use this constructor if you want i/o to be based on
|
||||
* a file (therefore a stream) or just data you provide via Append().
|
||||
*
|
||||
* @update ftang 3/02/99
|
||||
* @param aCharset charset
|
||||
* @param aCharsetSource - where the charset info came from
|
||||
* @param aMode represents the parser mode (nav, other)
|
||||
* @return
|
||||
*/
|
||||
nsScanner(nsString& aFilename,bool aCreateStream, const nsACString& aCharset, int32_t aSource);
|
||||
nsScanner(nsString& aFilename, bool aCreateStream);
|
||||
|
||||
~nsScanner();
|
||||
|
||||
|
|
|
@ -75,6 +75,15 @@ MOCHITEST_FILES = parser_datreader.js \
|
|||
test_viewsource.html \
|
||||
test_bug715112.html \
|
||||
test_bug715739.html \
|
||||
test_bug716579.html \
|
||||
file_bug716579-8.html \
|
||||
file_bug716579-8.html^headers^ \
|
||||
file_bug716579-16.html \
|
||||
file_bug716579-16.html^headers^ \
|
||||
file_bug716579-8.xhtml \
|
||||
file_bug716579-8.xhtml^headers^ \
|
||||
file_bug716579-16.xhtml \
|
||||
file_bug716579-16.xhtml^headers^ \
|
||||
test_bug717180.html \
|
||||
file_bug717180.html \
|
||||
$(NULL)
|
||||
|
|
Двоичный файл не отображается.
|
@ -0,0 +1 @@
|
|||
Content-Type: text/html; charset=windows-874
|
Двоичный файл не отображается.
|
@ -0,0 +1 @@
|
|||
Content-Type: application/xhtml+xml; charset=windows-874
|
|
@ -0,0 +1,3 @@
|
|||
<script>
|
||||
parent.html8 = "€";
|
||||
</script>
|
|
@ -0,0 +1 @@
|
|||
Content-Type: text/html; charset=windows-874
|
|
@ -0,0 +1,7 @@
|
|||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<body>
|
||||
<script>
|
||||
parent.xml8 = "€";
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1 @@
|
|||
Content-Type: application/xhtml+xml; charset=windows-874
|
|
@ -0,0 +1,44 @@
|
|||
<!DOCTYPE HTML>
|
||||
<html>
|
||||
<!--
|
||||
https://bugzilla.mozilla.org/show_bug.cgi?id=716579
|
||||
-->
|
||||
<head>
|
||||
<meta charset="windows-1251">
|
||||
<title>Test for Bug 716579</title>
|
||||
<script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
|
||||
<link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
|
||||
</head>
|
||||
<body>
|
||||
<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=716579">Mozilla Bug 716579</a>
|
||||
<p id="display"></p>
|
||||
<pre id="test">
|
||||
<script type="application/javascript">
|
||||
|
||||
/** Test for Bug 716579 **/
|
||||
|
||||
var html8 = "FAIL";
|
||||
var html16 = "FAIL";
|
||||
var xml8 = "FAIL";
|
||||
var xml16 = "FAIL";
|
||||
|
||||
SimpleTest.waitForExplicitFinish();
|
||||
|
||||
window.onload = function() {
|
||||
is(html8, "\u20AC", "HTML UTF-8 failed.");
|
||||
is(html16, "\u20AC", "HTML UTF-16 failed.");
|
||||
is(xml8, "\u20AC", "XML UTF-8 failed.");
|
||||
is(xml16, "\u20AC", "XML UTF-16 failed.");
|
||||
SimpleTest.finish();
|
||||
};
|
||||
|
||||
</script>
|
||||
</pre>
|
||||
<div id="content" style="display: none">
|
||||
<iframe src="file_bug716579-8.html"></iframe>
|
||||
<iframe src="file_bug716579-16.html"></iframe>
|
||||
<iframe src="file_bug716579-8.xhtml"></iframe>
|
||||
<iframe src="file_bug716579-16.xhtml"></iframe>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
|
@ -17,9 +17,9 @@
|
|||
#define kCharsetFromMetaPrescan 8 // this one and smaller: HTML5 Tentative
|
||||
#define kCharsetFromMetaTag 9 // this one and greater: HTML5 Confident
|
||||
#define kCharsetFromIrreversibleAutoDetection 10
|
||||
#define kCharsetFromByteOrderMark 11
|
||||
#define kCharsetFromChannel 12
|
||||
#define kCharsetFromOtherComponent 13
|
||||
#define kCharsetFromChannel 11
|
||||
#define kCharsetFromOtherComponent 12
|
||||
#define kCharsetFromByteOrderMark 13
|
||||
// Levels below here will be forced onto childframes too
|
||||
#define kCharsetFromParentForced 14
|
||||
#define kCharsetFromUserForced 15
|
||||
|
|
Загрузка…
Ссылка в новой задаче