Bug 631751 - Sniff Basic Latin BOMless UTF-16 for IE compat. r=bzbarsky, a=bzbarsky.

This commit is contained in:
Henri Sivonen 2011-02-16 08:40:35 +02:00
Родитель 13bff10470
Коммит f68ddc8cba
8 изменённых файлов: 130 добавлений и 7 удалений

Просмотреть файл

@ -70,6 +70,10 @@ _TEST_FILES = \
test_bug547487.html \
bug620106_text.html \
test_bug620106.html \
bug631751le_text.html \
test_bug631751le.html \
bug631751be_text.html \
test_bug631751be.html \
$(NULL)
libs:: $(_TEST_FILES)

Двоичные данные
extensions/universalchardet/tests/bug631751be_text.html Normal file

Двоичный файл не отображается.

Двоичные данные
extensions/universalchardet/tests/bug631751le_text.html Normal file

Двоичный файл не отображается.

Просмотреть файл

@ -0,0 +1,33 @@
<!DOCTYPE HTML>
<html>
<!--
https://bugzilla.mozilla.org/show_bug.cgi?id=631751
-->
<head>
<title>Test for Bug 631751</title>
<script type="text/javascript"
src="chrome://mochikit/content/MochiKit/packed.js"></script>
<script type="text/javascript"
src="chrome://mochikit/content/tests/SimpleTest/SimpleTest.js">
</script>
<script type="text/javascript" src="CharsetDetectionTests.js"></script>
<link rel="stylesheet" type="text/css"
href="chrome://mochikit/content/tests/SimpleTest/test.css" />
</head>
<body>
<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=631751">Mozilla Bug 631751</a>
<p id="display"></p>
<div id="content" style="display: none">
</div>
<iframe id="testframe"></iframe>
<pre id="test">
<script class="testbody" type="text/javascript">
/** Test for Bug 631751 **/
/* Note! This test uses the chardet test harness but doesn't test chardet! */
/* Calls the harness with the file bug631751be_text.html, the charset label
   "UTF-16BE" and an array holding a single empty string.
   NOTE(review): presumably the label is the charset the loaded document is
   expected to report, and the empty string selects no chardet detector so
   that only the parser's own sniffing is exercised - confirm against
   CharsetDetectionTests.js. */
CharsetDetectionTests("bug631751be_text.html",
"UTF-16BE",
new Array(""));
</script>
</pre>
</body>
</html>

Просмотреть файл

@ -0,0 +1,33 @@
<!DOCTYPE HTML>
<html>
<!--
https://bugzilla.mozilla.org/show_bug.cgi?id=631751
-->
<head>
<title>Test for Bug 631751</title>
<script type="text/javascript"
src="chrome://mochikit/content/MochiKit/packed.js"></script>
<script type="text/javascript"
src="chrome://mochikit/content/tests/SimpleTest/SimpleTest.js">
</script>
<script type="text/javascript" src="CharsetDetectionTests.js"></script>
<link rel="stylesheet" type="text/css"
href="chrome://mochikit/content/tests/SimpleTest/test.css" />
</head>
<body>
<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=631751">Mozilla Bug 631751</a>
<p id="display"></p>
<div id="content" style="display: none">
</div>
<iframe id="testframe"></iframe>
<pre id="test">
<script class="testbody" type="text/javascript">
/** Test for Bug 631751 **/
/* Note! This test uses the chardet test harness but doesn't test chardet! */
/* Calls the harness with the file bug631751le_text.html, the charset label
   "UTF-16LE" and an array holding a single empty string.
   NOTE(review): presumably the label is the charset the loaded document is
   expected to report, and the empty string selects no chardet detector so
   that only the parser's own sniffing is exercised - confirm against
   CharsetDetectionTests.js. */
CharsetDetectionTests("bug631751le_text.html",
"UTF-16LE",
new Array(""));
</script>
</pre>
</body>
</html>

Просмотреть файл

@ -334,6 +334,48 @@ nsHtml5StreamParser::SetupDecodingFromBom(const char* aCharsetName, const char*
return rv;
}
void
nsHtml5StreamParser::SniffBOMlessUTF16BasicLatin(const PRUint8* aFromSegment,
PRUint32 aCountToSniffingLimit)
{
// Make sure there's enough data. Require room for "<title></title>"
if (mSniffingLength + aCountToSniffingLimit < 30) {
return;
}
// even-numbered bytes tracked at 0, odd-numbered bytes tracked at 1
PRBool byteNonZero[2] = { PR_FALSE, PR_FALSE };
PRUint32 i = 0;
if (mSniffingBuffer) {
for (; i < mSniffingLength; ++i) {
if (mSniffingBuffer[i]) {
if (byteNonZero[1 - (i % 2)]) {
return;
}
byteNonZero[i % 2] = PR_TRUE;
}
}
}
if (aFromSegment) {
for (PRUint32 j = 0; j < aCountToSniffingLimit; ++j) {
if (aFromSegment[j]) {
if (byteNonZero[1 - ((i + j) % 2)]) {
return;
}
byteNonZero[(i + j) % 2] = PR_TRUE;
}
}
}
if (byteNonZero[0]) {
mCharset.Assign("UTF-16LE");
} else {
mCharset.Assign("UTF-16BE");
}
mCharsetSource = kCharsetFromIrreversibleAutoDetection;
mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
mFeedChardet = PR_FALSE;
}
nsresult
nsHtml5StreamParser::FinalizeSniffing(const PRUint8* aFromSegment, // can be null
PRUint32 aCount,
@ -346,6 +388,10 @@ nsHtml5StreamParser::FinalizeSniffing(const PRUint8* aFromSegment, // can be nul
mFeedChardet = PR_FALSE;
return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment, aCount, aWriteCount);
}
// Check for BOMless UTF-16 with Basic
// Latin content for compat with IE. See bug 631751.
SniffBOMlessUTF16BasicLatin(aFromSegment, aCountToSniffingLimit);
// the charset may have been set now
// maybe try chardet now;
if (mFeedChardet) {
PRBool dontFeed;

Просмотреть файл

@ -256,7 +256,13 @@ class nsHtml5StreamParser : public nsIStreamListener,
nsresult WriteStreamBytes(const PRUint8* aFromSegment,
PRUint32 aCount,
PRUint32* aWriteCount);
/**
 * Check whether every other byte in the sniffing buffer is zero, which
 * suggests BOMless UTF-16 with Basic Latin content. Examines the bytes
 * already held in the sniffing buffer followed by the given segment; does
 * nothing when fewer than 30 bytes are available or when non-zero bytes
 * occur at both even and odd offsets. When the data is accepted, the
 * charset is set to UTF-16LE or UTF-16BE and chardet is no longer fed.
 *
 * @param aFromSegment the current data segment (may be null)
 * @param aCountToSniffingLimit the number of bytes of aFromSegment to examine
 */
void SniffBOMlessUTF16BasicLatin(const PRUint8* aFromSegment,
PRUint32 aCountToSniffingLimit);
/**
* <meta charset> scan failed. Try chardet if applicable. After this, the
* parser will have some encoding even if a last resort fallback.

Просмотреть файл

@ -98,13 +98,14 @@ enum eParserDocType {
#define kCharsetFromHintPrevDoc 7
#define kCharsetFromMetaPrescan 8 // this one and smaller: HTML5 Tentative
#define kCharsetFromMetaTag 9 // this one and greater: HTML5 Confident
#define kCharsetFromByteOrderMark 10
#define kCharsetFromChannel 11
#define kCharsetFromOtherComponent 12
#define kCharsetFromIrreversibleAutoDetection 10
#define kCharsetFromByteOrderMark 11
#define kCharsetFromChannel 12
#define kCharsetFromOtherComponent 13
// Levels below here will be forced onto childframes too
#define kCharsetFromParentForced 13
#define kCharsetFromUserForced 14
#define kCharsetFromPreviousLoading 15
#define kCharsetFromParentForced 14
#define kCharsetFromUserForced 15
#define kCharsetFromPreviousLoading 16
enum eStreamState {eNone,eOnStart,eOnDataAvail,eOnStop};