Bug 1727491 - Remove support for BOMless unlabeled Latin1 Supplement-range UTF-16LE|BE. r=emk

Differential Revision: https://phabricator.services.mozilla.com/D123596
This commit is contained in:
Henri Sivonen 2021-09-01 09:13:29 +00:00
Родитель 9fa1f1913b
Коммит 5397b4f0a9
10 изменённых файлов: 3 добавлений и 61 удалений

Двоичные данные
dom/html/test/test_bug615595.html

Двоичный файл не отображается.

Просмотреть файл

@ -12,7 +12,6 @@ EncLateMetaReload=The page was reloaded, because the character encoding declarat
EncLateMetaTooLate=The character encoding declaration of document was found too late for it to take effect. The encoding declaration needs to be moved to be within the first 1024 bytes of the file.
EncMetaUnsupported=An unsupported character encoding was declared for the HTML document using a meta tag. The declaration was ignored.
EncProtocolUnsupported=An unsupported character encoding was declared on the transfer protocol level. The declaration was ignored.
EncBomlessUtf16=Detected UTF-16-encoded Basic Latin-only text without a byte order mark and without a transfer protocol-level declaration. Encoding this content in UTF-16 is inefficient and the character encoding should have been declared in any case.
EncMetaUtf16=A meta tag was used to declare the character encoding as UTF-16. This was interpreted as an UTF-8 declaration instead.
EncMetaUserDefined=A meta tag was used to declare the character encoding as x-user-defined. This was interpreted as a windows-1252 declaration instead for compatibility with intentionally mis-encoded legacy fonts. This site should migrate to Unicode.

Просмотреть файл

@ -23,7 +23,7 @@ https://bugzilla.mozilla.org/show_bug.cgi?id=631751
/** Test for Bug 631751 **/
/* Note! This test uses the chardet test harness but doesn't test chardet! */
CharsetDetectionTests("bug631751be_text.html",
"UTF-16BE",
"UTF-8", // Test runs from file: URL, so ASCII gets detected as UTF-8.
new Array(""));
</script>
</pre>

Просмотреть файл

@ -23,7 +23,7 @@ https://bugzilla.mozilla.org/show_bug.cgi?id=631751
/** Test for Bug 631751 **/
/* Note! This test uses the chardet test harness but doesn't test chardet! */
CharsetDetectionTests("bug631751le_text.html",
"UTF-16LE",
"UTF-8", // Test runs from file: URL, so ASCII gets detected as UTF-8.
new Array(""));
</script>
</pre>

Просмотреть файл

@ -503,44 +503,6 @@ void nsHtml5StreamParser::SetupDecodingFromUtf16BogoXml(
mLastBuffer->AdvanceEnd(3);
}
/**
 * Heuristic for BOMless, unlabeled UTF-16: inspects the sniffing buffer and,
 * if the bytes at one parity (even or odd offsets) are all zero while the
 * bytes at the other parity are all non-zero, switches the parser to
 * UTF-16LE or UTF-16BE.
 *
 * Does nothing when mMode is LOAD_AS_DATA, when fewer than 30 bytes are
 * available, or as soon as a byte breaks the alternating pattern.
 *
 * On a match this sets mEncoding and mCharsetSource
 * (kCharsetFromIrreversibleAutoDetection), forwards both to the tree
 * builder, calls DontGuessEncoding() and reports "EncBomlessUtf16".
 *
 * @param aBuf    start of the bytes to inspect
 * @param aBufLen number of bytes readable at aBuf
 */
void nsHtml5StreamParser::SniffBOMlessUTF16BasicLatin(const uint8_t* aBuf,
                                                      size_t aBufLen) {
  // Avoid underspecified heuristic craziness for XHR
  if (mMode == LOAD_AS_DATA) {
    return;
  }
  // Make sure there's enough data. Require room for "<title></title>"
  // (15 UTF-16 code units, i.e. 30 bytes).
  if (aBufLen < 30) {
    return;
  }
  // even-numbered bytes tracked at 0, odd-numbered bytes tracked at 1
  bool byteZero[2] = {false, false};
  bool byteNonZero[2] = {false, false};
  // The index has the same type as aBufLen so the comparison cannot be
  // defeated by the index wrapping around before it reaches the bound.
  for (size_t i = 0; i < aBufLen; ++i) {
    const size_t parity = i % 2;
    const size_t otherParity = 1 - parity;
    if (aBuf[i]) {
      // A non-zero byte was already seen at the opposite parity, so the
      // zero bytes cannot all sit on one side: not the expected pattern.
      if (byteNonZero[otherParity]) {
        return;
      }
      byteNonZero[parity] = true;
    } else {
      // Likewise, zero bytes at both parities rule the pattern out.
      if (byteZero[otherParity]) {
        return;
      }
      byteZero[parity] = true;
    }
  }
  // Non-zero bytes at even offsets mean the low-order byte of each code unit
  // comes first (little-endian); otherwise the zero byte comes first.
  if (byteNonZero[0]) {
    mEncoding = UTF_16LE_ENCODING;
  } else {
    mEncoding = UTF_16BE_ENCODING;
  }
  mCharsetSource = kCharsetFromIrreversibleAutoDetection;
  mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
  DontGuessEncoding();
  mTreeBuilder->MaybeComplainAboutCharset("EncBomlessUtf16", true, 0);
}
void nsHtml5StreamParser::SetEncodingFromExpat(const char16_t* aEncoding) {
if (aEncoding) {
nsDependentString utf16(aEncoding);
@ -734,15 +696,9 @@ nsresult nsHtml5StreamParser::FinalizeSniffing(Span<const uint8_t> aFromSegment,
mEncoding = WrapNotNull(encoding);
mCharsetSource = kCharsetFromXmlDeclaration;
mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
} else if (mCharsetSource < kCharsetFromIrreversibleAutoDetection) {
// meta scan and XML declaration check failed.
// Check for BOMless UTF-16 with Basic
// Latin content for compat with IE. See bug 631751.
SniffBOMlessUTF16BasicLatin(buf, bufLen);
}
}
if (mForceAutoDetection &&
mCharsetSource != kCharsetFromIrreversibleAutoDetection) {
if (mForceAutoDetection) {
// neither meta nor XML declaration found, honor override
FinalizeSniffingWithDetector(aFromSegment, aCountToSniffingLimit, false);
return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment);

Просмотреть файл

@ -349,11 +349,6 @@ class nsHtml5StreamParser final : public nsISupports {
*/
nsresult WriteStreamBytes(mozilla::Span<const uint8_t> aFromSegment);
/**
 * Check whether every other byte in the sniffing buffer is zero.
 *
 * If the bytes at one parity (even or odd offsets) are all zero and the
 * bytes at the other parity are all non-zero, the parser's encoding is
 * switched to UTF-16LE (non-zero bytes at even offsets) or UTF-16BE, the
 * charset source becomes kCharsetFromIrreversibleAutoDetection and the
 * "EncBomlessUtf16" complaint is reported. Otherwise nothing changes.
 *
 * The check is skipped in LOAD_AS_DATA mode and when fewer than 30 bytes
 * are supplied.
 *
 * @param aBuf    start of the bytes to inspect
 * @param aBufLen number of bytes readable at aBuf
 */
void SniffBOMlessUTF16BasicLatin(const uint8_t* aBuf, size_t aBufLen);
/**
* Write the start of the stream to detector.
*/

Двоичный файл не отображается.

Просмотреть файл

@ -16,7 +16,6 @@ support-files =
file_bug594730-9.html
file_bug642908.sjs
file_bug655682.sjs
file_bug672453_bomless_utf16.html
file_bug672453_http_unsupported.html
file_bug672453_http_unsupported.html^headers^
file_bug672453_late_meta.html

Просмотреть файл

@ -23,7 +23,6 @@ var tests = [
"file_bug672453_meta_restart.html",
"file_bug672453_meta_unsupported.html",
"file_bug672453_http_unsupported.html",
"file_bug672453_bomless_utf16.html",
"file_bug672453_meta_utf16.html",
"file_bug672453_meta_non_superset.html",
"file_bug672453_meta_userdefined.html",
@ -50,10 +49,6 @@ var expectedErrors = [
sourceName: "http://mochi.test:8888/tests/parser/htmlparser/tests/mochitest/file_bug672453_http_unsupported.html",
lineNumber: 0,
isWarning: false },
{ errorMessage: "Detected UTF-16-encoded Basic Latin-only text without a byte order mark and without a transfer protocol-level declaration. Encoding this content in UTF-16 is inefficient and the character encoding should have been declared in any case.",
sourceName: "http://mochi.test:8888/tests/parser/htmlparser/tests/mochitest/file_bug672453_bomless_utf16.html",
lineNumber: 0,
isWarning: false },
{ errorMessage: "A meta tag was used to declare the character encoding as UTF-16. This was interpreted as an UTF-8 declaration instead.",
sourceName: "http://mochi.test:8888/tests/parser/htmlparser/tests/mochitest/file_bug672453_meta_utf16.html",
lineNumber: 1,

Просмотреть файл

@ -35,8 +35,6 @@ enum {
kCharsetFromFinalUserForcedAutoDetection,
kCharsetFromXmlDeclarationUtf16, // This one is overridden by
// kCharsetFromChannel
kCharsetFromIrreversibleAutoDetection, // This one is overridden by
// kCharsetFromChannel
kCharsetFromByteOrderMark,
kCharsetFromUtf8OnlyMime, // For JSON, WebVTT and such
kCharsetFromBuiltIn, // resource: URLs