From f68ddc8cba6c6936cf9f322ae36e718349d35e99 Mon Sep 17 00:00:00 2001 From: Henri Sivonen Date: Wed, 16 Feb 2011 08:40:35 +0200 Subject: [PATCH] Bug 631751 - Sniff Basic Latin BOMless UTF-16 for IE compat. r=bzbarsky, a=bzbarsky. --- extensions/universalchardet/tests/Makefile.in | 4 ++ .../tests/bug631751be_text.html | Bin 0 -> 354 bytes .../tests/bug631751le_text.html | Bin 0 -> 366 bytes .../tests/test_bug631751be.html | 33 +++++++++++++ .../tests/test_bug631751le.html | 33 +++++++++++++ parser/html/nsHtml5StreamParser.cpp | 46 ++++++++++++++++++ parser/html/nsHtml5StreamParser.h | 8 ++- parser/htmlparser/public/nsIParser.h | 13 ++--- 8 files changed, 130 insertions(+), 7 deletions(-) create mode 100644 extensions/universalchardet/tests/bug631751be_text.html create mode 100644 extensions/universalchardet/tests/bug631751le_text.html create mode 100644 extensions/universalchardet/tests/test_bug631751be.html create mode 100644 extensions/universalchardet/tests/test_bug631751le.html diff --git a/extensions/universalchardet/tests/Makefile.in b/extensions/universalchardet/tests/Makefile.in index 321d52bc4db7..36f12d026296 100644 --- a/extensions/universalchardet/tests/Makefile.in +++ b/extensions/universalchardet/tests/Makefile.in @@ -70,6 +70,10 @@ _TEST_FILES = \ test_bug547487.html \ bug620106_text.html \ test_bug620106.html \ + bug631751le_text.html \ + test_bug631751le.html \ + bug631751be_text.html \ + test_bug631751be.html \ $(NULL) libs:: $(_TEST_FILES) diff --git a/extensions/universalchardet/tests/bug631751be_text.html b/extensions/universalchardet/tests/bug631751be_text.html new file mode 100644 index 0000000000000000000000000000000000000000..104d50399860172e3ba0d4b0a41529b4afb00686 GIT binary patch literal 354 zcmchTyAFat5JgYzSLg>2Vq--t@KI^BfWi`i$R>aVn;8GzyQ2~kd#jzveay*{uwcuH z4H*~C?3mNw>1bIqBl+2C3aXFhaq_m*+EdLpBF(*YCs(hoRNY5&3=G?T5u_@o1fiD)}l2VnanNd{r1Jw6H`VVH3oHO^AQb>_sDqz18mQow;+eDJ%Az z*>T}U&VeNjo(C-(7NkF_rl5MyJWk%0S~|1)q$0g^ayfZBZQQEiWWJtW`l_}g5o;2u 
z!IF*n3e6p*?wPV-f5?|Sb$Z+C#T47*(;O&can8x^N>sdz^M7Q4WZpb+eSdQVB=I~{ literal 0 HcmV?d00001 diff --git a/extensions/universalchardet/tests/test_bug631751be.html b/extensions/universalchardet/tests/test_bug631751be.html new file mode 100644 index 000000000000..ee61dae5c027 --- /dev/null +++ b/extensions/universalchardet/tests/test_bug631751be.html @@ -0,0 +1,33 @@ + + + + + Test for Bug 631751 + + + + + + +Mozilla Bug 631751 +

+ + +
+
+
+ + diff --git a/extensions/universalchardet/tests/test_bug631751le.html b/extensions/universalchardet/tests/test_bug631751le.html new file mode 100644 index 000000000000..613678be9276 --- /dev/null +++ b/extensions/universalchardet/tests/test_bug631751le.html @@ -0,0 +1,33 @@ + + + + + Test for Bug 631751 + + + + + + +Mozilla Bug 631751 +

+ + +
+
+
+ + diff --git a/parser/html/nsHtml5StreamParser.cpp b/parser/html/nsHtml5StreamParser.cpp index 59ce27afba3d..56e03aee307c 100644 --- a/parser/html/nsHtml5StreamParser.cpp +++ b/parser/html/nsHtml5StreamParser.cpp @@ -334,6 +334,48 @@ nsHtml5StreamParser::SetupDecodingFromBom(const char* aCharsetName, const char* return rv; } +void +nsHtml5StreamParser::SniffBOMlessUTF16BasicLatin(const PRUint8* aFromSegment, + PRUint32 aCountToSniffingLimit) +{ + // Make sure there's enough data. Require room for "<title></title>" + if (mSniffingLength + aCountToSniffingLimit < 30) { + return; + } + // even-numbered bytes tracked at 0, odd-numbered bytes tracked at 1 + PRBool byteNonZero[2] = { PR_FALSE, PR_FALSE }; + PRUint32 i = 0; + if (mSniffingBuffer) { + for (; i < mSniffingLength; ++i) { + if (mSniffingBuffer[i]) { + if (byteNonZero[1 - (i % 2)]) { + return; + } + byteNonZero[i % 2] = PR_TRUE; + } + } + } + if (aFromSegment) { + for (PRUint32 j = 0; j < aCountToSniffingLimit; ++j) { + if (aFromSegment[j]) { + if (byteNonZero[1 - ((i + j) % 2)]) { + return; + } + byteNonZero[(i + j) % 2] = PR_TRUE; + } + } + } + + if (byteNonZero[0]) { + mCharset.Assign("UTF-16LE"); + } else { + mCharset.Assign("UTF-16BE"); + } + mCharsetSource = kCharsetFromIrreversibleAutoDetection; + mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource); + mFeedChardet = PR_FALSE; +} + nsresult nsHtml5StreamParser::FinalizeSniffing(const PRUint8* aFromSegment, // can be null PRUint32 aCount, @@ -346,6 +388,10 @@ nsHtml5StreamParser::FinalizeSniffing(const PRUint8* aFromSegment, // can be nul mFeedChardet = PR_FALSE; return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment, aCount, aWriteCount); } + // Check for BOMless UTF-16 with Basic + // Latin content for compat with IE. See bug 631751. 
+ SniffBOMlessUTF16BasicLatin(aFromSegment, aCountToSniffingLimit); + // the charset may have been set now // maybe try chardet now; if (mFeedChardet) { PRBool dontFeed; diff --git a/parser/html/nsHtml5StreamParser.h b/parser/html/nsHtml5StreamParser.h index bd36a4d2bf61..67f68e2f8f3e 100644 --- a/parser/html/nsHtml5StreamParser.h +++ b/parser/html/nsHtml5StreamParser.h @@ -256,7 +256,13 @@ class nsHtml5StreamParser : public nsIStreamListener, nsresult WriteStreamBytes(const PRUint8* aFromSegment, PRUint32 aCount, PRUint32* aWriteCount); - + + /** + * Check whether every other byte in the sniffing buffer is zero. + */ + void SniffBOMlessUTF16BasicLatin(const PRUint8* aFromSegment, + PRUint32 aCountToSniffingLimit); + /** * scan failed. Try chardet if applicable. After this, the * the parser will have some encoding even if a last resolt fallback. diff --git a/parser/htmlparser/public/nsIParser.h b/parser/htmlparser/public/nsIParser.h index da610c48a618..de78b0765bff 100644 --- a/parser/htmlparser/public/nsIParser.h +++ b/parser/htmlparser/public/nsIParser.h @@ -98,13 +98,14 @@ enum eParserDocType { #define kCharsetFromHintPrevDoc 7 #define kCharsetFromMetaPrescan 8 // this one and smaller: HTML5 Tentative #define kCharsetFromMetaTag 9 // this one and greater: HTML5 Confident -#define kCharsetFromByteOrderMark 10 -#define kCharsetFromChannel 11 -#define kCharsetFromOtherComponent 12 +#define kCharsetFromIrreversibleAutoDetection 10 +#define kCharsetFromByteOrderMark 11 +#define kCharsetFromChannel 12 +#define kCharsetFromOtherComponent 13 // Levels below here will be forced onto childframes too -#define kCharsetFromParentForced 13 -#define kCharsetFromUserForced 14 -#define kCharsetFromPreviousLoading 15 +#define kCharsetFromParentForced 14 +#define kCharsetFromUserForced 15 +#define kCharsetFromPreviousLoading 16 enum eStreamState {eNone,eOnStart,eOnDataAvail,eOnStop};