diff --git a/extensions/universalchardet/src/base/CharDistribution.cpp b/extensions/universalchardet/src/base/CharDistribution.cpp index ce72d3b93d72..4817c0e2f3a4 100644 --- a/extensions/universalchardet/src/base/CharDistribution.cpp +++ b/extensions/universalchardet/src/base/CharDistribution.cpp @@ -49,12 +49,13 @@ #define MINIMUM_DATA_THRESHOLD 4 //return confidence base on received data -float CharDistributionAnalysis::GetConfidence() +float CharDistributionAnalysis::GetConfidence(PRBool aIsPreferredLanguage) { //if we didn't receive any character in our consideration range, or the // number of frequent characters is below the minimum threshold, return // negative answer - if (mTotalChars <= 0 || mFreqChars <= MINIMUM_DATA_THRESHOLD) + if (mTotalChars <= 0 || + !aIsPreferredLanguage && mFreqChars <= MINIMUM_DATA_THRESHOLD) return SURE_NO; if (mTotalChars != mFreqChars) { diff --git a/extensions/universalchardet/src/base/CharDistribution.h b/extensions/universalchardet/src/base/CharDistribution.h index dbc9e564876c..99f338ca6579 100644 --- a/extensions/universalchardet/src/base/CharDistribution.h +++ b/extensions/universalchardet/src/base/CharDistribution.h @@ -71,7 +71,7 @@ public: } //return confidence base on existing data - float GetConfidence(); + float GetConfidence(PRBool aIsPreferredLanguage); //Reset analyser, clear any state void Reset(void) diff --git a/extensions/universalchardet/src/base/JpCntx.cpp b/extensions/universalchardet/src/base/JpCntx.cpp index 812c3bf9633e..8375fdcfc7e0 100644 --- a/extensions/universalchardet/src/base/JpCntx.cpp +++ b/extensions/universalchardet/src/base/JpCntx.cpp @@ -181,10 +181,10 @@ void JapaneseContextAnalysis::Reset(void) } #define DONT_KNOW (float)-1 -float JapaneseContextAnalysis::GetConfidence() +float JapaneseContextAnalysis::GetConfidence(PRBool aIsPreferredLanguage) { //This is just one way to calculate confidence. It works well for me. - if (mTotalRel > MINIMUM_DATA_THRESHOLD) + if (aIsPreferredLanguage || mTotalRel > MINIMUM_DATA_THRESHOLD) return ((float)(mTotalRel - mRelSample[0]))/mTotalRel; else return (float)DONT_KNOW; diff --git a/extensions/universalchardet/src/base/JpCntx.h b/extensions/universalchardet/src/base/JpCntx.h index 1760b752ac78..e86445932304 100644 --- a/extensions/universalchardet/src/base/JpCntx.h +++ b/extensions/universalchardet/src/base/JpCntx.h @@ -74,7 +74,7 @@ public: mLastCharOrder = order; } - float GetConfidence(); + float GetConfidence(PRBool aIsPreferredLanguage); void Reset(void); void SetOpion(){} PRBool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;} diff --git a/extensions/universalchardet/src/base/nsBig5Prober.cpp b/extensions/universalchardet/src/base/nsBig5Prober.cpp index 7288eb15966a..aee5cd483e2a 100644 --- a/extensions/universalchardet/src/base/nsBig5Prober.cpp +++ b/extensions/universalchardet/src/base/nsBig5Prober.cpp @@ -81,7 +81,7 @@ nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen) float nsBig5Prober::GetConfidence(void) { - float distribCf = mDistributionAnalyser.GetConfidence(); + float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage); return (float)distribCf; } diff --git a/extensions/universalchardet/src/base/nsBig5Prober.h b/extensions/universalchardet/src/base/nsBig5Prober.h index 188c6d7618f1..5ae357643fc8 100644 --- a/extensions/universalchardet/src/base/nsBig5Prober.h +++ b/extensions/universalchardet/src/base/nsBig5Prober.h @@ -44,8 +44,10 @@ class nsBig5Prober: public nsCharSetProber { public: - nsBig5Prober(void){mCodingSM = new nsCodingStateMachine(&Big5SMModel); - Reset();} + nsBig5Prober(PRBool aIsPreferredLanguage) + :mIsPreferredLanguage(aIsPreferredLanguage) + {mCodingSM = new nsCodingStateMachine(&Big5SMModel); + Reset();} virtual ~nsBig5Prober(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); const char* GetCharSetName() {return "Big5";} @@ -63,6 +65,7 @@ protected: //Big5ContextAnalysis mContextAnalyser; Big5DistributionAnalysis mDistributionAnalyser; char mLastChar[2]; + PRBool mIsPreferredLanguage; }; diff --git a/extensions/universalchardet/src/base/nsEUCJPProber.cpp b/extensions/universalchardet/src/base/nsEUCJPProber.cpp index 68a8b519aca5..35387dfddacd 100644 --- a/extensions/universalchardet/src/base/nsEUCJPProber.cpp +++ b/extensions/universalchardet/src/base/nsEUCJPProber.cpp @@ -91,8 +91,8 @@ nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen) float nsEUCJPProber::GetConfidence(void) { - float contxtCf = mContextAnalyser.GetConfidence(); - float distribCf = mDistributionAnalyser.GetConfidence(); + float contxtCf = mContextAnalyser.GetConfidence(mIsPreferredLanguage); + float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage); return (contxtCf > distribCf ? contxtCf : distribCf); } diff --git a/extensions/universalchardet/src/base/nsEUCJPProber.h b/extensions/universalchardet/src/base/nsEUCJPProber.h index 05974e214c35..a7a2f51471f3 100644 --- a/extensions/universalchardet/src/base/nsEUCJPProber.h +++ b/extensions/universalchardet/src/base/nsEUCJPProber.h @@ -50,8 +50,10 @@ class nsEUCJPProber: public nsCharSetProber { public: - nsEUCJPProber(void){mCodingSM = new nsCodingStateMachine(&EUCJPSMModel); - Reset();} + nsEUCJPProber(PRBool aIsPreferredLanguage) + :mIsPreferredLanguage(aIsPreferredLanguage) + {mCodingSM = new nsCodingStateMachine(&EUCJPSMModel); + Reset();} virtual ~nsEUCJPProber(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); const char* GetCharSetName() {return "EUC-JP";} @@ -68,6 +70,7 @@ protected: EUCJPDistributionAnalysis mDistributionAnalyser; char mLastChar[2]; + PRBool mIsPreferredLanguage; }; diff --git a/extensions/universalchardet/src/base/nsEUCKRProber.cpp b/extensions/universalchardet/src/base/nsEUCKRProber.cpp index 7d278ad8fd64..396b09527e5f 100644 --- a/extensions/universalchardet/src/base/nsEUCKRProber.cpp +++ b/extensions/universalchardet/src/base/nsEUCKRProber.cpp @@ -84,7 +84,7 @@ nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen) float nsEUCKRProber::GetConfidence(void) { - float distribCf = mDistributionAnalyser.GetConfidence(); + float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage); return (float)distribCf; } diff --git a/extensions/universalchardet/src/base/nsEUCKRProber.h b/extensions/universalchardet/src/base/nsEUCKRProber.h index 959584cd8431..8e0998460280 100644 --- a/extensions/universalchardet/src/base/nsEUCKRProber.h +++ b/extensions/universalchardet/src/base/nsEUCKRProber.h @@ -44,8 +44,11 @@ class nsEUCKRProber: public nsCharSetProber { public: - nsEUCKRProber(void){mCodingSM = new nsCodingStateMachine(&EUCKRSMModel); - Reset();} + nsEUCKRProber(PRBool aIsPreferredLanguage) + :mIsPreferredLanguage(aIsPreferredLanguage) + {mCodingSM = new nsCodingStateMachine(&EUCKRSMModel); + Reset(); + } virtual ~nsEUCKRProber(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); const char* GetCharSetName() {return "EUC-KR";} @@ -63,6 +66,7 @@ protected: //EUCKRContextAnalysis mContextAnalyser; EUCKRDistributionAnalysis mDistributionAnalyser; char mLastChar[2]; + PRBool mIsPreferredLanguage; }; diff --git a/extensions/universalchardet/src/base/nsEUCTWProber.cpp b/extensions/universalchardet/src/base/nsEUCTWProber.cpp index ef2bcf50c336..710e413eb1db 100644 --- a/extensions/universalchardet/src/base/nsEUCTWProber.cpp +++ b/extensions/universalchardet/src/base/nsEUCTWProber.cpp @@ -84,7 +84,7 @@ nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen) float nsEUCTWProber::GetConfidence(void) { - float distribCf = mDistributionAnalyser.GetConfidence(); + float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage); return (float)distribCf; } diff --git a/extensions/universalchardet/src/base/nsEUCTWProber.h b/extensions/universalchardet/src/base/nsEUCTWProber.h index cd4a4aff482c..911d50b03992 100644 --- a/extensions/universalchardet/src/base/nsEUCTWProber.h +++ b/extensions/universalchardet/src/base/nsEUCTWProber.h @@ -44,8 +44,10 @@ class nsEUCTWProber: public nsCharSetProber { public: - nsEUCTWProber(void){mCodingSM = new nsCodingStateMachine(&EUCTWSMModel); - Reset();} + nsEUCTWProber(PRBool aIsPreferredLanguage) + :mIsPreferredLanguage(aIsPreferredLanguage) + {mCodingSM = new nsCodingStateMachine(&EUCTWSMModel); + Reset();} virtual ~nsEUCTWProber(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); const char* GetCharSetName() {return "x-euc-tw";} @@ -63,6 +65,7 @@ protected: //EUCTWContextAnalysis mContextAnalyser; EUCTWDistributionAnalysis mDistributionAnalyser; char mLastChar[2]; + PRBool mIsPreferredLanguage; }; diff --git a/extensions/universalchardet/src/base/nsGB2312Prober.cpp b/extensions/universalchardet/src/base/nsGB2312Prober.cpp index f05b6d7fa965..95374c3122e6 100644 --- a/extensions/universalchardet/src/base/nsGB2312Prober.cpp +++ b/extensions/universalchardet/src/base/nsGB2312Prober.cpp @@ -89,7 +89,7 @@ nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen) float nsGB18030Prober::GetConfidence(void) { - float distribCf = mDistributionAnalyser.GetConfidence(); + float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage); return (float)distribCf; } diff --git a/extensions/universalchardet/src/base/nsGB2312Prober.h b/extensions/universalchardet/src/base/nsGB2312Prober.h index be01907886e2..4bdac3bbe032 100644 --- a/extensions/universalchardet/src/base/nsGB2312Prober.h +++ b/extensions/universalchardet/src/base/nsGB2312Prober.h @@ -46,8 +46,10 @@ class nsGB18030Prober: public nsCharSetProber { public: - nsGB18030Prober(void){mCodingSM = new nsCodingStateMachine(&GB18030SMModel); - Reset();} + nsGB18030Prober(PRBool aIsPreferredLanguage) + :mIsPreferredLanguage(aIsPreferredLanguage) + {mCodingSM = new nsCodingStateMachine(&GB18030SMModel); + Reset();} virtual ~nsGB18030Prober(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); const char* GetCharSetName() {return "gb18030";} @@ -65,6 +67,7 @@ protected: //GB2312ContextAnalysis mContextAnalyser; GB2312DistributionAnalysis mDistributionAnalyser; char mLastChar[2]; + PRBool mIsPreferredLanguage; }; diff --git a/extensions/universalchardet/src/base/nsMBCSGroupProber.cpp b/extensions/universalchardet/src/base/nsMBCSGroupProber.cpp index ffb80cfe5a95..b07234db5ff0 100644 --- a/extensions/universalchardet/src/base/nsMBCSGroupProber.cpp +++ b/extensions/universalchardet/src/base/nsMBCSGroupProber.cpp @@ -63,17 +63,17 @@ nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter) mProbers[0] = new nsUTF8Prober(); if (aLanguageFilter & NS_FILTER_JAPANESE) { - mProbers[1] = new nsSJISProber(); - mProbers[2] = new nsEUCJPProber(); + mProbers[1] = new nsSJISProber(aLanguageFilter == NS_FILTER_JAPANESE); + mProbers[2] = new nsEUCJPProber(aLanguageFilter == NS_FILTER_JAPANESE); } if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED) - mProbers[3] = new nsGB18030Prober(); + mProbers[3] = new nsGB18030Prober(aLanguageFilter == NS_FILTER_CHINESE_SIMPLIFIED); if (aLanguageFilter & NS_FILTER_KOREAN) - mProbers[4] = new nsEUCKRProber(); + mProbers[4] = new nsEUCKRProber(aLanguageFilter == NS_FILTER_KOREAN); if (aLanguageFilter & NS_FILTER_CHINESE_TRADITIONAL) { - mProbers[5] = new nsBig5Prober(); - mProbers[6] = new nsEUCTWProber(); + mProbers[5] = new nsBig5Prober(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL); + mProbers[6] = new nsEUCTWProber(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL); } Reset(); } diff --git a/extensions/universalchardet/src/base/nsSJISProber.cpp b/extensions/universalchardet/src/base/nsSJISProber.cpp index eb39d91d2784..5b7e7fddc795 100644 --- a/extensions/universalchardet/src/base/nsSJISProber.cpp +++ b/extensions/universalchardet/src/base/nsSJISProber.cpp @@ -90,8 +90,8 @@ nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen) float nsSJISProber::GetConfidence(void) { - float contxtCf = mContextAnalyser.GetConfidence(); - float distribCf = mDistributionAnalyser.GetConfidence(); + float contxtCf = mContextAnalyser.GetConfidence(mIsPreferredLanguage); + float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage); return (contxtCf > distribCf ? contxtCf : distribCf); } diff --git a/extensions/universalchardet/src/base/nsSJISProber.h b/extensions/universalchardet/src/base/nsSJISProber.h index 759cca606cf5..1efb6e3d639e 100644 --- a/extensions/universalchardet/src/base/nsSJISProber.h +++ b/extensions/universalchardet/src/base/nsSJISProber.h @@ -51,8 +51,10 @@ class nsSJISProber: public nsCharSetProber { public: - nsSJISProber(void){mCodingSM = new nsCodingStateMachine(&SJISSMModel); - Reset();} + nsSJISProber(PRBool aIsPreferredLanguage) + :mIsPreferredLanguage(aIsPreferredLanguage) + {mCodingSM = new nsCodingStateMachine(&SJISSMModel); + Reset();} virtual ~nsSJISProber(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); const char* GetCharSetName() {return "Shift_JIS";} @@ -69,6 +71,7 @@ protected: SJISDistributionAnalysis mDistributionAnalyser; char mLastChar[2]; + PRBool mIsPreferredLanguage; }; diff --git a/extensions/universalchardet/tests/Makefile.in b/extensions/universalchardet/tests/Makefile.in index e900286a4ab7..27e45f2d8ba2 100644 --- a/extensions/universalchardet/tests/Makefile.in +++ b/extensions/universalchardet/tests/Makefile.in @@ -83,6 +83,9 @@ _TEST_FILES = \ test_bug426271-euc-jp.html \ bug426271_text-utf-8.html \ test_bug426271-utf-8.html \ + bug431054_text.html \ + test_bug431054.html \ + test_bug431054-japanese.html \ $(NULL) libs:: $(_TEST_FILES) diff --git a/extensions/universalchardet/tests/bug431054_text.html b/extensions/universalchardet/tests/bug431054_text.html new file mode 100644 index 000000000000..aa88a4e1d2d1 --- /dev/null +++ b/extensions/universalchardet/tests/bug431054_text.html @@ -0,0 +1,5 @@ + + +¥í¥°¥¤¥ó + + diff --git a/extensions/universalchardet/tests/test_bug306272.html b/extensions/universalchardet/tests/test_bug306272.html index 9f7643fc53a7..296b34cfd6f0 100644 --- a/extensions/universalchardet/tests/test_bug306272.html +++ b/extensions/universalchardet/tests/test_bug306272.html @@ -22,7 +22,6 @@ https://bugzilla.mozilla.org/show_bug.cgi?id=306272 CharsetDetectionTests("bug306272_text.html", "UTF-8", new Array("ja_parallel_state_machine", - "ko_parallel_state_machine", "zh_parallel_state_machine", "zhtw_parallel_state_machine", "zhcn_parallel_state_machine", diff --git a/extensions/universalchardet/tests/test_bug431054-japanese.html b/extensions/universalchardet/tests/test_bug431054-japanese.html new file mode 100644 index 000000000000..537f30b298a9 --- /dev/null +++ b/extensions/universalchardet/tests/test_bug431054-japanese.html @@ -0,0 +1,28 @@ + + + + + Test for Bug 431054 + + + + + + +Mozilla Bug 431054 +

+ + +
+
+
+ + diff --git a/extensions/universalchardet/tests/test_bug431054.html b/extensions/universalchardet/tests/test_bug431054.html new file mode 100644 index 000000000000..4a8ac0f4d890 --- /dev/null +++ b/extensions/universalchardet/tests/test_bug431054.html @@ -0,0 +1,32 @@ + + + + + Test for Bug 431054 + + + + + + +Mozilla Bug 431054 +

+ + +
+
+
+ +