Don't require a minimum data threshold for encodings that correspond to the selected language. Bug 431054, r=Masatoshi Kimura (:emk) <VYV03354@nifty.ne.jp>

This commit is contained in:
Simon Montagu 2008-10-30 10:57:25 -07:00
Родитель 55ce30ca67
Коммит 4fa9ab24c0
22 изменённых файлов: 120 добавлений и 33 удалений

Просмотреть файл

@ -49,12 +49,13 @@
#define MINIMUM_DATA_THRESHOLD 4
//return confidence based on received data
float CharDistributionAnalysis::GetConfidence()
float CharDistributionAnalysis::GetConfidence(PRBool aIsPreferredLanguage)
{
//if we didn't receive any character in our consideration range, or the
// number of frequent characters is below the minimum threshold, return
// negative answer
if (mTotalChars <= 0 || mFreqChars <= MINIMUM_DATA_THRESHOLD)
if (mTotalChars <= 0 ||
!aIsPreferredLanguage && mFreqChars <= MINIMUM_DATA_THRESHOLD)
return SURE_NO;
if (mTotalChars != mFreqChars) {

Просмотреть файл

@ -71,7 +71,7 @@ public:
}
//return confidence based on existing data
float GetConfidence();
float GetConfidence(PRBool aIsPreferredLanguage);
//Reset analyser, clear any state
void Reset(void)

Просмотреть файл

@ -181,10 +181,10 @@ void JapaneseContextAnalysis::Reset(void)
}
#define DONT_KNOW (float)-1
float JapaneseContextAnalysis::GetConfidence()
float JapaneseContextAnalysis::GetConfidence(PRBool aIsPreferredLanguage)
{
//This is just one way to calculate confidence. It works well for me.
if (mTotalRel > MINIMUM_DATA_THRESHOLD)
if (aIsPreferredLanguage || mTotalRel > MINIMUM_DATA_THRESHOLD)
return ((float)(mTotalRel - mRelSample[0]))/mTotalRel;
else
return (float)DONT_KNOW;

Просмотреть файл

@ -74,7 +74,7 @@ public:
mLastCharOrder = order;
}
float GetConfidence();
float GetConfidence(PRBool aIsPreferredLanguage);
void Reset(void);
void SetOpion(){}
PRBool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;}

Просмотреть файл

@ -81,7 +81,7 @@ nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen)
float nsBig5Prober::GetConfidence(void)
{
float distribCf = mDistributionAnalyser.GetConfidence();
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
return (float)distribCf;
}

Просмотреть файл

@ -44,8 +44,10 @@
class nsBig5Prober: public nsCharSetProber {
public:
nsBig5Prober(void){mCodingSM = new nsCodingStateMachine(&Big5SMModel);
Reset();}
nsBig5Prober(PRBool aIsPreferredLanguage)
:mIsPreferredLanguage(aIsPreferredLanguage)
{mCodingSM = new nsCodingStateMachine(&Big5SMModel);
Reset();}
virtual ~nsBig5Prober(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "Big5";}
@ -63,6 +65,7 @@ protected:
//Big5ContextAnalysis mContextAnalyser;
Big5DistributionAnalysis mDistributionAnalyser;
char mLastChar[2];
PRBool mIsPreferredLanguage;
};

Просмотреть файл

@ -91,8 +91,8 @@ nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen)
float nsEUCJPProber::GetConfidence(void)
{
float contxtCf = mContextAnalyser.GetConfidence();
float distribCf = mDistributionAnalyser.GetConfidence();
float contxtCf = mContextAnalyser.GetConfidence(mIsPreferredLanguage);
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
return (contxtCf > distribCf ? contxtCf : distribCf);
}

Просмотреть файл

@ -50,8 +50,10 @@
class nsEUCJPProber: public nsCharSetProber {
public:
nsEUCJPProber(void){mCodingSM = new nsCodingStateMachine(&EUCJPSMModel);
Reset();}
nsEUCJPProber(PRBool aIsPreferredLanguage)
:mIsPreferredLanguage(aIsPreferredLanguage)
{mCodingSM = new nsCodingStateMachine(&EUCJPSMModel);
Reset();}
virtual ~nsEUCJPProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "EUC-JP";}
@ -68,6 +70,7 @@ protected:
EUCJPDistributionAnalysis mDistributionAnalyser;
char mLastChar[2];
PRBool mIsPreferredLanguage;
};

Просмотреть файл

@ -84,7 +84,7 @@ nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen)
float nsEUCKRProber::GetConfidence(void)
{
float distribCf = mDistributionAnalyser.GetConfidence();
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
return (float)distribCf;
}

Просмотреть файл

@ -44,8 +44,11 @@
class nsEUCKRProber: public nsCharSetProber {
public:
nsEUCKRProber(void){mCodingSM = new nsCodingStateMachine(&EUCKRSMModel);
Reset();}
nsEUCKRProber(PRBool aIsPreferredLanguage)
:mIsPreferredLanguage(aIsPreferredLanguage)
{mCodingSM = new nsCodingStateMachine(&EUCKRSMModel);
Reset();
}
virtual ~nsEUCKRProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "EUC-KR";}
@ -63,6 +66,7 @@ protected:
//EUCKRContextAnalysis mContextAnalyser;
EUCKRDistributionAnalysis mDistributionAnalyser;
char mLastChar[2];
PRBool mIsPreferredLanguage;
};

Просмотреть файл

@ -84,7 +84,7 @@ nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen)
float nsEUCTWProber::GetConfidence(void)
{
float distribCf = mDistributionAnalyser.GetConfidence();
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
return (float)distribCf;
}

Просмотреть файл

@ -44,8 +44,10 @@
class nsEUCTWProber: public nsCharSetProber {
public:
nsEUCTWProber(void){mCodingSM = new nsCodingStateMachine(&EUCTWSMModel);
Reset();}
nsEUCTWProber(PRBool aIsPreferredLanguage)
:mIsPreferredLanguage(aIsPreferredLanguage)
{mCodingSM = new nsCodingStateMachine(&EUCTWSMModel);
Reset();}
virtual ~nsEUCTWProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "x-euc-tw";}
@ -63,6 +65,7 @@ protected:
//EUCTWContextAnalysis mContextAnalyser;
EUCTWDistributionAnalysis mDistributionAnalyser;
char mLastChar[2];
PRBool mIsPreferredLanguage;
};

Просмотреть файл

@ -89,7 +89,7 @@ nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen)
float nsGB18030Prober::GetConfidence(void)
{
float distribCf = mDistributionAnalyser.GetConfidence();
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
return (float)distribCf;
}

Просмотреть файл

@ -46,8 +46,10 @@
class nsGB18030Prober: public nsCharSetProber {
public:
nsGB18030Prober(void){mCodingSM = new nsCodingStateMachine(&GB18030SMModel);
Reset();}
nsGB18030Prober(PRBool aIsPreferredLanguage)
:mIsPreferredLanguage(aIsPreferredLanguage)
{mCodingSM = new nsCodingStateMachine(&GB18030SMModel);
Reset();}
virtual ~nsGB18030Prober(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "gb18030";}
@ -65,6 +67,7 @@ protected:
//GB2312ContextAnalysis mContextAnalyser;
GB2312DistributionAnalysis mDistributionAnalyser;
char mLastChar[2];
PRBool mIsPreferredLanguage;
};

Просмотреть файл

@ -63,17 +63,17 @@ nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
mProbers[0] = new nsUTF8Prober();
if (aLanguageFilter & NS_FILTER_JAPANESE)
{
mProbers[1] = new nsSJISProber();
mProbers[2] = new nsEUCJPProber();
mProbers[1] = new nsSJISProber(aLanguageFilter == NS_FILTER_JAPANESE);
mProbers[2] = new nsEUCJPProber(aLanguageFilter == NS_FILTER_JAPANESE);
}
if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED)
mProbers[3] = new nsGB18030Prober();
mProbers[3] = new nsGB18030Prober(aLanguageFilter == NS_FILTER_CHINESE_SIMPLIFIED);
if (aLanguageFilter & NS_FILTER_KOREAN)
mProbers[4] = new nsEUCKRProber();
mProbers[4] = new nsEUCKRProber(aLanguageFilter == NS_FILTER_KOREAN);
if (aLanguageFilter & NS_FILTER_CHINESE_TRADITIONAL)
{
mProbers[5] = new nsBig5Prober();
mProbers[6] = new nsEUCTWProber();
mProbers[5] = new nsBig5Prober(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
mProbers[6] = new nsEUCTWProber(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
}
Reset();
}

Просмотреть файл

@ -90,8 +90,8 @@ nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen)
float nsSJISProber::GetConfidence(void)
{
float contxtCf = mContextAnalyser.GetConfidence();
float distribCf = mDistributionAnalyser.GetConfidence();
float contxtCf = mContextAnalyser.GetConfidence(mIsPreferredLanguage);
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
return (contxtCf > distribCf ? contxtCf : distribCf);
}

Просмотреть файл

@ -51,8 +51,10 @@
class nsSJISProber: public nsCharSetProber {
public:
nsSJISProber(void){mCodingSM = new nsCodingStateMachine(&SJISSMModel);
Reset();}
nsSJISProber(PRBool aIsPreferredLanguage)
:mIsPreferredLanguage(aIsPreferredLanguage)
{mCodingSM = new nsCodingStateMachine(&SJISSMModel);
Reset();}
virtual ~nsSJISProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "Shift_JIS";}
@ -69,6 +71,7 @@ protected:
SJISDistributionAnalysis mDistributionAnalyser;
char mLastChar[2];
PRBool mIsPreferredLanguage;
};

Просмотреть файл

@ -83,6 +83,9 @@ _TEST_FILES = \
test_bug426271-euc-jp.html \
bug426271_text-utf-8.html \
test_bug426271-utf-8.html \
bug431054_text.html \
test_bug431054.html \
test_bug431054-japanese.html \
$(NULL)
libs:: $(_TEST_FILES)

Просмотреть файл

@ -0,0 +1,5 @@
<html>
<body>
ログイン
</body>
</html>

Просмотреть файл

@ -22,7 +22,6 @@ https://bugzilla.mozilla.org/show_bug.cgi?id=306272
CharsetDetectionTests("bug306272_text.html",
"UTF-8",
new Array("ja_parallel_state_machine",
"ko_parallel_state_machine",
"zh_parallel_state_machine",
"zhtw_parallel_state_machine",
"zhcn_parallel_state_machine",

Просмотреть файл

@ -0,0 +1,28 @@
<!DOCTYPE HTML>
<html>
<!--
https://bugzilla.mozilla.org/show_bug.cgi?id=431054
-->
<head>
<title>Test for Bug 431054</title>
<script type="text/javascript" src="/MochiKit/MochiKit.js"></script>
<script type="text/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
<script type="text/javascript" src="CharsetDetectionTests.js"></script>
<link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css" />
</head>
<body>
<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=431054">Mozilla Bug 431054</a>
<p id="display"></p>
<div id="content" style="display: none">
</div>
<iframe id="testframe"></iframe>
<pre id="test">
<script class="testbody" type="text/javascript">
/** Test for Bug 431054 **/
CharsetDetectionTests("bug431054_text.html",
"EUC-JP",
new Array("ja_parallel_state_machine"));
</script>
</pre>
</body>
</html>

Просмотреть файл

@ -0,0 +1,32 @@
<!DOCTYPE HTML>
<html>
<!--
https://bugzilla.mozilla.org/show_bug.cgi?id=431054
-->
<head>
<title>Test for Bug 431054</title>
<script type="text/javascript" src="/MochiKit/MochiKit.js"></script>
<script type="text/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
<script type="text/javascript" src="CharsetDetectionTests.js"></script>
<link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css" />
</head>
<body>
<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=431054">Mozilla Bug 431054</a>
<p id="display"></p>
<div id="content" style="display: none">
</div>
<iframe id="testframe"></iframe>
<pre id="test">
<script class="testbody" type="text/javascript">
/** Test for Bug 431054 **/
CharsetDetectionTests("bug431054_text.html",
"windows-1252",
new Array("zhtw_parallel_state_machine",
"zhcn_parallel_state_machine",
"zh_parallel_state_machine",
"cjk_parallel_state_machine",
"universal_charset_detector"));
</script>
</pre>
</body>
</html>