зеркало из https://github.com/mozilla/gecko-dev.git
Don't require a minimum data threshold for encodings that correspond to the selected language. Bug 431054, r=Masatoshi Kimura (:emk) <VYV03354@nifty.ne.jp>
This commit is contained in:
Родитель
55ce30ca67
Коммит
4fa9ab24c0
|
@ -49,12 +49,13 @@
|
|||
#define MINIMUM_DATA_THRESHOLD 4
|
||||
|
||||
//return confidence based on received data
|
||||
float CharDistributionAnalysis::GetConfidence()
|
||||
float CharDistributionAnalysis::GetConfidence(PRBool aIsPreferredLanguage)
|
||||
{
|
||||
//if we didn't receive any character in our consideration range, or the
|
||||
// number of frequent characters is below the minimum threshold, return
|
||||
// negative answer
|
||||
if (mTotalChars <= 0 || mFreqChars <= MINIMUM_DATA_THRESHOLD)
|
||||
if (mTotalChars <= 0 ||
|
||||
!aIsPreferredLanguage && mFreqChars <= MINIMUM_DATA_THRESHOLD)
|
||||
return SURE_NO;
|
||||
|
||||
if (mTotalChars != mFreqChars) {
|
||||
|
|
|
@ -71,7 +71,7 @@ public:
|
|||
}
|
||||
|
||||
//return confidence based on existing data
|
||||
float GetConfidence();
|
||||
float GetConfidence(PRBool aIsPreferredLanguage);
|
||||
|
||||
//Reset analyser, clear any state
|
||||
void Reset(void)
|
||||
|
|
|
@ -181,10 +181,10 @@ void JapaneseContextAnalysis::Reset(void)
|
|||
}
|
||||
#define DONT_KNOW (float)-1
|
||||
|
||||
float JapaneseContextAnalysis::GetConfidence()
|
||||
float JapaneseContextAnalysis::GetConfidence(PRBool aIsPreferredLanguage)
|
||||
{
|
||||
//This is just one way to calculate confidence. It works well for me.
|
||||
if (mTotalRel > MINIMUM_DATA_THRESHOLD)
|
||||
if (aIsPreferredLanguage || mTotalRel > MINIMUM_DATA_THRESHOLD)
|
||||
return ((float)(mTotalRel - mRelSample[0]))/mTotalRel;
|
||||
else
|
||||
return (float)DONT_KNOW;
|
||||
|
|
|
@ -74,7 +74,7 @@ public:
|
|||
mLastCharOrder = order;
|
||||
}
|
||||
|
||||
float GetConfidence();
|
||||
float GetConfidence(PRBool aIsPreferredLanguage);
|
||||
void Reset(void);
|
||||
void SetOpion(){}
|
||||
PRBool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;}
|
||||
|
|
|
@ -81,7 +81,7 @@ nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
|
||||
float nsBig5Prober::GetConfidence(void)
|
||||
{
|
||||
float distribCf = mDistributionAnalyser.GetConfidence();
|
||||
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||
|
||||
return (float)distribCf;
|
||||
}
|
||||
|
|
|
@ -44,8 +44,10 @@
|
|||
|
||||
class nsBig5Prober: public nsCharSetProber {
|
||||
public:
|
||||
nsBig5Prober(void){mCodingSM = new nsCodingStateMachine(&Big5SMModel);
|
||||
Reset();}
|
||||
nsBig5Prober(PRBool aIsPreferredLanguage)
|
||||
:mIsPreferredLanguage(aIsPreferredLanguage)
|
||||
{mCodingSM = new nsCodingStateMachine(&Big5SMModel);
|
||||
Reset();}
|
||||
virtual ~nsBig5Prober(void){delete mCodingSM;}
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
const char* GetCharSetName() {return "Big5";}
|
||||
|
@ -63,6 +65,7 @@ protected:
|
|||
//Big5ContextAnalysis mContextAnalyser;
|
||||
Big5DistributionAnalysis mDistributionAnalyser;
|
||||
char mLastChar[2];
|
||||
PRBool mIsPreferredLanguage;
|
||||
|
||||
};
|
||||
|
||||
|
|
|
@ -91,8 +91,8 @@ nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
|
||||
float nsEUCJPProber::GetConfidence(void)
|
||||
{
|
||||
float contxtCf = mContextAnalyser.GetConfidence();
|
||||
float distribCf = mDistributionAnalyser.GetConfidence();
|
||||
float contxtCf = mContextAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||
|
||||
return (contxtCf > distribCf ? contxtCf : distribCf);
|
||||
}
|
||||
|
|
|
@ -50,8 +50,10 @@
|
|||
|
||||
class nsEUCJPProber: public nsCharSetProber {
|
||||
public:
|
||||
nsEUCJPProber(void){mCodingSM = new nsCodingStateMachine(&EUCJPSMModel);
|
||||
Reset();}
|
||||
nsEUCJPProber(PRBool aIsPreferredLanguage)
|
||||
:mIsPreferredLanguage(aIsPreferredLanguage)
|
||||
{mCodingSM = new nsCodingStateMachine(&EUCJPSMModel);
|
||||
Reset();}
|
||||
virtual ~nsEUCJPProber(void){delete mCodingSM;}
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
const char* GetCharSetName() {return "EUC-JP";}
|
||||
|
@ -68,6 +70,7 @@ protected:
|
|||
EUCJPDistributionAnalysis mDistributionAnalyser;
|
||||
|
||||
char mLastChar[2];
|
||||
PRBool mIsPreferredLanguage;
|
||||
};
|
||||
|
||||
|
||||
|
|
|
@ -84,7 +84,7 @@ nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
|
||||
float nsEUCKRProber::GetConfidence(void)
|
||||
{
|
||||
float distribCf = mDistributionAnalyser.GetConfidence();
|
||||
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||
|
||||
return (float)distribCf;
|
||||
}
|
||||
|
|
|
@ -44,8 +44,11 @@
|
|||
|
||||
class nsEUCKRProber: public nsCharSetProber {
|
||||
public:
|
||||
nsEUCKRProber(void){mCodingSM = new nsCodingStateMachine(&EUCKRSMModel);
|
||||
Reset();}
|
||||
nsEUCKRProber(PRBool aIsPreferredLanguage)
|
||||
:mIsPreferredLanguage(aIsPreferredLanguage)
|
||||
{mCodingSM = new nsCodingStateMachine(&EUCKRSMModel);
|
||||
Reset();
|
||||
}
|
||||
virtual ~nsEUCKRProber(void){delete mCodingSM;}
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
const char* GetCharSetName() {return "EUC-KR";}
|
||||
|
@ -63,6 +66,7 @@ protected:
|
|||
//EUCKRContextAnalysis mContextAnalyser;
|
||||
EUCKRDistributionAnalysis mDistributionAnalyser;
|
||||
char mLastChar[2];
|
||||
PRBool mIsPreferredLanguage;
|
||||
|
||||
};
|
||||
|
||||
|
|
|
@ -84,7 +84,7 @@ nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
|
||||
float nsEUCTWProber::GetConfidence(void)
|
||||
{
|
||||
float distribCf = mDistributionAnalyser.GetConfidence();
|
||||
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||
|
||||
return (float)distribCf;
|
||||
}
|
||||
|
|
|
@ -44,8 +44,10 @@
|
|||
|
||||
class nsEUCTWProber: public nsCharSetProber {
|
||||
public:
|
||||
nsEUCTWProber(void){mCodingSM = new nsCodingStateMachine(&EUCTWSMModel);
|
||||
Reset();}
|
||||
nsEUCTWProber(PRBool aIsPreferredLanguage)
|
||||
:mIsPreferredLanguage(aIsPreferredLanguage)
|
||||
{mCodingSM = new nsCodingStateMachine(&EUCTWSMModel);
|
||||
Reset();}
|
||||
virtual ~nsEUCTWProber(void){delete mCodingSM;}
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
const char* GetCharSetName() {return "x-euc-tw";}
|
||||
|
@ -63,6 +65,7 @@ protected:
|
|||
//EUCTWContextAnalysis mContextAnalyser;
|
||||
EUCTWDistributionAnalysis mDistributionAnalyser;
|
||||
char mLastChar[2];
|
||||
PRBool mIsPreferredLanguage;
|
||||
|
||||
};
|
||||
|
||||
|
|
|
@ -89,7 +89,7 @@ nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
|
||||
float nsGB18030Prober::GetConfidence(void)
|
||||
{
|
||||
float distribCf = mDistributionAnalyser.GetConfidence();
|
||||
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||
|
||||
return (float)distribCf;
|
||||
}
|
||||
|
|
|
@ -46,8 +46,10 @@
|
|||
|
||||
class nsGB18030Prober: public nsCharSetProber {
|
||||
public:
|
||||
nsGB18030Prober(void){mCodingSM = new nsCodingStateMachine(&GB18030SMModel);
|
||||
Reset();}
|
||||
nsGB18030Prober(PRBool aIsPreferredLanguage)
|
||||
:mIsPreferredLanguage(aIsPreferredLanguage)
|
||||
{mCodingSM = new nsCodingStateMachine(&GB18030SMModel);
|
||||
Reset();}
|
||||
virtual ~nsGB18030Prober(void){delete mCodingSM;}
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
const char* GetCharSetName() {return "gb18030";}
|
||||
|
@ -65,6 +67,7 @@ protected:
|
|||
//GB2312ContextAnalysis mContextAnalyser;
|
||||
GB2312DistributionAnalysis mDistributionAnalyser;
|
||||
char mLastChar[2];
|
||||
PRBool mIsPreferredLanguage;
|
||||
|
||||
};
|
||||
|
||||
|
|
|
@ -63,17 +63,17 @@ nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
|
|||
mProbers[0] = new nsUTF8Prober();
|
||||
if (aLanguageFilter & NS_FILTER_JAPANESE)
|
||||
{
|
||||
mProbers[1] = new nsSJISProber();
|
||||
mProbers[2] = new nsEUCJPProber();
|
||||
mProbers[1] = new nsSJISProber(aLanguageFilter == NS_FILTER_JAPANESE);
|
||||
mProbers[2] = new nsEUCJPProber(aLanguageFilter == NS_FILTER_JAPANESE);
|
||||
}
|
||||
if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED)
|
||||
mProbers[3] = new nsGB18030Prober();
|
||||
mProbers[3] = new nsGB18030Prober(aLanguageFilter == NS_FILTER_CHINESE_SIMPLIFIED);
|
||||
if (aLanguageFilter & NS_FILTER_KOREAN)
|
||||
mProbers[4] = new nsEUCKRProber();
|
||||
mProbers[4] = new nsEUCKRProber(aLanguageFilter == NS_FILTER_KOREAN);
|
||||
if (aLanguageFilter & NS_FILTER_CHINESE_TRADITIONAL)
|
||||
{
|
||||
mProbers[5] = new nsBig5Prober();
|
||||
mProbers[6] = new nsEUCTWProber();
|
||||
mProbers[5] = new nsBig5Prober(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
|
||||
mProbers[6] = new nsEUCTWProber(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
|
||||
}
|
||||
Reset();
|
||||
}
|
||||
|
|
|
@ -90,8 +90,8 @@ nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
|
||||
float nsSJISProber::GetConfidence(void)
|
||||
{
|
||||
float contxtCf = mContextAnalyser.GetConfidence();
|
||||
float distribCf = mDistributionAnalyser.GetConfidence();
|
||||
float contxtCf = mContextAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||
|
||||
return (contxtCf > distribCf ? contxtCf : distribCf);
|
||||
}
|
||||
|
|
|
@ -51,8 +51,10 @@
|
|||
|
||||
class nsSJISProber: public nsCharSetProber {
|
||||
public:
|
||||
nsSJISProber(void){mCodingSM = new nsCodingStateMachine(&SJISSMModel);
|
||||
Reset();}
|
||||
nsSJISProber(PRBool aIsPreferredLanguage)
|
||||
:mIsPreferredLanguage(aIsPreferredLanguage)
|
||||
{mCodingSM = new nsCodingStateMachine(&SJISSMModel);
|
||||
Reset();}
|
||||
virtual ~nsSJISProber(void){delete mCodingSM;}
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
const char* GetCharSetName() {return "Shift_JIS";}
|
||||
|
@ -69,6 +71,7 @@ protected:
|
|||
SJISDistributionAnalysis mDistributionAnalyser;
|
||||
|
||||
char mLastChar[2];
|
||||
PRBool mIsPreferredLanguage;
|
||||
|
||||
};
|
||||
|
||||
|
|
|
@ -83,6 +83,9 @@ _TEST_FILES = \
|
|||
test_bug426271-euc-jp.html \
|
||||
bug426271_text-utf-8.html \
|
||||
test_bug426271-utf-8.html \
|
||||
bug431054_text.html \
|
||||
test_bug431054.html \
|
||||
test_bug431054-japanese.html \
|
||||
$(NULL)
|
||||
|
||||
libs:: $(_TEST_FILES)
|
||||
|
|
|
@ -0,0 +1,5 @@
|
|||
<html>
|
||||
<body>
|
||||
ログイン
|
||||
</body>
|
||||
</html>
|
|
@ -22,7 +22,6 @@ https://bugzilla.mozilla.org/show_bug.cgi?id=306272
|
|||
CharsetDetectionTests("bug306272_text.html",
|
||||
"UTF-8",
|
||||
new Array("ja_parallel_state_machine",
|
||||
"ko_parallel_state_machine",
|
||||
"zh_parallel_state_machine",
|
||||
"zhtw_parallel_state_machine",
|
||||
"zhcn_parallel_state_machine",
|
||||
|
|
|
@ -0,0 +1,28 @@
|
|||
<!DOCTYPE HTML>
|
||||
<html>
|
||||
<!--
|
||||
https://bugzilla.mozilla.org/show_bug.cgi?id=431054
|
||||
-->
|
||||
<head>
|
||||
<title>Test for Bug 431054</title>
|
||||
<script type="text/javascript" src="/MochiKit/MochiKit.js"></script>
|
||||
<script type="text/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
|
||||
<script type="text/javascript" src="CharsetDetectionTests.js"></script>
|
||||
<link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css" />
|
||||
</head>
|
||||
<body>
|
||||
<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=431054">Mozilla Bug 431054</a>
|
||||
<p id="display"></p>
|
||||
<div id="content" style="display: none">
|
||||
</div>
|
||||
<iframe id="testframe"></iframe>
|
||||
<pre id="test">
|
||||
<script class="testbody" type="text/javascript">
|
||||
/** Test for Bug 431054 **/
|
||||
CharsetDetectionTests("bug431054_text.html",
|
||||
"EUC-JP",
|
||||
new Array("ja_parallel_state_machine"));
|
||||
</script>
|
||||
</pre>
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,32 @@
|
|||
<!DOCTYPE HTML>
|
||||
<html>
|
||||
<!--
|
||||
https://bugzilla.mozilla.org/show_bug.cgi?id=431054
|
||||
-->
|
||||
<head>
|
||||
<title>Test for Bug 431054</title>
|
||||
<script type="text/javascript" src="/MochiKit/MochiKit.js"></script>
|
||||
<script type="text/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
|
||||
<script type="text/javascript" src="CharsetDetectionTests.js"></script>
|
||||
<link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css" />
|
||||
</head>
|
||||
<body>
|
||||
<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=431054">Mozilla Bug 431054</a>
|
||||
<p id="display"></p>
|
||||
<div id="content" style="display: none">
|
||||
</div>
|
||||
<iframe id="testframe"></iframe>
|
||||
<pre id="test">
|
||||
<script class="testbody" type="text/javascript">
|
||||
/** Test for Bug 431054 **/
|
||||
CharsetDetectionTests("bug431054_text.html",
|
||||
"windows-1252",
|
||||
new Array("zhtw_parallel_state_machine",
|
||||
"zhcn_parallel_state_machine",
|
||||
"zh_parallel_state_machine",
|
||||
"cjk_parallel_state_machine",
|
||||
"universal_charset_detector"));
|
||||
</script>
|
||||
</pre>
|
||||
</body>
|
||||
</html>
|
Загрузка…
Ссылка в новой задаче