зеркало из https://github.com/mozilla/pjs.git
#162894 Extend universal detector's coverage to include iso-8859-1
Added latin1 prober. r=smontagu, sr=jst, a=asa
This commit is contained in:
Родитель
44921bc550
Коммит
d142af8049
|
@ -0,0 +1,206 @@
|
|||
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: NPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Netscape Public License
|
||||
* Version 1.1 (the "License"); you may not use this file except in
|
||||
* compliance with the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/NPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is mozilla.org code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 1998
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
*
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the NPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the NPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
#include "nsLatin1Prober.h"
|
||||
#include "prmem.h"
|
||||
|
||||
// Character classes used to index Latin1_CharToClass / Latin1ClassModel.
// The numeric values are row/column indices into the model table, so they
// must stay dense and start at zero.
enum {
  UDF = 0,        // undefined
  OTH = 1,        // other
  ASC = 2,        // ascii capital letter
  ASS = 3,        // ascii small letter
  ACV = 4,        // accent capital vowel
  ACO = 5,        // accent capital other
  ASV = 6,        // accent small vowel
  ASO = 7,        // accent small other
  CLASS_NUM = 8   // total classes
};
|
||||
|
||||
static unsigned char Latin1_CharToClass[] =
|
||||
{
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 10 - 17
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 18 - 1F
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 20 - 27
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 28 - 2F
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 30 - 37
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 38 - 3F
|
||||
OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 40 - 47
|
||||
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 48 - 4F
|
||||
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 50 - 57
|
||||
ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, // 58 - 5F
|
||||
OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 60 - 67
|
||||
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 68 - 6F
|
||||
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 70 - 77
|
||||
ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, // 78 - 7F
|
||||
OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, // 80 - 87
|
||||
OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, // 88 - 8F
|
||||
UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 90 - 97
|
||||
OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, // 98 - 9F
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A0 - A7
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A8 - AF
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B0 - B7
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B8 - BF
|
||||
ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, // C0 - C7
|
||||
ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, // C8 - CF
|
||||
ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, // D0 - D7
|
||||
ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, // D8 - DF
|
||||
ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, // E0 - E7
|
||||
ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, // E8 - EF
|
||||
ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, // F0 - F7
|
||||
ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, // F8 - FF
|
||||
};
|
||||
|
||||
|
||||
/* Likelihood of one character class being followed by another.
   The table is a flattened 8 x 8 matrix: the row is the class of the
   previous character, the column is the class of the current one, i.e.
   the entry for (prev, cur) lives at index prev * 8 + cur.

   Entry values:
     0 : illegal
     1 : very unlikely
     2 : normal
     3 : very likely

   The table is read-only, hence const.
*/
static const char Latin1ClassModel[] =
{
/*         UDF OTH ASC ASS ACV ACO ASV ASO */
/*UDF*/     0,  0,  0,  0,  0,  0,  0,  0,
/*OTH*/     0,  3,  3,  3,  3,  3,  3,  3,
/*ASC*/     0,  3,  3,  3,  3,  3,  3,  3,
/*ASS*/     0,  3,  3,  3,  1,  1,  3,  3,
/*ACV*/     0,  3,  3,  3,  1,  2,  1,  2,
/*ACO*/     0,  3,  3,  3,  3,  3,  3,  3,
/*ASV*/     0,  3,  1,  3,  1,  1,  1,  3,
/*ASO*/     0,  3,  1,  3,  1,  1,  3,  3,
};
|
||||
|
||||
void nsLatin1Prober::Reset(void)
|
||||
{
|
||||
mState = eDetecting;
|
||||
mLastCharClass = OTH;
|
||||
for (int i = 0; i < FREQ_CAT_NUM; i++)
|
||||
mFreqCounter[i] = 0;
|
||||
}
|
||||
|
||||
// Builds a reduced copy of aBuf to lower the load on the prober.
//
// The input is split into segments at every byte that is neither an ASCII
// letter nor a byte with the high bit set.  A non-empty segment that is not
// inside an HTML-like tag ('<' ... '>') is copied to the output followed by a
// single ' ' in place of the delimiter that ended it; everything else is
// dropped.
//
//   aBuf, aLen : input bytes
//   newBuf     : receives a buffer obtained from PR_MALLOC(aLen); the caller
//                owns it and must free it
//   newLen     : receives the number of bytes written to *newBuf
//
// Returns PR_FALSE (leaving newLen untouched) when the allocation fails,
// PR_TRUE otherwise.
PRBool nsLatin1Prober::FilterWithEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen)
{
  // aLen bytes are always enough: a kept segment contributes its own bytes
  // plus one ' ' that stands in for the delimiter byte which ended it.
  char *newptr = *newBuf = (char*)PR_MALLOC(aLen);
  if (!newptr)
    return PR_FALSE;

  const char *const endPtr = aBuf + aLen;
  const char *prevPtr = aBuf;     // start of the segment currently scanned
  const char *curPtr;
  PRBool isInTag = PR_FALSE;      // is the segment at prevPtr inside <...>?

  for (curPtr = aBuf; curPtr < endPtr; curPtr++)
  {
    const char ch = *curPtr;

    // High-bit bytes and ASCII letters extend the current segment.
    if ((ch & 0x80) ||
        (ch >= 'A' && ch <= 'Z') ||
        (ch >= 'a' && ch <= 'z'))
      continue;

    // ch is a delimiter, so the segment [prevPtr, curPtr) is complete.
    // Judge it with the tag state that was in effect *before* ch: text
    // that directly precedes '<' is ordinary content and is kept, while a
    // tag name that directly precedes '>' is markup and is dropped.
    if (curPtr > prevPtr && !isInTag)
    {
      while (prevPtr < curPtr)
        *newptr++ = *prevPtr++;
      *newptr++ = ' ';
    }
    prevPtr = curPtr + 1;

    // Only now let the delimiter change the tag state for what follows.
    if (ch == '<')
      isInTag = PR_TRUE;
    else if (ch == '>')
      isInTag = PR_FALSE;
  }

  // The input may end in the middle of a segment (no closing delimiter).
  // Keep those bytes as well instead of silently discarding them.
  if (!isInTag)
    while (prevPtr < endPtr)
      *newptr++ = *prevPtr++;

  newLen = (PRUint32)(newptr - *newBuf);

  return PR_TRUE;
}
|
||||
|
||||
|
||||
// Feeds a chunk of input to the prober.
//
// The chunk is first reduced by FilterWithEnglishLetters(); if that fails the
// raw input is analysed instead.  Every byte is mapped to a character class
// and the (previous class, current class) pair is looked up in
// Latin1ClassModel.  A likelihood of 0 marks the pair as illegal and moves
// the prober to eNotMe; otherwise the counter for that likelihood is bumped.
//
// Returns the prober state after the chunk has been processed.
nsProbingState nsLatin1Prober::HandleData(const char* aBuf, PRUint32 aLen)
{
  char *newBuf1;
  PRUint32 newLen1;

  if (!FilterWithEnglishLetters(aBuf, aLen, &newBuf1, newLen1)) {
    // Filtering failed; fall back to the unfiltered input.
    newBuf1 = (char*)aBuf;
    newLen1 = aLen;
  }

  unsigned char charClass;
  unsigned char freq;
  for (PRUint32 i = 0; i < newLen1; i++)
  {
    // Plain char may be signed, in which case bytes >= 0x80 would produce a
    // negative index and read outside the 256-entry table.  Index with the
    // unsigned byte value instead.
    charClass = Latin1_CharToClass[(unsigned char)newBuf1[i]];
    freq = Latin1ClassModel[mLastCharClass*CLASS_NUM + charClass];
    if (freq == 0) {
      mState = eNotMe;
      break;
    }
    mFreqCounter[freq]++;
    mLastCharClass = charClass;
  }

  // Only release the buffer when the filter actually allocated one.
  if (newBuf1 != aBuf)
    PR_FREEIF(newBuf1);

  return mState;
}
|
||||
|
||||
float nsLatin1Prober::GetConfidence(void)
|
||||
{
|
||||
if (mState == eNotMe)
|
||||
return 0.01f;
|
||||
|
||||
float confidence;
|
||||
PRUint32 total = 0;
|
||||
for (PRInt32 i = 0; i < FREQ_CAT_NUM; i++)
|
||||
total += mFreqCounter[i];
|
||||
|
||||
confidence = mFreqCounter[3]*1.0f / total;
|
||||
confidence -= mFreqCounter[1]*20.0f/total;
|
||||
|
||||
if (confidence < 0.0f)
|
||||
confidence = 0.0f;
|
||||
|
||||
// lower the confidence of latin1 so that other more accurate detector
|
||||
// can take priority.
|
||||
confidence *= 0.60f;
|
||||
|
||||
return confidence;
|
||||
}
|
||||
|
|
@ -0,0 +1,67 @@
|
|||
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: NPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Netscape Public License
|
||||
* Version 1.1 (the "License"); you may not use this file except in
|
||||
* compliance with the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/NPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is mozilla.org code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 1998
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
*
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the NPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the NPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
#ifndef nsLatin1Prober_h__
|
||||
#define nsLatin1Prober_h__
|
||||
|
||||
#include "nsCharSetProber.h"
|
||||
|
||||
#define FREQ_CAT_NUM 4
|
||||
|
||||
class nsLatin1Prober: public nsCharSetProber {
|
||||
public:
|
||||
nsLatin1Prober(void){Reset();};
|
||||
virtual ~nsLatin1Prober(void){};
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
const char* GetCharSetName() {return "windows-1252";};
|
||||
nsProbingState GetState(void) {return mState;};
|
||||
void Reset(void);
|
||||
float GetConfidence(void);
|
||||
void SetOpion() {};
|
||||
|
||||
protected:
|
||||
PRBool FilterWithEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen);
|
||||
|
||||
nsProbingState mState;
|
||||
char mLastCharClass;
|
||||
PRUint32 mFreqCounter[FREQ_CAT_NUM];
|
||||
};
|
||||
|
||||
|
||||
#endif /* nsLatin1Prober_h__ */
|
||||
|
|
@ -52,6 +52,7 @@
|
|||
#include "nsMBCSGroupProber.h"
|
||||
#include "nsSBCSGroupProber.h"
|
||||
#include "nsEscCharsetProber.h"
|
||||
#include "nsLatin1Prober.h"
|
||||
|
||||
static NS_DEFINE_CID(kUniversalDetectorCID, NS_UNIVERSAL_DETECTOR_CID);
|
||||
static NS_DEFINE_CID(kUniversalStringDetectorCID, NS_UNIVERSAL_STRING_DETECTOR_CID);
|
||||
|
@ -156,7 +157,10 @@ void nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
if (nsnull == mCharSetProbers[0])
|
||||
mCharSetProbers[0] = new nsMBCSGroupProber;
|
||||
if (nsnull == mCharSetProbers[1])
|
||||
mCharSetProbers[1] = new nsSBCSGroupProber; }
|
||||
mCharSetProbers[1] = new nsSBCSGroupProber;
|
||||
if (nsnull == mCharSetProbers[2])
|
||||
mCharSetProbers[2] = new nsLatin1Prober;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
|
|
@ -56,7 +56,7 @@
|
|||
|
||||
class nsCharSetProber;
|
||||
|
||||
#define NUM_OF_CHARSET_PROBERS 2
|
||||
#define NUM_OF_CHARSET_PROBERS 3
|
||||
|
||||
typedef enum {
|
||||
ePureAscii = 0,
|
||||
|
|
Загрузка…
Ссылка в новой задаче