1999-06-25 05:33:14 +04:00
|
|
|
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-
|
|
|
|
*
|
1999-11-06 06:43:54 +03:00
|
|
|
* The contents of this file are subject to the Netscape Public
|
|
|
|
* License Version 1.1 (the "License"); you may not use this file
|
|
|
|
* except in compliance with the License. You may obtain a copy of
|
|
|
|
* the License at http://www.mozilla.org/NPL/
|
1999-06-25 05:33:14 +04:00
|
|
|
*
|
1999-11-06 06:43:54 +03:00
|
|
|
* Software distributed under the License is distributed on an "AS
|
|
|
|
* IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
|
|
|
|
* implied. See the License for the specific language governing
|
|
|
|
* rights and limitations under the License.
|
1999-06-25 05:33:14 +04:00
|
|
|
*
|
1999-11-06 06:43:54 +03:00
|
|
|
* The Original Code is mozilla.org code.
|
|
|
|
*
|
|
|
|
* The Initial Developer of the Original Code is Netscape
|
1999-06-25 05:33:14 +04:00
|
|
|
* Communications Corporation. Portions created by Netscape are
|
1999-11-06 06:43:54 +03:00
|
|
|
* Copyright (C) 1998 Netscape Communications Corporation. All
|
|
|
|
* Rights Reserved.
|
|
|
|
*
|
|
|
|
* Contributor(s):
|
2000-01-11 23:49:15 +03:00
|
|
|
* Pierre Phaneuf <pp@ludusdesign.com>
|
1999-06-25 05:33:14 +04:00
|
|
|
*/
|
|
|
|
#include <iostream.h>
#include "nsISupports.h"
#include "nsIComponentManager.h"
#include "nsICharsetDetector.h"
#include "nsICharsetDetectionObserver.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef XP_PC
#include <io.h>
#endif
#ifdef XP_UNIX
#include <unistd.h>
#endif
|
1999-06-25 05:33:14 +04:00
|
|
|
|
|
|
|
|
1999-07-17 00:52:07 +04:00
|
|
|
class nsStatis {
|
|
|
|
public:
|
|
|
|
nsStatis() { };
|
|
|
|
virtual ~nsStatis() { };
|
|
|
|
virtual PRBool HandleData(const char* aBuf, PRUint32 aLen) = 0;
|
|
|
|
virtual void DataEnd() = 0;
|
|
|
|
virtual void Report()=0;
|
|
|
|
};
|
|
|
|
|
|
|
|
class nsBaseStatis : public nsStatis {
|
|
|
|
public:
|
|
|
|
nsBaseStatis(unsigned char aL, unsigned char aH, float aR) ;
|
|
|
|
virtual ~nsBaseStatis() {};
|
|
|
|
virtual PRBool HandleData(const char* aBuf, PRUint32 aLen);
|
|
|
|
virtual void DataEnd() ;
|
|
|
|
virtual void Report();
|
|
|
|
protected:
|
|
|
|
unsigned char mLWordHi;
|
|
|
|
unsigned char mLWordLo;
|
|
|
|
private:
|
|
|
|
PRUint32 mNumOf2Bytes;
|
|
|
|
PRUint32 mNumOfLChar;
|
|
|
|
PRUint32 mNumOfLWord;
|
|
|
|
PRUint32 mLWordLength;
|
|
|
|
PRUint32 mLWordLen[10];
|
|
|
|
float mR;
|
|
|
|
PRBool mTailByte;
|
|
|
|
PRBool mLastLChar;
|
|
|
|
};
|
|
|
|
// Initializes all counters to zero and records the lead-byte range
// [aL, aH] together with the reference ratio aR.
//
// The histogram is cleared using the array's real element count; the
// previous hard-coded bound of 20 wrote past the end of the 10-entry
// mLWordLen array.
nsBaseStatis::nsBaseStatis(unsigned char aL, unsigned char aH, float aR)
{
  mNumOf2Bytes = mNumOfLWord = mLWordLength = mNumOfLChar = 0;
  mTailByte = mLastLChar = PR_FALSE;

  const PRUint32 kSlots = sizeof(mLWordLen) / sizeof(mLWordLen[0]);
  for (PRUint32 i = 0; i < kSlots; i++)
    mLWordLen[i] = 0;

  mLWordHi = aH;
  mLWordLo = aL;
  mR = aR;
}
|
|
|
|
// Scans aLen bytes of aBuf and updates the counters.
//
// A byte with the high bit set is treated as the first byte of a two-byte
// character and the byte following it is skipped.  Each such lead byte is
// tested against [mLWordLo, mLWordHi]; consecutive hits form a run whose
// length is recorded in the histogram as soon as the run is broken, either
// by an out-of-range lead byte or by a byte without the high bit.
// State is kept across calls, so a pair may straddle two buffers.
// Always returns PR_TRUE.
PRBool nsBaseStatis::HandleData(const char* aBuf, PRUint32 aLen)
{
  for (PRUint32 pos = 0; pos < aLen; ++pos)
  {
    if (mTailByte) {
      // Second byte of a pair: nothing to count.
      mTailByte = PR_FALSE;
      continue;
    }

    const unsigned char byte = (unsigned char) aBuf[pos];
    mTailByte = ((byte & 0x80) != 0);

    PRBool inRange = PR_FALSE;
    if (mTailByte) {
      ++mNumOf2Bytes;
      inRange = ((mLWordLo <= byte) && (byte <= mLWordHi));
    }

    if (inRange) {
      ++mNumOfLChar;
      ++mLWordLength;
    } else if (mLastLChar) {
      // The run that was in progress ends here; file it in the histogram.
      const PRUint32 slot = (mLWordLength > 10) ? 9 : (mLWordLength - 1);
      ++mNumOfLWord;
      ++mLWordLen[slot];
      mLWordLength = 0;
    }
    mLastLChar = inRange;
  }
  return PR_TRUE;
}
|
|
|
|
void nsBaseStatis::DataEnd()
|
|
|
|
{
|
|
|
|
if(mLastLChar) {
|
|
|
|
mNumOfLWord++;
|
|
|
|
mLWordLen[ (mLWordLength > 10) ? 9 : (mLWordLength-1)]++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
void nsBaseStatis::Report()
|
|
|
|
{
|
|
|
|
if(mNumOf2Bytes > 0)
|
|
|
|
{
|
|
|
|
/*
|
2000-10-29 02:17:53 +04:00
|
|
|
printf("LChar Ratio = %d : %d ( %5.3f)\n",
|
1999-07-17 00:52:07 +04:00
|
|
|
mNumOfLChar,
|
|
|
|
mNumOf2Bytes,
|
2000-10-29 02:17:53 +04:00
|
|
|
((float)mNumOfLChar / (float)mNumOf2Bytes) * 100);
|
1999-07-17 00:52:07 +04:00
|
|
|
*/
|
|
|
|
float rate = (float) mNumOfLChar / (float) mNumOf2Bytes;
|
|
|
|
float delta = (rate - mR) / mR;
|
|
|
|
delta *= delta * 1000;
|
|
|
|
#ifdef EXPERIMENT
|
2000-10-29 02:17:53 +04:00
|
|
|
printf("Exp = %f \n",delta);
|
1999-07-17 00:52:07 +04:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
|
|
|
if(mNumOfLChar > 0)
|
2000-10-29 02:17:53 +04:00
|
|
|
printf("LWord Word = %d : %d (%5.3f)\n",
|
1999-07-17 00:52:07 +04:00
|
|
|
mNumOfLWord,
|
|
|
|
mNumOfLChar,
|
2000-10-29 02:17:53 +04:00
|
|
|
((float)mNumOfLWord / (float)mNumOfLChar) * 100);
|
1999-07-17 00:52:07 +04:00
|
|
|
if(mNumOfLWord > 0)
|
|
|
|
{
|
|
|
|
PRUint32 ac =0;
|
|
|
|
for(PRUint32 i=0;i<10;i++)
|
|
|
|
{
|
|
|
|
ac += mLWordLen[i];
|
2000-10-29 02:17:53 +04:00
|
|
|
printf("LWord Word Length[%d]= %d -> %5.3f%% %5.3f%%\n", i+1,
|
1999-07-17 00:52:07 +04:00
|
|
|
mLWordLen[i],
|
|
|
|
(((float)mLWordLen[i] / (float)mNumOfLWord) * 100),
|
|
|
|
(((float)ac / (float)mNumOfLWord) * 100));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
*/
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
class nsSimpleStatis : public nsStatis {
|
|
|
|
public:
|
|
|
|
nsSimpleStatis(unsigned char aL, unsigned char aH, float aR,const char* aCharset) ;
|
|
|
|
virtual ~nsSimpleStatis() {};
|
|
|
|
virtual PRBool HandleData(const char* aBuf, PRUint32 aLen);
|
|
|
|
virtual void DataEnd() ;
|
|
|
|
virtual void Report();
|
|
|
|
protected:
|
|
|
|
unsigned char mLWordHi;
|
|
|
|
unsigned char mLWordLo;
|
|
|
|
private:
|
|
|
|
PRUint32 mNumOf2Bytes;
|
|
|
|
PRUint32 mNumOfLChar;
|
|
|
|
float mR;
|
|
|
|
const char* mCharset;
|
|
|
|
PRBool mTailByte;
|
|
|
|
};
|
|
|
|
// Stores the lead-byte range, reference ratio and charset name, and starts
// with empty counters.  aCharset is kept by pointer only.
nsSimpleStatis::nsSimpleStatis(unsigned char aL, unsigned char aH, float aR, const char* aCharset)
{
  // Configuration supplied by the caller.
  mLWordLo = aL;
  mLWordHi = aH;
  mR       = aR;
  mCharset = aCharset;

  // Running state.
  mNumOf2Bytes = 0;
  mNumOfLChar  = 0;
  mTailByte    = PR_FALSE;
}
|
|
|
|
// Scans aLen bytes of aBuf.  A byte with the high bit set is counted as the
// lead byte of a two-byte character (the byte after it is skipped) and is
// additionally counted as an "L char" when it lies in [mLWordLo, mLWordHi].
// The skip state survives across calls.  Always returns PR_TRUE.
PRBool nsSimpleStatis::HandleData(const char* aBuf, PRUint32 aLen)
{
  for (PRUint32 pos = 0; pos < aLen; ++pos)
  {
    if (mTailByte) {
      // Second byte of a pair: ignore it.
      mTailByte = PR_FALSE;
      continue;
    }

    const unsigned char lead = (unsigned char) aBuf[pos];
    mTailByte = ((lead & 0x80) != 0);
    if (!mTailByte)
      continue;

    ++mNumOf2Bytes;
    if ((mLWordLo <= lead) && (lead <= mLWordHi))
      ++mNumOfLChar;
  }
  return PR_TRUE;
}
|
|
|
|
// Nothing to finalize: this collector keeps no pending state that would
// need flushing at end of input.
void nsSimpleStatis::DataEnd()
{
  // intentionally empty
}
|
|
|
|
void nsSimpleStatis::Report()
|
|
|
|
{
|
|
|
|
if(mNumOf2Bytes > 0)
|
|
|
|
{
|
|
|
|
float rate = (float) mNumOfLChar / (float) mNumOf2Bytes;
|
|
|
|
float delta = (rate - mR) / mR;
|
|
|
|
delta = delta * delta * (float)100;
|
|
|
|
#ifdef EXPERIMENT
|
2000-10-29 02:17:53 +04:00
|
|
|
printf("Exp = %f \n",delta);
|
1999-07-17 00:52:07 +04:00
|
|
|
if(delta < 1.0)
|
2000-10-29 02:17:53 +04:00
|
|
|
printf("This is %s\n" ,mCharset);
|
1999-07-17 00:52:07 +04:00
|
|
|
#endif
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
//==========================================================
|
|
|
|
|
|
|
|
|
1999-06-25 05:33:14 +04:00
|
|
|
// Largest block size accepted on the command line; main() also uses it as
// the size of its read buffer.  Kept as a long so it matches the %ld in
// usage().
#define MAXBSIZE (1L << 13)
|
|
|
|
|
|
|
|
void usage() {
|
2000-10-29 02:17:53 +04:00
|
|
|
printf("Usage: DetectFile detector blocksize\n"
|
1999-06-25 05:33:14 +04:00
|
|
|
" detector: "
|
2000-04-05 04:14:52 +04:00
|
|
|
"ja_parallel_state_machine,"
|
|
|
|
"ko_parallel_state_machine,"
|
|
|
|
"zhcn_parallel_state_machine,"
|
|
|
|
"zhtw_parallel_state_machine,"
|
|
|
|
"zh_parallel_state_machine,"
|
|
|
|
"cjk_parallel_state_machine,"
|
1999-07-17 00:52:07 +04:00
|
|
|
"ruprob,"
|
|
|
|
"ukprob,"
|
1999-06-25 05:33:14 +04:00
|
|
|
"\n blocksize: 1 ~ %ld\n"
|
|
|
|
" Data are passed in from STDIN\n"
|
|
|
|
, MAXBSIZE);
|
|
|
|
}
|
|
|
|
|
|
|
|
class nsReporter : public nsICharsetDetectionObserver
|
|
|
|
{
|
|
|
|
NS_DECL_ISUPPORTS
|
|
|
|
public:
|
|
|
|
nsReporter() { NS_INIT_REFCNT(); };
|
|
|
|
virtual ~nsReporter() { };
|
|
|
|
|
|
|
|
NS_IMETHOD Notify(const char* aCharset, nsDetectionConfident aConf)
|
|
|
|
{
|
2000-10-29 02:17:53 +04:00
|
|
|
printf("RESULT CHARSET : %s\n", aCharset);
|
|
|
|
printf("RESULT Confident : %d\n", aConf);
|
1999-06-25 05:33:14 +04:00
|
|
|
return NS_OK;
|
|
|
|
};
|
|
|
|
};
|
|
|
|
|
|
|
|
|
2000-01-11 23:49:15 +03:00
|
|
|
// Supplies the out-of-line definitions matching NS_DECL_ISUPPORTS in
// nsReporter, with nsICharsetDetectionObserver as the supported interface.
// NOTE(review): presumably AddRef/Release/QueryInterface - confirm against
// the macro definition.
NS_IMPL_ISUPPORTS(nsReporter, NS_GET_IID(nsICharsetDetectionObserver))
|
1999-06-25 05:33:14 +04:00
|
|
|
|
|
|
|
// Creates the charset detector whose contract ID is
// NS_CHARSET_DETECTOR_CONTRACTID_BASE followed by key.
//
//   key  detector name (taken from the command line by main()).
//   det  receives the created instance.
//
// Returns the result of nsComponentManager::CreateInstance().
//
// The contract ID is assembled with bounded appends: key comes straight
// from argv, and the previous strcpy/strcat pair could write past the end
// of the 128-byte buffer.  An over-long name is truncated rather than
// overflowing; a truncated ID is not expected to match any registered
// component, so the lookup then reports failure to the caller.
nsresult GetDetector(const char* key, nsICharsetDetector** det)
{
  char buf[128];
  buf[0] = '\0';
  strncat(buf, NS_CHARSET_DETECTOR_CONTRACTID_BASE, sizeof(buf) - 1);
  strncat(buf, key, sizeof(buf) - 1 - strlen(buf));

  return nsComponentManager::CreateInstance(
            buf,
            nsnull,
            NS_GET_IID(nsICharsetDetector),
            (void**)det);
}
|
|
|
|
|
|
|
|
|
|
|
|
// Creates an nsReporter and hands it back through aRes as an
// nsICharsetDetectionObserver.
//
//   aRes  set to nsnull first; on success receives one reference that the
//         caller must release.
//
// Returns NS_ERROR_OUT_OF_MEMORY if the object could not be allocated,
// otherwise the result of QueryInterface().
//
// A temporary reference is held across QueryInterface() so that the object
// is destroyed if the query fails; previously it was leaked in that case.
// On success the temporary reference is dropped again, leaving exactly the
// one reference that QueryInterface() gave to *aRes.
nsresult GetObserver(nsICharsetDetectionObserver** aRes)
{
  *aRes = nsnull;
  nsReporter* rep = new nsReporter();
  if (!rep)
    return NS_ERROR_OUT_OF_MEMORY;

  rep->AddRef();
  nsresult rv = rep->QueryInterface(NS_GET_IID(nsICharsetDetectionObserver) ,
                                    (void**)aRes);
  rep->Release();
  return rv;
}
|
|
|
|
|
|
|
|
int main(int argc, char** argv) {
|
|
|
|
char buf[MAXBSIZE];
|
|
|
|
PRUint32 bs;
|
|
|
|
if( 3 != argc )
|
|
|
|
{
|
|
|
|
usage();
|
2000-10-29 02:17:53 +04:00
|
|
|
printf("Need 2 arguments\n");
|
1999-06-25 05:33:14 +04:00
|
|
|
return(-1);
|
|
|
|
}
|
|
|
|
bs = atoi(argv[2]);
|
|
|
|
if((bs <1)||(bs>MAXBSIZE))
|
|
|
|
{
|
|
|
|
usage();
|
2000-10-29 02:17:53 +04:00
|
|
|
printf("blocksize out of range - %s\n", argv[2]);
|
1999-06-25 05:33:14 +04:00
|
|
|
return(-1);
|
|
|
|
}
|
|
|
|
nsresult rev = NS_OK;
|
|
|
|
nsICharsetDetector *det = nsnull;
|
|
|
|
rev = GetDetector(argv[1], &det);
|
|
|
|
if(NS_FAILED(rev) || (nsnull == det) ){
|
|
|
|
usage();
|
2000-10-29 02:17:53 +04:00
|
|
|
printf("Invalid Detector - %s\n", argv[1]);
|
|
|
|
printf("XPCOM ERROR CODE = %x\n", rev);
|
1999-06-25 05:33:14 +04:00
|
|
|
return(-1);
|
|
|
|
}
|
|
|
|
nsICharsetDetectionObserver *obs = nsnull;
|
|
|
|
rev = GetObserver(&obs);
|
|
|
|
if(NS_SUCCEEDED(rev)) {
|
|
|
|
rev = det->Init(obs);
|
|
|
|
NS_IF_RELEASE(obs);
|
|
|
|
if(NS_FAILED(rev))
|
|
|
|
{
|
2000-10-29 02:17:53 +04:00
|
|
|
printf("XPCOM ERROR CODE = %x\n", rev);
|
1999-06-25 05:33:14 +04:00
|
|
|
return(-1);
|
|
|
|
}
|
|
|
|
} else {
|
2000-10-29 02:17:53 +04:00
|
|
|
printf("XPCOM ERROR CODE = %x\n", rev);
|
1999-06-25 05:33:14 +04:00
|
|
|
return(-1);
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t sz;
|
|
|
|
PRBool done = PR_FALSE;
|
1999-07-17 00:52:07 +04:00
|
|
|
nsSimpleStatis ks(0xb0,0xc8, (float)0.95952, "EUC-KR");
|
|
|
|
nsSimpleStatis js(0xa4,0xa5, (float)0.45006, "EUC-JP");
|
|
|
|
nsStatis* stat[2] = {&ks, &js};
|
|
|
|
PRUint32 i;
|
1999-06-25 05:33:14 +04:00
|
|
|
do
|
|
|
|
{
|
|
|
|
sz = read(0, buf, bs);
|
|
|
|
if(sz > 0) {
|
1999-07-17 00:52:07 +04:00
|
|
|
if(! done) {
|
2000-10-29 02:17:53 +04:00
|
|
|
printf("call DoIt %d\n",sz);
|
1999-07-17 00:52:07 +04:00
|
|
|
rev = det->DoIt( buf, sz, &done);
|
2000-10-29 02:17:53 +04:00
|
|
|
printf("DoIt return Done = %d\n",done);
|
1999-07-17 00:52:07 +04:00
|
|
|
if(NS_FAILED(rev))
|
|
|
|
{
|
2000-10-29 02:17:53 +04:00
|
|
|
printf("XPCOM ERROR CODE = %x\n", rev);
|
1999-07-17 00:52:07 +04:00
|
|
|
return(-1);
|
|
|
|
}
|
1999-06-25 05:33:14 +04:00
|
|
|
}
|
1999-07-17 00:52:07 +04:00
|
|
|
for(i=0;i<2;i++)
|
|
|
|
stat[i]->HandleData(buf, sz);
|
1999-06-25 05:33:14 +04:00
|
|
|
}
|
1999-07-17 00:52:07 +04:00
|
|
|
// } while((sz > 0) && (!done) );
|
|
|
|
} while(sz > 0);
|
|
|
|
if(!done)
|
1999-06-25 05:33:14 +04:00
|
|
|
{
|
2000-10-29 02:17:53 +04:00
|
|
|
printf("Done = %d\n",done);
|
|
|
|
printf("call Done %d\n",sz);
|
1999-07-17 00:52:07 +04:00
|
|
|
rev = det->Done();
|
|
|
|
if(NS_FAILED(rev))
|
|
|
|
{
|
2000-10-29 02:17:53 +04:00
|
|
|
printf("XPCOM ERROR CODE = %x\n", rev);
|
1999-07-17 00:52:07 +04:00
|
|
|
return(-1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for(i=0;i<2;i++) {
|
|
|
|
stat[i]->DataEnd();
|
|
|
|
stat[i]->Report();
|
1999-06-25 05:33:14 +04:00
|
|
|
}
|
2000-10-29 02:17:53 +04:00
|
|
|
printf( "Done\n");
|
1999-06-25 05:33:14 +04:00
|
|
|
|
|
|
|
NS_IF_RELEASE(det);
|
2000-10-29 02:17:53 +04:00
|
|
|
printf( "Done 2\n");
|
1999-06-25 05:33:14 +04:00
|
|
|
return (0);
|
|
|
|
}
|