From 10ed7b66ae363b2b250e38af5a1b5064bf374800 Mon Sep 17 00:00:00 2001 From: "ftang%netscape.com" Date: Tue, 16 May 2000 22:50:16 +0000 Subject: [PATCH] iimprove charset detectors to use statistic model --- intl/chardet/src/Big5Statistics.h | 221 ++++++++++++++++++++++ intl/chardet/src/EUCJPStatistics.h | 221 ++++++++++++++++++++++ intl/chardet/src/EUCKRStatistics.h | 221 ++++++++++++++++++++++ intl/chardet/src/EUCTWStatistics.h | 221 ++++++++++++++++++++++ intl/chardet/src/GB2312Statistics.h | 221 ++++++++++++++++++++++ intl/chardet/src/nsPSMDetectors.cpp | 284 ++++++++++++++++++++++++++-- 6 files changed, 1369 insertions(+), 20 deletions(-) create mode 100644 intl/chardet/src/Big5Statistics.h create mode 100644 intl/chardet/src/EUCJPStatistics.h create mode 100644 intl/chardet/src/EUCKRStatistics.h create mode 100644 intl/chardet/src/EUCTWStatistics.h create mode 100644 intl/chardet/src/GB2312Statistics.h diff --git a/intl/chardet/src/Big5Statistics.h b/intl/chardet/src/Big5Statistics.h new file mode 100644 index 00000000000..4e8090ec117 --- /dev/null +++ b/intl/chardet/src/Big5Statistics.h @@ -0,0 +1,221 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- + * + * The contents of this file are subject to the Netscape Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/NPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + * The Original Code is mozilla.org code. + * + * The Initial Developer of the Original Code is Netscape + * Communications Corporation. Portions created by Netscape are + * Copyright (C) 1998 Netscape Communications Corporation. All + * Rights Reserved. + * + * Contributor(s): + */ +{ + { + 0.000000f, // FreqH[a1] + 0.000000f, // FreqH[a2] + 0.000000f, // FreqH[a3] + 0.114427f, // FreqH[a4] + 0.061058f, // FreqH[a5] + 0.075598f, // FreqH[a6] + 0.048386f, // FreqH[a7] + 0.063966f, // FreqH[a8] + 0.027094f, // FreqH[a9] + 0.095787f, // FreqH[aa] + 0.029525f, // FreqH[ab] + 0.031331f, // FreqH[ac] + 0.036915f, // FreqH[ad] + 0.021805f, // FreqH[ae] + 0.019349f, // FreqH[af] + 0.037496f, // FreqH[b0] + 0.018068f, // FreqH[b1] + 0.012760f, // FreqH[b2] + 0.030053f, // FreqH[b3] + 0.017339f, // FreqH[b4] + 0.016731f, // FreqH[b5] + 0.019501f, // FreqH[b6] + 0.011240f, // FreqH[b7] + 0.032973f, // FreqH[b8] + 0.016658f, // FreqH[b9] + 0.015872f, // FreqH[ba] + 0.021458f, // FreqH[bb] + 0.012378f, // FreqH[bc] + 0.017003f, // FreqH[bd] + 0.020802f, // FreqH[be] + 0.012454f, // FreqH[bf] + 0.009239f, // FreqH[c0] + 0.012829f, // FreqH[c1] + 0.007922f, // FreqH[c2] + 0.010079f, // FreqH[c3] + 0.009815f, // FreqH[c4] + 0.010104f, // FreqH[c5] + 0.000000f, // FreqH[c6] + 0.000000f, // FreqH[c7] + 0.000000f, // FreqH[c8] + 0.000053f, // FreqH[c9] + 0.000035f, // FreqH[ca] + 0.000105f, // FreqH[cb] + 0.000031f, // FreqH[cc] + 0.000088f, // FreqH[cd] + 0.000027f, // FreqH[ce] + 0.000027f, // FreqH[cf] + 0.000026f, // FreqH[d0] + 0.000035f, // FreqH[d1] + 0.000024f, // FreqH[d2] + 0.000034f, // FreqH[d3] + 0.000375f, // FreqH[d4] + 0.000025f, // FreqH[d5] + 0.000028f, // FreqH[d6] + 0.000020f, // FreqH[d7] + 0.000024f, // FreqH[d8] + 0.000028f, // FreqH[d9] + 0.000031f, // FreqH[da] + 0.000059f, // FreqH[db] + 0.000040f, // FreqH[dc] + 0.000030f, // FreqH[dd] + 0.000079f, // FreqH[de] + 0.000037f, // FreqH[df] + 0.000040f, // FreqH[e0] + 0.000023f, // FreqH[e1] + 0.000030f, // FreqH[e2] + 0.000027f, // FreqH[e3] + 0.000064f, // FreqH[e4] + 0.000020f, // FreqH[e5] + 0.000027f, // FreqH[e6] + 0.000025f, // FreqH[e7] + 0.000074f, // FreqH[e8] + 0.000019f, // FreqH[e9] + 0.000023f, // FreqH[ea] + 0.000021f, // FreqH[eb] + 0.000018f, // FreqH[ec] + 0.000017f, // FreqH[ed] + 0.000035f, // FreqH[ee] + 0.000021f, // FreqH[ef] + 0.000019f, // FreqH[f0] + 0.000025f, // FreqH[f1] + 0.000017f, // FreqH[f2] + 0.000037f, // FreqH[f3] + 0.000018f, // FreqH[f4] + 0.000018f, // FreqH[f5] + 0.000019f, // FreqH[f6] + 0.000022f, // FreqH[f7] + 0.000033f, // FreqH[f8] + 0.000032f, // FreqH[f9] + 0.000000f, // FreqH[fa] + 0.000000f, // FreqH[fb] + 0.000000f, // FreqH[fc] + 0.000000f, // FreqH[fd] + 0.000000f // FreqH[fe] + }, +0.020606f, // Lead Byte StdDev +0.010638f, // Lead Byte Mean +0.675261f, // Lead Byte Weight + { + 0.020256f, // FreqL[a1] + 0.003293f, // FreqL[a2] + 0.045811f, // FreqL[a3] + 0.016650f, // FreqL[a4] + 0.007066f, // FreqL[a5] + 0.004146f, // FreqL[a6] + 0.009229f, // FreqL[a7] + 0.007333f, // FreqL[a8] + 0.003296f, // FreqL[a9] + 0.005239f, // FreqL[aa] + 0.008282f, // FreqL[ab] + 0.003791f, // FreqL[ac] + 0.006116f, // FreqL[ad] + 0.003536f, // FreqL[ae] + 0.004024f, // FreqL[af] + 0.016654f, // FreqL[b0] + 0.009334f, // FreqL[b1] + 0.005429f, // FreqL[b2] + 0.033392f, // FreqL[b3] + 0.006121f, // FreqL[b4] + 0.008983f, // FreqL[b5] + 0.002801f, // FreqL[b6] + 0.004221f, // FreqL[b7] + 0.010357f, // FreqL[b8] + 0.014695f, // FreqL[b9] + 0.077937f, // FreqL[ba] + 0.006314f, // FreqL[bb] + 0.004020f, // FreqL[bc] + 0.007331f, // FreqL[bd] + 0.007150f, // FreqL[be] + 0.005341f, // FreqL[bf] + 0.009195f, // FreqL[c0] + 0.005350f, // FreqL[c1] + 0.005698f, // FreqL[c2] + 0.004472f, // FreqL[c3] + 0.007242f, // FreqL[c4] + 0.004039f, // FreqL[c5] + 0.011154f, // FreqL[c6] + 0.016184f, // FreqL[c7] + 0.004741f, // FreqL[c8] + 0.012814f, // FreqL[c9] + 0.007679f, // FreqL[ca] + 0.008045f, // FreqL[cb] + 0.016631f, // FreqL[cc] + 0.009451f, // FreqL[cd] + 0.016487f, // FreqL[ce] + 0.007287f, // FreqL[cf] + 0.012688f, // FreqL[d0] + 0.017421f, // FreqL[d1] + 0.013205f, // FreqL[d2] + 0.031480f, // FreqL[d3] + 0.003404f, // FreqL[d4] + 0.009149f, // FreqL[d5] + 0.008921f, // FreqL[d6] + 0.007514f, // FreqL[d7] + 0.008683f, // FreqL[d8] + 0.008203f, // FreqL[d9] + 0.031403f, // FreqL[da] + 0.011733f, // FreqL[db] + 0.015617f, // FreqL[dc] + 0.015306f, // FreqL[dd] + 0.004004f, // FreqL[de] + 0.010899f, // FreqL[df] + 0.009961f, // FreqL[e0] + 0.008388f, // FreqL[e1] + 0.010920f, // FreqL[e2] + 0.003925f, // FreqL[e3] + 0.008585f, // FreqL[e4] + 0.009108f, // FreqL[e5] + 0.015546f, // FreqL[e6] + 0.004659f, // FreqL[e7] + 0.006934f, // FreqL[e8] + 0.007023f, // FreqL[e9] + 0.020252f, // FreqL[ea] + 0.005387f, // FreqL[eb] + 0.024704f, // FreqL[ec] + 0.006963f, // FreqL[ed] + 0.002625f, // FreqL[ee] + 0.009512f, // FreqL[ef] + 0.002971f, // FreqL[f0] + 0.008233f, // FreqL[f1] + 0.010000f, // FreqL[f2] + 0.011973f, // FreqL[f3] + 0.010553f, // FreqL[f4] + 0.005945f, // FreqL[f5] + 0.006349f, // FreqL[f6] + 0.009401f, // FreqL[f7] + 0.008577f, // FreqL[f8] + 0.008186f, // FreqL[f9] + 0.008159f, // FreqL[fa] + 0.005033f, // FreqL[fb] + 0.008714f, // FreqL[fc] + 0.010614f, // FreqL[fd] + 0.006554f // FreqL[fe] + }, +0.009909f, // Trail Byte StdDev +0.010638f, // Trail Byte Mean +0.324739f // Trial Byte Weight +}; diff --git a/intl/chardet/src/EUCJPStatistics.h b/intl/chardet/src/EUCJPStatistics.h new file mode 100644 index 00000000000..fd92dd2b96f --- /dev/null +++ b/intl/chardet/src/EUCJPStatistics.h @@ -0,0 +1,221 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- + * + * The contents of this file are subject to the Netscape Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/NPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + * The Original Code is mozilla.org code. + * + * The Initial Developer of the Original Code is Netscape + * Communications Corporation. Portions created by Netscape are + * Copyright (C) 1998 Netscape Communications Corporation. All + * Rights Reserved. + * + * Contributor(s): + */ +{ + { + 0.364808f, // FreqH[a1] + 0.000000f, // FreqH[a2] + 0.000000f, // FreqH[a3] + 0.145325f, // FreqH[a4] + 0.304891f, // FreqH[a5] + 0.000000f, // FreqH[a6] + 0.000000f, // FreqH[a7] + 0.000000f, // FreqH[a8] + 0.000000f, // FreqH[a9] + 0.000000f, // FreqH[aa] + 0.000000f, // FreqH[ab] + 0.000000f, // FreqH[ac] + 0.000000f, // FreqH[ad] + 0.000000f, // FreqH[ae] + 0.000000f, // FreqH[af] + 0.001835f, // FreqH[b0] + 0.010771f, // FreqH[b1] + 0.006462f, // FreqH[b2] + 0.001157f, // FreqH[b3] + 0.002114f, // FreqH[b4] + 0.003231f, // FreqH[b5] + 0.001356f, // FreqH[b6] + 0.007420f, // FreqH[b7] + 0.004189f, // FreqH[b8] + 0.003231f, // FreqH[b9] + 0.003032f, // FreqH[ba] + 0.033190f, // FreqH[bb] + 0.006303f, // FreqH[bc] + 0.006064f, // FreqH[bd] + 0.009973f, // FreqH[be] + 0.002354f, // FreqH[bf] + 0.003670f, // FreqH[c0] + 0.009135f, // FreqH[c1] + 0.001675f, // FreqH[c2] + 0.002792f, // FreqH[c3] + 0.002194f, // FreqH[c4] + 0.014720f, // FreqH[c5] + 0.011928f, // FreqH[c6] + 0.000878f, // FreqH[c7] + 0.013124f, // FreqH[c8] + 0.001077f, // FreqH[c9] + 0.009295f, // FreqH[ca] + 0.003471f, // FreqH[cb] + 0.002872f, // FreqH[cc] + 0.002433f, // FreqH[cd] + 0.000957f, // FreqH[ce] + 0.001636f, // FreqH[cf] + 0.000000f, // FreqH[d0] + 0.000000f, // FreqH[d1] + 0.000000f, // FreqH[d2] + 0.000000f, // FreqH[d3] + 0.000000f, // FreqH[d4] + 0.000000f, // FreqH[d5] + 0.000000f, // FreqH[d6] + 0.000000f, // FreqH[d7] + 0.000000f, // FreqH[d8] + 0.000000f, // FreqH[d9] + 0.000000f, // FreqH[da] + 0.000000f, // FreqH[db] + 0.000000f, // FreqH[dc] + 0.000000f, // FreqH[dd] + 0.000080f, // FreqH[de] + 0.000279f, // FreqH[df] + 0.000000f, // FreqH[e0] + 0.000000f, // FreqH[e1] + 0.000000f, // FreqH[e2] + 0.000000f, // FreqH[e3] + 0.000000f, // FreqH[e4] + 0.000000f, // FreqH[e5] + 0.000000f, // FreqH[e6] + 0.000000f, // FreqH[e7] + 0.000000f, // FreqH[e8] + 0.000000f, // FreqH[e9] + 0.000000f, // FreqH[ea] + 0.000000f, // FreqH[eb] + 0.000000f, // FreqH[ec] + 0.000000f, // FreqH[ed] + 0.000000f, // FreqH[ee] + 0.000000f, // FreqH[ef] + 0.000000f, // FreqH[f0] + 0.000000f, // FreqH[f1] + 0.000000f, // FreqH[f2] + 0.000000f, // FreqH[f3] + 0.000000f, // FreqH[f4] + 0.000000f, // FreqH[f5] + 0.000000f, // FreqH[f6] + 0.000000f, // FreqH[f7] + 0.000000f, // FreqH[f8] + 0.000000f, // FreqH[f9] + 0.000000f, // FreqH[fa] + 0.000000f, // FreqH[fb] + 0.000000f, // FreqH[fc] + 0.000080f, // FreqH[fd] + 0.000000f // FreqH[fe] + }, +0.050407f, // Lead Byte StdDev +0.010638f, // Lead Byte Mean +0.640871f, // Lead Byte Weight + { + 0.002473f, // FreqL[a1] + 0.039134f, // FreqL[a2] + 0.152745f, // FreqL[a3] + 0.009694f, // FreqL[a4] + 0.000359f, // FreqL[a5] + 0.022180f, // FreqL[a6] + 0.000758f, // FreqL[a7] + 0.004308f, // FreqL[a8] + 0.000160f, // FreqL[a9] + 0.002513f, // FreqL[aa] + 0.003072f, // FreqL[ab] + 0.001316f, // FreqL[ac] + 0.003830f, // FreqL[ad] + 0.001037f, // FreqL[ae] + 0.003590f, // FreqL[af] + 0.000957f, // FreqL[b0] + 0.000160f, // FreqL[b1] + 0.000239f, // FreqL[b2] + 0.006462f, // FreqL[b3] + 0.001596f, // FreqL[b4] + 0.031554f, // FreqL[b5] + 0.001316f, // FreqL[b6] + 0.002194f, // FreqL[b7] + 0.016555f, // FreqL[b8] + 0.003271f, // FreqL[b9] + 0.000678f, // FreqL[ba] + 0.000598f, // FreqL[bb] + 0.206438f, // FreqL[bc] + 0.000718f, // FreqL[bd] + 0.001077f, // FreqL[be] + 0.003710f, // FreqL[bf] + 0.001356f, // FreqL[c0] + 0.001356f, // FreqL[c1] + 0.000439f, // FreqL[c2] + 0.004388f, // FreqL[c3] + 0.005704f, // FreqL[c4] + 0.000878f, // FreqL[c5] + 0.010172f, // FreqL[c6] + 0.007061f, // FreqL[c7] + 0.014680f, // FreqL[c8] + 0.000638f, // FreqL[c9] + 0.025730f, // FreqL[ca] + 0.002792f, // FreqL[cb] + 0.000718f, // FreqL[cc] + 0.001795f, // FreqL[cd] + 0.091551f, // FreqL[ce] + 0.000758f, // FreqL[cf] + 0.003909f, // FreqL[d0] + 0.000558f, // FreqL[d1] + 0.031195f, // FreqL[d2] + 0.007061f, // FreqL[d3] + 0.001316f, // FreqL[d4] + 0.022579f, // FreqL[d5] + 0.006981f, // FreqL[d6] + 0.007260f, // FreqL[d7] + 0.001117f, // FreqL[d8] + 0.000239f, // FreqL[d9] + 0.012127f, // FreqL[da] + 0.000878f, // FreqL[db] + 0.003790f, // FreqL[dc] + 0.001077f, // FreqL[dd] + 0.000758f, // FreqL[de] + 0.002114f, // FreqL[df] + 0.002234f, // FreqL[e0] + 0.000678f, // FreqL[e1] + 0.002992f, // FreqL[e2] + 0.003311f, // FreqL[e3] + 0.023416f, // FreqL[e4] + 0.001237f, // FreqL[e5] + 0.002753f, // FreqL[e6] + 0.005146f, // FreqL[e7] + 0.002194f, // FreqL[e8] + 0.007021f, // FreqL[e9] + 0.008497f, // FreqL[ea] + 0.013763f, // FreqL[eb] + 0.011768f, // FreqL[ec] + 0.006303f, // FreqL[ed] + 0.001915f, // FreqL[ee] + 0.000638f, // FreqL[ef] + 0.008776f, // FreqL[f0] + 0.000918f, // FreqL[f1] + 0.003431f, // FreqL[f2] + 0.057603f, // FreqL[f3] + 0.000439f, // FreqL[f4] + 0.000439f, // FreqL[f5] + 0.000758f, // FreqL[f6] + 0.002872f, // FreqL[f7] + 0.001675f, // FreqL[f8] + 0.011050f, // FreqL[f9] + 0.000000f, // FreqL[fa] + 0.000279f, // FreqL[fb] + 0.012127f, // FreqL[fc] + 0.000718f, // FreqL[fd] + 0.007380f // FreqL[fe] + }, +0.028247f, // Trail Byte StdDev +0.010638f, // Trail Byte Mean +0.359129f // Trial Byte Weight +}; diff --git a/intl/chardet/src/EUCKRStatistics.h b/intl/chardet/src/EUCKRStatistics.h new file mode 100644 index 00000000000..a1d961309f4 --- /dev/null +++ b/intl/chardet/src/EUCKRStatistics.h @@ -0,0 +1,221 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- + * + * The contents of this file are subject to the Netscape Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/NPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + * The Original Code is mozilla.org code. + * + * The Initial Developer of the Original Code is Netscape + * Communications Corporation. Portions created by Netscape are + * Copyright (C) 1998 Netscape Communications Corporation. All + * Rights Reserved. + * + * Contributor(s): + */ +{ + { + 0.000000f, // FreqH[a1] + 0.000000f, // FreqH[a2] + 0.000000f, // FreqH[a3] + 0.000000f, // FreqH[a4] + 0.000000f, // FreqH[a5] + 0.000000f, // FreqH[a6] + 0.000000f, // FreqH[a7] + 0.000412f, // FreqH[a8] + 0.000000f, // FreqH[a9] + 0.000000f, // FreqH[aa] + 0.000000f, // FreqH[ab] + 0.000000f, // FreqH[ac] + 0.000000f, // FreqH[ad] + 0.000000f, // FreqH[ae] + 0.000000f, // FreqH[af] + 0.057502f, // FreqH[b0] + 0.033182f, // FreqH[b1] + 0.002267f, // FreqH[b2] + 0.016076f, // FreqH[b3] + 0.014633f, // FreqH[b4] + 0.032976f, // FreqH[b5] + 0.004122f, // FreqH[b6] + 0.011336f, // FreqH[b7] + 0.058533f, // FreqH[b8] + 0.024526f, // FreqH[b9] + 0.025969f, // FreqH[ba] + 0.054411f, // FreqH[bb] + 0.019580f, // FreqH[bc] + 0.063273f, // FreqH[bd] + 0.113974f, // FreqH[be] + 0.029885f, // FreqH[bf] + 0.150041f, // FreqH[c0] + 0.059151f, // FreqH[c1] + 0.002679f, // FreqH[c2] + 0.009893f, // FreqH[c3] + 0.014839f, // FreqH[c4] + 0.026381f, // FreqH[c5] + 0.015045f, // FreqH[c6] + 0.069456f, // FreqH[c7] + 0.089860f, // FreqH[c8] + 0.000000f, // FreqH[c9] + 0.000000f, // FreqH[ca] + 0.000000f, // FreqH[cb] + 0.000000f, // FreqH[cc] + 0.000000f, // FreqH[cd] + 0.000000f, // FreqH[ce] + 0.000000f, // FreqH[cf] + 0.000000f, // FreqH[d0] + 0.000000f, // FreqH[d1] + 0.000000f, // FreqH[d2] + 0.000000f, // FreqH[d3] + 0.000000f, // FreqH[d4] + 0.000000f, // FreqH[d5] + 0.000000f, // FreqH[d6] + 0.000000f, // FreqH[d7] + 0.000000f, // FreqH[d8] + 0.000000f, // FreqH[d9] + 0.000000f, // FreqH[da] + 0.000000f, // FreqH[db] + 0.000000f, // FreqH[dc] + 0.000000f, // FreqH[dd] + 0.000000f, // FreqH[de] + 0.000000f, // FreqH[df] + 0.000000f, // FreqH[e0] + 0.000000f, // FreqH[e1] + 0.000000f, // FreqH[e2] + 0.000000f, // FreqH[e3] + 0.000000f, // FreqH[e4] + 0.000000f, // FreqH[e5] + 0.000000f, // FreqH[e6] + 0.000000f, // FreqH[e7] + 0.000000f, // FreqH[e8] + 0.000000f, // FreqH[e9] + 0.000000f, // FreqH[ea] + 0.000000f, // FreqH[eb] + 0.000000f, // FreqH[ec] + 0.000000f, // FreqH[ed] + 0.000000f, // FreqH[ee] + 0.000000f, // FreqH[ef] + 0.000000f, // FreqH[f0] + 0.000000f, // FreqH[f1] + 0.000000f, // FreqH[f2] + 0.000000f, // FreqH[f3] + 0.000000f, // FreqH[f4] + 0.000000f, // FreqH[f5] + 0.000000f, // FreqH[f6] + 0.000000f, // FreqH[f7] + 0.000000f, // FreqH[f8] + 0.000000f, // FreqH[f9] + 0.000000f, // FreqH[fa] + 0.000000f, // FreqH[fb] + 0.000000f, // FreqH[fc] + 0.000000f, // FreqH[fd] + 0.000000f // FreqH[fe] + }, +0.025593f, // Lead Byte StdDev +0.010638f, // Lead Byte Mean +0.647437f, // Lead Byte Weight + { + 0.016694f, // FreqL[a1] + 0.000000f, // FreqL[a2] + 0.012778f, // FreqL[a3] + 0.030091f, // FreqL[a4] + 0.002679f, // FreqL[a5] + 0.006595f, // FreqL[a6] + 0.001855f, // FreqL[a7] + 0.000824f, // FreqL[a8] + 0.005977f, // FreqL[a9] + 0.004740f, // FreqL[aa] + 0.003092f, // FreqL[ab] + 0.000824f, // FreqL[ac] + 0.019580f, // FreqL[ad] + 0.037304f, // FreqL[ae] + 0.008244f, // FreqL[af] + 0.014633f, // FreqL[b0] + 0.001031f, // FreqL[b1] + 0.000000f, // FreqL[b2] + 0.003298f, // FreqL[b3] + 0.002061f, // FreqL[b4] + 0.006183f, // FreqL[b5] + 0.005977f, // FreqL[b6] + 0.000824f, // FreqL[b7] + 0.021847f, // FreqL[b8] + 0.014839f, // FreqL[b9] + 0.052968f, // FreqL[ba] + 0.017312f, // FreqL[bb] + 0.007626f, // FreqL[bc] + 0.000412f, // FreqL[bd] + 0.000824f, // FreqL[be] + 0.011129f, // FreqL[bf] + 0.000000f, // FreqL[c0] + 0.000412f, // FreqL[c1] + 0.001649f, // FreqL[c2] + 0.005977f, // FreqL[c3] + 0.065746f, // FreqL[c4] + 0.020198f, // FreqL[c5] + 0.021434f, // FreqL[c6] + 0.014633f, // FreqL[c7] + 0.004122f, // FreqL[c8] + 0.001649f, // FreqL[c9] + 0.000824f, // FreqL[ca] + 0.000824f, // FreqL[cb] + 0.051937f, // FreqL[cc] + 0.019580f, // FreqL[cd] + 0.023289f, // FreqL[ce] + 0.026381f, // FreqL[cf] + 0.040396f, // FreqL[d0] + 0.009068f, // FreqL[d1] + 0.001443f, // FreqL[d2] + 0.003710f, // FreqL[d3] + 0.007420f, // FreqL[d4] + 0.001443f, // FreqL[d5] + 0.013190f, // FreqL[d6] + 0.002885f, // FreqL[d7] + 0.000412f, // FreqL[d8] + 0.003298f, // FreqL[d9] + 0.025969f, // FreqL[da] + 0.000412f, // FreqL[db] + 0.000412f, // FreqL[dc] + 0.006183f, // FreqL[dd] + 0.003298f, // FreqL[de] + 0.066983f, // FreqL[df] + 0.002679f, // FreqL[e0] + 0.002267f, // FreqL[e1] + 0.011129f, // FreqL[e2] + 0.000412f, // FreqL[e3] + 0.010099f, // FreqL[e4] + 0.015251f, // FreqL[e5] + 0.007626f, // FreqL[e6] + 0.043899f, // FreqL[e7] + 0.003710f, // FreqL[e8] + 0.002679f, // FreqL[e9] + 0.001443f, // FreqL[ea] + 0.010923f, // FreqL[eb] + 0.002885f, // FreqL[ec] + 0.009068f, // FreqL[ed] + 0.019992f, // FreqL[ee] + 0.000412f, // FreqL[ef] + 0.008450f, // FreqL[f0] + 0.005153f, // FreqL[f1] + 0.000000f, // FreqL[f2] + 0.010099f, // FreqL[f3] + 0.000000f, // FreqL[f4] + 0.001649f, // FreqL[f5] + 0.012160f, // FreqL[f6] + 0.011542f, // FreqL[f7] + 0.006595f, // FreqL[f8] + 0.001855f, // FreqL[f9] + 0.010923f, // FreqL[fa] + 0.000412f, // FreqL[fb] + 0.023702f, // FreqL[fc] + 0.003710f, // FreqL[fd] + 0.001855f // FreqL[fe] + }, +0.013937f, // Trail Byte StdDev +0.010638f, // Trail Byte Mean +0.352563f // Trial Byte Weight +}; diff --git a/intl/chardet/src/EUCTWStatistics.h b/intl/chardet/src/EUCTWStatistics.h new file mode 100644 index 00000000000..52ba88a2a55 --- /dev/null +++ b/intl/chardet/src/EUCTWStatistics.h @@ -0,0 +1,221 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- + * + * The contents of this file are subject to the Netscape Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/NPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + * The Original Code is mozilla.org code. + * + * The Initial Developer of the Original Code is Netscape + * Communications Corporation. Portions created by Netscape are + * Copyright (C) 1998 Netscape Communications Corporation. All + * Rights Reserved. + * + * Contributor(s): + */ +{ + { + 0.000000f, // FreqH[a1] + 0.000000f, // FreqH[a2] + 0.000000f, // FreqH[a3] + 0.000000f, // FreqH[a4] + 0.000000f, // FreqH[a5] + 0.000000f, // FreqH[a6] + 0.000000f, // FreqH[a7] + 0.000000f, // FreqH[a8] + 0.000000f, // FreqH[a9] + 0.000000f, // FreqH[aa] + 0.000000f, // FreqH[ab] + 0.000000f, // FreqH[ac] + 0.000000f, // FreqH[ad] + 0.000000f, // FreqH[ae] + 0.000000f, // FreqH[af] + 0.000000f, // FreqH[b0] + 0.000000f, // FreqH[b1] + 0.000000f, // FreqH[b2] + 0.000000f, // FreqH[b3] + 0.000000f, // FreqH[b4] + 0.000000f, // FreqH[b5] + 0.000000f, // FreqH[b6] + 0.000000f, // FreqH[b7] + 0.000000f, // FreqH[b8] + 0.000000f, // FreqH[b9] + 0.000000f, // FreqH[ba] + 0.000000f, // FreqH[bb] + 0.000000f, // FreqH[bc] + 0.000000f, // FreqH[bd] + 0.000000f, // FreqH[be] + 0.000000f, // FreqH[bf] + 0.000000f, // FreqH[c0] + 0.000000f, // FreqH[c1] + 0.000000f, // FreqH[c2] + 0.000000f, // FreqH[c3] + 0.119286f, // FreqH[c4] + 0.052233f, // FreqH[c5] + 0.044126f, // FreqH[c6] + 0.052494f, // FreqH[c7] + 0.045906f, // FreqH[c8] + 0.019038f, // FreqH[c9] + 0.032465f, // FreqH[ca] + 0.026252f, // FreqH[cb] + 0.025502f, // FreqH[cc] + 0.015963f, // FreqH[cd] + 0.052493f, // FreqH[ce] + 0.019256f, // FreqH[cf] + 0.015137f, // FreqH[d0] + 0.031782f, // FreqH[d1] + 0.017370f, // FreqH[d2] + 0.018494f, // FreqH[d3] + 0.015575f, // FreqH[d4] + 0.016621f, // FreqH[d5] + 0.007444f, // FreqH[d6] + 0.011642f, // FreqH[d7] + 0.013916f, // FreqH[d8] + 0.019159f, // FreqH[d9] + 0.016445f, // FreqH[da] + 0.007851f, // FreqH[db] + 0.011079f, // FreqH[dc] + 0.022842f, // FreqH[dd] + 0.015513f, // FreqH[de] + 0.010033f, // FreqH[df] + 0.009950f, // FreqH[e0] + 0.010347f, // FreqH[e1] + 0.013103f, // FreqH[e2] + 0.015371f, // FreqH[e3] + 0.012502f, // FreqH[e4] + 0.007436f, // FreqH[e5] + 0.018253f, // FreqH[e6] + 0.014134f, // FreqH[e7] + 0.008907f, // FreqH[e8] + 0.005411f, // FreqH[e9] + 0.009570f, // FreqH[ea] + 0.013598f, // FreqH[eb] + 0.006092f, // FreqH[ec] + 0.007409f, // FreqH[ed] + 0.008432f, // FreqH[ee] + 0.005816f, // FreqH[ef] + 0.009349f, // FreqH[f0] + 0.005472f, // FreqH[f1] + 0.007170f, // FreqH[f2] + 0.007420f, // FreqH[f3] + 0.003681f, // FreqH[f4] + 0.007523f, // FreqH[f5] + 0.004610f, // FreqH[f6] + 0.006154f, // FreqH[f7] + 0.003348f, // FreqH[f8] + 0.005074f, // FreqH[f9] + 0.005922f, // FreqH[fa] + 0.005254f, // FreqH[fb] + 0.004682f, // FreqH[fc] + 0.002093f, // FreqH[fd] + 0.000000f // FreqH[fe] + }, +0.016681f, // Lead Byte StdDev +0.010638f, // Lead Byte Mean +0.715599f, // Lead Byte Weight + { + 0.028933f, // FreqL[a1] + 0.011371f, // FreqL[a2] + 0.011053f, // FreqL[a3] + 0.007232f, // FreqL[a4] + 0.010192f, // FreqL[a5] + 0.004093f, // FreqL[a6] + 0.015043f, // FreqL[a7] + 0.011752f, // FreqL[a8] + 0.022387f, // FreqL[a9] + 0.008410f, // FreqL[aa] + 0.012448f, // FreqL[ab] + 0.007473f, // FreqL[ac] + 0.003594f, // FreqL[ad] + 0.007139f, // FreqL[ae] + 0.018912f, // FreqL[af] + 0.006083f, // FreqL[b0] + 0.003302f, // FreqL[b1] + 0.010215f, // FreqL[b2] + 0.008791f, // FreqL[b3] + 0.024236f, // FreqL[b4] + 0.014107f, // FreqL[b5] + 0.014108f, // FreqL[b6] + 0.010303f, // FreqL[b7] + 0.009728f, // FreqL[b8] + 0.007877f, // FreqL[b9] + 0.009719f, // FreqL[ba] + 0.007952f, // FreqL[bb] + 0.021028f, // FreqL[bc] + 0.005764f, // FreqL[bd] + 0.009341f, // FreqL[be] + 0.006591f, // FreqL[bf] + 0.012517f, // FreqL[c0] + 0.005921f, // FreqL[c1] + 0.008982f, // FreqL[c2] + 0.008771f, // FreqL[c3] + 0.012802f, // FreqL[c4] + 0.005926f, // FreqL[c5] + 0.008342f, // FreqL[c6] + 0.003086f, // FreqL[c7] + 0.006843f, // FreqL[c8] + 0.007576f, // FreqL[c9] + 0.004734f, // FreqL[ca] + 0.016404f, // FreqL[cb] + 0.008803f, // FreqL[cc] + 0.008071f, // FreqL[cd] + 0.005349f, // FreqL[ce] + 0.008566f, // FreqL[cf] + 0.010840f, // FreqL[d0] + 0.015401f, // FreqL[d1] + 0.031904f, // FreqL[d2] + 0.008670f, // FreqL[d3] + 0.011479f, // FreqL[d4] + 0.010936f, // FreqL[d5] + 0.007617f, // FreqL[d6] + 0.008995f, // FreqL[d7] + 0.008114f, // FreqL[d8] + 0.008658f, // FreqL[d9] + 0.005934f, // FreqL[da] + 0.010452f, // FreqL[db] + 0.009142f, // FreqL[dc] + 0.004519f, // FreqL[dd] + 0.008339f, // FreqL[de] + 0.007476f, // FreqL[df] + 0.007027f, // FreqL[e0] + 0.006025f, // FreqL[e1] + 0.021804f, // FreqL[e2] + 0.024248f, // FreqL[e3] + 0.015895f, // FreqL[e4] + 0.003768f, // FreqL[e5] + 0.010171f, // FreqL[e6] + 0.010007f, // FreqL[e7] + 0.010178f, // FreqL[e8] + 0.008316f, // FreqL[e9] + 0.006832f, // FreqL[ea] + 0.006364f, // FreqL[eb] + 0.009141f, // FreqL[ec] + 0.009148f, // FreqL[ed] + 0.012081f, // FreqL[ee] + 0.011914f, // FreqL[ef] + 0.004464f, // FreqL[f0] + 0.014257f, // FreqL[f1] + 0.006907f, // FreqL[f2] + 0.011292f, // FreqL[f3] + 0.018622f, // FreqL[f4] + 0.008149f, // FreqL[f5] + 0.004636f, // FreqL[f6] + 0.006612f, // FreqL[f7] + 0.013478f, // FreqL[f8] + 0.012614f, // FreqL[f9] + 0.005186f, // FreqL[fa] + 0.048285f, // FreqL[fb] + 0.006816f, // FreqL[fc] + 0.006743f, // FreqL[fd] + 0.008671f // FreqL[fe] + }, +0.006630f, // Trail Byte StdDev +0.010638f, // Trail Byte Mean +0.284401f // Trial Byte Weight +}; diff --git a/intl/chardet/src/GB2312Statistics.h b/intl/chardet/src/GB2312Statistics.h new file mode 100644 index 00000000000..9da162ec316 --- /dev/null +++ b/intl/chardet/src/GB2312Statistics.h @@ -0,0 +1,221 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- + * + * The contents of this file are subject to the Netscape Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/NPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + * The Original Code is mozilla.org code. + * + * The Initial Developer of the Original Code is Netscape + * Communications Corporation. Portions created by Netscape are + * Copyright (C) 1998 Netscape Communications Corporation. All + * Rights Reserved. + * + * Contributor(s): + */ +{ + { + 0.011628f, // FreqH[a1] + 0.000000f, // FreqH[a2] + 0.000000f, // FreqH[a3] + 0.000000f, // FreqH[a4] + 0.000000f, // FreqH[a5] + 0.000000f, // FreqH[a6] + 0.000000f, // FreqH[a7] + 0.000000f, // FreqH[a8] + 0.000000f, // FreqH[a9] + 0.000000f, // FreqH[aa] + 0.000000f, // FreqH[ab] + 0.000000f, // FreqH[ac] + 0.000000f, // FreqH[ad] + 0.000000f, // FreqH[ae] + 0.000000f, // FreqH[af] + 0.011628f, // FreqH[b0] + 0.012403f, // FreqH[b1] + 0.009302f, // FreqH[b2] + 0.003876f, // FreqH[b3] + 0.017829f, // FreqH[b4] + 0.037209f, // FreqH[b5] + 0.008527f, // FreqH[b6] + 0.010078f, // FreqH[b7] + 0.019380f, // FreqH[b8] + 0.054264f, // FreqH[b9] + 0.010078f, // FreqH[ba] + 0.041085f, // FreqH[bb] + 0.020930f, // FreqH[bc] + 0.018605f, // FreqH[bd] + 0.010078f, // FreqH[be] + 0.013178f, // FreqH[bf] + 0.016279f, // FreqH[c0] + 0.006202f, // FreqH[c1] + 0.009302f, // FreqH[c2] + 0.017054f, // FreqH[c3] + 0.011628f, // FreqH[c4] + 0.008527f, // FreqH[c5] + 0.004651f, // FreqH[c6] + 0.006202f, // FreqH[c7] + 0.017829f, // FreqH[c8] + 0.024806f, // FreqH[c9] + 0.020155f, // FreqH[ca] + 0.013953f, // FreqH[cb] + 0.032558f, // FreqH[cc] + 0.035659f, // FreqH[cd] + 0.068217f, // FreqH[ce] + 0.010853f, // FreqH[cf] + 0.036434f, // FreqH[d0] + 0.117054f, // FreqH[d1] + 0.027907f, // FreqH[d2] + 0.100775f, // FreqH[d3] + 0.010078f, // FreqH[d4] + 0.017829f, // FreqH[d5] + 0.062016f, // FreqH[d6] + 0.012403f, // FreqH[d7] + 0.000000f, // FreqH[d8] + 0.000000f, // FreqH[d9] + 0.000000f, // FreqH[da] + 0.000000f, // FreqH[db] + 0.000000f, // FreqH[dc] + 0.000000f, // FreqH[dd] + 0.000000f, // FreqH[de] + 0.000000f, // FreqH[df] + 0.000000f, // FreqH[e0] + 0.000000f, // FreqH[e1] + 0.000000f, // FreqH[e2] + 0.000000f, // FreqH[e3] + 0.000000f, // FreqH[e4] + 0.000000f, // FreqH[e5] + 0.000000f, // FreqH[e6] + 0.000000f, // FreqH[e7] + 0.000000f, // FreqH[e8] + 0.000000f, // FreqH[e9] + 0.001550f, // FreqH[ea] + 0.000000f, // FreqH[eb] + 0.000000f, // FreqH[ec] + 0.000000f, // FreqH[ed] + 0.000000f, // FreqH[ee] + 0.000000f, // FreqH[ef] + 0.000000f, // FreqH[f0] + 0.000000f, // FreqH[f1] + 0.000000f, // FreqH[f2] + 0.000000f, // FreqH[f3] + 0.000000f, // FreqH[f4] + 0.000000f, // FreqH[f5] + 0.000000f, // FreqH[f6] + 0.000000f, // FreqH[f7] + 0.000000f, // FreqH[f8] + 0.000000f, // FreqH[f9] + 0.000000f, // FreqH[fa] + 0.000000f, // FreqH[fb] + 0.000000f, // FreqH[fc] + 0.000000f, // FreqH[fd] + 0.000000f // FreqH[fe] + }, +0.020081f, // Lead Byte StdDev +0.010638f, // Lead Byte Mean +0.586533f, // Lead Byte Weight + { + 0.006202f, // FreqL[a1] + 0.031008f, // FreqL[a2] + 0.005426f, // FreqL[a3] + 0.003101f, // FreqL[a4] + 0.001550f, // FreqL[a5] + 0.003101f, // FreqL[a6] + 0.082171f, // FreqL[a7] + 0.014729f, // FreqL[a8] + 0.006977f, // FreqL[a9] + 0.001550f, // FreqL[aa] + 0.013953f, // FreqL[ab] + 0.000000f, // FreqL[ac] + 0.013953f, // FreqL[ad] + 0.010078f, // FreqL[ae] + 0.008527f, // FreqL[af] + 0.006977f, // FreqL[b0] + 0.004651f, // FreqL[b1] + 0.003101f, // FreqL[b2] + 0.003101f, // FreqL[b3] + 0.003101f, // FreqL[b4] + 0.008527f, // FreqL[b5] + 0.003101f, // FreqL[b6] + 0.005426f, // FreqL[b7] + 0.005426f, // FreqL[b8] + 0.005426f, // FreqL[b9] + 0.003101f, // FreqL[ba] + 0.001550f, // FreqL[bb] + 0.006202f, // FreqL[bc] + 0.014729f, // FreqL[bd] + 0.010853f, // FreqL[be] + 0.000000f, // FreqL[bf] + 0.011628f, // FreqL[c0] + 0.000000f, // FreqL[c1] + 0.031783f, // FreqL[c2] + 0.013953f, // FreqL[c3] + 0.030233f, // FreqL[c4] + 0.039535f, // FreqL[c5] + 0.008527f, // FreqL[c6] + 0.015504f, // FreqL[c7] + 0.000000f, // FreqL[c8] + 0.003101f, // FreqL[c9] + 0.008527f, // FreqL[ca] + 0.016279f, // FreqL[cb] + 0.005426f, // FreqL[cc] + 0.001550f, // FreqL[cd] + 0.013953f, // FreqL[ce] + 0.013953f, // FreqL[cf] + 0.044961f, // FreqL[d0] + 0.003101f, // FreqL[d1] + 0.004651f, // FreqL[d2] + 0.006977f, // FreqL[d3] + 0.001550f, // FreqL[d4] + 0.005426f, // FreqL[d5] + 0.012403f, // FreqL[d6] + 0.001550f, // FreqL[d7] + 0.015504f, // FreqL[d8] + 0.000000f, // FreqL[d9] + 0.006202f, // FreqL[da] + 0.001550f, // FreqL[db] + 0.000000f, // FreqL[dc] + 0.007752f, // FreqL[dd] + 0.006977f, // FreqL[de] + 0.001550f, // FreqL[df] + 0.009302f, // FreqL[e0] + 0.011628f, // FreqL[e1] + 0.004651f, // FreqL[e2] + 0.010853f, // FreqL[e3] + 0.012403f, // FreqL[e4] + 0.017829f, // FreqL[e5] + 0.005426f, // FreqL[e6] + 0.024806f, // FreqL[e7] + 0.000000f, // FreqL[e8] + 0.006202f, // FreqL[e9] + 0.000000f, // FreqL[ea] + 0.082171f, // FreqL[eb] + 0.015504f, // FreqL[ec] + 0.004651f, // FreqL[ed] + 0.000000f, // FreqL[ee] + 0.006977f, // FreqL[ef] + 0.004651f, // FreqL[f0] + 0.000000f, // FreqL[f1] + 0.008527f, // FreqL[f2] + 0.012403f, // FreqL[f3] + 0.004651f, // FreqL[f4] + 0.003876f, // FreqL[f5] + 0.003101f, // FreqL[f6] + 0.022481f, // FreqL[f7] + 0.024031f, // FreqL[f8] + 0.001550f, // FreqL[f9] + 0.047287f, // FreqL[fa] + 0.009302f, // FreqL[fb] + 0.001550f, // FreqL[fc] + 0.005426f, // FreqL[fd] + 0.017054f // FreqL[fe] + }, +0.014156f, // Trail Byte StdDev +0.010638f, // Trail Byte Mean +0.413467f // Trial Byte Weight +}; diff --git a/intl/chardet/src/nsPSMDetectors.cpp b/intl/chardet/src/nsPSMDetectors.cpp index 2017f8e050a..84eeb499416 100644 --- a/intl/chardet/src/nsPSMDetectors.cpp +++ b/intl/chardet/src/nsPSMDetectors.cpp @@ -22,6 +22,7 @@ */ +#include #include "nsVerifier.h" //---- for verifiers #include "nsSJISVerifier.h" @@ -95,7 +96,140 @@ NS_DEFINE_CID(kCJKStringPSMDetectorCID, NS_CJK_STRING_PSMDETECTOR_CID); #define DETECTOR_DEBUG +typedef struct { + float mFirstByteFreq[94]; + float mFirstByteStdDev; + float mFirstByteMean; + float mFirstByteWeight; + float mSecoundByteFreq[94]; + float mSecoundByteStdDev; + float mSecoundByteMean; + float mSecoundByteWeight; +} nsEUCStatistics; +static nsEUCStatistics gBig5Statistics = +#include "Big5Statistics.h" +// end of UECTWStatistics.h include + +static nsEUCStatistics gEUCTWStatistics = +#include "EUCTWStatistics.h" +// end of UECTWStatistics.h include + +static nsEUCStatistics gGB2312Statistics = +#include "GB2312Statistics.h" +// end of GB2312Statistics.h include + +static nsEUCStatistics gEUCJPStatistics = +#include "EUCJPStatistics.h" +// end of EUCJPStatistics.h include + +static nsEUCStatistics gEUCKRStatistics = +#include "EUCKRStatistics.h" +// end of EUCKRStatistics.h include + +class nsEUCSampler { + public: + nsEUCSampler() { + mTotal =0; + mThreshold = 2000; + mState = 0; + PRInt32 i; + for(i=0;i<94;i++) + mFirstByteCnt[i] = mSecondByteCnt[i]=0; + } + PRBool EnoughData() { return mTotal > mThreshold; } + PRBool GetSomeData() { return mTotal > 1; } + PRBool Sample(const char* aIn, PRUint32 aLen); + void CalFreq(); + float GetScore(const float* aFirstByteFreq, float aFirstByteWeight, + const float* aSecondByteFreq, float aSecondByteWeight); + float GetScore(const float* array1, const float* array2); + private: + PRUint32 mTotal; + PRUint32 mThreshold; + PRInt8 mState; + PRUint32 mFirstByteCnt[94]; + PRUint32 mSecondByteCnt[94]; + float mFirstByteFreq[94]; + float mSecondByteFreq[94]; + +}; +PRBool nsEUCSampler::Sample(const char* aIn, PRUint32 aLen) +{ + if(mState == 1) + return PR_FALSE; + const unsigned char* p = (const unsigned char*) aIn; + if(aLen + mTotal > 0x80000000) + aLen = 0x80000000 - mTotal; + + PRUint32 i; + for(i=0; (i *p)) { + mState = 1; + } else { + mTotal++; + mFirstByteCnt[*p - 0x00a1]++; + mState = 2; + } + } + break; + case 1: + break; + case 2: + if( *p & 0x0080) + { + if((0x00ff == *p) || ( 0x00a1 > *p)) { + mState = 1; + } else { + mTotal++; + mSecondByteCnt[*p - 0x00a1]++; + mState = 0; + } + } else { + mState = 1; + } + break; + default: + mState = 1; + } + } + return (1 != mState ); +} +float nsEUCSampler::GetScore(const float* aFirstByteFreq, float aFirstByteWeight, + const float* aSecondByteFreq, float aSecondByteWeight) +{ + return GetScore(aFirstByteFreq, mFirstByteFreq) ; +/* + return aFirstByteWeight * GetScore(aFirstByteFreq, mFirstByteFreq) + + aSecondByteWeight * GetScore(aSecondByteFreq, mSecondByteFreq); +*/ +} + +float nsEUCSampler::GetScore(const float* array1, const float* array2) +{ + float s; + float sum=0.0; + PRUint16 i; + for(i=0;i<94;i++) { + s = array1[i] - array2[i]; + sum += s * s; + } + return (float)sqrt((double)sum) / 94.0f; +} + +void nsEUCSampler::CalFreq() +{ + PRUint32 i; + for(i = 0 ; i < 94; i++) { + mFirstByteFreq[i] = (float)mFirstByteCnt[i] / (float)mTotal; + mSecondByteFreq[i] = (float)mSecondByteCnt[i] / (float)mTotal; + } +} /* In the current design, we know the following combination of verifiers are not good- @@ -114,7 +248,7 @@ NS_DEFINE_CID(kCJKStringPSMDetectorCID, NS_CJK_STRING_PSMDETECTOR_CID); #define MAX_VERIFIERS 16 class nsPSMDetector { public : - nsPSMDetector(PRUint8 aItems, nsVerifier** aVerifierSet); + nsPSMDetector(PRUint8 aItems, nsVerifier** aVerifierSet, nsEUCStatistics** aStatisticsSet); virtual ~nsPSMDetector() {}; virtual PRBool HandleData(const char* aBuf, PRUint32 aLen); @@ -127,18 +261,25 @@ protected: PRUint8 mState[MAX_VERIFIERS]; PRUint8 mItemIdx[MAX_VERIFIERS]; nsVerifier** mVerifier; + nsEUCStatistics** mStatisticsData; PRBool mDone; + PRBool mRunSampler; +protected: + void Sample(const char* aBuf, PRUint32 aLen, PRBool aLastChance=PR_FALSE); private: #ifdef DETECTOR_DEBUG PRUint32 mDbgTest; PRUint32 mDbgLen; #endif + nsEUCSampler mSampler; }; //---------------------------------------------------------- -nsPSMDetector::nsPSMDetector(PRUint8 aItems, nsVerifier** aVerifierSet) +nsPSMDetector::nsPSMDetector(PRUint8 aItems, nsVerifier** aVerifierSet, nsEUCStatistics** aStatisticsSet) { + mRunSampler = (nsnull != aStatisticsSet); + mStatisticsData = aStatisticsSet; mDone= PR_FALSE; mItems = aItems; NS_ASSERTION(MAX_VERIFIERS >= aItems , "MAX_VERIFIERS is too small!"); @@ -156,6 +297,8 @@ nsPSMDetector::nsPSMDetector(PRUint8 aItems, nsVerifier** aVerifierSet) //---------------------------------------------------------- void nsPSMDetector::DataEnd() { + if(mRunSampler) + Sample(nsnull, 0, PR_TRUE); } //---------------------------------------------------------- @@ -251,12 +394,76 @@ PRBool nsPSMDetector::HandleData(const char* aBuf, PRUint32 aLen) } } } + if(mRunSampler) + Sample(aBuf, aLen); + #ifdef DETECTOR_DEBUG mDbgLen += aLen; #endif return PR_FALSE; } +void nsPSMDetector::Sample(const char* aBuf, PRUint32 aLen, PRBool aLastChance) +{ + PRInt32 nonUCS2Num=0; + PRInt32 j; + PRInt32 eucNum=0; + for(j = 0; j < mItems; j++) { + if(nsnull != mStatisticsData[mItemIdx[j]]) + eucNum++; + if(((&nsUCS2BEVerifier) != mVerifier[mItemIdx[j]]) && + ((&nsUCS2LEVerifier) != mVerifier[mItemIdx[j]])) { + nonUCS2Num++; + } + } + mRunSampler = (eucNum > 1); + if(mRunSampler) { + mRunSampler = mSampler.Sample(aBuf, aLen); + if(((aLastChance && mSampler.GetSomeData()) || + mSampler.EnoughData()) + && (eucNum == nonUCS2Num)) { + mSampler.CalFreq(); +#ifdef DETECTOR_DEBUG + printf("We cannot figure out charset from the encoding, " + "All EUC based charset share the same encoding structure.\n" + "Detect based on statistics"); + if(aLastChance) { + printf(" after we receive all the data.\n"); + } else { + printf(" after we receive enough data.\n"); + } +#endif + PRInt32 bestIdx; + PRInt32 eucCnt=0; + float bestScore = 0.0f; + for(j = 0; j < mItems; j++) { + if(nsnull != mStatisticsData[mItemIdx[j]]) + { + float score = mSampler.GetScore( + mStatisticsData[mItemIdx[j]]->mFirstByteFreq, + mStatisticsData[mItemIdx[j]]->mFirstByteWeight, + mStatisticsData[mItemIdx[j]]->mSecoundByteFreq, + mStatisticsData[mItemIdx[j]]->mSecoundByteWeight ); +#ifdef DETECTOR_DEBUG + printf("Differences between %s and this data is %2.8f\n", + mVerifier[mItemIdx[j]]->charset, + score); +#endif + if(( 0 == eucCnt++) || (bestScore > score )) { + bestScore = score; + bestIdx = j; + } // if(( 0 == eucCnt++) || (bestScore > score )) + } // if(nsnull != ...) + } // for +#ifdef DETECTOR_DEBUG + printf("Based on the statistic, we decide it is %s", + mVerifier[mItemIdx[bestIdx]]->charset); +#endif + Report( mVerifier[mItemIdx[bestIdx]]->charset); + mDone = PR_TRUE; + } // if (eucNum == nonUCS2Num) + } // if(mRunSampler) +} //========================================================== /* This class won't detect x-euc-tw for now. It can only @@ -277,6 +484,15 @@ static nsVerifier *gZhTwVerifierSet[ZHTW_DETECTOR_NUM_VERIFIERS] = { &nsUCS2BEVerifier, &nsUCS2LEVerifier }; +static nsEUCStatistics *gZhTwStatisticsSet[ZHTW_DETECTOR_NUM_VERIFIERS] = { + nsnull, + &gBig5Statistics, + nsnull, + &gEUCTWStatistics, + nsnull, + nsnull, + nsnull +}; //========================================================== #define KO_DETECTOR_NUM_VERIFIERS 6 static nsVerifier *gKoVerifierSet[KO_DETECTOR_NUM_VERIFIERS] = { @@ -322,6 +538,17 @@ static nsVerifier *gZhVerifierSet[ZH_DETECTOR_NUM_VERIFIERS] = { &nsUCS2BEVerifier, &nsUCS2LEVerifier }; +static nsEUCStatistics *gZhStatisticsSet[ZH_DETECTOR_NUM_VERIFIERS] = { + nsnull, + &gGB2312Statistics, + &gBig5Statistics, + nsnull, + nsnull, + &gEUCTWStatistics, + nsnull, + nsnull, + nsnull +}; //========================================================== #define CJK_DETECTOR_NUM_VERIFIERS 14 static nsVerifier *gCJKVerifierSet[CJK_DETECTOR_NUM_VERIFIERS] = { @@ -340,6 +567,22 @@ static nsVerifier *gCJKVerifierSet[CJK_DETECTOR_NUM_VERIFIERS] = { &nsUCS2BEVerifier, &nsUCS2LEVerifier }; +static nsEUCStatistics *gCJKStatisticsSet[CJK_DETECTOR_NUM_VERIFIERS] = { + nsnull, + nsnull, + &gEUCJPStatistics, + nsnull, + &gEUCKRStatistics, + nsnull, + &gBig5Statistics, + &gEUCTWStatistics, + &gGB2312Statistics, + nsnull, + nsnull, + nsnull, + nsnull, + nsnull +}; //========================================================== class nsXPCOMDetector : private nsPSMDetector, @@ -347,7 +590,7 @@ class nsXPCOMDetector : { NS_DECL_ISUPPORTS public: - nsXPCOMDetector(PRUint8 aItems, nsVerifier** aVer); + nsXPCOMDetector(PRUint8 aItems, nsVerifier** aVer, nsEUCStatistics** aStatisticsSet); virtual ~nsXPCOMDetector(); NS_IMETHOD Init(nsICharsetDetectionObserver* aObserver); NS_IMETHOD DoIt(const char* aBuf, PRUint32 aLen, PRBool* oDontFeedMe); @@ -360,8 +603,8 @@ private: nsICharsetDetectionObserver* mObserver; }; //---------------------------------------------------------- -nsXPCOMDetector::nsXPCOMDetector(PRUint8 aItems, nsVerifier **aVer) - : nsPSMDetector( aItems, aVer) +nsXPCOMDetector::nsXPCOMDetector(PRUint8 aItems, nsVerifier **aVer, nsEUCStatistics** aStatisticsSet) + : nsPSMDetector( aItems, aVer, aStatisticsSet) { NS_INIT_REFCNT(); PR_AtomicIncrement(&g_InstanceCount); @@ -404,6 +647,7 @@ NS_IMETHODIMP nsXPCOMDetector::DoIt( NS_IMETHODIMP nsXPCOMDetector::Done() { NS_ASSERTION(mObserver != nsnull , "have not init yet"); + this->DataEnd(); return NS_OK; } //---------------------------------------------------------- @@ -418,7 +662,7 @@ class nsXPCOMStringDetector : { NS_DECL_ISUPPORTS public: - nsXPCOMStringDetector(PRUint8 aItems, nsVerifier** aVer); + nsXPCOMStringDetector(PRUint8 aItems, nsVerifier** aVer, nsEUCStatistics** aStatisticsSet); virtual ~nsXPCOMStringDetector(); NS_IMETHOD DoIt(const char* aBuf, PRUint32 aLen, const char** oCharset, @@ -429,8 +673,8 @@ private: const char* mResult; }; //---------------------------------------------------------- -nsXPCOMStringDetector::nsXPCOMStringDetector(PRUint8 aItems, nsVerifier** aVer) - : nsPSMDetector( aItems, aVer) +nsXPCOMStringDetector::nsXPCOMStringDetector(PRUint8 aItems, nsVerifier** aVer, nsEUCStatistics** aStatisticsSet) + : nsPSMDetector( aItems, aVer, aStatisticsSet) { NS_INIT_REFCNT(); PR_AtomicIncrement(&g_InstanceCount); @@ -515,29 +759,29 @@ NS_IMETHODIMP nsXPCOMDetectorFactory::CreateInstance( nsXPCOMStringDetector *inst2 = nsnull; if (mCID.Equals(kJAPSMDetectorCID)) { - inst1 = new nsXPCOMDetector(JA_DETECTOR_NUM_VERIFIERS, gJaVerifierSet); + inst1 = new nsXPCOMDetector(JA_DETECTOR_NUM_VERIFIERS, gJaVerifierSet, nsnull); } else if (mCID.Equals(kKOPSMDetectorCID)) { - inst1 = new nsXPCOMDetector(KO_DETECTOR_NUM_VERIFIERS, gKoVerifierSet); + inst1 = new nsXPCOMDetector(KO_DETECTOR_NUM_VERIFIERS, gKoVerifierSet, nsnull); } else if (mCID.Equals(kZHCNPSMDetectorCID)) { - inst1 = new nsXPCOMDetector(ZHCN_DETECTOR_NUM_VERIFIERS, gZhCnVerifierSet); + inst1 = new nsXPCOMDetector(ZHCN_DETECTOR_NUM_VERIFIERS, gZhCnVerifierSet, nsnull); } else if (mCID.Equals(kZHTWPSMDetectorCID)) { - inst1 = new nsXPCOMDetector(ZHTW_DETECTOR_NUM_VERIFIERS, gZhTwVerifierSet); + inst1 = new nsXPCOMDetector(ZHTW_DETECTOR_NUM_VERIFIERS, gZhTwVerifierSet, gZhTwStatisticsSet); } else if (mCID.Equals(kZHPSMDetectorCID)) { - inst1 = new nsXPCOMDetector(ZH_DETECTOR_NUM_VERIFIERS, gZhVerifierSet); + inst1 = new nsXPCOMDetector(ZH_DETECTOR_NUM_VERIFIERS, gZhVerifierSet, gZhStatisticsSet); } else if (mCID.Equals(kCJKPSMDetectorCID)) { - inst1 = new nsXPCOMDetector(CJK_DETECTOR_NUM_VERIFIERS, gCJKVerifierSet); + inst1 = new nsXPCOMDetector(CJK_DETECTOR_NUM_VERIFIERS, gCJKVerifierSet, gCJKStatisticsSet); } else if (mCID.Equals(kJAStringPSMDetectorCID)) { - inst2 = new nsXPCOMStringDetector(JA_DETECTOR_NUM_VERIFIERS - 3, gJaVerifierSet); + inst2 = new nsXPCOMStringDetector(JA_DETECTOR_NUM_VERIFIERS - 3, gJaVerifierSet, nsnull); } else if (mCID.Equals(kKOStringPSMDetectorCID)) { - inst2 = new nsXPCOMStringDetector(KO_DETECTOR_NUM_VERIFIERS - 3, gKoVerifierSet); + inst2 = new nsXPCOMStringDetector(KO_DETECTOR_NUM_VERIFIERS - 3, gKoVerifierSet, nsnull); } else if (mCID.Equals(kZHCNStringPSMDetectorCID)) { - inst2 = new nsXPCOMStringDetector(ZHCN_DETECTOR_NUM_VERIFIERS - 3, gZhCnVerifierSet); + inst2 = new nsXPCOMStringDetector(ZHCN_DETECTOR_NUM_VERIFIERS - 3, gZhCnVerifierSet, nsnull); } else if (mCID.Equals(kZHTWStringPSMDetectorCID)) { - inst2 = new nsXPCOMStringDetector(ZHTW_DETECTOR_NUM_VERIFIERS - 3, gZhTwVerifierSet); + inst2 = new nsXPCOMStringDetector(ZHTW_DETECTOR_NUM_VERIFIERS - 3, gZhTwVerifierSet, gZhTwStatisticsSet); } else if (mCID.Equals(kZHStringPSMDetectorCID)) { - inst2 = new nsXPCOMStringDetector(ZH_DETECTOR_NUM_VERIFIERS - 3, gZhVerifierSet); + inst2 = new nsXPCOMStringDetector(ZH_DETECTOR_NUM_VERIFIERS - 3, gZhVerifierSet, gZhStatisticsSet); } else if (mCID.Equals(kCJKStringPSMDetectorCID)) { - inst2 = new nsXPCOMStringDetector(CJK_DETECTOR_NUM_VERIFIERS - 3, gCJKVerifierSet); + inst2 = new nsXPCOMStringDetector(CJK_DETECTOR_NUM_VERIFIERS - 3, gCJKVerifierSet, gCJKStatisticsSet); } if((NULL == inst1) && (NULL == inst2)) { return NS_ERROR_OUT_OF_MEMORY;