improve charset detectors to use a statistical model

This commit is contained in:
ftang%netscape.com 2000-05-16 22:50:16 +00:00
Родитель 69f29f5564
Коммит 10ed7b66ae
6 изменённых файлов: 1369 добавлений и 20 удалений

Просмотреть файл

@ -0,0 +1,221 @@
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*-
*
* The contents of this file are subject to the Netscape Public
* License Version 1.1 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of
* the License at http://www.mozilla.org/NPL/
*
* Software distributed under the License is distributed on an "AS
* IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
* implied. See the License for the specific language governing
* rights and limitations under the License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is Netscape
* Communications Corporation. Portions created by Netscape are
* Copyright (C) 1998 Netscape Communications Corporation. All
* Rights Reserved.
*
* Contributor(s):
*/
{
{
0.000000f, // FreqH[a1]
0.000000f, // FreqH[a2]
0.000000f, // FreqH[a3]
0.114427f, // FreqH[a4]
0.061058f, // FreqH[a5]
0.075598f, // FreqH[a6]
0.048386f, // FreqH[a7]
0.063966f, // FreqH[a8]
0.027094f, // FreqH[a9]
0.095787f, // FreqH[aa]
0.029525f, // FreqH[ab]
0.031331f, // FreqH[ac]
0.036915f, // FreqH[ad]
0.021805f, // FreqH[ae]
0.019349f, // FreqH[af]
0.037496f, // FreqH[b0]
0.018068f, // FreqH[b1]
0.012760f, // FreqH[b2]
0.030053f, // FreqH[b3]
0.017339f, // FreqH[b4]
0.016731f, // FreqH[b5]
0.019501f, // FreqH[b6]
0.011240f, // FreqH[b7]
0.032973f, // FreqH[b8]
0.016658f, // FreqH[b9]
0.015872f, // FreqH[ba]
0.021458f, // FreqH[bb]
0.012378f, // FreqH[bc]
0.017003f, // FreqH[bd]
0.020802f, // FreqH[be]
0.012454f, // FreqH[bf]
0.009239f, // FreqH[c0]
0.012829f, // FreqH[c1]
0.007922f, // FreqH[c2]
0.010079f, // FreqH[c3]
0.009815f, // FreqH[c4]
0.010104f, // FreqH[c5]
0.000000f, // FreqH[c6]
0.000000f, // FreqH[c7]
0.000000f, // FreqH[c8]
0.000053f, // FreqH[c9]
0.000035f, // FreqH[ca]
0.000105f, // FreqH[cb]
0.000031f, // FreqH[cc]
0.000088f, // FreqH[cd]
0.000027f, // FreqH[ce]
0.000027f, // FreqH[cf]
0.000026f, // FreqH[d0]
0.000035f, // FreqH[d1]
0.000024f, // FreqH[d2]
0.000034f, // FreqH[d3]
0.000375f, // FreqH[d4]
0.000025f, // FreqH[d5]
0.000028f, // FreqH[d6]
0.000020f, // FreqH[d7]
0.000024f, // FreqH[d8]
0.000028f, // FreqH[d9]
0.000031f, // FreqH[da]
0.000059f, // FreqH[db]
0.000040f, // FreqH[dc]
0.000030f, // FreqH[dd]
0.000079f, // FreqH[de]
0.000037f, // FreqH[df]
0.000040f, // FreqH[e0]
0.000023f, // FreqH[e1]
0.000030f, // FreqH[e2]
0.000027f, // FreqH[e3]
0.000064f, // FreqH[e4]
0.000020f, // FreqH[e5]
0.000027f, // FreqH[e6]
0.000025f, // FreqH[e7]
0.000074f, // FreqH[e8]
0.000019f, // FreqH[e9]
0.000023f, // FreqH[ea]
0.000021f, // FreqH[eb]
0.000018f, // FreqH[ec]
0.000017f, // FreqH[ed]
0.000035f, // FreqH[ee]
0.000021f, // FreqH[ef]
0.000019f, // FreqH[f0]
0.000025f, // FreqH[f1]
0.000017f, // FreqH[f2]
0.000037f, // FreqH[f3]
0.000018f, // FreqH[f4]
0.000018f, // FreqH[f5]
0.000019f, // FreqH[f6]
0.000022f, // FreqH[f7]
0.000033f, // FreqH[f8]
0.000032f, // FreqH[f9]
0.000000f, // FreqH[fa]
0.000000f, // FreqH[fb]
0.000000f, // FreqH[fc]
0.000000f, // FreqH[fd]
0.000000f // FreqH[fe]
},
0.020606f, // Lead Byte StdDev
0.010638f, // Lead Byte Mean
0.675261f, // Lead Byte Weight
{
0.020256f, // FreqL[a1]
0.003293f, // FreqL[a2]
0.045811f, // FreqL[a3]
0.016650f, // FreqL[a4]
0.007066f, // FreqL[a5]
0.004146f, // FreqL[a6]
0.009229f, // FreqL[a7]
0.007333f, // FreqL[a8]
0.003296f, // FreqL[a9]
0.005239f, // FreqL[aa]
0.008282f, // FreqL[ab]
0.003791f, // FreqL[ac]
0.006116f, // FreqL[ad]
0.003536f, // FreqL[ae]
0.004024f, // FreqL[af]
0.016654f, // FreqL[b0]
0.009334f, // FreqL[b1]
0.005429f, // FreqL[b2]
0.033392f, // FreqL[b3]
0.006121f, // FreqL[b4]
0.008983f, // FreqL[b5]
0.002801f, // FreqL[b6]
0.004221f, // FreqL[b7]
0.010357f, // FreqL[b8]
0.014695f, // FreqL[b9]
0.077937f, // FreqL[ba]
0.006314f, // FreqL[bb]
0.004020f, // FreqL[bc]
0.007331f, // FreqL[bd]
0.007150f, // FreqL[be]
0.005341f, // FreqL[bf]
0.009195f, // FreqL[c0]
0.005350f, // FreqL[c1]
0.005698f, // FreqL[c2]
0.004472f, // FreqL[c3]
0.007242f, // FreqL[c4]
0.004039f, // FreqL[c5]
0.011154f, // FreqL[c6]
0.016184f, // FreqL[c7]
0.004741f, // FreqL[c8]
0.012814f, // FreqL[c9]
0.007679f, // FreqL[ca]
0.008045f, // FreqL[cb]
0.016631f, // FreqL[cc]
0.009451f, // FreqL[cd]
0.016487f, // FreqL[ce]
0.007287f, // FreqL[cf]
0.012688f, // FreqL[d0]
0.017421f, // FreqL[d1]
0.013205f, // FreqL[d2]
0.031480f, // FreqL[d3]
0.003404f, // FreqL[d4]
0.009149f, // FreqL[d5]
0.008921f, // FreqL[d6]
0.007514f, // FreqL[d7]
0.008683f, // FreqL[d8]
0.008203f, // FreqL[d9]
0.031403f, // FreqL[da]
0.011733f, // FreqL[db]
0.015617f, // FreqL[dc]
0.015306f, // FreqL[dd]
0.004004f, // FreqL[de]
0.010899f, // FreqL[df]
0.009961f, // FreqL[e0]
0.008388f, // FreqL[e1]
0.010920f, // FreqL[e2]
0.003925f, // FreqL[e3]
0.008585f, // FreqL[e4]
0.009108f, // FreqL[e5]
0.015546f, // FreqL[e6]
0.004659f, // FreqL[e7]
0.006934f, // FreqL[e8]
0.007023f, // FreqL[e9]
0.020252f, // FreqL[ea]
0.005387f, // FreqL[eb]
0.024704f, // FreqL[ec]
0.006963f, // FreqL[ed]
0.002625f, // FreqL[ee]
0.009512f, // FreqL[ef]
0.002971f, // FreqL[f0]
0.008233f, // FreqL[f1]
0.010000f, // FreqL[f2]
0.011973f, // FreqL[f3]
0.010553f, // FreqL[f4]
0.005945f, // FreqL[f5]
0.006349f, // FreqL[f6]
0.009401f, // FreqL[f7]
0.008577f, // FreqL[f8]
0.008186f, // FreqL[f9]
0.008159f, // FreqL[fa]
0.005033f, // FreqL[fb]
0.008714f, // FreqL[fc]
0.010614f, // FreqL[fd]
0.006554f // FreqL[fe]
},
0.009909f, // Trail Byte StdDev
0.010638f, // Trail Byte Mean
0.324739f // Trail Byte Weight
};

Просмотреть файл

@ -0,0 +1,221 @@
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*-
*
* The contents of this file are subject to the Netscape Public
* License Version 1.1 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of
* the License at http://www.mozilla.org/NPL/
*
* Software distributed under the License is distributed on an "AS
* IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
* implied. See the License for the specific language governing
* rights and limitations under the License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is Netscape
* Communications Corporation. Portions created by Netscape are
* Copyright (C) 1998 Netscape Communications Corporation. All
* Rights Reserved.
*
* Contributor(s):
*/
{
{
0.364808f, // FreqH[a1]
0.000000f, // FreqH[a2]
0.000000f, // FreqH[a3]
0.145325f, // FreqH[a4]
0.304891f, // FreqH[a5]
0.000000f, // FreqH[a6]
0.000000f, // FreqH[a7]
0.000000f, // FreqH[a8]
0.000000f, // FreqH[a9]
0.000000f, // FreqH[aa]
0.000000f, // FreqH[ab]
0.000000f, // FreqH[ac]
0.000000f, // FreqH[ad]
0.000000f, // FreqH[ae]
0.000000f, // FreqH[af]
0.001835f, // FreqH[b0]
0.010771f, // FreqH[b1]
0.006462f, // FreqH[b2]
0.001157f, // FreqH[b3]
0.002114f, // FreqH[b4]
0.003231f, // FreqH[b5]
0.001356f, // FreqH[b6]
0.007420f, // FreqH[b7]
0.004189f, // FreqH[b8]
0.003231f, // FreqH[b9]
0.003032f, // FreqH[ba]
0.033190f, // FreqH[bb]
0.006303f, // FreqH[bc]
0.006064f, // FreqH[bd]
0.009973f, // FreqH[be]
0.002354f, // FreqH[bf]
0.003670f, // FreqH[c0]
0.009135f, // FreqH[c1]
0.001675f, // FreqH[c2]
0.002792f, // FreqH[c3]
0.002194f, // FreqH[c4]
0.014720f, // FreqH[c5]
0.011928f, // FreqH[c6]
0.000878f, // FreqH[c7]
0.013124f, // FreqH[c8]
0.001077f, // FreqH[c9]
0.009295f, // FreqH[ca]
0.003471f, // FreqH[cb]
0.002872f, // FreqH[cc]
0.002433f, // FreqH[cd]
0.000957f, // FreqH[ce]
0.001636f, // FreqH[cf]
0.000000f, // FreqH[d0]
0.000000f, // FreqH[d1]
0.000000f, // FreqH[d2]
0.000000f, // FreqH[d3]
0.000000f, // FreqH[d4]
0.000000f, // FreqH[d5]
0.000000f, // FreqH[d6]
0.000000f, // FreqH[d7]
0.000000f, // FreqH[d8]
0.000000f, // FreqH[d9]
0.000000f, // FreqH[da]
0.000000f, // FreqH[db]
0.000000f, // FreqH[dc]
0.000000f, // FreqH[dd]
0.000080f, // FreqH[de]
0.000279f, // FreqH[df]
0.000000f, // FreqH[e0]
0.000000f, // FreqH[e1]
0.000000f, // FreqH[e2]
0.000000f, // FreqH[e3]
0.000000f, // FreqH[e4]
0.000000f, // FreqH[e5]
0.000000f, // FreqH[e6]
0.000000f, // FreqH[e7]
0.000000f, // FreqH[e8]
0.000000f, // FreqH[e9]
0.000000f, // FreqH[ea]
0.000000f, // FreqH[eb]
0.000000f, // FreqH[ec]
0.000000f, // FreqH[ed]
0.000000f, // FreqH[ee]
0.000000f, // FreqH[ef]
0.000000f, // FreqH[f0]
0.000000f, // FreqH[f1]
0.000000f, // FreqH[f2]
0.000000f, // FreqH[f3]
0.000000f, // FreqH[f4]
0.000000f, // FreqH[f5]
0.000000f, // FreqH[f6]
0.000000f, // FreqH[f7]
0.000000f, // FreqH[f8]
0.000000f, // FreqH[f9]
0.000000f, // FreqH[fa]
0.000000f, // FreqH[fb]
0.000000f, // FreqH[fc]
0.000080f, // FreqH[fd]
0.000000f // FreqH[fe]
},
0.050407f, // Lead Byte StdDev
0.010638f, // Lead Byte Mean
0.640871f, // Lead Byte Weight
{
0.002473f, // FreqL[a1]
0.039134f, // FreqL[a2]
0.152745f, // FreqL[a3]
0.009694f, // FreqL[a4]
0.000359f, // FreqL[a5]
0.022180f, // FreqL[a6]
0.000758f, // FreqL[a7]
0.004308f, // FreqL[a8]
0.000160f, // FreqL[a9]
0.002513f, // FreqL[aa]
0.003072f, // FreqL[ab]
0.001316f, // FreqL[ac]
0.003830f, // FreqL[ad]
0.001037f, // FreqL[ae]
0.003590f, // FreqL[af]
0.000957f, // FreqL[b0]
0.000160f, // FreqL[b1]
0.000239f, // FreqL[b2]
0.006462f, // FreqL[b3]
0.001596f, // FreqL[b4]
0.031554f, // FreqL[b5]
0.001316f, // FreqL[b6]
0.002194f, // FreqL[b7]
0.016555f, // FreqL[b8]
0.003271f, // FreqL[b9]
0.000678f, // FreqL[ba]
0.000598f, // FreqL[bb]
0.206438f, // FreqL[bc]
0.000718f, // FreqL[bd]
0.001077f, // FreqL[be]
0.003710f, // FreqL[bf]
0.001356f, // FreqL[c0]
0.001356f, // FreqL[c1]
0.000439f, // FreqL[c2]
0.004388f, // FreqL[c3]
0.005704f, // FreqL[c4]
0.000878f, // FreqL[c5]
0.010172f, // FreqL[c6]
0.007061f, // FreqL[c7]
0.014680f, // FreqL[c8]
0.000638f, // FreqL[c9]
0.025730f, // FreqL[ca]
0.002792f, // FreqL[cb]
0.000718f, // FreqL[cc]
0.001795f, // FreqL[cd]
0.091551f, // FreqL[ce]
0.000758f, // FreqL[cf]
0.003909f, // FreqL[d0]
0.000558f, // FreqL[d1]
0.031195f, // FreqL[d2]
0.007061f, // FreqL[d3]
0.001316f, // FreqL[d4]
0.022579f, // FreqL[d5]
0.006981f, // FreqL[d6]
0.007260f, // FreqL[d7]
0.001117f, // FreqL[d8]
0.000239f, // FreqL[d9]
0.012127f, // FreqL[da]
0.000878f, // FreqL[db]
0.003790f, // FreqL[dc]
0.001077f, // FreqL[dd]
0.000758f, // FreqL[de]
0.002114f, // FreqL[df]
0.002234f, // FreqL[e0]
0.000678f, // FreqL[e1]
0.002992f, // FreqL[e2]
0.003311f, // FreqL[e3]
0.023416f, // FreqL[e4]
0.001237f, // FreqL[e5]
0.002753f, // FreqL[e6]
0.005146f, // FreqL[e7]
0.002194f, // FreqL[e8]
0.007021f, // FreqL[e9]
0.008497f, // FreqL[ea]
0.013763f, // FreqL[eb]
0.011768f, // FreqL[ec]
0.006303f, // FreqL[ed]
0.001915f, // FreqL[ee]
0.000638f, // FreqL[ef]
0.008776f, // FreqL[f0]
0.000918f, // FreqL[f1]
0.003431f, // FreqL[f2]
0.057603f, // FreqL[f3]
0.000439f, // FreqL[f4]
0.000439f, // FreqL[f5]
0.000758f, // FreqL[f6]
0.002872f, // FreqL[f7]
0.001675f, // FreqL[f8]
0.011050f, // FreqL[f9]
0.000000f, // FreqL[fa]
0.000279f, // FreqL[fb]
0.012127f, // FreqL[fc]
0.000718f, // FreqL[fd]
0.007380f // FreqL[fe]
},
0.028247f, // Trail Byte StdDev
0.010638f, // Trail Byte Mean
0.359129f // Trail Byte Weight
};

Просмотреть файл

@ -0,0 +1,221 @@
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*-
*
* The contents of this file are subject to the Netscape Public
* License Version 1.1 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of
* the License at http://www.mozilla.org/NPL/
*
* Software distributed under the License is distributed on an "AS
* IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
* implied. See the License for the specific language governing
* rights and limitations under the License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is Netscape
* Communications Corporation. Portions created by Netscape are
* Copyright (C) 1998 Netscape Communications Corporation. All
* Rights Reserved.
*
* Contributor(s):
*/
{
{
0.000000f, // FreqH[a1]
0.000000f, // FreqH[a2]
0.000000f, // FreqH[a3]
0.000000f, // FreqH[a4]
0.000000f, // FreqH[a5]
0.000000f, // FreqH[a6]
0.000000f, // FreqH[a7]
0.000412f, // FreqH[a8]
0.000000f, // FreqH[a9]
0.000000f, // FreqH[aa]
0.000000f, // FreqH[ab]
0.000000f, // FreqH[ac]
0.000000f, // FreqH[ad]
0.000000f, // FreqH[ae]
0.000000f, // FreqH[af]
0.057502f, // FreqH[b0]
0.033182f, // FreqH[b1]
0.002267f, // FreqH[b2]
0.016076f, // FreqH[b3]
0.014633f, // FreqH[b4]
0.032976f, // FreqH[b5]
0.004122f, // FreqH[b6]
0.011336f, // FreqH[b7]
0.058533f, // FreqH[b8]
0.024526f, // FreqH[b9]
0.025969f, // FreqH[ba]
0.054411f, // FreqH[bb]
0.019580f, // FreqH[bc]
0.063273f, // FreqH[bd]
0.113974f, // FreqH[be]
0.029885f, // FreqH[bf]
0.150041f, // FreqH[c0]
0.059151f, // FreqH[c1]
0.002679f, // FreqH[c2]
0.009893f, // FreqH[c3]
0.014839f, // FreqH[c4]
0.026381f, // FreqH[c5]
0.015045f, // FreqH[c6]
0.069456f, // FreqH[c7]
0.089860f, // FreqH[c8]
0.000000f, // FreqH[c9]
0.000000f, // FreqH[ca]
0.000000f, // FreqH[cb]
0.000000f, // FreqH[cc]
0.000000f, // FreqH[cd]
0.000000f, // FreqH[ce]
0.000000f, // FreqH[cf]
0.000000f, // FreqH[d0]
0.000000f, // FreqH[d1]
0.000000f, // FreqH[d2]
0.000000f, // FreqH[d3]
0.000000f, // FreqH[d4]
0.000000f, // FreqH[d5]
0.000000f, // FreqH[d6]
0.000000f, // FreqH[d7]
0.000000f, // FreqH[d8]
0.000000f, // FreqH[d9]
0.000000f, // FreqH[da]
0.000000f, // FreqH[db]
0.000000f, // FreqH[dc]
0.000000f, // FreqH[dd]
0.000000f, // FreqH[de]
0.000000f, // FreqH[df]
0.000000f, // FreqH[e0]
0.000000f, // FreqH[e1]
0.000000f, // FreqH[e2]
0.000000f, // FreqH[e3]
0.000000f, // FreqH[e4]
0.000000f, // FreqH[e5]
0.000000f, // FreqH[e6]
0.000000f, // FreqH[e7]
0.000000f, // FreqH[e8]
0.000000f, // FreqH[e9]
0.000000f, // FreqH[ea]
0.000000f, // FreqH[eb]
0.000000f, // FreqH[ec]
0.000000f, // FreqH[ed]
0.000000f, // FreqH[ee]
0.000000f, // FreqH[ef]
0.000000f, // FreqH[f0]
0.000000f, // FreqH[f1]
0.000000f, // FreqH[f2]
0.000000f, // FreqH[f3]
0.000000f, // FreqH[f4]
0.000000f, // FreqH[f5]
0.000000f, // FreqH[f6]
0.000000f, // FreqH[f7]
0.000000f, // FreqH[f8]
0.000000f, // FreqH[f9]
0.000000f, // FreqH[fa]
0.000000f, // FreqH[fb]
0.000000f, // FreqH[fc]
0.000000f, // FreqH[fd]
0.000000f // FreqH[fe]
},
0.025593f, // Lead Byte StdDev
0.010638f, // Lead Byte Mean
0.647437f, // Lead Byte Weight
{
0.016694f, // FreqL[a1]
0.000000f, // FreqL[a2]
0.012778f, // FreqL[a3]
0.030091f, // FreqL[a4]
0.002679f, // FreqL[a5]
0.006595f, // FreqL[a6]
0.001855f, // FreqL[a7]
0.000824f, // FreqL[a8]
0.005977f, // FreqL[a9]
0.004740f, // FreqL[aa]
0.003092f, // FreqL[ab]
0.000824f, // FreqL[ac]
0.019580f, // FreqL[ad]
0.037304f, // FreqL[ae]
0.008244f, // FreqL[af]
0.014633f, // FreqL[b0]
0.001031f, // FreqL[b1]
0.000000f, // FreqL[b2]
0.003298f, // FreqL[b3]
0.002061f, // FreqL[b4]
0.006183f, // FreqL[b5]
0.005977f, // FreqL[b6]
0.000824f, // FreqL[b7]
0.021847f, // FreqL[b8]
0.014839f, // FreqL[b9]
0.052968f, // FreqL[ba]
0.017312f, // FreqL[bb]
0.007626f, // FreqL[bc]
0.000412f, // FreqL[bd]
0.000824f, // FreqL[be]
0.011129f, // FreqL[bf]
0.000000f, // FreqL[c0]
0.000412f, // FreqL[c1]
0.001649f, // FreqL[c2]
0.005977f, // FreqL[c3]
0.065746f, // FreqL[c4]
0.020198f, // FreqL[c5]
0.021434f, // FreqL[c6]
0.014633f, // FreqL[c7]
0.004122f, // FreqL[c8]
0.001649f, // FreqL[c9]
0.000824f, // FreqL[ca]
0.000824f, // FreqL[cb]
0.051937f, // FreqL[cc]
0.019580f, // FreqL[cd]
0.023289f, // FreqL[ce]
0.026381f, // FreqL[cf]
0.040396f, // FreqL[d0]
0.009068f, // FreqL[d1]
0.001443f, // FreqL[d2]
0.003710f, // FreqL[d3]
0.007420f, // FreqL[d4]
0.001443f, // FreqL[d5]
0.013190f, // FreqL[d6]
0.002885f, // FreqL[d7]
0.000412f, // FreqL[d8]
0.003298f, // FreqL[d9]
0.025969f, // FreqL[da]
0.000412f, // FreqL[db]
0.000412f, // FreqL[dc]
0.006183f, // FreqL[dd]
0.003298f, // FreqL[de]
0.066983f, // FreqL[df]
0.002679f, // FreqL[e0]
0.002267f, // FreqL[e1]
0.011129f, // FreqL[e2]
0.000412f, // FreqL[e3]
0.010099f, // FreqL[e4]
0.015251f, // FreqL[e5]
0.007626f, // FreqL[e6]
0.043899f, // FreqL[e7]
0.003710f, // FreqL[e8]
0.002679f, // FreqL[e9]
0.001443f, // FreqL[ea]
0.010923f, // FreqL[eb]
0.002885f, // FreqL[ec]
0.009068f, // FreqL[ed]
0.019992f, // FreqL[ee]
0.000412f, // FreqL[ef]
0.008450f, // FreqL[f0]
0.005153f, // FreqL[f1]
0.000000f, // FreqL[f2]
0.010099f, // FreqL[f3]
0.000000f, // FreqL[f4]
0.001649f, // FreqL[f5]
0.012160f, // FreqL[f6]
0.011542f, // FreqL[f7]
0.006595f, // FreqL[f8]
0.001855f, // FreqL[f9]
0.010923f, // FreqL[fa]
0.000412f, // FreqL[fb]
0.023702f, // FreqL[fc]
0.003710f, // FreqL[fd]
0.001855f // FreqL[fe]
},
0.013937f, // Trail Byte StdDev
0.010638f, // Trail Byte Mean
0.352563f // Trail Byte Weight
};

Просмотреть файл

@ -0,0 +1,221 @@
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*-
*
* The contents of this file are subject to the Netscape Public
* License Version 1.1 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of
* the License at http://www.mozilla.org/NPL/
*
* Software distributed under the License is distributed on an "AS
* IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
* implied. See the License for the specific language governing
* rights and limitations under the License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is Netscape
* Communications Corporation. Portions created by Netscape are
* Copyright (C) 1998 Netscape Communications Corporation. All
* Rights Reserved.
*
* Contributor(s):
*/
{
{
0.000000f, // FreqH[a1]
0.000000f, // FreqH[a2]
0.000000f, // FreqH[a3]
0.000000f, // FreqH[a4]
0.000000f, // FreqH[a5]
0.000000f, // FreqH[a6]
0.000000f, // FreqH[a7]
0.000000f, // FreqH[a8]
0.000000f, // FreqH[a9]
0.000000f, // FreqH[aa]
0.000000f, // FreqH[ab]
0.000000f, // FreqH[ac]
0.000000f, // FreqH[ad]
0.000000f, // FreqH[ae]
0.000000f, // FreqH[af]
0.000000f, // FreqH[b0]
0.000000f, // FreqH[b1]
0.000000f, // FreqH[b2]
0.000000f, // FreqH[b3]
0.000000f, // FreqH[b4]
0.000000f, // FreqH[b5]
0.000000f, // FreqH[b6]
0.000000f, // FreqH[b7]
0.000000f, // FreqH[b8]
0.000000f, // FreqH[b9]
0.000000f, // FreqH[ba]
0.000000f, // FreqH[bb]
0.000000f, // FreqH[bc]
0.000000f, // FreqH[bd]
0.000000f, // FreqH[be]
0.000000f, // FreqH[bf]
0.000000f, // FreqH[c0]
0.000000f, // FreqH[c1]
0.000000f, // FreqH[c2]
0.000000f, // FreqH[c3]
0.119286f, // FreqH[c4]
0.052233f, // FreqH[c5]
0.044126f, // FreqH[c6]
0.052494f, // FreqH[c7]
0.045906f, // FreqH[c8]
0.019038f, // FreqH[c9]
0.032465f, // FreqH[ca]
0.026252f, // FreqH[cb]
0.025502f, // FreqH[cc]
0.015963f, // FreqH[cd]
0.052493f, // FreqH[ce]
0.019256f, // FreqH[cf]
0.015137f, // FreqH[d0]
0.031782f, // FreqH[d1]
0.017370f, // FreqH[d2]
0.018494f, // FreqH[d3]
0.015575f, // FreqH[d4]
0.016621f, // FreqH[d5]
0.007444f, // FreqH[d6]
0.011642f, // FreqH[d7]
0.013916f, // FreqH[d8]
0.019159f, // FreqH[d9]
0.016445f, // FreqH[da]
0.007851f, // FreqH[db]
0.011079f, // FreqH[dc]
0.022842f, // FreqH[dd]
0.015513f, // FreqH[de]
0.010033f, // FreqH[df]
0.009950f, // FreqH[e0]
0.010347f, // FreqH[e1]
0.013103f, // FreqH[e2]
0.015371f, // FreqH[e3]
0.012502f, // FreqH[e4]
0.007436f, // FreqH[e5]
0.018253f, // FreqH[e6]
0.014134f, // FreqH[e7]
0.008907f, // FreqH[e8]
0.005411f, // FreqH[e9]
0.009570f, // FreqH[ea]
0.013598f, // FreqH[eb]
0.006092f, // FreqH[ec]
0.007409f, // FreqH[ed]
0.008432f, // FreqH[ee]
0.005816f, // FreqH[ef]
0.009349f, // FreqH[f0]
0.005472f, // FreqH[f1]
0.007170f, // FreqH[f2]
0.007420f, // FreqH[f3]
0.003681f, // FreqH[f4]
0.007523f, // FreqH[f5]
0.004610f, // FreqH[f6]
0.006154f, // FreqH[f7]
0.003348f, // FreqH[f8]
0.005074f, // FreqH[f9]
0.005922f, // FreqH[fa]
0.005254f, // FreqH[fb]
0.004682f, // FreqH[fc]
0.002093f, // FreqH[fd]
0.000000f // FreqH[fe]
},
0.016681f, // Lead Byte StdDev
0.010638f, // Lead Byte Mean
0.715599f, // Lead Byte Weight
{
0.028933f, // FreqL[a1]
0.011371f, // FreqL[a2]
0.011053f, // FreqL[a3]
0.007232f, // FreqL[a4]
0.010192f, // FreqL[a5]
0.004093f, // FreqL[a6]
0.015043f, // FreqL[a7]
0.011752f, // FreqL[a8]
0.022387f, // FreqL[a9]
0.008410f, // FreqL[aa]
0.012448f, // FreqL[ab]
0.007473f, // FreqL[ac]
0.003594f, // FreqL[ad]
0.007139f, // FreqL[ae]
0.018912f, // FreqL[af]
0.006083f, // FreqL[b0]
0.003302f, // FreqL[b1]
0.010215f, // FreqL[b2]
0.008791f, // FreqL[b3]
0.024236f, // FreqL[b4]
0.014107f, // FreqL[b5]
0.014108f, // FreqL[b6]
0.010303f, // FreqL[b7]
0.009728f, // FreqL[b8]
0.007877f, // FreqL[b9]
0.009719f, // FreqL[ba]
0.007952f, // FreqL[bb]
0.021028f, // FreqL[bc]
0.005764f, // FreqL[bd]
0.009341f, // FreqL[be]
0.006591f, // FreqL[bf]
0.012517f, // FreqL[c0]
0.005921f, // FreqL[c1]
0.008982f, // FreqL[c2]
0.008771f, // FreqL[c3]
0.012802f, // FreqL[c4]
0.005926f, // FreqL[c5]
0.008342f, // FreqL[c6]
0.003086f, // FreqL[c7]
0.006843f, // FreqL[c8]
0.007576f, // FreqL[c9]
0.004734f, // FreqL[ca]
0.016404f, // FreqL[cb]
0.008803f, // FreqL[cc]
0.008071f, // FreqL[cd]
0.005349f, // FreqL[ce]
0.008566f, // FreqL[cf]
0.010840f, // FreqL[d0]
0.015401f, // FreqL[d1]
0.031904f, // FreqL[d2]
0.008670f, // FreqL[d3]
0.011479f, // FreqL[d4]
0.010936f, // FreqL[d5]
0.007617f, // FreqL[d6]
0.008995f, // FreqL[d7]
0.008114f, // FreqL[d8]
0.008658f, // FreqL[d9]
0.005934f, // FreqL[da]
0.010452f, // FreqL[db]
0.009142f, // FreqL[dc]
0.004519f, // FreqL[dd]
0.008339f, // FreqL[de]
0.007476f, // FreqL[df]
0.007027f, // FreqL[e0]
0.006025f, // FreqL[e1]
0.021804f, // FreqL[e2]
0.024248f, // FreqL[e3]
0.015895f, // FreqL[e4]
0.003768f, // FreqL[e5]
0.010171f, // FreqL[e6]
0.010007f, // FreqL[e7]
0.010178f, // FreqL[e8]
0.008316f, // FreqL[e9]
0.006832f, // FreqL[ea]
0.006364f, // FreqL[eb]
0.009141f, // FreqL[ec]
0.009148f, // FreqL[ed]
0.012081f, // FreqL[ee]
0.011914f, // FreqL[ef]
0.004464f, // FreqL[f0]
0.014257f, // FreqL[f1]
0.006907f, // FreqL[f2]
0.011292f, // FreqL[f3]
0.018622f, // FreqL[f4]
0.008149f, // FreqL[f5]
0.004636f, // FreqL[f6]
0.006612f, // FreqL[f7]
0.013478f, // FreqL[f8]
0.012614f, // FreqL[f9]
0.005186f, // FreqL[fa]
0.048285f, // FreqL[fb]
0.006816f, // FreqL[fc]
0.006743f, // FreqL[fd]
0.008671f // FreqL[fe]
},
0.006630f, // Trail Byte StdDev
0.010638f, // Trail Byte Mean
0.284401f // Trail Byte Weight
};

Просмотреть файл

@ -0,0 +1,221 @@
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*-
*
* The contents of this file are subject to the Netscape Public
* License Version 1.1 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of
* the License at http://www.mozilla.org/NPL/
*
* Software distributed under the License is distributed on an "AS
* IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
* implied. See the License for the specific language governing
* rights and limitations under the License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is Netscape
* Communications Corporation. Portions created by Netscape are
* Copyright (C) 1998 Netscape Communications Corporation. All
* Rights Reserved.
*
* Contributor(s):
*/
{
{
0.011628f, // FreqH[a1]
0.000000f, // FreqH[a2]
0.000000f, // FreqH[a3]
0.000000f, // FreqH[a4]
0.000000f, // FreqH[a5]
0.000000f, // FreqH[a6]
0.000000f, // FreqH[a7]
0.000000f, // FreqH[a8]
0.000000f, // FreqH[a9]
0.000000f, // FreqH[aa]
0.000000f, // FreqH[ab]
0.000000f, // FreqH[ac]
0.000000f, // FreqH[ad]
0.000000f, // FreqH[ae]
0.000000f, // FreqH[af]
0.011628f, // FreqH[b0]
0.012403f, // FreqH[b1]
0.009302f, // FreqH[b2]
0.003876f, // FreqH[b3]
0.017829f, // FreqH[b4]
0.037209f, // FreqH[b5]
0.008527f, // FreqH[b6]
0.010078f, // FreqH[b7]
0.019380f, // FreqH[b8]
0.054264f, // FreqH[b9]
0.010078f, // FreqH[ba]
0.041085f, // FreqH[bb]
0.020930f, // FreqH[bc]
0.018605f, // FreqH[bd]
0.010078f, // FreqH[be]
0.013178f, // FreqH[bf]
0.016279f, // FreqH[c0]
0.006202f, // FreqH[c1]
0.009302f, // FreqH[c2]
0.017054f, // FreqH[c3]
0.011628f, // FreqH[c4]
0.008527f, // FreqH[c5]
0.004651f, // FreqH[c6]
0.006202f, // FreqH[c7]
0.017829f, // FreqH[c8]
0.024806f, // FreqH[c9]
0.020155f, // FreqH[ca]
0.013953f, // FreqH[cb]
0.032558f, // FreqH[cc]
0.035659f, // FreqH[cd]
0.068217f, // FreqH[ce]
0.010853f, // FreqH[cf]
0.036434f, // FreqH[d0]
0.117054f, // FreqH[d1]
0.027907f, // FreqH[d2]
0.100775f, // FreqH[d3]
0.010078f, // FreqH[d4]
0.017829f, // FreqH[d5]
0.062016f, // FreqH[d6]
0.012403f, // FreqH[d7]
0.000000f, // FreqH[d8]
0.000000f, // FreqH[d9]
0.000000f, // FreqH[da]
0.000000f, // FreqH[db]
0.000000f, // FreqH[dc]
0.000000f, // FreqH[dd]
0.000000f, // FreqH[de]
0.000000f, // FreqH[df]
0.000000f, // FreqH[e0]
0.000000f, // FreqH[e1]
0.000000f, // FreqH[e2]
0.000000f, // FreqH[e3]
0.000000f, // FreqH[e4]
0.000000f, // FreqH[e5]
0.000000f, // FreqH[e6]
0.000000f, // FreqH[e7]
0.000000f, // FreqH[e8]
0.000000f, // FreqH[e9]
0.001550f, // FreqH[ea]
0.000000f, // FreqH[eb]
0.000000f, // FreqH[ec]
0.000000f, // FreqH[ed]
0.000000f, // FreqH[ee]
0.000000f, // FreqH[ef]
0.000000f, // FreqH[f0]
0.000000f, // FreqH[f1]
0.000000f, // FreqH[f2]
0.000000f, // FreqH[f3]
0.000000f, // FreqH[f4]
0.000000f, // FreqH[f5]
0.000000f, // FreqH[f6]
0.000000f, // FreqH[f7]
0.000000f, // FreqH[f8]
0.000000f, // FreqH[f9]
0.000000f, // FreqH[fa]
0.000000f, // FreqH[fb]
0.000000f, // FreqH[fc]
0.000000f, // FreqH[fd]
0.000000f // FreqH[fe]
},
0.020081f, // Lead Byte StdDev
0.010638f, // Lead Byte Mean
0.586533f, // Lead Byte Weight
{
0.006202f, // FreqL[a1]
0.031008f, // FreqL[a2]
0.005426f, // FreqL[a3]
0.003101f, // FreqL[a4]
0.001550f, // FreqL[a5]
0.003101f, // FreqL[a6]
0.082171f, // FreqL[a7]
0.014729f, // FreqL[a8]
0.006977f, // FreqL[a9]
0.001550f, // FreqL[aa]
0.013953f, // FreqL[ab]
0.000000f, // FreqL[ac]
0.013953f, // FreqL[ad]
0.010078f, // FreqL[ae]
0.008527f, // FreqL[af]
0.006977f, // FreqL[b0]
0.004651f, // FreqL[b1]
0.003101f, // FreqL[b2]
0.003101f, // FreqL[b3]
0.003101f, // FreqL[b4]
0.008527f, // FreqL[b5]
0.003101f, // FreqL[b6]
0.005426f, // FreqL[b7]
0.005426f, // FreqL[b8]
0.005426f, // FreqL[b9]
0.003101f, // FreqL[ba]
0.001550f, // FreqL[bb]
0.006202f, // FreqL[bc]
0.014729f, // FreqL[bd]
0.010853f, // FreqL[be]
0.000000f, // FreqL[bf]
0.011628f, // FreqL[c0]
0.000000f, // FreqL[c1]
0.031783f, // FreqL[c2]
0.013953f, // FreqL[c3]
0.030233f, // FreqL[c4]
0.039535f, // FreqL[c5]
0.008527f, // FreqL[c6]
0.015504f, // FreqL[c7]
0.000000f, // FreqL[c8]
0.003101f, // FreqL[c9]
0.008527f, // FreqL[ca]
0.016279f, // FreqL[cb]
0.005426f, // FreqL[cc]
0.001550f, // FreqL[cd]
0.013953f, // FreqL[ce]
0.013953f, // FreqL[cf]
0.044961f, // FreqL[d0]
0.003101f, // FreqL[d1]
0.004651f, // FreqL[d2]
0.006977f, // FreqL[d3]
0.001550f, // FreqL[d4]
0.005426f, // FreqL[d5]
0.012403f, // FreqL[d6]
0.001550f, // FreqL[d7]
0.015504f, // FreqL[d8]
0.000000f, // FreqL[d9]
0.006202f, // FreqL[da]
0.001550f, // FreqL[db]
0.000000f, // FreqL[dc]
0.007752f, // FreqL[dd]
0.006977f, // FreqL[de]
0.001550f, // FreqL[df]
0.009302f, // FreqL[e0]
0.011628f, // FreqL[e1]
0.004651f, // FreqL[e2]
0.010853f, // FreqL[e3]
0.012403f, // FreqL[e4]
0.017829f, // FreqL[e5]
0.005426f, // FreqL[e6]
0.024806f, // FreqL[e7]
0.000000f, // FreqL[e8]
0.006202f, // FreqL[e9]
0.000000f, // FreqL[ea]
0.082171f, // FreqL[eb]
0.015504f, // FreqL[ec]
0.004651f, // FreqL[ed]
0.000000f, // FreqL[ee]
0.006977f, // FreqL[ef]
0.004651f, // FreqL[f0]
0.000000f, // FreqL[f1]
0.008527f, // FreqL[f2]
0.012403f, // FreqL[f3]
0.004651f, // FreqL[f4]
0.003876f, // FreqL[f5]
0.003101f, // FreqL[f6]
0.022481f, // FreqL[f7]
0.024031f, // FreqL[f8]
0.001550f, // FreqL[f9]
0.047287f, // FreqL[fa]
0.009302f, // FreqL[fb]
0.001550f, // FreqL[fc]
0.005426f, // FreqL[fd]
0.017054f // FreqL[fe]
},
0.014156f, // Trail Byte StdDev
0.010638f, // Trail Byte Mean
0.413467f // Trail Byte Weight
};

Просмотреть файл

@ -22,6 +22,7 @@
*/ */
#include <math.h>
#include "nsVerifier.h" #include "nsVerifier.h"
//---- for verifiers //---- for verifiers
#include "nsSJISVerifier.h" #include "nsSJISVerifier.h"
@ -95,7 +96,140 @@ NS_DEFINE_CID(kCJKStringPSMDetectorCID, NS_CJK_STRING_PSMDETECTOR_CID);
#define DETECTOR_DEBUG #define DETECTOR_DEBUG
// Reference byte-frequency model for one double-byte charset.
// Instances are filled by aggregate initialization from the *Statistics.h
// tables, so the field order here must stay in step with those tables:
// lead-byte frequencies, StdDev, Mean, Weight, then the same four for the
// trail byte.
// NOTE: "Secound" is a misspelling of "Second"; the names are left alone
// because code outside this struct presumably refers to them - confirm
// before renaming.
typedef struct {
// Lead-byte frequencies; the tables label the 94 entries FreqH[a1]..FreqH[fe].
float mFirstByteFreq[94];
// "Lead Byte StdDev" entry of the table.
float mFirstByteStdDev;
// "Lead Byte Mean" entry of the table.
float mFirstByteMean;
// "Lead Byte Weight" entry of the table.
float mFirstByteWeight;
// Trail-byte frequencies; the tables label the 94 entries FreqL[a1]..FreqL[fe].
float mSecoundByteFreq[94];
// "Trail Byte StdDev" entry of the table.
float mSecoundByteStdDev;
// "Trail Byte Mean" entry of the table.
float mSecoundByteMean;
// "Trail Byte Weight" entry of the table.
float mSecoundByteWeight;
} nsEUCStatistics;
// Per-charset reference statistics. Each included header supplies the
// complete brace-enclosed initializer, including the terminating ';'.
static nsEUCStatistics gBig5Statistics =
#include "Big5Statistics.h"
// end of Big5Statistics.h include
static nsEUCStatistics gEUCTWStatistics =
#include "EUCTWStatistics.h"
// end of EUCTWStatistics.h include
static nsEUCStatistics gGB2312Statistics =
#include "GB2312Statistics.h"
// end of GB2312Statistics.h include
static nsEUCStatistics gEUCJPStatistics =
#include "EUCJPStatistics.h"
// end of EUCJPStatistics.h include
static nsEUCStatistics gEUCKRStatistics =
#include "EUCKRStatistics.h"
// end of EUCKRStatistics.h include
// Accumulates histograms of the lead and trail bytes seen in a byte stream
// (bytes 0xa1..0xfe, 94 buckets each) and scores the resulting distribution
// against a reference frequency table.
//
// Typical use: call Sample() for every buffer, then CalFreq() once to turn
// the counts into frequencies, then GetScore() against each candidate table.
class nsEUCSampler {
public:
  nsEUCSampler() {
    mTotal = 0;
    mThreshold = 2000;
    mState = 0;
    PRInt32 i;
    for(i = 0; i < 94; i++) {
      mFirstByteCnt[i] = mSecondByteCnt[i] = 0;
      // Zero the frequency tables as well, so that GetScore() called before
      // CalFreq() reads defined values instead of indeterminate floats.
      mFirstByteFreq[i] = mSecondByteFreq[i] = 0.0f;
    }
  }
  // True once more than mThreshold (2000) bytes have been counted.
  PRBool EnoughData() { return mTotal > mThreshold; }
  // True once more than one byte has been counted.
  PRBool GetSomeData() { return mTotal > 1; }
  // Feeds aLen bytes of aIn into the histograms; returns PR_FALSE once the
  // input has been found not to follow the expected two-byte pattern.
  PRBool Sample(const char* aIn, PRUint32 aLen);
  // Converts the byte counts into frequencies (count / mTotal).
  void CalFreq();
  // Distance between the sampled distribution and a reference table;
  // smaller means closer.
  float GetScore(const float* aFirstByteFreq, float aFirstByteWeight,
                 const float* aSecondByteFreq, float aSecondByteWeight);
  // Root of the summed squared differences of two 94-entry arrays, over 94.
  float GetScore(const float* array1, const float* array2);
private:
  PRUint32 mTotal;              // number of lead + trail bytes counted so far
  PRUint32 mThreshold;          // byte count at which EnoughData() turns true
  PRInt8 mState;                // 0: expect lead byte, 2: expect trail byte, 1: invalid input
  PRUint32 mFirstByteCnt[94];   // lead-byte histogram, index = byte - 0xa1
  PRUint32 mSecondByteCnt[94];  // trail-byte histogram, index = byte - 0xa1
  float mFirstByteFreq[94];     // filled by CalFreq()
  float mSecondByteFreq[94];    // filled by CalFreq()
};
/**
 * Feed a buffer into the lead/trail byte histograms.
 *
 * Bytes without the high bit set are skipped while a lead byte is expected.
 * A high-bit byte in 0xa1..0xfe is counted as a lead byte and must be
 * followed immediately by another byte in 0xa1..0xfe, which is counted as the
 * trail byte. Any other high-bit byte, or a trail position holding a byte
 * outside 0xa1..0xfe, moves the sampler into the error state (mState == 1),
 * which is permanent.
 *
 * @param aIn   bytes to sample; not dereferenced when aLen is 0
 * @param aLen  number of bytes available at aIn
 * @return PR_FALSE if the sampler is (or has just entered) the error state,
 *         PR_TRUE otherwise
 */
PRBool nsEUCSampler::Sample(const char* aIn, PRUint32 aLen)
{
  if(mState == 1)
    return PR_FALSE;

  const unsigned char* p = (const unsigned char*) aIn;

  // Cap the number of bytes ever counted at 0x80000000. The check is written
  // against the remaining headroom because "aLen + mTotal" can wrap around in
  // 32-bit unsigned arithmetic and silently skip the clamp. mTotal never
  // exceeds kMaxTotal, so the subtraction cannot underflow.
  const PRUint32 kMaxTotal = 0x80000000;
  if(aLen > kMaxTotal - mTotal)
    aLen = kMaxTotal - mTotal;

  PRUint32 i;
  for(i = 0; (i < aLen) && (1 != mState); i++, p++)
  {
    switch(mState) {
      case 0:  // expecting a lead byte; plain 7-bit bytes are ignored
        if(*p & 0x0080)
        {
          if((0x00ff == *p) || (0x00a1 > *p)) {
            mState = 1;  // 0x80..0xa0 or 0xff: outside the sampled range
          } else {
            mTotal++;
            mFirstByteCnt[*p - 0x00a1]++;
            mState = 2;
          }
        }
        break;
      case 1:  // error state; the loop condition keeps us from getting here
        break;
      case 2:  // expecting the trail byte of the current character
        if(*p & 0x0080)
        {
          if((0x00ff == *p) || (0x00a1 > *p)) {
            mState = 1;
          } else {
            // mTotal is bumped for trail bytes too, so it counts bytes
            // rather than characters.
            mTotal++;
            mSecondByteCnt[*p - 0x00a1]++;
            mState = 0;
          }
        } else {
          mState = 1;  // a 7-bit byte cannot follow a lead byte
        }
        break;
      default:
        mState = 1;
    }
  }
  return (1 != mState);
}
/**
 * Score the sampled distribution against one reference table.
 *
 * Only the lead-byte distribution takes part in the score at present; the
 * trail-byte table and both weights are accepted but not used.
 *
 * @return distance between aFirstByteFreq and the sampled lead-byte
 *         frequencies, as computed by the two-array GetScore()
 */
float nsEUCSampler::GetScore(const float* aFirstByteFreq, float aFirstByteWeight,
                             const float* aSecondByteFreq, float aSecondByteWeight)
{
  const float leadByteScore = GetScore(aFirstByteFreq, mFirstByteFreq);
  return leadByteScore;
  /*
   Weighted lead + trail variant, currently disabled:

   return aFirstByteWeight * GetScore(aFirstByteFreq, mFirstByteFreq) +
          aSecondByteWeight * GetScore(aSecondByteFreq, mSecondByteFreq);
  */
}
float nsEUCSampler::GetScore(const float* array1, const float* array2)
{
float s;
float sum=0.0;
PRUint16 i;
for(i=0;i<94;i++) {
s = array1[i] - array2[i];
sum += s * s;
}
return (float)sqrt((double)sum) / 94.0f;
}
void nsEUCSampler::CalFreq()
{
PRUint32 i;
for(i = 0 ; i < 94; i++) {
mFirstByteFreq[i] = (float)mFirstByteCnt[i] / (float)mTotal;
mSecondByteFreq[i] = (float)mSecondByteCnt[i] / (float)mTotal;
}
}
/* /*
In the current design, we know the following combination of verifiers In the current design, we know the following combination of verifiers
are not good- are not good-
@ -114,7 +248,7 @@ NS_DEFINE_CID(kCJKStringPSMDetectorCID, NS_CJK_STRING_PSMDETECTOR_CID);
#define MAX_VERIFIERS 16 #define MAX_VERIFIERS 16
class nsPSMDetector { class nsPSMDetector {
public : public :
nsPSMDetector(PRUint8 aItems, nsVerifier** aVerifierSet); nsPSMDetector(PRUint8 aItems, nsVerifier** aVerifierSet, nsEUCStatistics** aStatisticsSet);
virtual ~nsPSMDetector() {}; virtual ~nsPSMDetector() {};
virtual PRBool HandleData(const char* aBuf, PRUint32 aLen); virtual PRBool HandleData(const char* aBuf, PRUint32 aLen);
@ -127,18 +261,25 @@ protected:
PRUint8 mState[MAX_VERIFIERS]; PRUint8 mState[MAX_VERIFIERS];
PRUint8 mItemIdx[MAX_VERIFIERS]; PRUint8 mItemIdx[MAX_VERIFIERS];
nsVerifier** mVerifier; nsVerifier** mVerifier;
nsEUCStatistics** mStatisticsData;
PRBool mDone; PRBool mDone;
PRBool mRunSampler;
protected:
void Sample(const char* aBuf, PRUint32 aLen, PRBool aLastChance=PR_FALSE);
private: private:
#ifdef DETECTOR_DEBUG #ifdef DETECTOR_DEBUG
PRUint32 mDbgTest; PRUint32 mDbgTest;
PRUint32 mDbgLen; PRUint32 mDbgLen;
#endif #endif
nsEUCSampler mSampler;
}; };
//---------------------------------------------------------- //----------------------------------------------------------
nsPSMDetector::nsPSMDetector(PRUint8 aItems, nsVerifier** aVerifierSet) nsPSMDetector::nsPSMDetector(PRUint8 aItems, nsVerifier** aVerifierSet, nsEUCStatistics** aStatisticsSet)
{ {
mRunSampler = (nsnull != aStatisticsSet);
mStatisticsData = aStatisticsSet;
mDone= PR_FALSE; mDone= PR_FALSE;
mItems = aItems; mItems = aItems;
NS_ASSERTION(MAX_VERIFIERS >= aItems , "MAX_VERIFIERS is too small!"); NS_ASSERTION(MAX_VERIFIERS >= aItems , "MAX_VERIFIERS is too small!");
@ -156,6 +297,8 @@ nsPSMDetector::nsPSMDetector(PRUint8 aItems, nsVerifier** aVerifierSet)
//---------------------------------------------------------- //----------------------------------------------------------
void nsPSMDetector::DataEnd() void nsPSMDetector::DataEnd()
{ {
if(mRunSampler)
Sample(nsnull, 0, PR_TRUE);
} }
//---------------------------------------------------------- //----------------------------------------------------------
@ -251,12 +394,76 @@ PRBool nsPSMDetector::HandleData(const char* aBuf, PRUint32 aLen)
} }
} }
} }
if(mRunSampler)
Sample(aBuf, aLen);
#ifdef DETECTOR_DEBUG #ifdef DETECTOR_DEBUG
mDbgLen += aLen; mDbgLen += aLen;
#endif #endif
return PR_FALSE; return PR_FALSE;
} }
void nsPSMDetector::Sample(const char* aBuf, PRUint32 aLen, PRBool aLastChance)
{
PRInt32 nonUCS2Num=0;
PRInt32 j;
PRInt32 eucNum=0;
for(j = 0; j < mItems; j++) {
if(nsnull != mStatisticsData[mItemIdx[j]])
eucNum++;
if(((&nsUCS2BEVerifier) != mVerifier[mItemIdx[j]]) &&
((&nsUCS2LEVerifier) != mVerifier[mItemIdx[j]])) {
nonUCS2Num++;
}
}
mRunSampler = (eucNum > 1);
if(mRunSampler) {
mRunSampler = mSampler.Sample(aBuf, aLen);
if(((aLastChance && mSampler.GetSomeData()) ||
mSampler.EnoughData())
&& (eucNum == nonUCS2Num)) {
mSampler.CalFreq();
#ifdef DETECTOR_DEBUG
printf("We cannot figure out charset from the encoding, "
"All EUC based charset share the same encoding structure.\n"
"Detect based on statistics");
if(aLastChance) {
printf(" after we receive all the data.\n");
} else {
printf(" after we receive enough data.\n");
}
#endif
PRInt32 bestIdx;
PRInt32 eucCnt=0;
float bestScore = 0.0f;
for(j = 0; j < mItems; j++) {
if(nsnull != mStatisticsData[mItemIdx[j]])
{
float score = mSampler.GetScore(
mStatisticsData[mItemIdx[j]]->mFirstByteFreq,
mStatisticsData[mItemIdx[j]]->mFirstByteWeight,
mStatisticsData[mItemIdx[j]]->mSecoundByteFreq,
mStatisticsData[mItemIdx[j]]->mSecoundByteWeight );
#ifdef DETECTOR_DEBUG
printf("Differences between %s and this data is %2.8f\n",
mVerifier[mItemIdx[j]]->charset,
score);
#endif
if(( 0 == eucCnt++) || (bestScore > score )) {
bestScore = score;
bestIdx = j;
} // if(( 0 == eucCnt++) || (bestScore > score ))
} // if(nsnull != ...)
} // for
#ifdef DETECTOR_DEBUG
printf("Based on the statistic, we decide it is %s",
mVerifier[mItemIdx[bestIdx]]->charset);
#endif
Report( mVerifier[mItemIdx[bestIdx]]->charset);
mDone = PR_TRUE;
} // if (eucNum == nonUCS2Num)
} // if(mRunSampler)
}
//========================================================== //==========================================================
/* /*
This class won't detect x-euc-tw for now. It can only This class won't detect x-euc-tw for now. It can only
@ -277,6 +484,15 @@ static nsVerifier *gZhTwVerifierSet[ZHTW_DETECTOR_NUM_VERIFIERS] = {
&nsUCS2BEVerifier, &nsUCS2BEVerifier,
&nsUCS2LEVerifier &nsUCS2LEVerifier
}; };
// Byte-frequency statistics handed to the detector together with
// gZhTwVerifierSet.  nsPSMDetector::Sample() looks up the statistics and the
// verifier with the same index, so the order here has to follow the order of
// the verifier set.  A nsnull slot means that entry has no statistics and is
// left out of the statistical comparison.
static nsEUCStatistics *gZhTwStatisticsSet[ZHTW_DETECTOR_NUM_VERIFIERS] = {
nsnull,
&gBig5Statistics,
nsnull,
&gEUCTWStatistics,
nsnull,
nsnull,
nsnull
};
//========================================================== //==========================================================
#define KO_DETECTOR_NUM_VERIFIERS 6 #define KO_DETECTOR_NUM_VERIFIERS 6
static nsVerifier *gKoVerifierSet[KO_DETECTOR_NUM_VERIFIERS] = { static nsVerifier *gKoVerifierSet[KO_DETECTOR_NUM_VERIFIERS] = {
@ -322,6 +538,17 @@ static nsVerifier *gZhVerifierSet[ZH_DETECTOR_NUM_VERIFIERS] = {
&nsUCS2BEVerifier, &nsUCS2BEVerifier,
&nsUCS2LEVerifier &nsUCS2LEVerifier
}; };
// Byte-frequency statistics handed to the detector together with
// gZhVerifierSet.  nsPSMDetector::Sample() looks up the statistics and the
// verifier with the same index, so the order here has to follow the order of
// the verifier set.  A nsnull slot means that entry has no statistics and is
// left out of the statistical comparison.
static nsEUCStatistics *gZhStatisticsSet[ZH_DETECTOR_NUM_VERIFIERS] = {
nsnull,
&gGB2312Statistics,
&gBig5Statistics,
nsnull,
nsnull,
&gEUCTWStatistics,
nsnull,
nsnull,
nsnull
};
//========================================================== //==========================================================
#define CJK_DETECTOR_NUM_VERIFIERS 14 #define CJK_DETECTOR_NUM_VERIFIERS 14
static nsVerifier *gCJKVerifierSet[CJK_DETECTOR_NUM_VERIFIERS] = { static nsVerifier *gCJKVerifierSet[CJK_DETECTOR_NUM_VERIFIERS] = {
@ -340,6 +567,22 @@ static nsVerifier *gCJKVerifierSet[CJK_DETECTOR_NUM_VERIFIERS] = {
&nsUCS2BEVerifier, &nsUCS2BEVerifier,
&nsUCS2LEVerifier &nsUCS2LEVerifier
}; };
// Byte-frequency statistics handed to the detector together with
// gCJKVerifierSet.  nsPSMDetector::Sample() looks up the statistics and the
// verifier with the same index, so the order here has to follow the order of
// the verifier set.  A nsnull slot means that entry has no statistics and is
// left out of the statistical comparison.
static nsEUCStatistics *gCJKStatisticsSet[CJK_DETECTOR_NUM_VERIFIERS] = {
nsnull,
nsnull,
&gEUCJPStatistics,
nsnull,
&gEUCKRStatistics,
nsnull,
&gBig5Statistics,
&gEUCTWStatistics,
&gGB2312Statistics,
nsnull,
nsnull,
nsnull,
nsnull,
nsnull
};
//========================================================== //==========================================================
class nsXPCOMDetector : class nsXPCOMDetector :
private nsPSMDetector, private nsPSMDetector,
@ -347,7 +590,7 @@ class nsXPCOMDetector :
{ {
NS_DECL_ISUPPORTS NS_DECL_ISUPPORTS
public: public:
nsXPCOMDetector(PRUint8 aItems, nsVerifier** aVer); nsXPCOMDetector(PRUint8 aItems, nsVerifier** aVer, nsEUCStatistics** aStatisticsSet);
virtual ~nsXPCOMDetector(); virtual ~nsXPCOMDetector();
NS_IMETHOD Init(nsICharsetDetectionObserver* aObserver); NS_IMETHOD Init(nsICharsetDetectionObserver* aObserver);
NS_IMETHOD DoIt(const char* aBuf, PRUint32 aLen, PRBool* oDontFeedMe); NS_IMETHOD DoIt(const char* aBuf, PRUint32 aLen, PRBool* oDontFeedMe);
@ -360,8 +603,8 @@ private:
nsICharsetDetectionObserver* mObserver; nsICharsetDetectionObserver* mObserver;
}; };
//---------------------------------------------------------- //----------------------------------------------------------
nsXPCOMDetector::nsXPCOMDetector(PRUint8 aItems, nsVerifier **aVer) nsXPCOMDetector::nsXPCOMDetector(PRUint8 aItems, nsVerifier **aVer, nsEUCStatistics** aStatisticsSet)
: nsPSMDetector( aItems, aVer) : nsPSMDetector( aItems, aVer, aStatisticsSet)
{ {
NS_INIT_REFCNT(); NS_INIT_REFCNT();
PR_AtomicIncrement(&g_InstanceCount); PR_AtomicIncrement(&g_InstanceCount);
@ -404,6 +647,7 @@ NS_IMETHODIMP nsXPCOMDetector::DoIt(
NS_IMETHODIMP nsXPCOMDetector::Done() NS_IMETHODIMP nsXPCOMDetector::Done()
{ {
NS_ASSERTION(mObserver != nsnull , "have not init yet"); NS_ASSERTION(mObserver != nsnull , "have not init yet");
this->DataEnd();
return NS_OK; return NS_OK;
} }
//---------------------------------------------------------- //----------------------------------------------------------
@ -418,7 +662,7 @@ class nsXPCOMStringDetector :
{ {
NS_DECL_ISUPPORTS NS_DECL_ISUPPORTS
public: public:
nsXPCOMStringDetector(PRUint8 aItems, nsVerifier** aVer); nsXPCOMStringDetector(PRUint8 aItems, nsVerifier** aVer, nsEUCStatistics** aStatisticsSet);
virtual ~nsXPCOMStringDetector(); virtual ~nsXPCOMStringDetector();
NS_IMETHOD DoIt(const char* aBuf, PRUint32 aLen, NS_IMETHOD DoIt(const char* aBuf, PRUint32 aLen,
const char** oCharset, const char** oCharset,
@ -429,8 +673,8 @@ private:
const char* mResult; const char* mResult;
}; };
//---------------------------------------------------------- //----------------------------------------------------------
nsXPCOMStringDetector::nsXPCOMStringDetector(PRUint8 aItems, nsVerifier** aVer) nsXPCOMStringDetector::nsXPCOMStringDetector(PRUint8 aItems, nsVerifier** aVer, nsEUCStatistics** aStatisticsSet)
: nsPSMDetector( aItems, aVer) : nsPSMDetector( aItems, aVer, aStatisticsSet)
{ {
NS_INIT_REFCNT(); NS_INIT_REFCNT();
PR_AtomicIncrement(&g_InstanceCount); PR_AtomicIncrement(&g_InstanceCount);
@ -515,29 +759,29 @@ NS_IMETHODIMP nsXPCOMDetectorFactory::CreateInstance(
nsXPCOMStringDetector *inst2 = nsnull; nsXPCOMStringDetector *inst2 = nsnull;
if (mCID.Equals(kJAPSMDetectorCID)) { if (mCID.Equals(kJAPSMDetectorCID)) {
inst1 = new nsXPCOMDetector(JA_DETECTOR_NUM_VERIFIERS, gJaVerifierSet); inst1 = new nsXPCOMDetector(JA_DETECTOR_NUM_VERIFIERS, gJaVerifierSet, nsnull);
} else if (mCID.Equals(kKOPSMDetectorCID)) { } else if (mCID.Equals(kKOPSMDetectorCID)) {
inst1 = new nsXPCOMDetector(KO_DETECTOR_NUM_VERIFIERS, gKoVerifierSet); inst1 = new nsXPCOMDetector(KO_DETECTOR_NUM_VERIFIERS, gKoVerifierSet, nsnull);
} else if (mCID.Equals(kZHCNPSMDetectorCID)) { } else if (mCID.Equals(kZHCNPSMDetectorCID)) {
inst1 = new nsXPCOMDetector(ZHCN_DETECTOR_NUM_VERIFIERS, gZhCnVerifierSet); inst1 = new nsXPCOMDetector(ZHCN_DETECTOR_NUM_VERIFIERS, gZhCnVerifierSet, nsnull);
} else if (mCID.Equals(kZHTWPSMDetectorCID)) { } else if (mCID.Equals(kZHTWPSMDetectorCID)) {
inst1 = new nsXPCOMDetector(ZHTW_DETECTOR_NUM_VERIFIERS, gZhTwVerifierSet); inst1 = new nsXPCOMDetector(ZHTW_DETECTOR_NUM_VERIFIERS, gZhTwVerifierSet, gZhTwStatisticsSet);
} else if (mCID.Equals(kZHPSMDetectorCID)) { } else if (mCID.Equals(kZHPSMDetectorCID)) {
inst1 = new nsXPCOMDetector(ZH_DETECTOR_NUM_VERIFIERS, gZhVerifierSet); inst1 = new nsXPCOMDetector(ZH_DETECTOR_NUM_VERIFIERS, gZhVerifierSet, gZhStatisticsSet);
} else if (mCID.Equals(kCJKPSMDetectorCID)) { } else if (mCID.Equals(kCJKPSMDetectorCID)) {
inst1 = new nsXPCOMDetector(CJK_DETECTOR_NUM_VERIFIERS, gCJKVerifierSet); inst1 = new nsXPCOMDetector(CJK_DETECTOR_NUM_VERIFIERS, gCJKVerifierSet, gCJKStatisticsSet);
} else if (mCID.Equals(kJAStringPSMDetectorCID)) { } else if (mCID.Equals(kJAStringPSMDetectorCID)) {
inst2 = new nsXPCOMStringDetector(JA_DETECTOR_NUM_VERIFIERS - 3, gJaVerifierSet); inst2 = new nsXPCOMStringDetector(JA_DETECTOR_NUM_VERIFIERS - 3, gJaVerifierSet, nsnull);
} else if (mCID.Equals(kKOStringPSMDetectorCID)) { } else if (mCID.Equals(kKOStringPSMDetectorCID)) {
inst2 = new nsXPCOMStringDetector(KO_DETECTOR_NUM_VERIFIERS - 3, gKoVerifierSet); inst2 = new nsXPCOMStringDetector(KO_DETECTOR_NUM_VERIFIERS - 3, gKoVerifierSet, nsnull);
} else if (mCID.Equals(kZHCNStringPSMDetectorCID)) { } else if (mCID.Equals(kZHCNStringPSMDetectorCID)) {
inst2 = new nsXPCOMStringDetector(ZHCN_DETECTOR_NUM_VERIFIERS - 3, gZhCnVerifierSet); inst2 = new nsXPCOMStringDetector(ZHCN_DETECTOR_NUM_VERIFIERS - 3, gZhCnVerifierSet, nsnull);
} else if (mCID.Equals(kZHTWStringPSMDetectorCID)) { } else if (mCID.Equals(kZHTWStringPSMDetectorCID)) {
inst2 = new nsXPCOMStringDetector(ZHTW_DETECTOR_NUM_VERIFIERS - 3, gZhTwVerifierSet); inst2 = new nsXPCOMStringDetector(ZHTW_DETECTOR_NUM_VERIFIERS - 3, gZhTwVerifierSet, gZhTwStatisticsSet);
} else if (mCID.Equals(kZHStringPSMDetectorCID)) { } else if (mCID.Equals(kZHStringPSMDetectorCID)) {
inst2 = new nsXPCOMStringDetector(ZH_DETECTOR_NUM_VERIFIERS - 3, gZhVerifierSet); inst2 = new nsXPCOMStringDetector(ZH_DETECTOR_NUM_VERIFIERS - 3, gZhVerifierSet, gZhStatisticsSet);
} else if (mCID.Equals(kCJKStringPSMDetectorCID)) { } else if (mCID.Equals(kCJKStringPSMDetectorCID)) {
inst2 = new nsXPCOMStringDetector(CJK_DETECTOR_NUM_VERIFIERS - 3, gCJKVerifierSet); inst2 = new nsXPCOMStringDetector(CJK_DETECTOR_NUM_VERIFIERS - 3, gCJKVerifierSet, gCJKStatisticsSet);
} }
if((NULL == inst1) && (NULL == inst2)) { if((NULL == inst1) && (NULL == inst2)) {
return NS_ERROR_OUT_OF_MEMORY; return NS_ERROR_OUT_OF_MEMORY;