Bug #244357 --> Fix several underflow situations in the chi2 algorithm used for junk mail detection. Simplify the math to make it less computensive.

Patch origianlly by tenthumbs@cybernex.net sr=mscott
2004-10-19 21:25:00 +00:00 · 2004-10-19 21:25:00 +00:00 · 1a4937b94c
--- a/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.cpp
+++ b/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.cpp
@ -79,6 +79,7 @@
 #include "nsIHTMLToTextSink.h"
 #include "nsIDocumentEncoder.h" 

+#include "nsIncompleteGamma.h"
 #include <math.h>

 static PRLogModuleInfo *BayesianFilterLogModule = nsnull;
@ -923,21 +924,28 @@ PR_STATIC_CALLBACK(int) compareTokens(const void* p1, const void* p2, void* /* d
 inline double dmax(double x, double y) { return (x > y ? x : y); }
 inline double dmin(double x, double y) { return (x < y ? x : y); }

-double chi2Q (double x2, PRUint32 v) 
+// Chi square functions are implemented by an incomplete gamma function.
+// Note that chi2P's callers multiply the arguments by 2 but chi2P
+// divides them by 2 again. Inlining chi2P gives the compiler a
+// chance to notice this.
+
+// Both chi2P and nsIncompleteGammaP set *error negative on domain
+// errors and nsIncompleteGammaP sets it posivive on internal errors.
+// This may be useful but the chi2P callers treat any error as fatal.
+
+// Note that converting unsigned ints to floating point can be slow on
+// some platforms (like Intel) so use signed quantities for the numeric
+// routines.
+static inline double chi2P (double chi2, double nu, PRInt32 *error)
 {
-  PRUint32 i;
-  double m = x2 / 2.0;
-  double sum = exp(-m);
-  double term = sum;
-
-  NS_ASSERTION(!(v & 1), "chi2Q called with odd value");
-  for (i=1; i < v/2; ++i) 
-  {
-    term *= m / i;
-    sum += term;
-  }
-
-  return dmin(sum,1.0);
+    // domain checks; set error and return a dummy value
+    if (chi2 < 0.0 || nu <= 0.0)
+    {
+        *error = -1;
+        return 0.0;
+    }
+    // reversing the arguments is intentional
+    return nsIncompleteGammaP (nu/2.0, chi2/2.0, error);
 }

 void nsBayesianFilter::classifyMessage(Tokenizer& tokenizer, const char* messageURI,
@ -1037,12 +1045,18 @@ void nsBayesianFilter::classifyMessage(Tokenizer& tokenizer, const char* message

    if (goodclues > 0) 
    {
-      S = 1.0 - chi2Q(-2.0 * S, 2 * goodclues);
-      H = 1.0 - chi2Q(-2.0 * H, 2 * goodclues);
-      prob = (S-H +1.0) / 2.0;
+        PRInt32 chi_error;
+        S = chi2P(-2.0 * S, 2.0 * goodclues, &chi_error);
+        if (!chi_error)
+            H = chi2P(-2.0 * H, 2.0 * goodclues, &chi_error);
+        // if any error toss the entire calculation
+        if (!chi_error)
+            prob = (S-H +1.0) / 2.0;
+        else
+            prob = 0.5;
    } 
    else 
-      prob = 0.5;
+        prob = 0.5;

    PRBool isJunk = (prob >= mJunkProbabilityThreshold);
    PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("%s is junk probability = (%f)  HAM SCORE:%f SPAM SCORE:%f", messageURI, prob,H,S));
--- a/mailnews/extensions/bayesian-spam-filter/src/nsIncompleteGamma.h
+++ b/mailnews/extensions/bayesian-spam-filter/src/nsIncompleteGamma.h
@ -0,0 +1,291 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/***** BEGIN LICENSE BLOCK *****
+  Version: MPL 1.1/GPL 2.0/LGPL 2.1
+
+  The contents of this file are subject to the Mozilla Public License Version 
+  1.1 (the "License"); you may not use this file except in compliance with 
+  the License. You may obtain a copy of the License at 
+  http://www.mozilla.org/MPL/
+
+  Software distributed under the License is distributed on an "AS IS" basis,
+  WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+  for the specific language governing rights and limitations under the
+  License.
+
+  The Original Code is bayesian spam filter
+
+  Portions created by the Initial Developer are Copyright (C) 2004
+  the Initial Developer. All Rights Reserved.
+
+  Contributor(s):
+    Scott MacGregor <mscott@mozilla.org>
+    tenthumbs@cybernex.net
+
+  Alternatively, the contents of this file may be used under the terms of
+  either of the GNU General Public License Version 2 or later (the "GPL"),
+  or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+  in which case the provisions of the GPL or the LGPL are applicable instead
+  of those above. If you wish to allow use of your version of this file only
+  under the terms of either the GPL or the LGPL, and not to allow others to
+  use your version of this file under the terms of the MPL, indicate your
+  decision by deleting the provisions above and replace them with the notice
+  and other provisions required by the GPL or the LGPL. If you do not delete
+  the provisions above, a recipient may use your version of this file under
+  the terms of any one of the MPL, the GPL or the LGPL.
+
+***** END LICENSE BLOCK *****/
+
+#ifndef nsIncompleteGamma_h__
+#define nsIncompleteGamma_h__
+
+/* An implementation of the incomplete gamma functions for real
+   arguments. P is defined as
+
+                          x
+                         /
+                  1      [     a - 1  - t
+    P(a, x) =  --------  I    t      e    dt
+               Gamma(a)  ]
+                         /
+                          0
+
+   and
+
+                          infinity
+                         /
+                  1      [     a - 1  - t
+    Q(a, x) =  --------  I    t      e    dt
+               Gamma(a)  ]
+                         /
+                          x
+
+   so that P(a,x) + Q(a,x) = 1.
+
+   Both a series expansion and a continued fraction exist.  This
+   implementation uses the more efficient method based on the arguments.
+
+   Either case involves calculating a multiplicative term:
+      e^(-x)*x^a/Gamma(a).
+   Here we calculate the log of this term. Most math libraries have a
+   "lgamma" function but it is not re-entrant. Some libraries have a
+   "lgamma_r" which is re-entrant. Use it if possible. I have included a
+   simple replacement but it is certainly not as accurate.
+
+   Relative errors are almost always < 1e-10 and usually < 1e-14. Very
+   small and very large arguments cause trouble.
+
+   The region where a < 0.5 and x < 0.5 has poor error properties and is
+   not too stable. Get a better routine if you need results in this
+   region.
+
+   The error argument will be set negative if there is a domain error or
+   positive for an internal calculation error, currently lack of
+   convergence. A value is always returned, though.
+
+ */
+
+#include <math.h>
+#include <float.h>
+
+// the main routine
+static double nsIncompleteGammaP (double a, double x, int *error);
+
+// nsLnGamma(z): either a wrapper around lgamma_r or the internal function.
+// C_m = B[2*m]/(2*m*(2*m-1)) where B is a Bernoulli number
+static const double C_1 = 1.0 / 12.0;
+static const double C_2 = -1.0 / 360.0;
+static const double C_3 = 1.0 / 1260.0;
+static const double C_4 = -1.0 / 1680.0;
+static const double C_5 = 1.0 / 1188.0;
+static const double C_6 = -691.0 / 360360.0;
+static const double C_7 = 1.0 / 156.0;
+static const double C_8 = -3617.0 / 122400.0;
+static const double C_9 = 43867.0 / 244188.0;
+static const double C_10 = -174611.0 / 125400.0;
+static const double C_11 = 77683.0 / 5796.0;
+
+// truncated asymptotic series in 1/z
+static inline double lngamma_asymp (double z)
+{
+  double w, w2, sum;
+  w = 1.0 / z;
+  w2 = w * w;
+  sum = w * (w2 * (w2 * (w2 * (w2 * (w2 * (w2 * (w2 * (w2 * (w2
+        * (C_11 * w2 + C_10) + C_9) + C_8) + C_7) + C_6)
+        + C_5) + C_4) + C_3) + C_2) + C_1);
+
+  return sum;
+}
+
+struct fact_table_s
+{
+  double fact;
+  double lnfact;
+};
+
+// for speed and accuracy
+static const struct fact_table_s FactTable[] = {
+    {1.000000000000000, 0.0000000000000000000000e+00},
+    {1.000000000000000, 0.0000000000000000000000e+00},
+    {2.000000000000000, 6.9314718055994530942869e-01},
+    {6.000000000000000, 1.7917594692280550007892e+00},
+    {24.00000000000000, 3.1780538303479456197550e+00},
+    {120.0000000000000, 4.7874917427820459941458e+00},
+    {720.0000000000000, 6.5792512120101009952602e+00},
+    {5040.000000000000, 8.5251613610654142999881e+00},
+    {40320.00000000000, 1.0604602902745250228925e+01},
+    {362880.0000000000, 1.2801827480081469610995e+01},
+    {3628800.000000000, 1.5104412573075515295248e+01},
+    {39916800.00000000, 1.7502307845873885839769e+01},
+    {479001600.0000000, 1.9987214495661886149228e+01},
+    {6227020800.000000, 2.2552163853123422886104e+01},
+    {87178291200.00000, 2.5191221182738681499610e+01},
+    {1307674368000.000, 2.7899271383840891566988e+01},
+    {20922789888000.00, 3.0671860106080672803835e+01},
+    {355687428096000.0, 3.3505073450136888885825e+01},
+    {6402373705728000., 3.6395445208033053576674e+01}
+};
+#define FactTableLength (int)(sizeof(FactTable)/sizeof(FactTable[0]))
+
+// for speed
+static const double ln_2pi_2 = 0.918938533204672741803;	// log(2*PI)/2
+
+/* A simple lgamma function, not very robust.
+
+   Valid for z_in > 0 ONLY.
+
+   For z_in > 8 precision is quite good, relative errors < 1e-14 and
+   usually better. For z_in < 8 relative errors increase but are usually
+   < 1e-10. In two small regions, 1 +/- .001 and 2 +/- .001 errors
+   increase quickly.
+*/
+static double nsLnGamma (double z_in, int *gsign)
+{
+  double scale, z, sum, result;
+  *gsign = 1;
+
+  int zi = (int) z_in;
+  if (z_in == (double) zi)
+  {
+    if (0 < zi && zi <= FactTableLength)
+	    return FactTable[zi - 1].lnfact;	// gamma(z) = (z-1)!
+  }
+
+  for (scale = 1.0, z = z_in; z < 8.0; ++z)
+    scale *= z;
+  
+  sum = lngamma_asymp (z);
+  result = (z - 0.5) * log (z) - z + ln_2pi_2 - log (scale);
+  result += sum;
+  return result;
+}
+
+// log( e^(-x)*x^a/Gamma(a) )
+static inline double lnPQfactor (double a, double x)
+{
+  int gsign;				// ignored because a > 0
+  return a * log (x) - x - nsLnGamma (a, &gsign);
+}
+
+static double Pseries (double a, double x, int *error)
+{
+  double sum, term;
+  const double eps = 2.0 * DBL_EPSILON;
+  const int imax = 5000;
+  int i;
+
+  sum = term = 1.0 / a;
+  for (i = 1; i < imax; ++i)
+  {
+	  term *= x / (a + i);
+	  sum += term;
+	  if (fabs (term) < eps * fabs (sum))
+      break;
+  }
+
+  if (i >= imax)
+	  *error = 1;
+
+  return sum;
+}
+
+static double Qcontfrac (double a, double x, int *error)
+{
+  double result, D, C, e, f, term;
+  const double eps = 2.0 * DBL_EPSILON;
+  const double small =
+  DBL_EPSILON * DBL_EPSILON * DBL_EPSILON * DBL_EPSILON;
+  const int imax = 5000;
+  int i;
+
+  // modified Lentz method
+  f = x - a + 1.0;
+  if (fabs (f) < small)
+    f = small;
+  C = f + 1.0 / small;
+  D = 1.0 / f;
+  result = D;
+  for (i = 1; i < imax; ++i)
+  {
+    e = i * (a - i);
+    f += 2.0;
+    D = f + e * D;
+    if (fabs (D) < small)
+      D = small;
+    D = 1.0 / D;
+    C = f + e / C;
+    if (fabs (C) < small)
+      C = small;
+    term = C * D;
+    result *= term;
+    if (fabs (term - 1.0) < eps)
+      break;
+  }
+
+  if (i >= imax)
+    *error = 1;
+  return result;
+}
+
+static double nsIncompleteGammaP (double a, double x, int *error)
+{
+  double result, dom, ldom;
+  //  domain errors. the return values are meaningless but have
+  //  to return something. 
+  *error = -1;
+  if (a <= 0.0)
+	  return 1.0;
+  if (x < 0.0)
+	  return 0.0;
+  *error = 0;
+  if (x == 0.0)
+    return 0.0;
+
+  ldom = lnPQfactor (a, x);
+  dom = exp (ldom);
+  // might need to adjust the crossover point
+  if (a <= 0.5)
+  {
+    if (x < a + 1.0)
+      result = dom * Pseries (a, x, error);
+    else
+      result = 1.0 - dom * Qcontfrac (a, x, error);
+  }
+  else
+  {
+    if (x < a)
+      result = dom * Pseries (a, x, error);
+	  else
+	    result = 1.0 - dom * Qcontfrac (a, x, error);
+  }
+
+  // not clear if this can ever happen
+  if (result > 1.0)
+    result = 1.0;
+  if (result < 0.0)
+    result = 0.0;
+  return result;
+}
+
+#endif
+