bug 471885 bayes analysis should set probability to 0 or 1 with unbalanced tokens, r/sr=bienvenu

2009-01-18 13:17:53 -08:00 · 2009-01-18 13:17:53 -08:00 · a5cc90c627
--- a/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.cpp
+++ b/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.cpp
@@ -1280,21 +1280,29 @@ void nsBayesianFilter::classifyMessage(
        double antiCount =
          static_cast<double>(mCorpus.getTraitCount(t, aAntiTraits[traitIndex]));

-        // if proCount and antiCount are both 0, we could end up with a
-        // divide by 0 error, tread carefully here. (Bug #240819)
-        double probDenom = (proCount * numAntiMessages[traitIndex] +
-                            antiCount * numProMessages[traitIndex]);
-        if (probDenom != 0.0)
+        double prob, proDenom, antiDenom;
+        // Prevent a divide by zero error by setting defaults for prob
+
+        // If there are no matching tokens at all, ignore.
+        if (antiCount == 0.0 && proCount == 0.0)
+          continue;
+        // if only anti match, set probability to 0%
+        if ((proDenom = proCount * numProMessages[traitIndex]) == 0.0)
+          prob = 0.0;
+        // if only pro match, set probability to 100%
+        else if ((antiDenom = antiCount * numAntiMessages[traitIndex]) == 0.0)
+          prob = 1.0;
+        else
+          prob = (proCount * numAntiMessages[traitIndex]) /
+                 (proDenom + antiDenom);
+
+        double n = proCount + antiCount;
+        prob =  (0.225 + n * prob) / (.45 + n);
+        double distance = PR_ABS(prob - 0.5);
+        if (distance >= .1)
        {
-          double prob = (proCount * numAntiMessages[traitIndex])/probDenom;
-          double n = proCount + antiCount;
-          prob =  (0.225 + n * prob) / (.45 + n);
-          double distance = PR_ABS(prob - 0.5);
-          if (distance >= .1)
-          {
-            nsresult rv = setAnalysis(token, traitIndex, distance, prob);
-            NS_ASSERTION(NS_SUCCEEDED(rv), "Problem in setAnalysis");
-          }
+          nsresult rv = setAnalysis(token, traitIndex, distance, prob);
+          NS_ASSERTION(NS_SUCCEEDED(rv), "Problem in setAnalysis");
        }
      }
    }
--- a/mailnews/extensions/bayesian-spam-filter/test/unit/test_junkAsTraits.js
+++ b/mailnews/extensions/bayesian-spam-filter/test/unit/test_junkAsTraits.js
@@ -95,10 +95,10 @@ var tests =
   junkPercent: 0,
   traitListener: false,
   junkListener: true},
-  // with ham but no spam training, percents still 50 but classifies as ham
+  // with ham but no spam training, percents are 0 and classifies as ham
  {command: kClassT,
   fileName: "ham1.eml",
-   junkPercent: 50,
+   junkPercent: 0,
   traitListener: false,
   junkListener: true},
  // train 1 spam message