bug 471885 bayes analysis should set probability to 0 or 1 with unbalanced tokens, r/sr=bienvenu

This commit is contained in:
Kent James 2009-01-18 13:17:53 -08:00
Родитель 2817bb3bb6
Коммит a5cc90c627
2 изменённых файлов: 24 добавлений и 16 удалений

Просмотреть файл

@ -1280,21 +1280,29 @@ void nsBayesianFilter::classifyMessage(
double antiCount =
static_cast<double>(mCorpus.getTraitCount(t, aAntiTraits[traitIndex]));
// if proCount and antiCount are both 0, we could end up with a
// divide by 0 error, tread carefully here. (Bug #240819)
double probDenom = (proCount * numAntiMessages[traitIndex] +
antiCount * numProMessages[traitIndex]);
if (probDenom != 0.0)
double prob, proDenom, antiDenom;
// Prevent a divide by zero error by setting defaults for prob
// If there are no matching tokens at all, ignore.
if (antiCount == 0.0 && proCount == 0.0)
continue;
// if only anti match, set probability to 0%
if ((proDenom = proCount * numProMessages[traitIndex]) == 0.0)
prob = 0.0;
// if only pro match, set probability to 100%
else if ((antiDenom = antiCount * numAntiMessages[traitIndex]) == 0.0)
prob = 1.0;
else
prob = (proCount * numAntiMessages[traitIndex]) /
(proDenom + antiDenom);
double n = proCount + antiCount;
prob = (0.225 + n * prob) / (.45 + n);
double distance = PR_ABS(prob - 0.5);
if (distance >= .1)
{
double prob = (proCount * numAntiMessages[traitIndex])/probDenom;
double n = proCount + antiCount;
prob = (0.225 + n * prob) / (.45 + n);
double distance = PR_ABS(prob - 0.5);
if (distance >= .1)
{
nsresult rv = setAnalysis(token, traitIndex, distance, prob);
NS_ASSERTION(NS_SUCCEEDED(rv), "Problem in setAnalysis");
}
nsresult rv = setAnalysis(token, traitIndex, distance, prob);
NS_ASSERTION(NS_SUCCEEDED(rv), "Problem in setAnalysis");
}
}
}

Просмотреть файл

@ -95,10 +95,10 @@ var tests =
junkPercent: 0,
traitListener: false,
junkListener: true},
// with ham but no spam training, percents still 50 but classifies as ham
// with ham but no spam training, percents are 0 and classifies as ham
{command: kClassT,
fileName: "ham1.eml",
junkPercent: 50,
junkPercent: 0,
traitListener: false,
junkListener: true},
// train 1 spam message