add PR_LOGGING for the bayesian filter.
people will have problems, and this will help dev and QA track them down.
it's *really* verbose, but that can be addressed later.
r/sr=bienvenu, a=sspitzer for 1.4 final
This commit is contained in:
sspitzer%netscape.com 2003-05-23 07:34:19 +00:00
Родитель 24b50346a6
Коммит c7cb4b6c49
1 изменённых файлов: 51 добавлений и 12 удалений

Просмотреть файл

@ -21,6 +21,7 @@
*
* Contributor(s):
* Patrick C. Beard <beard@netscape.com>
* Seth Spitzer <sspitzer@netscape.com>
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
@ -36,6 +37,11 @@
*
* ***** END LICENSE BLOCK ***** */
#ifdef MOZ_LOGGING
// sorry, this has to be before the pre-compiled header
#define FORCE_PR_LOG /* Allow logging in the release build */
#endif
#include "nsCRT.h"
#include "nsBayesianFilter.h"
#include "nsIInputStream.h"
@ -47,6 +53,9 @@
#include "nsMsgUtils.h" // for GetMessageServiceFromURI
#include "prnetdb.h"
#include "nsIMsgWindow.h"
#include "prlog.h"
static PRLogModuleInfo *BayesianFilterLogModule = nsnull;
static const char* kBayesianFilterTokenDelimiters = " \t\n\r\f!\"#%&()*+,./:;<=>?@[\\]^_`{|}~";
@ -153,6 +162,8 @@ Tokenizer::Tokenizer()
PL_INIT_ARENA_POOL(&mWordPool, "Words Arena", 16384);
PRBool ok = PL_DHashTableInit(&mTokenTable, &gTokenTableOps, nsnull, sizeof(Token), 256);
NS_ASSERTION(ok, "mTokenTable failed to initialize");
if (!ok)
PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("mTokenTable failed to initialize"));
}
Tokenizer::~Tokenizer()
@ -173,6 +184,8 @@ nsresult Tokenizer::clearTokens()
PL_FreeArenaPool(&mWordPool);
ok = PL_DHashTableInit(&mTokenTable, &gTokenTableOps, nsnull, sizeof(Token), 256);
NS_ASSERTION(ok, "mTokenTable failed to initialize");
if (!ok)
PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("mTokenTable failed to initialize in clearTokens()"));
}
return (ok) ? NS_OK : NS_ERROR_OUT_OF_MEMORY;
}
@ -197,23 +210,29 @@ inline Token* Tokenizer::get(const char* word)
Token* Tokenizer::add(const char* word, PRUint32 count)
{
PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("add word: %s (count=%d)", word, count));
PLDHashEntryHdr* entry = PL_DHashTableOperate(&mTokenTable, word, PL_DHASH_ADD);
Token* token = NS_STATIC_CAST(Token*, entry);
if (token) {
if (token->mWord == NULL) {
PRUint32 len = strlen(word);
NS_ASSERTION(len != 0, "adding zero length word to tokenizer");
if (!len)
PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("adding zero length word to tokenizer"));
token->mWord = copyWord(word, len);
NS_ASSERTION(token->mWord, "copyWord failed");
if (!token->mWord) {
PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("copyWord failed: %s (%d)", word, len));
PL_DHashTableRawRemove(&mTokenTable, entry);
return NULL;
}
token->mLength = len;
token->mCount = count;
token->mProbability = 0;
PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("adding word to tokenizer: %s (len=%d) (count=%d)", word, len, count));
} else {
token->mCount += count;
PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("adding word to tokenizer: %s (count=%d) (mCount=%d)", word, count, token->mCount));
}
}
return token;
@ -221,14 +240,19 @@ Token* Tokenizer::add(const char* word, PRUint32 count)
void Tokenizer::remove(const char* word, PRUint32 count)
{
PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("remove word: %s (count=%d)", word, count));
Token* token = get(word);
if (token) {
NS_ASSERTION(token->mCount >= count, "token count underflow");
if (token->mCount >= count) {
PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("remove word: %s (count=%d) (mCount=%d)", word, count, token->mCount));
token->mCount -= count;
if (token->mCount == 0)
PL_DHashTableRawRemove(&mTokenTable, token);
}
else {
PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("token count underflow: %s (count=%d) (mCount=%d)", word, count, token->mCount));
}
}
}
@ -269,6 +293,7 @@ static char* toLowerCase(char* str)
void Tokenizer::tokenize(char* text)
{
PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("tokenize: %s", text));
char* word;
char* next = text;
while ((word = nsCRT::strtok(next, kBayesianFilterTokenDelimiters, &next)) != NULL) {
@ -324,6 +349,9 @@ void Tokenizer::visit(PRBool (*f) (Token*, void*), void* data)
VisitClosure closure = { f, data };
PRUint32 visitCount = PL_DHashTableEnumerate(&mTokenTable, VisitEntry, &closure);
NS_ASSERTION(visitCount == mTokenTable.entryCount, "visitCount != entryCount!");
if (visitCount != mTokenTable.entryCount) {
PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("visitCount != entryCount!: %d vs %d", visitCount, mTokenTable.entryCount));
}
}
inline PRUint32 Tokenizer::countTokens()
@ -497,6 +525,7 @@ NS_IMETHODIMP TokenStreamListener::OnStopRequest(nsIRequest *aRequest, nsISuppor
}
/* finally, analyze the tokenized message. */
PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("analyze the tokenized message"));
if (mAnalyzer)
mAnalyzer->analyzeTokens(mTokenizer);
@ -510,10 +539,16 @@ nsBayesianFilter::nsBayesianFilter()
: mGoodCount(0), mBadCount(0),
mBatchLevel(0), mTrainingDataDirty(PR_FALSE)
{
if (!BayesianFilterLogModule)
BayesianFilterLogModule = PR_NewLogModule("BayesianFilter");
PRBool ok = (mGoodTokens && mBadTokens);
NS_ASSERTION(ok, "error allocating tokenizers");
if (ok)
readTrainingData();
else {
PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("error allocating tokenizers"));
}
}
nsBayesianFilter::~nsBayesianFilter() {}
@ -552,8 +587,11 @@ public:
virtual void classifyNextMessage()
{
if (++mCurMessageToClassify < mNumMessagesToClassify && mMessageURIs[mCurMessageToClassify])
if (++mCurMessageToClassify < mNumMessagesToClassify && mMessageURIs[mCurMessageToClassify]) {
PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("classifyNextMessage(%s)", mMessageURIs[mCurMessageToClassify]));
mFilter->tokenizeMessage(mMessageURIs[mCurMessageToClassify], mMsgWindow, this);
}
else
{
mTokenListener = nsnull; // this breaks the circular ref that keeps this object alive
@ -613,10 +651,12 @@ void nsBayesianFilter::classifyMessage(Tokenizer& tokenizer, const char* message
// this will also "encourage" the user to train
// see bug #194238
if (listener && !mGoodCount && !mGoodTokens.countTokens()) {
PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("no good tokens, assume junk"));
listener->OnMessageClassified(messageURI, nsMsgJunkStatus(nsIJunkMailPlugin::JUNK));
return;
}
if (listener && !mBadCount && !mBadTokens.countTokens()) {
PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("no bad tokens, assume good"));
listener->OnMessageClassified(messageURI, nsMsgJunkStatus(nsIJunkMailPlugin::GOOD));
return;
}
@ -643,8 +683,10 @@ void nsBayesianFilter::classifyMessage(Tokenizer& tokenizer, const char* message
(dmin(1.0, (b / nbad)) /
(dmin(1.0, (g / ngood)) +
dmin(1.0, (b / nbad))))));
PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("token.mProbability (%s) is %f", word, token.mProbability));
} else {
token.mProbability = 0.4;
PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("token.mProbability (%s) assume 0.4", word));
}
}
@ -665,6 +707,7 @@ void nsBayesianFilter::classifyMessage(Tokenizer& tokenizer, const char* message
}
double prob = (prod1 / (prod1 + prod2));
PRBool isJunk = (prob >= 0.90);
PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("%s is junk probability = (%f)", messageURI, prob));
delete[] tokens;
@ -690,18 +733,14 @@ NS_IMETHODIMP nsBayesianFilter::GetShouldDownloadAllHeaders(PRBool *aShouldDownl
NS_IMETHODIMP nsBayesianFilter::StartBatch(void)
{
#ifdef DEBUG_dmose
printf("StartBatch() entered with mBatchLevel=%d\n", mBatchLevel);
#endif
++mBatchLevel;
return NS_OK;
PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("StartBatch() entered with mBatchLevel=%d", mBatchLevel));
++mBatchLevel;
return NS_OK;
}
NS_IMETHODIMP nsBayesianFilter::EndBatch(void)
{
#ifdef DEBUG_dmose
printf("EndBatch() entered with mBatchLevel=%d\n", mBatchLevel);
#endif
PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("EndBatch() entered with mBatchLevel=%d", mBatchLevel));
NS_ASSERTION(mBatchLevel > 0, "nsBayesianFilter::EndBatch() called with"
" mBatchLevel <= 0");
--mBatchLevel;
@ -781,6 +820,7 @@ void nsBayesianFilter::observeMessage(Tokenizer& tokenizer, const char* messageU
nsMsgJunkStatus oldClassification, nsMsgJunkStatus newClassification,
nsIJunkMailClassificationListener* listener)
{
PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("observeMessage(%s) old=%d new=%d", messageURL, oldClassification, newClassification));
TokenEnumeration tokens = tokenizer.getTokens();
switch (oldClassification) {
case nsIJunkMailPlugin::JUNK:
@ -935,9 +975,7 @@ static const char kMagicCookie[] = { '\xFE', '\xED', '\xFA', '\xCE' };
void nsBayesianFilter::writeTrainingData()
{
#ifdef DEBUG_dmose
printf("writeTrainingData() entered\n");
#endif
PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("writeTrainingData() entered"));
nsCOMPtr<nsILocalFile> file;
nsresult rv = getTrainingFile(file);
if (NS_FAILED(rv)) return;
@ -986,6 +1024,7 @@ void nsBayesianFilter::readTrainingData()
readTokens(stream, mGoodTokens) &&
readTokens(stream, mBadTokens))) {
NS_WARNING("failed to read training data.");
PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("failed to read training data."));
}
fclose(stream);