Bug 506397 - "Support multiple spam corpus files" [r=Standard8 sr=bienvenu]

This commit is contained in:
Kent James 2009-08-22 09:27:25 +01:00
Родитель de3a2097df
Коммит 996ab47df3
14 изменённых файлов: 850 добавлений и 68 удалений

Просмотреть файл

@ -45,6 +45,7 @@ interface nsIMsgFilterHitNotify;
interface nsIMsgWindow;
interface nsIMsgDBHdr;
interface nsIStreamListener;
interface nsILocalFile;
/**
* This interface is still very much under development, and is not yet stable.
@ -145,7 +146,7 @@ interface nsIMsgTraitDetailListener : nsISupports
[array, size_is(tokenCount)] in unsigned long runningPercents);
};
[scriptable, uuid(EDB05079-3F8A-46a6-A596-E7FD8E12216B)]
[scriptable, uuid(8EA5BBCA-F735-4d43-8541-D203D8E2FF2F)]
interface nsIJunkMailPlugin : nsIMsgFilterPlugin
{
/**
@ -307,15 +308,80 @@ interface nsIJunkMailPlugin : nsIMsgFilterPlugin
in nsIMsgTraitDetailListener aListener,
[optional] in nsIMsgWindow aMsgWindow);
/**
* Gives information on token and message count information in the
* training data corpus
*
* @param aTrait trait id (may be null)
* @param aMessageCount count of messages that have been trained with aTrait
*
* @return token count for all traits
*/
unsigned long corpusCounts(in unsigned long aTrait, out unsigned long aMessageCount);
};
/**
* The nsIMsgCorpus interface manages a corpus of mail data used for
* statistical analysis of messages.
*/
[scriptable, uuid(70BAD26F-DFD4-41bd-8FAB-4C09B9C1E845)]
interface nsIMsgCorpus : nsISupports
{
/**
* Clear the corpus data for a trait id.
*
* @param aTrait trait id
*/
void clearTrait(in unsigned long aTrait);
/**
* Update corpus data from a file.
*
* @param aFile the file with the data, in the format:
*
* Format of the trait file for version 1:
* [0xFCA93601] (the 01 is the version)
* for each trait to write:
* [id of trait to write] (0 means end of list)
* [number of messages per trait]
* for each token with non-zero count
* [count]
* [length of word]word
*
* @param aIsAdd should the data be added, or removed? True if
* adding, false if removing.
*
* @param aRemapCount number of items in the parallel arrays aFromTraits,
* aToTraits. These arrays allow conversion of the
* trait id stored in the file (which may be originated
* externally) to the trait id used in the local corpus
* (which is defined locally using nsIMsgTraitService, and
* mapped by that interface to a globally unique trait
* id string).
*
* @param aFromTraits array of trait ids used in aFile. If aFile contains
* trait ids that are not in this array, they are not
* remapped, but assumed to be local trait ids.
*
* @param aToTraits array of trait ids, corresponding to elements of
* aFromTraits, that represent the local trait ids to
* be used in storing data from aFile into the local corpus.
*/
void updateData(in nsILocalFile aFile, in boolean aIsAdd,
[optional] in unsigned long aRemapCount,
[optional, array, size_is(aRemapCount)] in unsigned long aFromTraits,
[optional, array, size_is(aRemapCount)] in unsigned long aToTraits);
/**
* Get the corpus count for a token, where the token is given as a string.
*
* @param aWord string of characters representing the token
* @param aTrait trait id
*
* @return count of that token in the corpus
*
*/
unsigned long getTokenCount(in AUTF8String aWord, in unsigned long aTrait);
/**
* Gives information on token and message count information in the
* training data corpus.
*
* @param aTrait trait id (may be null)
* @param aMessageCount count of messages that have been trained with aTrait
*
* @return token count for all traits
*/
unsigned long corpusCounts(in unsigned long aTrait, out unsigned long aMessageCount);
};

Просмотреть файл

@ -49,7 +49,7 @@
#include "nsISupports.idl"
[scriptable, uuid(e3e47690-a676-12d6-81c9-00308646b737)]
[scriptable, uuid(2CB15FB0-A912-40d3-8882-F2765C75655F)]
interface nsIMsgTraitService : nsISupports
{
/**
@ -172,4 +172,35 @@ interface nsIMsgTraitService : nsISupports
void getEnabledIndices(out unsigned long count,
[array, size_is(count)] out unsigned long proIndices,
[array, size_is(count)] out unsigned long antiIndices);
/**
* Add a trait as an alias of another trait. An alias is a trait whose
* counts will be combined with the aliased trait. This allows multiple sets
* of corpus data to be used to provide information on a single message
* characteristic, while allowing each individual set of corpus data to
* retain its own identity.
*
* @param aTraitIndex the internal identifier for the aliased trait
* @param aTraitAlias the internal identifier for the alias to add
*/
void addAlias(in unsigned long aTraitIndex, in unsigned long aTraitAlias);
/**
* Removes a trait as an alias of another trait.
*
* @param aTraitIndex the internal identifier for the aliased trait
* @param aTraitAlias the internal identifier for the alias to remove
*/
void removeAlias(in unsigned long aTraitIndex, in unsigned long aTraitAlias);
/**
* Get an array of trait aliases for a trait index, if any
*
* @param aTraitIndex the internal identifier for the aliased trait
* @param aLength length of array of aliases
* @param aAliases array of internal identifiers for aliases
*/
void getAliases(in unsigned long aTraitIndex, out unsigned long aLength,
[retval, array, size_is(aLength)] out unsigned long aAliases);
};

Просмотреть файл

@ -165,6 +165,64 @@ nsMsgTraitService.prototype =
aAntiIndices.value = antiIndices;
return;
},
addAlias: function addAlias(aTraitIndex, aTraitAliasIndex)
{
let aliasesString = "";
try {
aliasesString = traitsBranch.getCharPref("aliases." + aTraitIndex);
}
catch (e) {}
let aliases;
if (aliasesString.length)
aliases = aliasesString.split(",");
else
aliases = [];
if (aliases.indexOf(aTraitAliasIndex.toString()) == -1)
{
aliases.push(aTraitAliasIndex);
traitsBranch.setCharPref("aliases." + aTraitIndex, aliases.join());
}
},
removeAlias: function removeAlias(aTraitIndex, aTraitAliasIndex)
{
let aliasesString = "";
try {
aliasesString = traitsBranch.getCharPref("aliases." + aTraitIndex);
}
catch (e) {
return;
}
let aliases;
if (aliasesString.length)
aliases = aliasesString.split(",");
else
aliases = [];
let location;
if ((location = aliases.indexOf(aTraitAliasIndex.toString())) != -1)
{
aliases.splice(location, 1);
traitsBranch.setCharPref("aliases." + aTraitIndex, aliases.join());
}
},
getAliases: function getAliases(aTraitIndex, aLength)
{
let aliasesString = "";
try {
aliasesString = traitsBranch.getCharPref("aliases." + aTraitIndex);
}
catch (e) {}
let aliases;
if (aliasesString.length)
aliases = aliasesString.split(",");
else
aliases = [];
aLength.value = aliases.length;
return aliases;
},
};
// initialization

Просмотреть файл

@ -90,6 +90,30 @@ function run_test()
do_check_eq(proArray.value[1], proIndex);
do_check_eq(antiArray.value[1], antiIndex);
// check of aliases
// add three random aliases
ts.addAlias(1, 501);
ts.addAlias(1, 502);
ts.addAlias(1, 601);
let aliases = ts.getAliases(1, {});
do_check_eq(aliases[0], 501);
do_check_eq(aliases[1], 502);
do_check_eq(aliases[2], 601);
// remove the middle one
ts.removeAlias(1, 502);
aliases = ts.getAliases(1, {});
do_check_eq(aliases.length, 2);
do_check_eq(aliases[0], 501);
do_check_eq(aliases[1], 601);
// try to add an existing value
ts.addAlias(1, 501);
aliases = ts.getAliases(1, {});
do_check_eq(aliases.length, 2);
do_check_eq(aliases[0], 501);
do_check_eq(aliases[1], 601);
// now let's make sure this got saved in preferences
do_check_eq(proId, traitsBranch.getCharPref("id." + proIndex));
do_check_eq(proName, traitsBranch.getCharPref("name." + proIndex));

Просмотреть файл

@ -87,6 +87,7 @@
#include "nsIncompleteGamma.h"
#include <math.h>
#include <prmem.h>
#include "nsIMsgTraitService.h"
static PRLogModuleInfo *BayesianFilterLogModule = nsnull;
@ -1201,7 +1202,8 @@ NS_IMETHODIMP TokenStreamListener::OnStopRequest(nsIRequest *aRequest, nsISuppor
/* Implementation file */
NS_IMPL_ISUPPORTS2(nsBayesianFilter, nsIMsgFilterPlugin, nsIJunkMailPlugin)
NS_IMPL_ISUPPORTS3(nsBayesianFilter, nsIMsgFilterPlugin,
nsIJunkMailPlugin, nsIMsgCorpus)
nsBayesianFilter::nsBayesianFilter()
: mTrainingDataDirty(PR_FALSE)
@ -1298,7 +1300,7 @@ public:
PRUint32 aNumMessagesToClassify,
const char **aMessageURIs)
: mFilter(aFilter),
mSupports(aFilter),
mJunkMailPlugin(aFilter),
mJunkListener(aJunkListener),
mTraitListener(aTraitListener),
mDetailListener(aDetailListener),
@ -1321,7 +1323,7 @@ public:
PRUint32 aNumMessagesToClassify,
const char **aMessageURIs)
: mFilter(aFilter),
mSupports(aFilter),
mJunkMailPlugin(aFilter),
mJunkListener(aJunkListener),
mTraitListener(nsnull),
mDetailListener(nsnull),
@ -1378,7 +1380,7 @@ public:
private:
nsBayesianFilter* mFilter;
nsCOMPtr<nsISupports> mSupports;
nsCOMPtr<nsIJunkMailPlugin> mJunkMailPlugin;
nsCOMPtr<nsIJunkMailClassificationListener> mJunkListener;
nsCOMPtr<nsIMsgTraitClassificationListener> mTraitListener;
nsCOMPtr<nsIMsgTraitDetailListener> mDetailListener;
@ -1490,6 +1492,14 @@ void nsBayesianFilter::classifyMessage(
nsAutoTArray<PRUint32, kTraitAutoCapacity> numProMessages;
// anti message counts per trait index
nsAutoTArray<PRUint32, kTraitAutoCapacity> numAntiMessages;
// array of pro aliases per trait index
nsAutoTArray<PRUint32*, kTraitAutoCapacity > proAliasArrays;
// number of pro aliases per trait index
nsAutoTArray<PRUint32, kTraitAutoCapacity > proAliasesLengths;
// array of anti aliases per trait index
nsAutoTArray<PRUint32*, kTraitAutoCapacity> antiAliasArrays;
// number of anti aliases per trait index
nsAutoTArray<PRUint32, kTraitAutoCapacity > antiAliasesLengths;
// construct the outgoing listener arrays
nsAutoTArray<PRUint32, kTraitAutoCapacity> traits;
nsAutoTArray<PRUint32, kTraitAutoCapacity> percents;
@ -1499,14 +1509,64 @@ void nsBayesianFilter::classifyMessage(
percents.SetCapacity(traitCount);
numProMessages.SetCapacity(traitCount);
numAntiMessages.SetCapacity(traitCount);
proAliasesLengths.SetCapacity(traitCount);
antiAliasesLengths.SetCapacity(traitCount);
proAliasArrays.SetCapacity(traitCount);
antiAliasArrays.SetCapacity(traitCount);
}
nsresult rv;
nsCOMPtr<nsIMsgTraitService> traitService(do_GetService("@mozilla.org/msg-trait-service;1", &rv));
if (NS_FAILED(rv))
{
NS_ERROR("Failed to get trait service");
PR_LOG(BayesianFilterLogModule, PR_LOG_ERROR, ("Failed to get trait service"));
}
// get aliases and message counts for the pro and anti traits
for (PRUint32 traitIndex = 0; traitIndex < traitCount; traitIndex++)
{
numProMessages.AppendElement(
mCorpus.getMessageCount(aProTraits[traitIndex]));
numAntiMessages.AppendElement(
mCorpus.getMessageCount(aAntiTraits[traitIndex]));
nsresult rv;
// pro trait
PRUint32 proAliasesLength = 0;
PRUint32* proAliases = nsnull;
PRUint32 proTrait = aProTraits[traitIndex];
if (traitService)
{
rv = traitService->GetAliases(proTrait, &proAliasesLength, &proAliases);
if (NS_FAILED(rv))
{
NS_ERROR("trait service failed to get aliases");
PR_LOG(BayesianFilterLogModule, PR_LOG_ERROR, ("trait service failed to get aliases"));
}
}
proAliasesLengths.AppendElement(proAliasesLength);
proAliasArrays.AppendElement(proAliases);
PRUint32 proMessageCount = mCorpus.getMessageCount(proTrait);
for (PRUint32 aliasIndex = 0; aliasIndex < proAliasesLength; aliasIndex++)
proMessageCount += mCorpus.getMessageCount(proAliases[aliasIndex]);
numProMessages.AppendElement(proMessageCount);
// anti trait
PRUint32 antiAliasesLength = 0;
PRUint32* antiAliases = nsnull;
PRUint32 antiTrait = aAntiTraits[traitIndex];
if (traitService)
{
rv = traitService->GetAliases(antiTrait, &antiAliasesLength, &antiAliases);
if (NS_FAILED(rv))
{
NS_ERROR("trait service failed to get aliases");
PR_LOG(BayesianFilterLogModule, PR_LOG_ERROR, ("trait service failed to get aliases"));
}
}
antiAliasesLengths.AppendElement(antiAliasesLength);
antiAliasArrays.AppendElement(antiAliases);
PRUint32 antiMessageCount = mCorpus.getMessageCount(antiTrait);
for (PRUint32 aliasIndex = 0; aliasIndex < antiAliasesLength; aliasIndex++)
antiMessageCount += mCorpus.getMessageCount(antiAliases[aliasIndex]);
numAntiMessages.AppendElement(antiMessageCount);
}
for (PRUint32 i = 0; i < tokenCount; ++i)
@ -1517,10 +1577,17 @@ void nsBayesianFilter::classifyMessage(
continue;
for (PRUint32 traitIndex = 0; traitIndex < traitCount; traitIndex++)
{
double proCount =
static_cast<double>(mCorpus.getTraitCount(t, aProTraits[traitIndex]));
double antiCount =
static_cast<double>(mCorpus.getTraitCount(t, aAntiTraits[traitIndex]));
PRUint32 iProCount = mCorpus.getTraitCount(t, aProTraits[traitIndex]);
// add in any counts for aliases to proTrait
for (PRUint32 aliasIndex = 0; aliasIndex < proAliasesLengths[traitIndex]; aliasIndex++)
iProCount += mCorpus.getTraitCount(t, proAliasArrays[traitIndex][aliasIndex]);
double proCount = static_cast<double>(iProCount);
PRUint32 iAntiCount = mCorpus.getTraitCount(t, aAntiTraits[traitIndex]);
// add in any counts for aliases to antiTrait
for (PRUint32 aliasIndex = 0; aliasIndex < antiAliasesLengths[traitIndex]; aliasIndex++)
iAntiCount += mCorpus.getTraitCount(t, antiAliasArrays[traitIndex][aliasIndex]);
double antiCount = static_cast<double>(iAntiCount);
double prob, denom;
// Prevent a divide by zero error by setting defaults for prob
@ -1715,6 +1782,12 @@ void nsBayesianFilter::classifyMessage(
traits.AppendElement(aProTraits[traitIndex]);
percents.AppendElement(proPercent);
}
// free aliases arrays returned from XPCOM
if (proAliasesLengths[traitIndex])
NS_Free(proAliasArrays[traitIndex]);
if (antiAliasesLengths[traitIndex])
NS_Free(antiAliasArrays[traitIndex]);
}
if (aTraitListener)
@ -1903,7 +1976,7 @@ public:
nsTArray<PRUint32>& aNewClassifications,
nsIJunkMailClassificationListener* aJunkListener,
nsIMsgTraitClassificationListener* aTraitListener)
: mFilter(filter), mSupports(filter), mJunkListener(aJunkListener),
: mFilter(filter), mJunkMailPlugin(filter), mJunkListener(aJunkListener),
mTraitListener(aTraitListener),
mOldClassifications(aOldClassifications),
mNewClassifications(aNewClassifications)
@ -1920,7 +1993,7 @@ public:
private:
nsBayesianFilter* mFilter;
nsCOMPtr<nsISupports> mSupports;
nsCOMPtr<nsIJunkMailPlugin> mJunkMailPlugin;
nsCOMPtr<nsIJunkMailClassificationListener> mJunkListener;
nsCOMPtr<nsIMsgTraitClassificationListener> mTraitListener;
nsTArray<PRUint32> mOldClassifications;
@ -2122,6 +2195,8 @@ NS_IMETHODIMP nsBayesianFilter::DetailMessage(const char *aMsgURI,
return tokenizeMessage(aMsgURI, aMsgWindow, analyzer);
}
// nsIMsgCorpus implementation
NS_IMETHODIMP nsBayesianFilter::CorpusCounts(PRUint32 aTrait,
PRUint32 *aMessageCount,
PRUint32 *aTokenCount)
@ -2137,6 +2212,33 @@ NS_IMETHODIMP nsBayesianFilter::CorpusCounts(PRUint32 aTrait,
return NS_ERROR_FAILURE;
}
// nsIMsgCorpus::clearTrait: forwards the request to the corpus store
// (mCorpus) and returns its result unchanged.
NS_IMETHODIMP nsBayesianFilter::ClearTrait(PRUint32 aTrait)
{
return mCorpus.ClearTrait(aTrait);
}
// nsIMsgCorpus::updateData: forwards every argument to the corpus store
// (mCorpus) and returns its result unchanged. Argument validation is left
// to CorpusStore::UpdateData.
NS_IMETHODIMP
nsBayesianFilter::UpdateData(nsILocalFile *aFile,
PRBool aIsAdd,
PRUint32 aRemapCount,
PRUint32 *aFromTraits,
PRUint32 *aToTraits)
{
return mCorpus.UpdateData(aFile, aIsAdd, aRemapCount, aFromTraits, aToTraits);
}
NS_IMETHODIMP
nsBayesianFilter::GetTokenCount(const nsACString &aWord,
                                PRUint32 aTrait,
                                PRUint32 *aCount)
{
  NS_ENSURE_ARG_POINTER(aCount);

  // look the word up in the corpus, then report that token's count for
  // aTrait; the lookup result is handed to getTraitCount as-is
  CorpusToken* token = mCorpus.get(PromiseFlatCString(aWord).get());
  *aCount = mCorpus.getTraitCount(token, aTrait);
  return NS_OK;
}
/* Corpus Store */
/*
@ -2262,7 +2364,8 @@ PRBool CorpusStore::writeTokens(FILE* stream, PRBool shrink, PRUint32 aTraitId)
return PR_TRUE;
}
PRBool CorpusStore::readTokens(FILE* stream, PRInt64 fileSize, PRUint32 aTraitId)
PRBool CorpusStore::readTokens(FILE* stream, PRInt64 fileSize,
PRUint32 aTraitId, PRBool aIsAdd)
{
PRUint32 tokenCount;
if (readUInt32(stream, &tokenCount) != 1)
@ -2302,7 +2405,10 @@ PRBool CorpusStore::readTokens(FILE* stream, PRInt64 fileSize, PRUint32 aTraitId
break;
fpos += size;
buffer[size] = '\0';
add(buffer, aTraitId, count);
if (aIsAdd)
add(buffer, aTraitId, count);
else
remove(buffer, aTraitId, count);
}
delete[] buffer;
@ -2483,8 +2589,8 @@ void CorpusStore::readTrainingData()
(memcmp(cookie, kMagicCookie, sizeof(cookie)) == 0) &&
(readUInt32(stream, &goodMessageCount) == 1) &&
(readUInt32(stream, &junkMessageCount) == 1) &&
readTokens(stream, fileSize, kGoodTrait) &&
readTokens(stream, fileSize, kJunkTrait))) {
readTokens(stream, fileSize, kGoodTrait, PR_TRUE) &&
readTokens(stream, fileSize, kJunkTrait, PR_TRUE))) {
NS_WARNING("failed to read training data.");
PR_LOG(BayesianFilterLogModule, PR_LOG_ERROR, ("failed to read training data."));
}
@ -2508,39 +2614,9 @@ void CorpusStore::readTrainingData()
if (NS_FAILED(rv) || !exists)
return;
rv = mTraitFile->OpenANSIFileDesc("rb", &stream);
rv = UpdateData(mTraitFile, PR_TRUE, 0, nsnull, nsnull);
if (NS_FAILED(rv))
return;
rv = mTraitFile->GetFileSize(&fileSize);
if (NS_FAILED(rv))
return;
PRBool error;
while(1) // break on error or done
{
if (error = (fread(cookie, sizeof(cookie), 1, stream) != 1))
break;
if (error = memcmp(cookie, kTraitCookie, sizeof(cookie)))
break;
PRUint32 trait;
while ( !(error = (readUInt32(stream, &trait) != 1)) && trait)
{
PRUint32 count;
if (error = (readUInt32(stream, &count) != 1))
break;
setMessageCount(trait, count);
if (error = !readTokens(stream, fileSize, trait))
break;
}
break;
}
if (error)
{
NS_WARNING("failed to read training data.");
PR_LOG(BayesianFilterLogModule, PR_LOG_ERROR, ("failed to read training data."));
@ -2684,3 +2760,87 @@ void CorpusStore::setMessageCount(PRUint32 aTraitId, PRUint32 aCount)
mMessageCounts[index] = aCount;
}
}
/**
 * Add (or remove) the data in a trait file to (from) the corpus.
 *
 * The file must start with the trait cookie (kTraitCookie), followed by a
 * list of traits terminated by a trait id of 0. For each trait the file holds
 * the message count and then the token data consumed by readTokens().
 *
 * @param aFile        file to read; must not be null
 * @param aIsAdd       PR_TRUE to add the file's counts, PR_FALSE to subtract
 *                     them (message counts are clamped at zero)
 * @param aRemapCount  number of entries in aFromTraits / aToTraits
 * @param aFromTraits  trait ids as used in the file; required when
 *                     aRemapCount is non-zero
 * @param aToTraits    local trait ids corresponding to aFromTraits; required
 *                     when aRemapCount is non-zero
 *
 * @return NS_OK on success; the failure code from opening the file; or
 *         NS_ERROR_FAILURE when the size cannot be read or the content is
 *         malformed or truncated
 */
nsresult
CorpusStore::UpdateData(nsILocalFile *aFile,
                        PRBool aIsAdd,
                        PRUint32 aRemapCount,
                        PRUint32 *aFromTraits,
                        PRUint32 *aToTraits)
{
  NS_ENSURE_ARG_POINTER(aFile);
  if (aRemapCount)
  {
    NS_ENSURE_ARG_POINTER(aFromTraits);
    NS_ENSURE_ARG_POINTER(aToTraits);
  }

  FILE* stream;
  nsresult rv = aFile->OpenANSIFileDesc("rb", &stream);
  NS_ENSURE_SUCCESS(rv, rv);

  PRInt64 fileSize;
  rv = aFile->GetFileSize(&fileSize);

  // Always initialized: when GetFileSize fails nothing below assigns it, yet
  // it is still read in the final check.
  PRBool error = PR_FALSE;

  if (NS_SUCCEEDED(rv))
  {
    // the file must begin with the trait cookie
    char cookie[4];
    error = (fread(cookie, sizeof(cookie), 1, stream) != 1) ||
            (memcmp(cookie, kTraitCookie, sizeof(cookie)) != 0);

    while (!error)
    {
      PRUint32 fileTrait;
      error = (readUInt32(stream, &fileTrait) != 1);
      if (error || !fileTrait)
        break; // read failure, or the 0 trait id that ends the list

      PRUint32 count;
      error = (readUInt32(stream, &count) != 1);
      if (error)
        break;

      // remap the trait id from the file to the local id; ids without a
      // mapping are used unchanged, and the last matching entry wins
      PRUint32 localTrait = fileTrait;
      for (PRUint32 i = 0; i < aRemapCount; i++)
      {
        if (aFromTraits[i] == fileTrait)
          localTrait = aToTraits[i];
      }

      // update the message count, never letting a removal go below zero
      PRUint32 messageCount = getMessageCount(localTrait);
      if (aIsAdd)
        messageCount += count;
      else if (count > messageCount)
        messageCount = 0;
      else
        messageCount -= count;
      setMessageCount(localTrait, messageCount);

      error = !readTokens(stream, fileSize, localTrait, aIsAdd);
    }
  }

  // the stream was opened successfully, so close it on every path
  fclose(stream);

  if (error || NS_FAILED(rv))
    return NS_ERROR_FAILURE;
  return NS_OK;
}
/**
 * Remove all counts (message and token) recorded for a trait.
 *
 * @param aTrait  id of the trait to clear
 *
 * @return NS_OK always
 */
nsresult CorpusStore::ClearTrait(PRUint32 aTrait)
{
  // clear message counts
  setMessageCount(aTrait, 0);

  // clear token counts: subtract from every token exactly the count it
  // currently holds for this trait
  TokenEnumeration tokens = getTokens();
  while (tokens.hasMoreTokens())
  {
    CorpusToken* token = static_cast<CorpusToken*>(tokens.nextToken());
    PRInt32 wordCount = static_cast<PRInt32>(getTraitCount(token, aTrait));
    updateTrait(token, aTrait, -wordCount);
  }
  return NS_OK;
}

Просмотреть файл

@ -277,6 +277,49 @@ public:
*/
PRUint32 getTraitCount(CorpusToken *token, PRUint32 aTraitId);
/**
* Add (or remove) data from a particular file to the corpus data.
*
* @param aFile the file with the data, in the format:
*
* Format of the trait file for version 1:
* [0xFCA93601] (the 01 is the version)
* for each trait to write:
* [id of trait to write] (0 means end of list)
* [number of messages per trait]
* for each token with non-zero count
* [count]
* [length of word]word
*
* @param aIsAdd should the data be added, or removed? PR_TRUE if adding,
* else removing.
*
* @param aRemapCount number of items in the parallel arrays aFromTraits,
* aToTraits. These arrays allow conversion of the
* trait id stored in the file (which may be originated
* externally) to the trait id used in the local corpus
* (which is defined locally using nsIMsgTraitService).
*
* @param aFromTraits array of trait ids used in aFile. If aFile contains
* trait ids that are not in this array, they are not
* remapped, but assumed to be local trait ids.
*
* @param aToTraits array of trait ids, corresponding to elements of
* aFromTraits, that represent the local trait ids to be
* used in storing data from aFile into the local corpus.
*
*/
nsresult UpdateData(nsILocalFile *aFile, PRBool aIsAdd,
PRUint32 aRemapCount, PRUint32 *aFromTraits,
PRUint32 *aToTraits);
/**
* remove all counts (message and tokens) for a trait id
*
* @param aTrait trait id for the trait to remove
*/
nsresult ClearTrait(PRUint32 aTrait);
protected:
/**
@ -291,8 +334,16 @@ protected:
/**
* read token strings from the data file
*
* @param stream file stream with token data
* @param fileSize file size
* @param aTraitId id for the trait whose counts will be read
* @param aIsAdd true to add the counts, false to remove them
*
* @return true if successful, false if error
*/
PRBool readTokens(FILE* stream, PRInt64 fileSize, PRUint32 aTraitId);
PRBool readTokens(FILE* stream, PRInt64 fileSize, PRUint32 aTraitId,
PRBool aIsAdd);
/**
* write token strings to the data file
@ -326,11 +377,12 @@ protected:
// the corresponding trait ID
};
class nsBayesianFilter : public nsIJunkMailPlugin {
class nsBayesianFilter : public nsIJunkMailPlugin, nsIMsgCorpus {
public:
NS_DECL_ISUPPORTS
NS_DECL_NSIMSGFILTERPLUGIN
NS_DECL_NSIJUNKMAILPLUGIN
NS_DECL_NSIMSGCORPUS
nsBayesianFilter();
virtual ~nsBayesianFilter();

Двоичный файл не отображается.

Просмотреть файл

@ -0,0 +1,6 @@
From - Sat Jan 26 08:43:42 2008
Subject: test1
Content-Type: text/plain; charset=iso-8859-1
important

Просмотреть файл

@ -0,0 +1,6 @@
From - Sat Jan 26 08:43:42 2008
Subject: test2
Content-Type: text/plain; charset=iso-8859-1
work

Просмотреть файл

@ -0,0 +1,6 @@
From - Sat Jan 26 08:43:42 2008
Subject: test3
Content-Type: text/plain; charset=iso-8859-1
very important work

Двоичный файл не отображается.

Просмотреть файл

@ -520,10 +520,11 @@ function startCommand()
case kCounts:
// test counts
let msgCount = {};
let tokenCount = nsIJunkMailPlugin.corpusCounts(null, {});
nsIJunkMailPlugin.corpusCounts(kJunkTrait, msgCount);
let nsIMsgCorpus = nsIJunkMailPlugin.QueryInterface(Ci.nsIMsgCorpus);
let tokenCount = nsIMsgCorpus.corpusCounts(null, {});
nsIMsgCorpus.corpusCounts(kJunkTrait, msgCount);
let junkCount = msgCount.value;
nsIJunkMailPlugin.corpusCounts(kGoodTrait, msgCount);
nsIMsgCorpus.corpusCounts(kGoodTrait, msgCount);
let goodCount = msgCount.value;
print("tokenCount, junkCount, goodCount is " + tokenCount, junkCount, goodCount);
do_check_eq(tokenCount, gTest.tokenCount);

Просмотреть файл

@ -0,0 +1,178 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is
* Kent James <kent@caspia.com>.
* Portions created by the Initial Developer are Copyright (C) 2009
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
// Tests corpus management functions using nsIMsgCorpus
// The bayesian filter service, obtained through its nsIMsgCorpus interface.
// Every test below drives the corpus through this object.
var msgCorpus =
Cc["@mozilla.org/messenger/filter-plugin;1?name=bayesianfilter"]
.getService(Ci.nsIMsgCorpus);
// Expected token counts after the test corpus file (resources/msgCorpus.dat)
// has been loaded once. In that file trait 1001 was trained with 2 messages,
// and trait 1003 with 1. Rows with a count of 0 check that a token that is
// absent for a trait reports zero.
var tokenData = [
// [traitid, count, token]
[1001, 0, "iDoNotExist"],
[1001, 1, "linecount"],
[1001, 2, "envelope-to:kenttest@caspia.com"],
[1003, 0, "iAlsoDoNotExist"],
[1003, 0, "isjunk"], // in 1001 but not 1003
[1003, 1, "linecount"],
[1003, 1, "subject:test"],
[1003, 1, "envelope-to:kenttest@caspia.com"],
]
// list of tests
// The tests run in order and share one corpus, so each test's expected
// counts build on the state left behind by the tests before it.
var gTests =
[
  // load the corpus file once and check the message and token counts
  function checkLoadOnce() {
    let fileName = "msgCorpus.dat";
    let file = do_get_file("resources/" + fileName);
    msgCorpus.updateData(file, true);

    // check message counts
    let messageCount = {};
    msgCorpus.corpusCounts(1001, messageCount);
    do_check_eq(2, messageCount.value);
    msgCorpus.corpusCounts(1003, messageCount);
    do_check_eq(1, messageCount.value);

    // check token counts
    for (let i = 0; i < tokenData.length; i++) {
      let id = tokenData[i][0];
      let count = tokenData[i][1];
      let word = tokenData[i][2];
      do_check_eq(count, msgCorpus.getTokenCount(word, id));
    }
  },

  // load the same file a second time: every count should double
  function checkLoadTwice() {
    let fileName = "msgCorpus.dat";
    let file = do_get_file("resources/" + fileName);
    msgCorpus.updateData(file, true);

    // check message counts
    let messageCount = {};
    msgCorpus.corpusCounts(1001, messageCount);
    do_check_eq(4, messageCount.value);
    msgCorpus.corpusCounts(1003, messageCount);
    do_check_eq(2, messageCount.value);

    // check token counts
    for (let i = 0; i < tokenData.length; i++) {
      let id = tokenData[i][0];
      let count = 2 * tokenData[i][1];
      let word = tokenData[i][2];
      do_check_eq(count, msgCorpus.getTokenCount(word, id));
    }
  },

  // remap the ids in the file to different local ids
  function loadWithRemap() {
    let fileName = "msgCorpus.dat";
    let file = do_get_file("resources/" + fileName);
    msgCorpus.updateData(file, true, 2, [1001, 1003], [1, 3]);

    // the data must now be found under the remapped ids (1001 -> 1, 1003 -> 3)
    for (let i = 0; i < tokenData.length; i++) {
      let id = tokenData[i][0] - 1000;
      let count = tokenData[i][1];
      let word = tokenData[i][2];
      do_check_eq(count, msgCorpus.getTokenCount(word, id));
    }
  },

  // test removing data: taking one copy of the file back out of the twice
  // loaded traits should leave the counts of a single load
  function checkRemove() {
    let fileName = "msgCorpus.dat";
    let file = do_get_file("resources/" + fileName);
    msgCorpus.updateData(file, false);

    // check message counts
    let messageCount = {};
    msgCorpus.corpusCounts(1001, messageCount);
    do_check_eq(2, messageCount.value);
    msgCorpus.corpusCounts(1003, messageCount);
    do_check_eq(1, messageCount.value);

    // check token counts
    for (let i = 0; i < tokenData.length; i++) {
      let id = tokenData[i][0];
      let count = tokenData[i][1];
      let word = tokenData[i][2];
      do_check_eq(count, msgCorpus.getTokenCount(word, id));
    }
  },

  // test clearing a trait
  function checkClear() {
    let messageCountObject = {};
    msgCorpus.clearTrait(1001);

    // check that the message count is zero
    msgCorpus.corpusCounts(1001, messageCountObject);
    do_check_eq(0, messageCountObject.value);

    // but the other trait should still have counts
    msgCorpus.corpusCounts(1003, messageCountObject);
    do_check_eq(1, messageCountObject.value);

    // check that token count was cleared for 1001 only
    for (let i = 0; i < tokenData.length; i++) {
      let id = tokenData[i][0];
      let count = tokenData[i][1];
      let word = tokenData[i][2];
      do_check_eq(id == 1001 ? 0 : count, msgCorpus.getTokenCount(word, id));
    }
  },
];
// main test
/**
 * Test entry point: runs every function queued in gTests, in order,
 * removing each from the queue as it is run.
 */
function run_test()
{
  do_test_pending();

  // keep taking tests off the front of the queue until none remain
  while (gTests.length)
  {
    let nextTest = gTests.shift();
    nextTest();
  }

  // no more tests, all done
  do_test_finished();
}

Просмотреть файл

@ -0,0 +1,194 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is
* Kent James <kent@caspia.com>.
* Portions created by the Initial Developer are Copyright (C) 2009
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
// Tests bayes trait analysis with aliases. Adapted from test_traits.js
/*
* These tests rely on data stored in a file, with the same format as traits.dat,
* that was trained in the following manner. There are two training messages,
* included here as files aliases1.eml and aliases2.eml. The file aliases.dat was trained on
* each of these messages, for different trait indices, as follows, with
* columns showing the training count for each trait index:
*
* file count(1001) count(1005) count(1007) count(1009)
*
* aliases1.eml 1 0 2 0
* aliases2.eml 0 1 0 1
*
* There is also a third email file, aliases3.eml, which combines tokens
* from aliases1.eml and aliases2.eml
*
* The goal here is to demonstrate that traits 1001 and 1007, and traits
* 1005 and 1009, can be combined using aliases. We classify messages with
* trait 1001 as the PRO trait, and 1005 as the ANTI trait.
*
* With these characteristics, I've run a trait analysis without aliases, and
* determined that the following is the correct percentage results from the
* analysis for each message. "Train11" means that the training was 1 pro count
* from aliases1.eml, and 1 anti count from aliases2.eml. "Train32" is 3 pro counts,
* and 2 anti counts.
*
* percentage
* file Train11 Train32
*
* aliases1.eml 92 98
* aliases2.eml 8 3
* aliases3.eml 50 53
*/
// The bayesian filter service, used both to classify messages and (through
// nsIMsgCorpus) to load the training data.
const nsIJunkMailPlugin =
Cc["@mozilla.org/messenger/filter-plugin;1?name=bayesianfilter"]
.getService(Ci.nsIJunkMailPlugin);
// The trait service, used here to add and remove trait aliases.
const traitService = Cc["@mozilla.org/msg-trait-service;1"]
.getService(Ci.nsIMsgTraitService);
// Trait ids present in the training file. Messages are classified with
// kProTrait as the pro trait and kAntiTrait as the anti trait; kProAlias and
// kAntiAlias are the traits that the tests attach to them as aliases.
const kProTrait = 1001;
const kAntiTrait = 1005;
const kProAlias = 1007;
const kAntiAlias = 1009;
var gTest; // currently active test
// The tests array defines the tests to attempt. startCommand() takes one
// element off the front of this array per classification. Format of
// an element "test" of this array:
//
// test.fileName: file containing message to test
// test.proAliases: array of aliases for the pro trait (emptied by
//                  startCommand() as the aliases are added)
// test.antiAliases: array of aliases for the anti trait (emptied by
//                   startCommand() as the aliases are added)
// test.percent: expected results from the classifier, compared with the
//               first percentage reported to the listener
//
// The first three entries classify without aliases, the last three repeat
// the same messages with one alias on each trait.
var tests =
[
{fileName: "aliases1.eml",
proAliases: [],
antiAliases: [],
percent: 92
},
{fileName: "aliases2.eml",
proAliases: [],
antiAliases: [],
percent: 8
},
{fileName: "aliases3.eml",
proAliases: [],
antiAliases: [],
percent: 50
},
{fileName: "aliases1.eml",
proAliases: [kProAlias],
antiAliases: [kAntiAlias],
percent: 98
},
{fileName: "aliases2.eml",
proAliases: [kProAlias],
antiAliases: [kAntiAlias],
percent: 3
},
{fileName: "aliases3.eml",
proAliases: [kProAlias],
antiAliases: [kAntiAlias],
percent: 53
},
]
// main test
/**
 * Test entry point: sets up the local mail account, loads the alias training
 * data into the corpus and starts the first classification.
 */
function run_test()
{
  loadLocalMailAccount();

  // load in the aliases trait testing file
  let corpus = nsIJunkMailPlugin.QueryInterface(Ci.nsIMsgCorpus);
  let trainingFile = do_get_file("resources/aliases.dat");
  corpus.updateData(trainingFile, true);

  // classification is asynchronous; startCommand() finishes the test once
  // the queue of tests is empty
  do_test_pending();
  startCommand();
}
// Receives the classification result for each message and checks it against
// the currently active test.
var listener =
{
  // nsIMsgTraitClassificationListener implementation
  //
  // aMsgURI      URI of the classified message; empty for the end-of-batch
  //              signal
  // aTraitCount  number of entries in aTraits / aPercents (not needed here,
  //              named only to keep the positional arguments aligned)
  // aTraits      traits that were classified (unused)
  // aPercents    resulting percentages; the first one is checked
  onMessageTraitsClassified: function(aMsgURI, aTraitCount, aTraits, aPercents)
  {
    //print("Message URI is " + aMsgURI);
    if (!aMsgURI)
      return; // ignore end-of-batch signal

    do_check_eq(aPercents[0], gTest.percent);
    // All done, start the next test
    startCommand();
  },
};
// start the next test command
/**
 * Runs the next queued test: resets the aliases of the pro and anti traits
 * to those the test asks for, then classifies the test's message. Finishes
 * the whole test run when the queue is empty.
 */
function startCommand()
{
  // Do we have more commands?
  if (tests.length == 0)
  {
    // no, all done
    do_test_finished();
    return;
  }

  gTest = tests.shift();

  // classify message against a single pro trait and a single anti trait
  var antiArray = [kAntiTrait];
  var proArray = [kProTrait];

  // remove any existing aliases
  let staleProAliases = traitService.getAliases(kProTrait, {});
  let staleAntiAliases = traitService.getAliases(kAntiTrait, {});
  for (let alias = staleProAliases.pop(); alias; alias = staleProAliases.pop())
    traitService.removeAlias(kProTrait, alias);
  for (let alias = staleAntiAliases.pop(); alias; alias = staleAntiAliases.pop())
    traitService.removeAlias(kAntiTrait, alias);

  // add new aliases; this consumes the test's alias arrays from the end
  for (let alias = gTest.proAliases.pop(); alias; alias = gTest.proAliases.pop())
    traitService.addAlias(kProTrait, alias);
  for (let alias = gTest.antiAliases.pop(); alias; alias = gTest.antiAliases.pop())
    traitService.addAlias(kAntiTrait, alias);

  nsIJunkMailPlugin.classifyTraitsInMessage(
    getSpec(gTest.fileName), // in string aMsgURI
    proArray.length, // length of traits arrays
    proArray, // in array aProTraits,
    antiArray, // in array aAntiTraits
    listener); // in nsIMsgTraitClassificationListener aTraitListener
    //null, // [optional] in nsIMsgWindow aMsgWindow
    //null, // [optional] in nsIJunkMailClassificationListener aJunkListener
}