Bug 283493: flushing of training data during session practically never happens due to too high "# of changes" threshold, patch by Tuukka Tolvanen <tuukka.tolvanen@gmail.com>, r=eyalroz, sr=mscott

This commit is contained in:
gavin%gavinsharp.com 2006-07-19 13:38:19 +00:00
parent 4b322a21b4
commit 7af2b70e56
2 changed files with 19 additions and 25 deletions

View File

@ -897,8 +897,7 @@ NS_IMETHODIMP TokenStreamListener::OnStopRequest(nsIRequest *aRequest, nsISuppor
NS_IMPL_ISUPPORTS2(nsBayesianFilter, nsIMsgFilterPlugin, nsIJunkMailPlugin)
nsBayesianFilter::nsBayesianFilter()
: mGoodCount(0), mBadCount(0),
mNumDirtyingMessages(0)
: mGoodCount(0), mBadCount(0), mTrainingDataDirty(PR_FALSE)
{
if (!BayesianFilterLogModule)
BayesianFilterLogModule = PR_NewLogModule("BayesianFilter");
@ -934,9 +933,6 @@ nsBayesianFilter::nsBayesianFilter()
rv = prefs->GetBranch(nsnull, getter_AddRefs(prefBranch));
NS_ASSERTION(NS_SUCCEEDED(rv),"failed getting preferences branch");
rv = prefBranch->GetIntPref("mailnews.bayesian_spam_filter.flush.diryting_messages_threshold",&mDirtyingMessageWriteThreshold);
if (NS_FAILED(rv) || (mDirtyingMessageWriteThreshold <= 0) )
mDirtyingMessageWriteThreshold = DEFAULT_WRITE_TRAINING_DATA_MESSAGES_THRESHOLD;
rv = prefBranch->GetIntPref("mailnews.bayesian_spam_filter.flush.minimum_interval",&mMinFlushInterval);
// it is not a good idea to allow a minimum interval of under 1 second
if (NS_FAILED(rv) || (mMinFlushInterval <= 1000) )
@ -953,15 +949,11 @@ nsBayesianFilter::nsBayesianFilter()
void
nsBayesianFilter::TimerCallback(nsITimer* aTimer, void* aClosure)
{
// we will flush the training data to disk if it is dirty with
// enough messages, and if enough time has passed since the first
// time a message has been classified after the last flush
// we will flush the training data to disk after enough time has passed
// since the first time a message has been classified after the last flush
nsBayesianFilter *filter = NS_STATIC_CAST(nsBayesianFilter *, aClosure);
if (filter->mNumDirtyingMessages > filter->mDirtyingMessageWriteThreshold)
filter->writeTrainingData();
else
filter->mTimer->InitWithFuncCallback(nsBayesianFilter::TimerCallback, filter, filter->mMinFlushInterval, nsITimer::TYPE_ONE_SHOT);
filter->writeTrainingData();
}
nsBayesianFilter::~nsBayesianFilter()
@ -1201,7 +1193,7 @@ void nsBayesianFilter::classifyMessage(Tokenizer& tokenizer, const char* message
/* void shutdown (); */
NS_IMETHODIMP nsBayesianFilter::Shutdown()
{
if (mNumDirtyingMessages > 0)
if (mTrainingDataDirty)
writeTrainingData();
return NS_OK;
}
@ -1287,8 +1279,9 @@ void nsBayesianFilter::observeMessage(Tokenizer& tokenizer, const char* messageU
nsMsgJunkStatus oldClassification, nsMsgJunkStatus newClassification,
nsIJunkMailClassificationListener* listener)
{
PRUint32 oldNumDirtyingMessages = mNumDirtyingMessages;
PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("observeMessage(%s) old=%d new=%d", messageURL, oldClassification, newClassification));
PRBool trainingDataWasDirty = mTrainingDataDirty;
TokenEnumeration tokens = tokenizer.getTokens();
// Uhoh...if the user is re-training then the message may already be classified and we are classifying it again with the same classification.
@ -1306,7 +1299,7 @@ void nsBayesianFilter::observeMessage(Tokenizer& tokenizer, const char* messageU
if (mBadCount > 0) {
--mBadCount;
forgetTokens(mBadTokens, tokens);
mNumDirtyingMessages++;
mTrainingDataDirty = PR_TRUE;
}
break;
case nsIJunkMailPlugin::GOOD:
@ -1314,7 +1307,7 @@ void nsBayesianFilter::observeMessage(Tokenizer& tokenizer, const char* messageU
if (mGoodCount > 0) {
--mGoodCount;
forgetTokens(mGoodTokens, tokens);
mNumDirtyingMessages++;
mTrainingDataDirty = PR_TRUE;
}
break;
}
@ -1326,23 +1319,26 @@ void nsBayesianFilter::observeMessage(Tokenizer& tokenizer, const char* messageU
// put tokens into junk corpus.
++mBadCount;
rememberTokens(mBadTokens, tokens);
mNumDirtyingMessages++;
mTrainingDataDirty = PR_TRUE;
break;
case nsIJunkMailPlugin::GOOD:
// put tokens into good corpus.
++mGoodCount;
rememberTokens(mGoodTokens, tokens);
mNumDirtyingMessages++;
mTrainingDataDirty = PR_TRUE;
break;
}
if (listener)
listener->OnMessageClassified(messageURL, newClassification);
if ( (mNumDirtyingMessages > 0) && (oldNumDirtyingMessages == 0) && ( mTimer != nsnull ) )
if (mTrainingDataDirty && !trainingDataWasDirty && ( mTimer != nsnull ))
{
// schedule check for need to flush training data in
// mMinFlushInterval msec from now
// if training data became dirty just now, schedule flush
// mMinFlushInterval msec from now
PR_LOG(
BayesianFilterLogModule, PR_LOG_ALWAYS,
("starting training data flush timer %i msec", mMinFlushInterval));
mTimer->InitWithFuncCallback(nsBayesianFilter::TimerCallback, this, mMinFlushInterval, nsITimer::TYPE_ONE_SHOT);
}
}
@ -1476,7 +1472,7 @@ void nsBayesianFilter::writeTrainingData()
else
{
fclose(stream);
mNumDirtyingMessages = 0;
mTrainingDataDirty = PR_FALSE;
}
}

View File

@ -50,7 +50,6 @@
#include "plarena.h"
#define DEFAULT_MIN_INTERVAL_BETWEEN_WRITES 15*60*1000
#define DEFAULT_WRITE_TRAINING_DATA_MESSAGES_THRESHOLD 50
struct Token;
class TokenEnumeration;
@ -162,8 +161,7 @@ protected:
Tokenizer mGoodTokens, mBadTokens;
double mJunkProbabilityThreshold;
PRUint32 mGoodCount, mBadCount;
PRInt32 mDirtyingMessageWriteThreshold; // ... before flushing training data
PRInt32 mNumDirtyingMessages; // must be positive
PRPackedBool mTrainingDataDirty;
PRInt32 mMinFlushInterval; // in miliseconds, must be positive
//and not too close to 0
nsCOMPtr<nsITimer> mTimer;