mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-11-26 14:22:01 +00:00
Bug 283493: flushing of training data during session practically never happens due to too high "# of changes" threshold, patch by Tuukka Tolvanen <tuukka.tolvanen@gmail.com>, r=eyalroz, sr=mscott
This commit is contained in:
parent
4b322a21b4
commit
7af2b70e56
@ -897,8 +897,7 @@ NS_IMETHODIMP TokenStreamListener::OnStopRequest(nsIRequest *aRequest, nsISuppor
|
||||
NS_IMPL_ISUPPORTS2(nsBayesianFilter, nsIMsgFilterPlugin, nsIJunkMailPlugin)
|
||||
|
||||
nsBayesianFilter::nsBayesianFilter()
|
||||
: mGoodCount(0), mBadCount(0),
|
||||
mNumDirtyingMessages(0)
|
||||
: mGoodCount(0), mBadCount(0), mTrainingDataDirty(PR_FALSE)
|
||||
{
|
||||
if (!BayesianFilterLogModule)
|
||||
BayesianFilterLogModule = PR_NewLogModule("BayesianFilter");
|
||||
@ -934,9 +933,6 @@ nsBayesianFilter::nsBayesianFilter()
|
||||
rv = prefs->GetBranch(nsnull, getter_AddRefs(prefBranch));
|
||||
NS_ASSERTION(NS_SUCCEEDED(rv),"failed getting preferences branch");
|
||||
|
||||
rv = prefBranch->GetIntPref("mailnews.bayesian_spam_filter.flush.diryting_messages_threshold",&mDirtyingMessageWriteThreshold);
|
||||
if (NS_FAILED(rv) || (mDirtyingMessageWriteThreshold <= 0) )
|
||||
mDirtyingMessageWriteThreshold = DEFAULT_WRITE_TRAINING_DATA_MESSAGES_THRESHOLD;
|
||||
rv = prefBranch->GetIntPref("mailnews.bayesian_spam_filter.flush.minimum_interval",&mMinFlushInterval);
|
||||
// it is not a good idea to allow a minimum interval of under 1 second
|
||||
if (NS_FAILED(rv) || (mMinFlushInterval <= 1000) )
|
||||
@ -953,15 +949,11 @@ nsBayesianFilter::nsBayesianFilter()
|
||||
void
|
||||
nsBayesianFilter::TimerCallback(nsITimer* aTimer, void* aClosure)
|
||||
{
|
||||
// we will flush the training data to disk if it is dirty with
|
||||
// enough messages, and if enough time has passed since the first
|
||||
// time a message has been classified after the last flush
|
||||
// we will flush the training data to disk after enough time has passed
|
||||
// since the first time a message has been classified after the last flush
|
||||
|
||||
nsBayesianFilter *filter = NS_STATIC_CAST(nsBayesianFilter *, aClosure);
|
||||
if (filter->mNumDirtyingMessages > filter->mDirtyingMessageWriteThreshold)
|
||||
filter->writeTrainingData();
|
||||
else
|
||||
filter->mTimer->InitWithFuncCallback(nsBayesianFilter::TimerCallback, filter, filter->mMinFlushInterval, nsITimer::TYPE_ONE_SHOT);
|
||||
filter->writeTrainingData();
|
||||
}
|
||||
|
||||
nsBayesianFilter::~nsBayesianFilter()
|
||||
@ -1201,7 +1193,7 @@ void nsBayesianFilter::classifyMessage(Tokenizer& tokenizer, const char* message
|
||||
/* void shutdown (); */
|
||||
NS_IMETHODIMP nsBayesianFilter::Shutdown()
|
||||
{
|
||||
if (mNumDirtyingMessages > 0)
|
||||
if (mTrainingDataDirty)
|
||||
writeTrainingData();
|
||||
return NS_OK;
|
||||
}
|
||||
@ -1287,8 +1279,9 @@ void nsBayesianFilter::observeMessage(Tokenizer& tokenizer, const char* messageU
|
||||
nsMsgJunkStatus oldClassification, nsMsgJunkStatus newClassification,
|
||||
nsIJunkMailClassificationListener* listener)
|
||||
{
|
||||
PRUint32 oldNumDirtyingMessages = mNumDirtyingMessages;
|
||||
PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("observeMessage(%s) old=%d new=%d", messageURL, oldClassification, newClassification));
|
||||
|
||||
PRBool trainingDataWasDirty = mTrainingDataDirty;
|
||||
TokenEnumeration tokens = tokenizer.getTokens();
|
||||
|
||||
// Uhoh...if the user is re-training then the message may already be classified and we are classifying it again with the same classification.
|
||||
@ -1306,7 +1299,7 @@ void nsBayesianFilter::observeMessage(Tokenizer& tokenizer, const char* messageU
|
||||
if (mBadCount > 0) {
|
||||
--mBadCount;
|
||||
forgetTokens(mBadTokens, tokens);
|
||||
mNumDirtyingMessages++;
|
||||
mTrainingDataDirty = PR_TRUE;
|
||||
}
|
||||
break;
|
||||
case nsIJunkMailPlugin::GOOD:
|
||||
@ -1314,7 +1307,7 @@ void nsBayesianFilter::observeMessage(Tokenizer& tokenizer, const char* messageU
|
||||
if (mGoodCount > 0) {
|
||||
--mGoodCount;
|
||||
forgetTokens(mGoodTokens, tokens);
|
||||
mNumDirtyingMessages++;
|
||||
mTrainingDataDirty = PR_TRUE;
|
||||
}
|
||||
break;
|
||||
}
|
||||
@ -1326,23 +1319,26 @@ void nsBayesianFilter::observeMessage(Tokenizer& tokenizer, const char* messageU
|
||||
// put tokens into junk corpus.
|
||||
++mBadCount;
|
||||
rememberTokens(mBadTokens, tokens);
|
||||
mNumDirtyingMessages++;
|
||||
mTrainingDataDirty = PR_TRUE;
|
||||
break;
|
||||
case nsIJunkMailPlugin::GOOD:
|
||||
// put tokens into good corpus.
|
||||
++mGoodCount;
|
||||
rememberTokens(mGoodTokens, tokens);
|
||||
mNumDirtyingMessages++;
|
||||
mTrainingDataDirty = PR_TRUE;
|
||||
break;
|
||||
}
|
||||
|
||||
if (listener)
|
||||
listener->OnMessageClassified(messageURL, newClassification);
|
||||
|
||||
if ( (mNumDirtyingMessages > 0) && (oldNumDirtyingMessages == 0) && ( mTimer != nsnull ) )
|
||||
if (mTrainingDataDirty && !trainingDataWasDirty && ( mTimer != nsnull ))
|
||||
{
|
||||
// schedule check for need to flush training data in
|
||||
// mMinFlushInterval msec from now
|
||||
// if training data became dirty just now, schedule flush
|
||||
// mMinFlushInterval msec from now
|
||||
PR_LOG(
|
||||
BayesianFilterLogModule, PR_LOG_ALWAYS,
|
||||
("starting training data flush timer %i msec", mMinFlushInterval));
|
||||
mTimer->InitWithFuncCallback(nsBayesianFilter::TimerCallback, this, mMinFlushInterval, nsITimer::TYPE_ONE_SHOT);
|
||||
}
|
||||
}
|
||||
@ -1476,7 +1472,7 @@ void nsBayesianFilter::writeTrainingData()
|
||||
else
|
||||
{
|
||||
fclose(stream);
|
||||
mNumDirtyingMessages = 0;
|
||||
mTrainingDataDirty = PR_FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -50,7 +50,6 @@
|
||||
#include "plarena.h"
|
||||
|
||||
#define DEFAULT_MIN_INTERVAL_BETWEEN_WRITES 15*60*1000
|
||||
#define DEFAULT_WRITE_TRAINING_DATA_MESSAGES_THRESHOLD 50
|
||||
|
||||
struct Token;
|
||||
class TokenEnumeration;
|
||||
@ -162,8 +161,7 @@ protected:
|
||||
Tokenizer mGoodTokens, mBadTokens;
|
||||
double mJunkProbabilityThreshold;
|
||||
PRUint32 mGoodCount, mBadCount;
|
||||
PRInt32 mDirtyingMessageWriteThreshold; // ... before flushing training data
|
||||
PRInt32 mNumDirtyingMessages; // must be positive
|
||||
PRPackedBool mTrainingDataDirty;
|
||||
PRInt32 mMinFlushInterval; // in milliseconds, must be positive
|
||||
//and not too close to 0
|
||||
nsCOMPtr<nsITimer> mTimer;
|
||||
|
Loading…
Reference in New Issue
Block a user