Added writeTrainingData/readTrainingData methods, which save the training data to, and load it from, a file called "training.txt" in the current profile directory. [not part of build]

This commit is contained in:
beard%netscape.com 2002-10-02 21:52:21 +00:00
parent 82a267d76a
commit c024c1e8c8
2 changed files with 149 additions and 81 deletions

View File

@ -45,6 +45,7 @@
#include "nsIByteBuffer.h"
#include "nsNetUtil.h"
#include "nsQuickSort.h"
#include "nsIProfileInternal.h"
static const char* kBayesianFilterTokenDelimiters = " \t\n\r\f!\"#%&()*+,./:;<=>?@[\\]^_`{|}~";
@ -306,9 +307,13 @@ NS_IMETHODIMP TokenStreamListener::OnStopRequest(nsIRequest *aRequest, nsISuppor
NS_IMPL_ISUPPORTS2(nsBayesianFilter, nsIMsgFilterPlugin, nsIJunkMailPlugin)
// Constructs the filter with zeroed message counts, no server prefs key,
// batch updates off and a clean (not dirty) training-data flag, then
// immediately tries to load any previously saved training data.
//
// NOTE(review): two member-initializer lists appear below. This looks like
// the pre-change and post-change lines of a diff shown together -- confirm
// which one is current before building.
nsBayesianFilter::nsBayesianFilter()
: mServerPrefsKey(NULL), mBatchUpdate(PR_FALSE), mGoodCount(0), mBadCount(0)
: mGoodCount(0), mBadCount(0),
mServerPrefsKey(NULL), mBatchUpdate(PR_FALSE), mTrainingDataDirty(PR_FALSE)
{
NS_INIT_ISUPPORTS();
// Loading the training data here means file I/O happens during
// construction; as the original author notes, this
// should probably wait until Init() is called to do this.
readTrainingData();
}
nsBayesianFilter::~nsBayesianFilter()
@ -454,6 +459,10 @@ NS_IMETHODIMP nsBayesianFilter::GetBatchUpdate(PRBool *aBatchUpdate)
/**
 * Turns batch-update mode on or off.
 *
 * While batch mode is on, observeMessage() only marks the training data as
 * dirty and does not write it to disk. The pending changes therefore have to
 * be flushed when batch mode is switched *off*; flushing when it is switched
 * on (as the code previously did) would leave the batched changes unsaved.
 *
 * @param aBatchUpdate  PR_TRUE to start batching, PR_FALSE to end it.
 * @return NS_OK always.
 */
NS_IMETHODIMP nsBayesianFilter::SetBatchUpdate(PRBool aBatchUpdate)
{
    mBatchUpdate = aBatchUpdate;

    // End of a batch: persist whatever was accumulated while writes were deferred.
    if (!mBatchUpdate && mTrainingDataDirty)
        writeTrainingData();

    return NS_OK;
}
@ -520,6 +529,7 @@ void nsBayesianFilter::observeMessage(Tokenizer& messageTokens, const char* mess
if (mBadCount > 0) {
--mBadCount;
forgetTokens(mBadTokens, tokens, count);
mTrainingDataDirty = PR_TRUE;
}
break;
case nsIJunkMailPlugin::GOOD:
@ -527,6 +537,7 @@ void nsBayesianFilter::observeMessage(Tokenizer& messageTokens, const char* mess
if (mGoodCount > 0) {
--mGoodCount;
forgetTokens(mGoodTokens, tokens, count);
mTrainingDataDirty = PR_TRUE;
}
break;
}
@ -535,11 +546,13 @@ void nsBayesianFilter::observeMessage(Tokenizer& messageTokens, const char* mess
// put tokens into junk corpus.
++mBadCount;
rememberTokens(mBadTokens, tokens, count);
mTrainingDataDirty = PR_TRUE;
break;
case nsIJunkMailPlugin::GOOD:
// put tokens into good corpus.
++mGoodCount;
rememberTokens(mGoodTokens, tokens, count);
mTrainingDataDirty = PR_TRUE;
break;
}
@ -547,6 +560,137 @@ void nsBayesianFilter::observeMessage(Tokenizer& messageTokens, const char* mess
if (listener)
listener->OnMessageClassified(messageURL, newClassification);
if (mTrainingDataDirty && !mBatchUpdate)
writeTrainingData();
}
/*
var profileMgr = do_GetService(PROFILEMGR_CTRID, nsIProfileInternal);
var outFile = profileMgr.getProfileDir(profileMgr.currentProfile);
outFile.append("spam.db");
var fileTransportService = do_GetService(FILETPTSVC_CTRID, nsIFileTransportService);
const ioFlags = NS_WRONLY | NS_CREATE_FILE | NS_TRUNCATE;
var trans = fileTransportService.createTransport(outFile, ioFlags, 'w', true);
var out = trans.openOutputStream(0, -1, 0);
var totals = this.mHamCount.toString() + "\t" +
this.mSpamCount.toString() + "\n";
out.write(totals, totals.length);
for (token in this.mHash) {
var record = this.mHash[token];
var tokenString = token + "\t" + record[kHamCount].toString() +
"\t" + record[kSpamCount].toString() +
"\n";
// "\t" + record[kTime].toString() + "\n";
out.write(tokenString, tokenString.length);
}
out.close();
*/
// Resolves the location of the training data file: "training.txt" inside the
// directory of the current profile, as reported by the profile manager service.
//
// On success `file` refers to that path and the result of Append() is
// returned; on any earlier failure the failing nsresult is returned as-is.
static nsresult getTrainingFile(nsCOMPtr<nsIFile>& file)
{
    // should we cache the profile manager's directory?
    nsresult status;
    nsCOMPtr<nsIProfileInternal> profiles =
        do_GetService("@mozilla.org/profile/manager;1", &status);
    if (NS_FAILED(status)) {
        return status;
    }

    // Look up the name of the active profile...
    nsXPIDLString profileName;
    status = profiles->GetCurrentProfile(getter_Copies(profileName));
    if (NS_FAILED(status)) {
        return status;
    }

    // ...then its directory, which receives the file name below.
    status = profiles->GetProfileDir(profileName.get(), getter_AddRefs(file));
    if (NS_FAILED(status)) {
        return status;
    }

    status = file->Append(NS_LITERAL_STRING("training.txt"));
    return status;
}
// Writes one token section to `stream`:
//
//   count = <N>, maxWordLength = <M>
//   <word> : <count>          (N lines)
//
// The maximum word length is recorded in the header so that the reader can
// size a fixed buffer before parsing the records.
//
// The array returned by Tokenizer::getTokens() is released here with
// delete[], exactly as before.
static void writeTokens(FILE* stream, Tokenizer& tokenizer)
{
    PRUint32 count = tokenizer.countTokens();
    Token** tokens = tokenizer.getTokens();
    if (!tokens) {
        // Still emit an (empty) section header. The reader expects one header
        // per section, and skipping it would make it parse the next section's
        // header in place of this one.
        fprintf(stream, "count = %lu, maxWordLength = %lu\n", 0UL, 0UL);
        return;
    }

    // compute the maximum word length, so we can use a fixed buffer size when reading back in.
    PRUint32 maxWordLength = 0;
    PRUint32 i;
    for (i = 0; i < count; ++i) {
        PRUint32 wordLength = tokens[i]->mWord.Length();
        if (wordLength > maxWordLength)
            maxWordLength = wordLength;
    }

    // "%lu" requires an unsigned long argument; PRUint32 is not guaranteed to
    // be one (it is not on LP64 platforms), so convert explicitly.
    fprintf(stream, "count = %lu, maxWordLength = %lu\n",
            static_cast<unsigned long>(count),
            static_cast<unsigned long>(maxWordLength));

    for (i = 0; i < count; ++i) {
        Token* token = tokens[i];
        fprintf(stream, "%s : %lu\n", token->mWord.get(),
                static_cast<unsigned long>(token->mCount));
    }

    delete[] tokens;
}
static void readTokens(FILE* stream, Tokenizer& tokenizer)
{
PRUint32 count, maxWordLength;
fscanf(stream, "count = %lu, maxWordLength = %lu\n", &count, &maxWordLength);
char* wordBuffer = new char[maxWordLength + 1];
if (!wordBuffer) return;
PRUint32 wordCount;
for (PRUint32 i = 0; i < count; ++i) {
if (fscanf(stream, "%s : %lu\n", wordBuffer, &wordCount) > 0)
tokenizer.add(wordBuffer, wordCount);
}
delete[] wordBuffer;
}
// Saves the training data to the file located by getTrainingFile().
//
// File layout: a "ngood = <G>, nbad = <B>" line followed by the good-token
// section and then the bad-token section (see writeTokens).
//
// Failures to locate or open the file are silently ignored (best effort).
// The dirty flag is cleared only when the data was written and the stream
// closed without error, so a failed save is retried on the next write.
void nsBayesianFilter::writeTrainingData()
{
    nsCOMPtr<nsIFile> file;
    nsresult rv = getTrainingFile(file);
    if (NS_FAILED(rv)) return;

    // open the file, and write out training data using fprintf for now.
    nsCOMPtr<nsILocalFile> localFile = do_QueryInterface(file, &rv);
    if (NS_FAILED(rv)) return;

    FILE* stream;
    rv = localFile->OpenANSIFileDesc("w", &stream);
    if (NS_FAILED(rv)) return;

    // "%lu" requires unsigned long arguments; convert the counters explicitly
    // so the call is also correct where they are narrower than unsigned long.
    fprintf(stream, "ngood = %lu, nbad = %lu\n",
            static_cast<unsigned long>(mGoodCount),
            static_cast<unsigned long>(mBadCount));
    writeTokens(stream, mGoodTokens);
    writeTokens(stream, mBadTokens);

    // fclose() flushes buffered output, so it can report a write failure too.
    int failed = ferror(stream);
    if (fclose(stream) != 0)
        failed = 1;

    if (!failed)
        mTrainingDataDirty = PR_FALSE;
}
void nsBayesianFilter::readTrainingData()
{
nsCOMPtr<nsIFile> file;
nsresult rv = getTrainingFile(file);
if (NS_FAILED(rv)) return;
// open the file, and write out training data using fprintf for now.
nsCOMPtr<nsILocalFile> localFile = do_QueryInterface(file, &rv);
if (NS_FAILED(rv)) return;
FILE* stream;
rv = localFile->OpenANSIFileDesc("r", &stream);
if (NS_FAILED(rv)) return;
// FIXME: should make sure that the tokenizers are empty.
fscanf(stream, "ngood = %lu, nbad = %lu\n", &mGoodCount, &mBadCount);
readTokens(stream, mGoodTokens);
readTokens(stream, mBadTokens);
fclose(stream);
}
/* void setMessageClassification (in string aMsgURL, in long aOldClassification, in long aNewClassification); */

View File

@ -91,91 +91,15 @@ public:
void classifyMessage(Tokenizer& messageTokens, const char* messageURL, nsIJunkMailClassificationListener* listener);
void observeMessage(Tokenizer& messageTokens, const char* messageURL, PRInt32 oldClassification, PRInt32 newClassification, nsIJunkMailClassificationListener* listener);
void writeTrainingData();
void readTrainingData();
protected:
Tokenizer mGoodTokens, mBadTokens;
PRUint32 mGoodCount, mBadCount;
nsACString* mServerPrefsKey;
PRPackedBool mBatchUpdate;
PRPackedBool mTrainingDataDirty;
};
#if 0
#include "nsIMsgMdnGenerator.h"
#include "nsIMimeConverter.h"
#include "nsIUrlListener.h"
#include "nsXPIDLString.h"
#include "nsIMsgIncomingServer.h"
#include "nsFileStream.h"
#include "nsIOutputStream.h"
#include "nsIFileSpec.h"
#include "nsIMsgIdentity.h"
#include "nsIMsgWindow.h"
#include "nsIMimeHeaders.h"
#define eNeverSendOp ((PRInt32) 0)
#define eAutoSendOp ((PRInt32) 1)
#define eAskMeOp ((PRInt32) 2)
#define eDeniedOp ((PRInt32) 3)
// Declaration of nsMsgMdnGenerator, which derives from nsIMsgMdnGenerator and
// nsIUrlListener.
//
// NOTE(review): in this header the declaration sits inside an "#if 0" region,
// so it is not compiled. It looks unrelated to the Bayesian filter (probably
// left over from the file this header was copied from) -- confirm before
// relying on it.
class nsMsgMdnGenerator : public nsIMsgMdnGenerator, public nsIUrlListener
{
public:
// Presumably these macros expand to the nsISupports, nsIMsgMdnGenerator and
// nsIUrlListener method declarations -- their definitions are not visible here.
NS_DECL_ISUPPORTS
NS_DECL_NSIMSGMDNGENERATOR
NS_DECL_NSIURLLISTENER
nsMsgMdnGenerator();
virtual ~nsMsgMdnGenerator();
private:
// Sanity Check methods
PRBool ProcessSendMode(); // must be called prior to ValidateReturnPath
PRBool ValidateReturnPath();
PRBool NotInToOrCc();
PRBool MailAddrMatch(const char *addr1, const char *addr2);
nsresult StoreMDNSentFlag(nsIMsgFolder *folder, nsMsgKey key);
// Message construction and sending steps; each reports failure via nsresult.
nsresult CreateMdnMsg();
nsresult CreateFirstPart();
nsresult CreateSecondPart();
nsresult CreateThirdPart();
nsresult SendMdnMsg();
// string bundle helper methods
nsresult GetStringFromName(const PRUnichar *aName, PRUnichar **aResultString);
nsresult FormatStringFromName(const PRUnichar *aName,
const PRUnichar *aString,
PRUnichar **aResultString);
// other helper methods
nsresult InitAndProcess();
nsresult OutputAllHeaders();
nsresult WriteString(const char *str);
private:
// Instance state. The exact meaning of each member is not evident from this
// declaration alone; see the implementation file.
EDisposeType m_disposeType;
nsCOMPtr<nsIMsgWindow> m_window;
nsCOMPtr<nsIOutputStream> m_outputStream;
nsCOMPtr<nsIFileSpec> m_fileSpec;
nsCOMPtr<nsIMsgIdentity> m_identity;
nsXPIDLString m_charset;
nsXPIDLCString m_email;
nsXPIDLCString m_mimeSeparator;
nsXPIDLCString m_messageId;
nsCOMPtr<nsIMsgFolder> m_folder;
nsCOMPtr<nsIMsgIncomingServer> m_server;
nsCOMPtr<nsIMimeHeaders> m_headers;
nsXPIDLCString m_dntRrt;
// Presumably these hold one of the e*Op values (eNeverSendOp, eAutoSendOp,
// eAskMeOp, eDeniedOp) defined above the class -- TODO confirm.
PRInt32 m_notInToCcOp;
PRInt32 m_outsideDomainOp;
PRInt32 m_otherOp;
PRPackedBool m_reallySendMdn;
PRPackedBool m_autoSend;
PRPackedBool m_autoAction;
PRPackedBool m_mdnEnabled;
};
#endif
#endif // _nsBayesianFilter_h__