mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-10-16 23:05:42 +00:00
Added writeTrainingData/readTrainingData methods which save to a file called "training.txt" in the current profile directory. [not part of build]
This commit is contained in:
parent
82a267d76a
commit
c024c1e8c8
@ -45,6 +45,7 @@
|
||||
#include "nsIByteBuffer.h"
|
||||
#include "nsNetUtil.h"
|
||||
#include "nsQuickSort.h"
|
||||
#include "nsIProfileInternal.h"
|
||||
|
||||
static const char* kBayesianFilterTokenDelimiters = " \t\n\r\f!\"#%&()*+,./:;<=>?@[\\]^_`{|}~";
|
||||
|
||||
@ -306,9 +307,13 @@ NS_IMETHODIMP TokenStreamListener::OnStopRequest(nsIRequest *aRequest, nsISuppor
|
||||
NS_IMPL_ISUPPORTS2(nsBayesianFilter, nsIMsgFilterPlugin, nsIJunkMailPlugin)
|
||||
|
||||
/**
 * Constructs the filter with empty counts and loads any previously saved
 * training data.
 *
 * Members are initialized in the order they are declared in the class
 * (counts, prefs key, batch flag, dirty flag).
 */
nsBayesianFilter::nsBayesianFilter()
    :   mGoodCount(0), mBadCount(0),
        mServerPrefsKey(NULL), mBatchUpdate(PR_FALSE), mTrainingDataDirty(PR_FALSE)
{
    NS_INIT_ISUPPORTS();

    // should probably wait until Init() is called to do this.
    readTrainingData();
}
|
||||
|
||||
nsBayesianFilter::~nsBayesianFilter()
|
||||
@ -454,6 +459,10 @@ NS_IMETHODIMP nsBayesianFilter::GetBatchUpdate(PRBool *aBatchUpdate)
|
||||
/**
 * Turns batch-update mode on or off.
 *
 * While batch mode is on, observeMessage() marks the training data dirty but
 * does not write it. Pending changes are therefore flushed here when batch
 * mode is switched off, so they are not left unsaved until some later
 * observation happens to trigger a write.
 *
 * @param aBatchUpdate  PR_TRUE to defer writes, PR_FALSE to resume them.
 * @return NS_OK always.
 */
NS_IMETHODIMP nsBayesianFilter::SetBatchUpdate(PRBool aBatchUpdate)
{
    mBatchUpdate = aBatchUpdate;

    // Leaving batch mode: persist whatever was accumulated during the batch.
    if (!mBatchUpdate && mTrainingDataDirty)
        writeTrainingData();

    return NS_OK;
}
|
||||
|
||||
@ -520,6 +529,7 @@ void nsBayesianFilter::observeMessage(Tokenizer& messageTokens, const char* mess
|
||||
if (mBadCount > 0) {
|
||||
--mBadCount;
|
||||
forgetTokens(mBadTokens, tokens, count);
|
||||
mTrainingDataDirty = PR_TRUE;
|
||||
}
|
||||
break;
|
||||
case nsIJunkMailPlugin::GOOD:
|
||||
@ -527,6 +537,7 @@ void nsBayesianFilter::observeMessage(Tokenizer& messageTokens, const char* mess
|
||||
if (mGoodCount > 0) {
|
||||
--mGoodCount;
|
||||
forgetTokens(mGoodTokens, tokens, count);
|
||||
mTrainingDataDirty = PR_TRUE;
|
||||
}
|
||||
break;
|
||||
}
|
||||
@ -535,11 +546,13 @@ void nsBayesianFilter::observeMessage(Tokenizer& messageTokens, const char* mess
|
||||
// put tokens into junk corpus.
|
||||
++mBadCount;
|
||||
rememberTokens(mBadTokens, tokens, count);
|
||||
mTrainingDataDirty = PR_TRUE;
|
||||
break;
|
||||
case nsIJunkMailPlugin::GOOD:
|
||||
// put tokens into good corpus.
|
||||
++mGoodCount;
|
||||
rememberTokens(mGoodTokens, tokens, count);
|
||||
mTrainingDataDirty = PR_TRUE;
|
||||
break;
|
||||
}
|
||||
|
||||
@ -547,6 +560,137 @@ void nsBayesianFilter::observeMessage(Tokenizer& messageTokens, const char* mess
|
||||
|
||||
if (listener)
|
||||
listener->OnMessageClassified(messageURL, newClassification);
|
||||
|
||||
if (mTrainingDataDirty && !mBatchUpdate)
|
||||
writeTrainingData();
|
||||
}
|
||||
|
||||
/*
|
||||
var profileMgr = do_GetService(PROFILEMGR_CTRID, nsIProfileInternal);
|
||||
var outFile = profileMgr.getProfileDir(profileMgr.currentProfile);
|
||||
outFile.append("spam.db");
|
||||
var fileTransportService = do_GetService(FILETPTSVC_CTRID, nsIFileTransportService);
|
||||
const ioFlags = NS_WRONLY | NS_CREATE_FILE | NS_TRUNCATE;
|
||||
var trans = fileTransportService.createTransport(outFile, ioFlags, 'w', true);
|
||||
var out = trans.openOutputStream(0, -1, 0);
|
||||
|
||||
var totals = this.mHamCount.toString() + "\t" +
|
||||
this.mSpamCount.toString() + "\n";
|
||||
out.write(totals, totals.length);
|
||||
for (token in this.mHash) {
|
||||
var record = this.mHash[token];
|
||||
var tokenString = token + "\t" + record[kHamCount].toString() +
|
||||
"\t" + record[kSpamCount].toString() +
|
||||
"\n";
|
||||
// "\t" + record[kTime].toString() + "\n";
|
||||
out.write(tokenString, tokenString.length);
|
||||
}
|
||||
|
||||
out.close();
|
||||
*/
|
||||
|
||||
/**
 * Resolves the training data file: "training.txt" inside the directory of
 * the current profile.
 *
 * @param file  receives the resulting nsIFile.
 * @return the first failing nsresult from the profile manager lookups, or
 *         the result of appending the leaf name.
 */
static nsresult getTrainingFile(nsCOMPtr<nsIFile>& file)
{
    // The profile manager is looked up on every call; nothing is cached yet.
    nsresult status;
    nsCOMPtr<nsIProfileInternal> profiles = do_GetService("@mozilla.org/profile/manager;1", &status);
    if (NS_FAILED(status))
        return status;

    // Name of the active profile, used to find its directory.
    nsXPIDLString profileName;
    status = profiles->GetCurrentProfile(getter_Copies(profileName));
    if (NS_FAILED(status))
        return status;

    status = profiles->GetProfileDir(profileName.get(), getter_AddRefs(file));
    if (NS_FAILED(status))
        return status;

    return file->Append(NS_LITERAL_STRING("training.txt"));
}
|
||||
|
||||
static void writeTokens(FILE* stream, Tokenizer& tokenizer)
|
||||
{
|
||||
PRUint32 i, count = tokenizer.countTokens();
|
||||
Token ** tokens = tokenizer.getTokens();
|
||||
if (!tokens) return;
|
||||
|
||||
// compute the maximum word length, so we can use a fixed buffer size when reading back in.
|
||||
PRUint32 maxWordLength = 0;
|
||||
for (i = 0; i < count; ++i) {
|
||||
PRUint32 wordLength = tokens[i]->mWord.Length();
|
||||
if (wordLength > maxWordLength)
|
||||
maxWordLength = wordLength;
|
||||
}
|
||||
|
||||
fprintf(stream, "count = %lu, maxWordLength = %lu\n", count, maxWordLength);
|
||||
|
||||
for (PRUint32 i = 0; i < count; ++i) {
|
||||
Token* token = tokens[i];
|
||||
fprintf(stream, "%s : %lu\n", token->mWord.get(), token->mCount);
|
||||
}
|
||||
|
||||
delete[] tokens;
|
||||
}
|
||||
|
||||
static void readTokens(FILE* stream, Tokenizer& tokenizer)
|
||||
{
|
||||
PRUint32 count, maxWordLength;
|
||||
fscanf(stream, "count = %lu, maxWordLength = %lu\n", &count, &maxWordLength);
|
||||
|
||||
char* wordBuffer = new char[maxWordLength + 1];
|
||||
if (!wordBuffer) return;
|
||||
|
||||
PRUint32 wordCount;
|
||||
|
||||
for (PRUint32 i = 0; i < count; ++i) {
|
||||
if (fscanf(stream, "%s : %lu\n", wordBuffer, &wordCount) > 0)
|
||||
tokenizer.add(wordBuffer, wordCount);
|
||||
}
|
||||
|
||||
delete[] wordBuffer;
|
||||
}
|
||||
|
||||
void nsBayesianFilter::writeTrainingData()
|
||||
{
|
||||
nsCOMPtr<nsIFile> file;
|
||||
nsresult rv = getTrainingFile(file);
|
||||
if (NS_FAILED(rv)) return;
|
||||
|
||||
// open the file, and write out training data using fprintf for now.
|
||||
nsCOMPtr<nsILocalFile> localFile = do_QueryInterface(file, &rv);
|
||||
if (NS_FAILED(rv)) return;
|
||||
|
||||
FILE* stream;
|
||||
rv = localFile->OpenANSIFileDesc("w", &stream);
|
||||
if (NS_FAILED(rv)) return;
|
||||
|
||||
fprintf(stream, "ngood = %lu, nbad = %lu\n", mGoodCount, mBadCount);
|
||||
writeTokens(stream, mGoodTokens);
|
||||
writeTokens(stream, mBadTokens);
|
||||
|
||||
fclose(stream);
|
||||
|
||||
mTrainingDataDirty = PR_FALSE;
|
||||
}
|
||||
|
||||
void nsBayesianFilter::readTrainingData()
|
||||
{
|
||||
nsCOMPtr<nsIFile> file;
|
||||
nsresult rv = getTrainingFile(file);
|
||||
if (NS_FAILED(rv)) return;
|
||||
|
||||
// open the file, and write out training data using fprintf for now.
|
||||
nsCOMPtr<nsILocalFile> localFile = do_QueryInterface(file, &rv);
|
||||
if (NS_FAILED(rv)) return;
|
||||
|
||||
FILE* stream;
|
||||
rv = localFile->OpenANSIFileDesc("r", &stream);
|
||||
if (NS_FAILED(rv)) return;
|
||||
|
||||
// FIXME: should make sure that the tokenizers are empty.
|
||||
fscanf(stream, "ngood = %lu, nbad = %lu\n", &mGoodCount, &mBadCount);
|
||||
readTokens(stream, mGoodTokens);
|
||||
readTokens(stream, mBadTokens);
|
||||
|
||||
fclose(stream);
|
||||
}
|
||||
|
||||
/* void setMessageClassification (in string aMsgURL, in long aOldClassification, in long aNewClassification); */
|
||||
|
@ -91,91 +91,15 @@ public:
|
||||
void classifyMessage(Tokenizer& messageTokens, const char* messageURL, nsIJunkMailClassificationListener* listener);
|
||||
void observeMessage(Tokenizer& messageTokens, const char* messageURL, PRInt32 oldClassification, PRInt32 newClassification, nsIJunkMailClassificationListener* listener);
|
||||
|
||||
void writeTrainingData();
|
||||
void readTrainingData();
|
||||
|
||||
protected:
|
||||
Tokenizer mGoodTokens, mBadTokens;
|
||||
PRUint32 mGoodCount, mBadCount;
|
||||
nsACString* mServerPrefsKey;
|
||||
PRPackedBool mBatchUpdate;
|
||||
PRPackedBool mTrainingDataDirty;
|
||||
};
|
||||
|
||||
#if 0
|
||||
|
||||
#include "nsIMsgMdnGenerator.h"
|
||||
#include "nsIMimeConverter.h"
|
||||
#include "nsIUrlListener.h"
|
||||
#include "nsXPIDLString.h"
|
||||
#include "nsIMsgIncomingServer.h"
|
||||
#include "nsFileStream.h"
|
||||
#include "nsIOutputStream.h"
|
||||
#include "nsIFileSpec.h"
|
||||
#include "nsIMsgIdentity.h"
|
||||
#include "nsIMsgWindow.h"
|
||||
#include "nsIMimeHeaders.h"
|
||||
|
||||
#define eNeverSendOp ((PRInt32) 0)
|
||||
#define eAutoSendOp ((PRInt32) 1)
|
||||
#define eAskMeOp ((PRInt32) 2)
|
||||
#define eDeniedOp ((PRInt32) 3)
|
||||
|
||||
class nsMsgMdnGenerator : public nsIMsgMdnGenerator, public nsIUrlListener
|
||||
{
|
||||
public:
|
||||
NS_DECL_ISUPPORTS
|
||||
NS_DECL_NSIMSGMDNGENERATOR
|
||||
NS_DECL_NSIURLLISTENER
|
||||
|
||||
nsMsgMdnGenerator();
|
||||
virtual ~nsMsgMdnGenerator();
|
||||
|
||||
private:
|
||||
// Sanity Check methods
|
||||
PRBool ProcessSendMode(); // must called prior ValidateReturnPath
|
||||
PRBool ValidateReturnPath();
|
||||
PRBool NotInToOrCc();
|
||||
PRBool MailAddrMatch(const char *addr1, const char *addr2);
|
||||
|
||||
nsresult StoreMDNSentFlag(nsIMsgFolder *folder, nsMsgKey key);
|
||||
|
||||
nsresult CreateMdnMsg();
|
||||
nsresult CreateFirstPart();
|
||||
nsresult CreateSecondPart();
|
||||
nsresult CreateThirdPart();
|
||||
nsresult SendMdnMsg();
|
||||
|
||||
// string bundle helper methods
|
||||
nsresult GetStringFromName(const PRUnichar *aName, PRUnichar **aResultString);
|
||||
nsresult FormatStringFromName(const PRUnichar *aName,
|
||||
const PRUnichar *aString,
|
||||
PRUnichar **aResultString);
|
||||
|
||||
// other helper methods
|
||||
nsresult InitAndProcess();
|
||||
nsresult OutputAllHeaders();
|
||||
nsresult WriteString(const char *str);
|
||||
|
||||
private:
|
||||
EDisposeType m_disposeType;
|
||||
nsCOMPtr<nsIMsgWindow> m_window;
|
||||
nsCOMPtr<nsIOutputStream> m_outputStream;
|
||||
nsCOMPtr<nsIFileSpec> m_fileSpec;
|
||||
nsCOMPtr<nsIMsgIdentity> m_identity;
|
||||
nsXPIDLString m_charset;
|
||||
nsXPIDLCString m_email;
|
||||
nsXPIDLCString m_mimeSeparator;
|
||||
nsXPIDLCString m_messageId;
|
||||
nsCOMPtr<nsIMsgFolder> m_folder;
|
||||
nsCOMPtr<nsIMsgIncomingServer> m_server;
|
||||
nsCOMPtr<nsIMimeHeaders> m_headers;
|
||||
nsXPIDLCString m_dntRrt;
|
||||
PRInt32 m_notInToCcOp;
|
||||
PRInt32 m_outsideDomainOp;
|
||||
PRInt32 m_otherOp;
|
||||
PRPackedBool m_reallySendMdn;
|
||||
PRPackedBool m_autoSend;
|
||||
PRPackedBool m_autoAction;
|
||||
PRPackedBool m_mdnEnabled;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
#endif // _nsBayesianFilter_h__
|
||||
|
Loading…
Reference in New Issue
Block a user