Fix for bug #183109, r=dmose, sr=sfraser, a=asa.

This commit is contained in:
beard%netscape.com 2002-12-07 00:42:48 +00:00
parent 24bb3f3bcc
commit 377eb1b1ff
2 changed files with 54 additions and 4 deletions

View File

@ -162,7 +162,7 @@ Tokenizer::~Tokenizer()
PL_FinishArenaPool(&mWordPool);
}
nsresult Tokenizer::Clear()
nsresult Tokenizer::clearTokens()
{
// we re-use the tokenizer when classifying multiple messages,
// so this gets called after every message classification.
@ -244,6 +244,17 @@ static PRBool isDecimalNumber(const char* word)
return PR_TRUE;
}
/**
 * Reports whether a NUL-terminated string consists solely of 7-bit bytes.
 *
 * @param word  NUL-terminated byte string to inspect.
 * @return PR_TRUE if every byte before the terminator is <= 127 (an empty
 *         string therefore yields PR_TRUE); PR_FALSE as soon as a byte with
 *         a larger value is found.
 */
static PRBool isASCII(const char* word)
{
    // Walk the bytes as unsigned so values above 127 do not show up as
    // negative numbers on platforms where plain char is signed.
    for (const unsigned char* cursor = (const unsigned char*)word; *cursor != 0; ++cursor) {
        if (*cursor > 0x7F)
            return PR_FALSE;
    }
    return PR_TRUE;
}
/**
 * Tests whether c lies in the range 'A'..'Z' (inclusive). The comparison is
 * done against character literals only, so no locale tables are consulted.
 */
inline PRBool isUpperCase(char c)
{
    return (c >= 'A') && (c <= 'Z');
}
static char* toLowerCase(char* str)
@ -263,7 +274,39 @@ void Tokenizer::tokenize(char* text)
while ((word = nsCRT::strtok(next, kBayesianFilterTokenDelimiters, &next)) != NULL) {
if (word[0] == '\0') continue;
if (isDecimalNumber(word)) continue;
add(toLowerCase(word));
if (isASCII(word))
add(toLowerCase(word));
else {
nsresult rv;
// use I18N scanner to break this word into meaningful semantic units.
if (!mScanner) {
mScanner = do_CreateInstance(NS_SEMANTICUNITSCANNER_CONTRACTID, &rv);
NS_ASSERTION(NS_SUCCEEDED(rv), "couldn't create semantic unit scanner!");
if (NS_FAILED(rv)) {
return;
}
}
if (mScanner) {
mScanner->Start("UTF-8");
// convert this word from UTF-8 into UCS2.
NS_ConvertUTF8toUCS2 uword(word);
ToLowerCase(uword);
const PRUnichar* utext = uword.get();
PRInt32 len = uword.Length(), pos = 0, begin, end;
PRBool gotUnit;
while (pos < len) {
rv = mScanner->Next(utext, len, pos, PR_TRUE, &begin, &end, &gotUnit);
if (NS_SUCCEEDED(rv) && gotUnit) {
NS_ConvertUCS2toUTF8 utfUnit(utext + begin, end - begin);
add(utfUnit.get());
// advance to end of current unit.
pos = end;
} else {
break;
}
}
}
}
}
}
@ -506,7 +549,7 @@ public:
// Hands the tokenizer to the filter's classifyMessage() together with the
// token source and listener, clears the tokenizer, and then calls
// classifyNextMessage().
virtual void analyzeTokens(Tokenizer& tokenizer)
{
mFilter->classifyMessage(tokenizer, mTokenSource.get(), mListener);
// The tokenizer is re-used across messages, so its tokens are cleared after
// each classification.
// NOTE(review): both Clear() and clearTokens() are called here; this looks
// like the old and new spelling of a renamed method shown together --
// confirm that only clearTokens() should remain.
tokenizer.Clear();
tokenizer.clearTokens();
classifyNextMessage();
}

View File

@ -41,6 +41,7 @@
#include "nsCOMPtr.h"
#include "nsIMsgFilterPlugin.h"
#include "nsISemanticUnitScanner.h"
#include "pldhash.h"
// XXX can't simply byte align arenas, must at least 2-byte align.
@ -51,6 +52,7 @@ class Token;
class TokenEnumeration;
class TokenAnalyzer;
class nsIMsgWindow;
/**
* Helper class to enumerate Token objects in a PLDHashTable
* safely and without copying (see bugzilla #174859). The
@ -73,7 +75,6 @@ public:
Tokenizer();
~Tokenizer();
nsresult Clear(); // clears out the previous message tokens
operator int() { return mTokenTable.entryStore != NULL; }
Token* get(const char* word);
@ -84,6 +85,11 @@ public:
Token* copyTokens();
TokenEnumeration getTokens();
/**
* Clears out the previous message tokens.
*/
nsresult clearTokens();
/**
* Assumes that text is mutable and
* can be nsCRT::strtok'd.
@ -106,6 +112,7 @@ private:
private:
PLDHashTable mTokenTable;
PLArenaPool mWordPool;
nsCOMPtr<nsISemanticUnitScanner> mScanner;
};
class nsBayesianFilter : public nsIJunkMailPlugin {