mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-10-16 23:05:42 +00:00
Fix for bug #183109, r=dmose, sr=sfraser, a=asa.
This commit is contained in:
parent
24bb3f3bcc
commit
377eb1b1ff
@ -162,7 +162,7 @@ Tokenizer::~Tokenizer()
|
||||
PL_FinishArenaPool(&mWordPool);
|
||||
}
|
||||
|
||||
nsresult Tokenizer::Clear()
|
||||
nsresult Tokenizer::clearTokens()
|
||||
{
|
||||
// we re-use the tokenizer when classifying multiple messages,
|
||||
// so this gets called after every message classification.
|
||||
@ -244,6 +244,17 @@ static PRBool isDecimalNumber(const char* word)
|
||||
return PR_TRUE;
|
||||
}
|
||||
|
||||
// Reports whether the NUL-terminated string |word| consists solely of
// 7-bit characters. Returns PR_FALSE at the first byte whose value is
// above 127, and PR_TRUE if the terminator is reached without finding one
// (so an empty string yields PR_TRUE).
static PRBool isASCII(const char* word)
{
    // Inspect the bytes as unsigned values so that anything above 127 is
    // compared as a large positive number rather than a negative char.
    for (const unsigned char* cursor = (const unsigned char*)word; *cursor != 0; ++cursor) {
        if (*cursor > 127)
            return PR_FALSE;
    }
    return PR_TRUE;
}
|
||||
|
||||
// True when |c| lies in the range 'A'..'Z' inclusive.
inline PRBool isUpperCase(char c)
{
    return (c >= 'A') && (c <= 'Z');
}
|
||||
|
||||
static char* toLowerCase(char* str)
|
||||
@ -263,7 +274,39 @@ void Tokenizer::tokenize(char* text)
|
||||
while ((word = nsCRT::strtok(next, kBayesianFilterTokenDelimiters, &next)) != NULL) {
|
||||
if (word[0] == '\0') continue;
|
||||
if (isDecimalNumber(word)) continue;
|
||||
add(toLowerCase(word));
|
||||
if (isASCII(word))
|
||||
add(toLowerCase(word));
|
||||
else {
|
||||
nsresult rv;
|
||||
// use I18N scanner to break this word into meaningful semantic units.
|
||||
if (!mScanner) {
|
||||
mScanner = do_CreateInstance(NS_SEMANTICUNITSCANNER_CONTRACTID, &rv);
|
||||
NS_ASSERTION(NS_SUCCEEDED(rv), "couldn't create semantic unit scanner!");
|
||||
if (NS_FAILED(rv)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (mScanner) {
|
||||
mScanner->Start("UTF-8");
|
||||
// convert this word from UTF-8 into UCS2.
|
||||
NS_ConvertUTF8toUCS2 uword(word);
|
||||
ToLowerCase(uword);
|
||||
const PRUnichar* utext = uword.get();
|
||||
PRInt32 len = uword.Length(), pos = 0, begin, end;
|
||||
PRBool gotUnit;
|
||||
while (pos < len) {
|
||||
rv = mScanner->Next(utext, len, pos, PR_TRUE, &begin, &end, &gotUnit);
|
||||
if (NS_SUCCEEDED(rv) && gotUnit) {
|
||||
NS_ConvertUCS2toUTF8 utfUnit(utext + begin, end - begin);
|
||||
add(utfUnit.get());
|
||||
// advance to end of current unit.
|
||||
pos = end;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -506,7 +549,7 @@ public:
|
||||
/**
 * Passes |tokenizer|, together with mTokenSource and mListener, to
 * mFilter->classifyMessage(), then empties the tokenizer and calls
 * classifyNextMessage().
 *
 * @param tokenizer  the tokenizer handed to classifyMessage(); its tokens
 *                   are cleared before this method returns.
 */
virtual void analyzeTokens(Tokenizer& tokenizer)
{
    mFilter->classifyMessage(tokenizer, mTokenSource.get(), mListener);
    // Reset once, through clearTokens(): the earlier Clear() entry point was
    // renamed to clearTokens(), so calling both would reference the retired
    // name and clear the same tokens twice.
    tokenizer.clearTokens();
    classifyNextMessage();
}
|
||||
|
||||
|
@ -41,6 +41,7 @@
|
||||
|
||||
#include "nsCOMPtr.h"
|
||||
#include "nsIMsgFilterPlugin.h"
|
||||
#include "nsISemanticUnitScanner.h"
|
||||
#include "pldhash.h"
|
||||
|
||||
// XXX can't simply byte align arenas, must at least 2-byte align.
|
||||
@ -51,6 +52,7 @@ class Token;
|
||||
class TokenEnumeration;
|
||||
class TokenAnalyzer;
|
||||
class nsIMsgWindow;
|
||||
|
||||
/**
|
||||
* Helper class to enumerate Token objects in a PLDHashTable
|
||||
* safely and without copying (see bugzilla #174859). The
|
||||
@ -73,7 +75,6 @@ public:
|
||||
Tokenizer();
|
||||
~Tokenizer();
|
||||
|
||||
nsresult Clear(); // clears out the previous message tokens
|
||||
operator int() { return mTokenTable.entryStore != NULL; }
|
||||
|
||||
Token* get(const char* word);
|
||||
@ -84,6 +85,11 @@ public:
|
||||
Token* copyTokens();
|
||||
TokenEnumeration getTokens();
|
||||
|
||||
/**
|
||||
* Clears out the previous message tokens.
|
||||
*/
|
||||
nsresult clearTokens();
|
||||
|
||||
/**
|
||||
* Assumes that text is mutable and
|
||||
* can be nsCRT::strtok'd.
|
||||
@ -106,6 +112,7 @@ private:
|
||||
private:
|
||||
PLDHashTable mTokenTable;
|
||||
PLArenaPool mWordPool;
|
||||
nsCOMPtr<nsISemanticUnitScanner> mScanner;
|
||||
};
|
||||
|
||||
class nsBayesianFilter : public nsIJunkMailPlugin {
|
||||
|
Loading…
Reference in New Issue
Block a user