mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-10-16 23:05:42 +00:00
Fix for bug #183109, r=dmose, sr=sfraser, a=asa.
This commit is contained in:
parent
24bb3f3bcc
commit
377eb1b1ff
@ -162,7 +162,7 @@ Tokenizer::~Tokenizer()
|
|||||||
PL_FinishArenaPool(&mWordPool);
|
PL_FinishArenaPool(&mWordPool);
|
||||||
}
|
}
|
||||||
|
|
||||||
nsresult Tokenizer::Clear()
|
nsresult Tokenizer::clearTokens()
|
||||||
{
|
{
|
||||||
// we re-use the tokenizer when classifying multiple messages,
|
// we re-use the tokenizer when classifying multiple messages,
|
||||||
// so this gets called after every message classification.
|
// so this gets called after every message classification.
|
||||||
@ -244,6 +244,17 @@ static PRBool isDecimalNumber(const char* word)
|
|||||||
return PR_TRUE;
|
return PR_TRUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Reports whether a NUL-terminated C string consists solely of 7-bit
 * ASCII bytes.
 *
 * @param word NUL-terminated byte string to inspect; it is dereferenced
 *             unconditionally, so it must not be null.
 * @return PR_TRUE when no byte has its high bit set (an empty string
 *         qualifies), PR_FALSE as soon as a byte above 127 is found.
 */
static PRBool isASCII(const char* word)
{
    // Inspect the bytes as unsigned so values above 127 are not seen as
    // negative on platforms where plain char is signed.
    for (const unsigned char* cursor = (const unsigned char*)word; *cursor != 0; ++cursor) {
        if ((*cursor & 0x80) != 0)
            return PR_FALSE;
    }
    return PR_TRUE;
}
|
||||||
|
|
||||||
inline PRBool isUpperCase(char c) { return ('A' <= c) && (c <= 'Z'); }
|
inline PRBool isUpperCase(char c) { return ('A' <= c) && (c <= 'Z'); }
|
||||||
|
|
||||||
static char* toLowerCase(char* str)
|
static char* toLowerCase(char* str)
|
||||||
@ -263,7 +274,39 @@ void Tokenizer::tokenize(char* text)
|
|||||||
while ((word = nsCRT::strtok(next, kBayesianFilterTokenDelimiters, &next)) != NULL) {
|
while ((word = nsCRT::strtok(next, kBayesianFilterTokenDelimiters, &next)) != NULL) {
|
||||||
if (word[0] == '\0') continue;
|
if (word[0] == '\0') continue;
|
||||||
if (isDecimalNumber(word)) continue;
|
if (isDecimalNumber(word)) continue;
|
||||||
|
if (isASCII(word))
|
||||||
add(toLowerCase(word));
|
add(toLowerCase(word));
|
||||||
|
else {
|
||||||
|
nsresult rv;
|
||||||
|
// use I18N scanner to break this word into meaningful semantic units.
|
||||||
|
if (!mScanner) {
|
||||||
|
mScanner = do_CreateInstance(NS_SEMANTICUNITSCANNER_CONTRACTID, &rv);
|
||||||
|
NS_ASSERTION(NS_SUCCEEDED(rv), "couldn't create semantic unit scanner!");
|
||||||
|
if (NS_FAILED(rv)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (mScanner) {
|
||||||
|
mScanner->Start("UTF-8");
|
||||||
|
// convert this word from UTF-8 into UCS2.
|
||||||
|
NS_ConvertUTF8toUCS2 uword(word);
|
||||||
|
ToLowerCase(uword);
|
||||||
|
const PRUnichar* utext = uword.get();
|
||||||
|
PRInt32 len = uword.Length(), pos = 0, begin, end;
|
||||||
|
PRBool gotUnit;
|
||||||
|
while (pos < len) {
|
||||||
|
rv = mScanner->Next(utext, len, pos, PR_TRUE, &begin, &end, &gotUnit);
|
||||||
|
if (NS_SUCCEEDED(rv) && gotUnit) {
|
||||||
|
NS_ConvertUCS2toUTF8 utfUnit(utext + begin, end - begin);
|
||||||
|
add(utfUnit.get());
|
||||||
|
// advance to end of current unit.
|
||||||
|
pos = end;
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -506,7 +549,7 @@ public:
|
|||||||
virtual void analyzeTokens(Tokenizer& tokenizer)
|
virtual void analyzeTokens(Tokenizer& tokenizer)
|
||||||
{
|
{
|
||||||
mFilter->classifyMessage(tokenizer, mTokenSource.get(), mListener);
|
mFilter->classifyMessage(tokenizer, mTokenSource.get(), mListener);
|
||||||
tokenizer.Clear();
|
tokenizer.clearTokens();
|
||||||
classifyNextMessage();
|
classifyNextMessage();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -41,6 +41,7 @@
|
|||||||
|
|
||||||
#include "nsCOMPtr.h"
|
#include "nsCOMPtr.h"
|
||||||
#include "nsIMsgFilterPlugin.h"
|
#include "nsIMsgFilterPlugin.h"
|
||||||
|
#include "nsISemanticUnitScanner.h"
|
||||||
#include "pldhash.h"
|
#include "pldhash.h"
|
||||||
|
|
||||||
// XXX can't simply byte align arenas, must at least 2-byte align.
|
// XXX can't simply byte align arenas, must at least 2-byte align.
|
||||||
@ -51,6 +52,7 @@ class Token;
|
|||||||
class TokenEnumeration;
|
class TokenEnumeration;
|
||||||
class TokenAnalyzer;
|
class TokenAnalyzer;
|
||||||
class nsIMsgWindow;
|
class nsIMsgWindow;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Helper class to enumerate Token objects in a PLDHashTable
|
* Helper class to enumerate Token objects in a PLDHashTable
|
||||||
* safely and without copying (see bugzilla #174859). The
|
* safely and without copying (see bugzilla #174859). The
|
||||||
@ -73,7 +75,6 @@ public:
|
|||||||
Tokenizer();
|
Tokenizer();
|
||||||
~Tokenizer();
|
~Tokenizer();
|
||||||
|
|
||||||
nsresult Clear(); // clears out the previous message tokens
|
|
||||||
operator int() { return mTokenTable.entryStore != NULL; }
|
operator int() { return mTokenTable.entryStore != NULL; }
|
||||||
|
|
||||||
Token* get(const char* word);
|
Token* get(const char* word);
|
||||||
@ -84,6 +85,11 @@ public:
|
|||||||
Token* copyTokens();
|
Token* copyTokens();
|
||||||
TokenEnumeration getTokens();
|
TokenEnumeration getTokens();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Clears out the previous message tokens.
|
||||||
|
*/
|
||||||
|
nsresult clearTokens();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Assumes that text is mutable and
|
* Assumes that text is mutable and
|
||||||
* can be nsCRT::strtok'd.
|
* can be nsCRT::strtok'd.
|
||||||
@ -106,6 +112,7 @@ private:
|
|||||||
private:
|
private:
|
||||||
PLDHashTable mTokenTable;
|
PLDHashTable mTokenTable;
|
||||||
PLArenaPool mWordPool;
|
PLArenaPool mWordPool;
|
||||||
|
nsCOMPtr<nsISemanticUnitScanner> mScanner;
|
||||||
};
|
};
|
||||||
|
|
||||||
class nsBayesianFilter : public nsIJunkMailPlugin {
|
class nsBayesianFilter : public nsIJunkMailPlugin {
|
||||||
|
Loading…
Reference in New Issue
Block a user