Fix for bug #183109, r=dmose, sr=sfraser, a=asa.

2024-10-16 23:05:42 +00:00 · 2002-12-07 00:42:48 +00:00 · 2002-12-07 00:42:48 +00:00 · 377eb1b1ff
commit 377eb1b1ff
parent 24bb3f3bcc
2 changed files with 54 additions and 4 deletions
--- a/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.cpp
+++ b/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.cpp
@ -162,7 +162,7 @@ Tokenizer::~Tokenizer()
    PL_FinishArenaPool(&mWordPool);
 }
-nsresult Tokenizer::Clear()
+nsresult Tokenizer::clearTokens()
 {
    // we re-use the tokenizer when classifying multiple messages, 
    // so this gets called after every message classification.
@ -244,6 +244,17 @@ static PRBool isDecimalNumber(const char* word)
    return PR_TRUE;
 }
 /**
  * Reports whether every byte of the NUL-terminated string |word| lies in
  * the 7-bit ASCII range (0..127). An empty string yields PR_TRUE.
  */
 static PRBool isASCII(const char* word)
 {
     // Inspect the bytes as unsigned so values above 127 never look negative.
     for (const unsigned char* cursor = (const unsigned char*)word; *cursor != 0; ++cursor) {
         if (*cursor >= 128)
             return PR_FALSE;
     }
     return PR_TRUE;
 }
 // True only when |c| is one of the ASCII capital letters 'A' through 'Z'.
 inline PRBool isUpperCase(char c)
 {
     return (c >= 'A') && (c <= 'Z');
 }
 static char* toLowerCase(char* str)
@ -263,7 +274,39 @@ void Tokenizer::tokenize(char* text)
    // Walk the text one delimiter-separated word at a time. nsCRT::strtok is
    // handed |next| both as the string to scan and as the place to store the
    // continuation point, so each iteration resumes where the last one stopped.
    while ((word = nsCRT::strtok(next, kBayesianFilterTokenDelimiters, &next)) != NULL) {
        // Empty words and words recognised by isDecimalNumber() are not tokens.
        if (word[0] == '\0') continue;
        if (isDecimalNumber(word)) continue;
        // Pure-ASCII words are lower-cased and added as a single token.
        if (isASCII(word))
            add(toLowerCase(word));
        else {
            nsresult rv;
            // use I18N  scanner to break this word into meaningful semantic units.
            // The scanner is created on first use and kept in mScanner afterwards.
            if (!mScanner) {
                mScanner = do_CreateInstance(NS_SEMANTICUNITSCANNER_CONTRACTID, &rv);
                NS_ASSERTION(NS_SUCCEEDED(rv), "couldn't create semantic unit scanner!");
                if (NS_FAILED(rv)) {
                    // NOTE(review): this leaves tokenize() entirely, so any words
                    // still remaining in the text (ASCII ones included) are not
                    // added -- confirm that dropping them is intended.
                    return;
                }
            }
            // NOTE(review): re-checks mScanner even after a successful rv above;
            // presumably guards against a null instance -- confirm.
            if (mScanner) {
                // NOTE(review): the result of Start() is not checked.
                mScanner->Start("UTF-8");
                // convert this word from UTF-8 into UCS2.
                NS_ConvertUTF8toUCS2 uword(word);
                ToLowerCase(uword);
                const PRUnichar* utext = uword.get();
                PRInt32 len = uword.Length(), pos = 0, begin, end;
                PRBool gotUnit;
                // Request units starting at |pos| until the whole word has been
                // consumed, the scanner reports an error, or no unit is returned.
                while (pos < len) {
                    rv = mScanner->Next(utext, len, pos, PR_TRUE, &begin, &end, &gotUnit);
                    if (NS_SUCCEEDED(rv) && gotUnit) {
                        // Convert the [begin, end) slice back to UTF-8 and add it
                        // as a token of its own.
                        NS_ConvertUCS2toUTF8 utfUnit(utext + begin, end - begin);
                        add(utfUnit.get());
                        // advance to end of current unit.
                        pos = end;
                    } else {
                        // Stop scanning this word; the outer loop continues with
                        // the next word.
                        break;
                    }
                }
            }
        }
    }
 }
@ -506,7 +549,7 @@ public:
    // Hands the tokens to mFilter->classifyMessage() together with the token
    // source and listener, clears the tokenizer's tokens, and then calls
    // classifyNextMessage().
    virtual void analyzeTokens(Tokenizer& tokenizer)
    {
        mFilter->classifyMessage(tokenizer, mTokenSource.get(), mListener);
-        tokenizer.Clear();
+        tokenizer.clearTokens();
        classifyNextMessage();
    }
--- a/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.h
+++ b/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.h
@ -41,6 +41,7 @@
 #include "nsCOMPtr.h"
 #include "nsIMsgFilterPlugin.h"
 #include "nsISemanticUnitScanner.h"
 #include "pldhash.h"
 // XXX can't simply byte align arenas, must at least 2-byte align.
@ -51,6 +52,7 @@ class Token;
 class TokenEnumeration;
 class TokenAnalyzer;
 class nsIMsgWindow;
 /**
 * Helper class to enumerate Token objects in a PLDHashTable
 * safely and without copying (see bugzilla #174859). The
@ -73,7 +75,6 @@ public:
    Tokenizer();
    ~Tokenizer();
    nsresult Clear(); // clears out the previous message tokens
    operator int() { return mTokenTable.entryStore != NULL; }
    Token* get(const char* word);
@ -84,6 +85,11 @@ public:
    Token* copyTokens();
    TokenEnumeration getTokens();
    /**
     * Clears out the previous message tokens.
     */
    nsresult clearTokens();
    /**
     * Assumes that text is mutable and
     * can be nsCRT::strtok'd.
@ -106,6 +112,7 @@ private:
 private:
    PLDHashTable mTokenTable;
    PLArenaPool mWordPool;
    nsCOMPtr<nsISemanticUnitScanner> mScanner;
 };
 class nsBayesianFilter : public nsIJunkMailPlugin {