Fix for bug #183109, r=dmose, sr=sfraser, a=asa.

2024-10-16 23:05:42 +00:00 · 2002-12-07 00:42:48 +00:00 · 2002-12-07 00:42:48 +00:00 · 377eb1b1ff
commit 377eb1b1ff
parent 24bb3f3bcc
2 changed files with 54 additions and 4 deletions
--- a/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.cpp
+++ b/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.cpp
@@ -162,7 +162,7 @@ Tokenizer::~Tokenizer()
    PL_FinishArenaPool(&mWordPool);
 }

-nsresult Tokenizer::Clear()
+nsresult Tokenizer::clearTokens()
 {
    // we re-use the tokenizer when classifying multiple messages, 
    // so this gets called after every message classification.
@@ -244,6 +244,17 @@ static PRBool isDecimalNumber(const char* word)
    return PR_TRUE;
 }

+static PRBool isASCII(const char* word) // PR_TRUE iff every byte before the terminating NUL is <= 127
+{
+    const unsigned char* p = (const unsigned char*)word; // read bytes as unsigned so the > 127 test is meaningful
+    unsigned char c;
+    while ((c = *p++)) { // stops at the NUL terminator; an empty string falls through to PR_TRUE
+        if (c > 127) // high bit set: not a 7-bit ASCII byte
+            return PR_FALSE;
+    }
+    return PR_TRUE;
+}
+
 inline PRBool isUpperCase(char c) { return ('A' <= c) && (c <= 'Z'); }

 static char* toLowerCase(char* str)
@@ -263,7 +274,39 @@ void Tokenizer::tokenize(char* text)
    while ((word = nsCRT::strtok(next, kBayesianFilterTokenDelimiters, &next)) != NULL) {
        if (word[0] == '\0') continue;
        if (isDecimalNumber(word)) continue;
-        add(toLowerCase(word));
+        if (isASCII(word))
+            add(toLowerCase(word));
+        else {
+            nsresult rv;
+            // use the I18N scanner to break this word into meaningful semantic units.
+            if (!mScanner) {
+                mScanner = do_CreateInstance(NS_SEMANTICUNITSCANNER_CONTRACTID, &rv);
+                NS_ASSERTION(NS_SUCCEEDED(rv), "couldn't create semantic unit scanner!");
+                if (NS_FAILED(rv)) {
+                    return;
+                }
+            }
+            if (mScanner) {
+                mScanner->Start("UTF-8");
+                // convert this word from UTF-8 into UCS2.
+                NS_ConvertUTF8toUCS2 uword(word);
+                ToLowerCase(uword);
+                const PRUnichar* utext = uword.get();
+                PRInt32 len = uword.Length(), pos = 0, begin, end;
+                PRBool gotUnit;
+                while (pos < len) {
+                    rv = mScanner->Next(utext, len, pos, PR_TRUE, &begin, &end, &gotUnit);
+                    if (NS_SUCCEEDED(rv) && gotUnit) {
+                        NS_ConvertUCS2toUTF8 utfUnit(utext + begin, end - begin);
+                        add(utfUnit.get());
+                        // advance to end of current unit.
+                        pos = end;
+                    } else {
+                        break;
+                    }
+                }
+            }
+        }
    }
 }

@@ -506,7 +549,7 @@ public:
    virtual void analyzeTokens(Tokenizer& tokenizer)
    {
        mFilter->classifyMessage(tokenizer, mTokenSource.get(), mListener);
-        tokenizer.Clear();
+        tokenizer.clearTokens(); // the tokenizer is re-used across messages, so drop this message's tokens first
        classifyNextMessage();
    }

--- a/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.h
+++ b/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.h
@@ -41,6 +41,7 @@

 #include "nsCOMPtr.h"
 #include "nsIMsgFilterPlugin.h"
+#include "nsISemanticUnitScanner.h"
 #include "pldhash.h"

 // XXX can't simply byte align arenas, must at least 2-byte align.
@@ -51,6 +52,7 @@ class Token;
 class TokenEnumeration;
 class TokenAnalyzer;
 class nsIMsgWindow;
+
 /**
 * Helper class to enumerate Token objects in a PLDHashTable
 * safely and without copying (see bugzilla #174859). The
@@ -73,7 +75,6 @@ public:
    Tokenizer();
    ~Tokenizer();

-    nsresult Clear(); // clears out the previous message tokens
    operator int() { return mTokenTable.entryStore != NULL; }
    
    Token* get(const char* word);
@@ -84,6 +85,11 @@ public:
    Token* copyTokens();
    TokenEnumeration getTokens();

+    /**
+     * Clears out the previous message tokens.
+     */
+    nsresult clearTokens();
+
    /**
     * Assumes that text is mutable and
     * can be nsCRT::strtok'd.
@@ -106,6 +112,7 @@ private:
 private:
    PLDHashTable mTokenTable;
    PLArenaPool mWordPool;
+    nsCOMPtr<nsISemanticUnitScanner> mScanner;
 };

 class nsBayesianFilter : public nsIJunkMailPlugin {