Bug 176528: need an nsISemanticUnitScanner to support the international (intl) spam mail filter

Implement a new interface for this purpose.
win&linux, p=ftang, r=shanjian, sr=beard
mac build, p=nhotta, r=ftang, sr=beard
This commit is contained in:
shanjian%netscape.com 2002-11-22 22:36:42 +00:00
parent 729bf8f659
commit 3b0054ca0b
9 changed files with 228 additions and 13 deletions

View File

@ -815,6 +815,7 @@ sub BuildClientDist()
#LWBRK
InstallFromManifest(":mozilla:intl:lwbrk:public:MANIFEST", "$distdirectory:lwbrk:");
InstallFromManifest(":mozilla:intl:lwbrk:idl:MANIFEST_IDL", "$distdirectory:idl:");
#STRRES
InstallFromManifest(":mozilla:intl:strres:public:MANIFEST_IDL", "$distdirectory:idl:");
@ -1507,6 +1508,7 @@ sub BuildIDLProjects()
BuildIDLProject(":mozilla:intl:unicharutil:macbuild:unicharutilIDL.xml", "unicharutil");
BuildIDLProject(":mozilla:intl:uconv:macbuild:uconvIDL.xml", "uconv");
BuildIDLProject(":mozilla:intl:chardet:macbuild:chardetIDL.xml", "chardet");
BuildIDLProject(":mozilla:intl:lwbrk:macbuild:lwbrkIDL.xml", "lwbrk");
if ($main::options{iiextras})
{

View File

@ -46,6 +46,7 @@
// lwbrk
#include "nsLWBrkConstructors.h"
#include "nsSemanticUnitScanner.h"
// unicharutil
#include "nsUcharUtilConstructors.h"
@ -56,6 +57,9 @@
// locale
#include "nsLocaleConstructors.h"
NS_GENERIC_FACTORY_CONSTRUCTOR(nsSemanticUnitScanner);
static NS_METHOD
AddCategoryEntry(const char* category,
const char* key,
@ -290,6 +294,8 @@ static nsModuleComponentInfo components[] =
// lwbrk
{ "Line and Word Breaker", NS_LWBRK_CID,
NS_LWBRK_CONTRACTID, nsLWBreakerFImpConstructor},
{ "Semantic Unit Scanner", NS_SEMANTICUNITSCANNER_CID,
NS_SEMANTICUNITSCANNER_CONTRACTID, nsSemanticUnitScannerConstructor},
// unicharutil
{ "Unichar Utility", NS_UNICHARUTIL_CID,

View File

@ -26,7 +26,7 @@ VPATH = @srcdir@
include $(DEPTH)/config/autoconf.mk
DIRS = public src
DIRS = idl public src
ifdef ENABLE_TESTS
DIRS += tests

View File

@ -42,6 +42,7 @@ CPPSRCS = \
nsJISx4501LineBreaker.cpp \
nsLWBreakerFImp.cpp \
nsSampleWordBreaker.cpp \
nsSemanticUnitScanner.cpp \
$(NULL)
include $(topsrcdir)/config/rules.mk

View File

@ -74,18 +74,6 @@ nsresult nsSampleWordBreaker::BreakInBetween(
}
// hack
typedef enum {
kWbClassSpace = 0,
kWbClassAlphaLetter,
kWbClassPunct,
kWbClassHanLetter,
kWbClassKatakanaLetter,
kWbClassHiraganaLetter,
kWbClassHWKatakanaLetter,
kWbClassThaiLetter
} wb_class;
#define IS_ASCII(c) (0 == ( 0xFF80 & (c)))
#define ASCII_IS_ALPHA(c) ((( 'a' <= (c)) && ((c) <= 'z')) || (( 'A' <= (c)) && ((c) <= 'Z')))
#define ASCII_IS_DIGIT(c) (( '0' <= (c)) && ((c) <= '9'))

View File

@ -41,6 +41,17 @@
#include "nsIWordBreaker.h"
// Character classes used by the sample word breaker.
// nsSemanticUnitScanner compares the result of nsSampleWordBreaker::GetClass()
// against these values (kWbClassHanLetter, kWbClassSpace, kWbClassPunct), which
// is why the enum is declared in this header.
// NOTE(review): the enumerator names suggest one class per script / character
// category; the exact code point ranges are decided by GetClass() -- confirm
// there before relying on them.
typedef enum {
// Numbering starts explicitly at 0; the remaining values follow sequentially.
kWbClassSpace = 0,
kWbClassAlphaLetter,
kWbClassPunct,
kWbClassHanLetter,
kWbClassKatakanaLetter,
kWbClassHiraganaLetter,
kWbClassHWKatakanaLetter,
kWbClassThaiLetter
} wb_class;
class nsSampleWordBreaker : public nsIWordBreaker
{
NS_DECL_ISUPPORTS

View File

@ -0,0 +1,119 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
* Version: NPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Netscape Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/NPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
*
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the NPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the NPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include "nsSemanticUnitScanner.h"
#include "prmem.h"
// nsISupports implementation for the scanner; nsISemanticUnitScanner is the
// single interface named here.
NS_IMPL_ISUPPORTS1(nsSemanticUnitScanner, nsISemanticUnitScanner)
// Default constructor: constructs the nsSampleWordBreaker base and runs the
// nsISupports initialization macro. The scanner has no state of its own.
nsSemanticUnitScanner::nsSemanticUnitScanner() : nsSampleWordBreaker()
{
NS_INIT_ISUPPORTS();
/* member initializers and constructor code */
}
// Destructor: intentionally empty, there is nothing to release here.
nsSemanticUnitScanner::~nsSemanticUnitScanner()
{
/* destructor code */
}
/* void start (in string characterSet); */
// Begins a scan. The characterSet argument is currently ignored and the call
// always succeeds with NS_OK.
// NOTE(review): presumably characterSet is meant to select language-specific
// behavior later (see the Chinese/Japanese/Korean remark in Next) -- confirm
// against the interface definition.
NS_IMETHODIMP nsSemanticUnitScanner::Start(const char *characterSet)
{
// do nothing for now.
return NS_OK;
}
/* boolean next (in wstring text, in long length, in long pos,
 *               in boolean isLastBuffer, out long begin, out long end);
 *
 * Finds the next semantic unit in text[0, length), starting the search at pos.
 *
 * @param text          buffer to scan; may only be null when there is nothing
 *                      left to scan (pos >= length)
 * @param length        number of characters available in text
 * @param pos           offset at which to start looking for a unit
 * @param isLastBuffer  true when no further text will follow this buffer, so a
 *                      word that runs up to the end of the buffer is complete
 * @param begin         receives the offset of the first character of the unit
 * @param end           receives the offset just past the last character
 * @param _retval       PR_TRUE when [*begin, *end) is a unit, PR_FALSE when no
 *                      (complete) unit could be found from pos onward; in that
 *                      case *begin == *end and marks where scanning stopped
 *
 * @return NS_OK on success (whether or not a unit was found),
 *         NS_ERROR_NULL_POINTER when a required pointer is null,
 *         NS_ERROR_INVALID_ARG when pos is negative,
 *         or the failure code reported by nsSampleWordBreaker::Next.
 */
NS_IMETHODIMP nsSemanticUnitScanner::Next(const PRUnichar *text, PRInt32 length, PRInt32 pos, PRBool isLastBuffer, PRInt32 *begin, PRInt32 *end, PRBool *_retval)
{
  // Every path below writes through the out parameters, so none may be null.
  if (!begin || !end || !_retval)
    return NS_ERROR_NULL_POINTER;

  // A negative position would index in front of the buffer.
  if (pos < 0)
    return NS_ERROR_INVALID_ARG;

  // text is only dereferenced when there is something left to scan.
  if (!text && pos < length)
    return NS_ERROR_NULL_POINTER;

  // Runs of spaces and punctuation are skipped by advancing pos and going
  // around again. A loop is used instead of self-recursion so that text with
  // many alternating separator runs cannot grow the stack.
  for (;;) {
    // if we reach the end, just return
    if (pos >= length) {
      *begin = pos;
      *end = pos;
      *_retval = PR_FALSE;
      return NS_OK;
    }

    PRUint8 char_class = nsSampleWordBreaker::GetClass(text[pos]);

    // if we are in Chinese mode, return one Han letter at a time
    // we should not do this if we are in Japanese or Korean mode
    if (kWbClassHanLetter == char_class) {
      *begin = pos;
      *end = pos + 1;
      *_retval = PR_TRUE;
      return NS_OK;
    }

    PRUint32 next;
    PRBool needMoreText;
    // find the next "word"
    nsresult res = nsSampleWordBreaker::Next(text, (PRUint32) length, (PRUint32) pos,
                                             &next, &needMoreText);
    NS_ASSERTION(NS_SUCCEEDED(res), "nsSampleWordBreaker::Next failed");
    if (NS_FAILED(res))
      return res;

    const PRBool isSeparator =
      (char_class == kWbClassSpace) || (char_class == kWbClassPunct);

    // The word breaker could not find the end of the current run inside this
    // buffer.
    if (needMoreText) {
      // When no more text is coming, the run from pos to the end of the
      // buffer is a complete word and must be reported; otherwise the last
      // word of the text would never be returned. Trailing separators are
      // not units, so they still yield "no more units".
      if (isLastBuffer && !isSeparator) {
        *begin = pos;
        *end = length;
        *_retval = PR_TRUE;
        return NS_OK;
      }
      // Not enough text to decide: report nothing and leave pos untouched so
      // the caller can retry from here once more text is available.
      *begin = pos;
      *end = pos;
      *_retval = PR_FALSE;
      return NS_OK;
    }

    // if what we got is space or punct, look at the next break
    if (isSeparator) {
      pos = (PRInt32) next;
      continue;
    }

    // for the rest, return
    *begin = pos;
    *end = (PRInt32) next;
    *_retval = PR_TRUE;
    return NS_OK;
  }
}

View File

@ -0,0 +1,58 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
* Version: NPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Netscape Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/NPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
*
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the NPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the NPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef nsSemanticUnitScanner_h__
#define nsSemanticUnitScanner_h__
#include "nsSampleWordBreaker.h"
#include "nsISemanticUnitScanner.h"
// Implementation of nsISemanticUnitScanner built on top of
// nsSampleWordBreaker: the implementation calls the base class's GetClass()
// and Next() to classify characters and locate word boundaries.
class nsSemanticUnitScanner : public nsISemanticUnitScanner
, public nsSampleWordBreaker
{
public:
// nsISupports and nsISemanticUnitScanner method declarations.
NS_DECL_ISUPPORTS
NS_DECL_NSISEMANTICUNITSCANNER
nsSemanticUnitScanner();
virtual ~nsSemanticUnitScanner();
/* additional members */
};
#endif

View File

@ -1243,6 +1243,13 @@
<FILEKIND>Text</FILEKIND>
<FILEFLAGS>Debug</FILEFLAGS>
</FILE>
<FILE>
<PATHTYPE>Name</PATHTYPE>
<PATH>nsSemanticUnitScanner.cpp</PATH>
<PATHFORMAT>MacOS</PATHFORMAT>
<FILEKIND>Text</FILEKIND>
<FILEFLAGS>Debug</FILEFLAGS>
</FILE>
</FILELIST>
<LINKORDER>
<FILEREF>
@ -1435,6 +1442,11 @@
<PATH>nsEntityConverter.cpp</PATH>
<PATHFORMAT>MacOS</PATHFORMAT>
</FILEREF>
<FILEREF>
<PATHTYPE>Name</PATHTYPE>
<PATH>nsSemanticUnitScanner.cpp</PATH>
<PATHFORMAT>MacOS</PATHFORMAT>
</FILEREF>
</LINKORDER>
</TARGET>
<TARGET>
@ -2627,6 +2639,13 @@
<FILEKIND>Text</FILEKIND>
<FILEFLAGS>Debug</FILEFLAGS>
</FILE>
<FILE>
<PATHTYPE>Name</PATHTYPE>
<PATH>nsSemanticUnitScanner.cpp</PATH>
<PATHFORMAT>MacOS</PATHFORMAT>
<FILEKIND>Text</FILEKIND>
<FILEFLAGS>Debug</FILEFLAGS>
</FILE>
</FILELIST>
<LINKORDER>
<FILEREF>
@ -2819,6 +2838,11 @@
<PATH>nsEntityConverter.cpp</PATH>
<PATHFORMAT>MacOS</PATHFORMAT>
</FILEREF>
<FILEREF>
<PATHTYPE>Name</PATHTYPE>
<PATH>nsSemanticUnitScanner.cpp</PATH>
<PATHFORMAT>MacOS</PATHFORMAT>
</FILEREF>
</LINKORDER>
</TARGET>
</TARGETLIST>
@ -3034,6 +3058,12 @@
<PATH>rulebrk.c</PATH>
<PATHFORMAT>MacOS</PATHFORMAT>
</FILEREF>
<FILEREF>
<TARGETNAME>i18n.shlb</TARGETNAME>
<PATHTYPE>Name</PATHTYPE>
<PATH>nsSemanticUnitScanner.cpp</PATH>
<PATHFORMAT>MacOS</PATHFORMAT>
</FILEREF>
</GROUP>
<GROUP><NAME>strres</NAME>
<FILEREF>