#162894 Extend universal detector's coverage to include iso-8859-1

Added latin1 prober.
r=smontagu, sr=jst, a=asa
This commit is contained in:
shanjian%netscape.com 2005-11-02 16:57:06 +00:00
parent 2adb8cf305
commit f062e7c7ba
4 changed files with 279 additions and 2 deletions

View File

@ -0,0 +1,206 @@
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
* Version: NPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Netscape Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/NPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
*
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the NPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the NPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include "nsLatin1Prober.h"
#include "prmem.h"
// Character classes assigned to each byte value by Latin1_CharToClass.
// The class of the previous byte and the class of the current byte together
// select an entry in Latin1ClassModel, so CLASS_NUM must stay equal to the
// number of classes listed here.
#define UDF 0 // undefined: byte value with no character assigned
#define OTH 1 // other: anything that is not a letter (digits, punctuation, controls, symbols)
#define ASC 2 // ascii capital letter
#define ASS 3 // ascii small letter
#define ACV 4 // accent capital vowel
#define ACO 5 // accent capital other
#define ASV 6 // accent small vowel
#define ASO 7 // accent small other
#define CLASS_NUM 8 // total classes; also the row width of Latin1ClassModel
// Maps every possible byte value (0x00 - 0xFF) to one of the character
// classes defined above. The table has exactly 256 entries, so it must be
// indexed with an unsigned byte value.
// NOTE(review): the 0x80 - 0x9F rows appear to follow the windows-1252
// layout (the charset name this prober reports) rather than the ISO-8859-1
// control range -- confirm against the code page chart before editing.
static unsigned char Latin1_CharToClass[] =
{
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 10 - 17
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 18 - 1F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 20 - 27
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 28 - 2F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 30 - 37
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 38 - 3F
OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 40 - 47
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 48 - 4F
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 50 - 57
ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, // 58 - 5F
OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 60 - 67
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 68 - 6F
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 70 - 77
ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, // 78 - 7F
OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, // 80 - 87
OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, // 88 - 8F
UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 90 - 97
OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, // 98 - 9F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A0 - A7
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A8 - AF
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B0 - B7
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B8 - BF
ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, // C0 - C7
ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, // C8 - CF
ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, // D0 - D7
ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, // D8 - DF
ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, // E0 - E7
ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, // E8 - EF
ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, // F0 - F7
ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, // F8 - FF
};
/* Likelihood category for each (previous class, current class) pair.
   The table is stored row-major and read as
     Latin1ClassModel[previousClass * CLASS_NUM + currentClass]
   so each row below is the class of the previous byte and each column is
   the class of the current byte.

   Category values (also the index into the prober's frequency counters):
   0 : illegal        - the sequence cannot occur; the prober gives up
   1 : very unlikely
   2 : normal
   3 : very likely
*/
static char Latin1ClassModel[] =
{
/* UDF OTH ASC ASS ACV ACO ASV ASO */
/*UDF*/ 0, 0, 0, 0, 0, 0, 0, 0,
/*OTH*/ 0, 3, 3, 3, 3, 3, 3, 3,
/*ASC*/ 0, 3, 3, 3, 3, 3, 3, 3,
/*ASS*/ 0, 3, 3, 3, 1, 1, 3, 3,
/*ACV*/ 0, 3, 3, 3, 1, 2, 1, 2,
/*ACO*/ 0, 3, 3, 3, 3, 3, 3, 3,
/*ASV*/ 0, 3, 1, 3, 1, 1, 1, 3,
/*ASO*/ 0, 3, 1, 3, 1, 1, 3, 3,
};
// Return the prober to its initial state: every frequency counter is
// cleared, the "previous character" is treated as a non-letter (OTH), and
// the state goes back to eDetecting.
void nsLatin1Prober::Reset(void)
{
  // Clear the per-category counters, walking from the last slot down.
  int slot = FREQ_CAT_NUM;
  while (slot-- > 0)
    mFreqCounter[slot] = 0;

  mLastCharClass = OTH;
  mState = eDetecting;
}
// Build a reduced copy of the input so the prober has less data to score.
//
// The input is split at every ASCII byte that is not a letter (A-Z, a-z).
// Each run of letters / high-bit bytes that precedes such a separator is
// copied to the output followed by a single space, unless the separator is
// seen while inside a '<' ... '>' tag, in which case the run is discarded.
// Separators themselves are never copied.
//
// A run that reaches the end of the buffer without a terminating separator
// is copied as well (without a trailing space) as long as we are not inside
// a tag; otherwise input such as a bare word would produce no output at all.
//
// aBuf, aLen - input bytes (not modified).
// newBuf     - receives a buffer obtained from PR_MALLOC(aLen); HandleData
//              releases it with PR_FREEIF. Set to null when allocation fails.
// newLen     - receives the number of bytes written to *newBuf. It is left
//              untouched when the function fails.
//
// Returns PR_TRUE on success and PR_FALSE when the buffer could not be
// allocated. The output never exceeds aLen bytes: every added space takes
// the place of one dropped separator.
PRBool nsLatin1Prober::FilterWithEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen)
{
  char *newptr;
  const char *prevPtr, *curPtr;
  const char *endPtr = aBuf + aLen;
  PRBool isInTag = PR_FALSE;

  newptr = *newBuf = (char*)PR_MALLOC(aLen);
  if (!newptr)
    return PR_FALSE;

  for (curPtr = prevPtr = aBuf; curPtr < endPtr; curPtr++)
  {
    if (*curPtr == '>')
      isInTag = PR_FALSE;
    else if (*curPtr == '<')
      isInTag = PR_TRUE;

    // Separator: a 7-bit byte that is not an English letter.
    if (!(*curPtr & 0x80) &&
        (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') )
    {
      if (curPtr > prevPtr && !isInTag) // current segment contains more than just a symbol
                                        // and it is not inside a tag, keep it
      {
        while (prevPtr < curPtr) *newptr++ = *prevPtr++;
        *newptr++ = ' ';
      }
      // In either case the next segment starts right after this separator.
      prevPtr = curPtr + 1;
    }
  }

  // Flush the final segment: it has no terminating separator, so the loop
  // above never emitted it.
  if (!isInTag)
    while (prevPtr < endPtr) *newptr++ = *prevPtr++;

  newLen = newptr - *newBuf;
  return PR_TRUE;
}
// Feed one chunk of bytes to the prober.
//
// The chunk is first reduced by FilterWithEnglishLetters; if that fails
// (it returns PR_FALSE when its buffer cannot be allocated) the raw input is
// scored instead. Every byte is mapped to a character class, and the pair
// (previous class, current class) is looked up in Latin1ClassModel:
//   - category 0 means the sequence is illegal: the state becomes eNotMe
//     and scanning stops;
//   - otherwise the counter for that category is incremented.
// The class of the last byte is remembered in mLastCharClass so the next
// call continues where this one stopped.
//
// Returns the prober state after processing the chunk.
nsProbingState nsLatin1Prober::HandleData(const char* aBuf, PRUint32 aLen)
{
  char *newBuf1;
  PRUint32 newLen1;

  if (!FilterWithEnglishLetters(aBuf, aLen, &newBuf1, newLen1)) {
    newBuf1 = (char*)aBuf;
    newLen1 = aLen;
  }

  unsigned char charClass;
  unsigned char freq;
  for (PRUint32 i = 0; i < newLen1; i++)
  {
    // Plain char may be signed: bytes >= 0x80 would then become negative
    // indices and read outside the 256-entry table. Index by the unsigned
    // byte value instead.
    charClass = Latin1_CharToClass[(unsigned char)newBuf1[i]];
    freq = Latin1ClassModel[mLastCharClass*CLASS_NUM + charClass];
    if (freq == 0) {
      mState = eNotMe;
      break;
    }
    mFreqCounter[freq]++;
    mLastCharClass = charClass;
  }

  // Only release the buffer if the filter actually allocated one.
  if (newBuf1 != aBuf)
    PR_FREEIF(newBuf1);

  return mState;
}
// Compute how confident the prober is that the data seen so far is Latin-1.
//
// Returns 0.01 once the prober has ruled itself out (eNotMe), and 0.0 when
// no byte pair has been counted yet. Otherwise the score is the share of
// "very likely" pairs, minus a 20x penalty for the share of "very unlikely"
// pairs, clamped at 0 and finally scaled by 0.60.
float nsLatin1Prober::GetConfidence(void)
{
  if (mState == eNotMe)
    return 0.01f;

  PRUint32 total = 0;
  for (PRInt32 i = 0; i < FREQ_CAT_NUM; i++)
    total += mFreqCounter[i];

  // Nothing counted yet: dividing by zero below would produce NaN, which
  // slips past the "< 0" clamp and would be handed back to the caller.
  if (total == 0)
    return 0.0f;

  float confidence = mFreqCounter[3]*1.0f / total;
  confidence -= mFreqCounter[1]*20.0f/total;

  if (confidence < 0.0f)
    confidence = 0.0f;

  // lower the confidence of latin1 so that other more accurate detector
  // can take priority.
  confidence *= 0.60f;

  return confidence;
}

View File

@ -0,0 +1,67 @@
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
* Version: NPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Netscape Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/NPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
*
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the NPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the NPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef nsLatin1Prober_h__
#define nsLatin1Prober_h__
#include "nsCharSetProber.h"
// Number of likelihood categories tracked by the prober; sizes the
// mFreqCounter array of nsLatin1Prober.
#define FREQ_CAT_NUM 4
// Charset prober that reports "windows-1252". It keeps a running state, the
// character class of the last byte it saw, and one counter per frequency
// category; the scoring itself lives in the .cpp file.
class nsLatin1Prober : public nsCharSetProber
{
public:
  // A freshly constructed prober starts in the same state Reset() produces.
  nsLatin1Prober(void) { Reset(); }
  virtual ~nsLatin1Prober(void) {}

  // Processes one chunk of input and returns the resulting prober state.
  nsProbingState HandleData(const char* aBuf, PRUint32 aLen);

  // Name of the charset this prober stands for.
  const char* GetCharSetName() { return "windows-1252"; }

  // Current prober state, as last set by Reset() or HandleData().
  nsProbingState GetState(void) { return mState; }

  // Puts the prober back into its initial state.
  void Reset(void);

  // Confidence score derived from the frequency counters.
  float GetConfidence(void);

  // Intentionally empty.
  // NOTE(review): the spelling "SetOpion" presumably mirrors the
  // nsCharSetProber interface -- do not rename here without checking it.
  void SetOpion() {}

protected:
  // Produces a reduced copy of the input for HandleData to score.
  PRBool FilterWithEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen);

  nsProbingState mState;                // current detection state
  char mLastCharClass;                  // character class of the previous byte
  PRUint32 mFreqCounter[FREQ_CAT_NUM];  // one counter per frequency category
};
#endif /* nsLatin1Prober_h__ */

View File

@ -52,6 +52,7 @@
#include "nsMBCSGroupProber.h"
#include "nsSBCSGroupProber.h"
#include "nsEscCharsetProber.h"
#include "nsLatin1Prober.h"
static NS_DEFINE_CID(kUniversalDetectorCID, NS_UNIVERSAL_DETECTOR_CID);
static NS_DEFINE_CID(kUniversalStringDetectorCID, NS_UNIVERSAL_STRING_DETECTOR_CID);
@ -156,7 +157,10 @@ void nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
if (nsnull == mCharSetProbers[0])
mCharSetProbers[0] = new nsMBCSGroupProber;
if (nsnull == mCharSetProbers[1])
mCharSetProbers[1] = new nsSBCSGroupProber; }
mCharSetProbers[1] = new nsSBCSGroupProber;
if (nsnull == mCharSetProbers[2])
mCharSetProbers[2] = new nsLatin1Prober;
}
}
else
{

View File

@ -56,7 +56,7 @@
class nsCharSetProber;
#define NUM_OF_CHARSET_PROBERS 2
#define NUM_OF_CHARSET_PROBERS 3
typedef enum {
ePureAscii = 0,