#109913 universal charset detector improvement: use positive approach

r=yokoyama, sr=brendan
This commit is contained in:
shanjian%netscape.com 2001-12-03 23:33:12 +00:00
parent 4306539df1
commit 7a8f056a0b
14 changed files with 122 additions and 56 deletions

View File

@ -231,6 +231,7 @@ SequenceModel Latin5BulgarianModel =
{
Latin5_BulgarianCharToOrderMap,
BulgarianLangModel,
(float)0.969392,
PR_FALSE,
"ISO-8859-5"
};
@ -239,6 +240,7 @@ SequenceModel Win1251BulgarianModel =
{
win1251BulgarianCharToOrderMap,
BulgarianLangModel,
(float)0.969392,
PR_FALSE,
"windows-1251"
};

View File

@ -163,6 +163,11 @@ unsigned char IBM866_CharToOrderMap[] =
};
//Model Table:
//total sequences: 100%
//first 512 sequences: 97.6601%
//first 1024 sequences: 2.3389%
//rest sequences: 0.1237%
//negative sequences: 0.0009%
char RussianLangModel[] =
{
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3,
@ -300,6 +305,7 @@ SequenceModel Koi8rModel =
{
KOI8R_CharToOrderMap,
RussianLangModel,
(float)0.976601,
PR_FALSE,
"KOI8-R"
};
@ -308,6 +314,7 @@ SequenceModel Win1251Model =
{
win1251_CharToOrderMap,
RussianLangModel,
(float)0.976601,
PR_FALSE,
"windows-1251"
};
@ -316,6 +323,7 @@ SequenceModel Latin5Model =
{
latin5_CharToOrderMap,
RussianLangModel,
(float)0.976601,
PR_FALSE,
"ISO-8859-5"
};
@ -324,6 +332,7 @@ SequenceModel MacCyrillicModel =
{
macCyrillic_CharToOrderMap,
RussianLangModel,
(float)0.976601,
PR_FALSE,
"x-mac-cyrillic"
};
@ -332,6 +341,7 @@ SequenceModel Ibm866Model =
{
IBM866_CharToOrderMap,
RussianLangModel,
(float)0.976601,
PR_FALSE,
"IBM866"
};
@ -340,6 +350,7 @@ SequenceModel Ibm855Model =
{
IBM855_CharToOrderMap,
RussianLangModel,
(float)0.976601,
PR_FALSE,
"IBM855"
};

View File

@ -230,6 +230,7 @@ SequenceModel Latin7Model =
{
Latin7_CharToOrderMap,
GreekLangModel,
(float)0.982851,
PR_FALSE,
"ISO-8859-7"
};
@ -238,6 +239,7 @@ SequenceModel Win1253Model =
{
win1253_CharToOrderMap,
GreekLangModel,
(float)0.982851,
PR_FALSE,
"windows-1253"
};

View File

@ -228,6 +228,7 @@ SequenceModel Latin2HungarianModel =
{
Latin2_HungarianCharToOrderMap,
HungarianLangModel,
(float)0.947368,
PR_TRUE,
"ISO-8859-2"
};
@ -236,6 +237,7 @@ SequenceModel Win1250HungarianModel =
{
win1250HungarianCharToOrderMap,
HungarianLangModel,
(float)0.947368,
PR_TRUE,
"windows-1250"
};

View File

@ -216,6 +216,7 @@ SequenceModel TIS620ThaiModel =
{
TIS620CharToOrderMap,
ThaiLangModel,
(float)0.926386,
PR_FALSE,
"TIS-620"
};

View File

@ -56,6 +56,10 @@ public:
virtual void Reset(void) = 0;
virtual float GetConfidence(void) = 0;
virtual void SetOpion() = 0;
#ifdef DEBUG_chardet
// Debug-only status hook; it exists only when DEBUG_chardet is defined.
// The default implementation does nothing.
virtual void DumpStatus() {}
#endif
};
#endif /* nsCharSetProber_h__ */

View File

@ -134,9 +134,6 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
{
mBestGuess = i;
mState = eFoundIt;
#ifdef DEBUG_chardet
printf("MBCS Prober found charset %d in HandleData. \r\n", i);
#endif
break;
}
else if (st == eNotMe)
@ -180,19 +177,26 @@ float nsMBCSGroupProber::GetConfidence(void)
}
}
}
#ifdef DEBUG_chardet
printf("MBCS Prober confidence is %f in charset %d . \r\n", bestConf, mBestGuess);
for (i = 0; i < NUM_OF_PROBERS; i++)
{
if (!mIsActive[i])
printf("[%s] is inactive\r\n", ProberName[i], i);
else
{
cf = mProbers[i]->GetConfidence();
printf("[%s] detector has confidence %f\r\n", ProberName[i], cf);
}
}
#endif
return bestConf;
}
#ifdef DEBUG_chardet
// Debug-only: prints one line per multi-byte prober, reporting either that
// the prober is inactive or the confidence it currently returns.
void
nsMBCSGroupProber::DumpStatus()
{
  // The group confidence is evaluated first; its return value is not used here.
  GetConfidence();

  for (PRUint32 idx = 0; idx < NUM_OF_PROBERS; ++idx)
  {
    if (mIsActive[idx])
    {
      const float confidence = mProbers[idx]->GetConfidence();
      printf("[%s] prober has confidence %f\r\n", ProberName[idx], confidence);
      continue;
    }
    printf("[%s] is inactive(ie. cofidence is too low).\r\n", ProberName[idx]);
  }
}
#endif

View File

@ -60,6 +60,10 @@ public:
float GetConfidence(void);
void SetOpion() {};
#ifdef DEBUG_chardet
void DumpStatus();
#endif
protected:
nsProbingState mState;
nsCharSetProber* mProbers[NUM_OF_PROBERS];

View File

@ -51,12 +51,12 @@ nsSBCSGroupProber::nsSBCSGroupProber()
mProbers[3] = new nsSingleByteCharSetProber(&MacCyrillicModel);
mProbers[4] = new nsSingleByteCharSetProber(&Ibm866Model);
mProbers[5] = new nsSingleByteCharSetProber(&Ibm855Model);
mProbers[6] = new nsSingleByteCharSetProber(&Win1253Model);
mProbers[7] = new nsSingleByteCharSetProber(&Latin7Model);
mProbers[8] = new nsSingleByteCharSetProber(&Win1251BulgarianModel);
mProbers[9] = new nsSingleByteCharSetProber(&Latin5BulgarianModel);
mProbers[10] = new nsSingleByteCharSetProber(&Win1250HungarianModel);
mProbers[11] = new nsSingleByteCharSetProber(&Latin2HungarianModel);
mProbers[6] = new nsSingleByteCharSetProber(&Latin7Model);
mProbers[7] = new nsSingleByteCharSetProber(&Win1253Model);
mProbers[8] = new nsSingleByteCharSetProber(&Latin5BulgarianModel);
mProbers[9] = new nsSingleByteCharSetProber(&Win1251BulgarianModel);
mProbers[10] = new nsSingleByteCharSetProber(&Latin2HungarianModel);
mProbers[11] = new nsSingleByteCharSetProber(&Win1250HungarianModel);
Reset();
}
@ -194,9 +194,6 @@ nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
{
mBestGuess = i;
mState = eFoundIt;
#ifdef DEBUG_chardet
printf("MBCS Prober found charset %d in HandleData. \r\n", i);
#endif
break;
}
else if (st == eNotMe)
@ -240,19 +237,26 @@ float nsSBCSGroupProber::GetConfidence(void)
}
}
}
#ifdef DEBUG_chardet
printf("SBCS Group Prober confidence is %f in charset %d . \r\n", bestConf, mBestGuess);
for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
{
if (!mIsActive[i])
printf("[%s] is inactive\r\n", mProbers[i]->GetCharSetName(), i);
else
{
cf = mProbers[i]->GetConfidence();
printf("[%s] detector has confidence %f\r\n", mProbers[i]->GetCharSetName(), cf);
}
}
#endif
return bestConf;
}
#ifdef DEBUG_chardet
// Debug-only: prints the status of every single-byte prober in the group,
// then the charset name of the best guess together with the group confidence.
void
nsSBCSGroupProber::DumpStatus()
{
  PRUint32 i;
  float cf;

  // GetConfidence() is called before mBestGuess is read for the summary line.
  cf = GetConfidence();
  printf(" SBCS Group Prober --------begin status \r\n");
  for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
  {
    // The format string has a single %s conversion, so only the charset
    // name is passed as an argument.
    if (!mIsActive[i])
      printf("[%s] is inactive(ie. cofidence is too low).\r\n", mProbers[i]->GetCharSetName());
    else
      mProbers[i]->DumpStatus();  // active probers print their own status line
  }
  // NOTE(review): assumes mBestGuess is a valid index into mProbers here — confirm.
  printf(" SBCS Group found best match [%s] confidence %f.\r\n",
         mProbers[mBestGuess]->GetCharSetName(), cf);
}
#endif

View File

@ -56,6 +56,10 @@ public:
float GetConfidence(void);
void SetOpion() {};
#ifdef DEBUG_chardet
void DumpStatus();
#endif
protected:
nsProbingState mState;
nsSingleByteCharSetProber* mProbers[NUM_OF_SBCS_PROBERS];

View File

@ -35,6 +35,7 @@
* the terms of any one of the NPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include <stdio.h>
#include "nsSBCharSetProber.h"
nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32 aLen)
@ -53,9 +54,8 @@ nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32
if (mLastOrder < SAMPLE_SIZE)
{
if (mModel->precedenceMatrix[mLastOrder*SAMPLE_SIZE+order] == 0)
mNegativeSeqs++;
mTotalSeqs++;
++(mSeqCounters[mModel->precedenceMatrix[mLastOrder*SAMPLE_SIZE+order]]);
}
}
mLastOrder = order;
@ -78,16 +78,40 @@ void nsSingleByteCharSetProber::Reset(void)
{
mState = eDetecting;
mLastOrder = 255;
for (PRUint32 i = 0; i < NUMBER_OF_SEQ_CAT; i++)
mSeqCounters[i] = 0;
mTotalSeqs = 0;
mNegativeSeqs = 0;
mTotalChar = 0;
mFreqChar = 0;
}
//#define NEGATIVE_APPROACH 1
// Returns this prober's confidence as a float computed from the sequence and
// character counters. Which formula is compiled depends on whether
// NEGATIVE_APPROACH is defined (its #define above is commented out).
float nsSingleByteCharSetProber::GetConfidence(void)
{
#ifdef NEGATIVE_APPROACH
// Negative approach: confidence shrinks as negative sequences are counted
// (weighted by 10), then is scaled by mFreqChar / mTotalChar.
// NOTE(review): the two consecutive threshold checks below look like the old
// (mNegativeSeqs) and new (mSeqCounters[NEGATIVE_CAT]) versions of the same
// test left side by side — confirm which one is intended.
if (mTotalSeqs > 0)
if (mTotalSeqs > mNegativeSeqs*10 )
return ((float)(mTotalSeqs - mNegativeSeqs*10))/mTotalSeqs * mFreqChar / mTotalChar;
if (mTotalSeqs > mSeqCounters[NEGATIVE_CAT]*10 )
return ((float)(mTotalSeqs - mSeqCounters[NEGATIVE_CAT]*10))/mTotalSeqs * mFreqChar / mTotalChar;
return (float)0.01;
#else //POSITIVE_APPROACH
float r;
if (mTotalSeqs > 0) {
// Share of positive-category sequences among all counted sequences, divided
// by the model's typical positive ratio and multiplied by 1.2.
r = ((float)1.2) * mSeqCounters[POSITIVE_CAT] / mTotalSeqs / mModel->mTypicalPositiveRatio;
// Scale by the ratio of mFreqChar to mTotalChar.
r = r*mFreqChar/mTotalChar;
// Cap the result: values of 1.00 or more are reported as 0.99.
if (r >= (float)1.00)
r = (float)0.99;
return r;
}
// No sequences counted yet: return the fixed low value 0.01.
return (float)0.01;
#endif
}
#ifdef DEBUG_chardet
// Debug-only: prints this prober's charset name and its current confidence.
void
nsSingleByteCharSetProber::DumpStatus()
{
  const char* charsetName = GetCharSetName();
  const float confidence = GetConfidence();
  printf("[%s] prober has confidence %f\r\n", charsetName, confidence);
}
#endif

View File

@ -45,11 +45,15 @@
// NOTE(review): presumably confidence cut-offs for an early accept/reject
// decision; their use is outside this view — confirm.
#define POSITIVE_SHORTCUT_THRESHOLD (float)0.95
#define NEGATIVE_SHORTCUT_THRESHOLD (float)0.05
// NOTE(review): looks like the character-order value reserved for symbols —
// confirm against HandleData.
#define SYMBOL_CAT_ORDER 250
// Number of sequence categories; this is the size of the mSeqCounters array.
#define NUMBER_OF_SEQ_CAT 4
// Highest category index; GetConfidence() reads this counter as the positive count.
#define POSITIVE_CAT (NUMBER_OF_SEQ_CAT-1)
// Category index 0; used as the negative-sequence counter.
#define NEGATIVE_CAT 0
// Describes one language model / charset pairing used by the single-byte prober.
// Member order is significant: the model tables are brace-initialized positionally.
struct SequenceModel
{
  unsigned char *charToOrderMap;  // [256] lookup table giving a character's order
  char *precedenceMatrix;         // [SAMPLE_SIZE][SAMPLE_SIZE] table giving a 2-char sequence's frequency category
  float mTypicalPositiveRatio;    // = freqSeqs / totalSeqs
  PRBool keepEnglishLetter;       // whether this script contains Latin letters
  const char* charsetName;        // charset label, e.g. "KOI8-R"
};
@ -66,6 +70,10 @@ public:
void SetOpion() {};
// Reports the keepEnglishLetter flag of the model this prober was given.
PRBool KeepEnglishLetters()
{
  return mModel->keepEnglishLetter;
}
#ifdef DEBUG_chardet
void DumpStatus();
#endif
protected:
nsProbingState mState;
SequenceModel *mModel;
@ -74,7 +82,7 @@ protected:
unsigned char mLastOrder;
PRUint32 mTotalSeqs;
PRUint32 mNegativeSeqs;
PRUint32 mSeqCounters[NUMBER_OF_SEQ_CAT];
PRUint32 mTotalChar;
//characters that fall in our sampling range

View File

@ -191,9 +191,6 @@ void nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
{
mDone = PR_TRUE;
mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
#ifdef DEBUG_chardet
printf(" %d Prober found charset %s in HandleData. \r\n", i, mCharSetProbers[i]->GetCharSetName());
#endif
return;
}
}
@ -220,9 +217,6 @@ void nsUniversalDetector::DataEnd()
{
mDone = PR_TRUE;
Report(mDetectedCharset);
#ifdef DEBUG_chardet
printf("New Charset Prober found charset %s in HandleData. \r\n", mDetectedCharset);
#endif
return;
}
@ -237,9 +231,9 @@ void nsUniversalDetector::DataEnd()
for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
{
proberConfidence = mCharSetProbers[i]->GetConfidence();
#ifdef DEBUG_chardet
printf("%d Prober has confidence %f in charset %s in DataEnd. \r\n", i, proberConfidence, mCharSetProbers[i]->GetCharSetName());
#endif
#ifdef DEBUG_chardet
mCharSetProbers[i]->DumpStatus();
#endif
if (proberConfidence > maxProberConfidence)
{
@ -324,7 +318,9 @@ void nsUniversalXPCOMDetector::Report(const char* aCharset)
{
NS_ASSERTION(mObserver != nsnull , "have not init yet");
#ifdef DEBUG_chardet
printf("New Charset Prober report charset %s . \r\n", aCharset);
printf("Universal Charset Detector report charset %s . \r\n", aCharset);
for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
mCharSetProbers[i]->DumpStatus();
#endif
mObserver->Notify(aCharset, eBestAnswer);
}

View File

@ -119,10 +119,10 @@ int main(int argc, char** argv) {
}
nsresult rev = NS_OK;
nsICharsetDetector *det = nsnull;
rev = GetDetector("all_charset_detector", &det);
rev = GetDetector("universal_charset_detector", &det);
if(NS_FAILED(rev) || (nsnull == det) ){
usage();
printf("Could not find All Detector\n");
printf("Error: Could not find Universal Detector\n");
printf("XPCOM ERROR CODE = %x\n", rev);
return(-1);
}