From 7a8f056a0b72f38385032fda375e8ffc64b4b05f Mon Sep 17 00:00:00 2001 From: "shanjian%netscape.com" Date: Mon, 3 Dec 2001 23:33:12 +0000 Subject: [PATCH] #109913 universal charset detector improvement: use positive approach r=yokoyama, sr=brendan --- .../src/LangBulgarianModel.cpp | 2 + .../src/LangCyrillicModel.cpp | 11 +++++ .../universalchardet/src/LangGreekModel.cpp | 2 + .../src/LangHungarianModel.cpp | 2 + .../universalchardet/src/LangThaiModel.cpp | 1 + .../universalchardet/src/nsCharSetProber.h | 4 ++ .../src/nsMBCSGroupProber.cpp | 36 +++++++------- .../universalchardet/src/nsMBCSGroupProber.h | 4 ++ .../src/nsSBCSGroupProber.cpp | 48 ++++++++++--------- .../universalchardet/src/nsSBCSGroupProber.h | 4 ++ .../src/nsSBCharSetProber.cpp | 34 +++++++++++-- .../universalchardet/src/nsSBCharSetProber.h | 10 +++- .../src/nsUniversalDetector.cpp | 16 +++---- .../tests/UniversalChardetTest.cpp | 4 +- 14 files changed, 122 insertions(+), 56 deletions(-) diff --git a/extensions/universalchardet/src/LangBulgarianModel.cpp b/extensions/universalchardet/src/LangBulgarianModel.cpp index 737b5baa2364..fd55efe2beaa 100644 --- a/extensions/universalchardet/src/LangBulgarianModel.cpp +++ b/extensions/universalchardet/src/LangBulgarianModel.cpp @@ -231,6 +231,7 @@ SequenceModel Latin5BulgarianModel = { Latin5_BulgarianCharToOrderMap, BulgarianLangModel, + (float)0.969392, PR_FALSE, "ISO-8859-5" }; @@ -239,6 +240,7 @@ SequenceModel Win1251BulgarianModel = { win1251BulgarianCharToOrderMap, BulgarianLangModel, + (float)0.969392, PR_FALSE, "windows-1251" }; diff --git a/extensions/universalchardet/src/LangCyrillicModel.cpp b/extensions/universalchardet/src/LangCyrillicModel.cpp index 09e4dd4694ec..54bdb582a954 100644 --- a/extensions/universalchardet/src/LangCyrillicModel.cpp +++ b/extensions/universalchardet/src/LangCyrillicModel.cpp @@ -163,6 +163,11 @@ unsigned char IBM866_CharToOrderMap[] = }; //Model Table: +//total sequences: 100% +//first 512 sequences: 97.6601% +//first 
1024 sequences: 2.3389% +//rest sequences: 0.1237% +//negative sequences: 0.0009% char RussianLangModel[] = { 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3, @@ -300,6 +305,7 @@ SequenceModel Koi8rModel = { KOI8R_CharToOrderMap, RussianLangModel, + (float)0.976601, PR_FALSE, "KOI8-R" }; @@ -308,6 +314,7 @@ SequenceModel Win1251Model = { win1251_CharToOrderMap, RussianLangModel, + (float)0.976601, PR_FALSE, "windows-1251" }; @@ -316,6 +323,7 @@ SequenceModel Latin5Model = { latin5_CharToOrderMap, RussianLangModel, + (float)0.976601, PR_FALSE, "ISO-8859-5" }; @@ -324,6 +332,7 @@ SequenceModel MacCyrillicModel = { macCyrillic_CharToOrderMap, RussianLangModel, + (float)0.976601, PR_FALSE, "x-mac-cyrillic" }; @@ -332,6 +341,7 @@ SequenceModel Ibm866Model = { IBM866_CharToOrderMap, RussianLangModel, + (float)0.976601, PR_FALSE, "IBM866" }; @@ -340,6 +350,7 @@ SequenceModel Ibm855Model = { IBM855_CharToOrderMap, RussianLangModel, + (float)0.976601, PR_FALSE, "IBM855" }; diff --git a/extensions/universalchardet/src/LangGreekModel.cpp b/extensions/universalchardet/src/LangGreekModel.cpp index 18f430fe2fc1..18d6a938878f 100644 --- a/extensions/universalchardet/src/LangGreekModel.cpp +++ b/extensions/universalchardet/src/LangGreekModel.cpp @@ -230,6 +230,7 @@ SequenceModel Latin7Model = { Latin7_CharToOrderMap, GreekLangModel, + (float)0.982851, PR_FALSE, "ISO-8859-7" }; @@ -238,6 +239,7 @@ SequenceModel Win1253Model = { win1253_CharToOrderMap, GreekLangModel, + (float)0.982851, PR_FALSE, "windows-1253" }; diff --git a/extensions/universalchardet/src/LangHungarianModel.cpp b/extensions/universalchardet/src/LangHungarianModel.cpp index 5c5c03c9788d..31ab69a1e2d1 100644 --- a/extensions/universalchardet/src/LangHungarianModel.cpp +++ b/extensions/universalchardet/src/LangHungarianModel.cpp @@ -228,6 +228,7 @@ SequenceModel Latin2HungarianModel = { Latin2_HungarianCharToOrderMap, HungarianLangModel, + (float)0.947368, PR_TRUE, "ISO-8859-2" }; @@ -236,6 +237,7 @@ 
SequenceModel Win1250HungarianModel = { win1250HungarianCharToOrderMap, HungarianLangModel, + (float)0.947368, PR_TRUE, "windows-1250" }; diff --git a/extensions/universalchardet/src/LangThaiModel.cpp b/extensions/universalchardet/src/LangThaiModel.cpp index 584e9a721cc3..c534def074f9 100644 --- a/extensions/universalchardet/src/LangThaiModel.cpp +++ b/extensions/universalchardet/src/LangThaiModel.cpp @@ -216,6 +216,7 @@ SequenceModel TIS620ThaiModel = { TIS620CharToOrderMap, ThaiLangModel, + (float)0.926386, PR_FALSE, "TIS-620" }; diff --git a/extensions/universalchardet/src/nsCharSetProber.h b/extensions/universalchardet/src/nsCharSetProber.h index cc71cbe2fde7..67b143b69a57 100644 --- a/extensions/universalchardet/src/nsCharSetProber.h +++ b/extensions/universalchardet/src/nsCharSetProber.h @@ -56,6 +56,10 @@ public: virtual void Reset(void) = 0; virtual float GetConfidence(void) = 0; virtual void SetOpion() = 0; + +#ifdef DEBUG_chardet + virtual void DumpStatus() {}; +#endif }; #endif /* nsCharSetProber_h__ */ diff --git a/extensions/universalchardet/src/nsMBCSGroupProber.cpp b/extensions/universalchardet/src/nsMBCSGroupProber.cpp index 6482b9bcbe22..40fb243d2ef3 100644 --- a/extensions/universalchardet/src/nsMBCSGroupProber.cpp +++ b/extensions/universalchardet/src/nsMBCSGroupProber.cpp @@ -134,9 +134,6 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen) { mBestGuess = i; mState = eFoundIt; -#ifdef DEBUG_chardet - printf("MBCS Prober found charset %d in HandleData. \r\n", i); -#endif break; } else if (st == eNotMe) @@ -180,19 +177,26 @@ float nsMBCSGroupProber::GetConfidence(void) } } } -#ifdef DEBUG_chardet - printf("MBCS Prober confidence is %f in charset %d . 
\r\n", bestConf, mBestGuess); - for (i = 0; i < NUM_OF_PROBERS; i++) - { - if (!mIsActive[i]) - printf("[%s] is inactive\r\n", ProberName[i], i); - else - { - cf = mProbers[i]->GetConfidence(); - printf("[%s] detector has confidence %f\r\n", ProberName[i], cf); - } - } -#endif return bestConf; } +#ifdef DEBUG_chardet +void +nsMBCSGroupProber::DumpStatus() +{ + PRUint32 i; + float cf; + + GetConfidence(); + for (i = 0; i < NUM_OF_PROBERS; i++) + { + if (!mIsActive[i]) + printf("[%s] is inactive(ie. cofidence is too low).\r\n", ProberName[i]); + else + { + cf = mProbers[i]->GetConfidence(); + printf("[%s] prober has confidence %f\r\n", ProberName[i], cf); + } + } +} +#endif diff --git a/extensions/universalchardet/src/nsMBCSGroupProber.h b/extensions/universalchardet/src/nsMBCSGroupProber.h index 7e084507363c..3e2ceceef69e 100644 --- a/extensions/universalchardet/src/nsMBCSGroupProber.h +++ b/extensions/universalchardet/src/nsMBCSGroupProber.h @@ -60,6 +60,10 @@ public: float GetConfidence(void); void SetOpion() {}; +#ifdef DEBUG_chardet + void DumpStatus(); +#endif + protected: nsProbingState mState; nsCharSetProber* mProbers[NUM_OF_PROBERS]; diff --git a/extensions/universalchardet/src/nsSBCSGroupProber.cpp b/extensions/universalchardet/src/nsSBCSGroupProber.cpp index ec115c0def82..f0d1351dddc2 100644 --- a/extensions/universalchardet/src/nsSBCSGroupProber.cpp +++ b/extensions/universalchardet/src/nsSBCSGroupProber.cpp @@ -51,12 +51,12 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[3] = new nsSingleByteCharSetProber(&MacCyrillicModel); mProbers[4] = new nsSingleByteCharSetProber(&Ibm866Model); mProbers[5] = new nsSingleByteCharSetProber(&Ibm855Model); - mProbers[6] = new nsSingleByteCharSetProber(&Win1253Model); - mProbers[7] = new nsSingleByteCharSetProber(&Latin7Model); - mProbers[8] = new nsSingleByteCharSetProber(&Win1251BulgarianModel); - mProbers[9] = new nsSingleByteCharSetProber(&Latin5BulgarianModel); - mProbers[10] = new 
nsSingleByteCharSetProber(&Win1250HungarianModel); - mProbers[11] = new nsSingleByteCharSetProber(&Latin2HungarianModel); + mProbers[6] = new nsSingleByteCharSetProber(&Latin7Model); + mProbers[7] = new nsSingleByteCharSetProber(&Win1253Model); + mProbers[8] = new nsSingleByteCharSetProber(&Latin5BulgarianModel); + mProbers[9] = new nsSingleByteCharSetProber(&Win1251BulgarianModel); + mProbers[10] = new nsSingleByteCharSetProber(&Latin2HungarianModel); + mProbers[11] = new nsSingleByteCharSetProber(&Win1250HungarianModel); Reset(); } @@ -194,9 +194,6 @@ nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen) { mBestGuess = i; mState = eFoundIt; -#ifdef DEBUG_chardet - printf("MBCS Prober found charset %d in HandleData. \r\n", i); -#endif break; } else if (st == eNotMe) @@ -240,19 +237,26 @@ float nsSBCSGroupProber::GetConfidence(void) } } } -#ifdef DEBUG_chardet - printf("SBCS Group Prober confidence is %f in charset %d . \r\n", bestConf, mBestGuess); - for (i = 0; i < NUM_OF_SBCS_PROBERS; i++) - { - if (!mIsActive[i]) - printf("[%s] is inactive\r\n", mProbers[i]->GetCharSetName(), i); - else - { - cf = mProbers[i]->GetConfidence(); - printf("[%s] detector has confidence %f\r\n", mProbers[i]->GetCharSetName(), cf); - } - } -#endif return bestConf; } +#ifdef DEBUG_chardet +void +nsSBCSGroupProber::DumpStatus() +{ + PRUint32 i; + float cf; + + cf = GetConfidence(); + printf("SBCS Group Prober --------begin status \r\n"); + for (i = 0; i < NUM_OF_SBCS_PROBERS; i++) + { + if (!mIsActive[i]) + printf("[%s] is inactive(ie. 
cofidence is too low).\r\n", mProbers[i]->GetCharSetName(), i); + else + mProbers[i]->DumpStatus(); + } + printf("SBCS Group found best match [%s] confidence %f.\r\n", + mProbers[mBestGuess]->GetCharSetName(), cf); +} +#endif diff --git a/extensions/universalchardet/src/nsSBCSGroupProber.h b/extensions/universalchardet/src/nsSBCSGroupProber.h index f6aab3b11040..b238a6afc192 100644 --- a/extensions/universalchardet/src/nsSBCSGroupProber.h +++ b/extensions/universalchardet/src/nsSBCSGroupProber.h @@ -56,6 +56,10 @@ public: float GetConfidence(void); void SetOpion() {}; +#ifdef DEBUG_chardet + void DumpStatus(); +#endif + protected: nsProbingState mState; nsSingleByteCharSetProber* mProbers[NUM_OF_SBCS_PROBERS]; diff --git a/extensions/universalchardet/src/nsSBCharSetProber.cpp b/extensions/universalchardet/src/nsSBCharSetProber.cpp index f37ceeff8146..b57b46176fb2 100644 --- a/extensions/universalchardet/src/nsSBCharSetProber.cpp +++ b/extensions/universalchardet/src/nsSBCharSetProber.cpp @@ -35,6 +35,7 @@ * the terms of any one of the NPL, the GPL or the LGPL. 
* * ***** END LICENSE BLOCK ***** */ +#include <stdio.h> #include "nsSBCharSetProber.h" nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32 aLen) @@ -53,9 +54,8 @@ nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32 if (mLastOrder < SAMPLE_SIZE) { - if (mModel->precedenceMatrix[mLastOrder*SAMPLE_SIZE+order] == 0) - mNegativeSeqs++; mTotalSeqs++; + ++(mSeqCounters[mModel->precedenceMatrix[mLastOrder*SAMPLE_SIZE+order]]); } } mLastOrder = order; @@ -78,16 +78,40 @@ void nsSingleByteCharSetProber::Reset(void) { mState = eDetecting; mLastOrder = 255; + for (PRUint32 i = 0; i < NUMBER_OF_SEQ_CAT; i++) + mSeqCounters[i] = 0; mTotalSeqs = 0; - mNegativeSeqs = 0; mTotalChar = 0; mFreqChar = 0; } +//#define NEGATIVE_APPROACH 1 + float nsSingleByteCharSetProber::GetConfidence(void) { +#ifdef NEGATIVE_APPROACH if (mTotalSeqs > 0) - if (mTotalSeqs > mNegativeSeqs*10 ) - return ((float)(mTotalSeqs - mNegativeSeqs*10))/mTotalSeqs * mFreqChar / mTotalChar; + if (mTotalSeqs > mSeqCounters[NEGATIVE_CAT]*10 ) + return ((float)(mTotalSeqs - mSeqCounters[NEGATIVE_CAT]*10))/mTotalSeqs * mFreqChar / mTotalChar; return (float)0.01; +#else //POSITIVE_APPROACH + float r; + + if (mTotalSeqs > 0) { + r = ((float)1.2) * mSeqCounters[POSITIVE_CAT] / mTotalSeqs / mModel->mTypicalPositiveRatio; + r = r*mFreqChar/mTotalChar; + if (r >= (float)1.00) + r = (float)0.99; + return r; + } + return (float)0.01; +#endif } + +#ifdef DEBUG_chardet +void +nsSingleByteCharSetProber::DumpStatus() +{ + printf("[%s] prober has confidence %f\r\n", GetCharSetName(), GetConfidence()); +} +#endif diff --git a/extensions/universalchardet/src/nsSBCharSetProber.h b/extensions/universalchardet/src/nsSBCharSetProber.h index 7781124fe718..7f3b98522eb4 100644 --- a/extensions/universalchardet/src/nsSBCharSetProber.h +++ b/extensions/universalchardet/src/nsSBCharSetProber.h @@ -45,11 +45,15 @@ #define POSITIVE_SHORTCUT_THRESHOLD (float)0.95 #define NEGATIVE_SHORTCUT_THRESHOLD 
(float)0.05 #define SYMBOL_CAT_ORDER 250 +#define NUMBER_OF_SEQ_CAT 4 +#define POSITIVE_CAT (NUMBER_OF_SEQ_CAT-1) +#define NEGATIVE_CAT 0 typedef struct { unsigned char *charToOrderMap; //[256] table use to find a char's order char *precedenceMatrix; //[SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency + float mTypicalPositiveRatio; // = freqSeqs / totalSeqs PRBool keepEnglishLetter; //it says if this script contains latin letters const char* charsetName; } SequenceModel; @@ -66,6 +70,10 @@ public: void SetOpion() {}; PRBool KeepEnglishLetters() {return mModel->keepEnglishLetter;}; +#ifdef DEBUG_chardet + void DumpStatus(); +#endif + protected: nsProbingState mState; SequenceModel *mModel; @@ -74,7 +82,7 @@ protected: unsigned char mLastOrder; PRUint32 mTotalSeqs; - PRUint32 mNegativeSeqs; + PRUint32 mSeqCounters[NUMBER_OF_SEQ_CAT]; PRUint32 mTotalChar; //characters that fall in our sampling range diff --git a/extensions/universalchardet/src/nsUniversalDetector.cpp b/extensions/universalchardet/src/nsUniversalDetector.cpp index 213a5a2edde3..50b2921ccbd1 100644 --- a/extensions/universalchardet/src/nsUniversalDetector.cpp +++ b/extensions/universalchardet/src/nsUniversalDetector.cpp @@ -191,9 +191,6 @@ void nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) { mDone = PR_TRUE; mDetectedCharset = mCharSetProbers[i]->GetCharSetName(); - #ifdef DEBUG_chardet - printf(" %d Prober found charset %s in HandleData. \r\n", i, mCharSetProbers[i]->GetCharSetName()); - #endif return; } } @@ -220,9 +217,6 @@ void nsUniversalDetector::DataEnd() { mDone = PR_TRUE; Report(mDetectedCharset); -#ifdef DEBUG_chardet - printf("New Charset Prober found charset %s in HandleData. 
\r\n", mDetectedCharset); -#endif return; } @@ -237,9 +231,9 @@ void nsUniversalDetector::DataEnd() for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++) { proberConfidence = mCharSetProbers[i]->GetConfidence(); - #ifdef DEBUG_chardet - printf("%d Prober has confidence %f in charset %s in DataEnd. \r\n", i, proberConfidence, mCharSetProbers[i]->GetCharSetName()); - #endif +#ifdef DEBUG_chardet + mCharSetProbers[i]->DumpStatus(); +#endif if (proberConfidence > maxProberConfidence) { @@ -324,7 +318,9 @@ void nsUniversalXPCOMDetector::Report(const char* aCharset) { NS_ASSERTION(mObserver != nsnull , "have not init yet"); #ifdef DEBUG_chardet - printf("New Charset Prober report charset %s . \r\n", aCharset); + printf("Universal Charset Detector report charset %s . \r\n", aCharset); + for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++) + mCharSetProbers[i]->DumpStatus(); #endif mObserver->Notify(aCharset, eBestAnswer); } diff --git a/extensions/universalchardet/tests/UniversalChardetTest.cpp b/extensions/universalchardet/tests/UniversalChardetTest.cpp index a66e99cc7a92..6cec0e45cbdd 100644 --- a/extensions/universalchardet/tests/UniversalChardetTest.cpp +++ b/extensions/universalchardet/tests/UniversalChardetTest.cpp @@ -119,10 +119,10 @@ int main(int argc, char** argv) { } nsresult rev = NS_OK; nsICharsetDetector *det = nsnull; - rev = GetDetector("all_charset_detector", &det); + rev = GetDetector("universal_charset_detector", &det); if(NS_FAILED(rev) || (nsnull == det) ){ usage(); - printf("Could not find All Detector\n"); + printf("Error: Could not find Universal Detector\n"); printf("XPCOM ERROR CODE = %x\n", rev); return(-1); }