mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-10-09 03:15:11 +00:00
#109913 universal charset detector improvement: use positive approach
r=yokoyama, sr=brendan
This commit is contained in:
parent
4306539df1
commit
7a8f056a0b
@ -231,6 +231,7 @@ SequenceModel Latin5BulgarianModel =
|
||||
{
|
||||
Latin5_BulgarianCharToOrderMap,
|
||||
BulgarianLangModel,
|
||||
(float)0.969392,
|
||||
PR_FALSE,
|
||||
"ISO-8859-5"
|
||||
};
|
||||
@ -239,6 +240,7 @@ SequenceModel Win1251BulgarianModel =
|
||||
{
|
||||
win1251BulgarianCharToOrderMap,
|
||||
BulgarianLangModel,
|
||||
(float)0.969392,
|
||||
PR_FALSE,
|
||||
"windows-1251"
|
||||
};
|
||||
|
@ -163,6 +163,11 @@ unsigned char IBM866_CharToOrderMap[] =
|
||||
};
|
||||
|
||||
//Model Table:
|
||||
//total sequences: 100%
|
||||
//first 512 sequences: 97.6601%
|
||||
//first 1024 sequences: 2.3389%
|
||||
//rest sequences: 0.1237%
|
||||
//negative sequences: 0.0009%
|
||||
char RussianLangModel[] =
|
||||
{
|
||||
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3,
|
||||
@ -300,6 +305,7 @@ SequenceModel Koi8rModel =
|
||||
{
|
||||
KOI8R_CharToOrderMap,
|
||||
RussianLangModel,
|
||||
(float)0.976601,
|
||||
PR_FALSE,
|
||||
"KOI8-R"
|
||||
};
|
||||
@ -308,6 +314,7 @@ SequenceModel Win1251Model =
|
||||
{
|
||||
win1251_CharToOrderMap,
|
||||
RussianLangModel,
|
||||
(float)0.976601,
|
||||
PR_FALSE,
|
||||
"windows-1251"
|
||||
};
|
||||
@ -316,6 +323,7 @@ SequenceModel Latin5Model =
|
||||
{
|
||||
latin5_CharToOrderMap,
|
||||
RussianLangModel,
|
||||
(float)0.976601,
|
||||
PR_FALSE,
|
||||
"ISO-8859-5"
|
||||
};
|
||||
@ -324,6 +332,7 @@ SequenceModel MacCyrillicModel =
|
||||
{
|
||||
macCyrillic_CharToOrderMap,
|
||||
RussianLangModel,
|
||||
(float)0.976601,
|
||||
PR_FALSE,
|
||||
"x-mac-cyrillic"
|
||||
};
|
||||
@ -332,6 +341,7 @@ SequenceModel Ibm866Model =
|
||||
{
|
||||
IBM866_CharToOrderMap,
|
||||
RussianLangModel,
|
||||
(float)0.976601,
|
||||
PR_FALSE,
|
||||
"IBM866"
|
||||
};
|
||||
@ -340,6 +350,7 @@ SequenceModel Ibm855Model =
|
||||
{
|
||||
IBM855_CharToOrderMap,
|
||||
RussianLangModel,
|
||||
(float)0.976601,
|
||||
PR_FALSE,
|
||||
"IBM855"
|
||||
};
|
||||
|
@ -230,6 +230,7 @@ SequenceModel Latin7Model =
|
||||
{
|
||||
Latin7_CharToOrderMap,
|
||||
GreekLangModel,
|
||||
(float)0.982851,
|
||||
PR_FALSE,
|
||||
"ISO-8859-7"
|
||||
};
|
||||
@ -238,6 +239,7 @@ SequenceModel Win1253Model =
|
||||
{
|
||||
win1253_CharToOrderMap,
|
||||
GreekLangModel,
|
||||
(float)0.982851,
|
||||
PR_FALSE,
|
||||
"windows-1253"
|
||||
};
|
||||
|
@ -228,6 +228,7 @@ SequenceModel Latin2HungarianModel =
|
||||
{
|
||||
Latin2_HungarianCharToOrderMap,
|
||||
HungarianLangModel,
|
||||
(float)0.947368,
|
||||
PR_TRUE,
|
||||
"ISO-8859-2"
|
||||
};
|
||||
@ -236,6 +237,7 @@ SequenceModel Win1250HungarianModel =
|
||||
{
|
||||
win1250HungarianCharToOrderMap,
|
||||
HungarianLangModel,
|
||||
(float)0.947368,
|
||||
PR_TRUE,
|
||||
"windows-1250"
|
||||
};
|
||||
|
@ -216,6 +216,7 @@ SequenceModel TIS620ThaiModel =
|
||||
{
|
||||
TIS620CharToOrderMap,
|
||||
ThaiLangModel,
|
||||
(float)0.926386,
|
||||
PR_FALSE,
|
||||
"TIS-620"
|
||||
};
|
||||
|
@ -56,6 +56,10 @@ public:
|
||||
virtual void Reset(void) = 0;
|
||||
virtual float GetConfidence(void) = 0;
|
||||
virtual void SetOpion() = 0;
|
||||
|
||||
#ifdef DEBUG_chardet
|
||||
virtual void DumpStatus() {};
|
||||
#endif
|
||||
};
|
||||
|
||||
#endif /* nsCharSetProber_h__ */
|
||||
|
@ -134,9 +134,6 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
{
|
||||
mBestGuess = i;
|
||||
mState = eFoundIt;
|
||||
#ifdef DEBUG_chardet
|
||||
printf("MBCS Prober found charset %d in HandleData. \r\n", i);
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
else if (st == eNotMe)
|
||||
@ -180,19 +177,26 @@ float nsMBCSGroupProber::GetConfidence(void)
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef DEBUG_chardet
|
||||
printf("MBCS Prober confidence is %f in charset %d . \r\n", bestConf, mBestGuess);
|
||||
for (i = 0; i < NUM_OF_PROBERS; i++)
|
||||
{
|
||||
if (!mIsActive[i])
|
||||
printf("[%s] is inactive\r\n", ProberName[i], i);
|
||||
else
|
||||
{
|
||||
cf = mProbers[i]->GetConfidence();
|
||||
printf("[%s] detector has confidence %f\r\n", ProberName[i], cf);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return bestConf;
|
||||
}
|
||||
|
||||
#ifdef DEBUG_chardet
|
||||
void
|
||||
nsMBCSGroupProber::DumpStatus()
|
||||
{
|
||||
PRUint32 i;
|
||||
float cf;
|
||||
|
||||
GetConfidence();
|
||||
for (i = 0; i < NUM_OF_PROBERS; i++)
|
||||
{
|
||||
if (!mIsActive[i])
|
||||
printf("[%s] is inactive(ie. cofidence is too low).\r\n", ProberName[i]);
|
||||
else
|
||||
{
|
||||
cf = mProbers[i]->GetConfidence();
|
||||
printf("[%s] prober has confidence %f\r\n", ProberName[i], cf);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -60,6 +60,10 @@ public:
|
||||
float GetConfidence(void);
|
||||
void SetOpion() {};
|
||||
|
||||
#ifdef DEBUG_chardet
|
||||
void DumpStatus();
|
||||
#endif
|
||||
|
||||
protected:
|
||||
nsProbingState mState;
|
||||
nsCharSetProber* mProbers[NUM_OF_PROBERS];
|
||||
|
@ -51,12 +51,12 @@ nsSBCSGroupProber::nsSBCSGroupProber()
|
||||
mProbers[3] = new nsSingleByteCharSetProber(&MacCyrillicModel);
|
||||
mProbers[4] = new nsSingleByteCharSetProber(&Ibm866Model);
|
||||
mProbers[5] = new nsSingleByteCharSetProber(&Ibm855Model);
|
||||
mProbers[6] = new nsSingleByteCharSetProber(&Win1253Model);
|
||||
mProbers[7] = new nsSingleByteCharSetProber(&Latin7Model);
|
||||
mProbers[8] = new nsSingleByteCharSetProber(&Win1251BulgarianModel);
|
||||
mProbers[9] = new nsSingleByteCharSetProber(&Latin5BulgarianModel);
|
||||
mProbers[10] = new nsSingleByteCharSetProber(&Win1250HungarianModel);
|
||||
mProbers[11] = new nsSingleByteCharSetProber(&Latin2HungarianModel);
|
||||
mProbers[6] = new nsSingleByteCharSetProber(&Latin7Model);
|
||||
mProbers[7] = new nsSingleByteCharSetProber(&Win1253Model);
|
||||
mProbers[8] = new nsSingleByteCharSetProber(&Latin5BulgarianModel);
|
||||
mProbers[9] = new nsSingleByteCharSetProber(&Win1251BulgarianModel);
|
||||
mProbers[10] = new nsSingleByteCharSetProber(&Latin2HungarianModel);
|
||||
mProbers[11] = new nsSingleByteCharSetProber(&Win1250HungarianModel);
|
||||
|
||||
Reset();
|
||||
}
|
||||
@ -194,9 +194,6 @@ nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
{
|
||||
mBestGuess = i;
|
||||
mState = eFoundIt;
|
||||
#ifdef DEBUG_chardet
|
||||
printf("MBCS Prober found charset %d in HandleData. \r\n", i);
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
else if (st == eNotMe)
|
||||
@ -240,19 +237,26 @@ float nsSBCSGroupProber::GetConfidence(void)
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef DEBUG_chardet
|
||||
printf("SBCS Group Prober confidence is %f in charset %d . \r\n", bestConf, mBestGuess);
|
||||
for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
|
||||
{
|
||||
if (!mIsActive[i])
|
||||
printf("[%s] is inactive\r\n", mProbers[i]->GetCharSetName(), i);
|
||||
else
|
||||
{
|
||||
cf = mProbers[i]->GetConfidence();
|
||||
printf("[%s] detector has confidence %f\r\n", mProbers[i]->GetCharSetName(), cf);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return bestConf;
|
||||
}
|
||||
|
||||
#ifdef DEBUG_chardet
|
||||
void
|
||||
nsSBCSGroupProber::DumpStatus()
|
||||
{
|
||||
PRUint32 i;
|
||||
float cf;
|
||||
|
||||
cf = GetConfidence();
|
||||
printf("SBCS Group Prober --------begin status \r\n");
|
||||
for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
|
||||
{
|
||||
if (!mIsActive[i])
|
||||
printf("[%s] is inactive(ie. cofidence is too low).\r\n", mProbers[i]->GetCharSetName(), i);
|
||||
else
|
||||
mProbers[i]->DumpStatus();
|
||||
}
|
||||
printf("SBCS Group found best match [%s] confidence %f.\r\n",
|
||||
mProbers[mBestGuess]->GetCharSetName(), cf);
|
||||
}
|
||||
#endif
|
||||
|
@ -56,6 +56,10 @@ public:
|
||||
float GetConfidence(void);
|
||||
void SetOpion() {};
|
||||
|
||||
#ifdef DEBUG_chardet
|
||||
void DumpStatus();
|
||||
#endif
|
||||
|
||||
protected:
|
||||
nsProbingState mState;
|
||||
nsSingleByteCharSetProber* mProbers[NUM_OF_SBCS_PROBERS];
|
||||
|
@ -35,6 +35,7 @@
|
||||
* the terms of any one of the NPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
#include <stdio.h>
|
||||
#include "nsSBCharSetProber.h"
|
||||
|
||||
nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
@ -53,9 +54,8 @@ nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32
|
||||
|
||||
if (mLastOrder < SAMPLE_SIZE)
|
||||
{
|
||||
if (mModel->precedenceMatrix[mLastOrder*SAMPLE_SIZE+order] == 0)
|
||||
mNegativeSeqs++;
|
||||
mTotalSeqs++;
|
||||
++(mSeqCounters[mModel->precedenceMatrix[mLastOrder*SAMPLE_SIZE+order]]);
|
||||
}
|
||||
}
|
||||
mLastOrder = order;
|
||||
@ -78,16 +78,40 @@ void nsSingleByteCharSetProber::Reset(void)
|
||||
{
|
||||
mState = eDetecting;
|
||||
mLastOrder = 255;
|
||||
for (PRUint32 i = 0; i < NUMBER_OF_SEQ_CAT; i++)
|
||||
mSeqCounters[i] = 0;
|
||||
mTotalSeqs = 0;
|
||||
mNegativeSeqs = 0;
|
||||
mTotalChar = 0;
|
||||
mFreqChar = 0;
|
||||
}
|
||||
|
||||
//#define NEGATIVE_APPROACH 1
|
||||
|
||||
float nsSingleByteCharSetProber::GetConfidence(void)
|
||||
{
|
||||
#ifdef NEGATIVE_APPROACH
|
||||
if (mTotalSeqs > 0)
|
||||
if (mTotalSeqs > mNegativeSeqs*10 )
|
||||
return ((float)(mTotalSeqs - mNegativeSeqs*10))/mTotalSeqs * mFreqChar / mTotalChar;
|
||||
if (mTotalSeqs > mSeqCounters[NEGATIVE_CAT]*10 )
|
||||
return ((float)(mTotalSeqs - mSeqCounters[NEGATIVE_CAT]*10))/mTotalSeqs * mFreqChar / mTotalChar;
|
||||
return (float)0.01;
|
||||
#else //POSITIVE_APPROACH
|
||||
float r;
|
||||
|
||||
if (mTotalSeqs > 0) {
|
||||
r = ((float)1.2) * mSeqCounters[POSITIVE_CAT] / mTotalSeqs / mModel->mTypicalPositiveRatio;
|
||||
r = r*mFreqChar/mTotalChar;
|
||||
if (r >= (float)1.00)
|
||||
r = (float)0.99;
|
||||
return r;
|
||||
}
|
||||
return (float)0.01;
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef DEBUG_chardet
|
||||
void
|
||||
nsSingleByteCharSetProber::DumpStatus()
|
||||
{
|
||||
printf("[%s] prober has confidence %f\r\n", GetCharSetName(), GetConfidence());
|
||||
}
|
||||
#endif
|
||||
|
@ -45,11 +45,15 @@
|
||||
#define POSITIVE_SHORTCUT_THRESHOLD (float)0.95
|
||||
#define NEGATIVE_SHORTCUT_THRESHOLD (float)0.05
|
||||
#define SYMBOL_CAT_ORDER 250
|
||||
#define NUMBER_OF_SEQ_CAT 4
|
||||
#define POSITIVE_CAT (NUMBER_OF_SEQ_CAT-1)
|
||||
#define NEGATIVE_CAT 0
|
||||
|
||||
typedef struct
|
||||
{
|
||||
unsigned char *charToOrderMap; //[256] table use to find a char's order
|
||||
char *precedenceMatrix; //[SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency
|
||||
float mTypicalPositiveRatio; // = freqSeqs / totalSeqs
|
||||
PRBool keepEnglishLetter; //it says if this script contains latin letters
|
||||
const char* charsetName;
|
||||
} SequenceModel;
|
||||
@ -66,6 +70,10 @@ public:
|
||||
void SetOpion() {};
|
||||
PRBool KeepEnglishLetters() {return mModel->keepEnglishLetter;};
|
||||
|
||||
#ifdef DEBUG_chardet
|
||||
void DumpStatus();
|
||||
#endif
|
||||
|
||||
protected:
|
||||
nsProbingState mState;
|
||||
SequenceModel *mModel;
|
||||
@ -74,7 +82,7 @@ protected:
|
||||
unsigned char mLastOrder;
|
||||
|
||||
PRUint32 mTotalSeqs;
|
||||
PRUint32 mNegativeSeqs;
|
||||
PRUint32 mSeqCounters[NUMBER_OF_SEQ_CAT];
|
||||
|
||||
PRUint32 mTotalChar;
|
||||
//characters that fall in our sampling range
|
||||
|
@ -191,9 +191,6 @@ void nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
{
|
||||
mDone = PR_TRUE;
|
||||
mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
|
||||
#ifdef DEBUG_chardet
|
||||
printf(" %d Prober found charset %s in HandleData. \r\n", i, mCharSetProbers[i]->GetCharSetName());
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
}
|
||||
@ -220,9 +217,6 @@ void nsUniversalDetector::DataEnd()
|
||||
{
|
||||
mDone = PR_TRUE;
|
||||
Report(mDetectedCharset);
|
||||
#ifdef DEBUG_chardet
|
||||
printf("New Charset Prober found charset %s in HandleData. \r\n", mDetectedCharset);
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
@ -237,9 +231,9 @@ void nsUniversalDetector::DataEnd()
|
||||
for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
|
||||
{
|
||||
proberConfidence = mCharSetProbers[i]->GetConfidence();
|
||||
#ifdef DEBUG_chardet
|
||||
printf("%d Prober has confidence %f in charset %s in DataEnd. \r\n", i, proberConfidence, mCharSetProbers[i]->GetCharSetName());
|
||||
#endif
|
||||
#ifdef DEBUG_chardet
|
||||
mCharSetProbers[i]->DumpStatus();
|
||||
#endif
|
||||
|
||||
if (proberConfidence > maxProberConfidence)
|
||||
{
|
||||
@ -324,7 +318,9 @@ void nsUniversalXPCOMDetector::Report(const char* aCharset)
|
||||
{
|
||||
NS_ASSERTION(mObserver != nsnull , "have not init yet");
|
||||
#ifdef DEBUG_chardet
|
||||
printf("New Charset Prober report charset %s . \r\n", aCharset);
|
||||
printf("Universal Charset Detector report charset %s . \r\n", aCharset);
|
||||
for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
|
||||
mCharSetProbers[i]->DumpStatus();
|
||||
#endif
|
||||
mObserver->Notify(aCharset, eBestAnswer);
|
||||
}
|
||||
|
@ -119,10 +119,10 @@ int main(int argc, char** argv) {
|
||||
}
|
||||
nsresult rev = NS_OK;
|
||||
nsICharsetDetector *det = nsnull;
|
||||
rev = GetDetector("all_charset_detector", &det);
|
||||
rev = GetDetector("universal_charset_detector", &det);
|
||||
if(NS_FAILED(rev) || (nsnull == det) ){
|
||||
usage();
|
||||
printf("Could not find All Detector\n");
|
||||
printf("Error: Could not find Universal Detector\n");
|
||||
printf("XPCOM ERROR CODE = %x\n", rev);
|
||||
return(-1);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user