Bug 1513799 - Remove windows-1252 as a Japanese detection outcome. r=emk.

If this removal turns out to be OK, we should be able to make the detector
decide more quickly between the remaining options.

Differential Revision: https://phabricator.services.mozilla.com/D26283
This commit is contained in:
Henri Sivonen 2019-04-05 12:08:41 +03:00
parent 1be446680f
commit ec150294ab
14 changed files with 16 additions and 354 deletions

View File

@ -11,7 +11,6 @@ UNIFIED_SOURCES += [
'nsEscCharsetProber.cpp',
'nsEscSM.cpp',
'nsEUCJPProber.cpp',
'nsLatin1Prober.cpp',
'nsMBCSGroupProber.cpp',
'nsMBCSSM.cpp',
'nsSJISProber.cpp',

View File

@ -1,131 +0,0 @@
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "nsLatin1Prober.h"
#include <stdio.h>
// Character classes assigned to each byte value by Latin1_CharToClass.
// The numeric values are used directly as row/column indices into
// Latin1ClassModel, so they must stay dense and start at 0, and CLASS_NUM
// must equal the number of classes.
#define UDF 0 // undefined
#define OTH 1 // other
#define ASC 2 // ascii capital letter
#define ASS 3 // ascii small letter
#define ACV 4 // accent capital vowel
#define ACO 5 // accent capital other
#define ASV 6 // accent small vowel
#define ASO 7 // accent small other
#define CLASS_NUM 8 // total classes
// Maps every possible byte value (0x00 - 0xFF, 256 entries, 8 per row) to one
// of the character classes defined above. HandleData indexes this table with
// each input byte cast to unsigned char. The trailing comment on each row
// gives the byte range that the row covers.
static const unsigned char Latin1_CharToClass[] = {
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 10 - 17
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 18 - 1F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 20 - 27
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 28 - 2F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 30 - 37
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 38 - 3F
OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 40 - 47
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 48 - 4F
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 50 - 57
ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, // 58 - 5F
OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 60 - 67
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 68 - 6F
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 70 - 77
ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, // 78 - 7F
OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, // 80 - 87
OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, // 88 - 8F
UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 90 - 97
OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, // 98 - 9F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A0 - A7
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A8 - AF
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B0 - B7
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B8 - BF
ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, // C0 - C7
ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, // C8 - CF
ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, // D0 - D7
ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, // D8 - DF
ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, // E0 - E7
ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, // E8 - EF
ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, // F0 - F7
ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, // F8 - FF
};
/* Likelihood of one character class following another.
The table is a flattened CLASS_NUM x CLASS_NUM matrix that HandleData reads
as Latin1ClassModel[previousClass * CLASS_NUM + currentClass]: each row is
the class of the previous byte, each column the class of the current byte.
Meaning of the stored values:
0 : illegal (HandleData switches the prober to eNotMe)
1 : very unlikely (penalised by GetConfidence)
2 : normal
3 : very likely (rewarded by GetConfidence)
Non-zero values are also used as the index into mFreqCounter.
*/
static const unsigned char Latin1ClassModel[] = {
/* UDF OTH ASC ASS ACV ACO ASV ASO */
/*UDF*/ 0, 0, 0, 0, 0, 0, 0, 0,
/*OTH*/ 0, 3, 3, 3, 3, 3, 3, 3,
/*ASC*/ 0, 3, 3, 3, 3, 3, 3, 3,
/*ASS*/ 0, 3, 3, 3, 1, 1, 3, 3,
/*ACV*/ 0, 3, 3, 3, 1, 2, 1, 2,
/*ACO*/ 0, 3, 3, 3, 3, 3, 3, 3,
/*ASV*/ 0, 3, 1, 3, 1, 1, 1, 3,
/*ASO*/ 0, 3, 1, 3, 1, 1, 3, 3,
};
// Puts the prober back into its initial state: every frequency counter is
// zeroed, the "previous character" class is OTH, and the state is eDetecting.
void nsLatin1Prober::Reset(void) {
  for (uint32_t& counter : mFreqCounter) {
    counter = 0;
  }
  mLastCharClass = OTH;
  mState = eDetecting;
}
// Feeds aLen bytes starting at aBuf to the prober and returns the resulting
// state.
//
// The input is first passed through FilterWithEnglishLetters; when that call
// reports failure the raw input is analysed instead. Each byte is classified
// via Latin1_CharToClass and the (previous class, current class) pair is
// looked up in Latin1ClassModel. A lookup result of 0 marks the sequence as
// illegal: the state becomes eNotMe and the remaining bytes are ignored.
// Otherwise the matching frequency counter is bumped.
nsProbingState nsLatin1Prober::HandleData(const char* aBuf, uint32_t aLen) {
  char* filtered = nullptr;
  uint32_t filteredLen = 0;
  const bool didFilter =
      FilterWithEnglishLetters(aBuf, aLen, &filtered, filteredLen);

  // Analyse the filtered copy when there is one, the caller's buffer if not.
  const char* data = didFilter ? filtered : aBuf;
  const uint32_t dataLen = didFilter ? filteredLen : aLen;

  for (uint32_t pos = 0; pos < dataLen; ++pos) {
    const unsigned char currentClass =
        Latin1_CharToClass[static_cast<unsigned char>(data[pos])];
    const unsigned char likelihood =
        Latin1ClassModel[mLastCharClass * CLASS_NUM + currentClass];
    if (likelihood == 0) {
      mState = eNotMe;
      break;
    }
    ++mFreqCounter[likelihood];
    mLastCharClass = currentClass;
  }

  // Only a buffer distinct from the caller's input is released here.
  if (data != aBuf) {
    free(filtered);
  }
  return mState;
}
// Returns the confidence that the data seen so far is this prober's charset.
//
// A prober in the eNotMe state reports a fixed 0.01. Otherwise the score is
// the share of "very likely" transitions (counter 3) minus twenty times the
// share of "very unlikely" transitions (counter 1), clamped at zero and then
// halved.
float nsLatin1Prober::GetConfidence(void) {
  if (mState == eNotMe) {
    return 0.01f;
  }

  uint32_t total = 0;
  for (uint32_t count : mFreqCounter) {
    total += count;
  }

  float confidence = 0.0f;
  if (total) {
    confidence = mFreqCounter[3] * 1.0f / total;
    confidence -= mFreqCounter[1] * 20.0f / total;
  }
  if (confidence < 0.0f) {
    confidence = 0.0f;
  }

  // Halve the latin1 score so that other, more accurate detectors can take
  // priority.
  confidence *= 0.50f;
  return confidence;
}
#ifdef DEBUG_chardet
// Debug-only helper: prints the current confidence and charset name to
// stdout.
void nsLatin1Prober::DumpStatus() {
  const float confidence = GetConfidence();
  const char* charsetName = GetCharSetName();
  printf(" Latin1Prober: %1.3f [%s]\r\n", confidence, charsetName);
}
#endif

View File

@ -1,33 +0,0 @@
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef nsLatin1Prober_h__
#define nsLatin1Prober_h__
#include "nsCharSetProber.h"
// Number of frequency categories tracked in mFreqCounter.
#define FREQ_CAT_NUM 4

// Charset prober that reports "windows-1252". The constructor calls Reset()
// so that a freshly built prober starts from a fully initialised state.
class nsLatin1Prober : public nsCharSetProber {
 public:
  nsLatin1Prober() { Reset(); }
  virtual ~nsLatin1Prober() {}

  // Feeds a chunk of input to the prober and returns the updated state.
  nsProbingState HandleData(const char* aBuf, uint32_t aLen) override;

  // Name of the charset this prober reports.
  const char* GetCharSetName() override { return "windows-1252"; }

  // Current probing state.
  nsProbingState GetState() override { return mState; }

  // Returns the prober to its initial state.
  void Reset() override;

  // Confidence score for the data seen so far.
  float GetConfidence() override;

#ifdef DEBUG_chardet
  virtual void DumpStatus();
#endif

 protected:
  nsProbingState mState;                // current probing state
  char mLastCharClass;                  // class of the last byte handled
  uint32_t mFreqCounter[FREQ_CAT_NUM];  // counters indexed by likelihood value
};
#endif /* nsLatin1Prober_h__ */

View File

@ -9,12 +9,12 @@
#include "nsMBCSGroupProber.h"
#include "nsEscCharsetProber.h"
#include "nsLatin1Prober.h"
nsUniversalDetector::nsUniversalDetector() {
mDone = false;
mBestGuess = -1; // illegal value as signal
mInTag = false;
mMultibyteProber = nullptr;
mEscCharSetProber = nullptr;
mStart = true;
@ -22,15 +22,10 @@ nsUniversalDetector::nsUniversalDetector() {
mGotData = false;
mInputState = ePureAscii;
mLastChar = '\0';
uint32_t i;
for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) mCharSetProbers[i] = nullptr;
}
nsUniversalDetector::~nsUniversalDetector() {
for (int32_t i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
delete mCharSetProbers[i];
delete mMultibyteProber;
delete mEscCharSetProber;
}
@ -45,11 +40,13 @@ void nsUniversalDetector::Reset() {
mInputState = ePureAscii;
mLastChar = '\0';
if (mEscCharSetProber) mEscCharSetProber->Reset();
if (mMultibyteProber) {
mMultibyteProber->Reset();
}
uint32_t i;
for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
if (mCharSetProbers[i]) mCharSetProbers[i]->Reset();
if (mEscCharSetProber) {
mEscCharSetProber->Reset();
}
}
//---------------------------------------------------------------------
@ -110,14 +107,9 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, uint32_t aLen) {
mEscCharSetProber = nullptr;
}
// start multibyte and singlebyte charset prober
if (nullptr == mCharSetProbers[0]) {
mCharSetProbers[0] = new nsMBCSGroupProber();
if (nullptr == mCharSetProbers[0]) return NS_ERROR_OUT_OF_MEMORY;
}
if (nullptr == mCharSetProbers[2]) {
mCharSetProbers[2] = new nsLatin1Prober;
if (nullptr == mCharSetProbers[2]) return NS_ERROR_OUT_OF_MEMORY;
// start multibyte charset prober
if (!mMultibyteProber) {
mMultibyteProber = new nsMBCSGroupProber();
}
}
} else {
@ -144,16 +136,12 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, uint32_t aLen) {
}
break;
case eHighbyte:
for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) {
if (mCharSetProbers[i]) {
st = mCharSetProbers[i]->HandleData(aBuf, aLen);
st = mMultibyteProber->HandleData(aBuf, aLen);
if (st == eFoundIt) {
mDone = true;
mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
mDetectedCharset = mMultibyteProber->GetCharSetName();
return NS_OK;
}
}
}
break;
default: // pure ascii
@ -179,23 +167,10 @@ void nsUniversalDetector::DataEnd() {
switch (mInputState) {
case eHighbyte: {
float proberConfidence;
float maxProberConfidence = (float)0.0;
int32_t maxProber = 0;
for (int32_t i = 0; i < NUM_OF_CHARSET_PROBERS; i++) {
if (mCharSetProbers[i]) {
proberConfidence = mCharSetProbers[i]->GetConfidence();
if (proberConfidence > maxProberConfidence) {
maxProberConfidence = proberConfidence;
maxProber = i;
}
}
}
// do not report anything because we are not confident of it, that's in
// fact a negative answer
if (maxProberConfidence > MINIMUM_THRESHOLD)
Report(mCharSetProbers[maxProber]->GetCharSetName());
if (mMultibyteProber->GetConfidence() > MINIMUM_THRESHOLD)
Report(mMultibyteProber->GetCharSetName());
} break;
case eEscAscii:
break;

View File

@ -8,8 +8,6 @@
class nsCharSetProber;
#define NUM_OF_CHARSET_PROBERS 3
typedef enum { ePureAscii = 0, eEscAscii = 1, eHighbyte = 2 } nsInputState;
class nsUniversalDetector {
@ -32,7 +30,7 @@ class nsUniversalDetector {
int32_t mBestGuess;
uint32_t mLanguageFilter;
nsCharSetProber* mCharSetProbers[NUM_OF_CHARSET_PROBERS];
nsCharSetProber* mMultibyteProber;
nsCharSetProber* mEscCharSetProber;
};

View File

@ -1,5 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html><head>
<meta http-equiv="Content-Type" content="text/html"><title>BBC - 606 - A Forum Conversation</title>
</head>
<body topmargin="0" leftmargin="0" marginheight="0" marginwidth="0"><p class="posted">posted 5 Weeks Ago</p><p>if rangers draw and marseille and benfica win i stand to lift £825. not bad for a £2 bet.<br>50p on 3 homes<br>Man Utd/Marseille/Benfica<br>50p on 3 Draws<br>Rangers/Halifax/Bristol City<br>50p on 3 Aways<br>Doncaster/Stranraer/Rushden &amp; Diamonds<br>and 50p on all nine results.<br>GET IN THERE.</p></body></html>

View File

@ -1,3 +0,0 @@
First bytes of 2-byte sequences (0xc0-0xdf), each followed by a space character: À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß
First bytes of 3-byte sequences (0xe0-0xef), each followed by a space character: à á â ã ä å æ ç è é ê ë ì í î ï
First bytes of 4-byte sequences (0xf0-0xf4), each followed by a space character: ð ñ ò ó ô

View File

@ -1,2 +0,0 @@
3-byte sequence with last byte missing (U+0000): à°
4-byte sequence with last byte missing (U+0000): ð°€

View File

@ -1 +0,0 @@
Overlong encodings: <20><> <20><><EFBFBD> <20><><EFBFBD><EFBFBD>

View File

@ -2,7 +2,6 @@
support-files =
CharsetDetectionTests.js
bug306272_text.html
bug421271_text.html
bug426271_text-euc-jp.html
bug426271_text-utf-8.html
bug431054_text.html
@ -19,9 +18,6 @@ support-files =
bug811363-8.text
bug811363-9.text
bug811363-invalid-1.text
bug811363-invalid-2.text
bug811363-invalid-3.text
bug811363-invalid-4.text
bug811363-invalid-5.text
bug1071816-1_text.html
bug1071816-2_text.html
@ -29,7 +25,6 @@ support-files =
bug1071816-4_text.html
[test_bug306272.html]
[test_bug421271.html]
[test_bug426271-euc-jp.html]
[test_bug426271-utf-8.html]
[test_bug431054-japanese.html]
@ -38,9 +33,6 @@ support-files =
[test_bug631751le.html]
[test_bug638318.html]
[test_bug811363-1-1.html]
[test_bug811363-1-2.html]
[test_bug811363-1-3.html]
[test_bug811363-1-4.html]
[test_bug811363-1-5.html]
[test_bug811363-2-1.html]
[test_bug811363-2-2.html]

View File

@ -1,37 +0,0 @@
<!DOCTYPE HTML>
<html>
<!--
https://bugzilla.mozilla.org/show_bug.cgi?id=421271
-->
<head>
<title>Test for Bug 421271</title>
<script type="text/javascript"
src="chrome://mochikit/content/tests/SimpleTest/SimpleTest.js">
</script>
<script type="text/javascript" src="CharsetDetectionTests.js"></script>
<link rel="stylesheet" type="text/css"
href="chrome://mochikit/content/tests/SimpleTest/test.css" />
</head>
<body>
<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=421271">Mozilla Bug 421271</a>
<p id="display"></p>
<div id="content" style="display: none">
</div>
<iframe id="testframe"></iframe>
<pre id="test">
<script class="testbody" type="text/javascript">
/**
 * Test for Bug 421271.
 *
 * Calls CharsetDetectionTests (loaded above from CharsetDetectionTests.js)
 * with the fixture file name, the charset label "windows-1252" and a list of
 * detector names.
 * NOTE(review): presumably the second argument is the expected detection
 * result and the array lists the detectors to run against the fixture;
 * confirm against CharsetDetectionTests.js.
 **/
CharsetDetectionTests("bug421271_text.html",
"windows-1252",
new Array("ja_parallel_state_machine",
"ko_parallel_state_machine",
"zh_parallel_state_machine",
"zhtw_parallel_state_machine",
"zhcn_parallel_state_machine",
"cjk_parallel_state_machine",
"universal_charset_detector"));
</script>
</pre>
</body>
</html>

View File

@ -1,30 +0,0 @@
<!DOCTYPE HTML>
<html>
<!--
https://bugzilla.mozilla.org/show_bug.cgi?id=811363
-->
<head>
<title>Test for Bug 811363</title>
<script type="text/javascript"
src="chrome://mochikit/content/tests/SimpleTest/SimpleTest.js">
</script>
<script type="text/javascript" src="CharsetDetectionTests.js"></script>
<link rel="stylesheet" type="text/css"
href="chrome://mochikit/content/tests/SimpleTest/test.css" />
</head>
<body>
<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=811363">Mozilla Bug 811363</a>
<p id="display"></p>
<div id="content" style="display: none">
</div>
<iframe id="testframe"></iframe>
<pre id="test">
<script class="testbody" type="text/javascript">
/**
 * Test for Bug 811363, fixture bug811363-invalid-2.text.
 *
 * Calls CharsetDetectionTests (loaded above from CharsetDetectionTests.js)
 * with the fixture file name, the charset label "windows-1252" and a single
 * detector name.
 * NOTE(review): presumably the second argument is the expected detection
 * result; confirm against CharsetDetectionTests.js.
 **/
CharsetDetectionTests("bug811363-invalid-2.text",
"windows-1252",
new Array("ja_parallel_state_machine"));
</script>
</pre>
</body>
</html>

View File

@ -1,30 +0,0 @@
<!DOCTYPE HTML>
<html>
<!--
https://bugzilla.mozilla.org/show_bug.cgi?id=811363
-->
<head>
<title>Test for Bug 811363</title>
<script type="text/javascript"
src="chrome://mochikit/content/tests/SimpleTest/SimpleTest.js">
</script>
<script type="text/javascript" src="CharsetDetectionTests.js"></script>
<link rel="stylesheet" type="text/css"
href="chrome://mochikit/content/tests/SimpleTest/test.css" />
</head>
<body>
<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=811363">Mozilla Bug 811363</a>
<p id="display"></p>
<div id="content" style="display: none">
</div>
<iframe id="testframe"></iframe>
<pre id="test">
<script class="testbody" type="text/javascript">
/**
 * Test for Bug 811363, fixture bug811363-invalid-3.text.
 *
 * Calls CharsetDetectionTests (loaded above from CharsetDetectionTests.js)
 * with the fixture file name, the charset label "windows-1252" and a single
 * detector name.
 * NOTE(review): presumably the second argument is the expected detection
 * result; confirm against CharsetDetectionTests.js.
 **/
CharsetDetectionTests("bug811363-invalid-3.text",
"windows-1252",
new Array("ja_parallel_state_machine"));
</script>
</pre>
</body>
</html>

View File

@ -1,30 +0,0 @@
<!DOCTYPE HTML>
<html>
<!--
https://bugzilla.mozilla.org/show_bug.cgi?id=811363
-->
<head>
<title>Test for Bug 811363</title>
<script type="text/javascript"
src="chrome://mochikit/content/tests/SimpleTest/SimpleTest.js">
</script>
<script type="text/javascript" src="CharsetDetectionTests.js"></script>
<link rel="stylesheet" type="text/css"
href="chrome://mochikit/content/tests/SimpleTest/test.css" />
</head>
<body>
<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=811363">Mozilla Bug 811363</a>
<p id="display"></p>
<div id="content" style="display: none">
</div>
<iframe id="testframe"></iframe>
<pre id="test">
<script class="testbody" type="text/javascript">
/**
 * Test for Bug 811363, fixture bug811363-invalid-4.text.
 *
 * Calls CharsetDetectionTests (loaded above from CharsetDetectionTests.js)
 * with the fixture file name, the charset label "windows-1252" and a single
 * detector name.
 * NOTE(review): presumably the second argument is the expected detection
 * result; confirm against CharsetDetectionTests.js.
 **/
CharsetDetectionTests("bug811363-invalid-4.text",
"windows-1252",
new Array("ja_parallel_state_machine"));
</script>
</pre>
</body>
</html>