Bug 1513799 - Remove windows-1252 as a Japanese detection outcome. r=emk.

If this removal turns out to be OK, we should be able to make the detector decide more quickly between the remaining options.

Differential Revision: https://phabricator.services.mozilla.com/D26283
2024-11-24 13:21:05 +00:00 · 2019-04-05 12:08:41 +03:00 · 2019-04-05 12:08:41 +03:00 · ec150294ab
commit ec150294ab
parent 1be446680f
14 changed files with 16 additions and 354 deletions
--- a/extensions/universalchardet/src/base/moz.build
+++ b/extensions/universalchardet/src/base/moz.build
@ -11,7 +11,6 @@ UNIFIED_SOURCES += [
    'nsEscCharsetProber.cpp',
    'nsEscSM.cpp',
    'nsEUCJPProber.cpp',
-    'nsLatin1Prober.cpp',
    'nsMBCSGroupProber.cpp',
    'nsMBCSSM.cpp',
    'nsSJISProber.cpp',
--- a/extensions/universalchardet/src/base/nsLatin1Prober.cpp
+++ b/extensions/universalchardet/src/base/nsLatin1Prober.cpp
@ -1,131 +0,0 @@
-/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
-/* This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-
-#include "nsLatin1Prober.h"
-#include <stdio.h>
-
-#define UDF 0        // undefined
-#define OTH 1        // other
-#define ASC 2        // ascii capital letter
-#define ASS 3        // ascii small letter
-#define ACV 4        // accent capital vowel
-#define ACO 5        // accent capital other
-#define ASV 6        // accent small vowel
-#define ASO 7        // accent small other
-#define CLASS_NUM 8  // total classes
-
-static const unsigned char Latin1_CharToClass[] = {
-    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  // 00 - 07
-    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  // 08 - 0F
-    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  // 10 - 17
-    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  // 18 - 1F
-    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  // 20 - 27
-    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  // 28 - 2F
-    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  // 30 - 37
-    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  // 38 - 3F
-    OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  // 40 - 47
-    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  // 48 - 4F
-    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  // 50 - 57
-    ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,  // 58 - 5F
-    OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  // 60 - 67
-    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  // 68 - 6F
-    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  // 70 - 77
-    ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,  // 78 - 7F
-    OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH,  // 80 - 87
-    OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF,  // 88 - 8F
-    UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  // 90 - 97
-    OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO,  // 98 - 9F
-    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  // A0 - A7
-    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  // A8 - AF
-    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  // B0 - B7
-    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  // B8 - BF
-    ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO,  // C0 - C7
-    ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,  // C8 - CF
-    ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH,  // D0 - D7
-    ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO,  // D8 - DF
-    ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO,  // E0 - E7
-    ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,  // E8 - EF
-    ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH,  // F0 - F7
-    ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO,  // F8 - FF
-};
-
-/* 0 : illegal
-   1 : very unlikely
-   2 : normal
-   3 : very likely
-*/
-static const unsigned char Latin1ClassModel[] = {
-    /*      UDF OTH ASC ASS ACV ACO ASV ASO  */
-    /*UDF*/ 0, 0, 0, 0, 0, 0, 0, 0,
-    /*OTH*/ 0, 3, 3, 3, 3, 3, 3, 3,
-    /*ASC*/ 0, 3, 3, 3, 3, 3, 3, 3,
-    /*ASS*/ 0, 3, 3, 3, 1, 1, 3, 3,
-    /*ACV*/ 0, 3, 3, 3, 1, 2, 1, 2,
-    /*ACO*/ 0, 3, 3, 3, 3, 3, 3, 3,
-    /*ASV*/ 0, 3, 1, 3, 1, 1, 1, 3,
-    /*ASO*/ 0, 3, 1, 3, 1, 1, 3, 3,
-};
-
-void nsLatin1Prober::Reset(void) {
-  mState = eDetecting;
-  mLastCharClass = OTH;
-  for (int i = 0; i < FREQ_CAT_NUM; i++) mFreqCounter[i] = 0;
-}
-
-nsProbingState nsLatin1Prober::HandleData(const char* aBuf, uint32_t aLen) {
-  char* newBuf1 = 0;
-  uint32_t newLen1 = 0;
-
-  if (!FilterWithEnglishLetters(aBuf, aLen, &newBuf1, newLen1)) {
-    newBuf1 = (char*)aBuf;
-    newLen1 = aLen;
-  }
-
-  unsigned char charClass;
-  unsigned char freq;
-  for (uint32_t i = 0; i < newLen1; i++) {
-    charClass = Latin1_CharToClass[(unsigned char)newBuf1[i]];
-    freq = Latin1ClassModel[mLastCharClass * CLASS_NUM + charClass];
-    if (freq == 0) {
-      mState = eNotMe;
-      break;
-    }
-    mFreqCounter[freq]++;
-    mLastCharClass = charClass;
-  }
-
-  if (newBuf1 != aBuf) free(newBuf1);
-
-  return mState;
-}
-
-float nsLatin1Prober::GetConfidence(void) {
-  if (mState == eNotMe) return 0.01f;
-
-  float confidence;
-  uint32_t total = 0;
-  for (int32_t i = 0; i < FREQ_CAT_NUM; i++) total += mFreqCounter[i];
-
-  if (!total)
-    confidence = 0.0f;
-  else {
-    confidence = mFreqCounter[3] * 1.0f / total;
-    confidence -= mFreqCounter[1] * 20.0f / total;
-  }
-
-  if (confidence < 0.0f) confidence = 0.0f;
-
-  // lower the confidence of latin1 so that other more accurate detector
-  // can take priority.
-  confidence *= 0.50f;
-
-  return confidence;
-}
-
-#ifdef DEBUG_chardet
-void nsLatin1Prober::DumpStatus() {
-  printf(" Latin1Prober: %1.3f [%s]\r\n", GetConfidence(), GetCharSetName());
-}
-#endif
--- a/extensions/universalchardet/src/base/nsLatin1Prober.h
+++ b/extensions/universalchardet/src/base/nsLatin1Prober.h
@ -1,33 +0,0 @@
-/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
-/* This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-
-#ifndef nsLatin1Prober_h__
-#define nsLatin1Prober_h__
-
-#include "nsCharSetProber.h"
-
-#define FREQ_CAT_NUM 4
-
-class nsLatin1Prober : public nsCharSetProber {
- public:
-  nsLatin1Prober(void) { Reset(); }
-  virtual ~nsLatin1Prober(void) {}
-  nsProbingState HandleData(const char* aBuf, uint32_t aLen) override;
-  const char* GetCharSetName() override { return "windows-1252"; }
-  nsProbingState GetState(void) override { return mState; }
-  void Reset(void) override;
-  float GetConfidence(void) override;
-
-#ifdef DEBUG_chardet
-  virtual void DumpStatus();
-#endif
-
- protected:
-  nsProbingState mState;
-  char mLastCharClass;
-  uint32_t mFreqCounter[FREQ_CAT_NUM];
-};
-
-#endif /* nsLatin1Prober_h__ */
--- a/extensions/universalchardet/src/base/nsUniversalDetector.cpp
+++ b/extensions/universalchardet/src/base/nsUniversalDetector.cpp
@ -9,12 +9,12 @@

 #include "nsMBCSGroupProber.h"
 #include "nsEscCharsetProber.h"
-#include "nsLatin1Prober.h"

 nsUniversalDetector::nsUniversalDetector() {
  mDone = false;
  mBestGuess = -1;  // illegal value as signal
  mInTag = false;
+  mMultibyteProber = nullptr;
  mEscCharSetProber = nullptr;

  mStart = true;
@ -22,15 +22,10 @@ nsUniversalDetector::nsUniversalDetector() {
  mGotData = false;
  mInputState = ePureAscii;
  mLastChar = '\0';
-
-  uint32_t i;
-  for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) mCharSetProbers[i] = nullptr;
 }

 nsUniversalDetector::~nsUniversalDetector() {
-  for (int32_t i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
-    delete mCharSetProbers[i];
-
+  delete mMultibyteProber;
  delete mEscCharSetProber;
 }

@ -45,11 +40,13 @@ void nsUniversalDetector::Reset() {
  mInputState = ePureAscii;
  mLastChar = '\0';

-  if (mEscCharSetProber) mEscCharSetProber->Reset();
+  if (mMultibyteProber) {
+    mMultibyteProber->Reset();
+  }

-  uint32_t i;
-  for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
-    if (mCharSetProbers[i]) mCharSetProbers[i]->Reset();
+  if (mEscCharSetProber) {
+    mEscCharSetProber->Reset(); 
+  }
 }

 //---------------------------------------------------------------------
@ -110,14 +107,9 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, uint32_t aLen) {
          mEscCharSetProber = nullptr;
        }

-        // start multibyte and singlebyte charset prober
-        if (nullptr == mCharSetProbers[0]) {
-          mCharSetProbers[0] = new nsMBCSGroupProber();
-          if (nullptr == mCharSetProbers[0]) return NS_ERROR_OUT_OF_MEMORY;
-        }
-        if (nullptr == mCharSetProbers[2]) {
-          mCharSetProbers[2] = new nsLatin1Prober;
-          if (nullptr == mCharSetProbers[2]) return NS_ERROR_OUT_OF_MEMORY;
+        // start multibyte charset prober
+        if (!mMultibyteProber) {
+          mMultibyteProber = new nsMBCSGroupProber();
        }
      }
    } else {
@ -144,16 +136,12 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, uint32_t aLen) {
      }
      break;
    case eHighbyte:
-      for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) {
-        if (mCharSetProbers[i]) {
-          st = mCharSetProbers[i]->HandleData(aBuf, aLen);
+          st = mMultibyteProber->HandleData(aBuf, aLen);
          if (st == eFoundIt) {
            mDone = true;
-            mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
+            mDetectedCharset = mMultibyteProber->GetCharSetName();
            return NS_OK;
          }
-        }
-      }
      break;

    default:     // pure ascii
@ -179,23 +167,10 @@ void nsUniversalDetector::DataEnd() {

  switch (mInputState) {
    case eHighbyte: {
-      float proberConfidence;
-      float maxProberConfidence = (float)0.0;
-      int32_t maxProber = 0;
-
-      for (int32_t i = 0; i < NUM_OF_CHARSET_PROBERS; i++) {
-        if (mCharSetProbers[i]) {
-          proberConfidence = mCharSetProbers[i]->GetConfidence();
-          if (proberConfidence > maxProberConfidence) {
-            maxProberConfidence = proberConfidence;
-            maxProber = i;
-          }
-        }
-      }
      // do not report anything because we are not confident of it, that's in
      // fact a negative answer
-      if (maxProberConfidence > MINIMUM_THRESHOLD)
-        Report(mCharSetProbers[maxProber]->GetCharSetName());
+      if (mMultibyteProber->GetConfidence() > MINIMUM_THRESHOLD)
+        Report(mMultibyteProber->GetCharSetName());
    } break;
    case eEscAscii:
      break;
--- a/extensions/universalchardet/src/base/nsUniversalDetector.h
+++ b/extensions/universalchardet/src/base/nsUniversalDetector.h
@ -8,8 +8,6 @@

 class nsCharSetProber;

-#define NUM_OF_CHARSET_PROBERS 3
-
 typedef enum { ePureAscii = 0, eEscAscii = 1, eHighbyte = 2 } nsInputState;

 class nsUniversalDetector {
@ -32,7 +30,7 @@ class nsUniversalDetector {
  int32_t mBestGuess;
  uint32_t mLanguageFilter;

-  nsCharSetProber* mCharSetProbers[NUM_OF_CHARSET_PROBERS];
+  nsCharSetProber* mMultibyteProber;
  nsCharSetProber* mEscCharSetProber;
 };

--- a/extensions/universalchardet/tests/bug421271_text.html
+++ b/extensions/universalchardet/tests/bug421271_text.html
@ -1,5 +0,0 @@
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html><head>
-<meta http-equiv="Content-Type" content="text/html"><title>BBC - 606 - A Forum Conversation</title>
-</head>
-<body topmargin="0" leftmargin="0" marginheight="0" marginwidth="0"><p class="posted">posted 5 Weeks Ago</p><p>if rangers draw and marseille and benfica win i stand to lift £825. not bad for a £2 bet.<br>50p on 3 homes<br>Man Utd/Marseille/Benfica<br>50p on 3 Draws<br>Rangers/Halifax/Bristol City<br>50p on 3 Aways<br>Doncaster/Stranraer/Rushden &amp; Diamonds<br>and 50p on all nine results.<br>GET IN THERE.</p></body></html>
--- a/extensions/universalchardet/tests/bug811363-invalid-2.text
+++ b/extensions/universalchardet/tests/bug811363-invalid-2.text
@ -1,3 +0,0 @@
-First bytes of 2-byte sequences (0xc0-0xdf), each followed by a space character: À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß 
-First bytes of 3-byte sequences (0xe0-0xef), each followed by a space character: à á â ã ä å æ ç è é ê ë ì í î ï 
-First bytes of 4-byte sequences (0xf0-0xf4), each followed by a space character: ð ñ ò ó ô 
--- a/extensions/universalchardet/tests/bug811363-invalid-3.text
+++ b/extensions/universalchardet/tests/bug811363-invalid-3.text
@ -1,2 +0,0 @@
-3-byte sequence with last byte missing (U+0000): à°
-4-byte sequence with last b0te missing (U+0000): ð°€
--- a/extensions/universalchardet/tests/bug811363-invalid-4.text
+++ b/extensions/universalchardet/tests/bug811363-invalid-4.text
@ -1 +0,0 @@
-Overlong encodings: <20><> <20><><EFBFBD> <20><><EFBFBD><EFBFBD>
--- a/extensions/universalchardet/tests/chrome.ini
+++ b/extensions/universalchardet/tests/chrome.ini
@ -2,7 +2,6 @@
 support-files =
  CharsetDetectionTests.js
  bug306272_text.html
-  bug421271_text.html
  bug426271_text-euc-jp.html
  bug426271_text-utf-8.html
  bug431054_text.html
@ -19,9 +18,6 @@ support-files =
  bug811363-8.text
  bug811363-9.text
  bug811363-invalid-1.text
-  bug811363-invalid-2.text
-  bug811363-invalid-3.text
-  bug811363-invalid-4.text
  bug811363-invalid-5.text
  bug1071816-1_text.html
  bug1071816-2_text.html
@ -29,7 +25,6 @@ support-files =
  bug1071816-4_text.html

 [test_bug306272.html]
-[test_bug421271.html]
 [test_bug426271-euc-jp.html]
 [test_bug426271-utf-8.html]
 [test_bug431054-japanese.html]
@ -38,9 +33,6 @@ support-files =
 [test_bug631751le.html]
 [test_bug638318.html]
 [test_bug811363-1-1.html]
-[test_bug811363-1-2.html]
-[test_bug811363-1-3.html]
-[test_bug811363-1-4.html]
 [test_bug811363-1-5.html]
 [test_bug811363-2-1.html]
 [test_bug811363-2-2.html]
--- a/extensions/universalchardet/tests/test_bug421271.html
+++ b/extensions/universalchardet/tests/test_bug421271.html
@ -1,37 +0,0 @@
-<!DOCTYPE HTML>
-<html>
-<!--
-https://bugzilla.mozilla.org/show_bug.cgi?id=421271
-->
-<head>
-  <title>Test for Bug 421271</title>
-  <script type="text/javascript" 
-          src="chrome://mochikit/content/tests/SimpleTest/SimpleTest.js">
-          </script>
-  <script type="text/javascript" src="CharsetDetectionTests.js"></script>
-  <link rel="stylesheet" type="text/css" 
-        href="chrome://mochikit/content/tests/SimpleTest/test.css" />
-</head>
-<body>
-<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=421271">Mozilla Bug 421271</a>
-<p id="display"></p>
-<div id="content" style="display: none">  
-</div>
-<iframe id="testframe"></iframe>
-<pre id="test">
-<script class="testbody" type="text/javascript">
-/** Test for Bug 421271 **/
-CharsetDetectionTests("bug421271_text.html",
-		      "windows-1252",
-		      new Array("ja_parallel_state_machine",
-				"ko_parallel_state_machine",
-				"zh_parallel_state_machine",
-				"zhtw_parallel_state_machine",
-				"zhcn_parallel_state_machine",
-				"cjk_parallel_state_machine",
-				"universal_charset_detector"));
-</script>
-</pre>
-</body>
-</html>
-
--- a/extensions/universalchardet/tests/test_bug811363-1-2.html
+++ b/extensions/universalchardet/tests/test_bug811363-1-2.html
@ -1,30 +0,0 @@
-<!DOCTYPE HTML>
-<html>
-<!--
-https://bugzilla.mozilla.org/show_bug.cgi?id=811363
-->
-<head>
-  <title>Test for Bug 811363</title>
-  <script type="text/javascript"
-          src="chrome://mochikit/content/tests/SimpleTest/SimpleTest.js">
-          </script>
-  <script type="text/javascript" src="CharsetDetectionTests.js"></script>
-  <link rel="stylesheet" type="text/css"
-        href="chrome://mochikit/content/tests/SimpleTest/test.css" />
-</head>
-<body>
-<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=811363">Mozilla Bug 811363</a>
-<p id="display"></p>
-<div id="content" style="display: none">
-</div>
-<iframe id="testframe"></iframe>
-<pre id="test">
-<script class="testbody" type="text/javascript">
-/** Test for Bug 811363 **/
-CharsetDetectionTests("bug811363-invalid-2.text",
-		      "windows-1252",
-		      new Array("ja_parallel_state_machine"));
-</script>
-</pre>
-</body>
-</html>
--- a/extensions/universalchardet/tests/test_bug811363-1-3.html
+++ b/extensions/universalchardet/tests/test_bug811363-1-3.html
@ -1,30 +0,0 @@
-<!DOCTYPE HTML>
-<html>
-<!--
-https://bugzilla.mozilla.org/show_bug.cgi?id=811363
-->
-<head>
-  <title>Test for Bug 811363</title>
-  <script type="text/javascript"
-          src="chrome://mochikit/content/tests/SimpleTest/SimpleTest.js">
-          </script>
-  <script type="text/javascript" src="CharsetDetectionTests.js"></script>
-  <link rel="stylesheet" type="text/css"
-        href="chrome://mochikit/content/tests/SimpleTest/test.css" />
-</head>
-<body>
-<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=811363">Mozilla Bug 811363</a>
-<p id="display"></p>
-<div id="content" style="display: none">
-</div>
-<iframe id="testframe"></iframe>
-<pre id="test">
-<script class="testbody" type="text/javascript">
-/** Test for Bug 811363 **/
-CharsetDetectionTests("bug811363-invalid-3.text",
-		      "windows-1252",
-		      new Array("ja_parallel_state_machine"));
-</script>
-</pre>
-</body>
-</html>
--- a/extensions/universalchardet/tests/test_bug811363-1-4.html
+++ b/extensions/universalchardet/tests/test_bug811363-1-4.html
@ -1,30 +0,0 @@
-<!DOCTYPE HTML>
-<html>
-<!--
-https://bugzilla.mozilla.org/show_bug.cgi?id=811363
-->
-<head>
-  <title>Test for Bug 811363</title>
-  <script type="text/javascript"
-          src="chrome://mochikit/content/tests/SimpleTest/SimpleTest.js">
-          </script>
-  <script type="text/javascript" src="CharsetDetectionTests.js"></script>
-  <link rel="stylesheet" type="text/css"
-        href="chrome://mochikit/content/tests/SimpleTest/test.css" />
-</head>
-<body>
-<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=811363">Mozilla Bug 811363</a>
-<p id="display"></p>
-<div id="content" style="display: none">
-</div>
-<iframe id="testframe"></iframe>
-<pre id="test">
-<script class="testbody" type="text/javascript">
-/** Test for Bug 811363 **/
-CharsetDetectionTests("bug811363-invalid-4.text",
-		      "windows-1252",
-		      new Array("ja_parallel_state_machine"));
-</script>
-</pre>
-</body>
-</html>
				`@ -1 +0,0 @@`
				`Overlong encodings: <20><> <20><><EFBFBD> <20><><EFBFBD><EFBFBD>`