Backed out changeset 0810ad586986 (bug 1551276) for wpt failures in ar-ISO-8859-6-late.tentative.html on a CLOSED TREE

This commit is contained in:
Oana Pop Rus 2019-12-12 16:38:54 +02:00
parent 0870dac6cd
commit df78d6011c
157 changed files with 2126 additions and 7028 deletions

21
Cargo.lock generated
View File

@ -462,24 +462,6 @@ name = "cfg-if"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "chardetng"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"encoding_rs 0.8.20 (registry+https://github.com/rust-lang/crates.io-index)",
"memchr 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "chardetng_c"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"chardetng 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
"encoding_rs 0.8.20 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "chrono"
version = "0.4.6"
@ -1475,7 +1457,6 @@ dependencies = [
"bitsdownload 0.1.0",
"bookmark_sync 0.1.0",
"cert_storage 0.0.1",
"chardetng_c 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
"cose-c 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
"cubeb-coreaudio 0.1.0 (git+https://github.com/ChunMinChang/cubeb-coreaudio-rs?rev=5fcbd99e1b7356be1efcdc41654d495bd4c71c8c)",
"cubeb-pulse 0.3.0 (git+https://github.com/djg/cubeb-pulse-rs?rev=8069f8f4189982e0b38fa6dc8993dd4fab41f728)",
@ -4757,8 +4738,6 @@ dependencies = [
"checksum cc 1.0.47 (registry+https://github.com/rust-lang/crates.io-index)" = "aa87058dce70a3ff5621797f1506cb837edd02ac4c0ae642b4542dce802908b8"
"checksum cexpr 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "8fc0086be9ca82f7fc89fc873435531cb898b86e850005850de1f820e2db6e9b"
"checksum cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "082bb9b28e00d3c9d39cc03e64ce4cea0f1bb9b3fde493f0cbc008472d22bdf4"
"checksum chardetng 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "19200261d1f12430abce7cafdafbf53ac92c502fbad8d015174883905d87a3bc"
"checksum chardetng_c 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0458f9fd705d9040356a137c5872a0f9ce1d27222d69de6aeeecf00e5e520076"
"checksum chrono 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "45912881121cb26fad7c38c17ba7daa18764771836b34fab7d3fbd93ed633878"
"checksum clang-sys 0.28.1 (registry+https://github.com/rust-lang/crates.io-index)" = "81de550971c976f176130da4b2978d3b524eaa0fd9ac31f3ceb5ae1231fb4853"
"checksum clap 2.31.2 (registry+https://github.com/rust-lang/crates.io-index)" = "f0f16b89cbb9ee36d87483dc939fe9f1e13c05898d56d7b230a0d4dff033a536"

View File

@ -3,7 +3,6 @@
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/* import-globals-from ../../base/content/utilityOverlay.js */
/* import-globals-from ../../../toolkit/mozapps/preferences/fontbuilder.js */
// browser.display.languageList LOCK ALL when LOCKED
@ -30,10 +29,6 @@ Preferences.addAll([
{ id: "intl.charset.fallback.override", type: "string" },
]);
document.getElementById("FallbackGroupbox").hidden = Services.prefs.getBoolPref(
"intl.charset.detector.ng.enabled"
);
var gFontsDialog = {
_selectLanguageGroupPromise: Promise.resolve(),

View File

@ -234,7 +234,7 @@
</groupbox>
<!-- Text Encoding -->
<groupbox id="FallbackGroupbox">
<groupbox>
<label><html:h2 data-l10n-id="fonts-languages-fallback-header"/></label>
<description data-l10n-id="fonts-languages-fallback-desc"/>
<hbox align="center">

View File

@ -1290,8 +1290,7 @@ nsDocShell::GatherCharsetMenuTelemetry() {
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION::RemoteNonTld);
}
break;
case kCharsetFromInitialAutoDetection:
case kCharsetFromFinalAutoDetection:
case kCharsetFromAutoDetection:
// Changing charset on unlabeled doc where chardet fired
if (isFileURL) {
Telemetry::AccumulateCategorical(
@ -1922,8 +1921,7 @@ nsDocShell::GetCharsetAutodetected(bool* aCharsetAutodetected) {
}
int32_t source = doc->GetDocumentCharacterSetSource();
if (source == kCharsetFromInitialAutoDetection ||
source == kCharsetFromFinalAutoDetection ||
if (source == kCharsetFromAutoDetection ||
source == kCharsetFromUserForcedAutoDetection) {
*aCharsetAutodetected = true;
}

View File

@ -10,15 +10,15 @@ function test() {
function afterOpen() {
is(
content.document.documentElement.textContent.indexOf("\u0434"),
content.document.documentElement.textContent.indexOf("\u00A4"),
131,
"Parent doc should be IBM866 initially"
"Parent doc should be windows-1252 initially"
);
is(
content.frames[0].document.documentElement.textContent.indexOf("\u0434"),
content.frames[0].document.documentElement.textContent.indexOf("\u00A4"),
87,
"Child doc should be IBM866 initially"
"Child doc should be windows-1252 initially"
);
}

View File

@ -10,15 +10,15 @@ function test() {
function afterOpen() {
is(
content.document.documentElement.textContent.indexOf("\u0412"),
content.document.documentElement.textContent.indexOf("\u201A"),
134,
"Parent doc should be IBM866 initially"
"Parent doc should be windows-1252 initially"
);
is(
content.frames[0].document.documentElement.textContent.indexOf("\u0412"),
content.frames[0].document.documentElement.textContent.indexOf("\u201A"),
90,
"Child doc should be IBM866 initially"
"Child doc should be windows-1252 initially"
);
}

View File

@ -10,15 +10,15 @@ function test() {
function afterOpen() {
is(
content.document.documentElement.textContent.indexOf("\u3042"),
content.document.documentElement.textContent.indexOf("\u001B"),
136,
"Parent doc should be ISO-2022-JP initially"
"Parent doc should be windows-1252 initially"
);
is(
content.frames[0].document.documentElement.textContent.indexOf("\u3042"),
content.frames[0].document.documentElement.textContent.indexOf("\u001B"),
92,
"Child doc should be ISO-2022-JP initially"
"Child doc should be windows-1252 initially"
);
}

View File

@ -10,15 +10,15 @@ function test() {
function afterOpen() {
is(
content.document.documentElement.textContent.indexOf("\u0434"),
content.document.documentElement.textContent.indexOf("\u00A4"),
131,
"Parent doc should be IBM866 initially"
"Parent doc should be windows-1252 initially"
);
is(
content.frames[0].document.documentElement.textContent.indexOf("\u0412"),
content.frames[0].document.documentElement.textContent.indexOf("\u201A"),
90,
"Child doc should be IBM866 initially"
"Child doc should be windows-1252 initially"
);
}

View File

@ -150,6 +150,7 @@
#include "nsIAsyncVerifyRedirectCallback.h"
#include "nsICategoryManager.h"
#include "nsIChannelEventSink.h"
#include "nsICharsetDetectionObserver.h"
#include "nsIConsoleService.h"
#include "nsIContent.h"
#include "nsIContentInlines.h"

View File

@ -288,8 +288,7 @@ void nsHTMLDocument::TryParentCharset(nsIDocShell* aDocShell,
return;
}
if (kCharsetFromParentForced == parentSource ||
kCharsetFromUserForced == parentSource ||
kCharsetFromUserForcedAutoDetection == parentSource) {
kCharsetFromUserForced == parentSource) {
if (WillIgnoreCharsetOverride() ||
!IsAsciiCompatible(aEncoding) || // if channel said UTF-16
!IsAsciiCompatible(parentCharset)) {

View File

@ -22,7 +22,7 @@ https://bugzilla.mozilla.org/show_bug.cgi?id=431054
<script class="testbody" type="text/javascript">
/** Test for Bug 431054 **/
CharsetDetectionTests("bug431054_text.html",
"EUC-JP",
"windows-1252",
["zhtw_parallel_state_machine",
"zhcn_parallel_state_machine",
"zh_parallel_state_machine",

View File

@ -21,7 +21,7 @@ https://bugzilla.mozilla.org/show_bug.cgi?id=811363
<script class="testbody" type="text/javascript">
/** Test for Bug 811363 **/
CharsetDetectionTests("bug811363-invalid-1.text",
"windows-1252",
"Shift_JIS",
new Array("ja_parallel_state_machine"));
</script>
</pre>

View File

@ -21,7 +21,7 @@ https://bugzilla.mozilla.org/show_bug.cgi?id=811363
<script class="testbody" type="text/javascript">
/** Test for Bug 811363 **/
CharsetDetectionTests("bug811363-invalid-5.text",
"windows-1251",
"Shift_JIS",
new Array("ja_parallel_state_machine"));
</script>
</pre>

View File

@ -1,137 +0,0 @@
// Copyright 2019 Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// Mostly copied and pasted from
// third_party/rust/chardetng/src/lib.rs , so
// "top-level directory of this distribution" above refers to
// third_party/rust/chardetng/
#ifndef mozilla_EncodingDetector_h
#define mozilla_EncodingDetector_h
#include "mozilla/Encoding.h"
namespace mozilla {
class EncodingDetector;
}; // namespace mozilla
#define CHARDETNG_ENCODING_DETECTOR mozilla::EncodingDetector
#include "chardetng.h"
namespace mozilla {
/**
* A Web browser-oriented detector for guessing what character
* encoding a stream of bytes is encoded in.
*
* The bytes are fed to the detector incrementally using the `feed`
* method. The current guess of the detector can be queried using
* the `guess` method. The guessing parameters are arguments to the
* `guess` method rather than arguments to the constructor in order
* to enable the application to check if the arguments affect the
* guessing outcome. (The specific use case is to disable UI for
* re-running the detector with UTF-8 allowed and the top-level
* domain name ignored if those arguments don't change the guess.)
*/
class EncodingDetector final {
public:
~EncodingDetector() = default;
// Class-specific deallocation: a `delete` expression on an EncodingDetector
// (including the one issued by UniquePtr) hands the storage to
// chardetng_encoding_detector_free() rather than the global operator delete.
static void operator delete(void* aDetector) {
chardetng_encoding_detector_free(
reinterpret_cast<EncodingDetector*>(aDetector));
}
/**
* Creates a new instance of the detector.
*
* The instance is obtained from chardetng_encoding_detector_new() and is
* owned by the returned UniquePtr.
*/
static inline UniquePtr<EncodingDetector> Create() {
UniquePtr<EncodingDetector> detector(chardetng_encoding_detector_new());
return detector;
}
/**
* Inform the detector of a chunk of input.
*
* The byte stream is represented as a sequence of calls to this
* method such that the concatenation of the arguments to this
* method form the byte stream. It does not matter how the application
* chooses to chunk the stream. It is OK to call this method with
* a zero-length byte slice.
*
* The end of the stream is indicated by calling this method with
* `aLast` set to `true`. In that case, the end of the stream is
* considered to occur after the last byte of the `aBuffer` (which
* may be zero-length) passed in the same call. Once this method
* has been called with `aLast` set to `true` this method must not
* be called again.
*
* If you want to perform detection on just the prefix of a longer
* stream, do not pass `aLast=true` after the prefix if the stream
* actually still continues.
*
* Returns `true` if after processing `aBuffer` the stream has
* contained at least one non-ASCII byte and `false` if only
* ASCII has been seen so far.
*
* # Panics
*
* If this method has previously been called with `aLast` set to `true`.
*/
inline bool Feed(Span<const uint8_t> aBuffer, bool aLast) {
return chardetng_encoding_detector_feed(this, aBuffer.Elements(),
aBuffer.Length(), aLast);
}
/**
* Guess the encoding given the bytes pushed to the detector so far
* (via `Feed()`), the top-level domain name from which the bytes were
* loaded, and an indication of whether to consider UTF-8 as a permissible
* guess.
*
* The `aTLD` argument takes the rightmost DNS label of the hostname of the
* host the stream was loaded from in lower-case ASCII form. That is, if
* the label is an internationalized top-level domain name, it must be
* provided in its Punycode form. If the TLD that the stream was loaded
* from is unavailable, an empty `Span` may be passed instead, which is
* equivalent to passing a `Span` for "com".
*
* If the `aAllowUTF8` argument is set to `false`, the return value of
* this method won't be `UTF_8_ENCODING`. When performing detection
* on `text/html` on non-`file:` URLs, Web browsers must pass `false`,
* unless the user has taken a specific contextual action to request an
* override. This way, Web developers cannot start depending on UTF-8
* detection. Such reliance would make the Web Platform more brittle.
*
* Returns the guessed encoding.
*
* # Panics
*
* If `aTLD` contains non-ASCII, period, or upper-case letters. (The panic
* condition is intentionally limited to signs of failing to extract the
* label correctly, failing to provide it in its Punycode form, and failure
* to lower-case it. Full DNS label validation is intentionally not performed
* to avoid panics when the reality doesn't match the specs.)
*/
inline mozilla::NotNull<const mozilla::Encoding*> Guess(
Span<const char> aTLD, bool aAllowUTF8) const {
return WrapNotNull(chardetng_encoding_detector_guess(
this, aTLD.Elements(), aTLD.Length(), aAllowUTF8));
}
private:
// Direct construction and copying are disabled; Create() is the factory
// this class provides for obtaining an instance.
EncodingDetector() = delete;
EncodingDetector(const EncodingDetector&) = delete;
EncodingDetector& operator=(const EncodingDetector&) = delete;
};
}; // namespace mozilla
#endif // mozilla_EncodingDetector_h

18
intl/chardet/moz.build Normal file
View File

@ -0,0 +1,18 @@
# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
# vim: set filetype=python:
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
EXPORTS += [
'nsDetectionConfident.h',
'nsICharsetDetectionObserver.h',
'nsICharsetDetector.h',
'nsIStringCharsetDetector.h',
]
UNIFIED_SOURCES += [
'nsCyrillicDetector.cpp',
]
FINAL_LIBRARY = 'xul'

View File

@ -0,0 +1,55 @@
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef nsCyrillicClass_h__
#define nsCyrillicClass_h__
/* PLEASE DO NOT EDIT THIS FILE DIRECTLY. THIS FILE IS GENERATED BY
GenCyrllicClass found in mozilla/intl/chardet/tools
*/
static const uint8_t KOI8Map[128] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
};
static const uint8_t CP1251Map[128] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 2, 3, 24, 8, 5, 6, 23, 27, 10, 11, 12, 13,
14, 15, 16, 17, 19, 20, 21, 22, 7, 9, 4, 31, 28, 30, 32, 26, 25, 29, 1,
18, 2, 3, 24, 8, 5, 6, 23, 27, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20,
21, 22, 7, 9, 4, 31, 28, 30, 32, 26, 25, 29, 1, 18,
};
static const uint8_t IBM866Map[128] = {
2, 3, 24, 8, 5, 6, 23, 27, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21,
22, 7, 9, 4, 31, 28, 30, 32, 26, 25, 29, 1, 18, 2, 3, 24, 8, 5, 6,
23, 27, 10, 11, 12, 13, 14, 15, 16, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 19, 20, 21, 22, 7, 9, 4, 31, 28, 30, 32, 26, 25, 29, 1, 18, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
static const uint8_t ISO88595Map[128] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 24, 8, 5, 6, 23, 27, 10,
11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 7, 9, 4, 31, 28, 30, 32, 26,
25, 29, 1, 18, 2, 3, 24, 8, 5, 6, 23, 27, 10, 11, 12, 13, 14, 15, 16,
17, 19, 20, 21, 22, 7, 9, 4, 31, 28, 30, 32, 26, 25, 29, 1, 18, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
static const uint8_t MacCyrillicMap[128] = {
2, 3, 24, 8, 5, 6, 23, 27, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21,
22, 7, 9, 4, 31, 28, 30, 32, 26, 25, 29, 1, 18, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
18, 2, 3, 24, 8, 5, 6, 23, 27, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20,
21, 22, 7, 9, 4, 31, 28, 30, 32, 26, 25, 29, 1, 0,
};
#endif

View File

@ -0,0 +1,109 @@
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "nscore.h"
#include "nsCyrillicProb.h"
#include <stdio.h>
#include "nsCOMPtr.h"
#include "nsISupports.h"
#include "nsICharsetDetector.h"
#include "nsICharsetDetectionObserver.h"
#include "nsIStringCharsetDetector.h"
#include "nsCyrillicDetector.h"
//----------------------------------------------------------------------
// Interface nsISupports [implementation]
NS_IMPL_ISUPPORTS(nsCyrXPCOMDetector, nsICharsetDetector)
/**
 * Scores a block of bytes against every candidate charset and then
 * immediately finalizes detection via DataEnd().
 *
 * For each byte, and for each candidate, the byte is mapped to a character
 * class (bytes without the high bit set map to class 0) and the score of the
 * (previous class, current class) pair from gCyrillicProb is added to that
 * candidate's running total.
 *
 * @param aBuf pointer to the bytes to examine
 * @param aLen number of bytes available at aBuf
 */
void nsCyrillicDetector::HandleData(const char* aBuf, uint32_t aLen) {
  // Nothing more to do once a result has been reported.
  if (mDone) {
    return;
  }

  for (uint32_t pos = 0; pos < aLen; ++pos) {
    const char byte = aBuf[pos];
    const bool hasHighBit = (0x80 & byte) != 0;
    const int tableIndex = byte & 0x7F;

    for (unsigned candidate = 0; candidate < mItems; ++candidate) {
      const uint8_t cls =
          hasHighBit ? mCyrillicClass[candidate][tableIndex] : 0;
      NS_ASSERTION(cls <= 32, "illegal character class");
      mProb[candidate] += gCyrillicProb[mLastCls[candidate]][cls];
      mLastCls[candidate] = cls;
    }
  }

  // The decision is made from the first block received only.
  DataEnd();
}
//---------------------------------------------------------------------
#define THRESHOLD_RATIO 1.5f
/**
 * Finalizes detection: reports the candidate charset with the highest
 * accumulated score through Report() and marks the detector as done.
 *
 * Does nothing if a result was already reported. If every score is still
 * zero (no 8-bit data was scored), nothing is reported and the detector
 * stays open for more input. On a tie the candidate with the lowest index
 * wins, because only a strictly greater score replaces the current best.
 */
void nsCyrillicDetector::DataEnd() {
  if (mDone) {
    return;
  }

  uint32_t best = 0;
  uint8_t bestIdx = 0;
  for (uint8_t j = 0; j < mItems; j++) {
    if (mProb[j] > best) {
      best = mProb[j];
      bestIdx = j;
    }
  }

  if (0 == best) {  // we didn't get any 8-bit data
    return;
  }

#ifdef DEBUG
  // mProb holds unsigned 32-bit totals, so print them as unsigned; the
  // previous "%d" conversion did not match the argument type.
  for (uint8_t j = 0; j < mItems; j++) {
    printf("Charset %s->\t%u\n", mCharsets[j],
           static_cast<unsigned>(mProb[j]));
  }
#endif

  this->Report(mCharsets[bestIdx]);
  mDone = true;
}
//---------------------------------------------------------------------
/**
 * Forwards the candidate tables to nsCyrillicDetector and starts out with
 * no observer attached.
 */
nsCyrXPCOMDetector::nsCyrXPCOMDetector(uint8_t aItems,
                                       const uint8_t** aCyrillicClass,
                                       const char** aCharsets)
    : nsCyrillicDetector(aItems, aCyrillicClass, aCharsets),
      mObserver(nullptr) {}
//---------------------------------------------------------------------
// No explicit cleanup is performed here; members release themselves.
nsCyrXPCOMDetector::~nsCyrXPCOMDetector() = default;
//---------------------------------------------------------------------
/**
 * Stores the observer that Report() will notify.
 *
 * @param aObserver observer to attach; must not be null
 * @return NS_ERROR_ILLEGAL_VALUE when aObserver is null, NS_OK otherwise
 */
NS_IMETHODIMP nsCyrXPCOMDetector::Init(nsICharsetDetectionObserver* aObserver) {
  NS_ASSERTION(mObserver == nullptr, "Init twice");

  if (!aObserver) {
    return NS_ERROR_ILLEGAL_VALUE;
  }

  mObserver = aObserver;
  return NS_OK;
}
//----------------------------------------------------------
/**
 * Feeds one block of bytes to the detector.
 *
 * @param aBuf bytes to examine; must not be null
 * @param aLen number of bytes at aBuf
 * @param oDontFeedMe out-parameter, always set to false on success; must
 *                    not be null
 * @return NS_ERROR_ILLEGAL_VALUE when either pointer is null, else NS_OK
 */
NS_IMETHODIMP nsCyrXPCOMDetector::DoIt(const char* aBuf, uint32_t aLen,
                                       bool* oDontFeedMe) {
  NS_ASSERTION(mObserver != nullptr, "have not init yet");

  const bool argsValid = aBuf != nullptr && oDontFeedMe != nullptr;
  if (!argsValid) {
    return NS_ERROR_ILLEGAL_VALUE;
  }

  HandleData(aBuf, aLen);
  *oDontFeedMe = false;
  return NS_OK;
}
//----------------------------------------------------------
/**
 * Signals the end of input by running DataEnd().
 *
 * @return always NS_OK
 */
NS_IMETHODIMP nsCyrXPCOMDetector::Done() {
  NS_ASSERTION(mObserver != nullptr, "have not init yet");

  DataEnd();

  return NS_OK;
}
//----------------------------------------------------------
void nsCyrXPCOMDetector::Report(const char* aCharset) {
NS_ASSERTION(mObserver != nullptr, "have not init yet");
mObserver->Notify(aCharset, eBestAnswer);
}

View File

@ -0,0 +1,77 @@
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef nsCyrillicDetector_h__
#define nsCyrillicDetector_h__
#include "nsCyrillicClass.h"
#include "nsIStringCharsetDetector.h"
// Candidate tables used by the Cyrillic detectors below. The three arrays
// are parallel: entry i of gCyrillicCls is the byte-to-character-class map
// for the charset named by entry i of gRussian / gUkrainian, so the order of
// the entries must be kept in sync across all three.

// Per-charset character-class maps (defined in nsCyrillicClass.h).
static const uint8_t* gCyrillicCls[5] = {CP1251Map, KOI8Map, ISO88595Map,
MacCyrillicMap, IBM866Map};
// Charset names reported by nsRUProbDetector.
static const char* gRussian[5] = {"windows-1251", "KOI8-R", "ISO-8859-5",
"x-mac-cyrillic", "IBM866"};
// Charset names reported by nsUKProbDetector; differs from gRussian only in
// the second entry ("KOI8-U" instead of "KOI8-R").
static const char* gUkrainian[5] = {"windows-1251", "KOI8-U", "ISO-8859-5",
"x-mac-cyrillic", "IBM866"};
// Capacity of the per-candidate score and state arrays in nsCyrillicDetector.
#define NUM_CYR_CHARSET 5

/**
 * Base class that scores incoming bytes against a set of candidate charsets
 * and reports the best-scoring one through the pure virtual Report() hook.
 *
 * The caller supplies two parallel tables: one byte-to-character-class map
 * per candidate, and the matching charset name per candidate. The tables are
 * not copied; only the pointers are stored.
 */
class nsCyrillicDetector {
 public:
  /**
   * @param aItems number of candidates in the two tables. Values larger than
   *               NUM_CYR_CHARSET are clamped to NUM_CYR_CHARSET so that the
   *               fixed-size score/state arrays can never be indexed out of
   *               bounds.
   * @param aCyrillicClass table of aItems character-class maps
   * @param aCharsets table of aItems charset names, parallel to
   *                  aCyrillicClass
   */
  nsCyrillicDetector(uint8_t aItems, const uint8_t** aCyrillicClass,
                     const char** aCharsets)
      : mDone(false),
        mItems(aItems <= NUM_CYR_CHARSET
                   ? aItems
                   : static_cast<uint8_t>(NUM_CYR_CHARSET)),
        mCyrillicClass(aCyrillicClass),
        mCharsets(aCharsets),
        mProb{},      // zero every slot, not just the first mItems
        mLastCls{} {}

  virtual ~nsCyrillicDetector() = default;

  // Scores a block of input bytes (defined out of line).
  virtual void HandleData(const char* aBuf, uint32_t aLen);
  // Finalizes detection and triggers Report() (defined out of line).
  virtual void DataEnd();

 protected:
  // Receives the name of the winning charset; implemented by subclasses.
  virtual void Report(const char* aCharset) = 0;

  // Set once a result has been reported.
  bool mDone;

 private:
  uint8_t mItems;                     // number of active candidates
  const uint8_t** mCyrillicClass;     // per-candidate byte -> class maps
  const char** mCharsets;             // per-candidate charset names
  uint32_t mProb[NUM_CYR_CHARSET];    // per-candidate accumulated score
  uint8_t mLastCls[NUM_CYR_CHARSET];  // per-candidate previous char class
};
// Exposes nsCyrillicDetector through the nsICharsetDetector interface: input
// arrives via DoIt()/Done(), and the result is delivered to the observer
// registered with Init().
class nsCyrXPCOMDetector : public nsCyrillicDetector,
public nsICharsetDetector {
public:
// nsISupports interface
NS_DECL_ISUPPORTS
// Arguments are forwarded unchanged to the nsCyrillicDetector constructor.
nsCyrXPCOMDetector(uint8_t aItems, const uint8_t** aCyrillicClass,
const char** aCharsets);
// nsICharsetDetector interface
NS_IMETHOD Init(nsICharsetDetectionObserver* aObserver) override;
NS_IMETHOD DoIt(const char* aBuf, uint32_t aLen, bool* oDontFeedMe) override;
NS_IMETHOD Done() override;
protected:
virtual ~nsCyrXPCOMDetector();
// Delivers the detected charset name to mObserver.
virtual void Report(const char* aCharset) override;
private:
// Observer set by Init(); notified from Report().
nsCOMPtr<nsICharsetDetectionObserver> mObserver;
};
// Detector configured with the five class maps in gCyrillicCls and the
// charset names in gRussian.
class nsRUProbDetector final : public nsCyrXPCOMDetector {
public:
nsRUProbDetector() : nsCyrXPCOMDetector(5, gCyrillicCls, gRussian) {}
};
// Detector configured with the five class maps in gCyrillicCls and the
// charset names in gUkrainian.
class nsUKProbDetector final : public nsCyrXPCOMDetector {
public:
nsUKProbDetector() : nsCyrXPCOMDetector(5, gCyrillicCls, gUkrainian) {}
};
#endif

View File

@ -0,0 +1,204 @@
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef nsCyrillicProb_h___h__
#define nsCyrillicProb_h___h__
/*
DO NOT EDIT THIS FILE !!!
This file is generated by the perl script in
mozilla/intl/chardet/tools/gencyrillic.pl
To use that script, you need to grab the StatKoi.pm file from
the "Cyrillic Software Suite" written by John Neystadt.
http://www.neystadt.org/cyrillic (You can also find it on CPAN)
*/
const uint16_t gCyrillicProb[33][33] = {
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
},
{
0, 1, 0, 62, 8, 237, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0,
0, 0, 50, 9, 1342, 0, 5, 10, 0, 0, 16, 2, 0, 2041, 505, 0,
},
{
0, 1197, 0, 891, 3797, 594, 2064, 112, 646, 1039, 166,
152, 3162, 10935, 3465, 10268, 5, 277, 1744, 3706, 5043, 8884,
79, 716, 4563, 0, 0, 3090, 205, 9, 591, 1515, 0,
},
{
0, 206, 1117, 0, 0, 0, 652, 0, 0, 92, 194,
0, 4, 924, 25, 204, 2334, 2, 836, 832, 403, 0,
365, 63, 1, 0, 1257, 5, 9, 0, 358, 0, 629,
},
{
0, 0, 935, 0, 0, 0, 1695, 0, 0, 0, 5193, 0, 5, 1, 1, 0, 461,
0, 0, 0, 0, 0, 216, 0, 9, 0, 47, 0, 0, 0, 0, 0, 0,
},
{
0, 0, 4049, 20, 22, 27, 8713, 0, 49, 0, 1530,
0, 660, 1182, 138, 1459, 5347, 1488, 344, 741, 1738, 63,
1460, 206, 242, 19, 743, 26, 51, 0, 0, 33, 90,
},
{
0, 141, 635, 516, 183, 8332, 911, 108, 2694, 255, 76,
2958, 2366, 8125, 3209, 19276, 285, 346, 483, 6823, 5705, 6596,
45, 1286, 525, 0, 0, 1093, 414, 15, 286, 767, 0,
},
{
0, 0, 272, 0, 0, 0, 376, 50, 0, 0, 803, 0, 0, 15, 2, 28, 591,
0, 0, 6, 2, 24, 19, 0, 0, 7, 31, 0, 0, 0, 0, 0, 0,
},
{
0, 0, 4191, 0, 0, 68, 162, 0, 0, 0, 1248, 0, 8, 369, 0, 12, 15161,
0, 0, 678, 0, 2, 337, 0, 0, 0, 0, 0, 19, 0, 0, 11, 0,
},
{
0, 0, 102, 0, 0, 0, 5, 0, 15, 0, 27, 0, 6, 2, 1, 92, 2227,
0, 0, 101, 161, 7, 15, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
},
{
0, 1245, 609, 755, 2134, 1161, 4628, 120, 151, 2180, 5903,
3242, 2804, 3261, 4656, 3708, 1658, 104, 7815, 882, 3354, 3398,
16, 169, 1769, 0, 0, 5064, 96, 0, 48, 1628, 0,
},
{
0, 0, 0, 0, 0, 1, 3, 3, 0, 0, 0, 0, 6, 0, 12, 96, 67,
1, 0, 0, 2066, 11, 0, 0, 0, 0, 0, 0, 20, 0, 0, 0, 0,
},
{
0, 0, 4402, 0, 677, 0, 782, 0, 2, 0, 2724,
0, 10, 876, 0, 35, 6609, 0, 0, 651, 1323, 1558,
1049, 416, 225, 0, 0, 2, 13, 0, 0, 0, 0,
},
{
0, 741, 5440, 0, 0, 1, 6066, 0, 89, 0, 9040,
0, 153, 97, 4, 949, 9899, 0, 2830, 0, 8, 16,
2139, 434, 0, 7487, 157, 0, 0, 0, 0, 0, 0,
},
{
0, 0, 2073, 13, 0, 0, 4818, 0, 0, 0, 3684,
0, 30, 89, 1094, 204, 4078, 119, 61, 1, 68, 0,
1684, 0, 68, 10, 1424, 0, 0, 0, 14, 6, 0,
},
{
0, 18, 16528, 0, 176, 474, 5075, 174, 31, 0, 14151,
0, 840, 0, 0, 8956, 14457, 0, 911, 0, 1150, 1893,
711, 8, 199, 271, 9281, 192, 0, 0, 2, 84, 0,
},
{
0, 23, 27, 4868, 799, 7820, 1391, 145, 13562, 909, 1551,
5834, 1881, 4400, 6329, 2878, 1911, 3632, 2374, 7308, 8626, 6679,
161, 2573, 15172, 0, 0, 1322, 778, 34, 129, 944, 0,
},
{
0, 0, 671, 0, 12, 0, 2500, 1, 0, 0, 409,
0, 26, 3612, 0, 38, 8786, 268, 87, 13327, 13, 15,
471, 0, 0, 7, 266, 0, 0, 0, 0, 2, 0,
},
{
0, 847, 0, 3, 184, 878, 1070, 0, 19, 482, 0,
90, 18, 26, 765, 151, 0, 0, 18, 20, 81, 2587,
0, 51, 766, 0, 0, 1224, 0, 0, 2209, 20, 0,
},
{
0, 2, 10059, 62, 17, 21, 11067, 6, 2653, 30, 7582,
0, 122, 14, 638, 490, 6767, 9, 1045, 431, 1139, 683,
2482, 326, 496, 156, 938, 0, 254, 0, 0, 30, 0,
},
{
0, 17, 1493, 218, 3, 213, 633, 26, 3, 590, 2176,
0, 3716, 3732, 938, 693, 4388, 1639, 4197, 1185, 2118, 21815,
2792, 0, 1033, 154, 239, 0, 25, 0, 0, 522, 3,
},
{
0, 0, 9785, 0, 27, 197, 8202, 0, 12, 24, 5253,
0, 433, 12, 53, 2577, 9712, 25, 122, 3392, 4966, 4,
836, 0, 8956, 4693, 1483, 5, 3, 0, 0, 270, 3,
},
{
0, 1930, 104, 260, 18, 1452, 325, 6, 1192, 51, 6,
0, 1098, 301, 1778, 398, 0, 2263, 7, 254, 2808, 452,
0, 743, 140, 0, 0, 45, 559, 0, 1336, 2289, 0,
},
{
0, 0, 796, 390, 0, 1303, 3459, 1, 11, 0, 632, 0, 37, 0, 0, 620, 0,
15, 0, 1, 0, 0, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
},
{
0, 0, 7418, 0, 51, 10, 5465, 0, 1, 51, 2962,
0, 999, 3853, 82, 1048, 7277, 241, 370, 394, 280, 286,
1126, 0, 183, 24, 3182, 197, 286, 0, 28, 0, 4,
},
{
0, 395, 0, 6, 22, 0, 496, 9, 113, 0, 700,
0, 171, 0, 78, 3296, 0, 0, 1501, 0, 1379, 193,
0, 0, 0, 0, 0, 487, 165, 0, 1633, 30, 0,
},
{
0, 0, 0, 36, 0, 272, 2847, 0, 27, 4998, 1,
1192, 33, 224, 2657, 219, 0, 363, 29, 273, 205, 503,
0, 0, 400, 0, 0, 38, 255, 0, 0, 305, 0,
},
{
0, 0, 7005, 32, 32, 869, 400, 0, 37, 0, 999,
0, 46, 204, 739, 1570, 1076, 0, 112, 89, 0, 1,
430, 1, 1191, 3, 368, 0, 0, 0, 0, 2, 77,
},
{
0, 0, 200, 0, 0, 0, 2054, 0, 0, 0, 397, 0, 19, 438, 0, 108, 0,
0, 0, 4, 0, 112, 3, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0,
},
{
0, 0, 0, 0, 0, 0, 0, 29, 0, 0, 0, 0, 311, 16, 19, 11, 0,
2, 0, 10, 3, 1382, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0,
},
{
0, 0, 297, 0, 0, 0, 4290, 0, 0, 0, 3968, 0, 0, 0, 0, 33, 0,
0, 0, 1, 0, 0, 70, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0,
},
{
0, 0, 2304, 0, 0, 0, 4731, 0, 0, 0, 1873, 0, 198, 33, 0, 921, 0,
0, 0, 191, 0, 114, 134, 0, 2, 12, 0, 0, 7, 0, 0, 0, 0,
},
{
0, 0, 0, 0, 0, 0, 599, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 207, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
},
};
#endif

View File

@ -0,0 +1,43 @@
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef nsDetetctionConfident_h__
#define nsDetetctionConfident_h__
/*
This type is used to indicate how confident the detection module is about
the returned result.
eNoAnswerYet is used to indicate that the detector has not found an
answer yet based on the data it has received.
eBestAnswer is used to indicate that the answer the detector returned
is the best one within the knowledge of the detector.
In other words, the tests for all other candidates failed.
For example, the (Shift_JIS/EUC-JP/ISO-2022-JP) detection
module may return this with the answer "Shift_JIS" if it receives
bytes > 0x80 (which make the ISO-2022-JP test fail) and byte
0x82 (which makes the EUC-JP test fail).
eSureAnswer is used to indicate that the detector is 100% sure about the
answer.
Example 1: the Shift_JIS/ISO-2022-JP/EUC-JP detector returns
this with ISO-2022-JP when it hits one of the following ESC sequences:
ESC ( J
ESC $ @
ESC $ B
Example 2: a detector which can detect UCS2 returns this with UCS2
when the first 2 bytes are a BOM.
Example 3: the Korean detector returns this with ISO-2022-KR when it
hits ESC $ ) C
*/
/* Confidence level attached to a charset detection result. The numeric
   values are spelled out explicitly; they match the implicit numbering of
   the enumerators in declaration order. */
typedef enum {
  eNoAnswerYet = 0,  /* no answer could be derived from the data so far */
  eBestAnswer = 1,   /* best candidate within the detector's knowledge */
  eSureAnswer = 2,   /* the detector is certain about the answer */
  eNoAnswerMatch = 3 /* NOTE(review): presumably "no candidate matched";
                        not described in this header - confirm */
} nsDetectionConfident;
#endif /* nsDetetctionConfident_h__ */

View File

@ -0,0 +1,32 @@
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef nsICDETObserver_h__
#define nsICDETObserver_h__
#include "nsISupports.h"
#include "nsDetectionConfident.h"
// {12BB8F12-2389-11d3-B3BF-00805F8A6670}
#define NS_ICHARSETDETECTIONOBSERVER_IID \
{ \
0x12bb8f12, 0x2389, 0x11d3, { \
0xb3, 0xbf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 \
} \
}
/*
Callback interface through which an nsICharsetDetector delivers its answer.
*/
class nsICharsetDetectionObserver : public nsISupports {
public:
NS_DECLARE_STATIC_IID_ACCESSOR(NS_ICHARSETDETECTIONOBSERVER_IID)
/*
Called with the detection result.
aCharset - name of the detected charset
aConf - how confident the detector is about aCharset
*/
NS_IMETHOD Notify(const char* aCharset, nsDetectionConfident aConf) = 0;
};
NS_DEFINE_STATIC_IID_ACCESSOR(nsICharsetDetectionObserver,
NS_ICHARSETDETECTIONOBSERVER_IID)
#endif /* nsICDETObserver_h__ */

View File

@ -0,0 +1,51 @@
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef nsICharsetDetector_h__
#define nsICharsetDetector_h__
#include "nsISupports.h"
class nsICharsetDetectionObserver;
// {12BB8F14-2389-11d3-B3BF-00805F8A6670}
#define NS_ICHARSETDETECTOR_IID \
{ \
0x12bb8f14, 0x2389, 0x11d3, { \
0xb3, 0xbf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 \
} \
}
// Block-based charset detector interface: bytes are fed in with DoIt() and
// the answer is delivered to the observer supplied to Init().
class nsICharsetDetector : public nsISupports {
public:
NS_DECLARE_STATIC_IID_ACCESSOR(NS_ICHARSETDETECTOR_IID)
/*
Set up the observer so the detector knows where to send its answer.
*/
NS_IMETHOD Init(nsICharsetDetectionObserver* observer) = 0;
/*
Feed a block of bytes to the detector.
It will call the Notify function of the nsICharsetDetectionObserver if it
finds the answer.
aBytesArray - array of bytes
aLen - length of aBytesArray
oDontFeedMe - set to true if the detector does not need any following
blocks, false if it needs more bytes.
This is used to enhance performance.
*/
NS_IMETHOD DoIt(const char* aBytesArray, uint32_t aLen,
bool* oDontFeedMe) = 0;
/*
Tells the detector that the input has ended; this is its last chance to
make a decision.
*/
NS_IMETHOD Done() = 0;
};
NS_DEFINE_STATIC_IID_ACCESSOR(nsICharsetDetector, NS_ICHARSETDETECTOR_IID)
#endif /* nsICharsetDetector_h__ */

View File

@ -0,0 +1,43 @@
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef nsIStringCharsetDetector_h__
#define nsIStringCharsetDetector_h__
#include "nsISupports.h"
#include "nsDetectionConfident.h"
// {12BB8F15-2389-11d3-B3BF-00805F8A6670}
#define NS_ISTRINGCHARSETDETECTOR_IID \
{ \
0x12bb8f15, 0x2389, 0x11d3, { \
0xb3, 0xbf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 \
} \
}
/*
This interface is similar to nsICharsetDetector.
The difference is that it is for line-based detection instead of block-based
detection.
*/
class nsIStringCharsetDetector : public nsISupports {
public:
NS_DECLARE_STATIC_IID_ACCESSOR(NS_ISTRINGCHARSETDETECTOR_IID)
/*
Perform the charset detection.
aBytesArray - the bytes
aLen - the length of the bytes
oCharset - the charset answer
oConfident - the confidence of the answer
*/
NS_IMETHOD DoIt(const char* aBytesArray, uint32_t aLen, const char** oCharset,
nsDetectionConfident& oConfident) = 0;
};
NS_DEFINE_STATIC_IID_ACCESSOR(nsIStringCharsetDetector,
NS_ISTRINGCHARSETDETECTOR_IID)
#endif /* nsIStringCharsetDetector_h__ */

View File

@ -0,0 +1,118 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "nsICharsetConverterManager.h"
#include <iostream.h>
#include "nsISupports.h"
#include "nsIUnicodeDecoder.h"
#include "nsIUnicodeEncoder.h"
#include "nsCRT.h"
#include <stdio.h>
#include <stdlib.h>
#if defined(XP_WIN)
# include <io.h>
#endif
#ifdef XP_UNIX
# include <unistd.h>
#endif
//---------------------------------------------------------------------------
// Writes the opening include guard and the "generated file" banner of
// nsCyrillicClass.h to stdout.
void header() {
  // The banner is data, not a format string: keep it const and emit it with
  // fputs so that a future '%' in the text cannot be misread by printf.
  static const char kBanner[] =
      "#ifndef nsCyrillicClass_h__\n"
      "#define nsCyrillicClass_h__\n"
      "/* PLEASE DO NOT EDIT THIS FILE DIRECTLY. THIS FILE IS GENERATED BY \n"
      " GenCyrllicClass found in mozilla/intl/chardet/tools\n"
      " */\n";
  fputs(kBanner, stdout);
}
//---------------------------------------------------------------------------
// Writes the closing line of the include guard to stdout.
void footer() {
  fputs("#endif\n", stdout);
}
//---------------------------------------------------------------------------
// Writes the editor mode line and the MPL 2.0 license header to stdout.
void npl() {
  // Emitted with fputs rather than printf: the text is plain data and must
  // never be interpreted as a format string.
  static const char kLicense[] =
      "/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 "
      "-*- */\n"
      "/* This Source Code Form is subject to the terms of the Mozilla Public\n"
      " * License, v. 2.0. If a copy of the MPL was not distributed with this\n"
      " * file, You can obtain one at http://mozilla.org/MPL/2.0/. */\n";
  fputs(kLicense, stdout);
}
//---------------------------------------------------------------------------
static nsIUnicodeEncoder* gKOI8REncoder = nullptr;
static nsICharsetConverterManager* gCCM = nullptr;
//---------------------------------------------------------------------------
// Classifies one byte of a Cyrillic charset.
// The byte is decoded with |decoder|, upper-cased, and re-encoded with the
// global KOI8-R encoder. If that yields a single byte in 0xE0..0xFF the
// result is that byte minus 0xDF (i.e. 1..32); in every other case the
// result is 0.
uint8_t CyrillicClass(nsIUnicodeDecoder* decoder, uint8_t byte) {
  char16_t unicode[2];
  uint8_t koi8[2];
  int32_t srcLen = 1;
  int32_t unicodeLen = 1;

  nsresult rv = decoder->Convert((char*)&byte, &srcLen, unicode, &unicodeLen);
  if (NS_FAILED(rv) || unicodeLen != 1) {
    return 0;
  }

  unicode[0] = nsCRT::ToUpper(unicode[0]);
  int32_t koi8Len = 1;
  rv = gKOI8REncoder->Convert(unicode, &unicodeLen, (char*)koi8, &koi8Len);
  if (NS_FAILED(rv) || koi8Len != 1) {
    return 0;
  }

  if (koi8[0] < 0xe0) {
    return 0;
  }
  return koi8[0] - (uint8_t)0xdf;
}
//---------------------------------------------------------------------------
void genCyrillicClass(const char* name, const char* charset) {
nsIUnicodeDecoder* decoder = nullptr;
nsresult res = NS_OK;
nsAutoString str(charset);
res = gCCM->GetUnicodeDecoder(&str, &decoder);
if (NS_FAILED(res)) {
printf("cannot locate %s Decoder\n", charset);
return;
}
printf("static const uint8_t %sMap [128] = {\n", name);
uint8_t i, j;
for (i = 0x80; i != 0x00; i += 0x10) {
for (j = 0; j <= 0x0f; j++) {
uint8_t cls = CyrillicClass(decoder, i + j);
printf(" %2d, ", cls);
}
printf("\n");
}
printf("};\n");
NS_IF_RELEASE(decoder);
}
//---------------------------------------------------------------------------
int main(int argc, char** argv) {
nsresult res = nullptr;
nsCOMPtr<nsICharsetConverterManager> gCCM =
do_GetService(kCharsetConverterManagerCID, &res);
if (NS_FAILED(res) && (nullptr != gCCM)) {
printf("cannot locate CharsetConverterManager\n");
return (-1);
}
nsAutoString koi8r("KOI8-R");
res = gCCM->GetUnicodeEncoder(&koi8r, &gKOI8REncoder);
if (NS_FAILED(res) && (nullptr != gKOI8REncoder)) {
printf("cannot locate KOI8-R Encoder\n");
return (-1);
}
npl();
header();
genCyrillicClass("KOI8", "KOI8-R");
genCyrillicClass("CP1251", "windows-1251");
genCyrillicClass("IBM866", "IBM866");
genCyrillicClass("ISO88595", "ISO-8859-5");
genCyrillicClass("MacCyrillic", "x-mac-cyrillic");
footer();
NS_IF_RELEASE(gKOI8REncoder);
return (0);
};

View File

@ -0,0 +1,50 @@
#!/usr/bin/perl
#!/usr/bin/perl
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Accumulates counts of two-byte sequences.
#
# 1. Loads existing "key count" lines from the file named by $ARGV[0] into
#    %count.
# 2. Scans STDIN line by line; every pair of consecutive bytes that are both
#    greater than 0xa0 increments $count{pair}, and the running count is
#    printed immediately.
# 3. Finally prints the whole table sorted by key.
open (STAT,$ARGV[0]) || die " cannot open data file $ARGV[0]\n";
# NOTE(review): this statement has no effect; the data lives in the hash
# %count, not in an array @count.
@count;
while(<STAT>)
{
# first whitespace-separated field is the key, second is its count
@k = split(/\s+/, $_);
$count{$k[0]} = $k[1];
}
# NOTE(review): the scalar $count is not read anywhere in this script.
$count = 0;
while(<STDIN>)
{
# the empty-matching pattern splits the line into single characters and
# drops whitespace
@ck = split /\s*/, $_;
# $s is the scanner state, reset for every line:
#   0 - waiting for a lead byte
#   1 - saw a byte in 0x81..0xa0, or a trail byte <= 0xa0; nothing below
#       leaves this state, so the rest of the line is ignored
#   2 - lead byte (> 0xa0) stored in $fb, waiting for the trail byte
$s = 0;
$fb = 0;
# NOTE(review): $cl is the LAST INDEX, and the loop runs while $j < $cl, so
# the final character of the line is never examined - confirm this is
# intended.
$cl = $#ck;
$j = 0;
while($j < $cl) {
$cc = unpack("C", $ck[$j]);
if(0 eq $s ) {
# bytes <= 0x80 keep the scanner in state 0
if($cc > 0x80) {
if($cc > 0xa0) {
$fb = $ck[$j];
$s = 2;
} else {
$s = 1;
}
}
} elsif (1 eq $s) {
# intentionally empty: see the description of state 1 above
} else {
if($cc > 0xa0) {
# complete pair: count it, report the running total, start over
$fb .= $ck[$j];
$count{$fb}++;
print $fb . " " .$count{$fb} . "\n";
$s = 0;
} else {
$s = 1;
}
}
$j = $j + 1;
}
}
# dump the merged table, sorted by key
foreach $c (sort(keys( %count )))
{
print $c . " ". $count{$c} . "\n";
}

View File

@ -0,0 +1,95 @@
#!/usr/bin/perl
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Returns the editor mode line and MPL 2.0 license comment that is placed at
# the top of the generated output.
sub GenNPL {
 return <<"END_NPL";
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
END_NPL
}
# Reads "pair count" lines from STDIN, where pair is a two-byte sequence, and
# prints a C initializer containing, for lead and trail bytes 0xA1..0xFE:
# the relative frequency of each byte value, the standard deviation and mean
# of those frequencies, and the lead/trail weights.
print GenNPL();
$total=0;
while(<STDIN>)
{
 @k = split(/\s+/, $_);
 @i = unpack("CCCC", $k[0]);
# printf("%x %x %s",$i[0] , $i[1] , "[" . $k[0] . "] " . $i . " " . $j . " " . $k[1] ."\n");
 # only pairs whose two bytes are both >= 0xA1 contribute
 if((0xA1 <= $i[0]) && (0xA1 <= $i[1])){
  $total += $k[1];
  $v = $i[0] - 0x00A1;
  $h[$v] += $k[1];
  $u = $i[1] - 0x00A1;
  $l[$u] += $k[1];
# print "hello $v $h[$v] $u $l[$u]\n";
 }
}
# Every frequency below is a division by $total; fail with a clear message
# instead of Perl's "Illegal division by zero" when the input has no samples.
if($total == 0) {
 die "no usable samples on STDIN: need lines of the form \"<pair> <count>\" with both bytes >= 0xA1\n";
}
$ffh = 0.0;
$ffl = 0.0;
# 0xA1..0xFE inclusive = 94 byte values
for($i=0x00A1;$i< 0x00FF ; $i++)
{
 $fh[$i - 0x00a1] = $h[$i- 0x00a1] / $total;
 $ffh += $fh[$i - 0x00a1];
 $fl[$i - 0x00a1] = $l[$i- 0x00a1] / $total;
 $ffl += $fl[$i - 0x00a1];
}
$mh = $ffh / 94.0;
$ml = $ffl / 94.0;
# sum of squared deviations from the mean
$sumh=0.0;
$suml=0.0;
for($i=0x00A1;$i< 0x00FF ; $i++)
{
 $sh = $fh[$i - 0x00a1] - $mh;
 $sh *= $sh;
 $sumh += $sh;
 $sl = $fl[$i - 0x00a1] - $ml;
 $sl *= $sl;
 $suml += $sl;
}
$sumh /= 94.0;
$suml /= 94.0;
$stdh = sqrt($sumh);
$stdl = sqrt($suml);
print "{\n";
print " {\n";
for($i=0x00A1;$i< 0x00FF ; $i++)
{
 # the last entry (0xFE) is written without a trailing comma
 if($i == 0xfe) {
  printf(" %.6ff \/\/ FreqH[%2x]\n", $fh[$i - 0x00a1] , $i);
 } else {
  printf(" %.6ff, \/\/ FreqH[%2x]\n", $fh[$i - 0x00a1] , $i);
 }
}
print " },\n";
printf ("%.6ff, \/\/ Lead Byte StdDev\n", $stdh);
printf ("%.6ff, \/\/ Lead Byte Mean\n", $mh);
printf ("%.6ff, \/\/ Lead Byte Weight\n", $stdh / ($stdh + $stdl));
print " {\n";
for($i=0x00A1;$i< 0x00FF ; $i++)
{
 if($i == 0xfe) {
  printf(" %.6ff \/\/ FreqL[%2x]\n", $fl[$i - 0x00a1] , $i);
 } else {
  printf(" %.6ff, \/\/ FreqL[%2x]\n", $fl[$i - 0x00a1] , $i);
 }
}
print " },\n";
printf ("%.6ff, \/\/ Trail Byte StdDev\n", $stdl);
printf ("%.6ff, \/\/ Trail Byte Mean\n", $ml);
printf ("%.6ff \/\/ Trial Byte Weight\n", $stdl / ($stdh + $stdl));
print "};\n";

18
intl/chardet/tools/gen.cmd Executable file
View File

@ -0,0 +1,18 @@
REM This Source Code Form is subject to the terms of the Mozilla Public
REM License, v. 2.0. If a copy of the MPL was not distributed with this
REM file, You can obtain one at http://mozilla.org/MPL/2.0/.
REM Regenerates the verifier headers in ..\src by running each generator
REM script and redirecting its standard output to the matching header file.
REM Run from the directory that contains the gen*.pl scripts.
REM NOTE(review): gengb18030.pl is not invoked here - confirm whether its
REM header is meant to be regenerated by this script as well.
perl gencp1252.pl > ..\src\nsCP1252Verifier.h
perl geneucjp.pl > ..\src\nsEUCJPVerifier.h
perl geniso2022jp.pl > ..\src\nsISO2022JPVerifier.h
perl gensjis.pl > ..\src\nsSJISVerifier.h
perl genutf8.pl > ..\src\nsUTF8Verifier.h
perl geneuckr.pl > ..\src\nsEUCKRVerifier.h
perl gengb2312.pl > ..\src\nsGB2312Verifier.h
perl genbig5.pl > ..\src\nsBIG5Verifier.h
perl geneuctw.pl > ..\src\nsEUCTWVerifier.h
perl genucs2be.pl > ..\src\nsUCS2BEVerifier.h
perl genucs2le.pl > ..\src\nsUCS2LEVerifier.h
perl genhz.pl > ..\src\nsHZVerifier.h
perl geniso2022kr.pl > ..\src\nsISO2022KRVerifier.h
perl geniso2022cn.pl > ..\src\nsISO2022CNVerifier.h

View File

@ -0,0 +1,42 @@
#!/usr/local/bin/perl
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Generator for the Big5 verifier tables: defines the byte-class table and the
# state-transition table below and prints the text produced by
# genverifier::GenVerifier on STDOUT.
use strict;
require "genverifier.pm";
use genverifier;
my(@big5_cls);
my(@big5_st);
my($big5_ver);
# Byte classes. Each entry is [ first byte, last byte, class ].
# genverifier::GetClass returns the class of the FIRST entry whose range
# contains the byte, so an entry that overlaps a wider range must be listed
# before that range.
@big5_cls = (
[ 0x00 , 0x00 , 1 ],
[ 0x0e , 0x0f , 0 ],
[ 0x1b , 0x1b , 0 ],
[ 0x01 , 0x3f , 1 ],
[ 0x40 , 0x7e , 2 ],
[ 0x7f , 0x7f , 1 ],
[ 0xff , 0xff , 0 ],
[ 0x80 , 0xa0 , 4 ],
[ 0xa1 , 0xfe , 3 ],
);
package genverifier;
# State transitions: one row per state, one column per byte class (5 here);
# each cell is the next state. genverifier::Gen4BitsState writes the values
# 0, 1 and 2 as eStart, eError and eItsMe.
@big5_st = (
# 0 1 2 3 4
1, 0, 0, 3, 1, # state 0
1, 1, 1, 1, 1, # Error State - 1
2, 2, 2, 2, 2, # ItsMe State - 2
1, 1, 0, 0, 0, # state 3
);
# arguments: table name prefix, charset name, class table, number of classes,
# state table
$big5_ver = genverifier::GenVerifier("BIG5", "Big5", \@big5_cls, 5, \@big5_st);
print $big5_ver;

View File

@ -0,0 +1,55 @@
#!/usr/local/bin/perl
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Generator for the windows-1252 verifier tables: defines the byte-class table
# and the state-transition table below and prints the text produced by
# genverifier::GenVerifier on STDOUT.
use strict;
require "genverifier.pm";
use genverifier;
my(@cp1252_cls);
my(@cp1252_st);
my($cp1252_ver);
# Byte classes. Each entry is [ first byte, last byte, class ].
# genverifier::GetClass returns the class of the FIRST entry whose range
# contains the byte, which is why the catch-all [ 0x00 , 0xff , 2 ] is last.
@cp1252_cls = (
[ 0x00 , 0x00 , 1 ],
[ 0x0e , 0x0f , 0 ],
[ 0x1b , 0x1b , 0 ],
[ 0x81 , 0x81 , 0 ],
[ 0x8d , 0x8d , 0 ],
[ 0x8f , 0x8f , 0 ],
[ 0x90 , 0x90 , 0 ],
[ 0x9d , 0x9d , 0 ],
[ 0xc0 , 0xd6 , 1 ],
[ 0xd8 , 0xf6 , 1 ],
[ 0xf8 , 0xff , 1 ],
[ 0x8a , 0x8a , 1 ],
[ 0x8c , 0x8c , 1 ],
[ 0x8e , 0x8e , 1 ],
[ 0x9a , 0x9a , 1 ],
[ 0x9c , 0x9c , 1 ],
[ 0x9e , 0x9e , 1 ],
[ 0x9f , 0x9f , 1 ],
[ 0x00 , 0xff , 2 ],
);
package genverifier;
# State transitions: one row per state, one column per byte class (3 here);
# each cell is the next state. genverifier::Gen4BitsState writes the values
# 0, 1 and 2 as eStart, eError and eItsMe.
@cp1252_st = (
# 0 1 2
1, 3, 0, # Start State - 0
1, 1, 1, # Error State - 1
2, 2, 2, # ItsMe State - 2
1, 4, 0, # State - 3
1, 5, 4, # State - 4
1, 1, 4, # State - 5
);
# arguments: table name prefix, charset name, class table, number of classes,
# state table
$cp1252_ver = genverifier::GenVerifier("CP1252", "windows-1252",
\@cp1252_cls, 3, \@cp1252_st);
print $cp1252_ver;

View File

@ -0,0 +1,65 @@
#!/usr/local/bin/perl
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Generates ../src/nsCyrillicProb.h: a 33x33 table gCyrillicProb whose first
# row and first column are zero and whose remaining cells are the statistics
# from StatKoi's StatsTableKoi for every byte pair in 0xc0..0xdf.
use StatKoi '.' ;
# Output path, relative to the directory this script is run from.
$outfile = "../src/nsCyrillicProb.h";
# Report the file that actually failed to open, together with the OS error.
open(FILE, "> $outfile") or die "cannot open $outfile: $!";
print FILE <<EOF;
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef nsCyrillicDetector_h__
#define nsCyrillicDetector_h__
/*
DO NOT EDIT THIS FILE !!!
This file is generated by the perl script in
mozilla/intl/chardet/tools/gencyrillic.pl
To ues that script, you need to grab StatKoi.pm file from
the "Cyrillic Software Suite" written by John Neystdt.
http://www.neystadt.org/cyrillic (You can also find it from CPAN)
*/
EOF
$table = \%Lingua::DetectCharset::StatKoi::StatsTableKoi;
print FILE "const uint16_t gCyrillicProb[33][33] = {";
# Row 0: one leading zero plus 32 zeros.
print FILE "{ \n";
print FILE "0,\n";
for($j = 0xc0; $j < 0xe0; $j++)
{
 print FILE "0, \t";
 # line break after every eighth value
 if( 7 == ( $j % 8) )
 {
  print FILE "\n";
 }
}
print FILE "\n}, \n";
# Rows 1..32: one per first byte 0xc0..0xdf.
for($i = 0xc0; $i < 0xe0; $i++)
{
 print FILE "{ \n";
 # column 0 is always zero
 print FILE "0,\n";
 for($j = 0xc0; $j < 0xe0; $j++)
 {
  # the statistics table is keyed by the two-character string
  $key = chr($i) . chr($j);
  if(exists($table->{$key}))
  {
   $v = $table->{$key};
  } else {
   $v = 0;
  }
  print FILE $v . ", \t";
  if( 7 == ( $j % 8) )
  {
   print FILE "\n";
  }
 }
 print FILE "\n}, \n";
}
print FILE "};\n";
print FILE "#endif\n";
# Closing explicitly surfaces write errors (e.g. a full disk) that would
# otherwise go unnoticed.
close(FILE) or die "cannot write $outfile: $!";

View File

@ -0,0 +1,47 @@
#!/usr/local/bin/perl
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Generator for the EUC-JP verifier tables: defines the byte-class table and
# the state-transition table below and prints the text produced by
# genverifier::GenVerifier on STDOUT.
use strict;
require "genverifier.pm";
use genverifier;
my(@eucjp_cls);
my(@eucjp_st);
my($eucjp_ver);
# Byte classes. Each entry is [ first byte, last byte, class ].
# genverifier::GetClass returns the class of the FIRST entry whose range
# contains the byte, so the final [ 0x80 , 0xff , 5 ] only catches the high
# bytes that no earlier entry matched.
@eucjp_cls = (
[ 0x0e , 0x0f , 5 ],
[ 0xe0 , 0xfe , 0 ],
[ 0x8e , 0x8e , 1 ],
[ 0xa1 , 0xdf , 2 ],
[ 0x8f , 0x8f , 3 ],
[ 0x01 , 0x1a , 4 ],
[ 0x1c , 0x7f , 4 ],
[ 0x00 , 0x00 , 4 ],
[ 0x1b , 0x1b , 5 ],
[ 0x80 , 0x8d , 5 ],
[ 0xa0 , 0xa0 , 5 ],
[ 0x80 , 0xff , 5 ]
);
package genverifier;
# State transitions: one row per state, one column per byte class (6 here);
# each cell is the next state. genverifier::Gen4BitsState writes the values
# 0, 1 and 2 as eStart, eError and eItsMe.
@eucjp_st = (
# 0 1 2 3 4 5
3, 4, 3, 5, 0, 1, # state 0
1, 1, 1, 1, 1, 1, # Error State - 1
2, 2, 2, 2, 2, 2, # ItsMe State - 2
0, 1, 0, 1, 1, 1, # state 3
1, 1, 0, 1, 1, 1, # state 4
3, 1, 3, 1, 1, 1, # state 5
);
# arguments: table name prefix, charset name, class table, number of classes,
# state table
$eucjp_ver = genverifier::GenVerifier("EUCJP", "EUC-JP", \@eucjp_cls, 6, \@eucjp_st);
print $eucjp_ver;

View File

@ -0,0 +1,42 @@
#!/usr/local/bin/perl
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Generator for the EUC-KR verifier tables: defines the byte-class table and
# the state-transition table below and prints the text produced by
# genverifier::GenVerifier on STDOUT.
use strict;
require "genverifier.pm";
use genverifier;
my(@euckr_cls);
my(@euckr_st);
my($euckr_ver);
# Byte classes. Each entry is [ first byte, last byte, class ].
# genverifier::GetClass returns the class of the FIRST entry whose range
# contains the byte, so the class-3 exceptions (0xad-0xaf, 0xc9) must stay
# ahead of the wider [ 0xa1 , 0xfe , 2 ] range.
@euckr_cls = (
[ 0x00 , 0x00 , 1 ],
[ 0x0e , 0x0f , 0 ],
[ 0x1b , 0x1b , 0 ],
[ 0x01 , 0x7f , 1 ],
[ 0x80 , 0xa0 , 0 ],
[ 0xff , 0xff , 0 ],
[ 0xad , 0xaf , 3 ],
[ 0xc9 , 0xc9 , 3 ],
[ 0xa1 , 0xfe , 2 ],
);
package genverifier;
# State transitions: one row per state, one column per byte class (4 here);
# each cell is the next state. genverifier::Gen4BitsState writes the values
# 0, 1 and 2 as eStart, eError and eItsMe.
@euckr_st = (
# 0 1 2 3
1, 0, 3, 1, # state 0
1, 1, 1, 1, # Error State - 1
2, 2, 2, 2, # ItsMe State - 2
1, 1, 0, 0, # state 3
);
# arguments: table name prefix, charset name, class table, number of classes,
# state table
$euckr_ver = genverifier::GenVerifier("EUCKR", "EUC-KR", \@euckr_cls, 4, \@euckr_st);
print $euckr_ver;

View File

@ -0,0 +1,49 @@
#!/usr/local/bin/perl
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Generator for the EUC-TW verifier tables: defines the byte-class table and
# the state-transition table below and prints the text produced by
# genverifier::GenVerifier on STDOUT.
use strict;
require "genverifier.pm";
use genverifier;
my(@euctw_cls);
my(@euctw_st);
my($euctw_ver);
# Byte classes. Each entry is [ first byte, last byte, class ].
# genverifier::GetClass returns the class of the FIRST entry whose range
# contains the byte, so 0x8e (class 6) must stay ahead of the wider
# [ 0x80 , 0xa0 , 0 ] range.
@euctw_cls = (
[ 0x00 , 0x00 , 2 ],
[ 0x0e , 0x0f , 0 ],
[ 0x1b , 0x1b , 0 ],
[ 0x01 , 0x7f , 2 ],
[ 0x8e , 0x8e , 6 ],
[ 0x80 , 0xa0 , 0 ],
[ 0xff , 0xff , 0 ],
[ 0xa1 , 0xa1 , 3 ],
[ 0xa2 , 0xa7 , 4 ],
[ 0xa8 , 0xa9 , 5 ],
[ 0xaa , 0xc1 , 1 ],
[ 0xc2 , 0xc2 , 3 ],
[ 0xc3 , 0xc3 , 1 ],
[ 0xc4 , 0xfe , 3 ],
);
package genverifier;
# State transitions: one row per state, one column per byte class (7 here);
# each cell is the next state. genverifier::Gen4BitsState writes the values
# 0, 1 and 2 as eStart, eError and eItsMe.
@euctw_st = (
# 0 1 2 3 4 5 6
1, 1, 0, 3, 3, 3, 4, # state 0
1, 1, 1, 1, 1, 1, 1, # Error State - 1
2, 2, 2, 2, 2, 2, 2, # ItsMe State - 2
1, 0, 1, 0, 0, 0, 1, # state 3
1, 1, 1, 1, 5, 1, 1, # state 4
1, 0, 1, 0, 0, 0, 1, # state 5
);
# arguments: table name prefix, charset name, class table, number of classes,
# state table
$euctw_ver = genverifier::GenVerifier("EUCTW", "x-euc-tw", \@euctw_cls, 7, \@euctw_st);
print $euctw_ver;

View File

@ -0,0 +1,44 @@
#!/usr/local/bin/perl
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Generator for the gb18030 verifier tables: defines the byte-class table and
# the state-transition table below and prints the text produced by
# genverifier::GenVerifier on STDOUT.
use strict;
require "genverifier.pm";
use genverifier;
my(@gb18030_cls);
my(@gb18030_st);
my($gb18030_ver);
# Byte classes. Each entry is [ first byte, last byte, class ].
# genverifier::GetClass returns the class of the FIRST entry whose range
# contains the byte, so 0x0e-0x0f, 0x1b and the digits 0x30-0x39 must stay
# ahead of the wider [ 0x00 , 0x3f , 1 ] range.
@gb18030_cls = (
[ 0x0e , 0x0f , 0 ],
[ 0x1b , 0x1b , 0 ],
[ 0x30 , 0x39 , 3 ],
[ 0x00 , 0x3f , 1 ],
[ 0x40 , 0x7e , 2 ],
[ 0x7f , 0x7f , 4 ],
[ 0x80 , 0x80 , 5 ],
[ 0x81 , 0xfe , 6 ],
[ 0xff , 0xff , 0 ],
);
package genverifier;
# State transitions: one row per state, one column per byte class (7 here);
# each cell is the next state. genverifier::Gen4BitsState writes the values
# 0, 1 and 2 as eStart, eError and eItsMe.
@gb18030_st = (
# 0 1 2 3 4 5 6
1, 0, 0, 0, 0, 0, 3, # state 0
1, 1, 1, 1, 1, 1, 1, # Error State - 1
2, 2, 2, 2, 2, 2, 2, # ItsMe State - 2
1, 1, 0, 4, 1, 0, 0, # state 3, multibytes, 1st byte identified
1, 1, 1, 1, 1, 1, 5, # state 4, multibytes, 2nd byte identified
1, 1, 1, 2, 1, 1, 1, # state 5, multibytes, 3rd byte identified
);
# arguments: table name prefix, charset name, class table, number of classes,
# state table
$gb18030_ver = genverifier::GenVerifier("gb18030", "gb18030", \@gb18030_cls, 7, \@gb18030_st);
print $gb18030_ver;

View File

@ -0,0 +1,41 @@
#!/usr/local/bin/perl
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Generator for the GB2312 verifier tables: defines the byte-class table and
# the state-transition table below and prints the text produced by
# genverifier::GenVerifier on STDOUT.
use strict;
require "genverifier.pm";
use genverifier;
my(@gb2312_cls);
my(@gb2312_st);
my($gb2312_ver);
# Byte classes. Each entry is [ first byte, last byte, class ].
# genverifier::GetClass returns the class of the FIRST entry whose range
# contains the byte, so [ 0xaa , 0xaf , 3 ] must stay ahead of the wider
# [ 0xa1 , 0xfe , 2 ] range.
@gb2312_cls = (
[ 0x00 , 0x00 , 1 ],
[ 0x0e , 0x0f , 0 ],
[ 0x1b , 0x1b , 0 ],
[ 0x01 , 0x7f , 1 ],
[ 0x80 , 0xa0 , 0 ],
[ 0xff , 0xff , 0 ],
[ 0xaa , 0xaf , 3 ],
[ 0xa1 , 0xfe , 2 ],
);
package genverifier;
# State transitions: one row per state, one column per byte class (4 here);
# each cell is the next state. genverifier::Gen4BitsState writes the values
# 0, 1 and 2 as eStart, eError and eItsMe.
@gb2312_st = (
# 0 1 2 3
1, 0, 3, 1, # state 0
1, 1, 1, 1, # Error State - 1
2, 2, 2, 2, # ItsMe State - 2
1, 1, 0, 0, # state 3
);
# arguments: table name prefix, charset name, class table, number of classes,
# state table
$gb2312_ver = genverifier::GenVerifier("GB2312", "GB2312", \@gb2312_cls, 4, \@gb2312_st);
print $gb2312_ver;

View File

@ -0,0 +1,57 @@
#!/usr/local/bin/perl
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Generator for the HZ-GB-2312 verifier tables: defines the byte-class table
# and the state-transition table below and prints the text produced by
# genverifier::GenVerifier on STDOUT.
use strict;
require "genverifier.pm";
use genverifier;
my(@hz_cls);
my(@hz_st);
my($hz_ver);
#
# Byte classes:
#
# other 7-bit bytes                - 0
# NUL, SO, SI, ESC, bytes >= 0x80  - 1
# ~  - 2
# LF - 3
# {  - 4
# }  - 5
#
# Each entry is [ first byte, last byte, class ]. genverifier::GetClass
# returns the class of the FIRST entry whose range contains the byte, so the
# single-byte entries for LF and SO/SI have to come BEFORE the 0x01-0x1a
# range. Listed after it they were never reached: LF was classified as 0 and
# state 3 ("got ~") sent the "~ LF" sequence to the error state instead of
# back to start.
@hz_cls = (
[ 0x0a , 0x0a , 3 ],
[ 0x0e , 0x0f , 1 ],
[ 0x01 , 0x1a , 0 ],
[ 0x7e , 0x7e , 2 ],
[ 0x7b , 0x7b , 4 ],
[ 0x7d , 0x7d , 5 ],
[ 0x1c , 0x7f , 0 ],
[ 0x1b , 0x1b , 1 ],
[ 0x00 , 0x00 , 1 ],
[ 0x80 , 0xff , 1 ]
);
#
# State transitions: one row per state, one column per byte class (6 here);
# each cell is the next state. genverifier::Gen4BitsState writes the values
# 0, 1 and 2 as eStart, eError and eItsMe.
#
package genverifier;
@hz_st = (
# 0 1 2 3 4 5
0, 1, 3, 0, 0, 0, # Start State - 0
1, 1, 1, 1, 1, 1, # Error State - 1
2, 2, 2, 2, 2, 2, # ItsMe State - 2
1, 1, 0, 0, 4, 1, # state 3 - got ~
5, 1, 6, 1, 5, 5, # state 4 - got ~ {
4, 1, 4, 1, 4, 4, # state 5 - got ~ { X
4, 1, 4, 1, 4, 2, # state 6 - got ~ { [X X]* ~
);
# arguments: table name prefix, charset name, class table, number of classes,
# state table
$hz_ver = genverifier::GenVerifier("HZ", "HZ-GB-2312",
\@hz_cls, 6, \@hz_st);
print $hz_ver;

View File

@ -0,0 +1,58 @@
#!/usr/local/bin/perl
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Generator for the ISO-2022-CN verifier tables: defines the byte-class table
# and the state-transition table below and prints the text produced by
# genverifier::GenVerifier on STDOUT.
use strict;
require "genverifier.pm";
use genverifier;
my(@iso2022cn_cls);
my(@iso2022cn_st);
my($iso2022cn_ver);
#
# Byte classes:
#
# other 7-bit bytes   - 0
# ESC                 - 1
# NUL, bytes >= 0x80  - 2
# $   - 3
# )   - 4
# *   - 5
# A G - 6
# H   - 7
# N O - 8
#
# Each entry is [ first byte, last byte, class ]. genverifier::GetClass
# returns the class of the FIRST entry whose range contains the byte, so the
# single-byte entries must come before the 0x1c-0x7f range.
#
# The table previously assigned only classes 0-4 (with ')' as 3 and 'C' as
# 4), so none of the documented designations below could be matched by the
# state table, which is written for the nine classes listed above.
@iso2022cn_cls = (
[ 0x01 , 0x1a , 0 ],
[ 0x24 , 0x24 , 3 ],
[ 0x29 , 0x29 , 4 ],
[ 0x2a , 0x2a , 5 ],
[ 0x41 , 0x41 , 6 ],
[ 0x47 , 0x47 , 6 ],
[ 0x48 , 0x48 , 7 ],
[ 0x4e , 0x4f , 8 ],
[ 0x1c , 0x7f , 0 ],
[ 0x1b , 0x1b , 1 ],
[ 0x00 , 0x00 , 2 ],
[ 0x80 , 0xff , 2 ]
);
#
# ESC$((([)][AG])|([*]H))|[NO])
#
# State transitions: one row per state, one column per byte class (9 here);
# each cell is the next state. genverifier::Gen4BitsState writes the values
# 0, 1 and 2 as eStart, eError and eItsMe.
#
package genverifier;
@iso2022cn_st = (
# 0 1 2 3 4 5 6 7 8
0, 3, 1, 0, 0, 0, 0, 0, 0, # Start State - 0
1, 1, 1, 1, 1, 1, 1, 1, 1, # Error State - 1
2, 2, 2, 2, 2, 2, 2, 2, 2, # ItsMe State - 2
1, 1, 1, 4, 1, 1, 1, 1, 2, # state 3 - got ESC
1, 1, 1, 1, 5, 6, 1, 1, 1, # state 4 - got ESC $
1, 1, 1, 1, 1, 1, 2, 1, 1, # state 5 - got ESC $ )
1, 1, 1, 1, 1, 1, 1, 2, 1, # state 6 - got ESC $ *
);
# arguments: table name prefix, charset name, class table, number of classes,
# state table
$iso2022cn_ver = genverifier::GenVerifier("ISO2022CN", "ISO-2022-CN",
\@iso2022cn_cls, 9, \@iso2022cn_st);
print $iso2022cn_ver;

View File

@ -0,0 +1,49 @@
#!/usr/local/bin/perl
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Generator for the ISO-2022-JP verifier tables: defines the byte-class table
# and the state-transition table below and prints the text produced by
# genverifier::GenVerifier on STDOUT.
use strict;
require "genverifier.pm";
use genverifier;
my(@iso2022jp_cls);
my(@iso2022jp_st);
my($iso2022jp_ver);
# Byte classes. Each entry is [ first byte, last byte, class ].
# genverifier::GetClass returns the class of the FIRST entry whose range
# contains the byte, so the single-character entries must stay ahead of the
# wider 0x01-0x1a and 0x1c-0x7f ranges.
# 1:ESC 3:'(' 4:'B' 5:'J' 6:'@' 7:'$' 8:'D' 9:'I'
@iso2022jp_cls = (
[ 0x0e , 0x0f , 2 ],
[ 0x28 , 0x28 , 3 ],
[ 0x42 , 0x42 , 4 ],
[ 0x4a , 0x4a , 5 ],
[ 0x40 , 0x40 , 6 ],
[ 0x24 , 0x24 , 7 ],
[ 0x44 , 0x44 , 8 ],
[ 0x49 , 0x49 , 9 ],
[ 0x01 , 0x1a , 0 ],
[ 0x1c , 0x7f , 0 ],
[ 0x1b , 0x1b , 1 ],
[ 0x00 , 0x00 , 2 ],
[ 0x80 , 0xff , 2 ]
);
package genverifier;
# State transitions: one row per state, one column per byte class (10 here);
# each cell is the next state. genverifier::Gen4BitsState writes the values
# 0, 1 and 2 as eStart, eError and eItsMe.
@iso2022jp_st = (
# 0 1 2 3 4 5 6 7 8 9
0, 3, 1, 0, 0, 0, 0, 0, 0, 0, # Start State - 0
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # Error State - 1
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # ItsMe State - 2
1, 1, 1, 5, 1, 1, 1, 4, 1, 1, # got ESC
1, 1, 1, 6, 2, 1, 2, 1, 1, 1, # got ESC $
1, 1, 1, 1, 2, 2, 1, 1, 1, 2, # got ESC (
1, 1, 1, 1, 1, 1, 1, 1, 2, 1, # got ESC $ (
);
# arguments: table name prefix, charset name, class table, number of classes,
# state table
$iso2022jp_ver = genverifier::GenVerifier("ISO2022JP", "ISO-2022-JP",
\@iso2022jp_cls, 10, \@iso2022jp_st);
print $iso2022jp_ver;

View File

@ -0,0 +1,55 @@
#!/usr/local/bin/perl
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Generator for the ISO-2022-KR verifier tables: defines the byte-class table
# and the state-transition table below and prints the text produced by
# genverifier::GenVerifier on STDOUT.
use strict;
require "genverifier.pm";
use genverifier;
my(@iso2022kr_cls);
my(@iso2022kr_st);
my($iso2022kr_ver);
#
# Byte classes (all bytes not listed here fall into class 0; per the table
# below class 2 covers NUL and 0x80-0xff):
#
# ESC - 1
# > 0x80 - 2
# $ - 3
# ) - 4
# C - 5
#
# Each entry is [ first byte, last byte, class ]. genverifier::GetClass
# returns the class of the FIRST entry whose range contains the byte, so the
# single-character entries must stay ahead of the 0x1c-0x7f range.
@iso2022kr_cls = (
[ 0x01 , 0x1a , 0 ],
[ 0x24 , 0x24 , 3 ],
[ 0x29 , 0x29 , 4 ],
[ 0x43 , 0x43 , 5 ],
[ 0x1c , 0x7f , 0 ],
[ 0x1b , 0x1b , 1 ],
[ 0x00 , 0x00 , 2 ],
[ 0x80 , 0xff , 2 ]
);
#
# ESC$)C
#
# State transitions: one row per state, one column per byte class (6 here);
# each cell is the next state. genverifier::Gen4BitsState writes the values
# 0, 1 and 2 as eStart, eError and eItsMe.
package genverifier;
@iso2022kr_st = (
# 0 1 2 3 4 5
0, 3, 1, 0, 0, 0, # Start State - 0
1, 1, 1, 1, 1, 1, # Error State - 1
2, 2, 2, 2, 2, 2, # ItsMe State - 2
1, 1, 1, 4, 1, 1, # state 3 - got ESC
1, 1, 1, 1, 5, 1, # state 4 - got ESC $
1, 1, 1, 1, 1, 2, # state 5 - got ESC $ )
);
# arguments: table name prefix, charset name, class table, number of classes,
# state table
$iso2022kr_ver = genverifier::GenVerifier("ISO2022KR", "ISO-2022-KR",
\@iso2022kr_cls, 6, \@iso2022kr_st);
print $iso2022kr_ver;

View File

@ -0,0 +1,46 @@
#!/usr/local/bin/perl
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Generator for the Shift_JIS verifier tables: defines the byte-class table
# and the state-transition table below and prints the text produced by
# genverifier::GenVerifier on STDOUT.
use strict;
require "genverifier.pm";
use genverifier;
my(@sjis_cls);
my(@sjis_st);
my($sjis_ver);
# Byte classes. Each entry is [ first byte, last byte, class ].
# genverifier::GetClass returns the class of the FIRST entry whose range
# contains the byte, so an entry that overlaps a wider range must be listed
# before that range.
@sjis_cls = (
[ 0x00 , 0x00 , 0 ],
[ 0x0e , 0x0f , 0 ],
[ 0x1b , 0x1b , 0 ],
[ 0xfd , 0xff , 0 ],
[ 0x85 , 0x86 , 3 ],
[ 0xeb , 0xec , 5 ],
[ 0x01 , 0x1a , 1 ],
[ 0x1c , 0x3f , 1 ],
[ 0x7f , 0x7f , 1 ],
[ 0x40 , 0x7e , 2 ],
[ 0xa1 , 0xdf , 2 ],
[ 0x80 , 0x9f , 3 ],
[ 0xa0 , 0xa0 , 4 ],
[ 0xe0 , 0xea , 3 ],
[ 0xed , 0xfc , 4 ],
);
package genverifier;
# State transitions: one row per state, one column per byte class (6 here);
# each cell is the next state. genverifier::Gen4BitsState writes the values
# 0, 1 and 2 as eStart, eError and eItsMe.
@sjis_st = (
# 0 1 2 3 4 5
1, 0, 0, 3, 1, 1, # Start State - 0
1, 1, 1, 1, 1, 1, # Error State - 1
2, 2, 2, 2, 2, 2, # ItsMe State - 2
1, 1, 0, 0, 0, 0, # State - 3
);
# arguments: table name prefix, charset name, class table, number of classes,
# state table
$sjis_ver = genverifier::GenVerifier("SJIS", "Shift_JIS", \@sjis_cls, 6, \@sjis_st);
print $sjis_ver;

View File

@ -0,0 +1,189 @@
#!/usr/local/bin/perl
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
use strict;
require "genverifier.pm";
use genverifier;
my(@utf8_cls);
my(@utf8_st);
my($utf8_ver);
#
#
# UTF8 encode the UCS4 into 1 to 4 bytes
#
# 1 byte 00 00 00 00 00 00 00 7f
# 2 bytes 00 00 00 80 00 00 07 ff
# 3 bytes 00 00 08 00 00 00 ff ff
# 4 bytes 00 01 00 00 00 10 ff ff
#
# However, since Surrogate area should not be encoded into UTF8 as
# a Surrogate pair, we can remove the surrogate area from UTF8
#
# 1 byte 00 00 00 00 00 00 00 7f
# 2 bytes 00 00 00 80 00 00 07 ff
# 3 bytes 00 00 08 00 00 00 d7 ff
# 00 00 e0 00 00 00 ff ff
# 4 bytes 00 01 00 00 00 10 ff ff
#
# Now we break them into 6 bits group for 2-4 bytes UTF8
#
# 1 byte 00 7f
# 2 bytes 02 00 1f 3f
# 3 bytes 00 20 00 0d 1f 3f
# 0e 00 00 0f 3f 3f
# 4 bytes 00 10 00 00 04 0f 3f 3f
#
# Break down more
#
# 1 byte 00 7f
# 2 bytes 02 00 1f 3f
# 3 bytes 00 20 00 00 3f 3f
# 01 00 00 0c 3f 3f
# 0d 00 00 0d 1f 3f
# 0e 00 00 0f 3f 3f
# 4 bytes 00 10 00 00 00 3f 3f 3f
# 01 00 00 00 03 3f 3f 3f
# 04 00 00 00 04 0f 3f 3f
#
# Now, add
# c0 to the lead byte of 2 bytes UTF8
# e0 to the lead byte of 3 bytes UTF8
# f0 to the lead byte of 4 bytes UTF8
# 80 to the trail bytes
#
# 1 byte 00 7f
# 2 bytes c2 80 df bf
# 3 bytes e0 a0 80 e0 bf bf
# e1 80 80 ec bf bf
# ed 80 80 ed 9f bf
# ee 80 80 ef bf bf
# 4 bytes f0 90 80 80 f0 bf bf bf
# f1 80 80 80 f3 bf bf bf
# f4 80 80 80 f4 8f bf bf
#
#
# Now we can construct our state diagram
#
# 0:0x0e,0x0f,0x1b->Error
# 0:[0-0x7f]->0
# 0:[c2-df]->3
# 0:e0->4
# 0:[e1-ec, ee-ef]->5
# 0:ed->6
# 0:f0->7
# 0:[f1-f3]->8
# 0:f4->9
# 0:*->Error
# 3:[80-bf]->0
# 3:*->Error
# 4:[a0-bf]->3
# 4:*->Error
# 5:[80-bf]->3
# 5:*->Error
# 6:[80-9f]->3
# 6:*->Error
# 7:[90-bf]->5
# 7:*->Error
# 8:[80-bf]->5
# 8:*->Error
# 9:[80-8f]->5
# 9:*->Error
#
# Now, we classified chars into class
#
# 00,0e,0f,1b:k0
# 01-0d,10-1a,1c-7f:k1
# 80-8f:k2
# 90-9f:k3
# a0-bf:k4
# c0-c1:k0
# c2-df:k5
# e0:k6
# e1-ec:k7
# ed:k8
# ee-ef:k7
# f0:k9
# f1-f3:k10
# f4:k11
# f5-ff:k0
#
# Now, let's put them into array form
# Byte classes k0..k11 from the list above. Each entry is
# [ first byte, last byte, class ]; genverifier::GetClass returns the class of
# the first entry whose range contains the byte. NUL is placed in class 1
# here, together with the other accepted 7-bit bytes.
@utf8_cls = (
[ 0x00 , 0x00 , 1 ],
[ 0x0e , 0x0f , 0 ],
[ 0x1b , 0x1b , 0 ],
[ 0x01 , 0x0d , 1 ],
[ 0x10 , 0x1a , 1 ],
[ 0x1c , 0x7f , 1 ],
[ 0x80 , 0x8f , 2 ],
[ 0x90 , 0x9f , 3 ],
[ 0xa0 , 0xbf , 4 ],
[ 0xc0 , 0xc1 , 0 ],
[ 0xc2 , 0xdf , 5 ],
[ 0xe0 , 0xe0 , 6 ],
[ 0xe1 , 0xec , 7 ],
[ 0xed , 0xed , 8 ],
[ 0xee , 0xef , 7 ],
[ 0xf0 , 0xf0 , 9 ],
[ 0xf1 , 0xf3 , 10 ],
[ 0xf4 , 0xf4 , 11 ],
[ 0xf5 , 0xff , 0 ],
);
#
# Now, we write the state diagram in class
#
# 0:k0->Error
# 0:k1->0
# 0:k5->3
# 0:k6->4
# 0:k7->5
# 0:k8->6
# 0:k9->7
# 0:k10->8
# 0:k11->9
# 0:*->Error
# 3:k2,k3,k4->0
# 3:*->Error
# 4:k4->3
# 4:*->Error
# 5:k2,k3,k4->3
# 5:*->Error
# 6:k2,k3->3
# 6:*->Error
# 7:k3,k4->5
# 7:*->Error
# 8:k2,k3,k4->5
# 8:*->Error
# 9:k2->5
# 9:*->Error
#
# Now, let's put them into array
#
package genverifier;
# State transitions: one row per state, one column per byte class (12 here);
# each cell is the next state. genverifier::Gen4BitsState writes the values
# 0, 1 and 2 as eStart, eError and eItsMe.
#
# State 7 is "lead byte f0 seen". Its second byte must be 90-bf, i.e. class 3
# or 4 ("7:k3,k4->5" above). The row previously accepted classes 2 and 3
# (80-9f), which let the overlong form f0 80..8f through and rejected the
# valid f0 a0..bf.
@utf8_st = (
# 0 1 2 3 4 5 6 7 8 9 10 11
1, 0, 1, 1, 1, 3, 4, 5, 6, 7, 8, 9, # state 0 Start
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 1 Error
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # state 2 ItsMe
1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, # state 3
1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, # state 4
1, 1, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, # state 5
1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, # state 6
1, 1, 1, 5, 5, 1, 1, 1, 1, 1, 1, 1, # state 7
1, 1, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, # state 8
1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, # state 9
);
# arguments: table name prefix, charset name, class table, number of classes,
# state table
$utf8_ver = genverifier::GenVerifier("UTF8", "UTF-8", \@utf8_cls, 12, \@utf8_st);
print $utf8_ver;

View File

@ -0,0 +1,175 @@
#!/usr/local/bin/perl
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
package genverifier;
use strict;
use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION);
use Exporter;
$VERSION = 1.00;
@ISA = qw(Exporter);
@EXPORT = qw(
GenVerifier
);
@EXPORT_OK = qw();
# Returns the editor mode line and MPL 2.0 license comment that opens every
# generated verifier file.
sub GenNPL {
 return <<"END_MPL";
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
END_MPL
}
##--------------------------------------------------------------
# Returns the class of byte value $char according to $clstbl, a reference to
# a list of [ first byte, last byte, class ] entries. The FIRST entry whose
# inclusive range contains $char wins, so the order of the entries matters.
# If no entry matches, a warning is printed and the result of that print
# statement is returned (unchanged legacy behaviour).
sub GetClass {
 my($char, $clstbl) = @_;
 # Iterate over the entries themselves. The previous index loop ran to
 # "$l <= @$clstbl", i.e. one slot past the end, which made Perl autovivify
 # an empty entry in the caller's table on every miss and could "match"
 # byte 0 against that empty entry.
 foreach my $range (@$clstbl) {
  my($first, $last, $cls) = @$range;
  if(($first <= $char) && ($char <= $last))
  {
   return $cls;
  }
 }
 print "WARNING- there are no class for $char\n";
};
##--------------------------------------------------------------
# Returns the packed-table initializer for the class table "<name>_cls".
sub GenClassPkg {
 my $tableName = $_[0];
 my $bitWidth = $_[1];
 return GenPkg($tableName, $bitWidth, "_cls");
}
##--------------------------------------------------------------
# Returns the packed-table initializer for the state table "<name>_st".
sub GenStatePkg {
 my $tableName = $_[0];
 my $bitWidth = $_[1];
 return GenPkg($tableName, $bitWidth, "_st");
};
##--------------------------------------------------------------
# Builds the brace-enclosed initializer that describes one packed table:
# the four e...<bits>bits constants followed by the table identifier
# "<name><tbl>".
sub GenPkg {
 my($name, $bits, $tbl) = @_;
 my @constants = map { $_ . $bits . "bits" }
                 ("eIdxSft", "eSftMsk", "eBitSft", "eUnitMsk");
 return " {" . join(", ", @constants) . ", " . $name . $tbl . " }";
};
##--------------------------------------------------------------
# Returns the C definition of "<name>_cls": the class of every byte value
# 0x00..0xff (looked up with GetClass), written as 32 PCK4BITS(...) groups of
# eight classes each, every group followed by a comment with its byte range.
sub Gen4BitsClass {
 my($name, $clstbl) = @_;
 my $out = "static const uint32_t " . $name . "_cls [ 256 / 8 ] = {\n";
 for(my $base = 0; $base < 0x100; $base += 8) {
  my @cells;
  foreach my $byte ($base .. $base + 7) {
   my $cls = &GetClass($byte, $clstbl);
   push @cells, sprintf("%2d", $cls);
  }
  $out .= "PCK4BITS(" . join(",", @cells);
  # the last group is not followed by a comma
  $out .= ($base + 8 >= 0x100) ? ") " : "),";
  $out .= sprintf(" // %02x - %02x\n", $base, ($base + 7));
 }
 $out .= "};\n";
 return $out;
};
##--------------------------------------------------------------
# Returns the complete text of a generated verifier file:
# license, "do not edit" note, include line, the packed class table, the
# packed state table and the SMModel definition that ties them together.
#   $name    - identifier prefix for the generated tables and model
#   $charset - charset name stored as a string in the model
#   $cls     - reference to the [ first, last, class ] table
#   $numcls  - number of byte classes
#   $st      - reference to the flat state-transition table
sub GenVerifier {
 my($name, $charset, $cls, $numcls, $st) = @_;
 my @parts = (
  GenNPL(),
  GenNote(),
  GenHeader(),
  Gen4BitsClass($name, $cls),
  "\n\n",
  Gen4BitsState($name, $st),
  "\n\n",
  "const SMModel " . $name . "SMModel = {\n",
  GenClassPkg($name, 4),
  ",\n",
  " " . $numcls,
  ",\n",
  GenStatePkg($name, 4),
  ",\n",
  " " . "CHAR_LEN_TABLE(" . $name . "CharLenTable),\n",
  ' "' . $charset . '",' . "\n",
  "};\n",
 );
 return join("", @parts);
};
##--------------------------------------------------------------
# Returns the C definition of "<name>_st": the flat state table written as
# PCK4BITS(...) groups of eight cells. The cell count is rounded up to a
# multiple of eight; cells past the end of $sttbl compare equal to 0 and are
# therefore written as eStart. Values 0, 1 and 2 are written as eStart,
# eError and eItsMe, every other value as its number.
sub Gen4BitsState {
 my($name, $sttbl) = @_;
 my $padded = (((@$sttbl-1) >> 3) + 1) << 3;
 my $out = "static const uint32_t " . $name . "_st [ " . ($padded >> 3) . "] = {\n";
 for(my $base = 0; $base < $padded; $base += 8) {
  my @cells;
  foreach my $idx ($base .. $base + 7) {
   my $state = $sttbl->[$idx];
   if(0 == $state) {
    push @cells, "eStart";
   } elsif(1 == $state) {
    push @cells, "eError";
   } elsif(2 == $state) {
    push @cells, "eItsMe";
   } else {
    push @cells, sprintf(" %d", $state);
   }
  }
  $out .= "PCK4BITS(" . join(",", @cells);
  # the last group is not followed by a comma
  $out .= ($base + 8 >= $padded) ? ") " : "),";
  $out .= sprintf(" // %02x - %02x\n", $base, ($base + 7));
 }
 $out .= "};\n";
 return $out;
};
##--------------------------------------------------------------
# Returns the "do not edit, this file is generated" comment placed in every
# generated verifier file.
sub GenNote {
 return <<"END_NOTE";
/*
* DO NOT EDIT THIS DOCUMENT MANUALLY !!!
* THIS FILE IS AUTOMATICALLY GENERATED BY THE TOOLS UNDER
* mozilla/intl/chardet/tools/
* Please contact ftang\@netscape.com or mozilla-i18n\@mozilla.org
* if you have any question. Thanks
*/
END_NOTE
}
##--------------------------------------------------------------
# Returns the #include line required by the generated tables.
sub GenHeader {
 return <<"END_HEADER";
#include "nsVerifier.h"
END_HEADER
}
##--------------------------------------------------------------
1; # this should be the last line

View File

@ -25,12 +25,10 @@ DIRS += [
EXPORTS.mozilla += [
'Encoding.h',
'EncodingDetector.h',
'JapaneseDetector.h',
]
EXPORTS += [
'../third_party/rust/chardetng_c/include/chardetng.h',
'../third_party/rust/encoding_c/include/encoding_rs.h',
'../third_party/rust/encoding_c/include/encoding_rs_statics.h',
'../third_party/rust/shift_or_euc_c/include/shift_or_euc.h',

View File

@ -4046,37 +4046,12 @@
#---------------------------------------------------------------------------
# Whether ISO-2022-JP is a permitted content-based encoding detection
# outcome in the JapaneseDetector.
# outcome.
- name: intl.charset.detector.iso2022jp.allowed
type: bool
value: true
mirror: always
# Whether the new encoding detector is enabled (except for Japan's ccTLDs).
- name: intl.charset.detector.ng.enabled
type: bool
value: true
mirror: always
# Whether the new encoding detector is enabled for the .jp TLD.
- name: intl.charset.detector.ng.jp.enabled
type: bool
value: false
mirror: always
# Whether the new encoding detector is enabled for the .in TLD.
- name: intl.charset.detector.ng.in.enabled
type: bool
value: false
mirror: always
# Whether the new encoding detector is enabled for the .lk TLD.
- name: intl.charset.detector.ng.lk.enabled
type: bool
value: false
mirror: always
# Whether the TLD is considered if the new encoding detector is disabled.
- name: intl.charset.fallback.tld
type: bool
value: true

View File

@ -95,6 +95,7 @@ FINAL_LIBRARY = 'xul'
LOCAL_INCLUDES += [
'/dom/base',
'/intl/chardet',
]
if CONFIG['CC_TYPE'] in ('clang', 'gcc'):

View File

@ -17,6 +17,7 @@
#include "nsCOMArray.h"
#include "nsContentSink.h"
#include "nsCycleCollectionParticipant.h"
#include "nsDetectionConfident.h"
#include "nsHtml5OwningUTF16Buffer.h"
#include "nsHtml5TreeOpExecutor.h"
#include "nsHtml5StreamParser.h"

View File

@ -9,6 +9,7 @@
#include "mozilla/DebugOnly.h"
#include "mozilla/Encoding.h"
#include "nsContentUtils.h"
#include "nsCyrillicDetector.h"
#include "nsHtml5Tokenizer.h"
#include "nsIHttpChannel.h"
#include "nsHtml5Parser.h"
@ -69,7 +70,7 @@ NS_IMPL_CYCLE_COLLECTING_ADDREF(nsHtml5StreamParser)
NS_IMPL_CYCLE_COLLECTING_RELEASE(nsHtml5StreamParser)
NS_INTERFACE_TABLE_HEAD(nsHtml5StreamParser)
NS_INTERFACE_TABLE(nsHtml5StreamParser, nsISupports)
NS_INTERFACE_TABLE(nsHtml5StreamParser, nsICharsetDetectionObserver)
NS_INTERFACE_TABLE_TO_MAP_SEGUE_CYCLE_COLLECTION(nsHtml5StreamParser)
NS_INTERFACE_MAP_END
@ -83,6 +84,7 @@ NS_IMPL_CYCLE_COLLECTION_UNLINK_BEGIN(nsHtml5StreamParser)
tmp->mExecutorFlusher = nullptr;
tmp->mLoadFlusher = nullptr;
tmp->mExecutor = nullptr;
NS_IMPL_CYCLE_COLLECTION_UNLINK(mChardet)
NS_IMPL_CYCLE_COLLECTION_UNLINK_END
NS_IMPL_CYCLE_COLLECTION_TRAVERSE_BEGIN(nsHtml5StreamParser)
@ -99,6 +101,11 @@ NS_IMPL_CYCLE_COLLECTION_TRAVERSE_BEGIN(nsHtml5StreamParser)
NS_CYCLE_COLLECTION_NOTE_EDGE_NAME(cb, "mLoadFlusher->mExecutor");
cb.NoteXPCOMChild(static_cast<nsIContentSink*>(tmp->mExecutor));
}
// hack: count self if held by mChardet
if (tmp->mChardet) {
NS_CYCLE_COLLECTION_NOTE_EDGE_NAME(cb, "mChardet->mObserver");
cb.NoteXPCOMChild(static_cast<nsICharsetDetectionObserver*>(tmp));
}
NS_IMPL_CYCLE_COLLECTION_TRAVERSE_END
class nsHtml5ExecutorFlusher : public Runnable {
@ -151,7 +158,6 @@ nsHtml5StreamParser::nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor,
mCharsetSource(kCharsetUninitialized),
mEncoding(WINDOWS_1252_ENCODING),
mFeedChardet(true),
mGuessEncoding(true),
mReparseForbidden(false),
mLastBuffer(nullptr), // Will be filled when starting
mExecutor(aExecutor),
@ -178,10 +184,9 @@ nsHtml5StreamParser::nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor,
mLoadFlusher(new nsHtml5LoadFlusher(aExecutor)),
mJapaneseDetector(mozilla::JapaneseDetector::Create(
StaticPrefs::intl_charset_detector_iso2022jp_allowed())),
mUseJapaneseDetector(false),
mInitialEncodingWasFromParentFrame(false),
mHasHadErrors(false),
mDecodingLocalFileWithoutTokenizing(false),
mDecodingLocalFileAsUTF8(false),
mFlushTimer(NS_NewTimer(mEventTarget)),
mFlushTimerMutex("nsHtml5StreamParser mFlushTimerMutex"),
mFlushTimerArmed(false),
@ -202,6 +207,24 @@ nsHtml5StreamParser::nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor,
mTreeBuilder->EnableViewSource(highlighter); // doesn't own
}
// Chardet instantiation adapted from File.
// Chardet is initialized here even if it turns out to be useless
// to make the chardet refcount its observer (nsHtml5StreamParser)
// on the main thread.
nsAutoCString detectorName;
Preferences::GetLocalizedCString("intl.charset.detector", detectorName);
if (!detectorName.IsEmpty()) {
// We recognize one of the two magic strings for Russian and Ukrainian.
if (detectorName.EqualsLiteral("ruprob")) {
mChardet = new nsRUProbDetector();
} else if (detectorName.EqualsLiteral("ukprob")) {
mChardet = new nsUKProbDetector();
}
if (mChardet) {
(void)mChardet->Init(this);
}
}
// There's a zeroing operator new for everything else
}
@ -232,49 +255,52 @@ nsresult nsHtml5StreamParser::GetChannel(nsIChannel** aChannel) {
: NS_ERROR_NOT_AVAILABLE;
}
void nsHtml5StreamParser::GuessEncoding(bool aEof, bool aInitial) {
if (mUseJapaneseDetector) {
return;
}
if (!aInitial) {
mGuessEncoding = false;
}
auto encoding = mDetector->Guess(mTLD, mDecodingLocalFileWithoutTokenizing);
if (HasDecoder() && !mDecodingLocalFileWithoutTokenizing) {
if (mEncoding == encoding) {
auto source = aInitial ? kCharsetFromInitialAutoDetection
: kCharsetFromFinalAutoDetection;
MOZ_ASSERT(mCharsetSource < source, "Why are we running chardet at all?");
mCharsetSource = source;
mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
} else {
MOZ_ASSERT(mCharsetSource < kCharsetFromFinalAutoDetection);
// We've already committed to a decoder. Request a reload from the
// docshell.
mTreeBuilder->NeedsCharsetSwitchTo(encoding,
kCharsetFromFinalAutoDetection, 0);
FlushTreeOpsAndDisarmTimer();
Interrupt();
NS_IMETHODIMP
nsHtml5StreamParser::Notify(const char* aCharset, nsDetectionConfident aConf) {
NS_ASSERTION(IsParserThread(), "Wrong thread!");
if (aConf == eBestAnswer || aConf == eSureAnswer) {
mFeedChardet = false; // just in case
auto encoding =
Encoding::ForLabelNoReplacement(nsDependentCString(aCharset));
if (!encoding) {
return NS_OK;
}
if (HasDecoder()) {
if (mEncoding == encoding) {
MOZ_ASSERT(mCharsetSource < kCharsetFromAutoDetection,
"Why are we running chardet at all?");
mCharsetSource = kCharsetFromAutoDetection;
mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
} else {
// We've already committed to a decoder. Request a reload from the
// docshell.
mTreeBuilder->NeedsCharsetSwitchTo(WrapNotNull(encoding),
kCharsetFromAutoDetection, 0);
FlushTreeOpsAndDisarmTimer();
Interrupt();
}
} else {
// Got a confident answer from the sniffing buffer. That code will
// take care of setting up the decoder.
mEncoding = WrapNotNull(encoding);
mCharsetSource = kCharsetFromAutoDetection;
mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
}
} else {
// Got a confident answer from the sniffing buffer. That code will
// take care of setting up the decoder.
mEncoding = encoding;
mCharsetSource = aInitial ? kCharsetFromInitialAutoDetection
: kCharsetFromFinalAutoDetection;
mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
}
return NS_OK;
}
void nsHtml5StreamParser::FeedJapaneseDetector(Span<const uint8_t> aBuffer,
bool aLast) {
MOZ_ASSERT(!mDecodingLocalFileWithoutTokenizing);
const Encoding* detected = mJapaneseDetector->Feed(aBuffer, aLast);
if (!detected) {
return;
}
DontGuessEncoding();
int32_t source = kCharsetFromFinalAutoDetection;
mFeedChardet = false;
if (mDecodingLocalFileAsUTF8 && detected != ISO_2022_JP_ENCODING) {
return;
}
int32_t source = kCharsetFromAutoDetection;
if (mCharsetSource == kCharsetFromParentForced ||
mCharsetSource == kCharsetFromUserForced) {
source = kCharsetFromUserForcedAutoDetection;
@ -300,10 +326,23 @@ void nsHtml5StreamParser::FeedJapaneseDetector(Span<const uint8_t> aBuffer,
void nsHtml5StreamParser::FeedDetector(Span<const uint8_t> aBuffer,
bool aLast) {
if (mUseJapaneseDetector) {
if (mEncoding->IsJapaneseLegacy()) {
FeedJapaneseDetector(aBuffer, aLast);
} else if (mEncoding == WINDOWS_1251_ENCODING && mChardet &&
!mDecodingLocalFileAsUTF8) {
if (!aBuffer.IsEmpty()) {
bool dontFeed = false;
mozilla::Unused << mChardet->DoIt((const char*)aBuffer.Elements(),
aBuffer.Length(), &dontFeed);
if (dontFeed) {
mFeedChardet = false;
}
}
if (aLast) {
mozilla::Unused << mChardet->Done();
}
} else {
Unused << mDetector->Feed(aBuffer, aLast);
mFeedChardet = false;
}
}
@ -349,18 +388,16 @@ nsHtml5StreamParser::SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
Span<const uint8_t> aFromSegment) {
NS_ASSERTION(IsParserThread(), "Wrong thread!");
nsresult rv = NS_OK;
if (mDecodingLocalFileWithoutTokenizing &&
mCharsetSource <= kCharsetFromFileURLGuess) {
if (mDecodingLocalFileAsUTF8 && mCharsetSource <= kCharsetFromFileURLGuess) {
MOZ_ASSERT(mEncoding != UTF_8_ENCODING);
mUnicodeDecoder = UTF_8_ENCODING->NewDecoderWithBOMRemoval();
} else {
if (mCharsetSource >= kCharsetFromFinalAutoDetection) {
if (!(mCharsetSource == kCharsetFromUserForced ||
mCharsetSource == kCharsetFromParentForced)) {
DontGuessEncoding();
}
mDecodingLocalFileWithoutTokenizing = false;
if (mCharsetSource >= kCharsetFromAutoDetection &&
!(mCharsetSource == kCharsetFromUserForced ||
mCharsetSource == kCharsetFromParentForced)) {
mFeedChardet = false;
}
mDecodingLocalFileAsUTF8 = false;
mUnicodeDecoder = mEncoding->NewDecoderWithBOMRemoval();
}
if (mSniffingBuffer) {
@ -376,10 +413,10 @@ nsresult nsHtml5StreamParser::SetupDecodingFromBom(
NotNull<const Encoding*> aEncoding) {
NS_ASSERTION(IsParserThread(), "Wrong thread!");
mEncoding = aEncoding;
mDecodingLocalFileWithoutTokenizing = false;
mDecodingLocalFileAsUTF8 = false;
mUnicodeDecoder = mEncoding->NewDecoderWithoutBOMHandling();
mCharsetSource = kCharsetFromByteOrderMark;
DontGuessEncoding();
mFeedChardet = false;
mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
mSniffingBuffer = nullptr;
mMetaScanner = nullptr;
@ -437,7 +474,7 @@ void nsHtml5StreamParser::SniffBOMlessUTF16BasicLatin(
}
mCharsetSource = kCharsetFromIrreversibleAutoDetection;
mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
DontGuessEncoding();
mFeedChardet = false;
mTreeBuilder->MaybeComplainAboutCharset("EncBomlessUtf16", true, 0);
}
@ -511,23 +548,22 @@ void nsHtml5StreamParser::FinalizeSniffingWithDetector(
FeedDetector(MakeSpan(mSniffingBuffer.get(), mSniffingLength), false);
}
if (mFeedChardet && !aFromSegment.IsEmpty()) {
// Avoid buffer boundary-dependent behavior.
FeedDetector(aFromSegment.To(aCountToSniffingLimit), false);
// Avoid buffer boundary-dependent behavior when
// reparsing is forbidden. If reparse is forbidden,
// act as if we only saw the first 1024 bytes.
// When reparsing isn't forbidden, buffer boundaries
// can have an effect on whether the page is loaded
// once or twice. :-(
FeedDetector(mReparseForbidden ? aFromSegment.To(aCountToSniffingLimit)
: aFromSegment,
false);
}
bool guess = mFeedChardet;
if (mFeedChardet && aEof && aCountToSniffingLimit <= aFromSegment.Length()) {
FeedDetector(Span<const uint8_t>(), true);
if (mFeedChardet && aEof &&
(!mReparseForbidden || aCountToSniffingLimit == aFromSegment.Length())) {
// Don't signal EOF if reparse is forbidden and we didn't pass all input
// to the detector above.
mFeedChardet = false;
}
if (guess) {
GuessEncoding(aEof, (guess == mFeedChardet));
}
if (mReparseForbidden) {
DontGuessEncoding();
}
if (mFeedChardet && !aEof && aCountToSniffingLimit < aFromSegment.Length()) {
// Avoid buffer boundary-dependent behavior.
FeedDetector(aFromSegment.From(aCountToSniffingLimit), false);
FeedDetector(Span<const uint8_t>(), true);
}
}
@ -738,11 +774,8 @@ nsresult nsHtml5StreamParser::SniffStreamBytes(
// Honor override
if (mEncoding->IsJapaneseLegacy()) {
mFeedChardet = true;
mUseJapaneseDetector = true;
FinalizeSniffingWithDetector(aFromSegment, countToSniffingLimit,
false);
} else {
DontGuessEncoding();
}
return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
aFromSegment);
@ -759,10 +792,7 @@ nsresult nsHtml5StreamParser::SniffStreamBytes(
// meta not found, honor override
if (mEncoding->IsJapaneseLegacy()) {
mFeedChardet = true;
mUseJapaneseDetector = true;
FinalizeSniffingWithDetector(aFromSegment, countToSniffingLimit, false);
} else {
DontGuessEncoding();
}
return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment);
}
@ -877,10 +907,14 @@ nsresult nsHtml5StreamParser::WriteStreamBytes(
bool hadErrors;
Tie(result, read, written, hadErrors) =
mUnicodeDecoder->DecodeToUTF16(src, dst, false);
if (!mDecodingLocalFileWithoutTokenizing) {
if (!mDecodingLocalFileAsUTF8) {
OnNewContent(dst.To(written));
}
if (hadErrors && !mHasHadErrors) {
if (mDecodingLocalFileAsUTF8) {
ReDecodeLocalFile();
return NS_OK;
}
mHasHadErrors = true;
if (mEncoding == UTF_8_ENCODING) {
mTreeBuilder->TryToEnableEncodingMenu();
@ -900,15 +934,9 @@ nsresult nsHtml5StreamParser::WriteStreamBytes(
} else {
MOZ_ASSERT(totalRead == aFromSegment.Length(),
"The Unicode decoder consumed the wrong number of bytes.");
if (mDecodingLocalFileWithoutTokenizing &&
if (mDecodingLocalFileAsUTF8 &&
mLocalFileBytesBuffered == LOCAL_FILE_UTF_8_BUFFER_SIZE) {
auto encoding = mEncoding;
GuessEncoding(false, false);
if (encoding == mEncoding) {
CommitLocalFileToEncoding();
} else {
ReDecodeLocalFile();
}
CommitLocalFileToUTF8();
}
return NS_OK;
}
@ -916,12 +944,16 @@ nsresult nsHtml5StreamParser::WriteStreamBytes(
}
void nsHtml5StreamParser::ReDecodeLocalFile() {
MOZ_ASSERT(mDecodingLocalFileWithoutTokenizing);
mDecodingLocalFileWithoutTokenizing = false;
MOZ_ASSERT(mDecodingLocalFileAsUTF8);
mDecodingLocalFileAsUTF8 = false;
mUnicodeDecoder = mEncoding->NewDecoderWithBOMRemoval();
mHasHadErrors = false;
DontGuessEncoding();
// We need the detector to start with fresh state.
// Turn off ISO-2022-JP detection, because if this doc was
// ISO-2022-JP, it would have already been detected.
mJapaneseDetector = mozilla::JapaneseDetector::Create(false);
mFeedChardet = true;
// Throw away previous decoded data
mLastBuffer = mFirstBuffer;
@ -935,11 +967,13 @@ void nsHtml5StreamParser::ReDecodeLocalFile() {
}
}
void nsHtml5StreamParser::CommitLocalFileToEncoding() {
MOZ_ASSERT(mDecodingLocalFileWithoutTokenizing);
mDecodingLocalFileWithoutTokenizing = false;
void nsHtml5StreamParser::CommitLocalFileToUTF8() {
MOZ_ASSERT(mDecodingLocalFileAsUTF8);
mDecodingLocalFileAsUTF8 = false;
mFeedChardet = false;
mGuessEncoding = false;
mEncoding = UTF_8_ENCODING;
mCharsetSource = kCharsetFromFileURLGuess;
mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
nsHtml5OwningUTF16Buffer* buffer = mFirstBuffer;
while (buffer) {
@ -970,17 +1004,7 @@ nsresult nsHtml5StreamParser::OnStartRequest(nsIRequest* aRequest) {
MOZ_ASSERT(
!mExecutor->HasStarted(),
"Got OnStartRequest at the wrong stage in the executor life cycle.");
MOZ_ASSERT(NS_IsMainThread(), "Wrong thread!");
// To avoid the cost of instantiating the detector when it's not needed,
// let's instantiate only if we make it out of this method with the
// intent to use it.
auto detectorCreator = MakeScopeExit([&] {
if (mFeedChardet && !mUseJapaneseDetector) {
mDetector = mozilla::EncodingDetector::Create();
}
});
NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
if (mObserver) {
mObserver->OnStartRequest(aRequest);
}
@ -990,7 +1014,7 @@ nsresult nsHtml5StreamParser::OnStartRequest(nsIRequest* aRequest) {
mSkipContentSniffing = loadInfo->GetSkipContentSniffing();
if (mSkipContentSniffing) {
DontGuessEncoding();
mFeedChardet = false;
}
mStreamState = STREAM_BEING_READ;
@ -1005,7 +1029,7 @@ nsresult nsHtml5StreamParser::OnStartRequest(nsIRequest* aRequest) {
mMode == LOAD_AS_DATA ? false : mExecutor->IsScriptEnabled();
mOwner->StartTokenizer(scriptingEnabled);
MOZ_ASSERT(!mDecodingLocalFileWithoutTokenizing);
MOZ_ASSERT(!mDecodingLocalFileAsUTF8);
bool isSrcdoc = false;
nsCOMPtr<nsIChannel> channel;
nsresult rv = GetChannel(getter_AddRefs(channel));
@ -1023,27 +1047,7 @@ nsresult nsHtml5StreamParser::OnStartRequest(nsIRequest* aRequest) {
rv = channel->GetURI(getter_AddRefs(currentURI));
if (NS_SUCCEEDED(rv)) {
nsCOMPtr<nsIURI> innermost = NS_GetInnermostURI(currentURI);
if (innermost->SchemeIs("file")) {
mDecodingLocalFileWithoutTokenizing = true;
} else {
nsAutoCString host;
innermost->GetAsciiHost(host);
if (!host.IsEmpty()) {
// First let's see if the host is DNS-absolute and ends with a
// dot and get rid of that one.
if (host.Last() == '.') {
host.SetLength(host.Length() - 1);
}
int32_t index = host.RFindChar('.');
if (index != kNotFound) {
// We tolerate an IPv4 component as generic "TLD", so don't
// bother checking.
ToLowerCase(
Substring(host, index + 1, host.Length() - (index + 1)),
mTLD);
}
}
}
mDecodingLocalFileAsUTF8 = innermost->SchemeIs("file");
}
}
}
@ -1112,6 +1116,7 @@ nsresult nsHtml5StreamParser::OnStartRequest(nsIRequest* aRequest) {
// This is the old Gecko behavior but the HTML5 spec disagrees.
// Don't reparse on POST.
mReparseForbidden = true;
mFeedChardet = false; // can't restart anyway
}
}
@ -1138,58 +1143,14 @@ nsresult nsHtml5StreamParser::OnStartRequest(nsIRequest* aRequest) {
}
if (mCharsetSource == kCharsetFromParentFrame) {
// Remember this for error reporting.
// Remember this in case chardet overwrites mCharsetSource
mInitialEncodingWasFromParentFrame = true;
}
if (mCharsetSource >= kCharsetFromFinalAutoDetection) {
if ((mCharsetSource == kCharsetFromParentForced ||
mCharsetSource == kCharsetFromUserForced) &&
mEncoding->IsJapaneseLegacy()) {
// Japanese detector only
mUseJapaneseDetector = true;
mGuessEncoding = false;
} else {
DontGuessEncoding();
}
}
// Compute various pref-based special cases
if (!mDecodingLocalFileWithoutTokenizing && mFeedChardet) {
if (StaticPrefs::intl_charset_detector_ng_enabled()) {
if (mTLD.EqualsLiteral("jp")) {
mUseJapaneseDetector =
!StaticPrefs::intl_charset_detector_ng_jp_enabled();
} else if (mTLD.EqualsLiteral("in") &&
mEncoding == WINDOWS_1252_ENCODING &&
!StaticPrefs::intl_charset_detector_ng_in_enabled()) {
// Avoid breaking font hacks that Chrome doesn't break.
DontGuessEncoding();
} else if (mTLD.EqualsLiteral("lk") &&
mEncoding == WINDOWS_1252_ENCODING &&
!StaticPrefs::intl_charset_detector_ng_lk_enabled()) {
// Avoid breaking font hacks that Chrome doesn't break.
DontGuessEncoding();
}
} else {
// If the new detector is turned off in general, we still use it to
// emulate the old Cyrillic detector in cases where the old Cyrillic
// detector would have been enabled.
nsAutoCString detectorName;
Preferences::GetLocalizedCString("intl.charset.detector", detectorName);
bool forceEncodingDetectorToCyrillicOnly =
detectorName.EqualsLiteral("ruprob") ||
detectorName.EqualsLiteral("ukprob");
if (mEncoding->IsJapaneseLegacy()) {
mUseJapaneseDetector = true;
} else if (mEncoding == WINDOWS_1251_ENCODING &&
forceEncodingDetectorToCyrillicOnly) {
mTLD.AssignLiteral("ru"); // Force the detector into Cyrillic mode
// regardless of real TLD
} else {
DontGuessEncoding();
}
}
if (mCharsetSource >= kCharsetFromAutoDetection &&
!(mCharsetSource == kCharsetFromParentForced ||
mCharsetSource == kCharsetFromUserForced)) {
mFeedChardet = false;
}
if (mCharsetSource < kCharsetFromUtf8OnlyMime) {
@ -1202,10 +1163,10 @@ nsresult nsHtml5StreamParser::OnStartRequest(nsIRequest* aRequest) {
// There's no need to remove the BOM manually here, because
// the UTF-8 decoder removes it.
mReparseForbidden = true;
DontGuessEncoding();
mFeedChardet = false;
// Instantiate the converter here to avoid BOM sniffing.
mDecodingLocalFileWithoutTokenizing = false;
mDecodingLocalFileAsUTF8 = false;
mUnicodeDecoder = mEncoding->NewDecoderWithBOMRemoval();
return NS_OK;
}
@ -1269,10 +1230,15 @@ void nsHtml5StreamParser::DoStopRequest() {
bool hadErrors;
Tie(result, read, written, hadErrors) =
mUnicodeDecoder->DecodeToUTF16(src, dst, true);
if (!mDecodingLocalFileWithoutTokenizing) {
if (!mDecodingLocalFileAsUTF8) {
OnNewContent(dst.To(written));
}
if (hadErrors && !mHasHadErrors) {
if (mDecodingLocalFileAsUTF8) {
ReDecodeLocalFile();
DoStopRequest();
return;
}
mHasHadErrors = true;
if (mEncoding == UTF_8_ENCODING) {
mTreeBuilder->TryToEnableEncodingMenu();
@ -1289,20 +1255,9 @@ void nsHtml5StreamParser::DoStopRequest() {
}
mLastBuffer = (mLastBuffer->next = newBuf.forget());
} else {
if (mDecodingLocalFileWithoutTokenizing) {
if (mDecodingLocalFileAsUTF8) {
MOZ_ASSERT(mLocalFileBytesBuffered < LOCAL_FILE_UTF_8_BUFFER_SIZE);
MOZ_ASSERT(mGuessEncoding);
auto encoding = mEncoding;
GuessEncoding(true, false);
if (encoding == mEncoding) {
CommitLocalFileToEncoding();
} else {
ReDecodeLocalFile();
DoStopRequest();
return;
}
} else if (mGuessEncoding) {
GuessEncoding(true, false);
CommitLocalFileToUTF8();
}
break;
}
@ -1347,7 +1302,7 @@ nsresult nsHtml5StreamParser::OnStopRequest(nsIRequest* aRequest,
void nsHtml5StreamParser::DoDataAvailableBuffer(
mozilla::Buffer<uint8_t>&& aBuffer) {
if (MOZ_LIKELY(!mDecodingLocalFileWithoutTokenizing)) {
if (MOZ_LIKELY(!mDecodingLocalFileAsUTF8)) {
DoDataAvailable(aBuffer);
return;
}
@ -1395,7 +1350,7 @@ void nsHtml5StreamParser::DoDataAvailableBuffer(
// Do this clean-up here to avoid use-after-free when
// DoDataAvailable is passed a span pointing into an
// element of mBufferedLocalFileData.
if (!mDecodingLocalFileWithoutTokenizing) {
if (!mDecodingLocalFileAsUTF8) {
mBufferedLocalFileData.Clear();
}
}
@ -1428,7 +1383,7 @@ void nsHtml5StreamParser::DoDataAvailable(Span<const uint8_t> aBuffer) {
return;
}
if (mDecodingLocalFileWithoutTokenizing) {
if (mDecodingLocalFileAsUTF8) {
return;
}
@ -1501,7 +1456,7 @@ nsresult nsHtml5StreamParser::OnDataAvailable(nsIRequest* aRequest,
MOZ_ASSERT(IsParserThread(), "Wrong thread!");
mozilla::MutexAutoLock autoLock(mTokenizerMutex);
if (MOZ_UNLIKELY(mDecodingLocalFileWithoutTokenizing)) {
if (MOZ_UNLIKELY(mDecodingLocalFileAsUTF8)) {
// It's a bit sad to potentially buffer the first 1024
// bytes in two places, but it's a lot simpler than trying
// to optimize out that copy. It only happens for local files
@ -1572,7 +1527,7 @@ const Encoding* nsHtml5StreamParser::PreferredForInternalEncodingDecl(
}
}
mCharsetSource = kCharsetFromMetaTag; // become confident
DontGuessEncoding(); // don't feed chardet when confident
mFeedChardet = false; // don't feed chardet when confident
return nullptr;
}
@ -1611,7 +1566,7 @@ bool nsHtml5StreamParser::internalEncodingDeclaration(nsHtml5String aEncoding) {
// Avoid having the chardet ask for another restart after this restart
// request.
DontGuessEncoding();
mFeedChardet = false;
mTreeBuilder->NeedsCharsetSwitchTo(WrapNotNull(encoding), kCharsetFromMetaTag,
mTokenizer->getLineNumber());
FlushTreeOpsAndDisarmTimer();
@ -1648,7 +1603,7 @@ void nsHtml5StreamParser::FlushTreeOpsAndDisarmTimer() {
void nsHtml5StreamParser::ParseAvailableData() {
MOZ_ASSERT(IsParserThread(), "Wrong thread!");
mTokenizerMutex.AssertCurrentThreadOwns();
MOZ_ASSERT(!mDecodingLocalFileWithoutTokenizing);
MOZ_ASSERT(!mDecodingLocalFileAsUTF8);
if (IsTerminatedOrInterrupted()) {
return;

View File

@ -8,9 +8,9 @@
#include "nsAutoPtr.h"
#include "nsCOMPtr.h"
#include "nsICharsetDetectionObserver.h"
#include "nsHtml5MetaScanner.h"
#include "mozilla/Encoding.h"
#include "mozilla/EncodingDetector.h"
#include "mozilla/JapaneseDetector.h"
#include "nsHtml5TreeOpExecutor.h"
#include "nsHtml5OwningUTF16Buffer.h"
@ -21,6 +21,7 @@
#include "nsHtml5Speculation.h"
#include "nsISerialEventTarget.h"
#include "nsITimer.h"
#include "nsICharsetDetector.h"
#include "mozilla/dom/DocGroup.h"
#include "mozilla/Buffer.h"
@ -100,7 +101,7 @@ enum eHtml5StreamState {
STREAM_ENDED = 2
};
class nsHtml5StreamParser final : public nsISupports {
class nsHtml5StreamParser final : public nsICharsetDetectionObserver {
template <typename T>
using NotNull = mozilla::NotNull<T>;
using Encoding = mozilla::Encoding;
@ -117,7 +118,8 @@ class nsHtml5StreamParser final : public nsISupports {
public:
NS_DECL_CYCLE_COLLECTING_ISUPPORTS
NS_DECL_CYCLE_COLLECTION_CLASS(nsHtml5StreamParser)
NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(nsHtml5StreamParser,
nsICharsetDetectionObserver)
nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor, nsHtml5Parser* aOwner,
eParserMode aMode);
@ -132,6 +134,12 @@ class nsHtml5StreamParser final : public nsISupports {
nsresult OnStopRequest(nsIRequest* aRequest, nsresult status);
// nsICharsetDetectionObserver
/**
* Chardet calls this to report the detection result
*/
NS_IMETHOD Notify(const char* aCharset, nsDetectionConfident aConf) override;
// EncodingDeclarationHandler
// https://hg.mozilla.org/projects/htmlparser/file/tip/src/nu/validator/htmlparser/common/EncodingDeclarationHandler.java
/**
@ -327,7 +335,7 @@ class nsHtml5StreamParser final : public nsISupports {
* to UTF-8 as the non-speculative encoding and start processing
* the decoded data.
*/
void CommitLocalFileToEncoding();
void CommitLocalFileToUTF8();
/**
* When speculatively decoding from file: URL as UTF-8, redecode
@ -335,19 +343,6 @@ class nsHtml5StreamParser final : public nsISupports {
*/
void ReDecodeLocalFile();
/**
* Potentially guess the encoding using mozilla::EncodingDetector.
*/
void GuessEncoding(bool aEof, bool aInitial);
inline void DontGuessEncoding() {
mFeedChardet = false;
mGuessEncoding = false;
if (mDecodingLocalFileWithoutTokenizing) {
CommitLocalFileToEncoding();
}
}
/**
* Become confident or resolve an encoding name to its preferred form.
* @param aEncoding the value of an internal encoding decl. Acts as an
@ -443,15 +438,10 @@ class nsHtml5StreamParser final : public nsISupports {
NotNull<const Encoding*> mEncoding;
/**
* Whether the generic or Japanese detector should still be fed.
* Whether the Cyrillic or Japanese detector should still be fed.
*/
bool mFeedChardet;
/**
* Whether the generic detector should be still queried for its guess.
*/
bool mGuessEncoding;
/**
* Whether reparse is forbidden
*/
@ -565,23 +555,16 @@ class nsHtml5StreamParser final : public nsISupports {
nsCOMPtr<nsIRunnable> mLoadFlusher;
/**
* The Cyrillic detector if enabled.
*/
nsCOMPtr<nsICharsetDetector> mChardet;
/**
* The Japanese detector.
*/
mozilla::UniquePtr<mozilla::JapaneseDetector> mJapaneseDetector;
/**
* The generic detector.
*/
mozilla::UniquePtr<mozilla::EncodingDetector> mDetector;
/**
* The TLD we're loading from or empty if unknown.
*/
nsCString mTLD;
bool mUseJapaneseDetector;
/**
* Whether the initial charset source was kCharsetFromParentFrame
*/
@ -591,9 +574,9 @@ class nsHtml5StreamParser final : public nsISupports {
/**
* If true, we are decoding a local file that lacks an encoding
* declaration and we are not tokenizing yet.
* declaration as UTF-8 and we are not tokenizing yet.
*/
bool mDecodingLocalFileWithoutTokenizing;
bool mDecodingLocalFileAsUTF8;
/**
* Timer for flushing tree ops once in a while when not speculating.

View File

@ -13,11 +13,8 @@ enum {
kCharsetFromFileURLGuess,
kCharsetFromDocTypeDefault, // This and up confident for XHR
kCharsetFromCache,
kCharsetFromInitialAutoDetection,
kCharsetFromFinalAutoDetection,
kCharsetFromParentFrame, // Same-origin parent takes precedence over detector
// to avoid breaking tests. (Also, the HTML spec
// says so.)
kCharsetFromParentFrame,
kCharsetFromAutoDetection,
kCharsetFromMetaPrescan, // this one and smaller: HTML5 Tentative
kCharsetFromMetaTag, // this one and greater: HTML5 Confident
kCharsetFromIrreversibleAutoDetection,

View File

@ -1,16 +0,0 @@
<!doctype html>
<meta charset="utf-8">
<title>ar ISO-8859-6 late</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<script>
setup({explicit_done:true});
window.onmessage = function(e) {
test(function() {
assert_equals(e.data, "ISO-8859-6", 'Expected ISO-8859-6');
}, "Check detection result");
w.close();
done();
};
var w = window.open("support/ar-ISO-8859-6-late.sub.html");
</script>

View File

@ -1,14 +0,0 @@
<!doctype html>
<title>ar ISO-8859-6</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<p><EFBFBD><EFBFBD><EFBFBD> <20><> <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>.</p>
<script>
setup({explicit_done:true});
onload = function() {
test(function() {
assert_equals(document.characterSet, "ISO-8859-6", 'Expected ISO-8859-6');
}, "Check detection result");
done();
};
</script>

View File

@ -1,16 +0,0 @@
<!doctype html>
<meta charset="utf-8">
<title>ar windows-1256 late</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<script>
setup({explicit_done:true});
window.onmessage = function(e) {
test(function() {
assert_equals(e.data, "windows-1256", 'Expected windows-1256');
}, "Check detection result");
w.close();
done();
};
var w = window.open("support/ar-windows-1256-late.sub.html");
</script>

View File

@ -1,14 +0,0 @@
<!doctype html>
<title>ar windows-1256</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<p><EFBFBD><EFBFBD><EFBFBD> <20><> <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>.</p>
<script>
setup({explicit_done:true});
onload = function() {
test(function() {
assert_equals(document.characterSet, "windows-1256", 'Expected windows-1256');
}, "Check detection result");
done();
};
</script>

View File

@ -1,16 +0,0 @@
<!doctype html>
<meta charset="utf-8">
<title>el ISO-8859-7 late</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<script>
setup({explicit_done:true});
window.onmessage = function(e) {
test(function() {
assert_equals(e.data, "ISO-8859-7", 'Expected ISO-8859-7');
}, "Check detection result");
w.close();
done();
};
var w = window.open("support/el-ISO-8859-7-late.sub.html");
</script>

View File

@ -1,15 +0,0 @@
<!doctype html>
<title>el ISO-8859-7</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<p><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD> <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>: <20><><EFBFBD><EFBFBD>
<!-- I needed to work capital alpha with tonos into the test somehow... --></p>
<script>
setup({explicit_done:true});
onload = function() {
test(function() {
assert_equals(document.characterSet, "ISO-8859-7", 'Expected ISO-8859-7');
}, "Check detection result");
done();
};
</script>

View File

@ -1,16 +0,0 @@
<!doctype html>
<meta charset="utf-8">
<title>el windows-1253 late</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<script>
setup({explicit_done:true});
window.onmessage = function(e) {
test(function() {
assert_equals(e.data, "windows-1253", 'Expected windows-1253');
}, "Check detection result");
w.close();
done();
};
var w = window.open("support/el-windows-1253-late.sub.html");
</script>

View File

@ -1,15 +0,0 @@
<!doctype html>
<title>el windows-1253</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<p><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD> <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>: <20><><EFBFBD><EFBFBD>
<!-- I needed to work capital alpha with tonos into the test somehow... --></p>
<script>
setup({explicit_done:true});
onload = function() {
test(function() {
assert_equals(document.characterSet, "windows-1253", 'Expected windows-1253');
}, "Check detection result");
done();
};
</script>

View File

@ -1,16 +0,0 @@
<!doctype html>
<meta charset="utf-8">
<title>fa windows-1256 late</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<script>
setup({explicit_done:true});
window.onmessage = function(e) {
test(function() {
assert_equals(e.data, "windows-1256", 'Expected windows-1256');
}, "Check detection result");
w.close();
done();
};
var w = window.open("support/fa-windows-1256-late.sub.html");
</script>

View File

@ -1,14 +0,0 @@
<!doctype html>
<title>fa windows-1256</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<p><EFBFBD>&#1740;<EFBFBD> &#1740;<EFBFBD> <20><><EFBFBD> <20><>Ґ<EFBFBD><D290><EFBFBD>&#1740; <20><><EFBFBD>ǘ<EFBFBD><C798> <20><><EFBFBD>.</p>
<script>
setup({explicit_done:true});
onload = function() {
test(function() {
assert_equals(document.characterSet, "windows-1256", 'Expected windows-1256');
}, "Check detection result");
done();
};
</script>

View File

@ -1,16 +0,0 @@
<!doctype html>
<meta charset="utf-8">
<title>fi windows-1252 late</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<script>
setup({explicit_done:true});
window.onmessage = function(e) {
test(function() {
assert_equals(e.data, "windows-1252", 'Expected windows-1252');
}, "Check detection result");
w.close();
done();
};
var w = window.open("support/fi-windows-1252-late.sub.html");
</script>

View File

@ -1,14 +0,0 @@
<!doctype html>
<title>fi windows-1252</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<p>T<EFBFBD>m<EFBFBD> on merkkikoodaustesti.</p>
<script>
setup({explicit_done:true});
onload = function() {
test(function() {
assert_equals(document.characterSet, "windows-1252", 'Expected windows-1252');
}, "Check detection result");
done();
};
</script>

View File

@ -1,16 +0,0 @@
<!doctype html>
<meta charset="utf-8">
<title>he ISO-8859-8 late</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<script>
setup({explicit_done:true});
window.onmessage = function(e) {
test(function() {
assert_equals(e.data, "ISO-8859-8", 'Expected ISO-8859-8');
}, "Check detection result");
w.close();
done();
};
var w = window.open("support/he-ISO-8859-8-late.sub.html");
</script>

View File

@ -1,14 +0,0 @@
<!doctype html>
<title>he ISO-8859-8</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<p>.<2E><><EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD> <20><><EFBFBD></p>
<script>
setup({explicit_done: true}); // harness finishes only at the done() call below
onload = function() { // window load handler: check the encoding the document ended up with
  test(function() {
    assert_equals(document.characterSet, "ISO-8859-8", 'Expected ISO-8859-8');
  }, "Check detection result");
  done();
};
</script>

View File

@ -1,16 +0,0 @@
<!doctype html>
<meta charset="utf-8">
<title>he windows-1255 late</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<script>
setup({explicit_done: true}); // harness finishes only at the done() call below
window.onmessage = function(msg) { // msg.data is compared with the expected encoding label
  test(function() {
    assert_equals(msg.data, "windows-1255", 'Expected windows-1255');
  }, "Check detection result");
  w.close(); // `w` is the window opened on the last line of this script
  done();
};
var w = window.open("support/he-windows-1255-late.sub.html");
</script>

View File

@ -1,14 +0,0 @@
<!doctype html>
<title>he windows-1255</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<p><EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD><EFBFBD>.</p>
<script>
setup({explicit_done: true}); // harness finishes only at the done() call below
onload = function() { // window load handler: check the encoding the document ended up with
  test(function() {
    assert_equals(document.characterSet, "windows-1255", 'Expected windows-1255');
  }, "Check detection result");
  done();
};
</script>

View File

@ -1,16 +0,0 @@
<!doctype html>
<meta charset="utf-8">
<title>is windows-1252 late</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<script>
setup({explicit_done: true}); // harness finishes only at the done() call below
window.onmessage = function(msg) { // msg.data is compared with the expected encoding label
  test(function() {
    assert_equals(msg.data, "windows-1252", 'Expected windows-1252');
  }, "Check detection result");
  w.close(); // `w` is the window opened on the last line of this script
  done();
};
var w = window.open("support/is-windows-1252-late.sub.html");
</script>

View File

@ -1,14 +0,0 @@
<!doctype html>
<title>is windows-1252</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<p><EFBFBD>etta er k<><6B>unarpr<70>f <20> staf. Fyrir sum tungum<75>l sem nota latneska stafi <20>urfum vi<76> meira inntak til a<> taka <20>kv<6B>r<EFBFBD>unina.</p>
<script>
setup({explicit_done: true}); // harness finishes only at the done() call below
onload = function() { // window load handler: check the encoding the document ended up with
  test(function() {
    assert_equals(document.characterSet, "windows-1252", 'Expected windows-1252');
  }, "Check detection result");
  done();
};
</script>

View File

@ -1,16 +0,0 @@
<!doctype html>
<meta charset="utf-8">
<title>ja EUC-JP late</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<script>
setup({explicit_done: true}); // harness finishes only at the done() call below
window.onmessage = function(msg) { // msg.data is compared with the expected encoding label
  test(function() {
    assert_equals(msg.data, "EUC-JP", 'Expected EUC-JP');
  }, "Check detection result");
  w.close(); // `w` is the window opened on the last line of this script
  done();
};
var w = window.open("support/ja-EUC-JP-late.sub.html");
</script>

View File

@ -1,14 +0,0 @@
<!doctype html>
<title>ja EUC-JP</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<p><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʸ<EFBFBD><EFBFBD><EFBFBD>¸<EFBFBD><EFBFBD>Ǥ<EFBFBD><EFBFBD><EFBFBD></p>
<script>
setup({explicit_done: true}); // harness finishes only at the done() call below
onload = function() { // window load handler: check the encoding the document ended up with
  test(function() {
    assert_equals(document.characterSet, "EUC-JP", 'Expected EUC-JP');
  }, "Check detection result");
  done();
};
</script>

View File

@ -1,16 +0,0 @@
<!doctype html>
<meta charset="utf-8">
<title>ja ISO-2022-JP late</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<script>
setup({explicit_done: true}); // harness finishes only at the done() call below
window.onmessage = function(msg) { // msg.data is compared with the expected encoding label
  test(function() {
    assert_equals(msg.data, "ISO-2022-JP", 'Expected ISO-2022-JP');
  }, "Check detection result");
  w.close(); // `w` is the window opened on the last line of this script
  done();
};
var w = window.open("support/ja-ISO-2022-JP-late.sub.html");
</script>

View File

@ -1,14 +0,0 @@
<!doctype html>
<title>ja ISO-2022-JP</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<p>$B$3$l$OJ8;z<B83$G$9!#(B</p>
<script>
setup({explicit_done: true}); // harness finishes only at the done() call below
onload = function() { // window load handler: check the encoding the document ended up with
  test(function() {
    assert_equals(document.characterSet, "ISO-2022-JP", 'Expected ISO-2022-JP');
  }, "Check detection result");
  done();
};
</script>

View File

@ -1,16 +0,0 @@
<!doctype html>
<meta charset="utf-8">
<title>ja Shift_JIS late</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<script>
setup({explicit_done: true}); // harness finishes only at the done() call below
window.onmessage = function(msg) { // msg.data is compared with the expected encoding label
  test(function() {
    assert_equals(msg.data, "Shift_JIS", 'Expected Shift_JIS');
  }, "Check detection result");
  w.close(); // `w` is the window opened on the last line of this script
  done();
};
var w = window.open("support/ja-Shift_JIS-late.sub.html");
</script>

View File

@ -1,14 +0,0 @@
<!doctype html>
<title>ja Shift_JIS</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<p><EFBFBD><EFBFBD><EFBFBD><EFBFBD>͕<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ł<EFBFBD><EFBFBD>B</p>
<script>
setup({explicit_done: true}); // harness finishes only at the done() call below
onload = function() { // window load handler: check the encoding the document ended up with
  test(function() {
    assert_equals(document.characterSet, "Shift_JIS", 'Expected Shift_JIS');
  }, "Check detection result");
  done();
};
</script>

View File

@ -1,16 +0,0 @@
<!doctype html>
<meta charset="utf-8">
<title>ko EUC-KR late</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<script>
setup({explicit_done: true}); // harness finishes only at the done() call below
window.onmessage = function(msg) { // msg.data is compared with the expected encoding label
  test(function() {
    assert_equals(msg.data, "EUC-KR", 'Expected EUC-KR');
  }, "Check detection result");
  w.close(); // `w` is the window opened on the last line of this script
  done();
};
var w = window.open("support/ko-EUC-KR-late.sub.html");
</script>

View File

@ -1,14 +0,0 @@
<!doctype html>
<title>ko EUC-KR</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<p><EFBFBD>̰<EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD> <20><><EFBFBD>ڵ<EFBFBD> <20>׽<EFBFBD>Ʈ<EFBFBD>Դϴ<D4B4>.</p>
<script>
setup({explicit_done: true}); // harness finishes only at the done() call below
onload = function() { // window load handler: check the encoding the document ended up with
  test(function() {
    assert_equals(document.characterSet, "EUC-KR", 'Expected EUC-KR');
  }, "Check detection result");
  done();
};
</script>

View File

@ -1,16 +0,0 @@
<!doctype html>
<meta charset="utf-8">
<title>lt windows-1257 late</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<script>
setup({explicit_done: true}); // harness finishes only at the done() call below
window.onmessage = function(msg) { // msg.data is compared with the expected encoding label
  test(function() {
    assert_equals(msg.data, "windows-1257", 'Expected windows-1257');
  }, "Check detection result");
  w.close(); // `w` is the window opened on the last line of this script
  done();
};
var w = window.open("support/lt-windows-1257-late.sub.html");
</script>

View File

@ -1,14 +0,0 @@
<!doctype html>
<title>lt windows-1257</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<p>Tai simboli<6C> kodavimo testas. Kai kurioms kalboms, naudojan<61>ioms lotyni<6E>kus ra<72>menis, mums reikia daugiau informacijos, kad gal<61>tume priimti sprendim<69>.</p>
<script>
setup({explicit_done: true}); // harness finishes only at the done() call below
onload = function() { // window load handler: check the encoding the document ended up with
  test(function() {
    assert_equals(document.characterSet, "windows-1257", 'Expected windows-1257');
  }, "Check detection result");
  done();
};
</script>

View File

@ -1,16 +0,0 @@
<!doctype html>
<meta charset="utf-8">
<title>lv windows-1257 late</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<script>
setup({explicit_done: true}); // harness finishes only at the done() call below
window.onmessage = function(msg) { // msg.data is compared with the expected encoding label
  test(function() {
    assert_equals(msg.data, "windows-1257", 'Expected windows-1257');
  }, "Check detection result");
  w.close(); // `w` is the window opened on the last line of this script
  done();
};
var w = window.open("support/lv-windows-1257-late.sub.html");
</script>

View File

@ -1,14 +0,0 @@
<!doctype html>
<title>lv windows-1257</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<p><EFBFBD>is ir rakstz<74>mju kod<6F><64>anas tests. Da<44><61>s valod<6F>s, kur<75>s tiek izmantotas lat<61><74>u valodas burti, l<>muma pie<69>em<65>anai mums ir nepiecie<69>ams vair<69>k ieguld<6C>juma.</p>
<script>
setup({explicit_done: true}); // harness finishes only at the done() call below
onload = function() { // window load handler: check the encoding the document ended up with
  test(function() {
    assert_equals(document.characterSet, "windows-1257", 'Expected windows-1257');
  }, "Check detection result");
  done();
};
</script>

View File

@ -1,16 +0,0 @@
<!doctype html>
<meta charset="utf-8">
<title>pl ISO-8859-2 late</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<script>
setup({explicit_done: true}); // harness finishes only at the done() call below
window.onmessage = function(msg) { // msg.data is compared with the expected encoding label
  test(function() {
    assert_equals(msg.data, "ISO-8859-2", 'Expected ISO-8859-2');
  }, "Check detection result");
  w.close(); // `w` is the window opened on the last line of this script
  done();
};
var w = window.open("support/pl-ISO-8859-2-late.sub.html");
</script>

View File

@ -1,14 +0,0 @@
<!doctype html>
<title>pl ISO-8859-2</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<p>To jest test kodowania znak<61>w. W przypadku niekt<6B>rych j<>zyk<79>w, kt<6B>re u<>ywaj<61> znak<61>w <20>aci<63>skich, potrzebujemy wi<77>cej danych, aby podj<64><6A> decyzj<7A>.</p>
<script>
setup({explicit_done: true}); // harness finishes only at the done() call below
onload = function() { // window load handler: check the encoding the document ended up with
  test(function() {
    assert_equals(document.characterSet, "ISO-8859-2", 'Expected ISO-8859-2');
  }, "Check detection result");
  done();
};
</script>

View File

@ -1,16 +0,0 @@
<!doctype html>
<meta charset="utf-8">
<title>pl windows-1250 late</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<script>
setup({explicit_done: true}); // harness finishes only at the done() call below
window.onmessage = function(msg) { // msg.data is compared with the expected encoding label
  test(function() {
    assert_equals(msg.data, "windows-1250", 'Expected windows-1250');
  }, "Check detection result");
  w.close(); // `w` is the window opened on the last line of this script
  done();
};
var w = window.open("support/pl-windows-1250-late.sub.html");
</script>

View File

@ -1,14 +0,0 @@
<!doctype html>
<title>pl windows-1250</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<p>To jest test kodowania znak<61>w. W przypadku niekt<6B>rych j<>zyk<79>w, kt<6B>re u<>ywaj<61> znak<61>w <20>aci<63>skich, potrzebujemy wi<77>cej danych, aby podj<64><6A> decyzj<7A>.</p>
<script>
setup({explicit_done: true}); // harness finishes only at the done() call below
onload = function() { // window load handler: check the encoding the document ended up with
  test(function() {
    assert_equals(document.characterSet, "windows-1250", 'Expected windows-1250');
  }, "Check detection result");
  done();
};
</script>

View File

@ -1,16 +0,0 @@
<!doctype html>
<meta charset="utf-8">
<title>pt windows-1252 late</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<script>
setup({explicit_done: true}); // harness finishes only at the done() call below
window.onmessage = function(msg) { // msg.data is compared with the expected encoding label
  test(function() {
    assert_equals(msg.data, "windows-1252", 'Expected windows-1252');
  }, "Check detection result");
  w.close(); // `w` is the window opened on the last line of this script
  done();
};
var w = window.open("support/pt-windows-1252-late.sub.html");
</script>

View File

@ -1,14 +0,0 @@
<!doctype html>
<title>pt windows-1252</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<p>Este <20> um teste de codifica<63><61>o de caracteres.</p>
<script>
setup({explicit_done: true}); // harness finishes only at the done() call below
onload = function() { // window load handler: check the encoding the document ended up with
  test(function() {
    assert_equals(document.characterSet, "windows-1252", 'Expected windows-1252');
  }, "Check detection result");
  done();
};
</script>

View File

@ -1,16 +0,0 @@
<!doctype html>
<meta charset="utf-8">
<title>ru IBM866 late</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<script>
setup({explicit_done: true}); // harness finishes only at the done() call below
window.onmessage = function(msg) { // msg.data is compared with the expected encoding label
  test(function() {
    assert_equals(msg.data, "IBM866", 'Expected IBM866');
  }, "Check detection result");
  w.close(); // `w` is the window opened on the last line of this script
  done();
};
var w = window.open("support/ru-IBM866-late.sub.html");
</script>

View File

@ -1,14 +0,0 @@
<!doctype html>
<title>ru IBM866</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<p><EFBFBD><EFBFBD> <20><><EFBFBD> <20><><EFBFBD><EFBFBD><EFBFBD><E0AEA2><><E1A8AC><EFBFBD><EFBFBD><EFBFBD>.</p>
<script>
setup({explicit_done: true}); // harness finishes only at the done() call below
onload = function() { // window load handler: check the encoding the document ended up with
  test(function() {
    assert_equals(document.characterSet, "IBM866", 'Expected IBM866');
  }, "Check detection result");
  done();
};
</script>

View File

@ -1,16 +0,0 @@
<!doctype html>
<meta charset="utf-8">
<title>ru ISO-8859-5 late</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<script>
setup({explicit_done: true}); // harness finishes only at the done() call below
window.onmessage = function(msg) { // msg.data is compared with the expected encoding label
  test(function() {
    assert_equals(msg.data, "ISO-8859-5", 'Expected ISO-8859-5');
  }, "Check detection result");
  w.close(); // `w` is the window opened on the last line of this script
  done();
};
var w = window.open("support/ru-ISO-8859-5-late.sub.html");
</script>

View File

@ -1,14 +0,0 @@
<!doctype html>
<title>ru ISO-8859-5</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<p><EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>.</p>
<script>
setup({explicit_done: true}); // harness finishes only at the done() call below
onload = function() { // window load handler: check the encoding the document ended up with
  test(function() {
    assert_equals(document.characterSet, "ISO-8859-5", 'Expected ISO-8859-5');
  }, "Check detection result");
  done();
};
</script>

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Some files were not shown because too many files have changed in this diff Show More