Bug 1686463 - Gather telemetry about automatic encoding detection outcomes. r=chutten,emk

Differential Revision: https://phabricator.services.mozilla.com/D102397
This commit is contained in:
Henri Sivonen 2021-01-24 00:11:07 +00:00
parent 7431701c98
commit 9b210c311e
21 changed files with 355 additions and 79 deletions

View File

@ -77,10 +77,15 @@ git = "https://github.com/hsivonen/packed_simd"
replace-with = "vendored-sources"
rev = "0917fe780032a6bbb23d71be545f9c1834128d75"
[source."https://github.com/hsivonen/chardetng_c"]
git = "https://github.com/hsivonen/chardetng_c"
replace-with = "vendored-sources"
rev = "ed8a4c6f900a90d4dbc1d64b856e61490a1c3570"
[source."https://github.com/hsivonen/chardetng"]
git = "https://github.com/hsivonen/chardetng"
replace-with = "vendored-sources"
rev = "7d5e0608d3e012bdfea3bd199111e3546607dd31"
rev = "fd4ed671ef495af4dcda4c4cba3ef8d426db8af1"
[source."https://github.com/gfx-rs/naga"]
git = "https://github.com/gfx-rs/naga"

7
Cargo.lock generated
View File

@ -567,7 +567,7 @@ checksum = "fd16c4719339c4530435d38e511904438d07cce7950afa3718a84ac36c10e89e"
[[package]]
name = "chardetng"
version = "0.1.9"
source = "git+https://github.com/hsivonen/chardetng?rev=7d5e0608d3e012bdfea3bd199111e3546607dd31#7d5e0608d3e012bdfea3bd199111e3546607dd31"
source = "git+https://github.com/hsivonen/chardetng?rev=fd4ed671ef495af4dcda4c4cba3ef8d426db8af1#fd4ed671ef495af4dcda4c4cba3ef8d426db8af1"
dependencies = [
"encoding_rs",
"memchr",
@ -575,9 +575,8 @@ dependencies = [
[[package]]
name = "chardetng_c"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0458f9fd705d9040356a137c5872a0f9ce1d27222d69de6aeeecf00e5e520076"
version = "0.1.2"
source = "git+https://github.com/hsivonen/chardetng_c?rev=ed8a4c6f900a90d4dbc1d64b856e61490a1c3570#ed8a4c6f900a90d4dbc1d64b856e61490a1c3570"
dependencies = [
"chardetng",
"encoding_rs",

View File

@ -71,7 +71,8 @@ opt-level = 1
opt-level = 1
[patch.crates-io]
chardetng = { git = "https://github.com/hsivonen/chardetng", rev="7d5e0608d3e012bdfea3bd199111e3546607dd31" }
chardetng = { git = "https://github.com/hsivonen/chardetng", rev="fd4ed671ef495af4dcda4c4cba3ef8d426db8af1" }
chardetng_c = { git = "https://github.com/hsivonen/chardetng_c", rev="ed8a4c6f900a90d4dbc1d64b856e61490a1c3570" }
libudev-sys = { path = "dom/webauthn/libudev-sys" }
packed_simd = { git = "https://github.com/hsivonen/packed_simd", rev="0917fe780032a6bbb23d71be545f9c1834128d75" }
rlbox_lucet_sandbox = { git = "https://github.com/PLSysSec/rlbox_lucet_sandbox/", rev="f3cace4fb8b53db0849c62af4fa62bade5a620f7" }

View File

@ -1508,12 +1508,26 @@ nsDocShell::GatherCharsetMenuTelemetry() {
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_2::UnlabeledJp);
break;
case kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8:
case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8:
case kCharsetFromInitialAutoDetectionASCII:
// Deliberately no final version
LOGCHARSETMENU(("UnlabeledAscii"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_2::UnlabeledAscii);
break;
case kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Generic:
case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic:
case kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Content:
case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content:
LOGCHARSETMENU(("UnlabeledNonUtf8"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_2::UnlabeledNonUtf8);
break;
case kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD:
case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD:
LOGCHARSETMENU(("UnlabeledNonUtf8TLD"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_2::UnlabeledNonUtf8TLD);
break;
case kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8:
case kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8:
LOGCHARSETMENU(("UnlabeledUtf8"));
@ -1979,13 +1993,8 @@ nsDocShell::GetCharsetAutodetected(bool* aCharsetAutodetected) {
}
int32_t source = doc->GetDocumentCharacterSetSource();
if (source == kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8 ||
source == kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8 ||
source == kCharsetFromFinalJapaneseAutoDetection ||
source == kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8 ||
source == kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8 ||
source == kCharsetFromFinalAutoDetectionFile ||
if ((source >= kCharsetFromInitialAutoDetectionASCII &&
source <= kCharsetFromFinalAutoDetectionFile) ||
source == kCharsetFromUserForcedJapaneseAutoDetection ||
source == kCharsetFromPendingUserForcedAutoDetection ||
source == kCharsetFromInitialUserForcedAutoDetection ||

View File

@ -127,7 +127,8 @@ nsHTMLDocument::nsHTMLDocument()
mNumForms(0),
mLoadFlags(0),
mWarnedWidthHeight(false),
mIsPlainText(false) {
mIsPlainText(false),
mViewSource(false) {
mType = eHTML;
mDefaultElementType = kNameSpaceID_XHTML;
mCompatMode = eCompatibility_NavQuirks;
@ -273,7 +274,7 @@ void nsHTMLDocument::TryParentCharset(nsIDocShell* aDocShell,
return;
}
if (kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8 <= parentSource) {
if (kCharsetFromInitialAutoDetectionASCII <= parentSource) {
// Make sure that's OK
if (!NodePrincipal()->Equals(parentPrincipal) ||
!IsAsciiCompatible(parentCharset)) {
@ -319,9 +320,9 @@ nsresult nsHTMLDocument::StartDocumentLoad(const char* aCommand,
bool view =
!strcmp(aCommand, "view") || !strcmp(aCommand, "external-resource");
bool viewSource = !strcmp(aCommand, "view-source");
mViewSource = !strcmp(aCommand, "view-source");
bool asData = !strcmp(aCommand, kLoadAsData);
if (!(view || viewSource || asData)) {
if (!(view || mViewSource || asData)) {
MOZ_ASSERT(false, "Bad parser command");
return NS_ERROR_INVALID_ARG;
}
@ -331,7 +332,7 @@ nsresult nsHTMLDocument::StartDocumentLoad(const char* aCommand,
contentType.EqualsLiteral(APPLICATION_WAPXHTML_XML));
mIsPlainText =
!html && !xhtml && nsContentUtils::IsPlainTextType(contentType);
if (!(html || xhtml || mIsPlainText || viewSource)) {
if (!(html || xhtml || mIsPlainText || mViewSource)) {
MOZ_ASSERT(false, "Channel with bad content type.");
return NS_ERROR_INVALID_ARG;
}
@ -341,7 +342,7 @@ nsresult nsHTMLDocument::StartDocumentLoad(const char* aCommand,
bool loadAsHtml5 = true;
if (!viewSource && xhtml) {
if (!mViewSource && xhtml) {
// We're parsing XHTML as XML, remember that.
mType = eXHTML;
SetCompatibilityMode(eCompatibility_FullStandards);
@ -382,12 +383,12 @@ nsresult nsHTMLDocument::StartDocumentLoad(const char* aCommand,
html5Parser = nsHtml5Module::NewHtml5Parser();
mParser = html5Parser;
if (mIsPlainText) {
if (viewSource) {
if (mViewSource) {
html5Parser->MarkAsNotScriptCreated("view-source-plain");
} else {
html5Parser->MarkAsNotScriptCreated("plain-text");
}
} else if (viewSource && !html) {
} else if (mViewSource && !html) {
html5Parser->MarkAsNotScriptCreated("view-source-xml");
} else {
html5Parser->MarkAsNotScriptCreated(aCommand);

View File

@ -70,6 +70,8 @@ class nsHTMLDocument : public mozilla::dom::Document {
bool IsPlainText() const { return mIsPlainText; }
// Returns the value of mViewSource, i.e. whether this document was flagged
// as a View Source load.
bool IsViewSource() const { return mViewSource; }
// Returns whether an object was found for aName.
bool ResolveName(JSContext* aCx, const nsAString& aName,
JS::MutableHandle<JS::Value> aRetval,
@ -194,6 +196,11 @@ class nsHTMLDocument : public mozilla::dom::Document {
* Set to true once we know that we are loading plain text content.
*/
bool mIsPlainText;
/**
* Set to true once we know that we are viewing source.
*/
bool mViewSource;
};
namespace mozilla {

View File

@ -57,6 +57,15 @@ class EncodingDetector final {
return detector;
}
/**
 * Reports whether the given top-level domain is considered non-generic,
 * meaning that it could affect the encoding guess.
 *
 * @param aTLD the TLD bytes; handed to chardetng_c as a pointer/length pair.
 * @return the answer given by
 *         chardetng_encoding_detector_tld_may_affect_guess().
 */
static inline bool TldMayAffectGuess(Span<const char> aTLD) {
  const auto* tldStart = aTLD.Elements();
  const auto tldLength = aTLD.Length();
  return chardetng_encoding_detector_tld_may_affect_guess(tldStart, tldLength);
}
/**
* Inform the detector of a chunk of input.
*

View File

@ -228,6 +228,8 @@ nsHtml5StreamParser::nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor,
mLoadFlusher(new nsHtml5LoadFlusher(aExecutor)),
mInitialEncodingWasFromParentFrame(false),
mHasHadErrors(false),
mDetectorHasSeenNonAscii(false),
mDetectorHadOnlySeenAsciiWhenFirstGuessing(false),
mDecodingLocalFileWithoutTokenizing(false),
mFlushTimer(NS_NewTimer(mEventTarget)),
mFlushTimerMutex("nsHtml5StreamParser mFlushTimerMutex"),
@ -278,11 +280,36 @@ nsresult nsHtml5StreamParser::GetChannel(nsIChannel** aChannel) {
: NS_ERROR_NOT_AVAILABLE;
}
/**
 * Maps a "Final" autodetection charset source to its "Initial" counterpart.
 * Any source that has no mapping here is returned unchanged.
 *
 * The Final "would have been UTF-8" source is rolled back only when
 * mDetectorHadOnlySeenAsciiWhenFirstGuessing is false; when the flag is set
 * the Final value is kept.
 * NOTE(review): presumably the Final value is kept because an initial guess
 * made on ASCII-only input could not have established UTF-8ness -- confirm.
 */
int32_t nsHtml5StreamParser::MaybeRollBackSource(int32_t aSource) {
  switch (aSource) {
    case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD:
      return kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD;
    case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic:
      return kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Generic;
    case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content:
      return kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Content;
    case kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8:
      // Only roll back if the first guess had already seen non-ASCII input.
      return mDetectorHadOnlySeenAsciiWhenFirstGuessing
                 ? aSource
                 : kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8;
    case kCharsetFromFinalUserForcedAutoDetection:
      return kCharsetFromInitialUserForcedAutoDetection;
    default:
      // No Initial counterpart to roll back to.
      return aSource;
  }
}
void nsHtml5StreamParser::GuessEncoding(bool aEof, bool aInitial) {
if (mJapaneseDetector) {
return;
}
if (!aInitial) {
if (aInitial) {
if (!mDetectorHasSeenNonAscii) {
mDetectorHadOnlySeenAsciiWhenFirstGuessing = true;
}
} else {
mGuessEncoding = false;
}
bool forced = (mCharsetSource == kCharsetFromPendingUserForcedAutoDetection ||
@ -291,43 +318,66 @@ void nsHtml5StreamParser::GuessEncoding(bool aEof, bool aInitial) {
mCharsetSource != kCharsetFromFinalJapaneseAutoDetection &&
mCharsetSource != kCharsetFromFinalUserForcedAutoDetection &&
mCharsetSource != kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8 &&
mCharsetSource != kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8 &&
mCharsetSource !=
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic &&
mCharsetSource !=
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content &&
mCharsetSource !=
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD &&
mCharsetSource != kCharsetFromFinalAutoDetectionFile);
auto ifHadBeenForced = mDetector->Guess(EmptyCString(), true);
auto encoding =
forced ? mDetector->Guess(EmptyCString(), true)
forced ? ifHadBeenForced
: mDetector->Guess(mTLD, mDecodingLocalFileWithoutTokenizing);
auto source =
int32_t source =
aInitial
? (forced ? kCharsetFromInitialUserForcedAutoDetection
: kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8)
? (forced
? kCharsetFromInitialUserForcedAutoDetection
: kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Generic)
: (forced
? kCharsetFromFinalUserForcedAutoDetection
: (mDecodingLocalFileWithoutTokenizing
? kCharsetFromFinalAutoDetectionFile
: kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8));
if (source == kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8) {
if (mDetector->Guess(EmptyCString(), true) == UTF_8_ENCODING) {
: kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic));
if (source == kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic) {
if (encoding == ISO_2022_JP_ENCODING) {
if (EncodingDetector::TldMayAffectGuess(mTLD)) {
source = kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content;
}
} else if (!mDetectorHasSeenNonAscii) {
source = kCharsetFromInitialAutoDetectionASCII; // deliberately Initial
} else if (ifHadBeenForced == UTF_8_ENCODING) {
source = kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8;
} else if (encoding != ifHadBeenForced) {
source = kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD;
} else if (EncodingDetector::TldMayAffectGuess(mTLD)) {
source = kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content;
}
} else if (source == kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8) {
if (mDetector->Guess(EmptyCString(), true) == UTF_8_ENCODING) {
} else if (source ==
kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Generic) {
if (encoding == ISO_2022_JP_ENCODING) {
if (EncodingDetector::TldMayAffectGuess(mTLD)) {
source = kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Content;
}
} else if (!mDetectorHasSeenNonAscii) {
source = kCharsetFromInitialAutoDetectionASCII;
} else if (ifHadBeenForced == UTF_8_ENCODING) {
source = kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8;
} else if (encoding != ifHadBeenForced) {
source =
kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD;
} else if (EncodingDetector::TldMayAffectGuess(mTLD)) {
source = kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Content;
}
}
if (HasDecoder() && !mDecodingLocalFileWithoutTokenizing) {
if (mEncoding == encoding) {
MOZ_ASSERT(mCharsetSource < source, "Why are we running chardet at all?");
MOZ_ASSERT(mCharsetSource == kCharsetFromInitialAutoDetectionASCII ||
mCharsetSource < source,
"Why are we running chardet at all?");
// Source didn't actually change between initial and final, so roll it
// back for future telemetry purposes, while taking into account the final
// UTF-8ness. https://bugzilla.mozilla.org/show_bug.cgi?id=1686463
if (source == kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8) {
source = kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8;
} else if (source == kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8) {
source = kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8;
} else if (source == kCharsetFromFinalUserForcedAutoDetection) {
source = kCharsetFromInitialUserForcedAutoDetection;
}
mCharsetSource = source;
// back for telemetry purposes.
mCharsetSource = MaybeRollBackSource(source);
mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
} else {
MOZ_ASSERT(mCharsetSource < kCharsetFromFinalJapaneseAutoDetection ||
@ -341,6 +391,11 @@ void nsHtml5StreamParser::GuessEncoding(bool aEof, bool aInitial) {
} else {
// Got a confident answer from the sniffing buffer. That code will
// take care of setting up the decoder.
if (mCharsetSource == kCharsetUninitialized && aEof) {
// The document is so short that the initial buffer is the last
// buffer.
source = MaybeRollBackSource(source);
}
mEncoding = encoding;
mCharsetSource = source;
mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
@ -383,7 +438,7 @@ void nsHtml5StreamParser::FeedDetector(Span<const uint8_t> aBuffer,
if (mJapaneseDetector) {
FeedJapaneseDetector(aBuffer, aLast);
} else {
Unused << mDetector->Feed(aBuffer, aLast);
mDetectorHasSeenNonAscii = mDetector->Feed(aBuffer, aLast);
}
}

View File

@ -362,6 +362,11 @@ class nsHtml5StreamParser final : public nsISupports {
*/
void ReDecodeLocalFile();
/**
 * Change a final autodetection source to the corresponding initial one.
 *
 * Sources without a corresponding initial value are returned unchanged.
 * The final "would have been UTF-8" source is also returned unchanged when
 * mDetectorHadOnlySeenAsciiWhenFirstGuessing is set.
 *
 * @param aSource the charset source to map
 * @return the rolled-back source, or aSource itself if no mapping applies
 */
int32_t MaybeRollBackSource(int32_t aSource);
/**
* Potentially guess the encoding using mozilla::EncodingDetector.
*/
@ -614,6 +619,10 @@ class nsHtml5StreamParser final : public nsISupports {
bool mHasHadErrors;
/**
 * The value returned by the most recent mDetector->Feed() call.
 * NOTE(review): presumably true once the detector has seen at least one
 * non-ASCII byte -- confirm against the chardetng Feed() contract.
 */
bool mDetectorHasSeenNonAscii;
/**
 * Set to true when GuessEncoding() runs as the initial guess while
 * mDetectorHasSeenNonAscii is still false. Consulted by
 * MaybeRollBackSource().
 */
bool mDetectorHadOnlySeenAsciiWhenFirstGuessing;
/**
* If true, we are decoding a local file that lacks an encoding
* declaration and we are not tokenizing yet.

View File

@ -18,10 +18,12 @@
#include "mozilla/StaticPrefs_content.h"
#include "mozilla/StaticPrefs_security.h"
#include "mozilla/StaticPrefs_view_source.h"
#include "mozilla/Telemetry.h"
#include "mozilla/css/Loader.h"
#include "nsContentUtils.h"
#include "nsDocShell.h"
#include "nsError.h"
#include "nsHTMLDocument.h"
#include "nsHtml5AutoPauseUpdate.h"
#include "nsHtml5Parser.h"
#include "nsHtml5StreamParser.h"
@ -41,6 +43,11 @@
using namespace mozilla;
static mozilla::LazyLogModule gCharsetMenuLog("Chardetng");
#define LOGCHARDETNG(args) \
MOZ_LOG(gCharsetMenuLog, mozilla::LogLevel::Debug, args)
NS_IMPL_ISUPPORTS_CYCLE_COLLECTION_INHERITED(nsHtml5TreeOpExecutor,
nsHtml5DocumentBuilder,
nsIContentSink)
@ -202,6 +209,118 @@ nsHtml5TreeOpExecutor::DidBuildModel(bool aTerminated) {
// OnStartRequest call.
if (mStarted) {
mDocument->EndLoad();
// Gather chardetng telemetry
MOZ_ASSERT(mDocument->IsHTMLDocument());
if (!aTerminated && !mDocument->AsHTMLDocument()->IsViewSource()) {
// We deliberately measure only normally-completed (non-aborted) loads
// that are not View Source loads. This seems like a better place for
// checking normal completion than anything in nsHtml5StreamParser.
bool plain = mDocument->AsHTMLDocument()->IsPlainText();
int32_t charsetSource = mDocument->GetDocumentCharacterSetSource();
switch (charsetSource) {
case kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8:
if (plain) {
LOGCHARDETNG(("TEXT::UtfInitial"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_TEXT::UtfInitial);
} else {
LOGCHARDETNG(("HTML::UtfInitial"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_HTML::UtfInitial);
}
break;
case kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Generic:
if (plain) {
LOGCHARDETNG(("TEXT::GenericInitial"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_TEXT::
GenericInitial);
} else {
LOGCHARDETNG(("HTML::GenericInitial"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_HTML::
GenericInitial);
}
break;
case kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Content:
if (plain) {
LOGCHARDETNG(("TEXT::ContentInitial"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_TEXT::
ContentInitial);
} else {
LOGCHARDETNG(("HTML::ContentInitial"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_HTML::
ContentInitial);
}
break;
case kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD:
if (plain) {
LOGCHARDETNG(("TEXT::TldInitial"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_TEXT::TldInitial);
} else {
LOGCHARDETNG(("HTML::TldInitial"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_HTML::TldInitial);
}
break;
// Deliberately no final version of ASCII
case kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8:
if (plain) {
LOGCHARDETNG(("TEXT::UtfFinal"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_TEXT::UtfFinal);
} else {
LOGCHARDETNG(("HTML::UtfFinal"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_HTML::UtfFinal);
}
break;
case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic:
if (plain) {
LOGCHARDETNG(("TEXT::GenericFinal"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_TEXT::
GenericFinal);
} else {
LOGCHARDETNG(("HTML::GenericFinal"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_HTML::
GenericFinal);
}
break;
case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content:
if (plain) {
LOGCHARDETNG(("TEXT::ContentFinal"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_TEXT::
ContentFinal);
} else {
LOGCHARDETNG(("HTML::ContentFinal"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_HTML::
ContentFinal);
}
break;
case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD:
if (plain) {
LOGCHARDETNG(("TEXT::TldFinal"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_TEXT::TldFinal);
} else {
LOGCHARDETNG(("HTML::TldFinal"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_HTML::TldFinal);
}
break;
default:
// Chardetng didn't run automatically or the input was all ASCII.
break;
}
}
}
// Dropping the stream parser changes the parser's apparent

View File

@ -11,12 +11,20 @@ enum {
kCharsetFromFallback,
kCharsetFromTopLevelDomain,
kCharsetFromDocTypeDefault, // This and up confident for XHR
// Start subdividing source for telemetry purposes
kCharsetFromInitialAutoDetectionASCII,
kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8,
kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8,
kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Generic,
kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Content,
kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD,
kCharsetFromFinalJapaneseAutoDetection,
// Deliberately no Final version of ASCII
kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8,
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8,
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic,
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content,
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD,
kCharsetFromFinalAutoDetectionFile,
// End subdividing source for telemetry purposes
kCharsetFromParentFrame, // Same-origin parent takes precedence over detector
// to avoid breaking tests. (Also, the HTML spec
// says so.)

View File

@ -1 +1 @@
{"files":{"CONTRIBUTING.md":"0e64fb3dd5a00e3fd528de6442de3f2ca851bd718c45cca0871aaf4eedac9ee1","COPYRIGHT":"2fd0d7e90bd241b79804de129c5b70089988f82a7bbb0fe580a55b67b2968928","Cargo.toml":"ab767659696eb10dbaab743b566910bd29fc8f8f6998d9580494397a8903bd34","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"4ad721b5b6a3d39ca3e2202f403d897c4a1d42896486dd58963a81f8e64ef61d","README.md":"a6c97d91989aee4c8afed918340ce6287652cbdd6fed833e20f76367c7953db9","src/data.rs":"be48f1486ef9fc264f6cda2e10944b7dcf8ed0a904b53227340a1384803796c7","src/lib.rs":"f70618ea5b783cd2aac8a6ee17babb659257b12d996b0289cf8b5f5e09b3fdd8","src/tld.rs":"295c3c90c60c5bb6edd753b77c261eed10be2d431badda4e02168e740a0f2d7e"},"package":null}
{"files":{"CONTRIBUTING.md":"0e64fb3dd5a00e3fd528de6442de3f2ca851bd718c45cca0871aaf4eedac9ee1","COPYRIGHT":"2fd0d7e90bd241b79804de129c5b70089988f82a7bbb0fe580a55b67b2968928","Cargo.toml":"ab767659696eb10dbaab743b566910bd29fc8f8f6998d9580494397a8903bd34","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"4ad721b5b6a3d39ca3e2202f403d897c4a1d42896486dd58963a81f8e64ef61d","README.md":"a6c97d91989aee4c8afed918340ce6287652cbdd6fed833e20f76367c7953db9","src/data.rs":"be48f1486ef9fc264f6cda2e10944b7dcf8ed0a904b53227340a1384803796c7","src/lib.rs":"356616eaa99dc0af1d89acbf4177836ef5cdc8ac5287d224412da1009d432be9","src/tld.rs":"295c3c90c60c5bb6edd753b77c261eed10be2d431badda4e02168e740a0f2d7e"},"package":null}

View File

@ -2826,6 +2826,15 @@ impl EncodingDetector {
closed: false,
}
}
/// Queries whether the TLD is considered non-generic and could affect the guess.
///
/// Returns `false` when no TLD is given (`None`). Otherwise returns whether
/// `classify_tld` classifies the TLD as anything other than `Tld::Generic`.
pub fn tld_may_affect_guess(tld: Option<&[u8]>) -> bool {
    tld.map_or(false, |label| classify_tld(label) != Tld::Generic)
}
}
#[cfg(test)]

View File

@ -1 +1 @@
{"files":{"CONTRIBUTING.md":"5f28b63428b92d27d796d6d926447d15a19232236200e161ec870f4fdda1b489","COPYRIGHT":"e2dd307feb2e2625f245ae91df617ec79a48a9874f6e057af2906808723d5b15","Cargo.toml":"a935455963868ab0ec4714cae23e5ea088cf6619d6b92b7d5da12d71b6664eff","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"4ad721b5b6a3d39ca3e2202f403d897c4a1d42896486dd58963a81f8e64ef61d","README.md":"da2367d1e8863b102367d45fb7938432ce7c5f8b997e21e1da557e803a786e60","include/chardetng.h":"85b1a44e7a9f4fd31cacb40eaa025159022145edf97513dbbeb2eae6840847c4","src/lib.rs":"cbae33f19c50c0f4d3cc6fe23ef08403304bcffc82919eb3c1870d17813e3f09"},"package":"0458f9fd705d9040356a137c5872a0f9ce1d27222d69de6aeeecf00e5e520076"}
{"files":{"CONTRIBUTING.md":"5f28b63428b92d27d796d6d926447d15a19232236200e161ec870f4fdda1b489","COPYRIGHT":"5fa6d8c0701e5ce051b72b9ed08e3a75d5aee8e4132d876556c3dc04084238c7","Cargo.toml":"4ae3f82f8c8997b0bdf8830959825ec7e77316bbcec62e42255c07afff2e4479","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"3fa4ca83dcc9237839b1bdeb2e6d16bdfb5ec0c5ce42b24694d8bbf0dcbef72c","README.md":"f458dc617c487ace6c60096e1bf3ab1b39c151543b40916717596c28e81deebc","include/chardetng.h":"8a781fcbf6441d063abc6c004d485cb2d5a0b304f3bfe5d5978e70437b7b778e","src/lib.rs":"2eeaf976144bab2c9c819934ca0fce36fe2d9b21236d19b506426733d9a57e2b"},"package":null}

View File

@ -1,4 +1,4 @@
chardetng_c is copyright 2019 Mozilla Foundation.
chardetng_c is copyright Mozilla Foundation.
Licensed under the Apache License, Version 2.0
<LICENSE-APACHE or

View File

@ -1,30 +1,17 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies
#
# If you believe there's an error in this file please file an
# issue against the rust-lang/cargo repository. If you're
# editing this file be aware that the upstream Cargo.toml
# will likely look very different (and much more reasonable)
[package]
edition = "2018"
name = "chardetng_c"
version = "0.1.1"
authors = ["Henri Sivonen <hsivonen@hsivonen.fi>"]
description = "C bindings for chardetng"
homepage = "https://docs.rs/chardetng_c/"
documentation = "https://docs.rs/chardetng_c/"
version = "0.1.2"
authors = ["Henri Sivonen <hsivonen@hsivonen.fi>"]
edition = "2018"
license = "Apache-2.0 OR MIT"
readme = "README.md"
documentation = "https://docs.rs/chardetng_c/"
homepage = "https://docs.rs/chardetng_c/"
repository = "https://github.com/hsivonen/chardetng-c"
keywords = ["encoding", "web", "unicode", "charset"]
categories = ["text-processing", "encoding", "web-programming", "internationalization"]
license = "MIT/Apache-2.0"
repository = "https://github.com/hsivonen/chardetng-c"
[dependencies.chardetng]
version = "0.1.1"
[dependencies.encoding_rs]
version = "0.8.17"
[dependencies]
encoding_rs = "0.8.17"
chardetng = "0.1.1"

View File

@ -1,4 +1,4 @@
Copyright (c) 2019 Mozilla Foundation
Copyright Mozilla Foundation
Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated

View File

@ -18,6 +18,10 @@ online.
## Release Notes
### 0.1.2
* Remove year from copyright notices.
### 0.1.1
* Add newline to the end of the C header.

View File

@ -1,4 +1,4 @@
// Copyright 2019 Mozilla Foundation. See the COPYRIGHT
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
@ -45,6 +45,16 @@ CHARDETNG_ENCODING_DETECTOR* chardetng_encoding_detector_new();
/// Deallocates a detector obtained from `chardetng_encoding_detector_new`.
void chardetng_encoding_detector_free(CHARDETNG_ENCODING_DETECTOR* detector);
/// Queries whether the TLD is considered non-generic and could affect the guess.
///
/// # Undefined Behavior
///
/// UB ensues if
///
/// * `tld` is non-NULL and `tld_len` is non-zero but `tld` and `tld_len`
/// don't designate a range of memory valid for reading.
bool chardetng_encoding_detector_tld_may_affect_guess(char const* tld, size_t tld_len);
/// Inform the detector of a chunk of input.
///
/// The byte stream is represented as a sequence of calls to this

View File

@ -1,4 +1,4 @@
// Copyright 2019 Mozilla Foundation. See the COPYRIGHT
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
@ -49,6 +49,28 @@ pub unsafe extern "C" fn chardetng_encoding_detector_free(detector: *mut Encodin
let _ = Box::from_raw(detector);
}
/// Queries whether the TLD is considered non-generic and could affect the guess.
///
/// # Undefined Behavior
///
/// UB ensues if
///
/// * `tld` is non-NULL and `tld_len` is non-zero but `tld` and `tld_len`
/// don't designate a range of memory valid for reading.
#[no_mangle]
pub unsafe extern "C" fn chardetng_encoding_detector_tld_may_affect_guess(
tld: *const u8,
tld_len: usize,
) -> bool {
let tld_opt = if tld.is_null() {
assert_eq!(tld_len, 0);
None
} else {
Some(::std::slice::from_raw_parts(tld, tld_len))
};
EncodingDetector::tld_may_affect_guess(tld_opt)
}
/// Inform the detector of a chunk of input.
///
/// The byte stream is represented as a sequence of calls to this

View File

@ -8321,13 +8321,35 @@
"record_in_processes": ["main", "content"],
"products": ["firefox"],
"alert_emails": ["hsivonen@mozilla.com"],
"bug_numbers": [840476, 977573, 1513473, 1554592, 1603047, 1648464],
"bug_numbers": [840476, 977573, 1513473, 1554592, 1603047, 1648464, 1686463],
"expires_in_version": "91",
"releaseChannelCollection": "opt-out",
"kind": "categorical",
"labels": ["AutoOverridden", "ManuallyOverridden", "LocalLabeled", "LocalUnlabeled", "UnlabeledInLk", "UnlabeledJp", "UnlabeledNonUtf8", "UnlabeledUtf8", "ChannelNonUtf8", "ChannelUtf8", "MetaNonUtf8", "MetaUtf8", "Bug"],
"labels": ["AutoOverridden", "ManuallyOverridden", "LocalLabeled", "LocalUnlabeled", "UnlabeledInLk", "UnlabeledJp", "UnlabeledNonUtf8", "UnlabeledUtf8", "ChannelNonUtf8", "ChannelUtf8", "MetaNonUtf8", "MetaUtf8", "Bug", "UnlabeledAscii", "UnlabeledNonUtf8TLD"],
"description": "Labeling status of top-level page when overriding encoding"
},
"ENCODING_DETECTION_OUTCOME_TEXT": {
"record_in_processes": ["main", "content"],
"products": ["firefox"],
"alert_emails": ["hsivonen@mozilla.com"],
"bug_numbers": [1686463],
"expires_in_version": "91",
"releaseChannelCollection": "opt-out",
"kind": "categorical",
"labels": ["UtfInitial", "UtfFinal", "TldInitial", "TldFinal", "ContentInitial", "ContentFinal", "GenericInitial", "GenericFinal"],
"description": "Type of automatic encoding detection outcome for text/plain excluding ASCII-only"
},
"ENCODING_DETECTION_OUTCOME_HTML": {
"record_in_processes": ["main", "content"],
"products": ["firefox"],
"alert_emails": ["hsivonen@mozilla.com"],
"bug_numbers": [1686463],
"expires_in_version": "91",
"releaseChannelCollection": "opt-out",
"kind": "categorical",
"labels": ["UtfInitial", "UtfFinal", "TldInitial", "TldFinal", "ContentInitial", "ContentFinal", "GenericInitial", "GenericFinal"],
"description": "Type of automatic encoding detection outcome for text/html excluding ASCII-only"
},
"LONG_REFLOW_INTERRUPTIBLE": {
"record_in_processes": ["main", "content"],
"products": ["firefox", "fennec"],