mirror of
https://github.com/mozilla/gecko-dev.git
synced 2025-02-26 04:09:50 +00:00
Bug 1686463 - Gather telemetry about automatic encoding detection outcomes. r=chutten,emk
Differential Revision: https://phabricator.services.mozilla.com/D102397
This commit is contained in:
parent
7431701c98
commit
9b210c311e
@ -77,10 +77,15 @@ git = "https://github.com/hsivonen/packed_simd"
|
||||
replace-with = "vendored-sources"
|
||||
rev = "0917fe780032a6bbb23d71be545f9c1834128d75"
|
||||
|
||||
[source."https://github.com/hsivonen/chardetng_c"]
|
||||
git = "https://github.com/hsivonen/chardetng_c"
|
||||
replace-with = "vendored-sources"
|
||||
rev = "ed8a4c6f900a90d4dbc1d64b856e61490a1c3570"
|
||||
|
||||
[source."https://github.com/hsivonen/chardetng"]
|
||||
git = "https://github.com/hsivonen/chardetng"
|
||||
replace-with = "vendored-sources"
|
||||
rev = "7d5e0608d3e012bdfea3bd199111e3546607dd31"
|
||||
rev = "fd4ed671ef495af4dcda4c4cba3ef8d426db8af1"
|
||||
|
||||
[source."https://github.com/gfx-rs/naga"]
|
||||
git = "https://github.com/gfx-rs/naga"
|
||||
|
7
Cargo.lock
generated
7
Cargo.lock
generated
@ -567,7 +567,7 @@ checksum = "fd16c4719339c4530435d38e511904438d07cce7950afa3718a84ac36c10e89e"
|
||||
[[package]]
|
||||
name = "chardetng"
|
||||
version = "0.1.9"
|
||||
source = "git+https://github.com/hsivonen/chardetng?rev=7d5e0608d3e012bdfea3bd199111e3546607dd31#7d5e0608d3e012bdfea3bd199111e3546607dd31"
|
||||
source = "git+https://github.com/hsivonen/chardetng?rev=fd4ed671ef495af4dcda4c4cba3ef8d426db8af1#fd4ed671ef495af4dcda4c4cba3ef8d426db8af1"
|
||||
dependencies = [
|
||||
"encoding_rs",
|
||||
"memchr",
|
||||
@ -575,9 +575,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "chardetng_c"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0458f9fd705d9040356a137c5872a0f9ce1d27222d69de6aeeecf00e5e520076"
|
||||
version = "0.1.2"
|
||||
source = "git+https://github.com/hsivonen/chardetng_c?rev=ed8a4c6f900a90d4dbc1d64b856e61490a1c3570#ed8a4c6f900a90d4dbc1d64b856e61490a1c3570"
|
||||
dependencies = [
|
||||
"chardetng",
|
||||
"encoding_rs",
|
||||
|
@ -71,7 +71,8 @@ opt-level = 1
|
||||
opt-level = 1
|
||||
|
||||
[patch.crates-io]
|
||||
chardetng = { git = "https://github.com/hsivonen/chardetng", rev="7d5e0608d3e012bdfea3bd199111e3546607dd31" }
|
||||
chardetng = { git = "https://github.com/hsivonen/chardetng", rev="fd4ed671ef495af4dcda4c4cba3ef8d426db8af1" }
|
||||
chardetng_c = { git = "https://github.com/hsivonen/chardetng_c", rev="ed8a4c6f900a90d4dbc1d64b856e61490a1c3570" }
|
||||
libudev-sys = { path = "dom/webauthn/libudev-sys" }
|
||||
packed_simd = { git = "https://github.com/hsivonen/packed_simd", rev="0917fe780032a6bbb23d71be545f9c1834128d75" }
|
||||
rlbox_lucet_sandbox = { git = "https://github.com/PLSysSec/rlbox_lucet_sandbox/", rev="f3cace4fb8b53db0849c62af4fa62bade5a620f7" }
|
||||
|
@ -1508,12 +1508,26 @@ nsDocShell::GatherCharsetMenuTelemetry() {
|
||||
Telemetry::AccumulateCategorical(
|
||||
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_2::UnlabeledJp);
|
||||
break;
|
||||
case kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8:
|
||||
case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8:
|
||||
case kCharsetFromInitialAutoDetectionASCII:
|
||||
// Deliberately no final version
|
||||
LOGCHARSETMENU(("UnlabeledAscii"));
|
||||
Telemetry::AccumulateCategorical(
|
||||
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_2::UnlabeledAscii);
|
||||
break;
|
||||
case kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Generic:
|
||||
case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic:
|
||||
case kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Content:
|
||||
case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content:
|
||||
LOGCHARSETMENU(("UnlabeledNonUtf8"));
|
||||
Telemetry::AccumulateCategorical(
|
||||
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_2::UnlabeledNonUtf8);
|
||||
break;
|
||||
case kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD:
|
||||
case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD:
|
||||
LOGCHARSETMENU(("UnlabeledNonUtf8TLD"));
|
||||
Telemetry::AccumulateCategorical(
|
||||
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_2::UnlabeledNonUtf8TLD);
|
||||
break;
|
||||
case kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8:
|
||||
case kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8:
|
||||
LOGCHARSETMENU(("UnlabeledUtf8"));
|
||||
@ -1979,13 +1993,8 @@ nsDocShell::GetCharsetAutodetected(bool* aCharsetAutodetected) {
|
||||
}
|
||||
int32_t source = doc->GetDocumentCharacterSetSource();
|
||||
|
||||
if (source == kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8 ||
|
||||
source == kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8 ||
|
||||
|
||||
source == kCharsetFromFinalJapaneseAutoDetection ||
|
||||
source == kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8 ||
|
||||
source == kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8 ||
|
||||
source == kCharsetFromFinalAutoDetectionFile ||
|
||||
if ((source >= kCharsetFromInitialAutoDetectionASCII &&
|
||||
source <= kCharsetFromFinalAutoDetectionFile) ||
|
||||
source == kCharsetFromUserForcedJapaneseAutoDetection ||
|
||||
source == kCharsetFromPendingUserForcedAutoDetection ||
|
||||
source == kCharsetFromInitialUserForcedAutoDetection ||
|
||||
|
@ -127,7 +127,8 @@ nsHTMLDocument::nsHTMLDocument()
|
||||
mNumForms(0),
|
||||
mLoadFlags(0),
|
||||
mWarnedWidthHeight(false),
|
||||
mIsPlainText(false) {
|
||||
mIsPlainText(false),
|
||||
mViewSource(false) {
|
||||
mType = eHTML;
|
||||
mDefaultElementType = kNameSpaceID_XHTML;
|
||||
mCompatMode = eCompatibility_NavQuirks;
|
||||
@ -273,7 +274,7 @@ void nsHTMLDocument::TryParentCharset(nsIDocShell* aDocShell,
|
||||
return;
|
||||
}
|
||||
|
||||
if (kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8 <= parentSource) {
|
||||
if (kCharsetFromInitialAutoDetectionASCII <= parentSource) {
|
||||
// Make sure that's OK
|
||||
if (!NodePrincipal()->Equals(parentPrincipal) ||
|
||||
!IsAsciiCompatible(parentCharset)) {
|
||||
@ -319,9 +320,9 @@ nsresult nsHTMLDocument::StartDocumentLoad(const char* aCommand,
|
||||
|
||||
bool view =
|
||||
!strcmp(aCommand, "view") || !strcmp(aCommand, "external-resource");
|
||||
bool viewSource = !strcmp(aCommand, "view-source");
|
||||
mViewSource = !strcmp(aCommand, "view-source");
|
||||
bool asData = !strcmp(aCommand, kLoadAsData);
|
||||
if (!(view || viewSource || asData)) {
|
||||
if (!(view || mViewSource || asData)) {
|
||||
MOZ_ASSERT(false, "Bad parser command");
|
||||
return NS_ERROR_INVALID_ARG;
|
||||
}
|
||||
@ -331,7 +332,7 @@ nsresult nsHTMLDocument::StartDocumentLoad(const char* aCommand,
|
||||
contentType.EqualsLiteral(APPLICATION_WAPXHTML_XML));
|
||||
mIsPlainText =
|
||||
!html && !xhtml && nsContentUtils::IsPlainTextType(contentType);
|
||||
if (!(html || xhtml || mIsPlainText || viewSource)) {
|
||||
if (!(html || xhtml || mIsPlainText || mViewSource)) {
|
||||
MOZ_ASSERT(false, "Channel with bad content type.");
|
||||
return NS_ERROR_INVALID_ARG;
|
||||
}
|
||||
@ -341,7 +342,7 @@ nsresult nsHTMLDocument::StartDocumentLoad(const char* aCommand,
|
||||
|
||||
bool loadAsHtml5 = true;
|
||||
|
||||
if (!viewSource && xhtml) {
|
||||
if (!mViewSource && xhtml) {
|
||||
// We're parsing XHTML as XML, remember that.
|
||||
mType = eXHTML;
|
||||
SetCompatibilityMode(eCompatibility_FullStandards);
|
||||
@ -382,12 +383,12 @@ nsresult nsHTMLDocument::StartDocumentLoad(const char* aCommand,
|
||||
html5Parser = nsHtml5Module::NewHtml5Parser();
|
||||
mParser = html5Parser;
|
||||
if (mIsPlainText) {
|
||||
if (viewSource) {
|
||||
if (mViewSource) {
|
||||
html5Parser->MarkAsNotScriptCreated("view-source-plain");
|
||||
} else {
|
||||
html5Parser->MarkAsNotScriptCreated("plain-text");
|
||||
}
|
||||
} else if (viewSource && !html) {
|
||||
} else if (mViewSource && !html) {
|
||||
html5Parser->MarkAsNotScriptCreated("view-source-xml");
|
||||
} else {
|
||||
html5Parser->MarkAsNotScriptCreated(aCommand);
|
||||
|
@ -70,6 +70,8 @@ class nsHTMLDocument : public mozilla::dom::Document {
|
||||
|
||||
bool IsPlainText() const { return mIsPlainText; }
|
||||
|
||||
// Returns mViewSource, which StartDocumentLoad() sets to true when the load
// command is "view-source".
bool IsViewSource() const { return mViewSource; }
|
||||
|
||||
// Returns whether an object was found for aName.
|
||||
bool ResolveName(JSContext* aCx, const nsAString& aName,
|
||||
JS::MutableHandle<JS::Value> aRetval,
|
||||
@ -194,6 +196,11 @@ class nsHTMLDocument : public mozilla::dom::Document {
|
||||
* Set to true once we know that we are loading plain text content.
|
||||
*/
|
||||
bool mIsPlainText;
|
||||
|
||||
/**
|
||||
* Set to true once we know that we are viewing source.
|
||||
*/
|
||||
bool mViewSource;
|
||||
};
|
||||
|
||||
namespace mozilla {
|
||||
|
@ -57,6 +57,15 @@ class EncodingDetector final {
|
||||
return detector;
|
||||
}
|
||||
|
||||
/**
|
||||
* Queries whether the TLD is considered non-generic and could affect the
|
||||
* guess.
|
||||
*/
|
||||
static inline bool TldMayAffectGuess(Span<const char> aTLD) {
|
||||
return chardetng_encoding_detector_tld_may_affect_guess(aTLD.Elements(),
|
||||
aTLD.Length());
|
||||
}
|
||||
|
||||
/**
|
||||
* Inform the detector of a chunk of input.
|
||||
*
|
||||
|
@ -228,6 +228,8 @@ nsHtml5StreamParser::nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor,
|
||||
mLoadFlusher(new nsHtml5LoadFlusher(aExecutor)),
|
||||
mInitialEncodingWasFromParentFrame(false),
|
||||
mHasHadErrors(false),
|
||||
mDetectorHasSeenNonAscii(false),
|
||||
mDetectorHadOnlySeenAsciiWhenFirstGuessing(false),
|
||||
mDecodingLocalFileWithoutTokenizing(false),
|
||||
mFlushTimer(NS_NewTimer(mEventTarget)),
|
||||
mFlushTimerMutex("nsHtml5StreamParser mFlushTimerMutex"),
|
||||
@ -278,11 +280,36 @@ nsresult nsHtml5StreamParser::GetChannel(nsIChannel** aChannel) {
|
||||
: NS_ERROR_NOT_AVAILABLE;
|
||||
}
|
||||
|
||||
int32_t nsHtml5StreamParser::MaybeRollBackSource(int32_t aSource) {
|
||||
if (aSource ==
|
||||
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD) {
|
||||
return kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD;
|
||||
}
|
||||
if (aSource == kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic) {
|
||||
return kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Generic;
|
||||
}
|
||||
if (aSource == kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content) {
|
||||
return kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Content;
|
||||
}
|
||||
if (aSource == kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8 &&
|
||||
!mDetectorHadOnlySeenAsciiWhenFirstGuessing) {
|
||||
return kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8;
|
||||
}
|
||||
if (aSource == kCharsetFromFinalUserForcedAutoDetection) {
|
||||
aSource = kCharsetFromInitialUserForcedAutoDetection;
|
||||
}
|
||||
return aSource;
|
||||
}
|
||||
|
||||
void nsHtml5StreamParser::GuessEncoding(bool aEof, bool aInitial) {
|
||||
if (mJapaneseDetector) {
|
||||
return;
|
||||
}
|
||||
if (!aInitial) {
|
||||
if (aInitial) {
|
||||
if (!mDetectorHasSeenNonAscii) {
|
||||
mDetectorHadOnlySeenAsciiWhenFirstGuessing = true;
|
||||
}
|
||||
} else {
|
||||
mGuessEncoding = false;
|
||||
}
|
||||
bool forced = (mCharsetSource == kCharsetFromPendingUserForcedAutoDetection ||
|
||||
@ -291,43 +318,66 @@ void nsHtml5StreamParser::GuessEncoding(bool aEof, bool aInitial) {
|
||||
mCharsetSource != kCharsetFromFinalJapaneseAutoDetection &&
|
||||
mCharsetSource != kCharsetFromFinalUserForcedAutoDetection &&
|
||||
mCharsetSource != kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8 &&
|
||||
mCharsetSource != kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8 &&
|
||||
mCharsetSource !=
|
||||
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic &&
|
||||
mCharsetSource !=
|
||||
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content &&
|
||||
mCharsetSource !=
|
||||
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD &&
|
||||
mCharsetSource != kCharsetFromFinalAutoDetectionFile);
|
||||
auto ifHadBeenForced = mDetector->Guess(EmptyCString(), true);
|
||||
auto encoding =
|
||||
forced ? mDetector->Guess(EmptyCString(), true)
|
||||
forced ? ifHadBeenForced
|
||||
: mDetector->Guess(mTLD, mDecodingLocalFileWithoutTokenizing);
|
||||
auto source =
|
||||
int32_t source =
|
||||
aInitial
|
||||
? (forced ? kCharsetFromInitialUserForcedAutoDetection
|
||||
: kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8)
|
||||
? (forced
|
||||
? kCharsetFromInitialUserForcedAutoDetection
|
||||
: kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Generic)
|
||||
: (forced
|
||||
? kCharsetFromFinalUserForcedAutoDetection
|
||||
: (mDecodingLocalFileWithoutTokenizing
|
||||
? kCharsetFromFinalAutoDetectionFile
|
||||
: kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8));
|
||||
if (source == kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8) {
|
||||
if (mDetector->Guess(EmptyCString(), true) == UTF_8_ENCODING) {
|
||||
: kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic));
|
||||
if (source == kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic) {
|
||||
if (encoding == ISO_2022_JP_ENCODING) {
|
||||
if (EncodingDetector::TldMayAffectGuess(mTLD)) {
|
||||
source = kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content;
|
||||
}
|
||||
} else if (!mDetectorHasSeenNonAscii) {
|
||||
source = kCharsetFromInitialAutoDetectionASCII; // deliberately Initial
|
||||
} else if (ifHadBeenForced == UTF_8_ENCODING) {
|
||||
source = kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8;
|
||||
} else if (encoding != ifHadBeenForced) {
|
||||
source = kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD;
|
||||
} else if (EncodingDetector::TldMayAffectGuess(mTLD)) {
|
||||
source = kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content;
|
||||
}
|
||||
} else if (source == kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8) {
|
||||
if (mDetector->Guess(EmptyCString(), true) == UTF_8_ENCODING) {
|
||||
} else if (source ==
|
||||
kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Generic) {
|
||||
if (encoding == ISO_2022_JP_ENCODING) {
|
||||
if (EncodingDetector::TldMayAffectGuess(mTLD)) {
|
||||
source = kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Content;
|
||||
}
|
||||
} else if (!mDetectorHasSeenNonAscii) {
|
||||
source = kCharsetFromInitialAutoDetectionASCII;
|
||||
} else if (ifHadBeenForced == UTF_8_ENCODING) {
|
||||
source = kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8;
|
||||
} else if (encoding != ifHadBeenForced) {
|
||||
source =
|
||||
kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD;
|
||||
} else if (EncodingDetector::TldMayAffectGuess(mTLD)) {
|
||||
source = kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Content;
|
||||
}
|
||||
}
|
||||
if (HasDecoder() && !mDecodingLocalFileWithoutTokenizing) {
|
||||
if (mEncoding == encoding) {
|
||||
MOZ_ASSERT(mCharsetSource < source, "Why are we running chardet at all?");
|
||||
MOZ_ASSERT(mCharsetSource == kCharsetFromInitialAutoDetectionASCII ||
|
||||
mCharsetSource < source,
|
||||
"Why are we running chardet at all?");
|
||||
// Source didn't actually change between initial and final, so roll it
|
||||
// back for future telemetry purposes, while taking into account the final
|
||||
// UTF-8ness. https://bugzilla.mozilla.org/show_bug.cgi?id=1686463
|
||||
if (source == kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8) {
|
||||
source = kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8;
|
||||
} else if (source == kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8) {
|
||||
source = kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8;
|
||||
} else if (source == kCharsetFromFinalUserForcedAutoDetection) {
|
||||
source = kCharsetFromInitialUserForcedAutoDetection;
|
||||
}
|
||||
mCharsetSource = source;
|
||||
// back for telemetry purposes.
|
||||
mCharsetSource = MaybeRollBackSource(source);
|
||||
mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
|
||||
} else {
|
||||
MOZ_ASSERT(mCharsetSource < kCharsetFromFinalJapaneseAutoDetection ||
|
||||
@ -341,6 +391,11 @@ void nsHtml5StreamParser::GuessEncoding(bool aEof, bool aInitial) {
|
||||
} else {
|
||||
// Got a confident answer from the sniffing buffer. That code will
|
||||
// take care of setting up the decoder.
|
||||
if (mCharsetSource == kCharsetUninitialized && aEof) {
|
||||
// The document is so short that the initial buffer is the last
|
||||
// buffer.
|
||||
source = MaybeRollBackSource(source);
|
||||
}
|
||||
mEncoding = encoding;
|
||||
mCharsetSource = source;
|
||||
mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource);
|
||||
@ -383,7 +438,7 @@ void nsHtml5StreamParser::FeedDetector(Span<const uint8_t> aBuffer,
|
||||
if (mJapaneseDetector) {
|
||||
FeedJapaneseDetector(aBuffer, aLast);
|
||||
} else {
|
||||
Unused << mDetector->Feed(aBuffer, aLast);
|
||||
mDetectorHasSeenNonAscii = mDetector->Feed(aBuffer, aLast);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -362,6 +362,11 @@ class nsHtml5StreamParser final : public nsISupports {
|
||||
*/
|
||||
void ReDecodeLocalFile();
|
||||
|
||||
/**
|
||||
* Change a final autodetection source to the corresponding initial one.
|
||||
*/
|
||||
int32_t MaybeRollBackSource(int32_t aSource);
|
||||
|
||||
/**
|
||||
* Potentially guess the encoding using mozilla::EncodingDetector.
|
||||
*/
|
||||
@ -614,6 +619,10 @@ class nsHtml5StreamParser final : public nsISupports {
|
||||
|
||||
bool mHasHadErrors;
|
||||
|
||||
bool mDetectorHasSeenNonAscii;
|
||||
|
||||
bool mDetectorHadOnlySeenAsciiWhenFirstGuessing;
|
||||
|
||||
/**
|
||||
* If true, we are decoding a local file that lacks an encoding
|
||||
* declaration and we are not tokenizing yet.
|
||||
|
@ -18,10 +18,12 @@
|
||||
#include "mozilla/StaticPrefs_content.h"
|
||||
#include "mozilla/StaticPrefs_security.h"
|
||||
#include "mozilla/StaticPrefs_view_source.h"
|
||||
#include "mozilla/Telemetry.h"
|
||||
#include "mozilla/css/Loader.h"
|
||||
#include "nsContentUtils.h"
|
||||
#include "nsDocShell.h"
|
||||
#include "nsError.h"
|
||||
#include "nsHTMLDocument.h"
|
||||
#include "nsHtml5AutoPauseUpdate.h"
|
||||
#include "nsHtml5Parser.h"
|
||||
#include "nsHtml5StreamParser.h"
|
||||
@ -41,6 +43,11 @@
|
||||
|
||||
using namespace mozilla;
|
||||
|
||||
// Log module "Chardetng", used for the encoding-detection outcome messages
// emitted from this file. The variable is named after the module it backs so
// that it is not mistaken for the separate charset-menu logger.
static mozilla::LazyLogModule gChardetngLog("Chardetng");

// Logs `args` (a parenthesized printf-style argument list) to the "Chardetng"
// module at Debug level.
#define LOGCHARDETNG(args) \
  MOZ_LOG(gChardetngLog, mozilla::LogLevel::Debug, args)
|
||||
|
||||
NS_IMPL_ISUPPORTS_CYCLE_COLLECTION_INHERITED(nsHtml5TreeOpExecutor,
|
||||
nsHtml5DocumentBuilder,
|
||||
nsIContentSink)
|
||||
@ -202,6 +209,118 @@ nsHtml5TreeOpExecutor::DidBuildModel(bool aTerminated) {
|
||||
// OnStartRequest call.
|
||||
if (mStarted) {
|
||||
mDocument->EndLoad();
|
||||
|
||||
// Gather chardetng telemetry
|
||||
MOZ_ASSERT(mDocument->IsHTMLDocument());
|
||||
if (!aTerminated && !mDocument->AsHTMLDocument()->IsViewSource()) {
|
||||
// We deliberately measure only normally-completed (non-aborted) loads
|
||||
// that are not View Source loads. This seems like a better place for
|
||||
// checking normal completion than anything in nsHtml5StreamParser.
|
||||
bool plain = mDocument->AsHTMLDocument()->IsPlainText();
|
||||
int32_t charsetSource = mDocument->GetDocumentCharacterSetSource();
|
||||
switch (charsetSource) {
|
||||
case kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8:
|
||||
if (plain) {
|
||||
LOGCHARDETNG(("TEXT::UtfInitial"));
|
||||
Telemetry::AccumulateCategorical(
|
||||
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_TEXT::UtfInitial);
|
||||
} else {
|
||||
LOGCHARDETNG(("HTML::UtfInitial"));
|
||||
Telemetry::AccumulateCategorical(
|
||||
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_HTML::UtfInitial);
|
||||
}
|
||||
break;
|
||||
case kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Generic:
|
||||
if (plain) {
|
||||
LOGCHARDETNG(("TEXT::GenericInitial"));
|
||||
Telemetry::AccumulateCategorical(
|
||||
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_TEXT::
|
||||
GenericInitial);
|
||||
} else {
|
||||
LOGCHARDETNG(("HTML::GenericInitial"));
|
||||
Telemetry::AccumulateCategorical(
|
||||
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_HTML::
|
||||
GenericInitial);
|
||||
}
|
||||
break;
|
||||
case kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Content:
|
||||
if (plain) {
|
||||
LOGCHARDETNG(("TEXT::ContentInitial"));
|
||||
Telemetry::AccumulateCategorical(
|
||||
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_TEXT::
|
||||
ContentInitial);
|
||||
} else {
|
||||
LOGCHARDETNG(("HTML::ContentInitial"));
|
||||
Telemetry::AccumulateCategorical(
|
||||
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_HTML::
|
||||
ContentInitial);
|
||||
}
|
||||
break;
|
||||
case kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD:
|
||||
if (plain) {
|
||||
LOGCHARDETNG(("TEXT::TldInitial"));
|
||||
Telemetry::AccumulateCategorical(
|
||||
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_TEXT::TldInitial);
|
||||
} else {
|
||||
LOGCHARDETNG(("HTML::TldInitial"));
|
||||
Telemetry::AccumulateCategorical(
|
||||
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_HTML::TldInitial);
|
||||
}
|
||||
break;
|
||||
// Deliberately no final version of ASCII
|
||||
case kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8:
|
||||
if (plain) {
|
||||
LOGCHARDETNG(("TEXT::UtfFinal"));
|
||||
Telemetry::AccumulateCategorical(
|
||||
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_TEXT::UtfFinal);
|
||||
} else {
|
||||
LOGCHARDETNG(("HTML::UtfFinal"));
|
||||
Telemetry::AccumulateCategorical(
|
||||
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_HTML::UtfFinal);
|
||||
}
|
||||
break;
|
||||
case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic:
|
||||
if (plain) {
|
||||
LOGCHARDETNG(("TEXT::GenericFinal"));
|
||||
Telemetry::AccumulateCategorical(
|
||||
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_TEXT::
|
||||
GenericFinal);
|
||||
} else {
|
||||
LOGCHARDETNG(("HTML::GenericFinal"));
|
||||
Telemetry::AccumulateCategorical(
|
||||
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_HTML::
|
||||
GenericFinal);
|
||||
}
|
||||
break;
|
||||
case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content:
|
||||
if (plain) {
|
||||
LOGCHARDETNG(("TEXT::ContentFinal"));
|
||||
Telemetry::AccumulateCategorical(
|
||||
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_TEXT::
|
||||
ContentFinal);
|
||||
} else {
|
||||
LOGCHARDETNG(("HTML::ContentFinal"));
|
||||
Telemetry::AccumulateCategorical(
|
||||
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_HTML::
|
||||
ContentFinal);
|
||||
}
|
||||
break;
|
||||
case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD:
|
||||
if (plain) {
|
||||
LOGCHARDETNG(("TEXT::TldFinal"));
|
||||
Telemetry::AccumulateCategorical(
|
||||
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_TEXT::TldFinal);
|
||||
} else {
|
||||
LOGCHARDETNG(("HTML::TldFinal"));
|
||||
Telemetry::AccumulateCategorical(
|
||||
Telemetry::LABELS_ENCODING_DETECTION_OUTCOME_HTML::TldFinal);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
// Chardetng didn't run automatically or the input was all ASCII.
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Dropping the stream parser changes the parser's apparent
|
||||
|
@ -11,12 +11,20 @@ enum {
|
||||
kCharsetFromFallback,
|
||||
kCharsetFromTopLevelDomain,
|
||||
kCharsetFromDocTypeDefault, // This and up confident for XHR
|
||||
// Start subdividing source for telemetry purposes
|
||||
kCharsetFromInitialAutoDetectionASCII,
|
||||
kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8,
|
||||
kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8,
|
||||
kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Generic,
|
||||
kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Content,
|
||||
kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD,
|
||||
kCharsetFromFinalJapaneseAutoDetection,
|
||||
// Deliberately no Final version of ASCII
|
||||
kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8,
|
||||
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8,
|
||||
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic,
|
||||
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content,
|
||||
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD,
|
||||
kCharsetFromFinalAutoDetectionFile,
|
||||
// End subdividing source for telemetry purposes
|
||||
kCharsetFromParentFrame, // Same-origin parent takes precedence over detector
|
||||
// to avoid breaking tests. (Also, the HTML spec
|
||||
// says so.)
|
||||
|
@ -1 +1 @@
|
||||
{"files":{"CONTRIBUTING.md":"0e64fb3dd5a00e3fd528de6442de3f2ca851bd718c45cca0871aaf4eedac9ee1","COPYRIGHT":"2fd0d7e90bd241b79804de129c5b70089988f82a7bbb0fe580a55b67b2968928","Cargo.toml":"ab767659696eb10dbaab743b566910bd29fc8f8f6998d9580494397a8903bd34","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"4ad721b5b6a3d39ca3e2202f403d897c4a1d42896486dd58963a81f8e64ef61d","README.md":"a6c97d91989aee4c8afed918340ce6287652cbdd6fed833e20f76367c7953db9","src/data.rs":"be48f1486ef9fc264f6cda2e10944b7dcf8ed0a904b53227340a1384803796c7","src/lib.rs":"f70618ea5b783cd2aac8a6ee17babb659257b12d996b0289cf8b5f5e09b3fdd8","src/tld.rs":"295c3c90c60c5bb6edd753b77c261eed10be2d431badda4e02168e740a0f2d7e"},"package":null}
|
||||
{"files":{"CONTRIBUTING.md":"0e64fb3dd5a00e3fd528de6442de3f2ca851bd718c45cca0871aaf4eedac9ee1","COPYRIGHT":"2fd0d7e90bd241b79804de129c5b70089988f82a7bbb0fe580a55b67b2968928","Cargo.toml":"ab767659696eb10dbaab743b566910bd29fc8f8f6998d9580494397a8903bd34","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"4ad721b5b6a3d39ca3e2202f403d897c4a1d42896486dd58963a81f8e64ef61d","README.md":"a6c97d91989aee4c8afed918340ce6287652cbdd6fed833e20f76367c7953db9","src/data.rs":"be48f1486ef9fc264f6cda2e10944b7dcf8ed0a904b53227340a1384803796c7","src/lib.rs":"356616eaa99dc0af1d89acbf4177836ef5cdc8ac5287d224412da1009d432be9","src/tld.rs":"295c3c90c60c5bb6edd753b77c261eed10be2d431badda4e02168e740a0f2d7e"},"package":null}
|
9
third_party/rust/chardetng/src/lib.rs
vendored
9
third_party/rust/chardetng/src/lib.rs
vendored
@ -2826,6 +2826,15 @@ impl EncodingDetector {
|
||||
closed: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Queries whether the TLD is considered non-generic and could affect the guess.
|
||||
pub fn tld_may_affect_guess(tld: Option<&[u8]>) -> bool {
|
||||
if let Some(tld) = tld {
|
||||
classify_tld(tld) != Tld::Generic
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
@ -1 +1 @@
|
||||
{"files":{"CONTRIBUTING.md":"5f28b63428b92d27d796d6d926447d15a19232236200e161ec870f4fdda1b489","COPYRIGHT":"e2dd307feb2e2625f245ae91df617ec79a48a9874f6e057af2906808723d5b15","Cargo.toml":"a935455963868ab0ec4714cae23e5ea088cf6619d6b92b7d5da12d71b6664eff","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"4ad721b5b6a3d39ca3e2202f403d897c4a1d42896486dd58963a81f8e64ef61d","README.md":"da2367d1e8863b102367d45fb7938432ce7c5f8b997e21e1da557e803a786e60","include/chardetng.h":"85b1a44e7a9f4fd31cacb40eaa025159022145edf97513dbbeb2eae6840847c4","src/lib.rs":"cbae33f19c50c0f4d3cc6fe23ef08403304bcffc82919eb3c1870d17813e3f09"},"package":"0458f9fd705d9040356a137c5872a0f9ce1d27222d69de6aeeecf00e5e520076"}
|
||||
{"files":{"CONTRIBUTING.md":"5f28b63428b92d27d796d6d926447d15a19232236200e161ec870f4fdda1b489","COPYRIGHT":"5fa6d8c0701e5ce051b72b9ed08e3a75d5aee8e4132d876556c3dc04084238c7","Cargo.toml":"4ae3f82f8c8997b0bdf8830959825ec7e77316bbcec62e42255c07afff2e4479","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"3fa4ca83dcc9237839b1bdeb2e6d16bdfb5ec0c5ce42b24694d8bbf0dcbef72c","README.md":"f458dc617c487ace6c60096e1bf3ab1b39c151543b40916717596c28e81deebc","include/chardetng.h":"8a781fcbf6441d063abc6c004d485cb2d5a0b304f3bfe5d5978e70437b7b778e","src/lib.rs":"2eeaf976144bab2c9c819934ca0fce36fe2d9b21236d19b506426733d9a57e2b"},"package":null}
|
2
third_party/rust/chardetng_c/COPYRIGHT
vendored
2
third_party/rust/chardetng_c/COPYRIGHT
vendored
@ -1,4 +1,4 @@
|
||||
chardetng_c is copyright 2019 Mozilla Foundation.
|
||||
chardetng_c is copyright Mozilla Foundation.
|
||||
|
||||
Licensed under the Apache License, Version 2.0
|
||||
<LICENSE-APACHE or
|
||||
|
33
third_party/rust/chardetng_c/Cargo.toml
vendored
33
third_party/rust/chardetng_c/Cargo.toml
vendored
@ -1,30 +1,17 @@
|
||||
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
|
||||
#
|
||||
# When uploading crates to the registry Cargo will automatically
|
||||
# "normalize" Cargo.toml files for maximal compatibility
|
||||
# with all versions of Cargo and also rewrite `path` dependencies
|
||||
# to registry (e.g., crates.io) dependencies
|
||||
#
|
||||
# If you believe there's an error in this file please file an
|
||||
# issue against the rust-lang/cargo repository. If you're
|
||||
# editing this file be aware that the upstream Cargo.toml
|
||||
# will likely look very different (and much more reasonable)
|
||||
|
||||
[package]
|
||||
edition = "2018"
|
||||
name = "chardetng_c"
|
||||
version = "0.1.1"
|
||||
authors = ["Henri Sivonen <hsivonen@hsivonen.fi>"]
|
||||
description = "C bindings for chardetng"
|
||||
homepage = "https://docs.rs/chardetng_c/"
|
||||
documentation = "https://docs.rs/chardetng_c/"
|
||||
version = "0.1.2"
|
||||
authors = ["Henri Sivonen <hsivonen@hsivonen.fi>"]
|
||||
edition = "2018"
|
||||
license = "Apache-2.0 OR MIT"
|
||||
readme = "README.md"
|
||||
documentation = "https://docs.rs/chardetng_c/"
|
||||
homepage = "https://docs.rs/chardetng_c/"
|
||||
repository = "https://github.com/hsivonen/chardetng-c"
|
||||
keywords = ["encoding", "web", "unicode", "charset"]
|
||||
categories = ["text-processing", "encoding", "web-programming", "internationalization"]
|
||||
license = "MIT/Apache-2.0"
|
||||
repository = "https://github.com/hsivonen/chardetng-c"
|
||||
[dependencies.chardetng]
|
||||
version = "0.1.1"
|
||||
|
||||
[dependencies.encoding_rs]
|
||||
version = "0.8.17"
|
||||
[dependencies]
|
||||
encoding_rs = "0.8.17"
|
||||
chardetng = "0.1.1"
|
2
third_party/rust/chardetng_c/LICENSE-MIT
vendored
2
third_party/rust/chardetng_c/LICENSE-MIT
vendored
@ -1,4 +1,4 @@
|
||||
Copyright (c) 2019 Mozilla Foundation
|
||||
Copyright Mozilla Foundation
|
||||
|
||||
Permission is hereby granted, free of charge, to any
|
||||
person obtaining a copy of this software and associated
|
||||
|
4
third_party/rust/chardetng_c/README.md
vendored
4
third_party/rust/chardetng_c/README.md
vendored
@ -18,6 +18,10 @@ online.
|
||||
|
||||
## Release Notes
|
||||
|
||||
### 0.1.2
|
||||
|
||||
* Remove year from copyright notices.
|
||||
|
||||
### 0.1.1
|
||||
|
||||
* Add newline to the end of the C header.
|
||||
|
12
third_party/rust/chardetng_c/include/chardetng.h
vendored
12
third_party/rust/chardetng_c/include/chardetng.h
vendored
@ -1,4 +1,4 @@
|
||||
// Copyright 2019 Mozilla Foundation. See the COPYRIGHT
|
||||
// Copyright Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
@ -45,6 +45,16 @@ CHARDETNG_ENCODING_DETECTOR* chardetng_encoding_detector_new();
|
||||
/// Deallocates a detector obtained from `chardetng_encodingdetector_new`.
|
||||
void chardetng_encoding_detector_free(CHARDETNG_ENCODING_DETECTOR* detector);
|
||||
|
||||
/// Queries whether the TLD is considered non-generic and could affect the guess.
|
||||
///
|
||||
/// # Undefined Behavior
|
||||
///
|
||||
/// UB ensues if
|
||||
///
|
||||
/// * `tld` is non-NULL and `tld_len` is non-zero but `tld` and `tld_len`
|
||||
/// don't designate a range of memory valid for reading.
|
||||
bool chardetng_encoding_detector_tld_may_affect_guess(char const* tld, size_t tld_len);
|
||||
|
||||
/// Inform the detector of a chunk of input.
|
||||
///
|
||||
/// The byte stream is represented as a sequence of calls to this
|
||||
|
24
third_party/rust/chardetng_c/src/lib.rs
vendored
24
third_party/rust/chardetng_c/src/lib.rs
vendored
@ -1,4 +1,4 @@
|
||||
// Copyright 2019 Mozilla Foundation. See the COPYRIGHT
|
||||
// Copyright Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
@ -49,6 +49,28 @@ pub unsafe extern "C" fn chardetng_encoding_detector_free(detector: *mut Encodin
|
||||
let _ = Box::from_raw(detector);
|
||||
}
|
||||
|
||||
/// Queries whether the TLD is considered non-generic and could affect the guess.
|
||||
///
|
||||
/// # Undefined Behavior
|
||||
///
|
||||
/// UB ensues if
|
||||
///
|
||||
/// * `tld` is non-NULL and `tld_len` is non-zero but `tld` and `tld_len`
|
||||
/// don't designate a range of memory valid for reading.
|
||||
#[no_mangle]
|
||||
pub unsafe extern "C" fn chardetng_encoding_detector_tld_may_affect_guess(
|
||||
tld: *const u8,
|
||||
tld_len: usize,
|
||||
) -> bool {
|
||||
let tld_opt = if tld.is_null() {
|
||||
assert_eq!(tld_len, 0);
|
||||
None
|
||||
} else {
|
||||
Some(::std::slice::from_raw_parts(tld, tld_len))
|
||||
};
|
||||
EncodingDetector::tld_may_affect_guess(tld_opt)
|
||||
}
|
||||
|
||||
/// Inform the detector of a chunk of input.
|
||||
///
|
||||
/// The byte stream is represented as a sequence of calls to this
|
||||
|
@ -8321,13 +8321,35 @@
|
||||
"record_in_processes": ["main", "content"],
|
||||
"products": ["firefox"],
|
||||
"alert_emails": ["hsivonen@mozilla.com"],
|
||||
"bug_numbers": [840476, 977573, 1513473, 1554592, 1603047, 1648464],
|
||||
"bug_numbers": [840476, 977573, 1513473, 1554592, 1603047, 1648464, 1686463],
|
||||
"expires_in_version": "91",
|
||||
"releaseChannelCollection": "opt-out",
|
||||
"kind": "categorical",
|
||||
"labels": ["AutoOverridden", "ManuallyOverridden", "LocalLabeled", "LocalUnlabeled", "UnlabeledInLk", "UnlabeledJp", "UnlabeledNonUtf8", "UnlabeledUtf8", "ChannelNonUtf8", "ChannelUtf8", "MetaNonUtf8", "MetaUtf8", "Bug"],
|
||||
"labels": ["AutoOverridden", "ManuallyOverridden", "LocalLabeled", "LocalUnlabeled", "UnlabeledInLk", "UnlabeledJp", "UnlabeledNonUtf8", "UnlabeledUtf8", "ChannelNonUtf8", "ChannelUtf8", "MetaNonUtf8", "MetaUtf8", "Bug", "UnlabeledAscii", "UnlabeledNonUtf8TLD"],
|
||||
"description": "Labeling status of top-level page when overriding encoding"
|
||||
},
|
||||
"ENCODING_DETECTION_OUTCOME_TEXT": {
|
||||
"record_in_processes": ["main", "content"],
|
||||
"products": ["firefox"],
|
||||
"alert_emails": ["hsivonen@mozilla.com"],
|
||||
"bug_numbers": [1686463],
|
||||
"expires_in_version": "91",
|
||||
"releaseChannelCollection": "opt-out",
|
||||
"kind": "categorical",
|
||||
"labels": ["UtfInitial", "UtfFinal", "TldInitial", "TldFinal", "ContentInitial", "ContentFinal", "GenericInitial", "GenericFinal"],
|
||||
"description": "Type of automatic encoding detection outcome for text/plain excluding ASCII-only"
|
||||
},
|
||||
"ENCODING_DETECTION_OUTCOME_HTML": {
|
||||
"record_in_processes": ["main", "content"],
|
||||
"products": ["firefox"],
|
||||
"alert_emails": ["hsivonen@mozilla.com"],
|
||||
"bug_numbers": [1686463],
|
||||
"expires_in_version": "91",
|
||||
"releaseChannelCollection": "opt-out",
|
||||
"kind": "categorical",
|
||||
"labels": ["UtfInitial", "UtfFinal", "TldInitial", "TldFinal", "ContentInitial", "ContentFinal", "GenericInitial", "GenericFinal"],
|
||||
"description": "Type of automatic encoding detection outcome for text/html excluding ASCII-only"
|
||||
},
|
||||
"LONG_REFLOW_INTERRUPTIBLE": {
|
||||
"record_in_processes": ["main", "content"],
|
||||
"products": ["firefox", "fennec"],
|
||||
|
Loading…
x
Reference in New Issue
Block a user