From 7eef0de378350845cd4998a7e9d95b7ecffa7b2f Mon Sep 17 00:00:00 2001 From: Henri Sivonen Date: Thu, 6 Feb 2014 11:08:01 +0200 Subject: [PATCH] Bug 910211 - Guess the fallback encoding from the top-level domain when feasible. r=emk. --- build/pgo/server-locations.txt | 7 + content/html/document/src/nsHTMLDocument.cpp | 61 +++++++ content/html/document/src/nsHTMLDocument.h | 1 + docshell/base/nsDocShell.cpp | 4 + dom/encoding/FallbackEncoding.cpp | 32 ++++ dom/encoding/FallbackEncoding.h | 22 +++ dom/encoding/Makefile.in | 4 + dom/encoding/domainsfallbacks.properties | 167 ++++++++++++++++++ dom/encoding/moz.build | 2 + .../nonparticipatingdomains.properties | 51 ++++++ dom/encoding/test/file_TLD.html | 7 + dom/encoding/test/mochitest.ini | 2 + dom/encoding/test/test_TLD.html | 57 ++++++ modules/libpref/src/init/all.js | 1 + parser/nsCharsetSource.h | 27 +-- 15 files changed, 432 insertions(+), 13 deletions(-) create mode 100644 dom/encoding/domainsfallbacks.properties create mode 100644 dom/encoding/nonparticipatingdomains.properties create mode 100644 dom/encoding/test/file_TLD.html create mode 100644 dom/encoding/test/test_TLD.html diff --git a/build/pgo/server-locations.txt b/build/pgo/server-locations.txt index dbc0cf4af3f2..de0a6a935913 100644 --- a/build/pgo/server-locations.txt +++ b/build/pgo/server-locations.txt @@ -205,3 +205,10 @@ https://www2.w3c-test.org:443 https://xn--n8j6ds53lwwkrqhv28a.w3c-test.org:443 https://xn--lve-6lad.w3c-test.org:443 http://test.w3.org:80 + +# Hosts for testing TLD-based fallback encoding +http://example.tw:80 privileged +http://example.cn:80 privileged +http://example.co.jp:80 privileged +http://example.fi:80 privileged + diff --git a/content/html/document/src/nsHTMLDocument.cpp b/content/html/document/src/nsHTMLDocument.cpp index ff91acca21b4..d7214858f2a6 100644 --- a/content/html/document/src/nsHTMLDocument.cpp +++ b/content/html/document/src/nsHTMLDocument.cpp @@ -435,6 +435,66 @@ 
nsHTMLDocument::TryParentCharset(nsIDocShell* aDocShell, } } +void +nsHTMLDocument::TryTLD(int32_t& aCharsetSource, nsACString& aCharset) +{ + if (aCharsetSource >= kCharsetFromTopLevelDomain) { + return; + } + if (!FallbackEncoding::sGuessFallbackFromTopLevelDomain) { + return; + } + if (!mDocumentURI) { + return; + } + nsAutoCString host; + mDocumentURI->GetAsciiHost(host); + if (host.IsEmpty()) { + return; + } + // First let's see if the host is DNS-absolute and ends with a dot and + // get rid of that one. + if (host.Last() == '.') { + host.SetLength(host.Length() - 1); + if (host.IsEmpty()) { + return; + } + } + // If we still have a dot, the host is weird, so let's continue only + // if we have something other than a dot now. + if (host.Last() == '.') { + return; + } + int32_t index = host.RFindChar('.'); + if (index == kNotFound) { + // We have an intranet host, Gecko-internal URL or an IPv6 address. + return; + } + // Since the string didn't end with a dot and we found a dot, + // there is at least one character between the dot and the end of + // the string, so taking the substring below is safe. 
+ nsAutoCString tld; + ToLowerCase(Substring(host, index + 1, host.Length() - (index + 1)), tld); + // Reject generic TLDs and country TLDs that need more research + if (!FallbackEncoding::IsParticipatingTopLevelDomain(tld)) { + return; + } + // Check if we have an IPv4 address + bool seenNonDigit = false; + for (size_t i = 0; i < tld.Length(); ++i) { + char c = tld.CharAt(i); + if (c < '0' || c > '9') { + seenNonDigit = true; + break; + } + } + if (!seenNonDigit) { + return; + } + aCharsetSource = kCharsetFromTopLevelDomain; + FallbackEncoding::FromTopLevelDomain(tld, aCharset); +} + void nsHTMLDocument::TryFallback(int32_t& aCharsetSource, nsACString& aCharset) { @@ -661,6 +721,7 @@ nsHTMLDocument::StartDocumentLoad(const char* aCommand, TryCacheCharset(cachingChan, charsetSource, charset); } + TryTLD(charsetSource, charset); TryFallback(charsetSource, charset); if (wyciwygChannel) { diff --git a/content/html/document/src/nsHTMLDocument.h b/content/html/document/src/nsHTMLDocument.h index a70980f8e2c4..2cc28021b6ef 100644 --- a/content/html/document/src/nsHTMLDocument.h +++ b/content/html/document/src/nsHTMLDocument.h @@ -313,6 +313,7 @@ protected: nsACString& aCharset); void TryParentCharset(nsIDocShell* aDocShell, int32_t& charsetSource, nsACString& aCharset); + void TryTLD(int32_t& aCharsetSource, nsACString& aCharset); static void TryFallback(int32_t& aCharsetSource, nsACString& aCharset); // Override so we can munge the charset on our wyciwyg channel as needed. 
diff --git a/docshell/base/nsDocShell.cpp b/docshell/base/nsDocShell.cpp index 51b6141e5496..d04d9757d8c9 100644 --- a/docshell/base/nsDocShell.cpp +++ b/docshell/base/nsDocShell.cpp @@ -1994,6 +1994,10 @@ nsDocShell::GatherCharsetMenuTelemetry() int32_t charsetSource = doc->GetDocumentCharacterSetSource(); switch (charsetSource) { + case kCharsetFromTopLevelDomain: + // Unlabeled doc on a domain that we map to a fallback encoding + Telemetry::Accumulate(Telemetry::CHARSET_OVERRIDE_SITUATION, 7); + break; case kCharsetFromFallback: case kCharsetFromDocTypeDefault: case kCharsetFromCache: diff --git a/dom/encoding/FallbackEncoding.cpp b/dom/encoding/FallbackEncoding.cpp index db63994cede8..26ad04d53a20 100644 --- a/dom/encoding/FallbackEncoding.cpp +++ b/dom/encoding/FallbackEncoding.cpp @@ -17,7 +17,16 @@ static const char* localesFallbacks[][3] = { #include "localesfallbacks.properties.h" }; +static const char* domainsFallbacks[][3] = { +#include "domainsfallbacks.properties.h" +}; + +static const char* nonParticipatingDomains[][3] = { +#include "nonparticipatingdomains.properties.h" +}; + FallbackEncoding* FallbackEncoding::sInstance = nullptr; +bool FallbackEncoding::sGuessFallbackFromTopLevelDomain = true; FallbackEncoding::FallbackEncoding() { @@ -121,6 +130,8 @@ FallbackEncoding::Initialize() Preferences::RegisterCallback(FallbackEncoding::PrefChanged, "general.useragent.locale", nullptr); + Preferences::AddBoolVarCache(&sGuessFallbackFromTopLevelDomain, + "intl.charset.fallback.tld"); } void @@ -132,5 +143,26 @@ FallbackEncoding::Shutdown() FallbackEncoding::sInstance = nullptr; } +bool +FallbackEncoding::IsParticipatingTopLevelDomain(const nsACString& aTLD) +{ + nsAutoCString dummy; + return NS_FAILED(nsUConvPropertySearch::SearchPropertyValue( + nonParticipatingDomains, + ArrayLength(nonParticipatingDomains), + aTLD, + dummy)); +} + +void +FallbackEncoding::FromTopLevelDomain(const nsACString& aTLD, + nsACString& aFallback) +{ + if 
(NS_FAILED(nsUConvPropertySearch::SearchPropertyValue( + domainsFallbacks, ArrayLength(domainsFallbacks), aTLD, aFallback))) { + aFallback.AssignLiteral("windows-1252"); + } +} + } // namespace dom } // namespace mozilla diff --git a/dom/encoding/FallbackEncoding.h b/dom/encoding/FallbackEncoding.h index 9f6adf49b78b..431dd4ee8d80 100644 --- a/dom/encoding/FallbackEncoding.h +++ b/dom/encoding/FallbackEncoding.h @@ -14,6 +14,11 @@ class FallbackEncoding { public: + /** + * Whether FromTopLevelDomain() should be used. + */ + static bool sGuessFallbackFromTopLevelDomain; + /** * Gets the locale-dependent fallback encoding for legacy HTML and plain * text content. @@ -22,6 +27,23 @@ public: */ static void FromLocale(nsACString& aFallback); + /** + * Checks if it is appropriate to call FromTopLevelDomain() for a given TLD. + * + * @param aTLD the top-level domain (in Punycode) + * @return true if OK to call FromTopLevelDomain() + */ + static bool IsParticipatingTopLevelDomain(const nsACString& aTLD); + + /** + * Gets a top-level domain-dependent fallback encoding for legacy HTML + * and plain text content + * + * @param aTLD the top-level domain (in Punycode) + * @param aFallback the outparam for the fallback encoding + */ + static void FromTopLevelDomain(const nsACString& aTLD, nsACString& aFallback); + // public API ends here! 
/** diff --git a/dom/encoding/Makefile.in b/dom/encoding/Makefile.in index 21c71af311e8..5ca56bb8c23d 100644 --- a/dom/encoding/Makefile.in +++ b/dom/encoding/Makefile.in @@ -9,3 +9,7 @@ labelsencodings.properties.h: $(PROPS2ARRAYS) labelsencodings.properties $(PYTHON) $^ $@ localesfallbacks.properties.h: $(PROPS2ARRAYS) localesfallbacks.properties $(PYTHON) $^ $@ +domainsfallbacks.properties.h: $(PROPS2ARRAYS) domainsfallbacks.properties + $(PYTHON) $^ $@ +nonparticipatingdomains.properties.h: $(PROPS2ARRAYS) nonparticipatingdomains.properties + $(PYTHON) $^ $@ diff --git a/dom/encoding/domainsfallbacks.properties b/dom/encoding/domainsfallbacks.properties new file mode 100644 index 000000000000..4189b8f89242 --- /dev/null +++ b/dom/encoding/domainsfallbacks.properties @@ -0,0 +1,167 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +# This file contains educated guesses about which top-level domains are +# likely to host legacy content that assumes a non-windows-1252 encoding. +# Punycode TLDs are included on the theory that legacy content might appear +# behind those relatively new TLDs if DNS just points to a legacy server. +# +# Domains for which a confident-enough educated guess is missing are +# listed in nonparticipatingdomains.properties. Domains that are listed +# neither there nor here get windows-1252 as the associated fallback. 
+# +# The list below includes Arabic-script TLDs not on the IANA list but on the +# ICANN list: +# http://www.icann.org/en/resources/idn/fast-track/string-evaluation-completion +# Otherwise, the list includes non-windows-1252-affiliated country TLDs from +# https://data.iana.org/TLD/tlds-alpha-by-domain.txt +# +# The guesses are assigned as follows: +# * If the country has a dominant country-affiliated language and that language +# is part of the languages to fallbacks mapping, use the encoding for that +# language from that mapping. +# * Use windows-1256 for countries that have a dominant Arabic-script +# language or all of whose languages are Arabic-script languages. +# * Use windows-1251 likewise but for Cyrillic script. + +ae=windows-1256 +xn--mgbaam7a8h=windows-1256 + +af=windows-1256 + +bg=windows-1251 + +bh=windows-1256 + +by=windows-1251 + +cn=gbk +xn--fiqs8s=gbk +# Assume that Traditional Chinese TLD is meant to work if URL input happens to +# be in the traditional mode. Expect content to be simplified anyway. 
+xn--fiqz9s=gbk + +cz=windows-1250 + +dz=windows-1256 +xn--lgbbat1ad8j=windows-1256 + +ee=windows-1257 + +eg=windows-1256 +xn--wgbh1c=windows-1256 + +gr=ISO-8859-7 + +hk=Big5-HKSCS +xn--j6w193g=Big5-HKSCS + +hr=windows-1250 + +hu=ISO-8859-2 + +iq=windows-1256 + +ir=windows-1256 +xn--mgba3a4f16a=windows-1256 + +jo=windows-1256 +xn--mgbayh7gpa=windows-1256 + +jp=Shift_JIS + +kg=windows-1251 + +kp=EUC-KR + +kr=EUC-KR +xn--3e0b707e=EUC-KR + +kw=windows-1256 + +kz=windows-1251 +xn--80ao21a=windows-1251 + +lb=windows-1256 + +lt=windows-1257 + +lv=windows-1257 + +ma=windows-1256 +xn--mgbc0a9azcg=windows-1256 + +mk=windows-1251 + +mn=windows-1251 +xn--l1acc=windows-1251 + +mo=Big5 + +# my +xn--mgbx4cd0ab=windows-1256 + +om=windows-1256 +xn--mgb9awbf=windows-1256 + +#pk +xn--mgbai9azgqp6j=windows-1256 + +pl=ISO-8859-2 + +ps=windows-1256 +xn--ygbi2ammx=windows-1256 + +qa=windows-1256 +xn--wgbl6a=windows-1256 + +rs=windows-1251 +xn--90a3ac=windows-1251 + +ru=windows-1251 +xn--p1ai=windows-1251 + +sa=windows-1256 +xn--mgberp4a5d4ar=windows-1256 + +sd=windows-1256 +xn--mgbpl2fh=windows-1256 + +sg=gbk +xn--yfro4i67o=gbk + +si=ISO-8859-2 + +sk=windows-1250 + +su=windows-1251 + +sy=windows-1256 +xn--mgbtf8fl=windows-1256 + +th=windows-874 +xn--o3cw4h=windows-874 + +tj=windows-1251 + +tn=windows-1256 +xn--pgbs0dh=windows-1256 + +tr=windows-1254 + +tw=Big5 +# Assume that the Simplified Chinese TLD is meant to work when URL input +# happens in the simplified mode. Assume content is traditional anyway. 
+xn--kprw13d=Big5 +xn--kpry57d=Big5 + +ua=windows-1251 +xn--j1amh=windows-1251 + +uz=windows-1251 + +vn=windows-1258 + +ye=windows-1256 +xn--mgb2ddes=windows-1256 diff --git a/dom/encoding/moz.build b/dom/encoding/moz.build index 6786661d1d1f..e18ee9f6c02b 100644 --- a/dom/encoding/moz.build +++ b/dom/encoding/moz.build @@ -28,6 +28,8 @@ LOCAL_INCLUDES += [ ] GENERATED_FILES += [ + 'domainsfallbacks.properties.h', 'labelsencodings.properties.h', 'localesfallbacks.properties.h', + 'nonparticipatingdomains.properties.h', ] diff --git a/dom/encoding/nonparticipatingdomains.properties b/dom/encoding/nonparticipatingdomains.properties new file mode 100644 index 000000000000..b2e1396bf9b9 --- /dev/null +++ b/dom/encoding/nonparticipatingdomains.properties @@ -0,0 +1,51 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +# Top-level domains listed here do not participate in TLD-based guessing. +# +# We should do Web crawls to see if domains listed here can migrate to +# domainsfallbacks.properties. +# +# The value to the right of the = sign is ignored and serves as a placeholder. + +# Generic +com=windows-1252 +net=windows-1252 +org=windows-1252 + +# No Firefox localization for Azeri +az=windows-1254 + +# windows-1251 or windows-1250? +ba=??? + +# ISO-8859-7 or windows-1254? +cy=??? + +# Is there enough unlabeled windows-1256 content for a windows-1255 to break +# too much? +il=windows-1255 + +# Out-of-country English use +ly=windows-1256 + +# Out-of-country English use +# md=windows-1250 + +# Out-of-country English use +# me=windows-1251 + +# Malaysia has an Arabic-script TLD, official script is latin, possibly Chinese-script publications +my=??? + +# No Firefox localization for Urdu; potential for minority-language sites +# relying on windows-1252 hacks. 
+pk=windows-1256 + +# The Romanian localization says windows-1252, even though the Windows legacy +# differs. +ro=windows-1250 + +tm=windows-1250 + diff --git a/dom/encoding/test/file_TLD.html b/dom/encoding/test/file_TLD.html new file mode 100644 index 000000000000..468c7fdf0e00 --- /dev/null +++ b/dom/encoding/test/file_TLD.html @@ -0,0 +1,7 @@ + + + diff --git a/dom/encoding/test/mochitest.ini b/dom/encoding/test/mochitest.ini index 90aca28daeaa..63a71b147100 100644 --- a/dom/encoding/test/mochitest.ini +++ b/dom/encoding/test/mochitest.ini @@ -7,6 +7,7 @@ support-files = file_utf16_le_bom.js file_utf16_le_bom.xhtml file_utf16_le_nobom.xhtml + file_TLD.html worker_helper.js [test_BOMEncoding.js] @@ -16,4 +17,5 @@ support-files = [test_TextEncoder.js] [test_stringencoding.html] [test_submit_euckr.html] +[test_TLD.html] [test_utf16_files.html] diff --git a/dom/encoding/test/test_TLD.html b/dom/encoding/test/test_TLD.html new file mode 100644 index 000000000000..d602b74e28c1 --- /dev/null +++ b/dom/encoding/test/test_TLD.html @@ -0,0 +1,57 @@ + + + + + + Test for Bug 910211 + + + + + +Mozilla Bug 910211 +

+ +
+
+ + diff --git a/modules/libpref/src/init/all.js b/modules/libpref/src/init/all.js index 6e2225c8c1bb..2caff44047cb 100644 --- a/modules/libpref/src/init/all.js +++ b/modules/libpref/src/init/all.js @@ -1392,6 +1392,7 @@ pref("intl.charsetmenu.composer.cache", ""); pref("intl.charsetmenu.browser.cache.size", 5); pref("intl.charset.detector", "chrome://global/locale/intl.properties"); pref("intl.charset.fallback.override", ""); +pref("intl.charset.fallback.tld", true); pref("intl.ellipsis", "chrome://global-platform/locale/intl.properties"); pref("intl.locale.matchOS", false); // fallback charset list for Unicode conversion (converting from Unicode) diff --git a/parser/nsCharsetSource.h b/parser/nsCharsetSource.h index cd555e25e3ba..bd85bba108c9 100644 --- a/parser/nsCharsetSource.h +++ b/parser/nsCharsetSource.h @@ -8,18 +8,19 @@ // note: the value order defines the priority; higher numbers take priority #define kCharsetUninitialized 0 #define kCharsetFromFallback 1 -#define kCharsetFromDocTypeDefault 2 // This and up confident for XHR -#define kCharsetFromCache 3 -#define kCharsetFromParentFrame 4 -#define kCharsetFromAutoDetection 5 -#define kCharsetFromHintPrevDoc 6 -#define kCharsetFromMetaPrescan 7 // this one and smaller: HTML5 Tentative -#define kCharsetFromMetaTag 8 // this one and greater: HTML5 Confident -#define kCharsetFromIrreversibleAutoDetection 9 -#define kCharsetFromChannel 10 -#define kCharsetFromOtherComponent 11 -#define kCharsetFromParentForced 12 // propagates to child frames -#define kCharsetFromUserForced 13 // propagates to child frames -#define kCharsetFromByteOrderMark 14 +#define kCharsetFromTopLevelDomain 2 +#define kCharsetFromDocTypeDefault 3 // This and up confident for XHR +#define kCharsetFromCache 4 +#define kCharsetFromParentFrame 5 +#define kCharsetFromAutoDetection 6 +#define kCharsetFromHintPrevDoc 7 +#define kCharsetFromMetaPrescan 8 // this one and smaller: HTML5 Tentative +#define kCharsetFromMetaTag 9 // this one and 
greater: HTML5 Confident +#define kCharsetFromIrreversibleAutoDetection 10 +#define kCharsetFromChannel 11 +#define kCharsetFromOtherComponent 12 +#define kCharsetFromParentForced 13 // propagates to child frames +#define kCharsetFromUserForced 14 // propagates to child frames +#define kCharsetFromByteOrderMark 15 #endif /* nsCharsetSource_h_ */