Bug 1623383 - Cleanup URIFixup.jsm. r=harry

Differential Revision: https://phabricator.services.mozilla.com/D73687
This commit is contained in:
Marco Bonardo 2020-05-06 15:37:43 +00:00
parent 1943cf3dcf
commit 7a00a2e24a

View File

@ -7,11 +7,14 @@
/**
* This component handles fixing up URIs, by correcting obvious typos and adding
* missing schemes.
* URI references:
* http://www.faqs.org/rfcs/rfc1738.html
* http://www.faqs.org/rfcs/rfc2396.html
*/
// TODO (Bug 1623383): This is a direct conversion from .cpp, as such a few
// functions are overly complex and should be cleaned up.
/* eslint complexity: ["error", 63] */
// getFixupURIInfo has a complex logic, that likely could be simplified, but
// the risk of regressions is high, thus that should be done with care.
/* eslint complexity: ["error", 39] */
var EXPORTED_SYMBOLS = ["URIFixup", "URIFixupInfo"];
@ -74,36 +77,25 @@ const {
const COMMON_PROTOCOLS = ["http", "https", "ftp", "file"];
// Regex used to identify user:password tokens in url strings.
// This is not a strict character check, because we try to fixup this part of
// a the url too.
// This is not a strict valid characters check, because we try to fixup this
// part of the url too.
XPCOMUtils.defineLazyGetter(
this,
"userPasswordRegex",
() => /^([a-z+.-]+:\/{0,3})*[^\/@]+@.+/i
);
// Regex used to identify ascii alphabetical characters.
XPCOMUtils.defineLazyGetter(this, "asciiAlphaRegex", () => /^[a-z]$/i);
// Regex used to look for ascii alphabetical characters.
XPCOMUtils.defineLazyGetter(this, "asciiAlphaRegex", () => /[a-z]/i);
// Regex used to identify ascii a-f characters.
XPCOMUtils.defineLazyGetter(this, "asciiHexRegex", () => /^[a-f0-9]$/i);
// Regex used to identify ascii numbers.
XPCOMUtils.defineLazyGetter(
this,
"asciiNumberRegex",
() => /^[0-9]+(\.[0-9]+)?$/
);
// Regex used to identify numbers.
XPCOMUtils.defineLazyGetter(this, "numberRegex", () => /^[0-9]+(\.[0-9]+)?$/);
// Regex used to identify tab separated content (having at least 2 tabs).
XPCOMUtils.defineLazyGetter(this, "maxOneTabRegex", () => /^[^\t]*\t?[^\t]*$/);
// Regex to test if the protocol might actually be a url without a protocol:
//
// http://www.faqs.org/rfcs/rfc1738.html
// http://www.faqs.org/rfcs/rfc2396.html
//
// e.g. Anything of the form:
// Regex used to test if a string with a protocol might instead be a url
// without a protocol but with a port:
//
// <hostname>:<port> or
// <hostname>:<port>/
@ -138,18 +130,34 @@ XPCOMUtils.defineLazyGetter(
() => /^([a-z][a-z0-9.+\t-]*):/i
);
// Regex used to match IPs. Note that these are not made to validate IPs, but
// just to detect strings that look like an IP. They also skip protocol.
// For IPv4 this also accepts a shorthand format with just 2 dots.
XPCOMUtils.defineLazyGetter(
this,
"IPv4LikeRegex",
() => /^(?:[a-z+.-]+:\/*(?!\/))?(?:\d{1,3}\.){2,3}\d{1,3}(?::\d+|\/)?/i
);
XPCOMUtils.defineLazyGetter(
this,
"IPv6LikeRegex",
() =>
/^(?:[a-z+.-]+:\/*(?!\/))?\[(?:[0-9a-f]{0,4}:){0,7}[0-9a-f]{0,4}\]?(?::\d+|\/)?/i
);
// Cache of whitelisted domains.
XPCOMUtils.defineLazyGetter(this, "domainsWhitelist", () => {
const branch = "browser.fixup.domainwhitelist.";
let domains = new Set(
Services.prefs
.getChildList("browser.fixup.domainwhitelist.")
.getChildList(branch)
.filter(p => Services.prefs.getBoolPref(p, false))
.map(p => p.substring(30))
.map(p => p.substring(branch.length))
);
// Hold onto the observer.
// Hold onto the observer to avoid it being GC-ed.
domains._observer = {
observe(subject, topic, data) {
let domain = data.substring(30);
let domain = data.substring(branch.length);
if (Services.prefs.getBoolPref(data, false)) {
domains.add(domain);
} else {
@ -161,11 +169,7 @@ XPCOMUtils.defineLazyGetter(this, "domainsWhitelist", () => {
Ci.nsISupportsWeakReference,
]),
};
Services.prefs.addObserver(
"browser.fixup.domainwhitelist.",
domains._observer,
true
);
Services.prefs.addObserver(branch, domains._observer, true);
return domains;
});
@ -209,40 +213,12 @@ URIFixup.prototype = {
let info = new URIFixupInfo(uriString);
let scheme = extractScheme(uriString);
let isCommonProtocol = COMMON_PROTOCOLS.includes(scheme);
// View-source is a pseudo scheme. We're interested in fixing up the stuff
// after it. The easiest way to do that is to call this method again with
// the "view-source:" lopped off and then prepend it again afterwards.
if (scheme == "view-source") {
// We disable keyword lookup and alternate URIs so that small typos don't
// cause us to look at very different domains.
let newFixupFlags =
fixupFlags &
~FIXUP_FLAG_ALLOW_KEYWORD_LOOKUP &
~FIXUP_FLAGS_MAKE_ALTERNATE_URI;
let innerURIString = uriString.substring(12).trim();
// Prevent recursion.
let innerScheme = extractScheme(innerURIString);
if (innerScheme == "view-source") {
throw new Components.Exception(
"Prevent view-source recursion",
Cr.NS_ERROR_FAILURE
);
}
let innerInfo = this.getFixupURIInfo(
innerURIString,
newFixupFlags,
info.preferredURI = info.fixedURI = fixupViewSource(
uriString,
fixupFlags,
postData
);
if (!innerInfo.preferredURI) {
throw new Components.Exception("No preferredURI", Cr.NS_ERROR_FAILURE);
}
info.preferredURI = info.fixedURI = Services.io.newURI(
"view-source:" + innerInfo.preferredURI.spec
);
return info;
}
@ -259,13 +235,14 @@ URIFixup.prototype = {
// Fix up common scheme typos.
// TODO: Use levenshtein distance here?
let isCommonProtocol = COMMON_PROTOCOLS.includes(scheme);
if (
fixupSchemeTypos &&
fixupFlags & FIXUP_FLAG_FIX_SCHEME_TYPOS &&
scheme &&
!isCommonProtocol
) {
for (let [typo, fixed] of [
info.fixupChangedProtocol = [
["ttp", "http"],
["htp", "http"],
["ttps", "https"],
@ -274,19 +251,18 @@ URIFixup.prototype = {
["htps", "https"],
["ile", "file"],
["le", "file"],
]) {
].some(([typo, fixed]) => {
if (uriString.startsWith(typo)) {
scheme = fixed;
uriString = scheme + uriString.substring(typo.length);
info.fixupChangedProtocol = true;
isCommonProtocol = true;
break;
return true;
}
}
return false;
});
}
// Now we need to check whether "scheme" is something we don't
// really know about.
// Now check whether "scheme" is something we don't know about.
let isDefaultProtocolHandler =
!isCommonProtocol &&
scheme &&
@ -329,9 +305,7 @@ URIFixup.prototype = {
if (info.fixedURI) {
if (!info.preferredURI) {
if (fixupFlags & FIXUP_FLAGS_MAKE_ALTERNATE_URI) {
info.fixupCreatedAlternateURI = makeAlternateFixedURI(info);
}
maybeSetAlternateFixedURI(info, fixupFlags);
info.preferredURI = info.fixedURI;
}
return info;
@ -342,15 +316,10 @@ URIFixup.prototype = {
// Prune duff protocol schemes:
// ://totallybroken.url.com
// //shorthand.url.com
let inputHadDuffProtocol = false;
if (!scheme) {
if (uriString.startsWith("://")) {
uriString = uriString.substring(3);
inputHadDuffProtocol = true;
} else if (uriString.startsWith("//")) {
uriString = uriString.substring(2);
inputHadDuffProtocol = true;
}
let inputHadDuffProtocol =
uriString.startsWith("://") || uriString.startsWith("//");
if (inputHadDuffProtocol) {
uriString = uriString.replace(/^:?\/\//, "");
}
// Avoid fixing up content that looks like tab-separated values.
@ -361,11 +330,7 @@ URIFixup.prototype = {
if (uriWithProtocol) {
info.fixedURI = uriWithProtocol;
info.fixupChangedProtocol = true;
// Did the caller want us to try an alternative URI?
// If so, attempt to fixup http://foo into http://www.foo.com
if (fixupFlags & FIXUP_FLAGS_MAKE_ALTERNATE_URI) {
info.fixupCreatedAlternateURI = makeAlternateFixedURI(info);
}
maybeSetAlternateFixedURI(info, fixupFlags);
info.preferredURI = info.fixedURI;
}
}
@ -374,11 +339,10 @@ URIFixup.prototype = {
if (
keywordEnabled &&
fixupFlags & FIXUP_FLAG_ALLOW_KEYWORD_LOOKUP &&
!inputHadDuffProtocol
!inputHadDuffProtocol &&
keywordURIFixup(uriString, info, isPrivateContext, postData)
) {
if (keywordURIFixup(uriString, info, isPrivateContext, postData)) {
return info;
}
return info;
}
if (
@ -400,7 +364,7 @@ URIFixup.prototype = {
}
if (!info.preferredURI) {
// we couldn't salvage anything.
// We couldn't salvage anything.
throw new Components.Exception(
"Couldn't build a valid uri",
Cr.NS_ERROR_MALFORMED_URI
@ -575,6 +539,7 @@ URIFixupInfo.prototype = {
* Implementation of isDomainWhitelisted, so we don't have to go through the
* service.
* @param {string} asciiHost
* @returns {boolean} whether the domain is whitelisted
*/
function isDomainWhitelisted(asciiHost) {
if (dnsFirstForSingleWords) {
@ -595,8 +560,8 @@ function isDomainWhitelisted(asciiHost) {
* If the suffix is unknown due to a typo this will try to fix it up.
* @param {URIFixupInfo} info about the uri to check.
* @note this may modify the public suffix of info.fixedURI.
* @returns false if the public suffix is unknown, true in any other case,
* included if it's not present.
* @returns {boolean} false if the public suffix is unknown, true in any other
* case, included if it's not present.
*/
function checkAndFixPublicSuffix(info) {
let uri = info.fixedURI;
@ -625,7 +590,7 @@ function checkAndFixPublicSuffix(info) {
// Suffix is unknown, try to fix most common 3 chars TLDs typos.
// .com is the most commonly mistyped tld, so it has more cases.
let suffix = Services.eTLD.getPublicSuffix(uri);
if (!suffix || asciiNumberRegex.test(suffix)) {
if (!suffix || numberRegex.test(suffix)) {
return true;
}
for (let [typo, fixed] of [
@ -685,12 +650,14 @@ function tryKeywordFixupForURIInfo(
* This generates an alternate fixedURI, by adding a prefix and a suffix to
* the fixedURI host, if and only if the protocol is http. It should _never_
* modify URIs with other protocols.
* @param info an URIInfo object
* @returns whether an alternate uri was generated.
* @param {URIFixupInfo} info an URIInfo object
* @param {integer} fixupFlags the fixup flags
* @returns {boolean} Whether an alternate uri was generated
*/
function makeAlternateFixedURI(info) {
function maybeSetAlternateFixedURI(info, fixupFlags) {
let uri = info.fixedURI;
if (
!(fixupFlags & FIXUP_FLAGS_MAKE_ALTERNATE_URI) ||
!alternateEnabled ||
// Code only works for http. Not for any other protocol including https!
!uri.schemeIs("http") ||
@ -701,14 +668,13 @@ function makeAlternateFixedURI(info) {
) {
return false;
}
let oldHost = uri.host;
// Don't create an alternate uri for localhost, because it would be confusing.
if (oldHost == "localhost") {
return false;
}
let numDots = (oldHost.match(/\./g) || []).length;
// Get the prefix and suffix to stick onto the new hostname. By default these
// are www. & .com but they could be any other value, e.g. www. & .org
let prefix = Services.prefs.getCharPref(
@ -721,6 +687,7 @@ function makeAlternateFixedURI(info) {
);
let newHost = "";
let numDots = (oldHost.match(/\./g) || []).length;
if (numDots == 0) {
newHost = prefix + oldHost + suffix;
} else if (numDots == 1) {
@ -746,12 +713,13 @@ function makeAlternateFixedURI(info) {
}
return false;
}
info.fixupCreatedAlternateURI = true;
return true;
}
/**
* Try to fixup a file URI.
* @param uriString The file URI to fix.
* @param {string} uriString The file URI to fix.
* @returns {nsIURI} a fixed uri or null.
* @note FileURIFixup only returns a URI if it has to add the file: protocol.
*/
@ -791,7 +759,7 @@ function fileURIFixup(uriString) {
* user:pass@no-scheme.com
*
* @param {string} uriString The string to fixup.
* @returns an nsIURI built adding the default protocol to the string,
* @returns {nsIURI} an nsIURI built adding the default protocol to the string,
* or null if fixing was not possible.
*/
function fixupURIProtocol(uriString) {
@ -837,165 +805,41 @@ function keywordURIFixup(uriString, fixupInfo, isPrivateContext, postData) {
// "nonQualifiedHost?some args"
// "blah.com."
// Note: -1 is greater than any actual location
// in practice. So if we cast all locations to uint32_t, then a <
// b guarantees that either b is kNotFound and a is found, or both
// are found and a found before b.
// TODO: the cpp code used to assign -1 to an unsigned int, causing overflow.
// We should replace most of this with better js code.
const NOT_FOUND = Infinity;
let firstDotLoc = NOT_FOUND;
let lastDotLoc = NOT_FOUND;
let firstColonLoc = NOT_FOUND;
let firstQuoteLoc = NOT_FOUND;
let firstSpaceLoc = NOT_FOUND;
let firstQMarkLoc = NOT_FOUND;
let lastLSBracketLoc = NOT_FOUND;
let lastSlashLoc = NOT_FOUND;
let foundDots = 0;
let foundColons = 0;
let foundDigits = 0;
let foundRSBrackets = 0;
let looksLikeIpv6 = true;
let hasAsciiAlpha = false;
for (let i = 0, pos = 0; i < uriString.length; ++i, ++pos) {
let iter = uriString[i];
if (pos >= 1 && foundRSBrackets == 0) {
if (
!(
lastLSBracketLoc == 0 &&
(iter == ":" ||
iter == "." ||
iter == "]" ||
asciiHexRegex.test(iter))
)
) {
looksLikeIpv6 = false;
}
}
// If we're at the end of the string or this is the first slash,
// check if the thing before the slash looks like ipv4:
if (
(i == uriString.length - 1 ||
(lastSlashLoc == NOT_FOUND && iter == "/")) &&
// Need 2 or 3 dots + only digits
(foundDots == 2 || foundDots == 3) &&
// and they should be all that came before now:
(foundDots + foundDigits == pos ||
// or maybe there was also exactly 1 colon that came after the last
// dot, and the digits, dots and colon were all that came before now:
(foundColons == 1 &&
firstColonLoc > lastDotLoc &&
foundDots + foundDigits + foundColons == pos))
) {
// Hurray, we got ourselves some ipv4!
// At this point, there's no way we will do a keyword lookup, so just bail
// immediately:
return false;
}
if (iter == ".") {
foundDots++;
lastDotLoc = pos;
if (firstDotLoc == NOT_FOUND) {
firstDotLoc = pos;
}
} else if (iter == ":") {
foundColons++;
if (firstColonLoc == NOT_FOUND) {
firstColonLoc = pos;
}
} else if (iter == " " && firstSpaceLoc == NOT_FOUND) {
firstSpaceLoc = pos;
} else if (iter == "?" && firstQMarkLoc == NOT_FOUND) {
firstQMarkLoc = pos;
} else if ((iter == "'" || iter == '"') && firstQuoteLoc == NOT_FOUND) {
firstQuoteLoc = pos;
} else if (iter == "[") {
lastLSBracketLoc = pos;
} else if (iter == "]") {
foundRSBrackets++;
} else if (iter == "/") {
lastSlashLoc = pos;
} else if (asciiAlphaRegex.test(iter)) {
hasAsciiAlpha = true;
} else if (asciiNumberRegex.test(iter)) {
foundDigits++;
}
}
if (lastLSBracketLoc > 0 || foundRSBrackets != 1) {
looksLikeIpv6 = false;
}
// If there are only colons and only hexadecimal characters ([a-z][0-9])
// enclosed in [], then don't do a keyword lookup
if (looksLikeIpv6) {
// Check for IPs.
if (IPv4LikeRegex.test(uriString) || IPv6LikeRegex.test(uriString)) {
return false;
}
let asciiHost = fixupInfo.fixedURI && fixupInfo.fixedURI.asciiHost;
let displayHost = fixupInfo.fixedURI && fixupInfo.fixedURI.displayHost;
// We do keyword lookups if a space or quote preceded the dot, colon
// or question mark (or if the latter is not found, or if the input starts
// with a question mark)
if (
((firstSpaceLoc < firstDotLoc || firstQuoteLoc < firstDotLoc) &&
(firstSpaceLoc < firstColonLoc || firstQuoteLoc < firstColonLoc) &&
(firstSpaceLoc < firstQMarkLoc || firstQuoteLoc < firstQMarkLoc)) ||
firstQMarkLoc == 0
) {
// We do keyword lookups if the input starts with a question mark, or if it
// contains a space or quote, provided they don't come after a dot, colon or
// question mark.
if (uriString.startsWith("?") || /^[^.:?]*[\s"']/.test(uriString)) {
return tryKeywordFixupForURIInfo(
fixupInfo.originalInput,
fixupInfo,
isPrivateContext,
postData
);
// ... or when the asciiHost is the same as displayHost and there are no
// characters from [a-z][A-Z]
} else if (
}
// Avoid lookup if we can identify an host and it's whitelisted.
// Note that if dnsFirstForSingleWords is true isDomainWhitelisted will always
// return true, so we can avoid checking dnsFirstForSingleWords after this.
let asciiHost = fixupInfo.fixedURI?.asciiHost;
if (asciiHost && isDomainWhitelisted(asciiHost)) {
return false;
}
// Or when the asciiHost is the same as displayHost and there are no
// alphabetical characters
let displayHost = fixupInfo.fixedURI && fixupInfo.fixedURI.displayHost;
let hasAsciiAlpha = asciiAlphaRegex.test(uriString);
if (
asciiHost &&
displayHost &&
!hasAsciiAlpha &&
asciiHost.toLowerCase() == displayHost.toLowerCase()
) {
if (!dnsFirstForSingleWords) {
return tryKeywordFixupForURIInfo(
fixupInfo.originalInput,
fixupInfo,
isPrivateContext,
postData
);
}
// ... or if there is no question mark or colon, and there is either no
// dot, or exactly 1 and it is the first or last character of the input:
} else if (
(firstDotLoc == NOT_FOUND ||
(foundDots == 1 &&
(firstDotLoc == 0 || firstDotLoc == uriString.length - 1))) &&
firstColonLoc == NOT_FOUND &&
firstQMarkLoc == NOT_FOUND
) {
if (asciiHost && isDomainWhitelisted(asciiHost)) {
return false;
}
// ... unless there are no dots, and a slash, and alpha characters, and
// this is a valid host:
if (
firstDotLoc == NOT_FOUND &&
lastSlashLoc != NOT_FOUND &&
hasAsciiAlpha &&
asciiHost
) {
return false;
}
// If we get here, we don't have a valid URI, or we did but the
// host is not whitelisted, so we do a keyword search *anyway*:
return tryKeywordFixupForURIInfo(
fixupInfo.originalInput,
fixupInfo,
@ -1003,10 +847,93 @@ function keywordURIFixup(uriString, fixupInfo, isPrivateContext, postData) {
postData
);
}
// Avoid lookup if we reached this point and there is a question mark or colon.
if (uriString.includes(":") || uriString.includes("?")) {
return false;
}
// Keyword lookup if there is exactly one dot and it is the first or last
// character of the input.
let firstDotIndex = uriString.indexOf(".");
if (
firstDotIndex == uriString.length - 1 ||
(firstDotIndex == 0 && firstDotIndex == uriString.lastIndexOf("."))
) {
return tryKeywordFixupForURIInfo(
fixupInfo.originalInput,
fixupInfo,
isPrivateContext,
postData
);
}
// Keyword lookup if there is no dot and the string doesn't include a slash,
// or any alphabetical character or an host.
if (
firstDotIndex == -1 &&
(!uriString.includes("/") || !hasAsciiAlpha || !asciiHost)
) {
return tryKeywordFixupForURIInfo(
fixupInfo.originalInput,
fixupInfo,
isPrivateContext,
postData
);
}
return false;
}
/**
* Mimics the logic in Services.io.extractScheme, but avoids crossing XPConnect.
* @param {string} uriString the string to examine
* @returns {string} a scheme or empty string if one could not be identified
*/
function extractScheme(uriString) {
let matches = uriString.match(possibleProtocolRegex);
return matches ? matches[1].replace("\t", "").toLowerCase() : "";
}
/**
* View-source is a pseudo scheme. We're interested in fixing up the stuff
* after it. The easiest way to do that is to call this method again with
* the "view-source:" lopped off and then prepend it again afterwards.
* @param {string} uriString The original string to fixup
* @param {integer} fixupFlags The original fixup flags
* @param {nsIInputStream} postData Optional POST data for the search
* @returns {nsIURI} a fixed URI
* @throws if it's not possible to fixup the url
*/
function fixupViewSource(uriString, fixupFlags, postData) {
// We disable keyword lookup and alternate URIs so that small typos don't
// cause us to look at very different domains.
let newFixupFlags =
fixupFlags &
~FIXUP_FLAG_ALLOW_KEYWORD_LOOKUP &
~FIXUP_FLAGS_MAKE_ALTERNATE_URI;
let innerURIString = uriString.substring(12).trim();
// Prevent recursion.
let innerScheme = extractScheme(innerURIString);
if (innerScheme == "view-source") {
throw new Components.Exception(
"Prevent view-source recursion",
Cr.NS_ERROR_FAILURE
);
}
let info = Services.uriFixup.getFixupURIInfo(
innerURIString,
newFixupFlags,
postData
);
if (!info.preferredURI) {
throw new Components.Exception(
"Couldn't build a valid uri",
Cr.NS_ERROR_MALFORMED_URI
);
}
return Services.io.newURI("view-source:" + info.preferredURI.spec);
}