mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-11-27 14:52:16 +00:00
Bug 1887943 - Refine textContent domain extraction method. r=jteow
Differential Revision: https://phabricator.services.mozilla.com/D205928
This commit is contained in:
parent
eb6cb77183
commit
0f6c6ec3b1
@ -1077,7 +1077,11 @@ class DomainExtractor {
|
||||
break;
|
||||
}
|
||||
case "textContent": {
|
||||
this.#fromElementsRetrieveTextContent(elements, extractedDomains);
|
||||
this.#fromElementsRetrieveTextContent(
|
||||
elements,
|
||||
extractedDomains,
|
||||
providerName
|
||||
);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -1197,8 +1201,26 @@ class DomainExtractor {
|
||||
* A list of elements from the page whose text content we want to inspect.
|
||||
* @param {Set<string>} extractedDomains
|
||||
* The result set of domains extracted from the page.
|
||||
* @param {string} providerName
|
||||
* The name of the search provider.
|
||||
*/
|
||||
#fromElementsRetrieveTextContent(elements, extractedDomains) {
|
||||
#fromElementsRetrieveTextContent(elements, extractedDomains, providerName) {
|
||||
// Not an exhaustive regex, but it fits our purpose for this method.
|
||||
const LOOSE_URL_REGEX =
|
||||
/^(?:https?:\/\/)?(?:www\.)?(?:[\w\-]+\.)+(?:[\w\-]{2,})/i;
|
||||
|
||||
// Known but acceptable limitations to this function, where the return
|
||||
// value won't be correctly fixed up:
|
||||
// 1) A url is embedded within other text. Ex: "xkcd.com is cool."
|
||||
// 2) The url contains legal but unusual characters. Ex: $ ! * '
|
||||
function fixup(textContent) {
|
||||
return textContent
|
||||
.toLowerCase()
|
||||
.replaceAll(" ", "")
|
||||
.replace(/\.$/, "")
|
||||
.concat(".com");
|
||||
}
|
||||
|
||||
for (let element of elements) {
|
||||
if (this.#exceedsThreshold(extractedDomains.size)) {
|
||||
return;
|
||||
@ -1209,18 +1231,24 @@ class DomainExtractor {
|
||||
}
|
||||
|
||||
let domain;
|
||||
try {
|
||||
domain = new URL(textContent).hostname;
|
||||
} catch (e) {
|
||||
domain = textContent.toLowerCase().replaceAll(" ", "");
|
||||
// If the attempt to turn the text content into a URL object only fails
|
||||
// because we're missing a protocol, ".com" may already be present.
|
||||
if (!domain.endsWith(".com")) {
|
||||
domain = domain.concat(".com");
|
||||
if (LOOSE_URL_REGEX.test(textContent)) {
|
||||
// Creating a new URL object will throw if the protocol is missing.
|
||||
if (!/^https?:\/\//.test(textContent)) {
|
||||
textContent = "https://" + textContent;
|
||||
}
|
||||
|
||||
try {
|
||||
domain = new URL(textContent).hostname;
|
||||
} catch (e) {
|
||||
domain = fixup(textContent);
|
||||
}
|
||||
} else {
|
||||
domain = fixup(textContent);
|
||||
}
|
||||
if (!extractedDomains.has(domain)) {
|
||||
extractedDomains.add(domain);
|
||||
|
||||
let processedDomain = this.#processDomain(domain, providerName);
|
||||
if (processedDomain && !extractedDomains.has(processedDomain)) {
|
||||
extractedDomains.add(processedDomain);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -362,6 +362,37 @@ const TESTS = [
|
||||
],
|
||||
expectedDomains: ["organic.com"],
|
||||
},
|
||||
{
|
||||
title: "Bing organic result with a path in the URL.",
|
||||
extractorInfos: [
|
||||
{
|
||||
selectors: "#test26 #b_results .b_algo .b_attribution cite",
|
||||
method: "textContent",
|
||||
},
|
||||
],
|
||||
expectedDomains: ["organic.com"],
|
||||
},
|
||||
{
|
||||
title: "Bing organic result with a path and query param in the URL.",
|
||||
extractorInfos: [
|
||||
{
|
||||
selectors: "#test27 #b_results .b_algo .b_attribution cite",
|
||||
method: "textContent",
|
||||
},
|
||||
],
|
||||
expectedDomains: ["organic.com"],
|
||||
},
|
||||
{
|
||||
title:
|
||||
"Bing organic result with a path in the URL, but protocol appears in separate HTML element.",
|
||||
extractorInfos: [
|
||||
{
|
||||
selectors: "#test28 #b_results .b_algo .b_attribution cite",
|
||||
method: "textContent",
|
||||
},
|
||||
],
|
||||
expectedDomains: ["wikipedia.org"],
|
||||
},
|
||||
];
|
||||
|
||||
add_setup(async function () {
|
||||
|
@ -256,6 +256,37 @@
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="test26">
|
||||
<div id="b_results">
|
||||
<div class="b_algo">
|
||||
<div class="b_attribution">
|
||||
<cite>https://organic.com/cats</cite>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="test27">
|
||||
<div id="b_results">
|
||||
<div class="b_algo">
|
||||
<div class="b_attribution">
|
||||
<cite>https://organic.com/testing?q=cats</cite>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="test28">
|
||||
<div id="b_results">
|
||||
<div class="b_algo">
|
||||
<div class="b_attribution">
|
||||
<span>HTTPS</span>
|
||||
<cite>en.wikipedia.org/wiki/Cat</cite>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
Loading…
Reference in New Issue
Block a user