Bug 1887943 - Refine textContent domain extraction method. r=jteow

Differential Revision: https://phabricator.services.mozilla.com/D205928
2024-11-27 14:52:16 +00:00 · 2024-04-02 16:08:19 +00:00 · 2024-04-02 16:08:19 +00:00 · 0f6c6ec3b1
commit 0f6c6ec3b1
parent eb6cb77183
3 changed files with 102 additions and 12 deletions
--- a/browser/actors/SearchSERPTelemetryChild.sys.mjs
+++ b/browser/actors/SearchSERPTelemetryChild.sys.mjs
@ -1077,7 +1077,11 @@ class DomainExtractor {
          break;
        }
        case "textContent": {
-          this.#fromElementsRetrieveTextContent(elements, extractedDomains);
+          this.#fromElementsRetrieveTextContent(
+            elements,
+            extractedDomains,
+            providerName
+          );
          break;
        }
      }
@ -1197,8 +1201,26 @@ class DomainExtractor {
   *  A list of elements from the page whose text content we want to inspect.
   * @param {Set<string>} extractedDomains
   *  The result set of domains extracted from the page.
+   * @param {string} providerName
+   *  The name of the search provider.
   */
-  #fromElementsRetrieveTextContent(elements, extractedDomains) {
+  #fromElementsRetrieveTextContent(elements, extractedDomains, providerName) {
+    // Not an exhaustive regex, but it fits our purpose for this method.
+    const LOOSE_URL_REGEX =
+      /^(?:https?:\/\/)?(?:www\.)?(?:[\w\-]+\.)+(?:[\w\-]{2,})/i;
+
+    // Known but acceptable limitations to this function, where the return
+    // value won't be correctly fixed up:
+    //   1) A URL is embedded within other text. Ex: "xkcd.com is cool."
+    //   2) The URL contains legal but unusual characters. Ex: $ ! * '
+    function fixup(textContent) {
+      return textContent
+        .toLowerCase()
+        .replaceAll(" ", "")
+        .replace(/\.$/, "")
+        .concat(".com");
+    }
+
    for (let element of elements) {
      if (this.#exceedsThreshold(extractedDomains.size)) {
        return;
@ -1209,18 +1231,24 @@ class DomainExtractor {
      }

      let domain;
-      try {
-        domain = new URL(textContent).hostname;
-      } catch (e) {
-        domain = textContent.toLowerCase().replaceAll(" ", "");
-        // If the attempt to turn the text content into a URL object only fails
-        // because we're missing a protocol, ".com" may already be present.
-        if (!domain.endsWith(".com")) {
-          domain = domain.concat(".com");
+      if (LOOSE_URL_REGEX.test(textContent)) {
+        // Creating a new URL object will throw if the protocol is missing.
+        if (!/^https?:\/\//.test(textContent)) {
+          textContent = "https://" + textContent;
        }
+
+        try {
+          domain = new URL(textContent).hostname;
+        } catch (e) {
+          domain = fixup(textContent);
+        }
+      } else {
+        domain = fixup(textContent);
      }
-      if (!extractedDomains.has(domain)) {
-        extractedDomains.add(domain);
+
+      let processedDomain = this.#processDomain(domain, providerName);
+      if (processedDomain && !extractedDomains.has(processedDomain)) {
+        extractedDomains.add(processedDomain);
      }
    }
  }
--- a/browser/components/search/test/browser/telemetry/browser_search_telemetry_domain_categorization_extraction.js
+++ b/browser/components/search/test/browser/telemetry/browser_search_telemetry_domain_categorization_extraction.js
@ -362,6 +362,37 @@ const TESTS = [
    ],
    expectedDomains: ["organic.com"],
  },
+  {
+    title: "Bing organic result with a path in the URL.",
+    extractorInfos: [
+      {
+        selectors: "#test26 #b_results .b_algo .b_attribution cite",
+        method: "textContent",
+      },
+    ],
+    expectedDomains: ["organic.com"],
+  },
+  {
+    title: "Bing organic result with a path and query param in the URL.",
+    extractorInfos: [
+      {
+        selectors: "#test27 #b_results .b_algo .b_attribution cite",
+        method: "textContent",
+      },
+    ],
+    expectedDomains: ["organic.com"],
+  },
+  {
+    title:
+      "Bing organic result with a path in the URL, but protocol appears in separate HTML element.",
+    extractorInfos: [
+      {
+        selectors: "#test28 #b_results .b_algo .b_attribution cite",
+        method: "textContent",
+      },
+    ],
+    expectedDomains: ["wikipedia.org"],
+  },
 ];

 add_setup(async function () {
--- a/browser/components/search/test/browser/telemetry/searchTelemetryDomainExtraction.html
+++ b/browser/components/search/test/browser/telemetry/searchTelemetryDomainExtraction.html
@ -256,6 +256,37 @@
        </div>
      </div>
    </div>
+
+    <div id="test26">
+      <div id="b_results">
+        <div class="b_algo">
+          <div class="b_attribution">
+            <cite>https://organic.com/cats</cite>
+          </div>
+        </div>
+      </div>
+    </div>
+
+    <div id="test27">
+      <div id="b_results">
+        <div class="b_algo">
+          <div class="b_attribution">
+            <cite>https://organic.com/testing?q=cats</cite>
+          </div>
+        </div>
+      </div>
+    </div>
+
+    <div id="test28">
+      <div id="b_results">
+        <div class="b_algo">
+          <div class="b_attribution">
+            <span>HTTPS</span>
+            <cite>en.wikipedia.org/wiki/Cat</cite>
+          </div>
+        </div>
+      </div>
+    </div>
  </div>
 </body>
 </html>