Bug 1846357 - Extract domains for basic SERP links. r=jteow

Differential Revision: https://phabricator.services.mozilla.com/D185609
2023-09-05 16:49:53 +00:00 · 2023-09-05 16:49:53 +00:00 · 6bd56eddeb
--- a/browser/actors/SearchSERPTelemetryChild.sys.mjs
+++ b/browser/actors/SearchSERPTelemetryChild.sys.mjs
@ -18,6 +18,13 @@ XPCOMUtils.defineLazyPreferenceGetter(
  false
 );

+XPCOMUtils.defineLazyPreferenceGetter(
+  lazy,
+  "serpEventTelemetryCategorization",
+  "browser.search.serpEventTelemetryCategorization.enabled",
+  false
+);
+
 const SHARED_DATA_KEY = "SearchTelemetry:ProviderInfo";
 export const ADLINK_CHECK_TIMEOUT_MS = 1000;

@ -848,6 +855,154 @@ class SearchAdImpression {
  }
 }

+/**
+ * An object indicating which elements to examine for domains to extract and
+ * which heuristic technique to use to extract that element's domain.
+ *
+ * @typedef {object} ExtractorInfo
+ * @property {string} selectors
+ *  A string representing the CSS selector that targets the elements on the
+ *  page that contain domains we want to extract.
+ * @property {string} method
+ *  A string representing which domain extraction heuristic to use.
+ *  One of: "href" or "data-attribute".
+ * @property {object | null} options
+ *  Options related to the domain extraction heuristic used.
+ * @property {string | null} options.dataAttributeKey
+ *  The key name of the data attribute to lookup.
+ * @property {string | null} options.queryParamKey
+ *  The key name of the query param value to lookup.
+ */
+
+/**
+ * DomainExtractor examines elements on a page to retrieve the domains.
+ */
+class DomainExtractor {
+  /**
+   * Extract domains from the page using an array of information pertaining to
+   * the SERP.
+   *
+   * @param {Document} document
+   *  The document for the SERP we are extracting domains from.
+   * @param {Array<ExtractorInfo>} extractorInfos
+   *  Information used to target the domains we need to extract.
+   * @return {Set<string>}
+   *  A set of the domains extracted from the page.
+   */
+  extractDomainsFromDocument(document, extractorInfos) {
+    let extractedDomains = new Set();
+    if (!extractorInfos?.length) {
+      return extractedDomains;
+    }
+
+    for (let extractorInfo of extractorInfos) {
+      if (!extractorInfo.selectors) {
+        continue;
+      }
+
+      let elements = document.querySelectorAll(extractorInfo.selectors);
+      if (!elements) {
+        continue;
+      }
+
+      switch (extractorInfo.method) {
+        case "href": {
+          // Origin is used in case a URL needs to be made absolute.
+          let origin = new URL(document.documentURI).origin;
+          this.#fromElementsConvertHrefsIntoDomains(
+            elements,
+            origin,
+            extractedDomains,
+            extractorInfo.options?.queryParamKey
+          );
+          break;
+        }
+        case "data-attribute": {
+          this.#fromElementsRetrieveDataAttributeValues(
+            elements,
+            extractorInfo.options?.dataAttributeKey,
+            extractedDomains
+          );
+          break;
+        }
+      }
+    }
+
+    return extractedDomains;
+  }
+
+  /**
+   * Given a list of elements, extract domains using href attributes. If the
+   * URL in the href includes the specified query param, the domain will be
+   * that query param's value. Otherwise it will be the hostname of the href
+   * attribute's URL.
+   *
+   * @param {NodeList<Element>} elements
+   *  A list of elements from the page whose href attributes we want to
+   *  inspect.
+   * @param {string} origin
+   *  Origin of the current page.
+   * @param {Set<string>} extractedDomains
+   *  The result set of domains extracted from the page.
+   * @param {string | null} queryParam
+   *  An optional query param to search for in an element's href attribute.
+   */
+  #fromElementsConvertHrefsIntoDomains(
+    elements,
+    origin,
+    extractedDomains,
+    queryParam
+  ) {
+    for (let element of elements) {
+      let href = element.getAttribute("href");
+
+      let url;
+      try {
+        url = new URL(href, origin);
+      } catch (ex) {
+        continue;
+      }
+
+      // Ignore non-standard protocols.
+      if (url.protocol != "https:" && url.protocol != "http:") {
+        continue;
+      }
+
+      let domain = queryParam ? url.searchParams.get(queryParam) : url.hostname;
+      if (domain && !extractedDomains.has(domain)) {
+        extractedDomains.add(domain);
+      }
+    }
+  }
+
+  /**
+   * Given a list of elements, examine each for the specified data attribute.
+   * If found, add that data attribute's value to the result set of extracted
+   * domains as is.
+   *
+   * @param {NodeList<Element>} elements
+   *  A list of elements from the page whose data attributes we want to
+   *  inspect.
+   * @param {string} attribute
+   *  The name of a data attribute to search for within an element.
+   * @param {Set<string>} extractedDomains
+   *  The result set of domains extracted from the page.
+   */
+  #fromElementsRetrieveDataAttributeValues(
+    elements,
+    attribute,
+    extractedDomains
+  ) {
+    for (let element of elements) {
+      let value = element.dataset[attribute];
+      if (value && !extractedDomains.has(value)) {
+        extractedDomains.add(value);
+      }
+    }
+  }
+}
+
+export const domainExtractor = new DomainExtractor();
 const searchProviders = new SearchProviders();
 const searchAdImpression = new SearchAdImpression();

@ -967,6 +1122,34 @@ export class SearchSERPTelemetryChild extends JSWindowActorChild {
        });
      }
    }
+
+    if (
+      lazy.serpEventTelemetryCategorization &&
+      providerInfo.domainExtraction &&
+      (eventType == "load" || eventType == "pageshow")
+    ) {
+      let start = Cu.now();
+      let nonAdDomains = domainExtractor.extractDomainsFromDocument(
+        doc,
+        providerInfo.domainExtraction.nonAds
+      );
+      let adDomains = domainExtractor.extractDomainsFromDocument(
+        doc,
+        providerInfo.domainExtraction.ads
+      );
+
+      this.sendAsyncMessage("SearchTelemetry:Domains", {
+        url,
+        nonAdDomains,
+        adDomains,
+      });
+
+      ChromeUtils.addProfilerMarker(
+        "SearchSERPTelemetryChild._checkForAdLink",
+        start,
+        "Extract domains from elements"
+      );
+    }
  }

  /**
--- a/browser/actors/SearchSERPTelemetryParent.sys.mjs
+++ b/browser/actors/SearchSERPTelemetryParent.sys.mjs
@ -29,6 +29,10 @@ export class SearchSERPTelemetryParent extends JSWindowActorParent {
        lazy.SearchSERPTelemetry.reportPageImpression(msg.data, browser);
        break;
      }
+      case "SearchTelemetry:Domains": {
+        lazy.SearchSERPTelemetry.reportPageDomains(msg.data, browser);
+        break;
+      }
    }
  }
 }
--- a/browser/app/profile/firefox.js
+++ b/browser/app/profile/firefox.js
@ -724,6 +724,9 @@ pref("browser.search.serpEventTelemetry.enabled", true);
 pref("browser.search.serpEventTelemetry.enabled", false);
 #endif

+// Enables search SERP telemetry page categorization.
+pref("browser.search.serpEventTelemetryCategorization.enabled", false);
+
 // Enable new experimental shopping features. This is solely intended as a
 // rollout/"emergency stop" button - it will go away once the feature has
 // rolled out. There will be separate controls for user opt-in/opt-out.
--- a/browser/components/search/SearchSERPTelemetry.sys.mjs
+++ b/browser/components/search/SearchSERPTelemetry.sys.mjs
@ -38,6 +38,13 @@ XPCOMUtils.defineLazyPreferenceGetter(
  false
 );

+XPCOMUtils.defineLazyPreferenceGetter(
+  lazy,
+  "serpEventTelemetryCategorization",
+  "browser.search.serpEventTelemetryCategorization.enabled",
+  false
+);
+
 export var SearchSERPTelemetryUtils = {
  ACTIONS: {
    CLICKED: "clicked",
@ -322,6 +329,10 @@ class TelemetryHandler {
    this._contentHandler._reportPageWithAdImpressions(info, browser);
  }

+  /**
+   * Forwards the domains reported for a page to the content handler.
+   *
+   * @param {object} info
+   *   Passed through unchanged to the content handler's _reportPageDomains.
+   * @param {object} browser
+   *   Passed through unchanged to the content handler's _reportPageDomains.
+   */
+  reportPageDomains(info, browser) {
+    this._contentHandler._reportPageDomains(info, browser);
+  }
+
  reportPageImpression(info, browser) {
    this._contentHandler._reportPageImpression(info, browser);
  }
@ -1307,6 +1318,138 @@ class ContentHandler {
      lazy.logConsole.debug("Could not find an impression id.");
    }
  }
+
+  /**
+   * Initiates the categorization and reporting of domains extracted from
+   * SERPs.
+   *
+   * @param {object} info
+   *   The search provider infomation for the page.
+   * @param {Set} info.nonAdDomains
+       The non-ad domains extracted from the page. 
+   * @param {Set} info.adDomains
+       The ad domains extracted from the page. 
+   * @param {object} browser
+   *   The browser associated with the page.
+   */
+  _reportPageDomains(info, browser) {
+    let item = this._findBrowserItemForURL(info.url);
+    let telemetryState = item.browserTelemetryStateMap.get(browser);
+    if (lazy.serpEventTelemetryCategorization && telemetryState) {
+      let provider = item?.info.provider;
+      if (provider) {
+        SearchSERPCategorization.categorizeDomainsFromProvider(
+          info.nonAdDomains,
+          info.adDomains,
+          provider
+        );
+        Services.obs.notifyObservers(
+          null,
+          "reported-page-with-categorized-domains"
+        );
+      }
+    }
+  }
+}
+
+/**
+ * Categorizes SERPs.
+ */
+class DomainCategorizer {
+  /**
+   * Categorizes domains extracted from SERPs.
+   *
+   * @param {Set} nonAdDomains
+   *   The non-ad domains extracted from the page.
+   * @param {Set} adDomains
+   *   The ad domains extracted from the page.
+   * @param {string} provider
+   *   The provider associated with the page.
+   */
+  categorizeDomainsFromProvider(nonAdDomains, adDomains, provider) {
+    nonAdDomains = this.processDomains(nonAdDomains, provider);
+    this.applyCategorizationLogic(nonAdDomains, false);
+    this.logDomains(nonAdDomains, false);
+
+    adDomains = this.processDomains(adDomains, provider);
+    this.applyCategorizationLogic(adDomains, true);
+    this.logDomains(adDomains, true);
+  }
+
+  // TODO: insert logic from DS for reducing extracted domains to a single
+  // category for the SERP.
+  applyCategorizationLogic(domains, areAdDomains) {}
+
+  // TODO: replace this method once we know where to send the categorized
+  // domains and overall SERP category.
+  logDomains(domains, areAdDomains) {
+    if (domains?.size) {
+      lazy.logConsole.debug(
+        areAdDomains ? "Ad Domains:" : "Domains:",
+        ...domains
+      );
+    }
+  }
+
+  /**
+   * Processes raw domains extracted from the SERP into their final form before
+   * categorization.
+   *
+   * @param {Set} domains
+   *   The domains extracted from the page.
+   * @param {string} provider
+   *   The provider associated with the page.
+   * @returns {Set} processedDomains
+   *   The final set of processed domains for a page.
+   */
+  processDomains(domains, provider) {
+    let processedDomains = new Set();
+
+    for (let domain of domains) {
+      // Don't include domains associated with the search provider.
+      if (
+        domain.startsWith(`${provider}.`) ||
+        domain.includes(`.${provider}.`)
+      ) {
+        continue;
+      }
+      let domainWithoutSubdomains = this.#stripDomainOfSubdomains(domain);
+      // We may have come across the same domain twice, once with www. prefixed
+      // and another time without.
+      if (
+        domainWithoutSubdomains &&
+        !processedDomains.has(domainWithoutSubdomains)
+      ) {
+        processedDomains.add(domainWithoutSubdomains);
+      }
+    }
+
+    return processedDomains;
+  }
+
+  /**
+   * Helper to strip domains of any subdomains.
+   *
+   * @param {string} domain
+   *   The domain to strip of any subdomains.
+   * @returns {object} browser
+   *   The given domain with any subdomains removed.
+   */
+  #stripDomainOfSubdomains(domain) {
+    let tld;
+    // Can throw an exception if the input has too few domain levels.
+    try {
+      tld = Services.eTLD.getKnownPublicSuffixFromHost(domain);
+    } catch (ex) {
+      return "";
+    }
+
+    let domainWithoutTLD = domain.substring(0, domain.length - tld.length);
+    let secondLevelDomain = domainWithoutTLD.split(".").at(-2);
+
+    return secondLevelDomain ? `${secondLevelDomain}.${tld}` : "";
+  }
 }

 export var SearchSERPTelemetry = new TelemetryHandler();
+export var SearchSERPCategorization = new DomainCategorizer();
--- a/browser/components/search/test/browser/telemetry/browser.ini
+++ b/browser/components/search/test/browser/telemetry/browser.ini
@ -25,6 +25,12 @@ support-files =
  serp.css
 [browser_search_telemetry_categorization_timing.js]
 [browser_search_telemetry_content.js]
+[browser_search_telemetry_domain_categorization_extraction.js]
+support-files =
+  searchTelemetryDomainExtraction.html
+[browser_search_telemetry_domain_categorization_reporting.js]
+support-files =
+  searchTelemetryDomainCategorizationReporting.html
 [browser_search_telemetry_engagement_cached.js]
 support-files =
  cacheable.html
--- a/browser/components/search/test/browser/telemetry/browser_search_telemetry_domain_categorization_extraction.js
+++ b/browser/components/search/test/browser/telemetry/browser_search_telemetry_domain_categorization_extraction.js
@ -0,0 +1,224 @@
+/* Any copyright is dedicated to the Public Domain.
+   http://creativecommons.org/publicdomain/zero/1.0/ */
+
+"use strict";
+
+/*
+ * This test ensures we are correctly extracting domains from a SERP.
+ */
+
+ChromeUtils.defineESModuleGetters(this, {
+  SearchSERPCategorization: "resource:///modules/SearchSERPTelemetry.sys.mjs",
+  SearchUtils: "resource://gre/modules/SearchUtils.sys.mjs",
+});
+
+const TESTS = [
+  {
+    title: "Extract domain from href (absolute URL) - one link.",
+    extractorInfos: [
+      {
+        selectors:
+          '#test1 [data-layout="organic"] a[data-testid="result-title-a"]',
+        method: "href",
+      },
+    ],
+    expectedDomains: ["foobar.com"],
+  },
+  {
+    title: "Extract domain from href (absolute URL) - multiple links.",
+    extractorInfos: [
+      {
+        selectors:
+          '#test2 [data-layout="organic"] a[data-testid="result-title-a"]',
+        method: "href",
+      },
+    ],
+    expectedDomains: ["foo.com", "bar.com", "baz.com", "qux.com"],
+  },
+  {
+    title: "Extract domain from href (relative URL).",
+    extractorInfos: [
+      {
+        selectors:
+          '#test3 [data-layout="organic"] a[data-testid="result-title-a"]',
+        method: "href",
+      },
+    ],
+    expectedDomains: ["example.org"],
+  },
+  {
+    title: "Extract domain from data attribute - one link.",
+    extractorInfos: [
+      {
+        selectors: "#test4 [data-dtld]",
+        method: "data-attribute",
+        options: {
+          dataAttributeKey: "dtld",
+        },
+      },
+    ],
+    expectedDomains: ["www.abc.com"],
+  },
+  {
+    title: "Extract domain from data attribute - multiple links.",
+    extractorInfos: [
+      {
+        selectors: "#test5 [data-dtld]",
+        method: "data-attribute",
+        options: {
+          dataAttributeKey: "dtld",
+        },
+      },
+    ],
+    expectedDomains: [
+      "www.foo.com",
+      "www.bar.com",
+      "www.baz.com",
+      "www.qux.com",
+    ],
+  },
+  {
+    title: "Extract domain from an href's query param value.",
+    extractorInfos: [
+      {
+        selectors:
+          '#test6 .js-carousel-item-title, #test6 [data-layout="ad"] [data-testid="result-title-a"]',
+        method: "href",
+        options: {
+          queryParamKey: "ad_domain",
+        },
+      },
+    ],
+    expectedDomains: ["def.com"],
+  },
+  {
+    title: "Extraction preserves order of domains within the page.",
+    extractorInfos: [
+      {
+        selectors:
+          '#test7 [data-layout="organic"] a[data-testid="result-title-a"]',
+        method: "href",
+      },
+      {
+        selectors: "#test7 [data-dtld]",
+        method: "data-attribute",
+        options: {
+          dataAttributeKey: "dtld",
+        },
+      },
+      {
+        selectors:
+          '#test7 .js-carousel-item-title, #test7 [data-layout="ad"] [data-testid="result-title-a"]',
+        method: "href",
+        options: {
+          queryParamKey: "ad_domain",
+        },
+      },
+    ],
+    expectedDomains: ["foobar.com", "www.abc.com", "def.com"],
+  },
+  {
+    title: "No elements match the selectors.",
+    extractorInfos: [
+      {
+        selectors:
+          '#test8 [data-layout="organic"] a[data-testid="result-title-a"]',
+        method: "href",
+      },
+    ],
+    expectedDomains: [],
+  },
+  {
+    title: "Data attribute is present, but value is missing.",
+    extractorInfos: [
+      {
+        selectors: "#test9 [data-dtld]",
+        method: "data-attribute",
+        options: {
+          dataAttributeKey: "dtld",
+        },
+      },
+    ],
+    expectedDomains: [],
+  },
+  {
+    title: "Query param is present, but value is missing.",
+    extractorInfos: [
+      {
+        selectors: '#test10 [data-layout="ad"] [data-testid="result-title-a"]',
+        method: "href",
+        options: {
+          queryParamKey: "ad_domain",
+        },
+      },
+    ],
+    expectedDomains: [],
+  },
+  {
+    title: "Non-standard URL scheme.",
+    extractorInfos: [
+      {
+        selectors:
+          '#test11 [data-layout="organic"] a[data-testid="result-title-a"]',
+        method: "href",
+      },
+    ],
+    expectedDomains: [],
+  },
+];
+
+// One-time setup: wait for idle, turn on search logging plus the SERP
+// telemetry and categorization prefs, then initialize SearchSERPTelemetry.
+add_setup(async function () {
+  await waitForIdle();
+
+  await SpecialPowers.pushPrefEnv({
+    set: [
+      ["browser.search.log", true],
+      ["browser.search.serpEventTelemetry.enabled", true],
+      ["browser.search.serpEventTelemetryCategorization.enabled", true],
+    ],
+  });
+
+  await SearchSERPTelemetry.init();
+
+  // On cleanup, call the provider override with no arguments and reset
+  // recorded telemetry.
+  registerCleanupFunction(async () => {
+    SearchSERPTelemetry.overrideSearchTelemetryForTests();
+    resetTelemetry();
+  });
+});
+
+add_task(async function test_domain_extraction_heuristics() {
+  resetTelemetry();
+  let url = getSERPUrl("searchTelemetryDomainExtraction.html");
+  info(
+    "Load a sample SERP where domains need to be extracted in different ways."
+  );
+  let tab = await BrowserTestUtils.openNewForegroundTab(gBrowser, url);
+
+  for (let currentTest of TESTS) {
+    if (currentTest.title) {
+      info(currentTest.title);
+    }
+    let expectedDomains = new Set(currentTest.expectedDomains);
+    let actualDomains = await SpecialPowers.spawn(
+      gBrowser.selectedBrowser,
+      [currentTest.extractorInfos],
+      extractorInfos => {
+        const { domainExtractor } = ChromeUtils.importESModule(
+          "resource:///actors/SearchSERPTelemetryChild.sys.mjs"
+        );
+        return domainExtractor.extractDomainsFromDocument(
+          content.document,
+          extractorInfos
+        );
+      }
+    );
+
+    Assert.deepEqual(
+      Array.from(actualDomains),
+      Array.from(expectedDomains),
+      "Domains should have been extracted correctly."
+    );
+  }
+
+  BrowserTestUtils.removeTab(tab);
+});
--- a/browser/components/search/test/browser/telemetry/browser_search_telemetry_domain_categorization_reporting.js
+++ b/browser/components/search/test/browser/telemetry/browser_search_telemetry_domain_categorization_reporting.js
@ -0,0 +1,108 @@
+/* Any copyright is dedicated to the Public Domain.
+   http://creativecommons.org/publicdomain/zero/1.0/ */
+
+"use strict";
+
+/*
+ * This test ensures we are correctly reporting categorized domains from a SERP.
+ */
+
+ChromeUtils.defineESModuleGetters(this, {
+  SearchSERPCategorization: "resource:///modules/SearchSERPTelemetry.sys.mjs",
+  SearchUtils: "resource://gre/modules/SearchUtils.sys.mjs",
+  sinon: "resource://testing-common/Sinon.sys.mjs",
+});
+
+const TEST_PROVIDER_INFO = [
+  {
+    telemetryId: "example",
+    searchPageRegexp:
+      /^https:\/\/example.org\/browser\/browser\/components\/search\/test\/browser\/telemetry\/searchTelemetry/,
+    queryParamName: "s",
+    codeParamName: "abc",
+    taggedCodes: ["ff"],
+    adServerAttributes: ["mozAttr"],
+    nonAdsLinkRegexps: [/^https:\/\/example.com/],
+    extraAdServersRegexps: [/^https:\/\/example\.com\/ad/],
+    // The search telemetry entry responsible for targeting the specific results.
+    domainExtraction: {
+      ads: [
+        {
+          selectors: "[data-ad-domain]",
+          method: "data-attribute",
+          options: {
+            dataAttributeKey: "adDomain",
+          },
+        },
+        {
+          selectors: ".ad",
+          method: "href",
+          options: {
+            queryParamKey: "ad_domain",
+          },
+        },
+      ],
+      nonAds: [
+        {
+          selectors: "#results .organic a",
+          method: "href",
+        },
+      ],
+    },
+    components: [
+      {
+        type: SearchSERPTelemetryUtils.COMPONENTS.AD_LINK,
+        default: true,
+      },
+    ],
+  },
+];
+
+// Sinon stub for SearchSERPCategorization.logDomains; created in setup so the
+// test task can inspect the arguments of each call.
+let stub;
+// One-time setup: install the test provider info, enable the relevant prefs,
+// initialize SearchSERPTelemetry and stub out logDomains.
+add_setup(async function () {
+  SearchSERPTelemetry.overrideSearchTelemetryForTests(TEST_PROVIDER_INFO);
+  await waitForIdle();
+
+  await SpecialPowers.pushPrefEnv({
+    set: [
+      ["browser.search.log", true],
+      ["browser.search.serpEventTelemetry.enabled", true],
+      ["browser.search.serpEventTelemetryCategorization.enabled", true],
+    ],
+  });
+
+  await SearchSERPTelemetry.init();
+
+  stub = sinon.stub(SearchSERPCategorization, "logDomains");
+
+  // On cleanup, restore the stubbed method, call the provider override with
+  // no arguments and reset recorded telemetry.
+  registerCleanupFunction(async () => {
+    stub.restore();
+    SearchSERPTelemetry.overrideSearchTelemetryForTests();
+    resetTelemetry();
+  });
+});
+
+// Loads the sample SERP, waits for the categorization notification and then
+// checks the domain sets that were handed to the stubbed logDomains.
+add_task(async function test_categorization_reporting() {
+  resetTelemetry();
+  let url = getSERPUrl("searchTelemetryDomainCategorizationReporting.html");
+  info("Load a sample SERP with organic results.");
+  // Start observing before the tab is opened so the notification cannot be
+  // missed.
+  let promise = waitForPageWithCategorizedDomains();
+  let tab = await BrowserTestUtils.openNewForegroundTab(gBrowser, url);
+  await promise;
+
+  // TODO: This needs to be refactored to actually test the reporting of the
+  // categorization.
+  // The first logDomains call is expected to carry the non-ad domains.
+  Assert.deepEqual(
+    Array.from(stub.getCall(0).args[0]),
+    ["foobar.org"],
+    "Categorization of non-ads should match."
+  );
+
+  // The second logDomains call is expected to carry the ad domains.
+  Assert.deepEqual(
+    Array.from(stub.getCall(1).args[0]),
+    ["abc.org", "def.org"],
+    "Categorization of ads should match."
+  );
+
+  BrowserTestUtils.removeTab(tab);
+});
--- a/browser/components/search/test/browser/telemetry/head.js
+++ b/browser/components/search/test/browser/telemetry/head.js
@ -340,6 +340,22 @@ async function waitForPageWithAdImpressions() {
  });
 }

+async function waitForPageWithCategorizedDomains() {
+  return new Promise(resolve => {
+    let listener = win => {
+      Services.obs.removeObserver(
+        listener,
+        "reported-page-with-categorized-domains"
+      );
+      resolve();
+    };
+    Services.obs.addObserver(
+      listener,
+      "reported-page-with-categorized-domains"
+    );
+  });
+}
+
 async function promiseImpressionReceived() {
  return TestUtils.waitForCondition(() => {
    let adImpressions = Glean.serp.adImpression.testGetValue() ?? [];
--- a/browser/components/search/test/browser/telemetry/searchTelemetryDomainCategorization.html
+++ b/browser/components/search/test/browser/telemetry/searchTelemetryDomainCategorization.html
@ -0,0 +1,45 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="UTF-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  <title>Document</title>
+</head>
+<body>
+  <div id="results">
+    <!-- Don't include domains matching the provider. -->
+    <div class="organic">
+      <a href="https://www.example.com"></a>
+      <a href="https://example.com"></a>
+    </div>
+    <div class="organic">
+      <a href="https://www.foobar.org"></a>
+    </div>
+    <div data-ad-domain="abc.org">
+      <a href="https://www.example.com/"></a>
+    </div>
+    <div>
+      <a class="ad" href="https://www.example.com/?ad_domain=def.org"></a>
+    </div>
+    <!-- Don't throw on anchors with non-standard or non-existent hrefs -->
+    <div>
+      <a href="javascript:console.log('hello world')">A javascript: URL link</a>
+    </div>
+    <div>
+      <a>An anchor that's missing an href attribute</a>
+    </div>
+    <div>
+      <a href="#">An anchor with a dummy href attribute value</a>
+    </div>
+  </div>
+  <aside>
+    <div class="organic">
+      <a href="https://foobaz.com"></a>
+    </div>
+  </aside>
+  <div class="organic">
+    <!-- Should not find this because it's not part of the results -->
+    <a href="https://outside-results.ca"></a>
+  </div>
+</body>
+</html>
--- a/browser/components/search/test/browser/telemetry/searchTelemetryDomainCategorizationReporting.html
+++ b/browser/components/search/test/browser/telemetry/searchTelemetryDomainCategorizationReporting.html
@ -0,0 +1,45 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="UTF-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  <title>Document</title>
+</head>
+<body>
+  <div id="results">
+    <!-- Don't include domains matching the provider. -->
+    <div class="organic">
+      <a href="https://www.example.com"></a>
+      <a href="https://example.com"></a>
+    </div>
+    <div class="organic">
+      <a href="https://www.foobar.org"></a>
+    </div>
+    <div data-ad-domain="abc.org">
+      <a href="https://www.example.com/"></a>
+    </div>
+    <div>
+      <a class="ad" href="https://www.example.com/?ad_domain=def.org"></a>
+    </div>
+    <!-- Don't throw on anchors with non-standard or non-existent hrefs -->
+    <div>
+      <a href="javascript:console.log('hello world')">A javascript: URL link</a>
+    </div>
+    <div>
+      <a>An anchor that's missing an href attribute</a>
+    </div>
+    <div>
+      <a href="#">An anchor with a dummy href attribute value</a>
+    </div>
+  </div>
+  <aside>
+    <div class="organic">
+      <a href="https://foobaz.com"></a>
+    </div>
+  </aside>
+  <div class="organic">
+    <!-- Should not find this because it's not part of the results -->
+    <a href="https://outside-results.ca"></a>
+  </div>
+</body>
+</html>
--- a/browser/components/search/test/browser/telemetry/searchTelemetryDomainExtraction.html
+++ b/browser/components/search/test/browser/telemetry/searchTelemetryDomainExtraction.html
@ -0,0 +1,72 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="UTF-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  <title>Document</title>
+</head>
+<body>
+  <div id="results">
+    <div id="test1">
+      <div data-layout="organic">
+        <a href="https://foobar.com" data-testid="result-title-a">Extract domain from href (absolute URL).</a>
+      </div>
+    </div>
+
+     <div id="test2">
+      <div data-layout="organic">
+        <a href="https://foo.com" data-testid="result-title-a">Extract domain from href (absolute URL) - link1.</a>
+        <a href="https://bar.com" data-testid="result-title-a">Extract domain from href (absolute URL) - link2.</a>
+        <a href="https://baz.com" data-testid="result-title-a">Extract domain from href (absolute URL) - link3.</a>
+        <a href="https://qux.com" data-testid="result-title-a">Extract domain from href (absolute URL) - link4.</a>
+      </div>
+    </div>
+
+    <div id="test3">
+      <div data-layout="organic">
+        <a href="/dummy-page" data-testid="result-title-a">Extract domain from href (relative URL).</a>
+      </div>
+    </div>
+
+    <div id="test4">
+      <a href="#" data-dtld="www.abc.com">Extract domain from data attribute.</a>
+    </div>
+
+    <div id="test5">
+      <a href="#" data-dtld="www.foo.com">Extract domain from data attribute - link1.</a>
+      <a href="#" data-dtld="www.bar.com">Extract domain from data attribute - link2.</a>
+      <a href="#" data-dtld="www.baz.com">Extract domain from data attribute - link3.</a>
+      <a href="#" data-dtld="www.qux.com">Extract domain from data attribute - link4.</a>
+    </div>
+
+    <div id="test6">
+      <a href="example.com/testing?ad_domain=def.com" class="js-carousel-item-title">Extract domain from an href's query param value.</a>
+    </div>
+
+    <div id="test7">
+      <!-- Extraction preserves order of domains within the page. -->
+      <div data-layout="organic">
+        <a href="https://foobar.com" data-testid="result-title-a">Extract domain from href (absolute URL).</a>
+        <a href="#" data-dtld="www.abc.com">Extract domain from data attribute.</a>
+        <a href="example.com/testing?ad_domain=def.com" class="js-carousel-item-title">Extract domain from an href's query param value.</a>
+      </div>
+    </div>
+
+    <div id="test8">
+      <a href="nomatches.com">Link that doesn't match a selector.</a>
+    </div>
+
+    <div id="test9">
+      <a href="#" data-dtld="">Data attribute is present, but value is missing.</a>
+    </div>
+
+    <div id="test10">
+      <a href="example.com/testing?ad_domain=" class="js-carousel-item-title">Query param is present, but value is missing.</a>
+    </div>
+
+    <div id="test11">
+      <a href="git://testing.com/testrepo">Non-standard URL scheme.</a>
+    </div>
+  </div>
+</body>
+</html>
--- a/browser/components/search/test/unit/test_search_telemetry_categorization_process_domains.js
+++ b/browser/components/search/test/unit/test_search_telemetry_categorization_process_domains.js
@ -0,0 +1,90 @@
+/* Any copyright is dedicated to the Public Domain.
+   http://creativecommons.org/publicdomain/zero/1.0/ */
+
+/*
+ * This test ensures we are correctly processing the domains that have been
+ * extracted from a SERP.
+ */
+
+ChromeUtils.defineESModuleGetters(this, {
+  BrowserSearchTelemetry: "resource:///modules/BrowserSearchTelemetry.sys.mjs",
+  SearchSERPCategorization: "resource:///modules/SearchSERPTelemetry.sys.mjs",
+  SearchSERPTelemetry: "resource:///modules/SearchSERPTelemetry.sys.mjs",
+  SearchUtils: "resource://gre/modules/SearchUtils.sys.mjs",
+  sinon: "resource://testing-common/Sinon.sys.mjs",
+});
+
+// Links including the provider name are not extracted.
+const PROVIDER = "example";
+
+const TESTS = [
+  {
+    title: "Domains matching the provider.",
+    domains: ["example.com", "www.example.com", "www.foobar.com"],
+    expected: ["foobar.com"],
+  },
+  {
+    title: "Second-level domains to a top-level domain.",
+    domains: [
+      "www.foobar.gc.ca",
+      "www.foobar.gov.uk",
+      "foobar.co.uk",
+      "www.foobar.co.il",
+    ],
+    expected: ["foobar.gc.ca", "foobar.gov.uk", "foobar.co.uk", "foobar.co.il"],
+  },
+  {
+    title: "Long subdomain.",
+    domains: ["ab.cd.ef.gh.foobar.com"],
+    expected: ["foobar.com"],
+  },
+  {
+    title: "Same top-level domain.",
+    domains: ["foobar.com", "www.foobar.com", "abc.def.foobar.com"],
+    expected: ["foobar.com"],
+  },
+  {
+    title: "Empty input.",
+    domains: [""],
+    expected: [],
+  },
+];
+
+// One-time setup: enable search logging plus the SERP telemetry and
+// categorization prefs, then initialize SearchSERPTelemetry.
+add_setup(async function () {
+  Services.prefs.setBoolPref(SearchUtils.BROWSER_SEARCH_PREF + "log", true);
+  Services.prefs.setBoolPref(
+    SearchUtils.BROWSER_SEARCH_PREF + "serpEventTelemetry.enabled",
+    true
+  );
+  Services.prefs.setBoolPref(
+    SearchUtils.BROWSER_SEARCH_PREF +
+      "serpEventTelemetryCategorization.enabled",
+    true
+  );
+
+  // Required or else BrowserSearchTelemetry will throw.
+  sinon.stub(BrowserSearchTelemetry, "shouldRecordSearchCount").returns(true);
+  await SearchSERPTelemetry.init();
+});
+
+add_task(async function test_parsing_extracted_urls() {
+  for (let i = 0; i < TESTS.length; i++) {
+    let currentTest = TESTS[i];
+    let domains = new Set(currentTest.domains);
+
+    if (currentTest.title) {
+      info(currentTest.title);
+    }
+    let expectedDomains = new Set(currentTest.expected);
+    let actualDomains = SearchSERPCategorization.processDomains(
+      domains,
+      PROVIDER
+    );
+
+    Assert.deepEqual(
+      Array.from(actualDomains),
+      Array.from(expectedDomains),
+      "Domains should have been parsed correctly."
+    );
+  }
+});
--- a/browser/components/search/test/unit/xpcshell.ini
+++ b/browser/components/search/test/unit/xpcshell.ini
@ -2,6 +2,7 @@
 skip-if = toolkit == 'android' # bug 1730213
 firefox-appdir = browser

+[test_search_telemetry_categorization_process_domains.js]
 [test_search_telemetry_config_validation.js]
 support-files =
  ../../schema/search-telemetry-schema.json