Bug 1846357 - Extract domains for basic SERP links. r=jteow

Differential Revision: https://phabricator.services.mozilla.com/D185609
This commit is contained in:
Stephanie Cunnane 2023-09-05 16:49:53 +00:00
Родитель f8c85de251
Коммит 6bd56eddeb
13 изменённых файлов: 940 добавлений и 0 удалений

Просмотреть файл

@ -18,6 +18,13 @@ XPCOMUtils.defineLazyPreferenceGetter(
false
);
XPCOMUtils.defineLazyPreferenceGetter(
lazy,
"serpEventTelemetryCategorization",
"browser.search.serpEventTelemetryCategorization.enabled",
false
);
const SHARED_DATA_KEY = "SearchTelemetry:ProviderInfo";
export const ADLINK_CHECK_TIMEOUT_MS = 1000;
@ -848,6 +855,154 @@ class SearchAdImpression {
}
}
/**
* An object indicating which elements to examine for domains to extract and
* which heuristic technique to use to extract that element's domain.
*
* @typedef {object} ExtractorInfo
* @property {string} selectors
* A string representing the CSS selector that targets the elements on the
* page that contain domains we want to extract.
* @property {string} method
* A string representing which domain extraction heuristic to use.
* One of: "href" or "data-attribute".
* @property {object | null} options
* Options related to the domain extraction heuristic used.
* @property {string | null} options.dataAttributeKey
* The key name of the data attribute to lookup.
* @property {string | null} options.queryParamKey
* The key name of the query param value to lookup.
*/
/**
* DomainExtractor examines elements on a page to retrieve the domains.
*/
class DomainExtractor {
  /**
   * Extract domains from the page using an array of information pertaining to
   * the SERP.
   *
   * Entries are processed in order and every entry appends to the same result
   * set, so the returned set preserves first-seen order across entries.
   * Entries without selectors, entries whose selectors match nothing and
   * entries with an unrecognised method are skipped.
   *
   * @param {Document} document
   *   The document for the SERP we are extracting domains from.
   * @param {Array<ExtractorInfo>} extractorInfos
   *   Information used to target the domains we need to extract.
   * @returns {Set<string>}
   *   A set of the domains extracted from the page.
   */
  extractDomainsFromDocument(document, extractorInfos) {
    let extractedDomains = new Set();
    if (!extractorInfos?.length) {
      return extractedDomains;
    }

    // Origin is used in case a URL needs to be made absolute. It is the same
    // for every entry, so it is computed at most once, and only when an
    // "href" entry actually needs it.
    let origin;

    for (let extractorInfo of extractorInfos) {
      if (!extractorInfo.selectors) {
        continue;
      }

      // querySelectorAll returns an (possibly empty) list rather than null, so
      // emptiness is what has to be checked here.
      let elements = document.querySelectorAll(extractorInfo.selectors);
      if (!elements?.length) {
        continue;
      }

      switch (extractorInfo.method) {
        case "href": {
          origin ??= new URL(document.documentURI).origin;
          this.#fromElementsConvertHrefsIntoDomains(
            elements,
            origin,
            extractedDomains,
            extractorInfo.options?.queryParamKey
          );
          break;
        }
        case "data-attribute": {
          this.#fromElementsRetrieveDataAttributeValues(
            elements,
            extractorInfo.options?.dataAttributeKey,
            extractedDomains
          );
          break;
        }
      }
    }

    return extractedDomains;
  }

  /**
   * Given a list of elements, extract domains using href attributes.
   *
   * When a query param key is given, the domain is the value of that query
   * param in the href's URL; elements whose URL lacks the param, or has an
   * empty value for it, are skipped. When no query param key is given, the
   * domain is the hostname of the href's URL.
   *
   * Elements without an href attribute, with an href that cannot be parsed,
   * or with a scheme other than http(s) are skipped.
   *
   * @param {NodeList<Element>} elements
   *   A list of elements from the page whose href attributes we want to
   *   inspect.
   * @param {string} origin
   *   Origin of the current page.
   * @param {Set<string>} extractedDomains
   *   The result set of domains extracted from the page.
   * @param {string | null} queryParam
   *   An optional query param to search for in an element's href attribute.
   */
  #fromElementsConvertHrefsIntoDomains(
    elements,
    origin,
    extractedDomains,
    queryParam
  ) {
    for (let element of elements) {
      let href = element.getAttribute("href");
      // A missing attribute comes back as null. Passing null to the URL
      // constructor stringifies it to "null" and resolves it against the
      // origin, which would wrongly record the SERP's own host.
      if (href === null) {
        continue;
      }

      let url;
      try {
        url = new URL(href, origin);
      } catch (ex) {
        continue;
      }

      // Ignore non-standard protocols.
      if (url.protocol != "https:" && url.protocol != "http:") {
        continue;
      }

      let domain = queryParam ? url.searchParams.get(queryParam) : url.hostname;
      if (domain) {
        extractedDomains.add(domain);
      }
    }
  }

  /**
   * Given a list of elements, examine each for the specified data attribute.
   * If found, add that data attribute's value to the result set of extracted
   * domains as is. Elements with a missing or empty value are skipped.
   *
   * @param {NodeList<Element>} elements
   *   A list of elements from the page whose data attributes we want to
   *   inspect.
   * @param {string} attribute
   *   The name of a data attribute to search for within an element.
   * @param {Set<string>} extractedDomains
   *   The result set of domains extracted from the page.
   */
  #fromElementsRetrieveDataAttributeValues(
    elements,
    attribute,
    extractedDomains
  ) {
    // Without a key there is nothing to look up. Indexing the dataset with
    // undefined would read the unrelated "undefined" entry instead.
    if (!attribute) {
      return;
    }

    for (let element of elements) {
      let value = element.dataset[attribute];
      if (value) {
        extractedDomains.add(value);
      }
    }
  }
}
export const domainExtractor = new DomainExtractor();
const searchProviders = new SearchProviders();
const searchAdImpression = new SearchAdImpression();
@ -967,6 +1122,34 @@ export class SearchSERPTelemetryChild extends JSWindowActorChild {
});
}
}
if (
lazy.serpEventTelemetryCategorization &&
providerInfo.domainExtraction &&
(eventType == "load" || eventType == "pageshow")
) {
let start = Cu.now();
let nonAdDomains = domainExtractor.extractDomainsFromDocument(
doc,
providerInfo.domainExtraction.nonAds
);
let adDomains = domainExtractor.extractDomainsFromDocument(
doc,
providerInfo.domainExtraction.ads
);
this.sendAsyncMessage("SearchTelemetry:Domains", {
url,
nonAdDomains,
adDomains,
});
ChromeUtils.addProfilerMarker(
"SearchSERPTelemetryChild._checkForAdLink",
start,
"Extract domains from elements"
);
}
}
/**

Просмотреть файл

@ -29,6 +29,10 @@ export class SearchSERPTelemetryParent extends JSWindowActorParent {
lazy.SearchSERPTelemetry.reportPageImpression(msg.data, browser);
break;
}
case "SearchTelemetry:Domains": {
lazy.SearchSERPTelemetry.reportPageDomains(msg.data, browser);
break;
}
}
}
}

Просмотреть файл

@ -724,6 +724,9 @@ pref("browser.search.serpEventTelemetry.enabled", true);
pref("browser.search.serpEventTelemetry.enabled", false);
#endif
// Enables search SERP telemetry page categorization.
pref("browser.search.serpEventTelemetryCategorization.enabled", false);
// Enable new experimental shopping features. This is solely intended as a
// rollout/"emergency stop" button - it will go away once the feature has
// rolled out. There will be separate controls for user opt-in/opt-out.

Просмотреть файл

@ -38,6 +38,13 @@ XPCOMUtils.defineLazyPreferenceGetter(
false
);
XPCOMUtils.defineLazyPreferenceGetter(
lazy,
"serpEventTelemetryCategorization",
"browser.search.serpEventTelemetryCategorization.enabled",
false
);
export var SearchSERPTelemetryUtils = {
ACTIONS: {
CLICKED: "clicked",
@ -322,6 +329,10 @@ class TelemetryHandler {
this._contentHandler._reportPageWithAdImpressions(info, browser);
}
reportPageDomains(info, browser) {
this._contentHandler._reportPageDomains(info, browser);
}
reportPageImpression(info, browser) {
this._contentHandler._reportPageImpression(info, browser);
}
@ -1307,6 +1318,138 @@ class ContentHandler {
lazy.logConsole.debug("Could not find an impression id.");
}
}
/**
* Initiates the categorization and reporting of domains extracted from
* SERPs.
*
* @param {object} info
* The search provider infomation for the page.
* @param {Set} info.nonAdDomains
The non-ad domains extracted from the page.
* @param {Set} info.adDomains
The ad domains extracted from the page.
* @param {object} browser
* The browser associated with the page.
*/
_reportPageDomains(info, browser) {
let item = this._findBrowserItemForURL(info.url);
let telemetryState = item.browserTelemetryStateMap.get(browser);
if (lazy.serpEventTelemetryCategorization && telemetryState) {
let provider = item?.info.provider;
if (provider) {
SearchSERPCategorization.categorizeDomainsFromProvider(
info.nonAdDomains,
info.adDomains,
provider
);
Services.obs.notifyObservers(
null,
"reported-page-with-categorized-domains"
);
}
}
}
}
/**
* Categorizes SERPs.
*/
class DomainCategorizer {
  /**
   * Categorizes domains extracted from SERPs. Non-ad domains are processed,
   * categorized and logged first, followed by ad domains.
   *
   * @param {Set} nonAdDomains
   *   The non-ad domains extracted from the page.
   * @param {Set} adDomains
   *   The ad domains extracted from the page.
   * @param {string} provider
   *   The provider associated with the page.
   */
  categorizeDomainsFromProvider(nonAdDomains, adDomains, provider) {
    let processedNonAdDomains = this.processDomains(nonAdDomains, provider);
    this.applyCategorizationLogic(processedNonAdDomains, false);
    this.logDomains(processedNonAdDomains, false);

    let processedAdDomains = this.processDomains(adDomains, provider);
    this.applyCategorizationLogic(processedAdDomains, true);
    this.logDomains(processedAdDomains, true);
  }

  // TODO: insert logic from DS for reducing extracted domains to a single
  // category for the SERP.
  applyCategorizationLogic(domains, areAdDomains) {}

  // TODO: replace this method once we know where to send the categorized
  // domains and overall SERP category.
  logDomains(domains, areAdDomains) {
    if (domains?.size) {
      lazy.logConsole.debug(
        areAdDomains ? "Ad Domains:" : "Domains:",
        ...domains
      );
    }
  }

  /**
   * Processes raw domains extracted from the SERP into their final form before
   * categorization: domains associated with the provider are dropped and the
   * remaining ones are reduced to their second-level domain plus public
   * suffix.
   *
   * @param {Set} domains
   *   The domains extracted from the page. A missing (null or undefined)
   *   collection is treated as empty.
   * @param {string} provider
   *   The provider associated with the page.
   * @returns {Set} processedDomains
   *   The final set of processed domains for a page.
   */
  processDomains(domains, provider) {
    let processedDomains = new Set();
    if (!domains) {
      return processedDomains;
    }

    for (let domain of domains) {
      // Don't include domains associated with the search provider.
      if (
        domain.startsWith(`${provider}.`) ||
        domain.includes(`.${provider}.`)
      ) {
        continue;
      }

      // We may have come across the same domain twice, once with www. prefixed
      // and another time without. The set takes care of the duplicates.
      let domainWithoutSubdomains = this.#stripDomainOfSubdomains(domain);
      if (domainWithoutSubdomains) {
        processedDomains.add(domainWithoutSubdomains);
      }
    }

    return processedDomains;
  }

  /**
   * Helper to strip domains of any subdomains.
   *
   * @param {string} domain
   *   The domain to strip of any subdomains.
   * @returns {string}
   *   The given domain with any subdomains removed, or an empty string if the
   *   public suffix lookup throws or no label precedes the public suffix.
   */
  #stripDomainOfSubdomains(domain) {
    let tld;
    // Can throw an exception if the input has too few domain levels.
    try {
      tld = Services.eTLD.getKnownPublicSuffixFromHost(domain);
    } catch (ex) {
      return "";
    }

    // What is left ends with the "." that preceded the suffix, so the label
    // directly before the suffix is the second to last piece after splitting.
    let domainWithoutTLD = domain.substring(0, domain.length - tld.length);
    let secondLevelDomain = domainWithoutTLD.split(".").at(-2);

    return secondLevelDomain ? `${secondLevelDomain}.${tld}` : "";
  }
}
export var SearchSERPTelemetry = new TelemetryHandler();
export var SearchSERPCategorization = new DomainCategorizer();

Просмотреть файл

@ -25,6 +25,12 @@ support-files =
serp.css
[browser_search_telemetry_categorization_timing.js]
[browser_search_telemetry_content.js]
[browser_search_telemetry_domain_categorization_extraction.js]
support-files =
searchTelemetryDomainExtraction.html
[browser_search_telemetry_domain_categorization_reporting.js]
support-files =
searchTelemetryDomainCategorizationReporting.html
[browser_search_telemetry_engagement_cached.js]
support-files =
cacheable.html

Просмотреть файл

@ -0,0 +1,224 @@
/* Any copyright is dedicated to the Public Domain.
http://creativecommons.org/publicdomain/zero/1.0/ */
"use strict";
/*
* This test ensures we are correctly extracting domains from a SERP.
*/
ChromeUtils.defineESModuleGetters(this, {
SearchSERPCategorization: "resource:///modules/SearchSERPTelemetry.sys.mjs",
SearchUtils: "resource://gre/modules/SearchUtils.sys.mjs",
});
// Each entry describes one extraction scenario against
// searchTelemetryDomainExtraction.html:
//   title           - logged before the scenario runs.
//   extractorInfos  - passed as-is to extractDomainsFromDocument.
//   expectedDomains - the domains expected back, in order.
// Except for the scenario that deliberately matches nothing, the selectors
// have to match an element in the fixture, otherwise a scenario would pass
// without exercising the code path it is named after.
const TESTS = [
  {
    title: "Extract domain from href (absolute URL) - one link.",
    extractorInfos: [
      {
        selectors:
          '#test1 [data-layout="organic"] a[data-testid="result-title-a"]',
        method: "href",
      },
    ],
    expectedDomains: ["foobar.com"],
  },
  {
    title: "Extract domain from href (absolute URL) - multiple links.",
    extractorInfos: [
      {
        selectors:
          '#test2 [data-layout="organic"] a[data-testid="result-title-a"]',
        method: "href",
      },
    ],
    expectedDomains: ["foo.com", "bar.com", "baz.com", "qux.com"],
  },
  {
    title: "Extract domain from href (relative URL).",
    extractorInfos: [
      {
        selectors:
          '#test3 [data-layout="organic"] a[data-testid="result-title-a"]',
        method: "href",
      },
    ],
    expectedDomains: ["example.org"],
  },
  {
    title: "Extract domain from data attribute - one link.",
    extractorInfos: [
      {
        selectors: "#test4 [data-dtld]",
        method: "data-attribute",
        options: {
          dataAttributeKey: "dtld",
        },
      },
    ],
    expectedDomains: ["www.abc.com"],
  },
  {
    title: "Extract domain from data attribute - multiple links.",
    extractorInfos: [
      {
        selectors: "#test5 [data-dtld]",
        method: "data-attribute",
        options: {
          dataAttributeKey: "dtld",
        },
      },
    ],
    expectedDomains: [
      "www.foo.com",
      "www.bar.com",
      "www.baz.com",
      "www.qux.com",
    ],
  },
  {
    title: "Extract domain from an href's query param value.",
    extractorInfos: [
      {
        selectors:
          '#test6 .js-carousel-item-title, #test6 [data-layout="ad"] [data-testid="result-title-a"]',
        method: "href",
        options: {
          queryParamKey: "ad_domain",
        },
      },
    ],
    expectedDomains: ["def.com"],
  },
  {
    title: "Extraction preserves order of domains within the page.",
    extractorInfos: [
      {
        selectors:
          '#test7 [data-layout="organic"] a[data-testid="result-title-a"]',
        method: "href",
      },
      {
        selectors: "#test7 [data-dtld]",
        method: "data-attribute",
        options: {
          dataAttributeKey: "dtld",
        },
      },
      {
        selectors:
          '#test7 .js-carousel-item-title, #test7 [data-layout="ad"] [data-testid="result-title-a"]',
        method: "href",
        options: {
          queryParamKey: "ad_domain",
        },
      },
    ],
    expectedDomains: ["foobar.com", "www.abc.com", "def.com"],
  },
  {
    // Deliberately targets markup that the fixture does not contain.
    title: "No elements match the selectors.",
    extractorInfos: [
      {
        selectors:
          '#test8 [data-layout="organic"] a[data-testid="result-title-a"]',
        method: "href",
      },
    ],
    expectedDomains: [],
  },
  {
    title: "Data attribute is present, but value is missing.",
    extractorInfos: [
      {
        selectors: "#test9 [data-dtld]",
        method: "data-attribute",
        options: {
          dataAttributeKey: "dtld",
        },
      },
    ],
    expectedDomains: [],
  },
  {
    title: "Query param is present, but value is missing.",
    extractorInfos: [
      {
        // The fixture's anchor only carries the carousel class, so that class
        // has to be part of the selector for the anchor to be examined.
        selectors:
          '#test10 .js-carousel-item-title, #test10 [data-layout="ad"] [data-testid="result-title-a"]',
        method: "href",
        options: {
          queryParamKey: "ad_domain",
        },
      },
    ],
    expectedDomains: [],
  },
  {
    title: "Non-standard URL scheme.",
    extractorInfos: [
      {
        // The fixture's anchor has no attributes besides its href, so it is
        // targeted by element type.
        selectors: "#test11 a",
        method: "href",
      },
    ],
    expectedDomains: [],
  },
];
// Shared setup: turns on search logging, SERP event telemetry and SERP
// categorization for this test, then initializes SearchSERPTelemetry.
add_setup(async function () {
await waitForIdle();
await SpecialPowers.pushPrefEnv({
set: [
["browser.search.log", true],
["browser.search.serpEventTelemetry.enabled", true],
["browser.search.serpEventTelemetryCategorization.enabled", true],
],
});
await SearchSERPTelemetry.init();
registerCleanupFunction(async () => {
// Called without arguments here, as opposed to with provider info.
// NOTE(review): presumably this restores the default provider info - confirm.
SearchSERPTelemetry.overrideSearchTelemetryForTests();
resetTelemetry();
});
});
/**
 * Opens the sample SERP once and runs every scenario in TESTS against it,
 * extracting the domains inside the content process and comparing them, in
 * order, with the scenario's expected domains.
 */
add_task(async function test_domain_extraction_heuristics() {
  resetTelemetry();

  const serpUrl = getSERPUrl("searchTelemetryDomainExtraction.html");
  info(
    "Load a sample SERP where domains need to be extracted in different ways."
  );
  const serpTab = await BrowserTestUtils.openNewForegroundTab(
    gBrowser,
    serpUrl
  );

  for (const scenario of TESTS) {
    if (scenario.title) {
      info(scenario.title);
    }

    const expected = new Set(scenario.expectedDomains);
    // The callback runs in the content process, so it may only rely on its
    // own argument and has to import the extractor itself.
    const actual = await SpecialPowers.spawn(
      gBrowser.selectedBrowser,
      [scenario.extractorInfos],
      infos => {
        const module = ChromeUtils.importESModule(
          "resource:///actors/SearchSERPTelemetryChild.sys.mjs"
        );
        return module.domainExtractor.extractDomainsFromDocument(
          content.document,
          infos
        );
      }
    );

    Assert.deepEqual(
      Array.from(actual),
      Array.from(expected),
      "Domains should have been extracted correctly."
    );
  }

  BrowserTestUtils.removeTab(serpTab);
});

Просмотреть файл

@ -0,0 +1,108 @@
/* Any copyright is dedicated to the Public Domain.
http://creativecommons.org/publicdomain/zero/1.0/ */
"use strict";
/*
* This test ensures we are correctly reporting categorized domains from a SERP.
*/
ChromeUtils.defineESModuleGetters(this, {
SearchSERPCategorization: "resource:///modules/SearchSERPTelemetry.sys.mjs",
SearchUtils: "resource://gre/modules/SearchUtils.sys.mjs",
sinon: "resource://testing-common/Sinon.sys.mjs",
});
// Provider info installed through
// SearchSERPTelemetry.overrideSearchTelemetryForTests() in the setup below.
const TEST_PROVIDER_INFO = [
{
telemetryId: "example",
searchPageRegexp:
/^https:\/\/example.org\/browser\/browser\/components\/search\/test\/browser\/telemetry\/searchTelemetry/,
queryParamName: "s",
codeParamName: "abc",
taggedCodes: ["ff"],
adServerAttributes: ["mozAttr"],
nonAdsLinkRegexps: [/^https:\/\/example.com/],
extraAdServersRegexps: [/^https:\/\/example\.com\/ad/],
// The search telemetry entry responsible for targeting the specific results.
// `ads` and `nonAds` are each a list of extractor infos: a CSS selector, the
// extraction method ("href" or "data-attribute") and the options for that
// method.
domainExtraction: {
ads: [
{
selectors: "[data-ad-domain]",
method: "data-attribute",
options: {
dataAttributeKey: "adDomain",
},
},
{
selectors: ".ad",
method: "href",
options: {
queryParamKey: "ad_domain",
},
},
],
nonAds: [
{
selectors: "#results .organic a",
method: "href",
},
],
},
components: [
{
type: SearchSERPTelemetryUtils.COMPONENTS.AD_LINK,
default: true,
},
],
},
];
let stub;
// Shared setup: installs the test provider info, turns on search logging,
// SERP event telemetry and SERP categorization, initializes
// SearchSERPTelemetry and replaces SearchSERPCategorization.logDomains with a
// stub so the test can inspect what would have been logged.
add_setup(async function () {
SearchSERPTelemetry.overrideSearchTelemetryForTests(TEST_PROVIDER_INFO);
await waitForIdle();
await SpecialPowers.pushPrefEnv({
set: [
["browser.search.log", true],
["browser.search.serpEventTelemetry.enabled", true],
["browser.search.serpEventTelemetryCategorization.enabled", true],
],
});
await SearchSERPTelemetry.init();
stub = sinon.stub(SearchSERPCategorization, "logDomains");
registerCleanupFunction(async () => {
// Put the real logDomains back before undoing the provider override.
stub.restore();
SearchSERPTelemetry.overrideSearchTelemetryForTests();
resetTelemetry();
});
});
/**
 * Loads the sample SERP, waits until its domains have been categorized and
 * then checks what was handed to the stubbed logDomains: the first call is
 * expected to carry the non-ad domains, the second call the ad domains.
 */
add_task(async function test_categorization_reporting() {
  resetTelemetry();

  const serpUrl = getSERPUrl(
    "searchTelemetryDomainCategorizationReporting.html"
  );
  info("Load a sample SERP with organic results.");

  // Start listening before the page loads so the notification is not missed.
  const domainsCategorized = waitForPageWithCategorizedDomains();
  const serpTab = await BrowserTestUtils.openNewForegroundTab(
    gBrowser,
    serpUrl
  );
  await domainsCategorized;

  // TODO: This needs to be refactored to actually test the reporting of the
  // categorization.
  const [nonAdDomains] = stub.getCall(0).args;
  Assert.deepEqual(
    Array.from(nonAdDomains),
    ["foobar.org"],
    "Categorization of non-ads should match."
  );

  const [adDomains] = stub.getCall(1).args;
  Assert.deepEqual(
    Array.from(adDomains),
    ["abc.org", "def.org"],
    "Categorization of ads should match."
  );

  BrowserTestUtils.removeTab(serpTab);
});

Просмотреть файл

@ -340,6 +340,22 @@ async function waitForPageWithAdImpressions() {
});
}
/**
 * Waits for the next "reported-page-with-categorized-domains" observer
 * notification.
 *
 * @returns {Promise<undefined>}
 *   Resolves once the notification has been observed. The observer removes
 *   itself before resolving.
 */
async function waitForPageWithCategorizedDomains() {
  const topic = "reported-page-with-categorized-domains";
  return new Promise(resolve => {
    function onCategorized() {
      Services.obs.removeObserver(onCategorized, topic);
      resolve();
    }
    Services.obs.addObserver(onCategorized, topic);
  });
}
async function promiseImpressionReceived() {
return TestUtils.waitForCondition(() => {
let adImpressions = Glean.serp.adImpression.testGetValue() ?? [];

Просмотреть файл

@ -0,0 +1,45 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Document</title>
</head>
<body>
<div id="results">
<!-- Don't include domains matching the provider. -->
<div class="organic">
<a href="https://www.example.com"></a>
<a href="https://example.com"></a>
</div>
<div class="organic">
<a href="https://www.foobar.org"></a>
</div>
<div data-ad-domain="abc.org">
<a href="https://www.example.com/"></a>
</div>
<div>
<a class="ad" href="https://www.example.com/?ad_domain=def.org"></a>
</div>
<!-- Don't throw on anchors with non-standard or non-existent hrefs -->
<div>
<a href="javascript:console.log('hello world')">A javascript: URL link</a>
</div>
<div>
<a>An anchor that's missing an href attribute</a>
</div>
<div>
<a href="#">An anchor with a dummy href attribute value</a>
</div>
</div>
<aside>
<div class="organic">
<a href="https://foobaz.com"></a>
</div>
</aside>
<div class="organic">
<!-- Should not find this because it's not part of the results -->
<a href="https://outside-results.ca"></a>
</div>
</body>
</html>

Просмотреть файл

@ -0,0 +1,45 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Document</title>
</head>
<body>
<div id="results">
<!-- Don't include domains matching the provider. -->
<div class="organic">
<a href="https://www.example.com"></a>
<a href="https://example.com"></a>
</div>
<div class="organic">
<a href="https://www.foobar.org"></a>
</div>
<div data-ad-domain="abc.org">
<a href="https://www.example.com/"></a>
</div>
<div>
<a class="ad" href="https://www.example.com/?ad_domain=def.org"></a>
</div>
<!-- Don't throw on anchors with non-standard or non-existent hrefs -->
<div>
<a href="javascript:console.log('hello world')">A javascript: URL link</a>
</div>
<div>
<a>An anchor that's missing an href attribute</a>
</div>
<div>
<a href="#">An anchor with a dummy href attribute value</a>
</div>
</div>
<aside>
<div class="organic">
<a href="https://foobaz.com"></a>
</div>
</aside>
<div class="organic">
<!-- Should not find this because it's not part of the results -->
<a href="https://outside-results.ca"></a>
</div>
</body>
</html>

Просмотреть файл

@ -0,0 +1,72 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Document</title>
</head>
<body>
<div id="results">
<div id="test1">
<div data-layout="organic">
<a href="https://foobar.com" data-testid="result-title-a">Extract domain from href (absolute URL).</a>
</div>
</div>
<div id="test2">
<div data-layout="organic">
<a href="https://foo.com" data-testid="result-title-a">Extract domain from href (absolute URL) - link1.</a>
<a href="https://bar.com" data-testid="result-title-a">Extract domain from href (absolute URL) - link2.</a>
<a href="https://baz.com" data-testid="result-title-a">Extract domain from href (absolute URL) - link3.</a>
<a href="https://qux.com" data-testid="result-title-a">Extract domain from href (absolute URL) - link4.</a>
</div>
</div>
<div id="test3">
<div data-layout="organic">
<a href="/dummy-page" data-testid="result-title-a">Extract domain from href (relative URL).</a>
</div>
</div>
<div id="test4">
<a href="#" data-dtld="www.abc.com">Extract domain from data attribute.</a>
</div>
<div id="test5">
<a href="#" data-dtld="www.foo.com">Extract domain from data attribute - link1.</a>
<a href="#" data-dtld="www.bar.com">Extract domain from data attribute - link2.</a>
<a href="#" data-dtld="www.baz.com">Extract domain from data attribute - link3.</a>
<a href="#" data-dtld="www.qux.com">Extract domain from data attribute - link4.</a>
</div>
<div id="test6">
<a href="example.com/testing?ad_domain=def.com" class="js-carousel-item-title">Extract domain from an href's query param value.</a>
</div>
<div id="test7">
<!-- Extraction preserves order of domains within the page. -->
<div data-layout="organic">
<a href="https://foobar.com" data-testid="result-title-a">Extract domain from href (absolute URL).</a>
<a href="#" data-dtld="www.abc.com">Extract domain from data attribute.</a>
<a href="example.com/testing?ad_domain=def.com" class="js-carousel-item-title">Extract domain from an href's query param value.</a>
</div>
</div>
<div id="test8">
<a href="nomatches.com">Link that doesn't match a selector.</a>
</div>
<div id="test9">
<a href="#" data-dtld="">Data attribute is present, but value is missing.</a>
</div>
<div id="test10">
<a href="example.com/testing?ad_domain=" class="js-carousel-item-title">Query param is present, but value is missing.</a>
</div>
<div id="test11">
<a href="git://testing.com/testrepo">Non-standard URL scheme.</a>
</div>
</div>
</body>
</html>

Просмотреть файл

@ -0,0 +1,90 @@
/* Any copyright is dedicated to the Public Domain.
http://creativecommons.org/publicdomain/zero/1.0/ */
/*
* This test ensures we are correctly processing the domains that have been
* extracted from a SERP.
*/
ChromeUtils.defineESModuleGetters(this, {
BrowserSearchTelemetry: "resource:///modules/BrowserSearchTelemetry.sys.mjs",
SearchSERPCategorization: "resource:///modules/SearchSERPTelemetry.sys.mjs",
SearchSERPTelemetry: "resource:///modules/SearchSERPTelemetry.sys.mjs",
SearchUtils: "resource://gre/modules/SearchUtils.sys.mjs",
sinon: "resource://testing-common/Sinon.sys.mjs",
});
// Links including the provider name are not extracted.
const PROVIDER = "example";
// Each entry describes one processDomains scenario:
//   title    - logged before the scenario runs.
//   domains  - the raw domains, handed to processDomains as a Set together
//              with PROVIDER.
//   expected - the processed domains expected back, in order.
const TESTS = [
{
title: "Domains matching the provider.",
domains: ["example.com", "www.example.com", "www.foobar.com"],
expected: ["foobar.com"],
},
{
title: "Second-level domains to a top-level domain.",
domains: [
"www.foobar.gc.ca",
"www.foobar.gov.uk",
"foobar.co.uk",
"www.foobar.co.il",
],
expected: ["foobar.gc.ca", "foobar.gov.uk", "foobar.co.uk", "foobar.co.il"],
},
{
title: "Long subdomain.",
domains: ["ab.cd.ef.gh.foobar.com"],
expected: ["foobar.com"],
},
{
// All inputs reduce to the same domain, so only one entry is expected.
title: "Same top-level domain.",
domains: ["foobar.com", "www.foobar.com", "abc.def.foobar.com"],
expected: ["foobar.com"],
},
{
title: "Empty input.",
domains: [""],
expected: [],
},
];
/**
 * Shared setup: enables search logging, SERP event telemetry and SERP
 * categorization, stubs BrowserSearchTelemetry.shouldRecordSearchCount and
 * initializes SearchSERPTelemetry.
 */
add_setup(async function () {
  const prefBranch = SearchUtils.BROWSER_SEARCH_PREF;
  const enabledPrefs = [
    "log",
    "serpEventTelemetry.enabled",
    "serpEventTelemetryCategorization.enabled",
  ];
  for (const prefName of enabledPrefs) {
    Services.prefs.setBoolPref(prefBranch + prefName, true);
  }

  // Required or else BrowserSearchTelemetry will throw.
  const shouldRecord = sinon.stub(
    BrowserSearchTelemetry,
    "shouldRecordSearchCount"
  );
  shouldRecord.returns(true);

  await SearchSERPTelemetry.init();
});
/**
 * Runs every scenario in TESTS through
 * SearchSERPCategorization.processDomains and compares the processed domains,
 * in order, with the scenario's expected domains.
 */
add_task(async function test_parsing_extracted_urls() {
  for (const scenario of TESTS) {
    const rawDomains = new Set(scenario.domains);
    if (scenario.title) {
      info(scenario.title);
    }

    const expected = new Set(scenario.expected);
    const processed = SearchSERPCategorization.processDomains(
      rawDomains,
      PROVIDER
    );

    Assert.deepEqual(
      Array.from(processed),
      Array.from(expected),
      "Domains should have been parsed correctly."
    );
  }
});

Просмотреть файл

@ -2,6 +2,7 @@
skip-if = toolkit == 'android' # bug 1730213
firefox-appdir = browser
[test_search_telemetry_categorization_process_domains.js]
[test_search_telemetry_config_validation.js]
support-files =
../../schema/search-telemetry-schema.json