зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1846357 - Extract domains for basic SERP links. r=jteow
Differential Revision: https://phabricator.services.mozilla.com/D185609
This commit is contained in:
Родитель
f8c85de251
Коммит
6bd56eddeb
|
@ -18,6 +18,13 @@ XPCOMUtils.defineLazyPreferenceGetter(
|
|||
false
|
||||
);
|
||||
|
||||
XPCOMUtils.defineLazyPreferenceGetter(
|
||||
lazy,
|
||||
"serpEventTelemetryCategorization",
|
||||
"browser.search.serpEventTelemetryCategorization.enabled",
|
||||
false
|
||||
);
|
||||
|
||||
const SHARED_DATA_KEY = "SearchTelemetry:ProviderInfo";
|
||||
export const ADLINK_CHECK_TIMEOUT_MS = 1000;
|
||||
|
||||
|
@ -848,6 +855,154 @@ class SearchAdImpression {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* An object indicating which elements to examine for domains to extract and
|
||||
* which heuristic technique to use to extract that element's domain.
|
||||
*
|
||||
* @typedef {object} ExtractorInfo
|
||||
* @property {string} selectors
|
||||
* A string representing the CSS selector that targets the elements on the
|
||||
* page that contain domains we want to extract.
|
||||
* @property {string} method
|
||||
* A string representing which domain extraction heuristic to use.
|
||||
* One of: "href" or "data-attribute".
|
||||
* @property {object | null} options
|
||||
* Options related to the domain extraction heuristic used.
|
||||
* @property {string | null} options.dataAttributeKey
|
||||
* The key name of the data attribute to lookup.
|
||||
* @property {string | null} options.queryParamKey
|
||||
* The key name of the query param value to lookup.
|
||||
*/
|
||||
|
||||
/**
|
||||
* DomainExtractor examines elements on a page to retrieve the domains.
|
||||
*/
|
||||
class DomainExtractor {
  /**
   * Extract domains from the page using an array of information pertaining to
   * the SERP.
   *
   * @param {Document} document
   *   The document for the SERP we are extracting domains from.
   * @param {Array<ExtractorInfo>} extractorInfos
   *   Information used to target the domains we need to extract.
   * @returns {Set<string>}
   *   A set of the domains extracted from the page. Empty when no extractor
   *   infos are supplied or nothing matched.
   */
  extractDomainsFromDocument(document, extractorInfos) {
    let extractedDomains = new Set();
    if (!extractorInfos?.length) {
      return extractedDomains;
    }

    for (let extractorInfo of extractorInfos) {
      if (!extractorInfo.selectors) {
        continue;
      }

      // querySelectorAll returns a (possibly empty) list rather than null, so
      // check the length to skip extractors that matched nothing.
      let elements = document.querySelectorAll(extractorInfo.selectors);
      if (!elements?.length) {
        continue;
      }

      switch (extractorInfo.method) {
        case "href": {
          // Origin is used in case a URL needs to be made absolute.
          let origin = new URL(document.documentURI).origin;
          this.#fromElementsConvertHrefsIntoDomains(
            elements,
            origin,
            extractedDomains,
            extractorInfo.options?.queryParamKey
          );
          break;
        }
        case "data-attribute": {
          this.#fromElementsRetrieveDataAttributeValues(
            elements,
            extractorInfo.options?.dataAttributeKey,
            extractedDomains
          );
          break;
        }
      }
    }

    return extractedDomains;
  }

  /**
   * Given a list of elements, extract domains using href attributes. If a
   * query param is specified, the domain will be that query param's value in
   * the href's URL (elements whose URL lacks a value for it are skipped).
   * Otherwise it will be the hostname of the href attribute's URL.
   *
   * @param {NodeList<Element>} elements
   *   A list of elements from the page whose href attributes we want to
   *   inspect.
   * @param {string} origin
   *   Origin of the current page.
   * @param {Set<string>} extractedDomains
   *   The result set of domains extracted from the page. Mutated in place.
   * @param {string | null} queryParam
   *   An optional query param to search for in an element's href attribute.
   */
  #fromElementsConvertHrefsIntoDomains(
    elements,
    origin,
    extractedDomains,
    queryParam
  ) {
    for (let element of elements) {
      let href = element.getAttribute("href");
      // A missing attribute is reported as null. Passing that to the URL
      // constructor would stringify it to "null" and resolve it against the
      // origin, wrongly yielding the SERP's own hostname.
      if (href == null) {
        continue;
      }

      let url;
      try {
        url = new URL(href, origin);
      } catch (ex) {
        // Unparseable hrefs are expected on arbitrary pages; skip them.
        continue;
      }

      // Ignore non-standard protocols.
      if (url.protocol != "https:" && url.protocol != "http:") {
        continue;
      }

      let domain = queryParam ? url.searchParams.get(queryParam) : url.hostname;
      if (domain) {
        // Set.add already ignores values that are present.
        extractedDomains.add(domain);
      }
    }
  }

  /**
   * Given a list of elements, examine each for the specified data attribute.
   * If found, add that data attribute's value to the result set of extracted
   * domains as is.
   *
   * @param {NodeList<Element>} elements
   *   A list of elements from the page whose data attributes we want to
   *   inspect.
   * @param {string} attribute
   *   The name of a data attribute to search for within an element.
   * @param {Set<string>} extractedDomains
   *   The result set of domains extracted from the page. Mutated in place.
   */
  #fromElementsRetrieveDataAttributeValues(
    elements,
    attribute,
    extractedDomains
  ) {
    // Without a key there is nothing to look up on any element.
    if (!attribute) {
      return;
    }

    for (let element of elements) {
      let value = element.dataset[attribute];
      if (value) {
        extractedDomains.add(value);
      }
    }
  }
}
|
||||
|
||||
export const domainExtractor = new DomainExtractor();
|
||||
const searchProviders = new SearchProviders();
|
||||
const searchAdImpression = new SearchAdImpression();
|
||||
|
||||
|
@ -967,6 +1122,34 @@ export class SearchSERPTelemetryChild extends JSWindowActorChild {
|
|||
});
|
||||
}
|
||||
}
|
||||
|
||||
if (
|
||||
lazy.serpEventTelemetryCategorization &&
|
||||
providerInfo.domainExtraction &&
|
||||
(eventType == "load" || eventType == "pageshow")
|
||||
) {
|
||||
let start = Cu.now();
|
||||
let nonAdDomains = domainExtractor.extractDomainsFromDocument(
|
||||
doc,
|
||||
providerInfo.domainExtraction.nonAds
|
||||
);
|
||||
let adDomains = domainExtractor.extractDomainsFromDocument(
|
||||
doc,
|
||||
providerInfo.domainExtraction.ads
|
||||
);
|
||||
|
||||
this.sendAsyncMessage("SearchTelemetry:Domains", {
|
||||
url,
|
||||
nonAdDomains,
|
||||
adDomains,
|
||||
});
|
||||
|
||||
ChromeUtils.addProfilerMarker(
|
||||
"SearchSERPTelemetryChild._checkForAdLink",
|
||||
start,
|
||||
"Extract domains from elements"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -29,6 +29,10 @@ export class SearchSERPTelemetryParent extends JSWindowActorParent {
|
|||
lazy.SearchSERPTelemetry.reportPageImpression(msg.data, browser);
|
||||
break;
|
||||
}
|
||||
case "SearchTelemetry:Domains": {
|
||||
lazy.SearchSERPTelemetry.reportPageDomains(msg.data, browser);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -724,6 +724,9 @@ pref("browser.search.serpEventTelemetry.enabled", true);
|
|||
pref("browser.search.serpEventTelemetry.enabled", false);
|
||||
#endif
|
||||
|
||||
// Enables search SERP telemetry page categorization.
|
||||
pref("browser.search.serpEventTelemetryCategorization.enabled", false);
|
||||
|
||||
// Enable new experimental shopping features. This is solely intended as a
|
||||
// rollout/"emergency stop" button - it will go away once the feature has
|
||||
// rolled out. There will be separate controls for user opt-in/opt-out.
|
||||
|
|
|
@ -38,6 +38,13 @@ XPCOMUtils.defineLazyPreferenceGetter(
|
|||
false
|
||||
);
|
||||
|
||||
XPCOMUtils.defineLazyPreferenceGetter(
|
||||
lazy,
|
||||
"serpEventTelemetryCategorization",
|
||||
"browser.search.serpEventTelemetryCategorization.enabled",
|
||||
false
|
||||
);
|
||||
|
||||
export var SearchSERPTelemetryUtils = {
|
||||
ACTIONS: {
|
||||
CLICKED: "clicked",
|
||||
|
@ -322,6 +329,10 @@ class TelemetryHandler {
|
|||
this._contentHandler._reportPageWithAdImpressions(info, browser);
|
||||
}
|
||||
|
||||
/**
 * Forwards the domains reported for a page to the content handler's
 * `_reportPageDomains`.
 *
 * @param {object} info
 *   The data describing the page and its extracted domains; passed through
 *   unchanged.
 * @param {object} browser
 *   The browser associated with the page; passed through unchanged.
 */
reportPageDomains(info, browser) {
|
||||
this._contentHandler._reportPageDomains(info, browser);
|
||||
}
|
||||
|
||||
reportPageImpression(info, browser) {
|
||||
this._contentHandler._reportPageImpression(info, browser);
|
||||
}
|
||||
|
@ -1307,6 +1318,138 @@ class ContentHandler {
|
|||
lazy.logConsole.debug("Could not find an impression id.");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Initiates the categorization and reporting of domains extracted from
|
||||
* SERPs.
|
||||
*
|
||||
* @param {object} info
|
||||
* The search provider infomation for the page.
|
||||
* @param {Set} info.nonAdDomains
|
||||
The non-ad domains extracted from the page.
|
||||
* @param {Set} info.adDomains
|
||||
The ad domains extracted from the page.
|
||||
* @param {object} browser
|
||||
* The browser associated with the page.
|
||||
*/
|
||||
_reportPageDomains(info, browser) {
|
||||
let item = this._findBrowserItemForURL(info.url);
|
||||
let telemetryState = item.browserTelemetryStateMap.get(browser);
|
||||
if (lazy.serpEventTelemetryCategorization && telemetryState) {
|
||||
let provider = item?.info.provider;
|
||||
if (provider) {
|
||||
SearchSERPCategorization.categorizeDomainsFromProvider(
|
||||
info.nonAdDomains,
|
||||
info.adDomains,
|
||||
provider
|
||||
);
|
||||
Services.obs.notifyObservers(
|
||||
null,
|
||||
"reported-page-with-categorized-domains"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Categorizes SERPs.
|
||||
*/
|
||||
class DomainCategorizer {
  /**
   * Categorizes domains extracted from SERPs. Non-ad domains are processed,
   * categorized and logged first, followed by ad domains.
   *
   * @param {Set} nonAdDomains
   *   The non-ad domains extracted from the page.
   * @param {Set} adDomains
   *   The ad domains extracted from the page.
   * @param {string} provider
   *   The provider associated with the page.
   */
  categorizeDomainsFromProvider(nonAdDomains, adDomains, provider) {
    nonAdDomains = this.processDomains(nonAdDomains, provider);
    this.applyCategorizationLogic(nonAdDomains, false);
    this.logDomains(nonAdDomains, false);

    adDomains = this.processDomains(adDomains, provider);
    this.applyCategorizationLogic(adDomains, true);
    this.logDomains(adDomains, true);
  }

  // TODO: insert logic from DS for reducing extracted domains to a single
  // category for the SERP.
  /**
   * Placeholder for the categorization step; currently a no-op.
   *
   * @param {Set} domains
   *   The processed domains for the page.
   * @param {boolean} areAdDomains
   *   Whether the domains came from ads.
   */
  applyCategorizationLogic(domains, areAdDomains) {}

  // TODO: replace this method once we know where to send the categorized
  // domains and overall SERP category.
  /**
   * Writes the given domains to the debug log. Nothing is logged for an
   * empty or missing set.
   *
   * @param {Set} domains
   *   The processed domains for the page.
   * @param {boolean} areAdDomains
   *   Whether the domains came from ads; selects the log prefix.
   */
  logDomains(domains, areAdDomains) {
    if (domains?.size) {
      lazy.logConsole.debug(
        areAdDomains ? "Ad Domains:" : "Domains:",
        ...domains
      );
    }
  }

  /**
   * Processes raw domains extracted from the SERP into their final form before
   * categorization. Domains that start with `<provider>.` or contain
   * `.<provider>.` are dropped, and the rest are reduced to their
   * second-level domain plus public suffix.
   *
   * @param {Set} domains
   *   The domains extracted from the page. A null or undefined value is
   *   treated as empty.
   * @param {string} provider
   *   The provider associated with the page.
   * @returns {Set} processedDomains
   *   The final set of processed domains for a page.
   */
  processDomains(domains, provider) {
    let processedDomains = new Set();

    // Tolerate a missing collection rather than throwing part-way through
    // categorization.
    for (let domain of domains ?? []) {
      // Don't include domains associated with the search provider.
      if (
        domain.startsWith(`${provider}.`) ||
        domain.includes(`.${provider}.`)
      ) {
        continue;
      }
      let domainWithoutSubdomains = this.#stripDomainOfSubdomains(domain);
      // We may have come across the same domain twice, once with www. prefixed
      // and another time without; the Set collapses those duplicates.
      if (domainWithoutSubdomains) {
        processedDomains.add(domainWithoutSubdomains);
      }
    }

    return processedDomains;
  }

  /**
   * Helper to strip domains of any subdomains.
   *
   * @param {string} domain
   *   The domain to strip of any subdomains.
   * @returns {string}
   *   The given domain with any subdomains removed, or an empty string when
   *   the public suffix lookup throws or no second-level label is found.
   */
  #stripDomainOfSubdomains(domain) {
    let tld;
    // Can throw an exception if the input has too few domain levels.
    try {
      tld = Services.eTLD.getKnownPublicSuffixFromHost(domain);
    } catch (ex) {
      return "";
    }

    // Removing the suffix leaves a trailing ".", so after splitting the last
    // entry is empty and the second-level label sits at index -2.
    let domainWithoutTLD = domain.substring(0, domain.length - tld.length);
    let secondLevelDomain = domainWithoutTLD.split(".").at(-2);

    return secondLevelDomain ? `${secondLevelDomain}.${tld}` : "";
  }
}
|
||||
|
||||
export var SearchSERPTelemetry = new TelemetryHandler();
|
||||
export var SearchSERPCategorization = new DomainCategorizer();
|
||||
|
|
|
@ -25,6 +25,12 @@ support-files =
|
|||
serp.css
|
||||
[browser_search_telemetry_categorization_timing.js]
|
||||
[browser_search_telemetry_content.js]
|
||||
[browser_search_telemetry_domain_categorization_extraction.js]
|
||||
support-files =
|
||||
searchTelemetryDomainExtraction.html
|
||||
[browser_search_telemetry_domain_categorization_reporting.js]
|
||||
support-files =
|
||||
searchTelemetryDomainCategorizationReporting.html
|
||||
[browser_search_telemetry_engagement_cached.js]
|
||||
support-files =
|
||||
cacheable.html
|
||||
|
|
|
@ -0,0 +1,224 @@
|
|||
/* Any copyright is dedicated to the Public Domain.
|
||||
http://creativecommons.org/publicdomain/zero/1.0/ */
|
||||
|
||||
"use strict";
|
||||
|
||||
/*
|
||||
* This test ensures we are correctly extracting domains from a SERP.
|
||||
*/
|
||||
|
||||
ChromeUtils.defineESModuleGetters(this, {
|
||||
SearchSERPCategorization: "resource:///modules/SearchSERPTelemetry.sys.mjs",
|
||||
SearchUtils: "resource://gre/modules/SearchUtils.sys.mjs",
|
||||
});
|
||||
|
||||
const TESTS = [
|
||||
{
|
||||
title: "Extract domain from href (absolute URL) - one link.",
|
||||
extractorInfos: [
|
||||
{
|
||||
selectors:
|
||||
'#test1 [data-layout="organic"] a[data-testid="result-title-a"]',
|
||||
method: "href",
|
||||
},
|
||||
],
|
||||
expectedDomains: ["foobar.com"],
|
||||
},
|
||||
{
|
||||
title: "Extract domain from href (absolute URL) - multiple links.",
|
||||
extractorInfos: [
|
||||
{
|
||||
selectors:
|
||||
'#test2 [data-layout="organic"] a[data-testid="result-title-a"]',
|
||||
method: "href",
|
||||
},
|
||||
],
|
||||
expectedDomains: ["foo.com", "bar.com", "baz.com", "qux.com"],
|
||||
},
|
||||
{
|
||||
title: "Extract domain from href (relative URL).",
|
||||
extractorInfos: [
|
||||
{
|
||||
selectors:
|
||||
'#test3 [data-layout="organic"] a[data-testid="result-title-a"]',
|
||||
method: "href",
|
||||
},
|
||||
],
|
||||
expectedDomains: ["example.org"],
|
||||
},
|
||||
{
|
||||
title: "Extract domain from data attribute - one link.",
|
||||
extractorInfos: [
|
||||
{
|
||||
selectors: "#test4 [data-dtld]",
|
||||
method: "data-attribute",
|
||||
options: {
|
||||
dataAttributeKey: "dtld",
|
||||
},
|
||||
},
|
||||
],
|
||||
expectedDomains: ["www.abc.com"],
|
||||
},
|
||||
{
|
||||
title: "Extract domain from data attribute - multiple links.",
|
||||
extractorInfos: [
|
||||
{
|
||||
selectors: "#test5 [data-dtld]",
|
||||
method: "data-attribute",
|
||||
options: {
|
||||
dataAttributeKey: "dtld",
|
||||
},
|
||||
},
|
||||
],
|
||||
expectedDomains: [
|
||||
"www.foo.com",
|
||||
"www.bar.com",
|
||||
"www.baz.com",
|
||||
"www.qux.com",
|
||||
],
|
||||
},
|
||||
{
|
||||
title: "Extract domain from an href's query param value.",
|
||||
extractorInfos: [
|
||||
{
|
||||
selectors:
|
||||
'#test6 .js-carousel-item-title, #test6 [data-layout="ad"] [data-testid="result-title-a"]',
|
||||
method: "href",
|
||||
options: {
|
||||
queryParamKey: "ad_domain",
|
||||
},
|
||||
},
|
||||
],
|
||||
expectedDomains: ["def.com"],
|
||||
},
|
||||
{
|
||||
title: "Extraction preserves order of domains within the page.",
|
||||
extractorInfos: [
|
||||
{
|
||||
selectors:
|
||||
'#test7 [data-layout="organic"] a[data-testid="result-title-a"]',
|
||||
method: "href",
|
||||
},
|
||||
{
|
||||
selectors: "#test7 [data-dtld]",
|
||||
method: "data-attribute",
|
||||
options: {
|
||||
dataAttributeKey: "dtld",
|
||||
},
|
||||
},
|
||||
{
|
||||
selectors:
|
||||
'#test7 .js-carousel-item-title, #test7 [data-layout="ad"] [data-testid="result-title-a"]',
|
||||
method: "href",
|
||||
options: {
|
||||
queryParamKey: "ad_domain",
|
||||
},
|
||||
},
|
||||
],
|
||||
expectedDomains: ["foobar.com", "www.abc.com", "def.com"],
|
||||
},
|
||||
{
|
||||
title: "No elements match the selectors.",
|
||||
extractorInfos: [
|
||||
{
|
||||
selectors:
|
||||
'#test8 [data-layout="organic"] a[data-testid="result-title-a"]',
|
||||
method: "href",
|
||||
},
|
||||
],
|
||||
expectedDomains: [],
|
||||
},
|
||||
{
|
||||
title: "Data attribute is present, but value is missing.",
|
||||
extractorInfos: [
|
||||
{
|
||||
selectors: "#test9 [data-dtld]",
|
||||
method: "data-attribute",
|
||||
options: {
|
||||
dataAttributeKey: "dtld",
|
||||
},
|
||||
},
|
||||
],
|
||||
expectedDomains: [],
|
||||
},
|
||||
{
|
||||
title: "Query param is present, but value is missing.",
|
||||
extractorInfos: [
|
||||
{
|
||||
selectors: '#test10 [data-layout="ad"] [data-testid="result-title-a"]',
|
||||
method: "href",
|
||||
options: {
|
||||
queryParamKey: "ad_domain",
|
||||
},
|
||||
},
|
||||
],
|
||||
expectedDomains: [],
|
||||
},
|
||||
{
|
||||
title: "Non-standard URL scheme.",
|
||||
extractorInfos: [
|
||||
{
|
||||
selectors:
|
||||
'#test11 [data-layout="organic"] a[data-testid="result-title-a"]',
|
||||
method: "href",
|
||||
},
|
||||
],
|
||||
expectedDomains: [],
|
||||
},
|
||||
];
|
||||
|
||||
add_setup(async function () {
|
||||
await waitForIdle();
|
||||
|
||||
await SpecialPowers.pushPrefEnv({
|
||||
set: [
|
||||
["browser.search.log", true],
|
||||
["browser.search.serpEventTelemetry.enabled", true],
|
||||
["browser.search.serpEventTelemetryCategorization.enabled", true],
|
||||
],
|
||||
});
|
||||
|
||||
await SearchSERPTelemetry.init();
|
||||
|
||||
registerCleanupFunction(async () => {
|
||||
SearchSERPTelemetry.overrideSearchTelemetryForTests();
|
||||
resetTelemetry();
|
||||
});
|
||||
});
|
||||
|
||||
add_task(async function test_domain_extraction_heuristics() {
|
||||
resetTelemetry();
|
||||
let url = getSERPUrl("searchTelemetryDomainExtraction.html");
|
||||
info(
|
||||
"Load a sample SERP where domains need to be extracted in different ways."
|
||||
);
|
||||
let tab = await BrowserTestUtils.openNewForegroundTab(gBrowser, url);
|
||||
|
||||
for (let currentTest of TESTS) {
|
||||
if (currentTest.title) {
|
||||
info(currentTest.title);
|
||||
}
|
||||
let expectedDomains = new Set(currentTest.expectedDomains);
|
||||
let actualDomains = await SpecialPowers.spawn(
|
||||
gBrowser.selectedBrowser,
|
||||
[currentTest.extractorInfos],
|
||||
extractorInfos => {
|
||||
const { domainExtractor } = ChromeUtils.importESModule(
|
||||
"resource:///actors/SearchSERPTelemetryChild.sys.mjs"
|
||||
);
|
||||
return domainExtractor.extractDomainsFromDocument(
|
||||
content.document,
|
||||
extractorInfos
|
||||
);
|
||||
}
|
||||
);
|
||||
|
||||
Assert.deepEqual(
|
||||
Array.from(actualDomains),
|
||||
Array.from(expectedDomains),
|
||||
"Domains should have been extracted correctly."
|
||||
);
|
||||
}
|
||||
|
||||
BrowserTestUtils.removeTab(tab);
|
||||
});
|
|
@ -0,0 +1,108 @@
|
|||
/* Any copyright is dedicated to the Public Domain.
|
||||
http://creativecommons.org/publicdomain/zero/1.0/ */
|
||||
|
||||
"use strict";
|
||||
|
||||
/*
|
||||
* This test ensures we are correctly reporting categorized domains from a SERP.
|
||||
*/
|
||||
|
||||
ChromeUtils.defineESModuleGetters(this, {
|
||||
SearchSERPCategorization: "resource:///modules/SearchSERPTelemetry.sys.mjs",
|
||||
SearchUtils: "resource://gre/modules/SearchUtils.sys.mjs",
|
||||
sinon: "resource://testing-common/Sinon.sys.mjs",
|
||||
});
|
||||
|
||||
const TEST_PROVIDER_INFO = [
|
||||
{
|
||||
telemetryId: "example",
|
||||
searchPageRegexp:
|
||||
/^https:\/\/example.org\/browser\/browser\/components\/search\/test\/browser\/telemetry\/searchTelemetry/,
|
||||
queryParamName: "s",
|
||||
codeParamName: "abc",
|
||||
taggedCodes: ["ff"],
|
||||
adServerAttributes: ["mozAttr"],
|
||||
nonAdsLinkRegexps: [/^https:\/\/example.com/],
|
||||
extraAdServersRegexps: [/^https:\/\/example\.com\/ad/],
|
||||
// The search telemetry entry responsible for targeting the specific results.
|
||||
domainExtraction: {
|
||||
ads: [
|
||||
{
|
||||
selectors: "[data-ad-domain]",
|
||||
method: "data-attribute",
|
||||
options: {
|
||||
dataAttributeKey: "adDomain",
|
||||
},
|
||||
},
|
||||
{
|
||||
selectors: ".ad",
|
||||
method: "href",
|
||||
options: {
|
||||
queryParamKey: "ad_domain",
|
||||
},
|
||||
},
|
||||
],
|
||||
nonAds: [
|
||||
{
|
||||
selectors: "#results .organic a",
|
||||
method: "href",
|
||||
},
|
||||
],
|
||||
},
|
||||
components: [
|
||||
{
|
||||
type: SearchSERPTelemetryUtils.COMPONENTS.AD_LINK,
|
||||
default: true,
|
||||
},
|
||||
],
|
||||
},
|
||||
];
|
||||
|
||||
let stub;
|
||||
add_setup(async function () {
|
||||
SearchSERPTelemetry.overrideSearchTelemetryForTests(TEST_PROVIDER_INFO);
|
||||
await waitForIdle();
|
||||
|
||||
await SpecialPowers.pushPrefEnv({
|
||||
set: [
|
||||
["browser.search.log", true],
|
||||
["browser.search.serpEventTelemetry.enabled", true],
|
||||
["browser.search.serpEventTelemetryCategorization.enabled", true],
|
||||
],
|
||||
});
|
||||
|
||||
await SearchSERPTelemetry.init();
|
||||
|
||||
stub = sinon.stub(SearchSERPCategorization, "logDomains");
|
||||
|
||||
registerCleanupFunction(async () => {
|
||||
stub.restore();
|
||||
SearchSERPTelemetry.overrideSearchTelemetryForTests();
|
||||
resetTelemetry();
|
||||
});
|
||||
});
|
||||
|
||||
add_task(async function test_categorization_reporting() {
|
||||
resetTelemetry();
|
||||
let url = getSERPUrl("searchTelemetryDomainCategorizationReporting.html");
|
||||
info("Load a sample SERP with organic results.");
|
||||
let promise = waitForPageWithCategorizedDomains();
|
||||
let tab = await BrowserTestUtils.openNewForegroundTab(gBrowser, url);
|
||||
await promise;
|
||||
|
||||
// TODO: This needs to be refactored to actually test the reporting of the
|
||||
// categorization.
|
||||
Assert.deepEqual(
|
||||
Array.from(stub.getCall(0).args[0]),
|
||||
["foobar.org"],
|
||||
"Categorization of non-ads should match."
|
||||
);
|
||||
|
||||
Assert.deepEqual(
|
||||
Array.from(stub.getCall(1).args[0]),
|
||||
["abc.org", "def.org"],
|
||||
"Categorization of ads should match."
|
||||
);
|
||||
|
||||
BrowserTestUtils.removeTab(tab);
|
||||
});
|
|
@ -340,6 +340,22 @@ async function waitForPageWithAdImpressions() {
|
|||
});
|
||||
}
|
||||
|
||||
/**
 * Waits for the next "reported-page-with-categorized-domains" observer
 * notification. The observer is removed as soon as it fires.
 *
 * @returns {Promise<undefined>}
 *   Resolves (with no value) once the notification has been observed.
 */
async function waitForPageWithCategorizedDomains() {
  // Single source of truth so add/remove can never disagree on the topic.
  const topic = "reported-page-with-categorized-domains";
  return new Promise(resolve => {
    let listener = () => {
      Services.obs.removeObserver(listener, topic);
      resolve();
    };
    Services.obs.addObserver(listener, topic);
  });
}
|
||||
|
||||
async function promiseImpressionReceived() {
|
||||
return TestUtils.waitForCondition(() => {
|
||||
let adImpressions = Glean.serp.adImpression.testGetValue() ?? [];
|
||||
|
|
|
@ -0,0 +1,45 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Document</title>
|
||||
</head>
|
||||
<body>
|
||||
<div id="results">
|
||||
<!-- Don't include domains matching the provider. -->
|
||||
<div class="organic">
|
||||
<a href="https://www.example.com"></a>
|
||||
<a href="https://example.com"></a>
|
||||
</div>
|
||||
<div class="organic">
|
||||
<a href="https://www.foobar.org"></a>
|
||||
</div>
|
||||
<div data-ad-domain="abc.org">
|
||||
<a href="https://www.example.com/"></a>
|
||||
</div>
|
||||
<div>
|
||||
<a class="ad" href="https://www.example.com/?ad_domain=def.org"></a>
|
||||
</div>
|
||||
<!-- Don't throw on anchors with non-standard or non-existent hrefs -->
|
||||
<div>
|
||||
<a href="javascript:console.log('hello world')">A javascript: URL link</a>
|
||||
</div>
|
||||
<div>
|
||||
<a>An anchor that's missing an href attribute</a>
|
||||
</div>
|
||||
<div>
|
||||
<a href="#">An anchor with a dummy href attribute value</a>
|
||||
</div>
|
||||
</div>
|
||||
<aside>
|
||||
<div class="organic">
|
||||
<a href="https://foobaz.com"></a>
|
||||
</div>
|
||||
</aside>
|
||||
<div class="organic">
|
||||
<!-- Should not find this because it's not part of the results -->
|
||||
<a href="https://outside-results.ca"></a>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,45 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Document</title>
|
||||
</head>
|
||||
<body>
|
||||
<div id="results">
|
||||
<!-- Don't include domains matching the provider. -->
|
||||
<div class="organic">
|
||||
<a href="https://www.example.com"></a>
|
||||
<a href="https://example.com"></a>
|
||||
</div>
|
||||
<div class="organic">
|
||||
<a href="https://www.foobar.org"></a>
|
||||
</div>
|
||||
<div data-ad-domain="abc.org">
|
||||
<a href="https://www.example.com/"></a>
|
||||
</div>
|
||||
<div>
|
||||
<a class="ad" href="https://www.example.com/?ad_domain=def.org"></a>
|
||||
</div>
|
||||
<!-- Don't throw on anchors with non-standard or non-existent hrefs -->
|
||||
<div>
|
||||
<a href="javascript:console.log('hello world')">A javascript: URL link</a>
|
||||
</div>
|
||||
<div>
|
||||
<a>An anchor that's missing an href attribute</a>
|
||||
</div>
|
||||
<div>
|
||||
<a href="#">An anchor with a dummy href attribute value</a>
|
||||
</div>
|
||||
</div>
|
||||
<aside>
|
||||
<div class="organic">
|
||||
<a href="https://foobaz.com"></a>
|
||||
</div>
|
||||
</aside>
|
||||
<div class="organic">
|
||||
<!-- Should not find this because it's not part of the results -->
|
||||
<a href="https://outside-results.ca"></a>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,72 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Document</title>
|
||||
</head>
|
||||
<body>
|
||||
<div id="results">
|
||||
<div id="test1">
|
||||
<div data-layout="organic">
|
||||
<a href="https://foobar.com" data-testid="result-title-a">Extract domain from href (absolute URL).</a>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="test2">
|
||||
<div data-layout="organic">
|
||||
<a href="https://foo.com" data-testid="result-title-a">Extract domain from href (absolute URL) - link1.</a>
|
||||
<a href="https://bar.com" data-testid="result-title-a">Extract domain from href (absolute URL) - link2.</a>
|
||||
<a href="https://baz.com" data-testid="result-title-a">Extract domain from href (absolute URL) - link3.</a>
|
||||
<a href="https://qux.com" data-testid="result-title-a">Extract domain from href (absolute URL) - link4.</a>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="test3">
|
||||
<div data-layout="organic">
|
||||
<a href="/dummy-page" data-testid="result-title-a">Extract domain from href (relative URL).</a>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="test4">
|
||||
<a href="#" data-dtld="www.abc.com">Extract domain from data attribute.</a>
|
||||
</div>
|
||||
|
||||
<div id="test5">
|
||||
<a href="#" data-dtld="www.foo.com">Extract domain from data attribute - link1.</a>
|
||||
<a href="#" data-dtld="www.bar.com">Extract domain from data attribute - link2.</a>
|
||||
<a href="#" data-dtld="www.baz.com">Extract domain from data attribute - link3.</a>
|
||||
<a href="#" data-dtld="www.qux.com">Extract domain from data attribute - link4.</a>
|
||||
</div>
|
||||
|
||||
<div id="test6">
|
||||
<a href="example.com/testing?ad_domain=def.com" class="js-carousel-item-title">Extract domain from an href's query param value.</a>
|
||||
</div>
|
||||
|
||||
<div id="test7">
|
||||
<!-- Extraction preserves order of domains within the page. -->
|
||||
<div data-layout="organic">
|
||||
<a href="https://foobar.com" data-testid="result-title-a">Extract domain from href (absolute URL).</a>
|
||||
<a href="#" data-dtld="www.abc.com">Extract domain from data attribute.</a>
|
||||
<a href="example.com/testing?ad_domain=def.com" class="js-carousel-item-title">Extract domain from an href's query param value.</a>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="test8">
|
||||
<a href="nomatches.com">Link that doesn't match a selector.</a>
|
||||
</div>
|
||||
|
||||
<div id="test9">
|
||||
<a href="#" data-dtld="">Data attribute is present, but value is missing.</a>
|
||||
</div>
|
||||
|
||||
<div id="test10">
|
||||
<a href="example.com/testing?ad_domain=" class="js-carousel-item-title">Query param is present, but value is missing.</a>
|
||||
</div>
|
||||
|
||||
<div id="test11">
|
||||
<a href="git://testing.com/testrepo">Non-standard URL scheme.</a>
|
||||
</div>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,90 @@
|
|||
/* Any copyright is dedicated to the Public Domain.
|
||||
http://creativecommons.org/publicdomain/zero/1.0/ */
|
||||
|
||||
/*
|
||||
* This test ensures we are correctly processing the domains that have been
|
||||
* extracted from a SERP.
|
||||
*/
|
||||
|
||||
ChromeUtils.defineESModuleGetters(this, {
|
||||
BrowserSearchTelemetry: "resource:///modules/BrowserSearchTelemetry.sys.mjs",
|
||||
SearchSERPCategorization: "resource:///modules/SearchSERPTelemetry.sys.mjs",
|
||||
SearchSERPTelemetry: "resource:///modules/SearchSERPTelemetry.sys.mjs",
|
||||
SearchUtils: "resource://gre/modules/SearchUtils.sys.mjs",
|
||||
sinon: "resource://testing-common/Sinon.sys.mjs",
|
||||
});
|
||||
|
||||
// Links including the provider name are not extracted.
|
||||
const PROVIDER = "example";
|
||||
|
||||
const TESTS = [
|
||||
{
|
||||
title: "Domains matching the provider.",
|
||||
domains: ["example.com", "www.example.com", "www.foobar.com"],
|
||||
expected: ["foobar.com"],
|
||||
},
|
||||
{
|
||||
title: "Second-level domains to a top-level domain.",
|
||||
domains: [
|
||||
"www.foobar.gc.ca",
|
||||
"www.foobar.gov.uk",
|
||||
"foobar.co.uk",
|
||||
"www.foobar.co.il",
|
||||
],
|
||||
expected: ["foobar.gc.ca", "foobar.gov.uk", "foobar.co.uk", "foobar.co.il"],
|
||||
},
|
||||
{
|
||||
title: "Long subdomain.",
|
||||
domains: ["ab.cd.ef.gh.foobar.com"],
|
||||
expected: ["foobar.com"],
|
||||
},
|
||||
{
|
||||
title: "Same top-level domain.",
|
||||
domains: ["foobar.com", "www.foobar.com", "abc.def.foobar.com"],
|
||||
expected: ["foobar.com"],
|
||||
},
|
||||
{
|
||||
title: "Empty input.",
|
||||
domains: [""],
|
||||
expected: [],
|
||||
},
|
||||
];
|
||||
|
||||
add_setup(async function () {
|
||||
Services.prefs.setBoolPref(SearchUtils.BROWSER_SEARCH_PREF + "log", true);
|
||||
Services.prefs.setBoolPref(
|
||||
SearchUtils.BROWSER_SEARCH_PREF + "serpEventTelemetry.enabled",
|
||||
true
|
||||
);
|
||||
Services.prefs.setBoolPref(
|
||||
SearchUtils.BROWSER_SEARCH_PREF +
|
||||
"serpEventTelemetryCategorization.enabled",
|
||||
true
|
||||
);
|
||||
|
||||
// Required or else BrowserSearchTelemetry will throw.
|
||||
sinon.stub(BrowserSearchTelemetry, "shouldRecordSearchCount").returns(true);
|
||||
await SearchSERPTelemetry.init();
|
||||
});
|
||||
|
||||
// Runs every entry of TESTS through SearchSERPCategorization.processDomains
// and compares the processed domains, in order, with the expected list.
add_task(async function test_parsing_extracted_urls() {
  // Iterate the cases directly, matching the loop style of the browser
  // extraction test, rather than indexing into the array by hand.
  for (let currentTest of TESTS) {
    if (currentTest.title) {
      info(currentTest.title);
    }

    let domains = new Set(currentTest.domains);
    let expectedDomains = new Set(currentTest.expected);
    let actualDomains = SearchSERPCategorization.processDomains(
      domains,
      PROVIDER
    );

    // Compare as arrays so insertion order is checked too.
    Assert.deepEqual(
      Array.from(actualDomains),
      Array.from(expectedDomains),
      "Domains should have been parsed correctly."
    );
  }
});
|
|
@ -2,6 +2,7 @@
|
|||
skip-if = toolkit == 'android' # bug 1730213
|
||||
firefox-appdir = browser
|
||||
|
||||
[test_search_telemetry_categorization_process_domains.js]
|
||||
[test_search_telemetry_config_validation.js]
|
||||
support-files =
|
||||
../../schema/search-telemetry-schema.json
|
||||
|
|
Загрузка…
Ссылка в новой задаче