Bug 1850000 - Add listener and in-memory storage of Search Categorization collection - r=Standard8,leplatrem

Differential Revision: https://phabricator.services.mozilla.com/D186805
This commit is contained in:
James Teow 2023-09-18 13:42:53 +00:00
Родитель daa2b75445
Коммит d1761aed79
8 изменённых файлов: 732 добавлений и 0 удалений

Просмотреть файл

@ -73,6 +73,8 @@ ChromeUtils.defineESModuleGetters(lazy, {
Sanitizer: "resource:///modules/Sanitizer.sys.mjs",
SaveToPocket: "chrome://pocket/content/SaveToPocket.sys.mjs",
ScreenshotsUtils: "resource:///modules/ScreenshotsUtils.sys.mjs",
SearchSERPDomainToCategoriesMap:
"resource:///modules/SearchSERPTelemetry.sys.mjs",
SearchSERPTelemetry: "resource:///modules/SearchSERPTelemetry.sys.mjs",
SessionStartup: "resource:///modules/sessionstore/SessionStartup.sys.mjs",
SessionStore: "resource:///modules/sessionstore/SessionStore.sys.mjs",
@ -2998,6 +3000,13 @@ BrowserGlue.prototype = {
},
},
{
name: "SearchSERPDomainToCategoriesMap.init",
task: () => {
lazy.SearchSERPDomainToCategoriesMap.init().catch(console.error);
},
},
{
name: "browser-startup-idle-tasks-finished",
task: () => {

Просмотреть файл

@ -12,6 +12,10 @@ ChromeUtils.defineESModuleGetters(lazy, {
SearchUtils: "resource://gre/modules/SearchUtils.sys.mjs",
});
// Registers `lazy.gCryptoHash` as a lazily created nsICryptoHash instance
// (the factory below is what produces the instance).
ChromeUtils.defineLazyGetter(lazy, "gCryptoHash", () =>
  Cc["@mozilla.org/security/hash;1"].createInstance(Ci.nsICryptoHash)
);
// The various histograms and scalars that we report to.
const SEARCH_CONTENT_SCALAR_BASE = "browser.search.content.";
const SEARCH_WITH_ADS_SCALAR_BASE = "browser.search.withads.";
@ -21,6 +25,7 @@ const SEARCH_TELEMETRY_PRIVATE_BROWSING_KEY_SUFFIX = "pb";
// Exported for tests.
export const TELEMETRY_SETTINGS_KEY = "search-telemetry-v2";
export const TELEMETRY_CATEGORIZATION_KEY = "search-categorization";
const impressionIdsWithoutEngagementsSet = new Set();
@ -1502,5 +1507,274 @@ class DomainCategorizer {
}
}
/**
* @typedef {object} DomainToCategoriesRecord
* @property {number} version
* The version of the record.
*/
/**
* @typedef {object} DomainCategoryScore
* @property {number} category
* The index of the category.
* @property {number} score
* The score associated with the category.
*/
/**
* Maps domain to categories, with data synced with Remote Settings.
*/
class DomainToCategoriesMap {
  /**
   * Contains the raw domain-to-category data. Keys are the hashes produced
   * in `get()`; values are flat arrays of alternating category and score,
   * e.g. `[category1, score1, category2, score2]`.
   *
   * @type {Object<string, Array<number>> | null}
   */
  #map = null;

  /**
   * Latest version number of the attachments.
   *
   * @type {number | null}
   */
  #version = null;

  /**
   * The Remote Settings client.
   *
   * @type {object | null}
   */
  #client = null;

  /**
   * Whether `init()` has run and the Remote Settings listener is attached.
   *
   * @type {boolean}
   */
  #init = false;

  /**
   * Callback when Remote Settings syncs.
   *
   * @type {Function | null}
   */
  #onSettingsSync = null;

  /**
   * Incremented every time the stored data is invalidated (a new populate
   * starts or the map is uninitialized). Asynchronous work captures the value
   * it started with and bails out if it no longer matches, so stale downloads
   * and stale idle callbacks never write into the map.
   *
   * @type {number}
   */
  #generation = 0;

  /**
   * Initializes the map with local attachments and creates a listener for
   * updates to Remote Settings in case the mappings are updated while the
   * client is on. Does nothing if categorization is disabled or if the map
   * is already initialized.
   */
  async init() {
    if (!lazy.serpEventTelemetryCategorization || this.#init) {
      return;
    }
    this.#init = true;
    lazy.logConsole.debug("Domain-to-categories map is initializing.");

    this.#client = lazy.RemoteSettings(TELEMETRY_CATEGORIZATION_KEY);
    this.#onSettingsSync = event => this.#sync(event.data);
    this.#client.on("sync", this.#onSettingsSync);

    let records = await this.#client.get();
    await this.#clearAndPopulateMap(records);
  }

  /**
   * Drops all stored data, detaches the Remote Settings listener and cancels
   * any population work that is still pending.
   */
  uninit() {
    lazy.logConsole.debug("Uninitializing domain-to-categories map.");
    if (this.#init) {
      // Invalidate in-flight downloads and queued idle callbacks.
      this.#generation++;
      this.#map = null;
      this.#version = null;
      this.#client.off("sync", this.#onSettingsSync);
      this.#client = null;
      this.#onSettingsSync = null;
      this.#init = false;
    }
  }

  /**
   * Given a domain, find categories and relevant scores.
   *
   * @param {string} domain Domain to lookup.
   * @returns {Array<DomainCategoryScore>}
   *   An array containing categories and their respective score. If no record
   *   for the domain is available, return an empty array.
   */
  get(domain) {
    if (this.empty) {
      return [];
    }
    lazy.gCryptoHash.init(lazy.gCryptoHash.MD5);
    let bytes = new TextEncoder().encode(domain);
    // The length must be the number of encoded bytes. `domain.length` counts
    // UTF-16 code units and is smaller than the byte count for non-ASCII
    // domains, which would hash a truncated input.
    lazy.gCryptoHash.update(bytes, bytes.length);
    let hash = lazy.gCryptoHash.finish(true);

    let rawValues = this.#map[hash] ?? [];
    let output = [];
    // Transform data into a more readable format.
    // [x, y] => { category: x, score: y }
    for (let i = 0; i < rawValues.length; i += 2) {
      output.push({ category: rawValues[i], score: rawValues[i + 1] });
    }
    return output;
  }

  /**
   * The version number for the data, or null if no versioned data has been
   * loaded. The version number is determined by the record with the highest
   * version number.
   *
   * @returns {null | number} The version number.
   */
  get version() {
    return this.#version;
  }

  /**
   * Whether the map is empty of data.
   *
   * @returns {boolean}
   */
  get empty() {
    return !this.#map;
  }

  /**
   * Inspects a list of records from the categorization domain bucket and
   * finds the highest version number among them.
   *
   * @param {Array<DomainToCategoriesRecord>} records
   *   An array containing the records from a Remote Settings collection.
   * @returns {number}
   *   The highest version found, or 0 if no record has a version above 0.
   */
  #retrieveLatestVersion(records) {
    return records.reduce(
      (latest, record) => (record.version > latest ? record.version : latest),
      0
    );
  }

  /**
   * Callback when Remote Settings has indicated the collection has been
   * synced. Since the records in the collection will be updated all at once,
   * use the array of current records which at this point in time would have
   * the latest records from Remote Settings. Additionally, delete any
   * attachment for records that no longer exist.
   *
   * @param {object} data
   *   Object containing records that are current, deleted, created, or
   *   updated.
   */
  async #sync(data) {
    lazy.logConsole.debug("Syncing domain-to-categories with Remote Settings.");
    let client = this.#client;
    if (!client) {
      // The map was uninitialized while the event was being delivered.
      return;
    }

    // Remove local files of deleted records. Tolerate a payload without a
    // `deleted` list instead of throwing.
    let toDelete = data?.deleted?.filter(d => d.attachment) ?? [];
    await Promise.all(
      toDelete.map(record => client.attachments.deleteDownloaded(record))
    );

    // Await so that failures surface to the emitter instead of becoming an
    // unhandled rejection.
    await this.#clearAndPopulateMap(data?.current);
  }

  /**
   * Clears the existing map and version, then repopulates the map from the
   * attachments of the given records. Attachments of all given records are
   * merged, in order, regardless of their individual version; the reported
   * version is the highest one found. If there are no records, a download
   * fails, or no record has a version, the map stays empty and the version
   * stays null.
   *
   * @param {Array<DomainToCategoriesRecord>} records
   *   The records containing attachments.
   */
  async #clearAndPopulateMap(records) {
    let generation = ++this.#generation;

    // Set map to null so that if there are errors in the downloads, consumers
    // will be able to know whether the map has information. Once we've
    // successfully downloaded attachments and are parsing them, a non-null
    // object will be created.
    this.#map = null;
    this.#version = null;

    if (!records?.length) {
      lazy.logConsole.debug("No records found for domain-to-categories map.");
      return;
    }

    let client = this.#client;
    if (!client) {
      // uninit() was called before we got here.
      return;
    }

    let fileContents = [];
    for (let record of records) {
      let result;
      // Downloading attachments can fail.
      try {
        result = await client.attachments.download(record);
      } catch (ex) {
        lazy.logConsole.error("Could not download file:", ex);
        return;
      }
      if (generation !== this.#generation) {
        // A newer populate or an uninit superseded this one while waiting.
        return;
      }
      fileContents.push(result.buffer);
    }

    let version = this.#retrieveLatestVersion(records);
    if (!version) {
      lazy.logConsole.debug("Could not find a version number for any record.");
      return;
    }
    this.#version = version;

    // Queue the series of assignments.
    for (let i = 0; i < fileContents.length; ++i) {
      let buffer = fileContents[i];
      Services.tm.idleDispatchToMainThread(() => {
        if (generation !== this.#generation) {
          // Stale callback: the data it carries has been superseded.
          return;
        }
        let start = Cu.now();
        let json;
        try {
          json = JSON.parse(new TextDecoder().decode(buffer));
        } catch (ex) {
          // TODO: If there was an error decoding the buffer, we may want to
          // dispatch an error in telemetry or try again.
          lazy.logConsole.error(
            "Could not parse domain-to-categories attachment:",
            ex
          );
          return;
        }
        ChromeUtils.addProfilerMarker(
          "SearchSERPTelemetry.#clearAndPopulateMap",
          start,
          "Convert buffer to JSON."
        );
        if (!this.#map) {
          this.#map = {};
        }
        Object.assign(this.#map, json);
        lazy.logConsole.debug("Updated domain-to-categories map.");
        if (i === fileContents.length - 1) {
          Services.obs.notifyObservers(
            null,
            "domain-to-categories-map-update-complete"
          );
        }
      });
    }
  }
}
export var SearchSERPDomainToCategoriesMap = new DomainToCategoriesMap();
export var SearchSERPTelemetry = new TelemetryHandler();
export var SearchSERPCategorization = new DomainCategorizer();

Просмотреть файл

@ -0,0 +1,3 @@
{
"Wrq9YDsieAMC3Y2DSY5Rcg==": [1, 100]
}

Просмотреть файл

@ -0,0 +1,3 @@
{
"G99y4E1rUMgqSMfk3TjMaQ==": [2, 90]
}

Просмотреть файл

@ -0,0 +1,3 @@
{
"Wrq9YDsieAMC3Y2DSY5Rcg==": [1, 80]
}

Просмотреть файл

@ -0,0 +1,3 @@
{
"G99y4E1rUMgqSMfk3TjMaQ==": [2, 50, 4, 80]
}

Просмотреть файл

@ -0,0 +1,431 @@
/* Any copyright is dedicated to the Public Domain.
http://creativecommons.org/publicdomain/zero/1.0/ */
/**
* Tests the integration of Remote Settings with SERP domain categorization.
*/
"use strict";
ChromeUtils.defineESModuleGetters(this, {
RemoteSettings: "resource://services-settings/remote-settings.sys.mjs",
SearchSERPDomainToCategoriesMap:
"resource:///modules/SearchSERPTelemetry.sys.mjs",
TELEMETRY_CATEGORIZATION_KEY:
"resource:///modules/SearchSERPTelemetry.sys.mjs",
TestUtils: "resource://testing-common/TestUtils.sys.mjs",
});
/**
 * Returns the result of observing the notification that is sent once the
 * domain-to-categories map has finished applying its queued updates.
 *
 * @returns {Promise} Whatever `TestUtils.topicObserved` resolves with.
 */
async function waitForDomainToCategoriesUpdate() {
  const updateCompleteTopic = "domain-to-categories-map-update-complete";
  return TestUtils.topicObserved(updateCompleteTopic);
}
/**
 * Builds a Remote Settings style record for a fixture file located in the
 * test's working directory and stores the record together with the file
 * contents in `client.attachments.cacheImpl`, keyed by the record id.
 *
 * @param {object} options
 * @param {string} options.id The record id, also used as the cache key.
 * @param {number} options.version The version to put on the record.
 * @param {string} options.filename Name of the fixture file to read.
 * @returns {Promise<object>} The record that was created and cached.
 */
async function mockRecordWithCachedAttachment({ id, version, filename }) {
  // The file bytes are needed both for the hash and the size metadata.
  const filePath = PathUtils.join(do_get_cwd().path, filename);
  const contents = await IOUtils.readUTF8(filePath);
  const buffer = new TextEncoder().encode(contents).buffer;

  const inputStream = Cc[
    "@mozilla.org/io/arraybuffer-input-stream;1"
  ].createInstance(Ci.nsIArrayBufferInputStream);
  inputStream.setData(buffer, 0, buffer.byteLength);

  // Hash the contents with SHA-256 and hex-encode the raw digest, two hex
  // digits per character of the digest string.
  const hasher = Cc["@mozilla.org/security/hash;1"].createInstance(
    Ci.nsICryptoHash
  );
  hasher.init(Ci.nsICryptoHash.SHA256);
  hasher.updateFromStream(inputStream, -1);
  const digest = hasher.finish(false);
  let hash = "";
  for (let index = 0; index < digest.length; index++) {
    hash += digest.charCodeAt(index).toString(16).padStart(2, "0");
  }

  const attachment = {
    hash,
    location: `main-workspace/search-categorization/${filename}`,
    filename,
    size: buffer.byteLength,
    mimetype: "application/json",
  };
  const record = { id, version, attachment };

  const cacheEntry = { record, blob: new Blob([buffer]) };
  client.attachments.cacheImpl.set(id, cacheEntry);

  return record;
}
/**
 * Returns a freshly generated UUID string with its first and last characters
 * removed (presumably the surrounding braces; verify against the UUID
 * generator's output format).
 *
 * @returns {string}
 */
function generateRecordId() {
  const { number } = Services.uuid.generateUUID();
  return number.slice(1, -1);
}

// Two record ids shared by the fixtures below: the "a" and "b" fixtures of
// each version reuse the same id.
const RECORD_A_ID = generateRecordId();
const RECORD_B_ID = generateRecordId();

// Remote Settings client for the categorization collection and its database.
const client = RemoteSettings(TELEMETRY_CATEGORIZATION_KEY);
const { db } = client;

// Descriptions of the mock records, in the shape accepted by
// mockRecordWithCachedAttachment().
const RECORDS = {
  record1a: {
    id: RECORD_A_ID,
    version: 1,
    filename: "domain_category_mappings_1a.json",
  },
  record1b: {
    id: RECORD_B_ID,
    version: 1,
    filename: "domain_category_mappings_1b.json",
  },
  record2a: {
    id: RECORD_A_ID,
    version: 2,
    filename: "domain_category_mappings_2a.json",
  },
  record2b: {
    id: RECORD_B_ID,
    version: 2,
    filename: "domain_category_mappings_2b.json",
  },
};
// One-time setup: turn on search logging and SERP categorization, make sure
// a profile exists, and start from an empty Remote Settings database.
add_setup(async () => {
  const enabledPrefs = [
    "browser.search.log",
    "browser.search.serpEventTelemetryCategorization.enabled",
  ];
  for (const prefName of enabledPrefs) {
    Services.prefs.setBoolPref(prefName, true);
  }

  // Testing with Remote Settings requires a profile.
  do_get_profile();

  // Clear existing Remote Settings data.
  await db.clear();
});
// Verifies that initializing the map with two version 1 records already in
// the Remote Settings database makes the data from both attachments
// available for lookup.
add_task(async function test_initial_import() {
info("Create record containing domain_category_mappings_1a.json attachment.");
let record1a = await mockRecordWithCachedAttachment(RECORDS.record1a);
await db.create(record1a);
info("Create record containing domain_category_mappings_1b.json attachment.");
let record1b = await mockRecordWithCachedAttachment(RECORDS.record1b);
await db.create(record1b);
info("Add data to Remote Settings DB.");
await db.importChanges({}, Date.now());
info("Initialize search categorization mappings.");
// Start observing before init() so the completion notification, which is
// sent after init() has queued its work, is not missed.
let promise = waitForDomainToCategoriesUpdate();
await SearchSERPDomainToCategoriesMap.init();
await promise;
Assert.deepEqual(
SearchSERPDomainToCategoriesMap.get("example.com"),
[{ category: 1, score: 100 }],
"Return value from lookup of example.com should be the same."
);
Assert.deepEqual(
SearchSERPDomainToCategoriesMap.get("example.org"),
[{ category: 2, score: 90 }],
"Return value from lookup of example.org should be the same."
);
// Clean up.
await db.clear();
SearchSERPDomainToCategoriesMap.uninit();
});
// Verifies that a "sync" event carrying version 2 replacements for both
// records replaces the version 1 data in the map and updates the reported
// version.
add_task(async function test_update_records() {
info("Create record containing domain_category_mappings_1a.json attachment.");
let record1a = await mockRecordWithCachedAttachment(RECORDS.record1a);
await db.create(record1a);
info("Create record containing domain_category_mappings_1b.json attachment.");
let record1b = await mockRecordWithCachedAttachment(RECORDS.record1b);
await db.create(record1b);
info("Add data to Remote Settings DB.");
await db.importChanges({}, Date.now());
info("Initialize search categorization mappings.");
// Start observing before init() so the completion notification is not
// missed.
let promise = waitForDomainToCategoriesUpdate();
await SearchSERPDomainToCategoriesMap.init();
await promise;
info("Send update from Remote Settings with updates to attachments.");
// The version 2 records reuse the ids of the version 1 records, so caching
// them replaces the cached version 1 attachments.
let record2a = await mockRecordWithCachedAttachment(RECORDS.record2a);
let record2b = await mockRecordWithCachedAttachment(RECORDS.record2b);
const payload = {
current: [record2a, record2b],
created: [],
updated: [
{ old: record1a, new: record2a },
{ old: record1b, new: record2b },
],
deleted: [],
};
// A fresh observer is needed for the second completion notification.
promise = waitForDomainToCategoriesUpdate();
await client.emit("sync", {
data: payload,
});
await promise;
Assert.deepEqual(
SearchSERPDomainToCategoriesMap.get("example.com"),
[{ category: 1, score: 80 }],
"Return value from lookup of example.com should have changed."
);
Assert.deepEqual(
SearchSERPDomainToCategoriesMap.get("example.org"),
[
{ category: 2, score: 50 },
{ category: 4, score: 80 },
],
"Return value from lookup of example.org should have changed."
);
Assert.equal(
SearchSERPDomainToCategoriesMap.version,
2,
"Version should be correct."
);
// Clean up.
await db.clear();
SearchSERPDomainToCategoriesMap.uninit();
});
// Verifies that a map initialized while the collection has no records stays
// empty, and is populated later when a "sync" event delivers records.
add_task(async function test_delayed_initial_import() {
  info("Observe the log message emitted when no records are found.");
  // The observer has to exist before init() runs, because the message is
  // logged during init().
  let observeNoRecordsFound = TestUtils.consoleMessageObserved(msg => {
    return (
      typeof msg.wrappedJSObject.arguments?.[0] == "string" &&
      msg.wrappedJSObject.arguments[0].includes(
        "No records found for domain-to-categories map."
      )
    );
  });

  info("Initialize without records.");
  await SearchSERPDomainToCategoriesMap.init();
  await observeNoRecordsFound;

  Assert.ok(SearchSERPDomainToCategoriesMap.empty, "Map is empty.");
  Assert.equal(
    SearchSERPDomainToCategoriesMap.version,
    null,
    "Version should be null while the map is empty."
  );

  info("Send update from Remote Settings with updates to attachments.");
  let record1a = await mockRecordWithCachedAttachment(RECORDS.record1a);
  let record1b = await mockRecordWithCachedAttachment(RECORDS.record1b);
  const payload = {
    current: [record1a, record1b],
    created: [record1a, record1b],
    updated: [],
    deleted: [],
  };
  let promise = waitForDomainToCategoriesUpdate();
  await client.emit("sync", {
    data: payload,
  });
  await promise;

  Assert.deepEqual(
    SearchSERPDomainToCategoriesMap.get("example.com"),
    [{ category: 1, score: 100 }],
    "Return value from lookup of example.com should be the same."
  );
  Assert.deepEqual(
    SearchSERPDomainToCategoriesMap.get("example.org"),
    [{ category: 2, score: 90 }],
    "Return value from lookup of example.org should be the same."
  );
  Assert.equal(
    SearchSERPDomainToCategoriesMap.version,
    1,
    "Version should be correct."
  );

  // Clean up.
  await db.clear();
  SearchSERPDomainToCategoriesMap.uninit();
});
// Verifies that when a "sync" event reports one of two records as deleted,
// the data from the remaining record is kept and the data from the deleted
// record is no longer returned.
add_task(async function test_remove_record() {
info("Create record containing domain_category_mappings_2a.json attachment.");
let record2a = await mockRecordWithCachedAttachment(RECORDS.record2a);
await db.create(record2a);
info("Create record containing domain_category_mappings_2b.json attachment.");
let record2b = await mockRecordWithCachedAttachment(RECORDS.record2b);
await db.create(record2b);
info("Add data to Remote Settings DB.");
await db.importChanges({}, Date.now());
info("Initialize search categorization mappings.");
// Start observing before init() so the completion notification is not
// missed.
let promise = waitForDomainToCategoriesUpdate();
await SearchSERPDomainToCategoriesMap.init();
await promise;
Assert.deepEqual(
SearchSERPDomainToCategoriesMap.get("example.com"),
[{ category: 1, score: 80 }],
"Initialized properly."
);
info("Send update from Remote Settings with one removed record.");
// Only record2a remains current; record2b is reported as deleted.
const payload = {
current: [record2a],
created: [],
updated: [],
deleted: [record2b],
};
promise = waitForDomainToCategoriesUpdate();
await client.emit("sync", {
data: payload,
});
await promise;
Assert.deepEqual(
SearchSERPDomainToCategoriesMap.get("example.com"),
[{ category: 1, score: 80 }],
"Return value from lookup of example.com should remain unchanged."
);
Assert.deepEqual(
SearchSERPDomainToCategoriesMap.get("example.org"),
[],
"Return value from lookup of example.org should be empty."
);
Assert.equal(
SearchSERPDomainToCategoriesMap.version,
2,
"Version should be correct."
);
// Clean up.
await db.clear();
SearchSERPDomainToCategoriesMap.uninit();
});
// Verifies the behavior when records of different versions are present at
// the same time: data from both the version 1 and the version 2 record is
// available, and the reported version is the higher of the two.
add_task(async function test_different_versions_coexisting() {
info("Create record containing domain_category_mappings_1a.json attachment.");
let record1a = await mockRecordWithCachedAttachment(RECORDS.record1a);
await db.create(record1a);
info("Create record containing domain_category_mappings_2b.json attachment.");
let record2b = await mockRecordWithCachedAttachment(RECORDS.record2b);
await db.create(record2b);
info("Add data to Remote Settings DB.");
await db.importChanges({}, Date.now());
info("Initialize search categorization mappings.");
// Start observing before init() so the completion notification is not
// missed.
let promise = waitForDomainToCategoriesUpdate();
await SearchSERPDomainToCategoriesMap.init();
await promise;
// example.com is expected to carry the values of the version 1 fixture.
Assert.deepEqual(
SearchSERPDomainToCategoriesMap.get("example.com"),
[
{
category: 1,
score: 100,
},
],
"Should have a record from an older version."
);
// example.org is expected to carry the values of the version 2 fixture.
Assert.deepEqual(
SearchSERPDomainToCategoriesMap.get("example.org"),
[
{ category: 2, score: 50 },
{ category: 4, score: 80 },
],
"Return value from lookup of example.org should have the most recent value."
);
Assert.equal(
SearchSERPDomainToCategoriesMap.version,
2,
"Version should be the latest."
);
// Clean up.
await db.clear();
SearchSERPDomainToCategoriesMap.uninit();
});
// Verifies that when an attachment can no longer be downloaded during a
// sync, the error is logged and the map ends up without data and without a
// version.
add_task(async function test_download_error() {
  info("Create record containing domain_category_mappings_1a.json attachment.");
  let record1a = await mockRecordWithCachedAttachment(RECORDS.record1a);
  await db.create(record1a);

  info("Add data to Remote Settings DB.");
  await db.importChanges({}, Date.now());

  info("Initialize search categorization mappings.");
  // Start observing before init() so the completion notification is not
  // missed.
  let promise = waitForDomainToCategoriesUpdate();
  await SearchSERPDomainToCategoriesMap.init();
  await promise;

  Assert.deepEqual(
    SearchSERPDomainToCategoriesMap.get("example.com"),
    [
      {
        category: 1,
        score: 100,
      },
    ],
    "Domain should have an entry in the map."
  );
  Assert.equal(
    SearchSERPDomainToCategoriesMap.version,
    1,
    "Version should be present."
  );

  info("Delete attachment from local cache.");
  client.attachments.cacheImpl.delete(RECORD_A_ID);

  // Entries of `updated` are { old, new } pairs, matching the payload used in
  // test_update_records.
  const payload = {
    current: [record1a],
    created: [],
    updated: [{ old: record1a, new: record1a }],
    deleted: [],
  };

  info("Sync payload.");
  // Register the console observer before emitting, because the error is
  // logged while the sync event is being handled.
  let observeDownloadError = TestUtils.consoleMessageObserved(msg => {
    return (
      typeof msg.wrappedJSObject.arguments?.[0] == "string" &&
      msg.wrappedJSObject.arguments[0].includes("Could not download file:")
    );
  });
  await client.emit("sync", {
    data: payload,
  });
  await observeDownloadError;

  Assert.deepEqual(
    SearchSERPDomainToCategoriesMap.get("example.com"),
    [],
    "Domain should not exist in store."
  );
  Assert.equal(
    SearchSERPDomainToCategoriesMap.version,
    null,
    "Version should be reset to null."
  );

  // Clean up.
  await db.clear();
  SearchSERPDomainToCategoriesMap.uninit();
});

Просмотреть файл

@ -3,6 +3,12 @@ skip-if = toolkit == 'android' # bug 1730213
firefox-appdir = browser
[test_search_telemetry_categorization_process_domains.js]
[test_search_telemetry_categorization_sync.js]
support-files =
domain_category_mappings_1a.json
domain_category_mappings_1b.json
domain_category_mappings_2a.json
domain_category_mappings_2b.json
[test_search_telemetry_compare_urls.js]
[test_search_telemetry_config_validation.js]
support-files =