Bug 1716025 - Part 1 - Add hardcoded blocklist of sites to exclude from history metadata. r=mak

This patch creates a blocklist of sites that should not be saved as keyframes. The blocklist maps hostnames to regular expressions. While the map is a bit of an awkward structure, it lets us avoid checking every URL against a list of regular expressions. This performance gain will be more apparent as the list expands.

Differential Revision: https://phabricator.services.mozilla.com/D117543
This commit is contained in:
Harry Twyford 2021-06-21 14:32:46 +00:00
Родитель 22900508de
Коммит 699c2f666c
10 изменённых файлов: 286 добавлений и 37 удалений

Просмотреть файл

@ -12,6 +12,7 @@ const { XPCOMUtils } = ChromeUtils.import(
XPCOMUtils.defineLazyModuleGetters(this, {
BrowserWindowTracker: "resource:///modules/BrowserWindowTracker.jsm",
InteractionsBlocklist: "resource:///modules/InteractionsBlocklist.jsm",
PrivateBrowsingUtils: "resource://gre/modules/PrivateBrowsingUtils.jsm",
Services: "resource://gre/modules/Services.jsm",
});
@ -373,6 +374,11 @@ class _Interactions {
this.registerEndOfInteraction(browser);
}
if (InteractionsBlocklist.isUrlBlocklisted(docInfo.url)) {
logConsole.debug("URL is blocklisted", docInfo);
return;
}
logConsole.debug("New interaction", docInfo);
interaction = {
url: docInfo.url,

Просмотреть файл

@ -0,0 +1,142 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
"use strict";
var EXPORTED_SYMBOLS = ["InteractionsBlocklist"];
const { XPCOMUtils } = ChromeUtils.import(
"resource://gre/modules/XPCOMUtils.jsm"
);
XPCOMUtils.defineLazyModuleGetters(this, {
Services: "resource://gre/modules/Services.jsm",
UrlbarUtils: "resource:///modules/UrlbarUtils.jsm",
});
// Lazily creates this module's console instance on first use. Debug-level
// output is enabled only when the "browser.places.interactions.log" pref is
// true; otherwise only warnings and above are shown.
XPCOMUtils.defineLazyGetter(this, "logConsole", () => {
  const isDebugLoggingEnabled = Services.prefs.getBoolPref(
    "browser.places.interactions.log",
    false
  );
  return console.createInstance({
    prefix: "InteractionsBlocklist",
    maxLogLevel: isDebugLoggingEnabled ? "Debug" : "Warn",
  });
});
// A blocklist of regular expressions. Maps base hostnames to a list of regular
// expressions for URLs with that base hostname. In this context, "base
// hostname" means the hostname without any subdomains or a public suffix. For
// example, the base hostname for "https://www.maps.google.com/a/place" is
// "google". We do this mapping to improve performance; otherwise we'd have to
// check all URLs against a long list of regular expressions. The regexes are
// defined as escaped strings so that we build them lazily.
// We may want to migrate this list to Remote Settings in the future.
//
// The table is wrapped in a Proxy whose `get` trap compiles the string
// patterns for the requested host into case-insensitive RegExp objects on
// first access and caches them back into the table. Looking up a key that
// does not hold an array of patterns yields null.
const HOST_BLOCKLIST = new Proxy(
  {
    baidu: [
      // Baidu SERP
      "^(https?:\\/\\/)?(www\\.)?baidu\\.com\\/s.*(\\?|&)wd=.*",
    ],
    bing: [
      // Bing SERP
      "^(https?:\\/\\/)?(www\\.)?bing\\.com\\/search.*(\\?|&)q=.*",
    ],
    duckduckgo: [
      // DuckDuckGo SERP
      "^(https?:\\/\\/)?(www\\.)?duckduckgo\\.com\\/.*(\\?|&)q=.*",
    ],
    example: [
      // For testing. Removed in part 2 of this patch.
      "^(https?:\\/\\/)?example\\.com\\/browser",
    ],
    google: [
      // Google SERP
      "^(https?:\\/\\/)?(www\\.)?google\\.(\\w|\\.){2,}\\/search.*(\\?|&)q=.*",
    ],
    yandex: [
      // Yandex SERP
      "^(https?:\\/\\/)?(www\\.)?yandex\\.(\\w|\\.){2,}\\/search.*(\\?|&)text=.*",
    ],
    zoom: [
      // Zoom meeting interstitial
      "^(https?:\\/\\/)?(www\\.)?.*\\.zoom\\.us\\/j\\/\\d+",
    ],
  },
  {
    /**
     * Returns the compiled regexes for a base hostname.
     *
     * @param {object} target
     *   The underlying hostname -> patterns table.
     * @param {string|symbol} property
     *   The base hostname being looked up.
     * @returns {?RegExp[]}
     *   The (now compiled) pattern list, or null if `property` does not map
     *   to an array.
     * @throws {Error}
     *   "Blocklist contains invalid regex." if a pattern string cannot be
     *   compiled; the underlying SyntaxError is attached as `cause`.
     */
    get(target, property) {
      const patterns = target[property];
      if (!Array.isArray(patterns)) {
        return null;
      }
      for (const [index, pattern] of patterns.entries()) {
        if (typeof pattern !== "string") {
          // Already compiled by an earlier lookup.
          continue;
        }
        try {
          patterns[index] = new RegExp(pattern, "i");
        } catch (ex) {
          // The RegExp constructor signals a bad pattern by throwing, never
          // by returning a falsy value, so the failure has to be caught here
          // to surface the blocklist-specific error.
          const error = new Error("Blocklist contains invalid regex.");
          error.cause = ex;
          throw error;
        }
      }
      return patterns;
    },
  }
);
/**
 * A class that maintains a blocklist of URLs. The class exposes a method to
 * check if a particular URL is contained on the blocklist.
 */
class _InteractionsBlocklist {
  /**
   * Checks a URL against a blocklist of URLs. If the URL is blocklisted, we
   * should not record an interaction.
   *
   * @param {string} urlToCheck
   *   The URL we are looking for on the blocklist.
   * @returns {boolean}
   *   True if `urlToCheck` is on a blocklist. False otherwise, including when
   *   `urlToCheck` cannot be parsed as a URL (a warning is logged in that
   *   case).
   */
  isUrlBlocklisted(urlToCheck) {
    // First, find the URL's base host: the hostname without any subdomains or a
    // public suffix.
    let url;
    try {
      url = new URL(urlToCheck);
    } catch (ex) {
      // `url` is still unassigned here, so report the raw input instead.
      logConsole.warn(
        `Invalid URL passed to InteractionsBlocklist.isUrlBlocklisted: ${urlToCheck}`
      );
      return false;
    }
    const hostWithoutSuffix = UrlbarUtils.stripPublicSuffixFromHost(url.host);
    const [hostWithSubdomains] = UrlbarUtils.stripPrefixAndTrim(
      hostWithoutSuffix,
      {
        stripWww: true,
        trimTrailingDot: true,
      }
    );
    // Keep only the last dot-separated label, dropping any subdomains.
    const baseHost = hostWithSubdomains.substring(
      hostWithSubdomains.lastIndexOf(".") + 1
    );
    // Then fetch blocked regexes for that baseHost and compare them to the full
    // URL. Blocklist keys are plain lowercase identifiers, so use the
    // locale-independent lowercase conversion.
    const regexes = HOST_BLOCKLIST[baseHost.toLowerCase()];
    if (!regexes) {
      return false;
    }
    return regexes.some(r => r.test(url.href));
  }
}
const InteractionsBlocklist = new _InteractionsBlocklist();

Просмотреть файл

@ -15,6 +15,7 @@ JAR_MANIFESTS += ["jar.mn"]
EXTRA_JS_MODULES += [
"Interactions.jsm",
"InteractionsBlocklist.jsm",
"PlacesUIUtils.jsm",
]

Просмотреть файл

@ -10,5 +10,6 @@ support-files =
head.js
../keyword_form.html
[browser_interactions_blocklist.js]
[browser_interactions_view_time.js]
[browser_interactions_typing.js]

Просмотреть файл

@ -0,0 +1,50 @@
/* Any copyright is dedicated to the Public Domain.
* http://creativecommons.org/publicdomain/zero/1.0/ */
/**
* Tests that interactions are not recorded for sites on the blocklist.
*/
// Page the test starts on; the test expects an interaction to be stored for it.
const ALLOWED_TEST_URL = "https://example.com/";
// Page the test navigates to next; the test expects no interaction to be
// stored for it. Its "example.com/browser" prefix is what the test-only
// "example" blocklist pattern matches.
const BLOCKED_TEST_URL = "https://example.com/browser";
// Wraps Interactions._updateDatabase in a sinon spy so later tasks can inspect
// the calls made to it, and registers a cleanup that restores sinon when the
// test file finishes.
add_task(async function setup() {
  const restoreSinon = () => {
    sinon.restore();
  };
  sinon.spy(Interactions, "_updateDatabase");
  registerCleanupFunction(restoreSinon);
});
// Opens the allowed page, navigates to the blocklisted page and then to
// about:blank, checking after each navigation that the only interaction
// handed to the database is the one for the allowed page.
add_task(async function test() {
  // Builds a fresh copy of the single interaction we expect to be stored.
  const allowedInteractionOnly = () => [
    {
      url: ALLOWED_TEST_URL,
      totalViewTime: 10000,
    },
  ];

  // Pretends the current page view started `elapsedMs` ago, then navigates
  // the browser to `url` and waits for the load to finish.
  const backdateViewAndLoad = async (browser, elapsedMs, url) => {
    Interactions._pageViewStartTime = Cu.now() - elapsedMs;
    BrowserTestUtils.loadURI(browser, url);
    await BrowserTestUtils.browserLoaded(browser, false, url);
  };

  await BrowserTestUtils.withNewTab(ALLOWED_TEST_URL, async browser => {
    await backdateViewAndLoad(browser, 10000, BLOCKED_TEST_URL);
    await assertDatabaseValues(allowedInteractionOnly());

    await backdateViewAndLoad(browser, 20000, "about:blank");
    // We should not have updated the database with BLOCKED_TEST_URL because it
    // is blocklisted. We wait a little to make sure _updateDatabase is not
    // going to fire.
    // eslint-disable-next-line mozilla/no-arbitrary-setTimeout
    await new Promise(resolve => setTimeout(resolve, 500));
    await assertDatabaseValues(allowedInteractionOnly());
  });
});

Просмотреть файл

@ -21,43 +21,6 @@ add_task(async function setup() {
});
});
/**
 * Waits until Interactions._updateDatabase has been called exactly once per
 * expected entry, then compares the first argument of each call, in order,
 * against the corresponding expected entry.
 *
 * @param {object[]} expected
 *   One entry per expected database update.
 * @param {string} expected[].url
 *   The URL the saved interaction must have.
 * @param {number} [expected[].exactTotalViewTime]
 *   If truthy, the saved totalViewTime must equal this value.
 * @param {number} [expected[].totalViewTime]
 *   Used when exactTotalViewTime is not truthy: the saved totalViewTime must
 *   be greater than this value.
 * @param {number} [expected[].maxViewTime]
 *   If truthy, the saved totalViewTime must be less than this value.
 */
async function assertDatabaseValues(expected) {
  const allUpdatesRecorded = () =>
    Interactions._updateDatabase.callCount == expected.length;
  await BrowserTestUtils.waitForCondition(
    allUpdatesRecorded,
    "Should have saved to the database"
  );

  const recordedCalls = Interactions._updateDatabase.args;
  for (const [index, wanted] of expected.entries()) {
    // Each call's first argument is the interaction that was saved.
    const saved = recordedCalls[index][0];

    Assert.equal(
      saved.url,
      wanted.url,
      "Should have saved the page into the database"
    );

    if (!wanted.exactTotalViewTime) {
      Assert.greater(
        saved.totalViewTime,
        wanted.totalViewTime,
        "Should have stored the interaction time"
      );
    } else {
      Assert.equal(
        saved.totalViewTime,
        wanted.exactTotalViewTime,
        "Should have kept the exact time"
      );
    }

    if (wanted.maxViewTime) {
      Assert.less(
        saved.totalViewTime,
        wanted.maxViewTime,
        "Should have recorded an interaction below the maximum expected"
      );
    }
  }
}
add_task(async function test_interactions_simple_load_and_navigate_away() {
await BrowserTestUtils.withNewTab(TEST_URL, async browser => {
Interactions._pageViewStartTime = Cu.now() - 10000;

Просмотреть файл

@ -30,3 +30,40 @@ function disableIdleService() {
idleService.addIdleObserver(Interactions, pageViewIdleTime);
});
}
/**
 * Waits until Interactions._updateDatabase has been called exactly once per
 * expected entry, then compares the first argument of each call, in order,
 * against the corresponding expected entry.
 *
 * @param {object[]} expected
 *   One entry per expected database update.
 * @param {string} expected[].url
 *   The URL the saved interaction must have.
 * @param {number} [expected[].exactTotalViewTime]
 *   If truthy, the saved totalViewTime must equal this value.
 * @param {number} [expected[].totalViewTime]
 *   Used when exactTotalViewTime is not truthy: the saved totalViewTime must
 *   be greater than this value.
 * @param {number} [expected[].maxViewTime]
 *   If truthy, the saved totalViewTime must be less than this value.
 */
async function assertDatabaseValues(expected) {
  const allUpdatesRecorded = () =>
    Interactions._updateDatabase.callCount == expected.length;
  await BrowserTestUtils.waitForCondition(
    allUpdatesRecorded,
    "Should have saved to the database"
  );

  const recordedCalls = Interactions._updateDatabase.args;
  for (const [index, wanted] of expected.entries()) {
    // Each call's first argument is the interaction that was saved.
    const saved = recordedCalls[index][0];

    Assert.equal(
      saved.url,
      wanted.url,
      "Should have saved the page into the database"
    );

    if (!wanted.exactTotalViewTime) {
      Assert.greater(
        saved.totalViewTime,
        wanted.totalViewTime,
        "Should have stored the interaction time"
      );
    } else {
      Assert.equal(
        saved.totalViewTime,
        wanted.exactTotalViewTime,
        "Should have kept the exact time"
      );
    }

    if (wanted.maxViewTime) {
      Assert.less(
        saved.totalViewTime,
        wanted.maxViewTime,
        "Should have recorded an interaction below the maximum expected"
      );
    }
  }
}

Просмотреть файл

@ -0,0 +1,42 @@
/* Any copyright is dedicated to the Public Domain.
* http://creativecommons.org/publicdomain/zero/1.0/ */
/**
* Tests that blocked sites are caught by InteractionsBlocklist.
*/
const { InteractionsBlocklist } = ChromeUtils.import(
"resource:///modules/InteractionsBlocklist.jsm"
);
// URLs the test expects InteractionsBlocklist.isUrlBlocklisted() to report as
// blocklisted: search result pages for Bing, DuckDuckGo, Google (two
// different public suffixes), Yandex and Baidu, plus a Zoom meeting link.
let BLOCKED_URLS = [
"https://www.bing.com/search?q=mozilla",
"https://duckduckgo.com/?q=a+test&kp=1&t=ffab",
"https://www.google.com/search?q=mozilla",
"https://www.google.ca/search?q=test",
"https://mozilla.zoom.us/j/123456789",
"https://yandex.az/search/?text=mozilla",
"https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&ch=&tn=baidu&bar=&wd=mozilla&rn=&fenlei=256&oq=&rsv_pq=970f2b8f001757b9&rsv_t=1f5d2V2o80HPdZtZnhodwkc7nZXTvDI1zwdPy%2FAeomnvFFGIrU1F3D9WoK4&rqlang=cn",
];
// URLs the test expects NOT to be blocklisted. They are on hosts that also
// appear in BLOCKED_URLS or in the test-only "example" entry, but their paths
// are not search result pages or meeting links.
let ALLOWED_URLS = [
"https://example.com",
"https://zoom.us/pricing",
"https://www.google.ca/maps/place/Toronto,+ON/@43.7181557,-79.5181414,11z/data=!3m1!4b1!4m5!3m4!1s0x89d4cb90d7c63ba5:0x323555502ab4c477!8m2!3d43.653226!4d-79.3831843",
];
// Checks every URL in BLOCKED_URLS is reported as blocklisted, then that every
// URL in ALLOWED_URLS is not.
add_task(async function test() {
  for (const blockedUrl of BLOCKED_URLS) {
    const isBlocklisted = InteractionsBlocklist.isUrlBlocklisted(blockedUrl);
    Assert.ok(isBlocklisted, `${blockedUrl} is blocklisted.`);
  }

  for (const allowedUrl of ALLOWED_URLS) {
    const isBlocklisted = InteractionsBlocklist.isUrlBlocklisted(allowedUrl);
    Assert.ok(!isBlocklisted, `${allowedUrl} is not blocklisted.`);
  }
});

Просмотреть файл

@ -17,5 +17,6 @@ support-files =
[test_browserGlue_prefs.js]
[test_browserGlue_restore.js]
[test_clearHistory_shutdown.js]
[test_interactions_blocklist.js]
[test_PUIU_batchUpdatesForNode.js]
[test_PUIU_setCharsetForPage.js]

Просмотреть файл

@ -758,6 +758,8 @@ var UrlbarUtils = {
* Whether to trim a trailing `?`.
* @param {boolean} options.trimEmptyHash
* Whether to trim a trailing `#`.
* @param {boolean} options.trimTrailingDot
* Whether to trim a trailing '.'.
* @returns {array} [modified, prefix, suffix]
* modified: {string} The modified spec.
* prefix: {string} The parts stripped from the prefix, if any.
@ -789,6 +791,10 @@ var UrlbarUtils = {
spec = spec.slice(0, -1);
suffix = "/" + suffix;
}
if (options.trimTrailingDot && spec.endsWith(".")) {
spec = spec.slice(0, -1);
suffix = "." + suffix;
}
return [spec, prefix, suffix];
},