Bug 1046074 - Improve post-filtering of dupes in UnifiedComplete. r=mak

Differential Revision: https://phabricator.services.mozilla.com/D66674

--HG--
extra : moz-landing-system : lando
This commit is contained in:
Harry Twyford 2020-03-23 18:50:47 +00:00
Родитель f5a03c008e
Коммит 7390b3dca3
6 изменённых файлов: 328 добавлений и 97 удалений

Просмотреть файл

@ -40,7 +40,7 @@ const REGEXP_INSERT_METHOD = /(?:^| )insert-method:(\d+)/;
// Regex used to match one or more whitespace.
const REGEXP_SPACES = /\s+/;
// Regex used to strip prefixes from URLs. See stripPrefix().
// Regex used to strip prefixes from URLs. See stripAnyPrefix().
const REGEXP_STRIP_PREFIX = /^[a-z]+:(?:\/){0,2}/i;
// The result is notified on a delay, to avoid rebuilding the panel at every match.
@ -469,7 +469,7 @@ XPCOMUtils.defineLazyGetter(this, "sourceToBehaviorMap", () => {
* The possible URL to strip.
* @return If `str` is a URL, then [prefix, remainder]. Otherwise, ["", str].
*/
function stripPrefix(str) {
function stripAnyPrefix(str) {
let match = REGEXP_STRIP_PREFIX.exec(str);
if (!match) {
return ["", str];
@ -482,25 +482,49 @@ function stripPrefix(str) {
}
/**
* Strip http and trailing separators from a spec.
* Strips parts of a URL defined in `options`.
*
* @param spec
* @param {string} spec
* The text to modify.
* @param trimSlash
* @param {object} options
* @param {boolean} options.stripHttp
* Whether to strip http.
* @param {boolean} options.stripHttps
* Whether to strip https.
* @param {boolean} options.stripWww
* Whether to strip `www.`.
* @param {boolean} options.trimSlash
* Whether to trim the trailing slash.
* @return the modified spec.
* @param {boolean} options.trimEmptyQuery
* Whether to trim a trailing `?`.
* @returns {array} [modified, prefix, suffix]
* modified: {string} The modified spec.
* prefix: {string} The parts stripped from the prefix, if any.
* suffix: {string} The parts trimmed from the suffix, if any.
*/
function stripHttpAndTrim(spec, trimSlash = true) {
if (spec.startsWith("http://")) {
function stripPrefixAndTrim(spec, options = {}) {
let prefix = "";
let suffix = "";
if (options.stripHttp && spec.startsWith("http://")) {
spec = spec.slice(7);
prefix = "http://";
} else if (options.stripHttps && spec.startsWith("https://")) {
spec = spec.slice(8);
prefix = "https://";
}
if (spec.endsWith("?")) {
if (options.stripWww && spec.startsWith("www.")) {
spec = spec.slice(4);
prefix += "www.";
}
if (options.trimSlash && spec.endsWith("/")) {
spec = spec.slice(0, -1);
suffix += "/";
}
if (trimSlash && spec.endsWith("/")) {
if (options.trimEmptyQuery && spec.endsWith("?")) {
spec = spec.slice(0, -1);
suffix += "?";
}
return spec;
return [spec, prefix, suffix];
}
/**
@ -516,18 +540,33 @@ function stripHttpAndTrim(spec, trimSlash = true) {
* compare keys.
*/
function makeKeyForMatch(match) {
// For autofill entries, we need to have a key based on the comment rather
// than the value field, because the latter may have been trimmed.
// For autofill entries, we need to have a key based on the finalCompleteValue
// rather than the value field, because the latter may have been trimmed.
let key, prefix;
if (match.style && match.style.includes("autofill")) {
return [stripHttpAndTrim(match.comment), null];
[key, prefix] = stripPrefixAndTrim(match.finalCompleteValue, {
stripHttp: true,
stripHttps: true,
stripWww: true,
trimEmptyQuery: true,
trimSlash: true,
});
return [key, prefix, null];
}
let action = PlacesUtils.parseActionUrl(match.value);
if (!action) {
return [stripHttpAndTrim(match.value), null];
[key, prefix] = stripPrefixAndTrim(match.value, {
stripHttp: true,
stripHttps: true,
stripWww: true,
trimEmptyQuery: true,
trimSlash: true,
});
return [key, prefix, null];
}
let key;
switch (action.type) {
case "searchengine":
// We want to exclude search suggestion matches that simply echo back the
@ -542,11 +581,17 @@ function makeKeyForMatch(match) {
];
break;
default:
key = stripHttpAndTrim(action.params.url || match.value);
[key, prefix] = stripPrefixAndTrim(action.params.url || match.value, {
stripHttp: true,
stripHttps: true,
stripWww: true,
trimEmptyQuery: true,
trimSlash: true,
});
break;
}
return [key, action];
return [key, prefix, action];
}
/**
@ -663,7 +708,7 @@ function Search(
let unescapedSearchString = Services.textToSubURI.unEscapeURIForUI(
this._trimmedOriginalSearchString
);
let [prefix, suffix] = stripPrefix(unescapedSearchString);
let [prefix, suffix] = stripAnyPrefix(unescapedSearchString);
this._searchString = suffix;
this._strippedPrefix = prefix.toLowerCase();
@ -1346,7 +1391,7 @@ Search.prototype = {
this._result.setDefaultIndex(0);
let url = matchedSite.uri.spec;
let value = stripPrefix(url)[1];
let value = stripAnyPrefix(url)[1];
value = value.substr(value.indexOf(this._searchString));
this._addAutofillMatch(value, url, Infinity, ["preloaded-top-site"]);
@ -2193,16 +2238,34 @@ Search.prototype = {
this.notifyResult(true, match.type == UrlbarUtils.RESULT_GROUP.HEURISTIC);
},
// Ranks a URL prefix from 3 - 0 with the following preferences:
// https:// > https://www. > http:// > http://www.
// Higher is better.
// Returns -1 if the prefix does not match any of the above.
_getPrefixRank(prefix) {
return ["http://www.", "http://", "https://www.", "https://"].indexOf(
prefix
);
},
/**
* Check for duplicates and either discard the duplicate or replace the
* original match, in case the new one is more specific. For example,
* a Remote Tab wins over History, and a Switch to Tab wins over a Remote Tab.
* We must check both id and url for duplication, because keywords may change
* the url by replacing the %s placeholder.
* @param match
* @returns {object} matchPosition
* @returns {number} matchPosition.index
* The index the match should take in the results. Return -1 if the match
* should be discarded.
* @returns {boolean} matchPosition.replace
* True if the match should replace the result already at
* matchPosition.index.
*
*/
_getInsertIndexForMatch(match) {
// Check for duplicates and either discard (by returning -1) the duplicate
// or suggest to replace the original match, in case the new one is more
// specific (for example a Remote Tab wins over History, and a Switch to Tab
// wins over a Remote Tab).
// Must check both id and url, cause keywords dynamically modify the url.
// Note: this partially fixes Bug 1222435, but not if the urls differ more
// than just by "http://". We should still evaluate www and other schemes
// equivalences.
let [urlMapKey, action] = makeKeyForMatch(match);
let [urlMapKey, prefix, action] = makeKeyForMatch(match);
if (
(match.placeId && this._usedPlaceIds.has(match.placeId)) ||
this._usedURLs.some(e => ObjectUtils.deepEqual(e.key, urlMapKey))
@ -2233,13 +2296,112 @@ Search.prototype = {
continue;
}
if (!matchAction || action.type == "switchtab") {
this._usedURLs[i] = { key: urlMapKey, action, type: match.type };
this._usedURLs[i] = {
key: urlMapKey,
action,
type: match.type,
prefix,
comment: match.comment,
};
return { index: i, replace: true };
}
break; // Found the duplicate, no reason to continue.
}
}
} else {
// Dedupe with this flow:
// 1. If the two URLs are the same, dedupe whichever is not the
// heuristic result.
// 2. If they both contain www. or both do not contain it, prefer https.
// 3. If they differ by www.:
// 3a. If the page titles are different, keep both. This is a guard
// against deduping when www.site.com and site.com have different
// content.
// 3b. Otherwise, dedupe based on the priorities in _getPrefixRank.
let prefixRank = this._getPrefixRank(prefix);
for (let i = 0; i < this._usedURLs.length; ++i) {
if (!this._usedURLs[i]) {
// This is true when the result at [i] is a searchengine result.
continue;
}
let {
key: existingKey,
prefix: existingPrefix,
type: existingType,
comment: existingComment,
} = this._usedURLs[i];
let existingPrefixRank = this._getPrefixRank(existingPrefix);
if (ObjectUtils.deepEqual(existingKey, urlMapKey)) {
isDupe = true;
if (prefix == existingPrefix) {
// The URLs are identical. Throw out the new result, unless it's
// the heuristic.
if (match.type != UrlbarUtils.RESULT_GROUP.HEURISTIC) {
break; // Replace match.
} else {
this._usedURLs[i] = {
key: urlMapKey,
action,
type: match.type,
prefix,
comment: match.comment,
};
return { index: i, replace: true };
}
}
if (prefix.endsWith("www.") == existingPrefix.endsWith("www.")) {
// The results differ only by protocol.
if (match.type == UrlbarUtils.RESULT_GROUP.HEURISTIC) {
isDupe = false;
continue;
}
if (prefixRank <= existingPrefixRank) {
break; // Replace match.
} else if (existingType != UrlbarUtils.RESULT_GROUP.HEURISTIC) {
this._usedURLs[i] = {
key: urlMapKey,
action,
type: match.type,
prefix,
comment: match.comment,
};
return { index: i, replace: true };
} else {
isDupe = false;
continue;
}
} else {
// If either result is the heuristic, this will be true and we
// will keep both results.
if (match.comment != existingComment) {
isDupe = false;
continue;
}
if (prefixRank <= existingPrefixRank) {
break; // Replace match.
} else {
this._usedURLs[i] = {
key: urlMapKey,
action,
type: match.type,
prefix,
comment: match.comment,
};
return { index: i, replace: true };
}
}
}
}
}
// Discard the duplicate.
if (isDupe) {
return { index: -1, replace: false };
}
@ -2313,7 +2475,13 @@ Search.prototype = {
bucket.insertIndex++;
break;
}
this._usedURLs[index] = { key: urlMapKey, action, type: match.type };
this._usedURLs[index] = {
key: urlMapKey,
action,
type: match.type,
prefix,
comment: match.comment || "",
};
return { index, replace };
},
@ -2425,10 +2593,12 @@ Search.prototype = {
// that the user knows exactly where the match will take them. To make it
// look a little nicer, remove "http://", and if the user typed a host
// without a trailing slash, remove any trailing slash, too.
let comment = stripHttpAndTrim(
finalCompleteValue,
!this._searchString.includes("/")
);
let [comment] = stripPrefixAndTrim(finalCompleteValue, {
stripHttp: true,
trimEmptyQuery: true,
trimSlash: !this._searchString.includes("/"),
});
this._addMatch({
value: this._strippedPrefix + autofilledValue,
finalCompleteValue,

Просмотреть файл

@ -434,11 +434,6 @@ function addAutofillTasks(origins) {
comment: "https://" + comment,
style: ["autofill", "heuristic"],
},
{
value: "http://" + url,
comment: "test visit for http://" + url,
style: ["favicon"],
},
],
});
await cleanup();
@ -479,11 +474,6 @@ function addAutofillTasks(origins) {
comment: "https://" + comment,
style: ["autofill", "heuristic"],
},
{
value: "http://" + url,
comment: "test visit for http://" + url,
style: ["favicon"],
},
],
});
@ -524,11 +514,6 @@ function addAutofillTasks(origins) {
comment: "https://www." + comment,
style: ["autofill", "heuristic"],
},
{
value: "http://" + url,
comment: "test visit for http://" + url,
style: ["favicon"],
},
{
value: "https://" + url,
comment: "test visit for https://" + url,

Просмотреть файл

@ -306,11 +306,6 @@ add_task(async function groupByHost() {
comment: "https://example.com",
style: ["autofill", "heuristic"],
},
{
value: "http://example.com/",
comment: "test visit for http://example.com/",
style: ["favicon"],
},
],
});
@ -375,11 +370,6 @@ add_task(async function groupByHostNonDefaultStddevMultiplier() {
comment: "https://example.com",
style: ["autofill", "heuristic"],
},
{
value: "http://example.com/",
comment: "test visit for http://example.com/",
style: ["favicon"],
},
],
});

Просмотреть файл

@ -0,0 +1,113 @@
/* Any copyright is dedicated to the Public Domain.
* http://creativecommons.org/publicdomain/zero/1.0/ */
// Testing that we dedupe results that have the same URL and title as another
// except for their prefix (e.g. http://www.).
add_task(async function dedupe_prefix() {
// We need to set the title or else we won't dedupe. We only dedupe when
// titles match up to mitigate deduping when the www. version of a site is
// completely different from it's www-less counterpart and thus presumably
// has a different title.
await PlacesTestUtils.addVisits([
{
uri: "http://example.com/foo/",
title: "Example Page",
},
{
uri: "http://www.example.com/foo/",
title: "Example Page",
},
{
uri: "https://example.com/foo/",
title: "Example Page",
},
{
uri: "https://www.example.com/foo/",
title: "Example Page",
},
{
uri: "https://www.example.com/foo/",
title: "Example Page",
},
]);
// We should get https://www. as the heuristic result and https:// in the
// results since the latter's prefix is a higher priority.
await check_autocomplete({
search: "example.com/foo/",
autofilled: "example.com/foo/",
completed: "https://www.example.com/foo/",
matches: [
{
value: "example.com/foo/",
comment: "https://www.example.com/foo/",
style: ["autofill", "heuristic"],
},
{
value: "https://example.com/foo/",
comment: "Example Page",
},
],
});
// Add more visits to the lowest-priority prefix. It should be the heuristic
// result but we should still show our highest-priority result. https://www.
// should not appear at all.
for (let i = 0; i < 3; i++) {
await PlacesTestUtils.addVisits([
{
uri: "http://www.example.com/foo/",
title: "Example Page",
},
]);
}
await check_autocomplete({
search: "example.com/foo/",
autofilled: "example.com/foo/",
completed: "http://www.example.com/foo/",
matches: [
{
value: "example.com/foo/",
comment: "www.example.com/foo/",
style: ["autofill", "heuristic"],
},
{
value: "https://example.com/foo/",
comment: "Example Page",
},
],
});
// Add enough https:// vists for it to have the highest frecency. It should
// be the heuristic result. We should still get the https://www. result
// because we still show results with the same key and protocol if they differ
// from the heuristic result in having www.
for (let i = 0; i < 5; i++) {
await PlacesTestUtils.addVisits([
{
uri: "https://example.com/foo/",
title: "Example Page",
},
]);
}
await check_autocomplete({
search: "example.com/foo/",
autofilled: "example.com/foo/",
completed: "https://example.com/foo/",
matches: [
{
value: "example.com/foo/",
comment: "https://example.com/foo/",
style: ["autofill", "heuristic"],
},
{
value: "https://www.example.com/foo/",
comment: "Example Page",
},
],
});
await cleanup();
});

Просмотреть файл

@ -31,12 +31,11 @@ add_task(async function test_swap_protocol() {
{ uri: uri8, title: "title" },
]);
// uri1, uri2, and uri5 won't appear since they are duplicates of uri6, except
// for their prefixes.
let allMatches = [
{ uri: uri1, title: "title" },
{ uri: uri2, title: "title" },
{ uri: uri3, title: "title" },
{ uri: uri4, title: "title" },
{ uri: uri5, title: "title" },
{ uri: uri6, title: "title" },
];
@ -47,10 +46,7 @@ add_task(async function test_swap_protocol() {
info("http://www.site matches 'www.site' pages");
await check_autocomplete({
search: "http://www.site",
matches: [
{ uri: uri1, title: "title" },
{ uri: uri5, title: "title" },
],
matches: [{ uri: uri5, title: "title" }],
});
info("http://site matches all site");
@ -74,10 +70,7 @@ add_task(async function test_swap_protocol() {
info("https://www.site matches all site");
await check_autocomplete({
search: "https://www.site",
matches: [
{ uri: uri1, title: "title" },
{ uri: uri5, title: "title" },
],
matches: [{ uri: uri5, title: "title" }],
});
info("https://site matches all site");
@ -89,17 +82,13 @@ add_task(async function test_swap_protocol() {
info("www.site matches 'www.site' pages");
await check_autocomplete({
search: "www.site",
matches: [
{ uri: uri1, title: "title" },
{ uri: uri5, title: "title" },
],
matches: [{ uri: uri5, title: "title" }],
});
info("w matches 'w' pages, including 'www'");
await check_autocomplete({
search: "w",
matches: [
{ uri: uri1, title: "title" },
{ uri: uri5, title: "title" },
{ uri: uri7, title: "title" },
{ uri: uri8, title: "title" },
@ -110,7 +99,6 @@ add_task(async function test_swap_protocol() {
await check_autocomplete({
search: "http://w",
matches: [
{ uri: uri1, title: "title" },
{ uri: uri5, title: "title" },
{ uri: uri7, title: "title" },
{ uri: uri8, title: "title" },
@ -126,21 +114,13 @@ add_task(async function test_swap_protocol() {
info("ww matches no 'ww' pages, including 'www'");
await check_autocomplete({
search: "ww",
matches: [
{ uri: uri1, title: "title" },
{ uri: uri5, title: "title" },
{ uri: uri8, title: "title" },
],
matches: [{ uri: uri5, title: "title" }, { uri: uri8, title: "title" }],
});
info("http://ww matches no 'ww' pages, including 'www'");
await check_autocomplete({
search: "http://ww",
matches: [
{ uri: uri1, title: "title" },
{ uri: uri5, title: "title" },
{ uri: uri8, title: "title" },
],
matches: [{ uri: uri5, title: "title" }, { uri: uri8, title: "title" }],
});
info("http://www.ww matches nothing");
@ -152,21 +132,13 @@ add_task(async function test_swap_protocol() {
info("www matches 'www' pages");
await check_autocomplete({
search: "www",
matches: [
{ uri: uri1, title: "title" },
{ uri: uri5, title: "title" },
{ uri: uri8, title: "title" },
],
matches: [{ uri: uri5, title: "title" }, { uri: uri8, title: "title" }],
});
info("http://www matches 'www' pages");
await check_autocomplete({
search: "http://www",
matches: [
{ uri: uri1, title: "title" },
{ uri: uri5, title: "title" },
{ uri: uri8, title: "title" },
],
matches: [{ uri: uri5, title: "title" }, { uri: uri8, title: "title" }],
});
info("http://www.www matches nothing");

Просмотреть файл

@ -23,6 +23,7 @@ support-files =
[test_avoid_middle_complete.js]
[test_avoid_stripping_to_empty_tokens.js]
[test_casing.js]
[test_dedupe_prefix.js]
[test_do_not_trim.js]
[test_download_embed_bookmarks.js]
[test_dupe_urls.js]