Bug 1046074 - Improve post-filtering of dupes in UnifiedComplete. r=mak

Differential Revision: https://phabricator.services.mozilla.com/D66674 --HG-- extra : moz-landing-system : lando
2020-03-23 18:50:47 +00:00 · 2020-03-23 18:50:47 +00:00 · 7390b3dca3
--- a/toolkit/components/places/UnifiedComplete.jsm
+++ b/toolkit/components/places/UnifiedComplete.jsm
@ -40,7 +40,7 @@ const REGEXP_INSERT_METHOD = /(?:^| )insert-method:(\d+)/;
 // Regex used to match one or more whitespace.
 const REGEXP_SPACES = /\s+/;

-// Regex used to strip prefixes from URLs.  See stripPrefix().
+// Regex used to strip prefixes from URLs.  See stripAnyPrefix().
 const REGEXP_STRIP_PREFIX = /^[a-z]+:(?:\/){0,2}/i;

 // The result is notified on a delay, to avoid rebuilding the panel at every match.
@ -469,7 +469,7 @@ XPCOMUtils.defineLazyGetter(this, "sourceToBehaviorMap", () => {
 *         The possible URL to strip.
 * @return If `str` is a URL, then [prefix, remainder].  Otherwise, ["", str].
 */
-function stripPrefix(str) {
+function stripAnyPrefix(str) {
  let match = REGEXP_STRIP_PREFIX.exec(str);
  if (!match) {
    return ["", str];
@ -482,25 +482,49 @@ function stripPrefix(str) {
 }

 /**
- * Strip http and trailing separators from a spec.
+ * Strips parts of a URL defined in `options`.
 *
- * @param spec
+ * @param {string} spec
 *        The text to modify.
- * @param trimSlash
+ * @param {object} options
+ * @param {boolean} options.stripHttp
+ *        Whether to strip http.
+ * @param {boolean} options.stripHttps
+ *        Whether to strip https.
+ * @param {boolean} options.stripWww
+ *        Whether to strip `www.`.
+ * @param {boolean} options.trimSlash
 *        Whether to trim the trailing slash.
- * @return the modified spec.
+ * @param {boolean} options.trimEmptyQuery
+ *        Whether to trim a trailing `?`.
+ * @returns {array} [modified, prefix, suffix]
+ *          modified: {string} The modified spec.
+ *          prefix: {string} The parts stripped from the prefix, if any.
+ *          suffix: {string} The parts trimmed from the suffix, if any.
 */
-function stripHttpAndTrim(spec, trimSlash = true) {
-  if (spec.startsWith("http://")) {
+function stripPrefixAndTrim(spec, options = {}) {
+  let prefix = "";
+  let suffix = "";
+  if (options.stripHttp && spec.startsWith("http://")) {
    spec = spec.slice(7);
+    prefix = "http://";
+  } else if (options.stripHttps && spec.startsWith("https://")) {
+    spec = spec.slice(8);
+    prefix = "https://";
  }
-  if (spec.endsWith("?")) {
+  if (options.stripWww && spec.startsWith("www.")) {
+    spec = spec.slice(4);
+    prefix += "www.";
+  }
+  if (options.trimSlash && spec.endsWith("/")) {
    spec = spec.slice(0, -1);
+    suffix += "/";
  }
-  if (trimSlash && spec.endsWith("/")) {
+  if (options.trimEmptyQuery && spec.endsWith("?")) {
    spec = spec.slice(0, -1);
+    suffix += "?";
  }
-  return spec;
+  return [spec, prefix, suffix];
 }

 /**
@ -516,18 +540,33 @@ function stripHttpAndTrim(spec, trimSlash = true) {
 *          compare keys.
 */
 function makeKeyForMatch(match) {
-  // For autofill entries, we need to have a key based on the comment rather
-  // than the value field, because the latter may have been trimmed.
+  // For autofill entries, we need to have a key based on the finalCompleteValue
+  // rather than the value field, because the latter may have been trimmed.
+  let key, prefix;
  if (match.style && match.style.includes("autofill")) {
-    return [stripHttpAndTrim(match.comment), null];
+    [key, prefix] = stripPrefixAndTrim(match.finalCompleteValue, {
+      stripHttp: true,
+      stripHttps: true,
+      stripWww: true,
+      trimEmptyQuery: true,
+      trimSlash: true,
+    });
+
+    return [key, prefix, null];
  }

  let action = PlacesUtils.parseActionUrl(match.value);
  if (!action) {
-    return [stripHttpAndTrim(match.value), null];
+    [key, prefix] = stripPrefixAndTrim(match.value, {
+      stripHttp: true,
+      stripHttps: true,
+      stripWww: true,
+      trimEmptyQuery: true,
+      trimSlash: true,
+    });
+    return [key, prefix, null];
  }

-  let key;
  switch (action.type) {
    case "searchengine":
      // We want to exclude search suggestion matches that simply echo back the
@ -542,11 +581,17 @@ function makeKeyForMatch(match) {
      ];
      break;
    default:
-      key = stripHttpAndTrim(action.params.url || match.value);
+      [key, prefix] = stripPrefixAndTrim(action.params.url || match.value, {
+        stripHttp: true,
+        stripHttps: true,
+        stripWww: true,
+        trimEmptyQuery: true,
+        trimSlash: true,
+      });
      break;
  }

-  return [key, action];
+  return [key, prefix, action];
 }

 /**
@ -663,7 +708,7 @@ function Search(
  let unescapedSearchString = Services.textToSubURI.unEscapeURIForUI(
    this._trimmedOriginalSearchString
  );
-  let [prefix, suffix] = stripPrefix(unescapedSearchString);
+  let [prefix, suffix] = stripAnyPrefix(unescapedSearchString);
  this._searchString = suffix;
  this._strippedPrefix = prefix.toLowerCase();

@ -1346,7 +1391,7 @@ Search.prototype = {
    this._result.setDefaultIndex(0);

    let url = matchedSite.uri.spec;
-    let value = stripPrefix(url)[1];
+    let value = stripAnyPrefix(url)[1];
    value = value.substr(value.indexOf(this._searchString));

    this._addAutofillMatch(value, url, Infinity, ["preloaded-top-site"]);
@ -2193,16 +2238,34 @@ Search.prototype = {
    this.notifyResult(true, match.type == UrlbarUtils.RESULT_GROUP.HEURISTIC);
  },

+  // Ranks a URL prefix from 3 - 0 with the following preferences:
+  // https:// > https://www. > http:// > http://www.
+  // Higher is better.
+  // Returns -1 if the prefix does not match any of the above.
+  _getPrefixRank(prefix) {
+    return ["http://www.", "http://", "https://www.", "https://"].indexOf(
+      prefix
+    );
+  },
+
+  /**
+   * Check for duplicates and either discard the duplicate or replace the
+   * original match, in case the new one is more specific. For example,
+   * a Remote Tab wins over History, and a Switch to Tab wins over a Remote Tab.
+   * We must check both id and url for duplication, because keywords may change
+   * the url by replacing the %s placeholder.
+   * @param match
+   * @returns {object} matchPosition
+   * @returns {number} matchPosition.index
+   *   The index the match should take in the results. Return -1 if the match
+   *   should be discarded.
+   * @returns {boolean} matchPosition.replace
+   *   True if the match should replace the result already at
+   *   matchPosition.index.
+   *
+   */
  _getInsertIndexForMatch(match) {
-    // Check for duplicates and either discard (by returning -1) the duplicate
-    // or suggest to replace the original match, in case the new one is more
-    // specific (for example a Remote Tab wins over History, and a Switch to Tab
-    // wins over a Remote Tab).
-    // Must check both id and url, cause keywords dynamically modify the url.
-    // Note: this partially fixes Bug 1222435,  but not if the urls differ more
-    // than just by "http://". We should still evaluate www and other schemes
-    // equivalences.
-    let [urlMapKey, action] = makeKeyForMatch(match);
+    let [urlMapKey, prefix, action] = makeKeyForMatch(match);
    if (
      (match.placeId && this._usedPlaceIds.has(match.placeId)) ||
      this._usedURLs.some(e => ObjectUtils.deepEqual(e.key, urlMapKey))
@ -2233,13 +2296,112 @@ Search.prototype = {
              continue;
            }
            if (!matchAction || action.type == "switchtab") {
-              this._usedURLs[i] = { key: urlMapKey, action, type: match.type };
+              this._usedURLs[i] = {
+                key: urlMapKey,
+                action,
+                type: match.type,
+                prefix,
+                comment: match.comment,
+              };
              return { index: i, replace: true };
            }
            break; // Found the duplicate, no reason to continue.
          }
        }
+      } else {
+        // Dedupe with this flow:
+        // 1. If the two URLs are the same, dedupe whichever is not the
+        //    heuristic result.
+        // 2. If they both contain www. or both do not contain it, prefer https.
+        // 3. If they differ by www.:
+        //    3a. If the page titles are different, keep both. This is a guard
+        //        against deduping when www.site.com and site.com have different
+        //        content.
+        //    3b. Otherwise, dedupe based on the priorities in _getPrefixRank.
+        let prefixRank = this._getPrefixRank(prefix);
+        for (let i = 0; i < this._usedURLs.length; ++i) {
+          if (!this._usedURLs[i]) {
+            // This is true when the result at [i] is a searchengine result.
+            continue;
+          }
+
+          let {
+            key: existingKey,
+            prefix: existingPrefix,
+            type: existingType,
+            comment: existingComment,
+          } = this._usedURLs[i];
+
+          let existingPrefixRank = this._getPrefixRank(existingPrefix);
+          if (ObjectUtils.deepEqual(existingKey, urlMapKey)) {
+            isDupe = true;
+
+            if (prefix == existingPrefix) {
+              // The URLs are identical. Throw out the new result, unless it's
+              // the heuristic.
+              if (match.type != UrlbarUtils.RESULT_GROUP.HEURISTIC) {
+                break; // Replace match.
+              } else {
+                this._usedURLs[i] = {
+                  key: urlMapKey,
+                  action,
+                  type: match.type,
+                  prefix,
+                  comment: match.comment,
+                };
+                return { index: i, replace: true };
+              }
+            }
+
+            if (prefix.endsWith("www.") == existingPrefix.endsWith("www.")) {
+              // The results differ only by protocol.
+
+              if (match.type == UrlbarUtils.RESULT_GROUP.HEURISTIC) {
+                isDupe = false;
+                continue;
+              }
+
+              if (prefixRank <= existingPrefixRank) {
+                break; // Replace match.
+              } else if (existingType != UrlbarUtils.RESULT_GROUP.HEURISTIC) {
+                this._usedURLs[i] = {
+                  key: urlMapKey,
+                  action,
+                  type: match.type,
+                  prefix,
+                  comment: match.comment,
+                };
+                return { index: i, replace: true };
+              } else {
+                isDupe = false;
+                continue;
+              }
+            } else {
+              // If either result is the heuristic, this will be true and we
+              // will keep both results.
+              if (match.comment != existingComment) {
+                isDupe = false;
+                continue;
+              }
+
+              if (prefixRank <= existingPrefixRank) {
+                break; // Replace match.
+              } else {
+                this._usedURLs[i] = {
+                  key: urlMapKey,
+                  action,
+                  type: match.type,
+                  prefix,
+                  comment: match.comment,
+                };
+                return { index: i, replace: true };
+              }
+            }
+          }
+        }
      }
+
+      // Discard the duplicate.
      if (isDupe) {
        return { index: -1, replace: false };
      }
@ -2313,7 +2475,13 @@ Search.prototype = {
      bucket.insertIndex++;
      break;
    }
-    this._usedURLs[index] = { key: urlMapKey, action, type: match.type };
+    this._usedURLs[index] = {
+      key: urlMapKey,
+      action,
+      type: match.type,
+      prefix,
+      comment: match.comment || "",
+    };
    return { index, replace };
  },

@ -2425,10 +2593,12 @@ Search.prototype = {
    // that the user knows exactly where the match will take them.  To make it
    // look a little nicer, remove "http://", and if the user typed a host
    // without a trailing slash, remove any trailing slash, too.
-    let comment = stripHttpAndTrim(
-      finalCompleteValue,
-      !this._searchString.includes("/")
-    );
+    let [comment] = stripPrefixAndTrim(finalCompleteValue, {
+      stripHttp: true,
+      trimEmptyQuery: true,
+      trimSlash: !this._searchString.includes("/"),
+    });
+
    this._addMatch({
      value: this._strippedPrefix + autofilledValue,
      finalCompleteValue,
--- a/toolkit/components/places/tests/unifiedcomplete/autofill_tasks.js
+++ b/toolkit/components/places/tests/unifiedcomplete/autofill_tasks.js
@ -434,11 +434,6 @@ function addAutofillTasks(origins) {
          comment: "https://" + comment,
          style: ["autofill", "heuristic"],
        },
-        {
-          value: "http://" + url,
-          comment: "test visit for http://" + url,
-          style: ["favicon"],
-        },
      ],
    });
    await cleanup();
@ -479,11 +474,6 @@ function addAutofillTasks(origins) {
          comment: "https://" + comment,
          style: ["autofill", "heuristic"],
        },
-        {
-          value: "http://" + url,
-          comment: "test visit for http://" + url,
-          style: ["favicon"],
-        },
      ],
    });

@ -524,11 +514,6 @@ function addAutofillTasks(origins) {
          comment: "https://www." + comment,
          style: ["autofill", "heuristic"],
        },
-        {
-          value: "http://" + url,
-          comment: "test visit for http://" + url,
-          style: ["favicon"],
-        },
        {
          value: "https://" + url,
          comment: "test visit for https://" + url,
--- a/toolkit/components/places/tests/unifiedcomplete/test_autofill_origins.js
+++ b/toolkit/components/places/tests/unifiedcomplete/test_autofill_origins.js
@ -306,11 +306,6 @@ add_task(async function groupByHost() {
        comment: "https://example.com",
        style: ["autofill", "heuristic"],
      },
-      {
-        value: "http://example.com/",
-        comment: "test visit for http://example.com/",
-        style: ["favicon"],
-      },
    ],
  });

@ -375,11 +370,6 @@ add_task(async function groupByHostNonDefaultStddevMultiplier() {
        comment: "https://example.com",
        style: ["autofill", "heuristic"],
      },
-      {
-        value: "http://example.com/",
-        comment: "test visit for http://example.com/",
-        style: ["favicon"],
-      },
    ],
  });

--- a/toolkit/components/places/tests/unifiedcomplete/test_dedupe_prefix.js
+++ b/toolkit/components/places/tests/unifiedcomplete/test_dedupe_prefix.js
@ -0,0 +1,113 @@
+/* Any copyright is dedicated to the Public Domain.
+ * http://creativecommons.org/publicdomain/zero/1.0/ */
+
+// Testing that we dedupe results that have the same URL and title as another
+// except for their prefix (e.g. http://www.).
+add_task(async function dedupe_prefix() {
+  // We need to set the title or else we won't dedupe. We only dedupe when
+  // titles match up to mitigate deduping when the www. version of a site is
+  // completely different from it's www-less counterpart and thus presumably
+  // has a different title.
+  await PlacesTestUtils.addVisits([
+    {
+      uri: "http://example.com/foo/",
+      title: "Example Page",
+    },
+    {
+      uri: "http://www.example.com/foo/",
+      title: "Example Page",
+    },
+    {
+      uri: "https://example.com/foo/",
+      title: "Example Page",
+    },
+    {
+      uri: "https://www.example.com/foo/",
+      title: "Example Page",
+    },
+    {
+      uri: "https://www.example.com/foo/",
+      title: "Example Page",
+    },
+  ]);
+
+  // We should get https://www. as the heuristic result and https:// in the
+  // results since the latter's prefix is a higher priority.
+  await check_autocomplete({
+    search: "example.com/foo/",
+    autofilled: "example.com/foo/",
+    completed: "https://www.example.com/foo/",
+    matches: [
+      {
+        value: "example.com/foo/",
+        comment: "https://www.example.com/foo/",
+        style: ["autofill", "heuristic"],
+      },
+      {
+        value: "https://example.com/foo/",
+        comment: "Example Page",
+      },
+    ],
+  });
+
+  // Add more visits to the lowest-priority prefix. It should be the heuristic
+  // result but we should still show our highest-priority result. https://www.
+  // should not appear at all.
+  for (let i = 0; i < 3; i++) {
+    await PlacesTestUtils.addVisits([
+      {
+        uri: "http://www.example.com/foo/",
+        title: "Example Page",
+      },
+    ]);
+  }
+
+  await check_autocomplete({
+    search: "example.com/foo/",
+    autofilled: "example.com/foo/",
+    completed: "http://www.example.com/foo/",
+    matches: [
+      {
+        value: "example.com/foo/",
+        comment: "www.example.com/foo/",
+        style: ["autofill", "heuristic"],
+      },
+      {
+        value: "https://example.com/foo/",
+        comment: "Example Page",
+      },
+    ],
+  });
+
+  // Add enough https:// vists for it to have the highest frecency. It should
+  // be the heuristic result. We should still get the https://www. result
+  // because we still show results with the same key and protocol if they differ
+  // from the heuristic result in having www.
+  for (let i = 0; i < 5; i++) {
+    await PlacesTestUtils.addVisits([
+      {
+        uri: "https://example.com/foo/",
+        title: "Example Page",
+      },
+    ]);
+  }
+
+  await check_autocomplete({
+    search: "example.com/foo/",
+    autofilled: "example.com/foo/",
+    completed: "https://example.com/foo/",
+    matches: [
+      {
+        value: "example.com/foo/",
+        comment: "https://example.com/foo/",
+        style: ["autofill", "heuristic"],
+      },
+      {
+        value: "https://www.example.com/foo/",
+        comment: "Example Page",
+      },
+    ],
+  });
+
+  await cleanup();
+});
--- a/toolkit/components/places/tests/unifiedcomplete/test_swap_protocol.js
+++ b/toolkit/components/places/tests/unifiedcomplete/test_swap_protocol.js
@ -31,12 +31,11 @@ add_task(async function test_swap_protocol() {
    { uri: uri8, title: "title" },
  ]);

+  // uri1, uri2, and uri5 won't appear since they are duplicates of uri6, except
+  // for their prefixes.
  let allMatches = [
-    { uri: uri1, title: "title" },
-    { uri: uri2, title: "title" },
    { uri: uri3, title: "title" },
    { uri: uri4, title: "title" },
-    { uri: uri5, title: "title" },
    { uri: uri6, title: "title" },
  ];

@ -47,10 +46,7 @@ add_task(async function test_swap_protocol() {
  info("http://www.site matches 'www.site' pages");
  await check_autocomplete({
    search: "http://www.site",
-    matches: [
-      { uri: uri1, title: "title" },
-      { uri: uri5, title: "title" },
-    ],
+    matches: [{ uri: uri5, title: "title" }],
  });

  info("http://site matches all site");
@ -74,10 +70,7 @@ add_task(async function test_swap_protocol() {
  info("https://www.site matches all site");
  await check_autocomplete({
    search: "https://www.site",
-    matches: [
-      { uri: uri1, title: "title" },
-      { uri: uri5, title: "title" },
-    ],
+    matches: [{ uri: uri5, title: "title" }],
  });

  info("https://site matches all site");
@ -89,17 +82,13 @@ add_task(async function test_swap_protocol() {
  info("www.site matches 'www.site' pages");
  await check_autocomplete({
    search: "www.site",
-    matches: [
-      { uri: uri1, title: "title" },
-      { uri: uri5, title: "title" },
-    ],
+    matches: [{ uri: uri5, title: "title" }],
  });

  info("w matches 'w' pages, including 'www'");
  await check_autocomplete({
    search: "w",
    matches: [
-      { uri: uri1, title: "title" },
      { uri: uri5, title: "title" },
      { uri: uri7, title: "title" },
      { uri: uri8, title: "title" },
@ -110,7 +99,6 @@ add_task(async function test_swap_protocol() {
  await check_autocomplete({
    search: "http://w",
    matches: [
-      { uri: uri1, title: "title" },
      { uri: uri5, title: "title" },
      { uri: uri7, title: "title" },
      { uri: uri8, title: "title" },
@ -126,21 +114,13 @@ add_task(async function test_swap_protocol() {
  info("ww matches no 'ww' pages, including 'www'");
  await check_autocomplete({
    search: "ww",
-    matches: [
-      { uri: uri1, title: "title" },
-      { uri: uri5, title: "title" },
-      { uri: uri8, title: "title" },
-    ],
+    matches: [{ uri: uri5, title: "title" }, { uri: uri8, title: "title" }],
  });

  info("http://ww matches no 'ww' pages, including 'www'");
  await check_autocomplete({
    search: "http://ww",
-    matches: [
-      { uri: uri1, title: "title" },
-      { uri: uri5, title: "title" },
-      { uri: uri8, title: "title" },
-    ],
+    matches: [{ uri: uri5, title: "title" }, { uri: uri8, title: "title" }],
  });

  info("http://www.ww matches nothing");
@ -152,21 +132,13 @@ add_task(async function test_swap_protocol() {
  info("www matches 'www' pages");
  await check_autocomplete({
    search: "www",
-    matches: [
-      { uri: uri1, title: "title" },
-      { uri: uri5, title: "title" },
-      { uri: uri8, title: "title" },
-    ],
+    matches: [{ uri: uri5, title: "title" }, { uri: uri8, title: "title" }],
  });

  info("http://www matches 'www' pages");
  await check_autocomplete({
    search: "http://www",
-    matches: [
-      { uri: uri1, title: "title" },
-      { uri: uri5, title: "title" },
-      { uri: uri8, title: "title" },
-    ],
+    matches: [{ uri: uri5, title: "title" }, { uri: uri8, title: "title" }],
  });

  info("http://www.www matches nothing");
--- a/toolkit/components/places/tests/unifiedcomplete/xpcshell.ini
+++ b/toolkit/components/places/tests/unifiedcomplete/xpcshell.ini
@ -23,6 +23,7 @@ support-files =
 [test_avoid_middle_complete.js]
 [test_avoid_stripping_to_empty_tokens.js]
 [test_casing.js]
+[test_dedupe_prefix.js]
 [test_do_not_trim.js]
 [test_download_embed_bookmarks.js]
 [test_dupe_urls.js]