// gecko-dev/browser/components/urlbar/UrlbarTokenizer.jsm

/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
"use strict";
/**
* This module exports a tokenizer to be used by the urlbar model.
* Emitted tokens are objects in the shape { type, value }, where type is one
* of UrlbarTokenizer.TYPE.
*/
var EXPORTED_SYMBOLS = ["UrlbarTokenizer"];
const { XPCOMUtils } = ChromeUtils.import(
"resource://gre/modules/XPCOMUtils.jsm"
);
const { Services } = ChromeUtils.import("resource://gre/modules/Services.jsm");
ChromeUtils.defineModuleGetter(this, "Log", "resource://gre/modules/Log.jsm");
XPCOMUtils.defineLazyGetter(this, "logger", () =>
Log.repository.getLogger("Urlbar.Tokenizer")
);
var UrlbarTokenizer = {
// Regex matching on whitespaces.
REGEXP_SPACES: /\s+/,
// Regex used to guess url-like strings.
  // These are not expected to be 100% correct; we accept some user mistypes,
  // and we're unlikely to cover 100% of the cases.
  REGEXP_LIKE_PROTOCOL: /^[A-Z+.-]+:\/{0,2}(?!\/)/i,
REGEXP_USERINFO_INVALID_CHARS: /[^\w.~%!$&'()*+,;=:-]/,
REGEXP_HOSTPORT_INVALID_CHARS: /[^\[\]A-Z0-9.:-]/i,
  REGEXP_SINGLE_WORD_HOST: /^[^.:]+$/i,
REGEXP_HOSTPORT_IP_LIKE: /^(?=(.*[.:].*){2})[a-f0-9\.\[\]:]+$/i,
// This accepts partial IPv4.
REGEXP_HOSTPORT_INVALID_IP: /\.{2,}|\d{5,}|\d{4,}(?![:\]])|^\.|^(\d+\.){4,}\d+$|^\d{4,}$/,
// This only accepts complete IPv4.
  REGEXP_HOSTPORT_IPV4: /^(\d{1,3}\.){3}\d{1,3}(:\d+)?$/,
// This accepts partial IPv6.
REGEXP_HOSTPORT_IPV6: /^\[([0-9a-f]{0,4}:){0,7}[0-9a-f]{0,4}\]?$/i,
REGEXP_COMMON_EMAIL: /^[\w!#$%&'*+\/=?^`{|}~-]+@[\[\]A-Z0-9.-]+$/i,
// Regex matching a percent encoded char at the beginning of a string.
REGEXP_PERCENT_ENCODED_START: /^(%[0-9a-f]{2}){2,}/i,
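  // Illustrative matches for the heuristics above (hand-checked sketches, not
  // an exhaustive specification):
  //   REGEXP_LIKE_PROTOCOL   matches "http://", "about:", "view-source:";
  //                          it does not match "mozilla.org" (no colon).
  //   REGEXP_HOSTPORT_IPV4   matches "127.0.0.1" and "127.0.0.1:8888".
  //   REGEXP_HOSTPORT_IPV6   matches "[::1]" and partial forms like "[2001:db8:".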
TYPE: {
TEXT: 1,
    POSSIBLE_ORIGIN: 2, // It may be an IP, a domain, or even just a single word used as a host.
POSSIBLE_URL: 3, // Consumers should still check this with a fixup.
RESTRICT_HISTORY: 4,
RESTRICT_BOOKMARK: 5,
RESTRICT_TAG: 6,
RESTRICT_OPENPAGE: 7,
RESTRICT_SEARCH: 8,
RESTRICT_TITLE: 9,
RESTRICT_URL: 10,
},
// The special characters below can be typed into the urlbar to restrict
// the search to a certain category, like history, bookmarks or open pages; or
// to force a match on just the title or url.
  // These restriction characters can be typed alone, or at word boundaries,
  // provided their meaning cannot be confused; for example, # could be present
  // in a valid url, and thus it should not be interpreted as a restriction.
RESTRICT: {
HISTORY: "^",
BOOKMARK: "*",
TAG: "+",
OPENPAGE: "%",
SEARCH: "?",
TITLE: "#",
URL: "$",
},
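  // For example, typing "^ firefox" (or "firefox ^") restricts matches to
  // history, while "firefox #" restricts matching to page titles.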
/**
   * Returns whether the passed-in token looks like a URL.
   * This is based on guessing and heuristics; that means if this function
   * returns false, the token is surely not a URL, while if it returns true,
   * the result must still be verified through URIFixup.
*
* @param {string} token
* The string token to verify
* @param {object} options {
* requirePath: the url must have a path
* }
* @returns {boolean} whether the token looks like a URL.
*/
looksLikeUrl(token, options = {}) {
if (token.length < 2) {
return false;
}
// It should be a single word.
if (this.REGEXP_SPACES.test(token)) {
return false;
}
// If it starts with something that looks like a protocol, it's likely a url.
if (this.REGEXP_LIKE_PROTOCOL.test(token)) {
return true;
}
    // Guess path and prePath. At this point we should be analyzing strings
    // that don't have a protocol.
let slashIndex = token.indexOf("/");
let prePath = slashIndex != -1 ? token.slice(0, slashIndex) : token;
if (!this.looksLikeOrigin(prePath)) {
return false;
}
let path = slashIndex != -1 ? token.slice(slashIndex) : "";
logger.debug("path", path);
if (options.requirePath && !path) {
return false;
}
// If there are both path and userinfo, it's likely a url.
let atIndex = prePath.indexOf("@");
let userinfo = atIndex != -1 ? prePath.slice(0, atIndex) : "";
if (path.length && userinfo.length) {
return true;
}
// If the first character after the slash in the path is a letter, then the
// token may be an "abc/def" url.
if (/^\/[a-z]/i.test(path)) {
return true;
}
// If the path contains special chars, it is likely a url.
if (["%", "?", "#"].some(c => path.includes(c))) {
return true;
}
    // The above looksLikeOrigin call told us the prePath looks like an origin;
    // now we go into more detail, checking some common origins.
let hostPort = atIndex != -1 ? prePath.slice(atIndex + 1) : prePath;
if (this.REGEXP_HOSTPORT_IPV4.test(hostPort)) {
return true;
}
    // IPv6 is very complex to support; just check for a few chars.
if (
this.REGEXP_HOSTPORT_IPV6.test(hostPort) &&
["[", "]", ":"].some(c => hostPort.includes(c))
) {
return true;
}
if (Services.uriFixup.isDomainWhitelisted(hostPort, -1)) {
return true;
}
return false;
},
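  // Illustrative results (a sketch of assumed behavior; real consumers must
  // still confirm through URIFixup):
  //   looksLikeUrl("mozilla.org/firefox")  // true: origin followed by a path
  //   looksLikeUrl("about:config")         // true: protocol-like prefix
  //   looksLikeUrl("127.0.0.1:8888")       // true: IPv4 host and port
  //   looksLikeUrl("two words")            // false: contains whitespace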
/**
* Returns whether the passed in token looks like an origin.
   * This is based on guessing and heuristics; that means if this function
   * returns false, the token is surely not an origin, while if it returns
   * true, the result must still be verified through URIFixup.
*
* @param {string} token
* The string token to verify
* @returns {boolean} whether the token looks like an origin.
*/
looksLikeOrigin(token) {
if (!token.length) {
return false;
}
let atIndex = token.indexOf("@");
if (atIndex != -1 && this.REGEXP_COMMON_EMAIL.test(token)) {
// We prefer handling it as an email rather than an origin with userinfo.
return false;
}
let userinfo = atIndex != -1 ? token.slice(0, atIndex) : "";
let hostPort = atIndex != -1 ? token.slice(atIndex + 1) : token;
logger.debug("userinfo", userinfo);
logger.debug("hostPort", hostPort);
if (
this.REGEXP_HOSTPORT_IPV4.test(hostPort) ||
this.REGEXP_HOSTPORT_IPV6.test(hostPort)
) {
return true;
}
// Check for invalid chars.
return (
!this.REGEXP_LIKE_PROTOCOL.test(hostPort) &&
!this.REGEXP_USERINFO_INVALID_CHARS.test(userinfo) &&
!this.REGEXP_HOSTPORT_INVALID_CHARS.test(hostPort) &&
(this.REGEXP_SINGLE_WORD_HOST.test(hostPort) ||
!this.REGEXP_HOSTPORT_IP_LIKE.test(hostPort) ||
!this.REGEXP_HOSTPORT_INVALID_IP.test(hostPort))
);
},
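  // Illustrative results (a sketch of assumed behavior):
  //   looksLikeOrigin("localhost")         // true: single word host
  //   looksLikeOrigin("[::1]")             // true: IPv6-like hostPort
  //   looksLikeOrigin("user@example.com")  // false: treated as an email
  //   looksLikeOrigin("what?")             // false: invalid host characters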
/**
* Tokenizes the searchString from a UrlbarQueryContext.
* @param {UrlbarQueryContext} queryContext
* The query context object to tokenize
* @returns {UrlbarQueryContext} the same query context object with a new
* tokens property.
*/
tokenize(queryContext) {
logger.info("Tokenizing", queryContext);
let searchString = queryContext.searchString;
if (!searchString.trim()) {
queryContext.tokens = [];
return queryContext;
}
let unfiltered = splitString(searchString);
let tokens = filterTokens(unfiltered);
queryContext.tokens = tokens;
return queryContext;
},
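  // For example, tokenizing "mozilla.org/firefox ^" is expected to yield
  // (a sketch, eliding the other token properties):
  //   [{ value: "mozilla.org/firefox", type: TYPE.POSSIBLE_URL, ... },
  //    { value: "^", type: TYPE.RESTRICT_HISTORY, ... }]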
/**
   * Given a token object, tells whether it's a restriction token.
   * @param {object} token
   * @returns {boolean} Whether the token is a restriction token.
*/
isRestrictionToken(token) {
return (
token.type >= this.TYPE.RESTRICT_HISTORY &&
token.type <= this.TYPE.RESTRICT_URL
);
},
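  // For example (illustrative):
  //   UrlbarTokenizer.isRestrictionToken({ type: UrlbarTokenizer.TYPE.RESTRICT_SEARCH }) // true
  //   UrlbarTokenizer.isRestrictionToken({ type: UrlbarTokenizer.TYPE.TEXT })            // false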
};
const CHAR_TO_TYPE_MAP = new Map(
Object.entries(UrlbarTokenizer.RESTRICT).map(([type, char]) => [
char,
UrlbarTokenizer.TYPE[`RESTRICT_${type}`],
])
);
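// For example, CHAR_TO_TYPE_MAP.get("^") yields
// UrlbarTokenizer.TYPE.RESTRICT_HISTORY.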
/**
* Given a search string, splits it into string tokens.
* @param {string} searchString
* The search string to split
* @returns {array} An array of string tokens.
*/
function splitString(searchString) {
  // The first step is splitting on Unicode whitespace.
let tokens = searchString.trim().split(UrlbarTokenizer.REGEXP_SPACES);
let accumulator = [];
let hasRestrictionToken = tokens.some(t => CHAR_TO_TYPE_MAP.has(t));
let chars = Array.from(CHAR_TO_TYPE_MAP.keys()).join("");
logger.debug("Restriction chars", chars);
for (let i = 0; i < tokens.length; ++i) {
    // If there is no separate restriction token, it's possible we have to
    // split a token, if it's the first one and it includes a leading
    // restriction char, or it's the last one and it includes a trailing
    // restriction char. This avoids requiring the user to add artificial
    // whitespace to enforce restrictions; for example, typing "questions?"
    // would restrict to search results.
let token = tokens[i];
if (!hasRestrictionToken && token.length > 1) {
// Check for an unambiguous restriction char at the beginning of the
// first token, or at the end of the last token.
if (
i == 0 &&
chars.includes(token[0]) &&
!UrlbarTokenizer.REGEXP_PERCENT_ENCODED_START.test(token)
) {
hasRestrictionToken = true;
accumulator.push(token[0]);
accumulator.push(token.slice(1));
continue;
} else if (
i == tokens.length - 1 &&
chars.includes(token[token.length - 1]) &&
!UrlbarTokenizer.looksLikeUrl(token, { requirePath: true })
) {
hasRestrictionToken = true;
accumulator.push(token.slice(0, token.length - 1));
accumulator.push(token[token.length - 1]);
continue;
}
}
accumulator.push(token);
}
logger.info("Found tokens", accumulator);
return accumulator;
}
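// Illustrative splits (a sketch of assumed behavior):
//   splitString("firefox ^")   // ["firefox", "^"]: whitespace split only
//   splitString("^firefox")    // ["^", "firefox"]: leading restriction char
//   splitString("questions?")  // ["questions", "?"]: trailing restriction char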
/**
 * Given an array of unfiltered tokens, this function filters them and converts
 * them to token objects with a type.
*
* @param {array} tokens
* An array of strings, representing search tokens.
* @returns {array} An array of token objects.
 * @note Restriction characters are only considered if they appear at the start
 *       or at the end of the tokens list. In case of conflicting restriction
 *       characters, the outermost ones win; leading ones win over trailing
 *       ones. Discarded restriction characters are considered text.
*/
function filterTokens(tokens) {
let filtered = [];
let restrictions = [];
for (let i = 0; i < tokens.length; ++i) {
let token = tokens[i];
let tokenObj = {
value: token,
lowerCaseValue: token.toLocaleLowerCase(),
type: UrlbarTokenizer.TYPE.TEXT,
};
let restrictionType = CHAR_TO_TYPE_MAP.get(token);
if (restrictionType) {
restrictions.push({ index: i, type: restrictionType });
} else if (UrlbarTokenizer.looksLikeOrigin(token)) {
tokenObj.type = UrlbarTokenizer.TYPE.POSSIBLE_ORIGIN;
} else if (UrlbarTokenizer.looksLikeUrl(token, { requirePath: true })) {
tokenObj.type = UrlbarTokenizer.TYPE.POSSIBLE_URL;
}
filtered.push(tokenObj);
}
// Handle restriction characters.
if (restrictions.length) {
    // We can apply two kinds of restrictions: type (bookmark, search, ...) and
    // matching (url, title). These kinds of restrictions can be combined, but
    // we can only have one restriction per kind.
let matchingRestrictionFound = false;
let typeRestrictionFound = false;
function assignRestriction(r) {
if (r && !(matchingRestrictionFound && typeRestrictionFound)) {
if (
[
UrlbarTokenizer.TYPE.RESTRICT_TITLE,
UrlbarTokenizer.TYPE.RESTRICT_URL,
].includes(r.type)
) {
if (!matchingRestrictionFound) {
matchingRestrictionFound = true;
filtered[r.index].type = r.type;
return true;
}
} else if (!typeRestrictionFound) {
typeRestrictionFound = true;
filtered[r.index].type = r.type;
return true;
}
}
return false;
}
// Look at the first token.
let found = assignRestriction(restrictions.find(r => r.index == 0));
if (found) {
// If the first token was assigned, look at the next one.
assignRestriction(restrictions.find(r => r.index == 1));
}
// Then look at the last token.
let lastIndex = tokens.length - 1;
found = assignRestriction(restrictions.find(r => r.index == lastIndex));
if (found) {
// If the last token was assigned, look at the previous one.
assignRestriction(restrictions.find(r => r.index == lastIndex - 1));
}
}
logger.info("Filtered Tokens", tokens);
return filtered;
}
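// Illustrative end-to-end run (a sketch of assumed behavior): filtering the
// split tokens ["$", "mozilla", "^"] assigns the leading "$" as RESTRICT_URL
// (a matching restriction) and the trailing "^" as RESTRICT_HISTORY (a type
// restriction); the two kinds can be combined, so both survive, while
// "mozilla" is typed as POSSIBLE_ORIGIN.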