// gecko-dev/browser/components/urlbar/UrlbarTokenizer.jsm

/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
"use strict";
/**
* This module exports a tokenizer to be used by the urlbar model.
* Emitted tokens are objects in the shape { type, value }, where type is one
* of UrlbarTokenizer.TYPE.
*/
var EXPORTED_SYMBOLS = ["UrlbarTokenizer"];
const { XPCOMUtils } = ChromeUtils.import(
"resource://gre/modules/XPCOMUtils.jsm"
);
const { Services } = ChromeUtils.import("resource://gre/modules/Services.jsm");
ChromeUtils.defineModuleGetter(this, "Log", "resource://gre/modules/Log.jsm");
XPCOMUtils.defineLazyGetter(this, "logger", () =>
Log.repository.getLogger("Urlbar.Tokenizer")
);
var UrlbarTokenizer = {
// Regex matching on whitespaces.
REGEXP_SPACES: /\s+/,
// Regex used to guess url-like strings.
  // These are not expected to be 100% correct; we accept some user mistypes,
  // and we're unlikely to cover 100% of the cases.
  REGEXP_LIKE_PROTOCOL: /^[A-Z+.-]+:\/{0,2}(?!\/)/i,
REGEXP_USERINFO_INVALID_CHARS: /[^\w.~%!$&'()*+,;=:-]/,
REGEXP_HOSTPORT_INVALID_CHARS: /[^\[\]A-Z0-9.:-]/i,
  REGEXP_SINGLE_WORD_HOST: /^[^.:]+$/i,
REGEXP_HOSTPORT_IP_LIKE: /^(?=(.*[.:].*){2})[a-f0-9\.\[\]:]+$/i,
// This accepts partial IPv4.
REGEXP_HOSTPORT_INVALID_IP: /\.{2,}|\d{5,}|\d{4,}(?![:\]])|^\.|^(\d+\.){4,}\d+$|^\d{4,}$/,
// This only accepts complete IPv4.
  REGEXP_HOSTPORT_IPV4: /^(\d{1,3}\.){3}\d{1,3}(:\d+)?$/,
// This accepts partial IPv6.
REGEXP_HOSTPORT_IPV6: /^\[([0-9a-f]{0,4}:){0,7}[0-9a-f]{0,4}\]?$/i,
REGEXP_COMMON_EMAIL: /^[\w!#$%&'*+\/=?^`{|}~-]+@[\[\]A-Z0-9.-]+$/i,
// Regex matching a percent encoded char at the beginning of a string.
REGEXP_PERCENT_ENCODED_START: /^(%[0-9a-f]{2}){2,}/i,
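  // Illustrative matches for the heuristics above (hand-checked sketches, not
  // an exhaustive specification):
  //   REGEXP_LIKE_PROTOCOL   matches "http://", "about:", "view-source:";
  //                          it does not match "mozilla.org" (no colon).
  //   REGEXP_HOSTPORT_IPV4   matches "127.0.0.1" and "127.0.0.1:8888".
  //   REGEXP_HOSTPORT_IPV6   matches "[::1]" and partial forms like "[2001:db8:".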
TYPE: {
TEXT: 1,
    POSSIBLE_ORIGIN: 2, // It may be an IP, a domain, or even just a single word used as a host.
POSSIBLE_URL: 3, // Consumers should still check this with a fixup.
RESTRICT_HISTORY: 4,
RESTRICT_BOOKMARK: 5,
RESTRICT_TAG: 6,
RESTRICT_OPENPAGE: 7,
RESTRICT_SEARCH: 8,
RESTRICT_TITLE: 9,
RESTRICT_URL: 10,
},
// The special characters below can be typed into the urlbar to restrict
// the search to a certain category, like history, bookmarks or open pages; or
// to force a match on just the title or url.
  // These restriction characters can be typed alone, or at word boundaries,
  // provided their meaning cannot be confused; for example, # could be present
  // in a valid url, and thus it should not be interpreted as a restriction.
RESTRICT: {
HISTORY: "^",
BOOKMARK: "*",
TAG: "+",
OPENPAGE: "%",
SEARCH: "?",
TITLE: "#",
URL: "$",
},
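  // For example, typing "^ firefox" (or "firefox ^") restricts matches to
  // history, while "firefox #" restricts matching to page titles.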
/**
   * Returns whether the passed-in token looks like a URL.
   * This is based on guessing and heuristics; that means if this function
   * returns false, the token is surely not a URL, while if it returns true,
   * the result must still be verified through URIFixup.
*
* @param {string} token
* The string token to verify
* @param {object} options {
* requirePath: the url must have a path
* }
* @returns {boolean} whether the token looks like a URL.
*/
looksLikeUrl(token, options = {}) {
if (token.length < 2) {
return false;
}
// It should be a single word.
if (this.REGEXP_SPACES.test(token)) {
return false;
}
// If it starts with something that looks like a protocol, it's likely a url.
if (this.REGEXP_LIKE_PROTOCOL.test(token)) {
return true;
}
    // Guess path and prePath. At this point we should be analyzing strings
    // that don't have a protocol.
let slashIndex = token.indexOf("/");
let prePath = slashIndex != -1 ? token.slice(0, slashIndex) : token;
if (!this.looksLikeOrigin(prePath)) {
return false;
}
let path = slashIndex != -1 ? token.slice(slashIndex) : "";
logger.debug("path", path);
if (options.requirePath && !path) {
return false;
}
// If there are both path and userinfo, it's likely a url.
let atIndex = prePath.indexOf("@");
let userinfo = atIndex != -1 ? prePath.slice(0, atIndex) : "";
if (path.length && userinfo.length) {
return true;
}
// If the first character after the slash in the path is a letter, then the
// token may be an "abc/def" url.
if (/^\/[a-z]/i.test(path)) {
return true;
}
// If the path contains special chars, it is likely a url.
if (["%", "?", "#"].some(c => path.includes(c))) {
return true;
}
    // The above looksLikeOrigin call told us the prePath looks like an origin;
    // now we go into more detail, checking some common origins.
let hostPort = atIndex != -1 ? prePath.slice(atIndex + 1) : prePath;
if (this.REGEXP_HOSTPORT_IPV4.test(hostPort)) {
return true;
}
    // IPv6 is very complex to support; just check for a few chars.
if (
this.REGEXP_HOSTPORT_IPV6.test(hostPort) &&
["[", "]", ":"].some(c => hostPort.includes(c))
) {
return true;
}
if (Services.uriFixup.isDomainWhitelisted(hostPort, -1)) {
return true;
}
return false;
},
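  // Illustrative results (a sketch of assumed behavior; real consumers must
  // still confirm through URIFixup):
  //   looksLikeUrl("mozilla.org/firefox")  // true: origin followed by a path
  //   looksLikeUrl("about:config")         // true: protocol-like prefix
  //   looksLikeUrl("127.0.0.1:8888")       // true: IPv4 host and port
  //   looksLikeUrl("two words")            // false: contains whitespace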
/**
* Returns whether the passed in token looks like an origin.
   * This is based on guessing and heuristics; that means if this function
   * returns false, the token is surely not an origin, while if it returns
   * true, the result must still be verified through URIFixup.
*
* @param {string} token
* The string token to verify
* @returns {boolean} whether the token looks like an origin.
*/
looksLikeOrigin(token) {
if (!token.length) {
return false;
}
let atIndex = token.indexOf("@");
if (atIndex != -1 && this.REGEXP_COMMON_EMAIL.test(token)) {
// We prefer handling it as an email rather than an origin with userinfo.
return false;
}
let userinfo = atIndex != -1 ? token.slice(0, atIndex) : "";
let hostPort = atIndex != -1 ? token.slice(atIndex + 1) : token;
logger.debug("userinfo", userinfo);
logger.debug("hostPort", hostPort);
if (
this.REGEXP_HOSTPORT_IPV4.test(hostPort) ||
this.REGEXP_HOSTPORT_IPV6.test(hostPort)
) {
return true;
}
// Check for invalid chars.
return (
!this.REGEXP_LIKE_PROTOCOL.test(hostPort) &&
!this.REGEXP_USERINFO_INVALID_CHARS.test(userinfo) &&
!this.REGEXP_HOSTPORT_INVALID_CHARS.test(hostPort) &&
(this.REGEXP_SINGLE_WORD_HOST.test(hostPort) ||
!this.REGEXP_HOSTPORT_IP_LIKE.test(hostPort) ||
!this.REGEXP_HOSTPORT_INVALID_IP.test(hostPort))
);
},
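  // Illustrative results (a sketch of assumed behavior):
  //   looksLikeOrigin("localhost")         // true: single word host
  //   looksLikeOrigin("[::1]")             // true: IPv6-like hostPort
  //   looksLikeOrigin("user@example.com")  // false: treated as an email
  //   looksLikeOrigin("what?")             // false: invalid host characters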
/**
* Tokenizes the searchString from a UrlbarQueryContext.
* @param {UrlbarQueryContext} queryContext
* The query context object to tokenize
* @returns {UrlbarQueryContext} the same query context object with a new
* tokens property.
*/
tokenize(queryContext) {
logger.info("Tokenizing", queryContext);
let searchString = queryContext.searchString;
if (!searchString.trim()) {
queryContext.tokens = [];
return queryContext;
}
let unfiltered = splitString(searchString);
let tokens = filterTokens(unfiltered);
queryContext.tokens = tokens;
return queryContext;
},
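  // For example, tokenizing "mozilla.org/firefox ^" is expected to yield
  // (a sketch, eliding the other token properties):
  //   [{ value: "mozilla.org/firefox", type: TYPE.POSSIBLE_URL, ... },
  //    { value: "^", type: TYPE.RESTRICT_HISTORY, ... }]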
/**
   * Given a token object, tells whether it's a restriction token.
   * @param {object} token
   * @returns {boolean} Whether the token is a restriction token.
*/
isRestrictionToken(token) {
return (
token.type >= this.TYPE.RESTRICT_HISTORY &&
token.type <= this.TYPE.RESTRICT_URL
);
},
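  // For example (illustrative):
  //   UrlbarTokenizer.isRestrictionToken({ type: UrlbarTokenizer.TYPE.RESTRICT_SEARCH }) // true
  //   UrlbarTokenizer.isRestrictionToken({ type: UrlbarTokenizer.TYPE.TEXT })            // false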
};
const CHAR_TO_TYPE_MAP = new Map(
Object.entries(UrlbarTokenizer.RESTRICT).map(([type, char]) => [
char,
UrlbarTokenizer.TYPE[`RESTRICT_${type}`],
])
);
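// For example, CHAR_TO_TYPE_MAP.get("^") yields
// UrlbarTokenizer.TYPE.RESTRICT_HISTORY.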
/**
* Given a search string, splits it into string tokens.
* @param {string} searchString
* The search string to split
* @returns {array} An array of string tokens.
*/
function splitString(searchString) {
  // The first step is splitting on Unicode whitespace.
let tokens = searchString.trim().split(UrlbarTokenizer.REGEXP_SPACES);
let accumulator = [];
let hasRestrictionToken = tokens.some(t => CHAR_TO_TYPE_MAP.has(t));
let chars = Array.from(CHAR_TO_TYPE_MAP.keys()).join("");
logger.debug("Restriction chars", chars);
for (let i = 0; i < tokens.length; ++i) {
    // If there is no separate restriction token, it's possible we have to
    // split a token, if it's the first one and it includes a leading
    // restriction char, or it's the last one and it includes a trailing
    // restriction char. This avoids requiring the user to add artificial
    // whitespace to enforce restrictions; for example, typing "questions?"
    // would restrict to search results.
let token = tokens[i];
if (!hasRestrictionToken && token.length > 1) {
// Check for an unambiguous restriction char at the beginning of the
// first token, or at the end of the last token.
if (
i == 0 &&
chars.includes(token[0]) &&
!UrlbarTokenizer.REGEXP_PERCENT_ENCODED_START.test(token)
) {
hasRestrictionToken = true;
accumulator.push(token[0]);
accumulator.push(token.slice(1));
continue;
} else if (
i == tokens.length - 1 &&
chars.includes(token[token.length - 1]) &&
!UrlbarTokenizer.looksLikeUrl(token, { requirePath: true })
) {
hasRestrictionToken = true;
accumulator.push(token.slice(0, token.length - 1));
accumulator.push(token[token.length - 1]);
continue;
}
}
accumulator.push(token);
}
logger.info("Found tokens", accumulator);
return accumulator;
}
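// Illustrative splits (a sketch of assumed behavior):
//   splitString("firefox ^")   // ["firefox", "^"]: whitespace split only
//   splitString("^firefox")    // ["^", "firefox"]: leading restriction char
//   splitString("questions?")  // ["questions", "?"]: trailing restriction char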
/**
 * Given an array of unfiltered tokens, this function filters them and converts
 * them to token objects with a type.
*
* @param {array} tokens
* An array of strings, representing search tokens.
* @returns {array} An array of token objects.
 * @note Restriction characters are only considered if they appear at the start
 *       or at the end of the tokens list. In case of conflicting restriction
 *       characters, the outermost ones win; leading ones win over trailing
 *       ones. Discarded restriction characters are considered text.
*/
function filterTokens(tokens) {
let filtered = [];
let restrictions = [];
for (let i = 0; i < tokens.length; ++i) {
let token = tokens[i];
let tokenObj = {
value: token,
lowerCaseValue: token.toLocaleLowerCase(),
type: UrlbarTokenizer.TYPE.TEXT,
};
let restrictionType = CHAR_TO_TYPE_MAP.get(token);
if (restrictionType) {
restrictions.push({ index: i, type: restrictionType });
} else if (UrlbarTokenizer.looksLikeOrigin(token)) {
tokenObj.type = UrlbarTokenizer.TYPE.POSSIBLE_ORIGIN;
} else if (UrlbarTokenizer.looksLikeUrl(token, { requirePath: true })) {
tokenObj.type = UrlbarTokenizer.TYPE.POSSIBLE_URL;
}
filtered.push(tokenObj);
}
// Handle restriction characters.
if (restrictions.length) {
    // We can apply two kinds of restrictions: type (bookmark, search, ...) and
    // matching (url, title). These kinds of restrictions can be combined, but
    // we can only have one restriction per kind.
let matchingRestrictionFound = false;
let typeRestrictionFound = false;
function assignRestriction(r) {
if (r && !(matchingRestrictionFound && typeRestrictionFound)) {
if (
[
UrlbarTokenizer.TYPE.RESTRICT_TITLE,
UrlbarTokenizer.TYPE.RESTRICT_URL,
].includes(r.type)
) {
if (!matchingRestrictionFound) {
matchingRestrictionFound = true;
filtered[r.index].type = r.type;
return true;
}
} else if (!typeRestrictionFound) {
typeRestrictionFound = true;
filtered[r.index].type = r.type;
return true;
}
}
return false;
}
// Look at the first token.
let found = assignRestriction(restrictions.find(r => r.index == 0));
if (found) {
// If the first token was assigned, look at the next one.
assignRestriction(restrictions.find(r => r.index == 1));
}
// Then look at the last token.
let lastIndex = tokens.length - 1;
found = assignRestriction(restrictions.find(r => r.index == lastIndex));
if (found) {
// If the last token was assigned, look at the previous one.
assignRestriction(restrictions.find(r => r.index == lastIndex - 1));
}
}
logger.info("Filtered Tokens", tokens);
return filtered;
}
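// Illustrative end-to-end run (a sketch of assumed behavior): filtering the
// split tokens ["$", "mozilla", "^"] assigns the leading "$" as RESTRICT_URL
// (a matching restriction) and the trailing "^" as RESTRICT_HISTORY (a type
// restriction); the two kinds can be combined, so both survive, while
// "mozilla" is typed as POSSIBLE_ORIGIN.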