Bug 1541563 - Sync 131 - Fix string linkification greediness r=loganfsmyth

Differential Revision: https://phabricator.services.mozilla.com/D25994

--HG--
extra : moz-landing-system : lando
This commit is contained in:
David Walsh 2019-04-05 17:33:22 +00:00
Родитель c8835a2c67
Коммит 1578579b3e
3 изменённых файлов: 150 добавлений и 69 удалений

Просмотреть файл

@ -4,7 +4,63 @@
// Dependencies
const validProtocols = /(http|https|ftp|data|resource|chrome):/i;
const tokenSplitRegex = /(\s|\'|\"|\\)+/;
// URL Regex, common idioms:
//
// Lead-in (URL):
// ( Capture because we need to know if there was a lead-in
// character so we can include it as part of the text
// preceding the match. We lack look-behind matching.
// ^| The URL can start at the beginning of the string.
// [\s(,;'"`] Or whitespace or some punctuation that does not imply
// a context which would preclude a URL.
// )
//
// We do not need a trailing look-ahead because our regex's will terminate
// because they run out of characters they can eat.
// What we do not attempt to have the regexp do:
// - Avoid trailing '.' and ')' characters. We let our greedy match absorb
// these, but have a separate regex for extra characters to leave off at the
// end.
//
// The Regex (apart from lead-in/lead-out):
// ( Begin capture of the URL
// (?: (potential detect beginnings)
// https?:\/\/| Start with "http" or "https"
// www\d{0,3}[.][a-z0-9.\-]{2,249}|
// Start with "www", up to 3 numbers, then "." then
// something that looks domain-namey. We differ from the
// next case in that we do not constrain the top-level
// domain as tightly and do not require a trailing path
// indicator of "/". This is IDN root compatible.
// [a-z0-9.\-]{2,250}[.][a-z]{2,4}\/
// Detect a non-www domain, but requiring a trailing "/"
// to indicate a path. This only detects IDN domains
// with a non-IDN root. This is reasonable in cases where
// there is no explicit http/https start us out, but
// unreasonable where there is. Our real fix is the bug
// to port the Thunderbird/gecko linkification logic.
//
// Domain names can be up to 253 characters long, and are
// limited to a-zA-Z0-9 and '-'. The roots don't have
// hyphens unless they are IDN roots. Root zones can be
// found here: http://www.iana.org/domains/root/db
// )
// [-\w.!~*'();,/?:@&=+$#%]*
// path onwards. We allow the set of characters that
// encodeURI does not escape plus the result of escaping
// (so also '%')
// )
// eslint-disable-next-line max-len
const urlRegex = /(^|[\s(,;'"`])((?:https?:\/\/|www\d{0,3}[.][a-z0-9.\-]{2,249}|[a-z0-9.\-]{2,250}[.][a-z]{2,4}\/)[-\w.!~*'();,/?:@&=+$#%]*)/im;
// Set of terminators that are likely to have been part of the context rather
// than part of the URL and so should be uneaten. This is '(', ',', ';', plus
// quotes and question end-ing punctuation and the potential permutations with
// parentheses (english-specific).
const uneatLastUrlCharsRegex = /(?:[),;.!?`'"]|[.!?]\)|\)[.!?])$/;
const ELLIPSIS = "\u2026";
const dom = require("react-dom-factories");
const { span } = dom;
@ -382,7 +438,7 @@ function containsURL(grip) {
* @param string token
* The token.
* @return boolean
* Whenther the token is a URL.
* Whether the token is a URL.
*/
function isURL(token) {
try {
@ -439,7 +495,8 @@ module.exports = {
maybeEscapePropertyName,
getGripPreviewItems,
getGripType,
tokenSplitRegex,
ellipsisElement,
ELLIPSIS
ELLIPSIS,
uneatLastUrlCharsRegex,
urlRegex
};

Просмотреть файл

@ -7,15 +7,15 @@ const PropTypes = require("prop-types");
const {
containsURL,
isURL,
escapeString,
getGripType,
rawCropString,
sanitizeString,
wrapRender,
isGrip,
tokenSplitRegex,
ELLIPSIS
ELLIPSIS,
uneatLastUrlCharsRegex,
urlRegex
} = require("./rep-utils");
const dom = require("react-dom-factories");
@ -184,75 +184,75 @@ function getLinkifiedElements(text, cropLimit, openLink, isInContentPage) {
const startCropIndex = cropLimit ? halfLimit : null;
const endCropIndex = cropLimit ? text.length - halfLimit : null;
// As we walk through the tokens of the source string, we make sure to
// preserve the original whitespace that separated the tokens.
let currentIndex = 0;
const items = [];
for (const token of text.split(tokenSplitRegex)) {
if (isURL(token)) {
// Let's grab all the non-url strings before the link.
const tokenStart = text.indexOf(token, currentIndex);
let nonUrlText = text.slice(currentIndex, tokenStart);
nonUrlText = getCroppedString(
nonUrlText,
currentIndex,
startCropIndex,
endCropIndex
);
if (nonUrlText) {
items.push(nonUrlText);
}
// Update the index to match the beginning of the token.
currentIndex = tokenStart;
const linkText = getCroppedString(
token,
currentIndex,
startCropIndex,
endCropIndex
);
if (linkText) {
items.push(
a(
{
className: "url",
title: token,
draggable: false,
// Because we don't want the link to be open in the current
// panel's frame, we only render the href attribute if `openLink`
// exists (so we can preventDefault) or if the reps will be
// displayed in content page (e.g. in the JSONViewer).
href: openLink || isInContentPage ? token : null,
onClick: openLink
? e => {
e.preventDefault();
openLink(token, e);
}
: null
},
linkText
)
);
}
currentIndex = tokenStart + token.length;
let currentIndex = 0;
let contentStart;
while (true) {
const url = urlRegex.exec(text);
// Pick the regexp with the earlier content; index will always be zero.
if (!url) {
break;
}
contentStart = url.index + url[1].length;
if (contentStart > 0) {
const nonUrlText = text.substring(0, contentStart);
items.push(
getCroppedString(nonUrlText, currentIndex, startCropIndex, endCropIndex)
);
}
// There are some final characters for a URL that are much more likely
// to have been part of the enclosing text rather than the end of the
// URL.
let useUrl = url[2];
const uneat = uneatLastUrlCharsRegex.exec(useUrl);
if (uneat) {
useUrl = useUrl.substring(0, uneat.index);
}
currentIndex = currentIndex + contentStart;
const linkText = getCroppedString(
useUrl,
currentIndex,
startCropIndex,
endCropIndex
);
if (linkText) {
items.push(
a(
{
className: "url",
title: useUrl,
draggable: false,
// Because we don't want the link to be open in the current
// panel's frame, we only render the href attribute if `openLink`
// exists (so we can preventDefault) or if the reps will be
// displayed in content page (e.g. in the JSONViewer).
href: openLink || isInContentPage ? useUrl : null,
onClick: openLink
? e => {
e.preventDefault();
openLink(useUrl, e);
}
: null
},
linkText
)
);
}
currentIndex = currentIndex + useUrl.length;
text = text.substring(url.index + url[1].length + useUrl.length);
}
// Clean up any non-URL text at the end of the source string,
// i.e. not handled in the loop.
if (currentIndex !== text.length) {
let nonUrlText = text.slice(currentIndex, text.length);
if (text.length > 0) {
if (currentIndex < endCropIndex) {
nonUrlText = getCroppedString(
nonUrlText,
currentIndex,
startCropIndex,
endCropIndex
);
text = getCroppedString(text, currentIndex, startCropIndex, endCropIndex);
}
items.push(nonUrlText);
items.push(text);
}
return items;

Просмотреть файл

@ -335,7 +335,7 @@ describe("test String with URL", () => {
it("renders successive cropped URLs with cropped elements between", () => {
const text =
"- http://example.fr test http://example.fr test http://example.us -";
"- http://example.fr test http://example.es test http://example.us -";
const openLink = jest.fn();
const element = renderRep(text, {
openLink,
@ -383,6 +383,30 @@ describe("test String with URL", () => {
expect(linkFr.prop("title")).toBe("http://example.fr");
});
it("renders URLs without unrelated characters", () => {
const text =
"global(http://example.com) and local(http://example.us)" +
" and maybe https://example.fr, https://example.es?";
const openLink = jest.fn();
const element = renderRep(text, {
openLink,
useQuotes: false
});
expect(element.text()).toEqual(text);
const linkCom = element.find("a").at(0);
expect(linkCom.prop("href")).toBe("http://example.com");
const linkUs = element.find("a").at(1);
expect(linkUs.prop("href")).toBe("http://example.us");
const linkFr = element.find("a").at(2);
expect(linkFr.prop("href")).toBe("https://example.fr");
const linkEs = element.find("a").at(3);
expect(linkEs.prop("href")).toBe("https://example.es");
});
it("does not render a link if the URL has no scheme", () => {
const url = "example.com";
const element = renderRep(url, { useQuotes: false });