Bug 1541563 - Sync 131 - Fix string linkification greediness r=loganfsmyth

Differential Revision: https://phabricator.services.mozilla.com/D25994 --HG-- extra : moz-landing-system : lando
2019-04-05 17:33:22 +00:00 · 2019-04-05 17:33:22 +00:00 · 1578579b3e
--- a/devtools/client/debugger/new/packages/devtools-reps/src/reps/rep-utils.js
+++ b/devtools/client/debugger/new/packages/devtools-reps/src/reps/rep-utils.js
@ -4,7 +4,63 @@

 // Dependencies
 const validProtocols = /(http|https|ftp|data|resource|chrome):/i;
-const tokenSplitRegex = /(\s|\'|\"|\\)+/;
+
+// URL Regex, common idioms:
+//
+// Lead-in (URL):
+// (                     Capture because we need to know if there was a lead-in
+//                       character so we can include it as part of the text
+//                       preceding the match. We lack look-behind matching.
+//  ^|                   The URL can start at the beginning of the string.
+//  [\s(,;'"`]           Or whitespace or some punctuation that does not imply
+//                       a context which would preclude a URL.
+// )
+//
+// We do not need a trailing look-ahead because our regex's will terminate
+// because they run out of characters they can eat.
+
+// What we do not attempt to have the regexp do:
+// - Avoid trailing '.' and ')' characters.  We let our greedy match absorb
+//   these, but have a separate regex for extra characters to leave off at the
+//   end.
+//
+// The Regex (apart from lead-in/lead-out):
+// (                     Begin capture of the URL
+//  (?:                  (potential detect beginnings)
+//   https?:\/\/|        Start with "http" or "https"
+//   www\d{0,3}[.][a-z0-9.\-]{2,249}|
+//                      Start with "www", up to 3 numbers, then "." then
+//                       something that looks domain-namey.  We differ from the
+//                       next case in that we do not constrain the top-level
+//                       domain as tightly and do not require a trailing path
+//                       indicator of "/".  This is IDN root compatible.
+//   [a-z0-9.\-]{2,250}[.][a-z]{2,4}\/
+//                       Detect a non-www domain, but requiring a trailing "/"
+//                       to indicate a path.  This only detects IDN domains
+//                       with a non-IDN root.  This is reasonable in cases where
+//                       there is no explicit http/https start us out, but
+//                       unreasonable where there is.  Our real fix is the bug
+//                       to port the Thunderbird/gecko linkification logic.
+//
+//                       Domain names can be up to 253 characters long, and are
+//                       limited to a-zA-Z0-9 and '-'.  The roots don't have
+//                       hyphens unless they are IDN roots.  Root zones can be
+//                       found here: http://www.iana.org/domains/root/db
+//  )
+//  [-\w.!~*'();,/?:@&=+$#%]*
+//                       path onwards. We allow the set of characters that
+//                       encodeURI does not escape plus the result of escaping
+//                       (so also '%')
+// )
+// eslint-disable-next-line max-len
+const urlRegex = /(^|[\s(,;'"`])((?:https?:\/\/|www\d{0,3}[.][a-z0-9.\-]{2,249}|[a-z0-9.\-]{2,250}[.][a-z]{2,4}\/)[-\w.!~*'();,/?:@&=+$#%]*)/im;
+
+// Set of terminators that are likely to have been part of the context rather
+// than part of the URL and so should be uneaten. This is '(', ',', ';', plus
+// quotes and question end-ing punctuation and the potential permutations with
+// parentheses (english-specific).
+const uneatLastUrlCharsRegex = /(?:[),;.!?`'"]|[.!?]\)|\)[.!?])$/;
+
 const ELLIPSIS = "\u2026";
 const dom = require("react-dom-factories");
 const { span } = dom;
@ -382,7 +438,7 @@ function containsURL(grip) {
 * @param string token
 *        The token.
 * @return boolean
- *         Whenther the token is a URL.
+ *         Whether the token is a URL.
 */
 function isURL(token) {
  try {
@ -439,7 +495,8 @@ module.exports = {
  maybeEscapePropertyName,
  getGripPreviewItems,
  getGripType,
-  tokenSplitRegex,
  ellipsisElement,
-  ELLIPSIS
+  ELLIPSIS,
+  uneatLastUrlCharsRegex,
+  urlRegex
 };
--- a/devtools/client/debugger/new/packages/devtools-reps/src/reps/string.js
+++ b/devtools/client/debugger/new/packages/devtools-reps/src/reps/string.js
@ -7,15 +7,15 @@ const PropTypes = require("prop-types");

 const {
  containsURL,
-  isURL,
  escapeString,
  getGripType,
  rawCropString,
  sanitizeString,
  wrapRender,
  isGrip,
-  tokenSplitRegex,
-  ELLIPSIS
+  ELLIPSIS,
+  uneatLastUrlCharsRegex,
+  urlRegex
 } = require("./rep-utils");

 const dom = require("react-dom-factories");
@ -184,75 +184,75 @@ function getLinkifiedElements(text, cropLimit, openLink, isInContentPage) {
  const startCropIndex = cropLimit ? halfLimit : null;
  const endCropIndex = cropLimit ? text.length - halfLimit : null;

-  // As we walk through the tokens of the source string, we make sure to
-  // preserve the original whitespace that separated the tokens.
-  let currentIndex = 0;
  const items = [];
-  for (const token of text.split(tokenSplitRegex)) {
-    if (isURL(token)) {
-      // Let's grab all the non-url strings before the link.
-      const tokenStart = text.indexOf(token, currentIndex);
-      let nonUrlText = text.slice(currentIndex, tokenStart);
-      nonUrlText = getCroppedString(
-        nonUrlText,
-        currentIndex,
-        startCropIndex,
-        endCropIndex
-      );
-      if (nonUrlText) {
-        items.push(nonUrlText);
-      }
-
-      // Update the index to match the beginning of the token.
-      currentIndex = tokenStart;
-
-      const linkText = getCroppedString(
-        token,
-        currentIndex,
-        startCropIndex,
-        endCropIndex
-      );
-      if (linkText) {
-        items.push(
-          a(
-            {
-              className: "url",
-              title: token,
-              draggable: false,
-              // Because we don't want the link to be open in the current
-              // panel's frame, we only render the href attribute if `openLink`
-              // exists (so we can preventDefault) or if the reps will be
-              // displayed in content page (e.g. in the JSONViewer).
-              href: openLink || isInContentPage ? token : null,
-              onClick: openLink
-                ? e => {
-                    e.preventDefault();
-                    openLink(token, e);
-                  }
-                : null
-            },
-            linkText
-          )
-        );
-      }
-
-      currentIndex = tokenStart + token.length;
+  let currentIndex = 0;
+  let contentStart;
+  while (true) {
+    const url = urlRegex.exec(text);
+    // Pick the regexp with the earlier content; index will always be zero.
+    if (!url) {
+      break;
    }
+    contentStart = url.index + url[1].length;
+    if (contentStart > 0) {
+      const nonUrlText = text.substring(0, contentStart);
+      items.push(
+        getCroppedString(nonUrlText, currentIndex, startCropIndex, endCropIndex)
+      );
+    }
+
+    // There are some final characters for a URL that are much more likely
+    // to have been part of the enclosing text rather than the end of the
+    // URL.
+    let useUrl = url[2];
+    const uneat = uneatLastUrlCharsRegex.exec(useUrl);
+    if (uneat) {
+      useUrl = useUrl.substring(0, uneat.index);
+    }
+
+    currentIndex = currentIndex + contentStart;
+    const linkText = getCroppedString(
+      useUrl,
+      currentIndex,
+      startCropIndex,
+      endCropIndex
+    );
+
+    if (linkText) {
+      items.push(
+        a(
+          {
+            className: "url",
+            title: useUrl,
+            draggable: false,
+            // Because we don't want the link to be open in the current
+            // panel's frame, we only render the href attribute if `openLink`
+            // exists (so we can preventDefault) or if the reps will be
+            // displayed in content page (e.g. in the JSONViewer).
+            href: openLink || isInContentPage ? useUrl : null,
+            onClick: openLink
+              ? e => {
+                  e.preventDefault();
+                  openLink(useUrl, e);
+                }
+              : null
+          },
+          linkText
+        )
+      );
+    }
+
+    currentIndex = currentIndex + useUrl.length;
+    text = text.substring(url.index + url[1].length + useUrl.length);
  }

  // Clean up any non-URL text at the end of the source string,
  // i.e. not handled in the loop.
-  if (currentIndex !== text.length) {
-    let nonUrlText = text.slice(currentIndex, text.length);
+  if (text.length > 0) {
    if (currentIndex < endCropIndex) {
-      nonUrlText = getCroppedString(
-        nonUrlText,
-        currentIndex,
-        startCropIndex,
-        endCropIndex
-      );
+      text = getCroppedString(text, currentIndex, startCropIndex, endCropIndex);
    }
-    items.push(nonUrlText);
+    items.push(text);
  }

  return items;
--- a/devtools/client/debugger/new/packages/devtools-reps/src/reps/tests/string-with-url.js
+++ b/devtools/client/debugger/new/packages/devtools-reps/src/reps/tests/string-with-url.js
@ -335,7 +335,7 @@ describe("test String with URL", () => {

  it("renders successive cropped URLs with cropped elements between", () => {
    const text =
-      "- http://example.fr test http://example.fr test http://example.us -";
+      "- http://example.fr test http://example.es test http://example.us -";
    const openLink = jest.fn();
    const element = renderRep(text, {
      openLink,
@ -383,6 +383,30 @@ describe("test String with URL", () => {
    expect(linkFr.prop("title")).toBe("http://example.fr");
  });

+  it("renders URLs without unrelated characters", () => {
+    const text =
+      "global(http://example.com) and local(http://example.us)" +
+      " and maybe https://example.fr, https://example.es?";
+    const openLink = jest.fn();
+    const element = renderRep(text, {
+      openLink,
+      useQuotes: false
+    });
+
+    expect(element.text()).toEqual(text);
+    const linkCom = element.find("a").at(0);
+    expect(linkCom.prop("href")).toBe("http://example.com");
+
+    const linkUs = element.find("a").at(1);
+    expect(linkUs.prop("href")).toBe("http://example.us");
+
+    const linkFr = element.find("a").at(2);
+    expect(linkFr.prop("href")).toBe("https://example.fr");
+
+    const linkEs = element.find("a").at(3);
+    expect(linkEs.prop("href")).toBe("https://example.es");
+  });
+
  it("does not render a link if the URL has no scheme", () => {
    const url = "example.com";
    const element = renderRep(url, { useQuotes: false });