Bug 790997. Align our tokenization of CSS bad-url-token with the CSS Syntax Level 3 CR. r=heycam,tromey

The main change is that once we discover we have a bad-url-token we consume everything up to, but not including, the next ')' character. While we do this we can cross line boundaries and don't bother about matching braces or quotes. We just keep going until we find the ')' or hit EOF.
2016-07-13 22:54:20 -04:00 · 2016-07-13 22:54:20 -04:00 · 63c6b08058
--- a/devtools/shared/css-lexer.js
+++ b/devtools/shared/css-lexer.js
@ -1073,6 +1073,9 @@ Scanner.prototype = {
      this.ScanString(aToken);
      if (aToken.mType == eCSSToken_Bad_String) {
        aToken.mType = eCSSToken_Bad_URL;
+        // Flag us as having been a Bad_String.
+        aToken.mInteger2 = 1;
+        this.ConsumeBadURLRemnants(aToken);
        return;
      }
    } else {
@ -1093,9 +1096,44 @@ Scanner.prototype = {
      }
    } else {
      aToken.mType = eCSSToken_Bad_URL;
+      if (aToken.mSymbol != 0) {
+        // Flag us as having been a String, not a Bad_String.
+        aToken.mInteger2 = 0;
+      }
+      this.ConsumeBadURLRemnants(aToken);
    }
  },

+  ConsumeBadURLRemnants: function (aToken) {
+    aToken.mInteger = aToken.mIdent.length;
+    let ch = this.Peek();
+    do {
+      if (ch < 0) {
+        this.AddEOFCharacters(eEOFCharacters_CloseParen);
+        break;
+      }
+
+      if (ch == REVERSE_SOLIDUS && this.GatherEscape(aToken.mIdent, false)) {
+        // GatherEscape succeeded, so the backslash and the escape following it
+        // have already been consumed; nothing more to do for this iteration.
+      } else {
+        // Anything else is consumed here and recorded in mIdent just below.
+        if (IsVertSpace(ch)) {
+          this.AdvanceLine();
+        } else {
+          this.Advance();
+        }
+        if (ch == 0) {
+          aToken.mIdent.push(UCS2_REPLACEMENT_CHAR);
+        } else {
+          aToken.mIdent.push(ch);
+        }
+      }
+
+      ch = this.Peek();
+    } while (ch != RIGHT_PARENTHESIS);
+  },
+
  /**
   * Primary scanner entry point.  Consume one token and fill in
   * |aToken| accordingly.  Will skip over any number of comments first,
--- a/devtools/shared/tests/unit/test_csslexer.js
+++ b/devtools/shared/tests/unit/test_csslexer.js
@ -128,8 +128,9 @@ var LEX_TESTS = [
             ["url:http://example.com"]],
  // In CSS Level 3, this is an ordinary URL, not a BAD_URL.
  ["url(http://example.com", ["url:http://example.com"]],
-  // See bug 1153981 to understand why this gets a SYMBOL token.
-  ["url(http://example.com @", ["bad_url:http://example.com", "symbol:@"]],
+  // We end up losing the whitespace before the '@' because it's skipped by the
+  // lexer before we discover we have a BAD_URL token.
+  ["url(http://example.com @", ["bad_url:http://example.com@"]],
  ["quo\\ting", ["ident:quoting"]],
  ["'bad string\n", ["bad_string:bad string", "whitespace"]],
  ["~=", ["includes"]],
--- a/dom/webidl/CSSLexer.webidl
+++ b/dom/webidl/CSSLexer.webidl
@ -36,8 +36,10 @@ enum CSSTokenType {
  "bad_string",
  // A URL.  |text| holds the URL.
  "url",
-  // A "bad URL".  This is a URL that is unterminated at EOF.  |text|
-  // holds the URL.
+  // A "bad URL".  This is a URL that either contains a bad_string or contains
+  // garbage after the string or unquoted URL text.  |text| holds the URL and
+  // potentially whatever garbage came after it, up to but not including the
+  // following ')'.
  "bad_url",
  // A "symbol" is any one-character symbol.  This corresponds to the
  // DELIM token in the CSS specification.
--- a/layout/reftests/css-parsing/invalid-url-handling.xhtml
+++ b/layout/reftests/css-parsing/invalid-url-handling.xhtml
@ -22,17 +22,19 @@
  #two { background-color: green; }
  </style>
  <style type="text/css">
-  /* not a URI token; the unterminated string ends at end of line, so
-     the brace never matches */
-  #three { background-color: green; }
-  #foo { background: url(foo"bar) }
+  /* not a URI token; the invalid URI token consumes everything up to the ')'. */
  #three { background-color: red; }
+  #foo { background: url(foo"bar) }
+  #three { background-color: green; }
  </style>
  <style type="text/css">
-  /* not a URI token; the unterminated string ends at end of line */
+  #four { background-color: green; }
+  /* not a URI token; the invalid URI token consumes everything up to the ')'
+     and then there is some garbage that prevents the next rule from being
+     parsed. */
  #foo { background: url(foo"bar) }
  ) }
-  #four { background-color: green; }
+  #four { background-color: red; }
  </style>
  <style type="text/css">
  /* not a URI token; the unterminated string ends at end of line, so
@ -68,18 +70,20 @@
  #eleven { background: url([) green; }
  </style>
  <style type="text/css">
-  /* not a URI token; brace matching should work only after invalid URI token */
-  #twelve { background: url(}{""{)}); background-color: green; }
+  /* not a URI token; brace matching is ignored while looking for the closing
+     ')' but is used after that. */
+  #twelve { background: url(}{""{)(}); background-color: green; }
  </style>
  <style type="text/css">
  /* invalid URI token absorbs the [ */
  #thirteen { background: url([""); background-color: green; }
  </style>
  <style type="text/css">
-  /* not a URI token; the opening ( is never matched */
-  #fourteen { background-color: green; }
-  #foo { background: url(() }
+  /* not a URI token; the invalid URI token consumes everything up to the
+     next ')'. */
  #fourteen { background-color: red; }
+  #foo { background: url(() }
+  #fourteen { background-color: green; }
  </style>
  <!-- The next three tests test that invalid URI tokens absorb [ and { -->
  <style type="text/css">
--- a/layout/style/nsCSSScanner.cpp
+++ b/layout/style/nsCSSScanner.cpp
@ -260,13 +260,25 @@ nsCSSToken::AppendToString(nsString& aBuffer) const
    case eCSSToken_Bad_URL:
      aBuffer.AppendLiteral("url(");
      if (mSymbol != char16_t(0)) {
-        nsStyleUtil::AppendEscapedCSSString(mIdent, aBuffer, mSymbol);
+        if (mType == eCSSToken_URL) {
+          nsStyleUtil::AppendEscapedCSSString(mIdent, aBuffer, mSymbol);
+        } else {
+          // Only things up to mInteger were part of the string.
+          nsStyleUtil::AppendEscapedCSSString(StringHead(mIdent, mInteger),
+                                              aBuffer, mSymbol);
+          MOZ_ASSERT(mInteger2 == 0 || mInteger2 == 1);
+          if (mInteger2 == 1) {
+            // This was a Bad_String; strip off the closing quote.
+            aBuffer.Truncate(aBuffer.Length() - 1);
+          }
+
+          // Now append the remaining garbage.
+          aBuffer.Append(Substring(mIdent, mInteger));
+        }
      } else {
        aBuffer.Append(mIdent);
      }
-      if (mType == eCSSToken_URL) {
-        aBuffer.Append(char16_t(')'));
-      }
+      aBuffer.Append(char16_t(')'));
      break;

    case eCSSToken_Number:
@ -1166,6 +1178,9 @@ nsCSSScanner::NextURL(nsCSSToken& aToken)
    ScanString(aToken);
    if (MOZ_UNLIKELY(aToken.mType == eCSSToken_Bad_String)) {
      aToken.mType = eCSSToken_Bad_URL;
+      // Flag us as having been a Bad_String.
+      aToken.mInteger2 = 1;
+      ConsumeBadURLRemnants(aToken);
      return;
    }
    MOZ_ASSERT(aToken.mType == eCSSToken_String, "unexpected token type");
@ -1189,9 +1204,46 @@ nsCSSScanner::NextURL(nsCSSToken& aToken)
  } else {
    mSeenBadToken = true;
    aToken.mType = eCSSToken_Bad_URL;
+    if (aToken.mSymbol != 0) {
+      // Flag us as having been a String, not a Bad_String.
+      aToken.mInteger2 = 0;
+    }
+    ConsumeBadURLRemnants(aToken);
  }
 }

+void
+nsCSSScanner::ConsumeBadURLRemnants(nsCSSToken& aToken)
+{
+  aToken.mInteger = aToken.mIdent.Length();
+  int32_t ch = Peek();
+  do {
+    if (ch < 0) {
+      AddEOFCharacters(eEOFCharacters_CloseParen);
+      break;
+    }
+
+    if (ch == '\\' && GatherEscape(aToken.mIdent, false)) {
+      // GatherEscape succeeded, so the backslash and the escape following it
+      // have already been consumed; nothing more to do for this iteration.
+    } else {
+      // Anything else is consumed here and recorded in mIdent just below.
+      if (IsVertSpace(ch)) {
+        AdvanceLine();
+      } else {
+        Advance();
+      }
+      if (ch == 0) {
+        aToken.mIdent.Append(UCS2_REPLACEMENT_CHAR);
+      } else {
+        aToken.mIdent.Append(ch);
+      }
+    }
+
+    ch = Peek();
+  } while (ch != ')');
+}
+
 /**
 * Primary scanner entry point.  Consume one token and fill in
 * |aToken| accordingly.  Will skip over any number of comments first,
--- a/layout/style/nsCSSScanner.h
+++ b/layout/style/nsCSSScanner.h
@ -59,11 +59,20 @@ enum nsCSSTokenType {
  // belonging to the string, and mSymbol holds the delimiter
  // character, which may be ', ", or zero (only for unquoted URLs).
  // Bad_String and Bad_URL tokens are emitted when the closing
-  // delimiter or parenthesis was missing.
+  // delimiter was missing.  Bad_URL is also emitted if there was trailing
+  // garbage after the string or unquoted url value.
  eCSSToken_String,         // 'foo bar' "foo bar"
  eCSSToken_Bad_String,     // 'foo bar
  eCSSToken_URL,            // url(foobar) url("foo bar")
-  eCSSToken_Bad_URL,        // url(foo
+  // For Bad_URL tokens, we need to keep track of the following state:
+  // (1) Was there a quoted string?  If so, was it a String or Bad_String?
+  // (2) Was there trailing garbage, and if so what was it?
+  // We keep track of whether there was a quoted string by setting mSymbol as
+  // described above.  If that's nonzero, then mInteger2 indicates whether we
+  // have a String or Bad_String by taking on the values 0 and 1 respectively.
+  // mInteger indicates the start of trailing garbage in mIdent (and is set to
+  // mIdent.Length() when there is no trailing garbage).
+  eCSSToken_Bad_URL,        // url(foo') url('foo'a) url('foo

  // Any one-character symbol.  mSymbol holds the character.
  eCSSToken_Symbol,         // . ; { } ! *
@ -255,6 +264,10 @@ class nsCSSScanner {
  // parser.
  void NextURL(nsCSSToken& aTokenResult);

+  // Implement the "consume the remnants of a bad url" algorithm from CSS3
+  // Syntax, except we don't consume the ')'.
+  void ConsumeBadURLRemnants(nsCSSToken& aToken);
+
  // This is exposed for use by nsCSSParser::ParsePseudoClassWithNthPairArg,
  // because "2n-1" is a single DIMENSION token, and "n-1" is a single
  // IDENT token, but the :nth() selector syntax wants to interpret
--- a/layout/style/test/test_csslexer.js
+++ b/layout/style/test/test_csslexer.js
@ -55,8 +55,9 @@ var LEX_TESTS = [
             ["url:http://example.com"]],
  // In CSS Level 3, this is an ordinary URL, not a BAD_URL.
  ["url(http://example.com", ["url:http://example.com"]],
-  // See bug 1153981 to understand why this gets a SYMBOL token.
-  ["url(http://example.com @", ["bad_url:http://example.com", "symbol:@"]],
+  // We end up losing the whitespace before the '@' because it's skipped by the
+  // lexer before we discover we have a BAD_URL token.
+  ["url(http://example.com @", ["bad_url:http://example.com@"]],
  ["quo\\ting", ["ident:quoting"]],
  ["'bad string\n", ["bad_string:bad string", "whitespace"]],
  ["~=", ["includes"]],
--- a/layout/style/test/test_parser_diagnostics_unprintables.html
+++ b/layout/style/test/test_parser_diagnostics_unprintables.html
@ -45,7 +45,7 @@ const patterns = [
  // _URL
  { i: "x{ url('<t>')}",   o: "declaration but found \u2018url('<s>')\u2019." },
  // _Bad_URL
-  { i: "x{ url('<t>'.)}" , o: "declaration but found \u2018url('<s>'\u2019." }
+  { i: "x{ url('<t>'.)}" , o: "declaration but found \u2018url('<s>'.)\u2019." }
 ];

 // Blocks of characters to test, and how they should be escaped when