Bug 1467336 - Change a bunch of 'character' nomenclature in token stream code to 'code unit', completing the transition from tokenizing by UTF-16 'character' to tokenizing by UTF-8/16 code unit. (There are straggling places where algorithms will need to be specialized for UTF-8, or functions will need to move within the TokenStream* hierarchy to permit such; but what is in the tree now universally acts on code units first, full code points second.) r=arai

--HG-- extra : rebase_source : 06fa722d16b801f9db7f38110756d8a8f62b6617
2018-06-28 02:30:08 -07:00 · 2018-06-28 02:30:08 -07:00 · da3129de0b
--- a/js/src/frontend/TokenStream.cpp
+++ b/js/src/frontend/TokenStream.cpp
@ -997,9 +997,9 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::errorAt(uint32_t offset, unsigned er
 }

 // We have encountered a '\': check for a Unicode escape sequence after it.
-// Return the length of the escape sequence and the character code point (by
-// value) if we found a Unicode escape sequence.  Otherwise, return 0.  In both
-// cases, do not advance along the buffer.
+// Return the length of the escape sequence and the encoded code point (by
+// value) if we found a Unicode escape sequence, and skip all code units
+// involved.  Otherwise, return 0 and don't advance along the buffer.
 template<typename CharT, class AnyCharsAccess>
 uint32_t
 GeneralTokenStreamChars<CharT, AnyCharsAccess>::matchUnicodeEscape(uint32_t* codePoint)
@ -1134,7 +1134,7 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::getDirectives(bool isMultiline,
    // To avoid a crashing bug in IE, several JavaScript transpilers wrap single
    // line comments containing a source mapping URL inside a multiline
    // comment. To avoid potentially expensive lookahead and backtracking, we
-    // only check for this case if we encounter a '#' character.
+    // only check for this case if we encounter a '#' code unit.

    bool res = getDisplayURL(isMultiline, shouldWarnDeprecated) &&
               getSourceMappingURL(isMultiline, shouldWarnDeprecated);
@ -1171,12 +1171,13 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::getDirective(bool isMultiline,
    MOZ_ASSERT(directiveLength <= 18);
    char16_t peeked[18];

-    // If there aren't enough characters left, it can't be the desired
-    // directive.
+    // If there aren't enough code units left, it can't be the desired
+    // directive.  (Note that |directive| must be ASCII, so there are no
+    // tricky encoding issues to consider.)
    if (!sourceUnits.peekCodeUnits(directiveLength, peeked))
        return true;

-    // It's also not the desired directive if the characters don't match.
+    // It's also not the desired directive if the code units don't match.
    if (!CharsMatch(peeked, directive))
        return true;

@ -2088,9 +2089,9 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::getTokenInternal(TokenKind* const tt
            }

            // We could point "into" a mistyped escape, e.g. for "\u{41H}" we
-            // could point at the 'H'.  But we don't do that now, so the
-            // character after the '\' isn't necessarily bad, so just point at
-            // the start of the actually-invalid escape.
+            // could point at the 'H'.  But we don't do that now, so the code
+            // unit after the '\' isn't necessarily bad, so just point at the
+            // start of the actually-invalid escape.
            ungetCodeUnit('\\');
            error(JSMSG_BAD_ESCAPE);
            return badToken();
@ -2434,7 +2435,7 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::getStringOrTemplateToken(char untilC
                        // so it'll pass into this |if|-block.
                        if (!JS7_ISHEX(u3)) {
                            if (parsingTemplate) {
-                                // We put the character back so that we read it
+                                // We put the code unit back so that we read it
                                // on the next pass, which matters if it was
                                // '`' or '\'.
                                ungetCodeUnit(u3);
--- a/js/src/frontend/TokenStream.h
+++ b/js/src/frontend/TokenStream.h
@ -195,8 +195,8 @@ namespace js {
 namespace frontend {

 struct TokenPos {
-    uint32_t    begin;  // Offset of the token's first char.
-    uint32_t    end;    // Offset of 1 past the token's last char.
+    uint32_t    begin;  // Offset of the token's first code unit.
+    uint32_t    end;    // Offset of 1 past the token's last code unit.

    TokenPos()
      : begin(0),
@ -286,9 +286,9 @@ struct Token
        // Div.
        Operand,

-        // Treat subsequent characters as the tail of a template literal, after
+        // Treat subsequent code units as the tail of a template literal, after
        // a template substitution, beginning with a "}", continuing with zero
-        // or more template literal characters, and ending with either "${" or
+        // or more template literal code units, and ending with either "${" or
        // the end of the template literal.  For example:
        //
        //   var entity = "world";
@ -318,8 +318,8 @@ struct Token
        // conditional expression and missing it results in SyntaxError.
        // Comma/semicolon cases are also gotten as operators (None), and 4th
        // case is gotten after them.  If no comma/semicolon found but EOL,
-        // the next token should be gotten as operand in 4th case (especially if
-        // '/' is the first character).  So we should peek the token as
+        // the next token should be gotten as operand in 4th case (especially
+        // if '/' is the first code unit).  So we should peek the token as
        // operand before try getting colon/comma/semicolon.
        // See also the comment in Parser::assignExpr().
        NoneIsOperand,
@ -1080,9 +1080,9 @@ class TokenStreamCharsShared

  protected:
    /**
-     * Character buffer transiently used to store sequences of identifier or
-     * string code points when such can't be directly processed from the
-     * original source text (e.g. because it contains escapes).
+     * Buffer transiently used to store sequences of identifier or string code
+     * points when such can't be directly processed from the original source
+     * text (e.g. because it contains escapes).
     */
    CharBuffer charBuffer;

@ -1327,7 +1327,7 @@ class GeneralTokenStreamChars
    }

    /**
-     * Consume characters til EOL/EOF following the start of a single-line
+     * Consume code units til EOL/EOF following the start of a single-line
     * comment, without consuming the EOL/EOF.
     */
    void consumeRestOfSingleLineComment();
@ -1370,7 +1370,7 @@ class TokenStreamChars<char16_t, AnyCharsAccess>

    // Try to get the next code point, normalizing '\r', '\r\n', '\n', and the
    // Unicode line/paragraph separators into '\n'.  Also updates internal
-    // line-counter state.  Return true on success and store the character in
+    // line-counter state.  Return true on success and store the code point in
    // |*c|.  Return false and leave |*c| undefined on failure.
    MOZ_MUST_USE bool getCodePoint(int32_t* cp);

@ -1461,7 +1461,7 @@ class TokenStreamChars<char16_t, AnyCharsAccess>

 // TokenStream is the lexical scanner for JavaScript source text.
 //
-// It takes a buffer of CharT characters (currently only char16_t encoding
+// It takes a buffer of CharT code units (currently only char16_t encoding
 // UTF-16, but we're adding either UTF-8 or Latin-1 single-byte text soon) and
 // linearly scans it into |Token|s.
 //