Bug 1467336 - Change a bunch of 'character' nomenclature in token stream code to 'code unit', completing the transition from tokenizing by UTF-16 'character' to tokenizing by UTF-8/16 code unit. (There are straggling places where algorithms will need to be specialized for UTF-8, or functions will need to move within the TokenStream* hierarchy to permit such; but what is in the tree now universally acts on code units first, full code points second.) r=arai

--HG--
extra : rebase_source : 06fa722d16b801f9db7f38110756d8a8f62b6617
This commit is contained in:
Jeff Walden 2018-06-28 02:30:08 -07:00
Родитель 8d4d43a2d1
Коммит da3129de0b
2 изменённых файлов: 24 добавлений и 23 удалений

Просмотреть файл

@ -997,9 +997,9 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::errorAt(uint32_t offset, unsigned er
}
// We have encountered a '\': check for a Unicode escape sequence after it.
// Return the length of the escape sequence and the character code point (by
// value) if we found a Unicode escape sequence. Otherwise, return 0. In both
// cases, do not advance along the buffer.
// Return the length of the escape sequence and the encoded code point (by
// value) if we found a Unicode escape sequence, and skip all code units
// involved. Otherwise, return 0 and don't advance along the buffer.
template<typename CharT, class AnyCharsAccess>
uint32_t
GeneralTokenStreamChars<CharT, AnyCharsAccess>::matchUnicodeEscape(uint32_t* codePoint)
@ -1134,7 +1134,7 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::getDirectives(bool isMultiline,
// To avoid a crashing bug in IE, several JavaScript transpilers wrap single
// line comments containing a source mapping URL inside a multiline
// comment. To avoid potentially expensive lookahead and backtracking, we
// only check for this case if we encounter a '#' character.
// only check for this case if we encounter a '#' code unit.
bool res = getDisplayURL(isMultiline, shouldWarnDeprecated) &&
getSourceMappingURL(isMultiline, shouldWarnDeprecated);
@ -1171,12 +1171,13 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::getDirective(bool isMultiline,
MOZ_ASSERT(directiveLength <= 18);
char16_t peeked[18];
// If there aren't enough characters left, it can't be the desired
// directive.
// If there aren't enough code units left, it can't be the desired
// directive. (Note that |directive| must be ASCII, so there are no
// tricky encoding issues to consider.)
if (!sourceUnits.peekCodeUnits(directiveLength, peeked))
return true;
// It's also not the desired directive if the characters don't match.
// It's also not the desired directive if the code units don't match.
if (!CharsMatch(peeked, directive))
return true;
@ -2088,9 +2089,9 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::getTokenInternal(TokenKind* const tt
}
// We could point "into" a mistyped escape, e.g. for "\u{41H}" we
// could point at the 'H'. But we don't do that now, so the
// character after the '\' isn't necessarily bad, so just point at
// the start of the actually-invalid escape.
// could point at the 'H'. But we don't do that now, so the code
// unit after the '\' isn't necessarily bad, so just point at the
// start of the actually-invalid escape.
ungetCodeUnit('\\');
error(JSMSG_BAD_ESCAPE);
return badToken();
@ -2434,7 +2435,7 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::getStringOrTemplateToken(char untilC
// so it'll pass into this |if|-block.
if (!JS7_ISHEX(u3)) {
if (parsingTemplate) {
// We put the character back so that we read it
// We put the code unit back so that we read it
// on the next pass, which matters if it was
// '`' or '\'.
ungetCodeUnit(u3);

Просмотреть файл

@ -195,8 +195,8 @@ namespace js {
namespace frontend {
struct TokenPos {
uint32_t begin; // Offset of the token's first char.
uint32_t end; // Offset of 1 past the token's last char.
uint32_t begin; // Offset of the token's first code unit.
uint32_t end; // Offset of 1 past the token's last code unit.
TokenPos()
: begin(0),
@ -286,9 +286,9 @@ struct Token
// Div.
Operand,
// Treat subsequent characters as the tail of a template literal, after
// Treat subsequent code units as the tail of a template literal, after
// a template substitution, beginning with a "}", continuing with zero
// or more template literal characters, and ending with either "${" or
// or more template literal code units, and ending with either "${" or
// the end of the template literal. For example:
//
// var entity = "world";
@ -318,8 +318,8 @@ struct Token
// conditional expression and missing it results in SyntaxError.
// Comma/semicolon cases are also gotten as operators (None), and 4th
// case is gotten after them. If no comma/semicolon found but EOL,
// the next token should be gotten as operand in 4th case (especially if
// '/' is the first character). So we should peek the token as
// the next token should be gotten as operand in 4th case (especially
// if '/' is the first code unit). So we should peek the token as
// operand before trying to get colon/comma/semicolon.
// See also the comment in Parser::assignExpr().
NoneIsOperand,
@ -1080,9 +1080,9 @@ class TokenStreamCharsShared
protected:
/**
* Character buffer transiently used to store sequences of identifier or
* string code points when such can't be directly processed from the
* original source text (e.g. because it contains escapes).
* Buffer transiently used to store sequences of identifier or string code
* points when such can't be directly processed from the original source
* text (e.g. because it contains escapes).
*/
CharBuffer charBuffer;
@ -1327,7 +1327,7 @@ class GeneralTokenStreamChars
}
/**
* Consume characters til EOL/EOF following the start of a single-line
* Consume code units til EOL/EOF following the start of a single-line
* comment, without consuming the EOL/EOF.
*/
void consumeRestOfSingleLineComment();
@ -1370,7 +1370,7 @@ class TokenStreamChars<char16_t, AnyCharsAccess>
// Try to get the next code point, normalizing '\r', '\r\n', '\n', and the
// Unicode line/paragraph separators into '\n'. Also updates internal
// line-counter state. Return true on success and store the character in
// line-counter state. Return true on success and store the code point in
// |*cp|. Return false and leave |*cp| undefined on failure.
MOZ_MUST_USE bool getCodePoint(int32_t* cp);
@ -1461,7 +1461,7 @@ class TokenStreamChars<char16_t, AnyCharsAccess>
// TokenStream is the lexical scanner for JavaScript source text.
//
// It takes a buffer of CharT characters (currently only char16_t encoding
// It takes a buffer of CharT code units (currently only char16_t encoding
// UTF-16, but we're adding either UTF-8 or Latin-1 single-byte text soon) and
// linearly scans it into |Token|s.
//