зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1467336 - Change a bunch of 'character' nomenclature in token stream code to 'code unit', completing the transition from tokenizing by UTF-16 'character' to tokenizing by UTF-8/16 code unit. (There are straggling places where algorithms will need to be specialized for UTF-8, or functions will need to move within the TokenStream* hierarchy to permit such; but what is in the tree now universally acts on code units first, full code points second.) r=arai
--HG-- extra : rebase_source : 06fa722d16b801f9db7f38110756d8a8f62b6617
This commit is contained in:
Родитель
8d4d43a2d1
Коммит
da3129de0b
|
@ -997,9 +997,9 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::errorAt(uint32_t offset, unsigned er
|
|||
}
|
||||
|
||||
// We have encountered a '\': check for a Unicode escape sequence after it.
|
||||
// Return the length of the escape sequence and the character code point (by
|
||||
// value) if we found a Unicode escape sequence. Otherwise, return 0. In both
|
||||
// cases, do not advance along the buffer.
|
||||
// Return the length of the escape sequence and the encoded code point (by
|
||||
// value) if we found a Unicode escape sequence, and skip all code units
|
||||
// involved. Otherwise, return 0 and don't advance along the buffer.
|
||||
template<typename CharT, class AnyCharsAccess>
|
||||
uint32_t
|
||||
GeneralTokenStreamChars<CharT, AnyCharsAccess>::matchUnicodeEscape(uint32_t* codePoint)
|
||||
|
@ -1134,7 +1134,7 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::getDirectives(bool isMultiline,
|
|||
// To avoid a crashing bug in IE, several JavaScript transpilers wrap single
|
||||
// line comments containing a source mapping URL inside a multiline
|
||||
// comment. To avoid potentially expensive lookahead and backtracking, we
|
||||
// only check for this case if we encounter a '#' character.
|
||||
// only check for this case if we encounter a '#' code unit.
|
||||
|
||||
bool res = getDisplayURL(isMultiline, shouldWarnDeprecated) &&
|
||||
getSourceMappingURL(isMultiline, shouldWarnDeprecated);
|
||||
|
@ -1171,12 +1171,13 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::getDirective(bool isMultiline,
|
|||
MOZ_ASSERT(directiveLength <= 18);
|
||||
char16_t peeked[18];
|
||||
|
||||
// If there aren't enough characters left, it can't be the desired
|
||||
// directive.
|
||||
// If there aren't enough code units left, it can't be the desired
|
||||
// directive. (Note that |directive| must be ASCII, so there are no
|
||||
// tricky encoding issues to consider.)
|
||||
if (!sourceUnits.peekCodeUnits(directiveLength, peeked))
|
||||
return true;
|
||||
|
||||
// It's also not the desired directive if the characters don't match.
|
||||
// It's also not the desired directive if the code units don't match.
|
||||
if (!CharsMatch(peeked, directive))
|
||||
return true;
|
||||
|
||||
|
@ -2088,9 +2089,9 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::getTokenInternal(TokenKind* const tt
|
|||
}
|
||||
|
||||
// We could point "into" a mistyped escape, e.g. for "\u{41H}" we
|
||||
// could point at the 'H'. But we don't do that now, so the
|
||||
// character after the '\' isn't necessarily bad, so just point at
|
||||
// the start of the actually-invalid escape.
|
||||
// could point at the 'H'. But we don't do that now, so the code
|
||||
// unit after the '\' isn't necessarily bad, so just point at the
|
||||
// start of the actually-invalid escape.
|
||||
ungetCodeUnit('\\');
|
||||
error(JSMSG_BAD_ESCAPE);
|
||||
return badToken();
|
||||
|
@ -2434,7 +2435,7 @@ TokenStreamSpecific<CharT, AnyCharsAccess>::getStringOrTemplateToken(char untilC
|
|||
// so it'll pass into this |if|-block.
|
||||
if (!JS7_ISHEX(u3)) {
|
||||
if (parsingTemplate) {
|
||||
// We put the character back so that we read it
|
||||
// We put the code unit back so that we read it
|
||||
// on the next pass, which matters if it was
|
||||
// '`' or '\'.
|
||||
ungetCodeUnit(u3);
|
||||
|
|
|
@ -195,8 +195,8 @@ namespace js {
|
|||
namespace frontend {
|
||||
|
||||
struct TokenPos {
|
||||
uint32_t begin; // Offset of the token's first char.
|
||||
uint32_t end; // Offset of 1 past the token's last char.
|
||||
uint32_t begin; // Offset of the token's first code unit.
|
||||
uint32_t end; // Offset of 1 past the token's last code unit.
|
||||
|
||||
TokenPos()
|
||||
: begin(0),
|
||||
|
@ -286,9 +286,9 @@ struct Token
|
|||
// Div.
|
||||
Operand,
|
||||
|
||||
// Treat subsequent characters as the tail of a template literal, after
|
||||
// Treat subsequent code units as the tail of a template literal, after
|
||||
// a template substitution, beginning with a "}", continuing with zero
|
||||
// or more template literal characters, and ending with either "${" or
|
||||
// or more template literal code units, and ending with either "${" or
|
||||
// the end of the template literal. For example:
|
||||
//
|
||||
// var entity = "world";
|
||||
|
@ -318,8 +318,8 @@ struct Token
|
|||
// conditional expression and missing it results in SyntaxError.
|
||||
// Comma/semicolon cases are also gotten as operators (None), and 4th
|
||||
// case is gotten after them. If no comma/semicolon found but EOL,
|
||||
// the next token should be gotten as operand in 4th case (especially if
|
||||
// '/' is the first character). So we should peek the token as
|
||||
// the next token should be gotten as operand in 4th case (especially
|
||||
// if '/' is the first code unit). So we should peek the token as
|
||||
// operand before try getting colon/comma/semicolon.
|
||||
// See also the comment in Parser::assignExpr().
|
||||
NoneIsOperand,
|
||||
|
@ -1080,9 +1080,9 @@ class TokenStreamCharsShared
|
|||
|
||||
protected:
|
||||
/**
|
||||
* Character buffer transiently used to store sequences of identifier or
|
||||
* string code points when such can't be directly processed from the
|
||||
* original source text (e.g. because it contains escapes).
|
||||
* Buffer transiently used to store sequences of identifier or string code
|
||||
* points when such can't be directly processed from the original source
|
||||
* text (e.g. because it contains escapes).
|
||||
*/
|
||||
CharBuffer charBuffer;
|
||||
|
||||
|
@ -1327,7 +1327,7 @@ class GeneralTokenStreamChars
|
|||
}
|
||||
|
||||
/**
|
||||
* Consume characters til EOL/EOF following the start of a single-line
|
||||
* Consume code units til EOL/EOF following the start of a single-line
|
||||
* comment, without consuming the EOL/EOF.
|
||||
*/
|
||||
void consumeRestOfSingleLineComment();
|
||||
|
@ -1370,7 +1370,7 @@ class TokenStreamChars<char16_t, AnyCharsAccess>
|
|||
|
||||
// Try to get the next code point, normalizing '\r', '\r\n', '\n', and the
|
||||
// Unicode line/paragraph separators into '\n'. Also updates internal
|
||||
// line-counter state. Return true on success and store the character in
|
||||
// line-counter state. Return true on success and store the code point in
|
||||
// |*cp|. Return false and leave |*cp| undefined on failure.
|
||||
MOZ_MUST_USE bool getCodePoint(int32_t* cp);
|
||||
|
||||
|
@ -1461,7 +1461,7 @@ class TokenStreamChars<char16_t, AnyCharsAccess>
|
|||
|
||||
// TokenStream is the lexical scanner for JavaScript source text.
|
||||
//
|
||||
// It takes a buffer of CharT characters (currently only char16_t encoding
|
||||
// It takes a buffer of CharT code units (currently only char16_t encoding
|
||||
// UTF-16, but we're adding either UTF-8 or Latin-1 single-byte text soon) and
|
||||
// linearly scans it into |Token|s.
|
||||
//
|
||||
|
|
Загрузка…
Ссылка в новой задаче