From 7bcb4fd669781a714843649a7f46670ee09a529f Mon Sep 17 00:00:00 2001 From: Wes Kocher Date: Wed, 2 Nov 2016 12:12:02 -0700 Subject: [PATCH] Backed out 2 changesets (bug 1314037) for win64 wpt(1) failures a=backout Backed out changeset 0223902c2353 (bug 1314037) Backed out changeset c9b3a1252363 (bug 1314037) --- js/src/frontend/TokenStream.cpp | 156 ++++-------- js/src/frontend/TokenStream.h | 7 +- js/src/irregexp/RegExpParser.cpp | 36 +-- js/src/irregexp/RegExpParser.h | 8 +- js/src/jsstr.cpp | 60 +++-- ...dentifiers-with-extended-unicode-escape.js | 229 ------------------ js/src/vm/Unicode.h | 86 ++----- 7 files changed, 138 insertions(+), 444 deletions(-) delete mode 100644 js/src/tests/ecma_6/Syntax/identifiers-with-extended-unicode-escape.js diff --git a/js/src/frontend/TokenStream.cpp b/js/src/frontend/TokenStream.cpp index a3b0b50e44d6..ea7e01bfaab1 100644 --- a/js/src/frontend/TokenStream.cpp +++ b/js/src/frontend/TokenStream.cpp @@ -28,10 +28,10 @@ #include "vm/HelperThreads.h" #include "vm/Keywords.h" #include "vm/StringBuffer.h" -#include "vm/Unicode.h" using namespace js; using namespace js::frontend; +using namespace js::unicode; using mozilla::Maybe; using mozilla::PodAssign; @@ -106,12 +106,12 @@ IsIdentifier(const CharT* chars, size_t length) if (length == 0) return false; - if (!unicode::IsIdentifierStart(char16_t(*chars))) + if (!IsIdentifierStart(*chars)) return false; const CharT* end = chars + length; while (++chars != end) { - if (!unicode::IsIdentifierPart(char16_t(*chars))) + if (!IsIdentifierPart(*chars)) return false; } @@ -757,96 +757,42 @@ TokenStream::reportAsmJSError(uint32_t offset, unsigned errorNumber, ...) } // We have encountered a '\': check for a Unicode escape sequence after it. -// Return the length of the escape sequence and the character code point (by -// value) if we found a Unicode escape sequence. Otherwise, return 0. In both -// cases, do not advance along the buffer. -uint32_t -TokenStream::peekUnicodeEscape(uint32_t* codePoint) +// Return 'true' and the character code value (by value) if we found a +// Unicode escape sequence. Otherwise, return 'false'. In both cases, do not +// advance along the buffer. +bool +TokenStream::peekUnicodeEscape(int* result) { - int32_t c = getCharIgnoreEOL(); - if (c != 'u') { - ungetCharIgnoreEOL(c); - return 0; - } + char16_t cp[5]; - char16_t cp[3]; - uint32_t length; - c = getCharIgnoreEOL(); - if (JS7_ISHEX(c) && peekChars(3, cp) && - JS7_ISHEX(cp[0]) && JS7_ISHEX(cp[1]) && JS7_ISHEX(cp[2])) + if (peekChars(5, cp) && cp[0] == 'u' && + JS7_ISHEX(cp[1]) && JS7_ISHEX(cp[2]) && + JS7_ISHEX(cp[3]) && JS7_ISHEX(cp[4])) { - *codePoint = (JS7_UNHEX(c) << 12) | - (JS7_UNHEX(cp[0]) << 8) | - (JS7_UNHEX(cp[1]) << 4) | - JS7_UNHEX(cp[2]); - length = 5; - } else if (c == '{') { - length = peekExtendedUnicodeEscape(codePoint); - } else { - length = 0; + *result = (((((JS7_UNHEX(cp[1]) << 4) + + JS7_UNHEX(cp[2])) << 4) + + JS7_UNHEX(cp[3])) << 4) + + JS7_UNHEX(cp[4]); + return true; } - - ungetCharIgnoreEOL(c); - ungetCharIgnoreEOL('u'); - return length; -} - -uint32_t -TokenStream::peekExtendedUnicodeEscape(uint32_t* codePoint) -{ - // The opening brace character was already read. - int32_t c = getCharIgnoreEOL(); - - // Skip leading zeros. - uint32_t leadingZeros = 0; - while (c == '0') { - leadingZeros++; - c = getCharIgnoreEOL(); - } - - char16_t cp[6]; - size_t i = 0; - uint32_t code = 0; - while (JS7_ISHEX(c) && i < 6) { - cp[i++] = c; - code = code << 4 | JS7_UNHEX(c); - c = getCharIgnoreEOL(); - } - - uint32_t length; - if (c == '}' && (leadingZeros > 0 || i > 0) && code <= unicode::NonBMPMax) { - *codePoint = code; - length = leadingZeros + i + 3; - } else { - length = 0; - } - - ungetCharIgnoreEOL(c); - while (i--) - ungetCharIgnoreEOL(cp[i]); - while (leadingZeros--) - ungetCharIgnoreEOL('0'); - - return length; -} - -uint32_t -TokenStream::matchUnicodeEscapeIdStart(uint32_t* codePoint) -{ - uint32_t length = peekUnicodeEscape(codePoint); - if (length > 0 && unicode::IsIdentifierStart(*codePoint)) { - skipChars(length); - return length; - } - return 0; + return false; } bool -TokenStream::matchUnicodeEscapeIdent(uint32_t* codePoint) +TokenStream::matchUnicodeEscapeIdStart(int32_t* cp) { - uint32_t length = peekUnicodeEscape(codePoint); - if (length > 0 && unicode::IsIdentifierPart(*codePoint)) { - skipChars(length); + if (peekUnicodeEscape(cp) && IsIdentifierStart(*cp)) { + skipChars(5); + return true; + } + return false; +} + +bool +TokenStream::matchUnicodeEscapeIdent(int32_t* cp) +{ + if (peekUnicodeEscape(cp) && IsIdentifierPart(*cp)) { + skipChars(5); return true; } return false; @@ -900,7 +846,7 @@ TokenStream::getDirective(bool isMultiline, bool shouldWarnDeprecated, skipChars(directiveLength); tokenbuf.clear(); - while ((c = peekChar()) && c != EOF && !unicode::IsSpaceOrBOM2(c)) { + while ((c = peekChar()) && c != EOF && !IsSpaceOrBOM2(c)) { getChar(); // Debugging directives can occur in both single- and multi-line // comments. If we're currently inside a multi-line comment, we also @@ -995,15 +941,14 @@ IsTokenSane(Token* tp) bool TokenStream::putIdentInTokenbuf(const char16_t* identStart) { - int32_t c; - uint32_t qc; + int32_t c, qc; const char16_t* tmp = userbuf.addressOfNextRawChar(); userbuf.setAddressOfNextRawChar(identStart); tokenbuf.clear(); for (;;) { c = getCharIgnoreEOL(); - if (!unicode::IsIdentifierPart(char16_t(c))) { + if (!IsIdentifierPart(c)) { if (c != '\\' || !matchUnicodeEscapeIdent(&qc)) break; c = qc; @@ -1119,8 +1064,7 @@ static_assert(LastCharKind < (1 << (sizeof(firstCharKinds[0]) * 8)), bool TokenStream::getTokenInternal(TokenKind* ttp, Modifier modifier) { - int c; - uint32_t qc; + int c, qc; Token* tp; FirstCharKind c1kind; const char16_t* numStart; @@ -1151,7 +1095,7 @@ TokenStream::getTokenInternal(TokenKind* ttp, Modifier modifier) // Chars not in the range 0..127 are rare. Getting them out of the way // early allows subsequent checking to be faster. if (MOZ_UNLIKELY(c >= 128)) { - if (unicode::IsSpaceOrBOM2(c)) { + if (IsSpaceOrBOM2(c)) { if (c == LINE_SEPARATOR || c == PARA_SEPARATOR) { updateLineInfoForEOL(); updateFlagsForEOL(); @@ -1168,7 +1112,7 @@ TokenStream::getTokenInternal(TokenKind* ttp, Modifier modifier) static_assert('_' < 128, "IdentifierStart contains '_', but as !IsLetter('_'), " "ensure that '_' is never handled here"); - if (unicode::IsLetter(c)) { + if (IsLetter(c)) { identStart = userbuf.addressOfNextRawChar() - 1; hadUnicodeEscape = false; goto identifier; @@ -1224,7 +1168,7 @@ TokenStream::getTokenInternal(TokenKind* ttp, Modifier modifier) c = getCharIgnoreEOL(); if (c == EOF) break; - if (!unicode::IsIdentifierPart(char16_t(c))) { + if (!IsIdentifierPart(c)) { if (c != '\\' || !matchUnicodeEscapeIdent(&qc)) break; hadUnicodeEscape = true; @@ -1318,7 +1262,7 @@ TokenStream::getTokenInternal(TokenKind* ttp, Modifier modifier) } ungetCharIgnoreEOL(c); - if (c != EOF && unicode::IsIdentifierStart(char16_t(c))) { + if (c != EOF && IsIdentifierStart(c)) { reportError(JSMSG_IDSTART_AFTER_NUMBER); goto error; } @@ -1425,7 +1369,7 @@ TokenStream::getTokenInternal(TokenKind* ttp, Modifier modifier) } ungetCharIgnoreEOL(c); - if (c != EOF && unicode::IsIdentifierStart(char16_t(c))) { + if (c != EOF && IsIdentifierStart(c)) { reportError(JSMSG_IDSTART_AFTER_NUMBER); goto error; } @@ -1478,15 +1422,13 @@ TokenStream::getTokenInternal(TokenKind* ttp, Modifier modifier) tp->type = matchChar('=') ? TOK_ADDASSIGN : TOK_ADD; goto out; - case '\\': { - uint32_t escapeLength = matchUnicodeEscapeIdStart(&qc); - if (escapeLength > 0) { - identStart = userbuf.addressOfNextRawChar() - escapeLength - 1; - hadUnicodeEscape = true; + case '\\': + hadUnicodeEscape = matchUnicodeEscapeIdStart(&qc); + if (hadUnicodeEscape) { + identStart = userbuf.addressOfNextRawChar() - 6; goto identifier; } goto badchar; - } case '|': if (matchChar('|')) @@ -1732,7 +1674,7 @@ TokenStream::getBracedUnicode(uint32_t* cp) return false; code = (code << 4) | JS7_UNHEX(c); - if (code > unicode::NonBMPMax) + if (code > 0x10FFFF) return false; first = false; } @@ -1785,13 +1727,13 @@ TokenStream::getStringOrTemplateToken(int untilChar, Token** tp) return false; } - MOZ_ASSERT(code <= unicode::NonBMPMax); - if (code < unicode::NonBMPMin) { + MOZ_ASSERT(code <= 0x10FFFF); + if (code < 0x10000) { c = code; } else { - if (!tokenbuf.append(unicode::LeadSurrogate(code))) + if (!tokenbuf.append((code - 0x10000) / 1024 + 0xD800)) return false; - c = unicode::TrailSurrogate(code); + c = ((code - 0x10000) % 1024) + 0xDC00; } break; } diff --git a/js/src/frontend/TokenStream.h b/js/src/frontend/TokenStream.h index 29dcead62e4b..4d14c8515094 100644 --- a/js/src/frontend/TokenStream.h +++ b/js/src/frontend/TokenStream.h @@ -948,10 +948,9 @@ class MOZ_STACK_CLASS TokenStream void ungetChar(int32_t c); void ungetCharIgnoreEOL(int32_t c); Token* newToken(ptrdiff_t adjust); - uint32_t peekUnicodeEscape(uint32_t* codePoint); - uint32_t peekExtendedUnicodeEscape(uint32_t* codePoint); - uint32_t matchUnicodeEscapeIdStart(uint32_t* codePoint); - bool matchUnicodeEscapeIdent(uint32_t* codePoint); + bool peekUnicodeEscape(int32_t* c); + bool matchUnicodeEscapeIdStart(int32_t* c); + bool matchUnicodeEscapeIdent(int32_t* c); bool peekChars(int n, char16_t* cp); MOZ_MUST_USE bool getDirectives(bool isMultiline, bool shouldWarnDeprecated); diff --git a/js/src/irregexp/RegExpParser.cpp b/js/src/irregexp/RegExpParser.cpp index ccc6ae3ebb77..7619ee78a085 100644 --- a/js/src/irregexp/RegExpParser.cpp +++ b/js/src/irregexp/RegExpParser.cpp @@ -276,7 +276,7 @@ HexValue(uint32_t c) } template -widechar +size_t RegExpParser::ParseOctalLiteral() { MOZ_ASSERT('0' <= current() && current() <= '7'); @@ -297,7 +297,7 @@ RegExpParser::ParseOctalLiteral() template bool -RegExpParser::ParseHexEscape(int length, widechar* value) +RegExpParser::ParseHexEscape(int length, size_t* value) { const CharT* start = position(); uint32_t val = 0; @@ -321,7 +321,7 @@ RegExpParser::ParseHexEscape(int length, widechar* value) template bool -RegExpParser::ParseBracedHexEscape(widechar* value) +RegExpParser::ParseBracedHexEscape(size_t* value) { MOZ_ASSERT(current() == '{'); Advance(); @@ -363,7 +363,7 @@ RegExpParser::ParseBracedHexEscape(widechar* value) template bool -RegExpParser::ParseTrailSurrogate(widechar* value) +RegExpParser::ParseTrailSurrogate(size_t* value) { if (current() != '\\') return false; @@ -541,7 +541,7 @@ RegExpParser::ParseClassCharacterEscape(widechar* code) return true; case 'x': { Advance(); - widechar value; + size_t value; if (ParseHexEscape(2, &value)) { *code = value; return true; @@ -557,7 +557,7 @@ RegExpParser::ParseClassCharacterEscape(widechar* code) } case 'u': { Advance(); - widechar value; + size_t value; if (unicode_) { if (current() == '{') { if (!ParseBracedHexEscape(&value)) @@ -567,7 +567,7 @@ RegExpParser::ParseClassCharacterEscape(widechar* code) } if (ParseHexEscape(4, &value)) { if (unicode::IsLeadSurrogate(value)) { - widechar trail; + size_t trail; if (ParseTrailSurrogate(&trail)) { *code = unicode::UTF16Decode(value, trail); return true; @@ -782,10 +782,10 @@ NegateUnicodeRanges(LifoAlloc* alloc, InfallibleVector** ranges, const RangeType& range = (**ranges)[i]; for (size_t j = 0; j < tmp_ranges->length(); j++) { const RangeType& tmpRange = (*tmp_ranges)[j]; - auto from1 = tmpRange.from(); - auto to1 = tmpRange.to(); - auto from2 = range.from(); - auto to2 = range.to(); + size_t from1 = tmpRange.from(); + size_t to1 = tmpRange.to(); + size_t from2 = range.from(); + size_t to2 = range.to(); if (from1 < from2) { if (to1 < from2) { @@ -926,8 +926,8 @@ UnicodeRangesAtom(LifoAlloc* alloc, const WideCharRange& range = (*wide_ranges)[i]; widechar from = range.from(); widechar to = range.to(); - char16_t from_lead, from_trail; - char16_t to_lead, to_trail; + size_t from_lead, from_trail; + size_t to_lead, to_trail; unicode::UTF16Encode(from, &from_lead, &from_trail); if (from == to) { @@ -1636,7 +1636,7 @@ RegExpParser::ParseDisjunction() } Advance(); - widechar octal = ParseOctalLiteral(); + size_t octal = ParseOctalLiteral(); builder->AddCharacter(octal); break; } @@ -1684,7 +1684,7 @@ RegExpParser::ParseDisjunction() } case 'x': { Advance(2); - widechar value; + size_t value; if (ParseHexEscape(2, &value)) { builder->AddCharacter(value); } else { @@ -1696,7 +1696,7 @@ RegExpParser::ParseDisjunction() } case 'u': { Advance(2); - widechar value; + size_t value; if (unicode_) { if (current() == '{') { if (!ParseBracedHexEscape(&value)) @@ -1706,7 +1706,7 @@ RegExpParser::ParseDisjunction() } else if (unicode::IsTrailSurrogate(value)) { builder->AddAtom(TrailSurrogateAtom(alloc, value)); } else if (value >= unicode::NonBMPMin) { - char16_t lead, trail; + size_t lead, trail; unicode::UTF16Encode(value, &lead, &trail); builder->AddAtom(SurrogatePairAtom(alloc, lead, trail, ignore_case_)); @@ -1715,7 +1715,7 @@ RegExpParser::ParseDisjunction() } } else if (ParseHexEscape(4, &value)) { if (unicode::IsLeadSurrogate(value)) { - widechar trail; + size_t trail; if (ParseTrailSurrogate(&trail)) { builder->AddAtom(SurrogatePairAtom(alloc, value, trail, ignore_case_)); diff --git a/js/src/irregexp/RegExpParser.h b/js/src/irregexp/RegExpParser.h index b5228a86f99a..89910dea4050 100644 --- a/js/src/irregexp/RegExpParser.h +++ b/js/src/irregexp/RegExpParser.h @@ -196,13 +196,13 @@ class RegExpParser // Checks whether the following is a length-digit hexadecimal number, // and sets the value if it is. - bool ParseHexEscape(int length, widechar* value); + bool ParseHexEscape(int length, size_t* value); - bool ParseBracedHexEscape(widechar* value); - bool ParseTrailSurrogate(widechar* value); + bool ParseBracedHexEscape(size_t* value); + bool ParseTrailSurrogate(size_t* value); bool ParseRawSurrogatePair(char16_t* lead, char16_t* trail); - widechar ParseOctalLiteral(); + size_t ParseOctalLiteral(); // Tries to parse the input as a back reference. If successful it // stores the result in the output parameter and returns true. If diff --git a/js/src/jsstr.cpp b/js/src/jsstr.cpp index edfff59e84c0..48e955638f0a 100644 --- a/js/src/jsstr.cpp +++ b/js/src/jsstr.cpp @@ -46,7 +46,6 @@ #include "vm/RegExpObject.h" #include "vm/RegExpStatics.h" #include "vm/StringBuffer.h" -#include "vm/Unicode.h" #include "vm/Interpreter-inl.h" #include "vm/String-inl.h" @@ -55,6 +54,7 @@ using namespace js; using namespace js::gc; +using namespace js::unicode; using JS::Symbol; using JS::SymbolCode; @@ -2762,6 +2762,35 @@ js::str_fromCharCode_one_arg(JSContext* cx, HandleValue code, MutableHandleValue return CodeUnitToString(cx, ucode, rval); } +static inline bool +IsSupplementary(uint32_t codePoint) +{ + return codePoint > 0xFFFF; +} + +static inline char16_t +LeadSurrogate(uint32_t codePoint) +{ + return char16_t((codePoint >> 10) + 0xD7C0); +} + +static inline char16_t +TrailSurrogate(uint32_t codePoint) +{ + return char16_t((codePoint & 0x3FF) | 0xDC00); +} + +static inline void +UTF16Encode(uint32_t codePoint, char16_t* elements, unsigned* index) +{ + if (!IsSupplementary(codePoint)) { + elements[(*index)++] = char16_t(codePoint); + } else { + elements[(*index)++] = LeadSurrogate(codePoint); + elements[(*index)++] = TrailSurrogate(codePoint); + } +} + static MOZ_ALWAYS_INLINE bool ToCodePoint(JSContext* cx, HandleValue code, uint32_t* codePoint) { @@ -2771,7 +2800,7 @@ ToCodePoint(JSContext* cx, HandleValue code, uint32_t* codePoint) return false; // String.fromCodePoint, Steps 5.c-d. - if (JS::ToInteger(nextCP) != nextCP || nextCP < 0 || nextCP > unicode::NonBMPMax) { + if (JS::ToInteger(nextCP) != nextCP || nextCP < 0 || nextCP > 0x10FFFF) { ToCStringBuf cbuf; if (char* numStr = NumberToCString(cx, &cbuf, nextCP)) JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, JSMSG_NOT_A_CODEPOINT, numStr); @@ -2793,10 +2822,10 @@ js::str_fromCodePoint_one_arg(JSContext* cx, HandleValue code, MutableHandleValu return false; // Steps 5.e, 6. - if (!unicode::IsSupplementary(codePoint)) + if (!IsSupplementary(codePoint)) return CodeUnitToString(cx, uint16_t(codePoint), rval); - char16_t chars[] = { unicode::LeadSurrogate(codePoint), unicode::TrailSurrogate(codePoint) }; + char16_t chars[] = { LeadSurrogate(codePoint), TrailSurrogate(codePoint) }; JSString* str = NewStringCopyNDontDeflate(cx, chars, 2); if (!str) return false; @@ -2824,7 +2853,7 @@ str_fromCodePoint_few_args(JSContext* cx, const CallArgs& args) return false; // Step 5.e. - unicode::UTF16Encode(codePoint, elements, &length); + UTF16Encode(codePoint, elements, &length); } // Step 6. @@ -2875,7 +2904,7 @@ js::str_fromCodePoint(JSContext* cx, unsigned argc, Value* vp) } // Step 5.e. - unicode::UTF16Encode(codePoint, elements, &length); + UTF16Encode(codePoint, elements, &length); } elements[length] = 0; @@ -3584,11 +3613,11 @@ Encode(StringBuffer& sb, const CharT* chars, size_t length, if (!sb.append(c)) return Encode_Failure; } else { - if (unicode::IsTrailSurrogate(c)) + if (c >= 0xDC00 && c <= 0xDFFF) return Encode_BadUri; uint32_t v; - if (!unicode::IsLeadSurrogate(c)) { + if (c < 0xD800 || c > 0xDBFF) { v = c; } else { k++; @@ -3596,10 +3625,10 @@ Encode(StringBuffer& sb, const CharT* chars, size_t length, return Encode_BadUri; char16_t c2 = chars[k]; - if (!unicode::IsTrailSurrogate(c2)) + if (c2 < 0xDC00 || c2 > 0xDFFF) return Encode_BadUri; - v = unicode::UTF16Decode(c, c2); + v = ((c - 0xD800) << 10) + (c2 - 0xDC00) + 0x10000; } uint8_t utf8buf[4]; size_t L = OneUcs4ToUtf8Char(utf8buf, v); @@ -3699,14 +3728,15 @@ Decode(StringBuffer& sb, const CharT* chars, size_t length, const bool* reserved octets[j] = char(B); } uint32_t v = JS::Utf8ToOneUcs4Char(octets, n); - if (v >= unicode::NonBMPMin) { - if (v > unicode::NonBMPMax) + if (v >= 0x10000) { + v -= 0x10000; + if (v > 0xFFFFF) return Decode_BadUri; - char16_t H = unicode::LeadSurrogate(v); + c = char16_t((v & 0x3FF) + 0xDC00); + char16_t H = char16_t((v >> 10) + 0xD800); if (!sb.append(H)) return Decode_Failure; - c = unicode::TrailSurrogate(v); } else { c = char16_t(v); } @@ -3810,7 +3840,7 @@ str_encodeURI_Component(JSContext* cx, unsigned argc, Value* vp) uint32_t js::OneUcs4ToUtf8Char(uint8_t* utf8Buffer, uint32_t ucs4Char) { - MOZ_ASSERT(ucs4Char <= unicode::NonBMPMax); + MOZ_ASSERT(ucs4Char <= 0x10FFFF); if (ucs4Char < 0x80) { utf8Buffer[0] = uint8_t(ucs4Char); diff --git a/js/src/tests/ecma_6/Syntax/identifiers-with-extended-unicode-escape.js b/js/src/tests/ecma_6/Syntax/identifiers-with-extended-unicode-escape.js deleted file mode 100644 index e67c359a04ad..000000000000 --- a/js/src/tests/ecma_6/Syntax/identifiers-with-extended-unicode-escape.js +++ /dev/null @@ -1,229 +0,0 @@ -/* This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ - -// Simple cases, not using eval. -{ - let \u{61} = 123; - assertEq(a, 123); - - let \u{6A} = 123; - assertEq(j, 123); - - let a\u{62} = 456; - assertEq(ab, 456); - - let \u{63}\u{6b} = 789; - assertEq(ck, 789); -} - -const leadingZeros = [0, 1, 2, 3, 4, 100].map(c => "0".repeat(c)); - - -// From DerivedCoreProperties.txt (Unicode 9): -// Derived Property: ID_Start -// Characters that can start an identifier. -// Generated from: -// Lu + Ll + Lt + Lm + Lo + Nl -// + Other_ID_Start -// - Pattern_Syntax -// - Pattern_White_Space -const idStart = [ - 0x0041, // LATIN CAPITAL LETTER A, Gc=Lu - 0x006A, // LATIN SMALL LETTER J, Gc=Ll - 0x00C9, // LATIN CAPITAL LETTER E WITH ACUTE, Gc=Lu - 0x00FF, // LATIN SMALL LETTER Y WITH DIAERESIS, Gc=Ll - 0x01C5, // LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON, Gc=Lt - 0x0294, // LATIN LETTER GLOTTAL STOP, Gc=Lo - 0x037A, // GREEK YPOGEGRAMMENI, Gc=Lm - 0x16EE, // RUNIC ARLAUG SYMBOL, Gc=Nl - 0xFF70, // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK, Gc=Lm -]; - -const idStartSupplemental = [ - 0x10140, // GREEK ACROPHONIC ATTIC ONE QUARTER, Gc=Nl - 0x10300, // OLD ITALIC LETTER A, Gc=Lo - 0x10400, // DESERET CAPITAL LETTER LONG I, Gc=Lu - 0x10430, // DESERET SMALL LETTER SHORT A, Gc=Ll - 0x16B40, // PAHAWH HMONG SIGN VOS SEEV, Gc=Lm -]; - -// From PropList.txt (Unicode 9): -const otherIdStart = [ - // Enable the following lines when Bug 1282724 is fixed. - // 0x1885, // MONGOLIAN LETTER ALI GALI BALUDA, Gc=Mn - // 0x1886, // MONGOLIAN LETTER ALI GALI THREE BALUDA, Gc=Mn - 0x2118, // SCRIPT CAPITAL P, Gc=Sm - 0x212E, // ESTIMATED SYMBOL, Gc=So - 0x309B, // KATAKANA-HIRAGANA VOICED SOUND MARK, Gc=Sk - 0x309C, // KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK, Gc=Sk -]; - -// Remove this list when we support Unicode 9 (Bug 1282724). -const otherIdStart_Unicode9 = [ - 0x1885, // MONGOLIAN LETTER ALI GALI BALUDA, Gc=Mn - 0x1886, // MONGOLIAN LETTER ALI GALI THREE BALUDA, Gc=Mn -]; - -// From DerivedCoreProperties.txt (Unicode 9): -// Derived Property: ID_Continue -// Characters that can continue an identifier. -// Generated from: -// ID_Start -// + Mn + Mc + Nd + Pc -// + Other_ID_Continue -// - Pattern_Syntax -// - Pattern_White_Space -const idContinue = [ - 0x0030, // DIGIT ZERO, Gc=Nd - 0x0300, // COMBINING GRAVE ACCENT, Gc=Mn - 0x0660, // ARABIC-INDIC DIGIT ZERO, Gc=Nd - 0x0903, // DEVANAGARI SIGN VISARGA, Gc=Mc - 0xFF10, // FULLWIDTH DIGIT ZERO, Gc=Nd - 0xFF3F, // FULLWIDTH LOW LINE, Gc=Pc -]; - -const idContinueSupplemental = [ - 0x101FD, // PHAISTOS DISC SIGN COMBINING OBLIQUE STROKE, Gc=Mn - 0x104A0, // OSMANYA DIGIT ZERO, Gc=Nd - 0x11000, // BRAHMI SIGN CANDRABINDU, Gc=Mc -]; - -// From PropList.txt (Unicode 9): -const otherIdContinue = [ - 0x00B7, // MIDDLE DOT, Gc=Po - 0x0387, // GREEK ANO TELEIA, Gc=Po - 0x1369, // ETHIOPIC DIGIT ONE, Gc=No - 0x136A, // ETHIOPIC DIGIT TWO, Gc=No - 0x136B, // ETHIOPIC DIGIT THREE, Gc=No - 0x136C, // ETHIOPIC DIGIT FOUR, Gc=No - 0x136D, // ETHIOPIC DIGIT FIVE, Gc=No - 0x136E, // ETHIOPIC DIGIT SIX, Gc=No - 0x136F, // ETHIOPIC DIGIT SEVEN, Gc=No - 0x1370, // ETHIOPIC DIGIT EIGHT, Gc=No - 0x1371, // ETHIOPIC DIGIT NINE, Gc=No - 0x19DA, // NEW TAI LUE THAM DIGIT ONE, Gc=No -]; - -for (let ident of [...idStart, ...otherIdStart_Unicode9]) { - for (let count of leadingZeros) { - let zeros = "0".repeat(count); - eval(` - let \\u{${zeros}${ident.toString(16)}} = 123; - assertEq(${String.fromCodePoint(ident)}, 123); - `); - } -} - -// Move this to the loop above when Bug 917436 is fixed. -for (let ident of [...idStartSupplemental, ...otherIdStart]) { - for (let zeros of leadingZeros) { - assertThrowsInstanceOf(() => eval(`\\u{${zeros}${ident.toString(16)}}`), SyntaxError); - } -} - -for (let ident of [...idContinue, ...idContinueSupplemental, ...otherIdContinue]) { - for (let zeros of leadingZeros) { - assertThrowsInstanceOf(() => eval(`\\u{${zeros}${ident.toString(16)}}`), SyntaxError); - } -} - -for (let ident of [...idStart, ...otherIdStart_Unicode9, ...idContinue]) { - for (let zeros of leadingZeros) { - eval(` - let A\\u{${zeros}${ident.toString(16)}} = 123; - assertEq(${String.fromCodePoint(0x41, ident)}, 123); - `); - } -} - -// Move this to the loop above when Bug 917436 is fixed. -for (let ident of [...idStartSupplemental, ...otherIdStart, ...idContinueSupplemental, ...otherIdContinue]) { - for (let zeros of leadingZeros) { - assertThrowsInstanceOf(() => eval(`\\u{${zeros}${ident.toString(16)}}`), SyntaxError); - } -} - - -const notIdentifiers = [ - 0x0000, // NULL, Gc=Cc - 0x000A, // LINE FEED (LF), Gc=Cc - 0x005E, // CIRCUMFLEX ACCENT, Gc=Sk - 0x00B1, // PLUS-MINUS SIGN, Gc=Sm - 0xFF61, // HALFWIDTH IDEOGRAPHIC FULL STOP, Gc=Po - 0x10061, // Not assigned. - 0x10100, // AEGEAN WORD SEPARATOR LINE, Gc=Po - 0x100061, // , Gc=Co -]; - -for (let ident of notIdentifiers) { - for (let zeros of leadingZeros) { - assertThrowsInstanceOf(() => eval(`\\u{${zeros}${ident.toString(16)}}`), SyntaxError); - } -} - - -const incompleteEscapes = [ - "\\u{", - "\\u{6", - "\\u{61", - "\\u{061", - "\\u{0061", - "\\u{00061", - "\\u{000061", - "\\u{0000061", - - "\\u}", -]; -for (let invalid of incompleteEscapes) { - // Ends with EOF. - assertThrowsInstanceOf(() => eval(invalid), SyntaxError); - - // Ends with EOL. - assertThrowsInstanceOf(() => eval(invalid + "\n"), SyntaxError); - - // Ends with space. - assertThrowsInstanceOf(() => eval(invalid + " "), SyntaxError); -} - - -const invalidEscapes = [ - // Empty escape. - "", - - // Not hexadecimal characters. - "\0", - "G", - "Z", - "\uFFFF", - "\uDBFF\uDFFF", - - // Has space characters. - " 61", - "61 ", - - // Has newline characters. - "\n61", - "61\n", - - // Exceeds 0x10FFFF, six characters. - "110000", - "110001", - "fffffe", - "ffffff", - - // Exceeds 0x10FFFF, more than six characters. - "10ffff0", - "10ffffabcdef", -]; - -for (let invalid of invalidEscapes) { - for (let zeros of leadingZeros) { - assertThrowsInstanceOf(() => eval(`\\u{${zeros}${invalid}}`), SyntaxError); - assertThrowsInstanceOf(() => eval(`var \\u{${zeros}${invalid}}`), SyntaxError); - } -} - - -if (typeof reportCompare === "function") - reportCompare(0, 0, "ok"); diff --git a/js/src/vm/Unicode.h b/js/src/vm/Unicode.h index a7ebda3dd82d..5954930d5d3e 100644 --- a/js/src/vm/Unicode.h +++ b/js/src/vm/Unicode.h @@ -68,15 +68,6 @@ struct CharFlag { const char16_t BYTE_ORDER_MARK2 = 0xFFFE; const char16_t NO_BREAK_SPACE = 0x00A0; -const char16_t LeadSurrogateMin = 0xD800; -const char16_t LeadSurrogateMax = 0xDBFF; -const char16_t TrailSurrogateMin = 0xDC00; -const char16_t TrailSurrogateMax = 0xDFFF; - -const uint32_t UTF16Max = 0xFFFF; -const uint32_t NonBMPMin = 0x10000; -const uint32_t NonBMPMax = 0x10FFFF; - class CharacterInfo { /* * upperCase and lowerCase normally store the delta between two @@ -144,13 +135,6 @@ IsIdentifierStart(char16_t ch) return CharInfo(ch).isLetter(); } -inline bool -IsIdentifierStart(uint32_t codePoint) -{ - // TODO: Supplemental code points not yet supported (bug 1197230). - return codePoint <= UTF16Max && IsIdentifierStart(char16_t(codePoint)); -} - inline bool IsIdentifierPart(char16_t ch) { @@ -162,13 +146,6 @@ IsIdentifierPart(char16_t ch) return CharInfo(ch).isIdentifierPart(); } -inline bool -IsIdentifierPart(uint32_t codePoint) -{ - // TODO: Supplemental code points not yet supported (bug 1197230). - return codePoint <= UTF16Max && IsIdentifierPart(char16_t(codePoint)); -} - inline bool IsLetter(char16_t ch) { @@ -421,67 +398,42 @@ ReverseFoldCase3(char16_t ch) return uint16_t(ch) + info.reverse3; } +const size_t LeadSurrogateMin = 0xD800; +const size_t LeadSurrogateMax = 0xDBFF; +const size_t TrailSurrogateMin = 0xDC00; +const size_t TrailSurrogateMax = 0xDFFF; +const size_t UTF16Max = 0xFFFF; +const size_t NonBMPMin = 0x10000; +const size_t NonBMPMax = 0x10FFFF; + inline bool -IsSupplementary(uint32_t codePoint) +IsLeadSurrogate(size_t value) { - return codePoint >= NonBMPMin && codePoint <= NonBMPMax; + return value >= LeadSurrogateMin && value <= LeadSurrogateMax; } inline bool -IsLeadSurrogate(uint32_t codePoint) +IsTrailSurrogate(size_t value) { - return codePoint >= LeadSurrogateMin && codePoint <= LeadSurrogateMax; -} - -inline bool -IsTrailSurrogate(uint32_t codePoint) -{ - return codePoint >= TrailSurrogateMin && codePoint <= TrailSurrogateMax; -} - -inline char16_t -LeadSurrogate(uint32_t codePoint) -{ - MOZ_ASSERT(IsSupplementary(codePoint)); - - return char16_t((codePoint >> 10) + (LeadSurrogateMin - (NonBMPMin >> 10))); -} - -inline char16_t -TrailSurrogate(uint32_t codePoint) -{ - MOZ_ASSERT(IsSupplementary(codePoint)); - - return char16_t((codePoint & 0x3FF) | TrailSurrogateMin); + return value >= TrailSurrogateMin && value <= TrailSurrogateMax; } inline void -UTF16Encode(uint32_t codePoint, char16_t* lead, char16_t* trail) +UTF16Encode(size_t cp, size_t* lead, size_t* trail) { - MOZ_ASSERT(IsSupplementary(codePoint)); + MOZ_ASSERT(cp >= NonBMPMin && cp <= NonBMPMax); - *lead = LeadSurrogate(codePoint); - *trail = TrailSurrogate(codePoint); + *lead = (cp - NonBMPMin) / 1024 + LeadSurrogateMin; + *trail = ((cp - NonBMPMin) % 1024) + TrailSurrogateMin; } -static inline void -UTF16Encode(uint32_t codePoint, char16_t* elements, unsigned* index) -{ - if (!IsSupplementary(codePoint)) { - elements[(*index)++] = char16_t(codePoint); - } else { - elements[(*index)++] = LeadSurrogate(codePoint); - elements[(*index)++] = TrailSurrogate(codePoint); - } -} - -inline uint32_t -UTF16Decode(char16_t lead, char16_t trail) +inline size_t +UTF16Decode(size_t lead, size_t trail) { MOZ_ASSERT(IsLeadSurrogate(lead)); MOZ_ASSERT(IsTrailSurrogate(trail)); - return (lead << 10) + trail + (NonBMPMin - (LeadSurrogateMin << 10) - TrailSurrogateMin); + return (lead - LeadSurrogateMin) * 1024 + (trail - TrailSurrogateMin) + NonBMPMin; } } /* namespace unicode */