diff --git a/js/src/builtin/String.cpp b/js/src/builtin/String.cpp index 5d94f9f74499..3d84d8286eb1 100644 --- a/js/src/builtin/String.cpp +++ b/js/src/builtin/String.cpp @@ -905,9 +905,9 @@ ToLowerCase(JSContext* cx, JSLinearString* str) // We don't need extra special casing checks in the loop below, // because U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE and U+03A3 // GREEK CAPITAL LETTER SIGMA already have simple lower case mappings. - MOZ_ASSERT(unicode::CanLowerCase(unicode::LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE), + MOZ_ASSERT(unicode::ChangesWhenLowerCased(unicode::LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE), "U+0130 has a simple lower case mapping"); - MOZ_ASSERT(unicode::CanLowerCase(unicode::GREEK_CAPITAL_LETTER_SIGMA), + MOZ_ASSERT(unicode::ChangesWhenLowerCased(unicode::GREEK_CAPITAL_LETTER_SIGMA), "U+03A3 has a simple lower case mapping"); // One element Latin-1 strings can be directly retrieved from the @@ -930,7 +930,7 @@ ToLowerCase(JSContext* cx, JSLinearString* str) if (unicode::IsLeadSurrogate(c) && i + 1 < length) { CharT trail = chars[i + 1]; if (unicode::IsTrailSurrogate(trail)) { - if (unicode::CanLowerCaseNonBMP(c, trail)) + if (unicode::ChangesWhenLowerCasedNonBMP(c, trail)) break; i++; @@ -938,7 +938,7 @@ ToLowerCase(JSContext* cx, JSLinearString* str) } } } - if (unicode::CanLowerCase(c)) + if (unicode::ChangesWhenLowerCased(c)) break; } @@ -1114,24 +1114,24 @@ js::str_toLocaleLowerCase(JSContext* cx, unsigned argc, Value* vp) #endif // EXPOSE_INTL_API static inline bool -CanUpperCaseSpecialCasing(Latin1Char charCode) +ToUpperCaseHasSpecialCasing(Latin1Char charCode) { - // Handle U+00DF LATIN SMALL LETTER SHARP S inline, all other Latin-1 - // characters don't have special casing rules. - MOZ_ASSERT_IF(charCode != unicode::LATIN_SMALL_LETTER_SHARP_S, - !unicode::CanUpperCaseSpecialCasing(charCode)); + // U+00DF LATIN SMALL LETTER SHARP S is the only Latin-1 code point with + // special casing rules, so detect it inline. 
+ bool hasUpperCaseSpecialCasing = charCode == unicode::LATIN_SMALL_LETTER_SHARP_S; + MOZ_ASSERT(hasUpperCaseSpecialCasing == unicode::ChangesWhenUpperCasedSpecialCasing(charCode)); - return charCode == unicode::LATIN_SMALL_LETTER_SHARP_S; + return hasUpperCaseSpecialCasing; } static inline bool -CanUpperCaseSpecialCasing(char16_t charCode) +ToUpperCaseHasSpecialCasing(char16_t charCode) { - return unicode::CanUpperCaseSpecialCasing(charCode); + return unicode::ChangesWhenUpperCasedSpecialCasing(charCode); } static inline size_t -LengthUpperCaseSpecialCasing(Latin1Char charCode) +ToUpperCaseLengthSpecialCasing(Latin1Char charCode) { // U+00DF LATIN SMALL LETTER SHARP S is uppercased to two 'S'. MOZ_ASSERT(charCode == unicode::LATIN_SMALL_LETTER_SHARP_S); @@ -1140,15 +1140,15 @@ LengthUpperCaseSpecialCasing(Latin1Char charCode) } static inline size_t -LengthUpperCaseSpecialCasing(char16_t charCode) +ToUpperCaseLengthSpecialCasing(char16_t charCode) { - MOZ_ASSERT(::CanUpperCaseSpecialCasing(charCode)); + MOZ_ASSERT(ToUpperCaseHasSpecialCasing(charCode)); return unicode::LengthUpperCaseSpecialCasing(charCode); } static inline void -AppendUpperCaseSpecialCasing(char16_t charCode, Latin1Char* elements, size_t* index) +ToUpperCaseAppendUpperCaseSpecialCasing(char16_t charCode, Latin1Char* elements, size_t* index) { // U+00DF LATIN SMALL LETTER SHARP S is uppercased to two 'S'. 
MOZ_ASSERT(charCode == unicode::LATIN_SMALL_LETTER_SHARP_S); @@ -1159,7 +1159,7 @@ AppendUpperCaseSpecialCasing(char16_t charCode, Latin1Char* elements, size_t* in } static inline void -AppendUpperCaseSpecialCasing(char16_t charCode, char16_t* elements, size_t* index) +ToUpperCaseAppendUpperCaseSpecialCasing(char16_t charCode, char16_t* elements, size_t* index) { unicode::AppendUpperCaseSpecialCasing(charCode, elements, index); } @@ -1191,12 +1191,12 @@ ToUpperCaseImpl(DestChar* destChars, const SrcChar* srcChars, size_t startIndex, } } - if (MOZ_UNLIKELY(c > 0x7f && ::CanUpperCaseSpecialCasing(static_cast(c)))) { + if (MOZ_UNLIKELY(c > 0x7f && ToUpperCaseHasSpecialCasing(static_cast(c)))) { // Return if the output buffer is too small. if (srcLength == destLength) return i; - ::AppendUpperCaseSpecialCasing(c, destChars, &j); + ToUpperCaseAppendUpperCaseSpecialCasing(c, destChars, &j); continue; } @@ -1226,8 +1226,8 @@ ToUpperCaseLength(const CharT* chars, size_t startIndex, size_t length) for (size_t i = startIndex; i < length; i++) { char16_t c = chars[i]; - if (c > 0x7f && ::CanUpperCaseSpecialCasing(static_cast(c))) - upperLength += ::LengthUpperCaseSpecialCasing(static_cast(c)) - 1; + if (c > 0x7f && ToUpperCaseHasSpecialCasing(static_cast(c))) + upperLength += ToUpperCaseLengthSpecialCasing(static_cast(c)) - 1; } return upperLength; } @@ -1307,7 +1307,7 @@ ToUpperCase(JSContext* cx, JSLinearString* str) } MOZ_ASSERT(unicode::ToUpperCase(c) > JSString::MAX_LATIN1_CHAR || - ::CanUpperCaseSpecialCasing(c)); + ToUpperCaseHasSpecialCasing(c)); } } @@ -1319,7 +1319,7 @@ ToUpperCase(JSContext* cx, JSLinearString* str) if (unicode::IsLeadSurrogate(c) && i + 1 < length) { CharT trail = chars[i + 1]; if (unicode::IsTrailSurrogate(trail)) { - if (unicode::CanUpperCaseNonBMP(c, trail)) + if (unicode::ChangesWhenUpperCasedNonBMP(c, trail)) break; i++; @@ -1327,9 +1327,9 @@ ToUpperCase(JSContext* cx, JSLinearString* str) } } } - if (unicode::CanUpperCase(c)) + if 
(unicode::ChangesWhenUpperCased(c)) break; - if (MOZ_UNLIKELY(c > 0x7f && ::CanUpperCaseSpecialCasing(c))) + if (MOZ_UNLIKELY(c > 0x7f && ToUpperCaseHasSpecialCasing(c))) break; } diff --git a/js/src/util/Unicode.cpp b/js/src/util/Unicode.cpp index 09bfdfbbade4..45005bae75ee 100644 --- a/js/src/util/Unicode.cpp +++ b/js/src/util/Unicode.cpp @@ -2679,7 +2679,7 @@ js::unicode::IsIdentifierPartNonBMP(uint32_t codePoint) } bool -js::unicode::CanUpperCaseSpecialCasing(char16_t ch) +js::unicode::ChangesWhenUpperCasedSpecialCasing(char16_t ch) { if (ch < 0x00DF || ch > 0xFB17) return false; diff --git a/js/src/util/Unicode.h b/js/src/util/Unicode.h index 4e5f147c7703..f80c5ff80a10 100644 --- a/js/src/util/Unicode.h +++ b/js/src/util/Unicode.h @@ -255,8 +255,9 @@ IsSpaceOrBOM2(char16_t ch) } /* - * Returns the simple upper case mapping (see CanUpperCaseSpecialCasing for - * details) of the given UTF-16 code unit. + * Returns the simple upper case mapping (possibly the identity mapping; see + * ChangesWhenUpperCasedSpecialCasing for details) of the given UTF-16 code + * unit. */ inline char16_t ToUpperCase(char16_t ch) @@ -273,8 +274,9 @@ ToUpperCase(char16_t ch) } /* - * Returns the simple lower case mapping (see CanUpperCaseSpecialCasing for - * details) of the given UTF-16 code unit. + * Returns the simple lower case mapping (possibly the identity mapping; see + * ChangesWhenUpperCasedSpecialCasing for details) of the given UTF-16 code + * unit. */ inline char16_t ToLowerCase(char16_t ch) @@ -290,32 +292,46 @@ ToLowerCase(char16_t ch) return uint16_t(ch) + info.lowerCase; } -// Returns true iff ToUpperCase(ch) != ch. +/** + * Returns true iff ToUpperCase(ch) != ch. + * + * This function isn't guaranteed to correctly handle code points for which + * |ChangesWhenUpperCasedSpecialCasing| returns true, so it is *not* always the + * same as the value of the Changes_When_Uppercased Unicode property for + * the code point. 
+ */ inline bool -CanUpperCase(char16_t ch) +ChangesWhenUpperCased(char16_t ch) { if (ch < 128) return ch >= 'a' && ch <= 'z'; return CharInfo(ch).upperCase != 0; } -// Returns true iff ToUpperCase(ch) != ch. +/** + * Returns true iff ToUpperCase(ch) != ch. + * + * This function isn't guaranteed to correctly handle code points for which + * |ChangesWhenUpperCasedSpecialCasing| returns true, so it is *not* always the + * same as the value of the Changes_When_Uppercased Unicode property for + * the code point. + */ inline bool -CanUpperCase(JS::Latin1Char ch) +ChangesWhenUpperCased(JS::Latin1Char ch) { if (MOZ_LIKELY(ch < 128)) return ch >= 'a' && ch <= 'z'; // U+00B5 and U+00E0 to U+00FF, except U+00F7, have an uppercase form. - bool canUpper = ch == MICRO_SIGN || + bool hasUpper = ch == MICRO_SIGN || (((ch & ~0x1F) == LATIN_SMALL_LETTER_A_WITH_GRAVE) && ch != DIVISION_SIGN); - MOZ_ASSERT(canUpper == CanUpperCase(char16_t(ch))); - return canUpper; + MOZ_ASSERT(hasUpper == ChangesWhenUpperCased(char16_t(ch))); + return hasUpper; } // Returns true iff ToLowerCase(ch) != ch. inline bool -CanLowerCase(char16_t ch) +ChangesWhenLowerCased(char16_t ch) { if (ch < 128) return ch >= 'A' && ch <= 'Z'; @@ -324,16 +340,16 @@ CanLowerCase(char16_t ch) // Returns true iff ToLowerCase(ch) != ch. inline bool -CanLowerCase(JS::Latin1Char ch) +ChangesWhenLowerCased(JS::Latin1Char ch) { if (MOZ_LIKELY(ch < 128)) return ch >= 'A' && ch <= 'Z'; // U+00C0 to U+00DE, except U+00D7, have a lowercase form. 
- bool canLower = ((ch & ~0x1F) == LATIN_CAPITAL_LETTER_A_WITH_GRAVE) && + bool hasLower = ((ch & ~0x1F) == LATIN_CAPITAL_LETTER_A_WITH_GRAVE) && ((ch & MULTIPLICATION_SIGN) != MULTIPLICATION_SIGN); - MOZ_ASSERT(canLower == CanLowerCase(char16_t(ch))); - return canLower; + MOZ_ASSERT(hasLower == ChangesWhenLowerCased(char16_t(ch))); + return hasLower; } #define CHECK_RANGE(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \ @@ -341,14 +357,14 @@ CanLowerCase(JS::Latin1Char ch) return true; inline bool -CanUpperCaseNonBMP(char16_t lead, char16_t trail) +ChangesWhenUpperCasedNonBMP(char16_t lead, char16_t trail) { FOR_EACH_NON_BMP_UPPERCASE(CHECK_RANGE) return false; } inline bool -CanLowerCaseNonBMP(char16_t lead, char16_t trail) +ChangesWhenLowerCasedNonBMP(char16_t lead, char16_t trail) { FOR_EACH_NON_BMP_LOWERCASE(CHECK_RANGE) return false; @@ -381,24 +397,36 @@ ToLowerCaseNonBMPTrail(char16_t lead, char16_t trail) } /* - * Returns true if the given UTF-16 code unit has a language-independent, - * unconditional or conditional special upper case mapping. + * Returns true if, independent of language/locale, the given UTF-16 code unit + * has a special upper case mapping. * * Unicode defines two case mapping modes: - * 1. "simple case mappings" for one-to-one mappings which are independent of - * context and language (defined in UnicodeData.txt). - * 2. "special case mappings" for mappings which can increase or decrease the - * string length; or are dependent on context or locale (defined in - * SpecialCasing.txt). * - * The CanUpperCase() method defined above only supports simple case mappings. - * In order to support the full case mappings of all Unicode characters, - * callers need to check this method in addition to CanUpperCase(). + * 1. "simple case mappings" (defined in UnicodeData.txt) for one-to-one + * mappings that are always the same regardless of locale or context + * within a string (e.g. "a"→"A"). + * 2. 
"special case mappings" (defined in SpecialCasing.txt) for mappings + * that alter string length (e.g. uppercasing "ß"→"SS") or where different + * mappings occur depending on language/locale (e.g. uppercasing "i"→"I" + * usually but "i"→"İ" in Turkish) or context within the string (e.g. + * lowercasing "Σ" U+03A3 GREEK CAPITAL LETTER SIGMA to "ς" U+03C2 GREEK + * SMALL LETTER FINAL SIGMA when the sigma appears [roughly speaking] at + * the end of a word but "σ" U+03C3 GREEK SMALL LETTER SIGMA anywhere + * else). * - * NOTE: All special upper case mappings are unconditional in Unicode 9. + * The ChangesWhenUpperCased*() functions defined above will return true for + * code points that have simple case mappings, but they may not return the + * right result for code points that have special case mappings. To correctly + * support full case mappings for all code points, callers must determine + * whether this function returns true or false for the code point, then use + * AppendUpperCaseSpecialCasing in the former case and ToUpperCase in the + * latter. + * + * NOTE: All special upper case mappings are unconditional (that is, they don't + * depend on language/locale or context within the string) in Unicode 10. */ bool -CanUpperCaseSpecialCasing(char16_t ch); +ChangesWhenUpperCasedSpecialCasing(char16_t ch); /* * Returns the length of the upper case mapping of |ch|. diff --git a/js/src/util/make_unicode.py b/js/src/util/make_unicode.py index a075d3d26dee..c3206a035fa8 100755 --- a/js/src/util/make_unicode.py +++ b/js/src/util/make_unicode.py @@ -723,10 +723,10 @@ def write_special_casing_methods(unconditional_toupper, codepoint_table, println println(indent, ' return {};'.format(range_test_expr)) println(indent, '}') - def write_CanUpperCaseSpecialCasing(): + def write_ChangesWhenUpperCasedSpecialCasing(): """ Checks if the input has a special upper case mapping. 
""" println('bool') - println('js::unicode::CanUpperCaseSpecialCasing(char16_t ch)') + println('js::unicode::ChangesWhenUpperCasedSpecialCasing(char16_t ch)') println('{') assert unconditional_toupper, "|unconditional_toupper| is not empty" @@ -816,7 +816,7 @@ def write_special_casing_methods(unconditional_toupper, codepoint_table, println println('}') - write_CanUpperCaseSpecialCasing() + write_ChangesWhenUpperCasedSpecialCasing() println('') write_LengthUpperCaseSpecialCasing() println('')