diff --git a/js/src/builtin/Intl.cpp b/js/src/builtin/Intl.cpp index 5d95d8407a5a..dac996e603bd 100644 --- a/js/src/builtin/Intl.cpp +++ b/js/src/builtin/Intl.cpp @@ -33,7 +33,7 @@ #include "builtin/intl/ICUStubs.h" #include "builtin/intl/NumberFormat.h" #include "builtin/intl/ScopedICUObject.h" -#include "builtin/IntlTimeZoneData.h" +#include "builtin/intl/SharedIntlData.h" #include "ds/Sort.h" #include "gc/FreeOp.h" #include "js/Date.h" @@ -66,6 +66,7 @@ using js::intl::DateTimeFormatOptions; using js::intl::GetAvailableLocales; using js::intl::IcuLocale; using js::intl::INITIAL_CHAR_BUFFER_SIZE; +using js::intl::SharedIntlData; using js::intl::StringsAreEqual; /******************** DateTimeFormat ********************/ @@ -385,280 +386,6 @@ js::intl_defaultCalendar(JSContext* cx, unsigned argc, Value* vp) return DefaultCalendar(cx, locale, args.rval()); } -template -static constexpr Char -ToUpperASCII(Char c) -{ - return ('a' <= c && c <= 'z') - ? (c & ~0x20) - : c; -} - -static_assert(ToUpperASCII('a') == 'A', "verifying 'a' uppercases correctly"); -static_assert(ToUpperASCII('m') == 'M', "verifying 'm' uppercases correctly"); -static_assert(ToUpperASCII('z') == 'Z', "verifying 'z' uppercases correctly"); -static_assert(ToUpperASCII(u'a') == u'A', "verifying u'a' uppercases correctly"); -static_assert(ToUpperASCII(u'k') == u'K', "verifying u'k' uppercases correctly"); -static_assert(ToUpperASCII(u'z') == u'Z', "verifying u'z' uppercases correctly"); - -template -static bool -EqualCharsIgnoreCaseASCII(const Char1* s1, const Char2* s2, size_t len) -{ - for (const Char1* s1end = s1 + len; s1 < s1end; s1++, s2++) { - if (ToUpperASCII(*s1) != ToUpperASCII(*s2)) - return false; - } - return true; -} - -template -static js::HashNumber -HashStringIgnoreCaseASCII(const Char* s, size_t length) -{ - uint32_t hash = 0; - for (size_t i = 0; i < length; i++) - hash = mozilla::AddToHash(hash, ToUpperASCII(s[i])); - return hash; -} - 
-js::SharedIntlData::TimeZoneHasher::Lookup::Lookup(JSLinearString* timeZone) - : js::SharedIntlData::LinearStringLookup(timeZone) -{ - if (isLatin1) - hash = HashStringIgnoreCaseASCII(latin1Chars, length); - else - hash = HashStringIgnoreCaseASCII(twoByteChars, length); -} - -bool -js::SharedIntlData::TimeZoneHasher::match(TimeZoneName key, const Lookup& lookup) -{ - if (key->length() != lookup.length) - return false; - - // Compare time zone names ignoring ASCII case differences. - if (key->hasLatin1Chars()) { - const Latin1Char* keyChars = key->latin1Chars(lookup.nogc); - if (lookup.isLatin1) - return EqualCharsIgnoreCaseASCII(keyChars, lookup.latin1Chars, lookup.length); - return EqualCharsIgnoreCaseASCII(keyChars, lookup.twoByteChars, lookup.length); - } - - const char16_t* keyChars = key->twoByteChars(lookup.nogc); - if (lookup.isLatin1) - return EqualCharsIgnoreCaseASCII(lookup.latin1Chars, keyChars, lookup.length); - return EqualCharsIgnoreCaseASCII(keyChars, lookup.twoByteChars, lookup.length); -} - -static bool -IsLegacyICUTimeZone(const char* timeZone) -{ - for (const auto& legacyTimeZone : js::timezone::legacyICUTimeZones) { - if (StringsAreEqual(timeZone, legacyTimeZone)) - return true; - } - return false; -} - -bool -js::SharedIntlData::ensureTimeZones(JSContext* cx) -{ - if (timeZoneDataInitialized) - return true; - - // If ensureTimeZones() was called previously, but didn't complete due to - // OOM, clear all sets/maps and start from scratch. 
- if (availableTimeZones.initialized()) - availableTimeZones.finish(); - if (!availableTimeZones.init()) { - ReportOutOfMemory(cx); - return false; - } - - UErrorCode status = U_ZERO_ERROR; - UEnumeration* values = ucal_openTimeZones(&status); - if (U_FAILURE(status)) { - intl::ReportInternalError(cx); - return false; - } - ScopedICUObject toClose(values); - - RootedAtom timeZone(cx); - while (true) { - int32_t size; - const char* rawTimeZone = uenum_next(values, &size, &status); - if (U_FAILURE(status)) { - intl::ReportInternalError(cx); - return false; - } - - if (rawTimeZone == nullptr) - break; - - // Skip legacy ICU time zone names. - if (IsLegacyICUTimeZone(rawTimeZone)) - continue; - - MOZ_ASSERT(size >= 0); - timeZone = Atomize(cx, rawTimeZone, size_t(size)); - if (!timeZone) - return false; - - TimeZoneHasher::Lookup lookup(timeZone); - TimeZoneSet::AddPtr p = availableTimeZones.lookupForAdd(lookup); - - // ICU shouldn't report any duplicate time zone names, but if it does, - // just ignore the duplicate name. 
- if (!p && !availableTimeZones.add(p, timeZone)) { - ReportOutOfMemory(cx); - return false; - } - } - - if (ianaZonesTreatedAsLinksByICU.initialized()) - ianaZonesTreatedAsLinksByICU.finish(); - if (!ianaZonesTreatedAsLinksByICU.init()) { - ReportOutOfMemory(cx); - return false; - } - - for (const char* rawTimeZone : timezone::ianaZonesTreatedAsLinksByICU) { - MOZ_ASSERT(rawTimeZone != nullptr); - timeZone = Atomize(cx, rawTimeZone, strlen(rawTimeZone)); - if (!timeZone) - return false; - - TimeZoneHasher::Lookup lookup(timeZone); - TimeZoneSet::AddPtr p = ianaZonesTreatedAsLinksByICU.lookupForAdd(lookup); - MOZ_ASSERT(!p, "Duplicate entry in timezone::ianaZonesTreatedAsLinksByICU"); - - if (!ianaZonesTreatedAsLinksByICU.add(p, timeZone)) { - ReportOutOfMemory(cx); - return false; - } - } - - if (ianaLinksCanonicalizedDifferentlyByICU.initialized()) - ianaLinksCanonicalizedDifferentlyByICU.finish(); - if (!ianaLinksCanonicalizedDifferentlyByICU.init()) { - ReportOutOfMemory(cx); - return false; - } - - RootedAtom linkName(cx); - RootedAtom& target = timeZone; - for (const auto& linkAndTarget : timezone::ianaLinksCanonicalizedDifferentlyByICU) { - const char* rawLinkName = linkAndTarget.link; - const char* rawTarget = linkAndTarget.target; - - MOZ_ASSERT(rawLinkName != nullptr); - linkName = Atomize(cx, rawLinkName, strlen(rawLinkName)); - if (!linkName) - return false; - - MOZ_ASSERT(rawTarget != nullptr); - target = Atomize(cx, rawTarget, strlen(rawTarget)); - if (!target) - return false; - - TimeZoneHasher::Lookup lookup(linkName); - TimeZoneMap::AddPtr p = ianaLinksCanonicalizedDifferentlyByICU.lookupForAdd(lookup); - MOZ_ASSERT(!p, "Duplicate entry in timezone::ianaLinksCanonicalizedDifferentlyByICU"); - - if (!ianaLinksCanonicalizedDifferentlyByICU.add(p, linkName, target)) { - ReportOutOfMemory(cx); - return false; - } - } - - MOZ_ASSERT(!timeZoneDataInitialized, "ensureTimeZones is neither reentrant nor thread-safe"); - timeZoneDataInitialized = true; - - 
return true; -} - -bool -js::SharedIntlData::validateTimeZoneName(JSContext* cx, HandleString timeZone, - MutableHandleAtom result) -{ - if (!ensureTimeZones(cx)) - return false; - - RootedLinearString timeZoneLinear(cx, timeZone->ensureLinear(cx)); - if (!timeZoneLinear) - return false; - - TimeZoneHasher::Lookup lookup(timeZoneLinear); - if (TimeZoneSet::Ptr p = availableTimeZones.lookup(lookup)) - result.set(*p); - - return true; -} - -bool -js::SharedIntlData::tryCanonicalizeTimeZoneConsistentWithIANA(JSContext* cx, HandleString timeZone, - MutableHandleAtom result) -{ - if (!ensureTimeZones(cx)) - return false; - - RootedLinearString timeZoneLinear(cx, timeZone->ensureLinear(cx)); - if (!timeZoneLinear) - return false; - - TimeZoneHasher::Lookup lookup(timeZoneLinear); - MOZ_ASSERT(availableTimeZones.has(lookup), "Invalid time zone name"); - - if (TimeZoneMap::Ptr p = ianaLinksCanonicalizedDifferentlyByICU.lookup(lookup)) { - // The effectively supported time zones aren't known at compile time, - // when - // 1. SpiderMonkey was compiled with "--with-system-icu". - // 2. ICU's dynamic time zone data loading feature was used. - // (ICU supports loading time zone files at runtime through the - // ICU_TIMEZONE_FILES_DIR environment variable.) - // Ensure ICU supports the new target zone before applying the update. - TimeZoneName targetTimeZone = p->value(); - TimeZoneHasher::Lookup targetLookup(targetTimeZone); - if (availableTimeZones.has(targetLookup)) - result.set(targetTimeZone); - } else if (TimeZoneSet::Ptr p = ianaZonesTreatedAsLinksByICU.lookup(lookup)) { - result.set(*p); - } - - return true; -} - -void -js::SharedIntlData::destroyInstance() -{ - availableTimeZones.finish(); - ianaZonesTreatedAsLinksByICU.finish(); - ianaLinksCanonicalizedDifferentlyByICU.finish(); - upperCaseFirstLocales.finish(); -} - -void -js::SharedIntlData::trace(JSTracer* trc) -{ - // Atoms are always tenured. 
- if (!JS::CurrentThreadIsHeapMinorCollecting()) { - availableTimeZones.trace(trc); - ianaZonesTreatedAsLinksByICU.trace(trc); - ianaLinksCanonicalizedDifferentlyByICU.trace(trc); - upperCaseFirstLocales.trace(trc); - } -} - -size_t -js::SharedIntlData::sizeOfExcludingThis(mozilla::MallocSizeOf mallocSizeOf) const -{ - return availableTimeZones.sizeOfExcludingThis(mallocSizeOf) + - ianaZonesTreatedAsLinksByICU.sizeOfExcludingThis(mallocSizeOf) + - ianaLinksCanonicalizedDifferentlyByICU.sizeOfExcludingThis(mallocSizeOf) + - upperCaseFirstLocales.sizeOfExcludingThis(mallocSizeOf); -} - bool js::intl_IsValidTimeZoneName(JSContext* cx, unsigned argc, Value* vp) { diff --git a/js/src/builtin/Intl.h b/js/src/builtin/Intl.h index ae8555e0fed6..3c5dc40a082e 100644 --- a/js/src/builtin/Intl.h +++ b/js/src/builtin/Intl.h @@ -36,204 +36,6 @@ class FreeOp; extern JSObject* InitIntlClass(JSContext* cx, HandleObject obj); -/** - * Stores Intl data which can be shared across compartments (but not contexts). - * - * Used for data which is expensive when computed repeatedly or is not - * available through ICU. - */ -class SharedIntlData -{ - struct LinearStringLookup - { - union { - const JS::Latin1Char* latin1Chars; - const char16_t* twoByteChars; - }; - bool isLatin1; - size_t length; - JS::AutoCheckCannotGC nogc; - HashNumber hash = 0; - - explicit LinearStringLookup(JSLinearString* string) - : isLatin1(string->hasLatin1Chars()), length(string->length()) - { - if (isLatin1) - latin1Chars = string->latin1Chars(nogc); - else - twoByteChars = string->twoByteChars(nogc); - } - }; - - private: - /** - * Information tracking the set of the supported time zone names, derived - * from the IANA time zone database . - * - * There are two kinds of IANA time zone names: Zone and Link (denoted as - * such in database source files). Zone names are the canonical, preferred - * name for a time zone, e.g. Asia/Kolkata. Link names simply refer to - * target Zone names for their meaning, e.g. 
Asia/Calcutta targets - * Asia/Kolkata. That a name is a Link doesn't *necessarily* reflect a - * sense of deprecation: some Link names also exist partly for convenience, - * e.g. UTC and GMT as Link names targeting the Zone name Etc/UTC. - * - * Two data sources determine the time zone names we support: those ICU - * supports and IANA's zone information. - * - * Unfortunately the names ICU and IANA support, and their Link - * relationships from name to target, aren't identical, so we can't simply - * implicitly trust ICU's name handling. We must perform various - * preprocessing of user-provided zone names and post-processing of - * ICU-provided zone names to implement ECMA-402's IANA-consistent behavior. - * - * Also see and - * . - */ - - using TimeZoneName = JSAtom*; - - struct TimeZoneHasher - { - struct Lookup : LinearStringLookup - { - explicit Lookup(JSLinearString* timeZone); - }; - - static js::HashNumber hash(const Lookup& lookup) { return lookup.hash; } - static bool match(TimeZoneName key, const Lookup& lookup); - }; - - using TimeZoneSet = js::GCHashSet; - - using TimeZoneMap = js::GCHashMap; - - /** - * As a threshold matter, available time zones are those time zones ICU - * supports, via ucal_openTimeZones. But ICU supports additional non-IANA - * time zones described in intl/icu/source/tools/tzcode/icuzones (listed in - * IntlTimeZoneData.cpp's |legacyICUTimeZones|) for its own backwards - * compatibility purposes. This set consists of ICU's supported time zones, - * minus all backwards-compatibility time zones. - */ - TimeZoneSet availableTimeZones; - - /** - * IANA treats some time zone names as Zones, that ICU instead treats as - * Links. For example, IANA considers "America/Indiana/Indianapolis" to be - * a Zone and "America/Fort_Wayne" a Link that targets it, but ICU - * considers the former a Link that targets "America/Indianapolis" (which - * IANA treats as a Link). 
- * - * ECMA-402 requires that we respect IANA data, so if we're asked to - * canonicalize a time zone name in this set, we must *not* return ICU's - * canonicalization. - */ - TimeZoneSet ianaZonesTreatedAsLinksByICU; - - /** - * IANA treats some time zone names as Links to one target, that ICU - * instead treats as either Zones, or Links to different targets. An - * example of the former is "Asia/Calcutta, which IANA assigns the target - * "Asia/Kolkata" but ICU considers its own Zone. An example of the latter - * is "America/Virgin", which IANA assigns the target - * "America/Port_of_Spain" but ICU assigns the target "America/St_Thomas". - * - * ECMA-402 requires that we respect IANA data, so if we're asked to - * canonicalize a time zone name that's a key in this map, we *must* return - * the corresponding value and *must not* return ICU's canonicalization. - */ - TimeZoneMap ianaLinksCanonicalizedDifferentlyByICU; - - bool timeZoneDataInitialized = false; - - /** - * Precomputes the available time zone names, because it's too expensive to - * call ucal_openTimeZones() repeatedly. - */ - bool ensureTimeZones(JSContext* cx); - - public: - /** - * Returns the validated time zone name in |result|. If the input time zone - * isn't a valid IANA time zone name, |result| remains unchanged. - */ - bool validateTimeZoneName(JSContext* cx, JS::HandleString timeZone, - MutableHandleAtom result); - - /** - * Returns the canonical time zone name in |result|. If no canonical name - * was found, |result| remains unchanged. - * - * This method only handles time zones which are canonicalized differently - * by ICU when compared to IANA. - */ - bool tryCanonicalizeTimeZoneConsistentWithIANA(JSContext* cx, JS::HandleString timeZone, - MutableHandleAtom result); - - private: - /** - * The case first parameter (BCP47 key "kf") allows to switch the order of - * upper- and lower-case characters. 
ICU doesn't directly provide an API - * to query the default case first value of a given locale, but instead - * requires to instantiate a collator object and then query the case first - * attribute (UCOL_CASE_FIRST). - * To avoid instantiating an additional collator object whenever we need - * to retrieve the default case first value of a specific locale, we - * compute the default case first value for every supported locale only - * once and then keep a list of all locales which don't use the default - * case first setting. - * There is almost no difference between lower-case first and when case - * first is disabled (UCOL_LOWER_FIRST resp. UCOL_OFF), so we only need to - * track locales which use upper-case first as their default setting. - */ - - using Locale = JSAtom*; - - struct LocaleHasher - { - struct Lookup : LinearStringLookup - { - explicit Lookup(JSLinearString* locale); - }; - - static js::HashNumber hash(const Lookup& lookup) { return lookup.hash; } - static bool match(Locale key, const Lookup& lookup); - }; - - using LocaleSet = js::GCHashSet; - - LocaleSet upperCaseFirstLocales; - - bool upperCaseFirstInitialized = false; - - /** - * Precomputes the available locales which use upper-case first sorting. - */ - bool ensureUpperCaseFirstLocales(JSContext* cx); - - public: - /** - * Sets |isUpperFirst| to true if |locale| sorts upper-case characters - * before lower-case characters. - */ - bool isUpperCaseFirst(JSContext* cx, JS::HandleString locale, bool* isUpperFirst); - - public: - void destroyInstance(); - - void trace(JSTracer* trc); - - size_t sizeOfExcludingThis(mozilla::MallocSizeOf mallocSizeOf) const; -}; - /* * The following functions are for use by self-hosted code. 
*/ diff --git a/js/src/builtin/intl/Collator.cpp b/js/src/builtin/intl/Collator.cpp index 91a3bf56ae02..1397c3edd570 100644 --- a/js/src/builtin/intl/Collator.cpp +++ b/js/src/builtin/intl/Collator.cpp @@ -16,6 +16,7 @@ #include "builtin/intl/CommonFunctions.h" #include "builtin/intl/ICUStubs.h" #include "builtin/intl/ScopedICUObject.h" +#include "builtin/intl/SharedIntlData.h" #include "gc/FreeOp.h" #include "js/TypeDecls.h" #include "vm/GlobalObject.h" @@ -29,6 +30,7 @@ using namespace js; using js::intl::GetAvailableLocales; using js::intl::IcuLocale; using js::intl::ReportInternalError; +using js::intl::SharedIntlData; using js::intl::StringsAreEqual; const ClassOps CollatorObject::classOps_ = { @@ -464,124 +466,6 @@ js::intl_CompareStrings(JSContext* cx, unsigned argc, Value* vp) return intl_CompareStrings(cx, coll, str1, str2, args.rval()); } -js::SharedIntlData::LocaleHasher::Lookup::Lookup(JSLinearString* locale) - : js::SharedIntlData::LinearStringLookup(locale) -{ - if (isLatin1) - hash = mozilla::HashString(latin1Chars, length); - else - hash = mozilla::HashString(twoByteChars, length); -} - -bool -js::SharedIntlData::LocaleHasher::match(Locale key, const Lookup& lookup) -{ - if (key->length() != lookup.length) - return false; - - if (key->hasLatin1Chars()) { - const Latin1Char* keyChars = key->latin1Chars(lookup.nogc); - if (lookup.isLatin1) - return EqualChars(keyChars, lookup.latin1Chars, lookup.length); - return EqualChars(keyChars, lookup.twoByteChars, lookup.length); - } - - const char16_t* keyChars = key->twoByteChars(lookup.nogc); - if (lookup.isLatin1) - return EqualChars(lookup.latin1Chars, keyChars, lookup.length); - return EqualChars(keyChars, lookup.twoByteChars, lookup.length); -} - -bool -js::SharedIntlData::ensureUpperCaseFirstLocales(JSContext* cx) -{ - if (upperCaseFirstInitialized) - return true; - - // If ensureUpperCaseFirstLocales() was called previously, but didn't - // complete due to OOM, clear all data and start from scratch. 
- if (upperCaseFirstLocales.initialized()) - upperCaseFirstLocales.finish(); - if (!upperCaseFirstLocales.init()) { - ReportOutOfMemory(cx); - return false; - } - - UErrorCode status = U_ZERO_ERROR; - UEnumeration* available = ucol_openAvailableLocales(&status); - if (U_FAILURE(status)) { - ReportInternalError(cx); - return false; - } - ScopedICUObject toClose(available); - - RootedAtom locale(cx); - while (true) { - int32_t size; - const char* rawLocale = uenum_next(available, &size, &status); - if (U_FAILURE(status)) { - ReportInternalError(cx); - return false; - } - - if (rawLocale == nullptr) - break; - - UCollator* collator = ucol_open(rawLocale, &status); - if (U_FAILURE(status)) { - ReportInternalError(cx); - return false; - } - ScopedICUObject toCloseCollator(collator); - - UColAttributeValue caseFirst = ucol_getAttribute(collator, UCOL_CASE_FIRST, &status); - if (U_FAILURE(status)) { - ReportInternalError(cx); - return false; - } - - if (caseFirst != UCOL_UPPER_FIRST) - continue; - - MOZ_ASSERT(size >= 0); - locale = Atomize(cx, rawLocale, size_t(size)); - if (!locale) - return false; - - LocaleHasher::Lookup lookup(locale); - LocaleSet::AddPtr p = upperCaseFirstLocales.lookupForAdd(lookup); - - // ICU shouldn't report any duplicate locales, but if it does, just - // ignore the duplicated locale. 
- if (!p && !upperCaseFirstLocales.add(p, locale)) { - ReportOutOfMemory(cx); - return false; - } - } - - MOZ_ASSERT(!upperCaseFirstInitialized, - "ensureUpperCaseFirstLocales is neither reentrant nor thread-safe"); - upperCaseFirstInitialized = true; - - return true; -} - -bool -js::SharedIntlData::isUpperCaseFirst(JSContext* cx, HandleString locale, bool* isUpperFirst) -{ - if (!ensureUpperCaseFirstLocales(cx)) - return false; - - RootedLinearString localeLinear(cx, locale->ensureLinear(cx)); - if (!localeLinear) - return false; - - LocaleHasher::Lookup lookup(localeLinear); - *isUpperFirst = upperCaseFirstLocales.has(lookup); - - return true; -} - bool js::intl_isUpperCaseFirst(JSContext* cx, unsigned argc, Value* vp) { diff --git a/js/src/builtin/intl/SharedIntlData.cpp b/js/src/builtin/intl/SharedIntlData.cpp new file mode 100644 index 000000000000..0f46675fa33c --- /dev/null +++ b/js/src/builtin/intl/SharedIntlData.cpp @@ -0,0 +1,419 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- + * vim: set ts=8 sts=4 et sw=4 tw=99: + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/* Runtime-wide Intl data shared across compartments. */ + +#include "builtin/intl/SharedIntlData.h" + +#include "mozilla/Assertions.h" +#include "mozilla/HashFunctions.h" + +#include + +#include "jsatom.h" +#include "jsstr.h" + +#include "builtin/intl/CommonFunctions.h" +#include "builtin/intl/ICUStubs.h" +#include "builtin/intl/ScopedICUObject.h" +#include "builtin/IntlTimeZoneData.h" +#include "js/Utility.h" + +using js::HashNumber; +using js::intl::StringsAreEqual; + +template +static constexpr Char +ToUpperASCII(Char c) +{ + return ('a' <= c && c <= 'z') + ? 
(c & ~0x20) + : c; +} + +static_assert(ToUpperASCII('a') == 'A', "verifying 'a' uppercases correctly"); +static_assert(ToUpperASCII('m') == 'M', "verifying 'm' uppercases correctly"); +static_assert(ToUpperASCII('z') == 'Z', "verifying 'z' uppercases correctly"); +static_assert(ToUpperASCII(u'a') == u'A', "verifying u'a' uppercases correctly"); +static_assert(ToUpperASCII(u'k') == u'K', "verifying u'k' uppercases correctly"); +static_assert(ToUpperASCII(u'z') == u'Z', "verifying u'z' uppercases correctly"); + +template +static HashNumber +HashStringIgnoreCaseASCII(const Char* s, size_t length) +{ + uint32_t hash = 0; + for (size_t i = 0; i < length; i++) + hash = mozilla::AddToHash(hash, ToUpperASCII(s[i])); + return hash; +} + +js::intl::SharedIntlData::TimeZoneHasher::Lookup::Lookup(JSLinearString* timeZone) + : js::intl::SharedIntlData::LinearStringLookup(timeZone) +{ + if (isLatin1) + hash = HashStringIgnoreCaseASCII(latin1Chars, length); + else + hash = HashStringIgnoreCaseASCII(twoByteChars, length); +} + +template +static bool +EqualCharsIgnoreCaseASCII(const Char1* s1, const Char2* s2, size_t len) +{ + for (const Char1* s1end = s1 + len; s1 < s1end; s1++, s2++) { + if (ToUpperASCII(*s1) != ToUpperASCII(*s2)) + return false; + } + return true; +} + +bool +js::intl::SharedIntlData::TimeZoneHasher::match(TimeZoneName key, const Lookup& lookup) +{ + if (key->length() != lookup.length) + return false; + + // Compare time zone names ignoring ASCII case differences. 
+ if (key->hasLatin1Chars()) { + const Latin1Char* keyChars = key->latin1Chars(lookup.nogc); + if (lookup.isLatin1) + return EqualCharsIgnoreCaseASCII(keyChars, lookup.latin1Chars, lookup.length); + return EqualCharsIgnoreCaseASCII(keyChars, lookup.twoByteChars, lookup.length); + } + + const char16_t* keyChars = key->twoByteChars(lookup.nogc); + if (lookup.isLatin1) + return EqualCharsIgnoreCaseASCII(lookup.latin1Chars, keyChars, lookup.length); + return EqualCharsIgnoreCaseASCII(keyChars, lookup.twoByteChars, lookup.length); +} + +static bool +IsLegacyICUTimeZone(const char* timeZone) +{ + for (const auto& legacyTimeZone : js::timezone::legacyICUTimeZones) { + if (StringsAreEqual(timeZone, legacyTimeZone)) + return true; + } + return false; +} + +bool +js::intl::SharedIntlData::ensureTimeZones(JSContext* cx) +{ + if (timeZoneDataInitialized) + return true; + + // If ensureTimeZones() was called previously, but didn't complete due to + // OOM, clear all sets/maps and start from scratch. + if (availableTimeZones.initialized()) + availableTimeZones.finish(); + if (!availableTimeZones.init()) { + ReportOutOfMemory(cx); + return false; + } + + UErrorCode status = U_ZERO_ERROR; + UEnumeration* values = ucal_openTimeZones(&status); + if (U_FAILURE(status)) { + ReportInternalError(cx); + return false; + } + ScopedICUObject toClose(values); + + RootedAtom timeZone(cx); + while (true) { + int32_t size; + const char* rawTimeZone = uenum_next(values, &size, &status); + if (U_FAILURE(status)) { + ReportInternalError(cx); + return false; + } + + if (rawTimeZone == nullptr) + break; + + // Skip legacy ICU time zone names. 
+ if (IsLegacyICUTimeZone(rawTimeZone)) + continue; + + MOZ_ASSERT(size >= 0); + timeZone = Atomize(cx, rawTimeZone, size_t(size)); + if (!timeZone) + return false; + + TimeZoneHasher::Lookup lookup(timeZone); + TimeZoneSet::AddPtr p = availableTimeZones.lookupForAdd(lookup); + + // ICU shouldn't report any duplicate time zone names, but if it does, + // just ignore the duplicate name. + if (!p && !availableTimeZones.add(p, timeZone)) { + ReportOutOfMemory(cx); + return false; + } + } + + if (ianaZonesTreatedAsLinksByICU.initialized()) + ianaZonesTreatedAsLinksByICU.finish(); + if (!ianaZonesTreatedAsLinksByICU.init()) { + ReportOutOfMemory(cx); + return false; + } + + for (const char* rawTimeZone : timezone::ianaZonesTreatedAsLinksByICU) { + MOZ_ASSERT(rawTimeZone != nullptr); + timeZone = Atomize(cx, rawTimeZone, strlen(rawTimeZone)); + if (!timeZone) + return false; + + TimeZoneHasher::Lookup lookup(timeZone); + TimeZoneSet::AddPtr p = ianaZonesTreatedAsLinksByICU.lookupForAdd(lookup); + MOZ_ASSERT(!p, "Duplicate entry in timezone::ianaZonesTreatedAsLinksByICU"); + + if (!ianaZonesTreatedAsLinksByICU.add(p, timeZone)) { + ReportOutOfMemory(cx); + return false; + } + } + + if (ianaLinksCanonicalizedDifferentlyByICU.initialized()) + ianaLinksCanonicalizedDifferentlyByICU.finish(); + if (!ianaLinksCanonicalizedDifferentlyByICU.init()) { + ReportOutOfMemory(cx); + return false; + } + + RootedAtom linkName(cx); + RootedAtom& target = timeZone; + for (const auto& linkAndTarget : timezone::ianaLinksCanonicalizedDifferentlyByICU) { + const char* rawLinkName = linkAndTarget.link; + const char* rawTarget = linkAndTarget.target; + + MOZ_ASSERT(rawLinkName != nullptr); + linkName = Atomize(cx, rawLinkName, strlen(rawLinkName)); + if (!linkName) + return false; + + MOZ_ASSERT(rawTarget != nullptr); + target = Atomize(cx, rawTarget, strlen(rawTarget)); + if (!target) + return false; + + TimeZoneHasher::Lookup lookup(linkName); + TimeZoneMap::AddPtr p = 
ianaLinksCanonicalizedDifferentlyByICU.lookupForAdd(lookup); + MOZ_ASSERT(!p, "Duplicate entry in timezone::ianaLinksCanonicalizedDifferentlyByICU"); + + if (!ianaLinksCanonicalizedDifferentlyByICU.add(p, linkName, target)) { + ReportOutOfMemory(cx); + return false; + } + } + + MOZ_ASSERT(!timeZoneDataInitialized, "ensureTimeZones is neither reentrant nor thread-safe"); + timeZoneDataInitialized = true; + + return true; +} + +bool +js::intl::SharedIntlData::validateTimeZoneName(JSContext* cx, HandleString timeZone, + MutableHandleAtom result) +{ + if (!ensureTimeZones(cx)) + return false; + + RootedLinearString timeZoneLinear(cx, timeZone->ensureLinear(cx)); + if (!timeZoneLinear) + return false; + + TimeZoneHasher::Lookup lookup(timeZoneLinear); + if (TimeZoneSet::Ptr p = availableTimeZones.lookup(lookup)) + result.set(*p); + + return true; +} + +bool +js::intl::SharedIntlData::tryCanonicalizeTimeZoneConsistentWithIANA(JSContext* cx, + HandleString timeZone, + MutableHandleAtom result) +{ + if (!ensureTimeZones(cx)) + return false; + + RootedLinearString timeZoneLinear(cx, timeZone->ensureLinear(cx)); + if (!timeZoneLinear) + return false; + + TimeZoneHasher::Lookup lookup(timeZoneLinear); + MOZ_ASSERT(availableTimeZones.has(lookup), "Invalid time zone name"); + + if (TimeZoneMap::Ptr p = ianaLinksCanonicalizedDifferentlyByICU.lookup(lookup)) { + // The effectively supported time zones aren't known at compile time, + // when + // 1. SpiderMonkey was compiled with "--with-system-icu". + // 2. ICU's dynamic time zone data loading feature was used. + // (ICU supports loading time zone files at runtime through the + // ICU_TIMEZONE_FILES_DIR environment variable.) + // Ensure ICU supports the new target zone before applying the update. 
+ TimeZoneName targetTimeZone = p->value(); + TimeZoneHasher::Lookup targetLookup(targetTimeZone); + if (availableTimeZones.has(targetLookup)) + result.set(targetTimeZone); + } else if (TimeZoneSet::Ptr p = ianaZonesTreatedAsLinksByICU.lookup(lookup)) { + result.set(*p); + } + + return true; +} + +js::intl::SharedIntlData::LocaleHasher::Lookup::Lookup(JSLinearString* locale) + : js::intl::SharedIntlData::LinearStringLookup(locale) +{ + if (isLatin1) + hash = mozilla::HashString(latin1Chars, length); + else + hash = mozilla::HashString(twoByteChars, length); +} + +bool +js::intl::SharedIntlData::LocaleHasher::match(Locale key, const Lookup& lookup) +{ + if (key->length() != lookup.length) + return false; + + if (key->hasLatin1Chars()) { + const Latin1Char* keyChars = key->latin1Chars(lookup.nogc); + if (lookup.isLatin1) + return EqualChars(keyChars, lookup.latin1Chars, lookup.length); + return EqualChars(keyChars, lookup.twoByteChars, lookup.length); + } + + const char16_t* keyChars = key->twoByteChars(lookup.nogc); + if (lookup.isLatin1) + return EqualChars(lookup.latin1Chars, keyChars, lookup.length); + return EqualChars(keyChars, lookup.twoByteChars, lookup.length); +} + +bool +js::intl::SharedIntlData::ensureUpperCaseFirstLocales(JSContext* cx) +{ + if (upperCaseFirstInitialized) + return true; + + // If ensureUpperCaseFirstLocales() was called previously, but didn't + // complete due to OOM, clear all data and start from scratch. 
+ if (upperCaseFirstLocales.initialized()) + upperCaseFirstLocales.finish(); + if (!upperCaseFirstLocales.init()) { + ReportOutOfMemory(cx); + return false; + } + + UErrorCode status = U_ZERO_ERROR; + UEnumeration* available = ucol_openAvailableLocales(&status); + if (U_FAILURE(status)) { + ReportInternalError(cx); + return false; + } + ScopedICUObject toClose(available); + + RootedAtom locale(cx); + while (true) { + int32_t size; + const char* rawLocale = uenum_next(available, &size, &status); + if (U_FAILURE(status)) { + ReportInternalError(cx); + return false; + } + + if (rawLocale == nullptr) + break; + + UCollator* collator = ucol_open(rawLocale, &status); + if (U_FAILURE(status)) { + ReportInternalError(cx); + return false; + } + ScopedICUObject toCloseCollator(collator); + + UColAttributeValue caseFirst = ucol_getAttribute(collator, UCOL_CASE_FIRST, &status); + if (U_FAILURE(status)) { + ReportInternalError(cx); + return false; + } + + if (caseFirst != UCOL_UPPER_FIRST) + continue; + + MOZ_ASSERT(size >= 0); + locale = Atomize(cx, rawLocale, size_t(size)); + if (!locale) + return false; + + LocaleHasher::Lookup lookup(locale); + LocaleSet::AddPtr p = upperCaseFirstLocales.lookupForAdd(lookup); + + // ICU shouldn't report any duplicate locales, but if it does, just + // ignore the duplicated locale. 
+ if (!p && !upperCaseFirstLocales.add(p, locale)) { + ReportOutOfMemory(cx); + return false; + } + } + + MOZ_ASSERT(!upperCaseFirstInitialized, + "ensureUpperCaseFirstLocales is neither reentrant nor thread-safe"); + upperCaseFirstInitialized = true; + + return true; +} + +bool +js::intl::SharedIntlData::isUpperCaseFirst(JSContext* cx, HandleString locale, bool* isUpperFirst) +{ + if (!ensureUpperCaseFirstLocales(cx)) + return false; + + RootedLinearString localeLinear(cx, locale->ensureLinear(cx)); + if (!localeLinear) + return false; + + LocaleHasher::Lookup lookup(localeLinear); + *isUpperFirst = upperCaseFirstLocales.has(lookup); + + return true; +} + +void +js::intl::SharedIntlData::destroyInstance() +{ + availableTimeZones.finish(); + ianaZonesTreatedAsLinksByICU.finish(); + ianaLinksCanonicalizedDifferentlyByICU.finish(); + upperCaseFirstLocales.finish(); +} + +void +js::intl::SharedIntlData::trace(JSTracer* trc) +{ + // Atoms are always tenured. + if (!JS::CurrentThreadIsHeapMinorCollecting()) { + availableTimeZones.trace(trc); + ianaZonesTreatedAsLinksByICU.trace(trc); + ianaLinksCanonicalizedDifferentlyByICU.trace(trc); + upperCaseFirstLocales.trace(trc); + } +} + +size_t +js::intl::SharedIntlData::sizeOfExcludingThis(mozilla::MallocSizeOf mallocSizeOf) const +{ + return availableTimeZones.sizeOfExcludingThis(mallocSizeOf) + + ianaZonesTreatedAsLinksByICU.sizeOfExcludingThis(mallocSizeOf) + + ianaLinksCanonicalizedDifferentlyByICU.sizeOfExcludingThis(mallocSizeOf) + + upperCaseFirstLocales.sizeOfExcludingThis(mallocSizeOf); +} diff --git a/js/src/builtin/intl/SharedIntlData.h b/js/src/builtin/intl/SharedIntlData.h new file mode 100644 index 000000000000..fe9e9e99d5a8 --- /dev/null +++ b/js/src/builtin/intl/SharedIntlData.h @@ -0,0 +1,221 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- + * vim: set ts=8 sts=4 et sw=4 tw=99: + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef builtin_intl_SharedIntlData_h
+#define builtin_intl_SharedIntlData_h
+
+#include "mozilla/MemoryReporting.h"
+
+#include <stddef.h>
+
+#include "jsalloc.h"
+
+#include "js/CharacterEncoding.h"
+#include "js/GCAPI.h"
+#include "js/GCHashTable.h"
+#include "js/RootingAPI.h"
+#include "js/Utility.h"
+#include "vm/String.h"
+
+namespace js {
+
+namespace intl {
+
+/**
+ * Stores Intl data which can be shared across compartments (but not contexts).
+ *
+ * Used for data which is expensive when computed repeatedly or is not
+ * available through ICU.
+ */
+class SharedIntlData
+{
+    /**
+     * Common base of the hash-table lookup types below. Records the encoding,
+     * length and character pointer of a linear string; |nogc| is the token
+     * handed to the string's character accessors. |hash| starts at zero and
+     * is expected to be filled in by the derived Lookup constructors.
+     */
+    struct LinearStringLookup
+    {
+        union {
+            const JS::Latin1Char* latin1Chars;
+            const char16_t* twoByteChars;
+        };
+        bool isLatin1;
+        size_t length;
+        JS::AutoCheckCannotGC nogc;
+        HashNumber hash = 0;
+
+        explicit LinearStringLookup(JSLinearString* string)
+          : isLatin1(string->hasLatin1Chars()), length(string->length())
+        {
+            // Only the union member matching the string's encoding is set.
+            if (isLatin1)
+                latin1Chars = string->latin1Chars(nogc);
+            else
+                twoByteChars = string->twoByteChars(nogc);
+        }
+    };
+
+  private:
+    /**
+     * Information tracking the set of the supported time zone names, derived
+     * from the IANA time zone database <https://www.iana.org/time-zones>.
+     *
+     * There are two kinds of IANA time zone names: Zone and Link (denoted as
+     * such in database source files). Zone names are the canonical, preferred
+     * name for a time zone, e.g. Asia/Kolkata. Link names simply refer to
+     * target Zone names for their meaning, e.g. Asia/Calcutta targets
+     * Asia/Kolkata. That a name is a Link doesn't *necessarily* reflect a
+     * sense of deprecation: some Link names also exist partly for convenience,
+     * e.g. UTC and GMT as Link names targeting the Zone name Etc/UTC.
+     *
+     * Two data sources determine the time zone names we support: those ICU
+     * supports and IANA's zone information.
+     *
+     * Unfortunately the names ICU and IANA support, and their Link
+     * relationships from name to target, aren't identical, so we can't simply
+     * implicitly trust ICU's name handling. We must perform various
+     * preprocessing of user-provided zone names and post-processing of
+     * ICU-provided zone names to implement ECMA-402's IANA-consistent behavior.
+     *
+     * Also see the two references this comment originally cited.
+     * NOTE(review): both link targets are missing from this copy of the
+     * comment; restore them from the upstream file.
+     */
+
+    using TimeZoneName = JSAtom*;
+
+    struct TimeZoneHasher
+    {
+        struct Lookup : LinearStringLookup
+        {
+            explicit Lookup(JSLinearString* timeZone);
+        };
+
+        static js::HashNumber hash(const Lookup& lookup) { return lookup.hash; }
+        static bool match(TimeZoneName key, const Lookup& lookup);
+    };
+
+    using TimeZoneSet = GCHashSet<TimeZoneName, TimeZoneHasher, SystemAllocPolicy>;
+    using TimeZoneMap = GCHashMap<TimeZoneName, TimeZoneName, TimeZoneHasher, SystemAllocPolicy>;
+
+    /**
+     * As a threshold matter, available time zones are those time zones ICU
+     * supports, via ucal_openTimeZones. But ICU supports additional non-IANA
+     * time zones described in intl/icu/source/tools/tzcode/icuzones (listed in
+     * IntlTimeZoneData.cpp's |legacyICUTimeZones|) for its own backwards
+     * compatibility purposes. This set consists of ICU's supported time zones,
+     * minus all backwards-compatibility time zones.
+     */
+    TimeZoneSet availableTimeZones;
+
+    /**
+     * IANA treats some time zone names as Zones, that ICU instead treats as
+     * Links. For example, IANA considers "America/Indiana/Indianapolis" to be
+     * a Zone and "America/Fort_Wayne" a Link that targets it, but ICU
+     * considers the former a Link that targets "America/Indianapolis" (which
+     * IANA treats as a Link).
+     *
+     * ECMA-402 requires that we respect IANA data, so if we're asked to
+     * canonicalize a time zone name in this set, we must *not* return ICU's
+     * canonicalization.
+     */
+    TimeZoneSet ianaZonesTreatedAsLinksByICU;
+
+    /**
+     * IANA treats some time zone names as Links to one target, that ICU
+     * instead treats as either Zones, or Links to different targets. An
+     * example of the former is "Asia/Calcutta", which IANA assigns the target
+     * "Asia/Kolkata" but ICU considers its own Zone. An example of the latter
+     * is "America/Virgin", which IANA assigns the target
+     * "America/Port_of_Spain" but ICU assigns the target "America/St_Thomas".
+     *
+     * ECMA-402 requires that we respect IANA data, so if we're asked to
+     * canonicalize a time zone name that's a key in this map, we *must* return
+     * the corresponding value and *must not* return ICU's canonicalization.
+     */
+    TimeZoneMap ianaLinksCanonicalizedDifferentlyByICU;
+
+    /** Whether the time zone tables above have been populated. */
+    bool timeZoneDataInitialized = false;
+
+    /**
+     * Precomputes the available time zone names, because it's too expensive to
+     * call ucal_openTimeZones() repeatedly.
+     */
+    bool ensureTimeZones(JSContext* cx);
+
+  public:
+    /**
+     * Returns the validated time zone name in |result|. If the input time zone
+     * isn't a valid IANA time zone name, |result| remains unchanged.
+     */
+    bool validateTimeZoneName(JSContext* cx, JS::Handle<JSString*> timeZone,
+                              JS::MutableHandle<JSAtom*> result);
+
+    /**
+     * Returns the canonical time zone name in |result|. If no canonical name
+     * was found, |result| remains unchanged.
+     *
+     * This method only handles time zones which are canonicalized differently
+     * by ICU when compared to IANA.
+     */
+    bool tryCanonicalizeTimeZoneConsistentWithIANA(JSContext* cx, JS::Handle<JSString*> timeZone,
+                                                   JS::MutableHandle<JSAtom*> result);
+
+  private:
+    /**
+     * The case first parameter (BCP47 key "kf") allows switching the order of
+     * upper- and lower-case characters. ICU doesn't directly provide an API
+     * to query the default case first value of a given locale, but instead
+     * requires instantiating a collator object and then querying the case
+     * first attribute (UCOL_CASE_FIRST).
+     * To avoid instantiating an additional collator object whenever we need
+     * to retrieve the default case first value of a specific locale, we
+     * compute the default case first value for every supported locale only
+     * once and then keep a list of all locales which don't use the default
+     * case first setting.
+     * There is almost no difference between lower-case first and when case
+     * first is disabled (UCOL_LOWER_FIRST resp. UCOL_OFF), so we only need to
+     * track locales which use upper-case first as their default setting.
+     */
+
+    using Locale = JSAtom*;
+
+    struct LocaleHasher
+    {
+        struct Lookup : LinearStringLookup
+        {
+            explicit Lookup(JSLinearString* locale);
+        };
+
+        static js::HashNumber hash(const Lookup& lookup) { return lookup.hash; }
+        static bool match(Locale key, const Lookup& lookup);
+    };
+
+    using LocaleSet = GCHashSet<Locale, LocaleHasher, SystemAllocPolicy>;
+
+    /** Locales whose collator reports UCOL_UPPER_FIRST for UCOL_CASE_FIRST. */
+    LocaleSet upperCaseFirstLocales;
+
+    /** Set to true once ensureUpperCaseFirstLocales() has run to completion. */
+    bool upperCaseFirstInitialized = false;
+
+    /**
+     * Precomputes the available locales which use upper-case first sorting.
+     */
+    bool ensureUpperCaseFirstLocales(JSContext* cx);
+
+  public:
+    /**
+     * Sets |isUpperFirst| to true if |locale| sorts upper-case characters
+     * before lower-case characters.
+     */
+    bool isUpperCaseFirst(JSContext* cx, JS::Handle<JSString*> locale, bool* isUpperFirst);
+
+  public:
+    /** Calls finish() on every table owned by this object. */
+    void destroyInstance();
+
+    /** Traces every table, except during a minor collection. */
+    void trace(JSTracer* trc);
+
+    /** Returns the summed sizeOfExcludingThis() of every table. */
+    size_t sizeOfExcludingThis(mozilla::MallocSizeOf mallocSizeOf) const;
+};
+
+} // namespace intl
+
+} // namespace js
+
+#endif /* builtin_intl_SharedIntlData_h */
diff --git a/js/src/moz.build b/js/src/moz.build
index 3c24c8d3ea78..b6a3a52de1f4 100755
--- a/js/src/moz.build
+++ b/js/src/moz.build
@@ -163,6 +163,7 @@ UNIFIED_SOURCES += [
     'builtin/intl/Collator.cpp',
     'builtin/intl/CommonFunctions.cpp',
     'builtin/intl/NumberFormat.cpp',
+    'builtin/intl/SharedIntlData.cpp',
     'builtin/MapObject.cpp',
     'builtin/ModuleObject.cpp',
     'builtin/Object.cpp',
diff --git a/js/src/vm/Runtime.h b/js/src/vm/Runtime.h
index ab9bda45c674..cc86b3e1aa99 100644
--- a/js/src/vm/Runtime.h
+++ b/js/src/vm/Runtime.h
@@ -24,7 +24,7 @@
 #include "jsscript.h"
 
 #include "builtin/AtomicsObject.h"
-#include "builtin/Intl.h"
+#include "builtin/intl/SharedIntlData.h"
 #include "builtin/Promise.h"
 #include "frontend/NameCollections.h"
 #include "gc/GCRuntime.h"
@@ -860,7 +860,7 @@ struct JSRuntime : public js::MallocProvider
     js::WriteOnceData wellKnownSymbols;
 
     /* Shared Intl data for this runtime. */
-    js::ActiveThreadData sharedIntlData;
+    js::ActiveThreadData sharedIntlData;
 
     void traceSharedIntlData(JSTracer* trc);