Bug 1494942 - Improve AtomizeUTF8Chars performance. (r=Waldo)

--HG-- extra : rebase_source : 520a97331ee2358617acbf9db3bfc9eb425248e0
2018-11-05 16:03:16 -08:00 · 2018-11-05 16:03:16 -08:00 · ea3bcd2b8b
--- a/js/src/vm/CharacterEncoding.cpp
+++ b/js/src/vm/CharacterEncoding.cpp
@ -377,6 +377,28 @@ InflateUTF8ToUTF16(JSContext* cx, const UTF8Chars src, OutputFn dst)
    return true;
 }

+// Inflate the UTF-8 in |src| into |dst| and NUL-terminate the result.
+//
+// |dst| must have room for |outlen| + 1 elements: |outlen| inflated code
+// units followed by the terminator that is written at dst[outlen]. |outlen|
+// has to equal the number of code units |src| inflates to; this is verified
+// by assertions only.
+//
+// When |allASCII| is true each source byte is copied across as one output
+// element. Otherwise |src| is decoded through InflateUTF8ToUTF16 using
+// |ErrorAction|, and every resulting char16_t is converted with CharT(c).
+template <OnUTF8Error ErrorAction, typename CharT>
+static void
+CopyAndInflateUTF8IntoBuffer(JSContext* cx, const UTF8Chars src, CharT *dst, size_t outlen, bool allASCII)
+{
+    if (allASCII) {
+        size_t srclen = src.length();
+        MOZ_ASSERT(outlen == srclen);
+        // The counter is a size_t so it has the same width as |srclen|; a
+        // narrower counter would wrap before reaching a large bound.
+        for (size_t i = 0; i < srclen; i++) {
+            dst[i] = CharT(src[i]);
+        }
+    } else {
+        size_t j = 0;
+        auto push = [dst, &j](char16_t c) -> LoopDisposition {
+            dst[j++] = CharT(c);
+            return LoopDisposition::Continue;
+        };
+        // Decoding is asserted to succeed for the error modes used here.
+        MOZ_ALWAYS_TRUE((InflateUTF8ToUTF16<ErrorAction>(cx, src, push)));
+        MOZ_ASSERT(j == outlen);
+    }
+    dst[outlen] = CharT('\0');    // NUL char
+}
+
 template <OnUTF8Error ErrorAction, typename CharsT>
 static CharsT
 InflateUTF8StringHelper(JSContext* cx, const UTF8Chars src, size_t* outlen)
@ -406,25 +428,10 @@ InflateUTF8StringHelper(JSContext* cx, const UTF8Chars src, size_t* outlen)
        return CharsT();
    }

-    if (allASCII) {
-        size_t srclen = src.length();
-        MOZ_ASSERT(*outlen == srclen);
-        for (uint32_t i = 0; i < srclen; i++) {
-            dst[i] = CharT(src[i]);
-        }
-    } else {
-        constexpr OnUTF8Error errorMode = std::is_same<CharT, Latin1Char>::value
-            ? OnUTF8Error::InsertQuestionMark
-            : OnUTF8Error::InsertReplacementCharacter;
-        size_t j = 0;
-        auto push = [dst, &j](char16_t c) -> LoopDisposition {
-            dst[j++] = CharT(c);
-            return LoopDisposition::Continue;
-        };
-        MOZ_ALWAYS_TRUE((InflateUTF8ToUTF16<errorMode>(cx, src, push)));
-        MOZ_ASSERT(j == len);
-    }
-    dst[*outlen] = 0;    // NUL char
+    constexpr OnUTF8Error errorMode = std::is_same<CharT, Latin1Char>::value
+        ? OnUTF8Error::InsertQuestionMark
+        : OnUTF8Error::InsertReplacementCharacter;
+    CopyAndInflateUTF8IntoBuffer<errorMode>(cx, src, dst, *outlen, allASCII);

    return CharsT(dst, *outlen);
 }
@ -455,20 +462,31 @@ JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const JS::ConstUTF8CharsZ& u
    return InflateUTF8StringHelper<OnUTF8Error::InsertReplacementCharacter, TwoByteCharsZ>(cx, chars, outlen);
 }

+// Widen |*encoding| if |c| does not fit in it.
+//
+// |c| is classified on its own (below 0x80 is ASCII, below 0x100 is Latin1,
+// anything else is UTF16) and |*encoding| is raised to that classification
+// when it currently holds a smaller one. |*encoding| is never narrowed.
+static void
+UpdateSmallestEncodingForChar(char16_t c, JS::SmallestEncoding* encoding)
+{
+    JS::SmallestEncoding needed;
+    if (c < 0x80) {
+        needed = JS::SmallestEncoding::ASCII;
+    } else if (c < 0x100) {
+        needed = JS::SmallestEncoding::Latin1;
+    } else {
+        needed = JS::SmallestEncoding::UTF16;
+    }
+
+    if (*encoding < needed) {
+        *encoding = needed;
+    }
+}
+
 JS::SmallestEncoding
 JS::FindSmallestEncoding(UTF8Chars utf8)
 {
    JS::SmallestEncoding encoding = JS::SmallestEncoding::ASCII;
    auto onChar = [&encoding](char16_t c) -> LoopDisposition {
-        if (c >= 0x80) {
-            if (c < 0x100) {
-                encoding = JS::SmallestEncoding::Latin1;
-            } else {
-                encoding = JS::SmallestEncoding::UTF16;
-                return LoopDisposition::Break;
-            }
-        }
-        return LoopDisposition::Continue;
+        UpdateSmallestEncodingForChar(c, &encoding);
+        return encoding == JS::SmallestEncoding::UTF16
+               ? LoopDisposition::Break
+               : LoopDisposition::Continue;
    };
    MOZ_ALWAYS_TRUE((InflateUTF8ToUTF16<OnUTF8Error::InsertReplacementCharacter>(
                         /* cx = */ nullptr, utf8, onChar)));
@ -487,6 +505,86 @@ JS::LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t*
    return InflateUTF8StringHelper<OnUTF8Error::InsertQuestionMark, Latin1CharsZ>(cx, utf8, outlen);
 }

+/**
+ * Atomization Helpers.
+ *
+ * These functions are extremely single-use, and are not intended for general
+ * consumption.
+ */
+
+// Decode |utf8| once and collect the data atomization needs:
+//   |*outlen|   - number of char16_t units |utf8| inflates to,
+//   |*encoding| - narrowest encoding that can hold every unit,
+//   |*hashNum|  - hash accumulated over the units with mozilla::AddToHash.
+//
+// Returns false when InflateUTF8ToUTF16<OnUTF8Error::Throw> fails. The
+// out-params are not reset in that case.
+//
+// NOTE(review): presumably |*hashNum| must agree with mozilla::HashString
+// over the inflated characters so UTF-8 lookups find existing atoms - confirm.
+bool
+GetUTF8AtomizationData(JSContext* cx, const JS::UTF8Chars utf8, size_t* outlen,
+                       JS::SmallestEncoding* encoding, HashNumber* hashNum)
+{
+    *hashNum = 0;
+    *encoding = JS::SmallestEncoding::ASCII;
+    *outlen = 0;
+
+    auto accumulate = [outlen, encoding, hashNum](char16_t unit) -> LoopDisposition {
+        *hashNum = mozilla::AddToHash(*hashNum, unit);
+        UpdateSmallestEncodingForChar(unit, encoding);
+        ++*outlen;
+        return LoopDisposition::Continue;
+    };
+
+    return InflateUTF8ToUTF16<OnUTF8Error::Throw>(cx, utf8, accumulate);
+}
+
+// Compare the characters |utfChars| decodes to against |chars|, unit by unit.
+//
+// |chars| carries no length: chars[i] is read for every decoded unit up to
+// the first mismatch, so the caller must guarantee |chars| holds at least as
+// many elements as |utfChars| inflates to. Elements of |chars| beyond the
+// decoded length are never examined.
+//
+// Returns true when every decoded unit, converted with CharT(unit), equals
+// the element of |chars| at the same position.
+template <typename CharT>
+bool
+UTF8EqualsChars(const JS::UTF8Chars utfChars, const CharT* chars)
+{
+    bool matches = true;
+    size_t pos = 0;
+
+    auto compareUnit = [&matches, &pos, chars](char16_t unit) -> LoopDisposition {
+    #ifdef DEBUG
+        // Debug-only: assert that a Latin1 comparison never sees a unit
+        // outside the Latin1 range, and crash on an unsupported CharT.
+        JS::SmallestEncoding unitEncoding = JS::SmallestEncoding::ASCII;
+        UpdateSmallestEncodingForChar(unit, &unitEncoding);
+        if (std::is_same<CharT, JS::Latin1Char>::value) {
+            MOZ_ASSERT(unitEncoding <= JS::SmallestEncoding::Latin1);
+        } else if (!std::is_same<CharT, char16_t>::value) {
+            MOZ_CRASH("Invalid character type in UTF8EqualsChars");
+        }
+    #endif
+
+        if (CharT(unit) == chars[pos]) {
+            pos++;
+            return LoopDisposition::Continue;
+        }
+
+        // First difference found; stop decoding.
+        matches = false;
+        return LoopDisposition::Break;
+    };
+
+    // Decoding uses OnUTF8Error::Crash, so the input must already be known
+    // to be valid UTF-8 by the time it gets here.
+    InflateUTF8ToUTF16<OnUTF8Error::Crash>(/* cx = */ nullptr, utfChars, compareUnit);
+
+    return matches;
+}
+
+template bool UTF8EqualsChars<char16_t>(const JS::UTF8Chars, const char16_t*);
+template bool UTF8EqualsChars<JS::Latin1Char>(const JS::UTF8Chars, const JS::Latin1Char*);
+
+// Inflate |src| into |dst| and append a NUL terminator.
+//
+// |dstLen| is the number of code units to produce, not counting the
+// terminator that is stored at dst[dstLen]; |dst| therefore needs room for
+// |dstLen| + 1 elements. An |encoding| of ASCII selects the byte-for-byte
+// copy path of CopyAndInflateUTF8IntoBuffer. Decoding errors use
+// OnUTF8Error::Crash and no JSContext is supplied.
+template <typename CharT>
+void
+InflateUTF8CharsToBufferAndTerminate(const UTF8Chars src, CharT* dst, size_t dstLen,
+                                     JS::SmallestEncoding encoding)
+{
+    const bool allASCII = encoding == JS::SmallestEncoding::ASCII;
+    CopyAndInflateUTF8IntoBuffer<OnUTF8Error::Crash>(/* cx = */ nullptr, src, dst, dstLen, allASCII);
+}
+
+template void
+InflateUTF8CharsToBufferAndTerminate<char16_t>(const UTF8Chars src, char16_t* dst, size_t dstLen,
+                                               JS::SmallestEncoding encoding);
+template void
+InflateUTF8CharsToBufferAndTerminate<JS::Latin1Char>(const UTF8Chars src, JS::Latin1Char* dst,
+                                                     size_t dstLen, JS::SmallestEncoding encoding);
+
 #ifdef DEBUG
 void
 JS::ConstUTF8CharsZ::validate(size_t aLength)
--- a/js/src/vm/JSAtom.cpp
+++ b/js/src/vm/JSAtom.cpp
@ -41,37 +41,59 @@ using mozilla::Maybe;
 using mozilla::Nothing;
 using mozilla::RangedPtr;

+template <typename CharT>
+extern void InflateUTF8CharsToBufferAndTerminate(const UTF8Chars src, CharT* dst, size_t dstLen,
+                                                 JS::SmallestEncoding encoding);
+
+template <typename CharT>
+extern bool UTF8EqualsChars(const JS::UTF8Chars utf8, const CharT* chars);
+
+extern bool
+GetUTF8AtomizationData(JSContext* cx, const JS::UTF8Chars utf8, size_t* outlen, JS::SmallestEncoding* encoding,
+                       HashNumber* hashNum);
+
 struct js::AtomHasher::Lookup
 {
    union {
        const JS::Latin1Char* latin1Chars;
        const char16_t* twoByteChars;
+        const char* utf8Bytes;
    };
-    bool isLatin1;
+    enum {
+        TwoByteChar,
+        Latin1,
+        UTF8
+    } type;
    size_t length;
+    size_t byteLength;
    const JSAtom* atom; /* Optional. */
    JS::AutoCheckCannotGC nogc;

    HashNumber hash;

+    MOZ_ALWAYS_INLINE Lookup(const char* utf8Bytes, size_t byteLen, size_t length, HashNumber hash)
+      : utf8Bytes(utf8Bytes), type(UTF8), length(length), byteLength(byteLen), atom(nullptr), hash(hash)
+    {}
+
    MOZ_ALWAYS_INLINE Lookup(const char16_t* chars, size_t length)
-      : twoByteChars(chars), isLatin1(false), length(length), atom(nullptr),
+      : twoByteChars(chars), type(TwoByteChar), length(length), atom(nullptr),
        hash(mozilla::HashString(chars, length))
    {}

    MOZ_ALWAYS_INLINE Lookup(const JS::Latin1Char* chars, size_t length)
-      : latin1Chars(chars), isLatin1(true), length(length), atom(nullptr),
+      : latin1Chars(chars), type(Latin1), length(length), atom(nullptr),
        hash(mozilla::HashString(chars, length))
    {}

    inline explicit Lookup(const JSAtom* atom)
-      : isLatin1(atom->hasLatin1Chars()), length(atom->length()), atom(atom),
+      : type(atom->hasLatin1Chars() ? Latin1 : TwoByteChar), length(atom->length()), atom(atom),
        hash(atom->hash())
    {
-        if (isLatin1) {
+        if (type == Latin1) {
            latin1Chars = atom->latin1Chars(nogc);
            MOZ_ASSERT(mozilla::HashString(latin1Chars, length) == hash);
        } else {
+            MOZ_ASSERT(type == TwoByteChar);
            twoByteChars = atom->twoByteChars(nogc);
            MOZ_ASSERT(mozilla::HashString(twoByteChars, length) == hash);
        }
@ -97,17 +119,29 @@ js::AtomHasher::match(const AtomStateEntry& entry, const Lookup& lookup)

    if (key->hasLatin1Chars()) {
        const Latin1Char* keyChars = key->latin1Chars(lookup.nogc);
-        if (lookup.isLatin1) {
+        switch (lookup.type) {
+          case Lookup::Latin1:
            return mozilla::ArrayEqual(keyChars, lookup.latin1Chars, lookup.length);
+          case Lookup::TwoByteChar:
+            return EqualChars(keyChars, lookup.twoByteChars, lookup.length);
+          case Lookup::UTF8: {
+            JS::UTF8Chars utf8(lookup.utf8Bytes, lookup.byteLength);
+            return UTF8EqualsChars(utf8, keyChars);
+          }
        }
-        return EqualChars(keyChars, lookup.twoByteChars, lookup.length);
    }

    const char16_t* keyChars = key->twoByteChars(lookup.nogc);
-    if (lookup.isLatin1) {
+    switch (lookup.type) {
+      case Lookup::Latin1:
        return EqualChars(lookup.latin1Chars, keyChars, lookup.length);
+      case Lookup::TwoByteChar:
+        return mozilla::ArrayEqual(keyChars, lookup.twoByteChars, lookup.length);
+      case Lookup::UTF8: {
+        JS::UTF8Chars utf8(lookup.utf8Bytes, lookup.byteLength);
+        return UTF8EqualsChars(utf8, keyChars);
+      }
    }
-    return mozilla::ArrayEqual(keyChars, lookup.twoByteChars, lookup.length);
 }

 inline JSAtom*
@ -620,6 +654,12 @@ MOZ_ALWAYS_INLINE static JSAtom*
 AllocateNewAtom(JSContext* cx, const CharT* tbchars, size_t length, PinningBehavior pin,
                const Maybe<uint32_t>& indexValue, const AtomHasher::Lookup& lookup);

+template <typename CharT>
+MOZ_ALWAYS_INLINE
+static JSAtom*
+AtomizeAndCopyCharsFromLookup(JSContext* cx, const CharT* tbchars, size_t length, const AtomHasher::Lookup& lookup,
+                              PinningBehavior pin, const Maybe<uint32_t>& indexValue);
+
 /* |tbchars| must not point into an inline or short string. */
 template <typename CharT>
 MOZ_ALWAYS_INLINE
@ -632,7 +672,16 @@ AtomizeAndCopyChars(JSContext* cx, const CharT* tbchars, size_t length, PinningB
    }

    AtomHasher::Lookup lookup(tbchars, length);
+    return AtomizeAndCopyCharsFromLookup(cx, tbchars, length, lookup, pin, indexValue);
+}

+
+template <typename CharT>
+MOZ_ALWAYS_INLINE
+static JSAtom*
+AtomizeAndCopyCharsFromLookup(JSContext* cx, const CharT* tbchars, size_t length, const AtomHasher::Lookup& lookup,
+                              PinningBehavior pin, const Maybe<uint32_t>& indexValue)
+{
    // Try the per-Zone cache first. If we find the atom there we can avoid the
    // atoms lock, the markAtom call, and the multiple HashSet lookups below.
    // We don't use the per-Zone cache if we want a pinned atom: handling that
@ -810,6 +859,75 @@ PermanentlyAtomizeAndCopyChars(JSContext* cx,
    return atom;
 }

+// Pairs a UTF-8 character range with the encoding chosen for it, so both can
+// travel through the CharT-templated atomization helpers as one pointer
+// argument.
+struct AtomizeUTF8CharsWrapper
+{
+    // The UTF-8 source characters.
+    JS::UTF8Chars utf8;
+    // Encoding used to decide how |utf8| is stored when a string is built.
+    JS::SmallestEncoding encoding;
+
+    AtomizeUTF8CharsWrapper(const JS::UTF8Chars& chars, JS::SmallestEncoding minEncode)
+      : utf8(chars),
+        encoding(minEncode)
+    {}
+};
+
+// Build the flat string that backs a new atom by copying |length| characters
+// from |tbchars| with NewStringCopyN<NoGC>. The result is passed through
+// unchanged, including a null result.
+template <typename CharT>
+MOZ_ALWAYS_INLINE
+static JSFlatString*
+MakeFlatStringForAtomization(JSContext* cx, const CharT* tbchars, size_t length)
+{
+    JSFlatString* flat = NewStringCopyN<NoGC>(cx, tbchars, length);
+    return flat;
+}
+
+// Create a flat string of |length| CharT units by inflating the UTF-8 held in
+// |chars| straight into the string's character storage.
+//
+// Lengths that fit an inline string are inflated into the inline storage.
+// Longer strings are inflated into a malloc'd buffer of |length| + 1 units
+// that is handed to JSFlatString::new_. Returns nullptr if either the string
+// or the buffer cannot be allocated.
+template<typename CharT>
+MOZ_ALWAYS_INLINE
+static JSFlatString*
+MakeUTF8AtomHelper(JSContext* cx, const AtomizeUTF8CharsWrapper* chars, size_t length)
+{
+    if (JSInlineString::lengthFits<CharT>(length)) {
+        CharT* inlineChars;
+        JSInlineString* inlineStr = AllocateInlineString<NoGC>(cx, length, &inlineChars);
+        if (!inlineStr) {
+            return nullptr;
+        }
+
+        InflateUTF8CharsToBufferAndTerminate(chars->utf8, inlineChars, length, chars->encoding);
+        return inlineStr;
+    }
+
+    // MakeUTF8AtomHelper is called from deep in the atomization path, which
+    // expects functions to fail gracefully with nullptr on OOM, without
+    // throwing.
+    //
+    // Flat strings are null-terminated, so allocate length + 1 units.
+    UniquePtr<CharT[], JS::FreePolicy> heapChars(js_pod_malloc<CharT>(length + 1));
+    if (!heapChars) {
+        return nullptr;
+    }
+
+    InflateUTF8CharsToBufferAndTerminate(chars->utf8, heapChars.get(), length, chars->encoding);
+
+    JSFlatString* flat = JSFlatString::new_<NoGC>(cx, heapChars.get(), length);
+    if (flat) {
+        // The string was created over the buffer, so stop the UniquePtr from
+        // freeing it. On failure the UniquePtr still owns the buffer.
+        mozilla::Unused << heapChars.release();
+    }
+    return flat;
+}
+
+// Specialization used when the atom's characters come from UTF-8: |chars| is
+// an AtomizeUTF8CharsWrapper rather than a character buffer, and |length| is
+// the number of code units the new string will hold.
+//
+// A zero length yields the context's empty string. Otherwise the string is
+// built with char16_t storage when the wrapper's encoding is UTF16 and with
+// Latin1 storage in every other case.
+template<>
+MOZ_ALWAYS_INLINE
+/* static */ JSFlatString*
+MakeFlatStringForAtomization(JSContext* cx, const AtomizeUTF8CharsWrapper* chars, size_t length)
+{
+    if (length == 0) {
+        return cx->emptyString();
+    }
+
+    const bool needsTwoByte = chars->encoding == JS::SmallestEncoding::UTF16;
+    return needsTwoByte
+           ? MakeUTF8AtomHelper<char16_t>(cx, chars, length)
+           : MakeUTF8AtomHelper<JS::Latin1Char>(cx, chars, length);
+}
+
 template <typename CharT>
 MOZ_ALWAYS_INLINE static JSAtom*
 AllocateNewAtom(JSContext* cx, const CharT* tbchars, size_t length, PinningBehavior pin,
@ -817,7 +935,7 @@ AllocateNewAtom(JSContext* cx, const CharT* tbchars, size_t length, PinningBehav
 {
    AutoAllocInAtomsZone ac(cx);

-    JSFlatString* flat = NewStringCopyN<NoGC>(cx, tbchars, length);
+    JSFlatString* flat = MakeFlatStringForAtomization(cx, tbchars, length);
    if (!flat) {
        // Grudgingly forgo last-ditch GC. The alternative would be to release
        // the lock, manually GC here, and retry from the top. If you fix this,
@ -919,19 +1037,22 @@ js::AtomizeChars(JSContext* cx, const char16_t* chars, size_t length, PinningBeh
 JSAtom*
 js::AtomizeUTF8Chars(JSContext* cx, const char* utf8Chars, size_t utf8ByteLength)
 {
-    // This could be optimized to hand the char16_t's directly to the JSAtom
-    // instead of making a copy. UTF8CharsToNewTwoByteCharsZ should be
-    // refactored to take an JSContext so that this function could also.
-
-    UTF8Chars utf8(utf8Chars, utf8ByteLength);
+    // Since the static strings are all ASCII, we can check them against the raw bytes before trying anything else.
+    if (JSAtom* s = cx->staticStrings().lookup(utf8Chars, utf8ByteLength)) {
+        return s;
+    }

    size_t length;
-    UniqueTwoByteChars chars(JS::UTF8CharsToNewTwoByteCharsZ(cx, utf8, &length).get());
-    if (!chars) {
+    HashNumber hash;  // hash accumulated over the inflated code units
+    JS::SmallestEncoding forCopy;  // narrowest encoding able to hold the decoded characters
+    UTF8Chars utf8(utf8Chars, utf8ByteLength);
+    if (!GetUTF8AtomizationData(cx, utf8, &length, &forCopy, &hash)) {  // false when decoding |utf8| fails
        return nullptr;
    }

-    return AtomizeChars(cx, chars.get(), length);
+    AtomizeUTF8CharsWrapper chars(utf8, forCopy);
+    AtomHasher::Lookup lookup(utf8Chars, utf8ByteLength, length, hash);  // keyed on the raw UTF-8 bytes plus the precomputed length and hash
+    return AtomizeAndCopyCharsFromLookup(cx, &chars, length, lookup, DoNotPinAtom, Nothing());
 }

 bool