Bug 1494942 - Improve AtomizeUTF8Chars performance. (r=Waldo)

--HG--
extra : rebase_source : 520a97331ee2358617acbf9db3bfc9eb425248e0
This commit is contained in:
Eric Faust 2018-11-05 16:03:16 -08:00
Родитель de0535b7e0
Коммит ea3bcd2b8b
2 изменённых файлов: 265 добавлений и 46 удалений

Просмотреть файл

@ -377,6 +377,28 @@ InflateUTF8ToUTF16(JSContext* cx, const UTF8Chars src, OutputFn dst)
return true;
}
/*
 * Decode |src| into |dst|, converting every resulting code unit to CharT, and
 * then store a terminating NUL at dst[outlen].
 *
 * |dst| must have room for at least |outlen + 1| elements, because the
 * terminator is written one past the last character. |outlen| must equal the
 * number of code units |src| decodes to; both paths assert this.
 *
 * If |allASCII| is true each source byte is copied across unchanged.
 * Otherwise the bytes are fed through InflateUTF8ToUTF16<ErrorAction>, which
 * is expected to succeed, and each char16_t it produces is cast to CharT.
 */
template <OnUTF8Error ErrorAction, typename CharT>
static void
CopyAndInflateUTF8IntoBuffer(JSContext* cx, const UTF8Chars src, CharT *dst, size_t outlen, bool allASCII)
{
    if (allASCII) {
        size_t srclen = src.length();
        MOZ_ASSERT(outlen == srclen);
        // The counter is a size_t so that it has the same width as |srclen|;
        // a narrower counter could wrap before reaching a very large bound.
        for (size_t i = 0; i < srclen; i++) {
            dst[i] = CharT(src[i]);
        }
    } else {
        size_t j = 0;
        // Append each decoded code unit to |dst|, tracking how many were
        // written so the count can be checked against |outlen| afterwards.
        auto push = [dst, &j](char16_t c) -> LoopDisposition {
            dst[j++] = CharT(c);
            return LoopDisposition::Continue;
        };
        MOZ_ALWAYS_TRUE((InflateUTF8ToUTF16<ErrorAction>(cx, src, push)));
        MOZ_ASSERT(j == outlen);
    }
    dst[outlen] = CharT('\0');    // NUL char
}
template <OnUTF8Error ErrorAction, typename CharsT>
static CharsT
InflateUTF8StringHelper(JSContext* cx, const UTF8Chars src, size_t* outlen)
@ -406,25 +428,10 @@ InflateUTF8StringHelper(JSContext* cx, const UTF8Chars src, size_t* outlen)
return CharsT();
}
if (allASCII) {
size_t srclen = src.length();
MOZ_ASSERT(*outlen == srclen);
for (uint32_t i = 0; i < srclen; i++) {
dst[i] = CharT(src[i]);
}
} else {
constexpr OnUTF8Error errorMode = std::is_same<CharT, Latin1Char>::value
? OnUTF8Error::InsertQuestionMark
: OnUTF8Error::InsertReplacementCharacter;
size_t j = 0;
auto push = [dst, &j](char16_t c) -> LoopDisposition {
dst[j++] = CharT(c);
return LoopDisposition::Continue;
};
MOZ_ALWAYS_TRUE((InflateUTF8ToUTF16<errorMode>(cx, src, push)));
MOZ_ASSERT(j == len);
}
dst[*outlen] = 0; // NUL char
constexpr OnUTF8Error errorMode = std::is_same<CharT, Latin1Char>::value
? OnUTF8Error::InsertQuestionMark
: OnUTF8Error::InsertReplacementCharacter;
CopyAndInflateUTF8IntoBuffer<errorMode>(cx, src, dst, *outlen, allASCII);
return CharsT(dst, *outlen);
}
@ -455,20 +462,31 @@ JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const JS::ConstUTF8CharsZ& u
return InflateUTF8StringHelper<OnUTF8Error::InsertReplacementCharacter, TwoByteCharsZ>(cx, chars, outlen);
}
static void
UpdateSmallestEncodingForChar(char16_t c, JS::SmallestEncoding* encoding)
{
JS::SmallestEncoding newEncoding = JS::SmallestEncoding::ASCII;
if (c >= 0x80) {
if (c < 0x100) {
newEncoding = JS::SmallestEncoding::Latin1;
} else {
newEncoding = JS::SmallestEncoding::UTF16;
}
}
if (newEncoding > *encoding) {
*encoding = newEncoding;
}
}
JS::SmallestEncoding
JS::FindSmallestEncoding(UTF8Chars utf8)
{
JS::SmallestEncoding encoding = JS::SmallestEncoding::ASCII;
auto onChar = [&encoding](char16_t c) -> LoopDisposition {
if (c >= 0x80) {
if (c < 0x100) {
encoding = JS::SmallestEncoding::Latin1;
} else {
encoding = JS::SmallestEncoding::UTF16;
return LoopDisposition::Break;
}
}
return LoopDisposition::Continue;
UpdateSmallestEncodingForChar(c, &encoding);
return encoding == JS::SmallestEncoding::UTF16
? LoopDisposition::Break
: LoopDisposition::Continue;
};
MOZ_ALWAYS_TRUE((InflateUTF8ToUTF16<OnUTF8Error::InsertReplacementCharacter>(
/* cx = */ nullptr, utf8, onChar)));
@ -487,6 +505,86 @@ JS::LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t*
return InflateUTF8StringHelper<OnUTF8Error::InsertQuestionMark, Latin1CharsZ>(cx, utf8, outlen);
}
/**
* Atomization Helpers.
*
* These functions are extremely single-use, and are not intended for general
* consumption.
*/
/*
 * Walk |utf8| once and report, through the out-parameters:
 *   - |*outlen|:   how many char16_t code units the input decodes to,
 *   - |*encoding|: the smallest encoding able to hold every code unit,
 *   - |*hashNum|:  the hash built by folding each code unit in with
 *                  mozilla::AddToHash, starting from 0.
 *
 * Returns whatever InflateUTF8ToUTF16<OnUTF8Error::Throw> returns, so false
 * means decoding failed (presumably with the error reported on |cx|). The
 * out-parameters are reset before decoding starts and may hold partial
 * results on failure.
 */
bool
GetUTF8AtomizationData(JSContext* cx, const JS::UTF8Chars utf8, size_t* outlen,
                       JS::SmallestEncoding* encoding, HashNumber* hashNum)
{
    // Begin with the answers for an empty string.
    *outlen = 0;
    *encoding = JS::SmallestEncoding::ASCII;
    *hashNum = 0;

    auto accumulate = [outlen, encoding, hashNum](char16_t unit) -> LoopDisposition {
        ++*outlen;
        UpdateSmallestEncodingForChar(unit, encoding);
        *hashNum = mozilla::AddToHash(*hashNum, unit);
        return LoopDisposition::Continue;
    };

    return InflateUTF8ToUTF16<OnUTF8Error::Throw>(cx, utf8, accumulate);
}
/*
 * Decode |utfChars| one code unit at a time and compare each unit with the
 * corresponding element of |chars|, stopping at the first difference.
 * Returns true if every decoded unit matched.
 *
 * No length is passed for |chars|: it must be readable for as many elements
 * as end up being compared. CharT is expected to be JS::Latin1Char or
 * char16_t; debug builds crash on any other type, and for Latin1Char assert
 * that each decoded unit fits in Latin1.
 */
template <typename CharT>
bool
UTF8EqualsChars(const JS::UTF8Chars utfChars, const CharT* chars)
{
    size_t pos = 0;
    bool matches = true;

    auto compareUnit = [&matches, &pos, chars](char16_t unit) -> LoopDisposition {
#ifdef DEBUG
        JS::SmallestEncoding needed = JS::SmallestEncoding::ASCII;
        UpdateSmallestEncodingForChar(unit, &needed);
        if (std::is_same<CharT, JS::Latin1Char>::value) {
            MOZ_ASSERT(needed <= JS::SmallestEncoding::Latin1);
        } else if (!std::is_same<CharT, char16_t>::value) {
            MOZ_CRASH("Invalid character type in UTF8EqualsChars");
        }
#endif

        if (CharT(unit) == chars[pos]) {
            pos++;
            return LoopDisposition::Continue;
        }

        matches = false;
        return LoopDisposition::Break;
    };

    // Callers are expected to have validated the UTF-8 already, so a
    // decoding error here is treated as a crash rather than reported.
    InflateUTF8ToUTF16<OnUTF8Error::Crash>(/* cx = */ nullptr, utfChars, compareUnit);

    return matches;
}

template bool UTF8EqualsChars<char16_t>(const JS::UTF8Chars, const char16_t*);
template bool UTF8EqualsChars<JS::Latin1Char>(const JS::UTF8Chars, const JS::Latin1Char*);
/*
 * Decode |src| into |dst| and NUL-terminate it by forwarding to
 * CopyAndInflateUTF8IntoBuffer with OnUTF8Error::Crash and no JSContext.
 *
 * |dstLen| is passed through as the expected decoded length, and the
 * all-ASCII path is requested exactly when |encoding| is
 * SmallestEncoding::ASCII.
 */
template <typename CharT>
void
InflateUTF8CharsToBufferAndTerminate(const UTF8Chars src, CharT* dst, size_t dstLen,
                                     JS::SmallestEncoding encoding)
{
    const bool allASCII = encoding == JS::SmallestEncoding::ASCII;
    CopyAndInflateUTF8IntoBuffer<OnUTF8Error::Crash>(/* cx = */ nullptr, src, dst, dstLen,
                                                     allASCII);
}

template void
InflateUTF8CharsToBufferAndTerminate<char16_t>(const UTF8Chars src, char16_t* dst, size_t dstLen,
                                               JS::SmallestEncoding encoding);

template void
InflateUTF8CharsToBufferAndTerminate<JS::Latin1Char>(const UTF8Chars src, JS::Latin1Char* dst,
                                                     size_t dstLen, JS::SmallestEncoding encoding);
#ifdef DEBUG
void
JS::ConstUTF8CharsZ::validate(size_t aLength)

Просмотреть файл

@ -41,37 +41,59 @@ using mozilla::Maybe;
using mozilla::Nothing;
using mozilla::RangedPtr;
// Declarations of the UTF-8 atomization helpers used below. They are declared
// extern here, so their definitions live outside this file.

// Decodes |src| into |dst| and NUL-terminates it; |dstLen| is the decoded
// length and |encoding| the smallest encoding that can hold the characters.
template <typename CharT>
extern void InflateUTF8CharsToBufferAndTerminate(const UTF8Chars src, CharT* dst, size_t dstLen,
JS::SmallestEncoding encoding);
// Compares the decoded contents of |utf8| against |chars|.
template <typename CharT>
extern bool UTF8EqualsChars(const JS::UTF8Chars utf8, const CharT* chars);
// Computes the decoded length, smallest encoding and hash of |utf8| through
// the out-parameters; returns false on failure.
extern bool
GetUTF8AtomizationData(JSContext* cx, const JS::UTF8Chars utf8, size_t* outlen, JS::SmallestEncoding* encoding,
HashNumber* hashNum);
struct js::AtomHasher::Lookup
{
union {
const JS::Latin1Char* latin1Chars;
const char16_t* twoByteChars;
const char* utf8Bytes;
};
bool isLatin1;
enum {
TwoByteChar,
Latin1,
UTF8
} type;
size_t length;
size_t byteLength;
const JSAtom* atom; /* Optional. */
JS::AutoCheckCannotGC nogc;
HashNumber hash;
MOZ_ALWAYS_INLINE Lookup(const char* utf8Bytes, size_t byteLen, size_t length, HashNumber hash)
: utf8Bytes(utf8Bytes), type(UTF8), length(length), byteLength(byteLen), atom(nullptr), hash(hash)
{}
MOZ_ALWAYS_INLINE Lookup(const char16_t* chars, size_t length)
: twoByteChars(chars), isLatin1(false), length(length), atom(nullptr),
: twoByteChars(chars), type(TwoByteChar), length(length), atom(nullptr),
hash(mozilla::HashString(chars, length))
{}
MOZ_ALWAYS_INLINE Lookup(const JS::Latin1Char* chars, size_t length)
: latin1Chars(chars), isLatin1(true), length(length), atom(nullptr),
: latin1Chars(chars), type(Latin1), length(length), atom(nullptr),
hash(mozilla::HashString(chars, length))
{}
inline explicit Lookup(const JSAtom* atom)
: isLatin1(atom->hasLatin1Chars()), length(atom->length()), atom(atom),
: type(atom->hasLatin1Chars() ? Latin1 : TwoByteChar), length(atom->length()), atom(atom),
hash(atom->hash())
{
if (isLatin1) {
if (type == Latin1) {
latin1Chars = atom->latin1Chars(nogc);
MOZ_ASSERT(mozilla::HashString(latin1Chars, length) == hash);
} else {
MOZ_ASSERT(type == TwoByteChar);
twoByteChars = atom->twoByteChars(nogc);
MOZ_ASSERT(mozilla::HashString(twoByteChars, length) == hash);
}
@ -97,17 +119,29 @@ js::AtomHasher::match(const AtomStateEntry& entry, const Lookup& lookup)
if (key->hasLatin1Chars()) {
const Latin1Char* keyChars = key->latin1Chars(lookup.nogc);
if (lookup.isLatin1) {
switch (lookup.type) {
case Lookup::Latin1:
return mozilla::ArrayEqual(keyChars, lookup.latin1Chars, lookup.length);
case Lookup::TwoByteChar:
return EqualChars(keyChars, lookup.twoByteChars, lookup.length);
case Lookup::UTF8: {
JS::UTF8Chars utf8(lookup.utf8Bytes, lookup.byteLength);
return UTF8EqualsChars(utf8, keyChars);
}
}
return EqualChars(keyChars, lookup.twoByteChars, lookup.length);
}
const char16_t* keyChars = key->twoByteChars(lookup.nogc);
if (lookup.isLatin1) {
switch (lookup.type) {
case Lookup::Latin1:
return EqualChars(lookup.latin1Chars, keyChars, lookup.length);
case Lookup::TwoByteChar:
return mozilla::ArrayEqual(keyChars, lookup.twoByteChars, lookup.length);
case Lookup::UTF8: {
JS::UTF8Chars utf8(lookup.utf8Bytes, lookup.byteLength);
return UTF8EqualsChars(utf8, keyChars);
}
}
return mozilla::ArrayEqual(keyChars, lookup.twoByteChars, lookup.length);
}
inline JSAtom*
@ -620,6 +654,12 @@ MOZ_ALWAYS_INLINE static JSAtom*
AllocateNewAtom(JSContext* cx, const CharT* tbchars, size_t length, PinningBehavior pin,
const Maybe<uint32_t>& indexValue, const AtomHasher::Lookup& lookup);
// Forward declaration: atomizes |tbchars| using an AtomHasher::Lookup that the
// caller has already constructed. AtomizeAndCopyChars builds a Lookup from the
// characters and delegates to this function.
template <typename CharT>
MOZ_ALWAYS_INLINE
static JSAtom*
AtomizeAndCopyCharsFromLookup(JSContext* cx, const CharT* tbchars, size_t length, const AtomHasher::Lookup& lookup,
PinningBehavior pin, const Maybe<uint32_t>& indexValue);
/* |tbchars| must not point into an inline or short string. */
template <typename CharT>
MOZ_ALWAYS_INLINE
@ -632,7 +672,16 @@ AtomizeAndCopyChars(JSContext* cx, const CharT* tbchars, size_t length, PinningB
}
AtomHasher::Lookup lookup(tbchars, length);
return AtomizeAndCopyCharsFromLookup(cx, tbchars, length, lookup, pin, indexValue);
}
template <typename CharT>
MOZ_ALWAYS_INLINE
static JSAtom*
AtomizeAndCopyCharsFromLookup(JSContext* cx, const CharT* tbchars, size_t length, const AtomHasher::Lookup& lookup,
PinningBehavior pin, const Maybe<uint32_t>& indexValue)
{
// Try the per-Zone cache first. If we find the atom there we can avoid the
// atoms lock, the markAtom call, and the multiple HashSet lookups below.
// We don't use the per-Zone cache if we want a pinned atom: handling that
@ -810,6 +859,75 @@ PermanentlyAtomizeAndCopyChars(JSContext* cx,
return atom;
}
/*
 * Pairs a range of UTF-8 characters with the smallest encoding able to hold
 * them, so that both can be passed together as a single "characters"
 * argument.
 */
struct AtomizeUTF8CharsWrapper
{
    // The UTF-8 input.
    JS::UTF8Chars utf8;

    // Smallest encoding that can represent every character in |utf8|.
    JS::SmallestEncoding encoding;

    AtomizeUTF8CharsWrapper(const JS::UTF8Chars& chars, JS::SmallestEncoding minEncode)
      : utf8(chars),
        encoding(minEncode)
    {}
};
/*
 * Generic case: build the flat string by copying |length| characters from
 * |tbchars| with NewStringCopyN<NoGC>, and return its result unchanged.
 */
template <typename CharT>
MOZ_ALWAYS_INLINE
static JSFlatString*
MakeFlatStringForAtomization(JSContext* cx, const CharT* tbchars, size_t length)
{
    JSFlatString* flat = NewStringCopyN<NoGC>(cx, tbchars, length);
    return flat;
}
/*
 * Create a flat string of |length| CharT characters by decoding the UTF-8
 * held in |chars| directly into the string's storage.
 *
 * Short strings are decoded into inline storage; longer ones into a freshly
 * malloc'ed buffer of |length + 1| characters. Returns nullptr if any
 * allocation fails.
 */
template<typename CharT>
MOZ_ALWAYS_INLINE
static JSFlatString*
MakeUTF8AtomHelper(JSContext* cx, const AtomizeUTF8CharsWrapper* chars, size_t length)
{
    if (JSInlineString::lengthFits<CharT>(length)) {
        CharT* inlineChars;
        JSInlineString* inlineStr = AllocateInlineString<NoGC>(cx, length, &inlineChars);
        if (inlineStr) {
            InflateUTF8CharsToBufferAndTerminate(chars->utf8, inlineChars, length,
                                                 chars->encoding);
        }
        return inlineStr;
    }

    // MakeUTF8AtomHelper is called from deep in the atomization path, which
    // expects functions to fail gracefully with nullptr on OOM, without
    // throwing.
    //
    // Flat strings are null-terminated, hence the extra element.
    UniquePtr<CharT[], JS::FreePolicy> buffer(js_pod_malloc<CharT>(length + 1));
    if (!buffer) {
        return nullptr;
    }

    InflateUTF8CharsToBufferAndTerminate(chars->utf8, buffer.get(), length, chars->encoding);

    JSFlatString* flat = JSFlatString::new_<NoGC>(cx, buffer.get(), length);
    if (flat) {
        // The buffer is now referenced by the new string (presumably owned by
        // it), so stop the UniquePtr from freeing it.
        mozilla::Unused << buffer.release();
    }
    return flat;
}
/*
 * Specialization for UTF-8 input: |chars| carries the UTF-8 range together
 * with its smallest encoding, and |length| is the decoded length.
 *
 * A zero length yields the context's empty string. Otherwise the string is
 * built with char16_t storage when the encoding is UTF16 and with Latin1
 * storage in every other case.
 */
template<>
MOZ_ALWAYS_INLINE
/* static */ JSFlatString*
MakeFlatStringForAtomization(JSContext* cx, const AtomizeUTF8CharsWrapper* chars, size_t length)
{
    if (length == 0) {
        return cx->emptyString();
    }

    const bool needsTwoByte = chars->encoding == JS::SmallestEncoding::UTF16;
    return needsTwoByte
           ? MakeUTF8AtomHelper<char16_t>(cx, chars, length)
           : MakeUTF8AtomHelper<JS::Latin1Char>(cx, chars, length);
}
template <typename CharT>
MOZ_ALWAYS_INLINE static JSAtom*
AllocateNewAtom(JSContext* cx, const CharT* tbchars, size_t length, PinningBehavior pin,
@ -817,7 +935,7 @@ AllocateNewAtom(JSContext* cx, const CharT* tbchars, size_t length, PinningBehav
{
AutoAllocInAtomsZone ac(cx);
JSFlatString* flat = NewStringCopyN<NoGC>(cx, tbchars, length);
JSFlatString* flat = MakeFlatStringForAtomization(cx, tbchars, length);
if (!flat) {
// Grudgingly forgo last-ditch GC. The alternative would be to release
// the lock, manually GC here, and retry from the top. If you fix this,
@ -919,19 +1037,22 @@ js::AtomizeChars(JSContext* cx, const char16_t* chars, size_t length, PinningBeh
JSAtom*
js::AtomizeUTF8Chars(JSContext* cx, const char* utf8Chars, size_t utf8ByteLength)
{
// This could be optimized to hand the char16_t's directly to the JSAtom
// instead of making a copy. UTF8CharsToNewTwoByteCharsZ should be
// refactored to take an JSContext so that this function could also.
UTF8Chars utf8(utf8Chars, utf8ByteLength);
// Since the static strings are all ascii, we can check them before trying anything else.
if (JSAtom* s = cx->staticStrings().lookup(utf8Chars, utf8ByteLength)) {
return s;
}
size_t length;
UniqueTwoByteChars chars(JS::UTF8CharsToNewTwoByteCharsZ(cx, utf8, &length).get());
if (!chars) {
HashNumber hash;
JS::SmallestEncoding forCopy;
UTF8Chars utf8(utf8Chars, utf8ByteLength);
if (!GetUTF8AtomizationData(cx, utf8, &length, &forCopy, &hash)) {
return nullptr;
}
return AtomizeChars(cx, chars.get(), length);
AtomizeUTF8CharsWrapper chars(utf8, forCopy);
AtomHasher::Lookup lookup(utf8Chars, utf8ByteLength, length, hash);
return AtomizeAndCopyCharsFromLookup(cx, &chars, length, lookup, DoNotPinAtom, Nothing());
}
bool