Mirror of https://github.com/mozilla/gecko-dev.git
Bug 1779807 - Implement memchr64 in AVX2 r=iain
This only makes sense for AVX2, because widening it from a 64-bit comparison to a 128-bit comparison is hardly worth it, and there are gaps in the SSE2 instruction set (it is missing _mm_cmpeq_epi64, which was only introduced in SSE4.1) that we would have to compensate for, probably at a sizeable perf hit.

Differential Revision: https://phabricator.services.mozilla.com/D152297
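To make the rationale concrete, here is a minimal standalone sketch (not the patch itself; the function name and structure are illustrative) of the core AVX2 idiom the patch builds on: _mm256_cmpeq_epi64 compares four 64-bit lanes per load, an operation SSE2 has no counterpart for. The real patch layers alignment handling, small-size fallbacks, and 4x unrolling on top of this idea.

#include <immintrin.h>

#include <cstddef>
#include <cstdint>

// Illustrative AVX2 64-bit memchr core (compile with -mavx2).
const uint64_t* Memchr64Sketch(const uint64_t* ptr, uint64_t value,
                               size_t length) {
  __m256i needle = _mm256_set1_epi64x(value);  // splat value into all 4 lanes
  size_t i = 0;
  for (; i + 4 <= length; i += 4) {
    __m256i haystack =
        _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr + i));
    __m256i cmp = _mm256_cmpeq_epi64(needle, haystack);  // per-lane equality
    int mask = _mm256_movemask_epi8(cmp);  // 8 mask bits per matching lane
    if (mask) {
      return ptr + i + __builtin_ctz(mask) / 8;  // lane index of first match
    }
  }
  for (; i < length; i++) {  // scalar tail for the last 0-3 elements
    if (ptr[i] == value) {
      return ptr + i;
    }
  }
  return nullptr;
}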
Parent: 1f10d44d74
Commit: c2cde6897f
mfbt/SIMD.cpp (286 changed lines)
@@ -14,6 +14,19 @@
 
 namespace mozilla {
 
+template <typename TValue>
+const TValue* FindInBufferNaive(const TValue* ptr, TValue value,
+                                size_t length) {
+  const TValue* end = ptr + length;
+  while (ptr < end) {
+    if (*ptr == value) {
+      return ptr;
+    }
+    ptr++;
+  }
+  return nullptr;
+}
+
 #ifdef MOZILLA_PRESUME_SSE2
 
 # include <immintrin.h>
@@ -41,22 +54,27 @@ uintptr_t AlignDown32(uintptr_t ptr) { return ptr & ~0x1f; }
 
 uintptr_t AlignUp32(uintptr_t ptr) { return AlignDown32(ptr + 0x1f); }
 
-template <typename CharType>
+template <typename TValue>
 __m128i CmpEq128(__m128i a, __m128i b) {
-  static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2);
-  if (sizeof(CharType) == 1) {
+  static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2);
+  if (sizeof(TValue) == 1) {
     return _mm_cmpeq_epi8(a, b);
   }
   return _mm_cmpeq_epi16(a, b);
 }
 
-template <typename CharType>
+template <typename TValue>
 __m256i CmpEq256(__m256i a, __m256i b) {
-  static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2);
-  if (sizeof(CharType) == 1) {
+  static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2 ||
+                sizeof(TValue) == 8);
+  if (sizeof(TValue) == 1) {
     return _mm256_cmpeq_epi8(a, b);
   }
-  return _mm256_cmpeq_epi16(a, b);
+  if (sizeof(TValue) == 2) {
+    return _mm256_cmpeq_epi16(a, b);
+  }
+
+  return _mm256_cmpeq_epi64(a, b);
 }
 
 # ifdef __GNUC__
@@ -128,17 +146,17 @@ const char* Check4x4Chars(__m128i needle, uintptr_t a, uintptr_t b, uintptr_t c,
   return nullptr;
 }
 
-template <typename CharType>
-const CharType* Check4x8Bytes(__m128i needle, uintptr_t a, uintptr_t b,
+template <typename TValue>
+const TValue* Check4x8Bytes(__m128i needle, uintptr_t a, uintptr_t b,
                             uintptr_t c, uintptr_t d) {
   __m128i haystackA = Load64BitsIntoXMM(a);
-  __m128i cmpA = CmpEq128<CharType>(needle, haystackA);
+  __m128i cmpA = CmpEq128<TValue>(needle, haystackA);
   __m128i haystackB = Load64BitsIntoXMM(b);
-  __m128i cmpB = CmpEq128<CharType>(needle, haystackB);
+  __m128i cmpB = CmpEq128<TValue>(needle, haystackB);
   __m128i haystackC = Load64BitsIntoXMM(c);
-  __m128i cmpC = CmpEq128<CharType>(needle, haystackC);
+  __m128i cmpC = CmpEq128<TValue>(needle, haystackC);
   __m128i haystackD = Load64BitsIntoXMM(d);
-  __m128i cmpD = CmpEq128<CharType>(needle, haystackD);
+  __m128i cmpD = CmpEq128<TValue>(needle, haystackD);
   __m128i or_ab = _mm_or_si128(cmpA, cmpB);
   __m128i or_cd = _mm_or_si128(cmpC, cmpD);
   __m128i or_abcd = _mm_or_si128(or_ab, or_cd);
@@ -147,36 +165,36 @@ const CharType* Check4x8Bytes(__m128i needle, uintptr_t a, uintptr_t b,
     int cmpMask;
     cmpMask = _mm_movemask_epi8(cmpA);
     if (cmpMask & 0xff) {
-      return reinterpret_cast<const CharType*>(a + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(a + __builtin_ctz(cmpMask));
     }
     cmpMask = _mm_movemask_epi8(cmpB);
     if (cmpMask & 0xff) {
-      return reinterpret_cast<const CharType*>(b + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(b + __builtin_ctz(cmpMask));
     }
     cmpMask = _mm_movemask_epi8(cmpC);
     if (cmpMask & 0xff) {
-      return reinterpret_cast<const CharType*>(c + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(c + __builtin_ctz(cmpMask));
     }
     cmpMask = _mm_movemask_epi8(cmpD);
     if (cmpMask & 0xff) {
-      return reinterpret_cast<const CharType*>(d + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(d + __builtin_ctz(cmpMask));
     }
   }
 
   return nullptr;
 }
 
-template <typename CharType>
-const CharType* Check4x16Bytes(__m128i needle, uintptr_t a, uintptr_t b,
-                               uintptr_t c, uintptr_t d) {
+template <typename TValue>
+const TValue* Check4x16Bytes(__m128i needle, uintptr_t a, uintptr_t b,
+                             uintptr_t c, uintptr_t d) {
   __m128i haystackA = _mm_loadu_si128(Cast128(a));
-  __m128i cmpA = CmpEq128<CharType>(needle, haystackA);
+  __m128i cmpA = CmpEq128<TValue>(needle, haystackA);
   __m128i haystackB = _mm_loadu_si128(Cast128(b));
-  __m128i cmpB = CmpEq128<CharType>(needle, haystackB);
+  __m128i cmpB = CmpEq128<TValue>(needle, haystackB);
   __m128i haystackC = _mm_loadu_si128(Cast128(c));
-  __m128i cmpC = CmpEq128<CharType>(needle, haystackC);
+  __m128i cmpC = CmpEq128<TValue>(needle, haystackC);
   __m128i haystackD = _mm_loadu_si128(Cast128(d));
-  __m128i cmpD = CmpEq128<CharType>(needle, haystackD);
+  __m128i cmpD = CmpEq128<TValue>(needle, haystackD);
   __m128i or_ab = _mm_or_si128(cmpA, cmpB);
   __m128i or_cd = _mm_or_si128(cmpC, cmpD);
   __m128i or_abcd = _mm_or_si128(or_ab, or_cd);
@@ -185,36 +203,36 @@ const CharType* Check4x16Bytes(__m128i needle, uintptr_t a, uintptr_t b,
     int cmpMask;
     cmpMask = _mm_movemask_epi8(cmpA);
     if (cmpMask) {
-      return reinterpret_cast<const CharType*>(a + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(a + __builtin_ctz(cmpMask));
     }
     cmpMask = _mm_movemask_epi8(cmpB);
     if (cmpMask) {
-      return reinterpret_cast<const CharType*>(b + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(b + __builtin_ctz(cmpMask));
     }
     cmpMask = _mm_movemask_epi8(cmpC);
     if (cmpMask) {
-      return reinterpret_cast<const CharType*>(c + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(c + __builtin_ctz(cmpMask));
     }
     cmpMask = _mm_movemask_epi8(cmpD);
     if (cmpMask) {
-      return reinterpret_cast<const CharType*>(d + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(d + __builtin_ctz(cmpMask));
     }
   }
 
   return nullptr;
 }
 
-template <typename CharType>
-const CharType* Check4x32Bytes(__m256i needle, uintptr_t a, uintptr_t b,
-                               uintptr_t c, uintptr_t d) {
+template <typename TValue>
+const TValue* Check4x32Bytes(__m256i needle, uintptr_t a, uintptr_t b,
+                             uintptr_t c, uintptr_t d) {
   __m256i haystackA = _mm256_loadu_si256(Cast256(a));
-  __m256i cmpA = CmpEq256<CharType>(needle, haystackA);
+  __m256i cmpA = CmpEq256<TValue>(needle, haystackA);
   __m256i haystackB = _mm256_loadu_si256(Cast256(b));
-  __m256i cmpB = CmpEq256<CharType>(needle, haystackB);
+  __m256i cmpB = CmpEq256<TValue>(needle, haystackB);
   __m256i haystackC = _mm256_loadu_si256(Cast256(c));
-  __m256i cmpC = CmpEq256<CharType>(needle, haystackC);
+  __m256i cmpC = CmpEq256<TValue>(needle, haystackC);
   __m256i haystackD = _mm256_loadu_si256(Cast256(d));
-  __m256i cmpD = CmpEq256<CharType>(needle, haystackD);
+  __m256i cmpD = CmpEq256<TValue>(needle, haystackD);
   __m256i or_ab = _mm256_or_si256(cmpA, cmpB);
   __m256i or_cd = _mm256_or_si256(cmpC, cmpD);
   __m256i or_abcd = _mm256_or_si256(or_ab, or_cd);
@@ -223,19 +241,19 @@ const CharType* Check4x32Bytes(__m256i needle, uintptr_t a, uintptr_t b,
     int cmpMask;
     cmpMask = _mm256_movemask_epi8(cmpA);
     if (cmpMask) {
-      return reinterpret_cast<const CharType*>(a + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(a + __builtin_ctz(cmpMask));
    }
     cmpMask = _mm256_movemask_epi8(cmpB);
     if (cmpMask) {
-      return reinterpret_cast<const CharType*>(b + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(b + __builtin_ctz(cmpMask));
     }
     cmpMask = _mm256_movemask_epi8(cmpC);
     if (cmpMask) {
-      return reinterpret_cast<const CharType*>(c + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(c + __builtin_ctz(cmpMask));
     }
     cmpMask = _mm256_movemask_epi8(cmpD);
     if (cmpMask) {
-      return reinterpret_cast<const CharType*>(d + __builtin_ctz(cmpMask));
+      return reinterpret_cast<const TValue*>(d + __builtin_ctz(cmpMask));
     }
   }
 
@@ -255,15 +273,15 @@ enum class HaystackOverlap {
 // the next a's 16-byte chunk is needle2. `overlap` and whether
 // `carryIn`/`carryOut` are NULL should be knowable at compile time to avoid
 // branching.
-template <typename CharType>
-const CharType* Check2x2x16Bytes(__m128i needle1, __m128i needle2, uintptr_t a,
-                                 uintptr_t b, __m128i* carryIn,
-                                 __m128i* carryOut, HaystackOverlap overlap) {
-  const int shiftRightAmount = 16 - sizeof(CharType);
-  const int shiftLeftAmount = sizeof(CharType);
+template <typename TValue>
+const TValue* Check2x2x16Bytes(__m128i needle1, __m128i needle2, uintptr_t a,
+                               uintptr_t b, __m128i* carryIn, __m128i* carryOut,
+                               HaystackOverlap overlap) {
+  const int shiftRightAmount = 16 - sizeof(TValue);
+  const int shiftLeftAmount = sizeof(TValue);
   __m128i haystackA = _mm_loadu_si128(Cast128(a));
-  __m128i cmpA1 = CmpEq128<CharType>(needle1, haystackA);
-  __m128i cmpA2 = CmpEq128<CharType>(needle2, haystackA);
+  __m128i cmpA1 = CmpEq128<TValue>(needle1, haystackA);
+  __m128i cmpA2 = CmpEq128<TValue>(needle2, haystackA);
   __m128i cmpA;
   if (carryIn) {
     cmpA = _mm_and_si128(
@@ -272,8 +290,8 @@ const CharType* Check2x2x16Bytes(__m128i needle1, __m128i needle2, uintptr_t a,
     cmpA = _mm_and_si128(_mm_bslli_si128(cmpA1, shiftLeftAmount), cmpA2);
   }
   __m128i haystackB = _mm_loadu_si128(Cast128(b));
-  __m128i cmpB1 = CmpEq128<CharType>(needle1, haystackB);
-  __m128i cmpB2 = CmpEq128<CharType>(needle2, haystackB);
+  __m128i cmpB1 = CmpEq128<TValue>(needle1, haystackB);
+  __m128i cmpB2 = CmpEq128<TValue>(needle2, haystackB);
   __m128i cmpB;
   if (overlap == HaystackOverlap::Overlapping) {
     cmpB = _mm_and_si128(_mm_bslli_si128(cmpB1, shiftLeftAmount), cmpB2);
@@ -289,13 +307,13 @@ const CharType* Check2x2x16Bytes(__m128i needle1, __m128i needle2, uintptr_t a,
     int cmpMask;
     cmpMask = _mm_movemask_epi8(cmpA);
     if (cmpMask) {
-      return reinterpret_cast<const CharType*>(a + __builtin_ctz(cmpMask) -
-                                               shiftLeftAmount);
+      return reinterpret_cast<const TValue*>(a + __builtin_ctz(cmpMask) -
+                                             shiftLeftAmount);
     }
     cmpMask = _mm_movemask_epi8(cmpB);
     if (cmpMask) {
-      return reinterpret_cast<const CharType*>(b + __builtin_ctz(cmpMask) -
-                                               shiftLeftAmount);
+      return reinterpret_cast<const TValue*>(b + __builtin_ctz(cmpMask) -
+                                             shiftLeftAmount);
     }
   }
 
@@ -306,13 +324,12 @@ const CharType* Check2x2x16Bytes(__m128i needle1, __m128i needle2, uintptr_t a,
   return nullptr;
 }
 
-template <typename CharType>
-const CharType* FindInBuffer(const CharType* ptr, CharType value,
-                             size_t length) {
-  static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2);
-  static_assert(std::is_unsigned<CharType>::value);
+template <typename TValue>
+const TValue* FindInBuffer(const TValue* ptr, TValue value, size_t length) {
+  static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2);
+  static_assert(std::is_unsigned<TValue>::value);
   uint64_t splat64;
-  if (sizeof(CharType) == 1) {
+  if (sizeof(TValue) == 1) {
     splat64 = 0x0101010101010101llu;
   } else {
     splat64 = 0x0001000100010001llu;
@@ -323,16 +340,16 @@ const CharType* FindInBuffer(const CharType* ptr, CharType value,
   int64_t i64_value = *reinterpret_cast<int64_t*>(&u64_value);
   __m128i needle = _mm_set_epi64x(i64_value, i64_value);
 
-  size_t numBytes = length * sizeof(CharType);
+  size_t numBytes = length * sizeof(TValue);
   uintptr_t cur = reinterpret_cast<uintptr_t>(ptr);
   uintptr_t end = cur + numBytes;
 
-  if ((sizeof(CharType) > 1 && numBytes < 16) || numBytes < 4) {
+  if ((sizeof(TValue) > 1 && numBytes < 16) || numBytes < 4) {
     while (cur < end) {
-      if (GetAs<CharType>(cur) == value) {
-        return reinterpret_cast<const CharType*>(cur);
+      if (GetAs<TValue>(cur) == value) {
+        return reinterpret_cast<const TValue*>(cur);
       }
-      cur += sizeof(CharType);
+      cur += sizeof(TValue);
     }
     return nullptr;
   }
@@ -352,9 +369,9 @@ const CharType* FindInBuffer(const CharType* ptr, CharType value,
     uintptr_t c = end - 4 - ((numBytes & 8) >> 1);
     uintptr_t d = end - 4;
     const char* charResult = Check4x4Chars(needle, a, b, c, d);
-    // Note: we ensure above that sizeof(CharType) == 1 here, so this is
+    // Note: we ensure above that sizeof(TValue) == 1 here, so this is
     // either char to char or char to something like a uint8_t.
-    return reinterpret_cast<const CharType*>(charResult);
+    return reinterpret_cast<const TValue*>(charResult);
   }
 
   if (numBytes < 64) {
@@ -364,17 +381,17 @@ const CharType* FindInBuffer(const CharType* ptr, CharType value,
     uintptr_t b = cur + ((numBytes & 32) >> 1);
     uintptr_t c = end - 16 - ((numBytes & 32) >> 1);
     uintptr_t d = end - 16;
-    return Check4x16Bytes<CharType>(needle, a, b, c, d);
+    return Check4x16Bytes<TValue>(needle, a, b, c, d);
   }
 
   // Get the initial unaligned load out of the way. This will overlap with the
   // aligned stuff below, but the overlapped part should effectively be free
   // (relative to a mispredict from doing a byte-by-byte loop).
   __m128i haystack = _mm_loadu_si128(Cast128(cur));
-  __m128i cmp = CmpEq128<CharType>(needle, haystack);
+  __m128i cmp = CmpEq128<TValue>(needle, haystack);
   int cmpMask = _mm_movemask_epi8(cmp);
   if (cmpMask) {
-    return reinterpret_cast<const CharType*>(cur + __builtin_ctz(cmpMask));
+    return reinterpret_cast<const TValue*>(cur + __builtin_ctz(cmpMask));
   }
 
   // Now we're working with aligned memory. Hooray! \o/
@@ -391,7 +408,7 @@ const CharType* FindInBuffer(const CharType* ptr, CharType value,
     uintptr_t b = cur + 16;
     uintptr_t c = cur + 32;
     uintptr_t d = cur + 48;
-    const CharType* result = Check4x16Bytes<CharType>(needle, a, b, c, d);
+    const TValue* result = Check4x16Bytes<TValue>(needle, a, b, c, d);
     if (result) {
       return result;
     }
@@ -402,49 +419,53 @@ const CharType* FindInBuffer(const CharType* ptr, CharType value,
   uintptr_t b = tailStartPtr + 16;
   uintptr_t c = tailStartPtr + 32;
   uintptr_t d = tailEndPtr;
-  return Check4x16Bytes<CharType>(needle, a, b, c, d);
+  return Check4x16Bytes<TValue>(needle, a, b, c, d);
 }
 
-template <typename CharType>
-const CharType* FindInBufferAVX2(const CharType* ptr, CharType value,
-                                 size_t length) {
-  static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2);
-  static_assert(std::is_unsigned<CharType>::value);
+template <typename TValue>
+const TValue* FindInBufferAVX2(const TValue* ptr, TValue value, size_t length) {
+  static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2 ||
+                sizeof(TValue) == 8);
+  static_assert(std::is_unsigned<TValue>::value);
 
   // Load our needle into a 32-byte register
   __m256i needle;
-  if (sizeof(CharType) == 1) {
+  if (sizeof(TValue) == 1) {
     needle = _mm256_set1_epi8(value);
-  } else {
+  } else if (sizeof(TValue) == 2) {
     needle = _mm256_set1_epi16(value);
+  } else {
+    needle = _mm256_set1_epi64x(value);
   }
 
-  size_t numBytes = length * sizeof(CharType);
+  size_t numBytes = length * sizeof(TValue);
   uintptr_t cur = reinterpret_cast<uintptr_t>(ptr);
   uintptr_t end = cur + numBytes;
 
-  if (numBytes < 8) {
+  if (numBytes < 8 || (sizeof(TValue) == 8 && numBytes < 32)) {
     while (cur < end) {
-      if (GetAs<CharType>(cur) == value) {
-        return reinterpret_cast<const CharType*>(cur);
+      if (GetAs<TValue>(cur) == value) {
+        return reinterpret_cast<const TValue*>(cur);
       }
-      cur += sizeof(CharType);
+      cur += sizeof(TValue);
     }
     return nullptr;
   }
 
-  if (numBytes < 32) {
-    __m128i needle_narrow;
-    if (sizeof(CharType) == 1) {
-      needle_narrow = _mm_set1_epi8(value);
-    } else {
-      needle_narrow = _mm_set1_epi16(value);
-    }
-    uintptr_t a = cur;
-    uintptr_t b = cur + ((numBytes & 16) >> 1);
-    uintptr_t c = end - 8 - ((numBytes & 16) >> 1);
-    uintptr_t d = end - 8;
-    return Check4x8Bytes<CharType>(needle_narrow, a, b, c, d);
+  if constexpr (sizeof(TValue) != 8) {
+    if (numBytes < 32) {
+      __m128i needle_narrow;
+      if (sizeof(TValue) == 1) {
+        needle_narrow = _mm_set1_epi8(value);
+      } else {
+        needle_narrow = _mm_set1_epi16(value);
+      }
+      uintptr_t a = cur;
+      uintptr_t b = cur + ((numBytes & 16) >> 1);
+      uintptr_t c = end - 8 - ((numBytes & 16) >> 1);
+      uintptr_t d = end - 8;
+      return Check4x8Bytes<TValue>(needle_narrow, a, b, c, d);
+    }
   }
 
   if (numBytes < 128) {
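(Reader's note on the hunk above, inferred from the visible logic rather than stated in the commit: for 8-byte elements, buffers under 32 bytes hold fewer than four lanes, so the widened `numBytes < 8 || (sizeof(TValue) == 8 && numBytes < 32)` guard routes them to the scalar loop, and the `if constexpr (sizeof(TValue) != 8)` wrapper compiles the Check4x8Bytes mid-path out entirely for uint64_t, since an 8-byte load holds only a single 64-bit element and CmpEq128 only handles 1- and 2-byte lanes.)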
@@ -454,17 +475,17 @@ const CharType* FindInBufferAVX2(const CharType* ptr, CharType value,
     uintptr_t b = cur + ((numBytes & 64) >> 1);
     uintptr_t c = end - 32 - ((numBytes & 64) >> 1);
     uintptr_t d = end - 32;
-    return Check4x32Bytes<CharType>(needle, a, b, c, d);
+    return Check4x32Bytes<TValue>(needle, a, b, c, d);
   }
 
   // Get the initial unaligned load out of the way. This will overlap with the
   // aligned stuff below, but the overlapped part should effectively be free
   // (relative to a mispredict from doing a byte-by-byte loop).
   __m256i haystack = _mm256_loadu_si256(Cast256(cur));
-  __m256i cmp = CmpEq256<CharType>(needle, haystack);
+  __m256i cmp = CmpEq256<TValue>(needle, haystack);
   int cmpMask = _mm256_movemask_epi8(cmp);
   if (cmpMask) {
-    return reinterpret_cast<const CharType*>(cur + __builtin_ctz(cmpMask));
+    return reinterpret_cast<const TValue*>(cur + __builtin_ctz(cmpMask));
   }
 
   // Now we're working with aligned memory. Hooray! \o/
@@ -478,7 +499,7 @@ const CharType* FindInBufferAVX2(const CharType* ptr, CharType value,
     uintptr_t b = cur + 32;
     uintptr_t c = cur + 64;
     uintptr_t d = cur + 96;
-    const CharType* result = Check4x32Bytes<CharType>(needle, a, b, c, d);
+    const TValue* result = Check4x32Bytes<TValue>(needle, a, b, c, d);
     if (result) {
       return result;
     }
@@ -489,12 +510,11 @@ const CharType* FindInBufferAVX2(const CharType* ptr, CharType value,
   uintptr_t b = tailStartPtr + 32;
   uintptr_t c = tailStartPtr + 64;
   uintptr_t d = tailEndPtr;
-  return Check4x32Bytes<CharType>(needle, a, b, c, d);
+  return Check4x32Bytes<TValue>(needle, a, b, c, d);
 }
 
-template <typename CharType>
-const CharType* TwoByteLoop(uintptr_t start, uintptr_t end, CharType v1,
-                            CharType v2);
+template <typename TValue>
+const TValue* TwoByteLoop(uintptr_t start, uintptr_t end, TValue v1, TValue v2);
 
 template <>
 const unsigned char* TwoByteLoop<unsigned char>(uintptr_t start, uintptr_t end,
@@ -533,13 +553,13 @@ const char16_t* TwoByteLoop<char16_t>(uintptr_t start, uintptr_t end,
   return nullptr;
 }
 
-template <typename CharType>
-const CharType* FindTwoInBuffer(const CharType* ptr, CharType v1, CharType v2,
-                                size_t length) {
-  static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2);
-  static_assert(std::is_unsigned<CharType>::value);
+template <typename TValue>
+const TValue* FindTwoInBuffer(const TValue* ptr, TValue v1, TValue v2,
+                              size_t length) {
+  static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2);
+  static_assert(std::is_unsigned<TValue>::value);
   uint64_t splat64;
-  if (sizeof(CharType) == 1) {
+  if (sizeof(TValue) == 1) {
     splat64 = 0x0101010101010101llu;
   } else {
     splat64 = 0x0001000100010001llu;
@@ -553,33 +573,33 @@ const CharType* FindTwoInBuffer(const CharType* ptr, CharType v1, CharType v2,
   int64_t i64_v2 = *reinterpret_cast<int64_t*>(&u64_v2);
   __m128i needle2 = _mm_set_epi64x(i64_v2, i64_v2);
 
-  size_t numBytes = length * sizeof(CharType);
+  size_t numBytes = length * sizeof(TValue);
   uintptr_t cur = reinterpret_cast<uintptr_t>(ptr);
   uintptr_t end = cur + numBytes;
 
   if (numBytes < 16) {
-    return TwoByteLoop<CharType>(cur, end, v1, v2);
+    return TwoByteLoop<TValue>(cur, end, v1, v2);
   }
 
   if (numBytes < 32) {
     uintptr_t a = cur;
     uintptr_t b = end - 16;
-    return Check2x2x16Bytes<CharType>(needle1, needle2, a, b, nullptr, nullptr,
-                                      HaystackOverlap::Overlapping);
+    return Check2x2x16Bytes<TValue>(needle1, needle2, a, b, nullptr, nullptr,
+                                    HaystackOverlap::Overlapping);
   }
 
   // Get the initial unaligned load out of the way. This will likely overlap
   // with the aligned stuff below, but the overlapped part should effectively
   // be free.
   __m128i haystack = _mm_loadu_si128(Cast128(cur));
-  __m128i cmp1 = CmpEq128<CharType>(needle1, haystack);
-  __m128i cmp2 = CmpEq128<CharType>(needle2, haystack);
+  __m128i cmp1 = CmpEq128<TValue>(needle1, haystack);
+  __m128i cmp2 = CmpEq128<TValue>(needle2, haystack);
   int cmpMask1 = _mm_movemask_epi8(cmp1);
   int cmpMask2 = _mm_movemask_epi8(cmp2);
-  int cmpMask = (cmpMask1 << sizeof(CharType)) & cmpMask2;
+  int cmpMask = (cmpMask1 << sizeof(TValue)) & cmpMask2;
   if (cmpMask) {
-    return reinterpret_cast<const CharType*>(cur + __builtin_ctz(cmpMask) -
-                                             sizeof(CharType));
+    return reinterpret_cast<const TValue*>(cur + __builtin_ctz(cmpMask) -
+                                           sizeof(TValue));
   }
 
   // Now we're working with aligned memory. Hooray! \o/
@@ -595,9 +615,9 @@ const CharType* FindTwoInBuffer(const CharType* ptr, CharType v1, CharType v2,
   while (cur < tailStartPtr) {
     uintptr_t a = cur;
     uintptr_t b = cur + 16;
-    const CharType* result =
-        Check2x2x16Bytes<CharType>(needle1, needle2, a, b, &cmpMaskCarry,
-                                   &cmpMaskCarry, HaystackOverlap::Sequential);
+    const TValue* result =
+        Check2x2x16Bytes<TValue>(needle1, needle2, a, b, &cmpMaskCarry,
+                                 &cmpMaskCarry, HaystackOverlap::Sequential);
     if (result) {
       return result;
    }
@@ -609,8 +629,8 @@ const CharType* FindTwoInBuffer(const CharType* ptr, CharType v1, CharType v2,
   cmpMaskCarry = _mm_and_si128(cmpMaskCarry, wideCarry);
   uintptr_t a = tailStartPtr;
   uintptr_t b = tailEndPtr;
-  return Check2x2x16Bytes<CharType>(needle1, needle2, a, b, &cmpMaskCarry,
-                                    nullptr, HaystackOverlap::Overlapping);
+  return Check2x2x16Bytes<TValue>(needle1, needle2, a, b, &cmpMaskCarry,
+                                  nullptr, HaystackOverlap::Overlapping);
 }
 
 const char* SIMD::memchr8SSE2(const char* ptr, char value, size_t length) {
@@ -647,6 +667,14 @@ const char16_t* SIMD::memchr16(const char16_t* ptr, char16_t value,
   return memchr16SSE2(ptr, value, length);
 }
 
+const uint64_t* SIMD::memchr64(const uint64_t* ptr, uint64_t value,
+                               size_t length) {
+  if (supports_avx2()) {
+    return FindInBufferAVX2<uint64_t>(ptr, value, length);
+  }
+  return FindInBufferNaive<uint64_t>(ptr, value, length);
+}
+
 const char* SIMD::memchr2x8(const char* ptr, char v1, char v2, size_t length) {
   // Signed chars are just really annoying to do bit logic with. Convert to
   // unsigned at the outermost scope so we don't have to worry about it.
@@ -679,14 +707,7 @@ const char* SIMD::memchr8SSE2(const char* ptr, char value, size_t length) {
 
 const char16_t* SIMD::memchr16(const char16_t* ptr, char16_t value,
                                size_t length) {
-  const char16_t* end = ptr + length;
-  while (ptr < end) {
-    if (*ptr == value) {
-      return ptr;
-    }
-    ptr++;
-  }
-  return nullptr;
+  return FindInBufferNaive<char16_t>(ptr, value, length);
 }
 
 const char16_t* SIMD::memchr16SSE2(const char16_t* ptr, char16_t value,
@@ -694,6 +715,11 @@ const char16_t* SIMD::memchr16SSE2(const char16_t* ptr, char16_t value,
   return memchr16(ptr, value, length);
 }
 
+const uint64_t* SIMD::memchr64(const uint64_t* ptr, uint64_t value,
+                               size_t length) {
+  return FindInBufferNaive<uint64_t>(ptr, value, length);
+}
+
 const char* SIMD::memchr2x8(const char* ptr, char v1, char v2, size_t length) {
   const char* end = ptr + length - 1;
   while (ptr < end) {
mfbt/SIMD.h

@@ -46,6 +46,11 @@ class SIMD {
   static MFBT_API const char16_t* memchr16SSE2(const char16_t* ptr,
                                                char16_t value, size_t length);
 
+  // Search through `ptr[0..length]` for the first occurrence of `value` and
+  // return the pointer to it, or nullptr if it cannot be found.
+  static MFBT_API const uint64_t* memchr64(const uint64_t* ptr, uint64_t value,
+                                           size_t length);
+
   // Search through `ptr[0..length]` for the first occurrence of `v1` which is
   // immediately followed by `v2` and return the pointer to the occurrence of
   // `v1`.
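For orientation, the new entry point is called like the existing memchr8/memchr16 helpers. A hypothetical caller (not part of the patch; assumes mfbt exports this header as "mozilla/SIMD.h" like the other mfbt headers):

#include <cstddef>
#include <cstdint>

#include "mozilla/SIMD.h"

// Hypothetical use: does a 64-bit key occur anywhere in a table of IDs?
bool ContainsId(const uint64_t* ids, size_t count, uint64_t key) {
  return mozilla::SIMD::memchr64(ids, key, count) != nullptr;
}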
mfbt/tests/TestSIMD.cpp

@@ -105,8 +105,8 @@ void TestLongString() {
     MOZ_RELEASE_ASSERT(
         SIMD::memchr8SSE2(test, static_cast<char>(i), count - 1) == test + i);
   }
-  MOZ_RELEASE_ASSERT(SIMD::memchr8(test, static_cast<char>(count - 1),
-                                   count - 1) == nullptr);
+  MOZ_RELEASE_ASSERT(
+      SIMD::memchr8(test, static_cast<char>(count - 1), count - 1) == nullptr);
 }
 
 void TestGauntlet() {
@@ -124,8 +124,8 @@ void TestGauntlet() {
         if (j >= k && j < i) {
           expected = test + j;
         }
-        MOZ_RELEASE_ASSERT(SIMD::memchr8(test + k, static_cast<char>(j),
-                                         i - k) == expected);
+        MOZ_RELEASE_ASSERT(
+            SIMD::memchr8(test + k, static_cast<char>(j), i - k) == expected);
         MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test + k, static_cast<char>(j),
                                              i - k) == expected);
       }
@@ -221,8 +221,8 @@ void TestLongString16() {
   }
 
   for (size_t i = 0; i < count - 1; ++i) {
-    MOZ_RELEASE_ASSERT(SIMD::memchr16(test, static_cast<char16_t>(i),
-                                      count - 1) == test + i);
+    MOZ_RELEASE_ASSERT(
+        SIMD::memchr16(test, static_cast<char16_t>(i), count - 1) == test + i);
     MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, static_cast<char16_t>(i),
                                           count - 1) == test + i);
   }
@@ -245,9 +245,8 @@ void TestGauntlet16() {
         if (j >= k && j < i) {
          expected = test + j;
         }
-        MOZ_RELEASE_ASSERT(SIMD::memchr16(test + k,
-                                          static_cast<char16_t>(j),
-                                          i - k) == expected);
+        MOZ_RELEASE_ASSERT(SIMD::memchr16(test + k, static_cast<char16_t>(j),
+                                          i - k) == expected);
         MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test + k,
                                               static_cast<char16_t>(j),
                                               i - k) == expected);
@@ -257,6 +256,113 @@ void TestGauntlet16() {
   }
 }
 
+void TestTinyString64() {
+  const uint64_t test[4] = {0, 1, 2, 3};
+
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 0, 3) == test + 0x0);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 1, 3) == test + 0x1);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 2, 3) == test + 0x2);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 3, 3) == nullptr);
+}
+
+void TestShortString64() {
+  const uint64_t test[16] = {0, 1, 2,  3,  4,  5,  6,  7,
+                             8, 9, 10, 11, 12, 13, 14, 15};
+
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 0, 15) == test + 0);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 1, 15) == test + 1);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 2, 15) == test + 2);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 3, 15) == test + 3);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 4, 15) == test + 4);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 5, 15) == test + 5);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 6, 15) == test + 6);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 7, 15) == test + 7);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 8, 15) == test + 8);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 9, 15) == test + 9);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 9, 15) == test + 9);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 10, 15) == test + 10);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 11, 15) == test + 11);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 12, 15) == test + 12);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 13, 15) == test + 13);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 14, 15) == test + 14);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 15, 15) == nullptr);
+}
+
+void TestMediumString64() {
+  const uint64_t test[32] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10,
+                             11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+                             22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 0, 31) == test + 0);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 1, 31) == test + 1);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 2, 31) == test + 2);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 3, 31) == test + 3);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 4, 31) == test + 4);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 5, 31) == test + 5);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 6, 31) == test + 6);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 7, 31) == test + 7);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 8, 31) == test + 8);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 9, 31) == test + 9);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 9, 31) == test + 9);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 10, 31) == test + 10);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 11, 31) == test + 11);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 12, 31) == test + 12);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 13, 31) == test + 13);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 14, 31) == test + 14);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 15, 31) == test + 15);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 16, 31) == test + 16);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 17, 31) == test + 17);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 18, 31) == test + 18);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 19, 31) == test + 19);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 20, 31) == test + 20);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 21, 31) == test + 21);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 22, 31) == test + 22);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 23, 31) == test + 23);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 24, 31) == test + 24);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 25, 31) == test + 25);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 26, 31) == test + 26);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 27, 31) == test + 27);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 28, 31) == test + 28);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 29, 31) == test + 29);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 30, 31) == test + 30);
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, 31, 31) == nullptr);
+}
+
+void TestLongString64() {
+  const size_t count = 256;
+  uint64_t test[count];
+  for (size_t i = 0; i < count; ++i) {
+    test[i] = i;
+  }
+
+  for (uint64_t i = 0; i < count - 1; ++i) {
+    MOZ_RELEASE_ASSERT(SIMD::memchr64(test, i, count - 1) == test + i);
+  }
+  MOZ_RELEASE_ASSERT(SIMD::memchr64(test, count - 1, count - 1) == nullptr);
+}
+
+void TestGauntlet64() {
+  const size_t count = 257;
+  uint64_t test[count];
+  for (size_t i = 0; i < count; ++i) {
+    test[i] = i;
+  }
+
+  for (uint64_t i = 0; i < count - 1; ++i) {
+    for (uint64_t j = 0; j < count - 1; ++j) {
+      for (uint64_t k = 0; k < count - 1; ++k) {
+        if (i >= k) {
+          const uint64_t* expected = nullptr;
+          if (j >= k && j < i) {
+            expected = test + j;
+          }
+          MOZ_RELEASE_ASSERT(SIMD::memchr64(test + k, j, i - k) == expected);
+        }
+      }
+    }
+  }
+}
+
 void TestTinyString2x8() {
   const char* test = "012\n";
 
@@ -498,6 +604,12 @@ int main(void) {
   TestLongString16();
   TestGauntlet16();
 
+  TestTinyString64();
+  TestShortString64();
+  TestMediumString64();
+  TestLongString64();
+  TestGauntlet64();
+
   TestTinyString2x8();
   TestShortString2x8();
   TestMediumString2x8();