Bug 1779807 - Support AVX2 for SIMD memchr r=iain

This showed a modest improvement in the geomean of my benchmarking, but
importantly it showed a consistent and relatively strong improvement across
all of the cases which I would guess are more realistic. Notably this change
makes it perform better at iteratively searching for the next occurrence of X
in the HTML of a large web page.

Differential Revision: https://phabricator.services.mozilla.com/D152296
This commit is contained in:
Doug Thayer 2022-07-29 03:26:06 +00:00
Родитель 7355d8d027
Коммит 1f10d44d74
4 изменённых файлов: 319 добавлений и 11 удалений

Просмотреть файл

@ -9,6 +9,7 @@
#include <stdint.h> #include <stdint.h>
#include <type_traits> #include <type_traits>
#include "mozilla/EndianUtils.h"
#include "mozilla/SSE.h" #include "mozilla/SSE.h"
namespace mozilla { namespace mozilla {
@ -21,6 +22,10 @@ const __m128i* Cast128(uintptr_t ptr) {
return reinterpret_cast<const __m128i*>(ptr); return reinterpret_cast<const __m128i*>(ptr);
} }
const __m256i* Cast256(uintptr_t ptr) {
return reinterpret_cast<const __m256i*>(ptr);
}
template <typename T> template <typename T>
T GetAs(uintptr_t ptr) { T GetAs(uintptr_t ptr) {
return *reinterpret_cast<const T*>(ptr); return *reinterpret_cast<const T*>(ptr);
@ -32,6 +37,10 @@ uintptr_t AlignDown16(uintptr_t ptr) { return ptr & ~0xf; }
uintptr_t AlignUp16(uintptr_t ptr) { return AlignDown16(ptr + 0xf); } uintptr_t AlignUp16(uintptr_t ptr) { return AlignDown16(ptr + 0xf); }
uintptr_t AlignDown32(uintptr_t ptr) { return ptr & ~0x1f; }
uintptr_t AlignUp32(uintptr_t ptr) { return AlignDown32(ptr + 0x1f); }
template <typename CharType> template <typename CharType>
__m128i CmpEq128(__m128i a, __m128i b) { __m128i CmpEq128(__m128i a, __m128i b) {
static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2); static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2);
@ -41,6 +50,15 @@ __m128i CmpEq128(__m128i a, __m128i b) {
return _mm_cmpeq_epi16(a, b); return _mm_cmpeq_epi16(a, b);
} }
template <typename CharType>
__m256i CmpEq256(__m256i a, __m256i b) {
static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2);
if (sizeof(CharType) == 1) {
return _mm256_cmpeq_epi8(a, b);
}
return _mm256_cmpeq_epi16(a, b);
}
# ifdef __GNUC__ # ifdef __GNUC__
// Earlier versions of GCC are missing the _mm_loadu_si32 instruction. This // Earlier versions of GCC are missing the _mm_loadu_si32 instruction. This
@ -53,12 +71,24 @@ __m128i Load32BitsIntoXMM(uintptr_t ptr) {
return _mm_cvtsi32_si128(tmp); // efficient on GCC/clang/MSVC return _mm_cvtsi32_si128(tmp); // efficient on GCC/clang/MSVC
} }
// This is just adapted from the above workaround. Testing this, it also yields
// the correct instructions across all tested compilers.
__m128i Load64BitsIntoXMM(uintptr_t ptr) {
int64_t tmp;
memcpy(&tmp, reinterpret_cast<const void*>(ptr), sizeof(tmp));
return _mm_cvtsi64_si128(tmp);
}
# else # else
__m128i Load32BitsIntoXMM(uintptr_t ptr) { __m128i Load32BitsIntoXMM(uintptr_t ptr) {
return _mm_loadu_si32(Cast128(ptr)); return _mm_loadu_si32(Cast128(ptr));
} }
__m128i Load64BitsIntoXMM(uintptr_t ptr) {
return _mm_loadu_si64(Cast128(ptr));
}
# endif # endif
const char* Check4x4Chars(__m128i needle, uintptr_t a, uintptr_t b, uintptr_t c, const char* Check4x4Chars(__m128i needle, uintptr_t a, uintptr_t b, uintptr_t c,
@ -98,6 +128,44 @@ const char* Check4x4Chars(__m128i needle, uintptr_t a, uintptr_t b, uintptr_t c,
return nullptr; return nullptr;
} }
template <typename CharType>
const CharType* Check4x8Bytes(__m128i needle, uintptr_t a, uintptr_t b,
uintptr_t c, uintptr_t d) {
__m128i haystackA = Load64BitsIntoXMM(a);
__m128i cmpA = CmpEq128<CharType>(needle, haystackA);
__m128i haystackB = Load64BitsIntoXMM(b);
__m128i cmpB = CmpEq128<CharType>(needle, haystackB);
__m128i haystackC = Load64BitsIntoXMM(c);
__m128i cmpC = CmpEq128<CharType>(needle, haystackC);
__m128i haystackD = Load64BitsIntoXMM(d);
__m128i cmpD = CmpEq128<CharType>(needle, haystackD);
__m128i or_ab = _mm_or_si128(cmpA, cmpB);
__m128i or_cd = _mm_or_si128(cmpC, cmpD);
__m128i or_abcd = _mm_or_si128(or_ab, or_cd);
int orMask = _mm_movemask_epi8(or_abcd);
if (orMask & 0xff) {
int cmpMask;
cmpMask = _mm_movemask_epi8(cmpA);
if (cmpMask & 0xff) {
return reinterpret_cast<const CharType*>(a + __builtin_ctz(cmpMask));
}
cmpMask = _mm_movemask_epi8(cmpB);
if (cmpMask & 0xff) {
return reinterpret_cast<const CharType*>(b + __builtin_ctz(cmpMask));
}
cmpMask = _mm_movemask_epi8(cmpC);
if (cmpMask & 0xff) {
return reinterpret_cast<const CharType*>(c + __builtin_ctz(cmpMask));
}
cmpMask = _mm_movemask_epi8(cmpD);
if (cmpMask & 0xff) {
return reinterpret_cast<const CharType*>(d + __builtin_ctz(cmpMask));
}
}
return nullptr;
}
template <typename CharType> template <typename CharType>
const CharType* Check4x16Bytes(__m128i needle, uintptr_t a, uintptr_t b, const CharType* Check4x16Bytes(__m128i needle, uintptr_t a, uintptr_t b,
uintptr_t c, uintptr_t d) { uintptr_t c, uintptr_t d) {
@ -136,6 +204,44 @@ const CharType* Check4x16Bytes(__m128i needle, uintptr_t a, uintptr_t b,
return nullptr; return nullptr;
} }
template <typename CharType>
const CharType* Check4x32Bytes(__m256i needle, uintptr_t a, uintptr_t b,
uintptr_t c, uintptr_t d) {
__m256i haystackA = _mm256_loadu_si256(Cast256(a));
__m256i cmpA = CmpEq256<CharType>(needle, haystackA);
__m256i haystackB = _mm256_loadu_si256(Cast256(b));
__m256i cmpB = CmpEq256<CharType>(needle, haystackB);
__m256i haystackC = _mm256_loadu_si256(Cast256(c));
__m256i cmpC = CmpEq256<CharType>(needle, haystackC);
__m256i haystackD = _mm256_loadu_si256(Cast256(d));
__m256i cmpD = CmpEq256<CharType>(needle, haystackD);
__m256i or_ab = _mm256_or_si256(cmpA, cmpB);
__m256i or_cd = _mm256_or_si256(cmpC, cmpD);
__m256i or_abcd = _mm256_or_si256(or_ab, or_cd);
int orMask = _mm256_movemask_epi8(or_abcd);
if (orMask) {
int cmpMask;
cmpMask = _mm256_movemask_epi8(cmpA);
if (cmpMask) {
return reinterpret_cast<const CharType*>(a + __builtin_ctz(cmpMask));
}
cmpMask = _mm256_movemask_epi8(cmpB);
if (cmpMask) {
return reinterpret_cast<const CharType*>(b + __builtin_ctz(cmpMask));
}
cmpMask = _mm256_movemask_epi8(cmpC);
if (cmpMask) {
return reinterpret_cast<const CharType*>(c + __builtin_ctz(cmpMask));
}
cmpMask = _mm256_movemask_epi8(cmpD);
if (cmpMask) {
return reinterpret_cast<const CharType*>(d + __builtin_ctz(cmpMask));
}
}
return nullptr;
}
enum class HaystackOverlap { enum class HaystackOverlap {
Overlapping, Overlapping,
Sequential, Sequential,
@ -299,6 +405,93 @@ const CharType* FindInBuffer(const CharType* ptr, CharType value,
return Check4x16Bytes<CharType>(needle, a, b, c, d); return Check4x16Bytes<CharType>(needle, a, b, c, d);
} }
template <typename CharType>
const CharType* FindInBufferAVX2(const CharType* ptr, CharType value,
size_t length) {
static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2);
static_assert(std::is_unsigned<CharType>::value);
// Load our needle into a 32-byte register
__m256i needle;
if (sizeof(CharType) == 1) {
needle = _mm256_set1_epi8(value);
} else {
needle = _mm256_set1_epi16(value);
}
size_t numBytes = length * sizeof(CharType);
uintptr_t cur = reinterpret_cast<uintptr_t>(ptr);
uintptr_t end = cur + numBytes;
if (numBytes < 8) {
while (cur < end) {
if (GetAs<CharType>(cur) == value) {
return reinterpret_cast<const CharType*>(cur);
}
cur += sizeof(CharType);
}
return nullptr;
}
if (numBytes < 32) {
__m128i needle_narrow;
if (sizeof(CharType) == 1) {
needle_narrow = _mm_set1_epi8(value);
} else {
needle_narrow = _mm_set1_epi16(value);
}
uintptr_t a = cur;
uintptr_t b = cur + ((numBytes & 16) >> 1);
uintptr_t c = end - 8 - ((numBytes & 16) >> 1);
uintptr_t d = end - 8;
return Check4x8Bytes<CharType>(needle_narrow, a, b, c, d);
}
if (numBytes < 128) {
// NOTE: see the above explanation of the similar chunk of code, but in
// this case, replace 16 with 64 and 8 with 32.
uintptr_t a = cur;
uintptr_t b = cur + ((numBytes & 64) >> 1);
uintptr_t c = end - 32 - ((numBytes & 64) >> 1);
uintptr_t d = end - 32;
return Check4x32Bytes<CharType>(needle, a, b, c, d);
}
// Get the initial unaligned load out of the way. This will overlap with the
// aligned stuff below, but the overlapped part should effectively be free
// (relative to a mispredict from doing a byte-by-byte loop).
__m256i haystack = _mm256_loadu_si256(Cast256(cur));
__m256i cmp = CmpEq256<CharType>(needle, haystack);
int cmpMask = _mm256_movemask_epi8(cmp);
if (cmpMask) {
return reinterpret_cast<const CharType*>(cur + __builtin_ctz(cmpMask));
}
// Now we're working with aligned memory. Hooray! \o/
cur = AlignUp32(cur);
uintptr_t tailStartPtr = AlignDown32(end - 96);
uintptr_t tailEndPtr = end - 32;
while (cur < tailStartPtr) {
uintptr_t a = cur;
uintptr_t b = cur + 32;
uintptr_t c = cur + 64;
uintptr_t d = cur + 96;
const CharType* result = Check4x32Bytes<CharType>(needle, a, b, c, d);
if (result) {
return result;
}
cur += 128;
}
uintptr_t a = tailStartPtr;
uintptr_t b = tailStartPtr + 32;
uintptr_t c = tailStartPtr + 64;
uintptr_t d = tailEndPtr;
return Check4x32Bytes<CharType>(needle, a, b, c, d);
}
template <typename CharType> template <typename CharType>
const CharType* TwoByteLoop(uintptr_t start, uintptr_t end, CharType v1, const CharType* TwoByteLoop(uintptr_t start, uintptr_t end, CharType v1,
CharType v2); CharType v2);
@ -420,7 +613,7 @@ const CharType* FindTwoInBuffer(const CharType* ptr, CharType v1, CharType v2,
nullptr, HaystackOverlap::Overlapping); nullptr, HaystackOverlap::Overlapping);
} }
const char* SIMD::memchr8(const char* ptr, char value, size_t length) { const char* SIMD::memchr8SSE2(const char* ptr, char value, size_t length) {
// Signed chars are just really annoying to do bit logic with. Convert to // Signed chars are just really annoying to do bit logic with. Convert to
// unsigned at the outermost scope so we don't have to worry about it. // unsigned at the outermost scope so we don't have to worry about it.
const unsigned char* uptr = reinterpret_cast<const unsigned char*>(ptr); const unsigned char* uptr = reinterpret_cast<const unsigned char*>(ptr);
@ -430,9 +623,28 @@ const char* SIMD::memchr8(const char* ptr, char value, size_t length) {
return reinterpret_cast<const char*>(uresult); return reinterpret_cast<const char*>(uresult);
} }
const char* SIMD::memchr8(const char* ptr, char value, size_t length) {
if (supports_avx2()) {
const unsigned char* uptr = reinterpret_cast<const unsigned char*>(ptr);
unsigned char uvalue = static_cast<unsigned char>(value);
const unsigned char* uresult =
FindInBufferAVX2<unsigned char>(uptr, uvalue, length);
return reinterpret_cast<const char*>(uresult);
}
return memchr8SSE2(ptr, value, length);
}
const char16_t* SIMD::memchr16SSE2(const char16_t* ptr, char16_t value,
size_t length) {
return FindInBuffer<char16_t>(ptr, value, length);
}
const char16_t* SIMD::memchr16(const char16_t* ptr, char16_t value, const char16_t* SIMD::memchr16(const char16_t* ptr, char16_t value,
size_t length) { size_t length) {
return FindInBuffer<char16_t>(ptr, value, length); if (supports_avx2()) {
return FindInBufferAVX2<char16_t>(ptr, value, length);
}
return memchr16SSE2(ptr, value, length);
} }
const char* SIMD::memchr2x8(const char* ptr, char v1, char v2, size_t length) { const char* SIMD::memchr2x8(const char* ptr, char v1, char v2, size_t length) {
@ -461,6 +673,10 @@ const char* SIMD::memchr8(const char* ptr, char value, size_t length) {
return reinterpret_cast<const char*>(result); return reinterpret_cast<const char*>(result);
} }
const char* SIMD::memchr8SSE2(const char* ptr, char value, size_t length) {
return memchr8(ptr, value, length);
}
const char16_t* SIMD::memchr16(const char16_t* ptr, char16_t value, const char16_t* SIMD::memchr16(const char16_t* ptr, char16_t value,
size_t length) { size_t length) {
const char16_t* end = ptr + length; const char16_t* end = ptr + length;
@ -473,6 +689,11 @@ const char16_t* SIMD::memchr16(const char16_t* ptr, char16_t value,
return nullptr; return nullptr;
} }
const char16_t* SIMD::memchr16SSE2(const char16_t* ptr, char16_t value,
size_t length) {
return memchr16(ptr, value, length);
}
const char* SIMD::memchr2x8(const char* ptr, char v1, char v2, size_t length) { const char* SIMD::memchr2x8(const char* ptr, char v1, char v2, size_t length) {
const char* end = ptr + length - 1; const char* end = ptr + length - 1;
while (ptr < end) { while (ptr < end) {

Просмотреть файл

@ -33,11 +33,19 @@ class SIMD {
static MFBT_API const char* memchr8(const char* ptr, char value, static MFBT_API const char* memchr8(const char* ptr, char value,
size_t length); size_t length);
// This function just restricts our execution to the SSE2 path
static MFBT_API const char* memchr8SSE2(const char* ptr, char value,
size_t length);
// Search through `ptr[0..length]` for the first occurrence of `value` and // Search through `ptr[0..length]` for the first occurrence of `value` and
// return the pointer to it, or nullptr if it cannot be found. // return the pointer to it, or nullptr if it cannot be found.
static MFBT_API const char16_t* memchr16(const char16_t* ptr, char16_t value, static MFBT_API const char16_t* memchr16(const char16_t* ptr, char16_t value,
size_t length); size_t length);
// This function just restricts our execution to the SSE2 path
static MFBT_API const char16_t* memchr16SSE2(const char16_t* ptr,
char16_t value, size_t length);
// Search through `ptr[0..length]` for the first occurrence of `v1` which is // Search through `ptr[0..length]` for the first occurrence of `v1` which is
// immediately followed by `v2` and return the pointer to the occurrence of // immediately followed by `v2` and return the pointer to the occurrence of
// `v1`. // `v1`.

Просмотреть файл

@ -177,13 +177,17 @@ UNIFIED_SOURCES += [
"Poison.cpp", "Poison.cpp",
"RandomNum.cpp", "RandomNum.cpp",
"SHA1.cpp", "SHA1.cpp",
"SIMD.cpp",
"TaggedAnonymousMemory.cpp", "TaggedAnonymousMemory.cpp",
"UniquePtrExtensions.cpp", "UniquePtrExtensions.cpp",
"Unused.cpp", "Unused.cpp",
"Utf8.cpp", "Utf8.cpp",
] ]
SOURCES += [
"SIMD.cpp",
]
SOURCES["SIMD.cpp"].flags += ["-mavx2"]
if CONFIG["CPU_ARCH"].startswith("x86"): if CONFIG["CPU_ARCH"].startswith("x86"):
SOURCES += [ SOURCES += [
"SSE.cpp", "SSE.cpp",

Просмотреть файл

@ -13,47 +13,79 @@ void TestTinyString() {
const char* test = "012\n"; const char* test = "012\n";
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '0', 3) == test + 0x0); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '0', 3) == test + 0x0);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, '0', 3) == test + 0x0);
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '1', 3) == test + 0x1); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '1', 3) == test + 0x1);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, '1', 3) == test + 0x1);
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '2', 3) == test + 0x2); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '2', 3) == test + 0x2);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, '2', 3) == test + 0x2);
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '\n', 3) == nullptr); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '\n', 3) == nullptr);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, '\n', 3) == nullptr);
} }
void TestShortString() { void TestShortString() {
const char* test = "0123456789\n"; const char* test = "0123456789\n";
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '0', 10) == test + 0x0); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '0', 10) == test + 0x0);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, '0', 10) == test + 0x0);
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '1', 10) == test + 0x1); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '1', 10) == test + 0x1);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, '1', 10) == test + 0x1);
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '2', 10) == test + 0x2); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '2', 10) == test + 0x2);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, '2', 10) == test + 0x2);
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '3', 10) == test + 0x3); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '3', 10) == test + 0x3);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, '3', 10) == test + 0x3);
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '4', 10) == test + 0x4); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '4', 10) == test + 0x4);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, '4', 10) == test + 0x4);
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '5', 10) == test + 0x5); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '5', 10) == test + 0x5);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, '5', 10) == test + 0x5);
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '6', 10) == test + 0x6); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '6', 10) == test + 0x6);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, '6', 10) == test + 0x6);
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '7', 10) == test + 0x7); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '7', 10) == test + 0x7);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, '7', 10) == test + 0x7);
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '8', 10) == test + 0x8); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '8', 10) == test + 0x8);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, '8', 10) == test + 0x8);
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '9', 10) == test + 0x9); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '9', 10) == test + 0x9);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, '9', 10) == test + 0x9);
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '\n', 10) == nullptr); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '\n', 10) == nullptr);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, '\n', 10) == nullptr);
} }
void TestMediumString() { void TestMediumString() {
const char* test = "0123456789abcdef\n"; const char* test = "0123456789abcdef\n";
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '0', 16) == test + 0x0); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '0', 16) == test + 0x0);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, '0', 16) == test + 0x0);
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '1', 16) == test + 0x1); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '1', 16) == test + 0x1);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, '1', 16) == test + 0x1);
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '2', 16) == test + 0x2); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '2', 16) == test + 0x2);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, '2', 16) == test + 0x2);
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '3', 16) == test + 0x3); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '3', 16) == test + 0x3);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, '3', 16) == test + 0x3);
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '4', 16) == test + 0x4); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '4', 16) == test + 0x4);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, '4', 16) == test + 0x4);
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '5', 16) == test + 0x5); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '5', 16) == test + 0x5);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, '5', 16) == test + 0x5);
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '6', 16) == test + 0x6); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '6', 16) == test + 0x6);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, '6', 16) == test + 0x6);
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '7', 16) == test + 0x7); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '7', 16) == test + 0x7);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, '7', 16) == test + 0x7);
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '8', 16) == test + 0x8); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '8', 16) == test + 0x8);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, '8', 16) == test + 0x8);
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '9', 16) == test + 0x9); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '9', 16) == test + 0x9);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, '9', 16) == test + 0x9);
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, 'a', 16) == test + 0xa); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, 'a', 16) == test + 0xa);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, 'a', 16) == test + 0xa);
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, 'b', 16) == test + 0xb); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, 'b', 16) == test + 0xb);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, 'b', 16) == test + 0xb);
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, 'c', 16) == test + 0xc); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, 'c', 16) == test + 0xc);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, 'c', 16) == test + 0xc);
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, 'd', 16) == test + 0xd); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, 'd', 16) == test + 0xd);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, 'd', 16) == test + 0xd);
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, 'e', 16) == test + 0xe); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, 'e', 16) == test + 0xe);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, 'e', 16) == test + 0xe);
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, 'f', 16) == test + 0xf); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, 'f', 16) == test + 0xf);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, 'f', 16) == test + 0xf);
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '\n', 16) == nullptr); MOZ_RELEASE_ASSERT(SIMD::memchr8(test, '\n', 16) == nullptr);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test, '\n', 16) == nullptr);
} }
void TestLongString() { void TestLongString() {
@ -70,9 +102,11 @@ void TestLongString() {
for (size_t i = 0; i < count - 1; ++i) { for (size_t i = 0; i < count - 1; ++i) {
MOZ_RELEASE_ASSERT(SIMD::memchr8(test, static_cast<char>(i), count - 1) == MOZ_RELEASE_ASSERT(SIMD::memchr8(test, static_cast<char>(i), count - 1) ==
test + i); test + i);
MOZ_RELEASE_ASSERT(
SIMD::memchr8SSE2(test, static_cast<char>(i), count - 1) == test + i);
} }
MOZ_RELEASE_ASSERT( MOZ_RELEASE_ASSERT(SIMD::memchr8(test, static_cast<char>(count - 1),
SIMD::memchr8(test, static_cast<char>(count - 1), count - 1) == nullptr); count - 1) == nullptr);
} }
void TestGauntlet() { void TestGauntlet() {
@ -90,8 +124,10 @@ void TestGauntlet() {
if (j >= k && j < i) { if (j >= k && j < i) {
expected = test + j; expected = test + j;
} }
MOZ_RELEASE_ASSERT( MOZ_RELEASE_ASSERT(SIMD::memchr8(test + k, static_cast<char>(j),
SIMD::memchr8(test + k, static_cast<char>(j), i - k) == expected); i - k) == expected);
MOZ_RELEASE_ASSERT(SIMD::memchr8SSE2(test + k, static_cast<char>(j),
i - k) == expected);
} }
} }
} }
@ -102,47 +138,79 @@ void TestTinyString16() {
const char16_t* test = u"012\n"; const char16_t* test = u"012\n";
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'0', 3) == test + 0x0); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'0', 3) == test + 0x0);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'0', 3) == test + 0x0);
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'1', 3) == test + 0x1); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'1', 3) == test + 0x1);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'1', 3) == test + 0x1);
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'2', 3) == test + 0x2); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'2', 3) == test + 0x2);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'2', 3) == test + 0x2);
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'\n', 3) == nullptr); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'\n', 3) == nullptr);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'\n', 3) == nullptr);
} }
void TestShortString16() { void TestShortString16() {
const char16_t* test = u"0123456789\n"; const char16_t* test = u"0123456789\n";
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'0', 10) == test + 0x0); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'0', 10) == test + 0x0);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'0', 10) == test + 0x0);
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'1', 10) == test + 0x1); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'1', 10) == test + 0x1);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'1', 10) == test + 0x1);
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'2', 10) == test + 0x2); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'2', 10) == test + 0x2);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'2', 10) == test + 0x2);
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'3', 10) == test + 0x3); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'3', 10) == test + 0x3);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'3', 10) == test + 0x3);
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'4', 10) == test + 0x4); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'4', 10) == test + 0x4);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'4', 10) == test + 0x4);
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'5', 10) == test + 0x5); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'5', 10) == test + 0x5);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'5', 10) == test + 0x5);
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'6', 10) == test + 0x6); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'6', 10) == test + 0x6);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'6', 10) == test + 0x6);
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'7', 10) == test + 0x7); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'7', 10) == test + 0x7);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'7', 10) == test + 0x7);
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'8', 10) == test + 0x8); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'8', 10) == test + 0x8);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'8', 10) == test + 0x8);
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'9', 10) == test + 0x9); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'9', 10) == test + 0x9);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'9', 10) == test + 0x9);
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'\n', 10) == nullptr); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'\n', 10) == nullptr);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'\n', 10) == nullptr);
} }
void TestMediumString16() { void TestMediumString16() {
const char16_t* test = u"0123456789abcdef\n"; const char16_t* test = u"0123456789abcdef\n";
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'0', 16) == test + 0x0); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'0', 16) == test + 0x0);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'0', 16) == test + 0x0);
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'1', 16) == test + 0x1); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'1', 16) == test + 0x1);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'1', 16) == test + 0x1);
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'2', 16) == test + 0x2); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'2', 16) == test + 0x2);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'2', 16) == test + 0x2);
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'3', 16) == test + 0x3); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'3', 16) == test + 0x3);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'3', 16) == test + 0x3);
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'4', 16) == test + 0x4); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'4', 16) == test + 0x4);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'4', 16) == test + 0x4);
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'5', 16) == test + 0x5); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'5', 16) == test + 0x5);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'5', 16) == test + 0x5);
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'6', 16) == test + 0x6); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'6', 16) == test + 0x6);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'6', 16) == test + 0x6);
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'7', 16) == test + 0x7); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'7', 16) == test + 0x7);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'7', 16) == test + 0x7);
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'8', 16) == test + 0x8); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'8', 16) == test + 0x8);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'8', 16) == test + 0x8);
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'9', 16) == test + 0x9); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'9', 16) == test + 0x9);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'9', 16) == test + 0x9);
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'a', 16) == test + 0xa); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'a', 16) == test + 0xa);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'a', 16) == test + 0xa);
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'b', 16) == test + 0xb); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'b', 16) == test + 0xb);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'b', 16) == test + 0xb);
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'c', 16) == test + 0xc); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'c', 16) == test + 0xc);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'c', 16) == test + 0xc);
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'd', 16) == test + 0xd); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'd', 16) == test + 0xd);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'd', 16) == test + 0xd);
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'e', 16) == test + 0xe); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'e', 16) == test + 0xe);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'e', 16) == test + 0xe);
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'f', 16) == test + 0xf); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'f', 16) == test + 0xf);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'f', 16) == test + 0xf);
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'\n', 16) == nullptr); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, u'\n', 16) == nullptr);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, u'\n', 16) == nullptr);
} }
void TestLongString16() { void TestLongString16() {
@ -153,10 +221,13 @@ void TestLongString16() {
} }
for (size_t i = 0; i < count - 1; ++i) { for (size_t i = 0; i < count - 1; ++i) {
MOZ_RELEASE_ASSERT( MOZ_RELEASE_ASSERT(SIMD::memchr16(test, static_cast<char16_t>(i),
SIMD::memchr16(test, static_cast<char16_t>(i), count - 1) == test + i); count - 1) == test + i);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, static_cast<char16_t>(i),
count - 1) == test + i);
} }
MOZ_RELEASE_ASSERT(SIMD::memchr16(test, count - 1, count - 1) == nullptr); MOZ_RELEASE_ASSERT(SIMD::memchr16(test, count - 1, count - 1) == nullptr);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test, count - 1, count - 1) == nullptr);
} }
void TestGauntlet16() { void TestGauntlet16() {
@ -174,8 +245,12 @@ void TestGauntlet16() {
if (j >= k && j < i) { if (j >= k && j < i) {
expected = test + j; expected = test + j;
} }
MOZ_RELEASE_ASSERT(SIMD::memchr16(test + k, static_cast<char16_t>(j), MOZ_RELEASE_ASSERT(SIMD::memchr16(test + k,
i - k) == expected); static_cast<char16_t>(j),
i - k) == expected);
MOZ_RELEASE_ASSERT(SIMD::memchr16SSE2(test + k,
static_cast<char16_t>(j),
i - k) == expected);
} }
} }
} }