Vectorize `find_first_of` for 4 and 8 byte elements (#4587)

Co-authored-by: Stephan T. Lavavej <stl@nuwen.net>
This commit is contained in:
Alex Guteniev 2024-04-19 04:53:25 +03:00 committed by GitHub
Parent f54203ff7c
Commit 1b06c5276b
No known key found for this signature
GPG key ID: B5690EEEBB952194
3 changed files with 361 additions and 132 deletions

View file

@ -33,18 +33,14 @@ void bm(benchmark::State& state) {
}
}
// Registers the (haystack length, needle length) argument pairs shared by all
// element widths. Replaces the former ARGS macro so new element types only need
// one BENCHMARK line each.
void common_args(auto bm) {
    bm->Args({2, 3})->Args({7, 4})->Args({9, 3})->Args({22, 5})->Args({58, 2});
    bm->Args({102, 4})->Args({325, 1})->Args({1011, 11})->Args({3056, 7});
}

BENCHMARK(bm<uint8_t>)->Apply(common_args);
BENCHMARK(bm<uint16_t>)->Apply(common_args);
BENCHMARK(bm<uint32_t>)->Apply(common_args);
BENCHMARK(bm<uint64_t>)->Apply(common_args);

BENCHMARK_MAIN();

View file

@ -62,6 +62,10 @@ const void* __stdcall __std_find_first_of_trivial_1(
const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept;
const void* __stdcall __std_find_first_of_trivial_2(
const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept;
const void* __stdcall __std_find_first_of_trivial_4(
const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept;
const void* __stdcall __std_find_first_of_trivial_8(
const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept;
__declspec(noalias) _Min_max_1i __stdcall __std_minmax_1i(const void* _First, const void* _Last) noexcept;
__declspec(noalias) _Min_max_1u __stdcall __std_minmax_1u(const void* _First, const void* _Last) noexcept;
@ -202,6 +206,12 @@ _Ty1* _Find_first_of_vectorized(
} else if constexpr (sizeof(_Ty1) == 2) {
return const_cast<_Ty1*>(
static_cast<const _Ty1*>(::__std_find_first_of_trivial_2(_First1, _Last1, _First2, _Last2)));
} else if constexpr (sizeof(_Ty1) == 4) {
return const_cast<_Ty1*>(
static_cast<const _Ty1*>(::__std_find_first_of_trivial_4(_First1, _Last1, _First2, _Last2)));
} else if constexpr (sizeof(_Ty1) == 8) {
return const_cast<_Ty1*>(
static_cast<const _Ty1*>(::__std_find_first_of_trivial_8(_First1, _Last1, _First2, _Last2)));
} else {
static_assert(false, "Unexpected size");
}
@ -230,9 +240,7 @@ _INLINE_VAR constexpr ptrdiff_t _Threshold_find_first_of = 16;
// Can we activate the vector algorithms for find_first_of?
// Only requires that value comparison can be replaced with bitwise comparison;
// the 4- and 8-byte element paths removed the former "pcmpestri compatible size" limit.
template <class _It1, class _It2, class _Pr>
constexpr bool _Vector_alg_in_find_first_of_is_safe = _Equal_memcmp_is_safe<_It1, _It2, _Pr>;
// Can we activate the vector algorithms for replace?
template <class _Iter, class _Ty1>

View file

@ -42,9 +42,10 @@ namespace {
};
// Returns a per-dword load/store mask with the low _Count_in_dwords lanes all-ones
// and the rest zero, for use with _mm256_maskload_epi32 and friends.
__m256i _Avx2_tail_mask_32(const size_t _Count_in_dwords) noexcept {
    // _Count_in_dwords must be within [0, 8].
    // Loading 8 dwords starting at offset (8 - count) yields 'count' leading ~0u
    // lanes followed by zeros.
    static constexpr unsigned int _Tail_masks[16] = {
        ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, 0, 0, 0, 0, 0, 0, 0, 0};
    return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(_Tail_masks + (8 - _Count_in_dwords)));
}
} // namespace
#endif // !defined(_M_ARM64EC)
@ -2037,83 +2038,143 @@ namespace {
return _Result;
}
template <class _Ty>
const void* __stdcall __std_find_first_of_trivial_impl(
const void* _First1, const void* const _Last1, const void* const _First2, const void* const _Last2) noexcept {
namespace __std_find_first_of {
template <class _Ty>
const void* __stdcall _Fallback(const void* _First1, const void* const _Last1, const void* const _First2,
const void* const _Last2) noexcept {
auto _Ptr_haystack = static_cast<const _Ty*>(_First1);
const auto _Ptr_haystack_end = static_cast<const _Ty*>(_Last1);
const auto _Ptr_needle = static_cast<const _Ty*>(_First2);
const auto _Ptr_needle_end = static_cast<const _Ty*>(_Last2);
for (; _Ptr_haystack != _Ptr_haystack_end; ++_Ptr_haystack) {
for (auto _Ptr = _Ptr_needle; _Ptr != _Ptr_needle_end; ++_Ptr) {
if (*_Ptr_haystack == *_Ptr) {
return _Ptr_haystack;
}
}
}
return _Ptr_haystack;
}
template <class _Ty>
const void* __stdcall _Impl_pcmpestri(const void* _First1, const void* const _Last1, const void* const _First2,
const void* const _Last2) noexcept {
#ifndef _M_ARM64EC
if (_Use_sse42()) {
constexpr int _Op =
(sizeof(_Ty) == 1 ? _SIDD_UBYTE_OPS : _SIDD_UWORD_OPS) | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT;
constexpr int _Part_size_el = sizeof(_Ty) == 1 ? 16 : 8;
const size_t _Needle_length = _Byte_length(_First2, _Last2);
if (_Use_sse42()) {
constexpr int _Op = (sizeof(_Ty) == 1 ? _SIDD_UBYTE_OPS : _SIDD_UWORD_OPS) | _SIDD_CMP_EQUAL_ANY
| _SIDD_LEAST_SIGNIFICANT;
constexpr int _Part_size_el = sizeof(_Ty) == 1 ? 16 : 8;
const size_t _Needle_length = _Byte_length(_First2, _Last2);
if (_Needle_length <= 16) {
// Special handling of small needle
// The generic branch could also handle it but with slightly worse performance
if (_Needle_length <= 16) {
// Special handling of small needle
// The generic branch could also handle it but with slightly worse performance
const int _Needle_length_el = static_cast<int>(_Needle_length / sizeof(_Ty));
const int _Needle_length_el = static_cast<int>(_Needle_length / sizeof(_Ty));
alignas(16) uint8_t _Tmp1[16];
memcpy(_Tmp1, _First2, _Needle_length);
const __m128i _Needle = _mm_load_si128(reinterpret_cast<const __m128i*>(_Tmp1));
alignas(16) uint8_t _Tmp2[16];
memcpy(_Tmp2, _First2, _Needle_length);
const __m128i _Data2 = _mm_load_si128(reinterpret_cast<const __m128i*>(_Tmp2));
const size_t _Haystack_length = _Byte_length(_First1, _Last1);
const void* _Stop_at = _First1;
_Advance_bytes(_Stop_at, _Haystack_length & ~size_t{0xF});
const size_t _Haystack_length = _Byte_length(_First1, _Last1);
const void* _Stop_at = _First1;
_Advance_bytes(_Stop_at, _Haystack_length & ~size_t{0xF});
while (_First1 != _Stop_at) {
const __m128i _Haystack_part = _mm_loadu_si128(static_cast<const __m128i*>(_First1));
if (_mm_cmpestrc(_Needle, _Needle_length_el, _Haystack_part, _Part_size_el, _Op)) {
const int _Pos = _mm_cmpestri(_Needle, _Needle_length_el, _Haystack_part, _Part_size_el, _Op);
while (_First1 != _Stop_at) {
const __m128i _Data1 = _mm_loadu_si128(static_cast<const __m128i*>(_First1));
if (_mm_cmpestrc(_Data2, _Needle_length_el, _Data1, _Part_size_el, _Op)) {
const int _Pos = _mm_cmpestri(_Data2, _Needle_length_el, _Data1, _Part_size_el, _Op);
_Advance_bytes(_First1, _Pos * sizeof(_Ty));
return _First1;
}
_Advance_bytes(_First1, 16);
}
const size_t _Last_part_size = _Haystack_length & 0xF;
const int _Last_part_size_el = static_cast<int>(_Last_part_size / sizeof(_Ty));
alignas(16) uint8_t _Tmp1[16];
memcpy(_Tmp1, _First1, _Last_part_size);
const __m128i _Data1 = _mm_load_si128(reinterpret_cast<const __m128i*>(_Tmp1));
if (_mm_cmpestrc(_Data2, _Needle_length_el, _Data1, _Last_part_size_el, _Op)) {
const int _Pos = _mm_cmpestri(_Data2, _Needle_length_el, _Data1, _Last_part_size_el, _Op);
_Advance_bytes(_First1, _Pos * sizeof(_Ty));
return _First1;
}
_Advance_bytes(_First1, 16);
}
const size_t _Last_part_size = _Haystack_length & 0xF;
const int _Last_part_size_el = static_cast<int>(_Last_part_size / sizeof(_Ty));
alignas(16) uint8_t _Tmp2[16];
memcpy(_Tmp2, _First1, _Last_part_size);
const __m128i _Haystack_part = _mm_load_si128(reinterpret_cast<const __m128i*>(_Tmp2));
if (_mm_cmpestrc(_Needle, _Needle_length_el, _Haystack_part, _Last_part_size_el, _Op)) {
const int _Pos = _mm_cmpestri(_Needle, _Needle_length_el, _Haystack_part, _Last_part_size_el, _Op);
_Advance_bytes(_First1, _Pos * sizeof(_Ty));
_Advance_bytes(_First1, _Last_part_size);
return _First1;
}
} else {
const void* _Last_needle = _First2;
_Advance_bytes(_Last_needle, _Needle_length & ~size_t{0xF});
_Advance_bytes(_First1, _Last_part_size);
return _First1;
} else {
const void* _Last_needle = _First2;
_Advance_bytes(_Last_needle, _Needle_length & ~size_t{0xF});
const int _Last_needle_length = static_cast<int>(_Needle_length & 0xF);
const int _Last_needle_length = static_cast<int>(_Needle_length & 0xF);
alignas(16) uint8_t _Tmp2[16];
memcpy(_Tmp2, _Last_needle, _Last_needle_length);
const __m128i _Last_needle_val = _mm_load_si128(reinterpret_cast<const __m128i*>(_Tmp2));
const int _Last_needle_length_el = _Last_needle_length / sizeof(_Ty);
alignas(16) uint8_t _Tmp1[16];
memcpy(_Tmp1, _Last_needle, _Last_needle_length);
const __m128i _Last_needle_val = _mm_load_si128(reinterpret_cast<const __m128i*>(_Tmp1));
const int _Last_needle_length_el = _Last_needle_length / sizeof(_Ty);
constexpr int _Not_found = 16; // arbitrary value greater than any found value
constexpr int _Not_found = 16; // arbitrary value greater than any found value
int _Found_pos = _Not_found;
int _Found_pos = _Not_found;
const size_t _Haystack_length = _Byte_length(_First1, _Last1);
const void* _Stop_at = _First1;
_Advance_bytes(_Stop_at, _Haystack_length & ~size_t{0xF});
const size_t _Haystack_length = _Byte_length(_First1, _Last1);
const void* _Stop_at = _First1;
_Advance_bytes(_Stop_at, _Haystack_length & ~size_t{0xF});
while (_First1 != _Stop_at) {
const __m128i _Data1 = _mm_loadu_si128(static_cast<const __m128i*>(_First1));
while (_First1 != _Stop_at) {
const __m128i _Haystack_part = _mm_loadu_si128(static_cast<const __m128i*>(_First1));
for (const void* _Cur_needle = _First2; _Cur_needle != _Last_needle;
_Advance_bytes(_Cur_needle, 16)) {
const __m128i _Data2 = _mm_loadu_si128(static_cast<const __m128i*>(_Cur_needle));
if (_mm_cmpestrc(_Data2, _Part_size_el, _Data1, _Part_size_el, _Op)) {
const int _Pos = _mm_cmpestri(_Data2, _Part_size_el, _Data1, _Part_size_el, _Op);
if (_Pos < _Found_pos) {
_Found_pos = _Pos;
}
}
}
if (const int _Needle_length_el = _Last_needle_length_el; _Needle_length_el != 0) {
const __m128i _Data2 = _Last_needle_val;
if (_mm_cmpestrc(_Data2, _Needle_length_el, _Data1, _Part_size_el, _Op)) {
const int _Pos = _mm_cmpestri(_Data2, _Needle_length_el, _Data1, _Part_size_el, _Op);
if (_Pos < _Found_pos) {
_Found_pos = _Pos;
}
}
}
if (_Found_pos != _Not_found) {
_Advance_bytes(_First1, _Found_pos * sizeof(_Ty));
return _First1;
}
_Advance_bytes(_First1, 16);
}
const size_t _Last_part_size = _Haystack_length & 0xF;
const int _Last_part_size_el = static_cast<int>(_Last_part_size / sizeof(_Ty));
alignas(16) uint8_t _Tmp1[16];
memcpy(_Tmp1, _First1, _Last_part_size);
const __m128i _Data1 = _mm_load_si128(reinterpret_cast<const __m128i*>(_Tmp1));
_Found_pos = _Last_part_size_el;
for (const void* _Cur_needle = _First2; _Cur_needle != _Last_needle;
_Advance_bytes(_Cur_needle, 16)) {
const __m128i _Needle = _mm_loadu_si128(static_cast<const __m128i*>(_Cur_needle));
if (_mm_cmpestrc(_Needle, _Part_size_el, _Haystack_part, _Part_size_el, _Op)) {
const int _Pos = _mm_cmpestri(_Needle, _Part_size_el, _Haystack_part, _Part_size_el, _Op);
const __m128i _Data2 = _mm_loadu_si128(static_cast<const __m128i*>(_Cur_needle));
if (_mm_cmpestrc(_Data2, _Part_size_el, _Data1, _Last_part_size_el, _Op)) {
const int _Pos = _mm_cmpestri(_Data2, _Part_size_el, _Data1, _Last_part_size_el, _Op);
if (_Pos < _Found_pos) {
_Found_pos = _Pos;
}
@ -2121,77 +2182,231 @@ namespace {
}
if (const int _Needle_length_el = _Last_needle_length_el; _Needle_length_el != 0) {
const __m128i _Needle = _Last_needle_val;
if (_mm_cmpestrc(_Needle, _Needle_length_el, _Haystack_part, _Part_size_el, _Op)) {
const int _Pos =
_mm_cmpestri(_Needle, _Needle_length_el, _Haystack_part, _Part_size_el, _Op);
const __m128i _Data2 = _Last_needle_val;
if (_mm_cmpestrc(_Data2, _Needle_length_el, _Data1, _Last_part_size_el, _Op)) {
const int _Pos = _mm_cmpestri(_Data2, _Needle_length_el, _Data1, _Last_part_size_el, _Op);
if (_Pos < _Found_pos) {
_Found_pos = _Pos;
}
}
}
if (_Found_pos != _Not_found) {
_Advance_bytes(_First1, _Found_pos * sizeof(_Ty));
return _First1;
}
_Advance_bytes(_First1, 16);
_Advance_bytes(_First1, _Found_pos * sizeof(_Ty));
return _First1;
}
const size_t _Last_part_size = _Haystack_length & 0xF;
const int _Last_part_size_el = static_cast<int>(_Last_part_size / sizeof(_Ty));
alignas(16) uint8_t _Tmp2[16];
memcpy(_Tmp2, _First1, _Last_part_size);
const __m128i _Haystack_part = _mm_load_si128(reinterpret_cast<const __m128i*>(_Tmp2));
_Found_pos = _Last_part_size_el;
for (const void* _Cur_needle = _First2; _Cur_needle != _Last_needle; _Advance_bytes(_Cur_needle, 16)) {
const __m128i _Needle = _mm_loadu_si128(static_cast<const __m128i*>(_Cur_needle));
if (_mm_cmpestrc(_Needle, _Part_size_el, _Haystack_part, _Last_part_size_el, _Op)) {
const int _Pos = _mm_cmpestri(_Needle, _Part_size_el, _Haystack_part, _Last_part_size_el, _Op);
if (_Pos < _Found_pos) {
_Found_pos = _Pos;
}
}
}
if (const int _Needle_length_el = _Last_needle_length_el; _Needle_length_el != 0) {
const __m128i _Needle = _Last_needle_val;
if (_mm_cmpestrc(_Needle, _Needle_length_el, _Haystack_part, _Last_part_size_el, _Op)) {
const int _Pos =
_mm_cmpestri(_Needle, _Needle_length_el, _Haystack_part, _Last_part_size_el, _Op);
if (_Pos < _Found_pos) {
_Found_pos = _Pos;
}
}
}
_Advance_bytes(_First1, _Found_pos * sizeof(_Ty));
return _First1;
}
#endif // !_M_ARM64EC
return _Fallback<_Ty>(_First1, _Last1, _First2, _Last2);
}
    // AVX2 traits for 32-bit elements, extending the generic 4-byte find traits
    // with the needle "spread" and "rotate" primitives used by the shuffle-based
    // find_first_of searcher.
    struct _Traits_4 : _Find_traits_4 {
        using _Ty = uint32_t; // element type handled by these traits

#ifndef _M_ARM64EC
        // Fills all 8 dword lanes by repeating the first _Amount needle elements
        // of _Val. _Needle_length_el is the real needle length in elements; when
        // it is less than _Amount, the vacant slots are filled with copies of
        // existing needle elements, so they can only produce duplicate matches,
        // never false ones.
        template <size_t _Amount>
        static __m256i _Spread_avx(__m256i _Val, const size_t _Needle_length_el) noexcept {
            if constexpr (_Amount == 1) {
                // broadcast the single dword to all lanes
                return _mm256_broadcastd_epi32(_mm256_castsi256_si128(_Val));
            } else if constexpr (_Amount == 2) {
                // broadcast the 2-dword (qword) group to all qword lanes
                return _mm256_broadcastq_epi64(_mm256_castsi256_si128(_Val));
            } else if constexpr (_Amount == 4) {
                if (_Needle_length_el < 4) {
                    // 3-element needle: duplicate element 0 into the 4th slot
                    _Val = _mm256_shuffle_epi32(_Val, _MM_SHUFFLE(0, 2, 1, 0));
                }
                // replicate the low 128-bit half into the high half
                return _mm256_permute4x64_epi64(_Val, _MM_SHUFFLE(1, 0, 1, 0));
            } else if constexpr (_Amount == 8) {
                if (_Needle_length_el < 8) {
                    const __m256i _Mask = _Avx2_tail_mask_32(_Needle_length_el);
                    // zero unused elements in sequential permutation mask, so will be filled by 1st
                    const __m256i _Perm = _mm256_and_si256(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), _Mask);
                    _Val                = _mm256_permutevar8x32_epi32(_Val, _Perm);
                }
                return _Val;
            } else {
                static_assert(false, "Unexpected amount");
            }
        }

        // Rotates needle elements by _Amount positions within each repeating
        // group, so that successive applications enumerate every alignment of the
        // spread needle against the haystack data.
        template <size_t _Amount>
        static __m256i _Shuffle_avx(const __m256i _Val) noexcept {
            if constexpr (_Amount == 1) {
                // swap adjacent dwords within each qword
                return _mm256_shuffle_epi32(_Val, _MM_SHUFFLE(2, 3, 0, 1));
            } else if constexpr (_Amount == 2) {
                // swap dword pairs within each 128-bit lane
                return _mm256_shuffle_epi32(_Val, _MM_SHUFFLE(1, 0, 3, 2));
            } else if constexpr (_Amount == 4) {
                // swap the 128-bit halves
                return _mm256_permute4x64_epi64(_Val, _MM_SHUFFLE(1, 0, 3, 2));
            } else {
                static_assert(false, "Unexpected amount");
            }
        }
#endif // !_M_ARM64EC
    };
    // AVX2 traits for 64-bit elements; the counterpart of _Traits_4 with qword
    // granularity (at most 4 elements fit in a 256-bit register).
    struct _Traits_8 : _Find_traits_8 {
        using _Ty = uint64_t; // element type handled by these traits

#ifndef _M_ARM64EC
        // Fills all 4 qword lanes by repeating the first _Amount needle elements
        // of _Val; vacant slots (when _Needle_length_el < _Amount) are filled
        // with copies of existing needle elements, which cannot cause false matches.
        template <size_t _Amount>
        static __m256i _Spread_avx(const __m256i _Val, const size_t _Needle_length_el) noexcept {
            if constexpr (_Amount == 1) {
                // broadcast the single qword to all lanes
                return _mm256_broadcastq_epi64(_mm256_castsi256_si128(_Val));
            } else if constexpr (_Amount == 2) {
                // replicate the low 128-bit half into the high half
                return _mm256_permute4x64_epi64(_Val, _MM_SHUFFLE(1, 0, 1, 0));
            } else if constexpr (_Amount == 4) {
                if (_Needle_length_el < 4) {
                    // 3-element needle: duplicate element 0 into the 4th slot
                    return _mm256_permute4x64_epi64(_Val, _MM_SHUFFLE(0, 2, 1, 0));
                }
                return _Val;
            } else {
                static_assert(false, "Unexpected amount");
            }
        }

        // Rotates needle elements by _Amount positions within each repeating group.
        template <size_t _Amount>
        static __m256i _Shuffle_avx(const __m256i _Val) noexcept {
            if constexpr (_Amount == 1) {
                // swap adjacent qwords within each 128-bit lane
                return _mm256_shuffle_epi32(_Val, _MM_SHUFFLE(1, 0, 3, 2));
            } else if constexpr (_Amount == 2) {
                // swap the 128-bit halves
                return _mm256_permute4x64_epi64(_Val, _MM_SHUFFLE(1, 0, 3, 2));
            } else {
                static_assert(false, "Unexpected amount");
            }
        }
#endif // !_M_ARM64EC
    };
#ifndef _M_ARM64EC
    // ORs together the equality comparisons of one haystack vector _Data1 against
    // every rotation of the spread needle _Data2s0, covering all
    // _Needle_length_el_magnitude alignments. Returns the combined per-lane
    // equality mask; a nonzero lane means some needle element matched there.
    template <class _Traits, size_t _Needle_length_el_magnitude>
    __m256i _Shuffle_step(const __m256i _Data1, const __m256i _Data2s0) noexcept {
        __m256i _Eq = _Traits::_Cmp_avx(_Data1, _Data2s0);
        if constexpr (_Needle_length_el_magnitude >= 2) {
            // rotation by 1
            const __m256i _Data2s1 = _Traits::_Shuffle_avx<1>(_Data2s0);
            _Eq                    = _mm256_or_si256(_Eq, _Traits::_Cmp_avx(_Data1, _Data2s1));
            if constexpr (_Needle_length_el_magnitude >= 4) {
                // rotations by 2 and 3 (2 then 1)
                const __m256i _Data2s2 = _Traits::_Shuffle_avx<2>(_Data2s0);
                _Eq                    = _mm256_or_si256(_Eq, _Traits::_Cmp_avx(_Data1, _Data2s2));
                const __m256i _Data2s3 = _Traits::_Shuffle_avx<1>(_Data2s2);
                _Eq                    = _mm256_or_si256(_Eq, _Traits::_Cmp_avx(_Data1, _Data2s3));
                if constexpr (_Needle_length_el_magnitude >= 8) {
                    // rotations by 4 through 7, derived from the rotation by 4
                    const __m256i _Data2s4 = _Traits::_Shuffle_avx<4>(_Data2s0);
                    _Eq                    = _mm256_or_si256(_Eq, _Traits::_Cmp_avx(_Data1, _Data2s4));
                    const __m256i _Data2s5 = _Traits::_Shuffle_avx<1>(_Data2s4);
                    _Eq                    = _mm256_or_si256(_Eq, _Traits::_Cmp_avx(_Data1, _Data2s5));
                    const __m256i _Data2s6 = _Traits::_Shuffle_avx<2>(_Data2s4);
                    _Eq                    = _mm256_or_si256(_Eq, _Traits::_Cmp_avx(_Data1, _Data2s6));
                    const __m256i _Data2s7 = _Traits::_Shuffle_avx<1>(_Data2s6);
                    _Eq                    = _mm256_or_si256(_Eq, _Traits::_Cmp_avx(_Data1, _Data2s7));
                }
            }
        }
        return _Eq;
    }
    // Shuffle-based searcher for needles of at most one vector (up to
    // _Needle_length_el_magnitude elements, a power of two). Loads the needle
    // once, spreads it over the register, then scans the haystack 32 bytes at a
    // time, comparing against all needle rotations. Returns a pointer to the
    // first match, or the end of the scanned range if there is none.
    template <class _Traits, size_t _Needle_length_el_magnitude>
    const void* _Shuffle_impl(const void* _First1, const void* const _Last1, const void* const _First2,
        const size_t _Needle_length_el) noexcept {
        using _Ty = _Traits::_Ty;

        // Masked load reads exactly the needle (sizeof(_Ty)/4 dwords per element),
        // then spread it so every alignment can be covered by rotations.
        const __m256i _Data2 = _mm256_maskload_epi32(
            reinterpret_cast<const int*>(_First2), _Avx2_tail_mask_32(_Needle_length_el * (sizeof(_Ty) / 4)));
        const __m256i _Data2s0 = _Traits::_Spread_avx<_Needle_length_el_magnitude>(_Data2, _Needle_length_el);

        const size_t _Haystack_length = _Byte_length(_First1, _Last1);
        const void* _Stop1            = _First1;
        _Advance_bytes(_Stop1, _Haystack_length & ~size_t{0x1F});

        // Full 32-byte haystack parts.
        for (; _First1 != _Stop1; _Advance_bytes(_First1, 32)) {
            const __m256i _Data1 = _mm256_loadu_si256(static_cast<const __m256i*>(_First1));
            const __m256i _Eq    = _Shuffle_step<_Traits, _Needle_length_el_magnitude>(_Data1, _Data2s0);

            const int _Bingo = _mm256_movemask_epi8(_Eq);
            if (_Bingo != 0) {
                // lowest set bit is the byte offset of the first match
                const unsigned long _Offset = _tzcnt_u32(_Bingo);
                _Advance_bytes(_First1, _Offset);
                return _First1;
            }
        }

        // Haystack tail; & 0x1C keeps whole dwords (element sizes 4 and 8 leave
        // no sub-dword remainder), loaded and compared under a dword mask.
        if (const size_t _Haystack_tail_length = _Haystack_length & 0x1C; _Haystack_tail_length != 0) {
            const __m256i _Tail_mask = _Avx2_tail_mask_32(_Haystack_tail_length >> 2);
            const __m256i _Data1     = _mm256_maskload_epi32(static_cast<const int*>(_First1), _Tail_mask);
            const __m256i _Eq        = _Shuffle_step<_Traits, _Needle_length_el_magnitude>(_Data1, _Data2s0);

            // mask off lanes beyond the tail before looking for matches
            const int _Bingo = _mm256_movemask_epi8(_mm256_and_si256(_Eq, _Tail_mask));
            if (_Bingo != 0) {
                const unsigned long _Offset = _tzcnt_u32(_Bingo);
                _Advance_bytes(_First1, _Offset);
                return _First1;
            }

            _Advance_bytes(_First1, _Haystack_tail_length);
        }

        return _First1;
    }
#endif // !_M_ARM64EC
auto _Ptr_haystack = static_cast<const _Ty*>(_First1);
const auto _Ptr_haystack_end = static_cast<const _Ty*>(_Last1);
const auto _Ptr_needle = static_cast<const _Ty*>(_First2);
const auto _Ptr_needle_end = static_cast<const _Ty*>(_Last2);
template <class _Traits>
const void* __stdcall _Impl_4_8(const void* const _First1, const void* const _Last1, const void* const _First2,
const void* const _Last2) noexcept {
using _Ty = _Traits::_Ty;
#ifndef _M_ARM64EC
if (_Use_avx2()) {
_Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414
for (; _Ptr_haystack != _Ptr_haystack_end; ++_Ptr_haystack) {
for (auto _Ptr = _Ptr_needle; _Ptr != _Ptr_needle_end; ++_Ptr) {
if (*_Ptr_haystack == *_Ptr) {
return _Ptr_haystack;
const size_t _Needle_length = _Byte_length(_First2, _Last2);
const int _Needle_length_el = static_cast<int>(_Needle_length / sizeof(_Ty));
// Special handling of small needle
// The generic approach could also handle it but with worse performance
if (_Needle_length_el == 0) {
return _Last1;
} else if (_Needle_length_el == 1) {
_STL_UNREACHABLE; // This is expected to be forwarded to 'find' on an upper level
} else if (_Needle_length_el == 2) {
return _Shuffle_impl<_Traits, 2>(_First1, _Last1, _First2, _Needle_length_el);
} else if (_Needle_length_el <= 4) {
return _Shuffle_impl<_Traits, 4>(_First1, _Last1, _First2, _Needle_length_el);
} else if (_Needle_length_el <= 8) {
if constexpr (sizeof(_Ty) == 4) {
return _Shuffle_impl<_Traits, 8>(_First1, _Last1, _First2, _Needle_length_el);
}
}
// Generic approach
const size_t _Needle_length_tail = _Needle_length & 0x1C;
const __m256i _Tail_mask = _Avx2_tail_mask_32(_Needle_length_tail >> 2);
const void* _Stop2 = _First2;
_Advance_bytes(_Stop2, _Needle_length & ~size_t{0x1F});
for (auto _Ptr1 = static_cast<const _Ty*>(_First1); _Ptr1 != _Last1; ++_Ptr1) {
const auto _Data1 = _Traits::_Set_avx(*_Ptr1);
for (auto _Ptr2 = _First2; _Ptr2 != _Stop2; _Advance_bytes(_Ptr2, 32)) {
const __m256i _Data2 = _mm256_loadu_si256(static_cast<const __m256i*>(_Ptr2));
const __m256i _Eq = _Traits::_Cmp_avx(_Data1, _Data2);
if (!_mm256_testz_si256(_Eq, _Eq)) {
return _Ptr1;
}
}
if (_Needle_length_tail != 0) {
const __m256i _Data2 = _mm256_maskload_epi32(static_cast<const int*>(_Stop2), _Tail_mask);
const __m256i _Eq = _Traits::_Cmp_avx(_Data1, _Data2);
if (!_mm256_testz_si256(_Eq, _Tail_mask)) {
return _Ptr1;
}
}
}
return _Last1;
}
#endif // !_M_ARM64EC
return _Fallback<_Ty>(_First1, _Last1, _First2, _Last2);
}
return _Ptr_haystack;
}
} // namespace __std_find_first_of
template <class _Traits, class _Ty>
__declspec(noalias) size_t __stdcall __std_mismatch_impl(
@ -2354,12 +2569,22 @@ __declspec(noalias) size_t
// Exported dispatchers for find_first_of over trivially comparable elements.
// 1- and 2-byte elements use the SSE4.2 pcmpestri path; 4- and 8-byte elements
// use the AVX2 path added for them.
const void* __stdcall __std_find_first_of_trivial_1(
    const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept {
    return __std_find_first_of::_Impl_pcmpestri<uint8_t>(_First1, _Last1, _First2, _Last2);
}

const void* __stdcall __std_find_first_of_trivial_2(
    const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept {
    return __std_find_first_of::_Impl_pcmpestri<uint16_t>(_First1, _Last1, _First2, _Last2);
}

const void* __stdcall __std_find_first_of_trivial_4(
    const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept {
    return __std_find_first_of::_Impl_4_8<__std_find_first_of::_Traits_4>(_First1, _Last1, _First2, _Last2);
}

const void* __stdcall __std_find_first_of_trivial_8(
    const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept {
    return __std_find_first_of::_Impl_4_8<__std_find_first_of::_Traits_8>(_First1, _Last1, _First2, _Last2);
}
__declspec(noalias) size_t