From 886ef10aafc93fe40d6ee6442bb3aa79391a9545 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Fri, 12 Apr 2024 21:24:33 +0300 Subject: [PATCH] `vector_algorithms.cpp`: `find`, `find_last`, `count`: make AVX2 path avoid SSE path and (for some types) fallback (#4570) Co-authored-by: Stephan T. Lavavej --- benchmarks/src/find_and_count.cpp | 49 ++++++++++------ stl/src/vector_algorithms.cpp | 92 +++++++++++++++++++++++-------- 2 files changed, 99 insertions(+), 42 deletions(-) diff --git a/benchmarks/src/find_and_count.cpp b/benchmarks/src/find_and_count.cpp index 7b205aee6..9c608bfe3 100644 --- a/benchmarks/src/find_and_count.cpp +++ b/benchmarks/src/find_and_count.cpp @@ -5,7 +5,9 @@ #include #include #include +#include #include +#include enum class Op { FindSized, @@ -15,39 +17,50 @@ enum class Op { using namespace std; -template +template void bm(benchmark::State& state) { - T a[Size]; + const auto size = static_cast(state.range(0)); + const auto pos = static_cast(state.range(1)); - fill_n(a, Size, T{'0'}); - if constexpr (Pos < Size) { - a[Pos] = T{'1'}; + vector a(size, T{'0'}); + + if (pos < size) { + a[pos] = T{'1'}; } else { - static_assert(Operation != Op::FindUnsized); + if constexpr (Operation == Op::FindUnsized) { + abort(); + } } for (auto _ : state) { if constexpr (Operation == Op::FindSized) { - benchmark::DoNotOptimize(ranges::find(a, a + Size, T{'1'})); + benchmark::DoNotOptimize(ranges::find(a.begin(), a.end(), T{'1'})); } else if constexpr (Operation == Op::FindUnsized) { - benchmark::DoNotOptimize(ranges::find(a, unreachable_sentinel, T{'1'})); + benchmark::DoNotOptimize(ranges::find(a.begin(), unreachable_sentinel, T{'1'})); } else if constexpr (Operation == Op::Count) { - benchmark::DoNotOptimize(ranges::count(a, a + Size, T{'1'})); + benchmark::DoNotOptimize(ranges::count(a.begin(), a.end(), T{'1'})); } } } -BENCHMARK(bm); -BENCHMARK(bm); -BENCHMARK(bm); +void common_args(auto bm) { + bm->Args({8021, 3056}); + // AVX tail tests + bm->Args({63, 62})->Args({31, 30})->Args({15, 14})->Args({7, 6}); +} -BENCHMARK(bm); -BENCHMARK(bm); -BENCHMARK(bm); -BENCHMARK(bm); +BENCHMARK(bm)->Apply(common_args); +BENCHMARK(bm)->Apply(common_args); +BENCHMARK(bm)->Apply(common_args); -BENCHMARK(bm); -BENCHMARK(bm); +BENCHMARK(bm)->Apply(common_args); +BENCHMARK(bm)->Apply(common_args); + +BENCHMARK(bm)->Apply(common_args); +BENCHMARK(bm)->Apply(common_args); + +BENCHMARK(bm)->Apply(common_args); +BENCHMARK(bm)->Apply(common_args); BENCHMARK_MAIN(); diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 0eb3c4cf2..2349d804e 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -1837,15 +1837,15 @@ namespace { template const void* __stdcall __std_find_trivial_impl(const void* _First, const void* _Last, _Ty _Val) noexcept { #ifndef _M_ARM64EC - size_t _Size_bytes = _Byte_length(_First, _Last); + const size_t _Size_bytes = _Byte_length(_First, _Last); - const size_t _Avx_size = _Size_bytes & ~size_t{0x1F}; - if (_Avx_size != 0 && _Use_avx2()) { + if (const size_t _Avx_size = _Size_bytes & ~size_t{0x1F}; _Avx_size != 0 && _Use_avx2()) { _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414 const __m256i _Comparand = _Traits::_Set_avx(_Val); const void* _Stop_at = _First; _Advance_bytes(_Stop_at, _Avx_size); + do { const __m256i _Data = _mm256_loadu_si256(static_cast(_First)); const int _Bingo = _mm256_movemask_epi8(_Traits::_Cmp_avx(_Data, _Comparand)); @@ -1858,14 +1858,30 @@ namespace { _Advance_bytes(_First, 32); } while (_First != _Stop_at); - _Size_bytes &= 0x1F; - } - const size_t _Sse_size = _Size_bytes & ~size_t{0xF}; - if (_Sse_size != 0 && _Use_sse42()) { + if (const size_t _Avx_tail_size = _Size_bytes & 0x1C; _Avx_tail_size != 0) { + const __m256i _Tail_mask = _Avx2_tail_mask_32(_Avx_tail_size >> 2); + const __m256i _Data = _mm256_maskload_epi32(static_cast(_First), _Tail_mask); + const int _Bingo = + _mm256_movemask_epi8(_mm256_and_si256(_Traits::_Cmp_avx(_Data, _Comparand), _Tail_mask)); + + if (_Bingo != 0) { + const unsigned long _Offset = _tzcnt_u32(_Bingo); + _Advance_bytes(_First, _Offset); + return _First; + } + + _Advance_bytes(_First, _Avx_tail_size); + } + + if constexpr (sizeof(_Ty) >= 4) { + return _First; + } + } else if (const size_t _Sse_size = _Size_bytes & ~size_t{0xF}; _Sse_size != 0 && _Use_sse42()) { const __m128i _Comparand = _Traits::_Set_sse(_Val); const void* _Stop_at = _First; _Advance_bytes(_Stop_at, _Sse_size); + do { const __m128i _Data = _mm_loadu_si128(static_cast(_First)); const int _Bingo = _mm_movemask_epi8(_Traits::_Cmp_sse(_Data, _Comparand)); @@ -1892,15 +1908,15 @@ namespace { const void* __stdcall __std_find_last_trivial_impl(const void* _First, const void* _Last, _Ty _Val) noexcept { const void* const _Real_last = _Last; #ifndef _M_ARM64EC - size_t _Size_bytes = _Byte_length(_First, _Last); + const size_t _Size_bytes = _Byte_length(_First, _Last); - const size_t _Avx_size = _Size_bytes & ~size_t{0x1F}; - if (_Avx_size != 0 && _Use_avx2()) { + if (const size_t _Avx_size = _Size_bytes & ~size_t{0x1F}; _Avx_size != 0 && _Use_avx2()) { _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414 const __m256i _Comparand = _Traits::_Set_avx(_Val); const void* _Stop_at = _Last; _Rewind_bytes(_Stop_at, _Avx_size); + do { _Rewind_bytes(_Last, 32); const __m256i _Data = _mm256_loadu_si256(static_cast(_Last)); @@ -1912,14 +1928,29 @@ namespace { return _Last; } } while (_Last != _Stop_at); - _Size_bytes &= 0x1F; - } - const size_t _Sse_size = _Size_bytes & ~size_t{0xF}; - if (_Sse_size != 0 && _Use_sse42()) { + if (const size_t _Avx_tail_size = _Size_bytes & 0x1C; _Avx_tail_size != 0) { + _Rewind_bytes(_Last, _Avx_tail_size); + const __m256i _Tail_mask = _Avx2_tail_mask_32(_Avx_tail_size >> 2); + const __m256i _Data = _mm256_maskload_epi32(static_cast(_Last), _Tail_mask); + const int _Bingo = + _mm256_movemask_epi8(_mm256_and_si256(_Traits::_Cmp_avx(_Data, _Comparand), _Tail_mask)); + + if (_Bingo != 0) { + const unsigned long _Offset = _lzcnt_u32(_Bingo); + _Advance_bytes(_Last, (31 - _Offset) - (sizeof(_Ty) - 1)); + return _Last; + } + } + + if constexpr (sizeof(_Ty) >= 4) { + return _Real_last; + } + } else if (const size_t _Sse_size = _Size_bytes & ~size_t{0xF}; _Sse_size != 0 && _Use_sse42()) { const __m128i _Comparand = _Traits::_Set_sse(_Val); const void* _Stop_at = _Last; _Rewind_bytes(_Stop_at, _Sse_size); + do { _Rewind_bytes(_Last, 16); const __m128i _Data = _mm_loadu_si128(static_cast(_Last)); @@ -1952,40 +1983,53 @@ namespace { size_t _Result = 0; #ifndef _M_ARM64EC - size_t _Size_bytes = _Byte_length(_First, _Last); + const size_t _Size_bytes = _Byte_length(_First, _Last); - const size_t _Avx_size = _Size_bytes & ~size_t{0x1F}; - if (_Avx_size != 0 && _Use_avx2()) { + if (const size_t _Avx_size = _Size_bytes & ~size_t{0x1F}; _Avx_size != 0 && _Use_avx2()) { const __m256i _Comparand = _Traits::_Set_avx(_Val); const void* _Stop_at = _First; _Advance_bytes(_Stop_at, _Avx_size); + do { const __m256i _Data = _mm256_loadu_si256(static_cast(_First)); const int _Bingo = _mm256_movemask_epi8(_Traits::_Cmp_avx(_Data, _Comparand)); _Result += __popcnt(_Bingo); // Assume available with SSE4.2 _Advance_bytes(_First, 32); } while (_First != _Stop_at); - _Size_bytes &= 0x1F; + + if (const size_t _Avx_tail_size = _Size_bytes & 0x1C; _Avx_tail_size != 0) { + const __m256i _Tail_mask = _Avx2_tail_mask_32(_Avx_tail_size >> 2); + const __m256i _Data = _mm256_maskload_epi32(static_cast(_First), _Tail_mask); + const int _Bingo = + _mm256_movemask_epi8(_mm256_and_si256(_Traits::_Cmp_avx(_Data, _Comparand), _Tail_mask)); + _Result += __popcnt(_Bingo); // Assume available with SSE4.2 + _Advance_bytes(_First, _Avx_tail_size); + } _mm256_zeroupper(); // TRANSITION, DevCom-10331414 - } - const size_t _Sse_size = _Size_bytes & ~size_t{0xF}; - if (_Sse_size != 0 && _Use_sse42()) { + _Result >>= _Traits::_Shift; + + if constexpr (sizeof(_Ty) >= 4) { + return _Result; + } + } else if (const size_t _Sse_size = _Size_bytes & ~size_t{0xF}; _Sse_size != 0 && _Use_sse42()) { const __m128i _Comparand = _Traits::_Set_sse(_Val); const void* _Stop_at = _First; _Advance_bytes(_Stop_at, _Sse_size); + do { const __m128i _Data = _mm_loadu_si128(static_cast(_First)); const int _Bingo = _mm_movemask_epi8(_Traits::_Cmp_sse(_Data, _Comparand)); _Result += __popcnt(_Bingo); // Assume available with SSE4.2 _Advance_bytes(_First, 16); } while (_First != _Stop_at); + + _Result >>= _Traits::_Shift; } #endif // !_M_ARM64EC - _Result >>= _Traits::_Shift; - auto _Ptr = static_cast(_First); - for (; _Ptr != _Last; ++_Ptr) { + + for (auto _Ptr = static_cast(_First); _Ptr != _Last; ++_Ptr) { if (*_Ptr == _Val) { ++_Result; }