`vector_algorithms.cpp`: `find`, `find_last`, `count`: make AVX2 path avoid SSE path and (for some types) fallback (#4570)

Co-authored-by: Stephan T. Lavavej <stl@nuwen.net>
Alex Guteniev 2024-04-12 21:24:33 +03:00 committed by GitHub
Parent 9839187345
Commit 886ef10aaf
No key found matching this signature
GPG key ID: B5690EEEBB952194
2 changed files with 99 additions and 42 deletions
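
What the change does, in short: previously each of these functions ran the AVX2 loop over whole 32-byte blocks and then fell through to the SSE4.2 path, and then to a scalar loop, for the remaining bytes. Now the AVX2 path finishes its own tail with a masked dword load, so it never enters the SSE path, and for 4- and 8-byte elements nothing is left for the scalar fallback either. Below is a minimal standalone sketch of that masked-tail technique for 32-bit elements, assuming AVX2 and BMI support has already been verified at runtime; find_u32_avx2 and make_tail_mask are hypothetical names for illustration (the STL's mask helper is _Avx2_tail_mask_32), not the STL's actual code.

    #include <cstddef>
    #include <cstdint>
    #include <immintrin.h>

    // Build a mask whose first `count` dwords are selected (top bit set), by
    // sliding a window over a constant table of eight -1s followed by eight 0s.
    static __m256i make_tail_mask(size_t count) { // count in [1, 8)
        static constexpr int table[16] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0};
        return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(table + 8 - count));
    }

    const uint32_t* find_u32_avx2(const uint32_t* first, const uint32_t* last, const uint32_t val) {
        const __m256i comparand = _mm256_set1_epi32(static_cast<int>(val));
        size_t size             = static_cast<size_t>(last - first);

        // Full 8-element blocks.
        for (; size >= 8; size -= 8, first += 8) {
            const __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(first));
            const int bingo    = _mm256_movemask_epi8(_mm256_cmpeq_epi32(data, comparand));
            if (bingo != 0) { // one bit per matching byte; four bits per matching element
                return first + _tzcnt_u32(static_cast<unsigned int>(bingo)) / 4;
            }
        }

        // Masked tail: out-of-range lanes load as zero and are masked out of the
        // comparison result, so no SSE pass or scalar loop is needed afterwards.
        if (size != 0) {
            const __m256i mask = make_tail_mask(size);
            const __m256i data = _mm256_maskload_epi32(reinterpret_cast<const int*>(first), mask);
            const int bingo =
                _mm256_movemask_epi8(_mm256_and_si256(_mm256_cmpeq_epi32(data, comparand), mask));
            if (bingo != 0) {
                return first + _tzcnt_u32(static_cast<unsigned int>(bingo)) / 4;
            }
            first += size;
        }

        return first; // == last: no match
    }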

View file

@@ -5,7 +5,9 @@
 #include <benchmark/benchmark.h>
 #include <cstddef>
 #include <cstdint>
+#include <cstdlib>
+#include <ranges>
 #include <vector>

 enum class Op {
     FindSized,
@@ -15,39 +17,50 @@ enum class Op {
 using namespace std;

-template <class T, size_t Size, size_t Pos, Op Operation>
+template <class T, Op Operation>
 void bm(benchmark::State& state) {
-    T a[Size];
+    const auto size = static_cast<size_t>(state.range(0));
+    const auto pos  = static_cast<size_t>(state.range(1));

-    fill_n(a, Size, T{'0'});
-    if constexpr (Pos < Size) {
-        a[Pos] = T{'1'};
+    vector<T> a(size, T{'0'});
+
+    if (pos < size) {
+        a[pos] = T{'1'};
     } else {
-        static_assert(Operation != Op::FindUnsized);
+        if constexpr (Operation == Op::FindUnsized) {
+            abort();
+        }
     }

     for (auto _ : state) {
         if constexpr (Operation == Op::FindSized) {
-            benchmark::DoNotOptimize(ranges::find(a, a + Size, T{'1'}));
+            benchmark::DoNotOptimize(ranges::find(a.begin(), a.end(), T{'1'}));
         } else if constexpr (Operation == Op::FindUnsized) {
-            benchmark::DoNotOptimize(ranges::find(a, unreachable_sentinel, T{'1'}));
+            benchmark::DoNotOptimize(ranges::find(a.begin(), unreachable_sentinel, T{'1'}));
         } else if constexpr (Operation == Op::Count) {
-            benchmark::DoNotOptimize(ranges::count(a, a + Size, T{'1'}));
+            benchmark::DoNotOptimize(ranges::count(a.begin(), a.end(), T{'1'}));
         }
     }
 }

-BENCHMARK(bm<uint8_t, 8021, 3056, Op::FindSized>);
-BENCHMARK(bm<uint8_t, 8021, 3056, Op::FindUnsized>);
-BENCHMARK(bm<uint8_t, 8021, 3056, Op::Count>);
+void common_args(auto bm) {
+    bm->Args({8021, 3056});
+    // AVX tail tests
+    bm->Args({63, 62})->Args({31, 30})->Args({15, 14})->Args({7, 6});
+}

-BENCHMARK(bm<uint16_t, 8021, 3056, Op::FindSized>);
-BENCHMARK(bm<uint16_t, 8021, 3056, Op::Count>);
-BENCHMARK(bm<uint32_t, 8021, 3056, Op::FindSized>);
-BENCHMARK(bm<uint32_t, 8021, 3056, Op::Count>);
+BENCHMARK(bm<uint8_t, Op::FindSized>)->Apply(common_args);
+BENCHMARK(bm<uint8_t, Op::FindUnsized>)->Apply(common_args);
+BENCHMARK(bm<uint8_t, Op::Count>)->Apply(common_args);

-BENCHMARK(bm<uint64_t, 8021, 3056, Op::FindSized>);
-BENCHMARK(bm<uint64_t, 8021, 3056, Op::Count>);
+BENCHMARK(bm<uint16_t, Op::FindSized>)->Apply(common_args);
+BENCHMARK(bm<uint16_t, Op::Count>)->Apply(common_args);
+BENCHMARK(bm<uint32_t, Op::FindSized>)->Apply(common_args);
+BENCHMARK(bm<uint32_t, Op::Count>)->Apply(common_args);
+BENCHMARK(bm<uint64_t, Op::FindSized>)->Apply(common_args);
+BENCHMARK(bm<uint64_t, Op::Count>)->Apply(common_args);

 BENCHMARK_MAIN();
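
The benchmark rewrite above moves the element count and needle position from template parameters into runtime arguments, so one instantiation per type and operation covers both the large case (8021/3056) and the new sub-vector "AVX tail" cases (63/31/15/7). A minimal self-contained sketch of that Args/Apply pattern, with hypothetical names bm_demo and demo_args (link against Google Benchmark):

    #include <algorithm>
    #include <benchmark/benchmark.h>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    void bm_demo(benchmark::State& state) {
        const auto size = static_cast<size_t>(state.range(0)); // from Args({size, pos})
        const auto pos  = static_cast<size_t>(state.range(1));
        std::vector<uint8_t> a(size, '0');
        a[pos] = '1'; // single needle near the end
        for (auto _ : state) {
            benchmark::DoNotOptimize(std::find(a.begin(), a.end(), uint8_t{'1'}));
        }
    }

    void demo_args(auto bm) {
        // Each Args({...}) call registers one (size, pos) parameterization.
        bm->Args({8021, 3056})->Args({63, 62})->Args({7, 6});
    }

    BENCHMARK(bm_demo)->Apply(demo_args);
    BENCHMARK_MAIN();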

View file

@@ -1837,15 +1837,15 @@ namespace {
 template <class _Traits, class _Ty>
 const void* __stdcall __std_find_trivial_impl(const void* _First, const void* _Last, _Ty _Val) noexcept {
 #ifndef _M_ARM64EC
-    size_t _Size_bytes = _Byte_length(_First, _Last);
+    const size_t _Size_bytes = _Byte_length(_First, _Last);

-    const size_t _Avx_size = _Size_bytes & ~size_t{0x1F};
-    if (_Avx_size != 0 && _Use_avx2()) {
+    if (const size_t _Avx_size = _Size_bytes & ~size_t{0x1F}; _Avx_size != 0 && _Use_avx2()) {
         _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414

         const __m256i _Comparand = _Traits::_Set_avx(_Val);
         const void* _Stop_at     = _First;
         _Advance_bytes(_Stop_at, _Avx_size);
+
         do {
             const __m256i _Data = _mm256_loadu_si256(static_cast<const __m256i*>(_First));
             const int _Bingo    = _mm256_movemask_epi8(_Traits::_Cmp_avx(_Data, _Comparand));
@@ -1858,14 +1858,30 @@ namespace {
             _Advance_bytes(_First, 32);
         } while (_First != _Stop_at);
-        _Size_bytes &= 0x1F;
-    }

-    const size_t _Sse_size = _Size_bytes & ~size_t{0xF};
-    if (_Sse_size != 0 && _Use_sse42()) {
+        if (const size_t _Avx_tail_size = _Size_bytes & 0x1C; _Avx_tail_size != 0) {
+            const __m256i _Tail_mask = _Avx2_tail_mask_32(_Avx_tail_size >> 2);
+            const __m256i _Data      = _mm256_maskload_epi32(static_cast<const int*>(_First), _Tail_mask);
+            const int _Bingo =
+                _mm256_movemask_epi8(_mm256_and_si256(_Traits::_Cmp_avx(_Data, _Comparand), _Tail_mask));
+
+            if (_Bingo != 0) {
+                const unsigned long _Offset = _tzcnt_u32(_Bingo);
+                _Advance_bytes(_First, _Offset);
+                return _First;
+            }
+
+            _Advance_bytes(_First, _Avx_tail_size);
+        }
+
+        if constexpr (sizeof(_Ty) >= 4) {
+            return _First;
+        }
+    } else if (const size_t _Sse_size = _Size_bytes & ~size_t{0xF}; _Sse_size != 0 && _Use_sse42()) {
         const __m128i _Comparand = _Traits::_Set_sse(_Val);
         const void* _Stop_at     = _First;
         _Advance_bytes(_Stop_at, _Sse_size);
+
         do {
             const __m128i _Data = _mm_loadu_si128(static_cast<const __m128i*>(_First));
             const int _Bingo    = _mm_movemask_epi8(_Traits::_Cmp_sse(_Data, _Comparand));
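
Why the tail is _Size_bytes & 0x1C: _mm256_maskload_epi32 works at dword granularity, so the masked load can only cover the portion of the sub-32-byte remainder that is a whole number of dwords (4 to 28 bytes). For 1- and 2-byte elements, up to 3 bytes (_Size_bytes & 0x3) can still remain, which is why the early return is guarded by if constexpr (sizeof(_Ty) >= 4). A self-contained check of the arithmetic for one of the benchmarked sizes (my illustration, not STL code):

    #include <cstddef>

    static_assert([] {
        const size_t size_bytes    = 8021;                       // 8021 uint8_t elements
        const size_t avx_size      = size_bytes & ~size_t{0x1F}; // 8000: 250 full 32-byte blocks
        const size_t avx_tail_size = size_bytes & 0x1C;          // 20: five dwords, one masked load
        const size_t scalar_rest   = size_bytes & 0x3;           // 1: nonzero only when sizeof(_Ty) < 4
        return avx_size == 8000 && avx_tail_size == 20 && scalar_rest == 1;
    }());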
@@ -1892,15 +1908,15 @@ namespace {
 const void* __stdcall __std_find_last_trivial_impl(const void* _First, const void* _Last, _Ty _Val) noexcept {
     const void* const _Real_last = _Last;
 #ifndef _M_ARM64EC
-    size_t _Size_bytes = _Byte_length(_First, _Last);
+    const size_t _Size_bytes = _Byte_length(_First, _Last);

-    const size_t _Avx_size = _Size_bytes & ~size_t{0x1F};
-    if (_Avx_size != 0 && _Use_avx2()) {
+    if (const size_t _Avx_size = _Size_bytes & ~size_t{0x1F}; _Avx_size != 0 && _Use_avx2()) {
         _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414

         const __m256i _Comparand = _Traits::_Set_avx(_Val);
         const void* _Stop_at     = _Last;
         _Rewind_bytes(_Stop_at, _Avx_size);
+
         do {
             _Rewind_bytes(_Last, 32);
             const __m256i _Data = _mm256_loadu_si256(static_cast<const __m256i*>(_Last));
@@ -1912,14 +1928,29 @@ namespace {
                 return _Last;
             }
         } while (_Last != _Stop_at);
-        _Size_bytes &= 0x1F;
-    }

-    const size_t _Sse_size = _Size_bytes & ~size_t{0xF};
-    if (_Sse_size != 0 && _Use_sse42()) {
+        if (const size_t _Avx_tail_size = _Size_bytes & 0x1C; _Avx_tail_size != 0) {
+            _Rewind_bytes(_Last, _Avx_tail_size);
+            const __m256i _Tail_mask = _Avx2_tail_mask_32(_Avx_tail_size >> 2);
+            const __m256i _Data      = _mm256_maskload_epi32(static_cast<const int*>(_Last), _Tail_mask);
+            const int _Bingo =
+                _mm256_movemask_epi8(_mm256_and_si256(_Traits::_Cmp_avx(_Data, _Comparand), _Tail_mask));
+
+            if (_Bingo != 0) {
+                const unsigned long _Offset = _lzcnt_u32(_Bingo);
+                _Advance_bytes(_Last, (31 - _Offset) - (sizeof(_Ty) - 1));
+                return _Last;
+            }
+        }
+
+        if constexpr (sizeof(_Ty) >= 4) {
+            return _Real_last;
+        }
+    } else if (const size_t _Sse_size = _Size_bytes & ~size_t{0xF}; _Sse_size != 0 && _Use_sse42()) {
         const __m128i _Comparand = _Traits::_Set_sse(_Val);
         const void* _Stop_at     = _Last;
         _Rewind_bytes(_Stop_at, _Sse_size);
+
         do {
             _Rewind_bytes(_Last, 16);
             const __m128i _Data = _mm_loadu_si128(static_cast<const __m128i*>(_Last));
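
About the offset math in the backward search: _Bingo holds one bit per byte of the 32-byte vector, and _Traits::_Cmp_avx sets all bytes of a matching element, so the last match is located with a leading-zero count. 31 - _Offset is the index of the highest matching byte, and subtracting sizeof(_Ty) - 1 steps back to that element's first byte. A self-contained check of one case (illustration only, not STL code):

    #include <cassert>
    #include <cstdint>
    #include <immintrin.h>

    int main() {
        // A 4-byte element at dword index 6 matched, so its four bytes set bits
        // 24..27 of the byte mask.
        const unsigned int bingo  = 0x0F00'0000u;
        const unsigned int offset = _lzcnt_u32(bingo); // 4 zero bits above bit 27
        assert(offset == 4);
        assert(31 - offset == 27);                          // highest matching byte
        assert(31 - offset - (sizeof(uint32_t) - 1) == 24); // element's first byte: 6 * 4
    }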
@@ -1952,40 +1983,53 @@ namespace {
     size_t _Result = 0;

 #ifndef _M_ARM64EC
-    size_t _Size_bytes = _Byte_length(_First, _Last);
+    const size_t _Size_bytes = _Byte_length(_First, _Last);

-    const size_t _Avx_size = _Size_bytes & ~size_t{0x1F};
-    if (_Avx_size != 0 && _Use_avx2()) {
+    if (const size_t _Avx_size = _Size_bytes & ~size_t{0x1F}; _Avx_size != 0 && _Use_avx2()) {
         const __m256i _Comparand = _Traits::_Set_avx(_Val);
         const void* _Stop_at     = _First;
         _Advance_bytes(_Stop_at, _Avx_size);
+
         do {
             const __m256i _Data = _mm256_loadu_si256(static_cast<const __m256i*>(_First));
             const int _Bingo    = _mm256_movemask_epi8(_Traits::_Cmp_avx(_Data, _Comparand));
             _Result += __popcnt(_Bingo); // Assume available with SSE4.2
             _Advance_bytes(_First, 32);
         } while (_First != _Stop_at);
-        _Size_bytes &= 0x1F;
+
+        if (const size_t _Avx_tail_size = _Size_bytes & 0x1C; _Avx_tail_size != 0) {
+            const __m256i _Tail_mask = _Avx2_tail_mask_32(_Avx_tail_size >> 2);
+            const __m256i _Data      = _mm256_maskload_epi32(static_cast<const int*>(_First), _Tail_mask);
+            const int _Bingo =
+                _mm256_movemask_epi8(_mm256_and_si256(_Traits::_Cmp_avx(_Data, _Comparand), _Tail_mask));
+            _Result += __popcnt(_Bingo); // Assume available with SSE4.2
+            _Advance_bytes(_First, _Avx_tail_size);
+        }

         _mm256_zeroupper(); // TRANSITION, DevCom-10331414
-    }

-    const size_t _Sse_size = _Size_bytes & ~size_t{0xF};
-    if (_Sse_size != 0 && _Use_sse42()) {
+        _Result >>= _Traits::_Shift;
+
+        if constexpr (sizeof(_Ty) >= 4) {
+            return _Result;
+        }
+    } else if (const size_t _Sse_size = _Size_bytes & ~size_t{0xF}; _Sse_size != 0 && _Use_sse42()) {
         const __m128i _Comparand = _Traits::_Set_sse(_Val);
         const void* _Stop_at     = _First;
         _Advance_bytes(_Stop_at, _Sse_size);
+
         do {
             const __m128i _Data = _mm_loadu_si128(static_cast<const __m128i*>(_First));
             const int _Bingo    = _mm_movemask_epi8(_Traits::_Cmp_sse(_Data, _Comparand));
             _Result += __popcnt(_Bingo); // Assume available with SSE4.2
             _Advance_bytes(_First, 16);
         } while (_First != _Stop_at);
+
+        _Result >>= _Traits::_Shift;
     }
 #endif // !_M_ARM64EC

-    _Result >>= _Traits::_Shift;
-
-    auto _Ptr = static_cast<const _Ty*>(_First);
-    for (; _Ptr != _Last; ++_Ptr) {
+    for (auto _Ptr = static_cast<const _Ty*>(_First); _Ptr != _Last; ++_Ptr) {
         if (*_Ptr == _Val) {
             ++_Result;
         }
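
On _Result >>= _Traits::_Shift: the movemask/popcount accumulation counts matching bytes, not elements, because each matching element contributes sizeof(_Ty) set bits. The shift (log2 of the element size) converts bytes to elements, and moving it inside each vector branch lets the AVX2 branch return the finished count directly for 4- and 8-byte types, while smaller types fall through to the scalar loop for the last few bytes. A self-contained check (illustration only, not STL code):

    #include <bit>
    #include <cassert>

    int main() {
        // Two matching uint16_t elements each set two bits (one per byte) in the
        // movemask, so popcount yields matching bytes, not elements.
        const unsigned int bingo = 0b1100'0000'0000'0011u;
        const int matched_bytes  = std::popcount(bingo); // 4
        const int shift          = 1; // log2(sizeof(uint16_t)), i.e. _Traits::_Shift
        assert(matched_bytes >> shift == 2); // bytes -> elements
    }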