diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 64605f496..1a5071ea5 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -110,6 +110,7 @@ endfunction() add_benchmark(bitset_to_string src/bitset_to_string.cpp) add_benchmark(locale_classic src/locale_classic.cpp) +add_benchmark(minmax_element src/minmax_element.cpp) add_benchmark(path_lexically_normal src/path_lexically_normal.cpp) add_benchmark(priority_queue_push_range src/priority_queue_push_range.cpp) add_benchmark(random_integer_generation src/random_integer_generation.cpp) diff --git a/benchmarks/src/minmax_element.cpp b/benchmarks/src/minmax_element.cpp new file mode 100644 index 000000000..da6cf9aff --- /dev/null +++ b/benchmarks/src/minmax_element.cpp @@ -0,0 +1,86 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include +#include +#include +#include + +enum class Op { + Min, + Max, + Both, +}; + +using namespace std; + +template +void bm(benchmark::State& state) { + T a[Size]; + + mt19937 gen(84710); + + if constexpr (is_floating_point_v) { + normal_distribution dis(0, 10000.0); + ranges::generate(a, [&] { return dis(gen); }); + } else { + uniform_int_distribution> dis(1, 20); + ranges::generate(a, [&] { return static_cast(dis(gen)); }); + } + + for (auto _ : state) { + if constexpr (Operation == Op::Min) { + benchmark::DoNotOptimize(ranges::min_element(a)); + } else if constexpr (Operation == Op::Max) { + benchmark::DoNotOptimize(ranges::max_element(a)); + } else if constexpr (Operation == Op::Both) { + benchmark::DoNotOptimize(ranges::minmax_element(a)); + } + } +} + +BENCHMARK(bm); +BENCHMARK(bm); +BENCHMARK(bm); + +BENCHMARK(bm); +BENCHMARK(bm); +BENCHMARK(bm); + +BENCHMARK(bm); +BENCHMARK(bm); +BENCHMARK(bm); + +BENCHMARK(bm); +BENCHMARK(bm); +BENCHMARK(bm); + +BENCHMARK(bm); +BENCHMARK(bm); +BENCHMARK(bm); + +BENCHMARK(bm); +BENCHMARK(bm); +BENCHMARK(bm); + +BENCHMARK(bm); +BENCHMARK(bm); +BENCHMARK(bm); + +BENCHMARK(bm); +BENCHMARK(bm); +BENCHMARK(bm); + +BENCHMARK(bm); +BENCHMARK(bm); +BENCHMARK(bm); + +BENCHMARK(bm); +BENCHMARK(bm); +BENCHMARK(bm); + + +BENCHMARK_MAIN(); diff --git a/stl/inc/algorithm b/stl/inc/algorithm index 2a17760a9..5140b2640 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -54,6 +54,8 @@ _Min_max_element_t __stdcall __std_minmax_element_1(const void* _First, const vo _Min_max_element_t __stdcall __std_minmax_element_2(const void* _First, const void* _Last, bool _Signed) noexcept; _Min_max_element_t __stdcall __std_minmax_element_4(const void* _First, const void* _Last, bool _Signed) noexcept; _Min_max_element_t __stdcall __std_minmax_element_8(const void* _First, const void* _Last, bool _Signed) noexcept; +_Min_max_element_t __stdcall __std_minmax_element_f(const void* _First, const void* _Last, bool _Unused) noexcept; +_Min_max_element_t __stdcall __std_minmax_element_d(const void* _First, const void* _Last, bool _Unused) noexcept; const void* __stdcall __std_find_last_trivial_1(const void* _First, const void* _Last, uint8_t _Val) noexcept; const void* __stdcall __std_find_last_trivial_2(const void* _First, const void* _Last, uint16_t _Val) noexcept; @@ -68,7 +70,11 @@ _STD pair<_Ty*, _Ty*> __std_minmax_element(_Ty* _First, _Ty* _Last) noexcept { _Min_max_element_t _Res; - if constexpr (sizeof(_Ty) == 1) { + if constexpr (_STD is_same_v<_STD remove_const_t<_Ty>, float>) { + _Res = ::__std_minmax_element_f(_First, _Last, false); + } else if constexpr (_STD _Is_any_of_v<_STD remove_const_t<_Ty>, double, long double>) { + _Res = ::__std_minmax_element_d(_First, _Last, false); + } else if constexpr (sizeof(_Ty) == 1) { _Res = ::__std_minmax_element_1(_First, _Last, _Signed); } else if constexpr (sizeof(_Ty) == 2) { _Res = ::__std_minmax_element_2(_First, _Last, _Signed); diff --git a/stl/inc/xutility b/stl/inc/xutility index 08e73238d..f2634616e 100644 --- a/stl/inc/xutility +++ b/stl/inc/xutility @@ -48,6 +48,18 @@ _STL_DISABLE_CLANG_WARNINGS #endif // ^^^ _USE_STD_VECTOR_ALGORITHMS != 0 ^^^ #endif // ^^^ no support for vector algorithms ^^^ +#ifndef _USE_STD_VECTOR_FLOATING_ALGORITHMS +#if _USE_STD_VECTOR_ALGORITHMS && !defined(_M_FP_EXCEPT) +#define _USE_STD_VECTOR_FLOATING_ALGORITHMS 1 +#else // ^^^ use vector algorithms and fast math / not use vector algorithms or not use fast math vvv +#define _USE_STD_VECTOR_FLOATING_ALGORITHMS 0 +#endif // ^^^ not use vector algorithms or not use fast math ^^^ +#else // ^^^ !defined(_USE_STD_VECTOR_FLOATING_ALGORITHMS) / defined(_USE_STD_VECTOR_FLOATING_ALGORITHMS) vvv +#if _USE_STD_VECTOR_FLOATING_ALGORITHMS && !_USE_STD_VECTOR_ALGORITHMS +#error _USE_STD_VECTOR_FLOATING_ALGORITHMS must imply _USE_STD_VECTOR_ALGORITHMS. +#endif // _USE_STD_VECTOR_FLOATING_ALGORITHMS && !_USE_STD_VECTOR_ALGORITHMS +#endif // ^^^ defined(_USE_STD_VECTOR_FLOATING_ALGORITHMS) ^^^ + #if _USE_STD_VECTOR_ALGORITHMS extern "C" { // The "noalias" attribute tells the compiler optimizer that pointers going into these hand-vectorized algorithms @@ -87,11 +99,15 @@ const void* __stdcall __std_min_element_1(const void* _First, const void* _Last, const void* __stdcall __std_min_element_2(const void* _First, const void* _Last, bool _Signed) noexcept; const void* __stdcall __std_min_element_4(const void* _First, const void* _Last, bool _Signed) noexcept; const void* __stdcall __std_min_element_8(const void* _First, const void* _Last, bool _Signed) noexcept; +const void* __stdcall __std_min_element_f(const void* _First, const void* _Last, bool _Unused) noexcept; +const void* __stdcall __std_min_element_d(const void* _First, const void* _Last, bool _Unused) noexcept; const void* __stdcall __std_max_element_1(const void* _First, const void* _Last, bool _Signed) noexcept; const void* __stdcall __std_max_element_2(const void* _First, const void* _Last, bool _Signed) noexcept; const void* __stdcall __std_max_element_4(const void* _First, const void* _Last, bool _Signed) noexcept; const void* __stdcall __std_max_element_8(const void* _First, const void* _Last, bool _Signed) noexcept; +const void* __stdcall __std_max_element_f(const void* _First, const void* _Last, bool _Unused) noexcept; +const void* __stdcall __std_max_element_d(const void* _First, const void* _Last, bool _Unused) noexcept; } // extern "C" _STD_BEGIN @@ -158,7 +174,11 @@ template _Ty* __std_min_element(_Ty* _First, _Ty* _Last) noexcept { constexpr bool _Signed = _STD is_signed_v<_Ty>; - if constexpr (sizeof(_Ty) == 1) { + if constexpr (_STD is_same_v<_STD remove_const_t<_Ty>, float>) { + return const_cast<_Ty*>(static_cast(::__std_min_element_f(_First, _Last, false))); + } else if constexpr (_STD _Is_any_of_v<_STD remove_const_t<_Ty>, double, long double>) { + return const_cast<_Ty*>(static_cast(::__std_min_element_d(_First, _Last, false))); + } else if constexpr (sizeof(_Ty) == 1) { return const_cast<_Ty*>(static_cast(::__std_min_element_1(_First, _Last, _Signed))); } else if constexpr (sizeof(_Ty) == 2) { return const_cast<_Ty*>(static_cast(::__std_min_element_2(_First, _Last, _Signed))); @@ -175,7 +195,11 @@ template _Ty* __std_max_element(_Ty* _First, _Ty* _Last) noexcept { constexpr bool _Signed = _STD is_signed_v<_Ty>; - if constexpr (sizeof(_Ty) == 1) { + if constexpr (_STD is_same_v<_STD remove_const_t<_Ty>, float>) { + return const_cast<_Ty*>(static_cast(::__std_max_element_f(_First, _Last, false))); + } else if constexpr (_STD _Is_any_of_v<_STD remove_const_t<_Ty>, double, long double>) { + return const_cast<_Ty*>(static_cast(::__std_max_element_d(_First, _Last, false))); + } else if constexpr (sizeof(_Ty) == 1) { return const_cast<_Ty*>(static_cast(::__std_max_element_1(_First, _Last, _Signed))); } else if constexpr (sizeof(_Ty) == 2) { return const_cast<_Ty*>(static_cast(::__std_max_element_2(_First, _Last, _Signed))); @@ -6607,7 +6631,15 @@ template > _INLINE_VAR constexpr bool _Is_min_max_optimization_safe = // Activate the vector algorithms for min_/max_element? _Iterator_is_contiguous<_Iter> // The iterator must be contiguous so we can get raw pointers. && !_Iterator_is_volatile<_Iter> // The iterator must not be volatile. - && conjunction_v, is_pointer<_Elem>>, // Element is of integral or pointer type. + && conjunction_v, is_same<_Elem, double>, +#else // ^^^ 80-bit long double (not supported by MSVC in general, see GH-1316) / 64-bit long double vvv + is_floating_point<_Elem>, // Element is floating point or... +#endif // ^^^ 64-bit long double ^^^ +#endif // _USE_STD_VECTOR_FLOATING_ALGORITHMS + is_integral<_Elem>, is_pointer<_Elem>>, // ... integral or pointer type. disjunction< // And either of the following: #if _HAS_CXX20 is_same<_Pr, _RANGES less>, // predicate is ranges::less diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 70a4e5b45..66f889575 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -545,6 +545,8 @@ namespace { }; struct _Minmax_traits_1 { + static constexpr bool _Is_floating = false; + using _Signed_t = int8_t; using _Unsigned_t = uint8_t; @@ -555,6 +557,10 @@ namespace { static constexpr bool _Has_portion_max = true; static constexpr size_t _Portion_max = 256; + static __m128i _Load(const void* _Src) noexcept { + return _mm_loadu_si128(reinterpret_cast(_Src)); + } + static __m128i _Sign_correction(const __m128i _Val, const bool _Sign) noexcept { alignas(16) static constexpr _Unsigned_t _Sign_corrections[2][16] = { {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, {}}; @@ -610,6 +616,10 @@ namespace { return _mm_cmpgt_epi8(_First, _Second); } + static __m128i _Cmp_eq_idx(const __m128i _First, const __m128i _Second) noexcept { + return _mm_cmpeq_epi8(_First, _Second); + } + static __m128i _Min(const __m128i _First, const __m128i _Second, __m128i) noexcept { return _mm_min_epi8(_First, _Second); } @@ -617,10 +627,16 @@ namespace { static __m128i _Max(const __m128i _First, const __m128i _Second, __m128i) noexcept { return _mm_max_epi8(_First, _Second); } + + static __m128i _Mask_cast(__m128i _Mask) noexcept { + return _Mask; + } #endif // !_M_ARM64EC }; struct _Minmax_traits_2 { + static constexpr bool _Is_floating = false; + using _Signed_t = int16_t; using _Unsigned_t = uint16_t; @@ -631,6 +647,10 @@ namespace { static constexpr bool _Has_portion_max = true; static constexpr size_t _Portion_max = 65536; + static __m128i _Load(const void* _Src) noexcept { + return _mm_loadu_si128(reinterpret_cast(_Src)); + } + static __m128i _Sign_correction(const __m128i _Val, const bool _Sign) noexcept { alignas(16) static constexpr _Unsigned_t _Sign_corrections[2][8] = { 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, {}}; @@ -687,6 +707,10 @@ namespace { return _mm_cmpgt_epi16(_First, _Second); } + static __m128i _Cmp_eq_idx(const __m128i _First, const __m128i _Second) noexcept { + return _mm_cmpeq_epi16(_First, _Second); + } + static __m128i _Min(const __m128i _First, const __m128i _Second, __m128i) noexcept { return _mm_min_epi16(_First, _Second); } @@ -694,10 +718,16 @@ namespace { static __m128i _Max(const __m128i _First, const __m128i _Second, __m128i) noexcept { return _mm_max_epi16(_First, _Second); } + + static __m128i _Mask_cast(__m128i _Mask) noexcept { + return _Mask; + } #endif // !_M_ARM64EC }; struct _Minmax_traits_4 { + static constexpr bool _Is_floating = false; + using _Signed_t = int32_t; using _Unsigned_t = uint32_t; @@ -712,6 +742,10 @@ namespace { static constexpr size_t _Portion_max = 0x1'0000'0000ULL; #endif // ^^^ 64-bit ^^^ + static __m128i _Load(const void* _Src) noexcept { + return _mm_loadu_si128(reinterpret_cast(_Src)); + } + static __m128i _Sign_correction(const __m128i _Val, const bool _Sign) noexcept { alignas(16) static constexpr _Unsigned_t _Sign_corrections[2][4] = { 0x8000'0000UL, 0x8000'0000UL, 0x8000'0000UL, 0x8000'0000UL, {}}; @@ -764,6 +798,10 @@ namespace { return _mm_cmpgt_epi32(_First, _Second); } + static __m128i _Cmp_eq_idx(const __m128i _First, const __m128i _Second) noexcept { + return _mm_cmpeq_epi32(_First, _Second); + } + static __m128i _Min(const __m128i _First, const __m128i _Second, __m128i) noexcept { return _mm_min_epi32(_First, _Second); } @@ -771,10 +809,16 @@ namespace { static __m128i _Max(const __m128i _First, const __m128i _Second, __m128i) noexcept { return _mm_max_epi32(_First, _Second); } + + static __m128i _Mask_cast(__m128i _Mask) noexcept { + return _Mask; + } #endif // !_M_ARM64EC }; struct _Minmax_traits_8 { + static constexpr bool _Is_floating = false; + using _Signed_t = int64_t; using _Unsigned_t = uint64_t; @@ -784,6 +828,10 @@ namespace { #ifndef _M_ARM64EC static constexpr bool _Has_portion_max = false; + static __m128i _Load(const void* _Src) noexcept { + return _mm_loadu_si128(reinterpret_cast(_Src)); + } + static __m128i _Sign_correction(const __m128i _Val, const bool _Sign) noexcept { alignas(16) static constexpr _Unsigned_t _Sign_corrections[2][2] = { 0x8000'0000'0000'0000ULL, 0x8000'0000'0000'0000ULL, {}}; @@ -844,6 +892,10 @@ namespace { return _mm_cmpgt_epi64(_First, _Second); } + static __m128i _Cmp_eq_idx(const __m128i _First, const __m128i _Second) noexcept { + return _mm_cmpeq_epi64(_First, _Second); + } + static __m128i _Min(const __m128i _First, const __m128i _Second, const __m128i _Mask) noexcept { return _mm_blendv_epi8(_First, _Second, _Mask); } @@ -851,6 +903,206 @@ namespace { static __m128i _Max(const __m128i _First, const __m128i _Second, const __m128i _Mask) noexcept { return _mm_blendv_epi8(_First, _Second, _Mask); } + + static __m128i _Mask_cast(__m128i _Mask) noexcept { + return _Mask; + } +#endif // !_M_ARM64EC + }; + + struct _Minmax_traits_f { + static constexpr bool _Is_floating = true; + + using _Signed_t = float; + + static constexpr _Signed_t _Init_min_val = __builtin_huge_valf(); + static constexpr _Signed_t _Init_max_val = -__builtin_huge_valf(); + +#ifndef _M_ARM64EC +#ifdef _M_IX86 + static constexpr bool _Has_portion_max = false; +#else // ^^^ 32-bit / 64-bit vvv + static constexpr bool _Has_portion_max = true; + static constexpr size_t _Portion_max = 0x1'0000'0000ULL; +#endif // ^^^ 64-bit ^^^ + + static __m128 _Load(const void* _Src) noexcept { + return _mm_loadu_ps(reinterpret_cast(_Src)); + } + + static __m128 _Sign_correction(const __m128 _Val, bool) noexcept { + return _Val; + } + + static __m128i _Inc(__m128i _Idx) noexcept { + return _mm_add_epi32(_Idx, _mm_set1_epi32(1)); + } + + template + static __m128 _H_func(const __m128 _Cur, _Fn _Funct) noexcept { + __m128 _H_min_val = _Cur; + _H_min_val = _Funct(_H_min_val, _mm_shuffle_ps(_H_min_val, _H_min_val, _MM_SHUFFLE(1, 0, 3, 2))); + _H_min_val = _Funct(_H_min_val, _mm_shuffle_ps(_H_min_val, _H_min_val, _MM_SHUFFLE(2, 3, 0, 1))); + return _H_min_val; + } + + template + static __m128i _H_func_u(const __m128i _Cur, _Fn _Funct) noexcept { + __m128i _H_min_val = _Cur; + _H_min_val = _Funct(_H_min_val, _mm_shuffle_epi32(_H_min_val, _MM_SHUFFLE(1, 0, 3, 2))); + _H_min_val = _Funct(_H_min_val, _mm_shuffle_epi32(_H_min_val, _MM_SHUFFLE(2, 3, 0, 1))); + return _H_min_val; + } + + static __m128 _H_min(const __m128 _Cur) noexcept { + return _H_func(_Cur, [](__m128 _First, __m128 _Second) { return _mm_min_ps(_First, _Second); }); + } + + static __m128 _H_max(const __m128 _Cur) noexcept { + return _H_func(_Cur, [](__m128 _First, __m128 _Second) { return _mm_max_ps(_First, _Second); }); + } + + static __m128i _H_min_u(const __m128i _Cur) noexcept { + return _H_func_u(_Cur, [](__m128i _First, __m128i _Second) { return _mm_min_epu32(_First, _Second); }); + } + + static __m128i _H_max_u(const __m128i _Cur) noexcept { + return _H_func_u(_Cur, [](__m128i _First, __m128i _Second) { return _mm_max_epu32(_First, _Second); }); + } + + static float _Get_any(const __m128 _Cur) noexcept { + return _mm_cvtss_f32(_Cur); + } + + static uint32_t _Get_v_pos(const __m128i _Idx, const unsigned long _H_pos) noexcept { + uint32_t _Array[4]; + _mm_storeu_si128(reinterpret_cast<__m128i*>(&_Array), _Idx); + return _Array[_H_pos >> 2]; + } + + static __m128 _Cmp_eq(const __m128 _First, const __m128 _Second) noexcept { + return _mm_cmpeq_ps(_First, _Second); + } + + static __m128 _Cmp_gt(const __m128 _First, const __m128 _Second) noexcept { + return _mm_cmpgt_ps(_First, _Second); + } + + static __m128i _Cmp_eq_idx(const __m128i _First, const __m128i _Second) noexcept { + return _mm_cmpeq_epi32(_First, _Second); + } + + static __m128 _Min(const __m128 _First, const __m128 _Second, __m128) noexcept { + return _mm_min_ps(_First, _Second); + } + + static __m128 _Max(const __m128 _First, const __m128 _Second, __m128) noexcept { + return _mm_max_ps(_First, _Second); + } + + static __m128i _Mask_cast(__m128 _Mask) noexcept { + return _mm_castps_si128(_Mask); + } +#endif // !_M_ARM64EC + }; + + struct _Minmax_traits_d { + static constexpr bool _Is_floating = true; + + using _Signed_t = double; + + static constexpr _Signed_t _Init_min_val = __builtin_huge_val(); + static constexpr _Signed_t _Init_max_val = -__builtin_huge_val(); + +#ifndef _M_ARM64EC + static constexpr bool _Has_portion_max = false; + + static __m128d _Load(const void* _Src) noexcept { + return _mm_loadu_pd(reinterpret_cast(_Src)); + } + + static __m128d _Sign_correction(const __m128d _Val, bool) noexcept { + return _Val; + } + + static __m128i _Inc(__m128i _Idx) noexcept { + return _mm_add_epi64(_Idx, _mm_set1_epi64x(1)); + } + + template + static __m128d _H_func(const __m128d _Cur, _Fn _Funct) noexcept { + __m128d _H_min_val = _Cur; + _H_min_val = _Funct(_H_min_val, _mm_shuffle_pd(_H_min_val, _H_min_val, 1)); + return _H_min_val; + } + + template + static __m128i _H_func_u(const __m128i _Cur, _Fn _Funct) noexcept { + uint64_t _H_min_a = _Get_any_u(_Cur); + uint64_t _H_min_b = _Get_any_u(_mm_bsrli_si128(_Cur, 8)); + if (_Funct(_H_min_b, _H_min_a)) { + _H_min_a = _H_min_b; + } + return _mm_set1_epi64x(_H_min_a); + } + + static __m128d _H_min(const __m128d _Cur) noexcept { + return _H_func(_Cur, [](__m128d _First, __m128d _Second) { return _mm_min_pd(_First, _Second); }); + } + + static __m128d _H_max(const __m128d _Cur) noexcept { + return _H_func(_Cur, [](__m128d _First, __m128d _Second) { return _mm_max_pd(_First, _Second); }); + } + + static __m128i _H_min_u(const __m128i _Cur) noexcept { + return _H_func_u(_Cur, [](uint64_t _Lhs, uint64_t _Rhs) { return _Lhs < _Rhs; }); + } + + static __m128i _H_max_u(const __m128i _Cur) noexcept { + return _H_func_u(_Cur, [](uint64_t _Lhs, uint64_t _Rhs) { return _Lhs > _Rhs; }); + } + static double _Get_any(const __m128d _Cur) noexcept { + return _mm_cvtsd_f64(_Cur); + } + + static uint64_t _Get_any_u(const __m128i _Cur) noexcept { +#ifdef _M_IX86 + return (static_cast(static_cast(_mm_extract_epi32(_Cur, 1))) << 32) + | static_cast(static_cast(_mm_cvtsi128_si32(_Cur))); +#else // ^^^ x86 / x64 vvv + return static_cast(_mm_cvtsi128_si64(_Cur)); +#endif // ^^^ x64 ^^^ + } + + static uint64_t _Get_v_pos(const __m128i _Idx, const unsigned long _H_pos) noexcept { + uint64_t _Array[2]; + _mm_storeu_si128(reinterpret_cast<__m128i*>(&_Array), _Idx); + return _Array[_H_pos >> 3]; + } + + static __m128d _Cmp_eq(const __m128d _First, const __m128d _Second) noexcept { + return _mm_cmpeq_pd(_First, _Second); + } + + static __m128d _Cmp_gt(const __m128d _First, const __m128d _Second) noexcept { + return _mm_cmpgt_pd(_First, _Second); + } + + static __m128i _Cmp_eq_idx(const __m128i _First, const __m128i _Second) noexcept { + return _mm_cmpeq_epi64(_First, _Second); + } + + static __m128d _Min(const __m128d _First, const __m128d _Second, __m128d) noexcept { + return _mm_min_pd(_First, _Second); + } + + static __m128d _Max(const __m128d _First, const __m128d _Second, __m128d) noexcept { + return _mm_max_pd(_First, _Second); + } + + static __m128i _Mask_cast(__m128d _Mask) noexcept { + return _mm_castpd_si128(_Mask); + } #endif // !_M_ARM64EC }; @@ -882,13 +1134,12 @@ namespace { _Advance_bytes(_Stop_at, _Portion_byte_size); // Load values and if unsigned adjust them to be signed (for signed vector comparisons) - __m128i _Cur_vals = - _Traits::_Sign_correction(_mm_loadu_si128(reinterpret_cast(_First)), _Sign); - __m128i _Cur_vals_min = _Cur_vals; // vector of vertical minimum values - __m128i _Cur_idx_min = _mm_setzero_si128(); // vector of vertical minimum indices - __m128i _Cur_vals_max = _Cur_vals; // vector of vertical maximum values - __m128i _Cur_idx_max = _mm_setzero_si128(); // vector of vertical maximum indices - __m128i _Cur_idx = _mm_setzero_si128(); // current vector of indices + auto _Cur_vals = _Traits::_Sign_correction(_Traits::_Load(_First), _Sign); + auto _Cur_vals_min = _Cur_vals; // vector of vertical minimum values + auto _Cur_idx_min = _mm_setzero_si128(); // vector of vertical minimum indices + auto _Cur_vals_max = _Cur_vals; // vector of vertical maximum values + auto _Cur_idx_max = _mm_setzero_si128(); // vector of vertical maximum indices + auto _Cur_idx = _mm_setzero_si128(); // current vector of indices for (;;) { _Advance_bytes(_First, 16); @@ -901,21 +1152,21 @@ namespace { // Compute horizontal min and/or max. Determine horizontal and vertical position of it. if constexpr ((_Mode & _Mode_min) != 0) { - const __m128i _H_min = - _Traits::_H_min(_Cur_vals_min); // Vector populated by the smallest element + const auto _H_min = _Traits::_H_min(_Cur_vals_min); // Vector populated by the smallest element const auto _H_min_val = _Traits::_Get_any(_H_min); // Get any element of it if (_H_min_val < _Cur_min_val) { // Current horizontal min is less than the old _Cur_min_val = _H_min_val; // update min - const __m128i _Eq_mask = + const auto _Eq_mask = _Traits::_Cmp_eq(_H_min, _Cur_vals_min); // Mask of all elems eq to min - int _Mask = _mm_movemask_epi8(_Eq_mask); + int _Mask = _mm_movemask_epi8(_Traits::_Mask_cast(_Eq_mask)); // Indices of minimum elements or the greatest index if none - const __m128i _All_max = _mm_set1_epi8(static_cast(0xFF)); - const __m128i _Idx_min_val = _mm_blendv_epi8(_All_max, _Cur_idx_min, _Eq_mask); - __m128i _Idx_min = _Traits::_H_min_u(_Idx_min_val); // The smallest indices + const auto _All_max = _mm_set1_epi8(static_cast(0xFF)); + const auto _Idx_min_val = + _mm_blendv_epi8(_All_max, _Cur_idx_min, _Traits::_Mask_cast(_Eq_mask)); + auto _Idx_min = _Traits::_H_min_u(_Idx_min_val); // The smallest indices // Select the smallest vertical indices from the smallest element mask - _Mask &= _mm_movemask_epi8(_Traits::_Cmp_eq(_Idx_min, _Idx_min_val)); + _Mask &= _mm_movemask_epi8(_Traits::_Cmp_eq_idx(_Idx_min, _Idx_min_val)); unsigned long _H_pos; // Find the smallest horizontal index @@ -928,8 +1179,7 @@ namespace { } if constexpr ((_Mode & _Mode_max) != 0) { - const __m128i _H_max = - _Traits::_H_max(_Cur_vals_max); // Vector populated by the largest element + const auto _H_max = _Traits::_H_max(_Cur_vals_max); // Vector populated by the largest element const auto _H_max_val = _Traits::_Get_any(_H_max); // Get any element of it if (_Mode == _Mode_both && _Cur_max_val <= _H_max_val @@ -937,19 +1187,19 @@ namespace { // max_element: current horizontal max is greater than the old, update max // minmax_element: current horizontal max is not less than the old, update max _Cur_max_val = _H_max_val; - const __m128i _Eq_mask = + const auto _Eq_mask = _Traits::_Cmp_eq(_H_max, _Cur_vals_max); // Mask of all elems eq to max - int _Mask = _mm_movemask_epi8(_Eq_mask); + int _Mask = _mm_movemask_epi8(_Traits::_Mask_cast(_Eq_mask)); unsigned long _H_pos; if constexpr (_Mode == _Mode_both) { // Looking for the last occurrence of maximum // Indices of maximum elements or zero if none - const __m128i _Idx_max_val = - _mm_blendv_epi8(_mm_setzero_si128(), _Cur_idx_max, _Eq_mask); - const __m128i _Idx_max = _Traits::_H_max_u(_Idx_max_val); // The greatest indices + const auto _Idx_max_val = + _mm_blendv_epi8(_mm_setzero_si128(), _Cur_idx_max, _Traits::_Mask_cast(_Eq_mask)); + const auto _Idx_max = _Traits::_H_max_u(_Idx_max_val); // The greatest indices // Select the greatest vertical indices from the largest element mask - _Mask &= _mm_movemask_epi8(_Traits::_Cmp_eq(_Idx_max, _Idx_max_val)); + _Mask &= _mm_movemask_epi8(_Traits::_Cmp_eq_idx(_Idx_max, _Idx_max_val)); // Find the largest horizontal index _BitScanReverse(&_H_pos, _Mask); // lgtm [cpp/conditionallyuninitializedvariable] @@ -958,11 +1208,12 @@ namespace { } else { // Looking for the first occurrence of maximum // Indices of maximum elements or the greatest index if none - const __m128i _All_max = _mm_set1_epi8(static_cast(0xFF)); - const __m128i _Idx_max_val = _mm_blendv_epi8(_All_max, _Cur_idx_max, _Eq_mask); - const __m128i _Idx_max = _Traits::_H_min_u(_Idx_max_val); // The smallest indices + const auto _All_max = _mm_set1_epi8(static_cast(0xFF)); + const auto _Idx_max_val = + _mm_blendv_epi8(_All_max, _Cur_idx_max, _Traits::_Mask_cast(_Eq_mask)); + const auto _Idx_max = _Traits::_H_min_u(_Idx_max_val); // The smallest indices // Select the smallest vertical indices from the largest element mask - _Mask &= _mm_movemask_epi8(_Traits::_Cmp_eq(_Idx_max, _Idx_max_val)); + _Mask &= _mm_movemask_epi8(_Traits::_Cmp_eq_idx(_Idx_max, _Idx_max_val)); // Find the smallest horizontal index _BitScanForward(&_H_pos, _Mask); // lgtm [cpp/conditionallyuninitializedvariable] @@ -991,8 +1242,7 @@ namespace { // Indices will be relative to the new base _Base = static_cast(_First); // Load values and if unsigned adjust them to be signed (for signed vector comparisons) - _Cur_vals = - _Traits::_Sign_correction(_mm_loadu_si128(reinterpret_cast(_First)), _Sign); + _Cur_vals = _Traits::_Sign_correction(_Traits::_Load(_First), _Sign); if constexpr ((_Mode & _Mode_min) != 0) { _Cur_vals_min = _Cur_vals; @@ -1012,55 +1262,67 @@ namespace { // This is the main part, finding vertical minimum/maximum // Load values and if unsigned adjust them to be signed (for signed vector comparisons) - _Cur_vals = _Traits::_Sign_correction(_mm_loadu_si128(reinterpret_cast(_First)), _Sign); + _Cur_vals = _Traits::_Sign_correction(_Traits::_Load(_First), _Sign); if constexpr ((_Mode & _Mode_min) != 0) { // Looking for the first occurrence of minimum, don't overwrite with newly found occurrences - const __m128i _Is_less = _Traits::_Cmp_gt(_Cur_vals_min, _Cur_vals); // _Cur_vals < _Cur_vals_min - _Cur_idx_min = _mm_blendv_epi8(_Cur_idx_min, _Cur_idx, _Is_less); // Remember their vertical indices + const auto _Is_less = _Traits::_Cmp_gt(_Cur_vals_min, _Cur_vals); // _Cur_vals < _Cur_vals_min + _Cur_idx_min = _mm_blendv_epi8( + _Cur_idx_min, _Cur_idx, _Traits::_Mask_cast(_Is_less)); // Remember their vertical indices _Cur_vals_min = _Traits::_Min(_Cur_vals_min, _Cur_vals, _Is_less); // Update the current minimum } if constexpr (_Mode == _Mode_max) { // Looking for the first occurrence of maximum, don't overwrite with newly found occurrences - const __m128i _Is_greater = _Traits::_Cmp_gt(_Cur_vals, _Cur_vals_max); // _Cur_vals > _Cur_vals_max - _Cur_idx_max = - _mm_blendv_epi8(_Cur_idx_max, _Cur_idx, _Is_greater); // Remember their vertical indices + const auto _Is_greater = _Traits::_Cmp_gt(_Cur_vals, _Cur_vals_max); // _Cur_vals > _Cur_vals_max + _Cur_idx_max = _mm_blendv_epi8( + _Cur_idx_max, _Cur_idx, _Traits::_Mask_cast(_Is_greater)); // Remember their vertical indices _Cur_vals_max = _Traits::_Max(_Cur_vals_max, _Cur_vals, _Is_greater); // Update the current maximum } else if constexpr (_Mode == _Mode_both) { // Looking for the last occurrence of maximum, do overwrite with newly found occurrences - const __m128i _Is_less = - _Traits::_Cmp_gt(_Cur_vals_max, _Cur_vals); // !(_Cur_vals >= _Cur_vals_max) - _Cur_idx_max = _mm_blendv_epi8(_Cur_idx, _Cur_idx_max, _Is_less); // Remember their vertical indices + const auto _Is_less = _Traits::_Cmp_gt(_Cur_vals_max, _Cur_vals); // !(_Cur_vals >= _Cur_vals_max) + _Cur_idx_max = _mm_blendv_epi8(_Cur_idx, _Cur_idx_max, + _Traits::_Mask_cast(_Is_less)); // Remember their vertical indices _Cur_vals_max = _Traits::_Max(_Cur_vals, _Cur_vals_max, _Is_less); // Update the current maximum } } } #endif // !_M_ARM64EC - using _STy = _Traits::_Signed_t; - using _UTy = _Traits::_Unsigned_t; - constexpr _UTy _Correction = _UTy{1} << (sizeof(_UTy) * 8 - 1); - - if constexpr (_Mode == _Mode_min) { - if (_Sign) { - return _Min_tail(_First, _Last, _Res._Min, static_cast<_STy>(_Cur_min_val)); + if constexpr (_Traits::_Is_floating) { + if constexpr (_Mode == _Mode_min) { + return _Min_tail(_First, _Last, _Res._Min, _Cur_min_val); + } else if constexpr (_Mode == _Mode_max) { + return _Max_tail(_First, _Last, _Res._Max, _Cur_max_val); } else { - return _Min_tail(_First, _Last, _Res._Min, static_cast<_UTy>(_Cur_min_val + _Correction)); - } - } else if constexpr (_Mode == _Mode_max) { - if (_Sign) { - return _Max_tail(_First, _Last, _Res._Max, static_cast<_STy>(_Cur_max_val)); - } else { - return _Max_tail(_First, _Last, _Res._Max, static_cast<_UTy>(_Cur_max_val + _Correction)); + return _Both_tail(_First, _Last, _Res, _Cur_min_val, _Cur_max_val); } } else { - if (_Sign) { - return _Both_tail( - _First, _Last, _Res, static_cast<_STy>(_Cur_min_val), static_cast<_STy>(_Cur_max_val)); + using _STy = _Traits::_Signed_t; + using _UTy = _Traits::_Unsigned_t; + + constexpr _UTy _Correction = _UTy{1} << (sizeof(_UTy) * 8 - 1); + + if constexpr (_Mode == _Mode_min) { + if (_Sign) { + return _Min_tail(_First, _Last, _Res._Min, static_cast<_STy>(_Cur_min_val)); + } else { + return _Min_tail(_First, _Last, _Res._Min, static_cast<_UTy>(_Cur_min_val + _Correction)); + } + } else if constexpr (_Mode == _Mode_max) { + if (_Sign) { + return _Max_tail(_First, _Last, _Res._Max, static_cast<_STy>(_Cur_max_val)); + } else { + return _Max_tail(_First, _Last, _Res._Max, static_cast<_UTy>(_Cur_max_val + _Correction)); + } } else { - return _Both_tail(_First, _Last, _Res, static_cast<_UTy>(_Cur_min_val + _Correction), - static_cast<_UTy>(_Cur_max_val + _Correction)); + if (_Sign) { + return _Both_tail( + _First, _Last, _Res, static_cast<_STy>(_Cur_min_val), static_cast<_STy>(_Cur_max_val)); + } else { + return _Both_tail(_First, _Last, _Res, static_cast<_UTy>(_Cur_min_val + _Correction), + static_cast<_UTy>(_Cur_max_val + _Correction)); + } } } } @@ -1089,6 +1351,16 @@ const void* __stdcall __std_min_element_8( return _Minmax_element<_Mode_min, _Minmax_traits_8>(_First, _Last, _Signed); } +const void* __stdcall __std_min_element_f( + const void* const _First, const void* const _Last, const bool _Unused) noexcept { + return _Minmax_element<_Mode_min, _Minmax_traits_f>(_First, _Last, _Unused); +} + +const void* __stdcall __std_min_element_d( + const void* const _First, const void* const _Last, const bool _Unused) noexcept { + return _Minmax_element<_Mode_min, _Minmax_traits_d>(_First, _Last, _Unused); +} + const void* __stdcall __std_max_element_1( const void* const _First, const void* const _Last, const bool _Signed) noexcept { return _Minmax_element<_Mode_max, _Minmax_traits_1>(_First, _Last, _Signed); @@ -1109,6 +1381,16 @@ const void* __stdcall __std_max_element_8( return _Minmax_element<_Mode_max, _Minmax_traits_8>(_First, _Last, _Signed); } +const void* __stdcall __std_max_element_f( + const void* const _First, const void* const _Last, const bool _Unused) noexcept { + return _Minmax_element<_Mode_max, _Minmax_traits_f>(_First, _Last, _Unused); +} + +const void* __stdcall __std_max_element_d( + const void* const _First, const void* const _Last, const bool _Unused) noexcept { + return _Minmax_element<_Mode_max, _Minmax_traits_d>(_First, _Last, _Unused); +} + _Min_max_element_t __stdcall __std_minmax_element_1( const void* const _First, const void* const _Last, const bool _Signed) noexcept { return _Minmax_element<_Mode_both, _Minmax_traits_1>(_First, _Last, _Signed); @@ -1129,6 +1411,15 @@ _Min_max_element_t __stdcall __std_minmax_element_8( return _Minmax_element<_Mode_both, _Minmax_traits_8>(_First, _Last, _Signed); } +_Min_max_element_t __stdcall __std_minmax_element_f( + const void* const _First, const void* const _Last, const bool _Unused) noexcept { + return _Minmax_element<_Mode_both, _Minmax_traits_f>(_First, _Last, _Unused); +} + +_Min_max_element_t __stdcall __std_minmax_element_d( + const void* const _First, const void* const _Last, const bool _Unused) noexcept { + return _Minmax_element<_Mode_both, _Minmax_traits_d>(_First, _Last, _Unused); +} } // extern "C" namespace { diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index bc46150db..cafb85031 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -178,6 +178,31 @@ void test_min_max_element(mt19937_64& gen) { } } +template +void test_min_max_element_floating(mt19937_64& gen) { + normal_distribution dis(-100000.0, 100000.0); + + constexpr auto input_of_input_size = dataCount / 2; + vector input_of_input(input_of_input_size); + input_of_input[0] = -numeric_limits::infinity(); + input_of_input[1] = +numeric_limits::infinity(); + input_of_input[2] = -0.0; + input_of_input[3] = +0.0; + for (size_t i = 4; i < input_of_input_size; ++i) { + input_of_input[i] = dis(gen); + } + + uniform_int_distribution idx_dis(0, input_of_input_size - 1); + + vector input; + input.reserve(dataCount); + test_case_min_max_element(input); + for (size_t attempts = 0; attempts < dataCount; ++attempts) { + input.push_back(input_of_input[idx_dis(gen)]); + test_case_min_max_element(input); + } +} + void test_min_max_element_pointers(mt19937_64& gen) { const short arr[20]{}; @@ -367,6 +392,10 @@ void test_vector_algorithms(mt19937_64& gen) { test_min_max_element(gen); test_min_max_element(gen); + test_min_max_element_floating(gen); + test_min_max_element_floating(gen); + test_min_max_element_floating(gen); + test_min_max_element_pointers(gen); test_min_max_element_special_cases(); // SSE2 vectors