зеркало из https://github.com/microsoft/STL.git
Generalize vectorized `bitset::to_string` implementation (#5013)
Co-authored-by: Stephan T. Lavavej <stl@nuwen.net>
This commit is contained in:
Родитель
aaca194535
Коммит
bfd5b8ce7b
|
@ -3845,191 +3845,172 @@ __declspec(noalias) void __stdcall __std_replace_8(
|
|||
|
||||
} // extern "C"
|
||||
|
||||
#ifndef _M_ARM64EC
|
||||
namespace {
|
||||
__m256i __forceinline _Bitset_to_string_1_step_avx(const uint32_t _Val, const __m256i _Px0, const __m256i _Px1) {
|
||||
const __m128i _Vx0 = _mm_cvtsi32_si128(_Val);
|
||||
const __m128i _Vx1 = _mm_shuffle_epi8(_Vx0, _mm_set_epi32(0x00000000, 0x01010101, 0x02020202, 0x03030303));
|
||||
const __m256i _Vx2 = _mm256_castsi128_si256(_Vx1);
|
||||
const __m256i _Vx3 = _mm256_permutevar8x32_epi32(_Vx2, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0));
|
||||
const __m256i _Msk = _mm256_and_si256(_Vx3, _mm256_set1_epi64x(0x0102040810204080));
|
||||
const __m256i _Ex0 = _mm256_cmpeq_epi8(_Msk, _mm256_setzero_si256());
|
||||
const __m256i _Ex1 = _mm256_blendv_epi8(_Px1, _Px0, _Ex0);
|
||||
return _Ex1;
|
||||
}
|
||||
namespace __std_bitset_to_string {
|
||||
#ifdef _M_ARM64EC
|
||||
using _Traits_1_avx = void;
|
||||
using _Traits_1_sse = void;
|
||||
using _Traits_2_avx = void;
|
||||
using _Traits_2_sse = void;
|
||||
#else // ^^^ defined(_M_ARM64EC) / !defined(_M_ARM64EC) vvv
|
||||
struct _Traits_1_avx {
|
||||
using _Value_type = uint32_t;
|
||||
|
||||
__m128i __forceinline _Bitset_to_string_1_step(const uint16_t _Val, const __m128i _Px0, const __m128i _Px1) {
|
||||
const __m128i _Vx0 = _mm_cvtsi32_si128(_Val);
|
||||
const __m128i _Vx1 = _mm_shuffle_epi8(_Vx0, _mm_set_epi32(0x00000000, 0x00000000, 0x01010101, 0x01010101));
|
||||
const __m128i _Msk = _mm_and_si128(_Vx1, _mm_set1_epi64x(0x0102040810204080));
|
||||
const __m128i _Ex0 = _mm_cmpeq_epi8(_Msk, _mm_setzero_si128());
|
||||
const __m128i _Ex1 = _mm_blendv_epi8(_Px1, _Px0, _Ex0);
|
||||
return _Ex1;
|
||||
}
|
||||
static __m256i _Set(const char _Val) noexcept {
|
||||
return _mm256_broadcastb_epi8(_mm_cvtsi32_si128(_Val));
|
||||
}
|
||||
|
||||
__m256i __forceinline _Bitset_to_string_2_step_avx(const uint16_t _Val, const __m256i _Px0, const __m256i _Px1) {
|
||||
const __m128i _Vx0 = _mm_cvtsi32_si128(_Val);
|
||||
const __m128i _Vx1 = _mm_shuffle_epi8(_Vx0, _mm_set_epi32(0x00000000, 0x00000000, 0x01010101, 0x01010101));
|
||||
const __m256i _Vx2 = _mm256_castsi128_si256(_Vx1);
|
||||
const __m256i _Vx3 = _mm256_permute4x64_epi64(_Vx2, _MM_SHUFFLE(1, 1, 0, 0));
|
||||
const __m256i _Msk = _mm256_and_si256(
|
||||
_Vx3, _mm256_set_epi64x(0x0001000200040008, 0x0010002000400080, 0x0001000200040008, 0x0010002000400080));
|
||||
const __m256i _Ex0 = _mm256_cmpeq_epi16(_Msk, _mm256_setzero_si256());
|
||||
const __m256i _Ex1 = _mm256_blendv_epi8(_Px1, _Px0, _Ex0);
|
||||
return _Ex1;
|
||||
}
|
||||
static __m256i __forceinline _Step(const uint32_t _Val, const __m256i _Px0, const __m256i _Px1) noexcept {
|
||||
const __m128i _Vx0 = _mm_cvtsi32_si128(_Val);
|
||||
const __m128i _Vx1 =
|
||||
_mm_shuffle_epi8(_Vx0, _mm_set_epi32(0x00000000, 0x01010101, 0x02020202, 0x03030303));
|
||||
const __m256i _Vx2 = _mm256_castsi128_si256(_Vx1);
|
||||
const __m256i _Vx3 = _mm256_permutevar8x32_epi32(_Vx2, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0));
|
||||
const __m256i _Msk = _mm256_and_si256(_Vx3, _mm256_set1_epi64x(0x0102040810204080));
|
||||
const __m256i _Ex0 = _mm256_cmpeq_epi8(_Msk, _mm256_setzero_si256());
|
||||
const __m256i _Ex1 = _mm256_blendv_epi8(_Px1, _Px0, _Ex0);
|
||||
return _Ex1;
|
||||
}
|
||||
|
||||
__m128i __forceinline _Bitset_to_string_2_step(const uint8_t _Val, const __m128i _Px0, const __m128i _Px1) {
|
||||
const __m128i _Vx = _mm_set1_epi16(_Val);
|
||||
const __m128i _Msk = _mm_and_si128(_Vx, _mm_set_epi64x(0x0001000200040008, 0x0010002000400080));
|
||||
const __m128i _Ex0 = _mm_cmpeq_epi16(_Msk, _mm_setzero_si128());
|
||||
const __m128i _Ex1 = _mm_blendv_epi8(_Px1, _Px0, _Ex0);
|
||||
return _Ex1;
|
||||
}
|
||||
} // unnamed namespace
|
||||
static void _Out(void* _Dest, const __m256i _Elems) noexcept {
|
||||
_mm256_storeu_si256(static_cast<__m256i*>(_Dest), _Elems);
|
||||
}
|
||||
};
|
||||
|
||||
struct _Traits_1_sse {
|
||||
using _Value_type = uint16_t;
|
||||
|
||||
static __m128i _Set(const char _Val) noexcept {
|
||||
return _mm_shuffle_epi8(_mm_cvtsi32_si128(_Val), _mm_setzero_si128());
|
||||
}
|
||||
|
||||
static __m128i __forceinline _Step(const uint16_t _Val, const __m128i _Px0, const __m128i _Px1) noexcept {
|
||||
const __m128i _Vx0 = _mm_cvtsi32_si128(_Val);
|
||||
const __m128i _Vx1 =
|
||||
_mm_shuffle_epi8(_Vx0, _mm_set_epi32(0x00000000, 0x00000000, 0x01010101, 0x01010101));
|
||||
const __m128i _Msk = _mm_and_si128(_Vx1, _mm_set1_epi64x(0x0102040810204080));
|
||||
const __m128i _Ex0 = _mm_cmpeq_epi8(_Msk, _mm_setzero_si128());
|
||||
const __m128i _Ex1 = _mm_blendv_epi8(_Px1, _Px0, _Ex0);
|
||||
return _Ex1;
|
||||
}
|
||||
|
||||
static void _Out(void* _Dest, const __m128i _Elems) noexcept {
|
||||
_mm_storeu_si128(static_cast<__m128i*>(_Dest), _Elems);
|
||||
}
|
||||
};
|
||||
|
||||
struct _Traits_2_avx {
|
||||
using _Value_type = uint16_t;
|
||||
|
||||
static __m256i _Set(const wchar_t _Val) noexcept {
|
||||
return _mm256_broadcastw_epi16(_mm_cvtsi32_si128(_Val));
|
||||
}
|
||||
|
||||
static __m256i __forceinline _Step(const uint16_t _Val, const __m256i _Px0, const __m256i _Px1) noexcept {
|
||||
const __m128i _Vx0 = _mm_cvtsi32_si128(_Val);
|
||||
const __m128i _Vx1 =
|
||||
_mm_shuffle_epi8(_Vx0, _mm_set_epi32(0x00000000, 0x00000000, 0x01010101, 0x01010101));
|
||||
const __m256i _Vx2 = _mm256_castsi128_si256(_Vx1);
|
||||
const __m256i _Vx3 = _mm256_permute4x64_epi64(_Vx2, _MM_SHUFFLE(1, 1, 0, 0));
|
||||
const __m256i _Msk = _mm256_and_si256(_Vx3,
|
||||
_mm256_set_epi64x(0x0001000200040008, 0x0010002000400080, 0x0001000200040008, 0x0010002000400080));
|
||||
const __m256i _Ex0 = _mm256_cmpeq_epi16(_Msk, _mm256_setzero_si256());
|
||||
const __m256i _Ex1 = _mm256_blendv_epi8(_Px1, _Px0, _Ex0);
|
||||
return _Ex1;
|
||||
}
|
||||
|
||||
static void _Out(void* _Dest, const __m256i _Elems) noexcept {
|
||||
_mm256_storeu_si256(static_cast<__m256i*>(_Dest), _Elems);
|
||||
}
|
||||
};
|
||||
|
||||
struct _Traits_2_sse {
|
||||
using _Value_type = uint8_t;
|
||||
|
||||
static __m128i _Set(const wchar_t _Val) noexcept {
|
||||
return _mm_set1_epi16(_Val);
|
||||
}
|
||||
|
||||
static __m128i __forceinline _Step(const uint8_t _Val, const __m128i _Px0, const __m128i _Px1) noexcept {
|
||||
const __m128i _Vx = _mm_set1_epi16(_Val);
|
||||
const __m128i _Msk = _mm_and_si128(_Vx, _mm_set_epi64x(0x0001000200040008, 0x0010002000400080));
|
||||
const __m128i _Ex0 = _mm_cmpeq_epi16(_Msk, _mm_setzero_si128());
|
||||
const __m128i _Ex1 = _mm_blendv_epi8(_Px1, _Px0, _Ex0);
|
||||
return _Ex1;
|
||||
}
|
||||
|
||||
static void _Out(void* _Dest, const __m128i _Elems) noexcept {
|
||||
_mm_storeu_si128(static_cast<__m128i*>(_Dest), _Elems);
|
||||
}
|
||||
};
|
||||
|
||||
template <class _Traits, class _Elem>
|
||||
void __stdcall _Impl(
|
||||
_Elem* const _Dest, const void* _Src, size_t _Size_bits, const _Elem _Elem0, const _Elem _Elem1) noexcept {
|
||||
constexpr size_t _Step_size_bits = sizeof(_Traits::_Value_type) * 8;
|
||||
|
||||
const auto _Px0 = _Traits::_Set(_Elem0);
|
||||
const auto _Px1 = _Traits::_Set(_Elem1);
|
||||
if (_Size_bits >= _Step_size_bits) {
|
||||
_Elem* _Pos = _Dest + _Size_bits;
|
||||
_Size_bits &= _Step_size_bits - 1;
|
||||
_Elem* const _Stop_at = _Dest + _Size_bits;
|
||||
do {
|
||||
typename _Traits::_Value_type _Val;
|
||||
memcpy(&_Val, _Src, sizeof(_Val));
|
||||
const auto _Elems = _Traits::_Step(_Val, _Px0, _Px1);
|
||||
_Pos -= _Step_size_bits;
|
||||
_Traits::_Out(_Pos, _Elems);
|
||||
_Advance_bytes(_Src, sizeof(_Val));
|
||||
} while (_Pos != _Stop_at);
|
||||
}
|
||||
|
||||
if (_Size_bits > 0) {
|
||||
__assume(_Size_bits < sizeof(_Traits::_Value_type));
|
||||
typename _Traits::_Value_type _Val;
|
||||
memcpy(&_Val, _Src, sizeof(_Val));
|
||||
const auto _Elems = _Traits::_Step(_Val, _Px0, _Px1);
|
||||
_Elem _Tmp[_Step_size_bits];
|
||||
_Traits::_Out(_Tmp, _Elems);
|
||||
const _Elem* const _Tmpd = _Tmp + (_Step_size_bits - _Size_bits);
|
||||
memcpy(_Dest, _Tmpd, _Size_bits * sizeof(_Elem));
|
||||
}
|
||||
}
|
||||
#endif // ^^^ !defined(_M_ARM64EC) ^^^
|
||||
|
||||
template <class _Avx_traits, class _Sse_traits, class _Elem>
|
||||
void __stdcall _Dispatch(_Elem* const _Dest, const void* const _Src, const size_t _Size_bits,
|
||||
const _Elem _Elem0, const _Elem _Elem1) noexcept {
|
||||
#ifndef _M_ARM64EC
|
||||
if (_Use_avx2() && _Size_bits >= 256) {
|
||||
_Impl<_Avx_traits>(_Dest, _Src, _Size_bits, _Elem0, _Elem1);
|
||||
_mm256_zeroupper(); // TRANSITION, DevCom-10331414
|
||||
} else if (_Use_sse42()) {
|
||||
_Impl<_Sse_traits>(_Dest, _Src, _Size_bits, _Elem0, _Elem1);
|
||||
} else
|
||||
#endif // !defined(_M_ARM64EC)
|
||||
{
|
||||
const auto _Arr = reinterpret_cast<const uint8_t*>(_Src);
|
||||
for (size_t _Ix = 0; _Ix < _Size_bits; ++_Ix) {
|
||||
_Dest[_Size_bits - 1 - _Ix] = ((_Arr[_Ix >> 3] >> (_Ix & 7)) & 1) != 0 ? _Elem1 : _Elem0;
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace __std_bitset_to_string
|
||||
} // unnamed namespace
|
||||
|
||||
extern "C" {
|
||||
|
||||
__declspec(noalias) void __stdcall __std_bitset_to_string_1(
|
||||
char* const _Dest, const void* _Src, size_t _Size_bits, const char _Elem0, const char _Elem1) noexcept {
|
||||
#ifndef _M_ARM64EC
|
||||
if (_Use_avx2() && _Size_bits >= 256) {
|
||||
const __m256i _Px0 = _mm256_broadcastb_epi8(_mm_cvtsi32_si128(_Elem0));
|
||||
const __m256i _Px1 = _mm256_broadcastb_epi8(_mm_cvtsi32_si128(_Elem1));
|
||||
if (_Size_bits >= 32) {
|
||||
char* _Pos = _Dest + _Size_bits;
|
||||
_Size_bits &= 0x1F;
|
||||
char* const _Stop_at = _Dest + _Size_bits;
|
||||
do {
|
||||
uint32_t _Val;
|
||||
memcpy(&_Val, _Src, 4);
|
||||
const __m256i _Elems = _Bitset_to_string_1_step_avx(_Val, _Px0, _Px1);
|
||||
_Pos -= 32;
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(_Pos), _Elems);
|
||||
_Advance_bytes(_Src, 4);
|
||||
} while (_Pos != _Stop_at);
|
||||
}
|
||||
|
||||
if (_Size_bits > 0) {
|
||||
__assume(_Size_bits < 32);
|
||||
uint32_t _Val;
|
||||
memcpy(&_Val, _Src, 4);
|
||||
const __m256i _Elems = _Bitset_to_string_1_step_avx(_Val, _Px0, _Px1);
|
||||
char _Tmp[32];
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(_Tmp), _Elems);
|
||||
const char* const _Tmpd = _Tmp + (32 - _Size_bits);
|
||||
memcpy(_Dest, _Tmpd, _Size_bits);
|
||||
}
|
||||
|
||||
_mm256_zeroupper(); // TRANSITION, DevCom-10331414
|
||||
} else if (_Use_sse42()) {
|
||||
const __m128i _Px0 = _mm_shuffle_epi8(_mm_cvtsi32_si128(_Elem0), _mm_setzero_si128());
|
||||
const __m128i _Px1 = _mm_shuffle_epi8(_mm_cvtsi32_si128(_Elem1), _mm_setzero_si128());
|
||||
if (_Size_bits >= 16) {
|
||||
char* _Pos = _Dest + _Size_bits;
|
||||
_Size_bits &= 0xF;
|
||||
char* const _Stop_at = _Dest + _Size_bits;
|
||||
do {
|
||||
uint16_t _Val;
|
||||
memcpy(&_Val, _Src, 2);
|
||||
const __m128i _Elems = _Bitset_to_string_1_step(_Val, _Px0, _Px1);
|
||||
_Pos -= 16;
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(_Pos), _Elems);
|
||||
_Advance_bytes(_Src, 2);
|
||||
} while (_Pos != _Stop_at);
|
||||
}
|
||||
|
||||
if (_Size_bits > 0) {
|
||||
__assume(_Size_bits < 16);
|
||||
uint16_t _Val;
|
||||
memcpy(&_Val, _Src, 2);
|
||||
const __m128i _Elems = _Bitset_to_string_1_step(_Val, _Px0, _Px1);
|
||||
char _Tmp[16];
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(_Tmp), _Elems);
|
||||
const char* const _Tmpd = _Tmp + (16 - _Size_bits);
|
||||
for (size_t _Ix = 0; _Ix < _Size_bits; ++_Ix) {
|
||||
_Dest[_Ix] = _Tmpd[_Ix];
|
||||
}
|
||||
}
|
||||
} else
|
||||
#endif // !defined(_M_ARM64EC)
|
||||
{
|
||||
const auto _Arr = reinterpret_cast<const uint8_t*>(_Src);
|
||||
for (size_t _Ix = 0; _Ix < _Size_bits; ++_Ix) {
|
||||
_Dest[_Size_bits - 1 - _Ix] = ((_Arr[_Ix >> 3] >> (_Ix & 7)) & 1) != 0 ? _Elem1 : _Elem0;
|
||||
}
|
||||
}
|
||||
char* const _Dest, const void* const _Src, const size_t _Size_bits, const char _Elem0, const char _Elem1) noexcept {
|
||||
using namespace __std_bitset_to_string;
|
||||
_Dispatch<_Traits_1_avx, _Traits_1_sse>(_Dest, _Src, _Size_bits, _Elem0, _Elem1);
|
||||
}
|
||||
|
||||
__declspec(noalias) void __stdcall __std_bitset_to_string_2(
|
||||
wchar_t* const _Dest, const void* _Src, size_t _Size_bits, const wchar_t _Elem0, const wchar_t _Elem1) noexcept {
|
||||
#ifndef _M_ARM64EC
|
||||
if (_Use_avx2() && _Size_bits >= 256) {
|
||||
const __m256i _Px0 = _mm256_broadcastw_epi16(_mm_cvtsi32_si128(_Elem0));
|
||||
const __m256i _Px1 = _mm256_broadcastw_epi16(_mm_cvtsi32_si128(_Elem1));
|
||||
|
||||
if (_Size_bits >= 16) {
|
||||
wchar_t* _Pos = _Dest + _Size_bits;
|
||||
_Size_bits &= 0xF;
|
||||
wchar_t* const _Stop_at = _Dest + _Size_bits;
|
||||
do {
|
||||
uint16_t _Val;
|
||||
memcpy(&_Val, _Src, 2);
|
||||
const __m256i _Elems = _Bitset_to_string_2_step_avx(_Val, _Px0, _Px1);
|
||||
_Pos -= 16;
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(_Pos), _Elems);
|
||||
_Advance_bytes(_Src, 2);
|
||||
} while (_Pos != _Stop_at);
|
||||
}
|
||||
|
||||
if (_Size_bits > 0) {
|
||||
__assume(_Size_bits < 16);
|
||||
uint16_t _Val;
|
||||
memcpy(&_Val, _Src, 2);
|
||||
const __m256i _Elems = _Bitset_to_string_2_step_avx(_Val, _Px0, _Px1);
|
||||
wchar_t _Tmp[16];
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(_Tmp), _Elems);
|
||||
const wchar_t* const _Tmpd = _Tmp + (16 - _Size_bits);
|
||||
memcpy(_Dest, _Tmpd, _Size_bits * 2);
|
||||
}
|
||||
|
||||
_mm256_zeroupper(); // TRANSITION, DevCom-10331414
|
||||
} else if (_Use_sse42()) {
|
||||
const __m128i _Px0 = _mm_set1_epi16(_Elem0);
|
||||
const __m128i _Px1 = _mm_set1_epi16(_Elem1);
|
||||
if (_Size_bits >= 8) {
|
||||
wchar_t* _Pos = _Dest + _Size_bits;
|
||||
_Size_bits &= 0x7;
|
||||
wchar_t* const _Stop_at = _Dest + _Size_bits;
|
||||
do {
|
||||
const uint8_t _Val = *reinterpret_cast<const uint8_t*>(_Src);
|
||||
const __m128i _Elems = _Bitset_to_string_2_step(_Val, _Px0, _Px1);
|
||||
_Pos -= 8;
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(_Pos), _Elems);
|
||||
_Advance_bytes(_Src, 1);
|
||||
} while (_Pos != _Stop_at);
|
||||
}
|
||||
|
||||
if (_Size_bits > 0) {
|
||||
__assume(_Size_bits < 8);
|
||||
const uint8_t _Val = *reinterpret_cast<const uint8_t*>(_Src);
|
||||
const __m128i _Elems = _Bitset_to_string_2_step(_Val, _Px0, _Px1);
|
||||
wchar_t _Tmp[8];
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(_Tmp), _Elems);
|
||||
const wchar_t* const _Tmpd = _Tmp + (8 - _Size_bits);
|
||||
for (size_t _Ix = 0; _Ix < _Size_bits; ++_Ix) {
|
||||
_Dest[_Ix] = _Tmpd[_Ix];
|
||||
}
|
||||
}
|
||||
} else
|
||||
#endif // !defined(_M_ARM64EC)
|
||||
{
|
||||
const auto _Arr = reinterpret_cast<const uint8_t*>(_Src);
|
||||
for (size_t _Ix = 0; _Ix < _Size_bits; ++_Ix) {
|
||||
_Dest[_Size_bits - 1 - _Ix] = ((_Arr[_Ix >> 3] >> (_Ix & 7)) & 1) != 0 ? _Elem1 : _Elem0;
|
||||
}
|
||||
}
|
||||
__declspec(noalias) void __stdcall __std_bitset_to_string_2(wchar_t* const _Dest, const void* const _Src,
|
||||
const size_t _Size_bits, const wchar_t _Elem0, const wchar_t _Elem1) noexcept {
|
||||
using namespace __std_bitset_to_string;
|
||||
_Dispatch<_Traits_2_avx, _Traits_2_sse>(_Dest, _Src, _Size_bits, _Elem0, _Elem1);
|
||||
}
|
||||
|
||||
} // extern "C"
|
||||
|
|
Загрузка…
Ссылка в новой задаче