Generalize vectorized `bitset::to_string` implementation (#5013)

Co-authored-by: Stephan T. Lavavej <stl@nuwen.net>
This commit is contained in:
Alex Guteniev 2024-10-21 14:52:56 -07:00 коммит произвёл GitHub
Родитель aaca194535
Коммит bfd5b8ce7b
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
1 изменённых файлов: 155 добавлений и 174 удалений

Просмотреть файл

@ -3845,191 +3845,172 @@ __declspec(noalias) void __stdcall __std_replace_8(
} // extern "C"
#ifndef _M_ARM64EC
namespace {
__m256i __forceinline _Bitset_to_string_1_step_avx(const uint32_t _Val, const __m256i _Px0, const __m256i _Px1) {
const __m128i _Vx0 = _mm_cvtsi32_si128(_Val);
const __m128i _Vx1 = _mm_shuffle_epi8(_Vx0, _mm_set_epi32(0x00000000, 0x01010101, 0x02020202, 0x03030303));
const __m256i _Vx2 = _mm256_castsi128_si256(_Vx1);
const __m256i _Vx3 = _mm256_permutevar8x32_epi32(_Vx2, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0));
const __m256i _Msk = _mm256_and_si256(_Vx3, _mm256_set1_epi64x(0x0102040810204080));
const __m256i _Ex0 = _mm256_cmpeq_epi8(_Msk, _mm256_setzero_si256());
const __m256i _Ex1 = _mm256_blendv_epi8(_Px1, _Px0, _Ex0);
return _Ex1;
}
namespace __std_bitset_to_string {
#ifdef _M_ARM64EC
using _Traits_1_avx = void;
using _Traits_1_sse = void;
using _Traits_2_avx = void;
using _Traits_2_sse = void;
#else // ^^^ defined(_M_ARM64EC) / !defined(_M_ARM64EC) vvv
struct _Traits_1_avx {
using _Value_type = uint32_t;
__m128i __forceinline _Bitset_to_string_1_step(const uint16_t _Val, const __m128i _Px0, const __m128i _Px1) {
const __m128i _Vx0 = _mm_cvtsi32_si128(_Val);
const __m128i _Vx1 = _mm_shuffle_epi8(_Vx0, _mm_set_epi32(0x00000000, 0x00000000, 0x01010101, 0x01010101));
const __m128i _Msk = _mm_and_si128(_Vx1, _mm_set1_epi64x(0x0102040810204080));
const __m128i _Ex0 = _mm_cmpeq_epi8(_Msk, _mm_setzero_si128());
const __m128i _Ex1 = _mm_blendv_epi8(_Px1, _Px0, _Ex0);
return _Ex1;
}
static __m256i _Set(const char _Val) noexcept {
return _mm256_broadcastb_epi8(_mm_cvtsi32_si128(_Val));
}
__m256i __forceinline _Bitset_to_string_2_step_avx(const uint16_t _Val, const __m256i _Px0, const __m256i _Px1) {
const __m128i _Vx0 = _mm_cvtsi32_si128(_Val);
const __m128i _Vx1 = _mm_shuffle_epi8(_Vx0, _mm_set_epi32(0x00000000, 0x00000000, 0x01010101, 0x01010101));
const __m256i _Vx2 = _mm256_castsi128_si256(_Vx1);
const __m256i _Vx3 = _mm256_permute4x64_epi64(_Vx2, _MM_SHUFFLE(1, 1, 0, 0));
const __m256i _Msk = _mm256_and_si256(
_Vx3, _mm256_set_epi64x(0x0001000200040008, 0x0010002000400080, 0x0001000200040008, 0x0010002000400080));
const __m256i _Ex0 = _mm256_cmpeq_epi16(_Msk, _mm256_setzero_si256());
const __m256i _Ex1 = _mm256_blendv_epi8(_Px1, _Px0, _Ex0);
return _Ex1;
}
static __m256i __forceinline _Step(const uint32_t _Val, const __m256i _Px0, const __m256i _Px1) noexcept {
const __m128i _Vx0 = _mm_cvtsi32_si128(_Val);
const __m128i _Vx1 =
_mm_shuffle_epi8(_Vx0, _mm_set_epi32(0x00000000, 0x01010101, 0x02020202, 0x03030303));
const __m256i _Vx2 = _mm256_castsi128_si256(_Vx1);
const __m256i _Vx3 = _mm256_permutevar8x32_epi32(_Vx2, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0));
const __m256i _Msk = _mm256_and_si256(_Vx3, _mm256_set1_epi64x(0x0102040810204080));
const __m256i _Ex0 = _mm256_cmpeq_epi8(_Msk, _mm256_setzero_si256());
const __m256i _Ex1 = _mm256_blendv_epi8(_Px1, _Px0, _Ex0);
return _Ex1;
}
__m128i __forceinline _Bitset_to_string_2_step(const uint8_t _Val, const __m128i _Px0, const __m128i _Px1) {
const __m128i _Vx = _mm_set1_epi16(_Val);
const __m128i _Msk = _mm_and_si128(_Vx, _mm_set_epi64x(0x0001000200040008, 0x0010002000400080));
const __m128i _Ex0 = _mm_cmpeq_epi16(_Msk, _mm_setzero_si128());
const __m128i _Ex1 = _mm_blendv_epi8(_Px1, _Px0, _Ex0);
return _Ex1;
}
} // unnamed namespace
static void _Out(void* _Dest, const __m256i _Elems) noexcept {
_mm256_storeu_si256(static_cast<__m256i*>(_Dest), _Elems);
}
};
struct _Traits_1_sse {
using _Value_type = uint16_t;
static __m128i _Set(const char _Val) noexcept {
return _mm_shuffle_epi8(_mm_cvtsi32_si128(_Val), _mm_setzero_si128());
}
static __m128i __forceinline _Step(const uint16_t _Val, const __m128i _Px0, const __m128i _Px1) noexcept {
const __m128i _Vx0 = _mm_cvtsi32_si128(_Val);
const __m128i _Vx1 =
_mm_shuffle_epi8(_Vx0, _mm_set_epi32(0x00000000, 0x00000000, 0x01010101, 0x01010101));
const __m128i _Msk = _mm_and_si128(_Vx1, _mm_set1_epi64x(0x0102040810204080));
const __m128i _Ex0 = _mm_cmpeq_epi8(_Msk, _mm_setzero_si128());
const __m128i _Ex1 = _mm_blendv_epi8(_Px1, _Px0, _Ex0);
return _Ex1;
}
static void _Out(void* _Dest, const __m128i _Elems) noexcept {
_mm_storeu_si128(static_cast<__m128i*>(_Dest), _Elems);
}
};
struct _Traits_2_avx {
using _Value_type = uint16_t;
static __m256i _Set(const wchar_t _Val) noexcept {
return _mm256_broadcastw_epi16(_mm_cvtsi32_si128(_Val));
}
static __m256i __forceinline _Step(const uint16_t _Val, const __m256i _Px0, const __m256i _Px1) noexcept {
const __m128i _Vx0 = _mm_cvtsi32_si128(_Val);
const __m128i _Vx1 =
_mm_shuffle_epi8(_Vx0, _mm_set_epi32(0x00000000, 0x00000000, 0x01010101, 0x01010101));
const __m256i _Vx2 = _mm256_castsi128_si256(_Vx1);
const __m256i _Vx3 = _mm256_permute4x64_epi64(_Vx2, _MM_SHUFFLE(1, 1, 0, 0));
const __m256i _Msk = _mm256_and_si256(_Vx3,
_mm256_set_epi64x(0x0001000200040008, 0x0010002000400080, 0x0001000200040008, 0x0010002000400080));
const __m256i _Ex0 = _mm256_cmpeq_epi16(_Msk, _mm256_setzero_si256());
const __m256i _Ex1 = _mm256_blendv_epi8(_Px1, _Px0, _Ex0);
return _Ex1;
}
static void _Out(void* _Dest, const __m256i _Elems) noexcept {
_mm256_storeu_si256(static_cast<__m256i*>(_Dest), _Elems);
}
};
struct _Traits_2_sse {
using _Value_type = uint8_t;
static __m128i _Set(const wchar_t _Val) noexcept {
return _mm_set1_epi16(_Val);
}
static __m128i __forceinline _Step(const uint8_t _Val, const __m128i _Px0, const __m128i _Px1) noexcept {
const __m128i _Vx = _mm_set1_epi16(_Val);
const __m128i _Msk = _mm_and_si128(_Vx, _mm_set_epi64x(0x0001000200040008, 0x0010002000400080));
const __m128i _Ex0 = _mm_cmpeq_epi16(_Msk, _mm_setzero_si128());
const __m128i _Ex1 = _mm_blendv_epi8(_Px1, _Px0, _Ex0);
return _Ex1;
}
static void _Out(void* _Dest, const __m128i _Elems) noexcept {
_mm_storeu_si128(static_cast<__m128i*>(_Dest), _Elems);
}
};
template <class _Traits, class _Elem>
void __stdcall _Impl(
_Elem* const _Dest, const void* _Src, size_t _Size_bits, const _Elem _Elem0, const _Elem _Elem1) noexcept {
constexpr size_t _Step_size_bits = sizeof(_Traits::_Value_type) * 8;
const auto _Px0 = _Traits::_Set(_Elem0);
const auto _Px1 = _Traits::_Set(_Elem1);
if (_Size_bits >= _Step_size_bits) {
_Elem* _Pos = _Dest + _Size_bits;
_Size_bits &= _Step_size_bits - 1;
_Elem* const _Stop_at = _Dest + _Size_bits;
do {
typename _Traits::_Value_type _Val;
memcpy(&_Val, _Src, sizeof(_Val));
const auto _Elems = _Traits::_Step(_Val, _Px0, _Px1);
_Pos -= _Step_size_bits;
_Traits::_Out(_Pos, _Elems);
_Advance_bytes(_Src, sizeof(_Val));
} while (_Pos != _Stop_at);
}
if (_Size_bits > 0) {
__assume(_Size_bits < sizeof(_Traits::_Value_type));
typename _Traits::_Value_type _Val;
memcpy(&_Val, _Src, sizeof(_Val));
const auto _Elems = _Traits::_Step(_Val, _Px0, _Px1);
_Elem _Tmp[_Step_size_bits];
_Traits::_Out(_Tmp, _Elems);
const _Elem* const _Tmpd = _Tmp + (_Step_size_bits - _Size_bits);
memcpy(_Dest, _Tmpd, _Size_bits * sizeof(_Elem));
}
}
#endif // ^^^ !defined(_M_ARM64EC) ^^^
template <class _Avx_traits, class _Sse_traits, class _Elem>
void __stdcall _Dispatch(_Elem* const _Dest, const void* const _Src, const size_t _Size_bits,
const _Elem _Elem0, const _Elem _Elem1) noexcept {
#ifndef _M_ARM64EC
if (_Use_avx2() && _Size_bits >= 256) {
_Impl<_Avx_traits>(_Dest, _Src, _Size_bits, _Elem0, _Elem1);
_mm256_zeroupper(); // TRANSITION, DevCom-10331414
} else if (_Use_sse42()) {
_Impl<_Sse_traits>(_Dest, _Src, _Size_bits, _Elem0, _Elem1);
} else
#endif // !defined(_M_ARM64EC)
{
const auto _Arr = reinterpret_cast<const uint8_t*>(_Src);
for (size_t _Ix = 0; _Ix < _Size_bits; ++_Ix) {
_Dest[_Size_bits - 1 - _Ix] = ((_Arr[_Ix >> 3] >> (_Ix & 7)) & 1) != 0 ? _Elem1 : _Elem0;
}
}
}
} // namespace __std_bitset_to_string
} // unnamed namespace
extern "C" {
__declspec(noalias) void __stdcall __std_bitset_to_string_1(
char* const _Dest, const void* _Src, size_t _Size_bits, const char _Elem0, const char _Elem1) noexcept {
#ifndef _M_ARM64EC
if (_Use_avx2() && _Size_bits >= 256) {
const __m256i _Px0 = _mm256_broadcastb_epi8(_mm_cvtsi32_si128(_Elem0));
const __m256i _Px1 = _mm256_broadcastb_epi8(_mm_cvtsi32_si128(_Elem1));
if (_Size_bits >= 32) {
char* _Pos = _Dest + _Size_bits;
_Size_bits &= 0x1F;
char* const _Stop_at = _Dest + _Size_bits;
do {
uint32_t _Val;
memcpy(&_Val, _Src, 4);
const __m256i _Elems = _Bitset_to_string_1_step_avx(_Val, _Px0, _Px1);
_Pos -= 32;
_mm256_storeu_si256(reinterpret_cast<__m256i*>(_Pos), _Elems);
_Advance_bytes(_Src, 4);
} while (_Pos != _Stop_at);
}
if (_Size_bits > 0) {
__assume(_Size_bits < 32);
uint32_t _Val;
memcpy(&_Val, _Src, 4);
const __m256i _Elems = _Bitset_to_string_1_step_avx(_Val, _Px0, _Px1);
char _Tmp[32];
_mm256_storeu_si256(reinterpret_cast<__m256i*>(_Tmp), _Elems);
const char* const _Tmpd = _Tmp + (32 - _Size_bits);
memcpy(_Dest, _Tmpd, _Size_bits);
}
_mm256_zeroupper(); // TRANSITION, DevCom-10331414
} else if (_Use_sse42()) {
const __m128i _Px0 = _mm_shuffle_epi8(_mm_cvtsi32_si128(_Elem0), _mm_setzero_si128());
const __m128i _Px1 = _mm_shuffle_epi8(_mm_cvtsi32_si128(_Elem1), _mm_setzero_si128());
if (_Size_bits >= 16) {
char* _Pos = _Dest + _Size_bits;
_Size_bits &= 0xF;
char* const _Stop_at = _Dest + _Size_bits;
do {
uint16_t _Val;
memcpy(&_Val, _Src, 2);
const __m128i _Elems = _Bitset_to_string_1_step(_Val, _Px0, _Px1);
_Pos -= 16;
_mm_storeu_si128(reinterpret_cast<__m128i*>(_Pos), _Elems);
_Advance_bytes(_Src, 2);
} while (_Pos != _Stop_at);
}
if (_Size_bits > 0) {
__assume(_Size_bits < 16);
uint16_t _Val;
memcpy(&_Val, _Src, 2);
const __m128i _Elems = _Bitset_to_string_1_step(_Val, _Px0, _Px1);
char _Tmp[16];
_mm_storeu_si128(reinterpret_cast<__m128i*>(_Tmp), _Elems);
const char* const _Tmpd = _Tmp + (16 - _Size_bits);
for (size_t _Ix = 0; _Ix < _Size_bits; ++_Ix) {
_Dest[_Ix] = _Tmpd[_Ix];
}
}
} else
#endif // !defined(_M_ARM64EC)
{
const auto _Arr = reinterpret_cast<const uint8_t*>(_Src);
for (size_t _Ix = 0; _Ix < _Size_bits; ++_Ix) {
_Dest[_Size_bits - 1 - _Ix] = ((_Arr[_Ix >> 3] >> (_Ix & 7)) & 1) != 0 ? _Elem1 : _Elem0;
}
}
char* const _Dest, const void* const _Src, const size_t _Size_bits, const char _Elem0, const char _Elem1) noexcept {
using namespace __std_bitset_to_string;
_Dispatch<_Traits_1_avx, _Traits_1_sse>(_Dest, _Src, _Size_bits, _Elem0, _Elem1);
}
__declspec(noalias) void __stdcall __std_bitset_to_string_2(
wchar_t* const _Dest, const void* _Src, size_t _Size_bits, const wchar_t _Elem0, const wchar_t _Elem1) noexcept {
#ifndef _M_ARM64EC
if (_Use_avx2() && _Size_bits >= 256) {
const __m256i _Px0 = _mm256_broadcastw_epi16(_mm_cvtsi32_si128(_Elem0));
const __m256i _Px1 = _mm256_broadcastw_epi16(_mm_cvtsi32_si128(_Elem1));
if (_Size_bits >= 16) {
wchar_t* _Pos = _Dest + _Size_bits;
_Size_bits &= 0xF;
wchar_t* const _Stop_at = _Dest + _Size_bits;
do {
uint16_t _Val;
memcpy(&_Val, _Src, 2);
const __m256i _Elems = _Bitset_to_string_2_step_avx(_Val, _Px0, _Px1);
_Pos -= 16;
_mm256_storeu_si256(reinterpret_cast<__m256i*>(_Pos), _Elems);
_Advance_bytes(_Src, 2);
} while (_Pos != _Stop_at);
}
if (_Size_bits > 0) {
__assume(_Size_bits < 16);
uint16_t _Val;
memcpy(&_Val, _Src, 2);
const __m256i _Elems = _Bitset_to_string_2_step_avx(_Val, _Px0, _Px1);
wchar_t _Tmp[16];
_mm256_storeu_si256(reinterpret_cast<__m256i*>(_Tmp), _Elems);
const wchar_t* const _Tmpd = _Tmp + (16 - _Size_bits);
memcpy(_Dest, _Tmpd, _Size_bits * 2);
}
_mm256_zeroupper(); // TRANSITION, DevCom-10331414
} else if (_Use_sse42()) {
const __m128i _Px0 = _mm_set1_epi16(_Elem0);
const __m128i _Px1 = _mm_set1_epi16(_Elem1);
if (_Size_bits >= 8) {
wchar_t* _Pos = _Dest + _Size_bits;
_Size_bits &= 0x7;
wchar_t* const _Stop_at = _Dest + _Size_bits;
do {
const uint8_t _Val = *reinterpret_cast<const uint8_t*>(_Src);
const __m128i _Elems = _Bitset_to_string_2_step(_Val, _Px0, _Px1);
_Pos -= 8;
_mm_storeu_si128(reinterpret_cast<__m128i*>(_Pos), _Elems);
_Advance_bytes(_Src, 1);
} while (_Pos != _Stop_at);
}
if (_Size_bits > 0) {
__assume(_Size_bits < 8);
const uint8_t _Val = *reinterpret_cast<const uint8_t*>(_Src);
const __m128i _Elems = _Bitset_to_string_2_step(_Val, _Px0, _Px1);
wchar_t _Tmp[8];
_mm_storeu_si128(reinterpret_cast<__m128i*>(_Tmp), _Elems);
const wchar_t* const _Tmpd = _Tmp + (8 - _Size_bits);
for (size_t _Ix = 0; _Ix < _Size_bits; ++_Ix) {
_Dest[_Ix] = _Tmpd[_Ix];
}
}
} else
#endif // !defined(_M_ARM64EC)
{
const auto _Arr = reinterpret_cast<const uint8_t*>(_Src);
for (size_t _Ix = 0; _Ix < _Size_bits; ++_Ix) {
_Dest[_Size_bits - 1 - _Ix] = ((_Arr[_Ix >> 3] >> (_Ix & 7)) & 1) != 0 ? _Elem1 : _Elem0;
}
}
__declspec(noalias) void __stdcall __std_bitset_to_string_2(wchar_t* const _Dest, const void* const _Src,
const size_t _Size_bits, const wchar_t _Elem0, const wchar_t _Elem1) noexcept {
using namespace __std_bitset_to_string;
_Dispatch<_Traits_2_avx, _Traits_2_sse>(_Dest, _Src, _Size_bits, _Elem0, _Elem1);
}
} // extern "C"