Optimize SSE2 packing intrinsics (#12301)

Avoid scalarization in implementation of
_mm_packs_epi16/_mm_packs_epi32/_mm_packus_epi16
This commit is contained in:
Marat Dukhan 2020-09-22 14:53:56 -07:00 коммит произвёл GitHub
Родитель 421eda7f3e
Коммит e2ac2a8b59
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
3 изменённых файлов: 7 добавлений и 55 удалений

Просмотреть файл

@ -508,3 +508,4 @@ a license to everyone to use it as detailed in LICENSE.)
* Georg Rottensteiner <georg@georg-rottensteiner.de>
* Julien Jorge <julien.jorge@gmx.fr>
* Attila Oláh <atl@google.com> (copyright owned by Google, LLC)
* Marat Dukhan <maratek@google.com> (copyright owned by Google, LLC)

Просмотреть файл

@ -546,11 +546,11 @@ The following table highlights the availability and expected performance of diff
* - _mm_or_si128
- 🟡 wasm_v128_or. VM must guess type.
* - _mm_packs_epi16
- ❌ scalarized
- ✅ wasm_i8x16_narrow_i16x8
* - _mm_packs_epi32
- ❌ scalarized
- ✅ wasm_i16x8_narrow_i32x4
* - _mm_packus_epi16
- ❌ scalarized
- ✅ wasm_u8x16_narrow_i16x8
* - _mm_pause
- 💭 No-op.
* - _mm_sad_epu8

Просмотреть файл

@ -14,7 +14,6 @@
#include <xmmintrin.h>
#include <emscripten/emscripten.h>
// Scalar helper macros. These are textual substitutions, so each argument may
// be evaluated more than once: do not pass expressions with side effects.

// Clamps x to the inclusive range [Min, Max]. Every argument is parenthesized
// so that low-precedence expressions (e.g. a ternary) expand correctly.
#define __SATURATE(x, Min, Max) ((x) >= (Min) ? ((x) <= (Max) ? (x) : (Max)) : (Min))
// Yields x when x <= y, otherwise y.
#define __MIN(x, y) ((x) <= (y) ? (x) : (y))
// Yields x when x >= y, otherwise y.
#define __MAX(x, y) ((x) >= (y) ? (x) : (y))
@ -1366,67 +1365,19 @@ _mm_mfence(void)
// Packs the signed 16-bit lanes of __a and __b into signed 8-bit lanes,
// clamping each value to [-128, 127]. The eight lanes of __a fill result
// bytes 0..7 and the eight lanes of __b fill result bytes 8..15.
//
// Implemented with a single SIMD narrowing operation instead of a per-lane
// scalar loop; the previously unreachable second return is now the only path.
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packs_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i8x16_narrow_i16x8((v128_t)__a, (v128_t)__b);
}
// Packs the signed 32-bit lanes of __a and __b into signed 16-bit lanes,
// clamping each value to [-32768, 32767]. The four lanes of __a fill result
// lanes 0..3 and the four lanes of __b fill result lanes 4..7.
//
// Implemented with a single SIMD narrowing operation instead of a per-lane
// scalar loop; the previously unreachable second return is now the only path.
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packs_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_i16x8_narrow_i32x4((v128_t)__a, (v128_t)__b);
}
// Packs the signed 16-bit lanes of __a and __b into unsigned 8-bit lanes,
// clamping each value to [0, 255]. The eight lanes of __a fill result
// bytes 0..7 and the eight lanes of __b fill result bytes 8..15.
//
// Implemented with a single SIMD narrowing operation instead of a per-lane
// scalar loop; the previously unreachable second return is now the only path.
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packus_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)wasm_u8x16_narrow_i16x8((v128_t)__a, (v128_t)__b);
}
// Reads one 16-bit lane of __a through wasm_u16x8_extract_lane. Only the low
// three bits of __imm are used, so the lane index always falls in 0..7.
#define _mm_extract_epi16(__a, __imm) wasm_u16x8_extract_lane((v128_t)(__a), (__imm) & 7)