Optimize SSE2 packing intrinsics (#12301)
Avoid scalarization in implementation of _mm_packs_epi16/_mm_packs_epi32/_mm_packus_epi16
This commit is contained in:
Родитель
421eda7f3e
Коммит
e2ac2a8b59
1
AUTHORS
1
AUTHORS
|
@@ -508,3 +508,4 @@ a license to everyone to use it as detailed in LICENSE.)
|
|||
* Georg Rottensteiner <georg@georg-rottensteiner.de>
|
||||
* Julien Jorge <julien.jorge@gmx.fr>
|
||||
* Attila Oláh <atl@google.com> (copyright owned by Google, LLC)
|
||||
* Marat Dukhan <maratek@google.com> (copyright owned by Google, LLC)
|
||||
|
|
|
@@ -546,11 +546,11 @@ The following table highlights the availability and expected performance of diff
|
|||
* - _mm_or_si128
|
||||
- 🟡 wasm_v128_or. VM must guess type.
|
||||
* - _mm_packs_epi16
|
||||
- ❌ scalarized
|
||||
- ✅ wasm_i8x16_narrow_i16x8
|
||||
* - _mm_packs_epi32
|
||||
- ❌ scalarized
|
||||
- ✅ wasm_i16x8_narrow_i32x4
|
||||
* - _mm_packus_epi16
|
||||
- ❌ scalarized
|
||||
- ✅ wasm_u8x16_narrow_i16x8
|
||||
* - _mm_pause
|
||||
- 💭 No-op.
|
||||
* - _mm_sad_epu8
|
||||
|
|
|
@@ -14,7 +14,6 @@
|
|||
#include <xmmintrin.h>
|
||||
#include <emscripten/emscripten.h>
|
||||
|
||||
// Clamp x to the inclusive range [Min, Max].
// Every argument is parenthesized so that expression arguments (e.g. a
// conditional expression passed as a bound) keep their intended grouping.
// NOTE: arguments may be evaluated more than once; do not pass expressions
// with side effects.
#define __SATURATE(x, Min, Max) ((x) >= (Min) ? ((x) <= (Max) ? (x) : (Max)) : (Min))
// Smaller of x and y (arguments may be evaluated more than once).
#define __MIN(x, y) ((x) <= (y) ? (x) : (y))
// Larger of x and y (arguments may be evaluated more than once).
#define __MAX(x, y) ((x) >= (y) ? (x) : (y))
|
||||
|
||||
|
@@ -1366,67 +1365,19 @@ _mm_mfence(void)
|
|||
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
|
||||
_mm_packs_epi16(__m128i __a, __m128i __b)
|
||||
{
|
||||
// TODO: optimize
|
||||
union {
|
||||
signed short x[8];
|
||||
__m128i m;
|
||||
} src, src2;
|
||||
union {
|
||||
signed char x[16];
|
||||
__m128i m;
|
||||
} dst;
|
||||
src.m = __a;
|
||||
src2.m = __b;
|
||||
for(int i = 0; i < 8; ++i)
|
||||
{
|
||||
dst.x[i] = __SATURATE(src.x[i], -128, 127);
|
||||
dst.x[8+i] = __SATURATE(src2.x[i], -128, 127);
|
||||
}
|
||||
return dst.m;
|
||||
return (__m128i)wasm_i8x16_narrow_i16x8(__a, __b);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
|
||||
_mm_packs_epi32(__m128i __a, __m128i __b)
|
||||
{
|
||||
// TODO: optimize
|
||||
union {
|
||||
signed int x[4];
|
||||
__m128i m;
|
||||
} src, src2;
|
||||
union {
|
||||
signed short x[8];
|
||||
__m128i m;
|
||||
} dst;
|
||||
src.m = __a;
|
||||
src2.m = __b;
|
||||
for(int i = 0; i < 4; ++i)
|
||||
{
|
||||
dst.x[i] = __SATURATE(src.x[i], -32768, 32767);
|
||||
dst.x[4+i] = __SATURATE(src2.x[i], -32768, 32767);
|
||||
}
|
||||
return dst.m;
|
||||
return (__m128i)wasm_i16x8_narrow_i32x4(__a, __b);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
|
||||
_mm_packus_epi16(__m128i __a, __m128i __b)
|
||||
{
|
||||
// TODO: optimize
|
||||
union {
|
||||
signed short x[8];
|
||||
__m128i m;
|
||||
} src, src2;
|
||||
union {
|
||||
unsigned char x[16];
|
||||
__m128i m;
|
||||
} dst;
|
||||
src.m = __a;
|
||||
src2.m = __b;
|
||||
for(int i = 0; i < 8; ++i)
|
||||
{
|
||||
dst.x[i] = __SATURATE(src.x[i], 0, 255);
|
||||
dst.x[8+i] = __SATURATE(src2.x[i], 0, 255);
|
||||
}
|
||||
return dst.m;
|
||||
return (__m128i)wasm_u8x16_narrow_i16x8(__a, __b);
|
||||
}
|
||||
|
||||
// Reads one 16-bit lane of __a via wasm_u16x8_extract_lane. __a is cast to
// v128_t, and the lane index is __imm masked with 7, so only the low three
// bits of __imm select the lane.
#define _mm_extract_epi16(__a, __imm) wasm_u16x8_extract_lane((v128_t)(__a), (__imm) & 7)
|
||||
|
|
Загрузка…
Ссылка в новой задаче