зеркало из https://github.com/mozilla/pjs.git
Bug 487900 - Enable SSE2 code for x86_64 in qcms. r=jmuizelaar
This commit is contained in:
Родитель
f5cf6e8361
Коммит
f4a4cf894e
|
@ -25,7 +25,7 @@
|
|||
#include <assert.h>
|
||||
#include "qcmsint.h"
|
||||
|
||||
#if defined(_M_IX86) || defined(__i386__)
|
||||
#if defined(_M_IX86) || defined(__i386__) || defined(__x86_64__) || defined(_M_AMD64)
|
||||
#define X86
|
||||
#endif
|
||||
|
||||
|
@ -760,6 +760,11 @@ void qcms_transform_data_rgb_out_lut_sse_intrin(qcms_transform *transform, unsig
|
|||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER) && defined(_M_AMD64)
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
static void qcms_transform_data_rgb_out_lut_sse(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
|
||||
{
|
||||
int i;
|
||||
|
@ -827,7 +832,7 @@ static void qcms_transform_data_rgb_out_lut_sse(qcms_transform *transform, unsig
|
|||
, "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
|
||||
#endif
|
||||
);
|
||||
#else
|
||||
#elif defined(_MSC_VER) && defined(_M_IX86)
|
||||
__asm {
|
||||
mov eax, mat
|
||||
mov ecx, clampMax
|
||||
|
@ -863,6 +868,33 @@ static void qcms_transform_data_rgb_out_lut_sse(qcms_transform *transform, unsig
|
|||
cvtps2dq xmm1, xmm1
|
||||
movdqa [ebx], xmm1
|
||||
}
|
||||
#elif defined(_MSC_VER) && defined(_M_AMD64)
|
||||
{
    /*
     * SSE2 intrinsics version of the matrix step (used where inline asm is
     * not available to the compiler).
     *
     * Loads three consecutive 4-float vectors starting at `mat`, multiplies
     * them by lanes 0, 1 and 2 of the vector at `input` respectively, and
     * sums the products. The sum is clamped to [0, *clampMax], scaled by
     * floatScale, converted to 32-bit integers and stored back over `input`.
     *
     * NOTE(review): _mm_load_ps and _mm_store_si128 require 16-byte aligned
     * addresses -- confirm `mat` and `input` are aligned where declared.
     * NOTE(review): the result is written to `input`; presumably `output`
     * (read right after this block) aliases the same buffer -- confirm.
     */
    const float *mat_rows = (const float *)mat;
    __m128 vec, acc, term1, term2, limit, scale;

    vec = _mm_load_ps((const float *)input);

    /* Broadcast each input lane and multiply it by its matrix vector. */
    acc   = _mm_mul_ps(_mm_load_ps(mat_rows),     _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(0,0,0,0)));
    term1 = _mm_mul_ps(_mm_load_ps(mat_rows + 4), _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(1,1,1,1)));
    term2 = _mm_mul_ps(_mm_load_ps(mat_rows + 8), _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(2,2,2,2)));

    acc = _mm_add_ps(acc, _mm_add_ps(term1, term2));

    /* Clamp: upper bound first, then lower bound of zero. */
    limit = _mm_load_ss(clampMax);
    limit = _mm_shuffle_ps(limit, limit, _MM_SHUFFLE(0,0,0,0));
    acc = _mm_min_ps(acc, limit);
    /* Use an explicit zero vector instead of XOR-ing an uninitialized
     * variable with itself, which reads an indeterminate value. */
    acc = _mm_max_ps(acc, _mm_setzero_ps());

    /* Scale and convert to integers for the lookup-table indices. */
    scale = _mm_load_ss(&floatScale);
    scale = _mm_shuffle_ps(scale, scale, _MM_SHUFFLE(0,0,0,0));
    acc = _mm_mul_ps(acc, scale);
    _mm_store_si128((__m128i *)input, _mm_cvtps_epi32(acc));
}
|
||||
#else
|
||||
#error "Unknown platform"
|
||||
#endif
|
||||
|
||||
*dest++ = transform->output_table_r->data[output[0]];
|
||||
|
@ -936,7 +968,7 @@ static void qcms_transform_data_rgba_out_lut_sse(qcms_transform *transform, unsi
|
|||
, "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
|
||||
#endif
|
||||
);
|
||||
#else
|
||||
#elif defined(_MSC_VER) && defined(_M_IX86)
|
||||
__asm {
|
||||
mov eax, mat
|
||||
mov ecx, clampMax
|
||||
|
@ -972,6 +1004,33 @@ static void qcms_transform_data_rgba_out_lut_sse(qcms_transform *transform, unsi
|
|||
cvtps2dq xmm1, xmm1
|
||||
movdqa [ebx], xmm1
|
||||
}
|
||||
#elif defined(_MSC_VER) && defined(_M_AMD64)
|
||||
{
    /*
     * SSE2 intrinsics version of the matrix step (used where inline asm is
     * not available to the compiler).
     *
     * Loads three consecutive 4-float vectors starting at `mat`, multiplies
     * them by lanes 0, 1 and 2 of the vector at `input` respectively, and
     * sums the products. The sum is clamped to [0, *clampMax], scaled by
     * floatScale, converted to 32-bit integers and stored back over `input`.
     *
     * NOTE(review): _mm_load_ps and _mm_store_si128 require 16-byte aligned
     * addresses -- confirm `mat` and `input` are aligned where declared.
     * NOTE(review): the result is written to `input`; presumably `output`
     * (read right after this block) aliases the same buffer -- confirm.
     */
    const float *mat_rows = (const float *)mat;
    __m128 vec, acc, term1, term2, limit, scale;

    vec = _mm_load_ps((const float *)input);

    /* Broadcast each input lane and multiply it by its matrix vector. */
    acc   = _mm_mul_ps(_mm_load_ps(mat_rows),     _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(0,0,0,0)));
    term1 = _mm_mul_ps(_mm_load_ps(mat_rows + 4), _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(1,1,1,1)));
    term2 = _mm_mul_ps(_mm_load_ps(mat_rows + 8), _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(2,2,2,2)));

    acc = _mm_add_ps(acc, _mm_add_ps(term1, term2));

    /* Clamp: upper bound first, then lower bound of zero. */
    limit = _mm_load_ss(clampMax);
    limit = _mm_shuffle_ps(limit, limit, _MM_SHUFFLE(0,0,0,0));
    acc = _mm_min_ps(acc, limit);
    /* Use an explicit zero vector instead of XOR-ing an uninitialized
     * variable with itself, which reads an indeterminate value. */
    acc = _mm_max_ps(acc, _mm_setzero_ps());

    /* Scale and convert to integers for the lookup-table indices. */
    scale = _mm_load_ss(&floatScale);
    scale = _mm_shuffle_ps(scale, scale, _MM_SHUFFLE(0,0,0,0));
    acc = _mm_mul_ps(acc, scale);
    _mm_store_si128((__m128i *)input, _mm_cvtps_epi32(acc));
}
|
||||
#else
|
||||
#error "Unknown platform"
|
||||
#endif
|
||||
|
||||
*dest++ = transform->output_table_r->data[output[0]];
|
||||
|
@ -1323,7 +1382,9 @@ static void cpuid(uint32_t fxn, uint32_t *a, uint32_t *b, uint32_t *c, uint32_t
|
|||
#define SSE2_EDX_MASK (1UL << 26)
|
||||
static qcms_bool sse2_available(void)
|
||||
{
|
||||
#ifdef HAS_CPUID
|
||||
#if defined(__x86_64__) || defined(_M_AMD64)
|
||||
return true;
|
||||
#elif defined(HAS_CPUID)
|
||||
static int has_sse2 = -1;
|
||||
uint32_t a, b, c, d;
|
||||
uint32_t function = 0x00000001;
|
||||
|
|
Загрузка…
Ссылка в новой задаче