From f4a4cf894e4d2d8b96484f3c1046caa9b8cd10a6 Mon Sep 17 00:00:00 2001 From: Makoto Kato Date: Wed, 15 Apr 2009 22:14:03 +0900 Subject: [PATCH] Bug 487900 - Enable SSE2 code for x86_64 in qcms. r=jmuizelaar --- gfx/qcms/transform.c | 69 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 65 insertions(+), 4 deletions(-) diff --git a/gfx/qcms/transform.c b/gfx/qcms/transform.c index e079d67f95c..f52a6b7f4db 100644 --- a/gfx/qcms/transform.c +++ b/gfx/qcms/transform.c @@ -25,7 +25,7 @@ #include #include "qcmsint.h" -#if defined(_M_IX86) || defined(__i386__) +#if defined(_M_IX86) || defined(__i386__) || defined(__x86_64__) || defined(_M_AMD64) #define X86 #endif @@ -760,6 +760,11 @@ void qcms_transform_data_rgb_out_lut_sse_intrin(qcms_transform *transform, unsig } } #endif + +#if defined(_MSC_VER) && defined(_M_AMD64) +#include <emmintrin.h> +#endif + static void qcms_transform_data_rgb_out_lut_sse(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length) { int i; @@ -827,7 +832,7 @@ static void qcms_transform_data_rgb_out_lut_sse(qcms_transform *transform, unsig , "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7" #endif ); -#else +#elif defined(_MSC_VER) && defined(_M_IX86) __asm { mov eax, mat mov ecx, clampMax @@ -863,6 +868,33 @@ static void qcms_transform_data_rgb_out_lut_sse(qcms_transform *transform, unsig cvtps2dq xmm1, xmm1 movdqa [ebx], xmm1 } +#elif defined(_MSC_VER) && defined(_M_AMD64) + { + __m128 xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm7; + + xmm1 = _mm_load_ps((__m128*)mat); + xmm2 = _mm_load_ps(((__m128*)mat) + 1); + xmm3 = _mm_load_ps(((__m128*)mat) + 2); + xmm0 = _mm_load_ps((__m128*)input); + + xmm1 = _mm_mul_ps(xmm1, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(0,0,0,0))); + xmm2 = _mm_mul_ps(xmm2, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(1,1,1,1))); + xmm3 = _mm_mul_ps(xmm3, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(2,2,2,2))); + + xmm1 = _mm_add_ps(xmm1, _mm_add_ps(xmm2, xmm3)); + + xmm7 = _mm_load_ss(clampMax); + 
xmm7 = _mm_shuffle_ps(xmm7, xmm7, _MM_SHUFFLE(0,0,0,0)); + xmm1 = _mm_min_ps(xmm1, xmm7); + xmm6 = _mm_xor_ps(xmm6, xmm6); + xmm1 = _mm_max_ps(xmm1, xmm6); + xmm5 = _mm_load_ss(&floatScale); + xmm5 = _mm_shuffle_ps(xmm5, xmm5, _MM_SHUFFLE(0,0,0,0)); + xmm1 = _mm_mul_ps(xmm1, xmm5); + _mm_store_si128((__m128i*)input, _mm_cvtps_epi32(xmm1)); + } +#else +#error "Unknown platform" #endif *dest++ = transform->output_table_r->data[output[0]]; @@ -936,7 +968,7 @@ static void qcms_transform_data_rgba_out_lut_sse(qcms_transform *transform, unsi , "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7" #endif ); -#else +#elif defined(_MSC_VER) && defined(_M_IX86) __asm { mov eax, mat mov ecx, clampMax @@ -972,6 +1004,33 @@ static void qcms_transform_data_rgba_out_lut_sse(qcms_transform *transform, unsi cvtps2dq xmm1, xmm1 movdqa [ebx], xmm1 } +#elif defined(_MSC_VER) && defined(_M_AMD64) + { + __m128 xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm7; + + xmm1 = _mm_load_ps((__m128*)mat); + xmm2 = _mm_load_ps(((__m128*)mat) + 1); + xmm3 = _mm_load_ps(((__m128*)mat) + 2); + xmm0 = _mm_load_ps((__m128*)input); + + xmm1 = _mm_mul_ps(xmm1, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(0,0,0,0))); + xmm2 = _mm_mul_ps(xmm2, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(1,1,1,1))); + xmm3 = _mm_mul_ps(xmm3, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(2,2,2,2))); + + xmm1 = _mm_add_ps(xmm1, _mm_add_ps(xmm2, xmm3)); + + xmm7 = _mm_load_ss(clampMax); + xmm7 = _mm_shuffle_ps(xmm7, xmm7, _MM_SHUFFLE(0,0,0,0)); + xmm1 = _mm_min_ps(xmm1, xmm7); + xmm6 = _mm_xor_ps(xmm6, xmm6); + xmm1 = _mm_max_ps(xmm1, xmm6); + xmm5 = _mm_load_ss(&floatScale); + xmm5 = _mm_shuffle_ps(xmm5, xmm5, _MM_SHUFFLE(0,0,0,0)); + xmm1 = _mm_mul_ps(xmm1, xmm5); + _mm_store_si128((__m128i*)input, _mm_cvtps_epi32(xmm1)); + } +#else +#error "Unknown platform" #endif *dest++ = transform->output_table_r->data[output[0]]; @@ -1323,7 +1382,9 @@ static void cpuid(uint32_t fxn, uint32_t *a, uint32_t *b, uint32_t *c, uint32_t #define 
SSE2_EDX_MASK (1UL << 26) static qcms_bool sse2_available(void) { -#ifdef HAS_CPUID +#if defined(__x86_64__) || defined(_M_AMD64) + return true; +#elif defined(HAS_CPUID) static int has_sse2 = -1; uint32_t a, b, c, d; uint32_t function = 0x00000001;