Bug 487900 - Enable SSE2 code for x86_64 in qcms. r=jmuizelaar

This commit is contained in:
Makoto Kato 2009-04-15 22:14:03 +09:00
Родитель f5cf6e8361
Коммит f4a4cf894e
1 изменённый файл: 65 добавлений и 4 удаления

Просмотреть файл

@ -25,7 +25,7 @@
#include <assert.h>
#include "qcmsint.h"
#if defined(_M_IX86) || defined(__i386__)
#if defined(_M_IX86) || defined(__i386__) || defined(__x86_64__) || defined(_M_AMD64)
#define X86
#endif
@ -760,6 +760,11 @@ void qcms_transform_data_rgb_out_lut_sse_intrin(qcms_transform *transform, unsig
}
}
#endif
#if defined(_MSC_VER) && defined(_M_AMD64)
#include <emmintrin.h>
#endif
static void qcms_transform_data_rgb_out_lut_sse(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
{
int i;
@ -827,7 +832,7 @@ static void qcms_transform_data_rgb_out_lut_sse(qcms_transform *transform, unsig
, "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
#endif
);
#else
#elif defined(_MSC_VER) && defined(_M_IX86)
__asm {
mov eax, mat
mov ecx, clampMax
@ -863,6 +868,33 @@ static void qcms_transform_data_rgb_out_lut_sse(qcms_transform *transform, unsig
cvtps2dq xmm1, xmm1
movdqa [ebx], xmm1
}
#elif defined(_MSC_VER) && defined(_M_AMD64)
{
    /*
     * Intrinsic version of the matrix/clamp/scale step, used in the
     * MSVC + AMD64 branch selected by the surrounding #elif.
     *
     * Reads four packed floats from `input`, multiplies lanes 0, 1 and 2
     * by the three 4-float vectors stored back to back at `mat`, sums them,
     * clamps the result to [0, *clampMax], multiplies by `floatScale`, and
     * stores the four values converted to 32-bit integers back at `input`.
     *
     * `mat` and `input` must be 16-byte aligned (aligned load/store).
     * NOTE(review): the table lookups after this block index through
     * `output[...]`, which presumably aliases `input`; confirm against the
     * declarations at the top of the function.
     */
    const float *mat_f = (const float *)mat;
    __m128 in, vec0, vec1, vec2, sum, upper, zero, scale;

    /* _mm_load_ps takes a float pointer; consecutive vectors are 4 floats apart. */
    vec0 = _mm_load_ps(mat_f);
    vec1 = _mm_load_ps(mat_f + 4);
    vec2 = _mm_load_ps(mat_f + 8);
    in = _mm_load_ps((const float *)input);

    /* Broadcast each of the first three input lanes and weight its vector. */
    vec0 = _mm_mul_ps(vec0, _mm_shuffle_ps(in, in, _MM_SHUFFLE(0,0,0,0)));
    vec1 = _mm_mul_ps(vec1, _mm_shuffle_ps(in, in, _MM_SHUFFLE(1,1,1,1)));
    vec2 = _mm_mul_ps(vec2, _mm_shuffle_ps(in, in, _MM_SHUFFLE(2,2,2,2)));
    sum = _mm_add_ps(vec0, _mm_add_ps(vec1, vec2));

    /* Clamp from above, then from below; operand order is kept as-is
     * because min/max are not symmetric when a NaN is involved. */
    upper = _mm_load1_ps(clampMax);
    sum = _mm_min_ps(sum, upper);
    /* Explicit zero vector instead of XOR-ing an uninitialised local with itself. */
    zero = _mm_setzero_ps();
    sum = _mm_max_ps(sum, zero);

    scale = _mm_load1_ps(&floatScale);
    sum = _mm_mul_ps(sum, scale);

    /* Float -> int32 conversion (cvtps2dq), written over the input buffer. */
    _mm_store_si128((__m128i*)input, _mm_cvtps_epi32(sum));
}
#else
#error "Unknown platform"
#endif
*dest++ = transform->output_table_r->data[output[0]];
@ -936,7 +968,7 @@ static void qcms_transform_data_rgba_out_lut_sse(qcms_transform *transform, unsi
, "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
#endif
);
#else
#elif defined(_MSC_VER) && defined(_M_IX86)
__asm {
mov eax, mat
mov ecx, clampMax
@ -972,6 +1004,33 @@ static void qcms_transform_data_rgba_out_lut_sse(qcms_transform *transform, unsi
cvtps2dq xmm1, xmm1
movdqa [ebx], xmm1
}
#elif defined(_MSC_VER) && defined(_M_AMD64)
{
    /*
     * Intrinsic version of the matrix/clamp/scale step, used in the
     * MSVC + AMD64 branch selected by the surrounding #elif.
     *
     * Reads four packed floats from `input`, multiplies lanes 0, 1 and 2
     * by the three 4-float vectors stored back to back at `mat`, sums them,
     * clamps the result to [0, *clampMax], multiplies by `floatScale`, and
     * stores the four values converted to 32-bit integers back at `input`.
     *
     * `mat` and `input` must be 16-byte aligned (aligned load/store).
     * NOTE(review): the table lookups after this block index through
     * `output[...]`, which presumably aliases `input`; confirm against the
     * declarations at the top of the function.
     */
    const float *mat_f = (const float *)mat;
    __m128 in, vec0, vec1, vec2, sum, upper, zero, scale;

    /* _mm_load_ps takes a float pointer; consecutive vectors are 4 floats apart. */
    vec0 = _mm_load_ps(mat_f);
    vec1 = _mm_load_ps(mat_f + 4);
    vec2 = _mm_load_ps(mat_f + 8);
    in = _mm_load_ps((const float *)input);

    /* Broadcast each of the first three input lanes and weight its vector. */
    vec0 = _mm_mul_ps(vec0, _mm_shuffle_ps(in, in, _MM_SHUFFLE(0,0,0,0)));
    vec1 = _mm_mul_ps(vec1, _mm_shuffle_ps(in, in, _MM_SHUFFLE(1,1,1,1)));
    vec2 = _mm_mul_ps(vec2, _mm_shuffle_ps(in, in, _MM_SHUFFLE(2,2,2,2)));
    sum = _mm_add_ps(vec0, _mm_add_ps(vec1, vec2));

    /* Clamp from above, then from below; operand order is kept as-is
     * because min/max are not symmetric when a NaN is involved. */
    upper = _mm_load1_ps(clampMax);
    sum = _mm_min_ps(sum, upper);
    /* Explicit zero vector instead of XOR-ing an uninitialised local with itself. */
    zero = _mm_setzero_ps();
    sum = _mm_max_ps(sum, zero);

    scale = _mm_load1_ps(&floatScale);
    sum = _mm_mul_ps(sum, scale);

    /* Float -> int32 conversion (cvtps2dq), written over the input buffer. */
    _mm_store_si128((__m128i*)input, _mm_cvtps_epi32(sum));
}
#else
#error "Unknown platform"
#endif
*dest++ = transform->output_table_r->data[output[0]];
@ -1323,7 +1382,9 @@ static void cpuid(uint32_t fxn, uint32_t *a, uint32_t *b, uint32_t *c, uint32_t
#define SSE2_EDX_MASK (1UL << 26)
static qcms_bool sse2_available(void)
{
#ifdef HAS_CPUID
#if defined(__x86_64__) || defined(_M_AMD64)
return true;
#elif defined(HAS_CPUID)
static int has_sse2 = -1;
uint32_t a, b, c, d;
uint32_t function = 0x00000001;