Backed out changeset cb4f078cc8cb (bug 512865)

Was causing crashes on the leak test box.
This commit is contained in:
Jeff Muizelaar 2009-09-24 23:35:53 -04:00
Родитель 7e089e6fae
Коммит 3f907e258a
5 изменённых файлов: 368 добавлений и 578 удалений

Просмотреть файл

@ -15,17 +15,6 @@ EXPORTS = qcms.h qcmstypes.h
CSRCS = iccread.c transform.c
ifeq (86,$(findstring 86,$(OS_TEST)))
CSRCS += transform-sse2.c transform-sse1.c
ifdef GNU_CC
SSE1_FLAGS=-msse
SSE2_FLAGS=-msse2
else
SSE1_FLAGS=
SSE2_FLAGS=
endif
endif
FORCE_STATIC_LIB = 1
# This library is used by other shared libs
FORCE_USE_PIC = 1
@ -33,15 +22,3 @@ FORCE_USE_PIC = 1
include $(topsrcdir)/config/rules.mk
CFLAGS += -DMOZ_QCMS
# special rules for transform-sse*.c to get the right cflags. (taken from pixman/src/Makefile.in)
transform-sse1.$(OBJ_SUFFIX): transform-sse1.c Makefile Makefile.in
$(REPORT_BUILD)
@$(MAKE_DEPS_AUTO_CC)
$(ELOG) $(CC) $(OUTOPTION)$@ -c $(COMPILE_CFLAGS) $(SSE1_FLAGS) $(_VPATH_SRCS)
transform-sse2.$(OBJ_SUFFIX): transform-sse2.c Makefile Makefile.in
$(REPORT_BUILD)
@$(MAKE_DEPS_AUTO_CC)
$(ELOG) $(CC) $(OUTOPTION)$@ -c $(COMPILE_CFLAGS) $(SSE2_FLAGS) $(_VPATH_SRCS)

Просмотреть файл

@ -141,20 +141,3 @@ static inline s15Fixed16Number double_to_s15Fixed16Number(double v)
void precache_release(struct precache_output *p);
qcms_bool set_rgb_colorants(qcms_profile *profile, qcms_CIE_xyY white_point, qcms_CIE_xyYTRIPLE primaries);
/* SSE2 transform of packed 3-byte RGB pixels: `length` pixels are read
 * from `src` and written to `dest`. */
void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform,
unsigned char *src,
unsigned char *dest,
size_t length);
/* SSE2 transform of packed 4-byte RGBA pixels: as above, with the fourth
 * (alpha) byte of every pixel copied through unchanged. */
void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform,
unsigned char *src,
unsigned char *dest,
size_t length);
/* SSE1 variant of the packed RGB transform (same arguments as the SSE2
 * version). */
void qcms_transform_data_rgb_out_lut_sse1(qcms_transform *transform,
unsigned char *src,
unsigned char *dest,
size_t length);
/* SSE1 variant of the packed RGBA transform (same arguments as the SSE2
 * version; alpha is copied through unchanged). */
void qcms_transform_data_rgba_out_lut_sse1(qcms_transform *transform,
unsigned char *src,
unsigned char *dest,
size_t length);

Просмотреть файл

@ -1,253 +0,0 @@
#include <xmmintrin.h>
#include "qcmsint.h"
/* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */
/* Factor the clamped matrix result is multiplied by before it is converted
 * to the integer indices used to look up the output tables. */
#define FLOATSCALE 65536.0f
/* Upper clamp bound, 65535/65536: after scaling by FLOATSCALE the largest
 * possible value is 65535. */
#define CLAMPMAXVAL ( ((float) (65536 - 1)) / 65536.0f )
/* Both arrays are read with _mm_load_ps, which requires 16-byte alignment.
 * NOTE(review): ALIGN is not defined in this file; presumably it comes from
 * qcmsint.h and requests that alignment -- confirm. */
static const ALIGN float floatScaleX4[4] =
{ FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE};
static const ALIGN float clampMaxValueX4[4] =
{ CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL};
/*
 * Transform `length` packed 3-byte RGB pixels from `src` into `dest` using
 * SSE1 arithmetic (and MMX registers for the float -> int conversion).
 *
 * Per pixel: each channel byte is looked up in its input gamma table, the
 * three values are multiplied by the rows of transform->matrix and summed,
 * the sum is clamped to [0, CLAMPMAXVAL], scaled by FLOATSCALE, converted to
 * integers, and those integers index the output tables to produce the bytes
 * written to `dest`.
 *
 * The loop is software pipelined: the gamma lookups for the next pixel are
 * issued while the indices of the current pixel are being stored, so the
 * last pixel is finished after the loop.
 *
 * transform: supplies matrix, input_gamma_table_{r,g,b}, output_table_{r,g,b}
 * src:       3 * length input bytes
 * dest:      3 * length output bytes
 * length:    number of pixels; 0 is a no-op
 */
void qcms_transform_data_rgb_out_lut_sse1(qcms_transform *transform,
                                          unsigned char *src,
                                          unsigned char *dest,
                                          size_t length)
{
    /* size_t so the counter cannot wrap before reaching a large `length` */
    size_t i;
    float (*mat)[4] = transform->matrix;
    char input_back[32];
    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
     * because they don't work on stack variables. gcc 4.4 does do the right thing
     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
    /* scratch area the converted table indices are stored to and read from */
    uint32_t *output = (uint32_t*)(((uintptr_t)&input_back[16]) & ~0xf);

    /* deref *transform now to avoid it in loop */
    const float *igtbl_r = transform->input_gamma_table_r;
    const float *igtbl_g = transform->input_gamma_table_g;
    const float *igtbl_b = transform->input_gamma_table_b;

    /* deref *transform now to avoid it in loop */
    const uint8_t *otdata_r = &transform->output_table_r->data[0];
    const uint8_t *otdata_g = &transform->output_table_g->data[0];
    const uint8_t *otdata_b = &transform->output_table_b->data[0];

    /* input matrix values never change */
    const __m128 mat0 = _mm_load_ps(mat[0]);
    const __m128 mat1 = _mm_load_ps(mat[1]);
    const __m128 mat2 = _mm_load_ps(mat[2]);

    /* these values don't change, either */
    const __m128 max = _mm_load_ps(clampMaxValueX4);
    const __m128 min = _mm_setzero_ps();
    const __m128 scale = _mm_load_ps(floatScaleX4);

    /* working variables */
    __m128 vec_r, vec_g, vec_b, result;

    /* nothing to do for an empty buffer (returns before any MMX use) */
    if (!length)
        return;

    /* one pixel is handled outside of the loop */
    length--;

    /* setup for transforming 1st pixel */
    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
    src += 3;

    /* transform all but final pixel */
    for (i = 0; i < length; i++)
    {
        /* broadcast each gamma value across its register */
        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

        /* gamma * matrix */
        vec_r = _mm_mul_ps(vec_r, mat0);
        vec_g = _mm_mul_ps(vec_g, mat1);
        vec_b = _mm_mul_ps(vec_b, mat2);

        /* sum the three products, clamp, scale to table-index range */
        vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
        vec_r = _mm_max_ps(min, vec_r);
        vec_r = _mm_min_ps(max, vec_r);
        result = _mm_mul_ps(vec_r, scale);

        /* store calc'd output tables indices; SSE1 converts two lanes at a
         * time into an MMX register, so the upper half is moved down and
         * converted separately */
        *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
        result = _mm_movehl_ps(result, result);
        *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);

        /* load for next loop while store completes */
        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
        src += 3;

        /* use calc'd indices to output RGB values */
        dest[0] = otdata_r[output[0]];
        dest[1] = otdata_g[output[1]];
        dest[2] = otdata_b[output[2]];
        dest += 3;
    }

    /* handle final (maybe only) pixel */
    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

    vec_r = _mm_mul_ps(vec_r, mat0);
    vec_g = _mm_mul_ps(vec_g, mat1);
    vec_b = _mm_mul_ps(vec_b, mat2);

    vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
    vec_r = _mm_max_ps(min, vec_r);
    vec_r = _mm_min_ps(max, vec_r);
    result = _mm_mul_ps(vec_r, scale);

    *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
    result = _mm_movehl_ps(result, result);
    *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);

    dest[0] = otdata_r[output[0]];
    dest[1] = otdata_g[output[1]];
    dest[2] = otdata_b[output[2]];

    /* leave MMX state so later x87 floating point code works */
    _mm_empty();
}
/*
 * Transform `length` packed 4-byte RGBA pixels from `src` into `dest` using
 * SSE1 arithmetic (and MMX registers for the float -> int conversion).
 *
 * The colour channels go through: input gamma table lookup, multiplication
 * by the rows of transform->matrix, summation, clamp to [0, CLAMPMAXVAL],
 * scale by FLOATSCALE, integer conversion, output table lookup.  The fourth
 * byte of each pixel (alpha) is copied to `dest` unchanged.
 *
 * The loop is software pipelined: the gamma lookups and the alpha byte of
 * the next pixel are fetched while the current pixel is being finished, so
 * the last pixel is completed after the loop.
 *
 * transform: supplies matrix, input_gamma_table_{r,g,b}, output_table_{r,g,b}
 * src:       4 * length input bytes
 * dest:      4 * length output bytes
 * length:    number of pixels; 0 is a no-op
 */
void qcms_transform_data_rgba_out_lut_sse1(qcms_transform *transform,
                                           unsigned char *src,
                                           unsigned char *dest,
                                           size_t length)
{
    /* size_t so the counter cannot wrap before reaching a large `length` */
    size_t i;
    float (*mat)[4] = transform->matrix;
    char input_back[32];
    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
     * because they don't work on stack variables. gcc 4.4 does do the right thing
     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
    /* scratch area the converted table indices are stored to and read from */
    uint32_t *output = (uint32_t*)(((uintptr_t)&input_back[16]) & ~0xf);

    /* deref *transform now to avoid it in loop */
    const float *igtbl_r = transform->input_gamma_table_r;
    const float *igtbl_g = transform->input_gamma_table_g;
    const float *igtbl_b = transform->input_gamma_table_b;

    /* deref *transform now to avoid it in loop */
    const uint8_t *otdata_r = &transform->output_table_r->data[0];
    const uint8_t *otdata_g = &transform->output_table_g->data[0];
    const uint8_t *otdata_b = &transform->output_table_b->data[0];

    /* input matrix values never change */
    const __m128 mat0 = _mm_load_ps(mat[0]);
    const __m128 mat1 = _mm_load_ps(mat[1]);
    const __m128 mat2 = _mm_load_ps(mat[2]);

    /* these values don't change, either */
    const __m128 max = _mm_load_ps(clampMaxValueX4);
    const __m128 min = _mm_setzero_ps();
    const __m128 scale = _mm_load_ps(floatScaleX4);

    /* working variables */
    __m128 vec_r, vec_g, vec_b, result;
    unsigned char alpha;

    /* nothing to do for an empty buffer (returns before any MMX use) */
    if (!length)
        return;

    /* one pixel is handled outside of the loop */
    length--;

    /* setup for transforming 1st pixel */
    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
    alpha = src[3];
    src += 4;

    /* transform all but final pixel */
    for (i = 0; i < length; i++)
    {
        /* broadcast each gamma value across its register */
        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

        /* gamma * matrix */
        vec_r = _mm_mul_ps(vec_r, mat0);
        vec_g = _mm_mul_ps(vec_g, mat1);
        vec_b = _mm_mul_ps(vec_b, mat2);

        /* store alpha for this pixel; load alpha for next
         * (src already points at the next pixel here) */
        dest[3] = alpha;
        alpha = src[3];

        /* sum the three products, clamp, scale to table-index range */
        vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
        vec_r = _mm_max_ps(min, vec_r);
        vec_r = _mm_min_ps(max, vec_r);
        result = _mm_mul_ps(vec_r, scale);

        /* store calc'd output tables indices; SSE1 converts two lanes at a
         * time into an MMX register, so the upper half is moved down and
         * converted separately */
        *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
        result = _mm_movehl_ps(result, result);
        *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);

        /* load gamma values for next loop while store completes */
        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
        src += 4;

        /* use calc'd indices to output RGB values */
        dest[0] = otdata_r[output[0]];
        dest[1] = otdata_g[output[1]];
        dest[2] = otdata_b[output[2]];
        dest += 4;
    }

    /* handle final (maybe only) pixel */
    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

    vec_r = _mm_mul_ps(vec_r, mat0);
    vec_g = _mm_mul_ps(vec_g, mat1);
    vec_b = _mm_mul_ps(vec_b, mat2);

    dest[3] = alpha;

    vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
    vec_r = _mm_max_ps(min, vec_r);
    vec_r = _mm_min_ps(max, vec_r);
    result = _mm_mul_ps(vec_r, scale);

    *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
    result = _mm_movehl_ps(result, result);
    *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);

    dest[0] = otdata_r[output[0]];
    dest[1] = otdata_g[output[1]];
    dest[2] = otdata_b[output[2]];

    /* leave MMX state so later x87 floating point code works */
    _mm_empty();
}

Просмотреть файл

@ -1,243 +0,0 @@
#include <emmintrin.h>
#include "qcmsint.h"
/* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */
/* Factor the clamped matrix result is multiplied by before it is converted
 * to the integer indices used to look up the output tables. */
#define FLOATSCALE 65536.0f
/* Upper clamp bound, 65535/65536: after scaling by FLOATSCALE the largest
 * possible value is 65535. */
#define CLAMPMAXVAL ( ((float) (65536 - 1)) / 65536.0f )
/* Both arrays are read with _mm_load_ps, which requires 16-byte alignment.
 * NOTE(review): ALIGN is not defined in this file; presumably it comes from
 * qcmsint.h and requests that alignment -- confirm. */
static const ALIGN float floatScaleX4[4] =
{ FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE};
static const ALIGN float clampMaxValueX4[4] =
{ CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL};
/*
 * Transform `length` packed 3-byte RGB pixels from `src` into `dest` using
 * SSE2.
 *
 * Per pixel: each channel byte is looked up in its input gamma table, the
 * three values are multiplied by the rows of transform->matrix and summed,
 * the sum is clamped to [0, CLAMPMAXVAL], scaled by FLOATSCALE, converted to
 * integers, and those integers index the output tables to produce the bytes
 * written to `dest`.
 *
 * The loop is software pipelined: the gamma lookups for the next pixel are
 * issued while the indices of the current pixel are being stored, so the
 * last pixel is finished after the loop.
 *
 * transform: supplies matrix, input_gamma_table_{r,g,b}, output_table_{r,g,b}
 * src:       3 * length input bytes
 * dest:      3 * length output bytes
 * length:    number of pixels; 0 is a no-op
 */
void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform,
                                          unsigned char *src,
                                          unsigned char *dest,
                                          size_t length)
{
    /* size_t so the counter cannot wrap before reaching a large `length` */
    size_t i;
    float (*mat)[4] = transform->matrix;
    char input_back[32];
    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
     * because they don't work on stack variables. gcc 4.4 does do the right thing
     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
    /* scratch area the converted table indices are stored to and read from */
    uint32_t *output = (uint32_t*)(((uintptr_t)&input_back[16]) & ~0xf);

    /* deref *transform now to avoid it in loop */
    const float *igtbl_r = transform->input_gamma_table_r;
    const float *igtbl_g = transform->input_gamma_table_g;
    const float *igtbl_b = transform->input_gamma_table_b;

    /* deref *transform now to avoid it in loop */
    const uint8_t *otdata_r = &transform->output_table_r->data[0];
    const uint8_t *otdata_g = &transform->output_table_g->data[0];
    const uint8_t *otdata_b = &transform->output_table_b->data[0];

    /* input matrix values never change */
    const __m128 mat0 = _mm_load_ps(mat[0]);
    const __m128 mat1 = _mm_load_ps(mat[1]);
    const __m128 mat2 = _mm_load_ps(mat[2]);

    /* these values don't change, either */
    const __m128 max = _mm_load_ps(clampMaxValueX4);
    const __m128 min = _mm_setzero_ps();
    const __m128 scale = _mm_load_ps(floatScaleX4);

    /* working variables */
    __m128 vec_r, vec_g, vec_b, result;

    /* nothing to do for an empty buffer */
    if (!length)
        return;

    /* one pixel is handled outside of the loop */
    length--;

    /* setup for transforming 1st pixel */
    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
    src += 3;

    /* transform all but final pixel */
    for (i = 0; i < length; i++)
    {
        /* broadcast each gamma value across its register */
        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

        /* gamma * matrix */
        vec_r = _mm_mul_ps(vec_r, mat0);
        vec_g = _mm_mul_ps(vec_g, mat1);
        vec_b = _mm_mul_ps(vec_b, mat2);

        /* sum the three products, clamp, scale to table-index range */
        vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
        vec_r = _mm_max_ps(min, vec_r);
        vec_r = _mm_min_ps(max, vec_r);
        result = _mm_mul_ps(vec_r, scale);

        /* store calc'd output tables indices (all four lanes at once) */
        _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));

        /* load for next loop while store completes */
        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
        src += 3;

        /* use calc'd indices to output RGB values */
        dest[0] = otdata_r[output[0]];
        dest[1] = otdata_g[output[1]];
        dest[2] = otdata_b[output[2]];
        dest += 3;
    }

    /* handle final (maybe only) pixel */
    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

    vec_r = _mm_mul_ps(vec_r, mat0);
    vec_g = _mm_mul_ps(vec_g, mat1);
    vec_b = _mm_mul_ps(vec_b, mat2);

    vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
    vec_r = _mm_max_ps(min, vec_r);
    vec_r = _mm_min_ps(max, vec_r);
    result = _mm_mul_ps(vec_r, scale);

    _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));

    dest[0] = otdata_r[output[0]];
    dest[1] = otdata_g[output[1]];
    dest[2] = otdata_b[output[2]];
}
/*
 * Transform `length` packed 4-byte RGBA pixels from `src` into `dest` using
 * SSE2.
 *
 * The colour channels go through: input gamma table lookup, multiplication
 * by the rows of transform->matrix, summation, clamp to [0, CLAMPMAXVAL],
 * scale by FLOATSCALE, integer conversion, output table lookup.  The fourth
 * byte of each pixel (alpha) is copied to `dest` unchanged.
 *
 * The loop is software pipelined: the gamma lookups and the alpha byte of
 * the next pixel are fetched while the current pixel is being finished, so
 * the last pixel is completed after the loop.
 *
 * transform: supplies matrix, input_gamma_table_{r,g,b}, output_table_{r,g,b}
 * src:       4 * length input bytes
 * dest:      4 * length output bytes
 * length:    number of pixels; 0 is a no-op
 */
void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform,
                                           unsigned char *src,
                                           unsigned char *dest,
                                           size_t length)
{
    /* size_t so the counter cannot wrap before reaching a large `length` */
    size_t i;
    float (*mat)[4] = transform->matrix;
    char input_back[32];
    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
     * because they don't work on stack variables. gcc 4.4 does do the right thing
     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
    /* scratch area the converted table indices are stored to and read from */
    uint32_t *output = (uint32_t*)(((uintptr_t)&input_back[16]) & ~0xf);

    /* deref *transform now to avoid it in loop */
    const float *igtbl_r = transform->input_gamma_table_r;
    const float *igtbl_g = transform->input_gamma_table_g;
    const float *igtbl_b = transform->input_gamma_table_b;

    /* deref *transform now to avoid it in loop */
    const uint8_t *otdata_r = &transform->output_table_r->data[0];
    const uint8_t *otdata_g = &transform->output_table_g->data[0];
    const uint8_t *otdata_b = &transform->output_table_b->data[0];

    /* input matrix values never change */
    const __m128 mat0 = _mm_load_ps(mat[0]);
    const __m128 mat1 = _mm_load_ps(mat[1]);
    const __m128 mat2 = _mm_load_ps(mat[2]);

    /* these values don't change, either */
    const __m128 max = _mm_load_ps(clampMaxValueX4);
    const __m128 min = _mm_setzero_ps();
    const __m128 scale = _mm_load_ps(floatScaleX4);

    /* working variables */
    __m128 vec_r, vec_g, vec_b, result;
    unsigned char alpha;

    /* nothing to do for an empty buffer */
    if (!length)
        return;

    /* one pixel is handled outside of the loop */
    length--;

    /* setup for transforming 1st pixel */
    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
    alpha = src[3];
    src += 4;

    /* transform all but final pixel */
    for (i = 0; i < length; i++)
    {
        /* broadcast each gamma value across its register */
        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

        /* gamma * matrix */
        vec_r = _mm_mul_ps(vec_r, mat0);
        vec_g = _mm_mul_ps(vec_g, mat1);
        vec_b = _mm_mul_ps(vec_b, mat2);

        /* store alpha for this pixel; load alpha for next
         * (src already points at the next pixel here) */
        dest[3] = alpha;
        alpha = src[3];

        /* sum the three products, clamp, scale to table-index range */
        vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
        vec_r = _mm_max_ps(min, vec_r);
        vec_r = _mm_min_ps(max, vec_r);
        result = _mm_mul_ps(vec_r, scale);

        /* store calc'd output tables indices (all four lanes at once) */
        _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));

        /* load gamma values for next loop while store completes */
        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
        src += 4;

        /* use calc'd indices to output RGB values */
        dest[0] = otdata_r[output[0]];
        dest[1] = otdata_g[output[1]];
        dest[2] = otdata_b[output[2]];
        dest += 4;
    }

    /* handle final (maybe only) pixel */
    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

    vec_r = _mm_mul_ps(vec_r, mat0);
    vec_g = _mm_mul_ps(vec_g, mat1);
    vec_b = _mm_mul_ps(vec_b, mat2);

    dest[3] = alpha;

    vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
    vec_r = _mm_max_ps(min, vec_r);
    vec_r = _mm_min_ps(max, vec_r);
    result = _mm_mul_ps(vec_r, scale);

    _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));

    dest[0] = otdata_r[output[0]];
    dest[1] = otdata_g[output[1]];
    dest[2] = otdata_b[output[2]];
}

Просмотреть файл

@ -25,10 +25,9 @@
#include <assert.h>
#include "qcmsint.h"
/* for MSVC, GCC, and Intel compilers */
#if defined(_M_IX86) || defined(__i386__) || defined(_M_AMD64) || defined(__x86_64__)
#if defined(_M_IX86) || defined(__i386__) || defined(__x86_64__) || defined(_M_AMD64)
#define X86
#endif /* _M_IX86 || __i386__ || _M_AMD64 || __x86_64__ */
#endif
//XXX: could use a better name
typedef uint16_t uint16_fract_t;
@ -735,6 +734,352 @@ static void qcms_transform_data_graya_out_precache(qcms_transform *transform, un
}
}
/* Factor the clamped matrix result is multiplied by before it is converted
 * to the integer indices used to look up the output tables. */
static const ALIGN float floatScale = 65536.0f;
/* Pointer to floatScale, loaded by name from the MSVC x86 inline assembly. */
static const ALIGN float * const floatScaleAddr = &floatScale; // Win32 ASM doesn't know how to take addressOf inline
/* Upper clamp bound, 65535/65536: after scaling by floatScale the largest
 * possible value is 65535. */
static const ALIGN float clampMaxValue = ((float) (65536 - 1)) / 65536.0f;
#ifdef X86
#if 0
#include <emmintrin.h>
/*
 * Intrinsics version of the packed RGB SSE2 transform (currently compiled
 * out by the surrounding #if 0).
 *
 * Per pixel: each channel byte is looked up in its own input gamma table,
 * the three values are multiplied by the rows of transform->matrix and
 * summed, the sum is clamped to [0, clampMaxValue], scaled by floatScale,
 * converted to integers, and those integers index the output tables.
 *
 * Fixes relative to the previous text of this function:
 *  - green and blue were looked up in input_gamma_table_r;
 *  - the clamp bound was loaded from the address of a pointer variable
 *    instead of from clampMaxValue itself;
 *  - the loop counter was a signed int compared against a size_t.
 *
 * src/dest hold 3 * length bytes; length is the number of pixels.
 */
void qcms_transform_data_rgb_out_lut_sse_intrin(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
{
    size_t i;
    float (*mat)[4] = transform->matrix;
    char input_back[32];
    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
     * because they don't work on stack variables. gcc 4.4 does do the right thing
     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
    float *input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
    /* share input and output locations to save having to keep the
     * locations in separate registers */
    uint32_t* output = (uint32_t*)input;

    for (i = 0; i < length; i++) {
        unsigned char device_r = *src++;
        unsigned char device_g = *src++;
        unsigned char device_b = *src++;

        __m128 xmm1 = _mm_load_ps(mat[0]);
        __m128 xmm2 = _mm_load_ps(mat[1]);
        __m128 xmm3 = _mm_load_ps(mat[2]);

        /* per-channel gamma lookup, broadcast across the register */
        __m128 vec_r = _mm_load_ss(&transform->input_gamma_table_r[device_r]);
        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
        __m128 vec_g = _mm_load_ss(&transform->input_gamma_table_g[device_g]);
        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
        __m128 vec_b = _mm_load_ss(&transform->input_gamma_table_b[device_b]);
        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

        /* gamma * matrix, then sum the three products */
        vec_r = _mm_mul_ps(vec_r, xmm1);
        vec_g = _mm_mul_ps(vec_g, xmm2);
        vec_b = _mm_mul_ps(vec_b, xmm3);

        vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));

        /* clamp to [0, clampMaxValue] */
        __m128 max = _mm_load_ss(&clampMaxValue);
        max = _mm_shuffle_ps(max, max, 0);
        __m128 min = _mm_setzero_ps();

        vec_r = _mm_max_ps(min, vec_r);
        vec_r = _mm_min_ps(max, vec_r);

        /* scale to table-index range and convert to integers */
        __m128 scale = _mm_load_ss(&floatScale);
        scale = _mm_shuffle_ps(scale, scale, 0);
        __m128 result = _mm_mul_ps(vec_r, scale);

        __m128i out = _mm_cvtps_epi32(result);
        _mm_store_si128((__m128i*)input, out);

        *dest++ = transform->output_table_r->data[output[0]];
        *dest++ = transform->output_table_g->data[output[1]];
        *dest++ = transform->output_table_b->data[output[2]];
    }
}
#endif
#if defined(_MSC_VER) && defined(_M_AMD64)
#include <emmintrin.h>
#endif
/*
 * Transform `length` packed 3-byte RGB pixels from `src` into `dest` with
 * SSE/SSE2 instructions.
 *
 * Per pixel: the three channel bytes are looked up in the input gamma
 * tables and written to a 16-byte aligned scratch vector; that vector is
 * multiplied by transform->matrix, clamped to [0, clampMaxValue], scaled by
 * floatScale and converted to integers in place; the first three integers
 * then index the output tables.  Only lanes 0..2 of the scratch vector are
 * written before the SIMD step and only lanes 0..2 are read afterwards.
 *
 * The SIMD step has three equivalent implementations selected at compile
 * time: GCC inline asm, MSVC x86 inline asm, and intrinsics for MSVC x64.
 *
 * src/dest hold 3 * length bytes; length is the number of pixels.
 */
static void qcms_transform_data_rgb_out_lut_sse(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
{
    /* size_t so the counter cannot wrap before reaching a large `length` */
    size_t i;
    float (*mat)[4] = transform->matrix;
    char input_back[32];
    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
     * because they don't work on stack variables. gcc 4.4 does do the right thing
     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
    float *input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
    /* share input and output locations to save having to keep the
     * locations in separate registers */
    uint32_t* output = (uint32_t*)input;

    for (i = 0; i < length; i++) {
        const float *clampMax = &clampMaxValue;

        unsigned char device_r = *src++;
        unsigned char device_g = *src++;
        unsigned char device_b = *src++;

        input[0] = transform->input_gamma_table_r[device_r];
        input[1] = transform->input_gamma_table_g[device_g];
        input[2] = transform->input_gamma_table_b[device_b];

#ifdef __GNUC__
        __asm(
              "movaps (%0), %%xmm1;\n\t"   // Move the first matrix column to xmm1
              "movaps 16(%0), %%xmm2;\n\t" // Move the second matrix column to xmm2
              "movaps 32(%0), %%xmm3;\n\t" // move the third matrix column to xmm3
              "movaps (%3), %%xmm0;\n\t"   // Move the vector to xmm0

              // Note - We have to copy and then shuffle because of the weird
              // semantics of shufps
              //
              "movaps %%xmm0, %%xmm4;\n\t" // Copy the vector to xmm4
              "shufps $0, %%xmm4, %%xmm4;\n\t" // Shuffle to repeat the first vector element repeated 4 times
              "mulps %%xmm4, %%xmm1;\n\t" // Multiply the first vector element by the first matrix column
              "movaps %%xmm0, %%xmm5; \n\t" // Copy the vector to xmm5
              "shufps $0x55, %%xmm5, %%xmm5;\n\t" // Shuffle to repeat the second vector element repeated 4 times
              "mulps %%xmm5, %%xmm2;\n\t" // Multiply the second vector element by the second matrix column
              "movaps %%xmm0, %%xmm6;\n\t" // Copy the vector to xmm6
              "shufps $0xAA, %%xmm6, %%xmm6;\n\t" // Shuffle to repeat the third vector element repeated 4 times
              "mulps %%xmm6, %%xmm3;\n\t" // Multiply the third vector element by the third matrix column

              "addps %%xmm3, %%xmm2;\n\t" // Sum (second + third) columns
              "addps %%xmm2, %%xmm1;\n\t" // Sum ((second + third) + first) columns

              "movss (%1), %%xmm7;\n\t" // load the floating point representation of 65535/65536
              "shufps $0, %%xmm7, %%xmm7;\n\t" // move it into all of the four slots
              "minps %%xmm7, %%xmm1;\n\t" // clamp the vector to 1.0 max
              "xorps %%xmm6, %%xmm6;\n\t"     // get us cleared bit pattern, which is 0.0f
              "maxps %%xmm6, %%xmm1;\n\t" // clamp the vector to 0.0 min
              "movss (%2), %%xmm5;\n\t" // load the floating point scale factor
              "shufps $0, %%xmm5, %%xmm5;\n\t" // put it in all four slots
              "mulps %%xmm5, %%xmm1;\n\t" // multiply by the scale factor
              "cvtps2dq %%xmm1, %%xmm1;\n\t" // convert to integers
              "movdqa %%xmm1, (%3);\n\t" // store

              :
              : "r" (mat), "r" (clampMax), "r" (&floatScale), "r" (input)
              : "memory"
/* older versions of gcc don't know about these registers so only include them as constraints
   if gcc knows about them */
#ifdef __SSE2__
                , "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
#endif
              );
#elif defined(_MSC_VER) && defined(_M_IX86)
        __asm {
              mov      eax, mat
              mov      ecx, clampMax
              mov      edx, floatScaleAddr
              mov      ebx, input

              movaps   xmm1, [eax]
              movaps   xmm2, [eax + 16]
              movaps   xmm3, [eax + 32]
              movaps   xmm0, [ebx]

              movaps   xmm4, xmm0
              shufps   xmm4, xmm4, 0
              mulps    xmm1, xmm4
              movaps   xmm5, xmm0
              shufps   xmm5, xmm5, 0x55
              mulps    xmm2, xmm5
              movaps   xmm6, xmm0
              shufps   xmm6, xmm6, 0xAA
              mulps    xmm3, xmm6

              addps    xmm2, xmm3
              addps    xmm1, xmm2

              movss    xmm7, [ecx]
              shufps   xmm7, xmm7, 0
              minps    xmm1, xmm7
              xorps    xmm6, xmm6
              maxps    xmm1, xmm6
              movss    xmm5, [edx]
              shufps   xmm5, xmm5, 0
              mulps    xmm1, xmm5
              cvtps2dq xmm1, xmm1
              movdqa   [ebx], xmm1
        }
#elif defined(_MSC_VER) && defined(_M_AMD64)
        {
            __m128 xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm7;

            /* _mm_load_ps takes a float pointer; the matrix rows are 16
             * bytes apart, so mat[0..2] are the three vectors to load */
            xmm1 = _mm_load_ps(mat[0]);
            xmm2 = _mm_load_ps(mat[1]);
            xmm3 = _mm_load_ps(mat[2]);
            xmm0 = _mm_load_ps(input);

            xmm1 = _mm_mul_ps(xmm1, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(0,0,0,0)));
            xmm2 = _mm_mul_ps(xmm2, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(1,1,1,1)));
            xmm3 = _mm_mul_ps(xmm3, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(2,2,2,2)));

            xmm1 = _mm_add_ps(xmm1, _mm_add_ps(xmm2, xmm3));

            xmm7 = _mm_load_ss(clampMax);
            xmm7 = _mm_shuffle_ps(xmm7, xmm7, _MM_SHUFFLE(0,0,0,0));
            xmm1 = _mm_min_ps(xmm1, xmm7);
            /* zero without reading an uninitialized variable */
            xmm6 = _mm_setzero_ps();
            xmm1 = _mm_max_ps(xmm1, xmm6);
            xmm5 = _mm_load_ss(&floatScale);
            xmm5 = _mm_shuffle_ps(xmm5, xmm5, _MM_SHUFFLE(0,0,0,0));
            xmm1 = _mm_mul_ps(xmm1, xmm5);
            _mm_store_si128((__m128i*)input, _mm_cvtps_epi32(xmm1));
        }
#else
#error "Unknown platform"
#endif

        *dest++ = transform->output_table_r->data[output[0]];
        *dest++ = transform->output_table_g->data[output[1]];
        *dest++ = transform->output_table_b->data[output[2]];
    }
}
/*
 * Transform `length` packed 4-byte RGBA pixels from `src` into `dest` with
 * SSE/SSE2 instructions.
 *
 * Per pixel: the three colour bytes are looked up in the input gamma tables
 * and written to a 16-byte aligned scratch vector; that vector is
 * multiplied by transform->matrix, clamped to [0, clampMaxValue], scaled by
 * floatScale and converted to integers in place; the first three integers
 * then index the output tables.  The fourth byte (alpha) is copied to
 * `dest` unchanged.  Only lanes 0..2 of the scratch vector are written
 * before the SIMD step and only lanes 0..2 are read afterwards.
 *
 * The SIMD step has three equivalent implementations selected at compile
 * time: GCC inline asm, MSVC x86 inline asm, and intrinsics for MSVC x64.
 *
 * src/dest hold 4 * length bytes; length is the number of pixels.
 */
static void qcms_transform_data_rgba_out_lut_sse(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
{
    /* size_t so the counter cannot wrap before reaching a large `length` */
    size_t i;
    float (*mat)[4] = transform->matrix;
    char input_back[32];
    /* align input on 16 byte boundary */
    float *input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
    /* share input and output locations to save having to keep the
     * locations in separate registers */
    uint32_t* output = (uint32_t*)input;

    for (i = 0; i < length; i++) {
        const float *clampMax = &clampMaxValue;

        unsigned char device_r = *src++;
        unsigned char device_g = *src++;
        unsigned char device_b = *src++;
        unsigned char alpha = *src++;

        input[0] = transform->input_gamma_table_r[device_r];
        input[1] = transform->input_gamma_table_g[device_g];
        input[2] = transform->input_gamma_table_b[device_b];

#ifdef __GNUC__
        __asm(
              "movaps (%0), %%xmm1;\n\t"   // Move the first matrix column to xmm1
              "movaps 16(%0), %%xmm2;\n\t" // Move the second matrix column to xmm2
              "movaps 32(%0), %%xmm3;\n\t" // move the third matrix column to xmm3
              "movaps (%3), %%xmm0;\n\t"   // Move the vector to xmm0

              // Note - We have to copy and then shuffle because of the weird
              // semantics of shufps
              //
              "movaps %%xmm0, %%xmm4;\n\t" // Copy the vector to xmm4
              "shufps $0, %%xmm4, %%xmm4;\n\t" // Shuffle to repeat the first vector element repeated 4 times
              "mulps %%xmm4, %%xmm1;\n\t" // Multiply the first vector element by the first matrix column
              "movaps %%xmm0, %%xmm5; \n\t" // Copy the vector to xmm5
              "shufps $0x55, %%xmm5, %%xmm5;\n\t" // Shuffle to repeat the second vector element repeated 4 times
              "mulps %%xmm5, %%xmm2;\n\t" // Multiply the second vector element by the second matrix column
              "movaps %%xmm0, %%xmm6;\n\t" // Copy the vector to xmm6
              "shufps $0xAA, %%xmm6, %%xmm6;\n\t" // Shuffle to repeat the third vector element repeated 4 times
              "mulps %%xmm6, %%xmm3;\n\t" // Multiply the third vector element by the third matrix column

              "addps %%xmm3, %%xmm2;\n\t" // Sum (second + third) columns
              "addps %%xmm2, %%xmm1;\n\t" // Sum ((second + third) + first) columns

              "movss (%1), %%xmm7;\n\t" // load the floating point representation of 65535/65536
              "shufps $0, %%xmm7, %%xmm7;\n\t" // move it into all of the four slots
              "minps %%xmm7, %%xmm1;\n\t" // clamp the vector to 1.0 max
              "xorps %%xmm6, %%xmm6;\n\t"     // get us cleared bit pattern, which is 0.0f
              "maxps %%xmm6, %%xmm1;\n\t" // clamp the vector to 0.0 min
              "movss (%2), %%xmm5;\n\t" // load the floating point scale factor
              "shufps $0, %%xmm5, %%xmm5;\n\t" // put it in all four slots
              "mulps %%xmm5, %%xmm1;\n\t" // multiply by the scale factor
              "cvtps2dq %%xmm1, %%xmm1;\n\t" // convert to integers
              "movdqa %%xmm1, (%3);\n\t" // store

              :
              : "r" (mat), "r" (clampMax), "r" (&floatScale), "r" (input)
              : "memory"
/* older versions of gcc don't know about these registers so only include them as constraints
   if gcc knows about them */
#ifdef __SSE2__
                , "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
#endif
              );
#elif defined(_MSC_VER) && defined(_M_IX86)
        __asm {
              mov      eax, mat
              mov      ecx, clampMax
              mov      edx, floatScaleAddr
              mov      ebx, input

              movaps   xmm1, [eax]
              movaps   xmm2, [eax + 16]
              movaps   xmm3, [eax + 32]
              movaps   xmm0, [ebx]

              movaps   xmm4, xmm0
              shufps   xmm4, xmm4, 0
              mulps    xmm1, xmm4
              movaps   xmm5, xmm0
              shufps   xmm5, xmm5, 0x55
              mulps    xmm2, xmm5
              movaps   xmm6, xmm0
              shufps   xmm6, xmm6, 0xAA
              mulps    xmm3, xmm6

              addps    xmm2, xmm3
              addps    xmm1, xmm2

              movss    xmm7, [ecx]
              shufps   xmm7, xmm7, 0
              minps    xmm1, xmm7
              xorps    xmm6, xmm6
              maxps    xmm1, xmm6
              movss    xmm5, [edx]
              shufps   xmm5, xmm5, 0
              mulps    xmm1, xmm5
              cvtps2dq xmm1, xmm1
              movdqa   [ebx], xmm1
        }
#elif defined(_MSC_VER) && defined(_M_AMD64)
        {
            __m128 xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm7;

            /* _mm_load_ps takes a float pointer; the matrix rows are 16
             * bytes apart, so mat[0..2] are the three vectors to load */
            xmm1 = _mm_load_ps(mat[0]);
            xmm2 = _mm_load_ps(mat[1]);
            xmm3 = _mm_load_ps(mat[2]);
            xmm0 = _mm_load_ps(input);

            xmm1 = _mm_mul_ps(xmm1, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(0,0,0,0)));
            xmm2 = _mm_mul_ps(xmm2, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(1,1,1,1)));
            xmm3 = _mm_mul_ps(xmm3, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(2,2,2,2)));

            xmm1 = _mm_add_ps(xmm1, _mm_add_ps(xmm2, xmm3));

            xmm7 = _mm_load_ss(clampMax);
            xmm7 = _mm_shuffle_ps(xmm7, xmm7, _MM_SHUFFLE(0,0,0,0));
            xmm1 = _mm_min_ps(xmm1, xmm7);
            /* zero without reading an uninitialized variable */
            xmm6 = _mm_setzero_ps();
            xmm1 = _mm_max_ps(xmm1, xmm6);
            xmm5 = _mm_load_ss(&floatScale);
            xmm5 = _mm_shuffle_ps(xmm5, xmm5, _MM_SHUFFLE(0,0,0,0));
            xmm1 = _mm_mul_ps(xmm1, xmm5);
            _mm_store_si128((__m128i*)input, _mm_cvtps_epi32(xmm1));
        }
#else
#error "Unknown platform"
#endif

        *dest++ = transform->output_table_r->data[output[0]];
        *dest++ = transform->output_table_g->data[output[1]];
        *dest++ = transform->output_table_b->data[output[2]];
        *dest++ = alpha;
    }
}
#endif
static void qcms_transform_data_rgb_out_lut_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
{
unsigned int i;
@ -1035,7 +1380,7 @@ qcms_bool compute_precache(struct curveType *trc, uint8_t *output)
return true;
}
#ifdef X86
// Determine if we can build with SSE2 (this was partly copied from jmorecfg.h in
// mozilla/jpeg)
// -------------------------------------------------------------------------
@ -1078,43 +1423,31 @@ static void cpuid(uint32_t fxn, uint32_t *a, uint32_t *b, uint32_t *c, uint32_t
}
#endif
// -------------------------Runtime SSEx Detection-----------------------------
// -------------------------Runtime SSE2 Detection-----------------------------
/* MMX is always supported per
* Gecko v1.9.1 minimum CPU requirements */
#define SSE1_EDX_MASK (1UL << 25)
#define SSE2_EDX_MASK (1UL << 26)
#define SSE3_ECX_MASK (1UL << 0)
static int sse_version_available(void)
static qcms_bool sse2_available(void)
{
#if defined(__x86_64__) || defined(_M_AMD64)
/* we know at build time that 64-bit CPUs always have SSE2
* this tells the compiler that non-SSE2 branches will never be
* taken (i.e. OK to optimize away the SSE1 and non-SIMD code) */
return 2;
return true;
#elif defined(HAS_CPUID)
static int sse_version = -1;
uint32_t a, b, c, d;
uint32_t function = 0x00000001;
static int has_sse2 = -1;
uint32_t a, b, c, d;
uint32_t function = 0x00000001;
if (sse_version == -1) {
sse_version = 0;
cpuid(function, &a, &b, &c, &d);
if (c & SSE3_ECX_MASK)
sse_version = 3;
else if (d & SSE2_EDX_MASK)
sse_version = 2;
else if (d & SSE1_EDX_MASK)
sse_version = 1;
}
if (has_sse2 == -1) {
has_sse2 = 0;
cpuid(function, &a, &b, &c, &d);
if (d & SSE2_EDX_MASK)
has_sse2 = 1;
else
has_sse2 = 0;
}
return sse_version;
#else
return 0;
return has_sse2;
#endif
return false;
}
#endif
void build_output_lut(struct curveType *trc,
uint16_t **output_gamma_lut, size_t *output_gamma_lut_length)
@ -1220,18 +1553,11 @@ qcms_transform* qcms_transform_create(
}
if (precache) {
#ifdef X86
if (sse_version_available() >= 2) {
if (sse2_available()) {
if (in_type == QCMS_DATA_RGB_8)
transform->transform_fn = qcms_transform_data_rgb_out_lut_sse2;
transform->transform_fn = qcms_transform_data_rgb_out_lut_sse;
else
transform->transform_fn = qcms_transform_data_rgba_out_lut_sse2;
} else
if (sse_version_available() >= 1) {
if (in_type == QCMS_DATA_RGB_8)
transform->transform_fn = qcms_transform_data_rgb_out_lut_sse1;
else
transform->transform_fn = qcms_transform_data_rgba_out_lut_sse1;
transform->transform_fn = qcms_transform_data_rgba_out_lut_sse;
} else
#endif