Backed out changeset cb4f078cc8cb (bug 512865)

Was causing crashes on the leak test box.
2009-09-24 23:35:53 -04:00 · 2009-09-24 23:35:53 -04:00 · 3f907e258a
--- a/gfx/qcms/Makefile.in
+++ b/gfx/qcms/Makefile.in
@ -15,17 +15,6 @@ EXPORTS      = qcms.h qcmstypes.h

 CSRCS = iccread.c transform.c

-ifeq (86,$(findstring 86,$(OS_TEST)))
-	CSRCS += transform-sse2.c transform-sse1.c
-ifdef GNU_CC
-	SSE1_FLAGS=-msse
-	SSE2_FLAGS=-msse2
-else
-	SSE1_FLAGS=
-	SSE2_FLAGS=
-endif
-endif
-
 FORCE_STATIC_LIB = 1
 # This library is used by other shared libs
 FORCE_USE_PIC = 1
@ -33,15 +22,3 @@ FORCE_USE_PIC = 1
 include $(topsrcdir)/config/rules.mk

 CFLAGS          += -DMOZ_QCMS
-
-
-# special rules for transform-sse*.c to get the right cflags. (taken from pixman/src/Makefile.in)
-transform-sse1.$(OBJ_SUFFIX): transform-sse1.c Makefile Makefile.in
-	$(REPORT_BUILD)
-	@$(MAKE_DEPS_AUTO_CC)
-	$(ELOG) $(CC) $(OUTOPTION)$@ -c $(COMPILE_CFLAGS) $(SSE1_FLAGS) $(_VPATH_SRCS)
-
-transform-sse2.$(OBJ_SUFFIX): transform-sse2.c Makefile Makefile.in
-	$(REPORT_BUILD)
-	@$(MAKE_DEPS_AUTO_CC)
-	$(ELOG) $(CC) $(OUTOPTION)$@ -c $(COMPILE_CFLAGS) $(SSE2_FLAGS) $(_VPATH_SRCS)
--- a/gfx/qcms/qcmsint.h
+++ b/gfx/qcms/qcmsint.h
@ -141,20 +141,3 @@ static inline s15Fixed16Number double_to_s15Fixed16Number(double v)

 void precache_release(struct precache_output *p);
 qcms_bool set_rgb_colorants(qcms_profile *profile, qcms_CIE_xyY white_point, qcms_CIE_xyYTRIPLE primaries);
-
-void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform,
-                                          unsigned char *src,
-                                          unsigned char *dest,
-                                          size_t length);
-void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform,
-                                          unsigned char *src,
-                                          unsigned char *dest,
-                                          size_t length);
-void qcms_transform_data_rgb_out_lut_sse1(qcms_transform *transform,
-                                          unsigned char *src,
-                                          unsigned char *dest,
-                                          size_t length);
-void qcms_transform_data_rgba_out_lut_sse1(qcms_transform *transform,
-                                          unsigned char *src,
-                                          unsigned char *dest,
-                                          size_t length);
--- a/gfx/qcms/transform-sse1.c
+++ b/gfx/qcms/transform-sse1.c
@ -1,253 +0,0 @@
-#include <xmmintrin.h>
-
-#include "qcmsint.h"
-
-/* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */
-#define FLOATSCALE  65536.0f
-#define CLAMPMAXVAL ( ((float) (65536 - 1)) / 65536.0f )
-static const ALIGN float floatScaleX4[4] =
-    { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE};
-static const ALIGN float clampMaxValueX4[4] =
-    { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL};
-
-void qcms_transform_data_rgb_out_lut_sse1(qcms_transform *transform,
-                                          unsigned char *src,
-                                          unsigned char *dest,
-                                          size_t length)
-{
-    unsigned int i;
-    float (*mat)[4] = transform->matrix;
-    char input_back[32];
-    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
-     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
-     * because they don't work on stack variables. gcc 4.4 does do the right thing
-     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
-    float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
-    /* share input and output locations to save having to keep the
-     * locations in separate registers */
-    uint32_t const * output = (uint32_t*)input;
-
-    /* deref *transform now to avoid it in loop */
-    const float *igtbl_r = transform->input_gamma_table_r;
-    const float *igtbl_g = transform->input_gamma_table_g;
-    const float *igtbl_b = transform->input_gamma_table_b;
-
-    /* deref *transform now to avoid it in loop */
-    const uint8_t *otdata_r = &transform->output_table_r->data[0];
-    const uint8_t *otdata_g = &transform->output_table_g->data[0];
-    const uint8_t *otdata_b = &transform->output_table_b->data[0];
-
-    /* input matrix values never change */
-    const __m128 mat0  = _mm_load_ps(mat[0]);
-    const __m128 mat1  = _mm_load_ps(mat[1]);
-    const __m128 mat2  = _mm_load_ps(mat[2]);
-
-    /* these values don't change, either */
-    const __m128 max   = _mm_load_ps(clampMaxValueX4);
-    const __m128 min   = _mm_setzero_ps();
-    const __m128 scale = _mm_load_ps(floatScaleX4);
-
-    /* working variables */
-    __m128 vec_r, vec_g, vec_b, result;
-
-    /* CYA */
-    if (!length)
-        return;
-
-    /* one pixel is handled outside of the loop */
-    length--;
-
-    /* setup for transforming 1st pixel */
-    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
-    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
-    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
-    src += 3;
-
-    /* transform all but final pixel */
-
-    for (i=0; i<length; i++)
-    {
-        /* position values from gamma tables */
-        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
-        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
-        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
-
-        /* gamma * matrix */
-        vec_r = _mm_mul_ps(vec_r, mat0);
-        vec_g = _mm_mul_ps(vec_g, mat1);
-        vec_b = _mm_mul_ps(vec_b, mat2);
-
-        /* crunch, crunch, crunch */
-        vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
-        vec_r  = _mm_max_ps(min, vec_r);
-        vec_r  = _mm_min_ps(max, vec_r);
-        result = _mm_mul_ps(vec_r, scale);
-
-        /* store calc'd output tables indices */
-        *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
-        result = _mm_movehl_ps(result, result);
-        *((__m64 *)&output[2]) = _mm_cvtps_pi32(result) ;
-
-        /* load for next loop while store completes */
-        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
-        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
-        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
-        src += 3;
-
-        /* use calc'd indices to output RGB values */
-        dest[0] = otdata_r[output[0]];
-        dest[1] = otdata_g[output[1]];
-        dest[2] = otdata_b[output[2]];
-        dest += 3;
-    }
-
-    /* handle final (maybe only) pixel */
-
-    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
-    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
-    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
-
-    vec_r = _mm_mul_ps(vec_r, mat0);
-    vec_g = _mm_mul_ps(vec_g, mat1);
-    vec_b = _mm_mul_ps(vec_b, mat2);
-
-    vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
-    vec_r  = _mm_max_ps(min, vec_r);
-    vec_r  = _mm_min_ps(max, vec_r);
-    result = _mm_mul_ps(vec_r, scale);
-
-    *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
-    result = _mm_movehl_ps(result, result);
-    *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);
-
-    dest[0] = otdata_r[output[0]];
-    dest[1] = otdata_g[output[1]];
-    dest[2] = otdata_b[output[2]];
-
-    _mm_empty();
-}
-
-void qcms_transform_data_rgba_out_lut_sse1(qcms_transform *transform,
-                                           unsigned char *src,
-                                           unsigned char *dest,
-                                           size_t length)
-{
-    unsigned int i;
-    float (*mat)[4] = transform->matrix;
-    char input_back[32];
-    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
-     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
-     * because they don't work on stack variables. gcc 4.4 does do the right thing
-     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
-    float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
-    /* share input and output locations to save having to keep the
-     * locations in separate registers */
-    uint32_t const * output = (uint32_t*)input;
-
-    /* deref *transform now to avoid it in loop */
-    const float *igtbl_r = transform->input_gamma_table_r;
-    const float *igtbl_g = transform->input_gamma_table_g;
-    const float *igtbl_b = transform->input_gamma_table_b;
-
-    /* deref *transform now to avoid it in loop */
-    const uint8_t *otdata_r = &transform->output_table_r->data[0];
-    const uint8_t *otdata_g = &transform->output_table_g->data[0];
-    const uint8_t *otdata_b = &transform->output_table_b->data[0];
-
-    /* input matrix values never change */
-    const __m128 mat0  = _mm_load_ps(mat[0]);
-    const __m128 mat1  = _mm_load_ps(mat[1]);
-    const __m128 mat2  = _mm_load_ps(mat[2]);
-
-    /* these values don't change, either */
-    const __m128 max   = _mm_load_ps(clampMaxValueX4);
-    const __m128 min   = _mm_setzero_ps();
-    const __m128 scale = _mm_load_ps(floatScaleX4);
-
-    /* working variables */
-    __m128 vec_r, vec_g, vec_b, result;
-    unsigned char alpha;
-
-    /* CYA */
-    if (!length)
-        return;
-
-    /* one pixel is handled outside of the loop */
-    length--;
-
-    /* setup for transforming 1st pixel */
-    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
-    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
-    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
-    alpha = src[3];
-    src += 4;
-
-    /* transform all but final pixel */
-
-    for (i=0; i<length; i++)
-    {
-        /* position values from gamma tables */
-        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
-        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
-        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
-
-        /* gamma * matrix */
-        vec_r = _mm_mul_ps(vec_r, mat0);
-        vec_g = _mm_mul_ps(vec_g, mat1);
-        vec_b = _mm_mul_ps(vec_b, mat2);
-
-        /* store alpha for this pixel; load alpha for next */
-        dest[3] = alpha;
-        alpha   = src[3];
-
-        /* crunch, crunch, crunch */
-        vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
-        vec_r  = _mm_max_ps(min, vec_r);
-        vec_r  = _mm_min_ps(max, vec_r);
-        result = _mm_mul_ps(vec_r, scale);
-
-        /* store calc'd output tables indices */
-        *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
-        result = _mm_movehl_ps(result, result);
-        *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);
-
-        /* load gamma values for next loop while store completes */
-        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
-        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
-        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
-        src += 4;
-
-        /* use calc'd indices to output RGB values */
-        dest[0] = otdata_r[output[0]];
-        dest[1] = otdata_g[output[1]];
-        dest[2] = otdata_b[output[2]];
-        dest += 4;
-    }
-
-    /* handle final (maybe only) pixel */
-
-    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
-    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
-    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
-
-    vec_r = _mm_mul_ps(vec_r, mat0);
-    vec_g = _mm_mul_ps(vec_g, mat1);
-    vec_b = _mm_mul_ps(vec_b, mat2);
-
-    dest[3] = alpha;
-
-    vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
-    vec_r  = _mm_max_ps(min, vec_r);
-    vec_r  = _mm_min_ps(max, vec_r);
-    result = _mm_mul_ps(vec_r, scale);
-
-    *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
-    result = _mm_movehl_ps(result, result);
-    *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);
-
-    dest[0] = otdata_r[output[0]];
-    dest[1] = otdata_g[output[1]];
-    dest[2] = otdata_b[output[2]];
-
-    _mm_empty();
-}
--- a/gfx/qcms/transform-sse2.c
+++ b/gfx/qcms/transform-sse2.c
@ -1,243 +0,0 @@
-#include <emmintrin.h>
-
-#include "qcmsint.h"
-
-/* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */
-#define FLOATSCALE  65536.0f
-#define CLAMPMAXVAL ( ((float) (65536 - 1)) / 65536.0f )
-static const ALIGN float floatScaleX4[4] =
-    { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE};
-static const ALIGN float clampMaxValueX4[4] =
-    { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL};
-
-void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform,
-                                          unsigned char *src,
-                                          unsigned char *dest,
-                                          size_t length)
-{
-    unsigned int i;
-    float (*mat)[4] = transform->matrix;
-    char input_back[32];
-    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
-     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
-     * because they don't work on stack variables. gcc 4.4 does do the right thing
-     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
-    float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
-    /* share input and output locations to save having to keep the
-     * locations in separate registers */
-    uint32_t const * output = (uint32_t*)input;
-
-    /* deref *transform now to avoid it in loop */
-    const float *igtbl_r = transform->input_gamma_table_r;
-    const float *igtbl_g = transform->input_gamma_table_g;
-    const float *igtbl_b = transform->input_gamma_table_b;
-
-    /* deref *transform now to avoid it in loop */
-    const uint8_t *otdata_r = &transform->output_table_r->data[0];
-    const uint8_t *otdata_g = &transform->output_table_g->data[0];
-    const uint8_t *otdata_b = &transform->output_table_b->data[0];
-
-    /* input matrix values never change */
-    const __m128 mat0  = _mm_load_ps(mat[0]);
-    const __m128 mat1  = _mm_load_ps(mat[1]);
-    const __m128 mat2  = _mm_load_ps(mat[2]);
-
-    /* these values don't change, either */
-    const __m128 max   = _mm_load_ps(clampMaxValueX4);
-    const __m128 min   = _mm_setzero_ps();
-    const __m128 scale = _mm_load_ps(floatScaleX4);
-
-    /* working variables */
-    __m128 vec_r, vec_g, vec_b, result;
-
-    /* CYA */
-    if (!length)
-        return;
-
-    /* one pixel is handled outside of the loop */
-    length--;
-
-    /* setup for transforming 1st pixel */
-    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
-    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
-    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
-    src += 3;
-
-    /* transform all but final pixel */
-
-    for (i=0; i<length; i++)
-    {
-        /* position values from gamma tables */
-        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
-        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
-        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
-
-        /* gamma * matrix */
-        vec_r = _mm_mul_ps(vec_r, mat0);
-        vec_g = _mm_mul_ps(vec_g, mat1);
-        vec_b = _mm_mul_ps(vec_b, mat2);
-
-        /* crunch, crunch, crunch */
-        vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
-        vec_r  = _mm_max_ps(min, vec_r);
-        vec_r  = _mm_min_ps(max, vec_r);
-        result = _mm_mul_ps(vec_r, scale);
-
-        /* store calc'd output tables indices */
-        _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
-
-        /* load for next loop while store completes */
-        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
-        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
-        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
-        src += 3;
-
-        /* use calc'd indices to output RGB values */
-        dest[0] = otdata_r[output[0]];
-        dest[1] = otdata_g[output[1]];
-        dest[2] = otdata_b[output[2]];
-        dest += 3;
-    }
-
-    /* handle final (maybe only) pixel */
-
-    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
-    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
-    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
-
-    vec_r = _mm_mul_ps(vec_r, mat0);
-    vec_g = _mm_mul_ps(vec_g, mat1);
-    vec_b = _mm_mul_ps(vec_b, mat2);
-
-    vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
-    vec_r  = _mm_max_ps(min, vec_r);
-    vec_r  = _mm_min_ps(max, vec_r);
-    result = _mm_mul_ps(vec_r, scale);
-
-    _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
-
-    dest[0] = otdata_r[output[0]];
-    dest[1] = otdata_g[output[1]];
-    dest[2] = otdata_b[output[2]];
-}
-
-void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform,
-                                           unsigned char *src,
-                                           unsigned char *dest,
-                                           size_t length)
-{
-    unsigned int i;
-    float (*mat)[4] = transform->matrix;
-    char input_back[32];
-    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
-     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
-     * because they don't work on stack variables. gcc 4.4 does do the right thing
-     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
-    float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
-    /* share input and output locations to save having to keep the
-     * locations in separate registers */
-    uint32_t const * output = (uint32_t*)input;
-
-    /* deref *transform now to avoid it in loop */
-    const float *igtbl_r = transform->input_gamma_table_r;
-    const float *igtbl_g = transform->input_gamma_table_g;
-    const float *igtbl_b = transform->input_gamma_table_b;
-
-    /* deref *transform now to avoid it in loop */
-    const uint8_t *otdata_r = &transform->output_table_r->data[0];
-    const uint8_t *otdata_g = &transform->output_table_g->data[0];
-    const uint8_t *otdata_b = &transform->output_table_b->data[0];
-
-    /* input matrix values never change */
-    const __m128 mat0  = _mm_load_ps(mat[0]);
-    const __m128 mat1  = _mm_load_ps(mat[1]);
-    const __m128 mat2  = _mm_load_ps(mat[2]);
-
-    /* these values don't change, either */
-    const __m128 max   = _mm_load_ps(clampMaxValueX4);
-    const __m128 min   = _mm_setzero_ps();
-    const __m128 scale = _mm_load_ps(floatScaleX4);
-
-    /* working variables */
-    __m128 vec_r, vec_g, vec_b, result;
-    unsigned char alpha;
-
-    /* CYA */
-    if (!length)
-        return;
-
-    /* one pixel is handled outside of the loop */
-    length--;
-
-    /* setup for transforming 1st pixel */
-    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
-    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
-    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
-    alpha = src[3];
-    src += 4;
-
-    /* transform all but final pixel */
-
-    for (i=0; i<length; i++)
-    {
-        /* position values from gamma tables */
-        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
-        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
-        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
-
-        /* gamma * matrix */
-        vec_r = _mm_mul_ps(vec_r, mat0);
-        vec_g = _mm_mul_ps(vec_g, mat1);
-        vec_b = _mm_mul_ps(vec_b, mat2);
-
-        /* store alpha for this pixel; load alpha for next */
-        dest[3] = alpha;
-        alpha   = src[3];
-
-        /* crunch, crunch, crunch */
-        vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
-        vec_r  = _mm_max_ps(min, vec_r);
-        vec_r  = _mm_min_ps(max, vec_r);
-        result = _mm_mul_ps(vec_r, scale);
-
-        /* store calc'd output tables indices */
-        _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
-
-        /* load gamma values for next loop while store completes */
-        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
-        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
-        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
-        src += 4;
-
-        /* use calc'd indices to output RGB values */
-        dest[0] = otdata_r[output[0]];
-        dest[1] = otdata_g[output[1]];
-        dest[2] = otdata_b[output[2]];
-        dest += 4;
-    }
-
-    /* handle final (maybe only) pixel */
-
-    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
-    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
-    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
-
-    vec_r = _mm_mul_ps(vec_r, mat0);
-    vec_g = _mm_mul_ps(vec_g, mat1);
-    vec_b = _mm_mul_ps(vec_b, mat2);
-
-    dest[3] = alpha;
-
-    vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
-    vec_r  = _mm_max_ps(min, vec_r);
-    vec_r  = _mm_min_ps(max, vec_r);
-    result = _mm_mul_ps(vec_r, scale);
-
-    _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
-
-    dest[0] = otdata_r[output[0]];
-    dest[1] = otdata_g[output[1]];
-    dest[2] = otdata_b[output[2]];
-}
-
-
--- a/gfx/qcms/transform.c
+++ b/gfx/qcms/transform.c
@ -25,10 +25,9 @@
 #include <assert.h>
 #include "qcmsint.h"

-/* for MSVC, GCC, and Intel compilers */
-#if defined(_M_IX86) || defined(__i386__) || defined(_M_AMD64) || defined(__x86_64__)
+#if defined(_M_IX86) || defined(__i386__) || defined(__x86_64__) || defined(_M_AMD64)
 #define X86
-#endif /* _M_IX86 || __i386__ || _M_AMD64 || __x86_64__ */
+#endif

 //XXX: could use a bettername
 typedef uint16_t uint16_fract_t;
@ -735,6 +734,352 @@ static void qcms_transform_data_graya_out_precache(qcms_transform *transform, un
 	}
 }

+static const ALIGN float floatScale = 65536.0f;
+static const ALIGN float * const floatScaleAddr = &floatScale; // Win32 ASM doesn't know how to take addressOf inline
+
+static const ALIGN float clampMaxValue = ((float) (65536 - 1)) / 65536.0f;
+
+#ifdef X86
+#if 0
+#include <emmintrin.h>
+/*
+ * Intrinsics version of the RGB matrix/LUT transform (currently compiled out
+ * by the enclosing #if 0).
+ *
+ * For each of the `length` 3-byte pixels read from src:
+ *   1. look up one float per channel in transform->input_gamma_table_{r,g,b};
+ *   2. broadcast each float, multiply by mat[0], mat[1], mat[2] respectively
+ *      and add the three products;
+ *   3. clamp the sum to [0, clampMaxValue] and multiply by floatScale;
+ *   4. convert to 32-bit integers and use them as indices into
+ *      transform->output_table_{r,g,b}->data, writing three bytes to dest.
+ */
+void qcms_transform_data_rgb_out_lut_sse_intrin(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
+{
+	/* size_t so the counter has the same type as length */
+	size_t i;
+	float (*mat)[4] = transform->matrix;
+	char input_back[32];
+	/* Ensure we have a buffer that's 16 byte aligned regardless of the original
+	 * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
+	 * because they don't work on stack variables. gcc 4.4 does do the right thing
+	 * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
+	float *input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
+	/* share input and output locations to save having to keep the
+	 * locations in separate registers */
+	uint32_t *output = (uint32_t*)input;
+
+	/* none of these change from pixel to pixel, so load them once */
+	const __m128 mat0 = _mm_load_ps(mat[0]);
+	const __m128 mat1 = _mm_load_ps(mat[1]);
+	const __m128 mat2 = _mm_load_ps(mat[2]);
+	const __m128 min = _mm_setzero_ps();
+	/* load the scalar itself (not the address of a pointer to it) ... */
+	__m128 max = _mm_load_ss(&clampMaxValue);
+	__m128 scale = _mm_load_ss(&floatScale);
+	/* ... and replicate it into all four lanes */
+	max = _mm_shuffle_ps(max, max, 0);
+	scale = _mm_shuffle_ps(scale, scale, 0);
+
+	for (i = 0; i < length; i++) {
+		unsigned char device_r = *src++;
+		unsigned char device_g = *src++;
+		unsigned char device_b = *src++;
+
+		/* each channel is linearised through its own gamma table */
+		__m128 vec_r = _mm_load_ss(&transform->input_gamma_table_r[device_r]);
+		__m128 vec_g = _mm_load_ss(&transform->input_gamma_table_g[device_g]);
+		__m128 vec_b = _mm_load_ss(&transform->input_gamma_table_b[device_b]);
+		__m128 result;
+
+		/* replicate lane 0 into all four lanes */
+		vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
+		vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
+		vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
+
+		vec_r = _mm_mul_ps(vec_r, mat0);
+		vec_g = _mm_mul_ps(vec_g, mat1);
+		vec_b = _mm_mul_ps(vec_b, mat2);
+
+		vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
+
+		/* clamp to [0, clampMaxValue] before scaling to table indices */
+		vec_r = _mm_max_ps(min, vec_r);
+		vec_r = _mm_min_ps(max, vec_r);
+		result = _mm_mul_ps(vec_r, scale);
+
+		/* aligned store into the scratch slot; output aliases the same memory */
+		_mm_store_si128((__m128i*)input, _mm_cvtps_epi32(result));
+
+		*dest++ = transform->output_table_r->data[output[0]];
+		*dest++ = transform->output_table_g->data[output[1]];
+		*dest++ = transform->output_table_b->data[output[2]];
+	}
+}
+#endif
+
+#if defined(_MSC_VER) && defined(_M_AMD64)
+#include <emmintrin.h>
+#endif
+
+/*
+ * Transform `length` packed 3-byte RGB pixels from src into dest.
+ *
+ * Per pixel: the three source bytes index transform->input_gamma_table_{r,g,b};
+ * the resulting floats are each multiplied by mat[0], mat[1], mat[2] of
+ * transform->matrix and the products are summed.  The sum is clamped to
+ * [0, clampMaxValue], multiplied by floatScale and converted to 32-bit
+ * integers, which then index transform->output_table_{r,g,b}->data to give the
+ * three destination bytes.
+ *
+ * The vector step is written three times: GCC inline assembly, 32-bit MSVC
+ * inline assembly, and intrinsics for 64-bit MSVC.  Any other compiler/target
+ * combination hits the #error below.
+ */
+static void qcms_transform_data_rgb_out_lut_sse(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
+{
+	/* size_t, not unsigned int: a narrower counter could never reach a
+	 * length above UINT_MAX and the loop would not terminate */
+	size_t i;
+	float (*mat)[4] = transform->matrix;
+        char input_back[32];
+	/* Ensure we have a buffer that's 16 byte aligned regardless of the original
+	 * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
+	 * because they don't work on stack variables. gcc 4.4 does do the right thing
+	 * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
+        float *input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
+        /* share input and output locations to save having to keep the
+         * locations in separate registers */
+        uint32_t* output = (uint32_t*)input;
+	for (i = 0; i < length; i++) {
+		const float *clampMax = &clampMaxValue;
+
+		unsigned char device_r = *src++;
+		unsigned char device_g = *src++;
+		unsigned char device_b = *src++;
+
+		/* only lanes 0-2 are filled in; lane 3 is loaded by the vector
+		 * code but none of the shuffles below select it */
+		input[0] = transform->input_gamma_table_r[device_r];
+		input[1] = transform->input_gamma_table_g[device_g];
+		input[2] = transform->input_gamma_table_b[device_b];
+
+#ifdef __GNUC__
+		__asm(
+                      "movaps (%0), %%xmm1;\n\t"          // Move the first matrix column to xmm1
+                      "movaps 16(%0), %%xmm2;\n\t"        // Move the second matrix column to xmm2
+                      "movaps 32(%0), %%xmm3;\n\t"        // Move the third matrix column to xmm3
+                      "movaps (%3), %%xmm0;\n\t"        // Move the vector to xmm0
+
+                                                          // Note - We have to copy and then shuffle because of the weird
+                                                          // semantics of shufps
+                                                          //
+                      "movaps %%xmm0, %%xmm4;\n\t"        // Copy the vector to xmm4
+                      "shufps $0, %%xmm4, %%xmm4;\n\t"    // Shuffle to repeat the first vector element 4 times
+                      "mulps %%xmm4, %%xmm1;\n\t"         // Multiply the first vector element by the first matrix column
+                      "movaps %%xmm0, %%xmm5; \n\t"       // Copy the vector to xmm5
+                      "shufps $0x55, %%xmm5, %%xmm5;\n\t" // Shuffle to repeat the second vector element 4 times
+                      "mulps %%xmm5, %%xmm2;\n\t"         // Multiply the second vector element by the second matrix column
+                      "movaps %%xmm0, %%xmm6;\n\t"        // Copy the vector to xmm6
+                      "shufps $0xAA, %%xmm6, %%xmm6;\n\t" // Shuffle to repeat the third vector element 4 times
+                      "mulps %%xmm6, %%xmm3;\n\t"         // Multiply the third vector element by the third matrix column
+
+                      "addps %%xmm3, %%xmm2;\n\t"         // Sum (second + third) columns
+                      "addps %%xmm2, %%xmm1;\n\t"         // Sum ((second + third) + first) columns
+
+                      "movss (%1), %%xmm7;\n\t"        // load the floating point representation of 65535/65536
+                      "shufps $0, %%xmm7, %%xmm7;\n\t" // move it into all of the four slots
+                      "minps %%xmm7, %%xmm1;\n\t"      // clamp the vector to the 65535/65536 max
+                      "xorps %%xmm6, %%xmm6;\n\t"       // get us a cleared bit pattern, which is 0.0f
+                      "maxps %%xmm6, %%xmm1;\n\t"      // clamp the vector to 0.0 min
+                      "movss (%2), %%xmm5;\n\t"        // load the floating point scale factor
+                      "shufps $0, %%xmm5, %%xmm5;\n\t" // put it in all four slots
+                      "mulps %%xmm5, %%xmm1;\n\t"      // multiply by the scale factor
+                      "cvtps2dq %%xmm1, %%xmm1;\n\t"   // convert to integers
+                      "movdqa %%xmm1, (%3);\n\t"       // store
+
+                      : 
+                      : "r" (mat), "r" (clampMax), "r" (&floatScale), "r" (input)
+                      : "memory"
+/* older versions of gcc don't know about these registers so only include them as constraints
+   if gcc knows about them */
+#ifdef __SSE2__
+                        , "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
+#endif
+                      );
+#elif defined(_MSC_VER) && defined(_M_IX86)
+                /* same instruction sequence as the GCC block, in MSVC syntax;
+                 * floatScale is reached through floatScaleAddr */
+                __asm {
+                      mov      eax, mat
+                      mov      ecx, clampMax
+                      mov      edx, floatScaleAddr
+		      mov      ebx, input
+
+                      movaps   xmm1, [eax]
+                      movaps   xmm2, [eax + 16]
+                      movaps   xmm3, [eax + 32]
+                      movaps   xmm0, [ebx]
+
+                      movaps   xmm4, xmm0
+                      shufps   xmm4, xmm4, 0
+                      mulps    xmm1, xmm4
+                      movaps   xmm5, xmm0
+                      shufps   xmm5, xmm5, 0x55
+                      mulps    xmm2, xmm5
+                      movaps   xmm6, xmm0
+                      shufps   xmm6, xmm6, 0xAA
+                      mulps    xmm3, xmm6
+
+                      addps    xmm2, xmm3
+                      addps    xmm1, xmm2
+
+                      movss    xmm7, [ecx]
+                      shufps   xmm7, xmm7, 0
+                      minps    xmm1, xmm7
+                      xorps    xmm6, xmm6
+                      maxps    xmm1, xmm6
+                      movss    xmm5, [edx]
+                      shufps   xmm5, xmm5, 0
+                      mulps    xmm1, xmm5
+                      cvtps2dq xmm1, xmm1
+                      movdqa   [ebx], xmm1
+                }
+#elif defined(_MSC_VER) && defined(_M_AMD64)
+                /* intrinsics equivalent of the assembly above; the variables
+                 * are named after the registers the assembly uses */
+                {
+                        __m128 xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm7;
+
+                        /* _mm_load_ps takes a float pointer, so pass the rows
+                         * directly instead of casting to __m128* */
+                        xmm1 = _mm_load_ps(mat[0]);
+                        xmm2 = _mm_load_ps(mat[1]);
+                        xmm3 = _mm_load_ps(mat[2]);
+                        xmm0 = _mm_load_ps(input);
+
+                        xmm1 = _mm_mul_ps(xmm1, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(0,0,0,0)));
+                        xmm2 = _mm_mul_ps(xmm2, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(1,1,1,1)));
+                        xmm3 = _mm_mul_ps(xmm3, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(2,2,2,2)));
+
+                        xmm1 = _mm_add_ps(xmm1, _mm_add_ps(xmm2, xmm3));
+
+                        xmm7 = _mm_load_ss(clampMax);
+                        xmm7 = _mm_shuffle_ps(xmm7, xmm7, _MM_SHUFFLE(0,0,0,0));
+                        xmm1 = _mm_min_ps(xmm1, xmm7);
+                        /* zero without reading xmm6 while it is still uninitialised */
+                        xmm6 = _mm_setzero_ps();
+                        xmm1 = _mm_max_ps(xmm1, xmm6);
+                        xmm5 = _mm_load_ss(&floatScale);
+                        xmm5 = _mm_shuffle_ps(xmm5, xmm5, _MM_SHUFFLE(0,0,0,0));
+                        xmm1 = _mm_mul_ps(xmm1, xmm5);
+                        _mm_store_si128((__m128i*)input, _mm_cvtps_epi32(xmm1));
+                }
+#else
+#error "Unknown platform"
+#endif
+
+		/* output aliases input, so these are the integers just stored */
+		*dest++ = transform->output_table_r->data[output[0]];
+		*dest++ = transform->output_table_g->data[output[1]];
+		*dest++ = transform->output_table_b->data[output[2]];
+	}
+}
+
+/*
+ * Convert `length` RGBA pixels (8 bits per channel) from src to dest.
+ *
+ * For each pixel the R, G and B bytes are looked up in the per-channel input
+ * gamma tables, the resulting 3-vector is multiplied by the first three
+ * columns of transform->matrix using SSE, the result is clamped to
+ * [0, *clampMax], scaled by floatScale, converted to 32-bit integers and used
+ * to index the output tables.  The alpha byte is copied through unchanged.
+ *
+ * src and dest must each hold 4 * length bytes.  transform->matrix is read
+ * with aligned loads (movaps / _mm_load_ps), so it must be 16-byte aligned.
+ */
+static void qcms_transform_data_rgba_out_lut_sse(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
+{
+	/* size_t so that the counter cannot wrap before reaching length */
+	size_t i;
+	float (*mat)[4] = transform->matrix;
+        char input_back[32];
+	/* align input on 16 byte boundary */
+        float *input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
+        /* share input and output locations to save having to keep the
+         * locations in separate registers */
+        uint32_t* output = (uint32_t*)input;
+	for (i = 0; i < length; i++) {
+		const float *clampMax = &clampMaxValue;
+
+		unsigned char device_r = *src++;
+		unsigned char device_g = *src++;
+		unsigned char device_b = *src++;
+		unsigned char alpha = *src++;
+
+		/* only lanes 0..2 are consumed by the shuffles below; lane 3 is ignored */
+		input[0] = transform->input_gamma_table_r[device_r];
+		input[1] = transform->input_gamma_table_g[device_g];
+		input[2] = transform->input_gamma_table_b[device_b];
+
+#ifdef __GNUC__
+		__asm(
+                      "movaps (%0), %%xmm1;\n\t"          // Move the first matrix column to xmm1
+                      "movaps 16(%0), %%xmm2;\n\t"        // Move the second matrix column to xmm2
+                      "movaps 32(%0), %%xmm3;\n\t"        // move the third matrix column to xmm3
+                      "movaps (%3), %%xmm0;\n\t"        // Move the vector to xmm0
+
+                                                          // Note - We have to copy and then shuffle because of the weird
+                                                          // semantics of shufps
+                                                          //
+                      "movaps %%xmm0, %%xmm4;\n\t"        // Copy the vector to xmm4
+                      "shufps $0, %%xmm4, %%xmm4;\n\t"    // Shuffle to repeat the first vector element repeated 4 times
+                      "mulps %%xmm4, %%xmm1;\n\t"         // Multiply the first vector element by the first matrix column
+                      "movaps %%xmm0, %%xmm5; \n\t"       // Copy the vector to xmm5
+                      "shufps $0x55, %%xmm5, %%xmm5;\n\t" // Shuffle to repeat the second vector element repeated 4 times
+                      "mulps %%xmm5, %%xmm2;\n\t"         // Multiply the second vector element by the second matrix column
+                      "movaps %%xmm0, %%xmm6;\n\t"        // Copy the vector to xmm6
+                      "shufps $0xAA, %%xmm6, %%xmm6;\n\t" // Shuffle to repeat the third vector element repeated 4 times
+                      "mulps %%xmm6, %%xmm3;\n\t"         // Multiply the third vector element by the third matrix column
+
+                      "addps %%xmm3, %%xmm2;\n\t"         // Sum (second + third) columns
+                      "addps %%xmm2, %%xmm1;\n\t"         // Sum ((second + third) + first) columns
+
+                      "movss (%1), %%xmm7;\n\t"        // load the floating point representation of 65535/65536 
+                      "shufps $0, %%xmm7, %%xmm7;\n\t" // move it into all of the four slots
+                      "minps %%xmm7, %%xmm1;\n\t"      // clamp the vector to 1.0 max
+                      "xorps %%xmm6, %%xmm6;\n\t"       // get us a cleared bit pattern, which is 0.0f
+                      "maxps %%xmm6, %%xmm1;\n\t"      // clamp the vector to 0.0 min
+                      "movss (%2), %%xmm5;\n\t"        // load the floating point scale factor
+                      "shufps $0, %%xmm5, %%xmm5;\n\t" // put it in all four slots
+                      "mulps %%xmm5, %%xmm1;\n\t"      // multiply by the scale factor
+                      "cvtps2dq %%xmm1, %%xmm1;\n\t"   // convert to integers
+                      "movdqa %%xmm1, (%3);\n\t"       // store
+
+                      : 
+                      : "r" (mat), "r" (clampMax), "r" (&floatScale), "r" (input)
+                      : "memory"
+/* older versions of gcc don't know about these registers so only include them as constraints
+   if gcc knows about them */
+#ifdef __SSE2__
+                        , "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
+#endif
+                      );
+#elif defined(_MSC_VER) && defined(_M_IX86)
+                /* same instruction sequence as the GCC branch, in MSVC inline-asm syntax */
+                __asm {
+                      mov      eax, mat
+                      mov      ecx, clampMax
+                      mov      edx, floatScaleAddr
+		      mov      ebx, input
+
+                      movaps   xmm1, [eax]
+                      movaps   xmm2, [eax + 16]
+                      movaps   xmm3, [eax + 32]
+                      movaps   xmm0, [ebx]
+
+                      movaps   xmm4, xmm0
+                      shufps   xmm4, xmm4, 0
+                      mulps    xmm1, xmm4
+                      movaps   xmm5, xmm0
+                      shufps   xmm5, xmm5, 0x55
+                      mulps    xmm2, xmm5
+                      movaps   xmm6, xmm0
+                      shufps   xmm6, xmm6, 0xAA
+                      mulps    xmm3, xmm6
+
+                      addps    xmm2, xmm3
+                      addps    xmm1, xmm2
+
+                      movss    xmm7, [ecx]
+                      shufps   xmm7, xmm7, 0
+                      minps    xmm1, xmm7
+                      xorps    xmm6, xmm6
+                      maxps    xmm1, xmm6
+                      movss    xmm5, [edx]
+                      shufps   xmm5, xmm5, 0
+                      mulps    xmm1, xmm5
+                      cvtps2dq xmm1, xmm1
+                      movdqa   [ebx], xmm1
+                }
+#elif defined(_MSC_VER) && defined(_M_AMD64)
+                /* intrinsics version of the same sequence */
+                {
+                        __m128 xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm7;
+
+                        /* _mm_load_ps takes a float pointer; mat[n] is the n-th 4-float column */
+                        xmm1 = _mm_load_ps(mat[0]);
+                        xmm2 = _mm_load_ps(mat[1]);
+                        xmm3 = _mm_load_ps(mat[2]);
+                        xmm0 = _mm_load_ps(input);
+
+                        xmm1 = _mm_mul_ps(xmm1, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(0,0,0,0)));
+                        xmm2 = _mm_mul_ps(xmm2, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(1,1,1,1)));
+                        xmm3 = _mm_mul_ps(xmm3, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(2,2,2,2)));
+
+                        xmm1 = _mm_add_ps(xmm1, _mm_add_ps(xmm2, xmm3));
+
+                        xmm7 = _mm_load_ss(clampMax);
+                        xmm7 = _mm_shuffle_ps(xmm7, xmm7, _MM_SHUFFLE(0,0,0,0));
+                        xmm1 = _mm_min_ps(xmm1, xmm7);
+                        /* zero without reading an uninitialized variable */
+                        xmm6 = _mm_setzero_ps();
+                        xmm1 = _mm_max_ps(xmm1, xmm6);
+                        xmm5 = _mm_load_ss(&floatScale);
+                        xmm5 = _mm_shuffle_ps(xmm5, xmm5, _MM_SHUFFLE(0,0,0,0));
+                        xmm1 = _mm_mul_ps(xmm1, xmm5);
+                        _mm_store_si128((__m128i*)input, _mm_cvtps_epi32(xmm1));
+                }
+#else
+#error "Unknown platform"
+#endif
+
+		/* output aliases input: the three integer lanes are the table indices */
+		*dest++ = transform->output_table_r->data[output[0]];
+		*dest++ = transform->output_table_g->data[output[1]];
+		*dest++ = transform->output_table_b->data[output[2]];
+		*dest++ = alpha;
+	}
+}
+#endif
+
 static void qcms_transform_data_rgb_out_lut_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
 {
 	unsigned int i;
@ -1035,7 +1380,7 @@ qcms_bool compute_precache(struct curveType *trc, uint8_t *output)
 	return true;
 }

-#ifdef X86
+
 // Determine if we can build with SSE2 (this was partly copied from jmorecfg.h in
 // mozilla/jpeg)
 // -------------------------------------------------------------------------
@ -1078,43 +1423,31 @@ static void cpuid(uint32_t fxn, uint32_t *a, uint32_t *b, uint32_t *c, uint32_t
 }
 #endif

-// -------------------------Runtime SSEx Detection-----------------------------
+// -------------------------Runtime SSE2 Detection-----------------------------

-/* MMX is always supported per
- *  Gecko v1.9.1 minimum CPU requirements */
-#define SSE1_EDX_MASK (1UL << 25)
 #define SSE2_EDX_MASK (1UL << 26)
-#define SSE3_ECX_MASK (1UL <<  0)
-
-static int sse_version_available(void)
+/*
+ * Report whether the SSE2 transform functions may be used.
+ *
+ * On x86-64 builds (__x86_64__ or _M_AMD64) this returns true without any
+ * runtime check.  When HAS_CPUID is defined, cpuid() function 1 is queried
+ * on the first call and the SSE2_EDX_MASK bit of its fourth result decides
+ * the answer, which is cached in a function-local static.  In any other
+ * build configuration the function returns false.
+ */
+static qcms_bool sse2_available(void)
 {
 #if defined(__x86_64__) || defined(_M_AMD64)
-	/* we know at build time that 64-bit CPUs always have SSE2
-	 * this tells the compiler that non-SSE2 branches will never be
-	 * taken (i.e. OK to optimze away the SSE1 and non-SIMD code */
-	return 2;
+       return true;
 #elif defined(HAS_CPUID)
-	static int sse_version = -1;
-	uint32_t a, b, c, d;
-	uint32_t function = 0x00000001;
+       static int has_sse2 = -1;
+       uint32_t a, b, c, d;
+       uint32_t function = 0x00000001;

-	if (sse_version == -1) {
-		sse_version = 0;
-		cpuid(function, &a, &b, &c, &d);
-		if (c & SSE3_ECX_MASK)
-			sse_version = 3;
-		else if (d & SSE2_EDX_MASK)
-			sse_version = 2;
-		else if (d & SSE1_EDX_MASK)
-			sse_version = 1;
-	}
+       /* -1 means "not probed yet": run cpuid() once and cache the result */
+       if (has_sse2 == -1) {
+              has_sse2 = 0;
+	      cpuid(function, &a, &b, &c, &d);
+              if (d & SSE2_EDX_MASK)
+                     has_sse2 = 1;
+              else
+                     has_sse2 = 0;
+       }

-	return sse_version;
-#else
-	return 0;
+       return has_sse2;
 #endif
+       /* fallback when neither branch above is compiled in; unreachable otherwise */
+       return false;
 }
-#endif

 void build_output_lut(struct curveType *trc,
 		uint16_t **output_gamma_lut, size_t *output_gamma_lut_length)
@ -1220,18 +1553,11 @@ qcms_transform* qcms_transform_create(
            }
 	    if (precache) {
 #ifdef X86
-		    if (sse_version_available() >= 2) {
+		    if (sse2_available()) {
 			    if (in_type == QCMS_DATA_RGB_8)
-				    transform->transform_fn = qcms_transform_data_rgb_out_lut_sse2;
+				    transform->transform_fn = qcms_transform_data_rgb_out_lut_sse;
 			    else
-				    transform->transform_fn = qcms_transform_data_rgba_out_lut_sse2;
-
-		    } else
-		    if (sse_version_available() >= 1) {
-			    if (in_type == QCMS_DATA_RGB_8)
-				    transform->transform_fn = qcms_transform_data_rgb_out_lut_sse1;
-			    else
-				    transform->transform_fn = qcms_transform_data_rgba_out_lut_sse1;
+				    transform->transform_fn = qcms_transform_data_rgba_out_lut_sse;

 		    } else
 #endif