diff --git a/modules/lcms/include/lcms.h b/modules/lcms/include/lcms.h
index 0f4d4028c63..f9fb0c636bc 100644
--- a/modules/lcms/include/lcms.h
+++ b/modules/lcms/include/lcms.h
@@ -1571,6 +1571,7 @@ void      cdecl MAT3eval(LPVEC3 r, LPMAT3 a, LPVEC3 v);
 void      cdecl MAT3evalF(LPFVEC3 r, LPFMAT3 a, LPFVEC3 v);
 void      cdecl MAT3toFix(LPWMAT3 r, LPMAT3 v);
 void      cdecl MAT3toFloat(LPFMAT3 r, LPMAT3 v);
+void      cdecl MAT3toFloatTranspose(LPFMAT3 r, LPMAT3 v);
 void      cdecl MAT3fromFix(LPMAT3 r, LPWMAT3 v);
 void      cdecl MAT3evalW(LPWVEC3 r, LPWMAT3 a, LPWVEC3 v);
 LCMSBOOL  cdecl MAT3isIdentity(LPWMAT3 a, double Tolerance);
@@ -1862,8 +1863,16 @@ typedef struct {
                union {
                   WMAT3 W;
                   FMAT3A FA; // This is not a matrix proper - use FA.F to access the matrix pointer
+                             // Moreover, we store the transpose of the matrix instead, so the first
+                             // vector corresponds to the first column instead of the first row.
                } Matrix;
 
+               FLOAT clampMax; // SSE2 doesn't have an efficient way to clamp using integers, so we have
+                               // to clamp in the float domain. Unfortunately, since we eventually want
+                               // our integer values clamped to 2^16 - 1, we need to clamp with a very
+                               // precise value in the float domain. We let the CPU take care of by calculating
+                               // it at transform creation time rather than trusting the compiler.
+
                L16PARAMS p16;       // Primary curve
                LPWORD L[3];
                LPLCMSPRECACHE L_Precache;
@@ -1880,7 +1889,6 @@ LPMATSHAPER cdecl cmsAllocMatShaper2(LPMAT3 matrix, LPGAMMATABLE In[], LPLCMSPRE
 
 void        cdecl cmsFreeMatShaper(LPMATSHAPER MatShaper);
 void        cdecl cmsEvalMatShaper(LPMATSHAPER MatShaper, WORD In[], WORD Out[]);
-void        cdecl cmsEvalMatShaperFloat(LPMATSHAPER MatShaper, BYTE In[], BYTE Out[]);
 
 LCMSBOOL    cdecl cmsReadICCMatrixRGB2XYZ(LPMAT3 r, cmsHPROFILE hProfile);
 
diff --git a/modules/lcms/src/cmsmatsh.c b/modules/lcms/src/cmsmatsh.c
index 28f4289df46..990a996cda6 100644
--- a/modules/lcms/src/cmsmatsh.c
+++ b/modules/lcms/src/cmsmatsh.c
@@ -103,9 +103,15 @@ LPMATSHAPER cmsAllocMatShaper2(LPMAT3 Matrix, LPGAMMATABLE In[], LPLCMSPRECACHE
        // Fill matrix part
        if (Behaviour & MATSHAPER_FLOATMAT) {
               FMAT3ASetup(&NewMatShaper->Matrix.FA);
-              MAT3toFloat(NewMatShaper -> Matrix.FA.F, Matrix);
+              MAT3toFloatTranspose(NewMatShaper -> Matrix.FA.F, Matrix);
               if (!FMAT3isIdentity(NewMatShaper -> Matrix.FA.F, 0.00001f))
                             NewMatShaper -> dwFlags |= MATSHAPER_HASMATRIX;
+
+              // This needs to be calculated by the CPU or a very precise
+              // compiler. If it's too big (like 1.0), values are clamped
+              // to 65536 instead 65535, and we either have an overflow of
+              // the precache bounds or scary downcasting.
+              NewMatShaper -> clampMax = ((FLOAT) (65536 - 1)) / 65536.0f;
        }
        else {
               MAT3toFix(&NewMatShaper -> Matrix.W, Matrix);
@@ -397,76 +403,6 @@ void OutputBehaviour(LPMATSHAPER MatShaper, WORD In[], WORD Out[])
 
 }
 
-void cmsEvalMatShaperFloat(LPMATSHAPER MatShaper, BYTE In[], BYTE Out[])
-{
-       WORD tmp[3];
-       FVEC3 OutVect;
-       LPFVEC3 FloatVals = &MatShaper -> Matrix.FA.F->v[3]; // Access our secret aligned temp buffer
-
-       if (MatShaper -> dwFlags & MATSHAPER_HASINPSHAPER)
-       {
-              if (MatShaper->L2_Precache != NULL) 
-              {
-              FloatVals->n[VX] = MatShaper->L2_Precache->Impl.LI16F_FORWARD.Cache[0][In[0]];
-              FloatVals->n[VY] = MatShaper->L2_Precache->Impl.LI16F_FORWARD.Cache[1][In[1]];
-              FloatVals->n[VZ] = MatShaper->L2_Precache->Impl.LI16F_FORWARD.Cache[2][In[2]];
-              }
-              else
-              {
-              FloatVals->n[VX] = ToFloatDomain(cmsLinearInterpLUT16(RGB_8_TO_16(In[0]), MatShaper -> L2[0], &MatShaper -> p2_16));
-              FloatVals->n[VY] = ToFloatDomain(cmsLinearInterpLUT16(RGB_8_TO_16(In[1]), MatShaper -> L2[1], &MatShaper -> p2_16));
-              FloatVals->n[VZ] = ToFloatDomain(cmsLinearInterpLUT16(RGB_8_TO_16(In[2]), MatShaper -> L2[2], &MatShaper -> p2_16));
-              }
-       }
-       else
-       {
-       FloatVals->n[VX] = ToFloatDomain(In[0]);
-       FloatVals->n[VY] = ToFloatDomain(In[1]);
-       FloatVals->n[VZ] = ToFloatDomain(In[2]);
-       }
-
-
-       if (MatShaper -> dwFlags & MATSHAPER_HASMATRIX)
-       {       
-                         
-       MAT3evalF(&OutVect, MatShaper -> Matrix.FA.F, FloatVals);
-       }
-       else 
-       {
-       OutVect.n[VX] = FloatVals->n[VX];
-       OutVect.n[VY] = FloatVals->n[VY];
-       OutVect.n[VZ] = FloatVals->n[VZ];
-       }
-
-             
-       tmp[0] = _cmsClampWord(FromFloatDomain(OutVect.n[VX]));
-       tmp[1] = _cmsClampWord(FromFloatDomain(OutVect.n[VY]));
-       tmp[2] = _cmsClampWord(FromFloatDomain(OutVect.n[VZ]));
-
-       
-           
-       if (MatShaper -> dwFlags & MATSHAPER_HASSHAPER)
-       {
-              if (MatShaper->L_Precache != NULL) 
-              {
-              Out[0] = MatShaper->L_Precache->Impl.LI168_REVERSE.Cache[0][tmp[0]];
-              Out[1] = MatShaper->L_Precache->Impl.LI168_REVERSE.Cache[1][tmp[1]];
-              Out[2] = MatShaper->L_Precache->Impl.LI168_REVERSE.Cache[2][tmp[2]];
-              }
-              else 
-              {
-              Out[0] = RGB_16_TO_8(cmsLinearInterpLUT16(tmp[0], MatShaper -> L[0], &MatShaper -> p16));
-              Out[1] = RGB_16_TO_8(cmsLinearInterpLUT16(tmp[1], MatShaper -> L[1], &MatShaper -> p16));
-              Out[2] = RGB_16_TO_8(cmsLinearInterpLUT16(tmp[2], MatShaper -> L[2], &MatShaper -> p16));
-              }
-       }
-       else
-       {
-       Out[0] = RGB_16_TO_8(tmp[0]);
-       Out[1] = RGB_16_TO_8(tmp[1]);
-       Out[2] = RGB_16_TO_8(tmp[2]);
-       }
-}
 
 // Master on evaluating shapers, 3 different behaviours
 
diff --git a/modules/lcms/src/cmsmtrx.c b/modules/lcms/src/cmsmtrx.c
index b30233c0086..edaccaa3153 100644
--- a/modules/lcms/src/cmsmtrx.c
+++ b/modules/lcms/src/cmsmtrx.c
@@ -51,6 +51,7 @@ double    cdecl MAT3det(LPMAT3 m);
 void      cdecl MAT3eval(LPVEC3 r, LPMAT3 a, LPVEC3 v);
 void      cdecl MAT3toFix(LPWMAT3 r, LPMAT3 v);
 void      cdecl MAT3toFloat(LPFMAT3 r, LPMAT3 v);
+void      cdecl MAT3toFloatTranspose(LPFMAT3 r, LPMAT3 v);
 void      cdecl MAT3evalW(LPWVEC3 r, LPWMAT3 a, LPWVEC3 v);
 void      cdecl MAT3perK(LPMAT3 r, LPMAT3 v, double d);
 void      cdecl MAT3scaleAndCut(LPWMAT3 r, LPMAT3 v, double d);
@@ -861,6 +862,20 @@ void MAT3toFloat(LPFMAT3 r, LPMAT3 v)
        VEC3toFloat(&r -> v[2], &v -> v[2]);
 }
 
+void MAT3toFloatTranspose(LPFMAT3 r, LPMAT3 v)
+{
+       unsigned i, j;
+
+       /* for each row of the source. */
+       for (i = 0; i < 3; ++i) 
+
+           /* For element in the row. */
+           for (j = 0; j < 3; ++j) 
+               
+               /* Col=>Row, Row=>Col. */
+               r -> v[j].n[i] = DOUBLE_TO_FLOAT(v -> v[i].n[j]);
+}
+
 void MAT3fromFix(LPMAT3 r, LPWMAT3 v)
 {
        VEC3fromFix(&r -> v[0], &v -> v[0]);
diff --git a/modules/lcms/src/cmsxform.c b/modules/lcms/src/cmsxform.c
index 6c7318dd745..cc951a77d92 100644
--- a/modules/lcms/src/cmsxform.c
+++ b/modules/lcms/src/cmsxform.c
@@ -58,7 +58,61 @@ void         LCMSEXPORT cmsSetAlarmCodes(int r, int g, int b);
 LCMSBOOL     LCMSEXPORT cmsIsIntentSupported(cmsHPROFILE hProfile,
                                                 int Intent, int UsedDirection);
 
+// Determine if we can build with SSE2 (this was partly copied from jmorecfg.h in
+// mozilla/jpeg)
 // -------------------------------------------------------------------------
+#if defined(_M_IX86) && !defined(__GNUC__)
+
+/* Get us a CPUID function. Avoid clobbering EBX because sometimes it's the PIC
+   register - I'm not sure if that ever happens on windows, but cpuid isn't
+   on the critical path so we just preserve the register to be safe and to be
+   consistent with the non-windows version. */
+LCMS_INLINE void LCMSCPUID(DWORD fxn, LPDWORD a, LPDWORD b, LPDWORD c, LPDWORD d) {
+       DWORD a_, b_, c_, d_;
+  
+       ASM {
+              xchg   ebx, esi
+              mov    eax, fxn
+              cpuid
+              mov    a_, eax
+              mov    b_, ebx
+              mov    c_, ecx
+              mov    d_, edx
+              xchg   ebx, esi
+       }
+       *a = a_;
+       *b = b_;
+       *c = c_;
+       *d = d_;
+}
+
+#define HAVE_MMX_INTEL_MNEMONICS 
+
+/* SSE2 code appears broken for some cpus (bug 247437) */
+#define HAVE_SSE2_INTEL_MNEMONICS
+#define HAVE_SSE2_INTRINSICS
+#endif
+
+#if defined(__GNUC__) && defined(__i386__)
+
+/* Get us a CPUID function. We can't use ebx because it's the PIC register on
+   some platforms, so we use ESI instead and save ebx to avoid clobbering it. */
+LCMS_INLINE void LCMSCPUID(DWORD fxn, LPDWORD a, LPDWORD b, LPDWORD c, LPDWORD d) {
+	
+	   DWORD a_, b_, c_, d_;
+       __asm__ __volatile__ ("xchgl %%ebx, %%esi; cpuid; xchgl %%ebx, %%esi;" 
+                             : "=a" (a_), "=S" (b_), "=c" (c_), "=d" (d_) : "a" (fxn));
+	   *a = a_;
+	   *b = b_;
+	   *c = c_;
+	   *d = d_;
+ }
+
+#define HAVE_SSE2_INTRINSICS
+/* XXX - the below wasn't in jpeg/jmorecfg.h - why? */
+#define HAVE_SSE2_INTEL_MNEMONICS
+#endif /* ! GNUC && i386 */
+
 
 
 // Alarm RGB codes
@@ -89,6 +143,33 @@ static icTagSignature Preview[]    = {icSigPreview0Tag,
 
 static volatile double GlobalAdaptationState = 0;
 
+// -------------------------Runtime SSE2 Detection-----------------------------
+
+#define SSE2_EDX_MASK (1UL << 26)
+static LCMSBOOL SSE2Available() {
+       
+       static int isAvailable = -1;
+       DWORD a, b, c, d;
+       DWORD function = 0x00000001;
+
+       if (isAvailable == -1) {
+
+// If we don't have compile-time support, we don't have runtime support
+#ifndef HAVE_SSE2_INTEL_MNEMONICS
+              isAvailable = 0;
+#else
+       /* We have CPUID macros defined if we have sse2 mnemonics. */
+              LCMSCPUID(function, &a, &b, &c, &d);
+              if (d & SSE2_EDX_MASK)
+                     isAvailable = 1;
+              else
+                     isAvailable = 0;
+#endif
+       }
+
+       return (isAvailable) ? TRUE : FALSE;
+}
+
 // --------------------------------Stages--------------------------------------
 
 // Following routines does implement several kind of steps inside 
@@ -501,8 +582,8 @@ void CachedXFORMGamutCheck(_LPcmsTRANSFORM p,
 
 static
 void MatrixShaperXFORM(_LPcmsTRANSFORM p,
-                     LPVOID in,
-                     LPVOID out, unsigned int Size)
+                       LPVOID in,
+                       LPVOID out, unsigned int Size)
 {
        register LPBYTE accum;
        register LPBYTE output;
@@ -522,25 +603,166 @@ void MatrixShaperXFORM(_LPcmsTRANSFORM p,
        }
 }
 
+static const FLOAT floatScale = 65536.0f;
+static const FLOAT * const floatScaleAddr = &floatScale; // Win32 ASM doesn't know how to take addressOf inline
+
+#ifdef HAVE_SSE2_INTEL_MNEMONICS
 static
 void MatrixShaperXFORMFloat(_LPcmsTRANSFORM p,
                             LPVOID in,
                             LPVOID out, unsigned int Size)
 {
-       register LPBYTE input, output;
+       register LPBYTE In, Out;
        register unsigned int i;
+       LPMATSHAPER MatShaper;
 
 
-       input  = (LPBYTE) in;
-       output = (LPBYTE) out;
+       In  = (LPBYTE) in;
+       Out = (LPBYTE) out;
+       MatShaper = p -> SmeltMatShaper;
 
        for (i=0; i < Size; i++)
        {
-       cmsEvalMatShaperFloat(p -> SmeltMatShaper, input, output);
-       input += 3;
-       output += 3;
+
+              LPFVEC3 FloatVals = &MatShaper -> Matrix.FA.F->v[3]; // Access our secret aligned temp buffer
+              LPFVEC3 MatPtr = MatShaper ->  Matrix.FA.F->v; // Matrix
+              LPFLOAT clampMax = &MatShaper -> clampMax;
+              LPDWORD tmp = (LPDWORD) FloatVals;
+
+              if (MatShaper -> dwFlags & MATSHAPER_HASINPSHAPER)
+              {
+                     if (MatShaper->L2_Precache != NULL) 
+                     {
+                     FloatVals->n[VX] = MatShaper->L2_Precache->Impl.LI16F_FORWARD.Cache[0][In[0]];
+                     FloatVals->n[VY] = MatShaper->L2_Precache->Impl.LI16F_FORWARD.Cache[1][In[1]];
+                     FloatVals->n[VZ] = MatShaper->L2_Precache->Impl.LI16F_FORWARD.Cache[2][In[2]];
+                     }
+                     else
+                     {
+                     FloatVals->n[VX] = ToFloatDomain(cmsLinearInterpLUT16(RGB_8_TO_16(In[0]), MatShaper -> L2[0], &MatShaper -> p2_16));
+                     FloatVals->n[VY] = ToFloatDomain(cmsLinearInterpLUT16(RGB_8_TO_16(In[1]), MatShaper -> L2[1], &MatShaper -> p2_16));
+                     FloatVals->n[VZ] = ToFloatDomain(cmsLinearInterpLUT16(RGB_8_TO_16(In[2]), MatShaper -> L2[2], &MatShaper -> p2_16));
+                     }
+              }
+              else
+              {
+              FloatVals->n[VX] = ToFloatDomain(In[0]);
+              FloatVals->n[VY] = ToFloatDomain(In[1]);
+              FloatVals->n[VZ] = ToFloatDomain(In[2]);
+              }
+
+              if (MatShaper -> dwFlags & MATSHAPER_HASMATRIX)
+              {       
+#ifdef __GNUC__
+                __asm(
+                      "movaps (%0), %%xmm1;\n\t"          // Move the first matrix column to xmm1
+                      "movaps 16(%0), %%xmm2;\n\t"        // Move the second matrix column to xmm2
+                      "movaps 32(%0), %%xmm3;\n\t"        // move the third matrix column to xmm3
+                      "movaps 48(%0), %%xmm0;\n\t"        // Move the vector to xmm0
+
+                                                          // Note - We have to copy and then shuffle because of the weird
+                                                          // semantics of shufps
+                                                          //
+                      "movaps %%xmm0, %%xmm4;\n\t"        // Copy the vector to xmm4
+                      "shufps $0, %%xmm4, %%xmm4;\n\t"    // Shuffle to repeat the first vector element repeated 4 times
+                      "mulps %%xmm4, %%xmm1;\n\t"         // Multiply the first vector element by the first matrix column
+                      "movaps %%xmm0, %%xmm5; \n\t"       // Copy the vector to xmm5
+                      "shufps $0x55, %%xmm5, %%xmm5;\n\t" // Shuffle to repeat the second vector element repeated 4 times
+                      "mulps %%xmm5, %%xmm2;\n\t"         // Multiply the second vector element by the seccond matrix column 
+                      "movaps %%xmm0, %%xmm6;\n\t"        // Copy the vector to xmm6
+                      "shufps $0xAA, %%xmm6, %%xmm6;\n\t" // Shuffle to repeat the third vector element repeated 4 times
+                      "mulps %%xmm6, %%xmm3;\n\t"         // Multiply the third vector element by the third matrix column
+
+                      "addps %%xmm3, %%xmm2;\n\t"         // Sum (second + third) columns
+                      "addps %%xmm2, %%xmm1;\n\t"         // Sum ((second + third) + first) columns
+
+                      "movss (%1), %%xmm7;\n\t"        // load the floating point representation of 65535/65536 
+                      "shufps $0, %%xmm7, %%xmm7;\n\t" // move it into all of the four slots
+                      "minps %%xmm7, %%xmm1;\n\t"      // clamp the vector to 1.0 max
+                      "xorps %%xmm6, %%xmm6;\n\t"       // get us cleared bitpatern, which is 0.0f
+                      "maxps %%xmm6, %%xmm1;\n\t"      // clamp the vector to 0.0 min
+                      "movss (%2), %%xmm5;\n\t"        // load the floating point scale factor
+                      "shufps $0, %%xmm5, %%xmm5;\n\t" // put it in all four slots
+                      "mulps %%xmm5, %%xmm1;\n\t"      // multiply by the scale factor
+                      "cvtps2dq %%xmm1, %%xmm1;\n\t"   // convert to integers
+                      "movdqa %%xmm1, 48(%0);\n\t"       // store
+
+                      : 
+                      : "r" (MatPtr), "r" (clampMax), "r" (&floatScale)
+                      : "memory"
+                      );
+#else
+                ASM {
+                      mov      eax, MatPtr
+                      mov      ecx, clampMax
+                      mov      edx, floatScaleAddr
+
+                      movaps   xmm1, [eax]
+                      movaps   xmm2, [eax + 16]
+                      movaps   xmm3, [eax + 32]
+                      movaps   xmm0, [eax + 48]
+
+                      movaps   xmm4, xmm0
+                      shufps   xmm4, xmm4, 0
+                      mulps    xmm1, xmm4
+                      movaps   xmm5, xmm0
+                      shufps   xmm5, xmm5, 0x55
+                      mulps    xmm2, xmm5
+                      movaps   xmm6, xmm0
+                      shufps   xmm6, xmm6, 0xAA
+                      mulps    xmm3, xmm6
+
+                      addps    xmm2, xmm3
+                      addps    xmm1, xmm2
+
+                      movss    xmm7, [ecx]
+                      shufps   xmm7, xmm7, 0
+                      minps    xmm1, xmm7
+                      xorps    xmm6, xmm6
+                      maxps    xmm1, xmm6
+                      movss    xmm5, [edx]
+                      shufps   xmm5, xmm5, 0
+                      mulps    xmm1, xmm5
+                      cvtps2dq xmm1, xmm1
+                      movdqa   [eax + 48], xmm1
+                }
+#endif
+
+              }
+              else 
+              {
+              tmp[0] = _cmsClampWord(FromFloatDomain(FloatVals->n[VX]));
+              tmp[1] = _cmsClampWord(FromFloatDomain(FloatVals->n[VY]));
+              tmp[2] = _cmsClampWord(FromFloatDomain(FloatVals->n[VZ]));
+              }
+                  
+              if (MatShaper -> dwFlags & MATSHAPER_HASSHAPER)
+              {
+                     if (MatShaper->L_Precache != NULL) 
+                     {
+                     Out[0] = MatShaper->L_Precache->Impl.LI168_REVERSE.Cache[0][tmp[0]];
+                     Out[1] = MatShaper->L_Precache->Impl.LI168_REVERSE.Cache[1][tmp[1]];
+                     Out[2] = MatShaper->L_Precache->Impl.LI168_REVERSE.Cache[2][tmp[2]];
+                     }
+                     else 
+                     {
+                     Out[0] = RGB_16_TO_8(cmsLinearInterpLUT16((WORD)tmp[0], MatShaper -> L[0], &MatShaper -> p16));
+                     Out[1] = RGB_16_TO_8(cmsLinearInterpLUT16((WORD)tmp[1], MatShaper -> L[1], &MatShaper -> p16));
+                     Out[2] = RGB_16_TO_8(cmsLinearInterpLUT16((WORD)tmp[2], MatShaper -> L[2], &MatShaper -> p16));
+                     }
+              }
+              else
+              {
+              Out[0] = RGB_16_TO_8((WORD)tmp[0]);
+              Out[1] = RGB_16_TO_8((WORD)tmp[1]);
+              Out[2] = RGB_16_TO_8((WORD)tmp[2]);
+              }
+
+              In += 3;
+              Out += 3;
        }
 }
+#endif
 
 
 // Using Named color input table
@@ -1296,8 +1518,29 @@ _LPcmsTRANSFORM PickTransformRoutine(_LPcmsTRANSFORM p,
                        (p -> ExitColorSpace == icSigRgbData)  &&
                        !(p -> dwOriginalFlags & cmsFLAGS_BLACKPOINTCOMPENSATION)) {
 
+
+                          // If the floating point path is requested, see if we support it
+                          if (p -> dwOriginalFlags & cmsFLAGS_FLOATSHAPER)
+                          {
+
+#ifndef HAVE_SSE2_INTEL_MNEMONICS 
+                                 // Turn it off if we can't compile it
+                                 p -> dwOriginalFlags &= ~cmsFLAGS_FLOATSHAPER;
+#else
+                                 // Turn it off if we don't have it at runtime
+                                 if (!SSE2Available())
+                                        p -> dwOriginalFlags &= ~cmsFLAGS_FLOATSHAPER;
+#endif
+                          }
+
                           // Yes... try to smelt matrix-shapers
+
+#ifndef HAVE_SSE2_INTEL_MNEMONICS
+                          p -> xform = MatrixShaperXFORM;
+#else
                           p -> xform = (p -> dwOriginalFlags & cmsFLAGS_FLOATSHAPER) ? MatrixShaperXFORMFloat : MatrixShaperXFORM;
+#endif
+
                           p -> dwOriginalFlags |= cmsFLAGS_NOTPRECALC;
 
                           if (!cmsBuildSmeltMatShaper(p))