Optimized SSE2 assembly for LCMS - part of bug 445552. r=vlad

2008-08-17 22:07:51 -07:00 · 2008-08-17 22:07:51 -07:00 · 47b7be01c1
--- a/modules/lcms/include/lcms.h
+++ b/modules/lcms/include/lcms.h
@ -1571,6 +1571,7 @@ void      cdecl MAT3eval(LPVEC3 r, LPMAT3 a, LPVEC3 v);
 void      cdecl MAT3evalF(LPFVEC3 r, LPFMAT3 a, LPFVEC3 v);
 void      cdecl MAT3toFix(LPWMAT3 r, LPMAT3 v);
 void      cdecl MAT3toFloat(LPFMAT3 r, LPMAT3 v);
 void      cdecl MAT3toFloatTranspose(LPFMAT3 r, LPMAT3 v);
 void      cdecl MAT3fromFix(LPMAT3 r, LPWMAT3 v);
 void      cdecl MAT3evalW(LPWVEC3 r, LPWMAT3 a, LPWVEC3 v);
 LCMSBOOL  cdecl MAT3isIdentity(LPWMAT3 a, double Tolerance);
@ -1862,8 +1863,16 @@ typedef struct {
               union {
                  WMAT3 W;
                  FMAT3A FA; // This is not a matrix proper - use FA.F to access the matrix pointer
                             // Moreover, we store the transpose of the matrix instead, so the first
                             // vector corresponds to the first column instead of the first row.
               } Matrix;
               FLOAT clampMax; // SSE2 doesn't have an efficient way to clamp using integers, so we have
                               // to clamp in the float domain. Unfortunately, since we eventually want
                               // our integer values clamped to 2^16 - 1, we need to clamp with a very
                               // precise value in the float domain. We let the CPU take care of by calculating
                               // it at transform creation time rather than trusting the compiler.
               L16PARAMS p16;       // Primary curve
               LPWORD L[3];
               LPLCMSPRECACHE L_Precache;
@ -1880,7 +1889,6 @@ LPMATSHAPER cdecl cmsAllocMatShaper2(LPMAT3 matrix, LPGAMMATABLE In[], LPLCMSPRE
 void        cdecl cmsFreeMatShaper(LPMATSHAPER MatShaper);
 void        cdecl cmsEvalMatShaper(LPMATSHAPER MatShaper, WORD In[], WORD Out[]);
 void        cdecl cmsEvalMatShaperFloat(LPMATSHAPER MatShaper, BYTE In[], BYTE Out[]);
 LCMSBOOL    cdecl cmsReadICCMatrixRGB2XYZ(LPMAT3 r, cmsHPROFILE hProfile);
--- a/modules/lcms/src/cmsmatsh.c
+++ b/modules/lcms/src/cmsmatsh.c
@ -103,9 +103,15 @@ LPMATSHAPER cmsAllocMatShaper2(LPMAT3 Matrix, LPGAMMATABLE In[], LPLCMSPRECACHE
       // Fill matrix part
       if (Behaviour & MATSHAPER_FLOATMAT) {
              FMAT3ASetup(&NewMatShaper->Matrix.FA);
-              MAT3toFloat(NewMatShaper -> Matrix.FA.F, Matrix);
+              MAT3toFloatTranspose(NewMatShaper -> Matrix.FA.F, Matrix);
              if (!FMAT3isIdentity(NewMatShaper -> Matrix.FA.F, 0.00001f))
                            NewMatShaper -> dwFlags |= MATSHAPER_HASMATRIX;
              // This needs to be calculated by the CPU or a very precise
              // compiler. If it's too big (like 1.0), values are clamped
              // to 65536 instead 65535, and we either have an overflow of
              // the precache bounds or scary downcasting.
              NewMatShaper -> clampMax = ((FLOAT) (65536 - 1)) / 65536.0f;
       }
       else {
              MAT3toFix(&NewMatShaper -> Matrix.W, Matrix);
@ -397,76 +403,6 @@ void OutputBehaviour(LPMATSHAPER MatShaper, WORD In[], WORD Out[])
 }
 void cmsEvalMatShaperFloat(LPMATSHAPER MatShaper, BYTE In[], BYTE Out[])
 {
       WORD tmp[3];
       FVEC3 OutVect;
       LPFVEC3 FloatVals = &MatShaper -> Matrix.FA.F->v[3]; // Access our secret aligned temp buffer
       if (MatShaper -> dwFlags & MATSHAPER_HASINPSHAPER)
       {
              if (MatShaper->L2_Precache != NULL) 
              {
              FloatVals->n[VX] = MatShaper->L2_Precache->Impl.LI16F_FORWARD.Cache[0][In[0]];
              FloatVals->n[VY] = MatShaper->L2_Precache->Impl.LI16F_FORWARD.Cache[1][In[1]];
              FloatVals->n[VZ] = MatShaper->L2_Precache->Impl.LI16F_FORWARD.Cache[2][In[2]];
              }
              else
              {
              FloatVals->n[VX] = ToFloatDomain(cmsLinearInterpLUT16(RGB_8_TO_16(In[0]), MatShaper -> L2[0], &MatShaper -> p2_16));
              FloatVals->n[VY] = ToFloatDomain(cmsLinearInterpLUT16(RGB_8_TO_16(In[1]), MatShaper -> L2[1], &MatShaper -> p2_16));
              FloatVals->n[VZ] = ToFloatDomain(cmsLinearInterpLUT16(RGB_8_TO_16(In[2]), MatShaper -> L2[2], &MatShaper -> p2_16));
              }
       }
       else
       {
       FloatVals->n[VX] = ToFloatDomain(In[0]);
       FloatVals->n[VY] = ToFloatDomain(In[1]);
       FloatVals->n[VZ] = ToFloatDomain(In[2]);
       }
       if (MatShaper -> dwFlags & MATSHAPER_HASMATRIX)
       {       
       MAT3evalF(&OutVect, MatShaper -> Matrix.FA.F, FloatVals);
       }
       else 
       {
       OutVect.n[VX] = FloatVals->n[VX];
       OutVect.n[VY] = FloatVals->n[VY];
       OutVect.n[VZ] = FloatVals->n[VZ];
       }
       tmp[0] = _cmsClampWord(FromFloatDomain(OutVect.n[VX]));
       tmp[1] = _cmsClampWord(FromFloatDomain(OutVect.n[VY]));
       tmp[2] = _cmsClampWord(FromFloatDomain(OutVect.n[VZ]));
       if (MatShaper -> dwFlags & MATSHAPER_HASSHAPER)
       {
              if (MatShaper->L_Precache != NULL) 
              {
              Out[0] = MatShaper->L_Precache->Impl.LI168_REVERSE.Cache[0][tmp[0]];
              Out[1] = MatShaper->L_Precache->Impl.LI168_REVERSE.Cache[1][tmp[1]];
              Out[2] = MatShaper->L_Precache->Impl.LI168_REVERSE.Cache[2][tmp[2]];
              }
              else 
              {
              Out[0] = RGB_16_TO_8(cmsLinearInterpLUT16(tmp[0], MatShaper -> L[0], &MatShaper -> p16));
              Out[1] = RGB_16_TO_8(cmsLinearInterpLUT16(tmp[1], MatShaper -> L[1], &MatShaper -> p16));
              Out[2] = RGB_16_TO_8(cmsLinearInterpLUT16(tmp[2], MatShaper -> L[2], &MatShaper -> p16));
              }
       }
       else
       {
       Out[0] = RGB_16_TO_8(tmp[0]);
       Out[1] = RGB_16_TO_8(tmp[1]);
       Out[2] = RGB_16_TO_8(tmp[2]);
       }
 }
 // Master on evaluating shapers, 3 different behaviours
--- a/modules/lcms/src/cmsmtrx.c
+++ b/modules/lcms/src/cmsmtrx.c
@ -51,6 +51,7 @@ double    cdecl MAT3det(LPMAT3 m);
 void      cdecl MAT3eval(LPVEC3 r, LPMAT3 a, LPVEC3 v);
 void      cdecl MAT3toFix(LPWMAT3 r, LPMAT3 v);
 void      cdecl MAT3toFloat(LPFMAT3 r, LPMAT3 v);
 void      cdecl MAT3toFloatTranspose(LPFMAT3 r, LPMAT3 v);
 void      cdecl MAT3evalW(LPWVEC3 r, LPWMAT3 a, LPWVEC3 v);
 void      cdecl MAT3perK(LPMAT3 r, LPMAT3 v, double d);
 void      cdecl MAT3scaleAndCut(LPWMAT3 r, LPMAT3 v, double d);
@ -861,6 +862,20 @@ void MAT3toFloat(LPFMAT3 r, LPMAT3 v)
       VEC3toFloat(&r -> v[2], &v -> v[2]);
 }
 void MAT3toFloatTranspose(LPFMAT3 r, LPMAT3 v)
 {
       unsigned i, j;
       /* for each row of the source. */
       for (i = 0; i < 3; ++i) 
           /* For element in the row. */
           for (j = 0; j < 3; ++j) 
               /* Col=>Row, Row=>Col. */
               r -> v[j].n[i] = DOUBLE_TO_FLOAT(v -> v[i].n[j]);
 }
 void MAT3fromFix(LPMAT3 r, LPWMAT3 v)
 {
       VEC3fromFix(&r -> v[0], &v -> v[0]);
--- a/modules/lcms/src/cmsxform.c
+++ b/modules/lcms/src/cmsxform.c
@ -58,7 +58,61 @@ void         LCMSEXPORT cmsSetAlarmCodes(int r, int g, int b);
 LCMSBOOL     LCMSEXPORT cmsIsIntentSupported(cmsHPROFILE hProfile,
                                                int Intent, int UsedDirection);
 // Determine if we can build with SSE2 (this was partly copied from jmorecfg.h in
 // mozilla/jpeg)
 // -------------------------------------------------------------------------
 #if defined(_M_IX86) && !defined(__GNUC__)
 /* Get us a CPUID function. Avoid clobbering EBX because sometimes it's the PIC
   register - I'm not sure if that ever happens on windows, but cpuid isn't
   on the critical path so we just preserve the register to be safe and to be
   consistent with the non-windows version. */
 LCMS_INLINE void LCMSCPUID(DWORD fxn, LPDWORD a, LPDWORD b, LPDWORD c, LPDWORD d) {
       DWORD a_, b_, c_, d_;
       ASM {
              xchg   ebx, esi
              mov    eax, fxn
              cpuid
              mov    a_, eax
              mov    b_, ebx
              mov    c_, ecx
              mov    d_, edx
              xchg   ebx, esi
       }
       *a = a_;
       *b = b_;
       *c = c_;
       *d = d_;
 }
 #define HAVE_MMX_INTEL_MNEMONICS 
 /* SSE2 code appears broken for some cpus (bug 247437) */
 #define HAVE_SSE2_INTEL_MNEMONICS
 #define HAVE_SSE2_INTRINSICS
 #endif
 #if defined(__GNUC__) && defined(__i386__)
 /* Get us a CPUID function. We can't use ebx because it's the PIC register on
   some platforms, so we use ESI instead and save ebx to avoid clobbering it. */
 LCMS_INLINE void LCMSCPUID(DWORD fxn, LPDWORD a, LPDWORD b, LPDWORD c, LPDWORD d) {
 	   DWORD a_, b_, c_, d_;
       __asm__ __volatile__ ("xchgl %%ebx, %%esi; cpuid; xchgl %%ebx, %%esi;" 
                             : "=a" (a_), "=S" (b_), "=c" (c_), "=d" (d_) : "a" (fxn));
 	   *a = a_;
 	   *b = b_;
 	   *c = c_;
 	   *d = d_;
 }
 #define HAVE_SSE2_INTRINSICS
 /* XXX - the below wasn't in jpeg/jmorecfg.h - why? */
 #define HAVE_SSE2_INTEL_MNEMONICS
 #endif /* ! GNUC && i386 */
 // Alarm RGB codes
@ -89,6 +143,33 @@ static icTagSignature Preview[]    = {icSigPreview0Tag,
 static volatile double GlobalAdaptationState = 0;
 // -------------------------Runtime SSE2 Detection-----------------------------
 #define SSE2_EDX_MASK (1UL << 26)
 static LCMSBOOL SSE2Available() {
       static int isAvailable = -1;
       DWORD a, b, c, d;
       DWORD function = 0x00000001;
       if (isAvailable == -1) {
 // If we don't have compile-time support, we don't have runtime support
 #ifndef HAVE_SSE2_INTEL_MNEMONICS
              isAvailable = 0;
 #else
       /* We have CPUID macros defined if we have sse2 mnemonics. */
              LCMSCPUID(function, &a, &b, &c, &d);
              if (d & SSE2_EDX_MASK)
                     isAvailable = 1;
              else
                     isAvailable = 0;
 #endif
       }
       return (isAvailable) ? TRUE : FALSE;
 }
 // --------------------------------Stages--------------------------------------
 // Following routines does implement several kind of steps inside 
@ -501,8 +582,8 @@ void CachedXFORMGamutCheck(_LPcmsTRANSFORM p,
 static
 void MatrixShaperXFORM(_LPcmsTRANSFORM p,
-                     LPVOID in,
+                       LPVOID in,
-                     LPVOID out, unsigned int Size)
+                       LPVOID out, unsigned int Size)
 {
       register LPBYTE accum;
       register LPBYTE output;
@ -522,25 +603,166 @@ void MatrixShaperXFORM(_LPcmsTRANSFORM p,
       }
 }
 static const FLOAT floatScale = 65536.0f;
 static const FLOAT * const floatScaleAddr = &floatScale; // Win32 ASM doesn't know how to take addressOf inline
 #ifdef HAVE_SSE2_INTEL_MNEMONICS
 static
 void MatrixShaperXFORMFloat(_LPcmsTRANSFORM p,
                            LPVOID in,
                            LPVOID out, unsigned int Size)
 {
-       register LPBYTE input, output;
+       register LPBYTE In, Out;
       register unsigned int i;
       LPMATSHAPER MatShaper;
-       input  = (LPBYTE) in;
+       In  = (LPBYTE) in;
-       output = (LPBYTE) out;
+       Out = (LPBYTE) out;
       MatShaper = p -> SmeltMatShaper;
       for (i=0; i < Size; i++)
       {
-       cmsEvalMatShaperFloat(p -> SmeltMatShaper, input, output);
+
-       input += 3;
+              LPFVEC3 FloatVals = &MatShaper -> Matrix.FA.F->v[3]; // Access our secret aligned temp buffer
-       output += 3;
+              LPFVEC3 MatPtr = MatShaper ->  Matrix.FA.F->v; // Matrix
              LPFLOAT clampMax = &MatShaper -> clampMax;
              LPDWORD tmp = (LPDWORD) FloatVals;
              if (MatShaper -> dwFlags & MATSHAPER_HASINPSHAPER)
              {
                     if (MatShaper->L2_Precache != NULL) 
                     {
                     FloatVals->n[VX] = MatShaper->L2_Precache->Impl.LI16F_FORWARD.Cache[0][In[0]];
                     FloatVals->n[VY] = MatShaper->L2_Precache->Impl.LI16F_FORWARD.Cache[1][In[1]];
                     FloatVals->n[VZ] = MatShaper->L2_Precache->Impl.LI16F_FORWARD.Cache[2][In[2]];
                     }
                     else
                     {
                     FloatVals->n[VX] = ToFloatDomain(cmsLinearInterpLUT16(RGB_8_TO_16(In[0]), MatShaper -> L2[0], &MatShaper -> p2_16));
                     FloatVals->n[VY] = ToFloatDomain(cmsLinearInterpLUT16(RGB_8_TO_16(In[1]), MatShaper -> L2[1], &MatShaper -> p2_16));
                     FloatVals->n[VZ] = ToFloatDomain(cmsLinearInterpLUT16(RGB_8_TO_16(In[2]), MatShaper -> L2[2], &MatShaper -> p2_16));
                     }
              }
              else
              {
              FloatVals->n[VX] = ToFloatDomain(In[0]);
              FloatVals->n[VY] = ToFloatDomain(In[1]);
              FloatVals->n[VZ] = ToFloatDomain(In[2]);
              }
              if (MatShaper -> dwFlags & MATSHAPER_HASMATRIX)
              {       
 #ifdef __GNUC__
                __asm(
                      "movaps (%0), %%xmm1;\n\t"          // Move the first matrix column to xmm1
                      "movaps 16(%0), %%xmm2;\n\t"        // Move the second matrix column to xmm2
                      "movaps 32(%0), %%xmm3;\n\t"        // move the third matrix column to xmm3
                      "movaps 48(%0), %%xmm0;\n\t"        // Move the vector to xmm0
                                                          // Note - We have to copy and then shuffle because of the weird
                                                          // semantics of shufps
                                                          //
                      "movaps %%xmm0, %%xmm4;\n\t"        // Copy the vector to xmm4
                      "shufps $0, %%xmm4, %%xmm4;\n\t"    // Shuffle to repeat the first vector element repeated 4 times
                      "mulps %%xmm4, %%xmm1;\n\t"         // Multiply the first vector element by the first matrix column
                      "movaps %%xmm0, %%xmm5; \n\t"       // Copy the vector to xmm5
                      "shufps $0x55, %%xmm5, %%xmm5;\n\t" // Shuffle to repeat the second vector element repeated 4 times
                      "mulps %%xmm5, %%xmm2;\n\t"         // Multiply the second vector element by the seccond matrix column 
                      "movaps %%xmm0, %%xmm6;\n\t"        // Copy the vector to xmm6
                      "shufps $0xAA, %%xmm6, %%xmm6;\n\t" // Shuffle to repeat the third vector element repeated 4 times
                      "mulps %%xmm6, %%xmm3;\n\t"         // Multiply the third vector element by the third matrix column
                      "addps %%xmm3, %%xmm2;\n\t"         // Sum (second + third) columns
                      "addps %%xmm2, %%xmm1;\n\t"         // Sum ((second + third) + first) columns
                      "movss (%1), %%xmm7;\n\t"        // load the floating point representation of 65535/65536 
                      "shufps $0, %%xmm7, %%xmm7;\n\t" // move it into all of the four slots
                      "minps %%xmm7, %%xmm1;\n\t"      // clamp the vector to 1.0 max
                      "xorps %%xmm6, %%xmm6;\n\t"       // get us cleared bitpatern, which is 0.0f
                      "maxps %%xmm6, %%xmm1;\n\t"      // clamp the vector to 0.0 min
                      "movss (%2), %%xmm5;\n\t"        // load the floating point scale factor
                      "shufps $0, %%xmm5, %%xmm5;\n\t" // put it in all four slots
                      "mulps %%xmm5, %%xmm1;\n\t"      // multiply by the scale factor
                      "cvtps2dq %%xmm1, %%xmm1;\n\t"   // convert to integers
                      "movdqa %%xmm1, 48(%0);\n\t"       // store
                      : 
                      : "r" (MatPtr), "r" (clampMax), "r" (&floatScale)
                      : "memory"
                      );
 #else
                ASM {
                      mov      eax, MatPtr
                      mov      ecx, clampMax
                      mov      edx, floatScaleAddr
                      movaps   xmm1, [eax]
                      movaps   xmm2, [eax + 16]
                      movaps   xmm3, [eax + 32]
                      movaps   xmm0, [eax + 48]
                      movaps   xmm4, xmm0
                      shufps   xmm4, xmm4, 0
                      mulps    xmm1, xmm4
                      movaps   xmm5, xmm0
                      shufps   xmm5, xmm5, 0x55
                      mulps    xmm2, xmm5
                      movaps   xmm6, xmm0
                      shufps   xmm6, xmm6, 0xAA
                      mulps    xmm3, xmm6
                      addps    xmm2, xmm3
                      addps    xmm1, xmm2
                      movss    xmm7, [ecx]
                      shufps   xmm7, xmm7, 0
                      minps    xmm1, xmm7
                      xorps    xmm6, xmm6
                      maxps    xmm1, xmm6
                      movss    xmm5, [edx]
                      shufps   xmm5, xmm5, 0
                      mulps    xmm1, xmm5
                      cvtps2dq xmm1, xmm1
                      movdqa   [eax + 48], xmm1
                }
 #endif
              }
              else 
              {
              tmp[0] = _cmsClampWord(FromFloatDomain(FloatVals->n[VX]));
              tmp[1] = _cmsClampWord(FromFloatDomain(FloatVals->n[VY]));
              tmp[2] = _cmsClampWord(FromFloatDomain(FloatVals->n[VZ]));
              }
              if (MatShaper -> dwFlags & MATSHAPER_HASSHAPER)
              {
                     if (MatShaper->L_Precache != NULL) 
                     {
                     Out[0] = MatShaper->L_Precache->Impl.LI168_REVERSE.Cache[0][tmp[0]];
                     Out[1] = MatShaper->L_Precache->Impl.LI168_REVERSE.Cache[1][tmp[1]];
                     Out[2] = MatShaper->L_Precache->Impl.LI168_REVERSE.Cache[2][tmp[2]];
                     }
                     else 
                     {
                     Out[0] = RGB_16_TO_8(cmsLinearInterpLUT16((WORD)tmp[0], MatShaper -> L[0], &MatShaper -> p16));
                     Out[1] = RGB_16_TO_8(cmsLinearInterpLUT16((WORD)tmp[1], MatShaper -> L[1], &MatShaper -> p16));
                     Out[2] = RGB_16_TO_8(cmsLinearInterpLUT16((WORD)tmp[2], MatShaper -> L[2], &MatShaper -> p16));
                     }
              }
              else
              {
              Out[0] = RGB_16_TO_8((WORD)tmp[0]);
              Out[1] = RGB_16_TO_8((WORD)tmp[1]);
              Out[2] = RGB_16_TO_8((WORD)tmp[2]);
              }
              In += 3;
              Out += 3;
       }
 }
 #endif
 // Using Named color input table
@ -1296,8 +1518,29 @@ _LPcmsTRANSFORM PickTransformRoutine(_LPcmsTRANSFORM p,
                       (p -> ExitColorSpace == icSigRgbData)  &&
                       !(p -> dwOriginalFlags & cmsFLAGS_BLACKPOINTCOMPENSATION)) {
                          // If the floating point path is requested, see if we support it
                          if (p -> dwOriginalFlags & cmsFLAGS_FLOATSHAPER)
                          {
 #ifndef HAVE_SSE2_INTEL_MNEMONICS 
                                 // Turn it off if we can't compile it
                                 p -> dwOriginalFlags &= ~cmsFLAGS_FLOATSHAPER;
 #else
                                 // Turn it off if we don't have it at runtime
                                 if (!SSE2Available())
                                        p -> dwOriginalFlags &= ~cmsFLAGS_FLOATSHAPER;
 #endif
                          }
                          // Yes... try to smelt matrix-shapers
 #ifndef HAVE_SSE2_INTEL_MNEMONICS
                          p -> xform = MatrixShaperXFORM;
 #else
                          p -> xform = (p -> dwOriginalFlags & cmsFLAGS_FLOATSHAPER) ? MatrixShaperXFORMFloat : MatrixShaperXFORM;
 #endif
                          p -> dwOriginalFlags |= cmsFLAGS_NOTPRECALC;
                          if (!cmsBuildSmeltMatShaper(p))