Add an SSE2 fast path for the floating-point matrix shaper in lcms (32-bit x86 only).
 * lcms.h, cmsmtrx.c: declare and define MAT3toFloatTranspose(), which converts a MAT3 to floats while swapping rows and columns.
 * lcms.h, cmsmatsh.c: the float matrix is now stored transposed; MATSHAPER gains clampMax, set to 65535/65536 when the shaper is allocated; cmsEvalMatShaperFloat() is removed.
 * cmsxform.c: add LCMSCPUID() and SSE2Available() (MSVC and GCC variants); the body of the former cmsEvalMatShaperFloat() moves into MatrixShaperXFORMFloat(), where the matrix multiply, clamp and float-to-integer conversion are done in inline SSE/SSE2 assembly; PickTransformRoutine() drops cmsFLAGS_FLOATSHAPER when SSE2 is missing at compile time or at run time.
 * Review fix: the GCC asm block in MatrixShaperXFORMFloat() overwrites xmm0-xmm7, so they are now listed as clobbers.

diff --git a/modules/lcms/include/lcms.h b/modules/lcms/include/lcms.h index 0f4d4028c63..f9fb0c636bc 100644 --- a/modules/lcms/include/lcms.h +++ b/modules/lcms/include/lcms.h @@ -1571,6 +1571,7 @@ void cdecl MAT3eval(LPVEC3 r, LPMAT3 a, LPVEC3 v); void cdecl MAT3evalF(LPFVEC3 r, LPFMAT3 a, LPFVEC3 v); void cdecl MAT3toFix(LPWMAT3 r, LPMAT3 v); void cdecl MAT3toFloat(LPFMAT3 r, LPMAT3 v); +void cdecl MAT3toFloatTranspose(LPFMAT3 r, LPMAT3 v); void cdecl MAT3fromFix(LPMAT3 r, LPWMAT3 v); void cdecl MAT3evalW(LPWVEC3 r, LPWMAT3 a, LPWVEC3 v); LCMSBOOL cdecl MAT3isIdentity(LPWMAT3 a, double Tolerance); @@ -1862,8 +1863,16 @@ typedef struct { union { WMAT3 W; FMAT3A FA; // This is not a matrix proper - use FA.F to access the matrix pointer + // Moreover, we store the transpose of the matrix instead, so the first + // vector corresponds to the first column instead of the first row. } Matrix; + FLOAT clampMax; // SSE2 doesn't have an efficient way to clamp using integers, so we have + // to clamp in the float domain. Unfortunately, since we eventually want + // our integer values clamped to 2^16 - 1, we need to clamp with a very + // precise value in the float domain. We let the CPU take care of it by calculating + // it at transform creation time rather than trusting the compiler. 
+ L16PARAMS p16; // Primary curve LPWORD L[3]; LPLCMSPRECACHE L_Precache; @@ -1880,7 +1889,6 @@ LPMATSHAPER cdecl cmsAllocMatShaper2(LPMAT3 matrix, LPGAMMATABLE In[], LPLCMSPRE void cdecl cmsFreeMatShaper(LPMATSHAPER MatShaper); void cdecl cmsEvalMatShaper(LPMATSHAPER MatShaper, WORD In[], WORD Out[]); -void cdecl cmsEvalMatShaperFloat(LPMATSHAPER MatShaper, BYTE In[], BYTE Out[]); LCMSBOOL cdecl cmsReadICCMatrixRGB2XYZ(LPMAT3 r, cmsHPROFILE hProfile); diff --git a/modules/lcms/src/cmsmatsh.c b/modules/lcms/src/cmsmatsh.c index 28f4289df46..990a996cda6 100644 --- a/modules/lcms/src/cmsmatsh.c +++ b/modules/lcms/src/cmsmatsh.c @@ -103,9 +103,15 @@ LPMATSHAPER cmsAllocMatShaper2(LPMAT3 Matrix, LPGAMMATABLE In[], LPLCMSPRECACHE // Fill matrix part if (Behaviour & MATSHAPER_FLOATMAT) { FMAT3ASetup(&NewMatShaper->Matrix.FA); - MAT3toFloat(NewMatShaper -> Matrix.FA.F, Matrix); + MAT3toFloatTranspose(NewMatShaper -> Matrix.FA.F, Matrix); if (!FMAT3isIdentity(NewMatShaper -> Matrix.FA.F, 0.00001f)) NewMatShaper -> dwFlags |= MATSHAPER_HASMATRIX; + + // This needs to be calculated by the CPU or a very precise + // compiler. If it's too big (like 1.0), values are clamped + // to 65536 instead of 65535, and we either have an overflow of + // the precache bounds or scary downcasting. 
+ NewMatShaper -> clampMax = ((FLOAT) (65536 - 1)) / 65536.0f; } else { MAT3toFix(&NewMatShaper -> Matrix.W, Matrix); @@ -397,76 +403,6 @@ void OutputBehaviour(LPMATSHAPER MatShaper, WORD In[], WORD Out[]) } -void cmsEvalMatShaperFloat(LPMATSHAPER MatShaper, BYTE In[], BYTE Out[]) -{ - WORD tmp[3]; - FVEC3 OutVect; - LPFVEC3 FloatVals = &MatShaper -> Matrix.FA.F->v[3]; // Access our secret aligned temp buffer - - if (MatShaper -> dwFlags & MATSHAPER_HASINPSHAPER) - { - if (MatShaper->L2_Precache != NULL) - { - FloatVals->n[VX] = MatShaper->L2_Precache->Impl.LI16F_FORWARD.Cache[0][In[0]]; - FloatVals->n[VY] = MatShaper->L2_Precache->Impl.LI16F_FORWARD.Cache[1][In[1]]; - FloatVals->n[VZ] = MatShaper->L2_Precache->Impl.LI16F_FORWARD.Cache[2][In[2]]; - } - else - { - FloatVals->n[VX] = ToFloatDomain(cmsLinearInterpLUT16(RGB_8_TO_16(In[0]), MatShaper -> L2[0], &MatShaper -> p2_16)); - FloatVals->n[VY] = ToFloatDomain(cmsLinearInterpLUT16(RGB_8_TO_16(In[1]), MatShaper -> L2[1], &MatShaper -> p2_16)); - FloatVals->n[VZ] = ToFloatDomain(cmsLinearInterpLUT16(RGB_8_TO_16(In[2]), MatShaper -> L2[2], &MatShaper -> p2_16)); - } - } - else - { - FloatVals->n[VX] = ToFloatDomain(In[0]); - FloatVals->n[VY] = ToFloatDomain(In[1]); - FloatVals->n[VZ] = ToFloatDomain(In[2]); - } - - - if (MatShaper -> dwFlags & MATSHAPER_HASMATRIX) - { - - MAT3evalF(&OutVect, MatShaper -> Matrix.FA.F, FloatVals); - } - else - { - OutVect.n[VX] = FloatVals->n[VX]; - OutVect.n[VY] = FloatVals->n[VY]; - OutVect.n[VZ] = FloatVals->n[VZ]; - } - - - tmp[0] = _cmsClampWord(FromFloatDomain(OutVect.n[VX])); - tmp[1] = _cmsClampWord(FromFloatDomain(OutVect.n[VY])); - tmp[2] = _cmsClampWord(FromFloatDomain(OutVect.n[VZ])); - - - - if (MatShaper -> dwFlags & MATSHAPER_HASSHAPER) - { - if (MatShaper->L_Precache != NULL) - { - Out[0] = MatShaper->L_Precache->Impl.LI168_REVERSE.Cache[0][tmp[0]]; - Out[1] = MatShaper->L_Precache->Impl.LI168_REVERSE.Cache[1][tmp[1]]; - Out[2] = 
MatShaper->L_Precache->Impl.LI168_REVERSE.Cache[2][tmp[2]]; - } - else - { - Out[0] = RGB_16_TO_8(cmsLinearInterpLUT16(tmp[0], MatShaper -> L[0], &MatShaper -> p16)); - Out[1] = RGB_16_TO_8(cmsLinearInterpLUT16(tmp[1], MatShaper -> L[1], &MatShaper -> p16)); - Out[2] = RGB_16_TO_8(cmsLinearInterpLUT16(tmp[2], MatShaper -> L[2], &MatShaper -> p16)); - } - } - else - { - Out[0] = RGB_16_TO_8(tmp[0]); - Out[1] = RGB_16_TO_8(tmp[1]); - Out[2] = RGB_16_TO_8(tmp[2]); - } -} // Master on evaluating shapers, 3 different behaviours diff --git a/modules/lcms/src/cmsmtrx.c b/modules/lcms/src/cmsmtrx.c index b30233c0086..edaccaa3153 100644 --- a/modules/lcms/src/cmsmtrx.c +++ b/modules/lcms/src/cmsmtrx.c @@ -51,6 +51,7 @@ double cdecl MAT3det(LPMAT3 m); void cdecl MAT3eval(LPVEC3 r, LPMAT3 a, LPVEC3 v); void cdecl MAT3toFix(LPWMAT3 r, LPMAT3 v); void cdecl MAT3toFloat(LPFMAT3 r, LPMAT3 v); +void cdecl MAT3toFloatTranspose(LPFMAT3 r, LPMAT3 v); void cdecl MAT3evalW(LPWVEC3 r, LPWMAT3 a, LPWVEC3 v); void cdecl MAT3perK(LPMAT3 r, LPMAT3 v, double d); void cdecl MAT3scaleAndCut(LPWMAT3 r, LPMAT3 v, double d); @@ -861,6 +862,20 @@ void MAT3toFloat(LPFMAT3 r, LPMAT3 v) VEC3toFloat(&r -> v[2], &v -> v[2]); } +void MAT3toFloatTranspose(LPFMAT3 r, LPMAT3 v) +{ + unsigned i, j; + + /* for each row of the source. */ + for (i = 0; i < 3; ++i) + + /* For each element in the row. */ + for (j = 0; j < 3; ++j) + + /* Col=>Row, Row=>Col. 
*/ + r -> v[j].n[i] = DOUBLE_TO_FLOAT(v -> v[i].n[j]); +} + void MAT3fromFix(LPMAT3 r, LPWMAT3 v) { VEC3fromFix(&r -> v[0], &v -> v[0]); diff --git a/modules/lcms/src/cmsxform.c b/modules/lcms/src/cmsxform.c index 6c7318dd745..cc951a77d92 100644 --- a/modules/lcms/src/cmsxform.c +++ b/modules/lcms/src/cmsxform.c @@ -58,7 +58,61 @@ void LCMSEXPORT cmsSetAlarmCodes(int r, int g, int b); LCMSBOOL LCMSEXPORT cmsIsIntentSupported(cmsHPROFILE hProfile, int Intent, int UsedDirection); +// Determine if we can build with SSE2 (this was partly copied from jmorecfg.h in +// mozilla/jpeg) // ------------------------------------------------------------------------- +#if defined(_M_IX86) && !defined(__GNUC__) + +/* Get us a CPUID function. Avoid clobbering EBX because sometimes it's the PIC + register - I'm not sure if that ever happens on windows, but cpuid isn't + on the critical path so we just preserve the register to be safe and to be + consistent with the non-windows version. */ +LCMS_INLINE void LCMSCPUID(DWORD fxn, LPDWORD a, LPDWORD b, LPDWORD c, LPDWORD d) { + DWORD a_, b_, c_, d_; + + ASM { + xchg ebx, esi + mov eax, fxn + cpuid + mov a_, eax + mov b_, ebx + mov c_, ecx + mov d_, edx + xchg ebx, esi + } + *a = a_; + *b = b_; + *c = c_; + *d = d_; +} + +#define HAVE_MMX_INTEL_MNEMONICS + +/* SSE2 code appears broken for some cpus (bug 247437) */ +#define HAVE_SSE2_INTEL_MNEMONICS +#define HAVE_SSE2_INTRINSICS +#endif + +#if defined(__GNUC__) && defined(__i386__) + +/* Get us a CPUID function. We can't use ebx because it's the PIC register on + some platforms, so we use ESI instead and save ebx to avoid clobbering it. 
*/ +LCMS_INLINE void LCMSCPUID(DWORD fxn, LPDWORD a, LPDWORD b, LPDWORD c, LPDWORD d) { + + DWORD a_, b_, c_, d_; + __asm__ __volatile__ ("xchgl %%ebx, %%esi; cpuid; xchgl %%ebx, %%esi;" + : "=a" (a_), "=S" (b_), "=c" (c_), "=d" (d_) : "a" (fxn)); + *a = a_; + *b = b_; + *c = c_; + *d = d_; + } + +#define HAVE_SSE2_INTRINSICS +/* XXX - the below wasn't in jpeg/jmorecfg.h - why? */ +#define HAVE_SSE2_INTEL_MNEMONICS +#endif /* GNUC && i386 */ + // Alarm RGB codes @@ -89,6 +143,33 @@ static icTagSignature Preview[] = {icSigPreview0Tag, static volatile double GlobalAdaptationState = 0; +// -------------------------Runtime SSE2 Detection----------------------------- + +#define SSE2_EDX_MASK (1UL << 26) +static LCMSBOOL SSE2Available(void) { + + static int isAvailable = -1; + DWORD a, b, c, d; + DWORD function = 0x00000001; + + if (isAvailable == -1) { + +// If we don't have compile-time support, we don't have runtime support +#ifndef HAVE_SSE2_INTEL_MNEMONICS + isAvailable = 0; +#else + /* We have CPUID macros defined if we have sse2 mnemonics. */ + LCMSCPUID(function, &a, &b, &c, &d); + if (d & SSE2_EDX_MASK) + isAvailable = 1; + else + isAvailable = 0; +#endif + } + + return (isAvailable) ? 
TRUE : FALSE; +} + // --------------------------------Stages-------------------------------------- // Following routines does implement several kind of steps inside @@ -501,8 +582,8 @@ void CachedXFORMGamutCheck(_LPcmsTRANSFORM p, static void MatrixShaperXFORM(_LPcmsTRANSFORM p, - LPVOID in, - LPVOID out, unsigned int Size) + LPVOID in, + LPVOID out, unsigned int Size) { register LPBYTE accum; register LPBYTE output; @@ -522,25 +603,166 @@ void MatrixShaperXFORM(_LPcmsTRANSFORM p, } } +static const FLOAT floatScale = 65536.0f; +static const FLOAT * const floatScaleAddr = &floatScale; // Win32 ASM doesn't know how to take addressOf inline + +#ifdef HAVE_SSE2_INTEL_MNEMONICS static void MatrixShaperXFORMFloat(_LPcmsTRANSFORM p, LPVOID in, LPVOID out, unsigned int Size) { - register LPBYTE input, output; + register LPBYTE In, Out; register unsigned int i; + LPMATSHAPER MatShaper; - input = (LPBYTE) in; - output = (LPBYTE) out; + In = (LPBYTE) in; + Out = (LPBYTE) out; + MatShaper = p -> SmeltMatShaper; for (i=0; i < Size; i++) { - cmsEvalMatShaperFloat(p -> SmeltMatShaper, input, output); - input += 3; - output += 3; + + LPFVEC3 FloatVals = &MatShaper -> Matrix.FA.F->v[3]; // Access our secret aligned temp buffer + LPFVEC3 MatPtr = MatShaper -> Matrix.FA.F->v; // Matrix + LPFLOAT clampMax = &MatShaper -> clampMax; + LPDWORD tmp = (LPDWORD) FloatVals; + + if (MatShaper -> dwFlags & MATSHAPER_HASINPSHAPER) + { + if (MatShaper->L2_Precache != NULL) + { + FloatVals->n[VX] = MatShaper->L2_Precache->Impl.LI16F_FORWARD.Cache[0][In[0]]; + FloatVals->n[VY] = MatShaper->L2_Precache->Impl.LI16F_FORWARD.Cache[1][In[1]]; + FloatVals->n[VZ] = MatShaper->L2_Precache->Impl.LI16F_FORWARD.Cache[2][In[2]]; + } + else + { + FloatVals->n[VX] = ToFloatDomain(cmsLinearInterpLUT16(RGB_8_TO_16(In[0]), MatShaper -> L2[0], &MatShaper -> p2_16)); + FloatVals->n[VY] = ToFloatDomain(cmsLinearInterpLUT16(RGB_8_TO_16(In[1]), MatShaper -> L2[1], &MatShaper -> p2_16)); + FloatVals->n[VZ] = 
ToFloatDomain(cmsLinearInterpLUT16(RGB_8_TO_16(In[2]), MatShaper -> L2[2], &MatShaper -> p2_16)); + } + } + else + { + FloatVals->n[VX] = ToFloatDomain(In[0]); + FloatVals->n[VY] = ToFloatDomain(In[1]); + FloatVals->n[VZ] = ToFloatDomain(In[2]); + } + + if (MatShaper -> dwFlags & MATSHAPER_HASMATRIX) + { +#ifdef __GNUC__ + __asm( + "movaps (%0), %%xmm1;\n\t" // Move the first matrix column to xmm1 + "movaps 16(%0), %%xmm2;\n\t" // Move the second matrix column to xmm2 + "movaps 32(%0), %%xmm3;\n\t" // move the third matrix column to xmm3 + "movaps 48(%0), %%xmm0;\n\t" // Move the vector to xmm0 + + // Note - We have to copy and then shuffle because of the weird + // semantics of shufps + // + "movaps %%xmm0, %%xmm4;\n\t" // Copy the vector to xmm4 + "shufps $0, %%xmm4, %%xmm4;\n\t" // Shuffle to repeat the first vector element repeated 4 times + "mulps %%xmm4, %%xmm1;\n\t" // Multiply the first vector element by the first matrix column + "movaps %%xmm0, %%xmm5; \n\t" // Copy the vector to xmm5 + "shufps $0x55, %%xmm5, %%xmm5;\n\t" // Shuffle to repeat the second vector element repeated 4 times + "mulps %%xmm5, %%xmm2;\n\t" // Multiply the second vector element by the second matrix column + "movaps %%xmm0, %%xmm6;\n\t" // Copy the vector to xmm6 + "shufps $0xAA, %%xmm6, %%xmm6;\n\t" // Shuffle to repeat the third vector element repeated 4 times + "mulps %%xmm6, %%xmm3;\n\t" // Multiply the third vector element by the third matrix column + + "addps %%xmm3, %%xmm2;\n\t" // Sum (second + third) columns + "addps %%xmm2, %%xmm1;\n\t" // Sum ((second + third) + first) columns + + "movss (%1), %%xmm7;\n\t" // load the floating point representation of 65535/65536 + "shufps $0, %%xmm7, %%xmm7;\n\t" // move it into all of the four slots + "minps %%xmm7, %%xmm1;\n\t" // clamp the vector to clampMax (65535/65536) max + "xorps %%xmm6, %%xmm6;\n\t" // get us a cleared bit pattern, which is 0.0f + "maxps %%xmm6, %%xmm1;\n\t" // clamp the vector to 0.0 min + "movss (%2), %%xmm5;\n\t" // load the 
floating point scale factor + "shufps $0, %%xmm5, %%xmm5;\n\t" // put it in all four slots + "mulps %%xmm5, %%xmm1;\n\t" // multiply by the scale factor + "cvtps2dq %%xmm1, %%xmm1;\n\t" // convert to integers + "movdqa %%xmm1, 48(%0);\n\t" // store + + : + : "r" (MatPtr), "r" (clampMax), "r" (&floatScale) + : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" // the asm overwrites xmm0-xmm7 and stores to 48(%0) + ); +#else + ASM { + mov eax, MatPtr + mov ecx, clampMax + mov edx, floatScaleAddr + + movaps xmm1, [eax] + movaps xmm2, [eax + 16] + movaps xmm3, [eax + 32] + movaps xmm0, [eax + 48] + + movaps xmm4, xmm0 + shufps xmm4, xmm4, 0 + mulps xmm1, xmm4 + movaps xmm5, xmm0 + shufps xmm5, xmm5, 0x55 + mulps xmm2, xmm5 + movaps xmm6, xmm0 + shufps xmm6, xmm6, 0xAA + mulps xmm3, xmm6 + + addps xmm2, xmm3 + addps xmm1, xmm2 + + movss xmm7, [ecx] + shufps xmm7, xmm7, 0 + minps xmm1, xmm7 + xorps xmm6, xmm6 + maxps xmm1, xmm6 + movss xmm5, [edx] + shufps xmm5, xmm5, 0 + mulps xmm1, xmm5 + cvtps2dq xmm1, xmm1 + movdqa [eax + 48], xmm1 + } +#endif + + } + else + { + tmp[0] = _cmsClampWord(FromFloatDomain(FloatVals->n[VX])); + tmp[1] = _cmsClampWord(FromFloatDomain(FloatVals->n[VY])); + tmp[2] = _cmsClampWord(FromFloatDomain(FloatVals->n[VZ])); + } + + if (MatShaper -> dwFlags & MATSHAPER_HASSHAPER) + { + if (MatShaper->L_Precache != NULL) + { + Out[0] = MatShaper->L_Precache->Impl.LI168_REVERSE.Cache[0][tmp[0]]; + Out[1] = MatShaper->L_Precache->Impl.LI168_REVERSE.Cache[1][tmp[1]]; + Out[2] = MatShaper->L_Precache->Impl.LI168_REVERSE.Cache[2][tmp[2]]; + } + else + { + Out[0] = RGB_16_TO_8(cmsLinearInterpLUT16((WORD)tmp[0], MatShaper -> L[0], &MatShaper -> p16)); + Out[1] = RGB_16_TO_8(cmsLinearInterpLUT16((WORD)tmp[1], MatShaper -> L[1], &MatShaper -> p16)); + Out[2] = RGB_16_TO_8(cmsLinearInterpLUT16((WORD)tmp[2], MatShaper -> L[2], &MatShaper -> p16)); + } + } + else + { + Out[0] = RGB_16_TO_8((WORD)tmp[0]); + Out[1] = RGB_16_TO_8((WORD)tmp[1]); + Out[2] 
= RGB_16_TO_8((WORD)tmp[2]); + } + + In += 3; + Out += 3; } } +#endif // Using Named color input table @@ 
-1296,8 +1518,29 @@ _LPcmsTRANSFORM PickTransformRoutine(_LPcmsTRANSFORM p, (p -> ExitColorSpace == icSigRgbData) && !(p -> dwOriginalFlags & cmsFLAGS_BLACKPOINTCOMPENSATION)) { + + // If the floating point path is requested, see if we support it + if (p -> dwOriginalFlags & cmsFLAGS_FLOATSHAPER) + { + +#ifndef HAVE_SSE2_INTEL_MNEMONICS + // Turn it off if we can't compile it + p -> dwOriginalFlags &= ~cmsFLAGS_FLOATSHAPER; +#else + // Turn it off if we don't have it at runtime + if (!SSE2Available()) + p -> dwOriginalFlags &= ~cmsFLAGS_FLOATSHAPER; +#endif + } + // Yes... try to smelt matrix-shapers + +#ifndef HAVE_SSE2_INTEL_MNEMONICS + p -> xform = MatrixShaperXFORM; +#else p -> xform = (p -> dwOriginalFlags & cmsFLAGS_FLOATSHAPER) ? MatrixShaperXFORMFloat : MatrixShaperXFORM; +#endif + p -> dwOriginalFlags |= cmsFLAGS_NOTPRECALC; if (!cmsBuildSmeltMatShaper(p))