Optimized SSE2 assembly for LCMS - part of bug 445552. r=vlad

This commit is contained in:
Bobby Holley 2008-08-17 22:07:51 -07:00
Родитель b22606db84
Коммит 47b7be01c1
4 изменённых файлов: 282 добавлений и 80 удалений

Просмотреть файл

@ -1571,6 +1571,7 @@ void cdecl MAT3eval(LPVEC3 r, LPMAT3 a, LPVEC3 v);
void cdecl MAT3evalF(LPFVEC3 r, LPFMAT3 a, LPFVEC3 v);
void cdecl MAT3toFix(LPWMAT3 r, LPMAT3 v);
void cdecl MAT3toFloat(LPFMAT3 r, LPMAT3 v);
void cdecl MAT3toFloatTranspose(LPFMAT3 r, LPMAT3 v);
void cdecl MAT3fromFix(LPMAT3 r, LPWMAT3 v);
void cdecl MAT3evalW(LPWVEC3 r, LPWMAT3 a, LPWVEC3 v);
LCMSBOOL cdecl MAT3isIdentity(LPWMAT3 a, double Tolerance);
@ -1862,8 +1863,16 @@ typedef struct {
union {
WMAT3 W;
FMAT3A FA; // This is not a matrix proper - use FA.F to access the matrix pointer
// Moreover, we store the transpose of the matrix instead, so the first
// vector corresponds to the first column instead of the first row.
} Matrix;
FLOAT clampMax; // SSE2 doesn't have an efficient way to clamp using integers, so we have
// to clamp in the float domain. Unfortunately, since we eventually want
// our integer values clamped to 2^16 - 1, we need to clamp with a very
// precise value in the float domain. We let the CPU take care of by calculating
// it at transform creation time rather than trusting the compiler.
L16PARAMS p16; // Primary curve
LPWORD L[3];
LPLCMSPRECACHE L_Precache;
@ -1880,7 +1889,6 @@ LPMATSHAPER cdecl cmsAllocMatShaper2(LPMAT3 matrix, LPGAMMATABLE In[], LPLCMSPRE
void cdecl cmsFreeMatShaper(LPMATSHAPER MatShaper);
void cdecl cmsEvalMatShaper(LPMATSHAPER MatShaper, WORD In[], WORD Out[]);
void cdecl cmsEvalMatShaperFloat(LPMATSHAPER MatShaper, BYTE In[], BYTE Out[]);
LCMSBOOL cdecl cmsReadICCMatrixRGB2XYZ(LPMAT3 r, cmsHPROFILE hProfile);

Просмотреть файл

@ -103,9 +103,15 @@ LPMATSHAPER cmsAllocMatShaper2(LPMAT3 Matrix, LPGAMMATABLE In[], LPLCMSPRECACHE
// Fill matrix part
if (Behaviour & MATSHAPER_FLOATMAT) {
FMAT3ASetup(&NewMatShaper->Matrix.FA);
MAT3toFloat(NewMatShaper -> Matrix.FA.F, Matrix);
MAT3toFloatTranspose(NewMatShaper -> Matrix.FA.F, Matrix);
if (!FMAT3isIdentity(NewMatShaper -> Matrix.FA.F, 0.00001f))
NewMatShaper -> dwFlags |= MATSHAPER_HASMATRIX;
// This needs to be calculated by the CPU or a very precise
// compiler. If it's too big (like 1.0), values are clamped
// to 65536 instead 65535, and we either have an overflow of
// the precache bounds or scary downcasting.
NewMatShaper -> clampMax = ((FLOAT) (65536 - 1)) / 65536.0f;
}
else {
MAT3toFix(&NewMatShaper -> Matrix.W, Matrix);
@ -397,76 +403,6 @@ void OutputBehaviour(LPMATSHAPER MatShaper, WORD In[], WORD Out[])
}
void cmsEvalMatShaperFloat(LPMATSHAPER MatShaper, BYTE In[], BYTE Out[])
{
WORD tmp[3];
FVEC3 OutVect;
LPFVEC3 FloatVals = &MatShaper -> Matrix.FA.F->v[3]; // Access our secret aligned temp buffer
if (MatShaper -> dwFlags & MATSHAPER_HASINPSHAPER)
{
if (MatShaper->L2_Precache != NULL)
{
FloatVals->n[VX] = MatShaper->L2_Precache->Impl.LI16F_FORWARD.Cache[0][In[0]];
FloatVals->n[VY] = MatShaper->L2_Precache->Impl.LI16F_FORWARD.Cache[1][In[1]];
FloatVals->n[VZ] = MatShaper->L2_Precache->Impl.LI16F_FORWARD.Cache[2][In[2]];
}
else
{
FloatVals->n[VX] = ToFloatDomain(cmsLinearInterpLUT16(RGB_8_TO_16(In[0]), MatShaper -> L2[0], &MatShaper -> p2_16));
FloatVals->n[VY] = ToFloatDomain(cmsLinearInterpLUT16(RGB_8_TO_16(In[1]), MatShaper -> L2[1], &MatShaper -> p2_16));
FloatVals->n[VZ] = ToFloatDomain(cmsLinearInterpLUT16(RGB_8_TO_16(In[2]), MatShaper -> L2[2], &MatShaper -> p2_16));
}
}
else
{
FloatVals->n[VX] = ToFloatDomain(In[0]);
FloatVals->n[VY] = ToFloatDomain(In[1]);
FloatVals->n[VZ] = ToFloatDomain(In[2]);
}
if (MatShaper -> dwFlags & MATSHAPER_HASMATRIX)
{
MAT3evalF(&OutVect, MatShaper -> Matrix.FA.F, FloatVals);
}
else
{
OutVect.n[VX] = FloatVals->n[VX];
OutVect.n[VY] = FloatVals->n[VY];
OutVect.n[VZ] = FloatVals->n[VZ];
}
tmp[0] = _cmsClampWord(FromFloatDomain(OutVect.n[VX]));
tmp[1] = _cmsClampWord(FromFloatDomain(OutVect.n[VY]));
tmp[2] = _cmsClampWord(FromFloatDomain(OutVect.n[VZ]));
if (MatShaper -> dwFlags & MATSHAPER_HASSHAPER)
{
if (MatShaper->L_Precache != NULL)
{
Out[0] = MatShaper->L_Precache->Impl.LI168_REVERSE.Cache[0][tmp[0]];
Out[1] = MatShaper->L_Precache->Impl.LI168_REVERSE.Cache[1][tmp[1]];
Out[2] = MatShaper->L_Precache->Impl.LI168_REVERSE.Cache[2][tmp[2]];
}
else
{
Out[0] = RGB_16_TO_8(cmsLinearInterpLUT16(tmp[0], MatShaper -> L[0], &MatShaper -> p16));
Out[1] = RGB_16_TO_8(cmsLinearInterpLUT16(tmp[1], MatShaper -> L[1], &MatShaper -> p16));
Out[2] = RGB_16_TO_8(cmsLinearInterpLUT16(tmp[2], MatShaper -> L[2], &MatShaper -> p16));
}
}
else
{
Out[0] = RGB_16_TO_8(tmp[0]);
Out[1] = RGB_16_TO_8(tmp[1]);
Out[2] = RGB_16_TO_8(tmp[2]);
}
}
// Master on evaluating shapers, 3 different behaviours

Просмотреть файл

@ -51,6 +51,7 @@ double cdecl MAT3det(LPMAT3 m);
void cdecl MAT3eval(LPVEC3 r, LPMAT3 a, LPVEC3 v);
void cdecl MAT3toFix(LPWMAT3 r, LPMAT3 v);
void cdecl MAT3toFloat(LPFMAT3 r, LPMAT3 v);
void cdecl MAT3toFloatTranspose(LPFMAT3 r, LPMAT3 v);
void cdecl MAT3evalW(LPWVEC3 r, LPWMAT3 a, LPWVEC3 v);
void cdecl MAT3perK(LPMAT3 r, LPMAT3 v, double d);
void cdecl MAT3scaleAndCut(LPWMAT3 r, LPMAT3 v, double d);
@ -861,6 +862,20 @@ void MAT3toFloat(LPFMAT3 r, LPMAT3 v)
VEC3toFloat(&r -> v[2], &v -> v[2]);
}
void MAT3toFloatTranspose(LPFMAT3 r, LPMAT3 v)
{
unsigned i, j;
/* for each row of the source. */
for (i = 0; i < 3; ++i)
/* For element in the row. */
for (j = 0; j < 3; ++j)
/* Col=>Row, Row=>Col. */
r -> v[j].n[i] = DOUBLE_TO_FLOAT(v -> v[i].n[j]);
}
void MAT3fromFix(LPMAT3 r, LPWMAT3 v)
{
VEC3fromFix(&r -> v[0], &v -> v[0]);

Просмотреть файл

@ -58,7 +58,61 @@ void LCMSEXPORT cmsSetAlarmCodes(int r, int g, int b);
LCMSBOOL LCMSEXPORT cmsIsIntentSupported(cmsHPROFILE hProfile,
int Intent, int UsedDirection);
// Determine if we can build with SSE2 (this was partly copied from jmorecfg.h in
// mozilla/jpeg)
// -------------------------------------------------------------------------
#if defined(_M_IX86) && !defined(__GNUC__)
/* Get us a CPUID function. Avoid clobbering EBX because sometimes it's the PIC
register - I'm not sure if that ever happens on windows, but cpuid isn't
on the critical path so we just preserve the register to be safe and to be
consistent with the non-windows version. */
LCMS_INLINE void LCMSCPUID(DWORD fxn, LPDWORD a, LPDWORD b, LPDWORD c, LPDWORD d) {
DWORD a_, b_, c_, d_;
ASM {
xchg ebx, esi
mov eax, fxn
cpuid
mov a_, eax
mov b_, ebx
mov c_, ecx
mov d_, edx
xchg ebx, esi
}
*a = a_;
*b = b_;
*c = c_;
*d = d_;
}
#define HAVE_MMX_INTEL_MNEMONICS
/* SSE2 code appears broken for some cpus (bug 247437) */
#define HAVE_SSE2_INTEL_MNEMONICS
#define HAVE_SSE2_INTRINSICS
#endif
#if defined(__GNUC__) && defined(__i386__)
/* Get us a CPUID function. We can't use ebx because it's the PIC register on
some platforms, so we use ESI instead and save ebx to avoid clobbering it. */
LCMS_INLINE void LCMSCPUID(DWORD fxn, LPDWORD a, LPDWORD b, LPDWORD c, LPDWORD d) {
DWORD a_, b_, c_, d_;
__asm__ __volatile__ ("xchgl %%ebx, %%esi; cpuid; xchgl %%ebx, %%esi;"
: "=a" (a_), "=S" (b_), "=c" (c_), "=d" (d_) : "a" (fxn));
*a = a_;
*b = b_;
*c = c_;
*d = d_;
}
#define HAVE_SSE2_INTRINSICS
/* XXX - the below wasn't in jpeg/jmorecfg.h - why? */
#define HAVE_SSE2_INTEL_MNEMONICS
#endif /* ! GNUC && i386 */
// Alarm RGB codes
@ -89,6 +143,33 @@ static icTagSignature Preview[] = {icSigPreview0Tag,
static volatile double GlobalAdaptationState = 0;
// -------------------------Runtime SSE2 Detection-----------------------------
#define SSE2_EDX_MASK (1UL << 26)
static LCMSBOOL SSE2Available() {
static int isAvailable = -1;
DWORD a, b, c, d;
DWORD function = 0x00000001;
if (isAvailable == -1) {
// If we don't have compile-time support, we don't have runtime support
#ifndef HAVE_SSE2_INTEL_MNEMONICS
isAvailable = 0;
#else
/* We have CPUID macros defined if we have sse2 mnemonics. */
LCMSCPUID(function, &a, &b, &c, &d);
if (d & SSE2_EDX_MASK)
isAvailable = 1;
else
isAvailable = 0;
#endif
}
return (isAvailable) ? TRUE : FALSE;
}
// --------------------------------Stages--------------------------------------
// Following routines does implement several kind of steps inside
@ -501,8 +582,8 @@ void CachedXFORMGamutCheck(_LPcmsTRANSFORM p,
static
void MatrixShaperXFORM(_LPcmsTRANSFORM p,
LPVOID in,
LPVOID out, unsigned int Size)
LPVOID in,
LPVOID out, unsigned int Size)
{
register LPBYTE accum;
register LPBYTE output;
@ -522,25 +603,166 @@ void MatrixShaperXFORM(_LPcmsTRANSFORM p,
}
}
static const FLOAT floatScale = 65536.0f;
static const FLOAT * const floatScaleAddr = &floatScale; // Win32 ASM doesn't know how to take addressOf inline
#ifdef HAVE_SSE2_INTEL_MNEMONICS
static
void MatrixShaperXFORMFloat(_LPcmsTRANSFORM p,
LPVOID in,
LPVOID out, unsigned int Size)
{
register LPBYTE input, output;
register LPBYTE In, Out;
register unsigned int i;
LPMATSHAPER MatShaper;
input = (LPBYTE) in;
output = (LPBYTE) out;
In = (LPBYTE) in;
Out = (LPBYTE) out;
MatShaper = p -> SmeltMatShaper;
for (i=0; i < Size; i++)
{
cmsEvalMatShaperFloat(p -> SmeltMatShaper, input, output);
input += 3;
output += 3;
LPFVEC3 FloatVals = &MatShaper -> Matrix.FA.F->v[3]; // Access our secret aligned temp buffer
LPFVEC3 MatPtr = MatShaper -> Matrix.FA.F->v; // Matrix
LPFLOAT clampMax = &MatShaper -> clampMax;
LPDWORD tmp = (LPDWORD) FloatVals;
if (MatShaper -> dwFlags & MATSHAPER_HASINPSHAPER)
{
if (MatShaper->L2_Precache != NULL)
{
FloatVals->n[VX] = MatShaper->L2_Precache->Impl.LI16F_FORWARD.Cache[0][In[0]];
FloatVals->n[VY] = MatShaper->L2_Precache->Impl.LI16F_FORWARD.Cache[1][In[1]];
FloatVals->n[VZ] = MatShaper->L2_Precache->Impl.LI16F_FORWARD.Cache[2][In[2]];
}
else
{
FloatVals->n[VX] = ToFloatDomain(cmsLinearInterpLUT16(RGB_8_TO_16(In[0]), MatShaper -> L2[0], &MatShaper -> p2_16));
FloatVals->n[VY] = ToFloatDomain(cmsLinearInterpLUT16(RGB_8_TO_16(In[1]), MatShaper -> L2[1], &MatShaper -> p2_16));
FloatVals->n[VZ] = ToFloatDomain(cmsLinearInterpLUT16(RGB_8_TO_16(In[2]), MatShaper -> L2[2], &MatShaper -> p2_16));
}
}
else
{
FloatVals->n[VX] = ToFloatDomain(In[0]);
FloatVals->n[VY] = ToFloatDomain(In[1]);
FloatVals->n[VZ] = ToFloatDomain(In[2]);
}
if (MatShaper -> dwFlags & MATSHAPER_HASMATRIX)
{
#ifdef __GNUC__
__asm(
"movaps (%0), %%xmm1;\n\t" // Move the first matrix column to xmm1
"movaps 16(%0), %%xmm2;\n\t" // Move the second matrix column to xmm2
"movaps 32(%0), %%xmm3;\n\t" // move the third matrix column to xmm3
"movaps 48(%0), %%xmm0;\n\t" // Move the vector to xmm0
// Note - We have to copy and then shuffle because of the weird
// semantics of shufps
//
"movaps %%xmm0, %%xmm4;\n\t" // Copy the vector to xmm4
"shufps $0, %%xmm4, %%xmm4;\n\t" // Shuffle to repeat the first vector element repeated 4 times
"mulps %%xmm4, %%xmm1;\n\t" // Multiply the first vector element by the first matrix column
"movaps %%xmm0, %%xmm5; \n\t" // Copy the vector to xmm5
"shufps $0x55, %%xmm5, %%xmm5;\n\t" // Shuffle to repeat the second vector element repeated 4 times
"mulps %%xmm5, %%xmm2;\n\t" // Multiply the second vector element by the seccond matrix column
"movaps %%xmm0, %%xmm6;\n\t" // Copy the vector to xmm6
"shufps $0xAA, %%xmm6, %%xmm6;\n\t" // Shuffle to repeat the third vector element repeated 4 times
"mulps %%xmm6, %%xmm3;\n\t" // Multiply the third vector element by the third matrix column
"addps %%xmm3, %%xmm2;\n\t" // Sum (second + third) columns
"addps %%xmm2, %%xmm1;\n\t" // Sum ((second + third) + first) columns
"movss (%1), %%xmm7;\n\t" // load the floating point representation of 65535/65536
"shufps $0, %%xmm7, %%xmm7;\n\t" // move it into all of the four slots
"minps %%xmm7, %%xmm1;\n\t" // clamp the vector to 1.0 max
"xorps %%xmm6, %%xmm6;\n\t" // get us cleared bitpatern, which is 0.0f
"maxps %%xmm6, %%xmm1;\n\t" // clamp the vector to 0.0 min
"movss (%2), %%xmm5;\n\t" // load the floating point scale factor
"shufps $0, %%xmm5, %%xmm5;\n\t" // put it in all four slots
"mulps %%xmm5, %%xmm1;\n\t" // multiply by the scale factor
"cvtps2dq %%xmm1, %%xmm1;\n\t" // convert to integers
"movdqa %%xmm1, 48(%0);\n\t" // store
:
: "r" (MatPtr), "r" (clampMax), "r" (&floatScale)
: "memory"
);
#else
ASM {
mov eax, MatPtr
mov ecx, clampMax
mov edx, floatScaleAddr
movaps xmm1, [eax]
movaps xmm2, [eax + 16]
movaps xmm3, [eax + 32]
movaps xmm0, [eax + 48]
movaps xmm4, xmm0
shufps xmm4, xmm4, 0
mulps xmm1, xmm4
movaps xmm5, xmm0
shufps xmm5, xmm5, 0x55
mulps xmm2, xmm5
movaps xmm6, xmm0
shufps xmm6, xmm6, 0xAA
mulps xmm3, xmm6
addps xmm2, xmm3
addps xmm1, xmm2
movss xmm7, [ecx]
shufps xmm7, xmm7, 0
minps xmm1, xmm7
xorps xmm6, xmm6
maxps xmm1, xmm6
movss xmm5, [edx]
shufps xmm5, xmm5, 0
mulps xmm1, xmm5
cvtps2dq xmm1, xmm1
movdqa [eax + 48], xmm1
}
#endif
}
else
{
tmp[0] = _cmsClampWord(FromFloatDomain(FloatVals->n[VX]));
tmp[1] = _cmsClampWord(FromFloatDomain(FloatVals->n[VY]));
tmp[2] = _cmsClampWord(FromFloatDomain(FloatVals->n[VZ]));
}
if (MatShaper -> dwFlags & MATSHAPER_HASSHAPER)
{
if (MatShaper->L_Precache != NULL)
{
Out[0] = MatShaper->L_Precache->Impl.LI168_REVERSE.Cache[0][tmp[0]];
Out[1] = MatShaper->L_Precache->Impl.LI168_REVERSE.Cache[1][tmp[1]];
Out[2] = MatShaper->L_Precache->Impl.LI168_REVERSE.Cache[2][tmp[2]];
}
else
{
Out[0] = RGB_16_TO_8(cmsLinearInterpLUT16((WORD)tmp[0], MatShaper -> L[0], &MatShaper -> p16));
Out[1] = RGB_16_TO_8(cmsLinearInterpLUT16((WORD)tmp[1], MatShaper -> L[1], &MatShaper -> p16));
Out[2] = RGB_16_TO_8(cmsLinearInterpLUT16((WORD)tmp[2], MatShaper -> L[2], &MatShaper -> p16));
}
}
else
{
Out[0] = RGB_16_TO_8((WORD)tmp[0]);
Out[1] = RGB_16_TO_8((WORD)tmp[1]);
Out[2] = RGB_16_TO_8((WORD)tmp[2]);
}
In += 3;
Out += 3;
}
}
#endif
// Using Named color input table
@ -1296,8 +1518,29 @@ _LPcmsTRANSFORM PickTransformRoutine(_LPcmsTRANSFORM p,
(p -> ExitColorSpace == icSigRgbData) &&
!(p -> dwOriginalFlags & cmsFLAGS_BLACKPOINTCOMPENSATION)) {
// If the floating point path is requested, see if we support it
if (p -> dwOriginalFlags & cmsFLAGS_FLOATSHAPER)
{
#ifndef HAVE_SSE2_INTEL_MNEMONICS
// Turn it off if we can't compile it
p -> dwOriginalFlags &= ~cmsFLAGS_FLOATSHAPER;
#else
// Turn it off if we don't have it at runtime
if (!SSE2Available())
p -> dwOriginalFlags &= ~cmsFLAGS_FLOATSHAPER;
#endif
}
// Yes... try to smelt matrix-shapers
#ifndef HAVE_SSE2_INTEL_MNEMONICS
p -> xform = MatrixShaperXFORM;
#else
p -> xform = (p -> dwOriginalFlags & cmsFLAGS_FLOATSHAPER) ? MatrixShaperXFORMFloat : MatrixShaperXFORM;
#endif
p -> dwOriginalFlags |= cmsFLAGS_NOTPRECALC;
if (!cmsBuildSmeltMatShaper(p))