зеркало из https://github.com/mozilla/pjs.git
Optimized SSE2 assembly for LCMS - part of bug 445552. r=vlad
This commit is contained in:
Родитель
b22606db84
Коммит
47b7be01c1
|
@ -1571,6 +1571,7 @@ void cdecl MAT3eval(LPVEC3 r, LPMAT3 a, LPVEC3 v);
|
||||||
void cdecl MAT3evalF(LPFVEC3 r, LPFMAT3 a, LPFVEC3 v);
|
void cdecl MAT3evalF(LPFVEC3 r, LPFMAT3 a, LPFVEC3 v);
|
||||||
void cdecl MAT3toFix(LPWMAT3 r, LPMAT3 v);
|
void cdecl MAT3toFix(LPWMAT3 r, LPMAT3 v);
|
||||||
void cdecl MAT3toFloat(LPFMAT3 r, LPMAT3 v);
|
void cdecl MAT3toFloat(LPFMAT3 r, LPMAT3 v);
|
||||||
|
void cdecl MAT3toFloatTranspose(LPFMAT3 r, LPMAT3 v);
|
||||||
void cdecl MAT3fromFix(LPMAT3 r, LPWMAT3 v);
|
void cdecl MAT3fromFix(LPMAT3 r, LPWMAT3 v);
|
||||||
void cdecl MAT3evalW(LPWVEC3 r, LPWMAT3 a, LPWVEC3 v);
|
void cdecl MAT3evalW(LPWVEC3 r, LPWMAT3 a, LPWVEC3 v);
|
||||||
LCMSBOOL cdecl MAT3isIdentity(LPWMAT3 a, double Tolerance);
|
LCMSBOOL cdecl MAT3isIdentity(LPWMAT3 a, double Tolerance);
|
||||||
|
@ -1862,8 +1863,16 @@ typedef struct {
|
||||||
union {
|
union {
|
||||||
WMAT3 W;
|
WMAT3 W;
|
||||||
FMAT3A FA; // This is not a matrix proper - use FA.F to access the matrix pointer
|
FMAT3A FA; // This is not a matrix proper - use FA.F to access the matrix pointer
|
||||||
|
// Moreover, we store the transpose of the matrix instead, so the first
|
||||||
|
// vector corresponds to the first column instead of the first row.
|
||||||
} Matrix;
|
} Matrix;
|
||||||
|
|
||||||
|
FLOAT clampMax; // SSE2 doesn't have an efficient way to clamp using integers, so we have
|
||||||
|
// to clamp in the float domain. Unfortunately, since we eventually want
|
||||||
|
// our integer values clamped to 2^16 - 1, we need to clamp with a very
|
||||||
|
// precise value in the float domain. We let the CPU take care of it by calculating
|
||||||
|
// it at transform creation time rather than trusting the compiler.
|
||||||
|
|
||||||
L16PARAMS p16; // Primary curve
|
L16PARAMS p16; // Primary curve
|
||||||
LPWORD L[3];
|
LPWORD L[3];
|
||||||
LPLCMSPRECACHE L_Precache;
|
LPLCMSPRECACHE L_Precache;
|
||||||
|
@ -1880,7 +1889,6 @@ LPMATSHAPER cdecl cmsAllocMatShaper2(LPMAT3 matrix, LPGAMMATABLE In[], LPLCMSPRE
|
||||||
|
|
||||||
void cdecl cmsFreeMatShaper(LPMATSHAPER MatShaper);
|
void cdecl cmsFreeMatShaper(LPMATSHAPER MatShaper);
|
||||||
void cdecl cmsEvalMatShaper(LPMATSHAPER MatShaper, WORD In[], WORD Out[]);
|
void cdecl cmsEvalMatShaper(LPMATSHAPER MatShaper, WORD In[], WORD Out[]);
|
||||||
void cdecl cmsEvalMatShaperFloat(LPMATSHAPER MatShaper, BYTE In[], BYTE Out[]);
|
|
||||||
|
|
||||||
LCMSBOOL cdecl cmsReadICCMatrixRGB2XYZ(LPMAT3 r, cmsHPROFILE hProfile);
|
LCMSBOOL cdecl cmsReadICCMatrixRGB2XYZ(LPMAT3 r, cmsHPROFILE hProfile);
|
||||||
|
|
||||||
|
|
|
@ -103,9 +103,15 @@ LPMATSHAPER cmsAllocMatShaper2(LPMAT3 Matrix, LPGAMMATABLE In[], LPLCMSPRECACHE
|
||||||
// Fill matrix part
|
// Fill matrix part
|
||||||
if (Behaviour & MATSHAPER_FLOATMAT) {
|
if (Behaviour & MATSHAPER_FLOATMAT) {
|
||||||
FMAT3ASetup(&NewMatShaper->Matrix.FA);
|
FMAT3ASetup(&NewMatShaper->Matrix.FA);
|
||||||
MAT3toFloat(NewMatShaper -> Matrix.FA.F, Matrix);
|
MAT3toFloatTranspose(NewMatShaper -> Matrix.FA.F, Matrix);
|
||||||
if (!FMAT3isIdentity(NewMatShaper -> Matrix.FA.F, 0.00001f))
|
if (!FMAT3isIdentity(NewMatShaper -> Matrix.FA.F, 0.00001f))
|
||||||
NewMatShaper -> dwFlags |= MATSHAPER_HASMATRIX;
|
NewMatShaper -> dwFlags |= MATSHAPER_HASMATRIX;
|
||||||
|
|
||||||
|
// This needs to be calculated by the CPU or a very precise
|
||||||
|
// compiler. If it's too big (like 1.0), values are clamped
|
||||||
|
// to 65536 instead of 65535, and we either have an overflow of
|
||||||
|
// the precache bounds or scary downcasting.
|
||||||
|
NewMatShaper -> clampMax = ((FLOAT) (65536 - 1)) / 65536.0f;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
MAT3toFix(&NewMatShaper -> Matrix.W, Matrix);
|
MAT3toFix(&NewMatShaper -> Matrix.W, Matrix);
|
||||||
|
@ -397,76 +403,6 @@ void OutputBehaviour(LPMATSHAPER MatShaper, WORD In[], WORD Out[])
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void cmsEvalMatShaperFloat(LPMATSHAPER MatShaper, BYTE In[], BYTE Out[])
|
|
||||||
{
|
|
||||||
WORD tmp[3];
|
|
||||||
FVEC3 OutVect;
|
|
||||||
LPFVEC3 FloatVals = &MatShaper -> Matrix.FA.F->v[3]; // Access our secret aligned temp buffer
|
|
||||||
|
|
||||||
if (MatShaper -> dwFlags & MATSHAPER_HASINPSHAPER)
|
|
||||||
{
|
|
||||||
if (MatShaper->L2_Precache != NULL)
|
|
||||||
{
|
|
||||||
FloatVals->n[VX] = MatShaper->L2_Precache->Impl.LI16F_FORWARD.Cache[0][In[0]];
|
|
||||||
FloatVals->n[VY] = MatShaper->L2_Precache->Impl.LI16F_FORWARD.Cache[1][In[1]];
|
|
||||||
FloatVals->n[VZ] = MatShaper->L2_Precache->Impl.LI16F_FORWARD.Cache[2][In[2]];
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
FloatVals->n[VX] = ToFloatDomain(cmsLinearInterpLUT16(RGB_8_TO_16(In[0]), MatShaper -> L2[0], &MatShaper -> p2_16));
|
|
||||||
FloatVals->n[VY] = ToFloatDomain(cmsLinearInterpLUT16(RGB_8_TO_16(In[1]), MatShaper -> L2[1], &MatShaper -> p2_16));
|
|
||||||
FloatVals->n[VZ] = ToFloatDomain(cmsLinearInterpLUT16(RGB_8_TO_16(In[2]), MatShaper -> L2[2], &MatShaper -> p2_16));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
FloatVals->n[VX] = ToFloatDomain(In[0]);
|
|
||||||
FloatVals->n[VY] = ToFloatDomain(In[1]);
|
|
||||||
FloatVals->n[VZ] = ToFloatDomain(In[2]);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
if (MatShaper -> dwFlags & MATSHAPER_HASMATRIX)
|
|
||||||
{
|
|
||||||
|
|
||||||
MAT3evalF(&OutVect, MatShaper -> Matrix.FA.F, FloatVals);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
OutVect.n[VX] = FloatVals->n[VX];
|
|
||||||
OutVect.n[VY] = FloatVals->n[VY];
|
|
||||||
OutVect.n[VZ] = FloatVals->n[VZ];
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
tmp[0] = _cmsClampWord(FromFloatDomain(OutVect.n[VX]));
|
|
||||||
tmp[1] = _cmsClampWord(FromFloatDomain(OutVect.n[VY]));
|
|
||||||
tmp[2] = _cmsClampWord(FromFloatDomain(OutVect.n[VZ]));
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if (MatShaper -> dwFlags & MATSHAPER_HASSHAPER)
|
|
||||||
{
|
|
||||||
if (MatShaper->L_Precache != NULL)
|
|
||||||
{
|
|
||||||
Out[0] = MatShaper->L_Precache->Impl.LI168_REVERSE.Cache[0][tmp[0]];
|
|
||||||
Out[1] = MatShaper->L_Precache->Impl.LI168_REVERSE.Cache[1][tmp[1]];
|
|
||||||
Out[2] = MatShaper->L_Precache->Impl.LI168_REVERSE.Cache[2][tmp[2]];
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
Out[0] = RGB_16_TO_8(cmsLinearInterpLUT16(tmp[0], MatShaper -> L[0], &MatShaper -> p16));
|
|
||||||
Out[1] = RGB_16_TO_8(cmsLinearInterpLUT16(tmp[1], MatShaper -> L[1], &MatShaper -> p16));
|
|
||||||
Out[2] = RGB_16_TO_8(cmsLinearInterpLUT16(tmp[2], MatShaper -> L[2], &MatShaper -> p16));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
Out[0] = RGB_16_TO_8(tmp[0]);
|
|
||||||
Out[1] = RGB_16_TO_8(tmp[1]);
|
|
||||||
Out[2] = RGB_16_TO_8(tmp[2]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Master on evaluating shapers, 3 different behaviours
|
// Master on evaluating shapers, 3 different behaviours
|
||||||
|
|
||||||
|
|
|
@ -51,6 +51,7 @@ double cdecl MAT3det(LPMAT3 m);
|
||||||
void cdecl MAT3eval(LPVEC3 r, LPMAT3 a, LPVEC3 v);
|
void cdecl MAT3eval(LPVEC3 r, LPMAT3 a, LPVEC3 v);
|
||||||
void cdecl MAT3toFix(LPWMAT3 r, LPMAT3 v);
|
void cdecl MAT3toFix(LPWMAT3 r, LPMAT3 v);
|
||||||
void cdecl MAT3toFloat(LPFMAT3 r, LPMAT3 v);
|
void cdecl MAT3toFloat(LPFMAT3 r, LPMAT3 v);
|
||||||
|
void cdecl MAT3toFloatTranspose(LPFMAT3 r, LPMAT3 v);
|
||||||
void cdecl MAT3evalW(LPWVEC3 r, LPWMAT3 a, LPWVEC3 v);
|
void cdecl MAT3evalW(LPWVEC3 r, LPWMAT3 a, LPWVEC3 v);
|
||||||
void cdecl MAT3perK(LPMAT3 r, LPMAT3 v, double d);
|
void cdecl MAT3perK(LPMAT3 r, LPMAT3 v, double d);
|
||||||
void cdecl MAT3scaleAndCut(LPWMAT3 r, LPMAT3 v, double d);
|
void cdecl MAT3scaleAndCut(LPWMAT3 r, LPMAT3 v, double d);
|
||||||
|
@ -861,6 +862,20 @@ void MAT3toFloat(LPFMAT3 r, LPMAT3 v)
|
||||||
VEC3toFloat(&r -> v[2], &v -> v[2]);
|
VEC3toFloat(&r -> v[2], &v -> v[2]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Converts the double-precision matrix v to single precision and writes the
 * TRANSPOSE of it into r: element (row, col) of v lands at (col, row) of r.
 *
 * r : destination float matrix.
 * v : source double matrix.
 */
void MAT3toFloatTranspose(LPFMAT3 r, LPMAT3 v)
{
    unsigned row, col;

    /* Walk the source row by row ... */
    for (row = 0; row < 3; row++) {

        /* ... and scatter each element of that row into the matching column slot. */
        for (col = 0; col < 3; col++) {
            r -> v[col].n[row] = DOUBLE_TO_FLOAT(v -> v[row].n[col]);
        }
    }
}
|
||||||
|
|
||||||
void MAT3fromFix(LPMAT3 r, LPWMAT3 v)
|
void MAT3fromFix(LPMAT3 r, LPWMAT3 v)
|
||||||
{
|
{
|
||||||
VEC3fromFix(&r -> v[0], &v -> v[0]);
|
VEC3fromFix(&r -> v[0], &v -> v[0]);
|
||||||
|
|
|
@ -58,7 +58,61 @@ void LCMSEXPORT cmsSetAlarmCodes(int r, int g, int b);
|
||||||
LCMSBOOL LCMSEXPORT cmsIsIntentSupported(cmsHPROFILE hProfile,
|
LCMSBOOL LCMSEXPORT cmsIsIntentSupported(cmsHPROFILE hProfile,
|
||||||
int Intent, int UsedDirection);
|
int Intent, int UsedDirection);
|
||||||
|
|
||||||
|
// Determine if we can build with SSE2 (this was partly copied from jmorecfg.h in
|
||||||
|
// mozilla/jpeg)
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
#if defined(_M_IX86) && !defined(__GNUC__)
|
||||||
|
|
||||||
|
/* Get us a CPUID function. Avoid clobbering EBX because sometimes it's the PIC
|
||||||
|
register - I'm not sure if that ever happens on windows, but cpuid isn't
|
||||||
|
on the critical path so we just preserve the register to be safe and to be
|
||||||
|
consistent with the non-windows version. */
|
||||||
|
/* Runs CPUID with EAX = fxn and hands back the four result registers.

   fxn        : CPUID function number loaded into EAX.
   a, b, c, d : receive EAX, EBX, ECX and EDX respectively.

   EBX is swapped with ESI before CPUID and swapped back afterwards, so both
   registers hold their original values again when the asm block ends. */
LCMS_INLINE void LCMSCPUID(DWORD fxn, LPDWORD a, LPDWORD b, LPDWORD c, LPDWORD d) {

    DWORD regEax, regEbx, regEcx, regEdx;

    /* Sequence: stash EBX in ESI, issue CPUID, copy the four results out,
       then undo the swap. */
    ASM {
        xchg ebx, esi
        mov eax, fxn
        cpuid
        mov regEax, eax
        mov regEbx, ebx
        mov regEcx, ecx
        mov regEdx, edx
        xchg ebx, esi
    }

    *a = regEax;
    *b = regEbx;
    *c = regEcx;
    *d = regEdx;
}
|
||||||
|
|
||||||
|
#define HAVE_MMX_INTEL_MNEMONICS
|
||||||
|
|
||||||
|
/* SSE2 code appears broken for some cpus (bug 247437) */
|
||||||
|
#define HAVE_SSE2_INTEL_MNEMONICS
|
||||||
|
#define HAVE_SSE2_INTRINSICS
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__GNUC__) && defined(__i386__)
|
||||||
|
|
||||||
|
/* Get us a CPUID function. We can't use ebx because it's the PIC register on
|
||||||
|
some platforms, so we use ESI instead and save ebx to avoid clobbering it. */
|
||||||
|
/* Runs CPUID with EAX = fxn and hands back the four result registers.

   fxn        : CPUID function number, passed in EAX.
   a, b, c, d : receive EAX, EBX, ECX and EDX respectively.

   EBX is never named in the constraints: it is exchanged with ESI around the
   CPUID instruction, so the EBX result is delivered through ESI ("=S") and
   EBX itself is restored before the asm statement ends. */
LCMS_INLINE void LCMSCPUID(DWORD fxn, LPDWORD a, LPDWORD b, LPDWORD c, LPDWORD d) {

    DWORD regEax, regEbx, regEcx, regEdx;

    __asm__ __volatile__ (
        "xchgl %%ebx, %%esi; cpuid; xchgl %%ebx, %%esi;"
        : "=a" (regEax), "=S" (regEbx), "=c" (regEcx), "=d" (regEdx)
        : "a" (fxn));

    *a = regEax;
    *b = regEbx;
    *c = regEcx;
    *d = regEdx;
}
|
||||||
|
|
||||||
|
#define HAVE_SSE2_INTRINSICS
|
||||||
|
/* XXX - the below wasn't in jpeg/jmorecfg.h - why? */
|
||||||
|
#define HAVE_SSE2_INTEL_MNEMONICS
|
||||||
|
#endif /* ! GNUC && i386 */
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// Alarm RGB codes
|
// Alarm RGB codes
|
||||||
|
@ -89,6 +143,33 @@ static icTagSignature Preview[] = {icSigPreview0Tag,
|
||||||
|
|
||||||
static volatile double GlobalAdaptationState = 0;
|
static volatile double GlobalAdaptationState = 0;
|
||||||
|
|
||||||
|
// -------------------------Runtime SSE2 Detection-----------------------------
|
||||||
|
|
||||||
|
#define SSE2_EDX_MASK (1UL << 26)
|
||||||
|
static LCMSBOOL SSE2Available() {
|
||||||
|
|
||||||
|
static int isAvailable = -1;
|
||||||
|
DWORD a, b, c, d;
|
||||||
|
DWORD function = 0x00000001;
|
||||||
|
|
||||||
|
if (isAvailable == -1) {
|
||||||
|
|
||||||
|
// If we don't have compile-time support, we don't have runtime support
|
||||||
|
#ifndef HAVE_SSE2_INTEL_MNEMONICS
|
||||||
|
isAvailable = 0;
|
||||||
|
#else
|
||||||
|
/* We have CPUID macros defined if we have sse2 mnemonics. */
|
||||||
|
LCMSCPUID(function, &a, &b, &c, &d);
|
||||||
|
if (d & SSE2_EDX_MASK)
|
||||||
|
isAvailable = 1;
|
||||||
|
else
|
||||||
|
isAvailable = 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
return (isAvailable) ? TRUE : FALSE;
|
||||||
|
}
|
||||||
|
|
||||||
// --------------------------------Stages--------------------------------------
|
// --------------------------------Stages--------------------------------------
|
||||||
|
|
||||||
// Following routines does implement several kind of steps inside
|
// Following routines does implement several kind of steps inside
|
||||||
|
@ -501,8 +582,8 @@ void CachedXFORMGamutCheck(_LPcmsTRANSFORM p,
|
||||||
|
|
||||||
static
|
static
|
||||||
void MatrixShaperXFORM(_LPcmsTRANSFORM p,
|
void MatrixShaperXFORM(_LPcmsTRANSFORM p,
|
||||||
LPVOID in,
|
LPVOID in,
|
||||||
LPVOID out, unsigned int Size)
|
LPVOID out, unsigned int Size)
|
||||||
{
|
{
|
||||||
register LPBYTE accum;
|
register LPBYTE accum;
|
||||||
register LPBYTE output;
|
register LPBYTE output;
|
||||||
|
@ -522,25 +603,166 @@ void MatrixShaperXFORM(_LPcmsTRANSFORM p,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static const FLOAT floatScale = 65536.0f;
|
||||||
|
static const FLOAT * const floatScaleAddr = &floatScale; // Win32 ASM doesn't know how to take addressOf inline
|
||||||
|
|
||||||
|
#ifdef HAVE_SSE2_INTEL_MNEMONICS
|
||||||
static
|
static
|
||||||
void MatrixShaperXFORMFloat(_LPcmsTRANSFORM p,
|
void MatrixShaperXFORMFloat(_LPcmsTRANSFORM p,
|
||||||
LPVOID in,
|
LPVOID in,
|
||||||
LPVOID out, unsigned int Size)
|
LPVOID out, unsigned int Size)
|
||||||
{
|
{
|
||||||
register LPBYTE input, output;
|
register LPBYTE In, Out;
|
||||||
register unsigned int i;
|
register unsigned int i;
|
||||||
|
LPMATSHAPER MatShaper;
|
||||||
|
|
||||||
|
|
||||||
input = (LPBYTE) in;
|
In = (LPBYTE) in;
|
||||||
output = (LPBYTE) out;
|
Out = (LPBYTE) out;
|
||||||
|
MatShaper = p -> SmeltMatShaper;
|
||||||
|
|
||||||
for (i=0; i < Size; i++)
|
for (i=0; i < Size; i++)
|
||||||
{
|
{
|
||||||
cmsEvalMatShaperFloat(p -> SmeltMatShaper, input, output);
|
|
||||||
input += 3;
|
LPFVEC3 FloatVals = &MatShaper -> Matrix.FA.F->v[3]; // Access our secret aligned temp buffer
|
||||||
output += 3;
|
LPFVEC3 MatPtr = MatShaper -> Matrix.FA.F->v; // Matrix
|
||||||
|
LPFLOAT clampMax = &MatShaper -> clampMax;
|
||||||
|
LPDWORD tmp = (LPDWORD) FloatVals;
|
||||||
|
|
||||||
|
if (MatShaper -> dwFlags & MATSHAPER_HASINPSHAPER)
|
||||||
|
{
|
||||||
|
if (MatShaper->L2_Precache != NULL)
|
||||||
|
{
|
||||||
|
FloatVals->n[VX] = MatShaper->L2_Precache->Impl.LI16F_FORWARD.Cache[0][In[0]];
|
||||||
|
FloatVals->n[VY] = MatShaper->L2_Precache->Impl.LI16F_FORWARD.Cache[1][In[1]];
|
||||||
|
FloatVals->n[VZ] = MatShaper->L2_Precache->Impl.LI16F_FORWARD.Cache[2][In[2]];
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
FloatVals->n[VX] = ToFloatDomain(cmsLinearInterpLUT16(RGB_8_TO_16(In[0]), MatShaper -> L2[0], &MatShaper -> p2_16));
|
||||||
|
FloatVals->n[VY] = ToFloatDomain(cmsLinearInterpLUT16(RGB_8_TO_16(In[1]), MatShaper -> L2[1], &MatShaper -> p2_16));
|
||||||
|
FloatVals->n[VZ] = ToFloatDomain(cmsLinearInterpLUT16(RGB_8_TO_16(In[2]), MatShaper -> L2[2], &MatShaper -> p2_16));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
FloatVals->n[VX] = ToFloatDomain(In[0]);
|
||||||
|
FloatVals->n[VY] = ToFloatDomain(In[1]);
|
||||||
|
FloatVals->n[VZ] = ToFloatDomain(In[2]);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (MatShaper -> dwFlags & MATSHAPER_HASMATRIX)
|
||||||
|
{
|
||||||
|
#ifdef __GNUC__
|
||||||
|
__asm(
|
||||||
|
"movaps (%0), %%xmm1;\n\t" // Move the first matrix column to xmm1
|
||||||
|
"movaps 16(%0), %%xmm2;\n\t" // Move the second matrix column to xmm2
|
||||||
|
"movaps 32(%0), %%xmm3;\n\t" // move the third matrix column to xmm3
|
||||||
|
"movaps 48(%0), %%xmm0;\n\t" // Move the vector to xmm0
|
||||||
|
|
||||||
|
// Note - We have to copy and then shuffle because of the weird
|
||||||
|
// semantics of shufps
|
||||||
|
//
|
||||||
|
"movaps %%xmm0, %%xmm4;\n\t" // Copy the vector to xmm4
|
||||||
|
"shufps $0, %%xmm4, %%xmm4;\n\t" // Shuffle to repeat the first vector element repeated 4 times
|
||||||
|
"mulps %%xmm4, %%xmm1;\n\t" // Multiply the first vector element by the first matrix column
|
||||||
|
"movaps %%xmm0, %%xmm5; \n\t" // Copy the vector to xmm5
|
||||||
|
"shufps $0x55, %%xmm5, %%xmm5;\n\t" // Shuffle to repeat the second vector element repeated 4 times
|
||||||
|
"mulps %%xmm5, %%xmm2;\n\t" // Multiply the second vector element by the seccond matrix column
|
||||||
|
"movaps %%xmm0, %%xmm6;\n\t" // Copy the vector to xmm6
|
||||||
|
"shufps $0xAA, %%xmm6, %%xmm6;\n\t" // Shuffle to repeat the third vector element repeated 4 times
|
||||||
|
"mulps %%xmm6, %%xmm3;\n\t" // Multiply the third vector element by the third matrix column
|
||||||
|
|
||||||
|
"addps %%xmm3, %%xmm2;\n\t" // Sum (second + third) columns
|
||||||
|
"addps %%xmm2, %%xmm1;\n\t" // Sum ((second + third) + first) columns
|
||||||
|
|
||||||
|
"movss (%1), %%xmm7;\n\t" // load the floating point representation of 65535/65536
|
||||||
|
"shufps $0, %%xmm7, %%xmm7;\n\t" // move it into all of the four slots
|
||||||
|
"minps %%xmm7, %%xmm1;\n\t" // clamp the vector to 1.0 max
|
||||||
|
"xorps %%xmm6, %%xmm6;\n\t" // get us cleared bitpatern, which is 0.0f
|
||||||
|
"maxps %%xmm6, %%xmm1;\n\t" // clamp the vector to 0.0 min
|
||||||
|
"movss (%2), %%xmm5;\n\t" // load the floating point scale factor
|
||||||
|
"shufps $0, %%xmm5, %%xmm5;\n\t" // put it in all four slots
|
||||||
|
"mulps %%xmm5, %%xmm1;\n\t" // multiply by the scale factor
|
||||||
|
"cvtps2dq %%xmm1, %%xmm1;\n\t" // convert to integers
|
||||||
|
"movdqa %%xmm1, 48(%0);\n\t" // store
|
||||||
|
|
||||||
|
:
|
||||||
|
: "r" (MatPtr), "r" (clampMax), "r" (&floatScale)
|
||||||
|
: "memory"
|
||||||
|
);
|
||||||
|
#else
|
||||||
|
ASM {
|
||||||
|
mov eax, MatPtr
|
||||||
|
mov ecx, clampMax
|
||||||
|
mov edx, floatScaleAddr
|
||||||
|
|
||||||
|
movaps xmm1, [eax]
|
||||||
|
movaps xmm2, [eax + 16]
|
||||||
|
movaps xmm3, [eax + 32]
|
||||||
|
movaps xmm0, [eax + 48]
|
||||||
|
|
||||||
|
movaps xmm4, xmm0
|
||||||
|
shufps xmm4, xmm4, 0
|
||||||
|
mulps xmm1, xmm4
|
||||||
|
movaps xmm5, xmm0
|
||||||
|
shufps xmm5, xmm5, 0x55
|
||||||
|
mulps xmm2, xmm5
|
||||||
|
movaps xmm6, xmm0
|
||||||
|
shufps xmm6, xmm6, 0xAA
|
||||||
|
mulps xmm3, xmm6
|
||||||
|
|
||||||
|
addps xmm2, xmm3
|
||||||
|
addps xmm1, xmm2
|
||||||
|
|
||||||
|
movss xmm7, [ecx]
|
||||||
|
shufps xmm7, xmm7, 0
|
||||||
|
minps xmm1, xmm7
|
||||||
|
xorps xmm6, xmm6
|
||||||
|
maxps xmm1, xmm6
|
||||||
|
movss xmm5, [edx]
|
||||||
|
shufps xmm5, xmm5, 0
|
||||||
|
mulps xmm1, xmm5
|
||||||
|
cvtps2dq xmm1, xmm1
|
||||||
|
movdqa [eax + 48], xmm1
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
tmp[0] = _cmsClampWord(FromFloatDomain(FloatVals->n[VX]));
|
||||||
|
tmp[1] = _cmsClampWord(FromFloatDomain(FloatVals->n[VY]));
|
||||||
|
tmp[2] = _cmsClampWord(FromFloatDomain(FloatVals->n[VZ]));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (MatShaper -> dwFlags & MATSHAPER_HASSHAPER)
|
||||||
|
{
|
||||||
|
if (MatShaper->L_Precache != NULL)
|
||||||
|
{
|
||||||
|
Out[0] = MatShaper->L_Precache->Impl.LI168_REVERSE.Cache[0][tmp[0]];
|
||||||
|
Out[1] = MatShaper->L_Precache->Impl.LI168_REVERSE.Cache[1][tmp[1]];
|
||||||
|
Out[2] = MatShaper->L_Precache->Impl.LI168_REVERSE.Cache[2][tmp[2]];
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
Out[0] = RGB_16_TO_8(cmsLinearInterpLUT16((WORD)tmp[0], MatShaper -> L[0], &MatShaper -> p16));
|
||||||
|
Out[1] = RGB_16_TO_8(cmsLinearInterpLUT16((WORD)tmp[1], MatShaper -> L[1], &MatShaper -> p16));
|
||||||
|
Out[2] = RGB_16_TO_8(cmsLinearInterpLUT16((WORD)tmp[2], MatShaper -> L[2], &MatShaper -> p16));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
Out[0] = RGB_16_TO_8((WORD)tmp[0]);
|
||||||
|
Out[1] = RGB_16_TO_8((WORD)tmp[1]);
|
||||||
|
Out[2] = RGB_16_TO_8((WORD)tmp[2]);
|
||||||
|
}
|
||||||
|
|
||||||
|
In += 3;
|
||||||
|
Out += 3;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
// Using Named color input table
|
// Using Named color input table
|
||||||
|
@ -1296,8 +1518,29 @@ _LPcmsTRANSFORM PickTransformRoutine(_LPcmsTRANSFORM p,
|
||||||
(p -> ExitColorSpace == icSigRgbData) &&
|
(p -> ExitColorSpace == icSigRgbData) &&
|
||||||
!(p -> dwOriginalFlags & cmsFLAGS_BLACKPOINTCOMPENSATION)) {
|
!(p -> dwOriginalFlags & cmsFLAGS_BLACKPOINTCOMPENSATION)) {
|
||||||
|
|
||||||
|
|
||||||
|
// If the floating point path is requested, see if we support it
|
||||||
|
if (p -> dwOriginalFlags & cmsFLAGS_FLOATSHAPER)
|
||||||
|
{
|
||||||
|
|
||||||
|
#ifndef HAVE_SSE2_INTEL_MNEMONICS
|
||||||
|
// Turn it off if we can't compile it
|
||||||
|
p -> dwOriginalFlags &= ~cmsFLAGS_FLOATSHAPER;
|
||||||
|
#else
|
||||||
|
// Turn it off if we don't have it at runtime
|
||||||
|
if (!SSE2Available())
|
||||||
|
p -> dwOriginalFlags &= ~cmsFLAGS_FLOATSHAPER;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
// Yes... try to smelt matrix-shapers
|
// Yes... try to smelt matrix-shapers
|
||||||
|
|
||||||
|
#ifndef HAVE_SSE2_INTEL_MNEMONICS
|
||||||
|
p -> xform = MatrixShaperXFORM;
|
||||||
|
#else
|
||||||
p -> xform = (p -> dwOriginalFlags & cmsFLAGS_FLOATSHAPER) ? MatrixShaperXFORMFloat : MatrixShaperXFORM;
|
p -> xform = (p -> dwOriginalFlags & cmsFLAGS_FLOATSHAPER) ? MatrixShaperXFORMFloat : MatrixShaperXFORM;
|
||||||
|
#endif
|
||||||
|
|
||||||
p -> dwOriginalFlags |= cmsFLAGS_NOTPRECALC;
|
p -> dwOriginalFlags |= cmsFLAGS_NOTPRECALC;
|
||||||
|
|
||||||
if (!cmsBuildSmeltMatShaper(p))
|
if (!cmsBuildSmeltMatShaper(p))
|
||||||
|
|
Загрузка…
Ссылка в новой задаче