зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1551088 - Part 1. Expose SwizzleRow and PremultiplyRow variants. r=lsalzman
The image decoders produce surfaces row by row, so a variant to get a function pointer to perform swizzle/premultiply operations makes more ergonomic sense. Differential Revision: https://phabricator.services.mozilla.com/D46444 --HG-- extra : moz-landing-system : lando
This commit is contained in:
Родитель
b0ffdb3a87
Коммит
413fb28670
|
@ -41,6 +41,10 @@ namespace gfx {
|
|||
#define FORMAT_CASE(aSrcFormat, aDstFormat, ...) \
|
||||
FORMAT_CASE_EXPR(aSrcFormat, aDstFormat, FORMAT_CASE_CALL(__VA_ARGS__))
|
||||
|
||||
#define FORMAT_CASE_ROW(aSrcFormat, aDstFormat, ...) \
|
||||
case FORMAT_KEY(aSrcFormat, aDstFormat): \
|
||||
return &__VA_ARGS__;
|
||||
|
||||
/**
|
||||
* Constexpr functions for analyzing format attributes in templates.
|
||||
*/
|
||||
|
@ -114,6 +118,15 @@ void Premultiply_SSE2(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
|
|||
Premultiply_SSE2<ShouldSwapRB(aSrcFormat, aDstFormat), \
|
||||
ShouldForceOpaque(aSrcFormat, aDstFormat)>)
|
||||
|
||||
template <bool aSwapRB, bool aOpaqueAlpha>
|
||||
void PremultiplyRow_SSE2(const uint8_t*, uint8_t*, int32_t);
|
||||
|
||||
# define PREMULTIPLY_ROW_SSE2(aSrcFormat, aDstFormat) \
|
||||
FORMAT_CASE_ROW( \
|
||||
aSrcFormat, aDstFormat, \
|
||||
PremultiplyRow_SSE2<ShouldSwapRB(aSrcFormat, aDstFormat), \
|
||||
ShouldForceOpaque(aSrcFormat, aDstFormat)>)
|
||||
|
||||
template <bool aSwapRB>
|
||||
void Unpremultiply_SSE2(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
|
||||
|
||||
|
@ -129,6 +142,15 @@ void Swizzle_SSE2(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
|
|||
Swizzle_SSE2<ShouldSwapRB(aSrcFormat, aDstFormat), \
|
||||
ShouldForceOpaque(aSrcFormat, aDstFormat)>)
|
||||
|
||||
template <bool aSwapRB, bool aOpaqueAlpha>
|
||||
void SwizzleRow_SSE2(const uint8_t*, uint8_t*, int32_t);
|
||||
|
||||
# define SWIZZLE_ROW_SSE2(aSrcFormat, aDstFormat) \
|
||||
FORMAT_CASE_ROW( \
|
||||
aSrcFormat, aDstFormat, \
|
||||
SwizzleRow_SSE2<ShouldSwapRB(aSrcFormat, aDstFormat), \
|
||||
ShouldForceOpaque(aSrcFormat, aDstFormat)>)
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef USE_NEON
|
||||
|
@ -144,6 +166,15 @@ void Premultiply_NEON(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
|
|||
Premultiply_NEON<ShouldSwapRB(aSrcFormat, aDstFormat), \
|
||||
ShouldForceOpaque(aSrcFormat, aDstFormat)>)
|
||||
|
||||
template <bool aSwapRB, bool aOpaqueAlpha>
|
||||
void PremultiplyRow_NEON(const uint8_t*, uint8_t*, int32_t);
|
||||
|
||||
# define PREMULTIPLY_ROW_NEON(aSrcFormat, aDstFormat) \
|
||||
FORMAT_CASE_ROW( \
|
||||
aSrcFormat, aDstFormat, \
|
||||
PremultiplyRow_NEON<ShouldSwapRB(aSrcFormat, aDstFormat), \
|
||||
ShouldForceOpaque(aSrcFormat, aDstFormat)>)
|
||||
|
||||
template <bool aSwapRB>
|
||||
void Unpremultiply_NEON(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
|
||||
|
||||
|
@ -159,6 +190,14 @@ void Swizzle_NEON(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
|
|||
Swizzle_NEON<ShouldSwapRB(aSrcFormat, aDstFormat), \
|
||||
ShouldForceOpaque(aSrcFormat, aDstFormat)>)
|
||||
|
||||
template <bool aSwapRB, bool aOpaqueAlpha>
|
||||
void SwizzleRow_NEON(const uint8_t*, uint8_t*, int32_t);
|
||||
|
||||
# define SWIZZLE_ROW_NEON(aSrcFormat, aDstFormat) \
|
||||
FORMAT_CASE_ROW( \
|
||||
aSrcFormat, aDstFormat, \
|
||||
SwizzleRow_NEON<ShouldSwapRB(aSrcFormat, aDstFormat), \
|
||||
ShouldForceOpaque(aSrcFormat, aDstFormat)>)
|
||||
#endif
|
||||
|
||||
/**
|
||||
|
@ -171,51 +210,65 @@ void Swizzle_NEON(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
|
|||
// 2-component vectors. Otherwise, an approximation if divide-by-255 is used
|
||||
// which is faster than an actual division. These optimizations are also used
|
||||
// for the SSE2 and NEON implementations.
|
||||
// Scalar fallback that premultiplies aLength 32-bit pixels from aSrc into
// aDst, advancing both pointers past the processed data. Template parameters
// select R/B swapping, forced-opaque output, and the bit positions of the RGB
// and alpha components in the source and destination layouts.
// Note: the do/while body always runs at least once, so callers must pass
// aLength >= 1.
template <bool aSwapRB, bool aOpaqueAlpha, uint32_t aSrcRGBShift,
          uint32_t aSrcAShift, uint32_t aDstRGBShift, uint32_t aDstAShift>
static void PremultiplyChunkFallback(const uint8_t*& aSrc, uint8_t*& aDst,
                                     int32_t aLength) {
  const uint8_t* end = aSrc + 4 * aLength;
  do {
    // Load and process 1 entire pixel at a time.
    uint32_t color = *reinterpret_cast<const uint32_t*>(aSrc);

    // Extract alpha from wherever the source layout keeps it.
    uint32_t a = aSrcAShift ? color >> aSrcAShift : color & 0xFF;

    // Isolate the R and B components.
    uint32_t rb = (color >> aSrcRGBShift) & 0x00FF00FF;
    // Swap the order of R and B if necessary.
    if (aSwapRB) {
      rb = (rb >> 16) | (rb << 16);
    }
    // Approximate the multiply by alpha and divide by 255 which is
    // essentially:
    //   c = c*a + 255; c = (c + (c >> 8)) >> 8;
    // However, we omit the final >> 8 to fold it with the final shift into
    // place depending on desired output format.
    rb = rb * a + 0x00FF00FF;
    rb = (rb + ((rb >> 8) & 0x00FF00FF)) & 0xFF00FF00;

    // Use the same approximation as above, but G is shifted 8 bits left.
    // Alpha is left out and handled separately. Unsigned constants are used
    // so that shifting a mask toward bit 31 (aSrcRGBShift may be 8) never
    // left-shifts into the sign bit of a signed int.
    uint32_t g = color & (0xFF00u << aSrcRGBShift);
    g = g * a + (0xFF00u << aSrcRGBShift);
    g = (g + (g >> 8)) & (0xFF0000u << aSrcRGBShift);

    // The above math leaves RGB shifted left by 8 bits.
    // Shift them right if required for the output format,
    // then combine them back together to produce the output pixel.
    // Add the alpha back on if the output format is not opaque.
    // 0xFFu: aDstAShift may be 24, so keep the shift unsigned.
    *reinterpret_cast<uint32_t*>(aDst) =
        (rb >> (8 - aDstRGBShift)) | (g >> (8 + aSrcRGBShift - aDstRGBShift)) |
        (aOpaqueAlpha ? 0xFFu << aDstAShift : a << aDstAShift);

    aSrc += 4;
    aDst += 4;
  } while (aSrc < end);
}
|
||||
|
||||
template <bool aSwapRB, bool aOpaqueAlpha, uint32_t aSrcRGBShift,
|
||||
uint32_t aSrcAShift, uint32_t aDstRGBShift, uint32_t aDstAShift>
|
||||
static void PremultiplyRowFallback(const uint8_t* aSrc, uint8_t* aDst,
|
||||
int32_t aLength) {
|
||||
PremultiplyChunkFallback<aSwapRB, aOpaqueAlpha, aSrcRGBShift, aSrcAShift,
|
||||
aDstRGBShift, aDstAShift>(aSrc, aDst, aLength);
|
||||
}
|
||||
|
||||
template <bool aSwapRB, bool aOpaqueAlpha, uint32_t aSrcRGBShift,
|
||||
uint32_t aSrcAShift, uint32_t aDstRGBShift, uint32_t aDstAShift>
|
||||
static void PremultiplyFallback(const uint8_t* aSrc, int32_t aSrcGap,
|
||||
uint8_t* aDst, int32_t aDstGap, IntSize aSize) {
|
||||
for (int32_t height = aSize.height; height > 0; height--) {
|
||||
const uint8_t* end = aSrc + 4 * aSize.width;
|
||||
do {
|
||||
// Load and process 1 entire pixel at a time.
|
||||
uint32_t color = *reinterpret_cast<const uint32_t*>(aSrc);
|
||||
|
||||
uint32_t a = aSrcAShift ? color >> aSrcAShift : color & 0xFF;
|
||||
|
||||
// Isolate the R and B components.
|
||||
uint32_t rb = (color >> aSrcRGBShift) & 0x00FF00FF;
|
||||
// Swap the order of R and B if necessary.
|
||||
if (aSwapRB) {
|
||||
rb = (rb >> 16) | (rb << 16);
|
||||
}
|
||||
// Approximate the multiply by alpha and divide by 255 which is
|
||||
// essentially:
|
||||
// c = c*a + 255; c = (c + (c >> 8)) >> 8;
|
||||
// However, we omit the final >> 8 to fold it with the final shift into
|
||||
// place depending on desired output format.
|
||||
rb = rb * a + 0x00FF00FF;
|
||||
rb = (rb + ((rb >> 8) & 0x00FF00FF)) & 0xFF00FF00;
|
||||
|
||||
// Use same approximation as above, but G is shifted 8 bits left.
|
||||
// Alpha is left out and handled separately.
|
||||
uint32_t g = color & (0xFF00 << aSrcRGBShift);
|
||||
g = g * a + (0xFF00 << aSrcRGBShift);
|
||||
g = (g + (g >> 8)) & (0xFF0000 << aSrcRGBShift);
|
||||
|
||||
// The above math leaves RGB shifted left by 8 bits.
|
||||
// Shift them right if required for the output format.
|
||||
// then combine them back together to produce output pixel.
|
||||
// Add the alpha back on if the output format is not opaque.
|
||||
*reinterpret_cast<uint32_t*>(aDst) =
|
||||
(rb >> (8 - aDstRGBShift)) |
|
||||
(g >> (8 + aSrcRGBShift - aDstRGBShift)) |
|
||||
(aOpaqueAlpha ? 0xFF << aDstAShift : a << aDstAShift);
|
||||
|
||||
aSrc += 4;
|
||||
aDst += 4;
|
||||
} while (aSrc < end);
|
||||
|
||||
PremultiplyChunkFallback<aSwapRB, aOpaqueAlpha, aSrcRGBShift, aSrcAShift,
|
||||
aDstRGBShift, aDstAShift>(aSrc, aDst, aSize.width);
|
||||
aSrc += aSrcGap;
|
||||
aDst += aDstGap;
|
||||
}
|
||||
|
@ -237,6 +290,22 @@ static void PremultiplyFallback(const uint8_t* aSrc, int32_t aSrcGap,
|
|||
PREMULTIPLY_FALLBACK_CASE(aSrcFormat, SurfaceFormat::A8R8G8B8) \
|
||||
PREMULTIPLY_FALLBACK_CASE(aSrcFormat, SurfaceFormat::X8R8G8B8)
|
||||
|
||||
#define PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, aDstFormat) \
|
||||
FORMAT_CASE_ROW(aSrcFormat, aDstFormat, \
|
||||
PremultiplyRowFallback< \
|
||||
ShouldSwapRB(aSrcFormat, aDstFormat), \
|
||||
ShouldForceOpaque(aSrcFormat, aDstFormat), \
|
||||
RGBBitShift(aSrcFormat), AlphaBitShift(aSrcFormat), \
|
||||
RGBBitShift(aDstFormat), AlphaBitShift(aDstFormat)>)
|
||||
|
||||
#define PREMULTIPLY_ROW_FALLBACK(aSrcFormat) \
|
||||
PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::B8G8R8A8) \
|
||||
PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::B8G8R8X8) \
|
||||
PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::R8G8B8A8) \
|
||||
PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::R8G8B8X8) \
|
||||
PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::A8R8G8B8) \
|
||||
PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::X8R8G8B8)
|
||||
|
||||
// If rows are tightly packed, and the size of the total area will fit within
|
||||
// the precision range of a single row, then process all the data as if it was
|
||||
// a single row.
|
||||
|
@ -323,6 +392,50 @@ bool PremultiplyData(const uint8_t* aSrc, int32_t aSrcStride,
|
|||
return false;
|
||||
}
|
||||
|
||||
/**
 * Returns a function pointer that premultiplies a single row of pixels while
 * converting from aSrcFormat to aDstFormat. SIMD variants are preferred when
 * the CPU supports them at runtime; otherwise a scalar fallback is selected.
 * Returns nullptr (after asserting) if the format pair is unsupported.
 */
SwizzleRowFn PremultiplyRow(SurfaceFormat aSrcFormat,
                            SurfaceFormat aDstFormat) {
#ifdef USE_SSE2
  // Prefer the SSE2 row variants when the CPU supports SSE2.
  if (mozilla::supports_sse2()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
      PREMULTIPLY_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8A8)
      PREMULTIPLY_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8X8)
      PREMULTIPLY_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
      PREMULTIPLY_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
      PREMULTIPLY_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8A8)
      PREMULTIPLY_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8X8)
      PREMULTIPLY_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
      PREMULTIPLY_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
      default:
        break;
    }
#endif

#ifdef USE_NEON
  // Prefer the NEON row variants when the CPU supports NEON.
  if (mozilla::supports_neon()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
      PREMULTIPLY_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8A8)
      PREMULTIPLY_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8X8)
      PREMULTIPLY_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
      PREMULTIPLY_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
      PREMULTIPLY_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8A8)
      PREMULTIPLY_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8X8)
      PREMULTIPLY_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
      PREMULTIPLY_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
      default:
        break;
    }
#endif

  // Scalar fallback covering all supported 32-bit source formats.
  switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
    PREMULTIPLY_ROW_FALLBACK(SurfaceFormat::B8G8R8A8)
    PREMULTIPLY_ROW_FALLBACK(SurfaceFormat::R8G8B8A8)
    PREMULTIPLY_ROW_FALLBACK(SurfaceFormat::A8R8G8B8)
    default:
      break;
  }

  // No handler matched the requested conversion.
  MOZ_ASSERT_UNREACHABLE("Unsupported premultiply formats");
  return nullptr;
}
|
||||
|
||||
/**
|
||||
* Unpremultiplying
|
||||
*/
|
||||
|
@ -457,39 +570,54 @@ bool UnpremultiplyData(const uint8_t* aSrc, int32_t aSrcStride,
|
|||
|
||||
// Fallback swizzle implementation that uses shifting and masking to reorder
|
||||
// pixels.
|
||||
// Scalar fallback that reorders components with shifts and masks. Handles
// optional R/B swapping, repositioning RGB between layouts, and forcing
// alpha opaque. Advances aSrc/aDst past the processed pixels.
// Note: the do/while body always runs at least once, so callers must pass
// aLength >= 1.
template <bool aSwapRB, bool aOpaqueAlpha, uint32_t aSrcRGBShift,
          uint32_t aSrcAShift, uint32_t aDstRGBShift, uint32_t aDstAShift>
static void SwizzleChunkFallback(const uint8_t*& aSrc, uint8_t*& aDst,
                                 int32_t aLength) {
  const uint8_t* end = aSrc + 4 * aLength;
  do {
    uint32_t rgba = *reinterpret_cast<const uint32_t*>(aSrc);

    if (aSwapRB) {
      // Handle R and B swaps by exchanging words and masking. Unsigned
      // constants keep the mask shifts well-defined when they reach bit 31
      // (aSrcAShift may be 24, aSrcRGBShift may be 8).
      uint32_t rb =
          ((rgba << 16) | (rgba >> 16)) & (0x00FF00FFu << aSrcRGBShift);
      uint32_t ga = rgba & ((0xFFu << aSrcAShift) | (0xFF00u << aSrcRGBShift));
      rgba = rb | ga;
    }

    // If src and dst shifts differ, rotate left or right to move RGB into
    // place, i.e. ARGB -> RGBA or RGBA -> ARGB.
    if (aDstRGBShift > aSrcRGBShift) {
      rgba = (rgba << 8) | (aOpaqueAlpha ? 0x000000FF : rgba >> 24);
    } else if (aSrcRGBShift > aDstRGBShift) {
      rgba = (rgba >> 8) | (aOpaqueAlpha ? 0xFF000000 : rgba << 24);
    } else if (aOpaqueAlpha) {
      // 0xFFu: aDstAShift may be 24, so keep the shift unsigned rather than
      // left-shifting into the sign bit of a signed int.
      rgba |= 0xFFu << aDstAShift;
    }

    *reinterpret_cast<uint32_t*>(aDst) = rgba;

    aSrc += 4;
    aDst += 4;
  } while (aSrc < end);
}
|
||||
|
||||
template <bool aSwapRB, bool aOpaqueAlpha, uint32_t aSrcRGBShift,
|
||||
uint32_t aSrcAShift, uint32_t aDstRGBShift, uint32_t aDstAShift>
|
||||
static void SwizzleRowFallback(const uint8_t* aSrc, uint8_t* aDst,
|
||||
int32_t aLength) {
|
||||
SwizzleChunkFallback<aSwapRB, aOpaqueAlpha, aSrcRGBShift, aSrcAShift,
|
||||
aDstRGBShift, aDstAShift>(aSrc, aDst, aLength);
|
||||
}
|
||||
|
||||
template <bool aSwapRB, bool aOpaqueAlpha, uint32_t aSrcRGBShift,
|
||||
uint32_t aSrcAShift, uint32_t aDstRGBShift, uint32_t aDstAShift>
|
||||
static void SwizzleFallback(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
|
||||
int32_t aDstGap, IntSize aSize) {
|
||||
for (int32_t height = aSize.height; height > 0; height--) {
|
||||
const uint8_t* end = aSrc + 4 * aSize.width;
|
||||
do {
|
||||
uint32_t rgba = *reinterpret_cast<const uint32_t*>(aSrc);
|
||||
|
||||
if (aSwapRB) {
|
||||
// Handle R and B swaps by exchanging words and masking.
|
||||
uint32_t rb =
|
||||
((rgba << 16) | (rgba >> 16)) & (0x00FF00FF << aSrcRGBShift);
|
||||
uint32_t ga = rgba & ((0xFF << aSrcAShift) | (0xFF00 << aSrcRGBShift));
|
||||
rgba = rb | ga;
|
||||
}
|
||||
|
||||
// If src and dst shifts differ, rotate left or right to move RGB into
|
||||
// place, i.e. ARGB -> RGBA or ARGB -> RGBA.
|
||||
if (aDstRGBShift > aSrcRGBShift) {
|
||||
rgba = (rgba << 8) | (aOpaqueAlpha ? 0x000000FF : rgba >> 24);
|
||||
} else if (aSrcRGBShift > aDstRGBShift) {
|
||||
rgba = (rgba >> 8) | (aOpaqueAlpha ? 0xFF000000 : rgba << 24);
|
||||
} else if (aOpaqueAlpha) {
|
||||
rgba |= 0xFF << aDstAShift;
|
||||
}
|
||||
|
||||
*reinterpret_cast<uint32_t*>(aDst) = rgba;
|
||||
|
||||
aSrc += 4;
|
||||
aDst += 4;
|
||||
} while (aSrc < end);
|
||||
|
||||
SwizzleChunkFallback<aSwapRB, aOpaqueAlpha, aSrcRGBShift, aSrcAShift,
|
||||
aDstRGBShift, aDstAShift>(aSrc, aDst, aSize.width);
|
||||
aSrc += aSrcGap;
|
||||
aDst += aDstGap;
|
||||
}
|
||||
|
@ -503,6 +631,14 @@ static void SwizzleFallback(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
|
|||
RGBBitShift(aSrcFormat), AlphaBitShift(aSrcFormat), \
|
||||
RGBBitShift(aDstFormat), AlphaBitShift(aDstFormat)>)
|
||||
|
||||
#define SWIZZLE_ROW_FALLBACK(aSrcFormat, aDstFormat) \
|
||||
FORMAT_CASE_ROW( \
|
||||
aSrcFormat, aDstFormat, \
|
||||
SwizzleRowFallback<ShouldSwapRB(aSrcFormat, aDstFormat), \
|
||||
ShouldForceOpaque(aSrcFormat, aDstFormat), \
|
||||
RGBBitShift(aSrcFormat), AlphaBitShift(aSrcFormat), \
|
||||
RGBBitShift(aDstFormat), AlphaBitShift(aDstFormat)>)
|
||||
|
||||
// Fast-path for matching formats.
|
||||
static void SwizzleCopy(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
|
||||
int32_t aDstGap, IntSize aSize, int32_t aBPP) {
|
||||
|
@ -517,26 +653,39 @@ static void SwizzleCopy(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
|
|||
}
|
||||
|
||||
// Fast-path for conversions that swap all bytes.
|
||||
template <bool aOpaqueAlpha, uint32_t aSrcAShift, uint32_t aDstAShift>
|
||||
static void SwizzleChunkSwap(const uint8_t*& aSrc, uint8_t*& aDst,
|
||||
int32_t aLength) {
|
||||
const uint8_t* end = aSrc + 4 * aLength;
|
||||
do {
|
||||
// Use an endian swap to move the bytes, i.e. BGRA -> ARGB.
|
||||
uint32_t rgba = *reinterpret_cast<const uint32_t*>(aSrc);
|
||||
#if MOZ_LITTLE_ENDIAN
|
||||
rgba = NativeEndian::swapToBigEndian(rgba);
|
||||
#else
|
||||
rgba = NativeEndian::swapToLittleEndian(rgba);
|
||||
#endif
|
||||
if (aOpaqueAlpha) {
|
||||
rgba |= 0xFF << aDstAShift;
|
||||
}
|
||||
*reinterpret_cast<uint32_t*>(aDst) = rgba;
|
||||
aSrc += 4;
|
||||
aDst += 4;
|
||||
} while (aSrc < end);
|
||||
}
|
||||
|
||||
template <bool aOpaqueAlpha, uint32_t aSrcAShift, uint32_t aDstAShift>
|
||||
static void SwizzleRowSwap(const uint8_t* aSrc, uint8_t* aDst,
|
||||
int32_t aLength) {
|
||||
SwizzleChunkSwap<aOpaqueAlpha, aSrcAShift, aDstAShift>(aSrc, aDst, aLength);
|
||||
}
|
||||
|
||||
template <bool aOpaqueAlpha, uint32_t aSrcAShift, uint32_t aDstAShift>
|
||||
static void SwizzleSwap(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
|
||||
int32_t aDstGap, IntSize aSize) {
|
||||
for (int32_t height = aSize.height; height > 0; height--) {
|
||||
const uint8_t* end = aSrc + 4 * aSize.width;
|
||||
do {
|
||||
// Use an endian swap to move the bytes, i.e. BGRA -> ARGB.
|
||||
uint32_t rgba = *reinterpret_cast<const uint32_t*>(aSrc);
|
||||
#if MOZ_LITTLE_ENDIAN
|
||||
rgba = NativeEndian::swapToBigEndian(rgba);
|
||||
#else
|
||||
rgba = NativeEndian::swapToLittleEndian(rgba);
|
||||
#endif
|
||||
if (aOpaqueAlpha) {
|
||||
rgba |= 0xFF << aDstAShift;
|
||||
}
|
||||
*reinterpret_cast<uint32_t*>(aDst) = rgba;
|
||||
aSrc += 4;
|
||||
aDst += 4;
|
||||
} while (aSrc < end);
|
||||
SwizzleChunkSwap<aOpaqueAlpha, aSrcAShift, aDstAShift>(aSrc, aDst,
|
||||
aSize.width);
|
||||
aSrc += aSrcGap;
|
||||
aDst += aDstGap;
|
||||
}
|
||||
|
@ -548,34 +697,61 @@ static void SwizzleSwap(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
|
|||
SwizzleSwap<ShouldForceOpaque(aSrcFormat, aDstFormat), \
|
||||
AlphaBitShift(aSrcFormat), AlphaBitShift(aDstFormat)>)
|
||||
|
||||
#define SWIZZLE_ROW_SWAP(aSrcFormat, aDstFormat) \
|
||||
FORMAT_CASE_ROW( \
|
||||
aSrcFormat, aDstFormat, \
|
||||
SwizzleRowSwap<ShouldForceOpaque(aSrcFormat, aDstFormat), \
|
||||
AlphaBitShift(aSrcFormat), AlphaBitShift(aDstFormat)>)
|
||||
|
||||
// Fast-path for conversions that force alpha to opaque.
|
||||
// In-place fast path for forcing alpha opaque: ORs the alpha bits into each
// pixel of aBuffer, advancing it past the processed pixels.
// Note: the do/while body always runs at least once, so callers must pass
// aLength >= 1.
template <uint32_t aDstAShift>
static void SwizzleChunkOpaqueUpdate(uint8_t*& aBuffer, int32_t aLength) {
  const uint8_t* end = aBuffer + 4 * aLength;
  do {
    uint32_t rgba = *reinterpret_cast<const uint32_t*>(aBuffer);
    // Just add on the alpha bits to the source. 0xFFu: aDstAShift may be 24,
    // so keep the shift unsigned rather than left-shifting into the sign bit
    // of a signed int.
    rgba |= 0xFFu << aDstAShift;
    *reinterpret_cast<uint32_t*>(aBuffer) = rgba;
    aBuffer += 4;
  } while (aBuffer < end);
}
|
||||
|
||||
// Copying fast path for forcing alpha opaque: copies each pixel from aSrc to
// aDst while ORing in the alpha bits. Advances aSrc past the processed
// pixels (aDst is advanced only locally, since it is taken by value).
// Note: the do/while body always runs at least once, so callers must pass
// aLength >= 1.
template <uint32_t aDstAShift>
static void SwizzleChunkOpaqueCopy(const uint8_t*& aSrc, uint8_t* aDst,
                                   int32_t aLength) {
  const uint8_t* end = aSrc + 4 * aLength;
  do {
    uint32_t rgba = *reinterpret_cast<const uint32_t*>(aSrc);
    // Just add on the alpha bits to the source. 0xFFu: aDstAShift may be 24,
    // so keep the shift unsigned rather than left-shifting into the sign bit
    // of a signed int.
    rgba |= 0xFFu << aDstAShift;
    *reinterpret_cast<uint32_t*>(aDst) = rgba;
    aSrc += 4;
    aDst += 4;
  } while (aSrc < end);
}
|
||||
|
||||
template <uint32_t aDstAShift>
|
||||
static void SwizzleRowOpaque(const uint8_t* aSrc, uint8_t* aDst,
|
||||
int32_t aLength) {
|
||||
if (aSrc == aDst) {
|
||||
SwizzleChunkOpaqueUpdate<aDstAShift>(aDst, aLength);
|
||||
} else {
|
||||
SwizzleChunkOpaqueCopy<aDstAShift>(aSrc, aDst, aLength);
|
||||
}
|
||||
}
|
||||
|
||||
template <uint32_t aDstAShift>
|
||||
static void SwizzleOpaque(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
|
||||
int32_t aDstGap, IntSize aSize) {
|
||||
if (aSrc == aDst) {
|
||||
// Modifying in-place, so just write out the alpha.
|
||||
for (int32_t height = aSize.height; height > 0; height--) {
|
||||
const uint8_t* end = aDst + 4 * aSize.width;
|
||||
do {
|
||||
// ORing directly onto destination memory profiles faster than writing
|
||||
// individually to the alpha byte and also profiles equivalently to a
|
||||
// SSE2 implementation.
|
||||
*reinterpret_cast<uint32_t*>(aDst) |= 0xFF << aDstAShift;
|
||||
aDst += 4;
|
||||
} while (aDst < end);
|
||||
SwizzleChunkOpaqueUpdate<aDstAShift>(aDst, aSize.width);
|
||||
aDst += aDstGap;
|
||||
}
|
||||
} else {
|
||||
for (int32_t height = aSize.height; height > 0; height--) {
|
||||
const uint8_t* end = aSrc + 4 * aSize.width;
|
||||
do {
|
||||
uint32_t rgba = *reinterpret_cast<const uint32_t*>(aSrc);
|
||||
// Just add on the alpha bits to the source.
|
||||
rgba |= 0xFF << aDstAShift;
|
||||
*reinterpret_cast<uint32_t*>(aDst) = rgba;
|
||||
aSrc += 4;
|
||||
aDst += 4;
|
||||
} while (aSrc < end);
|
||||
SwizzleChunkOpaqueCopy<aDstAShift>(aSrc, aDst, aSize.width);
|
||||
aSrc += aSrcGap;
|
||||
aDst += aDstGap;
|
||||
}
|
||||
|
@ -585,6 +761,10 @@ static void SwizzleOpaque(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
|
|||
#define SWIZZLE_OPAQUE(aSrcFormat, aDstFormat) \
|
||||
FORMAT_CASE(aSrcFormat, aDstFormat, SwizzleOpaque<AlphaBitShift(aDstFormat)>)
|
||||
|
||||
#define SWIZZLE_ROW_OPAQUE(aSrcFormat, aDstFormat) \
|
||||
FORMAT_CASE_ROW(aSrcFormat, aDstFormat, \
|
||||
SwizzleRowOpaque<AlphaBitShift(aDstFormat)>)
|
||||
|
||||
// Packing of 32-bit formats to RGB565.
|
||||
template <bool aSwapRB, uint32_t aSrcRGBShift, uint32_t aSrcRGBIndex>
|
||||
static void PackToRGB565(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
|
||||
|
@ -780,5 +960,60 @@ bool SwizzleData(const uint8_t* aSrc, int32_t aSrcStride,
|
|||
return false;
|
||||
}
|
||||
|
||||
/**
 * Returns a function pointer that swizzles a single row of pixels from
 * aSrcFormat to aDstFormat. SIMD variants are preferred when the CPU
 * supports them at runtime; otherwise a scalar fallback (component shuffle
 * or force-opaque) is selected. Returns nullptr (after asserting) if the
 * format pair is unsupported.
 */
SwizzleRowFn SwizzleRow(SurfaceFormat aSrcFormat, SurfaceFormat aDstFormat) {
#ifdef USE_SSE2
  // Prefer the SSE2 row variants when the CPU supports SSE2.
  if (mozilla::supports_sse2()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
      SWIZZLE_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
      SWIZZLE_ROW_SSE2(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8X8)
      SWIZZLE_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
      SWIZZLE_ROW_SSE2(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8A8)
      SWIZZLE_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
      SWIZZLE_ROW_SSE2(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8X8)
      SWIZZLE_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
      SWIZZLE_ROW_SSE2(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8A8)
      default:
        break;
    }
#endif

#ifdef USE_NEON
  // Prefer the NEON row variants when the CPU supports NEON.
  if (mozilla::supports_neon()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
      SWIZZLE_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
      SWIZZLE_ROW_NEON(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8X8)
      SWIZZLE_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
      SWIZZLE_ROW_NEON(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8A8)
      SWIZZLE_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
      SWIZZLE_ROW_NEON(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8X8)
      SWIZZLE_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
      SWIZZLE_ROW_NEON(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8A8)
      default:
        break;
    }
#endif

  // Scalar fallbacks: component shuffles first, then the conversions that
  // only force alpha to opaque within the same component order.
  switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
    SWIZZLE_ROW_FALLBACK(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
    SWIZZLE_ROW_FALLBACK(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8X8)
    SWIZZLE_ROW_FALLBACK(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
    SWIZZLE_ROW_FALLBACK(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8A8)

    SWIZZLE_ROW_FALLBACK(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
    SWIZZLE_ROW_FALLBACK(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8X8)
    SWIZZLE_ROW_FALLBACK(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
    SWIZZLE_ROW_FALLBACK(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8A8)

    SWIZZLE_ROW_OPAQUE(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8X8)
    SWIZZLE_ROW_OPAQUE(SurfaceFormat::B8G8R8X8, SurfaceFormat::B8G8R8A8)
    SWIZZLE_ROW_OPAQUE(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8X8)
    SWIZZLE_ROW_OPAQUE(SurfaceFormat::R8G8B8X8, SurfaceFormat::R8G8B8A8)

    default:
      break;
  }

  // No handler matched the requested conversion.
  MOZ_ASSERT_UNREACHABLE("Unsupported swizzle formats");
  return nullptr;
}
|
||||
|
||||
} // namespace gfx
|
||||
} // namespace mozilla
|
||||
|
|
|
@ -41,6 +41,22 @@ GFX2D_API bool SwizzleData(const uint8_t* aSrc, int32_t aSrcStride,
|
|||
int32_t aDstStride, SurfaceFormat aDstFormat,
|
||||
const IntSize& aSize);
|
||||
|
||||
/**
|
||||
* Swizzles source and writes it to destination. Source and destination may be
|
||||
* the same to swizzle in-place.
|
||||
*/
|
||||
typedef void (*SwizzleRowFn)(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength);
|
||||
|
||||
/**
|
||||
* Get a function pointer to perform premultiplication between two formats.
|
||||
*/
|
||||
GFX2D_API SwizzleRowFn PremultiplyRow(SurfaceFormat aSrcFormat, SurfaceFormat aDstFormat);
|
||||
|
||||
/**
|
||||
* Get a function pointer to perform swizzling between two formats.
|
||||
*/
|
||||
GFX2D_API SwizzleRowFn SwizzleRow(SurfaceFormat aSrcFormat, SurfaceFormat aDstFormat);
|
||||
|
||||
} // namespace gfx
|
||||
} // namespace mozilla
|
||||
|
||||
|
|
|
@ -85,6 +85,36 @@ PremultiplyVector_NEON(const uint16x8_t& aSrc) {
|
|||
return vsriq_n_u16(ga, rb, 8);
|
||||
}
|
||||
|
||||
template <bool aSwapRB, bool aOpaqueAlpha>
|
||||
static MOZ_ALWAYS_INLINE void PremultiplyChunk_NEON(const uint8_t*& aSrc,
|
||||
uint8_t*& aDst,
|
||||
int32_t aAlignedRow,
|
||||
int32_t aRemainder) {
|
||||
// Process all 4-pixel chunks as one vector.
|
||||
for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
|
||||
uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
|
||||
px = PremultiplyVector_NEON<aSwapRB, aOpaqueAlpha>(px);
|
||||
vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
|
||||
aSrc += 4 * 4;
|
||||
aDst += 4 * 4;
|
||||
}
|
||||
|
||||
// Handle any 1-3 remaining pixels.
|
||||
if (aRemainder) {
|
||||
uint16x8_t px = LoadRemainder_NEON(aSrc, aRemainder);
|
||||
px = PremultiplyVector_NEON<aSwapRB, aOpaqueAlpha>(px);
|
||||
StoreRemainder_NEON(aDst, aRemainder, px);
|
||||
}
|
||||
}
|
||||
|
||||
template <bool aSwapRB, bool aOpaqueAlpha>
|
||||
void PremultiplyRow_NEON(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
|
||||
int32_t alignedRow = 4 * (aLength & ~3);
|
||||
int32_t remainder = aLength & 3;
|
||||
PremultiplyChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
|
||||
remainder);
|
||||
}
|
||||
|
||||
template <bool aSwapRB, bool aOpaqueAlpha>
|
||||
void Premultiply_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
|
||||
int32_t aDstGap, IntSize aSize) {
|
||||
|
@ -95,28 +125,22 @@ void Premultiply_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
|
|||
aDstGap += 4 * remainder;
|
||||
|
||||
for (int32_t height = aSize.height; height > 0; height--) {
|
||||
// Process all 4-pixel chunks as one vector.
|
||||
for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) {
|
||||
uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
|
||||
px = PremultiplyVector_NEON<aSwapRB, aOpaqueAlpha>(px);
|
||||
vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
|
||||
aSrc += 4 * 4;
|
||||
aDst += 4 * 4;
|
||||
}
|
||||
|
||||
// Handle any 1-3 remaining pixels.
|
||||
if (remainder) {
|
||||
uint16x8_t px = LoadRemainder_NEON(aSrc, remainder);
|
||||
px = PremultiplyVector_NEON<aSwapRB, aOpaqueAlpha>(px);
|
||||
StoreRemainder_NEON(aDst, remainder, px);
|
||||
}
|
||||
|
||||
PremultiplyChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
|
||||
remainder);
|
||||
aSrc += aSrcGap;
|
||||
aDst += aDstGap;
|
||||
}
|
||||
}
|
||||
|
||||
// Force instantiation of premultiply variants here.
|
||||
template void PremultiplyRow_NEON<false, false>(const uint8_t*, uint8_t*,
|
||||
int32_t);
|
||||
template void PremultiplyRow_NEON<false, true>(const uint8_t*, uint8_t*,
|
||||
int32_t);
|
||||
template void PremultiplyRow_NEON<true, false>(const uint8_t*, uint8_t*,
|
||||
int32_t);
|
||||
template void PremultiplyRow_NEON<true, true>(const uint8_t*, uint8_t*,
|
||||
int32_t);
|
||||
template void Premultiply_NEON<false, false>(const uint8_t*, int32_t, uint8_t*,
|
||||
int32_t, IntSize);
|
||||
template void Premultiply_NEON<false, true>(const uint8_t*, int32_t, uint8_t*,
|
||||
|
@ -258,7 +282,7 @@ template void Unpremultiply_NEON<true>(const uint8_t*, int32_t, uint8_t*,
|
|||
|
||||
// Swizzle a vector of 4 pixels providing swaps and opaquifying.
|
||||
template <bool aSwapRB, bool aOpaqueAlpha>
|
||||
MOZ_ALWAYS_INLINE uint16x8_t SwizzleVector_NEON(const uint16x8_t& aSrc) {
|
||||
static MOZ_ALWAYS_INLINE uint16x8_t SwizzleVector_NEON(const uint16x8_t& aSrc) {
|
||||
// Swap R and B, then add to G and A (forced to 255):
|
||||
// (((src>>16) | (src << 16)) & 0x00FF00FF) |
|
||||
// ((src | 0xFF000000) & ~0x00FF00FF)
|
||||
|
@ -275,7 +299,7 @@ MOZ_ALWAYS_INLINE uint16x8_t SwizzleVector_NEON(const uint16x8_t& aSrc) {
|
|||
|
||||
// Optimized implementations for when there is no R and B swap.
|
||||
template<>
|
||||
MOZ_ALWAYS_INLINE uint16x8_t
|
||||
static MOZ_ALWAYS_INLINE uint16x8_t
|
||||
SwizzleVector_NEON<false, true>(const uint16x8_t& aSrc)
|
||||
{
|
||||
// Force alpha to 255.
|
||||
|
@ -283,13 +307,42 @@ SwizzleVector_NEON<false, true>(const uint16x8_t& aSrc)
|
|||
}
|
||||
|
||||
template<>
|
||||
MOZ_ALWAYS_INLINE uint16x8_t
|
||||
static MOZ_ALWAYS_INLINE uint16x8_t
|
||||
SwizzleVector_NEON<false, false>(const uint16x8_t& aSrc)
|
||||
{
|
||||
return aSrc;
|
||||
}
|
||||
#endif
|
||||
|
||||
template <bool aSwapRB, bool aOpaqueAlpha>
|
||||
static MOZ_ALWAYS_INLINE void SwizzleChunk_NEON(const uint8_t*& aSrc,
|
||||
uint8_t*& aDst,
|
||||
int32_t aAlignedRow,
|
||||
int32_t aRemainder) {
|
||||
// Process all 4-pixel chunks as one vector.
|
||||
for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
|
||||
uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
|
||||
px = SwizzleVector_NEON<aSwapRB, aOpaqueAlpha>(px);
|
||||
vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
|
||||
aSrc += 4 * 4;
|
||||
aDst += 4 * 4;
|
||||
}
|
||||
|
||||
// Handle any 1-3 remaining pixels.
|
||||
if (aRemainder) {
|
||||
uint16x8_t px = LoadRemainder_NEON(aSrc, aRemainder);
|
||||
px = SwizzleVector_NEON<aSwapRB, aOpaqueAlpha>(px);
|
||||
StoreRemainder_NEON(aDst, aRemainder, px);
|
||||
}
|
||||
}
|
||||
|
||||
template <bool aSwapRB, bool aOpaqueAlpha>
|
||||
void SwizzleRow_NEON(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
|
||||
int32_t alignedRow = 4 * (aLength & ~3);
|
||||
int32_t remainder = aLength & 3;
|
||||
SwizzleChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, remainder);
|
||||
}
|
||||
|
||||
template <bool aSwapRB, bool aOpaqueAlpha>
|
||||
void Swizzle_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
|
||||
int32_t aDstGap, IntSize aSize) {
|
||||
|
@ -300,28 +353,16 @@ void Swizzle_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
|
|||
aDstGap += 4 * remainder;
|
||||
|
||||
for (int32_t height = aSize.height; height > 0; height--) {
|
||||
// Process all 4-pixel chunks as one vector.
|
||||
for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) {
|
||||
uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
|
||||
px = SwizzleVector_NEON<aSwapRB, aOpaqueAlpha>(px);
|
||||
vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
|
||||
aSrc += 4 * 4;
|
||||
aDst += 4 * 4;
|
||||
}
|
||||
|
||||
// Handle any 1-3 remaining pixels.
|
||||
if (remainder) {
|
||||
uint16x8_t px = LoadRemainder_NEON(aSrc, remainder);
|
||||
px = SwizzleVector_NEON<aSwapRB, aOpaqueAlpha>(px);
|
||||
StoreRemainder_NEON(aDst, remainder, px);
|
||||
}
|
||||
|
||||
SwizzleChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
|
||||
remainder);
|
||||
aSrc += aSrcGap;
|
||||
aDst += aDstGap;
|
||||
}
|
||||
}
|
||||
|
||||
// Force instantiation of swizzle variants here.
|
||||
template void SwizzleRow_NEON<true, false>(const uint8_t*, uint8_t*, int32_t);
|
||||
template void SwizzleRow_NEON<true, true>(const uint8_t*, uint8_t*, int32_t);
|
||||
template void Swizzle_NEON<true, false>(const uint8_t*, int32_t, uint8_t*,
|
||||
int32_t, IntSize);
|
||||
template void Swizzle_NEON<true, true>(const uint8_t*, int32_t, uint8_t*,
|
||||
|
|
|
@ -88,6 +88,38 @@ static MOZ_ALWAYS_INLINE __m128i PremultiplyVector_SSE2(const __m128i& aSrc) {
|
|||
return _mm_or_si128(rb, ga);
|
||||
}
|
||||
|
||||
// Premultiply vector of aAlignedRow + aRemainder pixels.
|
||||
template <bool aSwapRB, bool aOpaqueAlpha>
|
||||
static MOZ_ALWAYS_INLINE void PremultiplyChunk_SSE2(const uint8_t*& aSrc,
|
||||
uint8_t*& aDst,
|
||||
int32_t aAlignedRow,
|
||||
int32_t aRemainder) {
|
||||
// Process all 4-pixel chunks as one vector.
|
||||
for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
|
||||
__m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
|
||||
px = PremultiplyVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
|
||||
aSrc += 4 * 4;
|
||||
aDst += 4 * 4;
|
||||
}
|
||||
|
||||
// Handle any 1-3 remaining pixels.
|
||||
if (aRemainder) {
|
||||
__m128i px = LoadRemainder_SSE2(aSrc, aRemainder);
|
||||
px = PremultiplyVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
|
||||
StoreRemainder_SSE2(aDst, aRemainder, px);
|
||||
}
|
||||
}
|
||||
|
||||
// Premultiply vector of aLength pixels.
|
||||
template <bool aSwapRB, bool aOpaqueAlpha>
|
||||
void PremultiplyRow_SSE2(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
|
||||
int32_t alignedRow = 4 * (aLength & ~3);
|
||||
int32_t remainder = aLength & 3;
|
||||
PremultiplyChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
|
||||
remainder);
|
||||
}
|
||||
|
||||
template <bool aSwapRB, bool aOpaqueAlpha>
|
||||
void Premultiply_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
|
||||
int32_t aDstGap, IntSize aSize) {
|
||||
|
@ -98,28 +130,22 @@ void Premultiply_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
|
|||
aDstGap += 4 * remainder;
|
||||
|
||||
for (int32_t height = aSize.height; height > 0; height--) {
|
||||
// Process all 4-pixel chunks as one vector.
|
||||
for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) {
|
||||
__m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
|
||||
px = PremultiplyVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
|
||||
aSrc += 4 * 4;
|
||||
aDst += 4 * 4;
|
||||
}
|
||||
|
||||
// Handle any 1-3 remaining pixels.
|
||||
if (remainder) {
|
||||
__m128i px = LoadRemainder_SSE2(aSrc, remainder);
|
||||
px = PremultiplyVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
|
||||
StoreRemainder_SSE2(aDst, remainder, px);
|
||||
}
|
||||
|
||||
PremultiplyChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
|
||||
remainder);
|
||||
aSrc += aSrcGap;
|
||||
aDst += aDstGap;
|
||||
}
|
||||
}
|
||||
|
||||
// Force instantiation of premultiply variants here.
|
||||
template void PremultiplyRow_SSE2<false, false>(const uint8_t*, uint8_t*,
|
||||
int32_t);
|
||||
template void PremultiplyRow_SSE2<false, true>(const uint8_t*, uint8_t*,
|
||||
int32_t);
|
||||
template void PremultiplyRow_SSE2<true, false>(const uint8_t*, uint8_t*,
|
||||
int32_t);
|
||||
template void PremultiplyRow_SSE2<true, true>(const uint8_t*, uint8_t*,
|
||||
int32_t);
|
||||
template void Premultiply_SSE2<false, false>(const uint8_t*, int32_t, uint8_t*,
|
||||
int32_t, IntSize);
|
||||
template void Premultiply_SSE2<false, true>(const uint8_t*, int32_t, uint8_t*,
|
||||
|
@ -293,6 +319,35 @@ SwizzleVector_SSE2<false, false>(const __m128i& aSrc)
|
|||
}
|
||||
#endif
|
||||
|
||||
template <bool aSwapRB, bool aOpaqueAlpha>
|
||||
static MOZ_ALWAYS_INLINE void SwizzleChunk_SSE2(const uint8_t*& aSrc,
|
||||
uint8_t*& aDst,
|
||||
int32_t aAlignedRow,
|
||||
int32_t aRemainder) {
|
||||
// Process all 4-pixel chunks as one vector.
|
||||
for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
|
||||
__m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
|
||||
px = SwizzleVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
|
||||
aSrc += 4 * 4;
|
||||
aDst += 4 * 4;
|
||||
}
|
||||
|
||||
// Handle any 1-3 remaining pixels.
|
||||
if (aRemainder) {
|
||||
__m128i px = LoadRemainder_SSE2(aSrc, aRemainder);
|
||||
px = SwizzleVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
|
||||
StoreRemainder_SSE2(aDst, aRemainder, px);
|
||||
}
|
||||
}
|
||||
|
||||
template <bool aSwapRB, bool aOpaqueAlpha>
|
||||
void SwizzleRow_SSE2(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
|
||||
int32_t alignedRow = 4 * (aLength & ~3);
|
||||
int32_t remainder = aLength & 3;
|
||||
SwizzleChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, remainder);
|
||||
}
|
||||
|
||||
template <bool aSwapRB, bool aOpaqueAlpha>
|
||||
void Swizzle_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
|
||||
int32_t aDstGap, IntSize aSize) {
|
||||
|
@ -303,28 +358,15 @@ void Swizzle_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
|
|||
aDstGap += 4 * remainder;
|
||||
|
||||
for (int32_t height = aSize.height; height > 0; height--) {
|
||||
// Process all 4-pixel chunks as one vector.
|
||||
for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) {
|
||||
__m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
|
||||
px = SwizzleVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
|
||||
aSrc += 4 * 4;
|
||||
aDst += 4 * 4;
|
||||
}
|
||||
|
||||
// Handle any 1-3 remaining pixels.
|
||||
if (remainder) {
|
||||
__m128i px = LoadRemainder_SSE2(aSrc, remainder);
|
||||
px = SwizzleVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
|
||||
StoreRemainder_SSE2(aDst, remainder, px);
|
||||
}
|
||||
|
||||
SwizzleChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, remainder);
|
||||
aSrc += aSrcGap;
|
||||
aDst += aDstGap;
|
||||
}
|
||||
}
|
||||
|
||||
// Force instantiation of swizzle variants here.
|
||||
template void SwizzleRow_SSE2<true, false>(const uint8_t*, uint8_t*, int32_t);
|
||||
template void SwizzleRow_SSE2<true, true>(const uint8_t*, uint8_t*, int32_t);
|
||||
template void Swizzle_SSE2<true, false>(const uint8_t*, int32_t, uint8_t*,
|
||||
int32_t, IntSize);
|
||||
template void Swizzle_SSE2<true, true>(const uint8_t*, int32_t, uint8_t*,
|
||||
|
|
Загрузка…
Ссылка в новой задаче