Bug 1551088 - Part 1. Expose SwizzleRow and PremultiplyRow variants. r=lsalzman

The image decoders produce surfaces row by row, so variants that return a
function pointer to perform the swizzle/premultiply operation on a single
row are more ergonomic.

Differential Revision: https://phabricator.services.mozilla.com/D46444

--HG--
extra : moz-landing-system : lando
Andrew Osmond 2019-09-24 20:43:24 +00:00
Parent b0ffdb3a87
Commit 413fb28670
4 changed files: 502 additions and 168 deletions
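The row-based entry points are meant to be resolved once per surface and then invoked for each decoded row. A minimal sketch of that call pattern (the loop and the buffer, stride, and size names are illustrative, not from this patch):

using namespace mozilla::gfx;

// Resolve the row swizzler once for a format pair, then reuse it per row.
SwizzleRowFn swizzle =
    SwizzleRow(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8);
for (int32_t row = 0; row < height; row++) {
  swizzle(srcBuf + row * srcStride, dstBuf + row * dstStride, width);
}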

View file

@@ -41,6 +41,10 @@ namespace gfx {
#define FORMAT_CASE(aSrcFormat, aDstFormat, ...) \
FORMAT_CASE_EXPR(aSrcFormat, aDstFormat, FORMAT_CASE_CALL(__VA_ARGS__))
#define FORMAT_CASE_ROW(aSrcFormat, aDstFormat, ...) \
case FORMAT_KEY(aSrcFormat, aDstFormat): \
return &__VA_ARGS__;
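For illustration, FORMAT_CASE_ROW(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8, SomeRowFn) (where SomeRowFn stands in for one of the row-function instantiations below) expands to:

case FORMAT_KEY(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8):
  return &SomeRowFn;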
/**
* Constexpr functions for analyzing format attributes in templates.
*/
@@ -114,6 +118,15 @@ void Premultiply_SSE2(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
Premultiply_SSE2<ShouldSwapRB(aSrcFormat, aDstFormat), \
ShouldForceOpaque(aSrcFormat, aDstFormat)>)
template <bool aSwapRB, bool aOpaqueAlpha>
void PremultiplyRow_SSE2(const uint8_t*, uint8_t*, int32_t);
# define PREMULTIPLY_ROW_SSE2(aSrcFormat, aDstFormat) \
FORMAT_CASE_ROW( \
aSrcFormat, aDstFormat, \
PremultiplyRow_SSE2<ShouldSwapRB(aSrcFormat, aDstFormat), \
ShouldForceOpaque(aSrcFormat, aDstFormat)>)
template <bool aSwapRB>
void Unpremultiply_SSE2(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
@@ -129,6 +142,15 @@ void Swizzle_SSE2(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
Swizzle_SSE2<ShouldSwapRB(aSrcFormat, aDstFormat), \
ShouldForceOpaque(aSrcFormat, aDstFormat)>)
template <bool aSwapRB, bool aOpaqueAlpha>
void SwizzleRow_SSE2(const uint8_t*, uint8_t*, int32_t);
# define SWIZZLE_ROW_SSE2(aSrcFormat, aDstFormat) \
FORMAT_CASE_ROW( \
aSrcFormat, aDstFormat, \
SwizzleRow_SSE2<ShouldSwapRB(aSrcFormat, aDstFormat), \
ShouldForceOpaque(aSrcFormat, aDstFormat)>)
#endif
#ifdef USE_NEON
@@ -144,6 +166,15 @@ void Premultiply_NEON(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
Premultiply_NEON<ShouldSwapRB(aSrcFormat, aDstFormat), \
ShouldForceOpaque(aSrcFormat, aDstFormat)>)
template <bool aSwapRB, bool aOpaqueAlpha>
void PremultiplyRow_NEON(const uint8_t*, uint8_t*, int32_t);
# define PREMULTIPLY_ROW_NEON(aSrcFormat, aDstFormat) \
FORMAT_CASE_ROW( \
aSrcFormat, aDstFormat, \
PremultiplyRow_NEON<ShouldSwapRB(aSrcFormat, aDstFormat), \
ShouldForceOpaque(aSrcFormat, aDstFormat)>)
template <bool aSwapRB>
void Unpremultiply_NEON(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
@@ -159,6 +190,14 @@ void Swizzle_NEON(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
Swizzle_NEON<ShouldSwapRB(aSrcFormat, aDstFormat), \
ShouldForceOpaque(aSrcFormat, aDstFormat)>)
template <bool aSwapRB, bool aOpaqueAlpha>
void SwizzleRow_NEON(const uint8_t*, uint8_t*, int32_t);
# define SWIZZLE_ROW_NEON(aSrcFormat, aDstFormat) \
FORMAT_CASE_ROW( \
aSrcFormat, aDstFormat, \
SwizzleRow_NEON<ShouldSwapRB(aSrcFormat, aDstFormat), \
ShouldForceOpaque(aSrcFormat, aDstFormat)>)
#endif
/**
@@ -171,51 +210,65 @@ void Swizzle_NEON(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
// 2-component vectors. Otherwise, an approximation of divide-by-255 is used
// which is faster than an actual division. These optimizations are also used
// for the SSE2 and NEON implementations.
template <bool aSwapRB, bool aOpaqueAlpha, uint32_t aSrcRGBShift,
uint32_t aSrcAShift, uint32_t aDstRGBShift, uint32_t aDstAShift>
static void PremultiplyChunkFallback(const uint8_t*& aSrc, uint8_t*& aDst,
int32_t aLength) {
const uint8_t* end = aSrc + 4 * aLength;
do {
// Load and process 1 entire pixel at a time.
uint32_t color = *reinterpret_cast<const uint32_t*>(aSrc);
uint32_t a = aSrcAShift ? color >> aSrcAShift : color & 0xFF;
// Isolate the R and B components.
uint32_t rb = (color >> aSrcRGBShift) & 0x00FF00FF;
// Swap the order of R and B if necessary.
if (aSwapRB) {
rb = (rb >> 16) | (rb << 16);
}
// Approximate the multiply by alpha and divide by 255 which is
// essentially:
// c = c*a + 255; c = (c + (c >> 8)) >> 8;
// However, we omit the final >> 8 to fold it with the final shift into
// place depending on desired output format.
rb = rb * a + 0x00FF00FF;
rb = (rb + ((rb >> 8) & 0x00FF00FF)) & 0xFF00FF00;
// Use same approximation as above, but G is shifted 8 bits left.
// Alpha is left out and handled separately.
uint32_t g = color & (0xFF00 << aSrcRGBShift);
g = g * a + (0xFF00 << aSrcRGBShift);
g = (g + (g >> 8)) & (0xFF0000 << aSrcRGBShift);
// The above math leaves RGB shifted left by 8 bits.
// Shift them right if required for the output format, then combine them
// back together to produce the output pixel.
// Add the alpha back on if the output format is not opaque.
*reinterpret_cast<uint32_t*>(aDst) =
(rb >> (8 - aDstRGBShift)) | (g >> (8 + aSrcRGBShift - aDstRGBShift)) |
(aOpaqueAlpha ? 0xFF << aDstAShift : a << aDstAShift);
aSrc += 4;
aDst += 4;
} while (aSrc < end);
}
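A scalar rendering of the approximation described in the comments above (a hypothetical helper, not part of the patch; it keeps the final >> 8 that the chunk code folds into its format shift):

static uint8_t PremultiplyComponent(uint8_t aColor, uint8_t aAlpha) {
  // c = c*a + 255; c = (c + (c >> 8)) >> 8;
  uint32_t c = uint32_t(aColor) * aAlpha + 255;
  return uint8_t((c + (c >> 8)) >> 8);
}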
template <bool aSwapRB, bool aOpaqueAlpha, uint32_t aSrcRGBShift,
uint32_t aSrcAShift, uint32_t aDstRGBShift, uint32_t aDstAShift>
static void PremultiplyRowFallback(const uint8_t* aSrc, uint8_t* aDst,
int32_t aLength) {
PremultiplyChunkFallback<aSwapRB, aOpaqueAlpha, aSrcRGBShift, aSrcAShift,
aDstRGBShift, aDstAShift>(aSrc, aDst, aLength);
}
template <bool aSwapRB, bool aOpaqueAlpha, uint32_t aSrcRGBShift,
uint32_t aSrcAShift, uint32_t aDstRGBShift, uint32_t aDstAShift>
static void PremultiplyFallback(const uint8_t* aSrc, int32_t aSrcGap,
uint8_t* aDst, int32_t aDstGap, IntSize aSize) {
for (int32_t height = aSize.height; height > 0; height--) {
const uint8_t* end = aSrc + 4 * aSize.width;
do {
// Load and process 1 entire pixel at a time.
uint32_t color = *reinterpret_cast<const uint32_t*>(aSrc);
uint32_t a = aSrcAShift ? color >> aSrcAShift : color & 0xFF;
// Isolate the R and B components.
uint32_t rb = (color >> aSrcRGBShift) & 0x00FF00FF;
// Swap the order of R and B if necessary.
if (aSwapRB) {
rb = (rb >> 16) | (rb << 16);
}
// Approximate the multiply by alpha and divide by 255 which is
// essentially:
// c = c*a + 255; c = (c + (c >> 8)) >> 8;
// However, we omit the final >> 8 to fold it with the final shift into
// place depending on desired output format.
rb = rb * a + 0x00FF00FF;
rb = (rb + ((rb >> 8) & 0x00FF00FF)) & 0xFF00FF00;
// Use same approximation as above, but G is shifted 8 bits left.
// Alpha is left out and handled separately.
uint32_t g = color & (0xFF00 << aSrcRGBShift);
g = g * a + (0xFF00 << aSrcRGBShift);
g = (g + (g >> 8)) & (0xFF0000 << aSrcRGBShift);
// The above math leaves RGB shifted left by 8 bits.
// Shift them right if required for the output format, then combine them
// back together to produce the output pixel.
// Add the alpha back on if the output format is not opaque.
*reinterpret_cast<uint32_t*>(aDst) =
(rb >> (8 - aDstRGBShift)) |
(g >> (8 + aSrcRGBShift - aDstRGBShift)) |
(aOpaqueAlpha ? 0xFF << aDstAShift : a << aDstAShift);
aSrc += 4;
aDst += 4;
} while (aSrc < end);
PremultiplyChunkFallback<aSwapRB, aOpaqueAlpha, aSrcRGBShift, aSrcAShift,
aDstRGBShift, aDstAShift>(aSrc, aDst, aSize.width);
aSrc += aSrcGap;
aDst += aDstGap;
}
@@ -237,6 +290,22 @@ static void PremultiplyFallback(const uint8_t* aSrc, int32_t aSrcGap,
PREMULTIPLY_FALLBACK_CASE(aSrcFormat, SurfaceFormat::A8R8G8B8) \
PREMULTIPLY_FALLBACK_CASE(aSrcFormat, SurfaceFormat::X8R8G8B8)
#define PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, aDstFormat) \
FORMAT_CASE_ROW(aSrcFormat, aDstFormat, \
PremultiplyRowFallback< \
ShouldSwapRB(aSrcFormat, aDstFormat), \
ShouldForceOpaque(aSrcFormat, aDstFormat), \
RGBBitShift(aSrcFormat), AlphaBitShift(aSrcFormat), \
RGBBitShift(aDstFormat), AlphaBitShift(aDstFormat)>)
#define PREMULTIPLY_ROW_FALLBACK(aSrcFormat) \
PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::B8G8R8A8) \
PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::B8G8R8X8) \
PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::R8G8B8A8) \
PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::R8G8B8X8) \
PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::A8R8G8B8) \
PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::X8R8G8B8)
// If rows are tightly packed, and the size of the total area will fit within
// the precision range of a single row, then process all the data as if it was
// a single row.
@@ -323,6 +392,50 @@ bool PremultiplyData(const uint8_t* aSrc, int32_t aSrcStride,
return false;
}
SwizzleRowFn PremultiplyRow(SurfaceFormat aSrcFormat,
SurfaceFormat aDstFormat) {
#ifdef USE_SSE2
if (mozilla::supports_sse2()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
PREMULTIPLY_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8A8)
PREMULTIPLY_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8X8)
PREMULTIPLY_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
PREMULTIPLY_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
PREMULTIPLY_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8A8)
PREMULTIPLY_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8X8)
PREMULTIPLY_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
PREMULTIPLY_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
default:
break;
}
#endif
#ifdef USE_NEON
if (mozilla::supports_neon()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
PREMULTIPLY_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8A8)
PREMULTIPLY_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8X8)
PREMULTIPLY_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
PREMULTIPLY_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
PREMULTIPLY_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8A8)
PREMULTIPLY_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8X8)
PREMULTIPLY_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
PREMULTIPLY_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
default:
break;
}
#endif
switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
PREMULTIPLY_ROW_FALLBACK(SurfaceFormat::B8G8R8A8)
PREMULTIPLY_ROW_FALLBACK(SurfaceFormat::R8G8B8A8)
PREMULTIPLY_ROW_FALLBACK(SurfaceFormat::A8R8G8B8)
default:
break;
}
MOZ_ASSERT_UNREACHABLE("Unsupported premultiply formats");
return nullptr;
}
/**
* Unpremultiplying
*/
@@ -457,39 +570,54 @@ bool UnpremultiplyData(const uint8_t* aSrc, int32_t aSrcStride,
// Fallback swizzle implementation that uses shifting and masking to reorder
// pixels.
template <bool aSwapRB, bool aOpaqueAlpha, uint32_t aSrcRGBShift,
uint32_t aSrcAShift, uint32_t aDstRGBShift, uint32_t aDstAShift>
static void SwizzleChunkFallback(const uint8_t*& aSrc, uint8_t*& aDst,
int32_t aLength) {
const uint8_t* end = aSrc + 4 * aLength;
do {
uint32_t rgba = *reinterpret_cast<const uint32_t*>(aSrc);
if (aSwapRB) {
// Handle R and B swaps by exchanging words and masking.
uint32_t rb =
((rgba << 16) | (rgba >> 16)) & (0x00FF00FF << aSrcRGBShift);
uint32_t ga = rgba & ((0xFF << aSrcAShift) | (0xFF00 << aSrcRGBShift));
rgba = rb | ga;
}
// If src and dst shifts differ, rotate left or right to move RGB into
// place, i.e. ARGB -> RGBA or RGBA -> ARGB.
if (aDstRGBShift > aSrcRGBShift) {
rgba = (rgba << 8) | (aOpaqueAlpha ? 0x000000FF : rgba >> 24);
} else if (aSrcRGBShift > aDstRGBShift) {
rgba = (rgba >> 8) | (aOpaqueAlpha ? 0xFF000000 : rgba << 24);
} else if (aOpaqueAlpha) {
rgba |= 0xFF << aDstAShift;
}
*reinterpret_cast<uint32_t*>(aDst) = rgba;
aSrc += 4;
aDst += 4;
} while (aSrc < end);
}
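A concrete check of the rotate branch above, with illustrative values: on little-endian, an A8R8G8B8 pixel with A=0x40, R=0x30, G=0x20, B=0x10 loads as 0x10203040; with aSrcRGBShift = 8 and aDstRGBShift = 0, the rotate-right path yields the R8G8B8A8 layout:

static_assert(((0x10203040u >> 8) | (0x10203040u << 24)) == 0x40102030u,
              "ARGB dword rotates into RGBA byte order");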
template <bool aSwapRB, bool aOpaqueAlpha, uint32_t aSrcRGBShift,
uint32_t aSrcAShift, uint32_t aDstRGBShift, uint32_t aDstAShift>
static void SwizzleRowFallback(const uint8_t* aSrc, uint8_t* aDst,
int32_t aLength) {
SwizzleChunkFallback<aSwapRB, aOpaqueAlpha, aSrcRGBShift, aSrcAShift,
aDstRGBShift, aDstAShift>(aSrc, aDst, aLength);
}
template <bool aSwapRB, bool aOpaqueAlpha, uint32_t aSrcRGBShift,
uint32_t aSrcAShift, uint32_t aDstRGBShift, uint32_t aDstAShift>
static void SwizzleFallback(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
int32_t aDstGap, IntSize aSize) {
for (int32_t height = aSize.height; height > 0; height--) {
const uint8_t* end = aSrc + 4 * aSize.width;
do {
uint32_t rgba = *reinterpret_cast<const uint32_t*>(aSrc);
if (aSwapRB) {
// Handle R and B swaps by exchanging words and masking.
uint32_t rb =
((rgba << 16) | (rgba >> 16)) & (0x00FF00FF << aSrcRGBShift);
uint32_t ga = rgba & ((0xFF << aSrcAShift) | (0xFF00 << aSrcRGBShift));
rgba = rb | ga;
}
// If src and dst shifts differ, rotate left or right to move RGB into
// place, i.e. ARGB -> RGBA or RGBA -> ARGB.
if (aDstRGBShift > aSrcRGBShift) {
rgba = (rgba << 8) | (aOpaqueAlpha ? 0x000000FF : rgba >> 24);
} else if (aSrcRGBShift > aDstRGBShift) {
rgba = (rgba >> 8) | (aOpaqueAlpha ? 0xFF000000 : rgba << 24);
} else if (aOpaqueAlpha) {
rgba |= 0xFF << aDstAShift;
}
*reinterpret_cast<uint32_t*>(aDst) = rgba;
aSrc += 4;
aDst += 4;
} while (aSrc < end);
SwizzleChunkFallback<aSwapRB, aOpaqueAlpha, aSrcRGBShift, aSrcAShift,
aDstRGBShift, aDstAShift>(aSrc, aDst, aSize.width);
aSrc += aSrcGap;
aDst += aDstGap;
}
@@ -503,6 +631,14 @@ static void SwizzleFallback(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
RGBBitShift(aSrcFormat), AlphaBitShift(aSrcFormat), \
RGBBitShift(aDstFormat), AlphaBitShift(aDstFormat)>)
#define SWIZZLE_ROW_FALLBACK(aSrcFormat, aDstFormat) \
FORMAT_CASE_ROW( \
aSrcFormat, aDstFormat, \
SwizzleRowFallback<ShouldSwapRB(aSrcFormat, aDstFormat), \
ShouldForceOpaque(aSrcFormat, aDstFormat), \
RGBBitShift(aSrcFormat), AlphaBitShift(aSrcFormat), \
RGBBitShift(aDstFormat), AlphaBitShift(aDstFormat)>)
// Fast-path for matching formats.
static void SwizzleCopy(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
int32_t aDstGap, IntSize aSize, int32_t aBPP) {
@@ -517,26 +653,39 @@ static void SwizzleCopy(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
}
// Fast-path for conversions that swap all bytes.
template <bool aOpaqueAlpha, uint32_t aSrcAShift, uint32_t aDstAShift>
static void SwizzleChunkSwap(const uint8_t*& aSrc, uint8_t*& aDst,
int32_t aLength) {
const uint8_t* end = aSrc + 4 * aLength;
do {
// Use an endian swap to move the bytes, i.e. BGRA -> ARGB.
uint32_t rgba = *reinterpret_cast<const uint32_t*>(aSrc);
#if MOZ_LITTLE_ENDIAN
rgba = NativeEndian::swapToBigEndian(rgba);
#else
rgba = NativeEndian::swapToLittleEndian(rgba);
#endif
if (aOpaqueAlpha) {
rgba |= 0xFF << aDstAShift;
}
*reinterpret_cast<uint32_t*>(aDst) = rgba;
aSrc += 4;
aDst += 4;
} while (aSrc < end);
}
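A worked example of the endian swap, again with illustrative values: B8G8R8A8 bytes B=0x10, G=0x20, R=0x30, A=0x40 load on little-endian as 0x40302010, and reversing the bytes gives 0x10203040, which stores back as bytes A,R,G,B:

static_assert(((0x40302010u >> 24) | ((0x40302010u >> 8) & 0x0000FF00u) |
               ((0x40302010u << 8) & 0x00FF0000u) | (0x40302010u << 24)) ==
                  0x10203040u,
              "BGRA dword byte-swaps into ARGB byte order");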
template <bool aOpaqueAlpha, uint32_t aSrcAShift, uint32_t aDstAShift>
static void SwizzleRowSwap(const uint8_t* aSrc, uint8_t* aDst,
int32_t aLength) {
SwizzleChunkSwap<aOpaqueAlpha, aSrcAShift, aDstAShift>(aSrc, aDst, aLength);
}
template <bool aOpaqueAlpha, uint32_t aSrcAShift, uint32_t aDstAShift>
static void SwizzleSwap(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
int32_t aDstGap, IntSize aSize) {
for (int32_t height = aSize.height; height > 0; height--) {
const uint8_t* end = aSrc + 4 * aSize.width;
do {
// Use an endian swap to move the bytes, i.e. BGRA -> ARGB.
uint32_t rgba = *reinterpret_cast<const uint32_t*>(aSrc);
#if MOZ_LITTLE_ENDIAN
rgba = NativeEndian::swapToBigEndian(rgba);
#else
rgba = NativeEndian::swapToLittleEndian(rgba);
#endif
if (aOpaqueAlpha) {
rgba |= 0xFF << aDstAShift;
}
*reinterpret_cast<uint32_t*>(aDst) = rgba;
aSrc += 4;
aDst += 4;
} while (aSrc < end);
SwizzleChunkSwap<aOpaqueAlpha, aSrcAShift, aDstAShift>(aSrc, aDst,
aSize.width);
aSrc += aSrcGap;
aDst += aDstGap;
}
@@ -548,34 +697,61 @@ static void SwizzleSwap(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
SwizzleSwap<ShouldForceOpaque(aSrcFormat, aDstFormat), \
AlphaBitShift(aSrcFormat), AlphaBitShift(aDstFormat)>)
#define SWIZZLE_ROW_SWAP(aSrcFormat, aDstFormat) \
FORMAT_CASE_ROW( \
aSrcFormat, aDstFormat, \
SwizzleRowSwap<ShouldForceOpaque(aSrcFormat, aDstFormat), \
AlphaBitShift(aSrcFormat), AlphaBitShift(aDstFormat)>)
// Fast-path for conversions that force alpha to opaque.
template <uint32_t aDstAShift>
static void SwizzleChunkOpaqueUpdate(uint8_t*& aBuffer, int32_t aLength) {
const uint8_t* end = aBuffer + 4 * aLength;
do {
uint32_t rgba = *reinterpret_cast<const uint32_t*>(aBuffer);
// Just add on the alpha bits to the source.
rgba |= 0xFF << aDstAShift;
*reinterpret_cast<uint32_t*>(aBuffer) = rgba;
aBuffer += 4;
} while (aBuffer < end);
}
template <uint32_t aDstAShift>
static void SwizzleChunkOpaqueCopy(const uint8_t*& aSrc, uint8_t* aDst,
int32_t aLength) {
const uint8_t* end = aSrc + 4 * aLength;
do {
uint32_t rgba = *reinterpret_cast<const uint32_t*>(aSrc);
// Just add on the alpha bits to the source.
rgba |= 0xFF << aDstAShift;
*reinterpret_cast<uint32_t*>(aDst) = rgba;
aSrc += 4;
aDst += 4;
} while (aSrc < end);
}
template <uint32_t aDstAShift>
static void SwizzleRowOpaque(const uint8_t* aSrc, uint8_t* aDst,
int32_t aLength) {
if (aSrc == aDst) {
SwizzleChunkOpaqueUpdate<aDstAShift>(aDst, aLength);
} else {
SwizzleChunkOpaqueCopy<aDstAShift>(aSrc, aDst, aLength);
}
}
template <uint32_t aDstAShift>
static void SwizzleOpaque(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
int32_t aDstGap, IntSize aSize) {
if (aSrc == aDst) {
// Modifying in-place, so just write out the alpha.
for (int32_t height = aSize.height; height > 0; height--) {
const uint8_t* end = aDst + 4 * aSize.width;
do {
// ORing directly onto destination memory profiles faster than writing
// individually to the alpha byte and also profiles equivalently to a
// SSE2 implementation.
*reinterpret_cast<uint32_t*>(aDst) |= 0xFF << aDstAShift;
aDst += 4;
} while (aDst < end);
SwizzleChunkOpaqueUpdate<aDstAShift>(aDst, aSize.width);
aDst += aDstGap;
}
} else {
for (int32_t height = aSize.height; height > 0; height--) {
const uint8_t* end = aSrc + 4 * aSize.width;
do {
uint32_t rgba = *reinterpret_cast<const uint32_t*>(aSrc);
// Just add on the alpha bits to the source.
rgba |= 0xFF << aDstAShift;
*reinterpret_cast<uint32_t*>(aDst) = rgba;
aSrc += 4;
aDst += 4;
} while (aSrc < end);
SwizzleChunkOpaqueCopy<aDstAShift>(aSrc, aDst, aSize.width);
aSrc += aSrcGap;
aDst += aDstGap;
}
@@ -585,6 +761,10 @@ static void SwizzleOpaque(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
#define SWIZZLE_OPAQUE(aSrcFormat, aDstFormat) \
FORMAT_CASE(aSrcFormat, aDstFormat, SwizzleOpaque<AlphaBitShift(aDstFormat)>)
#define SWIZZLE_ROW_OPAQUE(aSrcFormat, aDstFormat) \
FORMAT_CASE_ROW(aSrcFormat, aDstFormat, \
SwizzleRowOpaque<AlphaBitShift(aDstFormat)>)
// Packing of 32-bit formats to RGB565.
template <bool aSwapRB, uint32_t aSrcRGBShift, uint32_t aSrcRGBIndex>
static void PackToRGB565(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
@@ -780,5 +960,60 @@ bool SwizzleData(const uint8_t* aSrc, int32_t aSrcStride,
return false;
}
SwizzleRowFn SwizzleRow(SurfaceFormat aSrcFormat, SurfaceFormat aDstFormat) {
#ifdef USE_SSE2
if (mozilla::supports_sse2()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
SWIZZLE_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
SWIZZLE_ROW_SSE2(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8X8)
SWIZZLE_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
SWIZZLE_ROW_SSE2(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8A8)
SWIZZLE_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
SWIZZLE_ROW_SSE2(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8X8)
SWIZZLE_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
SWIZZLE_ROW_SSE2(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8A8)
default:
break;
}
#endif
#ifdef USE_NEON
if (mozilla::supports_neon()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
SWIZZLE_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
SWIZZLE_ROW_NEON(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8X8)
SWIZZLE_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
SWIZZLE_ROW_NEON(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8A8)
SWIZZLE_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
SWIZZLE_ROW_NEON(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8X8)
SWIZZLE_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
SWIZZLE_ROW_NEON(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8A8)
default:
break;
}
#endif
switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
SWIZZLE_ROW_FALLBACK(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
SWIZZLE_ROW_FALLBACK(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8X8)
SWIZZLE_ROW_FALLBACK(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
SWIZZLE_ROW_FALLBACK(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8A8)
SWIZZLE_ROW_FALLBACK(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
SWIZZLE_ROW_FALLBACK(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8X8)
SWIZZLE_ROW_FALLBACK(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
SWIZZLE_ROW_FALLBACK(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8A8)
SWIZZLE_ROW_OPAQUE(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8X8)
SWIZZLE_ROW_OPAQUE(SurfaceFormat::B8G8R8X8, SurfaceFormat::B8G8R8A8)
SWIZZLE_ROW_OPAQUE(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8X8)
SWIZZLE_ROW_OPAQUE(SurfaceFormat::R8G8B8X8, SurfaceFormat::R8G8B8A8)
default:
break;
}
MOZ_ASSERT_UNREACHABLE("Unsupported swizzle formats");
return nullptr;
}
} // namespace gfx
} // namespace mozilla

View file

@@ -41,6 +41,22 @@ GFX2D_API bool SwizzleData(const uint8_t* aSrc, int32_t aSrcStride,
int32_t aDstStride, SurfaceFormat aDstFormat,
const IntSize& aSize);
/**
* Swizzles source and writes it to destination. Source and destination may be
* the same to swizzle in-place.
*/
typedef void (*SwizzleRowFn)(const uint8_t* aSrc, uint8_t* aDst,
                             int32_t aLength);
/**
* Get a function pointer to perform premultiplication between two formats.
*/
GFX2D_API SwizzleRowFn PremultiplyRow(SurfaceFormat aSrcFormat,
                                      SurfaceFormat aDstFormat);
/**
* Get a function pointer to perform swizzling between two formats.
*/
GFX2D_API SwizzleRowFn SwizzleRow(SurfaceFormat aSrcFormat,
                                  SurfaceFormat aDstFormat);
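Because a SwizzleRowFn permits aSrc == aDst, a decoder can premultiply a freshly written row in place. A minimal sketch (rowBuf and width are illustrative names):

SwizzleRowFn premultiply =
    PremultiplyRow(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8A8);
premultiply(rowBuf, rowBuf, width);  // source and destination alias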
} // namespace gfx
} // namespace mozilla

View file

@@ -85,6 +85,36 @@ PremultiplyVector_NEON(const uint16x8_t& aSrc) {
return vsriq_n_u16(ga, rb, 8);
}
template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE void PremultiplyChunk_NEON(const uint8_t*& aSrc,
uint8_t*& aDst,
int32_t aAlignedRow,
int32_t aRemainder) {
// Process all 4-pixel chunks as one vector.
for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
px = PremultiplyVector_NEON<aSwapRB, aOpaqueAlpha>(px);
vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
aSrc += 4 * 4;
aDst += 4 * 4;
}
// Handle any 1-3 remaining pixels.
if (aRemainder) {
uint16x8_t px = LoadRemainder_NEON(aSrc, aRemainder);
px = PremultiplyVector_NEON<aSwapRB, aOpaqueAlpha>(px);
StoreRemainder_NEON(aDst, aRemainder, px);
}
}
template <bool aSwapRB, bool aOpaqueAlpha>
void PremultiplyRow_NEON(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
int32_t alignedRow = 4 * (aLength & ~3);
int32_t remainder = aLength & 3;
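// e.g. aLength = 7 gives alignedRow = 16 bytes (4 whole pixels), remainder = 3.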
PremultiplyChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
remainder);
}
template <bool aSwapRB, bool aOpaqueAlpha>
void Premultiply_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
int32_t aDstGap, IntSize aSize) {
@@ -95,28 +125,22 @@ void Premultiply_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
aDstGap += 4 * remainder;
for (int32_t height = aSize.height; height > 0; height--) {
// Process all 4-pixel chunks as one vector.
for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) {
uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
px = PremultiplyVector_NEON<aSwapRB, aOpaqueAlpha>(px);
vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
aSrc += 4 * 4;
aDst += 4 * 4;
}
// Handle any 1-3 remaining pixels.
if (remainder) {
uint16x8_t px = LoadRemainder_NEON(aSrc, remainder);
px = PremultiplyVector_NEON<aSwapRB, aOpaqueAlpha>(px);
StoreRemainder_NEON(aDst, remainder, px);
}
PremultiplyChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
remainder);
aSrc += aSrcGap;
aDst += aDstGap;
}
}
// Force instantiation of premultiply variants here.
template void PremultiplyRow_NEON<false, false>(const uint8_t*, uint8_t*,
int32_t);
template void PremultiplyRow_NEON<false, true>(const uint8_t*, uint8_t*,
int32_t);
template void PremultiplyRow_NEON<true, false>(const uint8_t*, uint8_t*,
int32_t);
template void PremultiplyRow_NEON<true, true>(const uint8_t*, uint8_t*,
int32_t);
template void Premultiply_NEON<false, false>(const uint8_t*, int32_t, uint8_t*,
int32_t, IntSize);
template void Premultiply_NEON<false, true>(const uint8_t*, int32_t, uint8_t*,
@@ -258,7 +282,7 @@ template void Unpremultiply_NEON<true>(const uint8_t*, int32_t, uint8_t*,
// Swizzle a vector of 4 pixels providing swaps and opaquifying.
template <bool aSwapRB, bool aOpaqueAlpha>
MOZ_ALWAYS_INLINE uint16x8_t SwizzleVector_NEON(const uint16x8_t& aSrc) {
static MOZ_ALWAYS_INLINE uint16x8_t SwizzleVector_NEON(const uint16x8_t& aSrc) {
// Swap R and B, then add to G and A (forced to 255):
// (((src>>16) | (src << 16)) & 0x00FF00FF) |
// ((src | 0xFF000000) & ~0x00FF00FF)
@@ -275,7 +299,7 @@ MOZ_ALWAYS_INLINE uint16x8_t SwizzleVector_NEON(const uint16x8_t& aSrc) {
// Optimized implementations for when there is no R and B swap.
template<>
MOZ_ALWAYS_INLINE uint16x8_t
static MOZ_ALWAYS_INLINE uint16x8_t
SwizzleVector_NEON<false, true>(const uint16x8_t& aSrc)
{
// Force alpha to 255.
@@ -283,13 +307,42 @@ SwizzleVector_NEON<false, true>(const uint16x8_t& aSrc)
}
template<>
MOZ_ALWAYS_INLINE uint16x8_t
static MOZ_ALWAYS_INLINE uint16x8_t
SwizzleVector_NEON<false, false>(const uint16x8_t& aSrc)
{
return aSrc;
}
#endif
template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE void SwizzleChunk_NEON(const uint8_t*& aSrc,
uint8_t*& aDst,
int32_t aAlignedRow,
int32_t aRemainder) {
// Process all 4-pixel chunks as one vector.
for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
px = SwizzleVector_NEON<aSwapRB, aOpaqueAlpha>(px);
vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
aSrc += 4 * 4;
aDst += 4 * 4;
}
// Handle any 1-3 remaining pixels.
if (aRemainder) {
uint16x8_t px = LoadRemainder_NEON(aSrc, aRemainder);
px = SwizzleVector_NEON<aSwapRB, aOpaqueAlpha>(px);
StoreRemainder_NEON(aDst, aRemainder, px);
}
}
template <bool aSwapRB, bool aOpaqueAlpha>
void SwizzleRow_NEON(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
int32_t alignedRow = 4 * (aLength & ~3);
int32_t remainder = aLength & 3;
SwizzleChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, remainder);
}
template <bool aSwapRB, bool aOpaqueAlpha>
void Swizzle_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
int32_t aDstGap, IntSize aSize) {
@@ -300,28 +353,16 @@ void Swizzle_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
aDstGap += 4 * remainder;
for (int32_t height = aSize.height; height > 0; height--) {
// Process all 4-pixel chunks as one vector.
for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) {
uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
px = SwizzleVector_NEON<aSwapRB, aOpaqueAlpha>(px);
vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
aSrc += 4 * 4;
aDst += 4 * 4;
}
// Handle any 1-3 remaining pixels.
if (remainder) {
uint16x8_t px = LoadRemainder_NEON(aSrc, remainder);
px = SwizzleVector_NEON<aSwapRB, aOpaqueAlpha>(px);
StoreRemainder_NEON(aDst, remainder, px);
}
SwizzleChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
remainder);
aSrc += aSrcGap;
aDst += aDstGap;
}
}
// Force instantiation of swizzle variants here.
template void SwizzleRow_NEON<true, false>(const uint8_t*, uint8_t*, int32_t);
template void SwizzleRow_NEON<true, true>(const uint8_t*, uint8_t*, int32_t);
template void Swizzle_NEON<true, false>(const uint8_t*, int32_t, uint8_t*,
int32_t, IntSize);
template void Swizzle_NEON<true, true>(const uint8_t*, int32_t, uint8_t*,

View file

@@ -88,6 +88,38 @@ static MOZ_ALWAYS_INLINE __m128i PremultiplyVector_SSE2(const __m128i& aSrc) {
return _mm_or_si128(rb, ga);
}
// Premultiply a chunk of aAlignedRow bytes plus aRemainder trailing pixels.
template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE void PremultiplyChunk_SSE2(const uint8_t*& aSrc,
uint8_t*& aDst,
int32_t aAlignedRow,
int32_t aRemainder) {
// Process all 4-pixel chunks as one vector.
for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
__m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
px = PremultiplyVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
_mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
aSrc += 4 * 4;
aDst += 4 * 4;
}
// Handle any 1-3 remaining pixels.
if (aRemainder) {
__m128i px = LoadRemainder_SSE2(aSrc, aRemainder);
px = PremultiplyVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
StoreRemainder_SSE2(aDst, aRemainder, px);
}
}
// Premultiply a full row of aLength pixels.
template <bool aSwapRB, bool aOpaqueAlpha>
void PremultiplyRow_SSE2(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
int32_t alignedRow = 4 * (aLength & ~3);
int32_t remainder = aLength & 3;
PremultiplyChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
remainder);
}
template <bool aSwapRB, bool aOpaqueAlpha>
void Premultiply_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
int32_t aDstGap, IntSize aSize) {
@@ -98,28 +130,22 @@ void Premultiply_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
aDstGap += 4 * remainder;
for (int32_t height = aSize.height; height > 0; height--) {
// Process all 4-pixel chunks as one vector.
for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) {
__m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
px = PremultiplyVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
_mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
aSrc += 4 * 4;
aDst += 4 * 4;
}
// Handle any 1-3 remaining pixels.
if (remainder) {
__m128i px = LoadRemainder_SSE2(aSrc, remainder);
px = PremultiplyVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
StoreRemainder_SSE2(aDst, remainder, px);
}
PremultiplyChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
remainder);
aSrc += aSrcGap;
aDst += aDstGap;
}
}
// Force instantiation of premultiply variants here.
template void PremultiplyRow_SSE2<false, false>(const uint8_t*, uint8_t*,
int32_t);
template void PremultiplyRow_SSE2<false, true>(const uint8_t*, uint8_t*,
int32_t);
template void PremultiplyRow_SSE2<true, false>(const uint8_t*, uint8_t*,
int32_t);
template void PremultiplyRow_SSE2<true, true>(const uint8_t*, uint8_t*,
int32_t);
template void Premultiply_SSE2<false, false>(const uint8_t*, int32_t, uint8_t*,
int32_t, IntSize);
template void Premultiply_SSE2<false, true>(const uint8_t*, int32_t, uint8_t*,
@@ -293,6 +319,35 @@ SwizzleVector_SSE2<false, false>(const __m128i& aSrc)
}
#endif
template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE void SwizzleChunk_SSE2(const uint8_t*& aSrc,
uint8_t*& aDst,
int32_t aAlignedRow,
int32_t aRemainder) {
// Process all 4-pixel chunks as one vector.
for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
__m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
px = SwizzleVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
_mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
aSrc += 4 * 4;
aDst += 4 * 4;
}
// Handle any 1-3 remaining pixels.
if (aRemainder) {
__m128i px = LoadRemainder_SSE2(aSrc, aRemainder);
px = SwizzleVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
StoreRemainder_SSE2(aDst, aRemainder, px);
}
}
template <bool aSwapRB, bool aOpaqueAlpha>
void SwizzleRow_SSE2(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
int32_t alignedRow = 4 * (aLength & ~3);
int32_t remainder = aLength & 3;
SwizzleChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, remainder);
}
template <bool aSwapRB, bool aOpaqueAlpha>
void Swizzle_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
int32_t aDstGap, IntSize aSize) {
@@ -303,28 +358,15 @@ void Swizzle_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
aDstGap += 4 * remainder;
for (int32_t height = aSize.height; height > 0; height--) {
// Process all 4-pixel chunks as one vector.
for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) {
__m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
px = SwizzleVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
_mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
aSrc += 4 * 4;
aDst += 4 * 4;
}
// Handle any 1-3 remaining pixels.
if (remainder) {
__m128i px = LoadRemainder_SSE2(aSrc, remainder);
px = SwizzleVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
StoreRemainder_SSE2(aDst, remainder, px);
}
SwizzleChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, remainder);
aSrc += aSrcGap;
aDst += aDstGap;
}
}
// Force instantiation of swizzle variants here.
template void SwizzleRow_SSE2<true, false>(const uint8_t*, uint8_t*, int32_t);
template void SwizzleRow_SSE2<true, true>(const uint8_t*, uint8_t*, int32_t);
template void Swizzle_SSE2<true, false>(const uint8_t*, int32_t, uint8_t*,
int32_t, IntSize);
template void Swizzle_SSE2<true, true>(const uint8_t*, int32_t, uint8_t*,