diff --git a/gfx/2d/Swizzle.cpp b/gfx/2d/Swizzle.cpp index 6844b5813dbb..df51da2d1a22 100644 --- a/gfx/2d/Swizzle.cpp +++ b/gfx/2d/Swizzle.cpp @@ -41,6 +41,10 @@ namespace gfx { #define FORMAT_CASE(aSrcFormat, aDstFormat, ...) \ FORMAT_CASE_EXPR(aSrcFormat, aDstFormat, FORMAT_CASE_CALL(__VA_ARGS__)) +#define FORMAT_CASE_ROW(aSrcFormat, aDstFormat, ...) \ + case FORMAT_KEY(aSrcFormat, aDstFormat): \ + return &__VA_ARGS__; + /** * Constexpr functions for analyzing format attributes in templates. */ @@ -114,6 +118,15 @@ void Premultiply_SSE2(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize); Premultiply_SSE2) +template +void PremultiplyRow_SSE2(const uint8_t*, uint8_t*, int32_t); + +# define PREMULTIPLY_ROW_SSE2(aSrcFormat, aDstFormat) \ + FORMAT_CASE_ROW( \ + aSrcFormat, aDstFormat, \ + PremultiplyRow_SSE2) + template void Unpremultiply_SSE2(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize); @@ -129,6 +142,15 @@ void Swizzle_SSE2(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize); Swizzle_SSE2) +template +void SwizzleRow_SSE2(const uint8_t*, uint8_t*, int32_t); + +# define SWIZZLE_ROW_SSE2(aSrcFormat, aDstFormat) \ + FORMAT_CASE_ROW( \ + aSrcFormat, aDstFormat, \ + SwizzleRow_SSE2) + #endif #ifdef USE_NEON @@ -144,6 +166,15 @@ void Premultiply_NEON(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize); Premultiply_NEON) +template +void PremultiplyRow_NEON(const uint8_t*, uint8_t*, int32_t); + +# define PREMULTIPLY_ROW_NEON(aSrcFormat, aDstFormat) \ + FORMAT_CASE_ROW( \ + aSrcFormat, aDstFormat, \ + PremultiplyRow_NEON) + template void Unpremultiply_NEON(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize); @@ -159,6 +190,14 @@ void Swizzle_NEON(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize); Swizzle_NEON) +template +void SwizzleRow_NEON(const uint8_t*, uint8_t*, int32_t); + +# define SWIZZLE_ROW_NEON(aSrcFormat, aDstFormat) \ + FORMAT_CASE_ROW( \ + aSrcFormat, aDstFormat, \ + SwizzleRow_NEON) #endif /** @@ -171,51 +210,65 @@ void Swizzle_NEON(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize); // 2-component vectors. Otherwise, an approximation if divide-by-255 is used // which is faster than an actual division. These optimizations are also used // for the SSE2 and NEON implementations. +template +static void PremultiplyChunkFallback(const uint8_t*& aSrc, uint8_t*& aDst, + int32_t aLength) { + const uint8_t* end = aSrc + 4 * aLength; + do { + // Load and process 1 entire pixel at a time. + uint32_t color = *reinterpret_cast(aSrc); + + uint32_t a = aSrcAShift ? color >> aSrcAShift : color & 0xFF; + + // Isolate the R and B components. + uint32_t rb = (color >> aSrcRGBShift) & 0x00FF00FF; + // Swap the order of R and B if necessary. + if (aSwapRB) { + rb = (rb >> 16) | (rb << 16); + } + // Approximate the multiply by alpha and divide by 255 which is + // essentially: + // c = c*a + 255; c = (c + (c >> 8)) >> 8; + // However, we omit the final >> 8 to fold it with the final shift into + // place depending on desired output format. + rb = rb * a + 0x00FF00FF; + rb = (rb + ((rb >> 8) & 0x00FF00FF)) & 0xFF00FF00; + + // Use same approximation as above, but G is shifted 8 bits left. + // Alpha is left out and handled separately. + uint32_t g = color & (0xFF00 << aSrcRGBShift); + g = g * a + (0xFF00 << aSrcRGBShift); + g = (g + (g >> 8)) & (0xFF0000 << aSrcRGBShift); + + // The above math leaves RGB shifted left by 8 bits. + // Shift them right if required for the output format. + // then combine them back together to produce output pixel. 
+ // Add the alpha back on if the output format is not opaque. + *reinterpret_cast(aDst) = + (rb >> (8 - aDstRGBShift)) | (g >> (8 + aSrcRGBShift - aDstRGBShift)) | + (aOpaqueAlpha ? 0xFF << aDstAShift : a << aDstAShift); + + aSrc += 4; + aDst += 4; + } while (aSrc < end); +} + +template +static void PremultiplyRowFallback(const uint8_t* aSrc, uint8_t* aDst, + int32_t aLength) { + PremultiplyChunkFallback(aSrc, aDst, aLength); +} + template static void PremultiplyFallback(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst, int32_t aDstGap, IntSize aSize) { for (int32_t height = aSize.height; height > 0; height--) { - const uint8_t* end = aSrc + 4 * aSize.width; - do { - // Load and process 1 entire pixel at a time. - uint32_t color = *reinterpret_cast(aSrc); - - uint32_t a = aSrcAShift ? color >> aSrcAShift : color & 0xFF; - - // Isolate the R and B components. - uint32_t rb = (color >> aSrcRGBShift) & 0x00FF00FF; - // Swap the order of R and B if necessary. - if (aSwapRB) { - rb = (rb >> 16) | (rb << 16); - } - // Approximate the multiply by alpha and divide by 255 which is - // essentially: - // c = c*a + 255; c = (c + (c >> 8)) >> 8; - // However, we omit the final >> 8 to fold it with the final shift into - // place depending on desired output format. - rb = rb * a + 0x00FF00FF; - rb = (rb + ((rb >> 8) & 0x00FF00FF)) & 0xFF00FF00; - - // Use same approximation as above, but G is shifted 8 bits left. - // Alpha is left out and handled separately. - uint32_t g = color & (0xFF00 << aSrcRGBShift); - g = g * a + (0xFF00 << aSrcRGBShift); - g = (g + (g >> 8)) & (0xFF0000 << aSrcRGBShift); - - // The above math leaves RGB shifted left by 8 bits. - // Shift them right if required for the output format. - // then combine them back together to produce output pixel. - // Add the alpha back on if the output format is not opaque. - *reinterpret_cast(aDst) = - (rb >> (8 - aDstRGBShift)) | - (g >> (8 + aSrcRGBShift - aDstRGBShift)) | - (aOpaqueAlpha ? 0xFF << aDstAShift : a << aDstAShift); - - aSrc += 4; - aDst += 4; - } while (aSrc < end); - + PremultiplyChunkFallback(aSrc, aDst, aSize.width); aSrc += aSrcGap; aDst += aDstGap; } @@ -237,6 +290,22 @@ static void PremultiplyFallback(const uint8_t* aSrc, int32_t aSrcGap, PREMULTIPLY_FALLBACK_CASE(aSrcFormat, SurfaceFormat::A8R8G8B8) \ PREMULTIPLY_FALLBACK_CASE(aSrcFormat, SurfaceFormat::X8R8G8B8) +#define PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, aDstFormat) \ + FORMAT_CASE_ROW(aSrcFormat, aDstFormat, \ + PremultiplyRowFallback< \ + ShouldSwapRB(aSrcFormat, aDstFormat), \ + ShouldForceOpaque(aSrcFormat, aDstFormat), \ + RGBBitShift(aSrcFormat), AlphaBitShift(aSrcFormat), \ + RGBBitShift(aDstFormat), AlphaBitShift(aDstFormat)>) + +#define PREMULTIPLY_ROW_FALLBACK(aSrcFormat) \ + PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::B8G8R8A8) \ + PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::B8G8R8X8) \ + PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::R8G8B8A8) \ + PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::R8G8B8X8) \ + PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::A8R8G8B8) \ + PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::X8R8G8B8) + // If rows are tightly packed, and the size of the total area will fit within // the precision range of a single row, then process all the data as if it was // a single row. 
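As a side note on the arithmetic in PremultiplyChunkFallback above: the alpha scaling deliberately avoids a per-channel division. The following standalone snippet (not part of the patch, just a sanity check) brute-forces every 8-bit color/alpha pair and confirms that the `t = c*a + 255; (t + (t >> 8)) >> 8` trick never lands more than one step above the exact `c*a/255`:

```cpp
// Standalone sanity check (not part of the patch) for the divide-by-255
// approximation used in PremultiplyChunkFallback:
//   t = c*a + 255;  approx = (t + (t >> 8)) >> 8;
// For every 8-bit color/alpha pair this lands on floor(c*a/255) or at most
// one above it, which is accurate enough for premultiplication.
#include <cstdint>
#include <cstdio>
#include <cstdlib>

int main() {
  for (uint32_t c = 0; c <= 255; ++c) {
    for (uint32_t a = 0; a <= 255; ++a) {
      uint32_t t = c * a + 255;
      uint32_t approx = (t + (t >> 8)) >> 8;
      uint32_t exact = (c * a) / 255;  // true quotient, rounded down
      if (approx < exact || approx > exact + 1) {
        printf("mismatch at c=%u a=%u: approx=%u exact=%u\n", (unsigned)c,
               (unsigned)a, (unsigned)approx, (unsigned)exact);
        return EXIT_FAILURE;
      }
    }
  }
  puts("approximation stays within 1 of c*a/255 for all inputs");
  return EXIT_SUCCESS;
}
```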
@@ -323,6 +392,50 @@ bool PremultiplyData(const uint8_t* aSrc, int32_t aSrcStride, return false; } +SwizzleRowFn PremultiplyRow(SurfaceFormat aSrcFormat, + SurfaceFormat aDstFormat) { +#ifdef USE_SSE2 + if (mozilla::supports_sse2()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) { + PREMULTIPLY_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8A8) + PREMULTIPLY_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8X8) + PREMULTIPLY_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8) + PREMULTIPLY_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8) + PREMULTIPLY_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8A8) + PREMULTIPLY_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8X8) + PREMULTIPLY_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8) + PREMULTIPLY_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8) + default: + break; + } +#endif + +#ifdef USE_NEON + if (mozilla::supports_neon()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) { + PREMULTIPLY_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8A8) + PREMULTIPLY_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8X8) + PREMULTIPLY_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8) + PREMULTIPLY_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8) + PREMULTIPLY_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8A8) + PREMULTIPLY_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8X8) + PREMULTIPLY_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8) + PREMULTIPLY_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8) + default: + break; + } +#endif + + switch (FORMAT_KEY(aSrcFormat, aDstFormat)) { + PREMULTIPLY_ROW_FALLBACK(SurfaceFormat::B8G8R8A8) + PREMULTIPLY_ROW_FALLBACK(SurfaceFormat::R8G8B8A8) + PREMULTIPLY_ROW_FALLBACK(SurfaceFormat::A8R8G8B8) + default: + break; + } + + MOZ_ASSERT_UNREACHABLE("Unsupported premultiply formats"); + return nullptr; +} + /** * Unpremultiplying */ @@ -457,39 +570,54 @@ bool UnpremultiplyData(const uint8_t* aSrc, int32_t aSrcStride, // Fallback swizzle implementation that uses shifting and masking to reorder // pixels. +template +static void SwizzleChunkFallback(const uint8_t*& aSrc, uint8_t*& aDst, + int32_t aLength) { + const uint8_t* end = aSrc + 4 * aLength; + do { + uint32_t rgba = *reinterpret_cast(aSrc); + + if (aSwapRB) { + // Handle R and B swaps by exchanging words and masking. + uint32_t rb = + ((rgba << 16) | (rgba >> 16)) & (0x00FF00FF << aSrcRGBShift); + uint32_t ga = rgba & ((0xFF << aSrcAShift) | (0xFF00 << aSrcRGBShift)); + rgba = rb | ga; + } + + // If src and dst shifts differ, rotate left or right to move RGB into + // place, i.e. ARGB -> RGBA or ARGB -> RGBA. + if (aDstRGBShift > aSrcRGBShift) { + rgba = (rgba << 8) | (aOpaqueAlpha ? 0x000000FF : rgba >> 24); + } else if (aSrcRGBShift > aDstRGBShift) { + rgba = (rgba >> 8) | (aOpaqueAlpha ? 
0xFF000000 : rgba << 24); + } else if (aOpaqueAlpha) { + rgba |= 0xFF << aDstAShift; + } + + *reinterpret_cast(aDst) = rgba; + + aSrc += 4; + aDst += 4; + } while (aSrc < end); +} + +template +static void SwizzleRowFallback(const uint8_t* aSrc, uint8_t* aDst, + int32_t aLength) { + SwizzleChunkFallback(aSrc, aDst, aLength); +} + template static void SwizzleFallback(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst, int32_t aDstGap, IntSize aSize) { for (int32_t height = aSize.height; height > 0; height--) { - const uint8_t* end = aSrc + 4 * aSize.width; - do { - uint32_t rgba = *reinterpret_cast(aSrc); - - if (aSwapRB) { - // Handle R and B swaps by exchanging words and masking. - uint32_t rb = - ((rgba << 16) | (rgba >> 16)) & (0x00FF00FF << aSrcRGBShift); - uint32_t ga = rgba & ((0xFF << aSrcAShift) | (0xFF00 << aSrcRGBShift)); - rgba = rb | ga; - } - - // If src and dst shifts differ, rotate left or right to move RGB into - // place, i.e. ARGB -> RGBA or ARGB -> RGBA. - if (aDstRGBShift > aSrcRGBShift) { - rgba = (rgba << 8) | (aOpaqueAlpha ? 0x000000FF : rgba >> 24); - } else if (aSrcRGBShift > aDstRGBShift) { - rgba = (rgba >> 8) | (aOpaqueAlpha ? 0xFF000000 : rgba << 24); - } else if (aOpaqueAlpha) { - rgba |= 0xFF << aDstAShift; - } - - *reinterpret_cast(aDst) = rgba; - - aSrc += 4; - aDst += 4; - } while (aSrc < end); - + SwizzleChunkFallback(aSrc, aDst, aSize.width); aSrc += aSrcGap; aDst += aDstGap; } @@ -503,6 +631,14 @@ static void SwizzleFallback(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst, RGBBitShift(aSrcFormat), AlphaBitShift(aSrcFormat), \ RGBBitShift(aDstFormat), AlphaBitShift(aDstFormat)>) +#define SWIZZLE_ROW_FALLBACK(aSrcFormat, aDstFormat) \ + FORMAT_CASE_ROW( \ + aSrcFormat, aDstFormat, \ + SwizzleRowFallback) + // Fast-path for matching formats. static void SwizzleCopy(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst, int32_t aDstGap, IntSize aSize, int32_t aBPP) { @@ -517,26 +653,39 @@ static void SwizzleCopy(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst, } // Fast-path for conversions that swap all bytes. +template +static void SwizzleChunkSwap(const uint8_t*& aSrc, uint8_t*& aDst, + int32_t aLength) { + const uint8_t* end = aSrc + 4 * aLength; + do { + // Use an endian swap to move the bytes, i.e. BGRA -> ARGB. + uint32_t rgba = *reinterpret_cast(aSrc); +#if MOZ_LITTLE_ENDIAN + rgba = NativeEndian::swapToBigEndian(rgba); +#else + rgba = NativeEndian::swapToLittleEndian(rgba); +#endif + if (aOpaqueAlpha) { + rgba |= 0xFF << aDstAShift; + } + *reinterpret_cast(aDst) = rgba; + aSrc += 4; + aDst += 4; + } while (aSrc < end); +} + +template +static void SwizzleRowSwap(const uint8_t* aSrc, uint8_t* aDst, + int32_t aLength) { + SwizzleChunkSwap(aSrc, aDst, aLength); +} + template static void SwizzleSwap(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst, int32_t aDstGap, IntSize aSize) { for (int32_t height = aSize.height; height > 0; height--) { - const uint8_t* end = aSrc + 4 * aSize.width; - do { - // Use an endian swap to move the bytes, i.e. BGRA -> ARGB. 
- uint32_t rgba = *reinterpret_cast(aSrc); -#if MOZ_LITTLE_ENDIAN - rgba = NativeEndian::swapToBigEndian(rgba); -#else - rgba = NativeEndian::swapToLittleEndian(rgba); -#endif - if (aOpaqueAlpha) { - rgba |= 0xFF << aDstAShift; - } - *reinterpret_cast(aDst) = rgba; - aSrc += 4; - aDst += 4; - } while (aSrc < end); + SwizzleChunkSwap(aSrc, aDst, + aSize.width); aSrc += aSrcGap; aDst += aDstGap; } @@ -548,34 +697,61 @@ static void SwizzleSwap(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst, SwizzleSwap) +#define SWIZZLE_ROW_SWAP(aSrcFormat, aDstFormat) \ + FORMAT_CASE_ROW( \ + aSrcFormat, aDstFormat, \ + SwizzleRowSwap) + // Fast-path for conversions that force alpha to opaque. +template +static void SwizzleChunkOpaqueUpdate(uint8_t*& aBuffer, int32_t aLength) { + const uint8_t* end = aBuffer + 4 * aLength; + do { + uint32_t rgba = *reinterpret_cast(aBuffer); + // Just add on the alpha bits to the source. + rgba |= 0xFF << aDstAShift; + *reinterpret_cast(aBuffer) = rgba; + aBuffer += 4; + } while (aBuffer < end); +} + +template +static void SwizzleChunkOpaqueCopy(const uint8_t*& aSrc, uint8_t* aDst, + int32_t aLength) { + const uint8_t* end = aSrc + 4 * aLength; + do { + uint32_t rgba = *reinterpret_cast(aSrc); + // Just add on the alpha bits to the source. + rgba |= 0xFF << aDstAShift; + *reinterpret_cast(aDst) = rgba; + aSrc += 4; + aDst += 4; + } while (aSrc < end); +} + +template +static void SwizzleRowOpaque(const uint8_t* aSrc, uint8_t* aDst, + int32_t aLength) { + if (aSrc == aDst) { + SwizzleChunkOpaqueUpdate(aDst, aLength); + } else { + SwizzleChunkOpaqueCopy(aSrc, aDst, aLength); + } +} + template static void SwizzleOpaque(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst, int32_t aDstGap, IntSize aSize) { if (aSrc == aDst) { // Modifying in-place, so just write out the alpha. for (int32_t height = aSize.height; height > 0; height--) { - const uint8_t* end = aDst + 4 * aSize.width; - do { - // ORing directly onto destination memory profiles faster than writing - // individually to the alpha byte and also profiles equivalently to a - // SSE2 implementation. - *reinterpret_cast(aDst) |= 0xFF << aDstAShift; - aDst += 4; - } while (aDst < end); + SwizzleChunkOpaqueUpdate(aDst, aSize.width); aDst += aDstGap; } } else { for (int32_t height = aSize.height; height > 0; height--) { - const uint8_t* end = aSrc + 4 * aSize.width; - do { - uint32_t rgba = *reinterpret_cast(aSrc); - // Just add on the alpha bits to the source. - rgba |= 0xFF << aDstAShift; - *reinterpret_cast(aDst) = rgba; - aSrc += 4; - aDst += 4; - } while (aSrc < end); + SwizzleChunkOpaqueCopy(aSrc, aDst, aSize.width); aSrc += aSrcGap; aDst += aDstGap; } @@ -585,6 +761,10 @@ static void SwizzleOpaque(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst, #define SWIZZLE_OPAQUE(aSrcFormat, aDstFormat) \ FORMAT_CASE(aSrcFormat, aDstFormat, SwizzleOpaque) +#define SWIZZLE_ROW_OPAQUE(aSrcFormat, aDstFormat) \ + FORMAT_CASE_ROW(aSrcFormat, aDstFormat, \ + SwizzleRowOpaque) + // Packing of 32-bit formats to RGB565. 
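Before the RGB565 packing code below, a quick worked example of the masked R/B swap that SwizzleChunkFallback relies on. This is a minimal scalar sketch (not from the patch), specialized to the little-endian BGRA-to-RGBA case where aSrcRGBShift is 0 and aSrcAShift is 24:

```cpp
// Minimal scalar illustration (not from the patch) of the masked R/B swap
// used by SwizzleChunkFallback, fixed to the little-endian BGRA -> RGBA case.
#include <cassert>
#include <cstdint>

static uint32_t SwapRB(uint32_t rgba) {
  // Exchange the 16-bit halves, then keep only the (now swapped) R/B bytes...
  uint32_t rb = ((rgba << 16) | (rgba >> 16)) & 0x00FF00FF;
  // ...and merge back the untouched G and A bytes.
  uint32_t ga = rgba & 0xFF00FF00;
  return rb | ga;
}

int main() {
  // Little-endian BGRA pixel B=0x10, G=0x20, R=0x30, A=0x40 packs to
  // 0x40302010; the same pixel packed as RGBA is 0x40102030.
  assert(SwapRB(0x40302010u) == 0x40102030u);
  return 0;
}
```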
template static void PackToRGB565(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst, @@ -780,5 +960,60 @@ bool SwizzleData(const uint8_t* aSrc, int32_t aSrcStride, return false; } +SwizzleRowFn SwizzleRow(SurfaceFormat aSrcFormat, SurfaceFormat aDstFormat) { +#ifdef USE_SSE2 + if (mozilla::supports_sse2()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) { + SWIZZLE_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8) + SWIZZLE_ROW_SSE2(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8X8) + SWIZZLE_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8) + SWIZZLE_ROW_SSE2(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8A8) + SWIZZLE_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8) + SWIZZLE_ROW_SSE2(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8X8) + SWIZZLE_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8) + SWIZZLE_ROW_SSE2(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8A8) + default: + break; + } +#endif + +#ifdef USE_NEON + if (mozilla::supports_neon()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) { + SWIZZLE_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8) + SWIZZLE_ROW_NEON(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8X8) + SWIZZLE_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8) + SWIZZLE_ROW_NEON(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8A8) + SWIZZLE_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8) + SWIZZLE_ROW_NEON(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8X8) + SWIZZLE_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8) + SWIZZLE_ROW_NEON(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8A8) + default: + break; + } +#endif + + switch (FORMAT_KEY(aSrcFormat, aDstFormat)) { + SWIZZLE_ROW_FALLBACK(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8) + SWIZZLE_ROW_FALLBACK(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8X8) + SWIZZLE_ROW_FALLBACK(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8) + SWIZZLE_ROW_FALLBACK(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8A8) + + SWIZZLE_ROW_FALLBACK(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8) + SWIZZLE_ROW_FALLBACK(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8X8) + SWIZZLE_ROW_FALLBACK(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8) + SWIZZLE_ROW_FALLBACK(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8A8) + + SWIZZLE_ROW_OPAQUE(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8X8) + SWIZZLE_ROW_OPAQUE(SurfaceFormat::B8G8R8X8, SurfaceFormat::B8G8R8A8) + SWIZZLE_ROW_OPAQUE(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8X8) + SWIZZLE_ROW_OPAQUE(SurfaceFormat::R8G8B8X8, SurfaceFormat::R8G8B8A8) + + default: + break; + } + + MOZ_ASSERT_UNREACHABLE("Unsupported swizzle formats"); + return nullptr; +} + } // namespace gfx } // namespace mozilla diff --git a/gfx/2d/Swizzle.h b/gfx/2d/Swizzle.h index a52471d5d66e..44b4073ee66f 100644 --- a/gfx/2d/Swizzle.h +++ b/gfx/2d/Swizzle.h @@ -41,6 +41,22 @@ GFX2D_API bool SwizzleData(const uint8_t* aSrc, int32_t aSrcStride, int32_t aDstStride, SurfaceFormat aDstFormat, const IntSize& aSize); +/** + * Swizzles source and writes it to destination. Source and destination may be + * the same to swizzle in-place. + */ +typedef void (*SwizzleRowFn)(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength); + +/** + * Get a function pointer to perform premultiplication between two formats. + */ +GFX2D_API SwizzleRowFn PremultiplyRow(SurfaceFormat aSrcFormat, SurfaceFormat aDstFormat); + +/** + * Get a function pointer to perform swizzling between two formats. 
+ */ +GFX2D_API SwizzleRowFn SwizzleRow(SurfaceFormat aSrcFormat, SurfaceFormat aDstFormat); + } // namespace gfx } // namespace mozilla diff --git a/gfx/2d/SwizzleNEON.cpp b/gfx/2d/SwizzleNEON.cpp index 0926e6b4f71c..5e56a5ffe834 100644 --- a/gfx/2d/SwizzleNEON.cpp +++ b/gfx/2d/SwizzleNEON.cpp @@ -85,6 +85,36 @@ PremultiplyVector_NEON(const uint16x8_t& aSrc) { return vsriq_n_u16(ga, rb, 8); } +template +static MOZ_ALWAYS_INLINE void PremultiplyChunk_NEON(const uint8_t*& aSrc, + uint8_t*& aDst, + int32_t aAlignedRow, + int32_t aRemainder) { + // Process all 4-pixel chunks as one vector. + for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) { + uint16x8_t px = vld1q_u16(reinterpret_cast(aSrc)); + px = PremultiplyVector_NEON(px); + vst1q_u16(reinterpret_cast(aDst), px); + aSrc += 4 * 4; + aDst += 4 * 4; + } + + // Handle any 1-3 remaining pixels. + if (aRemainder) { + uint16x8_t px = LoadRemainder_NEON(aSrc, aRemainder); + px = PremultiplyVector_NEON(px); + StoreRemainder_NEON(aDst, aRemainder, px); + } +} + +template +void PremultiplyRow_NEON(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) { + int32_t alignedRow = 4 * (aLength & ~3); + int32_t remainder = aLength & 3; + PremultiplyChunk_NEON(aSrc, aDst, alignedRow, + remainder); +} + template void Premultiply_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst, int32_t aDstGap, IntSize aSize) { @@ -95,28 +125,22 @@ void Premultiply_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst, aDstGap += 4 * remainder; for (int32_t height = aSize.height; height > 0; height--) { - // Process all 4-pixel chunks as one vector. - for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) { - uint16x8_t px = vld1q_u16(reinterpret_cast(aSrc)); - px = PremultiplyVector_NEON(px); - vst1q_u16(reinterpret_cast(aDst), px); - aSrc += 4 * 4; - aDst += 4 * 4; - } - - // Handle any 1-3 remaining pixels. - if (remainder) { - uint16x8_t px = LoadRemainder_NEON(aSrc, remainder); - px = PremultiplyVector_NEON(px); - StoreRemainder_NEON(aDst, remainder, px); - } - + PremultiplyChunk_NEON(aSrc, aDst, alignedRow, + remainder); aSrc += aSrcGap; aDst += aDstGap; } } // Force instantiation of premultiply variants here. +template void PremultiplyRow_NEON(const uint8_t*, uint8_t*, + int32_t); +template void PremultiplyRow_NEON(const uint8_t*, uint8_t*, + int32_t); +template void PremultiplyRow_NEON(const uint8_t*, uint8_t*, + int32_t); +template void PremultiplyRow_NEON(const uint8_t*, uint8_t*, + int32_t); template void Premultiply_NEON(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize); template void Premultiply_NEON(const uint8_t*, int32_t, uint8_t*, @@ -258,7 +282,7 @@ template void Unpremultiply_NEON(const uint8_t*, int32_t, uint8_t*, // Swizzle a vector of 4 pixels providing swaps and opaquifying. template -MOZ_ALWAYS_INLINE uint16x8_t SwizzleVector_NEON(const uint16x8_t& aSrc) { +static MOZ_ALWAYS_INLINE uint16x8_t SwizzleVector_NEON(const uint16x8_t& aSrc) { // Swap R and B, then add to G and A (forced to 255): // (((src>>16) | (src << 16)) & 0x00FF00FF) | // ((src | 0xFF000000) & ~0x00FF00FF) @@ -275,7 +299,7 @@ MOZ_ALWAYS_INLINE uint16x8_t SwizzleVector_NEON(const uint16x8_t& aSrc) { // Optimized implementations for when there is no R and B swap. template<> -MOZ_ALWAYS_INLINE uint16x8_t +static MOZ_ALWAYS_INLINE uint16x8_t SwizzleVector_NEON(const uint16x8_t& aSrc) { // Force alpha to 255. 
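Stepping back to the public API added in Swizzle.h above, here is a hypothetical caller of the new row entry points. The include paths, function names, buffers and strides are assumptions for illustration; only PremultiplyRow, SwizzleRow and SwizzleRowFn come from the patch, and both factories return nullptr for unsupported format pairs:

```cpp
// Hypothetical usage sketch of the new per-row entry points (not in the patch).
#include <cstdint>

#include "mozilla/gfx/Swizzle.h"  // assumed exported header path
#include "mozilla/gfx/Types.h"

using namespace mozilla::gfx;

// Premultiply freshly decoded BGRA rows into an RGBA destination, row by row.
void PremultiplyRows(const uint8_t* aSrc, int32_t aSrcStride, uint8_t* aDst,
                     int32_t aDstStride, int32_t aWidth, int32_t aHeight) {
  SwizzleRowFn premultiply =
      PremultiplyRow(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8);
  if (!premultiply) {
    return;  // unsupported format pair
  }
  for (int32_t y = 0; y < aHeight; y++) {
    premultiply(aSrc + y * aSrcStride, aDst + y * aDstStride, aWidth);
  }
}

// Swizzle a single row in place; source and destination may alias.
void SwapRowInPlace(uint8_t* aRow, int32_t aWidth) {
  SwizzleRowFn swizzle =
      SwizzleRow(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8);
  if (swizzle) {
    swizzle(aRow, aRow, aWidth);
  }
}
```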
@@ -283,13 +307,42 @@ SwizzleVector_NEON(const uint16x8_t& aSrc) } template<> -MOZ_ALWAYS_INLINE uint16x8_t +static MOZ_ALWAYS_INLINE uint16x8_t SwizzleVector_NEON(const uint16x8_t& aSrc) { return aSrc; } #endif +template +static MOZ_ALWAYS_INLINE void SwizzleChunk_NEON(const uint8_t*& aSrc, + uint8_t*& aDst, + int32_t aAlignedRow, + int32_t aRemainder) { + // Process all 4-pixel chunks as one vector. + for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) { + uint16x8_t px = vld1q_u16(reinterpret_cast(aSrc)); + px = SwizzleVector_NEON(px); + vst1q_u16(reinterpret_cast(aDst), px); + aSrc += 4 * 4; + aDst += 4 * 4; + } + + // Handle any 1-3 remaining pixels. + if (aRemainder) { + uint16x8_t px = LoadRemainder_NEON(aSrc, aRemainder); + px = SwizzleVector_NEON(px); + StoreRemainder_NEON(aDst, aRemainder, px); + } +} + +template +void SwizzleRow_NEON(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) { + int32_t alignedRow = 4 * (aLength & ~3); + int32_t remainder = aLength & 3; + SwizzleChunk_NEON(aSrc, aDst, alignedRow, remainder); +} + template void Swizzle_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst, int32_t aDstGap, IntSize aSize) { @@ -300,28 +353,16 @@ void Swizzle_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst, aDstGap += 4 * remainder; for (int32_t height = aSize.height; height > 0; height--) { - // Process all 4-pixel chunks as one vector. - for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) { - uint16x8_t px = vld1q_u16(reinterpret_cast(aSrc)); - px = SwizzleVector_NEON(px); - vst1q_u16(reinterpret_cast(aDst), px); - aSrc += 4 * 4; - aDst += 4 * 4; - } - - // Handle any 1-3 remaining pixels. - if (remainder) { - uint16x8_t px = LoadRemainder_NEON(aSrc, remainder); - px = SwizzleVector_NEON(px); - StoreRemainder_NEON(aDst, remainder, px); - } - + SwizzleChunk_NEON(aSrc, aDst, alignedRow, + remainder); aSrc += aSrcGap; aDst += aDstGap; } } // Force instantiation of swizzle variants here. +template void SwizzleRow_NEON(const uint8_t*, uint8_t*, int32_t); +template void SwizzleRow_NEON(const uint8_t*, uint8_t*, int32_t); template void Swizzle_NEON(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize); template void Swizzle_NEON(const uint8_t*, int32_t, uint8_t*, diff --git a/gfx/2d/SwizzleSSE2.cpp b/gfx/2d/SwizzleSSE2.cpp index 12d742dd730d..38e016809531 100644 --- a/gfx/2d/SwizzleSSE2.cpp +++ b/gfx/2d/SwizzleSSE2.cpp @@ -88,6 +88,38 @@ static MOZ_ALWAYS_INLINE __m128i PremultiplyVector_SSE2(const __m128i& aSrc) { return _mm_or_si128(rb, ga); } +// Premultiply vector of aAlignedRow + aRemainder pixels. +template +static MOZ_ALWAYS_INLINE void PremultiplyChunk_SSE2(const uint8_t*& aSrc, + uint8_t*& aDst, + int32_t aAlignedRow, + int32_t aRemainder) { + // Process all 4-pixel chunks as one vector. + for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) { + __m128i px = _mm_loadu_si128(reinterpret_cast(aSrc)); + px = PremultiplyVector_SSE2(px); + _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px); + aSrc += 4 * 4; + aDst += 4 * 4; + } + + // Handle any 1-3 remaining pixels. + if (aRemainder) { + __m128i px = LoadRemainder_SSE2(aSrc, aRemainder); + px = PremultiplyVector_SSE2(px); + StoreRemainder_SSE2(aDst, aRemainder, px); + } +} + +// Premultiply vector of aLength pixels. 
+template +void PremultiplyRow_SSE2(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) { + int32_t alignedRow = 4 * (aLength & ~3); + int32_t remainder = aLength & 3; + PremultiplyChunk_SSE2(aSrc, aDst, alignedRow, + remainder); +} + template void Premultiply_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst, int32_t aDstGap, IntSize aSize) { @@ -98,28 +130,22 @@ void Premultiply_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst, aDstGap += 4 * remainder; for (int32_t height = aSize.height; height > 0; height--) { - // Process all 4-pixel chunks as one vector. - for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) { - __m128i px = _mm_loadu_si128(reinterpret_cast(aSrc)); - px = PremultiplyVector_SSE2(px); - _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px); - aSrc += 4 * 4; - aDst += 4 * 4; - } - - // Handle any 1-3 remaining pixels. - if (remainder) { - __m128i px = LoadRemainder_SSE2(aSrc, remainder); - px = PremultiplyVector_SSE2(px); - StoreRemainder_SSE2(aDst, remainder, px); - } - + PremultiplyChunk_SSE2(aSrc, aDst, alignedRow, + remainder); aSrc += aSrcGap; aDst += aDstGap; } } // Force instantiation of premultiply variants here. +template void PremultiplyRow_SSE2(const uint8_t*, uint8_t*, + int32_t); +template void PremultiplyRow_SSE2(const uint8_t*, uint8_t*, + int32_t); +template void PremultiplyRow_SSE2(const uint8_t*, uint8_t*, + int32_t); +template void PremultiplyRow_SSE2(const uint8_t*, uint8_t*, + int32_t); template void Premultiply_SSE2(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize); template void Premultiply_SSE2(const uint8_t*, int32_t, uint8_t*, @@ -293,6 +319,35 @@ SwizzleVector_SSE2(const __m128i& aSrc) } #endif +template +static MOZ_ALWAYS_INLINE void SwizzleChunk_SSE2(const uint8_t*& aSrc, + uint8_t*& aDst, + int32_t aAlignedRow, + int32_t aRemainder) { + // Process all 4-pixel chunks as one vector. + for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) { + __m128i px = _mm_loadu_si128(reinterpret_cast(aSrc)); + px = SwizzleVector_SSE2(px); + _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px); + aSrc += 4 * 4; + aDst += 4 * 4; + } + + // Handle any 1-3 remaining pixels. + if (aRemainder) { + __m128i px = LoadRemainder_SSE2(aSrc, aRemainder); + px = SwizzleVector_SSE2(px); + StoreRemainder_SSE2(aDst, aRemainder, px); + } +} + +template +void SwizzleRow_SSE2(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) { + int32_t alignedRow = 4 * (aLength & ~3); + int32_t remainder = aLength & 3; + SwizzleChunk_SSE2(aSrc, aDst, alignedRow, remainder); +} + template void Swizzle_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst, int32_t aDstGap, IntSize aSize) { @@ -303,28 +358,15 @@ void Swizzle_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst, aDstGap += 4 * remainder; for (int32_t height = aSize.height; height > 0; height--) { - // Process all 4-pixel chunks as one vector. - for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) { - __m128i px = _mm_loadu_si128(reinterpret_cast(aSrc)); - px = SwizzleVector_SSE2(px); - _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px); - aSrc += 4 * 4; - aDst += 4 * 4; - } - - // Handle any 1-3 remaining pixels. - if (remainder) { - __m128i px = LoadRemainder_SSE2(aSrc, remainder); - px = SwizzleVector_SSE2(px); - StoreRemainder_SSE2(aDst, remainder, px); - } - + SwizzleChunk_SSE2(aSrc, aDst, alignedRow, remainder); aSrc += aSrcGap; aDst += aDstGap; } } // Force instantiation of swizzle variants here. 
+template void SwizzleRow_SSE2<true, false>(const uint8_t*, uint8_t*, int32_t);
+template void SwizzleRow_SSE2<true, true>(const uint8_t*, uint8_t*, int32_t);
 template void Swizzle_SSE2<true, false>(const uint8_t*, int32_t, uint8_t*,
                                         int32_t, IntSize);
 template void Swizzle_SSE2<true, true>(const uint8_t*, int32_t, uint8_t*,