Backed out changeset 70b7dcfea394 (bug 1622220) for causing failures in gfx/2d/Swizzle.cpp

CLOSED TREE
2020-03-13 15:07:13 +02:00 · 2020-03-13 15:07:13 +02:00 · b0060ff6a8
--- a/gfx/2d/Swizzle.cpp
+++ b/gfx/2d/Swizzle.cpp
@ -134,14 +134,6 @@ void Unpremultiply_SSE2(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
    FORMAT_CASE(aSrcFormat, aDstFormat,              \
                Unpremultiply_SSE2<ShouldSwapRB(aSrcFormat, aDstFormat)>)

-template <bool aSwapRB>
-void UnpremultiplyRow_SSE2(const uint8_t*, uint8_t*, int32_t);
-
-#  define UNPREMULTIPLY_ROW_SSE2(aSrcFormat, aDstFormat) \
-    FORMAT_CASE_ROW(                                     \
-        aSrcFormat, aDstFormat,                          \
-        UnpremultiplyRow_SSE2<ShouldSwapRB(aSrcFormat, aDstFormat)>)
-
 template <bool aSwapRB, bool aOpaqueAlpha>
 void Swizzle_SSE2(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);

@ -206,14 +198,6 @@ void Unpremultiply_NEON(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
    FORMAT_CASE(aSrcFormat, aDstFormat,              \
                Unpremultiply_NEON<ShouldSwapRB(aSrcFormat, aDstFormat)>)

-template <bool aSwapRB>
-void UnpremultiplyRow_NEON(const uint8_t*, uint8_t*, int32_t);
-
-#  define UNPREMULTIPLY_ROW_NEON(aSrcFormat, aDstFormat) \
-    FORMAT_CASE_ROW(                                     \
-        aSrcFormat, aDstFormat,                          \
-        UnpremultiplyRow_NEON<ShouldSwapRB(aSrcFormat, aDstFormat)>)
-
 template <bool aSwapRB, bool aOpaqueAlpha>
 void Swizzle_NEON(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);

@ -507,47 +491,32 @@ static const uint32_t sUnpremultiplyTable[256] = {0,
 // implementation also accesses color components using individual byte accesses
 // as this profiles faster than accessing the pixel as a uint32_t and
 // shifting/masking to access components.
-template <bool aSwapRB, uint32_t aSrcRGBIndex, uint32_t aSrcAIndex,
-          uint32_t aDstRGBIndex, uint32_t aDstAIndex>
-static void UnpremultiplyChunkFallback(const uint8_t*& aSrc, uint8_t*& aDst,
-                                       int32_t aLength) {
-  const uint8_t* end = aSrc + 4 * aLength;
-  do {
-    uint8_t r = aSrc[aSrcRGBIndex + (aSwapRB ? 2 : 0)];
-    uint8_t g = aSrc[aSrcRGBIndex + 1];
-    uint8_t b = aSrc[aSrcRGBIndex + (aSwapRB ? 0 : 2)];
-    uint8_t a = aSrc[aSrcAIndex];
-
-    // Access the 8.16 reciprocal from the table based on alpha. Multiply by
-    // the reciprocal and shift off the fraction bits to approximate the
-    // division by alpha.
-    uint32_t q = sUnpremultiplyTable[a];
-    aDst[aDstRGBIndex + 0] = (r * q) >> 16;
-    aDst[aDstRGBIndex + 1] = (g * q) >> 16;
-    aDst[aDstRGBIndex + 2] = (b * q) >> 16;
-    aDst[aDstAIndex] = a;
-
-    aSrc += 4;
-    aDst += 4;
-  } while (aSrc < end);
-}
-
-template <bool aSwapRB, uint32_t aSrcRGBIndex, uint32_t aSrcAIndex,
-          uint32_t aDstRGBIndex, uint32_t aDstAIndex>
-static void UnpremultiplyRowFallback(const uint8_t* aSrc, uint8_t* aDst,
-                                     int32_t aLength) {
-  UnpremultiplyChunkFallback<aSwapRB, aSrcRGBIndex, aSrcAIndex, aDstRGBIndex,
-                             aDstAIndex>(aSrc, aDst, aLength);
-}
-
 template <bool aSwapRB, uint32_t aSrcRGBIndex, uint32_t aSrcAIndex,
          uint32_t aDstRGBIndex, uint32_t aDstAIndex>
 static void UnpremultiplyFallback(const uint8_t* aSrc, int32_t aSrcGap,
                                  uint8_t* aDst, int32_t aDstGap,
                                  IntSize aSize) {
  for (int32_t height = aSize.height; height > 0; height--) {
-    UnpremultiplyChunkFallback<aSwapRB, aSrcRGBIndex, aSrcAIndex, aDstRGBIndex,
-                               aDstAIndex>(aSrc, aDst, aSize.width);
+    const uint8_t* end = aSrc + 4 * aSize.width;
+    do {
+      uint8_t r = aSrc[aSrcRGBIndex + (aSwapRB ? 2 : 0)];
+      uint8_t g = aSrc[aSrcRGBIndex + 1];
+      uint8_t b = aSrc[aSrcRGBIndex + (aSwapRB ? 0 : 2)];
+      uint8_t a = aSrc[aSrcAIndex];
+
+      // Access the 8.16 reciprocal from the table based on alpha. Multiply by
+      // the reciprocal and shift off the fraction bits to approximate the
+      // division by alpha.
+      uint32_t q = sUnpremultiplyTable[a];
+      aDst[aDstRGBIndex + 0] = (r * q) >> 16;
+      aDst[aDstRGBIndex + 1] = (g * q) >> 16;
+      aDst[aDstRGBIndex + 2] = (b * q) >> 16;
+      aDst[aDstAIndex] = a;
+
+      aSrc += 4;
+      aDst += 4;
+    } while (aSrc < end);
+
    aSrc += aSrcGap;
    aDst += aDstGap;
  }
@ -565,18 +534,6 @@ static void UnpremultiplyFallback(const uint8_t* aSrc, int32_t aSrcGap,
  UNPREMULTIPLY_FALLBACK_CASE(aSrcFormat, SurfaceFormat::R8G8B8A8) \
  UNPREMULTIPLY_FALLBACK_CASE(aSrcFormat, SurfaceFormat::A8R8G8B8)

-#define UNPREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, aDstFormat)             \
-  FORMAT_CASE_ROW(aSrcFormat, aDstFormat,                                   \
-                  UnpremultiplyRowFallback<                                 \
-                      ShouldSwapRB(aSrcFormat, aDstFormat),                 \
-                      RGBByteIndex(aSrcFormat), AlphaByteIndex(aSrcFormat), \
-                      RGBByteIndex(aDstFormat), AlphaByteIndex(aDstFormat)>)
-
-#define UNPREMULTIPLY_ROW_FALLBACK(aSrcFormat)                         \
-  UNPREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::B8G8R8A8) \
-  UNPREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::R8G8B8A8) \
-  UNPREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::A8R8G8B8)
-
 bool UnpremultiplyData(const uint8_t* aSrc, int32_t aSrcStride,
                       SurfaceFormat aSrcFormat, uint8_t* aDst,
                       int32_t aDstStride, SurfaceFormat aDstFormat,
@ -631,42 +588,6 @@ bool UnpremultiplyData(const uint8_t* aSrc, int32_t aSrcStride,
  return false;
 }

-SwizzleRowFn UnpremultiplyRow(SurfaceFormat aSrcFormat,
-                              SurfaceFormat aDstFormat) {
-#ifdef USE_SSE2
-  if (mozilla::supports_sse2()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
-      UNPREMULTIPLY_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8A8)
-      UNPREMULTIPLY_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
-      UNPREMULTIPLY_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8A8)
-      UNPREMULTIPLY_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
-      default:
-        break;
-    }
-#endif
-
-#ifdef USE_NEON
-  if (mozilla::supports_neon()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
-      UNPREMULTIPLY_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8A8)
-      UNPREMULTIPLY_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
-      UNPREMULTIPLY_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8A8)
-      UNPREMULTIPLY_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
-      default:
-        break;
-    }
-#endif
-
-  switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
-    UNPREMULTIPLY_ROW_FALLBACK(SurfaceFormat::B8G8R8A8)
-    UNPREMULTIPLY_ROW_FALLBACK(SurfaceFormat::R8G8B8A8)
-    UNPREMULTIPLY_ROW_FALLBACK(SurfaceFormat::A8R8G8B8)
-    default:
-      break;
-  }
-
-  MOZ_ASSERT_UNREACHABLE("Unsupported premultiply formats");
-  return nullptr;
-}
-
 /**
 * Swizzling
 */
@ -742,15 +663,6 @@ static void SwizzleFallback(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
                         RGBBitShift(aSrcFormat), AlphaBitShift(aSrcFormat), \
                         RGBBitShift(aDstFormat), AlphaBitShift(aDstFormat)>)

-// Fast-path for matching formats.
-template <int32_t aBytesPerPixel>
-static void SwizzleRowCopy(const uint8_t* aSrc, uint8_t* aDst,
-                           int32_t aLength) {
-  if (aSrc != aDst) {
-    memcpy(aDst, aSrc, aLength * aBytesPerPixel);
-  }
-}
-
 // Fast-path for matching formats.
 static void SwizzleCopy(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
                        int32_t aDstGap, IntSize aSize, int32_t aBPP) {
@ -815,41 +727,6 @@ static void SwizzleSwap(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
      SwizzleRowSwap<ShouldForceOpaque(aSrcFormat, aDstFormat), \
                     AlphaBitShift(aSrcFormat), AlphaBitShift(aDstFormat)>)

-static void SwizzleChunkSwapRGB24(const uint8_t*& aSrc, uint8_t*& aDst,
-                                  int32_t aLength) {
-  const uint8_t* end = aSrc + 3 * aLength;
-  do {
-    uint8_t r = aSrc[0];
-    uint8_t g = aSrc[1];
-    uint8_t b = aSrc[2];
-    aDst[0] = b;
-    aDst[1] = g;
-    aDst[2] = r;
-    aSrc += 3;
-    aDst += 3;
-  } while (aSrc < end);
-}
-
-static void SwizzleRowSwapRGB24(const uint8_t* aSrc, uint8_t* aDst,
-                                int32_t aLength) {
-  SwizzleChunkSwapRGB24(aSrc, aDst, aLength);
-}
-
-static void SwizzleSwapRGB24(const uint8_t* aSrc, int32_t aSrcGap,
-                             uint8_t* aDst, int32_t aDstGap, IntSize aSize) {
-  for (int32_t height = aSize.height; height > 0; height--) {
-    SwizzleChunkSwapRGB24(aSrc, aDst, aSize.width);
-    aSrc += aSrcGap;
-    aDst += aDstGap;
-  }
-}
-
-#define SWIZZLE_SWAP_RGB24(aSrcFormat, aDstFormat) \
-  FORMAT_CASE_ROW(aSrcFormat, aDstFormat, SwizzleSwapRGB24)
-
-#define SWIZZLE_ROW_SWAP_RGB24(aSrcFormat, aDstFormat) \
-  FORMAT_CASE_ROW(aSrcFormat, aDstFormat, SwizzleRowSwapRGB24)
-
 // Fast-path for conversions that force alpha to opaque.
 template <uint32_t aDstAShift>
 static void SwizzleChunkOpaqueUpdate(uint8_t*& aBuffer, int32_t aLength) {
@ -946,36 +823,24 @@ static void PackToRGB565(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
 }

 // Packing of 32-bit formats to 24-bit formats.
-template <bool aSwapRB, uint32_t aSrcRGBShift, uint32_t aSrcRGBIndex>
-static void PackChunkToRGB24(const uint8_t*& aSrc, uint8_t*& aDst,
-                             int32_t aLength) {
-  const uint8_t* end = aSrc + 4 * aLength;
-  do {
-    uint8_t r = aSrc[aSrcRGBIndex + (aSwapRB ? 2 : 0)];
-    uint8_t g = aSrc[aSrcRGBIndex + 1];
-    uint8_t b = aSrc[aSrcRGBIndex + (aSwapRB ? 0 : 2)];
-
-    aDst[0] = r;
-    aDst[1] = g;
-    aDst[2] = b;
-
-    aSrc += 4;
-    aDst += 3;
-  } while (aSrc < end);
-}
-
-template <bool aSwapRB, uint32_t aSrcRGBShift, uint32_t aSrcRGBIndex>
-static void PackRowToRGB24(const uint8_t* aSrc, uint8_t* aDst,
-                           int32_t aLength) {
-  PackChunkToRGB24<aSwapRB, aSrcRGBShift, aSrcRGBIndex>(aSrc, aDst, aLength);
-}
-
 template <bool aSwapRB, uint32_t aSrcRGBShift, uint32_t aSrcRGBIndex>
 static void PackToRGB24(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
                        int32_t aDstGap, IntSize aSize) {
  for (int32_t height = aSize.height; height > 0; height--) {
-    PackChunkToRGB24<aSwapRB, aSrcRGBShift, aSrcRGBIndex>(aSrc, aDst,
-                                                          aSize.width);
+    const uint8_t* end = aSrc + 4 * aSize.width;
+    do {
+      uint8_t r = aSrc[aSrcRGBIndex + (aSwapRB ? 2 : 0)];
+      uint8_t g = aSrc[aSrcRGBIndex + 1];
+      uint8_t b = aSrc[aSrcRGBIndex + (aSwapRB ? 0 : 2)];
+
+      aDst[0] = r;
+      aDst[1] = g;
+      aDst[2] = b;
+
+      aSrc += 4;
+      aDst += 3;
+    } while (aSrc < end);
+
    aSrc += aSrcGap;
    aDst += aDstGap;
  }
@ -994,20 +859,6 @@ static void PackToRGB24(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
  PACK_RGB_CASE(SurfaceFormat::A8R8G8B8, aDstFormat, aPackFunc) \
  PACK_RGB_CASE(SurfaceFormat::X8R8G8B8, aDstFormat, aPackFunc)

-#define PACK_ROW_RGB_CASE(aSrcFormat, aDstFormat, aPackFunc)                   \
-  FORMAT_CASE_ROW(                                                             \
-      aSrcFormat, aDstFormat,                                                  \
-      aPackFunc<ShouldSwapRB(aSrcFormat, aDstFormat), RGBBitShift(aSrcFormat), \
-                RGBByteIndex(aSrcFormat)>)
-
-#define PACK_ROW_RGB(aDstFormat, aPackFunc)                         \
-  PACK_ROW_RGB_CASE(SurfaceFormat::B8G8R8A8, aDstFormat, aPackFunc) \
-  PACK_ROW_RGB_CASE(SurfaceFormat::B8G8R8X8, aDstFormat, aPackFunc) \
-  PACK_ROW_RGB_CASE(SurfaceFormat::R8G8B8A8, aDstFormat, aPackFunc) \
-  PACK_ROW_RGB_CASE(SurfaceFormat::R8G8B8X8, aDstFormat, aPackFunc) \
-  PACK_ROW_RGB_CASE(SurfaceFormat::A8R8G8B8, aDstFormat, aPackFunc) \
-  PACK_ROW_RGB_CASE(SurfaceFormat::X8R8G8B8, aDstFormat, aPackFunc)
-
 // Packing of 32-bit formats to A8.
 template <uint32_t aSrcAIndex>
 static void PackToA8(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
@ -1155,9 +1006,6 @@ bool SwizzleData(const uint8_t* aSrc, int32_t aSrcStride,
    SWIZZLE_SWAP(SurfaceFormat::X8R8G8B8, SurfaceFormat::B8G8R8X8)
    SWIZZLE_SWAP(SurfaceFormat::X8R8G8B8, SurfaceFormat::B8G8R8A8)

-    SWIZZLE_SWAP_RGB24(SurfaceFormat::R8G8B8, SurfaceFormat::B8G8R8)
-    SWIZZLE_SWAP_RGB24(SurfaceFormat::B8G8R8, SurfaceFormat::R8G8B8)
-
    SWIZZLE_OPAQUE(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8X8)
    SWIZZLE_OPAQUE(SurfaceFormat::B8G8R8X8, SurfaceFormat::B8G8R8A8)
    SWIZZLE_OPAQUE(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8X8)
@ -1273,9 +1121,6 @@ SwizzleRowFn SwizzleRow(SurfaceFormat aSrcFormat, SurfaceFormat aDstFormat) {
    SWIZZLE_ROW_SWAP(SurfaceFormat::X8R8G8B8, SurfaceFormat::B8G8R8X8)
    SWIZZLE_ROW_SWAP(SurfaceFormat::X8R8G8B8, SurfaceFormat::B8G8R8A8)

-    SWIZZLE_ROW_SWAP_RGB24(SurfaceFormat::R8G8B8, SurfaceFormat::B8G8R8)
-    SWIZZLE_ROW_SWAP_RGB24(SurfaceFormat::B8G8R8, SurfaceFormat::R8G8B8)
-
    UNPACK_ROW_RGB(SurfaceFormat::R8G8B8X8)
    UNPACK_ROW_RGB(SurfaceFormat::R8G8B8A8)
    UNPACK_ROW_RGB(SurfaceFormat::B8G8R8X8)
@ -1283,24 +1128,10 @@ SwizzleRowFn SwizzleRow(SurfaceFormat aSrcFormat, SurfaceFormat aDstFormat) {
    UNPACK_ROW_RGB_TO_ARGB(SurfaceFormat::A8R8G8B8)
    UNPACK_ROW_RGB_TO_ARGB(SurfaceFormat::X8R8G8B8)

-    PACK_ROW_RGB(SurfaceFormat::R8G8B8, PackRowToRGB24)
-    PACK_ROW_RGB(SurfaceFormat::B8G8R8, PackRowToRGB24)
-
    default:
      break;
  }

-  if (aSrcFormat == aDstFormat) {
-    switch (BytesPerPixel(aSrcFormat)) {
-      case 4:
-        return &SwizzleRowCopy<4>;
-      case 3:
-        return &SwizzleRowCopy<3>;
-      default:
-        break;
-    }
-  }
-
  MOZ_ASSERT_UNREACHABLE("Unsupported swizzle formats");
  return nullptr;
 }
--- a/gfx/2d/Swizzle.h
+++ b/gfx/2d/Swizzle.h
@ -54,12 +54,6 @@ typedef void (*SwizzleRowFn)(const uint8_t* aSrc, uint8_t* aDst,
 GFX2D_API SwizzleRowFn PremultiplyRow(SurfaceFormat aSrcFormat,
                                      SurfaceFormat aDstFormat);

-/**
- * Get a function pointer to perform unpremultiplication between two formats.
- */
-GFX2D_API SwizzleRowFn UnpremultiplyRow(SurfaceFormat aSrcFormat,
-                                        SurfaceFormat aDstFormat);
-
 /**
 * Get a function pointer to perform swizzling between two formats.
 */
--- a/gfx/2d/SwizzleNEON.cpp
+++ b/gfx/2d/SwizzleNEON.cpp
@ -243,36 +243,6 @@ UnpremultiplyVector_NEON(const uint16x8_t& aSrc) {
                   vsliq_n_u16(rb, ga, 8));
 }

-template <bool aSwapRB>
-static MOZ_ALWAYS_INLINE void UnpremultiplyChunk_NEON(const uint8_t*& aSrc,
-                                                      uint8_t*& aDst,
-                                                      int32_t aAlignedRow,
-                                                      int32_t aRemainder) {
-  // Process all 4-pixel chunks as one vector.
-  for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
-    uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
-    px = UnpremultiplyVector_NEON<aSwapRB>(px);
-    vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
-    aSrc += 4 * 4;
-    aDst += 4 * 4;
-  }
-
-  // Handle any 1-3 remaining pixels.
-  if (aRemainder) {
-    uint16x8_t px = LoadRemainder_NEON(aSrc, remainder);
-    px = UnpremultiplyVector_NEON<aSwapRB>(px);
-    StoreRemainder_NEON(aDst, remainder, px);
-  }
-}
-
-template <bool aSwapRB>
-void UnpremultiplyRow_NEON(const uint8_t* aSrc, uint8_t* aDst,
-                           int32_t aLength) {
-  int32_t alignedRow = 4 * (aLength & ~3);
-  int32_t remainder = aLength & 3;
-  UnpremultiplyChunk_NEON<aSwapRB>(aSrc, aDst, alignedRow, remainder);
-}
-
 template <bool aSwapRB>
 void Unpremultiply_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
                        int32_t aDstGap, IntSize aSize) {
@ -283,15 +253,28 @@ void Unpremultiply_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
-    UnpremultiplyChunk_NEON<aSwapRB>(aSrc, aDst, alignedRow, remainder);
+    // Process all 4-pixel chunks as one vector.
+    for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) {
+      uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
+      px = UnpremultiplyVector_NEON<aSwapRB>(px);
+      vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
+      aSrc += 4 * 4;
+      aDst += 4 * 4;
+    }
+
+    // Handle any 1-3 remaining pixels.
+    if (remainder) {
+      uint16x8_t px = LoadRemainder_NEON(aSrc, remainder);
+      px = UnpremultiplyVector_NEON<aSwapRB>(px);
+      StoreRemainder_NEON(aDst, remainder, px);
+    }
+
    aSrc += aSrcGap;
    aDst += aDstGap;
  }
 }

 // Force instantiation of unpremultiply variants here.
-template void UnpremultiplyRow_NEON<false>(const uint8_t*, uint8_t*, int32_t);
-template void UnpremultiplyRow_NEON<true>(const uint8_t*, uint8_t*, int32_t);
 template void Unpremultiply_NEON<false>(const uint8_t*, int32_t, uint8_t*,
                                        int32_t, IntSize);
 template void Unpremultiply_NEON<true>(const uint8_t*, int32_t, uint8_t*,
--- a/gfx/2d/SwizzleSSE2.cpp
+++ b/gfx/2d/SwizzleSSE2.cpp
@ -242,36 +242,6 @@ static MOZ_ALWAYS_INLINE __m128i UnpremultiplyVector_SSE2(const __m128i& aSrc) {
  return _mm_or_si128(rb, ga);
 }

-template <bool aSwapRB>
-static MOZ_ALWAYS_INLINE void UnpremultiplyChunk_SSE2(const uint8_t*& aSrc,
-                                                      uint8_t*& aDst,
-                                                      int32_t aAlignedRow,
-                                                      int32_t aRemainder) {
-  // Process all 4-pixel chunks as one vector.
-  for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
-    __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
-    px = UnpremultiplyVector_SSE2<aSwapRB>(px);
-    _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
-    aSrc += 4 * 4;
-    aDst += 4 * 4;
-  }
-
-  // Handle any 1-3 remaining pixels.
-  if (aRemainder) {
-    __m128i px = LoadRemainder_SSE2(aSrc, aRemainder);
-    px = UnpremultiplyVector_SSE2<aSwapRB>(px);
-    StoreRemainder_SSE2(aDst, aRemainder, px);
-  }
-}
-
-template <bool aSwapRB>
-void UnpremultiplyRow_SSE2(const uint8_t* aSrc, uint8_t* aDst,
-                           int32_t aLength) {
-  int32_t alignedRow = 4 * (aLength & ~3);
-  int32_t remainder = aLength & 3;
-  UnpremultiplyChunk_SSE2<aSwapRB>(aSrc, aDst, alignedRow, remainder);
-}
-
 template <bool aSwapRB>
 void Unpremultiply_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
                        int32_t aDstGap, IntSize aSize) {
@ -282,15 +252,28 @@ void Unpremultiply_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
-    UnpremultiplyChunk_SSE2<aSwapRB>(aSrc, aDst, alignedRow, remainder);
+    // Process all 4-pixel chunks as one vector.
+    for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) {
+      __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
+      px = UnpremultiplyVector_SSE2<aSwapRB>(px);
+      _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
+      aSrc += 4 * 4;
+      aDst += 4 * 4;
+    }
+
+    // Handle any 1-3 remaining pixels.
+    if (remainder) {
+      __m128i px = LoadRemainder_SSE2(aSrc, remainder);
+      px = UnpremultiplyVector_SSE2<aSwapRB>(px);
+      StoreRemainder_SSE2(aDst, remainder, px);
+    }
+
    aSrc += aSrcGap;
    aDst += aDstGap;
  }
 }

 // Force instantiation of unpremultiply variants here.
-template void UnpremultiplyRow_SSE2<false>(const uint8_t*, uint8_t*, int32_t);
-template void UnpremultiplyRow_SSE2<true>(const uint8_t*, uint8_t*, int32_t);
 template void Unpremultiply_SSE2<false>(const uint8_t*, int32_t, uint8_t*,
                                        int32_t, IntSize);
 template void Unpremultiply_SSE2<true>(const uint8_t*, int32_t, uint8_t*,
--- a/gfx/tests/gtest/TestSwizzle.cpp
+++ b/gfx/tests/gtest/TestSwizzle.cpp
@ -62,9 +62,6 @@ TEST(Moz2D, PremultiplyRow)
  const uint8_t check_rgba[5 * 4] = {
      0, 255, 255, 255, 255, 0, 0, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 128,
  };
-  const uint8_t check_argb[5 * 4] = {
-      255, 0, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 0, 0, 128,
-  };

  SwizzleRowFn func =
      PremultiplyRow(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8A8);
@ -74,10 +71,6 @@ TEST(Moz2D, PremultiplyRow)
  func = PremultiplyRow(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8);
  func(in_bgra, out, 5);
  EXPECT_TRUE(ArrayEqual(out, check_rgba));
-
-  func = PremultiplyRow(SurfaceFormat::B8G8R8A8, SurfaceFormat::A8R8G8B8);
-  func(in_bgra, out, 5);
-  EXPECT_TRUE(ArrayEqual(out, check_argb));
 }

 TEST(Moz2D, UnpremultiplyData)
@ -114,41 +107,6 @@ TEST(Moz2D, UnpremultiplyData)
  EXPECT_TRUE(ArrayEqual(out, check_argb));
 }

-TEST(Moz2D, UnpremultiplyRow)
-{
-  const uint8_t in_bgra[5 * 4] = {
-      255, 255, 0,   255,              // verify 255 alpha leaves RGB unchanged
-      0,   0,   255, 255, 0, 0, 0, 0,  // verify 0 alpha leaves RGB at 0
-      0,   0,   0,   64,   // verify 0 RGB stays 0 with non-zero alpha
-      128, 0,   0,   128,  // verify that RGB == alpha maps to 255
-
-  };
-  uint8_t out[5 * 4];
-  const uint8_t check_bgra[5 * 4] = {
-      255, 255, 0, 255, 0, 0, 255, 255, 0, 0, 0, 0, 0, 0, 0, 64, 255, 0, 0, 128,
-  };
-  // check swizzled output
-  const uint8_t check_rgba[5 * 4] = {
-      0, 255, 255, 255, 255, 0, 0, 255, 0, 0, 0, 0, 0, 0, 0, 64, 0, 0, 255, 128,
-  };
-  const uint8_t check_argb[5 * 4] = {
-      255, 0, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 128, 0, 0, 255,
-  };
-
-  SwizzleRowFn func =
-      UnpremultiplyRow(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8A8);
-  func(in_bgra, out, 5);
-  EXPECT_TRUE(ArrayEqual(out, check_bgra));
-
-  func = UnpremultiplyRow(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8);
-  func(in_bgra, out, 5);
-  EXPECT_TRUE(ArrayEqual(out, check_rgba));
-
-  func = UnpremultiplyRow(SurfaceFormat::B8G8R8A8, SurfaceFormat::A8R8G8B8);
-  func(in_bgra, out, 5);
-  EXPECT_TRUE(ArrayEqual(out, check_argb));
-}
-
 TEST(Moz2D, SwizzleData)
 {
  const uint8_t in_bgra[5 * 4] = {
@ -242,13 +200,6 @@ TEST(Moz2D, SwizzleRow)
      0, 254, 253, 255, 255, 0,   0, 255, 0,   0,
      0, 255, 3,   2,   1,   255, 9, 0,   127, 255,
  };
-  // check packing
-  uint8_t out24[5 * 3];
-  const uint8_t check_bgr[5 * 3] = {253, 254, 0, 0, 0,   255, 0, 0,
-                                    0,   1,   2, 3, 127, 0,   9};
-  const uint8_t check_rgb[5 * 3] = {
-      0, 254, 253, 255, 0, 0, 0, 0, 0, 3, 2, 1, 9, 0, 127,
-  };
  // check unpacking
  uint8_t out_unpack[16 * 4];
  const uint8_t in_rgb[16 * 3] = {
@ -284,18 +235,6 @@ TEST(Moz2D, SwizzleRow)
  func(in_bgra, out, 5);
  EXPECT_TRUE(ArrayEqual(out, check_rgbx));

-  func = SwizzleRow(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8A8);
-  func(in_bgra, out, 5);
-  EXPECT_TRUE(ArrayEqual(out, in_bgra));
-
-  func = SwizzleRow(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8);
-  func(in_bgra, out24, 5);
-  EXPECT_TRUE(ArrayEqual(out24, check_bgr));
-
-  func = SwizzleRow(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8);
-  func(in_bgra, out24, 5);
-  EXPECT_TRUE(ArrayEqual(out24, check_rgb));
-
  func = SwizzleRow(SurfaceFormat::R8G8B8, SurfaceFormat::B8G8R8X8);
  func(in_rgb, out_unpack, 16);
  EXPECT_TRUE(ArrayEqual(out_unpack, check_unpack_bgrx));