Bug 1551088 - Part 1. Expose SwizzleRow and PremultiplyRow variants. r=lsalzman

The image decoders produce surfaces row by row, so variants that return a
function pointer to perform the swizzle/premultiply operation on a single
row are more ergonomic.

Differential Revision: https://phabricator.services.mozilla.com/D46444

--HG--
extra : moz-landing-system : lando
Andrew Osmond 2019-09-24 20:43:24 +00:00
Parent b0ffdb3a87
Commit 413fb28670
4 changed files: 502 additions and 168 deletions
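The row-based entry points are meant to be resolved once per surface and then invoked for each decoded row. A minimal sketch of that call pattern (the loop and the buffer, stride, and size names are illustrative, not from this patch):

using namespace mozilla::gfx;

// Resolve the row swizzler once for a format pair, then reuse it per row.
SwizzleRowFn swizzle =
    SwizzleRow(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8);
for (int32_t row = 0; row < height; row++) {
  swizzle(srcBuf + row * srcStride, dstBuf + row * dstStride, width);
}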

View file

@@ -41,6 +41,10 @@ namespace gfx {
#define FORMAT_CASE(aSrcFormat, aDstFormat, ...) \
FORMAT_CASE_EXPR(aSrcFormat, aDstFormat, FORMAT_CASE_CALL(__VA_ARGS__))
#define FORMAT_CASE_ROW(aSrcFormat, aDstFormat, ...) \
case FORMAT_KEY(aSrcFormat, aDstFormat): \
return &__VA_ARGS__;
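For illustration, FORMAT_CASE_ROW(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8, SomeRowFn) (where SomeRowFn stands in for one of the row-function instantiations below) expands to:

case FORMAT_KEY(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8):
  return &SomeRowFn;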
/**
* Constexpr functions for analyzing format attributes in templates.
*/
@@ -114,6 +118,15 @@ void Premultiply_SSE2(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
Premultiply_SSE2<ShouldSwapRB(aSrcFormat, aDstFormat), \
ShouldForceOpaque(aSrcFormat, aDstFormat)>)
template <bool aSwapRB, bool aOpaqueAlpha>
void PremultiplyRow_SSE2(const uint8_t*, uint8_t*, int32_t);
# define PREMULTIPLY_ROW_SSE2(aSrcFormat, aDstFormat) \
FORMAT_CASE_ROW( \
aSrcFormat, aDstFormat, \
PremultiplyRow_SSE2<ShouldSwapRB(aSrcFormat, aDstFormat), \
ShouldForceOpaque(aSrcFormat, aDstFormat)>)
template <bool aSwapRB>
void Unpremultiply_SSE2(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
@@ -129,6 +142,15 @@ void Swizzle_SSE2(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
Swizzle_SSE2<ShouldSwapRB(aSrcFormat, aDstFormat), \
ShouldForceOpaque(aSrcFormat, aDstFormat)>)
template <bool aSwapRB, bool aOpaqueAlpha>
void SwizzleRow_SSE2(const uint8_t*, uint8_t*, int32_t);
# define SWIZZLE_ROW_SSE2(aSrcFormat, aDstFormat) \
FORMAT_CASE_ROW( \
aSrcFormat, aDstFormat, \
SwizzleRow_SSE2<ShouldSwapRB(aSrcFormat, aDstFormat), \
ShouldForceOpaque(aSrcFormat, aDstFormat)>)
#endif
#ifdef USE_NEON
@@ -144,6 +166,15 @@ void Premultiply_NEON(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
Premultiply_NEON<ShouldSwapRB(aSrcFormat, aDstFormat), \
ShouldForceOpaque(aSrcFormat, aDstFormat)>)
template <bool aSwapRB, bool aOpaqueAlpha>
void PremultiplyRow_NEON(const uint8_t*, uint8_t*, int32_t);
# define PREMULTIPLY_ROW_NEON(aSrcFormat, aDstFormat) \
FORMAT_CASE_ROW( \
aSrcFormat, aDstFormat, \
PremultiplyRow_NEON<ShouldSwapRB(aSrcFormat, aDstFormat), \
ShouldForceOpaque(aSrcFormat, aDstFormat)>)
template <bool aSwapRB>
void Unpremultiply_NEON(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
@@ -159,6 +190,14 @@ void Swizzle_NEON(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
Swizzle_NEON<ShouldSwapRB(aSrcFormat, aDstFormat), \
ShouldForceOpaque(aSrcFormat, aDstFormat)>)
template <bool aSwapRB, bool aOpaqueAlpha>
void SwizzleRow_NEON(const uint8_t*, uint8_t*, int32_t);
# define SWIZZLE_ROW_NEON(aSrcFormat, aDstFormat) \
FORMAT_CASE_ROW( \
aSrcFormat, aDstFormat, \
SwizzleRow_NEON<ShouldSwapRB(aSrcFormat, aDstFormat), \
ShouldForceOpaque(aSrcFormat, aDstFormat)>)
#endif
/**
@@ -171,51 +210,65 @@ void Swizzle_NEON(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
// 2-component vectors. Otherwise, an approximation of divide-by-255 is used
// which is faster than an actual division. These optimizations are also used
// for the SSE2 and NEON implementations.
template <bool aSwapRB, bool aOpaqueAlpha, uint32_t aSrcRGBShift,
uint32_t aSrcAShift, uint32_t aDstRGBShift, uint32_t aDstAShift>
static void PremultiplyChunkFallback(const uint8_t*& aSrc, uint8_t*& aDst,
int32_t aLength) {
const uint8_t* end = aSrc + 4 * aLength;
do {
// Load and process 1 entire pixel at a time.
uint32_t color = *reinterpret_cast<const uint32_t*>(aSrc);
uint32_t a = aSrcAShift ? color >> aSrcAShift : color & 0xFF;
// Isolate the R and B components.
uint32_t rb = (color >> aSrcRGBShift) & 0x00FF00FF;
// Swap the order of R and B if necessary.
if (aSwapRB) {
rb = (rb >> 16) | (rb << 16);
}
// Approximate the multiply by alpha and divide by 255 which is
// essentially:
// c = c*a + 255; c = (c + (c >> 8)) >> 8;
// However, we omit the final >> 8 to fold it with the final shift into
// place depending on desired output format.
rb = rb * a + 0x00FF00FF;
rb = (rb + ((rb >> 8) & 0x00FF00FF)) & 0xFF00FF00;
// Use same approximation as above, but G is shifted 8 bits left.
// Alpha is left out and handled separately.
uint32_t g = color & (0xFF00 << aSrcRGBShift);
g = g * a + (0xFF00 << aSrcRGBShift);
g = (g + (g >> 8)) & (0xFF0000 << aSrcRGBShift);
// The above math leaves RGB shifted left by 8 bits.
// Shift them right if required for the output format, then combine them
// back together to produce the output pixel.
// Add the alpha back on if the output format is not opaque.
*reinterpret_cast<uint32_t*>(aDst) =
(rb >> (8 - aDstRGBShift)) | (g >> (8 + aSrcRGBShift - aDstRGBShift)) |
(aOpaqueAlpha ? 0xFF << aDstAShift : a << aDstAShift);
aSrc += 4;
aDst += 4;
} while (aSrc < end);
}
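A scalar rendering of the approximation described in the comments above (a hypothetical helper, not part of the patch; it keeps the final >> 8 that the chunk code folds into its format shift):

static uint8_t PremultiplyComponent(uint8_t aColor, uint8_t aAlpha) {
  // c = c*a + 255; c = (c + (c >> 8)) >> 8;
  uint32_t c = uint32_t(aColor) * aAlpha + 255;
  return uint8_t((c + (c >> 8)) >> 8);
}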
template <bool aSwapRB, bool aOpaqueAlpha, uint32_t aSrcRGBShift,
uint32_t aSrcAShift, uint32_t aDstRGBShift, uint32_t aDstAShift>
static void PremultiplyRowFallback(const uint8_t* aSrc, uint8_t* aDst,
int32_t aLength) {
PremultiplyChunkFallback<aSwapRB, aOpaqueAlpha, aSrcRGBShift, aSrcAShift,
aDstRGBShift, aDstAShift>(aSrc, aDst, aLength);
}
template <bool aSwapRB, bool aOpaqueAlpha, uint32_t aSrcRGBShift,
uint32_t aSrcAShift, uint32_t aDstRGBShift, uint32_t aDstAShift>
static void PremultiplyFallback(const uint8_t* aSrc, int32_t aSrcGap,
uint8_t* aDst, int32_t aDstGap, IntSize aSize) {
for (int32_t height = aSize.height; height > 0; height--) {
const uint8_t* end = aSrc + 4 * aSize.width;
do {
// Load and process 1 entire pixel at a time.
uint32_t color = *reinterpret_cast<const uint32_t*>(aSrc);
uint32_t a = aSrcAShift ? color >> aSrcAShift : color & 0xFF;
// Isolate the R and B components.
uint32_t rb = (color >> aSrcRGBShift) & 0x00FF00FF;
// Swap the order of R and B if necessary.
if (aSwapRB) {
rb = (rb >> 16) | (rb << 16);
}
// Approximate the multiply by alpha and divide by 255 which is
// essentially:
// c = c*a + 255; c = (c + (c >> 8)) >> 8;
// However, we omit the final >> 8 to fold it with the final shift into
// place depending on desired output format.
rb = rb * a + 0x00FF00FF;
rb = (rb + ((rb >> 8) & 0x00FF00FF)) & 0xFF00FF00;
// Use same approximation as above, but G is shifted 8 bits left.
// Alpha is left out and handled separately.
uint32_t g = color & (0xFF00 << aSrcRGBShift);
g = g * a + (0xFF00 << aSrcRGBShift);
g = (g + (g >> 8)) & (0xFF0000 << aSrcRGBShift);
// The above math leaves RGB shifted left by 8 bits.
// Shift them right if required for the output format, then combine them
// back together to produce the output pixel.
// Add the alpha back on if the output format is not opaque.
*reinterpret_cast<uint32_t*>(aDst) =
(rb >> (8 - aDstRGBShift)) |
(g >> (8 + aSrcRGBShift - aDstRGBShift)) |
(aOpaqueAlpha ? 0xFF << aDstAShift : a << aDstAShift);
aSrc += 4;
aDst += 4;
} while (aSrc < end);
PremultiplyChunkFallback<aSwapRB, aOpaqueAlpha, aSrcRGBShift, aSrcAShift,
aDstRGBShift, aDstAShift>(aSrc, aDst, aSize.width);
aSrc += aSrcGap;
aDst += aDstGap;
}
@@ -237,6 +290,22 @@ static void PremultiplyFallback(const uint8_t* aSrc, int32_t aSrcGap,
PREMULTIPLY_FALLBACK_CASE(aSrcFormat, SurfaceFormat::A8R8G8B8) \
PREMULTIPLY_FALLBACK_CASE(aSrcFormat, SurfaceFormat::X8R8G8B8)
#define PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, aDstFormat) \
FORMAT_CASE_ROW(aSrcFormat, aDstFormat, \
PremultiplyRowFallback< \
ShouldSwapRB(aSrcFormat, aDstFormat), \
ShouldForceOpaque(aSrcFormat, aDstFormat), \
RGBBitShift(aSrcFormat), AlphaBitShift(aSrcFormat), \
RGBBitShift(aDstFormat), AlphaBitShift(aDstFormat)>)
#define PREMULTIPLY_ROW_FALLBACK(aSrcFormat) \
PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::B8G8R8A8) \
PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::B8G8R8X8) \
PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::R8G8B8A8) \
PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::R8G8B8X8) \
PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::A8R8G8B8) \
PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::X8R8G8B8)
// If rows are tightly packed, and the size of the total area will fit within
// the precision range of a single row, then process all the data as if it was
// a single row.
@@ -323,6 +392,50 @@ bool PremultiplyData(const uint8_t* aSrc, int32_t aSrcStride,
return false;
}
SwizzleRowFn PremultiplyRow(SurfaceFormat aSrcFormat,
SurfaceFormat aDstFormat) {
#ifdef USE_SSE2
if (mozilla::supports_sse2()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
PREMULTIPLY_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8A8)
PREMULTIPLY_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8X8)
PREMULTIPLY_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
PREMULTIPLY_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
PREMULTIPLY_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8A8)
PREMULTIPLY_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8X8)
PREMULTIPLY_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
PREMULTIPLY_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
default:
break;
}
#endif
#ifdef USE_NEON
if (mozilla::supports_neon()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
PREMULTIPLY_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8A8)
PREMULTIPLY_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8X8)
PREMULTIPLY_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
PREMULTIPLY_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
PREMULTIPLY_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8A8)
PREMULTIPLY_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8X8)
PREMULTIPLY_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
PREMULTIPLY_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
default:
break;
}
#endif
switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
PREMULTIPLY_ROW_FALLBACK(SurfaceFormat::B8G8R8A8)
PREMULTIPLY_ROW_FALLBACK(SurfaceFormat::R8G8B8A8)
PREMULTIPLY_ROW_FALLBACK(SurfaceFormat::A8R8G8B8)
default:
break;
}
MOZ_ASSERT_UNREACHABLE("Unsupported premultiply formats");
return nullptr;
}
/**
* Unpremultiplying
*/
@@ -457,39 +570,54 @@ bool UnpremultiplyData(const uint8_t* aSrc, int32_t aSrcStride,
// Fallback swizzle implementation that uses shifting and masking to reorder
// pixels.
template <bool aSwapRB, bool aOpaqueAlpha, uint32_t aSrcRGBShift,
uint32_t aSrcAShift, uint32_t aDstRGBShift, uint32_t aDstAShift>
static void SwizzleChunkFallback(const uint8_t*& aSrc, uint8_t*& aDst,
int32_t aLength) {
const uint8_t* end = aSrc + 4 * aLength;
do {
uint32_t rgba = *reinterpret_cast<const uint32_t*>(aSrc);
if (aSwapRB) {
// Handle R and B swaps by exchanging words and masking.
uint32_t rb =
((rgba << 16) | (rgba >> 16)) & (0x00FF00FF << aSrcRGBShift);
uint32_t ga = rgba & ((0xFF << aSrcAShift) | (0xFF00 << aSrcRGBShift));
rgba = rb | ga;
}
// If src and dst shifts differ, rotate left or right to move RGB into
// place, i.e. ARGB -> RGBA or RGBA -> ARGB.
if (aDstRGBShift > aSrcRGBShift) {
rgba = (rgba << 8) | (aOpaqueAlpha ? 0x000000FF : rgba >> 24);
} else if (aSrcRGBShift > aDstRGBShift) {
rgba = (rgba >> 8) | (aOpaqueAlpha ? 0xFF000000 : rgba << 24);
} else if (aOpaqueAlpha) {
rgba |= 0xFF << aDstAShift;
}
*reinterpret_cast<uint32_t*>(aDst) = rgba;
aSrc += 4;
aDst += 4;
} while (aSrc < end);
}
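A concrete check of the rotate branch above, with illustrative values: on little-endian, an A8R8G8B8 pixel with A=0x40, R=0x30, G=0x20, B=0x10 loads as 0x10203040; with aSrcRGBShift = 8 and aDstRGBShift = 0, the rotate-right path yields the R8G8B8A8 layout:

static_assert(((0x10203040u >> 8) | (0x10203040u << 24)) == 0x40102030u,
              "ARGB dword rotates into RGBA byte order");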
template <bool aSwapRB, bool aOpaqueAlpha, uint32_t aSrcRGBShift,
uint32_t aSrcAShift, uint32_t aDstRGBShift, uint32_t aDstAShift>
static void SwizzleRowFallback(const uint8_t* aSrc, uint8_t* aDst,
int32_t aLength) {
SwizzleChunkFallback<aSwapRB, aOpaqueAlpha, aSrcRGBShift, aSrcAShift,
aDstRGBShift, aDstAShift>(aSrc, aDst, aLength);
}
template <bool aSwapRB, bool aOpaqueAlpha, uint32_t aSrcRGBShift,
uint32_t aSrcAShift, uint32_t aDstRGBShift, uint32_t aDstAShift>
static void SwizzleFallback(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
int32_t aDstGap, IntSize aSize) {
for (int32_t height = aSize.height; height > 0; height--) {
const uint8_t* end = aSrc + 4 * aSize.width;
do {
uint32_t rgba = *reinterpret_cast<const uint32_t*>(aSrc);
if (aSwapRB) {
// Handle R and B swaps by exchanging words and masking.
uint32_t rb =
((rgba << 16) | (rgba >> 16)) & (0x00FF00FF << aSrcRGBShift);
uint32_t ga = rgba & ((0xFF << aSrcAShift) | (0xFF00 << aSrcRGBShift));
rgba = rb | ga;
}
// If src and dst shifts differ, rotate left or right to move RGB into
// place, i.e. ARGB -> RGBA or RGBA -> ARGB.
if (aDstRGBShift > aSrcRGBShift) {
rgba = (rgba << 8) | (aOpaqueAlpha ? 0x000000FF : rgba >> 24);
} else if (aSrcRGBShift > aDstRGBShift) {
rgba = (rgba >> 8) | (aOpaqueAlpha ? 0xFF000000 : rgba << 24);
} else if (aOpaqueAlpha) {
rgba |= 0xFF << aDstAShift;
}
*reinterpret_cast<uint32_t*>(aDst) = rgba;
aSrc += 4;
aDst += 4;
} while (aSrc < end);
SwizzleChunkFallback<aSwapRB, aOpaqueAlpha, aSrcRGBShift, aSrcAShift,
aDstRGBShift, aDstAShift>(aSrc, aDst, aSize.width);
aSrc += aSrcGap;
aDst += aDstGap;
}
@@ -503,6 +631,14 @@ static void SwizzleFallback(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
RGBBitShift(aSrcFormat), AlphaBitShift(aSrcFormat), \
RGBBitShift(aDstFormat), AlphaBitShift(aDstFormat)>)
#define SWIZZLE_ROW_FALLBACK(aSrcFormat, aDstFormat) \
FORMAT_CASE_ROW( \
aSrcFormat, aDstFormat, \
SwizzleRowFallback<ShouldSwapRB(aSrcFormat, aDstFormat), \
ShouldForceOpaque(aSrcFormat, aDstFormat), \
RGBBitShift(aSrcFormat), AlphaBitShift(aSrcFormat), \
RGBBitShift(aDstFormat), AlphaBitShift(aDstFormat)>)
// Fast-path for matching formats.
static void SwizzleCopy(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
int32_t aDstGap, IntSize aSize, int32_t aBPP) {
@@ -517,26 +653,39 @@ static void SwizzleCopy(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
}
// Fast-path for conversions that swap all bytes.
template <bool aOpaqueAlpha, uint32_t aSrcAShift, uint32_t aDstAShift>
static void SwizzleChunkSwap(const uint8_t*& aSrc, uint8_t*& aDst,
int32_t aLength) {
const uint8_t* end = aSrc + 4 * aLength;
do {
// Use an endian swap to move the bytes, i.e. BGRA -> ARGB.
uint32_t rgba = *reinterpret_cast<const uint32_t*>(aSrc);
#if MOZ_LITTLE_ENDIAN
rgba = NativeEndian::swapToBigEndian(rgba);
#else
rgba = NativeEndian::swapToLittleEndian(rgba);
#endif
if (aOpaqueAlpha) {
rgba |= 0xFF << aDstAShift;
}
*reinterpret_cast<uint32_t*>(aDst) = rgba;
aSrc += 4;
aDst += 4;
} while (aSrc < end);
}
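A worked example of the endian swap, again with illustrative values: B8G8R8A8 bytes B=0x10, G=0x20, R=0x30, A=0x40 load on little-endian as 0x40302010, and reversing the bytes gives 0x10203040, which stores back as bytes A,R,G,B:

static_assert(((0x40302010u >> 24) | ((0x40302010u >> 8) & 0x0000FF00u) |
               ((0x40302010u << 8) & 0x00FF0000u) | (0x40302010u << 24)) ==
                  0x10203040u,
              "BGRA dword byte-swaps into ARGB byte order");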
template <bool aOpaqueAlpha, uint32_t aSrcAShift, uint32_t aDstAShift>
static void SwizzleRowSwap(const uint8_t* aSrc, uint8_t* aDst,
int32_t aLength) {
SwizzleChunkSwap<aOpaqueAlpha, aSrcAShift, aDstAShift>(aSrc, aDst, aLength);
}
template <bool aOpaqueAlpha, uint32_t aSrcAShift, uint32_t aDstAShift>
static void SwizzleSwap(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
int32_t aDstGap, IntSize aSize) {
for (int32_t height = aSize.height; height > 0; height--) {
const uint8_t* end = aSrc + 4 * aSize.width;
do {
// Use an endian swap to move the bytes, i.e. BGRA -> ARGB.
uint32_t rgba = *reinterpret_cast<const uint32_t*>(aSrc);
#if MOZ_LITTLE_ENDIAN
rgba = NativeEndian::swapToBigEndian(rgba);
#else
rgba = NativeEndian::swapToLittleEndian(rgba);
#endif
if (aOpaqueAlpha) {
rgba |= 0xFF << aDstAShift;
}
*reinterpret_cast<uint32_t*>(aDst) = rgba;
aSrc += 4;
aDst += 4;
} while (aSrc < end);
SwizzleChunkSwap<aOpaqueAlpha, aSrcAShift, aDstAShift>(aSrc, aDst,
aSize.width);
aSrc += aSrcGap;
aDst += aDstGap;
}
@@ -548,34 +697,61 @@ static void SwizzleSwap(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
SwizzleSwap<ShouldForceOpaque(aSrcFormat, aDstFormat), \
AlphaBitShift(aSrcFormat), AlphaBitShift(aDstFormat)>)
#define SWIZZLE_ROW_SWAP(aSrcFormat, aDstFormat) \
FORMAT_CASE_ROW( \
aSrcFormat, aDstFormat, \
SwizzleRowSwap<ShouldForceOpaque(aSrcFormat, aDstFormat), \
AlphaBitShift(aSrcFormat), AlphaBitShift(aDstFormat)>)
// Fast-path for conversions that force alpha to opaque.
template <uint32_t aDstAShift>
static void SwizzleChunkOpaqueUpdate(uint8_t*& aBuffer, int32_t aLength) {
const uint8_t* end = aBuffer + 4 * aLength;
do {
uint32_t rgba = *reinterpret_cast<const uint32_t*>(aBuffer);
// Just add on the alpha bits to the source.
rgba |= 0xFF << aDstAShift;
*reinterpret_cast<uint32_t*>(aBuffer) = rgba;
aBuffer += 4;
} while (aBuffer < end);
}
template <uint32_t aDstAShift>
static void SwizzleChunkOpaqueCopy(const uint8_t*& aSrc, uint8_t* aDst,
int32_t aLength) {
const uint8_t* end = aSrc + 4 * aLength;
do {
uint32_t rgba = *reinterpret_cast<const uint32_t*>(aSrc);
// Just add on the alpha bits to the source.
rgba |= 0xFF << aDstAShift;
*reinterpret_cast<uint32_t*>(aDst) = rgba;
aSrc += 4;
aDst += 4;
} while (aSrc < end);
}
template <uint32_t aDstAShift>
static void SwizzleRowOpaque(const uint8_t* aSrc, uint8_t* aDst,
int32_t aLength) {
if (aSrc == aDst) {
SwizzleChunkOpaqueUpdate<aDstAShift>(aDst, aLength);
} else {
SwizzleChunkOpaqueCopy<aDstAShift>(aSrc, aDst, aLength);
}
}
template <uint32_t aDstAShift>
static void SwizzleOpaque(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
int32_t aDstGap, IntSize aSize) {
if (aSrc == aDst) {
// Modifying in-place, so just write out the alpha.
for (int32_t height = aSize.height; height > 0; height--) {
const uint8_t* end = aDst + 4 * aSize.width;
do {
// ORing directly onto destination memory profiles faster than writing
// individually to the alpha byte and also profiles equivalently to a
// SSE2 implementation.
*reinterpret_cast<uint32_t*>(aDst) |= 0xFF << aDstAShift;
aDst += 4;
} while (aDst < end);
SwizzleChunkOpaqueUpdate<aDstAShift>(aDst, aSize.width);
aDst += aDstGap;
}
} else {
for (int32_t height = aSize.height; height > 0; height--) {
const uint8_t* end = aSrc + 4 * aSize.width;
do {
uint32_t rgba = *reinterpret_cast<const uint32_t*>(aSrc);
// Just add on the alpha bits to the source.
rgba |= 0xFF << aDstAShift;
*reinterpret_cast<uint32_t*>(aDst) = rgba;
aSrc += 4;
aDst += 4;
} while (aSrc < end);
SwizzleChunkOpaqueCopy<aDstAShift>(aSrc, aDst, aSize.width);
aSrc += aSrcGap;
aDst += aDstGap;
}
@@ -585,6 +761,10 @@ static void SwizzleOpaque(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
#define SWIZZLE_OPAQUE(aSrcFormat, aDstFormat) \
FORMAT_CASE(aSrcFormat, aDstFormat, SwizzleOpaque<AlphaBitShift(aDstFormat)>)
#define SWIZZLE_ROW_OPAQUE(aSrcFormat, aDstFormat) \
FORMAT_CASE_ROW(aSrcFormat, aDstFormat, \
SwizzleRowOpaque<AlphaBitShift(aDstFormat)>)
// Packing of 32-bit formats to RGB565.
template <bool aSwapRB, uint32_t aSrcRGBShift, uint32_t aSrcRGBIndex>
static void PackToRGB565(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
@@ -780,5 +960,60 @@ bool SwizzleData(const uint8_t* aSrc, int32_t aSrcStride,
return false;
}
SwizzleRowFn SwizzleRow(SurfaceFormat aSrcFormat, SurfaceFormat aDstFormat) {
#ifdef USE_SSE2
if (mozilla::supports_sse2()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
SWIZZLE_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
SWIZZLE_ROW_SSE2(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8X8)
SWIZZLE_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
SWIZZLE_ROW_SSE2(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8A8)
SWIZZLE_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
SWIZZLE_ROW_SSE2(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8X8)
SWIZZLE_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
SWIZZLE_ROW_SSE2(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8A8)
default:
break;
}
#endif
#ifdef USE_NEON
if (mozilla::supports_neon()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
SWIZZLE_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
SWIZZLE_ROW_NEON(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8X8)
SWIZZLE_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
SWIZZLE_ROW_NEON(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8A8)
SWIZZLE_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
SWIZZLE_ROW_NEON(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8X8)
SWIZZLE_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
SWIZZLE_ROW_NEON(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8A8)
default:
break;
}
#endif
switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
SWIZZLE_ROW_FALLBACK(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
SWIZZLE_ROW_FALLBACK(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8X8)
SWIZZLE_ROW_FALLBACK(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
SWIZZLE_ROW_FALLBACK(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8A8)
SWIZZLE_ROW_FALLBACK(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
SWIZZLE_ROW_FALLBACK(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8X8)
SWIZZLE_ROW_FALLBACK(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
SWIZZLE_ROW_FALLBACK(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8A8)
SWIZZLE_ROW_OPAQUE(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8X8)
SWIZZLE_ROW_OPAQUE(SurfaceFormat::B8G8R8X8, SurfaceFormat::B8G8R8A8)
SWIZZLE_ROW_OPAQUE(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8X8)
SWIZZLE_ROW_OPAQUE(SurfaceFormat::R8G8B8X8, SurfaceFormat::R8G8B8A8)
default:
break;
}
MOZ_ASSERT_UNREACHABLE("Unsupported swizzle formats");
return nullptr;
}
} // namespace gfx
} // namespace mozilla

View file

@@ -41,6 +41,22 @@ GFX2D_API bool SwizzleData(const uint8_t* aSrc, int32_t aSrcStride,
int32_t aDstStride, SurfaceFormat aDstFormat,
const IntSize& aSize);
/**
* Swizzles source and writes it to destination. Source and destination may be
* the same to swizzle in-place.
*/
typedef void (*SwizzleRowFn)(const uint8_t* aSrc, uint8_t* aDst,
                             int32_t aLength);
/**
* Get a function pointer to perform premultiplication between two formats.
*/
GFX2D_API SwizzleRowFn PremultiplyRow(SurfaceFormat aSrcFormat,
                                      SurfaceFormat aDstFormat);
/**
* Get a function pointer to perform swizzling between two formats.
*/
GFX2D_API SwizzleRowFn SwizzleRow(SurfaceFormat aSrcFormat,
                                  SurfaceFormat aDstFormat);
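Because a SwizzleRowFn permits aSrc == aDst, a decoder can premultiply a freshly written row in place. A minimal sketch (rowBuf and width are illustrative names):

SwizzleRowFn premultiply =
    PremultiplyRow(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8A8);
premultiply(rowBuf, rowBuf, width);  // source and destination alias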
} // namespace gfx
} // namespace mozilla

View file

@@ -85,6 +85,36 @@ PremultiplyVector_NEON(const uint16x8_t& aSrc) {
return vsriq_n_u16(ga, rb, 8);
}
template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE void PremultiplyChunk_NEON(const uint8_t*& aSrc,
uint8_t*& aDst,
int32_t aAlignedRow,
int32_t aRemainder) {
// Process all 4-pixel chunks as one vector.
for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
px = PremultiplyVector_NEON<aSwapRB, aOpaqueAlpha>(px);
vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
aSrc += 4 * 4;
aDst += 4 * 4;
}
// Handle any 1-3 remaining pixels.
if (aRemainder) {
uint16x8_t px = LoadRemainder_NEON(aSrc, aRemainder);
px = PremultiplyVector_NEON<aSwapRB, aOpaqueAlpha>(px);
StoreRemainder_NEON(aDst, aRemainder, px);
}
}
template <bool aSwapRB, bool aOpaqueAlpha>
void PremultiplyRow_NEON(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
int32_t alignedRow = 4 * (aLength & ~3);
int32_t remainder = aLength & 3;
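// e.g. aLength = 7 gives alignedRow = 16 bytes (4 whole pixels), remainder = 3.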
PremultiplyChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
remainder);
}
template <bool aSwapRB, bool aOpaqueAlpha>
void Premultiply_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
int32_t aDstGap, IntSize aSize) {
@@ -95,28 +125,22 @@ void Premultiply_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
aDstGap += 4 * remainder;
for (int32_t height = aSize.height; height > 0; height--) {
// Process all 4-pixel chunks as one vector.
for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) {
uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
px = PremultiplyVector_NEON<aSwapRB, aOpaqueAlpha>(px);
vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
aSrc += 4 * 4;
aDst += 4 * 4;
}
// Handle any 1-3 remaining pixels.
if (remainder) {
uint16x8_t px = LoadRemainder_NEON(aSrc, remainder);
px = PremultiplyVector_NEON<aSwapRB, aOpaqueAlpha>(px);
StoreRemainder_NEON(aDst, remainder, px);
}
PremultiplyChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
remainder);
aSrc += aSrcGap;
aDst += aDstGap;
}
}
// Force instantiation of premultiply variants here.
template void PremultiplyRow_NEON<false, false>(const uint8_t*, uint8_t*,
int32_t);
template void PremultiplyRow_NEON<false, true>(const uint8_t*, uint8_t*,
int32_t);
template void PremultiplyRow_NEON<true, false>(const uint8_t*, uint8_t*,
int32_t);
template void PremultiplyRow_NEON<true, true>(const uint8_t*, uint8_t*,
int32_t);
template void Premultiply_NEON<false, false>(const uint8_t*, int32_t, uint8_t*,
int32_t, IntSize);
template void Premultiply_NEON<false, true>(const uint8_t*, int32_t, uint8_t*,
@@ -258,7 +282,7 @@ template void Unpremultiply_NEON<true>(const uint8_t*, int32_t, uint8_t*,
// Swizzle a vector of 4 pixels providing swaps and opaquifying.
template <bool aSwapRB, bool aOpaqueAlpha>
MOZ_ALWAYS_INLINE uint16x8_t SwizzleVector_NEON(const uint16x8_t& aSrc) {
static MOZ_ALWAYS_INLINE uint16x8_t SwizzleVector_NEON(const uint16x8_t& aSrc) {
// Swap R and B, then add to G and A (forced to 255):
// (((src>>16) | (src << 16)) & 0x00FF00FF) |
// ((src | 0xFF000000) & ~0x00FF00FF)
@@ -275,7 +299,7 @@ MOZ_ALWAYS_INLINE uint16x8_t SwizzleVector_NEON(const uint16x8_t& aSrc) {
// Optimized implementations for when there is no R and B swap.
template<>
MOZ_ALWAYS_INLINE uint16x8_t
static MOZ_ALWAYS_INLINE uint16x8_t
SwizzleVector_NEON<false, true>(const uint16x8_t& aSrc)
{
// Force alpha to 255.
@@ -283,13 +307,42 @@ SwizzleVector_NEON<false, true>(const uint16x8_t& aSrc)
}
template<>
MOZ_ALWAYS_INLINE uint16x8_t
static MOZ_ALWAYS_INLINE uint16x8_t
SwizzleVector_NEON<false, false>(const uint16x8_t& aSrc)
{
return aSrc;
}
#endif
template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE void SwizzleChunk_NEON(const uint8_t*& aSrc,
uint8_t*& aDst,
int32_t aAlignedRow,
int32_t aRemainder) {
// Process all 4-pixel chunks as one vector.
for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
px = SwizzleVector_NEON<aSwapRB, aOpaqueAlpha>(px);
vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
aSrc += 4 * 4;
aDst += 4 * 4;
}
// Handle any 1-3 remaining pixels.
if (aRemainder) {
uint16x8_t px = LoadRemainder_NEON(aSrc, aRemainder);
px = SwizzleVector_NEON<aSwapRB, aOpaqueAlpha>(px);
StoreRemainder_NEON(aDst, aRemainder, px);
}
}
template <bool aSwapRB, bool aOpaqueAlpha>
void SwizzleRow_NEON(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
int32_t alignedRow = 4 * (aLength & ~3);
int32_t remainder = aLength & 3;
SwizzleChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, remainder);
}
template <bool aSwapRB, bool aOpaqueAlpha>
void Swizzle_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
int32_t aDstGap, IntSize aSize) {
@@ -300,28 +353,16 @@ void Swizzle_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
aDstGap += 4 * remainder;
for (int32_t height = aSize.height; height > 0; height--) {
// Process all 4-pixel chunks as one vector.
for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) {
uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
px = SwizzleVector_NEON<aSwapRB, aOpaqueAlpha>(px);
vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
aSrc += 4 * 4;
aDst += 4 * 4;
}
// Handle any 1-3 remaining pixels.
if (remainder) {
uint16x8_t px = LoadRemainder_NEON(aSrc, remainder);
px = SwizzleVector_NEON<aSwapRB, aOpaqueAlpha>(px);
StoreRemainder_NEON(aDst, remainder, px);
}
SwizzleChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
remainder);
aSrc += aSrcGap;
aDst += aDstGap;
}
}
// Force instantiation of swizzle variants here.
template void SwizzleRow_NEON<true, false>(const uint8_t*, uint8_t*, int32_t);
template void SwizzleRow_NEON<true, true>(const uint8_t*, uint8_t*, int32_t);
template void Swizzle_NEON<true, false>(const uint8_t*, int32_t, uint8_t*,
int32_t, IntSize);
template void Swizzle_NEON<true, true>(const uint8_t*, int32_t, uint8_t*,

View file

@@ -88,6 +88,38 @@ static MOZ_ALWAYS_INLINE __m128i PremultiplyVector_SSE2(const __m128i& aSrc) {
return _mm_or_si128(rb, ga);
}
// Premultiply a chunk of aAlignedRow bytes plus aRemainder trailing pixels.
template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE void PremultiplyChunk_SSE2(const uint8_t*& aSrc,
uint8_t*& aDst,
int32_t aAlignedRow,
int32_t aRemainder) {
// Process all 4-pixel chunks as one vector.
for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
__m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
px = PremultiplyVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
_mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
aSrc += 4 * 4;
aDst += 4 * 4;
}
// Handle any 1-3 remaining pixels.
if (aRemainder) {
__m128i px = LoadRemainder_SSE2(aSrc, aRemainder);
px = PremultiplyVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
StoreRemainder_SSE2(aDst, aRemainder, px);
}
}
// Premultiply a full row of aLength pixels.
template <bool aSwapRB, bool aOpaqueAlpha>
void PremultiplyRow_SSE2(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
int32_t alignedRow = 4 * (aLength & ~3);
int32_t remainder = aLength & 3;
PremultiplyChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
remainder);
}
template <bool aSwapRB, bool aOpaqueAlpha>
void Premultiply_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
int32_t aDstGap, IntSize aSize) {
@@ -98,28 +130,22 @@ void Premultiply_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
aDstGap += 4 * remainder;
for (int32_t height = aSize.height; height > 0; height--) {
// Process all 4-pixel chunks as one vector.
for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) {
__m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
px = PremultiplyVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
_mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
aSrc += 4 * 4;
aDst += 4 * 4;
}
// Handle any 1-3 remaining pixels.
if (remainder) {
__m128i px = LoadRemainder_SSE2(aSrc, remainder);
px = PremultiplyVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
StoreRemainder_SSE2(aDst, remainder, px);
}
PremultiplyChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
remainder);
aSrc += aSrcGap;
aDst += aDstGap;
}
}
// Force instantiation of premultiply variants here.
template void PremultiplyRow_SSE2<false, false>(const uint8_t*, uint8_t*,
int32_t);
template void PremultiplyRow_SSE2<false, true>(const uint8_t*, uint8_t*,
int32_t);
template void PremultiplyRow_SSE2<true, false>(const uint8_t*, uint8_t*,
int32_t);
template void PremultiplyRow_SSE2<true, true>(const uint8_t*, uint8_t*,
int32_t);
template void Premultiply_SSE2<false, false>(const uint8_t*, int32_t, uint8_t*,
int32_t, IntSize);
template void Premultiply_SSE2<false, true>(const uint8_t*, int32_t, uint8_t*,
@@ -293,6 +319,35 @@ SwizzleVector_SSE2<false, false>(const __m128i& aSrc)
}
#endif
template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE void SwizzleChunk_SSE2(const uint8_t*& aSrc,
uint8_t*& aDst,
int32_t aAlignedRow,
int32_t aRemainder) {
// Process all 4-pixel chunks as one vector.
for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
__m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
px = SwizzleVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
_mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
aSrc += 4 * 4;
aDst += 4 * 4;
}
// Handle any 1-3 remaining pixels.
if (aRemainder) {
__m128i px = LoadRemainder_SSE2(aSrc, aRemainder);
px = SwizzleVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
StoreRemainder_SSE2(aDst, aRemainder, px);
}
}
template <bool aSwapRB, bool aOpaqueAlpha>
void SwizzleRow_SSE2(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
int32_t alignedRow = 4 * (aLength & ~3);
int32_t remainder = aLength & 3;
SwizzleChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, remainder);
}
template <bool aSwapRB, bool aOpaqueAlpha>
void Swizzle_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
int32_t aDstGap, IntSize aSize) {
@@ -303,28 +358,15 @@ void Swizzle_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
aDstGap += 4 * remainder;
for (int32_t height = aSize.height; height > 0; height--) {
// Process all 4-pixel chunks as one vector.
for (const uint8_t* end = aSrc + alignedRow; aSrc < end;) {
__m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
px = SwizzleVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
_mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
aSrc += 4 * 4;
aDst += 4 * 4;
}
// Handle any 1-3 remaining pixels.
if (remainder) {
__m128i px = LoadRemainder_SSE2(aSrc, remainder);
px = SwizzleVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
StoreRemainder_SSE2(aDst, remainder, px);
}
SwizzleChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, remainder);
aSrc += aSrcGap;
aDst += aDstGap;
}
}
// Force instantiation of swizzle variants here.
template void SwizzleRow_SSE2<true, false>(const uint8_t*, uint8_t*, int32_t);
template void SwizzleRow_SSE2<true, true>(const uint8_t*, uint8_t*, int32_t);
template void Swizzle_SSE2<true, false>(const uint8_t*, int32_t, uint8_t*,
int32_t, IntSize);
template void Swizzle_SSE2<true, true>(const uint8_t*, int32_t, uint8_t*,