Bug 587936: Add SSE2 optimizations for alpha recovery. r=vlad a=crowder

2010-08-18 05:43:49 +02:00 · 2010-08-18 05:43:49 +02:00 · 4263d7a74e
--- a/gfx/thebes/Makefile.in
+++ b/gfx/thebes/Makefile.in
@ -199,6 +199,12 @@ CPPSRCS += \
 	$(NULL)
 endif

+ifneq (,$(filter x86 x86_64,$(CPU_ARCH)))
+ifdef GNU_CC
+gfxAlphaRecovery.$(OBJ_SUFFIX): MODULE_OPTIMIZE_FLAGS += -msse2
+endif
+endif
+
 SHARED_LIBRARY_LIBS += \
 	../layers/$(LIB_PREFIX)layers.$(LIB_SUFFIX) \
 	$(NULL)
--- a/gfx/thebes/gfxAlphaRecovery.cpp
+++ b/gfx/thebes/gfxAlphaRecovery.cpp
@ -20,6 +20,7 @@
 *
 * Contributor(s):
 *   Vladimir Vukicevic <vladimir@pobox.com>
+ *   Bas Schouten <bschouten@mozilla.com>
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
@ -39,6 +40,8 @@

 #include "gfxImageSurface.h"

+#define MOZILLA_SSE_INCLUDE_HEADER_FOR_SSE2
+#include "mozilla/SSE.h"

 /** from cairo-xlib-utils.c, modified */
 /**
@ -97,6 +100,10 @@ gfxAlphaRecovery::RecoverAlpha(gfxImageSurface* blackSurf,
         whiteSurf->Format() != gfxASurface::ImageFormatRGB24))
        return PR_FALSE;

+    if (!analysis && RecoverAlphaSSE2(blackSurf, whiteSurf)) {
+        return PR_TRUE;
+    }
+
    blackSurf->Flush();
    whiteSurf->Flush();

@ -156,3 +163,132 @@ gfxAlphaRecovery::RecoverAlpha(gfxImageSurface* blackSurf,

    return PR_TRUE;
 }
+
+// 16-byte aligned 4-pixel green/alpha masks, loaded below with _mm_load_si128.
+#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64)) // MSVC
+__declspec(align(16)) PRUint32 greenMaski[] =
+    { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
+__declspec(align(16)) PRUint32 alphaMaski[] =
+    { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
+#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) // GCC
+PRUint32 greenMaski[] __attribute__ ((aligned (16))) =
+    { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
+PRUint32 alphaMaski[] __attribute__ ((aligned (16))) =
+    { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
+#elif defined(__SUNPRO_CC) && (defined(__i386) || defined(__x86_64__)) // Sun
+#pragma align 16 (greenMaski, alphaMaski)
+PRUint32 greenMaski[] = { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
+PRUint32 alphaMaski[] = { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
+#endif // no masks elsewhere; presumably MOZILLA_COMPILE_WITH_SSE2 is unset there
+
+// SSE2 fast path for RecoverAlpha(): rewrites the alpha byte of each blackSurf
+// pixel in place from the white/black green-channel difference. Returns
+// PR_FALSE, before any pixel is rewritten, when SSE2 is unavailable, sizes or
+// formats do not match, or the buffers cannot share 16-byte alignment.
+PRBool
+gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,
+                                   const gfxImageSurface* whiteSurf)
+{
+#if defined(MOZILLA_COMPILE_WITH_SSE2)
+    if (!mozilla::supports_sse2()) {
+        return PR_FALSE;
+    }
+
+    gfxIntSize size = blackSurf->GetSize();
+    if (size != whiteSurf->GetSize() ||
+        (blackSurf->Format() != gfxASurface::ImageFormatARGB32 &&
+         blackSurf->Format() != gfxASurface::ImageFormatRGB24) ||
+        (whiteSurf->Format() != gfxASurface::ImageFormatARGB32 &&
+         whiteSurf->Format() != gfxASurface::ImageFormatRGB24))
+        return PR_FALSE;
+
+    blackSurf->Flush();
+    whiteSurf->Flush();
+    unsigned char* blackData = blackSurf->Data();
+    unsigned char* whiteData = whiteSurf->Data();
+
+    // Both buffers must start at the same offset within a 16-byte line and the
+    // strides must agree modulo 16; otherwise the two pointers cannot be kept
+    // aligned together. The parentheses matter: != binds tighter than &.
+    if ((NS_PTR_TO_UINT32(blackData) & 0xf) !=
+        (NS_PTR_TO_UINT32(whiteData) & 0xf) ||
+        ((blackSurf->Stride() - whiteSurf->Stride()) & 0xf)) {
+        return PR_FALSE;
+    }
+
+    __m128i greenMask = _mm_load_si128((__m128i*)greenMaski);
+    __m128i alphaMask = _mm_load_si128((__m128i*)alphaMaski);
+    for (PRInt32 i = 0; i < size.height; ++i) {
+        PRInt32 j = 0;
+        // Loop single pixels until at 16 byte alignment.
+        while ((NS_PTR_TO_UINT32(blackData) & 0xf) && j < size.width) {
+            *((PRUint32*)blackData) =
+                RecoverPixel(*reinterpret_cast<PRUint32*>(blackData),
+                             *reinterpret_cast<PRUint32*>(whiteData));
+            blackData += 4;
+            whiteData += 4;
+            j++;
+        }
+        // This extra loop allows the compiler to do some more clever register
+        // management and makes it about 5% faster than with only the 4 pixel
+        // at a time loop.
+        for (; j + 8 <= size.width; j += 8) {
+            __m128i black1 = _mm_load_si128((__m128i*)blackData);
+            __m128i white1 = _mm_load_si128((__m128i*)whiteData);
+            __m128i black2 = _mm_load_si128((__m128i*)(blackData + 16));
+            __m128i white2 = _mm_load_si128((__m128i*)(whiteData + 16));
+            // Execute the same instructions as described in RecoverPixel, only
+            // using an SSE2 packed saturated subtract.
+            white1 = _mm_subs_epu8(white1, black1);
+            white2 = _mm_subs_epu8(white2, black2);
+            white1 = _mm_subs_epu8(greenMask, white1);
+            white2 = _mm_subs_epu8(greenMask, white2);
+            // Producing the final black pixel in an XMM register and storing
+            // that is actually faster than doing a masked store since that
+            // does an unaligned storage. We have the black pixel in a register
+            // anyway.
+            black1 = _mm_andnot_si128(alphaMask, black1);
+            black2 = _mm_andnot_si128(alphaMask, black2);
+            white1 = _mm_slli_si128(white1, 2);   // green byte -> alpha byte
+            white2 = _mm_slli_si128(white2, 2);
+            white1 = _mm_and_si128(alphaMask, white1);
+            white2 = _mm_and_si128(alphaMask, white2);
+            black1 = _mm_or_si128(white1, black1);
+            black2 = _mm_or_si128(white2, black2);
+            _mm_store_si128((__m128i*)blackData, black1);
+            _mm_store_si128((__m128i*)(blackData + 16), black2);
+            blackData += 32;
+            whiteData += 32;
+        }
+        for (; j + 4 <= size.width; j += 4) {
+            __m128i black = _mm_load_si128((__m128i*)blackData);
+            __m128i white = _mm_load_si128((__m128i*)whiteData);
+            white = _mm_subs_epu8(white, black);
+            white = _mm_subs_epu8(greenMask, white);
+            black = _mm_andnot_si128(alphaMask, black);
+            white = _mm_slli_si128(white, 2);     // green byte -> alpha byte
+            white = _mm_and_si128(alphaMask, white);
+            black = _mm_or_si128(white, black);
+            _mm_store_si128((__m128i*)blackData, black);
+            blackData += 16;
+            whiteData += 16;
+        }
+        // Loop single pixels until we're done.
+        while (j < size.width) {
+            *((PRUint32*)blackData) =
+                RecoverPixel(*reinterpret_cast<PRUint32*>(blackData),
+                             *reinterpret_cast<PRUint32*>(whiteData));
+            blackData += 4;
+            whiteData += 4;
+            j++;
+        }
+        blackData += blackSurf->Stride() - j * 4;
+        whiteData += whiteSurf->Stride() - j * 4;
+    }
+
+    blackSurf->MarkDirty();
+    return PR_TRUE;
+#else
+    return PR_FALSE;
+#endif
+}
--- a/gfx/thebes/gfxAlphaRecovery.h
+++ b/gfx/thebes/gfxAlphaRecovery.h
@ -59,6 +59,12 @@ public:
    static PRBool RecoverAlpha (gfxImageSurface *blackSurface,
                                const gfxImageSurface *whiteSurface,
                                Analysis *analysis = nsnull);
+
+    /* This does the same as the previous function, only using SSE2
+     * optimizations; it returns PR_FALSE when the SSE2 path could not be
+     * used. Usually this should not be called directly. */
+    static PRBool RecoverAlphaSSE2 (gfxImageSurface *blackSurface,
+                                    const gfxImageSurface *whiteSurface);
 };

 #endif /* _GFXALPHARECOVERY_H_ */