Bug 530550. pixman: Fix arm gcc build.

pixman-arm-simd-asm.c was just a copy of pixman-arm-simd.c instead of the file it was supposed to be.
2010-01-21 11:30:07 -05:00 · 2010-01-21 11:30:07 -05:00 · 2a8a87ae64
--- a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.c
+++ b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.c
@ -28,8 +28,9 @@
 #endif

 #include "pixman-private.h"
+#include "pixman-arm-simd-asm.h"

-static void
+void
 arm_composite_add_8000_8000 (pixman_implementation_t * impl,
 			     pixman_op_t               op,
 			     pixman_image_t *          src_image,
@ -101,7 +102,7 @@ arm_composite_add_8000_8000 (pixman_implementation_t * impl,

 }

-static void
+void
 arm_composite_over_8888_8888 (pixman_implementation_t * impl,
 			      pixman_op_t               op,
 			      pixman_image_t *          src_image,
@ -194,7 +195,7 @@ arm_composite_over_8888_8888 (pixman_implementation_t * impl,
    }
 }

-static void
+void
 arm_composite_over_8888_n_8888 (pixman_implementation_t * impl,
 				pixman_op_t               op,
 				pixman_image_t *          src_image,
@ -303,7 +304,7 @@ arm_composite_over_8888_n_8888 (pixman_implementation_t * impl,
    }
 }

-static void
+void
 arm_composite_over_n_8_8888 (pixman_implementation_t * impl,
 			     pixman_op_t               op,
 			     pixman_image_t *          src_image,
@ -419,67 +420,277 @@ arm_composite_over_n_8_8888 (pixman_implementation_t * impl,
    }
 }

-static const pixman_fast_path_t arm_simd_fast_path_array[] =
+/**
+ * Conversion x8r8g8b8 -> r5g6b5: C per-row loop around an inline-asm pixel-pair loop.
+ * 'w' is pixels per row, 'h' the row count; strides are in elements of each buffer.
+ * TODO: optimize more, eliminate stalls, try to use burst writes (4 words aligned
+ * at 16 byte boundary)
+ */
+static inline void fbComposite_x8r8g8b8_src_r5g6b5_internal_mixed_armv6_c(
+    uint16_t *dst, uint32_t *src, int w, int dst_stride,
+    int src_stride, int h)
 {
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, arm_composite_over_8888_8888    },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, arm_composite_over_8888_8888    },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, arm_composite_over_8888_8888    },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, arm_composite_over_8888_8888    },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_solid,    PIXMAN_a8r8g8b8, arm_composite_over_8888_n_8888  },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_solid,    PIXMAN_x8r8g8b8, arm_composite_over_8888_n_8888  },
-
-    { PIXMAN_OP_ADD, PIXMAN_a8,        PIXMAN_null,     PIXMAN_a8,       arm_composite_add_8000_8000     },
-
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8r8g8b8, arm_composite_over_n_8_8888     },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8r8g8b8, arm_composite_over_n_8_8888     },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8b8g8r8, arm_composite_over_n_8_8888     },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8b8g8r8, arm_composite_over_n_8_8888     },
-
-    { PIXMAN_OP_NONE },
-};
-
-const pixman_fast_path_t *const arm_simd_fast_paths = arm_simd_fast_path_array;
-
-static void
-arm_simd_composite (pixman_implementation_t *imp,
-                    pixman_op_t              op,
-                    pixman_image_t *         src,
-                    pixman_image_t *         mask,
-                    pixman_image_t *         dest,
-                    int32_t                  src_x,
-                    int32_t                  src_y,
-                    int32_t                  mask_x,
-                    int32_t                  mask_y,
-                    int32_t                  dest_x,
-                    int32_t                  dest_y,
-                    int32_t                  width,
-                    int32_t                  height)
-{
-    if (_pixman_run_fast_path (arm_simd_fast_paths, imp,
-                               op, src, mask, dest,
-                               src_x, src_y,
-                               mask_x, mask_y,
-                               dest_x, dest_y,
-                               width, height))
+    uint32_t a, x, y, c1F001F = 0x1F001F; /* mask for the 5-bit R and B fields after >> 3 */
+    int backup_w = w;
+    while (h--)
    {
-	return;
+        w = backup_w;
+        if (w > 0 && ((uintptr_t)dst & 2)) /* one pixel first so the asm's 32-bit stores are word aligned */
+        {
+            x = *src++;
+
+            a = (x >> 3) & c1F001F; /* top 5 bits of R (bits 16-20) and of B (bits 0-4) */
+            x &= 0xFC00;            /* top 6 bits of G */
+            a |= a >> 5;            /* R down to bits 11-15 */
+            a |= x >> 5;            /* G into bits 5-10 */
+
+            *dst++ = a;             /* the store keeps only the low 16 bits */
+            w--;
+        }
+
+        asm volatile(
+            "subs  %[w], %[w], #2\n" /* w -= 2; negative means no full pixel pair is left */
+            "blt   2f\n"
+        "1:\n"
+            "ldr   %[x], [%[src]], #4\n"
+            "ldr   %[y], [%[src]], #4\n"
+            "subs  %[w], %[w], #2\n"
+            
+            "and   %[a], %[c1F001F], %[x], lsr #3\n"
+            "and   %[x], %[x], #0xFC00\n\n"
+            "orr   %[a], %[a], %[a], lsr #5\n"
+            "orr   %[x], %[a], %[x], lsr #5\n"
+
+            "and   %[a], %[c1F001F], %[y], lsr #3\n"
+            "and   %[y], %[y], #0xFC00\n\n"
+            "orr   %[a], %[a], %[a], lsr #5\n"
+            "orr   %[y], %[a], %[y], lsr #5\n"
+
+            "pkhbt %[x], %[x], %[y], lsl #16\n" /* pack: first pixel in the low half, second in the high half */
+            "str   %[x], [%[dst]], #4\n"
+            "bge   1b\n" /* tests the flags set by the subs inside the loop */
+        "2:\n"
+        : [c1F001F] "+&r" (c1F001F), [src] "+&r" (src), [dst] "+&r" (dst), [a] "=&r" (a),
+          [x] "=&r" (x), [y] "=&r" (y), [w] "+&r" (w)
+        : /* no inputs */ : "cc", "memory"); /* subs changes the flags; ldr/str touch *src and *dst */
+
+        if (w & 1) /* w is -1 here when one trailing pixel remains, -2 when none */
+        {
+            x = *src++;
+
+            a = (x >> 3) & c1F001F;
+            x = x & 0xFC00;
+            a |= a >> 5;
+            a |= x >> 5;
+
+            *dst++ = a;
+        }
+
+        src += src_stride - backup_w; /* both pointers advanced backup_w pixels within the row */
+        dst += dst_stride - backup_w;
    }
-
-    _pixman_implementation_composite (imp->delegate, op,
-                                      src, mask, dest,
-                                      src_x, src_y,
-                                      mask_x, mask_y,
-                                      dest_x, dest_y,
-                                      width, height);
 }

-pixman_implementation_t *
-_pixman_implementation_create_arm_simd (void)
+/**
+ * Conversion x8r8g8b8 -> r5g6b5, written entirely in assembly (naked function).
+ *
+ * Note: 'w' must be >= 7 (aligning DST to 16 bytes consumes up to 1+2+4 pixels unchecked)
+ */
+static void __attribute__((naked)) fbComposite_x8r8g8b8_src_r5g6b5_internal_armv6(
+    uint16_t *dst, uint32_t *src, int w, int dst_stride,
+    int src_stride, int h)
 {
-    pixman_implementation_t *general = _pixman_implementation_create_fast_path ();
-    pixman_implementation_t *imp = _pixman_implementation_create (general);
+    asm volatile(
+        /* define supplementary macros */
+        ".macro cvt8888to565 PIX\n"
+            "and   A, C1F001F, \\PIX, lsr #3\n" /* A = top 5 bits of R (bits 16-20) and of B (bits 0-4) */
+            "and   \\PIX, \\PIX, #0xFC00\n\n" /* keep the top 6 bits of G */
+            "orr   A, A, A, lsr #5\n" /* copy R down to bits 11-15 */
+            "orr   \\PIX, A, \\PIX, lsr #5\n" /* G into bits 5-10; low halfword is now r5g6b5 */
+        ".endm\n"

-    imp->composite = arm_simd_composite;
+        ".macro combine_pixels_pair PIX1, PIX2\n"
+            "pkhbt \\PIX1, \\PIX1, \\PIX2, lsl #16\n" /* Note: assume little endian byte order */
+        ".endm\n"
+
+        /* function entry, save all registers (10 words) to stack */
+        "stmdb   sp!, {r4-r11, ip, lr}\n"
+        
+        /* define some aliases */
+        "DST     .req  r0\n"
+        "SRC     .req  r1\n"
+        "W       .req  r2\n"
+        "H       .req  r3\n"
+
+        "TMP1    .req  r4\n"
+        "TMP2    .req  r5\n"
+        "TMP3    .req  r6\n"
+        "TMP4    .req  r7\n"
+        "TMP5    .req  r8\n"
+        "TMP6    .req  r9\n"
+        "TMP7    .req  r10\n"
+        "TMP8    .req  r11\n"
+
+        "C1F001F .req  ip\n"
+        "A       .req  lr\n"
+        
+        "ldr     TMP1, [sp, #(10*4+0)]\n" /* load src_stride */
+        "ldr     C1F001F, =0x1F001F\n"
+        "sub     r3, r3, W\n"
+        "str     r3, [sp, #(10*4+0)]\n" /* store (dst_stride-w) */
+        "ldr     r3, [sp, #(10*4+4)]\n" /* load h */
+        "sub     TMP1, TMP1, W\n"
+        "str     TMP1, [sp, #(10*4+4)]\n" /* store (src_stride-w) */
+        
+        "str     W, [sp, #(8*4)]\n" /* saved ip = W */
+
+    "0:\n"
+        "subs    H, H, #1\n" /* H = rows left after this one; negative means h was <= 0 */
+        "blt     6f\n"
+    "1:\n"
+        /* align DST at 4 byte boundary */
+        "tst     DST, #2\n"
+        "beq     2f\n"
+        "ldr     TMP1, [SRC], #4\n"
+        "sub     W, W, #1\n"
+        "cvt8888to565 TMP1\n"
+        "strh    TMP1, [DST], #2\n"
+    "2:"
+        /* align DST at 8 byte boundary */
+        "tst     DST, #4\n"
+        "beq     2f\n"
+        "ldmia   SRC!, {TMP1, TMP2}\n"
+        "sub     W, W, #2\n"
+        "cvt8888to565 TMP1\n"
+        "cvt8888to565 TMP2\n"
+        "combine_pixels_pair TMP1, TMP2\n"
+        "str     TMP1, [DST], #4\n"
+    "2:"
+        /* align DST at 16 byte boundary */
+        "tst     DST, #8\n"
+        "beq     2f\n"
+        "ldmia   SRC!, {TMP1, TMP2, TMP3, TMP4}\n"
+        "sub     W, W, #4\n"
+        "cvt8888to565 TMP1\n"
+        "cvt8888to565 TMP2\n"
+        "cvt8888to565 TMP3\n"
+        "cvt8888to565 TMP4\n"
+        "combine_pixels_pair TMP1, TMP2\n"
+        "combine_pixels_pair TMP3, TMP4\n"
+        "stmia DST!, {TMP1, TMP3}\n"
+    "2:"
+        /* inner loop, process 8 pixels per iteration */
+        "subs    W, W, #8\n" /* bias W by -8: its sign says whether 8 more pixels remain */
+        "blt     4f\n"
+    "3:\n"
+        "ldmia   SRC!, {TMP1, TMP2, TMP3, TMP4, TMP5, TMP6, TMP7, TMP8}\n"
+        "subs    W, W, #8\n"
+        "cvt8888to565 TMP1\n"
+        "cvt8888to565 TMP2\n"
+        "cvt8888to565 TMP3\n"
+        "cvt8888to565 TMP4\n"
+        "cvt8888to565 TMP5\n"
+        "cvt8888to565 TMP6\n"
+        "cvt8888to565 TMP7\n"
+        "cvt8888to565 TMP8\n"
+        "combine_pixels_pair TMP1, TMP2\n"
+        "combine_pixels_pair TMP3, TMP4\n"
+        "combine_pixels_pair TMP5, TMP6\n"
+        "combine_pixels_pair TMP7, TMP8\n"
+        "stmia   DST!, {TMP1, TMP3, TMP5, TMP7}\n"
+        "bge     3b\n" /* tests the flags set by the subs at the top of this iteration */
+    "4:\n"
+
+        /* process the remaining pixels */
+        "tst     W, #4\n" /* W is in -8..-1 here; its low three bits equal the leftover pixel count */
+        "beq     4f\n"
+        "ldmia   SRC!, {TMP1, TMP2, TMP3, TMP4}\n"
+        "cvt8888to565 TMP1\n"
+        "cvt8888to565 TMP2\n"
+        "cvt8888to565 TMP3\n"
+        "cvt8888to565 TMP4\n"
+        "combine_pixels_pair TMP1, TMP2\n"
+        "combine_pixels_pair TMP3, TMP4\n"
+        "stmia   DST!, {TMP1, TMP3}\n"
+    "4:\n"
+        "tst     W, #2\n"
+        "beq     4f\n"
+        "ldmia   SRC!, {TMP1, TMP2}\n"
+        "cvt8888to565 TMP1\n"
+        "cvt8888to565 TMP2\n"
+        "combine_pixels_pair TMP1, TMP2\n"
+        "str     TMP1, [DST], #4\n"
+    "4:\n"
+        "tst     W, #1\n"
+        "beq     4f\n"
+        "ldr     TMP1, [SRC], #4\n"
+        "cvt8888to565 TMP1\n"
+        "strh    TMP1, [DST], #2\n"
+    "4:\n"
+        "ldr     TMP1, [sp, #(10*4+0)]\n" /* (dst_stride-w) */
+        "ldr     TMP2, [sp, #(10*4+4)]\n" /* (src_stride-w) */
+        "ldr     W, [sp, #(8*4)]\n" /* reload the per-row width saved in the ip slot */
+        "subs    H, H, #1\n"
+        "add     DST, DST, TMP1, lsl #1\n" /* skip to the next row: 2 bytes per dst pixel */
+        "add     SRC, SRC, TMP2, lsl #2\n" /* skip to the next row: 4 bytes per src pixel */
+        "bge     1b\n"
+    "6:\n"
+        "ldmia   sp!, {r4-r11, ip, pc}\n" /* restore all registers and return */
+        ".ltorg\n"
+
+        ".unreq   DST\n"
+        ".unreq   SRC\n"
+        ".unreq   W\n"
+        ".unreq   H\n"
+
+        ".unreq   TMP1\n"
+        ".unreq   TMP2\n"
+        ".unreq   TMP3\n"
+        ".unreq   TMP4\n"
+        ".unreq   TMP5\n"
+        ".unreq   TMP6\n"
+        ".unreq   TMP7\n"
+        ".unreq   TMP8\n"
+
+        ".unreq   C1F001F\n"
+        ".unreq   A\n"
+
+        ".purgem  cvt8888to565\n"
+        ".purgem  combine_pixels_pair\n"
+    );
+}
+
+void
+arm_composite_src_8888_0565 (pixman_implementation_t * impl,
+			     pixman_op_t               op,
+			     pixman_image_t *          src_image,
+			     pixman_image_t *          mask_image,
+			     pixman_image_t *          dst_image,
+			     int32_t                   src_x,
+			     int32_t                   src_y,
+			     int32_t                   mask_x,
+			     int32_t                   mask_y,
+			     int32_t                   dest_x,
+			     int32_t                   dest_y,
+			     int32_t                   width,
+			     int32_t                   height)
+{   /* Copies a width x height block from src_image to dst_image via the 8888 -> 0565 helpers; impl, op and the mask arguments are not used. */
+    uint16_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w, h; /* same width as the parameters: uint16_t silently truncated sizes above 65535 */
+
+    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    dst = dst_line;
+    src = src_line;
+    h = height;
+    w = width;
+    if (w <= 0 || h <= 0) return; /* nothing to draw; the helpers do not expect non-positive sizes */
+    if (w < 7) /* the pure-asm routine requires w >= 7, so narrow rows take the mixed C/asm path */
+        fbComposite_x8r8g8b8_src_r5g6b5_internal_mixed_armv6_c(dst, src, w, dst_stride, src_stride, h);
+    else
+        fbComposite_x8r8g8b8_src_r5g6b5_internal_armv6(dst, src, w, dst_stride, src_stride, h);

-    return imp;
 }