diff --git a/gfx/cairo/README b/gfx/cairo/README index c3f017f441df..27f2cc14ca64 100644 --- a/gfx/cairo/README +++ b/gfx/cairo/README @@ -230,6 +230,10 @@ pixman-mingw32.patch: include xmmintrin.h on MINGW32 builds to avoid redefinitio pixman-rename.patch: include pixman-rename.h for renaming of external symbols +pixman-arm32-clang.patch: don't use -no-integrated-as for arm32 + +pixman-arm64-clang.patch: don't use -no-integrated-as for aarch64 + quartz-support-color-emoji-font.patch: support Apple Color Emoji font in cairo-quartz backend use-show-text-glyphs-if-glyph-path-fails.patch: fall back to show_text_glyphs even at huge sizes if scaled_font_glyph_path didn't work diff --git a/gfx/cairo/pixman-arm32-clang.patch b/gfx/cairo/pixman-arm32-clang.patch new file mode 100644 index 000000000000..e69fb376d615 --- /dev/null +++ b/gfx/cairo/pixman-arm32-clang.patch @@ -0,0 +1,5212 @@ +https://gitlab.freedesktop.org/pixman/pixman/-/issues/74 + +diff --git a/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S b/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S +--- a/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S ++++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S +@@ -77,206 +77,206 @@ + * format conversion, and interpolation as separate macros which can be used + * as the basic building blocks for constructing bilinear scanline functions. + */ + + .macro bilinear_load_8888 reg1, reg2, tmp + mov TMP1, X, asr #16 + add X, X, UX + add TMP1, TOP, TMP1, asl #2 +- vld1.32 {reg1}, [TMP1], STRIDE +- vld1.32 {reg2}, [TMP1] ++ vld1.32 {\reg1}, [TMP1], STRIDE ++ vld1.32 {\reg2}, [TMP1] + .endm + + .macro bilinear_load_0565 reg1, reg2, tmp + mov TMP1, X, asr #16 + add X, X, UX + add TMP1, TOP, TMP1, asl #1 +- vld1.32 {reg2[0]}, [TMP1], STRIDE +- vld1.32 {reg2[1]}, [TMP1] +- convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp ++ vld1.32 {\reg2[0]}, [TMP1], STRIDE ++ vld1.32 {\reg2[1]}, [TMP1] ++ convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp + .endm + + .macro bilinear_load_and_vertical_interpolate_two_8888 \ + acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2 + +- bilinear_load_8888 reg1, reg2, tmp1 +- vmull.u8 acc1, reg1, d28 +- vmlal.u8 acc1, reg2, d29 +- bilinear_load_8888 reg3, reg4, tmp2 +- vmull.u8 acc2, reg3, d28 +- vmlal.u8 acc2, reg4, d29 ++ bilinear_load_8888 \reg1, \reg2, \tmp1 ++ vmull.u8 \acc1, \reg1, d28 ++ vmlal.u8 \acc1, \reg2, d29 ++ bilinear_load_8888 \reg3, \reg4, \tmp2 ++ vmull.u8 \acc2, \reg3, d28 ++ vmlal.u8 \acc2, \reg4, d29 + .endm + + .macro bilinear_load_and_vertical_interpolate_four_8888 \ + xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ + yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi + + bilinear_load_and_vertical_interpolate_two_8888 \ +- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi ++ \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi + bilinear_load_and_vertical_interpolate_two_8888 \ +- yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi ++ \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi + .endm + + .macro bilinear_load_and_vertical_interpolate_two_0565 \ + acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi + + mov TMP1, X, asr #16 + add X, X, UX + add TMP1, TOP, TMP1, asl #1 + mov TMP2, X, asr #16 + add X, X, UX + add TMP2, TOP, TMP2, asl #1 +- vld1.32 {acc2lo[0]}, [TMP1], STRIDE +- vld1.32 {acc2hi[0]}, [TMP2], STRIDE +- vld1.32 {acc2lo[1]}, [TMP1] +- vld1.32 {acc2hi[1]}, [TMP2] +- convert_0565_to_x888 acc2, reg3, reg2, reg1 +- vzip.u8 reg1, reg3 +- 
vzip.u8 reg2, reg4 +- vzip.u8 reg3, reg4 +- vzip.u8 reg1, reg2 +- vmull.u8 acc1, reg1, d28 +- vmlal.u8 acc1, reg2, d29 +- vmull.u8 acc2, reg3, d28 +- vmlal.u8 acc2, reg4, d29 ++ vld1.32 {\acc2lo[0]}, [TMP1], STRIDE ++ vld1.32 {\acc2hi[0]}, [TMP2], STRIDE ++ vld1.32 {\acc2lo[1]}, [TMP1] ++ vld1.32 {\acc2hi[1]}, [TMP2] ++ convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1 ++ vzip.u8 \reg1, \reg3 ++ vzip.u8 \reg2, \reg4 ++ vzip.u8 \reg3, \reg4 ++ vzip.u8 \reg1, \reg2 ++ vmull.u8 \acc1, \reg1, d28 ++ vmlal.u8 \acc1, \reg2, d29 ++ vmull.u8 \acc2, \reg3, d28 ++ vmlal.u8 \acc2, \reg4, d29 + .endm + + .macro bilinear_load_and_vertical_interpolate_four_0565 \ + xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ + yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi + + mov TMP1, X, asr #16 + add X, X, UX + add TMP1, TOP, TMP1, asl #1 + mov TMP2, X, asr #16 + add X, X, UX + add TMP2, TOP, TMP2, asl #1 +- vld1.32 {xacc2lo[0]}, [TMP1], STRIDE +- vld1.32 {xacc2hi[0]}, [TMP2], STRIDE +- vld1.32 {xacc2lo[1]}, [TMP1] +- vld1.32 {xacc2hi[1]}, [TMP2] +- convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1 ++ vld1.32 {\xacc2lo[0]}, [TMP1], STRIDE ++ vld1.32 {\xacc2hi[0]}, [TMP2], STRIDE ++ vld1.32 {\xacc2lo[1]}, [TMP1] ++ vld1.32 {\xacc2hi[1]}, [TMP2] ++ convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1 + mov TMP1, X, asr #16 + add X, X, UX + add TMP1, TOP, TMP1, asl #1 + mov TMP2, X, asr #16 + add X, X, UX + add TMP2, TOP, TMP2, asl #1 +- vld1.32 {yacc2lo[0]}, [TMP1], STRIDE +- vzip.u8 xreg1, xreg3 +- vld1.32 {yacc2hi[0]}, [TMP2], STRIDE +- vzip.u8 xreg2, xreg4 +- vld1.32 {yacc2lo[1]}, [TMP1] +- vzip.u8 xreg3, xreg4 +- vld1.32 {yacc2hi[1]}, [TMP2] +- vzip.u8 xreg1, xreg2 +- convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1 +- vmull.u8 xacc1, xreg1, d28 +- vzip.u8 yreg1, yreg3 +- vmlal.u8 xacc1, xreg2, d29 +- vzip.u8 yreg2, yreg4 +- vmull.u8 xacc2, xreg3, d28 +- vzip.u8 yreg3, yreg4 +- vmlal.u8 xacc2, xreg4, d29 +- vzip.u8 yreg1, yreg2 +- vmull.u8 yacc1, yreg1, d28 +- vmlal.u8 yacc1, yreg2, d29 +- vmull.u8 yacc2, yreg3, d28 +- vmlal.u8 yacc2, yreg4, d29 ++ vld1.32 {\yacc2lo[0]}, [TMP1], STRIDE ++ vzip.u8 \xreg1, \xreg3 ++ vld1.32 {\yacc2hi[0]}, [TMP2], STRIDE ++ vzip.u8 \xreg2, \xreg4 ++ vld1.32 {\yacc2lo[1]}, [TMP1] ++ vzip.u8 \xreg3, \xreg4 ++ vld1.32 {\yacc2hi[1]}, [TMP2] ++ vzip.u8 \xreg1, \xreg2 ++ convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1 ++ vmull.u8 \xacc1, \xreg1, d28 ++ vzip.u8 \yreg1, \yreg3 ++ vmlal.u8 \xacc1, \xreg2, d29 ++ vzip.u8 \yreg2, \yreg4 ++ vmull.u8 \xacc2, \xreg3, d28 ++ vzip.u8 \yreg3, \yreg4 ++ vmlal.u8 \xacc2, \xreg4, d29 ++ vzip.u8 \yreg1, \yreg2 ++ vmull.u8 \yacc1, \yreg1, d28 ++ vmlal.u8 \yacc1, \yreg2, d29 ++ vmull.u8 \yacc2, \yreg3, d28 ++ vmlal.u8 \yacc2, \yreg4, d29 + .endm + + .macro bilinear_store_8888 numpix, tmp1, tmp2 +-.if numpix == 4 ++.if \numpix == 4 + vst1.32 {d0, d1}, [OUT]! +-.elseif numpix == 2 ++.elseif \numpix == 2 + vst1.32 {d0}, [OUT]! +-.elseif numpix == 1 ++.elseif \numpix == 1 + vst1.32 {d0[0]}, [OUT, :32]! + .else + .error bilinear_store_8888 numpix is unsupported + .endif + .endm + + .macro bilinear_store_0565 numpix, tmp1, tmp2 + vuzp.u8 d0, d1 + vuzp.u8 d2, d3 + vuzp.u8 d1, d3 + vuzp.u8 d0, d2 +- convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2 +-.if numpix == 4 ++ convert_8888_to_0565 d2, d1, d0, q1, \tmp1, \tmp2 ++.if \numpix == 4 + vst1.16 {d2}, [OUT]! +-.elseif numpix == 2 ++.elseif \numpix == 2 + vst1.32 {d2[0]}, [OUT]! +-.elseif numpix == 1 ++.elseif \numpix == 1 + vst1.16 {d2[0]}, [OUT]! 
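/*
 * Illustrative sketch of the change this patch makes throughout (the macro
 * name "demo_store" below is hypothetical, not part of pixman): binutils'
 * gas happens to substitute bare parameter names such as "numpix" inside a
 * .macro body, but clang's integrated assembler only performs the documented
 * backslashed substitution, so every reference becomes "\numpix" etc.
 *
 *     .macro demo_store numpix            @ old spelling, relies on gas leniency
 *     .if numpix == 4
 *         vst1.32 {d0, d1}, [OUT]!
 *     .endif
 *     .endm
 *
 *     .macro demo_store numpix            @ portable spelling used by this patch
 *     .if \numpix == 4
 *         vst1.32 {d0, d1}, [OUT]!
 *     .endif
 *     .endm
 */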
+ .else + .error bilinear_store_0565 numpix is unsupported + .endif + .endm + + + /* + * Macros for loading mask pixels into register 'mask'. + * vdup must be done in somewhere else. + */ + .macro bilinear_load_mask_x numpix, mask + .endm + + .macro bilinear_load_mask_8 numpix, mask +-.if numpix == 4 +- vld1.32 {mask[0]}, [MASK]! +-.elseif numpix == 2 +- vld1.16 {mask[0]}, [MASK]! +-.elseif numpix == 1 +- vld1.8 {mask[0]}, [MASK]! ++.if \numpix == 4 ++ vld1.32 {\mask[0]}, [MASK]! ++.elseif \numpix == 2 ++ vld1.16 {\mask[0]}, [MASK]! ++.elseif \numpix == 1 ++ vld1.8 {\mask[0]}, [MASK]! + .else +- .error bilinear_load_mask_8 numpix is unsupported ++ .error bilinear_load_mask_8 \numpix is unsupported + .endif + pld [MASK, #prefetch_offset] + .endm + + .macro bilinear_load_mask mask_fmt, numpix, mask +- bilinear_load_mask_&mask_fmt numpix, mask ++ bilinear_load_mask_\()\mask_fmt \numpix, \mask + .endm + + + /* + * Macros for loading destination pixels into register 'dst0' and 'dst1'. + * Interleave should be done somewhere else. + */ + .macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01 + .endm + + .macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01 + .endm + + .macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01 +-.if numpix == 4 +- vld1.32 {dst0, dst1}, [OUT] +-.elseif numpix == 2 +- vld1.32 {dst0}, [OUT] +-.elseif numpix == 1 +- vld1.32 {dst0[0]}, [OUT] ++.if \numpix == 4 ++ vld1.32 {\dst0, \dst1}, [OUT] ++.elseif \numpix == 2 ++ vld1.32 {\dst0}, [OUT] ++.elseif \numpix == 1 ++ vld1.32 {\dst0[0]}, [OUT] + .else +- .error bilinear_load_dst_8888 numpix is unsupported ++ .error bilinear_load_dst_8888 \numpix is unsupported + .endif + pld [OUT, #(prefetch_offset * 4)] + .endm + + .macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01 +- bilinear_load_dst_8888 numpix, dst0, dst1, dst01 ++ bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01 + .endm + + .macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01 +- bilinear_load_dst_8888 numpix, dst0, dst1, dst01 ++ bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01 + .endm + + .macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01 +- bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01 ++ bilinear_load_dst_\()\dst_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01 + .endm + + /* + * Macros for duplicating partially loaded mask to fill entire register. + * We will apply mask to interleaved source pixels, that is + * (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3) + * (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3) + * So, we need to duplicate loaded mask into whole register. +@@ -285,79 +285,79 @@ + * (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1) + * (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1) + * We can do some optimizations for this including last pixel cases. 
+ */ + .macro bilinear_duplicate_mask_x numpix, mask + .endm + + .macro bilinear_duplicate_mask_8 numpix, mask +-.if numpix == 4 +- vdup.32 mask, mask[0] +-.elseif numpix == 2 +- vdup.16 mask, mask[0] +-.elseif numpix == 1 +- vdup.8 mask, mask[0] ++.if \numpix == 4 ++ vdup.32 \mask, \mask[0] ++.elseif \numpix == 2 ++ vdup.16 \mask, \mask[0] ++.elseif \numpix == 1 ++ vdup.8 \mask, \mask[0] + .else + .error bilinear_duplicate_mask_8 is unsupported + .endif + .endm + + .macro bilinear_duplicate_mask mask_fmt, numpix, mask +- bilinear_duplicate_mask_&mask_fmt numpix, mask ++ bilinear_duplicate_mask_\()\mask_fmt \numpix, \mask + .endm + + /* + * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form. + * Interleave should be done when maks is enabled or operator is 'over'. + */ + .macro bilinear_interleave src0, src1, dst0, dst1 +- vuzp.8 src0, src1 +- vuzp.8 dst0, dst1 +- vuzp.8 src0, src1 +- vuzp.8 dst0, dst1 ++ vuzp.8 \src0, \src1 ++ vuzp.8 \dst0, \dst1 ++ vuzp.8 \src0, \src1 ++ vuzp.8 \dst0, \dst1 + .endm + + .macro bilinear_interleave_src_dst_x_src \ + numpix, src0, src1, src01, dst0, dst1, dst01 + .endm + + .macro bilinear_interleave_src_dst_x_over \ + numpix, src0, src1, src01, dst0, dst1, dst01 + +- bilinear_interleave src0, src1, dst0, dst1 ++ bilinear_interleave \src0, \src1, \dst0, \dst1 + .endm + + .macro bilinear_interleave_src_dst_x_add \ + numpix, src0, src1, src01, dst0, dst1, dst01 + .endm + + .macro bilinear_interleave_src_dst_8_src \ + numpix, src0, src1, src01, dst0, dst1, dst01 + +- bilinear_interleave src0, src1, dst0, dst1 ++ bilinear_interleave \src0, \src1, \dst0, \dst1 + .endm + + .macro bilinear_interleave_src_dst_8_over \ + numpix, src0, src1, src01, dst0, dst1, dst01 + +- bilinear_interleave src0, src1, dst0, dst1 ++ bilinear_interleave \src0, \src1, \dst0, \dst1 + .endm + + .macro bilinear_interleave_src_dst_8_add \ + numpix, src0, src1, src01, dst0, dst1, dst01 + +- bilinear_interleave src0, src1, dst0, dst1 ++ bilinear_interleave \src0, \src1, \dst0, \dst1 + .endm + + .macro bilinear_interleave_src_dst \ + mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01 + +- bilinear_interleave_src_dst_&mask_fmt&_&op \ +- numpix, src0, src1, src01, dst0, dst1, dst01 ++ bilinear_interleave_src_dst_\()\mask_fmt\()_\()\op \ ++ \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01 + .endm + + + /* + * Macros for applying masks to src pixels. (see combine_mask_u() function) + * src, dst should be in interleaved form. + * mask register should be in form (m0, m1, m2, m3). 
+ */ +@@ -365,217 +365,217 @@ + numpix, src0, src1, src01, mask, \ + tmp01, tmp23, tmp45, tmp67 + .endm + + .macro bilinear_apply_mask_to_src_8 \ + numpix, src0, src1, src01, mask, \ + tmp01, tmp23, tmp45, tmp67 + +- vmull.u8 tmp01, src0, mask +- vmull.u8 tmp23, src1, mask ++ vmull.u8 \tmp01, \src0, \mask ++ vmull.u8 \tmp23, \src1, \mask + /* bubbles */ +- vrshr.u16 tmp45, tmp01, #8 +- vrshr.u16 tmp67, tmp23, #8 ++ vrshr.u16 \tmp45, \tmp01, #8 ++ vrshr.u16 \tmp67, \tmp23, #8 + /* bubbles */ +- vraddhn.u16 src0, tmp45, tmp01 +- vraddhn.u16 src1, tmp67, tmp23 ++ vraddhn.u16 \src0, \tmp45, \tmp01 ++ vraddhn.u16 \src1, \tmp67, \tmp23 + .endm + + .macro bilinear_apply_mask_to_src \ + mask_fmt, numpix, src0, src1, src01, mask, \ + tmp01, tmp23, tmp45, tmp67 + +- bilinear_apply_mask_to_src_&mask_fmt \ +- numpix, src0, src1, src01, mask, \ +- tmp01, tmp23, tmp45, tmp67 ++ bilinear_apply_mask_to_src_\()\mask_fmt \ ++ \numpix, \src0, \src1, \src01, \mask, \ ++ \tmp01, \tmp23, \tmp45, \tmp67 + .endm + + + /* + * Macros for combining src and destination pixels. + * Interleave or not is depending on operator 'op'. + */ + .macro bilinear_combine_src \ + numpix, src0, src1, src01, dst0, dst1, dst01, \ + tmp01, tmp23, tmp45, tmp67, tmp8 + .endm + + .macro bilinear_combine_over \ + numpix, src0, src1, src01, dst0, dst1, dst01, \ + tmp01, tmp23, tmp45, tmp67, tmp8 + +- vdup.32 tmp8, src1[1] ++ vdup.32 \tmp8, \src1[1] + /* bubbles */ +- vmvn.8 tmp8, tmp8 ++ vmvn.8 \tmp8, \tmp8 + /* bubbles */ +- vmull.u8 tmp01, dst0, tmp8 ++ vmull.u8 \tmp01, \dst0, \tmp8 + /* bubbles */ +- vmull.u8 tmp23, dst1, tmp8 ++ vmull.u8 \tmp23, \dst1, \tmp8 + /* bubbles */ +- vrshr.u16 tmp45, tmp01, #8 +- vrshr.u16 tmp67, tmp23, #8 ++ vrshr.u16 \tmp45, \tmp01, #8 ++ vrshr.u16 \tmp67, \tmp23, #8 + /* bubbles */ +- vraddhn.u16 dst0, tmp45, tmp01 +- vraddhn.u16 dst1, tmp67, tmp23 ++ vraddhn.u16 \dst0, \tmp45, \tmp01 ++ vraddhn.u16 \dst1, \tmp67, \tmp23 + /* bubbles */ +- vqadd.u8 src01, dst01, src01 ++ vqadd.u8 \src01, \dst01, \src01 + .endm + + .macro bilinear_combine_add \ + numpix, src0, src1, src01, dst0, dst1, dst01, \ + tmp01, tmp23, tmp45, tmp67, tmp8 + +- vqadd.u8 src01, dst01, src01 ++ vqadd.u8 \src01, \dst01, \src01 + .endm + + .macro bilinear_combine \ + op, numpix, src0, src1, src01, dst0, dst1, dst01, \ + tmp01, tmp23, tmp45, tmp67, tmp8 + +- bilinear_combine_&op \ +- numpix, src0, src1, src01, dst0, dst1, dst01, \ +- tmp01, tmp23, tmp45, tmp67, tmp8 ++ bilinear_combine_\()\op \ ++ \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01, \ ++ \tmp01, \tmp23, \tmp45, \tmp67, \tmp8 + .endm + + /* + * Macros for final deinterleaving of destination pixels if needed. 
+ */ + .macro bilinear_deinterleave numpix, dst0, dst1, dst01 +- vuzp.8 dst0, dst1 ++ vuzp.8 \dst0, \dst1 + /* bubbles */ +- vuzp.8 dst0, dst1 ++ vuzp.8 \dst0, \dst1 + .endm + + .macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01 + .endm + + .macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01 +- bilinear_deinterleave numpix, dst0, dst1, dst01 ++ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 + .endm + + .macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01 + .endm + + .macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01 +- bilinear_deinterleave numpix, dst0, dst1, dst01 ++ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 + .endm + + .macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01 +- bilinear_deinterleave numpix, dst0, dst1, dst01 ++ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 + .endm + + .macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01 +- bilinear_deinterleave numpix, dst0, dst1, dst01 ++ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 + .endm + + .macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01 +- bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01 ++ bilinear_deinterleave_dst_\()\mask_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01 + .endm + + + .macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op +- bilinear_load_&src_fmt d0, d1, d2 +- bilinear_load_mask mask_fmt, 1, d4 +- bilinear_load_dst dst_fmt, op, 1, d18, d19, q9 ++ bilinear_load_\()\src_fmt d0, d1, d2 ++ bilinear_load_mask \mask_fmt, 1, d4 ++ bilinear_load_dst \dst_fmt, \op, 1, d18, d19, q9 + vmull.u8 q1, d0, d28 + vmlal.u8 q1, d1, d29 + /* 5 cycles bubble */ + vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS + vmlsl.u16 q0, d2, d30 + vmlal.u16 q0, d3, d30 + /* 5 cycles bubble */ +- bilinear_duplicate_mask mask_fmt, 1, d4 ++ bilinear_duplicate_mask \mask_fmt, 1, d4 + vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) + /* 3 cycles bubble */ + vmovn.u16 d0, q0 + /* 1 cycle bubble */ + bilinear_interleave_src_dst \ +- mask_fmt, op, 1, d0, d1, q0, d18, d19, q9 ++ \mask_fmt, \op, 1, d0, d1, q0, d18, d19, q9 + bilinear_apply_mask_to_src \ +- mask_fmt, 1, d0, d1, q0, d4, \ ++ \mask_fmt, 1, d0, d1, q0, d4, \ + q3, q8, q10, q11 + bilinear_combine \ +- op, 1, d0, d1, q0, d18, d19, q9, \ ++ \op, 1, d0, d1, q0, d18, d19, q9, \ + q3, q8, q10, q11, d5 +- bilinear_deinterleave_dst mask_fmt, op, 1, d0, d1, q0 +- bilinear_store_&dst_fmt 1, q2, q3 ++ bilinear_deinterleave_dst \mask_fmt, \op, 1, d0, d1, q0 ++ bilinear_store_\()\dst_fmt 1, q2, q3 + .endm + + .macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op +- bilinear_load_and_vertical_interpolate_two_&src_fmt \ ++ bilinear_load_and_vertical_interpolate_two_\()\src_fmt \ + q1, q11, d0, d1, d20, d21, d22, d23 +- bilinear_load_mask mask_fmt, 2, d4 +- bilinear_load_dst dst_fmt, op, 2, d18, d19, q9 ++ bilinear_load_mask \mask_fmt, 2, d4 ++ bilinear_load_dst \dst_fmt, \op, 2, d18, d19, q9 + vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS + vmlsl.u16 q0, d2, d30 + vmlal.u16 q0, d3, d30 + vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS + vmlsl.u16 q10, d22, d31 + vmlal.u16 q10, d23, d31 + vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) +- bilinear_duplicate_mask mask_fmt, 2, d4 ++ bilinear_duplicate_mask \mask_fmt, 2, d4 + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) + vadd.u16 q12, q12, q13 + vmovn.u16 d0, q0 + bilinear_interleave_src_dst \ +- mask_fmt, op, 2, 
d0, d1, q0, d18, d19, q9 ++ \mask_fmt, \op, 2, d0, d1, q0, d18, d19, q9 + bilinear_apply_mask_to_src \ +- mask_fmt, 2, d0, d1, q0, d4, \ ++ \mask_fmt, 2, d0, d1, q0, d4, \ + q3, q8, q10, q11 + bilinear_combine \ +- op, 2, d0, d1, q0, d18, d19, q9, \ ++ \op, 2, d0, d1, q0, d18, d19, q9, \ + q3, q8, q10, q11, d5 +- bilinear_deinterleave_dst mask_fmt, op, 2, d0, d1, q0 +- bilinear_store_&dst_fmt 2, q2, q3 ++ bilinear_deinterleave_dst \mask_fmt, \op, 2, d0, d1, q0 ++ bilinear_store_\()\dst_fmt 2, q2, q3 + .endm + + .macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op +- bilinear_load_and_vertical_interpolate_four_&src_fmt \ ++ bilinear_load_and_vertical_interpolate_four_\()\src_fmt \ + q1, q11, d0, d1, d20, d21, d22, d23 \ + q3, q9, d4, d5, d16, d17, d18, d19 + pld [TMP1, PF_OFFS] + sub TMP1, TMP1, STRIDE + vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS + vmlsl.u16 q0, d2, d30 + vmlal.u16 q0, d3, d30 + vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS + vmlsl.u16 q10, d22, d31 + vmlal.u16 q10, d23, d31 + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) + vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS + vmlsl.u16 q2, d6, d30 + vmlal.u16 q2, d7, d30 + vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS +- bilinear_load_mask mask_fmt, 4, d22 +- bilinear_load_dst dst_fmt, op, 4, d2, d3, q1 ++ bilinear_load_mask \mask_fmt, 4, d22 ++ bilinear_load_dst \dst_fmt, \op, 4, d2, d3, q1 + pld [TMP1, PF_OFFS] + vmlsl.u16 q8, d18, d31 + vmlal.u16 q8, d19, d31 + vadd.u16 q12, q12, q13 + vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS) +- bilinear_duplicate_mask mask_fmt, 4, d22 ++ bilinear_duplicate_mask \mask_fmt, 4, d22 + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) + vmovn.u16 d0, q0 + vmovn.u16 d1, q2 + vadd.u16 q12, q12, q13 + bilinear_interleave_src_dst \ +- mask_fmt, op, 4, d0, d1, q0, d2, d3, q1 ++ \mask_fmt, \op, 4, d0, d1, q0, d2, d3, q1 + bilinear_apply_mask_to_src \ +- mask_fmt, 4, d0, d1, q0, d22, \ ++ \mask_fmt, 4, d0, d1, q0, d22, \ + q3, q8, q9, q10 + bilinear_combine \ +- op, 4, d0, d1, q0, d2, d3, q1, \ ++ \op, 4, d0, d1, q0, d2, d3, q1, \ + q3, q8, q9, q10, d23 +- bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0 +- bilinear_store_&dst_fmt 4, q2, q3 ++ bilinear_deinterleave_dst \mask_fmt, \op, 4, d0, d1, q0 ++ bilinear_store_\()\dst_fmt 4, q2, q3 + .endm + + .set BILINEAR_FLAG_USE_MASK, 1 + .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2 + + /* + * Main template macro for generating NEON optimized bilinear scanline functions. 
+ * +@@ -605,24 +605,24 @@ + bilinear_process_four_pixels, \ + bilinear_process_pixblock_head, \ + bilinear_process_pixblock_tail, \ + bilinear_process_pixblock_tail_head, \ + pixblock_size, \ + prefetch_distance, \ + flags + +-pixman_asm_function fname +-.if pixblock_size == 8 +-.elseif pixblock_size == 4 ++pixman_asm_function \fname ++.if \pixblock_size == 8 ++.elseif \pixblock_size == 4 + .else + .error unsupported pixblock size + .endif + +-.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0 ++.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0 + OUT .req r0 + TOP .req r1 + BOTTOM .req r2 + WT .req r3 + WB .req r4 + X .req r5 + UX .req r6 + WIDTH .req ip +@@ -630,17 +630,17 @@ pixman_asm_function fname + TMP2 .req r4 + PF_OFFS .req r7 + TMP3 .req r8 + TMP4 .req r9 + STRIDE .req r2 + + mov ip, sp + push {r4, r5, r6, r7, r8, r9} +- mov PF_OFFS, #prefetch_distance ++ mov PF_OFFS, #\prefetch_distance + ldmia ip, {WB, X, UX, WIDTH} + .else + OUT .req r0 + MASK .req r1 + TOP .req r2 + BOTTOM .req r3 + WT .req r4 + WB .req r5 +@@ -649,27 +649,27 @@ pixman_asm_function fname + WIDTH .req ip + TMP1 .req r4 + TMP2 .req r5 + PF_OFFS .req r8 + TMP3 .req r9 + TMP4 .req r10 + STRIDE .req r3 + +- .set prefetch_offset, prefetch_distance ++ .set prefetch_offset, \prefetch_distance + + mov ip, sp + push {r4, r5, r6, r7, r8, r9, r10, ip} +- mov PF_OFFS, #prefetch_distance ++ mov PF_OFFS, #\prefetch_distance + ldmia ip, {WT, WB, X, UX, WIDTH} + .endif + + mul PF_OFFS, PF_OFFS, UX + +-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 ++.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 + vpush {d8-d15} + .endif + + sub STRIDE, BOTTOM, TOP + .unreq BOTTOM + + cmp WIDTH, #0 + ble 3f +@@ -678,76 +678,76 @@ pixman_asm_function fname + vdup.u16 q13, UX + vdup.u8 d28, WT + vdup.u8 d29, WB + vadd.u16 d25, d25, d26 + + /* ensure good destination alignment */ + cmp WIDTH, #1 + blt 0f +- tst OUT, #(1 << dst_bpp_shift) ++ tst OUT, #(1 << \dst_bpp_shift) + beq 0f + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) + vadd.u16 q12, q12, q13 +- bilinear_process_last_pixel ++ \bilinear_process_last_pixel + sub WIDTH, WIDTH, #1 + 0: + vadd.u16 q13, q13, q13 + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) + vadd.u16 q12, q12, q13 + + cmp WIDTH, #2 + blt 0f +- tst OUT, #(1 << (dst_bpp_shift + 1)) ++ tst OUT, #(1 << (\dst_bpp_shift + 1)) + beq 0f +- bilinear_process_two_pixels ++ \bilinear_process_two_pixels + sub WIDTH, WIDTH, #2 + 0: +-.if pixblock_size == 8 ++.if \pixblock_size == 8 + cmp WIDTH, #4 + blt 0f +- tst OUT, #(1 << (dst_bpp_shift + 2)) ++ tst OUT, #(1 << (\dst_bpp_shift + 2)) + beq 0f +- bilinear_process_four_pixels ++ \bilinear_process_four_pixels + sub WIDTH, WIDTH, #4 + 0: + .endif +- subs WIDTH, WIDTH, #pixblock_size ++ subs WIDTH, WIDTH, #\pixblock_size + blt 1f +- mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift) +- bilinear_process_pixblock_head +- subs WIDTH, WIDTH, #pixblock_size ++ mov PF_OFFS, PF_OFFS, asr #(16 - \src_bpp_shift) ++ \bilinear_process_pixblock_head ++ subs WIDTH, WIDTH, #\pixblock_size + blt 5f + 0: +- bilinear_process_pixblock_tail_head +- subs WIDTH, WIDTH, #pixblock_size ++ \bilinear_process_pixblock_tail_head ++ subs WIDTH, WIDTH, #\pixblock_size + bge 0b + 5: +- bilinear_process_pixblock_tail ++ \bilinear_process_pixblock_tail + 1: +-.if pixblock_size == 8 ++.if \pixblock_size == 8 + tst WIDTH, #4 + beq 2f +- bilinear_process_four_pixels ++ \bilinear_process_four_pixels + 2: + .endif + /* handle the remaining trailing pixels */ + tst WIDTH, #2 + beq 2f +- 
bilinear_process_two_pixels ++ \bilinear_process_two_pixels + 2: + tst WIDTH, #1 + beq 3f +- bilinear_process_last_pixel ++ \bilinear_process_last_pixel + 3: +-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 ++.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 + vpop {d8-d15} + .endif + +-.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0 ++.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0 + pop {r4, r5, r6, r7, r8, r9} + .else + pop {r4, r5, r6, r7, r8, r9, r10, ip} + .endif + bx lr + + .unreq OUT + .unreq TOP +@@ -757,21 +757,23 @@ 3: + .unreq UX + .unreq WIDTH + .unreq TMP1 + .unreq TMP2 + .unreq PF_OFFS + .unreq TMP3 + .unreq TMP4 + .unreq STRIDE +-.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0 ++.if ((\flags) & BILINEAR_FLAG_USE_MASK) != 0 + .unreq MASK + .endif + ++#ifndef __clang__ + .endfunc ++#endif + + .endm + + /* src_8888_8_8888 */ + .macro bilinear_src_8888_8_8888_process_last_pixel + bilinear_interpolate_last_pixel 8888, 8, 8888, src + .endm + +diff --git a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S +--- a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S ++++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S +@@ -29,16 +29,22 @@ + * (those which are exposing some new or interesting features) are + * extensively commented and can be used as examples. + * + * You may want to have a look at the comments for following functions: + * - pixman_composite_over_8888_0565_asm_neon + * - pixman_composite_over_n_8_0565_asm_neon + */ + ++#ifdef __clang__ ++#define ldrgeb ldrbge ++#define subges subsge ++#define subpls subspl ++#endif ++ + /* Prevent the stack from becoming executable for no reason... */ + #if defined(__linux__) && defined(__ELF__) + .section .note.GNU-stack,"",%progbits + #endif + + .text + .fpu neon + .arch armv7a +@@ -255,43 +261,43 @@ + vqadd.u8 d16, d2, d20 + vld1.16 {d4, d5}, [DST_R, :128]! + vqadd.u8 q9, q0, q11 + vshrn.u16 d6, q2, #8 + fetch_src_pixblock + vshrn.u16 d7, q2, #3 + vsli.u16 q2, q2, #5 + vshll.u8 q14, d16, #8 +- PF add PF_X, PF_X, #8 ++ PF add, PF_X, PF_X, #8 + vshll.u8 q8, d19, #8 +- PF tst PF_CTL, #0xF ++ PF tst, PF_CTL, #0xF + vsri.u8 d6, d6, #5 +- PF addne PF_X, PF_X, #8 ++ PF addne, PF_X, PF_X, #8 + vmvn.8 d3, d3 +- PF subne PF_CTL, PF_CTL, #1 ++ PF subne, PF_CTL, PF_CTL, #1 + vsri.u8 d7, d7, #6 + vshrn.u16 d30, q2, #2 + vmull.u8 q10, d3, d6 + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + vmull.u8 q11, d3, d7 + vmull.u8 q12, d3, d30 + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + vsri.u16 q14, q8, #5 +- PF cmp PF_X, ORIG_W ++ PF cmp, PF_X, ORIG_W + vshll.u8 q9, d18, #8 + vrshr.u16 q13, q10, #8 +- PF subge PF_X, PF_X, ORIG_W ++ PF subge, PF_X, PF_X, ORIG_W + vrshr.u16 q3, q11, #8 + vrshr.u16 q15, q12, #8 +- PF subges PF_CTL, PF_CTL, #0x10 ++ PF subges, PF_CTL, PF_CTL, #0x10 + vsri.u16 q14, q9, #11 +- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! ++ PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + vraddhn.u16 d20, q10, q13 + vraddhn.u16 d23, q11, q3 +- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! ++ PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + vraddhn.u16 d22, q12, q15 + vst1.16 {d28, d29}, [DST_W, :128]! + .endm + + #else + + /* If we did not care much about the performance, we would just use this... 
*/ + .macro pixman_composite_over_8888_0565_process_pixblock_tail_head +@@ -429,30 +435,30 @@ generate_composite_function \ + + .macro pixman_composite_src_8888_0565_process_pixblock_tail + vsri.u16 q14, q8, #5 + vsri.u16 q14, q9, #11 + .endm + + .macro pixman_composite_src_8888_0565_process_pixblock_tail_head + vsri.u16 q14, q8, #5 +- PF add PF_X, PF_X, #8 +- PF tst PF_CTL, #0xF ++ PF add, PF_X, PF_X, #8 ++ PF tst, PF_CTL, #0xF + fetch_src_pixblock +- PF addne PF_X, PF_X, #8 +- PF subne PF_CTL, PF_CTL, #1 ++ PF addne, PF_X, PF_X, #8 ++ PF subne, PF_CTL, PF_CTL, #1 + vsri.u16 q14, q9, #11 +- PF cmp PF_X, ORIG_W ++ PF cmp, PF_X, ORIG_W + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + vshll.u8 q8, d1, #8 + vst1.16 {d28, d29}, [DST_W, :128]! +- PF subge PF_X, PF_X, ORIG_W +- PF subges PF_CTL, PF_CTL, #0x10 ++ PF subge, PF_X, PF_X, ORIG_W ++ PF subges, PF_CTL, PF_CTL, #0x10 + vshll.u8 q14, d2, #8 +- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! ++ PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + vshll.u8 q9, d0, #8 + .endm + + generate_composite_function \ + pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ +@@ -504,30 +510,30 @@ generate_composite_function \ + vqadd.u8 q15, q1, q3 + .endm + + .macro pixman_composite_add_8_8_process_pixblock_tail + .endm + + .macro pixman_composite_add_8_8_process_pixblock_tail_head + fetch_src_pixblock +- PF add PF_X, PF_X, #32 +- PF tst PF_CTL, #0xF ++ PF add, PF_X, PF_X, #32 ++ PF tst, PF_CTL, #0xF + vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! +- PF addne PF_X, PF_X, #32 +- PF subne PF_CTL, PF_CTL, #1 ++ PF addne, PF_X, PF_X, #32 ++ PF subne, PF_CTL, PF_CTL, #1 + vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! +- PF cmp PF_X, ORIG_W ++ PF cmp, PF_X, ORIG_W + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] +- PF subge PF_X, PF_X, ORIG_W +- PF subges PF_CTL, PF_CTL, #0x10 ++ PF subge, PF_X, PF_X, ORIG_W ++ PF subges, PF_CTL, PF_CTL, #0x10 + vqadd.u8 q14, q0, q2 +- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! +- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! ++ PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! ++ PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + vqadd.u8 q15, q1, q3 + .endm + + generate_composite_function \ + pixman_composite_add_8_8_asm_neon, 8, 0, 8, \ + FLAG_DST_READWRITE, \ + 32, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ +@@ -536,30 +542,30 @@ generate_composite_function \ + pixman_composite_add_8_8_process_pixblock_head, \ + pixman_composite_add_8_8_process_pixblock_tail, \ + pixman_composite_add_8_8_process_pixblock_tail_head + + /******************************************************************************/ + + .macro pixman_composite_add_8888_8888_process_pixblock_tail_head + fetch_src_pixblock +- PF add PF_X, PF_X, #8 +- PF tst PF_CTL, #0xF ++ PF add, PF_X, PF_X, #8 ++ PF tst, PF_CTL, #0xF + vld1.32 {d4, d5, d6, d7}, [DST_R, :128]! +- PF addne PF_X, PF_X, #8 +- PF subne PF_CTL, PF_CTL, #1 ++ PF addne, PF_X, PF_X, #8 ++ PF subne, PF_CTL, PF_CTL, #1 + vst1.32 {d28, d29, d30, d31}, [DST_W, :128]! 
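/*
 * The "PF op, args" rewrites below are the other recurring change: PF is a
 * helper macro (from pixman-arm-neon-asm.h) that only emits its arguments
 * when advanced prefetch is enabled, presumably declared along the lines of
 * ".macro PF a, x:vararg".  gas splits "PF add PF_X, PF_X, #8" into the
 * mnemonic and the operand list at the first blank, while clang's integrated
 * assembler needs an explicit comma to separate the macro arguments:
 *
 *     PF add PF_X, PF_X, #8        @ accepted by gas only
 *     PF add, PF_X, PF_X, #8       @ accepted by both gas and clang
 */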
+- PF cmp PF_X, ORIG_W ++ PF cmp, PF_X, ORIG_W + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] +- PF subge PF_X, PF_X, ORIG_W +- PF subges PF_CTL, PF_CTL, #0x10 ++ PF subge, PF_X, PF_X, ORIG_W ++ PF subges, PF_CTL, PF_CTL, #0x10 + vqadd.u8 q14, q0, q2 +- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! +- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! ++ PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! ++ PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + vqadd.u8 q15, q1, q3 + .endm + + generate_composite_function \ + pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ +@@ -599,40 +605,40 @@ generate_composite_function_single_scanl + vraddhn.u16 d29, q15, q9 + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 + .endm + + .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + vrshr.u16 q14, q8, #8 +- PF add PF_X, PF_X, #8 +- PF tst PF_CTL, #0xF ++ PF add, PF_X, PF_X, #8 ++ PF tst, PF_CTL, #0xF + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 +- PF addne PF_X, PF_X, #8 +- PF subne PF_CTL, PF_CTL, #1 ++ PF addne, PF_X, PF_X, #8 ++ PF subne, PF_CTL, PF_CTL, #1 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 +- PF cmp PF_X, ORIG_W ++ PF cmp, PF_X, ORIG_W + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 + fetch_src_pixblock + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + vmvn.8 d22, d3 + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! +- PF subge PF_X, PF_X, ORIG_W ++ PF subge, PF_X, PF_X, ORIG_W + vmull.u8 q8, d22, d4 +- PF subges PF_CTL, PF_CTL, #0x10 ++ PF subsge, PF_CTL, PF_CTL, #0x10 + vmull.u8 q9, d22, d5 +- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! ++ PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + vmull.u8 q10, d22, d6 +- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! ++ PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + vmull.u8 q11, d22, d7 + .endm + + generate_composite_function_single_scanline \ + pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init, \ +@@ -651,42 +657,42 @@ generate_composite_function_single_scanl + pixman_composite_out_reverse_8888_8888_process_pixblock_tail + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 + .endm + + .macro pixman_composite_over_8888_8888_process_pixblock_tail_head + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + vrshr.u16 q14, q8, #8 +- PF add PF_X, PF_X, #8 +- PF tst PF_CTL, #0xF ++ PF add, PF_X, PF_X, #8 ++ PF tst, PF_CTL, #0xF + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 +- PF addne PF_X, PF_X, #8 +- PF subne PF_CTL, PF_CTL, #1 ++ PF addne, PF_X, PF_X, #8 ++ PF subne, PF_CTL, PF_CTL, #1 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 +- PF cmp PF_X, ORIG_W ++ PF cmp, PF_X, ORIG_W + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 + fetch_src_pixblock + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + vmvn.8 d22, d3 + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
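/*
 * The prefetch code also uses pre-unified-syntax mnemonics such as "ldrgeb"
 * and "subges", where the condition code is written before the size/flags
 * suffix.  gas accepts that ordering, but clang's integrated assembler only
 * accepts the unified-syntax order ("ldrbge", "subsge", "subspl").  Rather
 * than touch every use, the patch adds the "#ifdef __clang__" #define shims
 * near the top of this file mapping the old spellings to the new ones (the
 * .S files go through the C preprocessor).  Register choices below are
 * illustrative only:
 *
 *     ldrgeb r0, [r1]      @ divided-syntax order, gas only
 *     ldrbge r0, [r1]      @ unified-syntax order, accepted by clang as well
 *
 * For the same reason the ".endfunc" directives elsewhere in this patch get
 * wrapped in "#ifndef __clang__": clang's integrated assembler does not
 * recognize the .func/.endfunc pair.
 */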
+- PF subge PF_X, PF_X, ORIG_W ++ PF subge, PF_X, PF_X, ORIG_W + vmull.u8 q8, d22, d4 +- PF subges PF_CTL, PF_CTL, #0x10 ++ PF subges, PF_CTL, PF_CTL, #0x10 + vmull.u8 q9, d22, d5 +- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! ++ PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + vmull.u8 q10, d22, d6 +- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! ++ PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + vmull.u8 q11, d22, d7 + .endm + + generate_composite_function \ + pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ +@@ -737,30 +743,30 @@ generate_composite_function_single_scanl + vrshr.u16 q2, q10, #8 + vrshr.u16 q3, q11, #8 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + vraddhn.u16 d30, q2, q10 + vraddhn.u16 d31, q3, q11 + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + vqadd.u8 q14, q0, q14 +- PF add PF_X, PF_X, #8 +- PF tst PF_CTL, #0x0F +- PF addne PF_X, PF_X, #8 +- PF subne PF_CTL, PF_CTL, #1 ++ PF add, PF_X, PF_X, #8 ++ PF tst, PF_CTL, #0x0F ++ PF addne, PF_X, PF_X, #8 ++ PF subne, PF_CTL, PF_CTL, #1 + vqadd.u8 q15, q1, q15 +- PF cmp PF_X, ORIG_W ++ PF cmp, PF_X, ORIG_W + vmull.u8 q8, d24, d4 + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + vmull.u8 q9, d24, d5 +- PF subge PF_X, PF_X, ORIG_W ++ PF subge, PF_X, PF_X, ORIG_W + vmull.u8 q10, d24, d6 +- PF subges PF_CTL, PF_CTL, #0x10 ++ PF subges, PF_CTL, PF_CTL, #0x10 + vmull.u8 q11, d24, d7 +- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! ++ PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + .endm + + .macro pixman_composite_over_n_8888_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d3[0]}, [DUMMY] + vdup.8 d0, d3[0] + vdup.8 d1, d3[1] +@@ -779,40 +785,40 @@ generate_composite_function \ + pixman_composite_over_8888_8888_process_pixblock_head, \ + pixman_composite_over_8888_8888_process_pixblock_tail, \ + pixman_composite_over_n_8888_process_pixblock_tail_head + + /******************************************************************************/ + + .macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head + vrshr.u16 q14, q8, #8 +- PF add PF_X, PF_X, #8 +- PF tst PF_CTL, #0xF ++ PF add, PF_X, PF_X, #8 ++ PF tst, PF_CTL, #0xF + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 +- PF addne PF_X, PF_X, #8 +- PF subne PF_CTL, PF_CTL, #1 ++ PF addne, PF_X, PF_X, #8 ++ PF subne, PF_CTL, PF_CTL, #1 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 +- PF cmp PF_X, ORIG_W ++ PF cmp, PF_X, ORIG_W + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 + vld4.8 {d0, d1, d2, d3}, [DST_R, :128]! + vmvn.8 d22, d3 + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! +- PF subge PF_X, PF_X, ORIG_W ++ PF subge, PF_X, PF_X, ORIG_W + vmull.u8 q8, d22, d4 +- PF subges PF_CTL, PF_CTL, #0x10 ++ PF subges, PF_CTL, PF_CTL, #0x10 + vmull.u8 q9, d22, d5 + vmull.u8 q10, d22, d6 +- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! ++ PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 
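/*
 * A note on the "PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!"
 * line just above: when the prefetch x position has run past ORIG_W (the "ge"
 * condition set by the earlier "PF cmp, PF_X, ORIG_W"), the byte load with
 * pre-indexed writeback both advances PF_DST by one scanline and touches the
 * new line's memory.  Conceptually it behaves roughly like
 *
 *     addge PF_DST, PF_DST, DST_STRIDE, lsl #dst_bpp_shift   @ step to next scanline
 *
 * followed by a prefetch of the new address; a single conditional load is
 * used because PLD itself cannot be conditionally executed.
 */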
+ vmull.u8 q11, d22, d7 + .endm + + .macro pixman_composite_over_reverse_n_8888_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d7[0]}, [DUMMY] + vdup.8 d4, d7[0] + vdup.8 d5, d7[1] +@@ -1240,33 +1246,33 @@ generate_composite_function \ + vrshrn.u16 d28, q8, #8 + vrshrn.u16 d29, q9, #8 + vrshrn.u16 d30, q10, #8 + vrshrn.u16 d31, q11, #8 + .endm + + .macro pixman_composite_src_n_8_8888_process_pixblock_tail_head + fetch_mask_pixblock +- PF add PF_X, PF_X, #8 ++ PF add, PF_X, PF_X, #8 + vrshrn.u16 d28, q8, #8 +- PF tst PF_CTL, #0x0F ++ PF tst, PF_CTL, #0x0F + vrshrn.u16 d29, q9, #8 +- PF addne PF_X, PF_X, #8 ++ PF addne, PF_X, PF_X, #8 + vrshrn.u16 d30, q10, #8 +- PF subne PF_CTL, PF_CTL, #1 ++ PF subne, PF_CTL, PF_CTL, #1 + vrshrn.u16 d31, q11, #8 +- PF cmp PF_X, ORIG_W ++ PF cmp, PF_X, ORIG_W + vmull.u8 q8, d24, d0 + PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] + vmull.u8 q9, d24, d1 +- PF subge PF_X, PF_X, ORIG_W ++ PF subge, PF_X, PF_X, ORIG_W + vmull.u8 q10, d24, d2 +- PF subges PF_CTL, PF_CTL, #0x10 ++ PF subges, PF_CTL, PF_CTL, #0x10 + vmull.u8 q11, d24, d3 +- PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! ++ PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + vrsra.u16 q8, q8, #8 + vrsra.u16 q9, q9, #8 + vrsra.u16 q10, q10, #8 + vrsra.u16 q11, q11, #8 + .endm + + .macro pixman_composite_src_n_8_8888_init +@@ -1309,33 +1315,33 @@ generate_composite_function \ + vrshrn.u16 d28, q0, #8 + vrshrn.u16 d29, q1, #8 + vrshrn.u16 d30, q2, #8 + vrshrn.u16 d31, q3, #8 + .endm + + .macro pixman_composite_src_n_8_8_process_pixblock_tail_head + fetch_mask_pixblock +- PF add PF_X, PF_X, #8 ++ PF add, PF_X, PF_X, #8 + vrshrn.u16 d28, q0, #8 +- PF tst PF_CTL, #0x0F ++ PF tst, PF_CTL, #0x0F + vrshrn.u16 d29, q1, #8 +- PF addne PF_X, PF_X, #8 ++ PF addne, PF_X, PF_X, #8 + vrshrn.u16 d30, q2, #8 +- PF subne PF_CTL, PF_CTL, #1 ++ PF subne, PF_CTL, PF_CTL, #1 + vrshrn.u16 d31, q3, #8 +- PF cmp PF_X, ORIG_W ++ PF cmp, PF_X, ORIG_W + vmull.u8 q0, d24, d16 + PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] + vmull.u8 q1, d25, d16 +- PF subge PF_X, PF_X, ORIG_W ++ PF subge, PF_X, PF_X, ORIG_W + vmull.u8 q2, d26, d16 +- PF subges PF_CTL, PF_CTL, #0x10 ++ PF subges, PF_CTL, PF_CTL, #0x10 + vmull.u8 q3, d27, d16 +- PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! ++ PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! + vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! + vrsra.u16 q0, q0, #8 + vrsra.u16 q1, q1, #8 + vrsra.u16 q2, q2, #8 + vrsra.u16 q3, q3, #8 + .endm + + .macro pixman_composite_src_n_8_8_init +@@ -1403,37 +1409,37 @@ generate_composite_function \ + .endm + + .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head + vrshr.u16 q14, q8, #8 + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 
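/*
 * The vrshr.u16/vraddhn.u16 pairs in these blocks are the usual NEON idiom
 * for scaling the 16-bit product of two 8-bit channels back to 8 bits, i.e.
 * an approximate division by 255 with rounding.  For a product t:
 *
 *     vrshr.u16   q14, q8, #8      @ q14 = (t + 128) >> 8
 *     vraddhn.u16 d28, q14, q8     @ d28 = (t + q14 + 128) >> 8  ~= t / 255
 *
 * (register choices mirror the surrounding code; the formula is the standard
 * (t + 128 + ((t + 128) >> 8)) >> 8 rounding approximation).
 */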
+ vrshr.u16 q15, q9, #8 + fetch_mask_pixblock + vrshr.u16 q6, q10, #8 +- PF add PF_X, PF_X, #8 ++ PF add, PF_X, PF_X, #8 + vrshr.u16 q7, q11, #8 +- PF tst PF_CTL, #0x0F ++ PF tst, PF_CTL, #0x0F + vraddhn.u16 d28, q14, q8 +- PF addne PF_X, PF_X, #8 ++ PF addne, PF_X, PF_X, #8 + vraddhn.u16 d29, q15, q9 +- PF subne PF_CTL, PF_CTL, #1 ++ PF subne, PF_CTL, PF_CTL, #1 + vraddhn.u16 d30, q6, q10 +- PF cmp PF_X, ORIG_W ++ PF cmp, PF_X, ORIG_W + vraddhn.u16 d31, q7, q11 + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + vmull.u8 q6, d24, d8 + PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] + vmull.u8 q7, d24, d9 +- PF subge PF_X, PF_X, ORIG_W ++ PF subge, PF_X, PF_X, ORIG_W + vmull.u8 q8, d24, d10 +- PF subges PF_CTL, PF_CTL, #0x10 ++ PF subges, PF_CTL, PF_CTL, #0x10 + vmull.u8 q9, d24, d11 +- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! ++ PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + vqadd.u8 q14, q0, q14 +- PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! ++ PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! + vqadd.u8 q15, q1, q15 + vrshr.u16 q10, q6, #8 + vrshr.u16 q11, q7, #8 + vrshr.u16 q12, q8, #8 + vrshr.u16 q13, q9, #8 + vraddhn.u16 d0, q6, q10 + vraddhn.u16 d1, q7, q11 + vraddhn.u16 d2, q8, q12 +@@ -2420,31 +2426,31 @@ generate_composite_function \ + + .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head + vrshr.u16 q11, q8, #8 + vswp d3, d31 + vrshr.u16 q12, q9, #8 + vrshr.u16 q13, q10, #8 + fetch_src_pixblock + vraddhn.u16 d30, q11, q8 +- PF add PF_X, PF_X, #8 +- PF tst PF_CTL, #0xF +- PF addne PF_X, PF_X, #8 +- PF subne PF_CTL, PF_CTL, #1 ++ PF add, PF_X, PF_X, #8 ++ PF tst, PF_CTL, #0xF ++ PF addne, PF_X, PF_X, #8 ++ PF subne, PF_CTL, PF_CTL, #1 + vraddhn.u16 d29, q12, q9 + vraddhn.u16 d28, q13, q10 + vmull.u8 q8, d3, d0 + vmull.u8 q9, d3, d1 + vmull.u8 q10, d3, d2 + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! +- PF cmp PF_X, ORIG_W ++ PF cmp, PF_X, ORIG_W + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] +- PF subge PF_X, PF_X, ORIG_W +- PF subges PF_CTL, PF_CTL, #0x10 +- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! ++ PF subge, PF_X, PF_X, ORIG_W ++ PF subges, PF_CTL, PF_CTL, #0x10 ++ PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + .endm + + generate_composite_function \ + pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ +@@ -2477,31 +2483,31 @@ generate_composite_function \ + + .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head + vrshr.u16 q11, q8, #8 + vswp d3, d31 + vrshr.u16 q12, q9, #8 + vrshr.u16 q13, q10, #8 + fetch_src_pixblock + vraddhn.u16 d28, q11, q8 +- PF add PF_X, PF_X, #8 +- PF tst PF_CTL, #0xF +- PF addne PF_X, PF_X, #8 +- PF subne PF_CTL, PF_CTL, #1 ++ PF add, PF_X, PF_X, #8 ++ PF tst, PF_CTL, #0xF ++ PF addne, PF_X, PF_X, #8 ++ PF subne, PF_CTL, PF_CTL, #1 + vraddhn.u16 d29, q12, q9 + vraddhn.u16 d30, q13, q10 + vmull.u8 q8, d3, d0 + vmull.u8 q9, d3, d1 + vmull.u8 q10, d3, d2 + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! +- PF cmp PF_X, ORIG_W ++ PF cmp, PF_X, ORIG_W + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] +- PF subge PF_X, PF_X, ORIG_W +- PF subges PF_CTL, PF_CTL, #0x10 +- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! ++ PF subge, PF_X, PF_X, ORIG_W ++ PF subges, PF_CTL, PF_CTL, #0x10 ++ PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 
+ .endm + + generate_composite_function \ + pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ +@@ -2836,182 +2842,182 @@ generate_composite_function_nearest_scan + * format conversion, and interpolation as separate macros which can be used + * as the basic building blocks for constructing bilinear scanline functions. + */ + + .macro bilinear_load_8888 reg1, reg2, tmp + mov TMP1, X, asr #16 + add X, X, UX + add TMP1, TOP, TMP1, asl #2 +- vld1.32 {reg1}, [TMP1], STRIDE +- vld1.32 {reg2}, [TMP1] ++ vld1.32 {\reg1}, [TMP1], STRIDE ++ vld1.32 {\reg2}, [TMP1] + .endm + + .macro bilinear_load_0565 reg1, reg2, tmp + mov TMP1, X, asr #16 + add X, X, UX + add TMP1, TOP, TMP1, asl #1 +- vld1.32 {reg2[0]}, [TMP1], STRIDE +- vld1.32 {reg2[1]}, [TMP1] +- convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp ++ vld1.32 {\reg2[0]}, [TMP1], STRIDE ++ vld1.32 {\reg2[1]}, [TMP1] ++ convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp + .endm + + .macro bilinear_load_and_vertical_interpolate_two_8888 \ + acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2 + +- bilinear_load_8888 reg1, reg2, tmp1 +- vmull.u8 acc1, reg1, d28 +- vmlal.u8 acc1, reg2, d29 +- bilinear_load_8888 reg3, reg4, tmp2 +- vmull.u8 acc2, reg3, d28 +- vmlal.u8 acc2, reg4, d29 ++ bilinear_load_8888 \reg1, \reg2, \tmp1 ++ vmull.u8 \acc1, \reg1, d28 ++ vmlal.u8 \acc1, \reg2, d29 ++ bilinear_load_8888 \reg3, \reg4, \tmp2 ++ vmull.u8 \acc2, \reg3, d28 ++ vmlal.u8 \acc2, \reg4, d29 + .endm + + .macro bilinear_load_and_vertical_interpolate_four_8888 \ + xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ + yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi + + bilinear_load_and_vertical_interpolate_two_8888 \ +- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi ++ \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi + bilinear_load_and_vertical_interpolate_two_8888 \ +- yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi ++ \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi + .endm + + .macro bilinear_load_and_vertical_interpolate_two_0565 \ + acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi + + mov TMP1, X, asr #16 + add X, X, UX + add TMP1, TOP, TMP1, asl #1 + mov TMP2, X, asr #16 + add X, X, UX + add TMP2, TOP, TMP2, asl #1 +- vld1.32 {acc2lo[0]}, [TMP1], STRIDE +- vld1.32 {acc2hi[0]}, [TMP2], STRIDE +- vld1.32 {acc2lo[1]}, [TMP1] +- vld1.32 {acc2hi[1]}, [TMP2] +- convert_0565_to_x888 acc2, reg3, reg2, reg1 +- vzip.u8 reg1, reg3 +- vzip.u8 reg2, reg4 +- vzip.u8 reg3, reg4 +- vzip.u8 reg1, reg2 +- vmull.u8 acc1, reg1, d28 +- vmlal.u8 acc1, reg2, d29 +- vmull.u8 acc2, reg3, d28 +- vmlal.u8 acc2, reg4, d29 ++ vld1.32 {\acc2lo[0]}, [TMP1], STRIDE ++ vld1.32 {\acc2hi[0]}, [TMP2], STRIDE ++ vld1.32 {\acc2lo[1]}, [TMP1] ++ vld1.32 {\acc2hi[1]}, [TMP2] ++ convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1 ++ vzip.u8 \reg1, \reg3 ++ vzip.u8 \reg2, \reg4 ++ vzip.u8 \reg3, \reg4 ++ vzip.u8 \reg1, \reg2 ++ vmull.u8 \acc1, \reg1, d28 ++ vmlal.u8 \acc1, \reg2, d29 ++ vmull.u8 \acc2, \reg3, d28 ++ vmlal.u8 \acc2, \reg4, d29 + .endm + + .macro bilinear_load_and_vertical_interpolate_four_0565 \ + xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ + yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi + + mov TMP1, X, asr #16 + add X, X, UX + add TMP1, TOP, TMP1, asl #1 + mov TMP2, X, asr #16 + add X, X, UX + add TMP2, 
TOP, TMP2, asl #1 +- vld1.32 {xacc2lo[0]}, [TMP1], STRIDE +- vld1.32 {xacc2hi[0]}, [TMP2], STRIDE +- vld1.32 {xacc2lo[1]}, [TMP1] +- vld1.32 {xacc2hi[1]}, [TMP2] +- convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1 ++ vld1.32 {\xacc2lo[0]}, [TMP1], STRIDE ++ vld1.32 {\xacc2hi[0]}, [TMP2], STRIDE ++ vld1.32 {\xacc2lo[1]}, [TMP1] ++ vld1.32 {\xacc2hi[1]}, [TMP2] ++ convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1 + mov TMP1, X, asr #16 + add X, X, UX + add TMP1, TOP, TMP1, asl #1 + mov TMP2, X, asr #16 + add X, X, UX + add TMP2, TOP, TMP2, asl #1 +- vld1.32 {yacc2lo[0]}, [TMP1], STRIDE +- vzip.u8 xreg1, xreg3 +- vld1.32 {yacc2hi[0]}, [TMP2], STRIDE +- vzip.u8 xreg2, xreg4 +- vld1.32 {yacc2lo[1]}, [TMP1] +- vzip.u8 xreg3, xreg4 +- vld1.32 {yacc2hi[1]}, [TMP2] +- vzip.u8 xreg1, xreg2 +- convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1 +- vmull.u8 xacc1, xreg1, d28 +- vzip.u8 yreg1, yreg3 +- vmlal.u8 xacc1, xreg2, d29 +- vzip.u8 yreg2, yreg4 +- vmull.u8 xacc2, xreg3, d28 +- vzip.u8 yreg3, yreg4 +- vmlal.u8 xacc2, xreg4, d29 +- vzip.u8 yreg1, yreg2 +- vmull.u8 yacc1, yreg1, d28 +- vmlal.u8 yacc1, yreg2, d29 +- vmull.u8 yacc2, yreg3, d28 +- vmlal.u8 yacc2, yreg4, d29 ++ vld1.32 {\yacc2lo[0]}, [TMP1], STRIDE ++ vzip.u8 \xreg1, \xreg3 ++ vld1.32 {\yacc2hi[0]}, [TMP2], STRIDE ++ vzip.u8 \xreg2, \xreg4 ++ vld1.32 {\yacc2lo[1]}, [TMP1] ++ vzip.u8 \xreg3, \xreg4 ++ vld1.32 {\yacc2hi[1]}, [TMP2] ++ vzip.u8 \xreg1, \xreg2 ++ convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1 ++ vmull.u8 \xacc1, \xreg1, d28 ++ vzip.u8 \yreg1, \yreg3 ++ vmlal.u8 \xacc1, \xreg2, d29 ++ vzip.u8 \yreg2, \yreg4 ++ vmull.u8 \xacc2, \xreg3, d28 ++ vzip.u8 \yreg3, \yreg4 ++ vmlal.u8 \xacc2, \xreg4, d29 ++ vzip.u8 \yreg1, \yreg2 ++ vmull.u8 \yacc1, \yreg1, d28 ++ vmlal.u8 \yacc1, \yreg2, d29 ++ vmull.u8 \yacc2, \yreg3, d28 ++ vmlal.u8 \yacc2, \yreg4, d29 + .endm + + .macro bilinear_store_8888 numpix, tmp1, tmp2 +-.if numpix == 4 ++.if \numpix == 4 + vst1.32 {d0, d1}, [OUT, :128]! +-.elseif numpix == 2 ++.elseif \numpix == 2 + vst1.32 {d0}, [OUT, :64]! +-.elseif numpix == 1 ++.elseif \numpix == 1 + vst1.32 {d0[0]}, [OUT, :32]! + .else +- .error bilinear_store_8888 numpix is unsupported ++ .error bilinear_store_8888 \numpix is unsupported + .endif + .endm + + .macro bilinear_store_0565 numpix, tmp1, tmp2 + vuzp.u8 d0, d1 + vuzp.u8 d2, d3 + vuzp.u8 d1, d3 + vuzp.u8 d0, d2 +- convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2 +-.if numpix == 4 ++ convert_8888_to_0565 d2, d1, d0, q1, \tmp1, \tmp2 ++.if \numpix == 4 + vst1.16 {d2}, [OUT, :64]! +-.elseif numpix == 2 ++.elseif \numpix == 2 + vst1.32 {d2[0]}, [OUT, :32]! +-.elseif numpix == 1 ++.elseif \numpix == 1 + vst1.16 {d2[0]}, [OUT, :16]! 
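/*
 * Besides the plain "\arg" substitutions, macro names that are pieced
 * together from a prefix and a parameter switch from "&" concatenation to
 * the "\()" separator, which is the form clang's integrated assembler
 * understands; e.g. in bilinear_interpolate_last_pixel just below:
 *
 *     bilinear_load_&src_fmt d0, d1, d2       @ old, gas-only concatenation
 *     bilinear_load_\()\src_fmt d0, d1, d2    @ new, portable form
 *
 * With src_fmt=8888 both spellings expand to "bilinear_load_8888 d0, d1, d2".
 */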
+ .else +- .error bilinear_store_0565 numpix is unsupported ++ .error bilinear_store_0565 \numpix is unsupported + .endif + .endm + + .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt +- bilinear_load_&src_fmt d0, d1, d2 ++ bilinear_load_\()\src_fmt d0, d1, d2 + vmull.u8 q1, d0, d28 + vmlal.u8 q1, d1, d29 + /* 5 cycles bubble */ + vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS + vmlsl.u16 q0, d2, d30 + vmlal.u16 q0, d3, d30 + /* 5 cycles bubble */ + vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) + /* 3 cycles bubble */ + vmovn.u16 d0, q0 + /* 1 cycle bubble */ +- bilinear_store_&dst_fmt 1, q2, q3 ++ bilinear_store_\()\dst_fmt 1, q2, q3 + .endm + + .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt +- bilinear_load_and_vertical_interpolate_two_&src_fmt \ ++ bilinear_load_and_vertical_interpolate_two_\()\src_fmt \ + q1, q11, d0, d1, d20, d21, d22, d23 + vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS + vmlsl.u16 q0, d2, d30 + vmlal.u16 q0, d3, d30 + vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS + vmlsl.u16 q10, d22, d31 + vmlal.u16 q10, d23, d31 + vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) + vadd.u16 q12, q12, q13 + vmovn.u16 d0, q0 +- bilinear_store_&dst_fmt 2, q2, q3 ++ bilinear_store_\()\dst_fmt 2, q2, q3 + .endm + + .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt +- bilinear_load_and_vertical_interpolate_four_&src_fmt \ ++ bilinear_load_and_vertical_interpolate_four_\()\src_fmt \ + q1, q11, d0, d1, d20, d21, d22, d23 \ + q3, q9, d4, d5, d16, d17, d18, d19 + pld [TMP1, PF_OFFS] + sub TMP1, TMP1, STRIDE + vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS + vmlsl.u16 q0, d2, d30 + vmlal.u16 q0, d3, d30 + vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS +@@ -3029,64 +3035,64 @@ generate_composite_function_nearest_scan + vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) + vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS) + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) + vmovn.u16 d0, q0 + vmovn.u16 d1, q2 + vadd.u16 q12, q12, q13 +- bilinear_store_&dst_fmt 4, q2, q3 ++ bilinear_store_\()\dst_fmt 4, q2, q3 + .endm + + .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt +-.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt +- bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head ++.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt ++ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_head + .else +- bilinear_interpolate_four_pixels src_fmt, dst_fmt ++ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt + .endif + .endm + + .macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt +-.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt +- bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail ++.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt ++ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail + .endif + .endm + + .macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt +-.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt +- bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head ++.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt ++ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head + .else +- bilinear_interpolate_four_pixels src_fmt, 
dst_fmt ++ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt + .endif + .endm + + .macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt +-.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt +- bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head ++.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt ++ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_head + .else +- bilinear_interpolate_four_pixels_head src_fmt, dst_fmt +- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt ++ bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt ++ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt + .endif + .endm + + .macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt +-.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt +- bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail ++.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt ++ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail + .else +- bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt ++ bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt + .endif + .endm + + .macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt +-.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt +- bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head ++.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt ++ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head + .else +- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt +- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt ++ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt ++ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt + .endif + .endm + + .set BILINEAR_FLAG_UNROLL_4, 0 + .set BILINEAR_FLAG_UNROLL_8, 1 + .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2 + + /* +@@ -3101,17 +3107,17 @@ generate_composite_function_nearest_scan + * prefetch_distance - prefetch in the source image by that many + * pixels ahead + */ + + .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \ + src_bpp_shift, dst_bpp_shift, \ + prefetch_distance, flags + +-pixman_asm_function fname ++pixman_asm_function \fname + OUT .req r0 + TOP .req r1 + BOTTOM .req r2 + WT .req r3 + WB .req r4 + X .req r5 + UX .req r6 + WIDTH .req ip +@@ -3119,21 +3125,21 @@ pixman_asm_function fname + TMP2 .req r4 + PF_OFFS .req r7 + TMP3 .req r8 + TMP4 .req r9 + STRIDE .req r2 + + mov ip, sp + push {r4, r5, r6, r7, r8, r9} +- mov PF_OFFS, #prefetch_distance ++ mov PF_OFFS, #\prefetch_distance + ldmia ip, {WB, X, UX, WIDTH} + mul PF_OFFS, PF_OFFS, UX + +-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 ++.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 + vpush {d8-d15} + .endif + + sub STRIDE, BOTTOM, TOP + .unreq BOTTOM + + cmp WIDTH, #0 + ble 3f +@@ -3146,83 +3152,83 @@ pixman_asm_function fname + + /* ensure good destination alignment */ + cmp WIDTH, #1 + blt 0f + tst OUT, #(1 << dst_bpp_shift) + beq 0f + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) + vadd.u16 q12, q12, q13 +- bilinear_interpolate_last_pixel src_fmt, dst_fmt ++ bilinear_interpolate_last_pixel \src_fmt, \dst_fmt + sub WIDTH, WIDTH, #1 + 0: + vadd.u16 q13, q13, q13 + vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) + vadd.u16 q12, q12, q13 + + cmp WIDTH, #2 + blt 0f + tst OUT, #(1 << (dst_bpp_shift + 1)) + beq 0f +- bilinear_interpolate_two_pixels src_fmt, dst_fmt ++ 
bilinear_interpolate_two_pixels \src_fmt, \dst_fmt + sub WIDTH, WIDTH, #2 + 0: +-.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0 ++.if ((\flags) & BILINEAR_FLAG_UNROLL_8) != 0 + /*********** 8 pixels per iteration *****************/ + cmp WIDTH, #4 + blt 0f + tst OUT, #(1 << (dst_bpp_shift + 2)) + beq 0f +- bilinear_interpolate_four_pixels src_fmt, dst_fmt ++ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt + sub WIDTH, WIDTH, #4 + 0: + subs WIDTH, WIDTH, #8 + blt 1f + mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift) +- bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt ++ bilinear_interpolate_eight_pixels_head \src_fmt, \dst_fmt + subs WIDTH, WIDTH, #8 + blt 5f + 0: +- bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt ++ bilinear_interpolate_eight_pixels_tail_head \src_fmt, \dst_fmt + subs WIDTH, WIDTH, #8 + bge 0b + 5: +- bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt ++ bilinear_interpolate_eight_pixels_tail \src_fmt, \dst_fmt + 1: + tst WIDTH, #4 + beq 2f +- bilinear_interpolate_four_pixels src_fmt, dst_fmt ++ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt + 2: + .else + /*********** 4 pixels per iteration *****************/ + subs WIDTH, WIDTH, #4 + blt 1f + mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift) +- bilinear_interpolate_four_pixels_head src_fmt, dst_fmt ++ bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt + subs WIDTH, WIDTH, #4 + blt 5f + 0: +- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt ++ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt + subs WIDTH, WIDTH, #4 + bge 0b + 5: +- bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt ++ bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt + 1: + /****************************************************/ + .endif + /* handle the remaining trailing pixels */ + tst WIDTH, #2 + beq 2f +- bilinear_interpolate_two_pixels src_fmt, dst_fmt ++ bilinear_interpolate_two_pixels \src_fmt, \dst_fmt + 2: + tst WIDTH, #1 + beq 3f +- bilinear_interpolate_last_pixel src_fmt, dst_fmt ++ bilinear_interpolate_last_pixel \src_fmt, \dst_fmt + 3: +-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 ++.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 + vpop {d8-d15} + .endif + pop {r4, r5, r6, r7, r8, r9} + bx lr + + .unreq OUT + .unreq TOP + .unreq WT +@@ -3231,17 +3237,19 @@ 3: + .unreq UX + .unreq WIDTH + .unreq TMP1 + .unreq TMP2 + .unreq PF_OFFS + .unreq TMP3 + .unreq TMP4 + .unreq STRIDE ++#ifndef __clang__ + .endfunc ++#endif + + .endm + + /*****************************************************************************/ + + .set have_bilinear_interpolate_four_pixels_8888_8888, 1 + + .macro bilinear_interpolate_four_pixels_8888_8888_head +diff --git a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h +--- a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h ++++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h +@@ -69,303 +69,303 @@ + .set PREFETCH_TYPE_ADVANCED, 2 /* Advanced fine-grained prefetch */ + + /* + * Definitions of supplementary pixld/pixst macros (for partial load/store of + * pixel data). + */ + + .macro pixldst1 op, elem_size, reg1, mem_operand, abits +-.if abits > 0 +- op&.&elem_size {d®1}, [&mem_operand&, :&abits&]! ++.if \abits > 0 ++ \op\().\()\elem_size {d\()\reg1}, [\()\mem_operand\(), :\()\abits\()]! + .else +- op&.&elem_size {d®1}, [&mem_operand&]! ++ \op\().\()\elem_size {d\()\reg1}, [\()\mem_operand\()]! 
+ .endif + .endm + + .macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits +-.if abits > 0 +- op&.&elem_size {d®1, d®2}, [&mem_operand&, :&abits&]! ++.if \abits > 0 ++ \op\().\()\elem_size {d\()\reg1, d\()\reg2}, [\()\mem_operand\(), :\()\abits\()]! + .else +- op&.&elem_size {d®1, d®2}, [&mem_operand&]! ++ \op\().\()\elem_size {d\()\reg1, d\()\reg2}, [\()\mem_operand\()]! + .endif + .endm + + .macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits +-.if abits > 0 +- op&.&elem_size {d®1, d®2, d®3, d®4}, [&mem_operand&, :&abits&]! ++.if \abits > 0 ++ \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3, d\()\reg4}, [\()\mem_operand\(), :\()\abits\()]! + .else +- op&.&elem_size {d®1, d®2, d®3, d®4}, [&mem_operand&]! ++ \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3, d\()\reg4}, [\()\mem_operand\()]! + .endif + .endm + + .macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits +- op&.&elem_size {d®1[idx]}, [&mem_operand&]! ++ \op\().\()\elem_size {d\()\reg1[\idx]}, [\()\mem_operand\()]! + .endm + + .macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand +- op&.&elem_size {d®1, d®2, d®3}, [&mem_operand&]! ++ \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3}, [\()\mem_operand\()]! + .endm + + .macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand +- op&.&elem_size {d®1[idx], d®2[idx], d®3[idx]}, [&mem_operand&]! ++ \op\().\()\elem_size {d\()\reg1[\idx], d\()\reg2[\idx], d\()\reg3[\idx]}, [\()\mem_operand\()]! + .endm + + .macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits +-.if numbytes == 32 +- pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \ +- %(basereg+6), %(basereg+7), mem_operand, abits +-.elseif numbytes == 16 +- pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits +-.elseif numbytes == 8 +- pixldst1 op, elem_size, %(basereg+1), mem_operand, abits +-.elseif numbytes == 4 +- .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32) +- pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits +- .elseif elem_size == 16 +- pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits +- pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits ++.if \numbytes == 32 ++ pixldst4 \op, \elem_size, %(\basereg+4), %(\basereg+5), \ ++ %(\basereg+6), %(\basereg+7), \mem_operand, \abits ++.elseif \numbytes == 16 ++ pixldst2 \op, \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand, \abits ++.elseif \numbytes == 8 ++ pixldst1 \op, \elem_size, %(\basereg+1), \mem_operand, \abits ++.elseif \numbytes == 4 ++ .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 32) ++ pixldst0 \op, 32, %(\basereg+0), 1, \mem_operand, \abits ++ .elseif \elem_size == 16 ++ pixldst0 \op, 16, %(\basereg+0), 2, \mem_operand, \abits ++ pixldst0 \op, 16, %(\basereg+0), 3, \mem_operand, \abits + .else +- pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits +- pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits +- pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits +- pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits ++ pixldst0 \op, 8, %(\basereg+0), 4, \mem_operand, \abits ++ pixldst0 \op, 8, %(\basereg+0), 5, \mem_operand, \abits ++ pixldst0 \op, 8, %(\basereg+0), 6, \mem_operand, \abits ++ pixldst0 \op, 8, %(\basereg+0), 7, \mem_operand, \abits + .endif +-.elseif numbytes == 2 +- .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16) +- pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits ++.elseif \numbytes == 2 ++ .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 16) ++ pixldst0 \op, 16, %(\basereg+0), 1, \mem_operand, \abits + .else +- pixldst0 op, 8, 
%(basereg+0), 2, mem_operand, abits +- pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits ++ pixldst0 \op, 8, %(\basereg+0), 2, \mem_operand, \abits ++ pixldst0 \op, 8, %(\basereg+0), 3, \mem_operand, \abits + .endif +-.elseif numbytes == 1 +- pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits ++.elseif \numbytes == 1 ++ pixldst0 \op, 8, %(\basereg+0), 1, \mem_operand, \abits + .else +- .error "unsupported size: numbytes" ++ .error "unsupported size: \numbytes" + .endif + .endm + + .macro pixld numpix, bpp, basereg, mem_operand, abits=0 +-.if bpp > 0 +-.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) +- pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \ +- %(basereg+6), %(basereg+7), mem_operand, abits +-.elseif (bpp == 24) && (numpix == 8) +- pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand +-.elseif (bpp == 24) && (numpix == 4) +- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand +- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand +- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand +- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand +-.elseif (bpp == 24) && (numpix == 2) +- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand +- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand +-.elseif (bpp == 24) && (numpix == 1) +- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand ++.if \bpp > 0 ++.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) ++ pixldst4 vld4, 8, %(\basereg+4), %(\basereg+5), \ ++ %(\basereg+6), %(\basereg+7), \mem_operand, \abits ++.elseif (\bpp == 24) && (\numpix == 8) ++ pixldst3 vld3, 8, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand ++.elseif (\bpp == 24) && (\numpix == 4) ++ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand ++ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand ++ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand ++ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand ++.elseif (\bpp == 24) && (\numpix == 2) ++ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand ++ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand ++.elseif (\bpp == 24) && (\numpix == 1) ++ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand + .else +- pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits ++ pixldst %(\numpix * \bpp / 8), vld1, %(\bpp), \basereg, \mem_operand, \abits + .endif + .endif + .endm + + .macro pixst numpix, bpp, basereg, mem_operand, abits=0 +-.if bpp > 0 +-.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) +- pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \ +- %(basereg+6), %(basereg+7), mem_operand, abits +-.elseif (bpp == 24) && (numpix == 8) +- pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand +-.elseif (bpp == 24) && (numpix == 4) +- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand +- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand +- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand +- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand +-.elseif (bpp == 24) && (numpix == 2) +- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, 
mem_operand +- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand +-.elseif (bpp == 24) && (numpix == 1) +- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand ++.if \bpp > 0 ++.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) ++ pixldst4 vst4, 8, %(\basereg+4), %(\basereg+5), \ ++ %(\basereg+6), %(\basereg+7), \mem_operand, \abits ++.elseif (\bpp == 24) && (\numpix == 8) ++ pixldst3 vst3, 8, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand ++.elseif (\bpp == 24) && (\numpix == 4) ++ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand ++ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand ++ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand ++ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand ++.elseif (\bpp == 24) && (\numpix == 2) ++ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand ++ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand ++.elseif (\bpp == 24) && (\numpix == 1) ++ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand + .else +- pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits ++ pixldst %(\numpix * \bpp / 8), vst1, %(\bpp), \basereg, \mem_operand, \abits + .endif + .endif + .endm + + .macro pixld_a numpix, bpp, basereg, mem_operand +-.if (bpp * numpix) <= 128 +- pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix) ++.if (\bpp * \numpix) <= 128 ++ pixld \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix) + .else +- pixld numpix, bpp, basereg, mem_operand, 128 ++ pixld \numpix, \bpp, \basereg, \mem_operand, 128 + .endif + .endm + + .macro pixst_a numpix, bpp, basereg, mem_operand +-.if (bpp * numpix) <= 128 +- pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix) ++.if (\bpp * \numpix) <= 128 ++ pixst \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix) + .else +- pixst numpix, bpp, basereg, mem_operand, 128 ++ pixst \numpix, \bpp, \basereg, \mem_operand, 128 + .endif + .endm + + /* + * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register + * aliases to be defined) + */ + .macro pixld1_s elem_size, reg1, mem_operand +-.if elem_size == 16 ++.if \elem_size == 16 + mov TMP1, VX, asr #16 + adds VX, VX, UNIT_X + 5: subpls VX, VX, SRC_WIDTH_FIXED + bpl 5b +- add TMP1, mem_operand, TMP1, asl #1 ++ add TMP1, \mem_operand, TMP1, asl #1 + mov TMP2, VX, asr #16 + adds VX, VX, UNIT_X + 5: subpls VX, VX, SRC_WIDTH_FIXED + bpl 5b +- add TMP2, mem_operand, TMP2, asl #1 +- vld1.16 {d®1&[0]}, [TMP1, :16] ++ add TMP2, \mem_operand, TMP2, asl #1 ++ vld1.16 {d\()\reg1\()[0]}, [TMP1, :16] + mov TMP1, VX, asr #16 + adds VX, VX, UNIT_X + 5: subpls VX, VX, SRC_WIDTH_FIXED + bpl 5b +- add TMP1, mem_operand, TMP1, asl #1 +- vld1.16 {d®1&[1]}, [TMP2, :16] ++ add TMP1, \mem_operand, TMP1, asl #1 ++ vld1.16 {d\()\reg1\()[1]}, [TMP2, :16] + mov TMP2, VX, asr #16 + adds VX, VX, UNIT_X + 5: subpls VX, VX, SRC_WIDTH_FIXED + bpl 5b +- add TMP2, mem_operand, TMP2, asl #1 +- vld1.16 {d®1&[2]}, [TMP1, :16] +- vld1.16 {d®1&[3]}, [TMP2, :16] +-.elseif elem_size == 32 ++ add TMP2, \mem_operand, TMP2, asl #1 ++ vld1.16 {d\()\reg1\()[2]}, [TMP1, :16] ++ vld1.16 {d\()\reg1\()[3]}, [TMP2, :16] ++.elseif \elem_size == 32 + mov TMP1, VX, asr #16 + adds VX, VX, UNIT_X + 5: subpls VX, VX, SRC_WIDTH_FIXED + bpl 5b +- add TMP1, mem_operand, TMP1, asl #2 ++ add TMP1, \mem_operand, TMP1, asl 
#2 + mov TMP2, VX, asr #16 + adds VX, VX, UNIT_X + 5: subpls VX, VX, SRC_WIDTH_FIXED + bpl 5b +- add TMP2, mem_operand, TMP2, asl #2 +- vld1.32 {d®1&[0]}, [TMP1, :32] +- vld1.32 {d®1&[1]}, [TMP2, :32] ++ add TMP2, \mem_operand, TMP2, asl #2 ++ vld1.32 {d\()\reg1\()[0]}, [TMP1, :32] ++ vld1.32 {d\()\reg1\()[1]}, [TMP2, :32] + .else + .error "unsupported" + .endif + .endm + + .macro pixld2_s elem_size, reg1, reg2, mem_operand + .if 0 /* elem_size == 32 */ + mov TMP1, VX, asr #16 + add VX, VX, UNIT_X, asl #1 +- add TMP1, mem_operand, TMP1, asl #2 ++ add TMP1, \mem_operand, TMP1, asl #2 + mov TMP2, VX, asr #16 + sub VX, VX, UNIT_X +- add TMP2, mem_operand, TMP2, asl #2 +- vld1.32 {d®1&[0]}, [TMP1, :32] ++ add TMP2, \mem_operand, TMP2, asl #2 ++ vld1.32 {d\()\reg1\()[0]}, [TMP1, :32] + mov TMP1, VX, asr #16 + add VX, VX, UNIT_X, asl #1 +- add TMP1, mem_operand, TMP1, asl #2 +- vld1.32 {d®2&[0]}, [TMP2, :32] ++ add TMP1, \mem_operand, TMP1, asl #2 ++ vld1.32 {d\()\reg2\()[0]}, [TMP2, :32] + mov TMP2, VX, asr #16 + add VX, VX, UNIT_X +- add TMP2, mem_operand, TMP2, asl #2 +- vld1.32 {d®1&[1]}, [TMP1, :32] +- vld1.32 {d®2&[1]}, [TMP2, :32] ++ add TMP2, \mem_operand, TMP2, asl #2 ++ vld1.32 {d\()\reg1\()[1]}, [TMP1, :32] ++ vld1.32 {d\()\reg2\()[1]}, [TMP2, :32] + .else +- pixld1_s elem_size, reg1, mem_operand +- pixld1_s elem_size, reg2, mem_operand ++ pixld1_s \elem_size, \reg1, \mem_operand ++ pixld1_s \elem_size, \reg2, \mem_operand + .endif + .endm + + .macro pixld0_s elem_size, reg1, idx, mem_operand +-.if elem_size == 16 ++.if \elem_size == 16 + mov TMP1, VX, asr #16 + adds VX, VX, UNIT_X + 5: subpls VX, VX, SRC_WIDTH_FIXED + bpl 5b +- add TMP1, mem_operand, TMP1, asl #1 +- vld1.16 {d®1&[idx]}, [TMP1, :16] +-.elseif elem_size == 32 ++ add TMP1, \mem_operand, TMP1, asl #1 ++ vld1.16 {d\()\reg1\()[\idx]}, [TMP1, :16] ++.elseif \elem_size == 32 + mov TMP1, VX, asr #16 + adds VX, VX, UNIT_X + 5: subpls VX, VX, SRC_WIDTH_FIXED + bpl 5b +- add TMP1, mem_operand, TMP1, asl #2 +- vld1.32 {d®1&[idx]}, [TMP1, :32] ++ add TMP1, \mem_operand, TMP1, asl #2 ++ vld1.32 {d\()\reg1\()[\idx]}, [TMP1, :32] + .endif + .endm + + .macro pixld_s_internal numbytes, elem_size, basereg, mem_operand +-.if numbytes == 32 +- pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand +- pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand +- pixdeinterleave elem_size, %(basereg+4) +-.elseif numbytes == 16 +- pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand +-.elseif numbytes == 8 +- pixld1_s elem_size, %(basereg+1), mem_operand +-.elseif numbytes == 4 +- .if elem_size == 32 +- pixld0_s elem_size, %(basereg+0), 1, mem_operand +- .elseif elem_size == 16 +- pixld0_s elem_size, %(basereg+0), 2, mem_operand +- pixld0_s elem_size, %(basereg+0), 3, mem_operand ++.if \numbytes == 32 ++ pixld2_s \elem_size, %(\basereg+4), %(\basereg+5), \mem_operand ++ pixld2_s \elem_size, %(\basereg+6), %(\basereg+7), \mem_operand ++ pixdeinterleave \elem_size, %(\basereg+4) ++.elseif \numbytes == 16 ++ pixld2_s \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand ++.elseif \numbytes == 8 ++ pixld1_s \elem_size, %(\basereg+1), \mem_operand ++.elseif \numbytes == 4 ++ .if \elem_size == 32 ++ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand ++ .elseif \elem_size == 16 ++ pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand ++ pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand + .else +- pixld0_s elem_size, %(basereg+0), 4, mem_operand +- pixld0_s elem_size, %(basereg+0), 5, mem_operand +- pixld0_s elem_size, %(basereg+0), 
6, mem_operand +- pixld0_s elem_size, %(basereg+0), 7, mem_operand ++ pixld0_s \elem_size, %(\basereg+0), 4, \mem_operand ++ pixld0_s \elem_size, %(\basereg+0), 5, \mem_operand ++ pixld0_s \elem_size, %(\basereg+0), 6, \mem_operand ++ pixld0_s \elem_size, %(\basereg+0), 7, \mem_operand + .endif +-.elseif numbytes == 2 +- .if elem_size == 16 +- pixld0_s elem_size, %(basereg+0), 1, mem_operand ++.elseif \numbytes == 2 ++ .if \elem_size == 16 ++ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand + .else +- pixld0_s elem_size, %(basereg+0), 2, mem_operand +- pixld0_s elem_size, %(basereg+0), 3, mem_operand ++ pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand ++ pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand + .endif +-.elseif numbytes == 1 +- pixld0_s elem_size, %(basereg+0), 1, mem_operand ++.elseif \numbytes == 1 ++ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand + .else +- .error "unsupported size: numbytes" ++ .error "unsupported size: \numbytes" + .endif + .endm + + .macro pixld_s numpix, bpp, basereg, mem_operand +-.if bpp > 0 +- pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand ++.if \bpp > 0 ++ pixld_s_internal %(\numpix * \bpp / 8), %(\bpp), \basereg, \mem_operand + .endif + .endm + + .macro vuzp8 reg1, reg2 +- vuzp.8 d®1, d®2 ++ vuzp.8 d\()\reg1, d\()\reg2 + .endm + + .macro vzip8 reg1, reg2 +- vzip.8 d®1, d®2 ++ vzip.8 d\()\reg1, d\()\reg2 + .endm + + /* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */ + .macro pixdeinterleave bpp, basereg +-.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) +- vuzp8 %(basereg+0), %(basereg+1) +- vuzp8 %(basereg+2), %(basereg+3) +- vuzp8 %(basereg+1), %(basereg+3) +- vuzp8 %(basereg+0), %(basereg+2) ++.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) ++ vuzp8 %(\basereg+0), %(\basereg+1) ++ vuzp8 %(\basereg+2), %(\basereg+3) ++ vuzp8 %(\basereg+1), %(\basereg+3) ++ vuzp8 %(\basereg+0), %(\basereg+2) + .endif + .endm + + /* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */ + .macro pixinterleave bpp, basereg +-.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) +- vzip8 %(basereg+0), %(basereg+2) +- vzip8 %(basereg+1), %(basereg+3) +- vzip8 %(basereg+2), %(basereg+3) +- vzip8 %(basereg+0), %(basereg+1) ++.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) ++ vzip8 %(\basereg+0), %(\basereg+2) ++ vzip8 %(\basereg+1), %(\basereg+3) ++ vzip8 %(\basereg+2), %(\basereg+3) ++ vzip8 %(\basereg+0), %(\basereg+1) + .endif + .endm + + /* + * This is a macro for implementing cache preload. The main idea is that + * cache preload logic is mostly independent from the rest of pixels + * processing code. It starts at the top left pixel and moves forward + * across pixels and can jump across scanlines. Prefetch distance is +@@ -389,51 +389,51 @@ 5: subpls VX, VX, SRC_WIDTH_FIXED + * for almost zero cost! + * + * (*) The overhead of the prefetcher is visible when running some trivial + * pixels processing like simple copy. Anyway, having prefetch is a must + * when working with the graphics data. 
+ */ + .macro PF a, x:vararg + .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED) +- a x ++ \a \x + .endif + .endm + + .macro cache_preload std_increment, boost_increment + .if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0) + .if regs_shortage +- PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */ ++ PF ldr, ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */ + .endif +-.if std_increment != 0 +- PF add PF_X, PF_X, #std_increment ++.if \std_increment != 0 ++ PF add, PF_X, PF_X, #\std_increment + .endif +- PF tst PF_CTL, #0xF +- PF addne PF_X, PF_X, #boost_increment +- PF subne PF_CTL, PF_CTL, #1 +- PF cmp PF_X, ORIG_W ++ PF tst, PF_CTL, #0xF ++ PF addne, PF_X, PF_X, #\boost_increment ++ PF subne, PF_CTL, PF_CTL, #1 ++ PF cmp, PF_X, ORIG_W + .if src_bpp_shift >= 0 + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + .endif + .if dst_r_bpp != 0 + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + .endif + .if mask_bpp_shift >= 0 + PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] + .endif +- PF subge PF_X, PF_X, ORIG_W +- PF subges PF_CTL, PF_CTL, #0x10 ++ PF subge, PF_X, PF_X, ORIG_W ++ PF subges, PF_CTL, PF_CTL, #0x10 + .if src_bpp_shift >= 0 +- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! ++ PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + .endif + .if dst_r_bpp != 0 +- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! ++ PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + .endif + .if mask_bpp_shift >= 0 +- PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! ++ PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! + .endif + .endif + .endm + + .macro cache_preload_simple + .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE) + .if src_bpp > 0 + pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)] +@@ -460,51 +460,53 @@ 5: subpls VX, VX, SRC_WIDTH_FIXED + .macro ensure_destination_ptr_alignment process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head + .if dst_w_bpp != 24 + tst DST_R, #0xF + beq 2f + + .irp lowbit, 1, 2, 4, 8, 16 ++#ifndef __clang__ + local skip1 +-.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) +-.if lowbit < 16 /* we don't need more than 16-byte alignment */ +- tst DST_R, #lowbit ++#endif ++.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp)) ++.if \lowbit < 16 /* we don't need more than 16-byte alignment */ ++ tst DST_R, #\lowbit + beq 1f + .endif +- pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC +- pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK ++ pixld_src (\lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC ++ pixld (\lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK + .if dst_r_bpp > 0 +- pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R ++ pixld_a (\lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R + .else +- add DST_R, DST_R, #lowbit ++ add DST_R, DST_R, #\lowbit + .endif +- PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp) +- sub W, W, #(lowbit * 8 / dst_w_bpp) ++ PF add, PF_X, PF_X, #(\lowbit * 8 / dst_w_bpp) ++ sub W, W, #(\lowbit * 8 / dst_w_bpp) + 1: + .endif + .endr + pixdeinterleave src_bpp, src_basereg + pixdeinterleave mask_bpp, mask_basereg + pixdeinterleave dst_r_bpp, dst_r_basereg + +- process_pixblock_head ++ \process_pixblock_head + cache_preload 0, pixblock_size + cache_preload_simple +- process_pixblock_tail ++ \process_pixblock_tail + + pixinterleave dst_w_bpp, dst_w_basereg + .irp lowbit, 1, 2, 4, 8, 16 
+-.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) +-.if lowbit < 16 /* we don't need more than 16-byte alignment */ +- tst DST_W, #lowbit ++.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp)) ++.if \lowbit < 16 /* we don't need more than 16-byte alignment */ ++ tst DST_W, #\lowbit + beq 1f + .endif +- pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W ++ pixst_a (\lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W + 1: + .endif + .endr + .endif + 2: + .endm + + /* +@@ -525,51 +527,51 @@ 2: + .macro process_trailing_pixels cache_preload_flag, \ + dst_aligned_flag, \ + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head + tst W, #(pixblock_size - 1) + beq 2f + .irp chunk_size, 16, 8, 4, 2, 1 +-.if pixblock_size > chunk_size +- tst W, #chunk_size ++.if pixblock_size > \chunk_size ++ tst W, #\chunk_size + beq 1f +- pixld_src chunk_size, src_bpp, src_basereg, SRC +- pixld chunk_size, mask_bpp, mask_basereg, MASK +-.if dst_aligned_flag != 0 +- pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R ++ pixld_src \chunk_size, src_bpp, src_basereg, SRC ++ pixld \chunk_size, mask_bpp, mask_basereg, MASK ++.if \dst_aligned_flag != 0 ++ pixld_a \chunk_size, dst_r_bpp, dst_r_basereg, DST_R + .else +- pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R ++ pixld \chunk_size, dst_r_bpp, dst_r_basereg, DST_R + .endif +-.if cache_preload_flag != 0 +- PF add PF_X, PF_X, #chunk_size ++.if \cache_preload_flag != 0 ++ PF add, PF_X, PF_X, #\chunk_size + .endif + 1: + .endif + .endr + pixdeinterleave src_bpp, src_basereg + pixdeinterleave mask_bpp, mask_basereg + pixdeinterleave dst_r_bpp, dst_r_basereg + +- process_pixblock_head +-.if cache_preload_flag != 0 ++ \process_pixblock_head ++.if \cache_preload_flag != 0 + cache_preload 0, pixblock_size + cache_preload_simple + .endif +- process_pixblock_tail ++ \process_pixblock_tail + pixinterleave dst_w_bpp, dst_w_basereg + .irp chunk_size, 16, 8, 4, 2, 1 +-.if pixblock_size > chunk_size +- tst W, #chunk_size ++.if pixblock_size > \chunk_size ++ tst W, #\chunk_size + beq 1f +-.if dst_aligned_flag != 0 +- pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W ++.if \dst_aligned_flag != 0 ++ pixst_a \chunk_size, dst_w_bpp, dst_w_basereg, DST_W + .else +- pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W ++ pixst \chunk_size, dst_w_bpp, dst_w_basereg, DST_W + .endif + 1: + .endif + .endr + 2: + .endm + + /* +@@ -599,17 +601,17 @@ 2: + .if (mask_bpp != 24) && (mask_bpp != 0) + sub MASK, MASK, W, lsl #mask_bpp_shift + .endif + subs H, H, #1 + mov DST_R, DST_W + .if regs_shortage + str H, [sp, #4] /* save updated height to stack */ + .endif +- bge start_of_loop_label ++ bge \start_of_loop_label + .endm + + /* + * Registers are allocated in the following way by default: + * d0, d1, d2, d3 - reserved for loading source pixel data + * d4, d5, d6, d7 - reserved for loading destination pixel data + * d24, d25, d26, d27 - reserved for loading mask pixel data + * d28, d29, d30, d31 - final destination pixel data for writeback to memory +@@ -626,48 +628,48 @@ 2: + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head, \ + dst_w_basereg_ = 28, \ + dst_r_basereg_ = 4, \ + src_basereg_ = 0, \ + mask_basereg_ = 24 + +- pixman_asm_function fname ++ pixman_asm_function \fname + + push {r4-r12, lr} /* save all registers */ + + /* + * Select prefetch type for this function. 
If prefetch distance is + * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch + * has to be used instead of ADVANCED. + */ + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT +-.if prefetch_distance == 0 ++.if \prefetch_distance == 0 + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE + .elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \ +- ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24)) ++ ((\src_bpp_ == 24) || (\mask_bpp_ == 24) || (\dst_w_bpp_ == 24)) + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE + .endif + + /* + * Make some macro arguments globally visible and accessible + * from other macros + */ +- .set src_bpp, src_bpp_ +- .set mask_bpp, mask_bpp_ +- .set dst_w_bpp, dst_w_bpp_ +- .set pixblock_size, pixblock_size_ +- .set dst_w_basereg, dst_w_basereg_ +- .set dst_r_basereg, dst_r_basereg_ +- .set src_basereg, src_basereg_ +- .set mask_basereg, mask_basereg_ ++ .set src_bpp, \src_bpp_ ++ .set mask_bpp, \mask_bpp_ ++ .set dst_w_bpp, \dst_w_bpp_ ++ .set pixblock_size, \pixblock_size_ ++ .set dst_w_basereg, \dst_w_basereg_ ++ .set dst_r_basereg, \dst_r_basereg_ ++ .set src_basereg, \src_basereg_ ++ .set mask_basereg, \mask_basereg_ + + .macro pixld_src x:vararg +- pixld x ++ pixld \x + .endm + .macro fetch_src_pixblock + pixld_src pixblock_size, src_bpp, \ + (src_basereg - pixblock_size * src_bpp / 64), SRC + .endm + /* + * Assign symbolic names to registers + */ +@@ -750,38 +752,38 @@ 2: + .elseif dst_w_bpp == 16 + .set dst_bpp_shift, 1 + .elseif dst_w_bpp == 8 + .set dst_bpp_shift, 0 + .else + .error "requested dst bpp (dst_w_bpp) is not supported" + .endif + +-.if (((flags) & FLAG_DST_READWRITE) != 0) ++.if (((\flags) & FLAG_DST_READWRITE) != 0) + .set dst_r_bpp, dst_w_bpp + .else + .set dst_r_bpp, 0 + .endif +-.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0) ++.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0) + .set DEINTERLEAVE_32BPP_ENABLED, 1 + .else + .set DEINTERLEAVE_32BPP_ENABLED, 0 + .endif + +-.if prefetch_distance < 0 || prefetch_distance > 15 +- .error "invalid prefetch distance (prefetch_distance)" ++.if \prefetch_distance < 0 || \prefetch_distance > 15 ++ .error "invalid prefetch distance (\prefetch_distance)" + .endif + + .if src_bpp > 0 + ldr SRC, [sp, #40] + .endif + .if mask_bpp > 0 + ldr MASK, [sp, #48] + .endif +- PF mov PF_X, #0 ++ PF mov, PF_X, #0 + .if src_bpp > 0 + ldr SRC_STRIDE, [sp, #44] + .endif + .if mask_bpp > 0 + ldr MASK_STRIDE, [sp, #52] + .endif + mov DST_R, DST_W + +@@ -796,24 +798,24 @@ 2: + .if dst_w_bpp == 24 + sub DST_STRIDE, DST_STRIDE, W + sub DST_STRIDE, DST_STRIDE, W, lsl #1 + .endif + + /* + * Setup advanced prefetcher initial state + */ +- PF mov PF_SRC, SRC +- PF mov PF_DST, DST_R +- PF mov PF_MASK, MASK ++ PF mov, PF_SRC, SRC ++ PF mov, PF_DST, DST_R ++ PF mov, PF_MASK, MASK + /* PF_CTL = prefetch_distance | ((h - 1) << 4) */ +- PF mov PF_CTL, H, lsl #4 +- PF add PF_CTL, #(prefetch_distance - 0x10) ++ PF mov, PF_CTL, H, lsl #4 ++ PF add, PF_CTL, #(\prefetch_distance - 0x10) + +- init ++ \init + .if regs_shortage + push {r0, r1} + .endif + subs H, H, #1 + .if regs_shortage + str H, [sp, #4] /* save updated height to stack */ + .else + mov ORIG_W, W +@@ -821,84 +823,84 @@ 2: + blt 9f + cmp W, #(pixblock_size * 2) + blt 8f + /* + * This is the start of the pipelined loop, which if optimized for + * long scanlines + */ + 0: +- ensure_destination_ptr_alignment process_pixblock_head, \ +- process_pixblock_tail, \ +- process_pixblock_tail_head ++ ensure_destination_ptr_alignment \process_pixblock_head, \ ++ 
\process_pixblock_tail, \ ++ \process_pixblock_tail_head + + /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */ + pixld_a pixblock_size, dst_r_bpp, \ + (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R + fetch_src_pixblock + pixld pixblock_size, mask_bpp, \ + (mask_basereg - pixblock_size * mask_bpp / 64), MASK +- PF add PF_X, PF_X, #pixblock_size +- process_pixblock_head ++ PF add, PF_X, PF_X, #pixblock_size ++ \process_pixblock_head + cache_preload 0, pixblock_size + cache_preload_simple + subs W, W, #(pixblock_size * 2) + blt 2f + 1: +- process_pixblock_tail_head ++ \process_pixblock_tail_head + cache_preload_simple + subs W, W, #pixblock_size + bge 1b + 2: +- process_pixblock_tail ++ \process_pixblock_tail + pixst_a pixblock_size, dst_w_bpp, \ + (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W + + /* Process the remaining trailing pixels in the scanline */ + process_trailing_pixels 1, 1, \ +- process_pixblock_head, \ +- process_pixblock_tail, \ +- process_pixblock_tail_head ++ \process_pixblock_head, \ ++ \process_pixblock_tail, \ ++ \process_pixblock_tail_head + advance_to_next_scanline 0b + + .if regs_shortage + pop {r0, r1} + .endif +- cleanup ++ \cleanup + pop {r4-r12, pc} /* exit */ + /* + * This is the start of the loop, designed to process images with small width + * (less than pixblock_size * 2 pixels). In this case neither pipelining + * nor prefetch are used. + */ + 8: + /* Process exactly pixblock_size pixels if needed */ + tst W, #pixblock_size + beq 1f + pixld pixblock_size, dst_r_bpp, \ + (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R + fetch_src_pixblock + pixld pixblock_size, mask_bpp, \ + (mask_basereg - pixblock_size * mask_bpp / 64), MASK +- process_pixblock_head +- process_pixblock_tail ++ \process_pixblock_head ++ \process_pixblock_tail + pixst pixblock_size, dst_w_bpp, \ + (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W + 1: + /* Process the remaining trailing pixels in the scanline */ + process_trailing_pixels 0, 0, \ +- process_pixblock_head, \ +- process_pixblock_tail, \ +- process_pixblock_tail_head ++ \process_pixblock_head, \ ++ \process_pixblock_tail, \ ++ \process_pixblock_tail_head + advance_to_next_scanline 8b + 9: + .if regs_shortage + pop {r0, r1} + .endif +- cleanup ++ \cleanup + pop {r4-r12, pc} /* exit */ + + .purgem fetch_src_pixblock + .purgem pixld_src + + .unreq SRC + .unreq MASK + .unreq DST_R +@@ -910,17 +912,19 @@ 9: + .unreq DST_STRIDE + .unreq MASK_STRIDE + .unreq PF_CTL + .unreq PF_X + .unreq PF_SRC + .unreq PF_DST + .unreq PF_MASK + .unreq DUMMY ++#ifndef __clang__ + .endfunc ++#endif + .endm + + /* + * A simplified variant of function generation template for a single + * scanline processing (for implementing pixman combine functions) + */ + .macro generate_composite_function_scanline use_nearest_scaling, \ + fname, \ +@@ -934,49 +938,49 @@ 9: + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head, \ + dst_w_basereg_ = 28, \ + dst_r_basereg_ = 4, \ + src_basereg_ = 0, \ + mask_basereg_ = 24 + +- pixman_asm_function fname ++ pixman_asm_function \fname + + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE + /* + * Make some macro arguments globally visible and accessible + * from other macros + */ +- .set src_bpp, src_bpp_ +- .set mask_bpp, mask_bpp_ +- .set dst_w_bpp, dst_w_bpp_ +- .set pixblock_size, pixblock_size_ +- .set dst_w_basereg, dst_w_basereg_ +- .set dst_r_basereg, dst_r_basereg_ +- .set src_basereg, src_basereg_ +- .set mask_basereg, mask_basereg_ ++ .set 
src_bpp, \src_bpp_ ++ .set mask_bpp, \mask_bpp_ ++ .set dst_w_bpp, \dst_w_bpp_ ++ .set pixblock_size, \pixblock_size_ ++ .set dst_w_basereg, \dst_w_basereg_ ++ .set dst_r_basereg, \dst_r_basereg_ ++ .set src_basereg, \src_basereg_ ++ .set mask_basereg, \mask_basereg_ + +-.if use_nearest_scaling != 0 ++.if \use_nearest_scaling != 0 + /* + * Assign symbolic names to registers for nearest scaling + */ + W .req r0 + DST_W .req r1 + SRC .req r2 + VX .req r3 + UNIT_X .req ip + MASK .req lr + TMP1 .req r4 + TMP2 .req r5 + DST_R .req r6 + SRC_WIDTH_FIXED .req r7 + + .macro pixld_src x:vararg +- pixld_s x ++ pixld_s \x + .endm + + ldr UNIT_X, [sp] + push {r4-r8, lr} + ldr SRC_WIDTH_FIXED, [sp, #(24 + 4)] + .if mask_bpp != 0 + ldr MASK, [sp, #(24 + 8)] + .endif +@@ -986,89 +990,89 @@ 9: + */ + W .req r0 /* width (is updated during processing) */ + DST_W .req r1 /* destination buffer pointer for writes */ + SRC .req r2 /* source buffer pointer */ + DST_R .req ip /* destination buffer pointer for reads */ + MASK .req r3 /* mask pointer */ + + .macro pixld_src x:vararg +- pixld x ++ pixld \x + .endm + .endif + +-.if (((flags) & FLAG_DST_READWRITE) != 0) ++.if (((\flags) & FLAG_DST_READWRITE) != 0) + .set dst_r_bpp, dst_w_bpp + .else + .set dst_r_bpp, 0 + .endif +-.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0) ++.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0) + .set DEINTERLEAVE_32BPP_ENABLED, 1 + .else + .set DEINTERLEAVE_32BPP_ENABLED, 0 + .endif + + .macro fetch_src_pixblock + pixld_src pixblock_size, src_bpp, \ + (src_basereg - pixblock_size * src_bpp / 64), SRC + .endm + +- init ++ \init + mov DST_R, DST_W + + cmp W, #pixblock_size + blt 8f + +- ensure_destination_ptr_alignment process_pixblock_head, \ +- process_pixblock_tail, \ +- process_pixblock_tail_head ++ ensure_destination_ptr_alignment \process_pixblock_head, \ ++ \process_pixblock_tail, \ ++ \process_pixblock_tail_head + + subs W, W, #pixblock_size + blt 7f + + /* Implement "head (tail_head) ... 
(tail_head) tail" loop pattern */ + pixld_a pixblock_size, dst_r_bpp, \ + (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R + fetch_src_pixblock + pixld pixblock_size, mask_bpp, \ + (mask_basereg - pixblock_size * mask_bpp / 64), MASK +- process_pixblock_head ++ \process_pixblock_head + subs W, W, #pixblock_size + blt 2f + 1: +- process_pixblock_tail_head ++ \process_pixblock_tail_head + subs W, W, #pixblock_size + bge 1b + 2: +- process_pixblock_tail ++ \process_pixblock_tail + pixst_a pixblock_size, dst_w_bpp, \ + (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W + 7: + /* Process the remaining trailing pixels in the scanline (dst aligned) */ + process_trailing_pixels 0, 1, \ +- process_pixblock_head, \ +- process_pixblock_tail, \ +- process_pixblock_tail_head ++ \process_pixblock_head, \ ++ \process_pixblock_tail, \ ++ \process_pixblock_tail_head + +- cleanup +-.if use_nearest_scaling != 0 ++ \cleanup ++.if \use_nearest_scaling != 0 + pop {r4-r8, pc} /* exit */ + .else + bx lr /* exit */ + .endif + 8: + /* Process the remaining trailing pixels in the scanline (dst unaligned) */ + process_trailing_pixels 0, 0, \ +- process_pixblock_head, \ +- process_pixblock_tail, \ +- process_pixblock_tail_head ++ \process_pixblock_head, \ ++ \process_pixblock_tail, \ ++ \process_pixblock_tail_head + +- cleanup ++ \cleanup + +-.if use_nearest_scaling != 0 ++.if \use_nearest_scaling != 0 + pop {r4-r8, pc} /* exit */ + + .unreq DST_R + .unreq SRC + .unreq W + .unreq VX + .unreq UNIT_X + .unreq TMP1 +@@ -1085,25 +1089,27 @@ 8: + .unreq DST_R + .unreq DST_W + .unreq W + .endif + + .purgem fetch_src_pixblock + .purgem pixld_src + ++#ifndef __clang__ + .endfunc ++#endif + .endm + + .macro generate_composite_function_single_scanline x:vararg +- generate_composite_function_scanline 0, x ++ generate_composite_function_scanline 0, \x + .endm + + .macro generate_composite_function_nearest_scanline x:vararg +- generate_composite_function_scanline 1, x ++ generate_composite_function_scanline 1, \x + .endm + + /* Default prologue/epilogue, nothing special needs to be done */ + + .macro default_init + .endm + + .macro default_cleanup +@@ -1129,56 +1135,56 @@ 8: + * Conversion of 8 r5g6b6 pixels packed in 128-bit register (in) + * into a planar a8r8g8b8 format (with a, r, g, b color components + * stored into 64-bit registers out_a, out_r, out_g, out_b respectively). + * + * Warning: the conversion is destructive and the original + * value (in) is lost. + */ + .macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b +- vshrn.u16 out_r, in, #8 +- vshrn.u16 out_g, in, #3 +- vsli.u16 in, in, #5 +- vmov.u8 out_a, #255 +- vsri.u8 out_r, out_r, #5 +- vsri.u8 out_g, out_g, #6 +- vshrn.u16 out_b, in, #2 ++ vshrn.u16 \out_r, \in, #8 ++ vshrn.u16 \out_g, \in, #3 ++ vsli.u16 \in, \in, #5 ++ vmov.u8 \out_a, #255 ++ vsri.u8 \out_r, \out_r, #5 ++ vsri.u8 \out_g, \out_g, #6 ++ vshrn.u16 \out_b, \in, #2 + .endm + + .macro convert_0565_to_x888 in, out_r, out_g, out_b +- vshrn.u16 out_r, in, #8 +- vshrn.u16 out_g, in, #3 +- vsli.u16 in, in, #5 +- vsri.u8 out_r, out_r, #5 +- vsri.u8 out_g, out_g, #6 +- vshrn.u16 out_b, in, #2 ++ vshrn.u16 \out_r, \in, #8 ++ vshrn.u16 \out_g, \in, #3 ++ vsli.u16 \in, \in, #5 ++ vsri.u8 \out_r, \out_r, #5 ++ vsri.u8 \out_g, \out_g, #6 ++ vshrn.u16 \out_b, \in, #2 + .endm + + /* + * Conversion from planar a8r8g8b8 format (with a, r, g, b color components + * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b6 + * pixels packed in 128-bit register (out). 
Requires two temporary 128-bit + * registers (tmp1, tmp2) + */ + .macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2 +- vshll.u8 tmp1, in_g, #8 +- vshll.u8 out, in_r, #8 +- vshll.u8 tmp2, in_b, #8 +- vsri.u16 out, tmp1, #5 +- vsri.u16 out, tmp2, #11 ++ vshll.u8 \tmp1, \in_g, #8 ++ vshll.u8 \out, \in_r, #8 ++ vshll.u8 \tmp2, \in_b, #8 ++ vsri.u16 \out, \tmp1, #5 ++ vsri.u16 \out, \tmp2, #11 + .endm + + /* + * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels + * returned in (out0, out1) registers pair. Requires one temporary + * 64-bit register (tmp). 'out1' and 'in' may overlap, the original + * value from 'in' is lost + */ + .macro convert_four_0565_to_x888_packed in, out0, out1, tmp +- vshl.u16 out0, in, #5 /* G top 6 bits */ +- vshl.u16 tmp, in, #11 /* B top 5 bits */ +- vsri.u16 in, in, #5 /* R is ready in top bits */ +- vsri.u16 out0, out0, #6 /* G is ready in top bits */ +- vsri.u16 tmp, tmp, #5 /* B is ready in top bits */ +- vshr.u16 out1, in, #8 /* R is in place */ +- vsri.u16 out0, tmp, #8 /* G & B is in place */ +- vzip.u16 out0, out1 /* everything is in place */ ++ vshl.u16 \out0, \in, #5 /* G top 6 bits */ ++ vshl.u16 \tmp, \in, #11 /* B top 5 bits */ ++ vsri.u16 \in, \in, #5 /* R is ready in top bits */ ++ vsri.u16 \out0, \out0, #6 /* G is ready in top bits */ ++ vsri.u16 \tmp, \tmp, #5 /* B is ready in top bits */ ++ vshr.u16 \out1, \in, #8 /* R is in place */ ++ vsri.u16 \out0, \tmp, #8 /* G & B is in place */ ++ vzip.u16 \out0, \out1 /* everything is in place */ + .endm +diff --git a/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S b/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S +--- a/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S ++++ b/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S +@@ -20,16 +20,20 @@ + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. 
+ * + * Author: Jeff Muizelaar (jeff@infidigm.net) + * + */ + ++#ifdef __clang__ ++#define subpls subspl ++#endif ++ + /* Prevent the stack from becoming executable */ + #if defined(__linux__) && defined(__ELF__) + .section .note.GNU-stack,"",%progbits + #endif + + .text + .arch armv6 + .object_arch armv4 +@@ -57,100 +61,107 @@ + * prefetch_braking_distance - stop prefetching when that many pixels are + * remaining before the end of scanline + */ + + .macro generate_nearest_scanline_func fname, bpp_shift, t, \ + prefetch_distance, \ + prefetch_braking_distance + +-pixman_asm_function fname ++pixman_asm_function \fname + W .req r0 + DST .req r1 + SRC .req r2 + VX .req r3 + UNIT_X .req ip + TMP1 .req r4 + TMP2 .req r5 + VXMASK .req r6 + PF_OFFS .req r7 + SRC_WIDTH_FIXED .req r8 + + ldr UNIT_X, [sp] + push {r4, r5, r6, r7, r8, r10} +- mvn VXMASK, #((1 << bpp_shift) - 1) ++ mvn VXMASK, #((1 << \bpp_shift) - 1) + ldr SRC_WIDTH_FIXED, [sp, #28] + + /* define helper macro */ + .macro scale_2_pixels +- ldr&t TMP1, [SRC, TMP1] +- and TMP2, VXMASK, VX, asr #(16 - bpp_shift) ++ ldr\()\t TMP1, [SRC, TMP1] ++ and TMP2, VXMASK, VX, asr #(16 - \bpp_shift) + adds VX, VX, UNIT_X +- str&t TMP1, [DST], #(1 << bpp_shift) ++ str\()\t TMP1, [DST], #(1 << \bpp_shift) + 9: subpls VX, VX, SRC_WIDTH_FIXED + bpl 9b + +- ldr&t TMP2, [SRC, TMP2] +- and TMP1, VXMASK, VX, asr #(16 - bpp_shift) ++ ldr\()\t TMP2, [SRC, TMP2] ++ and TMP1, VXMASK, VX, asr #(16 - \bpp_shift) + adds VX, VX, UNIT_X +- str&t TMP2, [DST], #(1 << bpp_shift) ++ str\()\t TMP2, [DST], #(1 << \bpp_shift) + 9: subpls VX, VX, SRC_WIDTH_FIXED + bpl 9b + .endm + + /* now do the scaling */ +- and TMP1, VXMASK, VX, asr #(16 - bpp_shift) ++ and TMP1, VXMASK, VX, asr #(16 - \bpp_shift) + adds VX, VX, UNIT_X + 9: subpls VX, VX, SRC_WIDTH_FIXED + bpl 9b +- subs W, W, #(8 + prefetch_braking_distance) ++ subs W, W, #(8 + \prefetch_braking_distance) + blt 2f + /* calculate prefetch offset */ +- mov PF_OFFS, #prefetch_distance ++ mov PF_OFFS, #\prefetch_distance + mla PF_OFFS, UNIT_X, PF_OFFS, VX + 1: /* main loop, process 8 pixels per iteration with prefetch */ +- pld [SRC, PF_OFFS, asr #(16 - bpp_shift)] ++ pld [SRC, PF_OFFS, asr #(16 - \bpp_shift)] + add PF_OFFS, UNIT_X, lsl #3 + scale_2_pixels + scale_2_pixels + scale_2_pixels + scale_2_pixels + subs W, W, #8 + bge 1b + 2: +- subs W, W, #(4 - 8 - prefetch_braking_distance) ++ subs W, W, #(4 - 8 - \prefetch_braking_distance) + blt 2f + 1: /* process the remaining pixels */ + scale_2_pixels + scale_2_pixels + subs W, W, #4 + bge 1b + 2: + tst W, #2 + beq 2f + scale_2_pixels + 2: + tst W, #1 +- ldrne&t TMP1, [SRC, TMP1] +- strne&t TMP1, [DST] ++#ifdef __clang__ ++ ldr\()\t\()ne TMP1, [SRC, TMP1] ++ str\()\t\()ne TMP1, [DST] ++#else ++ ldrne\()\t TMP1, [SRC, TMP1] ++ strne\()\t TMP1, [DST] ++#endif + /* cleanup helper macro */ + .purgem scale_2_pixels + .unreq DST + .unreq SRC + .unreq W + .unreq VX + .unreq UNIT_X + .unreq TMP1 + .unreq TMP2 + .unreq VXMASK + .unreq PF_OFFS + .unreq SRC_WIDTH_FIXED + /* return */ + pop {r4, r5, r6, r7, r8, r10} + bx lr ++#ifndef __clang__ + .endfunc ++#endif + .endm + + generate_nearest_scanline_func \ + pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32 + + generate_nearest_scanline_func \ + pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2, , 48, 32 +diff --git a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S +--- a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S ++++ 
b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S +@@ -20,16 +20,21 @@ + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + * + * Author: Ben Avison (bavison@riscosopen.org) + * + */ + ++#ifdef __clang__ ++#define adceqs adcseq ++#define ldmnedb ldmdbne ++#endif ++ + /* Prevent the stack from becoming executable */ + #if defined(__linux__) && defined(__ELF__) + .section .note.GNU-stack,"",%progbits + #endif + + .text + .arch armv6 + .object_arch armv4 +@@ -52,26 +57,26 @@ + * preload If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output + */ + + .macro blit_init + line_saved_regs STRIDE_D, STRIDE_S + .endm + + .macro blit_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload +- pixld cond, numbytes, firstreg, SRC, unaligned_src ++ pixld \cond, \numbytes, \firstreg, SRC, \unaligned_src + .endm + + .macro blit_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment + WK4 .req STRIDE_D + WK5 .req STRIDE_S + WK6 .req MASK + WK7 .req STRIDE_M +-110: pixld , 16, 0, SRC, unaligned_src +- pixld , 16, 4, SRC, unaligned_src ++110: pixld , 16, 0, SRC, \unaligned_src ++ pixld , 16, 4, SRC, \unaligned_src + pld [SRC, SCRATCH] + pixst , 16, 0, DST + pixst , 16, 4, DST + subs X, X, #32*8/src_bpp + bhs 110b + .unreq WK4 + .unreq WK5 + .unreq WK6 +@@ -137,17 +142,17 @@ generate_composite_function \ + mov STRIDE_M, SRC + .endm + + .macro fill_process_tail cond, numbytes, firstreg + WK4 .req SRC + WK5 .req STRIDE_S + WK6 .req MASK + WK7 .req STRIDE_M +- pixst cond, numbytes, 4, DST ++ pixst \cond, \numbytes, 4, DST + .unreq WK4 + .unreq WK5 + .unreq WK6 + .unreq WK7 + .endm + + generate_composite_function \ + pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \ +@@ -177,30 +182,30 @@ generate_composite_function \ + nop_macro, /* newline */ \ + nop_macro /* cleanup */ \ + nop_macro /* process head */ \ + fill_process_tail + + /******************************************************************************/ + + .macro src_x888_8888_pixel, cond, reg +- orr&cond WK®, WK®, #0xFF000000 ++ orr\()\cond WK\()\reg, WK\()\reg, #0xFF000000 + .endm + + .macro pixman_composite_src_x888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload +- pixld cond, numbytes, firstreg, SRC, unaligned_src ++ pixld \cond, \numbytes, \firstreg, SRC, \unaligned_src + .endm + + .macro pixman_composite_src_x888_8888_process_tail cond, numbytes, firstreg +- src_x888_8888_pixel cond, %(firstreg+0) +- .if numbytes >= 8 +- src_x888_8888_pixel cond, %(firstreg+1) +- .if numbytes == 16 +- src_x888_8888_pixel cond, %(firstreg+2) +- src_x888_8888_pixel cond, %(firstreg+3) ++ src_x888_8888_pixel \cond, %(\firstreg+0) ++ .if \numbytes >= 8 ++ src_x888_8888_pixel \cond, %(\firstreg+1) ++ .if \numbytes == 16 ++ src_x888_8888_pixel \cond, %(\firstreg+2) ++ src_x888_8888_pixel \cond, %(\firstreg+3) + .endif + .endif + .endm + + generate_composite_function \ + pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \ + 3, /* prefetch distance */ \ +@@ -217,83 +222,83 @@ generate_composite_function \ + ldr MASK, =0x07E007E0 + mov STRIDE_M, #0xFF000000 + /* Set GE[3:0] to 1010 so SEL instructions do what we want */ + ldr SCRATCH, =0x80008000 + uadd8 SCRATCH, SCRATCH, SCRATCH + .endm + + .macro src_0565_8888_2pixels, reg1, reg2 +- and SCRATCH, WK®1, MASK @ 
00000GGGGGG0000000000gggggg00000 +- bic WK®2, WK®1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb +- orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg +- mov WK®1, WK®2, lsl #16 @ rrrrr000000bbbbb0000000000000000 +- mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG +- bic WK®2, WK®2, WK®1, lsr #16 @ RRRRR000000BBBBB0000000000000000 +- orr WK®1, WK®1, WK®1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000 +- orr WK®2, WK®2, WK®2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000 +- pkhtb WK®1, WK®1, WK®1, asr #5 @ rrrrrrrr--------bbbbbbbb-------- +- sel WK®1, WK®1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb-------- +- mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg +- pkhtb WK®2, WK®2, WK®2, asr #5 @ RRRRRRRR--------BBBBBBBB-------- +- sel WK®2, WK®2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB-------- +- orr WK®1, STRIDE_M, WK®1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb +- orr WK®2, STRIDE_M, WK®2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB ++ and SCRATCH, WK\()\reg1, MASK @ 00000GGGGGG0000000000gggggg00000 ++ bic WK\()\reg2, WK\()\reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb ++ orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg ++ mov WK\()\reg1, WK\()\reg2, lsl #16 @ rrrrr000000bbbbb0000000000000000 ++ mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG ++ bic WK\()\reg2, WK\()\reg2, WK\()\reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000 ++ orr WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000 ++ orr WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000 ++ pkhtb WK\()\reg1, WK\()\reg1, WK\()\reg1, asr #5 @ rrrrrrrr--------bbbbbbbb-------- ++ sel WK\()\reg1, WK\()\reg1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb-------- ++ mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg ++ pkhtb WK\()\reg2, WK\()\reg2, WK\()\reg2, asr #5 @ RRRRRRRR--------BBBBBBBB-------- ++ sel WK\()\reg2, WK\()\reg2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB-------- ++ orr WK\()\reg1, STRIDE_M, WK\()\reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb ++ orr WK\()\reg2, STRIDE_M, WK\()\reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB + .endm + + /* This version doesn't need STRIDE_M, but is one instruction longer. + It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case? 
+- and SCRATCH, WK®1, MASK @ 00000GGGGGG0000000000gggggg00000 +- bic WK®1, WK®1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb +- orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg +- mov WK®2, WK®1, lsr #16 @ 0000000000000000RRRRR000000BBBBB +- mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000 +- bic WK®1, WK®1, WK®2, lsl #16 @ 0000000000000000rrrrr000000bbbbb +- mov WK®2, WK®2, lsl #3 @ 0000000000000RRRRR000000BBBBB000 +- mov WK®1, WK®1, lsl #3 @ 0000000000000rrrrr000000bbbbb000 +- orr WK®2, WK®2, WK®2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB +- orr WK®1, WK®1, WK®1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb +- pkhbt WK®2, WK®2, WK®2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB +- pkhbt WK®1, WK®1, WK®1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb +- sel WK®2, SCRATCH, WK®2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB +- sel WK®1, SCRATCH, WK®1 @ --------rrrrrrrrggggggggbbbbbbbb +- orr WK®2, WK®2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB +- orr WK®1, WK®1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb ++ and SCRATCH, WK\()\reg1, MASK @ 00000GGGGGG0000000000gggggg00000 ++ bic WK\()\reg1, WK\()\reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb ++ orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg ++ mov WK\()\reg2, WK\()\reg1, lsr #16 @ 0000000000000000RRRRR000000BBBBB ++ mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000 ++ bic WK\()\reg1, WK\()\reg1, WK\()\reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb ++ mov WK\()\reg2, WK\()\reg2, lsl #3 @ 0000000000000RRRRR000000BBBBB000 ++ mov WK\()\reg1, WK\()\reg1, lsl #3 @ 0000000000000rrrrr000000bbbbb000 ++ orr WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB ++ orr WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb ++ pkhbt WK\()\reg2, WK\()\reg2, WK\()\reg2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB ++ pkhbt WK\()\reg1, WK\()\reg1, WK\()\reg1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb ++ sel WK\()\reg2, SCRATCH, WK\()\reg2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB ++ sel WK\()\reg1, SCRATCH, WK\()\reg1 @ --------rrrrrrrrggggggggbbbbbbbb ++ orr WK\()\reg2, WK\()\reg2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB ++ orr WK\()\reg1, WK\()\reg1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb + */ + + .macro src_0565_8888_1pixel, reg +- bic SCRATCH, WK®, MASK @ 0000000000000000rrrrr000000bbbbb +- and WK®, WK®, MASK @ 000000000000000000000gggggg00000 +- mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000 +- mov WK®, WK®, lsl #5 @ 0000000000000000gggggg0000000000 +- orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb +- orr WK®, WK®, WK®, lsr #6 @ 000000000000000gggggggggggg00000 +- pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb +- sel WK®, WK®, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb +- orr WK®, WK®, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb ++ bic SCRATCH, WK\()\reg, MASK @ 0000000000000000rrrrr000000bbbbb ++ and WK\()\reg, WK\()\reg, MASK @ 000000000000000000000gggggg00000 ++ mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000 ++ mov WK\()\reg, WK\()\reg, lsl #5 @ 0000000000000000gggggg0000000000 ++ orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb ++ orr WK\()\reg, WK\()\reg, WK\()\reg, lsr #6 @ 000000000000000gggggggggggg00000 ++ pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb ++ sel WK\()\reg, WK\()\reg, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb ++ orr WK\()\reg, WK\()\reg, #0xFF000000 @ 
11111111rrrrrrrrggggggggbbbbbbbb + .endm + + .macro src_0565_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload +- .if numbytes == 16 +- pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src +- .elseif numbytes == 8 +- pixld , 4, firstreg, SRC, unaligned_src +- .elseif numbytes == 4 +- pixld , 2, firstreg, SRC, unaligned_src ++ .if \numbytes == 16 ++ pixldst ld,, 8, \firstreg, %(\firstreg+2),,, SRC, \unaligned_src ++ .elseif \numbytes == 8 ++ pixld , 4, \firstreg, SRC, \unaligned_src ++ .elseif \numbytes == 4 ++ pixld , 2, \firstreg, SRC, \unaligned_src + .endif + .endm + + .macro src_0565_8888_process_tail cond, numbytes, firstreg +- .if numbytes == 16 +- src_0565_8888_2pixels firstreg, %(firstreg+1) +- src_0565_8888_2pixels %(firstreg+2), %(firstreg+3) +- .elseif numbytes == 8 +- src_0565_8888_2pixels firstreg, %(firstreg+1) ++ .if \numbytes == 16 ++ src_0565_8888_2pixels \firstreg, %(\firstreg+1) ++ src_0565_8888_2pixels %(\firstreg+2), %(\firstreg+3) ++ .elseif \numbytes == 8 ++ src_0565_8888_2pixels \firstreg, %(\firstreg+1) + .else +- src_0565_8888_1pixel firstreg ++ src_0565_8888_1pixel \firstreg + .endif + .endm + + generate_composite_function \ + pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \ + 3, /* prefetch distance */ \ + src_0565_8888_init, \ +@@ -306,67 +311,67 @@ generate_composite_function \ + + .macro src_x888_0565_init + /* Hold loop invariant in MASK */ + ldr MASK, =0x001F001F + line_saved_regs STRIDE_S, ORIG_W + .endm + + .macro src_x888_0565_1pixel s, d +- and WK&d, MASK, WK&s, lsr #3 @ 00000000000rrrrr00000000000bbbbb +- and STRIDE_S, WK&s, #0xFC00 @ 0000000000000000gggggg0000000000 +- orr WK&d, WK&d, WK&d, lsr #5 @ 00000000000-----rrrrr000000bbbbb +- orr WK&d, WK&d, STRIDE_S, lsr #5 @ 00000000000-----rrrrrggggggbbbbb ++ and WK\()\d, MASK, WK\()\s, lsr #3 @ 00000000000rrrrr00000000000bbbbb ++ and STRIDE_S, WK\()\s, #0xFC00 @ 0000000000000000gggggg0000000000 ++ orr WK\()\d, WK\()\d, WK\()\d, lsr #5 @ 00000000000-----rrrrr000000bbbbb ++ orr WK\()\d, WK\()\d, STRIDE_S, lsr #5 @ 00000000000-----rrrrrggggggbbbbb + /* Top 16 bits are discarded during the following STRH */ + .endm + + .macro src_x888_0565_2pixels slo, shi, d, tmp +- and SCRATCH, WK&shi, #0xFC00 @ 0000000000000000GGGGGG0000000000 +- and WK&tmp, MASK, WK&shi, lsr #3 @ 00000000000RRRRR00000000000BBBBB +- and WK&shi, MASK, WK&slo, lsr #3 @ 00000000000rrrrr00000000000bbbbb +- orr WK&tmp, WK&tmp, WK&tmp, lsr #5 @ 00000000000-----RRRRR000000BBBBB +- orr WK&tmp, WK&tmp, SCRATCH, lsr #5 @ 00000000000-----RRRRRGGGGGGBBBBB +- and SCRATCH, WK&slo, #0xFC00 @ 0000000000000000gggggg0000000000 +- orr WK&shi, WK&shi, WK&shi, lsr #5 @ 00000000000-----rrrrr000000bbbbb +- orr WK&shi, WK&shi, SCRATCH, lsr #5 @ 00000000000-----rrrrrggggggbbbbb +- pkhbt WK&d, WK&shi, WK&tmp, lsl #16 @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb ++ and SCRATCH, WK\()\shi, #0xFC00 @ 0000000000000000GGGGGG0000000000 ++ and WK\()\tmp, MASK, WK\()\shi, lsr #3 @ 00000000000RRRRR00000000000BBBBB ++ and WK\()\shi, MASK, WK\()\slo, lsr #3 @ 00000000000rrrrr00000000000bbbbb ++ orr WK\()\tmp, WK\()\tmp, WK\()\tmp, lsr #5 @ 00000000000-----RRRRR000000BBBBB ++ orr WK\()\tmp, WK\()\tmp, SCRATCH, lsr #5 @ 00000000000-----RRRRRGGGGGGBBBBB ++ and SCRATCH, WK\()\slo, #0xFC00 @ 0000000000000000gggggg0000000000 ++ orr WK\()\shi, WK\()\shi, WK\()\shi, lsr #5 @ 00000000000-----rrrrr000000bbbbb ++ orr WK\()\shi, WK\()\shi, SCRATCH, lsr #5 @ 00000000000-----rrrrrggggggbbbbb ++ pkhbt WK\()\d, 
WK\()\shi, WK\()\tmp, lsl #16 @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb + .endm + + .macro src_x888_0565_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload + WK4 .req STRIDE_S + WK5 .req STRIDE_M + WK6 .req WK3 + WK7 .req ORIG_W +- .if numbytes == 16 ++ .if \numbytes == 16 + pixld , 16, 4, SRC, 0 + src_x888_0565_2pixels 4, 5, 0, 0 + pixld , 8, 4, SRC, 0 + src_x888_0565_2pixels 6, 7, 1, 1 + pixld , 8, 6, SRC, 0 + .else +- pixld , numbytes*2, 4, SRC, 0 ++ pixld , \numbytes*2, 4, SRC, 0 + .endif + .endm + + .macro src_x888_0565_process_tail cond, numbytes, firstreg +- .if numbytes == 16 ++ .if \numbytes == 16 + src_x888_0565_2pixels 4, 5, 2, 2 + src_x888_0565_2pixels 6, 7, 3, 4 +- .elseif numbytes == 8 ++ .elseif \numbytes == 8 + src_x888_0565_2pixels 4, 5, 1, 1 + src_x888_0565_2pixels 6, 7, 2, 2 +- .elseif numbytes == 4 ++ .elseif \numbytes == 4 + src_x888_0565_2pixels 4, 5, 1, 1 + .else + src_x888_0565_1pixel 4, 1 + .endif +- .if numbytes == 16 +- pixst , numbytes, 0, DST ++ .if \numbytes == 16 ++ pixst , \numbytes, 0, DST + .else +- pixst , numbytes, 1, DST ++ pixst , \numbytes, 1, DST + .endif + .unreq WK4 + .unreq WK5 + .unreq WK6 + .unreq WK7 + .endm + + generate_composite_function \ +@@ -377,47 +382,47 @@ generate_composite_function \ + nop_macro, /* newline */ \ + nop_macro, /* cleanup */ \ + src_x888_0565_process_head, \ + src_x888_0565_process_tail + + /******************************************************************************/ + + .macro add_8_8_8pixels cond, dst1, dst2 +- uqadd8&cond WK&dst1, WK&dst1, MASK +- uqadd8&cond WK&dst2, WK&dst2, STRIDE_M ++ uqadd8\()\cond WK\()\dst1, WK\()\dst1, MASK ++ uqadd8\()\cond WK\()\dst2, WK\()\dst2, STRIDE_M + .endm + + .macro add_8_8_4pixels cond, dst +- uqadd8&cond WK&dst, WK&dst, MASK ++ uqadd8\()\cond WK\()\dst, WK\()\dst, MASK + .endm + + .macro add_8_8_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload + WK4 .req MASK + WK5 .req STRIDE_M +- .if numbytes == 16 +- pixld cond, 8, 4, SRC, unaligned_src +- pixld cond, 16, firstreg, DST, 0 +- add_8_8_8pixels cond, firstreg, %(firstreg+1) +- pixld cond, 8, 4, SRC, unaligned_src ++ .if \numbytes == 16 ++ pixld \cond, 8, 4, SRC, \unaligned_src ++ pixld \cond, 16, \firstreg, DST, 0 ++ add_8_8_8pixels \cond, \firstreg, %(\firstreg+1) ++ pixld \cond, 8, 4, SRC, \unaligned_src + .else +- pixld cond, numbytes, 4, SRC, unaligned_src +- pixld cond, numbytes, firstreg, DST, 0 ++ pixld \cond, \numbytes, 4, SRC, \unaligned_src ++ pixld \cond, \numbytes, \firstreg, DST, 0 + .endif + .unreq WK4 + .unreq WK5 + .endm + + .macro add_8_8_process_tail cond, numbytes, firstreg +- .if numbytes == 16 +- add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3) +- .elseif numbytes == 8 +- add_8_8_8pixels cond, firstreg, %(firstreg+1) ++ .if \numbytes == 16 ++ add_8_8_8pixels \cond, %(\firstreg+2), %(\firstreg+3) ++ .elseif \numbytes == 8 ++ add_8_8_8pixels \cond, \firstreg, %(\firstreg+1) + .else +- add_8_8_4pixels cond, firstreg ++ add_8_8_4pixels \cond, \firstreg + .endif + .endm + + generate_composite_function \ + pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \ + FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \ + 2, /* prefetch distance */ \ + nop_macro, /* init */ \ +@@ -436,82 +441,82 @@ generate_composite_function \ + line_saved_regs STRIDE_D, STRIDE_S, ORIG_W + .endm + + .macro over_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload + WK4 .req STRIDE_D + WK5 .req STRIDE_S + WK6 .req STRIDE_M + WK7 .req ORIG_W 
+- pixld , numbytes, %(4+firstreg), SRC, unaligned_src +- pixld , numbytes, firstreg, DST, 0 ++ pixld , \numbytes, %(4+\firstreg), SRC, \unaligned_src ++ pixld , \numbytes, \firstreg, DST, 0 + .unreq WK4 + .unreq WK5 + .unreq WK6 + .unreq WK7 + .endm + + .macro over_8888_8888_check_transparent numbytes, reg0, reg1, reg2, reg3 + /* Since these colours a premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */ +- teq WK®0, #0 +- .if numbytes > 4 +- teqeq WK®1, #0 +- .if numbytes > 8 +- teqeq WK®2, #0 +- teqeq WK®3, #0 ++ teq WK\()\reg0, #0 ++ .if \numbytes > 4 ++ teqeq WK\()\reg1, #0 ++ .if \numbytes > 8 ++ teqeq WK\()\reg2, #0 ++ teqeq WK\()\reg3, #0 + .endif + .endif + .endm + + .macro over_8888_8888_prepare next +- mov WK&next, WK&next, lsr #24 ++ mov WK\()\next, WK\()\next, lsr #24 + .endm + + .macro over_8888_8888_1pixel src, dst, offset, next + /* src = destination component multiplier */ +- rsb WK&src, WK&src, #255 ++ rsb WK\()\src, WK\()\src, #255 + /* Split even/odd bytes of dst into SCRATCH/dst */ +- uxtb16 SCRATCH, WK&dst +- uxtb16 WK&dst, WK&dst, ror #8 ++ uxtb16 SCRATCH, WK\()\dst ++ uxtb16 WK\()\dst, WK\()\dst, ror #8 + /* Multiply through, adding 0.5 to the upper byte of result for rounding */ +- mla SCRATCH, SCRATCH, WK&src, MASK +- mla WK&dst, WK&dst, WK&src, MASK ++ mla SCRATCH, SCRATCH, WK\()\src, MASK ++ mla WK\()\dst, WK\()\dst, WK\()\src, MASK + /* Where we would have had a stall between the result of the first MLA and the shifter input, + * reload the complete source pixel */ +- ldr WK&src, [SRC, #offset] ++ ldr WK\()\src, [SRC, #\offset] + /* Multiply by 257/256 to approximate 256/255 */ + uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8 + /* In this stall, start processing the next pixel */ +- .if offset < -4 +- mov WK&next, WK&next, lsr #24 ++ .if \offset < -4 ++ mov WK\()\next, WK\()\next, lsr #24 + .endif +- uxtab16 WK&dst, WK&dst, WK&dst, ror #8 ++ uxtab16 WK\()\dst, WK\()\dst, WK\()\dst, ror #8 + /* Recombine even/odd bytes of multiplied destination */ + mov SCRATCH, SCRATCH, ror #8 +- sel WK&dst, SCRATCH, WK&dst ++ sel WK\()\dst, SCRATCH, WK\()\dst + /* Saturated add of source to multiplied destination */ +- uqadd8 WK&dst, WK&dst, WK&src ++ uqadd8 WK\()\dst, WK\()\dst, WK\()\src + .endm + + .macro over_8888_8888_process_tail cond, numbytes, firstreg + WK4 .req STRIDE_D + WK5 .req STRIDE_S + WK6 .req STRIDE_M + WK7 .req ORIG_W +- over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg) ++ over_8888_8888_check_transparent \numbytes, %(4+\firstreg), %(5+\firstreg), %(6+\firstreg), %(7+\firstreg) + beq 10f +- over_8888_8888_prepare %(4+firstreg) +- .set PROCESS_REG, firstreg +- .set PROCESS_OFF, -numbytes +- .rept numbytes / 4 ++ over_8888_8888_prepare %(4+\firstreg) ++ .set PROCESS_REG, \firstreg ++ .set PROCESS_OFF, -\numbytes ++ .rept \numbytes / 4 + over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG) + .set PROCESS_REG, PROCESS_REG+1 + .set PROCESS_OFF, PROCESS_OFF+4 + .endr +- pixst , numbytes, firstreg, DST ++ pixst , \numbytes, \firstreg, DST + 10: + .unreq WK4 + .unreq WK5 + .unreq WK6 + .unreq WK7 + .endm + + generate_composite_function \ +@@ -531,26 +536,26 @@ generate_composite_function \ + * word Register containing 4 bytes + * byte Register containing byte multiplier (bits 8-31 must be 0) + * tmp Scratch register + * half Register containing the constant 0x00800080 + * GE[3:0] bits must contain 0101 + */ + .macro mul_8888_8 
word, byte, tmp, half + /* Split even/odd bytes of word apart */ +- uxtb16 tmp, word +- uxtb16 word, word, ror #8 ++ uxtb16 \tmp, \word ++ uxtb16 \word, \word, ror #8 + /* Multiply bytes together with rounding, then by 257/256 */ +- mla tmp, tmp, byte, half +- mla word, word, byte, half /* 1 stall follows */ +- uxtab16 tmp, tmp, tmp, ror #8 /* 1 stall follows */ +- uxtab16 word, word, word, ror #8 ++ mla \tmp, \tmp, \byte, \half ++ mla \word, \word, \byte, \half /* 1 stall follows */ ++ uxtab16 \tmp, \tmp, \tmp, ror #8 /* 1 stall follows */ ++ uxtab16 \word, \word, \word, ror #8 + /* Recombine bytes */ +- mov tmp, tmp, ror #8 +- sel word, tmp, word ++ mov \tmp, \tmp, ror #8 ++ sel \word, \tmp, \word + .endm + + /******************************************************************************/ + + .macro over_8888_n_8888_init + /* Mask is constant */ + ldr MASK, [sp, #ARGS_STACK_OFFSET+8] + /* Hold loop invariant in STRIDE_M */ +@@ -562,51 +567,51 @@ generate_composite_function \ + line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W + .endm + + .macro over_8888_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload + WK4 .req Y + WK5 .req STRIDE_D + WK6 .req STRIDE_S + WK7 .req ORIG_W +- pixld , numbytes, %(4+(firstreg%2)), SRC, unaligned_src +- pixld , numbytes, firstreg, DST, 0 ++ pixld , \numbytes, %(4+(\firstreg%2)), SRC, \unaligned_src ++ pixld , \numbytes, \firstreg, DST, 0 + .unreq WK4 + .unreq WK5 + .unreq WK6 + .unreq WK7 + .endm + + .macro over_8888_n_8888_1pixel src, dst +- mul_8888_8 WK&src, MASK, SCRATCH, STRIDE_M +- sub WK7, WK6, WK&src, lsr #24 +- mul_8888_8 WK&dst, WK7, SCRATCH, STRIDE_M +- uqadd8 WK&dst, WK&dst, WK&src ++ mul_8888_8 WK\()\src, MASK, SCRATCH, STRIDE_M ++ sub WK7, WK6, WK\()\src, lsr #24 ++ mul_8888_8 WK\()\dst, WK7, SCRATCH, STRIDE_M ++ uqadd8 WK\()\dst, WK\()\dst, WK\()\src + .endm + + .macro over_8888_n_8888_process_tail cond, numbytes, firstreg + WK4 .req Y + WK5 .req STRIDE_D + WK6 .req STRIDE_S + WK7 .req ORIG_W +- over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg) ++ over_8888_8888_check_transparent \numbytes, %(4+(\firstreg%2)), %(5+(\firstreg%2)), %(6+\firstreg), %(7+\firstreg) + beq 10f + mov WK6, #255 +- .set PROCESS_REG, firstreg +- .rept numbytes / 4 +- .if numbytes == 16 && PROCESS_REG == 2 ++ .set PROCESS_REG, \firstreg ++ .rept \numbytes / 4 ++ .if \numbytes == 16 && PROCESS_REG == 2 + /* We're using WK6 and WK7 as temporaries, so half way through + * 4 pixels, reload the second two source pixels but this time + * into WK4 and WK5 */ + ldmdb SRC, {WK4, WK5} + .endif + over_8888_n_8888_1pixel %(4+(PROCESS_REG%2)), %(PROCESS_REG) + .set PROCESS_REG, PROCESS_REG+1 + .endr +- pixst , numbytes, firstreg, DST ++ pixst , \numbytes, \firstreg, DST + 10: + .unreq WK4 + .unreq WK5 + .unreq WK6 + .unreq WK7 + .endm + + generate_composite_function \ +@@ -637,47 +642,47 @@ generate_composite_function \ + ldr STRIDE_D, =0x00800080 + b 1f + .ltorg + 1: + .endm + + .macro over_n_8_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload + WK4 .req STRIDE_M +- pixld , numbytes/4, 4, MASK, unaligned_mask +- pixld , numbytes, firstreg, DST, 0 ++ pixld , \numbytes/4, 4, MASK, \unaligned_mask ++ pixld , \numbytes, \firstreg, DST, 0 + .unreq WK4 + .endm + + .macro over_n_8_8888_1pixel src, dst +- uxtb Y, WK4, ror #src*8 ++ uxtb Y, WK4, ror #\src*8 + /* Trailing part of multiplication of source */ + mla SCRATCH, STRIDE_S, Y, STRIDE_D + mla Y, SRC, Y, STRIDE_D + 
mov ORIG_W, #255 + uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8 + uxtab16 Y, Y, Y, ror #8 + mov SCRATCH, SCRATCH, ror #8 + sub ORIG_W, ORIG_W, Y, lsr #24 + sel Y, SCRATCH, Y + /* Then multiply the destination */ +- mul_8888_8 WK&dst, ORIG_W, SCRATCH, STRIDE_D +- uqadd8 WK&dst, WK&dst, Y ++ mul_8888_8 WK\()\dst, ORIG_W, SCRATCH, STRIDE_D ++ uqadd8 WK\()\dst, WK\()\dst, Y + .endm + + .macro over_n_8_8888_process_tail cond, numbytes, firstreg + WK4 .req STRIDE_M + teq WK4, #0 + beq 10f +- .set PROCESS_REG, firstreg +- .rept numbytes / 4 +- over_n_8_8888_1pixel %(PROCESS_REG-firstreg), %(PROCESS_REG) ++ .set PROCESS_REG, \firstreg ++ .rept \numbytes / 4 ++ over_n_8_8888_1pixel %(PROCESS_REG-\firstreg), %(PROCESS_REG) + .set PROCESS_REG, PROCESS_REG+1 + .endr +- pixst , numbytes, firstreg, DST ++ pixst , \numbytes, \firstreg, DST + 10: + .unreq WK4 + .endm + + generate_composite_function \ + pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32 \ + FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \ + 2, /* prefetch distance */ \ +@@ -700,64 +705,64 @@ generate_composite_function \ + line_saved_regs STRIDE_D, ORIG_W + .endm + + .macro over_reverse_n_8888_newline + mov STRIDE_D, #0xFF + .endm + + .macro over_reverse_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload +- pixld , numbytes, firstreg, DST, 0 ++ pixld , \numbytes, \firstreg, DST, 0 + .endm + + .macro over_reverse_n_8888_1pixel d, is_only +- teq WK&d, #0 ++ teq WK\()\d, #0 + beq 8f /* replace with source */ +- bics ORIG_W, STRIDE_D, WK&d, lsr #24 +- .if is_only == 1 ++ bics ORIG_W, STRIDE_D, WK\()\d, lsr #24 ++ .if \is_only == 1 + beq 49f /* skip store */ + .else + beq 9f /* write same value back */ + .endif + mla SCRATCH, STRIDE_S, ORIG_W, MASK /* red/blue */ + mla ORIG_W, STRIDE_M, ORIG_W, MASK /* alpha/green */ + uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8 + uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8 + mov SCRATCH, SCRATCH, ror #8 + sel ORIG_W, SCRATCH, ORIG_W +- uqadd8 WK&d, WK&d, ORIG_W ++ uqadd8 WK\()\d, WK\()\d, ORIG_W + b 9f +-8: mov WK&d, SRC ++8: mov WK\()\d, SRC + 9: + .endm + + .macro over_reverse_n_8888_tail numbytes, reg1, reg2, reg3, reg4 +- .if numbytes == 4 +- over_reverse_n_8888_1pixel reg1, 1 ++ .if \numbytes == 4 ++ over_reverse_n_8888_1pixel \reg1, 1 + .else +- and SCRATCH, WK®1, WK®2 +- .if numbytes == 16 +- and SCRATCH, SCRATCH, WK®3 +- and SCRATCH, SCRATCH, WK®4 ++ and SCRATCH, WK\()\reg1, WK\()\reg2 ++ .if \numbytes == 16 ++ and SCRATCH, SCRATCH, WK\()\reg3 ++ and SCRATCH, SCRATCH, WK\()\reg4 + .endif + mvns SCRATCH, SCRATCH, asr #24 + beq 49f /* skip store if all opaque */ +- over_reverse_n_8888_1pixel reg1, 0 +- over_reverse_n_8888_1pixel reg2, 0 +- .if numbytes == 16 +- over_reverse_n_8888_1pixel reg3, 0 +- over_reverse_n_8888_1pixel reg4, 0 ++ over_reverse_n_8888_1pixel \reg1, 0 ++ over_reverse_n_8888_1pixel \reg2, 0 ++ .if \numbytes == 16 ++ over_reverse_n_8888_1pixel \reg3, 0 ++ over_reverse_n_8888_1pixel \reg4, 0 + .endif + .endif +- pixst , numbytes, reg1, DST ++ pixst , \numbytes, \reg1, DST + 49: + .endm + + .macro over_reverse_n_8888_process_tail cond, numbytes, firstreg +- over_reverse_n_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3) ++ over_reverse_n_8888_tail \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3) + .endm + + generate_composite_function \ + pixman_composite_over_reverse_n_8888_asm_armv6, 0, 0, 32 \ + FLAG_DST_READWRITE | FLAG_BRANCH_OVER | 
FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \ + 3, /* prefetch distance */ \ + over_reverse_n_8888_init, \ + over_reverse_n_8888_newline, \ +@@ -789,30 +794,30 @@ generate_composite_function \ + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq WK4 + .endm + + .macro over_white_8888_8888_ca_combine m, d + uxtb16 TMP1, TMP0 /* rb_notmask */ +- uxtb16 TMP2, d /* rb_dest; 1 stall follows */ ++ uxtb16 TMP2, \d /* rb_dest; 1 stall follows */ + smlatt TMP3, TMP2, TMP1, HALF /* red */ + smlabb TMP2, TMP2, TMP1, HALF /* blue */ + uxtb16 TMP0, TMP0, ror #8 /* ag_notmask */ +- uxtb16 TMP1, d, ror #8 /* ag_dest; 1 stall follows */ +- smlatt d, TMP1, TMP0, HALF /* alpha */ ++ uxtb16 TMP1, \d, ror #8 /* ag_dest; 1 stall follows */ ++ smlatt \d, TMP1, TMP0, HALF /* alpha */ + smlabb TMP1, TMP1, TMP0, HALF /* green */ + pkhbt TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */ +- pkhbt TMP1, TMP1, d, lsl #16 /* ag */ ++ pkhbt TMP1, TMP1, \d, lsl #16 /* ag */ + uxtab16 TMP0, TMP0, TMP0, ror #8 + uxtab16 TMP1, TMP1, TMP1, ror #8 + mov TMP0, TMP0, ror #8 +- sel d, TMP0, TMP1 +- uqadd8 d, d, m /* d is a late result */ ++ sel \d, TMP0, TMP1 ++ uqadd8 \d, \d, \m /* d is a late result */ + .endm + + .macro over_white_8888_8888_ca_1pixel_head + pixld , 4, 1, MASK, 0 + pixld , 4, 3, DST, 0 + .endm + + .macro over_white_8888_8888_ca_1pixel_tail +@@ -848,29 +853,29 @@ 02: mvn TMP0, WK2 + movcs WK4, WK2 + b 04f + 03: over_white_8888_8888_ca_combine WK2, WK4 + 04: pixst , 8, 3, DST + 05: + .endm + + .macro over_white_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload +- .if numbytes == 4 ++ .if \numbytes == 4 + over_white_8888_8888_ca_1pixel_head + .else +- .if numbytes == 16 ++ .if \numbytes == 16 + over_white_8888_8888_ca_2pixels_head + over_white_8888_8888_ca_2pixels_tail + .endif + over_white_8888_8888_ca_2pixels_head + .endif + .endm + + .macro over_white_8888_8888_ca_process_tail cond, numbytes, firstreg +- .if numbytes == 4 ++ .if \numbytes == 4 + over_white_8888_8888_ca_1pixel_tail + .else + over_white_8888_8888_ca_2pixels_tail + .endif + .endm + + generate_composite_function \ + pixman_composite_over_white_8888_8888_ca_asm_armv6, 0, 32, 32 \ +@@ -999,33 +1004,35 @@ 20: /* No simplifications possible - + uqadd8 WK0, WK1, WK2 /* followed by 1 stall */ + 30: /* The destination buffer is already in the L1 cache, so + * there's little point in amalgamating writes */ + pixst , 4, 0, DST + 40: + .endm + + .macro over_n_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload +- .rept (numbytes / 4) - 1 ++ .rept (\numbytes / 4) - 1 + over_n_8888_8888_ca_1pixel_head + over_n_8888_8888_ca_1pixel_tail + .endr + over_n_8888_8888_ca_1pixel_head + .endm + + .macro over_n_8888_8888_ca_process_tail cond, numbytes, firstreg + over_n_8888_8888_ca_1pixel_tail + .endm + + pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6 + ldr ip, [sp] + cmp ip, #-1 + beq pixman_composite_over_white_8888_8888_ca_asm_armv6 + /* else drop through... 
*/ ++#ifndef __clang__ + .endfunc ++#endif + generate_composite_function \ + pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32 \ + FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0 \ + 2, /* prefetch distance */ \ + over_n_8888_8888_ca_init, \ + nop_macro, /* newline */ \ + over_n_8888_8888_ca_cleanup, \ + over_n_8888_8888_ca_process_head, \ +@@ -1040,94 +1047,94 @@ generate_composite_function \ + uadd8 SCRATCH, MASK, MASK + /* Offset the source pointer: we only need the alpha bytes */ + add SRC, SRC, #3 + line_saved_regs ORIG_W + .endm + + .macro in_reverse_8888_8888_head numbytes, reg1, reg2, reg3 + ldrb ORIG_W, [SRC], #4 +- .if numbytes >= 8 +- ldrb WK®1, [SRC], #4 +- .if numbytes == 16 +- ldrb WK®2, [SRC], #4 +- ldrb WK®3, [SRC], #4 ++ .if \numbytes >= 8 ++ ldrb WK\()\reg1, [SRC], #4 ++ .if \numbytes == 16 ++ ldrb WK\()\reg2, [SRC], #4 ++ ldrb WK\()\reg3, [SRC], #4 + .endif + .endif +- add DST, DST, #numbytes ++ add DST, DST, #\numbytes + .endm + + .macro in_reverse_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload +- in_reverse_8888_8888_head numbytes, firstreg, %(firstreg+1), %(firstreg+2) ++ in_reverse_8888_8888_head \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2) + .endm + + .macro in_reverse_8888_8888_1pixel s, d, offset, is_only +- .if is_only != 1 +- movs s, ORIG_W +- .if offset != 0 +- ldrb ORIG_W, [SRC, #offset] ++ .if \is_only != 1 ++ movs \s, ORIG_W ++ .if \offset != 0 ++ ldrb ORIG_W, [SRC, #\offset] + .endif + beq 01f + teq STRIDE_M, #0xFF + beq 02f + .endif +- uxtb16 SCRATCH, d /* rb_dest */ +- uxtb16 d, d, ror #8 /* ag_dest */ +- mla SCRATCH, SCRATCH, s, MASK +- mla d, d, s, MASK ++ uxtb16 SCRATCH, \d /* rb_dest */ ++ uxtb16 \d, \d, ror #8 /* ag_dest */ ++ mla SCRATCH, SCRATCH, \s, MASK ++ mla \d, \d, \s, MASK + uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8 +- uxtab16 d, d, d, ror #8 ++ uxtab16 \d, \d, \d, ror #8 + mov SCRATCH, SCRATCH, ror #8 +- sel d, SCRATCH, d ++ sel \d, SCRATCH, \d + b 02f +- .if offset == 0 ++ .if \offset == 0 + 48: /* Last mov d,#0 of the set - used as part of shortcut for + * source values all 0 */ + .endif +-01: mov d, #0 ++01: mov \d, #0 + 02: + .endm + + .macro in_reverse_8888_8888_tail numbytes, reg1, reg2, reg3, reg4 +- .if numbytes == 4 ++ .if \numbytes == 4 + teq ORIG_W, ORIG_W, asr #32 +- ldrne WK®1, [DST, #-4] +- .elseif numbytes == 8 +- teq ORIG_W, WK®1 ++ ldrne WK\()\reg1, [DST, #-4] ++ .elseif \numbytes == 8 ++ teq ORIG_W, WK\()\reg1 + teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */ +- ldmnedb DST, {WK®1-WK®2} ++ ldmnedb DST, {WK\()\reg1-WK\()\reg2} + .else +- teq ORIG_W, WK®1 +- teqeq ORIG_W, WK®2 +- teqeq ORIG_W, WK®3 ++ teq ORIG_W, WK\()\reg1 ++ teqeq ORIG_W, WK\()\reg2 ++ teqeq ORIG_W, WK\()\reg3 + teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? 
*/ +- ldmnedb DST, {WK®1-WK®4} ++ ldmnedb DST, {WK\()\reg1-WK\()\reg4} + .endif + cmnne DST, #0 /* clear C if NE */ + bcs 49f /* no writes to dest if source all -1 */ + beq 48f /* set dest to all 0 if source all 0 */ +- .if numbytes == 4 +- in_reverse_8888_8888_1pixel ORIG_W, WK®1, 0, 1 +- str WK®1, [DST, #-4] +- .elseif numbytes == 8 +- in_reverse_8888_8888_1pixel STRIDE_M, WK®1, -4, 0 +- in_reverse_8888_8888_1pixel STRIDE_M, WK®2, 0, 0 +- stmdb DST, {WK®1-WK®2} ++ .if \numbytes == 4 ++ in_reverse_8888_8888_1pixel ORIG_W, WK\()\reg1, 0, 1 ++ str WK\()\reg1, [DST, #-4] ++ .elseif \numbytes == 8 ++ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg1, -4, 0 ++ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg2, 0, 0 ++ stmdb DST, {WK\()\reg1-WK\()\reg2} + .else +- in_reverse_8888_8888_1pixel STRIDE_M, WK®1, -12, 0 +- in_reverse_8888_8888_1pixel STRIDE_M, WK®2, -8, 0 +- in_reverse_8888_8888_1pixel STRIDE_M, WK®3, -4, 0 +- in_reverse_8888_8888_1pixel STRIDE_M, WK®4, 0, 0 +- stmdb DST, {WK®1-WK®4} ++ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg1, -12, 0 ++ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg2, -8, 0 ++ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg3, -4, 0 ++ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg4, 0, 0 ++ stmdb DST, {WK\()\reg1-WK\()\reg4} + .endif + 49: + .endm + + .macro in_reverse_8888_8888_process_tail cond, numbytes, firstreg +- in_reverse_8888_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3) ++ in_reverse_8888_8888_tail \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3) + .endm + + generate_composite_function \ + pixman_composite_in_reverse_8888_8888_asm_armv6, 32, 0, 32 \ + FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_NO_PRELOAD_DST \ + 2, /* prefetch distance */ \ + in_reverse_8888_8888_init, \ + nop_macro, /* newline */ \ +@@ -1144,31 +1151,31 @@ generate_composite_function \ + /* Hold multiplier for destination in STRIDE_M */ + mov STRIDE_M, #255 + sub STRIDE_M, STRIDE_M, SRC, lsr #24 + /* Set GE[3:0] to 0101 so SEL instructions do what we want */ + uadd8 SCRATCH, MASK, MASK + .endm + + .macro over_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload +- pixld , numbytes, firstreg, DST, 0 ++ pixld , \numbytes, \firstreg, DST, 0 + .endm + + .macro over_n_8888_1pixel dst +- mul_8888_8 WK&dst, STRIDE_M, SCRATCH, MASK +- uqadd8 WK&dst, WK&dst, SRC ++ mul_8888_8 WK\()\dst, STRIDE_M, SCRATCH, MASK ++ uqadd8 WK\()\dst, WK\()\dst, SRC + .endm + + .macro over_n_8888_process_tail cond, numbytes, firstreg +- .set PROCESS_REG, firstreg +- .rept numbytes / 4 ++ .set PROCESS_REG, \firstreg ++ .rept \numbytes / 4 + over_n_8888_1pixel %(PROCESS_REG) + .set PROCESS_REG, PROCESS_REG+1 + .endr +- pixst , numbytes, firstreg, DST ++ pixst , \numbytes, \firstreg, DST + .endm + + generate_composite_function \ + pixman_composite_over_n_8888_asm_armv6, 0, 0, 32 \ + FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE \ + 2, /* prefetch distance */ \ + over_n_8888_init, \ + nop_macro, /* newline */ \ +diff --git a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h +--- a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h ++++ b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h +@@ -107,88 +107,120 @@ + .set PREFETCH_TYPE_NONE, 0 + .set PREFETCH_TYPE_STANDARD, 1 + + /* + * Definitions of macros for load/store of pixel data. 
+ */ + + .macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0 +- .if numbytes == 16 +- .if unaligned == 1 +- op&r&cond WK®0, [base], #4 +- op&r&cond WK®1, [base], #4 +- op&r&cond WK®2, [base], #4 +- op&r&cond WK®3, [base], #4 ++ .if \numbytes == 16 ++ .if \unaligned == 1 ++ \op\()r\()\cond WK\()\reg0, [\base], #4 ++ \op\()r\()\cond WK\()\reg1, [\base], #4 ++ \op\()r\()\cond WK\()\reg2, [\base], #4 ++ \op\()r\()\cond WK\()\reg3, [\base], #4 + .else +- op&m&cond&ia base!, {WK®0,WK®1,WK®2,WK®3} ++#ifdef __clang__ ++ \op\()mia\()\cond \base!, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3} ++#else ++ \op\()m\()\cond\()ia \base!, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3} ++#endif + .endif +- .elseif numbytes == 8 +- .if unaligned == 1 +- op&r&cond WK®0, [base], #4 +- op&r&cond WK®1, [base], #4 ++ .elseif \numbytes == 8 ++ .if \unaligned == 1 ++ \op\()r\()\cond WK\()\reg0, [\base], #4 ++ \op\()r\()\cond WK\()\reg1, [\base], #4 + .else +- op&m&cond&ia base!, {WK®0,WK®1} ++#ifdef __clang__ ++ \op\()mia\()\cond \base!, {WK\()\reg0,WK\()\reg1} ++#else ++ \op\()m\()\cond\()ia \base!, {WK\()\reg0,WK\()\reg1} ++#endif + .endif +- .elseif numbytes == 4 +- op&r&cond WK®0, [base], #4 +- .elseif numbytes == 2 +- op&r&cond&h WK®0, [base], #2 +- .elseif numbytes == 1 +- op&r&cond&b WK®0, [base], #1 ++ .elseif \numbytes == 4 ++ \op\()r\()\cond WK\()\reg0, [\base], #4 ++ .elseif \numbytes == 2 ++#ifdef __clang__ ++ \op\()rh\()\cond WK\()\reg0, [\base], #2 ++#else ++ \op\()r\()\cond\()h WK\()\reg0, [\base], #2 ++#endif ++ .elseif \numbytes == 1 ++#ifdef __clang__ ++ \op\()rb\()\cond WK\()\reg0, [\base], #1 ++#else ++ \op\()r\()\cond\()b WK\()\reg0, [\base], #1 ++#endif + .else +- .error "unsupported size: numbytes" ++ .error "unsupported size: \numbytes" + .endif + .endm + + .macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base +- .if numbytes == 16 +- stm&cond&db base, {WK®0,WK®1,WK®2,WK®3} +- .elseif numbytes == 8 +- stm&cond&db base, {WK®0,WK®1} +- .elseif numbytes == 4 +- str&cond WK®0, [base, #-4] +- .elseif numbytes == 2 +- str&cond&h WK®0, [base, #-2] +- .elseif numbytes == 1 +- str&cond&b WK®0, [base, #-1] ++ .if \numbytes == 16 ++#ifdef __clang__ ++ stm\()\cond\()db \base, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3} ++#else ++ stmdb\()\cond \base, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3} ++#endif ++ .elseif \numbytes == 8 ++#ifdef __clang__ ++ stmdb\()\cond \base, {WK\()\reg0,WK\()\reg1} ++#else ++ stm\()\cond\()db \base, {WK\()\reg0,WK\()\reg1} ++#endif ++ .elseif \numbytes == 4 ++ str\()\cond WK\()\reg0, [\base, #-4] ++ .elseif \numbytes == 2 ++#ifdef __clang__ ++ strh\()\cond WK\()\reg0, [\base, #-2] ++#else ++ str\()\cond\()h WK\()\reg0, [\base, #-2] ++#endif ++ .elseif \numbytes == 1 ++#ifdef __clang__ ++ strb\()\cond WK\()\reg0, [\base, #-1] ++#else ++ str\()\cond\()b WK\()\reg0, [\base, #-1] ++#endif + .else +- .error "unsupported size: numbytes" ++ .error "unsupported size: \numbytes" + .endif + .endm + + .macro pixld cond, numbytes, firstreg, base, unaligned +- pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned ++ pixldst ld, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base, \unaligned + .endm + + .macro pixst cond, numbytes, firstreg, base + .if (flags) & FLAG_DST_READWRITE +- pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base ++ pixst_baseupdated \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), 
%(\firstreg+2), %(\firstreg+3), \base + .else +- pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base ++ pixldst st, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base + .endif + .endm + + .macro PF a, x:vararg + .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD) +- a x ++ \a \x + .endif + .endm + + + .macro preload_leading_step1 bpp, ptr, base + /* If the destination is already 16-byte aligned, then we need to preload + * between 0 and prefetch_distance (inclusive) cache lines ahead so there + * are no gaps when the inner loop starts. + */ +- .if bpp > 0 +- PF bic, ptr, base, #31 ++ .if \bpp > 0 ++ PF bic, \ptr, \base, #31 + .set OFFSET, 0 + .rept prefetch_distance+1 +- PF pld, [ptr, #OFFSET] ++ PF pld, [\ptr, #OFFSET] + .set OFFSET, OFFSET+32 + .endr + .endif + .endm + + .macro preload_leading_step2 bpp, bpp_shift, ptr, base + /* However, if the destination is not 16-byte aligned, we may need to + * preload more cache lines than that. The question we need to ask is: +@@ -196,81 +228,81 @@ + * by which the source pointer will be rounded down for preloading, and if + * so, by how many cache lines? Effectively, we want to calculate + * leading_bytes = ((-dst)&15)*src_bpp/dst_bpp + * inner_loop_offset = (src+leading_bytes)&31 + * extra_needed = leading_bytes - inner_loop_offset + * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only + * possible when there are 4 src bytes for every 1 dst byte). + */ +- .if bpp > 0 +- .ifc base,DST ++ .if \bpp > 0 ++ .ifc \base,DST + /* The test can be simplified further when preloading the destination */ +- PF tst, base, #16 ++ PF tst, \base, #16 + PF beq, 61f + .else +- .if bpp/dst_w_bpp == 4 +- PF add, SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift ++ .if \bpp/dst_w_bpp == 4 ++ PF add, SCRATCH, \base, WK0, lsl #\bpp_shift-dst_bpp_shift + PF and, SCRATCH, SCRATCH, #31 +- PF rsb, SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift ++ PF rsb, SCRATCH, SCRATCH, WK0, lsl #\bpp_shift-dst_bpp_shift + PF sub, SCRATCH, SCRATCH, #1 /* so now ranges are -16..-1 / 0..31 / 32..63 */ + PF movs, SCRATCH, SCRATCH, lsl #32-6 /* so this sets NC / nc / Nc */ + PF bcs, 61f + PF bpl, 60f + PF pld, [ptr, #32*(prefetch_distance+2)] + .else +- PF mov, SCRATCH, base, lsl #32-5 +- PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift +- PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift ++ PF mov, SCRATCH, \base, lsl #32-5 ++ PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift ++ PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift + PF bls, 61f + .endif + .endif +-60: PF pld, [ptr, #32*(prefetch_distance+1)] ++60: PF pld, [\ptr, #32*(prefetch_distance+1)] + 61: + .endif + .endm + + #define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2)) + .macro preload_middle bpp, base, scratch_holds_offset +- .if bpp > 0 ++ .if \bpp > 0 + /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */ +- .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp) +- .if scratch_holds_offset +- PF pld, [base, SCRATCH] ++ .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/\bpp) ++ .if \scratch_holds_offset ++ PF pld, [\base, SCRATCH] + .else +- PF bic, SCRATCH, base, #31 ++ PF bic, SCRATCH, \base, #31 + PF pld, [SCRATCH, #32*prefetch_distance] + .endif + .endif + .endif + .endm + + .macro preload_trailing bpp, bpp_shift, base +- .if bpp > 0 +- .if bpp*pix_per_block > 256 ++ .if \bpp > 0 ++ .if \bpp*pix_per_block > 256 + /* 
Calculations are more complex if more than one fetch per block */ +- PF and, WK1, base, #31 +- PF add, WK1, WK1, WK0, lsl #bpp_shift +- PF add, WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1) +- PF bic, SCRATCH, base, #31 ++ PF and, WK1, \base, #31 ++ PF add, WK1, WK1, WK0, lsl #\bpp_shift ++ PF add, WK1, WK1, #32*(\bpp*pix_per_block/256-1)*(prefetch_distance+1) ++ PF bic, SCRATCH, \base, #31 + 80: PF pld, [SCRATCH, #32*(prefetch_distance+1)] + PF add, SCRATCH, SCRATCH, #32 + PF subs, WK1, WK1, #32 + PF bhi, 80b + .else + /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */ +- PF mov, SCRATCH, base, lsl #32-5 +- PF adds, SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift ++ PF mov, SCRATCH, \base, lsl #32-5 ++ PF adds, SCRATCH, SCRATCH, X, lsl #32-5+\bpp_shift + PF adceqs, SCRATCH, SCRATCH, #0 + /* The instruction above has two effects: ensures Z is only + * set if C was clear (so Z indicates that both shifted quantities + * were 0), and clears C if Z was set (so C indicates that the sum + * of the shifted quantities was greater and not equal to 32) */ + PF beq, 82f +- PF bic, SCRATCH, base, #31 ++ PF bic, SCRATCH, \base, #31 + PF bcc, 81f + PF pld, [SCRATCH, #32*(prefetch_distance+2)] + 81: PF pld, [SCRATCH, #32*(prefetch_distance+1)] + 82: + .endif + .endif + .endm + +@@ -283,97 +315,97 @@ 82: + * pixels) they cannot possibly straddle more than 2 32-byte cachelines, + * meaning there's no need for a loop. + * "bpp" - number of bits per pixel in the channel (source, mask or + * destination) that's being preloaded, or 0 if this channel is not used + * for reading + * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course) + * "base" - base address register of channel to preload (SRC, MASK or DST) + */ +- .if bpp > 0 +- .if narrow_case && (bpp <= dst_w_bpp) ++ .if \bpp > 0 ++ .if \narrow_case && (\bpp <= dst_w_bpp) + /* In these cases, each line for each channel is in either 1 or 2 cache lines */ +- PF bic, WK0, base, #31 ++ PF bic, WK0, \base, #31 + PF pld, [WK0] +- PF add, WK1, base, X, LSL #bpp_shift ++ PF add, WK1, \base, X, LSL #\bpp_shift + PF sub, WK1, WK1, #1 + PF bic, WK1, WK1, #31 + PF cmp, WK1, WK0 + PF beq, 90f + PF pld, [WK1] + 90: + .else +- PF bic, WK0, base, #31 ++ PF bic, WK0, \base, #31 + PF pld, [WK0] +- PF add, WK1, base, X, lsl #bpp_shift ++ PF add, WK1, \base, X, lsl #\bpp_shift + PF sub, WK1, WK1, #1 + PF bic, WK1, WK1, #31 + PF cmp, WK1, WK0 + PF beq, 92f + 91: PF add, WK0, WK0, #32 + PF cmp, WK0, WK1 + PF pld, [WK0] + PF bne, 91b + 92: + .endif + .endif + .endm + + + .macro conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx +- process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0 +- .if decrementx +- sub&cond X, X, #8*numbytes/dst_w_bpp ++ \process_head \cond, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, 0 ++ .if \decrementx ++ sub\()\cond X, X, #8*\numbytes/dst_w_bpp + .endif +- process_tail cond, numbytes, firstreg ++ \process_tail \cond, \numbytes, \firstreg + .if !((flags) & FLAG_PROCESS_DOES_STORE) +- pixst cond, numbytes, firstreg, DST ++ pixst \cond, \numbytes, \firstreg, DST + .endif + .endm + + .macro conditional_process1 cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx + .if (flags) & FLAG_BRANCH_OVER +- .ifc cond,mi ++ .ifc \cond,mi + bpl 100f + .endif +- .ifc cond,cs ++ .ifc \cond,cs + bcc 100f + .endif +- .ifc cond,ne ++ .ifc \cond,ne + beq 100f + .endif +- 
conditional_process1_helper , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx ++ conditional_process1_helper , \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, \decrementx + 100: + .else +- conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx ++ conditional_process1_helper \cond, \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, \decrementx + .endif + .endm + + .macro conditional_process2 test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx + .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE) + /* Can't interleave reads and writes */ +- test +- conditional_process1 cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx ++ \test ++ conditional_process1 \cond1, \process_head, \process_tail, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, \decrementx + .if (flags) & FLAG_PROCESS_CORRUPTS_PSR +- test ++ \test + .endif +- conditional_process1 cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx ++ conditional_process1 \cond2, \process_head, \process_tail, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, \decrementx + .else + /* Can interleave reads and writes for better scheduling */ +- test +- process_head cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0 +- process_head cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0 +- .if decrementx +- sub&cond1 X, X, #8*numbytes1/dst_w_bpp +- sub&cond2 X, X, #8*numbytes2/dst_w_bpp ++ \test ++ \process_head \cond1, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, 0 ++ \process_head \cond2, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, 0 ++ .if \decrementx ++ sub\()\cond1 X, X, #8*\numbytes1/dst_w_bpp ++ sub\()\cond2 X, X, #8*\numbytes2/dst_w_bpp + .endif +- process_tail cond1, numbytes1, firstreg1 +- process_tail cond2, numbytes2, firstreg2 +- pixst cond1, numbytes1, firstreg1, DST +- pixst cond2, numbytes2, firstreg2, DST ++ \process_tail \cond1, \numbytes1, \firstreg1 ++ \process_tail \cond2, \numbytes2, \firstreg2 ++ pixst \cond1, \numbytes1, \firstreg1, DST ++ pixst \cond2, \numbytes2, \firstreg2, DST + .endif + .endm + + + .macro test_bits_1_0_ptr + .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 + movs SCRATCH, X, lsl #32-1 /* C,N = bits 1,0 of DST */ + .else +@@ -395,22 +427,22 @@ 100: + .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 + .set DECREMENT_X, 0 + sub X, X, WK0, lsr #dst_bpp_shift + str X, [sp, #LINE_SAVED_REG_COUNT*4] + mov X, WK0 + .endif + /* Use unaligned loads in all cases for simplicity */ + .if dst_w_bpp == 8 +- conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X ++ conditional_process2 test_bits_1_0_ptr, mi, cs, \process_head, \process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X + .elseif dst_w_bpp == 16 + test_bits_1_0_ptr +- conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X ++ conditional_process1 cs, \process_head, \process_tail, 2, 2, 1, 1, DECREMENT_X + .endif +- conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X ++ conditional_process2 test_bits_3_2_ptr, mi, cs, \process_head, \process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X + .if (flags) & 
FLAG_PROCESS_CORRUPTS_WK0 + ldr X, [sp, #LINE_SAVED_REG_COUNT*4] + .endif + .endm + + .macro test_bits_3_2_pix + movs SCRATCH, X, lsl #dst_bpp_shift+32-3 + .endm +@@ -419,169 +451,169 @@ 100: + .if dst_w_bpp == 8 + movs SCRATCH, X, lsl #dst_bpp_shift+32-1 + .else + movs SCRATCH, X, lsr #1 + .endif + .endm + + .macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask +- conditional_process2 test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0 ++ conditional_process2 test_bits_3_2_pix, cs, mi, \process_head, \process_tail, 8, 4, 0, 2, \unaligned_src, \unaligned_mask, 0 + .if dst_w_bpp == 16 + test_bits_1_0_pix +- conditional_process1 cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0 ++ conditional_process1 cs, \process_head, \process_tail, 2, 0, \unaligned_src, \unaligned_mask, 0 + .elseif dst_w_bpp == 8 +- conditional_process2 test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0 ++ conditional_process2 test_bits_1_0_pix, cs, mi, \process_head, \process_tail, 2, 1, 0, 1, \unaligned_src, \unaligned_mask, 0 + .endif + .endm + + + .macro wide_case_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment + 110: + .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */ + .rept pix_per_block*dst_w_bpp/128 +- process_head , 16, 0, unaligned_src, unaligned_mask, 1 ++ \process_head , 16, 0, \unaligned_src, \unaligned_mask, 1 + .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) + preload_middle src_bpp, SRC, 1 + .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) + preload_middle mask_bpp, MASK, 1 + .else + preload_middle src_bpp, SRC, 0 + preload_middle mask_bpp, MASK, 0 + .endif + .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0) + /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that + * destination prefetches are 32-byte aligned. It's also the easiest channel to offset + * preloads for, to achieve staggered prefetches for multiple channels, because there are + * always two STMs per prefetch, so there is always an opposite STM on which to put the + * preload. 
Note, no need to BIC the base register here */ +- PF pld, [DST, #32*prefetch_distance - dst_alignment] ++ PF pld, [DST, #32*prefetch_distance - \dst_alignment] + .endif +- process_tail , 16, 0 ++ \process_tail , 16, 0 + .if !((flags) & FLAG_PROCESS_DOES_STORE) + pixst , 16, 0, DST + .endif + .set SUBBLOCK, SUBBLOCK+1 + .endr + subs X, X, #pix_per_block + bhs 110b + .endm + + .macro wide_case_inner_loop_and_trailing_pixels process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask + /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */ + .if dst_r_bpp > 0 + tst DST, #16 + bne 111f +- process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS ++ \process_inner_loop \process_head, \process_tail, \unaligned_src, \unaligned_mask, 16 + DST_PRELOAD_BIAS + b 112f + 111: + .endif +- process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS ++ \process_inner_loop \process_head, \process_tail, \unaligned_src, \unaligned_mask, 0 + DST_PRELOAD_BIAS + 112: + /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */ + .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256) + PF and, WK0, X, #pix_per_block-1 + .endif + preload_trailing src_bpp, src_bpp_shift, SRC + preload_trailing mask_bpp, mask_bpp_shift, MASK + .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 + preload_trailing dst_r_bpp, dst_bpp_shift, DST + .endif + add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp + /* The remainder of the line is handled identically to the medium case */ +- medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask ++ medium_case_inner_loop_and_trailing_pixels \process_head, \process_tail,, \exit_label, \unaligned_src, \unaligned_mask + .endm + + .macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask + 120: +- process_head , 16, 0, unaligned_src, unaligned_mask, 0 +- process_tail , 16, 0 ++ \process_head , 16, 0, \unaligned_src, \unaligned_mask, 0 ++ \process_tail , 16, 0 + .if !((flags) & FLAG_PROCESS_DOES_STORE) + pixst , 16, 0, DST + .endif + subs X, X, #128/dst_w_bpp + bhs 120b + /* Trailing pixels */ + tst X, #128/dst_w_bpp - 1 +- beq exit_label +- trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask ++ beq \exit_label ++ trailing_15bytes \process_head, \process_tail, \unaligned_src, \unaligned_mask + .endm + + .macro narrow_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask + tst X, #16*8/dst_w_bpp +- conditional_process1 ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0 ++ conditional_process1 ne, \process_head, \process_tail, 16, 0, \unaligned_src, \unaligned_mask, 0 + /* Trailing pixels */ + /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */ +- trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask ++ trailing_15bytes \process_head, \process_tail, \unaligned_src, \unaligned_mask + .endm + + .macro switch_on_alignment action, process_head, process_tail, process_inner_loop, exit_label + /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */ + .if mask_bpp == 8 || mask_bpp == 16 + tst MASK, #3 + bne 141f + .endif + .if src_bpp == 
8 || src_bpp == 16 + tst SRC, #3 + bne 140f + .endif +- action process_head, process_tail, process_inner_loop, exit_label, 0, 0 ++ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 0, 0 + .if src_bpp == 8 || src_bpp == 16 +- b exit_label ++ b \exit_label + 140: +- action process_head, process_tail, process_inner_loop, exit_label, 1, 0 ++ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 0 + .endif + .if mask_bpp == 8 || mask_bpp == 16 +- b exit_label ++ b \exit_label + 141: + .if src_bpp == 8 || src_bpp == 16 + tst SRC, #3 + bne 142f + .endif +- action process_head, process_tail, process_inner_loop, exit_label, 0, 1 ++ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 0, 1 + .if src_bpp == 8 || src_bpp == 16 +- b exit_label ++ b \exit_label + 142: +- action process_head, process_tail, process_inner_loop, exit_label, 1, 1 ++ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 1 + .endif + .endif + .endm + + + .macro end_of_line restore_x, vars_spilled, loop_label, last_one +- .if vars_spilled ++ .if \vars_spilled + /* Sadly, GAS doesn't seem have an equivalent of the DCI directive? */ + /* This is ldmia sp,{} */ + .word 0xE89D0000 | LINE_SAVED_REGS + .endif + subs Y, Y, #1 +- .if vars_spilled ++ .if \vars_spilled + .if (LINE_SAVED_REGS) & (1<<1) + str Y, [sp] + .endif + .endif + add DST, DST, STRIDE_D + .if src_bpp > 0 + add SRC, SRC, STRIDE_S + .endif + .if mask_bpp > 0 + add MASK, MASK, STRIDE_M + .endif +- .if restore_x ++ .if \restore_x + mov X, ORIG_W + .endif +- bhs loop_label +- .ifc "last_one","" +- .if vars_spilled ++ bhs \loop_label ++ .ifc "\last_one","" ++ .if \vars_spilled + b 197f + .else + b 198f + .endif + .else +- .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS) ++ .if (!\vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS) + b 198f + .endif + .endif + .endm + + + .macro generate_composite_function fname, \ + src_bpp_, \ +@@ -591,27 +623,27 @@ 142: + prefetch_distance_, \ + init, \ + newline, \ + cleanup, \ + process_head, \ + process_tail, \ + process_inner_loop + +- pixman_asm_function fname ++ pixman_asm_function \fname + + /* + * Make some macro arguments globally visible and accessible + * from other macros + */ +- .set src_bpp, src_bpp_ +- .set mask_bpp, mask_bpp_ +- .set dst_w_bpp, dst_w_bpp_ +- .set flags, flags_ +- .set prefetch_distance, prefetch_distance_ ++ .set src_bpp, \src_bpp_ ++ .set mask_bpp, \mask_bpp_ ++ .set dst_w_bpp, \dst_w_bpp_ ++ .set flags, \flags_ ++ .set prefetch_distance, \prefetch_distance_ + + /* + * Select prefetch type for this function. 
+ */ + .if prefetch_distance == 0 + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE + .else + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD +@@ -727,17 +759,17 @@ 142: + .endif + + #ifdef DEBUG_PARAMS + add Y, Y, #1 + stmia sp, {r0-r7,pc} + sub Y, Y, #1 + #endif + +- init ++ \init + + .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 + /* Reserve a word in which to store X during leading pixels */ + sub sp, sp, #4 + .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4 + .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4 + .endif + +@@ -768,47 +800,47 @@ 142: + mov ORIG_W, X + .if (flags) & FLAG_SPILL_LINE_VARS_WIDE + /* This is stmdb sp!,{} */ + .word 0xE92D0000 | LINE_SAVED_REGS + .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 + .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 + .endif + 151: /* New line */ +- newline ++ \newline + preload_leading_step1 src_bpp, WK1, SRC + preload_leading_step1 mask_bpp, WK2, MASK + .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 + preload_leading_step1 dst_r_bpp, WK3, DST + .endif + + ands WK0, DST, #15 + beq 154f + rsb WK0, WK0, #16 /* number of leading bytes until destination aligned */ + + preload_leading_step2 src_bpp, src_bpp_shift, WK1, SRC + preload_leading_step2 mask_bpp, mask_bpp_shift, WK2, MASK + .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 + preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST + .endif + +- leading_15bytes process_head, process_tail ++ leading_15bytes \process_head, \process_tail + + 154: /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */ + .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) + and SCRATCH, SRC, #31 + rsb SCRATCH, SCRATCH, #32*prefetch_distance + .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) + and SCRATCH, MASK, #31 + rsb SCRATCH, SCRATCH, #32*prefetch_distance + .endif +- .ifc "process_inner_loop","" +- switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f ++ .ifc "\process_inner_loop","" ++ switch_on_alignment wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, wide_case_inner_loop, 157f + .else +- switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f ++ switch_on_alignment wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, \process_inner_loop, 157f + .endif + + 157: /* Check for another line */ + end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b + .if (flags) & FLAG_SPILL_LINE_VARS_WIDE + .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 + .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 + .endif +@@ -820,80 +852,80 @@ 160: /* Medium case */ + mov ORIG_W, X + .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE + /* This is stmdb sp!,{} */ + .word 0xE92D0000 | LINE_SAVED_REGS + .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 + .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 + .endif + 161: /* New line */ +- newline ++ \newline + preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */ + preload_line 0, mask_bpp, mask_bpp_shift, MASK + .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 + preload_line 0, dst_r_bpp, dst_bpp_shift, DST + .endif + + sub X, X, #128/dst_w_bpp /* simplifies inner loop termination */ + ands WK0, DST, #15 + beq 164f + rsb WK0, WK0, #16 /* number of leading bytes until 
destination aligned */ + +- leading_15bytes process_head, process_tail ++ leading_15bytes \process_head, \process_tail + + 164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */ +- switch_on_alignment medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f ++ switch_on_alignment medium_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 167f + + 167: /* Check for another line */ + end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b + + .ltorg + + 170: /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */ + .if dst_w_bpp < 32 + mov ORIG_W, X + .endif + .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE + /* This is stmdb sp!,{} */ + .word 0xE92D0000 | LINE_SAVED_REGS + .endif + 171: /* New line */ +- newline ++ \newline + preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */ + preload_line 1, mask_bpp, mask_bpp_shift, MASK + .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 + preload_line 1, dst_r_bpp, dst_bpp_shift, DST + .endif + + .if dst_w_bpp == 8 + tst DST, #3 + beq 174f + 172: subs X, X, #1 + blo 177f +- process_head , 1, 0, 1, 1, 0 +- process_tail , 1, 0 ++ \process_head , 1, 0, 1, 1, 0 ++ \process_tail , 1, 0 + .if !((flags) & FLAG_PROCESS_DOES_STORE) + pixst , 1, 0, DST + .endif + tst DST, #3 + bne 172b + .elseif dst_w_bpp == 16 + tst DST, #2 + beq 174f + subs X, X, #1 + blo 177f +- process_head , 2, 0, 1, 1, 0 +- process_tail , 2, 0 ++ \process_head , 2, 0, 1, 1, 0 ++ \process_tail , 2, 0 + .if !((flags) & FLAG_PROCESS_DOES_STORE) + pixst , 2, 0, DST + .endif + .endif + + 174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */ +- switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f ++ switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 177f + + 177: /* Check for another line */ + end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one + .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE + .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 + .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 + .endif + +@@ -903,17 +935,17 @@ 197: + .endif + 198: + .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 + .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4 + .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4 + add sp, sp, #4 + .endif + +- cleanup ++ \cleanup + + #ifdef DEBUG_PARAMS + add sp, sp, #9*4 /* junk the debug copy of arguments */ + #endif + 199: + pop {r4-r11, pc} /* exit */ + + .ltorg +@@ -927,23 +959,25 @@ 199: + .unreq MASK + .unreq STRIDE_M + .unreq WK0 + .unreq WK1 + .unreq WK2 + .unreq WK3 + .unreq SCRATCH + .unreq ORIG_W ++#ifndef __clang__ + .endfunc ++#endif + .endm + + .macro line_saved_regs x:vararg + .set LINE_SAVED_REGS, 0 + .set LINE_SAVED_REG_COUNT, 0 +- .irp SAVED_REG,x ++ .irp SAVED_REG,\x + .ifc "SAVED_REG","Y" + .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1) + .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 + .endif + .ifc "SAVED_REG","STRIDE_D" + .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3) + .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 + .endif diff --git a/gfx/cairo/pixman-arm64-clang.patch b/gfx/cairo/pixman-arm64-clang.patch new file mode 100644 index 000000000000..f0597345319d --- /dev/null +++ b/gfx/cairo/pixman-arm64-clang.patch @@ -0,0 +1,3756 @@ +https://gitlab.freedesktop.org/pixman/pixman/-/merge_requests/71 + +diff --git a/gfx/cairo/libpixman/src/pixman-arm-asm.h 
b/gfx/cairo/libpixman/src/pixman-arm-asm.h +--- a/gfx/cairo/libpixman/src/pixman-arm-asm.h ++++ b/gfx/cairo/libpixman/src/pixman-arm-asm.h +@@ -21,17 +21,33 @@ + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + * + * Author: Jeff Muizelaar (jeff@infidigm.net) + * + */ + + /* Supplementary macro for setting function attributes */ +-.macro pixman_asm_function fname +- .func fname +- .global fname ++.macro pixman_asm_function_impl fname ++#ifdef ASM_HAVE_FUNC_DIRECTIVE ++ .func \fname ++#endif ++ .global \fname + #ifdef __ELF__ +- .hidden fname +- .type fname, %function ++ .hidden \fname ++ .type \fname, %function + #endif +-fname: ++\fname: + .endm ++ ++.macro pixman_asm_function fname ++#ifdef ASM_LEADING_UNDERSCORE ++ pixman_asm_function_impl _\fname ++#else ++ pixman_asm_function_impl \fname ++#endif ++.endm ++ ++.macro pixman_end_asm_function ++#ifdef ASM_HAVE_FUNC_DIRECTIVE ++ .endfunc ++#endif ++.endm +diff --git a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S +--- a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S ++++ b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S +@@ -72,219 +72,219 @@ + * format conversion, and interpolation as separate macros which can be used + * as the basic building blocks for constructing bilinear scanline functions. + */ + + .macro bilinear_load_8888 reg1, reg2, tmp + asr WTMP1, X, #16 + add X, X, UX + add TMP1, TOP, TMP1, lsl #2 +- ld1 {®1&.2s}, [TMP1], STRIDE +- ld1 {®2&.2s}, [TMP1] ++ ld1 {\()\reg1\().2s}, [TMP1], STRIDE ++ ld1 {\()\reg2\().2s}, [TMP1] + .endm + + .macro bilinear_load_0565 reg1, reg2, tmp + asr WTMP1, X, #16 + add X, X, UX + add TMP1, TOP, TMP1, lsl #1 +- ld1 {®2&.s}[0], [TMP1], STRIDE +- ld1 {®2&.s}[1], [TMP1] +- convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp ++ ld1 {\()\reg2\().s}[0], [TMP1], STRIDE ++ ld1 {\()\reg2\().s}[1], [TMP1] ++ convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp + .endm + + .macro bilinear_load_and_vertical_interpolate_two_8888 \ + acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2 + +- bilinear_load_8888 reg1, reg2, tmp1 +- umull &acc1&.8h, ®1&.8b, v28.8b +- umlal &acc1&.8h, ®2&.8b, v29.8b +- bilinear_load_8888 reg3, reg4, tmp2 +- umull &acc2&.8h, ®3&.8b, v28.8b +- umlal &acc2&.8h, ®4&.8b, v29.8b ++ bilinear_load_8888 \reg1, \reg2, \tmp1 ++ umull \()\acc1\().8h, \()\reg1\().8b, v28.8b ++ umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b ++ bilinear_load_8888 \reg3, \reg4, \tmp2 ++ umull \()\acc2\().8h, \()\reg3\().8b, v28.8b ++ umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b + .endm + + .macro bilinear_load_and_vertical_interpolate_four_8888 \ +- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ ++ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \ + yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi + + bilinear_load_and_vertical_interpolate_two_8888 \ +- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi ++ \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, xacc2hi + bilinear_load_and_vertical_interpolate_two_8888 \ +- yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi ++ \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi + .endm + + .macro vzip reg1, reg2 +- zip1 v24.8b, reg1, reg2 +- zip2 reg2, reg1, reg2 +- mov reg1, v24.8b ++ zip1 v24.8b, \reg1, \reg2 ++ zip2 \reg2, \reg1, \reg2 ++ mov \reg1, v24.8b + .endm + + .macro vuzp reg1, reg2 +- uzp1 v24.8b, reg1, reg2 +- uzp2 reg2, reg1, reg2 +- mov reg1, v24.8b ++ uzp1 
v24.8b, \reg1, \reg2 ++ uzp2 \reg2, \reg1, \reg2 ++ mov \reg1, v24.8b + .endm + + .macro bilinear_load_and_vertical_interpolate_two_0565 \ + acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi + asr WTMP1, X, #16 + add X, X, UX + add TMP1, TOP, TMP1, lsl #1 + asr WTMP2, X, #16 + add X, X, UX + add TMP2, TOP, TMP2, lsl #1 +- ld1 {&acc2&.s}[0], [TMP1], STRIDE +- ld1 {&acc2&.s}[2], [TMP2], STRIDE +- ld1 {&acc2&.s}[1], [TMP1] +- ld1 {&acc2&.s}[3], [TMP2] +- convert_0565_to_x888 acc2, reg3, reg2, reg1 +- vzip ®1&.8b, ®3&.8b +- vzip ®2&.8b, ®4&.8b +- vzip ®3&.8b, ®4&.8b +- vzip ®1&.8b, ®2&.8b +- umull &acc1&.8h, ®1&.8b, v28.8b +- umlal &acc1&.8h, ®2&.8b, v29.8b +- umull &acc2&.8h, ®3&.8b, v28.8b +- umlal &acc2&.8h, ®4&.8b, v29.8b ++ ld1 {\()\acc2\().s}[0], [TMP1], STRIDE ++ ld1 {\()\acc2\().s}[2], [TMP2], STRIDE ++ ld1 {\()\acc2\().s}[1], [TMP1] ++ ld1 {\()\acc2\().s}[3], [TMP2] ++ convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1 ++ vzip \()\reg1\().8b, \()\reg3\().8b ++ vzip \()\reg2\().8b, \()\reg4\().8b ++ vzip \()\reg3\().8b, \()\reg4\().8b ++ vzip \()\reg1\().8b, \()\reg2\().8b ++ umull \()\acc1\().8h, \()\reg1\().8b, v28.8b ++ umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b ++ umull \()\acc2\().8h, \()\reg3\().8b, v28.8b ++ umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b + .endm + + .macro bilinear_load_and_vertical_interpolate_four_0565 \ +- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ ++ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \ + yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi + + asr WTMP1, X, #16 + add X, X, UX + add TMP1, TOP, TMP1, lsl #1 + asr WTMP2, X, #16 + add X, X, UX + add TMP2, TOP, TMP2, lsl #1 +- ld1 {&xacc2&.s}[0], [TMP1], STRIDE +- ld1 {&xacc2&.s}[2], [TMP2], STRIDE +- ld1 {&xacc2&.s}[1], [TMP1] +- ld1 {&xacc2&.s}[3], [TMP2] +- convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1 ++ ld1 {\()\xacc2\().s}[0], [TMP1], STRIDE ++ ld1 {\()\xacc2\().s}[2], [TMP2], STRIDE ++ ld1 {\()\xacc2\().s}[1], [TMP1] ++ ld1 {\()\xacc2\().s}[3], [TMP2] ++ convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1 + asr WTMP1, X, #16 + add X, X, UX + add TMP1, TOP, TMP1, lsl #1 + asr WTMP2, X, #16 + add X, X, UX + add TMP2, TOP, TMP2, lsl #1 +- ld1 {&yacc2&.s}[0], [TMP1], STRIDE +- vzip &xreg1&.8b, &xreg3&.8b +- ld1 {&yacc2&.s}[2], [TMP2], STRIDE +- vzip &xreg2&.8b, &xreg4&.8b +- ld1 {&yacc2&.s}[1], [TMP1] +- vzip &xreg3&.8b, &xreg4&.8b +- ld1 {&yacc2&.s}[3], [TMP2] +- vzip &xreg1&.8b, &xreg2&.8b +- convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1 +- umull &xacc1&.8h, &xreg1&.8b, v28.8b +- vzip &yreg1&.8b, &yreg3&.8b +- umlal &xacc1&.8h, &xreg2&.8b, v29.8b +- vzip &yreg2&.8b, &yreg4&.8b +- umull &xacc2&.8h, &xreg3&.8b, v28.8b +- vzip &yreg3&.8b, &yreg4&.8b +- umlal &xacc2&.8h, &xreg4&.8b, v29.8b +- vzip &yreg1&.8b, &yreg2&.8b +- umull &yacc1&.8h, &yreg1&.8b, v28.8b +- umlal &yacc1&.8h, &yreg2&.8b, v29.8b +- umull &yacc2&.8h, &yreg3&.8b, v28.8b +- umlal &yacc2&.8h, &yreg4&.8b, v29.8b ++ ld1 {\()\yacc2\().s}[0], [TMP1], STRIDE ++ vzip \()\xreg1\().8b, \()\xreg3\().8b ++ ld1 {\()\yacc2\().s}[2], [TMP2], STRIDE ++ vzip \()\xreg2\().8b, \()\xreg4\().8b ++ ld1 {\()\yacc2\().s}[1], [TMP1] ++ vzip \()\xreg3\().8b, \()\xreg4\().8b ++ ld1 {\()\yacc2\().s}[3], [TMP2] ++ vzip \()\xreg1\().8b, \()\xreg2\().8b ++ convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1 ++ umull \()\xacc1\().8h, \()\xreg1\().8b, v28.8b ++ vzip \()\yreg1\().8b, \()\yreg3\().8b ++ umlal \()\xacc1\().8h, \()\xreg2\().8b, v29.8b ++ vzip \()\yreg2\().8b, \()\yreg4\().8b ++ umull \()\xacc2\().8h, 
\()\xreg3\().8b, v28.8b ++ vzip \()\yreg3\().8b, \()\yreg4\().8b ++ umlal \()\xacc2\().8h, \()\xreg4\().8b, v29.8b ++ vzip \()\yreg1\().8b, \()\yreg2\().8b ++ umull \()\yacc1\().8h, \()\yreg1\().8b, v28.8b ++ umlal \()\yacc1\().8h, \()\yreg2\().8b, v29.8b ++ umull \()\yacc2\().8h, \()\yreg3\().8b, v28.8b ++ umlal \()\yacc2\().8h, \()\yreg4\().8b, v29.8b + .endm + + .macro bilinear_store_8888 numpix, tmp1, tmp2 +-.if numpix == 4 ++.if \numpix == 4 + st1 {v0.2s, v1.2s}, [OUT], #16 +-.elseif numpix == 2 ++.elseif \numpix == 2 + st1 {v0.2s}, [OUT], #8 +-.elseif numpix == 1 ++.elseif \numpix == 1 + st1 {v0.s}[0], [OUT], #4 + .else +- .error bilinear_store_8888 numpix is unsupported ++ .error bilinear_store_8888 \numpix is unsupported + .endif + .endm + + .macro bilinear_store_0565 numpix, tmp1, tmp2 + vuzp v0.8b, v1.8b + vuzp v2.8b, v3.8b + vuzp v1.8b, v3.8b + vuzp v0.8b, v2.8b +- convert_8888_to_0565 v2, v1, v0, v1, tmp1, tmp2 +-.if numpix == 4 ++ convert_8888_to_0565 v2, v1, v0, v1, \tmp1, \tmp2 ++.if \numpix == 4 + st1 {v1.4h}, [OUT], #8 +-.elseif numpix == 2 ++.elseif \numpix == 2 + st1 {v1.s}[0], [OUT], #4 +-.elseif numpix == 1 ++.elseif \numpix == 1 + st1 {v1.h}[0], [OUT], #2 + .else +- .error bilinear_store_0565 numpix is unsupported ++ .error bilinear_store_0565 \numpix is unsupported + .endif + .endm + + + /* + * Macros for loading mask pixels into register 'mask'. + * dup must be done in somewhere else. + */ + .macro bilinear_load_mask_x numpix, mask + .endm + + .macro bilinear_load_mask_8 numpix, mask +-.if numpix == 4 +- ld1 {&mask&.s}[0], [MASK], #4 +-.elseif numpix == 2 +- ld1 {&mask&.h}[0], [MASK], #2 +-.elseif numpix == 1 +- ld1 {&mask&.b}[0], [MASK], #1 ++.if \numpix == 4 ++ ld1 {\()\mask\().s}[0], [MASK], #4 ++.elseif \numpix == 2 ++ ld1 {\()\mask\().h}[0], [MASK], #2 ++.elseif \numpix == 1 ++ ld1 {\()\mask\().b}[0], [MASK], #1 + .else +- .error bilinear_load_mask_8 numpix is unsupported ++ .error bilinear_load_mask_8 \numpix is unsupported + .endif +- prfm PREFETCH_MODE, [MASK, #prefetch_offset] ++ prfum PREFETCH_MODE, [MASK, #(prefetch_offset)] + .endm + + .macro bilinear_load_mask mask_fmt, numpix, mask +- bilinear_load_mask_&mask_fmt numpix, mask ++ bilinear_load_mask_\mask_fmt \numpix, \mask + .endm + + + /* + * Macros for loading destination pixels into register 'dst0' and 'dst1'. + * Interleave should be done somewhere else. 
+ */ + .macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01 + .endm + + .macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01 + .endm + + .macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01 +-.if numpix == 4 +- ld1 {&dst0&.2s, &dst1&.2s}, [OUT] +-.elseif numpix == 2 +- ld1 {&dst0&.2s}, [OUT] +-.elseif numpix == 1 +- ld1 {&dst0&.s}[0], [OUT] ++.if \numpix == 4 ++ ld1 {\()\dst0\().2s, \()\dst1\().2s}, [OUT] ++.elseif \numpix == 2 ++ ld1 {\()\dst0\().2s}, [OUT] ++.elseif \numpix == 1 ++ ld1 {\()\dst0\().s}[0], [OUT] + .else +- .error bilinear_load_dst_8888 numpix is unsupported ++ .error bilinear_load_dst_8888 \numpix is unsupported + .endif +- mov &dst01&.d[0], &dst0&.d[0] +- mov &dst01&.d[1], &dst1&.d[0] ++ mov \()\dst01\().d[0], \()\dst0\().d[0] ++ mov \()\dst01\().d[1], \()\dst1\().d[0] + prfm PREFETCH_MODE, [OUT, #(prefetch_offset * 4)] + .endm + + .macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01 +- bilinear_load_dst_8888 numpix, dst0, dst1, dst01 ++ bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01 + .endm + + .macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01 +- bilinear_load_dst_8888 numpix, dst0, dst1, dst01 ++ bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01 + .endm + + .macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01 +- bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01 ++ bilinear_load_dst_\()\dst_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01 + .endm + + /* + * Macros for duplicating partially loaded mask to fill entire register. + * We will apply mask to interleaved source pixels, that is + * (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3) + * (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3) + * So, we need to duplicate loaded mask into whole register. +@@ -293,84 +293,85 @@ + * (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1) + * (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1) + * We can do some optimizations for this including last pixel cases. + */ + .macro bilinear_duplicate_mask_x numpix, mask + .endm + + .macro bilinear_duplicate_mask_8 numpix, mask +-.if numpix == 4 +- dup &mask&.2s, &mask&.s[0] +-.elseif numpix == 2 +- dup &mask&.4h, &mask&.h[0] +-.elseif numpix == 1 +- dup &mask&.8b, &mask&.b[0] ++.if \numpix == 4 ++ dup \()\mask\().2s, \()\mask\().s[0] ++.elseif \numpix == 2 ++ dup \()\mask\().4h, \()\mask\().h[0] ++.elseif \numpix == 1 ++ dup \()\mask\().8b, \()\mask\().b[0] + .else +- .error bilinear_duplicate_mask_8 is unsupported ++ .error bilinear_duplicate_\mask_8 is unsupported + .endif + .endm + + .macro bilinear_duplicate_mask mask_fmt, numpix, mask +- bilinear_duplicate_mask_&mask_fmt numpix, mask ++ bilinear_duplicate_mask_\()\mask_fmt \numpix, \mask + .endm + + /* + * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form. + * Interleave should be done when maks is enabled or operator is 'over'. 
+ */ + .macro bilinear_interleave src0, src1, src01, dst0, dst1, dst01 +- vuzp &src0&.8b, &src1&.8b +- vuzp &dst0&.8b, &dst1&.8b +- vuzp &src0&.8b, &src1&.8b +- vuzp &dst0&.8b, &dst1&.8b +- mov &src01&.d[1], &src1&.d[0] +- mov &src01&.d[0], &src0&.d[0] +- mov &dst01&.d[1], &dst1&.d[0] +- mov &dst01&.d[0], &dst0&.d[0] ++ vuzp \()\src0\().8b, \()\src1\().8b ++ vuzp \()\dst0\().8b, \()\dst1\().8b ++ vuzp \()\src0\().8b, \()\src1\().8b ++ vuzp \()\dst0\().8b, \()\dst1\().8b ++ mov \()\src01\().d[1], \()\src1\().d[0] ++ mov \()\src01\().d[0], \()\src0\().d[0] ++ mov \()\dst01\().d[1], \()\dst1\().d[0] ++ mov \()\dst01\().d[0], \()\dst0\().d[0] + .endm + + .macro bilinear_interleave_src_dst_x_src \ + numpix, src0, src1, src01, dst0, dst1, dst01 + .endm + + .macro bilinear_interleave_src_dst_x_over \ + numpix, src0, src1, src01, dst0, dst1, dst01 + +- bilinear_interleave src0, src1, src01, dst0, dst1, dst01 ++ bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01 + .endm + + .macro bilinear_interleave_src_dst_x_add \ + numpix, src0, src1, src01, dst0, dst1, dst01 +- bilinear_interleave src0, src1, src01, dst0, dst1, dst01 ++ ++ bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01 + .endm + + .macro bilinear_interleave_src_dst_8_src \ + numpix, src0, src1, src01, dst0, dst1, dst01 + +- bilinear_interleave src0, src1, src01, dst0, dst1, dst01 ++ bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01 + .endm + + .macro bilinear_interleave_src_dst_8_over \ + numpix, src0, src1, src01, dst0, dst1, dst01 + +- bilinear_interleave src0, src1, src01, dst0, dst1, dst01 ++ bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01 + .endm + + .macro bilinear_interleave_src_dst_8_add \ + numpix, src0, src1, src01, dst0, dst1, dst01 + +- bilinear_interleave src0, src1, src01, dst0, dst1, dst01 ++ bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01 + .endm + + .macro bilinear_interleave_src_dst \ + mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01 + +- bilinear_interleave_src_dst_&mask_fmt&_&op \ +- numpix, src0, src1, src01, dst0, dst1, dst01 ++ bilinear_interleave_src_dst_\()\mask_fmt\()_\()\op \ ++ \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01 + .endm + + + /* + * Macros for applying masks to src pixels. (see combine_mask_u() function) + * src, dst should be in interleaved form. + * mask register should be in form (m0, m1, m2, m3). 
+ */ +@@ -378,191 +379,191 @@ + numpix, src0, src1, src01, mask, \ + tmp01, tmp23, tmp45, tmp67 + .endm + + .macro bilinear_apply_mask_to_src_8 \ + numpix, src0, src1, src01, mask, \ + tmp01, tmp23, tmp45, tmp67 + +- umull &tmp01&.8h, &src0&.8b, &mask&.8b +- umull &tmp23&.8h, &src1&.8b, &mask&.8b ++ umull \()\tmp01\().8h, \()\src0\().8b, \()\mask\().8b ++ umull \()\tmp23\().8h, \()\src1\().8b, \()\mask\().8b + /* bubbles */ +- urshr &tmp45&.8h, &tmp01&.8h, #8 +- urshr &tmp67&.8h, &tmp23&.8h, #8 ++ urshr \()\tmp45\().8h, \()\tmp01\().8h, #8 ++ urshr \()\tmp67\().8h, \()\tmp23\().8h, #8 + /* bubbles */ +- raddhn &src0&.8b, &tmp45&.8h, &tmp01&.8h +- raddhn &src1&.8b, &tmp67&.8h, &tmp23&.8h +- mov &src01&.d[0], &src0&.d[0] +- mov &src01&.d[1], &src1&.d[0] ++ raddhn \()\src0\().8b, \()\tmp45\().8h, \()\tmp01\().8h ++ raddhn \()\src1\().8b, \()\tmp67\().8h, \()\tmp23\().8h ++ mov \()\src01\().d[0], \()\src0\().d[0] ++ mov \()\src01\().d[1], \()\src1\().d[0] + .endm + + .macro bilinear_apply_mask_to_src \ + mask_fmt, numpix, src0, src1, src01, mask, \ + tmp01, tmp23, tmp45, tmp67 + +- bilinear_apply_mask_to_src_&mask_fmt \ +- numpix, src0, src1, src01, mask, \ +- tmp01, tmp23, tmp45, tmp67 ++ bilinear_apply_mask_to_src_\()\mask_fmt \ ++ \numpix, \src0, \src1, \src01, \mask, \ ++ \tmp01, \tmp23, \tmp45, \tmp67 + .endm + + + /* + * Macros for combining src and destination pixels. + * Interleave or not is depending on operator 'op'. + */ + .macro bilinear_combine_src \ + numpix, src0, src1, src01, dst0, dst1, dst01, \ + tmp01, tmp23, tmp45, tmp67, tmp8 + .endm + + .macro bilinear_combine_over \ + numpix, src0, src1, src01, dst0, dst1, dst01, \ + tmp01, tmp23, tmp45, tmp67, tmp8 + +- dup &tmp8&.2s, &src1&.s[1] ++ dup \()\tmp8\().2s, \()\src1\().s[1] + /* bubbles */ +- mvn &tmp8&.8b, &tmp8&.8b ++ mvn \()\tmp8\().8b, \()\tmp8\().8b + /* bubbles */ +- umull &tmp01&.8h, &dst0&.8b, &tmp8&.8b ++ umull \()\tmp01\().8h, \()\dst0\().8b, \()\tmp8\().8b + /* bubbles */ +- umull &tmp23&.8h, &dst1&.8b, &tmp8&.8b ++ umull \()\tmp23\().8h, \()\dst1\().8b, \()\tmp8\().8b + /* bubbles */ +- urshr &tmp45&.8h, &tmp01&.8h, #8 +- urshr &tmp67&.8h, &tmp23&.8h, #8 ++ urshr \()\tmp45\().8h, \()\tmp01\().8h, #8 ++ urshr \()\tmp67\().8h, \()\tmp23\().8h, #8 + /* bubbles */ +- raddhn &dst0&.8b, &tmp45&.8h, &tmp01&.8h +- raddhn &dst1&.8b, &tmp67&.8h, &tmp23&.8h +- mov &dst01&.d[0], &dst0&.d[0] +- mov &dst01&.d[1], &dst1&.d[0] ++ raddhn \()\dst0\().8b, \()\tmp45\().8h, \()\tmp01\().8h ++ raddhn \()\dst1\().8b, \()\tmp67\().8h, \()\tmp23\().8h ++ mov \()\dst01\().d[0], \()\dst0\().d[0] ++ mov \()\dst01\().d[1], \()\dst1\().d[0] + /* bubbles */ +- uqadd &src0&.8b, &dst0&.8b, &src0&.8b +- uqadd &src1&.8b, &dst1&.8b, &src1&.8b +- mov &src01&.d[0], &src0&.d[0] +- mov &src01&.d[1], &src1&.d[0] ++ uqadd \()\src0\().8b, \()\dst0\().8b, \()\src0\().8b ++ uqadd \()\src1\().8b, \()\dst1\().8b, \()\src1\().8b ++ mov \()\src01\().d[0], \()\src0\().d[0] ++ mov \()\src01\().d[1], \()\src1\().d[0] + .endm + + .macro bilinear_combine_add \ + numpix, src0, src1, src01, dst0, dst1, dst01, \ + tmp01, tmp23, tmp45, tmp67, tmp8 + +- uqadd &src0&.8b, &dst0&.8b, &src0&.8b +- uqadd &src1&.8b, &dst1&.8b, &src1&.8b +- mov &src01&.d[0], &src0&.d[0] +- mov &src01&.d[1], &src1&.d[0] ++ uqadd \()\src0\().8b, \()\dst0\().8b, \()\src0\().8b ++ uqadd \()\src1\().8b, \()\dst1\().8b, \()\src1\().8b ++ mov \()\src01\().d[0], \()\src0\().d[0] ++ mov \()\src01\().d[1], \()\src1\().d[0] + .endm + + .macro bilinear_combine \ + op, numpix, src0, src1, src01, dst0, dst1, 
dst01, \ + tmp01, tmp23, tmp45, tmp67, tmp8 + +- bilinear_combine_&op \ +- numpix, src0, src1, src01, dst0, dst1, dst01, \ +- tmp01, tmp23, tmp45, tmp67, tmp8 ++ bilinear_combine_\()\op \ ++ \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01, \ ++ \tmp01, \tmp23, \tmp45, \tmp67, \tmp8 + .endm + + /* + * Macros for final deinterleaving of destination pixels if needed. + */ + .macro bilinear_deinterleave numpix, dst0, dst1, dst01 +- vuzp &dst0&.8b, &dst1&.8b ++ vuzp \()\dst0\().8b, \()\dst1\().8b + /* bubbles */ +- vuzp &dst0&.8b, &dst1&.8b +- mov &dst01&.d[0], &dst0&.d[0] +- mov &dst01&.d[1], &dst1&.d[0] ++ vuzp \()\dst0\().8b, \()\dst1\().8b ++ mov \()\dst01\().d[0], \()\dst0\().d[0] ++ mov \()\dst01\().d[1], \()\dst1\().d[0] + .endm + + .macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01 + .endm + + .macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01 +- bilinear_deinterleave numpix, dst0, dst1, dst01 ++ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 + .endm + + .macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01 +- bilinear_deinterleave numpix, dst0, dst1, dst01 ++ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 + .endm + + .macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01 +- bilinear_deinterleave numpix, dst0, dst1, dst01 ++ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 + .endm + + .macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01 +- bilinear_deinterleave numpix, dst0, dst1, dst01 ++ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 + .endm + + .macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01 +- bilinear_deinterleave numpix, dst0, dst1, dst01 ++ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 + .endm + + .macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01 +- bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01 ++ bilinear_deinterleave_dst_\()\mask_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01 + .endm + + + .macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op +- bilinear_load_&src_fmt v0, v1, v2 +- bilinear_load_mask mask_fmt, 1, v4 +- bilinear_load_dst dst_fmt, op, 1, v18, v19, v9 ++ bilinear_load_\()\src_fmt v0, v1, v2 ++ bilinear_load_mask \mask_fmt, 1, v4 ++ bilinear_load_dst \dst_fmt, \op, 1, v18, v19, v9 + umull v2.8h, v0.8b, v28.8b + umlal v2.8h, v1.8b, v29.8b + /* 5 cycles bubble */ + ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v0.4s, v2.4h, v15.h[0] + umlal2 v0.4s, v2.8h, v15.h[0] + /* 5 cycles bubble */ +- bilinear_duplicate_mask mask_fmt, 1, v4 ++ bilinear_duplicate_mask \mask_fmt, 1, v4 + shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + /* 3 cycles bubble */ + xtn v0.8b, v0.8h + /* 1 cycle bubble */ + bilinear_interleave_src_dst \ +- mask_fmt, op, 1, v0, v1, v0, v18, v19, v9 ++ \mask_fmt, \op, 1, v0, v1, v0, v18, v19, v9 + bilinear_apply_mask_to_src \ +- mask_fmt, 1, v0, v1, v0, v4, \ ++ \mask_fmt, 1, v0, v1, v0, v4, \ + v3, v8, v10, v11 + bilinear_combine \ +- op, 1, v0, v1, v0, v18, v19, v9, \ ++ \op, 1, v0, v1, v0, v18, v19, v9, \ + v3, v8, v10, v11, v5 +- bilinear_deinterleave_dst mask_fmt, op, 1, v0, v1, v0 +- bilinear_store_&dst_fmt 1, v17, v18 ++ bilinear_deinterleave_dst \mask_fmt, \op, 1, v0, v1, v0 ++ bilinear_store_\()\dst_fmt 1, v17, v18 + .endm + + .macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op +- bilinear_load_and_vertical_interpolate_two_&src_fmt \ ++ bilinear_load_and_vertical_interpolate_two_\()\src_fmt \ + v1, v11, v18, v19, v20, v21, v22, v23 +- 
bilinear_load_mask mask_fmt, 2, v4 +- bilinear_load_dst dst_fmt, op, 2, v18, v19, v9 ++ bilinear_load_mask \mask_fmt, 2, v4 ++ bilinear_load_dst \dst_fmt, \op, 2, v18, v19, v9 + ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v0.4s, v1.4h, v15.h[0] + umlal2 v0.4s, v1.8h, v15.h[0] + ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v10.4s, v11.4h, v15.h[4] + umlal2 v10.4s, v11.8h, v15.h[4] + shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS) +- bilinear_duplicate_mask mask_fmt, 2, v4 ++ bilinear_duplicate_mask \mask_fmt, 2, v4 + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + add v12.8h, v12.8h, v13.8h + xtn v0.8b, v0.8h + bilinear_interleave_src_dst \ +- mask_fmt, op, 2, v0, v1, v0, v18, v19, v9 ++ \mask_fmt, \op, 2, v0, v1, v0, v18, v19, v9 + bilinear_apply_mask_to_src \ +- mask_fmt, 2, v0, v1, v0, v4, \ ++ \mask_fmt, 2, v0, v1, v0, v4, \ + v3, v8, v10, v11 + bilinear_combine \ +- op, 2, v0, v1, v0, v18, v19, v9, \ ++ \op, 2, v0, v1, v0, v18, v19, v9, \ + v3, v8, v10, v11, v5 +- bilinear_deinterleave_dst mask_fmt, op, 2, v0, v1, v0 +- bilinear_store_&dst_fmt 2, v16, v17 ++ bilinear_deinterleave_dst \mask_fmt, \op, 2, v0, v1, v0 ++ bilinear_store_\()\dst_fmt 2, v16, v17 + .endm + + .macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op +- bilinear_load_and_vertical_interpolate_four_&src_fmt \ +- v1, v11, v4, v5, v6, v7, v22, v23 \ ++ bilinear_load_and_vertical_interpolate_four_\()\src_fmt \ ++ v1, v11, v4, v5, v6, v7, v22, v23, \ + v3, v9, v16, v17, v20, v21, v18, v19 + prfm PREFETCH_MODE, [TMP1, PF_OFFS] + sub TMP1, TMP1, STRIDE + prfm PREFETCH_MODE, [TMP1, PF_OFFS] + ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v0.4s, v1.4h, v15.h[0] + umlal2 v0.4s, v1.8h, v15.h[0] + ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS +@@ -575,33 +576,33 @@ + ushll v8.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v8.4s, v9.4h, v15.h[4] + umlal2 v8.4s, v9.8h, v15.h[4] + add v12.8h, v12.8h, v13.8h + shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn2 v2.8h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS) +- bilinear_load_mask mask_fmt, 4, v4 +- bilinear_duplicate_mask mask_fmt, 4, v4 ++ bilinear_load_mask \mask_fmt, 4, v4 ++ bilinear_duplicate_mask \mask_fmt, 4, v4 + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + xtn v0.8b, v0.8h + xtn v1.8b, v2.8h + add v12.8h, v12.8h, v13.8h +- bilinear_load_dst dst_fmt, op, 4, v2, v3, v21 ++ bilinear_load_dst \dst_fmt, \op, 4, v2, v3, v21 + bilinear_interleave_src_dst \ +- mask_fmt, op, 4, v0, v1, v0, v2, v3, v11 ++ \mask_fmt, \op, 4, v0, v1, v0, v2, v3, v11 + bilinear_apply_mask_to_src \ +- mask_fmt, 4, v0, v1, v0, v4, \ ++ \mask_fmt, 4, v0, v1, v0, v4, \ + v6, v8, v9, v10 + bilinear_combine \ +- op, 4, v0, v1, v0, v2, v3, v1, \ ++ \op, 4, v0, v1, v0, v2, v3, v1, \ + v6, v8, v9, v10, v23 +- bilinear_deinterleave_dst mask_fmt, op, 4, v0, v1, v0 +- bilinear_store_&dst_fmt 4, v6, v7 ++ bilinear_deinterleave_dst \mask_fmt, \op, 4, v0, v1, v0 ++ bilinear_store_\()\dst_fmt 4, v6, v7 + .endm + + .set BILINEAR_FLAG_USE_MASK, 1 + .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2 + + /* + * Main template macro for generating NEON optimized bilinear scanline functions. 
+ * +@@ -631,24 +632,24 @@ + bilinear_process_four_pixels, \ + bilinear_process_pixblock_head, \ + bilinear_process_pixblock_tail, \ + bilinear_process_pixblock_tail_head, \ + pixblock_size, \ + prefetch_distance, \ + flags + +-pixman_asm_function fname +-.if pixblock_size == 8 +-.elseif pixblock_size == 4 ++pixman_asm_function \fname ++.if \pixblock_size == 8 ++.elseif \pixblock_size == 4 + .else + .error unsupported pixblock size + .endif + +-.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0 ++.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0 + OUT .req x0 + TOP .req x1 + BOTTOM .req x2 + WT .req x3 + WWT .req w3 + WB .req x4 + WWB .req w4 + X .req w5 +@@ -694,32 +695,32 @@ pixman_asm_function fname + PF_OFFS .req x12 + TMP3 .req x13 + WTMP3 .req w13 + TMP4 .req x14 + WTMP4 .req w14 + STRIDE .req x15 + DUMMY .req x30 + +- .set prefetch_offset, prefetch_distance ++ .set prefetch_offset, \prefetch_distance + + stp x29, x30, [sp, -16]! + mov x29, sp + sub x29, x29, 64 + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + stp x10, x11, [x29, -80] + stp x12, x13, [x29, -96] + stp x14, x15, [x29, -112] + str x8, [x29, -120] + ldr w8, [x29, 16] + sub sp, sp, 120 + .endif + +- mov WTMP1, #prefetch_distance ++ mov WTMP1, #\prefetch_distance + umull PF_OFFS, WTMP1, UX + + sub STRIDE, BOTTOM, TOP + .unreq BOTTOM + + cmp WIDTH, #0 + ble 300f + +@@ -730,73 +731,73 @@ pixman_asm_function fname + mov v25.d[0], v12.d[1] + mov v26.d[0], v13.d[0] + add v25.4h, v25.4h, v26.4h + mov v12.d[1], v25.d[0] + + /* ensure good destination alignment */ + cmp WIDTH, #1 + blt 100f +- tst OUT, #(1 << dst_bpp_shift) ++ tst OUT, #(1 << \dst_bpp_shift) + beq 100f + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + add v12.8h, v12.8h, v13.8h +- bilinear_process_last_pixel ++ \bilinear_process_last_pixel + sub WIDTH, WIDTH, #1 + 100: + add v13.8h, v13.8h, v13.8h + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + add v12.8h, v12.8h, v13.8h + + cmp WIDTH, #2 + blt 100f +- tst OUT, #(1 << (dst_bpp_shift + 1)) ++ tst OUT, #(1 << (\dst_bpp_shift + 1)) + beq 100f +- bilinear_process_two_pixels ++ \bilinear_process_two_pixels + sub WIDTH, WIDTH, #2 + 100: +-.if pixblock_size == 8 ++.if \pixblock_size == 8 + cmp WIDTH, #4 + blt 100f +- tst OUT, #(1 << (dst_bpp_shift + 2)) ++ tst OUT, #(1 << (\dst_bpp_shift + 2)) + beq 100f +- bilinear_process_four_pixels ++ \bilinear_process_four_pixels + sub WIDTH, WIDTH, #4 + 100: + .endif +- subs WIDTH, WIDTH, #pixblock_size ++ subs WIDTH, WIDTH, #\pixblock_size + blt 100f +- asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift) +- bilinear_process_pixblock_head +- subs WIDTH, WIDTH, #pixblock_size ++ asr PF_OFFS, PF_OFFS, #(16 - \src_bpp_shift) ++ \bilinear_process_pixblock_head ++ subs WIDTH, WIDTH, #\pixblock_size + blt 500f + 0: +- bilinear_process_pixblock_tail_head +- subs WIDTH, WIDTH, #pixblock_size ++ \bilinear_process_pixblock_tail_head ++ subs WIDTH, WIDTH, #\pixblock_size + bge 0b + 500: +- bilinear_process_pixblock_tail ++ \bilinear_process_pixblock_tail + 100: +-.if pixblock_size == 8 ++.if \pixblock_size == 8 + tst WIDTH, #4 + beq 200f +- bilinear_process_four_pixels ++ \bilinear_process_four_pixels + 200: + .endif + /* handle the remaining trailing pixels */ + tst WIDTH, #2 + beq 200f +- bilinear_process_two_pixels ++ \bilinear_process_two_pixels + 200: + tst WIDTH, #1 + beq 300f +- bilinear_process_last_pixel ++ \bilinear_process_last_pixel + 300: + +-.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0 ++.if ((\flags) & 
BILINEAR_FLAG_USE_MASK) == 0 + sub x29, x29, 64 + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + ldp x10, x11, [x29, -80] + ldp x12, x13, [x29, -96] + ldp x14, x15, [x29, -112] + mov sp, x29 + ldp x29, x30, [sp], 16 +@@ -824,21 +825,21 @@ 300: + .unreq WIDTH + .unreq TMP1 + .unreq WTMP1 + .unreq TMP2 + .unreq PF_OFFS + .unreq TMP3 + .unreq TMP4 + .unreq STRIDE +-.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0 ++.if ((\flags) & BILINEAR_FLAG_USE_MASK) != 0 + .unreq MASK + .endif + +-.endfunc ++pixman_end_asm_function + + .endm + + /* src_8888_8_8888 */ + .macro bilinear_src_8888_8_8888_process_last_pixel + bilinear_interpolate_last_pixel 8888, 8, 8888, src + .endm + +diff --git a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S +--- a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S ++++ b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S +@@ -262,64 +262,64 @@ + uqadd v18.8b, v0.8b, v22.8b + uqadd v19.8b, v1.8b, v23.8b + shrn v6.8b, v4.8h, #8 + fetch_src_pixblock + shrn v7.8b, v4.8h, #3 + sli v4.8h, v4.8h, #5 + ushll v14.8h, v17.8b, #7 + sli v14.8h, v14.8h, #1 +- PF add PF_X, PF_X, #8 ++ PF add, PF_X, PF_X, #8 + ushll v8.8h, v19.8b, #7 + sli v8.8h, v8.8h, #1 +- PF tst PF_CTL, #0xF ++ PF tst, PF_CTL, #0xF + sri v6.8b, v6.8b, #5 +- PF beq 10f +- PF add PF_X, PF_X, #8 ++ PF beq, 10f ++ PF add, PF_X, PF_X, #8 + 10: + mvn v3.8b, v3.8b +- PF beq 10f +- PF sub PF_CTL, PF_CTL, #1 ++ PF beq, 10f ++ PF sub, PF_CTL, PF_CTL, #1 + 10: + sri v7.8b, v7.8b, #6 + shrn v30.8b, v4.8h, #2 + umull v10.8h, v3.8b, v6.8b +- PF lsl DUMMY, PF_X, #src_bpp_shift +- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] ++ PF lsl, DUMMY, PF_X, #src_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] + umull v11.8h, v3.8b, v7.8b + umull v12.8h, v3.8b, v30.8b +- PF lsl DUMMY, PF_X, #dst_bpp_shift +- PF prfm PREFETCH_MODE, [PF_DST, DUMMY] ++ PF lsl, DUMMY, PF_X, #dst_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] + sri v14.8h, v8.8h, #5 +- PF cmp PF_X, ORIG_W ++ PF cmp, PF_X, ORIG_W + ushll v9.8h, v18.8b, #7 + sli v9.8h, v9.8h, #1 + urshr v17.8h, v10.8h, #8 +- PF ble 10f +- PF sub PF_X, PF_X, ORIG_W ++ PF ble, 10f ++ PF sub, PF_X, PF_X, ORIG_W + 10: + urshr v19.8h, v11.8h, #8 + urshr v18.8h, v12.8h, #8 +- PF ble 10f +- PF subs PF_CTL, PF_CTL, #0x10 ++ PF ble, 10f ++ PF subs, PF_CTL, PF_CTL, #0x10 + 10: + sri v14.8h, v9.8h, #11 + mov v28.d[0], v14.d[0] + mov v29.d[0], v14.d[1] +- PF ble 10f +- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift +- PF ldrsb DUMMY, [PF_SRC, DUMMY] +- PF add PF_SRC, PF_SRC, #1 ++ PF ble, 10f ++ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift ++ PF ldrsb, DUMMY, [PF_SRC, DUMMY] ++ PF add, PF_SRC, PF_SRC, #1 + 10: + raddhn v20.8b, v10.8h, v17.8h + raddhn v23.8b, v11.8h, v19.8h +- PF ble 10f +- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift +- PF ldrsb DUMMY, [PF_DST, DUMMY] +- PF add PF_DST, PF_SRC, #1 ++ PF ble, 10f ++ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift ++ PF ldrsb, DUMMY, [PF_DST, DUMMY] ++ PF add, PF_DST, PF_SRC, #1 + 10: + raddhn v22.8b, v12.8h, v18.8h + st1 {v14.8h}, [DST_W], #16 + .endm + + #else + + /* If we did not care much about the performance, we would just use this... 
*/ +@@ -469,42 +469,42 @@ generate_composite_function \ + sri v14.8h, v8.8h, #5 + sri v14.8h, v9.8h, #11 + mov v28.d[0], v14.d[0] + mov v29.d[0], v14.d[1] + .endm + + .macro pixman_composite_src_8888_0565_process_pixblock_tail_head + sri v14.8h, v8.8h, #5 +- PF add PF_X, PF_X, #8 +- PF tst PF_CTL, #0xF ++ PF add, PF_X, PF_X, #8 ++ PF tst, PF_CTL, #0xF + fetch_src_pixblock +- PF beq 10f +- PF add PF_X, PF_X, #8 +- PF sub PF_CTL, PF_CTL, #1 ++ PF beq, 10f ++ PF add, PF_X, PF_X, #8 ++ PF sub, PF_CTL, PF_CTL, #1 + 10: + sri v14.8h, v9.8h, #11 + mov v28.d[0], v14.d[0] + mov v29.d[0], v14.d[1] +- PF cmp PF_X, ORIG_W +- PF lsl DUMMY, PF_X, #src_bpp_shift +- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] ++ PF cmp, PF_X, ORIG_W ++ PF lsl, DUMMY, PF_X, #src_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] + ushll v8.8h, v1.8b, #7 + sli v8.8h, v8.8h, #1 + st1 {v14.8h}, [DST_W], #16 +- PF ble 10f +- PF sub PF_X, PF_X, ORIG_W +- PF subs PF_CTL, PF_CTL, #0x10 ++ PF ble, 10f ++ PF sub, PF_X, PF_X, ORIG_W ++ PF subs, PF_CTL, PF_CTL, #0x10 + 10: + ushll v14.8h, v2.8b, #7 + sli v14.8h, v14.8h, #1 +- PF ble 10f +- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift +- PF ldrsb DUMMY, [PF_SRC, DUMMY] +- PF add PF_SRC, PF_SRC, #1 ++ PF ble, 10f ++ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift ++ PF ldrsb, DUMMY, [PF_SRC, DUMMY] ++ PF add, PF_SRC, PF_SRC, #1 + 10: + ushll v9.8h, v0.8b, #7 + sli v9.8h, v9.8h, #1 + .endm + + generate_composite_function \ + pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ +@@ -561,41 +561,41 @@ generate_composite_function \ + uqadd v31.8b, v3.8b, v7.8b + .endm + + .macro pixman_composite_add_8_8_process_pixblock_tail + .endm + + .macro pixman_composite_add_8_8_process_pixblock_tail_head + fetch_src_pixblock +- PF add PF_X, PF_X, #32 +- PF tst PF_CTL, #0xF ++ PF add, PF_X, PF_X, #32 ++ PF tst, PF_CTL, #0xF + ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 +- PF beq 10f +- PF add PF_X, PF_X, #32 +- PF sub PF_CTL, PF_CTL, #1 ++ PF beq, 10f ++ PF add, PF_X, PF_X, #32 ++ PF sub, PF_CTL, PF_CTL, #1 + 10: + st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 +- PF cmp PF_X, ORIG_W +- PF lsl DUMMY, PF_X, #src_bpp_shift +- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] +- PF lsl DUMMY, PF_X, #dst_bpp_shift +- PF prfm PREFETCH_MODE, [PF_DST, DUMMY] +- PF ble 10f +- PF sub PF_X, PF_X, ORIG_W +- PF subs PF_CTL, PF_CTL, #0x10 ++ PF cmp, PF_X, ORIG_W ++ PF lsl, DUMMY, PF_X, #src_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] ++ PF lsl, DUMMY, PF_X, #dst_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] ++ PF ble, 10f ++ PF sub, PF_X, PF_X, ORIG_W ++ PF subs, PF_CTL, PF_CTL, #0x10 + 10: + uqadd v28.8b, v0.8b, v4.8b +- PF ble 10f +- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift +- PF ldrsb DUMMY, [PF_SRC, DUMMY] +- PF add PF_SRC, PF_SRC, #1 +- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift +- PF ldrsb DUMMY, [PF_DST, DUMMY] +- PF add PF_DST, PF_DST, #1 ++ PF ble, 10f ++ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift ++ PF ldrsb, DUMMY, [PF_SRC, DUMMY] ++ PF add, PF_SRC, PF_SRC, #1 ++ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift ++ PF ldrsb, DUMMY, [PF_DST, DUMMY] ++ PF add, PF_DST, PF_DST, #1 + 10: + uqadd v29.8b, v1.8b, v5.8b + uqadd v30.8b, v2.8b, v6.8b + uqadd v31.8b, v3.8b, v7.8b + .endm + + generate_composite_function \ + pixman_composite_add_8_8_asm_neon, 8, 0, 8, \ +@@ -607,41 +607,41 @@ generate_composite_function \ + pixman_composite_add_8_8_process_pixblock_head, \ + pixman_composite_add_8_8_process_pixblock_tail, \ + pixman_composite_add_8_8_process_pixblock_tail_head 
+ + /******************************************************************************/ + + .macro pixman_composite_add_8888_8888_process_pixblock_tail_head + fetch_src_pixblock +- PF add PF_X, PF_X, #8 +- PF tst PF_CTL, #0xF ++ PF add, PF_X, PF_X, #8 ++ PF tst, PF_CTL, #0xF + ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 +- PF beq 10f +- PF add PF_X, PF_X, #8 +- PF sub PF_CTL, PF_CTL, #1 ++ PF beq, 10f ++ PF add, PF_X, PF_X, #8 ++ PF sub, PF_CTL, PF_CTL, #1 + 10: + st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 +- PF cmp PF_X, ORIG_W +- PF lsl DUMMY, PF_X, #src_bpp_shift +- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] +- PF lsl DUMMY, PF_X, #dst_bpp_shift +- PF prfm PREFETCH_MODE, [PF_DST, DUMMY] +- PF ble 10f +- PF sub PF_X, PF_X, ORIG_W +- PF subs PF_CTL, PF_CTL, #0x10 ++ PF cmp, PF_X, ORIG_W ++ PF lsl, DUMMY, PF_X, #src_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] ++ PF lsl, DUMMY, PF_X, #dst_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] ++ PF ble, 10f ++ PF sub, PF_X, PF_X, ORIG_W ++ PF subs, PF_CTL, PF_CTL, #0x10 + 10: + uqadd v28.8b, v0.8b, v4.8b +- PF ble 10f +- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift +- PF ldrsb DUMMY, [PF_SRC, DUMMY] +- PF add PF_SRC, PF_SRC, #1 +- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift +- PF ldrsb DUMMY, [PF_DST, DUMMY] +- PF add PF_DST, PF_DST, #1 ++ PF ble, 10f ++ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift ++ PF ldrsb, DUMMY, [PF_SRC, DUMMY] ++ PF add, PF_SRC, PF_SRC, #1 ++ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift ++ PF ldrsb, DUMMY, [PF_DST, DUMMY] ++ PF add, PF_DST, PF_DST, #1 + 10: + uqadd v29.8b, v1.8b, v5.8b + uqadd v30.8b, v2.8b, v6.8b + uqadd v31.8b, v3.8b, v7.8b + .endm + + generate_composite_function \ + pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \ +@@ -684,55 +684,55 @@ generate_composite_function_single_scanl + raddhn v29.8b, v15.8h, v9.8h + raddhn v30.8b, v16.8h, v10.8h + raddhn v31.8b, v17.8h, v11.8h + .endm + + .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head + ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 + urshr v14.8h, v8.8h, #8 +- PF add PF_X, PF_X, #8 +- PF tst PF_CTL, #0xF ++ PF add, PF_X, PF_X, #8 ++ PF tst, PF_CTL, #0xF + urshr v15.8h, v9.8h, #8 + urshr v16.8h, v10.8h, #8 + urshr v17.8h, v11.8h, #8 +- PF beq 10f +- PF add PF_X, PF_X, #8 +- PF sub PF_CTL, PF_CTL, #1 ++ PF beq, 10f ++ PF add, PF_X, PF_X, #8 ++ PF sub, PF_CTL, PF_CTL, #1 + 10: + raddhn v28.8b, v14.8h, v8.8h + raddhn v29.8b, v15.8h, v9.8h +- PF cmp PF_X, ORIG_W ++ PF cmp, PF_X, ORIG_W + raddhn v30.8b, v16.8h, v10.8h + raddhn v31.8b, v17.8h, v11.8h + fetch_src_pixblock +- PF lsl DUMMY, PF_X, #src_bpp_shift +- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] ++ PF lsl, DUMMY, PF_X, #src_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] + mvn v22.8b, v3.8b +- PF lsl DUMMY, PF_X, #dst_bpp_shift +- PF prfm PREFETCH_MODE, [PF_DST, DUMMY] ++ PF lsl, DUMMY, PF_X, #dst_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 +- PF ble 10f +- PF sub PF_X, PF_X, ORIG_W ++ PF ble, 10f ++ PF sub, PF_X, PF_X, ORIG_W + 10: + umull v8.8h, v22.8b, v4.8b +- PF ble 10f +- PF subs PF_CTL, PF_CTL, #0x10 ++ PF ble, 10f ++ PF subs, PF_CTL, PF_CTL, #0x10 + 10: + umull v9.8h, v22.8b, v5.8b +- PF ble 10f +- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift +- PF ldrsb DUMMY, [PF_SRC, DUMMY] +- PF add PF_SRC, PF_SRC, #1 ++ PF ble, 10f ++ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift ++ PF ldrsb, DUMMY, [PF_SRC, DUMMY] ++ PF add, PF_SRC, PF_SRC, #1 + 10: + umull v10.8h, v22.8b, v6.8b +- PF ble 10f +- PF lsl DUMMY, DST_STRIDE, 
#dst_bpp_shift +- PF ldrsb DUMMY, [PF_DST, DUMMY] +- PF add PF_DST, PF_DST, #1 ++ PF ble, 10f ++ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift ++ PF ldrsb, DUMMY, [PF_DST, DUMMY] ++ PF add, PF_DST, PF_DST, #1 + 10: + umull v11.8h, v22.8b, v7.8b + .endm + + generate_composite_function_single_scanline \ + pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ +@@ -754,59 +754,59 @@ generate_composite_function_single_scanl + uqadd v29.8b, v1.8b, v29.8b + uqadd v30.8b, v2.8b, v30.8b + uqadd v31.8b, v3.8b, v31.8b + .endm + + .macro pixman_composite_over_8888_8888_process_pixblock_tail_head + ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 + urshr v14.8h, v8.8h, #8 +- PF add PF_X, PF_X, #8 +- PF tst PF_CTL, #0xF ++ PF add, PF_X, PF_X, #8 ++ PF tst, PF_CTL, #0xF + urshr v15.8h, v9.8h, #8 + urshr v16.8h, v10.8h, #8 + urshr v17.8h, v11.8h, #8 +- PF beq 10f +- PF add PF_X, PF_X, #8 +- PF sub PF_CTL, PF_CTL, #1 ++ PF beq, 10f ++ PF add, PF_X, PF_X, #8 ++ PF sub, PF_CTL, PF_CTL, #1 + 10: + raddhn v28.8b, v14.8h, v8.8h + raddhn v29.8b, v15.8h, v9.8h +- PF cmp PF_X, ORIG_W ++ PF cmp, PF_X, ORIG_W + raddhn v30.8b, v16.8h, v10.8h + raddhn v31.8b, v17.8h, v11.8h + uqadd v28.8b, v0.8b, v28.8b + uqadd v29.8b, v1.8b, v29.8b + uqadd v30.8b, v2.8b, v30.8b + uqadd v31.8b, v3.8b, v31.8b + fetch_src_pixblock +- PF lsl DUMMY, PF_X, #src_bpp_shift +- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] ++ PF lsl, DUMMY, PF_X, #src_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] + mvn v22.8b, v3.8b +- PF lsl DUMMY, PF_X, #dst_bpp_shift +- PF prfm PREFETCH_MODE, [PF_DST, DUMMY] ++ PF lsl, DUMMY, PF_X, #dst_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 +- PF ble 10f +- PF sub PF_X, PF_X, ORIG_W ++ PF ble, 10f ++ PF sub, PF_X, PF_X, ORIG_W + 10: + umull v8.8h, v22.8b, v4.8b +- PF ble 10f +- PF subs PF_CTL, PF_CTL, #0x10 ++ PF ble, 10f ++ PF subs, PF_CTL, PF_CTL, #0x10 + 10: + umull v9.8h, v22.8b, v5.8b +- PF ble 10f +- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift +- PF ldrsb DUMMY, [PF_SRC, DUMMY] +- PF add PF_SRC, PF_SRC, #1 ++ PF ble, 10f ++ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift ++ PF ldrsb, DUMMY, [PF_SRC, DUMMY] ++ PF add, PF_SRC, PF_SRC, #1 + 10: + umull v10.8h, v22.8b, v6.8b +- PF ble 10f +- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift +- PF ldrsb DUMMY, [PF_DST, DUMMY] +- PF add PF_DST, PF_DST, #1 ++ PF ble, 10f ++ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift ++ PF ldrsb, DUMMY, [PF_DST, DUMMY] ++ PF add, PF_DST, PF_DST, #1 + 10: + umull v11.8h, v22.8b, v7.8b + .endm + + generate_composite_function \ + pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ +@@ -860,40 +860,40 @@ generate_composite_function_single_scanl + urshr v16.8h, v10.8h, #8 + urshr v17.8h, v11.8h, #8 + raddhn v28.8b, v14.8h, v8.8h + raddhn v29.8b, v15.8h, v9.8h + raddhn v30.8b, v16.8h, v10.8h + raddhn v31.8b, v17.8h, v11.8h + ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 + uqadd v28.8b, v0.8b, v28.8b +- PF add PF_X, PF_X, #8 +- PF tst PF_CTL, #0x0F +- PF beq 10f +- PF add PF_X, PF_X, #8 +- PF sub PF_CTL, PF_CTL, #1 ++ PF add, PF_X, PF_X, #8 ++ PF tst, PF_CTL, #0x0F ++ PF beq, 10f ++ PF add, PF_X, PF_X, #8 ++ PF sub, PF_CTL, PF_CTL, #1 + 10: + uqadd v29.8b, v1.8b, v29.8b + uqadd v30.8b, v2.8b, v30.8b + uqadd v31.8b, v3.8b, v31.8b +- PF cmp PF_X, ORIG_W ++ PF cmp, PF_X, ORIG_W + umull 
v8.8h, v24.8b, v4.8b +- PF lsl DUMMY, PF_X, #dst_bpp_shift +- PF prfm PREFETCH_MODE, [PF_DST, DUMMY] ++ PF lsl, DUMMY, PF_X, #dst_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] + umull v9.8h, v24.8b, v5.8b +- PF ble 10f +- PF sub PF_X, PF_X, ORIG_W ++ PF ble, 10f ++ PF sub, PF_X, PF_X, ORIG_W + 10: + umull v10.8h, v24.8b, v6.8b +- PF subs PF_CTL, PF_CTL, #0x10 ++ PF subs, PF_CTL, PF_CTL, #0x10 + umull v11.8h, v24.8b, v7.8b +- PF ble 10f +- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift +- PF ldrsb DUMMY, [PF_DST, DUMMY] +- PF add PF_DST, PF_DST, #1 ++ PF ble, 10f ++ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift ++ PF ldrsb, DUMMY, [PF_DST, DUMMY] ++ PF add, PF_DST, PF_DST, #1 + 10: + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 + .endm + + .macro pixman_composite_over_n_8888_init + mov v3.s[0], w4 + dup v0.8b, v3.b[0] + dup v1.8b, v3.b[1] +@@ -912,52 +912,52 @@ generate_composite_function \ + pixman_composite_over_8888_8888_process_pixblock_head, \ + pixman_composite_over_8888_8888_process_pixblock_tail, \ + pixman_composite_over_n_8888_process_pixblock_tail_head + + /******************************************************************************/ + + .macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head + urshr v14.8h, v8.8h, #8 +- PF add PF_X, PF_X, #8 +- PF tst PF_CTL, #0xF ++ PF add, PF_X, PF_X, #8 ++ PF tst, PF_CTL, #0xF + urshr v15.8h, v9.8h, #8 + urshr v12.8h, v10.8h, #8 + urshr v13.8h, v11.8h, #8 +- PF beq 10f +- PF add PF_X, PF_X, #8 +- PF sub PF_CTL, PF_CTL, #1 ++ PF beq, 10f ++ PF add, PF_X, PF_X, #8 ++ PF sub, PF_CTL, PF_CTL, #1 + 10: + raddhn v28.8b, v14.8h, v8.8h + raddhn v29.8b, v15.8h, v9.8h +- PF cmp PF_X, ORIG_W ++ PF cmp, PF_X, ORIG_W + raddhn v30.8b, v12.8h, v10.8h + raddhn v31.8b, v13.8h, v11.8h + uqadd v28.8b, v0.8b, v28.8b + uqadd v29.8b, v1.8b, v29.8b + uqadd v30.8b, v2.8b, v30.8b + uqadd v31.8b, v3.8b, v31.8b + ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_R], #32 + mvn v22.8b, v3.8b +- PF lsl DUMMY, PF_X, #dst_bpp_shift +- PF prfm PREFETCH_MODE, [PF_DST, DUMMY] ++ PF lsl, DUMMY, PF_X, #dst_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 +- PF blt 10f +- PF sub PF_X, PF_X, ORIG_W ++ PF blt, 10f ++ PF sub, PF_X, PF_X, ORIG_W + 10: + umull v8.8h, v22.8b, v4.8b +- PF blt 10f +- PF subs PF_CTL, PF_CTL, #0x10 ++ PF blt, 10f ++ PF subs, PF_CTL, PF_CTL, #0x10 + 10: + umull v9.8h, v22.8b, v5.8b + umull v10.8h, v22.8b, v6.8b +- PF blt 10f +- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift +- PF ldrsb DUMMY, [PF_DST, DUMMY] +- PF add PF_DST, PF_DST, #1 ++ PF blt, 10f ++ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift ++ PF ldrsb, DUMMY, [PF_DST, DUMMY] ++ PF add, PF_DST, PF_DST, #1 + 10: + umull v11.8h, v22.8b, v7.8b + .endm + + .macro pixman_composite_over_reverse_n_8888_init + mov v7.s[0], w4 + dup v4.8b, v7.b[0] + dup v5.8b, v7.b[1] +@@ -1405,45 +1405,45 @@ generate_composite_function \ + rshrn v28.8b, v8.8h, #8 + rshrn v29.8b, v9.8h, #8 + rshrn v30.8b, v10.8h, #8 + rshrn v31.8b, v11.8h, #8 + .endm + + .macro pixman_composite_src_n_8_8888_process_pixblock_tail_head + fetch_mask_pixblock +- PF add PF_X, PF_X, #8 ++ PF add, PF_X, PF_X, #8 + rshrn v28.8b, v8.8h, #8 +- PF tst PF_CTL, #0x0F ++ PF tst, PF_CTL, #0x0F + rshrn v29.8b, v9.8h, #8 +- PF beq 10f +- PF add PF_X, PF_X, #8 ++ PF beq, 10f ++ PF add, PF_X, PF_X, #8 + 10: + rshrn v30.8b, v10.8h, #8 +- PF beq 10f +- PF sub PF_CTL, PF_CTL, #1 ++ PF beq, 10f ++ PF sub, PF_CTL, PF_CTL, #1 + 10: + rshrn v31.8b, v11.8h, #8 +- PF cmp PF_X, ORIG_W ++ PF cmp, PF_X, ORIG_W + 
umull v8.8h, v24.8b, v0.8b +- PF lsl DUMMY, PF_X, #mask_bpp_shift +- PF prfm PREFETCH_MODE, [PF_MASK, DUMMY] ++ PF lsl, DUMMY, PF_X, #mask_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY] + umull v9.8h, v24.8b, v1.8b +- PF ble 10f +- PF sub PF_X, PF_X, ORIG_W ++ PF ble, 10f ++ PF sub, PF_X, PF_X, ORIG_W + 10: + umull v10.8h, v24.8b, v2.8b +- PF ble 10f +- PF subs PF_CTL, PF_CTL, #0x10 ++ PF ble, 10f ++ PF subs, PF_CTL, PF_CTL, #0x10 + 10: + umull v11.8h, v24.8b, v3.8b +- PF ble 10f +- PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift +- PF ldrsb DUMMY, [PF_MASK, DUMMY] +- PF add PF_MASK, PF_MASK, #1 ++ PF ble, 10f ++ PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift ++ PF ldrsb, DUMMY, [PF_MASK, DUMMY] ++ PF add, PF_MASK, PF_MASK, #1 + 10: + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 + ursra v8.8h, v8.8h, #8 + ursra v9.8h, v9.8h, #8 + ursra v10.8h, v10.8h, #8 + ursra v11.8h, v11.8h, #8 + .endm + +@@ -1486,45 +1486,45 @@ generate_composite_function \ + rshrn v28.8b, v0.8h, #8 + rshrn v29.8b, v1.8h, #8 + rshrn v30.8b, v2.8h, #8 + rshrn v31.8b, v3.8h, #8 + .endm + + .macro pixman_composite_src_n_8_8_process_pixblock_tail_head + fetch_mask_pixblock +- PF add PF_X, PF_X, #8 ++ PF add, PF_X, PF_X, #8 + rshrn v28.8b, v0.8h, #8 +- PF tst PF_CTL, #0x0F ++ PF tst, PF_CTL, #0x0F + rshrn v29.8b, v1.8h, #8 +- PF beq 10f +- PF add PF_X, PF_X, #8 ++ PF beq, 10f ++ PF add, PF_X, PF_X, #8 + 10: + rshrn v30.8b, v2.8h, #8 +- PF beq 10f +- PF sub PF_CTL, PF_CTL, #1 ++ PF beq, 10f ++ PF sub, PF_CTL, PF_CTL, #1 + 10: + rshrn v31.8b, v3.8h, #8 +- PF cmp PF_X, ORIG_W ++ PF cmp, PF_X, ORIG_W + umull v0.8h, v24.8b, v16.8b +- PF lsl DUMMY, PF_X, mask_bpp_shift +- PF prfm PREFETCH_MODE, [PF_MASK, DUMMY] ++ PF lsl, DUMMY, PF_X, mask_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY] + umull v1.8h, v25.8b, v16.8b +- PF ble 10f +- PF sub PF_X, PF_X, ORIG_W ++ PF ble, 10f ++ PF sub, PF_X, PF_X, ORIG_W + 10: + umull v2.8h, v26.8b, v16.8b +- PF ble 10f +- PF subs PF_CTL, PF_CTL, #0x10 ++ PF ble, 10f ++ PF subs, PF_CTL, PF_CTL, #0x10 + 10: + umull v3.8h, v27.8b, v16.8b +- PF ble 10f +- PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift +- PF ldrsb DUMMY, [PF_MASK, DUMMY] +- PF add PF_MASK, PF_MASK, #1 ++ PF ble, 10f ++ PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift ++ PF ldrsb, DUMMY, [PF_MASK, DUMMY] ++ PF add, PF_MASK, PF_MASK, #1 + 10: + st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 + ursra v0.8h, v0.8h, #8 + ursra v1.8h, v1.8h, #8 + ursra v2.8h, v2.8h, #8 + ursra v3.8h, v3.8h, #8 + .endm + +@@ -1594,54 +1594,54 @@ generate_composite_function \ + .endm + + .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head + urshr v16.8h, v12.8h, #8 + ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 + urshr v17.8h, v13.8h, #8 + fetch_mask_pixblock + urshr v18.8h, v14.8h, #8 +- PF add PF_X, PF_X, #8 ++ PF add, PF_X, PF_X, #8 + urshr v19.8h, v15.8h, #8 +- PF tst PF_CTL, #0x0F ++ PF tst, PF_CTL, #0x0F + raddhn v28.8b, v16.8h, v12.8h +- PF beq 10f +- PF add PF_X, PF_X, #8 ++ PF beq, 10f ++ PF add, PF_X, PF_X, #8 + 10: + raddhn v29.8b, v17.8h, v13.8h +- PF beq 10f +- PF sub PF_CTL, PF_CTL, #1 ++ PF beq, 10f ++ PF sub, PF_CTL, PF_CTL, #1 + 10: + raddhn v30.8b, v18.8h, v14.8h +- PF cmp PF_X, ORIG_W ++ PF cmp, PF_X, ORIG_W + raddhn v31.8b, v19.8h, v15.8h +- PF lsl DUMMY, PF_X, #dst_bpp_shift +- PF prfm PREFETCH_MODE, [PF_DST, DUMMY] ++ PF lsl, DUMMY, PF_X, #dst_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] + umull v16.8h, v24.8b, v8.8b +- PF lsl DUMMY, PF_X, #mask_bpp_shift +- PF prfm PREFETCH_MODE, [PF_MASK, DUMMY] ++ PF lsl, 
DUMMY, PF_X, #mask_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY] + umull v17.8h, v24.8b, v9.8b +- PF ble 10f +- PF sub PF_X, PF_X, ORIG_W ++ PF ble, 10f ++ PF sub, PF_X, PF_X, ORIG_W + 10: + umull v18.8h, v24.8b, v10.8b +- PF ble 10f +- PF subs PF_CTL, PF_CTL, #0x10 ++ PF ble, 10f ++ PF subs, PF_CTL, PF_CTL, #0x10 + 10: + umull v19.8h, v24.8b, v11.8b +- PF ble 10f +- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift +- PF ldrsb DUMMY, [PF_DST, DUMMY] +- PF add PF_DST, PF_DST, #1 ++ PF ble, 10f ++ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift ++ PF ldrsb, DUMMY, [PF_DST, DUMMY] ++ PF add, PF_DST, PF_DST, #1 + 10: + uqadd v28.8b, v0.8b, v28.8b +- PF ble 10f +- PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift +- PF ldrsb DUMMY, [PF_MASK, DUMMY] +- PF add PF_MASK, PF_MASK, #1 ++ PF ble, 10f ++ PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift ++ PF ldrsb, DUMMY, [PF_MASK, DUMMY] ++ PF add, PF_MASK, PF_MASK, #1 + 10: + uqadd v29.8b, v1.8b, v29.8b + uqadd v30.8b, v2.8b, v30.8b + uqadd v31.8b, v3.8b, v31.8b + urshr v12.8h, v16.8h, #8 + urshr v13.8h, v17.8h, #8 + urshr v14.8h, v18.8h, #8 + urshr v15.8h, v19.8h, #8 +@@ -2407,17 +2407,17 @@ generate_composite_function \ + generate_composite_function_single_scanline \ + pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init_need_all_regs, \ + default_cleanup_need_all_regs, \ + pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \ + pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \ +- pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \ ++ pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 12 /* mask_basereg */ + + /******************************************************************************/ + + .macro pixman_composite_over_8888_n_8888_process_pixblock_head +@@ -2482,31 +2482,31 @@ generate_composite_function \ + pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + default_init_need_all_regs, \ + default_cleanup_need_all_regs, \ + pixman_composite_over_8888_n_8888_process_pixblock_head, \ + pixman_composite_over_8888_n_8888_process_pixblock_tail, \ +- pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \ ++ pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 12 /* mask_basereg */ + + generate_composite_function_single_scanline \ + pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init_need_all_regs, \ + default_cleanup_need_all_regs, \ + pixman_composite_over_8888_n_8888_process_pixblock_head, \ + pixman_composite_over_8888_n_8888_process_pixblock_tail, \ +- pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \ ++ pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 12 /* mask_basereg */ + + /******************************************************************************/ + + /* TODO: expand macros and do better instructions scheduling */ +@@ -2524,17 +2524,17 @@ generate_composite_function \ + 
pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + default_init_need_all_regs, \ + default_cleanup_need_all_regs, \ + pixman_composite_over_8888_n_8888_process_pixblock_head, \ + pixman_composite_over_8888_n_8888_process_pixblock_tail, \ +- pixman_composite_over_8888_8_8888_process_pixblock_tail_head \ ++ pixman_composite_over_8888_8_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 15 /* mask_basereg */ + + /******************************************************************************/ + + .macro pixman_composite_src_0888_0888_process_pixblock_head +@@ -2675,38 +2675,38 @@ generate_composite_function \ + urshr v11.8h, v8.8h, #8 + mov v30.8b, v31.8b + mov v31.8b, v3.8b + mov v3.8b, v31.8b + urshr v12.8h, v9.8h, #8 + urshr v13.8h, v10.8h, #8 + fetch_src_pixblock + raddhn v30.8b, v11.8h, v8.8h +- PF add PF_X, PF_X, #8 +- PF tst PF_CTL, #0xF +- PF beq 10f +- PF add PF_X, PF_X, #8 +- PF sub PF_CTL, PF_CTL, #1 ++ PF add, PF_X, PF_X, #8 ++ PF tst, PF_CTL, #0xF ++ PF beq, 10f ++ PF add, PF_X, PF_X, #8 ++ PF sub, PF_CTL, PF_CTL, #1 + 10: + raddhn v29.8b, v12.8h, v9.8h + raddhn v28.8b, v13.8h, v10.8h + umull v8.8h, v3.8b, v0.8b + umull v9.8h, v3.8b, v1.8b + umull v10.8h, v3.8b, v2.8b + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 +- PF cmp PF_X, ORIG_W +- PF lsl DUMMY, PF_X, src_bpp_shift +- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] +- PF ble 10f +- PF sub PF_X, PF_X, ORIG_W +- PF subs PF_CTL, PF_CTL, #0x10 +- PF ble 10f +- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift +- PF ldrsb DUMMY, [PF_SRC, DUMMY] +- PF add PF_SRC, PF_SRC, #1 ++ PF cmp, PF_X, ORIG_W ++ PF lsl, DUMMY, PF_X, src_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] ++ PF ble, 10f ++ PF sub, PF_X, PF_X, ORIG_W ++ PF subs, PF_CTL, PF_CTL, #0x10 ++ PF ble, 10f ++ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift ++ PF ldrsb, DUMMY, [PF_SRC, DUMMY] ++ PF add, PF_SRC, PF_SRC, #1 + 10: + .endm + + generate_composite_function \ + pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ +@@ -2744,38 +2744,38 @@ generate_composite_function \ + urshr v11.8h, v8.8h, #8 + mov v30.8b, v31.8b + mov v31.8b, v3.8b + mov v3.8b, v30.8b + urshr v12.8h, v9.8h, #8 + urshr v13.8h, v10.8h, #8 + fetch_src_pixblock + raddhn v28.8b, v11.8h, v8.8h +- PF add PF_X, PF_X, #8 +- PF tst PF_CTL, #0xF +- PF beq 10f +- PF add PF_X, PF_X, #8 +- PF sub PF_CTL, PF_CTL, #1 ++ PF add, PF_X, PF_X, #8 ++ PF tst, PF_CTL, #0xF ++ PF beq, 10f ++ PF add, PF_X, PF_X, #8 ++ PF sub, PF_CTL, PF_CTL, #1 + 10: + raddhn v29.8b, v12.8h, v9.8h + raddhn v30.8b, v13.8h, v10.8h + umull v8.8h, v3.8b, v0.8b + umull v9.8h, v3.8b, v1.8b + umull v10.8h, v3.8b, v2.8b + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 +- PF cmp PF_X, ORIG_W +- PF lsl DUMMY, PF_X, src_bpp_shift +- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] +- PF ble 10f +- PF sub PF_X, PF_X, ORIG_W +- PF subs PF_CTL, PF_CTL, #0x10 +- PF ble 10f +- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift +- PF ldrsb DUMMY, [PF_SRC, DUMMY] +- PF add PF_SRC, PF_SRC, #1 ++ PF cmp, PF_X, ORIG_W ++ PF lsl, DUMMY, PF_X, src_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] ++ PF ble, 10f ++ PF sub, PF_X, PF_X, ORIG_W ++ PF subs, PF_CTL, PF_CTL, #0x10 ++ PF ble, 10f ++ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift 
++ PF ldrsb, DUMMY, [PF_SRC, DUMMY] ++ PF add, PF_SRC, PF_SRC, #1 + 10: + .endm + + generate_composite_function \ + pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ +@@ -3126,197 +3126,197 @@ generate_composite_function_nearest_scan + * format conversion, and interpolation as separate macros which can be used + * as the basic building blocks for constructing bilinear scanline functions. + */ + + .macro bilinear_load_8888 reg1, reg2, tmp + asr TMP1, X, #16 + add X, X, UX + add TMP1, TOP, TMP1, lsl #2 +- ld1 {®1&.2s}, [TMP1], STRIDE +- ld1 {®2&.2s}, [TMP1] ++ ld1 {\()\reg1\().2s}, [TMP1], STRIDE ++ ld1 {\()\reg2\().2s}, [TMP1] + .endm + + .macro bilinear_load_0565 reg1, reg2, tmp + asr TMP1, X, #16 + add X, X, UX + add TMP1, TOP, TMP1, lsl #1 +- ld1 {®2&.s}[0], [TMP1], STRIDE +- ld1 {®2&.s}[1], [TMP1] +- convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp ++ ld1 {\()\reg2\().s}[0], [TMP1], STRIDE ++ ld1 {\()\reg2\().s}[1], [TMP1] ++ convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp + .endm + + .macro bilinear_load_and_vertical_interpolate_two_8888 \ + acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2 + +- bilinear_load_8888 reg1, reg2, tmp1 +- umull &acc1&.8h, ®1&.8b, v28.8b +- umlal &acc1&.8h, ®2&.8b, v29.8b +- bilinear_load_8888 reg3, reg4, tmp2 +- umull &acc2&.8h, ®3&.8b, v28.8b +- umlal &acc2&.8h, ®4&.8b, v29.8b ++ bilinear_load_8888 \reg1, \reg2, \tmp1 ++ umull \()\acc1\().8h, \()\reg1\().8b, v28.8b ++ umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b ++ bilinear_load_8888 \reg3, \reg4, \tmp2 ++ umull \()\acc2\().8h, \()\reg3\().8b, v28.8b ++ umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b + .endm + + .macro bilinear_load_and_vertical_interpolate_four_8888 \ +- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ ++ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \ + yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi + + bilinear_load_and_vertical_interpolate_two_8888 \ +- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi ++ \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi + bilinear_load_and_vertical_interpolate_two_8888 \ +- yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi ++ \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi + .endm + + .macro vzip reg1, reg2 + umov TMP4, v31.d[0] +- zip1 v31.8b, reg1, reg2 +- zip2 reg2, reg1, reg2 +- mov reg1, v31.8b ++ zip1 v31.8b, \reg1, \reg2 ++ zip2 \reg2, \reg1, \reg2 ++ mov \reg1, v31.8b + mov v31.d[0], TMP4 + .endm + + .macro vuzp reg1, reg2 + umov TMP4, v31.d[0] +- uzp1 v31.8b, reg1, reg2 +- uzp2 reg2, reg1, reg2 +- mov reg1, v31.8b ++ uzp1 v31.8b, \reg1, \reg2 ++ uzp2 \reg2, \reg1, \reg2 ++ mov \reg1, v31.8b + mov v31.d[0], TMP4 + .endm + + .macro bilinear_load_and_vertical_interpolate_two_0565 \ + acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi + asr TMP1, X, #16 + add X, X, UX + add TMP1, TOP, TMP1, lsl #1 + asr TMP2, X, #16 + add X, X, UX + add TMP2, TOP, TMP2, lsl #1 +- ld1 {&acc2&.s}[0], [TMP1], STRIDE +- ld1 {&acc2&.s}[2], [TMP2], STRIDE +- ld1 {&acc2&.s}[1], [TMP1] +- ld1 {&acc2&.s}[3], [TMP2] +- convert_0565_to_x888 acc2, reg3, reg2, reg1 +- vzip ®1&.8b, ®3&.8b +- vzip ®2&.8b, ®4&.8b +- vzip ®3&.8b, ®4&.8b +- vzip ®1&.8b, ®2&.8b +- umull &acc1&.8h, ®1&.8b, v28.8b +- umlal &acc1&.8h, ®2&.8b, v29.8b +- umull &acc2&.8h, ®3&.8b, v28.8b +- umlal &acc2&.8h, ®4&.8b, v29.8b ++ ld1 {\()\acc2\().s}[0], [TMP1], 
STRIDE ++ ld1 {\()\acc2\().s}[2], [TMP2], STRIDE ++ ld1 {\()\acc2\().s}[1], [TMP1] ++ ld1 {\()\acc2\().s}[3], [TMP2] ++ convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1 ++ vzip \()\reg1\().8b, \()\reg3\().8b ++ vzip \()\reg2\().8b, \()\reg4\().8b ++ vzip \()\reg3\().8b, \()\reg4\().8b ++ vzip \()\reg1\().8b, \()\reg2\().8b ++ umull \()\acc1\().8h, \()\reg1\().8b, v28.8b ++ umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b ++ umull \()\acc2\().8h, \()\reg3\().8b, v28.8b ++ umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b + .endm + + .macro bilinear_load_and_vertical_interpolate_four_0565 \ +- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ ++ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \ + yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi + asr TMP1, X, #16 + add X, X, UX + add TMP1, TOP, TMP1, lsl #1 + asr TMP2, X, #16 + add X, X, UX + add TMP2, TOP, TMP2, lsl #1 +- ld1 {&xacc2&.s}[0], [TMP1], STRIDE +- ld1 {&xacc2&.s}[2], [TMP2], STRIDE +- ld1 {&xacc2&.s}[1], [TMP1] +- ld1 {&xacc2&.s}[3], [TMP2] +- convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1 ++ ld1 {\()\xacc2\().s}[0], [TMP1], STRIDE ++ ld1 {\()\xacc2\().s}[2], [TMP2], STRIDE ++ ld1 {\()\xacc2\().s}[1], [TMP1] ++ ld1 {\()\xacc2\().s}[3], [TMP2] ++ convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1 + asr TMP1, X, #16 + add X, X, UX + add TMP1, TOP, TMP1, lsl #1 + asr TMP2, X, #16 + add X, X, UX + add TMP2, TOP, TMP2, lsl #1 +- ld1 {&yacc2&.s}[0], [TMP1], STRIDE +- vzip &xreg1&.8b, &xreg3&.8b +- ld1 {&yacc2&.s}[2], [TMP2], STRIDE +- vzip &xreg2&.8b, &xreg4&.8b +- ld1 {&yacc2&.s}[1], [TMP1] +- vzip &xreg3&.8b, &xreg4&.8b +- ld1 {&yacc2&.s}[3], [TMP2] +- vzip &xreg1&.8b, &xreg2&.8b +- convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1 +- umull &xacc1&.8h, &xreg1&.8b, v28.8b +- vzip &yreg1&.8b, &yreg3&.8b +- umlal &xacc1&.8h, &xreg2&.8b, v29.8b +- vzip &yreg2&.8b, &yreg4&.8b +- umull &xacc2&.8h, &xreg3&.8b, v28.8b +- vzip &yreg3&.8b, &yreg4&.8b +- umlal &xacc2&.8h, &xreg4&.8b, v29.8b +- vzip &yreg1&.8b, &yreg2&.8b +- umull &yacc1&.8h, &yreg1&.8b, v28.8b +- umlal &yacc1&.8h, &yreg2&.8b, v29.8b +- umull &yacc2&.8h, &yreg3&.8b, v28.8b +- umlal &yacc2&.8h, &yreg4&.8b, v29.8b ++ ld1 {\()\yacc2\().s}[0], [TMP1], STRIDE ++ vzip \()\xreg1\().8b, \()\xreg3\().8b ++ ld1 {\()\yacc2\().s}[2], [TMP2], STRIDE ++ vzip \()\xreg2\().8b, \()\xreg4\().8b ++ ld1 {\()\yacc2\().s}[1], [TMP1] ++ vzip \()\xreg3\().8b, \()\xreg4\().8b ++ ld1 {\()\yacc2\().s}[3], [TMP2] ++ vzip \()\xreg1\().8b, \()\xreg2\().8b ++ convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1 ++ umull \()\xacc1\().8h, \()\xreg1\().8b, v28.8b ++ vzip \()\yreg1\().8b, \()\yreg3\().8b ++ umlal \()\xacc1\().8h, \()\xreg2\().8b, v29.8b ++ vzip \()\yreg2\().8b, \()\yreg4\().8b ++ umull \()\xacc2\().8h, \()\xreg3\().8b, v28.8b ++ vzip \()\yreg3\().8b, \()\yreg4\().8b ++ umlal \()\xacc2\().8h, \()\xreg4\().8b, v29.8b ++ vzip \()\yreg1\().8b, \()\yreg2\().8b ++ umull \()\yacc1\().8h, \()\yreg1\().8b, v28.8b ++ umlal \()\yacc1\().8h, \()\yreg2\().8b, v29.8b ++ umull \()\yacc2\().8h, \()\yreg3\().8b, v28.8b ++ umlal \()\yacc2\().8h, \()\yreg4\().8b, v29.8b + .endm + + .macro bilinear_store_8888 numpix, tmp1, tmp2 +-.if numpix == 4 ++.if \numpix == 4 + st1 {v0.2s, v1.2s}, [OUT], #16 +-.elseif numpix == 2 ++.elseif \numpix == 2 + st1 {v0.2s}, [OUT], #8 +-.elseif numpix == 1 ++.elseif \numpix == 1 + st1 {v0.s}[0], [OUT], #4 + .else +- .error bilinear_store_8888 numpix is unsupported ++ .error bilinear_store_8888 \numpix is unsupported + .endif + .endm + + .macro bilinear_store_0565 
numpix, tmp1, tmp2 + vuzp v0.8b, v1.8b + vuzp v2.8b, v3.8b + vuzp v1.8b, v3.8b + vuzp v0.8b, v2.8b +- convert_8888_to_0565 v2, v1, v0, v1, tmp1, tmp2 +-.if numpix == 4 ++ convert_8888_to_0565 v2, v1, v0, v1, \tmp1, \tmp2 ++.if \numpix == 4 + st1 {v1.4h}, [OUT], #8 +-.elseif numpix == 2 ++.elseif \numpix == 2 + st1 {v1.s}[0], [OUT], #4 +-.elseif numpix == 1 ++.elseif \numpix == 1 + st1 {v1.h}[0], [OUT], #2 + .else +- .error bilinear_store_0565 numpix is unsupported ++ .error bilinear_store_0565 \numpix is unsupported + .endif + .endm + + .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt +- bilinear_load_&src_fmt v0, v1, v2 ++ bilinear_load_\()\src_fmt v0, v1, v2 + umull v2.8h, v0.8b, v28.8b + umlal v2.8h, v1.8b, v29.8b + /* 5 cycles bubble */ + ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v0.4s, v2.4h, v15.h[0] + umlal2 v0.4s, v2.8h, v15.h[0] + /* 5 cycles bubble */ + shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + /* 3 cycles bubble */ + xtn v0.8b, v0.8h + /* 1 cycle bubble */ +- bilinear_store_&dst_fmt 1, v3, v4 ++ bilinear_store_\()\dst_fmt 1, v3, v4 + .endm + + .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt +- bilinear_load_and_vertical_interpolate_two_&src_fmt \ ++ bilinear_load_and_vertical_interpolate_two_\()\src_fmt \ + v1, v11, v2, v3, v20, v21, v22, v23 + ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v0.4s, v1.4h, v15.h[0] + umlal2 v0.4s, v1.8h, v15.h[0] + ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v10.4s, v11.4h, v15.h[4] + umlal2 v10.4s, v11.8h, v15.h[4] + shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + add v12.8h, v12.8h, v13.8h + xtn v0.8b, v0.8h +- bilinear_store_&dst_fmt 2, v3, v4 ++ bilinear_store_\()\dst_fmt 2, v3, v4 + .endm + + .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt +- bilinear_load_and_vertical_interpolate_four_&src_fmt \ +- v1, v11, v14, v20, v16, v17, v22, v23 \ ++ bilinear_load_and_vertical_interpolate_four_\()\src_fmt \ ++ v1, v11, v14, v20, v16, v17, v22, v23, \ + v3, v9, v24, v25, v26, v27, v18, v19 + prfm PREFETCH_MODE, [TMP1, PF_OFFS] + sub TMP1, TMP1, STRIDE + ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v0.4s, v1.4h, v15.h[0] + umlal2 v0.4s, v1.8h, v15.h[0] + ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v10.4s, v11.4h, v15.h[4] +@@ -3333,64 +3333,64 @@ generate_composite_function_nearest_scan + shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn2 v2.8h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + xtn v0.8b, v0.8h + xtn v1.8b, v2.8h + add v12.8h, v12.8h, v13.8h +- bilinear_store_&dst_fmt 4, v3, v4 ++ bilinear_store_\()\dst_fmt 4, v3, v4 + .endm + + .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt +-.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt +- bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head ++.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt ++ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_head + .else +- bilinear_interpolate_four_pixels src_fmt, dst_fmt ++ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt + .endif + .endm + + .macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt +-.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt +- 
bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail ++.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt ++ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail + .endif + .endm + + .macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt +-.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt +- bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head ++.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt ++ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head + .else +- bilinear_interpolate_four_pixels src_fmt, dst_fmt ++ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt + .endif + .endm + + .macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt +-.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt +- bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head ++.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt ++ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_head + .else +- bilinear_interpolate_four_pixels_head src_fmt, dst_fmt +- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt ++ bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt ++ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt + .endif + .endm + + .macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt +-.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt +- bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail ++.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt ++ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail + .else +- bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt ++ bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt + .endif + .endm + + .macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt +-.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt +- bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head ++.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt ++ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head + .else +- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt +- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt ++ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt ++ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt + .endif + .endm + + .set BILINEAR_FLAG_UNROLL_4, 0 + .set BILINEAR_FLAG_UNROLL_8, 1 + .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2 + + /* +@@ -3405,17 +3405,17 @@ generate_composite_function_nearest_scan + * prefetch_distance - prefetch in the source image by that many + * pixels ahead + */ + + .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \ + src_bpp_shift, dst_bpp_shift, \ + prefetch_distance, flags + +-pixman_asm_function fname ++pixman_asm_function \fname + OUT .req x0 + TOP .req x1 + BOTTOM .req x2 + WT .req x3 + WB .req x4 + X .req x5 + UX .req x6 + WIDTH .req x7 +@@ -3437,17 +3437,17 @@ pixman_asm_function fname + sub sp, sp, 112 /* push all registers */ + sub x29, x29, 64 + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32 + stp x8, x9, [x29, -80] + stp x10, x11, [x29, -96] + stp x12, x13, [x29, -112] + +- mov PF_OFFS, #prefetch_distance ++ mov PF_OFFS, #\prefetch_distance + mul PF_OFFS, PF_OFFS, UX + + subs STRIDE, BOTTOM, TOP + .unreq BOTTOM + + cmp WIDTH, #0 + ble 300f + +@@ -3458,85 +3458,85 @@ pixman_asm_function fname + mov v25.d[0], 
v12.d[1] + mov v26.d[0], v13.d[0] + add v25.4h, v25.4h, v26.4h + mov v12.d[1], v25.d[0] + + /* ensure good destination alignment */ + cmp WIDTH, #1 + blt 100f +- tst OUT, #(1 << dst_bpp_shift) ++ tst OUT, #(1 << \dst_bpp_shift) + beq 100f + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + add v12.8h, v12.8h, v13.8h +- bilinear_interpolate_last_pixel src_fmt, dst_fmt ++ bilinear_interpolate_last_pixel \src_fmt, \dst_fmt + sub WIDTH, WIDTH, #1 + 100: + add v13.8h, v13.8h, v13.8h + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + add v12.8h, v12.8h, v13.8h + + cmp WIDTH, #2 + blt 100f +- tst OUT, #(1 << (dst_bpp_shift + 1)) ++ tst OUT, #(1 << (\dst_bpp_shift + 1)) + beq 100f +- bilinear_interpolate_two_pixels src_fmt, dst_fmt ++ bilinear_interpolate_two_pixels \src_fmt, \dst_fmt + sub WIDTH, WIDTH, #2 + 100: +-.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0 ++.if ((\flags) & BILINEAR_FLAG_UNROLL_8) != 0 + /*********** 8 pixels per iteration *****************/ + cmp WIDTH, #4 + blt 100f +- tst OUT, #(1 << (dst_bpp_shift + 2)) ++ tst OUT, #(1 << (\dst_bpp_shift + 2)) + beq 100f +- bilinear_interpolate_four_pixels src_fmt, dst_fmt ++ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt + sub WIDTH, WIDTH, #4 + 100: + subs WIDTH, WIDTH, #8 + blt 100f +- asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift) +- bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt ++ asr PF_OFFS, PF_OFFS, #(16 - \src_bpp_shift) ++ bilinear_interpolate_eight_pixels_head \src_fmt, \dst_fmt + subs WIDTH, WIDTH, #8 + blt 500f + 1000: +- bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt ++ bilinear_interpolate_eight_pixels_tail_head \src_fmt, \dst_fmt + subs WIDTH, WIDTH, #8 + bge 1000b + 500: +- bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt ++ bilinear_interpolate_eight_pixels_tail \src_fmt, \dst_fmt + 100: + tst WIDTH, #4 + beq 200f +- bilinear_interpolate_four_pixels src_fmt, dst_fmt ++ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt + 200: + .else + /*********** 4 pixels per iteration *****************/ + subs WIDTH, WIDTH, #4 + blt 100f +- asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift) +- bilinear_interpolate_four_pixels_head src_fmt, dst_fmt ++ asr PF_OFFS, PF_OFFS, #(16 - \src_bpp_shift) ++ bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt + subs WIDTH, WIDTH, #4 + blt 500f + 1000: +- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt ++ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt + subs WIDTH, WIDTH, #4 + bge 1000b + 500: +- bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt ++ bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt + 100: + /****************************************************/ + .endif + /* handle the remaining trailing pixels */ + tst WIDTH, #2 + beq 200f +- bilinear_interpolate_two_pixels src_fmt, dst_fmt ++ bilinear_interpolate_two_pixels \src_fmt, \dst_fmt + 200: + tst WIDTH, #1 + beq 300f +- bilinear_interpolate_last_pixel src_fmt, dst_fmt ++ bilinear_interpolate_last_pixel \src_fmt, \dst_fmt + 300: + sub x29, x29, 64 + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32 + ldp x8, x9, [x29, -80] + ldp x10, x11, [x29, -96] + ldp x12, x13, [x29, -104] + mov sp, x29 +@@ -3551,17 +3551,17 @@ 300: + .unreq UX + .unreq WIDTH + .unreq TMP1 + .unreq TMP2 + .unreq PF_OFFS + .unreq TMP3 + .unreq TMP4 + .unreq STRIDE +-.endfunc ++pixman_end_asm_function + + .endm + + /*****************************************************************************/ + + .set 
have_bilinear_interpolate_four_pixels_8888_8888, 1 + + .macro bilinear_interpolate_four_pixels_8888_8888_head +diff --git a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h +--- a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h ++++ b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h +@@ -75,340 +75,340 @@ + #define PREFETCH_MODE pldl1keep + + /* + * Definitions of supplementary pixld/pixst macros (for partial load/store of + * pixel data). + */ + + .macro pixldst1 op, elem_size, reg1, mem_operand, abits +- op {v®1&.&elem_size}, [&mem_operand&], #8 ++ \op {v\()\reg1\().\()\elem_size}, [\()\mem_operand\()], #8 + .endm + + .macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits +- op {v®1&.&elem_size, v®2&.&elem_size}, [&mem_operand&], #16 ++ \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size}, [\()\mem_operand\()], #16 + .endm + + .macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits +- op {v®1&.&elem_size, v®2&.&elem_size, v®3&.&elem_size, v®4&.&elem_size}, [&mem_operand&], #32 ++ \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size, v\()\reg4\().\()\elem_size}, [\()\mem_operand\()], #32 + .endm + + .macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits, bytes +- op {v®1&.&elem_size}[idx], [&mem_operand&], #&bytes& ++ \op {v\()\reg1\().\()\elem_size}[\idx], [\()\mem_operand\()], #\()\bytes\() + .endm + + .macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand +- op {v®1&.&elem_size, v®2&.&elem_size, v®3&.&elem_size}, [&mem_operand&], #24 ++ \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size}, [\()\mem_operand\()], #24 + .endm + + .macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand +- op {v®1&.&elem_size, v®2&.&elem_size, v®3&.&elem_size}[idx], [&mem_operand&], #3 ++ \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size}[\idx], [\()\mem_operand\()], #3 + .endm + + .macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits +-.if numbytes == 32 +- .if elem_size==32 +- pixldst4 op, 2s, %(basereg+4), %(basereg+5), \ +- %(basereg+6), %(basereg+7), mem_operand, abits +- .elseif elem_size==16 +- pixldst4 op, 4h, %(basereg+4), %(basereg+5), \ +- %(basereg+6), %(basereg+7), mem_operand, abits ++.if \numbytes == 32 ++ .if \elem_size==32 ++ pixldst4 \op, 2s, %(\basereg+4), %(\basereg+5), \ ++ %(\basereg+6), %(\basereg+7), \mem_operand, \abits ++ .elseif \elem_size==16 ++ pixldst4 \op, 4h, %(\basereg+4), %(\basereg+5), \ ++ %(\basereg+6), %(\basereg+7), \mem_operand, \abits + .else +- pixldst4 op, 8b, %(basereg+4), %(basereg+5), \ +- %(basereg+6), %(basereg+7), mem_operand, abits ++ pixldst4 \op, 8b, %(\basereg+4), %(\basereg+5), \ ++ %(\basereg+6), %(\basereg+7), \mem_operand, \abits + .endif +-.elseif numbytes == 16 +- .if elem_size==32 +- pixldst2 op, 2s, %(basereg+2), %(basereg+3), mem_operand, abits +- .elseif elem_size==16 +- pixldst2 op, 4h, %(basereg+2), %(basereg+3), mem_operand, abits ++.elseif \numbytes == 16 ++ .if \elem_size==32 ++ pixldst2 \op, 2s, %(\basereg+2), %(\basereg+3), \mem_operand, \abits ++ .elseif \elem_size==16 ++ pixldst2 \op, 4h, %(\basereg+2), %(\basereg+3), \mem_operand, \abits + .else +- pixldst2 op, 8b, %(basereg+2), %(basereg+3), mem_operand, abits ++ pixldst2 \op, 8b, %(\basereg+2), %(\basereg+3), \mem_operand, \abits + .endif +-.elseif numbytes == 8 +- .if elem_size==32 +- pixldst1 op, 2s, %(basereg+1), mem_operand, abits +- .elseif 
elem_size==16 +- pixldst1 op, 4h, %(basereg+1), mem_operand, abits ++.elseif \numbytes == 8 ++ .if \elem_size==32 ++ pixldst1 \op, 2s, %(\basereg+1), \mem_operand, \abits ++ .elseif \elem_size==16 ++ pixldst1 \op, 4h, %(\basereg+1), \mem_operand, \abits + .else +- pixldst1 op, 8b, %(basereg+1), mem_operand, abits ++ pixldst1 \op, 8b, %(\basereg+1), \mem_operand, \abits + .endif +-.elseif numbytes == 4 +- .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32) +- pixldst0 op, s, %(basereg+0), 1, mem_operand, abits, 4 +- .elseif elem_size == 16 +- pixldst0 op, h, %(basereg+0), 2, mem_operand, abits, 2 +- pixldst0 op, h, %(basereg+0), 3, mem_operand, abits, 2 ++.elseif \numbytes == 4 ++ .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 32) ++ pixldst0 \op, s, %(\basereg+0), 1, \mem_operand, \abits, 4 ++ .elseif \elem_size == 16 ++ pixldst0 \op, h, %(\basereg+0), 2, \mem_operand, \abits, 2 ++ pixldst0 \op, h, %(\basereg+0), 3, \mem_operand, \abits, 2 + .else +- pixldst0 op, b, %(basereg+0), 4, mem_operand, abits, 1 +- pixldst0 op, b, %(basereg+0), 5, mem_operand, abits, 1 +- pixldst0 op, b, %(basereg+0), 6, mem_operand, abits, 1 +- pixldst0 op, b, %(basereg+0), 7, mem_operand, abits, 1 ++ pixldst0 \op, b, %(\basereg+0), 4, \mem_operand, \abits, 1 ++ pixldst0 \op, b, %(\basereg+0), 5, \mem_operand, \abits, 1 ++ pixldst0 \op, b, %(\basereg+0), 6, \mem_operand, \abits, 1 ++ pixldst0 \op, b, %(\basereg+0), 7, \mem_operand, \abits, 1 + .endif +-.elseif numbytes == 2 +- .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16) +- pixldst0 op, h, %(basereg+0), 1, mem_operand, abits, 2 ++.elseif \numbytes == 2 ++ .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 16) ++ pixldst0 \op, h, %(\basereg+0), 1, \mem_operand, \abits, 2 + .else +- pixldst0 op, b, %(basereg+0), 2, mem_operand, abits, 1 +- pixldst0 op, b, %(basereg+0), 3, mem_operand, abits, 1 ++ pixldst0 \op, b, %(\basereg+0), 2, \mem_operand, \abits, 1 ++ pixldst0 \op, b, %(\basereg+0), 3, \mem_operand, \abits, 1 + .endif +-.elseif numbytes == 1 +- pixldst0 op, b, %(basereg+0), 1, mem_operand, abits, 1 ++.elseif \numbytes == 1 ++ pixldst0 \op, b, %(\basereg+0), 1, \mem_operand, \abits, 1 + .else +- .error "unsupported size: numbytes" ++ .error "unsupported size: \numbytes" + .endif + .endm + + .macro pixld numpix, bpp, basereg, mem_operand, abits=0 +-.if bpp > 0 +-.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) +- pixldst4 ld4, 8b, %(basereg+4), %(basereg+5), \ +- %(basereg+6), %(basereg+7), mem_operand, abits +-.elseif (bpp == 24) && (numpix == 8) +- pixldst3 ld3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand +-.elseif (bpp == 24) && (numpix == 4) +- pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand +- pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand +- pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand +- pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand +-.elseif (bpp == 24) && (numpix == 2) +- pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand +- pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand +-.elseif (bpp == 24) && (numpix == 1) +- pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand ++.if \bpp > 0 ++.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) ++ pixldst4 ld4, 8b, %(\basereg+4), %(\basereg+5), \ ++ %(\basereg+6), %(\basereg+7), \mem_operand, \abits ++.elseif (\bpp == 24) && (\numpix == 8) ++ pixldst3 ld3, 8b, 
%(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand ++.elseif (\bpp == 24) && (\numpix == 4) ++ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand ++ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand ++ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand ++ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand ++.elseif (\bpp == 24) && (\numpix == 2) ++ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand ++ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand ++.elseif (\bpp == 24) && (\numpix == 1) ++ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand + .else +- pixldst %(numpix * bpp / 8), ld1, %(bpp), basereg, mem_operand, abits ++ pixldst %(\numpix * \bpp / 8), ld1, %(\bpp), \basereg, \mem_operand, \abits + .endif + .endif + .endm + + .macro pixst numpix, bpp, basereg, mem_operand, abits=0 +-.if bpp > 0 +-.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) +- pixldst4 st4, 8b, %(basereg+4), %(basereg+5), \ +- %(basereg+6), %(basereg+7), mem_operand, abits +-.elseif (bpp == 24) && (numpix == 8) +- pixldst3 st3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand +-.elseif (bpp == 24) && (numpix == 4) +- pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand +- pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand +- pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand +- pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand +-.elseif (bpp == 24) && (numpix == 2) +- pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand +- pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand +-.elseif (bpp == 24) && (numpix == 1) +- pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand +-.elseif numpix * bpp == 32 && abits == 32 +- pixldst 4, st1, 32, basereg, mem_operand, abits +-.elseif numpix * bpp == 16 && abits == 16 +- pixldst 2, st1, 16, basereg, mem_operand, abits ++.if \bpp > 0 ++.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) ++ pixldst4 st4, 8b, %(\basereg+4), %(\basereg+5), \ ++ %(\basereg+6), %(\basereg+7), \mem_operand, \abits ++.elseif (\bpp == 24) && (\numpix == 8) ++ pixldst3 st3, 8b, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand ++.elseif (\bpp == 24) && (\numpix == 4) ++ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand ++ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand ++ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand ++ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand ++.elseif (\bpp == 24) && (\numpix == 2) ++ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand ++ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand ++.elseif (\bpp == 24) && (\numpix == 1) ++ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand ++.elseif \numpix * \bpp == 32 && \abits == 32 ++ pixldst 4, st1, 32, \basereg, \mem_operand, \abits ++.elseif \numpix * \bpp == 16 && \abits == 16 ++ pixldst 2, st1, 16, \basereg, \mem_operand, \abits + .else +- pixldst %(numpix * bpp / 8), st1, %(bpp), basereg, mem_operand, abits ++ pixldst %(\numpix * \bpp / 8), st1, %(\bpp), \basereg, \mem_operand, \abits + .endif + .endif + 
.endm + + .macro pixld_a numpix, bpp, basereg, mem_operand +-.if (bpp * numpix) <= 128 +- pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix) ++.if (\bpp * \numpix) <= 128 ++ pixld \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix) + .else +- pixld numpix, bpp, basereg, mem_operand, 128 ++ pixld \numpix, \bpp, \basereg, \mem_operand, 128 + .endif + .endm + + .macro pixst_a numpix, bpp, basereg, mem_operand +-.if (bpp * numpix) <= 128 +- pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix) ++.if (\bpp * \numpix) <= 128 ++ pixst \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix) + .else +- pixst numpix, bpp, basereg, mem_operand, 128 ++ pixst \numpix, \bpp, \basereg, \mem_operand, 128 + .endif + .endm + + /* + * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register + * aliases to be defined) + */ + .macro pixld1_s elem_size, reg1, mem_operand +-.if elem_size == 16 ++.if \elem_size == 16 + asr TMP1, VX, #16 + adds VX, VX, UNIT_X + bmi 55f + 5: subs VX, VX, SRC_WIDTH_FIXED + bpl 5b + 55: +- add TMP1, mem_operand, TMP1, lsl #1 ++ add TMP1, \mem_operand, TMP1, lsl #1 + asr TMP2, VX, #16 + adds VX, VX, UNIT_X + bmi 55f + 5: subs VX, VX, SRC_WIDTH_FIXED + bpl 5b + 55: +- add TMP2, mem_operand, TMP2, lsl #1 +- ld1 {v®1&.h}[0], [TMP1] ++ add TMP2, \mem_operand, TMP2, lsl #1 ++ ld1 {v\()\reg1\().h}[0], [TMP1] + asr TMP1, VX, #16 + adds VX, VX, UNIT_X + bmi 55f + 5: subs VX, VX, SRC_WIDTH_FIXED + bpl 5b + 55: +- add TMP1, mem_operand, TMP1, lsl #1 +- ld1 {v®1&.h}[1], [TMP2] ++ add TMP1, \mem_operand, TMP1, lsl #1 ++ ld1 {v\()\reg1\().h}[1], [TMP2] + asr TMP2, VX, #16 + adds VX, VX, UNIT_X + bmi 55f + 5: subs VX, VX, SRC_WIDTH_FIXED + bpl 5b + 55: +- add TMP2, mem_operand, TMP2, lsl #1 +- ld1 {v®1&.h}[2], [TMP1] +- ld1 {v®1&.h}[3], [TMP2] +-.elseif elem_size == 32 ++ add TMP2, \mem_operand, TMP2, lsl #1 ++ ld1 {v\()\reg1\().h}[2], [TMP1] ++ ld1 {v\()\reg1\().h}[3], [TMP2] ++.elseif \elem_size == 32 + asr TMP1, VX, #16 + adds VX, VX, UNIT_X + bmi 55f + 5: subs VX, VX, SRC_WIDTH_FIXED + bpl 5b + 55: +- add TMP1, mem_operand, TMP1, lsl #2 ++ add TMP1, \mem_operand, TMP1, lsl #2 + asr TMP2, VX, #16 + adds VX, VX, UNIT_X + bmi 55f + 5: subs VX, VX, SRC_WIDTH_FIXED + bpl 5b + 55: +- add TMP2, mem_operand, TMP2, lsl #2 +- ld1 {v®1&.s}[0], [TMP1] +- ld1 {v®1&.s}[1], [TMP2] ++ add TMP2, \mem_operand, TMP2, lsl #2 ++ ld1 {v\()\reg1\().s}[0], [TMP1] ++ ld1 {v\()\reg1\().s}[1], [TMP2] + .else + .error "unsupported" + .endif + .endm + + .macro pixld2_s elem_size, reg1, reg2, mem_operand +-.if 0 /* elem_size == 32 */ ++.if 0 /* \elem_size == 32 */ + mov TMP1, VX, asr #16 + add VX, VX, UNIT_X, asl #1 +- add TMP1, mem_operand, TMP1, asl #2 ++ add TMP1, \mem_operand, TMP1, asl #2 + mov TMP2, VX, asr #16 + sub VX, VX, UNIT_X +- add TMP2, mem_operand, TMP2, asl #2 +- ld1 {v®1&.s}[0], [TMP1] ++ add TMP2, \mem_operand, TMP2, asl #2 ++ ld1 {v\()\reg1\().s}[0], [TMP1] + mov TMP1, VX, asr #16 + add VX, VX, UNIT_X, asl #1 +- add TMP1, mem_operand, TMP1, asl #2 +- ld1 {v®2&.s}[0], [TMP2, :32] ++ add TMP1, \mem_operand, TMP1, asl #2 ++ ld1 {v\()\reg2\().s}[0], [TMP2, :32] + mov TMP2, VX, asr #16 + add VX, VX, UNIT_X +- add TMP2, mem_operand, TMP2, asl #2 +- ld1 {v®1&.s}[1], [TMP1] +- ld1 {v®2&.s}[1], [TMP2] ++ add TMP2, \mem_operand, TMP2, asl #2 ++ ld1 {v\()\reg1\().s}[1], [TMP1] ++ ld1 {v\()\reg2\().s}[1], [TMP2] + .else +- pixld1_s elem_size, reg1, mem_operand +- pixld1_s elem_size, reg2, mem_operand ++ pixld1_s \elem_size, \reg1, \mem_operand ++ pixld1_s \elem_size, \reg2, \mem_operand 
+ .endif + .endm + + .macro pixld0_s elem_size, reg1, idx, mem_operand +-.if elem_size == 16 ++.if \elem_size == 16 + asr TMP1, VX, #16 + adds VX, VX, UNIT_X + bmi 55f + 5: subs VX, VX, SRC_WIDTH_FIXED + bpl 5b + 55: +- add TMP1, mem_operand, TMP1, lsl #1 +- ld1 {v®1&.h}[idx], [TMP1] +-.elseif elem_size == 32 ++ add TMP1, \mem_operand, TMP1, lsl #1 ++ ld1 {v\()\reg1\().h}[\idx], [TMP1] ++.elseif \elem_size == 32 + asr DUMMY, VX, #16 + mov TMP1, DUMMY + adds VX, VX, UNIT_X + bmi 55f + 5: subs VX, VX, SRC_WIDTH_FIXED + bpl 5b + 55: +- add TMP1, mem_operand, TMP1, lsl #2 +- ld1 {v®1&.s}[idx], [TMP1] ++ add TMP1, \mem_operand, TMP1, lsl #2 ++ ld1 {v\()\reg1\().s}[\idx], [TMP1] + .endif + .endm + + .macro pixld_s_internal numbytes, elem_size, basereg, mem_operand +-.if numbytes == 32 +- pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand +- pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand +- pixdeinterleave elem_size, %(basereg+4) +-.elseif numbytes == 16 +- pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand +-.elseif numbytes == 8 +- pixld1_s elem_size, %(basereg+1), mem_operand +-.elseif numbytes == 4 +- .if elem_size == 32 +- pixld0_s elem_size, %(basereg+0), 1, mem_operand +- .elseif elem_size == 16 +- pixld0_s elem_size, %(basereg+0), 2, mem_operand +- pixld0_s elem_size, %(basereg+0), 3, mem_operand ++.if \numbytes == 32 ++ pixld2_s \elem_size, %(\basereg+4), %(\basereg+5), \mem_operand ++ pixld2_s \elem_size, %(\basereg+6), %(\basereg+7), \mem_operand ++ pixdeinterleave \elem_size, %(\basereg+4) ++.elseif \numbytes == 16 ++ pixld2_s \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand ++.elseif \numbytes == 8 ++ pixld1_s \elem_size, %(\basereg+1), \mem_operand ++.elseif \numbytes == 4 ++ .if \elem_size == 32 ++ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand ++ .elseif \elem_size == 16 ++ pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand ++ pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand + .else +- pixld0_s elem_size, %(basereg+0), 4, mem_operand +- pixld0_s elem_size, %(basereg+0), 5, mem_operand +- pixld0_s elem_size, %(basereg+0), 6, mem_operand +- pixld0_s elem_size, %(basereg+0), 7, mem_operand ++ pixld0_s \elem_size, %(\basereg+0), 4, \mem_operand ++ pixld0_s \elem_size, %(\basereg+0), 5, \mem_operand ++ pixld0_s \elem_size, %(\basereg+0), 6, \mem_operand ++ pixld0_s \elem_size, %(\basereg+0), 7, \mem_operand + .endif +-.elseif numbytes == 2 +- .if elem_size == 16 +- pixld0_s elem_size, %(basereg+0), 1, mem_operand ++.elseif \numbytes == 2 ++ .if \elem_size == 16 ++ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand + .else +- pixld0_s elem_size, %(basereg+0), 2, mem_operand +- pixld0_s elem_size, %(basereg+0), 3, mem_operand ++ pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand ++ pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand + .endif +-.elseif numbytes == 1 +- pixld0_s elem_size, %(basereg+0), 1, mem_operand ++.elseif \numbytes == 1 ++ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand + .else +- .error "unsupported size: numbytes" ++ .error "unsupported size: \numbytes" + .endif + .endm + + .macro pixld_s numpix, bpp, basereg, mem_operand +-.if bpp > 0 +- pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand ++.if \bpp > 0 ++ pixld_s_internal %(\numpix * \bpp / 8), %(\bpp), \basereg, \mem_operand + .endif + .endm + + .macro vuzp8 reg1, reg2 + umov DUMMY, v16.d[0] +- uzp1 v16.8b, v®1&.8b, v®2&.8b +- uzp2 v®2&.8b, v®1&.8b, v®2&.8b +- mov v®1&.8b, v16.8b ++ uzp1 v16.8b, v\()\reg1\().8b, v\()\reg2\().8b ++ uzp2 
v\()\reg2\().8b, v\()\reg1\().8b, v\()\reg2\().8b ++ mov v\()\reg1\().8b, v16.8b + mov v16.d[0], DUMMY + .endm + + .macro vzip8 reg1, reg2 + umov DUMMY, v16.d[0] +- zip1 v16.8b, v®1&.8b, v®2&.8b +- zip2 v®2&.8b, v®1&.8b, v®2&.8b +- mov v®1&.8b, v16.8b ++ zip1 v16.8b, v\()\reg1\().8b, v\()\reg2\().8b ++ zip2 v\()\reg2\().8b, v\()\reg1\().8b, v\()\reg2\().8b ++ mov v\()\reg1\().8b, v16.8b + mov v16.d[0], DUMMY + .endm + + /* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */ + .macro pixdeinterleave bpp, basereg +-.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) +- vuzp8 %(basereg+0), %(basereg+1) +- vuzp8 %(basereg+2), %(basereg+3) +- vuzp8 %(basereg+1), %(basereg+3) +- vuzp8 %(basereg+0), %(basereg+2) ++.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) ++ vuzp8 %(\basereg+0), %(\basereg+1) ++ vuzp8 %(\basereg+2), %(\basereg+3) ++ vuzp8 %(\basereg+1), %(\basereg+3) ++ vuzp8 %(\basereg+0), %(\basereg+2) + .endif + .endm + + /* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */ + .macro pixinterleave bpp, basereg +-.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) +- vzip8 %(basereg+0), %(basereg+2) +- vzip8 %(basereg+1), %(basereg+3) +- vzip8 %(basereg+2), %(basereg+3) +- vzip8 %(basereg+0), %(basereg+1) ++.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) ++ vzip8 %(\basereg+0), %(\basereg+2) ++ vzip8 %(\basereg+1), %(\basereg+3) ++ vzip8 %(\basereg+2), %(\basereg+3) ++ vzip8 %(\basereg+0), %(\basereg+1) + .endif + .endm + + /* + * This is a macro for implementing cache preload. The main idea is that + * cache preload logic is mostly independent from the rest of pixels + * processing code. It starts at the top left pixel and moves forward + * across pixels and can jump across scanlines. Prefetch distance is +@@ -432,62 +432,62 @@ 55: + * for almost zero cost! + * + * (*) The overhead of the prefetcher is visible when running some trivial + * pixels processing like simple copy. Anyway, having prefetch is a must + * when working with the graphics data. 
+ */ + .macro PF a, x:vararg + .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED) +- a x ++ \a \x + .endif + .endm + + .macro cache_preload std_increment, boost_increment + .if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0) +-.if std_increment != 0 +- PF add PF_X, PF_X, #std_increment ++.if \std_increment != 0 ++ PF add, PF_X, PF_X, #\std_increment + .endif +- PF tst PF_CTL, #0xF +- PF beq 71f +- PF add PF_X, PF_X, #boost_increment +- PF sub PF_CTL, PF_CTL, #1 ++ PF tst, PF_CTL, #0xF ++ PF beq, 71f ++ PF add, PF_X, PF_X, #\boost_increment ++ PF sub, PF_CTL, PF_CTL, #1 + 71: +- PF cmp PF_X, ORIG_W ++ PF cmp, PF_X, ORIG_W + .if src_bpp_shift >= 0 +- PF lsl DUMMY, PF_X, #src_bpp_shift +- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] ++ PF lsl, DUMMY, PF_X, #src_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] + .endif + .if dst_r_bpp != 0 +- PF lsl DUMMY, PF_X, #dst_bpp_shift +- PF prfm PREFETCH_MODE, [PF_DST, DUMMY] ++ PF lsl, DUMMY, PF_X, #dst_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] + .endif + .if mask_bpp_shift >= 0 +- PF lsl DUMMY, PF_X, #mask_bpp_shift +- PF prfm PREFETCH_MODE, [PF_MASK, DUMMY] ++ PF lsl, DUMMY, PF_X, #mask_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY] + .endif +- PF ble 71f +- PF sub PF_X, PF_X, ORIG_W +- PF subs PF_CTL, PF_CTL, #0x10 ++ PF ble, 71f ++ PF sub, PF_X, PF_X, ORIG_W ++ PF subs, PF_CTL, PF_CTL, #0x10 + 71: +- PF ble 72f ++ PF ble, 72f + .if src_bpp_shift >= 0 +- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift +- PF ldrsb DUMMY, [PF_SRC, DUMMY] +- PF add PF_SRC, PF_SRC, #1 ++ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift ++ PF ldrsb, DUMMY, [PF_SRC, DUMMY] ++ PF add, PF_SRC, PF_SRC, #1 + .endif + .if dst_r_bpp != 0 +- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift +- PF ldrsb DUMMY, [PF_DST, DUMMY] +- PF add PF_DST, PF_DST, #1 ++ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift ++ PF ldrsb, DUMMY, [PF_DST, DUMMY] ++ PF add, PF_DST, PF_DST, #1 + .endif + .if mask_bpp_shift >= 0 +- PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift +- PF ldrsb DUMMY, [PF_MASK, DUMMY] +- PF add PF_MASK, PF_MASK, #1 ++ PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift ++ PF ldrsb, DUMMY, [PF_MASK, DUMMY] ++ PF add, PF_MASK, PF_MASK, #1 + .endif + 72: + .endif + .endm + + .macro cache_preload_simple + .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE) + .if src_bpp > 0 +@@ -516,56 +516,56 @@ 72: + process_pixblock_tail, \ + process_pixblock_tail_head + .if dst_w_bpp != 24 + tst DST_R, #0xF + beq 52f + + .if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0 + .irp lowbit, 1, 2, 4, 8, 16 +-local skip1 +-.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) +-.if lowbit < 16 /* we don't need more than 16-byte alignment */ +- tst DST_R, #lowbit ++ ++.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp)) ++.if \lowbit < 16 /* we don't need more than 16-byte alignment */ ++ tst DST_R, #\lowbit + beq 51f + .endif +- pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC +- pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK ++ pixld_src (\lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC ++ pixld (\lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK + .if dst_r_bpp > 0 +- pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R ++ pixld_a (\lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R + .else +- add DST_R, DST_R, #lowbit ++ add DST_R, DST_R, #\lowbit + .endif +- PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp) +- sub W, W, #(lowbit * 8 / dst_w_bpp) ++ PF add, PF_X, PF_X, #(\lowbit * 8 / dst_w_bpp) ++ 
sub W, W, #(\lowbit * 8 / dst_w_bpp) + 51: + .endif + .endr + .endif + pixdeinterleave src_bpp, src_basereg + pixdeinterleave mask_bpp, mask_basereg + pixdeinterleave dst_r_bpp, dst_r_basereg + +- process_pixblock_head ++ \process_pixblock_head + cache_preload 0, pixblock_size + cache_preload_simple +- process_pixblock_tail ++ \process_pixblock_tail + + pixinterleave dst_w_bpp, dst_w_basereg + + .irp lowbit, 1, 2, 4, 8, 16 +-.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) +-.if lowbit < 16 /* we don't need more than 16-byte alignment */ +- tst DST_W, #lowbit ++.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp)) ++.if \lowbit < 16 /* we don't need more than 16-byte alignment */ ++ tst DST_W, #\lowbit + beq 51f + .endif + .if src_bpp == 0 && mask_bpp == 0 && dst_r_bpp == 0 +- sub W, W, #(lowbit * 8 / dst_w_bpp) ++ sub W, W, #(\lowbit * 8 / dst_w_bpp) + .endif +- pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W ++ pixst_a (\lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W + 51: + .endif + .endr + .endif + 52: + .endm + + /* +@@ -587,52 +587,52 @@ 52: + dst_aligned_flag, \ + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head + tst W, #(pixblock_size - 1) + beq 52f + .if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0 + .irp chunk_size, 16, 8, 4, 2, 1 +-.if pixblock_size > chunk_size +- tst W, #chunk_size ++.if pixblock_size > \chunk_size ++ tst W, #\chunk_size + beq 51f +- pixld_src chunk_size, src_bpp, src_basereg, SRC +- pixld chunk_size, mask_bpp, mask_basereg, MASK +-.if dst_aligned_flag != 0 +- pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R ++ pixld_src \chunk_size, src_bpp, src_basereg, SRC ++ pixld \chunk_size, mask_bpp, mask_basereg, MASK ++.if \dst_aligned_flag != 0 ++ pixld_a \chunk_size, dst_r_bpp, dst_r_basereg, DST_R + .else +- pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R ++ pixld \chunk_size, dst_r_bpp, dst_r_basereg, DST_R + .endif +-.if cache_preload_flag != 0 +- PF add PF_X, PF_X, #chunk_size ++.if \cache_preload_flag != 0 ++ PF add, PF_X, PF_X, #\chunk_size + .endif + 51: + .endif + .endr + .endif + pixdeinterleave src_bpp, src_basereg + pixdeinterleave mask_bpp, mask_basereg + pixdeinterleave dst_r_bpp, dst_r_basereg + +- process_pixblock_head +-.if cache_preload_flag != 0 ++ \process_pixblock_head ++.if \cache_preload_flag != 0 + cache_preload 0, pixblock_size + cache_preload_simple + .endif +- process_pixblock_tail ++ \process_pixblock_tail + pixinterleave dst_w_bpp, dst_w_basereg + .irp chunk_size, 16, 8, 4, 2, 1 +-.if pixblock_size > chunk_size +- tst W, #chunk_size ++.if pixblock_size > \chunk_size ++ tst W, #\chunk_size + beq 51f +-.if dst_aligned_flag != 0 +- pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W ++.if \dst_aligned_flag != 0 ++ pixst_a \chunk_size, dst_w_bpp, dst_w_basereg, DST_W + .else +- pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W ++ pixst \chunk_size, dst_w_bpp, dst_w_basereg, DST_W + .endif + 51: + .endif + .endr + 52: + .endm + + /* +@@ -655,17 +655,17 @@ 52: + .if (src_bpp != 24) && (src_bpp != 0) + sub SRC, SRC, W, lsl #src_bpp_shift + .endif + .if (mask_bpp != 24) && (mask_bpp != 0) + sub MASK, MASK, W, lsl #mask_bpp_shift + .endif + subs H, H, #1 + mov DST_R, DST_W +- bge start_of_loop_label ++ bge \start_of_loop_label + .endm + + /* + * Registers are allocated in the following way by default: + * v0, v1, v2, v3 - reserved for loading source pixel data + * v4, v5, v6, v7 - reserved for loading destination pixel data 
+ * v24, v25, v26, v27 - reserved for loading mask pixel data + * v28, v29, v30, v31 - final destination pixel data for writeback to memory +@@ -682,17 +682,17 @@ 52: + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head, \ + dst_w_basereg_ = 28, \ + dst_r_basereg_ = 4, \ + src_basereg_ = 0, \ + mask_basereg_ = 24 + +- pixman_asm_function fname ++ pixman_asm_function \fname + stp x29, x30, [sp, -16]! + mov x29, sp + sub sp, sp, 232 /* push all registers */ + sub x29, x29, 64 + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32 + stp x8, x9, [x29, -80] + stp x10, x11, [x29, -96] +@@ -707,38 +707,38 @@ 52: + str x28, [x29, -232] + + /* + * Select prefetch type for this function. If prefetch distance is + * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch + * has to be used instead of ADVANCED. + */ + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT +-.if prefetch_distance == 0 ++.if \prefetch_distance == 0 + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE + .elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \ +- ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24)) ++ ((\src_bpp_ == 24) || (\mask_bpp_ == 24) || (\dst_w_bpp_ == 24)) + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE + .endif + + /* + * Make some macro arguments globally visible and accessible + * from other macros + */ +- .set src_bpp, src_bpp_ +- .set mask_bpp, mask_bpp_ +- .set dst_w_bpp, dst_w_bpp_ +- .set pixblock_size, pixblock_size_ +- .set dst_w_basereg, dst_w_basereg_ +- .set dst_r_basereg, dst_r_basereg_ +- .set src_basereg, src_basereg_ +- .set mask_basereg, mask_basereg_ ++ .set src_bpp, \src_bpp_ ++ .set mask_bpp, \mask_bpp_ ++ .set dst_w_bpp, \dst_w_bpp_ ++ .set pixblock_size, \pixblock_size_ ++ .set dst_w_basereg, \dst_w_basereg_ ++ .set dst_r_basereg, \dst_r_basereg_ ++ .set src_basereg, \src_basereg_ ++ .set mask_basereg, \mask_basereg_ + + .macro pixld_src x:vararg +- pixld x ++ pixld \x + .endm + .macro fetch_src_pixblock + pixld_src pixblock_size, src_bpp, \ + (src_basereg - pixblock_size * src_bpp / 64), SRC + .endm + /* + * Assign symbolic names to registers + */ +@@ -805,32 +805,32 @@ 52: + .elseif dst_w_bpp == 16 + .set dst_bpp_shift, 1 + .elseif dst_w_bpp == 8 + .set dst_bpp_shift, 0 + .else + .error "requested dst bpp (dst_w_bpp) is not supported" + .endif + +-.if (((flags) & FLAG_DST_READWRITE) != 0) ++.if (((\flags) & FLAG_DST_READWRITE) != 0) + .set dst_r_bpp, dst_w_bpp + .else + .set dst_r_bpp, 0 + .endif +-.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0) ++.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0) + .set DEINTERLEAVE_32BPP_ENABLED, 1 + .else + .set DEINTERLEAVE_32BPP_ENABLED, 0 + .endif + +-.if prefetch_distance < 0 || prefetch_distance > 15 +- .error "invalid prefetch distance (prefetch_distance)" ++.if \prefetch_distance < 0 || \prefetch_distance > 15 ++ .error "invalid prefetch distance (\prefetch_distance)" + .endif + +- PF mov PF_X, #0 ++ PF mov, PF_X, #0 + mov DST_R, DST_W + + .if src_bpp == 24 + sub SRC_STRIDE, SRC_STRIDE, W + sub SRC_STRIDE, SRC_STRIDE, W, lsl #1 + .endif + .if mask_bpp == 24 + sub MASK_STRIDE, MASK_STRIDE, W +@@ -839,71 +839,71 @@ 52: + .if dst_w_bpp == 24 + sub DST_STRIDE, DST_STRIDE, W + sub DST_STRIDE, DST_STRIDE, W, lsl #1 + .endif + + /* + * Setup advanced prefetcher initial state + */ +- PF mov PF_SRC, SRC +- PF mov PF_DST, DST_R +- PF mov PF_MASK, MASK +- /* PF_CTL = prefetch_distance | ((h - 1) << 4) */ +- PF lsl DUMMY, H, #4 +- PF mov PF_CTL, DUMMY +- PF 
add PF_CTL, PF_CTL, #(prefetch_distance - 0x10) ++ PF mov, PF_SRC, SRC ++ PF mov, PF_DST, DST_R ++ PF mov, PF_MASK, MASK ++ /* PF_CTL = \prefetch_distance | ((h - 1) << 4) */ ++ PF lsl, DUMMY, H, #4 ++ PF mov, PF_CTL, DUMMY ++ PF add, PF_CTL, PF_CTL, #(\prefetch_distance - 0x10) + +- init ++ \init + subs H, H, #1 + mov ORIG_W, W + blt 9f + cmp W, #(pixblock_size * 2) + blt 800f + /* + * This is the start of the pipelined loop, which if optimized for + * long scanlines + */ + 0: +- ensure_destination_ptr_alignment process_pixblock_head, \ +- process_pixblock_tail, \ +- process_pixblock_tail_head ++ ensure_destination_ptr_alignment \process_pixblock_head, \ ++ \process_pixblock_tail, \ ++ \process_pixblock_tail_head + + /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */ + pixld_a pixblock_size, dst_r_bpp, \ + (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R + fetch_src_pixblock + pixld pixblock_size, mask_bpp, \ + (mask_basereg - pixblock_size * mask_bpp / 64), MASK +- PF add PF_X, PF_X, #pixblock_size +- process_pixblock_head ++ PF add, PF_X, PF_X, #pixblock_size ++ \process_pixblock_head + cache_preload 0, pixblock_size + cache_preload_simple + subs W, W, #(pixblock_size * 2) + blt 200f + + 100: +- process_pixblock_tail_head ++ \process_pixblock_tail_head + cache_preload_simple + subs W, W, #pixblock_size + bge 100b + + 200: +- process_pixblock_tail ++ \process_pixblock_tail + pixst_a pixblock_size, dst_w_bpp, \ + (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W + + /* Process the remaining trailing pixels in the scanline */ + process_trailing_pixels 1, 1, \ +- process_pixblock_head, \ +- process_pixblock_tail, \ +- process_pixblock_tail_head ++ \process_pixblock_head, \ ++ \process_pixblock_tail, \ ++ \process_pixblock_tail_head + advance_to_next_scanline 0b + +- cleanup ++ \cleanup + 1000: + /* pop all registers */ + sub x29, x29, 64 + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + ldp x8, x9, [x29, -80] + ldp x10, x11, [x29, -96] + ldp x12, x13, [x29, -112] +@@ -920,48 +920,48 @@ 1000: + ret /* exit */ + /* + * This is the start of the loop, designed to process images with small width + * (less than pixblock_size * 2 pixels). In this case neither pipelining + * nor prefetch are used. 
+ */ + 800: + .if src_bpp_shift >= 0 +- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift +- PF prfm PREFETCH_MODE, [SRC, DUMMY] ++ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift ++ PF prfm, PREFETCH_MODE, [SRC, DUMMY] + .endif + .if dst_r_bpp != 0 +- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift +- PF prfm PREFETCH_MODE, [DST_R, DUMMY] ++ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift ++ PF prfm, PREFETCH_MODE, [DST_R, DUMMY] + .endif + .if mask_bpp_shift >= 0 +- PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift +- PF prfm PREFETCH_MODE, [MASK, DUMMY] ++ PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift ++ PF prfm, PREFETCH_MODE, [MASK, DUMMY] + .endif + /* Process exactly pixblock_size pixels if needed */ + tst W, #pixblock_size + beq 100f + pixld pixblock_size, dst_r_bpp, \ + (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R + fetch_src_pixblock + pixld pixblock_size, mask_bpp, \ + (mask_basereg - pixblock_size * mask_bpp / 64), MASK +- process_pixblock_head +- process_pixblock_tail ++ \process_pixblock_head ++ \process_pixblock_tail + pixst pixblock_size, dst_w_bpp, \ + (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W + 100: + /* Process the remaining trailing pixels in the scanline */ + process_trailing_pixels 0, 0, \ +- process_pixblock_head, \ +- process_pixblock_tail, \ +- process_pixblock_tail_head ++ \process_pixblock_head, \ ++ \process_pixblock_tail, \ ++ \process_pixblock_tail_head + advance_to_next_scanline 800b + 9: +- cleanup ++ \cleanup + /* pop all registers */ + sub x29, x29, 64 + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + ldp x8, x9, [x29, -80] + ldp x10, x11, [x29, -96] + ldp x12, x13, [x29, -112] + ldp x14, x15, [x29, -128] +@@ -990,17 +990,17 @@ 9: + .unreq DST_STRIDE + .unreq MASK_STRIDE + .unreq PF_CTL + .unreq PF_X + .unreq PF_SRC + .unreq PF_DST + .unreq PF_MASK + .unreq DUMMY +- .endfunc ++ pixman_end_asm_function + .endm + + /* + * A simplified variant of function generation template for a single + * scanline processing (for implementing pixman combine functions) + */ + .macro generate_composite_function_scanline use_nearest_scaling, \ + fname, \ +@@ -1014,50 +1014,50 @@ 9: + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head, \ + dst_w_basereg_ = 28, \ + dst_r_basereg_ = 4, \ + src_basereg_ = 0, \ + mask_basereg_ = 24 + +- pixman_asm_function fname ++ pixman_asm_function \fname + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE + + /* + * Make some macro arguments globally visible and accessible + * from other macros + */ +- .set src_bpp, src_bpp_ +- .set mask_bpp, mask_bpp_ +- .set dst_w_bpp, dst_w_bpp_ +- .set pixblock_size, pixblock_size_ +- .set dst_w_basereg, dst_w_basereg_ +- .set dst_r_basereg, dst_r_basereg_ +- .set src_basereg, src_basereg_ +- .set mask_basereg, mask_basereg_ ++ .set src_bpp, \src_bpp_ ++ .set mask_bpp, \mask_bpp_ ++ .set dst_w_bpp, \dst_w_bpp_ ++ .set pixblock_size, \pixblock_size_ ++ .set dst_w_basereg, \dst_w_basereg_ ++ .set dst_r_basereg, \dst_r_basereg_ ++ .set src_basereg, \src_basereg_ ++ .set mask_basereg, \mask_basereg_ + +-.if use_nearest_scaling != 0 ++.if \use_nearest_scaling != 0 + /* + * Assign symbolic names to registers for nearest scaling + */ + W .req x0 + DST_W .req x1 + SRC .req x2 + VX .req x3 + UNIT_X .req x4 + SRC_WIDTH_FIXED .req x5 + MASK .req x6 + TMP1 .req x8 + TMP2 .req x9 + DST_R .req x10 + DUMMY .req x30 + + .macro pixld_src x:vararg +- pixld_s x ++ pixld_s \x + .endm + + sxtw x0, w0 + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 + + stp x29, x30, 
[sp, -16]! +@@ -1075,84 +1075,84 @@ 9: + W .req x0 /* width (is updated during processing) */ + DST_W .req x1 /* destination buffer pointer for writes */ + SRC .req x2 /* source buffer pointer */ + MASK .req x3 /* mask pointer */ + DST_R .req x4 /* destination buffer pointer for reads */ + DUMMY .req x30 + + .macro pixld_src x:vararg +- pixld x ++ pixld \x + .endm + + sxtw x0, w0 + + stp x29, x30, [sp, -16]! + mov x29, sp + sub sp, sp, 64 + sub x29, x29, 64 + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + .endif + +-.if (((flags) & FLAG_DST_READWRITE) != 0) ++.if (((\flags) & FLAG_DST_READWRITE) != 0) + .set dst_r_bpp, dst_w_bpp + .else + .set dst_r_bpp, 0 + .endif +-.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0) ++.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0) + .set DEINTERLEAVE_32BPP_ENABLED, 1 + .else + .set DEINTERLEAVE_32BPP_ENABLED, 0 + .endif + + .macro fetch_src_pixblock + pixld_src pixblock_size, src_bpp, \ + (src_basereg - pixblock_size * src_bpp / 64), SRC + .endm + +- init ++ \init + mov DST_R, DST_W + + cmp W, #pixblock_size + blt 800f + +- ensure_destination_ptr_alignment process_pixblock_head, \ +- process_pixblock_tail, \ +- process_pixblock_tail_head ++ ensure_destination_ptr_alignment \process_pixblock_head, \ ++ \process_pixblock_tail, \ ++ \process_pixblock_tail_head + + subs W, W, #pixblock_size + blt 700f + + /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */ + pixld_a pixblock_size, dst_r_bpp, \ + (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R + fetch_src_pixblock + pixld pixblock_size, mask_bpp, \ + (mask_basereg - pixblock_size * mask_bpp / 64), MASK +- process_pixblock_head ++ \process_pixblock_head + subs W, W, #pixblock_size + blt 200f + 100: +- process_pixblock_tail_head ++ \process_pixblock_tail_head + subs W, W, #pixblock_size + bge 100b + 200: +- process_pixblock_tail ++ \process_pixblock_tail + pixst_a pixblock_size, dst_w_bpp, \ + (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W + 700: + /* Process the remaining trailing pixels in the scanline (dst aligned) */ + process_trailing_pixels 0, 1, \ +- process_pixblock_head, \ +- process_pixblock_tail, \ +- process_pixblock_tail_head ++ \process_pixblock_head, \ ++ \process_pixblock_tail, \ ++ \process_pixblock_tail_head + +- cleanup +-.if use_nearest_scaling != 0 ++ \cleanup ++.if \use_nearest_scaling != 0 + sub x29, x29, 64 + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + ldp x8, x9, [x29, -80] + ldr x10, [x29, -96] + mov sp, x29 + ldp x29, x30, [sp], 16 + ret /* exit */ +@@ -1162,22 +1162,22 @@ 700: + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + mov sp, x29 + ldp x29, x30, [sp], 16 + ret /* exit */ + .endif + 800: + /* Process the remaining trailing pixels in the scanline (dst unaligned) */ + process_trailing_pixels 0, 0, \ +- process_pixblock_head, \ +- process_pixblock_tail, \ +- process_pixblock_tail_head ++ \process_pixblock_head, \ ++ \process_pixblock_tail, \ ++ \process_pixblock_tail_head + +- cleanup +-.if use_nearest_scaling != 0 ++ \cleanup ++.if \use_nearest_scaling != 0 + sub x29, x29, 64 + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + ldp x8, x9, [x29, -80] + ldr x10, [x29, -88] + mov sp, x29 + ldp x29, x30, [sp], 16 + ret /* exit */ +@@ -1208,25 +1208,25 @@ 800: + .unreq DST_R + .unreq DST_W + .unreq W + .endif + + .purgem fetch_src_pixblock + .purgem pixld_src + +- .endfunc ++ pixman_end_asm_function + 
.endm + + .macro generate_composite_function_single_scanline x:vararg +- generate_composite_function_scanline 0, x ++ generate_composite_function_scanline 0, \x + .endm + + .macro generate_composite_function_nearest_scanline x:vararg +- generate_composite_function_scanline 1, x ++ generate_composite_function_scanline 1, \x + .endm + + /* Default prologue/epilogue, nothing special needs to be done */ + + .macro default_init + .endm + + .macro default_cleanup +@@ -1250,61 +1250,61 @@ 800: + * Conversion of 8 r5g6b6 pixels packed in 128-bit register (in) + * into a planar a8r8g8b8 format (with a, r, g, b color components + * stored into 64-bit registers out_a, out_r, out_g, out_b respectively). + * + * Warning: the conversion is destructive and the original + * value (in) is lost. + */ + .macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b +- shrn &out_r&.8b, &in&.8h, #8 +- shrn &out_g&.8b, &in&.8h, #3 +- sli &in&.8h, &in&.8h, #5 +- movi &out_a&.8b, #255 +- sri &out_r&.8b, &out_r&.8b, #5 +- sri &out_g&.8b, &out_g&.8b, #6 +- shrn &out_b&.8b, &in&.8h, #2 ++ shrn \()\out_r\().8b, \()\in\().8h, #8 ++ shrn \()\out_g\().8b, \()\in\().8h, #3 ++ sli \()\in\().8h, \()\in\().8h, #5 ++ movi \()\out_a\().8b, #255 ++ sri \()\out_r\().8b, \()\out_r\().8b, #5 ++ sri \()\out_g\().8b, \()\out_g\().8b, #6 ++ shrn \()\out_b\().8b, \()\in\().8h, #2 + .endm + + .macro convert_0565_to_x888 in, out_r, out_g, out_b +- shrn &out_r&.8b, &in&.8h, #8 +- shrn &out_g&.8b, &in&.8h, #3 +- sli &in&.8h, &in&.8h, #5 +- sri &out_r&.8b, &out_r&.8b, #5 +- sri &out_g&.8b, &out_g&.8b, #6 +- shrn &out_b&.8b, &in&.8h, #2 ++ shrn \()\out_r\().8b, \()\in\().8h, #8 ++ shrn \()\out_g\().8b, \()\in\().8h, #3 ++ sli \()\in\().8h, \()\in\().8h, #5 ++ sri \()\out_r\().8b, \()\out_r\().8b, #5 ++ sri \()\out_g\().8b, \()\out_g\().8b, #6 ++ shrn \()\out_b\().8b, \()\in\().8h, #2 + .endm + + /* + * Conversion from planar a8r8g8b8 format (with a, r, g, b color components + * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b6 + * pixels packed in 128-bit register (out). Requires two temporary 128-bit + * registers (tmp1, tmp2) + */ + .macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2 +- ushll &tmp1&.8h, &in_g&.8b, #7 +- shl &tmp1&.8h, &tmp1&.8h, #1 +- ushll &out&.8h, &in_r&.8b, #7 +- shl &out&.8h, &out&.8h, #1 +- ushll &tmp2&.8h, &in_b&.8b, #7 +- shl &tmp2&.8h, &tmp2&.8h, #1 +- sri &out&.8h, &tmp1&.8h, #5 +- sri &out&.8h, &tmp2&.8h, #11 ++ ushll \()\tmp1\().8h, \()\in_g\().8b, #7 ++ shl \()\tmp1\().8h, \()\tmp1\().8h, #1 ++ ushll \()\out\().8h, \()\in_r\().8b, #7 ++ shl \()\out\().8h, \()\out\().8h, #1 ++ ushll \()\tmp2\().8h, \()\in_b\().8b, #7 ++ shl \()\tmp2\().8h, \()\tmp2\().8h, #1 ++ sri \()\out\().8h, \()\tmp1\().8h, #5 ++ sri \()\out\().8h, \()\tmp2\().8h, #11 + .endm + + /* + * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels + * returned in (out0, out1) registers pair. Requires one temporary + * 64-bit register (tmp). 
'out1' and 'in' may overlap, the original + * value from 'in' is lost + */ + .macro convert_four_0565_to_x888_packed in, out0, out1, tmp +- shl &out0&.4h, &in&.4h, #5 /* G top 6 bits */ +- shl &tmp&.4h, &in&.4h, #11 /* B top 5 bits */ +- sri &in&.4h, &in&.4h, #5 /* R is ready in top bits */ +- sri &out0&.4h, &out0&.4h, #6 /* G is ready in top bits */ +- sri &tmp&.4h, &tmp&.4h, #5 /* B is ready in top bits */ +- ushr &out1&.4h, &in&.4h, #8 /* R is in place */ +- sri &out0&.4h, &tmp&.4h, #8 /* G & B is in place */ +- zip1 &tmp&.4h, &out0&.4h, &out1&.4h /* everything is in place */ +- zip2 &out1&.4h, &out0&.4h, &out1&.4h +- mov &out0&.d[0], &tmp&.d[0] ++ shl \()\out0\().4h, \()\in\().4h, #5 /* G top 6 bits */ ++ shl \()\tmp\().4h, \()\in\().4h, #11 /* B top 5 bits */ ++ sri \()\in\().4h, \()\in\().4h, #5 /* R is ready \in top bits */ ++ sri \()\out0\().4h, \()\out0\().4h, #6 /* G is ready \in top bits */ ++ sri \()\tmp\().4h, \()\tmp\().4h, #5 /* B is ready \in top bits */ ++ ushr \()\out1\().4h, \()\in\().4h, #8 /* R is \in place */ ++ sri \()\out0\().4h, \()\tmp\().4h, #8 /* G \() B is \in place */ ++ zip1 \()\tmp\().4h, \()\out0\().4h, \()\out1\().4h /* everything is \in place */ ++ zip2 \()\out1\().4h, \()\out0\().4h, \()\out1\().4h ++ mov \()\out0\().d[0], \()\tmp\().d[0] + .endm
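
Note on the recurring transformation in the hunks above (this note is commentary, not part of the patch file): the pre-existing macros referred to .macro parameters either by bare name or with the '&name&' spelling, and invoked the PF helper with only a blank between the mnemonic and its operands. Those forms rely on GNU as leniency; the replacement lines use the portable spelling that clang's integrated assembler also accepts: '\name' for parameter substitution, '\()' to terminate a parameter name that runs into adjacent text (such as a '.2s' arrangement suffix), and an explicit comma after the mnemonic passed to PF so it is clearly delimited from the ':vararg' tail. A minimal, hypothetical sketch of the pattern (not taken from pixman):

    /* Pre-patch spelling, accepted by GNU as but rejected by clang's
     * integrated assembler:
     *     ld1     {v&reg1&.2s}, [x0], #8
     * Post-patch spelling: backslash substitution plus the \() separator,
     * so the parameter name does not swallow the ".2s" suffix. */
    .macro load_two reg1, reg2
            ld1     {v\()\reg1\().2s}, [x0], #8
            ld1     {v\()\reg2\().2s}, [x0], #8
    .endm

            /* Expands to: ld1 {v0.2s}, [x0], #8 ; ld1 {v1.2s}, [x0], #8 */
            load_two 0, 1

With this spelling the files assemble the same way under GNU as and under clang's integrated assembler, which is what lets the build drop -no-integrated-as for aarch64.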