This is an arm32-only issue when building with clang.

Depends on D183969

Differential Revision: https://phabricator.services.mozilla.com/D183970
Makoto Kato 2023-07-24 08:06:36 +00:00
Parent 97981b55e8
Commit 47a779e282
6 changed files with 1143 additions and 1075 deletions
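
Most of the changes below follow a few recurring patterns that make these GNU-as macros acceptable to clang's integrated assembler: macro parameters are referenced explicitly as \param instead of by bare name, symbol names are pasted together with the \() separator instead of &, macro arguments are comma-separated, and the .endfunc directive (which clang's integrated assembler does not implement) is wrapped in #ifndef __clang__. A minimal sketch of the parameter and concatenation change, using an illustrative macro rather than one taken from the patch (TMP1 and convert_* stand in for the .req aliases and helper macros the real code uses):

/* GNU as substitutes bare parameter names inside a macro body and
 * accepts '&' for pasting a parameter into a longer symbol name;
 * clang's integrated assembler does not. */
.macro load_pair_gas    reg1, reg2, fmt
        vld1.32         {reg1}, [TMP1]!
        vld1.32         {reg2}, [TMP1]
        convert_&fmt    reg1, reg2
.endm

/* Clang-compatible spelling of the same macro: every parameter is
 * written as \name and token pasting uses the \() separator. */
.macro load_pair_clang  reg1, reg2, fmt
        vld1.32         {\reg1}, [TMP1]!
        vld1.32         {\reg2}, [TMP1]
        convert_\()\fmt \reg1, \reg2
.endm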

View file

@ -82,28 +82,28 @@
mov TMP1, X, asr #16
add X, X, UX
add TMP1, TOP, TMP1, asl #2
vld1.32 {reg1}, [TMP1], STRIDE
vld1.32 {reg2}, [TMP1]
vld1.32 {\reg1}, [TMP1], STRIDE
vld1.32 {\reg2}, [TMP1]
.endm
.macro bilinear_load_0565 reg1, reg2, tmp
mov TMP1, X, asr #16
add X, X, UX
add TMP1, TOP, TMP1, asl #1
vld1.32 {reg2[0]}, [TMP1], STRIDE
vld1.32 {reg2[1]}, [TMP1]
convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
vld1.32 {\reg2[0]}, [TMP1], STRIDE
vld1.32 {\reg2[1]}, [TMP1]
convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
.endm
.macro bilinear_load_and_vertical_interpolate_two_8888 \
acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
bilinear_load_8888 reg1, reg2, tmp1
vmull.u8 acc1, reg1, d28
vmlal.u8 acc1, reg2, d29
bilinear_load_8888 reg3, reg4, tmp2
vmull.u8 acc2, reg3, d28
vmlal.u8 acc2, reg4, d29
bilinear_load_8888 \reg1, \reg2, \tmp1
vmull.u8 \acc1, \reg1, d28
vmlal.u8 \acc1, \reg2, d29
bilinear_load_8888 \reg3, \reg4, \tmp2
vmull.u8 \acc2, \reg3, d28
vmlal.u8 \acc2, \reg4, d29
.endm
.macro bilinear_load_and_vertical_interpolate_four_8888 \
@ -111,9 +111,9 @@
yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
bilinear_load_and_vertical_interpolate_two_8888 \
xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
\xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
bilinear_load_and_vertical_interpolate_two_8888 \
yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
\yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
.endm
.macro bilinear_load_and_vertical_interpolate_two_0565 \
@ -125,19 +125,19 @@
mov TMP2, X, asr #16
add X, X, UX
add TMP2, TOP, TMP2, asl #1
vld1.32 {acc2lo[0]}, [TMP1], STRIDE
vld1.32 {acc2hi[0]}, [TMP2], STRIDE
vld1.32 {acc2lo[1]}, [TMP1]
vld1.32 {acc2hi[1]}, [TMP2]
convert_0565_to_x888 acc2, reg3, reg2, reg1
vzip.u8 reg1, reg3
vzip.u8 reg2, reg4
vzip.u8 reg3, reg4
vzip.u8 reg1, reg2
vmull.u8 acc1, reg1, d28
vmlal.u8 acc1, reg2, d29
vmull.u8 acc2, reg3, d28
vmlal.u8 acc2, reg4, d29
vld1.32 {\acc2lo[0]}, [TMP1], STRIDE
vld1.32 {\acc2hi[0]}, [TMP2], STRIDE
vld1.32 {\acc2lo[1]}, [TMP1]
vld1.32 {\acc2hi[1]}, [TMP2]
convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
vzip.u8 \reg1, \reg3
vzip.u8 \reg2, \reg4
vzip.u8 \reg3, \reg4
vzip.u8 \reg1, \reg2
vmull.u8 \acc1, \reg1, d28
vmlal.u8 \acc1, \reg2, d29
vmull.u8 \acc2, \reg3, d28
vmlal.u8 \acc2, \reg4, d29
.endm
.macro bilinear_load_and_vertical_interpolate_four_0565 \
@ -150,46 +150,46 @@
mov TMP2, X, asr #16
add X, X, UX
add TMP2, TOP, TMP2, asl #1
vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
vld1.32 {xacc2lo[1]}, [TMP1]
vld1.32 {xacc2hi[1]}, [TMP2]
convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
vld1.32 {\xacc2lo[0]}, [TMP1], STRIDE
vld1.32 {\xacc2hi[0]}, [TMP2], STRIDE
vld1.32 {\xacc2lo[1]}, [TMP1]
vld1.32 {\xacc2hi[1]}, [TMP2]
convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
mov TMP1, X, asr #16
add X, X, UX
add TMP1, TOP, TMP1, asl #1
mov TMP2, X, asr #16
add X, X, UX
add TMP2, TOP, TMP2, asl #1
vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
vzip.u8 xreg1, xreg3
vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
vzip.u8 xreg2, xreg4
vld1.32 {yacc2lo[1]}, [TMP1]
vzip.u8 xreg3, xreg4
vld1.32 {yacc2hi[1]}, [TMP2]
vzip.u8 xreg1, xreg2
convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
vmull.u8 xacc1, xreg1, d28
vzip.u8 yreg1, yreg3
vmlal.u8 xacc1, xreg2, d29
vzip.u8 yreg2, yreg4
vmull.u8 xacc2, xreg3, d28
vzip.u8 yreg3, yreg4
vmlal.u8 xacc2, xreg4, d29
vzip.u8 yreg1, yreg2
vmull.u8 yacc1, yreg1, d28
vmlal.u8 yacc1, yreg2, d29
vmull.u8 yacc2, yreg3, d28
vmlal.u8 yacc2, yreg4, d29
vld1.32 {\yacc2lo[0]}, [TMP1], STRIDE
vzip.u8 \xreg1, \xreg3
vld1.32 {\yacc2hi[0]}, [TMP2], STRIDE
vzip.u8 \xreg2, \xreg4
vld1.32 {\yacc2lo[1]}, [TMP1]
vzip.u8 \xreg3, \xreg4
vld1.32 {\yacc2hi[1]}, [TMP2]
vzip.u8 \xreg1, \xreg2
convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
vmull.u8 \xacc1, \xreg1, d28
vzip.u8 \yreg1, \yreg3
vmlal.u8 \xacc1, \xreg2, d29
vzip.u8 \yreg2, \yreg4
vmull.u8 \xacc2, \xreg3, d28
vzip.u8 \yreg3, \yreg4
vmlal.u8 \xacc2, \xreg4, d29
vzip.u8 \yreg1, \yreg2
vmull.u8 \yacc1, \yreg1, d28
vmlal.u8 \yacc1, \yreg2, d29
vmull.u8 \yacc2, \yreg3, d28
vmlal.u8 \yacc2, \yreg4, d29
.endm
.macro bilinear_store_8888 numpix, tmp1, tmp2
.if numpix == 4
.if \numpix == 4
vst1.32 {d0, d1}, [OUT]!
.elseif numpix == 2
.elseif \numpix == 2
vst1.32 {d0}, [OUT]!
.elseif numpix == 1
.elseif \numpix == 1
vst1.32 {d0[0]}, [OUT, :32]!
.else
.error bilinear_store_8888 numpix is unsupported
@ -201,12 +201,12 @@
vuzp.u8 d2, d3
vuzp.u8 d1, d3
vuzp.u8 d0, d2
convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
.if numpix == 4
convert_8888_to_0565 d2, d1, d0, q1, \tmp1, \tmp2
.if \numpix == 4
vst1.16 {d2}, [OUT]!
.elseif numpix == 2
.elseif \numpix == 2
vst1.32 {d2[0]}, [OUT]!
.elseif numpix == 1
.elseif \numpix == 1
vst1.16 {d2[0]}, [OUT]!
.else
.error bilinear_store_0565 numpix is unsupported
@ -222,20 +222,20 @@
.endm
.macro bilinear_load_mask_8 numpix, mask
.if numpix == 4
vld1.32 {mask[0]}, [MASK]!
.elseif numpix == 2
vld1.16 {mask[0]}, [MASK]!
.elseif numpix == 1
vld1.8 {mask[0]}, [MASK]!
.if \numpix == 4
vld1.32 {\mask[0]}, [MASK]!
.elseif \numpix == 2
vld1.16 {\mask[0]}, [MASK]!
.elseif \numpix == 1
vld1.8 {\mask[0]}, [MASK]!
.else
.error bilinear_load_mask_8 numpix is unsupported
.error bilinear_load_mask_8 \numpix is unsupported
.endif
pld [MASK, #prefetch_offset]
.endm
.macro bilinear_load_mask mask_fmt, numpix, mask
bilinear_load_mask_&mask_fmt numpix, mask
bilinear_load_mask_\()\mask_fmt \numpix, \mask
.endm
@ -250,28 +250,28 @@
.endm
.macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
.if numpix == 4
vld1.32 {dst0, dst1}, [OUT]
.elseif numpix == 2
vld1.32 {dst0}, [OUT]
.elseif numpix == 1
vld1.32 {dst0[0]}, [OUT]
.if \numpix == 4
vld1.32 {\dst0, \dst1}, [OUT]
.elseif \numpix == 2
vld1.32 {\dst0}, [OUT]
.elseif \numpix == 1
vld1.32 {\dst0[0]}, [OUT]
.else
.error bilinear_load_dst_8888 numpix is unsupported
.error bilinear_load_dst_8888 \numpix is unsupported
.endif
pld [OUT, #(prefetch_offset * 4)]
.endm
.macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
bilinear_load_dst_8888 numpix, dst0, dst1, dst01
bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
.endm
.macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01
bilinear_load_dst_8888 numpix, dst0, dst1, dst01
bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
.endm
.macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01
bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01
bilinear_load_dst_\()\dst_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
.endm
/*
@ -290,19 +290,19 @@
.endm
.macro bilinear_duplicate_mask_8 numpix, mask
.if numpix == 4
vdup.32 mask, mask[0]
.elseif numpix == 2
vdup.16 mask, mask[0]
.elseif numpix == 1
vdup.8 mask, mask[0]
.if \numpix == 4
vdup.32 \mask, \mask[0]
.elseif \numpix == 2
vdup.16 \mask, \mask[0]
.elseif \numpix == 1
vdup.8 \mask, \mask[0]
.else
.error bilinear_duplicate_mask_8 is unsupported
.endif
.endm
.macro bilinear_duplicate_mask mask_fmt, numpix, mask
bilinear_duplicate_mask_&mask_fmt numpix, mask
bilinear_duplicate_mask_\()\mask_fmt \numpix, \mask
.endm
/*
@ -310,10 +310,10 @@
* Interleave should be done when mask is enabled or operator is 'over'.
*/
.macro bilinear_interleave src0, src1, dst0, dst1
vuzp.8 src0, src1
vuzp.8 dst0, dst1
vuzp.8 src0, src1
vuzp.8 dst0, dst1
vuzp.8 \src0, \src1
vuzp.8 \dst0, \dst1
vuzp.8 \src0, \src1
vuzp.8 \dst0, \dst1
.endm
.macro bilinear_interleave_src_dst_x_src \
@ -323,7 +323,7 @@
.macro bilinear_interleave_src_dst_x_over \
numpix, src0, src1, src01, dst0, dst1, dst01
bilinear_interleave src0, src1, dst0, dst1
bilinear_interleave \src0, \src1, \dst0, \dst1
.endm
.macro bilinear_interleave_src_dst_x_add \
@ -333,26 +333,26 @@
.macro bilinear_interleave_src_dst_8_src \
numpix, src0, src1, src01, dst0, dst1, dst01
bilinear_interleave src0, src1, dst0, dst1
bilinear_interleave \src0, \src1, \dst0, \dst1
.endm
.macro bilinear_interleave_src_dst_8_over \
numpix, src0, src1, src01, dst0, dst1, dst01
bilinear_interleave src0, src1, dst0, dst1
bilinear_interleave \src0, \src1, \dst0, \dst1
.endm
.macro bilinear_interleave_src_dst_8_add \
numpix, src0, src1, src01, dst0, dst1, dst01
bilinear_interleave src0, src1, dst0, dst1
bilinear_interleave \src0, \src1, \dst0, \dst1
.endm
.macro bilinear_interleave_src_dst \
mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01
bilinear_interleave_src_dst_&mask_fmt&_&op \
numpix, src0, src1, src01, dst0, dst1, dst01
bilinear_interleave_src_dst_\()\mask_fmt\()_\()\op \
\numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01
.endm
@ -370,23 +370,23 @@
numpix, src0, src1, src01, mask, \
tmp01, tmp23, tmp45, tmp67
vmull.u8 tmp01, src0, mask
vmull.u8 tmp23, src1, mask
vmull.u8 \tmp01, \src0, \mask
vmull.u8 \tmp23, \src1, \mask
/* bubbles */
vrshr.u16 tmp45, tmp01, #8
vrshr.u16 tmp67, tmp23, #8
vrshr.u16 \tmp45, \tmp01, #8
vrshr.u16 \tmp67, \tmp23, #8
/* bubbles */
vraddhn.u16 src0, tmp45, tmp01
vraddhn.u16 src1, tmp67, tmp23
vraddhn.u16 \src0, \tmp45, \tmp01
vraddhn.u16 \src1, \tmp67, \tmp23
.endm
.macro bilinear_apply_mask_to_src \
mask_fmt, numpix, src0, src1, src01, mask, \
tmp01, tmp23, tmp45, tmp67
bilinear_apply_mask_to_src_&mask_fmt \
numpix, src0, src1, src01, mask, \
tmp01, tmp23, tmp45, tmp67
bilinear_apply_mask_to_src_\()\mask_fmt \
\numpix, \src0, \src1, \src01, \mask, \
\tmp01, \tmp23, \tmp45, \tmp67
.endm
@ -403,79 +403,79 @@
numpix, src0, src1, src01, dst0, dst1, dst01, \
tmp01, tmp23, tmp45, tmp67, tmp8
vdup.32 tmp8, src1[1]
vdup.32 \tmp8, \src1[1]
/* bubbles */
vmvn.8 tmp8, tmp8
vmvn.8 \tmp8, \tmp8
/* bubbles */
vmull.u8 tmp01, dst0, tmp8
vmull.u8 \tmp01, \dst0, \tmp8
/* bubbles */
vmull.u8 tmp23, dst1, tmp8
vmull.u8 \tmp23, \dst1, \tmp8
/* bubbles */
vrshr.u16 tmp45, tmp01, #8
vrshr.u16 tmp67, tmp23, #8
vrshr.u16 \tmp45, \tmp01, #8
vrshr.u16 \tmp67, \tmp23, #8
/* bubbles */
vraddhn.u16 dst0, tmp45, tmp01
vraddhn.u16 dst1, tmp67, tmp23
vraddhn.u16 \dst0, \tmp45, \tmp01
vraddhn.u16 \dst1, \tmp67, \tmp23
/* bubbles */
vqadd.u8 src01, dst01, src01
vqadd.u8 \src01, \dst01, \src01
.endm
.macro bilinear_combine_add \
numpix, src0, src1, src01, dst0, dst1, dst01, \
tmp01, tmp23, tmp45, tmp67, tmp8
vqadd.u8 src01, dst01, src01
vqadd.u8 \src01, \dst01, \src01
.endm
.macro bilinear_combine \
op, numpix, src0, src1, src01, dst0, dst1, dst01, \
tmp01, tmp23, tmp45, tmp67, tmp8
bilinear_combine_&op \
numpix, src0, src1, src01, dst0, dst1, dst01, \
tmp01, tmp23, tmp45, tmp67, tmp8
bilinear_combine_\()\op \
\numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01, \
\tmp01, \tmp23, \tmp45, \tmp67, \tmp8
.endm
/*
* Macros for final deinterleaving of destination pixels if needed.
*/
.macro bilinear_deinterleave numpix, dst0, dst1, dst01
vuzp.8 dst0, dst1
vuzp.8 \dst0, \dst1
/* bubbles */
vuzp.8 dst0, dst1
vuzp.8 \dst0, \dst1
.endm
.macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01
.endm
.macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
bilinear_deinterleave numpix, dst0, dst1, dst01
bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
.endm
.macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01
.endm
.macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
bilinear_deinterleave numpix, dst0, dst1, dst01
bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
.endm
.macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01
bilinear_deinterleave numpix, dst0, dst1, dst01
bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
.endm
.macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01
bilinear_deinterleave numpix, dst0, dst1, dst01
bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
.endm
.macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01
bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01
bilinear_deinterleave_dst_\()\mask_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
.endm
.macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
bilinear_load_&src_fmt d0, d1, d2
bilinear_load_mask mask_fmt, 1, d4
bilinear_load_dst dst_fmt, op, 1, d18, d19, q9
bilinear_load_\()\src_fmt d0, d1, d2
bilinear_load_mask \mask_fmt, 1, d4
bilinear_load_dst \dst_fmt, \op, 1, d18, d19, q9
vmull.u8 q1, d0, d28
vmlal.u8 q1, d1, d29
/* 5 cycles bubble */
@ -483,28 +483,28 @@
vmlsl.u16 q0, d2, d30
vmlal.u16 q0, d3, d30
/* 5 cycles bubble */
bilinear_duplicate_mask mask_fmt, 1, d4
bilinear_duplicate_mask \mask_fmt, 1, d4
vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
/* 3 cycles bubble */
vmovn.u16 d0, q0
/* 1 cycle bubble */
bilinear_interleave_src_dst \
mask_fmt, op, 1, d0, d1, q0, d18, d19, q9
\mask_fmt, \op, 1, d0, d1, q0, d18, d19, q9
bilinear_apply_mask_to_src \
mask_fmt, 1, d0, d1, q0, d4, \
\mask_fmt, 1, d0, d1, q0, d4, \
q3, q8, q10, q11
bilinear_combine \
op, 1, d0, d1, q0, d18, d19, q9, \
\op, 1, d0, d1, q0, d18, d19, q9, \
q3, q8, q10, q11, d5
bilinear_deinterleave_dst mask_fmt, op, 1, d0, d1, q0
bilinear_store_&dst_fmt 1, q2, q3
bilinear_deinterleave_dst \mask_fmt, \op, 1, d0, d1, q0
bilinear_store_\()\dst_fmt 1, q2, q3
.endm
.macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
bilinear_load_and_vertical_interpolate_two_&src_fmt \
bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
q1, q11, d0, d1, d20, d21, d22, d23
bilinear_load_mask mask_fmt, 2, d4
bilinear_load_dst dst_fmt, op, 2, d18, d19, q9
bilinear_load_mask \mask_fmt, 2, d4
bilinear_load_dst \dst_fmt, \op, 2, d18, d19, q9
vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q0, d2, d30
vmlal.u16 q0, d3, d30
@ -513,24 +513,24 @@
vmlal.u16 q10, d23, d31
vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
bilinear_duplicate_mask mask_fmt, 2, d4
bilinear_duplicate_mask \mask_fmt, 2, d4
vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vadd.u16 q12, q12, q13
vmovn.u16 d0, q0
bilinear_interleave_src_dst \
mask_fmt, op, 2, d0, d1, q0, d18, d19, q9
\mask_fmt, \op, 2, d0, d1, q0, d18, d19, q9
bilinear_apply_mask_to_src \
mask_fmt, 2, d0, d1, q0, d4, \
\mask_fmt, 2, d0, d1, q0, d4, \
q3, q8, q10, q11
bilinear_combine \
op, 2, d0, d1, q0, d18, d19, q9, \
\op, 2, d0, d1, q0, d18, d19, q9, \
q3, q8, q10, q11, d5
bilinear_deinterleave_dst mask_fmt, op, 2, d0, d1, q0
bilinear_store_&dst_fmt 2, q2, q3
bilinear_deinterleave_dst \mask_fmt, \op, 2, d0, d1, q0
bilinear_store_\()\dst_fmt 2, q2, q3
.endm
.macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
bilinear_load_and_vertical_interpolate_four_&src_fmt \
bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
q1, q11, d0, d1, d20, d21, d22, d23 \
q3, q9, d4, d5, d16, d17, d18, d19
pld [TMP1, PF_OFFS]
@ -546,8 +546,8 @@
vmlsl.u16 q2, d6, d30
vmlal.u16 q2, d7, d30
vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
bilinear_load_mask mask_fmt, 4, d22
bilinear_load_dst dst_fmt, op, 4, d2, d3, q1
bilinear_load_mask \mask_fmt, 4, d22
bilinear_load_dst \dst_fmt, \op, 4, d2, d3, q1
pld [TMP1, PF_OFFS]
vmlsl.u16 q8, d18, d31
vmlal.u16 q8, d19, d31
@ -556,21 +556,21 @@
vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
bilinear_duplicate_mask mask_fmt, 4, d22
bilinear_duplicate_mask \mask_fmt, 4, d22
vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vmovn.u16 d0, q0
vmovn.u16 d1, q2
vadd.u16 q12, q12, q13
bilinear_interleave_src_dst \
mask_fmt, op, 4, d0, d1, q0, d2, d3, q1
\mask_fmt, \op, 4, d0, d1, q0, d2, d3, q1
bilinear_apply_mask_to_src \
mask_fmt, 4, d0, d1, q0, d22, \
\mask_fmt, 4, d0, d1, q0, d22, \
q3, q8, q9, q10
bilinear_combine \
op, 4, d0, d1, q0, d2, d3, q1, \
\op, 4, d0, d1, q0, d2, d3, q1, \
q3, q8, q9, q10, d23
bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0
bilinear_store_&dst_fmt 4, q2, q3
bilinear_deinterleave_dst \mask_fmt, \op, 4, d0, d1, q0
bilinear_store_\()\dst_fmt 4, q2, q3
.endm
.set BILINEAR_FLAG_USE_MASK, 1
@ -610,14 +610,14 @@
prefetch_distance, \
flags
pixman_asm_function fname
.if pixblock_size == 8
.elseif pixblock_size == 4
pixman_asm_function \fname
.if \pixblock_size == 8
.elseif \pixblock_size == 4
.else
.error unsupported pixblock size
.endif
.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
OUT .req r0
TOP .req r1
BOTTOM .req r2
@ -635,7 +635,7 @@ pixman_asm_function fname
mov ip, sp
push {r4, r5, r6, r7, r8, r9}
mov PF_OFFS, #prefetch_distance
mov PF_OFFS, #\prefetch_distance
ldmia ip, {WB, X, UX, WIDTH}
.else
OUT .req r0
@ -654,17 +654,17 @@ pixman_asm_function fname
TMP4 .req r10
STRIDE .req r3
.set prefetch_offset, prefetch_distance
.set prefetch_offset, \prefetch_distance
mov ip, sp
push {r4, r5, r6, r7, r8, r9, r10, ip}
mov PF_OFFS, #prefetch_distance
mov PF_OFFS, #\prefetch_distance
ldmia ip, {WT, WB, X, UX, WIDTH}
.endif
mul PF_OFFS, PF_OFFS, UX
.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
vpush {d8-d15}
.endif
@ -683,11 +683,11 @@ pixman_asm_function fname
/* ensure good destination alignment */
cmp WIDTH, #1
blt 0f
tst OUT, #(1 << dst_bpp_shift)
tst OUT, #(1 << \dst_bpp_shift)
beq 0f
vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vadd.u16 q12, q12, q13
bilinear_process_last_pixel
\bilinear_process_last_pixel
sub WIDTH, WIDTH, #1
0:
vadd.u16 q13, q13, q13
@ -696,53 +696,53 @@ pixman_asm_function fname
cmp WIDTH, #2
blt 0f
tst OUT, #(1 << (dst_bpp_shift + 1))
tst OUT, #(1 << (\dst_bpp_shift + 1))
beq 0f
bilinear_process_two_pixels
\bilinear_process_two_pixels
sub WIDTH, WIDTH, #2
0:
.if pixblock_size == 8
.if \pixblock_size == 8
cmp WIDTH, #4
blt 0f
tst OUT, #(1 << (dst_bpp_shift + 2))
tst OUT, #(1 << (\dst_bpp_shift + 2))
beq 0f
bilinear_process_four_pixels
\bilinear_process_four_pixels
sub WIDTH, WIDTH, #4
0:
.endif
subs WIDTH, WIDTH, #pixblock_size
subs WIDTH, WIDTH, #\pixblock_size
blt 1f
mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
bilinear_process_pixblock_head
subs WIDTH, WIDTH, #pixblock_size
mov PF_OFFS, PF_OFFS, asr #(16 - \src_bpp_shift)
\bilinear_process_pixblock_head
subs WIDTH, WIDTH, #\pixblock_size
blt 5f
0:
bilinear_process_pixblock_tail_head
subs WIDTH, WIDTH, #pixblock_size
\bilinear_process_pixblock_tail_head
subs WIDTH, WIDTH, #\pixblock_size
bge 0b
5:
bilinear_process_pixblock_tail
\bilinear_process_pixblock_tail
1:
.if pixblock_size == 8
.if \pixblock_size == 8
tst WIDTH, #4
beq 2f
bilinear_process_four_pixels
\bilinear_process_four_pixels
2:
.endif
/* handle the remaining trailing pixels */
tst WIDTH, #2
beq 2f
bilinear_process_two_pixels
\bilinear_process_two_pixels
2:
tst WIDTH, #1
beq 3f
bilinear_process_last_pixel
\bilinear_process_last_pixel
3:
.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
vpop {d8-d15}
.endif
.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
pop {r4, r5, r6, r7, r8, r9}
.else
pop {r4, r5, r6, r7, r8, r9, r10, ip}
@ -762,11 +762,13 @@ pixman_asm_function fname
.unreq TMP3
.unreq TMP4
.unreq STRIDE
.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
.if ((\flags) & BILINEAR_FLAG_USE_MASK) != 0
.unreq MASK
.endif
#ifndef __clang__
.endfunc
#endif
.endm

View file
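
In this file the prefetch helper PF takes the instruction mnemonic as its first macro argument, so the call sites gain an explicit comma after the mnemonic (PF add, PF_X, ...), since clang's integrated assembler expects macro arguments to be comma-separated where GNU as also tolerates plain spaces. The #ifdef __clang__ block added near the top maps the pre-UAL mnemonic spellings used at those call sites onto the UAL ordering clang expects. Roughly, with illustrative register operands:

@ Pre-UAL spelling (condition code written before the size/flags
@ suffix), which GNU as accepts:
        ldrgeb  r4, [r5]                @ conditional byte load
        subges  r6, r6, #0x10           @ conditional subtract, set flags
@ UAL spelling expected by clang's integrated assembler, which the
@ added #defines translate the above into:
        ldrbge  r4, [r5]
        subsge  r6, r6, #0x10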

@ -34,6 +34,12 @@
* - pixman_composite_over_n_8_0565_asm_neon
*/
#ifdef __clang__
#define ldrgeb ldrbge
#define subges subsge
#define subpls subspl
#endif
/* Prevent the stack from becoming executable for no reason... */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
@ -260,13 +266,13 @@
vshrn.u16 d7, q2, #3
vsli.u16 q2, q2, #5
vshll.u8 q14, d16, #8
PF add PF_X, PF_X, #8
PF add, PF_X, PF_X, #8
vshll.u8 q8, d19, #8
PF tst PF_CTL, #0xF
PF tst, PF_CTL, #0xF
vsri.u8 d6, d6, #5
PF addne PF_X, PF_X, #8
PF addne, PF_X, PF_X, #8
vmvn.8 d3, d3
PF subne PF_CTL, PF_CTL, #1
PF subne, PF_CTL, PF_CTL, #1
vsri.u8 d7, d7, #6
vshrn.u16 d30, q2, #2
vmull.u8 q10, d3, d6
@ -275,18 +281,18 @@
vmull.u8 q12, d3, d30
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
vsri.u16 q14, q8, #5
PF cmp PF_X, ORIG_W
PF cmp, PF_X, ORIG_W
vshll.u8 q9, d18, #8
vrshr.u16 q13, q10, #8
PF subge PF_X, PF_X, ORIG_W
PF subge, PF_X, PF_X, ORIG_W
vrshr.u16 q3, q11, #8
vrshr.u16 q15, q12, #8
PF subges PF_CTL, PF_CTL, #0x10
PF subges, PF_CTL, PF_CTL, #0x10
vsri.u16 q14, q9, #11
PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
vraddhn.u16 d20, q10, q13
vraddhn.u16 d23, q11, q3
PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vraddhn.u16 d22, q12, q15
vst1.16 {d28, d29}, [DST_W, :128]!
.endm
@ -434,20 +440,20 @@ generate_composite_function \
.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
vsri.u16 q14, q8, #5
PF add PF_X, PF_X, #8
PF tst PF_CTL, #0xF
PF add, PF_X, PF_X, #8
PF tst, PF_CTL, #0xF
fetch_src_pixblock
PF addne PF_X, PF_X, #8
PF subne PF_CTL, PF_CTL, #1
PF addne, PF_X, PF_X, #8
PF subne, PF_CTL, PF_CTL, #1
vsri.u16 q14, q9, #11
PF cmp PF_X, ORIG_W
PF cmp, PF_X, ORIG_W
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
vshll.u8 q8, d1, #8
vst1.16 {d28, d29}, [DST_W, :128]!
PF subge PF_X, PF_X, ORIG_W
PF subges PF_CTL, PF_CTL, #0x10
PF subge, PF_X, PF_X, ORIG_W
PF subges, PF_CTL, PF_CTL, #0x10
vshll.u8 q14, d2, #8
PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
vshll.u8 q9, d0, #8
.endm
@ -509,20 +515,20 @@ generate_composite_function \
.macro pixman_composite_add_8_8_process_pixblock_tail_head
fetch_src_pixblock
PF add PF_X, PF_X, #32
PF tst PF_CTL, #0xF
PF add, PF_X, PF_X, #32
PF tst, PF_CTL, #0xF
vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
PF addne PF_X, PF_X, #32
PF subne PF_CTL, PF_CTL, #1
PF addne, PF_X, PF_X, #32
PF subne, PF_CTL, PF_CTL, #1
vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
PF cmp PF_X, ORIG_W
PF cmp, PF_X, ORIG_W
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
PF subge PF_X, PF_X, ORIG_W
PF subges PF_CTL, PF_CTL, #0x10
PF subge, PF_X, PF_X, ORIG_W
PF subges, PF_CTL, PF_CTL, #0x10
vqadd.u8 q14, q0, q2
PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vqadd.u8 q15, q1, q3
.endm
@ -541,20 +547,20 @@ generate_composite_function \
.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
fetch_src_pixblock
PF add PF_X, PF_X, #8
PF tst PF_CTL, #0xF
PF add, PF_X, PF_X, #8
PF tst, PF_CTL, #0xF
vld1.32 {d4, d5, d6, d7}, [DST_R, :128]!
PF addne PF_X, PF_X, #8
PF subne PF_CTL, PF_CTL, #1
PF addne, PF_X, PF_X, #8
PF subne, PF_CTL, PF_CTL, #1
vst1.32 {d28, d29, d30, d31}, [DST_W, :128]!
PF cmp PF_X, ORIG_W
PF cmp, PF_X, ORIG_W
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
PF subge PF_X, PF_X, ORIG_W
PF subges PF_CTL, PF_CTL, #0x10
PF subge, PF_X, PF_X, ORIG_W
PF subges, PF_CTL, PF_CTL, #0x10
vqadd.u8 q14, q0, q2
PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vqadd.u8 q15, q1, q3
.endm
@ -604,16 +610,16 @@ generate_composite_function_single_scanline \
.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
vrshr.u16 q14, q8, #8
PF add PF_X, PF_X, #8
PF tst PF_CTL, #0xF
PF add, PF_X, PF_X, #8
PF tst, PF_CTL, #0xF
vrshr.u16 q15, q9, #8
vrshr.u16 q12, q10, #8
vrshr.u16 q13, q11, #8
PF addne PF_X, PF_X, #8
PF subne PF_CTL, PF_CTL, #1
PF addne, PF_X, PF_X, #8
PF subne, PF_CTL, PF_CTL, #1
vraddhn.u16 d28, q14, q8
vraddhn.u16 d29, q15, q9
PF cmp PF_X, ORIG_W
PF cmp, PF_X, ORIG_W
vraddhn.u16 d30, q12, q10
vraddhn.u16 d31, q13, q11
fetch_src_pixblock
@ -621,13 +627,13 @@ generate_composite_function_single_scanline \
vmvn.8 d22, d3
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
PF subge PF_X, PF_X, ORIG_W
PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q8, d22, d4
PF subges PF_CTL, PF_CTL, #0x10
PF subsge, PF_CTL, PF_CTL, #0x10
vmull.u8 q9, d22, d5
PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
vmull.u8 q10, d22, d6
PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vmull.u8 q11, d22, d7
.endm
@ -656,16 +662,16 @@ generate_composite_function_single_scanline \
.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
vrshr.u16 q14, q8, #8
PF add PF_X, PF_X, #8
PF tst PF_CTL, #0xF
PF add, PF_X, PF_X, #8
PF tst, PF_CTL, #0xF
vrshr.u16 q15, q9, #8
vrshr.u16 q12, q10, #8
vrshr.u16 q13, q11, #8
PF addne PF_X, PF_X, #8
PF subne PF_CTL, PF_CTL, #1
PF addne, PF_X, PF_X, #8
PF subne, PF_CTL, PF_CTL, #1
vraddhn.u16 d28, q14, q8
vraddhn.u16 d29, q15, q9
PF cmp PF_X, ORIG_W
PF cmp, PF_X, ORIG_W
vraddhn.u16 d30, q12, q10
vraddhn.u16 d31, q13, q11
vqadd.u8 q14, q0, q14
@ -675,13 +681,13 @@ generate_composite_function_single_scanline \
vmvn.8 d22, d3
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
PF subge PF_X, PF_X, ORIG_W
PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q8, d22, d4
PF subges PF_CTL, PF_CTL, #0x10
PF subges, PF_CTL, PF_CTL, #0x10
vmull.u8 q9, d22, d5
PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
vmull.u8 q10, d22, d6
PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vmull.u8 q11, d22, d7
.endm
@ -742,20 +748,20 @@ generate_composite_function_single_scanline \
vraddhn.u16 d31, q3, q11
vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
vqadd.u8 q14, q0, q14
PF add PF_X, PF_X, #8
PF tst PF_CTL, #0x0F
PF addne PF_X, PF_X, #8
PF subne PF_CTL, PF_CTL, #1
PF add, PF_X, PF_X, #8
PF tst, PF_CTL, #0x0F
PF addne, PF_X, PF_X, #8
PF subne, PF_CTL, PF_CTL, #1
vqadd.u8 q15, q1, q15
PF cmp PF_X, ORIG_W
PF cmp, PF_X, ORIG_W
vmull.u8 q8, d24, d4
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
vmull.u8 q9, d24, d5
PF subge PF_X, PF_X, ORIG_W
PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q10, d24, d6
PF subges PF_CTL, PF_CTL, #0x10
PF subges, PF_CTL, PF_CTL, #0x10
vmull.u8 q11, d24, d7
PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
.endm
@ -784,16 +790,16 @@ generate_composite_function \
.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
vrshr.u16 q14, q8, #8
PF add PF_X, PF_X, #8
PF tst PF_CTL, #0xF
PF add, PF_X, PF_X, #8
PF tst, PF_CTL, #0xF
vrshr.u16 q15, q9, #8
vrshr.u16 q12, q10, #8
vrshr.u16 q13, q11, #8
PF addne PF_X, PF_X, #8
PF subne PF_CTL, PF_CTL, #1
PF addne, PF_X, PF_X, #8
PF subne, PF_CTL, PF_CTL, #1
vraddhn.u16 d28, q14, q8
vraddhn.u16 d29, q15, q9
PF cmp PF_X, ORIG_W
PF cmp, PF_X, ORIG_W
vraddhn.u16 d30, q12, q10
vraddhn.u16 d31, q13, q11
vqadd.u8 q14, q0, q14
@ -802,12 +808,12 @@ generate_composite_function \
vmvn.8 d22, d3
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
PF subge PF_X, PF_X, ORIG_W
PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q8, d22, d4
PF subges PF_CTL, PF_CTL, #0x10
PF subges, PF_CTL, PF_CTL, #0x10
vmull.u8 q9, d22, d5
vmull.u8 q10, d22, d6
PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vmull.u8 q11, d22, d7
.endm
@ -1245,23 +1251,23 @@ generate_composite_function \
.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
fetch_mask_pixblock
PF add PF_X, PF_X, #8
PF add, PF_X, PF_X, #8
vrshrn.u16 d28, q8, #8
PF tst PF_CTL, #0x0F
PF tst, PF_CTL, #0x0F
vrshrn.u16 d29, q9, #8
PF addne PF_X, PF_X, #8
PF addne, PF_X, PF_X, #8
vrshrn.u16 d30, q10, #8
PF subne PF_CTL, PF_CTL, #1
PF subne, PF_CTL, PF_CTL, #1
vrshrn.u16 d31, q11, #8
PF cmp PF_X, ORIG_W
PF cmp, PF_X, ORIG_W
vmull.u8 q8, d24, d0
PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
vmull.u8 q9, d24, d1
PF subge PF_X, PF_X, ORIG_W
PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q10, d24, d2
PF subges PF_CTL, PF_CTL, #0x10
PF subges, PF_CTL, PF_CTL, #0x10
vmull.u8 q11, d24, d3
PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
vrsra.u16 q8, q8, #8
vrsra.u16 q9, q9, #8
@ -1314,23 +1320,23 @@ generate_composite_function \
.macro pixman_composite_src_n_8_8_process_pixblock_tail_head
fetch_mask_pixblock
PF add PF_X, PF_X, #8
PF add, PF_X, PF_X, #8
vrshrn.u16 d28, q0, #8
PF tst PF_CTL, #0x0F
PF tst, PF_CTL, #0x0F
vrshrn.u16 d29, q1, #8
PF addne PF_X, PF_X, #8
PF addne, PF_X, PF_X, #8
vrshrn.u16 d30, q2, #8
PF subne PF_CTL, PF_CTL, #1
PF subne, PF_CTL, PF_CTL, #1
vrshrn.u16 d31, q3, #8
PF cmp PF_X, ORIG_W
PF cmp, PF_X, ORIG_W
vmull.u8 q0, d24, d16
PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
vmull.u8 q1, d25, d16
PF subge PF_X, PF_X, ORIG_W
PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q2, d26, d16
PF subges PF_CTL, PF_CTL, #0x10
PF subges, PF_CTL, PF_CTL, #0x10
vmull.u8 q3, d27, d16
PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
vrsra.u16 q0, q0, #8
vrsra.u16 q1, q1, #8
@ -1408,27 +1414,27 @@ generate_composite_function \
vrshr.u16 q15, q9, #8
fetch_mask_pixblock
vrshr.u16 q6, q10, #8
PF add PF_X, PF_X, #8
PF add, PF_X, PF_X, #8
vrshr.u16 q7, q11, #8
PF tst PF_CTL, #0x0F
PF tst, PF_CTL, #0x0F
vraddhn.u16 d28, q14, q8
PF addne PF_X, PF_X, #8
PF addne, PF_X, PF_X, #8
vraddhn.u16 d29, q15, q9
PF subne PF_CTL, PF_CTL, #1
PF subne, PF_CTL, PF_CTL, #1
vraddhn.u16 d30, q6, q10
PF cmp PF_X, ORIG_W
PF cmp, PF_X, ORIG_W
vraddhn.u16 d31, q7, q11
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
vmull.u8 q6, d24, d8
PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
vmull.u8 q7, d24, d9
PF subge PF_X, PF_X, ORIG_W
PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q8, d24, d10
PF subges PF_CTL, PF_CTL, #0x10
PF subges, PF_CTL, PF_CTL, #0x10
vmull.u8 q9, d24, d11
PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vqadd.u8 q14, q0, q14
PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
vqadd.u8 q15, q1, q15
vrshr.u16 q10, q6, #8
vrshr.u16 q11, q7, #8
@ -2425,21 +2431,21 @@ generate_composite_function \
vrshr.u16 q13, q10, #8
fetch_src_pixblock
vraddhn.u16 d30, q11, q8
PF add PF_X, PF_X, #8
PF tst PF_CTL, #0xF
PF addne PF_X, PF_X, #8
PF subne PF_CTL, PF_CTL, #1
PF add, PF_X, PF_X, #8
PF tst, PF_CTL, #0xF
PF addne, PF_X, PF_X, #8
PF subne, PF_CTL, PF_CTL, #1
vraddhn.u16 d29, q12, q9
vraddhn.u16 d28, q13, q10
vmull.u8 q8, d3, d0
vmull.u8 q9, d3, d1
vmull.u8 q10, d3, d2
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
PF cmp PF_X, ORIG_W
PF cmp, PF_X, ORIG_W
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
PF subge PF_X, PF_X, ORIG_W
PF subges PF_CTL, PF_CTL, #0x10
PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
PF subge, PF_X, PF_X, ORIG_W
PF subges, PF_CTL, PF_CTL, #0x10
PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endm
generate_composite_function \
@ -2482,21 +2488,21 @@ generate_composite_function \
vrshr.u16 q13, q10, #8
fetch_src_pixblock
vraddhn.u16 d28, q11, q8
PF add PF_X, PF_X, #8
PF tst PF_CTL, #0xF
PF addne PF_X, PF_X, #8
PF subne PF_CTL, PF_CTL, #1
PF add, PF_X, PF_X, #8
PF tst, PF_CTL, #0xF
PF addne, PF_X, PF_X, #8
PF subne, PF_CTL, PF_CTL, #1
vraddhn.u16 d29, q12, q9
vraddhn.u16 d30, q13, q10
vmull.u8 q8, d3, d0
vmull.u8 q9, d3, d1
vmull.u8 q10, d3, d2
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
PF cmp PF_X, ORIG_W
PF cmp, PF_X, ORIG_W
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
PF subge PF_X, PF_X, ORIG_W
PF subges PF_CTL, PF_CTL, #0x10
PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
PF subge, PF_X, PF_X, ORIG_W
PF subges, PF_CTL, PF_CTL, #0x10
PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endm
generate_composite_function \
@ -2841,28 +2847,28 @@ generate_composite_function_nearest_scanline \
mov TMP1, X, asr #16
add X, X, UX
add TMP1, TOP, TMP1, asl #2
vld1.32 {reg1}, [TMP1], STRIDE
vld1.32 {reg2}, [TMP1]
vld1.32 {\reg1}, [TMP1], STRIDE
vld1.32 {\reg2}, [TMP1]
.endm
.macro bilinear_load_0565 reg1, reg2, tmp
mov TMP1, X, asr #16
add X, X, UX
add TMP1, TOP, TMP1, asl #1
vld1.32 {reg2[0]}, [TMP1], STRIDE
vld1.32 {reg2[1]}, [TMP1]
convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
vld1.32 {\reg2[0]}, [TMP1], STRIDE
vld1.32 {\reg2[1]}, [TMP1]
convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
.endm
.macro bilinear_load_and_vertical_interpolate_two_8888 \
acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
bilinear_load_8888 reg1, reg2, tmp1
vmull.u8 acc1, reg1, d28
vmlal.u8 acc1, reg2, d29
bilinear_load_8888 reg3, reg4, tmp2
vmull.u8 acc2, reg3, d28
vmlal.u8 acc2, reg4, d29
bilinear_load_8888 \reg1, \reg2, \tmp1
vmull.u8 \acc1, \reg1, d28
vmlal.u8 \acc1, \reg2, d29
bilinear_load_8888 \reg3, \reg4, \tmp2
vmull.u8 \acc2, \reg3, d28
vmlal.u8 \acc2, \reg4, d29
.endm
.macro bilinear_load_and_vertical_interpolate_four_8888 \
@ -2870,9 +2876,9 @@ generate_composite_function_nearest_scanline \
yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
bilinear_load_and_vertical_interpolate_two_8888 \
xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
\xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
bilinear_load_and_vertical_interpolate_two_8888 \
yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
\yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
.endm
.macro bilinear_load_and_vertical_interpolate_two_0565 \
@ -2884,19 +2890,19 @@ generate_composite_function_nearest_scanline \
mov TMP2, X, asr #16
add X, X, UX
add TMP2, TOP, TMP2, asl #1
vld1.32 {acc2lo[0]}, [TMP1], STRIDE
vld1.32 {acc2hi[0]}, [TMP2], STRIDE
vld1.32 {acc2lo[1]}, [TMP1]
vld1.32 {acc2hi[1]}, [TMP2]
convert_0565_to_x888 acc2, reg3, reg2, reg1
vzip.u8 reg1, reg3
vzip.u8 reg2, reg4
vzip.u8 reg3, reg4
vzip.u8 reg1, reg2
vmull.u8 acc1, reg1, d28
vmlal.u8 acc1, reg2, d29
vmull.u8 acc2, reg3, d28
vmlal.u8 acc2, reg4, d29
vld1.32 {\acc2lo[0]}, [TMP1], STRIDE
vld1.32 {\acc2hi[0]}, [TMP2], STRIDE
vld1.32 {\acc2lo[1]}, [TMP1]
vld1.32 {\acc2hi[1]}, [TMP2]
convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
vzip.u8 \reg1, \reg3
vzip.u8 \reg2, \reg4
vzip.u8 \reg3, \reg4
vzip.u8 \reg1, \reg2
vmull.u8 \acc1, \reg1, d28
vmlal.u8 \acc1, \reg2, d29
vmull.u8 \acc2, \reg3, d28
vmlal.u8 \acc2, \reg4, d29
.endm
.macro bilinear_load_and_vertical_interpolate_four_0565 \
@ -2909,49 +2915,49 @@ generate_composite_function_nearest_scanline \
mov TMP2, X, asr #16
add X, X, UX
add TMP2, TOP, TMP2, asl #1
vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
vld1.32 {xacc2lo[1]}, [TMP1]
vld1.32 {xacc2hi[1]}, [TMP2]
convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
vld1.32 {\xacc2lo[0]}, [TMP1], STRIDE
vld1.32 {\xacc2hi[0]}, [TMP2], STRIDE
vld1.32 {\xacc2lo[1]}, [TMP1]
vld1.32 {\xacc2hi[1]}, [TMP2]
convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
mov TMP1, X, asr #16
add X, X, UX
add TMP1, TOP, TMP1, asl #1
mov TMP2, X, asr #16
add X, X, UX
add TMP2, TOP, TMP2, asl #1
vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
vzip.u8 xreg1, xreg3
vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
vzip.u8 xreg2, xreg4
vld1.32 {yacc2lo[1]}, [TMP1]
vzip.u8 xreg3, xreg4
vld1.32 {yacc2hi[1]}, [TMP2]
vzip.u8 xreg1, xreg2
convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
vmull.u8 xacc1, xreg1, d28
vzip.u8 yreg1, yreg3
vmlal.u8 xacc1, xreg2, d29
vzip.u8 yreg2, yreg4
vmull.u8 xacc2, xreg3, d28
vzip.u8 yreg3, yreg4
vmlal.u8 xacc2, xreg4, d29
vzip.u8 yreg1, yreg2
vmull.u8 yacc1, yreg1, d28
vmlal.u8 yacc1, yreg2, d29
vmull.u8 yacc2, yreg3, d28
vmlal.u8 yacc2, yreg4, d29
vld1.32 {\yacc2lo[0]}, [TMP1], STRIDE
vzip.u8 \xreg1, \xreg3
vld1.32 {\yacc2hi[0]}, [TMP2], STRIDE
vzip.u8 \xreg2, \xreg4
vld1.32 {\yacc2lo[1]}, [TMP1]
vzip.u8 \xreg3, \xreg4
vld1.32 {\yacc2hi[1]}, [TMP2]
vzip.u8 \xreg1, \xreg2
convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
vmull.u8 \xacc1, \xreg1, d28
vzip.u8 \yreg1, \yreg3
vmlal.u8 \xacc1, \xreg2, d29
vzip.u8 \yreg2, \yreg4
vmull.u8 \xacc2, \xreg3, d28
vzip.u8 \yreg3, \yreg4
vmlal.u8 \xacc2, \xreg4, d29
vzip.u8 \yreg1, \yreg2
vmull.u8 \yacc1, \yreg1, d28
vmlal.u8 \yacc1, \yreg2, d29
vmull.u8 \yacc2, \yreg3, d28
vmlal.u8 \yacc2, \yreg4, d29
.endm
.macro bilinear_store_8888 numpix, tmp1, tmp2
.if numpix == 4
.if \numpix == 4
vst1.32 {d0, d1}, [OUT, :128]!
.elseif numpix == 2
.elseif \numpix == 2
vst1.32 {d0}, [OUT, :64]!
.elseif numpix == 1
.elseif \numpix == 1
vst1.32 {d0[0]}, [OUT, :32]!
.else
.error bilinear_store_8888 numpix is unsupported
.error bilinear_store_8888 \numpix is unsupported
.endif
.endm
@ -2960,20 +2966,20 @@ generate_composite_function_nearest_scanline \
vuzp.u8 d2, d3
vuzp.u8 d1, d3
vuzp.u8 d0, d2
convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
.if numpix == 4
convert_8888_to_0565 d2, d1, d0, q1, \tmp1, \tmp2
.if \numpix == 4
vst1.16 {d2}, [OUT, :64]!
.elseif numpix == 2
.elseif \numpix == 2
vst1.32 {d2[0]}, [OUT, :32]!
.elseif numpix == 1
.elseif \numpix == 1
vst1.16 {d2[0]}, [OUT, :16]!
.else
.error bilinear_store_0565 numpix is unsupported
.error bilinear_store_0565 \numpix is unsupported
.endif
.endm
.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
bilinear_load_&src_fmt d0, d1, d2
bilinear_load_\()\src_fmt d0, d1, d2
vmull.u8 q1, d0, d28
vmlal.u8 q1, d1, d29
/* 5 cycles bubble */
@ -2985,11 +2991,11 @@ generate_composite_function_nearest_scanline \
/* 3 cycles bubble */
vmovn.u16 d0, q0
/* 1 cycle bubble */
bilinear_store_&dst_fmt 1, q2, q3
bilinear_store_\()\dst_fmt 1, q2, q3
.endm
.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
bilinear_load_and_vertical_interpolate_two_&src_fmt \
bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
q1, q11, d0, d1, d20, d21, d22, d23
vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q0, d2, d30
@ -3002,11 +3008,11 @@ generate_composite_function_nearest_scanline \
vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vadd.u16 q12, q12, q13
vmovn.u16 d0, q0
bilinear_store_&dst_fmt 2, q2, q3
bilinear_store_\()\dst_fmt 2, q2, q3
.endm
.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
bilinear_load_and_vertical_interpolate_four_&src_fmt \
bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
q1, q11, d0, d1, d20, d21, d22, d23 \
q3, q9, d4, d5, d16, d17, d18, d19
pld [TMP1, PF_OFFS]
@ -3034,54 +3040,54 @@ generate_composite_function_nearest_scanline \
vmovn.u16 d0, q0
vmovn.u16 d1, q2
vadd.u16 q12, q12, q13
bilinear_store_&dst_fmt 4, q2, q3
bilinear_store_\()\dst_fmt 4, q2, q3
.endm
.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
.else
bilinear_interpolate_four_pixels src_fmt, dst_fmt
bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
.endif
.endm
.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
.endif
.endm
.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
.else
bilinear_interpolate_four_pixels src_fmt, dst_fmt
bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
.endif
.endm
.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
.else
bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
.endif
.endm
.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
.else
bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
.endif
.endm
.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
.else
bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
.endif
.endm
@ -3106,7 +3112,7 @@ generate_composite_function_nearest_scanline \
src_bpp_shift, dst_bpp_shift, \
prefetch_distance, flags
pixman_asm_function fname
pixman_asm_function \fname
OUT .req r0
TOP .req r1
BOTTOM .req r2
@ -3124,11 +3130,11 @@ pixman_asm_function fname
mov ip, sp
push {r4, r5, r6, r7, r8, r9}
mov PF_OFFS, #prefetch_distance
mov PF_OFFS, #\prefetch_distance
ldmia ip, {WB, X, UX, WIDTH}
mul PF_OFFS, PF_OFFS, UX
.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
vpush {d8-d15}
.endif
@ -3151,7 +3157,7 @@ pixman_asm_function fname
beq 0f
vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vadd.u16 q12, q12, q13
bilinear_interpolate_last_pixel src_fmt, dst_fmt
bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
sub WIDTH, WIDTH, #1
0:
vadd.u16 q13, q13, q13
@ -3162,62 +3168,62 @@ pixman_asm_function fname
blt 0f
tst OUT, #(1 << (dst_bpp_shift + 1))
beq 0f
bilinear_interpolate_two_pixels src_fmt, dst_fmt
bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
sub WIDTH, WIDTH, #2
0:
.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
.if ((\flags) & BILINEAR_FLAG_UNROLL_8) != 0
/*********** 8 pixels per iteration *****************/
cmp WIDTH, #4
blt 0f
tst OUT, #(1 << (dst_bpp_shift + 2))
beq 0f
bilinear_interpolate_four_pixels src_fmt, dst_fmt
bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
sub WIDTH, WIDTH, #4
0:
subs WIDTH, WIDTH, #8
blt 1f
mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
bilinear_interpolate_eight_pixels_head \src_fmt, \dst_fmt
subs WIDTH, WIDTH, #8
blt 5f
0:
bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
bilinear_interpolate_eight_pixels_tail_head \src_fmt, \dst_fmt
subs WIDTH, WIDTH, #8
bge 0b
5:
bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
bilinear_interpolate_eight_pixels_tail \src_fmt, \dst_fmt
1:
tst WIDTH, #4
beq 2f
bilinear_interpolate_four_pixels src_fmt, dst_fmt
bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
2:
.else
/*********** 4 pixels per iteration *****************/
subs WIDTH, WIDTH, #4
blt 1f
mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
subs WIDTH, WIDTH, #4
blt 5f
0:
bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
subs WIDTH, WIDTH, #4
bge 0b
5:
bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
1:
/****************************************************/
.endif
/* handle the remaining trailing pixels */
tst WIDTH, #2
beq 2f
bilinear_interpolate_two_pixels src_fmt, dst_fmt
bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
2:
tst WIDTH, #1
beq 3f
bilinear_interpolate_last_pixel src_fmt, dst_fmt
bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
3:
.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
vpop {d8-d15}
.endif
pop {r4, r5, r6, r7, r8, r9}
@ -3236,7 +3242,9 @@ pixman_asm_function fname
.unreq TMP3
.unreq TMP4
.unreq STRIDE
#ifndef __clang__
.endfunc
#endif
.endm

The diff for this file is not shown because it is too large.

View file
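
This file builds its load/store mnemonics from a width-suffix macro argument t (for example h for a halfword access), so ldr&t becomes ldr\()\t. The conditional single-pixel tail additionally needs the suffixes in a different order per assembler: GNU as takes the pre-UAL ldrne\()\t (condition first), while clang's integrated assembler takes the UAL ldr\()\t\()ne (width suffix first), hence the #ifdef __clang__ split. Assuming the halfword variant (t = h), the tail expands roughly to:

@ GNU as (pre-UAL order):
        ldrneh  TMP1, [SRC, TMP1]
        strneh  TMP1, [DST]
@ clang integrated assembler (UAL order):
        ldrhne  TMP1, [SRC, TMP1]
        strhne  TMP1, [DST]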

@ -25,6 +25,10 @@
*
*/
#ifdef __clang__
#define subpls subspl
#endif
/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
@ -62,7 +66,7 @@
prefetch_distance, \
prefetch_braking_distance
pixman_asm_function fname
pixman_asm_function \fname
W .req r0
DST .req r1
SRC .req r2
@ -76,38 +80,38 @@ pixman_asm_function fname
ldr UNIT_X, [sp]
push {r4, r5, r6, r7, r8, r10}
mvn VXMASK, #((1 << bpp_shift) - 1)
mvn VXMASK, #((1 << \bpp_shift) - 1)
ldr SRC_WIDTH_FIXED, [sp, #28]
/* define helper macro */
.macro scale_2_pixels
ldr&t TMP1, [SRC, TMP1]
and TMP2, VXMASK, VX, asr #(16 - bpp_shift)
ldr\()\t TMP1, [SRC, TMP1]
and TMP2, VXMASK, VX, asr #(16 - \bpp_shift)
adds VX, VX, UNIT_X
str&t TMP1, [DST], #(1 << bpp_shift)
str\()\t TMP1, [DST], #(1 << \bpp_shift)
9: subpls VX, VX, SRC_WIDTH_FIXED
bpl 9b
ldr&t TMP2, [SRC, TMP2]
and TMP1, VXMASK, VX, asr #(16 - bpp_shift)
ldr\()\t TMP2, [SRC, TMP2]
and TMP1, VXMASK, VX, asr #(16 - \bpp_shift)
adds VX, VX, UNIT_X
str&t TMP2, [DST], #(1 << bpp_shift)
str\()\t TMP2, [DST], #(1 << \bpp_shift)
9: subpls VX, VX, SRC_WIDTH_FIXED
bpl 9b
.endm
/* now do the scaling */
and TMP1, VXMASK, VX, asr #(16 - bpp_shift)
and TMP1, VXMASK, VX, asr #(16 - \bpp_shift)
adds VX, VX, UNIT_X
9: subpls VX, VX, SRC_WIDTH_FIXED
bpl 9b
subs W, W, #(8 + prefetch_braking_distance)
subs W, W, #(8 + \prefetch_braking_distance)
blt 2f
/* calculate prefetch offset */
mov PF_OFFS, #prefetch_distance
mov PF_OFFS, #\prefetch_distance
mla PF_OFFS, UNIT_X, PF_OFFS, VX
1: /* main loop, process 8 pixels per iteration with prefetch */
pld [SRC, PF_OFFS, asr #(16 - bpp_shift)]
pld [SRC, PF_OFFS, asr #(16 - \bpp_shift)]
add PF_OFFS, UNIT_X, lsl #3
scale_2_pixels
scale_2_pixels
@ -116,7 +120,7 @@ pixman_asm_function fname
subs W, W, #8
bge 1b
2:
subs W, W, #(4 - 8 - prefetch_braking_distance)
subs W, W, #(4 - 8 - \prefetch_braking_distance)
blt 2f
1: /* process the remaining pixels */
scale_2_pixels
@ -129,8 +133,13 @@ pixman_asm_function fname
scale_2_pixels
2:
tst W, #1
ldrne&t TMP1, [SRC, TMP1]
strne&t TMP1, [DST]
#ifdef __clang__
ldr\()\t\()ne TMP1, [SRC, TMP1]
str\()\t\()ne TMP1, [DST]
#else
ldrne\()\t TMP1, [SRC, TMP1]
strne\()\t TMP1, [DST]
#endif
/* cleanup helper macro */
.purgem scale_2_pixels
.unreq DST
@ -146,7 +155,9 @@ pixman_asm_function fname
/* return */
pop {r4, r5, r6, r7, r8, r10}
bx lr
#ifndef __clang__
.endfunc
#endif
.endm
generate_nearest_scanline_func \

View file
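
The macros in this header address their working registers through .req aliases named WK0–WK7 and receive the register index as a numeric argument, so the old WK&reg spelling becomes WK\()\reg (and likewise uqadd8&cond becomes uqadd8\()\cond for conditionally executed mnemonics). A minimal sketch of the register-name construction, with illustrative alias assignments and an illustrative macro rather than ones from the header:

WK0     .req    r8                      @ illustrative choices; the real
WK1     .req    r9                      @ header picks its own registers

/* Zero one working register, selected by numeric index. */
.macro  clear_wk        reg
        mov     WK\()\reg, #0           @ reg=1 expands to "mov WK1, #0"
.endm

        clear_wk 0
        clear_wk 1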

@ -25,6 +25,11 @@
*
*/
#ifdef __clang__
#define adceqs adcseq
#define ldmnedb ldmdbne
#endif
/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
@ -57,7 +62,7 @@
.endm
.macro blit_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
pixld cond, numbytes, firstreg, SRC, unaligned_src
pixld \cond, \numbytes, \firstreg, SRC, \unaligned_src
.endm
.macro blit_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
@ -65,8 +70,8 @@
WK5 .req STRIDE_S
WK6 .req MASK
WK7 .req STRIDE_M
110: pixld , 16, 0, SRC, unaligned_src
pixld , 16, 4, SRC, unaligned_src
110: pixld , 16, 0, SRC, \unaligned_src
pixld , 16, 4, SRC, \unaligned_src
pld [SRC, SCRATCH]
pixst , 16, 0, DST
pixst , 16, 4, DST
@ -142,7 +147,7 @@ generate_composite_function \
WK5 .req STRIDE_S
WK6 .req MASK
WK7 .req STRIDE_M
pixst cond, numbytes, 4, DST
pixst \cond, \numbytes, 4, DST
.unreq WK4
.unreq WK5
.unreq WK6
@ -182,20 +187,20 @@ generate_composite_function \
/******************************************************************************/
.macro src_x888_8888_pixel, cond, reg
orr&cond WK&reg, WK&reg, #0xFF000000
orr\()\cond WK\()\reg, WK\()\reg, #0xFF000000
.endm
.macro pixman_composite_src_x888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
pixld cond, numbytes, firstreg, SRC, unaligned_src
pixld \cond, \numbytes, \firstreg, SRC, \unaligned_src
.endm
.macro pixman_composite_src_x888_8888_process_tail cond, numbytes, firstreg
src_x888_8888_pixel cond, %(firstreg+0)
.if numbytes >= 8
src_x888_8888_pixel cond, %(firstreg+1)
.if numbytes == 16
src_x888_8888_pixel cond, %(firstreg+2)
src_x888_8888_pixel cond, %(firstreg+3)
src_x888_8888_pixel \cond, %(\firstreg+0)
.if \numbytes >= 8
src_x888_8888_pixel \cond, %(\firstreg+1)
.if \numbytes == 16
src_x888_8888_pixel \cond, %(\firstreg+2)
src_x888_8888_pixel \cond, %(\firstreg+3)
.endif
.endif
.endm
@ -222,73 +227,73 @@ generate_composite_function \
.endm
.macro src_0565_8888_2pixels, reg1, reg2
and SCRATCH, WK&reg1, MASK @ 00000GGGGGG0000000000gggggg00000
bic WK&reg2, WK&reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
mov WK&reg1, WK&reg2, lsl #16 @ rrrrr000000bbbbb0000000000000000
mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG
bic WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
orr WK&reg1, WK&reg1, WK&reg1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000
orr WK&reg2, WK&reg2, WK&reg2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000
pkhtb WK&reg1, WK&reg1, WK&reg1, asr #5 @ rrrrrrrr--------bbbbbbbb--------
sel WK&reg1, WK&reg1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb--------
mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg
pkhtb WK&reg2, WK&reg2, WK&reg2, asr #5 @ RRRRRRRR--------BBBBBBBB--------
sel WK&reg2, WK&reg2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB--------
orr WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
orr WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
and SCRATCH, WK\()\reg1, MASK @ 00000GGGGGG0000000000gggggg00000
bic WK\()\reg2, WK\()\reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
mov WK\()\reg1, WK\()\reg2, lsl #16 @ rrrrr000000bbbbb0000000000000000
mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG
bic WK\()\reg2, WK\()\reg2, WK\()\reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
orr WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000
orr WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000
pkhtb WK\()\reg1, WK\()\reg1, WK\()\reg1, asr #5 @ rrrrrrrr--------bbbbbbbb--------
sel WK\()\reg1, WK\()\reg1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb--------
mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg
pkhtb WK\()\reg2, WK\()\reg2, WK\()\reg2, asr #5 @ RRRRRRRR--------BBBBBBBB--------
sel WK\()\reg2, WK\()\reg2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB--------
orr WK\()\reg1, STRIDE_M, WK\()\reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
orr WK\()\reg2, STRIDE_M, WK\()\reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
.endm
/* This version doesn't need STRIDE_M, but is one instruction longer.
It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
and SCRATCH, WK&reg1, MASK @ 00000GGGGGG0000000000gggggg00000
bic WK&reg1, WK&reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
mov WK&reg2, WK&reg1, lsr #16 @ 0000000000000000RRRRR000000BBBBB
mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000
bic WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
mov WK&reg2, WK&reg2, lsl #3 @ 0000000000000RRRRR000000BBBBB000
mov WK&reg1, WK&reg1, lsl #3 @ 0000000000000rrrrr000000bbbbb000
orr WK&reg2, WK&reg2, WK&reg2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB
orr WK&reg1, WK&reg1, WK&reg1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
pkhbt WK&reg2, WK&reg2, WK&reg2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB
pkhbt WK&reg1, WK&reg1, WK&reg1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
sel WK&reg2, SCRATCH, WK&reg2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB
sel WK&reg1, SCRATCH, WK&reg1 @ --------rrrrrrrrggggggggbbbbbbbb
orr WK&reg2, WK&reg2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
orr WK&reg1, WK&reg1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
and SCRATCH, WK\()\reg1, MASK @ 00000GGGGGG0000000000gggggg00000
bic WK\()\reg1, WK\()\reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
mov WK\()\reg2, WK\()\reg1, lsr #16 @ 0000000000000000RRRRR000000BBBBB
mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000
bic WK\()\reg1, WK\()\reg1, WK\()\reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
mov WK\()\reg2, WK\()\reg2, lsl #3 @ 0000000000000RRRRR000000BBBBB000
mov WK\()\reg1, WK\()\reg1, lsl #3 @ 0000000000000rrrrr000000bbbbb000
orr WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB
orr WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
pkhbt WK\()\reg2, WK\()\reg2, WK\()\reg2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB
pkhbt WK\()\reg1, WK\()\reg1, WK\()\reg1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
sel WK\()\reg2, SCRATCH, WK\()\reg2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB
sel WK\()\reg1, SCRATCH, WK\()\reg1 @ --------rrrrrrrrggggggggbbbbbbbb
orr WK\()\reg2, WK\()\reg2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
orr WK\()\reg1, WK\()\reg1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
*/
.macro src_0565_8888_1pixel, reg
bic SCRATCH, WK&reg, MASK @ 0000000000000000rrrrr000000bbbbb
and WK&reg, WK&reg, MASK @ 000000000000000000000gggggg00000
mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000
mov WK&reg, WK&reg, lsl #5 @ 0000000000000000gggggg0000000000
orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
orr WK&reg, WK&reg, WK&reg, lsr #6 @ 000000000000000gggggggggggg00000
pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
sel WK&reg, WK&reg, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb
orr WK&reg, WK&reg, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
bic SCRATCH, WK\()\reg, MASK @ 0000000000000000rrrrr000000bbbbb
and WK\()\reg, WK\()\reg, MASK @ 000000000000000000000gggggg00000
mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000
mov WK\()\reg, WK\()\reg, lsl #5 @ 0000000000000000gggggg0000000000
orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
orr WK\()\reg, WK\()\reg, WK\()\reg, lsr #6 @ 000000000000000gggggggggggg00000
pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
sel WK\()\reg, WK\()\reg, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb
orr WK\()\reg, WK\()\reg, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
.endm
.macro src_0565_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
.if numbytes == 16
pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
.elseif numbytes == 8
pixld , 4, firstreg, SRC, unaligned_src
.elseif numbytes == 4
pixld , 2, firstreg, SRC, unaligned_src
.if \numbytes == 16
pixldst ld,, 8, \firstreg, %(\firstreg+2),,, SRC, \unaligned_src
.elseif \numbytes == 8
pixld , 4, \firstreg, SRC, \unaligned_src
.elseif \numbytes == 4
pixld , 2, \firstreg, SRC, \unaligned_src
.endif
.endm
.macro src_0565_8888_process_tail cond, numbytes, firstreg
.if numbytes == 16
src_0565_8888_2pixels firstreg, %(firstreg+1)
src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
.elseif numbytes == 8
src_0565_8888_2pixels firstreg, %(firstreg+1)
.if \numbytes == 16
src_0565_8888_2pixels \firstreg, %(\firstreg+1)
src_0565_8888_2pixels %(\firstreg+2), %(\firstreg+3)
.elseif \numbytes == 8
src_0565_8888_2pixels \firstreg, %(\firstreg+1)
.else
src_0565_8888_1pixel firstreg
src_0565_8888_1pixel \firstreg
.endif
.endm
@@ -311,23 +316,23 @@ generate_composite_function \
.endm
.macro src_x888_0565_1pixel s, d
and WK&d, MASK, WK&s, lsr #3 @ 00000000000rrrrr00000000000bbbbb
and STRIDE_S, WK&s, #0xFC00 @ 0000000000000000gggggg0000000000
orr WK&d, WK&d, WK&d, lsr #5 @ 00000000000-----rrrrr000000bbbbb
orr WK&d, WK&d, STRIDE_S, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
and WK\()\d, MASK, WK\()\s, lsr #3 @ 00000000000rrrrr00000000000bbbbb
and STRIDE_S, WK\()\s, #0xFC00 @ 0000000000000000gggggg0000000000
orr WK\()\d, WK\()\d, WK\()\d, lsr #5 @ 00000000000-----rrrrr000000bbbbb
orr WK\()\d, WK\()\d, STRIDE_S, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
/* Top 16 bits are discarded during the following STRH */
.endm
.macro src_x888_0565_2pixels slo, shi, d, tmp
and SCRATCH, WK&shi, #0xFC00 @ 0000000000000000GGGGGG0000000000
and WK&tmp, MASK, WK&shi, lsr #3 @ 00000000000RRRRR00000000000BBBBB
and WK&shi, MASK, WK&slo, lsr #3 @ 00000000000rrrrr00000000000bbbbb
orr WK&tmp, WK&tmp, WK&tmp, lsr #5 @ 00000000000-----RRRRR000000BBBBB
orr WK&tmp, WK&tmp, SCRATCH, lsr #5 @ 00000000000-----RRRRRGGGGGGBBBBB
and SCRATCH, WK&slo, #0xFC00 @ 0000000000000000gggggg0000000000
orr WK&shi, WK&shi, WK&shi, lsr #5 @ 00000000000-----rrrrr000000bbbbb
orr WK&shi, WK&shi, SCRATCH, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
pkhbt WK&d, WK&shi, WK&tmp, lsl #16 @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
and SCRATCH, WK\()\shi, #0xFC00 @ 0000000000000000GGGGGG0000000000
and WK\()\tmp, MASK, WK\()\shi, lsr #3 @ 00000000000RRRRR00000000000BBBBB
and WK\()\shi, MASK, WK\()\slo, lsr #3 @ 00000000000rrrrr00000000000bbbbb
orr WK\()\tmp, WK\()\tmp, WK\()\tmp, lsr #5 @ 00000000000-----RRRRR000000BBBBB
orr WK\()\tmp, WK\()\tmp, SCRATCH, lsr #5 @ 00000000000-----RRRRRGGGGGGBBBBB
and SCRATCH, WK\()\slo, #0xFC00 @ 0000000000000000gggggg0000000000
orr WK\()\shi, WK\()\shi, WK\()\shi, lsr #5 @ 00000000000-----rrrrr000000bbbbb
orr WK\()\shi, WK\()\shi, SCRATCH, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
pkhbt WK\()\d, WK\()\shi, WK\()\tmp, lsl #16 @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
.endm
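
Going the other way, src_x888_0565_1pixel and src_x888_0565_2pixels simply keep the top 5/6/5 bits of each colour channel. A standalone C sketch of the per-pixel packing (illustrative only, not part of this patch; the function name is the editor's own):

#include <stdint.h>

/* Editor's illustration only: pack one x8r8g8b8 pixel into r5g6b5 by
 * truncating each channel, matching what src_x888_0565_1pixel computes. */
uint16_t pack_8888_to_0565(uint32_t p)
{
    return (uint16_t)(((p >> 8) & 0xf800) |    /* top 5 bits of red   */
                      ((p >> 5) & 0x07e0) |    /* top 6 bits of green */
                      ((p >> 3) & 0x001f));    /* top 5 bits of blue  */
}
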
.macro src_x888_0565_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
@@ -335,33 +340,33 @@ generate_composite_function \
WK5 .req STRIDE_M
WK6 .req WK3
WK7 .req ORIG_W
.if numbytes == 16
.if \numbytes == 16
pixld , 16, 4, SRC, 0
src_x888_0565_2pixels 4, 5, 0, 0
pixld , 8, 4, SRC, 0
src_x888_0565_2pixels 6, 7, 1, 1
pixld , 8, 6, SRC, 0
.else
pixld , numbytes*2, 4, SRC, 0
pixld , \numbytes*2, 4, SRC, 0
.endif
.endm
.macro src_x888_0565_process_tail cond, numbytes, firstreg
.if numbytes == 16
.if \numbytes == 16
src_x888_0565_2pixels 4, 5, 2, 2
src_x888_0565_2pixels 6, 7, 3, 4
.elseif numbytes == 8
.elseif \numbytes == 8
src_x888_0565_2pixels 4, 5, 1, 1
src_x888_0565_2pixels 6, 7, 2, 2
.elseif numbytes == 4
.elseif \numbytes == 4
src_x888_0565_2pixels 4, 5, 1, 1
.else
src_x888_0565_1pixel 4, 1
.endif
.if numbytes == 16
pixst , numbytes, 0, DST
.if \numbytes == 16
pixst , \numbytes, 0, DST
.else
pixst , numbytes, 1, DST
pixst , \numbytes, 1, DST
.endif
.unreq WK4
.unreq WK5
@@ -382,37 +387,37 @@ generate_composite_function \
/******************************************************************************/
.macro add_8_8_8pixels cond, dst1, dst2
uqadd8&cond WK&dst1, WK&dst1, MASK
uqadd8&cond WK&dst2, WK&dst2, STRIDE_M
uqadd8\()\cond WK\()\dst1, WK\()\dst1, MASK
uqadd8\()\cond WK\()\dst2, WK\()\dst2, STRIDE_M
.endm
.macro add_8_8_4pixels cond, dst
uqadd8&cond WK&dst, WK&dst, MASK
uqadd8\()\cond WK\()\dst, WK\()\dst, MASK
.endm
.macro add_8_8_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
WK4 .req MASK
WK5 .req STRIDE_M
.if numbytes == 16
pixld cond, 8, 4, SRC, unaligned_src
pixld cond, 16, firstreg, DST, 0
add_8_8_8pixels cond, firstreg, %(firstreg+1)
pixld cond, 8, 4, SRC, unaligned_src
.if \numbytes == 16
pixld \cond, 8, 4, SRC, \unaligned_src
pixld \cond, 16, \firstreg, DST, 0
add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
pixld \cond, 8, 4, SRC, \unaligned_src
.else
pixld cond, numbytes, 4, SRC, unaligned_src
pixld cond, numbytes, firstreg, DST, 0
pixld \cond, \numbytes, 4, SRC, \unaligned_src
pixld \cond, \numbytes, \firstreg, DST, 0
.endif
.unreq WK4
.unreq WK5
.endm
.macro add_8_8_process_tail cond, numbytes, firstreg
.if numbytes == 16
add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
.elseif numbytes == 8
add_8_8_8pixels cond, firstreg, %(firstreg+1)
.if \numbytes == 16
add_8_8_8pixels \cond, %(\firstreg+2), %(\firstreg+3)
.elseif \numbytes == 8
add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
.else
add_8_8_4pixels cond, firstreg
add_8_8_4pixels \cond, \firstreg
.endif
.endm
@@ -441,8 +446,8 @@ generate_composite_function \
WK5 .req STRIDE_S
WK6 .req STRIDE_M
WK7 .req ORIG_W
pixld , numbytes, %(4+firstreg), SRC, unaligned_src
pixld , numbytes, firstreg, DST, 0
pixld , \numbytes, %(4+\firstreg), SRC, \unaligned_src
pixld , \numbytes, \firstreg, DST, 0
.unreq WK4
.unreq WK5
.unreq WK6
@@ -451,44 +456,44 @@ generate_composite_function \
.macro over_8888_8888_check_transparent numbytes, reg0, reg1, reg2, reg3
/* Since these colours are premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
teq WK&reg0, #0
.if numbytes > 4
teqeq WK&reg1, #0
.if numbytes > 8
teqeq WK&reg2, #0
teqeq WK&reg3, #0
teq WK\()\reg0, #0
.if \numbytes > 4
teqeq WK\()\reg1, #0
.if \numbytes > 8
teqeq WK\()\reg2, #0
teqeq WK\()\reg3, #0
.endif
.endif
.endm
.macro over_8888_8888_prepare next
mov WK&next, WK&next, lsr #24
mov WK\()\next, WK\()\next, lsr #24
.endm
.macro over_8888_8888_1pixel src, dst, offset, next
/* src = destination component multiplier */
rsb WK&src, WK&src, #255
rsb WK\()\src, WK\()\src, #255
/* Split even/odd bytes of dst into SCRATCH/dst */
uxtb16 SCRATCH, WK&dst
uxtb16 WK&dst, WK&dst, ror #8
uxtb16 SCRATCH, WK\()\dst
uxtb16 WK\()\dst, WK\()\dst, ror #8
/* Multiply through, adding 0.5 to the upper byte of result for rounding */
mla SCRATCH, SCRATCH, WK&src, MASK
mla WK&dst, WK&dst, WK&src, MASK
mla SCRATCH, SCRATCH, WK\()\src, MASK
mla WK\()\dst, WK\()\dst, WK\()\src, MASK
/* Where we would have had a stall between the result of the first MLA and the shifter input,
* reload the complete source pixel */
ldr WK&src, [SRC, #offset]
ldr WK\()\src, [SRC, #\offset]
/* Multiply by 257/256 to approximate 256/255 */
uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
/* In this stall, start processing the next pixel */
.if offset < -4
mov WK&next, WK&next, lsr #24
.if \offset < -4
mov WK\()\next, WK\()\next, lsr #24
.endif
uxtab16 WK&dst, WK&dst, WK&dst, ror #8
uxtab16 WK\()\dst, WK\()\dst, WK\()\dst, ror #8
/* Recombine even/odd bytes of multiplied destination */
mov SCRATCH, SCRATCH, ror #8
sel WK&dst, SCRATCH, WK&dst
sel WK\()\dst, SCRATCH, WK\()\dst
/* Saturated add of source to multiplied destination */
uqadd8 WK&dst, WK&dst, WK&src
uqadd8 WK\()\dst, WK\()\dst, WK\()\src
.endm
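
The sequence above is the premultiplied OVER operator expressed with ARMv6 packed-halfword arithmetic: the destination is split into even and odd byte lanes (uxtb16), each lane is multiplied by 255 minus the source alpha with 0x80 added for rounding (mla), the high byte of each lane is folded back in (uxtab16, the 257/256 step that approximates 256/255), the lanes are recombined (sel), and the source is added with per-byte saturation (uqadd8). A standalone C sketch of the same arithmetic, illustrative only and not part of this patch (function names are the editor's own):

#include <stdint.h>

static uint32_t uqadd8_c(uint32_t a, uint32_t b)    /* per-byte saturating add */
{
    uint32_t r = 0;
    for (int i = 0; i < 32; i += 8) {
        uint32_t s = ((a >> i) & 0xffu) + ((b >> i) & 0xffu);
        r |= (s > 0xffu ? 0xffu : s) << i;
    }
    return r;
}

/* Editor's illustration only: one pixel of premultiplied OVER, written with
 * the same even/odd byte-lane arithmetic the assembly uses. */
uint32_t over_8888_8888_c(uint32_t src, uint32_t dst)
{
    uint32_t m  = 255u - (src >> 24);                           /* dest multiplier  */
    uint32_t rb = (dst & 0x00ff00ffu) * m + 0x00800080u;        /* even bytes: B, R */
    uint32_t ag = ((dst >> 8) & 0x00ff00ffu) * m + 0x00800080u; /* odd bytes:  G, A */

    rb += (rb >> 8) & 0x00ff00ffu;   /* multiply by 257/256 to approximate 256/255 */
    ag += (ag >> 8) & 0x00ff00ffu;

    dst = (ag & 0xff00ff00u) | ((rb >> 8) & 0x00ff00ffu);  /* high byte of each lane */

    return uqadd8_c(dst, src);
}
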
.macro over_8888_8888_process_tail cond, numbytes, firstreg
@@ -496,17 +501,17 @@ generate_composite_function \
WK5 .req STRIDE_S
WK6 .req STRIDE_M
WK7 .req ORIG_W
over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
over_8888_8888_check_transparent \numbytes, %(4+\firstreg), %(5+\firstreg), %(6+\firstreg), %(7+\firstreg)
beq 10f
over_8888_8888_prepare %(4+firstreg)
.set PROCESS_REG, firstreg
.set PROCESS_OFF, -numbytes
.rept numbytes / 4
over_8888_8888_prepare %(4+\firstreg)
.set PROCESS_REG, \firstreg
.set PROCESS_OFF, -\numbytes
.rept \numbytes / 4
over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
.set PROCESS_REG, PROCESS_REG+1
.set PROCESS_OFF, PROCESS_OFF+4
.endr
pixst , numbytes, firstreg, DST
pixst , \numbytes, \firstreg, DST
10:
.unreq WK4
.unreq WK5
@@ -536,16 +541,16 @@ generate_composite_function \
*/
.macro mul_8888_8 word, byte, tmp, half
/* Split even/odd bytes of word apart */
uxtb16 tmp, word
uxtb16 word, word, ror #8
uxtb16 \tmp, \word
uxtb16 \word, \word, ror #8
/* Multiply bytes together with rounding, then by 257/256 */
mla tmp, tmp, byte, half
mla word, word, byte, half /* 1 stall follows */
uxtab16 tmp, tmp, tmp, ror #8 /* 1 stall follows */
uxtab16 word, word, word, ror #8
mla \tmp, \tmp, \byte, \half
mla \word, \word, \byte, \half /* 1 stall follows */
uxtab16 \tmp, \tmp, \tmp, ror #8 /* 1 stall follows */
uxtab16 \word, \word, \word, ror #8
/* Recombine bytes */
mov tmp, tmp, ror #8
sel word, tmp, word
mov \tmp, \tmp, ror #8
sel \word, \tmp, \word
.endm
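
The rounding scheme used by mul_8888_8 and by the OVER code above (add 0x80 to the byte product, add the result shifted right by 8, then take the high byte) is the standard integer replacement for dividing by 255. A quick standalone check, illustrative only and not part of this patch, that it matches round-to-nearest for every pair of 8-bit inputs:

#include <assert.h>
#include <stdint.h>

/* Editor's illustration only: exhaustively verify the "+0x80, then add the
 * high byte" trick against exact rounding of x*y/255. */
int main(void)
{
    for (uint32_t x = 0; x < 256; x++) {
        for (uint32_t y = 0; y < 256; y++) {
            uint32_t t      = x * y + 0x80;
            uint32_t approx = (t + (t >> 8)) >> 8;
            uint32_t exact  = (x * y + 127) / 255;   /* round to nearest */
            assert(approx == exact);
        }
    }
    return 0;
}
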
/******************************************************************************/
@@ -567,8 +572,8 @@ generate_composite_function \
WK5 .req STRIDE_D
WK6 .req STRIDE_S
WK7 .req ORIG_W
pixld , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
pixld , numbytes, firstreg, DST, 0
pixld , \numbytes, %(4+(\firstreg%2)), SRC, \unaligned_src
pixld , \numbytes, \firstreg, DST, 0
.unreq WK4
.unreq WK5
.unreq WK6
@@ -576,10 +581,10 @@ generate_composite_function \
.endm
.macro over_8888_n_8888_1pixel src, dst
mul_8888_8 WK&src, MASK, SCRATCH, STRIDE_M
sub WK7, WK6, WK&src, lsr #24
mul_8888_8 WK&dst, WK7, SCRATCH, STRIDE_M
uqadd8 WK&dst, WK&dst, WK&src
mul_8888_8 WK\()\src, MASK, SCRATCH, STRIDE_M
sub WK7, WK6, WK\()\src, lsr #24
mul_8888_8 WK\()\dst, WK7, SCRATCH, STRIDE_M
uqadd8 WK\()\dst, WK\()\dst, WK\()\src
.endm
.macro over_8888_n_8888_process_tail cond, numbytes, firstreg
@@ -587,12 +592,12 @@ generate_composite_function \
WK5 .req STRIDE_D
WK6 .req STRIDE_S
WK7 .req ORIG_W
over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
over_8888_8888_check_transparent \numbytes, %(4+(\firstreg%2)), %(5+(\firstreg%2)), %(6+\firstreg), %(7+\firstreg)
beq 10f
mov WK6, #255
.set PROCESS_REG, firstreg
.rept numbytes / 4
.if numbytes == 16 && PROCESS_REG == 2
.set PROCESS_REG, \firstreg
.rept \numbytes / 4
.if \numbytes == 16 && PROCESS_REG == 2
/* We're using WK6 and WK7 as temporaries, so half way through
* 4 pixels, reload the second two source pixels but this time
* into WK4 and WK5 */
@@ -601,7 +606,7 @@ generate_composite_function \
over_8888_n_8888_1pixel %(4+(PROCESS_REG%2)), %(PROCESS_REG)
.set PROCESS_REG, PROCESS_REG+1
.endr
pixst , numbytes, firstreg, DST
pixst , \numbytes, \firstreg, DST
10:
.unreq WK4
.unreq WK5
@@ -642,13 +647,13 @@ generate_composite_function \
.macro over_n_8_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
WK4 .req STRIDE_M
pixld , numbytes/4, 4, MASK, unaligned_mask
pixld , numbytes, firstreg, DST, 0
pixld , \numbytes/4, 4, MASK, \unaligned_mask
pixld , \numbytes, \firstreg, DST, 0
.unreq WK4
.endm
.macro over_n_8_8888_1pixel src, dst
uxtb Y, WK4, ror #src*8
uxtb Y, WK4, ror #\src*8
/* Trailing part of multiplication of source */
mla SCRATCH, STRIDE_S, Y, STRIDE_D
mla Y, SRC, Y, STRIDE_D
@@ -659,20 +664,20 @@ generate_composite_function \
sub ORIG_W, ORIG_W, Y, lsr #24
sel Y, SCRATCH, Y
/* Then multiply the destination */
mul_8888_8 WK&dst, ORIG_W, SCRATCH, STRIDE_D
uqadd8 WK&dst, WK&dst, Y
mul_8888_8 WK\()\dst, ORIG_W, SCRATCH, STRIDE_D
uqadd8 WK\()\dst, WK\()\dst, Y
.endm
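
Together with the first half of the source multiplication that is set up before this macro runs, the net effect appears to be: scale the solid source by the 8-bit mask value, then composite the result OVER the destination. A C sketch of that reading, illustrative only and not part of this patch (helper names are the editor's own, and the surrounding register setup is assumed rather than shown):

#include <stdint.h>

static uint32_t div255(uint32_t x) { x += 0x80; return (x + (x >> 8)) >> 8; }

static uint32_t scale_8888_by_8(uint32_t p, uint32_t m)   /* like mul_8888_8 */
{
    return (div255(((p >> 24) & 0xff) * m) << 24) |
           (div255(((p >> 16) & 0xff) * m) << 16) |
           (div255(((p >>  8) & 0xff) * m) <<  8) |
            div255(( p        & 0xff) * m);
}

/* Editor's illustration only: one pixel of over_n_8_8888 with a premultiplied
 * solid source, an 8-bit mask value and an a8r8g8b8 destination. */
uint32_t over_n_8_8888_c(uint32_t src, uint8_t mask, uint32_t dst)
{
    uint32_t masked = scale_8888_by_8(src, mask);
    uint32_t scaled = scale_8888_by_8(dst, 255u - (masked >> 24));
    uint32_t r = 0;
    for (int i = 0; i < 32; i += 8) {            /* uqadd8: per-byte saturation */
        uint32_t s = ((scaled >> i) & 0xffu) + ((masked >> i) & 0xffu);
        r |= (s > 0xffu ? 0xffu : s) << i;
    }
    return r;
}
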
.macro over_n_8_8888_process_tail cond, numbytes, firstreg
WK4 .req STRIDE_M
teq WK4, #0
beq 10f
.set PROCESS_REG, firstreg
.rept numbytes / 4
over_n_8_8888_1pixel %(PROCESS_REG-firstreg), %(PROCESS_REG)
.set PROCESS_REG, \firstreg
.rept \numbytes / 4
over_n_8_8888_1pixel %(PROCESS_REG-\firstreg), %(PROCESS_REG)
.set PROCESS_REG, PROCESS_REG+1
.endr
pixst , numbytes, firstreg, DST
pixst , \numbytes, \firstreg, DST
10:
.unreq WK4
.endm
@@ -705,14 +710,14 @@ generate_composite_function \
.endm
.macro over_reverse_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
pixld , numbytes, firstreg, DST, 0
pixld , \numbytes, \firstreg, DST, 0
.endm
.macro over_reverse_n_8888_1pixel d, is_only
teq WK&d, #0
teq WK\()\d, #0
beq 8f /* replace with source */
bics ORIG_W, STRIDE_D, WK&d, lsr #24
.if is_only == 1
bics ORIG_W, STRIDE_D, WK\()\d, lsr #24
.if \is_only == 1
beq 49f /* skip store */
.else
beq 9f /* write same value back */
@@ -723,36 +728,36 @@ generate_composite_function \
uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8
mov SCRATCH, SCRATCH, ror #8
sel ORIG_W, SCRATCH, ORIG_W
uqadd8 WK&d, WK&d, ORIG_W
uqadd8 WK\()\d, WK\()\d, ORIG_W
b 9f
8: mov WK&d, SRC
8: mov WK\()\d, SRC
9:
.endm
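
The early tests above skip the work when the destination pixel is fully transparent (take the solid source as-is) or already opaque (leave it alone); in the general case the solid source is scaled by the destination's remaining coverage and added in. A C sketch of that reading, illustrative only and not part of this patch (it assumes the register compared against the destination alpha holds 255 at this point):

#include <stdint.h>

static uint32_t div255(uint32_t x) { x += 0x80; return (x + (x >> 8)) >> 8; }

/* Editor's illustration only: one pixel of over_reverse_n_8888, i.e. the
 * destination composited OVER a premultiplied solid source. */
uint32_t over_reverse_n_8888_c(uint32_t solid_src, uint32_t dst)
{
    uint32_t not_a = 255u - (dst >> 24);

    if (dst == 0)              /* fully transparent: replace with source */
        return solid_src;
    if (not_a == 0)            /* already opaque: leave unchanged        */
        return dst;

    uint32_t r = 0;
    for (int i = 0; i < 32; i += 8) {            /* uqadd8: per-byte saturation */
        uint32_t s = ((dst >> i) & 0xffu) +
                     div255(((solid_src >> i) & 0xffu) * not_a);
        r |= (s > 0xffu ? 0xffu : s) << i;
    }
    return r;
}
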
.macro over_reverse_n_8888_tail numbytes, reg1, reg2, reg3, reg4
.if numbytes == 4
over_reverse_n_8888_1pixel reg1, 1
.if \numbytes == 4
over_reverse_n_8888_1pixel \reg1, 1
.else
and SCRATCH, WK&reg1, WK&reg2
.if numbytes == 16
and SCRATCH, SCRATCH, WK&reg3
and SCRATCH, SCRATCH, WK&reg4
and SCRATCH, WK\()\reg1, WK\()\reg2
.if \numbytes == 16
and SCRATCH, SCRATCH, WK\()\reg3
and SCRATCH, SCRATCH, WK\()\reg4
.endif
mvns SCRATCH, SCRATCH, asr #24
beq 49f /* skip store if all opaque */
over_reverse_n_8888_1pixel reg1, 0
over_reverse_n_8888_1pixel reg2, 0
.if numbytes == 16
over_reverse_n_8888_1pixel reg3, 0
over_reverse_n_8888_1pixel reg4, 0
over_reverse_n_8888_1pixel \reg1, 0
over_reverse_n_8888_1pixel \reg2, 0
.if \numbytes == 16
over_reverse_n_8888_1pixel \reg3, 0
over_reverse_n_8888_1pixel \reg4, 0
.endif
.endif
pixst , numbytes, reg1, DST
pixst , \numbytes, \reg1, DST
49:
.endm
.macro over_reverse_n_8888_process_tail cond, numbytes, firstreg
over_reverse_n_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
over_reverse_n_8888_tail \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
.endm
generate_composite_function \
@@ -794,20 +799,20 @@ generate_composite_function \
.macro over_white_8888_8888_ca_combine m, d
uxtb16 TMP1, TMP0 /* rb_notmask */
uxtb16 TMP2, d /* rb_dest; 1 stall follows */
uxtb16 TMP2, \d /* rb_dest; 1 stall follows */
smlatt TMP3, TMP2, TMP1, HALF /* red */
smlabb TMP2, TMP2, TMP1, HALF /* blue */
uxtb16 TMP0, TMP0, ror #8 /* ag_notmask */
uxtb16 TMP1, d, ror #8 /* ag_dest; 1 stall follows */
smlatt d, TMP1, TMP0, HALF /* alpha */
uxtb16 TMP1, \d, ror #8 /* ag_dest; 1 stall follows */
smlatt \d, TMP1, TMP0, HALF /* alpha */
smlabb TMP1, TMP1, TMP0, HALF /* green */
pkhbt TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */
pkhbt TMP1, TMP1, d, lsl #16 /* ag */
pkhbt TMP1, TMP1, \d, lsl #16 /* ag */
uxtab16 TMP0, TMP0, TMP0, ror #8
uxtab16 TMP1, TMP1, TMP1, ror #8
mov TMP0, TMP0, ror #8
sel d, TMP0, TMP1
uqadd8 d, d, m /* d is a late result */
sel \d, TMP0, TMP1
uqadd8 \d, \d, \m /* d is a late result */
.endm
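
With a white source, component-alpha OVER reduces to scaling each destination component by the complement of the matching mask component and then adding the mask component back in, which is what the smlatt/smlabb pairs above compute lane by lane before the final uqadd8. A per-pixel C sketch of that operation, illustrative only and not part of this patch (the function name is the editor's own):

#include <stdint.h>

static uint32_t div255(uint32_t x) { x += 0x80; return (x + (x >> 8)) >> 8; }

/* Editor's illustration only: one pixel of OVER with a solid white source and
 * a component-alpha (a8r8g8b8) mask, as over_white_8888_8888_ca_combine does. */
uint32_t over_white_8888_8888_ca_c(uint32_t mask, uint32_t dst)
{
    uint32_t r = 0;
    for (int i = 0; i < 32; i += 8) {
        uint32_t m = (mask >> i) & 0xffu;
        uint32_t d = (dst  >> i) & 0xffu;
        uint32_t s = m + div255(d * (255u - m));
        r |= (s > 0xffu ? 0xffu : s) << i;       /* the trailing uqadd8 */
    }
    return r;
}
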
.macro over_white_8888_8888_ca_1pixel_head
@@ -853,10 +858,10 @@ generate_composite_function \
.endm
.macro over_white_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
.if numbytes == 4
.if \numbytes == 4
over_white_8888_8888_ca_1pixel_head
.else
.if numbytes == 16
.if \numbytes == 16
over_white_8888_8888_ca_2pixels_head
over_white_8888_8888_ca_2pixels_tail
.endif
@@ -865,7 +870,7 @@ generate_composite_function \
.endm
.macro over_white_8888_8888_ca_process_tail cond, numbytes, firstreg
.if numbytes == 4
.if \numbytes == 4
over_white_8888_8888_ca_1pixel_tail
.else
over_white_8888_8888_ca_2pixels_tail
@@ -1004,7 +1009,7 @@ generate_composite_function \
.endm
.macro over_n_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
.rept (numbytes / 4) - 1
.rept (\numbytes / 4) - 1
over_n_8888_8888_ca_1pixel_head
over_n_8888_8888_ca_1pixel_tail
.endr
@@ -1020,7 +1025,9 @@ pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6
cmp ip, #-1
beq pixman_composite_over_white_8888_8888_ca_asm_armv6
/* else drop through... */
#ifndef __clang__
.endfunc
#endif
generate_composite_function \
pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32 \
FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0 \
@@ -1045,84 +1052,84 @@ generate_composite_function \
.macro in_reverse_8888_8888_head numbytes, reg1, reg2, reg3
ldrb ORIG_W, [SRC], #4
.if numbytes >= 8
ldrb WK&reg1, [SRC], #4
.if numbytes == 16
ldrb WK&reg2, [SRC], #4
ldrb WK&reg3, [SRC], #4
.if \numbytes >= 8
ldrb WK\()\reg1, [SRC], #4
.if \numbytes == 16
ldrb WK\()\reg2, [SRC], #4
ldrb WK\()\reg3, [SRC], #4
.endif
.endif
add DST, DST, #numbytes
add DST, DST, #\numbytes
.endm
.macro in_reverse_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
in_reverse_8888_8888_head numbytes, firstreg, %(firstreg+1), %(firstreg+2)
in_reverse_8888_8888_head \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2)
.endm
.macro in_reverse_8888_8888_1pixel s, d, offset, is_only
.if is_only != 1
movs s, ORIG_W
.if offset != 0
ldrb ORIG_W, [SRC, #offset]
.if \is_only != 1
movs \s, ORIG_W
.if \offset != 0
ldrb ORIG_W, [SRC, #\offset]
.endif
beq 01f
teq STRIDE_M, #0xFF
beq 02f
.endif
uxtb16 SCRATCH, d /* rb_dest */
uxtb16 d, d, ror #8 /* ag_dest */
mla SCRATCH, SCRATCH, s, MASK
mla d, d, s, MASK
uxtb16 SCRATCH, \d /* rb_dest */
uxtb16 \d, \d, ror #8 /* ag_dest */
mla SCRATCH, SCRATCH, \s, MASK
mla \d, \d, \s, MASK
uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
uxtab16 d, d, d, ror #8
uxtab16 \d, \d, \d, ror #8
mov SCRATCH, SCRATCH, ror #8
sel d, SCRATCH, d
sel \d, SCRATCH, \d
b 02f
.if offset == 0
.if \offset == 0
48: /* Last mov d,#0 of the set - used as part of shortcut for
* source values all 0 */
.endif
01: mov d, #0
01: mov \d, #0
02:
.endm
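
The per-pixel rule here is the one the macro name suggests: clear the destination where the source alpha is 0, keep it untouched where the source alpha is 255, and otherwise scale every destination byte by the source alpha. A C sketch of that rule, illustrative only and not part of this patch (it assumes SRC has been offset so that the LDRBs in the head fetch each source pixel's alpha byte):

#include <stdint.h>

static uint32_t div255(uint32_t x) { x += 0x80; return (x + (x >> 8)) >> 8; }

/* Editor's illustration only: one pixel of in_reverse_8888_8888, i.e. the
 * destination kept in proportion to the source alpha. */
uint32_t in_reverse_8888_8888_c(uint32_t src, uint32_t dst)
{
    uint32_t a = src >> 24;

    if (a == 0)
        return 0;
    if (a == 0xff)
        return dst;

    return (div255(((dst >> 24) & 0xff) * a) << 24) |
           (div255(((dst >> 16) & 0xff) * a) << 16) |
           (div255(((dst >>  8) & 0xff) * a) <<  8) |
            div255(( dst        & 0xff) * a);
}
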
.macro in_reverse_8888_8888_tail numbytes, reg1, reg2, reg3, reg4
.if numbytes == 4
.if \numbytes == 4
teq ORIG_W, ORIG_W, asr #32
ldrne WK&reg1, [DST, #-4]
.elseif numbytes == 8
teq ORIG_W, WK&reg1
ldrne WK\()\reg1, [DST, #-4]
.elseif \numbytes == 8
teq ORIG_W, WK\()\reg1
teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
ldmnedb DST, {WK&reg1-WK&reg2}
ldmnedb DST, {WK\()\reg1-WK\()\reg2}
.else
teq ORIG_W, WK&reg1
teqeq ORIG_W, WK&reg2
teqeq ORIG_W, WK&reg3
teq ORIG_W, WK\()\reg1
teqeq ORIG_W, WK\()\reg2
teqeq ORIG_W, WK\()\reg3
teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
ldmnedb DST, {WK&reg1-WK&reg4}
ldmnedb DST, {WK\()\reg1-WK\()\reg4}
.endif
cmnne DST, #0 /* clear C if NE */
bcs 49f /* no writes to dest if source all -1 */
beq 48f /* set dest to all 0 if source all 0 */
.if numbytes == 4
in_reverse_8888_8888_1pixel ORIG_W, WK&reg1, 0, 1
str WK&reg1, [DST, #-4]
.elseif numbytes == 8
in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -4, 0
in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, 0, 0
stmdb DST, {WK&reg1-WK&reg2}
.if \numbytes == 4
in_reverse_8888_8888_1pixel ORIG_W, WK\()\reg1, 0, 1
str WK\()\reg1, [DST, #-4]
.elseif \numbytes == 8
in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg1, -4, 0
in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg2, 0, 0
stmdb DST, {WK\()\reg1-WK\()\reg2}
.else
in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -12, 0
in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, -8, 0
in_reverse_8888_8888_1pixel STRIDE_M, WK&reg3, -4, 0
in_reverse_8888_8888_1pixel STRIDE_M, WK&reg4, 0, 0
stmdb DST, {WK&reg1-WK&reg4}
in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg1, -12, 0
in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg2, -8, 0
in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg3, -4, 0
in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg4, 0, 0
stmdb DST, {WK\()\reg1-WK\()\reg4}
.endif
49:
.endm
.macro in_reverse_8888_8888_process_tail cond, numbytes, firstreg
in_reverse_8888_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
in_reverse_8888_8888_tail \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
.endm
generate_composite_function \
@ -1149,21 +1156,21 @@ generate_composite_function \
.endm
.macro over_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
pixld , numbytes, firstreg, DST, 0
pixld , \numbytes, \firstreg, DST, 0
.endm
.macro over_n_8888_1pixel dst
mul_8888_8 WK&dst, STRIDE_M, SCRATCH, MASK
uqadd8 WK&dst, WK&dst, SRC
mul_8888_8 WK\()\dst, STRIDE_M, SCRATCH, MASK
uqadd8 WK\()\dst, WK\()\dst, SRC
.endm
.macro over_n_8888_process_tail cond, numbytes, firstreg
.set PROCESS_REG, firstreg
.rept numbytes / 4
.set PROCESS_REG, \firstreg
.rept \numbytes / 4
over_n_8888_1pixel %(PROCESS_REG)
.set PROCESS_REG, PROCESS_REG+1
.endr
pixst , numbytes, firstreg, DST
pixst , \numbytes, \firstreg, DST
.endm
generate_composite_function \

@@ -112,64 +112,96 @@
*/
.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
.if numbytes == 16
.if unaligned == 1
op&r&cond WK&reg0, [base], #4
op&r&cond WK&reg1, [base], #4
op&r&cond WK&reg2, [base], #4
op&r&cond WK&reg3, [base], #4
.if \numbytes == 16
.if \unaligned == 1
\op\()r\()\cond WK\()\reg0, [\base], #4
\op\()r\()\cond WK\()\reg1, [\base], #4
\op\()r\()\cond WK\()\reg2, [\base], #4
\op\()r\()\cond WK\()\reg3, [\base], #4
.else
op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
#ifdef __clang__
\op\()mia\()\cond \base!, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
#else
\op\()m\()\cond\()ia \base!, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
#endif
.endif
.elseif numbytes == 8
.if unaligned == 1
op&r&cond WK&reg0, [base], #4
op&r&cond WK&reg1, [base], #4
.elseif \numbytes == 8
.if \unaligned == 1
\op\()r\()\cond WK\()\reg0, [\base], #4
\op\()r\()\cond WK\()\reg1, [\base], #4
.else
op&m&cond&ia base!, {WK&reg0,WK&reg1}
#ifdef __clang__
\op\()mia\()\cond \base!, {WK\()\reg0,WK\()\reg1}
#else
\op\()m\()\cond\()ia \base!, {WK\()\reg0,WK\()\reg1}
#endif
.endif
.elseif numbytes == 4
op&r&cond WK&reg0, [base], #4
.elseif numbytes == 2
op&r&cond&h WK&reg0, [base], #2
.elseif numbytes == 1
op&r&cond&b WK&reg0, [base], #1
.elseif \numbytes == 4
\op\()r\()\cond WK\()\reg0, [\base], #4
.elseif \numbytes == 2
#ifdef __clang__
\op\()rh\()\cond WK\()\reg0, [\base], #2
#else
\op\()r\()\cond\()h WK\()\reg0, [\base], #2
#endif
.elseif \numbytes == 1
#ifdef __clang__
\op\()rb\()\cond WK\()\reg0, [\base], #1
#else
\op\()r\()\cond\()b WK\()\reg0, [\base], #1
#endif
.else
.error "unsupported size: numbytes"
.error "unsupported size: \numbytes"
.endif
.endm
.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
.if numbytes == 16
stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
.elseif numbytes == 8
stm&cond&db base, {WK&reg0,WK&reg1}
.elseif numbytes == 4
str&cond WK&reg0, [base, #-4]
.elseif numbytes == 2
str&cond&h WK&reg0, [base, #-2]
.elseif numbytes == 1
str&cond&b WK&reg0, [base, #-1]
.if \numbytes == 16
#ifdef __clang__
stmdb\()\cond \base, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
#else
stm\()\cond\()db \base, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
#endif
.elseif \numbytes == 8
#ifdef __clang__
stmdb\()\cond \base, {WK\()\reg0,WK\()\reg1}
#else
stm\()\cond\()db \base, {WK\()\reg0,WK\()\reg1}
#endif
.elseif \numbytes == 4
str\()\cond WK\()\reg0, [\base, #-4]
.elseif \numbytes == 2
#ifdef __clang__
strh\()\cond WK\()\reg0, [\base, #-2]
#else
str\()\cond\()h WK\()\reg0, [\base, #-2]
#endif
.elseif \numbytes == 1
#ifdef __clang__
strb\()\cond WK\()\reg0, [\base, #-1]
#else
str\()\cond\()b WK\()\reg0, [\base, #-1]
#endif
.else
.error "unsupported size: numbytes"
.error "unsupported size: \numbytes"
.endif
.endm
.macro pixld cond, numbytes, firstreg, base, unaligned
pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
pixldst ld, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base, \unaligned
.endm
.macro pixst cond, numbytes, firstreg, base
.if (flags) & FLAG_DST_READWRITE
pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
pixst_baseupdated \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base
.else
pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
pixldst st, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base
.endif
.endm
.macro PF a, x:vararg
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
a x
\a \x
.endif
.endm
@@ -179,11 +211,11 @@
* between 0 and prefetch_distance (inclusive) cache lines ahead so there
* are no gaps when the inner loop starts.
*/
.if bpp > 0
PF bic, ptr, base, #31
.if \bpp > 0
PF bic, \ptr, \base, #31
.set OFFSET, 0
.rept prefetch_distance+1
PF pld, [ptr, #OFFSET]
PF pld, [\ptr, #OFFSET]
.set OFFSET, OFFSET+32
.endr
.endif
@@ -201,42 +233,42 @@
* and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
* possible when there are 4 src bytes for every 1 dst byte).
*/
.if bpp > 0
.ifc base,DST
.if \bpp > 0
.ifc \base,DST
/* The test can be simplified further when preloading the destination */
PF tst, base, #16
PF tst, \base, #16
PF beq, 61f
.else
.if bpp/dst_w_bpp == 4
PF add, SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
.if \bpp/dst_w_bpp == 4
PF add, SCRATCH, \base, WK0, lsl #\bpp_shift-dst_bpp_shift
PF and, SCRATCH, SCRATCH, #31
PF rsb, SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
PF rsb, SCRATCH, SCRATCH, WK0, lsl #\bpp_shift-dst_bpp_shift
PF sub, SCRATCH, SCRATCH, #1 /* so now ranges are -16..-1 / 0..31 / 32..63 */
PF movs, SCRATCH, SCRATCH, lsl #32-6 /* so this sets NC / nc / Nc */
PF bcs, 61f
PF bpl, 60f
PF pld, [ptr, #32*(prefetch_distance+2)]
.else
PF mov, SCRATCH, base, lsl #32-5
PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
PF mov, SCRATCH, \base, lsl #32-5
PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift
PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift
PF bls, 61f
.endif
.endif
60: PF pld, [ptr, #32*(prefetch_distance+1)]
60: PF pld, [\ptr, #32*(prefetch_distance+1)]
61:
.endif
.endm
#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
.macro preload_middle bpp, base, scratch_holds_offset
.if bpp > 0
.if \bpp > 0
/* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
.if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
.if scratch_holds_offset
PF pld, [base, SCRATCH]
.if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/\bpp)
.if \scratch_holds_offset
PF pld, [\base, SCRATCH]
.else
PF bic, SCRATCH, base, #31
PF bic, SCRATCH, \base, #31
PF pld, [SCRATCH, #32*prefetch_distance]
.endif
.endif
@@ -244,28 +276,28 @@
.endm
.macro preload_trailing bpp, bpp_shift, base
.if bpp > 0
.if bpp*pix_per_block > 256
.if \bpp > 0
.if \bpp*pix_per_block > 256
/* Calculations are more complex if more than one fetch per block */
PF and, WK1, base, #31
PF add, WK1, WK1, WK0, lsl #bpp_shift
PF add, WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
PF bic, SCRATCH, base, #31
PF and, WK1, \base, #31
PF add, WK1, WK1, WK0, lsl #\bpp_shift
PF add, WK1, WK1, #32*(\bpp*pix_per_block/256-1)*(prefetch_distance+1)
PF bic, SCRATCH, \base, #31
80: PF pld, [SCRATCH, #32*(prefetch_distance+1)]
PF add, SCRATCH, SCRATCH, #32
PF subs, WK1, WK1, #32
PF bhi, 80b
.else
/* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
PF mov, SCRATCH, base, lsl #32-5
PF adds, SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
PF mov, SCRATCH, \base, lsl #32-5
PF adds, SCRATCH, SCRATCH, X, lsl #32-5+\bpp_shift
PF adceqs, SCRATCH, SCRATCH, #0
/* The instruction above has two effects: ensures Z is only
* set if C was clear (so Z indicates that both shifted quantities
* were 0), and clears C if Z was set (so C indicates that the sum
* of the shifted quantities was greater and not equal to 32) */
PF beq, 82f
PF bic, SCRATCH, base, #31
PF bic, SCRATCH, \base, #31
PF bcc, 81f
PF pld, [SCRATCH, #32*(prefetch_distance+2)]
81: PF pld, [SCRATCH, #32*(prefetch_distance+1)]
@@ -288,12 +320,12 @@
* "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
* "base" - base address register of channel to preload (SRC, MASK or DST)
*/
.if bpp > 0
.if narrow_case && (bpp <= dst_w_bpp)
.if \bpp > 0
.if \narrow_case && (\bpp <= dst_w_bpp)
/* In these cases, each line for each channel is in either 1 or 2 cache lines */
PF bic, WK0, base, #31
PF bic, WK0, \base, #31
PF pld, [WK0]
PF add, WK1, base, X, LSL #bpp_shift
PF add, WK1, \base, X, LSL #\bpp_shift
PF sub, WK1, WK1, #1
PF bic, WK1, WK1, #31
PF cmp, WK1, WK0
@@ -301,9 +333,9 @@
PF pld, [WK1]
90:
.else
PF bic, WK0, base, #31
PF bic, WK0, \base, #31
PF pld, [WK0]
PF add, WK1, base, X, lsl #bpp_shift
PF add, WK1, \base, X, lsl #\bpp_shift
PF sub, WK1, WK1, #1
PF bic, WK1, WK1, #31
PF cmp, WK1, WK0
@@ -319,56 +351,56 @@
.macro conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
.if decrementx
sub&cond X, X, #8*numbytes/dst_w_bpp
\process_head \cond, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, 0
.if \decrementx
sub\()\cond X, X, #8*\numbytes/dst_w_bpp
.endif
process_tail cond, numbytes, firstreg
\process_tail \cond, \numbytes, \firstreg
.if !((flags) & FLAG_PROCESS_DOES_STORE)
pixst cond, numbytes, firstreg, DST
pixst \cond, \numbytes, \firstreg, DST
.endif
.endm
.macro conditional_process1 cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
.if (flags) & FLAG_BRANCH_OVER
.ifc cond,mi
.ifc \cond,mi
bpl 100f
.endif
.ifc cond,cs
.ifc \cond,cs
bcc 100f
.endif
.ifc cond,ne
.ifc \cond,ne
beq 100f
.endif
conditional_process1_helper , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
conditional_process1_helper , \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, \decrementx
100:
.else
conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
conditional_process1_helper \cond, \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, \decrementx
.endif
.endm
.macro conditional_process2 test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
.if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
/* Can't interleave reads and writes */
test
conditional_process1 cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
\test
conditional_process1 \cond1, \process_head, \process_tail, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, \decrementx
.if (flags) & FLAG_PROCESS_CORRUPTS_PSR
test
\test
.endif
conditional_process1 cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
conditional_process1 \cond2, \process_head, \process_tail, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, \decrementx
.else
/* Can interleave reads and writes for better scheduling */
test
process_head cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
process_head cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
.if decrementx
sub&cond1 X, X, #8*numbytes1/dst_w_bpp
sub&cond2 X, X, #8*numbytes2/dst_w_bpp
\test
\process_head \cond1, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, 0
\process_head \cond2, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, 0
.if \decrementx
sub\()\cond1 X, X, #8*\numbytes1/dst_w_bpp
sub\()\cond2 X, X, #8*\numbytes2/dst_w_bpp
.endif
process_tail cond1, numbytes1, firstreg1
process_tail cond2, numbytes2, firstreg2
pixst cond1, numbytes1, firstreg1, DST
pixst cond2, numbytes2, firstreg2, DST
\process_tail \cond1, \numbytes1, \firstreg1
\process_tail \cond2, \numbytes2, \firstreg2
pixst \cond1, \numbytes1, \firstreg1, DST
pixst \cond2, \numbytes2, \firstreg2, DST
.endif
.endm
@@ -400,12 +432,12 @@
.endif
/* Use unaligned loads in all cases for simplicity */
.if dst_w_bpp == 8
conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
conditional_process2 test_bits_1_0_ptr, mi, cs, \process_head, \process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
.elseif dst_w_bpp == 16
test_bits_1_0_ptr
conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X
conditional_process1 cs, \process_head, \process_tail, 2, 2, 1, 1, DECREMENT_X
.endif
conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
conditional_process2 test_bits_3_2_ptr, mi, cs, \process_head, \process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
.if (flags) & FLAG_PROCESS_CORRUPTS_WK0
ldr X, [sp, #LINE_SAVED_REG_COUNT*4]
.endif
@@ -424,12 +456,12 @@
.endm
.macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
conditional_process2 test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
conditional_process2 test_bits_3_2_pix, cs, mi, \process_head, \process_tail, 8, 4, 0, 2, \unaligned_src, \unaligned_mask, 0
.if dst_w_bpp == 16
test_bits_1_0_pix
conditional_process1 cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
conditional_process1 cs, \process_head, \process_tail, 2, 0, \unaligned_src, \unaligned_mask, 0
.elseif dst_w_bpp == 8
conditional_process2 test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
conditional_process2 test_bits_1_0_pix, cs, mi, \process_head, \process_tail, 2, 1, 0, 1, \unaligned_src, \unaligned_mask, 0
.endif
.endm
@@ -438,7 +470,7 @@
110:
.set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
.rept pix_per_block*dst_w_bpp/128
process_head , 16, 0, unaligned_src, unaligned_mask, 1
\process_head , 16, 0, \unaligned_src, \unaligned_mask, 1
.if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
preload_middle src_bpp, SRC, 1
.elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
@@ -453,9 +485,9 @@
* preloads for, to achieve staggered prefetches for multiple channels, because there are
* always two STMs per prefetch, so there is always an opposite STM on which to put the
* preload. Note, no need to BIC the base register here */
PF pld, [DST, #32*prefetch_distance - dst_alignment]
PF pld, [DST, #32*prefetch_distance - \dst_alignment]
.endif
process_tail , 16, 0
\process_tail , 16, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
pixst , 16, 0, DST
.endif
@@ -470,11 +502,11 @@
.if dst_r_bpp > 0
tst DST, #16
bne 111f
process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS
\process_inner_loop \process_head, \process_tail, \unaligned_src, \unaligned_mask, 16 + DST_PRELOAD_BIAS
b 112f
111:
.endif
process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS
\process_inner_loop \process_head, \process_tail, \unaligned_src, \unaligned_mask, 0 + DST_PRELOAD_BIAS
112:
/* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
.if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
@@ -487,13 +519,13 @@
.endif
add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
/* The remainder of the line is handled identically to the medium case */
medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
medium_case_inner_loop_and_trailing_pixels \process_head, \process_tail,, \exit_label, \unaligned_src, \unaligned_mask
.endm
.macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
120:
process_head , 16, 0, unaligned_src, unaligned_mask, 0
process_tail , 16, 0
\process_head , 16, 0, \unaligned_src, \unaligned_mask, 0
\process_tail , 16, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
pixst , 16, 0, DST
.endif
@@ -501,16 +533,16 @@
bhs 120b
/* Trailing pixels */
tst X, #128/dst_w_bpp - 1
beq exit_label
trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
beq \exit_label
trailing_15bytes \process_head, \process_tail, \unaligned_src, \unaligned_mask
.endm
.macro narrow_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
tst X, #16*8/dst_w_bpp
conditional_process1 ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
conditional_process1 ne, \process_head, \process_tail, 16, 0, \unaligned_src, \unaligned_mask, 0
/* Trailing pixels */
/* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
trailing_15bytes \process_head, \process_tail, \unaligned_src, \unaligned_mask
.endm
.macro switch_on_alignment action, process_head, process_tail, process_inner_loop, exit_label
@@ -523,37 +555,37 @@
tst SRC, #3
bne 140f
.endif
action process_head, process_tail, process_inner_loop, exit_label, 0, 0
\action \process_head, \process_tail, \process_inner_loop, \exit_label, 0, 0
.if src_bpp == 8 || src_bpp == 16
b exit_label
b \exit_label
140:
action process_head, process_tail, process_inner_loop, exit_label, 1, 0
\action \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 0
.endif
.if mask_bpp == 8 || mask_bpp == 16
b exit_label
b \exit_label
141:
.if src_bpp == 8 || src_bpp == 16
tst SRC, #3
bne 142f
.endif
action process_head, process_tail, process_inner_loop, exit_label, 0, 1
\action \process_head, \process_tail, \process_inner_loop, \exit_label, 0, 1
.if src_bpp == 8 || src_bpp == 16
b exit_label
b \exit_label
142:
action process_head, process_tail, process_inner_loop, exit_label, 1, 1
\action \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 1
.endif
.endif
.endm
.macro end_of_line restore_x, vars_spilled, loop_label, last_one
.if vars_spilled
.if \vars_spilled
/* Sadly, GAS doesn't seem to have an equivalent of the DCI directive */
/* This is ldmia sp,{} */
.word 0xE89D0000 | LINE_SAVED_REGS
.endif
subs Y, Y, #1
.if vars_spilled
.if \vars_spilled
.if (LINE_SAVED_REGS) & (1<<1)
str Y, [sp]
.endif
@@ -565,18 +597,18 @@
.if mask_bpp > 0
add MASK, MASK, STRIDE_M
.endif
.if restore_x
.if \restore_x
mov X, ORIG_W
.endif
bhs loop_label
.ifc "last_one",""
.if vars_spilled
bhs \loop_label
.ifc "\last_one",""
.if \vars_spilled
b 197f
.else
b 198f
.endif
.else
.if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
.if (!\vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
b 198f
.endif
.endif
@@ -596,17 +628,17 @@
process_tail, \
process_inner_loop
pixman_asm_function fname
pixman_asm_function \fname
/*
* Make some macro arguments globally visible and accessible
* from other macros
*/
.set src_bpp, src_bpp_
.set mask_bpp, mask_bpp_
.set dst_w_bpp, dst_w_bpp_
.set flags, flags_
.set prefetch_distance, prefetch_distance_
.set src_bpp, \src_bpp_
.set mask_bpp, \mask_bpp_
.set dst_w_bpp, \dst_w_bpp_
.set flags, \flags_
.set prefetch_distance, \prefetch_distance_
/*
* Select prefetch type for this function.
@@ -732,7 +764,7 @@
sub Y, Y, #1
#endif
init
\init
.if (flags) & FLAG_PROCESS_CORRUPTS_WK0
/* Reserve a word in which to store X during leading pixels */
@@ -773,7 +805,7 @@
.set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
.endif
151: /* New line */
newline
\newline
preload_leading_step1 src_bpp, WK1, SRC
preload_leading_step1 mask_bpp, WK2, MASK
.if ((flags) & FLAG_NO_PRELOAD_DST) == 0
@@ -790,7 +822,7 @@
preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST
.endif
leading_15bytes process_head, process_tail
leading_15bytes \process_head, \process_tail
154: /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
.if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
@@ -800,10 +832,10 @@
and SCRATCH, MASK, #31
rsb SCRATCH, SCRATCH, #32*prefetch_distance
.endif
.ifc "process_inner_loop",""
switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
.ifc "\process_inner_loop",""
switch_on_alignment wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, wide_case_inner_loop, 157f
.else
switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
switch_on_alignment wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, \process_inner_loop, 157f
.endif
157: /* Check for another line */
@@ -825,7 +857,7 @@
.set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
.endif
161: /* New line */
newline
\newline
preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
preload_line 0, mask_bpp, mask_bpp_shift, MASK
.if ((flags) & FLAG_NO_PRELOAD_DST) == 0
@@ -837,10 +869,10 @@
beq 164f
rsb WK0, WK0, #16 /* number of leading bytes until destination aligned */
leading_15bytes process_head, process_tail
leading_15bytes \process_head, \process_tail
164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */
switch_on_alignment medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f
switch_on_alignment medium_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 167f
167: /* Check for another line */
end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b
@@ -856,7 +888,7 @@
.word 0xE92D0000 | LINE_SAVED_REGS
.endif
171: /* New line */
newline
\newline
preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
preload_line 1, mask_bpp, mask_bpp_shift, MASK
.if ((flags) & FLAG_NO_PRELOAD_DST) == 0
@@ -868,8 +900,8 @@
beq 174f
172: subs X, X, #1
blo 177f
process_head , 1, 0, 1, 1, 0
process_tail , 1, 0
\process_head , 1, 0, 1, 1, 0
\process_tail , 1, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
pixst , 1, 0, DST
.endif
@@ -880,15 +912,15 @@
beq 174f
subs X, X, #1
blo 177f
process_head , 2, 0, 1, 1, 0
process_tail , 2, 0
\process_head , 2, 0, 1, 1, 0
\process_tail , 2, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
pixst , 2, 0, DST
.endif
.endif
174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f
switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 177f
177: /* Check for another line */
end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
@@ -908,7 +940,7 @@
add sp, sp, #4
.endif
cleanup
\cleanup
#ifdef DEBUG_PARAMS
add sp, sp, #9*4 /* junk the debug copy of arguments */
@@ -932,13 +964,15 @@
.unreq WK3
.unreq SCRATCH
.unreq ORIG_W
#ifndef __clang__
.endfunc
#endif
.endm
.macro line_saved_regs x:vararg
.set LINE_SAVED_REGS, 0
.set LINE_SAVED_REG_COUNT, 0
.irp SAVED_REG,x
.irp SAVED_REG,\x
.ifc "SAVED_REG","Y"
.set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
.set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1