зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1820720 - Part 2. Import https://gitlab.freedesktop.org/pixman/pixman/-/issues/74. r=gfx-reviewers,lsalzman.
This issue only affects arm32 builds with clang. Depends on D183969. Differential Revision: https://phabricator.services.mozilla.com/D183970
This commit is contained in:
Родитель
97981b55e8
Коммит
47a779e282
|
@ -82,28 +82,28 @@
|
|||
mov TMP1, X, asr #16
|
||||
add X, X, UX
|
||||
add TMP1, TOP, TMP1, asl #2
|
||||
vld1.32 {reg1}, [TMP1], STRIDE
|
||||
vld1.32 {reg2}, [TMP1]
|
||||
vld1.32 {\reg1}, [TMP1], STRIDE
|
||||
vld1.32 {\reg2}, [TMP1]
|
||||
.endm
|
||||
|
||||
.macro bilinear_load_0565 reg1, reg2, tmp
|
||||
mov TMP1, X, asr #16
|
||||
add X, X, UX
|
||||
add TMP1, TOP, TMP1, asl #1
|
||||
vld1.32 {reg2[0]}, [TMP1], STRIDE
|
||||
vld1.32 {reg2[1]}, [TMP1]
|
||||
convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
|
||||
vld1.32 {\reg2[0]}, [TMP1], STRIDE
|
||||
vld1.32 {\reg2[1]}, [TMP1]
|
||||
convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
|
||||
.endm
|
||||
|
||||
.macro bilinear_load_and_vertical_interpolate_two_8888 \
|
||||
acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
|
||||
|
||||
bilinear_load_8888 reg1, reg2, tmp1
|
||||
vmull.u8 acc1, reg1, d28
|
||||
vmlal.u8 acc1, reg2, d29
|
||||
bilinear_load_8888 reg3, reg4, tmp2
|
||||
vmull.u8 acc2, reg3, d28
|
||||
vmlal.u8 acc2, reg4, d29
|
||||
bilinear_load_8888 \reg1, \reg2, \tmp1
|
||||
vmull.u8 \acc1, \reg1, d28
|
||||
vmlal.u8 \acc1, \reg2, d29
|
||||
bilinear_load_8888 \reg3, \reg4, \tmp2
|
||||
vmull.u8 \acc2, \reg3, d28
|
||||
vmlal.u8 \acc2, \reg4, d29
|
||||
.endm
|
||||
|
||||
.macro bilinear_load_and_vertical_interpolate_four_8888 \
|
||||
|
@ -111,9 +111,9 @@
|
|||
yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
|
||||
|
||||
bilinear_load_and_vertical_interpolate_two_8888 \
|
||||
xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
|
||||
\xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
|
||||
bilinear_load_and_vertical_interpolate_two_8888 \
|
||||
yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
|
||||
\yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
|
||||
.endm
|
||||
|
||||
.macro bilinear_load_and_vertical_interpolate_two_0565 \
|
||||
|
@ -125,19 +125,19 @@
|
|||
mov TMP2, X, asr #16
|
||||
add X, X, UX
|
||||
add TMP2, TOP, TMP2, asl #1
|
||||
vld1.32 {acc2lo[0]}, [TMP1], STRIDE
|
||||
vld1.32 {acc2hi[0]}, [TMP2], STRIDE
|
||||
vld1.32 {acc2lo[1]}, [TMP1]
|
||||
vld1.32 {acc2hi[1]}, [TMP2]
|
||||
convert_0565_to_x888 acc2, reg3, reg2, reg1
|
||||
vzip.u8 reg1, reg3
|
||||
vzip.u8 reg2, reg4
|
||||
vzip.u8 reg3, reg4
|
||||
vzip.u8 reg1, reg2
|
||||
vmull.u8 acc1, reg1, d28
|
||||
vmlal.u8 acc1, reg2, d29
|
||||
vmull.u8 acc2, reg3, d28
|
||||
vmlal.u8 acc2, reg4, d29
|
||||
vld1.32 {\acc2lo[0]}, [TMP1], STRIDE
|
||||
vld1.32 {\acc2hi[0]}, [TMP2], STRIDE
|
||||
vld1.32 {\acc2lo[1]}, [TMP1]
|
||||
vld1.32 {\acc2hi[1]}, [TMP2]
|
||||
convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
|
||||
vzip.u8 \reg1, \reg3
|
||||
vzip.u8 \reg2, \reg4
|
||||
vzip.u8 \reg3, \reg4
|
||||
vzip.u8 \reg1, \reg2
|
||||
vmull.u8 \acc1, \reg1, d28
|
||||
vmlal.u8 \acc1, \reg2, d29
|
||||
vmull.u8 \acc2, \reg3, d28
|
||||
vmlal.u8 \acc2, \reg4, d29
|
||||
.endm
|
||||
|
||||
.macro bilinear_load_and_vertical_interpolate_four_0565 \
|
||||
|
@ -150,46 +150,46 @@
|
|||
mov TMP2, X, asr #16
|
||||
add X, X, UX
|
||||
add TMP2, TOP, TMP2, asl #1
|
||||
vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
|
||||
vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
|
||||
vld1.32 {xacc2lo[1]}, [TMP1]
|
||||
vld1.32 {xacc2hi[1]}, [TMP2]
|
||||
convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
|
||||
vld1.32 {\xacc2lo[0]}, [TMP1], STRIDE
|
||||
vld1.32 {\xacc2hi[0]}, [TMP2], STRIDE
|
||||
vld1.32 {\xacc2lo[1]}, [TMP1]
|
||||
vld1.32 {\xacc2hi[1]}, [TMP2]
|
||||
convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
|
||||
mov TMP1, X, asr #16
|
||||
add X, X, UX
|
||||
add TMP1, TOP, TMP1, asl #1
|
||||
mov TMP2, X, asr #16
|
||||
add X, X, UX
|
||||
add TMP2, TOP, TMP2, asl #1
|
||||
vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
|
||||
vzip.u8 xreg1, xreg3
|
||||
vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
|
||||
vzip.u8 xreg2, xreg4
|
||||
vld1.32 {yacc2lo[1]}, [TMP1]
|
||||
vzip.u8 xreg3, xreg4
|
||||
vld1.32 {yacc2hi[1]}, [TMP2]
|
||||
vzip.u8 xreg1, xreg2
|
||||
convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
|
||||
vmull.u8 xacc1, xreg1, d28
|
||||
vzip.u8 yreg1, yreg3
|
||||
vmlal.u8 xacc1, xreg2, d29
|
||||
vzip.u8 yreg2, yreg4
|
||||
vmull.u8 xacc2, xreg3, d28
|
||||
vzip.u8 yreg3, yreg4
|
||||
vmlal.u8 xacc2, xreg4, d29
|
||||
vzip.u8 yreg1, yreg2
|
||||
vmull.u8 yacc1, yreg1, d28
|
||||
vmlal.u8 yacc1, yreg2, d29
|
||||
vmull.u8 yacc2, yreg3, d28
|
||||
vmlal.u8 yacc2, yreg4, d29
|
||||
vld1.32 {\yacc2lo[0]}, [TMP1], STRIDE
|
||||
vzip.u8 \xreg1, \xreg3
|
||||
vld1.32 {\yacc2hi[0]}, [TMP2], STRIDE
|
||||
vzip.u8 \xreg2, \xreg4
|
||||
vld1.32 {\yacc2lo[1]}, [TMP1]
|
||||
vzip.u8 \xreg3, \xreg4
|
||||
vld1.32 {\yacc2hi[1]}, [TMP2]
|
||||
vzip.u8 \xreg1, \xreg2
|
||||
convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
|
||||
vmull.u8 \xacc1, \xreg1, d28
|
||||
vzip.u8 \yreg1, \yreg3
|
||||
vmlal.u8 \xacc1, \xreg2, d29
|
||||
vzip.u8 \yreg2, \yreg4
|
||||
vmull.u8 \xacc2, \xreg3, d28
|
||||
vzip.u8 \yreg3, \yreg4
|
||||
vmlal.u8 \xacc2, \xreg4, d29
|
||||
vzip.u8 \yreg1, \yreg2
|
||||
vmull.u8 \yacc1, \yreg1, d28
|
||||
vmlal.u8 \yacc1, \yreg2, d29
|
||||
vmull.u8 \yacc2, \yreg3, d28
|
||||
vmlal.u8 \yacc2, \yreg4, d29
|
||||
.endm
|
||||
|
||||
.macro bilinear_store_8888 numpix, tmp1, tmp2
|
||||
.if numpix == 4
|
||||
.if \numpix == 4
|
||||
vst1.32 {d0, d1}, [OUT]!
|
||||
.elseif numpix == 2
|
||||
.elseif \numpix == 2
|
||||
vst1.32 {d0}, [OUT]!
|
||||
.elseif numpix == 1
|
||||
.elseif \numpix == 1
|
||||
vst1.32 {d0[0]}, [OUT, :32]!
|
||||
.else
|
||||
.error bilinear_store_8888 numpix is unsupported
|
||||
|
@ -201,12 +201,12 @@
|
|||
vuzp.u8 d2, d3
|
||||
vuzp.u8 d1, d3
|
||||
vuzp.u8 d0, d2
|
||||
convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
|
||||
.if numpix == 4
|
||||
convert_8888_to_0565 d2, d1, d0, q1, \tmp1, \tmp2
|
||||
.if \numpix == 4
|
||||
vst1.16 {d2}, [OUT]!
|
||||
.elseif numpix == 2
|
||||
.elseif \numpix == 2
|
||||
vst1.32 {d2[0]}, [OUT]!
|
||||
.elseif numpix == 1
|
||||
.elseif \numpix == 1
|
||||
vst1.16 {d2[0]}, [OUT]!
|
||||
.else
|
||||
.error bilinear_store_0565 numpix is unsupported
|
||||
|
@ -222,20 +222,20 @@
|
|||
.endm
|
||||
|
||||
.macro bilinear_load_mask_8 numpix, mask
|
||||
.if numpix == 4
|
||||
vld1.32 {mask[0]}, [MASK]!
|
||||
.elseif numpix == 2
|
||||
vld1.16 {mask[0]}, [MASK]!
|
||||
.elseif numpix == 1
|
||||
vld1.8 {mask[0]}, [MASK]!
|
||||
.if \numpix == 4
|
||||
vld1.32 {\mask[0]}, [MASK]!
|
||||
.elseif \numpix == 2
|
||||
vld1.16 {\mask[0]}, [MASK]!
|
||||
.elseif \numpix == 1
|
||||
vld1.8 {\mask[0]}, [MASK]!
|
||||
.else
|
||||
.error bilinear_load_mask_8 numpix is unsupported
|
||||
.error bilinear_load_mask_8 \numpix is unsupported
|
||||
.endif
|
||||
pld [MASK, #prefetch_offset]
|
||||
.endm
|
||||
|
||||
.macro bilinear_load_mask mask_fmt, numpix, mask
|
||||
bilinear_load_mask_&mask_fmt numpix, mask
|
||||
bilinear_load_mask_\()\mask_fmt \numpix, \mask
|
||||
.endm
|
||||
|
||||
|
||||
|
@ -250,28 +250,28 @@
|
|||
.endm
|
||||
|
||||
.macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
|
||||
.if numpix == 4
|
||||
vld1.32 {dst0, dst1}, [OUT]
|
||||
.elseif numpix == 2
|
||||
vld1.32 {dst0}, [OUT]
|
||||
.elseif numpix == 1
|
||||
vld1.32 {dst0[0]}, [OUT]
|
||||
.if \numpix == 4
|
||||
vld1.32 {\dst0, \dst1}, [OUT]
|
||||
.elseif \numpix == 2
|
||||
vld1.32 {\dst0}, [OUT]
|
||||
.elseif \numpix == 1
|
||||
vld1.32 {\dst0[0]}, [OUT]
|
||||
.else
|
||||
.error bilinear_load_dst_8888 numpix is unsupported
|
||||
.error bilinear_load_dst_8888 \numpix is unsupported
|
||||
.endif
|
||||
pld [OUT, #(prefetch_offset * 4)]
|
||||
.endm
|
||||
|
||||
.macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
|
||||
bilinear_load_dst_8888 numpix, dst0, dst1, dst01
|
||||
bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
|
||||
.endm
|
||||
|
||||
.macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01
|
||||
bilinear_load_dst_8888 numpix, dst0, dst1, dst01
|
||||
bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
|
||||
.endm
|
||||
|
||||
.macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01
|
||||
bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01
|
||||
bilinear_load_dst_\()\dst_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
|
||||
.endm
|
||||
|
||||
/*
|
||||
|
@ -290,19 +290,19 @@
|
|||
.endm
|
||||
|
||||
.macro bilinear_duplicate_mask_8 numpix, mask
|
||||
.if numpix == 4
|
||||
vdup.32 mask, mask[0]
|
||||
.elseif numpix == 2
|
||||
vdup.16 mask, mask[0]
|
||||
.elseif numpix == 1
|
||||
vdup.8 mask, mask[0]
|
||||
.if \numpix == 4
|
||||
vdup.32 \mask, \mask[0]
|
||||
.elseif \numpix == 2
|
||||
vdup.16 \mask, \mask[0]
|
||||
.elseif \numpix == 1
|
||||
vdup.8 \mask, \mask[0]
|
||||
.else
|
||||
.error bilinear_duplicate_mask_8 is unsupported
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro bilinear_duplicate_mask mask_fmt, numpix, mask
|
||||
bilinear_duplicate_mask_&mask_fmt numpix, mask
|
||||
bilinear_duplicate_mask_\()\mask_fmt \numpix, \mask
|
||||
.endm
|
||||
|
||||
/*
|
||||
|
@ -310,10 +310,10 @@
|
|||
* Interleave should be done when mask is enabled or operator is 'over'.
|
||||
*/
|
||||
.macro bilinear_interleave src0, src1, dst0, dst1
|
||||
vuzp.8 src0, src1
|
||||
vuzp.8 dst0, dst1
|
||||
vuzp.8 src0, src1
|
||||
vuzp.8 dst0, dst1
|
||||
vuzp.8 \src0, \src1
|
||||
vuzp.8 \dst0, \dst1
|
||||
vuzp.8 \src0, \src1
|
||||
vuzp.8 \dst0, \dst1
|
||||
.endm
|
||||
|
||||
.macro bilinear_interleave_src_dst_x_src \
|
||||
|
@ -323,7 +323,7 @@
|
|||
.macro bilinear_interleave_src_dst_x_over \
|
||||
numpix, src0, src1, src01, dst0, dst1, dst01
|
||||
|
||||
bilinear_interleave src0, src1, dst0, dst1
|
||||
bilinear_interleave \src0, \src1, \dst0, \dst1
|
||||
.endm
|
||||
|
||||
.macro bilinear_interleave_src_dst_x_add \
|
||||
|
@ -333,26 +333,26 @@
|
|||
.macro bilinear_interleave_src_dst_8_src \
|
||||
numpix, src0, src1, src01, dst0, dst1, dst01
|
||||
|
||||
bilinear_interleave src0, src1, dst0, dst1
|
||||
bilinear_interleave \src0, \src1, \dst0, \dst1
|
||||
.endm
|
||||
|
||||
.macro bilinear_interleave_src_dst_8_over \
|
||||
numpix, src0, src1, src01, dst0, dst1, dst01
|
||||
|
||||
bilinear_interleave src0, src1, dst0, dst1
|
||||
bilinear_interleave \src0, \src1, \dst0, \dst1
|
||||
.endm
|
||||
|
||||
.macro bilinear_interleave_src_dst_8_add \
|
||||
numpix, src0, src1, src01, dst0, dst1, dst01
|
||||
|
||||
bilinear_interleave src0, src1, dst0, dst1
|
||||
bilinear_interleave \src0, \src1, \dst0, \dst1
|
||||
.endm
|
||||
|
||||
.macro bilinear_interleave_src_dst \
|
||||
mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01
|
||||
|
||||
bilinear_interleave_src_dst_&mask_fmt&_&op \
|
||||
numpix, src0, src1, src01, dst0, dst1, dst01
|
||||
bilinear_interleave_src_dst_\()\mask_fmt\()_\()\op \
|
||||
\numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01
|
||||
.endm
|
||||
|
||||
|
||||
|
@ -370,23 +370,23 @@
|
|||
numpix, src0, src1, src01, mask, \
|
||||
tmp01, tmp23, tmp45, tmp67
|
||||
|
||||
vmull.u8 tmp01, src0, mask
|
||||
vmull.u8 tmp23, src1, mask
|
||||
vmull.u8 \tmp01, \src0, \mask
|
||||
vmull.u8 \tmp23, \src1, \mask
|
||||
/* bubbles */
|
||||
vrshr.u16 tmp45, tmp01, #8
|
||||
vrshr.u16 tmp67, tmp23, #8
|
||||
vrshr.u16 \tmp45, \tmp01, #8
|
||||
vrshr.u16 \tmp67, \tmp23, #8
|
||||
/* bubbles */
|
||||
vraddhn.u16 src0, tmp45, tmp01
|
||||
vraddhn.u16 src1, tmp67, tmp23
|
||||
vraddhn.u16 \src0, \tmp45, \tmp01
|
||||
vraddhn.u16 \src1, \tmp67, \tmp23
|
||||
.endm
|
||||
|
||||
.macro bilinear_apply_mask_to_src \
|
||||
mask_fmt, numpix, src0, src1, src01, mask, \
|
||||
tmp01, tmp23, tmp45, tmp67
|
||||
|
||||
bilinear_apply_mask_to_src_&mask_fmt \
|
||||
numpix, src0, src1, src01, mask, \
|
||||
tmp01, tmp23, tmp45, tmp67
|
||||
bilinear_apply_mask_to_src_\()\mask_fmt \
|
||||
\numpix, \src0, \src1, \src01, \mask, \
|
||||
\tmp01, \tmp23, \tmp45, \tmp67
|
||||
.endm
|
||||
|
||||
|
||||
|
@ -403,79 +403,79 @@
|
|||
numpix, src0, src1, src01, dst0, dst1, dst01, \
|
||||
tmp01, tmp23, tmp45, tmp67, tmp8
|
||||
|
||||
vdup.32 tmp8, src1[1]
|
||||
vdup.32 \tmp8, \src1[1]
|
||||
/* bubbles */
|
||||
vmvn.8 tmp8, tmp8
|
||||
vmvn.8 \tmp8, \tmp8
|
||||
/* bubbles */
|
||||
vmull.u8 tmp01, dst0, tmp8
|
||||
vmull.u8 \tmp01, \dst0, \tmp8
|
||||
/* bubbles */
|
||||
vmull.u8 tmp23, dst1, tmp8
|
||||
vmull.u8 \tmp23, \dst1, \tmp8
|
||||
/* bubbles */
|
||||
vrshr.u16 tmp45, tmp01, #8
|
||||
vrshr.u16 tmp67, tmp23, #8
|
||||
vrshr.u16 \tmp45, \tmp01, #8
|
||||
vrshr.u16 \tmp67, \tmp23, #8
|
||||
/* bubbles */
|
||||
vraddhn.u16 dst0, tmp45, tmp01
|
||||
vraddhn.u16 dst1, tmp67, tmp23
|
||||
vraddhn.u16 \dst0, \tmp45, \tmp01
|
||||
vraddhn.u16 \dst1, \tmp67, \tmp23
|
||||
/* bubbles */
|
||||
vqadd.u8 src01, dst01, src01
|
||||
vqadd.u8 \src01, \dst01, \src01
|
||||
.endm
|
||||
|
||||
.macro bilinear_combine_add \
|
||||
numpix, src0, src1, src01, dst0, dst1, dst01, \
|
||||
tmp01, tmp23, tmp45, tmp67, tmp8
|
||||
|
||||
vqadd.u8 src01, dst01, src01
|
||||
vqadd.u8 \src01, \dst01, \src01
|
||||
.endm
|
||||
|
||||
.macro bilinear_combine \
|
||||
op, numpix, src0, src1, src01, dst0, dst1, dst01, \
|
||||
tmp01, tmp23, tmp45, tmp67, tmp8
|
||||
|
||||
bilinear_combine_&op \
|
||||
numpix, src0, src1, src01, dst0, dst1, dst01, \
|
||||
tmp01, tmp23, tmp45, tmp67, tmp8
|
||||
bilinear_combine_\()\op \
|
||||
\numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01, \
|
||||
\tmp01, \tmp23, \tmp45, \tmp67, \tmp8
|
||||
.endm
|
||||
|
||||
/*
|
||||
* Macros for final deinterleaving of destination pixels if needed.
|
||||
*/
|
||||
.macro bilinear_deinterleave numpix, dst0, dst1, dst01
|
||||
vuzp.8 dst0, dst1
|
||||
vuzp.8 \dst0, \dst1
|
||||
/* bubbles */
|
||||
vuzp.8 dst0, dst1
|
||||
vuzp.8 \dst0, \dst1
|
||||
.endm
|
||||
|
||||
.macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01
|
||||
.endm
|
||||
|
||||
.macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
|
||||
bilinear_deinterleave numpix, dst0, dst1, dst01
|
||||
bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
|
||||
.endm
|
||||
|
||||
.macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01
|
||||
.endm
|
||||
|
||||
.macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
|
||||
bilinear_deinterleave numpix, dst0, dst1, dst01
|
||||
bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
|
||||
.endm
|
||||
|
||||
.macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01
|
||||
bilinear_deinterleave numpix, dst0, dst1, dst01
|
||||
bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
|
||||
.endm
|
||||
|
||||
.macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01
|
||||
bilinear_deinterleave numpix, dst0, dst1, dst01
|
||||
bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
|
||||
.endm
|
||||
|
||||
.macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01
|
||||
bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01
|
||||
bilinear_deinterleave_dst_\()\mask_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
|
||||
.endm
|
||||
|
||||
|
||||
.macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
|
||||
bilinear_load_&src_fmt d0, d1, d2
|
||||
bilinear_load_mask mask_fmt, 1, d4
|
||||
bilinear_load_dst dst_fmt, op, 1, d18, d19, q9
|
||||
bilinear_load_\()\src_fmt d0, d1, d2
|
||||
bilinear_load_mask \mask_fmt, 1, d4
|
||||
bilinear_load_dst \dst_fmt, \op, 1, d18, d19, q9
|
||||
vmull.u8 q1, d0, d28
|
||||
vmlal.u8 q1, d1, d29
|
||||
/* 5 cycles bubble */
|
||||
|
@ -483,28 +483,28 @@
|
|||
vmlsl.u16 q0, d2, d30
|
||||
vmlal.u16 q0, d3, d30
|
||||
/* 5 cycles bubble */
|
||||
bilinear_duplicate_mask mask_fmt, 1, d4
|
||||
bilinear_duplicate_mask \mask_fmt, 1, d4
|
||||
vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
|
||||
/* 3 cycles bubble */
|
||||
vmovn.u16 d0, q0
|
||||
/* 1 cycle bubble */
|
||||
bilinear_interleave_src_dst \
|
||||
mask_fmt, op, 1, d0, d1, q0, d18, d19, q9
|
||||
\mask_fmt, \op, 1, d0, d1, q0, d18, d19, q9
|
||||
bilinear_apply_mask_to_src \
|
||||
mask_fmt, 1, d0, d1, q0, d4, \
|
||||
\mask_fmt, 1, d0, d1, q0, d4, \
|
||||
q3, q8, q10, q11
|
||||
bilinear_combine \
|
||||
op, 1, d0, d1, q0, d18, d19, q9, \
|
||||
\op, 1, d0, d1, q0, d18, d19, q9, \
|
||||
q3, q8, q10, q11, d5
|
||||
bilinear_deinterleave_dst mask_fmt, op, 1, d0, d1, q0
|
||||
bilinear_store_&dst_fmt 1, q2, q3
|
||||
bilinear_deinterleave_dst \mask_fmt, \op, 1, d0, d1, q0
|
||||
bilinear_store_\()\dst_fmt 1, q2, q3
|
||||
.endm
|
||||
|
||||
.macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
|
||||
bilinear_load_and_vertical_interpolate_two_&src_fmt \
|
||||
bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
|
||||
q1, q11, d0, d1, d20, d21, d22, d23
|
||||
bilinear_load_mask mask_fmt, 2, d4
|
||||
bilinear_load_dst dst_fmt, op, 2, d18, d19, q9
|
||||
bilinear_load_mask \mask_fmt, 2, d4
|
||||
bilinear_load_dst \dst_fmt, \op, 2, d18, d19, q9
|
||||
vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
|
||||
vmlsl.u16 q0, d2, d30
|
||||
vmlal.u16 q0, d3, d30
|
||||
|
@ -513,24 +513,24 @@
|
|||
vmlal.u16 q10, d23, d31
|
||||
vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
|
||||
vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
|
||||
bilinear_duplicate_mask mask_fmt, 2, d4
|
||||
bilinear_duplicate_mask \mask_fmt, 2, d4
|
||||
vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
|
||||
vadd.u16 q12, q12, q13
|
||||
vmovn.u16 d0, q0
|
||||
bilinear_interleave_src_dst \
|
||||
mask_fmt, op, 2, d0, d1, q0, d18, d19, q9
|
||||
\mask_fmt, \op, 2, d0, d1, q0, d18, d19, q9
|
||||
bilinear_apply_mask_to_src \
|
||||
mask_fmt, 2, d0, d1, q0, d4, \
|
||||
\mask_fmt, 2, d0, d1, q0, d4, \
|
||||
q3, q8, q10, q11
|
||||
bilinear_combine \
|
||||
op, 2, d0, d1, q0, d18, d19, q9, \
|
||||
\op, 2, d0, d1, q0, d18, d19, q9, \
|
||||
q3, q8, q10, q11, d5
|
||||
bilinear_deinterleave_dst mask_fmt, op, 2, d0, d1, q0
|
||||
bilinear_store_&dst_fmt 2, q2, q3
|
||||
bilinear_deinterleave_dst \mask_fmt, \op, 2, d0, d1, q0
|
||||
bilinear_store_\()\dst_fmt 2, q2, q3
|
||||
.endm
|
||||
|
||||
.macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
|
||||
bilinear_load_and_vertical_interpolate_four_&src_fmt \
|
||||
bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
|
||||
q1, q11, d0, d1, d20, d21, d22, d23 \
|
||||
q3, q9, d4, d5, d16, d17, d18, d19
|
||||
pld [TMP1, PF_OFFS]
|
||||
|
@ -546,8 +546,8 @@
|
|||
vmlsl.u16 q2, d6, d30
|
||||
vmlal.u16 q2, d7, d30
|
||||
vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
|
||||
bilinear_load_mask mask_fmt, 4, d22
|
||||
bilinear_load_dst dst_fmt, op, 4, d2, d3, q1
|
||||
bilinear_load_mask \mask_fmt, 4, d22
|
||||
bilinear_load_dst \dst_fmt, \op, 4, d2, d3, q1
|
||||
pld [TMP1, PF_OFFS]
|
||||
vmlsl.u16 q8, d18, d31
|
||||
vmlal.u16 q8, d19, d31
|
||||
|
@ -556,21 +556,21 @@
|
|||
vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
|
||||
vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
|
||||
vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
|
||||
bilinear_duplicate_mask mask_fmt, 4, d22
|
||||
bilinear_duplicate_mask \mask_fmt, 4, d22
|
||||
vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
|
||||
vmovn.u16 d0, q0
|
||||
vmovn.u16 d1, q2
|
||||
vadd.u16 q12, q12, q13
|
||||
bilinear_interleave_src_dst \
|
||||
mask_fmt, op, 4, d0, d1, q0, d2, d3, q1
|
||||
\mask_fmt, \op, 4, d0, d1, q0, d2, d3, q1
|
||||
bilinear_apply_mask_to_src \
|
||||
mask_fmt, 4, d0, d1, q0, d22, \
|
||||
\mask_fmt, 4, d0, d1, q0, d22, \
|
||||
q3, q8, q9, q10
|
||||
bilinear_combine \
|
||||
op, 4, d0, d1, q0, d2, d3, q1, \
|
||||
\op, 4, d0, d1, q0, d2, d3, q1, \
|
||||
q3, q8, q9, q10, d23
|
||||
bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0
|
||||
bilinear_store_&dst_fmt 4, q2, q3
|
||||
bilinear_deinterleave_dst \mask_fmt, \op, 4, d0, d1, q0
|
||||
bilinear_store_\()\dst_fmt 4, q2, q3
|
||||
.endm
|
||||
|
||||
.set BILINEAR_FLAG_USE_MASK, 1
|
||||
|
@ -610,14 +610,14 @@
|
|||
prefetch_distance, \
|
||||
flags
|
||||
|
||||
pixman_asm_function fname
|
||||
.if pixblock_size == 8
|
||||
.elseif pixblock_size == 4
|
||||
pixman_asm_function \fname
|
||||
.if \pixblock_size == 8
|
||||
.elseif \pixblock_size == 4
|
||||
.else
|
||||
.error unsupported pixblock size
|
||||
.endif
|
||||
|
||||
.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
|
||||
.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
|
||||
OUT .req r0
|
||||
TOP .req r1
|
||||
BOTTOM .req r2
|
||||
|
@ -635,7 +635,7 @@ pixman_asm_function fname
|
|||
|
||||
mov ip, sp
|
||||
push {r4, r5, r6, r7, r8, r9}
|
||||
mov PF_OFFS, #prefetch_distance
|
||||
mov PF_OFFS, #\prefetch_distance
|
||||
ldmia ip, {WB, X, UX, WIDTH}
|
||||
.else
|
||||
OUT .req r0
|
||||
|
@ -654,17 +654,17 @@ pixman_asm_function fname
|
|||
TMP4 .req r10
|
||||
STRIDE .req r3
|
||||
|
||||
.set prefetch_offset, prefetch_distance
|
||||
.set prefetch_offset, \prefetch_distance
|
||||
|
||||
mov ip, sp
|
||||
push {r4, r5, r6, r7, r8, r9, r10, ip}
|
||||
mov PF_OFFS, #prefetch_distance
|
||||
mov PF_OFFS, #\prefetch_distance
|
||||
ldmia ip, {WT, WB, X, UX, WIDTH}
|
||||
.endif
|
||||
|
||||
mul PF_OFFS, PF_OFFS, UX
|
||||
|
||||
.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
|
||||
.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
|
||||
vpush {d8-d15}
|
||||
.endif
|
||||
|
||||
|
@ -683,11 +683,11 @@ pixman_asm_function fname
|
|||
/* ensure good destination alignment */
|
||||
cmp WIDTH, #1
|
||||
blt 0f
|
||||
tst OUT, #(1 << dst_bpp_shift)
|
||||
tst OUT, #(1 << \dst_bpp_shift)
|
||||
beq 0f
|
||||
vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
|
||||
vadd.u16 q12, q12, q13
|
||||
bilinear_process_last_pixel
|
||||
\bilinear_process_last_pixel
|
||||
sub WIDTH, WIDTH, #1
|
||||
0:
|
||||
vadd.u16 q13, q13, q13
|
||||
|
@ -696,53 +696,53 @@ pixman_asm_function fname
|
|||
|
||||
cmp WIDTH, #2
|
||||
blt 0f
|
||||
tst OUT, #(1 << (dst_bpp_shift + 1))
|
||||
tst OUT, #(1 << (\dst_bpp_shift + 1))
|
||||
beq 0f
|
||||
bilinear_process_two_pixels
|
||||
\bilinear_process_two_pixels
|
||||
sub WIDTH, WIDTH, #2
|
||||
0:
|
||||
.if pixblock_size == 8
|
||||
.if \pixblock_size == 8
|
||||
cmp WIDTH, #4
|
||||
blt 0f
|
||||
tst OUT, #(1 << (dst_bpp_shift + 2))
|
||||
tst OUT, #(1 << (\dst_bpp_shift + 2))
|
||||
beq 0f
|
||||
bilinear_process_four_pixels
|
||||
\bilinear_process_four_pixels
|
||||
sub WIDTH, WIDTH, #4
|
||||
0:
|
||||
.endif
|
||||
subs WIDTH, WIDTH, #pixblock_size
|
||||
subs WIDTH, WIDTH, #\pixblock_size
|
||||
blt 1f
|
||||
mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
|
||||
bilinear_process_pixblock_head
|
||||
subs WIDTH, WIDTH, #pixblock_size
|
||||
mov PF_OFFS, PF_OFFS, asr #(16 - \src_bpp_shift)
|
||||
\bilinear_process_pixblock_head
|
||||
subs WIDTH, WIDTH, #\pixblock_size
|
||||
blt 5f
|
||||
0:
|
||||
bilinear_process_pixblock_tail_head
|
||||
subs WIDTH, WIDTH, #pixblock_size
|
||||
\bilinear_process_pixblock_tail_head
|
||||
subs WIDTH, WIDTH, #\pixblock_size
|
||||
bge 0b
|
||||
5:
|
||||
bilinear_process_pixblock_tail
|
||||
\bilinear_process_pixblock_tail
|
||||
1:
|
||||
.if pixblock_size == 8
|
||||
.if \pixblock_size == 8
|
||||
tst WIDTH, #4
|
||||
beq 2f
|
||||
bilinear_process_four_pixels
|
||||
\bilinear_process_four_pixels
|
||||
2:
|
||||
.endif
|
||||
/* handle the remaining trailing pixels */
|
||||
tst WIDTH, #2
|
||||
beq 2f
|
||||
bilinear_process_two_pixels
|
||||
\bilinear_process_two_pixels
|
||||
2:
|
||||
tst WIDTH, #1
|
||||
beq 3f
|
||||
bilinear_process_last_pixel
|
||||
\bilinear_process_last_pixel
|
||||
3:
|
||||
.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
|
||||
.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
|
||||
vpop {d8-d15}
|
||||
.endif
|
||||
|
||||
.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
|
||||
.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
|
||||
pop {r4, r5, r6, r7, r8, r9}
|
||||
.else
|
||||
pop {r4, r5, r6, r7, r8, r9, r10, ip}
|
||||
|
@ -762,11 +762,13 @@ pixman_asm_function fname
|
|||
.unreq TMP3
|
||||
.unreq TMP4
|
||||
.unreq STRIDE
|
||||
.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
|
||||
.if ((\flags) & BILINEAR_FLAG_USE_MASK) != 0
|
||||
.unreq MASK
|
||||
.endif
|
||||
|
||||
#ifndef __clang__
|
||||
.endfunc
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
|
|
|
@ -34,6 +34,12 @@
|
|||
* - pixman_composite_over_n_8_0565_asm_neon
|
||||
*/
|
||||
|
||||
#ifdef __clang__
|
||||
#define ldrgeb ldrbge
|
||||
#define subges subsge
|
||||
#define subpls subspl
|
||||
#endif
|
||||
|
||||
/* Prevent the stack from becoming executable for no reason... */
|
||||
#if defined(__linux__) && defined(__ELF__)
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
|
@ -260,13 +266,13 @@
|
|||
vshrn.u16 d7, q2, #3
|
||||
vsli.u16 q2, q2, #5
|
||||
vshll.u8 q14, d16, #8
|
||||
PF add PF_X, PF_X, #8
|
||||
PF add, PF_X, PF_X, #8
|
||||
vshll.u8 q8, d19, #8
|
||||
PF tst PF_CTL, #0xF
|
||||
PF tst, PF_CTL, #0xF
|
||||
vsri.u8 d6, d6, #5
|
||||
PF addne PF_X, PF_X, #8
|
||||
PF addne, PF_X, PF_X, #8
|
||||
vmvn.8 d3, d3
|
||||
PF subne PF_CTL, PF_CTL, #1
|
||||
PF subne, PF_CTL, PF_CTL, #1
|
||||
vsri.u8 d7, d7, #6
|
||||
vshrn.u16 d30, q2, #2
|
||||
vmull.u8 q10, d3, d6
|
||||
|
@ -275,18 +281,18 @@
|
|||
vmull.u8 q12, d3, d30
|
||||
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
|
||||
vsri.u16 q14, q8, #5
|
||||
PF cmp PF_X, ORIG_W
|
||||
PF cmp, PF_X, ORIG_W
|
||||
vshll.u8 q9, d18, #8
|
||||
vrshr.u16 q13, q10, #8
|
||||
PF subge PF_X, PF_X, ORIG_W
|
||||
PF subge, PF_X, PF_X, ORIG_W
|
||||
vrshr.u16 q3, q11, #8
|
||||
vrshr.u16 q15, q12, #8
|
||||
PF subges PF_CTL, PF_CTL, #0x10
|
||||
PF subges, PF_CTL, PF_CTL, #0x10
|
||||
vsri.u16 q14, q9, #11
|
||||
PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
|
||||
PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
|
||||
vraddhn.u16 d20, q10, q13
|
||||
vraddhn.u16 d23, q11, q3
|
||||
PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
|
||||
PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
|
||||
vraddhn.u16 d22, q12, q15
|
||||
vst1.16 {d28, d29}, [DST_W, :128]!
|
||||
.endm
|
||||
|
@ -434,20 +440,20 @@ generate_composite_function \
|
|||
|
||||
.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
|
||||
vsri.u16 q14, q8, #5
|
||||
PF add PF_X, PF_X, #8
|
||||
PF tst PF_CTL, #0xF
|
||||
PF add, PF_X, PF_X, #8
|
||||
PF tst, PF_CTL, #0xF
|
||||
fetch_src_pixblock
|
||||
PF addne PF_X, PF_X, #8
|
||||
PF subne PF_CTL, PF_CTL, #1
|
||||
PF addne, PF_X, PF_X, #8
|
||||
PF subne, PF_CTL, PF_CTL, #1
|
||||
vsri.u16 q14, q9, #11
|
||||
PF cmp PF_X, ORIG_W
|
||||
PF cmp, PF_X, ORIG_W
|
||||
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
|
||||
vshll.u8 q8, d1, #8
|
||||
vst1.16 {d28, d29}, [DST_W, :128]!
|
||||
PF subge PF_X, PF_X, ORIG_W
|
||||
PF subges PF_CTL, PF_CTL, #0x10
|
||||
PF subge, PF_X, PF_X, ORIG_W
|
||||
PF subges, PF_CTL, PF_CTL, #0x10
|
||||
vshll.u8 q14, d2, #8
|
||||
PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
|
||||
PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
|
||||
vshll.u8 q9, d0, #8
|
||||
.endm
|
||||
|
||||
|
@ -509,20 +515,20 @@ generate_composite_function \
|
|||
|
||||
.macro pixman_composite_add_8_8_process_pixblock_tail_head
|
||||
fetch_src_pixblock
|
||||
PF add PF_X, PF_X, #32
|
||||
PF tst PF_CTL, #0xF
|
||||
PF add, PF_X, PF_X, #32
|
||||
PF tst, PF_CTL, #0xF
|
||||
vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
|
||||
PF addne PF_X, PF_X, #32
|
||||
PF subne PF_CTL, PF_CTL, #1
|
||||
PF addne, PF_X, PF_X, #32
|
||||
PF subne, PF_CTL, PF_CTL, #1
|
||||
vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
|
||||
PF cmp PF_X, ORIG_W
|
||||
PF cmp, PF_X, ORIG_W
|
||||
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
|
||||
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
|
||||
PF subge PF_X, PF_X, ORIG_W
|
||||
PF subges PF_CTL, PF_CTL, #0x10
|
||||
PF subge, PF_X, PF_X, ORIG_W
|
||||
PF subges, PF_CTL, PF_CTL, #0x10
|
||||
vqadd.u8 q14, q0, q2
|
||||
PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
|
||||
PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
|
||||
PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
|
||||
PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
|
||||
vqadd.u8 q15, q1, q3
|
||||
.endm
|
||||
|
||||
|
@ -541,20 +547,20 @@ generate_composite_function \
|
|||
|
||||
.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
|
||||
fetch_src_pixblock
|
||||
PF add PF_X, PF_X, #8
|
||||
PF tst PF_CTL, #0xF
|
||||
PF add, PF_X, PF_X, #8
|
||||
PF tst, PF_CTL, #0xF
|
||||
vld1.32 {d4, d5, d6, d7}, [DST_R, :128]!
|
||||
PF addne PF_X, PF_X, #8
|
||||
PF subne PF_CTL, PF_CTL, #1
|
||||
PF addne, PF_X, PF_X, #8
|
||||
PF subne, PF_CTL, PF_CTL, #1
|
||||
vst1.32 {d28, d29, d30, d31}, [DST_W, :128]!
|
||||
PF cmp PF_X, ORIG_W
|
||||
PF cmp, PF_X, ORIG_W
|
||||
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
|
||||
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
|
||||
PF subge PF_X, PF_X, ORIG_W
|
||||
PF subges PF_CTL, PF_CTL, #0x10
|
||||
PF subge, PF_X, PF_X, ORIG_W
|
||||
PF subges, PF_CTL, PF_CTL, #0x10
|
||||
vqadd.u8 q14, q0, q2
|
||||
PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
|
||||
PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
|
||||
PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
|
||||
PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
|
||||
vqadd.u8 q15, q1, q3
|
||||
.endm
|
||||
|
||||
|
@ -604,16 +610,16 @@ generate_composite_function_single_scanline \
|
|||
.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
|
||||
vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
|
||||
vrshr.u16 q14, q8, #8
|
||||
PF add PF_X, PF_X, #8
|
||||
PF tst PF_CTL, #0xF
|
||||
PF add, PF_X, PF_X, #8
|
||||
PF tst, PF_CTL, #0xF
|
||||
vrshr.u16 q15, q9, #8
|
||||
vrshr.u16 q12, q10, #8
|
||||
vrshr.u16 q13, q11, #8
|
||||
PF addne PF_X, PF_X, #8
|
||||
PF subne PF_CTL, PF_CTL, #1
|
||||
PF addne, PF_X, PF_X, #8
|
||||
PF subne, PF_CTL, PF_CTL, #1
|
||||
vraddhn.u16 d28, q14, q8
|
||||
vraddhn.u16 d29, q15, q9
|
||||
PF cmp PF_X, ORIG_W
|
||||
PF cmp, PF_X, ORIG_W
|
||||
vraddhn.u16 d30, q12, q10
|
||||
vraddhn.u16 d31, q13, q11
|
||||
fetch_src_pixblock
|
||||
|
@ -621,13 +627,13 @@ generate_composite_function_single_scanline \
|
|||
vmvn.8 d22, d3
|
||||
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
|
||||
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
|
||||
PF subge PF_X, PF_X, ORIG_W
|
||||
PF subge, PF_X, PF_X, ORIG_W
|
||||
vmull.u8 q8, d22, d4
|
||||
PF subges PF_CTL, PF_CTL, #0x10
|
||||
PF subsge, PF_CTL, PF_CTL, #0x10
|
||||
vmull.u8 q9, d22, d5
|
||||
PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
|
||||
PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
|
||||
vmull.u8 q10, d22, d6
|
||||
PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
|
||||
PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
|
||||
vmull.u8 q11, d22, d7
|
||||
.endm
|
||||
|
||||
|
@ -656,16 +662,16 @@ generate_composite_function_single_scanline \
|
|||
.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
|
||||
vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
|
||||
vrshr.u16 q14, q8, #8
|
||||
PF add PF_X, PF_X, #8
|
||||
PF tst PF_CTL, #0xF
|
||||
PF add, PF_X, PF_X, #8
|
||||
PF tst, PF_CTL, #0xF
|
||||
vrshr.u16 q15, q9, #8
|
||||
vrshr.u16 q12, q10, #8
|
||||
vrshr.u16 q13, q11, #8
|
||||
PF addne PF_X, PF_X, #8
|
||||
PF subne PF_CTL, PF_CTL, #1
|
||||
PF addne, PF_X, PF_X, #8
|
||||
PF subne, PF_CTL, PF_CTL, #1
|
||||
vraddhn.u16 d28, q14, q8
|
||||
vraddhn.u16 d29, q15, q9
|
||||
PF cmp PF_X, ORIG_W
|
||||
PF cmp, PF_X, ORIG_W
|
||||
vraddhn.u16 d30, q12, q10
|
||||
vraddhn.u16 d31, q13, q11
|
||||
vqadd.u8 q14, q0, q14
|
||||
|
@ -675,13 +681,13 @@ generate_composite_function_single_scanline \
|
|||
vmvn.8 d22, d3
|
||||
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
|
||||
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
|
||||
PF subge PF_X, PF_X, ORIG_W
|
||||
PF subge, PF_X, PF_X, ORIG_W
|
||||
vmull.u8 q8, d22, d4
|
||||
PF subges PF_CTL, PF_CTL, #0x10
|
||||
PF subges, PF_CTL, PF_CTL, #0x10
|
||||
vmull.u8 q9, d22, d5
|
||||
PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
|
||||
PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
|
||||
vmull.u8 q10, d22, d6
|
||||
PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
|
||||
PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
|
||||
vmull.u8 q11, d22, d7
|
||||
.endm
|
||||
|
||||
|
@ -742,20 +748,20 @@ generate_composite_function_single_scanline \
|
|||
vraddhn.u16 d31, q3, q11
|
||||
vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
|
||||
vqadd.u8 q14, q0, q14
|
||||
PF add PF_X, PF_X, #8
|
||||
PF tst PF_CTL, #0x0F
|
||||
PF addne PF_X, PF_X, #8
|
||||
PF subne PF_CTL, PF_CTL, #1
|
||||
PF add, PF_X, PF_X, #8
|
||||
PF tst, PF_CTL, #0x0F
|
||||
PF addne, PF_X, PF_X, #8
|
||||
PF subne, PF_CTL, PF_CTL, #1
|
||||
vqadd.u8 q15, q1, q15
|
||||
PF cmp PF_X, ORIG_W
|
||||
PF cmp, PF_X, ORIG_W
|
||||
vmull.u8 q8, d24, d4
|
||||
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
|
||||
vmull.u8 q9, d24, d5
|
||||
PF subge PF_X, PF_X, ORIG_W
|
||||
PF subge, PF_X, PF_X, ORIG_W
|
||||
vmull.u8 q10, d24, d6
|
||||
PF subges PF_CTL, PF_CTL, #0x10
|
||||
PF subges, PF_CTL, PF_CTL, #0x10
|
||||
vmull.u8 q11, d24, d7
|
||||
PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
|
||||
PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
|
||||
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
|
||||
.endm
|
||||
|
||||
|
@ -784,16 +790,16 @@ generate_composite_function \
|
|||
|
||||
.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
|
||||
vrshr.u16 q14, q8, #8
|
||||
PF add PF_X, PF_X, #8
|
||||
PF tst PF_CTL, #0xF
|
||||
PF add, PF_X, PF_X, #8
|
||||
PF tst, PF_CTL, #0xF
|
||||
vrshr.u16 q15, q9, #8
|
||||
vrshr.u16 q12, q10, #8
|
||||
vrshr.u16 q13, q11, #8
|
||||
PF addne PF_X, PF_X, #8
|
||||
PF subne PF_CTL, PF_CTL, #1
|
||||
PF addne, PF_X, PF_X, #8
|
||||
PF subne, PF_CTL, PF_CTL, #1
|
||||
vraddhn.u16 d28, q14, q8
|
||||
vraddhn.u16 d29, q15, q9
|
||||
PF cmp PF_X, ORIG_W
|
||||
PF cmp, PF_X, ORIG_W
|
||||
vraddhn.u16 d30, q12, q10
|
||||
vraddhn.u16 d31, q13, q11
|
||||
vqadd.u8 q14, q0, q14
|
||||
|
@ -802,12 +808,12 @@ generate_composite_function \
|
|||
vmvn.8 d22, d3
|
||||
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
|
||||
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
|
||||
PF subge PF_X, PF_X, ORIG_W
|
||||
PF subge, PF_X, PF_X, ORIG_W
|
||||
vmull.u8 q8, d22, d4
|
||||
PF subges PF_CTL, PF_CTL, #0x10
|
||||
PF subges, PF_CTL, PF_CTL, #0x10
|
||||
vmull.u8 q9, d22, d5
|
||||
vmull.u8 q10, d22, d6
|
||||
PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
|
||||
PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
|
||||
vmull.u8 q11, d22, d7
|
||||
.endm
|
||||
|
||||
|
@ -1245,23 +1251,23 @@ generate_composite_function \
|
|||
|
||||
.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
|
||||
fetch_mask_pixblock
|
||||
PF add PF_X, PF_X, #8
|
||||
PF add, PF_X, PF_X, #8
|
||||
vrshrn.u16 d28, q8, #8
|
||||
PF tst PF_CTL, #0x0F
|
||||
PF tst, PF_CTL, #0x0F
|
||||
vrshrn.u16 d29, q9, #8
|
||||
PF addne PF_X, PF_X, #8
|
||||
PF addne, PF_X, PF_X, #8
|
||||
vrshrn.u16 d30, q10, #8
|
||||
PF subne PF_CTL, PF_CTL, #1
|
||||
PF subne, PF_CTL, PF_CTL, #1
|
||||
vrshrn.u16 d31, q11, #8
|
||||
PF cmp PF_X, ORIG_W
|
||||
PF cmp, PF_X, ORIG_W
|
||||
vmull.u8 q8, d24, d0
|
||||
PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
|
||||
vmull.u8 q9, d24, d1
|
||||
PF subge PF_X, PF_X, ORIG_W
|
||||
PF subge, PF_X, PF_X, ORIG_W
|
||||
vmull.u8 q10, d24, d2
|
||||
PF subges PF_CTL, PF_CTL, #0x10
|
||||
PF subges, PF_CTL, PF_CTL, #0x10
|
||||
vmull.u8 q11, d24, d3
|
||||
PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
|
||||
PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
|
||||
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
|
||||
vrsra.u16 q8, q8, #8
|
||||
vrsra.u16 q9, q9, #8
|
||||
|
@ -1314,23 +1320,23 @@ generate_composite_function \
|
|||
|
||||
.macro pixman_composite_src_n_8_8_process_pixblock_tail_head
|
||||
fetch_mask_pixblock
|
||||
PF add PF_X, PF_X, #8
|
||||
PF add, PF_X, PF_X, #8
|
||||
vrshrn.u16 d28, q0, #8
|
||||
PF tst PF_CTL, #0x0F
|
||||
PF tst, PF_CTL, #0x0F
|
||||
vrshrn.u16 d29, q1, #8
|
||||
PF addne PF_X, PF_X, #8
|
||||
PF addne, PF_X, PF_X, #8
|
||||
vrshrn.u16 d30, q2, #8
|
||||
PF subne PF_CTL, PF_CTL, #1
|
||||
PF subne, PF_CTL, PF_CTL, #1
|
||||
vrshrn.u16 d31, q3, #8
|
||||
PF cmp PF_X, ORIG_W
|
||||
PF cmp, PF_X, ORIG_W
|
||||
vmull.u8 q0, d24, d16
|
||||
PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
|
||||
vmull.u8 q1, d25, d16
|
||||
PF subge PF_X, PF_X, ORIG_W
|
||||
PF subge, PF_X, PF_X, ORIG_W
|
||||
vmull.u8 q2, d26, d16
|
||||
PF subges PF_CTL, PF_CTL, #0x10
|
||||
PF subges, PF_CTL, PF_CTL, #0x10
|
||||
vmull.u8 q3, d27, d16
|
||||
PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
|
||||
PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
|
||||
vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
|
||||
vrsra.u16 q0, q0, #8
|
||||
vrsra.u16 q1, q1, #8
|
||||
|
@ -1408,27 +1414,27 @@ generate_composite_function \
|
|||
vrshr.u16 q15, q9, #8
|
||||
fetch_mask_pixblock
|
||||
vrshr.u16 q6, q10, #8
|
||||
PF add PF_X, PF_X, #8
|
||||
PF add, PF_X, PF_X, #8
|
||||
vrshr.u16 q7, q11, #8
|
||||
PF tst PF_CTL, #0x0F
|
||||
PF tst, PF_CTL, #0x0F
|
||||
vraddhn.u16 d28, q14, q8
|
||||
PF addne PF_X, PF_X, #8
|
||||
PF addne, PF_X, PF_X, #8
|
||||
vraddhn.u16 d29, q15, q9
|
||||
PF subne PF_CTL, PF_CTL, #1
|
||||
PF subne, PF_CTL, PF_CTL, #1
|
||||
vraddhn.u16 d30, q6, q10
|
||||
PF cmp PF_X, ORIG_W
|
||||
PF cmp, PF_X, ORIG_W
|
||||
vraddhn.u16 d31, q7, q11
|
||||
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
|
||||
vmull.u8 q6, d24, d8
|
||||
PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
|
||||
vmull.u8 q7, d24, d9
|
||||
PF subge PF_X, PF_X, ORIG_W
|
||||
PF subge, PF_X, PF_X, ORIG_W
|
||||
vmull.u8 q8, d24, d10
|
||||
PF subges PF_CTL, PF_CTL, #0x10
|
||||
PF subges, PF_CTL, PF_CTL, #0x10
|
||||
vmull.u8 q9, d24, d11
|
||||
PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
|
||||
PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
|
||||
vqadd.u8 q14, q0, q14
|
||||
PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
|
||||
PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
|
||||
vqadd.u8 q15, q1, q15
|
||||
vrshr.u16 q10, q6, #8
|
||||
vrshr.u16 q11, q7, #8
|
||||
|
@ -2425,21 +2431,21 @@ generate_composite_function \
|
|||
vrshr.u16 q13, q10, #8
|
||||
fetch_src_pixblock
|
||||
vraddhn.u16 d30, q11, q8
|
||||
PF add PF_X, PF_X, #8
|
||||
PF tst PF_CTL, #0xF
|
||||
PF addne PF_X, PF_X, #8
|
||||
PF subne PF_CTL, PF_CTL, #1
|
||||
PF add, PF_X, PF_X, #8
|
||||
PF tst, PF_CTL, #0xF
|
||||
PF addne, PF_X, PF_X, #8
|
||||
PF subne, PF_CTL, PF_CTL, #1
|
||||
vraddhn.u16 d29, q12, q9
|
||||
vraddhn.u16 d28, q13, q10
|
||||
vmull.u8 q8, d3, d0
|
||||
vmull.u8 q9, d3, d1
|
||||
vmull.u8 q10, d3, d2
|
||||
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
|
||||
PF cmp PF_X, ORIG_W
|
||||
PF cmp, PF_X, ORIG_W
|
||||
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
|
||||
PF subge PF_X, PF_X, ORIG_W
|
||||
PF subges PF_CTL, PF_CTL, #0x10
|
||||
PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
|
||||
PF subge, PF_X, PF_X, ORIG_W
|
||||
PF subges, PF_CTL, PF_CTL, #0x10
|
||||
PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
|
||||
.endm
|
||||
|
||||
generate_composite_function \
|
||||
|
@ -2482,21 +2488,21 @@ generate_composite_function \
|
|||
vrshr.u16 q13, q10, #8
|
||||
fetch_src_pixblock
|
||||
vraddhn.u16 d28, q11, q8
|
||||
PF add PF_X, PF_X, #8
|
||||
PF tst PF_CTL, #0xF
|
||||
PF addne PF_X, PF_X, #8
|
||||
PF subne PF_CTL, PF_CTL, #1
|
||||
PF add, PF_X, PF_X, #8
|
||||
PF tst, PF_CTL, #0xF
|
||||
PF addne, PF_X, PF_X, #8
|
||||
PF subne, PF_CTL, PF_CTL, #1
|
||||
vraddhn.u16 d29, q12, q9
|
||||
vraddhn.u16 d30, q13, q10
|
||||
vmull.u8 q8, d3, d0
|
||||
vmull.u8 q9, d3, d1
|
||||
vmull.u8 q10, d3, d2
|
||||
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
|
||||
PF cmp PF_X, ORIG_W
|
||||
PF cmp, PF_X, ORIG_W
|
||||
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
|
||||
PF subge PF_X, PF_X, ORIG_W
|
||||
PF subges PF_CTL, PF_CTL, #0x10
|
||||
PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
|
||||
PF subge, PF_X, PF_X, ORIG_W
|
||||
PF subges, PF_CTL, PF_CTL, #0x10
|
||||
PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
|
||||
.endm
|
||||
|
||||
generate_composite_function \
|
||||
|
@ -2841,28 +2847,28 @@ generate_composite_function_nearest_scanline \
|
|||
mov TMP1, X, asr #16
|
||||
add X, X, UX
|
||||
add TMP1, TOP, TMP1, asl #2
|
||||
vld1.32 {reg1}, [TMP1], STRIDE
|
||||
vld1.32 {reg2}, [TMP1]
|
||||
vld1.32 {\reg1}, [TMP1], STRIDE
|
||||
vld1.32 {\reg2}, [TMP1]
|
||||
.endm
|
||||
|
||||
.macro bilinear_load_0565 reg1, reg2, tmp
|
||||
mov TMP1, X, asr #16
|
||||
add X, X, UX
|
||||
add TMP1, TOP, TMP1, asl #1
|
||||
vld1.32 {reg2[0]}, [TMP1], STRIDE
|
||||
vld1.32 {reg2[1]}, [TMP1]
|
||||
convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
|
||||
vld1.32 {\reg2[0]}, [TMP1], STRIDE
|
||||
vld1.32 {\reg2[1]}, [TMP1]
|
||||
convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
|
||||
.endm
|
||||
|
||||
.macro bilinear_load_and_vertical_interpolate_two_8888 \
|
||||
acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
|
||||
|
||||
bilinear_load_8888 reg1, reg2, tmp1
|
||||
vmull.u8 acc1, reg1, d28
|
||||
vmlal.u8 acc1, reg2, d29
|
||||
bilinear_load_8888 reg3, reg4, tmp2
|
||||
vmull.u8 acc2, reg3, d28
|
||||
vmlal.u8 acc2, reg4, d29
|
||||
bilinear_load_8888 \reg1, \reg2, \tmp1
|
||||
vmull.u8 \acc1, \reg1, d28
|
||||
vmlal.u8 \acc1, \reg2, d29
|
||||
bilinear_load_8888 \reg3, \reg4, \tmp2
|
||||
vmull.u8 \acc2, \reg3, d28
|
||||
vmlal.u8 \acc2, \reg4, d29
|
||||
.endm
|
||||
|
||||
.macro bilinear_load_and_vertical_interpolate_four_8888 \
|
||||
|
@ -2870,9 +2876,9 @@ generate_composite_function_nearest_scanline \
|
|||
yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
|
||||
|
||||
bilinear_load_and_vertical_interpolate_two_8888 \
|
||||
xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
|
||||
\xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
|
||||
bilinear_load_and_vertical_interpolate_two_8888 \
|
||||
yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
|
||||
\yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
|
||||
.endm
|
||||
|
||||
.macro bilinear_load_and_vertical_interpolate_two_0565 \
|
||||
|
@ -2884,19 +2890,19 @@ generate_composite_function_nearest_scanline \
|
|||
mov TMP2, X, asr #16
|
||||
add X, X, UX
|
||||
add TMP2, TOP, TMP2, asl #1
|
||||
vld1.32 {acc2lo[0]}, [TMP1], STRIDE
|
||||
vld1.32 {acc2hi[0]}, [TMP2], STRIDE
|
||||
vld1.32 {acc2lo[1]}, [TMP1]
|
||||
vld1.32 {acc2hi[1]}, [TMP2]
|
||||
convert_0565_to_x888 acc2, reg3, reg2, reg1
|
||||
vzip.u8 reg1, reg3
|
||||
vzip.u8 reg2, reg4
|
||||
vzip.u8 reg3, reg4
|
||||
vzip.u8 reg1, reg2
|
||||
vmull.u8 acc1, reg1, d28
|
||||
vmlal.u8 acc1, reg2, d29
|
||||
vmull.u8 acc2, reg3, d28
|
||||
vmlal.u8 acc2, reg4, d29
|
||||
vld1.32 {\acc2lo[0]}, [TMP1], STRIDE
|
||||
vld1.32 {\acc2hi[0]}, [TMP2], STRIDE
|
||||
vld1.32 {\acc2lo[1]}, [TMP1]
|
||||
vld1.32 {\acc2hi[1]}, [TMP2]
|
||||
convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
|
||||
vzip.u8 \reg1, \reg3
|
||||
vzip.u8 \reg2, \reg4
|
||||
vzip.u8 \reg3, \reg4
|
||||
vzip.u8 \reg1, \reg2
|
||||
vmull.u8 \acc1, \reg1, d28
|
||||
vmlal.u8 \acc1, \reg2, d29
|
||||
vmull.u8 \acc2, \reg3, d28
|
||||
vmlal.u8 \acc2, \reg4, d29
|
||||
.endm
|
||||
|
||||
.macro bilinear_load_and_vertical_interpolate_four_0565 \
|
||||
|
@ -2909,49 +2915,49 @@ generate_composite_function_nearest_scanline \
|
|||
mov TMP2, X, asr #16
|
||||
add X, X, UX
|
||||
add TMP2, TOP, TMP2, asl #1
|
||||
vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
|
||||
vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
|
||||
vld1.32 {xacc2lo[1]}, [TMP1]
|
||||
vld1.32 {xacc2hi[1]}, [TMP2]
|
||||
convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
|
||||
vld1.32 {\xacc2lo[0]}, [TMP1], STRIDE
|
||||
vld1.32 {\xacc2hi[0]}, [TMP2], STRIDE
|
||||
vld1.32 {\xacc2lo[1]}, [TMP1]
|
||||
vld1.32 {\xacc2hi[1]}, [TMP2]
|
||||
convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
|
||||
mov TMP1, X, asr #16
|
||||
add X, X, UX
|
||||
add TMP1, TOP, TMP1, asl #1
|
||||
mov TMP2, X, asr #16
|
||||
add X, X, UX
|
||||
add TMP2, TOP, TMP2, asl #1
|
||||
vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
|
||||
vzip.u8 xreg1, xreg3
|
||||
vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
|
||||
vzip.u8 xreg2, xreg4
|
||||
vld1.32 {yacc2lo[1]}, [TMP1]
|
||||
vzip.u8 xreg3, xreg4
|
||||
vld1.32 {yacc2hi[1]}, [TMP2]
|
||||
vzip.u8 xreg1, xreg2
|
||||
convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
|
||||
vmull.u8 xacc1, xreg1, d28
|
||||
vzip.u8 yreg1, yreg3
|
||||
vmlal.u8 xacc1, xreg2, d29
|
||||
vzip.u8 yreg2, yreg4
|
||||
vmull.u8 xacc2, xreg3, d28
|
||||
vzip.u8 yreg3, yreg4
|
||||
vmlal.u8 xacc2, xreg4, d29
|
||||
vzip.u8 yreg1, yreg2
|
||||
vmull.u8 yacc1, yreg1, d28
|
||||
vmlal.u8 yacc1, yreg2, d29
|
||||
vmull.u8 yacc2, yreg3, d28
|
||||
vmlal.u8 yacc2, yreg4, d29
|
||||
vld1.32 {\yacc2lo[0]}, [TMP1], STRIDE
|
||||
vzip.u8 \xreg1, \xreg3
|
||||
vld1.32 {\yacc2hi[0]}, [TMP2], STRIDE
|
||||
vzip.u8 \xreg2, \xreg4
|
||||
vld1.32 {\yacc2lo[1]}, [TMP1]
|
||||
vzip.u8 \xreg3, \xreg4
|
||||
vld1.32 {\yacc2hi[1]}, [TMP2]
|
||||
vzip.u8 \xreg1, \xreg2
|
||||
convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
|
||||
vmull.u8 \xacc1, \xreg1, d28
|
||||
vzip.u8 \yreg1, \yreg3
|
||||
vmlal.u8 \xacc1, \xreg2, d29
|
||||
vzip.u8 \yreg2, \yreg4
|
||||
vmull.u8 \xacc2, \xreg3, d28
|
||||
vzip.u8 \yreg3, \yreg4
|
||||
vmlal.u8 \xacc2, \xreg4, d29
|
||||
vzip.u8 \yreg1, \yreg2
|
||||
vmull.u8 \yacc1, \yreg1, d28
|
||||
vmlal.u8 \yacc1, \yreg2, d29
|
||||
vmull.u8 \yacc2, \yreg3, d28
|
||||
vmlal.u8 \yacc2, \yreg4, d29
|
||||
.endm
|
||||
|
||||
.macro bilinear_store_8888 numpix, tmp1, tmp2
|
||||
.if numpix == 4
|
||||
.if \numpix == 4
|
||||
vst1.32 {d0, d1}, [OUT, :128]!
|
||||
.elseif numpix == 2
|
||||
.elseif \numpix == 2
|
||||
vst1.32 {d0}, [OUT, :64]!
|
||||
.elseif numpix == 1
|
||||
.elseif \numpix == 1
|
||||
vst1.32 {d0[0]}, [OUT, :32]!
|
||||
.else
|
||||
.error bilinear_store_8888 numpix is unsupported
|
||||
.error bilinear_store_8888 \numpix is unsupported
|
||||
.endif
|
||||
.endm
|
||||
|
||||
|
@ -2960,20 +2966,20 @@ generate_composite_function_nearest_scanline \
|
|||
vuzp.u8 d2, d3
|
||||
vuzp.u8 d1, d3
|
||||
vuzp.u8 d0, d2
|
||||
convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
|
||||
.if numpix == 4
|
||||
convert_8888_to_0565 d2, d1, d0, q1, \tmp1, \tmp2
|
||||
.if \numpix == 4
|
||||
vst1.16 {d2}, [OUT, :64]!
|
||||
.elseif numpix == 2
|
||||
.elseif \numpix == 2
|
||||
vst1.32 {d2[0]}, [OUT, :32]!
|
||||
.elseif numpix == 1
|
||||
.elseif \numpix == 1
|
||||
vst1.16 {d2[0]}, [OUT, :16]!
|
||||
.else
|
||||
.error bilinear_store_0565 numpix is unsupported
|
||||
.error bilinear_store_0565 \numpix is unsupported
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
|
||||
bilinear_load_&src_fmt d0, d1, d2
|
||||
bilinear_load_\()\src_fmt d0, d1, d2
|
||||
vmull.u8 q1, d0, d28
|
||||
vmlal.u8 q1, d1, d29
|
||||
/* 5 cycles bubble */
|
||||
|
@ -2985,11 +2991,11 @@ generate_composite_function_nearest_scanline \
|
|||
/* 3 cycles bubble */
|
||||
vmovn.u16 d0, q0
|
||||
/* 1 cycle bubble */
|
||||
bilinear_store_&dst_fmt 1, q2, q3
|
||||
bilinear_store_\()\dst_fmt 1, q2, q3
|
||||
.endm
|
||||
|
||||
.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
|
||||
bilinear_load_and_vertical_interpolate_two_&src_fmt \
|
||||
bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
|
||||
q1, q11, d0, d1, d20, d21, d22, d23
|
||||
vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
|
||||
vmlsl.u16 q0, d2, d30
|
||||
|
@ -3002,11 +3008,11 @@ generate_composite_function_nearest_scanline \
|
|||
vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
|
||||
vadd.u16 q12, q12, q13
|
||||
vmovn.u16 d0, q0
|
||||
bilinear_store_&dst_fmt 2, q2, q3
|
||||
bilinear_store_\()\dst_fmt 2, q2, q3
|
||||
.endm
|
||||
|
||||
.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
|
||||
bilinear_load_and_vertical_interpolate_four_&src_fmt \
|
||||
bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
|
||||
q1, q11, d0, d1, d20, d21, d22, d23 \
|
||||
q3, q9, d4, d5, d16, d17, d18, d19
|
||||
pld [TMP1, PF_OFFS]
|
||||
|
@ -3034,54 +3040,54 @@ generate_composite_function_nearest_scanline \
|
|||
vmovn.u16 d0, q0
|
||||
vmovn.u16 d1, q2
|
||||
vadd.u16 q12, q12, q13
|
||||
bilinear_store_&dst_fmt 4, q2, q3
|
||||
bilinear_store_\()\dst_fmt 4, q2, q3
|
||||
.endm
|
||||
|
||||
.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
|
||||
.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
|
||||
bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
|
||||
.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
|
||||
bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
|
||||
.else
|
||||
bilinear_interpolate_four_pixels src_fmt, dst_fmt
|
||||
bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
|
||||
.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
|
||||
bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
|
||||
.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
|
||||
bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
|
||||
.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
|
||||
bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
|
||||
.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
|
||||
bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
|
||||
.else
|
||||
bilinear_interpolate_four_pixels src_fmt, dst_fmt
|
||||
bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
|
||||
.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
|
||||
bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
|
||||
.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
|
||||
bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
|
||||
.else
|
||||
bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
|
||||
bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
|
||||
bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
|
||||
bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
|
||||
.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
|
||||
bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
|
||||
.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
|
||||
bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
|
||||
.else
|
||||
bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
|
||||
bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
|
||||
.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
|
||||
bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
|
||||
.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
|
||||
bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
|
||||
.else
|
||||
bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
|
||||
bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
|
||||
bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
|
||||
bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
|
||||
.endif
|
||||
.endm
|
||||
|
||||
|
@ -3106,7 +3112,7 @@ generate_composite_function_nearest_scanline \
|
|||
src_bpp_shift, dst_bpp_shift, \
|
||||
prefetch_distance, flags
|
||||
|
||||
pixman_asm_function fname
|
||||
pixman_asm_function \fname
|
||||
OUT .req r0
|
||||
TOP .req r1
|
||||
BOTTOM .req r2
|
||||
|
@ -3124,11 +3130,11 @@ pixman_asm_function fname
|
|||
|
||||
mov ip, sp
|
||||
push {r4, r5, r6, r7, r8, r9}
|
||||
mov PF_OFFS, #prefetch_distance
|
||||
mov PF_OFFS, #\prefetch_distance
|
||||
ldmia ip, {WB, X, UX, WIDTH}
|
||||
mul PF_OFFS, PF_OFFS, UX
|
||||
|
||||
.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
|
||||
.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
|
||||
vpush {d8-d15}
|
||||
.endif
|
||||
|
||||
|
@ -3151,7 +3157,7 @@ pixman_asm_function fname
|
|||
beq 0f
|
||||
vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
|
||||
vadd.u16 q12, q12, q13
|
||||
bilinear_interpolate_last_pixel src_fmt, dst_fmt
|
||||
bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
|
||||
sub WIDTH, WIDTH, #1
|
||||
0:
|
||||
vadd.u16 q13, q13, q13
|
||||
|
@ -3162,62 +3168,62 @@ pixman_asm_function fname
|
|||
blt 0f
|
||||
tst OUT, #(1 << (dst_bpp_shift + 1))
|
||||
beq 0f
|
||||
bilinear_interpolate_two_pixels src_fmt, dst_fmt
|
||||
bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
|
||||
sub WIDTH, WIDTH, #2
|
||||
0:
|
||||
.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
|
||||
.if ((\flags) & BILINEAR_FLAG_UNROLL_8) != 0
|
||||
/*********** 8 pixels per iteration *****************/
|
||||
cmp WIDTH, #4
|
||||
blt 0f
|
||||
tst OUT, #(1 << (dst_bpp_shift + 2))
|
||||
beq 0f
|
||||
bilinear_interpolate_four_pixels src_fmt, dst_fmt
|
||||
bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
|
||||
sub WIDTH, WIDTH, #4
|
||||
0:
|
||||
subs WIDTH, WIDTH, #8
|
||||
blt 1f
|
||||
mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
|
||||
bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
|
||||
bilinear_interpolate_eight_pixels_head \src_fmt, \dst_fmt
|
||||
subs WIDTH, WIDTH, #8
|
||||
blt 5f
|
||||
0:
|
||||
bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
|
||||
bilinear_interpolate_eight_pixels_tail_head \src_fmt, \dst_fmt
|
||||
subs WIDTH, WIDTH, #8
|
||||
bge 0b
|
||||
5:
|
||||
bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
|
||||
bilinear_interpolate_eight_pixels_tail \src_fmt, \dst_fmt
|
||||
1:
|
||||
tst WIDTH, #4
|
||||
beq 2f
|
||||
bilinear_interpolate_four_pixels src_fmt, dst_fmt
|
||||
bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
|
||||
2:
|
||||
.else
|
||||
/*********** 4 pixels per iteration *****************/
|
||||
subs WIDTH, WIDTH, #4
|
||||
blt 1f
|
||||
mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
|
||||
bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
|
||||
bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
|
||||
subs WIDTH, WIDTH, #4
|
||||
blt 5f
|
||||
0:
|
||||
bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
|
||||
bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
|
||||
subs WIDTH, WIDTH, #4
|
||||
bge 0b
|
||||
5:
|
||||
bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
|
||||
bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
|
||||
1:
|
||||
/****************************************************/
|
||||
.endif
|
||||
/* handle the remaining trailing pixels */
|
||||
tst WIDTH, #2
|
||||
beq 2f
|
||||
bilinear_interpolate_two_pixels src_fmt, dst_fmt
|
||||
bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
|
||||
2:
|
||||
tst WIDTH, #1
|
||||
beq 3f
|
||||
bilinear_interpolate_last_pixel src_fmt, dst_fmt
|
||||
bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
|
||||
3:
|
||||
.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
|
||||
.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
|
||||
vpop {d8-d15}
|
||||
.endif
|
||||
pop {r4, r5, r6, r7, r8, r9}
|
||||
|
@ -3236,7 +3242,9 @@ pixman_asm_function fname
|
|||
.unreq TMP3
|
||||
.unreq TMP4
|
||||
.unreq STRIDE
|
||||
#ifndef __clang__
|
||||
.endfunc
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -25,6 +25,10 @@
|
|||
*
|
||||
*/
|
||||
|
||||
#ifdef __clang__
|
||||
#define subpls subspl
|
||||
#endif
|
||||
|
||||
/* Prevent the stack from becoming executable */
|
||||
#if defined(__linux__) && defined(__ELF__)
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
|
@ -62,7 +66,7 @@
|
|||
prefetch_distance, \
|
||||
prefetch_braking_distance
|
||||
|
||||
pixman_asm_function fname
|
||||
pixman_asm_function \fname
|
||||
W .req r0
|
||||
DST .req r1
|
||||
SRC .req r2
|
||||
|
@ -76,38 +80,38 @@ pixman_asm_function fname
|
|||
|
||||
ldr UNIT_X, [sp]
|
||||
push {r4, r5, r6, r7, r8, r10}
|
||||
mvn VXMASK, #((1 << bpp_shift) - 1)
|
||||
mvn VXMASK, #((1 << \bpp_shift) - 1)
|
||||
ldr SRC_WIDTH_FIXED, [sp, #28]
|
||||
|
||||
/* define helper macro */
|
||||
.macro scale_2_pixels
|
||||
ldr&t TMP1, [SRC, TMP1]
|
||||
and TMP2, VXMASK, VX, asr #(16 - bpp_shift)
|
||||
ldr\()\t TMP1, [SRC, TMP1]
|
||||
and TMP2, VXMASK, VX, asr #(16 - \bpp_shift)
|
||||
adds VX, VX, UNIT_X
|
||||
str&t TMP1, [DST], #(1 << bpp_shift)
|
||||
str\()\t TMP1, [DST], #(1 << \bpp_shift)
|
||||
9: subpls VX, VX, SRC_WIDTH_FIXED
|
||||
bpl 9b
|
||||
|
||||
ldr&t TMP2, [SRC, TMP2]
|
||||
and TMP1, VXMASK, VX, asr #(16 - bpp_shift)
|
||||
ldr\()\t TMP2, [SRC, TMP2]
|
||||
and TMP1, VXMASK, VX, asr #(16 - \bpp_shift)
|
||||
adds VX, VX, UNIT_X
|
||||
str&t TMP2, [DST], #(1 << bpp_shift)
|
||||
str\()\t TMP2, [DST], #(1 << \bpp_shift)
|
||||
9: subpls VX, VX, SRC_WIDTH_FIXED
|
||||
bpl 9b
|
||||
.endm
|
||||
|
||||
/* now do the scaling */
|
||||
and TMP1, VXMASK, VX, asr #(16 - bpp_shift)
|
||||
and TMP1, VXMASK, VX, asr #(16 - \bpp_shift)
|
||||
adds VX, VX, UNIT_X
|
||||
9: subpls VX, VX, SRC_WIDTH_FIXED
|
||||
bpl 9b
|
||||
subs W, W, #(8 + prefetch_braking_distance)
|
||||
subs W, W, #(8 + \prefetch_braking_distance)
|
||||
blt 2f
|
||||
/* calculate prefetch offset */
|
||||
mov PF_OFFS, #prefetch_distance
|
||||
mov PF_OFFS, #\prefetch_distance
|
||||
mla PF_OFFS, UNIT_X, PF_OFFS, VX
|
||||
1: /* main loop, process 8 pixels per iteration with prefetch */
|
||||
pld [SRC, PF_OFFS, asr #(16 - bpp_shift)]
|
||||
pld [SRC, PF_OFFS, asr #(16 - \bpp_shift)]
|
||||
add PF_OFFS, UNIT_X, lsl #3
|
||||
scale_2_pixels
|
||||
scale_2_pixels
|
||||
|
@ -116,7 +120,7 @@ pixman_asm_function fname
|
|||
subs W, W, #8
|
||||
bge 1b
|
||||
2:
|
||||
subs W, W, #(4 - 8 - prefetch_braking_distance)
|
||||
subs W, W, #(4 - 8 - \prefetch_braking_distance)
|
||||
blt 2f
|
||||
1: /* process the remaining pixels */
|
||||
scale_2_pixels
|
||||
|
@ -129,8 +133,13 @@ pixman_asm_function fname
|
|||
scale_2_pixels
|
||||
2:
|
||||
tst W, #1
|
||||
ldrne&t TMP1, [SRC, TMP1]
|
||||
strne&t TMP1, [DST]
|
||||
#ifdef __clang__
|
||||
ldr\()\t\()ne TMP1, [SRC, TMP1]
|
||||
str\()\t\()ne TMP1, [DST]
|
||||
#else
|
||||
ldrne\()\t TMP1, [SRC, TMP1]
|
||||
strne\()\t TMP1, [DST]
|
||||
#endif
|
||||
/* cleanup helper macro */
|
||||
.purgem scale_2_pixels
|
||||
.unreq DST
|
||||
|
@ -146,7 +155,9 @@ pixman_asm_function fname
|
|||
/* return */
|
||||
pop {r4, r5, r6, r7, r8, r10}
|
||||
bx lr
|
||||
#ifndef __clang__
|
||||
.endfunc
|
||||
#endif
|
||||
.endm
|
||||
|
||||
generate_nearest_scanline_func \
|
||||
|
|
|
@ -25,6 +25,11 @@
|
|||
*
|
||||
*/
|
||||
|
||||
#ifdef __clang__
|
||||
#define adceqs adcseq
|
||||
#define ldmnedb ldmdbne
|
||||
#endif
|
||||
|
||||
/* Prevent the stack from becoming executable */
|
||||
#if defined(__linux__) && defined(__ELF__)
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
|
@ -57,7 +62,7 @@
|
|||
.endm
|
||||
|
||||
.macro blit_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
|
||||
pixld cond, numbytes, firstreg, SRC, unaligned_src
|
||||
pixld \cond, \numbytes, \firstreg, SRC, \unaligned_src
|
||||
.endm
|
||||
|
||||
.macro blit_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
|
||||
|
@ -65,8 +70,8 @@
|
|||
WK5 .req STRIDE_S
|
||||
WK6 .req MASK
|
||||
WK7 .req STRIDE_M
|
||||
110: pixld , 16, 0, SRC, unaligned_src
|
||||
pixld , 16, 4, SRC, unaligned_src
|
||||
110: pixld , 16, 0, SRC, \unaligned_src
|
||||
pixld , 16, 4, SRC, \unaligned_src
|
||||
pld [SRC, SCRATCH]
|
||||
pixst , 16, 0, DST
|
||||
pixst , 16, 4, DST
|
||||
|
@ -142,7 +147,7 @@ generate_composite_function \
|
|||
WK5 .req STRIDE_S
|
||||
WK6 .req MASK
|
||||
WK7 .req STRIDE_M
|
||||
pixst cond, numbytes, 4, DST
|
||||
pixst \cond, \numbytes, 4, DST
|
||||
.unreq WK4
|
||||
.unreq WK5
|
||||
.unreq WK6
|
||||
|
@ -182,20 +187,20 @@ generate_composite_function \
|
|||
/******************************************************************************/
|
||||
|
||||
.macro src_x888_8888_pixel, cond, reg
|
||||
orr&cond WK&reg, WK&reg, #0xFF000000
|
||||
orr\()\cond WK\()\reg, WK\()\reg, #0xFF000000
|
||||
.endm
|
||||
|
||||
.macro pixman_composite_src_x888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
|
||||
pixld cond, numbytes, firstreg, SRC, unaligned_src
|
||||
pixld \cond, \numbytes, \firstreg, SRC, \unaligned_src
|
||||
.endm
|
||||
|
||||
.macro pixman_composite_src_x888_8888_process_tail cond, numbytes, firstreg
|
||||
src_x888_8888_pixel cond, %(firstreg+0)
|
||||
.if numbytes >= 8
|
||||
src_x888_8888_pixel cond, %(firstreg+1)
|
||||
.if numbytes == 16
|
||||
src_x888_8888_pixel cond, %(firstreg+2)
|
||||
src_x888_8888_pixel cond, %(firstreg+3)
|
||||
src_x888_8888_pixel \cond, %(\firstreg+0)
|
||||
.if \numbytes >= 8
|
||||
src_x888_8888_pixel \cond, %(\firstreg+1)
|
||||
.if \numbytes == 16
|
||||
src_x888_8888_pixel \cond, %(\firstreg+2)
|
||||
src_x888_8888_pixel \cond, %(\firstreg+3)
|
||||
.endif
|
||||
.endif
|
||||
.endm
|
||||
|
@ -222,73 +227,73 @@ generate_composite_function \
|
|||
.endm
|
||||
|
||||
.macro src_0565_8888_2pixels, reg1, reg2
|
||||
and SCRATCH, WK&reg1, MASK @ 00000GGGGGG0000000000gggggg00000
|
||||
bic WK&reg2, WK&reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
|
||||
orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
|
||||
mov WK&reg1, WK&reg2, lsl #16 @ rrrrr000000bbbbb0000000000000000
|
||||
mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG
|
||||
bic WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
|
||||
orr WK&reg1, WK&reg1, WK&reg1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000
|
||||
orr WK&reg2, WK&reg2, WK&reg2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000
|
||||
pkhtb WK&reg1, WK&reg1, WK&reg1, asr #5 @ rrrrrrrr--------bbbbbbbb--------
|
||||
sel WK&reg1, WK&reg1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb--------
|
||||
mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg
|
||||
pkhtb WK&reg2, WK&reg2, WK&reg2, asr #5 @ RRRRRRRR--------BBBBBBBB--------
|
||||
sel WK&reg2, WK&reg2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB--------
|
||||
orr WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
|
||||
orr WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
|
||||
and SCRATCH, WK\()\reg1, MASK @ 00000GGGGGG0000000000gggggg00000
|
||||
bic WK\()\reg2, WK\()\reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
|
||||
orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
|
||||
mov WK\()\reg1, WK\()\reg2, lsl #16 @ rrrrr000000bbbbb0000000000000000
|
||||
mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG
|
||||
bic WK\()\reg2, WK\()\reg2, WK\()\reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
|
||||
orr WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000
|
||||
orr WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000
|
||||
pkhtb WK\()\reg1, WK\()\reg1, WK\()\reg1, asr #5 @ rrrrrrrr--------bbbbbbbb--------
|
||||
sel WK\()\reg1, WK\()\reg1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb--------
|
||||
mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg
|
||||
pkhtb WK\()\reg2, WK\()\reg2, WK\()\reg2, asr #5 @ RRRRRRRR--------BBBBBBBB--------
|
||||
sel WK\()\reg2, WK\()\reg2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB--------
|
||||
orr WK\()\reg1, STRIDE_M, WK\()\reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
|
||||
orr WK\()\reg2, STRIDE_M, WK\()\reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
|
||||
.endm
|
||||
|
||||
/* This version doesn't need STRIDE_M, but is one instruction longer.
|
||||
It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
|
||||
and SCRATCH, WK&reg1, MASK @ 00000GGGGGG0000000000gggggg00000
|
||||
bic WK&reg1, WK&reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
|
||||
orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
|
||||
mov WK&reg2, WK&reg1, lsr #16 @ 0000000000000000RRRRR000000BBBBB
|
||||
mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000
|
||||
bic WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
|
||||
mov WK&reg2, WK&reg2, lsl #3 @ 0000000000000RRRRR000000BBBBB000
|
||||
mov WK&reg1, WK&reg1, lsl #3 @ 0000000000000rrrrr000000bbbbb000
|
||||
orr WK&reg2, WK&reg2, WK&reg2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB
|
||||
orr WK&reg1, WK&reg1, WK&reg1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
|
||||
pkhbt WK&reg2, WK&reg2, WK&reg2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB
|
||||
pkhbt WK&reg1, WK&reg1, WK&reg1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
|
||||
sel WK&reg2, SCRATCH, WK&reg2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB
|
||||
sel WK&reg1, SCRATCH, WK&reg1 @ --------rrrrrrrrggggggggbbbbbbbb
|
||||
orr WK&reg2, WK&reg2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
|
||||
orr WK&reg1, WK&reg1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
|
||||
and SCRATCH, WK\()\reg1, MASK @ 00000GGGGGG0000000000gggggg00000
|
||||
bic WK\()\reg1, WK\()\reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
|
||||
orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
|
||||
mov WK\()\reg2, WK\()\reg1, lsr #16 @ 0000000000000000RRRRR000000BBBBB
|
||||
mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000
|
||||
bic WK\()\reg1, WK\()\reg1, WK\()\reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
|
||||
mov WK\()\reg2, WK\()\reg2, lsl #3 @ 0000000000000RRRRR000000BBBBB000
|
||||
mov WK\()\reg1, WK\()\reg1, lsl #3 @ 0000000000000rrrrr000000bbbbb000
|
||||
orr WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB
|
||||
orr WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
|
||||
pkhbt WK\()\reg2, WK\()\reg2, WK\()\reg2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB
|
||||
pkhbt WK\()\reg1, WK\()\reg1, WK\()\reg1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
|
||||
sel WK\()\reg2, SCRATCH, WK\()\reg2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB
|
||||
sel WK\()\reg1, SCRATCH, WK\()\reg1 @ --------rrrrrrrrggggggggbbbbbbbb
|
||||
orr WK\()\reg2, WK\()\reg2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
|
||||
orr WK\()\reg1, WK\()\reg1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
|
||||
*/
|
||||
|
||||
.macro src_0565_8888_1pixel, reg
|
||||
bic SCRATCH, WK&reg, MASK @ 0000000000000000rrrrr000000bbbbb
|
||||
and WK&reg, WK&reg, MASK @ 000000000000000000000gggggg00000
|
||||
mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000
|
||||
mov WK&reg, WK&reg, lsl #5 @ 0000000000000000gggggg0000000000
|
||||
orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
|
||||
orr WK&reg, WK&reg, WK&reg, lsr #6 @ 000000000000000gggggggggggg00000
|
||||
pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
|
||||
sel WK&reg, WK&reg, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb
|
||||
orr WK&reg, WK&reg, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
|
||||
bic SCRATCH, WK\()\reg, MASK @ 0000000000000000rrrrr000000bbbbb
|
||||
and WK\()\reg, WK\()\reg, MASK @ 000000000000000000000gggggg00000
|
||||
mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000
|
||||
mov WK\()\reg, WK\()\reg, lsl #5 @ 0000000000000000gggggg0000000000
|
||||
orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
|
||||
orr WK\()\reg, WK\()\reg, WK\()\reg, lsr #6 @ 000000000000000gggggggggggg00000
|
||||
pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
|
||||
sel WK\()\reg, WK\()\reg, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb
|
||||
orr WK\()\reg, WK\()\reg, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
|
||||
.endm
|
||||
|
||||
.macro src_0565_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
|
||||
.if numbytes == 16
|
||||
pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
|
||||
.elseif numbytes == 8
|
||||
pixld , 4, firstreg, SRC, unaligned_src
|
||||
.elseif numbytes == 4
|
||||
pixld , 2, firstreg, SRC, unaligned_src
|
||||
.if \numbytes == 16
|
||||
pixldst ld,, 8, \firstreg, %(\firstreg+2),,, SRC, \unaligned_src
|
||||
.elseif \numbytes == 8
|
||||
pixld , 4, \firstreg, SRC, \unaligned_src
|
||||
.elseif \numbytes == 4
|
||||
pixld , 2, \firstreg, SRC, \unaligned_src
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro src_0565_8888_process_tail cond, numbytes, firstreg
|
||||
.if numbytes == 16
|
||||
src_0565_8888_2pixels firstreg, %(firstreg+1)
|
||||
src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
|
||||
.elseif numbytes == 8
|
||||
src_0565_8888_2pixels firstreg, %(firstreg+1)
|
||||
.if \numbytes == 16
|
||||
src_0565_8888_2pixels \firstreg, %(\firstreg+1)
|
||||
src_0565_8888_2pixels %(\firstreg+2), %(\firstreg+3)
|
||||
.elseif \numbytes == 8
|
||||
src_0565_8888_2pixels \firstreg, %(\firstreg+1)
|
||||
.else
|
||||
src_0565_8888_1pixel firstreg
|
||||
src_0565_8888_1pixel \firstreg
|
||||
.endif
|
||||
.endm
|
||||
|
||||
|
@ -311,23 +316,23 @@ generate_composite_function \
|
|||
.endm
|
||||
|
||||
.macro src_x888_0565_1pixel s, d
|
||||
and WK&d, MASK, WK&s, lsr #3 @ 00000000000rrrrr00000000000bbbbb
|
||||
and STRIDE_S, WK&s, #0xFC00 @ 0000000000000000gggggg0000000000
|
||||
orr WK&d, WK&d, WK&d, lsr #5 @ 00000000000-----rrrrr000000bbbbb
|
||||
orr WK&d, WK&d, STRIDE_S, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
|
||||
and WK\()\d, MASK, WK\()\s, lsr #3 @ 00000000000rrrrr00000000000bbbbb
|
||||
and STRIDE_S, WK\()\s, #0xFC00 @ 0000000000000000gggggg0000000000
|
||||
orr WK\()\d, WK\()\d, WK\()\d, lsr #5 @ 00000000000-----rrrrr000000bbbbb
|
||||
orr WK\()\d, WK\()\d, STRIDE_S, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
|
||||
/* Top 16 bits are discarded during the following STRH */
|
||||
.endm
|
||||
|
||||
.macro src_x888_0565_2pixels slo, shi, d, tmp
|
||||
and SCRATCH, WK&shi, #0xFC00 @ 0000000000000000GGGGGG0000000000
|
||||
and WK&tmp, MASK, WK&shi, lsr #3 @ 00000000000RRRRR00000000000BBBBB
|
||||
and WK&shi, MASK, WK&slo, lsr #3 @ 00000000000rrrrr00000000000bbbbb
|
||||
orr WK&tmp, WK&tmp, WK&tmp, lsr #5 @ 00000000000-----RRRRR000000BBBBB
|
||||
orr WK&tmp, WK&tmp, SCRATCH, lsr #5 @ 00000000000-----RRRRRGGGGGGBBBBB
|
||||
and SCRATCH, WK&slo, #0xFC00 @ 0000000000000000gggggg0000000000
|
||||
orr WK&shi, WK&shi, WK&shi, lsr #5 @ 00000000000-----rrrrr000000bbbbb
|
||||
orr WK&shi, WK&shi, SCRATCH, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
|
||||
pkhbt WK&d, WK&shi, WK&tmp, lsl #16 @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
|
||||
and SCRATCH, WK\()\shi, #0xFC00 @ 0000000000000000GGGGGG0000000000
|
||||
and WK\()\tmp, MASK, WK\()\shi, lsr #3 @ 00000000000RRRRR00000000000BBBBB
|
||||
and WK\()\shi, MASK, WK\()\slo, lsr #3 @ 00000000000rrrrr00000000000bbbbb
|
||||
orr WK\()\tmp, WK\()\tmp, WK\()\tmp, lsr #5 @ 00000000000-----RRRRR000000BBBBB
|
||||
orr WK\()\tmp, WK\()\tmp, SCRATCH, lsr #5 @ 00000000000-----RRRRRGGGGGGBBBBB
|
||||
and SCRATCH, WK\()\slo, #0xFC00 @ 0000000000000000gggggg0000000000
|
||||
orr WK\()\shi, WK\()\shi, WK\()\shi, lsr #5 @ 00000000000-----rrrrr000000bbbbb
|
||||
orr WK\()\shi, WK\()\shi, SCRATCH, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
|
||||
pkhbt WK\()\d, WK\()\shi, WK\()\tmp, lsl #16 @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
|
||||
.endm
|
||||
|
||||
.macro src_x888_0565_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
|
||||
|
@ -335,33 +340,33 @@ generate_composite_function \
|
|||
WK5 .req STRIDE_M
|
||||
WK6 .req WK3
|
||||
WK7 .req ORIG_W
|
||||
.if numbytes == 16
|
||||
.if \numbytes == 16
|
||||
pixld , 16, 4, SRC, 0
|
||||
src_x888_0565_2pixels 4, 5, 0, 0
|
||||
pixld , 8, 4, SRC, 0
|
||||
src_x888_0565_2pixels 6, 7, 1, 1
|
||||
pixld , 8, 6, SRC, 0
|
||||
.else
|
||||
pixld , numbytes*2, 4, SRC, 0
|
||||
pixld , \numbytes*2, 4, SRC, 0
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro src_x888_0565_process_tail cond, numbytes, firstreg
|
||||
.if numbytes == 16
|
||||
.if \numbytes == 16
|
||||
src_x888_0565_2pixels 4, 5, 2, 2
|
||||
src_x888_0565_2pixels 6, 7, 3, 4
|
||||
.elseif numbytes == 8
|
||||
.elseif \numbytes == 8
|
||||
src_x888_0565_2pixels 4, 5, 1, 1
|
||||
src_x888_0565_2pixels 6, 7, 2, 2
|
||||
.elseif numbytes == 4
|
||||
.elseif \numbytes == 4
|
||||
src_x888_0565_2pixels 4, 5, 1, 1
|
||||
.else
|
||||
src_x888_0565_1pixel 4, 1
|
||||
.endif
|
||||
.if numbytes == 16
|
||||
pixst , numbytes, 0, DST
|
||||
.if \numbytes == 16
|
||||
pixst , \numbytes, 0, DST
|
||||
.else
|
||||
pixst , numbytes, 1, DST
|
||||
pixst , \numbytes, 1, DST
|
||||
.endif
|
||||
.unreq WK4
|
||||
.unreq WK5
|
||||
|
@ -382,37 +387,37 @@ generate_composite_function \
|
|||
/******************************************************************************/
|
||||
|
||||
.macro add_8_8_8pixels cond, dst1, dst2
|
||||
uqadd8&cond WK&dst1, WK&dst1, MASK
|
||||
uqadd8&cond WK&dst2, WK&dst2, STRIDE_M
|
||||
uqadd8\()\cond WK\()\dst1, WK\()\dst1, MASK
|
||||
uqadd8\()\cond WK\()\dst2, WK\()\dst2, STRIDE_M
|
||||
.endm
|
||||
|
||||
.macro add_8_8_4pixels cond, dst
|
||||
uqadd8&cond WK&dst, WK&dst, MASK
|
||||
uqadd8\()\cond WK\()\dst, WK\()\dst, MASK
|
||||
.endm
|
||||
|
||||
.macro add_8_8_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
|
||||
WK4 .req MASK
|
||||
WK5 .req STRIDE_M
|
||||
.if numbytes == 16
|
||||
pixld cond, 8, 4, SRC, unaligned_src
|
||||
pixld cond, 16, firstreg, DST, 0
|
||||
add_8_8_8pixels cond, firstreg, %(firstreg+1)
|
||||
pixld cond, 8, 4, SRC, unaligned_src
|
||||
.if \numbytes == 16
|
||||
pixld \cond, 8, 4, SRC, \unaligned_src
|
||||
pixld \cond, 16, \firstreg, DST, 0
|
||||
add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
|
||||
pixld \cond, 8, 4, SRC, \unaligned_src
|
||||
.else
|
||||
pixld cond, numbytes, 4, SRC, unaligned_src
|
||||
pixld cond, numbytes, firstreg, DST, 0
|
||||
pixld \cond, \numbytes, 4, SRC, \unaligned_src
|
||||
pixld \cond, \numbytes, \firstreg, DST, 0
|
||||
.endif
|
||||
.unreq WK4
|
||||
.unreq WK5
|
||||
.endm
|
||||
|
||||
.macro add_8_8_process_tail cond, numbytes, firstreg
|
||||
.if numbytes == 16
|
||||
add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
|
||||
.elseif numbytes == 8
|
||||
add_8_8_8pixels cond, firstreg, %(firstreg+1)
|
||||
.if \numbytes == 16
|
||||
add_8_8_8pixels \cond, %(\firstreg+2), %(\firstreg+3)
|
||||
.elseif \numbytes == 8
|
||||
add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
|
||||
.else
|
||||
add_8_8_4pixels cond, firstreg
|
||||
add_8_8_4pixels \cond, \firstreg
|
||||
.endif
|
||||
.endm
|
||||
|
||||
|
@ -441,8 +446,8 @@ generate_composite_function \
|
|||
WK5 .req STRIDE_S
|
||||
WK6 .req STRIDE_M
|
||||
WK7 .req ORIG_W
|
||||
pixld , numbytes, %(4+firstreg), SRC, unaligned_src
|
||||
pixld , numbytes, firstreg, DST, 0
|
||||
pixld , \numbytes, %(4+\firstreg), SRC, \unaligned_src
|
||||
pixld , \numbytes, \firstreg, DST, 0
|
||||
.unreq WK4
|
||||
.unreq WK5
|
||||
.unreq WK6
|
||||
|
@ -451,44 +456,44 @@ generate_composite_function \
|
|||
|
||||
.macro over_8888_8888_check_transparent numbytes, reg0, reg1, reg2, reg3
|
||||
/* Since these colours are premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
|
||||
teq WK&reg0, #0
|
||||
.if numbytes > 4
|
||||
teqeq WK&reg1, #0
|
||||
.if numbytes > 8
|
||||
teqeq WK&reg2, #0
|
||||
teqeq WK&reg3, #0
|
||||
teq WK\()\reg0, #0
|
||||
.if \numbytes > 4
|
||||
teqeq WK\()\reg1, #0
|
||||
.if \numbytes > 8
|
||||
teqeq WK\()\reg2, #0
|
||||
teqeq WK\()\reg3, #0
|
||||
.endif
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro over_8888_8888_prepare next
|
||||
mov WK&next, WK&next, lsr #24
|
||||
mov WK\()\next, WK\()\next, lsr #24
|
||||
.endm
|
||||
|
||||
.macro over_8888_8888_1pixel src, dst, offset, next
|
||||
/* src = destination component multiplier */
|
||||
rsb WK&src, WK&src, #255
|
||||
rsb WK\()\src, WK\()\src, #255
|
||||
/* Split even/odd bytes of dst into SCRATCH/dst */
|
||||
uxtb16 SCRATCH, WK&dst
|
||||
uxtb16 WK&dst, WK&dst, ror #8
|
||||
uxtb16 SCRATCH, WK\()\dst
|
||||
uxtb16 WK\()\dst, WK\()\dst, ror #8
|
||||
/* Multiply through, adding 0.5 to the upper byte of result for rounding */
|
||||
mla SCRATCH, SCRATCH, WK&src, MASK
|
||||
mla WK&dst, WK&dst, WK&src, MASK
|
||||
mla SCRATCH, SCRATCH, WK\()\src, MASK
|
||||
mla WK\()\dst, WK\()\dst, WK\()\src, MASK
|
||||
/* Where we would have had a stall between the result of the first MLA and the shifter input,
|
||||
* reload the complete source pixel */
|
||||
ldr WK&src, [SRC, #offset]
|
||||
ldr WK\()\src, [SRC, #\offset]
|
||||
/* Multiply by 257/256 to approximate 256/255 */
|
||||
uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
|
||||
/* In this stall, start processing the next pixel */
|
||||
.if offset < -4
|
||||
mov WK&next, WK&next, lsr #24
|
||||
.if \offset < -4
|
||||
mov WK\()\next, WK\()\next, lsr #24
|
||||
.endif
|
||||
uxtab16 WK&dst, WK&dst, WK&dst, ror #8
|
||||
uxtab16 WK\()\dst, WK\()\dst, WK\()\dst, ror #8
|
||||
/* Recombine even/odd bytes of multiplied destination */
|
||||
mov SCRATCH, SCRATCH, ror #8
|
||||
sel WK&dst, SCRATCH, WK&dst
|
||||
sel WK\()\dst, SCRATCH, WK\()\dst
|
||||
/* Saturated add of source to multiplied destination */
|
||||
uqadd8 WK&dst, WK&dst, WK&src
|
||||
uqadd8 WK\()\dst, WK\()\dst, WK\()\src
|
||||
.endm
|
||||
|
||||
.macro over_8888_8888_process_tail cond, numbytes, firstreg
|
||||
|
@ -496,17 +501,17 @@ generate_composite_function \
|
|||
WK5 .req STRIDE_S
|
||||
WK6 .req STRIDE_M
|
||||
WK7 .req ORIG_W
|
||||
over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
|
||||
over_8888_8888_check_transparent \numbytes, %(4+\firstreg), %(5+\firstreg), %(6+\firstreg), %(7+\firstreg)
|
||||
beq 10f
|
||||
over_8888_8888_prepare %(4+firstreg)
|
||||
.set PROCESS_REG, firstreg
|
||||
.set PROCESS_OFF, -numbytes
|
||||
.rept numbytes / 4
|
||||
over_8888_8888_prepare %(4+\firstreg)
|
||||
.set PROCESS_REG, \firstreg
|
||||
.set PROCESS_OFF, -\numbytes
|
||||
.rept \numbytes / 4
|
||||
over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
|
||||
.set PROCESS_REG, PROCESS_REG+1
|
||||
.set PROCESS_OFF, PROCESS_OFF+4
|
||||
.endr
|
||||
pixst , numbytes, firstreg, DST
|
||||
pixst , \numbytes, \firstreg, DST
|
||||
10:
|
||||
.unreq WK4
|
||||
.unreq WK5
|
||||
|
@ -536,16 +541,16 @@ generate_composite_function \
|
|||
*/
|
||||
.macro mul_8888_8 word, byte, tmp, half
|
||||
/* Split even/odd bytes of word apart */
|
||||
uxtb16 tmp, word
|
||||
uxtb16 word, word, ror #8
|
||||
uxtb16 \tmp, \word
|
||||
uxtb16 \word, \word, ror #8
|
||||
/* Multiply bytes together with rounding, then by 257/256 */
|
||||
mla tmp, tmp, byte, half
|
||||
mla word, word, byte, half /* 1 stall follows */
|
||||
uxtab16 tmp, tmp, tmp, ror #8 /* 1 stall follows */
|
||||
uxtab16 word, word, word, ror #8
|
||||
mla \tmp, \tmp, \byte, \half
|
||||
mla \word, \word, \byte, \half /* 1 stall follows */
|
||||
uxtab16 \tmp, \tmp, \tmp, ror #8 /* 1 stall follows */
|
||||
uxtab16 \word, \word, \word, ror #8
|
||||
/* Recombine bytes */
|
||||
mov tmp, tmp, ror #8
|
||||
sel word, tmp, word
|
||||
mov \tmp, \tmp, ror #8
|
||||
sel \word, \tmp, \word
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
@ -567,8 +572,8 @@ generate_composite_function \
|
|||
WK5 .req STRIDE_D
|
||||
WK6 .req STRIDE_S
|
||||
WK7 .req ORIG_W
|
||||
pixld , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
|
||||
pixld , numbytes, firstreg, DST, 0
|
||||
pixld , \numbytes, %(4+(\firstreg%2)), SRC, \unaligned_src
|
||||
pixld , \numbytes, \firstreg, DST, 0
|
||||
.unreq WK4
|
||||
.unreq WK5
|
||||
.unreq WK6
|
||||
|
@ -576,10 +581,10 @@ generate_composite_function \
|
|||
.endm
|
||||
|
||||
.macro over_8888_n_8888_1pixel src, dst
|
||||
mul_8888_8 WK&src, MASK, SCRATCH, STRIDE_M
|
||||
sub WK7, WK6, WK&src, lsr #24
|
||||
mul_8888_8 WK&dst, WK7, SCRATCH, STRIDE_M
|
||||
uqadd8 WK&dst, WK&dst, WK&src
|
||||
mul_8888_8 WK\()\src, MASK, SCRATCH, STRIDE_M
|
||||
sub WK7, WK6, WK\()\src, lsr #24
|
||||
mul_8888_8 WK\()\dst, WK7, SCRATCH, STRIDE_M
|
||||
uqadd8 WK\()\dst, WK\()\dst, WK\()\src
|
||||
.endm
|
||||
|
||||
.macro over_8888_n_8888_process_tail cond, numbytes, firstreg
|
||||
|
@ -587,12 +592,12 @@ generate_composite_function \
|
|||
WK5 .req STRIDE_D
|
||||
WK6 .req STRIDE_S
|
||||
WK7 .req ORIG_W
|
||||
over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
|
||||
over_8888_8888_check_transparent \numbytes, %(4+(\firstreg%2)), %(5+(\firstreg%2)), %(6+\firstreg), %(7+\firstreg)
|
||||
beq 10f
|
||||
mov WK6, #255
|
||||
.set PROCESS_REG, firstreg
|
||||
.rept numbytes / 4
|
||||
.if numbytes == 16 && PROCESS_REG == 2
|
||||
.set PROCESS_REG, \firstreg
|
||||
.rept \numbytes / 4
|
||||
.if \numbytes == 16 && PROCESS_REG == 2
|
||||
/* We're using WK6 and WK7 as temporaries, so half way through
|
||||
* 4 pixels, reload the second two source pixels but this time
|
||||
* into WK4 and WK5 */
|
||||
|
@ -601,7 +606,7 @@ generate_composite_function \
|
|||
over_8888_n_8888_1pixel %(4+(PROCESS_REG%2)), %(PROCESS_REG)
|
||||
.set PROCESS_REG, PROCESS_REG+1
|
||||
.endr
|
||||
pixst , numbytes, firstreg, DST
|
||||
pixst , \numbytes, \firstreg, DST
|
||||
10:
|
||||
.unreq WK4
|
||||
.unreq WK5
|
||||
|
@ -642,13 +647,13 @@ generate_composite_function \
|
|||
|
||||
.macro over_n_8_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
|
||||
WK4 .req STRIDE_M
|
||||
pixld , numbytes/4, 4, MASK, unaligned_mask
|
||||
pixld , numbytes, firstreg, DST, 0
|
||||
pixld , \numbytes/4, 4, MASK, \unaligned_mask
|
||||
pixld , \numbytes, \firstreg, DST, 0
|
||||
.unreq WK4
|
||||
.endm
|
||||
|
||||
.macro over_n_8_8888_1pixel src, dst
|
||||
uxtb Y, WK4, ror #src*8
|
||||
uxtb Y, WK4, ror #\src*8
|
||||
/* Trailing part of multiplication of source */
|
||||
mla SCRATCH, STRIDE_S, Y, STRIDE_D
|
||||
mla Y, SRC, Y, STRIDE_D
|
||||
|
@ -659,20 +664,20 @@ generate_composite_function \
|
|||
sub ORIG_W, ORIG_W, Y, lsr #24
|
||||
sel Y, SCRATCH, Y
|
||||
/* Then multiply the destination */
|
||||
mul_8888_8 WK&dst, ORIG_W, SCRATCH, STRIDE_D
|
||||
uqadd8 WK&dst, WK&dst, Y
|
||||
mul_8888_8 WK\()\dst, ORIG_W, SCRATCH, STRIDE_D
|
||||
uqadd8 WK\()\dst, WK\()\dst, Y
|
||||
.endm
|
||||
|
||||
.macro over_n_8_8888_process_tail cond, numbytes, firstreg
|
||||
WK4 .req STRIDE_M
|
||||
teq WK4, #0
|
||||
beq 10f
|
||||
.set PROCESS_REG, firstreg
|
||||
.rept numbytes / 4
|
||||
over_n_8_8888_1pixel %(PROCESS_REG-firstreg), %(PROCESS_REG)
|
||||
.set PROCESS_REG, \firstreg
|
||||
.rept \numbytes / 4
|
||||
over_n_8_8888_1pixel %(PROCESS_REG-\firstreg), %(PROCESS_REG)
|
||||
.set PROCESS_REG, PROCESS_REG+1
|
||||
.endr
|
||||
pixst , numbytes, firstreg, DST
|
||||
pixst , \numbytes, \firstreg, DST
|
||||
10:
|
||||
.unreq WK4
|
||||
.endm
|
||||
|
@ -705,14 +710,14 @@ generate_composite_function \
|
|||
.endm
|
||||
|
||||
.macro over_reverse_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
|
||||
pixld , numbytes, firstreg, DST, 0
|
||||
pixld , \numbytes, \firstreg, DST, 0
|
||||
.endm
|
||||
|
||||
.macro over_reverse_n_8888_1pixel d, is_only
|
||||
teq WK&d, #0
|
||||
teq WK\()\d, #0
|
||||
beq 8f /* replace with source */
|
||||
bics ORIG_W, STRIDE_D, WK&d, lsr #24
|
||||
.if is_only == 1
|
||||
bics ORIG_W, STRIDE_D, WK\()\d, lsr #24
|
||||
.if \is_only == 1
|
||||
beq 49f /* skip store */
|
||||
.else
|
||||
beq 9f /* write same value back */
|
||||
|
@ -723,36 +728,36 @@ generate_composite_function \
|
|||
uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8
|
||||
mov SCRATCH, SCRATCH, ror #8
|
||||
sel ORIG_W, SCRATCH, ORIG_W
|
||||
uqadd8 WK&d, WK&d, ORIG_W
|
||||
uqadd8 WK\()\d, WK\()\d, ORIG_W
|
||||
b 9f
|
||||
8: mov WK&d, SRC
|
||||
8: mov WK\()\d, SRC
|
||||
9:
|
||||
.endm
|
||||
|
||||
.macro over_reverse_n_8888_tail numbytes, reg1, reg2, reg3, reg4
|
||||
.if numbytes == 4
|
||||
over_reverse_n_8888_1pixel reg1, 1
|
||||
.if \numbytes == 4
|
||||
over_reverse_n_8888_1pixel \reg1, 1
|
||||
.else
|
||||
and SCRATCH, WK&reg1, WK&reg2
|
||||
.if numbytes == 16
|
||||
and SCRATCH, SCRATCH, WK&reg3
|
||||
and SCRATCH, SCRATCH, WK&reg4
|
||||
and SCRATCH, WK\()\reg1, WK\()\reg2
|
||||
.if \numbytes == 16
|
||||
and SCRATCH, SCRATCH, WK\()\reg3
|
||||
and SCRATCH, SCRATCH, WK\()\reg4
|
||||
.endif
|
||||
mvns SCRATCH, SCRATCH, asr #24
|
||||
beq 49f /* skip store if all opaque */
|
||||
over_reverse_n_8888_1pixel reg1, 0
|
||||
over_reverse_n_8888_1pixel reg2, 0
|
||||
.if numbytes == 16
|
||||
over_reverse_n_8888_1pixel reg3, 0
|
||||
over_reverse_n_8888_1pixel reg4, 0
|
||||
over_reverse_n_8888_1pixel \reg1, 0
|
||||
over_reverse_n_8888_1pixel \reg2, 0
|
||||
.if \numbytes == 16
|
||||
over_reverse_n_8888_1pixel \reg3, 0
|
||||
over_reverse_n_8888_1pixel \reg4, 0
|
||||
.endif
|
||||
.endif
|
||||
pixst , numbytes, reg1, DST
|
||||
pixst , \numbytes, \reg1, DST
|
||||
49:
|
||||
.endm
|
||||
|
||||
.macro over_reverse_n_8888_process_tail cond, numbytes, firstreg
|
||||
over_reverse_n_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
|
||||
over_reverse_n_8888_tail \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
|
||||
.endm
|
||||
|
||||
generate_composite_function \
|
||||
|
@ -794,20 +799,20 @@ generate_composite_function \
|
|||
|
||||
.macro over_white_8888_8888_ca_combine m, d
|
||||
uxtb16 TMP1, TMP0 /* rb_notmask */
|
||||
uxtb16 TMP2, d /* rb_dest; 1 stall follows */
|
||||
uxtb16 TMP2, \d /* rb_dest; 1 stall follows */
|
||||
smlatt TMP3, TMP2, TMP1, HALF /* red */
|
||||
smlabb TMP2, TMP2, TMP1, HALF /* blue */
|
||||
uxtb16 TMP0, TMP0, ror #8 /* ag_notmask */
|
||||
uxtb16 TMP1, d, ror #8 /* ag_dest; 1 stall follows */
|
||||
smlatt d, TMP1, TMP0, HALF /* alpha */
|
||||
uxtb16 TMP1, \d, ror #8 /* ag_dest; 1 stall follows */
|
||||
smlatt \d, TMP1, TMP0, HALF /* alpha */
|
||||
smlabb TMP1, TMP1, TMP0, HALF /* green */
|
||||
pkhbt TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */
|
||||
pkhbt TMP1, TMP1, d, lsl #16 /* ag */
|
||||
pkhbt TMP1, TMP1, \d, lsl #16 /* ag */
|
||||
uxtab16 TMP0, TMP0, TMP0, ror #8
|
||||
uxtab16 TMP1, TMP1, TMP1, ror #8
|
||||
mov TMP0, TMP0, ror #8
|
||||
sel d, TMP0, TMP1
|
||||
uqadd8 d, d, m /* d is a late result */
|
||||
sel \d, TMP0, TMP1
|
||||
uqadd8 \d, \d, \m /* d is a late result */
|
||||
.endm
|
||||
|
||||
.macro over_white_8888_8888_ca_1pixel_head
|
||||
|
@ -853,10 +858,10 @@ generate_composite_function \
|
|||
.endm
|
||||
|
||||
.macro over_white_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
|
||||
.if numbytes == 4
|
||||
.if \numbytes == 4
|
||||
over_white_8888_8888_ca_1pixel_head
|
||||
.else
|
||||
.if numbytes == 16
|
||||
.if \numbytes == 16
|
||||
over_white_8888_8888_ca_2pixels_head
|
||||
over_white_8888_8888_ca_2pixels_tail
|
||||
.endif
|
||||
|
@ -865,7 +870,7 @@ generate_composite_function \
|
|||
.endm
|
||||
|
||||
.macro over_white_8888_8888_ca_process_tail cond, numbytes, firstreg
|
||||
.if numbytes == 4
|
||||
.if \numbytes == 4
|
||||
over_white_8888_8888_ca_1pixel_tail
|
||||
.else
|
||||
over_white_8888_8888_ca_2pixels_tail
|
||||
|
@ -1004,7 +1009,7 @@ generate_composite_function \
|
|||
.endm
|
||||
|
||||
.macro over_n_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
|
||||
.rept (numbytes / 4) - 1
|
||||
.rept (\numbytes / 4) - 1
|
||||
over_n_8888_8888_ca_1pixel_head
|
||||
over_n_8888_8888_ca_1pixel_tail
|
||||
.endr
|
||||
|
@ -1020,7 +1025,9 @@ pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6
|
|||
cmp ip, #-1
|
||||
beq pixman_composite_over_white_8888_8888_ca_asm_armv6
|
||||
/* else drop through... */
|
||||
#ifndef __clang__
|
||||
.endfunc
|
||||
#endif
|
||||
generate_composite_function \
|
||||
pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32 \
|
||||
FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0 \
|
||||
|
@ -1045,84 +1052,84 @@ generate_composite_function \
|
|||
|
||||
.macro in_reverse_8888_8888_head numbytes, reg1, reg2, reg3
|
||||
ldrb ORIG_W, [SRC], #4
|
||||
.if numbytes >= 8
|
||||
ldrb WK&reg1, [SRC], #4
|
||||
.if numbytes == 16
|
||||
ldrb WK&reg2, [SRC], #4
|
||||
ldrb WK&reg3, [SRC], #4
|
||||
.if \numbytes >= 8
|
||||
ldrb WK\()\reg1, [SRC], #4
|
||||
.if \numbytes == 16
|
||||
ldrb WK\()\reg2, [SRC], #4
|
||||
ldrb WK\()\reg3, [SRC], #4
|
||||
.endif
|
||||
.endif
|
||||
add DST, DST, #numbytes
|
||||
add DST, DST, #\numbytes
|
||||
.endm
|
||||
|
||||
.macro in_reverse_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
|
||||
in_reverse_8888_8888_head numbytes, firstreg, %(firstreg+1), %(firstreg+2)
|
||||
in_reverse_8888_8888_head \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2)
|
||||
.endm
|
||||
|
||||
.macro in_reverse_8888_8888_1pixel s, d, offset, is_only
|
||||
.if is_only != 1
|
||||
movs s, ORIG_W
|
||||
.if offset != 0
|
||||
ldrb ORIG_W, [SRC, #offset]
|
||||
.if \is_only != 1
|
||||
movs \s, ORIG_W
|
||||
.if \offset != 0
|
||||
ldrb ORIG_W, [SRC, #\offset]
|
||||
.endif
|
||||
beq 01f
|
||||
teq STRIDE_M, #0xFF
|
||||
beq 02f
|
||||
.endif
|
||||
uxtb16 SCRATCH, d /* rb_dest */
|
||||
uxtb16 d, d, ror #8 /* ag_dest */
|
||||
mla SCRATCH, SCRATCH, s, MASK
|
||||
mla d, d, s, MASK
|
||||
uxtb16 SCRATCH, \d /* rb_dest */
|
||||
uxtb16 \d, \d, ror #8 /* ag_dest */
|
||||
mla SCRATCH, SCRATCH, \s, MASK
|
||||
mla \d, \d, \s, MASK
|
||||
uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
|
||||
uxtab16 d, d, d, ror #8
|
||||
uxtab16 \d, \d, \d, ror #8
|
||||
mov SCRATCH, SCRATCH, ror #8
|
||||
sel d, SCRATCH, d
|
||||
sel \d, SCRATCH, \d
|
||||
b 02f
|
||||
.if offset == 0
|
||||
.if \offset == 0
|
||||
48: /* Last mov d,#0 of the set - used as part of shortcut for
|
||||
* source values all 0 */
|
||||
.endif
|
||||
01: mov d, #0
|
||||
01: mov \d, #0
|
||||
02:
|
||||
.endm
|
||||
|
||||
.macro in_reverse_8888_8888_tail numbytes, reg1, reg2, reg3, reg4
|
||||
.if numbytes == 4
|
||||
.if \numbytes == 4
|
||||
teq ORIG_W, ORIG_W, asr #32
|
||||
ldrne WK&reg1, [DST, #-4]
|
||||
.elseif numbytes == 8
|
||||
teq ORIG_W, WK&reg1
|
||||
ldrne WK\()\reg1, [DST, #-4]
|
||||
.elseif \numbytes == 8
|
||||
teq ORIG_W, WK\()\reg1
|
||||
teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
|
||||
ldmnedb DST, {WK&reg1-WK&reg2}
|
||||
ldmnedb DST, {WK\()\reg1-WK\()\reg2}
|
||||
.else
|
||||
teq ORIG_W, WK&reg1
|
||||
teqeq ORIG_W, WK&reg2
|
||||
teqeq ORIG_W, WK&reg3
|
||||
teq ORIG_W, WK\()\reg1
|
||||
teqeq ORIG_W, WK\()\reg2
|
||||
teqeq ORIG_W, WK\()\reg3
|
||||
teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
|
||||
ldmnedb DST, {WK&reg1-WK&reg4}
|
||||
ldmnedb DST, {WK\()\reg1-WK\()\reg4}
|
||||
.endif
|
||||
cmnne DST, #0 /* clear C if NE */
|
||||
bcs 49f /* no writes to dest if source all -1 */
|
||||
beq 48f /* set dest to all 0 if source all 0 */
|
||||
.if numbytes == 4
|
||||
in_reverse_8888_8888_1pixel ORIG_W, WK&reg1, 0, 1
|
||||
str WK&reg1, [DST, #-4]
|
||||
.elseif numbytes == 8
|
||||
in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -4, 0
|
||||
in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, 0, 0
|
||||
stmdb DST, {WK&reg1-WK&reg2}
|
||||
.if \numbytes == 4
|
||||
in_reverse_8888_8888_1pixel ORIG_W, WK\()\reg1, 0, 1
|
||||
str WK\()\reg1, [DST, #-4]
|
||||
.elseif \numbytes == 8
|
||||
in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg1, -4, 0
|
||||
in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg2, 0, 0
|
||||
stmdb DST, {WK\()\reg1-WK\()\reg2}
|
||||
.else
|
||||
in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -12, 0
|
||||
in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, -8, 0
|
||||
in_reverse_8888_8888_1pixel STRIDE_M, WK&reg3, -4, 0
|
||||
in_reverse_8888_8888_1pixel STRIDE_M, WK&reg4, 0, 0
|
||||
stmdb DST, {WK&reg1-WK&reg4}
|
||||
in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg1, -12, 0
|
||||
in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg2, -8, 0
|
||||
in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg3, -4, 0
|
||||
in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg4, 0, 0
|
||||
stmdb DST, {WK\()\reg1-WK\()\reg4}
|
||||
.endif
|
||||
49:
|
||||
.endm
|
||||
|
||||
.macro in_reverse_8888_8888_process_tail cond, numbytes, firstreg
|
||||
in_reverse_8888_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
|
||||
in_reverse_8888_8888_tail \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
|
||||
.endm
|
||||
|
||||
generate_composite_function \
|
||||
|
@ -1149,21 +1156,21 @@ generate_composite_function \
|
|||
.endm
|
||||
|
||||
.macro over_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
|
||||
pixld , numbytes, firstreg, DST, 0
|
||||
pixld , \numbytes, \firstreg, DST, 0
|
||||
.endm
|
||||
|
||||
.macro over_n_8888_1pixel dst
|
||||
mul_8888_8 WK&dst, STRIDE_M, SCRATCH, MASK
|
||||
uqadd8 WK&dst, WK&dst, SRC
|
||||
mul_8888_8 WK\()\dst, STRIDE_M, SCRATCH, MASK
|
||||
uqadd8 WK\()\dst, WK\()\dst, SRC
|
||||
.endm
|
||||
|
||||
.macro over_n_8888_process_tail cond, numbytes, firstreg
|
||||
.set PROCESS_REG, firstreg
|
||||
.rept numbytes / 4
|
||||
.set PROCESS_REG, \firstreg
|
||||
.rept \numbytes / 4
|
||||
over_n_8888_1pixel %(PROCESS_REG)
|
||||
.set PROCESS_REG, PROCESS_REG+1
|
||||
.endr
|
||||
pixst , numbytes, firstreg, DST
|
||||
pixst , \numbytes, \firstreg, DST
|
||||
.endm
|
||||
|
||||
generate_composite_function \
|
||||
|
|
|
@ -112,64 +112,96 @@
|
|||
*/
|
||||
|
||||
.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
|
||||
.if numbytes == 16
|
||||
.if unaligned == 1
|
||||
op&r&cond WK&reg0, [base], #4
|
||||
op&r&cond WK&reg1, [base], #4
|
||||
op&r&cond WK&reg2, [base], #4
|
||||
op&r&cond WK&reg3, [base], #4
|
||||
.if \numbytes == 16
|
||||
.if \unaligned == 1
|
||||
\op\()r\()\cond WK\()\reg0, [\base], #4
|
||||
\op\()r\()\cond WK\()\reg1, [\base], #4
|
||||
\op\()r\()\cond WK\()\reg2, [\base], #4
|
||||
\op\()r\()\cond WK\()\reg3, [\base], #4
|
||||
.else
|
||||
op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
|
||||
#ifdef __clang__
|
||||
\op\()mia\()\cond \base!, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
|
||||
#else
|
||||
\op\()m\()\cond\()ia \base!, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
|
||||
#endif
|
||||
.endif
|
||||
.elseif numbytes == 8
|
||||
.if unaligned == 1
|
||||
op&r&cond WK&reg0, [base], #4
|
||||
op&r&cond WK&reg1, [base], #4
|
||||
.elseif \numbytes == 8
|
||||
.if \unaligned == 1
|
||||
\op\()r\()\cond WK\()\reg0, [\base], #4
|
||||
\op\()r\()\cond WK\()\reg1, [\base], #4
|
||||
.else
|
||||
op&m&cond&ia base!, {WK&reg0,WK&reg1}
|
||||
#ifdef __clang__
|
||||
\op\()mia\()\cond \base!, {WK\()\reg0,WK\()\reg1}
|
||||
#else
|
||||
\op\()m\()\cond\()ia \base!, {WK\()\reg0,WK\()\reg1}
|
||||
#endif
|
||||
.endif
|
||||
.elseif numbytes == 4
|
||||
op&r&cond WK&reg0, [base], #4
|
||||
.elseif numbytes == 2
|
||||
op&r&cond&h WK&reg0, [base], #2
|
||||
.elseif numbytes == 1
|
||||
op&r&cond&b WK&reg0, [base], #1
|
||||
.elseif \numbytes == 4
|
||||
\op\()r\()\cond WK\()\reg0, [\base], #4
|
||||
.elseif \numbytes == 2
|
||||
#ifdef __clang__
|
||||
\op\()rh\()\cond WK\()\reg0, [\base], #2
|
||||
#else
|
||||
\op\()r\()\cond\()h WK\()\reg0, [\base], #2
|
||||
#endif
|
||||
.elseif \numbytes == 1
|
||||
#ifdef __clang__
|
||||
\op\()rb\()\cond WK\()\reg0, [\base], #1
|
||||
#else
|
||||
\op\()r\()\cond\()b WK\()\reg0, [\base], #1
|
||||
#endif
|
||||
.else
|
||||
.error "unsupported size: numbytes"
|
||||
.error "unsupported size: \numbytes"
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
|
||||
.if numbytes == 16
|
||||
stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
|
||||
.elseif numbytes == 8
|
||||
stm&cond&db base, {WK&reg0,WK&reg1}
|
||||
.elseif numbytes == 4
|
||||
str&cond WK&reg0, [base, #-4]
|
||||
.elseif numbytes == 2
|
||||
str&cond&h WK&reg0, [base, #-2]
|
||||
.elseif numbytes == 1
|
||||
str&cond&b WK&reg0, [base, #-1]
|
||||
.if \numbytes == 16
|
||||
#ifdef __clang__
|
||||
stm\()\cond\()db \base, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
|
||||
#else
|
||||
stmdb\()\cond \base, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
|
||||
#endif
|
||||
.elseif \numbytes == 8
|
||||
#ifdef __clang__
|
||||
stmdb\()\cond \base, {WK\()\reg0,WK\()\reg1}
|
||||
#else
|
||||
stm\()\cond\()db \base, {WK\()\reg0,WK\()\reg1}
|
||||
#endif
|
||||
.elseif \numbytes == 4
|
||||
str\()\cond WK\()\reg0, [\base, #-4]
|
||||
.elseif \numbytes == 2
|
||||
#ifdef __clang__
|
||||
strh\()\cond WK\()\reg0, [\base, #-2]
|
||||
#else
|
||||
str\()\cond\()h WK\()\reg0, [\base, #-2]
|
||||
#endif
|
||||
.elseif \numbytes == 1
|
||||
#ifdef __clang__
|
||||
strb\()\cond WK\()\reg0, [\base, #-1]
|
||||
#else
|
||||
str\()\cond\()b WK\()\reg0, [\base, #-1]
|
||||
#endif
|
||||
.else
|
||||
.error "unsupported size: numbytes"
|
||||
.error "unsupported size: \numbytes"
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro pixld cond, numbytes, firstreg, base, unaligned
|
||||
pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
|
||||
pixldst ld, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base, \unaligned
|
||||
.endm
|
||||
|
||||
.macro pixst cond, numbytes, firstreg, base
|
||||
.if (flags) & FLAG_DST_READWRITE
|
||||
pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
|
||||
pixst_baseupdated \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base
|
||||
.else
|
||||
pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
|
||||
pixldst st, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro PF a, x:vararg
|
||||
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
|
||||
a x
|
||||
\a \x
|
||||
.endif
|
||||
.endm
|
||||
|
||||
|
@ -179,11 +211,11 @@
|
|||
* between 0 and prefetch_distance (inclusive) cache lines ahead so there
|
||||
* are no gaps when the inner loop starts.
|
||||
*/
|
||||
.if bpp > 0
|
||||
PF bic, ptr, base, #31
|
||||
.if \bpp > 0
|
||||
PF bic, \ptr, \base, #31
|
||||
.set OFFSET, 0
|
||||
.rept prefetch_distance+1
|
||||
PF pld, [ptr, #OFFSET]
|
||||
PF pld, [\ptr, #OFFSET]
|
||||
.set OFFSET, OFFSET+32
|
||||
.endr
|
||||
.endif
|
||||
|
@ -201,42 +233,42 @@
|
|||
* and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
|
||||
* possible when there are 4 src bytes for every 1 dst byte).
|
||||
*/
|
||||
.if bpp > 0
|
||||
.ifc base,DST
|
||||
.if \bpp > 0
|
||||
.ifc \base,DST
|
||||
/* The test can be simplified further when preloading the destination */
|
||||
PF tst, base, #16
|
||||
PF tst, \base, #16
|
||||
PF beq, 61f
|
||||
.else
|
||||
.if bpp/dst_w_bpp == 4
|
||||
PF add, SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
|
||||
.if \bpp/dst_w_bpp == 4
|
||||
PF add, SCRATCH, \base, WK0, lsl #\bpp_shift-dst_bpp_shift
|
||||
PF and, SCRATCH, SCRATCH, #31
|
||||
PF rsb, SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
|
||||
PF rsb, SCRATCH, SCRATCH, WK0, lsl #\bpp_shift-dst_bpp_shift
|
||||
PF sub, SCRATCH, SCRATCH, #1 /* so now ranges are -16..-1 / 0..31 / 32..63 */
|
||||
PF movs, SCRATCH, SCRATCH, lsl #32-6 /* so this sets NC / nc / Nc */
|
||||
PF bcs, 61f
|
||||
PF bpl, 60f
|
||||
PF pld, [ptr, #32*(prefetch_distance+2)]
|
||||
.else
|
||||
PF mov, SCRATCH, base, lsl #32-5
|
||||
PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
|
||||
PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
|
||||
PF mov, SCRATCH, \base, lsl #32-5
|
||||
PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift
|
||||
PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift
|
||||
PF bls, 61f
|
||||
.endif
|
||||
.endif
|
||||
60: PF pld, [ptr, #32*(prefetch_distance+1)]
|
||||
60: PF pld, [\ptr, #32*(prefetch_distance+1)]
|
||||
61:
|
||||
.endif
|
||||
.endm
|
||||
|
||||
#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
|
||||
.macro preload_middle bpp, base, scratch_holds_offset
|
||||
.if bpp > 0
|
||||
.if \bpp > 0
|
||||
/* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
|
||||
.if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
|
||||
.if scratch_holds_offset
|
||||
PF pld, [base, SCRATCH]
|
||||
.if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/\bpp)
|
||||
.if \scratch_holds_offset
|
||||
PF pld, [\base, SCRATCH]
|
||||
.else
|
||||
PF bic, SCRATCH, base, #31
|
||||
PF bic, SCRATCH, \base, #31
|
||||
PF pld, [SCRATCH, #32*prefetch_distance]
|
||||
.endif
|
||||
.endif
|
||||
|
@ -244,28 +276,28 @@
|
|||
.endm
|
||||
|
||||
.macro preload_trailing bpp, bpp_shift, base
|
||||
.if bpp > 0
|
||||
.if bpp*pix_per_block > 256
|
||||
.if \bpp > 0
|
||||
.if \bpp*pix_per_block > 256
|
||||
/* Calculations are more complex if more than one fetch per block */
|
||||
PF and, WK1, base, #31
|
||||
PF add, WK1, WK1, WK0, lsl #bpp_shift
|
||||
PF add, WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
|
||||
PF bic, SCRATCH, base, #31
|
||||
PF and, WK1, \base, #31
|
||||
PF add, WK1, WK1, WK0, lsl #\bpp_shift
|
||||
PF add, WK1, WK1, #32*(\bpp*pix_per_block/256-1)*(prefetch_distance+1)
|
||||
PF bic, SCRATCH, \base, #31
|
||||
80: PF pld, [SCRATCH, #32*(prefetch_distance+1)]
|
||||
PF add, SCRATCH, SCRATCH, #32
|
||||
PF subs, WK1, WK1, #32
|
||||
PF bhi, 80b
|
||||
.else
|
||||
/* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
|
||||
PF mov, SCRATCH, base, lsl #32-5
|
||||
PF adds, SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
|
||||
PF mov, SCRATCH, \base, lsl #32-5
|
||||
PF adds, SCRATCH, SCRATCH, X, lsl #32-5+\bpp_shift
|
||||
PF adceqs, SCRATCH, SCRATCH, #0
|
||||
/* The instruction above has two effects: ensures Z is only
|
||||
* set if C was clear (so Z indicates that both shifted quantities
|
||||
* were 0), and clears C if Z was set (so C indicates that the sum
|
||||
* of the shifted quantities was greater and not equal to 32) */
|
||||
PF beq, 82f
|
||||
PF bic, SCRATCH, base, #31
|
||||
PF bic, SCRATCH, \base, #31
|
||||
PF bcc, 81f
|
||||
PF pld, [SCRATCH, #32*(prefetch_distance+2)]
|
||||
81: PF pld, [SCRATCH, #32*(prefetch_distance+1)]
|
||||
|
@ -288,12 +320,12 @@
|
|||
* "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
|
||||
* "base" - base address register of channel to preload (SRC, MASK or DST)
|
||||
*/
|
||||
.if bpp > 0
|
||||
.if narrow_case && (bpp <= dst_w_bpp)
|
||||
.if \bpp > 0
|
||||
.if \narrow_case && (\bpp <= dst_w_bpp)
|
||||
/* In these cases, each line for each channel is in either 1 or 2 cache lines */
|
||||
PF bic, WK0, base, #31
|
||||
PF bic, WK0, \base, #31
|
||||
PF pld, [WK0]
|
||||
PF add, WK1, base, X, LSL #bpp_shift
|
||||
PF add, WK1, \base, X, LSL #\bpp_shift
|
||||
PF sub, WK1, WK1, #1
|
||||
PF bic, WK1, WK1, #31
|
||||
PF cmp, WK1, WK0
|
||||
|
@ -301,9 +333,9 @@
|
|||
PF pld, [WK1]
|
||||
90:
|
||||
.else
|
||||
PF bic, WK0, base, #31
|
||||
PF bic, WK0, \base, #31
|
||||
PF pld, [WK0]
|
||||
PF add, WK1, base, X, lsl #bpp_shift
|
||||
PF add, WK1, \base, X, lsl #\bpp_shift
|
||||
PF sub, WK1, WK1, #1
|
||||
PF bic, WK1, WK1, #31
|
||||
PF cmp, WK1, WK0
|
||||
|
@ -319,56 +351,56 @@
|
|||
|
||||
|
||||
.macro conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
|
||||
process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
|
||||
.if decrementx
|
||||
sub&cond X, X, #8*numbytes/dst_w_bpp
|
||||
\process_head \cond, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, 0
|
||||
.if \decrementx
|
||||
sub\()\cond X, X, #8*\numbytes/dst_w_bpp
|
||||
.endif
|
||||
process_tail cond, numbytes, firstreg
|
||||
\process_tail \cond, \numbytes, \firstreg
|
||||
.if !((flags) & FLAG_PROCESS_DOES_STORE)
|
||||
pixst cond, numbytes, firstreg, DST
|
||||
pixst \cond, \numbytes, \firstreg, DST
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro conditional_process1 cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
|
||||
.if (flags) & FLAG_BRANCH_OVER
|
||||
.ifc cond,mi
|
||||
.ifc \cond,mi
|
||||
bpl 100f
|
||||
.endif
|
||||
.ifc cond,cs
|
||||
.ifc \cond,cs
|
||||
bcc 100f
|
||||
.endif
|
||||
.ifc cond,ne
|
||||
.ifc \cond,ne
|
||||
beq 100f
|
||||
.endif
|
||||
conditional_process1_helper , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
|
||||
conditional_process1_helper , \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, \decrementx
|
||||
100:
|
||||
.else
|
||||
conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
|
||||
conditional_process1_helper \cond, \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, \decrementx
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro conditional_process2 test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
|
||||
.if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
|
||||
/* Can't interleave reads and writes */
|
||||
test
|
||||
conditional_process1 cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
|
||||
\test
|
||||
conditional_process1 \cond1, \process_head, \process_tail, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, \decrementx
|
||||
.if (flags) & FLAG_PROCESS_CORRUPTS_PSR
|
||||
test
|
||||
\test
|
||||
.endif
|
||||
conditional_process1 cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
|
||||
conditional_process1 \cond2, \process_head, \process_tail, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, \decrementx
|
||||
.else
|
||||
/* Can interleave reads and writes for better scheduling */
|
||||
test
|
||||
process_head cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
|
||||
process_head cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
|
||||
.if decrementx
|
||||
sub&cond1 X, X, #8*numbytes1/dst_w_bpp
|
||||
sub&cond2 X, X, #8*numbytes2/dst_w_bpp
|
||||
\test
|
||||
\process_head \cond1, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, 0
|
||||
\process_head \cond2, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, 0
|
||||
.if \decrementx
|
||||
sub\()\cond1 X, X, #8*\numbytes1/dst_w_bpp
|
||||
sub\()\cond2 X, X, #8*\numbytes2/dst_w_bpp
|
||||
.endif
|
||||
process_tail cond1, numbytes1, firstreg1
|
||||
process_tail cond2, numbytes2, firstreg2
|
||||
pixst cond1, numbytes1, firstreg1, DST
|
||||
pixst cond2, numbytes2, firstreg2, DST
|
||||
\process_tail \cond1, \numbytes1, \firstreg1
|
||||
\process_tail \cond2, \numbytes2, \firstreg2
|
||||
pixst \cond1, \numbytes1, \firstreg1, DST
|
||||
pixst \cond2, \numbytes2, \firstreg2, DST
|
||||
.endif
|
||||
.endm
|
||||
|
||||
|
@ -400,12 +432,12 @@
|
|||
.endif
|
||||
/* Use unaligned loads in all cases for simplicity */
|
||||
.if dst_w_bpp == 8
|
||||
conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
|
||||
conditional_process2 test_bits_1_0_ptr, mi, cs, \process_head, \process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
|
||||
.elseif dst_w_bpp == 16
|
||||
test_bits_1_0_ptr
|
||||
conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X
|
||||
conditional_process1 cs, \process_head, \process_tail, 2, 2, 1, 1, DECREMENT_X
|
||||
.endif
|
||||
conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
|
||||
conditional_process2 test_bits_3_2_ptr, mi, cs, \process_head, \process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
|
||||
.if (flags) & FLAG_PROCESS_CORRUPTS_WK0
|
||||
ldr X, [sp, #LINE_SAVED_REG_COUNT*4]
|
||||
.endif
|
||||
|
@ -424,12 +456,12 @@
|
|||
.endm
|
||||
|
||||
.macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
|
||||
conditional_process2 test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
|
||||
conditional_process2 test_bits_3_2_pix, cs, mi, \process_head, \process_tail, 8, 4, 0, 2, \unaligned_src, \unaligned_mask, 0
|
||||
.if dst_w_bpp == 16
|
||||
test_bits_1_0_pix
|
||||
conditional_process1 cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
|
||||
conditional_process1 cs, \process_head, \process_tail, 2, 0, \unaligned_src, \unaligned_mask, 0
|
||||
.elseif dst_w_bpp == 8
|
||||
conditional_process2 test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
|
||||
conditional_process2 test_bits_1_0_pix, cs, mi, \process_head, \process_tail, 2, 1, 0, 1, \unaligned_src, \unaligned_mask, 0
|
||||
.endif
|
||||
.endm
|
||||
|
||||
|
@ -438,7 +470,7 @@
|
|||
110:
|
||||
.set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
|
||||
.rept pix_per_block*dst_w_bpp/128
|
||||
process_head , 16, 0, unaligned_src, unaligned_mask, 1
|
||||
\process_head , 16, 0, \unaligned_src, \unaligned_mask, 1
|
||||
.if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
|
||||
preload_middle src_bpp, SRC, 1
|
||||
.elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
|
||||
|
@ -453,9 +485,9 @@
|
|||
* preloads for, to achieve staggered prefetches for multiple channels, because there are
|
||||
* always two STMs per prefetch, so there is always an opposite STM on which to put the
|
||||
* preload. Note, no need to BIC the base register here */
|
||||
PF pld, [DST, #32*prefetch_distance - dst_alignment]
|
||||
PF pld, [DST, #32*prefetch_distance - \dst_alignment]
|
||||
.endif
|
||||
process_tail , 16, 0
|
||||
\process_tail , 16, 0
|
||||
.if !((flags) & FLAG_PROCESS_DOES_STORE)
|
||||
pixst , 16, 0, DST
|
||||
.endif
|
||||
|
@ -470,11 +502,11 @@
|
|||
.if dst_r_bpp > 0
|
||||
tst DST, #16
|
||||
bne 111f
|
||||
process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS
|
||||
\process_inner_loop \process_head, \process_tail, \unaligned_src, \unaligned_mask, 16 + DST_PRELOAD_BIAS
|
||||
b 112f
|
||||
111:
|
||||
.endif
|
||||
process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS
|
||||
\process_inner_loop \process_head, \process_tail, \unaligned_src, \unaligned_mask, 0 + DST_PRELOAD_BIAS
|
||||
112:
|
||||
/* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
|
||||
.if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
|
||||
|
@ -487,13 +519,13 @@
|
|||
.endif
|
||||
add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
|
||||
/* The remainder of the line is handled identically to the medium case */
|
||||
medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
|
||||
medium_case_inner_loop_and_trailing_pixels \process_head, \process_tail,, \exit_label, \unaligned_src, \unaligned_mask
|
||||
.endm
|
||||
|
||||
.macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
|
||||
120:
|
||||
process_head , 16, 0, unaligned_src, unaligned_mask, 0
|
||||
process_tail , 16, 0
|
||||
\process_head , 16, 0, \unaligned_src, \unaligned_mask, 0
|
||||
\process_tail , 16, 0
|
||||
.if !((flags) & FLAG_PROCESS_DOES_STORE)
|
||||
pixst , 16, 0, DST
|
||||
.endif
|
||||
|
@ -501,16 +533,16 @@
|
|||
bhs 120b
|
||||
/* Trailing pixels */
|
||||
tst X, #128/dst_w_bpp - 1
|
||||
beq exit_label
|
||||
trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
|
||||
beq \exit_label
|
||||
trailing_15bytes \process_head, \process_tail, \unaligned_src, \unaligned_mask
|
||||
.endm
|
||||
|
||||
.macro narrow_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
|
||||
tst X, #16*8/dst_w_bpp
|
||||
conditional_process1 ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
|
||||
conditional_process1 ne, \process_head, \process_tail, 16, 0, \unaligned_src, \unaligned_mask, 0
|
||||
/* Trailing pixels */
|
||||
/* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
|
||||
trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
|
||||
trailing_15bytes \process_head, \process_tail, \unaligned_src, \unaligned_mask
|
||||
.endm
|
||||
|
||||
.macro switch_on_alignment action, process_head, process_tail, process_inner_loop, exit_label
|
||||
|
@ -523,37 +555,37 @@
|
|||
tst SRC, #3
|
||||
bne 140f
|
||||
.endif
|
||||
action process_head, process_tail, process_inner_loop, exit_label, 0, 0
|
||||
\action \process_head, \process_tail, \process_inner_loop, \exit_label, 0, 0
|
||||
.if src_bpp == 8 || src_bpp == 16
|
||||
b exit_label
|
||||
b \exit_label
|
||||
140:
|
||||
action process_head, process_tail, process_inner_loop, exit_label, 1, 0
|
||||
\action \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 0
|
||||
.endif
|
||||
.if mask_bpp == 8 || mask_bpp == 16
|
||||
b exit_label
|
||||
b \exit_label
|
||||
141:
|
||||
.if src_bpp == 8 || src_bpp == 16
|
||||
tst SRC, #3
|
||||
bne 142f
|
||||
.endif
|
||||
action process_head, process_tail, process_inner_loop, exit_label, 0, 1
|
||||
\action \process_head, \process_tail, \process_inner_loop, \exit_label, 0, 1
|
||||
.if src_bpp == 8 || src_bpp == 16
|
||||
b exit_label
|
||||
b \exit_label
|
||||
142:
|
||||
action process_head, process_tail, process_inner_loop, exit_label, 1, 1
|
||||
\action \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 1
|
||||
.endif
|
||||
.endif
|
||||
.endm
|
||||
|
||||
|
||||
.macro end_of_line restore_x, vars_spilled, loop_label, last_one
|
||||
.if vars_spilled
|
||||
.if \vars_spilled
|
||||
/* Sadly, GAS doesn't seem have an equivalent of the DCI directive? */
|
||||
/* This is ldmia sp,{} */
|
||||
.word 0xE89D0000 | LINE_SAVED_REGS
|
||||
.endif
|
||||
subs Y, Y, #1
|
||||
.if vars_spilled
|
||||
.if \vars_spilled
|
||||
.if (LINE_SAVED_REGS) & (1<<1)
|
||||
str Y, [sp]
|
||||
.endif
|
||||
|
@ -565,18 +597,18 @@
|
|||
.if mask_bpp > 0
|
||||
add MASK, MASK, STRIDE_M
|
||||
.endif
|
||||
.if restore_x
|
||||
.if \restore_x
|
||||
mov X, ORIG_W
|
||||
.endif
|
||||
bhs loop_label
|
||||
.ifc "last_one",""
|
||||
.if vars_spilled
|
||||
bhs \loop_label
|
||||
.ifc "\last_one",""
|
||||
.if \vars_spilled
|
||||
b 197f
|
||||
.else
|
||||
b 198f
|
||||
.endif
|
||||
.else
|
||||
.if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
|
||||
.if (!\vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
|
||||
b 198f
|
||||
.endif
|
||||
.endif
|
||||
|
@ -596,17 +628,17 @@
|
|||
process_tail, \
|
||||
process_inner_loop
|
||||
|
||||
pixman_asm_function fname
|
||||
pixman_asm_function \fname
|
||||
|
||||
/*
|
||||
* Make some macro arguments globally visible and accessible
|
||||
* from other macros
|
||||
*/
|
||||
.set src_bpp, src_bpp_
|
||||
.set mask_bpp, mask_bpp_
|
||||
.set dst_w_bpp, dst_w_bpp_
|
||||
.set flags, flags_
|
||||
.set prefetch_distance, prefetch_distance_
|
||||
.set src_bpp, \src_bpp_
|
||||
.set mask_bpp, \mask_bpp_
|
||||
.set dst_w_bpp, \dst_w_bpp_
|
||||
.set flags, \flags_
|
||||
.set prefetch_distance, \prefetch_distance_
|
||||
|
||||
/*
|
||||
* Select prefetch type for this function.
|
||||
|
@ -732,7 +764,7 @@
|
|||
sub Y, Y, #1
|
||||
#endif
|
||||
|
||||
init
|
||||
\init
|
||||
|
||||
.if (flags) & FLAG_PROCESS_CORRUPTS_WK0
|
||||
/* Reserve a word in which to store X during leading pixels */
|
||||
|
@ -773,7 +805,7 @@
|
|||
.set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
|
||||
.endif
|
||||
151: /* New line */
|
||||
newline
|
||||
\newline
|
||||
preload_leading_step1 src_bpp, WK1, SRC
|
||||
preload_leading_step1 mask_bpp, WK2, MASK
|
||||
.if ((flags) & FLAG_NO_PRELOAD_DST) == 0
|
||||
|
@ -790,7 +822,7 @@
|
|||
preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST
|
||||
.endif
|
||||
|
||||
leading_15bytes process_head, process_tail
|
||||
leading_15bytes \process_head, \process_tail
|
||||
|
||||
154: /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
|
||||
.if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
|
||||
|
@ -800,10 +832,10 @@
|
|||
and SCRATCH, MASK, #31
|
||||
rsb SCRATCH, SCRATCH, #32*prefetch_distance
|
||||
.endif
|
||||
.ifc "process_inner_loop",""
|
||||
switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
|
||||
.ifc "\process_inner_loop",""
|
||||
switch_on_alignment wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, wide_case_inner_loop, 157f
|
||||
.else
|
||||
switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
|
||||
switch_on_alignment wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, \process_inner_loop, 157f
|
||||
.endif
|
||||
|
||||
157: /* Check for another line */
|
||||
|
@ -825,7 +857,7 @@
|
|||
.set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
|
||||
.endif
|
||||
161: /* New line */
|
||||
newline
|
||||
\newline
|
||||
preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
|
||||
preload_line 0, mask_bpp, mask_bpp_shift, MASK
|
||||
.if ((flags) & FLAG_NO_PRELOAD_DST) == 0
|
||||
|
@ -837,10 +869,10 @@
|
|||
beq 164f
|
||||
rsb WK0, WK0, #16 /* number of leading bytes until destination aligned */
|
||||
|
||||
leading_15bytes process_head, process_tail
|
||||
leading_15bytes \process_head, \process_tail
|
||||
|
||||
164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */
|
||||
switch_on_alignment medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f
|
||||
switch_on_alignment medium_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 167f
|
||||
|
||||
167: /* Check for another line */
|
||||
end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b
|
||||
|
@ -856,7 +888,7 @@
|
|||
.word 0xE92D0000 | LINE_SAVED_REGS
|
||||
.endif
|
||||
171: /* New line */
|
||||
newline
|
||||
\newline
|
||||
preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
|
||||
preload_line 1, mask_bpp, mask_bpp_shift, MASK
|
||||
.if ((flags) & FLAG_NO_PRELOAD_DST) == 0
|
||||
|
@ -868,8 +900,8 @@
|
|||
beq 174f
|
||||
172: subs X, X, #1
|
||||
blo 177f
|
||||
process_head , 1, 0, 1, 1, 0
|
||||
process_tail , 1, 0
|
||||
\process_head , 1, 0, 1, 1, 0
|
||||
\process_tail , 1, 0
|
||||
.if !((flags) & FLAG_PROCESS_DOES_STORE)
|
||||
pixst , 1, 0, DST
|
||||
.endif
|
||||
|
@ -880,15 +912,15 @@
|
|||
beq 174f
|
||||
subs X, X, #1
|
||||
blo 177f
|
||||
process_head , 2, 0, 1, 1, 0
|
||||
process_tail , 2, 0
|
||||
\process_head , 2, 0, 1, 1, 0
|
||||
\process_tail , 2, 0
|
||||
.if !((flags) & FLAG_PROCESS_DOES_STORE)
|
||||
pixst , 2, 0, DST
|
||||
.endif
|
||||
.endif
|
||||
|
||||
174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
|
||||
switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f
|
||||
switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 177f
|
||||
|
||||
177: /* Check for another line */
|
||||
end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
|
||||
|
@ -908,7 +940,7 @@
|
|||
add sp, sp, #4
|
||||
.endif
|
||||
|
||||
cleanup
|
||||
\cleanup
|
||||
|
||||
#ifdef DEBUG_PARAMS
|
||||
add sp, sp, #9*4 /* junk the debug copy of arguments */
|
||||
|
@ -932,13 +964,15 @@
|
|||
.unreq WK3
|
||||
.unreq SCRATCH
|
||||
.unreq ORIG_W
|
||||
#ifndef __clang__
|
||||
.endfunc
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro line_saved_regs x:vararg
|
||||
.set LINE_SAVED_REGS, 0
|
||||
.set LINE_SAVED_REG_COUNT, 0
|
||||
.irp SAVED_REG,x
|
||||
.irp SAVED_REG,\x
|
||||
.ifc "SAVED_REG","Y"
|
||||
.set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
|
||||
.set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
|
||||
|
|
Загрузка…
Ссылка в новой задаче