Mirror of https://github.com/mozilla/gecko-dev.git
Bug 1901600 - Update dav1d to 92f592ed104ba92ad35c781ee93f354525eef503 r=chunmin
Differential Revision: https://phabricator.services.mozilla.com/D213129
Parent: a4f4b59279
Commit: b50386f0ec
@@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: 5b5399911dd24703de641d65eda5b7f1e845d060 (2024-04-15T13:19:42.000+02:00).
release: 92f592ed104ba92ad35c781ee93f354525eef503 (2024-06-05T23:22:36.000+02:00).

# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: 5b5399911dd24703de641d65eda5b7f1e845d060
revision: 92f592ed104ba92ad35c781ee93f354525eef503

# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/
@@ -1,2 +1,2 @@
/* auto-generated, do not edit */
#define DAV1D_VERSION "5b5399911dd24703de641d65eda5b7f1e845d060"
#define DAV1D_VERSION "92f592ed104ba92ad35c781ee93f354525eef503"
@@ -1,3 +1,18 @@
Changes for 1.4.2 'Road Runner':
--------------------------------

1.4.2 is a small release of dav1d, improving notably ARM, AVX-512 and PowerPC
- AVX2 optimizations for 8-tap and new variants for 6-tap
- AVX-512 optimizations for 8-tap and new variants for 6-tap
- Improve entropy decoding on ARM64
- New ARM64 optimizations for convolutions based on DotProd extension
- New ARM64 optimizations for convolutions based on i8mm extension
- New ARM64 optimizations for subpel and prep filters for i8mm
- Misc improvements on existing ARM64 optimizations, notably for put/prep
- New PowerPC9 optimizations for loopfilter
- Support for macOS kperf API for benchmarking


Changes for 1.4.1 'Road Runner':
--------------------------------
@@ -246,7 +261,7 @@ Changes for 0.6.0 'Gyrfalcon':
- New SSSE3 optimizations for film grain
- New AVX2 optimizations for msac_adapt16
- Fix rare mismatches against the reference decoder, notably because of clipping
- Improvements on ARM64 on msac, cdef and looprestoration optimizations
- Improvements on ARM64 on msac, cdef, mc_blend_v and looprestoration optimizations
- Improvements on AVX2 optimizations for cdef_filter
- Improvements in the C version for itxfm, cdef_filter
@@ -23,7 +23,7 @@
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

project('dav1d', ['c'],
    version: '1.4.1',
    version: '1.4.2',
    default_options: ['c_std=c99',
                      'warning_level=2',
                      'buildtype=release',
@@ -840,100 +840,108 @@ endfunc
function put_neon, export=1
        adr x9, L(put_tbl)
        ldrh w8, [x9, x8, lsl #1]
        sub x9, x9, w8, uxtw
        sub x9, x9, x8
        br x9

2:
20:
        AARCH64_VALID_JUMP_TARGET
        ld1 {v0.h}[0], [x2], x3
        ld1 {v1.h}[0], [x2], x3
        subs w5, w5, #2
        st1 {v0.h}[0], [x0], x1
        st1 {v1.h}[0], [x0], x1
2:
        ldrh w9, [x2]
        ldrh w10, [x2, x3]
        add x2, x2, x3, lsl #1
        subs w5, w5, #2
        strh w9, [x0]
        strh w10, [x0, x1]
        add x0, x0, x1, lsl #1
        b.gt 2b
        ret
4:
40:
        AARCH64_VALID_JUMP_TARGET
        ld1 {v0.s}[0], [x2], x3
        ld1 {v1.s}[0], [x2], x3
        subs w5, w5, #2
        st1 {v0.s}[0], [x0], x1
        st1 {v1.s}[0], [x0], x1
4:
        ldr w9, [x2]
        ldr w10, [x2, x3]
        add x2, x2, x3, lsl #1
        subs w5, w5, #2
        str w9, [x0]
        str w10, [x0, x1]
        add x0, x0, x1, lsl #1
        b.gt 4b
        ret
8:
80:
        AARCH64_VALID_JUMP_TARGET
        ld1 {v0.8b}, [x2], x3
        ld1 {v1.8b}, [x2], x3
        subs w5, w5, #2
        st1 {v0.8b}, [x0], x1
        st1 {v1.8b}, [x0], x1
8:
        ldr x9, [x2]
        ldr x10, [x2, x3]
        add x2, x2, x3, lsl #1
        subs w5, w5, #2
        str x9, [x0]
        str x10, [x0, x1]
        add x0, x0, x1, lsl #1
        b.gt 8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        add x8, x0, x1
        lsl x1, x1, #1
        add x9, x2, x3
        lsl x3, x3, #1
16:
        ld1 {v0.16b}, [x2], x3
        ld1 {v1.16b}, [x9], x3
        subs w5, w5, #2
        st1 {v0.16b}, [x0], x1
        st1 {v1.16b}, [x8], x1
        ldr q0, [x2]
        ldr q1, [x2, x3]
        add x2, x2, x3, lsl #1
        subs w5, w5, #2
        str q0, [x0]
        str q1, [x0, x1]
        add x0, x0, x1, lsl #1
        b.gt 16b
        ret
32:
320:
        AARCH64_VALID_JUMP_TARGET
        ldp x6, x7, [x2]
        ldp x8, x9, [x2, #16]
        stp x6, x7, [x0]
        subs w5, w5, #1
        stp x8, x9, [x0, #16]
        add x2, x2, x3
        add x0, x0, x1
32:
        ldp q0, q1, [x2]
        add x2, x2, x3
        stp q0, q1, [x0]
        add x0, x0, x1
        ldp q2, q3, [x2]
        add x2, x2, x3
        stp q2, q3, [x0]
        subs w5, w5, #2
        add x0, x0, x1
        b.gt 32b
        ret
64:
640:
        AARCH64_VALID_JUMP_TARGET
        ldp x6, x7, [x2]
        ldp x8, x9, [x2, #16]
        stp x6, x7, [x0]
        ldp x10, x11, [x2, #32]
        stp x8, x9, [x0, #16]
        subs w5, w5, #1
        ldp x12, x13, [x2, #48]
        stp x10, x11, [x0, #32]
        stp x12, x13, [x0, #48]
        add x2, x2, x3
        add x0, x0, x1
64:
        ldp q0, q1, [x2]
        stp q0, q1, [x0]
        ldp q2, q3, [x2, #32]
        add x2, x2, x3
        stp q2, q3, [x0, #32]
        subs w5, w5, #1
        add x0, x0, x1
        b.gt 64b
        ret
128:
1280:
        AARCH64_VALID_JUMP_TARGET
        ldp q0, q1, [x2]
        ldp q2, q3, [x2, #32]
        stp q0, q1, [x0]
        ldp q4, q5, [x2, #64]
        stp q2, q3, [x0, #32]
        ldp q6, q7, [x2, #96]
        subs w5, w5, #1
        stp q4, q5, [x0, #64]
        stp q6, q7, [x0, #96]
        add x2, x2, x3
        add x0, x0, x1
128:
        ldp q0, q1, [x2]
        stp q0, q1, [x0]
        ldp q2, q3, [x2, #32]
        stp q2, q3, [x0, #32]
        ldp q4, q5, [x2, #64]
        stp q4, q5, [x0, #64]
        ldp q6, q7, [x2, #96]
        add x2, x2, x3
        stp q6, q7, [x0, #96]
        subs w5, w5, #1
        add x0, x0, x1
        b.gt 128b
        ret

L(put_tbl):
        .hword L(put_tbl) - 128b
        .hword L(put_tbl) -  64b
        .hword L(put_tbl) -  32b
        .hword L(put_tbl) - 160b
        .hword L(put_tbl) -   8b
        .hword L(put_tbl) -   4b
        .hword L(put_tbl) -   2b
        .hword L(put_tbl) - 1280b
        .hword L(put_tbl) -  640b
        .hword L(put_tbl) -  320b
        .hword L(put_tbl) -  160b
        .hword L(put_tbl) -   80b
        .hword L(put_tbl) -   40b
        .hword L(put_tbl) -   20b
endfunc
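All of the width-specialized paths above perform the same operation, a plain strided block copy; only the load/store widths and loop unrolling differ. As a reference point, here is a minimal C model of put (assuming 8-bit pixels and the dst, dst_stride, src, src_stride, w, h argument order of dav1d's C fallbacks; a sketch, not the shipped implementation):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void put_block(uint8_t *dst, const ptrdiff_t dst_stride,
                      const uint8_t *src, const ptrdiff_t src_stride,
                      const int w, const int h)
{
    for (int y = 0; y < h; y++) {
        memcpy(dst, src, w);   /* copy one w-pixel row */
        dst += dst_stride;     /* strides are in bytes at 8 bpc */
        src += src_stride;
    }
}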
@@ -942,119 +950,146 @@ endfunc
function prep_neon, export=1
        adr x9, L(prep_tbl)
        ldrh w8, [x9, x8, lsl #1]
        sub x9, x9, w8, uxtw
        movi v24.16b, #16
        sub x9, x9, x8
        br x9

4:
40:
        AARCH64_VALID_JUMP_TARGET
4:
        ld1 {v0.s}[0], [x1], x2
        ld1 {v0.s}[1], [x1], x2
        ld1 {v1.s}[0], [x1], x2
        subs w4, w4, #2
        ld1 {v1.s}[1], [x1], x2
        ushll v0.8h, v0.8b, #4
        ushll v1.8h, v1.8b, #4
        st1 {v0.4h, v1.4h}, [x0], #16
        subs w4, w4, #4
        stp q0, q1, [x0], #32
        b.gt 4b
        ret
8:
80:
        AARCH64_VALID_JUMP_TARGET
        ld1 {v0.8b}, [x1], x2
        ld1 {v1.8b}, [x1], x2
        subs w4, w4, #2
8:
        ldr d0, [x1]
        ldr d1, [x1, x2]
        add x1, x1, x2, lsl #1
        ldr d2, [x1]
        ldr d3, [x1, x2]
        add x1, x1, x2, lsl #1
        ushll v0.8h, v0.8b, #4
        ushll v1.8h, v1.8b, #4
        st1 {v0.8h, v1.8h}, [x0], #32
        umull v2.8h, v2.8b, v24.8b
        umull v3.8h, v3.8b, v24.8b
        subs w4, w4, #4
        stp q0, q1, [x0]
        stp q2, q3, [x0, #32]
        add x0, x0, #64
        b.gt 8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        add x9, x1, x2
        lsl x2, x2, #1
16:
        ld1 {v0.16b}, [x1], x2
        ld1 {v1.16b}, [x9], x2
        subs w4, w4, #2
        ushll v4.8h, v0.8b, #4
        ushll2 v5.8h, v0.16b, #4
        ushll v6.8h, v1.8b, #4
        ushll2 v7.8h, v1.16b, #4
        st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        ldr q1, [x1]
        ldr q3, [x1, x2]
        add x1, x1, x2, lsl #1
        ushll v0.8h, v1.8b, #4
        ushll2 v1.8h, v1.16b, #4
        ldr q5, [x1]
        ldr q7, [x1, x2]
        add x1, x1, x2, lsl #1
        umull v2.8h, v3.8b, v24.8b
        umull2 v3.8h, v3.16b, v24.16b
        ushll v4.8h, v5.8b, #4
        ushll2 v5.8h, v5.16b, #4
        umull v6.8h, v7.8b, v24.8b
        umull2 v7.8h, v7.16b, v24.16b
        subs w4, w4, #4
        stp q0, q1, [x0]
        stp q2, q3, [x0, #32]
        stp q4, q5, [x0, #64]
        stp q6, q7, [x0, #96]
        add x0, x0, #128
        b.gt 16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        add x8, x0, w3, uxtw
32:
        ld1 {v0.16b, v1.16b}, [x1], x2
        subs w4, w4, #2
        ushll v4.8h, v0.8b, #4
        ushll2 v5.8h, v0.16b, #4
        ld1 {v2.16b, v3.16b}, [x1], x2
        ushll v6.8h, v1.8b, #4
        ushll2 v7.8h, v1.16b, #4
        ushll v16.8h, v2.8b, #4
        st1 {v4.8h, v5.8h}, [x0], x7
        ushll2 v17.8h, v2.16b, #4
        st1 {v6.8h, v7.8h}, [x8], x7
        ushll v18.8h, v3.8b, #4
        st1 {v16.8h, v17.8h}, [x0], x7
        ushll2 v19.8h, v3.16b, #4
        st1 {v18.8h, v19.8h}, [x8], x7
        ldp q4, q5, [x1]
        add x1, x1, x2
        ldp q6, q7, [x1]
        add x1, x1, x2
        ushll v0.8h, v4.8b, #4
        ushll2 v1.8h, v4.16b, #4
        umull v2.8h, v5.8b, v24.8b
        umull2 v3.8h, v5.16b, v24.16b
        ushll v4.8h, v6.8b, #4
        ushll2 v5.8h, v6.16b, #4
        umull v6.8h, v7.8b, v24.8b
        umull2 v7.8h, v7.16b, v24.16b
        subs w4, w4, #2
        stp q0, q1, [x0]
        stp q2, q3, [x0, #32]
        stp q4, q5, [x0, #64]
        stp q6, q7, [x0, #96]
        add x0, x0, #128
        b.gt 32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        add x8, x0, #32
        mov x6, #64
64:
        ldp q0, q1, [x1]
        subs w4, w4, #1
        ushll v4.8h, v0.8b, #4
        ushll2 v5.8h, v0.16b, #4
        ldp q2, q3, [x1, #32]
        ushll v6.8h, v1.8b, #4
        ushll2 v7.8h, v1.16b, #4
        add x1, x1, x2
        ushll v16.8h, v2.8b, #4
        st1 {v4.8h, v5.8h}, [x0], x6
        ushll2 v17.8h, v2.16b, #4
        ushll v18.8h, v3.8b, #4
        st1 {v6.8h, v7.8h}, [x8], x6
        ushll2 v19.8h, v3.16b, #4
        st1 {v16.8h, v17.8h}, [x0], x6
        st1 {v18.8h, v19.8h}, [x8], x6
        ldp q4, q5, [x1]
        ldp q6, q7, [x1, #32]
        add x1, x1, x2
        ushll v0.8h, v4.8b, #4
        ushll2 v1.8h, v4.16b, #4
        umull v2.8h, v5.8b, v24.8b
        umull2 v3.8h, v5.16b, v24.16b
        ushll v4.8h, v6.8b, #4
        ushll2 v5.8h, v6.16b, #4
        umull v6.8h, v7.8b, v24.8b
        umull2 v7.8h, v7.16b, v24.16b
        subs w4, w4, #1
        stp q0, q1, [x0]
        stp q2, q3, [x0, #32]
        stp q4, q5, [x0, #64]
        stp q6, q7, [x0, #96]
        add x0, x0, #128
        b.gt 64b
        ret
1280:
        AARCH64_VALID_JUMP_TARGET
        add x8, x0, #64
        mov x6, #128
128:
        ldp q0, q1, [x1]
        ldp q2, q3, [x1, #32]
        ushll v16.8h, v0.8b, #4
        ushll2 v17.8h, v0.16b, #4
        ushll v18.8h, v1.8b, #4
        ushll2 v19.8h, v1.16b, #4
        ushll v20.8h, v2.8b, #4
        ushll2 v21.8h, v2.16b, #4
        ldp q4, q5, [x1, #64]
        st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x6
        ushll v22.8h, v3.8b, #4
        ushll2 v23.8h, v3.16b, #4
        ushll v24.8h, v4.8b, #4
        ushll2 v25.8h, v4.16b, #4
        ushll v26.8h, v5.8b, #4
        ushll2 v27.8h, v5.16b, #4
        ldp q6, q7, [x1, #96]
        st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x8], x6
        ushll v28.8h, v6.8b, #4
        ushll2 v29.8h, v6.16b, #4
        ushll v30.8h, v7.8b, #4
        ushll2 v31.8h, v7.16b, #4
        subs w4, w4, #1
        add x1, x1, x2
        st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x6
        st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x6
        ldp q28, q29, [x1]
        ldp q30, q31, [x1, #32]
        ushll v16.8h, v28.8b, #4
        ushll2 v17.8h, v28.16b, #4
        umull v18.8h, v29.8b, v24.8b
        umull2 v19.8h, v29.16b, v24.16b
        ushll v20.8h, v30.8b, #4
        ushll2 v21.8h, v30.16b, #4
        umull v22.8h, v31.8b, v24.8b
        umull2 v23.8h, v31.16b, v24.16b
        ldp q28, q29, [x1, #64]
        ldp q30, q31, [x1, #96]
        add x1, x1, x2
        stp q16, q17, [x0]
        stp q18, q19, [x0, #32]
        stp q20, q21, [x0, #64]
        stp q22, q23, [x0, #96]
        ushll v16.8h, v28.8b, #4
        ushll2 v17.8h, v28.16b, #4
        umull v18.8h, v29.8b, v24.8b
        umull2 v19.8h, v29.16b, v24.16b
        ushll v20.8h, v30.8b, #4
        ushll2 v21.8h, v30.16b, #4
        umull v22.8h, v31.8b, v24.8b
        umull2 v23.8h, v31.16b, v24.16b
        subs w4, w4, #1
        stp q16, q17, [x0, #128]
        stp q18, q19, [x0, #160]
        stp q20, q21, [x0, #192]
        stp q22, q23, [x0, #224]
        add x0, x0, #256
        b.gt 128b
        ret
@@ -1063,8 +1098,8 @@ L(prep_tbl):
        .hword L(prep_tbl) - 640b
        .hword L(prep_tbl) - 320b
        .hword L(prep_tbl) - 160b
        .hword L(prep_tbl) -   8b
        .hword L(prep_tbl) -   4b
        .hword L(prep_tbl) -  80b
        .hword L(prep_tbl) -  40b
endfunc
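The prep paths widen each 8-bit pixel into a 16-bit intermediate scaled by 16, which is why the rewrite can mix ushll #4 (shift left by four) with umull against v24 = 16 (multiply): both compute px * 16. A hedged C model of the kernel, assuming packed, unstrided output as in dav1d's prep and 8 bpc input:

#include <stddef.h>
#include <stdint.h>

static void prep_block(int16_t *tmp, const uint8_t *src,
                       const ptrdiff_t src_stride, const int w, const int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++)
            tmp[x] = src[x] << 4;  /* == src[x] * 16 */
        tmp += w;                  /* output rows are packed back to back */
        src += src_stride;
    }
}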
(File diff not shown because of its large size.)
@@ -35,14 +35,14 @@
#define CNT 28
#define ALLOW_UPDATE_CDF 32

#define COEFFS_BASE_OFFSET 30
#define MASKS8_OFFSET (64-COEFFS_BASE_OFFSET)

const coeffs
        .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
        .short 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
endconst

const bits
        .short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
        .short 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000
        // masks8
        .short -0x202, -0x202, -0x202, -0x202, -0x202, -0x202, -0x202, 0xF0E
endconst

.macro ld1_n d0, d1, src, sz, n
@@ -96,13 +96,6 @@ endconst
.endif
.endm

.macro urhadd_n d0, d1, s0, s1, s2, s3, sz, n
        urhadd \d0\sz, \s0\sz, \s2\sz
.if \n == 16
        urhadd \d1\sz, \s1\sz, \s3\sz
.endif
.endm

.macro sshl_n d0, d1, s0, s1, s2, s3, sz, n
        sshl \d0\sz, \s0\sz, \s2\sz
.if \n == 16
@@ -129,93 +122,189 @@ endconst

function msac_decode_symbol_adapt4_neon, export=1
.macro decode_update sz, szb, n
.if \n == 16
        sub sp, sp, #48
.endif
        add x8, x0, #RNG
        ld1_n v0, v1, x1, \sz, \n               // cdf
        ld1r {v4\sz}, [x8]                      // rng
        movrel x9, coeffs, 30
        ld1r {v29\sz}, [x8]                     // rng
        movrel x9, coeffs, COEFFS_BASE_OFFSET
        movi v31\sz, #0x7f, lsl #8              // 0x7f00
        sub x9, x9, x2, lsl #1
        sub x10, x9, x2, lsl #1
        mvni v30\sz, #0x3f                      // 0xffc0
        and v7\szb, v4\szb, v31\szb             // rng & 0x7f00
        str h4, [sp, #14]                       // store original u = s->rng
        and v7\szb, v29\szb, v31\szb            // rng & 0x7f00
.if \n == 16
        str h29, [sp, #14]                      // store original u = s->rng
.endif
        and_n v2, v3, v0, v1, v30, v30, \szb, \n // cdf & 0xffc0

        ld1_n v4, v5, x9, \sz, \n               // EC_MIN_PROB * (n_symbols - ret)
        ld1_n v4, v5, x10, \sz, \n              // EC_MIN_PROB * (n_symbols - ret)
        sqdmulh_n v6, v7, v2, v3, v7, v7, \sz, \n // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
        add x8, x0, #DIF + 6
        ldr d28, [x0, #DIF]

        add_n v4, v5, v2, v3, v4, v5, \sz, \n   // v = cdf + EC_MIN_PROB * (n_symbols - ret)
        add_n v4, v5, v6, v7, v4, v5, \sz, \n   // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)

        ld1r {v6.8h}, [x8]                      // dif >> (EC_WIN_SIZE - 16)
        movrel x8, bits
        dup v30\sz, v28.h[3]                    // dif >> (EC_WIN_SIZE - 16)
.if \n == 8
        ldur q31, [x9, #MASKS8_OFFSET]
.elseif \n == 16
        str_n q4, q5, sp, #16, \n               // store v values to allow indexed access

        ld1_n v16, v17, x8, .8h, \n

        cmhs_n v2, v3, v6, v6, v4, v5, .8h, \n  // c >= v

        and_n v6, v7, v2, v3, v16, v17, .16b, \n // One bit per halfword set in the mask
.if \n == 16
        add v6.8h, v6.8h, v7.8h
.endif
        addv h6, v6.8h                          // Aggregate mask bits
        ldr w4, [x0, #ALLOW_UPDATE_CDF]
        umov w3, v6.h[0]
        rbit w3, w3
        clz w15, w3                             // ret

        cbz w4, L(renorm)
        // After the condition starts being true it continues, such that the vector looks like:
        // 0, 0, 0 ... -1, -1
        cmhs_n v2, v3, v30, v30, v4, v5, \sz, \n // c >= v
.if \n == 4
        ext v29\szb, v29\szb, v4\szb, #6        // u
        umov x15, v2.d[0]
        ldr w4, [x0, #ALLOW_UPDATE_CDF]
        rev x15, x15
        sub v29\sz, v29\sz, v4\sz               // rng = u-v
        // rev + clz = count trailing zeros
        clz x15, x15                            // 16*ret
.elseif \n == 8
        // The final short of the compare is always set.
        // Using addv, subtract -0x202*ret from this value to create a lookup table for a short.
        // For n == 8:
        // -0x202 + -0x202 + ... + 0xF0E
        // (0x202*7) | (1 << 8)
        //          ^-------offset for second byte of the short
        and v31\szb, v31\szb, v2\szb
        ext v29\szb, v29\szb, v4\szb, #14       // u
        addv h31, v31\sz                        // ((2*ret + 1) << 8) | (2*ret)
        ldr w4, [x0, #ALLOW_UPDATE_CDF]
        sub v30\sz, v30\sz, v4\sz               // (dif >> 48) - v
        smov w15, v31.b[0]                      // 2*ret
        sub v29\sz, v29\sz, v4\sz               // rng = u-v
.elseif \n == 16
        add v6\sz, v2\sz, v3\sz
        addv h31, v6\sz                         // -n + ret
        ldr w4, [x0, #ALLOW_UPDATE_CDF]
        smov w15, v31.h[0]
.endif

        cbz w4, 0f

        // update_cdf
        ldrh w3, [x1, x2, lsl #1]               // count = cdf[n_symbols]
        movi v5\szb, #0xff
.if \n == 16
        // 16 case has a lower bound that guarantees n_symbols > 2
        mov w4, #-5
.else
.elseif \n == 8
        mvn w14, w2
        mov w4, #-4
        cmn w14, #3                             // set C if n_symbols <= 2
.else
        // if n_symbols < 4 (or < 6 even) then
        // (1 + n_symbols) >> 2 == n_symbols > 2
        add w14, w2, #17                        // (1 + n_symbols) + (4 << 2)
.endif
        sub_n v16, v17, v0, v1, v2, v3, \sz, \n // cdf + (i >= val ? 1 : 0)
        orr v2\sz, #0x80, lsl #8
.if \n == 16
        orr v3\sz, #0x80, lsl #8
.endif
        urhadd_n v4, v5, v5, v5, v2, v3, \sz, \n // i >= val ? -1 : 32768
.if \n == 16
        sub w4, w4, w3, lsr #4                  // -((count >> 4) + 5)
.else
.elseif \n == 8
        lsr w14, w3, #4                         // count >> 4
        sbc w4, w4, w14                         // -((count >> 4) + (n_symbols > 2) + 4)
.else
        neg w4, w14, lsr #2                     // -((n_symbols > 2) + 4)
        sub w4, w4, w3, lsr #4                  // -((count >> 4) + (n_symbols > 2) + 4)
.endif
        sub_n v4, v5, v4, v5, v0, v1, \sz, \n   // (32768 - cdf[i]) or (-1 - cdf[i])
        sub_n v2, v3, v2, v3, v0, v1, \sz, \n   // (32768 - cdf[i]) or (-1 - cdf[i])
        dup v6\sz, w4                           // -rate

        sub w3, w3, w3, lsr #5                  // count - (count == 32)
        sub_n v0, v1, v0, v1, v2, v3, \sz, \n   // cdf + (i >= val ? 1 : 0)
        sshl_n v4, v5, v4, v5, v6, v6, \sz, \n  // ({32768,-1} - cdf[i]) >> rate
        sshl_n v2, v3, v2, v3, v6, v6, \sz, \n  // ({32768,-1} - cdf[i]) >> rate
        add w3, w3, #1                          // count + (count < 32)
        add_n v0, v1, v0, v1, v4, v5, \sz, \n   // cdf + (32768 - cdf[i]) >> rate
        add_n v0, v1, v16, v17, v2, v3, \sz, \n // cdf + (32768 - cdf[i]) >> rate
        st1_n v0, v1, x1, \sz, \n
        strh w3, [x1, x2, lsl #1]
.endm

        decode_update .4h, .8b, 4
0:
        // renorm
.if \n == 4
        ldr w6, [x0, #CNT]
        ldr x7, [x0, #DIF]
        mov x4, v29.d[0]                        // rng (packed)
        mov x3, v4.d[0]                         // v (packed)

L(renorm):
        add x8, sp, #16
        add x8, x8, w15, uxtw #1
        ldrh w3, [x8]                           // v
        ldurh w4, [x8, #-2]                     // u
        // Shift 'v'/'rng' for ret into the 16 least sig bits. There is
        // garbage in the remaining bits, but we can work around this.
        lsr x4, x4, x15                         // rng
        lsr x3, x3, x15                         // v
        lsl w5, w4, #16                         // rng << 16
        sub x7, x7, x3, lsl #48                 // dif - (v << 48)
        clz w5, w5                              // d = clz(rng << 16)
        lsl w4, w4, w5                          // rng << d
        subs w6, w6, w5                         // cnt -= d
        lsl x7, x7, x5                          // (dif - (v << 48)) << d
        strh w4, [x0, #RNG]
        b.lo 1f
        str w6, [x0, #CNT]
        str x7, [x0, #DIF]
        lsr w0, w15, #4
        ret
1:
        lsr w15, w15, #4
        b L(refill)
.elseif \n == 8
        ldr w6, [x0, #CNT]
        tbl v30.8b, {v30.16b}, v31.8b
        tbl v29.8b, {v29.16b}, v31.8b
        ins v28.h[3], v30.h[0]                  // dif - (v << 48)
        clz v0.4h, v29.4h                       // d = clz(rng)
        umov w5, v0.h[0]
        ushl v29.4h, v29.4h, v0.4h              // rng << d

        // The vec for clz(rng) is filled with garbage after the first short,
        // but ushl/sshl conveniently uses only the first byte for the shift
        // amount.
        ushl d28, d28, d0                       // (dif - (v << 48)) << d

        subs w6, w6, w5                         // cnt -= d
        str h29, [x0, #RNG]
        b.lo 1f
        str w6, [x0, #CNT]
        str d28, [x0, #DIF]
        lsr w0, w15, #1                         // ret
        ret
1:
        lsr w15, w15, #1                        // ret
        mov x7, v28.d[0]
        b L(refill)
.elseif \n == 16
        add x8, sp, w15, sxtw #1
        ldrh w3, [x8, #48]                      // v
        ldurh w4, [x8, #46]                     // u
        ldr w6, [x0, #CNT]
        ldr x7, [x0, #DIF]
        sub w4, w4, w3                          // rng = u - v
        clz w5, w4                              // clz(rng)
        eor w5, w5, #16                         // d = clz(rng) ^ 16
        sub x7, x7, x3, lsl #48                 // dif - (v << 48)
L(renorm2):
        lsl w4, w4, w5                          // rng << d
        subs w6, w6, w5                         // cnt -= d
        lsl x7, x7, x5                          // (dif - (v << 48)) << d
        str w4, [x0, #RNG]
        b.hs 4f
        add sp, sp, #48
        b.lo 1f
        str w6, [x0, #CNT]
        str x7, [x0, #DIF]
        add w0, w15, #\n                        // ret
        ret
1:
        add w15, w15, #\n                       // ret
        b L(refill)
.endif
.endm

        decode_update .4h, .8b, 4

L(refill):
        // refill
        ldp x3, x4, [x0]                        // BUF_POS, BUF_END
        add x5, x3, #8
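What decode_update vectorizes is the linear symbol search of the range decoder: derive a threshold v[i] from each CDF entry, then find the first i at which the current dif window c satisfies c >= v[i]. The NEON variants do all compares in one vector op and extract the index from the lane mask (rbit+clz for n == 4, the masks8/addv trick for n == 8, summed compares for n == 16). A hedged scalar sketch of that search follows; the v computation is simplified from the assembly comments and is an approximation, not dav1d's exact arithmetic:

#include <stdint.h>

enum { EC_MIN_PROB = 4 };  /* the spacing baked into the coeffs table above */

static unsigned select_symbol(const unsigned c, const unsigned rng,
                              const uint16_t *const cdf,
                              const unsigned n_symbols)
{
    unsigned ret = 0;
    while (ret < n_symbols) {
        /* roughly: ((cdf >> EC_PROB_SHIFT) * r) >> 1
         *          + EC_MIN_PROB * (n_symbols - ret) */
        const unsigned v = (((cdf[ret] >> 6) * (rng >> 8)) >> 1) +
                           EC_MIN_PROB * (n_symbols - ret);
        if (c >= v)   /* the per-lane compare done by the cmhs instructions */
            break;
        ret++;
    }
    return ret;
}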
@@ -243,7 +332,6 @@ L(renorm2):
        str x7, [x0, #DIF]

        mov w0, w15
        add sp, sp, #48
        ret

5: // pad_with_ones
@@ -272,29 +360,26 @@ endfunc

function msac_decode_symbol_adapt8_neon, export=1
        decode_update .8h, .16b, 8
        b L(renorm)
endfunc

function msac_decode_symbol_adapt16_neon, export=1
        decode_update .8h, .16b, 16
        b L(renorm)
endfunc

function msac_decode_hi_tok_neon, export=1
        ld1 {v0.4h}, [x1]                       // cdf
        add x16, x0, #RNG
        movi v31.4h, #0x7f, lsl #8              // 0x7f00
        movrel x17, coeffs, 30-2*3
        movrel x17, coeffs, COEFFS_BASE_OFFSET-2*3
        mvni v30.4h, #0x3f                      // 0xffc0
        ldrh w9, [x1, #6]                       // count = cdf[n_symbols]
        ld1r {v3.4h}, [x16]                     // rng
        ld1 {v29.4h}, [x17]                     // EC_MIN_PROB * (n_symbols - ret)
        add x17, x0, #DIF + 6
        mov w13, #-24
        mov w13, #-24*8
        and v17.8b, v0.8b, v30.8b               // cdf & 0xffc0
        ldr w10, [x0, #ALLOW_UPDATE_CDF]
        ld1r {v1.8h}, [x17]                     // dif >> (EC_WIN_SIZE - 16)
        sub sp, sp, #48
        ldr w6, [x0, #CNT]
        ldr x7, [x0, #DIF]
1:
@@ -302,14 +387,14 @@ function msac_decode_hi_tok_neon, export=1
        sqdmulh v6.4h, v17.4h, v7.4h            // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
        add v4.4h, v17.4h, v29.4h               // v = cdf + EC_MIN_PROB * (n_symbols - ret)
        add v4.4h, v6.4h, v4.4h                 // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
        str h3, [sp, #14]                       // store original u = s->rng
        cmhs v2.4h, v1.4h, v4.4h                // c >= v
        str q4, [sp, #16]                       // store v values to allow indexed access
        addv h6, v2.4h                          // -4 + ret
        add w13, w13, #5
        smov w15, v6.h[0]
        add x8, sp, #16
        add w15, w15, #4                        // ret
        add w13, w13, #5*8
        ext v18.8b, v3.8b, v4.8b, #6            // u
        umov x15, v2.d[0]
        rev x15, x15
        sub v18.4h, v18.4h, v4.4h               // rng = u-v
        // rev + clz = count trailing zeros
        clz x15, x15                            // 16*ret

        cbz w10, 2f
        // update_cdf
@@ -317,29 +402,32 @@ function msac_decode_hi_tok_neon, export=1
        mov w4, #-5
        orr v2.4h, #0x80, lsl #8                // i >= val ? -1 : 32768
        sub w4, w4, w9, lsr #4                  // -((count >> 4) + 5)
        sub v4.4h, v2.4h, v0.4h                 // (32768 - cdf[i]) or (-1 - cdf[i])
        sub v2.4h, v2.4h, v0.4h                 // (32768 - cdf[i]) or (-1 - cdf[i])
        dup v6.4h, w4                           // -rate

        sub w9, w9, w9, lsr #5                  // count - (count == 32)
        sshl v4.4h, v4.4h, v6.4h                // ({32768,-1} - cdf[i]) >> rate
        sshl v2.4h, v2.4h, v6.4h                // ({32768,-1} - cdf[i]) >> rate
        add w9, w9, #1                          // count + (count < 32)
        add v0.4h, v5.4h, v4.4h                 // cdf[i] + (32768 - cdf[i]) >> rate
        add v0.4h, v5.4h, v2.4h                 // cdf[i] + (32768 - cdf[i]) >> rate
        st1 {v0.4h}, [x1]
        and v17.8b, v0.8b, v30.8b               // cdf & 0xffc0
        strh w9, [x1, #6]

2:
        add x8, x8, w15, uxtw #1
        ldrh w3, [x8]                           // v
        ldurh w4, [x8, #-2]                     // u
        sub w4, w4, w3                          // rng = u - v
        clz w5, w4                              // clz(rng)
        eor w5, w5, #16                         // d = clz(rng) ^ 16
        mov x4, v18.d[0]                        // rng (packed)
        mov x3, v4.d[0]                         // v (packed)

        // Shift 'v'/'rng' for ret into the 16 least sig bits. There is
        // garbage in the remaining bits, but we can work around this.
        lsr x4, x4, x15                         // rng
        lsr x3, x3, x15                         // v
        lsl w5, w4, #16                         // rng << 16
        sub x7, x7, x3, lsl #48                 // dif - (v << 48)
        clz w5, w5                              // d = clz(rng << 16)
        lsl w4, w4, w5                          // rng << d
        subs w6, w6, w5                         // cnt -= d
        lsl x7, x7, x5                          // (dif - (v << 48)) << d
        str w4, [x0, #RNG]
        strh w4, [x0, #RNG]
        dup v3.4h, w4
        b.hs 5f
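The update_cdf block above is the vector form of dav1d's scalar CDF adaptation. Here is a C sketch of that logic, matching the expressions in the assembly comments (rate = 4 + (count >> 4) + (n_symbols > 2), counter saturating at 32); treat it as a model rather than the exact shipped function:

#include <stdint.h>

static void update_cdf_model(uint16_t *const cdf, const unsigned val,
                             const unsigned n_symbols)
{
    const unsigned count = cdf[n_symbols];           /* adaptation counter */
    const unsigned rate = 4 + (count >> 4) + (n_symbols > 2);
    for (unsigned i = 0; i < n_symbols - 1; i++) {
        if (i < val)
            cdf[i] += (32768 - cdf[i]) >> rate;      /* pull toward 32768 */
        else
            cdf[i] -= cdf[i] >> rate;                /* pull toward 0 */
    }
    cdf[n_symbols] = count + (count < 32);           /* count + (count < 32) */
}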
@@ -366,17 +454,15 @@ function msac_decode_hi_tok_neon, export=1
        orr x7, x7, x8                          // dif |= next_bits

5: // end
        lsl w15, w15, #1
        sub w15, w15, #5
        sub w15, w15, #5*8
        lsr x12, x7, #48
        adds w13, w13, w15                      // carry = tok_br < 3 || tok == 15
        dup v1.8h, w12
        b.cc 1b                                 // loop if !carry
        add w13, w13, #30
        add w13, w13, #30*8
        str w6, [x0, #CNT]
        add sp, sp, #48
        str x7, [x0, #DIF]
        lsr w0, w13, #1
        lsr w0, w13, #4
        ret

6: // pad_with_ones
@@ -405,7 +491,6 @@ endfunc

function msac_decode_bool_equi_neon, export=1
        ldp w5, w6, [x0, #RNG]                  // + CNT
        sub sp, sp, #48
        ldr x7, [x0, #DIF]
        bic w4, w5, #0xff                       // r &= 0xff00
        add w4, w4, #8
@@ -418,12 +503,20 @@ function msac_decode_bool_equi_neon, export=1

        clz w5, w4                              // clz(rng)
        eor w5, w5, #16                         // d = clz(rng) ^ 16
        b L(renorm2)
        lsl w4, w4, w5                          // rng << d
        subs w6, w6, w5                         // cnt -= d
        lsl x7, x7, x5                          // (dif - (v << 48)) << d
        str w4, [x0, #RNG]
        b.lo L(refill)

        str w6, [x0, #CNT]
        str x7, [x0, #DIF]
        mov w0, w15
        ret
endfunc

function msac_decode_bool_neon, export=1
        ldp w5, w6, [x0, #RNG]                  // + CNT
        sub sp, sp, #48
        ldr x7, [x0, #DIF]
        lsr w4, w5, #8                          // r >> 8
        bic w1, w1, #0x3f                       // f &= ~63
@@ -438,13 +531,21 @@ function msac_decode_bool_neon, export=1

        clz w5, w4                              // clz(rng)
        eor w5, w5, #16                         // d = clz(rng) ^ 16
        b L(renorm2)
        lsl w4, w4, w5                          // rng << d
        subs w6, w6, w5                         // cnt -= d
        lsl x7, x7, x5                          // (dif - (v << 48)) << d
        str w4, [x0, #RNG]
        b.lo L(refill)

        str w6, [x0, #CNT]
        str x7, [x0, #DIF]
        mov w0, w15
        ret
endfunc

function msac_decode_bool_adapt_neon, export=1
        ldr w9, [x1]                            // cdf[0-1]
        ldp w5, w6, [x0, #RNG]                  // + CNT
        sub sp, sp, #48
        ldr x7, [x0, #DIF]
        lsr w4, w5, #8                          // r >> 8
        and w2, w9, #0xffc0                     // f &= ~63
@@ -462,7 +563,7 @@ function msac_decode_bool_adapt_neon, export=1
        clz w5, w4                              // clz(rng)
        eor w5, w5, #16                         // d = clz(rng) ^ 16

        cbz w10, L(renorm2)
        cbz w10, 1f

        lsr w2, w9, #16                         // count = cdf[1]
        and w9, w9, #0xffff                     // cdf[0]
@@ -480,5 +581,15 @@ function msac_decode_bool_adapt_neon, export=1
        strh w9, [x1]
        strh w10, [x1, #2]

        b L(renorm2)
1:
        lsl w4, w4, w5                          // rng << d
        subs w6, w6, w5                         // cnt -= d
        lsl x7, x7, x5                          // (dif - (v << 48)) << d
        str w4, [x0, #RNG]
        b.lo L(refill)

        str w6, [x0, #CNT]
        str x7, [x0, #DIF]
        mov w0, w15
        ret
endfunc
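With L(renorm2) gone, each bool decoder above now carries its own inlined copy of the renormalization step. In C, the logic they share looks roughly like this (a sketch; field names follow the RNG/DIF/CNT offsets used by the assembly, and the refill path is elided):

#include <stdint.h>

typedef struct {
    uint64_t dif;   /* bitstream window, MSB-aligned */
    unsigned rng;   /* current range, a 16-bit value */
    int      cnt;   /* bits available in dif before a refill */
} MsacModel;

static void renorm_model(MsacModel *const s, const uint64_t dif,
                         const unsigned rng)
{
    const int d = __builtin_clz(rng) ^ 16;  /* leading zeros within 16 bits */
    s->rng = rng << d;                      /* rng << d */
    s->dif = dif << d;                      /* (dif - (v << 48)) << d */
    s->cnt -= d;                            /* cnt -= d */
    /* if (s->cnt < 0), refill dif from the buffer -- the L(refill) path */
}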
@@ -62,6 +62,7 @@

decl_8tap_fns(neon);
decl_8tap_fns(neon_dotprod);
decl_8tap_fns(neon_i8mm);

decl_mc_fn(BF(dav1d_put_bilin, neon));
decl_mct_fn(BF(dav1d_prep_bilin, neon));
@@ -109,11 +110,17 @@ static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) {
    c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon);
    c->emu_edge = BF(dav1d_emu_edge, neon);

#if ARCH_AARCH64
#if HAVE_DOTPROD && BITDEPTH == 8
#if ARCH_AARCH64 && BITDEPTH == 8
#if HAVE_DOTPROD
    if (!(flags & DAV1D_ARM_CPU_FLAG_DOTPROD)) return;

    init_8tap_fns(neon_dotprod);
#endif // HAVE_DOTPROD && BITDEPTH == 8
#endif // ARCH_AARCH64
#endif // HAVE_DOTPROD

#if HAVE_I8MM
    if (!(flags & DAV1D_ARM_CPU_FLAG_I8MM)) return;

    init_8tap_fns(neon_i8mm);
#endif // HAVE_I8MM
#endif // ARCH_AARCH64 && BITDEPTH == 8
}
@@ -82,6 +82,9 @@ static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
#if defined(__VSX__)
    flags |= DAV1D_PPC_CPU_FLAG_VSX;
#endif
#if defined(__POWER9_VECTOR__)
    flags |= DAV1D_PPC_CPU_FLAG_PWR9;
#endif
#elif ARCH_RISCV
#if defined(__riscv_v)
    flags |= DAV1D_RISCV_CPU_FLAG_V;
@@ -1162,7 +1162,7 @@ static int decode_b(Dav1dTaskContext *const t,
                                               ts->cdf.m.use_filter_intra[bs]);
        if (is_filter) {
            b->y_mode = FILTER_PRED;
            b->y_angle = dav1d_msac_decode_symbol_adapt4(&ts->msac,
            b->y_angle = dav1d_msac_decode_symbol_adapt8(&ts->msac,
                             ts->cdf.m.filter_intra, 4);
        }
        if (DEBUG_BLOCK_INFO)
@@ -232,7 +232,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%elif PIC
    call $+5 ; special-cased to not affect the RSB on most CPU:s
    pop %1
    add %1, (%2)-$+1
    add %1, -$+1+%2
%else
    mov %1, %2
%endif
@@ -864,16 +864,16 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,

%macro cextern 1
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    CAT_XDEFINE cglobaled_, %1, 1
    CAT_XDEFINE cglobaled_, %1, 2
    extern %1
%endmacro

; like cextern, but without the prefix
; Like cextern, but without the prefix. This should be used for symbols from external libraries.
%macro cextern_naked 1
    %ifdef PREFIX
        %xdefine %1 mangle(%1)
    %endif
    CAT_XDEFINE cglobaled_, %1, 1
    CAT_XDEFINE cglobaled_, %1, 3
    extern %1
%endmacro
@@ -1268,12 +1268,27 @@ INIT_XMM
%endmacro
%macro call_internal 2
    %xdefine %%i %2
    %define %%j %%i
    %ifndef cglobaled_%2
        %ifdef cglobaled_%1
            %xdefine %%i %1
        %endif
    %elif FORMAT_ELF
        %if ARCH_X86_64
            %if cglobaled_%2 >= 2
                ; Always emit PLT relocations when calling external functions,
                ; the linker will eliminate unnecessary PLT indirections anyway.
                %define %%j %%i wrt ..plt
            %endif
        %elif PIC && cglobaled_%2 == 3
            ; Go through the GOT for functions declared using cextern_naked with
            ; PIC, as such functions presumably exists in external libraries.
            extern _GLOBAL_OFFSET_TABLE_
            LEA eax, $$+_GLOBAL_OFFSET_TABLE_ wrt ..gotpc
            %define %%j [eax+%%i wrt ..got]
        %endif
    %endif
    call %%i
    call %%j
    LOAD_MM_PERMUTATION %%i
%endmacro
@@ -263,7 +263,6 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
        f->c = c;
        f->task_thread.ttd = &c->task_thread;
        f->lf.last_sharpness = -1;
        dav1d_refmvs_init(&f->rf);
    }

    for (unsigned m = 0; m < c->n_tc; m++) {
@@ -664,7 +663,7 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
        dav1d_free(f->lf.lr_mask);
        dav1d_free(f->lf.tx_lpf_right_edge[0]);
        dav1d_free(f->lf.start_of_tile_row);
        dav1d_refmvs_clear(&f->rf);
        dav1d_free_aligned(f->rf.r);
        dav1d_free_aligned(f->lf.cdef_line_buf);
        dav1d_free_aligned(f->lf.lr_line_buf);
    }
@@ -249,6 +249,8 @@ static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride,
#include "src/arm/loopfilter.h"
#elif ARCH_LOONGARCH64
#include "src/loongarch/loopfilter.h"
#elif ARCH_PPC64LE
#include "src/ppc/loopfilter.h"
#elif ARCH_X86
#include "src/x86/loopfilter.h"
#endif
@@ -265,6 +267,8 @@ COLD void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c)
    loop_filter_dsp_init_arm(c);
#elif ARCH_LOONGARCH64
    loop_filter_dsp_init_loongarch(c);
#elif ARCH_PPC64LE
    loop_filter_dsp_init_ppc(c);
#elif ARCH_X86
    loop_filter_dsp_init_x86(c);
#endif
@@ -73,14 +73,14 @@ libdav1d_tmpl_sources = files(
    'recon_tmpl.c',
)

libdav1d_arch_tmpl_sources = []
libdav1d_arch_tmpl_sources = {}

libdav1d_bitdepth_objs = []

# ASM specific sources
libdav1d_asm_objs = []
# Arch-specific flags
arch_flags = []
arch_flags = {}
if is_asm_enabled
    if (host_machine.cpu_family() == 'aarch64' or
        host_machine.cpu_family().startswith('arm'))
@@ -232,9 +232,9 @@ if is_asm_enabled
            'loongarch/cpu.c',
        )

        libdav1d_arch_tmpl_sources += files(
        libdav1d_arch_tmpl_sources += {'default': files(
            'loongarch/looprestoration_tmpl.c',
        )
        )}

        libdav1d_sources_asm = files(
            'loongarch/mc.S',
@@ -246,20 +246,25 @@ if is_asm_enabled
        )
        libdav1d_asm_objs += libdav1d_sources_asm
    elif host_machine.cpu() == 'ppc64le'
        arch_flags = ['-maltivec', '-mvsx']
        arch_flags += {'vsx': ['-maltivec', '-mvsx', '-DDAV1D_VSX']}
        libdav1d_sources += files(
            'ppc/cpu.c',
        )
        libdav1d_arch_tmpl_sources += files(
        libdav1d_arch_tmpl_sources += {'vsx': files(
            'ppc/cdef_tmpl.c',
            'ppc/looprestoration_tmpl.c',
        )
        )}
        arch_flags += {'pwr9': ['-mcpu=power9', '-DDAV1D_PWR9']}
        libdav1d_arch_tmpl_sources += {'pwr9': files(
            'ppc/loopfilter_tmpl.c',
        )}
    elif host_machine.cpu_family().startswith('riscv')
        libdav1d_sources += files(
            'riscv/cpu.c',
        )
        if host_machine.cpu_family() == 'riscv64'
            libdav1d_sources += files(
                'riscv/64/cpu.S',
                'riscv/64/itx.S',
            )
        endif
@@ -320,15 +325,17 @@ endforeach

# Helper library for each bitdepth and architecture-specific flags
foreach bitdepth : dav1d_bitdepths
    libdav1d_bitdepth_objs += static_library(
        'dav1d_arch_bitdepth_@0@'.format(bitdepth),
        libdav1d_arch_tmpl_sources, config_h_target,
        include_directories: dav1d_inc_dirs,
        dependencies : [stdatomic_dependencies],
        c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + libdav1d_flags + arch_flags,
        install : false,
        build_by_default : false,
    ).extract_all_objects(recursive: true)
    foreach subarch : libdav1d_arch_tmpl_sources.keys()
        libdav1d_bitdepth_objs += static_library(
            'dav1d_arch_bitdepth_@0@_@1@'.format(bitdepth,subarch),
            libdav1d_arch_tmpl_sources[subarch], config_h_target,
            include_directories: dav1d_inc_dirs,
            dependencies : [stdatomic_dependencies],
            c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + libdav1d_flags + arch_flags.get(subarch, []),
            install : false,
            build_by_default : false,
        ).extract_all_objects(recursive: true)
    endforeach
endforeach

# The final dav1d library
@@ -68,7 +68,7 @@ unsigned dav1d_msac_decode_bool_c(MsacContext *s, unsigned f);
unsigned dav1d_msac_decode_hi_tok_c(MsacContext *s, uint16_t *cdf);
int dav1d_msac_decode_subexp(MsacContext *s, int ref, int n, unsigned k);

/* Supported n_symbols ranges: adapt4: 1-4, adapt8: 1-7, adapt16: 3-15 */
/* Supported n_symbols ranges: adapt4: 1-3, adapt8: 1-7, adapt16: 3-15 */
#ifndef dav1d_msac_decode_symbol_adapt4
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt_c
#endif
@@ -40,12 +40,16 @@ COLD unsigned dav1d_get_cpu_flags_ppc(void) {
    unsigned flags = 0;
#if defined(HAVE_GETAUXVAL) && ARCH_PPC64LE
    unsigned long hw_cap = getauxval(AT_HWCAP);
    unsigned long hw_cap2 = getauxval(AT_HWCAP2);
#elif defined(HAVE_ELF_AUX_INFO) && ARCH_PPC64LE
    unsigned long hw_cap = 0;
    unsigned long hw_cap2 = 0;
    elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap));
    elf_aux_info(AT_HWCAP2, &hw_cap2, sizeof(hw_cap2));
#endif
#ifdef HAVE_AUX
    flags |= (hw_cap & PPC_FEATURE_HAS_VSX) ? DAV1D_PPC_CPU_FLAG_VSX : 0;
    flags |= (hw_cap2 & PPC_FEATURE2_ARCH_3_00) ? DAV1D_PPC_CPU_FLAG_PWR9 : 0;
#endif
    return flags;
}
@@ -30,6 +30,7 @@

enum CpuFlags {
    DAV1D_PPC_CPU_FLAG_VSX = 1 << 0,
    DAV1D_PPC_CPU_FLAG_PWR9 = 1 << 1,
};

unsigned dav1d_get_cpu_flags_ppc(void);
@@ -44,6 +44,10 @@
#define i64x2 vector signed long long
#define b64x2 vector bool long long

#define i8h_to_i16(v) ((i16x8) vec_unpackh((i8x16)v))
#define i8l_to_i16(v) ((i16x8) vec_unpackl((i8x16)v))
#define u8h_to_i16(v) ((i16x8) vec_mergeh((u8x16) v, vec_splat_u8(0)))
#define u8l_to_i16(v) ((i16x8) vec_mergel((u8x16) v, vec_splat_u8(0)))
#define u8h_to_u16(v) ((u16x8) vec_mergeh((u8x16) v, vec_splat_u8(0)))
#define u8l_to_u16(v) ((u16x8) vec_mergel((u8x16) v, vec_splat_u8(0)))
#define u16h_to_i32(v) ((i32x4) vec_mergeh((u16x8) v, vec_splat_u16(0)))
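A hedged usage sketch for these widening helpers (a hypothetical standalone file; in-tree, the u8x16/i16x8 typedefs come from this header): merging a byte vector with a zero vector yields its zero-extension to 16 bits, half the lanes per merge, on the little-endian PPC64 targets this code is built for.

#include <altivec.h>

typedef vector unsigned char u8x16;  /* mirrors the typedefs above */
typedef vector signed short  i16x8;

#define u8h_to_i16_(v) ((i16x8) vec_mergeh((u8x16)(v), vec_splat_u8(0)))
#define u8l_to_i16_(v) ((i16x8) vec_mergel((u8x16)(v), vec_splat_u8(0)))

/* widen one half of 16 unsigned bytes to eight 16-bit lanes each */
i16x8 widen_half1(const u8x16 px) { return u8h_to_i16_(px); }
i16x8 widen_half2(const u8x16 px) { return u8l_to_i16_(px); }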
@@ -0,0 +1,47 @@
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/cpu.h"
#include "src/loopfilter.h"

decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, pwr9));
decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, pwr9));
decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, pwr9));
decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, pwr9));

static ALWAYS_INLINE void loop_filter_dsp_init_ppc(Dav1dLoopFilterDSPContext *const c) {
    const unsigned flags = dav1d_get_cpu_flags();

    if (!(flags & DAV1D_PPC_CPU_FLAG_PWR9)) return;

#if BITDEPTH == 8
    c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, pwr9);
    c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, pwr9);
    c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, pwr9);
    c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, pwr9);
#endif
}
(File diff not shown because of its large size.)
@@ -369,7 +369,7 @@ static int decode_coefs(Dav1dTaskContext *const t,
        const enum IntraPredMode y_mode_nofilt = b->y_mode == FILTER_PRED ?
            dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode;
        if (f->frame_hdr->reduced_txtp_set || t_dim->min == TX_16X16) {
            idx = dav1d_msac_decode_symbol_adapt4(&ts->msac,
            idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
                      ts->cdf.m.txtp_intra2[t_dim->min][y_mode_nofilt], 4);
            *txtp = dav1d_tx_types_per_set[idx + 0];
        } else {
@@ -412,7 +412,7 @@ static int decode_coefs(Dav1dTaskContext *const t,
        eob_bin = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 4 + sz); \
        break; \
    }
    case_sz(0,  16,  4, [is_1d]);
    case_sz(0,  16,  8, [is_1d]);
    case_sz(1,  32,  8, [is_1d]);
    case_sz(2,  64,  8, [is_1d]);
    case_sz(3, 128,  8, [is_1d]);
@@ -657,19 +657,19 @@ void dav1d_refmvs_tile_sbrow_init(refmvs_tile *const rt, const refmvs_frame *con
{
    if (rf->n_tile_threads == 1) tile_row_idx = 0;
    rt->rp_proj = &rf->rp_proj[16 * rf->rp_stride * tile_row_idx];
    const int uses_2pass = rf->n_tile_threads > 1 && rf->n_frame_threads > 1;
    const ptrdiff_t pass_off = (uses_2pass && pass == 2) ?
        35 * rf->r_stride * rf->n_tile_rows : 0;
    refmvs_block *r = &rf->r[35 * rf->r_stride * tile_row_idx + pass_off];
    const ptrdiff_t r_stride = rf->rp_stride * 2;
    const ptrdiff_t pass_off = (rf->n_frame_threads > 1 && pass == 2) ?
        35 * 2 * rf->n_blocks : 0;
    refmvs_block *r = &rf->r[35 * r_stride * tile_row_idx + pass_off];
    const int sbsz = rf->sbsz;
    const int off = (sbsz * sby) & 16;
    for (int i = 0; i < sbsz; i++, r += rf->r_stride)
    for (int i = 0; i < sbsz; i++, r += r_stride)
        rt->r[off + 5 + i] = r;
    rt->r[off + 0] = r;
    r += rf->r_stride;
    r += r_stride;
    rt->r[off + 1] = NULL;
    rt->r[off + 2] = r;
    r += rf->r_stride;
    r += r_stride;
    rt->r[off + 3] = NULL;
    rt->r[off + 4] = r;
    if (sby & 1) {
@@ -805,37 +805,37 @@ int dav1d_refmvs_init_frame(refmvs_frame *const rf,
                            /*const*/ refmvs_temporal_block *const rp_ref[7],
                            const int n_tile_threads, const int n_frame_threads)
{
    const int rp_stride = ((frm_hdr->width[0] + 127) & ~127) >> 3;
    const int n_tile_rows = n_tile_threads > 1 ? frm_hdr->tiling.rows : 1;
    const int n_blocks = rp_stride * n_tile_rows;

    rf->sbsz = 16 << seq_hdr->sb128;
    rf->frm_hdr = frm_hdr;
    rf->iw8 = (frm_hdr->width[0] + 7) >> 3;
    rf->ih8 = (frm_hdr->height + 7) >> 3;
    rf->iw4 = rf->iw8 << 1;
    rf->ih4 = rf->ih8 << 1;

    const ptrdiff_t r_stride = ((frm_hdr->width[0] + 127) & ~127) >> 2;
    const int n_tile_rows = n_tile_threads > 1 ? frm_hdr->tiling.rows : 1;
    if (r_stride != rf->r_stride || n_tile_rows != rf->n_tile_rows) {
        if (rf->r) dav1d_freep_aligned(&rf->r);
        const int uses_2pass = n_tile_threads > 1 && n_frame_threads > 1;
        /* sizeof(refmvs_block) == 12 but it's accessed using 16-byte loads in asm,
         * so add 4 bytes of padding to avoid buffer overreads. */
        rf->r = dav1d_alloc_aligned(ALLOC_REFMVS, sizeof(*rf->r) * 35 * r_stride * n_tile_rows * (1 + uses_2pass) + 4, 64);
        if (!rf->r) return DAV1D_ERR(ENOMEM);
        rf->r_stride = r_stride;
    }

    const ptrdiff_t rp_stride = r_stride >> 1;
    if (rp_stride != rf->rp_stride || n_tile_rows != rf->n_tile_rows) {
        if (rf->rp_proj) dav1d_freep_aligned(&rf->rp_proj);
        rf->rp_proj = dav1d_alloc_aligned(ALLOC_REFMVS, sizeof(*rf->rp_proj) * 16 * rp_stride * n_tile_rows, 64);
        if (!rf->rp_proj) return DAV1D_ERR(ENOMEM);
        rf->rp_stride = rp_stride;
    }
    rf->n_tile_rows = n_tile_rows;
    rf->rp = rp;
    rf->rp_stride = rp_stride;
    rf->n_tile_threads = n_tile_threads;
    rf->n_frame_threads = n_frame_threads;
    rf->rp = rp;
    rf->rp_ref = rp_ref;

    if (n_blocks != rf->n_blocks) {
        const size_t r_sz = sizeof(*rf->r) * 35 * 2 * n_blocks * (1 + (n_frame_threads > 1));
        const size_t rp_proj_sz = sizeof(*rf->rp_proj) * 16 * n_blocks;
        /* Note that sizeof(*rf->r) == 12, but it's accessed using 16-byte unaligned
         * loads in save_tmvs() asm which can overread 4 bytes into rp_proj. */
        dav1d_free_aligned(rf->r);
        rf->r = dav1d_alloc_aligned(ALLOC_REFMVS, r_sz + rp_proj_sz, 64);
        if (!rf->r) {
            rf->n_blocks = 0;
            return DAV1D_ERR(ENOMEM);
        }

        rf->rp_proj = (refmvs_temporal_block*)((uintptr_t)rf->r + r_sz);
        rf->n_blocks = n_blocks;
    }

    const unsigned poc = frm_hdr->frame_offset;
    for (int i = 0; i < 7; i++) {
        const int poc_diff = get_poc_diff(seq_hdr->order_hint_n_bits,
@@ -848,6 +848,7 @@ int dav1d_refmvs_init_frame(refmvs_frame *const rf,

    // temporal MV setup
    rf->n_mfmvs = 0;
    rf->rp_ref = rp_ref;
    if (frm_hdr->use_ref_frame_mvs && seq_hdr->order_hint_n_bits) {
        int total = 2;
        if (rp_ref[0] && ref_ref_poc[0][6] != ref_poc[3] /* alt-of-last != gold */) {
@@ -896,18 +897,6 @@ int dav1d_refmvs_init_frame(refmvs_frame *const rf,
    return 0;
}

void dav1d_refmvs_init(refmvs_frame *const rf) {
    rf->r = NULL;
    rf->r_stride = 0;
    rf->rp_proj = NULL;
    rf->rp_stride = 0;
}

void dav1d_refmvs_clear(refmvs_frame *const rf) {
    if (rf->r) dav1d_freep_aligned(&rf->r);
    if (rf->rp_proj) dav1d_freep_aligned(&rf->rp_proj);
}

static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv,
                       const int bx4, const int bw4, int bh4)
{
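The removal of dav1d_refmvs_init/clear pairs with the new single allocation in dav1d_refmvs_init_frame: r and rp_proj now share one block sized from n_blocks, and since save_tmvs() reads refmvs_block entries with 16-byte loads even though sizeof(refmvs_block) == 12, placing rp_proj directly behind r turns the former overread padding into valid memory. A standalone C sketch of the pattern, with stand-in types and aligned_alloc in place of dav1d's allocator:

#include <stdint.h>
#include <stdlib.h>

typedef struct { uint8_t b[12]; } block_t;  /* stand-in for refmvs_block */
typedef struct { uint8_t b[8];  } tmp_t;    /* stand-in for refmvs_temporal_block */

static int alloc_refmvs(block_t **const r, tmp_t **const rp_proj,
                        const int n_blocks, const int two_pass)
{
    const size_t r_sz  = sizeof(**r) * 35 * 2 * (size_t)n_blocks * (1 + two_pass);
    const size_t rp_sz = sizeof(**rp_proj) * 16 * (size_t)n_blocks;
    /* one 64-byte-aligned block; size rounded up to the alignment for C11 */
    void *const buf = aligned_alloc(64, (r_sz + rp_sz + 63) & ~(size_t)63);
    if (!buf) return -1;
    *r = buf;
    /* rp_proj lives right after r[], absorbing the asm's 4-byte overread */
    *rp_proj = (tmp_t *)((uintptr_t)buf + r_sz);
    return 0;
}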
@@ -72,14 +72,14 @@ typedef struct refmvs_frame {
    int mfmv_ref2ref[3][7];
    int n_mfmvs;

    int n_blocks;
    refmvs_temporal_block *rp;
    /*const*/ refmvs_temporal_block *const *rp_ref;
    refmvs_temporal_block *rp_proj;
    ptrdiff_t rp_stride;

    refmvs_block *r; // 35 x r_stride memory
    ptrdiff_t r_stride;
    int n_tile_rows, n_tile_threads, n_frame_threads;
    int n_tile_threads, n_frame_threads;
} refmvs_frame;

typedef struct refmvs_tile {
@@ -121,10 +121,6 @@ typedef struct Dav1dRefmvsDSPContext {
    splat_mv_fn splat_mv;
} Dav1dRefmvsDSPContext;

// call once per frame thread
void dav1d_refmvs_init(refmvs_frame *rf);
void dav1d_refmvs_clear(refmvs_frame *rf);

// call once per frame
int dav1d_refmvs_init_frame(refmvs_frame *rf,
                            const Dav1dSequenceHeader *seq_hdr,
@@ -0,0 +1,44 @@
/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2024, Nathan Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include "src/riscv/asm.S"

// This function detects non-compliant RVV 0.7.1 hardware which reports support
// for the V extension through HWCAP, by intentionally setting tail and mask
// agnostic vector configurations that were only introduced in RVV 0.9 spec.
// Existing non-compliant (pre RVV 1.0) hardware will set the VILL bit in VTYPE
// (indicating an illegal vector configuration) which is stored in the XLEN-1
// bit position, thus a simple sign check is sufficient for detection.
// This work around is inexpensive and harmless on compliant hardware, but we
// should still consider removing it once all non-compliant RVV 0.7.1 hardware
// is out of service.
function has_compliant_rvv, export=1, ext=v
        vsetvli t0, zero, e8, m1, ta, ma
        csrr a0, vtype
        sgtz a0, a0
        ret
endfunc
@@ -38,11 +38,13 @@

#endif

int dav1d_has_compliant_rvv(void);

COLD unsigned dav1d_get_cpu_flags_riscv(void) {
    unsigned flags = 0;
#if defined(HAVE_GETAUXVAL)
    unsigned long hw_cap = getauxval(AT_HWCAP);
    flags |= (hw_cap & HWCAP_RVV) ? DAV1D_RISCV_CPU_FLAG_V : 0;
    flags |= (hw_cap & HWCAP_RVV) && dav1d_has_compliant_rvv() ? DAV1D_RISCV_CPU_FLAG_V : 0;
#endif

    return flags;
@@ -29,7 +29,6 @@
#include "src/mc.h"

#define decl_fn(type, name) \
    decl_##type##_fn(BF(name, sse2)); \
    decl_##type##_fn(BF(name, ssse3)); \
    decl_##type##_fn(BF(name, avx2)); \
    decl_##type##_fn(BF(name, avx512icl));
@@ -108,25 +107,6 @@ decl_fn(resize, dav1d_resize);
static ALWAYS_INLINE void mc_dsp_init_x86(Dav1dMCDSPContext *const c) {
    const unsigned flags = dav1d_get_cpu_flags();

    if(!(flags & DAV1D_X86_CPU_FLAG_SSE2))
        return;

#if BITDEPTH == 8
    init_mct_fn(FILTER_2D_BILINEAR,            bilin,               sse2);
    init_mct_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        sse2);
    init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, sse2);
    init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  sse2);
    init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, sse2);
    init_mct_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         sse2);
    init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   sse2);
    init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  sse2);
    init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   sse2);
    init_mct_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          sse2);

    c->warp8x8 = BF(dav1d_warp_affine_8x8, sse2);
    c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse2);
#endif

    if(!(flags & DAV1D_X86_CPU_FLAG_SSSE3))
        return;
(Two further file diffs not shown because of their large size.)
@@ -92,6 +92,31 @@ JMP_TABLE splat_mv_avx2, 1, 2, 4, 8, 16, 32

JMP_TABLE splat_mv_sse2, 1, 2, 4, 8, 16, 32

struc rf
    .frm_hdr:         resq 1
    .iw4:             resd 1
    .ih4:             resd 1
    .iw8:             resd 1
    .ih8:             resd 1
    .sbsz:            resd 1
    .use_rf_mvs:      resd 1
    .sign_bias:       resb 7
    .mfmv_sign:       resb 7
    .pocdiff:         resb 7
    .mfmv_ref:        resb 3
    .mfmv_ref2cur:    resd 3
    .mfmv_ref2ref:    resd 3*7
    .n_mfmvs:         resd 1
    .n_blocks:        resd 1
    .rp:              resq 1
    .rp_ref:          resq 1
    .rp_proj:         resq 1
    .rp_stride:       resq 1
    .r:               resq 1
    .n_tile_threads:  resd 1
    .n_frame_threads: resd 1
endstruc

SECTION .text

%macro movif32 2
@@ -341,16 +366,16 @@ cglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \
                                    stride, rp_proj, roff, troff, \
                                    xendi, xstarti, iw8, ih8, dst
    xor r14d, r14d
    cmp dword [rfq+212], 1 ; n_tile_threads
    mov ih8d, [rfq+20] ; rf->ih8
    mov iw8d, [rfq+16] ; rf->iw8
    cmp dword [rfq+rf.n_tile_threads], 1
    mov ih8d, [rfq+rf.ih8]
    mov iw8d, [rfq+rf.iw8]
    mov xstartd, xstartd
    mov xendd, xendd
    cmove tridxd, r14d
    lea xstartid, [xstartq-8]
    lea xendid, [xendq+8]
    mov strideq, [rfq+184]
    mov rp_projq, [rfq+176]
    mov strideq, [rfq+rf.rp_stride]
    mov rp_projq, [rfq+rf.rp_proj]
    cmp ih8d, yendd
    mov [rsp+0x30], strideq
    cmovs yendd, ih8d
@@ -397,7 +422,7 @@ cglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \
    jg .init_xloop_start
    DEFINE_ARGS rf, _, xstart, xend, ystart, yend, n7, stride, \
                _, _, xendi, xstarti, stride5, _, n
    mov r13d, [rfq+152] ; rf->n_mfmvs
    mov r13d, [rfq+rf.n_mfmvs]
    test r13d, r13d
    jz .ret
    mov [rsp+0x0c], r13d
@@ -418,14 +443,14 @@ cglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \
    DEFINE_ARGS y, off, xstart, xend, ystart, rf, n7, refsign, \
                ref, rp_ref, xendi, xstarti, _, _, n
    mov rfq, [rsp+0x48]
    mov refd, [rfq+56+nq*4] ; ref2cur
    mov refd, [rfq+rf.mfmv_ref2cur+nq*4]
    cmp refd, 0x80000000
    je .next_n
    mov [rsp+0x40], refd
    mov offq, [rsp+0x00] ; ystart * stride * 5
    movzx refd, byte [rfq+53+nq] ; rf->mfmv_ref[n]
    movzx refd, byte [rfq+rf.mfmv_ref+nq]
    lea refsignq, [refq-4]
    mov rp_refq, [rfq+168]
    mov rp_refq, [rfq+rf.rp_ref]
    movq m2, refsignq
    add offq, [rp_refq+refq*8] ; r = rp_ref[ref] + row_offset
    mov [rsp+0x14], nd
@@ -452,8 +477,8 @@ cglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \
    test refd, refd
    jz .next_x_bad_ref
    mov rfq, [rsp+0x48]
    lea r14d, [16+n7q+refq]
    mov ref2refd, [rfq+r14*4] ; rf->mfmv_ref2ref[n][b_ref-1]
    lea ref2refd, [(rf.mfmv_ref2ref/4)+n7q+refq-1]
    mov ref2refd, [rfq+ref2refq*4] ; rf->mfmv_ref2ref[n][b_ref-1]
    test ref2refd, ref2refd
    jz .next_x_bad_ref
    lea fracq, [mv_proj]
@@ -131,7 +131,7 @@ else
    mapfile -t dirs < <(printf "${ARGON_DIR}/%s\n" "$@" | sort -u)
fi

ver_info="dav1d $("$DAV1D" -v 2>&1) filmgrain=$FILMGRAIN cpumask=$CPUMASK" || error "Error! Can't run $DAV1D"
ver_info="dav1d $("$DAV1D" --filmgrain "$FILMGRAIN" --cpumask "$CPUMASK" --threads "$THREADS" -v 2>&1) filmgrain=$FILMGRAIN cpumask=$CPUMASK" || error "Error! Can't run $DAV1D"
files=()

for d in "${dirs[@]}"; do