зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1540830 - Update dav1d from upstream to 1f7a7e8. r=TD-Linux
Differential Revision: https://phabricator.services.mozilla.com/D28200 --HG-- extra : moz-landing-system : lando
This commit is contained in:
Родитель
931da4b767
Коммит
d1bd6b015b
|
@ -20,7 +20,7 @@ origin:
|
|||
|
||||
# Human-readable identifier for this version/release
|
||||
# Generally "version NNN", "tag SSS", "bookmark SSS"
|
||||
release: commit 7350c59e7894cb7e487a0add9942d2b1b39f7161 (2019-03-16T23:17:05.000Z).
|
||||
release: commit 1f7a7e8a6af739a05b320151d04f0f7509ae7579 (2019-04-19T07:16:39.000Z).
|
||||
|
||||
# The package's license, where possible using the mnemonic from
|
||||
# https://spdx.org/licenses/
|
||||
|
|
|
@ -1,2 +1,2 @@
|
|||
/* auto-generated, do not edit */
|
||||
#define DAV1D_VERSION "0.2.2"
|
||||
#define DAV1D_VERSION "1f7a7e8a6af739a05b320151d04f0f7509ae7579"
|
||||
|
|
|
@ -12,6 +12,12 @@ style-check:
|
|||
script:
|
||||
- git grep -n -P "\t|\r| $" -- . ':(exclude)*/compat/*' && exit 1
|
||||
- git grep -n -i "david" -- . ':(exclude)THANKS.md' ':(exclude).gitlab-ci.yml' && exit 1
|
||||
- for i in $(git ls-files -- . ':(exclude)*/compat/*'); do
|
||||
if [ -n "$(tail -c 1 "$i")" ]; then
|
||||
echo "No newline at end of $i";
|
||||
exit 1;
|
||||
fi;
|
||||
done
|
||||
- git remote rm upstream 2> /dev/null || true
|
||||
- git remote add upstream https://code.videolan.org/videolan/dav1d.git
|
||||
- git fetch -q upstream master
|
||||
|
|
|
@ -1,6 +1,14 @@
|
|||
Changes for 0.2.2 'Antelope':
|
||||
----------------------------
|
||||
|
||||
- Large improvement on MSAC decoding with SSE, bringing 4-6% speed increase
|
||||
The impact is important on SSSE3, SSE4 and AVX-2 cpus
|
||||
- SSSE3 optimizations for all block sizes in itx
|
||||
- SSSE3 optimizations for ipred_paeth and ipred_cfl (420, 422 and 444)
|
||||
- Speed improvements on CDEF for SSE4 CPUs
|
||||
- NEON optimizations for SGR and loop filter
|
||||
- Minor crash fixes, improvements and build changes
|
||||
|
||||
|
||||
Changes for 0.2.1 'Antelope':
|
||||
----------------------------
|
||||
|
|
|
@ -33,7 +33,11 @@
|
|||
|
||||
#ifndef DAV1D_API
|
||||
#if defined _WIN32
|
||||
#define DAV1D_API __declspec(dllexport)
|
||||
#if defined DAV1D_BUILDING_DLL
|
||||
#define DAV1D_API __declspec(dllexport)
|
||||
#else
|
||||
#define DAV1D_API
|
||||
#endif
|
||||
#else
|
||||
#if __GNUC__ >= 4
|
||||
#define DAV1D_API __attribute__ ((visibility ("default")))
|
||||
|
|
|
@ -217,8 +217,8 @@ bidir_fn mask
|
|||
|
||||
// This has got the same signature as the put_8tap functions,
|
||||
// assumes that the caller has loaded the h argument into r5,
|
||||
// and assumes that r8 is set to (24-clz(w)).
|
||||
function put
|
||||
// and assumes that r8 is set to (clz(w)-24).
|
||||
function put_neon
|
||||
adr r9, L(put_tbl)
|
||||
ldr r8, [r9, r8, lsl #2]
|
||||
add r9, r9, r8
|
||||
|
@ -307,9 +307,9 @@ endfunc
|
|||
|
||||
|
||||
// This has got the same signature as the prep_8tap functions,
|
||||
// assumes that the caller has loaded the h argument into r5,
|
||||
// and assumes that r8 is set to (24-clz(w)), and r7 to w*2.
|
||||
function prep
|
||||
// assumes that the caller has loaded the h argument into r4,
|
||||
// and assumes that r8 is set to (clz(w)-24), and r7 to w*2.
|
||||
function prep_neon
|
||||
adr r9, L(prep_tbl)
|
||||
ldr r8, [r9, r8, lsl #2]
|
||||
add r9, r9, r8
|
||||
|
@ -660,7 +660,7 @@ function \op\()_8tap_\type\()_8bpc_neon, export=1
|
|||
push {r4-r11,lr}
|
||||
movw r8, \type_h
|
||||
movw r9, \type_v
|
||||
b \op\()_8tap
|
||||
b \op\()_8tap_neon
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
|
@ -680,7 +680,7 @@ make_8tap_fn \type, sharp, SHARP, SHARP
|
|||
make_8tap_fn \type, sharp_regular, SHARP, REGULAR
|
||||
make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
|
||||
|
||||
function \type\()_8tap
|
||||
function \type\()_8tap_neon
|
||||
ldrd r4, r5, [sp, #36]
|
||||
ldrd r6, r7, [sp, #44]
|
||||
movw r10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
|
||||
|
@ -699,7 +699,7 @@ function \type\()_8tap
|
|||
bne L(\type\()_8tap_h)
|
||||
tst \my, #(0x7f << 14)
|
||||
bne L(\type\()_8tap_v)
|
||||
b \type
|
||||
b \type\()_neon
|
||||
|
||||
L(\type\()_8tap_h):
|
||||
cmp \w, #4
|
||||
|
@ -1831,7 +1831,7 @@ function \type\()_bilin_8bpc_neon, export=1
|
|||
bne L(\type\()_bilin_h)
|
||||
cmp \my, #0
|
||||
bne L(\type\()_bilin_v)
|
||||
b \type
|
||||
b \type\()_neon
|
||||
|
||||
L(\type\()_bilin_h):
|
||||
cmp \my, #0
|
||||
|
|
|
@ -34,32 +34,32 @@
|
|||
|
||||
.macro movrel rd, val, offset=0
|
||||
#if defined(PIC) && defined(__APPLE__)
|
||||
ldr \rd, 1f
|
||||
b 2f
|
||||
ldr \rd, 1f
|
||||
b 2f
|
||||
1:
|
||||
.word 3f - (2f + 8 - 4 * CONFIG_THUMB)
|
||||
.word 3f - (2f + 8 - 4 * CONFIG_THUMB)
|
||||
2:
|
||||
ldr \rd, [pc, \rd]
|
||||
ldr \rd, [pc, \rd]
|
||||
.if \offset < 0
|
||||
sub \rd, \rd, #-(\offset)
|
||||
sub \rd, \rd, #-(\offset)
|
||||
.elseif \offset > 0
|
||||
add \rd, \rd, #\offset
|
||||
add \rd, \rd, #\offset
|
||||
.endif
|
||||
.non_lazy_symbol_pointer
|
||||
.non_lazy_symbol_pointer
|
||||
3:
|
||||
.indirect_symbol \val
|
||||
.word 0
|
||||
.text
|
||||
.indirect_symbol \val
|
||||
.word 0
|
||||
.text
|
||||
#elif defined(PIC)
|
||||
ldr \rd, 1f
|
||||
b 2f
|
||||
ldr \rd, 1f
|
||||
b 2f
|
||||
1:
|
||||
.word \val + \offset - (2f + 8 - 4 * CONFIG_THUMB)
|
||||
.word \val + \offset - (2f + 8 - 4 * CONFIG_THUMB)
|
||||
2:
|
||||
add \rd, \rd, pc
|
||||
add \rd, \rd, pc
|
||||
#else
|
||||
movw \rd, #:lower16:\val+\offset
|
||||
movt \rd, #:upper16:\val+\offset
|
||||
movw \rd, #:lower16:\val+\offset
|
||||
movt \rd, #:upper16:\val+\offset
|
||||
#endif
|
||||
.endm
|
||||
|
||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -235,8 +235,8 @@ bidir_fn mask
|
|||
|
||||
|
||||
// This has got the same signature as the put_8tap functions,
|
||||
// and assumes that x8 is set to (24-clz(w)).
|
||||
function put
|
||||
// and assumes that x8 is set to (clz(w)-24).
|
||||
function put_neon
|
||||
adr x9, L(put_tbl)
|
||||
ldrh w8, [x9, x8, lsl #1]
|
||||
sub x9, x9, w8, uxtw
|
||||
|
@ -330,8 +330,8 @@ endfunc
|
|||
|
||||
|
||||
// This has got the same signature as the prep_8tap functions,
|
||||
// and assumes that x8 is set to (24-clz(w)), and x7 to w*2.
|
||||
function prep
|
||||
// and assumes that x8 is set to (clz(w)-24), and x7 to w*2.
|
||||
function prep_neon
|
||||
adr x9, L(prep_tbl)
|
||||
ldrh w8, [x9, x8, lsl #1]
|
||||
sub x9, x9, w8, uxtw
|
||||
|
@ -703,7 +703,7 @@ endfunc
|
|||
function \op\()_8tap_\type\()_8bpc_neon, export=1
|
||||
mov x8, \type_h
|
||||
mov x9, \type_v
|
||||
b \op\()_8tap
|
||||
b \op\()_8tap\()_neon
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
|
@ -723,7 +723,7 @@ make_8tap_fn \type, sharp, SHARP, SHARP
|
|||
make_8tap_fn \type, sharp_regular, SHARP, REGULAR
|
||||
make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
|
||||
|
||||
function \type\()_8tap
|
||||
function \type\()_8tap_neon
|
||||
mov w10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
|
||||
mul \mx, \mx, w10
|
||||
mul \my, \my, w10
|
||||
|
@ -741,7 +741,7 @@ function \type\()_8tap
|
|||
b.ne L(\type\()_8tap_h)
|
||||
tst \my, #(0x7f << 14)
|
||||
b.ne L(\type\()_8tap_v)
|
||||
b \type
|
||||
b \type\()_neon
|
||||
|
||||
L(\type\()_8tap_h):
|
||||
cmp \w, #4
|
||||
|
@ -1826,7 +1826,7 @@ function \type\()_bilin_8bpc_neon, export=1
|
|||
sub w8, w8, #24
|
||||
cbnz \mx, L(\type\()_bilin_h)
|
||||
cbnz \my, L(\type\()_bilin_v)
|
||||
b \type
|
||||
b \type\()_neon
|
||||
|
||||
L(\type\()_bilin_h):
|
||||
cbnz \my, L(\type\()_bilin_hv)
|
||||
|
@ -2335,7 +2335,7 @@ filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6
|
|||
add \src, \src, \inc
|
||||
.endm
|
||||
|
||||
function warp_filter_horz
|
||||
function warp_filter_horz_neon
|
||||
add w12, w5, #512
|
||||
|
||||
ld1 {v16.8b, v17.8b}, [x2], x3
|
||||
|
@ -2431,24 +2431,24 @@ function warp_affine_8x8\t\()_8bpc_neon, export=1
|
|||
lsl x1, x1, #1
|
||||
.endif
|
||||
|
||||
bl warp_filter_horz
|
||||
bl warp_filter_horz_neon
|
||||
mov v24.16b, v16.16b
|
||||
bl warp_filter_horz
|
||||
bl warp_filter_horz_neon
|
||||
mov v25.16b, v16.16b
|
||||
bl warp_filter_horz
|
||||
bl warp_filter_horz_neon
|
||||
mov v26.16b, v16.16b
|
||||
bl warp_filter_horz
|
||||
bl warp_filter_horz_neon
|
||||
mov v27.16b, v16.16b
|
||||
bl warp_filter_horz
|
||||
bl warp_filter_horz_neon
|
||||
mov v28.16b, v16.16b
|
||||
bl warp_filter_horz
|
||||
bl warp_filter_horz_neon
|
||||
mov v29.16b, v16.16b
|
||||
bl warp_filter_horz
|
||||
bl warp_filter_horz_neon
|
||||
mov v30.16b, v16.16b
|
||||
|
||||
1:
|
||||
add w14, w6, #512
|
||||
bl warp_filter_horz
|
||||
bl warp_filter_horz_neon
|
||||
mov v31.16b, v16.16b
|
||||
|
||||
load_filter_row d0, w14, w9
|
||||
|
|
|
@ -35,57 +35,98 @@
|
|||
.macro movrel rd, val, offset=0
|
||||
#if defined(__APPLE__)
|
||||
.if \offset < 0
|
||||
adrp \rd, \val@PAGE
|
||||
add \rd, \rd, \val@PAGEOFF
|
||||
sub \rd, \rd, -(\offset)
|
||||
adrp \rd, \val@PAGE
|
||||
add \rd, \rd, \val@PAGEOFF
|
||||
sub \rd, \rd, -(\offset)
|
||||
.else
|
||||
adrp \rd, \val+(\offset)@PAGE
|
||||
add \rd, \rd, \val+(\offset)@PAGEOFF
|
||||
adrp \rd, \val+(\offset)@PAGE
|
||||
add \rd, \rd, \val+(\offset)@PAGEOFF
|
||||
.endif
|
||||
#elif defined(PIC) && defined(_WIN32)
|
||||
.if \offset < 0
|
||||
adrp \rd, \val
|
||||
add \rd, \rd, :lo12:\val
|
||||
sub \rd, \rd, -(\offset)
|
||||
adrp \rd, \val
|
||||
add \rd, \rd, :lo12:\val
|
||||
sub \rd, \rd, -(\offset)
|
||||
.else
|
||||
adrp \rd, \val+(\offset)
|
||||
add \rd, \rd, :lo12:\val+(\offset)
|
||||
adrp \rd, \val+(\offset)
|
||||
add \rd, \rd, :lo12:\val+(\offset)
|
||||
.endif
|
||||
#elif defined(PIC)
|
||||
adrp \rd, \val+(\offset)
|
||||
add \rd, \rd, :lo12:\val+(\offset)
|
||||
adrp \rd, \val+(\offset)
|
||||
add \rd, \rd, :lo12:\val+(\offset)
|
||||
#else
|
||||
ldr \rd, =\val+\offset
|
||||
ldr \rd, =\val+\offset
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro transpose_8x8b r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
|
||||
trn1 \r8\().8b, \r0\().8b, \r1\().8b
|
||||
trn2 \r9\().8b, \r0\().8b, \r1\().8b
|
||||
trn1 \r1\().8b, \r2\().8b, \r3\().8b
|
||||
trn2 \r3\().8b, \r2\().8b, \r3\().8b
|
||||
trn1 \r0\().8b, \r4\().8b, \r5\().8b
|
||||
trn2 \r5\().8b, \r4\().8b, \r5\().8b
|
||||
trn1 \r2\().8b, \r6\().8b, \r7\().8b
|
||||
trn2 \r7\().8b, \r6\().8b, \r7\().8b
|
||||
trn1 \r8\().8b, \r0\().8b, \r1\().8b
|
||||
trn2 \r9\().8b, \r0\().8b, \r1\().8b
|
||||
trn1 \r1\().8b, \r2\().8b, \r3\().8b
|
||||
trn2 \r3\().8b, \r2\().8b, \r3\().8b
|
||||
trn1 \r0\().8b, \r4\().8b, \r5\().8b
|
||||
trn2 \r5\().8b, \r4\().8b, \r5\().8b
|
||||
trn1 \r2\().8b, \r6\().8b, \r7\().8b
|
||||
trn2 \r7\().8b, \r6\().8b, \r7\().8b
|
||||
|
||||
trn1 \r4\().4h, \r0\().4h, \r2\().4h
|
||||
trn2 \r2\().4h, \r0\().4h, \r2\().4h
|
||||
trn1 \r6\().4h, \r5\().4h, \r7\().4h
|
||||
trn2 \r7\().4h, \r5\().4h, \r7\().4h
|
||||
trn1 \r5\().4h, \r9\().4h, \r3\().4h
|
||||
trn2 \r9\().4h, \r9\().4h, \r3\().4h
|
||||
trn1 \r3\().4h, \r8\().4h, \r1\().4h
|
||||
trn2 \r8\().4h, \r8\().4h, \r1\().4h
|
||||
trn1 \r4\().4h, \r0\().4h, \r2\().4h
|
||||
trn2 \r2\().4h, \r0\().4h, \r2\().4h
|
||||
trn1 \r6\().4h, \r5\().4h, \r7\().4h
|
||||
trn2 \r7\().4h, \r5\().4h, \r7\().4h
|
||||
trn1 \r5\().4h, \r9\().4h, \r3\().4h
|
||||
trn2 \r9\().4h, \r9\().4h, \r3\().4h
|
||||
trn1 \r3\().4h, \r8\().4h, \r1\().4h
|
||||
trn2 \r8\().4h, \r8\().4h, \r1\().4h
|
||||
|
||||
trn1 \r0\().2s, \r3\().2s, \r4\().2s
|
||||
trn2 \r4\().2s, \r3\().2s, \r4\().2s
|
||||
trn1 \r1\().2s, \r5\().2s, \r6\().2s
|
||||
trn2 \r5\().2s, \r5\().2s, \r6\().2s
|
||||
trn2 \r6\().2s, \r8\().2s, \r2\().2s
|
||||
trn1 \r2\().2s, \r8\().2s, \r2\().2s
|
||||
trn1 \r3\().2s, \r9\().2s, \r7\().2s
|
||||
trn2 \r7\().2s, \r9\().2s, \r7\().2s
|
||||
trn1 \r0\().2s, \r3\().2s, \r4\().2s
|
||||
trn2 \r4\().2s, \r3\().2s, \r4\().2s
|
||||
trn1 \r1\().2s, \r5\().2s, \r6\().2s
|
||||
trn2 \r5\().2s, \r5\().2s, \r6\().2s
|
||||
trn2 \r6\().2s, \r8\().2s, \r2\().2s
|
||||
trn1 \r2\().2s, \r8\().2s, \r2\().2s
|
||||
trn1 \r3\().2s, \r9\().2s, \r7\().2s
|
||||
trn2 \r7\().2s, \r9\().2s, \r7\().2s
|
||||
.endm
|
||||
|
||||
// transpose_8x16b: shuffles the eight vectors r0-r7 in place using three
// rounds of trn1/trn2 at byte (.16b), halfword (.8h) and word (.4s)
// granularity. r8 and r9 are overwritten as temporaries; every final
// result is written back into one of r0-r7.
// NOTE(review): there is no .2d round, so this presumably transposes an
// 8x8 byte tile in each 64-bit half of the registers - confirm with callers.
.macro transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
|
||||
// Round 1: pair up neighbouring registers at byte granularity.
trn1 \r8\().16b, \r0\().16b, \r1\().16b
|
||||
trn2 \r9\().16b, \r0\().16b, \r1\().16b
|
||||
trn1 \r1\().16b, \r2\().16b, \r3\().16b
|
||||
trn2 \r3\().16b, \r2\().16b, \r3\().16b
|
||||
trn1 \r0\().16b, \r4\().16b, \r5\().16b
|
||||
trn2 \r5\().16b, \r4\().16b, \r5\().16b
|
||||
trn1 \r2\().16b, \r6\().16b, \r7\().16b
|
||||
trn2 \r7\().16b, \r6\().16b, \r7\().16b
|
||||
|
||||
// Round 2: combine the round-1 results at halfword granularity.
// The order matters: each source is read before it is overwritten.
trn1 \r4\().8h, \r0\().8h, \r2\().8h
|
||||
trn2 \r2\().8h, \r0\().8h, \r2\().8h
|
||||
trn1 \r6\().8h, \r5\().8h, \r7\().8h
|
||||
trn2 \r7\().8h, \r5\().8h, \r7\().8h
|
||||
trn1 \r5\().8h, \r9\().8h, \r3\().8h
|
||||
trn2 \r9\().8h, \r9\().8h, \r3\().8h
|
||||
trn1 \r3\().8h, \r8\().8h, \r1\().8h
|
||||
trn2 \r8\().8h, \r8\().8h, \r1\().8h
|
||||
|
||||
// Round 3: combine at word granularity, writing the results to r0-r7.
trn1 \r0\().4s, \r3\().4s, \r4\().4s
|
||||
trn2 \r4\().4s, \r3\().4s, \r4\().4s
|
||||
trn1 \r1\().4s, \r5\().4s, \r6\().4s
|
||||
trn2 \r5\().4s, \r5\().4s, \r6\().4s
|
||||
trn2 \r6\().4s, \r8\().4s, \r2\().4s
|
||||
trn1 \r2\().4s, \r8\().4s, \r2\().4s
|
||||
trn1 \r3\().4s, \r9\().4s, \r7\().4s
|
||||
trn2 \r7\().4s, \r9\().4s, \r7\().4s
|
||||
.endm
|
||||
|
||||
// transpose_4x16b: shuffles the four vectors r0-r3 in place using two
// rounds of trn1/trn2, first at byte (.16b) and then at halfword (.8h)
// granularity. t4-t7 are overwritten as temporaries; the results are
// written back to r0-r3.
// NOTE(review): with only byte and halfword rounds this presumably
// transposes a 4x4 byte tile in each 32-bit lane - confirm with callers.
.macro transpose_4x16b r0, r1, r2, r3, t4, t5, t6, t7
|
||||
// Round 1: byte-level interleave of (r0, r1) and (r2, r3) into t4-t7.
trn1 \t4\().16b, \r0\().16b, \r1\().16b
|
||||
trn2 \t5\().16b, \r0\().16b, \r1\().16b
|
||||
trn1 \t6\().16b, \r2\().16b, \r3\().16b
|
||||
trn2 \t7\().16b, \r2\().16b, \r3\().16b
|
||||
|
||||
// Round 2: halfword-level interleave of the temporaries back into r0-r3.
trn1 \r0\().8h, \t4\().8h, \t6\().8h
|
||||
trn2 \r2\().8h, \t4\().8h, \t6\().8h
|
||||
trn1 \r1\().8h, \t5\().8h, \t7\().8h
|
||||
trn2 \r3\().8h, \t5\().8h, \t7\().8h
|
||||
.endm
|
||||
|
||||
#endif /* DAV1D_SRC_ARM_64_UTIL_S */
|
||||
|
|
|
@ -31,12 +31,12 @@
|
|||
#include "config.h"
|
||||
|
||||
#if ARCH_ARM
|
||||
.syntax unified
|
||||
.syntax unified
|
||||
#ifdef __ELF__
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
.eabi_attribute 10, 0 // suppress Tag_FP_arch
|
||||
.eabi_attribute 12, 0 // suppress Tag_Advanced_SIMD_arch
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
.eabi_attribute 10, 0 // suppress Tag_FP_arch
|
||||
.eabi_attribute 12, 0 // suppress Tag_Advanced_SIMD_arch
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
|
@ -46,7 +46,7 @@
|
|||
#endif
|
||||
|
||||
#if CONFIG_THUMB
|
||||
.thumb
|
||||
.thumb
|
||||
#define A @
|
||||
#define T
|
||||
#else
|
||||
|
@ -86,25 +86,25 @@
|
|||
#endif
|
||||
.purgem endfunc
|
||||
.endm
|
||||
.text
|
||||
.align \align
|
||||
.if \export
|
||||
.global EXTERN\name
|
||||
.text
|
||||
.align \align
|
||||
.if \export
|
||||
.global EXTERN\name
|
||||
#ifdef __ELF__
|
||||
.type EXTERN\name, %function
|
||||
.type EXTERN\name, %function
|
||||
#endif
|
||||
#if HAVE_AS_FUNC
|
||||
.func EXTERN\name
|
||||
.func EXTERN\name
|
||||
#endif
|
||||
EXTERN\name:
|
||||
.else
|
||||
.else
|
||||
#ifdef __ELF__
|
||||
.type \name, %function
|
||||
.type \name, %function
|
||||
#endif
|
||||
#if HAVE_AS_FUNC
|
||||
.func \name
|
||||
.func \name
|
||||
#endif
|
||||
.endif
|
||||
.endif
|
||||
\name:
|
||||
.endm
|
||||
|
||||
|
|
|
@ -0,0 +1,47 @@
|
|||
/*
|
||||
* Copyright © 2018, VideoLAN and dav1d authors
|
||||
* Copyright © 2018, Two Orioles, LLC
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "src/cpu.h"
|
||||
#include "src/loopfilter.h"
|
||||
|
||||
decl_loopfilter_sb_fn(dav1d_lpf_h_sb_y_neon);
|
||||
decl_loopfilter_sb_fn(dav1d_lpf_v_sb_y_neon);
|
||||
decl_loopfilter_sb_fn(dav1d_lpf_h_sb_uv_neon);
|
||||
decl_loopfilter_sb_fn(dav1d_lpf_v_sb_uv_neon);
|
||||
|
||||
/*
 * Install the NEON loop-filter routines into the DSP context.
 *
 * The context is left untouched when the CPU flags do not report NEON.
 * The function pointers are only assigned in 8 bpc AArch64 builds; in any
 * other configuration this function changes nothing.
 */
void bitfn(dav1d_loop_filter_dsp_init_arm)(Dav1dLoopFilterDSPContext *const c) {
    const unsigned cpu_flags = dav1d_get_cpu_flags();
    const int have_neon = (cpu_flags & DAV1D_ARM_CPU_FLAG_NEON) != 0;

    if (!have_neon)
        return;

#if BITDEPTH == 8 && ARCH_AARCH64
    /* First index: 0 takes the *_sb_y routines, 1 the *_sb_uv routines.
     * Second index: 0 takes the lpf_h variant, 1 the lpf_v variant. */
    c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_neon;
    c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_neon;
    c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_neon;
    c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_neon;
#endif
}
|
|
@ -29,6 +29,7 @@
|
|||
#include "src/looprestoration.h"
|
||||
|
||||
#include "common/attributes.h"
|
||||
#include "src/tables.h"
|
||||
|
||||
#if BITDEPTH == 8
|
||||
// This calculates things slightly differently than the reference C version.
|
||||
|
@ -91,7 +92,171 @@ static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
|
|||
dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, tmp, w & 7, h);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if ARCH_AARCH64
|
||||
void dav1d_sgr_box3_h_neon(int32_t *sumsq, int16_t *sum,
|
||||
const pixel (*left)[4],
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const int w, const int h,
|
||||
const enum LrEdgeFlags edges);
|
||||
void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
|
||||
const int w, const int h,
|
||||
const enum LrEdgeFlags edges);
|
||||
void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
|
||||
const int w, const int h, const int strength);
|
||||
void dav1d_sgr_finish_filter1_neon(coef *tmp,
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const int32_t *a, const int16_t *b,
|
||||
const int w, const int h);
|
||||
|
||||
/* filter with a 3x3 box (radius=1) */
|
||||
static void dav1d_sgr_filter1_neon(coef *tmp,
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const pixel (*left)[4],
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride,
|
||||
const int w, const int h, const int strength,
|
||||
const enum LrEdgeFlags edges)
|
||||
{
|
||||
ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
|
||||
int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
|
||||
ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
|
||||
int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
|
||||
|
||||
dav1d_sgr_box3_h_neon(sumsq, sum, left, src, stride, w, h, edges);
|
||||
if (edges & LR_HAVE_TOP)
|
||||
dav1d_sgr_box3_h_neon(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
|
||||
NULL, lpf, lpf_stride, w, 1, edges);
|
||||
|
||||
if (edges & LR_HAVE_BOTTOM)
|
||||
dav1d_sgr_box3_h_neon(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
|
||||
NULL, lpf + 6 * PXSTRIDE(lpf_stride),
|
||||
lpf_stride, w, 2, edges);
|
||||
|
||||
dav1d_sgr_box3_v_neon(sumsq, sum, w, h, edges);
|
||||
dav1d_sgr_calc_ab1_neon(a, b, w, h, strength);
|
||||
dav1d_sgr_finish_filter1_neon(tmp, src, stride, a, b, w, h);
|
||||
}
|
||||
|
||||
void dav1d_sgr_box5_h_neon(int32_t *sumsq, int16_t *sum,
|
||||
const pixel (*left)[4],
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const int w, const int h,
|
||||
const enum LrEdgeFlags edges);
|
||||
void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
|
||||
const int w, const int h,
|
||||
const enum LrEdgeFlags edges);
|
||||
void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
|
||||
const int w, const int h, const int strength);
|
||||
void dav1d_sgr_finish_filter2_neon(coef *tmp,
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const int32_t *a, const int16_t *b,
|
||||
const int w, const int h);
|
||||
|
||||
/* filter with a 5x5 box (radius=2) */
|
||||
static void dav1d_sgr_filter2_neon(coef *tmp,
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const pixel (*left)[4],
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride,
|
||||
const int w, const int h, const int strength,
|
||||
const enum LrEdgeFlags edges)
|
||||
{
|
||||
ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
|
||||
int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
|
||||
ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
|
||||
int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
|
||||
|
||||
dav1d_sgr_box5_h_neon(sumsq, sum, left, src, stride, w, h, edges);
|
||||
if (edges & LR_HAVE_TOP)
|
||||
dav1d_sgr_box5_h_neon(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
|
||||
NULL, lpf, lpf_stride, w, 2, edges);
|
||||
|
||||
if (edges & LR_HAVE_BOTTOM)
|
||||
dav1d_sgr_box5_h_neon(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
|
||||
NULL, lpf + 6 * PXSTRIDE(lpf_stride),
|
||||
lpf_stride, w, 2, edges);
|
||||
|
||||
dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges);
|
||||
dav1d_sgr_calc_ab2_neon(a, b, w, h, strength);
|
||||
dav1d_sgr_finish_filter2_neon(tmp, src, stride, a, b, w, h);
|
||||
}
|
||||
|
||||
void dav1d_sgr_weighted1_neon(pixel *dst, const ptrdiff_t dst_stride,
|
||||
const pixel *src, const ptrdiff_t src_stride,
|
||||
const coef *t1, const int w, const int h,
|
||||
const int wt);
|
||||
void dav1d_sgr_weighted2_neon(pixel *dst, const ptrdiff_t dst_stride,
|
||||
const pixel *src, const ptrdiff_t src_stride,
|
||||
const coef *t1, const coef *t2,
|
||||
const int w, const int h,
|
||||
const int16_t wt[2]);
|
||||
|
||||
static void sgr_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
|
||||
const pixel (*const left)[4],
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride,
|
||||
const int w, const int h, const int sgr_idx,
|
||||
const int16_t sgr_wt[7], const enum LrEdgeFlags edges)
|
||||
{
|
||||
if (!dav1d_sgr_params[sgr_idx][0]) {
|
||||
ALIGN_STK_16(coef, tmp, 64 * 384,);
|
||||
dav1d_sgr_filter1_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
|
||||
w, h, dav1d_sgr_params[sgr_idx][3], edges);
|
||||
if (w >= 8)
|
||||
dav1d_sgr_weighted1_neon(dst, dst_stride, dst, dst_stride,
|
||||
tmp, w & ~7, h, (1 << 7) - sgr_wt[1]);
|
||||
if (w & 7) {
|
||||
// For uneven widths, do a full 8 pixel wide filtering into a temp
|
||||
// buffer and copy out the narrow slice of pixels separately into
|
||||
// dest.
|
||||
ALIGN_STK_16(pixel, stripe, 64 * 8,);
|
||||
dav1d_sgr_weighted1_neon(stripe, w & 7, dst + (w & ~7), dst_stride,
|
||||
tmp + (w & ~7), w & 7, h,
|
||||
(1 << 7) - sgr_wt[1]);
|
||||
dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, stripe,
|
||||
w & 7, h);
|
||||
}
|
||||
} else if (!dav1d_sgr_params[sgr_idx][1]) {
|
||||
ALIGN_STK_16(coef, tmp, 64 * 384,);
|
||||
dav1d_sgr_filter2_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
|
||||
w, h, dav1d_sgr_params[sgr_idx][2], edges);
|
||||
if (w >= 8)
|
||||
dav1d_sgr_weighted1_neon(dst, dst_stride, dst, dst_stride,
|
||||
tmp, w & ~7, h, sgr_wt[0]);
|
||||
if (w & 7) {
|
||||
// For uneven widths, do a full 8 pixel wide filtering into a temp
|
||||
// buffer and copy out the narrow slice of pixels separately into
|
||||
// dest.
|
||||
ALIGN_STK_16(pixel, stripe, 64 * 8,);
|
||||
dav1d_sgr_weighted1_neon(stripe, w & 7, dst + (w & ~7), dst_stride,
|
||||
tmp + (w & ~7), w & 7, h, sgr_wt[0]);
|
||||
dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, stripe,
|
||||
w & 7, h);
|
||||
}
|
||||
} else {
|
||||
ALIGN_STK_16(coef, tmp1, 64 * 384,);
|
||||
ALIGN_STK_16(coef, tmp2, 64 * 384,);
|
||||
dav1d_sgr_filter2_neon(tmp1, dst, dst_stride, left, lpf, lpf_stride,
|
||||
w, h, dav1d_sgr_params[sgr_idx][2], edges);
|
||||
dav1d_sgr_filter1_neon(tmp2, dst, dst_stride, left, lpf, lpf_stride,
|
||||
w, h, dav1d_sgr_params[sgr_idx][3], edges);
|
||||
const int16_t wt[2] = { sgr_wt[0], 128 - sgr_wt[0] - sgr_wt[1] };
|
||||
if (w >= 8)
|
||||
dav1d_sgr_weighted2_neon(dst, dst_stride, dst, dst_stride,
|
||||
tmp1, tmp2, w & ~7, h, wt);
|
||||
if (w & 7) {
|
||||
// For uneven widths, do a full 8 pixel wide filtering into a temp
|
||||
// buffer and copy out the narrow slice of pixels separately into
|
||||
// dest.
|
||||
ALIGN_STK_16(pixel, stripe, 64 * 8,);
|
||||
dav1d_sgr_weighted2_neon(stripe, w & 7, dst + (w & ~7), dst_stride,
|
||||
tmp1 + (w & ~7), tmp2 + (w & ~7),
|
||||
w & 7, h, wt);
|
||||
dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, stripe,
|
||||
w & 7, h);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // ARCH_AARCH64
|
||||
#endif // BITDEPTH == 8
|
||||
|
||||
void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c) {
|
||||
const unsigned flags = dav1d_get_cpu_flags();
|
||||
|
@ -100,5 +265,8 @@ void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *
|
|||
|
||||
#if BITDEPTH == 8
|
||||
c->wiener = wiener_filter_neon;
|
||||
#if ARCH_AARCH64
|
||||
c->selfguided = sgr_filter_neon;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -813,7 +813,7 @@ static const uint16_t default_mv_joint_cdf[N_MV_JOINTS + 1] = {
|
|||
AOM_CDF4(4096, 11264, 19328)
|
||||
};
|
||||
|
||||
static const uint16_t default_kf_y_mode_cdf[5][5][N_INTRA_PRED_MODES + 1] = {
|
||||
static const uint16_t default_kf_y_mode_cdf[5][5][N_INTRA_PRED_MODES + 1 + 2] = {
|
||||
{
|
||||
{ AOM_CDF13(15588, 17027, 19338, 20218, 20682, 21110, 21825, 23244,
|
||||
24189, 28165, 29093, 30466) },
|
||||
|
|
|
@ -34,11 +34,13 @@
|
|||
#include "src/ref.h"
|
||||
#include "src/thread_data.h"
|
||||
|
||||
/* Buffers padded to [8] or [16] for SIMD where needed. */
|
||||
|
||||
typedef struct CdfModeContext {
|
||||
uint16_t y_mode[4][N_INTRA_PRED_MODES + 1];
|
||||
uint16_t y_mode[4][N_INTRA_PRED_MODES + 1 + 2];
|
||||
uint16_t use_filter_intra[N_BS_SIZES][2];
|
||||
uint16_t filter_intra[5 + 1];
|
||||
uint16_t uv_mode[2][N_INTRA_PRED_MODES][N_UV_INTRA_PRED_MODES + 1];
|
||||
uint16_t uv_mode[2][N_INTRA_PRED_MODES][N_UV_INTRA_PRED_MODES + 1 + 1];
|
||||
uint16_t angle_delta[8][8];
|
||||
uint16_t filter[2][8][DAV1D_N_SWITCHABLE_FILTERS + 1];
|
||||
uint16_t newmv_mode[6][2];
|
||||
|
@ -66,7 +68,7 @@ typedef struct CdfModeContext {
|
|||
uint16_t txtp_intra[3][N_TX_SIZES][N_INTRA_PRED_MODES][N_TX_TYPES + 1];
|
||||
uint16_t skip[3][2];
|
||||
uint16_t skip_mode[3][2];
|
||||
uint16_t partition[N_BL_LEVELS][4][N_PARTITIONS + 1];
|
||||
uint16_t partition[N_BL_LEVELS][4][N_PARTITIONS + 1 + 5];
|
||||
uint16_t seg_pred[3][2];
|
||||
uint16_t seg_id[3][DAV1D_MAX_SEGMENTS + 1];
|
||||
uint16_t cfl_sign[8 + 1];
|
||||
|
@ -88,12 +90,12 @@ typedef struct CdfModeContext {
|
|||
typedef struct CdfCoefContext {
|
||||
uint16_t skip[N_TX_SIZES][13][2];
|
||||
uint16_t eob_bin_16[2][2][6];
|
||||
uint16_t eob_bin_32[2][2][7];
|
||||
uint16_t eob_bin_32[2][2][7 + 1];
|
||||
uint16_t eob_bin_64[2][2][8];
|
||||
uint16_t eob_bin_128[2][2][9];
|
||||
uint16_t eob_bin_256[2][2][10];
|
||||
uint16_t eob_bin_512[2][2][11];
|
||||
uint16_t eob_bin_1024[2][2][12];
|
||||
uint16_t eob_bin_256[2][2][10 + 6];
|
||||
uint16_t eob_bin_512[2][2][11 + 5];
|
||||
uint16_t eob_bin_1024[2][2][12 + 4];
|
||||
uint16_t eob_hi_bit[N_TX_SIZES][2][11 /*22*/][2];
|
||||
uint16_t eob_base_tok[N_TX_SIZES][2][4][4];
|
||||
uint16_t base_tok[N_TX_SIZES][2][41][5];
|
||||
|
@ -102,7 +104,7 @@ typedef struct CdfCoefContext {
|
|||
} CdfCoefContext;
|
||||
|
||||
typedef struct CdfMvComponent {
|
||||
uint16_t classes[11 + 1];
|
||||
uint16_t classes[11 + 1 + 4];
|
||||
uint16_t class0[2];
|
||||
uint16_t classN[10][2];
|
||||
uint16_t class0_fp[2][4 + 1];
|
||||
|
@ -119,7 +121,7 @@ typedef struct CdfMvContext {
|
|||
|
||||
typedef struct CdfContext {
|
||||
CdfModeContext m;
|
||||
uint16_t kfym[5][5][N_INTRA_PRED_MODES + 1];
|
||||
uint16_t kfym[5][5][N_INTRA_PRED_MODES + 1 + 2];
|
||||
CdfCoefContext coef;
|
||||
CdfMvContext mv, dmv;
|
||||
} CdfContext;
|
||||
|
|
|
@ -80,15 +80,15 @@ static int read_mv_component_diff(Dav1dTileContext *const t,
|
|||
const Dav1dFrameContext *const f = t->f;
|
||||
const int have_hp = f->frame_hdr->hp;
|
||||
const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->sign);
|
||||
const int cl = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
mv_comp->classes, 11);
|
||||
const int cl = dav1d_msac_decode_symbol_adapt16(&ts->msac,
|
||||
mv_comp->classes, 11);
|
||||
int up, fp, hp;
|
||||
|
||||
if (!cl) {
|
||||
up = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->class0);
|
||||
if (have_fp) {
|
||||
fp = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
mv_comp->class0_fp[up], 4);
|
||||
fp = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
mv_comp->class0_fp[up], 4);
|
||||
hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac,
|
||||
mv_comp->class0_hp) : 1;
|
||||
} else {
|
||||
|
@ -101,8 +101,8 @@ static int read_mv_component_diff(Dav1dTileContext *const t,
|
|||
up |= dav1d_msac_decode_bool_adapt(&ts->msac,
|
||||
mv_comp->classN[n]) << n;
|
||||
if (have_fp) {
|
||||
fp = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
mv_comp->classN_fp, 4);
|
||||
fp = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
mv_comp->classN_fp, 4);
|
||||
hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac,
|
||||
mv_comp->classN_hp) : 1;
|
||||
} else {
|
||||
|
@ -119,8 +119,8 @@ static int read_mv_component_diff(Dav1dTileContext *const t,
|
|||
static void read_mv_residual(Dav1dTileContext *const t, mv *const ref_mv,
|
||||
CdfMvContext *const mv_cdf, const int have_fp)
|
||||
{
|
||||
switch (dav1d_msac_decode_symbol_adapt(&t->ts->msac, t->ts->cdf.mv.joint,
|
||||
N_MV_JOINTS))
|
||||
switch (dav1d_msac_decode_symbol_adapt4(&t->ts->msac, t->ts->cdf.mv.joint,
|
||||
N_MV_JOINTS))
|
||||
{
|
||||
case MV_JOINT_HV:
|
||||
ref_mv->y += read_mv_component_diff(t, &mv_cdf->comp[0], have_fp);
|
||||
|
@ -379,7 +379,7 @@ static void read_pal_plane(Dav1dTileContext *const t, Av1Block *const b,
|
|||
{
|
||||
Dav1dTileState *const ts = t->ts;
|
||||
const Dav1dFrameContext *const f = t->f;
|
||||
const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac,
|
||||
ts->cdf.m.pal_sz[pl][sz_ctx], 7) + 2;
|
||||
uint16_t cache[16], used_cache[8];
|
||||
int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4];
|
||||
|
@ -595,7 +595,7 @@ static void read_pal_indices(Dav1dTileContext *const t,
|
|||
const int last = imax(0, i - h4 * 4 + 1);
|
||||
order_palette(pal_idx, stride, i, first, last, order, ctx);
|
||||
for (int j = first, m = 0; j >= last; j--, m++) {
|
||||
const int color_idx = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
const int color_idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
|
||||
color_map_cdf[ctx[m]], b->pal_sz[pl]);
|
||||
pal_idx[(i - j) * stride + j] = order[m][color_idx];
|
||||
}
|
||||
|
@ -811,7 +811,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
const unsigned pred_seg_id =
|
||||
get_cur_frame_segid(t->by, t->bx, have_top, have_left,
|
||||
&seg_ctx, f->cur_segmap, f->b4_stride);
|
||||
const unsigned diff = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
const unsigned diff = dav1d_msac_decode_symbol_adapt8(&ts->msac,
|
||||
ts->cdf.m.seg_id[seg_ctx],
|
||||
DAV1D_MAX_SEGMENTS);
|
||||
const unsigned last_active_seg_id =
|
||||
|
@ -883,7 +883,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
if (b->skip) {
|
||||
b->seg_id = pred_seg_id;
|
||||
} else {
|
||||
const unsigned diff = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
const unsigned diff = dav1d_msac_decode_symbol_adapt8(&ts->msac,
|
||||
ts->cdf.m.seg_id[seg_ctx],
|
||||
DAV1D_MAX_SEGMENTS);
|
||||
const unsigned last_active_seg_id =
|
||||
|
@ -932,8 +932,8 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
memcpy(prev_delta_lf, ts->last_delta_lf, 4);
|
||||
|
||||
if (have_delta_q) {
|
||||
int delta_q = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
ts->cdf.m.delta_q, 4);
|
||||
int delta_q = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
ts->cdf.m.delta_q, 4);
|
||||
if (delta_q == 3) {
|
||||
const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3);
|
||||
delta_q = dav1d_msac_decode_bools(&ts->msac, n_bits) +
|
||||
|
@ -953,7 +953,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 ? 4 : 2 : 1;
|
||||
|
||||
for (int i = 0; i < n_lfs; i++) {
|
||||
int delta_lf = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
int delta_lf = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
ts->cdf.m.delta_lf[i + f->frame_hdr->delta.lf.multi], 4);
|
||||
if (delta_lf == 3) {
|
||||
const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3);
|
||||
|
@ -1018,8 +1018,8 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
ts->cdf.m.y_mode[dav1d_ymode_size_context[bs]] :
|
||||
ts->cdf.kfym[dav1d_intra_mode_context[t->a->mode[bx4]]]
|
||||
[dav1d_intra_mode_context[t->l.mode[by4]]];
|
||||
b->y_mode = dav1d_msac_decode_symbol_adapt(&ts->msac, ymode_cdf,
|
||||
N_INTRA_PRED_MODES);
|
||||
b->y_mode = dav1d_msac_decode_symbol_adapt16(&ts->msac, ymode_cdf,
|
||||
N_INTRA_PRED_MODES);
|
||||
if (DEBUG_BLOCK_INFO)
|
||||
printf("Post-ymode[%d]: r=%d\n", b->y_mode, ts->msac.rng);
|
||||
|
||||
|
@ -1028,7 +1028,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
b->y_mode <= VERT_LEFT_PRED)
|
||||
{
|
||||
uint16_t *const acdf = ts->cdf.m.angle_delta[b->y_mode - VERT_PRED];
|
||||
const int angle = dav1d_msac_decode_symbol_adapt(&ts->msac, acdf, 7);
|
||||
const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 7);
|
||||
b->y_angle = angle - 3;
|
||||
} else {
|
||||
b->y_angle = 0;
|
||||
|
@ -1038,20 +1038,20 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
const int cfl_allowed = f->frame_hdr->segmentation.lossless[b->seg_id] ?
|
||||
cbw4 == 1 && cbh4 == 1 : !!(cfl_allowed_mask & (1 << bs));
|
||||
uint16_t *const uvmode_cdf = ts->cdf.m.uv_mode[cfl_allowed][b->y_mode];
|
||||
b->uv_mode = dav1d_msac_decode_symbol_adapt(&ts->msac, uvmode_cdf,
|
||||
b->uv_mode = dav1d_msac_decode_symbol_adapt16(&ts->msac, uvmode_cdf,
|
||||
N_UV_INTRA_PRED_MODES - !cfl_allowed);
|
||||
if (DEBUG_BLOCK_INFO)
|
||||
printf("Post-uvmode[%d]: r=%d\n", b->uv_mode, ts->msac.rng);
|
||||
|
||||
if (b->uv_mode == CFL_PRED) {
|
||||
#define SIGN(a) (!!(a) + ((a) > 0))
|
||||
const int sign = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
const int sign = dav1d_msac_decode_symbol_adapt8(&ts->msac,
|
||||
ts->cdf.m.cfl_sign, 8) + 1;
|
||||
const int sign_u = sign * 0x56 >> 8, sign_v = sign - sign_u * 3;
|
||||
assert(sign_u == sign / 3);
|
||||
if (sign_u) {
|
||||
const int ctx = (sign_u == 2) * 3 + sign_v;
|
||||
b->cfl_alpha[0] = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
b->cfl_alpha[0] = dav1d_msac_decode_symbol_adapt16(&ts->msac,
|
||||
ts->cdf.m.cfl_alpha[ctx], 16) + 1;
|
||||
if (sign_u == 1) b->cfl_alpha[0] = -b->cfl_alpha[0];
|
||||
} else {
|
||||
|
@ -1059,7 +1059,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
}
|
||||
if (sign_v) {
|
||||
const int ctx = (sign_v == 2) * 3 + sign_u;
|
||||
b->cfl_alpha[1] = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
b->cfl_alpha[1] = dav1d_msac_decode_symbol_adapt16(&ts->msac,
|
||||
ts->cdf.m.cfl_alpha[ctx], 16) + 1;
|
||||
if (sign_v == 1) b->cfl_alpha[1] = -b->cfl_alpha[1];
|
||||
} else {
|
||||
|
@ -1073,7 +1073,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
b->uv_mode <= VERT_LEFT_PRED)
|
||||
{
|
||||
uint16_t *const acdf = ts->cdf.m.angle_delta[b->uv_mode - VERT_PRED];
|
||||
const int angle = dav1d_msac_decode_symbol_adapt(&ts->msac, acdf, 7);
|
||||
const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 7);
|
||||
b->uv_angle = angle - 3;
|
||||
} else {
|
||||
b->uv_angle = 0;
|
||||
|
@ -1113,7 +1113,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
ts->cdf.m.use_filter_intra[bs]);
|
||||
if (is_filter) {
|
||||
b->y_mode = FILTER_PRED;
|
||||
b->y_angle = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
b->y_angle = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
ts->cdf.m.filter_intra, 5);
|
||||
}
|
||||
if (DEBUG_BLOCK_INFO)
|
||||
|
@ -1156,7 +1156,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE && t_dim->max > TX_4X4) {
|
||||
const int tctx = get_tx_ctx(t->a, &t->l, t_dim, by4, bx4);
|
||||
uint16_t *const tx_cdf = ts->cdf.m.txsz[t_dim->max - 1][tctx];
|
||||
int depth = dav1d_msac_decode_symbol_adapt(&ts->msac, tx_cdf,
|
||||
int depth = dav1d_msac_decode_symbol_adapt4(&ts->msac, tx_cdf,
|
||||
imin(t_dim->max + 1, 3));
|
||||
|
||||
while (depth--) {
|
||||
|
@ -1474,7 +1474,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
ts->tiling.col_end, ts->tiling.row_start,
|
||||
ts->tiling.row_end, f->libaom_cm);
|
||||
|
||||
b->inter_mode = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
b->inter_mode = dav1d_msac_decode_symbol_adapt8(&ts->msac,
|
||||
ts->cdf.m.comp_inter_mode[ctx],
|
||||
N_COMP_INTER_PRED_MODES);
|
||||
if (DEBUG_BLOCK_INFO)
|
||||
|
@ -1583,7 +1583,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
dav1d_msac_decode_bool_adapt(&ts->msac,
|
||||
ts->cdf.m.wedge_comp[ctx]);
|
||||
if (b->comp_type == COMP_INTER_WEDGE)
|
||||
b->wedge_idx = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
b->wedge_idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
|
||||
ts->cdf.m.wedge_idx[ctx], 16);
|
||||
} else {
|
||||
b->comp_type = COMP_INTER_SEG;
|
||||
|
@ -1737,7 +1737,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
dav1d_msac_decode_bool_adapt(&ts->msac,
|
||||
ts->cdf.m.interintra[ii_sz_grp]))
|
||||
{
|
||||
b->interintra_mode = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
b->interintra_mode = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
ts->cdf.m.interintra_mode[ii_sz_grp],
|
||||
N_INTER_INTRA_PRED_MODES);
|
||||
const int wedge_ctx = dav1d_wedge_ctx_lut[bs];
|
||||
|
@ -1745,7 +1745,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
dav1d_msac_decode_bool_adapt(&ts->msac,
|
||||
ts->cdf.m.interintra_wedge[wedge_ctx]);
|
||||
if (b->interintra_type == INTER_INTRA_WEDGE)
|
||||
b->wedge_idx = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
b->wedge_idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
|
||||
ts->cdf.m.wedge_idx[wedge_ctx], 16);
|
||||
} else {
|
||||
b->interintra_type = INTER_INTRA_NONE;
|
||||
|
@ -1778,7 +1778,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
f->frame_hdr->warp_motion && (mask[0] | mask[1]);
|
||||
|
||||
b->motion_mode = allow_warp ?
|
||||
dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
ts->cdf.m.motion_mode[bs], 3) :
|
||||
dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.obmc[bs]);
|
||||
if (b->motion_mode == MM_WARP) {
|
||||
|
@ -1817,7 +1817,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
const int comp = b->comp_type != COMP_INTER_NONE;
|
||||
const int ctx1 = get_filter_ctx(t->a, &t->l, comp, 0, b->ref[0],
|
||||
by4, bx4);
|
||||
filter[0] = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
filter[0] = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
ts->cdf.m.filter[0][ctx1],
|
||||
DAV1D_N_SWITCHABLE_FILTERS);
|
||||
if (f->seq_hdr->dual_filter) {
|
||||
|
@ -1826,7 +1826,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
if (DEBUG_BLOCK_INFO)
|
||||
printf("Post-subpel_filter1[%d,ctx=%d]: r=%d\n",
|
||||
filter[0], ctx1, ts->msac.rng);
|
||||
filter[1] = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
filter[1] = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
ts->cdf.m.filter[1][ctx2],
|
||||
DAV1D_N_SWITCHABLE_FILTERS);
|
||||
if (DEBUG_BLOCK_INFO)
|
||||
|
@ -2021,7 +2021,7 @@ static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl,
|
|||
} else {
|
||||
const unsigned n_part = bl == BL_8X8 ? N_SUB8X8_PARTITIONS :
|
||||
bl == BL_128X128 ? N_PARTITIONS - 2 : N_PARTITIONS;
|
||||
bp = dav1d_msac_decode_symbol_adapt(&t->ts->msac, pc, n_part);
|
||||
bp = dav1d_msac_decode_symbol_adapt16(&t->ts->msac, pc, n_part);
|
||||
if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I422 &&
|
||||
(bp == PARTITION_V || bp == PARTITION_V4 ||
|
||||
bp == PARTITION_T_LEFT_SPLIT || bp == PARTITION_T_RIGHT_SPLIT))
|
||||
|
@ -2365,7 +2365,7 @@ static void read_restoration_info(Dav1dTileContext *const t,
|
|||
Dav1dTileState *const ts = t->ts;
|
||||
|
||||
if (frame_type == DAV1D_RESTORATION_SWITCHABLE) {
|
||||
const int filter = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
const int filter = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
ts->cdf.m.restore_switchable, 3);
|
||||
lr->type = filter ? filter == 2 ? DAV1D_RESTORATION_SGRPROJ :
|
||||
DAV1D_RESTORATION_WIENER :
|
||||
|
@ -2692,7 +2692,9 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
|
|||
freep(&f->lf.level);
|
||||
freep(&f->frame_thread.b);
|
||||
f->lf.mask = malloc(f->sb128w * f->sb128h * sizeof(*f->lf.mask));
|
||||
f->lf.level = malloc(f->sb128w * f->sb128h * 32 * 32 *
|
||||
// over-allocate by 3 bytes since some of the SIMD implementations
|
||||
// index this from the level type and can thus over-read by up to 3
|
||||
f->lf.level = malloc(3 + f->sb128w * f->sb128h * 32 * 32 *
|
||||
sizeof(*f->lf.level));
|
||||
if (!f->lf.mask || !f->lf.level) goto error;
|
||||
if (c->n_fc > 1) {
|
||||
|
|
|
@ -45,7 +45,7 @@ typedef void (*itx_1d_fn)(const coef *in, ptrdiff_t in_s,
|
|||
static void NOINLINE
|
||||
inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
|
||||
coef *const coeff, const int eob,
|
||||
const int w, const int h, const int shift1, const int shift2,
|
||||
const int w, const int h, const int shift,
|
||||
const itx_1d_fn first_1d_fn, const itx_1d_fn second_1d_fn,
|
||||
const int has_dconly HIGHBD_DECL_SUFFIX)
|
||||
{
|
||||
|
@ -53,8 +53,7 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
|
|||
assert((h >= 4 || h <= 64) && (w >= 4 || w <= 64));
|
||||
const int is_rect2 = w * 2 == h || h * 2 == w;
|
||||
const int bitdepth = bitdepth_from_max(bitdepth_max);
|
||||
const int rnd1 = (1 << shift1) >> 1;
|
||||
const int rnd2 = (1 << shift2) >> 1;
|
||||
const int rnd = (1 << shift) >> 1;
|
||||
|
||||
if (has_dconly && eob == 0) {
|
||||
int dc = coeff[0];
|
||||
|
@ -62,9 +61,9 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
|
|||
if (is_rect2)
|
||||
dc = (dc * 2896 + 2048) >> 12;
|
||||
dc = (dc * 2896 + 2048) >> 12;
|
||||
dc = (dc + rnd1) >> shift1;
|
||||
dc = (dc + rnd) >> shift;
|
||||
dc = (dc * 2896 + 2048) >> 12;
|
||||
dc = (dc + rnd2) >> shift2;
|
||||
dc = (dc + 8) >> 4;
|
||||
for (j = 0; j < h; j++)
|
||||
for (i = 0; i < w; i++)
|
||||
dst[i + j * PXSTRIDE(stride)] =
|
||||
|
@ -93,9 +92,9 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
|
|||
}
|
||||
for (j = 0; j < w; j++)
|
||||
#if BITDEPTH == 8
|
||||
tmp[i * w + j] = (tmp[i * w + j] + (rnd1)) >> shift1;
|
||||
tmp[i * w + j] = (tmp[i * w + j] + rnd) >> shift;
|
||||
#else
|
||||
tmp[i * w + j] = iclip((tmp[i * w + j] + (rnd1)) >> shift1,
|
||||
tmp[i * w + j] = iclip((tmp[i * w + j] + rnd) >> shift,
|
||||
-col_clip_max - 1, col_clip_max);
|
||||
#endif
|
||||
}
|
||||
|
@ -106,12 +105,12 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
|
|||
for (j = 0; j < h; j++)
|
||||
dst[i + j * PXSTRIDE(stride)] =
|
||||
iclip_pixel(dst[i + j * PXSTRIDE(stride)] +
|
||||
((out[j] + (rnd2)) >> shift2));
|
||||
((out[j] + 8) >> 4));
|
||||
}
|
||||
memset(coeff, 0, sizeof(*coeff) * sh * sw);
|
||||
}
|
||||
|
||||
#define inv_txfm_fn(type1, type2, w, h, shift1, shift2, has_dconly) \
|
||||
#define inv_txfm_fn(type1, type2, w, h, shift, has_dconly) \
|
||||
static void \
|
||||
inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
|
||||
const ptrdiff_t stride, \
|
||||
|
@ -119,57 +118,57 @@ inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
|
|||
const int eob \
|
||||
HIGHBD_DECL_SUFFIX) \
|
||||
{ \
|
||||
inv_txfm_add_c(dst, stride, coeff, eob, w, h, shift1, shift2, \
|
||||
inv_txfm_add_c(dst, stride, coeff, eob, w, h, shift, \
|
||||
inv_##type1##w##_1d, inv_##type2##h##_1d, has_dconly \
|
||||
HIGHBD_TAIL_SUFFIX); \
|
||||
}
|
||||
|
||||
#define inv_txfm_fn64(w, h, shift1, shift2) \
|
||||
inv_txfm_fn(dct, dct, w, h, shift1, shift2, 1)
|
||||
#define inv_txfm_fn64(w, h, shift) \
|
||||
inv_txfm_fn(dct, dct, w, h, shift, 1)
|
||||
|
||||
#define inv_txfm_fn32(w, h, shift1, shift2) \
|
||||
inv_txfm_fn64(w, h, shift1, shift2) \
|
||||
inv_txfm_fn(identity, identity, w, h, shift1, shift2, 0)
|
||||
#define inv_txfm_fn32(w, h, shift) \
|
||||
inv_txfm_fn64(w, h, shift) \
|
||||
inv_txfm_fn(identity, identity, w, h, shift, 0)
|
||||
|
||||
#define inv_txfm_fn16(w, h, shift1, shift2) \
|
||||
inv_txfm_fn32(w, h, shift1, shift2) \
|
||||
inv_txfm_fn(adst, dct, w, h, shift1, shift2, 0) \
|
||||
inv_txfm_fn(dct, adst, w, h, shift1, shift2, 0) \
|
||||
inv_txfm_fn(adst, adst, w, h, shift1, shift2, 0) \
|
||||
inv_txfm_fn(dct, flipadst, w, h, shift1, shift2, 0) \
|
||||
inv_txfm_fn(flipadst, dct, w, h, shift1, shift2, 0) \
|
||||
inv_txfm_fn(adst, flipadst, w, h, shift1, shift2, 0) \
|
||||
inv_txfm_fn(flipadst, adst, w, h, shift1, shift2, 0) \
|
||||
inv_txfm_fn(flipadst, flipadst, w, h, shift1, shift2, 0) \
|
||||
inv_txfm_fn(identity, dct, w, h, shift1, shift2, 0) \
|
||||
inv_txfm_fn(dct, identity, w, h, shift1, shift2, 0) \
|
||||
#define inv_txfm_fn16(w, h, shift) \
|
||||
inv_txfm_fn32(w, h, shift) \
|
||||
inv_txfm_fn(adst, dct, w, h, shift, 0) \
|
||||
inv_txfm_fn(dct, adst, w, h, shift, 0) \
|
||||
inv_txfm_fn(adst, adst, w, h, shift, 0) \
|
||||
inv_txfm_fn(dct, flipadst, w, h, shift, 0) \
|
||||
inv_txfm_fn(flipadst, dct, w, h, shift, 0) \
|
||||
inv_txfm_fn(adst, flipadst, w, h, shift, 0) \
|
||||
inv_txfm_fn(flipadst, adst, w, h, shift, 0) \
|
||||
inv_txfm_fn(flipadst, flipadst, w, h, shift, 0) \
|
||||
inv_txfm_fn(identity, dct, w, h, shift, 0) \
|
||||
inv_txfm_fn(dct, identity, w, h, shift, 0) \
|
||||
|
||||
#define inv_txfm_fn84(w, h, shift1, shift2) \
|
||||
inv_txfm_fn16(w, h, shift1, shift2) \
|
||||
inv_txfm_fn(identity, flipadst, w, h, shift1, shift2, 0) \
|
||||
inv_txfm_fn(flipadst, identity, w, h, shift1, shift2, 0) \
|
||||
inv_txfm_fn(identity, adst, w, h, shift1, shift2, 0) \
|
||||
inv_txfm_fn(adst, identity, w, h, shift1, shift2, 0) \
|
||||
#define inv_txfm_fn84(w, h, shift) \
|
||||
inv_txfm_fn16(w, h, shift) \
|
||||
inv_txfm_fn(identity, flipadst, w, h, shift, 0) \
|
||||
inv_txfm_fn(flipadst, identity, w, h, shift, 0) \
|
||||
inv_txfm_fn(identity, adst, w, h, shift, 0) \
|
||||
inv_txfm_fn(adst, identity, w, h, shift, 0) \
|
||||
|
||||
inv_txfm_fn84( 4, 4, 0, 4)
|
||||
inv_txfm_fn84( 4, 8, 0, 4)
|
||||
inv_txfm_fn84( 4, 16, 1, 4)
|
||||
inv_txfm_fn84( 8, 4, 0, 4)
|
||||
inv_txfm_fn84( 8, 8, 1, 4)
|
||||
inv_txfm_fn84( 8, 16, 1, 4)
|
||||
inv_txfm_fn32( 8, 32, 2, 4)
|
||||
inv_txfm_fn84(16, 4, 1, 4)
|
||||
inv_txfm_fn84(16, 8, 1, 4)
|
||||
inv_txfm_fn16(16, 16, 2, 4)
|
||||
inv_txfm_fn32(16, 32, 1, 4)
|
||||
inv_txfm_fn64(16, 64, 2, 4)
|
||||
inv_txfm_fn32(32, 8, 2, 4)
|
||||
inv_txfm_fn32(32, 16, 1, 4)
|
||||
inv_txfm_fn32(32, 32, 2, 4)
|
||||
inv_txfm_fn64(32, 64, 1, 4)
|
||||
inv_txfm_fn64(64, 16, 2, 4)
|
||||
inv_txfm_fn64(64, 32, 1, 4)
|
||||
inv_txfm_fn64(64, 64, 2, 4)
|
||||
inv_txfm_fn84( 4, 4, 0)
|
||||
inv_txfm_fn84( 4, 8, 0)
|
||||
inv_txfm_fn84( 4, 16, 1)
|
||||
inv_txfm_fn84( 8, 4, 0)
|
||||
inv_txfm_fn84( 8, 8, 1)
|
||||
inv_txfm_fn84( 8, 16, 1)
|
||||
inv_txfm_fn32( 8, 32, 2)
|
||||
inv_txfm_fn84(16, 4, 1)
|
||||
inv_txfm_fn84(16, 8, 1)
|
||||
inv_txfm_fn16(16, 16, 2)
|
||||
inv_txfm_fn32(16, 32, 1)
|
||||
inv_txfm_fn64(16, 64, 2)
|
||||
inv_txfm_fn32(32, 8, 2)
|
||||
inv_txfm_fn32(32, 16, 1)
|
||||
inv_txfm_fn32(32, 32, 2)
|
||||
inv_txfm_fn64(32, 64, 1)
|
||||
inv_txfm_fn64(64, 16, 2)
|
||||
inv_txfm_fn64(64, 32, 1)
|
||||
inv_txfm_fn64(64, 64, 2)
|
||||
|
||||
static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
|
||||
coef *const coeff, const int eob
|
||||
|
|
|
@ -53,6 +53,7 @@ typedef struct Dav1dLoopFilterDSPContext {
|
|||
} Dav1dLoopFilterDSPContext;
|
||||
|
||||
bitfn_decls(void dav1d_loop_filter_dsp_init, Dav1dLoopFilterDSPContext *c);
|
||||
bitfn_decls(void dav1d_loop_filter_dsp_init_arm, Dav1dLoopFilterDSPContext *c);
|
||||
bitfn_decls(void dav1d_loop_filter_dsp_init_x86, Dav1dLoopFilterDSPContext *c);
|
||||
|
||||
#endif /* DAV1D_SRC_LOOPFILTER_H */
|
||||
|
|
|
@ -250,7 +250,11 @@ void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) {
|
|||
c->loop_filter_sb[1][0] = loop_filter_h_sb128uv_c;
|
||||
c->loop_filter_sb[1][1] = loop_filter_v_sb128uv_c;
|
||||
|
||||
#if HAVE_ASM && ARCH_X86
|
||||
#if HAVE_ASM
|
||||
#if ARCH_AARCH64 || ARCH_ARM
|
||||
bitfn(dav1d_loop_filter_dsp_init_arm)(c);
|
||||
#elif ARCH_X86
|
||||
bitfn(dav1d_loop_filter_dsp_init_x86)(c);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -86,12 +86,14 @@ if is_asm_enabled
|
|||
)
|
||||
libdav1d_tmpl_sources += files(
|
||||
'arm/cdef_init_tmpl.c',
|
||||
'arm/loopfilter_init_tmpl.c',
|
||||
'arm/looprestoration_init_tmpl.c',
|
||||
'arm/mc_init_tmpl.c',
|
||||
)
|
||||
if host_machine.cpu_family() == 'aarch64'
|
||||
libdav1d_sources += files(
|
||||
'arm/64/cdef.S',
|
||||
'arm/64/loopfilter.S',
|
||||
'arm/64/looprestoration.S',
|
||||
'arm/64/mc.S',
|
||||
)
|
||||
|
@ -118,20 +120,31 @@ if is_asm_enabled
|
|||
|
||||
# NASM source files
|
||||
libdav1d_sources_asm = files(
|
||||
'x86/cdef.asm',
|
||||
'x86/cdef_ssse3.asm',
|
||||
'x86/cpuid.asm',
|
||||
'x86/ipred.asm',
|
||||
'x86/itx.asm',
|
||||
'x86/loopfilter.asm',
|
||||
'x86/looprestoration.asm',
|
||||
'x86/looprestoration_ssse3.asm',
|
||||
'x86/mc.asm',
|
||||
'x86/mc_ssse3.asm',
|
||||
'x86/itx_ssse3.asm',
|
||||
'x86/ipred_ssse3.asm',
|
||||
'x86/msac.asm',
|
||||
)
|
||||
|
||||
if dav1d_bitdepths.contains('8')
|
||||
libdav1d_sources_asm += files(
|
||||
'x86/cdef.asm',
|
||||
'x86/cdef_sse.asm',
|
||||
'x86/ipred.asm',
|
||||
'x86/ipred_ssse3.asm',
|
||||
'x86/itx.asm',
|
||||
'x86/itx_ssse3.asm',
|
||||
'x86/loopfilter.asm',
|
||||
'x86/looprestoration.asm',
|
||||
'x86/looprestoration_ssse3.asm',
|
||||
'x86/mc.asm',
|
||||
'x86/mc_ssse3.asm',
|
||||
)
|
||||
endif
|
||||
|
||||
if dav1d_bitdepths.contains('16')
|
||||
libdav1d_sources_asm += files(
|
||||
)
|
||||
endif
|
||||
|
||||
# Compile the ASM sources with NASM
|
||||
libdav1d_nasm_objs = nasm_gen.process(libdav1d_sources_asm)
|
||||
endif
|
||||
|
@ -139,8 +152,10 @@ endif
|
|||
|
||||
|
||||
|
||||
api_export_flags = []
|
||||
|
||||
#
|
||||
# Windows .rc file
|
||||
# Windows .rc file and API export flags
|
||||
#
|
||||
|
||||
if host_machine.system() == 'windows' and get_option('default_library') != 'static'
|
||||
|
@ -162,6 +177,8 @@ if host_machine.system() == 'windows' and get_option('default_library') != 'stat
|
|||
)
|
||||
|
||||
libdav1d_rc_obj = winmod.compile_resources(rc_file)
|
||||
|
||||
api_export_flags = ['-DDAV1D_BUILDING_DLL']
|
||||
else
|
||||
libdav1d_rc_obj = []
|
||||
endif
|
||||
|
@ -180,7 +197,7 @@ libdav1d_entrypoints_objs = static_library('dav1d_entrypoint',
|
|||
|
||||
include_directories : dav1d_inc_dirs,
|
||||
dependencies: [stdatomic_dependency],
|
||||
c_args : [stackalign_flag, stackrealign_flag],
|
||||
c_args : [stackalign_flag, stackrealign_flag, api_export_flags],
|
||||
install : false,
|
||||
build_by_default : false,
|
||||
).extract_all_objects()
|
||||
|
@ -222,7 +239,7 @@ libdav1d = library('dav1d',
|
|||
thread_dependency,
|
||||
thread_compat_dep,
|
||||
],
|
||||
c_args : [stackalign_flag],
|
||||
c_args : [stackalign_flag, api_export_flags],
|
||||
version : dav1d_soname_version,
|
||||
soversion : dav1d_soversion,
|
||||
install : true,
|
||||
|
|
|
@ -58,8 +58,8 @@ static inline void ctx_refill(MsacContext *s) {
|
|||
* necessary), and stores them back in the decoder context.
|
||||
* dif: The new value of dif.
|
||||
* rng: The new value of the range. */
|
||||
static inline void ctx_norm(MsacContext *s, ec_win dif, uint32_t rng) {
|
||||
const uint16_t d = 15 - (31 ^ clz(rng));
|
||||
static inline void ctx_norm(MsacContext *s, ec_win dif, unsigned rng) {
|
||||
const int d = 15 ^ (31 ^ clz(rng));
|
||||
assert(rng <= 65535U);
|
||||
s->cnt -= d;
|
||||
s->dif = ((dif + 1) << d) - 1; /* Shift in 1s in the LSBs */
|
||||
|
@ -69,18 +69,17 @@ static inline void ctx_norm(MsacContext *s, ec_win dif, uint32_t rng) {
|
|||
}
|
||||
|
||||
unsigned dav1d_msac_decode_bool_equi(MsacContext *const s) {
|
||||
ec_win v, vw, dif = s->dif;
|
||||
uint16_t r = s->rng;
|
||||
unsigned ret;
|
||||
ec_win vw, dif = s->dif;
|
||||
unsigned ret, v, r = s->rng;
|
||||
assert((dif >> (EC_WIN_SIZE - 16)) < r);
|
||||
// When the probability is 1/2, f = 16384 >> EC_PROB_SHIFT = 256 and we can
|
||||
// replace the multiply with a simple shift.
|
||||
v = ((r >> 8) << 7) + EC_MIN_PROB;
|
||||
vw = v << (EC_WIN_SIZE - 16);
|
||||
vw = (ec_win)v << (EC_WIN_SIZE - 16);
|
||||
ret = dif >= vw;
|
||||
dif -= ret*vw;
|
||||
v += ret*(r - 2*v);
|
||||
ctx_norm(s, dif, (unsigned) v);
|
||||
ctx_norm(s, dif, v);
|
||||
return !ret;
|
||||
}
|
||||
|
||||
|
@ -88,59 +87,57 @@ unsigned dav1d_msac_decode_bool_equi(MsacContext *const s) {
|
|||
* f: The probability that the bit is one
|
||||
* Return: The value decoded (0 or 1). */
|
||||
unsigned dav1d_msac_decode_bool(MsacContext *const s, const unsigned f) {
|
||||
ec_win v, vw, dif = s->dif;
|
||||
uint16_t r = s->rng;
|
||||
unsigned ret;
|
||||
ec_win vw, dif = s->dif;
|
||||
unsigned ret, v, r = s->rng;
|
||||
assert((dif >> (EC_WIN_SIZE - 16)) < r);
|
||||
v = ((r >> 8) * (f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)) + EC_MIN_PROB;
|
||||
vw = v << (EC_WIN_SIZE - 16);
|
||||
vw = (ec_win)v << (EC_WIN_SIZE - 16);
|
||||
ret = dif >= vw;
|
||||
dif -= ret*vw;
|
||||
v += ret*(r - 2*v);
|
||||
ctx_norm(s, dif, (unsigned) v);
|
||||
ctx_norm(s, dif, v);
|
||||
return !ret;
|
||||
}
|
||||
|
||||
unsigned dav1d_msac_decode_bools(MsacContext *const c, const unsigned l) {
|
||||
int v = 0;
|
||||
for (int n = (int) l - 1; n >= 0; n--)
|
||||
v = (v << 1) | dav1d_msac_decode_bool_equi(c);
|
||||
unsigned dav1d_msac_decode_bools(MsacContext *const s, unsigned n) {
|
||||
unsigned v = 0;
|
||||
while (n--)
|
||||
v = (v << 1) | dav1d_msac_decode_bool_equi(s);
|
||||
return v;
|
||||
}
|
||||
|
||||
int dav1d_msac_decode_subexp(MsacContext *const c, const int ref,
|
||||
int dav1d_msac_decode_subexp(MsacContext *const s, const int ref,
|
||||
const int n, const unsigned k)
|
||||
{
|
||||
int i = 0;
|
||||
int a = 0;
|
||||
int b = k;
|
||||
while ((2 << b) < n) {
|
||||
if (!dav1d_msac_decode_bool_equi(c)) break;
|
||||
if (!dav1d_msac_decode_bool_equi(s)) break;
|
||||
b = k + i++;
|
||||
a = (1 << b);
|
||||
}
|
||||
const unsigned v = dav1d_msac_decode_bools(c, b) + a;
|
||||
const unsigned v = dav1d_msac_decode_bools(s, b) + a;
|
||||
return ref * 2 <= n ? inv_recenter(ref, v) :
|
||||
n - 1 - inv_recenter(n - 1 - ref, v);
|
||||
}
|
||||
|
||||
int dav1d_msac_decode_uniform(MsacContext *const c, const unsigned n) {
|
||||
int dav1d_msac_decode_uniform(MsacContext *const s, const unsigned n) {
|
||||
assert(n > 0);
|
||||
const int l = ulog2(n) + 1;
|
||||
assert(l > 1);
|
||||
const unsigned m = (1 << l) - n;
|
||||
const unsigned v = dav1d_msac_decode_bools(c, l - 1);
|
||||
return v < m ? v : (v << 1) - m + dav1d_msac_decode_bool_equi(c);
|
||||
const unsigned v = dav1d_msac_decode_bools(s, l - 1);
|
||||
return v < m ? v : (v << 1) - m + dav1d_msac_decode_bool_equi(s);
|
||||
}
|
||||
|
||||
/* Decodes a symbol given an inverse cumulative distribution function (CDF)
|
||||
* table in Q15. */
|
||||
static unsigned decode_symbol(MsacContext *const s, const uint16_t *const cdf,
|
||||
const unsigned n_symbols)
|
||||
const size_t n_symbols)
|
||||
{
|
||||
ec_win u, v = s->rng, r = s->rng >> 8;
|
||||
const ec_win c = s->dif >> (EC_WIN_SIZE - 16);
|
||||
unsigned ret = 0;
|
||||
const unsigned c = s->dif >> (EC_WIN_SIZE - 16);
|
||||
unsigned u, v = s->rng, r = s->rng >> 8, ret = 0;
|
||||
|
||||
assert(!cdf[n_symbols - 1]);
|
||||
|
||||
|
@ -148,44 +145,39 @@ static unsigned decode_symbol(MsacContext *const s, const uint16_t *const cdf,
|
|||
u = v;
|
||||
v = r * (cdf[ret++] >> EC_PROB_SHIFT);
|
||||
v >>= 7 - EC_PROB_SHIFT;
|
||||
v += EC_MIN_PROB * (n_symbols - ret);
|
||||
v += EC_MIN_PROB * (int) (n_symbols - ret);
|
||||
} while (c < v);
|
||||
|
||||
assert(u <= s->rng);
|
||||
|
||||
ctx_norm(s, s->dif - (v << (EC_WIN_SIZE - 16)), (unsigned) (u - v));
|
||||
ctx_norm(s, s->dif - ((ec_win)v << (EC_WIN_SIZE - 16)), u - v);
|
||||
return ret - 1;
|
||||
}
|
||||
|
||||
static void update_cdf(uint16_t *const cdf, const unsigned val,
|
||||
const unsigned n_symbols)
|
||||
unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *const s,
|
||||
uint16_t *const cdf,
|
||||
const size_t n_symbols)
|
||||
{
|
||||
const unsigned count = cdf[n_symbols];
|
||||
const int rate = ((count >> 4) | 4) + (n_symbols > 3);
|
||||
unsigned i;
|
||||
for (i = 0; i < val; i++)
|
||||
cdf[i] += (32768 - cdf[i]) >> rate;
|
||||
for (; i < n_symbols - 1; i++)
|
||||
cdf[i] -= cdf[i] >> rate;
|
||||
cdf[n_symbols] = count + (count < 32);
|
||||
}
|
||||
|
||||
unsigned dav1d_msac_decode_symbol_adapt(MsacContext *const c,
|
||||
uint16_t *const cdf,
|
||||
const unsigned n_symbols)
|
||||
{
|
||||
const unsigned val = decode_symbol(c, cdf, n_symbols);
|
||||
if(c->allow_update_cdf)
|
||||
update_cdf(cdf, val, n_symbols);
|
||||
const unsigned val = decode_symbol(s, cdf, n_symbols);
|
||||
if (s->allow_update_cdf) {
|
||||
const unsigned count = cdf[n_symbols];
|
||||
const int rate = ((count >> 4) | 4) + (n_symbols > 3);
|
||||
unsigned i;
|
||||
for (i = 0; i < val; i++)
|
||||
cdf[i] += (32768 - cdf[i]) >> rate;
|
||||
for (; i < n_symbols - 1; i++)
|
||||
cdf[i] -= cdf[i] >> rate;
|
||||
cdf[n_symbols] = count + (count < 32);
|
||||
}
|
||||
return val;
|
||||
}
|
||||
|
||||
unsigned dav1d_msac_decode_bool_adapt(MsacContext *const c,
|
||||
unsigned dav1d_msac_decode_bool_adapt(MsacContext *const s,
|
||||
uint16_t *const cdf)
|
||||
{
|
||||
const unsigned bit = dav1d_msac_decode_bool(c, *cdf);
|
||||
const unsigned bit = dav1d_msac_decode_bool(s, *cdf);
|
||||
|
||||
if(c->allow_update_cdf){
|
||||
if (s->allow_update_cdf) {
|
||||
// update_cdf() specialized for boolean CDFs
|
||||
const unsigned count = cdf[1];
|
||||
const int rate = (count >> 4) | 4;
|
||||
|
|
|
@ -38,20 +38,37 @@ typedef struct MsacContext {
|
|||
const uint8_t *buf_pos;
|
||||
const uint8_t *buf_end;
|
||||
ec_win dif;
|
||||
uint16_t rng;
|
||||
unsigned rng;
|
||||
int cnt;
|
||||
int allow_update_cdf;
|
||||
} MsacContext;
|
||||
|
||||
void dav1d_msac_init(MsacContext *c, const uint8_t *data, size_t sz,
|
||||
void dav1d_msac_init(MsacContext *s, const uint8_t *data, size_t sz,
|
||||
int disable_cdf_update_flag);
|
||||
unsigned dav1d_msac_decode_symbol_adapt(MsacContext *s, uint16_t *cdf,
|
||||
const unsigned n_symbols);
|
||||
unsigned dav1d_msac_decode_bool_equi(MsacContext *const s);
|
||||
unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *s, uint16_t *cdf,
|
||||
size_t n_symbols);
|
||||
unsigned dav1d_msac_decode_bool_equi(MsacContext *s);
|
||||
unsigned dav1d_msac_decode_bool(MsacContext *s, unsigned f);
|
||||
unsigned dav1d_msac_decode_bool_adapt(MsacContext *s, uint16_t *cdf);
|
||||
unsigned dav1d_msac_decode_bools(MsacContext *c, unsigned l);
|
||||
int dav1d_msac_decode_subexp(MsacContext *c, int ref, int n, unsigned k);
|
||||
int dav1d_msac_decode_uniform(MsacContext *c, unsigned n);
|
||||
unsigned dav1d_msac_decode_bools(MsacContext *s, unsigned n);
|
||||
int dav1d_msac_decode_subexp(MsacContext *s, int ref, int n, unsigned k);
|
||||
int dav1d_msac_decode_uniform(MsacContext *s, unsigned n);
|
||||
|
||||
/* Supported n_symbols ranges: adapt4: 1-5, adapt8: 1-8, adapt16: 4-16 */
|
||||
#if ARCH_X86_64 && HAVE_ASM
|
||||
unsigned dav1d_msac_decode_symbol_adapt4_sse2(MsacContext *s, uint16_t *cdf,
|
||||
size_t n_symbols);
|
||||
unsigned dav1d_msac_decode_symbol_adapt8_sse2(MsacContext *s, uint16_t *cdf,
|
||||
size_t n_symbols);
|
||||
unsigned dav1d_msac_decode_symbol_adapt16_sse2(MsacContext *s, uint16_t *cdf,
|
||||
size_t n_symbols);
|
||||
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_sse2
|
||||
#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_sse2
|
||||
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2
|
||||
#else
|
||||
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt_c
|
||||
#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt_c
|
||||
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt_c
|
||||
#endif
|
||||
|
||||
#endif /* DAV1D_SRC_MSAC_H */
|
||||
|
|
|
@ -107,7 +107,9 @@ static int decode_coefs(Dav1dTileContext *const t,
|
|||
uint16_t *const txtp_cdf = intra ?
|
||||
ts->cdf.m.txtp_intra[set_idx][t_dim->min][y_mode_nofilt] :
|
||||
ts->cdf.m.txtp_inter[set_idx][t_dim->min];
|
||||
idx = dav1d_msac_decode_symbol_adapt(&ts->msac, txtp_cdf, set_cnt);
|
||||
idx = (set_cnt <= 8 ? dav1d_msac_decode_symbol_adapt8 :
|
||||
dav1d_msac_decode_symbol_adapt16)(&ts->msac, txtp_cdf, set_cnt);
|
||||
|
||||
if (dbg)
|
||||
printf("Post-txtp[%d->%d][%d->%d][%d][%d->%d]: r=%d\n",
|
||||
set, set_idx, tx, t_dim->min, intra ? (int)y_mode_nofilt : -1,
|
||||
|
@ -122,19 +124,19 @@ static int decode_coefs(Dav1dTileContext *const t,
|
|||
const enum TxClass tx_class = dav1d_tx_type_class[*txtp];
|
||||
const int is_1d = tx_class != TX_CLASS_2D;
|
||||
switch (tx2dszctx) {
|
||||
#define case_sz(sz, bin) \
|
||||
#define case_sz(sz, bin, ns) \
|
||||
case sz: { \
|
||||
uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma][is_1d]; \
|
||||
eob_bin = dav1d_msac_decode_symbol_adapt(&ts->msac, eob_bin_cdf, 5 + sz); \
|
||||
eob_bin = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 5 + sz); \
|
||||
break; \
|
||||
}
|
||||
case_sz(0, 16);
|
||||
case_sz(1, 32);
|
||||
case_sz(2, 64);
|
||||
case_sz(3, 128);
|
||||
case_sz(4, 256);
|
||||
case_sz(5, 512);
|
||||
case_sz(6, 1024);
|
||||
case_sz(0, 16, 4);
|
||||
case_sz(1, 32, 8);
|
||||
case_sz(2, 64, 8);
|
||||
case_sz(3, 128, 8);
|
||||
case_sz(4, 256, 16);
|
||||
case_sz(5, 512, 16);
|
||||
case_sz(6, 1024, 16);
|
||||
#undef case_sz
|
||||
}
|
||||
if (dbg)
|
||||
|
@ -179,8 +181,8 @@ static int decode_coefs(Dav1dTileContext *const t,
|
|||
uint16_t *const lo_cdf = is_last ?
|
||||
ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma][ctx] :
|
||||
ts->cdf.coef.base_tok[t_dim->ctx][chroma][ctx];
|
||||
int tok = dav1d_msac_decode_symbol_adapt(&ts->msac, lo_cdf,
|
||||
4 - is_last) + is_last;
|
||||
int tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf,
|
||||
4 - is_last) + is_last;
|
||||
if (dbg)
|
||||
printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n",
|
||||
t_dim->ctx, chroma, ctx, i, rc, tok, ts->msac.rng);
|
||||
|
@ -190,7 +192,7 @@ static int decode_coefs(Dav1dTileContext *const t,
|
|||
if (tok == 3) {
|
||||
const int br_ctx = get_br_ctx(levels, rc, tx, tx_class);
|
||||
do {
|
||||
const int tok_br = dav1d_msac_decode_symbol_adapt(&ts->msac,
|
||||
const int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
br_cdf[br_ctx], 4);
|
||||
if (dbg)
|
||||
printf("Post-hi_tok[%d][%d][%d][%d=%d=%d->%d]: r=%d\n",
|
||||
|
|
|
@ -113,7 +113,7 @@ SECTION .text
|
|||
paddw m15, m5
|
||||
%endmacro
|
||||
|
||||
%macro cdef_filter_fn 3 ; w, h, stride
|
||||
%macro CDEF_FILTER 3 ; w, h, stride
|
||||
INIT_YMM avx2
|
||||
%if %1 != 4 || %2 != 8
|
||||
cglobal cdef_filter_%1x%2, 4, 9, 16, 2 * 16 + (%2+4)*%3, \
|
||||
|
@ -135,7 +135,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
|
|||
lea dst4q, [dstq+strideq*4]
|
||||
%endif
|
||||
lea stride3q, [strideq*3]
|
||||
test edged, 2 ; have_right
|
||||
test edgeb, 2 ; have_right
|
||||
jz .no_right
|
||||
pmovzxbw m1, [dstq+strideq*0]
|
||||
pmovzxbw m2, [dstq+strideq*1]
|
||||
|
@ -217,13 +217,13 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
|
|||
|
||||
; top
|
||||
DEFINE_ARGS dst, stride, left, top2, pri, sec, stride3, top1, edge
|
||||
test edged, 4 ; have_top
|
||||
test edgeb, 4 ; have_top
|
||||
jz .no_top
|
||||
mov top1q, [top2q+0*gprsize]
|
||||
mov top2q, [top2q+1*gprsize]
|
||||
test edged, 1 ; have_left
|
||||
test edgeb, 1 ; have_left
|
||||
jz .top_no_left
|
||||
test edged, 2 ; have_right
|
||||
test edgeb, 2 ; have_right
|
||||
jz .top_no_right
|
||||
pmovzxbw m1, [top1q-(%1/2)]
|
||||
pmovzxbw m2, [top2q-(%1/2)]
|
||||
|
@ -239,7 +239,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
|
|||
movd [px-1*%3+%1*2], xm14
|
||||
jmp .top_done
|
||||
.top_no_left:
|
||||
test edged, 2 ; have_right
|
||||
test edgeb, 2 ; have_right
|
||||
jz .top_no_left_right
|
||||
pmovzxbw m1, [top1q]
|
||||
pmovzxbw m2, [top2q]
|
||||
|
@ -272,7 +272,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
|
|||
.top_done:
|
||||
|
||||
; left
|
||||
test edged, 1 ; have_left
|
||||
test edgeb, 1 ; have_left
|
||||
jz .no_left
|
||||
pmovzxbw xm1, [leftq+ 0]
|
||||
%if %2 == 8
|
||||
|
@ -304,12 +304,12 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
|
|||
|
||||
; bottom
|
||||
DEFINE_ARGS dst, stride, dst8, dummy1, pri, sec, stride3, dummy3, edge
|
||||
test edged, 8 ; have_bottom
|
||||
test edgeb, 8 ; have_bottom
|
||||
jz .no_bottom
|
||||
lea dst8q, [dstq+%2*strideq]
|
||||
test edged, 1 ; have_left
|
||||
test edgeb, 1 ; have_left
|
||||
jz .bottom_no_left
|
||||
test edged, 2 ; have_right
|
||||
test edgeb, 2 ; have_right
|
||||
jz .bottom_no_right
|
||||
pmovzxbw m1, [dst8q-(%1/2)]
|
||||
pmovzxbw m2, [dst8q+strideq-(%1/2)]
|
||||
|
@ -328,7 +328,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
|
|||
movd [px+(%2+1)*%3+%1*2], xm14
|
||||
jmp .bottom_done
|
||||
.bottom_no_left:
|
||||
test edged, 2 ; have_right
|
||||
test edgeb, 2 ; have_right
|
||||
jz .bottom_no_left_right
|
||||
pmovzxbw m1, [dst8q]
|
||||
pmovzxbw m2, [dst8q+strideq]
|
||||
|
@ -362,50 +362,49 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
|
|||
|
||||
; actual filter
|
||||
INIT_YMM avx2
|
||||
DEFINE_ARGS dst, stride, pridmp, damping, pri, sec, stride3, secdmp
|
||||
DEFINE_ARGS dst, stride, pridmp, damping, pri, secdmp, stride3, zero
|
||||
%undef edged
|
||||
; register to shuffle values into after packing
|
||||
vbroadcasti128 m12, [shufb_lohi]
|
||||
|
||||
movifnidn prid, prim
|
||||
movifnidn secd, secm
|
||||
mov dampingd, r7m
|
||||
|
||||
mov pridmpd, prid
|
||||
mov secdmpd, secd
|
||||
or pridmpd, 1
|
||||
or secdmpd, 1
|
||||
lzcnt pridmpd, pridmpd
|
||||
lzcnt secdmpd, secdmpd
|
||||
lea pridmpd, [pridmpd+dampingd-31]
|
||||
lea secdmpd, [secdmpd+dampingd-31]
|
||||
xor dampingd, dampingd
|
||||
test pridmpd, pridmpd
|
||||
cmovl pridmpd, dampingd
|
||||
test secdmpd, secdmpd
|
||||
cmovl secdmpd, dampingd
|
||||
lzcnt pridmpd, prid
|
||||
%if UNIX64
|
||||
movd xm0, prid
|
||||
movd xm1, secdmpd
|
||||
%endif
|
||||
lzcnt secdmpd, secdmpm
|
||||
sub dampingd, 31
|
||||
xor zerod, zerod
|
||||
add pridmpd, dampingd
|
||||
cmovl pridmpd, zerod
|
||||
add secdmpd, dampingd
|
||||
cmovl secdmpd, zerod
|
||||
mov [rsp+0], pridmpq ; pri_shift
|
||||
mov [rsp+8], secdmpq ; sec_shift
|
||||
|
||||
DEFINE_ARGS dst, stride, pridmp, table, pri, sec, stride3, secdmp
|
||||
DEFINE_ARGS dst, stride, pridmp, table, pri, secdmp, stride3
|
||||
lea tableq, [tap_table]
|
||||
vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
|
||||
vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
|
||||
|
||||
; pri/sec_taps[k] [4 total]
|
||||
DEFINE_ARGS dst, stride, dummy, table, pri, sec, stride3
|
||||
movd xm0, prid
|
||||
movd xm1, secd
|
||||
DEFINE_ARGS dst, stride, dir, table, pri, sec, stride3
|
||||
%if UNIX64
|
||||
vpbroadcastb m0, xm0 ; pri_strength
|
||||
vpbroadcastb m1, xm1 ; sec_strength
|
||||
%else
|
||||
vpbroadcastb m0, prim
|
||||
vpbroadcastb m1, secm
|
||||
%endif
|
||||
and prid, 1
|
||||
lea priq, [tableq+priq*2+8] ; pri_taps
|
||||
lea secq, [tableq+12] ; sec_taps
|
||||
|
||||
; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
|
||||
DEFINE_ARGS dst, stride, dir, tap, pri, sec, stride3
|
||||
mov dird, r6m
|
||||
lea dirq, [tapq+dirq*2+14]
|
||||
lea dirq, [tableq+dirq*2+14]
|
||||
%if %1*%2*2/mmsize > 1
|
||||
%if %1 == 4
|
||||
DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, h, off, k
|
||||
|
@ -476,9 +475,9 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
|
|||
RET
|
||||
%endmacro
|
||||
|
||||
cdef_filter_fn 8, 8, 32
|
||||
cdef_filter_fn 4, 8, 32
|
||||
cdef_filter_fn 4, 4, 32
|
||||
CDEF_FILTER 8, 8, 32
|
||||
CDEF_FILTER 4, 8, 32
|
||||
CDEF_FILTER 4, 4, 32
|
||||
|
||||
INIT_YMM avx2
|
||||
cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
|
||||
|
@ -614,9 +613,9 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
|
|||
paddw m11, m13 ; partial_sum_alt[3/2] right
|
||||
vbroadcasti128 m13, [div_table+32]
|
||||
paddw m4, m5 ; partial_sum_alt[3/2] left
|
||||
pshuflw m11, m11, q3012
|
||||
punpckhwd m6, m4, m11
|
||||
punpcklwd m4, m11
|
||||
pshuflw m5, m11, q3012
|
||||
punpckhwd m6, m11, m4
|
||||
punpcklwd m4, m5
|
||||
pmaddwd m6, m6
|
||||
pmaddwd m4, m4
|
||||
pmulld m6, m12
|
||||
|
@ -642,14 +641,14 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
|
|||
paddw m6, m7
|
||||
paddw m1, m3 ; partial_sum_alt[0/1] right
|
||||
paddw m5, m6 ; partial_sum_alt[0/1] left
|
||||
pshuflw m1, m1, q3012
|
||||
punpckhwd m6, m5, m1
|
||||
punpcklwd m5, m1
|
||||
pmaddwd m6, m6
|
||||
pshuflw m0, m1, q3012
|
||||
punpckhwd m1, m5
|
||||
punpcklwd m5, m0
|
||||
pmaddwd m1, m1
|
||||
pmaddwd m5, m5
|
||||
pmulld m6, m12
|
||||
pmulld m1, m12
|
||||
pmulld m5, m13
|
||||
paddd m5, m6 ; cost1[a-d] | cost3[a-d]
|
||||
paddd m5, m1 ; cost1[a-d] | cost3[a-d]
|
||||
|
||||
mova xm0, [pd_47130256+ 16]
|
||||
mova m1, [pd_47130256]
|
||||
|
@ -661,11 +660,10 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
|
|||
|
||||
; now find the best cost
|
||||
pmaxsd xm2, xm0, xm1
|
||||
pshufd xm3, xm2, q3232
|
||||
pshufd xm3, xm2, q1032
|
||||
pmaxsd xm2, xm3
|
||||
pshufd xm3, xm2, q1111
|
||||
pmaxsd xm2, xm3
|
||||
pshufd xm2, xm2, q0000 ; best cost
|
||||
pshufd xm3, xm2, q2301
|
||||
pmaxsd xm2, xm3 ; best cost
|
||||
|
||||
; find the idx using minpos
|
||||
; make everything other than the best cost negative via subtraction
|
||||
|
@ -676,7 +674,7 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
|
|||
phminposuw xm3, xm3
|
||||
|
||||
; convert idx to 32-bits
|
||||
psrldq xm3, 2
|
||||
psrld xm3, 16
|
||||
movd eax, xm3
|
||||
|
||||
; get idx^4 complement
|
||||
|
|
|
@ -29,15 +29,19 @@
|
|||
#include "src/cdef.h"
|
||||
|
||||
decl_cdef_fn(dav1d_cdef_filter_8x8_avx2);
|
||||
decl_cdef_fn(dav1d_cdef_filter_8x8_sse4);
|
||||
decl_cdef_fn(dav1d_cdef_filter_8x8_ssse3);
|
||||
|
||||
decl_cdef_fn(dav1d_cdef_filter_4x8_avx2);
|
||||
decl_cdef_fn(dav1d_cdef_filter_4x8_sse4);
|
||||
decl_cdef_fn(dav1d_cdef_filter_4x8_ssse3);
|
||||
|
||||
decl_cdef_fn(dav1d_cdef_filter_4x4_avx2);
|
||||
decl_cdef_fn(dav1d_cdef_filter_4x4_sse4);
|
||||
decl_cdef_fn(dav1d_cdef_filter_4x4_ssse3);
|
||||
|
||||
decl_cdef_dir_fn(dav1d_cdef_dir_avx2);
|
||||
decl_cdef_dir_fn(dav1d_cdef_dir_sse4);
|
||||
decl_cdef_dir_fn(dav1d_cdef_dir_ssse3);
|
||||
|
||||
void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
|
||||
|
@ -45,13 +49,22 @@ void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
|
|||
|
||||
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
|
||||
|
||||
#if BITDEPTH ==8
|
||||
#if BITDEPTH == 8
|
||||
c->dir = dav1d_cdef_dir_ssse3;
|
||||
c->fb[0] = dav1d_cdef_filter_8x8_ssse3;
|
||||
c->fb[1] = dav1d_cdef_filter_4x8_ssse3;
|
||||
c->fb[2] = dav1d_cdef_filter_4x4_ssse3;
|
||||
#endif
|
||||
|
||||
if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
|
||||
|
||||
#if BITDEPTH == 8
|
||||
c->dir = dav1d_cdef_dir_sse4;
|
||||
c->fb[0] = dav1d_cdef_filter_8x8_sse4;
|
||||
c->fb[1] = dav1d_cdef_filter_4x8_sse4;
|
||||
c->fb[2] = dav1d_cdef_filter_4x4_sse4;
|
||||
#endif
|
||||
|
||||
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
|
||||
|
||||
#if BITDEPTH == 8 && ARCH_X86_64
|
||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -58,6 +58,7 @@ decl_angular_ipred_fn(dav1d_ipred_dc_top_ssse3);
|
|||
decl_angular_ipred_fn(dav1d_ipred_dc_left_ssse3);
|
||||
decl_angular_ipred_fn(dav1d_ipred_h_ssse3);
|
||||
decl_angular_ipred_fn(dav1d_ipred_v_ssse3);
|
||||
decl_angular_ipred_fn(dav1d_ipred_paeth_ssse3);
|
||||
decl_angular_ipred_fn(dav1d_ipred_smooth_ssse3);
|
||||
decl_angular_ipred_fn(dav1d_ipred_smooth_v_ssse3);
|
||||
decl_angular_ipred_fn(dav1d_ipred_smooth_h_ssse3);
|
||||
|
@ -67,6 +68,10 @@ decl_cfl_pred_fn(dav1d_ipred_cfl_128_ssse3);
|
|||
decl_cfl_pred_fn(dav1d_ipred_cfl_top_ssse3);
|
||||
decl_cfl_pred_fn(dav1d_ipred_cfl_left_ssse3);
|
||||
|
||||
decl_cfl_ac_fn(dav1d_ipred_cfl_ac_420_ssse3);
|
||||
decl_cfl_ac_fn(dav1d_ipred_cfl_ac_422_ssse3);
|
||||
decl_cfl_ac_fn(dav1d_ipred_cfl_ac_444_ssse3);
|
||||
|
||||
decl_pal_pred_fn(dav1d_pal_pred_ssse3);
|
||||
|
||||
void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
|
||||
|
@ -81,6 +86,7 @@ void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
|
|||
c->intra_pred[LEFT_DC_PRED] = dav1d_ipred_dc_left_ssse3;
|
||||
c->intra_pred[HOR_PRED] = dav1d_ipred_h_ssse3;
|
||||
c->intra_pred[VERT_PRED] = dav1d_ipred_v_ssse3;
|
||||
c->intra_pred[PAETH_PRED] = dav1d_ipred_paeth_ssse3;
|
||||
c->intra_pred[SMOOTH_PRED] = dav1d_ipred_smooth_ssse3;
|
||||
c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_ssse3;
|
||||
c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_ssse3;
|
||||
|
@ -90,7 +96,11 @@ void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
|
|||
c->cfl_pred[TOP_DC_PRED] = dav1d_ipred_cfl_top_ssse3;
|
||||
c->cfl_pred[LEFT_DC_PRED] = dav1d_ipred_cfl_left_ssse3;
|
||||
|
||||
c->pal_pred = dav1d_pal_pred_ssse3;
|
||||
c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_ipred_cfl_ac_420_ssse3;
|
||||
c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_ipred_cfl_ac_422_ssse3;
|
||||
c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_ipred_cfl_ac_444_ssse3;
|
||||
|
||||
c->pal_pred = dav1d_pal_pred_ssse3;
|
||||
#endif
|
||||
|
||||
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
|
||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -86,6 +86,17 @@ decl_itx16_fns(16, 4, ssse3);
|
|||
decl_itx16_fns( 8, 16, ssse3);
|
||||
decl_itx16_fns(16, 8, ssse3);
|
||||
decl_itx12_fns(16, 16, ssse3);
|
||||
decl_itx2_fns ( 8, 32, ssse3);
|
||||
decl_itx2_fns (32, 8, ssse3);
|
||||
decl_itx2_fns (16, 32, ssse3);
|
||||
decl_itx2_fns (32, 16, ssse3);
|
||||
decl_itx2_fns (32, 32, ssse3);
|
||||
|
||||
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_16x64_ssse3);
|
||||
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_32x64_ssse3);
|
||||
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_ssse3);
|
||||
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_ssse3);
|
||||
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_ssse3);
|
||||
|
||||
void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
|
||||
#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
|
||||
|
@ -138,6 +149,16 @@ void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
|
|||
assign_itx16_fn(R, 8, 16, ssse3);
|
||||
assign_itx16_fn(R, 16, 8, ssse3);
|
||||
assign_itx12_fn(, 16, 16, ssse3);
|
||||
assign_itx2_fn (R, 8, 32, ssse3);
|
||||
assign_itx2_fn (R, 32, 8, ssse3);
|
||||
assign_itx2_fn (R, 16, 32, ssse3);
|
||||
assign_itx2_fn (R, 32, 16, ssse3);
|
||||
assign_itx2_fn (, 32, 32, ssse3);
|
||||
assign_itx1_fn (R, 16, 64, ssse3);
|
||||
assign_itx1_fn (R, 32, 64, ssse3);
|
||||
assign_itx1_fn (R, 64, 16, ssse3);
|
||||
assign_itx1_fn (R, 64, 32, ssse3);
|
||||
assign_itx1_fn ( , 64, 64, ssse3);
|
||||
#endif
|
||||
|
||||
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
|
||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -0,0 +1,287 @@
|
|||
; Copyright © 2019, VideoLAN and dav1d authors
|
||||
; Copyright © 2019, Two Orioles, LLC
|
||||
; All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions are met:
|
||||
;
|
||||
; 1. Redistributions of source code must retain the above copyright notice, this
|
||||
; list of conditions and the following disclaimer.
|
||||
;
|
||||
; 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
; this list of conditions and the following disclaimer in the documentation
|
||||
; and/or other materials provided with the distribution.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
%include "config.asm"
|
||||
%include "ext/x86/x86inc.asm"
|
||||
|
||||
%if ARCH_X86_64
|
||||
|
||||
; Read-only constants shared by the msac_decode_symbol_adapt* functions below.
SECTION_RODATA 64 ; avoids cacheline splits
|
||||
|
||||
; Unlabelled table of sixteen 16-bit offsets, 4*(15-i) for i = 0..15.
; The decoders put the address of pw_0xff00 in rax and load from
; [rax+nsq*2] after `neg nsq`, i.e. they read the words that END right in
; front of pw_0xff00. This table therefore has to stay immediately before
; pw_0xff00. The loaded words are added to the scaled CDF values.
; NOTE(review): presumably the minimum-probability term of the C reference
; decoder - confirm against the C implementation.
dw 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
|
||||
; Mask ANDed with the broadcast range value (keeps the high byte of each word).
pw_0xff00: times 8 dw 0xff00
|
||||
; Subtracted from the offsets above in the 16-symbol decoder to derive the
; offsets of lanes 8..15 from those of lanes 0..7 (the table steps by 4/lane).
pw_32: times 8 dw 32
|
||||
|
||||
; Field offsets of the decoder state addressed through `sq` / `t0`.
; NOTE(review): must mirror the layout of the C-side context structure - confirm.
struc msac
|
||||
; input position; advanced by the refill code
.buf: resq 1
|
||||
; compared against .buf so the refill code does not read past it
.end: resq 1
|
||||
; 64-bit decoding window; its top 16 bits are compared against the CDF values
.dif: resq 1
|
||||
; current range; renormalised with a left shift after every symbol
.rng: resd 1
|
||||
; signed bit counter; a refill is performed when it drops below zero
.cnt: resd 1
|
||||
; when zero the decoders skip the CDF adaptation step
.update_cdf: resd 1
|
||||
endstruc
|
||||
|
||||
; m(x) builds the full symbol name of function x (private prefix + name +
; instruction-set suffix, passed through mangle) so one function can jump to
; a local label of another, e.g. m(msac_decode_symbol_adapt4).renorm.
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
|
||||
|
||||
SECTION .text
|
||||
|
||||
; `t0` is the register used to address the state after rcx/ecx has been
; claimed as shift count; `buf` is the scratch area holding the computed
; per-symbol values so they can be indexed after the vector compare.
%if WIN64
|
||||
; Win64: first argument arrives in rcx, so the state pointer is copied to a
; different register (see `movifnidn t0, sq`); scratch lives in the caller
; provided shadow space above the return address.
DECLARE_REG_TMP 3
|
||||
%define buf rsp+8 ; shadow space
|
||||
%else
|
||||
; Otherwise t0 is register 0 (the state pointer itself, no copy needed) and
; the scratch area is placed below rsp in the red zone.
DECLARE_REG_TMP 0
|
||||
%define buf rsp-40 ; red zone
|
||||
%endif
|
||||
|
||||
; All functions below use 128-bit registers and get the sse2 name suffix.
INIT_XMM sse2
|
||||
;-----------------------------------------------------------------------
; msac_decode_symbol_adapt4(s, cdf, ns)
;   s   = pointer to the decoder state (field offsets: `struc msac`)
;   cdf = pointer to 16-bit CDF entries; four entries are processed and
;         the adaptation counter is read/written at cdf[ns]
;   ns  = number of symbols
;         NOTE(review): `neg nsq` negates the full 64-bit register, so the
;         argument is assumed to arrive zero-extended - confirm against
;         the C prototype.
; Returns the decoded symbol index in eax.
; Scratch: bytes [buf+12, buf+32) (red zone / Win64 shadow space).
;
; .renorm is also entered from msac_decode_symbol_adapt8 and .renorm2 from
; msac_decode_symbol_adapt16. The entry contract of those labels is:
;   .renorm : eax = byte mask from pmovmskb (2 bits per symbol),
;             v values stored at [buf+16], rng stored at [buf+14]
;   .renorm2: eax = symbol index, r1d = v, r2d = u, r4 = dif
;-----------------------------------------------------------------------
cglobal msac_decode_symbol_adapt4, 3, 7, 6, s, cdf, ns
    movd           m2, [sq+msac.rng]
    movq           m1, [cdfq]
    lea           rax, [pw_0xff00]
    movq           m3, [sq+msac.dif]
    mov           r3d, [sq+msac.update_cdf]
    mov           r4d, nsd
    neg           nsq
    pshuflw        m2, m2, q0000             ; broadcast rng to the low 4 words
    movd     [buf+12], m2                    ; rng lands at buf+14: "u" of symbol 0
    pand           m2, [rax]
    mova           m0, m1                    ; keep the unmodified cdf for the update
    psrlw          m1, 6
    psllw          m1, 7
    pmulhuw        m1, m2
    movq           m2, [rax+nsq*2]           ; last ns words of the offset table
    pshuflw        m3, m3, q3333             ; broadcast the top 16 bits of dif (c)
    paddw          m1, m2
    mova     [buf+16], m1                    ; v[0..3]
    psubusw        m1, m3
    pxor           m2, m2
    pcmpeqw        m1, m2                    ; c >= v
    pmovmskb      eax, m1
    test          r3d, r3d
    jz .renorm                               ; !allow_update_cdf

; update_cdf:
    movzx         r3d, word [cdfq+r4*2]      ; count
    pcmpeqw        m2, m2
    mov           r2d, r3d
    shr           r3d, 4
    cmp           r4d, 4
    sbb           r3d, -5                    ; (count >> 4) + (n_symbols > 3) + 4
    cmp           r2d, 32
    adc           r2d, 0                     ; count + (count < 32)
    movd           m3, r3d
    pavgw          m2, m1                    ; i >= val ? -1 : 32768
    psubw          m2, m0                    ; for (i = 0; i < val; i++)
    psubw          m0, m1                    ;     cdf[i] += (32768 - cdf[i]) >> rate;
    psraw          m2, m3                    ; for (; i < n_symbols - 1; i++)
    paddw          m0, m2                    ;     cdf[i] += (( -1 - cdf[i]) >> rate) + 1;
    movq       [cdfq], m0
    mov   [cdfq+r4*2], r2w                   ; store the counter after the vector store

.renorm:
    tzcnt         eax, eax                   ; first lane with c >= v (2 mask bits/lane)
    mov            r4, [sq+msac.dif]
    movzx         r1d, word [buf+rax+16]     ; v
    movzx         r2d, word [buf+rax+14]     ; u
    shr           eax, 1                     ; byte offset -> symbol index
.renorm2:
    not            r4
    sub           r2d, r1d                   ; rng
    shl            r1, 48
    add            r4, r1                    ; ~dif
    mov           r1d, [sq+msac.cnt]
    movifnidn      t0, sq                    ; free rcx for use as shift count
    bsr           ecx, r2d
    xor           ecx, 15                    ; d
    shl           r2d, cl
    shl            r4, cl
    mov [t0+msac.rng], r2d
    not            r4
    sub           r1d, ecx
    jge .end                                 ; no refill required (cnt is signed)

; refill:
    mov            r2, [t0+msac.buf]
    mov           rcx, [t0+msac.end]
    lea            r5, [r2+8]
    cmp            r5, rcx
    ja .refill_eob                           ; addresses compare unsigned
    mov            r2, [r2]
    lea           ecx, [r1+23]
    add           r1d, 16
    shr           ecx, 3                     ; shift_bytes
    bswap          r2
    sub            r5, rcx
    shl           ecx, 3                     ; shift_bits
    shr            r2, cl
    sub           ecx, r1d                   ; shift_bits - 16 - cnt
    mov           r1d, 48
    shl            r2, cl
    mov [t0+msac.buf], r5
    sub           r1d, ecx                   ; cnt + 64 - shift_bits
    xor            r4, r2
.end:
    mov [t0+msac.cnt], r1d
    mov [t0+msac.dif], r4
    RET
.refill_eob:                                 ; avoid overreading the input buffer
    mov            r5, rcx
    mov           ecx, 40
    sub           ecx, r1d                   ; c
.refill_eob_loop:
    cmp            r2, r5
    jae .refill_eob_end                      ; eob reached (unsigned address compare)
    movzx         r1d, byte [r2]
    inc            r2
    shl            r1, cl
    xor            r4, r1
    sub           ecx, 8
    jge .refill_eob_loop                     ; c is a signed bit position
.refill_eob_end:
    mov           r1d, 40
    sub           r1d, ecx
    mov [t0+msac.buf], r2
    mov [t0+msac.dif], r4
    mov [t0+msac.cnt], r1d
    RET
|
||||
|
||||
; msac_decode_symbol_adapt8(s, cdf, ns)
; Same scheme as msac_decode_symbol_adapt4, but eight 16-bit CDF entries are
; loaded and updated at once (full-width unaligned loads/stores on cdfq).
; The scratch layout is identical to the 4-symbol version (rng at buf+12,
; v values at buf+16), which lets this function finish by jumping to
; m(msac_decode_symbol_adapt4).renorm for symbol selection, renormalisation
; and refill; the symbol index is produced there.
cglobal msac_decode_symbol_adapt8, 3, 7, 6, s, cdf, ns
|
||||
movd m2, [sq+msac.rng]
|
||||
movu m1, [cdfq]
|
||||
lea rax, [pw_0xff00]
|
||||
movq m3, [sq+msac.dif]
|
||||
mov r3d, [sq+msac.update_cdf]
|
||||
; r4d keeps the positive symbol count (index of the counter word), nsq becomes
; the negative index into the offset table that ends at pw_0xff00
mov r4d, nsd
|
||||
neg nsq
|
||||
pshuflw m2, m2, q0000
|
||||
; store rng in front of the v array before widening it to all 8 lanes
movd [buf+12], m2
|
||||
punpcklqdq m2, m2
|
||||
; m0 = unmodified cdf, needed again by the adaptation step
mova m0, m1
|
||||
psrlw m1, 6
|
||||
pand m2, [rax]
|
||||
psllw m1, 7
|
||||
pmulhuw m1, m2
|
||||
movu m2, [rax+nsq*2]
|
||||
; broadcast the top 16 bits of dif to every lane (low half here, high half below)
pshuflw m3, m3, q3333
|
||||
paddw m1, m2
|
||||
punpcklqdq m3, m3
|
||||
mova [buf+16], m1
|
||||
; saturating v - c is zero exactly where c >= v; turn that into a lane mask
psubusw m1, m3
|
||||
pxor m2, m2
|
||||
pcmpeqw m1, m2
|
||||
pmovmskb eax, m1
|
||||
; skip the CDF adaptation when update_cdf is zero
test r3d, r3d
|
||||
jz m(msac_decode_symbol_adapt4).renorm
|
||||
; r3d = adaptation counter stored at cdf[ns]
movzx r3d, word [cdfq+r4*2]
|
||||
pcmpeqw m2, m2
|
||||
mov r2d, r3d
|
||||
shr r3d, 4
|
||||
cmp r4d, 4 ; may be called with n_symbols < 4
|
||||
; rate = (count >> 4) + 5 - (ns < 4), using the carry of the compare above
sbb r3d, -5
|
||||
; counter += (counter < 32), again via the carry flag
cmp r2d, 32
|
||||
adc r2d, 0
|
||||
movd m3, r3d
|
||||
; m2 = -1 in lanes where c >= v, 32768 elsewhere (average of -1 with the mask)
pavgw m2, m1
|
||||
psubw m2, m0
|
||||
psubw m0, m1
|
||||
psraw m2, m3
|
||||
paddw m0, m2
|
||||
; the vector store covers 8 words; the counter word is written afterwards
movu [cdfq], m0
|
||||
mov [cdfq+r4*2], r2w
|
||||
jmp m(msac_decode_symbol_adapt4).renorm
|
||||
|
||||
; msac_decode_symbol_adapt16(s, cdf, ns)
; 16-lane variant: two vectors of eight 16-bit CDF entries are processed.
; Differences from the 4/8-symbol versions visible below:
;  - the v array occupies [buf, buf+32) with rng stored at buf-4, so u of
;    symbol 0 is found at buf-2;
;  - on Win64 extra stack is reserved because the shadow space is too small;
;  - the two word masks are packed to bytes, so tzcnt yields the symbol index
;    directly and the tail is entered at adapt4's .renorm2 (after its
;    `shr eax, 1`).
cglobal msac_decode_symbol_adapt16, 3, 7, 6, s, cdf, ns
|
||||
movd m4, [sq+msac.rng]
|
||||
movu m2, [cdfq]
|
||||
lea rax, [pw_0xff00]
|
||||
movu m3, [cdfq+16]
|
||||
movq m5, [sq+msac.dif]
|
||||
mov r3d, [sq+msac.update_cdf]
|
||||
; r4d = positive symbol count (counter index), nsq = negative table index
mov r4d, nsd
|
||||
neg nsq
|
||||
%if WIN64
|
||||
sub rsp, 48 ; need 36 bytes, shadow space is only 32
|
||||
%endif
|
||||
pshuflw m4, m4, q0000
|
||||
; rng goes right in front of the v array
movd [buf-4], m4
|
||||
punpcklqdq m4, m4
|
||||
; m0/m1 = unmodified cdf halves, reused by the adaptation step
mova m0, m2
|
||||
psrlw m2, 6
|
||||
mova m1, m3
|
||||
psrlw m3, 6
|
||||
pand m4, [rax]
|
||||
psllw m2, 7
|
||||
psllw m3, 7
|
||||
pmulhuw m2, m4
|
||||
pmulhuw m3, m4
|
||||
; offsets for lanes 0..7; lanes 8..15 use the same values minus 32 because the
; offset table decreases by 4 per entry
movu m4, [rax+nsq*2]
|
||||
pshuflw m5, m5, q3333
|
||||
paddw m2, m4
|
||||
; rax still holds the address of pw_0xff00, so this addresses pw_32
psubw m4, [rax-pw_0xff00+pw_32]
|
||||
punpcklqdq m5, m5
|
||||
paddw m3, m4
|
||||
mova [buf], m2
|
||||
mova [buf+16], m3
|
||||
; saturating v - c is zero exactly where c >= v
psubusw m2, m5
|
||||
psubusw m3, m5
|
||||
pxor m4, m4
|
||||
pcmpeqw m2, m4
|
||||
pcmpeqw m3, m4
|
||||
; narrow both word masks to bytes: one mask bit per symbol
packsswb m5, m2, m3
|
||||
pmovmskb eax, m5
|
||||
; skip the CDF adaptation when update_cdf is zero
test r3d, r3d
|
||||
jz .renorm
|
||||
; r3d = adaptation counter stored at cdf[ns]
movzx r3d, word [cdfq+r4*2]
|
||||
pcmpeqw m4, m4
|
||||
mova m5, m4
|
||||
; rate = (count + 80) >> 4 = (count >> 4) + 5
lea r2d, [r3+80] ; only support n_symbols >= 4
|
||||
shr r2d, 4
|
||||
; counter += (counter < 32) via the carry flag
cmp r3d, 32
|
||||
adc r3d, 0
|
||||
; m4/m5 = -1 in lanes where c >= v, 32768 elsewhere
pavgw m4, m2
|
||||
pavgw m5, m3
|
||||
psubw m4, m0
|
||||
psubw m0, m2
|
||||
; m2 is free now and is reused to hold the shift count (rate)
movd m2, r2d
|
||||
psubw m5, m1
|
||||
psubw m1, m3
|
||||
psraw m4, m2
|
||||
psraw m5, m2
|
||||
paddw m0, m4
|
||||
paddw m1, m5
|
||||
movu [cdfq], m0
|
||||
movu [cdfq+16], m1
|
||||
; counter word is written after the vector stores
mov [cdfq+r4*2], r3w
|
||||
.renorm:
|
||||
; eax = index of the first symbol with c >= v
tzcnt eax, eax
|
||||
mov r4, [sq+msac.dif]
|
||||
; r1d = v of that symbol, r2d = preceding entry (u; rng for symbol 0)
movzx r1d, word [buf+rax*2]
|
||||
movzx r2d, word [buf+rax*2-2]
|
||||
%if WIN64
|
||||
; undo the stack reservation before leaving through the shared tail
add rsp, 48
|
||||
%endif
|
||||
jmp m(msac_decode_symbol_adapt4).renorm2
|
||||
|
||||
%endif
|
|
@ -32,22 +32,22 @@
|
|||
#include "src/arm/32/util.S"
|
||||
|
||||
const register_init, align=3
|
||||
.quad 0x21f86d66c8ca00ce
|
||||
.quad 0x75b6ba21077c48ad
|
||||
.quad 0xed56bb2dcb3c7736
|
||||
.quad 0x8bda43d3fd1a7e06
|
||||
.quad 0xb64a9c9e5d318408
|
||||
.quad 0xdf9a54b303f1d3a3
|
||||
.quad 0x4a75479abd64e097
|
||||
.quad 0x249214109d5d1c88
|
||||
.quad 0x21f86d66c8ca00ce
|
||||
.quad 0x75b6ba21077c48ad
|
||||
.quad 0xed56bb2dcb3c7736
|
||||
.quad 0x8bda43d3fd1a7e06
|
||||
.quad 0xb64a9c9e5d318408
|
||||
.quad 0xdf9a54b303f1d3a3
|
||||
.quad 0x4a75479abd64e097
|
||||
.quad 0x249214109d5d1c88
|
||||
endconst
|
||||
|
||||
const error_message_fpscr
|
||||
.asciz "failed to preserve register FPSCR, changed bits: %x"
|
||||
.asciz "failed to preserve register FPSCR, changed bits: %x"
|
||||
error_message_gpr:
|
||||
.asciz "failed to preserve register r%d"
|
||||
.asciz "failed to preserve register r%d"
|
||||
error_message_vfp:
|
||||
.asciz "failed to preserve register d%d"
|
||||
.asciz "failed to preserve register d%d"
|
||||
endconst
|
||||
|
||||
@ max number of args used by any asm function.
|
||||
|
@ -61,111 +61,111 @@ endconst
|
|||
.macro clobbercheck variant
|
||||
.equ pushed, 4*9
|
||||
function checked_call_\variant, export=1
|
||||
push {r4-r11, lr}
|
||||
push {r4-r11, lr}
|
||||
.ifc \variant, vfp
|
||||
vpush {d8-d15}
|
||||
fmrx r4, FPSCR
|
||||
push {r4}
|
||||
vpush {d8-d15}
|
||||
fmrx r4, FPSCR
|
||||
push {r4}
|
||||
.equ pushed, pushed + 16*4 + 4
|
||||
.endif
|
||||
|
||||
movrel r12, register_init
|
||||
movrel r12, register_init
|
||||
.ifc \variant, vfp
|
||||
vldm r12, {d8-d15}
|
||||
vldm r12, {d8-d15}
|
||||
.endif
|
||||
ldm r12, {r4-r11}
|
||||
ldm r12, {r4-r11}
|
||||
|
||||
sub sp, sp, #ARG_STACK_A
|
||||
sub sp, sp, #ARG_STACK_A
|
||||
.equ pos, 0
|
||||
.rept MAX_ARGS-4
|
||||
ldr r12, [sp, #ARG_STACK_A + pushed + 8 + pos]
|
||||
str r12, [sp, #pos]
|
||||
ldr r12, [sp, #ARG_STACK_A + pushed + 8 + pos]
|
||||
str r12, [sp, #pos]
|
||||
.equ pos, pos + 4
|
||||
.endr
|
||||
|
||||
mov r12, r0
|
||||
mov r0, r2
|
||||
mov r1, r3
|
||||
ldrd r2, r3, [sp, #ARG_STACK_A + pushed]
|
||||
blx r12
|
||||
add sp, sp, #ARG_STACK_A
|
||||
mov r12, r0
|
||||
mov r0, r2
|
||||
mov r1, r3
|
||||
ldrd r2, r3, [sp, #ARG_STACK_A + pushed]
|
||||
blx r12
|
||||
add sp, sp, #ARG_STACK_A
|
||||
|
||||
push {r0, r1}
|
||||
movrel r12, register_init
|
||||
push {r0, r1}
|
||||
movrel r12, register_init
|
||||
.ifc \variant, vfp
|
||||
.macro check_reg_vfp, dreg, offset
|
||||
ldrd r2, r3, [r12, #8 * (\offset)]
|
||||
vmov r0, lr, \dreg
|
||||
eor r2, r2, r0
|
||||
eor r3, r3, lr
|
||||
orrs r2, r2, r3
|
||||
bne 4f
|
||||
ldrd r2, r3, [r12, #8 * (\offset)]
|
||||
vmov r0, lr, \dreg
|
||||
eor r2, r2, r0
|
||||
eor r3, r3, lr
|
||||
orrs r2, r2, r3
|
||||
bne 4f
|
||||
.endm
|
||||
|
||||
.irp n, 8, 9, 10, 11, 12, 13, 14, 15
|
||||
@ keep track of the checked double/SIMD register
|
||||
mov r1, #\n
|
||||
check_reg_vfp d\n, \n-8
|
||||
@ keep track of the checked double/SIMD register
|
||||
mov r1, #\n
|
||||
check_reg_vfp d\n, \n-8
|
||||
.endr
|
||||
.purgem check_reg_vfp
|
||||
|
||||
fmrx r1, FPSCR
|
||||
ldr r3, [sp, #8]
|
||||
eor r1, r1, r3
|
||||
@ Ignore changes in bits 0-4 and 7
|
||||
bic r1, r1, #0x9f
|
||||
@ Ignore changes in the topmost 5 bits
|
||||
bics r1, r1, #0xf8000000
|
||||
bne 3f
|
||||
fmrx r1, FPSCR
|
||||
ldr r3, [sp, #8]
|
||||
eor r1, r1, r3
|
||||
@ Ignore changes in bits 0-4 and 7
|
||||
bic r1, r1, #0x9f
|
||||
@ Ignore changes in the topmost 5 bits
|
||||
bics r1, r1, #0xf8000000
|
||||
bne 3f
|
||||
.endif
|
||||
|
||||
@ keep track of the checked GPR
|
||||
mov r1, #4
|
||||
@ keep track of the checked GPR
|
||||
mov r1, #4
|
||||
.macro check_reg reg1, reg2=
|
||||
ldrd r2, r3, [r12], #8
|
||||
eors r2, r2, \reg1
|
||||
bne 2f
|
||||
add r1, r1, #1
|
||||
ldrd r2, r3, [r12], #8
|
||||
eors r2, r2, \reg1
|
||||
bne 2f
|
||||
add r1, r1, #1
|
||||
.ifnb \reg2
|
||||
eors r3, r3, \reg2
|
||||
bne 2f
|
||||
eors r3, r3, \reg2
|
||||
bne 2f
|
||||
.endif
|
||||
add r1, r1, #1
|
||||
add r1, r1, #1
|
||||
.endm
|
||||
check_reg r4, r5
|
||||
check_reg r6, r7
|
||||
check_reg r4, r5
|
||||
check_reg r6, r7
|
||||
@ r9 is a volatile register in the ios ABI
|
||||
#ifdef __APPLE__
|
||||
check_reg r8
|
||||
check_reg r8
|
||||
#else
|
||||
check_reg r8, r9
|
||||
check_reg r8, r9
|
||||
#endif
|
||||
check_reg r10, r11
|
||||
check_reg r10, r11
|
||||
.purgem check_reg
|
||||
|
||||
b 0f
|
||||
b 0f
|
||||
4:
|
||||
movrel r0, error_message_vfp
|
||||
b 1f
|
||||
movrel r0, error_message_vfp
|
||||
b 1f
|
||||
3:
|
||||
movrel r0, error_message_fpscr
|
||||
b 1f
|
||||
movrel r0, error_message_fpscr
|
||||
b 1f
|
||||
2:
|
||||
movrel r0, error_message_gpr
|
||||
movrel r0, error_message_gpr
|
||||
1:
|
||||
#ifdef PREFIX
|
||||
blx _checkasm_fail_func
|
||||
blx _checkasm_fail_func
|
||||
#else
|
||||
blx checkasm_fail_func
|
||||
blx checkasm_fail_func
|
||||
#endif
|
||||
0:
|
||||
pop {r0, r1}
|
||||
pop {r0, r1}
|
||||
.ifc \variant, vfp
|
||||
pop {r2}
|
||||
fmxr FPSCR, r2
|
||||
vpop {d8-d15}
|
||||
pop {r2}
|
||||
fmxr FPSCR, r2
|
||||
vpop {d8-d15}
|
||||
.endif
|
||||
pop {r4-r11, pc}
|
||||
pop {r4-r11, pc}
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
|
|
|
@ -32,29 +32,29 @@
|
|||
#include "src/arm/64/util.S"
|
||||
|
||||
const register_init, align=4
|
||||
.quad 0x21f86d66c8ca00ce
|
||||
.quad 0x75b6ba21077c48ad
|
||||
.quad 0xed56bb2dcb3c7736
|
||||
.quad 0x8bda43d3fd1a7e06
|
||||
.quad 0xb64a9c9e5d318408
|
||||
.quad 0xdf9a54b303f1d3a3
|
||||
.quad 0x4a75479abd64e097
|
||||
.quad 0x249214109d5d1c88
|
||||
.quad 0x1a1b2550a612b48c
|
||||
.quad 0x79445c159ce79064
|
||||
.quad 0x2eed899d5a28ddcd
|
||||
.quad 0x86b2536fcd8cf636
|
||||
.quad 0xb0856806085e7943
|
||||
.quad 0x3f2bf84fc0fcca4e
|
||||
.quad 0xacbd382dcf5b8de2
|
||||
.quad 0xd229e1f5b281303f
|
||||
.quad 0x71aeaff20b095fd9
|
||||
.quad 0xab63e2e11fa38ed9
|
||||
.quad 0x21f86d66c8ca00ce
|
||||
.quad 0x75b6ba21077c48ad
|
||||
.quad 0xed56bb2dcb3c7736
|
||||
.quad 0x8bda43d3fd1a7e06
|
||||
.quad 0xb64a9c9e5d318408
|
||||
.quad 0xdf9a54b303f1d3a3
|
||||
.quad 0x4a75479abd64e097
|
||||
.quad 0x249214109d5d1c88
|
||||
.quad 0x1a1b2550a612b48c
|
||||
.quad 0x79445c159ce79064
|
||||
.quad 0x2eed899d5a28ddcd
|
||||
.quad 0x86b2536fcd8cf636
|
||||
.quad 0xb0856806085e7943
|
||||
.quad 0x3f2bf84fc0fcca4e
|
||||
.quad 0xacbd382dcf5b8de2
|
||||
.quad 0xd229e1f5b281303f
|
||||
.quad 0x71aeaff20b095fd9
|
||||
.quad 0xab63e2e11fa38ed9
|
||||
endconst
|
||||
|
||||
|
||||
const error_message
|
||||
.asciz "failed to preserve register"
|
||||
.asciz "failed to preserve register"
|
||||
endconst
|
||||
|
||||
|
||||
|
@ -64,107 +64,107 @@ endconst
|
|||
#define CLOBBER_STACK ((8*MAX_ARGS + 15) & ~15)
|
||||
|
||||
function stack_clobber, export=1
|
||||
mov x3, sp
|
||||
mov x2, #CLOBBER_STACK
|
||||
mov x3, sp
|
||||
mov x2, #CLOBBER_STACK
|
||||
1:
|
||||
stp x0, x1, [sp, #-16]!
|
||||
subs x2, x2, #16
|
||||
b.gt 1b
|
||||
mov sp, x3
|
||||
ret
|
||||
stp x0, x1, [sp, #-16]!
|
||||
subs x2, x2, #16
|
||||
b.gt 1b
|
||||
mov sp, x3
|
||||
ret
|
||||
endfunc
|
||||
|
||||
#define ARG_STACK ((8*(MAX_ARGS - 8) + 15) & ~15)
|
||||
|
||||
function checked_call, export=1
|
||||
stp x29, x30, [sp, #-16]!
|
||||
mov x29, sp
|
||||
stp x19, x20, [sp, #-16]!
|
||||
stp x21, x22, [sp, #-16]!
|
||||
stp x23, x24, [sp, #-16]!
|
||||
stp x25, x26, [sp, #-16]!
|
||||
stp x27, x28, [sp, #-16]!
|
||||
stp d8, d9, [sp, #-16]!
|
||||
stp d10, d11, [sp, #-16]!
|
||||
stp d12, d13, [sp, #-16]!
|
||||
stp d14, d15, [sp, #-16]!
|
||||
stp x29, x30, [sp, #-16]!
|
||||
mov x29, sp
|
||||
stp x19, x20, [sp, #-16]!
|
||||
stp x21, x22, [sp, #-16]!
|
||||
stp x23, x24, [sp, #-16]!
|
||||
stp x25, x26, [sp, #-16]!
|
||||
stp x27, x28, [sp, #-16]!
|
||||
stp d8, d9, [sp, #-16]!
|
||||
stp d10, d11, [sp, #-16]!
|
||||
stp d12, d13, [sp, #-16]!
|
||||
stp d14, d15, [sp, #-16]!
|
||||
|
||||
movrel x9, register_init
|
||||
ldp d8, d9, [x9], #16
|
||||
ldp d10, d11, [x9], #16
|
||||
ldp d12, d13, [x9], #16
|
||||
ldp d14, d15, [x9], #16
|
||||
ldp x19, x20, [x9], #16
|
||||
ldp x21, x22, [x9], #16
|
||||
ldp x23, x24, [x9], #16
|
||||
ldp x25, x26, [x9], #16
|
||||
ldp x27, x28, [x9], #16
|
||||
movrel x9, register_init
|
||||
ldp d8, d9, [x9], #16
|
||||
ldp d10, d11, [x9], #16
|
||||
ldp d12, d13, [x9], #16
|
||||
ldp d14, d15, [x9], #16
|
||||
ldp x19, x20, [x9], #16
|
||||
ldp x21, x22, [x9], #16
|
||||
ldp x23, x24, [x9], #16
|
||||
ldp x25, x26, [x9], #16
|
||||
ldp x27, x28, [x9], #16
|
||||
|
||||
sub sp, sp, #ARG_STACK
|
||||
sub sp, sp, #ARG_STACK
|
||||
.equ pos, 0
|
||||
.rept MAX_ARGS-8
|
||||
// Skip the first 8 args, that are loaded into registers
|
||||
ldr x9, [x29, #16 + 8*8 + pos]
|
||||
str x9, [sp, #pos]
|
||||
// Skip the first 8 args, that are loaded into registers
|
||||
ldr x9, [x29, #16 + 8*8 + pos]
|
||||
str x9, [sp, #pos]
|
||||
.equ pos, pos + 8
|
||||
.endr
|
||||
|
||||
mov x12, x0
|
||||
ldp x0, x1, [x29, #16]
|
||||
ldp x2, x3, [x29, #32]
|
||||
ldp x4, x5, [x29, #48]
|
||||
ldp x6, x7, [x29, #64]
|
||||
blr x12
|
||||
add sp, sp, #ARG_STACK
|
||||
stp x0, x1, [sp, #-16]!
|
||||
movrel x9, register_init
|
||||
movi v3.8h, #0
|
||||
mov x12, x0
|
||||
ldp x0, x1, [x29, #16]
|
||||
ldp x2, x3, [x29, #32]
|
||||
ldp x4, x5, [x29, #48]
|
||||
ldp x6, x7, [x29, #64]
|
||||
blr x12
|
||||
add sp, sp, #ARG_STACK
|
||||
stp x0, x1, [sp, #-16]!
|
||||
movrel x9, register_init
|
||||
movi v3.8h, #0
|
||||
|
||||
.macro check_reg_neon reg1, reg2
|
||||
ldr q0, [x9], #16
|
||||
uzp1 v1.2d, v\reg1\().2d, v\reg2\().2d
|
||||
eor v0.16b, v0.16b, v1.16b
|
||||
orr v3.16b, v3.16b, v0.16b
|
||||
ldr q0, [x9], #16
|
||||
uzp1 v1.2d, v\reg1\().2d, v\reg2\().2d
|
||||
eor v0.16b, v0.16b, v1.16b
|
||||
orr v3.16b, v3.16b, v0.16b
|
||||
.endm
|
||||
check_reg_neon 8, 9
|
||||
check_reg_neon 10, 11
|
||||
check_reg_neon 12, 13
|
||||
check_reg_neon 14, 15
|
||||
uqxtn v3.8b, v3.8h
|
||||
umov x3, v3.d[0]
|
||||
check_reg_neon 8, 9
|
||||
check_reg_neon 10, 11
|
||||
check_reg_neon 12, 13
|
||||
check_reg_neon 14, 15
|
||||
uqxtn v3.8b, v3.8h
|
||||
umov x3, v3.d[0]
|
||||
|
||||
.macro check_reg reg1, reg2
|
||||
ldp x0, x1, [x9], #16
|
||||
eor x0, x0, \reg1
|
||||
eor x1, x1, \reg2
|
||||
orr x3, x3, x0
|
||||
orr x3, x3, x1
|
||||
ldp x0, x1, [x9], #16
|
||||
eor x0, x0, \reg1
|
||||
eor x1, x1, \reg2
|
||||
orr x3, x3, x0
|
||||
orr x3, x3, x1
|
||||
.endm
|
||||
check_reg x19, x20
|
||||
check_reg x21, x22
|
||||
check_reg x23, x24
|
||||
check_reg x25, x26
|
||||
check_reg x27, x28
|
||||
check_reg x19, x20
|
||||
check_reg x21, x22
|
||||
check_reg x23, x24
|
||||
check_reg x25, x26
|
||||
check_reg x27, x28
|
||||
|
||||
cbz x3, 0f
|
||||
cbz x3, 0f
|
||||
|
||||
movrel x0, error_message
|
||||
movrel x0, error_message
|
||||
#ifdef PREFIX
|
||||
bl _checkasm_fail_func
|
||||
bl _checkasm_fail_func
|
||||
#else
|
||||
bl checkasm_fail_func
|
||||
bl checkasm_fail_func
|
||||
#endif
|
||||
0:
|
||||
ldp x0, x1, [sp], #16
|
||||
ldp d14, d15, [sp], #16
|
||||
ldp d12, d13, [sp], #16
|
||||
ldp d10, d11, [sp], #16
|
||||
ldp d8, d9, [sp], #16
|
||||
ldp x27, x28, [sp], #16
|
||||
ldp x25, x26, [sp], #16
|
||||
ldp x23, x24, [sp], #16
|
||||
ldp x21, x22, [sp], #16
|
||||
ldp x19, x20, [sp], #16
|
||||
ldp x29, x30, [sp], #16
|
||||
ret
|
||||
ldp x0, x1, [sp], #16
|
||||
ldp d14, d15, [sp], #16
|
||||
ldp d12, d13, [sp], #16
|
||||
ldp d10, d11, [sp], #16
|
||||
ldp d8, d9, [sp], #16
|
||||
ldp x27, x28, [sp], #16
|
||||
ldp x25, x26, [sp], #16
|
||||
ldp x23, x24, [sp], #16
|
||||
ldp x21, x22, [sp], #16
|
||||
ldp x19, x20, [sp], #16
|
||||
ldp x29, x30, [sp], #16
|
||||
ret
|
||||
endfunc
|
||||
|
|
|
@ -62,6 +62,7 @@ static const struct {
|
|||
const char *name;
|
||||
void (*func)(void);
|
||||
} tests[] = {
|
||||
{ "msac", checkasm_check_msac },
|
||||
#if CONFIG_8BPC
|
||||
{ "cdef_8bpc", checkasm_check_cdef_8bpc },
|
||||
{ "ipred_8bpc", checkasm_check_ipred_8bpc },
|
||||
|
|
|
@ -57,6 +57,7 @@ int xor128_rand(void);
|
|||
name##_8bpc(void); \
|
||||
name##_16bpc(void)
|
||||
|
||||
void checkasm_check_msac(void);
|
||||
decl_check_bitfns(void checkasm_check_cdef);
|
||||
decl_check_bitfns(void checkasm_check_ipred);
|
||||
decl_check_bitfns(void checkasm_check_itx);
|
||||
|
|
|
@ -0,0 +1,115 @@
|
|||
/*
|
||||
* Copyright © 2019, VideoLAN and dav1d authors
|
||||
* Copyright © 2019, Two Orioles, LLC
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "tests/checkasm/checkasm.h"
|
||||
|
||||
#include "src/cpu.h"
|
||||
#include "src/msac.h"
|
||||
|
||||
#include <string.h>
|
||||
|
||||
/* The normal code doesn't use function pointers */
|
||||
/* Pointer to a symbol decoder: takes the decoder state, the CDF table and
 * the symbol count, and returns the decoded value as an unsigned. */
typedef unsigned (*decode_symbol_adapt_fn)(MsacContext *s, uint16_t *cdf, size_t n_symbols);
|
||||
|
||||
typedef struct {
|
||||
decode_symbol_adapt_fn symbol_adapt4;
|
||||
decode_symbol_adapt_fn symbol_adapt8;
|
||||
decode_symbol_adapt_fn symbol_adapt16;
|
||||
} MsacDSPContext;
|
||||
|
||||
/* Build a random CDF table for n symbols in cdf[0..16].
 *
 * Entries above index n are padding and receive arbitrary values.
 * Entries n and n-1 are zero; walking down from there, every entry is
 * larger than its successor by at least one. n must be at least 1, since
 * cdf[n-1] is written unconditionally. */
static void randomize_cdf(uint16_t *const cdf, int n) {
    /* Padding first, from the top of the buffer downwards. */
    int pad = 16;
    while (pad > n) {
        cdf[pad] = rnd();
        pad--;
    }

    cdf[n - 1] = 0;
    cdf[n] = 0;

    /* Each remaining entry adds a random positive step to its successor;
     * the modulus keeps cdf[k - 1] at or below 32768 - k. */
    for (int k = n - 1; k > 0; k--) {
        const int succ = cdf[k];
        cdf[k - 1] = succ + rnd() % (32768 - succ - k) + 1;
    }
}
|
||||
|
||||
/* memcmp() on structs can have weird behavior due to padding etc. */
|
||||
static int msac_cmp(const MsacContext *const a, const MsacContext *const b) {
|
||||
return a->buf_pos != b->buf_pos || a->buf_end != b->buf_end ||
|
||||
a->dif != b->dif || a->rng != b->rng || a->cnt != b->cnt ||
|
||||
a->allow_update_cdf != b->allow_update_cdf;
|
||||
}
|
||||
|
||||
/* Compare one decode_symbol_adapt variant against its reference.
 *
 *   n      suffix of the variant; it is pasted into c->symbol_adapt##n and
 *          printed in the function name, so it must be a plain integer token.
 *   n_min  first symbol count passed to the decoders.
 *   n_max  last symbol count passed to the decoders (inclusive).
 *
 * The expanding scope must provide c, s_c, s_a, buf and cdf, and must have
 * used declare_func() beforehand.
 *
 * For both values of cdf_update (the last argument of dav1d_msac_init() is
 * its negation) and for every symbol count in [n_min, n_max], the two
 * contexts are re-initialised from the same buffer and both functions are
 * called 64 times. A call fails the test if the return values differ, if
 * msac_cmp() reports different contexts, or if the first ns + 1 CDF entries
 * differ. The new function is benchmarked on cdf[0] only for the iteration
 * with cdf_update set and ns == n.
 *
 * n_min, n_max and n are parenthesized wherever they are used as values so
 * that expression arguments cannot change how the comparisons parse. */
#define CHECK_SYMBOL_ADAPT(n, n_min, n_max) do {                           \
    if (check_func(c->symbol_adapt##n, "msac_decode_symbol_adapt%d", n)) { \
        for (int cdf_update = 0; cdf_update <= 1; cdf_update++) {          \
            for (int ns = (n_min); ns <= (n_max); ns++) {                  \
                dav1d_msac_init(&s_c, buf, sizeof(buf), !cdf_update);      \
                s_a = s_c;                                                 \
                randomize_cdf(cdf[0], ns);                                 \
                memcpy(cdf[1], cdf[0], sizeof(*cdf));                      \
                for (int i = 0; i < 64; i++) {                             \
                    unsigned c_res = call_ref(&s_c, cdf[0], ns);           \
                    unsigned a_res = call_new(&s_a, cdf[1], ns);           \
                    if (c_res != a_res || msac_cmp(&s_c, &s_a) ||          \
                        memcmp(cdf[0], cdf[1], sizeof(**cdf) * (ns + 1)))  \
                    {                                                      \
                        fail();                                            \
                    }                                                      \
                }                                                          \
                if (cdf_update && ns == (n))                               \
                    bench_new(&s_a, cdf[0], n);                            \
            }                                                              \
        }                                                                  \
    }                                                                      \
} while (0)
|
||||
|
||||
/* Run the correctness checks and benchmarks for the three
 * decode_symbol_adapt entry points stored in c. */
static void check_decode_symbol_adapt(MsacDSPContext *const c) {
    /* Two CDF tables of 17 entries each: the aligned one gives more
     * consistent benchmark numbers, the misaligned one is used when
     * checking correctness. */
    ALIGN_STK_16(uint16_t, cdf, 2, [17]);
    MsacContext s_c, s_a;
    uint8_t buf[1024];

    /* Random input data from which both contexts are initialised. */
    for (size_t pos = 0; pos < sizeof(buf); pos++)
        buf[pos] = rnd();

    declare_func(unsigned, MsacContext *s, uint16_t *cdf, size_t n_symbols);

    CHECK_SYMBOL_ADAPT( 4, 1,  5);
    CHECK_SYMBOL_ADAPT( 8, 1,  8);
    CHECK_SYMBOL_ADAPT(16, 4, 16);

    report("decode_symbol_adapt");
}
|
||||
|
||||
void checkasm_check_msac(void) {
|
||||
MsacDSPContext c;
|
||||
c.symbol_adapt4 = dav1d_msac_decode_symbol_adapt_c;
|
||||
c.symbol_adapt8 = dav1d_msac_decode_symbol_adapt_c;
|
||||
c.symbol_adapt16 = dav1d_msac_decode_symbol_adapt_c;
|
||||
|
||||
#if ARCH_X86_64 && HAVE_ASM
|
||||
if (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_SSE2) {
|
||||
c.symbol_adapt4 = dav1d_msac_decode_symbol_adapt4_sse2;
|
||||
c.symbol_adapt8 = dav1d_msac_decode_symbol_adapt8_sse2;
|
||||
c.symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2;
|
||||
}
|
||||
#endif
|
||||
|
||||
check_decode_symbol_adapt(&c);
|
||||
}
|
|
@ -34,7 +34,10 @@ endif
|
|||
libdav1d_nasm_objs_if_needed = []
|
||||
|
||||
if is_asm_enabled
|
||||
checkasm_sources = files('checkasm/checkasm.c')
|
||||
checkasm_sources = files(
|
||||
'checkasm/checkasm.c',
|
||||
'checkasm/msac.c',
|
||||
)
|
||||
|
||||
checkasm_tmpl_sources = files(
|
||||
'checkasm/cdef.c',
|
||||
|
|
Загрузка…
Ссылка в новой задаче