Bug 1540830 - Update dav1d from upstream to 1f7a7e8. r=TD-Linux

Differential Revision: https://phabricator.services.mozilla.com/D28200

--HG--
extra : moz-landing-system : lando
This commit is contained in:
Alex Chronopoulos 2019-04-19 20:36:10 +00:00
Родитель 931da4b767
Коммит d1bd6b015b
38 изменённых файлов: 8762 добавлений и 945 удалений

Просмотреть файл

@ -20,7 +20,7 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: commit 7350c59e7894cb7e487a0add9942d2b1b39f7161 (2019-03-16T23:17:05.000Z).
release: commit 1f7a7e8a6af739a05b320151d04f0f7509ae7579 (2019-04-19T07:16:39.000Z).
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/

Просмотреть файл

@ -1,2 +1,2 @@
/* auto-generated, do not edit */
#define DAV1D_VERSION "0.2.2"
#define DAV1D_VERSION "1f7a7e8a6af739a05b320151d04f0f7509ae7579"

6
third_party/dav1d/.gitlab-ci.yml поставляемый
Просмотреть файл

@ -12,6 +12,12 @@ style-check:
script:
- git grep -n -P "\t|\r| $" -- . ':(exclude)*/compat/*' && exit 1
- git grep -n -i "david" -- . ':(exclude)THANKS.md' ':(exclude).gitlab-ci.yml' && exit 1
- for i in $(git ls-files -- . ':(exclude)*/compat/*'); do
if [ -n "$(tail -c 1 "$i")" ]; then
echo "No newline at end of $i";
exit 1;
fi;
done
- git remote rm upstream 2> /dev/null || true
- git remote add upstream https://code.videolan.org/videolan/dav1d.git
- git fetch -q upstream master

8
third_party/dav1d/NEWS поставляемый
Просмотреть файл

@ -1,6 +1,14 @@
Changes for 0.2.2 'Antelope':
----------------------------
- Large improvement on MSAC decoding with SSE, bringing 4-6% speed increase
The impact is important on SSSE3, SSE4 and AVX-2 cpus
- SSSE3 optimizations for all blocks size in itx
- SSSE3 optimizations for ipred_paeth and ipred_cfl (420, 422 and 444)
- Speed improvements on CDEF for SSE4 CPUs
- NEON optimizations for SGR and loop filter
- Minor crashes, improvements and build changes
Changes for 0.2.1 'Antelope':
----------------------------

6
third_party/dav1d/include/dav1d/common.h поставляемый
Просмотреть файл

@ -33,7 +33,11 @@
#ifndef DAV1D_API
#if defined _WIN32
#define DAV1D_API __declspec(dllexport)
#if defined DAV1D_BUILDING_DLL
#define DAV1D_API __declspec(dllexport)
#else
#define DAV1D_API
#endif
#else
#if __GNUC__ >= 4
#define DAV1D_API __attribute__ ((visibility ("default")))

18
third_party/dav1d/src/arm/32/mc.S поставляемый
Просмотреть файл

@ -217,8 +217,8 @@ bidir_fn mask
// This has got the same signature as the put_8tap functions,
// assumes that the caller has loaded the h argument into r5,
// and assumes that r8 is set to (24-clz(w)).
function put
// and assumes that r8 is set to (clz(w)-24).
function put_neon
adr r9, L(put_tbl)
ldr r8, [r9, r8, lsl #2]
add r9, r9, r8
@ -307,9 +307,9 @@ endfunc
// This has got the same signature as the put_8tap functions,
// assumes that the caller has loaded the h argument into r5,
// and assumes that r8 is set to (24-clz(w)), and r7 to w*2.
function prep
// assumes that the caller has loaded the h argument into r4,
// and assumes that r8 is set to (clz(w)-24), and r7 to w*2.
function prep_neon
adr r9, L(prep_tbl)
ldr r8, [r9, r8, lsl #2]
add r9, r9, r8
@ -660,7 +660,7 @@ function \op\()_8tap_\type\()_8bpc_neon, export=1
push {r4-r11,lr}
movw r8, \type_h
movw r9, \type_v
b \op\()_8tap
b \op\()_8tap_neon
endfunc
.endm
@ -680,7 +680,7 @@ make_8tap_fn \type, sharp, SHARP, SHARP
make_8tap_fn \type, sharp_regular, SHARP, REGULAR
make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
function \type\()_8tap
function \type\()_8tap_neon
ldrd r4, r5, [sp, #36]
ldrd r6, r7, [sp, #44]
movw r10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
@ -699,7 +699,7 @@ function \type\()_8tap
bne L(\type\()_8tap_h)
tst \my, #(0x7f << 14)
bne L(\type\()_8tap_v)
b \type
b \type\()_neon
L(\type\()_8tap_h):
cmp \w, #4
@ -1831,7 +1831,7 @@ function \type\()_bilin_8bpc_neon, export=1
bne L(\type\()_bilin_h)
cmp \my, #0
bne L(\type\()_bilin_v)
b \type
b \type\()_neon
L(\type\()_bilin_h):
cmp \my, #0

32
third_party/dav1d/src/arm/32/util.S поставляемый
Просмотреть файл

@ -34,32 +34,32 @@
.macro movrel rd, val, offset=0
#if defined(PIC) && defined(__APPLE__)
ldr \rd, 1f
b 2f
ldr \rd, 1f
b 2f
1:
.word 3f - (2f + 8 - 4 * CONFIG_THUMB)
.word 3f - (2f + 8 - 4 * CONFIG_THUMB)
2:
ldr \rd, [pc, \rd]
ldr \rd, [pc, \rd]
.if \offset < 0
sub \rd, \rd, #-(\offset)
sub \rd, \rd, #-(\offset)
.elseif \offset > 0
add \rd, \rd, #\offset
add \rd, \rd, #\offset
.endif
.non_lazy_symbol_pointer
.non_lazy_symbol_pointer
3:
.indirect_symbol \val
.word 0
.text
.indirect_symbol \val
.word 0
.text
#elif defined(PIC)
ldr \rd, 1f
b 2f
ldr \rd, 1f
b 2f
1:
.word \val + \offset - (2f + 8 - 4 * CONFIG_THUMB)
.word \val + \offset - (2f + 8 - 4 * CONFIG_THUMB)
2:
add \rd, \rd, pc
add \rd, \rd, pc
#else
movw \rd, #:lower16:\val+\offset
movt \rd, #:upper16:\val+\offset
movw \rd, #:lower16:\val+\offset
movt \rd, #:upper16:\val+\offset
#endif
.endm

1124
third_party/dav1d/src/arm/64/loopfilter.S поставляемый Normal file

Разница между файлами не показана из-за своего большого размера Загрузить разницу

1372
third_party/dav1d/src/arm/64/looprestoration.S поставляемый

Разница между файлами не показана из-за своего большого размера Загрузить разницу

34
third_party/dav1d/src/arm/64/mc.S поставляемый
Просмотреть файл

@ -235,8 +235,8 @@ bidir_fn mask
// This has got the same signature as the put_8tap functions,
// and assumes that x8 is set to (24-clz(w)).
function put
// and assumes that x8 is set to (clz(w)-24).
function put_neon
adr x9, L(put_tbl)
ldrh w8, [x9, x8, lsl #1]
sub x9, x9, w8, uxtw
@ -330,8 +330,8 @@ endfunc
// This has got the same signature as the prep_8tap functions,
// and assumes that x8 is set to (24-clz(w)), and x7 to w*2.
function prep
// and assumes that x8 is set to (clz(w)-24), and x7 to w*2.
function prep_neon
adr x9, L(prep_tbl)
ldrh w8, [x9, x8, lsl #1]
sub x9, x9, w8, uxtw
@ -703,7 +703,7 @@ endfunc
function \op\()_8tap_\type\()_8bpc_neon, export=1
mov x8, \type_h
mov x9, \type_v
b \op\()_8tap
b \op\()_8tap\()_neon
endfunc
.endm
@ -723,7 +723,7 @@ make_8tap_fn \type, sharp, SHARP, SHARP
make_8tap_fn \type, sharp_regular, SHARP, REGULAR
make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
function \type\()_8tap
function \type\()_8tap_neon
mov w10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
mul \mx, \mx, w10
mul \my, \my, w10
@ -741,7 +741,7 @@ function \type\()_8tap
b.ne L(\type\()_8tap_h)
tst \my, #(0x7f << 14)
b.ne L(\type\()_8tap_v)
b \type
b \type\()_neon
L(\type\()_8tap_h):
cmp \w, #4
@ -1826,7 +1826,7 @@ function \type\()_bilin_8bpc_neon, export=1
sub w8, w8, #24
cbnz \mx, L(\type\()_bilin_h)
cbnz \my, L(\type\()_bilin_v)
b \type
b \type\()_neon
L(\type\()_bilin_h):
cbnz \my, L(\type\()_bilin_hv)
@ -2335,7 +2335,7 @@ filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6
add \src, \src, \inc
.endm
function warp_filter_horz
function warp_filter_horz_neon
add w12, w5, #512
ld1 {v16.8b, v17.8b}, [x2], x3
@ -2431,24 +2431,24 @@ function warp_affine_8x8\t\()_8bpc_neon, export=1
lsl x1, x1, #1
.endif
bl warp_filter_horz
bl warp_filter_horz_neon
mov v24.16b, v16.16b
bl warp_filter_horz
bl warp_filter_horz_neon
mov v25.16b, v16.16b
bl warp_filter_horz
bl warp_filter_horz_neon
mov v26.16b, v16.16b
bl warp_filter_horz
bl warp_filter_horz_neon
mov v27.16b, v16.16b
bl warp_filter_horz
bl warp_filter_horz_neon
mov v28.16b, v16.16b
bl warp_filter_horz
bl warp_filter_horz_neon
mov v29.16b, v16.16b
bl warp_filter_horz
bl warp_filter_horz_neon
mov v30.16b, v16.16b
1:
add w14, w6, #512
bl warp_filter_horz
bl warp_filter_horz_neon
mov v31.16b, v16.16b
load_filter_row d0, w14, w9

115
third_party/dav1d/src/arm/64/util.S поставляемый
Просмотреть файл

@ -35,57 +35,98 @@
.macro movrel rd, val, offset=0
#if defined(__APPLE__)
.if \offset < 0
adrp \rd, \val@PAGE
add \rd, \rd, \val@PAGEOFF
sub \rd, \rd, -(\offset)
adrp \rd, \val@PAGE
add \rd, \rd, \val@PAGEOFF
sub \rd, \rd, -(\offset)
.else
adrp \rd, \val+(\offset)@PAGE
add \rd, \rd, \val+(\offset)@PAGEOFF
adrp \rd, \val+(\offset)@PAGE
add \rd, \rd, \val+(\offset)@PAGEOFF
.endif
#elif defined(PIC) && defined(_WIN32)
.if \offset < 0
adrp \rd, \val
add \rd, \rd, :lo12:\val
sub \rd, \rd, -(\offset)
adrp \rd, \val
add \rd, \rd, :lo12:\val
sub \rd, \rd, -(\offset)
.else
adrp \rd, \val+(\offset)
add \rd, \rd, :lo12:\val+(\offset)
adrp \rd, \val+(\offset)
add \rd, \rd, :lo12:\val+(\offset)
.endif
#elif defined(PIC)
adrp \rd, \val+(\offset)
add \rd, \rd, :lo12:\val+(\offset)
adrp \rd, \val+(\offset)
add \rd, \rd, :lo12:\val+(\offset)
#else
ldr \rd, =\val+\offset
ldr \rd, =\val+\offset
#endif
.endm
.macro transpose_8x8b r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
trn1 \r8\().8b, \r0\().8b, \r1\().8b
trn2 \r9\().8b, \r0\().8b, \r1\().8b
trn1 \r1\().8b, \r2\().8b, \r3\().8b
trn2 \r3\().8b, \r2\().8b, \r3\().8b
trn1 \r0\().8b, \r4\().8b, \r5\().8b
trn2 \r5\().8b, \r4\().8b, \r5\().8b
trn1 \r2\().8b, \r6\().8b, \r7\().8b
trn2 \r7\().8b, \r6\().8b, \r7\().8b
trn1 \r8\().8b, \r0\().8b, \r1\().8b
trn2 \r9\().8b, \r0\().8b, \r1\().8b
trn1 \r1\().8b, \r2\().8b, \r3\().8b
trn2 \r3\().8b, \r2\().8b, \r3\().8b
trn1 \r0\().8b, \r4\().8b, \r5\().8b
trn2 \r5\().8b, \r4\().8b, \r5\().8b
trn1 \r2\().8b, \r6\().8b, \r7\().8b
trn2 \r7\().8b, \r6\().8b, \r7\().8b
trn1 \r4\().4h, \r0\().4h, \r2\().4h
trn2 \r2\().4h, \r0\().4h, \r2\().4h
trn1 \r6\().4h, \r5\().4h, \r7\().4h
trn2 \r7\().4h, \r5\().4h, \r7\().4h
trn1 \r5\().4h, \r9\().4h, \r3\().4h
trn2 \r9\().4h, \r9\().4h, \r3\().4h
trn1 \r3\().4h, \r8\().4h, \r1\().4h
trn2 \r8\().4h, \r8\().4h, \r1\().4h
trn1 \r4\().4h, \r0\().4h, \r2\().4h
trn2 \r2\().4h, \r0\().4h, \r2\().4h
trn1 \r6\().4h, \r5\().4h, \r7\().4h
trn2 \r7\().4h, \r5\().4h, \r7\().4h
trn1 \r5\().4h, \r9\().4h, \r3\().4h
trn2 \r9\().4h, \r9\().4h, \r3\().4h
trn1 \r3\().4h, \r8\().4h, \r1\().4h
trn2 \r8\().4h, \r8\().4h, \r1\().4h
trn1 \r0\().2s, \r3\().2s, \r4\().2s
trn2 \r4\().2s, \r3\().2s, \r4\().2s
trn1 \r1\().2s, \r5\().2s, \r6\().2s
trn2 \r5\().2s, \r5\().2s, \r6\().2s
trn2 \r6\().2s, \r8\().2s, \r2\().2s
trn1 \r2\().2s, \r8\().2s, \r2\().2s
trn1 \r3\().2s, \r9\().2s, \r7\().2s
trn2 \r7\().2s, \r9\().2s, \r7\().2s
trn1 \r0\().2s, \r3\().2s, \r4\().2s
trn2 \r4\().2s, \r3\().2s, \r4\().2s
trn1 \r1\().2s, \r5\().2s, \r6\().2s
trn2 \r5\().2s, \r5\().2s, \r6\().2s
trn2 \r6\().2s, \r8\().2s, \r2\().2s
trn1 \r2\().2s, \r8\().2s, \r2\().2s
trn1 \r3\().2s, \r9\().2s, \r7\().2s
trn2 \r7\().2s, \r9\().2s, \r7\().2s
.endm
.macro transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
trn1 \r8\().16b, \r0\().16b, \r1\().16b
trn2 \r9\().16b, \r0\().16b, \r1\().16b
trn1 \r1\().16b, \r2\().16b, \r3\().16b
trn2 \r3\().16b, \r2\().16b, \r3\().16b
trn1 \r0\().16b, \r4\().16b, \r5\().16b
trn2 \r5\().16b, \r4\().16b, \r5\().16b
trn1 \r2\().16b, \r6\().16b, \r7\().16b
trn2 \r7\().16b, \r6\().16b, \r7\().16b
trn1 \r4\().8h, \r0\().8h, \r2\().8h
trn2 \r2\().8h, \r0\().8h, \r2\().8h
trn1 \r6\().8h, \r5\().8h, \r7\().8h
trn2 \r7\().8h, \r5\().8h, \r7\().8h
trn1 \r5\().8h, \r9\().8h, \r3\().8h
trn2 \r9\().8h, \r9\().8h, \r3\().8h
trn1 \r3\().8h, \r8\().8h, \r1\().8h
trn2 \r8\().8h, \r8\().8h, \r1\().8h
trn1 \r0\().4s, \r3\().4s, \r4\().4s
trn2 \r4\().4s, \r3\().4s, \r4\().4s
trn1 \r1\().4s, \r5\().4s, \r6\().4s
trn2 \r5\().4s, \r5\().4s, \r6\().4s
trn2 \r6\().4s, \r8\().4s, \r2\().4s
trn1 \r2\().4s, \r8\().4s, \r2\().4s
trn1 \r3\().4s, \r9\().4s, \r7\().4s
trn2 \r7\().4s, \r9\().4s, \r7\().4s
.endm
.macro transpose_4x16b r0, r1, r2, r3, t4, t5, t6, t7
trn1 \t4\().16b, \r0\().16b, \r1\().16b
trn2 \t5\().16b, \r0\().16b, \r1\().16b
trn1 \t6\().16b, \r2\().16b, \r3\().16b
trn2 \t7\().16b, \r2\().16b, \r3\().16b
trn1 \r0\().8h, \t4\().8h, \t6\().8h
trn2 \r2\().8h, \t4\().8h, \t6\().8h
trn1 \r1\().8h, \t5\().8h, \t7\().8h
trn2 \r3\().8h, \t5\().8h, \t7\().8h
.endm
#endif /* DAV1D_SRC_ARM_64_UTIL_S */

32
third_party/dav1d/src/arm/asm.S поставляемый
Просмотреть файл

@ -31,12 +31,12 @@
#include "config.h"
#if ARCH_ARM
.syntax unified
.syntax unified
#ifdef __ELF__
.arch armv7-a
.fpu neon
.eabi_attribute 10, 0 // suppress Tag_FP_arch
.eabi_attribute 12, 0 // suppress Tag_Advanced_SIMD_arch
.arch armv7-a
.fpu neon
.eabi_attribute 10, 0 // suppress Tag_FP_arch
.eabi_attribute 12, 0 // suppress Tag_Advanced_SIMD_arch
#endif
#ifdef _WIN32
@ -46,7 +46,7 @@
#endif
#if CONFIG_THUMB
.thumb
.thumb
#define A @
#define T
#else
@ -86,25 +86,25 @@
#endif
.purgem endfunc
.endm
.text
.align \align
.if \export
.global EXTERN\name
.text
.align \align
.if \export
.global EXTERN\name
#ifdef __ELF__
.type EXTERN\name, %function
.type EXTERN\name, %function
#endif
#if HAVE_AS_FUNC
.func EXTERN\name
.func EXTERN\name
#endif
EXTERN\name:
.else
.else
#ifdef __ELF__
.type \name, %function
.type \name, %function
#endif
#if HAVE_AS_FUNC
.func \name
.func \name
#endif
.endif
.endif
\name:
.endm

47
third_party/dav1d/src/arm/loopfilter_init_tmpl.c поставляемый Normal file
Просмотреть файл

@ -0,0 +1,47 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/cpu.h"
#include "src/loopfilter.h"
decl_loopfilter_sb_fn(dav1d_lpf_h_sb_y_neon);
decl_loopfilter_sb_fn(dav1d_lpf_v_sb_y_neon);
decl_loopfilter_sb_fn(dav1d_lpf_h_sb_uv_neon);
decl_loopfilter_sb_fn(dav1d_lpf_v_sb_uv_neon);
/* Install the NEON loop-filter entry points into the per-bitdepth DSP
 * vtable. Currently only wired up for 8bpc on AArch64; 32-bit ARM and
 * higher bitdepths keep the default (C) implementations. */
void bitfn(dav1d_loop_filter_dsp_init_arm)(Dav1dLoopFilterDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
// No NEON on this CPU: leave the C function pointers untouched.
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
#if BITDEPTH == 8 && ARCH_AARCH64
// Indices inferred from the function names: first index selects the
// plane group (y vs uv), second the filter direction (h vs v).
// NOTE(review): confirm against the Dav1dLoopFilterDSPContext layout.
c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_neon;
c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_neon;
c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_neon;
c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_neon;
#endif
}

Просмотреть файл

@ -29,6 +29,7 @@
#include "src/looprestoration.h"
#include "common/attributes.h"
#include "src/tables.h"
#if BITDEPTH == 8
// This calculates things slightly differently than the reference C version.
@ -91,7 +92,171 @@ static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, tmp, w & 7, h);
}
}
#endif
#if ARCH_AARCH64
void dav1d_sgr_box3_h_neon(int32_t *sumsq, int16_t *sum,
const pixel (*left)[4],
const pixel *src, const ptrdiff_t stride,
const int w, const int h,
const enum LrEdgeFlags edges);
void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
const int w, const int h,
const enum LrEdgeFlags edges);
void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
const int w, const int h, const int strength);
void dav1d_sgr_finish_filter1_neon(coef *tmp,
const pixel *src, const ptrdiff_t stride,
const int32_t *a, const int16_t *b,
const int w, const int h);
/* Self-guided filter pass with a 3x3 box (radius=1).
 * Computes box sums/sum-of-squares over src (plus up to one extra row
 * above/below taken from the loop-filtered buffer lpf when the
 * LR_HAVE_TOP/LR_HAVE_BOTTOM edge flags are set), derives the a/b
 * coefficients for the given strength, and writes the filtered
 * intermediate into tmp. All heavy lifting is in external NEON asm. */
static void dav1d_sgr_filter1_neon(coef *tmp,
const pixel *src, const ptrdiff_t stride,
const pixel (*left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h, const int strength,
const enum LrEdgeFlags edges)
{
// Scratch rows of width 384+16; two guard rows above (the *2 offset)
// so the TOP path can write at negative row indices. a/b alias the
// same storage: sumsq/sum are consumed in place by calc_ab1.
ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
dav1d_sgr_box3_h_neon(sumsq, sum, left, src, stride, w, h, edges);
if (edges & LR_HAVE_TOP)
// One extra row above the block, sourced from the lpf buffer.
dav1d_sgr_box3_h_neon(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
NULL, lpf, lpf_stride, w, 1, edges);
if (edges & LR_HAVE_BOTTOM)
// Extra rows below; lpf + 6*stride skips the rows used for TOP.
// NOTE(review): bottom uses h=2 while top uses h=1 — presumably
// intentional for the box3 vertical pass; confirm against the asm.
dav1d_sgr_box3_h_neon(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
NULL, lpf + 6 * PXSTRIDE(lpf_stride),
lpf_stride, w, 2, edges);
dav1d_sgr_box3_v_neon(sumsq, sum, w, h, edges);
dav1d_sgr_calc_ab1_neon(a, b, w, h, strength);
dav1d_sgr_finish_filter1_neon(tmp, src, stride, a, b, w, h);
}
void dav1d_sgr_box5_h_neon(int32_t *sumsq, int16_t *sum,
const pixel (*left)[4],
const pixel *src, const ptrdiff_t stride,
const int w, const int h,
const enum LrEdgeFlags edges);
void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
const int w, const int h,
const enum LrEdgeFlags edges);
void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
const int w, const int h, const int strength);
void dav1d_sgr_finish_filter2_neon(coef *tmp,
const pixel *src, const ptrdiff_t stride,
const int32_t *a, const int16_t *b,
const int w, const int h);
/* Self-guided filter pass with a 5x5 box (radius=2).
 * Mirror of dav1d_sgr_filter1_neon but with 5x5 box sums, two extra
 * rows above/below (when the corresponding edge flags are set), and
 * the ab2/finish_filter2 NEON kernels. Result goes into tmp. */
static void dav1d_sgr_filter2_neon(coef *tmp,
const pixel *src, const ptrdiff_t stride,
const pixel (*left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h, const int strength,
const enum LrEdgeFlags edges)
{
// Same scratch layout as the radius-1 variant: two guard rows above,
// a/b aliasing the sumsq/sum storage (consumed in place by calc_ab2).
ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
dav1d_sgr_box5_h_neon(sumsq, sum, left, src, stride, w, h, edges);
if (edges & LR_HAVE_TOP)
// Two extra rows above the block (radius 2), from the lpf buffer.
dav1d_sgr_box5_h_neon(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
NULL, lpf, lpf_stride, w, 2, edges);
if (edges & LR_HAVE_BOTTOM)
// Two extra rows below; lpf + 6*stride skips the TOP rows.
dav1d_sgr_box5_h_neon(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
NULL, lpf + 6 * PXSTRIDE(lpf_stride),
lpf_stride, w, 2, edges);
dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges);
dav1d_sgr_calc_ab2_neon(a, b, w, h, strength);
dav1d_sgr_finish_filter2_neon(tmp, src, stride, a, b, w, h);
}
void dav1d_sgr_weighted1_neon(pixel *dst, const ptrdiff_t dst_stride,
const pixel *src, const ptrdiff_t src_stride,
const coef *t1, const int w, const int h,
const int wt);
void dav1d_sgr_weighted2_neon(pixel *dst, const ptrdiff_t dst_stride,
const pixel *src, const ptrdiff_t src_stride,
const coef *t1, const coef *t2,
const int w, const int h,
const int16_t wt[2]);
/* Top-level NEON self-guided restoration filter, installed as
 * c->selfguided. Depending on the sgr_idx parameter set it runs the
 * radius-1 pass, the radius-2 pass, or both blended together, then
 * weights the intermediate(s) back into dst in-place.
 * The weighted kernels process multiples of 8 pixels; any remaining
 * (w & 7) tail columns are filtered into a small stripe buffer and
 * copied out narrowly. */
static void sgr_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
const pixel (*const left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h, const int sgr_idx,
const int16_t sgr_wt[7], const enum LrEdgeFlags edges)
{
// params[sgr_idx][0] == 0: no radius-2 component -> radius-1 only.
if (!dav1d_sgr_params[sgr_idx][0]) {
ALIGN_STK_16(coef, tmp, 64 * 384,);
dav1d_sgr_filter1_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
w, h, dav1d_sgr_params[sgr_idx][3], edges);
if (w >= 8)
// Weight (1<<7) - sgr_wt[1]: single-pass blend of tmp into dst.
dav1d_sgr_weighted1_neon(dst, dst_stride, dst, dst_stride,
tmp, w & ~7, h, (1 << 7) - sgr_wt[1]);
if (w & 7) {
// For uneven widths, do a full 8 pixel wide filtering into a temp
// buffer and copy out the narrow slice of pixels separately into
// dest.
// NOTE(review): stripe stride is passed as (w & 7) pixels —
// assumes stride is in pixel units here (true for 8bpc); confirm.
ALIGN_STK_16(pixel, stripe, 64 * 8,);
dav1d_sgr_weighted1_neon(stripe, w & 7, dst + (w & ~7), dst_stride,
tmp + (w & ~7), w & 7, h,
(1 << 7) - sgr_wt[1]);
dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, stripe,
w & 7, h);
}
// params[sgr_idx][1] == 0: no radius-1 component -> radius-2 only.
} else if (!dav1d_sgr_params[sgr_idx][1]) {
ALIGN_STK_16(coef, tmp, 64 * 384,);
dav1d_sgr_filter2_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
w, h, dav1d_sgr_params[sgr_idx][2], edges);
if (w >= 8)
dav1d_sgr_weighted1_neon(dst, dst_stride, dst, dst_stride,
tmp, w & ~7, h, sgr_wt[0]);
if (w & 7) {
// For uneven widths, do a full 8 pixel wide filtering into a temp
// buffer and copy out the narrow slice of pixels separately into
// dest.
ALIGN_STK_16(pixel, stripe, 64 * 8,);
dav1d_sgr_weighted1_neon(stripe, w & 7, dst + (w & ~7), dst_stride,
tmp + (w & ~7), w & 7, h, sgr_wt[0]);
dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, stripe,
w & 7, h);
}
// Both components active: run both passes and blend with two weights
// that sum (with the implicit source weight) to 128.
} else {
ALIGN_STK_16(coef, tmp1, 64 * 384,);
ALIGN_STK_16(coef, tmp2, 64 * 384,);
dav1d_sgr_filter2_neon(tmp1, dst, dst_stride, left, lpf, lpf_stride,
w, h, dav1d_sgr_params[sgr_idx][2], edges);
dav1d_sgr_filter1_neon(tmp2, dst, dst_stride, left, lpf, lpf_stride,
w, h, dav1d_sgr_params[sgr_idx][3], edges);
const int16_t wt[2] = { sgr_wt[0], 128 - sgr_wt[0] - sgr_wt[1] };
if (w >= 8)
dav1d_sgr_weighted2_neon(dst, dst_stride, dst, dst_stride,
tmp1, tmp2, w & ~7, h, wt);
if (w & 7) {
// For uneven widths, do a full 8 pixel wide filtering into a temp
// buffer and copy out the narrow slice of pixels separately into
// dest.
ALIGN_STK_16(pixel, stripe, 64 * 8,);
dav1d_sgr_weighted2_neon(stripe, w & 7, dst + (w & ~7), dst_stride,
tmp1 + (w & ~7), tmp2 + (w & ~7),
w & 7, h, wt);
dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, stripe,
w & 7, h);
}
}
}
#endif // ARCH_AARCH64
#endif // BITDEPTH == 8
void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
@ -100,5 +265,8 @@ void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *
#if BITDEPTH == 8
c->wiener = wiener_filter_neon;
#if ARCH_AARCH64
c->selfguided = sgr_filter_neon;
#endif
#endif
}

2
third_party/dav1d/src/cdf.c поставляемый
Просмотреть файл

@ -813,7 +813,7 @@ static const uint16_t default_mv_joint_cdf[N_MV_JOINTS + 1] = {
AOM_CDF4(4096, 11264, 19328)
};
static const uint16_t default_kf_y_mode_cdf[5][5][N_INTRA_PRED_MODES + 1] = {
static const uint16_t default_kf_y_mode_cdf[5][5][N_INTRA_PRED_MODES + 1 + 2] = {
{
{ AOM_CDF13(15588, 17027, 19338, 20218, 20682, 21110, 21825, 23244,
24189, 28165, 29093, 30466) },

20
third_party/dav1d/src/cdf.h поставляемый
Просмотреть файл

@ -34,11 +34,13 @@
#include "src/ref.h"
#include "src/thread_data.h"
/* Buffers padded to [8] or [16] for SIMD where needed. */
typedef struct CdfModeContext {
uint16_t y_mode[4][N_INTRA_PRED_MODES + 1];
uint16_t y_mode[4][N_INTRA_PRED_MODES + 1 + 2];
uint16_t use_filter_intra[N_BS_SIZES][2];
uint16_t filter_intra[5 + 1];
uint16_t uv_mode[2][N_INTRA_PRED_MODES][N_UV_INTRA_PRED_MODES + 1];
uint16_t uv_mode[2][N_INTRA_PRED_MODES][N_UV_INTRA_PRED_MODES + 1 + 1];
uint16_t angle_delta[8][8];
uint16_t filter[2][8][DAV1D_N_SWITCHABLE_FILTERS + 1];
uint16_t newmv_mode[6][2];
@ -66,7 +68,7 @@ typedef struct CdfModeContext {
uint16_t txtp_intra[3][N_TX_SIZES][N_INTRA_PRED_MODES][N_TX_TYPES + 1];
uint16_t skip[3][2];
uint16_t skip_mode[3][2];
uint16_t partition[N_BL_LEVELS][4][N_PARTITIONS + 1];
uint16_t partition[N_BL_LEVELS][4][N_PARTITIONS + 1 + 5];
uint16_t seg_pred[3][2];
uint16_t seg_id[3][DAV1D_MAX_SEGMENTS + 1];
uint16_t cfl_sign[8 + 1];
@ -88,12 +90,12 @@ typedef struct CdfModeContext {
typedef struct CdfCoefContext {
uint16_t skip[N_TX_SIZES][13][2];
uint16_t eob_bin_16[2][2][6];
uint16_t eob_bin_32[2][2][7];
uint16_t eob_bin_32[2][2][7 + 1];
uint16_t eob_bin_64[2][2][8];
uint16_t eob_bin_128[2][2][9];
uint16_t eob_bin_256[2][2][10];
uint16_t eob_bin_512[2][2][11];
uint16_t eob_bin_1024[2][2][12];
uint16_t eob_bin_256[2][2][10 + 6];
uint16_t eob_bin_512[2][2][11 + 5];
uint16_t eob_bin_1024[2][2][12 + 4];
uint16_t eob_hi_bit[N_TX_SIZES][2][11 /*22*/][2];
uint16_t eob_base_tok[N_TX_SIZES][2][4][4];
uint16_t base_tok[N_TX_SIZES][2][41][5];
@ -102,7 +104,7 @@ typedef struct CdfCoefContext {
} CdfCoefContext;
typedef struct CdfMvComponent {
uint16_t classes[11 + 1];
uint16_t classes[11 + 1 + 4];
uint16_t class0[2];
uint16_t classN[10][2];
uint16_t class0_fp[2][4 + 1];
@ -119,7 +121,7 @@ typedef struct CdfMvContext {
typedef struct CdfContext {
CdfModeContext m;
uint16_t kfym[5][5][N_INTRA_PRED_MODES + 1];
uint16_t kfym[5][5][N_INTRA_PRED_MODES + 1 + 2];
CdfCoefContext coef;
CdfMvContext mv, dmv;
} CdfContext;

72
third_party/dav1d/src/decode.c поставляемый
Просмотреть файл

@ -80,15 +80,15 @@ static int read_mv_component_diff(Dav1dTileContext *const t,
const Dav1dFrameContext *const f = t->f;
const int have_hp = f->frame_hdr->hp;
const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->sign);
const int cl = dav1d_msac_decode_symbol_adapt(&ts->msac,
mv_comp->classes, 11);
const int cl = dav1d_msac_decode_symbol_adapt16(&ts->msac,
mv_comp->classes, 11);
int up, fp, hp;
if (!cl) {
up = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->class0);
if (have_fp) {
fp = dav1d_msac_decode_symbol_adapt(&ts->msac,
mv_comp->class0_fp[up], 4);
fp = dav1d_msac_decode_symbol_adapt4(&ts->msac,
mv_comp->class0_fp[up], 4);
hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac,
mv_comp->class0_hp) : 1;
} else {
@ -101,8 +101,8 @@ static int read_mv_component_diff(Dav1dTileContext *const t,
up |= dav1d_msac_decode_bool_adapt(&ts->msac,
mv_comp->classN[n]) << n;
if (have_fp) {
fp = dav1d_msac_decode_symbol_adapt(&ts->msac,
mv_comp->classN_fp, 4);
fp = dav1d_msac_decode_symbol_adapt4(&ts->msac,
mv_comp->classN_fp, 4);
hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac,
mv_comp->classN_hp) : 1;
} else {
@ -119,8 +119,8 @@ static int read_mv_component_diff(Dav1dTileContext *const t,
static void read_mv_residual(Dav1dTileContext *const t, mv *const ref_mv,
CdfMvContext *const mv_cdf, const int have_fp)
{
switch (dav1d_msac_decode_symbol_adapt(&t->ts->msac, t->ts->cdf.mv.joint,
N_MV_JOINTS))
switch (dav1d_msac_decode_symbol_adapt4(&t->ts->msac, t->ts->cdf.mv.joint,
N_MV_JOINTS))
{
case MV_JOINT_HV:
ref_mv->y += read_mv_component_diff(t, &mv_cdf->comp[0], have_fp);
@ -379,7 +379,7 @@ static void read_pal_plane(Dav1dTileContext *const t, Av1Block *const b,
{
Dav1dTileState *const ts = t->ts;
const Dav1dFrameContext *const f = t->f;
const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt(&ts->msac,
const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac,
ts->cdf.m.pal_sz[pl][sz_ctx], 7) + 2;
uint16_t cache[16], used_cache[8];
int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4];
@ -595,7 +595,7 @@ static void read_pal_indices(Dav1dTileContext *const t,
const int last = imax(0, i - h4 * 4 + 1);
order_palette(pal_idx, stride, i, first, last, order, ctx);
for (int j = first, m = 0; j >= last; j--, m++) {
const int color_idx = dav1d_msac_decode_symbol_adapt(&ts->msac,
const int color_idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
color_map_cdf[ctx[m]], b->pal_sz[pl]);
pal_idx[(i - j) * stride + j] = order[m][color_idx];
}
@ -811,7 +811,7 @@ static int decode_b(Dav1dTileContext *const t,
const unsigned pred_seg_id =
get_cur_frame_segid(t->by, t->bx, have_top, have_left,
&seg_ctx, f->cur_segmap, f->b4_stride);
const unsigned diff = dav1d_msac_decode_symbol_adapt(&ts->msac,
const unsigned diff = dav1d_msac_decode_symbol_adapt8(&ts->msac,
ts->cdf.m.seg_id[seg_ctx],
DAV1D_MAX_SEGMENTS);
const unsigned last_active_seg_id =
@ -883,7 +883,7 @@ static int decode_b(Dav1dTileContext *const t,
if (b->skip) {
b->seg_id = pred_seg_id;
} else {
const unsigned diff = dav1d_msac_decode_symbol_adapt(&ts->msac,
const unsigned diff = dav1d_msac_decode_symbol_adapt8(&ts->msac,
ts->cdf.m.seg_id[seg_ctx],
DAV1D_MAX_SEGMENTS);
const unsigned last_active_seg_id =
@ -932,8 +932,8 @@ static int decode_b(Dav1dTileContext *const t,
memcpy(prev_delta_lf, ts->last_delta_lf, 4);
if (have_delta_q) {
int delta_q = dav1d_msac_decode_symbol_adapt(&ts->msac,
ts->cdf.m.delta_q, 4);
int delta_q = dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.delta_q, 4);
if (delta_q == 3) {
const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3);
delta_q = dav1d_msac_decode_bools(&ts->msac, n_bits) +
@ -953,7 +953,7 @@ static int decode_b(Dav1dTileContext *const t,
f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 ? 4 : 2 : 1;
for (int i = 0; i < n_lfs; i++) {
int delta_lf = dav1d_msac_decode_symbol_adapt(&ts->msac,
int delta_lf = dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.delta_lf[i + f->frame_hdr->delta.lf.multi], 4);
if (delta_lf == 3) {
const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3);
@ -1018,8 +1018,8 @@ static int decode_b(Dav1dTileContext *const t,
ts->cdf.m.y_mode[dav1d_ymode_size_context[bs]] :
ts->cdf.kfym[dav1d_intra_mode_context[t->a->mode[bx4]]]
[dav1d_intra_mode_context[t->l.mode[by4]]];
b->y_mode = dav1d_msac_decode_symbol_adapt(&ts->msac, ymode_cdf,
N_INTRA_PRED_MODES);
b->y_mode = dav1d_msac_decode_symbol_adapt16(&ts->msac, ymode_cdf,
N_INTRA_PRED_MODES);
if (DEBUG_BLOCK_INFO)
printf("Post-ymode[%d]: r=%d\n", b->y_mode, ts->msac.rng);
@ -1028,7 +1028,7 @@ static int decode_b(Dav1dTileContext *const t,
b->y_mode <= VERT_LEFT_PRED)
{
uint16_t *const acdf = ts->cdf.m.angle_delta[b->y_mode - VERT_PRED];
const int angle = dav1d_msac_decode_symbol_adapt(&ts->msac, acdf, 7);
const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 7);
b->y_angle = angle - 3;
} else {
b->y_angle = 0;
@ -1038,20 +1038,20 @@ static int decode_b(Dav1dTileContext *const t,
const int cfl_allowed = f->frame_hdr->segmentation.lossless[b->seg_id] ?
cbw4 == 1 && cbh4 == 1 : !!(cfl_allowed_mask & (1 << bs));
uint16_t *const uvmode_cdf = ts->cdf.m.uv_mode[cfl_allowed][b->y_mode];
b->uv_mode = dav1d_msac_decode_symbol_adapt(&ts->msac, uvmode_cdf,
b->uv_mode = dav1d_msac_decode_symbol_adapt16(&ts->msac, uvmode_cdf,
N_UV_INTRA_PRED_MODES - !cfl_allowed);
if (DEBUG_BLOCK_INFO)
printf("Post-uvmode[%d]: r=%d\n", b->uv_mode, ts->msac.rng);
if (b->uv_mode == CFL_PRED) {
#define SIGN(a) (!!(a) + ((a) > 0))
const int sign = dav1d_msac_decode_symbol_adapt(&ts->msac,
const int sign = dav1d_msac_decode_symbol_adapt8(&ts->msac,
ts->cdf.m.cfl_sign, 8) + 1;
const int sign_u = sign * 0x56 >> 8, sign_v = sign - sign_u * 3;
assert(sign_u == sign / 3);
if (sign_u) {
const int ctx = (sign_u == 2) * 3 + sign_v;
b->cfl_alpha[0] = dav1d_msac_decode_symbol_adapt(&ts->msac,
b->cfl_alpha[0] = dav1d_msac_decode_symbol_adapt16(&ts->msac,
ts->cdf.m.cfl_alpha[ctx], 16) + 1;
if (sign_u == 1) b->cfl_alpha[0] = -b->cfl_alpha[0];
} else {
@ -1059,7 +1059,7 @@ static int decode_b(Dav1dTileContext *const t,
}
if (sign_v) {
const int ctx = (sign_v == 2) * 3 + sign_u;
b->cfl_alpha[1] = dav1d_msac_decode_symbol_adapt(&ts->msac,
b->cfl_alpha[1] = dav1d_msac_decode_symbol_adapt16(&ts->msac,
ts->cdf.m.cfl_alpha[ctx], 16) + 1;
if (sign_v == 1) b->cfl_alpha[1] = -b->cfl_alpha[1];
} else {
@ -1073,7 +1073,7 @@ static int decode_b(Dav1dTileContext *const t,
b->uv_mode <= VERT_LEFT_PRED)
{
uint16_t *const acdf = ts->cdf.m.angle_delta[b->uv_mode - VERT_PRED];
const int angle = dav1d_msac_decode_symbol_adapt(&ts->msac, acdf, 7);
const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 7);
b->uv_angle = angle - 3;
} else {
b->uv_angle = 0;
@ -1113,7 +1113,7 @@ static int decode_b(Dav1dTileContext *const t,
ts->cdf.m.use_filter_intra[bs]);
if (is_filter) {
b->y_mode = FILTER_PRED;
b->y_angle = dav1d_msac_decode_symbol_adapt(&ts->msac,
b->y_angle = dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.filter_intra, 5);
}
if (DEBUG_BLOCK_INFO)
@ -1156,7 +1156,7 @@ static int decode_b(Dav1dTileContext *const t,
if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE && t_dim->max > TX_4X4) {
const int tctx = get_tx_ctx(t->a, &t->l, t_dim, by4, bx4);
uint16_t *const tx_cdf = ts->cdf.m.txsz[t_dim->max - 1][tctx];
int depth = dav1d_msac_decode_symbol_adapt(&ts->msac, tx_cdf,
int depth = dav1d_msac_decode_symbol_adapt4(&ts->msac, tx_cdf,
imin(t_dim->max + 1, 3));
while (depth--) {
@ -1474,7 +1474,7 @@ static int decode_b(Dav1dTileContext *const t,
ts->tiling.col_end, ts->tiling.row_start,
ts->tiling.row_end, f->libaom_cm);
b->inter_mode = dav1d_msac_decode_symbol_adapt(&ts->msac,
b->inter_mode = dav1d_msac_decode_symbol_adapt8(&ts->msac,
ts->cdf.m.comp_inter_mode[ctx],
N_COMP_INTER_PRED_MODES);
if (DEBUG_BLOCK_INFO)
@ -1583,7 +1583,7 @@ static int decode_b(Dav1dTileContext *const t,
dav1d_msac_decode_bool_adapt(&ts->msac,
ts->cdf.m.wedge_comp[ctx]);
if (b->comp_type == COMP_INTER_WEDGE)
b->wedge_idx = dav1d_msac_decode_symbol_adapt(&ts->msac,
b->wedge_idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
ts->cdf.m.wedge_idx[ctx], 16);
} else {
b->comp_type = COMP_INTER_SEG;
@ -1737,7 +1737,7 @@ static int decode_b(Dav1dTileContext *const t,
dav1d_msac_decode_bool_adapt(&ts->msac,
ts->cdf.m.interintra[ii_sz_grp]))
{
b->interintra_mode = dav1d_msac_decode_symbol_adapt(&ts->msac,
b->interintra_mode = dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.interintra_mode[ii_sz_grp],
N_INTER_INTRA_PRED_MODES);
const int wedge_ctx = dav1d_wedge_ctx_lut[bs];
@ -1745,7 +1745,7 @@ static int decode_b(Dav1dTileContext *const t,
dav1d_msac_decode_bool_adapt(&ts->msac,
ts->cdf.m.interintra_wedge[wedge_ctx]);
if (b->interintra_type == INTER_INTRA_WEDGE)
b->wedge_idx = dav1d_msac_decode_symbol_adapt(&ts->msac,
b->wedge_idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
ts->cdf.m.wedge_idx[wedge_ctx], 16);
} else {
b->interintra_type = INTER_INTRA_NONE;
@ -1778,7 +1778,7 @@ static int decode_b(Dav1dTileContext *const t,
f->frame_hdr->warp_motion && (mask[0] | mask[1]);
b->motion_mode = allow_warp ?
dav1d_msac_decode_symbol_adapt(&ts->msac,
dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.motion_mode[bs], 3) :
dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.obmc[bs]);
if (b->motion_mode == MM_WARP) {
@ -1817,7 +1817,7 @@ static int decode_b(Dav1dTileContext *const t,
const int comp = b->comp_type != COMP_INTER_NONE;
const int ctx1 = get_filter_ctx(t->a, &t->l, comp, 0, b->ref[0],
by4, bx4);
filter[0] = dav1d_msac_decode_symbol_adapt(&ts->msac,
filter[0] = dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.filter[0][ctx1],
DAV1D_N_SWITCHABLE_FILTERS);
if (f->seq_hdr->dual_filter) {
@ -1826,7 +1826,7 @@ static int decode_b(Dav1dTileContext *const t,
if (DEBUG_BLOCK_INFO)
printf("Post-subpel_filter1[%d,ctx=%d]: r=%d\n",
filter[0], ctx1, ts->msac.rng);
filter[1] = dav1d_msac_decode_symbol_adapt(&ts->msac,
filter[1] = dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.filter[1][ctx2],
DAV1D_N_SWITCHABLE_FILTERS);
if (DEBUG_BLOCK_INFO)
@ -2021,7 +2021,7 @@ static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl,
} else {
const unsigned n_part = bl == BL_8X8 ? N_SUB8X8_PARTITIONS :
bl == BL_128X128 ? N_PARTITIONS - 2 : N_PARTITIONS;
bp = dav1d_msac_decode_symbol_adapt(&t->ts->msac, pc, n_part);
bp = dav1d_msac_decode_symbol_adapt16(&t->ts->msac, pc, n_part);
if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I422 &&
(bp == PARTITION_V || bp == PARTITION_V4 ||
bp == PARTITION_T_LEFT_SPLIT || bp == PARTITION_T_RIGHT_SPLIT))
@ -2365,7 +2365,7 @@ static void read_restoration_info(Dav1dTileContext *const t,
Dav1dTileState *const ts = t->ts;
if (frame_type == DAV1D_RESTORATION_SWITCHABLE) {
const int filter = dav1d_msac_decode_symbol_adapt(&ts->msac,
const int filter = dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.restore_switchable, 3);
lr->type = filter ? filter == 2 ? DAV1D_RESTORATION_SGRPROJ :
DAV1D_RESTORATION_WIENER :
@ -2692,7 +2692,9 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
freep(&f->lf.level);
freep(&f->frame_thread.b);
f->lf.mask = malloc(f->sb128w * f->sb128h * sizeof(*f->lf.mask));
f->lf.level = malloc(f->sb128w * f->sb128h * 32 * 32 *
// over-allocate by 3 bytes since some of the SIMD implementations
// index this from the level type and can thus over-read by up to 3
f->lf.level = malloc(3 + f->sb128w * f->sb128h * 32 * 32 *
sizeof(*f->lf.level));
if (!f->lf.mask || !f->lf.level) goto error;
if (c->n_fc > 1) {

103
third_party/dav1d/src/itx_tmpl.c поставляемый
Просмотреть файл

@ -45,7 +45,7 @@ typedef void (*itx_1d_fn)(const coef *in, ptrdiff_t in_s,
static void NOINLINE
inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
coef *const coeff, const int eob,
const int w, const int h, const int shift1, const int shift2,
const int w, const int h, const int shift,
const itx_1d_fn first_1d_fn, const itx_1d_fn second_1d_fn,
const int has_dconly HIGHBD_DECL_SUFFIX)
{
@ -53,8 +53,7 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
assert((h >= 4 || h <= 64) && (w >= 4 || w <= 64));
const int is_rect2 = w * 2 == h || h * 2 == w;
const int bitdepth = bitdepth_from_max(bitdepth_max);
const int rnd1 = (1 << shift1) >> 1;
const int rnd2 = (1 << shift2) >> 1;
const int rnd = (1 << shift) >> 1;
if (has_dconly && eob == 0) {
int dc = coeff[0];
@ -62,9 +61,9 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
if (is_rect2)
dc = (dc * 2896 + 2048) >> 12;
dc = (dc * 2896 + 2048) >> 12;
dc = (dc + rnd1) >> shift1;
dc = (dc + rnd) >> shift;
dc = (dc * 2896 + 2048) >> 12;
dc = (dc + rnd2) >> shift2;
dc = (dc + 8) >> 4;
for (j = 0; j < h; j++)
for (i = 0; i < w; i++)
dst[i + j * PXSTRIDE(stride)] =
@ -93,9 +92,9 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
}
for (j = 0; j < w; j++)
#if BITDEPTH == 8
tmp[i * w + j] = (tmp[i * w + j] + (rnd1)) >> shift1;
tmp[i * w + j] = (tmp[i * w + j] + rnd) >> shift;
#else
tmp[i * w + j] = iclip((tmp[i * w + j] + (rnd1)) >> shift1,
tmp[i * w + j] = iclip((tmp[i * w + j] + rnd) >> shift,
-col_clip_max - 1, col_clip_max);
#endif
}
@ -106,12 +105,12 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
for (j = 0; j < h; j++)
dst[i + j * PXSTRIDE(stride)] =
iclip_pixel(dst[i + j * PXSTRIDE(stride)] +
((out[j] + (rnd2)) >> shift2));
((out[j] + 8) >> 4));
}
memset(coeff, 0, sizeof(*coeff) * sh * sw);
}
#define inv_txfm_fn(type1, type2, w, h, shift1, shift2, has_dconly) \
#define inv_txfm_fn(type1, type2, w, h, shift, has_dconly) \
static void \
inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
const ptrdiff_t stride, \
@ -119,57 +118,57 @@ inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
const int eob \
HIGHBD_DECL_SUFFIX) \
{ \
inv_txfm_add_c(dst, stride, coeff, eob, w, h, shift1, shift2, \
inv_txfm_add_c(dst, stride, coeff, eob, w, h, shift, \
inv_##type1##w##_1d, inv_##type2##h##_1d, has_dconly \
HIGHBD_TAIL_SUFFIX); \
}
#define inv_txfm_fn64(w, h, shift1, shift2) \
inv_txfm_fn(dct, dct, w, h, shift1, shift2, 1)
#define inv_txfm_fn64(w, h, shift) \
inv_txfm_fn(dct, dct, w, h, shift, 1)
#define inv_txfm_fn32(w, h, shift1, shift2) \
inv_txfm_fn64(w, h, shift1, shift2) \
inv_txfm_fn(identity, identity, w, h, shift1, shift2, 0)
#define inv_txfm_fn32(w, h, shift) \
inv_txfm_fn64(w, h, shift) \
inv_txfm_fn(identity, identity, w, h, shift, 0)
#define inv_txfm_fn16(w, h, shift1, shift2) \
inv_txfm_fn32(w, h, shift1, shift2) \
inv_txfm_fn(adst, dct, w, h, shift1, shift2, 0) \
inv_txfm_fn(dct, adst, w, h, shift1, shift2, 0) \
inv_txfm_fn(adst, adst, w, h, shift1, shift2, 0) \
inv_txfm_fn(dct, flipadst, w, h, shift1, shift2, 0) \
inv_txfm_fn(flipadst, dct, w, h, shift1, shift2, 0) \
inv_txfm_fn(adst, flipadst, w, h, shift1, shift2, 0) \
inv_txfm_fn(flipadst, adst, w, h, shift1, shift2, 0) \
inv_txfm_fn(flipadst, flipadst, w, h, shift1, shift2, 0) \
inv_txfm_fn(identity, dct, w, h, shift1, shift2, 0) \
inv_txfm_fn(dct, identity, w, h, shift1, shift2, 0) \
#define inv_txfm_fn16(w, h, shift) \
inv_txfm_fn32(w, h, shift) \
inv_txfm_fn(adst, dct, w, h, shift, 0) \
inv_txfm_fn(dct, adst, w, h, shift, 0) \
inv_txfm_fn(adst, adst, w, h, shift, 0) \
inv_txfm_fn(dct, flipadst, w, h, shift, 0) \
inv_txfm_fn(flipadst, dct, w, h, shift, 0) \
inv_txfm_fn(adst, flipadst, w, h, shift, 0) \
inv_txfm_fn(flipadst, adst, w, h, shift, 0) \
inv_txfm_fn(flipadst, flipadst, w, h, shift, 0) \
inv_txfm_fn(identity, dct, w, h, shift, 0) \
inv_txfm_fn(dct, identity, w, h, shift, 0) \
#define inv_txfm_fn84(w, h, shift1, shift2) \
inv_txfm_fn16(w, h, shift1, shift2) \
inv_txfm_fn(identity, flipadst, w, h, shift1, shift2, 0) \
inv_txfm_fn(flipadst, identity, w, h, shift1, shift2, 0) \
inv_txfm_fn(identity, adst, w, h, shift1, shift2, 0) \
inv_txfm_fn(adst, identity, w, h, shift1, shift2, 0) \
#define inv_txfm_fn84(w, h, shift) \
inv_txfm_fn16(w, h, shift) \
inv_txfm_fn(identity, flipadst, w, h, shift, 0) \
inv_txfm_fn(flipadst, identity, w, h, shift, 0) \
inv_txfm_fn(identity, adst, w, h, shift, 0) \
inv_txfm_fn(adst, identity, w, h, shift, 0) \
inv_txfm_fn84( 4, 4, 0, 4)
inv_txfm_fn84( 4, 8, 0, 4)
inv_txfm_fn84( 4, 16, 1, 4)
inv_txfm_fn84( 8, 4, 0, 4)
inv_txfm_fn84( 8, 8, 1, 4)
inv_txfm_fn84( 8, 16, 1, 4)
inv_txfm_fn32( 8, 32, 2, 4)
inv_txfm_fn84(16, 4, 1, 4)
inv_txfm_fn84(16, 8, 1, 4)
inv_txfm_fn16(16, 16, 2, 4)
inv_txfm_fn32(16, 32, 1, 4)
inv_txfm_fn64(16, 64, 2, 4)
inv_txfm_fn32(32, 8, 2, 4)
inv_txfm_fn32(32, 16, 1, 4)
inv_txfm_fn32(32, 32, 2, 4)
inv_txfm_fn64(32, 64, 1, 4)
inv_txfm_fn64(64, 16, 2, 4)
inv_txfm_fn64(64, 32, 1, 4)
inv_txfm_fn64(64, 64, 2, 4)
inv_txfm_fn84( 4, 4, 0)
inv_txfm_fn84( 4, 8, 0)
inv_txfm_fn84( 4, 16, 1)
inv_txfm_fn84( 8, 4, 0)
inv_txfm_fn84( 8, 8, 1)
inv_txfm_fn84( 8, 16, 1)
inv_txfm_fn32( 8, 32, 2)
inv_txfm_fn84(16, 4, 1)
inv_txfm_fn84(16, 8, 1)
inv_txfm_fn16(16, 16, 2)
inv_txfm_fn32(16, 32, 1)
inv_txfm_fn64(16, 64, 2)
inv_txfm_fn32(32, 8, 2)
inv_txfm_fn32(32, 16, 1)
inv_txfm_fn32(32, 32, 2)
inv_txfm_fn64(32, 64, 1)
inv_txfm_fn64(64, 16, 2)
inv_txfm_fn64(64, 32, 1)
inv_txfm_fn64(64, 64, 2)
static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
coef *const coeff, const int eob

1
third_party/dav1d/src/loopfilter.h поставляемый
Просмотреть файл

@ -53,6 +53,7 @@ typedef struct Dav1dLoopFilterDSPContext {
} Dav1dLoopFilterDSPContext;
bitfn_decls(void dav1d_loop_filter_dsp_init, Dav1dLoopFilterDSPContext *c);
bitfn_decls(void dav1d_loop_filter_dsp_init_arm, Dav1dLoopFilterDSPContext *c);
bitfn_decls(void dav1d_loop_filter_dsp_init_x86, Dav1dLoopFilterDSPContext *c);
#endif /* DAV1D_SRC_LOOPFILTER_H */

6
third_party/dav1d/src/loopfilter_tmpl.c поставляемый
Просмотреть файл

@ -250,7 +250,11 @@ void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) {
c->loop_filter_sb[1][0] = loop_filter_h_sb128uv_c;
c->loop_filter_sb[1][1] = loop_filter_v_sb128uv_c;
#if HAVE_ASM && ARCH_X86
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
bitfn(dav1d_loop_filter_dsp_init_arm)(c);
#elif ARCH_X86
bitfn(dav1d_loop_filter_dsp_init_x86)(c);
#endif
#endif
}

45
third_party/dav1d/src/meson.build поставляемый
Просмотреть файл

@ -86,12 +86,14 @@ if is_asm_enabled
)
libdav1d_tmpl_sources += files(
'arm/cdef_init_tmpl.c',
'arm/loopfilter_init_tmpl.c',
'arm/looprestoration_init_tmpl.c',
'arm/mc_init_tmpl.c',
)
if host_machine.cpu_family() == 'aarch64'
libdav1d_sources += files(
'arm/64/cdef.S',
'arm/64/loopfilter.S',
'arm/64/looprestoration.S',
'arm/64/mc.S',
)
@ -118,20 +120,31 @@ if is_asm_enabled
# NASM source files
libdav1d_sources_asm = files(
'x86/cdef.asm',
'x86/cdef_ssse3.asm',
'x86/cpuid.asm',
'x86/ipred.asm',
'x86/itx.asm',
'x86/loopfilter.asm',
'x86/looprestoration.asm',
'x86/looprestoration_ssse3.asm',
'x86/mc.asm',
'x86/mc_ssse3.asm',
'x86/itx_ssse3.asm',
'x86/ipred_ssse3.asm',
'x86/msac.asm',
)
if dav1d_bitdepths.contains('8')
libdav1d_sources_asm += files(
'x86/cdef.asm',
'x86/cdef_sse.asm',
'x86/ipred.asm',
'x86/ipred_ssse3.asm',
'x86/itx.asm',
'x86/itx_ssse3.asm',
'x86/loopfilter.asm',
'x86/looprestoration.asm',
'x86/looprestoration_ssse3.asm',
'x86/mc.asm',
'x86/mc_ssse3.asm',
)
endif
if dav1d_bitdepths.contains('16')
libdav1d_sources_asm += files(
)
endif
# Compile the ASM sources with NASM
libdav1d_nasm_objs = nasm_gen.process(libdav1d_sources_asm)
endif
@ -139,8 +152,10 @@ endif
api_export_flags = []
#
# Windows .rc file
# Windows .rc file and API export flags
#
if host_machine.system() == 'windows' and get_option('default_library') != 'static'
@ -162,6 +177,8 @@ if host_machine.system() == 'windows' and get_option('default_library') != 'stat
)
libdav1d_rc_obj = winmod.compile_resources(rc_file)
api_export_flags = ['-DDAV1D_BUILDING_DLL']
else
libdav1d_rc_obj = []
endif
@ -180,7 +197,7 @@ libdav1d_entrypoints_objs = static_library('dav1d_entrypoint',
include_directories : dav1d_inc_dirs,
dependencies: [stdatomic_dependency],
c_args : [stackalign_flag, stackrealign_flag],
c_args : [stackalign_flag, stackrealign_flag, api_export_flags],
install : false,
build_by_default : false,
).extract_all_objects()
@ -222,7 +239,7 @@ libdav1d = library('dav1d',
thread_dependency,
thread_compat_dep,
],
c_args : [stackalign_flag],
c_args : [stackalign_flag, api_export_flags],
version : dav1d_soname_version,
soversion : dav1d_soversion,
install : true,

92
third_party/dav1d/src/msac.c поставляемый
Просмотреть файл

@ -58,8 +58,8 @@ static inline void ctx_refill(MsacContext *s) {
* necessary), and stores them back in the decoder context.
* dif: The new value of dif.
* rng: The new value of the range. */
static inline void ctx_norm(MsacContext *s, ec_win dif, uint32_t rng) {
const uint16_t d = 15 - (31 ^ clz(rng));
static inline void ctx_norm(MsacContext *s, ec_win dif, unsigned rng) {
const int d = 15 ^ (31 ^ clz(rng));
assert(rng <= 65535U);
s->cnt -= d;
s->dif = ((dif + 1) << d) - 1; /* Shift in 1s in the LSBs */
@ -69,18 +69,17 @@ static inline void ctx_norm(MsacContext *s, ec_win dif, uint32_t rng) {
}
unsigned dav1d_msac_decode_bool_equi(MsacContext *const s) {
ec_win v, vw, dif = s->dif;
uint16_t r = s->rng;
unsigned ret;
ec_win vw, dif = s->dif;
unsigned ret, v, r = s->rng;
assert((dif >> (EC_WIN_SIZE - 16)) < r);
// When the probability is 1/2, f = 16384 >> EC_PROB_SHIFT = 256 and we can
// replace the multiply with a simple shift.
v = ((r >> 8) << 7) + EC_MIN_PROB;
vw = v << (EC_WIN_SIZE - 16);
vw = (ec_win)v << (EC_WIN_SIZE - 16);
ret = dif >= vw;
dif -= ret*vw;
v += ret*(r - 2*v);
ctx_norm(s, dif, (unsigned) v);
ctx_norm(s, dif, v);
return !ret;
}
@ -88,59 +87,57 @@ unsigned dav1d_msac_decode_bool_equi(MsacContext *const s) {
* f: The probability that the bit is one
* Return: The value decoded (0 or 1). */
unsigned dav1d_msac_decode_bool(MsacContext *const s, const unsigned f) {
ec_win v, vw, dif = s->dif;
uint16_t r = s->rng;
unsigned ret;
ec_win vw, dif = s->dif;
unsigned ret, v, r = s->rng;
assert((dif >> (EC_WIN_SIZE - 16)) < r);
v = ((r >> 8) * (f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)) + EC_MIN_PROB;
vw = v << (EC_WIN_SIZE - 16);
vw = (ec_win)v << (EC_WIN_SIZE - 16);
ret = dif >= vw;
dif -= ret*vw;
v += ret*(r - 2*v);
ctx_norm(s, dif, (unsigned) v);
ctx_norm(s, dif, v);
return !ret;
}
unsigned dav1d_msac_decode_bools(MsacContext *const c, const unsigned l) {
int v = 0;
for (int n = (int) l - 1; n >= 0; n--)
v = (v << 1) | dav1d_msac_decode_bool_equi(c);
unsigned dav1d_msac_decode_bools(MsacContext *const s, unsigned n) {
unsigned v = 0;
while (n--)
v = (v << 1) | dav1d_msac_decode_bool_equi(s);
return v;
}
int dav1d_msac_decode_subexp(MsacContext *const c, const int ref,
int dav1d_msac_decode_subexp(MsacContext *const s, const int ref,
const int n, const unsigned k)
{
int i = 0;
int a = 0;
int b = k;
while ((2 << b) < n) {
if (!dav1d_msac_decode_bool_equi(c)) break;
if (!dav1d_msac_decode_bool_equi(s)) break;
b = k + i++;
a = (1 << b);
}
const unsigned v = dav1d_msac_decode_bools(c, b) + a;
const unsigned v = dav1d_msac_decode_bools(s, b) + a;
return ref * 2 <= n ? inv_recenter(ref, v) :
n - 1 - inv_recenter(n - 1 - ref, v);
}
int dav1d_msac_decode_uniform(MsacContext *const c, const unsigned n) {
int dav1d_msac_decode_uniform(MsacContext *const s, const unsigned n) {
assert(n > 0);
const int l = ulog2(n) + 1;
assert(l > 1);
const unsigned m = (1 << l) - n;
const unsigned v = dav1d_msac_decode_bools(c, l - 1);
return v < m ? v : (v << 1) - m + dav1d_msac_decode_bool_equi(c);
const unsigned v = dav1d_msac_decode_bools(s, l - 1);
return v < m ? v : (v << 1) - m + dav1d_msac_decode_bool_equi(s);
}
/* Decodes a symbol given an inverse cumulative distribution function (CDF)
* table in Q15. */
static unsigned decode_symbol(MsacContext *const s, const uint16_t *const cdf,
const unsigned n_symbols)
const size_t n_symbols)
{
ec_win u, v = s->rng, r = s->rng >> 8;
const ec_win c = s->dif >> (EC_WIN_SIZE - 16);
unsigned ret = 0;
const unsigned c = s->dif >> (EC_WIN_SIZE - 16);
unsigned u, v = s->rng, r = s->rng >> 8, ret = 0;
assert(!cdf[n_symbols - 1]);
@ -148,44 +145,39 @@ static unsigned decode_symbol(MsacContext *const s, const uint16_t *const cdf,
u = v;
v = r * (cdf[ret++] >> EC_PROB_SHIFT);
v >>= 7 - EC_PROB_SHIFT;
v += EC_MIN_PROB * (n_symbols - ret);
v += EC_MIN_PROB * (int) (n_symbols - ret);
} while (c < v);
assert(u <= s->rng);
ctx_norm(s, s->dif - (v << (EC_WIN_SIZE - 16)), (unsigned) (u - v));
ctx_norm(s, s->dif - ((ec_win)v << (EC_WIN_SIZE - 16)), u - v);
return ret - 1;
}
static void update_cdf(uint16_t *const cdf, const unsigned val,
const unsigned n_symbols)
unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *const s,
uint16_t *const cdf,
const size_t n_symbols)
{
const unsigned count = cdf[n_symbols];
const int rate = ((count >> 4) | 4) + (n_symbols > 3);
unsigned i;
for (i = 0; i < val; i++)
cdf[i] += (32768 - cdf[i]) >> rate;
for (; i < n_symbols - 1; i++)
cdf[i] -= cdf[i] >> rate;
cdf[n_symbols] = count + (count < 32);
}
unsigned dav1d_msac_decode_symbol_adapt(MsacContext *const c,
uint16_t *const cdf,
const unsigned n_symbols)
{
const unsigned val = decode_symbol(c, cdf, n_symbols);
if(c->allow_update_cdf)
update_cdf(cdf, val, n_symbols);
const unsigned val = decode_symbol(s, cdf, n_symbols);
if (s->allow_update_cdf) {
const unsigned count = cdf[n_symbols];
const int rate = ((count >> 4) | 4) + (n_symbols > 3);
unsigned i;
for (i = 0; i < val; i++)
cdf[i] += (32768 - cdf[i]) >> rate;
for (; i < n_symbols - 1; i++)
cdf[i] -= cdf[i] >> rate;
cdf[n_symbols] = count + (count < 32);
}
return val;
}
unsigned dav1d_msac_decode_bool_adapt(MsacContext *const c,
unsigned dav1d_msac_decode_bool_adapt(MsacContext *const s,
uint16_t *const cdf)
{
const unsigned bit = dav1d_msac_decode_bool(c, *cdf);
const unsigned bit = dav1d_msac_decode_bool(s, *cdf);
if(c->allow_update_cdf){
if (s->allow_update_cdf) {
// update_cdf() specialized for boolean CDFs
const unsigned count = cdf[1];
const int rate = (count >> 4) | 4;

33
third_party/dav1d/src/msac.h поставляемый
Просмотреть файл

@ -38,20 +38,37 @@ typedef struct MsacContext {
const uint8_t *buf_pos;
const uint8_t *buf_end;
ec_win dif;
uint16_t rng;
unsigned rng;
int cnt;
int allow_update_cdf;
} MsacContext;
void dav1d_msac_init(MsacContext *c, const uint8_t *data, size_t sz,
void dav1d_msac_init(MsacContext *s, const uint8_t *data, size_t sz,
int disable_cdf_update_flag);
unsigned dav1d_msac_decode_symbol_adapt(MsacContext *s, uint16_t *cdf,
const unsigned n_symbols);
unsigned dav1d_msac_decode_bool_equi(MsacContext *const s);
unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
unsigned dav1d_msac_decode_bool_equi(MsacContext *s);
unsigned dav1d_msac_decode_bool(MsacContext *s, unsigned f);
unsigned dav1d_msac_decode_bool_adapt(MsacContext *s, uint16_t *cdf);
unsigned dav1d_msac_decode_bools(MsacContext *c, unsigned l);
int dav1d_msac_decode_subexp(MsacContext *c, int ref, int n, unsigned k);
int dav1d_msac_decode_uniform(MsacContext *c, unsigned n);
unsigned dav1d_msac_decode_bools(MsacContext *s, unsigned n);
int dav1d_msac_decode_subexp(MsacContext *s, int ref, int n, unsigned k);
int dav1d_msac_decode_uniform(MsacContext *s, unsigned n);
/* Supported n_symbols ranges: adapt4: 1-5, adapt8: 1-8, adapt16: 4-16 */
#if ARCH_X86_64 && HAVE_ASM
unsigned dav1d_msac_decode_symbol_adapt4_sse2(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
unsigned dav1d_msac_decode_symbol_adapt8_sse2(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
unsigned dav1d_msac_decode_symbol_adapt16_sse2(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_sse2
#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_sse2
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2
#else
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt_c
#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt_c
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt_c
#endif
#endif /* DAV1D_SRC_MSAC_H */

28
third_party/dav1d/src/recon_tmpl.c поставляемый
Просмотреть файл

@ -107,7 +107,9 @@ static int decode_coefs(Dav1dTileContext *const t,
uint16_t *const txtp_cdf = intra ?
ts->cdf.m.txtp_intra[set_idx][t_dim->min][y_mode_nofilt] :
ts->cdf.m.txtp_inter[set_idx][t_dim->min];
idx = dav1d_msac_decode_symbol_adapt(&ts->msac, txtp_cdf, set_cnt);
idx = (set_cnt <= 8 ? dav1d_msac_decode_symbol_adapt8 :
dav1d_msac_decode_symbol_adapt16)(&ts->msac, txtp_cdf, set_cnt);
if (dbg)
printf("Post-txtp[%d->%d][%d->%d][%d][%d->%d]: r=%d\n",
set, set_idx, tx, t_dim->min, intra ? (int)y_mode_nofilt : -1,
@ -122,19 +124,19 @@ static int decode_coefs(Dav1dTileContext *const t,
const enum TxClass tx_class = dav1d_tx_type_class[*txtp];
const int is_1d = tx_class != TX_CLASS_2D;
switch (tx2dszctx) {
#define case_sz(sz, bin) \
#define case_sz(sz, bin, ns) \
case sz: { \
uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma][is_1d]; \
eob_bin = dav1d_msac_decode_symbol_adapt(&ts->msac, eob_bin_cdf, 5 + sz); \
eob_bin = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 5 + sz); \
break; \
}
case_sz(0, 16);
case_sz(1, 32);
case_sz(2, 64);
case_sz(3, 128);
case_sz(4, 256);
case_sz(5, 512);
case_sz(6, 1024);
case_sz(0, 16, 4);
case_sz(1, 32, 8);
case_sz(2, 64, 8);
case_sz(3, 128, 8);
case_sz(4, 256, 16);
case_sz(5, 512, 16);
case_sz(6, 1024, 16);
#undef case_sz
}
if (dbg)
@ -179,8 +181,8 @@ static int decode_coefs(Dav1dTileContext *const t,
uint16_t *const lo_cdf = is_last ?
ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma][ctx] :
ts->cdf.coef.base_tok[t_dim->ctx][chroma][ctx];
int tok = dav1d_msac_decode_symbol_adapt(&ts->msac, lo_cdf,
4 - is_last) + is_last;
int tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf,
4 - is_last) + is_last;
if (dbg)
printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n",
t_dim->ctx, chroma, ctx, i, rc, tok, ts->msac.rng);
@ -190,7 +192,7 @@ static int decode_coefs(Dav1dTileContext *const t,
if (tok == 3) {
const int br_ctx = get_br_ctx(levels, rc, tx, tx_class);
do {
const int tok_br = dav1d_msac_decode_symbol_adapt(&ts->msac,
const int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac,
br_cdf[br_ctx], 4);
if (dbg)
printf("Post-hi_tok[%d][%d][%d][%d=%d=%d->%d]: r=%d\n",

98
third_party/dav1d/src/x86/cdef.asm поставляемый
Просмотреть файл

@ -113,7 +113,7 @@ SECTION .text
paddw m15, m5
%endmacro
%macro cdef_filter_fn 3 ; w, h, stride
%macro CDEF_FILTER 3 ; w, h, stride
INIT_YMM avx2
%if %1 != 4 || %2 != 8
cglobal cdef_filter_%1x%2, 4, 9, 16, 2 * 16 + (%2+4)*%3, \
@ -135,7 +135,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
lea dst4q, [dstq+strideq*4]
%endif
lea stride3q, [strideq*3]
test edged, 2 ; have_right
test edgeb, 2 ; have_right
jz .no_right
pmovzxbw m1, [dstq+strideq*0]
pmovzxbw m2, [dstq+strideq*1]
@ -217,13 +217,13 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
; top
DEFINE_ARGS dst, stride, left, top2, pri, sec, stride3, top1, edge
test edged, 4 ; have_top
test edgeb, 4 ; have_top
jz .no_top
mov top1q, [top2q+0*gprsize]
mov top2q, [top2q+1*gprsize]
test edged, 1 ; have_left
test edgeb, 1 ; have_left
jz .top_no_left
test edged, 2 ; have_right
test edgeb, 2 ; have_right
jz .top_no_right
pmovzxbw m1, [top1q-(%1/2)]
pmovzxbw m2, [top2q-(%1/2)]
@ -239,7 +239,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
movd [px-1*%3+%1*2], xm14
jmp .top_done
.top_no_left:
test edged, 2 ; have_right
test edgeb, 2 ; have_right
jz .top_no_left_right
pmovzxbw m1, [top1q]
pmovzxbw m2, [top2q]
@ -272,7 +272,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
.top_done:
; left
test edged, 1 ; have_left
test edgeb, 1 ; have_left
jz .no_left
pmovzxbw xm1, [leftq+ 0]
%if %2 == 8
@ -304,12 +304,12 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
; bottom
DEFINE_ARGS dst, stride, dst8, dummy1, pri, sec, stride3, dummy3, edge
test edged, 8 ; have_bottom
test edgeb, 8 ; have_bottom
jz .no_bottom
lea dst8q, [dstq+%2*strideq]
test edged, 1 ; have_left
test edgeb, 1 ; have_left
jz .bottom_no_left
test edged, 2 ; have_right
test edgeb, 2 ; have_right
jz .bottom_no_right
pmovzxbw m1, [dst8q-(%1/2)]
pmovzxbw m2, [dst8q+strideq-(%1/2)]
@ -328,7 +328,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
movd [px+(%2+1)*%3+%1*2], xm14
jmp .bottom_done
.bottom_no_left:
test edged, 2 ; have_right
test edgeb, 2 ; have_right
jz .bottom_no_left_right
pmovzxbw m1, [dst8q]
pmovzxbw m2, [dst8q+strideq]
@ -362,50 +362,49 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
; actual filter
INIT_YMM avx2
DEFINE_ARGS dst, stride, pridmp, damping, pri, sec, stride3, secdmp
DEFINE_ARGS dst, stride, pridmp, damping, pri, secdmp, stride3, zero
%undef edged
; register to shuffle values into after packing
vbroadcasti128 m12, [shufb_lohi]
movifnidn prid, prim
movifnidn secd, secm
mov dampingd, r7m
mov pridmpd, prid
mov secdmpd, secd
or pridmpd, 1
or secdmpd, 1
lzcnt pridmpd, pridmpd
lzcnt secdmpd, secdmpd
lea pridmpd, [pridmpd+dampingd-31]
lea secdmpd, [secdmpd+dampingd-31]
xor dampingd, dampingd
test pridmpd, pridmpd
cmovl pridmpd, dampingd
test secdmpd, secdmpd
cmovl secdmpd, dampingd
lzcnt pridmpd, prid
%if UNIX64
movd xm0, prid
movd xm1, secdmpd
%endif
lzcnt secdmpd, secdmpm
sub dampingd, 31
xor zerod, zerod
add pridmpd, dampingd
cmovl pridmpd, zerod
add secdmpd, dampingd
cmovl secdmpd, zerod
mov [rsp+0], pridmpq ; pri_shift
mov [rsp+8], secdmpq ; sec_shift
DEFINE_ARGS dst, stride, pridmp, table, pri, sec, stride3, secdmp
DEFINE_ARGS dst, stride, pridmp, table, pri, secdmp, stride3
lea tableq, [tap_table]
vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
; pri/sec_taps[k] [4 total]
DEFINE_ARGS dst, stride, dummy, table, pri, sec, stride3
movd xm0, prid
movd xm1, secd
DEFINE_ARGS dst, stride, dir, table, pri, sec, stride3
%if UNIX64
vpbroadcastb m0, xm0 ; pri_strength
vpbroadcastb m1, xm1 ; sec_strength
%else
vpbroadcastb m0, prim
vpbroadcastb m1, secm
%endif
and prid, 1
lea priq, [tableq+priq*2+8] ; pri_taps
lea secq, [tableq+12] ; sec_taps
; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
DEFINE_ARGS dst, stride, dir, tap, pri, sec, stride3
mov dird, r6m
lea dirq, [tapq+dirq*2+14]
lea dirq, [tableq+dirq*2+14]
%if %1*%2*2/mmsize > 1
%if %1 == 4
DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, h, off, k
@ -476,9 +475,9 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
RET
%endmacro
cdef_filter_fn 8, 8, 32
cdef_filter_fn 4, 8, 32
cdef_filter_fn 4, 4, 32
CDEF_FILTER 8, 8, 32
CDEF_FILTER 4, 8, 32
CDEF_FILTER 4, 4, 32
INIT_YMM avx2
cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
@ -614,9 +613,9 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
paddw m11, m13 ; partial_sum_alt[3/2] right
vbroadcasti128 m13, [div_table+32]
paddw m4, m5 ; partial_sum_alt[3/2] left
pshuflw m11, m11, q3012
punpckhwd m6, m4, m11
punpcklwd m4, m11
pshuflw m5, m11, q3012
punpckhwd m6, m11, m4
punpcklwd m4, m5
pmaddwd m6, m6
pmaddwd m4, m4
pmulld m6, m12
@ -642,14 +641,14 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
paddw m6, m7
paddw m1, m3 ; partial_sum_alt[0/1] right
paddw m5, m6 ; partial_sum_alt[0/1] left
pshuflw m1, m1, q3012
punpckhwd m6, m5, m1
punpcklwd m5, m1
pmaddwd m6, m6
pshuflw m0, m1, q3012
punpckhwd m1, m5
punpcklwd m5, m0
pmaddwd m1, m1
pmaddwd m5, m5
pmulld m6, m12
pmulld m1, m12
pmulld m5, m13
paddd m5, m6 ; cost1[a-d] | cost3[a-d]
paddd m5, m1 ; cost1[a-d] | cost3[a-d]
mova xm0, [pd_47130256+ 16]
mova m1, [pd_47130256]
@ -661,11 +660,10 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
; now find the best cost
pmaxsd xm2, xm0, xm1
pshufd xm3, xm2, q3232
pshufd xm3, xm2, q1032
pmaxsd xm2, xm3
pshufd xm3, xm2, q1111
pmaxsd xm2, xm3
pshufd xm2, xm2, q0000 ; best cost
pshufd xm3, xm2, q2301
pmaxsd xm2, xm3 ; best cost
; find the idx using minpos
; make everything other than the best cost negative via subtraction
@ -676,7 +674,7 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
phminposuw xm3, xm3
; convert idx to 32-bits
psrldq xm3, 2
psrld xm3, 16
movd eax, xm3
; get idx^4 complement

15
third_party/dav1d/src/x86/cdef_init_tmpl.c поставляемый
Просмотреть файл

@ -29,15 +29,19 @@
#include "src/cdef.h"
decl_cdef_fn(dav1d_cdef_filter_8x8_avx2);
decl_cdef_fn(dav1d_cdef_filter_8x8_sse4);
decl_cdef_fn(dav1d_cdef_filter_8x8_ssse3);
decl_cdef_fn(dav1d_cdef_filter_4x8_avx2);
decl_cdef_fn(dav1d_cdef_filter_4x8_sse4);
decl_cdef_fn(dav1d_cdef_filter_4x8_ssse3);
decl_cdef_fn(dav1d_cdef_filter_4x4_avx2);
decl_cdef_fn(dav1d_cdef_filter_4x4_sse4);
decl_cdef_fn(dav1d_cdef_filter_4x4_ssse3);
decl_cdef_dir_fn(dav1d_cdef_dir_avx2);
decl_cdef_dir_fn(dav1d_cdef_dir_sse4);
decl_cdef_dir_fn(dav1d_cdef_dir_ssse3);
void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
@ -45,13 +49,22 @@ void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
#if BITDEPTH ==8
#if BITDEPTH == 8
c->dir = dav1d_cdef_dir_ssse3;
c->fb[0] = dav1d_cdef_filter_8x8_ssse3;
c->fb[1] = dav1d_cdef_filter_4x8_ssse3;
c->fb[2] = dav1d_cdef_filter_4x4_ssse3;
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
#if BITDEPTH == 8
c->dir = dav1d_cdef_dir_sse4;
c->fb[0] = dav1d_cdef_filter_8x8_sse4;
c->fb[1] = dav1d_cdef_filter_4x8_sse4;
c->fb[2] = dav1d_cdef_filter_4x4_sse4;
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
#if BITDEPTH == 8 && ARCH_X86_64

Разница между файлами не показана из-за своего большого размера Загрузить разницу

12
third_party/dav1d/src/x86/ipred_init_tmpl.c поставляемый
Просмотреть файл

@ -58,6 +58,7 @@ decl_angular_ipred_fn(dav1d_ipred_dc_top_ssse3);
decl_angular_ipred_fn(dav1d_ipred_dc_left_ssse3);
decl_angular_ipred_fn(dav1d_ipred_h_ssse3);
decl_angular_ipred_fn(dav1d_ipred_v_ssse3);
decl_angular_ipred_fn(dav1d_ipred_paeth_ssse3);
decl_angular_ipred_fn(dav1d_ipred_smooth_ssse3);
decl_angular_ipred_fn(dav1d_ipred_smooth_v_ssse3);
decl_angular_ipred_fn(dav1d_ipred_smooth_h_ssse3);
@ -67,6 +68,10 @@ decl_cfl_pred_fn(dav1d_ipred_cfl_128_ssse3);
decl_cfl_pred_fn(dav1d_ipred_cfl_top_ssse3);
decl_cfl_pred_fn(dav1d_ipred_cfl_left_ssse3);
decl_cfl_ac_fn(dav1d_ipred_cfl_ac_420_ssse3);
decl_cfl_ac_fn(dav1d_ipred_cfl_ac_422_ssse3);
decl_cfl_ac_fn(dav1d_ipred_cfl_ac_444_ssse3);
decl_pal_pred_fn(dav1d_pal_pred_ssse3);
void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
@ -81,6 +86,7 @@ void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
c->intra_pred[LEFT_DC_PRED] = dav1d_ipred_dc_left_ssse3;
c->intra_pred[HOR_PRED] = dav1d_ipred_h_ssse3;
c->intra_pred[VERT_PRED] = dav1d_ipred_v_ssse3;
c->intra_pred[PAETH_PRED] = dav1d_ipred_paeth_ssse3;
c->intra_pred[SMOOTH_PRED] = dav1d_ipred_smooth_ssse3;
c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_ssse3;
c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_ssse3;
@ -90,7 +96,11 @@ void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
c->cfl_pred[TOP_DC_PRED] = dav1d_ipred_cfl_top_ssse3;
c->cfl_pred[LEFT_DC_PRED] = dav1d_ipred_cfl_left_ssse3;
c->pal_pred = dav1d_pal_pred_ssse3;
c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_ipred_cfl_ac_420_ssse3;
c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_ipred_cfl_ac_422_ssse3;
c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_ipred_cfl_ac_444_ssse3;
c->pal_pred = dav1d_pal_pred_ssse3;
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;

1317
third_party/dav1d/src/x86/ipred_ssse3.asm поставляемый

Разница между файлами не показана из-за своего большого размера Загрузить разницу

21
third_party/dav1d/src/x86/itx_init_tmpl.c поставляемый
Просмотреть файл

@ -86,6 +86,17 @@ decl_itx16_fns(16, 4, ssse3);
decl_itx16_fns( 8, 16, ssse3);
decl_itx16_fns(16, 8, ssse3);
decl_itx12_fns(16, 16, ssse3);
decl_itx2_fns ( 8, 32, ssse3);
decl_itx2_fns (32, 8, ssse3);
decl_itx2_fns (16, 32, ssse3);
decl_itx2_fns (32, 16, ssse3);
decl_itx2_fns (32, 32, ssse3);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_16x64_ssse3);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_32x64_ssse3);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_ssse3);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_ssse3);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_ssse3);
void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
@ -138,6 +149,16 @@ void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
assign_itx16_fn(R, 8, 16, ssse3);
assign_itx16_fn(R, 16, 8, ssse3);
assign_itx12_fn(, 16, 16, ssse3);
assign_itx2_fn (R, 8, 32, ssse3);
assign_itx2_fn (R, 32, 8, ssse3);
assign_itx2_fn (R, 16, 32, ssse3);
assign_itx2_fn (R, 32, 16, ssse3);
assign_itx2_fn (, 32, 32, ssse3);
assign_itx1_fn (R, 16, 64, ssse3);
assign_itx1_fn (R, 32, 64, ssse3);
assign_itx1_fn (R, 64, 16, ssse3);
assign_itx1_fn (R, 64, 32, ssse3);
assign_itx1_fn ( , 64, 64, ssse3);
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;

3410
third_party/dav1d/src/x86/itx_ssse3.asm поставляемый

Разница между файлами не показана из-за своего большого размера Загрузить разницу

287
third_party/dav1d/src/x86/msac.asm поставляемый Normal file
Просмотреть файл

@ -0,0 +1,287 @@
; Copyright © 2019, VideoLAN and dav1d authors
; Copyright © 2019, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
SECTION_RODATA 64 ; avoids cacheline splits
dw 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
pw_0xff00: times 8 dw 0xff00
pw_32: times 8 dw 32
struc msac
.buf: resq 1
.end: resq 1
.dif: resq 1
.rng: resd 1
.cnt: resd 1
.update_cdf: resd 1
endstruc
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
SECTION .text
%if WIN64
DECLARE_REG_TMP 3
%define buf rsp+8 ; shadow space
%else
DECLARE_REG_TMP 0
%define buf rsp-40 ; red zone
%endif
INIT_XMM sse2
; unsigned dav1d_msac_decode_symbol_adapt4(MsacContext *s, uint16_t *cdf,
;                                          size_t n_symbols)
; SSE2 adaptive symbol decode for up to 4 symbols. Also hosts the shared
; .renorm/.renorm2/.refill_eob tails that the adapt8/adapt16 entry points
; jump into via m(msac_decode_symbol_adapt4).renorm*.
cglobal msac_decode_symbol_adapt4, 3, 7, 6, s, cdf, ns
; Broadcast rng and dif, scale every cdf entry against rng and add the
; per-symbol minimum offsets; candidates are spilled to the stack scratch.
movd m2, [sq+msac.rng]
movq m1, [cdfq]
lea rax, [pw_0xff00]
movq m3, [sq+msac.dif]
mov r3d, [sq+msac.update_cdf]
mov r4d, nsd
neg nsq
pshuflw m2, m2, q0000
movd [buf+12], m2 ; spill rng so .renorm can read "u" for symbol 0
pand m2, [rax]
mova m0, m1 ; keep the unscaled cdf for the update step
psrlw m1, 6
psllw m1, 7
pmulhuw m1, m2
movq m2, [rax+nsq*2] ; per-symbol minimum offsets (negative index)
pshuflw m3, m3, q3333 ; broadcast top word of dif
paddw m1, m2
mova [buf+16], m1 ; spill candidate values v[i]
psubusw m1, m3
pxor m2, m2
pcmpeqw m1, m2 ; c >= v
pmovmskb eax, m1 ; bitmask of symbols with v <= dif
test r3d, r3d
jz .renorm ; !allow_update_cdf
; update_cdf:
movzx r3d, word [cdfq+r4*2] ; count
pcmpeqw m2, m2
mov r2d, r3d
shr r3d, 4
cmp r4d, 4
sbb r3d, -5 ; (count >> 4) + (n_symbols > 3) + 4
cmp r2d, 32
adc r2d, 0 ; count + (count < 32)
movd m3, r3d
pavgw m2, m1 ; i >= val ? -1 : 32768
psubw m2, m0 ; for (i = 0; i < val; i++)
psubw m0, m1 ; cdf[i] += (32768 - cdf[i]) >> rate;
psraw m2, m3 ; for (; i < n_symbols - 1; i++)
paddw m0, m2 ; cdf[i] += (( -1 - cdf[i]) >> rate) + 1;
movq [cdfq], m0
mov [cdfq+r4*2], r2w ; store updated count
.renorm:
; eax = pmovmskb result; tzcnt locates the decoded symbol, then the
; bounds u/v are reloaded from the stack scratch.
tzcnt eax, eax
mov r4, [sq+msac.dif]
movzx r1d, word [buf+rax+16] ; v
movzx r2d, word [buf+rax+14] ; u
shr eax, 1 ; byte mask index -> symbol index (return value)
.renorm2:
; Renormalize: rng = u - v, shift dif/rng left until rng occupies the
; top bits again, refilling dif from the byte stream if cnt underflows.
not r4
sub r2d, r1d ; rng
shl r1, 48
add r4, r1 ; ~dif
mov r1d, [sq+msac.cnt]
movifnidn t0, sq
bsr ecx, r2d
xor ecx, 15 ; d
shl r2d, cl
shl r4, cl
mov [t0+msac.rng], r2d
not r4
sub r1d, ecx
jge .end ; no refill required
; refill:
mov r2, [t0+msac.buf]
mov rcx, [t0+msac.end]
lea r5, [r2+8]
cmp r5, rcx
jg .refill_eob ; fewer than 8 bytes left -> byte-wise path
mov r2, [r2]
lea ecx, [r1+23]
add r1d, 16
shr ecx, 3 ; shift_bytes
bswap r2 ; big-endian bitstream order
sub r5, rcx
shl ecx, 3 ; shift_bits
shr r2, cl
sub ecx, r1d ; shift_bits - 16 - cnt
mov r1d, 48
shl r2, cl
mov [t0+msac.buf], r5
sub r1d, ecx ; cnt + 64 - shift_bits
xor r4, r2
.end:
mov [t0+msac.cnt], r1d
mov [t0+msac.dif], r4
RET
.refill_eob: ; avoid overreading the input buffer
mov r5, rcx
mov ecx, 40
sub ecx, r1d ; c
.refill_eob_loop:
cmp r2, r5
jge .refill_eob_end ; eob reached
movzx r1d, byte [r2]
inc r2
shl r1, cl
xor r4, r1
sub ecx, 8
jge .refill_eob_loop
.refill_eob_end:
mov r1d, 40
sub r1d, ecx
mov [t0+msac.buf], r2
mov [t0+msac.dif], r4
mov [t0+msac.cnt], r1d
RET
; unsigned dav1d_msac_decode_symbol_adapt8(MsacContext *s, uint16_t *cdf,
;                                          size_t n_symbols)
; 8-symbol variant: same algorithm as adapt4 but operating on a full
; 128-bit vector (punpcklqdq broadcasts to both halves); the renorm and
; refill tails are shared with adapt4 via jumps.
cglobal msac_decode_symbol_adapt8, 3, 7, 6, s, cdf, ns
movd m2, [sq+msac.rng]
movu m1, [cdfq] ; cdf may be unaligned
lea rax, [pw_0xff00]
movq m3, [sq+msac.dif]
mov r3d, [sq+msac.update_cdf]
mov r4d, nsd
neg nsq
pshuflw m2, m2, q0000
movd [buf+12], m2
punpcklqdq m2, m2 ; broadcast rng to all 8 words
mova m0, m1
psrlw m1, 6
pand m2, [rax]
psllw m1, 7
pmulhuw m1, m2
movu m2, [rax+nsq*2] ; per-symbol minimum offsets
pshuflw m3, m3, q3333
paddw m1, m2
punpcklqdq m3, m3 ; broadcast dif top word
mova [buf+16], m1 ; spill candidate values
psubusw m1, m3
pxor m2, m2
pcmpeqw m1, m2
pmovmskb eax, m1
test r3d, r3d
jz m(msac_decode_symbol_adapt4).renorm ; no cdf adaptation
; update_cdf (same scheme as adapt4, on 8 words):
movzx r3d, word [cdfq+r4*2]
pcmpeqw m2, m2
mov r2d, r3d
shr r3d, 4
cmp r4d, 4 ; may be called with n_symbols < 4
sbb r3d, -5
cmp r2d, 32
adc r2d, 0
movd m3, r3d
pavgw m2, m1
psubw m2, m0
psubw m0, m1
psraw m2, m3
paddw m0, m2
movu [cdfq], m0
mov [cdfq+r4*2], r2w
jmp m(msac_decode_symbol_adapt4).renorm
; unsigned dav1d_msac_decode_symbol_adapt16(MsacContext *s, uint16_t *cdf,
;                                           size_t n_symbols)
; 16-symbol variant: processes the cdf in two 8-word vectors (m2/m3) and
; packs both compare results into one pmovmskb mask. Needs 36 bytes of
; scratch, so on WIN64 extra stack is reserved beyond the shadow space.
cglobal msac_decode_symbol_adapt16, 3, 7, 6, s, cdf, ns
movd m4, [sq+msac.rng]
movu m2, [cdfq]
lea rax, [pw_0xff00]
movu m3, [cdfq+16] ; upper 8 cdf entries
movq m5, [sq+msac.dif]
mov r3d, [sq+msac.update_cdf]
mov r4d, nsd
neg nsq
%if WIN64
sub rsp, 48 ; need 36 bytes, shadow space is only 32
%endif
pshuflw m4, m4, q0000
movd [buf-4], m4 ; spill rng below the candidate array
punpcklqdq m4, m4
mova m0, m2 ; keep unscaled cdf halves for the update
psrlw m2, 6
mova m1, m3
psrlw m3, 6
pand m4, [rax]
psllw m2, 7
psllw m3, 7
pmulhuw m2, m4
pmulhuw m3, m4
movu m4, [rax+nsq*2] ; minimum offsets for the low half
pshuflw m5, m5, q3333
paddw m2, m4
psubw m4, [rax-pw_0xff00+pw_32] ; offsets for the high half (shift by 16 symbols)
punpcklqdq m5, m5
paddw m3, m4
mova [buf], m2 ; spill all 16 candidate values
mova [buf+16], m3
psubusw m2, m5
psubusw m3, m5
pxor m4, m4
pcmpeqw m2, m4
pcmpeqw m3, m4
packsswb m5, m2, m3 ; merge both halves into one byte mask
pmovmskb eax, m5
test r3d, r3d
jz .renorm
; update_cdf on both halves:
movzx r3d, word [cdfq+r4*2]
pcmpeqw m4, m4
mova m5, m4
lea r2d, [r3+80] ; only support n_symbols >= 4
shr r2d, 4
cmp r3d, 32
adc r3d, 0
pavgw m4, m2
pavgw m5, m3
psubw m4, m0
psubw m0, m2
movd m2, r2d
psubw m5, m1
psubw m1, m3
psraw m4, m2
psraw m5, m2
paddw m0, m4
paddw m1, m5
movu [cdfq], m0
movu [cdfq+16], m1
mov [cdfq+r4*2], r3w
.renorm:
; Local renorm prologue (mask bytes map 1:1 to symbols here), then reuse
; the adapt4 tail; the WIN64 stack reservation is released first.
tzcnt eax, eax
mov r4, [sq+msac.dif]
movzx r1d, word [buf+rax*2] ; v
movzx r2d, word [buf+rax*2-2] ; u
%if WIN64
add rsp, 48
%endif
jmp m(msac_decode_symbol_adapt4).renorm2
%endif

Просмотреть файл

@ -32,22 +32,22 @@
#include "src/arm/32/util.S"
const register_init, align=3
.quad 0x21f86d66c8ca00ce
.quad 0x75b6ba21077c48ad
.quad 0xed56bb2dcb3c7736
.quad 0x8bda43d3fd1a7e06
.quad 0xb64a9c9e5d318408
.quad 0xdf9a54b303f1d3a3
.quad 0x4a75479abd64e097
.quad 0x249214109d5d1c88
.quad 0x21f86d66c8ca00ce
.quad 0x75b6ba21077c48ad
.quad 0xed56bb2dcb3c7736
.quad 0x8bda43d3fd1a7e06
.quad 0xb64a9c9e5d318408
.quad 0xdf9a54b303f1d3a3
.quad 0x4a75479abd64e097
.quad 0x249214109d5d1c88
endconst
const error_message_fpscr
.asciz "failed to preserve register FPSCR, changed bits: %x"
.asciz "failed to preserve register FPSCR, changed bits: %x"
error_message_gpr:
.asciz "failed to preserve register r%d"
.asciz "failed to preserve register r%d"
error_message_vfp:
.asciz "failed to preserve register d%d"
.asciz "failed to preserve register d%d"
endconst
@ max number of args used by any asm function.
@ -61,111 +61,111 @@ endconst
.macro clobbercheck variant
.equ pushed, 4*9
function checked_call_\variant, export=1
push {r4-r11, lr}
push {r4-r11, lr}
.ifc \variant, vfp
vpush {d8-d15}
fmrx r4, FPSCR
push {r4}
vpush {d8-d15}
fmrx r4, FPSCR
push {r4}
.equ pushed, pushed + 16*4 + 4
.endif
movrel r12, register_init
movrel r12, register_init
.ifc \variant, vfp
vldm r12, {d8-d15}
vldm r12, {d8-d15}
.endif
ldm r12, {r4-r11}
ldm r12, {r4-r11}
sub sp, sp, #ARG_STACK_A
sub sp, sp, #ARG_STACK_A
.equ pos, 0
.rept MAX_ARGS-4
ldr r12, [sp, #ARG_STACK_A + pushed + 8 + pos]
str r12, [sp, #pos]
ldr r12, [sp, #ARG_STACK_A + pushed + 8 + pos]
str r12, [sp, #pos]
.equ pos, pos + 4
.endr
mov r12, r0
mov r0, r2
mov r1, r3
ldrd r2, r3, [sp, #ARG_STACK_A + pushed]
blx r12
add sp, sp, #ARG_STACK_A
mov r12, r0
mov r0, r2
mov r1, r3
ldrd r2, r3, [sp, #ARG_STACK_A + pushed]
blx r12
add sp, sp, #ARG_STACK_A
push {r0, r1}
movrel r12, register_init
push {r0, r1}
movrel r12, register_init
.ifc \variant, vfp
.macro check_reg_vfp, dreg, offset
ldrd r2, r3, [r12, #8 * (\offset)]
vmov r0, lr, \dreg
eor r2, r2, r0
eor r3, r3, lr
orrs r2, r2, r3
bne 4f
ldrd r2, r3, [r12, #8 * (\offset)]
vmov r0, lr, \dreg
eor r2, r2, r0
eor r3, r3, lr
orrs r2, r2, r3
bne 4f
.endm
.irp n, 8, 9, 10, 11, 12, 13, 14, 15
@ keep track of the checked double/SIMD register
mov r1, #\n
check_reg_vfp d\n, \n-8
@ keep track of the checked double/SIMD register
mov r1, #\n
check_reg_vfp d\n, \n-8
.endr
.purgem check_reg_vfp
fmrx r1, FPSCR
ldr r3, [sp, #8]
eor r1, r1, r3
@ Ignore changes in bits 0-4 and 7
bic r1, r1, #0x9f
@ Ignore changes in the topmost 5 bits
bics r1, r1, #0xf8000000
bne 3f
fmrx r1, FPSCR
ldr r3, [sp, #8]
eor r1, r1, r3
@ Ignore changes in bits 0-4 and 7
bic r1, r1, #0x9f
@ Ignore changes in the topmost 5 bits
bics r1, r1, #0xf8000000
bne 3f
.endif
@ keep track of the checked GPR
mov r1, #4
@ keep track of the checked GPR
mov r1, #4
.macro check_reg reg1, reg2=
ldrd r2, r3, [r12], #8
eors r2, r2, \reg1
bne 2f
add r1, r1, #1
ldrd r2, r3, [r12], #8
eors r2, r2, \reg1
bne 2f
add r1, r1, #1
.ifnb \reg2
eors r3, r3, \reg2
bne 2f
eors r3, r3, \reg2
bne 2f
.endif
add r1, r1, #1
add r1, r1, #1
.endm
check_reg r4, r5
check_reg r6, r7
check_reg r4, r5
check_reg r6, r7
@ r9 is a volatile register in the ios ABI
#ifdef __APPLE__
check_reg r8
check_reg r8
#else
check_reg r8, r9
check_reg r8, r9
#endif
check_reg r10, r11
check_reg r10, r11
.purgem check_reg
b 0f
b 0f
4:
movrel r0, error_message_vfp
b 1f
movrel r0, error_message_vfp
b 1f
3:
movrel r0, error_message_fpscr
b 1f
movrel r0, error_message_fpscr
b 1f
2:
movrel r0, error_message_gpr
movrel r0, error_message_gpr
1:
#ifdef PREFIX
blx _checkasm_fail_func
blx _checkasm_fail_func
#else
blx checkasm_fail_func
blx checkasm_fail_func
#endif
0:
pop {r0, r1}
pop {r0, r1}
.ifc \variant, vfp
pop {r2}
fmxr FPSCR, r2
vpop {d8-d15}
pop {r2}
fmxr FPSCR, r2
vpop {d8-d15}
.endif
pop {r4-r11, pc}
pop {r4-r11, pc}
endfunc
.endm

Просмотреть файл

@ -32,29 +32,29 @@
#include "src/arm/64/util.S"
const register_init, align=4
.quad 0x21f86d66c8ca00ce
.quad 0x75b6ba21077c48ad
.quad 0xed56bb2dcb3c7736
.quad 0x8bda43d3fd1a7e06
.quad 0xb64a9c9e5d318408
.quad 0xdf9a54b303f1d3a3
.quad 0x4a75479abd64e097
.quad 0x249214109d5d1c88
.quad 0x1a1b2550a612b48c
.quad 0x79445c159ce79064
.quad 0x2eed899d5a28ddcd
.quad 0x86b2536fcd8cf636
.quad 0xb0856806085e7943
.quad 0x3f2bf84fc0fcca4e
.quad 0xacbd382dcf5b8de2
.quad 0xd229e1f5b281303f
.quad 0x71aeaff20b095fd9
.quad 0xab63e2e11fa38ed9
.quad 0x21f86d66c8ca00ce
.quad 0x75b6ba21077c48ad
.quad 0xed56bb2dcb3c7736
.quad 0x8bda43d3fd1a7e06
.quad 0xb64a9c9e5d318408
.quad 0xdf9a54b303f1d3a3
.quad 0x4a75479abd64e097
.quad 0x249214109d5d1c88
.quad 0x1a1b2550a612b48c
.quad 0x79445c159ce79064
.quad 0x2eed899d5a28ddcd
.quad 0x86b2536fcd8cf636
.quad 0xb0856806085e7943
.quad 0x3f2bf84fc0fcca4e
.quad 0xacbd382dcf5b8de2
.quad 0xd229e1f5b281303f
.quad 0x71aeaff20b095fd9
.quad 0xab63e2e11fa38ed9
endconst
const error_message
.asciz "failed to preserve register"
.asciz "failed to preserve register"
endconst
@ -64,107 +64,107 @@ endconst
#define CLOBBER_STACK ((8*MAX_ARGS + 15) & ~15)
function stack_clobber, export=1
mov x3, sp
mov x2, #CLOBBER_STACK
mov x3, sp
mov x2, #CLOBBER_STACK
1:
stp x0, x1, [sp, #-16]!
subs x2, x2, #16
b.gt 1b
mov sp, x3
ret
stp x0, x1, [sp, #-16]!
subs x2, x2, #16
b.gt 1b
mov sp, x3
ret
endfunc
#define ARG_STACK ((8*(MAX_ARGS - 8) + 15) & ~15)
function checked_call, export=1
stp x29, x30, [sp, #-16]!
mov x29, sp
stp x19, x20, [sp, #-16]!
stp x21, x22, [sp, #-16]!
stp x23, x24, [sp, #-16]!
stp x25, x26, [sp, #-16]!
stp x27, x28, [sp, #-16]!
stp d8, d9, [sp, #-16]!
stp d10, d11, [sp, #-16]!
stp d12, d13, [sp, #-16]!
stp d14, d15, [sp, #-16]!
stp x29, x30, [sp, #-16]!
mov x29, sp
stp x19, x20, [sp, #-16]!
stp x21, x22, [sp, #-16]!
stp x23, x24, [sp, #-16]!
stp x25, x26, [sp, #-16]!
stp x27, x28, [sp, #-16]!
stp d8, d9, [sp, #-16]!
stp d10, d11, [sp, #-16]!
stp d12, d13, [sp, #-16]!
stp d14, d15, [sp, #-16]!
movrel x9, register_init
ldp d8, d9, [x9], #16
ldp d10, d11, [x9], #16
ldp d12, d13, [x9], #16
ldp d14, d15, [x9], #16
ldp x19, x20, [x9], #16
ldp x21, x22, [x9], #16
ldp x23, x24, [x9], #16
ldp x25, x26, [x9], #16
ldp x27, x28, [x9], #16
movrel x9, register_init
ldp d8, d9, [x9], #16
ldp d10, d11, [x9], #16
ldp d12, d13, [x9], #16
ldp d14, d15, [x9], #16
ldp x19, x20, [x9], #16
ldp x21, x22, [x9], #16
ldp x23, x24, [x9], #16
ldp x25, x26, [x9], #16
ldp x27, x28, [x9], #16
sub sp, sp, #ARG_STACK
sub sp, sp, #ARG_STACK
.equ pos, 0
.rept MAX_ARGS-8
// Skip the first 8 args, that are loaded into registers
ldr x9, [x29, #16 + 8*8 + pos]
str x9, [sp, #pos]
// Skip the first 8 args, that are loaded into registers
ldr x9, [x29, #16 + 8*8 + pos]
str x9, [sp, #pos]
.equ pos, pos + 8
.endr
mov x12, x0
ldp x0, x1, [x29, #16]
ldp x2, x3, [x29, #32]
ldp x4, x5, [x29, #48]
ldp x6, x7, [x29, #64]
blr x12
add sp, sp, #ARG_STACK
stp x0, x1, [sp, #-16]!
movrel x9, register_init
movi v3.8h, #0
mov x12, x0
ldp x0, x1, [x29, #16]
ldp x2, x3, [x29, #32]
ldp x4, x5, [x29, #48]
ldp x6, x7, [x29, #64]
blr x12
add sp, sp, #ARG_STACK
stp x0, x1, [sp, #-16]!
movrel x9, register_init
movi v3.8h, #0
.macro check_reg_neon reg1, reg2
ldr q0, [x9], #16
uzp1 v1.2d, v\reg1\().2d, v\reg2\().2d
eor v0.16b, v0.16b, v1.16b
orr v3.16b, v3.16b, v0.16b
ldr q0, [x9], #16
uzp1 v1.2d, v\reg1\().2d, v\reg2\().2d
eor v0.16b, v0.16b, v1.16b
orr v3.16b, v3.16b, v0.16b
.endm
check_reg_neon 8, 9
check_reg_neon 10, 11
check_reg_neon 12, 13
check_reg_neon 14, 15
uqxtn v3.8b, v3.8h
umov x3, v3.d[0]
check_reg_neon 8, 9
check_reg_neon 10, 11
check_reg_neon 12, 13
check_reg_neon 14, 15
uqxtn v3.8b, v3.8h
umov x3, v3.d[0]
.macro check_reg reg1, reg2
ldp x0, x1, [x9], #16
eor x0, x0, \reg1
eor x1, x1, \reg2
orr x3, x3, x0
orr x3, x3, x1
ldp x0, x1, [x9], #16
eor x0, x0, \reg1
eor x1, x1, \reg2
orr x3, x3, x0
orr x3, x3, x1
.endm
check_reg x19, x20
check_reg x21, x22
check_reg x23, x24
check_reg x25, x26
check_reg x27, x28
check_reg x19, x20
check_reg x21, x22
check_reg x23, x24
check_reg x25, x26
check_reg x27, x28
cbz x3, 0f
cbz x3, 0f
movrel x0, error_message
movrel x0, error_message
#ifdef PREFIX
bl _checkasm_fail_func
bl _checkasm_fail_func
#else
bl checkasm_fail_func
bl checkasm_fail_func
#endif
0:
ldp x0, x1, [sp], #16
ldp d14, d15, [sp], #16
ldp d12, d13, [sp], #16
ldp d10, d11, [sp], #16
ldp d8, d9, [sp], #16
ldp x27, x28, [sp], #16
ldp x25, x26, [sp], #16
ldp x23, x24, [sp], #16
ldp x21, x22, [sp], #16
ldp x19, x20, [sp], #16
ldp x29, x30, [sp], #16
ret
ldp x0, x1, [sp], #16
ldp d14, d15, [sp], #16
ldp d12, d13, [sp], #16
ldp d10, d11, [sp], #16
ldp d8, d9, [sp], #16
ldp x27, x28, [sp], #16
ldp x25, x26, [sp], #16
ldp x23, x24, [sp], #16
ldp x21, x22, [sp], #16
ldp x19, x20, [sp], #16
ldp x29, x30, [sp], #16
ret
endfunc

1
third_party/dav1d/tests/checkasm/checkasm.c поставляемый
Просмотреть файл

@ -62,6 +62,7 @@ static const struct {
const char *name;
void (*func)(void);
} tests[] = {
{ "msac", checkasm_check_msac },
#if CONFIG_8BPC
{ "cdef_8bpc", checkasm_check_cdef_8bpc },
{ "ipred_8bpc", checkasm_check_ipred_8bpc },

1
third_party/dav1d/tests/checkasm/checkasm.h поставляемый
Просмотреть файл

@ -57,6 +57,7 @@ int xor128_rand(void);
name##_8bpc(void); \
name##_16bpc(void)
void checkasm_check_msac(void);
decl_check_bitfns(void checkasm_check_cdef);
decl_check_bitfns(void checkasm_check_ipred);
decl_check_bitfns(void checkasm_check_itx);

115
third_party/dav1d/tests/checkasm/msac.c поставляемый Normal file
Просмотреть файл

@ -0,0 +1,115 @@
/*
* Copyright © 2019, VideoLAN and dav1d authors
* Copyright © 2019, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "tests/checkasm/checkasm.h"
#include "src/cpu.h"
#include "src/msac.h"
#include <string.h>
/* The normal code doesn't use function pointers */
/* Signature shared by all decode_symbol_adapt implementations: decode one
 * symbol from *s using (and possibly adapting) the cdf table. */
typedef unsigned (*decode_symbol_adapt_fn)(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
/* Per-width function table so checkasm can test C vs. SIMD versions. */
typedef struct {
decode_symbol_adapt_fn symbol_adapt4;
decode_symbol_adapt_fn symbol_adapt8;
decode_symbol_adapt_fn symbol_adapt16;
} MsacDSPContext;
/* Fill cdf[0..16] with a random but valid CDF for n symbols.
 * Entries past the terminator get random garbage so the code under test is
 * shown not to depend on padding; cdf[n-1] is zeroed as the terminator and
 * cdf[n] is zeroed too (presumably the adaptation-count slot — confirm
 * against src/msac.h). The remaining entries form a strictly decreasing
 * sequence below 32768. rnd() call order matches the reference layout. */
static void randomize_cdf(uint16_t *const cdf, int n) {
    for (int i = 16; i > n; i--)
        cdf[i] = rnd(); /* randomize padding */
    cdf[n] = 0;
    cdf[n - 1] = 0;
    for (int i = n - 1; i > 0; i--)
        cdf[i - 1] = cdf[i] + rnd() % (32768 - cdf[i] - i) + 1;
}
/* Compare two MsacContexts field by field; memcmp() on the structs could
 * read uninitialized padding bytes (see comment in the original), so each
 * member is checked explicitly. Returns non-zero on any mismatch. */
static int msac_cmp(const MsacContext *const a, const MsacContext *const b) {
    if (a->buf_pos != b->buf_pos) return 1;
    if (a->buf_end != b->buf_end) return 1;
    if (a->dif != b->dif) return 1;
    if (a->rng != b->rng) return 1;
    if (a->cnt != b->cnt) return 1;
    return a->allow_update_cdf != b->allow_update_cdf;
}
/* Check one decode_symbol_adapt entry point against the C reference.
 * For each cdf-update mode and each symbol count in [n_min, n_max], the
 * reference and tested implementations decode 64 symbols from identical
 * starting state; any divergence in return value, decoder state (msac_cmp)
 * or CDF contents (memcmp over ns+1 entries, including the count slot) is
 * a failure. Only the full-width case is benchmarked.
 * NOTE: expands into check_decode_symbol_adapt() and references its locals
 * s_c, s_a, cdf and buf by name. */
#define CHECK_SYMBOL_ADAPT(n, n_min, n_max) do { \
if (check_func(c->symbol_adapt##n, "msac_decode_symbol_adapt%d", n)) { \
for (int cdf_update = 0; cdf_update <= 1; cdf_update++) { \
for (int ns = n_min; ns <= n_max; ns++) { \
dav1d_msac_init(&s_c, buf, sizeof(buf), !cdf_update); \
s_a = s_c; \
randomize_cdf(cdf[0], ns); \
memcpy(cdf[1], cdf[0], sizeof(*cdf)); \
for (int i = 0; i < 64; i++) { \
unsigned c_res = call_ref(&s_c, cdf[0], ns); \
unsigned a_res = call_new(&s_a, cdf[1], ns); \
if (c_res != a_res || msac_cmp(&s_c, &s_a) || \
memcmp(cdf[0], cdf[1], sizeof(**cdf) * (ns + 1))) \
{ \
fail(); \
} \
} \
if (cdf_update && ns == n) \
bench_new(&s_a, cdf[0], n); \
} \
} \
} \
} while (0)
/* Run CHECK_SYMBOL_ADAPT for every implemented width. The macro expands
 * references to the locals declared here (s_c, s_a, cdf, buf and the state
 * set up by declare_func), so these names must not change. */
static void check_decode_symbol_adapt(MsacDSPContext *const c) {
/* Use an aligned CDF buffer for more consistent benchmark
* results, and a misaligned one for checking correctness. */
ALIGN_STK_16(uint16_t, cdf, 2, [17]);
MsacContext s_c, s_a;
/* random bitstream input shared by reference and tested decoder */
uint8_t buf[1024];
for (int i = 0; i < 1024; i++)
buf[i] = rnd();
declare_func(unsigned, MsacContext *s, uint16_t *cdf, size_t n_symbols);
CHECK_SYMBOL_ADAPT( 4, 1, 5);
CHECK_SYMBOL_ADAPT( 8, 1, 8);
CHECK_SYMBOL_ADAPT(16, 4, 16);
report("decode_symbol_adapt");
}
void checkasm_check_msac(void) {
MsacDSPContext c;
c.symbol_adapt4 = dav1d_msac_decode_symbol_adapt_c;
c.symbol_adapt8 = dav1d_msac_decode_symbol_adapt_c;
c.symbol_adapt16 = dav1d_msac_decode_symbol_adapt_c;
#if ARCH_X86_64 && HAVE_ASM
if (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_SSE2) {
c.symbol_adapt4 = dav1d_msac_decode_symbol_adapt4_sse2;
c.symbol_adapt8 = dav1d_msac_decode_symbol_adapt8_sse2;
c.symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2;
}
#endif
check_decode_symbol_adapt(&c);
}

5
third_party/dav1d/tests/meson.build поставляемый
Просмотреть файл

@ -34,7 +34,10 @@ endif
libdav1d_nasm_objs_if_needed = []
if is_asm_enabled
checkasm_sources = files('checkasm/checkasm.c')
checkasm_sources = files(
'checkasm/checkasm.c',
'checkasm/msac.c',
)
checkasm_tmpl_sources = files(
'checkasm/cdef.c',