Bug 1754070: Update libdav1d to 1f09a9119fb794ab41b1e527d848c2a210ca43d4 r=jbauman

Differential Revision: https://phabricator.services.mozilla.com/D138068
Tom Ritter 2022-02-13 02:58:26 +00:00
Parent 9272d30533
Commit 0c73045ce8
39 changed files: 6666 additions and 2059 deletions


@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: commit b562b7f648e26e64fae892495527b5b275d53183 (2022-01-10T14:49:11.000+00:00).
release: commit 1f09a9119fb794ab41b1e527d848c2a210ca43d4 (2022-02-04T23:02:17.000-03:00).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: b562b7f648e26e64fae892495527b5b275d53183
revision: 1f09a9119fb794ab41b1e527d848c2a210ca43d4
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/
@ -65,3 +65,4 @@ vendoring:
file: '{yaml_dir}/vcs_version.h'


@ -1,2 +1,2 @@
/* auto-generated, do not edit */
#define DAV1D_VERSION "b562b7f648e26e64fae892495527b5b275d53183"
#define DAV1D_VERSION "1f09a9119fb794ab41b1e527d848c2a210ca43d4"

third_party/dav1d/include/dav1d/dav1d.h (30 changed lines, vendored)

@ -58,23 +58,35 @@ typedef struct Dav1dLogger {
void (*callback)(void *cookie, const char *format, va_list ap);
} Dav1dLogger;
enum Dav1dInloopFilterType {
DAV1D_INLOOPFILTER_NONE = 0,
DAV1D_INLOOPFILTER_DEBLOCK = 1 << 0,
DAV1D_INLOOPFILTER_CDEF = 1 << 1,
DAV1D_INLOOPFILTER_RESTORATION = 1 << 2,
DAV1D_INLOOPFILTER_ALL = DAV1D_INLOOPFILTER_DEBLOCK |
DAV1D_INLOOPFILTER_CDEF |
DAV1D_INLOOPFILTER_RESTORATION,
};
typedef struct Dav1dSettings {
int n_threads; ///< number of threads (0 = auto)
int max_frame_delay; ///< Set to 1 for low-latency decoding (0 = auto)
int apply_grain;
int operating_point; ///< select an operating point for scalable AV1 bitstreams (0 - 31)
int all_layers; ///< output all spatial layers of a scalable AV1 bitstream
unsigned frame_size_limit; ///< maximum frame size, in pixels (0 = unlimited)
int n_threads; ///< number of threads (0 = number of logical cores in host system, default 0)
int max_frame_delay; ///< Set to 1 for low-latency decoding (0 = ceil(sqrt(n_threads)), default 0)
int apply_grain; ///< whether to apply film grain on output frames (default 1)
int operating_point; ///< select an operating point for scalable AV1 bitstreams (0 - 31, default 0)
int all_layers; ///< output all spatial layers of a scalable AV1 bitstream (default 1)
unsigned frame_size_limit; ///< maximum frame size, in pixels (0 = unlimited, default 0)
Dav1dPicAllocator allocator; ///< Picture allocator callback.
Dav1dLogger logger; ///< Logger callback.
int strict_std_compliance; ///< whether to abort decoding on standard compliance violations
///< that don't affect actual bitstream decoding (e.g. inconsistent
///< or invalid metadata)
///< or invalid metadata, default 0)
int output_invisible_frames; ///< output invisibly coded frames (in coding order) in addition
///< to all visible frames. Because of show-existing-frame, this
///< means some frames may appear twice (once when coded,
///< once when shown)
uint8_t reserved[24]; ///< reserved for future use
///< once when shown, default 0)
enum Dav1dInloopFilterType inloop_filters; ///< postfilters to enable during decoding (default
///< DAV1D_INLOOPFILTER_ALL)
uint8_t reserved[20]; ///< reserved for future use
} Dav1dSettings;
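The new inloop_filters field lets embedders skip individual post-filters: dav1d_default_settings() (see the src/lib.c hunk below) initializes it to DAV1D_INLOOPFILTER_ALL, and src/recon_tmpl.c consults it before each filter stage. A minimal sketch of a caller combining the flags, assuming only the public API shown here (open_fast_decoder is a hypothetical helper, not part of the library):

#include <dav1d/dav1d.h>

/* Sketch: open a decoder that skips film grain and loop restoration,
 * e.g. for a fast preview path. */
static int open_fast_decoder(Dav1dContext **c) {
    Dav1dSettings s;
    dav1d_default_settings(&s); /* inloop_filters = DAV1D_INLOOPFILTER_ALL */
    s.apply_grain = 0;
    s.inloop_filters = DAV1D_INLOOPFILTER_DEBLOCK | DAV1D_INLOOPFILTER_CDEF;
    return dav1d_open(c, &s);   /* 0 on success, negative DAV1D_ERR on error */
}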
/**

third_party/dav1d/meson.build (6 changed lines, vendored)

@ -30,7 +30,7 @@ project('dav1d', ['c'],
'b_ndebug=if-release'],
meson_version: '>= 0.49.0')
dav1d_soname_version = '6.3.0'
dav1d_soname_version = '6.4.0'
dav1d_api_version_array = dav1d_soname_version.split('.')
dav1d_api_version_major = dav1d_api_version_array[0]
dav1d_api_version_minor = dav1d_api_version_array[1]
@ -109,10 +109,6 @@ if host_machine.system() == 'windows'
cdata.set('ftello', '_ftelli64')
endif
if (host_machine.cpu_family() == 'x86_64' and cc.get_id() == 'gcc')
optional_arguments += '-mcmodel=small'
endif
# On Windows, we use a compatibility layer to emulate pthread
thread_dependency = []
thread_compat_dep = declare_dependency(sources : files('src/win32/thread.c'))

third_party/dav1d/src/arm/32/mc.S (99 changed lines, vendored)

@ -1146,6 +1146,16 @@ endfunc
vmla.s16 \d, \s2, d0[2]
vmla.s16 \d, \s3, d0[3]
.endm
.macro mul_mla_8_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
vmul.s16 \d0, \s0, d0[0]
vmla.s16 \d0, \s1, d0[1]
vmla.s16 \d0, \s2, d0[2]
vmla.s16 \d0, \s3, d0[3]
vmla.s16 \d0, \s4, d1[0]
vmla.s16 \d0, \s5, d1[1]
vmla.s16 \d0, \s6, d1[2]
vmla.s16 \d0, \s7, d1[3]
.endm
.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
vmul.s16 \d0, \s0, d0[0]
vmla.s16 \d0, \s1, d0[1]
@ -1182,24 +1192,6 @@ endfunc
vmla.s16 \d1, \s8, d1[2]
vmla.s16 \d1, \s9, d1[3]
.endm
.macro mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11
vmul.s16 \d0, \s0, d0[0]
vmla.s16 \d0, \s1, d0[1]
vmla.s16 \d0, \s2, d0[2]
vmla.s16 \d0, \s3, d0[3]
vmla.s16 \d0, \s4, d1[0]
vmla.s16 \d0, \s5, d1[1]
vmla.s16 \d0, \s6, d1[2]
vmla.s16 \d0, \s7, d1[3]
vmul.s16 \d1, \s4, d0[0]
vmla.s16 \d1, \s5, d0[1]
vmla.s16 \d1, \s6, d0[2]
vmla.s16 \d1, \s7, d0[3]
vmla.s16 \d1, \s8, d1[0]
vmla.s16 \d1, \s9, d1[1]
vmla.s16 \d1, \s10, d1[2]
vmla.s16 \d1, \s11, d1[3]
.endm
.macro vqrshrun_s16 shift, q0, d0, q1, d1, q2, d2, q3, d3
vqrshrun.s16 \d0, \q0, #\shift
.ifnb \q1
@ -1623,7 +1615,7 @@ L(\type\()_8tap_v_tbl):
st_16 \d_strd, d6, 4
pop {r4-r11,pc}
28: // 2x8, 2x16 v
28: // 2x6, 2x8, 2x12, 2x16 v
vpush {q4-q7}
vld1.8 {d0}, [\my, :64]
sub \sr2, \src, \s_strd, lsl #1
@ -1642,34 +1634,37 @@ L(\type\()_8tap_v_tbl):
vmov d7, d10
vmov d9, d12
216:
subs \h, \h, #8
subs \h, \h, #4
load_16 \sr2, \src, \s_strd, d16, d18, d20, d22
load_16 \sr2, \src, \s_strd, d24, d26, d28, d30
interleave_1_16 d14, d16, d18, d20, d22
interleave_1_16 d22, d24, d26, d28, d30
vmovl_u8 q7, d14, q8, d16, q9, d18, q10, d20
vmovl_u8 q11, d22, q12, d24, q13, d26, q14, d28
vmov d11, d14
vmov d13, d16
vmov d15, d18
vmov d17, d20
vmov d19, d22
vmov d21, d24
vmov d23, d26
vmov d25, d28
mul_mla_8_4 q1, q2, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12
vqrshrun_s16 6, q1, d2, q2, d4
mul_mla_8_0 q1, q1, q2, q3, q4, q5, q6, q7, q8
vqrshrun_s16 6, q1, d2
st_16 \d_strd, d2, 4
st_16 \d_strd, d4, 4
ble 0f
vmov q1, q9
vmov q2, q10
vmov q3, q11
vmov q4, q12
vmov q5, q13
vmov q6, q14
vmov d14, d30
cmp \h, #2
vmov q1, q5
vmov q2, q6
vmov q3, q7
vmov q4, q8
vmov q5, q9
vmov q6, q10
vmov d14, d22
beq 26f
b 216b
26:
load_16 \sr2, \src, \s_strd, d16, d18
interleave_1_16 d14, d16, d18
vmovl_u8 q7, d14, q8, d16
vmov d11, d14
vmov d13, d16
mul_mla_8_0 d2, d2, d4, d6, d8, d10, d12, d14, d16
vqrshrun_s16 6, q1, d2
st_16 \d_strd, d2, 2
0:
vpop {q4-q7}
pop {r4-r11,pc}
@ -1703,7 +1698,7 @@ L(\type\()_8tap_v_tbl):
0:
pop {r4-r11,pc}
480: // 4x8, 4x16 v
480: // 4x6, 4x8, 4x12, 4x16 v
vpush {q4}
vld1.8 {d0}, [\my, :64]
sub \sr2, \src, \s_strd, lsl #1
@ -1726,12 +1721,19 @@ L(\type\()_8tap_v_tbl):
mul_mla_8_2 q1, q2, q1, q2, q3, q4, q8, q9, q10, q11, q12, q13
shift_store_4 \type, \d_strd, q1, d2, d3, q2, d4, d5
ble 0f
subs \h, \h, #4
load_32 \sr2, \src, \s_strd, d30, d2, d4, d6
interleave_1_32 d28, d30, d2, d4, d6
vmovl_u8 q14, d28, q15, d30, q1, d2, q2, d4
mul_mla_8_2 q8, q9, q8, q9, q10, q11, q12, q13, q14, q15, q1, q2
shift_store_4 \type, \d_strd, q8, d16, d17, q9, d18, d19
load_32 \sr2, \src, \s_strd, d30, d2
subs \h, \h, #2
interleave_1_32 d28, d30, d2
vmovl_u8 q14, d28, q15, d30
mul_mla_8_0 q8, q8, q9, q10, q11, q12, q13, q14, q15
shift_store_4 \type, \d_strd, q8, d16, d17
ble 0f
load_32 \sr2, \src, \s_strd, d4, d6
subs \h, \h, #2
interleave_1_32 d2, d4, d6
vmovl_u8 q1, d2, q2, d4
mul_mla_8_0 q9, q10, q11, q12, q13, q14, q15, q1, q2
shift_store_4 \type, \d_strd, q9, d18, d19
ble 0f
subs \h, \h, #4
load_32 \sr2, \src, \s_strd, d8, d16, d18, d20
@ -2643,6 +2645,7 @@ L(\type\()_bilin_v_tbl):
// 2x2 v
vld1.16 {d16[]}, [\src], \s_strd
bgt 24f
22:
vld1.16 {d17[]}, [\sr2], \s_strd
vld1.16 {d18[]}, [\src], \s_strd
vext.8 d16, d16, d17, #6
@ -2653,11 +2656,12 @@ L(\type\()_bilin_v_tbl):
vst1.16 {d4[0]}, [\dst, :16]
vst1.16 {d4[1]}, [\ds2, :16]
pop {r4-r11,pc}
24: // 2x4, 2x8, ... v
24: // 2x4, 2x6, 2x8, ... v
vld1.16 {d17[]}, [\sr2], \s_strd
vld1.16 {d18[]}, [\src], \s_strd
vld1.16 {d19[]}, [\sr2], \s_strd
vld1.16 {d20[]}, [\src], \s_strd
sub \h, \h, #4
vext.8 d16, d16, d17, #6
vext.8 d17, d17, d18, #6
vext.8 d18, d18, d19, #6
@ -2666,14 +2670,15 @@ L(\type\()_bilin_v_tbl):
vtrn.32 d17, d19
vmull.u8 q2, d16, d2
vmlal.u8 q2, d17, d3
subs \h, \h, #4
cmp \h, #2
vqrshrn.u16 d4, q2, #4
vst1.16 {d4[0]}, [\dst, :16], \d_strd
vst1.16 {d4[1]}, [\ds2, :16], \d_strd
vst1.16 {d4[2]}, [\dst, :16], \d_strd
vst1.16 {d4[3]}, [\ds2, :16], \d_strd
ble 0f
blt 0f
vmov d16, d20
beq 22b
b 24b
0:
pop {r4-r11,pc}

third_party/dav1d/src/arm/32/mc16.S (48 changed lines, vendored)

@ -1748,7 +1748,7 @@ L(\type\()_8tap_v_tbl):
vst1_32 \d_strd, d16, d17
pop {r4-r11,pc}
28: // 2x8, 2x16 v
28: // 2x6, 2x8, 2x12, 2x16 v
vld1.8 {d0}, [\my, :64]
sub \sr2, \src, \s_strd, lsl #1
add \ds2, \dst, \d_strd
@ -1761,25 +1761,29 @@ L(\type\()_8tap_v_tbl):
interleave_1_32 d2, d3, d4, d5, d6
interleave_1_32 d6, d7, d16
216:
subs \h, \h, #8
subs \h, \h, #4
load_32 \sr2, \src, \s_strd, d17, d18, d19, d20
load_32 \sr2, \src, \s_strd, d21, d22, d23, d24
interleave_1_32 d16, d17, d18, d19, d20
interleave_1_32 d20, d21, d22, d23, d24
vmull_vmlal_8 q13, d2, d3, d4, d5, d6, d7, d16, d17
vmull_vmlal_8 q1, d4, d5, d6, d7, d16, d17, d18, d19
vmull_vmlal_8 q2, d6, d7, d16, d17, d18, d19, d20, d21
vmull_vmlal_8 q3, d16, d17, d18, d19, d20, d21, d22, d23
vqrshrun_s32 6, q13, d26, q1, d27, q2, d2, q3, d3
vmin_u16 q15, q13, q1
vqrshrun_s32 6, q13, d26, q1, d27
vmin_u16 q15, q13
vst1_32 \d_strd, d26, d27
vst1_32 \d_strd, d2, d3
ble 0f
vmov q1, q9
vmov q2, q10
vmov q3, q11
vmov d16, d24
cmp \h, #2
vmov q1, q3
vmov q2, q8
vmov q3, q9
vmov d16, d20
beq 26f
b 216b
26:
load_32 \sr2, \src, \s_strd, d17, d18
interleave_1_32 d16, d17, d18
vmull_vmlal_8 q13, d2, d3, d4, d5, d6, d7, d16, d17
vqrshrun_s32 6, q13, d26
vmin_u16 d30, d26
vst1_32 \d_strd, d26
0:
pop {r4-r11,pc}
.endif
@ -1810,7 +1814,7 @@ L(\type\()_8tap_v_tbl):
0:
pop {r4-r11,pc}
480: // 4x8, 4x16 v
480: // 4x6, 4x8, 4x12, 4x16 v
vld1.8 {d0}, [\my, :64]
sub \sr2, \src, \s_strd, lsl #1
add \ds2, \dst, \d_strd
@ -1830,11 +1834,18 @@ L(\type\()_8tap_v_tbl):
vmull_vmlal_8 q8, d19, d20, d21, d22, d23, d24, d25, d26
shift_store_4 \type, \d_strd, q1, q2, d2, d3, q3, q8, d4, d5
ble 0f
cmp \h, #2
vmov q8, q10
vmov q9, q11
vmov q10, q12
vmov d22, d26
beq 46f
b 48b
46:
load_reg \sr2, \src, \s_strd, d23, d24
vmull_vmlal_8 q1, d16, d17, d18, d19, d20, d21, d22, d23
vmull_vmlal_8 q2, d17, d18, d19, d20, d21, d22, d23, d24
shift_store_4 \type, \d_strd, q1, q2, d2, d3
0:
pop {r4-r11,pc}
@ -2801,6 +2812,7 @@ L(\type\()_bilin_v_tbl):
// 2x2 v
vld1.32 {d16[]}, [\src], \s_strd
bgt 24f
22:
vld1.32 {d17[]}, [\sr2], \s_strd
vld1.32 {d18[]}, [\src], \s_strd
vext.8 d16, d16, d17, #4
@ -2811,11 +2823,12 @@ L(\type\()_bilin_v_tbl):
vst1.32 {d16[0]}, [\dst, :32]
vst1.32 {d16[1]}, [\ds2, :32]
pop {r4-r11,pc}
24: // 2x4, 2x8, ... v
24: // 2x4, 2x6, 2x8, ... v
vld1.32 {d17[]}, [\sr2], \s_strd
vld1.32 {d18[]}, [\src], \s_strd
vld1.32 {d19[]}, [\sr2], \s_strd
vld1.32 {d20[]}, [\src], \s_strd
subs \h, \h, #4
vext.8 d16, d16, d17, #4
vext.8 d17, d17, d18, #4
vext.8 d18, d18, d19, #4
@ -2823,14 +2836,15 @@ L(\type\()_bilin_v_tbl):
vswp d17, d18
vmul.i16 q8, q8, q2
vmla.i16 q8, q9, q3
subs \h, \h, #4
cmp \h, #2
vrshr.u16 q8, q8, #4
vst1.32 {d16[0]}, [\dst, :32], \d_strd
vst1.32 {d16[1]}, [\ds2, :32], \d_strd
vst1.32 {d17[0]}, [\dst, :32], \d_strd
vst1.32 {d17[1]}, [\ds2, :32], \d_strd
ble 0f
blt 0f
vmov d16, d20
beq 22b
b 24b
0:
pop {r4-r11,pc}

third_party/dav1d/src/arm/64/mc.S (104 changed lines, vendored)

@ -1163,6 +1163,26 @@ endfunc
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
.macro mul_mla_8_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7
mul \d0\().4h, \s0\().4h, v0.h[0]
mla \d0\().4h, \s1\().4h, v0.h[1]
mla \d0\().4h, \s2\().4h, v0.h[2]
mla \d0\().4h, \s3\().4h, v0.h[3]
mla \d0\().4h, \s4\().4h, v0.h[4]
mla \d0\().4h, \s5\().4h, v0.h[5]
mla \d0\().4h, \s6\().4h, v0.h[6]
mla \d0\().4h, \s7\().4h, v0.h[7]
.endm
.macro mul_mla_8_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
mul \d0\().8h, \s0\().8h, v0.h[0]
mla \d0\().8h, \s1\().8h, v0.h[1]
mla \d0\().8h, \s2\().8h, v0.h[2]
mla \d0\().8h, \s3\().8h, v0.h[3]
mla \d0\().8h, \s4\().8h, v0.h[4]
mla \d0\().8h, \s5\().8h, v0.h[5]
mla \d0\().8h, \s6\().8h, v0.h[6]
mla \d0\().8h, \s7\().8h, v0.h[7]
.endm
.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
mul \d0\().8h, \s0\().8h, v0.h[0]
mla \d0\().8h, \s1\().8h, v0.h[1]
@ -1199,24 +1219,6 @@ endfunc
mla \d1\().8h, \s8\().8h, v0.h[6]
mla \d1\().8h, \s9\().8h, v0.h[7]
.endm
.macro mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11
mul \d0\().8h, \s0\().8h, v0.h[0]
mla \d0\().8h, \s1\().8h, v0.h[1]
mla \d0\().8h, \s2\().8h, v0.h[2]
mla \d0\().8h, \s3\().8h, v0.h[3]
mla \d0\().8h, \s4\().8h, v0.h[4]
mla \d0\().8h, \s5\().8h, v0.h[5]
mla \d0\().8h, \s6\().8h, v0.h[6]
mla \d0\().8h, \s7\().8h, v0.h[7]
mul \d1\().8h, \s4\().8h, v0.h[0]
mla \d1\().8h, \s5\().8h, v0.h[1]
mla \d1\().8h, \s6\().8h, v0.h[2]
mla \d1\().8h, \s7\().8h, v0.h[3]
mla \d1\().8h, \s8\().8h, v0.h[4]
mla \d1\().8h, \s9\().8h, v0.h[5]
mla \d1\().8h, \s10\().8h, v0.h[6]
mla \d1\().8h, \s11\().8h, v0.h[7]
.endm
.macro sqrshrun_b shift, r0, r1, r2, r3
sqrshrun \r0\().8b, \r0\().8h, #\shift
.ifnb \r1
@ -1633,7 +1635,7 @@ L(\type\()_8tap_v):
st_h \d_strd, v6, 4
ret
28: // 2x8, 2x16 v
28: // 2x6, 2x8, 2x12, 2x16 v
ld1 {v0.8b}, [\xmy]
sub \sr2, \src, \s_strd, lsl #1
add \ds2, \dst, \d_strd
@ -1648,28 +1650,32 @@ L(\type\()_8tap_v):
interleave_2_s v1, v2, v3, v4, v5, v6
uxtl_b v1, v2, v3, v4
216:
subs \h, \h, #8
subs \h, \h, #4
load_h \sr2, \src, \s_strd, v16, v17, v18, v19
load_h \sr2, \src, \s_strd, v20, v21, v22, v23
interleave_1_h v7, v16, v17, v18, v19
interleave_1_h v19, v20, v21, v22, v23
interleave_2_s v5, v6, v7, v16, v17, v18
interleave_2_s v17, v18, v19, v20, v21, v22
uxtl_b v5, v6, v7, v16
uxtl_b v17, v18, v19, v20
mul_mla_8_4 v30, v31, v1, v2, v3, v4, v5, v6, v7, v16, v17, v18, v19, v20
sqrshrun_b 6, v30, v31
mul_mla_8_0 v30, v1, v2, v3, v4, v5, v6, v7, v16
sqrshrun_b 6, v30
st_h \d_strd, v30, 4
st_h \d_strd, v31, 4
b.le 0f
mov v1.16b, v17.16b
mov v2.16b, v18.16b
mov v3.16b, v19.16b
mov v4.16b, v20.16b
mov v5.16b, v21.16b
mov v6.16b, v22.16b
mov v7.16b, v23.16b
cmp \h, #2
mov v1.16b, v5.16b
mov v2.16b, v6.16b
mov v3.16b, v7.16b
mov v4.16b, v16.16b
mov v5.16b, v17.16b
mov v6.16b, v18.16b
mov v7.16b, v19.16b
b.eq 26f
b 216b
26:
load_h \sr2, \src, \s_strd, v16, v17
interleave_1_h v7, v16, v17
uxtl_b v5, v6, v7, v16
mul_mla_8_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16
sqrshrun_b 6, v30
st_h \d_strd, v30, 2
0:
ret
.endif
@ -1703,7 +1709,7 @@ L(\type\()_8tap_v):
0:
ret
480: // 4x8, 4x16 v
480: // 4x6, 4x8, 4x12, 4x16 v
ld1 {v0.8b}, [\xmy]
sub \sr2, \src, \s_strd, lsl #1
add \ds2, \dst, \d_strd
@ -1726,12 +1732,19 @@ L(\type\()_8tap_v):
mul_mla_8_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
shift_store_4 \type, \d_strd, v1, v2
b.le 0f
subs \h, \h, #4
load_s \sr2, \src, \s_strd, v27, v16, v17, v18
interleave_1_s v26, v27, v16, v17, v18
uxtl_b v26, v27, v16, v17
mul_mla_8_2 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16, v17
shift_store_4 \type, \d_strd, v1, v2
load_s \sr2, \src, \s_strd, v27, v16
subs \h, \h, #2
interleave_1_s v26, v27, v16
uxtl_b v26, v27
mul_mla_8_0 v1, v20, v21, v22, v23, v24, v25, v26, v27
shift_store_4 \type, \d_strd, v1
b.le 0f
load_s \sr2, \src, \s_strd, v17, v18
subs \h, \h, #2
interleave_1_s v16, v17, v18
uxtl_b v16, v17
mul_mla_8_0 v2, v22, v23, v24, v25, v26, v27, v16, v17
shift_store_4 \type, \d_strd, v2
b.le 0f
subs \h, \h, #4
load_s \sr2, \src, \s_strd, v19, v20, v21, v22
@ -2641,6 +2654,7 @@ L(\type\()_bilin_v):
// 2x2 v
ld1 {v16.h}[0], [\src], \s_strd
b.gt 24f
22:
ld1 {v17.h}[0], [\sr2], \s_strd
ld1 {v18.h}[0], [\src], \s_strd
trn1 v16.4h, v16.4h, v17.4h
@ -2651,11 +2665,12 @@ L(\type\()_bilin_v):
st1 {v4.h}[0], [\dst]
st1 {v4.h}[1], [\ds2]
ret
24: // 2x4, 2x8, ... v
24: // 2x4, 2x6, 2x8, ... v
ld1 {v17.h}[0], [\sr2], \s_strd
ld1 {v18.h}[0], [\src], \s_strd
ld1 {v19.h}[0], [\sr2], \s_strd
ld1 {v20.h}[0], [\src], \s_strd
sub \h, \h, #4
trn1 v16.4h, v16.4h, v17.4h
trn1 v17.4h, v17.4h, v18.4h
trn1 v18.4h, v18.4h, v19.4h
@ -2664,14 +2679,15 @@ L(\type\()_bilin_v):
trn1 v17.2s, v17.2s, v19.2s
umull v4.8h, v16.8b, v2.8b
umlal v4.8h, v17.8b, v3.8b
subs \h, \h, #4
cmp \h, #2
uqrshrn v4.8b, v4.8h, #4
st1 {v4.h}[0], [\dst], \d_strd
st1 {v4.h}[1], [\ds2], \d_strd
st1 {v4.h}[2], [\dst], \d_strd
st1 {v4.h}[3], [\ds2], \d_strd
b.le 0f
b.lt 0f
mov v16.8b, v20.8b
b.eq 22b
b 24b
0:
ret

third_party/dav1d/src/arm/64/mc16.S (54 changed lines, vendored)

@ -1801,7 +1801,7 @@ L(\type\()_8tap_v):
st_s \d_strd, v16, 4
ret
28: // 2x8, 2x16 v
28: // 2x6, 2x8, 2x12, 2x16 v
ld1 {v0.8b}, [\xmy]
sub \sr2, \src, \s_strd, lsl #1
add \ds2, \dst, \d_strd
@ -1814,28 +1814,32 @@ L(\type\()_8tap_v):
interleave_1_s v1, v2, v3, v4, v5
interleave_1_s v5, v6, v7
216:
subs \h, \h, #8
subs \h, \h, #4
load_s \sr2, \src, \s_strd, v16, v17, v18, v19
load_s \sr2, \src, \s_strd, v20, v21, v22, v23
interleave_1_s v7, v16, v17, v18, v19
interleave_1_s v19, v20, v21, v22, v23
smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16
smull_smlal_8 v25, v3, v4, v5, v6, v7, v16, v17, v18
smull_smlal_8 v26, v5, v6, v7, v16, v17, v18, v19, v20
smull_smlal_8 v27, v7, v16, v17, v18, v19, v20, v21, v22
sqrshrun_h 6, v24, v25, v26, v27
umin_h v31, .8h, v24, v26
sqrshrun_h 6, v24, v25
umin_h v31, .8h, v24
st_s \d_strd, v24, 4
st_s \d_strd, v26, 4
b.le 0f
mov v1.16b, v17.16b
mov v2.16b, v18.16b
mov v3.16b, v19.16b
mov v4.16b, v20.16b
mov v5.16b, v21.16b
mov v6.16b, v22.16b
mov v7.16b, v23.16b
cmp \h, #2
mov v1.16b, v5.16b
mov v2.16b, v6.16b
mov v3.16b, v7.16b
mov v4.16b, v16.16b
mov v5.16b, v17.16b
mov v6.16b, v18.16b
mov v7.16b, v19.16b
b.eq 26f
b 216b
26:
load_s \sr2, \src, \s_strd, v16, v17
interleave_1_s v7, v16, v17
smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16
sqrshrun_h 6, v24
umin_h v31, .4h, v24
st_s \d_strd, v24, 2
0:
ret
.endif
@ -1867,7 +1871,7 @@ L(\type\()_8tap_v):
0:
ret
480: // 4x8, 4x16 v
480: // 4x6, 4x8, 4x12, 4x16 v
ld1 {v0.8b}, [\xmy]
sub \sr2, \src, \s_strd, lsl #1
add \ds2, \dst, \d_strd
@ -1887,6 +1891,7 @@ L(\type\()_8tap_v):
smull_smlal_8 v4, v19, v20, v21, v22, v23, v24, v25, v26
shift_store_4 \type, \d_strd, v1, v2, v3, v4
b.le 0f
cmp \h, #2
mov v16.8b, v20.8b
mov v17.8b, v21.8b
mov v18.8b, v22.8b
@ -1894,7 +1899,13 @@ L(\type\()_8tap_v):
mov v20.8b, v24.8b
mov v21.8b, v25.8b
mov v22.8b, v26.8b
b.eq 46f
b 48b
46:
load_4h \sr2, \src, \s_strd, v23, v24
smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23
smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24
shift_store_4 \type, \d_strd, v1, v2
0:
ret
@ -2858,6 +2869,7 @@ L(\type\()_bilin_v):
// 2x2 v
ld1 {v16.s}[0], [\src], \s_strd
b.gt 24f
22:
ld1 {v17.s}[0], [\sr2], \s_strd
ld1 {v18.s}[0], [\src], \s_strd
trn1 v16.2s, v16.2s, v17.2s
@ -2868,11 +2880,12 @@ L(\type\()_bilin_v):
st1 {v4.s}[0], [\dst]
st1 {v4.s}[1], [\ds2]
ret
24: // 2x4, 2x8, ... v
24: // 2x4, 2x6, 2x8, ... v
ld1 {v17.s}[0], [\sr2], \s_strd
ld1 {v18.s}[0], [\src], \s_strd
ld1 {v19.s}[0], [\sr2], \s_strd
ld1 {v20.s}[0], [\src], \s_strd
sub \h, \h, #4
trn1 v16.2s, v16.2s, v17.2s
trn1 v17.2s, v17.2s, v18.2s
trn1 v18.2s, v18.2s, v19.2s
@ -2881,14 +2894,15 @@ L(\type\()_bilin_v):
trn1 v17.2d, v17.2d, v19.2d
mul v4.8h, v16.8h, v2.8h
mla v4.8h, v17.8h, v3.8h
subs \h, \h, #4
cmp \h, #2
urshr v4.8h, v4.8h, #4
st1 {v4.s}[0], [\dst], \d_strd
st1 {v4.s}[1], [\ds2], \d_strd
st1 {v4.s}[2], [\dst], \d_strd
st1 {v4.s}[3], [\ds2], \d_strd
b.le 0f
b.lt 0f
mov v16.8b, v20.8b
b.eq 22b
b 24b
0:
ret

third_party/dav1d/src/cpu.c (9 changed lines, vendored)

@ -49,16 +49,11 @@
#endif
static unsigned flags = 0;
#if __has_feature(memory_sanitizer)
// memory sanitizer is inherently incompatible with asm
static unsigned flags_mask = 0;
#else
static unsigned flags_mask = -1;
#endif
COLD void dav1d_init_cpu(void) {
#if HAVE_ASM
#if HAVE_ASM && !__has_feature(memory_sanitizer)
// memory sanitizer is inherently incompatible with asm
#if ARCH_AARCH64 || ARCH_ARM
flags = dav1d_get_cpu_flags_arm();
#elif ARCH_PPC64LE
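__has_feature() is a Clang extension, so the combined HAVE_ASM && !__has_feature(memory_sanitizer) test above only stays well-formed on other compilers if a fallback macro is in scope (dav1d carries an equivalent guard). A sketch of the usual shim:

/* Non-Clang compilers lack __has_feature(); defining it to 0 keeps the
 * preprocessor test below valid everywhere. */
#ifndef __has_feature
#define __has_feature(x) 0
#endif

#if HAVE_ASM && !__has_feature(memory_sanitizer)
/* asm-backed CPU feature detection only runs here */
#endif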

third_party/dav1d/src/decode.c (20 changed lines, vendored)

@ -3295,6 +3295,15 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
f->lf.sr_p[1] = f->sr_cur.p.data[has_chroma ? 1 : 0];
f->lf.sr_p[2] = f->sr_cur.p.data[has_chroma ? 2 : 0];
retval = 0;
error:
return retval;
}
int dav1d_decode_frame_init_cdf(Dav1dFrameContext *const f) {
const Dav1dContext *const c = f->c;
int retval = DAV1D_ERR(EINVAL);
if (f->frame_hdr->refresh_context)
dav1d_cdf_thread_copy(f->out_cdf.data.cdf, &f->in_cdf);
@ -3430,6 +3439,7 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
// if n_tc > 1 (but n_fc == 1), we could run init/exit in the task
// threads also. Not sure it makes a measurable difference.
int res = dav1d_decode_frame_init(f);
if (!res) res = dav1d_decode_frame_init_cdf(f);
// wait until all threads have completed
if (!res) {
if (f->c->n_tc > 1) {
@ -3487,7 +3497,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
atomic_fetch_add(&c->task_thread.first, 1U);
else
atomic_store(&c->task_thread.first, 0);
if (c->task_thread.cur < c->n_fc)
if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
c->task_thread.cur--;
}
if (out_delayed->p.data[0]) {
@ -3496,7 +3506,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
if ((out_delayed->visible || c->output_invisible_frames) &&
progress != FRAME_ERROR)
{
dav1d_picture_ref(&c->out, &out_delayed->p);
dav1d_thread_picture_ref(&c->out, out_delayed);
c->event_flags |= dav1d_picture_get_event_flags(out_delayed);
}
dav1d_thread_picture_unref(out_delayed);
@ -3670,7 +3680,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
// move f->cur into output queue
if (c->n_fc == 1) {
if (f->frame_hdr->show_frame || c->output_invisible_frames) {
dav1d_picture_ref(&c->out, &f->sr_cur.p);
dav1d_thread_picture_ref(&c->out, &f->sr_cur);
c->event_flags |= dav1d_picture_get_event_flags(&f->sr_cur);
}
} else {
@ -3822,7 +3832,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
if (c->n_fc == 1) {
if ((res = dav1d_decode_frame(f)) < 0) {
dav1d_picture_unref_internal(&c->out);
dav1d_thread_picture_unref(&c->out);
for (int i = 0; i < 8; i++) {
if (refresh_frame_flags & (1 << i)) {
if (c->refs[i].p.p.data[0])
@ -3851,7 +3861,7 @@ error:
dav1d_ref_dec(&f->ref_mvs_ref[i]);
}
if (c->n_fc == 1)
dav1d_picture_unref_internal(&c->out);
dav1d_thread_picture_unref(&c->out);
else
dav1d_thread_picture_unref(out_delayed);
dav1d_picture_unref_internal(&f->cur);

third_party/dav1d/src/internal.h (7 changed lines, vendored)

@ -102,7 +102,7 @@ struct Dav1dContext {
// decoded output picture queue
Dav1dData in;
Dav1dPicture out;
Dav1dThreadPicture out, cache;
// dummy is a pointer to prevent compiler errors about atomic_load()
// not taking const arguments
atomic_int flush_mem, *flush;
@ -158,6 +158,7 @@ struct Dav1dContext {
unsigned frame_size_limit;
int strict_std_compliance;
int output_invisible_frames;
enum Dav1dInloopFilterType inloop_filters;
int drain;
enum PictureFlags frame_flags;
enum Dav1dEventFlags event_flags;
@ -169,14 +170,15 @@ struct Dav1dContext {
enum TaskType {
DAV1D_TASK_TYPE_INIT,
DAV1D_TASK_TYPE_INIT_CDF,
DAV1D_TASK_TYPE_TILE_ENTROPY,
DAV1D_TASK_TYPE_ENTROPY_PROGRESS,
DAV1D_TASK_TYPE_TILE_RECONSTRUCTION,
DAV1D_TASK_TYPE_DEBLOCK_COLS,
DAV1D_TASK_TYPE_DEBLOCK_ROWS,
DAV1D_TASK_TYPE_CDEF,
DAV1D_TASK_TYPE_SUPER_RESOLUTION,
DAV1D_TASK_TYPE_LOOP_RESTORATION,
DAV1D_TASK_TYPE_ENTROPY_PROGRESS,
DAV1D_TASK_TYPE_RECONSTRUCTION_PROGRESS,
};
@ -303,6 +305,7 @@ struct Dav1dFrameContext {
struct TaskThreadData *ttd;
struct Dav1dTask *tasks, *tile_tasks[2], init_task;
int num_tasks, num_tile_tasks;
int init_done;
int done[2];
int retval;
int update_set; // whether we need to update CDF reference
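Note that the TaskType order is load-bearing: tasks are sorted and compared numerically (e.g. the t->type > DAV1D_TASK_TYPE_INIT_CDF test in src/thread_task.c below), so moving DAV1D_TASK_TYPE_ENTROPY_PROGRESS between the two tile types changes scheduling, not just naming. A small self-contained illustration, with a hypothetical mirror of the reordered enum, of why insert_tasks() had to drop its old range check:

/* Values are illustrative only, mirroring the new ordering. */
enum Type { INIT, INIT_CDF, TILE_ENTROPY, ENTROPY_PROGRESS, TILE_RECONSTRUCTION };

static int is_tile_task(enum Type t) {
    /* A range test (t <= TILE_RECONSTRUCTION) would now also accept
     * ENTROPY_PROGRESS, so the two tile types are matched explicitly,
     * exactly as the updated assert in insert_tasks() does. */
    return t == TILE_ENTROPY || t == TILE_RECONSTRUCTION;
}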

third_party/dav1d/src/lib.c (68 changed lines, vendored)

@ -76,6 +76,7 @@ COLD void dav1d_default_settings(Dav1dSettings *const s) {
s->frame_size_limit = 0;
s->strict_std_compliance = 0;
s->output_invisible_frames = 0;
s->inloop_filters = DAV1D_INLOOPFILTER_ALL;
}
static void close_internal(Dav1dContext **const c_out, int flush);
@ -131,6 +132,7 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
c->frame_size_limit = s->frame_size_limit;
c->strict_std_compliance = s->strict_std_compliance;
c->output_invisible_frames = s->output_invisible_frames;
c->inloop_filters = s->inloop_filters;
if (dav1d_mem_pool_init(&c->seq_hdr_pool) ||
dav1d_mem_pool_init(&c->frame_hdr_pool) ||
@ -311,33 +313,46 @@ static int has_grain(const Dav1dPicture *const pic)
fgdata->num_uv_points[1];
}
static int output_image(Dav1dContext *const c, Dav1dPicture *const out,
Dav1dPicture *const in)
static int output_image(Dav1dContext *const c, Dav1dPicture *const out)
{
if (!c->apply_grain || !has_grain(in)) {
dav1d_picture_move_ref(out, in);
return 0;
int res = 0;
Dav1dThreadPicture *const in = c->all_layers ? &c->out : &c->cache;
if (!c->apply_grain || !has_grain(&in->p)) {
dav1d_picture_move_ref(out, &in->p);
dav1d_thread_picture_unref(in);
goto end;
}
int res = dav1d_apply_grain(c, out, in);
dav1d_picture_unref_internal(in);
res = dav1d_apply_grain(c, out, &in->p);
dav1d_thread_picture_unref(in);
end:
if (!c->all_layers && c->out.p.data[0]) {
dav1d_thread_picture_move_ref(in, &c->out);
}
return res;
}
static int output_picture_ready(Dav1dContext *const c) {
if (!c->out.data[0]) return 0;
// skip lower spatial layers
if (c->operating_point_idc && !c->all_layers) {
const int max_spatial_id = ulog2(c->operating_point_idc >> 8);
if (max_spatial_id > c->out.frame_hdr->spatial_id) {
dav1d_picture_unref_internal(&c->out);
static int output_picture_ready(Dav1dContext *const c, const int drain) {
if (!c->all_layers) {
if (c->out.p.data[0] && c->cache.p.data[0]) {
const unsigned spatial_mask = c->operating_point_idc >> 8;
const int max_spatial_id = spatial_mask ? ulog2(spatial_mask) : 0;
if (max_spatial_id == c->cache.p.frame_hdr->spatial_id ||
c->out.flags & PICTURE_FLAG_NEW_TEMPORAL_UNIT)
return 1;
dav1d_thread_picture_unref(&c->cache);
dav1d_thread_picture_move_ref(&c->cache, &c->out);
return 0;
} else if (c->cache.p.data[0] && drain) {
return 1;
} else if (c->out.p.data[0]) {
dav1d_thread_picture_move_ref(&c->cache, &c->out);
return 0;
}
}
return 1;
return !!c->out.p.data[0];
}
static int drain_picture(Dav1dContext *const c, Dav1dPicture *const out) {
@ -369,15 +384,18 @@ static int drain_picture(Dav1dContext *const c, Dav1dPicture *const out) {
if ((out_delayed->visible || c->output_invisible_frames) &&
progress != FRAME_ERROR)
{
dav1d_picture_ref(&c->out, &out_delayed->p);
dav1d_thread_picture_ref(&c->out, out_delayed);
c->event_flags |= dav1d_picture_get_event_flags(out_delayed);
}
dav1d_thread_picture_unref(out_delayed);
if (output_picture_ready(c))
return output_image(c, out, &c->out);
if (output_picture_ready(c, 0))
return output_image(c, out);
}
} while (++drain_count < c->n_fc);
if (output_picture_ready(c, 1))
return output_image(c, out);
return DAV1D_ERR(EAGAIN);
}
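In the !all_layers case the decoder now parks the best spatial layer seen so far in c->cache and only releases it once a higher layer can no longer arrive: the cached frame already has the operating point's max_spatial_id, the next picture starts a new temporal unit (PICTURE_FLAG_NEW_TEMPORAL_UNIT, set by the src/obu.c change further down when a temporal delimiter OBU is parsed), or the decoder is draining. A simplified model of that decision, using a hypothetical flattened state struct instead of the real Dav1dContext:

#include <stdbool.h>

/* Hypothetical state: one in-flight picture (out) and one buffered one
 * (cache), reduced to the fields the decision needs. */
struct layer_state {
    bool out_full, cache_full, out_new_temporal_unit;
    int  cache_spatial_id, max_spatial_id;
};

static bool picture_ready(struct layer_state *s, bool drain) {
    if (s->out_full && s->cache_full) {
        /* The cached frame is final: it is the highest layer of its unit,
         * or the incoming picture already belongs to the next unit. */
        if (s->cache_spatial_id == s->max_spatial_id || s->out_new_temporal_unit)
            return true;
        s->out_full = false;               /* out replaces the cached frame */
        return false;
    }
    if (s->cache_full && drain)
        return true;                       /* flush the last buffered frame */
    if (s->out_full) {                     /* first picture of a unit: buffer */
        s->cache_full = true;
        s->out_full = false;
    }
    return false;
}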
@ -386,7 +404,7 @@ static int gen_picture(Dav1dContext *const c)
int res;
Dav1dData *const in = &c->in;
if (output_picture_ready(c))
if (output_picture_ready(c, 0))
return 0;
while (in->sz > 0) {
@ -399,7 +417,7 @@ static int gen_picture(Dav1dContext *const c)
in->data += res;
if (!in->sz) dav1d_data_unref_internal(in);
}
if (output_picture_ready(c))
if (output_picture_ready(c, 0))
break;
if (res < 0)
return res;
@ -439,8 +457,8 @@ int dav1d_get_picture(Dav1dContext *const c, Dav1dPicture *const out)
if (res < 0)
return res;
if (output_picture_ready(c))
return output_image(c, out, &c->out);
if (output_picture_ready(c, c->n_fc == 1))
return output_image(c, out);
if (c->n_fc > 1 && drain)
return drain_picture(c, out);
@ -592,6 +610,8 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
freep(&f->frame_thread.tile_start_off);
dav1d_freep_aligned(&f->frame_thread.pal);
freep(&f->frame_thread.cbi);
}
if (c->n_tc > 1) {
pthread_cond_destroy(&f->task_thread.cond);
}
freep(&f->frame_thread.frame_progress);

third_party/dav1d/src/meson.build (39 changed lines, vendored)

@ -260,29 +260,38 @@ endif
libdav1d_rc_obj = []
libdav1d_flags = [stackalign_flag]
api_export_flags = []
#
# Windows .rc file and API export flags
#
if host_machine.system() == 'windows' and get_option('default_library') != 'static'
rc_file = configure_file(
input : 'dav1d.rc.in',
output : 'dav1d.rc',
configuration : rc_data
)
if host_machine.system() == 'windows'
if get_option('default_library') != 'static'
rc_file = configure_file(
input : 'dav1d.rc.in',
output : 'dav1d.rc',
configuration : rc_data
)
libdav1d_rc_obj = winmod.compile_resources(rc_file)
libdav1d_rc_obj = winmod.compile_resources(rc_file)
api_export_flags = ['-DDAV1D_BUILDING_DLL']
else
libdav1d_rc_obj = []
api_export_flags = ['-DDAV1D_BUILDING_DLL']
endif
if (host_machine.cpu_family() == 'x86_64' and cc.get_id() == 'gcc')
# We don't expect to reference data members from other DLLs without
# dllimport attributes. Set the -mcmodel=small flag, which avoids
# generating indirection via .refptr.<symname> for all potentially
# dllimported variable references.
libdav1d_flags += '-mcmodel=small'
endif
endif
#
# Library definitions
#
@ -294,7 +303,7 @@ libdav1d_entrypoints_objs = static_library('dav1d_entrypoint',
include_directories : dav1d_inc_dirs,
dependencies: [stdatomic_dependencies],
c_args : [stackalign_flag, stackrealign_flag, api_export_flags],
c_args : [libdav1d_flags, stackrealign_flag, api_export_flags],
install : false,
build_by_default : false,
).extract_all_objects(recursive: true)
@ -307,7 +316,7 @@ foreach bitdepth : dav1d_bitdepths
libdav1d_tmpl_sources, config_h_target,
include_directories: dav1d_inc_dirs,
dependencies : [stdatomic_dependencies],
c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + stackalign_flag,
c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + libdav1d_flags,
install : false,
build_by_default : false,
).extract_all_objects(recursive: true)
@ -320,7 +329,7 @@ foreach bitdepth : dav1d_bitdepths
libdav1d_arch_tmpl_sources, config_h_target,
include_directories: dav1d_inc_dirs,
dependencies : [stdatomic_dependencies],
c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + stackalign_flag + arch_flags,
c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + libdav1d_flags + arch_flags,
install : false,
build_by_default : false,
).extract_all_objects(recursive: true)
@ -350,7 +359,7 @@ libdav1d = library('dav1d',
thread_compat_dep,
libdl_dependency,
],
c_args : [stackalign_flag, api_export_flags],
c_args : [libdav1d_flags, api_export_flags],
version : dav1d_soname_version,
soversion : dav1d_soversion,
install : true,

third_party/dav1d/src/obu.c (14 changed lines, vendored)

@ -1533,8 +1533,10 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
break;
}
case DAV1D_OBU_PADDING:
case DAV1D_OBU_TD:
c->frame_flags |= PICTURE_FLAG_NEW_TEMPORAL_UNIT;
break;
case DAV1D_OBU_PADDING:
// ignore OBUs we don't care about
break;
default:
@ -1547,9 +1549,9 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
if (c->frame_hdr->show_existing_frame) {
if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.data[0]) return DAV1D_ERR(EINVAL);
if (c->n_fc == 1) {
dav1d_picture_ref(&c->out,
&c->refs[c->frame_hdr->existing_frame_idx].p.p);
dav1d_data_props_copy(&c->out.m, &in->m);
dav1d_thread_picture_ref(&c->out,
&c->refs[c->frame_hdr->existing_frame_idx].p);
dav1d_data_props_copy(&c->out.p.m, &in->m);
c->event_flags |= dav1d_picture_get_event_flags(&c->refs[c->frame_hdr->existing_frame_idx].p);
} else {
pthread_mutex_lock(&c->task_thread.lock);
@ -1569,7 +1571,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
atomic_fetch_add(&c->task_thread.first, 1U);
else
atomic_store(&c->task_thread.first, 0);
if (c->task_thread.cur < c->n_fc)
if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
c->task_thread.cur--;
}
if (out_delayed->p.data[0]) {
@ -1578,7 +1580,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
if ((out_delayed->visible || c->output_invisible_frames) &&
progress != FRAME_ERROR)
{
dav1d_picture_ref(&c->out, &out_delayed->p);
dav1d_thread_picture_ref(&c->out, out_delayed);
c->event_flags |= dav1d_picture_get_event_flags(out_delayed);
}
dav1d_thread_picture_unref(out_delayed);

third_party/dav1d/src/picture.c (10 changed lines, vendored)

@ -259,6 +259,16 @@ void dav1d_thread_picture_ref(Dav1dThreadPicture *const dst,
dst->flags = src->flags;
}
void dav1d_thread_picture_move_ref(Dav1dThreadPicture *const dst,
Dav1dThreadPicture *const src)
{
dav1d_picture_move_ref(&dst->p, &src->p);
dst->visible = src->visible;
dst->progress = src->progress;
dst->flags = src->flags;
memset(src, 0, sizeof(*src));
}
void dav1d_picture_unref_internal(Dav1dPicture *const p) {
validate_input(p != NULL);
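Unlike dav1d_thread_picture_ref(), which takes an additional reference, the move variant transfers the existing one, and the memset() empties the source, so there is no refcount traffic and a later unref of the source is a harmless no-op. An illustrative fragment of the internal usage pattern (internal API, so a sketch only):

Dav1dThreadPicture dst, src;
/* ... src holds one reference, e.g. taken via dav1d_thread_picture_ref() ... */
dav1d_thread_picture_move_ref(&dst, &src);
/* src is now zeroed (src.p.data[0] == NULL); the single reference lives on
 * in dst until dav1d_thread_picture_unref(&dst). */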

third_party/dav1d/src/picture.h (3 changed lines, vendored)

@ -46,6 +46,7 @@ enum PlaneType {
enum PictureFlags {
PICTURE_FLAG_NEW_SEQUENCE = 1 << 0,
PICTURE_FLAG_NEW_OP_PARAMS_INFO = 1 << 1,
PICTURE_FLAG_NEW_TEMPORAL_UNIT = 1 << 2,
};
typedef struct Dav1dThreadPicture {
@ -83,6 +84,8 @@ int dav1d_picture_alloc_copy(Dav1dContext *c, Dav1dPicture *dst, const int w,
void dav1d_picture_ref(Dav1dPicture *dst, const Dav1dPicture *src);
void dav1d_thread_picture_ref(Dav1dThreadPicture *dst,
const Dav1dThreadPicture *src);
void dav1d_thread_picture_move_ref(Dav1dThreadPicture *dst,
Dav1dThreadPicture *src);
void dav1d_thread_picture_unref(Dav1dThreadPicture *p);
/**

third_party/dav1d/src/recon_tmpl.c (16 changed lines, vendored)

@ -2046,6 +2046,11 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize
}
void bytefn(dav1d_filter_sbrow_deblock_cols)(Dav1dFrameContext *const f, const int sby) {
if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK) ||
(!f->frame_hdr->loopfilter.level_y[0] && !f->frame_hdr->loopfilter.level_y[1]))
{
return;
}
const int y = sby * f->sb_step * 4;
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
pixel *const p[3] = {
@ -2054,9 +2059,8 @@ void bytefn(dav1d_filter_sbrow_deblock_cols)(Dav1dFrameContext *const f, const i
f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
};
Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
if (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1])
bytefn(dav1d_loopfilter_sbrow_cols)(f, p, mask, sby,
f->lf.start_of_tile_row[sby]);
bytefn(dav1d_loopfilter_sbrow_cols)(f, p, mask, sby,
f->lf.start_of_tile_row[sby]);
}
void bytefn(dav1d_filter_sbrow_deblock_rows)(Dav1dFrameContext *const f, const int sby) {
@ -2068,7 +2072,9 @@ void bytefn(dav1d_filter_sbrow_deblock_rows)(Dav1dFrameContext *const f, const i
f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
};
Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
if (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1]) {
if (f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK &&
(f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1]))
{
bytefn(dav1d_loopfilter_sbrow_rows)(f, p, mask, sby);
}
if (f->seq_hdr->cdef || f->lf.restore_planes) {
@ -2079,6 +2085,7 @@ void bytefn(dav1d_filter_sbrow_deblock_rows)(Dav1dFrameContext *const f, const i
void bytefn(dav1d_filter_sbrow_cdef)(Dav1dTaskContext *const tc, const int sby) {
const Dav1dFrameContext *const f = tc->f;
if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_CDEF)) return;
const int sbsz = f->sb_step;
const int y = sby * sbsz * 4;
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
@ -2140,6 +2147,7 @@ void bytefn(dav1d_filter_sbrow_resize)(Dav1dFrameContext *const f, const int sby
}
void bytefn(dav1d_filter_sbrow_lr)(Dav1dFrameContext *const f, const int sby) {
if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_RESTORATION)) return;
const int y = sby * f->sb_step * 4;
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
pixel *const sr_p[3] = {

third_party/dav1d/src/thread_task.c (83 changed lines, vendored)

@ -141,7 +141,8 @@ static void insert_tasks(Dav1dFrameContext *const f,
}
// sort by tile-id
assert(first->type <= DAV1D_TASK_TYPE_TILE_RECONSTRUCTION);
assert(first->type == DAV1D_TASK_TYPE_TILE_RECONSTRUCTION ||
first->type == DAV1D_TASK_TYPE_TILE_ENTROPY);
assert(first->type == t_ptr->type);
assert(t_ptr->sby == first->sby);
const int p = first->type == DAV1D_TASK_TYPE_TILE_ENTROPY;
@ -270,6 +271,7 @@ int dav1d_task_create_tile_sbrow(Dav1dFrameContext *const f, const int pass,
void dav1d_task_frame_init(Dav1dFrameContext *const f) {
const Dav1dContext *const c = f->c;
f->task_thread.init_done = 0;
// schedule init task, which will schedule the remaining tasks
Dav1dTask *const t = &f->task_thread.init_task;
t->type = DAV1D_TASK_TYPE_INIT;
@ -350,6 +352,18 @@ static inline int check_tile(Dav1dTask *const t, Dav1dFrameContext *const f,
return 0;
}
static inline void abort_frame(Dav1dFrameContext *const f) {
atomic_store(&f->task_thread.error, 1);
f->task_thread.task_counter = 0;
f->task_thread.done[0] = 1;
f->task_thread.done[1] = 1;
atomic_store(&f->sr_cur.progress[0], FRAME_ERROR);
atomic_store(&f->sr_cur.progress[1], FRAME_ERROR);
dav1d_decode_frame_exit(f, -1);
f->n_tile_data = 0;
pthread_cond_signal(&f->task_thread.cond);
}
void *dav1d_worker_task(void *data) {
Dav1dTaskContext *const tc = data;
const Dav1dContext *const c = tc->c;
@ -360,23 +374,37 @@ void *dav1d_worker_task(void *data) {
pthread_mutex_lock(&ttd->lock);
for (;;) {
Dav1dFrameContext *f;
Dav1dTask *t, *prev_t;
Dav1dTask *t, *prev_t = NULL;
if (tc->task_thread.die) break;
if (atomic_load(c->flush)) goto park;
while (ttd->cur < c->n_fc) {
const unsigned first = atomic_load(&ttd->first);
f = &c->fc[(first + ttd->cur) % c->n_fc];
prev_t = f->task_thread.task_cur_prev;
t = prev_t ? prev_t->next : f->task_thread.task_head;
while (t) {
if (t->type == DAV1D_TASK_TYPE_INIT) {
if (c->n_fc > 1) { // run init tasks first
for (unsigned i = 0; i < c->n_fc; i++) {
const unsigned first = atomic_load(&ttd->first);
f = &c->fc[(first + i) % c->n_fc];
if (f->task_thread.init_done) continue;
t = f->task_thread.task_head;
if (!t) continue;
if (t->type == DAV1D_TASK_TYPE_INIT) goto found;
if (t->type == DAV1D_TASK_TYPE_INIT_CDF) {
const int p1 = f->in_cdf.progress ?
atomic_load(f->in_cdf.progress) : 1;
if (p1) {
atomic_fetch_or(&f->task_thread.error, p1 == TILE_ERROR);
goto found;
}
} else if (t->type <= DAV1D_TASK_TYPE_TILE_RECONSTRUCTION) {
}
}
}
while (ttd->cur < c->n_fc) {
const unsigned first = atomic_load(&ttd->first);
f = &c->fc[(first + ttd->cur) % c->n_fc];
prev_t = f->task_thread.task_cur_prev;
t = prev_t ? prev_t->next : f->task_thread.task_head;
while (t) {
if (t->type == DAV1D_TASK_TYPE_INIT_CDF) goto next;
else if (t->type == DAV1D_TASK_TYPE_TILE_ENTROPY ||
t->type == DAV1D_TASK_TYPE_TILE_RECONSTRUCTION)
{
// if not bottom sbrow of tile, this task will be re-added
// after it's finished
if (!check_tile(t, f, c->n_fc > 1))
@ -447,7 +475,8 @@ void *dav1d_worker_task(void *data) {
if (prev_t) prev_t->next = t->next;
else f->task_thread.task_head = t->next;
if (!t->next) f->task_thread.task_tail = prev_t;
if (!f->task_thread.task_head) ttd->cur++;
if (t->type > DAV1D_TASK_TYPE_INIT_CDF && !f->task_thread.task_head)
ttd->cur++;
// we don't need to check cond_signaled here, since we found a task
// after the last signal so we want to re-signal the next waiting thread
// and again won't need to signal after that
@ -463,10 +492,26 @@ void *dav1d_worker_task(void *data) {
int sby = t->sby;
switch (t->type) {
case DAV1D_TASK_TYPE_INIT: {
assert(c->n_fc > 1);
int res = dav1d_decode_frame_init(f);
int p1 = f->in_cdf.progress ? atomic_load(f->in_cdf.progress) : 1;
if (res || p1 == TILE_ERROR) {
pthread_mutex_lock(&ttd->lock);
abort_frame(f);
} else if (!res) {
t->type = DAV1D_TASK_TYPE_INIT_CDF;
if (p1) goto found_unlocked;
pthread_mutex_lock(&ttd->lock);
insert_task(f, t, 0);
}
reset_task_cur(c, ttd, t->frame_idx);
continue;
}
case DAV1D_TASK_TYPE_INIT_CDF: {
assert(c->n_fc > 1);
int res = -1;
if (!atomic_load(&f->task_thread.error))
res = dav1d_decode_frame_init(f);
res = dav1d_decode_frame_init_cdf(f);
pthread_mutex_lock(&ttd->lock);
if (f->frame_hdr->refresh_context && !f->task_thread.update_set) {
atomic_store(f->out_cdf.progress, res < 0 ? TILE_ERROR : 1);
@ -490,19 +535,9 @@ void *dav1d_worker_task(void *data) {
}
}
}
} else {
// init failed, signal completion
atomic_store(&f->task_thread.error, 1);
f->task_thread.task_counter = 0;
f->task_thread.done[0] = 1;
f->task_thread.done[1] = 1;
atomic_store(&f->sr_cur.progress[0], FRAME_ERROR);
atomic_store(&f->sr_cur.progress[1], FRAME_ERROR);
dav1d_decode_frame_exit(f, -1);
f->n_tile_data = 0;
pthread_cond_signal(&f->task_thread.cond);
}
} else abort_frame(f);
reset_task_cur(c, ttd, t->frame_idx);
f->task_thread.init_done = 1;
continue;
}
case DAV1D_TASK_TYPE_TILE_ENTROPY:
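The frame setup that used to be a single INIT task is now two stages: INIT runs dav1d_decode_frame_init() and then converts the same task object into INIT_CDF, which may only run once the reference frame's CDFs are final (in_cdf.progress != 0) and then calls dav1d_decode_frame_init_cdf(). A simplified model of that state machine, with hypothetical names:

/* Hypothetical reduction of the two-stage init introduced above. */
enum stage { STAGE_INIT, STAGE_INIT_CDF, STAGE_TILES };

static enum stage step(enum stage s, int cdf_ready) {
    switch (s) {
    case STAGE_INIT:
        /* dav1d_decode_frame_init(): allocate per-frame state */
        return STAGE_INIT_CDF;             /* same task, new type */
    case STAGE_INIT_CDF:
        if (!cdf_ready)
            return STAGE_INIT_CDF;         /* stays queued until progress != 0 */
        /* dav1d_decode_frame_init_cdf(): copy CDFs, schedule tile tasks */
        return STAGE_TILES;
    default:
        return STAGE_TILES;
    }
}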

third_party/dav1d/src/thread_task.h (1 changed line, vendored)

@ -42,6 +42,7 @@ void dav1d_task_frame_init(Dav1dFrameContext *f);
void *dav1d_worker_task(void *data);
int dav1d_decode_frame_init(Dav1dFrameContext *f);
int dav1d_decode_frame_init_cdf(Dav1dFrameContext *f);
int dav1d_decode_frame_main(Dav1dFrameContext *f);
void dav1d_decode_frame_exit(Dav1dFrameContext *f, int retval);
int dav1d_decode_frame(Dav1dFrameContext *f);

third_party/dav1d/src/x86/cpu.c (30 changed lines, vendored)

@ -28,13 +28,14 @@
#include "config.h"
#include <stdint.h>
#include <string.h>
#include "common/attributes.h"
#include "src/x86/cpu.h"
typedef struct {
uint32_t eax, ebx, ecx, edx;
uint32_t eax, ebx, edx, ecx;
} CpuidRegisters;
void dav1d_cpu_cpuid(CpuidRegisters *regs, unsigned leaf, unsigned subleaf);
@ -43,13 +44,22 @@ uint64_t dav1d_cpu_xgetbv(unsigned xcr);
#define X(reg, mask) (((reg) & (mask)) == (mask))
COLD unsigned dav1d_get_cpu_flags_x86(void) {
CpuidRegisters r = { 0 };
dav1d_cpu_cpuid(&r, 0, 0);
const unsigned max_leaf = r.eax;
union {
CpuidRegisters r;
struct {
uint32_t max_leaf;
char vendor[12];
};
} cpu;
dav1d_cpu_cpuid(&cpu.r, 0, 0);
unsigned flags = 0;
if (max_leaf >= 1) {
if (cpu.max_leaf >= 1) {
CpuidRegisters r;
dav1d_cpu_cpuid(&r, 1, 0);
const unsigned model = ((r.eax >> 4) & 0x0f) + ((r.eax >> 12) & 0xf0);
const unsigned family = ((r.eax >> 8) & 0x0f) + ((r.eax >> 20) & 0xff);
if (X(r.edx, 0x06008000)) /* CMOV/SSE/SSE2 */ {
flags |= DAV1D_X86_CPU_FLAG_SSE2;
if (X(r.ecx, 0x00000201)) /* SSE3/SSSE3 */ {
@ -63,7 +73,7 @@ COLD unsigned dav1d_get_cpu_flags_x86(void) {
if (X(r.ecx, 0x18000000)) /* OSXSAVE/AVX */ {
const uint64_t xcr0 = dav1d_cpu_xgetbv(0);
if (X(xcr0, 0x00000006)) /* XMM/YMM */ {
if (max_leaf >= 7) {
if (cpu.max_leaf >= 7) {
dav1d_cpu_cpuid(&r, 7, 0);
if (X(r.ebx, 0x00000128)) /* BMI1/BMI2/AVX2 */ {
flags |= DAV1D_X86_CPU_FLAG_AVX2;
@ -76,6 +86,14 @@ COLD unsigned dav1d_get_cpu_flags_x86(void) {
}
}
#endif
if (!memcmp(cpu.vendor, "AuthenticAMD", sizeof(cpu.vendor))) {
if ((flags & DAV1D_X86_CPU_FLAG_AVX2) && (family < 0x19 ||
(family == 0x19 && (model < 0x10 || (model >= 0x20 && model < 0x60)))))
{
/* Excavator, Zen, Zen+, Zen 2, Zen 3, Zen 3+ */
flags |= DAV1D_X86_CPU_FLAG_SLOW_GATHER;
}
}
}
return flags;

third_party/dav1d/src/x86/cpu.h (14 changed lines, vendored)

@ -29,12 +29,14 @@
#define DAV1D_SRC_X86_CPU_H
enum CpuFlags {
DAV1D_X86_CPU_FLAG_SSE2 = 1 << 0,
DAV1D_X86_CPU_FLAG_SSSE3 = 1 << 1,
DAV1D_X86_CPU_FLAG_SSE41 = 1 << 2,
DAV1D_X86_CPU_FLAG_AVX2 = 1 << 3,
DAV1D_X86_CPU_FLAG_AVX512ICL = 1 << 4, /* F/CD/BW/DQ/VL/VNNI/IFMA/VBMI/VBMI2/
* VPOPCNTDQ/BITALG/GFNI/VAES/VPCLMULQDQ */
DAV1D_X86_CPU_FLAG_SSE2 = 1 << 0,
DAV1D_X86_CPU_FLAG_SSSE3 = 1 << 1,
DAV1D_X86_CPU_FLAG_SSE41 = 1 << 2,
DAV1D_X86_CPU_FLAG_AVX2 = 1 << 3,
DAV1D_X86_CPU_FLAG_AVX512ICL = 1 << 4, /* F/CD/BW/DQ/VL/VNNI/IFMA/VBMI/VBMI2/
* VPOPCNTDQ/BITALG/GFNI/VAES/VPCLMULQDQ */
DAV1D_X86_CPU_FLAG_SLOW_GATHER = 1 << 5, /* Flag CPUs where gather instructions are slow enough
* to cause performance regressions. */
};
unsigned dav1d_get_cpu_flags_x86(void);

third_party/dav1d/src/x86/cpuid.asm (4 changed lines, vendored)

@ -38,8 +38,8 @@ cglobal cpu_cpuid, 0, 5, 0, regs, leaf, subleaf
cpuid
mov [r4+4*0], eax
mov [r4+4*1], ebx
mov [r4+4*2], ecx
mov [r4+4*3], edx
mov [r4+4*2], edx
mov [r4+4*3], ecx
%if ARCH_X86_64
mov rbx, r5
%endif
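The register reorder in src/x86/cpu.c and the matching store swap in cpuid.asm serve one purpose: CPUID leaf 0 returns the 12-byte vendor string in EBX, EDX, ECX, in that order, so keeping edx ahead of ecx lets the union overlay char vendor[12] directly behind max_leaf and identify AMD parts with a single memcmp(). A standalone sketch using the leaf-0 values an AMD CPU reports (illustrative constants):

#include <stdint.h>
#include <stdio.h>

typedef struct {
    uint32_t eax, ebx, edx, ecx; /* edx before ecx: vendor-string order */
} CpuidRegisters;

int main(void) {
    union {
        CpuidRegisters r;
        struct { uint32_t max_leaf; char vendor[12]; };
    } cpu;
    /* eax = highest leaf; ebx/edx/ecx spell "Auth" "enti" "cAMD". */
    cpu.r = (CpuidRegisters){ 0x10, 0x68747541, 0x69746e65, 0x444d4163 };
    printf("max leaf %u, vendor %.12s\n", cpu.max_leaf, cpu.vendor);
    return 0; /* prints: max leaf 16, vendor AuthenticAMD */
}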

third_party/dav1d/src/x86/film_grain16_avx2.asm (1623 changed lines, vendored; diff not shown because of its size)

third_party/dav1d/src/x86/film_grain_avx2.asm (1297 changed lines, vendored; diff not shown because of its size)


@ -65,10 +65,13 @@ COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c
c->generate_grain_y = BF(dav1d_generate_grain_y, avx2);
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, avx2);
c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx2);
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx2);
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, avx2);
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, avx2);
if (flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER) return;
c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx2);
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx2);
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx2);
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx2);
#endif

third_party/dav1d/src/x86/itx16_avx2.asm (489 changed lines, vendored)

@ -313,7 +313,7 @@ ALIGN function_align
%endmacro
%macro INV_TXFM_4X4_12BPC_FN 2 ; type1, type2
INV_TXFM_FN %1, %2, 0, 4x4, 12
INV_TXFM_FN %1, %2, 0, 4x4, 12
%ifidn %1_%2, dct_dct
imul r6d, [cq], 181
mov [cq], eobd ; 0
@ -340,21 +340,20 @@ ALIGN function_align
%macro IDCT4_1D_PACKED_WORD 6 ; dst/src[1-2], tmp[1-3], rnd
vpbroadcastd m%5, [pw_m3784_1567]
punpckhwd m%3, m%2, m%1
psubw m%4, m%1, m%2
paddw m%1, m%2
vpbroadcastd m%2, [pw_1567_3784]
punpcklqdq m%1, m%4
vpbroadcastd m%4, [pw_2896x8]
vpbroadcastd m%4, [pw_1567_3784]
punpcklwd m%2, m%1
vpbroadcastd m%1, [pw_m2896_2896]
pmaddwd m%5, m%3
pmaddwd m%3, m%2
pmulhrsw m%1, m%4 ; t0 t1
paddd m%5, m%6
paddd m%3, m%6
psrad m%5, 12
psrad m%3, 12
pmaddwd m%3, m%4
vpbroadcastd m%4, [pw_2896_2896]
pmaddwd m%1, m%2
pmaddwd m%2, m%4
REPX {paddd x, m%6}, m%5, m%3, m%1, m%2
REPX {psrad x, 12 }, m%5, m%3, m%1, m%2
packssdw m%3, m%5 ; t3 t2
psubsw m%2, m%1, m%3 ; out3 out2
paddsw m%1, m%3 ; out0 out1
packssdw m%2, m%1 ; t0 t1
paddsw m%1, m%2, m%3 ; out0 out1
psubsw m%2, m%3 ; out3 out2
%endmacro
INV_TXFM_4X4_FN dct, dct
@ -2581,6 +2580,33 @@ cglobal iidentity_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
lea dstq, [dstq+strideq*2]
ret
%macro TRANSPOSE_8X8_DWORD 12 ; src/dst[1-8], tmp[1-4]
punpckldq m%9, m%1, m%2 ; aibj emfn
punpckhdq m%1, m%2 ; ckdl gohp
punpckldq m%10, m%3, m%4 ; qyrz uCvD
punpckhdq m%3, m%4 ; sAtB wExF
punpckldq m%11, m%5, m%6 ; GOHP KSLT
punpckhdq m%5, m%6 ; IQJR MUNV
punpckldq m%12, m%7, m%8 ; WeXf aibj
punpckhdq m%7, m%8 ; YgZh ckdl
punpcklqdq m%2, m%9, m%10 ; aiqy emuC
punpckhqdq m%9, m%10 ; bjrz fnvD
punpcklqdq m%4, m%1, m%3 ; cksA gowE
punpckhqdq m%10, m%1, m%3 ; dltB hpxF
punpcklqdq m%6, m%11, m%12 ; GOWe KSai
punpckhqdq m%11, m%12 ; HPXf LTbj
punpcklqdq m%8, m%5, m%7 ; IQYg MUck
punpckhqdq m%12, m%5, m%7 ; JRZh NVdl
vperm2i128 m%1, m%2, m%6, 0x20 ; out0
vperm2i128 m%5, m%2, m%6, 0x31 ; out4
vperm2i128 m%2, m%9, m%11, 0x20 ; out1
vperm2i128 m%6, m%9, m%11, 0x31 ; out5
vperm2i128 m%3, m%4, m%8, 0x20 ; out2
vperm2i128 m%7, m%4, m%8, 0x31 ; out6
vperm2i128 m%4, m%10, m%12, 0x20 ; out3
vperm2i128 m%8, m%10, m%12, 0x31 ; out7
%endmacro
INV_TXFM_8X8_FN dct, dct, 12
INV_TXFM_8X8_FN dct, identity, 12
INV_TXFM_8X8_FN dct, adst, 12
@ -2608,30 +2634,7 @@ ALIGN function_align
ret
ALIGN function_align
.transpose_8x8:
punpckldq m8, m0, m1 ; aibj emfn
punpckhdq m0, m0, m1 ; ckdl gohp
punpckldq m9, m2, m3 ; qyrz uCvD
punpckhdq m2, m2, m3 ; sAtB wExF
punpckldq m10, m4, m5 ; GOHP KSLT
punpckhdq m4, m4, m5 ; IQJR MUNV
punpckldq m11, m6, m7 ; WeXf aibj
punpckhdq m6, m6, m7 ; YgZh ckdl
punpcklqdq m1, m8, m9 ; aiqy emuC
punpckhqdq m8, m8, m9 ; bjrz fnvD
punpcklqdq m3, m0, m2 ; cksA gowE
punpckhqdq m9, m0, m2 ; dltB hpxF
punpcklqdq m5, m10, m11 ; GOWe KSai
punpckhqdq m10, m10, m11 ; HPXf LTbj
punpcklqdq m7, m4, m6 ; IQYg MUck
punpckhqdq m11, m4, m6 ; JRZh NVdl
vperm2i128 m0, m1, m5, 0x20 ; out0
vperm2i128 m4, m1, m5, 0x31 ; out4
vperm2i128 m1, m8, m10, 0x20 ; out1
vperm2i128 m5, m8, m10, 0x31 ; out5
vperm2i128 m2, m3, m7, 0x20 ; out2
vperm2i128 m6, m3, m7, 0x31 ; out6
vperm2i128 m3, m9, m11, 0x20 ; out3
vperm2i128 m7, m9, m11, 0x31 ; out7
TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
ret
ALIGN function_align
.round_shift4:
@ -3336,6 +3339,21 @@ INV_TXFM_8X16_FN identity, identity, 0, 12
cglobal iidentity_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
jmp m(iidentity_8x16_internal_10bpc).pass1
.pass2:
call .pass2_main
packssdw m0, m8
packssdw m1, m9
packssdw m2, m10
packssdw m3, m11
packssdw m4, m12
packssdw m5, m13
packssdw m6, m14
packssdw m13, m7, m15
vpbroadcastd m7, [pixel_12bpc_max]
vpbroadcastd m12, [pw_16384]
call m(iidentity_8x16_internal_10bpc).pass2_end
RET
ALIGN function_align
.pass2_main:
mova [cq], m7
vpbroadcastd m7, [clip_18b_min]
REPX {pmaxsd x, m7}, m0, m1, m2, m3, m4, m5, m6, \
@ -3358,18 +3376,7 @@ cglobal iidentity_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
paddd m15, [cq]
REPX {psrad x, 15}, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
packssdw m0, m8
packssdw m1, m9
packssdw m2, m10
packssdw m3, m11
packssdw m4, m12
packssdw m5, m13
packssdw m6, m14
packssdw m13, m7, m15
vpbroadcastd m7, [pixel_12bpc_max]
vpbroadcastd m12, [pw_16384]
call m(iidentity_8x16_internal_10bpc).pass2_end
RET
ret
%macro INV_TXFM_16X4_FN 2-3 10 ; type1, type2, bitdepth
INV_TXFM_FN %1, %2, 0, 16x4, %3
@ -4481,15 +4488,15 @@ cglobal iidentity_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
call m(idct_16x8_internal_10bpc).write_16x4_zero
jmp m(idct_16x8_internal_10bpc).end2
%macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset
INV_TXFM_FN %1, %2, %3, 16x16
%macro INV_TXFM_16X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth
INV_TXFM_FN %1, %2, %3, 16x16, %4
%ifidn %1_%2, dct_dct
imul r6d, [cq], 2896
mov [cq], eobd ; 0
mov r3d, 16
add r6d, 10240
sar r6d, 14
jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2
jmp m(inv_txfm_add_dct_dct_16x4_%4bpc).dconly2
%endif
%endmacro
@ -4499,9 +4506,10 @@ INV_TXFM_16X16_FN dct, adst
INV_TXFM_16X16_FN dct, flipadst
cglobal idct_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
vpbroadcastd m11, [pd_2048]
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
.pass1:
vpbroadcastd m11, [pd_2048]
vpbroadcastd m14, [pd_2896]
lea r6, [rsp+32*4]
sub eobd, 36
@ -4605,6 +4613,7 @@ ALIGN function_align
pmulhrsw m2, m12
pmulhrsw m3, m12
call m(idct_16x8_internal_10bpc).write_16x4_start
.write_16x16_2:
pmulhrsw m0, m12, m4
pmulhrsw m1, m12, m5
pmulhrsw m2, m12, m6
@ -4747,6 +4756,7 @@ INV_TXFM_16X16_FN adst, flipadst
cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
vpbroadcastd m13, [clip_18b_min]
vpbroadcastd m14, [clip_18b_max]
.pass1:
vpbroadcastd m15, [pd_2896]
lea r6, [rsp+32*4]
sub eobd, 36
@ -4882,6 +4892,7 @@ INV_TXFM_16X16_FN flipadst, flipadst
cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
vpbroadcastd m13, [clip_18b_min]
vpbroadcastd m14, [clip_18b_max]
.pass1:
vpbroadcastd m15, [pd_2896]
lea r6, [rsp+32*4]
sub eobd, 36
@ -4993,6 +5004,7 @@ INV_TXFM_16X16_FN identity, dct, -92
INV_TXFM_16X16_FN identity, identity
cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
.pass1:
vpbroadcastd m15, [pd_11586]
vpbroadcastd m7, [pd_10240]
lea r6, [rsp+32*4]
@ -5056,6 +5068,375 @@ cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx
mova m1, [cq+32*1]
jmp m(idct_16x16_internal_10bpc).end
INV_TXFM_16X16_FN dct, dct, 0, 12
INV_TXFM_16X16_FN dct, identity, 28, 12
INV_TXFM_16X16_FN dct, adst, 0, 12
INV_TXFM_16X16_FN dct, flipadst, 0, 12
cglobal idct_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
vpbroadcastd m12, [clip_20b_min]
vpbroadcastd m13, [clip_20b_max]
jmp m(idct_16x16_internal_10bpc).pass1
.pass2:
mova [cq+32* 8], m8
mova [cq+32* 9], m9
mova [cq+32*10], m10
mova [cq+32*11], m11
mova [cq+32*12], m12
mova [cq+32*13], m13
mova [cq+32*14], m14
mova [cq+32*15], m15
call .pass2_main
packssdw m0, m1
packssdw m1, m2, m3
packssdw m2, m4, m5
packssdw m3, m6, m7
packssdw m4, m8, m9
packssdw m5, m10, m11
packssdw m6, m12, m13
packssdw m7, m14, m15
mova [r6-32*4], m0
mova [r6-32*3], m1
mova [r6-32*2], m2
mova [r6-32*1], m3
mova [r6+32*0], m4
mova [r6+32*1], m5
mova [r6+32*2], m6
mova [r6+32*3], m7
mova m0, [cq+32* 8]
mova m1, [cq+32* 9]
mova m2, [cq+32*10]
mova m3, [cq+32*11]
mova m4, [cq+32*12]
mova m5, [cq+32*13]
mova m6, [cq+32*14]
mova m7, [cq+32*15]
mov r5, r6
add r6, 32*16
call .pass2_main
jmp m(iadst_16x16_internal_12bpc).end
ALIGN function_align
.write_16x16:
mova [rsp+gprsize+32*0], m8
mova [rsp+gprsize+32*1], m9
mova [rsp+gprsize+32*2], m12
vpbroadcastd m12, [pw_16384]
pmulhrsw m0, m12
pmulhrsw m1, m12
pmulhrsw m2, m12
pmulhrsw m3, m12
call m(idct_16x8_internal_12bpc).write_16x4_start
call m(idct_16x8_internal_10bpc).write_16x4_zero
jmp m(idct_16x16_internal_10bpc).write_16x16_2
ALIGN function_align
.pass2_main:
call m(idct_8x8_internal_12bpc).transpose_8x8
mova [cq+32* 0], m0
mova [cq+32* 1], m2
mova [cq+32* 2], m4
mova [cq+32* 3], m6
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
pmaxsd m0, m12, m1
pmaxsd m1, m12, m3
pmaxsd m2, m12, m5
pmaxsd m3, m12, m7
REPX {pminsd x, m13}, m0, m1, m2, m3
test eobd, eobd
jge .pass2_slow
pxor m4, m4
REPX {mova x, m4}, m5, m6, m7
jmp .pass2_fast
.pass2_slow:
sub r6, 32*8
mova m8, [r6-32*4]
mova m4, [r6-32*3]
mova m10, [r6-32*2]
mova m5, [r6-32*1]
mova m12, [r6+32*0]
mova m6, [r6+32*1]
mova m14, [r6+32*2]
mova m7, [r6+32*3]
TRANSPOSE_8X8_DWORD 8, 4, 10, 5, 12, 6, 14, 7, 9, 11, 13, 15
mova [cq+32* 4], m8
mova [cq+32* 5], m10
mova [cq+32* 6], m12
mova [cq+32* 7], m14
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
REPX {pmaxsd x, m12}, m4, m5, m6, m7
REPX {pminsd x, m13}, m4, m5, m6, m7
.pass2_fast:
vpbroadcastd m11, [pd_2048]
vpbroadcastd m14, [pd_2896]
call m(idct_8x16_internal_10bpc).main_oddhalf
pmaxsd m0, m12, [cq+32* 0]
pmaxsd m1, m12, [cq+32* 1]
pmaxsd m2, m12, [cq+32* 2]
pmaxsd m3, m12, [cq+32* 3]
REPX {pminsd x, m13}, m0, m1, m2, m3
test eobd, eobd
jge .pass2_slow2
pxor m4, m4
REPX {mova x, m4}, m5, m6, m7
jmp .pass2_fast2
.pass2_slow2:
pmaxsd m4, m12, [cq+32* 4]
pmaxsd m5, m12, [cq+32* 5]
pmaxsd m6, m12, [cq+32* 6]
pmaxsd m7, m12, [cq+32* 7]
REPX {pminsd x, m13}, m4, m5, m6, m7
.pass2_fast2:
call m(idct_8x8_internal_10bpc).main
call m(idct_8x16_internal_10bpc).main_evenhalf
psrad m11, 8 ; pd_8
REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
call m(idct_16x8_internal_10bpc).pass1_rotations
REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
ret
INV_TXFM_16X16_FN adst, dct, 0, 12
INV_TXFM_16X16_FN adst, adst, 0, 12
INV_TXFM_16X16_FN adst, flipadst, 0, 12
cglobal iadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
vpbroadcastd m13, [clip_20b_min]
vpbroadcastd m14, [clip_20b_max]
jmp m(iadst_16x16_internal_10bpc).pass1
.pass2:
call .pass2_part1
call m(iadst_16x8_internal_10bpc).pass1_rotations
call .pass2_part2
call m(iadst_16x8_internal_10bpc).pass1_rotations
.pass2_part3:
REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
REPX {psrad x, 16}, m4, m5, m6, m7, m8, m9, m10, m11
.end:
packssdw m15, m14
packssdw m14, m13, m12
packssdw m13, m11, m10
packssdw m12, m9, m8
packssdw m11, m7, m6
packssdw m10, m5, m4
packssdw m7, m3, m2
packssdw m6, m1, m0
vpblendd m0, m6, [r5-32*4], 0x33
vpblendd m1, m6, [r5-32*4], 0xcc
vpblendd m2, m7, [r5-32*3], 0x33
vpblendd m3, m7, [r5-32*3], 0xcc
vpermq m0, m0, q3120
vpermq m1, m1, q2031
vpermq m2, m2, q3120
vpermq m3, m3, q2031
call m(idct_16x8_internal_12bpc).write_16x4_start
call m(idct_16x8_internal_10bpc).write_16x4_zero
vpblendd m0, m10, [r5-32*2], 0x33
vpblendd m1, m10, [r5-32*2], 0xcc
vpblendd m2, m11, [r5-32*1], 0x33
vpblendd m3, m11, [r5-32*1], 0xcc
vpermq m0, m0, q3120
vpermq m1, m1, q2031
vpermq m2, m2, q3120
vpermq m3, m3, q2031
call m(idct_16x8_internal_10bpc).write_16x4_zero
vpblendd m0, m12, [r5+32*0], 0x33
vpblendd m1, m12, [r5+32*0], 0xcc
vpblendd m2, m13, [r5+32*1], 0x33
vpblendd m3, m13, [r5+32*1], 0xcc
vpermq m0, m0, q3120
vpermq m1, m1, q2031
vpermq m2, m2, q3120
vpermq m3, m3, q2031
call m(idct_16x8_internal_10bpc).write_16x4_zero
vpblendd m0, m14, [r5+32*2], 0x33
vpblendd m1, m14, [r5+32*2], 0xcc
vpblendd m2, m15, [r5+32*3], 0x33
vpblendd m3, m15, [r5+32*3], 0xcc
vpermq m0, m0, q3120
vpermq m1, m1, q2031
vpermq m2, m2, q3120
vpermq m3, m3, q2031
call m(idct_16x8_internal_10bpc).write_16x4_zero
RET
ALIGN function_align
.pass2_part1:
mova [cq+32* 8], m8
mova [cq+32* 9], m9
mova [cq+32*10], m10
mova [cq+32*11], m11
mova [cq+32*12], m12
mova [cq+32*13], m13
mova [cq+32*14], m14
mova [cq+32*15], m15
.pass2_main:
call m(idct_8x8_internal_12bpc).transpose_8x8
mova [cq+32* 0], m0
mova [cq+32* 1], m3
mova [cq+32* 2], m4
mova [cq+32* 3], m7
vpbroadcastd m13, [clip_18b_min]
vpbroadcastd m14, [clip_18b_max]
pmaxsd m0, m13, m2
pmaxsd m2, m13, m6
pmaxsd m5, m13, m5
pmaxsd m7, m13, m1
REPX {pminsd x, m14}, m0, m2, m5, m7
test eobd, eobd
jge .pass2_slow
pxor m1, m1
REPX {mova x, m1}, m3, m4, m6
jmp .pass2_fast
.pass2_slow:
sub r6, 32*8
mova m8, [r6-32*4]
mova m3, [r6-32*3]
mova m4, [r6-32*2]
mova m11, [r6-32*1]
mova m12, [r6+32*0]
mova m1, [r6+32*1]
mova m6, [r6+32*2]
mova m15, [r6+32*3]
TRANSPOSE_8X8_DWORD 8, 3, 4, 11, 12, 1, 6, 15, 13, 9, 10, 14
mova [cq+32* 4], m8
mova [cq+32* 5], m11
mova [cq+32* 6], m12
mova [cq+32* 7], m15
vpbroadcastd m13, [clip_18b_min]
vpbroadcastd m14, [clip_18b_max]
REPX {pmaxsd x, m13}, m1, m3, m4, m6
REPX {pminsd x, m14}, m1, m3, m4, m6
.pass2_fast:
vpbroadcastd m12, [pd_2048]
vpbroadcastd m15, [pd_2896]
call m(iadst_16x8_internal_10bpc).main_part1
pmaxsd m0, m13, [cq+32* 0] ; 0
pmaxsd m7, m13, [cq+32* 1] ; 3
pmaxsd m2, m13, [cq+32* 2] ; 4
pmaxsd m5, m13, [cq+32* 3] ; 7
REPX {pminsd x, m14}, m0, m2, m5, m7
test eobd, eobd
jge .pass2_slow2
pxor m1, m1
REPX {mova x, m1}, m3, m4, m6
jmp .pass2_fast2
.pass2_slow2:
pmaxsd m4, m13, [cq+32* 4] ; 8
pmaxsd m3, m13, [cq+32* 5] ; 11
pmaxsd m6, m13, [cq+32* 6] ; 12
pmaxsd m1, m13, [cq+32* 7] ; 15
REPX {pminsd x, m14}, m1, m3, m4, m6
.pass2_fast2:
call m(iadst_16x8_internal_10bpc).main_part2
vpbroadcastd m14, [pd_34816]
psrld m15, 11 ; pd_1
psubd m13, m14, m15 ; pd_34815
pslld m15, 3 ; pd_8
ret
ALIGN function_align
.pass2_part2:
REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
REPX {psrad x, 16}, m4, m5, m6, m7, m8, m9, m10, m11
packssdw m0, m1
packssdw m1, m2, m3
packssdw m2, m4, m5
packssdw m3, m6, m7
packssdw m4, m8, m9
packssdw m5, m10, m11
packssdw m6, m12, m13
packssdw m7, m14, m15
mova [r6-32*4], m0
mova [r6-32*3], m1
mova [r6-32*2], m2
mova [r6-32*1], m3
mova [r6+32*0], m4
mova [r6+32*1], m5
mova [r6+32*2], m6
mova [r6+32*3], m7
mova m0, [cq+32* 8]
mova m1, [cq+32* 9]
mova m2, [cq+32*10]
mova m3, [cq+32*11]
mova m4, [cq+32*12]
mova m5, [cq+32*13]
mova m6, [cq+32*14]
mova m7, [cq+32*15]
mov r5, r6
add r6, 32*16
jmp .pass2_main
INV_TXFM_16X16_FN flipadst, dct, 0, 12
INV_TXFM_16X16_FN flipadst, adst, 0, 12
INV_TXFM_16X16_FN flipadst, flipadst, 0, 12
cglobal iflipadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
vpbroadcastd m13, [clip_20b_min]
vpbroadcastd m14, [clip_20b_max]
jmp m(iflipadst_16x16_internal_10bpc).pass1
.pass2:
call m(iadst_16x16_internal_12bpc).pass2_part1
call m(iflipadst_16x8_internal_10bpc).pass1_rotations
call m(iadst_16x16_internal_12bpc).pass2_part2
call m(iflipadst_16x8_internal_10bpc).pass1_rotations
jmp m(iadst_16x16_internal_12bpc).pass2_part3
INV_TXFM_16X16_FN identity, dct, -92, 12
INV_TXFM_16X16_FN identity, identity, 0, 12
cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
jmp m(iidentity_16x16_internal_10bpc).pass1
.pass2:
call m(iidentity_8x16_internal_12bpc).pass2_main
call m(idct_16x16_internal_10bpc).transpose_fast
test eobd, eobd
jl .pass2_fast
mova [cq+32* 8], m0
mova [cq+32* 9], m1
mova [cq+32*10], m2
mova [cq+32*11], m3
mova [cq+32*12], m4
mova [cq+32*13], m5
mova [cq+32*14], m6
mova [cq+32*15], m7
mova m8, [r6-32*4]
mova m9, [r6-32*3]
mova m10, [r6-32*2]
mova m11, [r6-32*1]
mova m12, [r6+32*0]
mova m13, [r6+32*1]
mova m14, [r6+32*2]
mova m15, [r6+32*3]
sub r6, 32*8
mova m0, [r6-32*4]
mova m1, [r6-32*3]
mova m2, [r6-32*2]
mova m3, [r6-32*1]
mova m4, [r6+32*0]
mova m5, [r6+32*1]
mova m6, [r6+32*2]
mova m7, [r6+32*3]
call m(iidentity_8x16_internal_12bpc).pass2_main
call m(idct_16x8_internal_10bpc).transpose2
mova m8, m0
mova m9, m1
mova m10, m2
mova m11, m3
mova m12, m4
mova m13, m5
mova m14, m6
mova m15, m7
mova m0, [cq+32* 8]
mova m1, [cq+32* 9]
mova m2, [cq+32*10]
mova m3, [cq+32*11]
mova m4, [cq+32*12]
mova m5, [cq+32*13]
mova m6, [cq+32*14]
mova m7, [cq+32*15]
.pass2_fast:
call m(idct_16x16_internal_12bpc).write_16x16
RET
%macro IDCT32_END 6 ; in/out1, out2, tmp[1-3], shift
mova m%4, [r6+32*(%1-4)]
mova m%2, [r5+32*(3-%1)]

25 third_party/dav1d/src/x86/itx16_sse.asm (vendored)

@ -101,6 +101,8 @@ pixel_10bpc_max: times 8 dw 0x03ff
pw_1567_3784: times 4 dw 1567, 3784
pw_m3784_1567: times 4 dw -3784, 1567
pw_2896_2896: times 4 dw 2896, 2896
pw_m2896_2896: times 4 dw -2896, 2896
clip_18b_min: times 4 dd -0x20000
clip_18b_max: times 4 dd 0x1ffff
@ -429,22 +431,19 @@ cglobal idct_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
; m0 = in0 in1
; m1 = in2 in3
; m5 = pd_2048
mova m4, [o(pw_m3784_1567)]
punpckhwd m2, m1, m0
psubw m3, m0, m1
paddw m0, m1
punpcklqdq m0, m3
pmaddwd m4, m2
punpcklwd m1, m0
pmaddwd m4, m2, [o(pw_m3784_1567)]
pmaddwd m2, [o(pw_1567_3784)]
pmulhrsw m0, [o(pw_2896x8)] ; t0 t1
paddd m4, m5
paddd m2, m5
psrad m4, 12
psrad m2, 12
packssdw m2, m4 ; t3 t2
psubsw m1, m0, m2 ; tmp3 tmp2
paddsw m0, m2 ; tmp0 tmp1
pmaddwd m0, m1, [o(pw_m2896_2896)]
pmaddwd m1, [o(pw_2896_2896)]
REPX {paddd x, m5}, m4, m2, m0, m1
packssdw m5, m5 ; pw_2048
REPX {psrad x, 12}, m4, m2, m0, m1
packssdw m2, m4 ; t3 t2
packssdw m1, m0 ; t0 t1
paddsw m0, m1, m2 ; out0 out1
psubsw m1, m2 ; out3 out2
pmulhrsw m0, m5
pmulhrsw m1, m5
movq m2, [dstq+strideq*0]
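For reference, the reworked sequence above is the plain 1-D inverse DCT-4 butterfly. A scalar sketch with the same fixed-point constants (1567/3784 for the odd pair, 2896 ~ sqrt(2)*2048 for the even pair, bias 2048, shift 12); an illustration of the math, not dav1d's actual itx_1d.c:

#include <stdint.h>

/* Scalar sketch of the inverse DCT-4 computed above; constants match
 * pw_1567_3784, pw_m3784_1567, pw_2896_2896 and pw_m2896_2896. */
static void idct4_1d_sketch(const int32_t in[4], int32_t out[4])
{
    const int32_t t0 = ((in[0] + in[2]) * 2896 + 2048) >> 12; /* even pair: t0 t1 */
    const int32_t t1 = ((in[0] - in[2]) * 2896 + 2048) >> 12;
    const int32_t t2 = (in[1] * 1567 - in[3] * 3784 + 2048) >> 12; /* odd pair: t3 t2 */
    const int32_t t3 = (in[1] * 3784 + in[3] * 1567 + 2048) >> 12;
    out[0] = t0 + t3; /* paddsw: out0 out1 */
    out[1] = t1 + t2;
    out[2] = t1 - t2; /* psubsw: out3 out2 */
    out[3] = t0 - t3;
}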

1 third_party/dav1d/src/x86/itx_init_tmpl.c (vendored)

@ -273,6 +273,7 @@ COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c,
assign_itx16_bpc_fn(R, 8, 16, 12, avx2);
assign_itx16_bpc_fn(R, 16, 4, 12, avx2);
assign_itx16_bpc_fn(R, 16, 8, 12, avx2);
assign_itx12_bpc_fn( , 16, 16, 12, avx2);
#endif
if (bpc > 10) return;

6 third_party/dav1d/src/x86/mc16_avx2.asm (vendored)

@ -3017,11 +3017,11 @@ cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx,
%endif
dec hd
jz .ret
mova xm8, [rsp+0x00]
movd xm9, [rsp+0x30]
add myd, dyd
test myd, ~0x3ff
jz .w4_loop
mova xm8, [rsp+0x00]
movd xm9, [rsp+0x30]
movu xm4, [srcq]
movu xm5, [srcq+r4]
test myd, 0x400
@ -5789,7 +5789,7 @@ cglobal resize_16bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \
vpbroadcastd m5, dxm
vpbroadcastd m8, mx0m
vpbroadcastd m6, src_wm
DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr, _, pxmax
DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax
LEA r7, $$
%define base r7-$$
vpbroadcastd m3, [base+pd_64]

155 third_party/dav1d/src/x86/mc16_avx512.asm (vendored)

@ -131,6 +131,16 @@ warp8x8_permC: db -1, 0, -1, 1, -1, 8, -1, 9, -1, 4, -1, 5, -1, 12, -1, 13
warp8x8_permD: db -1, 2, -1, 3, -1, 10, -1, 11, -1, 6, -1, 7, -1, 14, -1, 15
warp_shift_h: db 11, 19, 11, 19, 43, 51, 43, 51, 13, 21, 13, 21, 45, 53, 45, 53
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
resize_permA: dd 0, 4, 8, 12, 1, 5, 9, 13, 16, 20, 24, 28, 17, 21, 25, 29
resize_permB: dd 2, 6, 10, 14, 3, 7, 11, 15, 18, 22, 26, 30, 19, 23, 27, 31
resize_permC: dq 0, 1, 4, 5, 8, 9, 12, 13
resize_permD: dq 2, 3, 6, 7, 10, 11, 14, 15
resize_permE: dq 0, 2, 4, 6
resize_shufA: db -1, 0, -1, 1, -1, 4, -1, 5, -1, 8, -1, 9, -1, 12, -1, 13
resize_shufB: db -1, 2, -1, 3, -1, 6, -1, 7, -1, 10, -1, 11, -1, 14, -1, 15
rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7
db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
prep_hv_shift: dq 6, 4
put_bilin_h_rnd: dw 8, 8, 10, 10
@ -151,9 +161,12 @@ pw_m512: times 2 dw -512
pw_2: times 2 dw 2
pw_64: times 2 dw 64
pd_32: dd 32
pd_63: dd 63
pd_128: dd 128
pd_640: dd 640
pd_2176: dd 2176
pd_16384: dd 16384
pd_0_4: dd 0, 4
%define pw_16 prep_mul
%define pd_512 warp_8x8_rnd_h
@ -237,6 +250,7 @@ cextern mc_subpel_filters
cextern mc_warp_filter
cextern obmc_masks_avx2
cextern resize_filter
SECTION .text
@ -4708,4 +4722,145 @@ cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, mask
jl .w128
RET
cglobal resize_16bpc, 6, 12, 32, dst, dst_stride, src, src_stride, \
dst_w, h, src_w, dx, mx0, pxmax
sub dword mx0m, 4<<14
sub dword src_wm, 8
mov r6, ~0
vpbroadcastd m5, dxm
vpbroadcastd m8, mx0m
vpbroadcastd m6, src_wm
kmovq k6, r6
DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax
LEA r7, $$
%define base r7-$$
vpbroadcastd m3, [base+pd_16384]
vpbroadcastd m7, [base+pd_63]
mova m24, [base+resize_permA]
mova m25, [base+resize_permB]
mova m26, [base+resize_permC]
mova m27, [base+resize_permD]
vbroadcasti32x4 m28, [base+resize_shufA]
vbroadcasti32x4 m29, [base+resize_shufB]
mova m30, [base+resize_permE]
vpbroadcastw ym31, pxmaxm
vpdpwssd m8, m5, [base+rescale_mul] ; mx+dx*[0-15]
pslld m5, 4 ; dx*16
pslld m6, 14
pxor m2, m2
.loop_y:
xor xd, xd
mova m4, m8 ; per-line working version of mx
.loop_x:
pmaxsd m0, m4, m2
psrad m9, m4, 8 ; filter offset (unmasked)
pminsd m0, m6 ; iclip(mx, 0, src_w-8)
psubd m1, m4, m0 ; pshufb offset
psrad m0, 14 ; clipped src_x offset
psrad m1, 14 ; pshufb edge_emu offset
vptestmd k5, m1, m1
pand m9, m7 ; filter offset (masked)
ktestw k5, k5
jz .load
vpbroadcastq m14, [base+pd_0_4]
vpermq m10, m0, q1100
vpermq m11, m0, q3322
vpermq m20, m1, q1100
vpermq m21, m1, q3322
punpckldq m10, m10
punpckldq m11, m11
punpckldq m20, m20
punpckldq m21, m21
paddd m10, m14
paddd m11, m14
paddd m20, m14
paddd m21, m14
vextracti32x8 ym12, m10, 1
vextracti32x8 ym13, m11, 1
vextracti32x8 ym22, m20, 1
vextracti32x8 ym23, m21, 1
kmovq k1, k6
kmovq k2, k6
kmovq k3, k6
kmovq k4, k6
vpgatherdq m16{k1}, [srcq+ym10*2] ; 0 1 2 3
vpgatherdq m17{k2}, [srcq+ym11*2] ; 4 5 6 7
vpgatherdq m18{k3}, [srcq+ym12*2] ; 8 9 A B
vpgatherdq m19{k4}, [srcq+ym13*2] ; C D E F
kmovq k1, k6
kmovq k2, k6
kmovq k3, k6
kmovq k4, k6
vpgatherdq m0{k1}, [base+resize_shuf+8+ym20*2]
vpgatherdq m1{k2}, [base+resize_shuf+8+ym21*2]
vpgatherdq m14{k3}, [base+resize_shuf+8+ym22*2]
vpgatherdq m15{k4}, [base+resize_shuf+8+ym23*2]
pshufb m16, m0
pshufb m17, m1
pshufb m18, m14
pshufb m19, m15
mova m20, m24
mova m22, m24
mova m21, m25
mova m23, m25
vpermi2d m20, m16, m17 ; 0-3a 0-3b 4-7a 4-7b
vpermi2d m21, m16, m17 ; 0-3c 0-3d 4-7c 4-7d
vpermi2d m22, m18, m19 ; 8-Ba 8-Bb C-Fa C-Fb
vpermi2d m23, m18, m19 ; 8-Bc 8-Bd C-Fc C-Fd
mova m15, m26
mova m17, m26
mova m16, m27
mova m18, m27
vpermi2q m15, m20, m22 ; 0-3a 4-7a 8-Ba C-Fa
vpermi2q m16, m20, m22 ; 0-3b 4-7b 8-Bb C-Fb
vpermi2q m17, m21, m23 ; 0-3c 4-7c 8-Bc C-Fc
vpermi2q m18, m21, m23 ; 0-3d 4-7d 8-Bd C-Fd
kmovq k1, k6
kmovq k2, k6
vpgatherdd m11{k1}, [base+resize_filter+m9*8+0]
vpgatherdd m13{k2}, [base+resize_filter+m9*8+4]
pshufb m10, m11, m28
pshufb m11, m11, m29
pshufb m12, m13, m28
pshufb m13, m13, m29
jmp .filter
.load:
kmovq k1, k6
kmovq k2, k6
kmovq k3, k6
kmovq k4, k6
vpgatherdd m11{k1}, [base+resize_filter+m9*8+0]
vpgatherdd m13{k2}, [base+resize_filter+m9*8+4]
pshufb m10, m11, m28
pshufb m11, m11, m29
pshufb m12, m13, m28
pshufb m13, m13, m29
vpgatherdd m15{k3}, [srcq+m0*2+ 0]
vpgatherdd m16{k4}, [srcq+m0*2+ 4]
kmovq k1, k6
kmovq k2, k6
vpgatherdd m17{k1}, [srcq+m0*2+ 8]
vpgatherdd m18{k2}, [srcq+m0*2+12]
.filter:
mova m14, m2
vpdpwssd m14, m15, m10
vpdpwssd m14, m16, m11
vpdpwssd m14, m17, m12
vpdpwssd m14, m18, m13
psubd m14, m3, m14
psrad m14, 15
packusdw m14, m14
vpermq m14, m30, m14
pminsw ym14, ym31
mova [dstq+xq*2], ym14
paddd m4, m5
add xd, 16
cmp xd, dst_wd
jl .loop_x
add dstq, dst_strideq
add srcq, src_strideq
dec hd
jg .loop_y
RET
%endif ; ARCH_X86_64
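The .loop_x address arithmetic above (shared in structure with the 8bpc resize added later in this commit) walks the source row in 14-bit fixed point: bits 14 and up give the source index, bits 8..13 select one of 64 filter phases, and the index is clamped so the 8-tap window stays inside the row. A hypothetical scalar model follows; the edge-emulation shuffle and the kernel's exact rounding/negation are omitted, and the filter table is an assumed stand-in for resize_filter:

#include <stdint.h>

static int iclip(const int v, const int lo, const int hi)
{
    return v < lo ? lo : v > hi ? hi : v;
}

/* Hypothetical scalar model of one resize output row; `filter` stands in
 * for the 64-phase 8-tap resize_filter table, assumed to sum to 1 << 7. */
static void resize_row_sketch(uint16_t *dst, const uint16_t *src,
                              const int dst_w, const int src_w,
                              const int dx, const int mx0,
                              const int16_t filter[64][8], const int pxmax)
{
    int mx = mx0; /* 14-bit fixed-point source position */
    for (int x = 0; x < dst_w; x++, mx += dx) {
        const int src_x = iclip(mx >> 14, 0, src_w - 8); /* clipped src_x offset */
        const int f = (mx >> 8) & 63;                    /* filter offset (pd_63 mask) */
        int sum = 0;
        for (int k = 0; k < 8; k++)
            sum += src[src_x + k] * filter[f][k];
        dst[x] = (uint16_t)iclip((sum + 64) >> 7, 0, pxmax); /* pminsw ym31 analogue */
    }
}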

4007 third_party/dav1d/src/x86/mc16_sse.asm (vendored)

Diff not shown due to its large size.

4 third_party/dav1d/src/x86/mc_avx2.asm (vendored)

@ -5046,11 +5046,11 @@ cglobal resize_8bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \
vpbroadcastd m8, mx0m
vpbroadcastd m6, src_wm
DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr
DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
LEA r7, $$
%define base r7-$$
vpbroadcastd m3, [base+pw_m256]
vpbroadcastd xm3, [base+pw_m256]
vpbroadcastd m7, [base+pd_63]
vbroadcasti128 m15, [base+pb_8x0_8x8]
pmaddwd m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7]

115 third_party/dav1d/src/x86/mc_avx512.asm (vendored)

@ -193,29 +193,39 @@ bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 1
bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7
resize_permA: dd 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
resize_permB: dd 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
resize_permC: dd 0, 4, 8, 12
pb_02461357: db 0, 2, 4, 6, 1, 3, 5, 7
wm_420_perm64: dq 0xfedcba9876543210
wm_sign: dd 0x40804080, 0xc0c0c0c0, 0x40404040
pb_127: times 4 db 127
pw_m128 times 2 dw -128
pw_1024: times 2 dw 1024
pw_2048: times 2 dw 2048
pw_6903: times 2 dw 6903
pw_8192: times 2 dw 8192
pd_32: dd 32
pd_34: dd 34
pd_512: dd 512
pd_32768: dd 32768
pb_8x0_8x8: times 8 db 0
times 8 db 8
pb_127: times 4 db 127
pw_m128 times 2 dw -128
pw_m256: times 2 dw -256
pw_1024: times 2 dw 1024
pw_2048: times 2 dw 2048
pw_6903: times 2 dw 6903
pw_8192: times 2 dw 8192
pd_32: dd 32
pd_34: dd 34
pd_63: dd 63
pd_512: dd 512
pd_32768: dd 32768
%define pb_m64 (wm_sign+4)
%define pb_64 (wm_sign+8)
%define pd_2 (pd_0to7+8)
cextern mc_subpel_filters
cextern mc_warp_filter
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
cextern mc_warp_filter
cextern resize_filter
%macro BASE_JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - %3)
@ -4450,4 +4460,87 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
jl .w128
RET
cglobal resize_8bpc, 6, 12, 19, dst, dst_stride, src, src_stride, \
dst_w, h, src_w, dx, mx0
sub dword mx0m, 4<<14
sub dword src_wm, 8
mov r6, ~0
vpbroadcastd m5, dxm
vpbroadcastd m8, mx0m
vpbroadcastd m6, src_wm
kmovq k3, r6
DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
LEA r7, $$
%define base r7-$$
vpbroadcastd m3, [base+pw_m256]
vpbroadcastd m7, [base+pd_63]
vbroadcasti32x4 m15, [base+pb_8x0_8x8]
vpdpwssd m8, m5, [base+rescale_mul] ; mx+dx*[0-15]
pslld m5, 4 ; dx*16
pslld m6, 14
pxor m2, m2
mova m16, [base+resize_permA]
mova m17, [base+resize_permB]
mova xm18, [base+resize_permC]
.loop_y:
xor xd, xd
mova m4, m8 ; per-line working version of mx
.loop_x:
pmaxsd m0, m4, m2
psrad m9, m4, 8 ; filter offset (unmasked)
pminsd m0, m6 ; iclip(mx, 0, src_w-8)
psubd m1, m4, m0 ; pshufb offset
psrad m0, 14 ; clipped src_x offset
psrad m1, 14 ; pshufb edge_emu offset
vptestmd k4, m1, m1
pand m9, m7 ; filter offset (masked)
ktestw k4, k4
jz .load
vextracti32x8 ym12, m0, 1
vextracti32x8 ym13, m1, 1
kmovq k1, k3
kmovq k2, k3
vpgatherdq m10{k1}, [srcq+ym0]
vpgatherdq m11{k2}, [srcq+ym12]
kmovq k1, k3
kmovq k2, k3
vpgatherdq m14{k1}, [base+resize_shuf+4+ym1]
vpgatherdq m0{k2}, [base+resize_shuf+4+ym13]
mova m12, m16
mova m13, m17
paddb m14, m15
paddb m0, m15
pshufb m10, m14
pshufb m11, m0
vpermi2d m12, m10, m11
vpermi2d m13, m10, m11
jmp .filter
.load:
kmovq k1, k3
kmovq k2, k3
vpgatherdd m12{k1}, [srcq+m0+0]
vpgatherdd m13{k2}, [srcq+m0+4]
.filter:
kmovq k1, k3
kmovq k2, k3
vpgatherdd m10{k1}, [base+resize_filter+m9*8+0]
vpgatherdd m11{k2}, [base+resize_filter+m9*8+4]
mova m14, m2
vpdpbusd m14, m12, m10
vpdpbusd m14, m13, m11
packssdw m14, m14
pmulhrsw m14, m3
packuswb m14, m14
vpermd m14, m18, m14
mova [dstq+xq], xm14
paddd m4, m5
add xd, 16
cmp xd, dst_wd
jl .loop_x
add dstq, dst_strideq
add srcq, src_strideq
dec hd
jg .loop_y
RET
%endif ; ARCH_X86_64

3 third_party/dav1d/src/x86/mc_init_tmpl.c (vendored)

@ -152,7 +152,6 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3);
init_mct_fn(FILTER_2D_BILINEAR, bilin, ssse3);
#if BITDEPTH == 8
init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3);
init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3);
init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3);
@ -174,7 +173,6 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, ssse3);
init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, ssse3);
init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3);
#endif
c->avg = BF(dav1d_avg, ssse3);
c->w_avg = BF(dav1d_w_avg, ssse3);
@ -296,5 +294,6 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
c->blend_h = BF(dav1d_blend_h, avx512icl);
c->warp8x8 = BF(dav1d_warp_affine_8x8, avx512icl);
c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx512icl);
c->resize = BF(dav1d_resize, avx512icl);
#endif
}

23 third_party/dav1d/src/x86/mc_sse.asm (vendored)

@ -5170,9 +5170,9 @@ INIT_XMM ssse3
mova [esp+0x40], m2
mova [esp+0x50], m3
MC_8TAP_SCALED_H 0x20, 0x140, 0 ; 0-1
MC_8TAP_SCALED_H 0x20, 0x160 ; 2-3
MC_8TAP_SCALED_H 0x20, 0x180 ; 4-5
MC_8TAP_SCALED_H 0x20, 0x1a0 ; 6-7
MC_8TAP_SCALED_H 0x20, 0x160 ; 2-3
MC_8TAP_SCALED_H 0x20, 0x180 ; 4-5
MC_8TAP_SCALED_H 0x20, 0x1a0 ; 6-7
mova m5, [esp+0x180]
mova m6, [esp+0x190]
mova m7, [esp+0x1a0]
@ -5201,9 +5201,6 @@ INIT_XMM ssse3
.vloop:
mov r0, r0m
mov r5, [esp+0x1f4]
%ifidn %1, put
mov dsd, dsm
%endif
and myd, 0x3ff
mov mym, myd
xor r3, r3
@ -5244,13 +5241,10 @@ INIT_XMM ssse3
%ifidn %1, put
packuswb m4, m4
movq [dstq], m4
add dstq, dsq
add dstq, dsm
%else
mova [tmpq], m4
add tmpq, tmp_stridem
%endif
%if ARCH_X86_32
mov r0m, r0
%endif
dec hd
jz .hloop_prep
@ -5329,6 +5323,7 @@ INIT_XMM ssse3
mova [rsp+0x70], m7
mova [rsp+0x80], m4
%else
mov r0m, r0
mov myd, mym
mov r3, r3m
add myd, dym
@ -5745,7 +5740,7 @@ INIT_XMM ssse3
movu m1, [srcq+ssq*0]
movu m2, [srcq+ssq*1]
movu m3, [srcq+ssq*2]
lea srcq, [srcq+ss3q ]
add srcq, ss3q
punpcklqdq m6, m6
SWAP m4, m7
pand m7, m11, m8
@ -6723,7 +6718,7 @@ INIT_XMM ssse3
movu m1, [srcq+ssq*0]
movu m2, [srcq+ssq*2]
movu m3, [srcq+ssq*1]
lea srcq, [srcq+ss3q ]
add srcq, ss3q
punpcklqdq m6, m6
SWAP m4, m7
pand m7, m11, m8
@ -6734,7 +6729,7 @@ INIT_XMM ssse3
movu m0, [srcq+ssq*0]
movu m7, [srcq+ssq*1]
movu m6, [srcq+ssq*2]
lea srcq, [srcq+ss3q ]
add srcq, ss3q
pshufb m1, m14
pshufb m2, m14
pshufb m3, m14
@ -9409,7 +9404,7 @@ cglobal resize_8bpc, 0, 6, 8, 3 * 16, dst, dst_stride, src, src_stride, \
pshufd m5, m5, q0000
%if ARCH_X86_64
DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr
DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
LEA r7, $$
%define base r7-$$
%else

7 third_party/dav1d/tests/checkasm/checkasm.h (vendored)

@ -311,11 +311,12 @@ void checkasm_stack_clobber(uint64_t clobber, ...);
#endif
#define ROUND_UP(x,a) (((x)+((a)-1)) & ~((a)-1))
#define PIXEL_RECT(name, w, h) \
ALIGN_STK_64(pixel, name##_buf, ((h)+32)*((w)+64) + 64,); \
ptrdiff_t name##_stride = sizeof(pixel)*((w)+64); \
ALIGN_STK_64(pixel, name##_buf, ((h)+32)*(ROUND_UP(w,64)+64) + 64,); \
ptrdiff_t name##_stride = sizeof(pixel)*(ROUND_UP(w,64)+64); \
(void)name##_stride; \
pixel *name = name##_buf + ((w)+64)*16 + 64
pixel *name = name##_buf + (ROUND_UP(w,64)+64)*16 + 64
#define CLEAR_PIXEL_RECT(name) \
memset(name##_buf, 0x99, sizeof(name##_buf)) \

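With the widened stride, PIXEL_RECT now pads each row to ROUND_UP(w,64)+64 pixels and leaves 16 guard rows above the returned pointer, so out-of-bounds stores land in the 0x99 canary that CLEAR_PIXEL_RECT paints. A sketch of the test-body pattern the updated checkasm tests below follow (pal_pred-style arguments shown; pixel, PXSTRIDE(), rnd(), w, h and bitdepth_max are assumed from the enclosing test):

PIXEL_RECT(c_dst, 64, 64);  /* declares c_dst and its padded c_dst_stride */
PIXEL_RECT(a_dst, 64, 64);
CLEAR_PIXEL_RECT(c_dst);    /* fill buffer, guard band included, with 0x99 */
CLEAR_PIXEL_RECT(a_dst);
for (int y = 0; y < h; y++)
    for (int x = 0; x < w; x++)
        c_dst[y * PXSTRIDE(c_dst_stride) + x] =
        a_dst[y * PXSTRIDE(a_dst_stride) + x] = rnd() & bitdepth_max;
call_ref(c_dst, c_dst_stride, pal, idx, w, h);
call_new(a_dst, a_dst_stride, pal, idx, w, h);
checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride,
                            w, h, "dst");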
40 third_party/dav1d/tests/checkasm/ipred.c (vendored)

@ -192,8 +192,8 @@ static void check_cfl_ac(Dav1dIntraPredDSPContext *const c) {
}
static void check_cfl_pred(Dav1dIntraPredDSPContext *const c) {
ALIGN_STK_64(pixel, c_dst, 32 * 32,);
ALIGN_STK_64(pixel, a_dst, 32 * 32,);
PIXEL_RECT(c_dst, 32, 32);
PIXEL_RECT(a_dst, 32, 32);
ALIGN_STK_64(int16_t, ac, 32 * 32,);
ALIGN_STK_64(pixel, topleft_buf, 257,);
pixel *const topleft = topleft_buf + 128;
@ -215,8 +215,6 @@ static void check_cfl_pred(Dav1dIntraPredDSPContext *const c) {
const int bitdepth_max = 0xff;
#endif
const ptrdiff_t stride = w * sizeof(pixel);
int alpha = ((rnd() & 15) + 1) * (1 - (rnd() & 2));
for (int i = -h * 2; i <= w * 2; i++)
@ -229,14 +227,17 @@ static void check_cfl_pred(Dav1dIntraPredDSPContext *const c) {
for (int i = 0; i < w * h; i++)
ac[i] -= luma_avg;
call_ref(c_dst, stride, topleft, w, h, ac, alpha
HIGHBD_TAIL_SUFFIX);
call_new(a_dst, stride, topleft, w, h, ac, alpha
HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel(c_dst, stride, a_dst, stride,
w, h, "dst");
CLEAR_PIXEL_RECT(c_dst);
CLEAR_PIXEL_RECT(a_dst);
bench_new(a_dst, stride, topleft, w, h, ac, alpha
call_ref(c_dst, c_dst_stride, topleft, w, h, ac, alpha
HIGHBD_TAIL_SUFFIX);
call_new(a_dst, a_dst_stride, topleft, w, h, ac, alpha
HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride,
w, h, "dst");
bench_new(a_dst, a_dst_stride, topleft, w, h, ac, alpha
HIGHBD_TAIL_SUFFIX);
}
}
@ -244,8 +245,8 @@ static void check_cfl_pred(Dav1dIntraPredDSPContext *const c) {
}
static void check_pal_pred(Dav1dIntraPredDSPContext *const c) {
ALIGN_STK_64(pixel, c_dst, 64 * 64,);
ALIGN_STK_64(pixel, a_dst, 64 * 64,);
PIXEL_RECT(c_dst, 64, 64);
PIXEL_RECT(a_dst, 64, 64);
ALIGN_STK_64(uint8_t, idx, 64 * 64,);
ALIGN_STK_16(uint16_t, pal, 8,);
@ -261,7 +262,6 @@ static void check_pal_pred(Dav1dIntraPredDSPContext *const c) {
#else
const int bitdepth_max = 0xff;
#endif
const ptrdiff_t stride = w * sizeof(pixel);
for (int i = 0; i < 8; i++)
pal[i] = rnd() & bitdepth_max;
@ -269,11 +269,15 @@ static void check_pal_pred(Dav1dIntraPredDSPContext *const c) {
for (int i = 0; i < w * h; i++)
idx[i] = rnd() & 7;
call_ref(c_dst, stride, pal, idx, w, h);
call_new(a_dst, stride, pal, idx, w, h);
checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst");
CLEAR_PIXEL_RECT(c_dst);
CLEAR_PIXEL_RECT(a_dst);
bench_new(a_dst, stride, pal, idx, w, h);
call_ref(c_dst, c_dst_stride, pal, idx, w, h);
call_new(a_dst, a_dst_stride, pal, idx, w, h);
checkasm_check_pixel_padded(c_dst, c_dst_stride,
a_dst, a_dst_stride, w, h, "dst");
bench_new(a_dst, a_dst_stride, pal, idx, w, h);
}
report("pal_pred");
}

25 third_party/dav1d/tests/checkasm/itx.c (vendored)

@ -243,8 +243,8 @@ static void check_itxfm_add(Dav1dInvTxfmDSPContext *const c,
const enum RectTxfmSize tx)
{
ALIGN_STK_64(coef, coeff, 2, [32 * 32]);
ALIGN_STK_64(pixel, c_dst, 64 * 64,);
ALIGN_STK_64(pixel, a_dst, 64 * 64,);
PIXEL_RECT(c_dst, 64, 64);
PIXEL_RECT(a_dst, 64, 64);
static const uint8_t subsh_iters[5] = { 2, 2, 3, 5, 5 };
@ -275,21 +275,26 @@ static void check_itxfm_add(Dav1dInvTxfmDSPContext *const c,
const int eob = ftx(coeff[0], tx, txtp, w, h, subsh, bitdepth_max);
memcpy(coeff[1], coeff[0], sizeof(*coeff));
for (int j = 0; j < w * h; j++)
c_dst[j] = a_dst[j] = rnd() & bitdepth_max;
CLEAR_PIXEL_RECT(c_dst);
CLEAR_PIXEL_RECT(a_dst);
call_ref(c_dst, w * sizeof(*c_dst), coeff[0], eob
for (int y = 0; y < h; y++)
for (int x = 0; x < w; x++)
c_dst[y*PXSTRIDE(c_dst_stride) + x] =
a_dst[y*PXSTRIDE(a_dst_stride) + x] = rnd() & bitdepth_max;
call_ref(c_dst, c_dst_stride, coeff[0], eob
HIGHBD_TAIL_SUFFIX);
call_new(a_dst, w * sizeof(*c_dst), coeff[1], eob
call_new(a_dst, a_dst_stride, coeff[1], eob
HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel(c_dst, w * sizeof(*c_dst),
a_dst, w * sizeof(*a_dst),
w, h, "dst");
checkasm_check_pixel_padded(c_dst, c_dst_stride,
a_dst, a_dst_stride,
w, h, "dst");
if (memcmp(coeff[0], coeff[1], sizeof(*coeff)))
fail();
bench_new(a_dst, w * sizeof(*c_dst), coeff[0], eob
bench_new(a_dst, a_dst_stride, coeff[0], eob
HIGHBD_TAIL_SUFFIX);
}
}

232 third_party/dav1d/tests/checkasm/mc.c (vendored)

@ -57,8 +57,8 @@ static int mc_h_next(const int h) {
static void check_mc(Dav1dMCDSPContext *const c) {
ALIGN_STK_64(pixel, src_buf, 135 * 135,);
ALIGN_STK_64(pixel, c_dst, 128 * 128,);
ALIGN_STK_64(pixel, a_dst, 128 * 128,);
PIXEL_RECT(c_dst, 128, 128);
PIXEL_RECT(a_dst, 128, 128);
const pixel *src = src_buf + 135 * 3 + 3;
const ptrdiff_t src_stride = 135 * sizeof(pixel);
@ -68,7 +68,6 @@ static void check_mc(Dav1dMCDSPContext *const c) {
for (int filter = 0; filter < N_2D_FILTERS; filter++)
for (int w = 2; w <= 128; w <<= 1) {
const ptrdiff_t dst_stride = w * sizeof(pixel);
for (int mxy = 0; mxy < 4; mxy++)
if (check_func(c->mc[filter], "mc_%s_w%d_%s_%dbpc",
filter_names[filter], w, mxy_names[mxy], BITDEPTH))
@ -87,18 +86,21 @@ static void check_mc(Dav1dMCDSPContext *const c) {
for (int i = 0; i < 135 * 135; i++)
src_buf[i] = rnd() & bitdepth_max;
call_ref(c_dst, dst_stride, src, src_stride, w, h,
CLEAR_PIXEL_RECT(c_dst);
CLEAR_PIXEL_RECT(a_dst);
call_ref(c_dst, c_dst_stride, src, src_stride, w, h,
mx, my HIGHBD_TAIL_SUFFIX);
call_new(a_dst, dst_stride, src, src_stride, w, h,
call_new(a_dst, a_dst_stride, src, src_stride, w, h,
mx, my HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel(c_dst, dst_stride,
a_dst, dst_stride,
w, h, "dst");
checkasm_check_pixel_padded(c_dst, c_dst_stride,
a_dst, a_dst_stride,
w, h, "dst");
if (filter == FILTER_2D_8TAP_REGULAR ||
filter == FILTER_2D_BILINEAR)
{
bench_new(a_dst, dst_stride, src, src_stride, w, h,
bench_new(a_dst, a_dst_stride, src, src_stride, w, h,
mx, my HIGHBD_TAIL_SUFFIX);
}
}
@ -164,8 +166,8 @@ static void check_mct(Dav1dMCDSPContext *const c) {
static void check_mc_scaled(Dav1dMCDSPContext *const c) {
ALIGN_STK_64(pixel, src_buf, 263 * 263,);
ALIGN_STK_64(pixel, c_dst, 128 * 128,);
ALIGN_STK_64(pixel, a_dst, 128 * 128,);
PIXEL_RECT(c_dst, 128, 128);
PIXEL_RECT(a_dst, 128, 128);
const pixel *src = src_buf + 263 * 3 + 3;
const ptrdiff_t src_stride = 263 * sizeof(pixel);
#if BITDEPTH == 16
@ -180,7 +182,6 @@ static void check_mc_scaled(Dav1dMCDSPContext *const c) {
for (int filter = 0; filter < N_2D_FILTERS; filter++)
for (int w = 2; w <= 128; w <<= 1) {
const ptrdiff_t dst_stride = w * sizeof(pixel);
for (int p = 0; p < 3; ++p) {
if (check_func(c->mc_scaled[filter], "mc_scaled_%s_w%d%s_%dbpc",
filter_names[filter], w, scaled_paths[p], BITDEPTH))
@ -198,16 +199,20 @@ static void check_mc_scaled(Dav1dMCDSPContext *const c) {
for (int k = 0; k < 263 * 263; k++)
src_buf[k] = rnd() & bitdepth_max;
call_ref(c_dst, dst_stride, src, src_stride,
CLEAR_PIXEL_RECT(c_dst);
CLEAR_PIXEL_RECT(a_dst);
call_ref(c_dst, c_dst_stride, src, src_stride,
w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
call_new(a_dst, dst_stride, src, src_stride,
call_new(a_dst, a_dst_stride, src, src_stride,
w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel(c_dst, dst_stride,
a_dst, dst_stride, w, h, "dst");
checkasm_check_pixel_padded(c_dst, c_dst_stride,
a_dst, a_dst_stride,
w, h, "dst");
if (filter == FILTER_2D_8TAP_REGULAR ||
filter == FILTER_2D_BILINEAR)
bench_new(a_dst, dst_stride, src, src_stride,
bench_new(a_dst, a_dst_stride, src, src_stride,
w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
}
}
@ -281,15 +286,14 @@ static void init_tmp(Dav1dMCDSPContext *const c, pixel *const buf,
static void check_avg(Dav1dMCDSPContext *const c) {
ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
ALIGN_STK_64(pixel, c_dst, 135 * 135,);
ALIGN_STK_64(pixel, a_dst, 128 * 128,);
PIXEL_RECT(c_dst, 135, 135);
PIXEL_RECT(a_dst, 128, 128);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
const int16_t *tmp2, int w, int h HIGHBD_DECL_SUFFIX);
for (int w = 4; w <= 128; w <<= 1)
if (check_func(c->avg, "avg_w%d_%dbpc", w, BITDEPTH)) {
ptrdiff_t dst_stride = w * sizeof(pixel);
for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1)
{
#if BITDEPTH == 16
@ -299,12 +303,16 @@ static void check_avg(Dav1dMCDSPContext *const c) {
#endif
init_tmp(c, c_dst, tmp, bitdepth_max);
call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX);
call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
w, h, "dst");
bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX);
CLEAR_PIXEL_RECT(c_dst);
CLEAR_PIXEL_RECT(a_dst);
call_ref(c_dst, c_dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX);
call_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride,
w, h, "dst");
bench_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX);
}
}
report("avg");
@ -312,15 +320,14 @@ static void check_avg(Dav1dMCDSPContext *const c) {
static void check_w_avg(Dav1dMCDSPContext *const c) {
ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
ALIGN_STK_64(pixel, c_dst, 135 * 135,);
ALIGN_STK_64(pixel, a_dst, 128 * 128,);
PIXEL_RECT(c_dst, 135, 135);
PIXEL_RECT(a_dst, 128, 128);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
const int16_t *tmp2, int w, int h, int weight HIGHBD_DECL_SUFFIX);
for (int w = 4; w <= 128; w <<= 1)
if (check_func(c->w_avg, "w_avg_w%d_%dbpc", w, BITDEPTH)) {
ptrdiff_t dst_stride = w * sizeof(pixel);
for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1)
{
int weight = rnd() % 15 + 1;
@ -331,12 +338,15 @@ static void check_w_avg(Dav1dMCDSPContext *const c) {
#endif
init_tmp(c, c_dst, tmp, bitdepth_max);
call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX);
call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
w, h, "dst");
CLEAR_PIXEL_RECT(c_dst);
CLEAR_PIXEL_RECT(a_dst);
bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX);
call_ref(c_dst, c_dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX);
call_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride,
w, h, "dst");
bench_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX);
}
}
report("w_avg");
@ -344,8 +354,8 @@ static void check_w_avg(Dav1dMCDSPContext *const c) {
static void check_mask(Dav1dMCDSPContext *const c) {
ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
ALIGN_STK_64(pixel, c_dst, 135 * 135,);
ALIGN_STK_64(pixel, a_dst, 128 * 128,);
PIXEL_RECT(c_dst, 135, 135);
PIXEL_RECT(a_dst, 128, 128);
ALIGN_STK_64(uint8_t, mask, 128 * 128,);
for (int i = 0; i < 128 * 128; i++)
@ -357,7 +367,6 @@ static void check_mask(Dav1dMCDSPContext *const c) {
for (int w = 4; w <= 128; w <<= 1)
if (check_func(c->mask, "mask_w%d_%dbpc", w, BITDEPTH)) {
ptrdiff_t dst_stride = w * sizeof(pixel);
for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1)
{
#if BITDEPTH == 16
@ -366,12 +375,16 @@ static void check_mask(Dav1dMCDSPContext *const c) {
const int bitdepth_max = 0xff;
#endif
init_tmp(c, c_dst, tmp, bitdepth_max);
call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX);
call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
w, h, "dst");
bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX);
CLEAR_PIXEL_RECT(c_dst);
CLEAR_PIXEL_RECT(a_dst);
call_ref(c_dst, c_dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX);
call_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride,
w, h, "dst");
bench_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX);
}
}
report("mask");
@ -379,8 +392,8 @@ static void check_mask(Dav1dMCDSPContext *const c) {
static void check_w_mask(Dav1dMCDSPContext *const c) {
ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
ALIGN_STK_64(pixel, c_dst, 135 * 135,);
ALIGN_STK_64(pixel, a_dst, 128 * 128,);
PIXEL_RECT(c_dst, 135, 135);
PIXEL_RECT(a_dst, 128, 128);
ALIGN_STK_64(uint8_t, c_mask, 128 * 128,);
ALIGN_STK_64(uint8_t, a_mask, 128 * 128,);
@ -397,7 +410,6 @@ static void check_w_mask(Dav1dMCDSPContext *const c) {
if (check_func(c->w_mask[i], "w_mask_%d_w%d_%dbpc", ss[i], w,
BITDEPTH))
{
ptrdiff_t dst_stride = w * sizeof(pixel);
for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1)
{
int sign = rnd() & 1;
@ -408,19 +420,22 @@ static void check_w_mask(Dav1dMCDSPContext *const c) {
#endif
init_tmp(c, c_dst, tmp, bitdepth_max);
call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h,
CLEAR_PIXEL_RECT(c_dst);
CLEAR_PIXEL_RECT(a_dst);
call_ref(c_dst, c_dst_stride, tmp[0], tmp[1], w, h,
c_mask, sign HIGHBD_TAIL_SUFFIX);
call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h,
call_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h,
a_mask, sign HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel(c_dst, dst_stride,
a_dst, dst_stride,
w, h, "dst");
checkasm_check_pixel_padded(c_dst, c_dst_stride,
a_dst, a_dst_stride,
w, h, "dst");
checkasm_check(uint8_t, c_mask, w >> ss_hor[i],
a_mask, w >> ss_hor[i],
w >> ss_hor[i], h >> ss_ver[i],
"mask");
bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h,
bench_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h,
a_mask, sign HIGHBD_TAIL_SUFFIX);
}
}
@ -429,15 +444,14 @@ static void check_w_mask(Dav1dMCDSPContext *const c) {
static void check_blend(Dav1dMCDSPContext *const c) {
ALIGN_STK_64(pixel, tmp, 32 * 32,);
ALIGN_STK_64(pixel, c_dst, 32 * 32,);
ALIGN_STK_64(pixel, a_dst, 32 * 32,);
PIXEL_RECT(c_dst, 32, 32);
PIXEL_RECT(a_dst, 32, 32);
ALIGN_STK_64(uint8_t, mask, 32 * 32,);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
int w, int h, const uint8_t *mask);
for (int w = 4; w <= 32; w <<= 1) {
const ptrdiff_t dst_stride = w * sizeof(pixel);
if (check_func(c->blend, "blend_w%d_%dbpc", w, BITDEPTH))
for (int h = imax(w / 2, 4); h <= imin(w * 2, 32); h <<= 1) {
#if BITDEPTH == 16
@ -449,15 +463,21 @@ static void check_blend(Dav1dMCDSPContext *const c) {
tmp[i] = rnd() & bitdepth_max;
mask[i] = rnd() % 65;
}
for (int i = 0; i < w * h; i++)
c_dst[i] = a_dst[i] = rnd() & bitdepth_max;
call_ref(c_dst, dst_stride, tmp, w, h, mask);
call_new(a_dst, dst_stride, tmp, w, h, mask);
checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
w, h, "dst");
CLEAR_PIXEL_RECT(c_dst);
CLEAR_PIXEL_RECT(a_dst);
bench_new(a_dst, dst_stride, tmp, w, h, mask);
for (int y = 0; y < h; y++)
for (int x = 0; x < w; x++)
c_dst[y*PXSTRIDE(c_dst_stride) + x] =
a_dst[y*PXSTRIDE(a_dst_stride) + x] = rnd() & bitdepth_max;
call_ref(c_dst, c_dst_stride, tmp, w, h, mask);
call_new(a_dst, a_dst_stride, tmp, w, h, mask);
checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride,
w, h, "dst");
bench_new(a_dst, a_dst_stride, tmp, w, h, mask);
}
}
report("blend");
@ -465,14 +485,13 @@ static void check_blend(Dav1dMCDSPContext *const c) {
static void check_blend_v(Dav1dMCDSPContext *const c) {
ALIGN_STK_64(pixel, tmp, 32 * 128,);
ALIGN_STK_64(pixel, c_dst, 32 * 128,);
ALIGN_STK_64(pixel, a_dst, 32 * 128,);
PIXEL_RECT(c_dst, 32, 128);
PIXEL_RECT(a_dst, 32, 128);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
int w, int h);
for (int w = 2; w <= 32; w <<= 1) {
const ptrdiff_t dst_stride = w * sizeof(pixel);
if (check_func(c->blend_v, "blend_v_w%d_%dbpc", w, BITDEPTH))
for (int h = 2; h <= (w == 2 ? 64 : 128); h <<= 1) {
#if BITDEPTH == 16
@ -481,17 +500,23 @@ static void check_blend_v(Dav1dMCDSPContext *const c) {
const int bitdepth_max = 0xff;
#endif
for (int i = 0; i < w * h; i++)
c_dst[i] = a_dst[i] = rnd() & bitdepth_max;
CLEAR_PIXEL_RECT(c_dst);
CLEAR_PIXEL_RECT(a_dst);
for (int y = 0; y < h; y++)
for (int x = 0; x < w; x++)
c_dst[y*PXSTRIDE(c_dst_stride) + x] =
a_dst[y*PXSTRIDE(a_dst_stride) + x] = rnd() & bitdepth_max;
for (int i = 0; i < 32 * 128; i++)
tmp[i] = rnd() & bitdepth_max;
call_ref(c_dst, dst_stride, tmp, w, h);
call_new(a_dst, dst_stride, tmp, w, h);
checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
w, h, "dst");
call_ref(c_dst, c_dst_stride, tmp, w, h);
call_new(a_dst, a_dst_stride, tmp, w, h);
checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride,
w, h, "dst");
bench_new(a_dst, dst_stride, tmp, w, h);
bench_new(a_dst, a_dst_stride, tmp, w, h);
}
}
report("blend_v");
@ -499,14 +524,13 @@ static void check_blend_v(Dav1dMCDSPContext *const c) {
static void check_blend_h(Dav1dMCDSPContext *const c) {
ALIGN_STK_64(pixel, tmp, 128 * 32,);
ALIGN_STK_64(pixel, c_dst, 128 * 32,);
ALIGN_STK_64(pixel, a_dst, 128 * 32,);
PIXEL_RECT(c_dst, 128, 32);
PIXEL_RECT(a_dst, 128, 32);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
int w, int h);
for (int w = 2; w <= 128; w <<= 1) {
const ptrdiff_t dst_stride = w * sizeof(pixel);
if (check_func(c->blend_h, "blend_h_w%d_%dbpc", w, BITDEPTH))
for (int h = (w == 128 ? 4 : 2); h <= 32; h <<= 1) {
#if BITDEPTH == 16
@ -514,17 +538,23 @@ static void check_blend_h(Dav1dMCDSPContext *const c) {
#else
const int bitdepth_max = 0xff;
#endif
for (int i = 0; i < w * h; i++)
c_dst[i] = a_dst[i] = rnd() & bitdepth_max;
CLEAR_PIXEL_RECT(c_dst);
CLEAR_PIXEL_RECT(a_dst);
for (int y = 0; y < h; y++)
for (int x = 0; x < w; x++)
c_dst[y*PXSTRIDE(c_dst_stride) + x] =
a_dst[y*PXSTRIDE(a_dst_stride) + x] = rnd() & bitdepth_max;
for (int i = 0; i < 128 * 32; i++)
tmp[i] = rnd() & bitdepth_max;
call_ref(c_dst, dst_stride, tmp, w, h);
call_new(a_dst, dst_stride, tmp, w, h);
checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
w, h, "dst");
call_ref(c_dst, c_dst_stride, tmp, w, h);
call_new(a_dst, a_dst_stride, tmp, w, h);
checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride,
w, h, "dst");
bench_new(a_dst, dst_stride, tmp, w, h);
bench_new(a_dst, a_dst_stride, tmp, w, h);
}
}
report("blend_h");
@ -532,11 +562,10 @@ static void check_blend_h(Dav1dMCDSPContext *const c) {
static void check_warp8x8(Dav1dMCDSPContext *const c) {
ALIGN_STK_64(pixel, src_buf, 15 * 15,);
ALIGN_STK_64(pixel, c_dst, 8 * 8,);
ALIGN_STK_64(pixel, a_dst, 8 * 8,);
PIXEL_RECT(c_dst, 8, 8);
PIXEL_RECT(a_dst, 8, 8);
int16_t abcd[4];
const pixel *src = src_buf + 15 * 3 + 3;
const ptrdiff_t dst_stride = 8 * sizeof(pixel);
const ptrdiff_t src_stride = 15 * sizeof(pixel);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *src,
@ -558,12 +587,15 @@ static void check_warp8x8(Dav1dMCDSPContext *const c) {
for (int i = 0; i < 15 * 15; i++)
src_buf[i] = rnd() & bitdepth_max;
call_ref(c_dst, dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
call_new(a_dst, dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
8, 8, "dst");
CLEAR_PIXEL_RECT(c_dst);
CLEAR_PIXEL_RECT(a_dst);
bench_new(a_dst, dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
call_ref(c_dst, c_dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
call_new(a_dst, a_dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride,
8, 8, "dst");
bench_new(a_dst, a_dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
}
report("warp8x8");
}
@ -687,13 +719,12 @@ static int get_upscale_x0(const int in_w, const int out_w, const int step) {
}
static void check_resize(Dav1dMCDSPContext *const c) {
ALIGN_STK_64(pixel, c_dst, 1024 * 64,);
ALIGN_STK_64(pixel, a_dst, 1024 * 64,);
ALIGN_STK_64(pixel, src, 512 * 64,);
PIXEL_RECT(c_dst, 1024, 64);
PIXEL_RECT(a_dst, 1024, 64);
ALIGN_STK_64(pixel, src, 512 * 64,);
const int height = 64;
const int max_src_width = 512;
const ptrdiff_t dst_stride = 1024 * sizeof(pixel);
const ptrdiff_t src_stride = 512 * sizeof(pixel);
declare_func(void, pixel *dst, ptrdiff_t dst_stride,
@ -720,14 +751,17 @@ static void check_resize(Dav1dMCDSPContext *const c) {
#undef scale_fac
const int mx0 = get_upscale_x0(src_w, dst_w, dx);
call_ref(c_dst, dst_stride, src, src_stride,
dst_w, height, src_w, dx, mx0 HIGHBD_TAIL_SUFFIX);
call_new(a_dst, dst_stride, src, src_stride,
dst_w, height, src_w, dx, mx0 HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
dst_w, height, "dst");
CLEAR_PIXEL_RECT(c_dst);
CLEAR_PIXEL_RECT(a_dst);
bench_new(a_dst, dst_stride, src, src_stride,
call_ref(c_dst, c_dst_stride, src, src_stride,
dst_w, height, src_w, dx, mx0 HIGHBD_TAIL_SUFFIX);
call_new(a_dst, a_dst_stride, src, src_stride,
dst_w, height, src_w, dx, mx0 HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel_padded_align(c_dst, c_dst_stride, a_dst, a_dst_stride,
dst_w, height, "dst", 16, 1);
bench_new(a_dst, a_dst_stride, src, src_stride,
512, height, 512 * 8 / w_den, dx, mx0 HIGHBD_TAIL_SUFFIX);
}