зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1754070: Update libdav1d to b562b7f648e26e64fae892495527b5b275d53183 r=jbauman
Differential Revision: https://phabricator.services.mozilla.com/D138068
This commit is contained in:
Родитель
9272d30533
Коммит
0c73045ce8
|
@ -20,11 +20,11 @@ origin:
|
|||
|
||||
# Human-readable identifier for this version/release
|
||||
# Generally "version NNN", "tag SSS", "bookmark SSS"
|
||||
release: commit b562b7f648e26e64fae892495527b5b275d53183 (2022-01-10T14:49:11.000+00:00).
|
||||
release: commit 1f09a9119fb794ab41b1e527d848c2a210ca43d4 (2022-02-04T23:02:17.000-03:00).
|
||||
|
||||
# Revision to pull in
|
||||
# Must be a long or short commit SHA (long preferred)
|
||||
revision: b562b7f648e26e64fae892495527b5b275d53183
|
||||
revision: 1f09a9119fb794ab41b1e527d848c2a210ca43d4
|
||||
|
||||
# The package's license, where possible using the mnemonic from
|
||||
# https://spdx.org/licenses/
|
||||
|
@ -65,3 +65,4 @@ vendoring:
|
|||
file: '{yaml_dir}/vcs_version.h'
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -1,2 +1,2 @@
|
|||
/* auto-generated, do not edit */
|
||||
#define DAV1D_VERSION "b562b7f648e26e64fae892495527b5b275d53183"
|
||||
#define DAV1D_VERSION "1f09a9119fb794ab41b1e527d848c2a210ca43d4"
|
||||
|
|
|
@ -58,23 +58,35 @@ typedef struct Dav1dLogger {
|
|||
void (*callback)(void *cookie, const char *format, va_list ap);
|
||||
} Dav1dLogger;
|
||||
|
||||
/**
 * Bit flags naming the in-loop filters. Each filter has its own bit, so
 * values can be OR-ed together; Dav1dSettings::inloop_filters holds such a
 * combination.
 */
enum Dav1dInloopFilterType {
|
||||
DAV1D_INLOOPFILTER_NONE = 0, ///< no bit set
|
||||
DAV1D_INLOOPFILTER_DEBLOCK = 1 << 0, ///< bit 0
|
||||
DAV1D_INLOOPFILTER_CDEF = 1 << 1, ///< bit 1
|
||||
DAV1D_INLOOPFILTER_RESTORATION = 1 << 2, ///< bit 2
|
||||
// Union of the three filter bits above.
DAV1D_INLOOPFILTER_ALL = DAV1D_INLOOPFILTER_DEBLOCK |
|
||||
DAV1D_INLOOPFILTER_CDEF |
|
||||
DAV1D_INLOOPFILTER_RESTORATION,
|
||||
};
|
||||
|
||||
/**
 * Decoder configuration. dav1d_default_settings() fills in the defaults and
 * dav1d_open() copies the values into the decoder context.
 *
 * NOTE(review): this span is a rendered diff hunk (-58,23 +58,35), so most
 * fields appear twice; presumably the first occurrence is the removed line
 * and the second the added one -- confirm against the real header.
 */
typedef struct Dav1dSettings {
|
||||
int n_threads; ///< number of threads (0 = auto)
|
||||
int max_frame_delay; ///< Set to 1 for low-latency decoding (0 = auto)
|
||||
int apply_grain; ///< whether to apply film grain on output frames
|
||||
int operating_point; ///< select an operating point for scalable AV1 bitstreams (0 - 31)
|
||||
int all_layers; ///< output all spatial layers of a scalable AV1 bitstream
|
||||
unsigned frame_size_limit; ///< maximum frame size, in pixels (0 = unlimited)
|
||||
int n_threads; ///< number of threads (0 = number of logical cores in host system, default 0)
|
||||
int max_frame_delay; ///< Set to 1 for low-latency decoding (0 = ceil(sqrt(n_threads)), default 0)
|
||||
int apply_grain; ///< whether to apply film grain on output frames (default 1)
|
||||
int operating_point; ///< select an operating point for scalable AV1 bitstreams (0 - 31, default 0)
|
||||
int all_layers; ///< output all spatial layers of a scalable AV1 bitstream (default 1)
|
||||
unsigned frame_size_limit; ///< maximum frame size, in pixels (0 = unlimited, default 0)
|
||||
Dav1dPicAllocator allocator; ///< Picture allocator callback.
|
||||
Dav1dLogger logger; ///< Logger callback.
|
||||
int strict_std_compliance; ///< whether to abort decoding on standard compliance violations
|
||||
///< that don't affect actual bitstream decoding (e.g. inconsistent
|
||||
///< or invalid metadata)
|
||||
///< or invalid metadata, default 0)
|
||||
int output_invisible_frames; ///< output invisibly coded frames (in coding order) in addition
|
||||
///< to all visible frames. Because of show-existing-frame, this
|
||||
///< means some frames may appear twice (once when coded,
|
||||
///< once when shown)
|
||||
// NOTE(review): reserved[] goes from 24 to 20 bytes alongside the new
// inloop_filters field; presumably this keeps the struct size unchanged
// (assumes a 4-byte enum) -- confirm.
uint8_t reserved[24]; ///< reserved for future use
|
||||
///< once when shown, default 0)
|
||||
enum Dav1dInloopFilterType inloop_filters; ///< postfilters to enable during decoding (default
|
||||
///< DAV1D_INLOOPFILTER_ALL)
|
||||
uint8_t reserved[20]; ///< reserved for future use
|
||||
} Dav1dSettings;
|
||||
|
||||
/**
|
||||
|
|
|
@ -30,7 +30,7 @@ project('dav1d', ['c'],
|
|||
'b_ndebug=if-release'],
|
||||
meson_version: '>= 0.49.0')
|
||||
|
||||
dav1d_soname_version = '6.3.0'
|
||||
dav1d_soname_version = '6.4.0'
|
||||
dav1d_api_version_array = dav1d_soname_version.split('.')
|
||||
dav1d_api_version_major = dav1d_api_version_array[0]
|
||||
dav1d_api_version_minor = dav1d_api_version_array[1]
|
||||
|
@ -109,10 +109,6 @@ if host_machine.system() == 'windows'
|
|||
cdata.set('ftello', '_ftelli64')
|
||||
endif
|
||||
|
||||
if (host_machine.cpu_family() == 'x86_64' and cc.get_id() == 'gcc')
|
||||
optional_arguments += '-mcmodel=small'
|
||||
endif
|
||||
|
||||
# On Windows, we use a compatibility layer to emulate pthread
|
||||
thread_dependency = []
|
||||
thread_compat_dep = declare_dependency(sources : files('src/win32/thread.c'))
|
||||
|
|
|
@ -1146,6 +1146,16 @@ endfunc
|
|||
vmla.s16 \d, \s2, d0[2]
|
||||
vmla.s16 \d, \s3, d0[3]
|
||||
.endm
|
||||
// Multiply-accumulate of eight sources into one destination (signed 16-bit lanes):
//   \d0 = \s0*c0 + \s1*c1 + ... + \s7*c7
// c0..c3 are read from lanes 0..3 of register d0 and c4..c7 from lanes 0..3
// of register d1. Note that "\d0" is the macro argument (destination), while
// the bare "d0"/"d1" operands are the NEON registers holding the coefficients.
.macro mul_mla_8_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
|
||||
vmul.s16 \d0, \s0, d0[0]
|
||||
vmla.s16 \d0, \s1, d0[1]
|
||||
vmla.s16 \d0, \s2, d0[2]
|
||||
vmla.s16 \d0, \s3, d0[3]
|
||||
vmla.s16 \d0, \s4, d1[0]
|
||||
vmla.s16 \d0, \s5, d1[1]
|
||||
vmla.s16 \d0, \s6, d1[2]
|
||||
vmla.s16 \d0, \s7, d1[3]
|
||||
.endm
|
||||
.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
|
||||
vmul.s16 \d0, \s0, d0[0]
|
||||
vmla.s16 \d0, \s1, d0[1]
|
||||
|
@ -1182,24 +1192,6 @@ endfunc
|
|||
vmla.s16 \d1, \s8, d1[2]
|
||||
vmla.s16 \d1, \s9, d1[3]
|
||||
.endm
|
||||
// Two multiply-accumulate chains over a shared window of twelve sources
// (signed 16-bit lanes), using the same eight coefficients from registers
// d0[0..3] and d1[0..3]:
//   \d0 = \s0*c0 + ... + \s7*c7
//   \d1 = \s4*c0 + ... + \s11*c7   (same coefficients, sources shifted by 4)
// "\d0"/"\d1" are the macro's destination arguments; the bare "d0"/"d1"
// operands are the coefficient registers.
// NOTE(review): the hunk header (-1182,24 +1192,6) suggests this macro is
// removed by the commit -- confirm.
.macro mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11
|
||||
vmul.s16 \d0, \s0, d0[0]
|
||||
vmla.s16 \d0, \s1, d0[1]
|
||||
vmla.s16 \d0, \s2, d0[2]
|
||||
vmla.s16 \d0, \s3, d0[3]
|
||||
vmla.s16 \d0, \s4, d1[0]
|
||||
vmla.s16 \d0, \s5, d1[1]
|
||||
vmla.s16 \d0, \s6, d1[2]
|
||||
vmla.s16 \d0, \s7, d1[3]
|
||||
// Second output: restart the chain with vmul, sources offset by four.
vmul.s16 \d1, \s4, d0[0]
|
||||
vmla.s16 \d1, \s5, d0[1]
|
||||
vmla.s16 \d1, \s6, d0[2]
|
||||
vmla.s16 \d1, \s7, d0[3]
|
||||
vmla.s16 \d1, \s8, d1[0]
|
||||
vmla.s16 \d1, \s9, d1[1]
|
||||
vmla.s16 \d1, \s10, d1[2]
|
||||
vmla.s16 \d1, \s11, d1[3]
|
||||
.endm
|
||||
.macro vqrshrun_s16 shift, q0, d0, q1, d1, q2, d2, q3, d3
|
||||
vqrshrun.s16 \d0, \q0, #\shift
|
||||
.ifnb \q1
|
||||
|
@ -1623,7 +1615,7 @@ L(\type\()_8tap_v_tbl):
|
|||
st_16 \d_strd, d6, 4
|
||||
pop {r4-r11,pc}
|
||||
|
||||
28: // 2x8, 2x16 v
|
||||
28: // 2x6, 2x8, 2x12, 2x16 v
|
||||
vpush {q4-q7}
|
||||
vld1.8 {d0}, [\my, :64]
|
||||
sub \sr2, \src, \s_strd, lsl #1
|
||||
|
@ -1642,34 +1634,37 @@ L(\type\()_8tap_v_tbl):
|
|||
vmov d7, d10
|
||||
vmov d9, d12
|
||||
216:
|
||||
subs \h, \h, #8
|
||||
subs \h, \h, #4
|
||||
load_16 \sr2, \src, \s_strd, d16, d18, d20, d22
|
||||
load_16 \sr2, \src, \s_strd, d24, d26, d28, d30
|
||||
interleave_1_16 d14, d16, d18, d20, d22
|
||||
interleave_1_16 d22, d24, d26, d28, d30
|
||||
vmovl_u8 q7, d14, q8, d16, q9, d18, q10, d20
|
||||
vmovl_u8 q11, d22, q12, d24, q13, d26, q14, d28
|
||||
vmov d11, d14
|
||||
vmov d13, d16
|
||||
vmov d15, d18
|
||||
vmov d17, d20
|
||||
vmov d19, d22
|
||||
vmov d21, d24
|
||||
vmov d23, d26
|
||||
vmov d25, d28
|
||||
mul_mla_8_4 q1, q2, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12
|
||||
vqrshrun_s16 6, q1, d2, q2, d4
|
||||
mul_mla_8_0 q1, q1, q2, q3, q4, q5, q6, q7, q8
|
||||
vqrshrun_s16 6, q1, d2
|
||||
st_16 \d_strd, d2, 4
|
||||
st_16 \d_strd, d4, 4
|
||||
ble 0f
|
||||
vmov q1, q9
|
||||
vmov q2, q10
|
||||
vmov q3, q11
|
||||
vmov q4, q12
|
||||
vmov q5, q13
|
||||
vmov q6, q14
|
||||
vmov d14, d30
|
||||
cmp \h, #2
|
||||
vmov q1, q5
|
||||
vmov q2, q6
|
||||
vmov q3, q7
|
||||
vmov q4, q8
|
||||
vmov q5, q9
|
||||
vmov q6, q10
|
||||
vmov d14, d22
|
||||
beq 26f
|
||||
b 216b
|
||||
26:
|
||||
load_16 \sr2, \src, \s_strd, d16, d18
|
||||
interleave_1_16 d14, d16, d18
|
||||
vmovl_u8 q7, d14, q8, d16
|
||||
vmov d11, d14
|
||||
vmov d13, d16
|
||||
mul_mla_8_0 d2, d2, d4, d6, d8, d10, d12, d14, d16
|
||||
vqrshrun_s16 6, q1, d2
|
||||
st_16 \d_strd, d2, 2
|
||||
0:
|
||||
vpop {q4-q7}
|
||||
pop {r4-r11,pc}
|
||||
|
@ -1703,7 +1698,7 @@ L(\type\()_8tap_v_tbl):
|
|||
0:
|
||||
pop {r4-r11,pc}
|
||||
|
||||
480: // 4x8, 4x16 v
|
||||
480: // 4x6, 4x8, 4x12, 4x16 v
|
||||
vpush {q4}
|
||||
vld1.8 {d0}, [\my, :64]
|
||||
sub \sr2, \src, \s_strd, lsl #1
|
||||
|
@ -1726,12 +1721,19 @@ L(\type\()_8tap_v_tbl):
|
|||
mul_mla_8_2 q1, q2, q1, q2, q3, q4, q8, q9, q10, q11, q12, q13
|
||||
shift_store_4 \type, \d_strd, q1, d2, d3, q2, d4, d5
|
||||
ble 0f
|
||||
subs \h, \h, #4
|
||||
load_32 \sr2, \src, \s_strd, d30, d2, d4, d6
|
||||
interleave_1_32 d28, d30, d2, d4, d6
|
||||
vmovl_u8 q14, d28, q15, d30, q1, d2, q2, d4
|
||||
mul_mla_8_2 q8, q9, q8, q9, q10, q11, q12, q13, q14, q15, q1, q2
|
||||
shift_store_4 \type, \d_strd, q8, d16, d17, q9, d18, d19
|
||||
load_32 \sr2, \src, \s_strd, d30, d2
|
||||
subs \h, \h, #2
|
||||
interleave_1_32 d28, d30, d2
|
||||
vmovl_u8 q14, d28, q15, d30
|
||||
mul_mla_8_0 q8, q8, q9, q10, q11, q12, q13, q14, q15
|
||||
shift_store_4 \type, \d_strd, q8, d16, d17
|
||||
ble 0f
|
||||
load_32 \sr2, \src, \s_strd, d4, d6
|
||||
subs \h, \h, #2
|
||||
interleave_1_32 d2, d4, d6
|
||||
vmovl_u8 q1, d2, q2, d4
|
||||
mul_mla_8_0 q9, q10, q11, q12, q13, q14, q15, q1, q2
|
||||
shift_store_4 \type, \d_strd, q9, d18, d19
|
||||
ble 0f
|
||||
subs \h, \h, #4
|
||||
load_32 \sr2, \src, \s_strd, d8, d16, d18, d20
|
||||
|
@ -2643,6 +2645,7 @@ L(\type\()_bilin_v_tbl):
|
|||
// 2x2 v
|
||||
vld1.16 {d16[]}, [\src], \s_strd
|
||||
bgt 24f
|
||||
22:
|
||||
vld1.16 {d17[]}, [\sr2], \s_strd
|
||||
vld1.16 {d18[]}, [\src], \s_strd
|
||||
vext.8 d16, d16, d17, #6
|
||||
|
@ -2653,11 +2656,12 @@ L(\type\()_bilin_v_tbl):
|
|||
vst1.16 {d4[0]}, [\dst, :16]
|
||||
vst1.16 {d4[1]}, [\ds2, :16]
|
||||
pop {r4-r11,pc}
|
||||
24: // 2x4, 2x8, ... v
|
||||
24: // 2x4, 2x6, 2x8, ... v
|
||||
vld1.16 {d17[]}, [\sr2], \s_strd
|
||||
vld1.16 {d18[]}, [\src], \s_strd
|
||||
vld1.16 {d19[]}, [\sr2], \s_strd
|
||||
vld1.16 {d20[]}, [\src], \s_strd
|
||||
sub \h, \h, #4
|
||||
vext.8 d16, d16, d17, #6
|
||||
vext.8 d17, d17, d18, #6
|
||||
vext.8 d18, d18, d19, #6
|
||||
|
@ -2666,14 +2670,15 @@ L(\type\()_bilin_v_tbl):
|
|||
vtrn.32 d17, d19
|
||||
vmull.u8 q2, d16, d2
|
||||
vmlal.u8 q2, d17, d3
|
||||
subs \h, \h, #4
|
||||
cmp \h, #2
|
||||
vqrshrn.u16 d4, q2, #4
|
||||
vst1.16 {d4[0]}, [\dst, :16], \d_strd
|
||||
vst1.16 {d4[1]}, [\ds2, :16], \d_strd
|
||||
vst1.16 {d4[2]}, [\dst, :16], \d_strd
|
||||
vst1.16 {d4[3]}, [\ds2, :16], \d_strd
|
||||
ble 0f
|
||||
blt 0f
|
||||
vmov d16, d20
|
||||
beq 22b
|
||||
b 24b
|
||||
0:
|
||||
pop {r4-r11,pc}
|
||||
|
|
|
@ -1748,7 +1748,7 @@ L(\type\()_8tap_v_tbl):
|
|||
vst1_32 \d_strd, d16, d17
|
||||
pop {r4-r11,pc}
|
||||
|
||||
28: // 2x8, 2x16 v
|
||||
28: // 2x6, 2x8, 2x12, 2x16 v
|
||||
vld1.8 {d0}, [\my, :64]
|
||||
sub \sr2, \src, \s_strd, lsl #1
|
||||
add \ds2, \dst, \d_strd
|
||||
|
@ -1761,25 +1761,29 @@ L(\type\()_8tap_v_tbl):
|
|||
interleave_1_32 d2, d3, d4, d5, d6
|
||||
interleave_1_32 d6, d7, d16
|
||||
216:
|
||||
subs \h, \h, #8
|
||||
subs \h, \h, #4
|
||||
load_32 \sr2, \src, \s_strd, d17, d18, d19, d20
|
||||
load_32 \sr2, \src, \s_strd, d21, d22, d23, d24
|
||||
interleave_1_32 d16, d17, d18, d19, d20
|
||||
interleave_1_32 d20, d21, d22, d23, d24
|
||||
vmull_vmlal_8 q13, d2, d3, d4, d5, d6, d7, d16, d17
|
||||
vmull_vmlal_8 q1, d4, d5, d6, d7, d16, d17, d18, d19
|
||||
vmull_vmlal_8 q2, d6, d7, d16, d17, d18, d19, d20, d21
|
||||
vmull_vmlal_8 q3, d16, d17, d18, d19, d20, d21, d22, d23
|
||||
vqrshrun_s32 6, q13, d26, q1, d27, q2, d2, q3, d3
|
||||
vmin_u16 q15, q13, q1
|
||||
vqrshrun_s32 6, q13, d26, q1, d27
|
||||
vmin_u16 q15, q13
|
||||
vst1_32 \d_strd, d26, d27
|
||||
vst1_32 \d_strd, d2, d3
|
||||
ble 0f
|
||||
vmov q1, q9
|
||||
vmov q2, q10
|
||||
vmov q3, q11
|
||||
vmov d16, d24
|
||||
cmp \h, #2
|
||||
vmov q1, q3
|
||||
vmov q2, q8
|
||||
vmov q3, q9
|
||||
vmov d16, d20
|
||||
beq 26f
|
||||
b 216b
|
||||
26:
|
||||
load_32 \sr2, \src, \s_strd, d17, d18
|
||||
interleave_1_32 d16, d17, d18
|
||||
vmull_vmlal_8 q13, d2, d3, d4, d5, d6, d7, d16, d17
|
||||
vqrshrun_s32 6, q13, d26
|
||||
vmin_u16 d30, d26
|
||||
vst1_32 \d_strd, d26
|
||||
0:
|
||||
pop {r4-r11,pc}
|
||||
.endif
|
||||
|
@ -1810,7 +1814,7 @@ L(\type\()_8tap_v_tbl):
|
|||
0:
|
||||
pop {r4-r11,pc}
|
||||
|
||||
480: // 4x8, 4x16 v
|
||||
480: // 4x6, 4x8, 4x12, 4x16 v
|
||||
vld1.8 {d0}, [\my, :64]
|
||||
sub \sr2, \src, \s_strd, lsl #1
|
||||
add \ds2, \dst, \d_strd
|
||||
|
@ -1830,11 +1834,18 @@ L(\type\()_8tap_v_tbl):
|
|||
vmull_vmlal_8 q8, d19, d20, d21, d22, d23, d24, d25, d26
|
||||
shift_store_4 \type, \d_strd, q1, q2, d2, d3, q3, q8, d4, d5
|
||||
ble 0f
|
||||
cmp \h, #2
|
||||
vmov q8, q10
|
||||
vmov q9, q11
|
||||
vmov q10, q12
|
||||
vmov d22, d26
|
||||
beq 46f
|
||||
b 48b
|
||||
46:
|
||||
load_reg \sr2, \src, \s_strd, d23, d24
|
||||
vmull_vmlal_8 q1, d16, d17, d18, d19, d20, d21, d22, d23
|
||||
vmull_vmlal_8 q2, d17, d18, d19, d20, d21, d22, d23, d24
|
||||
shift_store_4 \type, \d_strd, q1, q2, d2, d3
|
||||
0:
|
||||
pop {r4-r11,pc}
|
||||
|
||||
|
@ -2801,6 +2812,7 @@ L(\type\()_bilin_v_tbl):
|
|||
// 2x2 v
|
||||
vld1.32 {d16[]}, [\src], \s_strd
|
||||
bgt 24f
|
||||
22:
|
||||
vld1.32 {d17[]}, [\sr2], \s_strd
|
||||
vld1.32 {d18[]}, [\src], \s_strd
|
||||
vext.8 d16, d16, d17, #4
|
||||
|
@ -2811,11 +2823,12 @@ L(\type\()_bilin_v_tbl):
|
|||
vst1.32 {d16[0]}, [\dst, :32]
|
||||
vst1.32 {d16[1]}, [\ds2, :32]
|
||||
pop {r4-r11,pc}
|
||||
24: // 2x4, 2x8, ... v
|
||||
24: // 2x4, 2x6, 2x8, ... v
|
||||
vld1.32 {d17[]}, [\sr2], \s_strd
|
||||
vld1.32 {d18[]}, [\src], \s_strd
|
||||
vld1.32 {d19[]}, [\sr2], \s_strd
|
||||
vld1.32 {d20[]}, [\src], \s_strd
|
||||
subs \h, \h, #4
|
||||
vext.8 d16, d16, d17, #4
|
||||
vext.8 d17, d17, d18, #4
|
||||
vext.8 d18, d18, d19, #4
|
||||
|
@ -2823,14 +2836,15 @@ L(\type\()_bilin_v_tbl):
|
|||
vswp d17, d18
|
||||
vmul.i16 q8, q8, q2
|
||||
vmla.i16 q8, q9, q3
|
||||
subs \h, \h, #4
|
||||
cmp \h, #2
|
||||
vrshr.u16 q8, q8, #4
|
||||
vst1.32 {d16[0]}, [\dst, :32], \d_strd
|
||||
vst1.32 {d16[1]}, [\ds2, :32], \d_strd
|
||||
vst1.32 {d17[0]}, [\dst, :32], \d_strd
|
||||
vst1.32 {d17[1]}, [\ds2, :32], \d_strd
|
||||
ble 0f
|
||||
blt 0f
|
||||
vmov d16, d20
|
||||
beq 22b
|
||||
b 24b
|
||||
0:
|
||||
pop {r4-r11,pc}
|
||||
|
|
|
@ -1163,6 +1163,26 @@ endfunc
|
|||
// Interleaving the mul/mla chains actually hurts performance
|
||||
// significantly on Cortex A53, thus keeping mul/mla tightly
|
||||
// chained like this.
|
||||
// Multiply-accumulate of eight sources on four 16-bit lanes (.4h):
//   \d0 = \s0*c0 + \s1*c1 + ... + \s7*c7
// where c0..c7 are the eight halfword lanes of v0.
.macro mul_mla_8_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7
|
||||
mul \d0\().4h, \s0\().4h, v0.h[0]
|
||||
mla \d0\().4h, \s1\().4h, v0.h[1]
|
||||
mla \d0\().4h, \s2\().4h, v0.h[2]
|
||||
mla \d0\().4h, \s3\().4h, v0.h[3]
|
||||
mla \d0\().4h, \s4\().4h, v0.h[4]
|
||||
mla \d0\().4h, \s5\().4h, v0.h[5]
|
||||
mla \d0\().4h, \s6\().4h, v0.h[6]
|
||||
mla \d0\().4h, \s7\().4h, v0.h[7]
|
||||
.endm
|
||||
// Multiply-accumulate of eight sources on eight 16-bit lanes (.8h):
//   \d0 = \s0*c0 + \s1*c1 + ... + \s7*c7
// where c0..c7 are the eight halfword lanes of v0.
.macro mul_mla_8_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
|
||||
mul \d0\().8h, \s0\().8h, v0.h[0]
|
||||
mla \d0\().8h, \s1\().8h, v0.h[1]
|
||||
mla \d0\().8h, \s2\().8h, v0.h[2]
|
||||
mla \d0\().8h, \s3\().8h, v0.h[3]
|
||||
mla \d0\().8h, \s4\().8h, v0.h[4]
|
||||
mla \d0\().8h, \s5\().8h, v0.h[5]
|
||||
mla \d0\().8h, \s6\().8h, v0.h[6]
|
||||
mla \d0\().8h, \s7\().8h, v0.h[7]
|
||||
.endm
|
||||
.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
|
||||
mul \d0\().8h, \s0\().8h, v0.h[0]
|
||||
mla \d0\().8h, \s1\().8h, v0.h[1]
|
||||
|
@ -1199,24 +1219,6 @@ endfunc
|
|||
mla \d1\().8h, \s8\().8h, v0.h[6]
|
||||
mla \d1\().8h, \s9\().8h, v0.h[7]
|
||||
.endm
|
||||
// Two multiply-accumulate chains over a shared window of twelve sources
// (eight 16-bit lanes each), both using coefficients v0.h[0..7]:
//   \d0 = \s0*c0 + ... + \s7*c7
//   \d1 = \s4*c0 + ... + \s11*c7   (same coefficients, sources shifted by 4)
// NOTE(review): the hunk header (-1199,24 +1219,6) suggests this macro is
// removed by the commit -- confirm.
.macro mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11
|
||||
mul \d0\().8h, \s0\().8h, v0.h[0]
|
||||
mla \d0\().8h, \s1\().8h, v0.h[1]
|
||||
mla \d0\().8h, \s2\().8h, v0.h[2]
|
||||
mla \d0\().8h, \s3\().8h, v0.h[3]
|
||||
mla \d0\().8h, \s4\().8h, v0.h[4]
|
||||
mla \d0\().8h, \s5\().8h, v0.h[5]
|
||||
mla \d0\().8h, \s6\().8h, v0.h[6]
|
||||
mla \d0\().8h, \s7\().8h, v0.h[7]
|
||||
// Second output: restart the chain with mul, sources offset by four.
mul \d1\().8h, \s4\().8h, v0.h[0]
|
||||
mla \d1\().8h, \s5\().8h, v0.h[1]
|
||||
mla \d1\().8h, \s6\().8h, v0.h[2]
|
||||
mla \d1\().8h, \s7\().8h, v0.h[3]
|
||||
mla \d1\().8h, \s8\().8h, v0.h[4]
|
||||
mla \d1\().8h, \s9\().8h, v0.h[5]
|
||||
mla \d1\().8h, \s10\().8h, v0.h[6]
|
||||
mla \d1\().8h, \s11\().8h, v0.h[7]
|
||||
.endm
|
||||
.macro sqrshrun_b shift, r0, r1, r2, r3
|
||||
sqrshrun \r0\().8b, \r0\().8h, #\shift
|
||||
.ifnb \r1
|
||||
|
@ -1633,7 +1635,7 @@ L(\type\()_8tap_v):
|
|||
st_h \d_strd, v6, 4
|
||||
ret
|
||||
|
||||
28: // 2x8, 2x16 v
|
||||
28: // 2x6, 2x8, 2x12, 2x16 v
|
||||
ld1 {v0.8b}, [\xmy]
|
||||
sub \sr2, \src, \s_strd, lsl #1
|
||||
add \ds2, \dst, \d_strd
|
||||
|
@ -1648,28 +1650,32 @@ L(\type\()_8tap_v):
|
|||
interleave_2_s v1, v2, v3, v4, v5, v6
|
||||
uxtl_b v1, v2, v3, v4
|
||||
216:
|
||||
subs \h, \h, #8
|
||||
subs \h, \h, #4
|
||||
load_h \sr2, \src, \s_strd, v16, v17, v18, v19
|
||||
load_h \sr2, \src, \s_strd, v20, v21, v22, v23
|
||||
interleave_1_h v7, v16, v17, v18, v19
|
||||
interleave_1_h v19, v20, v21, v22, v23
|
||||
interleave_2_s v5, v6, v7, v16, v17, v18
|
||||
interleave_2_s v17, v18, v19, v20, v21, v22
|
||||
uxtl_b v5, v6, v7, v16
|
||||
uxtl_b v17, v18, v19, v20
|
||||
mul_mla_8_4 v30, v31, v1, v2, v3, v4, v5, v6, v7, v16, v17, v18, v19, v20
|
||||
sqrshrun_b 6, v30, v31
|
||||
mul_mla_8_0 v30, v1, v2, v3, v4, v5, v6, v7, v16
|
||||
sqrshrun_b 6, v30
|
||||
st_h \d_strd, v30, 4
|
||||
st_h \d_strd, v31, 4
|
||||
b.le 0f
|
||||
mov v1.16b, v17.16b
|
||||
mov v2.16b, v18.16b
|
||||
mov v3.16b, v19.16b
|
||||
mov v4.16b, v20.16b
|
||||
mov v5.16b, v21.16b
|
||||
mov v6.16b, v22.16b
|
||||
mov v7.16b, v23.16b
|
||||
cmp \h, #2
|
||||
mov v1.16b, v5.16b
|
||||
mov v2.16b, v6.16b
|
||||
mov v3.16b, v7.16b
|
||||
mov v4.16b, v16.16b
|
||||
mov v5.16b, v17.16b
|
||||
mov v6.16b, v18.16b
|
||||
mov v7.16b, v19.16b
|
||||
b.eq 26f
|
||||
b 216b
|
||||
26:
|
||||
load_h \sr2, \src, \s_strd, v16, v17
|
||||
interleave_1_h v7, v16, v17
|
||||
uxtl_b v5, v6, v7, v16
|
||||
mul_mla_8_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16
|
||||
sqrshrun_b 6, v30
|
||||
st_h \d_strd, v30, 2
|
||||
0:
|
||||
ret
|
||||
.endif
|
||||
|
@ -1703,7 +1709,7 @@ L(\type\()_8tap_v):
|
|||
0:
|
||||
ret
|
||||
|
||||
480: // 4x8, 4x16 v
|
||||
480: // 4x6, 4x8, 4x12, 4x16 v
|
||||
ld1 {v0.8b}, [\xmy]
|
||||
sub \sr2, \src, \s_strd, lsl #1
|
||||
add \ds2, \dst, \d_strd
|
||||
|
@ -1726,12 +1732,19 @@ L(\type\()_8tap_v):
|
|||
mul_mla_8_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
|
||||
shift_store_4 \type, \d_strd, v1, v2
|
||||
b.le 0f
|
||||
subs \h, \h, #4
|
||||
load_s \sr2, \src, \s_strd, v27, v16, v17, v18
|
||||
interleave_1_s v26, v27, v16, v17, v18
|
||||
uxtl_b v26, v27, v16, v17
|
||||
mul_mla_8_2 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16, v17
|
||||
shift_store_4 \type, \d_strd, v1, v2
|
||||
load_s \sr2, \src, \s_strd, v27, v16
|
||||
subs \h, \h, #2
|
||||
interleave_1_s v26, v27, v16
|
||||
uxtl_b v26, v27
|
||||
mul_mla_8_0 v1, v20, v21, v22, v23, v24, v25, v26, v27
|
||||
shift_store_4 \type, \d_strd, v1
|
||||
b.le 0f
|
||||
load_s \sr2, \src, \s_strd, v17, v18
|
||||
subs \h, \h, #2
|
||||
interleave_1_s v16, v17, v18
|
||||
uxtl_b v16, v17
|
||||
mul_mla_8_0 v2, v22, v23, v24, v25, v26, v27, v16, v17
|
||||
shift_store_4 \type, \d_strd, v2
|
||||
b.le 0f
|
||||
subs \h, \h, #4
|
||||
load_s \sr2, \src, \s_strd, v19, v20, v21, v22
|
||||
|
@ -2641,6 +2654,7 @@ L(\type\()_bilin_v):
|
|||
// 2x2 v
|
||||
ld1 {v16.h}[0], [\src], \s_strd
|
||||
b.gt 24f
|
||||
22:
|
||||
ld1 {v17.h}[0], [\sr2], \s_strd
|
||||
ld1 {v18.h}[0], [\src], \s_strd
|
||||
trn1 v16.4h, v16.4h, v17.4h
|
||||
|
@ -2651,11 +2665,12 @@ L(\type\()_bilin_v):
|
|||
st1 {v4.h}[0], [\dst]
|
||||
st1 {v4.h}[1], [\ds2]
|
||||
ret
|
||||
24: // 2x4, 2x8, ... v
|
||||
24: // 2x4, 2x6, 2x8, ... v
|
||||
ld1 {v17.h}[0], [\sr2], \s_strd
|
||||
ld1 {v18.h}[0], [\src], \s_strd
|
||||
ld1 {v19.h}[0], [\sr2], \s_strd
|
||||
ld1 {v20.h}[0], [\src], \s_strd
|
||||
sub \h, \h, #4
|
||||
trn1 v16.4h, v16.4h, v17.4h
|
||||
trn1 v17.4h, v17.4h, v18.4h
|
||||
trn1 v18.4h, v18.4h, v19.4h
|
||||
|
@ -2664,14 +2679,15 @@ L(\type\()_bilin_v):
|
|||
trn1 v17.2s, v17.2s, v19.2s
|
||||
umull v4.8h, v16.8b, v2.8b
|
||||
umlal v4.8h, v17.8b, v3.8b
|
||||
subs \h, \h, #4
|
||||
cmp \h, #2
|
||||
uqrshrn v4.8b, v4.8h, #4
|
||||
st1 {v4.h}[0], [\dst], \d_strd
|
||||
st1 {v4.h}[1], [\ds2], \d_strd
|
||||
st1 {v4.h}[2], [\dst], \d_strd
|
||||
st1 {v4.h}[3], [\ds2], \d_strd
|
||||
b.le 0f
|
||||
b.lt 0f
|
||||
mov v16.8b, v20.8b
|
||||
b.eq 22b
|
||||
b 24b
|
||||
0:
|
||||
ret
|
||||
|
|
|
@ -1801,7 +1801,7 @@ L(\type\()_8tap_v):
|
|||
st_s \d_strd, v16, 4
|
||||
ret
|
||||
|
||||
28: // 2x8, 2x16 v
|
||||
28: // 2x6, 2x8, 2x12, 2x16 v
|
||||
ld1 {v0.8b}, [\xmy]
|
||||
sub \sr2, \src, \s_strd, lsl #1
|
||||
add \ds2, \dst, \d_strd
|
||||
|
@ -1814,28 +1814,32 @@ L(\type\()_8tap_v):
|
|||
interleave_1_s v1, v2, v3, v4, v5
|
||||
interleave_1_s v5, v6, v7
|
||||
216:
|
||||
subs \h, \h, #8
|
||||
subs \h, \h, #4
|
||||
load_s \sr2, \src, \s_strd, v16, v17, v18, v19
|
||||
load_s \sr2, \src, \s_strd, v20, v21, v22, v23
|
||||
interleave_1_s v7, v16, v17, v18, v19
|
||||
interleave_1_s v19, v20, v21, v22, v23
|
||||
smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16
|
||||
smull_smlal_8 v25, v3, v4, v5, v6, v7, v16, v17, v18
|
||||
smull_smlal_8 v26, v5, v6, v7, v16, v17, v18, v19, v20
|
||||
smull_smlal_8 v27, v7, v16, v17, v18, v19, v20, v21, v22
|
||||
sqrshrun_h 6, v24, v25, v26, v27
|
||||
umin_h v31, .8h, v24, v26
|
||||
sqrshrun_h 6, v24, v25
|
||||
umin_h v31, .8h, v24
|
||||
st_s \d_strd, v24, 4
|
||||
st_s \d_strd, v26, 4
|
||||
b.le 0f
|
||||
mov v1.16b, v17.16b
|
||||
mov v2.16b, v18.16b
|
||||
mov v3.16b, v19.16b
|
||||
mov v4.16b, v20.16b
|
||||
mov v5.16b, v21.16b
|
||||
mov v6.16b, v22.16b
|
||||
mov v7.16b, v23.16b
|
||||
cmp \h, #2
|
||||
mov v1.16b, v5.16b
|
||||
mov v2.16b, v6.16b
|
||||
mov v3.16b, v7.16b
|
||||
mov v4.16b, v16.16b
|
||||
mov v5.16b, v17.16b
|
||||
mov v6.16b, v18.16b
|
||||
mov v7.16b, v19.16b
|
||||
b.eq 26f
|
||||
b 216b
|
||||
26:
|
||||
load_s \sr2, \src, \s_strd, v16, v17
|
||||
interleave_1_s v7, v16, v17
|
||||
smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16
|
||||
sqrshrun_h 6, v24
|
||||
umin_h v31, .4h, v24
|
||||
st_s \d_strd, v24, 2
|
||||
0:
|
||||
ret
|
||||
.endif
|
||||
|
@ -1867,7 +1871,7 @@ L(\type\()_8tap_v):
|
|||
0:
|
||||
ret
|
||||
|
||||
480: // 4x8, 4x16 v
|
||||
480: // 4x6, 4x8, 4x12, 4x16 v
|
||||
ld1 {v0.8b}, [\xmy]
|
||||
sub \sr2, \src, \s_strd, lsl #1
|
||||
add \ds2, \dst, \d_strd
|
||||
|
@ -1887,6 +1891,7 @@ L(\type\()_8tap_v):
|
|||
smull_smlal_8 v4, v19, v20, v21, v22, v23, v24, v25, v26
|
||||
shift_store_4 \type, \d_strd, v1, v2, v3, v4
|
||||
b.le 0f
|
||||
cmp \h, #2
|
||||
mov v16.8b, v20.8b
|
||||
mov v17.8b, v21.8b
|
||||
mov v18.8b, v22.8b
|
||||
|
@ -1894,7 +1899,13 @@ L(\type\()_8tap_v):
|
|||
mov v20.8b, v24.8b
|
||||
mov v21.8b, v25.8b
|
||||
mov v22.8b, v26.8b
|
||||
b.eq 46f
|
||||
b 48b
|
||||
46:
|
||||
load_4h \sr2, \src, \s_strd, v23, v24
|
||||
smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23
|
||||
smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24
|
||||
shift_store_4 \type, \d_strd, v1, v2
|
||||
0:
|
||||
ret
|
||||
|
||||
|
@ -2858,6 +2869,7 @@ L(\type\()_bilin_v):
|
|||
// 2x2 v
|
||||
ld1 {v16.s}[0], [\src], \s_strd
|
||||
b.gt 24f
|
||||
22:
|
||||
ld1 {v17.s}[0], [\sr2], \s_strd
|
||||
ld1 {v18.s}[0], [\src], \s_strd
|
||||
trn1 v16.2s, v16.2s, v17.2s
|
||||
|
@ -2868,11 +2880,12 @@ L(\type\()_bilin_v):
|
|||
st1 {v4.s}[0], [\dst]
|
||||
st1 {v4.s}[1], [\ds2]
|
||||
ret
|
||||
24: // 2x4, 2x8, ... v
|
||||
24: // 2x4, 2x6, 2x8, ... v
|
||||
ld1 {v17.s}[0], [\sr2], \s_strd
|
||||
ld1 {v18.s}[0], [\src], \s_strd
|
||||
ld1 {v19.s}[0], [\sr2], \s_strd
|
||||
ld1 {v20.s}[0], [\src], \s_strd
|
||||
sub \h, \h, #4
|
||||
trn1 v16.2s, v16.2s, v17.2s
|
||||
trn1 v17.2s, v17.2s, v18.2s
|
||||
trn1 v18.2s, v18.2s, v19.2s
|
||||
|
@ -2881,14 +2894,15 @@ L(\type\()_bilin_v):
|
|||
trn1 v17.2d, v17.2d, v19.2d
|
||||
mul v4.8h, v16.8h, v2.8h
|
||||
mla v4.8h, v17.8h, v3.8h
|
||||
subs \h, \h, #4
|
||||
cmp \h, #2
|
||||
urshr v4.8h, v4.8h, #4
|
||||
st1 {v4.s}[0], [\dst], \d_strd
|
||||
st1 {v4.s}[1], [\ds2], \d_strd
|
||||
st1 {v4.s}[2], [\dst], \d_strd
|
||||
st1 {v4.s}[3], [\ds2], \d_strd
|
||||
b.le 0f
|
||||
b.lt 0f
|
||||
mov v16.8b, v20.8b
|
||||
b.eq 22b
|
||||
b 24b
|
||||
0:
|
||||
ret
|
||||
|
|
|
@ -49,16 +49,11 @@
|
|||
#endif
|
||||
|
||||
static unsigned flags = 0;
|
||||
|
||||
#if __has_feature(memory_sanitizer)
|
||||
// memory sanitizer is inherently incompatible with asm
|
||||
static unsigned flags_mask = 0;
|
||||
#else
|
||||
static unsigned flags_mask = -1;
|
||||
#endif
|
||||
|
||||
COLD void dav1d_init_cpu(void) {
|
||||
#if HAVE_ASM
|
||||
#if HAVE_ASM && !__has_feature(memory_sanitizer)
|
||||
// memory sanitizer is inherently incompatible with asm
|
||||
#if ARCH_AARCH64 || ARCH_ARM
|
||||
flags = dav1d_get_cpu_flags_arm();
|
||||
#elif ARCH_PPC64LE
|
||||
|
|
|
@ -3295,6 +3295,15 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
|
|||
f->lf.sr_p[1] = f->sr_cur.p.data[has_chroma ? 1 : 0];
|
||||
f->lf.sr_p[2] = f->sr_cur.p.data[has_chroma ? 2 : 0];
|
||||
|
||||
retval = 0;
|
||||
error:
|
||||
return retval;
|
||||
}
|
||||
|
||||
int dav1d_decode_frame_init_cdf(Dav1dFrameContext *const f) {
|
||||
const Dav1dContext *const c = f->c;
|
||||
int retval = DAV1D_ERR(EINVAL);
|
||||
|
||||
if (f->frame_hdr->refresh_context)
|
||||
dav1d_cdf_thread_copy(f->out_cdf.data.cdf, &f->in_cdf);
|
||||
|
||||
|
@ -3430,6 +3439,7 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
|
|||
// if n_tc > 1 (but n_fc == 1), we could run init/exit in the task
|
||||
// threads also. Not sure it makes a measurable difference.
|
||||
int res = dav1d_decode_frame_init(f);
|
||||
if (!res) res = dav1d_decode_frame_init_cdf(f);
|
||||
// wait until all threads have completed
|
||||
if (!res) {
|
||||
if (f->c->n_tc > 1) {
|
||||
|
@ -3487,7 +3497,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
|
|||
atomic_fetch_add(&c->task_thread.first, 1U);
|
||||
else
|
||||
atomic_store(&c->task_thread.first, 0);
|
||||
if (c->task_thread.cur < c->n_fc)
|
||||
if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
|
||||
c->task_thread.cur--;
|
||||
}
|
||||
if (out_delayed->p.data[0]) {
|
||||
|
@ -3496,7 +3506,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
|
|||
if ((out_delayed->visible || c->output_invisible_frames) &&
|
||||
progress != FRAME_ERROR)
|
||||
{
|
||||
dav1d_picture_ref(&c->out, &out_delayed->p);
|
||||
dav1d_thread_picture_ref(&c->out, out_delayed);
|
||||
c->event_flags |= dav1d_picture_get_event_flags(out_delayed);
|
||||
}
|
||||
dav1d_thread_picture_unref(out_delayed);
|
||||
|
@ -3670,7 +3680,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
|
|||
// move f->cur into output queue
|
||||
if (c->n_fc == 1) {
|
||||
if (f->frame_hdr->show_frame || c->output_invisible_frames) {
|
||||
dav1d_picture_ref(&c->out, &f->sr_cur.p);
|
||||
dav1d_thread_picture_ref(&c->out, &f->sr_cur);
|
||||
c->event_flags |= dav1d_picture_get_event_flags(&f->sr_cur);
|
||||
}
|
||||
} else {
|
||||
|
@ -3822,7 +3832,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
|
|||
|
||||
if (c->n_fc == 1) {
|
||||
if ((res = dav1d_decode_frame(f)) < 0) {
|
||||
dav1d_picture_unref_internal(&c->out);
|
||||
dav1d_thread_picture_unref(&c->out);
|
||||
for (int i = 0; i < 8; i++) {
|
||||
if (refresh_frame_flags & (1 << i)) {
|
||||
if (c->refs[i].p.p.data[0])
|
||||
|
@ -3851,7 +3861,7 @@ error:
|
|||
dav1d_ref_dec(&f->ref_mvs_ref[i]);
|
||||
}
|
||||
if (c->n_fc == 1)
|
||||
dav1d_picture_unref_internal(&c->out);
|
||||
dav1d_thread_picture_unref(&c->out);
|
||||
else
|
||||
dav1d_thread_picture_unref(out_delayed);
|
||||
dav1d_picture_unref_internal(&f->cur);
|
||||
|
|
|
@ -102,7 +102,7 @@ struct Dav1dContext {
|
|||
|
||||
// decoded output picture queue
|
||||
Dav1dData in;
|
||||
Dav1dPicture out;
|
||||
Dav1dThreadPicture out, cache;
|
||||
// dummy is a pointer to prevent compiler errors about atomic_load()
|
||||
// not taking const arguments
|
||||
atomic_int flush_mem, *flush;
|
||||
|
@ -158,6 +158,7 @@ struct Dav1dContext {
|
|||
unsigned frame_size_limit;
|
||||
int strict_std_compliance;
|
||||
int output_invisible_frames;
|
||||
enum Dav1dInloopFilterType inloop_filters;
|
||||
int drain;
|
||||
enum PictureFlags frame_flags;
|
||||
enum Dav1dEventFlags event_flags;
|
||||
|
@ -169,14 +170,15 @@ struct Dav1dContext {
|
|||
|
||||
// Kinds of work item, one enumerator per task type.
// NOTE(review): DAV1D_TASK_TYPE_ENTROPY_PROGRESS is listed twice below because
// this span is a rendered diff hunk (-169,14 +170,15); presumably one
// occurrence is the removed position and the other the added one. Taken
// literally as C this would be a duplicate enumerator -- confirm the final
// order (and hence the numeric values) against the real header.
enum TaskType {
|
||||
DAV1D_TASK_TYPE_INIT,
|
||||
DAV1D_TASK_TYPE_INIT_CDF,
|
||||
DAV1D_TASK_TYPE_TILE_ENTROPY,
|
||||
DAV1D_TASK_TYPE_ENTROPY_PROGRESS,
|
||||
DAV1D_TASK_TYPE_TILE_RECONSTRUCTION,
|
||||
DAV1D_TASK_TYPE_DEBLOCK_COLS,
|
||||
DAV1D_TASK_TYPE_DEBLOCK_ROWS,
|
||||
DAV1D_TASK_TYPE_CDEF,
|
||||
DAV1D_TASK_TYPE_SUPER_RESOLUTION,
|
||||
DAV1D_TASK_TYPE_LOOP_RESTORATION,
|
||||
DAV1D_TASK_TYPE_ENTROPY_PROGRESS,
|
||||
DAV1D_TASK_TYPE_RECONSTRUCTION_PROGRESS,
|
||||
};
|
||||
|
||||
|
@ -303,6 +305,7 @@ struct Dav1dFrameContext {
|
|||
struct TaskThreadData *ttd;
|
||||
struct Dav1dTask *tasks, *tile_tasks[2], init_task;
|
||||
int num_tasks, num_tile_tasks;
|
||||
int init_done;
|
||||
int done[2];
|
||||
int retval;
|
||||
int update_set; // whether we need to update CDF reference
|
||||
|
|
|
@ -76,6 +76,7 @@ COLD void dav1d_default_settings(Dav1dSettings *const s) {
|
|||
s->frame_size_limit = 0;
|
||||
s->strict_std_compliance = 0;
|
||||
s->output_invisible_frames = 0;
|
||||
s->inloop_filters = DAV1D_INLOOPFILTER_ALL;
|
||||
}
|
||||
|
||||
static void close_internal(Dav1dContext **const c_out, int flush);
|
||||
|
@ -131,6 +132,7 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
|
|||
c->frame_size_limit = s->frame_size_limit;
|
||||
c->strict_std_compliance = s->strict_std_compliance;
|
||||
c->output_invisible_frames = s->output_invisible_frames;
|
||||
c->inloop_filters = s->inloop_filters;
|
||||
|
||||
if (dav1d_mem_pool_init(&c->seq_hdr_pool) ||
|
||||
dav1d_mem_pool_init(&c->frame_hdr_pool) ||
|
||||
|
@ -311,33 +313,46 @@ static int has_grain(const Dav1dPicture *const pic)
|
|||
fgdata->num_uv_points[1];
|
||||
}
|
||||
|
||||
// Hands one decoded picture to the caller through `out`, applying film grain
// when c->apply_grain is set and has_grain() reports grain data for the
// picture; otherwise the reference is moved to `out` unchanged.
//
// NOTE(review): this span is a rendered diff hunk (-311,33 +313,46) with two
// versions of the function interleaved. The three-parameter form taking an
// explicit `in` picture appears to be the removed version; the two-parameter
// form, which picks its source from the context, appears to be the added
// one -- confirm.
static int output_image(Dav1dContext *const c, Dav1dPicture *const out,
|
||||
Dav1dPicture *const in)
|
||||
static int output_image(Dav1dContext *const c, Dav1dPicture *const out)
|
||||
{
|
||||
if (!c->apply_grain || !has_grain(in)) {
|
||||
dav1d_picture_move_ref(out, in);
|
||||
return 0;
|
||||
int res = 0;
|
||||

||||
// Source picture: c->out when all layers are output, otherwise c->cache.
Dav1dThreadPicture *const in = c->all_layers ? &c->out : &c->cache;
|
||||
if (!c->apply_grain || !has_grain(&in->p)) {
|
||||
dav1d_picture_move_ref(out, &in->p);
|
||||
dav1d_thread_picture_unref(in);
|
||||
goto end;
|
||||
}
|
||||
|
||||
int res = dav1d_apply_grain(c, out, in);
|
||||
dav1d_picture_unref_internal(in);
|
||||
res = dav1d_apply_grain(c, out, &in->p);
|
||||
dav1d_thread_picture_unref(in);
|
||||
end:
|
||||
// When not outputting all layers, a picture still waiting in c->out is moved
// into `in` (which is &c->cache in that mode).
if (!c->all_layers && c->out.p.data[0]) {
|
||||
dav1d_thread_picture_move_ref(in, &c->out);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
// Reports (non-zero) whether a picture is ready to be returned to the caller.
//
// NOTE(review): this span is a rendered diff hunk with two versions of the
// function interleaved. The one-parameter form appears to be the removed
// version and the form taking `drain` the added one -- confirm.
static int output_picture_ready(Dav1dContext *const c) {
|
||||

||||
if (!c->out.data[0]) return 0;
|
||||
|
||||
// skip lower spatial layers
|
||||
if (c->operating_point_idc && !c->all_layers) {
|
||||
const int max_spatial_id = ulog2(c->operating_point_idc >> 8);
|
||||
if (max_spatial_id > c->out.frame_hdr->spatial_id) {
|
||||
dav1d_picture_unref_internal(&c->out);
|
||||
static int output_picture_ready(Dav1dContext *const c, const int drain) {
|
||||
if (!c->all_layers) {
|
||||
// Both a new picture (c->out) and a cached one (c->cache) are present.
if (c->out.p.data[0] && c->cache.p.data[0]) {
|
||||
const unsigned spatial_mask = c->operating_point_idc >> 8;
|
||||
// An empty mask yields 0 without calling ulog2().
const int max_spatial_id = spatial_mask ? ulog2(spatial_mask) : 0;
|
||||
// Ready if the cached picture is at the highest spatial id, or if the new
// picture carries PICTURE_FLAG_NEW_TEMPORAL_UNIT.
if (max_spatial_id == c->cache.p.frame_hdr->spatial_id ||
|
||||
c->out.flags & PICTURE_FLAG_NEW_TEMPORAL_UNIT)
|
||||
return 1;
|
||||
// Otherwise the new picture replaces the cached one; nothing is ready yet.
dav1d_thread_picture_unref(&c->cache);
|
||||
dav1d_thread_picture_move_ref(&c->cache, &c->out);
|
||||
return 0;
|
||||
// Only a cached picture exists: it is ready only when draining.
} else if (c->cache.p.data[0] && drain) {
|
||||
return 1;
|
||||
// Only a new picture exists: move it to the cache and report not ready.
} else if (c->out.p.data[0]) {
|
||||
dav1d_thread_picture_move_ref(&c->cache, &c->out);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
return 1;
|
||||
// Fall-through: ready exactly when c->out holds a picture.
return !!c->out.p.data[0];
|
||||
}
|
||||
|
||||
static int drain_picture(Dav1dContext *const c, Dav1dPicture *const out) {
|
||||
|
@ -369,15 +384,18 @@ static int drain_picture(Dav1dContext *const c, Dav1dPicture *const out) {
|
|||
if ((out_delayed->visible || c->output_invisible_frames) &&
|
||||
progress != FRAME_ERROR)
|
||||
{
|
||||
dav1d_picture_ref(&c->out, &out_delayed->p);
|
||||
dav1d_thread_picture_ref(&c->out, out_delayed);
|
||||
c->event_flags |= dav1d_picture_get_event_flags(out_delayed);
|
||||
}
|
||||
dav1d_thread_picture_unref(out_delayed);
|
||||
if (output_picture_ready(c))
|
||||
return output_image(c, out, &c->out);
|
||||
if (output_picture_ready(c, 0))
|
||||
return output_image(c, out);
|
||||
}
|
||||
} while (++drain_count < c->n_fc);
|
||||
|
||||
if (output_picture_ready(c, 1))
|
||||
return output_image(c, out);
|
||||
|
||||
return DAV1D_ERR(EAGAIN);
|
||||
}
|
||||
|
||||
|
@ -386,7 +404,7 @@ static int gen_picture(Dav1dContext *const c)
|
|||
int res;
|
||||
Dav1dData *const in = &c->in;
|
||||
|
||||
if (output_picture_ready(c))
|
||||
if (output_picture_ready(c, 0))
|
||||
return 0;
|
||||
|
||||
while (in->sz > 0) {
|
||||
|
@ -399,7 +417,7 @@ static int gen_picture(Dav1dContext *const c)
|
|||
in->data += res;
|
||||
if (!in->sz) dav1d_data_unref_internal(in);
|
||||
}
|
||||
if (output_picture_ready(c))
|
||||
if (output_picture_ready(c, 0))
|
||||
break;
|
||||
if (res < 0)
|
||||
return res;
|
||||
|
@ -439,8 +457,8 @@ int dav1d_get_picture(Dav1dContext *const c, Dav1dPicture *const out)
|
|||
if (res < 0)
|
||||
return res;
|
||||
|
||||
if (output_picture_ready(c))
|
||||
return output_image(c, out, &c->out);
|
||||
if (output_picture_ready(c, c->n_fc == 1))
|
||||
return output_image(c, out);
|
||||
|
||||
if (c->n_fc > 1 && drain)
|
||||
return drain_picture(c, out);
|
||||
|
@ -592,6 +610,8 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
|
|||
freep(&f->frame_thread.tile_start_off);
|
||||
dav1d_freep_aligned(&f->frame_thread.pal);
|
||||
freep(&f->frame_thread.cbi);
|
||||
}
|
||||
if (c->n_tc > 1) {
|
||||
pthread_cond_destroy(&f->task_thread.cond);
|
||||
}
|
||||
freep(&f->frame_thread.frame_progress);
|
||||
|
|
|
@ -260,29 +260,38 @@ endif
|
|||
|
||||
|
||||
|
||||
libdav1d_rc_obj = []
|
||||
libdav1d_flags = [stackalign_flag]
|
||||
api_export_flags = []
|
||||
|
||||
#
|
||||
# Windows .rc file and API export flags
|
||||
#
|
||||
|
||||
if host_machine.system() == 'windows' and get_option('default_library') != 'static'
|
||||
rc_file = configure_file(
|
||||
input : 'dav1d.rc.in',
|
||||
output : 'dav1d.rc',
|
||||
configuration : rc_data
|
||||
)
|
||||
if host_machine.system() == 'windows'
|
||||
if get_option('default_library') != 'static'
|
||||
rc_file = configure_file(
|
||||
input : 'dav1d.rc.in',
|
||||
output : 'dav1d.rc',
|
||||
configuration : rc_data
|
||||
)
|
||||
|
||||
libdav1d_rc_obj = winmod.compile_resources(rc_file)
|
||||
libdav1d_rc_obj = winmod.compile_resources(rc_file)
|
||||
|
||||
api_export_flags = ['-DDAV1D_BUILDING_DLL']
|
||||
else
|
||||
libdav1d_rc_obj = []
|
||||
api_export_flags = ['-DDAV1D_BUILDING_DLL']
|
||||
endif
|
||||
|
||||
if (host_machine.cpu_family() == 'x86_64' and cc.get_id() == 'gcc')
|
||||
# We don't expect to reference data members from other DLLs without
|
||||
# dllimport attributes. Set the -mcmodel=small flag, which avoids
|
||||
# generating indirection via .refptr.<symname> for all potentially
|
||||
# dllimported variable references.
|
||||
libdav1d_flags += '-mcmodel=small'
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
#
|
||||
# Library definitions
|
||||
#
|
||||
|
@ -294,7 +303,7 @@ libdav1d_entrypoints_objs = static_library('dav1d_entrypoint',
|
|||
|
||||
include_directories : dav1d_inc_dirs,
|
||||
dependencies: [stdatomic_dependencies],
|
||||
c_args : [stackalign_flag, stackrealign_flag, api_export_flags],
|
||||
c_args : [libdav1d_flags, stackrealign_flag, api_export_flags],
|
||||
install : false,
|
||||
build_by_default : false,
|
||||
).extract_all_objects(recursive: true)
|
||||
|
@ -307,7 +316,7 @@ foreach bitdepth : dav1d_bitdepths
|
|||
libdav1d_tmpl_sources, config_h_target,
|
||||
include_directories: dav1d_inc_dirs,
|
||||
dependencies : [stdatomic_dependencies],
|
||||
c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + stackalign_flag,
|
||||
c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + libdav1d_flags,
|
||||
install : false,
|
||||
build_by_default : false,
|
||||
).extract_all_objects(recursive: true)
|
||||
|
@ -320,7 +329,7 @@ foreach bitdepth : dav1d_bitdepths
|
|||
libdav1d_arch_tmpl_sources, config_h_target,
|
||||
include_directories: dav1d_inc_dirs,
|
||||
dependencies : [stdatomic_dependencies],
|
||||
c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + stackalign_flag + arch_flags,
|
||||
c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + libdav1d_flags + arch_flags,
|
||||
install : false,
|
||||
build_by_default : false,
|
||||
).extract_all_objects(recursive: true)
|
||||
|
@ -350,7 +359,7 @@ libdav1d = library('dav1d',
|
|||
thread_compat_dep,
|
||||
libdl_dependency,
|
||||
],
|
||||
c_args : [stackalign_flag, api_export_flags],
|
||||
c_args : [libdav1d_flags, api_export_flags],
|
||||
version : dav1d_soname_version,
|
||||
soversion : dav1d_soversion,
|
||||
install : true,
|
||||
|
|
|
@ -1533,8 +1533,10 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
|
|||
|
||||
break;
|
||||
}
|
||||
case DAV1D_OBU_PADDING:
|
||||
case DAV1D_OBU_TD:
|
||||
c->frame_flags |= PICTURE_FLAG_NEW_TEMPORAL_UNIT;
|
||||
break;
|
||||
case DAV1D_OBU_PADDING:
|
||||
// ignore OBUs we don't care about
|
||||
break;
|
||||
default:
|
||||
|
@ -1547,9 +1549,9 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
|
|||
if (c->frame_hdr->show_existing_frame) {
|
||||
if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.data[0]) return DAV1D_ERR(EINVAL);
|
||||
if (c->n_fc == 1) {
|
||||
dav1d_picture_ref(&c->out,
|
||||
&c->refs[c->frame_hdr->existing_frame_idx].p.p);
|
||||
dav1d_data_props_copy(&c->out.m, &in->m);
|
||||
dav1d_thread_picture_ref(&c->out,
|
||||
&c->refs[c->frame_hdr->existing_frame_idx].p);
|
||||
dav1d_data_props_copy(&c->out.p.m, &in->m);
|
||||
c->event_flags |= dav1d_picture_get_event_flags(&c->refs[c->frame_hdr->existing_frame_idx].p);
|
||||
} else {
|
||||
pthread_mutex_lock(&c->task_thread.lock);
|
||||
|
@ -1569,7 +1571,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
|
|||
atomic_fetch_add(&c->task_thread.first, 1U);
|
||||
else
|
||||
atomic_store(&c->task_thread.first, 0);
|
||||
if (c->task_thread.cur < c->n_fc)
|
||||
if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
|
||||
c->task_thread.cur--;
|
||||
}
|
||||
if (out_delayed->p.data[0]) {
|
||||
|
@ -1578,7 +1580,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
|
|||
if ((out_delayed->visible || c->output_invisible_frames) &&
|
||||
progress != FRAME_ERROR)
|
||||
{
|
||||
dav1d_picture_ref(&c->out, &out_delayed->p);
|
||||
dav1d_thread_picture_ref(&c->out, out_delayed);
|
||||
c->event_flags |= dav1d_picture_get_event_flags(out_delayed);
|
||||
}
|
||||
dav1d_thread_picture_unref(out_delayed);
|
||||
|
|
|
@ -259,6 +259,16 @@ void dav1d_thread_picture_ref(Dav1dThreadPicture *const dst,
|
|||
dst->flags = src->flags;
|
||||
}
|
||||
|
||||
void dav1d_thread_picture_move_ref(Dav1dThreadPicture *const dst,
|
||||
Dav1dThreadPicture *const src)
|
||||
{
|
||||
dav1d_picture_move_ref(&dst->p, &src->p);
|
||||
dst->visible = src->visible;
|
||||
dst->progress = src->progress;
|
||||
dst->flags = src->flags;
|
||||
memset(src, 0, sizeof(*src));
|
||||
}
|
||||
|
||||
void dav1d_picture_unref_internal(Dav1dPicture *const p) {
|
||||
validate_input(p != NULL);
|
||||
|
||||
|
|
|
@ -46,6 +46,7 @@ enum PlaneType {
|
|||
enum PictureFlags {
|
||||
PICTURE_FLAG_NEW_SEQUENCE = 1 << 0,
|
||||
PICTURE_FLAG_NEW_OP_PARAMS_INFO = 1 << 1,
|
||||
PICTURE_FLAG_NEW_TEMPORAL_UNIT = 1 << 2,
|
||||
};
|
||||
|
||||
typedef struct Dav1dThreadPicture {
|
||||
|
@ -83,6 +84,8 @@ int dav1d_picture_alloc_copy(Dav1dContext *c, Dav1dPicture *dst, const int w,
|
|||
void dav1d_picture_ref(Dav1dPicture *dst, const Dav1dPicture *src);
|
||||
void dav1d_thread_picture_ref(Dav1dThreadPicture *dst,
|
||||
const Dav1dThreadPicture *src);
|
||||
void dav1d_thread_picture_move_ref(Dav1dThreadPicture *dst,
|
||||
Dav1dThreadPicture *src);
|
||||
void dav1d_thread_picture_unref(Dav1dThreadPicture *p);
|
||||
|
||||
/**
|
||||
|
|
|
@ -2046,6 +2046,11 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize
|
|||
}
|
||||
|
||||
void bytefn(dav1d_filter_sbrow_deblock_cols)(Dav1dFrameContext *const f, const int sby) {
|
||||
if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK) ||
|
||||
(!f->frame_hdr->loopfilter.level_y[0] && !f->frame_hdr->loopfilter.level_y[1]))
|
||||
{
|
||||
return;
|
||||
}
|
||||
const int y = sby * f->sb_step * 4;
|
||||
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
pixel *const p[3] = {
|
||||
|
@ -2054,9 +2059,8 @@ void bytefn(dav1d_filter_sbrow_deblock_cols)(Dav1dFrameContext *const f, const i
|
|||
f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
|
||||
};
|
||||
Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
|
||||
if (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1])
|
||||
bytefn(dav1d_loopfilter_sbrow_cols)(f, p, mask, sby,
|
||||
f->lf.start_of_tile_row[sby]);
|
||||
bytefn(dav1d_loopfilter_sbrow_cols)(f, p, mask, sby,
|
||||
f->lf.start_of_tile_row[sby]);
|
||||
}
|
||||
|
||||
void bytefn(dav1d_filter_sbrow_deblock_rows)(Dav1dFrameContext *const f, const int sby) {
|
||||
|
@ -2068,7 +2072,9 @@ void bytefn(dav1d_filter_sbrow_deblock_rows)(Dav1dFrameContext *const f, const i
|
|||
f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
|
||||
};
|
||||
Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
|
||||
if (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1]) {
|
||||
if (f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK &&
|
||||
(f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1]))
|
||||
{
|
||||
bytefn(dav1d_loopfilter_sbrow_rows)(f, p, mask, sby);
|
||||
}
|
||||
if (f->seq_hdr->cdef || f->lf.restore_planes) {
|
||||
|
@ -2079,6 +2085,7 @@ void bytefn(dav1d_filter_sbrow_deblock_rows)(Dav1dFrameContext *const f, const i
|
|||
|
||||
void bytefn(dav1d_filter_sbrow_cdef)(Dav1dTaskContext *const tc, const int sby) {
|
||||
const Dav1dFrameContext *const f = tc->f;
|
||||
if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_CDEF)) return;
|
||||
const int sbsz = f->sb_step;
|
||||
const int y = sby * sbsz * 4;
|
||||
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
|
@ -2140,6 +2147,7 @@ void bytefn(dav1d_filter_sbrow_resize)(Dav1dFrameContext *const f, const int sby
|
|||
}
|
||||
|
||||
void bytefn(dav1d_filter_sbrow_lr)(Dav1dFrameContext *const f, const int sby) {
|
||||
if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_RESTORATION)) return;
|
||||
const int y = sby * f->sb_step * 4;
|
||||
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
pixel *const sr_p[3] = {
|
||||
|
|
|
@ -141,7 +141,8 @@ static void insert_tasks(Dav1dFrameContext *const f,
|
|||
}
|
||||
|
||||
// sort by tile-id
|
||||
assert(first->type <= DAV1D_TASK_TYPE_TILE_RECONSTRUCTION);
|
||||
assert(first->type == DAV1D_TASK_TYPE_TILE_RECONSTRUCTION ||
|
||||
first->type == DAV1D_TASK_TYPE_TILE_ENTROPY);
|
||||
assert(first->type == t_ptr->type);
|
||||
assert(t_ptr->sby == first->sby);
|
||||
const int p = first->type == DAV1D_TASK_TYPE_TILE_ENTROPY;
|
||||
|
@ -270,6 +271,7 @@ int dav1d_task_create_tile_sbrow(Dav1dFrameContext *const f, const int pass,
|
|||
void dav1d_task_frame_init(Dav1dFrameContext *const f) {
|
||||
const Dav1dContext *const c = f->c;
|
||||
|
||||
f->task_thread.init_done = 0;
|
||||
// schedule init task, which will schedule the remaining tasks
|
||||
Dav1dTask *const t = &f->task_thread.init_task;
|
||||
t->type = DAV1D_TASK_TYPE_INIT;
|
||||
|
@ -350,6 +352,18 @@ static inline int check_tile(Dav1dTask *const t, Dav1dFrameContext *const f,
|
|||
return 0;
|
||||
}
|
||||
|
||||
static inline void abort_frame(Dav1dFrameContext *const f) {
|
||||
atomic_store(&f->task_thread.error, 1);
|
||||
f->task_thread.task_counter = 0;
|
||||
f->task_thread.done[0] = 1;
|
||||
f->task_thread.done[1] = 1;
|
||||
atomic_store(&f->sr_cur.progress[0], FRAME_ERROR);
|
||||
atomic_store(&f->sr_cur.progress[1], FRAME_ERROR);
|
||||
dav1d_decode_frame_exit(f, -1);
|
||||
f->n_tile_data = 0;
|
||||
pthread_cond_signal(&f->task_thread.cond);
|
||||
}
|
||||
|
||||
void *dav1d_worker_task(void *data) {
|
||||
Dav1dTaskContext *const tc = data;
|
||||
const Dav1dContext *const c = tc->c;
|
||||
|
@ -360,23 +374,37 @@ void *dav1d_worker_task(void *data) {
|
|||
pthread_mutex_lock(&ttd->lock);
|
||||
for (;;) {
|
||||
Dav1dFrameContext *f;
|
||||
Dav1dTask *t, *prev_t;
|
||||
Dav1dTask *t, *prev_t = NULL;
|
||||
if (tc->task_thread.die) break;
|
||||
if (atomic_load(c->flush)) goto park;
|
||||
while (ttd->cur < c->n_fc) {
|
||||
const unsigned first = atomic_load(&ttd->first);
|
||||
f = &c->fc[(first + ttd->cur) % c->n_fc];
|
||||
prev_t = f->task_thread.task_cur_prev;
|
||||
t = prev_t ? prev_t->next : f->task_thread.task_head;
|
||||
while (t) {
|
||||
if (t->type == DAV1D_TASK_TYPE_INIT) {
|
||||
if (c->n_fc > 1) { // run init tasks first
|
||||
for (unsigned i = 0; i < c->n_fc; i++) {
|
||||
const unsigned first = atomic_load(&ttd->first);
|
||||
f = &c->fc[(first + i) % c->n_fc];
|
||||
if (f->task_thread.init_done) continue;
|
||||
t = f->task_thread.task_head;
|
||||
if (!t) continue;
|
||||
if (t->type == DAV1D_TASK_TYPE_INIT) goto found;
|
||||
if (t->type == DAV1D_TASK_TYPE_INIT_CDF) {
|
||||
const int p1 = f->in_cdf.progress ?
|
||||
atomic_load(f->in_cdf.progress) : 1;
|
||||
if (p1) {
|
||||
atomic_fetch_or(&f->task_thread.error, p1 == TILE_ERROR);
|
||||
goto found;
|
||||
}
|
||||
} else if (t->type <= DAV1D_TASK_TYPE_TILE_RECONSTRUCTION) {
|
||||
}
|
||||
}
|
||||
}
|
||||
while (ttd->cur < c->n_fc) {
|
||||
const unsigned first = atomic_load(&ttd->first);
|
||||
f = &c->fc[(first + ttd->cur) % c->n_fc];
|
||||
prev_t = f->task_thread.task_cur_prev;
|
||||
t = prev_t ? prev_t->next : f->task_thread.task_head;
|
||||
while (t) {
|
||||
if (t->type == DAV1D_TASK_TYPE_INIT_CDF) goto next;
|
||||
else if (t->type == DAV1D_TASK_TYPE_TILE_ENTROPY ||
|
||||
t->type == DAV1D_TASK_TYPE_TILE_RECONSTRUCTION)
|
||||
{
|
||||
// if not bottom sbrow of tile, this task will be re-added
|
||||
// after it's finished
|
||||
if (!check_tile(t, f, c->n_fc > 1))
|
||||
|
@ -447,7 +475,8 @@ void *dav1d_worker_task(void *data) {
|
|||
if (prev_t) prev_t->next = t->next;
|
||||
else f->task_thread.task_head = t->next;
|
||||
if (!t->next) f->task_thread.task_tail = prev_t;
|
||||
if (!f->task_thread.task_head) ttd->cur++;
|
||||
if (t->type > DAV1D_TASK_TYPE_INIT_CDF && !f->task_thread.task_head)
|
||||
ttd->cur++;
|
||||
// we don't need to check cond_signaled here, since we found a task
|
||||
// after the last signal so we want to re-signal the next waiting thread
|
||||
// and again won't need to signal after that
|
||||
|
@ -463,10 +492,26 @@ void *dav1d_worker_task(void *data) {
|
|||
int sby = t->sby;
|
||||
switch (t->type) {
|
||||
case DAV1D_TASK_TYPE_INIT: {
|
||||
assert(c->n_fc > 1);
|
||||
int res = dav1d_decode_frame_init(f);
|
||||
int p1 = f->in_cdf.progress ? atomic_load(f->in_cdf.progress) : 1;
|
||||
if (res || p1 == TILE_ERROR) {
|
||||
pthread_mutex_lock(&ttd->lock);
|
||||
abort_frame(f);
|
||||
} else if (!res) {
|
||||
t->type = DAV1D_TASK_TYPE_INIT_CDF;
|
||||
if (p1) goto found_unlocked;
|
||||
pthread_mutex_lock(&ttd->lock);
|
||||
insert_task(f, t, 0);
|
||||
}
|
||||
reset_task_cur(c, ttd, t->frame_idx);
|
||||
continue;
|
||||
}
|
||||
case DAV1D_TASK_TYPE_INIT_CDF: {
|
||||
assert(c->n_fc > 1);
|
||||
int res = -1;
|
||||
if (!atomic_load(&f->task_thread.error))
|
||||
res = dav1d_decode_frame_init(f);
|
||||
res = dav1d_decode_frame_init_cdf(f);
|
||||
pthread_mutex_lock(&ttd->lock);
|
||||
if (f->frame_hdr->refresh_context && !f->task_thread.update_set) {
|
||||
atomic_store(f->out_cdf.progress, res < 0 ? TILE_ERROR : 1);
|
||||
|
@ -490,19 +535,9 @@ void *dav1d_worker_task(void *data) {
|
|||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// init failed, signal completion
|
||||
atomic_store(&f->task_thread.error, 1);
|
||||
f->task_thread.task_counter = 0;
|
||||
f->task_thread.done[0] = 1;
|
||||
f->task_thread.done[1] = 1;
|
||||
atomic_store(&f->sr_cur.progress[0], FRAME_ERROR);
|
||||
atomic_store(&f->sr_cur.progress[1], FRAME_ERROR);
|
||||
dav1d_decode_frame_exit(f, -1);
|
||||
f->n_tile_data = 0;
|
||||
pthread_cond_signal(&f->task_thread.cond);
|
||||
}
|
||||
} else abort_frame(f);
|
||||
reset_task_cur(c, ttd, t->frame_idx);
|
||||
f->task_thread.init_done = 1;
|
||||
continue;
|
||||
}
|
||||
case DAV1D_TASK_TYPE_TILE_ENTROPY:
|
||||
|
|
|
@ -42,6 +42,7 @@ void dav1d_task_frame_init(Dav1dFrameContext *f);
|
|||
void *dav1d_worker_task(void *data);
|
||||
|
||||
int dav1d_decode_frame_init(Dav1dFrameContext *f);
|
||||
int dav1d_decode_frame_init_cdf(Dav1dFrameContext *f);
|
||||
int dav1d_decode_frame_main(Dav1dFrameContext *f);
|
||||
void dav1d_decode_frame_exit(Dav1dFrameContext *f, int retval);
|
||||
int dav1d_decode_frame(Dav1dFrameContext *f);
|
||||
|
|
|
@ -28,13 +28,14 @@
|
|||
#include "config.h"
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "common/attributes.h"
|
||||
|
||||
#include "src/x86/cpu.h"
|
||||
|
||||
typedef struct {
|
||||
uint32_t eax, ebx, ecx, edx;
|
||||
uint32_t eax, ebx, edx, ecx;
|
||||
} CpuidRegisters;
|
||||
|
||||
void dav1d_cpu_cpuid(CpuidRegisters *regs, unsigned leaf, unsigned subleaf);
|
||||
|
@ -43,13 +44,22 @@ uint64_t dav1d_cpu_xgetbv(unsigned xcr);
|
|||
#define X(reg, mask) (((reg) & (mask)) == (mask))
|
||||
|
||||
COLD unsigned dav1d_get_cpu_flags_x86(void) {
|
||||
CpuidRegisters r = { 0 };
|
||||
dav1d_cpu_cpuid(&r, 0, 0);
|
||||
const unsigned max_leaf = r.eax;
|
||||
union {
|
||||
CpuidRegisters r;
|
||||
struct {
|
||||
uint32_t max_leaf;
|
||||
char vendor[12];
|
||||
};
|
||||
} cpu;
|
||||
dav1d_cpu_cpuid(&cpu.r, 0, 0);
|
||||
unsigned flags = 0;
|
||||
|
||||
if (max_leaf >= 1) {
|
||||
if (cpu.max_leaf >= 1) {
|
||||
CpuidRegisters r;
|
||||
dav1d_cpu_cpuid(&r, 1, 0);
|
||||
const unsigned model = ((r.eax >> 4) & 0x0f) + ((r.eax >> 12) & 0xf0);
|
||||
const unsigned family = ((r.eax >> 8) & 0x0f) + ((r.eax >> 20) & 0xff);
|
||||
|
||||
if (X(r.edx, 0x06008000)) /* CMOV/SSE/SSE2 */ {
|
||||
flags |= DAV1D_X86_CPU_FLAG_SSE2;
|
||||
if (X(r.ecx, 0x00000201)) /* SSE3/SSSE3 */ {
|
||||
|
@ -63,7 +73,7 @@ COLD unsigned dav1d_get_cpu_flags_x86(void) {
|
|||
if (X(r.ecx, 0x18000000)) /* OSXSAVE/AVX */ {
|
||||
const uint64_t xcr0 = dav1d_cpu_xgetbv(0);
|
||||
if (X(xcr0, 0x00000006)) /* XMM/YMM */ {
|
||||
if (max_leaf >= 7) {
|
||||
if (cpu.max_leaf >= 7) {
|
||||
dav1d_cpu_cpuid(&r, 7, 0);
|
||||
if (X(r.ebx, 0x00000128)) /* BMI1/BMI2/AVX2 */ {
|
||||
flags |= DAV1D_X86_CPU_FLAG_AVX2;
|
||||
|
@ -76,6 +86,14 @@ COLD unsigned dav1d_get_cpu_flags_x86(void) {
|
|||
}
|
||||
}
|
||||
#endif
|
||||
if (!memcmp(cpu.vendor, "AuthenticAMD", sizeof(cpu.vendor))) {
|
||||
if ((flags & DAV1D_X86_CPU_FLAG_AVX2) && (family < 0x19 ||
|
||||
(family == 0x19 && (model < 0x10 || (model >= 0x20 && model < 0x60)))))
|
||||
{
|
||||
/* Excavator, Zen, Zen+, Zen 2, Zen 3, Zen 3+ */
|
||||
flags |= DAV1D_X86_CPU_FLAG_SLOW_GATHER;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return flags;
|
||||
|
|
|
@ -29,12 +29,14 @@
|
|||
#define DAV1D_SRC_X86_CPU_H
|
||||
|
||||
enum CpuFlags {
|
||||
DAV1D_X86_CPU_FLAG_SSE2 = 1 << 0,
|
||||
DAV1D_X86_CPU_FLAG_SSSE3 = 1 << 1,
|
||||
DAV1D_X86_CPU_FLAG_SSE41 = 1 << 2,
|
||||
DAV1D_X86_CPU_FLAG_AVX2 = 1 << 3,
|
||||
DAV1D_X86_CPU_FLAG_AVX512ICL = 1 << 4, /* F/CD/BW/DQ/VL/VNNI/IFMA/VBMI/VBMI2/
|
||||
* VPOPCNTDQ/BITALG/GFNI/VAES/VPCLMULQDQ */
|
||||
DAV1D_X86_CPU_FLAG_SSE2 = 1 << 0,
|
||||
DAV1D_X86_CPU_FLAG_SSSE3 = 1 << 1,
|
||||
DAV1D_X86_CPU_FLAG_SSE41 = 1 << 2,
|
||||
DAV1D_X86_CPU_FLAG_AVX2 = 1 << 3,
|
||||
DAV1D_X86_CPU_FLAG_AVX512ICL = 1 << 4, /* F/CD/BW/DQ/VL/VNNI/IFMA/VBMI/VBMI2/
|
||||
* VPOPCNTDQ/BITALG/GFNI/VAES/VPCLMULQDQ */
|
||||
DAV1D_X86_CPU_FLAG_SLOW_GATHER = 1 << 5, /* Flag CPUs where gather instructions are slow enough
|
||||
* to cause performance regressions. */
|
||||
};
|
||||
|
||||
unsigned dav1d_get_cpu_flags_x86(void);
|
||||
|
|
|
@ -38,8 +38,8 @@ cglobal cpu_cpuid, 0, 5, 0, regs, leaf, subleaf
|
|||
cpuid
|
||||
mov [r4+4*0], eax
|
||||
mov [r4+4*1], ebx
|
||||
mov [r4+4*2], ecx
|
||||
mov [r4+4*3], edx
|
||||
mov [r4+4*2], edx
|
||||
mov [r4+4*3], ecx
|
||||
%if ARCH_X86_64
|
||||
mov rbx, r5
|
||||
%endif
|
||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -65,10 +65,13 @@ COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c
|
|||
|
||||
c->generate_grain_y = BF(dav1d_generate_grain_y, avx2);
|
||||
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, avx2);
|
||||
c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx2);
|
||||
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx2);
|
||||
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, avx2);
|
||||
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, avx2);
|
||||
|
||||
if (flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER) return;
|
||||
|
||||
c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx2);
|
||||
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx2);
|
||||
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx2);
|
||||
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx2);
|
||||
#endif
|
||||
|
|
|
@ -313,7 +313,7 @@ ALIGN function_align
|
|||
%endmacro
|
||||
|
||||
%macro INV_TXFM_4X4_12BPC_FN 2 ; type1, type2
|
||||
INV_TXFM_FN %1, %2, 0, 4x4, 12
|
||||
INV_TXFM_FN %1, %2, 0, 4x4, 12
|
||||
%ifidn %1_%2, dct_dct
|
||||
imul r6d, [cq], 181
|
||||
mov [cq], eobd ; 0
|
||||
|
@ -340,21 +340,20 @@ ALIGN function_align
|
|||
%macro IDCT4_1D_PACKED_WORD 6 ; dst/src[1-2], tmp[1-3], rnd
|
||||
vpbroadcastd m%5, [pw_m3784_1567]
|
||||
punpckhwd m%3, m%2, m%1
|
||||
psubw m%4, m%1, m%2
|
||||
paddw m%1, m%2
|
||||
vpbroadcastd m%2, [pw_1567_3784]
|
||||
punpcklqdq m%1, m%4
|
||||
vpbroadcastd m%4, [pw_2896x8]
|
||||
vpbroadcastd m%4, [pw_1567_3784]
|
||||
punpcklwd m%2, m%1
|
||||
vpbroadcastd m%1, [pw_m2896_2896]
|
||||
pmaddwd m%5, m%3
|
||||
pmaddwd m%3, m%2
|
||||
pmulhrsw m%1, m%4 ; t0 t1
|
||||
paddd m%5, m%6
|
||||
paddd m%3, m%6
|
||||
psrad m%5, 12
|
||||
psrad m%3, 12
|
||||
pmaddwd m%3, m%4
|
||||
vpbroadcastd m%4, [pw_2896_2896]
|
||||
pmaddwd m%1, m%2
|
||||
pmaddwd m%2, m%4
|
||||
REPX {paddd x, m%6}, m%5, m%3, m%1, m%2
|
||||
REPX {psrad x, 12 }, m%5, m%3, m%1, m%2
|
||||
packssdw m%3, m%5 ; t3 t2
|
||||
psubsw m%2, m%1, m%3 ; out3 out2
|
||||
paddsw m%1, m%3 ; out0 out1
|
||||
packssdw m%2, m%1 ; t0 t1
|
||||
paddsw m%1, m%2, m%3 ; out0 out1
|
||||
psubsw m%2, m%3 ; out3 out2
|
||||
%endmacro
|
||||
|
||||
INV_TXFM_4X4_FN dct, dct
|
||||
|
@ -2581,6 +2580,33 @@ cglobal iidentity_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
|
|||
lea dstq, [dstq+strideq*2]
|
||||
ret
|
||||
|
||||
%macro TRANSPOSE_8X8_DWORD 12 ; src/dst[1-8], tmp[1-4]
|
||||
punpckldq m%9, m%1, m%2 ; aibj emfn
|
||||
punpckhdq m%1, m%2 ; ckdl gohp
|
||||
punpckldq m%10, m%3, m%4 ; qyrz uCvD
|
||||
punpckhdq m%3, m%4 ; sAtB wExF
|
||||
punpckldq m%11, m%5, m%6 ; GOHP KSLT
|
||||
punpckhdq m%5, m%6 ; IQJR MUNV
|
||||
punpckldq m%12, m%7, m%8 ; WeXf aibj
|
||||
punpckhdq m%7, m%8 ; YgZh ckdl
|
||||
punpcklqdq m%2, m%9, m%10 ; aiqy emuC
|
||||
punpckhqdq m%9, m%10 ; bjrz fnvD
|
||||
punpcklqdq m%4, m%1, m%3 ; cksA gowE
|
||||
punpckhqdq m%10, m%1, m%3 ; dltB hpxF
|
||||
punpcklqdq m%6, m%11, m%12 ; GOWe KSai
|
||||
punpckhqdq m%11, m%12 ; HPXf LTbj
|
||||
punpcklqdq m%8, m%5, m%7 ; IQYg MUck
|
||||
punpckhqdq m%12, m%5, m%7 ; JRZh NVdl
|
||||
vperm2i128 m%1, m%2, m%6, 0x20 ; out0
|
||||
vperm2i128 m%5, m%2, m%6, 0x31 ; out4
|
||||
vperm2i128 m%2, m%9, m%11, 0x20 ; out1
|
||||
vperm2i128 m%6, m%9, m%11, 0x31 ; out5
|
||||
vperm2i128 m%3, m%4, m%8, 0x20 ; out2
|
||||
vperm2i128 m%7, m%4, m%8, 0x31 ; out6
|
||||
vperm2i128 m%4, m%10, m%12, 0x20 ; out3
|
||||
vperm2i128 m%8, m%10, m%12, 0x31 ; out7
|
||||
%endmacro
|
||||
|
||||
INV_TXFM_8X8_FN dct, dct, 12
|
||||
INV_TXFM_8X8_FN dct, identity, 12
|
||||
INV_TXFM_8X8_FN dct, adst, 12
|
||||
|
@ -2608,30 +2634,7 @@ ALIGN function_align
|
|||
ret
|
||||
ALIGN function_align
|
||||
.transpose_8x8:
|
||||
punpckldq m8, m0, m1 ; aibj emfn
|
||||
punpckhdq m0, m0, m1 ; ckdl gohp
|
||||
punpckldq m9, m2, m3 ; qyrz uCvD
|
||||
punpckhdq m2, m2, m3 ; sAtB wExF
|
||||
punpckldq m10, m4, m5 ; GOHP KSLT
|
||||
punpckhdq m4, m4, m5 ; IQJR MUNV
|
||||
punpckldq m11, m6, m7 ; WeXf aibj
|
||||
punpckhdq m6, m6, m7 ; YgZh ckdl
|
||||
punpcklqdq m1, m8, m9 ; aiqy emuC
|
||||
punpckhqdq m8, m8, m9 ; bjrz fnvD
|
||||
punpcklqdq m3, m0, m2 ; cksA gowE
|
||||
punpckhqdq m9, m0, m2 ; dltB hpxF
|
||||
punpcklqdq m5, m10, m11 ; GOWe KSai
|
||||
punpckhqdq m10, m10, m11 ; HPXf LTbj
|
||||
punpcklqdq m7, m4, m6 ; IQYg MUck
|
||||
punpckhqdq m11, m4, m6 ; JRZh NVdl
|
||||
vperm2i128 m0, m1, m5, 0x20 ; out0
|
||||
vperm2i128 m4, m1, m5, 0x31 ; out4
|
||||
vperm2i128 m1, m8, m10, 0x20 ; out1
|
||||
vperm2i128 m5, m8, m10, 0x31 ; out5
|
||||
vperm2i128 m2, m3, m7, 0x20 ; out2
|
||||
vperm2i128 m6, m3, m7, 0x31 ; out6
|
||||
vperm2i128 m3, m9, m11, 0x20 ; out3
|
||||
vperm2i128 m7, m9, m11, 0x31 ; out7
|
||||
TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
|
||||
ret
|
||||
ALIGN function_align
|
||||
.round_shift4:
|
||||
|
@ -3336,6 +3339,21 @@ INV_TXFM_8X16_FN identity, identity, 0, 12
|
|||
cglobal iidentity_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
|
||||
jmp m(iidentity_8x16_internal_10bpc).pass1
|
||||
.pass2:
|
||||
call .pass2_main
|
||||
packssdw m0, m8
|
||||
packssdw m1, m9
|
||||
packssdw m2, m10
|
||||
packssdw m3, m11
|
||||
packssdw m4, m12
|
||||
packssdw m5, m13
|
||||
packssdw m6, m14
|
||||
packssdw m13, m7, m15
|
||||
vpbroadcastd m7, [pixel_12bpc_max]
|
||||
vpbroadcastd m12, [pw_16384]
|
||||
call m(iidentity_8x16_internal_10bpc).pass2_end
|
||||
RET
|
||||
ALIGN function_align
|
||||
.pass2_main:
|
||||
mova [cq], m7
|
||||
vpbroadcastd m7, [clip_18b_min]
|
||||
REPX {pmaxsd x, m7}, m0, m1, m2, m3, m4, m5, m6, \
|
||||
|
@ -3358,18 +3376,7 @@ cglobal iidentity_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
|
|||
paddd m15, [cq]
|
||||
REPX {psrad x, 15}, m0, m1, m2, m3, m4, m5, m6, m7, \
|
||||
m8, m9, m10, m11, m12, m13, m14, m15
|
||||
packssdw m0, m8
|
||||
packssdw m1, m9
|
||||
packssdw m2, m10
|
||||
packssdw m3, m11
|
||||
packssdw m4, m12
|
||||
packssdw m5, m13
|
||||
packssdw m6, m14
|
||||
packssdw m13, m7, m15
|
||||
vpbroadcastd m7, [pixel_12bpc_max]
|
||||
vpbroadcastd m12, [pw_16384]
|
||||
call m(iidentity_8x16_internal_10bpc).pass2_end
|
||||
RET
|
||||
ret
|
||||
|
||||
%macro INV_TXFM_16X4_FN 2-3 10 ; type1, type2, bitdepth
|
||||
INV_TXFM_FN %1, %2, 0, 16x4, %3
|
||||
|
@ -4481,15 +4488,15 @@ cglobal iidentity_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
|
|||
call m(idct_16x8_internal_10bpc).write_16x4_zero
|
||||
jmp m(idct_16x8_internal_10bpc).end2
|
||||
|
||||
%macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset
|
||||
INV_TXFM_FN %1, %2, %3, 16x16
|
||||
%macro INV_TXFM_16X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth
|
||||
INV_TXFM_FN %1, %2, %3, 16x16, %4
|
||||
%ifidn %1_%2, dct_dct
|
||||
imul r6d, [cq], 2896
|
||||
mov [cq], eobd ; 0
|
||||
mov r3d, 16
|
||||
add r6d, 10240
|
||||
sar r6d, 14
|
||||
jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2
|
||||
jmp m(inv_txfm_add_dct_dct_16x4_%4bpc).dconly2
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
|
@ -4499,9 +4506,10 @@ INV_TXFM_16X16_FN dct, adst
|
|||
INV_TXFM_16X16_FN dct, flipadst
|
||||
|
||||
cglobal idct_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
|
||||
vpbroadcastd m11, [pd_2048]
|
||||
vpbroadcastd m12, [clip_18b_min]
|
||||
vpbroadcastd m13, [clip_18b_max]
|
||||
.pass1:
|
||||
vpbroadcastd m11, [pd_2048]
|
||||
vpbroadcastd m14, [pd_2896]
|
||||
lea r6, [rsp+32*4]
|
||||
sub eobd, 36
|
||||
|
@ -4605,6 +4613,7 @@ ALIGN function_align
|
|||
pmulhrsw m2, m12
|
||||
pmulhrsw m3, m12
|
||||
call m(idct_16x8_internal_10bpc).write_16x4_start
|
||||
.write_16x16_2:
|
||||
pmulhrsw m0, m12, m4
|
||||
pmulhrsw m1, m12, m5
|
||||
pmulhrsw m2, m12, m6
|
||||
|
@ -4747,6 +4756,7 @@ INV_TXFM_16X16_FN adst, flipadst
|
|||
cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
|
||||
vpbroadcastd m13, [clip_18b_min]
|
||||
vpbroadcastd m14, [clip_18b_max]
|
||||
.pass1:
|
||||
vpbroadcastd m15, [pd_2896]
|
||||
lea r6, [rsp+32*4]
|
||||
sub eobd, 36
|
||||
|
@ -4882,6 +4892,7 @@ INV_TXFM_16X16_FN flipadst, flipadst
|
|||
cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
|
||||
vpbroadcastd m13, [clip_18b_min]
|
||||
vpbroadcastd m14, [clip_18b_max]
|
||||
.pass1:
|
||||
vpbroadcastd m15, [pd_2896]
|
||||
lea r6, [rsp+32*4]
|
||||
sub eobd, 36
|
||||
|
@ -4993,6 +5004,7 @@ INV_TXFM_16X16_FN identity, dct, -92
|
|||
INV_TXFM_16X16_FN identity, identity
|
||||
|
||||
cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
|
||||
.pass1:
|
||||
vpbroadcastd m15, [pd_11586]
|
||||
vpbroadcastd m7, [pd_10240]
|
||||
lea r6, [rsp+32*4]
|
||||
|
@ -5056,6 +5068,375 @@ cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx
|
|||
mova m1, [cq+32*1]
|
||||
jmp m(idct_16x16_internal_10bpc).end
|
||||
|
||||
INV_TXFM_16X16_FN dct, dct, 0, 12
|
||||
INV_TXFM_16X16_FN dct, identity, 28, 12
|
||||
INV_TXFM_16X16_FN dct, adst, 0, 12
|
||||
INV_TXFM_16X16_FN dct, flipadst, 0, 12
|
||||
|
||||
cglobal idct_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
|
||||
vpbroadcastd m12, [clip_20b_min]
|
||||
vpbroadcastd m13, [clip_20b_max]
|
||||
jmp m(idct_16x16_internal_10bpc).pass1
|
||||
.pass2:
|
||||
mova [cq+32* 8], m8
|
||||
mova [cq+32* 9], m9
|
||||
mova [cq+32*10], m10
|
||||
mova [cq+32*11], m11
|
||||
mova [cq+32*12], m12
|
||||
mova [cq+32*13], m13
|
||||
mova [cq+32*14], m14
|
||||
mova [cq+32*15], m15
|
||||
call .pass2_main
|
||||
packssdw m0, m1
|
||||
packssdw m1, m2, m3
|
||||
packssdw m2, m4, m5
|
||||
packssdw m3, m6, m7
|
||||
packssdw m4, m8, m9
|
||||
packssdw m5, m10, m11
|
||||
packssdw m6, m12, m13
|
||||
packssdw m7, m14, m15
|
||||
mova [r6-32*4], m0
|
||||
mova [r6-32*3], m1
|
||||
mova [r6-32*2], m2
|
||||
mova [r6-32*1], m3
|
||||
mova [r6+32*0], m4
|
||||
mova [r6+32*1], m5
|
||||
mova [r6+32*2], m6
|
||||
mova [r6+32*3], m7
|
||||
mova m0, [cq+32* 8]
|
||||
mova m1, [cq+32* 9]
|
||||
mova m2, [cq+32*10]
|
||||
mova m3, [cq+32*11]
|
||||
mova m4, [cq+32*12]
|
||||
mova m5, [cq+32*13]
|
||||
mova m6, [cq+32*14]
|
||||
mova m7, [cq+32*15]
|
||||
mov r5, r6
|
||||
add r6, 32*16
|
||||
call .pass2_main
|
||||
jmp m(iadst_16x16_internal_12bpc).end
|
||||
ALIGN function_align
|
||||
.write_16x16:
|
||||
mova [rsp+gprsize+32*0], m8
|
||||
mova [rsp+gprsize+32*1], m9
|
||||
mova [rsp+gprsize+32*2], m12
|
||||
vpbroadcastd m12, [pw_16384]
|
||||
pmulhrsw m0, m12
|
||||
pmulhrsw m1, m12
|
||||
pmulhrsw m2, m12
|
||||
pmulhrsw m3, m12
|
||||
call m(idct_16x8_internal_12bpc).write_16x4_start
|
||||
call m(idct_16x8_internal_10bpc).write_16x4_zero
|
||||
jmp m(idct_16x16_internal_10bpc).write_16x16_2
|
||||
ALIGN function_align
|
||||
.pass2_main:
|
||||
call m(idct_8x8_internal_12bpc).transpose_8x8
|
||||
mova [cq+32* 0], m0
|
||||
mova [cq+32* 1], m2
|
||||
mova [cq+32* 2], m4
|
||||
mova [cq+32* 3], m6
|
||||
vpbroadcastd m12, [clip_18b_min]
|
||||
vpbroadcastd m13, [clip_18b_max]
|
||||
pmaxsd m0, m12, m1
|
||||
pmaxsd m1, m12, m3
|
||||
pmaxsd m2, m12, m5
|
||||
pmaxsd m3, m12, m7
|
||||
REPX {pminsd x, m13}, m0, m1, m2, m3
|
||||
test eobd, eobd
|
||||
jge .pass2_slow
|
||||
pxor m4, m4
|
||||
REPX {mova x, m4}, m5, m6, m7
|
||||
jmp .pass2_fast
|
||||
.pass2_slow:
|
||||
sub r6, 32*8
|
||||
mova m8, [r6-32*4]
|
||||
mova m4, [r6-32*3]
|
||||
mova m10, [r6-32*2]
|
||||
mova m5, [r6-32*1]
|
||||
mova m12, [r6+32*0]
|
||||
mova m6, [r6+32*1]
|
||||
mova m14, [r6+32*2]
|
||||
mova m7, [r6+32*3]
|
||||
TRANSPOSE_8X8_DWORD 8, 4, 10, 5, 12, 6, 14, 7, 9, 11, 13, 15
|
||||
mova [cq+32* 4], m8
|
||||
mova [cq+32* 5], m10
|
||||
mova [cq+32* 6], m12
|
||||
mova [cq+32* 7], m14
|
||||
vpbroadcastd m12, [clip_18b_min]
|
||||
vpbroadcastd m13, [clip_18b_max]
|
||||
REPX {pmaxsd x, m12}, m4, m5, m6, m7
|
||||
REPX {pminsd x, m13}, m4, m5, m6, m7
|
||||
.pass2_fast:
|
||||
vpbroadcastd m11, [pd_2048]
|
||||
vpbroadcastd m14, [pd_2896]
|
||||
call m(idct_8x16_internal_10bpc).main_oddhalf
|
||||
pmaxsd m0, m12, [cq+32* 0]
|
||||
pmaxsd m1, m12, [cq+32* 1]
|
||||
pmaxsd m2, m12, [cq+32* 2]
|
||||
pmaxsd m3, m12, [cq+32* 3]
|
||||
REPX {pminsd x, m13}, m0, m1, m2, m3
|
||||
test eobd, eobd
|
||||
jge .pass2_slow2
|
||||
pxor m4, m4
|
||||
REPX {mova x, m4}, m5, m6, m7
|
||||
jmp .pass2_fast2
|
||||
.pass2_slow2:
|
||||
pmaxsd m4, m12, [cq+32* 4]
|
||||
pmaxsd m5, m12, [cq+32* 5]
|
||||
pmaxsd m6, m12, [cq+32* 6]
|
||||
pmaxsd m7, m12, [cq+32* 7]
|
||||
REPX {pminsd x, m13}, m4, m5, m6, m7
|
||||
.pass2_fast2:
|
||||
call m(idct_8x8_internal_10bpc).main
|
||||
call m(idct_8x16_internal_10bpc).main_evenhalf
|
||||
psrad m11, 8 ; pd_8
|
||||
REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
|
||||
call m(idct_16x8_internal_10bpc).pass1_rotations
|
||||
REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7, \
|
||||
m8, m9, m10, m11, m12, m13, m14, m15
|
||||
ret
|
||||
|
||||
INV_TXFM_16X16_FN adst, dct, 0, 12
|
||||
INV_TXFM_16X16_FN adst, adst, 0, 12
|
||||
INV_TXFM_16X16_FN adst, flipadst, 0, 12
|
||||
|
||||
cglobal iadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
|
||||
vpbroadcastd m13, [clip_20b_min]
|
||||
vpbroadcastd m14, [clip_20b_max]
|
||||
jmp m(iadst_16x16_internal_10bpc).pass1
|
||||
.pass2:
|
||||
call .pass2_part1
|
||||
call m(iadst_16x8_internal_10bpc).pass1_rotations
|
||||
call .pass2_part2
|
||||
call m(iadst_16x8_internal_10bpc).pass1_rotations
|
||||
.pass2_part3:
|
||||
REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
|
||||
REPX {psrad x, 16}, m4, m5, m6, m7, m8, m9, m10, m11
|
||||
.end:
|
||||
packssdw m15, m14
|
||||
packssdw m14, m13, m12
|
||||
packssdw m13, m11, m10
|
||||
packssdw m12, m9, m8
|
||||
packssdw m11, m7, m6
|
||||
packssdw m10, m5, m4
|
||||
packssdw m7, m3, m2
|
||||
packssdw m6, m1, m0
|
||||
vpblendd m0, m6, [r5-32*4], 0x33
|
||||
vpblendd m1, m6, [r5-32*4], 0xcc
|
||||
vpblendd m2, m7, [r5-32*3], 0x33
|
||||
vpblendd m3, m7, [r5-32*3], 0xcc
|
||||
vpermq m0, m0, q3120
|
||||
vpermq m1, m1, q2031
|
||||
vpermq m2, m2, q3120
|
||||
vpermq m3, m3, q2031
|
||||
call m(idct_16x8_internal_12bpc).write_16x4_start
|
||||
call m(idct_16x8_internal_10bpc).write_16x4_zero
|
||||
vpblendd m0, m10, [r5-32*2], 0x33
|
||||
vpblendd m1, m10, [r5-32*2], 0xcc
|
||||
vpblendd m2, m11, [r5-32*1], 0x33
|
||||
vpblendd m3, m11, [r5-32*1], 0xcc
|
||||
vpermq m0, m0, q3120
|
||||
vpermq m1, m1, q2031
|
||||
vpermq m2, m2, q3120
|
||||
vpermq m3, m3, q2031
|
||||
call m(idct_16x8_internal_10bpc).write_16x4_zero
|
||||
vpblendd m0, m12, [r5+32*0], 0x33
|
||||
vpblendd m1, m12, [r5+32*0], 0xcc
|
||||
vpblendd m2, m13, [r5+32*1], 0x33
|
||||
vpblendd m3, m13, [r5+32*1], 0xcc
|
||||
vpermq m0, m0, q3120
|
||||
vpermq m1, m1, q2031
|
||||
vpermq m2, m2, q3120
|
||||
vpermq m3, m3, q2031
|
||||
call m(idct_16x8_internal_10bpc).write_16x4_zero
|
||||
vpblendd m0, m14, [r5+32*2], 0x33
|
||||
vpblendd m1, m14, [r5+32*2], 0xcc
|
||||
vpblendd m2, m15, [r5+32*3], 0x33
|
||||
vpblendd m3, m15, [r5+32*3], 0xcc
|
||||
vpermq m0, m0, q3120
|
||||
vpermq m1, m1, q2031
|
||||
vpermq m2, m2, q3120
|
||||
vpermq m3, m3, q2031
|
||||
call m(idct_16x8_internal_10bpc).write_16x4_zero
|
||||
RET
|
||||
ALIGN function_align
|
||||
.pass2_part1:
|
||||
mova [cq+32* 8], m8
|
||||
mova [cq+32* 9], m9
|
||||
mova [cq+32*10], m10
|
||||
mova [cq+32*11], m11
|
||||
mova [cq+32*12], m12
|
||||
mova [cq+32*13], m13
|
||||
mova [cq+32*14], m14
|
||||
mova [cq+32*15], m15
|
||||
.pass2_main:
|
||||
call m(idct_8x8_internal_12bpc).transpose_8x8
|
||||
mova [cq+32* 0], m0
|
||||
mova [cq+32* 1], m3
|
||||
mova [cq+32* 2], m4
|
||||
mova [cq+32* 3], m7
|
||||
vpbroadcastd m13, [clip_18b_min]
|
||||
vpbroadcastd m14, [clip_18b_max]
|
||||
pmaxsd m0, m13, m2
|
||||
pmaxsd m2, m13, m6
|
||||
pmaxsd m5, m13, m5
|
||||
pmaxsd m7, m13, m1
|
||||
REPX {pminsd x, m14}, m0, m2, m5, m7
|
||||
test eobd, eobd
|
||||
jge .pass2_slow
|
||||
pxor m1, m1
|
||||
REPX {mova x, m1}, m3, m4, m6
|
||||
jmp .pass2_fast
|
||||
.pass2_slow:
|
||||
sub r6, 32*8
|
||||
mova m8, [r6-32*4]
|
||||
mova m3, [r6-32*3]
|
||||
mova m4, [r6-32*2]
|
||||
mova m11, [r6-32*1]
|
||||
mova m12, [r6+32*0]
|
||||
mova m1, [r6+32*1]
|
||||
mova m6, [r6+32*2]
|
||||
mova m15, [r6+32*3]
|
||||
TRANSPOSE_8X8_DWORD 8, 3, 4, 11, 12, 1, 6, 15, 13, 9, 10, 14
|
||||
mova [cq+32* 4], m8
|
||||
mova [cq+32* 5], m11
|
||||
mova [cq+32* 6], m12
|
||||
mova [cq+32* 7], m15
|
||||
vpbroadcastd m13, [clip_18b_min]
|
||||
vpbroadcastd m14, [clip_18b_max]
|
||||
REPX {pmaxsd x, m13}, m1, m3, m4, m6
|
||||
REPX {pminsd x, m14}, m1, m3, m4, m6
|
||||
.pass2_fast:
|
||||
vpbroadcastd m12, [pd_2048]
|
||||
vpbroadcastd m15, [pd_2896]
|
||||
call m(iadst_16x8_internal_10bpc).main_part1
|
||||
pmaxsd m0, m13, [cq+32* 0] ; 0
|
||||
pmaxsd m7, m13, [cq+32* 1] ; 3
|
||||
pmaxsd m2, m13, [cq+32* 2] ; 4
|
||||
pmaxsd m5, m13, [cq+32* 3] ; 7
|
||||
REPX {pminsd x, m14}, m0, m2, m5, m7
|
||||
test eobd, eobd
|
||||
jge .pass2_slow2
|
||||
pxor m1, m1
|
||||
REPX {mova x, m1}, m3, m4, m6
|
||||
jmp .pass2_fast2
|
||||
.pass2_slow2:
|
||||
pmaxsd m4, m13, [cq+32* 4] ; 8
|
||||
pmaxsd m3, m13, [cq+32* 5] ; 11
|
||||
pmaxsd m6, m13, [cq+32* 6] ; 12
|
||||
pmaxsd m1, m13, [cq+32* 7] ; 15
|
||||
REPX {pminsd x, m14}, m1, m3, m4, m6
|
||||
.pass2_fast2:
|
||||
call m(iadst_16x8_internal_10bpc).main_part2
|
||||
vpbroadcastd m14, [pd_34816]
|
||||
psrld m15, 11 ; pd_1
|
||||
psubd m13, m14, m15 ; pd_34815
|
||||
pslld m15, 3 ; pd_8
|
||||
ret
|
||||
ALIGN function_align
|
||||
.pass2_part2:
|
||||
REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
|
||||
REPX {psrad x, 16}, m4, m5, m6, m7, m8, m9, m10, m11
|
||||
packssdw m0, m1
|
||||
packssdw m1, m2, m3
|
||||
packssdw m2, m4, m5
|
||||
packssdw m3, m6, m7
|
||||
packssdw m4, m8, m9
|
||||
packssdw m5, m10, m11
|
||||
packssdw m6, m12, m13
|
||||
packssdw m7, m14, m15
|
||||
mova [r6-32*4], m0
|
||||
mova [r6-32*3], m1
|
||||
mova [r6-32*2], m2
|
||||
mova [r6-32*1], m3
|
||||
mova [r6+32*0], m4
|
||||
mova [r6+32*1], m5
|
||||
mova [r6+32*2], m6
|
||||
mova [r6+32*3], m7
|
||||
mova m0, [cq+32* 8]
|
||||
mova m1, [cq+32* 9]
|
||||
mova m2, [cq+32*10]
|
||||
mova m3, [cq+32*11]
|
||||
mova m4, [cq+32*12]
|
||||
mova m5, [cq+32*13]
|
||||
mova m6, [cq+32*14]
|
||||
mova m7, [cq+32*15]
|
||||
mov r5, r6
|
||||
add r6, 32*16
|
||||
jmp .pass2_main
|
||||
|
||||
INV_TXFM_16X16_FN flipadst, dct, 0, 12
|
||||
INV_TXFM_16X16_FN flipadst, adst, 0, 12
|
||||
INV_TXFM_16X16_FN flipadst, flipadst, 0, 12
|
||||
|
||||
cglobal iflipadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
|
||||
vpbroadcastd m13, [clip_20b_min]
|
||||
vpbroadcastd m14, [clip_20b_max]
|
||||
jmp m(iflipadst_16x16_internal_10bpc).pass1
|
||||
.pass2:
|
||||
call m(iadst_16x16_internal_12bpc).pass2_part1
|
||||
call m(iflipadst_16x8_internal_10bpc).pass1_rotations
|
||||
call m(iadst_16x16_internal_12bpc).pass2_part2
|
||||
call m(iflipadst_16x8_internal_10bpc).pass1_rotations
|
||||
jmp m(iadst_16x16_internal_12bpc).pass2_part3
|
||||
|
||||
INV_TXFM_16X16_FN identity, dct, -92, 12
|
||||
INV_TXFM_16X16_FN identity, identity, 0, 12
|
||||
|
||||
cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
|
||||
jmp m(iidentity_16x16_internal_10bpc).pass1
|
||||
.pass2:
|
||||
call m(iidentity_8x16_internal_12bpc).pass2_main
|
||||
call m(idct_16x16_internal_10bpc).transpose_fast
|
||||
test eobd, eobd
|
||||
jl .pass2_fast
|
||||
mova [cq+32* 8], m0
|
||||
mova [cq+32* 9], m1
|
||||
mova [cq+32*10], m2
|
||||
mova [cq+32*11], m3
|
||||
mova [cq+32*12], m4
|
||||
mova [cq+32*13], m5
|
||||
mova [cq+32*14], m6
|
||||
mova [cq+32*15], m7
|
||||
mova m8, [r6-32*4]
|
||||
mova m9, [r6-32*3]
|
||||
mova m10, [r6-32*2]
|
||||
mova m11, [r6-32*1]
|
||||
mova m12, [r6+32*0]
|
||||
mova m13, [r6+32*1]
|
||||
mova m14, [r6+32*2]
|
||||
mova m15, [r6+32*3]
|
||||
sub r6, 32*8
|
||||
mova m0, [r6-32*4]
|
||||
mova m1, [r6-32*3]
|
||||
mova m2, [r6-32*2]
|
||||
mova m3, [r6-32*1]
|
||||
mova m4, [r6+32*0]
|
||||
mova m5, [r6+32*1]
|
||||
mova m6, [r6+32*2]
|
||||
mova m7, [r6+32*3]
|
||||
call m(iidentity_8x16_internal_12bpc).pass2_main
|
||||
call m(idct_16x8_internal_10bpc).transpose2
|
||||
mova m8, m0
|
||||
mova m9, m1
|
||||
mova m10, m2
|
||||
mova m11, m3
|
||||
mova m12, m4
|
||||
mova m13, m5
|
||||
mova m14, m6
|
||||
mova m15, m7
|
||||
mova m0, [cq+32* 8]
|
||||
mova m1, [cq+32* 9]
|
||||
mova m2, [cq+32*10]
|
||||
mova m3, [cq+32*11]
|
||||
mova m4, [cq+32*12]
|
||||
mova m5, [cq+32*13]
|
||||
mova m6, [cq+32*14]
|
||||
mova m7, [cq+32*15]
|
||||
.pass2_fast:
|
||||
call m(idct_16x16_internal_12bpc).write_16x16
|
||||
RET
|
||||
|
||||
%macro IDCT32_END 6 ; in/out1, out2, tmp[1-3], shift
|
||||
mova m%4, [r6+32*(%1-4)]
|
||||
mova m%2, [r5+32*(3-%1)]
|
||||
|
|
|
@ -101,6 +101,8 @@ pixel_10bpc_max: times 8 dw 0x03ff
|
|||
|
||||
pw_1567_3784: times 4 dw 1567, 3784
|
||||
pw_m3784_1567: times 4 dw -3784, 1567
|
||||
pw_2896_2896: times 4 dw 2896, 2896
|
||||
pw_m2896_2896: times 4 dw -2896, 2896
|
||||
|
||||
clip_18b_min: times 4 dd -0x20000
|
||||
clip_18b_max: times 4 dd 0x1ffff
|
||||
|
@ -429,22 +431,19 @@ cglobal idct_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
|
|||
; m0 = in0 in1
|
||||
; m1 = in2 in3
|
||||
; m5 = pd_2048
|
||||
mova m4, [o(pw_m3784_1567)]
|
||||
punpckhwd m2, m1, m0
|
||||
psubw m3, m0, m1
|
||||
paddw m0, m1
|
||||
punpcklqdq m0, m3
|
||||
pmaddwd m4, m2
|
||||
punpcklwd m1, m0
|
||||
pmaddwd m4, m2, [o(pw_m3784_1567)]
|
||||
pmaddwd m2, [o(pw_1567_3784)]
|
||||
pmulhrsw m0, [o(pw_2896x8)] ; t0 t1
|
||||
paddd m4, m5
|
||||
paddd m2, m5
|
||||
psrad m4, 12
|
||||
psrad m2, 12
|
||||
packssdw m2, m4 ; t3 t2
|
||||
psubsw m1, m0, m2 ; tmp3 tmp2
|
||||
paddsw m0, m2 ; tmp0 tmp1
|
||||
pmaddwd m0, m1, [o(pw_m2896_2896)]
|
||||
pmaddwd m1, [o(pw_2896_2896)]
|
||||
REPX {paddd x, m5}, m4, m2, m0, m1
|
||||
packssdw m5, m5 ; pw_2048
|
||||
REPX {psrad x, 12}, m4, m2, m0, m1
|
||||
packssdw m2, m4 ; t3 t2
|
||||
packssdw m1, m0 ; t0 t1
|
||||
paddsw m0, m1, m2 ; out0 out1
|
||||
psubsw m1, m2 ; out3 out2
|
||||
pmulhrsw m0, m5
|
||||
pmulhrsw m1, m5
|
||||
movq m2, [dstq+strideq*0]
|
||||
|
|
|
@ -273,6 +273,7 @@ COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c,
|
|||
assign_itx16_bpc_fn(R, 8, 16, 12, avx2);
|
||||
assign_itx16_bpc_fn(R, 16, 4, 12, avx2);
|
||||
assign_itx16_bpc_fn(R, 16, 8, 12, avx2);
|
||||
assign_itx12_bpc_fn( , 16, 16, 12, avx2);
|
||||
#endif
|
||||
|
||||
if (bpc > 10) return;
|
||||
|
|
|
@ -3017,11 +3017,11 @@ cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx,
|
|||
%endif
|
||||
dec hd
|
||||
jz .ret
|
||||
mova xm8, [rsp+0x00]
|
||||
movd xm9, [rsp+0x30]
|
||||
add myd, dyd
|
||||
test myd, ~0x3ff
|
||||
jz .w4_loop
|
||||
mova xm8, [rsp+0x00]
|
||||
movd xm9, [rsp+0x30]
|
||||
movu xm4, [srcq]
|
||||
movu xm5, [srcq+r4]
|
||||
test myd, 0x400
|
||||
|
@ -5789,7 +5789,7 @@ cglobal resize_16bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \
|
|||
vpbroadcastd m5, dxm
|
||||
vpbroadcastd m8, mx0m
|
||||
vpbroadcastd m6, src_wm
|
||||
DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr, _, pxmax
|
||||
DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax
|
||||
LEA r7, $$
|
||||
%define base r7-$$
|
||||
vpbroadcastd m3, [base+pd_64]
|
||||
|
|
|
@ -131,6 +131,16 @@ warp8x8_permC: db -1, 0, -1, 1, -1, 8, -1, 9, -1, 4, -1, 5, -1, 12, -1, 13
|
|||
warp8x8_permD: db -1, 2, -1, 3, -1, 10, -1, 11, -1, 6, -1, 7, -1, 14, -1, 15
|
||||
warp_shift_h: db 11, 19, 11, 19, 43, 51, 43, 51, 13, 21, 13, 21, 45, 53, 45, 53
|
||||
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
|
||||
resize_permA: dd 0, 4, 8, 12, 1, 5, 9, 13, 16, 20, 24, 28, 17, 21, 25, 29
|
||||
resize_permB: dd 2, 6, 10, 14, 3, 7, 11, 15, 18, 22, 26, 30, 19, 23, 27, 31
|
||||
resize_permC: dq 0, 1, 4, 5, 8, 9, 12, 13
|
||||
resize_permD: dq 2, 3, 6, 7, 10, 11, 14, 15
|
||||
resize_permE: dq 0, 2, 4, 6
|
||||
resize_shufA: db -1, 0, -1, 1, -1, 4, -1, 5, -1, 8, -1, 9, -1, 12, -1, 13
|
||||
resize_shufB: db -1, 2, -1, 3, -1, 6, -1, 7, -1, 10, -1, 11, -1, 14, -1, 15
|
||||
rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
||||
resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7
|
||||
db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
|
||||
|
||||
prep_hv_shift: dq 6, 4
|
||||
put_bilin_h_rnd: dw 8, 8, 10, 10
|
||||
|
@ -151,9 +161,12 @@ pw_m512: times 2 dw -512
|
|||
pw_2: times 2 dw 2
|
||||
pw_64: times 2 dw 64
|
||||
pd_32: dd 32
|
||||
pd_63: dd 63
|
||||
pd_128: dd 128
|
||||
pd_640: dd 640
|
||||
pd_2176: dd 2176
|
||||
pd_16384: dd 16384
|
||||
pd_0_4: dd 0, 4
|
||||
|
||||
%define pw_16 prep_mul
|
||||
%define pd_512 warp_8x8_rnd_h
|
||||
|
@ -237,6 +250,7 @@ cextern mc_subpel_filters
|
|||
|
||||
cextern mc_warp_filter
|
||||
cextern obmc_masks_avx2
|
||||
cextern resize_filter
|
||||
|
||||
SECTION .text
|
||||
|
||||
|
@ -4708,4 +4722,145 @@ cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, mask
|
|||
jl .w128
|
||||
RET
|
||||
|
||||
cglobal resize_16bpc, 6, 12, 32, dst, dst_stride, src, src_stride, \
|
||||
dst_w, h, src_w, dx, mx0, pxmax
|
||||
sub dword mx0m, 4<<14
|
||||
sub dword src_wm, 8
|
||||
mov r6, ~0
|
||||
vpbroadcastd m5, dxm
|
||||
vpbroadcastd m8, mx0m
|
||||
vpbroadcastd m6, src_wm
|
||||
kmovq k6, r6
|
||||
DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax
|
||||
LEA r7, $$
|
||||
%define base r7-$$
|
||||
vpbroadcastd m3, [base+pd_16384]
|
||||
vpbroadcastd m7, [base+pd_63]
|
||||
mova m24, [base+resize_permA]
|
||||
mova m25, [base+resize_permB]
|
||||
mova m26, [base+resize_permC]
|
||||
mova m27, [base+resize_permD]
|
||||
vbroadcasti32x4 m28, [base+resize_shufA]
|
||||
vbroadcasti32x4 m29, [base+resize_shufB]
|
||||
mova m30, [base+resize_permE]
|
||||
vpbroadcastw ym31, pxmaxm
|
||||
vpdpwssd m8, m5, [base+rescale_mul] ; mx+dx*[0-15]
|
||||
pslld m5, 4 ; dx*16
|
||||
pslld m6, 14
|
||||
pxor m2, m2
|
||||
.loop_y:
|
||||
xor xd, xd
|
||||
mova m4, m8 ; per-line working version of mx
|
||||
.loop_x:
|
||||
pmaxsd m0, m4, m2
|
||||
psrad m9, m4, 8 ; filter offset (unmasked)
|
||||
pminsd m0, m6 ; iclip(mx, 0, src_w-8)
|
||||
psubd m1, m4, m0 ; pshufb offset
|
||||
psrad m0, 14 ; clipped src_x offset
|
||||
psrad m1, 14 ; pshufb edge_emu offset
|
||||
vptestmd k5, m1, m1
|
||||
pand m9, m7 ; filter offset (masked)
|
||||
ktestw k5, k5
|
||||
jz .load
|
||||
vpbroadcastq m14, [base+pd_0_4]
|
||||
vpermq m10, m0, q1100
|
||||
vpermq m11, m0, q3322
|
||||
vpermq m20, m1, q1100
|
||||
vpermq m21, m1, q3322
|
||||
punpckldq m10, m10
|
||||
punpckldq m11, m11
|
||||
punpckldq m20, m20
|
||||
punpckldq m21, m21
|
||||
paddd m10, m14
|
||||
paddd m11, m14
|
||||
paddd m20, m14
|
||||
paddd m21, m14
|
||||
vextracti32x8 ym12, m10, 1
|
||||
vextracti32x8 ym13, m11, 1
|
||||
vextracti32x8 ym22, m20, 1
|
||||
vextracti32x8 ym23, m21, 1
|
||||
kmovq k1, k6
|
||||
kmovq k2, k6
|
||||
kmovq k3, k6
|
||||
kmovq k4, k6
|
||||
vpgatherdq m16{k1}, [srcq+ym10*2] ; 0 1 2 3
|
||||
vpgatherdq m17{k2}, [srcq+ym11*2] ; 4 5 6 7
|
||||
vpgatherdq m18{k3}, [srcq+ym12*2] ; 8 9 A B
|
||||
vpgatherdq m19{k4}, [srcq+ym13*2] ; C D E F
|
||||
kmovq k1, k6
|
||||
kmovq k2, k6
|
||||
kmovq k3, k6
|
||||
kmovq k4, k6
|
||||
vpgatherdq m0{k1}, [base+resize_shuf+8+ym20*2]
|
||||
vpgatherdq m1{k2}, [base+resize_shuf+8+ym21*2]
|
||||
vpgatherdq m14{k3}, [base+resize_shuf+8+ym22*2]
|
||||
vpgatherdq m15{k4}, [base+resize_shuf+8+ym23*2]
|
||||
pshufb m16, m0
|
||||
pshufb m17, m1
|
||||
pshufb m18, m14
|
||||
pshufb m19, m15
|
||||
mova m20, m24
|
||||
mova m22, m24
|
||||
mova m21, m25
|
||||
mova m23, m25
|
||||
vpermi2d m20, m16, m17 ; 0-3a 0-3b 4-7a 4-7b
|
||||
vpermi2d m21, m16, m17 ; 0-3c 0-3d 4-7c 4-7d
|
||||
vpermi2d m22, m18, m19 ; 8-Ba 8-Bb C-Fa C-Fb
|
||||
vpermi2d m23, m18, m19 ; 8-Bc 8-Bd C-Fc C-Fd
|
||||
mova m15, m26
|
||||
mova m17, m26
|
||||
mova m16, m27
|
||||
mova m18, m27
|
||||
vpermi2q m15, m20, m22 ; 0-3a 4-7a 8-Ba C-Fa
|
||||
vpermi2q m16, m20, m22 ; 0-3b 4-7b 8-Bb C-Fb
|
||||
vpermi2q m17, m21, m23 ; 0-3c 4-7c 8-Bc C-Fc
|
||||
vpermi2q m18, m21, m23 ; 0-3d 4-7d 8-Bd C-Fd
|
||||
kmovq k1, k6
|
||||
kmovq k2, k6
|
||||
vpgatherdd m11{k1}, [base+resize_filter+m9*8+0]
|
||||
vpgatherdd m13{k2}, [base+resize_filter+m9*8+4]
|
||||
pshufb m10, m11, m28
|
||||
pshufb m11, m11, m29
|
||||
pshufb m12, m13, m28
|
||||
pshufb m13, m13, m29
|
||||
jmp .filter
|
||||
.load:
|
||||
kmovq k1, k6
|
||||
kmovq k2, k6
|
||||
kmovq k3, k6
|
||||
kmovq k4, k6
|
||||
vpgatherdd m11{k1}, [base+resize_filter+m9*8+0]
|
||||
vpgatherdd m13{k2}, [base+resize_filter+m9*8+4]
|
||||
pshufb m10, m11, m28
|
||||
pshufb m11, m11, m29
|
||||
pshufb m12, m13, m28
|
||||
pshufb m13, m13, m29
|
||||
vpgatherdd m15{k3}, [srcq+m0*2+ 0]
|
||||
vpgatherdd m16{k4}, [srcq+m0*2+ 4]
|
||||
kmovq k1, k6
|
||||
kmovq k2, k6
|
||||
vpgatherdd m17{k1}, [srcq+m0*2+ 8]
|
||||
vpgatherdd m18{k2}, [srcq+m0*2+12]
|
||||
.filter:
|
||||
mova m14, m2
|
||||
vpdpwssd m14, m15, m10
|
||||
vpdpwssd m14, m16, m11
|
||||
vpdpwssd m14, m17, m12
|
||||
vpdpwssd m14, m18, m13
|
||||
psubd m14, m3, m14
|
||||
psrad m14, 15
|
||||
packusdw m14, m14
|
||||
vpermq m14, m30, m14
|
||||
pminsw ym14, ym31
|
||||
mova [dstq+xq*2], ym14
|
||||
paddd m4, m5
|
||||
add xd, 16
|
||||
cmp xd, dst_wd
|
||||
jl .loop_x
|
||||
add dstq, dst_strideq
|
||||
add srcq, src_strideq
|
||||
dec hd
|
||||
jg .loop_y
|
||||
RET
|
||||
|
||||
%endif ; ARCH_X86_64
|
||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -5046,11 +5046,11 @@ cglobal resize_8bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \
|
|||
vpbroadcastd m8, mx0m
|
||||
vpbroadcastd m6, src_wm
|
||||
|
||||
DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr
|
||||
DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
|
||||
LEA r7, $$
|
||||
%define base r7-$$
|
||||
|
||||
vpbroadcastd m3, [base+pw_m256]
|
||||
vpbroadcastd xm3, [base+pw_m256]
|
||||
vpbroadcastd m7, [base+pd_63]
|
||||
vbroadcasti128 m15, [base+pb_8x0_8x8]
|
||||
pmaddwd m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7]
|
||||
|
|
|
@ -193,29 +193,39 @@ bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 1
|
|||
bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
|
||||
bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7
|
||||
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
|
||||
rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
||||
resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7
|
||||
resize_permA: dd 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
|
||||
resize_permB: dd 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
|
||||
resize_permC: dd 0, 4, 8, 12
|
||||
pb_02461357: db 0, 2, 4, 6, 1, 3, 5, 7
|
||||
|
||||
wm_420_perm64: dq 0xfedcba9876543210
|
||||
wm_sign: dd 0x40804080, 0xc0c0c0c0, 0x40404040
|
||||
|
||||
pb_127: times 4 db 127
|
||||
pw_m128 times 2 dw -128
|
||||
pw_1024: times 2 dw 1024
|
||||
pw_2048: times 2 dw 2048
|
||||
pw_6903: times 2 dw 6903
|
||||
pw_8192: times 2 dw 8192
|
||||
pd_32: dd 32
|
||||
pd_34: dd 34
|
||||
pd_512: dd 512
|
||||
pd_32768: dd 32768
|
||||
pb_8x0_8x8: times 8 db 0
|
||||
times 8 db 8
|
||||
pb_127: times 4 db 127
|
||||
pw_m128 times 2 dw -128
|
||||
pw_m256: times 2 dw -256
|
||||
pw_1024: times 2 dw 1024
|
||||
pw_2048: times 2 dw 2048
|
||||
pw_6903: times 2 dw 6903
|
||||
pw_8192: times 2 dw 8192
|
||||
pd_32: dd 32
|
||||
pd_34: dd 34
|
||||
pd_63: dd 63
|
||||
pd_512: dd 512
|
||||
pd_32768: dd 32768
|
||||
|
||||
%define pb_m64 (wm_sign+4)
|
||||
%define pb_64 (wm_sign+8)
|
||||
%define pd_2 (pd_0to7+8)
|
||||
|
||||
cextern mc_subpel_filters
|
||||
cextern mc_warp_filter
|
||||
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
|
||||
cextern mc_warp_filter
|
||||
cextern resize_filter
|
||||
|
||||
%macro BASE_JMP_TABLE 3-*
|
||||
%xdefine %1_%2_table (%%table - %3)
|
||||
|
@ -4450,4 +4460,87 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
|
|||
jl .w128
|
||||
RET
|
||||
|
||||
cglobal resize_8bpc, 6, 12, 19, dst, dst_stride, src, src_stride, \
|
||||
dst_w, h, src_w, dx, mx0
|
||||
sub dword mx0m, 4<<14
|
||||
sub dword src_wm, 8
|
||||
mov r6, ~0
|
||||
vpbroadcastd m5, dxm
|
||||
vpbroadcastd m8, mx0m
|
||||
vpbroadcastd m6, src_wm
|
||||
kmovq k3, r6
|
||||
DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
|
||||
LEA r7, $$
|
||||
%define base r7-$$
|
||||
vpbroadcastd m3, [base+pw_m256]
|
||||
vpbroadcastd m7, [base+pd_63]
|
||||
vbroadcasti32x4 m15, [base+pb_8x0_8x8]
|
||||
vpdpwssd m8, m5, [base+rescale_mul] ; mx+dx*[0-15]
|
||||
pslld m5, 4 ; dx*16
|
||||
pslld m6, 14
|
||||
pxor m2, m2
|
||||
mova m16, [base+resize_permA]
|
||||
mova m17, [base+resize_permB]
|
||||
mova xm18, [base+resize_permC]
|
||||
.loop_y:
|
||||
xor xd, xd
|
||||
mova m4, m8 ; per-line working version of mx
|
||||
.loop_x:
|
||||
pmaxsd m0, m4, m2
|
||||
psrad m9, m4, 8 ; filter offset (unmasked)
|
||||
pminsd m0, m6 ; iclip(mx, 0, src_w-8)
|
||||
psubd m1, m4, m0 ; pshufb offset
|
||||
psrad m0, 14 ; clipped src_x offset
|
||||
psrad m1, 14 ; pshufb edge_emu offset
|
||||
vptestmd k4, m1, m1
|
||||
pand m9, m7 ; filter offset (masked)
|
||||
ktestw k4, k4
|
||||
jz .load
|
||||
vextracti32x8 ym12, m0, 1
|
||||
vextracti32x8 ym13, m1, 1
|
||||
kmovq k1, k3
|
||||
kmovq k2, k3
|
||||
vpgatherdq m10{k1}, [srcq+ym0]
|
||||
vpgatherdq m11{k2}, [srcq+ym12]
|
||||
kmovq k1, k3
|
||||
kmovq k2, k3
|
||||
vpgatherdq m14{k1}, [base+resize_shuf+4+ym1]
|
||||
vpgatherdq m0{k2}, [base+resize_shuf+4+ym13]
|
||||
mova m12, m16
|
||||
mova m13, m17
|
||||
paddb m14, m15
|
||||
paddb m0, m15
|
||||
pshufb m10, m14
|
||||
pshufb m11, m0
|
||||
vpermi2d m12, m10, m11
|
||||
vpermi2d m13, m10, m11
|
||||
jmp .filter
|
||||
.load:
|
||||
kmovq k1, k3
|
||||
kmovq k2, k3
|
||||
vpgatherdd m12{k1}, [srcq+m0+0]
|
||||
vpgatherdd m13{k2}, [srcq+m0+4]
|
||||
.filter:
|
||||
kmovq k1, k3
|
||||
kmovq k2, k3
|
||||
vpgatherdd m10{k1}, [base+resize_filter+m9*8+0]
|
||||
vpgatherdd m11{k2}, [base+resize_filter+m9*8+4]
|
||||
mova m14, m2
|
||||
vpdpbusd m14, m12, m10
|
||||
vpdpbusd m14, m13, m11
|
||||
packssdw m14, m14
|
||||
pmulhrsw m14, m3
|
||||
packuswb m14, m14
|
||||
vpermd m14, m18, m14
|
||||
mova [dstq+xq], xm14
|
||||
paddd m4, m5
|
||||
add xd, 16
|
||||
cmp xd, dst_wd
|
||||
jl .loop_x
|
||||
add dstq, dst_strideq
|
||||
add srcq, src_strideq
|
||||
dec hd
|
||||
jg .loop_y
|
||||
RET
|
||||
|
||||
%endif ; ARCH_X86_64
|
||||
|
|
|
@ -152,7 +152,6 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
|
|||
init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3);
|
||||
init_mct_fn(FILTER_2D_BILINEAR, bilin, ssse3);
|
||||
|
||||
#if BITDEPTH == 8
|
||||
init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3);
|
||||
init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3);
|
||||
init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3);
|
||||
|
@ -174,7 +173,6 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
|
|||
init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, ssse3);
|
||||
init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, ssse3);
|
||||
init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3);
|
||||
#endif
|
||||
|
||||
c->avg = BF(dav1d_avg, ssse3);
|
||||
c->w_avg = BF(dav1d_w_avg, ssse3);
|
||||
|
@ -296,5 +294,6 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
|
|||
c->blend_h = BF(dav1d_blend_h, avx512icl);
|
||||
c->warp8x8 = BF(dav1d_warp_affine_8x8, avx512icl);
|
||||
c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx512icl);
|
||||
c->resize = BF(dav1d_resize, avx512icl);
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -5170,9 +5170,9 @@ INIT_XMM ssse3
|
|||
mova [esp+0x40], m2
|
||||
mova [esp+0x50], m3
|
||||
MC_8TAP_SCALED_H 0x20, 0x140, 0 ; 0-1
|
||||
MC_8TAP_SCALED_H 0x20, 0x160 ; 2-3
|
||||
MC_8TAP_SCALED_H 0x20, 0x180 ; 4-5
|
||||
MC_8TAP_SCALED_H 0x20, 0x1a0 ; 6-7
|
||||
MC_8TAP_SCALED_H 0x20, 0x160 ; 2-3
|
||||
MC_8TAP_SCALED_H 0x20, 0x180 ; 4-5
|
||||
MC_8TAP_SCALED_H 0x20, 0x1a0 ; 6-7
|
||||
mova m5, [esp+0x180]
|
||||
mova m6, [esp+0x190]
|
||||
mova m7, [esp+0x1a0]
|
||||
|
@ -5201,9 +5201,6 @@ INIT_XMM ssse3
|
|||
.vloop:
|
||||
mov r0, r0m
|
||||
mov r5, [esp+0x1f4]
|
||||
%ifidn %1, put
|
||||
mov dsd, dsm
|
||||
%endif
|
||||
and myd, 0x3ff
|
||||
mov mym, myd
|
||||
xor r3, r3
|
||||
|
@ -5244,13 +5241,10 @@ INIT_XMM ssse3
|
|||
%ifidn %1, put
|
||||
packuswb m4, m4
|
||||
movq [dstq], m4
|
||||
add dstq, dsq
|
||||
add dstq, dsm
|
||||
%else
|
||||
mova [tmpq], m4
|
||||
add tmpq, tmp_stridem
|
||||
%endif
|
||||
%if ARCH_X86_32
|
||||
mov r0m, r0
|
||||
%endif
|
||||
dec hd
|
||||
jz .hloop_prep
|
||||
|
@ -5329,6 +5323,7 @@ INIT_XMM ssse3
|
|||
mova [rsp+0x70], m7
|
||||
mova [rsp+0x80], m4
|
||||
%else
|
||||
mov r0m, r0
|
||||
mov myd, mym
|
||||
mov r3, r3m
|
||||
add myd, dym
|
||||
|
@ -5745,7 +5740,7 @@ INIT_XMM ssse3
|
|||
movu m1, [srcq+ssq*0]
|
||||
movu m2, [srcq+ssq*1]
|
||||
movu m3, [srcq+ssq*2]
|
||||
lea srcq, [srcq+ss3q ]
|
||||
add srcq, ss3q
|
||||
punpcklqdq m6, m6
|
||||
SWAP m4, m7
|
||||
pand m7, m11, m8
|
||||
|
@ -6723,7 +6718,7 @@ INIT_XMM ssse3
|
|||
movu m1, [srcq+ssq*0]
|
||||
movu m2, [srcq+ssq*2]
|
||||
movu m3, [srcq+ssq*1]
|
||||
lea srcq, [srcq+ss3q ]
|
||||
add srcq, ss3q
|
||||
punpcklqdq m6, m6
|
||||
SWAP m4, m7
|
||||
pand m7, m11, m8
|
||||
|
@ -6734,7 +6729,7 @@ INIT_XMM ssse3
|
|||
movu m0, [srcq+ssq*0]
|
||||
movu m7, [srcq+ssq*1]
|
||||
movu m6, [srcq+ssq*2]
|
||||
lea srcq, [srcq+ss3q ]
|
||||
add srcq, ss3q
|
||||
pshufb m1, m14
|
||||
pshufb m2, m14
|
||||
pshufb m3, m14
|
||||
|
@ -9409,7 +9404,7 @@ cglobal resize_8bpc, 0, 6, 8, 3 * 16, dst, dst_stride, src, src_stride, \
|
|||
pshufd m5, m5, q0000
|
||||
|
||||
%if ARCH_X86_64
|
||||
DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr
|
||||
DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
|
||||
LEA r7, $$
|
||||
%define base r7-$$
|
||||
%else
|
||||
|
|
|
@ -311,11 +311,12 @@ void checkasm_stack_clobber(uint64_t clobber, ...);
|
|||
#endif
|
||||
|
||||
|
||||
#define ROUND_UP(x,a) (((x)+((a)-1)) & ~((a)-1))
|
||||
#define PIXEL_RECT(name, w, h) \
|
||||
ALIGN_STK_64(pixel, name##_buf, ((h)+32)*((w)+64) + 64,); \
|
||||
ptrdiff_t name##_stride = sizeof(pixel)*((w)+64); \
|
||||
ALIGN_STK_64(pixel, name##_buf, ((h)+32)*(ROUND_UP(w,64)+64) + 64,); \
|
||||
ptrdiff_t name##_stride = sizeof(pixel)*(ROUND_UP(w,64)+64); \
|
||||
(void)name##_stride; \
|
||||
pixel *name = name##_buf + ((w)+64)*16 + 64
|
||||
pixel *name = name##_buf + (ROUND_UP(w,64)+64)*16 + 64
|
||||
|
||||
#define CLEAR_PIXEL_RECT(name) \
|
||||
memset(name##_buf, 0x99, sizeof(name##_buf)) \
|
||||
|
|
|
@ -192,8 +192,8 @@ static void check_cfl_ac(Dav1dIntraPredDSPContext *const c) {
|
|||
}
|
||||
|
||||
static void check_cfl_pred(Dav1dIntraPredDSPContext *const c) {
|
||||
ALIGN_STK_64(pixel, c_dst, 32 * 32,);
|
||||
ALIGN_STK_64(pixel, a_dst, 32 * 32,);
|
||||
PIXEL_RECT(c_dst, 32, 32);
|
||||
PIXEL_RECT(a_dst, 32, 32);
|
||||
ALIGN_STK_64(int16_t, ac, 32 * 32,);
|
||||
ALIGN_STK_64(pixel, topleft_buf, 257,);
|
||||
pixel *const topleft = topleft_buf + 128;
|
||||
|
@ -215,8 +215,6 @@ static void check_cfl_pred(Dav1dIntraPredDSPContext *const c) {
|
|||
const int bitdepth_max = 0xff;
|
||||
#endif
|
||||
|
||||
const ptrdiff_t stride = w * sizeof(pixel);
|
||||
|
||||
int alpha = ((rnd() & 15) + 1) * (1 - (rnd() & 2));
|
||||
|
||||
for (int i = -h * 2; i <= w * 2; i++)
|
||||
|
@ -229,14 +227,17 @@ static void check_cfl_pred(Dav1dIntraPredDSPContext *const c) {
|
|||
for (int i = 0; i < w * h; i++)
|
||||
ac[i] -= luma_avg;
|
||||
|
||||
call_ref(c_dst, stride, topleft, w, h, ac, alpha
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
call_new(a_dst, stride, topleft, w, h, ac, alpha
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
checkasm_check_pixel(c_dst, stride, a_dst, stride,
|
||||
w, h, "dst");
|
||||
CLEAR_PIXEL_RECT(c_dst);
|
||||
CLEAR_PIXEL_RECT(a_dst);
|
||||
|
||||
bench_new(a_dst, stride, topleft, w, h, ac, alpha
|
||||
call_ref(c_dst, c_dst_stride, topleft, w, h, ac, alpha
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
call_new(a_dst, a_dst_stride, topleft, w, h, ac, alpha
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride,
|
||||
w, h, "dst");
|
||||
|
||||
bench_new(a_dst, a_dst_stride, topleft, w, h, ac, alpha
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
}
|
||||
}
|
||||
|
@ -244,8 +245,8 @@ static void check_cfl_pred(Dav1dIntraPredDSPContext *const c) {
|
|||
}
|
||||
|
||||
static void check_pal_pred(Dav1dIntraPredDSPContext *const c) {
|
||||
ALIGN_STK_64(pixel, c_dst, 64 * 64,);
|
||||
ALIGN_STK_64(pixel, a_dst, 64 * 64,);
|
||||
PIXEL_RECT(c_dst, 64, 64);
|
||||
PIXEL_RECT(a_dst, 64, 64);
|
||||
ALIGN_STK_64(uint8_t, idx, 64 * 64,);
|
||||
ALIGN_STK_16(uint16_t, pal, 8,);
|
||||
|
||||
|
@ -261,7 +262,6 @@ static void check_pal_pred(Dav1dIntraPredDSPContext *const c) {
|
|||
#else
|
||||
const int bitdepth_max = 0xff;
|
||||
#endif
|
||||
const ptrdiff_t stride = w * sizeof(pixel);
|
||||
|
||||
for (int i = 0; i < 8; i++)
|
||||
pal[i] = rnd() & bitdepth_max;
|
||||
|
@ -269,11 +269,15 @@ static void check_pal_pred(Dav1dIntraPredDSPContext *const c) {
|
|||
for (int i = 0; i < w * h; i++)
|
||||
idx[i] = rnd() & 7;
|
||||
|
||||
call_ref(c_dst, stride, pal, idx, w, h);
|
||||
call_new(a_dst, stride, pal, idx, w, h);
|
||||
checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst");
|
||||
CLEAR_PIXEL_RECT(c_dst);
|
||||
CLEAR_PIXEL_RECT(a_dst);
|
||||
|
||||
bench_new(a_dst, stride, pal, idx, w, h);
|
||||
call_ref(c_dst, c_dst_stride, pal, idx, w, h);
|
||||
call_new(a_dst, a_dst_stride, pal, idx, w, h);
|
||||
checkasm_check_pixel_padded(c_dst, c_dst_stride,
|
||||
a_dst, a_dst_stride, w, h, "dst");
|
||||
|
||||
bench_new(a_dst, a_dst_stride, pal, idx, w, h);
|
||||
}
|
||||
report("pal_pred");
|
||||
}
|
||||
|
|
|
@ -243,8 +243,8 @@ static void check_itxfm_add(Dav1dInvTxfmDSPContext *const c,
|
|||
const enum RectTxfmSize tx)
|
||||
{
|
||||
ALIGN_STK_64(coef, coeff, 2, [32 * 32]);
|
||||
ALIGN_STK_64(pixel, c_dst, 64 * 64,);
|
||||
ALIGN_STK_64(pixel, a_dst, 64 * 64,);
|
||||
PIXEL_RECT(c_dst, 64, 64);
|
||||
PIXEL_RECT(a_dst, 64, 64);
|
||||
|
||||
static const uint8_t subsh_iters[5] = { 2, 2, 3, 5, 5 };
|
||||
|
||||
|
@ -275,21 +275,26 @@ static void check_itxfm_add(Dav1dInvTxfmDSPContext *const c,
|
|||
const int eob = ftx(coeff[0], tx, txtp, w, h, subsh, bitdepth_max);
|
||||
memcpy(coeff[1], coeff[0], sizeof(*coeff));
|
||||
|
||||
for (int j = 0; j < w * h; j++)
|
||||
c_dst[j] = a_dst[j] = rnd() & bitdepth_max;
|
||||
CLEAR_PIXEL_RECT(c_dst);
|
||||
CLEAR_PIXEL_RECT(a_dst);
|
||||
|
||||
call_ref(c_dst, w * sizeof(*c_dst), coeff[0], eob
|
||||
for (int y = 0; y < h; y++)
|
||||
for (int x = 0; x < w; x++)
|
||||
c_dst[y*PXSTRIDE(c_dst_stride) + x] =
|
||||
a_dst[y*PXSTRIDE(a_dst_stride) + x] = rnd() & bitdepth_max;
|
||||
|
||||
call_ref(c_dst, c_dst_stride, coeff[0], eob
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
call_new(a_dst, w * sizeof(*c_dst), coeff[1], eob
|
||||
call_new(a_dst, a_dst_stride, coeff[1], eob
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
|
||||
checkasm_check_pixel(c_dst, w * sizeof(*c_dst),
|
||||
a_dst, w * sizeof(*a_dst),
|
||||
w, h, "dst");
|
||||
checkasm_check_pixel_padded(c_dst, c_dst_stride,
|
||||
a_dst, a_dst_stride,
|
||||
w, h, "dst");
|
||||
if (memcmp(coeff[0], coeff[1], sizeof(*coeff)))
|
||||
fail();
|
||||
|
||||
bench_new(a_dst, w * sizeof(*c_dst), coeff[0], eob
|
||||
bench_new(a_dst, a_dst_stride, coeff[0], eob
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -57,8 +57,8 @@ static int mc_h_next(const int h) {
|
|||
|
||||
static void check_mc(Dav1dMCDSPContext *const c) {
|
||||
ALIGN_STK_64(pixel, src_buf, 135 * 135,);
|
||||
ALIGN_STK_64(pixel, c_dst, 128 * 128,);
|
||||
ALIGN_STK_64(pixel, a_dst, 128 * 128,);
|
||||
PIXEL_RECT(c_dst, 128, 128);
|
||||
PIXEL_RECT(a_dst, 128, 128);
|
||||
const pixel *src = src_buf + 135 * 3 + 3;
|
||||
const ptrdiff_t src_stride = 135 * sizeof(pixel);
|
||||
|
||||
|
@ -68,7 +68,6 @@ static void check_mc(Dav1dMCDSPContext *const c) {
|
|||
|
||||
for (int filter = 0; filter < N_2D_FILTERS; filter++)
|
||||
for (int w = 2; w <= 128; w <<= 1) {
|
||||
const ptrdiff_t dst_stride = w * sizeof(pixel);
|
||||
for (int mxy = 0; mxy < 4; mxy++)
|
||||
if (check_func(c->mc[filter], "mc_%s_w%d_%s_%dbpc",
|
||||
filter_names[filter], w, mxy_names[mxy], BITDEPTH))
|
||||
|
@ -87,18 +86,21 @@ static void check_mc(Dav1dMCDSPContext *const c) {
|
|||
for (int i = 0; i < 135 * 135; i++)
|
||||
src_buf[i] = rnd() & bitdepth_max;
|
||||
|
||||
call_ref(c_dst, dst_stride, src, src_stride, w, h,
|
||||
CLEAR_PIXEL_RECT(c_dst);
|
||||
CLEAR_PIXEL_RECT(a_dst);
|
||||
|
||||
call_ref(c_dst, c_dst_stride, src, src_stride, w, h,
|
||||
mx, my HIGHBD_TAIL_SUFFIX);
|
||||
call_new(a_dst, dst_stride, src, src_stride, w, h,
|
||||
call_new(a_dst, a_dst_stride, src, src_stride, w, h,
|
||||
mx, my HIGHBD_TAIL_SUFFIX);
|
||||
checkasm_check_pixel(c_dst, dst_stride,
|
||||
a_dst, dst_stride,
|
||||
w, h, "dst");
|
||||
checkasm_check_pixel_padded(c_dst, c_dst_stride,
|
||||
a_dst, a_dst_stride,
|
||||
w, h, "dst");
|
||||
|
||||
if (filter == FILTER_2D_8TAP_REGULAR ||
|
||||
filter == FILTER_2D_BILINEAR)
|
||||
{
|
||||
bench_new(a_dst, dst_stride, src, src_stride, w, h,
|
||||
bench_new(a_dst, a_dst_stride, src, src_stride, w, h,
|
||||
mx, my HIGHBD_TAIL_SUFFIX);
|
||||
}
|
||||
}
|
||||
|
@ -164,8 +166,8 @@ static void check_mct(Dav1dMCDSPContext *const c) {
|
|||
|
||||
static void check_mc_scaled(Dav1dMCDSPContext *const c) {
|
||||
ALIGN_STK_64(pixel, src_buf, 263 * 263,);
|
||||
ALIGN_STK_64(pixel, c_dst, 128 * 128,);
|
||||
ALIGN_STK_64(pixel, a_dst, 128 * 128,);
|
||||
PIXEL_RECT(c_dst, 128, 128);
|
||||
PIXEL_RECT(a_dst, 128, 128);
|
||||
const pixel *src = src_buf + 263 * 3 + 3;
|
||||
const ptrdiff_t src_stride = 263 * sizeof(pixel);
|
||||
#if BITDEPTH == 16
|
||||
|
@ -180,7 +182,6 @@ static void check_mc_scaled(Dav1dMCDSPContext *const c) {
|
|||
|
||||
for (int filter = 0; filter < N_2D_FILTERS; filter++)
|
||||
for (int w = 2; w <= 128; w <<= 1) {
|
||||
const ptrdiff_t dst_stride = w * sizeof(pixel);
|
||||
for (int p = 0; p < 3; ++p) {
|
||||
if (check_func(c->mc_scaled[filter], "mc_scaled_%s_w%d%s_%dbpc",
|
||||
filter_names[filter], w, scaled_paths[p], BITDEPTH))
|
||||
|
@ -198,16 +199,20 @@ static void check_mc_scaled(Dav1dMCDSPContext *const c) {
|
|||
for (int k = 0; k < 263 * 263; k++)
|
||||
src_buf[k] = rnd() & bitdepth_max;
|
||||
|
||||
call_ref(c_dst, dst_stride, src, src_stride,
|
||||
CLEAR_PIXEL_RECT(c_dst);
|
||||
CLEAR_PIXEL_RECT(a_dst);
|
||||
|
||||
call_ref(c_dst, c_dst_stride, src, src_stride,
|
||||
w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
|
||||
call_new(a_dst, dst_stride, src, src_stride,
|
||||
call_new(a_dst, a_dst_stride, src, src_stride,
|
||||
w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
|
||||
checkasm_check_pixel(c_dst, dst_stride,
|
||||
a_dst, dst_stride, w, h, "dst");
|
||||
checkasm_check_pixel_padded(c_dst, c_dst_stride,
|
||||
a_dst, a_dst_stride,
|
||||
w, h, "dst");
|
||||
|
||||
if (filter == FILTER_2D_8TAP_REGULAR ||
|
||||
filter == FILTER_2D_BILINEAR)
|
||||
bench_new(a_dst, dst_stride, src, src_stride,
|
||||
bench_new(a_dst, a_dst_stride, src, src_stride,
|
||||
w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
|
||||
}
|
||||
}
|
||||
|
@ -281,15 +286,14 @@ static void init_tmp(Dav1dMCDSPContext *const c, pixel *const buf,
|
|||
|
||||
static void check_avg(Dav1dMCDSPContext *const c) {
|
||||
ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
|
||||
ALIGN_STK_64(pixel, c_dst, 135 * 135,);
|
||||
ALIGN_STK_64(pixel, a_dst, 128 * 128,);
|
||||
PIXEL_RECT(c_dst, 135, 135);
|
||||
PIXEL_RECT(a_dst, 128, 128);
|
||||
|
||||
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
|
||||
const int16_t *tmp2, int w, int h HIGHBD_DECL_SUFFIX);
|
||||
|
||||
for (int w = 4; w <= 128; w <<= 1)
|
||||
if (check_func(c->avg, "avg_w%d_%dbpc", w, BITDEPTH)) {
|
||||
ptrdiff_t dst_stride = w * sizeof(pixel);
|
||||
for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1)
|
||||
{
|
||||
#if BITDEPTH == 16
|
||||
|
@ -299,12 +303,16 @@ static void check_avg(Dav1dMCDSPContext *const c) {
|
|||
#endif
|
||||
|
||||
init_tmp(c, c_dst, tmp, bitdepth_max);
|
||||
call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX);
|
||||
call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX);
|
||||
checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
|
||||
w, h, "dst");
|
||||
|
||||
bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX);
|
||||
CLEAR_PIXEL_RECT(c_dst);
|
||||
CLEAR_PIXEL_RECT(a_dst);
|
||||
|
||||
call_ref(c_dst, c_dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX);
|
||||
call_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX);
|
||||
checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride,
|
||||
w, h, "dst");
|
||||
|
||||
bench_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX);
|
||||
}
|
||||
}
|
||||
report("avg");
|
||||
|
@ -312,15 +320,14 @@ static void check_avg(Dav1dMCDSPContext *const c) {
|
|||
|
||||
static void check_w_avg(Dav1dMCDSPContext *const c) {
|
||||
ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
|
||||
ALIGN_STK_64(pixel, c_dst, 135 * 135,);
|
||||
ALIGN_STK_64(pixel, a_dst, 128 * 128,);
|
||||
PIXEL_RECT(c_dst, 135, 135);
|
||||
PIXEL_RECT(a_dst, 128, 128);
|
||||
|
||||
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
|
||||
const int16_t *tmp2, int w, int h, int weight HIGHBD_DECL_SUFFIX);
|
||||
|
||||
for (int w = 4; w <= 128; w <<= 1)
|
||||
if (check_func(c->w_avg, "w_avg_w%d_%dbpc", w, BITDEPTH)) {
|
||||
ptrdiff_t dst_stride = w * sizeof(pixel);
|
||||
for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1)
|
||||
{
|
||||
int weight = rnd() % 15 + 1;
|
||||
|
@ -331,12 +338,15 @@ static void check_w_avg(Dav1dMCDSPContext *const c) {
|
|||
#endif
|
||||
init_tmp(c, c_dst, tmp, bitdepth_max);
|
||||
|
||||
call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX);
|
||||
call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX);
|
||||
checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
|
||||
w, h, "dst");
|
||||
CLEAR_PIXEL_RECT(c_dst);
|
||||
CLEAR_PIXEL_RECT(a_dst);
|
||||
|
||||
bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX);
|
||||
call_ref(c_dst, c_dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX);
|
||||
call_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX);
|
||||
checkasm_check_pixel_padded(c_dst, c_dst_stride,a_dst, a_dst_stride,
|
||||
w, h, "dst");
|
||||
|
||||
bench_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX);
|
||||
}
|
||||
}
|
||||
report("w_avg");
|
||||
|
@ -344,8 +354,8 @@ static void check_w_avg(Dav1dMCDSPContext *const c) {
|
|||
|
||||
static void check_mask(Dav1dMCDSPContext *const c) {
|
||||
ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
|
||||
ALIGN_STK_64(pixel, c_dst, 135 * 135,);
|
||||
ALIGN_STK_64(pixel, a_dst, 128 * 128,);
|
||||
PIXEL_RECT(c_dst, 135, 135);
|
||||
PIXEL_RECT(a_dst, 128, 128);
|
||||
ALIGN_STK_64(uint8_t, mask, 128 * 128,);
|
||||
|
||||
for (int i = 0; i < 128 * 128; i++)
|
||||
|
@ -357,7 +367,6 @@ static void check_mask(Dav1dMCDSPContext *const c) {
|
|||
|
||||
for (int w = 4; w <= 128; w <<= 1)
|
||||
if (check_func(c->mask, "mask_w%d_%dbpc", w, BITDEPTH)) {
|
||||
ptrdiff_t dst_stride = w * sizeof(pixel);
|
||||
for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1)
|
||||
{
|
||||
#if BITDEPTH == 16
|
||||
|
@ -366,12 +375,16 @@ static void check_mask(Dav1dMCDSPContext *const c) {
|
|||
const int bitdepth_max = 0xff;
|
||||
#endif
|
||||
init_tmp(c, c_dst, tmp, bitdepth_max);
|
||||
call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX);
|
||||
call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX);
|
||||
checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
|
||||
w, h, "dst");
|
||||
|
||||
bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX);
|
||||
CLEAR_PIXEL_RECT(c_dst);
|
||||
CLEAR_PIXEL_RECT(a_dst);
|
||||
|
||||
call_ref(c_dst, c_dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX);
|
||||
call_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX);
|
||||
checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride,
|
||||
w, h, "dst");
|
||||
|
||||
bench_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX);
|
||||
}
|
||||
}
|
||||
report("mask");
|
||||
|
@ -379,8 +392,8 @@ static void check_mask(Dav1dMCDSPContext *const c) {
|
|||
|
||||
static void check_w_mask(Dav1dMCDSPContext *const c) {
|
||||
ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
|
||||
ALIGN_STK_64(pixel, c_dst, 135 * 135,);
|
||||
ALIGN_STK_64(pixel, a_dst, 128 * 128,);
|
||||
PIXEL_RECT(c_dst, 135, 135);
|
||||
PIXEL_RECT(a_dst, 128, 128);
|
||||
ALIGN_STK_64(uint8_t, c_mask, 128 * 128,);
|
||||
ALIGN_STK_64(uint8_t, a_mask, 128 * 128,);
|
||||
|
||||
|
@ -397,7 +410,6 @@ static void check_w_mask(Dav1dMCDSPContext *const c) {
|
|||
if (check_func(c->w_mask[i], "w_mask_%d_w%d_%dbpc", ss[i], w,
|
||||
BITDEPTH))
|
||||
{
|
||||
ptrdiff_t dst_stride = w * sizeof(pixel);
|
||||
for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1)
|
||||
{
|
||||
int sign = rnd() & 1;
|
||||
|
@ -408,19 +420,22 @@ static void check_w_mask(Dav1dMCDSPContext *const c) {
|
|||
#endif
|
||||
init_tmp(c, c_dst, tmp, bitdepth_max);
|
||||
|
||||
call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h,
|
||||
CLEAR_PIXEL_RECT(c_dst);
|
||||
CLEAR_PIXEL_RECT(a_dst);
|
||||
|
||||
call_ref(c_dst, c_dst_stride, tmp[0], tmp[1], w, h,
|
||||
c_mask, sign HIGHBD_TAIL_SUFFIX);
|
||||
call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h,
|
||||
call_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h,
|
||||
a_mask, sign HIGHBD_TAIL_SUFFIX);
|
||||
checkasm_check_pixel(c_dst, dst_stride,
|
||||
a_dst, dst_stride,
|
||||
w, h, "dst");
|
||||
checkasm_check_pixel_padded(c_dst, c_dst_stride,
|
||||
a_dst, a_dst_stride,
|
||||
w, h, "dst");
|
||||
checkasm_check(uint8_t, c_mask, w >> ss_hor[i],
|
||||
a_mask, w >> ss_hor[i],
|
||||
w >> ss_hor[i], h >> ss_ver[i],
|
||||
"mask");
|
||||
|
||||
bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h,
|
||||
bench_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h,
|
||||
a_mask, sign HIGHBD_TAIL_SUFFIX);
|
||||
}
|
||||
}
|
||||
|
@ -429,15 +444,14 @@ static void check_w_mask(Dav1dMCDSPContext *const c) {
|
|||
|
||||
static void check_blend(Dav1dMCDSPContext *const c) {
|
||||
ALIGN_STK_64(pixel, tmp, 32 * 32,);
|
||||
ALIGN_STK_64(pixel, c_dst, 32 * 32,);
|
||||
ALIGN_STK_64(pixel, a_dst, 32 * 32,);
|
||||
PIXEL_RECT(c_dst, 32, 32);
|
||||
PIXEL_RECT(a_dst, 32, 32);
|
||||
ALIGN_STK_64(uint8_t, mask, 32 * 32,);
|
||||
|
||||
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
|
||||
int w, int h, const uint8_t *mask);
|
||||
|
||||
for (int w = 4; w <= 32; w <<= 1) {
|
||||
const ptrdiff_t dst_stride = w * sizeof(pixel);
|
||||
if (check_func(c->blend, "blend_w%d_%dbpc", w, BITDEPTH))
|
||||
for (int h = imax(w / 2, 4); h <= imin(w * 2, 32); h <<= 1) {
|
||||
#if BITDEPTH == 16
|
||||
|
@ -449,15 +463,21 @@ static void check_blend(Dav1dMCDSPContext *const c) {
|
|||
tmp[i] = rnd() & bitdepth_max;
|
||||
mask[i] = rnd() % 65;
|
||||
}
|
||||
for (int i = 0; i < w * h; i++)
|
||||
c_dst[i] = a_dst[i] = rnd() & bitdepth_max;
|
||||
|
||||
call_ref(c_dst, dst_stride, tmp, w, h, mask);
|
||||
call_new(a_dst, dst_stride, tmp, w, h, mask);
|
||||
checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
|
||||
w, h, "dst");
|
||||
CLEAR_PIXEL_RECT(c_dst);
|
||||
CLEAR_PIXEL_RECT(a_dst);
|
||||
|
||||
bench_new(a_dst, dst_stride, tmp, w, h, mask);
|
||||
for (int y = 0; y < h; y++)
|
||||
for (int x = 0; x < w; x++)
|
||||
c_dst[y*PXSTRIDE(c_dst_stride) + x] =
|
||||
a_dst[y*PXSTRIDE(a_dst_stride) + x] = rnd() & bitdepth_max;
|
||||
|
||||
call_ref(c_dst, c_dst_stride, tmp, w, h, mask);
|
||||
call_new(a_dst, a_dst_stride, tmp, w, h, mask);
|
||||
checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride,
|
||||
w, h, "dst");
|
||||
|
||||
bench_new(a_dst, a_dst_stride, tmp, w, h, mask);
|
||||
}
|
||||
}
|
||||
report("blend");
|
||||
|
@ -465,14 +485,13 @@ static void check_blend(Dav1dMCDSPContext *const c) {
|
|||
|
||||
static void check_blend_v(Dav1dMCDSPContext *const c) {
|
||||
ALIGN_STK_64(pixel, tmp, 32 * 128,);
|
||||
ALIGN_STK_64(pixel, c_dst, 32 * 128,);
|
||||
ALIGN_STK_64(pixel, a_dst, 32 * 128,);
|
||||
PIXEL_RECT(c_dst, 32, 128);
|
||||
PIXEL_RECT(a_dst, 32, 128);
|
||||
|
||||
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
|
||||
int w, int h);
|
||||
|
||||
for (int w = 2; w <= 32; w <<= 1) {
|
||||
const ptrdiff_t dst_stride = w * sizeof(pixel);
|
||||
if (check_func(c->blend_v, "blend_v_w%d_%dbpc", w, BITDEPTH))
|
||||
for (int h = 2; h <= (w == 2 ? 64 : 128); h <<= 1) {
|
||||
#if BITDEPTH == 16
|
||||
|
@ -481,17 +500,23 @@ static void check_blend_v(Dav1dMCDSPContext *const c) {
|
|||
const int bitdepth_max = 0xff;
|
||||
#endif
|
||||
|
||||
for (int i = 0; i < w * h; i++)
|
||||
c_dst[i] = a_dst[i] = rnd() & bitdepth_max;
|
||||
CLEAR_PIXEL_RECT(c_dst);
|
||||
CLEAR_PIXEL_RECT(a_dst);
|
||||
|
||||
for (int y = 0; y < h; y++)
|
||||
for (int x = 0; x < w; x++)
|
||||
c_dst[y*PXSTRIDE(c_dst_stride) + x] =
|
||||
a_dst[y*PXSTRIDE(a_dst_stride) + x] = rnd() & bitdepth_max;
|
||||
|
||||
for (int i = 0; i < 32 * 128; i++)
|
||||
tmp[i] = rnd() & bitdepth_max;
|
||||
|
||||
call_ref(c_dst, dst_stride, tmp, w, h);
|
||||
call_new(a_dst, dst_stride, tmp, w, h);
|
||||
checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
|
||||
w, h, "dst");
|
||||
call_ref(c_dst, c_dst_stride, tmp, w, h);
|
||||
call_new(a_dst, a_dst_stride, tmp, w, h);
|
||||
checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride,
|
||||
w, h, "dst");
|
||||
|
||||
bench_new(a_dst, dst_stride, tmp, w, h);
|
||||
bench_new(a_dst, a_dst_stride, tmp, w, h);
|
||||
}
|
||||
}
|
||||
report("blend_v");
|
||||
|
@ -499,14 +524,13 @@ static void check_blend_v(Dav1dMCDSPContext *const c) {
|
|||
|
||||
static void check_blend_h(Dav1dMCDSPContext *const c) {
|
||||
ALIGN_STK_64(pixel, tmp, 128 * 32,);
|
||||
ALIGN_STK_64(pixel, c_dst, 128 * 32,);
|
||||
ALIGN_STK_64(pixel, a_dst, 128 * 32,);
|
||||
PIXEL_RECT(c_dst, 128, 32);
|
||||
PIXEL_RECT(a_dst, 128, 32);
|
||||
|
||||
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
|
||||
int w, int h);
|
||||
|
||||
for (int w = 2; w <= 128; w <<= 1) {
|
||||
const ptrdiff_t dst_stride = w * sizeof(pixel);
|
||||
if (check_func(c->blend_h, "blend_h_w%d_%dbpc", w, BITDEPTH))
|
||||
for (int h = (w == 128 ? 4 : 2); h <= 32; h <<= 1) {
|
||||
#if BITDEPTH == 16
|
||||
|
@ -514,17 +538,23 @@ static void check_blend_h(Dav1dMCDSPContext *const c) {
|
|||
#else
|
||||
const int bitdepth_max = 0xff;
|
||||
#endif
|
||||
for (int i = 0; i < w * h; i++)
|
||||
c_dst[i] = a_dst[i] = rnd() & bitdepth_max;
|
||||
CLEAR_PIXEL_RECT(c_dst);
|
||||
CLEAR_PIXEL_RECT(a_dst);
|
||||
|
||||
for (int y = 0; y < h; y++)
|
||||
for (int x = 0; x < w; x++)
|
||||
c_dst[y*PXSTRIDE(c_dst_stride) + x] =
|
||||
a_dst[y*PXSTRIDE(a_dst_stride) + x] = rnd() & bitdepth_max;
|
||||
|
||||
for (int i = 0; i < 128 * 32; i++)
|
||||
tmp[i] = rnd() & bitdepth_max;
|
||||
|
||||
call_ref(c_dst, dst_stride, tmp, w, h);
|
||||
call_new(a_dst, dst_stride, tmp, w, h);
|
||||
checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
|
||||
w, h, "dst");
|
||||
call_ref(c_dst, c_dst_stride, tmp, w, h);
|
||||
call_new(a_dst, a_dst_stride, tmp, w, h);
|
||||
checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride,
|
||||
w, h, "dst");
|
||||
|
||||
bench_new(a_dst, dst_stride, tmp, w, h);
|
||||
bench_new(a_dst, a_dst_stride, tmp, w, h);
|
||||
}
|
||||
}
|
||||
report("blend_h");
|
||||
|
@ -532,11 +562,10 @@ static void check_blend_h(Dav1dMCDSPContext *const c) {
|
|||
|
||||
static void check_warp8x8(Dav1dMCDSPContext *const c) {
|
||||
ALIGN_STK_64(pixel, src_buf, 15 * 15,);
|
||||
ALIGN_STK_64(pixel, c_dst, 8 * 8,);
|
||||
ALIGN_STK_64(pixel, a_dst, 8 * 8,);
|
||||
PIXEL_RECT(c_dst, 8, 8);
|
||||
PIXEL_RECT(a_dst, 8, 8);
|
||||
int16_t abcd[4];
|
||||
const pixel *src = src_buf + 15 * 3 + 3;
|
||||
const ptrdiff_t dst_stride = 8 * sizeof(pixel);
|
||||
const ptrdiff_t src_stride = 15 * sizeof(pixel);
|
||||
|
||||
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *src,
|
||||
|
@ -558,12 +587,15 @@ static void check_warp8x8(Dav1dMCDSPContext *const c) {
|
|||
for (int i = 0; i < 15 * 15; i++)
|
||||
src_buf[i] = rnd() & bitdepth_max;
|
||||
|
||||
call_ref(c_dst, dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
|
||||
call_new(a_dst, dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
|
||||
checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
|
||||
8, 8, "dst");
|
||||
CLEAR_PIXEL_RECT(c_dst);
|
||||
CLEAR_PIXEL_RECT(a_dst);
|
||||
|
||||
bench_new(a_dst, dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
|
||||
call_ref(c_dst, c_dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
|
||||
call_new(a_dst, a_dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
|
||||
checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride,
|
||||
8, 8, "dst");
|
||||
|
||||
bench_new(a_dst, a_dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
|
||||
}
|
||||
report("warp8x8");
|
||||
}
|
||||
|
@ -687,13 +719,12 @@ static int get_upscale_x0(const int in_w, const int out_w, const int step) {
|
|||
}
|
||||
|
||||
static void check_resize(Dav1dMCDSPContext *const c) {
|
||||
ALIGN_STK_64(pixel, c_dst, 1024 * 64,);
|
||||
ALIGN_STK_64(pixel, a_dst, 1024 * 64,);
|
||||
ALIGN_STK_64(pixel, src, 512 * 64,);
|
||||
PIXEL_RECT(c_dst, 1024, 64);
|
||||
PIXEL_RECT(a_dst, 1024, 64);
|
||||
ALIGN_STK_64(pixel, src, 512 * 64,);
|
||||
|
||||
const int height = 64;
|
||||
const int max_src_width = 512;
|
||||
const ptrdiff_t dst_stride = 1024 * sizeof(pixel);
|
||||
const ptrdiff_t src_stride = 512 * sizeof(pixel);
|
||||
|
||||
declare_func(void, pixel *dst, ptrdiff_t dst_stride,
|
||||
|
@ -720,14 +751,17 @@ static void check_resize(Dav1dMCDSPContext *const c) {
|
|||
#undef scale_fac
|
||||
const int mx0 = get_upscale_x0(src_w, dst_w, dx);
|
||||
|
||||
call_ref(c_dst, dst_stride, src, src_stride,
|
||||
dst_w, height, src_w, dx, mx0 HIGHBD_TAIL_SUFFIX);
|
||||
call_new(a_dst, dst_stride, src, src_stride,
|
||||
dst_w, height, src_w, dx, mx0 HIGHBD_TAIL_SUFFIX);
|
||||
checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
|
||||
dst_w, height, "dst");
|
||||
CLEAR_PIXEL_RECT(c_dst);
|
||||
CLEAR_PIXEL_RECT(a_dst);
|
||||
|
||||
bench_new(a_dst, dst_stride, src, src_stride,
|
||||
call_ref(c_dst, c_dst_stride, src, src_stride,
|
||||
dst_w, height, src_w, dx, mx0 HIGHBD_TAIL_SUFFIX);
|
||||
call_new(a_dst, a_dst_stride, src, src_stride,
|
||||
dst_w, height, src_w, dx, mx0 HIGHBD_TAIL_SUFFIX);
|
||||
checkasm_check_pixel_padded_align(c_dst, c_dst_stride, a_dst, a_dst_stride,
|
||||
dst_w, height, "dst", 16, 1);
|
||||
|
||||
bench_new(a_dst, a_dst_stride, src, src_stride,
|
||||
512, height, 512 * 8 / w_den, dx, mx0 HIGHBD_TAIL_SUFFIX);
|
||||
}
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче