Mirror of https://github.com/mozilla/gecko-dev.git
Bug 1800912 - Update libdav1d to revision 4b9f5b704. r=media-playback-reviewers,alwu
Differential Revision: https://phabricator.services.mozilla.com/D162211
Parent: 67f82af8fd
Commit: d17cafec50
@@ -98,6 +98,7 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
'../../../third_party/dav1d/src/x86/ipred_avx2.asm',
'../../../third_party/dav1d/src/x86/ipred_avx512.asm',
'../../../third_party/dav1d/src/x86/itx16_avx2.asm',
'../../../third_party/dav1d/src/x86/itx16_avx512.asm',
'../../../third_party/dav1d/src/x86/itx_avx2.asm',
'../../../third_party/dav1d/src/x86/itx_avx512.asm',
'../../../third_party/dav1d/src/x86/loopfilter16_avx2.asm',
@@ -20,11 +20,11 @@ origin:

# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: cd5e415270285a58f48c1e9ec1a2dd024b9acf9f (2022-08-19T13:58:13.000-03:00).
release: 4b9f5b704e299543afcea87f375a308b90ef6c70 (2022-11-10T00:58:40.000+00:00).

# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: cd5e415270285a58f48c1e9ec1a2dd024b9acf9f
revision: 4b9f5b704e299543afcea87f375a308b90ef6c70

# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/
@@ -1,2 +1,2 @@
/* auto-generated, do not edit */
#define DAV1D_VERSION "cd5e415270285a58f48c1e9ec1a2dd024b9acf9f"
#define DAV1D_VERSION "4b9f5b704e299543afcea87f375a308b90ef6c70"
@@ -1,3 +1,4 @@
exclude = .*/tests/.*
exclude = .*/tools/.*
exclude = .*/include/common/dump.h
gcov-ignore-parse-errors = yes
@@ -113,6 +113,12 @@
#define ALWAYS_INLINE __attribute__((always_inline)) inline
#endif

#if (defined(__ELF__) || defined(__MACH__) || (defined(_WIN32) && defined(__clang__))) && __has_attribute(visibility)
#define EXTERN extern __attribute__((visibility("hidden")))
#else
#define EXTERN extern
#endif

#ifdef __clang__
#define NO_SANITIZE(x) __attribute__((no_sanitize(x)))
#else
@@ -44,6 +44,7 @@ typedef unsigned int atomic_uint;
#define atomic_fetch_sub(p_a, dec) __atomic_fetch_sub(p_a, dec, __ATOMIC_SEQ_CST)
#define atomic_exchange(p_a, v) __atomic_exchange_n(p_a, v, __ATOMIC_SEQ_CST)
#define atomic_fetch_or(p_a, v) __atomic_fetch_or(p_a, v, __ATOMIC_SEQ_CST)
#define atomic_compare_exchange_strong(p_a, expected, desired) __atomic_compare_exchange_n(p_a, expected, desired, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)

#endif /* !defined(__cplusplus) */
@@ -55,6 +55,15 @@ typedef enum {
#define atomic_exchange(p_a, v) InterlockedExchange(p_a, v)
#define atomic_load_explicit(p_a, mo) atomic_load(p_a)

static inline int atomic_compare_exchange_strong_int(LONG *obj, LONG *expected,
LONG desired)
{
LONG orig = *expected;
*expected = InterlockedCompareExchange(obj, desired, orig);
return *expected == orig;
}
#define atomic_compare_exchange_strong(p_a, expected, desired) atomic_compare_exchange_strong_int((LONG *)p_a, (LONG *)expected, (LONG)desired)

/*
* TODO use a special call to increment/decrement
* using InterlockedIncrement/InterlockedDecrement
@@ -126,7 +126,7 @@ DAV1D_API int dav1d_open(Dav1dContext **c_out, const Dav1dSettings *s);
* 0: Success, and out is filled with the parsed Sequence Header
* OBU parameters.
* DAV1D_ERR(ENOENT): No Sequence Header OBUs were found in the buffer.
* other negative DAV1D_ERR codes: Invalid data in the buffer, invalid passed-in
* Other negative DAV1D_ERR codes: Invalid data in the buffer, invalid passed-in
* arguments, and other errors during parsing.
*
* @note It is safe to feed this function data containing other OBUs than a
@@ -137,7 +137,8 @@ DAV1D_API int dav1d_parse_sequence_header(Dav1dSequenceHeader *out,
const uint8_t *buf, const size_t sz);

/**
* Feed bitstream data to the decoder.
* Feed bitstream data to the decoder, in the form of one or multiple AV1
* Open Bitstream Units (OBUs).
*
* @param c Input decoder instance.
* @param in Input bitstream data. On success, ownership of the reference is
@@ -148,8 +149,9 @@ DAV1D_API int dav1d_parse_sequence_header(Dav1dSequenceHeader *out,
* DAV1D_ERR(EAGAIN): The data can't be consumed. dav1d_get_picture() should
* be called to get one or more frames before the function
* can consume new data.
* other negative DAV1D_ERR codes: Error during decoding or because of invalid
* passed-in arguments.
* Other negative DAV1D_ERR codes: Error during decoding or because of invalid
* passed-in arguments. The reference remains
* owned by the caller.
*/
DAV1D_API int dav1d_send_data(Dav1dContext *c, Dav1dData *in);

@@ -164,7 +166,7 @@ DAV1D_API int dav1d_send_data(Dav1dContext *c, Dav1dData *in);
* 0: Success, and a frame is returned.
* DAV1D_ERR(EAGAIN): Not enough data to output a frame. dav1d_send_data()
* should be called with new input.
* other negative DAV1D_ERR codes: Error during decoding or because of invalid
* Other negative DAV1D_ERR codes: Error during decoding or because of invalid
* passed-in arguments.
*
* @note To drain buffered frames from the decoder (i.e. on end of stream),
@@ -216,7 +218,7 @@ DAV1D_API int dav1d_get_picture(Dav1dContext *c, Dav1dPicture *out);
*
* @return
* 0: Success, and a frame is returned.
* other negative DAV1D_ERR codes: Error due to lack of memory or because of
* Other negative DAV1D_ERR codes: Error due to lack of memory or because of
* invalid passed-in arguments.
*
* @note If `Dav1dSettings.apply_grain` is true, film grain was already applied
@@ -63,7 +63,7 @@ endforeach
# ASM option
is_asm_enabled = (get_option('enable_asm') == true and
(host_machine.cpu_family() == 'x86' or
(host_machine.cpu_family() == 'x86_64' and cc.get_define('__ILP32__') == '') or
(host_machine.cpu_family() == 'x86_64' and cc.get_define('__ILP32__').strip() == '') or
host_machine.cpu_family() == 'aarch64' or
host_machine.cpu_family().startswith('arm') or
host_machine.cpu() == 'ppc64le'))
@@ -87,16 +87,11 @@ cdata.set10('CONFIG_LOG', get_option('logging'))
test_args = []

optional_arguments = []
optional_link_arguments = []

if host_machine.system() == 'linux'
if host_machine.system() in ['linux', 'gnu']
test_args += '-D_GNU_SOURCE'
add_project_arguments('-D_GNU_SOURCE', language: 'c')
elif host_machine.system() == 'darwin'
test_args += '-D_DARWIN_C_SOURCE'
add_project_arguments('-D_DARWIN_C_SOURCE', language: 'c')
else
test_args += '-D_POSIX_C_SOURCE=200112L'
add_project_arguments('-D_POSIX_C_SOURCE=200112L', language: 'c')
endif

if host_machine.system() == 'windows'
@@ -112,6 +107,18 @@ if host_machine.system() == 'windows'
cdata.set('ftello', '_ftelli64')
endif

if host_machine.cpu_family() == 'x86_64'
if cc.get_argument_syntax() != 'msvc'
optional_link_arguments += '-Wl,--dynamicbase,--nxcompat,--tsaware,--high-entropy-va'
endif
elif host_machine.cpu_family() == 'x86' or host_machine.cpu_family() == 'arm'
if cc.get_argument_syntax() == 'msvc'
optional_link_arguments += '/largeaddressaware'
else
optional_link_arguments += '-Wl,--dynamicbase,--nxcompat,--tsaware,--large-address-aware'
endif
endif

# On Windows, we use a compatibility layer to emulate pthread
thread_dependency = []
thread_compat_dep = declare_dependency(sources : files('src/win32/thread.c'))
@@ -135,7 +142,7 @@ else
rt_dependency = []
if cc.has_function('clock_gettime', prefix : '#include <time.h>', args : test_args)
cdata.set('HAVE_CLOCK_GETTIME', 1)
elif host_machine.system() != 'darwin'
elif host_machine.system() not in ['darwin', 'ios', 'tvos']
rt_dependency = cc.find_library('rt', required: false)
if not cc.has_function('clock_gettime', prefix : '#include <time.h>', args : test_args, dependencies : rt_dependency)
error('clock_gettime not found')
@@ -248,6 +255,10 @@ if cc.has_function('pthread_getaffinity_np', prefix : pthread_np_prefix, args :
cdata.set('HAVE_PTHREAD_GETAFFINITY_NP', 1)
endif

if cc.compiles('int x = _Generic(0, default: 0);', name: '_Generic', args: test_args)
cdata.set('HAVE_C11_GENERIC', 1)
endif

# Compiler flag tests

if cc.has_argument('-fvisibility=hidden')
@@ -289,13 +300,14 @@ if (get_option('buildtype') != 'debug' and get_option('buildtype') != 'plain')
optional_arguments += '-ffast-math'
endif

if (host_machine.system() == 'darwin' and cc.get_id() == 'clang' and
if (host_machine.system() in ['darwin', 'ios', 'tvos'] and cc.get_id() == 'clang' and
cc.version().startswith('11'))
# Workaround for Xcode 11 -fstack-check bug, see #301
optional_arguments += '-fno-stack-check'
endif

add_project_arguments(cc.get_supported_arguments(optional_arguments), language : 'c')
add_project_link_arguments(cc.get_supported_link_arguments(optional_link_arguments), language : 'c')

# libFuzzer related things
fuzzing_engine = get_option('fuzzing_engine')
@@ -330,7 +342,7 @@ if host_machine.cpu_family().startswith('x86')
stack_alignment = 16
endif
else
if host_machine.system() == 'linux' or host_machine.system() == 'darwin'
if host_machine.system() == 'linux' or host_machine.system() in ['darwin', 'ios', 'tvos']
stack_alignment = 16
elif cc.has_argument('-mpreferred-stack-boundary=4')
stackalign_flag = ['-mpreferred-stack-boundary=4']
@@ -391,7 +403,7 @@ if host_machine.cpu_family().startswith('x86')
cdata_asm.set10('PIC', true)

# Convert SSE asm into (128-bit) AVX when compiler flags are set to use AVX instructions
cdata_asm.set10('FORCE_VEX_ENCODING', cc.get_define('__AVX__') != '')
cdata_asm.set10('FORCE_VEX_ENCODING', cc.get_define('__AVX__').strip() != '')
endif

cdata.set10('ARCH_PPC64LE', host_machine.cpu() == 'ppc64le')
@@ -399,7 +411,7 @@ cdata.set10('ARCH_PPC64LE', host_machine.cpu() == 'ppc64le')
# meson's cc.symbols_have_underscore_prefix() is unfortunately unrelieably
# when additional flags like '-fprofile-instr-generate' are passed via CFLAGS
# see following meson issue https://github.com/mesonbuild/meson/issues/5482
if (host_machine.system() == 'darwin' or
if (host_machine.system() in ['darwin', 'ios', 'tvos'] or
(host_machine.system() == 'windows' and host_machine.cpu_family() == 'x86'))
cdata.set10('PREFIX', true)
cdata_asm.set10('PREFIX', true)
@@ -433,7 +445,7 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86')

if host_machine.system() == 'windows'
nasm_format = 'win'
elif host_machine.system() == 'darwin'
elif host_machine.system() in ['darwin', 'ios', 'tvos']
nasm_format = 'macho'
else
nasm_format = 'elf'
@@ -462,7 +474,8 @@ use_gaspp = false
if (is_asm_enabled and
(host_machine.cpu_family() == 'aarch64' or
host_machine.cpu_family().startswith('arm')) and
cc.get_argument_syntax() == 'msvc')
cc.get_argument_syntax() == 'msvc' and
(cc.get_id() != 'clang-cl' or meson.version().version_compare('<0.58.0')))
gaspp = find_program('gas-preprocessor.pl')
use_gaspp = true
gaspp_gen = generator(gaspp,
@ -668,12 +668,21 @@ def_fn_4x4 identity, flipadst
|
|||
.macro idct_4s_x8 r0, r1, r2, r3, r4, r5, r6, r7
|
||||
idct_4s_x4 \r0, \r2, \r4, \r6
|
||||
|
||||
vmov.i32 q5, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
|
||||
vmvn.i32 q4, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
|
||||
.irp r, \r0, \r2, \r4, \r6
|
||||
vmin.s32 \r, \r, q5
|
||||
.endr
|
||||
.irp r, \r0, \r2, \r4, \r6
|
||||
vmax.s32 \r, \r, q4
|
||||
.endr
|
||||
|
||||
vmul_vmls q2, \r1, \r7, d2[0], d2[1] // -> t4a
|
||||
vmul_vmla q4, \r1, \r7, d2[1], d2[0] // -> t7a
|
||||
vmul_vmla q3, \r1, \r7, d2[1], d2[0] // -> t7a
|
||||
vmul_vmls q6, \r5, \r3, d3[0], d3[1] // -> t5a
|
||||
vmul_vmla q7, \r5, \r3, d3[1], d3[0] // -> t6a
|
||||
vrshr.s32 \r1, q2, #12 // t4a
|
||||
vrshr.s32 \r7, q4, #12 // t7a
|
||||
vrshr.s32 \r7, q3, #12 // t7a
|
||||
vrshr.s32 \r3, q6, #12 // t5a
|
||||
vrshr.s32 \r5, q7, #12 // t6a
|
||||
|
||||
|
@ -682,17 +691,24 @@ def_fn_4x4 identity, flipadst
|
|||
vqadd.s32 q3, \r7, \r5 // t7
|
||||
vqsub.s32 \r3, \r7, \r5 // t6a
|
||||
|
||||
vmul_vmls q4, \r3, \r1, d0[0], d0[0] // -> t5
|
||||
.irp r, q2, \r1, q3, \r3
|
||||
vmin.s32 \r, \r, q5
|
||||
.endr
|
||||
.irp r, q2, \r1, q3, \r3
|
||||
vmax.s32 \r, \r, q4
|
||||
.endr
|
||||
|
||||
vmul_vmls q7, \r3, \r1, d0[0], d0[0] // -> t5
|
||||
vmul_vmla q6, \r3, \r1, d0[0], d0[0] // -> t6
|
||||
vrshr.s32 q4, q4, #12 // t5
|
||||
vrshr.s32 q7, q7, #12 // t5
|
||||
vrshr.s32 q5, q6, #12 // t6
|
||||
|
||||
vqsub.s32 \r7, \r0, q3 // out7
|
||||
vqadd.s32 \r0, \r0, q3 // out0
|
||||
vqadd.s32 \r1, \r2, q5 // out1
|
||||
vqsub.s32 q6, \r2, q5 // out6
|
||||
vqadd.s32 \r2, \r4, q4 // out2
|
||||
vqsub.s32 \r5, \r4, q4 // out5
|
||||
vqadd.s32 \r2, \r4, q7 // out2
|
||||
vqsub.s32 \r5, \r4, q7 // out5
|
||||
vqadd.s32 \r3, \r6, q2 // out3
|
||||
vqsub.s32 \r4, \r6, q2 // out4
|
||||
vmov \r6, q6 // out6
|
||||
|
@ -701,6 +717,15 @@ def_fn_4x4 identity, flipadst
|
|||
.macro idct_2s_x8 r0, r1, r2, r3, r4, r5, r6, r7
|
||||
idct_2s_x4 \r0, \r2, \r4, \r6
|
||||
|
||||
vmov.i32 d9, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
|
||||
vmvn.i32 d8, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
|
||||
.irp r, \r0, \r2, \r4, \r6
|
||||
vmin.s32 \r, \r, d9
|
||||
.endr
|
||||
.irp r, \r0, \r2, \r4, \r6
|
||||
vmax.s32 \r, \r, d8
|
||||
.endr
|
||||
|
||||
vmul_vmls d4, \r1, \r7, d2[0], d2[1] // -> t4a
|
||||
vmul_vmla d5, \r1, \r7, d2[1], d2[0] // -> t7a
|
||||
vmul_vmls d6, \r5, \r3, d3[0], d3[1] // -> t5a
|
||||
|
@ -715,6 +740,13 @@ def_fn_4x4 identity, flipadst
|
|||
vqadd.s32 d5, \r7, \r5 // t7
|
||||
vqsub.s32 \r3, \r7, \r5 // t6a
|
||||
|
||||
.irp r, d4, \r1, d5, \r3
|
||||
vmin.s32 \r, \r, d9
|
||||
.endr
|
||||
.irp r, d4, \r1, d5, \r3
|
||||
vmax.s32 \r, \r, d8
|
||||
.endr
|
||||
|
||||
vmul_vmls d6, \r3, \r1, d0[0], d0[0] // -> t5
|
||||
vmul_vmla d7, \r3, \r1, d0[0], d0[0] // -> t6
|
||||
vrshr.s32 d6, d6, #12 // t5
|
||||
|
@ -763,19 +795,28 @@ endfunc
|
|||
|
||||
vqadd.s32 q2, q8, q12 // t0
|
||||
vqsub.s32 q3, q8, q12 // t4
|
||||
vmov.i32 q12, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
|
||||
vqadd.s32 q4, q15, q11 // t1
|
||||
vqsub.s32 q5, q15, q11 // t5
|
||||
vqadd.s32 q6, q10, q14 // t2
|
||||
vqsub.s32 q7, q10, q14 // t6
|
||||
vmvn.i32 q14, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
|
||||
vqadd.s32 q10, q13, q9 // t3
|
||||
vqsub.s32 q11, q13, q9 // t7
|
||||
|
||||
.irp r, q2, q3, q4, q5, q6, q7, q10, q11
|
||||
vmin.s32 \r, \r, q12
|
||||
.endr
|
||||
.irp r, q2, q3, q4, q5, q6, q7, q10, q11
|
||||
vmax.s32 \r, \r, q14
|
||||
.endr
|
||||
|
||||
vmul_vmla q8, q3, q5, d1[1], d1[0]
|
||||
vmul_vmls q12, q3, q5, d1[0], d1[1]
|
||||
vmul_vmls q13, q3, q5, d1[0], d1[1]
|
||||
vmul_vmls q14, q11, q7, d1[1], d1[0]
|
||||
|
||||
vrshr.s32 q3, q8, #12 // t4a
|
||||
vrshr.s32 q5, q12, #12 // t5a
|
||||
vrshr.s32 q5, q13, #12 // t5a
|
||||
|
||||
vmul_vmla q8, q11, q7, d1[0], d1[1]
|
||||
|
||||
|
@ -786,12 +827,24 @@ endfunc
|
|||
vqsub.s32 q2, q2, q6 // t2
|
||||
vqadd.s32 \r7, q4, q10 // out7
|
||||
vqsub.s32 q4, q4, q10 // t3
|
||||
vqneg.s32 \r7, \r7 // out7
|
||||
|
||||
vmvn.i32 q10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
|
||||
|
||||
vqadd.s32 \r1, q3, q7 // out1
|
||||
vqsub.s32 q3, q3, q7 // t6
|
||||
vqadd.s32 \r6, q5, q11 // out6
|
||||
vqsub.s32 q5, q5, q11 // t7
|
||||
|
||||
// Not clipping the output registers, as they will be downshifted and
|
||||
// narrowed afterwards anyway.
|
||||
.irp r, q2, q4, q3, q5
|
||||
vmin.s32 \r, \r, q12
|
||||
.endr
|
||||
.irp r, q2, q4, q3, q5
|
||||
vmax.s32 \r, \r, q10
|
||||
.endr
|
||||
|
||||
vqneg.s32 \r7, \r7 // out7
|
||||
vqneg.s32 \r1, \r1 // out1
|
||||
|
||||
vmul_vmla q10, q2, q4, d0[0], d0[0] // -> out3 (q11 or q12)
|
||||
|
@ -1068,6 +1121,14 @@ function inv_dct_2s_x16_neon
|
|||
|
||||
idct_2s_x8 d16, d18, d20, d22, d24, d26, d28, d30
|
||||
|
||||
// idct_8 leaves the row_clip_max/min constants in d9 and d8
|
||||
.irp r, d16, d18, d20, d22, d24, d26, d28, d30
|
||||
vmin.s32 \r, \r, d9
|
||||
.endr
|
||||
.irp r, d16, d18, d20, d22, d24, d26, d28, d30
|
||||
vmax.s32 \r, \r, d8
|
||||
.endr
|
||||
|
||||
vld1.32 {q0, q1}, [r12, :128]
|
||||
sub r12, r12, #32
|
||||
|
||||
|
@ -1099,6 +1160,13 @@ function inv_dct_2s_x16_neon
|
|||
vqadd.s32 d25, d29, d27 // t12
|
||||
vqsub.s32 d29, d29, d27 // t13
|
||||
|
||||
.irp r, d4, d17, d5, d31, d23, d19, d25, d29
|
||||
vmin.s32 \r, \r, d9
|
||||
.endr
|
||||
.irp r, d4, d17, d5, d31, d23, d19, d25, d29
|
||||
vmax.s32 \r, \r, d8
|
||||
.endr
|
||||
|
||||
vmul_vmls d6, d5, d4, d1[0], d1[1] // -> t9a
|
||||
vmul_vmla d7, d5, d4, d1[1], d1[0] // -> t14a
|
||||
vrshr.s32 d21, d6, #12 // t9a
|
||||
|
@ -1119,6 +1187,13 @@ function inv_dct_2s_x16_neon
|
|||
vqsub.s32 d25, d27, d29 // t13
|
||||
vqadd.s32 d27, d27, d29 // t14
|
||||
|
||||
.irp r, d4, d17, d5, d31, d19, d21, d25, d27
|
||||
vmin.s32 \r, \r, d9
|
||||
.endr
|
||||
.irp r, d4, d17, d5, d31, d19, d21, d25, d27
|
||||
vmax.s32 \r, \r, d8
|
||||
.endr
|
||||
|
||||
vmul_vmls d6, d5, d4, d0[0], d0[0] // -> t11
|
||||
vmul_vmla d7, d5, d4, d0[0], d0[0] // -> t12
|
||||
vmul_vmls d4, d25, d21, d0[0], d0[0] // -> t10a
|
||||
|
@ -1193,6 +1268,9 @@ endfunc
|
|||
|
||||
vld1.32 {q0, q1}, [r12, :128]
|
||||
|
||||
vmov.i32 d11, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
|
||||
vmvn.i32 d10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
|
||||
|
||||
vqsub.s32 d5, d16, d23 // t8a
|
||||
vqadd.s32 d16, d16, d23 // t0a
|
||||
vqsub.s32 d7, d31, d24 // t9a
|
||||
|
@ -1210,6 +1288,13 @@ endfunc
|
|||
vqadd.s32 d28, d25, d30 // t7a
|
||||
vqsub.s32 d25, d25, d30 // t15a
|
||||
|
||||
.irp r, d5, d16, d7, d31, d23, d18, d24, d29, d21, d20, d26, d27, d19, d22, d28, d25
|
||||
vmin.s32 \r, \r, d11
|
||||
.endr
|
||||
.irp r, d5, d16, d7, d31, d23, d18, d24, d29, d21, d20, d26, d27, d19, d22, d28, d25
|
||||
vmax.s32 \r, \r, d10
|
||||
.endr
|
||||
|
||||
vmul_vmla d4, d5, d7, d2[1], d2[0] // -> t8
|
||||
vmul_vmls d6, d5, d7, d2[0], d2[1] // -> t9
|
||||
vmul_vmla d8, d18, d29, d3[1], d3[0] // -> t10
|
||||
|
@ -1244,6 +1329,13 @@ endfunc
|
|||
vqadd.s32 d20, d29, d22 // t11a
|
||||
vqsub.s32 d29, d29, d22 // t15a
|
||||
|
||||
.irp r, d2, d16, d3, d31, d21, d23, d26, d24, d19, d17, d28, d30, d27, d18, d20, d29
|
||||
vmin.s32 \r, \r, d11
|
||||
.endr
|
||||
.irp r, d2, d16, d3, d31, d21, d23, d26, d24, d19, d17, d28, d30, d27, d18, d20, d29
|
||||
vmax.s32 \r, \r, d10
|
||||
.endr
|
||||
|
||||
vmul_vmla d4, d2, d3, d1[1], d1[0] // -> t4a
|
||||
vmul_vmls d6, d2, d3, d1[0], d1[1] // -> t5a
|
||||
vmul_vmls d8, d24, d23, d1[1], d1[0] // -> t6a
|
||||
|
@ -1272,24 +1364,34 @@ endfunc
|
|||
vqadd.s32 \o15,d31, d26 // out15
|
||||
vmov \o0, d4
|
||||
.endif
|
||||
vqneg.s32 \o15, \o15 // out15
|
||||
|
||||
vqsub.s32 d3, d29, d18 // t15a
|
||||
vqadd.s32 \o13,d29, d18 // out13
|
||||
vqadd.s32 \o2, d17, d30 // out2
|
||||
vqsub.s32 d26, d17, d30 // t14a
|
||||
vqneg.s32 \o13,\o13 // out13
|
||||
|
||||
vqadd.s32 \o1, d19, d27 // out1
|
||||
vqsub.s32 d27, d19, d27 // t10
|
||||
vqadd.s32 \o14,d28, d20 // out14
|
||||
vqsub.s32 d20, d28, d20 // t11
|
||||
vqneg.s32 \o1, \o1 // out1
|
||||
|
||||
vqadd.s32 \o3, d22, d24 // out3
|
||||
vqsub.s32 d22, d22, d24 // t6
|
||||
vqadd.s32 \o12,d25, d23 // out12
|
||||
vqsub.s32 d23, d25, d23 // t7
|
||||
|
||||
// Not clipping the output registers, as they will be downshifted and
|
||||
// narrowed afterwards anyway.
|
||||
.irp r, d2, d21, d3, d26, d27, d20, d22, d23
|
||||
vmin.s32 \r, \r, d11
|
||||
.endr
|
||||
.irp r, d2, d21, d3, d26, d27, d20, d22, d23
|
||||
vmax.s32 \r, \r, d10
|
||||
.endr
|
||||
|
||||
vqneg.s32 \o15, \o15 // out15
|
||||
vqneg.s32 \o13,\o13 // out13
|
||||
vqneg.s32 \o1, \o1 // out1
|
||||
vqneg.s32 \o3, \o3 // out3
|
||||
|
||||
vmul_vmls d24, d2, d21, d0[0], d0[0] // -> out8 (d24 or d23)
|
||||
|
@ -1947,6 +2049,9 @@ function inv_dct32_odd_2s_x16_neon
|
|||
|
||||
vld1.32 {q0, q1}, [r12, :128]
|
||||
|
||||
vmov.i32 d11, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
|
||||
vmvn.i32 d10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
|
||||
|
||||
vqsub.s32 d5, d16, d24 // t17
|
||||
vqadd.s32 d16, d16, d24 // t16
|
||||
vqsub.s32 d7, d31, d23 // t30
|
||||
|
@ -1964,6 +2069,13 @@ function inv_dct32_odd_2s_x16_neon
|
|||
vqadd.s32 d25, d19, d27 // t28
|
||||
vqsub.s32 d19, d19, d27 // t29
|
||||
|
||||
.irp r, d5, d16, d7, d31, d24, d28, d23, d18, d20, d30, d26, d17, d22, d29, d25, d19
|
||||
vmin.s32 \r, \r, d11
|
||||
.endr
|
||||
.irp r, d5, d16, d7, d31, d24, d28, d23, d18, d20, d30, d26, d17, d22, d29, d25, d19
|
||||
vmax.s32 \r, \r, d10
|
||||
.endr
|
||||
|
||||
vmul_vmls d4, d7, d5, d2[0], d2[1] // -> t17a
|
||||
vmul_vmla d6, d7, d5, d2[1], d2[0] // -> t30a
|
||||
vmul_vmla d8, d19, d24, d2[1], d2[0] // -> t18a
|
||||
|
@ -2000,6 +2112,13 @@ function inv_dct32_odd_2s_x16_neon
|
|||
vqsub.s32 d29, d31, d25 // t28a
|
||||
vqadd.s32 d31, d31, d25 // t31a
|
||||
|
||||
.irp r, d2, d27, d3, d21, d24, d16, d19, d30, d28, d17, d23, d26, d22, d20, d29, d31
|
||||
vmin.s32 \r, \r, d11
|
||||
.endr
|
||||
.irp r, d2, d27, d3, d21, d24, d16, d19, d30, d28, d17, d23, d26, d22, d20, d29, d31
|
||||
vmax.s32 \r, \r, d10
|
||||
.endr
|
||||
|
||||
vmul_vmls d4, d2, d3, d1[0], d1[1] // -> t18a
|
||||
vmul_vmla d6, d2, d3, d1[1], d1[0] // -> t29a
|
||||
vmul_vmls d8, d29, d24, d1[0], d1[1] // -> t19
|
||||
|
@ -2037,6 +2156,13 @@ function inv_dct32_odd_2s_x16_neon
|
|||
vqsub.s32 d24, d24, d19 // t27a
|
||||
vmov d19, d4 // out19
|
||||
|
||||
.irp r, d2, d16, d3, d31, d23, d17, d30, d21, d27, d18, d19, d26, d29, d25, d28, d24
|
||||
vmin.s32 \r, \r, d11
|
||||
.endr
|
||||
.irp r, d2, d16, d3, d31, d23, d17, d30, d21, d27, d18, d19, d26, d29, d25, d28, d24
|
||||
vmax.s32 \r, \r, d10
|
||||
.endr
|
||||
|
||||
vmul_vmls d4, d24, d26, d0[0], d0[0] // -> t20
|
||||
vmul_vmla d6, d24, d26, d0[0], d0[0] // -> t27
|
||||
vrshr.s32 d20, d4, #12 // t20
|
||||
|
@ -2081,6 +2207,18 @@ function inv_txfm_horz\suffix\()_dct_32x2_neon
|
|||
scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15
|
||||
.endif
|
||||
bl inv_dct_2s_x16_neon
|
||||
|
||||
// idct_16 leaves the row_clip_max/min constants in d9 and d8,
|
||||
// but here we want to use full q registers for clipping.
|
||||
vmov.i32 q3, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
|
||||
vmvn.i32 q2, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
|
||||
.irp r, q8, q9, q10, q11, q12, q13, q14, q15
|
||||
vmin.s32 \r, \r, q3
|
||||
.endr
|
||||
.irp r, q8, q9, q10, q11, q12, q13, q14, q15
|
||||
vmax.s32 \r, \r, q2
|
||||
.endr
|
||||
|
||||
vtrn.32 d16, d17
|
||||
vtrn.32 d18, d19
|
||||
vtrn.32 d20, d21
|
||||
|
@ -2745,14 +2883,21 @@ function inv_dct64_step1_neon
|
|||
vqsub.s32 d30, d23, d22 // t62
|
||||
vqadd.s32 d31, d23, d22 // t63
|
||||
|
||||
.irp r, q12, q13, q14, q15
|
||||
vmin.s32 \r, \r, q5
|
||||
.endr
|
||||
.irp r, q12, q13, q14, q15
|
||||
vmax.s32 \r, \r, q4
|
||||
.endr
|
||||
|
||||
vmul_vmla d4, d29, d26, d0[0], d0[1] // -> t34a
|
||||
vmul_vmls d6, d29, d26, d0[1], d0[0] // -> t61a
|
||||
vneg.s32 d4, d4 // t34a
|
||||
vmul_vmls d8, d30, d25, d0[1], d0[0] // -> t33a
|
||||
vmul_vmls d7, d30, d25, d0[1], d0[0] // -> t33a
|
||||
vrshr.s32 d26, d4, #12 // t34a
|
||||
vmul_vmla d4, d30, d25, d0[0], d0[1] // -> t62a
|
||||
vrshr.s32 d29, d6, #12 // t61a
|
||||
vrshr.s32 d25, d8, #12 // t33a
|
||||
vrshr.s32 d25, d7, #12 // t33a
|
||||
vrshr.s32 d30, d4, #12 // t62a
|
||||
|
||||
vqadd.s32 d16, d24, d27 // t32a
|
||||
|
@ -2764,13 +2909,20 @@ function inv_dct64_step1_neon
|
|||
vqsub.s32 d21, d30, d29 // t61
|
||||
vqadd.s32 d22, d30, d29 // t62
|
||||
|
||||
.irp r, q8, q9, q10, q11
|
||||
vmin.s32 \r, \r, q5
|
||||
.endr
|
||||
.irp r, q8, q9, q10, q11
|
||||
vmax.s32 \r, \r, q4
|
||||
.endr
|
||||
|
||||
vmul_vmla d4, d21, d18, d1[0], d1[1] // -> t61a
|
||||
vmul_vmls d6, d21, d18, d1[1], d1[0] // -> t34a
|
||||
vmul_vmla d8, d20, d19, d1[0], d1[1] // -> t60
|
||||
vmul_vmla d7, d20, d19, d1[0], d1[1] // -> t60
|
||||
vrshr.s32 d21, d4, #12 // t61a
|
||||
vrshr.s32 d18, d6, #12 // t34a
|
||||
vmul_vmls d4, d20, d19, d1[1], d1[0] // -> t35
|
||||
vrshr.s32 d20, d8, #12 // t60
|
||||
vrshr.s32 d20, d7, #12 // t60
|
||||
vrshr.s32 d19, d4, #12 // t35
|
||||
|
||||
vst1.32 {d16, d17, d18, d19}, [r6, :128]!
|
||||
|
@ -2805,14 +2957,21 @@ function inv_dct64_step2_neon
|
|||
vqadd.s32 d30, d23, d22 // t48
|
||||
vqsub.s32 d31, d23, d22 // t55
|
||||
|
||||
.irp r, q12, q13, q14, q15
|
||||
vmin.s32 \r, \r, q5
|
||||
.endr
|
||||
.irp r, q12, q13, q14, q15
|
||||
vmax.s32 \r, \r, q4
|
||||
.endr
|
||||
|
||||
vmul_vmla d4, d27, d25, d1[1], d1[0] // -> t56a
|
||||
vmul_vmls d6, d27, d25, d1[0], d1[1] // -> t39a
|
||||
vmul_vmla d8, d31, d28, d1[1], d1[0] // -> t40a
|
||||
vmul_vmla d7, d31, d28, d1[1], d1[0] // -> t40a
|
||||
vrshr.s32 d25, d4, #12 // t56a
|
||||
vrshr.s32 d27, d6, #12 // t39a
|
||||
vneg.s32 d8, d8 // t40a
|
||||
vneg.s32 d7, d7 // t40a
|
||||
vmul_vmls d4, d31, d28, d1[0], d1[1] // -> t55a
|
||||
vrshr.s32 d31, d8, #12 // t40a
|
||||
vrshr.s32 d31, d7, #12 // t40a
|
||||
vrshr.s32 d28, d4, #12 // t55a
|
||||
|
||||
vqadd.s32 d16, d24, d29 // t32a
|
||||
|
@ -2824,13 +2983,20 @@ function inv_dct64_step2_neon
|
|||
vqsub.s32 d21, d25, d28 // t55
|
||||
vqadd.s32 d22, d25, d28 // t56
|
||||
|
||||
.irp r, q8, q9, q10, q11
|
||||
vmin.s32 \r, \r, q5
|
||||
.endr
|
||||
.irp r, q8, q9, q10, q11
|
||||
vmax.s32 \r, \r, q4
|
||||
.endr
|
||||
|
||||
vmul_vmls d4, d21, d18, d0[0], d0[0] // -> t40a
|
||||
vmul_vmla d6, d21, d18, d0[0], d0[0] // -> t55a
|
||||
vmul_vmls d8, d20, d19, d0[0], d0[0] // -> t47
|
||||
vmul_vmls d7, d20, d19, d0[0], d0[0] // -> t47
|
||||
vrshr.s32 d18, d4, #12 // t40a
|
||||
vrshr.s32 d21, d6, #12 // t55a
|
||||
vmul_vmla d4, d20, d19, d0[0], d0[0] // -> t48
|
||||
vrshr.s32 d19, d8, #12 // t47
|
||||
vrshr.s32 d19, d7, #12 // t47
|
||||
vrshr.s32 d20, d4, #12 // t48
|
||||
|
||||
vstr d16, [r6, #4*2*0] // t32a
|
||||
|
@ -2916,6 +3082,17 @@ function inv_txfm_dct\suffix\()_2s_x64_neon
|
|||
|
||||
bl inv_dct_2s_x16_neon
|
||||
|
||||
// idct_16 leaves the row_clip_max/min constants in d9 and d8,
|
||||
// but here we want to use full q registers for clipping.
|
||||
vmov.i32 q3, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
|
||||
vmvn.i32 q2, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
|
||||
.irp r, q8, q9, q10, q11, q12, q13, q14, q15
|
||||
vmin.s32 \r, \r, q3
|
||||
.endr
|
||||
.irp r, q8, q9, q10, q11, q12, q13, q14, q15
|
||||
vmax.s32 \r, \r, q2
|
||||
.endr
|
||||
|
||||
store16 r6
|
||||
|
||||
movdup_if d0, r12, 2896*8*(1<<16), \scale
|
||||
|
@ -2934,6 +3111,8 @@ function inv_txfm_dct\suffix\()_2s_x64_neon
|
|||
|
||||
mov r9, #-8
|
||||
|
||||
vmov.i32 d1, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
|
||||
vmvn.i32 d0, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
|
||||
.macro store_addsub r0, r1, r2, r3
|
||||
vld1.32 {d2}, [r6, :64]!
|
||||
vld1.32 {d3}, [r6, :64]!
|
||||
|
@ -2942,16 +3121,32 @@ function inv_txfm_dct\suffix\()_2s_x64_neon
|
|||
vld1.32 {d4}, [r6, :64]!
|
||||
vqadd.s32 d7, d3, \r1
|
||||
vqsub.s32 \r1, d3, \r1
|
||||
vmin.s32 d6, d6, d1
|
||||
vmin.s32 \r0, \r0, d1
|
||||
vld1.32 {d5}, [r6, :64]!
|
||||
vqadd.s32 d2, d4, \r2
|
||||
sub r6, r6, #8*4
|
||||
vmax.s32 d6, d6, d0
|
||||
vmax.s32 \r0, \r0, d0
|
||||
vqsub.s32 \r2, d4, \r2
|
||||
vmin.s32 d7, d7, d1
|
||||
vmin.s32 \r1, \r1, d1
|
||||
vst1.32 {d6}, [r6, :64]!
|
||||
vst1.32 {\r0}, [r10, :64], r9
|
||||
vmin.s32 d2, d2, d1
|
||||
vmin.s32 \r2, \r2, d1
|
||||
vmax.s32 d7, d7, d0
|
||||
vmax.s32 \r1, \r1, d0
|
||||
vqadd.s32 d3, d5, \r3
|
||||
vqsub.s32 \r3, d5, \r3
|
||||
vmax.s32 d2, d2, d0
|
||||
vmax.s32 \r2, \r2, d0
|
||||
vmin.s32 d3, d3, d1
|
||||
vmin.s32 \r3, \r3, d1
|
||||
vst1.32 {d7}, [r6, :64]!
|
||||
vst1.32 {\r1}, [r10, :64], r9
|
||||
vmax.s32 d3, d3, d0
|
||||
vmax.s32 \r3, \r3, d0
|
||||
vst1.32 {d2}, [r6, :64]!
|
||||
vst1.32 {\r2}, [r10, :64], r9
|
||||
vst1.32 {d3}, [r6, :64]!
|
||||
|
@ -2966,6 +3161,8 @@ function inv_txfm_dct\suffix\()_2s_x64_neon
|
|||
add r6, r6, #2*4*16
|
||||
|
||||
movrel_local r12, idct64_coeffs
|
||||
vmov.i32 q5, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
|
||||
vmvn.i32 q4, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
|
||||
movdup_if d0, lr, 2896*8*(1<<16), \scale
|
||||
vmov_if d7, #0, \clear
|
||||
add r9, r7, r8, lsl #4 // offset 16
|
||||
|
|
|
@@ -483,10 +483,10 @@ endfunc
add \o1\().4s, v5.4s, v7.4s
sub \o3\().4s, \o3\().4s, v7.4s

rshrn \o0\().4h, \o0\().4s, #12
rshrn \o2\().4h, \o2\().4s, #12
rshrn \o1\().4h, \o1\().4s, #12
rshrn \o3\().4h, \o3\().4s, #12
sqrshrn \o0\().4h, \o0\().4s, #12
sqrshrn \o2\().4h, \o2\().4s, #12
sqrshrn \o1\().4h, \o1\().4s, #12
sqrshrn \o3\().4h, \o3\().4s, #12
.endm

function inv_adst_4h_x4_neon, export=1
@@ -538,21 +538,21 @@ endfunc
sub v4.4s, v4.4s, v2.4s // out3
sub v5.4s, v5.4s, v3.4s

rshrn v18.4h, v18.4s, #12
rshrn2 v18.8h, v19.4s, #12
sqrshrn v18.4h, v18.4s, #12
sqrshrn2 v18.8h, v19.4s, #12

rshrn \o0\().4h, v16.4s, #12
rshrn2 \o0\().8h, v17.4s, #12
sqrshrn \o0\().4h, v16.4s, #12
sqrshrn2 \o0\().8h, v17.4s, #12

.ifc \o2, v17
mov v17.16b, v18.16b
.endif

rshrn \o1\().4h, v6.4s, #12
rshrn2 \o1\().8h, v7.4s, #12
sqrshrn \o1\().4h, v6.4s, #12
sqrshrn2 \o1\().8h, v7.4s, #12

rshrn \o3\().4h, v4.4s, #12
rshrn2 \o3\().8h, v5.4s, #12
sqrshrn \o3\().4h, v4.4s, #12
sqrshrn2 \o3\().8h, v5.4s, #12
.endm

function inv_adst_8h_x4_neon, export=1
@ -124,6 +124,13 @@ endconst
|
|||
.endif
|
||||
.endm
|
||||
|
||||
.macro smin_4s r0, r1, r2
|
||||
smin \r0\().4s, \r1\().4s, \r2\().4s
|
||||
.endm
|
||||
.macro smax_4s r0, r1, r2
|
||||
smax \r0\().4s, \r1\().4s, \r2\().4s
|
||||
.endm
|
||||
|
||||
.macro load_add_store load, shift, addsrc, adddst, min, store, dst, src, shiftbits=4
|
||||
.ifnb \load
|
||||
ld1 {\load}, [\src], x1
|
||||
|
@ -599,12 +606,21 @@ def_fn_4x4 identity, flipadst
|
|||
.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7
|
||||
idct_4 \r0, \r2, \r4, \r6
|
||||
|
||||
movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
|
||||
mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
|
||||
.irp r, \r0, \r2, \r4, \r6
|
||||
smin_4s \r, \r, v5
|
||||
.endr
|
||||
.irp r, \r0, \r2, \r4, \r6
|
||||
smax_4s \r, \r, v4
|
||||
.endr
|
||||
|
||||
mul_mls v2, \r1, \r7, v1.s[0], v1.s[1] // -> t4a
|
||||
mul_mla v4, \r1, \r7, v1.s[1], v1.s[0] // -> t7a
|
||||
mul_mla v3, \r1, \r7, v1.s[1], v1.s[0] // -> t7a
|
||||
mul_mls v6, \r5, \r3, v1.s[2], v1.s[3] // -> t5a
|
||||
mul_mla v7, \r5, \r3, v1.s[3], v1.s[2] // -> t6a
|
||||
srshr \r1\().4s, v2.4s, #12 // t4a
|
||||
srshr \r7\().4s, v4.4s, #12 // t7a
|
||||
srshr \r7\().4s, v3.4s, #12 // t7a
|
||||
srshr \r3\().4s, v6.4s, #12 // t5a
|
||||
srshr \r5\().4s, v7.4s, #12 // t6a
|
||||
|
||||
|
@ -613,17 +629,24 @@ def_fn_4x4 identity, flipadst
|
|||
sqadd v3.4s, \r7\().4s, \r5\().4s // t7
|
||||
sqsub \r3\().4s, \r7\().4s, \r5\().4s // t6a
|
||||
|
||||
mul_mls v4, \r3, \r1, v0.s[0], v0.s[0] // -> t5
|
||||
.irp r, v2, \r1, v3, \r3
|
||||
smin_4s \r, \r, v5
|
||||
.endr
|
||||
.irp r, v2, \r1, v3, \r3
|
||||
smax_4s \r, \r, v4
|
||||
.endr
|
||||
|
||||
mul_mls v7, \r3, \r1, v0.s[0], v0.s[0] // -> t5
|
||||
mul_mla v6, \r3, \r1, v0.s[0], v0.s[0] // -> t6
|
||||
srshr v4.4s, v4.4s, #12 // t5
|
||||
srshr v5.4s, v6.4s, #12 // t6
|
||||
srshr v7.4s, v7.4s, #12 // t5
|
||||
srshr v6.4s, v6.4s, #12 // t6
|
||||
|
||||
sqsub \r7\().4s, \r0\().4s, v3.4s // out7
|
||||
sqadd \r0\().4s, \r0\().4s, v3.4s // out0
|
||||
sqadd \r1\().4s, \r2\().4s, v5.4s // out1
|
||||
sqsub v6.4s, \r2\().4s, v5.4s // out6
|
||||
sqadd \r2\().4s, \r4\().4s, v4.4s // out2
|
||||
sqsub \r5\().4s, \r4\().4s, v4.4s // out5
|
||||
sqadd \r1\().4s, \r2\().4s, v6.4s // out1
|
||||
sqsub v6.4s, \r2\().4s, v6.4s // out6
|
||||
sqadd \r2\().4s, \r4\().4s, v7.4s // out2
|
||||
sqsub \r5\().4s, \r4\().4s, v7.4s // out5
|
||||
sqadd \r3\().4s, \r6\().4s, v2.4s // out3
|
||||
sqsub \r4\().4s, \r6\().4s, v2.4s // out4
|
||||
mov \r6\().16b, v6.16b // out6
|
||||
|
@ -660,8 +683,11 @@ endfunc
|
|||
|
||||
ld1 {v0.4s}, [x16]
|
||||
|
||||
movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
|
||||
|
||||
sqadd v2.4s, v16.4s, v20.4s // t0
|
||||
sqsub v3.4s, v16.4s, v20.4s // t4
|
||||
mvni v20.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
|
||||
sqadd v4.4s, v23.4s, v19.4s // t1
|
||||
sqsub v5.4s, v23.4s, v19.4s // t5
|
||||
sqadd v6.4s, v18.4s, v22.4s // t2
|
||||
|
@ -669,6 +695,13 @@ endfunc
|
|||
sqadd v18.4s, v21.4s, v17.4s // t3
|
||||
sqsub v19.4s, v21.4s, v17.4s // t7
|
||||
|
||||
.irp r, v2, v3, v4, v5, v6, v7, v18, v19
|
||||
smin_4s \r, \r, v1
|
||||
.endr
|
||||
.irp r, v2, v3, v4, v5, v6, v7, v18, v19
|
||||
smax_4s \r, \r, v20
|
||||
.endr
|
||||
|
||||
mul_mla v16, v3, v5, v0.s[3], v0.s[2]
|
||||
mul_mls v20, v3, v5, v0.s[2], v0.s[3]
|
||||
mul_mls v22, v19, v7, v0.s[3], v0.s[2]
|
||||
|
@ -685,12 +718,24 @@ endfunc
|
|||
sqsub v2.4s, v2.4s, v6.4s // t2
|
||||
sqadd \o7\().4s, v4.4s, v18.4s // out7
|
||||
sqsub v4.4s, v4.4s, v18.4s // t3
|
||||
sqneg \o7\().4s, \o7\().4s // out7
|
||||
|
||||
mvni v18.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
|
||||
|
||||
sqadd \o1\().4s, v3.4s, v7.4s // out1
|
||||
sqsub v3.4s, v3.4s, v7.4s // t6
|
||||
sqadd \o6\().4s, v5.4s, v19.4s // out6
|
||||
sqsub v5.4s, v5.4s, v19.4s // t7
|
||||
|
||||
// Not clipping the output registers, as they will be downshifted and
|
||||
// narrowed afterwards anyway.
|
||||
.irp r, v2, v4, v3, v5
|
||||
smin_4s \r, \r, v1
|
||||
.endr
|
||||
.irp r, v2, v4, v3, v5
|
||||
smax_4s \r, \r, v18
|
||||
.endr
|
||||
|
||||
sqneg \o7\().4s, \o7\().4s // out7
|
||||
sqneg \o1\().4s, \o1\().4s // out1
|
||||
|
||||
mul_mla v18, v2, v4, v0.s[0], v0.s[0] // -> out3 (v19 or v20)
|
||||
|
@ -959,25 +1004,33 @@ function inv_dct_4s_x16_neon
|
|||
|
||||
idct_8 v16, v18, v20, v22, v24, v26, v28, v30
|
||||
|
||||
// idct_8 leaves the row_clip_max/min constants in v5 and v4
|
||||
.irp r, v16, v18, v20, v22, v24, v26, v28, v30
|
||||
smin \r\().4s, \r\().4s, v5.4s
|
||||
.endr
|
||||
.irp r, v16, v18, v20, v22, v24, v26, v28, v30
|
||||
smax \r\().4s, \r\().4s, v4.4s
|
||||
.endr
|
||||
|
||||
ld1 {v0.4s, v1.4s}, [x16]
|
||||
sub x16, x16, #32
|
||||
|
||||
mul_mls v2, v17, v31, v0.s[0], v0.s[1] // -> t8a
|
||||
mul_mla v4, v17, v31, v0.s[1], v0.s[0] // -> t15a
|
||||
mul_mla v3, v17, v31, v0.s[1], v0.s[0] // -> t15a
|
||||
mul_mls v6, v25, v23, v0.s[2], v0.s[3] // -> t9a
|
||||
srshr v17.4s, v2.4s, #12 // t8a
|
||||
srshr v31.4s, v4.4s, #12 // t15a
|
||||
srshr v31.4s, v3.4s, #12 // t15a
|
||||
mul_mla v2, v25, v23, v0.s[3], v0.s[2] // -> t14a
|
||||
mul_mls v4, v21, v27, v1.s[0], v1.s[1] // -> t10a
|
||||
mul_mls v3, v21, v27, v1.s[0], v1.s[1] // -> t10a
|
||||
srshr v23.4s, v6.4s, #12 // t9a
|
||||
srshr v25.4s, v2.4s, #12 // t14a
|
||||
mul_mla v6, v21, v27, v1.s[1], v1.s[0] // -> t13a
|
||||
mul_mls v2, v29, v19, v1.s[2], v1.s[3] // -> t11a
|
||||
srshr v21.4s, v4.4s, #12 // t10a
|
||||
srshr v21.4s, v3.4s, #12 // t10a
|
||||
srshr v27.4s, v6.4s, #12 // t13a
|
||||
mul_mla v4, v29, v19, v1.s[3], v1.s[2] // -> t12a
|
||||
mul_mla v3, v29, v19, v1.s[3], v1.s[2] // -> t12a
|
||||
srshr v19.4s, v2.4s, #12 // t11a
|
||||
srshr v29.4s, v4.4s, #12 // t12a
|
||||
srshr v29.4s, v3.4s, #12 // t12a
|
||||
|
||||
ld1 {v0.4s}, [x16]
|
||||
|
||||
|
@ -990,14 +1043,21 @@ function inv_dct_4s_x16_neon
|
|||
sqadd v25.4s, v29.4s, v27.4s // t12
|
||||
sqsub v29.4s, v29.4s, v27.4s // t13
|
||||
|
||||
mul_mls v4, v3, v2, v0.s[2], v0.s[3] // -> t9a
|
||||
.irp r, v2, v17, v3, v31, v23, v19, v25, v29
|
||||
smin \r\().4s, \r\().4s, v5.4s
|
||||
.endr
|
||||
.irp r, v2, v17, v3, v31, v23, v19, v25, v29
|
||||
smax \r\().4s, \r\().4s, v4.4s
|
||||
.endr
|
||||
|
||||
mul_mls v7, v3, v2, v0.s[2], v0.s[3] // -> t9a
|
||||
mul_mla v6, v3, v2, v0.s[3], v0.s[2] // -> t14a
|
||||
srshr v21.4s, v4.4s, #12 // t9a
|
||||
srshr v21.4s, v7.4s, #12 // t9a
|
||||
srshr v27.4s, v6.4s, #12 // t14a
|
||||
|
||||
mul_mls v4, v29, v23, v0.s[2], v0.s[3] // -> t13a
|
||||
mul_mls v7, v29, v23, v0.s[2], v0.s[3] // -> t13a
|
||||
mul_mla v6, v29, v23, v0.s[3], v0.s[2] // -> t10a
|
||||
srshr v29.4s, v4.4s, #12 // t13a
|
||||
srshr v29.4s, v7.4s, #12 // t13a
|
||||
neg v6.4s, v6.4s
|
||||
srshr v23.4s, v6.4s, #12 // t10a
|
||||
|
||||
|
@ -1010,34 +1070,41 @@ function inv_dct_4s_x16_neon
|
|||
sqsub v25.4s, v27.4s, v29.4s // t13
|
||||
sqadd v27.4s, v27.4s, v29.4s // t14
|
||||
|
||||
mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t11
|
||||
.irp r, v2, v17, v3, v31, v19, v21, v25, v27
|
||||
smin \r\().4s, \r\().4s, v5.4s
|
||||
.endr
|
||||
.irp r, v2, v17, v3, v31, v19, v21, v25, v27
|
||||
smax \r\().4s, \r\().4s, v4.4s
|
||||
.endr
|
||||
|
||||
mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t11
|
||||
mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t12
|
||||
mul_mls v2, v25, v21, v0.s[0], v0.s[0] // -> t10a
|
||||
|
||||
srshr v4.4s, v4.4s, #12 // t11
|
||||
srshr v5.4s, v6.4s, #12 // t12
|
||||
mul_mla v6, v25, v21, v0.s[0], v0.s[0] // -> t13a
|
||||
srshr v7.4s, v7.4s, #12 // t11
|
||||
srshr v6.4s, v6.4s, #12 // t12
|
||||
mul_mla v3, v25, v21, v0.s[0], v0.s[0] // -> t13a
|
||||
srshr v2.4s, v2.4s, #12 // t10a
|
||||
srshr v3.4s, v6.4s, #12 // t13a
|
||||
srshr v3.4s, v3.4s, #12 // t13a
|
||||
|
||||
sqadd v6.4s, v16.4s, v31.4s // out0
|
||||
sqadd v1.4s, v16.4s, v31.4s // out0
|
||||
sqsub v31.4s, v16.4s, v31.4s // out15
|
||||
mov v16.16b, v6.16b
|
||||
mov v16.16b, v1.16b
|
||||
sqadd v23.4s, v30.4s, v17.4s // out7
|
||||
sqsub v7.4s, v30.4s, v17.4s // out8
|
||||
sqsub v1.4s, v30.4s, v17.4s // out8
|
||||
sqadd v17.4s, v18.4s, v27.4s // out1
|
||||
sqsub v30.4s, v18.4s, v27.4s // out14
|
||||
sqadd v18.4s, v20.4s, v3.4s // out2
|
||||
sqsub v29.4s, v20.4s, v3.4s // out13
|
||||
sqadd v3.4s, v28.4s, v19.4s // out6
|
||||
sqsub v25.4s, v28.4s, v19.4s // out9
|
||||
sqadd v19.4s, v22.4s, v5.4s // out3
|
||||
sqsub v28.4s, v22.4s, v5.4s // out12
|
||||
sqadd v20.4s, v24.4s, v4.4s // out4
|
||||
sqsub v27.4s, v24.4s, v4.4s // out11
|
||||
sqadd v19.4s, v22.4s, v6.4s // out3
|
||||
sqsub v28.4s, v22.4s, v6.4s // out12
|
||||
sqadd v20.4s, v24.4s, v7.4s // out4
|
||||
sqsub v27.4s, v24.4s, v7.4s // out11
|
||||
sqadd v21.4s, v26.4s, v2.4s // out5
|
||||
sqsub v26.4s, v26.4s, v2.4s // out10
|
||||
mov v24.16b, v7.16b
|
||||
mov v24.16b, v1.16b
|
||||
mov v22.16b, v3.16b
|
||||
|
||||
ret
|
||||
|
@ -1084,6 +1151,9 @@ endfunc
|
|||
|
||||
ld1 {v0.4s, v1.4s}, [x16]
|
||||
|
||||
movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
|
||||
mvni v7.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
|
||||
|
||||
sqsub v2.4s, v16.4s, v23.4s // t8a
|
||||
sqadd v16.4s, v16.4s, v23.4s // t0a
|
||||
sqsub v3.4s, v31.4s, v24.4s // t9a
|
||||
|
@ -1101,6 +1171,13 @@ endfunc
|
|||
sqadd v28.4s, v25.4s, v30.4s // t7a
|
||||
sqsub v25.4s, v25.4s, v30.4s // t15a
|
||||
|
||||
.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25
|
||||
smin_4s \r, \r, v5
|
||||
.endr
|
||||
.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25
|
||||
smax_4s \r, \r, v7
|
||||
.endr
|
||||
|
||||
mul_mla v4, v2, v3, v1.s[1], v1.s[0] // -> t8
|
||||
mul_mls v6, v2, v3, v1.s[0], v1.s[1] // -> t9
|
||||
mul_mla v2, v18, v29, v1.s[3], v1.s[2] // -> t10
|
||||
|
@ -1135,6 +1212,13 @@ endfunc
|
|||
sqadd v20.4s, v29.4s, v22.4s // t11a
|
||||
sqsub v29.4s, v29.4s, v22.4s // t15a
|
||||
|
||||
.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29
|
||||
smin_4s \r, \r, v5
|
||||
.endr
|
||||
.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29
|
||||
smax_4s \r, \r, v7
|
||||
.endr
|
||||
|
||||
mul_mla v4, v2, v3, v0.s[3], v0.s[2] // -> t4a
|
||||
mul_mls v6, v2, v3, v0.s[2], v0.s[3] // -> t5a
|
||||
mul_mls v2, v24, v23, v0.s[3], v0.s[2] // -> t6a
|
||||
|
@ -1163,24 +1247,34 @@ endfunc
|
|||
sqadd \o15\().4s, v31.4s, v26.4s // out15
|
||||
mov \o0\().16b, v4.16b
|
||||
.endif
|
||||
sqneg \o15\().4s, \o15\().4s // out15
|
||||
|
||||
sqsub v3.4s, v29.4s, v18.4s // t15a
|
||||
sqadd \o13\().4s, v29.4s, v18.4s // out13
|
||||
sqadd \o2\().4s, v17.4s, v30.4s // out2
|
||||
sqsub v26.4s, v17.4s, v30.4s // t14a
|
||||
sqneg \o13\().4s, \o13\().4s // out13
|
||||
|
||||
sqadd \o1\().4s, v19.4s, v27.4s // out1
|
||||
sqsub v27.4s, v19.4s, v27.4s // t10
|
||||
sqadd \o14\().4s, v28.4s, v20.4s // out14
|
||||
sqsub v20.4s, v28.4s, v20.4s // t11
|
||||
sqneg \o1\().4s, \o1\().4s // out1
|
||||
|
||||
sqadd \o3\().4s, v22.4s, v24.4s // out3
|
||||
sqsub v22.4s, v22.4s, v24.4s // t6
|
||||
sqadd \o12\().4s, v25.4s, v23.4s // out12
|
||||
sqsub v23.4s, v25.4s, v23.4s // t7
|
||||
|
||||
// Not clipping the output registers, as they will be downshifted and
|
||||
// narrowed afterwards anyway.
|
||||
.irp r, v2, v21, v3, v26, v27, v20, v22, v23
|
||||
smin_4s \r, \r, v5
|
||||
.endr
|
||||
.irp r, v2, v21, v3, v26, v27, v20, v22, v23
|
||||
smax_4s \r, \r, v7
|
||||
.endr
|
||||
|
||||
sqneg \o15\().4s, \o15\().4s // out15
|
||||
sqneg \o13\().4s, \o13\().4s // out13
|
||||
sqneg \o1\().4s, \o1\().4s // out1
|
||||
sqneg \o3\().4s, \o3\().4s // out3
|
||||
|
||||
mul_mls v24, v2, v21, v0.s[0], v0.s[0] // -> out8 (v24 or v23)
|
||||
|
@ -1956,6 +2050,9 @@ function inv_dct32_odd_4s_x16_neon
|
|||
|
||||
ld1 {v0.4s, v1.4s}, [x16]
|
||||
|
||||
movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
|
||||
mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
|
||||
|
||||
sqsub v2.4s, v16.4s, v24.4s // t17
|
||||
sqadd v16.4s, v16.4s, v24.4s // t16
|
||||
sqsub v3.4s, v31.4s, v23.4s // t30
|
||||
|
@ -1973,23 +2070,30 @@ function inv_dct32_odd_4s_x16_neon
|
|||
sqadd v25.4s, v19.4s, v27.4s // t28
|
||||
sqsub v19.4s, v19.4s, v27.4s // t29
|
||||
|
||||
mul_mls v4, v3, v2, v1.s[0], v1.s[1] // -> t17a
|
||||
.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19
|
||||
smin \r\().4s, \r\().4s, v5.4s
|
||||
.endr
|
||||
.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19
|
||||
smax \r\().4s, \r\().4s, v4.4s
|
||||
.endr
|
||||
|
||||
mul_mls v7, v3, v2, v1.s[0], v1.s[1] // -> t17a
|
||||
mul_mla v6, v3, v2, v1.s[1], v1.s[0] // -> t30a
|
||||
mul_mla v2, v19, v24, v1.s[1], v1.s[0] // -> t18a
|
||||
srshr v21.4s, v4.4s, #12 // t17a
|
||||
srshr v21.4s, v7.4s, #12 // t17a
|
||||
srshr v27.4s, v6.4s, #12 // t30a
|
||||
neg v2.4s, v2.4s // -> t18a
|
||||
mul_mls v4, v19, v24, v1.s[0], v1.s[1] // -> t29a
|
||||
mul_mls v7, v19, v24, v1.s[0], v1.s[1] // -> t29a
|
||||
mul_mls v6, v22, v18, v1.s[2], v1.s[3] // -> t21a
|
||||
srshr v19.4s, v2.4s, #12 // t18a
|
||||
srshr v24.4s, v4.4s, #12 // t29a
|
||||
srshr v24.4s, v7.4s, #12 // t29a
|
||||
mul_mla v2, v22, v18, v1.s[3], v1.s[2] // -> t26a
|
||||
mul_mla v4, v17, v20, v1.s[3], v1.s[2] // -> t22a
|
||||
mul_mla v7, v17, v20, v1.s[3], v1.s[2] // -> t22a
|
||||
srshr v22.4s, v6.4s, #12 // t21a
|
||||
srshr v18.4s, v2.4s, #12 // t26a
|
||||
neg v4.4s, v4.4s // -> t22a
|
||||
neg v7.4s, v7.4s // -> t22a
|
||||
mul_mls v6, v17, v20, v1.s[2], v1.s[3] // -> t25a
|
||||
srshr v17.4s, v4.4s, #12 // t22a
|
||||
srshr v17.4s, v7.4s, #12 // t22a
|
||||
srshr v20.4s, v6.4s, #12 // t25a
|
||||
|
||||
sqsub v2.4s, v27.4s, v24.4s // t29
|
||||
|
@ -2009,23 +2113,30 @@ function inv_dct32_odd_4s_x16_neon
|
|||
sqsub v29.4s, v31.4s, v25.4s // t28a
|
||||
sqadd v31.4s, v31.4s, v25.4s // t31a
|
||||
|
||||
mul_mls v4, v2, v3, v0.s[2], v0.s[3] // -> t18a
|
||||
.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31
|
||||
smin \r\().4s, \r\().4s, v5.4s
|
||||
.endr
|
||||
.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31
|
||||
smax \r\().4s, \r\().4s, v4.4s
|
||||
.endr
|
||||
|
||||
mul_mls v7, v2, v3, v0.s[2], v0.s[3] // -> t18a
|
||||
mul_mla v6, v2, v3, v0.s[3], v0.s[2] // -> t29a
|
||||
mul_mls v2, v29, v24, v0.s[2], v0.s[3] // -> t19
|
||||
srshr v18.4s, v4.4s, #12 // t18a
|
||||
srshr v18.4s, v7.4s, #12 // t18a
|
||||
srshr v25.4s, v6.4s, #12 // t29a
|
||||
mul_mla v4, v29, v24, v0.s[3], v0.s[2] // -> t28
|
||||
mul_mla v7, v29, v24, v0.s[3], v0.s[2] // -> t28
|
||||
mul_mla v6, v26, v19, v0.s[3], v0.s[2] // -> t20
|
||||
srshr v29.4s, v2.4s, #12 // t19
|
||||
srshr v24.4s, v4.4s, #12 // t28
|
||||
srshr v24.4s, v7.4s, #12 // t28
|
||||
neg v6.4s, v6.4s // -> t20
|
||||
mul_mls v2, v26, v19, v0.s[2], v0.s[3] // -> t27
|
||||
mul_mla v4, v20, v28, v0.s[3], v0.s[2] // -> t21a
|
||||
mul_mla v7, v20, v28, v0.s[3], v0.s[2] // -> t21a
|
||||
srshr v26.4s, v6.4s, #12 // t20
|
||||
srshr v19.4s, v2.4s, #12 // t27
|
||||
neg v4.4s, v4.4s // -> t21a
|
||||
neg v7.4s, v7.4s // -> t21a
|
||||
mul_mls v6, v20, v28, v0.s[2], v0.s[3] // -> t26a
|
||||
srshr v20.4s, v4.4s, #12 // t21a
|
||||
srshr v20.4s, v7.4s, #12 // t21a
|
||||
srshr v28.4s, v6.4s, #12 // t26a
|
||||
|
||||
sqsub v2.4s, v16.4s, v30.4s // t23
|
||||
|
@ -2038,33 +2149,40 @@ function inv_dct32_odd_4s_x16_neon
|
|||
sqsub v21.4s, v27.4s, v22.4s // t25a
|
||||
sqsub v27.4s, v18.4s, v20.4s // t21
|
||||
sqadd v18.4s, v18.4s, v20.4s // t18 = out18
|
||||
sqadd v4.4s, v29.4s, v26.4s // t19a = out19
|
||||
sqadd v7.4s, v29.4s, v26.4s // t19a = out19
|
||||
sqsub v26.4s, v29.4s, v26.4s // t20a
|
||||
sqadd v29.4s, v25.4s, v28.4s // t29 = out29
|
||||
sqsub v25.4s, v25.4s, v28.4s // t26
|
||||
sqadd v28.4s, v24.4s, v19.4s // t28a = out28
|
||||
sqsub v24.4s, v24.4s, v19.4s // t27a
|
||||
mov v19.16b, v4.16b // out19
|
||||
mov v19.16b, v7.16b // out19
|
||||
|
||||
mul_mls v4, v24, v26, v0.s[0], v0.s[0] // -> t20
|
||||
.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24
|
||||
smin \r\().4s, \r\().4s, v5.4s
|
||||
.endr
|
||||
.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24
|
||||
smax \r\().4s, \r\().4s, v4.4s
|
||||
.endr
|
||||
|
||||
mul_mls v7, v24, v26, v0.s[0], v0.s[0] // -> t20
|
||||
mul_mla v6, v24, v26, v0.s[0], v0.s[0] // -> t27
|
||||
srshr v20.4s, v4.4s, #12 // t20
|
||||
srshr v20.4s, v7.4s, #12 // t20
|
||||
srshr v22.4s, v6.4s, #12 // t27
|
||||
|
||||
mul_mla v4, v25, v27, v0.s[0], v0.s[0] // -> t26a
|
||||
mul_mla v7, v25, v27, v0.s[0], v0.s[0] // -> t26a
|
||||
mul_mls v6, v25, v27, v0.s[0], v0.s[0] // -> t21a
|
||||
mov v27.16b, v22.16b // t27
|
||||
srshr v26.4s, v4.4s, #12 // t26a
|
||||
srshr v26.4s, v7.4s, #12 // t26a
|
||||
|
||||
mul_mls v24, v21, v23, v0.s[0], v0.s[0] // -> t22
|
||||
mul_mla v4, v21, v23, v0.s[0], v0.s[0] // -> t25
|
||||
mul_mla v7, v21, v23, v0.s[0], v0.s[0] // -> t25
|
||||
srshr v21.4s, v6.4s, #12 // t21a
|
||||
srshr v22.4s, v24.4s, #12 // t22
|
||||
srshr v25.4s, v4.4s, #12 // t25
|
||||
srshr v25.4s, v7.4s, #12 // t25
|
||||
|
||||
mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t23a
|
||||
mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t23a
|
||||
mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t24a
|
||||
srshr v23.4s, v4.4s, #12 // t23a
|
||||
srshr v23.4s, v7.4s, #12 // t23a
|
||||
srshr v24.4s, v6.4s, #12 // t24a
|
||||
|
||||
ret
|
||||
|
@ -2091,6 +2209,15 @@ function inv_txfm_horz\suffix\()_dct_32x4_neon
|
|||
scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
|
||||
.endif
|
||||
bl inv_dct_4s_x16_neon
|
||||
|
||||
// idct_16 leaves the row_clip_max/min constants in v5 and v4
|
||||
.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
|
||||
smin_4s \r, \r, v5
|
||||
.endr
|
||||
.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
|
||||
smax_4s \r, \r, v4
|
||||
.endr
|
||||
|
||||
transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5
|
||||
transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5
|
||||
transpose_4x4s v24, v25, v26, v27, v2, v3, v4, v5
|
||||
|
@ -2786,13 +2913,20 @@ function inv_dct64_step1_neon
|
|||
sqsub v30.4s, v23.4s, v22.4s // t62
|
||||
sqadd v31.4s, v23.4s, v22.4s // t63
|
||||
|
||||
.irp r, v24, v25, v26, v27, v28, v29, v30, v31
|
||||
smin_4s \r, \r, v5
|
||||
.endr
|
||||
.irp r, v24, v25, v26, v27, v28, v29, v30, v31
|
||||
smax_4s \r, \r, v4
|
||||
.endr
|
||||
|
||||
mul_mla v2, v29, v26, v0.s[0], v0.s[1] // -> t34a
|
||||
mul_mls v4, v29, v26, v0.s[1], v0.s[0] // -> t61a
|
||||
mul_mls v7, v29, v26, v0.s[1], v0.s[0] // -> t61a
|
||||
neg v2.4s, v2.4s // t34a
|
||||
mul_mls v6, v30, v25, v0.s[1], v0.s[0] // -> t33a
|
||||
srshr v26.4s, v2.4s, #12 // t34a
|
||||
mul_mla v2, v30, v25, v0.s[0], v0.s[1] // -> t62a
|
||||
srshr v29.4s, v4.4s, #12 // t61a
|
||||
srshr v29.4s, v7.4s, #12 // t61a
|
||||
srshr v25.4s, v6.4s, #12 // t33a
|
||||
srshr v30.4s, v2.4s, #12 // t62a
|
||||
|
||||
|
@ -2805,11 +2939,18 @@ function inv_dct64_step1_neon
|
|||
sqsub v21.4s, v30.4s, v29.4s // t61
|
||||
sqadd v22.4s, v30.4s, v29.4s // t62
|
||||
|
||||
.irp r, v16, v19, v17, v18, v20, v23, v21, v22
|
||||
smin_4s \r, \r, v5
|
||||
.endr
|
||||
.irp r, v16, v19, v17, v18, v20, v23, v21, v22
|
||||
smax_4s \r, \r, v4
|
||||
.endr
|
||||
|
||||
mul_mla v2, v21, v18, v0.s[2], v0.s[3] // -> t61a
|
||||
mul_mls v4, v21, v18, v0.s[3], v0.s[2] // -> t34a
|
||||
mul_mls v7, v21, v18, v0.s[3], v0.s[2] // -> t34a
|
||||
mul_mla v6, v20, v19, v0.s[2], v0.s[3] // -> t60
|
||||
srshr v21.4s, v2.4s, #12 // t61a
|
||||
srshr v18.4s, v4.4s, #12 // t34a
|
||||
srshr v18.4s, v7.4s, #12 // t34a
|
||||
mul_mls v2, v20, v19, v0.s[3], v0.s[2] // -> t35
|
||||
srshr v20.4s, v6.4s, #12 // t60
|
||||
srshr v19.4s, v2.4s, #12 // t35
|
||||
|
@ -2846,11 +2987,18 @@ function inv_dct64_step2_neon
|
|||
sqadd v30.4s, v23.4s, v22.4s // t48
|
||||
sqsub v31.4s, v23.4s, v22.4s // t55
|
||||
|
||||
.irp r, v24, v25, v26, v27, v28, v29, v30, v31
|
||||
smin_4s \r, \r, v5
|
||||
.endr
|
||||
.irp r, v24, v25, v26, v27, v28, v29, v30, v31
|
||||
smax_4s \r, \r, v4
|
||||
.endr
|
||||
|
||||
mul_mla v2, v27, v25, v0.s[3], v0.s[2] // -> t56a
|
||||
mul_mls v4, v27, v25, v0.s[2], v0.s[3] // -> t39a
|
||||
mul_mls v7, v27, v25, v0.s[2], v0.s[3] // -> t39a
|
||||
mul_mla v6, v31, v28, v0.s[3], v0.s[2] // -> t40a
|
||||
srshr v25.4s, v2.4s, #12 // t56a
|
||||
srshr v27.4s, v4.4s, #12 // t39a
|
||||
srshr v27.4s, v7.4s, #12 // t39a
|
||||
neg v6.4s, v6.4s // t40a
|
||||
mul_mls v2, v31, v28, v0.s[2], v0.s[3] // -> t55a
|
||||
srshr v31.4s, v6.4s, #12 // t40a
|
||||
|
@ -2865,11 +3013,18 @@ function inv_dct64_step2_neon
|
|||
sqsub v21.4s, v25.4s, v28.4s // t55
|
||||
sqadd v22.4s, v25.4s, v28.4s // t56
|
||||
|
||||
.irp r, v16, v19, v17, v18, v20, v23, v21, v22
|
||||
smin_4s \r, \r, v5
|
||||
.endr
|
||||
.irp r, v16, v19, v17, v18, v20, v23, v21, v22
|
||||
smax_4s \r, \r, v4
|
||||
.endr
|
||||
|
||||
mul_mls v2, v21, v18, v0.s[0], v0.s[0] // -> t40a
|
||||
mul_mla v4, v21, v18, v0.s[0], v0.s[0] // -> t55a
|
||||
mul_mla v7, v21, v18, v0.s[0], v0.s[0] // -> t55a
|
||||
mul_mls v6, v20, v19, v0.s[0], v0.s[0] // -> t47
|
||||
srshr v18.4s, v2.4s, #12 // t40a
|
||||
srshr v21.4s, v4.4s, #12 // t55a
|
||||
srshr v21.4s, v7.4s, #12 // t55a
|
||||
mul_mla v2, v20, v19, v0.s[0], v0.s[0] // -> t48
|
||||
srshr v19.4s, v6.4s, #12 // t47
|
||||
srshr v20.4s, v2.4s, #12 // t48
|
||||
|
@ -2966,6 +3121,14 @@ function inv_txfm_dct\suffix\()_4s_x64_neon
|
|||
|
||||
bl inv_dct_4s_x16_neon
|
||||
|
||||
// idct_16 leaves the row_clip_max/min constants in v5 and v4
|
||||
.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
|
||||
smin_4s \r, \r, v5
|
||||
.endr
|
||||
.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
|
||||
smax_4s \r, \r, v4
|
||||
.endr
|
||||
|
||||
store16 x6
|
||||
|
||||
movz16dup_if v0.2s, w16, #2896*8, \scale
|
||||
|
@ -2984,6 +3147,9 @@ function inv_txfm_dct\suffix\()_4s_x64_neon
|
|||
|
||||
mov x9, #-16
|
||||
|
||||
movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
|
||||
mvni v0.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
|
||||
|
||||
.macro store_addsub r0, r1, r2, r3
|
||||
ld1 {v2.4s}, [x6], #16
|
||||
ld1 {v3.4s}, [x6], #16
|
||||
|
@ -2992,16 +3158,32 @@ function inv_txfm_dct\suffix\()_4s_x64_neon
|
|||
ld1 {v4.4s}, [x6], #16
|
||||
sqadd v7.4s, v3.4s, \r1
|
||||
sqsub \r1, v3.4s, \r1
|
||||
smin v6.4s, v6.4s, v1.4s
|
||||
smin \r0, \r0, v1.4s
|
||||
ld1 {v5.4s}, [x6], #16
|
||||
sqadd v2.4s, v4.4s, \r2
|
||||
sub x6, x6, #16*4
|
||||
smax v6.4s, v6.4s, v0.4s
|
||||
smax \r0, \r0, v0.4s
|
||||
sqsub \r2, v4.4s, \r2
|
||||
smin v7.4s, v7.4s, v1.4s
|
||||
smin \r1, \r1, v1.4s
|
||||
st1 {v6.4s}, [x6], #16
|
||||
st1 {\r0}, [x10], x9
|
||||
smin v2.4s, v2.4s, v1.4s
|
||||
smin \r2, \r2, v1.4s
|
||||
smax v7.4s, v7.4s, v0.4s
|
||||
smax \r1, \r1, v0.4s
|
||||
sqadd v3.4s, v5.4s, \r3
|
||||
sqsub \r3, v5.4s, \r3
|
||||
smax v2.4s, v2.4s, v0.4s
|
||||
smax \r2, \r2, v0.4s
|
||||
smin v3.4s, v3.4s, v1.4s
|
||||
smin \r3, \r3, v1.4s
|
||||
st1 {v7.4s}, [x6], #16
|
||||
st1 {\r1}, [x10], x9
|
||||
smax v3.4s, v3.4s, v0.4s
|
||||
smax \r3, \r3, v0.4s
|
||||
st1 {v2.4s}, [x6], #16
|
||||
st1 {\r2}, [x10], x9
|
||||
st1 {v3.4s}, [x6], #16
|
||||
|
@ -3016,6 +3198,8 @@ function inv_txfm_dct\suffix\()_4s_x64_neon
|
|||
add x6, x6, #4*4*16
|
||||
|
||||
movrel x17, idct64_coeffs
|
||||
movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
|
||||
mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
|
||||
movz16dup_if v0.2s, w16, #2896*8, \scale
|
||||
movi_if v7.4s, #0, \clear
|
||||
add x9, x7, x8, lsl #4 // offset 16
|
||||
|
|
|
@@ -137,7 +137,7 @@ void BF(dav1d_fguv_32x32_##nm, neon)(pixel *const dst, \
static void \
fguv_32x32xn_##nm##_neon(pixel *const dst_row, const pixel *const src_row, \
const ptrdiff_t stride, const Dav1dFilmGrainData *const data, \
const int pw, const uint8_t scaling[SCALING_SIZE], \
const size_t pw, const uint8_t scaling[SCALING_SIZE], \
const entry grain_lut[][GRAIN_WIDTH], const int bh, \
const int row_num, const pixel *const luma_row, \
const ptrdiff_t luma_stride, const int uv, const int is_id \
@@ -156,7 +156,7 @@ fguv_32x32xn_##nm##_neon(pixel *const dst_row, const pixel *const src_row, \
int offsets[2 /* col offset */][2 /* row offset */]; \
\
/* process this row in BLOCK_SIZE^2 blocks (subsampled) */ \
for (int bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) { \
for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) { \
if (data->overlap_flag && bx) { \
/* shift previous offsets left */ \
for (int i = 0; i < rows; i++) \
@@ -43,8 +43,8 @@
#include "src/x86/cpu.h"
#endif

extern unsigned dav1d_cpu_flags;
extern unsigned dav1d_cpu_flags_mask;
EXTERN unsigned dav1d_cpu_flags;
EXTERN unsigned dav1d_cpu_flags_mask;

void dav1d_init_cpu(void);
DAV1D_API void dav1d_set_cpu_flags_mask(unsigned mask);
@ -2087,11 +2087,14 @@ static int decode_b(Dav1dTaskContext *const t,
|
|||
const uint8_t (*const lf_lvls)[8][2] = (const uint8_t (*)[8][2])
|
||||
&ts->lflvl[b->seg_id][0][b->ref[0] + 1][!is_globalmv];
|
||||
const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };
|
||||
enum RectTxfmSize ytx = b->max_ytx, uvtx = b->uvtx;
|
||||
if (f->frame_hdr->segmentation.lossless[b->seg_id]) {
|
||||
ytx = (enum RectTxfmSize) TX_4X4;
|
||||
uvtx = (enum RectTxfmSize) TX_4X4;
|
||||
}
|
||||
dav1d_create_lf_mask_inter(t->lf_mask, f->lf.level, f->b4_stride, lf_lvls,
|
||||
t->bx, t->by, f->w4, f->h4, b->skip, bs,
|
||||
f->frame_hdr->segmentation.lossless[b->seg_id] ?
|
||||
(enum RectTxfmSize) TX_4X4 : b->max_ytx,
|
||||
tx_split, b->uvtx, f->cur.p.layout,
|
||||
ytx, tx_split, uvtx, f->cur.p.layout,
|
||||
&t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
|
||||
has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
|
||||
has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
|
||||
|
@ -3456,11 +3459,12 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
|
|||
// wait until all threads have completed
|
||||
if (!res) {
|
||||
if (f->c->n_tc > 1) {
|
||||
pthread_mutex_lock(&f->task_thread.ttd->lock);
|
||||
res = dav1d_task_create_tile_sbrow(f, 0, 1);
|
||||
pthread_mutex_lock(&f->task_thread.ttd->lock);
|
||||
pthread_cond_signal(&f->task_thread.ttd->cond);
|
||||
if (!res) {
|
||||
while (!f->task_thread.done[0] ||
|
||||
f->task_thread.task_counter > 0)
|
||||
atomic_load(&f->task_thread.task_counter) > 0)
|
||||
{
|
||||
pthread_cond_wait(&f->task_thread.cond,
|
||||
&f->task_thread.ttd->lock);
|
||||
|
@ -3483,7 +3487,7 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
|
|||
|
||||
static int get_upscale_x0(const int in_w, const int out_w, const int step) {
|
||||
const int err = out_w * step - (in_w << 14);
|
||||
const int x0 = (-((out_w - in_w) << 13) + (out_w >> 1)) / out_w + 128 - (err >> 1);
|
||||
const int x0 = (-((out_w - in_w) << 13) + (out_w >> 1)) / out_w + 128 - (err / 2);
|
||||
return x0 & 0x3fff;
|
||||
}
|
||||
|
||||
|
@ -3505,10 +3509,13 @@ int dav1d_submit_frame(Dav1dContext *const c) {
|
|||
&c->task_thread.lock);
|
||||
out_delayed = &c->frame_thread.out_delayed[next];
|
||||
if (out_delayed->p.data[0] || atomic_load(&f->task_thread.error)) {
|
||||
if (atomic_load(&c->task_thread.first) + 1U < c->n_fc)
|
||||
unsigned first = atomic_load(&c->task_thread.first);
|
||||
if (first + 1U < c->n_fc)
|
||||
atomic_fetch_add(&c->task_thread.first, 1U);
|
||||
else
|
||||
atomic_store(&c->task_thread.first, 0);
|
||||
atomic_compare_exchange_strong(&c->task_thread.reset_task_cur,
|
||||
&first, UINT_MAX);
|
||||
if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
|
||||
c->task_thread.cur--;
|
||||
}
|
||||
|
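The same bookkeeping recurs in dav1d_submit_frame, drain_picture and dav1d_parse_obus: the oldest-frame index wraps around n_fc, and a reset_task_cur request that still points at the retired slot is cancelled with a compare-exchange. A minimal C11 sketch of the idiom, with hypothetical names (not dav1d's API):

#include <limits.h>
#include <stdatomic.h>

/* Advance a wrapping "oldest frame" index and cancel a queued reset
 * request that still names the slot being retired. */
static void retire_oldest(atomic_uint *first, atomic_uint *reset_task_cur,
                          unsigned n_frames)
{
    unsigned old = atomic_load(first);
    if (old + 1U < n_frames)
        atomic_fetch_add(first, 1U);   /* step to the next slot */
    else
        atomic_store(first, 0);        /* wrap around */
    /* succeeds (and cancels) only if the reset still targets the old slot */
    atomic_compare_exchange_strong(reset_task_cur, &old, UINT_MAX);
}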
@ -3720,7 +3727,8 @@ int dav1d_submit_frame(Dav1dContext *const c) {
|
|||
const int uses_2pass = c->n_fc > 1;
|
||||
const int cols = f->frame_hdr->tiling.cols;
|
||||
const int rows = f->frame_hdr->tiling.rows;
|
||||
f->task_thread.task_counter = (cols * rows + f->sbh) << uses_2pass;
|
||||
atomic_store(&f->task_thread.task_counter,
|
||||
(cols * rows + f->sbh) << uses_2pass);
|
||||
|
||||
// ref_mvs
|
||||
if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) {
|
||||
|
@ -3740,9 +3748,10 @@ int dav1d_submit_frame(Dav1dContext *const c) {
|
|||
if (f->frame_hdr->use_ref_frame_mvs) {
|
||||
for (int i = 0; i < 7; i++) {
|
||||
const int refidx = f->frame_hdr->refidx[i];
|
||||
const int ref_w = ((ref_coded_width[i] + 7) >> 3) << 1;
|
||||
const int ref_h = ((f->refp[i].p.p.h + 7) >> 3) << 1;
|
||||
if (c->refs[refidx].refmvs != NULL &&
|
||||
ref_coded_width[i] == f->cur.p.w &&
|
||||
f->refp[i].p.p.h == f->cur.p.h)
|
||||
ref_w == f->bw && ref_h == f->bh)
|
||||
{
|
||||
f->ref_mvs_ref[i] = c->refs[refidx].refmvs;
|
||||
dav1d_ref_inc(f->ref_mvs_ref[i]);
|
||||
|
|
|
@ -32,6 +32,6 @@
|
|||
|
||||
#include "src/levels.h"
|
||||
|
||||
extern const uint16_t dav1d_dq_tbl[3][QINDEX_RANGE][2];
|
||||
EXTERN const uint16_t dav1d_dq_tbl[3][QINDEX_RANGE][2];
|
||||
|
||||
#endif /* DAV1D_SRC_DEQUANT_TABLES_H */
|
||||
|
|
|
@ -51,6 +51,11 @@ static void generate_scaling(const int bitdepth,
|
|||
const int scaling_size = 1 << bitdepth;
|
||||
#endif
|
||||
|
||||
if (num == 0) {
|
||||
memset(scaling, 0, scaling_size);
|
||||
return;
|
||||
}
|
||||
|
||||
// Fill up the preceding entries with the initial value
|
||||
memset(scaling, points[0][1], points[0][0] << shift_x);
|
||||
|
||||
|
@ -113,7 +118,7 @@ void bitfn(dav1d_prep_grain)(const Dav1dFilmGrainDSPContext *const dsp,
|
|||
data, 1 HIGHBD_TAIL_SUFFIX);
|
||||
|
||||
// Generate scaling LUTs as needed
|
||||
if (data->num_y_points)
|
||||
if (data->num_y_points || data->chroma_scaling_from_luma)
|
||||
generate_scaling(in->p.bpc, data->y_points, data->num_y_points, scaling[0]);
|
||||
if (data->num_uv_points[0])
|
||||
generate_scaling(in->p.bpc, data->uv_points[0], data->num_uv_points[0], scaling[1]);
|
||||
|
|
|
@ -64,7 +64,7 @@ typedef decl_fgy_32x32xn_fn(*fgy_32x32xn_fn);
|
|||
|
||||
#define decl_fguv_32x32xn_fn(name) \
|
||||
void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \
|
||||
const Dav1dFilmGrainData *data, int pw, \
|
||||
const Dav1dFilmGrainData *data, size_t pw, \
|
||||
const uint8_t scaling[SCALING_SIZE], \
|
||||
const entry grain_lut[][GRAIN_WIDTH], int bh, int row_num, \
|
||||
const pixel *luma_row, ptrdiff_t luma_stride, \
|
||||
|
|
|
@ -278,7 +278,7 @@ static void fgy_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
|
|||
static NOINLINE void
|
||||
fguv_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
|
||||
const ptrdiff_t stride, const Dav1dFilmGrainData *const data,
|
||||
const int pw, const uint8_t scaling[SCALING_SIZE],
|
||||
const size_t pw, const uint8_t scaling[SCALING_SIZE],
|
||||
const entry grain_lut[][GRAIN_WIDTH], const int bh,
|
||||
const int row_num, const pixel *const luma_row,
|
||||
const ptrdiff_t luma_stride, const int uv, const int is_id,
|
||||
|
@ -311,8 +311,8 @@ fguv_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
|
|||
int offsets[2 /* col offset */][2 /* row offset */];
|
||||
|
||||
// process this row in BLOCK_SIZE^2 blocks (subsampled)
|
||||
for (int bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) {
|
||||
const int bw = imin(BLOCK_SIZE >> sx, pw - bx);
|
||||
for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) {
|
||||
const int bw = imin(BLOCK_SIZE >> sx, (int)(pw - bx));
|
||||
if (data->overlap_flag && bx) {
|
||||
// shift previous offsets left
|
||||
for (int i = 0; i < rows; i++)
|
||||
|
|
|
@ -275,7 +275,7 @@ struct Dav1dFrameContext {
|
|||
|
||||
struct {
|
||||
int next_tile_row[2 /* 0: reconstruction, 1: entropy */];
|
||||
int entropy_progress;
|
||||
atomic_int entropy_progress;
|
||||
atomic_int deblock_progress; // in sby units
|
||||
atomic_uint *frame_progress, *copy_lpf_progress;
|
||||
// indexed using t->by * f->b4_stride + t->bx
|
||||
|
@ -324,22 +324,28 @@ struct Dav1dFrameContext {
|
|||
} lf;
|
||||
|
||||
struct {
|
||||
pthread_mutex_t lock;
|
||||
pthread_cond_t cond;
|
||||
struct TaskThreadData *ttd;
|
||||
struct Dav1dTask *tasks, *tile_tasks[2], init_task;
|
||||
int num_tasks, num_tile_tasks;
|
||||
int init_done;
|
||||
int done[2];
|
||||
atomic_int init_done;
|
||||
atomic_int done[2];
|
||||
int retval;
|
||||
int update_set; // whether we need to update CDF reference
|
||||
atomic_int error;
|
||||
int task_counter;
|
||||
atomic_int task_counter;
|
||||
struct Dav1dTask *task_head, *task_tail;
|
||||
// Points to the task directly before the cur pointer in the queue.
|
||||
// This cur pointer is theoretical here, we actually keep track of the
|
||||
// "prev_t" variable. This is needed to not lose the tasks in
|
||||
// [head;cur-1] when picking one for execution.
|
||||
struct Dav1dTask *task_cur_prev;
|
||||
struct { // async task insertion
|
||||
atomic_int merge;
|
||||
pthread_mutex_t lock;
|
||||
Dav1dTask *head, *tail;
|
||||
} pending_tasks;
|
||||
} task_thread;
|
||||
|
||||
// threading (refer to tc[] for per-thread things)
|
||||
|
|
|
@ -235,8 +235,18 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
|
|||
}
|
||||
for (unsigned n = 0; n < c->n_fc; n++) {
|
||||
Dav1dFrameContext *const f = &c->fc[n];
|
||||
if (c->n_tc > 1)
|
||||
if (pthread_cond_init(&f->task_thread.cond, NULL)) goto error;
|
||||
if (c->n_tc > 1) {
|
||||
if (pthread_mutex_init(&f->task_thread.lock, NULL)) goto error;
|
||||
if (pthread_cond_init(&f->task_thread.cond, NULL)) {
|
||||
pthread_mutex_destroy(&f->task_thread.lock);
|
||||
goto error;
|
||||
}
|
||||
if (pthread_mutex_init(&f->task_thread.pending_tasks.lock, NULL)) {
|
||||
pthread_cond_destroy(&f->task_thread.cond);
|
||||
pthread_mutex_destroy(&f->task_thread.lock);
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
f->c = c;
|
||||
f->task_thread.ttd = &c->task_thread;
|
||||
f->lf.last_sharpness = -1;
|
||||
|
@ -335,7 +345,8 @@ static int has_grain(const Dav1dPicture *const pic)
|
|||
{
|
||||
const Dav1dFilmGrainData *fgdata = &pic->frame_hdr->film_grain.data;
|
||||
return fgdata->num_y_points || fgdata->num_uv_points[0] ||
|
||||
fgdata->num_uv_points[1];
|
||||
fgdata->num_uv_points[1] || (fgdata->clip_to_restricted_range &&
|
||||
fgdata->chroma_scaling_from_luma);
|
||||
}
|
||||
|
||||
static int output_image(Dav1dContext *const c, Dav1dPicture *const out)
|
||||
|
@ -392,10 +403,13 @@ static int drain_picture(Dav1dContext *const c, Dav1dPicture *const out) {
|
|||
Dav1dThreadPicture *const out_delayed =
|
||||
&c->frame_thread.out_delayed[next];
|
||||
if (out_delayed->p.data[0] || atomic_load(&f->task_thread.error)) {
|
||||
if (atomic_load(&c->task_thread.first) + 1U < c->n_fc)
|
||||
unsigned first = atomic_load(&c->task_thread.first);
|
||||
if (first + 1U < c->n_fc)
|
||||
atomic_fetch_add(&c->task_thread.first, 1U);
|
||||
else
|
||||
atomic_store(&c->task_thread.first, 0);
|
||||
atomic_compare_exchange_strong(&c->task_thread.reset_task_cur,
|
||||
&first, UINT_MAX);
|
||||
if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
|
||||
c->task_thread.cur--;
|
||||
}
|
||||
|
@ -591,6 +605,9 @@ void dav1d_flush(Dav1dContext *const c) {
|
|||
c->fc[i].task_thread.task_head = NULL;
|
||||
c->fc[i].task_thread.task_tail = NULL;
|
||||
c->fc[i].task_thread.task_cur_prev = NULL;
|
||||
c->fc[i].task_thread.pending_tasks.head = NULL;
|
||||
c->fc[i].task_thread.pending_tasks.tail = NULL;
|
||||
atomic_init(&c->fc[i].task_thread.pending_tasks.merge, 0);
|
||||
}
|
||||
atomic_init(&c->task_thread.first, 0);
|
||||
c->task_thread.cur = c->n_fc;
|
||||
|
@ -664,7 +681,9 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
|
|||
freep(&f->frame_thread.cbi);
|
||||
}
|
||||
if (c->n_tc > 1) {
|
||||
pthread_mutex_destroy(&f->task_thread.pending_tasks.lock);
|
||||
pthread_cond_destroy(&f->task_thread.cond);
|
||||
pthread_mutex_destroy(&f->task_thread.lock);
|
||||
}
|
||||
freep(&f->frame_thread.frame_progress);
|
||||
freep(&f->task_thread.tasks);
|
||||
|
|
|
@ -174,6 +174,7 @@ if is_asm_enabled
|
|||
'x86/cpuid.asm',
|
||||
'x86/msac.asm',
|
||||
'x86/refmvs.asm',
|
||||
'x86/itx_avx512.asm',
|
||||
'x86/cdef_avx2.asm',
|
||||
'x86/itx_avx2.asm',
|
||||
'x86/looprestoration_avx2.asm',
|
||||
|
@ -186,7 +187,6 @@ if is_asm_enabled
|
|||
'x86/cdef_avx512.asm',
|
||||
'x86/filmgrain_avx512.asm',
|
||||
'x86/ipred_avx512.asm',
|
||||
'x86/itx_avx512.asm',
|
||||
'x86/loopfilter_avx512.asm',
|
||||
'x86/looprestoration_avx512.asm',
|
||||
'x86/mc_avx512.asm',
|
||||
|
@ -207,6 +207,7 @@ if is_asm_enabled
|
|||
'x86/cdef16_avx512.asm',
|
||||
'x86/filmgrain16_avx512.asm',
|
||||
'x86/ipred16_avx512.asm',
|
||||
'x86/itx16_avx512.asm',
|
||||
'x86/loopfilter16_avx512.asm',
|
||||
'x86/looprestoration16_avx512.asm',
|
||||
'x86/mc16_avx512.asm',
|
||||
|
|
|
@ -1509,7 +1509,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
|
|||
|
||||
if (payload_size <= 0) {
|
||||
dav1d_log(c, "Malformed ITU-T T.35 metadata message format\n");
|
||||
goto error;
|
||||
break;
|
||||
}
|
||||
|
||||
Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dITUTT35) + payload_size * sizeof(uint8_t));
|
||||
|
@ -1581,10 +1581,13 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
|
|||
Dav1dThreadPicture *const out_delayed =
|
||||
&c->frame_thread.out_delayed[next];
|
||||
if (out_delayed->p.data[0] || atomic_load(&f->task_thread.error)) {
|
||||
if (atomic_load(&c->task_thread.first) + 1U < c->n_fc)
|
||||
unsigned first = atomic_load(&c->task_thread.first);
|
||||
if (first + 1U < c->n_fc)
|
||||
atomic_fetch_add(&c->task_thread.first, 1U);
|
||||
else
|
||||
atomic_store(&c->task_thread.first, 0);
|
||||
atomic_compare_exchange_strong(&c->task_thread.reset_task_cur,
|
||||
&first, UINT_MAX);
|
||||
if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
|
||||
c->task_thread.cur--;
|
||||
}
|
||||
|
|
|
@ -30,7 +30,7 @@
|
|||
|
||||
#include "src/levels.h"
|
||||
|
||||
extern const uint8_t *dav1d_qm_tbl[16][2][N_RECT_TX_SIZES];
|
||||
EXTERN const uint8_t *dav1d_qm_tbl[16][2][N_RECT_TX_SIZES];
|
||||
|
||||
void dav1d_init_qm_tables(void);
|
||||
|
||||
|
|
|
@ -591,7 +591,7 @@ static int decode_coefs(Dav1dTaskContext *const t,
|
|||
const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
|
||||
const uint8_t *const qm_tbl = *txtp < IDTX ? f->qm[tx][plane] : NULL;
|
||||
const int dq_shift = imax(0, t_dim->ctx - 2);
|
||||
const unsigned cf_max = ~(~127U << (BITDEPTH == 8 ? 8 : f->cur.p.bpc));
|
||||
const int cf_max = ~(~127U << (BITDEPTH == 8 ? 8 : f->cur.p.bpc));
|
||||
unsigned cul_level, dc_sign_level;
|
||||
|
||||
if (!dc_tok) {
|
||||
|
@ -608,7 +608,7 @@ static int decode_coefs(Dav1dTaskContext *const t,
|
|||
printf("Post-dc_sign[%d][%d][%d]: r=%d\n",
|
||||
chroma, dc_sign_ctx, dc_sign, ts->msac.rng);
|
||||
|
||||
unsigned dc_dq = dq_tbl[0];
|
||||
int dc_dq = dq_tbl[0];
|
||||
dc_sign_level = (dc_sign - 1) & (2 << 6);
|
||||
|
||||
if (qm_tbl) {
|
||||
|
@ -628,7 +628,8 @@ static int decode_coefs(Dav1dTaskContext *const t,
|
|||
}
|
||||
cul_level = dc_tok;
|
||||
dc_dq >>= dq_shift;
|
||||
cf[0] = (coef) (umin(dc_dq - dc_sign, cf_max) ^ -dc_sign);
|
||||
dc_dq = umin(dc_dq, cf_max + dc_sign);
|
||||
cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq);
|
||||
|
||||
if (rc) ac_qm: {
|
||||
const unsigned ac_dq = dq_tbl[1];
|
||||
|
@ -638,6 +639,7 @@ static int decode_coefs(Dav1dTaskContext *const t,
|
|||
printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
|
||||
const unsigned rc_tok = cf[rc];
|
||||
unsigned tok, dq = (ac_dq * qm_tbl[rc] + 16) >> 5;
|
||||
int dq_sat;
|
||||
|
||||
if (rc_tok >= (15 << 11)) {
|
||||
tok = read_golomb(&ts->msac) + 15;
|
||||
|
@ -654,7 +656,8 @@ static int decode_coefs(Dav1dTaskContext *const t,
|
|||
}
|
||||
cul_level += tok;
|
||||
dq >>= dq_shift;
|
||||
cf[rc] = (coef) (umin(dq - sign, cf_max) ^ -sign);
|
||||
dq_sat = umin(dq, cf_max + sign);
|
||||
cf[rc] = (coef) (sign ? -dq_sat : dq_sat);
|
||||
|
||||
rc = rc_tok & 0x3ff;
|
||||
} while (rc);
|
||||
|
@ -669,13 +672,13 @@ static int decode_coefs(Dav1dTaskContext *const t,
|
|||
|
||||
dc_tok &= 0xfffff;
|
||||
dc_dq = ((dc_dq * dc_tok) & 0xffffff) >> dq_shift;
|
||||
dc_dq = umin(dc_dq - dc_sign, cf_max);
|
||||
dc_dq = umin(dc_dq, cf_max + dc_sign);
|
||||
} else {
|
||||
dc_dq = ((dc_dq * dc_tok) >> dq_shift) - dc_sign;
|
||||
dc_dq = ((dc_dq * dc_tok) >> dq_shift);
|
||||
assert(dc_dq <= cf_max);
|
||||
}
|
||||
cul_level = dc_tok;
|
||||
cf[0] = (coef) (dc_dq ^ -dc_sign);
|
||||
cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq);
|
||||
|
||||
if (rc) ac_noqm: {
|
||||
const unsigned ac_dq = dq_tbl[1];
|
||||
|
@ -684,7 +687,8 @@ static int decode_coefs(Dav1dTaskContext *const t,
|
|||
if (dbg)
|
||||
printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
|
||||
const unsigned rc_tok = cf[rc];
|
||||
unsigned tok, dq;
|
||||
unsigned tok;
|
||||
int dq;
|
||||
|
||||
// residual
|
||||
if (rc_tok >= (15 << 11)) {
|
||||
|
@ -698,15 +702,15 @@ static int decode_coefs(Dav1dTaskContext *const t,
|
|||
|
||||
// dequant, see 7.12.3
|
||||
dq = ((ac_dq * tok) & 0xffffff) >> dq_shift;
|
||||
dq = umin(dq - sign, cf_max);
|
||||
dq = umin(dq, cf_max + sign);
|
||||
} else {
|
||||
// cannot exceed cf_max, so we can avoid the clipping
|
||||
tok = rc_tok >> 11;
|
||||
dq = ((ac_dq * tok) >> dq_shift) - sign;
|
||||
dq = ((ac_dq * tok) >> dq_shift);
|
||||
assert(dq <= cf_max);
|
||||
}
|
||||
cul_level += tok;
|
||||
cf[rc] = (coef) (dq ^ -sign);
|
||||
cf[rc] = (coef) (sign ? -dq : dq);
|
||||
|
||||
rc = rc_tok & 0x3ff; // next non-zero rc, zero if eob
|
||||
} while (rc);
|
||||
|
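The dequant epilogue above now clamps the magnitude first, umin(dq, cf_max + sign), and applies the sign with a conditional negate instead of the earlier subtract-and-xor folding; for sign in {0,1} and a non-zero magnitude both forms yield the same clipped coefficient. A small equivalence check, plain C and purely illustrative (it mirrors the old code's unsigned-to-signed cast, so it assumes the usual two's-complement conversion):

#include <assert.h>

static unsigned umin(const unsigned a, const unsigned b) { return a < b ? a : b; }

int main(void) {
    const unsigned cf_max = 0x7fff;                 /* 8 bpc coefficient range */
    for (unsigned dq = 1; dq <= (1u << 20); dq++)
        for (unsigned sign = 0; sign < 2; sign++) {
            const int old_cf = (int)(umin(dq - sign, cf_max) ^ -sign);
            const int sat    = (int)umin(dq, cf_max + sign);
            const int new_cf = sign ? -sat : sat;
            assert(old_cf == new_cf);
        }
    return 0;
}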
@ -1092,9 +1096,10 @@ static int obmc(Dav1dTaskContext *const t,
|
|||
// only odd blocks are considered for overlap handling, hence +1
|
||||
const refmvs_block *const a_r = &r[-1][t->bx + x + 1];
|
||||
const uint8_t *const a_b_dim = dav1d_block_dimensions[a_r->bs];
|
||||
const int step4 = iclip(a_b_dim[0], 2, 16);
|
||||
|
||||
if (a_r->ref.ref[0] > 0) {
|
||||
const int ow4 = iclip(a_b_dim[0], 2, b_dim[0]);
|
||||
const int ow4 = imin(step4, b_dim[0]);
|
||||
const int oh4 = imin(b_dim[1], 16) >> 1;
|
||||
res = mc(t, lap, NULL, ow4 * h_mul * sizeof(pixel), ow4, (oh4 * 3 + 3) >> 2,
|
||||
t->bx + x, t->by, pl, a_r->mv.mv[0],
|
||||
|
@ -1105,7 +1110,7 @@ static int obmc(Dav1dTaskContext *const t,
|
|||
h_mul * ow4, v_mul * oh4);
|
||||
i++;
|
||||
}
|
||||
x += imax(a_b_dim[0], 2);
|
||||
x += step4;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1114,10 +1119,11 @@ static int obmc(Dav1dTaskContext *const t,
|
|||
// only odd blocks are considered for overlap handling, hence +1
|
||||
const refmvs_block *const l_r = &r[y + 1][t->bx - 1];
|
||||
const uint8_t *const l_b_dim = dav1d_block_dimensions[l_r->bs];
|
||||
const int step4 = iclip(l_b_dim[1], 2, 16);
|
||||
|
||||
if (l_r->ref.ref[0] > 0) {
|
||||
const int ow4 = imin(b_dim[0], 16) >> 1;
|
||||
const int oh4 = iclip(l_b_dim[1], 2, b_dim[1]);
|
||||
const int oh4 = imin(step4, b_dim[1]);
|
||||
res = mc(t, lap, NULL, h_mul * ow4 * sizeof(pixel), ow4, oh4,
|
||||
t->bx, t->by + y, pl, l_r->mv.mv[0],
|
||||
&f->refp[l_r->ref.ref[0] - 1], l_r->ref.ref[0] - 1,
|
||||
|
@ -1127,7 +1133,7 @@ static int obmc(Dav1dTaskContext *const t,
|
|||
dst_stride, lap, h_mul * ow4, v_mul * oh4);
|
||||
i++;
|
||||
}
|
||||
y += imax(l_b_dim[1], 2);
|
||||
y += step4;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -32,6 +32,6 @@
|
|||
|
||||
#include "src/levels.h"
|
||||
|
||||
extern const uint16_t *const dav1d_scans[N_RECT_TX_SIZES];
|
||||
EXTERN const uint16_t *const dav1d_scans[N_RECT_TX_SIZES];
|
||||
|
||||
#endif /* DAV1D_SRC_SCAN_H */
|
||||
|
|
|
@ -34,38 +34,38 @@
|
|||
|
||||
#include "src/levels.h"
|
||||
|
||||
extern const uint8_t dav1d_al_part_ctx[2][N_BL_LEVELS][N_PARTITIONS];
|
||||
extern const uint8_t /* enum BlockSize */
|
||||
EXTERN const uint8_t dav1d_al_part_ctx[2][N_BL_LEVELS][N_PARTITIONS];
|
||||
EXTERN const uint8_t /* enum BlockSize */
|
||||
dav1d_block_sizes[N_BL_LEVELS][N_PARTITIONS][2];
|
||||
// width, height (in 4px blocks), log2 versions of these two
|
||||
extern const uint8_t dav1d_block_dimensions[N_BS_SIZES][4];
|
||||
EXTERN const uint8_t dav1d_block_dimensions[N_BS_SIZES][4];
|
||||
typedef struct TxfmInfo {
|
||||
// width, height (in 4px blocks), log2 of them, min/max of log2, sub, pad
|
||||
uint8_t w, h, lw, lh, min, max, sub, ctx;
|
||||
} TxfmInfo;
|
||||
extern const TxfmInfo dav1d_txfm_dimensions[N_RECT_TX_SIZES];
|
||||
extern const uint8_t /* enum (Rect)TxfmSize */
|
||||
EXTERN const TxfmInfo dav1d_txfm_dimensions[N_RECT_TX_SIZES];
|
||||
EXTERN const uint8_t /* enum (Rect)TxfmSize */
|
||||
dav1d_max_txfm_size_for_bs[N_BS_SIZES][4 /* y, 420, 422, 444 */];
|
||||
extern const uint8_t /* enum TxfmType */
|
||||
EXTERN const uint8_t /* enum TxfmType */
|
||||
dav1d_txtp_from_uvmode[N_UV_INTRA_PRED_MODES];
|
||||
|
||||
extern const uint8_t /* enum InterPredMode */
|
||||
EXTERN const uint8_t /* enum InterPredMode */
|
||||
dav1d_comp_inter_pred_modes[N_COMP_INTER_PRED_MODES][2];
|
||||
|
||||
extern const uint8_t dav1d_partition_type_count[N_BL_LEVELS];
|
||||
extern const uint8_t /* enum TxfmType */ dav1d_tx_types_per_set[40];
|
||||
EXTERN const uint8_t dav1d_partition_type_count[N_BL_LEVELS];
|
||||
EXTERN const uint8_t /* enum TxfmType */ dav1d_tx_types_per_set[40];
|
||||
|
||||
extern const uint8_t dav1d_filter_mode_to_y_mode[5];
|
||||
extern const uint8_t dav1d_ymode_size_context[N_BS_SIZES];
|
||||
extern const uint8_t dav1d_lo_ctx_offsets[3][5][5];
|
||||
extern const uint8_t dav1d_skip_ctx[5][5];
|
||||
extern const uint8_t /* enum TxClass */
|
||||
EXTERN const uint8_t dav1d_filter_mode_to_y_mode[5];
|
||||
EXTERN const uint8_t dav1d_ymode_size_context[N_BS_SIZES];
|
||||
EXTERN const uint8_t dav1d_lo_ctx_offsets[3][5][5];
|
||||
EXTERN const uint8_t dav1d_skip_ctx[5][5];
|
||||
EXTERN const uint8_t /* enum TxClass */
|
||||
dav1d_tx_type_class[N_TX_TYPES_PLUS_LL];
|
||||
extern const uint8_t /* enum Filter2d */
|
||||
EXTERN const uint8_t /* enum Filter2d */
|
||||
dav1d_filter_2d[DAV1D_N_FILTERS /* h */][DAV1D_N_FILTERS /* v */];
|
||||
extern const uint8_t /* enum Dav1dFilterMode */ dav1d_filter_dir[N_2D_FILTERS][2];
|
||||
extern const uint8_t dav1d_intra_mode_context[N_INTRA_PRED_MODES];
|
||||
extern const uint8_t dav1d_wedge_ctx_lut[N_BS_SIZES];
|
||||
EXTERN const uint8_t /* enum Dav1dFilterMode */ dav1d_filter_dir[N_2D_FILTERS][2];
|
||||
EXTERN const uint8_t dav1d_intra_mode_context[N_INTRA_PRED_MODES];
|
||||
EXTERN const uint8_t dav1d_wedge_ctx_lut[N_BS_SIZES];
|
||||
|
||||
static const unsigned cfl_allowed_mask =
|
||||
(1 << BS_32x32) |
|
||||
|
@ -103,23 +103,23 @@ static const unsigned interintra_allowed_mask =
|
|||
(1 << BS_8x16) |
|
||||
(1 << BS_8x8);
|
||||
|
||||
extern const Dav1dWarpedMotionParams dav1d_default_wm_params;
|
||||
EXTERN const Dav1dWarpedMotionParams dav1d_default_wm_params;
|
||||
|
||||
extern const int8_t dav1d_cdef_directions[12][2];
|
||||
EXTERN const int8_t dav1d_cdef_directions[12][2];
|
||||
|
||||
extern const uint16_t dav1d_sgr_params[16][2];
|
||||
extern const uint8_t dav1d_sgr_x_by_x[256];
|
||||
EXTERN const uint16_t dav1d_sgr_params[16][2];
|
||||
EXTERN const uint8_t dav1d_sgr_x_by_x[256];
|
||||
|
||||
extern const int8_t dav1d_mc_subpel_filters[6][15][8];
|
||||
extern const int8_t dav1d_mc_warp_filter[193][8];
|
||||
extern const int8_t dav1d_resize_filter[64][8];
|
||||
EXTERN const int8_t dav1d_mc_subpel_filters[6][15][8];
|
||||
EXTERN const int8_t dav1d_mc_warp_filter[193][8];
|
||||
EXTERN const int8_t dav1d_resize_filter[64][8];
|
||||
|
||||
extern const uint8_t dav1d_sm_weights[128];
|
||||
extern const uint16_t dav1d_dr_intra_derivative[44];
|
||||
extern const int8_t dav1d_filter_intra_taps[5][64];
|
||||
EXTERN const uint8_t dav1d_sm_weights[128];
|
||||
EXTERN const uint16_t dav1d_dr_intra_derivative[44];
|
||||
EXTERN const int8_t dav1d_filter_intra_taps[5][64];
|
||||
|
||||
extern const uint8_t dav1d_obmc_masks[64];
|
||||
EXTERN const uint8_t dav1d_obmc_masks[64];
|
||||
|
||||
extern const int16_t dav1d_gaussian_sequence[2048]; // for fgs
|
||||
EXTERN const int16_t dav1d_gaussian_sequence[2048]; // for fgs
|
||||
|
||||
#endif /* DAV1D_SRC_TABLES_H */
|
||||
|
|
|
@ -49,9 +49,13 @@ static inline int reset_task_cur(const Dav1dContext *const c,
|
|||
unsigned frame_idx)
|
||||
{
|
||||
const unsigned first = atomic_load(&ttd->first);
|
||||
unsigned reset_frame_idx = atomic_exchange(&ttd->reset_task_cur, UINT_MAX);
|
||||
if (reset_frame_idx < first) {
|
||||
if (frame_idx == UINT_MAX) return 0;
|
||||
reset_frame_idx = UINT_MAX;
|
||||
}
|
||||
if (!ttd->cur && c->fc[first].task_thread.task_cur_prev == NULL)
|
||||
return 0;
|
||||
unsigned reset_frame_idx = atomic_exchange(&ttd->reset_task_cur, UINT_MAX);
|
||||
if (reset_frame_idx != UINT_MAX) {
|
||||
if (frame_idx == UINT_MAX) {
|
||||
if (reset_frame_idx > first + ttd->cur)
|
||||
|
@ -78,12 +82,17 @@ cur_found:
|
|||
static inline void reset_task_cur_async(struct TaskThreadData *const ttd,
|
||||
unsigned frame_idx, unsigned n_frames)
|
||||
{
|
||||
if (frame_idx < (unsigned)atomic_load(&ttd->first)) frame_idx += n_frames;
|
||||
const unsigned first = atomic_load(&ttd->first);
|
||||
if (frame_idx < first) frame_idx += n_frames;
|
||||
unsigned last_idx = frame_idx;
|
||||
do {
|
||||
frame_idx = last_idx;
|
||||
last_idx = atomic_exchange(&ttd->reset_task_cur, frame_idx);
|
||||
} while (last_idx < frame_idx);
|
||||
if (frame_idx == first && atomic_load(&ttd->first) != first) {
|
||||
unsigned expected = frame_idx;
|
||||
atomic_compare_exchange_strong(&ttd->reset_task_cur, &expected, UINT_MAX);
|
||||
}
|
||||
}
|
||||
|
||||
static void insert_tasks_between(Dav1dFrameContext *const f,
|
||||
|
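reset_task_cur_async above publishes the smallest requested reset index without taking the task lock: it swaps its candidate into reset_task_cur, and if the displaced value was smaller it swaps that one back in, repeating until nothing smaller gets displaced. A sketch of that exchange-loop idea in C11 atomics (illustrative only, not the dav1d function itself):

#include <stdatomic.h>

/* Lock-free "store the minimum": any smaller value this call displaces
 * is immediately re-published, so it is not lost. */
static void atomic_store_min(atomic_uint *slot, unsigned val)
{
    unsigned prev = atomic_exchange(slot, val);
    while (prev < val) {    /* displaced something smaller: put it back */
        val = prev;
        prev = atomic_exchange(slot, val);
    }
}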
@ -164,6 +173,43 @@ static inline void insert_task(Dav1dFrameContext *const f,
|
|||
insert_tasks(f, t, t, cond_signal);
|
||||
}
|
||||
|
||||
static inline void add_pending(Dav1dFrameContext *const f, Dav1dTask *const t) {
|
||||
pthread_mutex_lock(&f->task_thread.pending_tasks.lock);
|
||||
t->next = NULL;
|
||||
if (!f->task_thread.pending_tasks.head)
|
||||
f->task_thread.pending_tasks.head = t;
|
||||
else
|
||||
f->task_thread.pending_tasks.tail->next = t;
|
||||
f->task_thread.pending_tasks.tail = t;
|
||||
atomic_store(&f->task_thread.pending_tasks.merge, 1);
|
||||
pthread_mutex_unlock(&f->task_thread.pending_tasks.lock);
|
||||
}
|
||||
|
||||
static inline int merge_pending_frame(Dav1dFrameContext *const f) {
|
||||
int const merge = atomic_load(&f->task_thread.pending_tasks.merge);
|
||||
if (merge) {
|
||||
pthread_mutex_lock(&f->task_thread.pending_tasks.lock);
|
||||
Dav1dTask *t = f->task_thread.pending_tasks.head;
|
||||
f->task_thread.pending_tasks.head = NULL;
|
||||
f->task_thread.pending_tasks.tail = NULL;
|
||||
atomic_store(&f->task_thread.pending_tasks.merge, 0);
|
||||
pthread_mutex_unlock(&f->task_thread.pending_tasks.lock);
|
||||
while (t) {
|
||||
Dav1dTask *const tmp = t->next;
|
||||
insert_task(f, t, 0);
|
||||
t = tmp;
|
||||
}
|
||||
}
|
||||
return merge;
|
||||
}
|
||||
|
||||
static inline int merge_pending(const Dav1dContext *const c) {
|
||||
int res = 0;
|
||||
for (unsigned i = 0; i < c->n_fc; i++)
|
||||
res |= merge_pending_frame(&c->fc[i]);
|
||||
return res;
|
||||
}
|
||||
|
||||
static int create_filter_sbrow(Dav1dFrameContext *const f,
|
||||
const int pass, Dav1dTask **res_t)
|
||||
{
|
||||
|
@ -192,13 +238,14 @@ static int create_filter_sbrow(Dav1dFrameContext *const f,
|
|||
const int prog_sz = ((f->sbh + 31) & ~31) >> 5;
|
||||
if (prog_sz > f->frame_thread.prog_sz) {
|
||||
atomic_uint *const prog = realloc(f->frame_thread.frame_progress,
|
||||
prog_sz * 2 * sizeof(*prog));
|
||||
2 * prog_sz * sizeof(*prog));
|
||||
if (!prog) return -1;
|
||||
f->frame_thread.frame_progress = prog;
|
||||
f->frame_thread.copy_lpf_progress = prog + prog_sz;
|
||||
f->frame_thread.prog_sz = prog_sz;
|
||||
}
|
||||
memset(f->frame_thread.frame_progress, 0, prog_sz * 2 * sizeof(atomic_uint));
|
||||
f->frame_thread.prog_sz = prog_sz;
|
||||
memset(f->frame_thread.frame_progress, 0, prog_sz * sizeof(atomic_uint));
|
||||
memset(f->frame_thread.copy_lpf_progress, 0, prog_sz * sizeof(atomic_uint));
|
||||
atomic_store(&f->frame_thread.deblock_progress, 0);
|
||||
}
|
||||
f->frame_thread.next_tile_row[pass & 1] = 0;
|
||||
|
@ -224,16 +271,18 @@ int dav1d_task_create_tile_sbrow(Dav1dFrameContext *const f, const int pass,
|
|||
Dav1dTask *tasks = f->task_thread.tile_tasks[0];
|
||||
const int uses_2pass = f->c->n_fc > 1;
|
||||
const int num_tasks = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;
|
||||
int alloc_num_tasks = num_tasks * (1 + uses_2pass);
|
||||
if (alloc_num_tasks > f->task_thread.num_tile_tasks) {
|
||||
const size_t size = sizeof(Dav1dTask) * alloc_num_tasks;
|
||||
tasks = realloc(f->task_thread.tile_tasks[0], size);
|
||||
if (!tasks) return -1;
|
||||
memset(tasks, 0, size);
|
||||
f->task_thread.tile_tasks[0] = tasks;
|
||||
f->task_thread.num_tile_tasks = alloc_num_tasks;
|
||||
if (pass < 2) {
|
||||
int alloc_num_tasks = num_tasks * (1 + uses_2pass);
|
||||
if (alloc_num_tasks > f->task_thread.num_tile_tasks) {
|
||||
const size_t size = sizeof(Dav1dTask) * alloc_num_tasks;
|
||||
tasks = realloc(f->task_thread.tile_tasks[0], size);
|
||||
if (!tasks) return -1;
|
||||
memset(tasks, 0, size);
|
||||
f->task_thread.tile_tasks[0] = tasks;
|
||||
f->task_thread.num_tile_tasks = alloc_num_tasks;
|
||||
}
|
||||
f->task_thread.tile_tasks[1] = tasks + num_tasks;
|
||||
}
|
||||
f->task_thread.tile_tasks[1] = tasks + num_tasks;
|
||||
tasks += num_tasks * (pass & 1);
|
||||
|
||||
Dav1dTask *pf_t;
|
||||
|
@ -263,8 +312,22 @@ int dav1d_task_create_tile_sbrow(Dav1dFrameContext *const f, const int pass,
|
|||
prev_t->next = pf_t;
|
||||
prev_t = pf_t;
|
||||
}
|
||||
insert_tasks(f, &tasks[0], prev_t, cond_signal);
|
||||
f->task_thread.done[pass & 1] = 0;
|
||||
prev_t->next = NULL;
|
||||
|
||||
atomic_store(&f->task_thread.done[pass & 1], 0);
|
||||
|
||||
// XXX in theory this could be done locklessly, at this point there are no
|
||||
// tasks in the frameQ, so no other runner should be using this lock, but
|
||||
// we must add both passes at once
|
||||
pthread_mutex_lock(&f->task_thread.pending_tasks.lock);
|
||||
assert(f->task_thread.pending_tasks.head == NULL || pass == 2);
|
||||
if (!f->task_thread.pending_tasks.head)
|
||||
f->task_thread.pending_tasks.head = &tasks[0];
|
||||
else
|
||||
f->task_thread.pending_tasks.tail->next = &tasks[0];
|
||||
f->task_thread.pending_tasks.tail = prev_t;
|
||||
atomic_store(&f->task_thread.pending_tasks.merge, 1);
|
||||
pthread_mutex_unlock(&f->task_thread.pending_tasks.lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -272,7 +335,7 @@ int dav1d_task_create_tile_sbrow(Dav1dFrameContext *const f, const int pass,
|
|||
void dav1d_task_frame_init(Dav1dFrameContext *const f) {
|
||||
const Dav1dContext *const c = f->c;
|
||||
|
||||
f->task_thread.init_done = 0;
|
||||
atomic_store(&f->task_thread.init_done, 0);
|
||||
// schedule init task, which will schedule the remaining tasks
|
||||
Dav1dTask *const t = &f->task_thread.init_task;
|
||||
t->type = DAV1D_TASK_TYPE_INIT;
|
||||
|
@ -307,16 +370,12 @@ static inline int ensure_progress(struct TaskThreadData *const ttd,
|
|||
// so ensure that completed. if not, re-add to task-queue; else, fall-through
|
||||
int p1 = atomic_load(state);
|
||||
if (p1 < t->sby) {
|
||||
t->type = type;
|
||||
t->recon_progress = t->deblock_progress = 0;
|
||||
*target = t->sby;
|
||||
add_pending(f, t);
|
||||
pthread_mutex_lock(&ttd->lock);
|
||||
p1 = atomic_load(state);
|
||||
if (p1 < t->sby) {
|
||||
t->type = type;
|
||||
t->recon_progress = t->deblock_progress = 0;
|
||||
*target = t->sby;
|
||||
insert_task(f, t, 0);
|
||||
return 1;
|
||||
}
|
||||
pthread_mutex_unlock(&ttd->lock);
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
@ -369,11 +428,29 @@ static inline int check_tile(Dav1dTask *const t, Dav1dFrameContext *const f,
|
|||
return 0;
|
||||
}
|
||||
|
||||
static inline int get_frame_progress(const Dav1dContext *const c,
|
||||
const Dav1dFrameContext *const f)
|
||||
{
|
||||
unsigned frame_prog = c->n_fc > 1 ? atomic_load(&f->sr_cur.progress[1]) : 0;
|
||||
if (frame_prog >= FRAME_ERROR)
|
||||
return f->sbh - 1;
|
||||
int idx = frame_prog >> (f->sb_shift + 7);
|
||||
int prog;
|
||||
do {
|
||||
atomic_uint *state = &f->frame_thread.frame_progress[idx];
|
||||
const unsigned val = ~atomic_load(state);
|
||||
prog = val ? ctz(val) : 32;
|
||||
if (prog != 32) break;
|
||||
prog = 0;
|
||||
} while (++idx < f->frame_thread.prog_sz);
|
||||
return ((idx << 5) | prog) - 1;
|
||||
}
|
||||
|
||||
static inline void abort_frame(Dav1dFrameContext *const f, const int error) {
|
||||
atomic_store(&f->task_thread.error, error == DAV1D_ERR(EINVAL) ? 1 : -1);
|
||||
f->task_thread.task_counter = 0;
|
||||
f->task_thread.done[0] = 1;
|
||||
f->task_thread.done[1] = 1;
|
||||
atomic_store(&f->task_thread.task_counter, 0);
|
||||
atomic_store(&f->task_thread.done[0], 1);
|
||||
atomic_store(&f->task_thread.done[1], 1);
|
||||
atomic_store(&f->sr_cur.progress[0], FRAME_ERROR);
|
||||
atomic_store(&f->sr_cur.progress[1], FRAME_ERROR);
|
||||
dav1d_decode_frame_exit(f, error);
|
||||
|
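get_frame_progress above recovers how many superblock rows are contiguously finished from a bitmap: finished rows set their bit with atomic_fetch_or elsewhere in this file, and the reader looks for the first hole by counting trailing zeros of the inverted word. A condensed sketch, assuming GCC/Clang's __builtin_ctz and 32-bit words (names illustrative):

#include <stdatomic.h>

/* Returns the highest N such that rows 0..N are all marked done,
 * or -1 if row 0 is still pending. */
static int contiguous_progress(atomic_uint *bitmap, const int n_words)
{
    for (int idx = 0; idx < n_words; idx++) {
        const unsigned val = ~atomic_load(&bitmap[idx]);
        if (val)    /* first word with an unset bit */
            return ((idx << 5) | __builtin_ctz(val)) - 1;
    }
    return (n_words << 5) - 1;    /* every tracked row is done */
}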
@ -478,6 +555,8 @@ void *dav1d_worker_task(void *data) {
|
|||
for (;;) {
|
||||
if (tc->task_thread.die) break;
|
||||
if (atomic_load(c->flush)) goto park;
|
||||
|
||||
merge_pending(c);
|
||||
if (ttd->delayed_fg.exec) { // run delayed film grain first
|
||||
delayed_fg_task(c, ttd);
|
||||
continue;
|
||||
|
@ -488,11 +567,18 @@ void *dav1d_worker_task(void *data) {
|
|||
for (unsigned i = 0; i < c->n_fc; i++) {
|
||||
const unsigned first = atomic_load(&ttd->first);
|
||||
f = &c->fc[(first + i) % c->n_fc];
|
||||
if (f->task_thread.init_done) continue;
|
||||
if (atomic_load(&f->task_thread.init_done)) continue;
|
||||
t = f->task_thread.task_head;
|
||||
if (!t) continue;
|
||||
if (t->type == DAV1D_TASK_TYPE_INIT) goto found;
|
||||
if (t->type == DAV1D_TASK_TYPE_INIT_CDF) {
|
||||
// XXX This can be a simple else, if adding tasks of both
|
||||
// passes at once (in dav1d_task_create_tile_sbrow).
|
||||
// Adding the tasks to the pending Q can result in a
|
||||
// thread merging them before setting init_done.
|
||||
// We will need to set init_done before adding to the
|
||||
// pending Q, so maybe return the tasks, set init_done,
|
||||
// and add to pending Q only then.
|
||||
const int p1 = f->in_cdf.progress ?
|
||||
atomic_load(f->in_cdf.progress) : 1;
|
||||
if (p1) {
|
||||
|
@ -505,6 +591,7 @@ void *dav1d_worker_task(void *data) {
|
|||
while (ttd->cur < c->n_fc) { // run decoding tasks last
|
||||
const unsigned first = atomic_load(&ttd->first);
|
||||
f = &c->fc[(first + ttd->cur) % c->n_fc];
|
||||
merge_pending_frame(f);
|
||||
prev_t = f->task_thread.task_cur_prev;
|
||||
t = prev_t ? prev_t->next : f->task_thread.task_head;
|
||||
while (t) {
|
||||
|
@ -519,11 +606,12 @@ void *dav1d_worker_task(void *data) {
|
|||
} else if (t->recon_progress) {
|
||||
const int p = t->type == DAV1D_TASK_TYPE_ENTROPY_PROGRESS;
|
||||
int error = atomic_load(&f->task_thread.error);
|
||||
assert(!f->task_thread.done[p] || error);
|
||||
assert(!atomic_load(&f->task_thread.done[p]) || error);
|
||||
const int tile_row_base = f->frame_hdr->tiling.cols *
|
||||
f->frame_thread.next_tile_row[p];
|
||||
if (p) {
|
||||
const int p1 = f->frame_thread.entropy_progress;
|
||||
atomic_int *const prog = &f->frame_thread.entropy_progress;
|
||||
const int p1 = atomic_load(prog);
|
||||
if (p1 < t->sby) goto next;
|
||||
atomic_fetch_or(&f->task_thread.error, p1 == TILE_ERROR);
|
||||
}
|
||||
|
@ -567,6 +655,7 @@ void *dav1d_worker_task(void *data) {
|
|||
ttd->cur++;
|
||||
}
|
||||
if (reset_task_cur(c, ttd, UINT_MAX)) continue;
|
||||
if (merge_pending(c)) continue;
|
||||
park:
|
||||
tc->task_thread.flushed = 1;
|
||||
pthread_cond_signal(&tc->task_thread.td.cond);
|
||||
|
@ -584,6 +673,7 @@ void *dav1d_worker_task(void *data) {
|
|||
if (!t->next) f->task_thread.task_tail = prev_t;
|
||||
if (t->type > DAV1D_TASK_TYPE_INIT_CDF && !f->task_thread.task_head)
|
||||
ttd->cur++;
|
||||
t->next = NULL;
|
||||
// we don't need to check cond_signaled here, since we found a task
|
||||
// after the last signal so we want to re-signal the next waiting thread
|
||||
// and again won't need to signal after that
|
||||
|
@ -605,13 +695,13 @@ void *dav1d_worker_task(void *data) {
|
|||
if (res || p1 == TILE_ERROR) {
|
||||
pthread_mutex_lock(&ttd->lock);
|
||||
abort_frame(f, res ? res : DAV1D_ERR(EINVAL));
|
||||
} else if (!res) {
|
||||
reset_task_cur(c, ttd, t->frame_idx);
|
||||
} else {
|
||||
t->type = DAV1D_TASK_TYPE_INIT_CDF;
|
||||
if (p1) goto found_unlocked;
|
||||
add_pending(f, t);
|
||||
pthread_mutex_lock(&ttd->lock);
|
||||
insert_task(f, t, 0);
|
||||
}
|
||||
reset_task_cur(c, ttd, t->frame_idx);
|
||||
continue;
|
||||
}
|
||||
case DAV1D_TASK_TYPE_INIT_CDF: {
|
||||
|
@ -619,7 +709,6 @@ void *dav1d_worker_task(void *data) {
|
|||
int res = DAV1D_ERR(EINVAL);
|
||||
if (!atomic_load(&f->task_thread.error))
|
||||
res = dav1d_decode_frame_init_cdf(f);
|
||||
pthread_mutex_lock(&ttd->lock);
|
||||
if (f->frame_hdr->refresh_context && !f->task_thread.update_set) {
|
||||
atomic_store(f->out_cdf.progress, res < 0 ? TILE_ERROR : 1);
|
||||
}
|
||||
|
@ -628,23 +717,34 @@ void *dav1d_worker_task(void *data) {
|
|||
for (int p = 1; p <= 2; p++) {
|
||||
const int res = dav1d_task_create_tile_sbrow(f, p, 0);
|
||||
if (res) {
|
||||
pthread_mutex_lock(&ttd->lock);
|
||||
// memory allocation failed
|
||||
f->task_thread.done[2 - p] = 1;
|
||||
atomic_store(&f->task_thread.done[2 - p], 1);
|
||||
atomic_store(&f->task_thread.error, -1);
|
||||
f->task_thread.task_counter -= f->sbh +
|
||||
f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;
|
||||
atomic_fetch_sub(&f->task_thread.task_counter,
|
||||
f->frame_hdr->tiling.cols *
|
||||
f->frame_hdr->tiling.rows + f->sbh);
|
||||
atomic_store(&f->sr_cur.progress[p - 1], FRAME_ERROR);
|
||||
if (p == 2 && f->task_thread.done[1]) {
|
||||
assert(!f->task_thread.task_counter);
|
||||
if (p == 2 && atomic_load(&f->task_thread.done[1])) {
|
||||
assert(!atomic_load(&f->task_thread.task_counter));
|
||||
dav1d_decode_frame_exit(f, DAV1D_ERR(ENOMEM));
|
||||
f->n_tile_data = 0;
|
||||
pthread_cond_signal(&f->task_thread.cond);
|
||||
atomic_store(&f->task_thread.init_done, 1);
|
||||
continue;
|
||||
} else {
|
||||
pthread_mutex_unlock(&ttd->lock);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else abort_frame(f, res);
|
||||
reset_task_cur(c, ttd, t->frame_idx);
|
||||
f->task_thread.init_done = 1;
|
||||
atomic_store(&f->task_thread.init_done, 1);
|
||||
pthread_mutex_lock(&ttd->lock);
|
||||
} else {
|
||||
pthread_mutex_lock(&ttd->lock);
|
||||
abort_frame(f, res);
|
||||
reset_task_cur(c, ttd, t->frame_idx);
|
||||
atomic_store(&f->task_thread.init_done, 1);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
case DAV1D_TASK_TYPE_TILE_ENTROPY:
|
||||
|
@ -673,10 +773,9 @@ void *dav1d_worker_task(void *data) {
|
|||
pthread_cond_signal(&ttd->cond);
|
||||
goto found_unlocked;
|
||||
}
|
||||
pthread_mutex_lock(&ttd->lock);
|
||||
atomic_store(&ts->progress[p], progress);
|
||||
reset_task_cur(c, ttd, t->frame_idx);
|
||||
insert_task(f, t, 0);
|
||||
add_pending(f, t);
|
||||
pthread_mutex_lock(&ttd->lock);
|
||||
} else {
|
||||
pthread_mutex_lock(&ttd->lock);
|
||||
atomic_store(&ts->progress[p], progress);
|
||||
|
@ -692,15 +791,16 @@ void *dav1d_worker_task(void *data) {
|
|||
if (c->n_fc > 1)
|
||||
atomic_store(f->out_cdf.progress, error ? TILE_ERROR : 1);
|
||||
}
|
||||
if (!--f->task_thread.task_counter && f->task_thread.done[0] &&
|
||||
(!uses_2pass || f->task_thread.done[1]))
|
||||
if (atomic_fetch_sub(&f->task_thread.task_counter, 1) - 1 == 0 &&
|
||||
atomic_load(&f->task_thread.done[0]) &&
|
||||
(!uses_2pass || atomic_load(&f->task_thread.done[1])))
|
||||
{
|
||||
dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) :
|
||||
error ? DAV1D_ERR(ENOMEM) : 0);
|
||||
f->n_tile_data = 0;
|
||||
pthread_cond_signal(&f->task_thread.cond);
|
||||
}
|
||||
assert(f->task_thread.task_counter >= 0);
|
||||
assert(atomic_load(&f->task_thread.task_counter) >= 0);
|
||||
if (!atomic_fetch_or(&ttd->cond_signaled, 1))
|
||||
pthread_cond_signal(&ttd->cond);
|
||||
}
|
||||
|
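The completion check above decrements task_counter atomically and keys the frame-exit path off the pre-decrement value, so only the thread that brings the counter to zero can run dav1d_decode_frame_exit. A one-line illustration of the idiom (hypothetical helper name):

#include <stdatomic.h>

/* atomic_fetch_sub returns the value before the decrement, so exactly
 * one caller, the one that observes 1, sees the counter reach zero. */
static int finished_last_task(atomic_int *task_counter)
{
    return atomic_fetch_sub(task_counter, 1) - 1 == 0;
}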
@ -734,15 +834,11 @@ void *dav1d_worker_task(void *data) {
|
|||
if (sby) {
|
||||
int prog = atomic_load(&f->frame_thread.copy_lpf_progress[(sby - 1) >> 5]);
|
||||
if (~prog & (1U << ((sby - 1) & 31))) {
|
||||
t->type = DAV1D_TASK_TYPE_CDEF;
|
||||
t->recon_progress = t->deblock_progress = 0;
|
||||
add_pending(f, t);
|
||||
pthread_mutex_lock(&ttd->lock);
|
||||
prog = atomic_load(&f->frame_thread.copy_lpf_progress[(sby - 1) >> 5]);
|
||||
if (~prog & (1U << ((sby - 1) & 31))) {
|
||||
t->type = DAV1D_TASK_TYPE_CDEF;
|
||||
t->recon_progress = t->deblock_progress = 0;
|
||||
insert_task(f, t, 0);
|
||||
continue;
|
||||
}
|
||||
pthread_mutex_unlock(&ttd->lock);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -776,40 +872,53 @@ void *dav1d_worker_task(void *data) {
|
|||
const int uses_2pass = c->n_fc > 1;
|
||||
const int sbh = f->sbh;
|
||||
const int sbsz = f->sb_step * 4;
|
||||
const enum PlaneType progress_plane_type =
|
||||
t->type == DAV1D_TASK_TYPE_ENTROPY_PROGRESS ? PLANE_TYPE_BLOCK :
|
||||
c->n_fc > 1 ? PLANE_TYPE_Y : PLANE_TYPE_ALL;
|
||||
if (t->type != DAV1D_TASK_TYPE_ENTROPY_PROGRESS)
|
||||
atomic_fetch_or(&f->frame_thread.frame_progress[sby >> 5],
|
||||
1U << (sby & 31));
|
||||
pthread_mutex_lock(&ttd->lock);
|
||||
if (t->type != DAV1D_TASK_TYPE_ENTROPY_PROGRESS) {
|
||||
unsigned frame_prog = c->n_fc > 1 ? atomic_load(&f->sr_cur.progress[1]) : 0;
|
||||
if (frame_prog < FRAME_ERROR) {
|
||||
int idx = frame_prog >> (f->sb_shift + 7);
|
||||
int prog;
|
||||
do {
|
||||
atomic_uint *state = &f->frame_thread.frame_progress[idx];
|
||||
const unsigned val = ~atomic_load(state);
|
||||
prog = val ? ctz(val) : 32;
|
||||
if (prog != 32) break;
|
||||
prog = 0;
|
||||
} while (++idx < f->frame_thread.prog_sz);
|
||||
sby = ((idx << 5) | prog) - 1;
|
||||
} else sby = sbh - 1;
|
||||
if (t->type == DAV1D_TASK_TYPE_ENTROPY_PROGRESS) {
|
||||
error = atomic_load(&f->task_thread.error);
|
||||
const unsigned y = sby + 1 == sbh ? UINT_MAX : (unsigned)(sby + 1) * sbsz;
|
||||
assert(c->n_fc > 1);
|
||||
if (f->sr_cur.p.data[0] /* upon flush, this can be free'ed already */)
|
||||
atomic_store(&f->sr_cur.progress[0], error ? FRAME_ERROR : y);
|
||||
atomic_store(&f->frame_thread.entropy_progress,
|
||||
error ? TILE_ERROR : sby + 1);
|
||||
if (sby + 1 == sbh)
|
||||
atomic_store(&f->task_thread.done[1], 1);
|
||||
pthread_mutex_lock(&ttd->lock);
|
||||
const int num_tasks = atomic_fetch_sub(&f->task_thread.task_counter, 1) - 1;
|
||||
if (sby + 1 < sbh && num_tasks) {
|
||||
reset_task_cur(c, ttd, t->frame_idx);
|
||||
continue;
|
||||
}
|
||||
if (!num_tasks && atomic_load(&f->task_thread.done[0]) &&
|
||||
atomic_load(&f->task_thread.done[1]))
|
||||
{
|
||||
dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) :
|
||||
error ? DAV1D_ERR(ENOMEM) : 0);
|
||||
f->n_tile_data = 0;
|
||||
pthread_cond_signal(&f->task_thread.cond);
|
||||
}
|
||||
reset_task_cur(c, ttd, t->frame_idx);
|
||||
continue;
|
||||
}
|
||||
// t->type != DAV1D_TASK_TYPE_ENTROPY_PROGRESS
|
||||
atomic_fetch_or(&f->frame_thread.frame_progress[sby >> 5],
|
||||
1U << (sby & 31));
|
||||
pthread_mutex_lock(&f->task_thread.lock);
|
||||
sby = get_frame_progress(c, f);
|
||||
error = atomic_load(&f->task_thread.error);
|
||||
const unsigned y = sby + 1 == sbh ? UINT_MAX : (unsigned)(sby + 1) * sbsz;
|
||||
if (c->n_fc > 1 && f->sr_cur.p.data[0] /* upon flush, this can be free'ed already */) {
|
||||
const int idx = t->type != DAV1D_TASK_TYPE_ENTROPY_PROGRESS;
|
||||
atomic_store(&f->sr_cur.progress[idx], error ? FRAME_ERROR : y);
|
||||
}
|
||||
if (progress_plane_type == PLANE_TYPE_BLOCK)
|
||||
f->frame_thread.entropy_progress = error ? TILE_ERROR : sby + 1;
|
||||
if (c->n_fc > 1 && f->sr_cur.p.data[0] /* upon flush, this can be free'ed already */)
|
||||
atomic_store(&f->sr_cur.progress[1], error ? FRAME_ERROR : y);
|
||||
pthread_mutex_unlock(&f->task_thread.lock);
|
||||
if (sby + 1 == sbh)
|
||||
f->task_thread.done[progress_plane_type == PLANE_TYPE_BLOCK] = 1;
|
||||
if (!--f->task_thread.task_counter &&
|
||||
f->task_thread.done[0] && (!uses_2pass || f->task_thread.done[1]))
|
||||
atomic_store(&f->task_thread.done[0], 1);
|
||||
pthread_mutex_lock(&ttd->lock);
|
||||
const int num_tasks = atomic_fetch_sub(&f->task_thread.task_counter, 1) - 1;
|
||||
if (sby + 1 < sbh && num_tasks) {
|
||||
reset_task_cur(c, ttd, t->frame_idx);
|
||||
continue;
|
||||
}
|
||||
if (!num_tasks && atomic_load(&f->task_thread.done[0]) &&
|
||||
(!uses_2pass || atomic_load(&f->task_thread.done[1])))
|
||||
{
|
||||
dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) :
|
||||
error ? DAV1D_ERR(ENOMEM) : 0);
|
||||
|
|
|
@ -31,11 +31,11 @@
|
|||
#include "src/levels.h"
|
||||
|
||||
void dav1d_init_wedge_masks(void);
|
||||
extern const uint8_t *dav1d_wedge_masks[N_BS_SIZES][3 /* 444/luma, 422, 420 */]
|
||||
EXTERN const uint8_t *dav1d_wedge_masks[N_BS_SIZES][3 /* 444/luma, 422, 420 */]
|
||||
[2 /* sign */][16 /* wedge_idx */];
|
||||
|
||||
void dav1d_init_interintra_masks(void);
|
||||
extern const uint8_t *dav1d_ii_masks[N_BS_SIZES][3 /* 444/luma, 422, 420 */]
|
||||
EXTERN const uint8_t *dav1d_ii_masks[N_BS_SIZES][3 /* 444/luma, 422, 420 */]
|
||||
[N_INTER_INTRA_PRED_MODES];
|
||||
|
||||
#endif /* DAV1D_SRC_WEDGE_H */
|
||||
|
|
|
@ -126,6 +126,7 @@ decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x32, bpc, ext)); \
|
|||
decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x64, bpc, ext))
|
||||
|
||||
decl_itx_fns(avx512icl);
|
||||
decl_itx_bpc_fns(10, avx512icl);
|
||||
decl_itx_fns(avx2);
|
||||
decl_itx_bpc_fns(10, avx2);
|
||||
decl_itx_bpc_fns(12, avx2);
|
||||
|
@ -341,6 +342,13 @@ static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, cons
|
|||
assign_itx1_fn (R, 64, 16, avx512icl);
|
||||
assign_itx1_fn (R, 64, 32, avx512icl);
|
||||
assign_itx1_fn ( , 64, 64, avx512icl);
|
||||
#else
|
||||
if (bpc == 10) {
|
||||
assign_itx16_bpc_fn( , 8, 8, 10, avx512icl);
|
||||
assign_itx16_bpc_fn(R, 8, 16, 10, avx512icl);
|
||||
assign_itx16_bpc_fn(R, 16, 8, 10, avx512icl);
|
||||
assign_itx12_bpc_fn( , 16, 16, 10, avx512icl);
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
|
(diffs of two files not shown: too large to display)
|
@ -361,18 +361,32 @@ ALIGN function_align
|
|||
%macro INV_TXFM_4X4_FN 2 ; type1, type2
|
||||
INV_TXFM_FN %1, %2, 0, 4x4
|
||||
%ifidn %1_%2, dct_dct
|
||||
imul r5d, [cq], 2896
|
||||
movd m1, [o(pw_2896x8)]
|
||||
imul r5d, [cq], 181
|
||||
mov [cq], eobd ; 0
|
||||
add r5d, 2048
|
||||
sar r5d, 12
|
||||
mov r3d, 4
|
||||
.dconly:
|
||||
add r5d, 128
|
||||
sar r5d, 8
|
||||
.dconly2:
|
||||
imul r5d, 2896
|
||||
mova m2, [o(pixel_10bpc_max)]
|
||||
add r5d, 34816
|
||||
movd m0, r5d
|
||||
packssdw m0, m0
|
||||
pmulhrsw m0, m1
|
||||
pshuflw m0, m0, q0000
|
||||
pshuflw m0, m0, q1111
|
||||
pxor m3, m3
|
||||
punpcklqdq m0, m0
|
||||
mova m1, m0
|
||||
TAIL_CALL m(iadst_4x4_internal_16bpc).end
|
||||
.dconly_loop:
|
||||
movq m1, [dstq+strideq*0]
|
||||
movhps m1, [dstq+strideq*1]
|
||||
paddw m1, m0
|
||||
pminsw m1, m2
|
||||
pmaxsw m1, m3
|
||||
movq [dstq+strideq*0], m1
|
||||
movhps [dstq+strideq*1], m1
|
||||
lea dstq, [dstq+strideq*2]
|
||||
sub r3d, 2
|
||||
jg .dconly_loop
|
||||
RET
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
|
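Throughout this file the DC-only paths swap the multiplier 2896 (round(sqrt(2)*2048)) for 181 (round(sqrt(2)*128)) and shrink the rounding constant and shift by the same factor of 16, so the rounded result is unchanged while the intermediate product stays smaller. A quick plain-C check of that equivalence (not part of the patch; it assumes arithmetic right shift, matching the asm's sar):

#include <assert.h>

int main(void) {
    /* DC inputs in these paths fit comfortably in this range */
    for (int x = -100000; x <= 100000; x++) {
        const int a = (x * 2896 + 2048) >> 12;  /* old rounding */
        const int b = (x * 181  +  128) >>  8;  /* new rounding */
        assert(a == b);                         /* 2896 = 181*16, 2048 = 128*16 */
    }
    return 0;
}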
@ -662,40 +676,13 @@ cglobal iidentity_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
|
|||
%macro INV_TXFM_4X8_FN 2-3 0 ; type1, type2, eob_offset
|
||||
INV_TXFM_FN %1, %2, %3, 4x8
|
||||
%ifidn %1_%2, dct_dct
|
||||
imul r5d, [cq], 2896
|
||||
imul r5d, [cq], 181
|
||||
mov [cq], eobd ; 0
|
||||
mov r3d, 2
|
||||
add r5d, 2048
|
||||
sar r5d, 12
|
||||
imul r5d, 2896
|
||||
add r5d, 2048
|
||||
sar r5d, 12
|
||||
.end:
|
||||
imul r5d, 2896
|
||||
add r5d, 34816
|
||||
movd m0, r5d
|
||||
pshuflw m0, m0, q1111
|
||||
punpcklqdq m0, m0
|
||||
pxor m4, m4
|
||||
mova m3, [o(pixel_10bpc_max)]
|
||||
lea r2, [strideq*3]
|
||||
.loop:
|
||||
movq m1, [dstq+strideq*0]
|
||||
movq m2, [dstq+strideq*2]
|
||||
movhps m1, [dstq+strideq*1]
|
||||
movhps m2, [dstq+r2]
|
||||
paddw m1, m0
|
||||
paddw m2, m0
|
||||
REPX {pminsw x, m3}, m1, m2
|
||||
REPX {pmaxsw x, m4}, m1, m2
|
||||
movq [dstq+strideq*0], m1
|
||||
movhps [dstq+strideq*1], m1
|
||||
movq [dstq+strideq*2], m2
|
||||
movhps [dstq+r2 ], m2
|
||||
lea dstq, [dstq+strideq*4]
|
||||
dec r3d
|
||||
jg .loop
|
||||
RET
|
||||
mov r3d, 8
|
||||
add r5d, 128
|
||||
sar r5d, 8
|
||||
imul r5d, 181
|
||||
jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
|
@ -944,12 +931,12 @@ cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
|
|||
%macro INV_TXFM_4X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
|
||||
INV_TXFM_FN %1, %2, tbl_4x16_%3, 4x16
|
||||
%ifidn %1_%2, dct_dct
|
||||
imul r5d, [cq], 2896
|
||||
imul r5d, [cq], 181
|
||||
mov [cq], eobd ; 0
|
||||
mov r3d, 4
|
||||
add r5d, 6144
|
||||
sar r5d, 13
|
||||
jmp m(inv_txfm_add_dct_dct_4x8_16bpc).end
|
||||
mov r3d, 16
|
||||
add r5d, 384
|
||||
sar r5d, 9
|
||||
jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly2
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
|
@ -1297,13 +1284,13 @@ cglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
|
|||
INV_TXFM_FN %1, %2, 0, 8x4, 8, 0-4*16
|
||||
%endif
|
||||
%ifidn %1_%2, dct_dct
|
||||
imul r5d, [cq], 2896
|
||||
imul r5d, [cq], 181
|
||||
mov [cq], eobd ; 0
|
||||
add r5d, 2048
|
||||
sar r5d, 12
|
||||
imul r5d, 2896
|
||||
add r5d, 2048
|
||||
sar r5d, 12
|
||||
add r5d, 128
|
||||
sar r5d, 8
|
||||
imul r5d, 181
|
||||
add r5d, 128
|
||||
sar r5d, 8
|
||||
imul r5d, 2896
|
||||
add r5d, 34816
|
||||
movd m0, r5d
|
||||
|
@ -1783,12 +1770,12 @@ cglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
|
|||
INV_TXFM_FN %1, %2, %3, 8x8, 8, 0-5*16
|
||||
%endif
|
||||
%ifidn %1_%2, dct_dct
|
||||
imul r5d, [cq], 2896
|
||||
imul r5d, [cq], 181
|
||||
mov [cq], eobd ; 0
|
||||
mov r3d, 2
|
||||
.end:
|
||||
add r5d, 6144
|
||||
sar r5d, 13
|
||||
add r5d, 384
|
||||
sar r5d, 9
|
||||
.end2:
|
||||
imul r5d, 2896
|
||||
add r5d, 34816
|
||||
|
@ -2146,11 +2133,11 @@ cglobal iidentity_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
|
|||
INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 8, 0-17*16
|
||||
%endif
|
||||
%ifidn %1_%2, dct_dct
|
||||
imul r5d, [cq], 2896
|
||||
imul r5d, [cq], 181
|
||||
mov [cq], eobd ; 0
|
||||
add r5d, 2048
|
||||
sar r5d, 12
|
||||
imul r5d, 2896
|
||||
add r5d, 128
|
||||
sar r5d, 8
|
||||
imul r5d, 181
|
||||
mov r3d, 4
|
||||
%if stack_size_padded > 0
|
||||
; adjust to caller's stack allocation
|
||||
|
@ -2477,12 +2464,12 @@ cglobal iidentity_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
|
|||
INV_TXFM_FN %1, %2, 0, 16x4, 8, 0-12*16
|
||||
%endif
|
||||
%ifidn %1_%2, dct_dct
|
||||
imul r5d, [cq], 2896
|
||||
imul r5d, [cq], 181
|
||||
mov [cq], eobd ; 0
|
||||
mov r3d, 4
|
||||
.dconly:
|
||||
add r5d, 6144
|
||||
sar r5d, 13
|
||||
add r5d, 384
|
||||
sar r5d, 9
|
||||
.dconly2:
|
||||
imul r5d, 2896
|
||||
add r5d, 34816
|
||||
|
@ -2755,6 +2742,8 @@ cglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
|
|||
ret
|
||||
.round:
|
||||
%if ARCH_X86_64
|
||||
REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
|
||||
REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
|
||||
pcmpeqd m8, m8
|
||||
REPX {psubd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
|
||||
mova m8, [r3+1*16]
|
||||
|
@ -2784,6 +2773,14 @@ cglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
|
|||
m8, m9, m10, m11, m12, m13, m14, m15
|
||||
; and out0-15 is now in m0-15
|
||||
%else
|
||||
mova [r3+ 0*16], m0
|
||||
mova m0, [o(clip_18b_min)]
|
||||
REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7
|
||||
pmaxsd m0, [r3+ 0*16]
|
||||
mova [r3+ 0*16], m7
|
||||
mova m7, [o(clip_18b_max)]
|
||||
REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6
|
||||
pminsd m7, [r3+ 0*16]
|
||||
mova [r3+ 0*16], m0
|
||||
pcmpeqd m0, m0
|
||||
REPX {psubd x, m0}, m1, m2, m3, m4, m5, m6, m7
|
||||
|
@ -3472,12 +3469,12 @@ cglobal iidentity_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
|
|||
INV_TXFM_FN %1, %2, %3, 16x8, 8, 0-13*16
|
||||
%endif
|
||||
%ifidn %1_%2, dct_dct
|
||||
imul r5d, [cq], 2896
|
||||
imul r5d, [cq], 181
|
||||
mov [cq], eobd ; 0
|
||||
mov r3d, 8
|
||||
add r5d, 2048
|
||||
sar r5d, 12
|
||||
imul r5d, 2896
|
||||
add r5d, 128
|
||||
sar r5d, 8
|
||||
imul r5d, 181
|
||||
%if ARCH_X86_32
|
||||
add rsp, 1*16
|
||||
%endif
|
||||
|
@ -3939,11 +3936,11 @@ cglobal iidentity_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
|
|||
INV_TXFM_FN %1, %2, tbl_16x16_%3, 16x16, 8, 0-17*16
|
||||
%endif
|
||||
%ifidn %1_%2, dct_dct
|
||||
imul r5d, [cq], 2896
|
||||
imul r5d, [cq], 181
|
||||
mov [cq], eobd ; 0
|
||||
mov r3d, 16
|
||||
add r5d, 10240
|
||||
sar r5d, 14
|
||||
add r5d, 640
|
||||
sar r5d, 10
|
||||
add rsp, (5+ARCH_X86_64*3+WIN64)*16
|
||||
jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
|
||||
%endif
|
||||
|
@ -4057,6 +4054,8 @@ cglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
|
|||
ret
|
||||
.round:
|
||||
%if ARCH_X86_64
|
||||
REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
|
||||
REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
|
||||
psrld m8, m11, 10 ; 2
|
||||
REPX {paddd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
|
||||
mova m8, [r3+1*16]
|
||||
|
@ -4086,6 +4085,14 @@ cglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
|
|||
m8, m9, m10, m11, m12, m13, m14, m15
|
||||
; and out0-15 is now in m0-15
|
||||
%else
|
||||
mova [r3+ 0*16], m0
|
||||
mova m0, [o(clip_18b_min)]
|
||||
REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7
|
||||
pmaxsd m0, [r3+ 0*16]
|
||||
mova [r3+ 0*16], m7
|
||||
mova m7, [o(clip_18b_max)]
|
||||
REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6
|
||||
pminsd m7, [r3+ 0*16]
|
||||
mova [r3+ 0*16], m0
|
||||
mova m0, [o(pd_2)]
|
||||
REPX {paddd x, m0}, m1, m2, m3, m4, m5, m6, m7
|
||||
|
@ -5162,11 +5169,11 @@ cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 15, 0-36*16, \
|
|||
call m(idct_8x8_internal_16bpc).round1_and_write_8x8
|
||||
ret
|
||||
.dconly:
|
||||
imul r5d, [cq], 2896
|
||||
imul r5d, [cq], 181
|
||||
mov [cq], eobd ; 0
|
||||
mov r3d, 8
|
||||
add r5d, 10240
|
||||
sar r5d, 14
|
||||
add r5d, 640
|
||||
sar r5d, 10
|
||||
add rsp, (31+2*ARCH_X86_64)*16
|
||||
jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end2

@ -5339,12 +5346,12 @@ cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 16, 0-77*16, \
%endif
RET
.dconly:
imul r5d, [cq], 2896
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 32
add r5d, 2048
sar r5d, 12
imul r5d, 2896
add r5d, 128
sar r5d, 8
imul r5d, 181
add rsp, (65+4*ARCH_X86_64)*16
jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly

@ -5944,6 +5951,8 @@ cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
; final sumsub for idct16 as well as idct32, plus final downshift
%macro IDCT32_END 6 ; in/out1, out2-4, tmp, shift, idx
mova m%4, [r3+16*(23-%1)]
pmaxsd m%1, m12
pminsd m%1, m13
psubd m%3, m%1, m%4 ; idct16 out15 - n
paddd m%1, m%4 ; idct16 out0 + n
pmaxsd m%1, m12

@ -6019,6 +6028,8 @@ cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
.loop_dct32_end:
mova m0, [r3+16*16]
mova m6, [r3+16*24]
pmaxsd m0, m2
pminsd m0, m3
psubd m5, m0, m6 ; idct16 out15 - n
paddd m0, m6 ; idct16 out0 + n
pmaxsd m0, m2

@ -6045,12 +6056,12 @@ cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
%endif

.dconly:
imul r5d, [cq], 2896
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 8
.dconly1:
add r5d, 10240
sar r5d, 14
add r5d, 640
sar r5d, 10
.dconly2:
imul r5d, 2896
add r5d, 34816

@ -6344,14 +6355,14 @@ cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
%endif

.dconly:
imul r5d, [cq], 2896
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 16
add r5d, 2048
sar r5d, 12
imul r5d, 2896
add r5d, 6144
sar r5d, 13
add r5d, 128
sar r5d, 8
imul r5d, 181
add r5d, 384
sar r5d, 9
jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2

cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \

@ -6565,7 +6576,7 @@ cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \
jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry

.dconly:
imul r5d, [cq], 2896
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 32
add rsp, (5*32+1-(24+8*ARCH_X86_32))*16

@ -6838,11 +6849,11 @@ cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 16, \
ret

.dconly:
imul r5d, [cq], 2896
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 64
add r5d, 10240
sar r5d, 14
add r5d, 640
sar r5d, 10
add rsp, (12+2*64)*16+(4+4*ARCH_X86_32)*gprsize-(8+4*ARCH_X86_32)*16
jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2

@ -7098,14 +7109,14 @@ cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 16, \
jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2

.dconly:
imul r5d, [cq], 2896
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 64
add r5d, 2048
sar r5d, 12
imul r5d, 2896
add r5d, 6144
sar r5d, 13
add r5d, 128
sar r5d, 8
imul r5d, 181
add r5d, 384
sar r5d, 9
add rsp, (32+4*64)*16+(4+4*ARCH_X86_32)*gprsize-(24+8*ARCH_X86_32)*16
jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2

@ -7537,6 +7548,8 @@ cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \
mova m5, [r3-16* 4] ; idct64 48 + n
mova m6, [r4-16*20] ; idct64 47 - n
mova m7, [r3-16*20] ; idct64 32 + n
pmaxsd m0, m12
pminsd m0, m13
paddd m8, m0, m1 ; idct16 out0 + n
psubd m0, m1 ; idct16 out15 - n
REPX {pmaxsd x, m12}, m8, m0

@ -7565,11 +7578,13 @@ cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \
mova [r4-16* 4], m6
mova [r3+16*12], m8
%else
mova m1, [r3+16*44] ; idct16 15 - n
paddd m4, m0, m1 ; idct16 out0 + n
psubd m0, m1 ; idct16 out15 - n
mova m5, [o(clip_18b_min)]
mova m6, [o(clip_18b_max)]
mova m1, [r3+16*44] ; idct16 15 - n
pmaxsd m0, m5
pminsd m0, m6
paddd m4, m0, m1 ; idct16 out0 + n
psubd m0, m1 ; idct16 out15 - n
REPX {pmaxsd x, m5}, m4, m0
REPX {pminsd x, m6}, m4, m0
paddd m1, m4, m3 ; idct32 out0 + n

@ -7632,12 +7647,12 @@ cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \
ret

.dconly:
imul r5d, [cq], 2896
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 16
.dconly1:
add r5d, 10240
sar r5d, 14
add r5d, 640
sar r5d, 10
.dconly2:
imul r5d, 2896
add r5d, 34816

@ -7876,14 +7891,14 @@ cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 16, \
ret

.dconly:
imul r5d, [cq], 2896
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 32
add r5d, 2048
sar r5d, 12
imul r5d, 2896
add r5d, 6144
sar r5d, 13
add r5d, 128
sar r5d, 8
imul r5d, 181
add r5d, 384
sar r5d, 9
add rsp, (1+8*32+1*WIN64)*16
jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly2

@ -8112,7 +8127,7 @@ cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 16, \
ret

.dconly:
imul r5d, [cq], 2896
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 64
add rsp, (64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16 + \

@ -29,7 +29,8 @@
%if ARCH_X86_64

SECTION_RODATA 64
int8_permA: db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51
const \
int8_permA, db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51
db 4, 5, 20, 21, 36, 37, 52, 53, 6, 7, 22, 23, 38, 39, 54, 55
db 8, 9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59
db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63

@ -845,7 +846,7 @@ cglobal iidentity_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
punpcklwd m3, m5 ; dct8 in3 in5
punpckhwd m5, m2 ; dct16 in11 in5
punpcklwd m6, m2 ; dct4 in3 in1
.main2:
cglobal_label .main2
vpbroadcastd m10, [o(pd_2048)]
.main3:
vpbroadcastq m13, [o(int_mshift)]

@ -1355,7 +1356,7 @@ cglobal idct_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
vpermq m3, m3, q2031
jmp m(iadst_8x8_internal_8bpc).end2
ALIGN function_align
.main:
cglobal_label .main
IDCT8_1D_PACKED
ret

@ -1422,7 +1423,7 @@ ALIGN function_align
punpckhqdq m0, m4 ; out0 -out1
ret
ALIGN function_align
.main_pass2:
cglobal_label .main_pass2
IADST8_1D_PACKED 2
ret

@ -1608,7 +1609,7 @@ cglobal idct_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
vpscatterdq [r3+ym8]{k2}, m2
RET
ALIGN function_align
.main:
cglobal_label .main
WRAP_YMM IDCT16_1D_PACKED
ret

@ -1685,13 +1686,14 @@ ALIGN function_align
vpermi2q m6, m0, m2 ; in4 in8 in6 in10
vpermt2q m1, m10, m3 ; in11 in7 in9 in5
.main:
vpbroadcastd m9, [o(pd_2048)]
vpbroadcastq m13, [o(int_mshift)]
kxnorb k1, k1, k1
punpcklwd m0, m4, m5 ; in0 in15 in2 in13
punpckhwd m4, m5 ; in12 in3 in14 in1
punpcklwd m5, m6, m1 ; in4 in11 in6 in9
punpckhwd m6, m1 ; in8 in7 in10 in5
cglobal_label .main2
vpbroadcastd m9, [o(pd_2048)]
vpbroadcastq m13, [o(int_mshift)]
kxnorb k1, k1, k1
vpcmpub k7, m13, m9, 6 ; 0x33...
pxor m8, m8
ITX_MUL4X_PACK 0, 1, 2, 3, 7, 9, 201, 4091, 995, 3973, 5

@ -2114,7 +2116,7 @@ cglobal idct_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
vextracti32x4 [r3 +r4 ], m1, 3
RET
ALIGN function_align
.main:
cglobal_label .main
IDCT8_1D_PACKED
ret

@ -2168,6 +2170,7 @@ cglobal iadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
pshufd m4, m0, q1032 ; 1 0
pshufd m5, m1, q1032 ; 3 2
call .main_pass2
movshdup m4, [o(permC)]
pmulhrsw m0, m6
pmulhrsw m1, m6
psrlq m6, m4, 4

@ -2194,9 +2197,8 @@ ALIGN function_align
IADST8_1D_PACKED 1
ret
ALIGN function_align
.main_pass2:
cglobal_label .main_pass2
IADST8_1D_PACKED 2
movshdup m4, [o(permC)]
pxor m5, m5
psubd m5, m6
packssdw m6, m5

@ -2222,6 +2224,7 @@ cglobal iflipadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
pshufd m4, m0, q1032 ; 1 0
pshufd m5, m1, q1032 ; 3 2
call m(iadst_16x8_internal_8bpc).main_pass2
movshdup m4, [o(permC)]
pmulhrsw m5, m6, m0
pmulhrsw m0, m6, m1
psrlq m1, m4, 12

@ -2456,7 +2459,7 @@ ALIGN function_align
pmulhrsw m3, m4 ; t5a t6a
jmp .main4
ALIGN function_align
.main:
cglobal_label .main
IDCT16_1D_PACKED
ret

@ -2562,6 +2565,7 @@ ALIGN function_align
vshufi32x4 m1, m5, q2020 ; 2 3
vshufi32x4 m5, m7, m9, q2020 ; 10 11
vshufi32x4 m7, m9, q3131 ; 14 15
cglobal_label .main_pass2b
REPX {pshufd x, x, q1032}, m1, m3, m5, m7
call .main
vpbroadcastd m8, [o(pw_2896x8)]

@ -329,11 +329,11 @@ ALIGN function_align
packuswb m2, m4
psrlw m2, 8
vpackuswb m2{k2}, m3, m5
mova [dstq+r10], m2
add r10, 64
jl .hv_loop
mov t6, t5
mov t5, t4
movu [dstq+r10], m2 ; We don't have a separate 5-tap version so the 7-tap
add r10, 64 ; function is used for chroma as well, and in some
jl .hv_loop ; esoteric edge cases chroma dst pointers may only
mov t6, t5 ; have a 32-byte alignment despite having a width
mov t5, t4 ; larger than 32, so use an unaligned store here.
mov t4, t3
mov t3, t2
mov t2, t1
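
The mova-to-movu change above (and the matching one in the next hunk) swaps an alignment-checking 64-byte store for an unaligned one, for the chroma case described in the comment. In intrinsics terms it corresponds roughly to the sketch below; the intrinsic names are standard AVX-512 ones, but the helper itself is my illustration, not code from the patch.

    /* Illustration only: aligned vs. unaligned 64-byte stores. The aligned
     * form (mova / vmovdqa) faults if dst is not 64-byte aligned; the
     * unaligned form (movu / vmovdqu) accepts any address. */
    #include <immintrin.h>
    #include <stdint.h>

    static void store_row(uint8_t *dst, __m512i v, int dst_is_64byte_aligned)
    {
        if (dst_is_64byte_aligned)
            _mm512_store_si512(dst, v);  /* ~ mova [dstq+r10], m2 */
        else
            _mm512_storeu_si512(dst, v); /* ~ movu [dstq+r10], m2 */
    }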

@ -379,7 +379,7 @@ ALIGN function_align
packuswb m0, m2
psrlw m0, 8
vpackuswb m0{k2}, m1, m3
mova [dstq+r10], m0
movu [dstq+r10], m0
add r10, 64
jl .v_loop
mov t6, t5

@ -1604,7 +1604,7 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my
vpbroadcastd m11, [buf+ 4]
vpbroadcastd m12, [buf+ 8]
vpbroadcastd m13, [buf+12]
cmp wd, 16
sub wd, 16
je .h_w16
jg .h_w32
.h_w8:

@ -121,15 +121,15 @@ static struct {
CheckasmFunc *current_func;
CheckasmFuncVersion *current_func_ver;
const char *current_test_name;
const char *bench_pattern;
size_t bench_pattern_len;
int num_checked;
int num_failed;
int nop_time;
unsigned cpu_flag;
const char *cpu_flag_name;
const char *test_name;
const char *test_pattern;
const char *function_pattern;
unsigned seed;
int bench;
int bench_c;
int verbose;
int function_listing;

@ -489,6 +489,21 @@ static void signal_handler(const int s) {
}
#endif

/* Compares a string with a wildcard pattern. */
static int wildstrcmp(const char *str, const char *pattern) {
const char *wild = strchr(pattern, '*');
if (wild) {
const size_t len = wild - pattern;
if (strncmp(str, pattern, len)) return 1;
while (*++wild == '*');
if (!*wild) return 0;
str += len;
while (*str && wildstrcmp(str, wild)) str++;
return !*str;
}
return strcmp(str, pattern);
}
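
A brief usage sketch for the matcher added above; the function names in it are made up for illustration. Like strcmp(), wildstrcmp() returns 0 on a match, and '*' matches any run of characters, including an empty one.

    /* Illustration only, not part of the patch. */
    #include <assert.h>

    static void wildstrcmp_demo(void)
    {
        assert(wildstrcmp("mct_8tap_regular_w32_8bpc", "mct_8tap*8bpc") == 0);
        assert(wildstrcmp("mct_8tap_regular_w32_8bpc", "warp*") != 0);
        assert(wildstrcmp("cdef_dir_8bpc", "cdef_dir_8bpc") == 0);
        assert(wildstrcmp("cdef_dir_8bpc", "cdef_dir_*") == 0);
    }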

/* Perform tests and benchmarks for the specified
 * cpu flag if supported by the host */
static void check_cpu_flag(const char *const name, unsigned flag) {

@ -501,7 +516,7 @@ static void check_cpu_flag(const char *const name, unsigned flag) {
if (!flag || state.cpu_flag != old_cpu_flag) {
state.cpu_flag_name = name;
for (int i = 0; tests[i].func; i++) {
if (state.test_name && strcmp(tests[i].name, state.test_name))
if (state.test_pattern && wildstrcmp(tests[i].name, state.test_pattern))
continue;
xor128_srand(state.seed);
state.current_test_name = tests[i].name;

@ -536,33 +551,40 @@ int main(int argc, char *argv[]) {
state.seed = get_seed();

while (argc > 1) {
if (!strncmp(argv[1], "--help", 6)) {
if (!strncmp(argv[1], "--help", 6) || !strcmp(argv[1], "-h")) {
fprintf(stderr,
"checkasm [options] <random seed>\n"
" <random seed> Numeric value to seed the rng\n"
" <random seed> Numeric value to seed the rng\n"
"Options:\n"
" --test=<test_name> Test only <test_name>\n"
" --bench=<pattern> Test and benchmark the functions matching <pattern>\n"
" --list-functions List available functions\n"
" --list-tests List available tests\n"
" --bench-c Benchmark the C-only functions\n"
" --verbose -v Print failures verbosely\n");
" --test=<pattern> Test only <pattern>\n"
" --function=<pattern> -f Test only the functions matching <pattern>\n"
" --bench -b Benchmark the tested functions\n"
" --list-functions List available functions\n"
" --list-tests List available tests\n"
" --bench-c -c Benchmark the C-only functions\n"
" --verbose -v Print failures verbosely\n");
return 0;
} else if (!strncmp(argv[1], "--bench-c", 9)) {
} else if (!strcmp(argv[1], "--bench-c") || !strcmp(argv[1], "-c")) {
state.bench_c = 1;
} else if (!strncmp(argv[1], "--bench", 7)) {
} else if (!strcmp(argv[1], "--bench") || !strcmp(argv[1], "-b")) {
#ifndef readtime
fprintf(stderr,
"checkasm: --bench is not supported on your system\n");
return 1;
#endif
if (argv[1][7] == '=') {
state.bench_pattern = argv[1] + 8;
state.bench_pattern_len = strlen(state.bench_pattern);
} else
state.bench_pattern = "";
state.bench = 1;
} else if (!strncmp(argv[1], "--test=", 7)) {
state.test_name = argv[1] + 7;
state.test_pattern = argv[1] + 7;
} else if (!strcmp(argv[1], "-t")) {
state.test_pattern = argc > 1 ? argv[2] : "";
argc--;
argv++;
} else if (!strncmp(argv[1], "--function=", 11)) {
state.function_pattern = argv[1] + 11;
} else if (!strcmp(argv[1], "-f")) {
state.function_pattern = argc > 1 ? argv[2] : "";
argc--;
argv++;
} else if (!strcmp(argv[1], "--list-functions")) {
state.function_listing = 1;
} else if (!strcmp(argv[1], "--list-tests")) {

@ -602,7 +624,7 @@ int main(int argc, char *argv[]) {
#endif

#ifdef readtime
if (state.bench_pattern) {
if (state.bench) {
static int testing = 0;
checkasm_save_context();
if (!testing) {

@ -658,7 +680,7 @@ int main(int argc, char *argv[]) {
} else {
fprintf(stderr, "checkasm: all %d tests passed\n", state.num_checked);
#ifdef readtime
if (state.bench_pattern) {
if (state.bench) {
state.nop_time = measure_nop_time();
printf("nop: %d.%d\n", state.nop_time/10, state.nop_time%10);
print_benchs(state.funcs);

@ -682,8 +704,11 @@ void *checkasm_check_func(void *const func, const char *const name, ...) {
const int name_length = vsnprintf(name_buf, sizeof(name_buf), name, arg);
va_end(arg);

if (!func || name_length <= 0 || (size_t)name_length >= sizeof(name_buf))
if (!func || name_length <= 0 || (size_t)name_length >= sizeof(name_buf) ||
(state.function_pattern && wildstrcmp(name_buf, state.function_pattern)))
{
return NULL;
}

state.current_func = get_func(&state.funcs, name_buf);

@ -724,9 +749,7 @@ void *checkasm_check_func(void *const func, const char *const name, ...) {

/* Decide whether or not the current function needs to be benchmarked */
int checkasm_bench_func(void) {
return !state.num_failed && state.bench_pattern &&
!strncmp(state.current_func->name, state.bench_pattern,
state.bench_pattern_len);
return !state.num_failed && state.bench;
}

/* Indicate that the current test has failed, return whether verbose printing

@ -185,17 +185,6 @@ static inline uint64_t readtime(void) {
void checkasm_checked_call(void *func, ...);

#if ARCH_X86_64
/* Evil hack: detect incorrect assumptions that 32-bit ints are zero-extended
 * to 64-bit. This is done by clobbering the stack with junk around the stack
 * pointer and calling the assembly function through checked_call() with added
 * dummy arguments which forces all real arguments to be passed on the stack
 * and not in registers. For 32-bit arguments the upper half of the 64-bit
 * register locations on the stack will now contain junk which will cause
 * misbehaving functions to either produce incorrect output or segfault. Note
 * that even though this works extremely well in practice, it's technically
 * not guaranteed and false negatives is theoretically possible, but there
 * can never be any false positives. */
void checkasm_stack_clobber(uint64_t clobber, ...);
/* YMM and ZMM registers on x86 are turned off to save power when they haven't
 * been used for some period of time. When they are used there will be a
 * "warmup" period during which performance will be reduced and inconsistent

@ -203,24 +192,54 @@ void checkasm_stack_clobber(uint64_t clobber, ...);
 * work around this by periodically issuing "dummy" instructions that uses
 * those registers to keep them powered on. */
void checkasm_simd_warmup(void);
#define declare_new(ret, ...)\
ret (*checked_call)(void *, int, int, int, int, int, __VA_ARGS__,\
int, int, int, int, int, int, int, int,\
int, int, int, int, int, int, int) =\
(void *)checkasm_checked_call;
#define CLOB (UINT64_C(0xdeadbeefdeadbeef))

/* The upper 32 bits of 32-bit data types are undefined when passed as function
 * parameters. In practice those bits usually end up being zero which may hide
 * certain bugs, such as using a register containing undefined bits as a pointer
 * offset, so we want to intentionally clobber those bits with junk to expose
 * any issues. The following set of macros automatically calculates a bitmask
 * specifying which parameters should have their upper halves clobbered. */
#ifdef _WIN32
#define STACKARGS 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0
/* Integer and floating-point parameters share "register slots". */
#define IGNORED_FP_ARGS 0
#else
#define STACKARGS 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0
/* Up to 8 floating-point parameters are passed in XMM registers, which are
 * handled orthogonally from integer parameters passed in GPR registers. */
#define IGNORED_FP_ARGS 8
#endif
#ifdef HAVE_C11_GENERIC
#define clobber_type(arg) _Generic((void (*)(void*, arg))NULL,\
void (*)(void*, int32_t ): clobber_mask |= 1 << mpos++,\
void (*)(void*, uint32_t): clobber_mask |= 1 << mpos++,\
void (*)(void*, float ): mpos += (fp_args++ >= IGNORED_FP_ARGS),\
void (*)(void*, double ): mpos += (fp_args++ >= IGNORED_FP_ARGS),\
default: mpos++)
#define init_clobber_mask(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, ...)\
unsigned clobber_mask = 0;\
{\
int mpos = 0, fp_args = 0;\
clobber_type(a); clobber_type(b); clobber_type(c); clobber_type(d);\
clobber_type(e); clobber_type(f); clobber_type(g); clobber_type(h);\
clobber_type(i); clobber_type(j); clobber_type(k); clobber_type(l);\
clobber_type(m); clobber_type(n); clobber_type(o); clobber_type(p);\
}
#else
/* Skip parameter clobbering on compilers without support for _Generic() */
#define init_clobber_mask(...) unsigned clobber_mask = 0
#endif
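
As a stand-alone illustration of the _Generic dispatch used by clobber_type()/init_clobber_mask() above, the sketch below classifies a hypothetical four-parameter prototype into such a bitmask. The prototype and names are invented for the example, and the simplified macro ignores the floating-point special case handled above.

    /* Hedged sketch, not part of the patch: set one bit per parameter slot
     * that holds a 32-bit integer. On typical 64-bit ABIs only `w` and `h`
     * below are int32_t/uint32_t, so the printed mask is 0xc. */
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    #define IS_INT32(T) _Generic((void (*)(void *, T))NULL, \
        void (*)(void *, int32_t):  1,                      \
        void (*)(void *, uint32_t): 1,                      \
        default:                    0)

    int main(void)
    {
        unsigned mask = 0, pos = 0;
        mask |= IS_INT32(void *)    << pos++; /* dst pointer -> left alone */
        mask |= IS_INT32(ptrdiff_t) << pos++; /* stride      -> left alone */
        mask |= IS_INT32(int)       << pos++; /* width       -> clobbered  */
        mask |= IS_INT32(int)       << pos++; /* height      -> clobbered  */
        printf("clobber_mask = 0x%x\n", mask);
        return 0;
    }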

#define declare_new(ret, ...)\
ret (*checked_call)(__VA_ARGS__, int, int, int, int, int, int, int,\
int, int, int, int, int, int, int, int, int,\
void*, unsigned) =\
(void*)checkasm_checked_call;\
init_clobber_mask(__VA_ARGS__, void*, void*, void*, void*,\
void*, void*, void*, void*, void*, void*,\
void*, void*, void*, void*, void*);
#define call_new(...)\
(checkasm_set_signal_handler_state(1),\
checkasm_simd_warmup(),\
checkasm_stack_clobber(CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB),\
checked_call(func_new, 0, 0, 0, 0, 0, __VA_ARGS__, STACKARGS));\
checked_call(__VA_ARGS__, 16, 15, 14, 13, 12, 11, 10, 9, 8,\
7, 6, 5, 4, 3, 2, 1, func_new, clobber_mask));\
checkasm_set_signal_handler_state(0)
#elif ARCH_X86_32
#define declare_new(ret, ...)\

@ -55,6 +55,7 @@ n14: dq 0x249214109d5d1c88
%endif

errmsg_stack: db "stack corruption", 0
errmsg_register: db "failed to preserve register:%s", 0
errmsg_vzeroupper: db "missing vzeroupper", 0

SECTION .bss

@ -151,56 +152,44 @@ cglobal init_x86, 0, 5
RET

%if ARCH_X86_64
;-----------------------------------------------------------------------------
; int checkasm_stack_clobber(uint64_t clobber, ...)
;-----------------------------------------------------------------------------
cglobal stack_clobber, 1, 2
; Clobber the stack with junk below the stack pointer
%define argsize (max_args+6)*8
SUB rsp, argsize
mov r1, argsize-8
.loop:
mov [rsp+r1], r0
sub r1, 8
jge .loop
ADD rsp, argsize
RET

%if WIN64
%assign free_regs 7
%define stack_param rsp+32 ; shadow space
%define num_stack_params rsp+stack_offset+22*8
%define num_fn_args rsp+stack_offset+17*8
%assign num_reg_args 4
%assign free_regs 7
%assign clobber_mask_stack_bit 16
DECLARE_REG_TMP 4
%else
%assign free_regs 9
%define stack_param rsp
%define num_stack_params rsp+stack_offset+16*8
%define num_fn_args rsp+stack_offset+11*8
%assign num_reg_args 6
%assign free_regs 9
%assign clobber_mask_stack_bit 64
DECLARE_REG_TMP 7
%endif

;-----------------------------------------------------------------------------
; void checkasm_checked_call(void *func, ...)
;-----------------------------------------------------------------------------
%macro CLOBBER_UPPER 2 ; reg, mask_bit
mov r13d, %1d
or r13, r8
test r9b, %2
cmovnz %1, r13
%endmacro
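
For readers less used to x86inc macros, here is a hedged C model of what one CLOBBER_UPPER expansion does to a single argument slot; the helper name is mine, and only the junk constant and the mask test come from the assembly above.

    /* Illustration only, not part of the patch: when the clobber mask flags
     * this slot as a 32-bit integer, keep the low 32 bits of the argument
     * and overwrite the upper 32 bits with a recognizable junk pattern,
     * mirroring `mov r13d, %1d` / `or r13, r8` / `cmovnz %1, r13`. */
    #include <stdint.h>

    static uint64_t clobber_upper(uint64_t arg, unsigned clobber_mask,
                                  unsigned mask_bit)
    {
        const uint64_t junk = 0xdeadbeef00000000ULL; /* same value as r8 */
        if (clobber_mask & mask_bit)
            arg = (uint32_t)arg | junk;
        return arg;
    }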

cglobal checked_call, 2, 15, 16, max_args*8+64+8
mov t0, r0
mov r10d, [num_fn_args]
mov r8, 0xdeadbeef00000000
mov r9d, [num_fn_args+r10*8+8] ; clobber_mask
mov t0, [num_fn_args+r10*8] ; func

; All arguments have been pushed on the stack instead of registers in
; order to test for incorrect assumptions that 32-bit ints are
; zero-extended to 64-bit.
mov r0, r6mp
mov r1, r7mp
mov r2, r8mp
mov r3, r9mp
; Clobber the upper halves of 32-bit parameters
CLOBBER_UPPER r0, 1
CLOBBER_UPPER r1, 2
CLOBBER_UPPER r2, 4
CLOBBER_UPPER r3, 8
%if UNIX64
mov r4, r10mp
mov r5, r11mp
CLOBBER_UPPER r4, 16
CLOBBER_UPPER r5, 32
%else ; WIN64
; Move possible floating-point arguments to the correct registers
movq m0, r0
movq m1, r1
movq m2, r2
movq m3, r3

%assign i 6
%rep 16-6
mova m %+ i, [x %+ i]

@ -208,22 +197,29 @@ cglobal checked_call, 2, 15, 16, max_args*8+64+8
%endrep
%endif

xor r11d, r11d
sub r10d, num_reg_args
cmovs r10d, r11d ; num stack args

; write stack canaries to the area above parameters passed on the stack
mov r9d, [num_stack_params]
mov r8, [rsp+stack_offset] ; return address
not r8
mov r12, [rsp+stack_offset] ; return address
not r12
%assign i 0
%rep 8 ; 64 bytes
mov [stack_param+(r9+i)*8], r8
mov [stack_param+(r10+i)*8], r12
%assign i i+1
%endrep
dec r9d
jl .stack_setup_done ; no stack parameters

test r10d, r10d
jz .stack_setup_done ; no stack parameters
.copy_stack_parameter:
mov r8, [stack_param+stack_offset+7*8+r9*8]
mov [stack_param+r9*8], r8
dec r9d
jge .copy_stack_parameter
mov r12, [stack_param+stack_offset+8+r11*8]
CLOBBER_UPPER r12, clobber_mask_stack_bit
shr r9d, 1
mov [stack_param+r11*8], r12
inc r11d
cmp r11d, r10d
jl .copy_stack_parameter
.stack_setup_done:

%assign i 14

@ -234,7 +230,11 @@ cglobal checked_call, 2, 15, 16, max_args*8+64+8
call t0

; check for stack corruption
mov r0d, [num_stack_params]
mov r0d, [num_fn_args]
xor r3d, r3d
sub r0d, num_reg_args
cmovs r0d, r3d ; num stack args

mov r3, [rsp+stack_offset]
mov r4, [stack_param+r0*8]
not r3

@ -247,27 +247,32 @@ cglobal checked_call, 2, 15, 16, max_args*8+64+8
%assign i i+1
%endrep
xor r3, [stack_param+(r0+7)*8]
lea r0, [errmsg_stack]
or r4, r3
jnz .save_retval_and_fail
jz .stack_ok
; Save the return value located in rdx:rax first to prevent clobbering.
mov r10, rax
mov r11, rdx
lea r0, [errmsg_stack]
jmp .fail
.stack_ok:

; check for failure to preserve registers
%assign i 14
%rep 15-free_regs
cmp r %+ i, [r0-errmsg_stack+n %+ i]
cmp r %+ i, [n %+ i]
setne r4b
lea r3d, [r4+r3*2]
%assign i i-1
%endrep
%if WIN64
lea r0, [rsp+60] ; account for shadow space
lea r0, [rsp+32] ; account for shadow space
mov r5, r0
test r3d, r3d
jz .gpr_ok
%else
test r3d, r3d
jz .gpr_xmm_ok
lea r0, [rsp+28]
mov r0, rsp
%endif
%assign i free_regs
%rep 15-free_regs

@ -324,22 +329,15 @@ cglobal checked_call, 2, 15, 16, max_args*8+64+8
cmp r0, r5
je .gpr_xmm_ok
mov byte [r0], 0
lea r0, [r5-28]
mov r11, rdx
mov r1, r5
%else
mov byte [r0], 0
mov r0, rsp
%endif
mov dword [r0+ 0], "fail"
mov dword [r0+ 4], "ed t"
mov dword [r0+ 8], "o pr"
mov dword [r0+12], "eser"
mov dword [r0+16], "ve r"
mov dword [r0+20], "egis"
mov dword [r0+24], "ter:"
.save_retval_and_fail:
; Save the return value located in rdx:rax first to prevent clobbering.
mov r10, rax
mov r11, rdx
mov r1, rsp
%endif
mov r10, rax
lea r0, [errmsg_register]
jmp .fail
.gpr_xmm_ok:
; Check for dirty YMM state, i.e. missing vzeroupper

@ -420,25 +418,19 @@ cglobal checked_call, 1, 7
test r3, r3
jz .gpr_ok
lea r1, [esp+16]
mov dword [r1+ 0], "fail"
mov dword [r1+ 4], "ed t"
mov dword [r1+ 8], "o pr"
mov dword [r1+12], "eser"
mov dword [r1+16], "ve r"
mov dword [r1+20], "egis"
mov dword [r1+24], "ter:"
lea r4, [r1+28]
mov [esp+4], r1
%assign i 3
%rep 4
mov dword [r4], " r0" + (i << 16)
lea r5, [r4+3]
mov dword [r1], " r0" + (i << 16)
lea r4, [r1+3]
test r3, 1 << ((6 - i) * 8)
cmovnz r4, r5
cmovnz r1, r4
%assign i i+1
%endrep
mov byte [r4], 0
mov byte [r1], 0
mov r5, eax
mov r6, edx
LEA r1, errmsg_register
jmp .fail
.gpr_ok:
; check for stack corruption