Bug 1734058 - Update dav1d to new version f52aee04fbd711cddab23d0aa9b196e9c963e7b8 from 2021-10-04 21:58:36. r=mjf,haik

This is a fairly significant update, so it required a few changes to Gecko code, but I've commented on the interesting details, so they should be easy to find.

Differential Revision: https://phabricator.services.mozilla.com/D129465
This commit is contained in:
Jon Bauman 2021-10-26 17:11:36 +00:00
Parent 5ac2b54c29
Commit 874adf9b96
101 changed files: 38970 additions, 7530 deletions


@ -37,13 +37,8 @@ RefPtr<MediaDataDecoder::InitPromise> DAV1DDecoder::Init() {
} else if (mInfo.mDisplay.width >= 1024) {
decoder_threads = 4;
}
settings.n_frame_threads =
settings.n_threads =
static_cast<int>(std::min(decoder_threads, GetNumberOfProcessors()));
// There is not much improvement with more than 2 tile threads at least with
// the content being currently served. The ideal number of tile thread would
// much the tile count of the content. Maybe dav1d can help to do that in the
// future.
settings.n_tile_threads = 2;
int res = dav1d_open(&mContext, &settings);
if (res < 0) {
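For reference, a minimal sketch of the single-pool configuration the hunk above switches to (the `OpenWithUnifiedThreadPool` wrapper and its parameters are hypothetical; the settings fields and dav1d calls are the ones shown in the diff). dav1d now sizes frame, tile, and postfilter work from one `n_threads` value, which is why the old tile-thread tuning comment goes away:

```cpp
#include <algorithm>
#include "dav1d/dav1d.h"

// Hypothetical helper; decoder_threads corresponds to the width-based
// heuristic above and aProcessors to GetNumberOfProcessors().
static int OpenWithUnifiedThreadPool(Dav1dContext** aContext,
                                     int aDecoderThreads, int aProcessors) {
  Dav1dSettings settings;
  dav1d_default_settings(&settings);
  // One knob instead of n_frame_threads/n_tile_threads; 0 would mean "auto".
  settings.n_threads = std::min(aDecoderThreads, aProcessors);
  return dav1d_open(aContext, &settings);  // < 0 is a negative DAV1D_ERR code
}
```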


@ -581,6 +581,7 @@ class Dav1dDecoder final : AVIFDecoderInterface {
Dav1dSettings settings;
dav1d_default_settings(&settings);
settings.all_layers = 0;
settings.max_frame_delay = 1;
// TODO: tune settings a la DAV1DDecoder for AV1 (Bug 1681816)
return dav1d_open(&mContext, &settings);
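A hedged sketch of the still-image (AVIF) variant of the same setup; the function name is made up, but `all_layers` and `max_frame_delay` match the hunk above. Since an AVIF image contributes a single frame, `max_frame_delay = 1` selects low-latency decoding and keeps dav1d from buffering frames it will never be asked for:

```cpp
#include "dav1d/dav1d.h"

// Hypothetical helper mirroring the AVIF Dav1dDecoder settings above.
static int OpenStillImageContext(Dav1dContext** aContext) {
  Dav1dSettings settings;
  dav1d_default_settings(&settings);
  settings.all_layers = 0;       // only the best spatial layer of a scalable stream
  settings.max_frame_delay = 1;  // low-latency: a still image has exactly one frame
  return dav1d_open(aContext, &settings);
}
```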


@ -71,6 +71,7 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
SOURCES += [
'../../../third_party/dav1d/src/x86/cpu.c',
'../../../third_party/dav1d/src/x86/msac_init.c',
'../../../third_party/dav1d/src/x86/refmvs_init.c',
]
EXPORTS.dav1d += [
@ -88,12 +89,10 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
'../../../third_party/dav1d/src/x86/cdef_avx512.asm',
'../../../third_party/dav1d/src/x86/film_grain16_avx2.asm',
'../../../third_party/dav1d/src/x86/film_grain_avx2.asm',
'../../../third_party/dav1d/src/x86/ipred16_avx2.asm',
'../../../third_party/dav1d/src/x86/ipred_avx2.asm',
'../../../third_party/dav1d/src/x86/itx16_avx2.asm',
'../../../third_party/dav1d/src/x86/itx_avx2.asm',
'../../../third_party/dav1d/src/x86/loopfilter16_avx2.asm',
'../../../third_party/dav1d/src/x86/loopfilter16_sse.asm',
'../../../third_party/dav1d/src/x86/loopfilter_avx2.asm',
'../../../third_party/dav1d/src/x86/looprestoration16_avx2.asm',
'../../../third_party/dav1d/src/x86/looprestoration_avx2.asm',
@ -106,16 +105,21 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
'../../../third_party/dav1d/src/x86/cdef16_sse.asm',
'../../../third_party/dav1d/src/x86/cdef_sse.asm',
'../../../third_party/dav1d/src/x86/cpuid.asm',
'../../../third_party/dav1d/src/x86/film_grain16_sse.asm',
'../../../third_party/dav1d/src/x86/film_grain_sse.asm',
'../../../third_party/dav1d/src/x86/ipred16_avx2.asm',
'../../../third_party/dav1d/src/x86/ipred16_sse.asm',
'../../../third_party/dav1d/src/x86/ipred_sse.asm',
'../../../third_party/dav1d/src/x86/itx16_sse.asm',
'../../../third_party/dav1d/src/x86/itx_sse.asm',
'../../../third_party/dav1d/src/x86/loopfilter16_sse.asm',
'../../../third_party/dav1d/src/x86/loopfilter_sse.asm',
'../../../third_party/dav1d/src/x86/looprestoration16_sse.asm', # moved from autovendored
'../../../third_party/dav1d/src/x86/looprestoration_sse.asm',
'../../../third_party/dav1d/src/x86/mc16_sse.asm',
'../../../third_party/dav1d/src/x86/mc_sse.asm',
'../../../third_party/dav1d/src/x86/msac.asm',
'../../../third_party/dav1d/src/x86/refmvs.asm',
]
# BITDEPTH
@ -148,6 +152,7 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64':
SOURCES += [
'../../../third_party/dav1d/src/arm/cpu.c',
'../../../third_party/dav1d/src/arm/refmvs_init.c',
]
EXPORTS += [
'../../../third_party/dav1d/src/arm/asm-offsets.h',
@ -203,6 +208,7 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64':
'../../../third_party/dav1d/src/arm/64/mc.S',
'../../../third_party/dav1d/src/arm/64/mc16.S',
'../../../third_party/dav1d/src/arm/64/msac.S',
'../../../third_party/dav1d/src/arm/64/refmvs.S',
]
elif CONFIG['CPU_ARCH'] == 'arm':
SOURCES += [
@ -224,6 +230,7 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64':
'../../../third_party/dav1d/src/arm/32/mc.S',
'../../../third_party/dav1d/src/arm/32/mc16.S',
'../../../third_party/dav1d/src/arm/32/msac.S',
'../../../third_party/dav1d/src/arm/32/refmvs.S',
]
if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):


@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: commit ddbbfde198aced0d02ea739c320d754d43406f7b (2021-06-12T07:58:29.000+00:00).
release: commit f52aee04fbd711cddab23d0aa9b196e9c963e7b8 (2021-10-04T21:58:36.000+00:00).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: ddbbfde198aced0d02ea739c320d754d43406f7b
revision: f52aee04fbd711cddab23d0aa9b196e9c963e7b8
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/


@ -1,2 +1,2 @@
/* auto-generated, do not edit */
#define DAV1D_VERSION "0.9.0-24-gddbbfde"
#define DAV1D_VERSION "f52aee04fbd711cddab23d0aa9b196e9c963e7b8"


@ -122,6 +122,7 @@ static const char SandboxPolicyContent[] = R"SANDBOX_LITERAL(
(sysctl-name "hw.activecpu")
(sysctl-name "hw.byteorder")
(sysctl-name "hw.pagesize_compat")
(sysctl-name "hw.logicalcpu")
(sysctl-name "hw.logicalcpu_max")
(sysctl-name "hw.physicalcpu_max")
(sysctl-name "hw.busfrequency_compat")

third_party/dav1d/NEWS (vendored, 26 lines changed)

@ -1,3 +1,29 @@
Changes for 0.9.2 'Golden Eagle':
---------------------------------
0.9.2 is a small update of dav1d on the 0.9.x branch:
- x86: SSE4 optimizations of inverse transforms for 10bit for all sizes
- x86: mc.resize optimizations with AVX2/SSSE3 for 10/12b
- x86: SSSE3 optimizations for cdef_filter in 10/12b and mc_w_mask_422/444 in 8b
- ARM NEON optimizations for FilmGrain Gen_grain functions
- Optimizations for splat_mv in SSE2/AVX2 and NEON
- x86: SGR improvements for SSSE3 CPUs
- x86: AVX2 optimizations for cfl_ac
Changes for 0.9.1 'Golden Eagle':
---------------------------------
0.9.1 is a middle-size revision of dav1d, adding notably 10b acceleration for SSSE3:
- 10/12b SSSE3 optimizations for mc (avg, w_avg, mask, w_mask, emu_edge),
prep/put_bilin, prep/put_8tap, ipred (dc/h/v, paeth, smooth, pal, filter), wiener,
sgr (10b), warp8x8, deblock, film_grain, cfl_ac/pred for 32bit and 64bit x86 processors
- Film grain NEON for fguv 10/12b, fgy/fguv 8b and fgy/fguv 10/12 arm32
- Fixes for filmgrain on ARM
- itx 10bit optimizations for 4x4/x8/x16, 8x4/x8/x16 for SSE4
- Misc improvements on SSE2, SSE4
Changes for 0.9.0 'Golden Eagle':
---------------------------------

third_party/dav1d/README.md (vendored, 14 lines changed)

@ -36,16 +36,16 @@ The plan is the following:
7. Make high bit-depth fast on mobile, by writing asm for ARMv8 chips.
8. Make it fast on older mobile, by writing asm for ARMv7 chips,
9. Make high bit-depth fast on older mobile, by writing asm for ARMv7 chips,
10. Make high bit-depth fast on desktop, by writing asm for AVX2 chips,
11. Make high bit-depth fast on older desktop, by writing asm for SSSE3+ chips,
### On-going
10. Improve C code base with [various tweaks](https://code.videolan.org/videolan/dav1d/wikis/task-list),
11. Accelerate for less common architectures, like PPC, SSE2 or AVX-512.
12. Make high bit-depth fast on desktop, by writing asm for AVX2 chips,
12. Improve C code base with [various tweaks](https://code.videolan.org/videolan/dav1d/wikis/task-list),
13. Accelerate for less common architectures, like PPC, SSE2 or AVX-512.
14. Improve threading.
### After
13. Make high bit-depth fast on older desktop, by writing asm for SSSE3+ chips,
14. Use more GPU decoding, when possible.
15. Improve threading.
15. Use more GPU decoding, when possible.
# Contribute
@ -60,7 +60,7 @@ Our contributions guidelines are quite strict. We want to build a coherent codeb
Notably, the codebase is in pure C and asm.
We are on IRC, on the **#dav1d** channel on [*Libera.chat*](http://libera.chat/). If you do not have an IRC Client at hand, use [KiwiIRC Web Interface](https://kiwiirc.com/nextclient/#ircs://irc.libera.chat/#dav1d).
We are on IRC, on the **#dav1d** channel on [*Libera.chat*](http://libera.chat/). If you do not have an IRC Client at hand, use [IRC Web Interface](https://web.libera.chat/#dav1d).
See the [contributions document](CONTRIBUTING.md).

third_party/dav1d/include/common/attributes.h (vendored, 24 lines changed)

@ -33,6 +33,14 @@
#include <stddef.h>
#include <assert.h>
#ifndef __has_attribute
#define __has_attribute(x) 0
#endif
#ifndef __has_feature
#define __has_feature(x) 0
#endif
#ifdef __GNUC__
#define ATTR_ALIAS __attribute__((may_alias))
#define ATTR_FORMAT_PRINTF(fmt, attr) __attribute__((__format__(__printf__, fmt, attr)))
@ -93,9 +101,11 @@
*/
#ifdef _MSC_VER
#define NOINLINE __declspec(noinline)
#else /* !_MSC_VER */
#elif __has_attribute(noclone)
#define NOINLINE __attribute__((noinline, noclone))
#else
#define NOINLINE __attribute__((noinline))
#endif /* !_MSC_VER */
#endif
#ifdef __clang__
#define NO_SANITIZE(x) __attribute__((no_sanitize(x)))
@ -160,10 +170,6 @@ static inline int clzll(const unsigned long long mask) {
}
#endif /* !_MSC_VER */
#ifndef __has_feature
#define __has_feature(x) 0
#endif
#ifndef static_assert
#define CHECK_OFFSET(type, field, name) \
struct check_##type##_##field { int x[(name == offsetof(type, field)) ? 1 : -1]; }
@ -172,4 +178,10 @@ static inline int clzll(const unsigned long long mask) {
static_assert(name == offsetof(type, field), #field)
#endif
#ifdef _MSC_VER
#define PACKED(...) __pragma(pack(push, 1)) __VA_ARGS__ __pragma(pack(pop))
#else
#define PACKED(...) __VA_ARGS__ __attribute__((__packed__))
#endif
#endif /* DAV1D_COMMON_ATTRIBUTES_H */
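A small illustration of the new PACKED() helper added above (the struct is a made-up example, not a dav1d type): on MSVC the macro brackets the declaration with pack(push,1)/pack(pop) pragmas, elsewhere it appends __attribute__((__packed__)), so either expansion yields the padding-free 5-byte layout checked below.

```cpp
#include <stdint.h>

// Macro copied verbatim from the attributes.h hunk above.
#ifdef _MSC_VER
#define PACKED(...) __pragma(pack(push, 1)) __VA_ARGS__ __pragma(pack(pop))
#else
#define PACKED(...) __VA_ARGS__ __attribute__((__packed__))
#endif

// Hypothetical example type: 1 + 4 bytes, no padding with either expansion.
PACKED(struct ExamplePacked {
  uint8_t tag;
  uint32_t value;
});

static_assert(sizeof(ExamplePacked) == 5, "PACKED removed the padding");
```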


@ -41,6 +41,8 @@ typedef unsigned int atomic_uint;
#define atomic_load_explicit(p_a, mo) __atomic_load_n(p_a, mo)
#define atomic_fetch_add(p_a, inc) __atomic_fetch_add(p_a, inc, __ATOMIC_SEQ_CST)
#define atomic_fetch_sub(p_a, dec) __atomic_fetch_sub(p_a, dec, __ATOMIC_SEQ_CST)
#define atomic_exchange(p_a, v) __atomic_exchange_n(p_a, v, __ATOMIC_SEQ_CST)
#define atomic_fetch_or(p_a, v) __atomic_fetch_or(p_a, v, __ATOMIC_SEQ_CST)
#endif /* !defined(__cplusplus) */


@ -41,8 +41,8 @@
#include "common/attributes.h"
typedef volatile LONG __declspec(align(32)) atomic_int;
typedef volatile ULONG __declspec(align(32)) atomic_uint;
typedef volatile LONG atomic_int;
typedef volatile ULONG atomic_uint;
typedef enum {
memory_order_relaxed,
@ -52,6 +52,7 @@ typedef enum {
#define atomic_init(p_a, v) do { *(p_a) = (v); } while(0)
#define atomic_store(p_a, v) InterlockedExchange((LONG*)p_a, v)
#define atomic_load(p_a) InterlockedCompareExchange((LONG*)p_a, 0, 0)
#define atomic_exchange(p_a, v) InterlockedExchange(p_a, v)
#define atomic_load_explicit(p_a, mo) atomic_load(p_a)
/*
@ -60,6 +61,7 @@ typedef enum {
*/
#define atomic_fetch_add(p_a, inc) InterlockedExchangeAdd(p_a, inc)
#define atomic_fetch_sub(p_a, dec) InterlockedExchangeAdd(p_a, -(dec))
#define atomic_fetch_or(p_a, v) InterlockedOr(p_a, v)
#endif /* ! stdatomic.h */

third_party/dav1d/include/dav1d/dav1d.h (vendored, 19 lines changed)

@ -43,9 +43,8 @@ extern "C" {
typedef struct Dav1dContext Dav1dContext;
typedef struct Dav1dRef Dav1dRef;
#define DAV1D_MAX_FRAME_THREADS 256
#define DAV1D_MAX_TILE_THREADS 64
#define DAV1D_MAX_POSTFILTER_THREADS 256
#define DAV1D_MAX_THREADS 256
#define DAV1D_MAX_FRAME_DELAY 256
typedef struct Dav1dLogger {
void *cookie; ///< Custom data to pass to the callback.
@ -60,16 +59,15 @@ typedef struct Dav1dLogger {
} Dav1dLogger;
typedef struct Dav1dSettings {
int n_frame_threads;
int n_tile_threads;
int n_threads; ///< number of threads (0 = auto)
int max_frame_delay; ///< Set to 1 for low-latency decoding (0 = auto)
int apply_grain;
int operating_point; ///< select an operating point for scalable AV1 bitstreams (0 - 31)
int all_layers; ///< output all spatial layers of a scalable AV1 biststream
unsigned frame_size_limit; ///< maximum frame size, in pixels (0 = unlimited)
Dav1dPicAllocator allocator; ///< Picture allocator callback.
Dav1dLogger logger; ///< Logger callback.
int n_postfilter_threads;
uint8_t reserved[28]; ///< reserved for future use
uint8_t reserved[32]; ///< reserved for future use
} Dav1dSettings;
/**
@ -105,7 +103,12 @@ DAV1D_API int dav1d_open(Dav1dContext **c_out, const Dav1dSettings *s);
* @param buf The data to be parser.
* @param sz Size of the data.
*
* @return 0 on success, or < 0 (a negative DAV1D_ERR code) on error.
* @return
* 0: Success, and out is filled with the parsed Sequence Header
* OBU parameters.
* DAV1D_ERR(ENOENT): No Sequence Header OBUs were found in the buffer.
* other negative DAV1D_ERR codes: Invalid data in the buffer, invalid passed-in
* arguments, and other errors during parsing.
*
* @note It is safe to feed this function data containing other OBUs than a
* Sequence Header, as they will simply be ignored. If there is more than
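For reference, a hedged sketch of how a caller can act on the expanded return-value contract documented above. It assumes the 0.9.x signature `dav1d_parse_sequence_header(Dav1dSequenceHeader *out, const uint8_t *buf, size_t sz)`; the `ProbeAv1SequenceHeader` wrapper is hypothetical.

```cpp
#include <errno.h>
#include <stddef.h>
#include <stdint.h>
#include "dav1d/dav1d.h"

// Returns true only if a Sequence Header OBU was found and parsed into aOut.
static bool ProbeAv1SequenceHeader(const uint8_t* aBuf, size_t aSize,
                                   Dav1dSequenceHeader* aOut) {
  const int res = dav1d_parse_sequence_header(aOut, aBuf, aSize);
  if (res == 0)
    return true;                   // aOut holds the parsed OBU parameters
  if (res == DAV1D_ERR(ENOENT))
    return false;                  // no Sequence Header OBU in this buffer
  return false;                    // invalid data or arguments
}
```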

third_party/dav1d/meson.build (vendored, 25 lines changed)

@ -23,14 +23,14 @@
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
project('dav1d', ['c'],
version: '0.9.0',
version: '0.9.2',
default_options: ['c_std=c99',
'warning_level=2',
'buildtype=release',
'b_ndebug=if-release'],
meson_version: '>= 0.49.0')
dav1d_soname_version = '5.1.0'
dav1d_soname_version = '6.0.0'
dav1d_api_version_array = dav1d_soname_version.split('.')
dav1d_api_version_major = dav1d_api_version_array[0]
dav1d_api_version_minor = dav1d_api_version_array[1]
@ -173,16 +173,16 @@ libm_dependency = cc.find_library('m', required: false)
# Header checks
stdatomic_dependency = []
stdatomic_dependencies = []
if not cc.check_header('stdatomic.h')
if cc.get_id() == 'msvc'
# we have a custom replacement for MSVC
stdatomic_dependency = declare_dependency(
stdatomic_dependencies += declare_dependency(
include_directories : include_directories('include/compat/msvc'),
)
elif cc.compiles('''int main() { int v = 0; return __atomic_fetch_add(&v, 1, __ATOMIC_SEQ_CST); }''',
name : 'GCC-style atomics', args : test_args)
stdatomic_dependency = declare_dependency(
stdatomic_dependencies += declare_dependency(
include_directories : include_directories('include/compat/gcc'),
)
else
@ -190,6 +190,11 @@ if not cc.check_header('stdatomic.h')
endif
endif
if host_machine.cpu_family().startswith('wasm')
# enable atomics + bulk-memory features
stdatomic_dependencies += thread_dependency.partial_dependency(compile_args: true)
endif
if cc.check_header('unistd.h')
cdata.set('HAVE_UNISTD_H', 1)
endif
@ -247,6 +252,7 @@ if cc.get_argument_syntax() != 'msvc'
'-Wno-maybe-uninitialized',
'-Wno-missing-field-initializers',
'-Wno-unused-parameter',
'-Wstrict-prototypes',
'-Werror=missing-prototypes',
'-Wshorten-64-to-32',
]
@ -369,11 +375,18 @@ if host_machine.cpu_family().startswith('x86')
cdata_asm.set10('ARCH_X86_64', host_machine.cpu_family() == 'x86_64')
cdata_asm.set10('ARCH_X86_32', host_machine.cpu_family() == 'x86')
cdata_asm.set10('PIC', true)
# Convert SSE asm into (128-bit) AVX when compiler flags are set to use AVX instructions
cdata_asm.set10('FORCE_VEX_ENCODING', cc.get_define('__AVX__') != '')
endif
cdata.set10('ARCH_PPC64LE', host_machine.cpu() == 'ppc64le')
if cc.symbols_have_underscore_prefix()
# meson's cc.symbols_have_underscore_prefix() is unfortunately unrelieably
# when additional flags like '-fprofile-instr-generate' are passed via CFLAGS
# see following meson issue https://github.com/mesonbuild/meson/issues/5482
if (host_machine.system() == 'darwin' or
(host_machine.system() == 'windows' and host_machine.cpu_family() == 'x86'))
cdata.set10('PREFIX', true)
cdata_asm.set10('PREFIX', true)
endif

third_party/dav1d/src/arm/32/film_grain.S (vendored, 1325 lines changed): diff not shown due to its size

third_party/dav1d/src/arm/32/film_grain16.S (vendored, 1188 lines changed): diff not shown due to its size

third_party/dav1d/src/arm/32/refmvs.S (vendored, new file, 97 lines)

@ -0,0 +1,97 @@
/*
* Copyright © 2021, VideoLAN and dav1d authors
* Copyright © 2021, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
// void dav1d_splat_mv_neon(refmvs_block **rr, const refmvs_block *rmv,
// int bx4, int bw4, int bh4)
function splat_mv_neon, export=1
push {r4, lr}
vld1.8 {q3}, [r1]
ldr r4, [sp, #8]
clz r3, r3
adr lr, L(splat_tbl)
sub r3, r3, #26
vext.8 q2, q3, q3, #12
ldr r3, [lr, r3, lsl #2]
add r2, r2, r2, lsl #1
vext.8 q0, q2, q3, #4
add r3, lr, r3
vext.8 q1, q2, q3, #8
lsl r2, r2, #2
vext.8 q2, q2, q3, #12
vmov q3, q0
1:
ldr r1, [r0], #4
subs r4, r4, #1
add r1, r1, r2
bx r3
.align 2
L(splat_tbl):
.word 320f - L(splat_tbl) + CONFIG_THUMB
.word 160f - L(splat_tbl) + CONFIG_THUMB
.word 80f - L(splat_tbl) + CONFIG_THUMB
.word 40f - L(splat_tbl) + CONFIG_THUMB
.word 20f - L(splat_tbl) + CONFIG_THUMB
.word 10f - L(splat_tbl) + CONFIG_THUMB
10:
vst1.8 {d0}, [r1]
vstr s2, [r1, #8]
bgt 1b
pop {r4, pc}
20:
vst1.8 {q0}, [r1]
vstr d2, [r1, #16]
bgt 1b
pop {r4, pc}
40:
vst1.8 {q0, q1}, [r1]!
vst1.8 {q2}, [r1]
bgt 1b
pop {r4, pc}
320:
vst1.8 {q0, q1}, [r1]!
vst1.8 {q2, q3}, [r1]!
vst1.8 {q1, q2}, [r1]!
vst1.8 {q0, q1}, [r1]!
vst1.8 {q2, q3}, [r1]!
vst1.8 {q1, q2}, [r1]!
160:
vst1.8 {q0, q1}, [r1]!
vst1.8 {q2, q3}, [r1]!
vst1.8 {q1, q2}, [r1]!
80:
vst1.8 {q0, q1}, [r1]!
vst1.8 {q2, q3}, [r1]!
vst1.8 {q1, q2}, [r1]
bgt 1b
pop {r4, pc}
endfunc

third_party/dav1d/src/arm/64/film_grain.S (vendored, 425 lines changed)

@ -186,32 +186,53 @@ endfunc
add x0, x0, #GRAIN_WIDTH-32
.endm
.macro get_grain_2 dst
function get_grain_2_neon
increment_seed 2
read_rand x14, 11, 1
read_rand x15, 11, 0
add x14, x3, x14, lsl #1
add x15, x3, x15, lsl #1
ld1 {\dst\().h}[0], [x14]
ld1 {\dst\().h}[1], [x15]
srshl v0.4h, \dst\().4h, v31.4h
xtn \dst\().8b, v0.8h
ld1 {v0.h}[0], [x14]
ld1 {v0.h}[1], [x15]
srshl v0.4h, v0.4h, v31.4h
xtn v0.8b, v0.8h
ret
endfunc
.macro get_grain_2 dst
bl get_grain_2_neon
.ifnc \dst, v0
mov \dst\().8b, v0.8b
.endif
.endm
// w15 holds the number of entries to produce
// w14 holds the previous output entry
// w14, w16 and w17 hold the previous output entries
// v0 holds the vector of produced entries
// v1 holds the input vector of sums from above
function output_lag1_neon
.macro output_lag n
function output_lag\n\()_neon
1:
read_shift_rand x13, 11
mov w11, v1.s[0]
ldrsh w12, [x3, x13, lsl #1]
ext v0.16b, v0.16b, v0.16b, #1
madd w14, w14, w4, w11 // sum (above) + *coeff * prev output
add w14, w14, w8 // 1 << (ar_coeff_shift - 1)
.if \n == 1
madd w11, w14, w4, w11 // sum (above) + *coeff * prev output
.elseif \n == 2
madd w11, w16, w4, w11 // sum (above) + *coeff * prev output 1
madd w11, w14, w17, w11 // += *coeff * prev output 2
mov w16, w14
.else
madd w11, w17, w4, w11 // sum (above) + *coeff * prev output 1
madd w11, w16, w20, w11 // sum (above) + *coeff * prev output 2
madd w11, w14, w21, w11 // += *coeff * prev output 3
mov w17, w16
mov w16, w14
.endif
add w14, w11, w8 // 1 << (ar_coeff_shift - 1)
add w12, w12, w10 // 1 << (4 + grain_scale_shift - 1)
asr w14, w14, w7 // >> ar_coeff_shift
add w12, w12, w10
asr w12, w12, w9 // >> (4 + grain_scale_shift)
add w14, w14, w12
cmp w14, w5
@ -224,6 +245,12 @@ function output_lag1_neon
b.gt 1b
ret
endfunc
.endm
output_lag 1
output_lag 2
output_lag 3
function sum_lag1_above_neon
smull v2.8h, v3.8b, v28.8b
@ -243,10 +270,8 @@ function sum_lag1_above_neon
ret
endfunc
.macro sum_lag1_func type, uv_layout, edge, elems=16
function sum_\type\()_lag1_\edge\()_neon
str x30, [sp, #-16]!
bl sum_lag1_above_neon
.macro sum_lag_n_body lag, type, uv_layout, edge, elems, store, uv_coeff
bl sum_\lag\()_above_neon
.ifc \type, uv_420
add x12, x19, #GRAIN_WIDTH
ld1 {v22.16b, v23.16b}, [x19], #32
@ -257,35 +282,41 @@ function sum_\type\()_lag1_\edge\()_neon
saddlp v25.8h, v25.16b
add v22.8h, v22.8h, v24.8h
add v23.8h, v23.8h, v25.8h
rshrn v0.8b, v22.8h, #2
rshrn2 v0.16b, v23.8h, #2
rshrn v0.8b, v22.8h, #2
rshrn2 v0.16b, v23.8h, #2
.endif
.ifc \type, uv_422
ld1 {v22.16b, v23.16b}, [x19], #32
saddlp v22.8h, v22.16b
saddlp v23.8h, v23.16b
rshrn v0.8b, v22.8h, #1
rshrn2 v0.16b, v23.8h, #1
rshrn v0.8b, v22.8h, #1
rshrn2 v0.16b, v23.8h, #1
.endif
.ifc \type, uv_444
ld1 {v0.16b}, [x19], #16
.endif
.if \uv_layout
.ifnb \uv_coeff
dup v1.16b, \uv_coeff
smull v2.8h, v0.8b, v1.8b
smull2 v3.8h, v0.16b, v1.16b
.else
smull v2.8h, v0.8b, v30.8b
smull2 v3.8h, v0.16b, v30.16b
.endif
saddw v4.4s, v4.4s, v2.4h
saddw2 v5.4s, v5.4s, v2.8h
saddw v6.4s, v6.4s, v3.4h
saddw2 v7.4s, v7.4s, v3.8h
.endif
.if \uv_layout && \elems == 16
b sum_lag1_y_\edge\()_start
b sum_\lag\()_y_\edge\()_start
.elseif \uv_layout == 444 && \elems == 15
b sum_lag1_y_\edge\()_start
b sum_\lag\()_y_\edge\()_start
.elseif \uv_layout == 422 && \elems == 9
b sum_lag1_uv_420_\edge\()_start
b sum_\lag\()_uv_420_\edge\()_start
.else
sum_lag1_\type\()_\edge\()_start:
sum_\lag\()_\type\()_\edge\()_start:
.ifc \edge, left
increment_seed 4
read_rand x12, 11, 3
@ -301,28 +332,34 @@ sum_lag1_\type\()_\edge\()_start:
srshl v0.8h, v0.8h, v31.8h
xtn2 v0.16b, v0.8h
ext v4.16b, v4.16b, v4.16b, #12
.ifc \lag, lag3
smov w17, v0.b[13]
.endif
.ifnc \lag, lag1
smov w16, v0.b[14]
.endif
smov w14, v0.b[15]
mov v1.16b, v4.16b
mov w15, #1
bl output_lag1_neon
bl output_\lag\()_neon
.else
increment_seed 4, shift=0
mov v1.16b, v4.16b
mov w15, #4
bl output_lag1_neon
bl output_\lag\()_neon
.endif
increment_seed 4, shift=0
mov v1.16b, v5.16b
mov w15, #4
bl output_lag1_neon
bl output_\lag\()_neon
increment_seed 4, shift=0
mov v1.16b, v6.16b
.if \elems == 9
mov w15, #1
bl output_lag1_neon
bl output_\lag\()_neon
lsr w2, w2, #3
read_rand x12, 11, 2
@ -339,14 +376,14 @@ sum_lag1_\type\()_\edge\()_start:
ext v0.16b, v0.16b, v1.16b, #7
.else
mov w15, #4
bl output_lag1_neon
bl output_\lag\()_neon
increment_seed 4, shift=0
mov v1.16b, v7.16b
.ifc \edge, right
mov w15, #3
bl output_lag1_neon
bl output_\lag\()_neon
read_shift_rand x15, 11
add x15, x3, x15, lsl #1
ld1 {v1.h}[0], [x15]
@ -354,12 +391,21 @@ sum_lag1_\type\()_\edge\()_start:
ext v0.16b, v0.16b, v1.16b, #1
.else
mov w15, #4
bl output_lag1_neon
bl output_\lag\()_neon
.endif
.endif
.if \store
st1 {v0.16b}, [x0], #16
.endif
ldr x30, [sp], #16
ret
.endif
.endm
.macro sum_lag1_func type, uv_layout, edge, elems=16
function sum_\type\()_lag1_\edge\()_neon
str x30, [sp, #-16]!
sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems, store=0
endfunc
.endm
@ -400,34 +446,6 @@ sum_lag1_func uv_420, 420, right, 9
sum_lag1 uv_420, \dst, \left, \mid, \right, \edge
.endm
// w15 holds the number of entries to produce
// w14 and w16 hold the previous output entries
// v0 holds the vector of produced entries
// v1 holds the input vector of sums from above
function output_lag2_neon
1:
read_shift_rand x13, 11
mov w11, v1.s[0]
ldrsh w12, [x3, x13, lsl #1]
ext v0.16b, v0.16b, v0.16b, #1
madd w11, w16, w4, w11 // sum (above) + *coeff * prev output 1
madd w11, w14, w17, w11 // += *coeff * prev output 2
mov w16, w14
add w14, w11, w8 // 1 << (ar_coeff_shift - 1)
asr w14, w14, w7 // >> ar_coeff_shift
add w12, w12, w10
asr w12, w12, w9 // >> (4 + grain_scale_shift)
add w14, w14, w12
cmp w14, w5
csel w14, w14, w5, le
cmp w14, w6
csel w14, w14, w6, ge
subs w15, w15, #1
ext v1.16b, v1.16b, v1.16b, #4
ins v0.b[15], w14
b.gt 1b
ret
endfunc
function sum_lag2_above_neon
sub x12, x0, #2*GRAIN_WIDTH - 16
@ -530,123 +548,7 @@ function sum_\type\()_lag2_\edge\()_neon
ld1 {v17.16b}, [x12] // load the previous block right above
ld1 {v20.16b}, [x13]
.endif
bl sum_lag2_above_neon
.ifc \type, uv_420
add x12, x19, #GRAIN_WIDTH
ld1 {v22.16b, v23.16b}, [x19], #32
ld1 {v24.16b, v25.16b}, [x12]
saddlp v22.8h, v22.16b
saddlp v23.8h, v23.16b
saddlp v24.8h, v24.16b
saddlp v25.8h, v25.16b
add v22.8h, v22.8h, v24.8h
add v23.8h, v23.8h, v25.8h
rshrn v0.8b, v22.8h, #2
rshrn2 v0.16b, v23.8h, #2
.endif
.ifc \type, uv_422
ld1 {v22.16b, v23.16b}, [x19], #32
saddlp v22.8h, v22.16b
saddlp v23.8h, v23.16b
rshrn v0.8b, v22.8h, #1
rshrn2 v0.16b, v23.8h, #1
.endif
.ifc \type, uv_444
ld1 {v0.16b}, [x19], #16
.endif
.if \uv_layout
dup v1.16b, v30.b[12]
smull v2.8h, v0.8b, v1.8b
smull2 v3.8h, v0.16b, v1.16b
saddw v4.4s, v4.4s, v2.4h
saddw2 v5.4s, v5.4s, v2.8h
saddw v6.4s, v6.4s, v3.4h
saddw2 v7.4s, v7.4s, v3.8h
.endif
.if \uv_layout && \elems == 16
b sum_lag2_y_\edge\()_start
.elseif \uv_layout == 444 && \elems == 15
b sum_lag2_y_\edge\()_start
.elseif \uv_layout == 422 && \elems == 9
b sum_lag2_uv_420_\edge\()_start
.else
sum_lag2_\type\()_\edge\()_start:
.ifc \edge, left
increment_seed 4
read_rand x12, 11, 3
read_rand x13, 11, 2
read_rand x14, 11, 1
add x12, x3, x12, lsl #1
add x13, x3, x13, lsl #1
add x14, x3, x14, lsl #1
ld1 {v0.h}[5], [x12]
ld1 {v0.h}[6], [x13]
ld1 {v0.h}[7], [x14]
lsl x2, x2, #1 // shift back the state as if we'd done increment_seed with shift=0
srshl v0.8h, v0.8h, v31.8h
xtn2 v0.16b, v0.8h
ext v4.16b, v4.16b, v4.16b, #12
smov w16, v0.b[14]
smov w14, v0.b[15]
mov v1.16b, v4.16b
mov w15, #1
bl output_lag2_neon
.else
increment_seed 4, shift=0
mov v1.16b, v4.16b
mov w15, #4
bl output_lag2_neon
.endif
increment_seed 4, shift=0
mov v1.16b, v5.16b
mov w15, #4
bl output_lag2_neon
increment_seed 4, shift=0
mov v1.16b, v6.16b
.if \elems == 9
mov w15, #1
bl output_lag2_neon
lsr w2, w2, #3
read_rand x12, 11, 2
read_rand x13, 11, 1
read_rand x14, 11, 0
add x12, x3, x12, lsl #1
add x13, x3, x13, lsl #1
add x14, x3, x14, lsl #1
ld1 {v1.h}[0], [x12]
ld1 {v1.h}[1], [x13]
ld1 {v1.h}[2], [x14]
srshl v1.4h, v1.4h, v31.4h
xtn v1.8b, v1.8h
ext v0.16b, v0.16b, v1.16b, #7
.else
mov w15, #4
bl output_lag2_neon
increment_seed 4, shift=0
mov v1.16b, v7.16b
.ifc \edge, right
mov w15, #3
bl output_lag2_neon
read_shift_rand x15, 11
add x15, x3, x15, lsl #1
ld1 {v1.h}[0], [x15]
srshl v1.4h, v1.4h, v31.4h
ext v0.16b, v0.16b, v1.16b, #1
.else
mov w15, #4
bl output_lag2_neon
.endif
.endif
st1 {v0.16b}, [x0], #16
ldr x30, [sp], #16
ret
.endif
sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[12]
endfunc
.endm
@ -664,37 +566,6 @@ sum_lag2_func uv_420, 420, mid
sum_lag2_func uv_420, 420, right, 9
// w15 holds the number of entries to produce
// w14, w16 and w17 hold the previous output entries
// v0 holds the vector of produced entries
// v1 holds the input vector of sums from above
function output_lag3_neon
1:
read_shift_rand x13, 11
mov w11, v1.s[0]
ldrsh w12, [x3, x13, lsl #1]
ext v0.16b, v0.16b, v0.16b, #1
madd w11, w17, w4, w11 // sum (above) + *coeff * prev output 1
madd w11, w16, w20, w11 // sum (above) + *coeff * prev output 2
madd w11, w14, w21, w11 // += *coeff * prev output 3
mov w17, w16
mov w16, w14
add w14, w11, w8 // 1 << (ar_coeff_shift - 1)
asr w14, w14, w7 // >> ar_coeff_shift
add w12, w12, w10
asr w12, w12, w9 // >> (4 + grain_scale_shift)
add w14, w14, w12
cmp w14, w5
csel w14, w14, w5, le
cmp w14, w6
csel w14, w14, w6, ge
subs w15, w15, #1
ext v1.16b, v1.16b, v1.16b, #4
ins v0.b[15], w14
b.gt 1b
ret
endfunc
function sum_lag3_above_neon
sub x11, x0, #3*GRAIN_WIDTH - 16
sub x12, x0, #2*GRAIN_WIDTH - 16
@ -890,124 +761,7 @@ function sum_\type\()_lag3_\edge\()_neon
ld1 {v17.16b}, [x12]
ld1 {v20.16b}, [x13]
.endif
bl sum_lag3_above_neon
.ifc \type, uv_420
add x12, x19, #GRAIN_WIDTH
ld1 {v22.16b, v23.16b}, [x19], #32
ld1 {v24.16b, v25.16b}, [x12]
saddlp v22.8h, v22.16b
saddlp v23.8h, v23.16b
saddlp v24.8h, v24.16b
saddlp v25.8h, v25.16b
add v22.8h, v22.8h, v24.8h
add v23.8h, v23.8h, v25.8h
rshrn v0.8b, v22.8h, #2
rshrn2 v0.16b, v23.8h, #2
.endif
.ifc \type, uv_422
ld1 {v22.16b, v23.16b}, [x19], #32
saddlp v22.8h, v22.16b
saddlp v23.8h, v23.16b
rshrn v0.8b, v22.8h, #1
rshrn2 v0.16b, v23.8h, #1
.endif
.ifc \type, uv_444
ld1 {v0.16b}, [x19], #16
.endif
.if \uv_layout
dup v1.16b, v30.b[8]
smull v2.8h, v0.8b, v1.8b
smull2 v3.8h, v0.16b, v1.16b
saddw v4.4s, v4.4s, v2.4h
saddw2 v5.4s, v5.4s, v2.8h
saddw v6.4s, v6.4s, v3.4h
saddw2 v7.4s, v7.4s, v3.8h
.endif
.if \uv_layout && \elems == 16
b sum_lag3_y_\edge\()_start
.elseif \uv_layout == 444 && \elems == 15
b sum_lag3_y_\edge\()_start
.elseif \uv_layout == 422 && \elems == 9
b sum_lag3_uv_420_\edge\()_start
.else
sum_lag3_\type\()_\edge\()_start:
.ifc \edge, left
increment_seed 4
read_rand x12, 11, 3
read_rand x13, 11, 2
read_rand x14, 11, 1
add x12, x3, x12, lsl #1
add x13, x3, x13, lsl #1
add x14, x3, x14, lsl #1
ld1 {v0.h}[5], [x12]
ld1 {v0.h}[6], [x13]
ld1 {v0.h}[7], [x14]
lsl x2, x2, #1 // shift back the state as if we'd done increment_seed with shift=0
srshl v0.8h, v0.8h, v31.8h
xtn2 v0.16b, v0.8h
ext v4.16b, v4.16b, v4.16b, #12
smov w17, v0.b[13]
smov w16, v0.b[14]
smov w14, v0.b[15]
mov v1.16b, v4.16b
mov w15, #1
bl output_lag3_neon
.else
increment_seed 4, shift=0
mov v1.16b, v4.16b
mov w15, #4
bl output_lag3_neon
.endif
increment_seed 4, shift=0
mov v1.16b, v5.16b
mov w15, #4
bl output_lag3_neon
increment_seed 4, shift=0
mov v1.16b, v6.16b
.if \elems == 9
mov w15, #1
bl output_lag3_neon
lsr w2, w2, #3
read_rand x12, 11, 2
read_rand x13, 11, 1
read_rand x14, 11, 0
add x12, x3, x12, lsl #1
add x13, x3, x13, lsl #1
add x14, x3, x14, lsl #1
ld1 {v1.h}[0], [x12]
ld1 {v1.h}[1], [x13]
ld1 {v1.h}[2], [x14]
srshl v1.4h, v1.4h, v31.4h
xtn v1.8b, v1.8h
ext v0.16b, v0.16b, v1.16b, #7
.else
mov w15, #4
bl output_lag3_neon
increment_seed 4, shift=0
mov v1.16b, v7.16b
.ifc \edge, right
mov w15, #3
bl output_lag3_neon
read_shift_rand x15, 11
add x15, x3, x15, lsl #1
ld1 {v1.h}[0], [x15]
srshl v1.4h, v1.4h, v31.4h
ext v0.16b, v0.16b, v1.16b, #1
.else
mov w15, #4
bl output_lag3_neon
.endif
.endif
st1 {v0.16b}, [x0], #16
ldr x30, [sp], #16
ret
.endif
sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[8]
endfunc
.endm
@ -1061,7 +815,6 @@ function get_grain_row_44_neon
endfunc
function add_uv_444_coeff_lag0_neon
str x30, [sp, #-16]!
add_coeff_lag0_start:
smull v2.8h, v0.8b, v27.8b
smull2 v3.8h, v0.16b, v27.16b
@ -1071,20 +824,18 @@ add_coeff_lag0_start:
saddw2 v3.8h, v3.8h, v1.16b
sqxtn v2.8b, v2.8h
sqxtn2 v2.16b, v3.8h
ldr x30, [sp], #16
ret
endfunc
function add_uv_420_coeff_lag0_neon
str x30, [sp, #-16]!
ld1 {v4.16b, v5.16b}, [x19], #32
ld1 {v6.16b, v7.16b}, [x12], #32
saddlp v4.8h, v4.16b
saddlp v5.8h, v5.16b
saddlp v6.8h, v6.16b
saddlp v7.8h, v7.16b
add v4.8h, v4.8h, v6.8h
add v5.8h, v5.8h, v7.8h
add v4.8h, v4.8h, v6.8h
add v5.8h, v5.8h, v7.8h
rshrn v4.8b, v4.8h, #2
rshrn2 v4.16b, v5.8h, #2
and v0.16b, v4.16b, v0.16b
@ -1092,7 +843,6 @@ function add_uv_420_coeff_lag0_neon
endfunc
function add_uv_422_coeff_lag0_neon
str x30, [sp, #-16]!
ld1 {v4.16b, v5.16b}, [x19], #32
saddlp v4.8h, v4.16b
saddlp v5.8h, v5.16b
@ -1153,8 +903,6 @@ function generate_grain_\type\()_8bpc_neon, export=1
br x16
ret
L(generate_grain_\type\()_lag0):
.ifc \type, y
mov w1, #GRAIN_HEIGHT
@ -1208,15 +956,15 @@ L(generate_grain_\type\()_lag1):
ld1r {v28.16b}, [x4], #1 // ar_coeffs_y[1]
ld1r {v29.16b}, [x4] // ar_coeffs_y[2]
.ifc \type, y
ldrsb w4, [x4, #1] // ar_coeffs_y[4]
ldrsb w4, [x4, #1] // ar_coeffs_y[3]
.else
add x4, x4, #2
.endif
mov w1, #3
.ifc \type, uv_444
ld1r {v30.16b}, [x4] // ar_coeffs_uv[5]
ldursb w4, [x4, #-1] // ar_coeffs_uv[4]
ld1r {v30.16b}, [x4] // ar_coeffs_uv[4]
ldursb w4, [x4, #-1] // ar_coeffs_uv[3]
.endif
bl generate_grain_rows_neon
@ -1270,8 +1018,7 @@ L(generate_grain_\type\()_lag2):
ret
L(generate_grain_\type\()_lag3):
ldr q29, [x4] // ar_coeffs_y[0-15]
ldr q30, [x4, #16] // ar_coeffs_y[16-23], ar_coeffs_uv[16-24]
ld1 {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
stp d8, d9, [sp, #16]
stp d10, d11, [sp, #32]
stp d12, d13, [sp, #48]
@ -1377,8 +1124,6 @@ function generate_grain_\type\()_8bpc_neon, export=1
br x16
ret
L(generate_grain_\type\()_lag0):
dup v28.8h, w7
ld1r {v27.16b}, [x4] // ar_coeffs_uv[0]
@ -1423,8 +1168,8 @@ L(generate_grain_\type\()_lag1):
add x4, x4, #2
mov w1, #3
ld1r {v30.16b}, [x4] // ar_coeffs_uv[5]
ldursb w4, [x4, #-1] // ar_coeffs_uv[4]
ld1r {v30.16b}, [x4] // ar_coeffs_u4[4]
ldursb w4, [x4, #-1] // ar_coeffs_uv[3]
bl generate_grain_rows_44_neon
set_height w1, \type

third_party/dav1d/src/arm/64/film_grain16.S (vendored, 1104 lines changed): diff not shown due to its size

third_party/dav1d/src/arm/64/refmvs.S (vendored, new file, 85 lines)

@ -0,0 +1,85 @@
/*
* Copyright © 2021, VideoLAN and dav1d authors
* Copyright © 2021, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
// void dav1d_splat_mv_neon(refmvs_block **rr, const refmvs_block *rmv,
// int bx4, int bw4, int bh4)
function splat_mv_neon, export=1
ld1 {v3.16b}, [x1]
clz w3, w3
adr x5, L(splat_tbl)
sub w3, w3, #26
ext v2.16b, v3.16b, v3.16b, #12
ldrh w3, [x5, w3, uxtw #1]
add w2, w2, w2, lsl #1
ext v0.16b, v2.16b, v3.16b, #4
sub x3, x5, w3, uxtw
ext v1.16b, v2.16b, v3.16b, #8
lsl w2, w2, #2
ext v2.16b, v2.16b, v3.16b, #12
1:
ldr x1, [x0], #8
subs w4, w4, #1
add x1, x1, x2
br x3
10:
st1 {v0.8b}, [x1]
str s2, [x1, #8]
b.gt 1b
ret
20:
st1 {v0.16b}, [x1]
str d1, [x1, #16]
b.gt 1b
ret
320:
st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
160:
st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
80:
st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
40:
st1 {v0.16b, v1.16b, v2.16b}, [x1]
b.gt 1b
ret
L(splat_tbl):
.hword L(splat_tbl) - 320b
.hword L(splat_tbl) - 160b
.hword L(splat_tbl) - 80b
.hword L(splat_tbl) - 40b
.hword L(splat_tbl) - 20b
.hword L(splat_tbl) - 10b
endfunc
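The reference behaviour this NEON routine (and the 32-bit variant earlier) accelerates, as a hedged C++ sketch: one 12-byte refmvs_block descriptor is written to bw4 consecutive positions on each of bh4 rows of the per-row pointers rr. The stand-in struct below is a simplification, not dav1d's real refmvs_block (see src/refmvs.h for that).

```cpp
#include <cstdint>

// Stand-in for dav1d's packed refmvs_block (two mvs, two refs, block size).
struct RefMvsBlockStandIn {
  uint8_t bytes[12];
};

static void SplatMvReference(RefMvsBlockStandIn** rr,
                             const RefMvsBlockStandIn* rmv,
                             int bx4, int bw4, int bh4) {
  for (int y = 0; y < bh4; y++) {
    RefMvsBlockStandIn* row = rr[y] + bx4;
    for (int x = 0; x < bw4; x++)
      row[x] = *rmv;  // same descriptor splatted across the row
  }
}
```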


@ -31,8 +31,6 @@
#include "src/film_grain.h"
#include "asm-offsets.h"
#if ARCH_AARCH64
CHECK_OFFSET(Dav1dFilmGrainData, seed, FGD_SEED);
CHECK_OFFSET(Dav1dFilmGrainData, ar_coeff_lag, FGD_AR_COEFF_LAG);
CHECK_OFFSET(Dav1dFilmGrainData, ar_coeffs_y, FGD_AR_COEFFS_Y);
@ -60,7 +58,6 @@ void BF(dav1d_generate_grain_uv_ ## suff, neon)(entry buf[][GRAIN_WIDTH], \
GEN_GRAIN_UV(420);
GEN_GRAIN_UV(422);
GEN_GRAIN_UV(444);
#endif
// Use ptrdiff_t instead of int for the last few parameters, to get the
// same layout of parameters on the stack across platforms.
@ -209,12 +206,10 @@ COLD void bitfn(dav1d_film_grain_dsp_init_arm)(Dav1dFilmGrainDSPContext *const c
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
#if ARCH_AARCH64 && BITDEPTH == 8
c->generate_grain_y = BF(dav1d_generate_grain_y, neon);
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, neon);
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, neon);
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, neon);
#endif
c->fgy_32x32xn = fgy_32x32xn_neon;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_neon;

third_party/dav1d/src/arm/refmvs_init.c (vendored, new file, 39 lines)

@ -0,0 +1,39 @@
/*
* Copyright © 2021, VideoLAN and dav1d authors
* Copyright © 2021, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/cpu.h"
#include "src/refmvs.h"
decl_splat_mv_fn(dav1d_splat_mv_neon);
COLD void dav1d_refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
c->splat_mv = dav1d_splat_mv_neon;
}

third_party/dav1d/src/cdef_tmpl.c (vendored, 4 lines changed)

@ -113,7 +113,7 @@ cdef_filter_block_c(pixel *dst, const ptrdiff_t dst_stride,
const int pri_tap = 4 - ((pri_strength >> bitdepth_min_8) & 1);
const int pri_shift = imax(0, damping - ulog2(pri_strength));
if (sec_strength) {
const int sec_shift = imax(0, damping - ulog2(sec_strength));
const int sec_shift = damping - ulog2(sec_strength);
do {
for (int x = 0; x < w; x++) {
const int px = dst[x];
@ -180,7 +180,7 @@ cdef_filter_block_c(pixel *dst, const ptrdiff_t dst_stride,
}
} else { // sec_strength only
assert(sec_strength);
const int sec_shift = imax(0, damping - ulog2(sec_strength));
const int sec_shift = damping - ulog2(sec_strength);
do {
for (int x = 0; x < w; x++) {
const int px = dst[x];

third_party/dav1d/src/cdf.c (vendored, 24 lines changed)

@ -4096,16 +4096,15 @@ void dav1d_cdf_thread_copy(CdfContext *const dst, const CdfThreadContext *const
}
int dav1d_cdf_thread_alloc(Dav1dContext *const c, CdfThreadContext *const cdf,
struct thread_data *const t)
const int have_frame_mt)
{
cdf->ref = dav1d_ref_create_using_pool(c->cdf_pool,
sizeof(CdfContext) + sizeof(atomic_uint));
if (!cdf->ref) return DAV1D_ERR(ENOMEM);
cdf->data.cdf = cdf->ref->data;
if (t) {
if (have_frame_mt) {
cdf->progress = (atomic_uint *) &cdf->data.cdf[1];
atomic_init(cdf->progress, 0);
cdf->t = t;
}
return 0;
}
@ -4123,22 +4122,3 @@ void dav1d_cdf_thread_unref(CdfThreadContext *const cdf) {
dav1d_ref_dec(&cdf->ref);
memset(cdf, 0, sizeof(*cdf));
}
void dav1d_cdf_thread_wait(CdfThreadContext *const cdf) {
if (!cdf->t) return;
if (atomic_load(cdf->progress)) return;
pthread_mutex_lock(&cdf->t->lock);
while (!atomic_load(cdf->progress))
pthread_cond_wait(&cdf->t->cond, &cdf->t->lock);
pthread_mutex_unlock(&cdf->t->lock);
}
void dav1d_cdf_thread_signal(CdfThreadContext *const cdf) {
if (!cdf->t) return;
pthread_mutex_lock(&cdf->t->lock);
atomic_store(cdf->progress, 1);
pthread_cond_broadcast(&cdf->t->cond);
pthread_mutex_unlock(&cdf->t->lock);
}

third_party/dav1d/src/cdf.h (vendored, 9 lines changed)

@ -135,23 +135,16 @@ typedef struct CdfThreadContext {
CdfContext *cdf; // if ref != NULL
unsigned qcat; // if ref == NULL, from static CDF tables
} data;
struct thread_data *t;
atomic_uint *progress;
} CdfThreadContext;
void dav1d_cdf_thread_init_static(CdfThreadContext *cdf, int qidx);
int dav1d_cdf_thread_alloc(Dav1dContext *c, CdfThreadContext *cdf,
struct thread_data *t);
const int have_frame_mt);
void dav1d_cdf_thread_copy(CdfContext *dst, const CdfThreadContext *src);
void dav1d_cdf_thread_ref(CdfThreadContext *dst, CdfThreadContext *src);
void dav1d_cdf_thread_unref(CdfThreadContext *cdf);
void dav1d_cdf_thread_update(const Dav1dFrameHeader *hdr, CdfContext *dst,
const CdfContext *src);
/*
* These are binary signals (so a signal is either "done" or "not done").
*/
void dav1d_cdf_thread_wait(CdfThreadContext *cdf);
void dav1d_cdf_thread_signal(CdfThreadContext *cdf);
#endif /* DAV1D_SRC_CDF_H */

third_party/dav1d/src/cpu.c (vendored, 44 lines changed)

@ -29,6 +29,17 @@
#include <stdint.h>
#include "src/cpu.h"
#include "src/log.h"
#ifdef _WIN32
#include <windows.h>
#elif defined(__linux__)
#include <sched.h>
#include <unistd.h>
#elif defined(__APPLE__)
#include <sys/sysctl.h>
#include <sys/types.h>
#endif
static unsigned flags = 0;
@ -61,3 +72,36 @@ COLD unsigned dav1d_get_cpu_flags(void) {
COLD void dav1d_set_cpu_flags_mask(const unsigned mask) {
flags_mask = mask;
}
COLD int dav1d_num_logical_processors(Dav1dContext *const c) {
#ifdef _WIN32
#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
GROUP_AFFINITY affinity;
if (GetThreadGroupAffinity(GetCurrentThread(), &affinity)) {
int num_processors = 1;
while (affinity.Mask &= affinity.Mask - 1)
num_processors++;
return num_processors;
}
#else
SYSTEM_INFO system_info;
GetNativeSystemInfo(&system_info);
return system_info.dwNumberOfProcessors;
#endif
#elif defined(__linux__)
#ifdef CPU_COUNT
cpu_set_t affinity;
if (!sched_getaffinity(0, sizeof(affinity), &affinity))
return CPU_COUNT(&affinity);
#else
return (int)sysconf(_SC_NPROCESSORS_ONLN);
#endif
#elif defined(__APPLE__)
int num_processors;
size_t length = sizeof(num_processors);
if (!sysctlbyname("hw.logicalcpu", &num_processors, &length, NULL, 0))
return num_processors;
#endif
dav1d_log(c, "Unable to detect thread count, defaulting to single-threaded mode\n");
return 1;
}

third_party/dav1d/src/cpu.h (vendored, 2 lines changed)

@ -33,6 +33,7 @@
#include "common/attributes.h"
#include "dav1d/common.h"
#include "dav1d/dav1d.h"
#if ARCH_AARCH64 || ARCH_ARM
#include "src/arm/cpu.h"
@ -45,5 +46,6 @@
void dav1d_init_cpu(void);
unsigned dav1d_get_cpu_flags(void);
DAV1D_API void dav1d_set_cpu_flags_mask(unsigned mask);
int dav1d_num_logical_processors(Dav1dContext *c);
#endif /* DAV1D_SRC_CPU_H */

third_party/dav1d/src/decode.c (vendored, 867 lines changed): diff not shown due to its size

third_party/dav1d/src/ext/x86/x86inc.asm (vendored, 53 lines changed)

@ -79,6 +79,11 @@
%define mangle(x) x
%endif
; Use VEX-encoding even in non-AVX functions
%ifndef FORCE_VEX_ENCODING
%define FORCE_VEX_ENCODING 0
%endif
%macro SECTION_RODATA 0-1 16
%ifidn __OUTPUT_FORMAT__,win32
SECTION .rdata align=%1
@ -1008,7 +1013,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%endmacro
%macro INIT_XMM 0-1+
%assign avx_enabled 0
%assign avx_enabled FORCE_VEX_ENCODING
%define RESET_MM_PERMUTATION INIT_XMM %1
%define mmsize 16
%define mova movdqa
@ -1339,26 +1344,50 @@ INIT_XMM
%elif %0 >= 9
__instr %6, %7, %8, %9
%elif %0 == 8
%if avx_enabled && %5
%if avx_enabled && __sizeofreg >= 16 && %4 == 0
%xdefine __src1 %7
%xdefine __src2 %8
%ifnum regnumof%7
%ifnum regnumof%8
%if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32
; Most VEX-encoded instructions require an additional byte to encode when
; src2 is a high register (e.g. m8..15). If the instruction is commutative
; we can swap src1 and src2 when doing so reduces the instruction length.
%xdefine __src1 %8
%xdefine __src2 %7
%if %5
%ifnum regnumof%7
%ifnum regnumof%8
%if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32
; Most VEX-encoded instructions require an additional byte to encode when
; src2 is a high register (e.g. m8..15). If the instruction is commutative
; we can swap src1 and src2 when doing so reduces the instruction length.
%xdefine __src1 %8
%xdefine __src2 %7
%endif
%endif
%elifnum regnumof%8 ; put memory operands in src2 when possible
%xdefine __src1 %8
%xdefine __src2 %7
%else
%assign __emulate_avx 1
%endif
%elifnnum regnumof%7
; EVEX allows imm8 shift instructions to be used with memory operands,
; but VEX does not. This handles those special cases.
%ifnnum %8
%assign __emulate_avx 1
%elif notcpuflag(avx512)
%assign __emulate_avx 1
%endif
%endif
__instr %6, __src1, __src2
%if __emulate_avx ; a separate load is required
%if %3
vmovaps %6, %7
%else
vmovdqa %6, %7
%endif
__instr %6, %8
%else
__instr %6, __src1, __src2
%endif
%else
__instr %6, %7, %8
%endif
%elif %0 == 7
%if avx_enabled && %5
%if avx_enabled && __sizeofreg >= 16 && %5
%xdefine __src1 %6
%xdefine __src2 %7
%ifnum regnumof%6

third_party/dav1d/src/internal.h (vendored, 130 lines changed)

@ -34,8 +34,7 @@
typedef struct Dav1dFrameContext Dav1dFrameContext;
typedef struct Dav1dTileState Dav1dTileState;
typedef struct Dav1dTileContext Dav1dTileContext;
typedef struct Dav1dPostFilterContext Dav1dPostFilterContext;
typedef struct Dav1dTaskContext Dav1dTaskContext;
typedef struct Dav1dTask Dav1dTask;
#include "common/attributes.h"
@ -78,8 +77,8 @@ struct Dav1dContext {
Dav1dFrameContext *fc;
unsigned n_fc;
Dav1dPostFilterContext *pfc;
unsigned n_pfc;
Dav1dTaskContext *tc;
unsigned n_tc;
// cache of OBUs that make up a single frame before we submit them
// to a frame worker to be decoded
@ -112,14 +111,20 @@ struct Dav1dContext {
unsigned next;
} frame_thread;
// postfilter threading (refer to pfc[] for per_thread thingies)
struct PostFilterThreadData {
// task threading (refer to tc[] for per_thread thingies)
struct TaskThreadData {
pthread_mutex_t lock;
pthread_cond_t cond;
struct Dav1dTask *tasks;
int frame_cnt;
atomic_uint first;
unsigned cur;
// This is used for delayed reset of the task cur pointer when
// such operation is needed but the thread doesn't enter a critical
// section (typically when executing the next sbrow task locklessly).
// See src/thread_task.c:reset_task_cur().
atomic_uint reset_task_cur;
atomic_int cond_signaled;
int inited;
} postfilter_thread;
} task_thread;
// reference/entropy state
Dav1dMemPool *segmap_pool;
@ -134,6 +139,7 @@ struct Dav1dContext {
CdfThreadContext cdf[8];
Dav1dDSPContext dsp[3 /* 8, 10, 12 bits/component */];
Dav1dRefmvsDSPContext refmvs_dsp;
// tree to keep track of which edges are available
struct {
@ -159,6 +165,29 @@ struct Dav1dContext {
Dav1dMemPool *picture_pool;
};
enum TaskType {
DAV1D_TASK_TYPE_INIT,
DAV1D_TASK_TYPE_TILE_ENTROPY,
DAV1D_TASK_TYPE_TILE_RECONSTRUCTION,
DAV1D_TASK_TYPE_DEBLOCK_COLS,
DAV1D_TASK_TYPE_DEBLOCK_ROWS,
DAV1D_TASK_TYPE_CDEF,
DAV1D_TASK_TYPE_SUPER_RESOLUTION,
DAV1D_TASK_TYPE_LOOP_RESTORATION,
DAV1D_TASK_TYPE_ENTROPY_PROGRESS,
};
struct Dav1dTask {
unsigned frame_idx; // frame thread id
enum TaskType type; // task work
int sby; // sbrow
// task dependencies
int recon_progress, deblock_progress, cdef_progress, lr_progress;
int deps_skip;
struct Dav1dTask *next; // only used in task queue
};
struct Dav1dFrameContext {
Dav1dRef *seq_hdr_ref;
Dav1dSequenceHeader *seq_hdr;
@ -188,8 +217,6 @@ struct Dav1dFrameContext {
int resize_step[2 /* y, uv */], resize_start[2 /* y, uv */];
const Dav1dContext *c;
Dav1dTileContext *tc;
int n_tc;
Dav1dTileState *ts;
int n_ts;
const Dav1dDSPContext *dsp;
@ -197,7 +224,8 @@ struct Dav1dFrameContext {
recon_b_intra_fn recon_b_intra;
recon_b_inter_fn recon_b_inter;
filter_sbrow_fn filter_sbrow;
filter_sbrow_fn filter_sbrow_deblock;
filter_sbrow_fn filter_sbrow_deblock_cols;
filter_sbrow_fn filter_sbrow_deblock_rows;
filter_sbrow_fn filter_sbrow_cdef;
filter_sbrow_fn filter_sbrow_resize;
filter_sbrow_fn filter_sbrow_lr;
@ -218,8 +246,9 @@ struct Dav1dFrameContext {
int bitdepth_max;
struct {
struct thread_data td;
int pass, die;
int next_tile_row[2 /* 0: reconstruction, 1: entropy */];
int entropy_progress;
atomic_int deblock_progress, cdef_progress, lr_progress; // in sby units
// indexed using t->by * f->b4_stride + t->bx
Av1Block *b;
struct CodedBlockInfo {
@ -243,7 +272,8 @@ struct Dav1dFrameContext {
Av1Restoration *lr_mask;
int top_pre_cdef_toggle;
int mask_sz /* w*h */, lr_mask_sz, cdef_line_sz[2] /* stride */;
int lr_line_sz, re_sz /* h */;
size_t lr_plane_sz; /* w*sbh*4*is_sb128 if n_tc > 1, else w*12 */
int re_sz /* h */;
ALIGN(Av1FilterLUT lim_lut, 16);
int last_sharpness;
uint8_t lvl[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */];
@ -253,32 +283,34 @@ struct Dav1dFrameContext {
pixel *lr_lpf_line[3 /* plane */];
// in-loop filter per-frame state keeping
int tile_row; // for carry-over at tile row edges
uint8_t *start_of_tile_row;
int start_of_tile_row_sz;
pixel *p[3], *sr_p[3];
Av1Filter *mask_ptr, *prev_mask_ptr;
int restore_planes; // enum LrRestorePlanes
struct {
pthread_cond_t cond;
struct PostFilterThreadData *pftd;
struct Dav1dTask *tasks;
int num_tasks;
int npf;
int done;
int inited;
} thread;
} lf;
struct {
pthread_cond_t cond;
struct TaskThreadData *ttd;
struct Dav1dTask *tasks, *tile_tasks[2], init_task;
int num_tasks, num_tile_tasks;
int done[2];
int update_set; // whether we need to update CDF reference
atomic_int error;
int task_counter;
struct Dav1dTask *task_head, *task_tail;
// Points to the task directly before the cur pointer in the queue.
// This cur pointer is theoretical here, we actually keep track of the
// "prev_t" variable. This is needed to not loose the tasks in
// [head;cur-1] when picking one for execution.
struct Dav1dTask *task_cur_prev;
} task_thread;
// threading (refer to tc[] for per-thread things)
struct FrameTileThreadData {
uint64_t available;
pthread_mutex_t lock;
pthread_cond_t cond, icond;
int tasks_left, num_tasks;
int (*task_idx_to_sby_and_tile_idx)[2];
int titsati_sz, titsati_init[2];
uint16_t titsati_index_rows[1 + DAV1D_MAX_TILE_ROWS];
int inited;
int (*lowest_pixel_mem)[7][2];
int lowest_pixel_mem_sz;
} tile_thread;
};
@ -291,15 +323,16 @@ struct Dav1dTileState {
int col, row; // in tile units
} tiling;
atomic_int progress; // in sby units, TILE_ERROR after a decoding error
struct {
pthread_mutex_t lock;
pthread_cond_t cond;
} tile_thread;
// in sby units, TILE_ERROR after a decoding error
atomic_int progress[2 /* 0: reconstruction, 1: entropy */];
struct {
uint8_t *pal_idx;
coef *cf;
} frame_thread;
} frame_thread[2 /* 0: reconstruction, 1: entropy */];
// in fullpel units, [0] = Y, [1] = UV, used for progress requirements
// each entry is one tile-sbrow; middle index is refidx
int (*lowest_pixel)[7][2];
uint16_t dqmem[DAV1D_MAX_SEGMENTS][3 /* plane */][2 /* dc/ac */];
const uint16_t (*dq)[3][2];
@ -312,7 +345,8 @@ struct Dav1dTileState {
Av1RestorationUnit *lr_ref[3];
};
struct Dav1dTileContext {
struct Dav1dTaskContext {
const Dav1dContext *c;
const Dav1dFrameContext *f;
Dav1dTileState *ts;
int bx, by;
@ -375,18 +409,16 @@ struct Dav1dTileContext {
// keeps it accessible
enum Filter2d tl_4x4_filter;
struct {
int pass;
} frame_thread;
struct {
struct thread_data td;
struct TaskThreadData *ttd;
struct FrameTileThreadData *fttd;
int flushed;
int die;
} tile_thread;
};
struct Dav1dPostFilterContext {
Dav1dContext *c;
struct thread_data td;
int flushed;
int die;
} task_thread;
};
#endif /* DAV1D_SRC_INTERNAL_H */

third_party/dav1d/src/levels.h (vendored, 1 line changed)

@ -31,6 +31,7 @@
#include <stdint.h>
#include "dav1d/headers.h"
#include "common/attributes.h"
enum ObuMetaType {
OBU_META_HDR_CLL = 1,

third_party/dav1d/src/lf_apply.h (vendored, 9 lines changed)

@ -35,8 +35,11 @@
#include "src/internal.h"
#include "src/levels.h"
void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *f,
pixel *const p[3], Av1Filter *lflvl,
int sby, int start_of_tile_row);
void bytefn(dav1d_loopfilter_sbrow_cols)(const Dav1dFrameContext *f,
pixel *const p[3], Av1Filter *lflvl,
int sby, int start_of_tile_row);
void bytefn(dav1d_loopfilter_sbrow_rows)(const Dav1dFrameContext *f,
pixel *const p[3], Av1Filter *lflvl,
int sby);
#endif /* DAV1D_SRC_LF_APPLY_H */

third_party/dav1d/src/lf_apply_tmpl.c (vendored, 42 lines changed)

@ -170,13 +170,12 @@ static inline void filter_plane_rows_uv(const Dav1dFrameContext *const f,
}
}
void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *const f,
pixel *const p[3], Av1Filter *const lflvl,
int sby, const int start_of_tile_row)
void bytefn(dav1d_loopfilter_sbrow_cols)(const Dav1dFrameContext *const f,
pixel *const p[3], Av1Filter *const lflvl,
int sby, const int start_of_tile_row)
{
int x, have_left;
// Don't filter outside the frame
const int have_top = sby > 0;
const int is_sb64 = !f->seq_hdr->sb128;
const int starty4 = (sby & is_sb64) << 4;
const int sbsz = 32 >> is_sb64;
@ -271,13 +270,6 @@ void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *const f,
imin(32, f->w4 - x * 32), starty4, endy4);
}
level_ptr = f->lf.level + f->b4_stride * sby * sbsz;
for (ptr = p[0], x = 0; x < f->sb128w; x++, ptr += 128, level_ptr += 32) {
filter_plane_rows_y(f, have_top, level_ptr, f->b4_stride,
lflvl[x].filter_y[1], ptr, f->cur.stride[0],
imin(32, f->w4 - x * 32), starty4, endy4);
}
if (!f->frame_hdr->loopfilter.level_u && !f->frame_hdr->loopfilter.level_v)
return;
@ -292,7 +284,35 @@ void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *const f,
(imin(32, f->w4 - x * 32) + ss_hor) >> ss_hor,
starty4 >> ss_ver, uv_endy4, ss_ver);
}
}
void bytefn(dav1d_loopfilter_sbrow_rows)(const Dav1dFrameContext *const f,
pixel *const p[3], Av1Filter *const lflvl,
int sby)
{
int x;
// Don't filter outside the frame
const int have_top = sby > 0;
const int is_sb64 = !f->seq_hdr->sb128;
const int starty4 = (sby & is_sb64) << 4;
const int sbsz = 32 >> is_sb64;
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const unsigned endy4 = starty4 + imin(f->h4 - sby * sbsz, sbsz);
const unsigned uv_endy4 = (endy4 + ss_ver) >> ss_ver;
pixel *ptr;
uint8_t (*level_ptr)[4] = f->lf.level + f->b4_stride * sby * sbsz;
for (ptr = p[0], x = 0; x < f->sb128w; x++, ptr += 128, level_ptr += 32) {
filter_plane_rows_y(f, have_top, level_ptr, f->b4_stride,
lflvl[x].filter_y[1], ptr, f->cur.stride[0],
imin(32, f->w4 - x * 32), starty4, endy4);
}
if (!f->frame_hdr->loopfilter.level_u && !f->frame_hdr->loopfilter.level_v)
return;
ptrdiff_t uv_off;
level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver);
for (uv_off = 0, x = 0; x < f->sb128w;
x++, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor)

third_party/dav1d/src/lib.c (vendored, 313 lines changed)

@ -63,9 +63,8 @@ COLD const char *dav1d_version(void) {
}
COLD void dav1d_default_settings(Dav1dSettings *const s) {
s->n_frame_threads = 1;
s->n_tile_threads = 1;
s->n_postfilter_threads = 1;
s->n_threads = 0;
s->max_frame_delay = 0;
s->apply_grain = 1;
s->allocator.cookie = NULL;
s->allocator.alloc_picture_callback = dav1d_default_picture_alloc;
@ -101,12 +100,10 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
validate_input_or_ret(c_out != NULL, DAV1D_ERR(EINVAL));
validate_input_or_ret(s != NULL, DAV1D_ERR(EINVAL));
validate_input_or_ret(s->n_postfilter_threads >= 1 &&
s->n_postfilter_threads <= DAV1D_MAX_POSTFILTER_THREADS, DAV1D_ERR(EINVAL));
validate_input_or_ret(s->n_tile_threads >= 1 &&
s->n_tile_threads <= DAV1D_MAX_TILE_THREADS, DAV1D_ERR(EINVAL));
validate_input_or_ret(s->n_frame_threads >= 1 &&
s->n_frame_threads <= DAV1D_MAX_FRAME_THREADS, DAV1D_ERR(EINVAL));
validate_input_or_ret(s->n_threads >= 0 &&
s->n_threads <= DAV1D_MAX_THREADS, DAV1D_ERR(EINVAL));
validate_input_or_ret(s->max_frame_delay >= 0 &&
s->max_frame_delay <= DAV1D_MAX_FRAME_DELAY, DAV1D_ERR(EINVAL));
validate_input_or_ret(s->allocator.alloc_picture_callback != NULL,
DAV1D_ERR(EINVAL));
validate_input_or_ret(s->allocator.release_picture_callback != NULL,
@ -166,44 +163,28 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
c->flush = &c->flush_mem;
atomic_init(c->flush, 0);
c->n_pfc = s->n_postfilter_threads;
c->n_fc = s->n_frame_threads;
c->fc = dav1d_alloc_aligned(sizeof(*c->fc) * s->n_frame_threads, 32);
if (!c->fc) goto error;
memset(c->fc, 0, sizeof(*c->fc) * s->n_frame_threads);
c->n_tc = s->n_threads ? s->n_threads :
iclip(dav1d_num_logical_processors(c), 1, DAV1D_MAX_THREADS);
c->n_fc = s->max_frame_delay ? umin(s->max_frame_delay, c->n_tc) :
umin(c->n_tc, 8);
if (c->n_pfc > 1) {
c->pfc = dav1d_alloc_aligned(sizeof(*c->pfc) * s->n_postfilter_threads, 32);
if (!c->pfc) goto error;
memset(c->pfc, 0, sizeof(*c->pfc) * s->n_postfilter_threads);
if (pthread_mutex_init(&c->postfilter_thread.lock, NULL)) goto error;
if (pthread_cond_init(&c->postfilter_thread.cond, NULL)) {
pthread_mutex_destroy(&c->postfilter_thread.lock);
c->fc = dav1d_alloc_aligned(sizeof(*c->fc) * c->n_fc, 32);
if (!c->fc) goto error;
memset(c->fc, 0, sizeof(*c->fc) * c->n_fc);
c->tc = dav1d_alloc_aligned(sizeof(*c->tc) * c->n_tc, 64);
if (!c->tc) goto error;
memset(c->tc, 0, sizeof(*c->tc) * c->n_tc);
if (c->n_tc > 1) {
if (pthread_mutex_init(&c->task_thread.lock, NULL)) goto error;
if (pthread_cond_init(&c->task_thread.cond, NULL)) {
pthread_mutex_destroy(&c->task_thread.lock);
goto error;
}
c->postfilter_thread.inited = 1;
for (int n = 0; n < s->n_frame_threads; n++) {
Dav1dFrameContext *const f = &c->fc[n];
if (pthread_cond_init(&f->lf.thread.cond, NULL)) goto error;
f->lf.thread.pftd = &c->postfilter_thread;
f->lf.thread.done = 1;
f->lf.thread.inited = 1;
}
for (int n = 0; n < s->n_postfilter_threads; ++n) {
Dav1dPostFilterContext *const pf = &c->pfc[n];
pf->c = c;
if (pthread_mutex_init(&pf->td.lock, NULL)) goto error;
if (pthread_cond_init(&pf->td.cond, NULL)) {
pthread_mutex_destroy(&pf->td.lock);
goto error;
}
if (pthread_create(&pf->td.thread, &thread_attr, dav1d_postfilter_task, pf)) {
pthread_cond_destroy(&c->postfilter_thread.cond);
pthread_mutex_destroy(&c->postfilter_thread.lock);
goto error;
}
pf->td.inited = 1;
}
c->task_thread.cur = c->n_fc;
atomic_init(&c->task_thread.reset_task_cur, UINT_MAX);
atomic_init(&c->task_thread.cond_signaled, 0);
c->task_thread.inited = 1;
}
if (c->n_fc > 1) {
@ -211,61 +192,37 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
calloc(c->n_fc, sizeof(*c->frame_thread.out_delayed));
if (!c->frame_thread.out_delayed) goto error;
}
for (int n = 0; n < s->n_frame_threads; n++) {
for (unsigned n = 0; n < c->n_fc; n++) {
Dav1dFrameContext *const f = &c->fc[n];
if (c->n_tc > 1)
if (pthread_cond_init(&f->task_thread.cond, NULL)) goto error;
f->c = c;
f->task_thread.ttd = &c->task_thread;
f->lf.last_sharpness = -1;
f->n_tc = s->n_tile_threads;
f->tc = dav1d_alloc_aligned(sizeof(*f->tc) * s->n_tile_threads, 64);
if (!f->tc) goto error;
memset(f->tc, 0, sizeof(*f->tc) * s->n_tile_threads);
if (f->n_tc > 1) {
if (pthread_mutex_init(&f->tile_thread.lock, NULL)) goto error;
if (pthread_cond_init(&f->tile_thread.cond, NULL)) {
pthread_mutex_destroy(&f->tile_thread.lock);
goto error;
}
if (pthread_cond_init(&f->tile_thread.icond, NULL)) {
pthread_mutex_destroy(&f->tile_thread.lock);
pthread_cond_destroy(&f->tile_thread.cond);
goto error;
}
f->tile_thread.inited = 1;
}
for (int m = 0; m < s->n_tile_threads; m++) {
Dav1dTileContext *const t = &f->tc[m];
t->f = f;
memset(t->cf_16bpc, 0, sizeof(t->cf_16bpc));
if (f->n_tc > 1) {
if (pthread_mutex_init(&t->tile_thread.td.lock, NULL)) goto error;
if (pthread_cond_init(&t->tile_thread.td.cond, NULL)) {
pthread_mutex_destroy(&t->tile_thread.td.lock);
goto error;
}
t->tile_thread.fttd = &f->tile_thread;
if (pthread_create(&t->tile_thread.td.thread, &thread_attr, dav1d_tile_task, t)) {
pthread_cond_destroy(&t->tile_thread.td.cond);
pthread_mutex_destroy(&t->tile_thread.td.lock);
goto error;
}
t->tile_thread.td.inited = 1;
}
}
dav1d_refmvs_init(&f->rf);
if (c->n_fc > 1) {
if (pthread_mutex_init(&f->frame_thread.td.lock, NULL)) goto error;
if (pthread_cond_init(&f->frame_thread.td.cond, NULL)) {
pthread_mutex_destroy(&f->frame_thread.td.lock);
}
for (unsigned m = 0; m < c->n_tc; m++) {
Dav1dTaskContext *const t = &c->tc[m];
t->f = &c->fc[0];
t->task_thread.ttd = &c->task_thread;
t->c = c;
memset(t->cf_16bpc, 0, sizeof(t->cf_16bpc));
if (c->n_tc > 1) {
if (pthread_mutex_init(&t->task_thread.td.lock, NULL)) goto error;
if (pthread_cond_init(&t->task_thread.td.cond, NULL)) {
pthread_mutex_destroy(&t->task_thread.td.lock);
goto error;
}
if (pthread_create(&f->frame_thread.td.thread, &thread_attr, dav1d_frame_task, f)) {
pthread_cond_destroy(&f->frame_thread.td.cond);
pthread_mutex_destroy(&f->frame_thread.td.lock);
if (pthread_create(&t->task_thread.td.thread, &thread_attr, dav1d_worker_task, t)) {
pthread_cond_destroy(&t->task_thread.td.cond);
pthread_mutex_destroy(&t->task_thread.td.lock);
goto error;
}
f->frame_thread.td.inited = 1;
t->task_thread.td.inited = 1;
}
}
dav1d_refmvs_dsp_init(&c->refmvs_dsp);
// intra edge tree
c->intra_edge.root[BL_128X128] = &c->intra_edge.branch_sb128[0].node;
@ -297,6 +254,7 @@ int dav1d_parse_sequence_header(Dav1dSequenceHeader *const out,
Dav1dSettings s;
dav1d_default_settings(&s);
s.n_threads = 1;
s.logger.callback = NULL;
Dav1dContext *c;
@ -318,7 +276,7 @@ int dav1d_parse_sequence_header(Dav1dSequenceHeader *const out,
}
if (!c->seq_hdr) {
res = DAV1D_ERR(EINVAL);
res = DAV1D_ERR(ENOENT);
goto error;
}
@ -394,15 +352,23 @@ static int drain_picture(Dav1dContext *const c, Dav1dPicture *const out) {
do {
const unsigned next = c->frame_thread.next;
Dav1dFrameContext *const f = &c->fc[next];
pthread_mutex_lock(&f->frame_thread.td.lock);
pthread_mutex_lock(&c->task_thread.lock);
while (f->n_tile_data > 0)
pthread_cond_wait(&f->frame_thread.td.cond,
&f->frame_thread.td.lock);
pthread_mutex_unlock(&f->frame_thread.td.lock);
pthread_cond_wait(&f->task_thread.cond,
&f->task_thread.ttd->lock);
Dav1dThreadPicture *const out_delayed =
&c->frame_thread.out_delayed[next];
if (out_delayed->p.data[0] || atomic_load(&f->task_thread.error)) {
if (atomic_load(&c->task_thread.first) + 1U < c->n_fc)
atomic_fetch_add(&c->task_thread.first, 1U);
else
atomic_store(&c->task_thread.first, 0);
if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
c->task_thread.cur--;
}
if (++c->frame_thread.next == c->n_fc)
c->frame_thread.next = 0;
pthread_mutex_unlock(&c->task_thread.lock);
if (out_delayed->p.data[0]) {
const unsigned progress =
atomic_load_explicit(&out_delayed->progress[1],
@ -509,51 +475,43 @@ void dav1d_flush(Dav1dContext *const c) {
dav1d_ref_dec(&c->content_light_ref);
dav1d_ref_dec(&c->itut_t35_ref);
if (c->n_fc == 1 && c->n_pfc == 1) return;
if (c->n_fc == 1 && c->n_tc == 1) return;
atomic_store(c->flush, 1);
// stop running tasks in worker threads
if (c->n_tc > 1) {
pthread_mutex_lock(&c->task_thread.lock);
for (unsigned i = 0; i < c->n_tc; i++) {
Dav1dTaskContext *const tc = &c->tc[i];
while (!tc->task_thread.flushed) {
pthread_cond_wait(&tc->task_thread.td.cond, &c->task_thread.lock);
}
}
for (unsigned i = 0; i < c->n_fc; i++) {
c->fc[i].task_thread.task_head = NULL;
c->fc[i].task_thread.task_tail = NULL;
c->fc[i].task_thread.task_cur_prev = NULL;
}
atomic_init(&c->task_thread.first, 0);
c->task_thread.cur = c->n_fc;
atomic_store(&c->task_thread.reset_task_cur, UINT_MAX);
atomic_store(&c->task_thread.cond_signaled, 0);
pthread_mutex_unlock(&c->task_thread.lock);
}
// wait for threads to complete flushing
if (c->n_pfc > 1)
pthread_mutex_lock(&c->postfilter_thread.lock);
atomic_store(c->flush, 1);
if (c->n_pfc > 1) {
pthread_cond_broadcast(&c->postfilter_thread.cond);
pthread_mutex_unlock(&c->postfilter_thread.lock);
}
if (c->n_fc == 1) goto skip_ft_flush;
for (unsigned n = 0, next = c->frame_thread.next; n < c->n_fc; n++, next++) {
if (next == c->n_fc) next = 0;
Dav1dFrameContext *const f = &c->fc[next];
pthread_mutex_lock(&f->frame_thread.td.lock);
if (f->n_tile_data > 0) {
while (f->n_tile_data > 0)
pthread_cond_wait(&f->frame_thread.td.cond,
&f->frame_thread.td.lock);
assert(!f->cur.data[0]);
}
pthread_mutex_unlock(&f->frame_thread.td.lock);
Dav1dThreadPicture *const out_delayed =
&c->frame_thread.out_delayed[next];
if (out_delayed->p.data[0])
dav1d_thread_picture_unref(out_delayed);
}
c->frame_thread.next = 0;
skip_ft_flush:
if (c->n_pfc > 1) {
for (unsigned i = 0; i < c->n_pfc; ++i) {
Dav1dPostFilterContext *const pf = &c->pfc[i];
pthread_mutex_lock(&pf->td.lock);
if (!pf->flushed)
pthread_cond_wait(&pf->td.cond, &pf->td.lock);
pf->flushed = 0;
pthread_mutex_unlock(&pf->td.lock);
}
pthread_mutex_lock(&c->postfilter_thread.lock);
c->postfilter_thread.tasks = NULL;
pthread_mutex_unlock(&c->postfilter_thread.lock);
for (unsigned i = 0; i < c->n_fc; ++i) {
freep(&c->fc[i].lf.thread.tasks);
c->fc[i].lf.thread.num_tasks = 0;
if (c->n_fc > 1) {
for (unsigned n = 0, next = c->frame_thread.next; n < c->n_fc; n++, next++) {
if (next == c->n_fc) next = 0;
Dav1dFrameContext *const f = &c->fc[next];
dav1d_decode_frame_exit(f, -1);
f->n_tile_data = 0;
Dav1dThreadPicture *out_delayed = &c->frame_thread.out_delayed[next];
if (out_delayed->p.data[0]) {
dav1d_thread_picture_unref(out_delayed);
}
}
c->frame_thread.next = 0;
}
atomic_store(c->flush, 0);
}
@ -569,82 +527,44 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
if (flush) dav1d_flush(c);
if (c->pfc) {
struct PostFilterThreadData *pftd = &c->postfilter_thread;
if (pftd->inited) {
pthread_mutex_lock(&pftd->lock);
for (unsigned n = 0; n < c->n_pfc && c->pfc[n].td.inited; n++)
c->pfc[n].die = 1;
pthread_cond_broadcast(&pftd->cond);
pthread_mutex_unlock(&pftd->lock);
for (unsigned n = 0; n < c->n_pfc && c->pfc[n].td.inited; n++) {
pthread_join(c->pfc[n].td.thread, NULL);
pthread_cond_destroy(&c->pfc[n].td.cond);
pthread_mutex_destroy(&c->pfc[n].td.lock);
if (c->tc) {
struct TaskThreadData *ttd = &c->task_thread;
if (ttd->inited) {
pthread_mutex_lock(&ttd->lock);
for (unsigned n = 0; n < c->n_tc && c->tc[n].task_thread.td.inited; n++)
c->tc[n].task_thread.die = 1;
pthread_cond_broadcast(&ttd->cond);
pthread_mutex_unlock(&ttd->lock);
for (unsigned n = 0; n < c->n_tc; n++) {
Dav1dTaskContext *const pf = &c->tc[n];
if (!pf->task_thread.td.inited) break;
pthread_join(pf->task_thread.td.thread, NULL);
pthread_cond_destroy(&pf->task_thread.td.cond);
pthread_mutex_destroy(&pf->task_thread.td.lock);
}
pthread_cond_destroy(&pftd->cond);
pthread_mutex_destroy(&pftd->lock);
pthread_cond_destroy(&ttd->cond);
pthread_mutex_destroy(&ttd->lock);
}
dav1d_free_aligned(c->pfc);
dav1d_free_aligned(c->tc);
}
for (unsigned n = 0; c->fc && n < c->n_fc; n++) {
Dav1dFrameContext *const f = &c->fc[n];
// clean-up threading stuff
if (c->n_fc > 1 && f->frame_thread.td.inited) {
pthread_mutex_lock(&f->frame_thread.td.lock);
f->frame_thread.die = 1;
pthread_cond_signal(&f->frame_thread.td.cond);
pthread_mutex_unlock(&f->frame_thread.td.lock);
pthread_join(f->frame_thread.td.thread, NULL);
if (c->n_fc > 1) {
freep(&f->tile_thread.lowest_pixel_mem);
freep(&f->frame_thread.b);
dav1d_freep_aligned(&f->frame_thread.pal_idx);
dav1d_freep_aligned(&f->frame_thread.cf);
freep(&f->frame_thread.tile_start_off);
dav1d_freep_aligned(&f->frame_thread.pal);
freep(&f->frame_thread.cbi);
pthread_mutex_destroy(&f->frame_thread.td.lock);
pthread_cond_destroy(&f->frame_thread.td.cond);
}
if (f->n_tc > 1 && f->tc && f->tile_thread.inited) {
pthread_mutex_lock(&f->tile_thread.lock);
for (int m = 0; m < f->n_tc; m++) {
Dav1dTileContext *const t = &f->tc[m];
t->tile_thread.die = 1;
// mark not created tile threads as available
if (!t->tile_thread.td.inited)
f->tile_thread.available |= 1ULL<<m;
}
pthread_cond_broadcast(&f->tile_thread.cond);
while (f->tile_thread.available != ~0ULL >> (64 - f->n_tc))
pthread_cond_wait(&f->tile_thread.icond,
&f->tile_thread.lock);
pthread_mutex_unlock(&f->tile_thread.lock);
for (int m = 0; m < f->n_tc; m++) {
Dav1dTileContext *const t = &f->tc[m];
if (f->n_tc > 1 && t->tile_thread.td.inited) {
pthread_join(t->tile_thread.td.thread, NULL);
pthread_mutex_destroy(&t->tile_thread.td.lock);
pthread_cond_destroy(&t->tile_thread.td.cond);
}
}
pthread_mutex_destroy(&f->tile_thread.lock);
pthread_cond_destroy(&f->tile_thread.cond);
pthread_cond_destroy(&f->tile_thread.icond);
freep(&f->tile_thread.task_idx_to_sby_and_tile_idx);
}
for (int m = 0; f->ts && m < f->n_ts; m++) {
Dav1dTileState *const ts = &f->ts[m];
pthread_cond_destroy(&ts->tile_thread.cond);
pthread_mutex_destroy(&ts->tile_thread.lock);
}
if (f->lf.thread.inited) {
freep(&f->lf.thread.tasks);
pthread_cond_destroy(&f->lf.thread.cond);
pthread_cond_destroy(&f->task_thread.cond);
}
freep(&f->task_thread.tasks);
freep(&f->task_thread.tile_tasks[0]);
dav1d_free_aligned(f->ts);
dav1d_free_aligned(f->tc);
dav1d_free_aligned(f->ipred_edge[0]);
free(f->a);
free(f->tile);
@ -652,6 +572,7 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
free(f->lf.lr_mask);
free(f->lf.level);
free(f->lf.tx_lpf_right_edge[0]);
free(f->lf.start_of_tile_row);
dav1d_refmvs_clear(&f->rf);
dav1d_free_aligned(f->lf.cdef_line_buf);
dav1d_free_aligned(f->lf.lr_lpf_line[0]);
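Annotation (not part of the upstream diff): the separate n_frame_threads / n_tile_threads / n_postfilter_threads knobs collapse into a single worker pool. A minimal sketch of how dav1d_open() above derives the worker count (n_tc) and frame-context count (n_fc) from the new settings; sketch_umin and n_cpus stand in for dav1d's umin() and dav1d_num_logical_processors(), and the clamp to DAV1D_MAX_THREADS is omitted.

static unsigned sketch_umin(const unsigned a, const unsigned b) {
    return a < b ? a : b;
}

/* n_threads == 0 and max_frame_delay == 0 both mean "pick automatically". */
static void sketch_thread_counts(const int n_threads, const int max_frame_delay,
                                 const unsigned n_cpus,
                                 unsigned *const n_tc, unsigned *const n_fc)
{
    *n_tc = n_threads ? (unsigned) n_threads : n_cpus;
    *n_fc = max_frame_delay ? sketch_umin((unsigned) max_frame_delay, *n_tc)
                            : sketch_umin(*n_tc, 8);
}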

28
third_party/dav1d/src/looprestoration_tmpl.c (vendored)

@ -382,11 +382,11 @@ selfguided_filter(coef *dst, const pixel *src, const ptrdiff_t src_stride,
const unsigned p = imax(a * n - b * b, 0);
const unsigned z = (p * s + (1 << 19)) >> 20;
const unsigned x = dav1d_sgr_x_by_x[imin(z, 255)];
const unsigned x = dav1d_sgr_x_by_x[umin(z, 255)];
// This is where we invert A and B, so that B is of size coef.
AA[i] = (x * BB[i] * sgr_one_by_x + (1 << 11)) >> 12;
BB[i] = 256 - x;
BB[i] = x;
}
AA += step * REST_UNIT_STRIDE;
BB += step * REST_UNIT_STRIDE;
@ -403,7 +403,7 @@ selfguided_filter(coef *dst, const pixel *src, const ptrdiff_t src_stride,
for (int i = 0; i < w; i++) {
const int a = SIX_NEIGHBORS(B, i);
const int b = SIX_NEIGHBORS(A, i);
dst[i] = (a * src[i] + b + (1 << 8)) >> 9;
dst[i] = (b - a * src[i] + (1 << 8)) >> 9;
}
dst += 384 /* Maximum restoration width is 384 (256 * 1.5) */;
src += REST_UNIT_STRIDE;
@ -412,7 +412,7 @@ selfguided_filter(coef *dst, const pixel *src, const ptrdiff_t src_stride,
for (int i = 0; i < w; i++) {
const int a = B[i] * 6 + (B[i - 1] + B[i + 1]) * 5;
const int b = A[i] * 6 + (A[i - 1] + A[i + 1]) * 5;
dst[i] = (a * src[i] + b + (1 << 7)) >> 8;
dst[i] = (b - a * src[i] + (1 << 7)) >> 8;
}
dst += 384 /* Maximum restoration width is 384 (256 * 1.5) */;
src += REST_UNIT_STRIDE;
@ -423,7 +423,7 @@ selfguided_filter(coef *dst, const pixel *src, const ptrdiff_t src_stride,
for (int i = 0; i < w; i++) {
const int a = SIX_NEIGHBORS(B, i);
const int b = SIX_NEIGHBORS(A, i);
dst[i] = (a * src[i] + b + (1 << 8)) >> 9;
dst[i] = (b - a * src[i] + (1 << 8)) >> 9;
}
}
#undef SIX_NEIGHBORS
@ -436,7 +436,7 @@ selfguided_filter(coef *dst, const pixel *src, const ptrdiff_t src_stride,
for (int i = 0; i < w; i++) {
const int a = EIGHT_NEIGHBORS(B, i);
const int b = EIGHT_NEIGHBORS(A, i);
dst[i] = (a * src[i] + b + (1 << 8)) >> 9;
dst[i] = (b - a * src[i] + (1 << 8)) >> 9;
}
dst += 384;
src += REST_UNIT_STRIDE;
@ -468,9 +468,8 @@ static void sgr_5x5_c(pixel *p, const ptrdiff_t p_stride,
const int w0 = params->sgr.w0;
for (int j = 0; j < h; j++) {
for (int i = 0; i < w; i++) {
const int u = (p[i] << 4);
const int v = (u << 7) + w0 * (dst[j * 384 + i] - u);
p[i] = iclip_pixel((v + (1 << 10)) >> 11);
const int v = w0 * dst[j * 384 + i];
p[i] = iclip_pixel(p[i] + ((v + (1 << 10)) >> 11));
}
p += PXSTRIDE(p_stride);
}
@ -492,9 +491,8 @@ static void sgr_3x3_c(pixel *p, const ptrdiff_t p_stride,
const int w1 = params->sgr.w1;
for (int j = 0; j < h; j++) {
for (int i = 0; i < w; i++) {
const int u = (p[i] << 4);
const int v = (u << 7) + w1 * (dst[j * 384 + i] - u);
p[i] = iclip_pixel((v + (1 << 10)) >> 11);
const int v = w1 * dst[j * 384 + i];
p[i] = iclip_pixel(p[i] + ((v + (1 << 10)) >> 11));
}
p += PXSTRIDE(p_stride);
}
@ -520,10 +518,8 @@ static void sgr_mix_c(pixel *p, const ptrdiff_t p_stride,
const int w1 = params->sgr.w1;
for (int j = 0; j < h; j++) {
for (int i = 0; i < w; i++) {
const int u = (p[i] << 4);
const int v = (u << 7) + w0 * (dst0[j * 384 + i] - u) +
w1 * (dst1[j * 384 + i] - u);
p[i] = iclip_pixel((v + (1 << 10)) >> 11);
const int v = w0 * dst0[j * 384 + i] + w1 * dst1[j * 384 + i];
p[i] = iclip_pixel(p[i] + ((v + (1 << 10)) >> 11));
}
p += PXSTRIDE(p_stride);
}
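Annotation (not part of the upstream diff): in the self-guided filter above, dst now carries a correction relative to the source sample (BB keeps x instead of 256 - x, and the neighbour sums compute b - a*src), so the per-pixel blend no longer interpolates against u = p << 4 and instead adds a weighted correction to the source. A self-contained sketch of the two blend forms as they appear in this hunk; sketch_clip stands in for iclip_pixel() at 8 bpc, and the sketch only illustrates the shape of the change, not bit-exactness.

static int sketch_clip(const int v) {
    return v < 0 ? 0 : v > 255 ? 255 : v;
}

/* Removed form: dst holds an absolute filtered value in (pixel << 4) units. */
static int sketch_sgr_blend_old(const int p, const int dst, const int w) {
    const int u = p << 4;
    const int v = (u << 7) + w * (dst - u);
    return sketch_clip((v + (1 << 10)) >> 11);
}

/* New form: dst already holds filtered-minus-source, so only the weighted
 * correction is added back to the source sample. */
static int sketch_sgr_blend_new(const int p, const int dst, const int w) {
    return sketch_clip(p + ((w * dst + (1 << 10)) >> 11));
}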

23
third_party/dav1d/src/lr_apply_tmpl.c (vendored)

@ -48,7 +48,7 @@ static void backup_lpf(const Dav1dFrameContext *const f,
const pixel *src, const ptrdiff_t src_stride,
const int ss_ver, const int sb128,
int row, const int row_h, const int src_w,
const int h, const int ss_hor, const int pft)
const int h, const int ss_hor)
{
const int dst_w = f->frame_hdr->super_res.enabled ?
(f->frame_hdr->width[1] + ss_hor) >> ss_hor : src_w;
@ -57,7 +57,7 @@ static void backup_lpf(const Dav1dFrameContext *const f,
int stripe_h = (64 - 8 * !row) >> ss_ver;
src += (stripe_h - 2) * PXSTRIDE(src_stride);
if (!pft) {
if (f->c->n_tc == 1) {
if (row) {
const int top = 4 << sb128;
// Copy the top part of the stored loop filtered pixels from the
@ -108,14 +108,15 @@ static void backup_lpf(const Dav1dFrameContext *const f,
void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
/*const*/ pixel *const src[3], const int sby)
{
const int pft = f->c->n_pfc > 1;
const int have_tt = f->c->n_tc > 1;
const int offset = 8 * !!sby;
const ptrdiff_t *const src_stride = f->cur.stride;
const ptrdiff_t lr_stride = ((f->sr_cur.p.p.w + 31) & ~31) * sizeof(pixel);
const ptrdiff_t tt_off = have_tt * sby * (4 << f->seq_hdr->sb128) * PXSTRIDE(lr_stride);
pixel *const dst[3] = {
f->lf.lr_lpf_line[0] + pft * sby * (4 << f->seq_hdr->sb128) * PXSTRIDE(lr_stride),
f->lf.lr_lpf_line[1] + pft * sby * (4 << f->seq_hdr->sb128) * PXSTRIDE(lr_stride),
f->lf.lr_lpf_line[2] + pft * sby * (4 << f->seq_hdr->sb128) * PXSTRIDE(lr_stride)
f->lf.lr_lpf_line[0] + tt_off,
f->lf.lr_lpf_line[1] + tt_off,
f->lf.lr_lpf_line[2] + tt_off
};
// TODO Also check block level restore type to reduce copying.
@ -128,7 +129,7 @@ void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset;
backup_lpf(f, dst[0], lr_stride,
src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0],
0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0, pft);
0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0);
}
if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) {
const int ss_ver = f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
@ -141,12 +142,12 @@ void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
if (restore_planes & LR_RESTORE_U) {
backup_lpf(f, dst[1], lr_stride,
src[1] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor, pft);
ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor);
}
if (restore_planes & LR_RESTORE_V) {
backup_lpf(f, dst[2], lr_stride,
src[2] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor, pft);
ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor);
}
}
}
@ -162,7 +163,9 @@ static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
const ptrdiff_t p_stride = f->sr_cur.p.stride[chroma];
const ptrdiff_t lpf_stride = sizeof(pixel) * ((f->sr_cur.p.p.w + 31) & ~31);
const int sby = (y + (y ? 8 << ss_ver : 0)) >> (6 - ss_ver + f->seq_hdr->sb128);
const pixel *lpf = f->lf.lr_lpf_line[plane] + (f->c->n_pfc > 1) * (sby * (4 << f->seq_hdr->sb128) - 4) * PXSTRIDE(lpf_stride) + x;
const int have_tt = f->c->n_tc > 1;
const pixel *lpf = f->lf.lr_lpf_line[plane] +
have_tt * (sby * (4 << f->seq_hdr->sb128) - 4) * PXSTRIDE(lpf_stride) + x;
// The first stripe of the frame is shorter by 8 luma pixel rows.
int stripe_h = imin((64 - 8 * !y) >> ss_ver, row_h - y);

23
third_party/dav1d/src/meson.build (vendored)

@ -92,6 +92,7 @@ if is_asm_enabled
libdav1d_sources += files(
'arm/cpu.c',
'arm/refmvs_init.c',
)
libdav1d_tmpl_sources += files(
'arm/cdef_init_tmpl.c',
@ -109,6 +110,7 @@ if is_asm_enabled
'arm/64/itx.S',
'arm/64/looprestoration_common.S',
'arm/64/msac.S',
'arm/64/refmvs.S',
)
if dav1d_bitdepths.contains('8')
@ -139,6 +141,7 @@ if is_asm_enabled
'arm/32/itx.S',
'arm/32/looprestoration_common.S',
'arm/32/msac.S',
'arm/32/refmvs.S',
)
if dav1d_bitdepths.contains('8')
@ -175,6 +178,7 @@ if is_asm_enabled
libdav1d_sources += files(
'x86/cpu.c',
'x86/msac_init.c',
'x86/refmvs_init.c',
)
libdav1d_tmpl_sources += files(
@ -191,10 +195,12 @@ if is_asm_enabled
libdav1d_sources_asm = files(
'x86/cpuid.asm',
'x86/msac.asm',
'x86/refmvs.asm',
'x86/cdef_avx2.asm',
'x86/itx_avx2.asm',
'x86/looprestoration_avx2.asm',
'x86/cdef_sse.asm',
'x86/itx_sse.asm',
)
if dav1d_bitdepths.contains('8')
@ -207,7 +213,6 @@ if is_asm_enabled
'x86/loopfilter_avx2.asm',
'x86/film_grain_sse.asm',
'x86/ipred_sse.asm',
'x86/itx_sse.asm',
'x86/loopfilter_sse.asm',
'x86/looprestoration_sse.asm',
'x86/mc_sse.asm',
@ -224,6 +229,8 @@ if is_asm_enabled
'x86/looprestoration16_avx2.asm',
'x86/mc16_avx2.asm',
'x86/cdef16_sse.asm',
'x86/film_grain16_sse.asm',
'x86/ipred16_sse.asm',
'x86/itx16_sse.asm',
'x86/loopfilter16_sse.asm',
'x86/looprestoration16_sse.asm',
@ -280,11 +287,11 @@ libdav1d_entrypoints_objs = static_library('dav1d_entrypoint',
rev_target, config_h_target,
include_directories : dav1d_inc_dirs,
dependencies: [stdatomic_dependency],
dependencies: [stdatomic_dependencies],
c_args : [stackalign_flag, stackrealign_flag, api_export_flags],
install : false,
build_by_default : false,
).extract_all_objects()
).extract_all_objects(recursive: true)
# Helper library for each bitdepth
libdav1d_bitdepth_objs = []
@ -293,11 +300,11 @@ foreach bitdepth : dav1d_bitdepths
'dav1d_bitdepth_@0@'.format(bitdepth),
libdav1d_tmpl_sources, config_h_target,
include_directories: dav1d_inc_dirs,
dependencies : [stdatomic_dependency],
dependencies : [stdatomic_dependencies],
c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + stackalign_flag,
install : false,
build_by_default : false,
).extract_all_objects()
).extract_all_objects(recursive: true)
endforeach
# Helper library for each bitdepth and architecture-specific flags
@ -306,11 +313,11 @@ foreach bitdepth : dav1d_bitdepths
'dav1d_arch_bitdepth_@0@'.format(bitdepth),
libdav1d_arch_tmpl_sources, config_h_target,
include_directories: dav1d_inc_dirs,
dependencies : [stdatomic_dependency],
dependencies : [stdatomic_dependencies],
c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + stackalign_flag + arch_flags,
install : false,
build_by_default : false,
).extract_all_objects()
).extract_all_objects(recursive: true)
endforeach
# The final dav1d library
@ -332,7 +339,7 @@ libdav1d = library('dav1d',
include_directories : dav1d_inc_dirs,
dependencies : [
stdatomic_dependency,
stdatomic_dependencies,
thread_dependency,
thread_compat_dep,
libdl_dependency,

16
third_party/dav1d/src/obu.c (vendored)

@ -1547,18 +1547,26 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
dav1d_data_props_copy(&c->out.m, &in->m);
c->event_flags |= dav1d_picture_get_event_flags(&c->refs[c->frame_hdr->existing_frame_idx].p);
} else {
pthread_mutex_lock(&c->task_thread.lock);
// need to append this to the frame output queue
const unsigned next = c->frame_thread.next++;
if (c->frame_thread.next == c->n_fc)
c->frame_thread.next = 0;
Dav1dFrameContext *const f = &c->fc[next];
pthread_mutex_lock(&f->frame_thread.td.lock);
while (f->n_tile_data > 0)
pthread_cond_wait(&f->frame_thread.td.cond,
&f->frame_thread.td.lock);
pthread_cond_wait(&f->task_thread.cond,
&f->task_thread.ttd->lock);
Dav1dThreadPicture *const out_delayed =
&c->frame_thread.out_delayed[next];
if (out_delayed->p.data[0] || atomic_load(&f->task_thread.error)) {
if (atomic_load(&c->task_thread.first) + 1U < c->n_fc)
atomic_fetch_add(&c->task_thread.first, 1U);
else
atomic_store(&c->task_thread.first, 0);
if (c->task_thread.cur < c->n_fc)
c->task_thread.cur--;
}
if (out_delayed->p.data[0]) {
const unsigned progress = atomic_load_explicit(&out_delayed->progress[1],
memory_order_relaxed);
@ -1572,7 +1580,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
&c->refs[c->frame_hdr->existing_frame_idx].p);
out_delayed->visible = 1;
dav1d_data_props_copy(&out_delayed->p.m, &in->m);
pthread_mutex_unlock(&f->frame_thread.td.lock);
pthread_mutex_unlock(&c->task_thread.lock);
}
if (c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr->frame_type == DAV1D_FRAME_TYPE_KEY) {
const int r = c->frame_hdr->existing_frame_idx;

52
third_party/dav1d/src/picture.c (vendored)

@ -176,7 +176,7 @@ int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f
const int bpc)
{
Dav1dThreadPicture *const p = &f->sr_cur;
p->t = c->n_fc > 1 ? &f->frame_thread.td : NULL;
const int have_frame_mt = c->n_fc > 1;
const int res =
picture_alloc_with_edges(c, &p->p, f->frame_hdr->width[1], f->frame_hdr->height,
@ -186,7 +186,7 @@ int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f
c->mastering_display, c->mastering_display_ref,
c->itut_t35, c->itut_t35_ref,
bpc, &f->tile[0].data.m, &c->allocator,
p->t != NULL ? sizeof(atomic_int) * 2 : 0,
have_frame_mt ? sizeof(atomic_int) * 2 : 0,
(void **) &p->progress);
if (res) return res;
@ -198,7 +198,7 @@ int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f
c->frame_flags = 0;
p->visible = f->frame_hdr->show_frame;
if (p->t) {
if (have_frame_mt) {
atomic_init(&p->progress[0], 0);
atomic_init(&p->progress[1], 0);
}
@ -254,7 +254,6 @@ void dav1d_thread_picture_ref(Dav1dThreadPicture *const dst,
const Dav1dThreadPicture *const src)
{
dav1d_picture_ref(&dst->p, &src->p);
dst->t = src->t;
dst->visible = src->visible;
dst->progress = src->progress;
dst->flags = src->flags;
@ -279,54 +278,9 @@ void dav1d_picture_unref_internal(Dav1dPicture *const p) {
void dav1d_thread_picture_unref(Dav1dThreadPicture *const p) {
dav1d_picture_unref_internal(&p->p);
p->t = NULL;
p->progress = NULL;
}
int dav1d_thread_picture_wait(const Dav1dThreadPicture *const p,
int y_unclipped, const enum PlaneType plane_type)
{
assert(plane_type != PLANE_TYPE_ALL);
if (!p->t)
return 0;
// convert to luma units; include plane delay from loopfilters; clip
const int ss_ver = p->p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
y_unclipped *= 1 << (plane_type & ss_ver); // we rely here on PLANE_TYPE_UV being 1
y_unclipped += (plane_type != PLANE_TYPE_BLOCK) * 8; // delay imposed by loopfilter
const unsigned y = iclip(y_unclipped, 1, p->p.p.h);
atomic_uint *const progress = &p->progress[plane_type != PLANE_TYPE_BLOCK];
unsigned state;
if ((state = atomic_load_explicit(progress, memory_order_acquire)) >= y)
return state == FRAME_ERROR;
pthread_mutex_lock(&p->t->lock);
while ((state = atomic_load_explicit(progress, memory_order_relaxed)) < y)
pthread_cond_wait(&p->t->cond, &p->t->lock);
pthread_mutex_unlock(&p->t->lock);
return state == FRAME_ERROR;
}
void dav1d_thread_picture_signal(const Dav1dThreadPicture *const p,
const int y, // in pixel units
const enum PlaneType plane_type)
{
assert(plane_type != PLANE_TYPE_UV);
if (!p->t)
return;
pthread_mutex_lock(&p->t->lock);
if (plane_type != PLANE_TYPE_Y)
atomic_store(&p->progress[0], y);
if (plane_type != PLANE_TYPE_BLOCK)
atomic_store(&p->progress[1], y);
pthread_cond_broadcast(&p->t->cond);
pthread_mutex_unlock(&p->t->lock);
}
enum Dav1dEventFlags dav1d_picture_get_event_flags(const Dav1dThreadPicture *const p) {
if (!p->flags)
return 0;
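Annotation (not part of the upstream diff): with struct thread_data removed from Dav1dThreadPicture, consumers no longer block on a per-picture mutex/cond pair; readiness is observed through the two atomic progress counters kept above ([0] tracks block data, [1] pixel data). A minimal sketch of such a lock-free readiness check, assuming the same acquire ordering the removed wait helper used.

#include <stdatomic.h>

/* Returns nonzero once at least y rows of the tracked plane type have been
 * published; the acquire load pairs with the decoder's store of new rows. */
static int sketch_rows_ready(atomic_uint *const progress, const unsigned y) {
    return atomic_load_explicit(progress, memory_order_acquire) >= y;
}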

26
third_party/dav1d/src/picture.h (vendored)

@ -52,7 +52,6 @@ typedef struct Dav1dThreadPicture {
Dav1dPicture p;
int visible;
enum PictureFlags flags;
struct thread_data *t;
// [0] block data (including segmentation map and motion vectors)
// [1] pixel data
atomic_uint *progress;
@ -91,31 +90,6 @@ void dav1d_thread_picture_unref(Dav1dThreadPicture *p);
*/
void dav1d_picture_move_ref(Dav1dPicture *dst, Dav1dPicture *src);
/**
* Wait for picture to reach a certain stage.
*
* y is in full-pixel units. If pt is not UV, this is in luma
* units, else it is in chroma units.
* plane_type is used to determine how many pixels delay are
* introduced by loopfilter processes.
*
* Returns 0 on success, and 1 if there was an error while decoding p
*/
int dav1d_thread_picture_wait(const Dav1dThreadPicture *p, int y,
enum PlaneType plane_type);
/**
* Signal decoding progress.
*
* y is in full-pixel luma units. FRAME_ERROR is used to signal a decoding
* error to frames using this frame as reference frame.
* plane_type denotes whether we have completed block data (pass 1;
* PLANE_TYPE_BLOCK), pixel data (pass 2, PLANE_TYPE_Y) or both (no
* 2-pass decoding; PLANE_TYPE_ALL).
*/
void dav1d_thread_picture_signal(const Dav1dThreadPicture *p, int y,
enum PlaneType plane_type);
int dav1d_default_picture_alloc(Dav1dPicture *p, void *cookie);
void dav1d_default_picture_release(Dav1dPicture *p, void *cookie);
void dav1d_picture_unref_internal(Dav1dPicture *p);

14
third_party/dav1d/src/recon.h (vendored)

@ -37,12 +37,12 @@
#define DEBUG_B_PIXELS 0
#define decl_recon_b_intra_fn(name) \
void (name)(Dav1dTileContext *t, enum BlockSize bs, \
void (name)(Dav1dTaskContext *t, enum BlockSize bs, \
enum EdgeFlags intra_edge_flags, const Av1Block *b)
typedef decl_recon_b_intra_fn(*recon_b_intra_fn);
#define decl_recon_b_inter_fn(name) \
int (name)(Dav1dTileContext *t, enum BlockSize bs, const Av1Block *b)
int (name)(Dav1dTaskContext *t, enum BlockSize bs, const Av1Block *b)
typedef decl_recon_b_inter_fn(*recon_b_inter_fn);
#define decl_filter_sbrow_fn(name) \
@ -50,11 +50,11 @@ void (name)(Dav1dFrameContext *f, int sby)
typedef decl_filter_sbrow_fn(*filter_sbrow_fn);
#define decl_backup_ipred_edge_fn(name) \
void (name)(Dav1dTileContext *t)
void (name)(Dav1dTaskContext *t)
typedef decl_backup_ipred_edge_fn(*backup_ipred_edge_fn);
#define decl_read_coef_blocks_fn(name) \
void (name)(Dav1dTileContext *t, enum BlockSize bs, const Av1Block *b)
void (name)(Dav1dTaskContext *t, enum BlockSize bs, const Av1Block *b)
typedef decl_read_coef_blocks_fn(*read_coef_blocks_fn);
decl_recon_b_intra_fn(dav1d_recon_b_intra_8bpc);
@ -65,8 +65,10 @@ decl_recon_b_inter_fn(dav1d_recon_b_inter_16bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_8bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_16bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_8bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_16bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_cols_8bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_cols_16bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_rows_8bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_rows_16bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_cdef_8bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_cdef_16bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_resize_8bpc);

130
third_party/dav1d/src/recon_tmpl.c (vendored)

@ -318,7 +318,7 @@ static inline unsigned get_lo_ctx(const uint8_t *const levels,
return offset + (mag > 512 ? 4 : (mag + 64) >> 7);
}
static int decode_coefs(Dav1dTileContext *const t,
static int decode_coefs(Dav1dTaskContext *const t,
uint8_t *const a, uint8_t *const l,
const enum RectTxfmSize tx, const enum BlockSize bs,
const Av1Block *const b, const int intra,
@ -719,7 +719,7 @@ static int decode_coefs(Dav1dTileContext *const t,
return eob;
}
static void read_coef_tree(Dav1dTileContext *const t,
static void read_coef_tree(Dav1dTaskContext *const t,
const enum BlockSize bs, const Av1Block *const b,
const enum RectTxfmSize ytx, const int depth,
const uint16_t *const tx_split,
@ -768,15 +768,16 @@ static void read_coef_tree(Dav1dTileContext *const t,
coef *cf;
struct CodedBlockInfo *cbi;
if (f->frame_thread.pass) {
assert(ts->frame_thread.cf);
cf = ts->frame_thread.cf;
ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1;
assert(ts->frame_thread[p].cf);
cf = ts->frame_thread[p].cf;
ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
cbi = &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
} else {
cf = bitfn(t->cf);
}
if (f->frame_thread.pass != 2) {
if (t->frame_thread.pass != 2) {
eob = decode_coefs(t, &t->a->lcoef[bx4], &t->l.lcoef[by4],
ytx, bs, b, 0, 0, cf, &txtp, &cf_ctx);
if (DEBUG_BLOCK_INFO)
@ -798,7 +799,7 @@ static void read_coef_tree(Dav1dTileContext *const t,
uint8_t *txtp_map = &t->txtp_map[by4 * 32 + bx4];
case_set_upto16(txw,,,);
#undef set_ctx
if (f->frame_thread.pass == 1) {
if (t->frame_thread.pass == 1) {
cbi->eob[0] = eob;
cbi->txtp[0] = txtp;
}
@ -806,7 +807,7 @@ static void read_coef_tree(Dav1dTileContext *const t,
eob = cbi->eob[0];
txtp = cbi->txtp[0];
}
if (!(f->frame_thread.pass & 1)) {
if (!(t->frame_thread.pass & 1)) {
assert(dst);
if (eob >= 0) {
if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
@ -820,7 +821,7 @@ static void read_coef_tree(Dav1dTileContext *const t,
}
}
void bytefn(dav1d_read_coef_blocks)(Dav1dTileContext *const t,
void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
const enum BlockSize bs, const Av1Block *const b)
{
const Dav1dFrameContext *const f = t->f;
@ -855,7 +856,7 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTileContext *const t,
Dav1dTileState *const ts = t->ts;
const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
assert(f->frame_thread.pass == 1);
assert(t->frame_thread.pass == 1);
assert(!b->skip);
const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->intra ? b->tx : b->max_ytx];
@ -884,12 +885,12 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTileContext *const t,
const int eob = cbi[t->bx].eob[0] =
decode_coefs(t, &t->a->lcoef[bx4 + x],
&t->l.lcoef[by4 + y], b->tx, bs, b, 1,
0, ts->frame_thread.cf, &txtp, &cf_ctx);
0, ts->frame_thread[1].cf, &txtp, &cf_ctx);
if (DEBUG_BLOCK_INFO)
printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
b->tx, txtp, eob, ts->msac.rng);
cbi[t->bx].txtp[0] = txtp;
ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
ts->frame_thread[1].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
#define default_memset(dir, diridx, off, sz) \
@ -927,14 +928,14 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTileContext *const t,
const int eob = cbi[t->bx].eob[1 + pl] =
decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
&t->l.ccoef[pl][cby4 + y], b->uvtx, bs,
b, b->intra, 1 + pl, ts->frame_thread.cf,
b, b->intra, 1 + pl, ts->frame_thread[1].cf,
&txtp, &cf_ctx);
if (DEBUG_BLOCK_INFO)
printf("Post-uv-cf-blk[pl=%d,tx=%d,"
"txtp=%d,eob=%d]: r=%d\n",
pl, b->uvtx, txtp, eob, ts->msac.rng);
cbi[t->bx].txtp[1 + pl] = txtp;
ts->frame_thread.cf += uv_t_dim->w * uv_t_dim->h * 16;
ts->frame_thread[1].cf += uv_t_dim->w * uv_t_dim->h * 16;
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
#define default_memset(dir, diridx, off, sz) \
@ -956,7 +957,7 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTileContext *const t,
}
}
static int mc(Dav1dTileContext *const t,
static int mc(Dav1dTaskContext *const t,
pixel *const dst8, int16_t *const dst16, const ptrdiff_t dst_stride,
const int bw4, const int bh4,
const int bx, const int by, const int pl,
@ -979,11 +980,6 @@ static int mc(Dav1dTileContext *const t,
int w, h;
if (refp->p.data[0] != f->cur.data[0]) { // i.e. not for intrabc
if (dav1d_thread_picture_wait(refp, dy + bh4 * v_mul + !!my * 4,
PLANE_TYPE_Y + !!pl))
{
return -1;
}
w = (f->cur.p.w + ss_hor) >> ss_hor;
h = (f->cur.p.h + ss_ver) >> ss_ver;
} else {
@ -1034,8 +1030,6 @@ static int mc(Dav1dTileContext *const t,
const int bottom =
((pos_y + (bh4 * v_mul - 1) * f->svc[refidx][1].step) >> 10) + 1;
if (dav1d_thread_picture_wait(refp, bottom + 4, PLANE_TYPE_Y + !!pl))
return -1;
if (DEBUG_BLOCK_INFO)
printf("Off %dx%d [%d,%d,%d], size %dx%d [%d,%d]\n",
left, top, orig_pos_x, f->svc[refidx][0].scale, refidx,
@ -1077,7 +1071,7 @@ static int mc(Dav1dTileContext *const t,
return 0;
}
static int obmc(Dav1dTileContext *const t,
static int obmc(Dav1dTaskContext *const t,
pixel *const dst, const ptrdiff_t dst_stride,
const uint8_t *const b_dim, const int pl,
const int bx4, const int by4, const int w4, const int h4)
@ -1138,7 +1132,7 @@ static int obmc(Dav1dTileContext *const t,
return 0;
}
static int warp_affine(Dav1dTileContext *const t,
static int warp_affine(Dav1dTaskContext *const t,
pixel *dst8, int16_t *dst16, const ptrdiff_t dstride,
const uint8_t *const b_dim, const int pl,
const Dav1dThreadPicture *const refp,
@ -1176,11 +1170,6 @@ static int warp_affine(Dav1dTileContext *const t,
const pixel *ref_ptr;
ptrdiff_t ref_stride = refp->p.stride[!!pl];
if (dav1d_thread_picture_wait(refp, dy + 4 + 8,
PLANE_TYPE_Y + !!pl))
{
return -1;
}
if (dx < 3 || dx + 8 + 4 > width || dy < 3 || dy + 8 + 4 > height) {
pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
f->dsp->mc.emu_edge(15, 15, width, height, dx - 3, dy - 3,
@ -1204,7 +1193,7 @@ static int warp_affine(Dav1dTileContext *const t,
return 0;
}
void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize bs,
void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize bs,
const enum EdgeFlags intra_edge_flags,
const Av1Block *const b)
{
@ -1239,14 +1228,15 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
pixel *dst = ((pixel *) f->cur.data[0]) +
4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
const uint8_t *pal_idx;
if (f->frame_thread.pass) {
assert(ts->frame_thread.pal_idx);
pal_idx = ts->frame_thread.pal_idx;
ts->frame_thread.pal_idx += bw4 * bh4 * 16;
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1;
assert(ts->frame_thread[p].pal_idx);
pal_idx = ts->frame_thread[p].pal_idx;
ts->frame_thread[p].pal_idx += bw4 * bh4 * 16;
} else {
pal_idx = t->scratch.pal_idx;
}
const uint16_t *const pal = f->frame_thread.pass ?
const uint16_t *const pal = t->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
((t->bx >> 1) + (t->by & 1))][0] : t->scratch.pal[0];
f->dsp->ipred.pal_pred(dst, f->cur.stride[0], pal,
@ -1323,9 +1313,10 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
coef *cf;
int eob;
enum TxfmType txtp;
if (f->frame_thread.pass) {
cf = ts->frame_thread.cf;
ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1;
cf = ts->frame_thread[p].cf;
ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
const struct CodedBlockInfo *const cbi =
&f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
eob = cbi->eob[0];
@ -1362,7 +1353,7 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
hex_dump(dst, f->cur.stride[0],
t_dim->w * 4, t_dim->h * 4, "recon");
}
} else if (!f->frame_thread.pass) {
} else if (!t->frame_thread.pass) {
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir lcoef, off, mul * 0x40)
case_set_upto16(t_dim->h, l., 1, by4 + y);
@ -1435,12 +1426,13 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
(t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
const uint16_t (*pal)[8];
const uint8_t *pal_idx;
if (f->frame_thread.pass) {
assert(ts->frame_thread.pal_idx);
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1;
assert(ts->frame_thread[p].pal_idx);
pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
((t->bx >> 1) + (t->by & 1))];
pal_idx = ts->frame_thread.pal_idx;
ts->frame_thread.pal_idx += cbw4 * cbh4 * 16;
pal_idx = ts->frame_thread[p].pal_idx;
ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 16;
} else {
pal = t->scratch.pal;
pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16];
@ -1545,9 +1537,10 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
enum TxfmType txtp;
int eob;
coef *cf;
if (f->frame_thread.pass) {
cf = ts->frame_thread.cf;
ts->frame_thread.cf += uv_t_dim->w * uv_t_dim->h * 16;
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1;
cf = ts->frame_thread[p].cf;
ts->frame_thread[p].cf += uv_t_dim->w * uv_t_dim->h * 16;
const struct CodedBlockInfo *const cbi =
&f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
eob = cbi->eob[pl + 1];
@ -1587,7 +1580,7 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
hex_dump(dst, stride, uv_t_dim->w * 4,
uv_t_dim->h * 4, "recon");
}
} else if (!f->frame_thread.pass) {
} else if (!t->frame_thread.pass) {
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir ccoef[pl], off, mul * 0x40)
case_set_upto16(uv_t_dim->h, l., 1, cby4 + y);
@ -1604,7 +1597,7 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
}
}
int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize bs,
int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize bs,
const Av1Block *const b)
{
Dav1dTileState *const ts = t->ts;
@ -1719,7 +1712,7 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
r[-1][t->bx - 1].mv.mv[0],
&f->refp[r[-1][t->bx - 1].ref.ref[0] - 1],
r[-1][t->bx - 1].ref.ref[0] - 1,
f->frame_thread.pass != 2 ? t->tl_4x4_filter :
t->frame_thread.pass != 2 ? t->tl_4x4_filter :
f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d);
if (res) return res;
}
@ -1735,7 +1728,7 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
t->by, 1 + pl, r[0][t->bx - 1].mv.mv[0],
&f->refp[r[0][t->bx - 1].ref.ref[0] - 1],
r[0][t->bx - 1].ref.ref[0] - 1,
f->frame_thread.pass != 2 ? left_filter_2d :
t->frame_thread.pass != 2 ? left_filter_2d :
f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d);
if (res) return res;
}
@ -1750,7 +1743,7 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
1 + pl, r[-1][t->bx].mv.mv[0],
&f->refp[r[-1][t->bx].ref.ref[0] - 1],
r[-1][t->bx].ref.ref[0] - 1,
f->frame_thread.pass != 2 ? top_filter_2d :
t->frame_thread.pass != 2 ? top_filter_2d :
f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d);
if (res) return res;
}
@ -1994,9 +1987,10 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
coef *cf;
int eob;
enum TxfmType txtp;
if (f->frame_thread.pass) {
cf = ts->frame_thread.cf;
ts->frame_thread.cf += uvtx->w * uvtx->h * 16;
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1;
cf = ts->frame_thread[p].cf;
ts->frame_thread[p].cf += uvtx->w * uvtx->h * 16;
const struct CodedBlockInfo *const cbi =
&f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
eob = cbi->eob[1 + pl];
@ -2051,7 +2045,21 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
return 0;
}
void bytefn(dav1d_filter_sbrow_deblock)(Dav1dFrameContext*const f, const int sby) {
void bytefn(dav1d_filter_sbrow_deblock_cols)(Dav1dFrameContext*const f, const int sby) {
const int y = sby * f->sb_step * 4;
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
pixel *const p[3] = {
f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
};
Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
if (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1])
bytefn(dav1d_loopfilter_sbrow_cols)(f, p, mask, sby,
f->lf.start_of_tile_row[sby]);
}
void bytefn(dav1d_filter_sbrow_deblock_rows)(Dav1dFrameContext*const f, const int sby) {
const int y = sby * f->sb_step * 4;
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
pixel *const p[3] = {
@ -2061,10 +2069,7 @@ void bytefn(dav1d_filter_sbrow_deblock)(Dav1dFrameContext*const f, const int sby
};
Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
if (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1]) {
int start_of_tile_row = 0;
if (f->frame_hdr->tiling.row_start_sb[f->lf.tile_row] == sby)
start_of_tile_row = f->lf.tile_row++;
bytefn(dav1d_loopfilter_sbrow)(f, p, mask, sby, start_of_tile_row);
bytefn(dav1d_loopfilter_sbrow_rows)(f, p, mask, sby);
}
if (f->lf.restore_planes) {
// Store loop filtered pixels required by loop restoration
@ -2145,7 +2150,8 @@ void bytefn(dav1d_filter_sbrow_lr)(Dav1dFrameContext *const f, const int sby) {
}
void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
bytefn(dav1d_filter_sbrow_deblock)(f, sby);
bytefn(dav1d_filter_sbrow_deblock_cols)(f, sby);
bytefn(dav1d_filter_sbrow_deblock_rows)(f, sby);
if (f->seq_hdr->cdef)
bytefn(dav1d_filter_sbrow_cdef)(f, sby);
if (f->frame_hdr->width[0] != f->frame_hdr->width[1])
@ -2154,7 +2160,7 @@ void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
bytefn(dav1d_filter_sbrow_lr)(f, sby);
}
void bytefn(dav1d_backup_ipred_edge)(Dav1dTileContext *const t) {
void bytefn(dav1d_backup_ipred_edge)(Dav1dTaskContext *const t) {
const Dav1dFrameContext *const f = t->f;
Dav1dTileState *const ts = t->ts;
const int sby = t->by >> f->sb_shift;
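Annotation (not part of the upstream diff): the pass number now lives on the task context (t->frame_thread.pass) rather than the frame context, and the per-tile-state coefficient and palette-index cursors are duplicated so the entropy pass (1) and the reconstruction pass (2) can be in flight at the same time; the hunks above select the buffer with (pass & 1). A small sketch of that selection, with stand-in types for dav1d's coef and tile-state fields.

#include <stdint.h>

typedef int16_t sketch_coef;                 /* stand-in for dav1d's coef type */
struct sketch_ts_pass { sketch_coef *cf; };  /* mirrors ts->frame_thread[p]    */

/* pass 0: single-pass decoding, use the task-local buffer.
 * pass 1 (entropy) and pass 2 (reconstruction): use per-tile cursors,
 * selected with (pass & 1) so the two passes touch distinct storage. */
static sketch_coef *sketch_pick_cf(const int pass,
                                   struct sketch_ts_pass ts_pass[2],
                                   sketch_coef *const task_local_cf,
                                   const int n_coefs)
{
    if (!pass) return task_local_cf;
    const int p = pass & 1;
    sketch_coef *const cf = ts_pass[p].cf;
    ts_pass[p].cf += n_coefs;   /* advance the shared cursor */
    return cf;
}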

47
third_party/dav1d/src/refmvs.c (vendored)

@ -35,6 +35,7 @@
#include "common/intops.h"
#include "src/env.h"
#include "src/mem.h"
#include "src/refmvs.h"
static void add_spatial_candidate(refmvs_candidate *const mvstack, int *const cnt,
@ -652,11 +653,14 @@ void dav1d_refmvs_find(const refmvs_tile *const rt,
void dav1d_refmvs_tile_sbrow_init(refmvs_tile *const rt, const refmvs_frame *const rf,
const int tile_col_start4, const int tile_col_end4,
const int tile_row_start4, const int tile_row_end4,
const int sby, int tile_row_idx)
const int sby, int tile_row_idx, const int pass)
{
if (rf->n_tile_threads == 1) tile_row_idx = 0;
rt->rp_proj = &rf->rp_proj[16 * rf->rp_stride * tile_row_idx];
refmvs_block *r = &rf->r[35 * rf->r_stride * tile_row_idx];
const int uses_2pass = rf->n_tile_threads > 1 && rf->n_frame_threads > 1;
const ptrdiff_t pass_off = (uses_2pass && pass == 2) ?
35 * rf->r_stride * rf->n_tile_rows : 0;
refmvs_block *r = &rf->r[35 * rf->r_stride * tile_row_idx + pass_off];
const int sbsz = rf->sbsz;
const int off = (sbsz * sby) & 16;
for (int i = 0; i < sbsz; i++, r += rf->r_stride)
@ -805,7 +809,7 @@ int dav1d_refmvs_init_frame(refmvs_frame *const rf,
refmvs_temporal_block *const rp,
const unsigned ref_ref_poc[7][7],
/*const*/ refmvs_temporal_block *const rp_ref[7],
const int n_tile_threads)
const int n_tile_threads, const int n_frame_threads)
{
rf->sbsz = 16 << seq_hdr->sb128;
rf->frm_hdr = frm_hdr;
@ -817,21 +821,23 @@ int dav1d_refmvs_init_frame(refmvs_frame *const rf,
const ptrdiff_t r_stride = ((frm_hdr->width[0] + 127) & ~127) >> 2;
const int n_tile_rows = n_tile_threads > 1 ? frm_hdr->tiling.rows : 1;
if (r_stride != rf->r_stride || n_tile_rows != rf->n_tile_rows) {
if (rf->r) free(rf->r);
rf->r = malloc(sizeof(*rf->r) * 35 * r_stride * n_tile_rows);
if (rf->r) dav1d_freep_aligned(&rf->r);
const int uses_2pass = n_tile_threads > 1 && n_frame_threads > 1;
rf->r = dav1d_alloc_aligned(sizeof(*rf->r) * 35 * r_stride * n_tile_rows * (1 + uses_2pass), 64);
if (!rf->r) return DAV1D_ERR(ENOMEM);
rf->r_stride = r_stride;
}
const ptrdiff_t rp_stride = r_stride >> 1;
if (rp_stride != rf->rp_stride || n_tile_rows != rf->n_tile_rows) {
if (rf->rp_proj) free(rf->rp_proj);
rf->rp_proj = malloc(sizeof(*rf->rp_proj) * 16 * rp_stride * n_tile_rows);
if (rf->rp_proj) dav1d_freep_aligned(&rf->rp_proj);
rf->rp_proj = dav1d_alloc_aligned(sizeof(*rf->rp_proj) * 16 * rp_stride * n_tile_rows, 64);
if (!rf->rp_proj) return DAV1D_ERR(ENOMEM);
rf->rp_stride = rp_stride;
}
rf->n_tile_rows = n_tile_rows;
rf->n_tile_threads = n_tile_threads;
rf->n_frame_threads = n_frame_threads;
rf->rp = rp;
rf->rp_ref = rp_ref;
const unsigned poc = frm_hdr->frame_offset;
@ -902,6 +908,29 @@ void dav1d_refmvs_init(refmvs_frame *const rf) {
}
void dav1d_refmvs_clear(refmvs_frame *const rf) {
if (rf->r) free(rf->r);
if (rf->rp_proj) free(rf->rp_proj);
if (rf->r) dav1d_freep_aligned(&rf->r);
if (rf->rp_proj) dav1d_freep_aligned(&rf->rp_proj);
}
static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv,
const int bx4, const int bw4, int bh4)
{
do {
refmvs_block *const r = *rr++ + bx4;
for (int x = 0; x < bw4; x++)
r[x] = *rmv;
} while (--bh4);
}
COLD void dav1d_refmvs_dsp_init(Dav1dRefmvsDSPContext *const c)
{
c->splat_mv = splat_mv_c;
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
dav1d_refmvs_dsp_init_arm(c);
#elif ARCH_X86
dav1d_refmvs_dsp_init_x86(c);
#endif
#endif
}

114
third_party/dav1d/src/refmvs.h (vendored)

@ -35,7 +35,6 @@
#include "common/intops.h"
#include "src/intra_edge.h"
#include "src/levels.h"
#include "src/tables.h"
#define INVALID_MV 0x80008000
@ -55,11 +54,11 @@ typedef union refmvs_mvpair {
uint64_t n;
} refmvs_mvpair;
typedef struct refmvs_block {
PACKED(typedef struct refmvs_block {
refmvs_mvpair mv;
refmvs_refpair ref;
uint8_t bs, mf; // 1 = globalmv+affine, 2 = newmv
} refmvs_block;
}) ALIGN(refmvs_block, 4);
typedef struct refmvs_frame {
const Dav1dFrameHeader *frm_hdr;
@ -80,7 +79,7 @@ typedef struct refmvs_frame {
refmvs_block *r; // 35 x r_stride memory
ptrdiff_t r_stride;
int n_tile_rows, n_tile_threads;
int n_tile_rows, n_tile_threads, n_frame_threads;
} refmvs_frame;
typedef struct refmvs_tile {
@ -97,6 +96,14 @@ typedef struct refmvs_candidate {
int weight;
} refmvs_candidate;
#define decl_splat_mv_fn(name) \
void (name)(refmvs_block **rr, const refmvs_block *rmv, int bx4, int bw4, int bh4)
typedef decl_splat_mv_fn(*splat_mv_fn);
typedef struct Dav1dRefmvsDSPContext {
splat_mv_fn splat_mv;
} Dav1dRefmvsDSPContext;
// call once per frame thread
void dav1d_refmvs_init(refmvs_frame *rf);
void dav1d_refmvs_clear(refmvs_frame *rf);
@ -109,7 +116,7 @@ int dav1d_refmvs_init_frame(refmvs_frame *rf,
refmvs_temporal_block *rp,
const unsigned ref_ref_poc[7][7],
/*const*/ refmvs_temporal_block *const rp_ref[7],
int n_tile_threads);
int n_tile_threads, int n_frame_threads);
// initialize temporal MVs; this can be done in any configuration, e.g. one
// tile/sbrow at a time, where col_{start,end}8 are the tile boundaries; or
@ -129,7 +136,7 @@ void dav1d_refmvs_save_tmvs(const refmvs_tile *rt,
void dav1d_refmvs_tile_sbrow_init(refmvs_tile *rt, const refmvs_frame *rf,
int tile_col_start4, int tile_col_end4,
int tile_row_start4, int tile_row_end4,
int sby, int tile_row_idx);
int sby, int tile_row_idx, int pass);
// call for each block
void dav1d_refmvs_find(const refmvs_tile *rt,
@ -137,97 +144,8 @@ void dav1d_refmvs_find(const refmvs_tile *rt,
int *ctx, const refmvs_refpair ref, enum BlockSize bs,
enum EdgeFlags edge_flags, int by4, int bx4);
static inline void splat_oneref_mv(refmvs_tile *const rt,
const int by4, const int bx4,
const enum BlockSize bs,
const enum InterPredMode mode,
const int ref, const mv mv,
const int is_interintra)
{
const int bw4 = dav1d_block_dimensions[bs][0];
int bh4 = dav1d_block_dimensions[bs][1];
refmvs_block **rr = &rt->r[(by4 & 31) + 5];
const refmvs_block tmpl = (refmvs_block) {
.ref.ref = { ref + 1, is_interintra ? 0 : -1 },
.mv.mv[0] = mv,
.bs = bs,
.mf = (mode == GLOBALMV && imin(bw4, bh4) >= 2) | ((mode == NEWMV) * 2),
};
do {
refmvs_block *r = *rr++ + bx4;
for (int x = 0; x < bw4; x++)
r[x] = tmpl;
} while (--bh4);
}
static inline void splat_intrabc_mv(refmvs_tile *const rt,
const int by4, const int bx4,
const enum BlockSize bs, const mv mv)
{
const int bw4 = dav1d_block_dimensions[bs][0];
int bh4 = dav1d_block_dimensions[bs][1];
refmvs_block **rr = &rt->r[(by4 & 31) + 5];
const refmvs_block tmpl = (refmvs_block) {
.ref.ref = { 0, -1 },
.mv.mv[0] = mv,
.bs = bs,
.mf = 0,
};
do {
refmvs_block *r = *rr++ + bx4;
for (int x = 0; x < bw4; x++) {
r[x] = tmpl;
}
} while (--bh4);
}
static inline void splat_tworef_mv(refmvs_tile *const rt,
const int by4, const int bx4,
const enum BlockSize bs,
const enum CompInterPredMode mode,
const refmvs_refpair ref,
const refmvs_mvpair mv)
{
const int bw4 = dav1d_block_dimensions[bs][0];
int bh4 = dav1d_block_dimensions[bs][1];
refmvs_block **rr = &rt->r[(by4 & 31) + 5];
assert(bw4 >= 2 && bh4 >= 2);
const refmvs_block tmpl = (refmvs_block) {
.ref.pair = ref.pair + 0x0101,
.mv = mv,
.bs = bs,
.mf = (mode == GLOBALMV_GLOBALMV) | !!((1 << mode) & (0xbc)) * 2,
};
do {
refmvs_block *r = *rr++ + bx4;
for (int x = 0; x < bw4; x++)
r[x] = tmpl;
} while (--bh4);
}
static inline void splat_intraref(refmvs_tile *const rt,
const int by4, const int bx4,
const enum BlockSize bs)
{
const int bw4 = dav1d_block_dimensions[bs][0];
int bh4 = dav1d_block_dimensions[bs][1];
refmvs_block **rr = &rt->r[(by4 & 31) + 5];
const refmvs_block tmpl = (refmvs_block) {
.ref.ref = { 0, -1 },
.mv.mv[0].n = INVALID_MV,
.bs = bs,
.mf = 0,
};
do {
refmvs_block *r = *rr++ + bx4;
for (int x = 0; x < bw4; x++) {
r[x] = tmpl;
}
} while (--bh4);
}
void dav1d_refmvs_dsp_init(Dav1dRefmvsDSPContext *dsp);
void dav1d_refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *dsp);
void dav1d_refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *dsp);
#endif /* DAV1D_SRC_REF_MVS_H */
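Annotation (not part of the upstream diff): the inline splat_* helpers removed above are superseded by the splat_mv DSP hook (C reference splat_mv_c in refmvs.c above, with x86/ARM assembly added via the build files in this update). A sketch of how one of the removed helpers could be expressed on top of the hook; the real call sites are not part of this excerpt, so the function below is illustrative only, assumes the types from src/refmvs.h, and takes bw4/bh4 from the caller instead of looking them up in dav1d_block_dimensions.

#include "src/refmvs.h"

static void sketch_splat_intraref(const Dav1dRefmvsDSPContext *const dsp,
                                  refmvs_tile *const rt,
                                  const int by4, const int bx4,
                                  const enum BlockSize bs,
                                  const int bw4, const int bh4)
{
    const refmvs_block tmpl = (refmvs_block) {
        .ref.ref = { 0, -1 },
        .mv.mv[0].n = INVALID_MV,
        .bs = bs,
        .mf = 0,
    };
    /* splat_mv fills bw4 x bh4 4x4 units starting at bx4 in each row. */
    dsp->splat_mv(&rt->r[(by4 & 31) + 5], &tmpl, bx4, bw4, bh4);
}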

914
third_party/dav1d/src/thread_task.c (vendored)

@ -27,345 +27,661 @@
#include "config.h"
#include "common/frame.h"
#include "src/thread_task.h"
int dav1d_task_create_filter_sbrow(Dav1dFrameContext *const f) {
struct PostFilterThreadData *const pftd = f->lf.thread.pftd;
const int frame_idx = (int)(f - f->c->fc);
// This function resets the cur pointer to the first frame theoretically
// executable after a task completed (ie. each time we update some progress or
// insert some tasks in the queue).
// When frame_idx is set, it can be either from a completed task, or from tasks
// inserted in the queue, in which case we have to make sure the cur pointer
// isn't past this insert.
// The special case where frame_idx is UINT_MAX is to handle the reset after
// completing a task and locklessly signaling progress. In this case we don't
// enter a critical section, which is needed for this function, so we set an
// atomic for a delayed handling, happening here. Meaning we can call this
// function without any actual update other than what's in the atomic, hence
// this special case.
static inline int reset_task_cur(const Dav1dContext *const c,
struct TaskThreadData *const ttd,
unsigned frame_idx)
{
const unsigned first = atomic_load(&ttd->first);
if (!ttd->cur && c->fc[first].task_thread.task_cur_prev == NULL)
return 0;
unsigned reset_frame_idx = atomic_exchange(&ttd->reset_task_cur, UINT_MAX);
if (reset_frame_idx != UINT_MAX) {
if (frame_idx == UINT_MAX) {
if (reset_frame_idx > first + ttd->cur)
return 0;
ttd->cur = reset_frame_idx - first;
goto cur_found;
}
} else if (frame_idx == UINT_MAX)
return 0;
if (frame_idx < first) frame_idx += c->n_fc;
const unsigned min_frame_idx = umin(reset_frame_idx, frame_idx);
const unsigned cur_frame_idx = first + ttd->cur;
if (ttd->cur < c->n_fc && cur_frame_idx < min_frame_idx)
return 0;
for (ttd->cur = min_frame_idx - first; ttd->cur < c->n_fc; ttd->cur++)
if (c->fc[(first + ttd->cur) % c->n_fc].task_thread.task_head)
break;
cur_found:
for (unsigned i = ttd->cur; i < c->n_fc; i++)
c->fc[(first + i) % c->n_fc].task_thread.task_cur_prev = NULL;
return 1;
}
static inline void reset_task_cur_async(struct TaskThreadData *const ttd,
unsigned frame_idx, unsigned n_frames)
{
if (frame_idx < (unsigned)atomic_load(&ttd->first)) frame_idx += n_frames;
unsigned last_idx = frame_idx;
do {
frame_idx = last_idx;
last_idx = atomic_exchange(&ttd->reset_task_cur, frame_idx);
} while (last_idx < frame_idx);
}
static void insert_tasks_between(Dav1dFrameContext *const f,
Dav1dTask *const first, Dav1dTask *const last,
Dav1dTask *const a, Dav1dTask *const b,
const int cond_signal)
{
struct TaskThreadData *const ttd = f->task_thread.ttd;
if (atomic_load(f->c->flush)) return;
assert(!a || a->next == b);
if (!a) f->task_thread.task_head = first;
else a->next = first;
if (!b) f->task_thread.task_tail = last;
last->next = b;
reset_task_cur(f->c, ttd, first->frame_idx);
if (cond_signal && !atomic_fetch_or(&ttd->cond_signaled, 1))
pthread_cond_signal(&ttd->cond);
}
static void insert_tasks(Dav1dFrameContext *const f,
Dav1dTask *const first, Dav1dTask *const last,
const int cond_signal)
{
// insert task back into task queue
Dav1dTask *t_ptr, *prev_t = NULL;
for (t_ptr = f->task_thread.task_head;
t_ptr; prev_t = t_ptr, t_ptr = t_ptr->next)
{
// entropy coding precedes other steps
if (t_ptr->type == DAV1D_TASK_TYPE_TILE_ENTROPY) {
if (first->type > DAV1D_TASK_TYPE_TILE_ENTROPY) continue;
// both are entropy
if (first->sby > t_ptr->sby) continue;
if (first->sby < t_ptr->sby) {
insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
return;
}
// same sby
} else {
if (first->type == DAV1D_TASK_TYPE_TILE_ENTROPY) {
insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
return;
}
if (first->sby > t_ptr->sby) continue;
if (first->sby < t_ptr->sby) {
insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
return;
}
// same sby
if (first->type > t_ptr->type) continue;
if (first->type < t_ptr->type) {
insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
return;
}
// same task type
}
// sort by tile-id
assert(first->type <= DAV1D_TASK_TYPE_TILE_RECONSTRUCTION);
assert(first->type == t_ptr->type);
assert(t_ptr->sby == first->sby);
const int p = first->type == DAV1D_TASK_TYPE_TILE_ENTROPY;
const int t_tile_idx = (int) (first - f->task_thread.tile_tasks[p]);
const int p_tile_idx = (int) (t_ptr - f->task_thread.tile_tasks[p]);
assert(t_tile_idx != p_tile_idx);
if (t_tile_idx > p_tile_idx) continue;
insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
return;
}
// append at the end
insert_tasks_between(f, first, last, prev_t, NULL, cond_signal);
}
static inline void insert_task(Dav1dFrameContext *const f,
Dav1dTask *const t, const int cond_signal)
{
insert_tasks(f, t, t, cond_signal);
}
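insert_tasks() keeps each frame's task list sorted so a worker always picks the most urgent runnable step first. The ordering it maintains can be summarized by the following predicate; this is an illustrative sketch with hypothetical parameters, not part of dav1d:

/* Returns nonzero if task A should be scheduled ahead of task B within a
 * frame: entropy tasks first, then lower superblock row, then pipeline
 * stage (task type), then tile index. */
static int runs_before(int a_is_entropy, int a_sby, int a_type, int a_tile_idx,
                       int b_is_entropy, int b_sby, int b_type, int b_tile_idx)
{
    if (a_is_entropy != b_is_entropy) return a_is_entropy; /* entropy first   */
    if (a_sby  != b_sby)  return a_sby  < b_sby;           /* lower row first */
    if (a_type != b_type) return a_type < b_type;          /* earlier stage   */
    return a_tile_idx < b_tile_idx;                        /* tile order      */
}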
static int create_filter_sbrow(Dav1dFrameContext *const f,
const int pass, Dav1dTask **res_t)
{
const int has_deblock = f->frame_hdr->loopfilter.level_y[0] ||
f->frame_hdr->loopfilter.level_y[1] ||
f->lf.restore_planes;
f->frame_hdr->loopfilter.level_y[1];
const int has_cdef = f->seq_hdr->cdef;
const int has_resize = f->frame_hdr->width[0] != f->frame_hdr->width[1];
const int has_lr = !!f->lf.restore_planes;
f->lf.thread.npf = has_deblock + has_cdef + has_resize + has_lr;
if (f->lf.thread.npf == 0) return 0;
const int has_lr = f->lf.restore_planes;
pthread_mutex_lock(&pftd->lock);
Dav1dTask *tasks = f->lf.thread.tasks;
int num_tasks = f->sbh * f->lf.thread.npf;
if (num_tasks > f->lf.thread.num_tasks) {
Dav1dTask *tasks = f->task_thread.tasks;
const int uses_2pass = f->c->n_fc > 1;
int num_tasks = f->sbh * (1 + uses_2pass);
if (num_tasks > f->task_thread.num_tasks) {
const size_t size = sizeof(Dav1dTask) * num_tasks;
tasks = realloc(f->lf.thread.tasks, size);
if (!tasks) {
pthread_mutex_unlock(&pftd->lock);
return -1;
}
tasks = realloc(f->task_thread.tasks, size);
if (!tasks) return -1;
memset(tasks, 0, size);
f->lf.thread.tasks = tasks;
f->lf.thread.num_tasks = num_tasks;
f->task_thread.tasks = tasks;
f->task_thread.num_tasks = num_tasks;
}
tasks += f->sbh * (pass & 1);
#define create_task(task, ready_cond, start_cond) \
do { \
t = &tasks[num_tasks++]; \
t->status = ready_cond ? DAV1D_TASK_READY : DAV1D_TASK_DEFAULT; \
t->start = start_cond; \
t->frame_id = frame_cnt; \
t->frame_idx = frame_idx; \
t->sby = sby; \
t->fn = f->bd_fn.filter_sbrow_##task; \
t->last_deps[0] = NULL; \
t->last_deps[1] = NULL; \
t->next_deps[0] = NULL; \
t->next_deps[1] = NULL; \
t->next_exec = NULL; \
} while (0)
Dav1dTask *last_sbrow_deblock = NULL;
Dav1dTask *last_sbrow_cdef = NULL;
Dav1dTask *last_sbrow_resize = NULL;
Dav1dTask *last_sbrow_lr = NULL;
num_tasks = 0;
const int frame_cnt = pftd->frame_cnt++;
for (int sby = 0; sby < f->sbh; ++sby) {
Dav1dTask *t;
Dav1dTask *last = NULL;
if (has_deblock) {
create_task(deblock, sby == 0, 0);
if (sby) {
t->last_deps[1] = last_sbrow_deblock;
last_sbrow_deblock->next_deps[1] = t;
}
last = t;
last_sbrow_deblock = t;
}
if (has_cdef) {
create_task(cdef, sby == 0 && !has_deblock, has_deblock);
if (has_deblock) {
t->last_deps[0] = last;
last->next_deps[0] = t;
}
if (sby) {
t->last_deps[1] = last_sbrow_cdef;
last_sbrow_cdef->next_deps[1] = t;
}
last = t;
last_sbrow_cdef = t;
};
if (has_resize) {
create_task(resize, sby == 0 && !last, !!last);
if (last) {
t->last_deps[0] = last;
last->next_deps[0] = t;
}
if (sby) {
t->last_deps[1] = last_sbrow_resize;
last_sbrow_resize->next_deps[1] = t;
}
last = t;
last_sbrow_resize = t;
}
if (has_lr) {
create_task(lr, sby == 0 && !last, !!last);
if (last) {
t->last_deps[0] = last;
last->next_deps[0] = t;
}
if (sby) {
t->last_deps[1] = last_sbrow_lr;
last_sbrow_lr->next_deps[1] = t;
}
last_sbrow_lr = t;
}
if (pass & 1) {
f->frame_thread.entropy_progress = 0;
} else {
atomic_store(&f->frame_thread.deblock_progress, 0);
atomic_store(&f->frame_thread.cdef_progress, 0);
atomic_store(&f->frame_thread.lr_progress, 0);
}
f->lf.thread.done = 0;
pthread_mutex_unlock(&pftd->lock);
f->frame_thread.next_tile_row[pass & 1] = 0;
Dav1dTask *t = &tasks[0];
t->sby = 0;
t->recon_progress = 1;
t->deblock_progress = 0;
t->cdef_progress = 0;
t->lr_progress = 0;
t->type = pass == 1 ? DAV1D_TASK_TYPE_ENTROPY_PROGRESS :
has_deblock ? DAV1D_TASK_TYPE_DEBLOCK_COLS :
has_lr /* i.e. LR backup */ ? DAV1D_TASK_TYPE_DEBLOCK_ROWS :
has_cdef ? DAV1D_TASK_TYPE_CDEF :
has_resize ? DAV1D_TASK_TYPE_SUPER_RESOLUTION :
DAV1D_TASK_TYPE_LOOP_RESTORATION;
t->frame_idx = (int)(f - f->c->fc);
*res_t = t;
return 0;
}
int dav1d_task_create_tile_sbrow(Dav1dFrameContext *const f, const int pass,
const int cond_signal)
{
Dav1dTask *tasks = f->task_thread.tile_tasks[0];
const int uses_2pass = f->c->n_fc > 1;
const int num_tasks = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;
int alloc_num_tasks = num_tasks * (1 + uses_2pass);
if (alloc_num_tasks > f->task_thread.num_tile_tasks) {
const size_t size = sizeof(Dav1dTask) * alloc_num_tasks;
tasks = realloc(f->task_thread.tile_tasks[0], size);
if (!tasks) return -1;
memset(tasks, 0, size);
f->task_thread.tile_tasks[0] = tasks;
f->task_thread.num_tile_tasks = alloc_num_tasks;
}
f->task_thread.tile_tasks[1] = tasks + num_tasks;
tasks += num_tasks * (pass & 1);
Dav1dTask *pf_t;
if (create_filter_sbrow(f, pass, &pf_t))
return -1;
Dav1dTask *prev_t = NULL;
for (int tile_idx = 0; tile_idx < num_tasks; tile_idx++) {
Dav1dTileState *const ts = &f->ts[tile_idx];
Dav1dTask *t = &tasks[tile_idx];
t->sby = ts->tiling.row_start >> f->sb_shift;
if (pf_t && t->sby) {
prev_t->next = pf_t;
prev_t = pf_t;
pf_t = NULL;
}
t->recon_progress = 0;
t->deblock_progress = 0;
t->cdef_progress = 0;
t->lr_progress = 0;
t->deps_skip = 0;
t->type = pass != 1 ? DAV1D_TASK_TYPE_TILE_RECONSTRUCTION :
DAV1D_TASK_TYPE_TILE_ENTROPY;
t->frame_idx = (int)(f - f->c->fc);
if (prev_t) prev_t->next = t;
prev_t = t;
}
if (pf_t) {
prev_t->next = pf_t;
prev_t = pf_t;
}
insert_tasks(f, &tasks[0], prev_t, cond_signal);
f->task_thread.done[pass & 1] = 0;
return 0;
}
void dav1d_task_schedule(struct PostFilterThreadData *const pftd,
Dav1dTask *const t)
void dav1d_task_frame_init(Dav1dFrameContext *const f) {
const Dav1dContext *const c = f->c;
// schedule init task, which will schedule the remaining tasks
Dav1dTask *const t = &f->task_thread.init_task;
t->type = DAV1D_TASK_TYPE_INIT;
t->frame_idx = (int)(f - c->fc);
t->sby = 0;
t->recon_progress = t->deblock_progress = 0;
t->cdef_progress = t->lr_progress = 0;
insert_task(f, t, 1);
}
static inline int ensure_progress(struct TaskThreadData *const ttd,
Dav1dFrameContext *const f,
Dav1dTask *const t, const enum TaskType type,
atomic_int *const state, int *const target)
{
Dav1dTask **pt = &pftd->tasks;
while (*pt &&
((*pt)->sby < t->sby ||
((*pt)->sby == t->sby && (*pt)->frame_id <= t->frame_id)))
pt = &(*pt)->next_exec;
t->next_exec = *pt;
*pt = t;
pthread_cond_signal(&pftd->cond);
// deblock_rows (non-LR portion) depends on deblock of previous sbrow,
// so ensure that completed. if not, re-add to task-queue; else, fall-through
int p1 = atomic_load(state);
if (p1 < t->sby) {
pthread_mutex_lock(&ttd->lock);
p1 = atomic_load(state);
if (p1 < t->sby) {
t->type = type;
t->deblock_progress = t->recon_progress = 0;
t->cdef_progress = t->lr_progress = 0;
*target = t->sby;
insert_task(f, t, 0);
return 1;
}
pthread_mutex_unlock(&ttd->lock);
}
return 0;
}
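ensure_progress() is a double-checked gate: a lock-free peek keeps the hot path cheap, and the same check is repeated under the scheduler lock before the task is deferred. A minimal sketch of that shape, with hypothetical names and signature:

#include <pthread.h>
#include <stdatomic.h>

/* Sketch only. In dav1d the caller re-inserts the task and returns with the
 * scheduler lock still held when deferring; that detail is kept here as a
 * comment rather than code. */
static int must_defer(atomic_int *progress, int needed, pthread_mutex_t *lock) {
    if (atomic_load(progress) >= needed) return 0;   /* fast, lock-free path */
    pthread_mutex_lock(lock);
    if (atomic_load(progress) < needed)
        return 1;    /* defer; real code re-queues the task, lock kept held */
    pthread_mutex_unlock(lock);
    return 0;
}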
static inline void update_task(Dav1dTask *const t, const int dep_type,
Dav1dFrameContext *const f)
static inline int check_tile(Dav1dTask *const t, Dav1dFrameContext *const f,
const int frame_mt)
{
if (!t->last_deps[!dep_type] ||
t->last_deps[!dep_type]->status == DAV1D_TASK_DONE)
{
t->status = DAV1D_TASK_READY;
if (t->start)
dav1d_task_schedule(f->lf.thread.pftd, t);
const int tp = t->type == DAV1D_TASK_TYPE_TILE_ENTROPY;
const int tile_idx = (int)(t - f->task_thread.tile_tasks[tp]);
Dav1dTileState *const ts = &f->ts[tile_idx];
const int p1 = atomic_load(&ts->progress[tp]);
if (p1 < t->sby) return 1;
int error = p1 == TILE_ERROR;
error |= atomic_fetch_or(&f->task_thread.error, error);
if (!error && frame_mt && !tp) {
const int p2 = atomic_load(&ts->progress[1]);
if (p2 <= t->sby) return 1;
error = p2 == TILE_ERROR;
error |= atomic_fetch_or(&f->task_thread.error, error);
}
if (!error && frame_mt && !IS_KEY_OR_INTRA(f->frame_hdr)) {
// check reference state
const Dav1dThreadPicture *p = &f->sr_cur;
const int ss_ver = p->p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const unsigned p_b = (t->sby + 1) << (f->sb_shift + 2);
const int tile_sby = t->sby - (ts->tiling.row_start >> f->sb_shift);
const int (*const lowest_px)[2] = ts->lowest_pixel[tile_sby];
for (int n = t->deps_skip; n < 7; n++, t->deps_skip++) {
unsigned lowest;
if (tp) {
// if temporal mv refs are disabled, we only need this
// for the primary ref; if segmentation is disabled, we
// don't even need that
lowest = p_b;
} else {
// +8 is postfilter-induced delay
const int y = lowest_px[n][0] == INT_MIN ? INT_MIN :
lowest_px[n][0] + 8;
const int uv = lowest_px[n][1] == INT_MIN ? INT_MIN :
lowest_px[n][1] * (1 << ss_ver) + 8;
const int max = imax(y, uv);
if (max == INT_MIN) continue;
lowest = iclip(max, 1, f->refp[n].p.p.h);
}
const unsigned p3 = atomic_load(&f->refp[n].progress[!tp]);
if (p3 < lowest) return 1;
atomic_fetch_or(&f->task_thread.error, p3 == FRAME_ERROR);
}
}
return 0;
}
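The reference check in check_tile() derives, for each reference frame, the lowest row that must already be decoded before this sbrow may run: the tile's lowest_pixel bookkeeping plus the 8-pixel post-filter delay, clamped to the reference height. A standalone sketch of that clamp, with a hypothetical helper name:

#include <limits.h>

/* Sketch only: INT_MIN means the sbrow reads nothing from this reference. */
static unsigned lowest_required_row(int lowest_y, int lowest_uv,
                                    int ss_ver, int ref_height)
{
    const int y  = lowest_y  == INT_MIN ? INT_MIN : lowest_y + 8;
    const int uv = lowest_uv == INT_MIN ? INT_MIN
                                        : lowest_uv * (1 << ss_ver) + 8;
    const int max = y > uv ? y : uv;
    if (max == INT_MIN) return 0;           /* no dependency on this ref */
    return (unsigned)(max < 1 ? 1 : max > ref_height ? ref_height : max);
}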
void *dav1d_frame_task(void *const data) {
Dav1dFrameContext *const f = data;
void *dav1d_worker_task(void *data) {
Dav1dTaskContext *const tc = data;
const Dav1dContext *const c = tc->c;
struct TaskThreadData *const ttd = tc->task_thread.ttd;
dav1d_set_thread_name("dav1d-frame");
pthread_mutex_lock(&f->frame_thread.td.lock);
dav1d_set_thread_name("dav1d-worker");
pthread_mutex_lock(&ttd->lock);
for (;;) {
while (!f->n_tile_data && !f->frame_thread.die) {
pthread_cond_wait(&f->frame_thread.td.cond,
&f->frame_thread.td.lock);
}
if (f->frame_thread.die) break;
pthread_mutex_unlock(&f->frame_thread.td.lock);
if (dav1d_decode_frame(f))
memset(f->frame_thread.cf, 0,
(size_t)f->frame_thread.cf_sz * 128 * 128 / 2);
pthread_mutex_lock(&f->frame_thread.td.lock);
f->n_tile_data = 0;
pthread_cond_signal(&f->frame_thread.td.cond);
}
pthread_mutex_unlock(&f->frame_thread.td.lock);
return NULL;
}
void *dav1d_tile_task(void *const data) {
Dav1dTileContext *const t = data;
struct FrameTileThreadData *const fttd = t->tile_thread.fttd;
const Dav1dFrameContext *const f = t->f;
const int tile_thread_idx = (int) (t - f->tc);
const uint64_t mask = 1ULL << tile_thread_idx;
dav1d_set_thread_name("dav1d-tile");
for (;;) {
pthread_mutex_lock(&fttd->lock);
fttd->available |= mask;
int did_signal = 0;
while (!fttd->tasks_left && !t->tile_thread.die) {
if (!did_signal) {
did_signal = 1;
pthread_cond_signal(&fttd->icond);
Dav1dFrameContext *f;
Dav1dTask *t, *prev_t;
if (tc->task_thread.die) break;
if (atomic_load(c->flush)) goto park;
while (ttd->cur < c->n_fc) {
const unsigned first = atomic_load(&ttd->first);
f = &c->fc[(first + ttd->cur) % c->n_fc];
prev_t = f->task_thread.task_cur_prev;
t = prev_t ? prev_t->next : f->task_thread.task_head;
while (t) {
if (t->type == DAV1D_TASK_TYPE_INIT) {
const int p1 = f->in_cdf.progress ?
atomic_load(f->in_cdf.progress) : 1;
if (p1) {
atomic_fetch_or(&f->task_thread.error, p1 == TILE_ERROR);
goto found;
}
} else if (t->type <= DAV1D_TASK_TYPE_TILE_RECONSTRUCTION) {
// if not bottom sbrow of tile, this task will be re-added
// after it's finished
if (!check_tile(t, f, c->n_fc > 1))
goto found;
} else if (t->recon_progress) {
const int p = t->type == DAV1D_TASK_TYPE_ENTROPY_PROGRESS;
int error = atomic_load(&f->task_thread.error);
assert(!f->task_thread.done[p] || error);
const int tile_row_base = f->frame_hdr->tiling.cols *
f->frame_thread.next_tile_row[p];
if (p) {
const int p1 = f->frame_thread.entropy_progress;
if (p1 < t->sby) goto next;
atomic_fetch_or(&f->task_thread.error, p1 == TILE_ERROR);
}
for (int tc = 0; tc < f->frame_hdr->tiling.cols; tc++) {
Dav1dTileState *const ts = &f->ts[tile_row_base + tc];
const int p2 = atomic_load(&ts->progress[p]);
if (p2 < t->recon_progress) goto next;
atomic_fetch_or(&f->task_thread.error, p2 == TILE_ERROR);
}
if (!p) {
atomic_int *state = NULL;
int needed;
if (t->cdef_progress) {
state = &f->frame_thread.cdef_progress;
needed = t->cdef_progress;
} else if (t->lr_progress) {
state = &f->frame_thread.lr_progress;
needed = t->lr_progress;
}
if (state) {
const int p3 = atomic_load(state);
if (p3 < needed) goto next;
atomic_fetch_or(&f->task_thread.error, p3 == TILE_ERROR);
}
}
if (t->sby + 1 < f->sbh) {
// add sby+1 to list to replace this one
Dav1dTask *next_t = &t[1];
*next_t = *t;
next_t->sby++;
const int ntr = f->frame_thread.next_tile_row[p] + 1;
const int start = f->frame_hdr->tiling.row_start_sb[ntr];
if (next_t->sby == start)
f->frame_thread.next_tile_row[p] = ntr;
next_t->recon_progress = next_t->sby + 1;
if (t->type == DAV1D_TASK_TYPE_CDEF)
next_t->cdef_progress = next_t->sby;
else if (t->type == DAV1D_TASK_TYPE_LOOP_RESTORATION)
next_t->lr_progress = next_t->sby;
insert_task(f, next_t, 0);
}
goto found;
} else {
assert(!!t->deblock_progress + !!t->cdef_progress + !!t->lr_progress == 1);
atomic_int *state;
int needed;
if (t->deblock_progress) {
needed = t->deblock_progress;
state = &f->frame_thread.deblock_progress;
} else if (t->cdef_progress) {
needed = t->cdef_progress;
state = &f->frame_thread.cdef_progress;
} else {
assert(t->lr_progress);
needed = t->lr_progress;
state = &f->frame_thread.lr_progress;
}
const int p1 = atomic_load(state);
if (p1 >= needed) {
atomic_fetch_or(&f->task_thread.error, p1 == TILE_ERROR);
goto found;
}
}
next:
prev_t = t;
t = t->next;
f->task_thread.task_cur_prev = prev_t;
}
pthread_cond_wait(&fttd->cond, &fttd->lock);
ttd->cur++;
}
if (t->tile_thread.die) {
pthread_cond_signal(&fttd->icond);
pthread_mutex_unlock(&fttd->lock);
break;
}
fttd->available &= ~mask;
const int task_idx = fttd->num_tasks - fttd->tasks_left--;
pthread_mutex_unlock(&fttd->lock);
if (reset_task_cur(c, ttd, UINT_MAX)) continue;
park:
tc->task_thread.flushed = 1;
pthread_cond_signal(&tc->task_thread.td.cond);
// we want to be woken up next time progress is signaled
atomic_store(&ttd->cond_signaled, 0);
pthread_cond_wait(&ttd->cond, &ttd->lock);
tc->task_thread.flushed = 0;
reset_task_cur(c, ttd, UINT_MAX);
continue;
if (f->frame_thread.pass == 1 || f->n_tc >= f->frame_hdr->tiling.cols) {
// we can (or in fact, if >, we need to) do full tile decoding.
// loopfilter happens in the main thread
Dav1dTileState *const ts = t->ts = &f->ts[task_idx];
for (t->by = ts->tiling.row_start; t->by < ts->tiling.row_end;
t->by += f->sb_step)
{
const int error = dav1d_decode_tile_sbrow(t);
const int progress = error ? TILE_ERROR : 1 + (t->by >> f->sb_shift);
found:
// remove t from list
if (prev_t) prev_t->next = t->next;
else f->task_thread.task_head = t->next;
if (!t->next) f->task_thread.task_tail = prev_t;
if (!f->task_thread.task_head) ttd->cur++;
// we don't need to check cond_signaled here, since we found a task
// after the last signal so we want to re-signal the next waiting thread
// and again won't need to signal after that
atomic_store(&ttd->cond_signaled, 1);
pthread_cond_signal(&ttd->cond);
pthread_mutex_unlock(&ttd->lock);
found_unlocked:;
const int flush = atomic_load(c->flush);
int error = atomic_fetch_or(&f->task_thread.error, flush) | flush;
// signal progress
pthread_mutex_lock(&ts->tile_thread.lock);
atomic_store(&ts->progress, progress);
pthread_cond_signal(&ts->tile_thread.cond);
pthread_mutex_unlock(&ts->tile_thread.lock);
if (error) break;
// run it
tc->f = f;
int sby = t->sby;
switch (t->type) {
case DAV1D_TASK_TYPE_INIT: {
assert(c->n_fc > 1);
int res = -1;
if (!atomic_load(&f->task_thread.error))
res = dav1d_decode_frame_init(f);
pthread_mutex_lock(&ttd->lock);
if (f->frame_hdr->refresh_context && !f->task_thread.update_set) {
atomic_store(f->out_cdf.progress, res < 0 ? TILE_ERROR : 1);
}
} else {
const int sby = f->tile_thread.task_idx_to_sby_and_tile_idx[task_idx][0];
const int tile_idx = f->tile_thread.task_idx_to_sby_and_tile_idx[task_idx][1];
if (!res) {
assert(c->n_fc > 1);
for (int p = 1; p <= 2; p++) {
const int res = dav1d_task_create_tile_sbrow(f, p, 0);
if (res) {
// memory allocation failed
f->task_thread.done[2 - p] = 1;
atomic_store(&f->task_thread.error, 1);
f->task_thread.task_counter -= f->sbh +
f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;
atomic_store(&f->sr_cur.progress[p - 1], FRAME_ERROR);
if (p == 2 && f->task_thread.done[1]) {
assert(!f->task_thread.task_counter);
dav1d_decode_frame_exit(f, -1);
f->n_tile_data = 0;
pthread_cond_signal(&f->task_thread.cond);
}
}
}
} else {
// init failed, signal completion
atomic_store(&f->task_thread.error, 1);
f->task_thread.task_counter = 0;
f->task_thread.done[0] = 1;
f->task_thread.done[1] = 1;
atomic_store(&f->sr_cur.progress[0], FRAME_ERROR);
atomic_store(&f->sr_cur.progress[1], FRAME_ERROR);
dav1d_decode_frame_exit(f, -1);
f->n_tile_data = 0;
pthread_cond_signal(&f->task_thread.cond);
}
reset_task_cur(c, ttd, t->frame_idx);
continue;
}
case DAV1D_TASK_TYPE_TILE_ENTROPY:
case DAV1D_TASK_TYPE_TILE_RECONSTRUCTION: {
const int p = t->type == DAV1D_TASK_TYPE_TILE_ENTROPY;
const int tile_idx = (int)(t - f->task_thread.tile_tasks[p]);
Dav1dTileState *const ts = &f->ts[tile_idx];
int progress;
// the interleaved decoding can sometimes cause dependency issues
// if one part of the frame decodes significantly faster than others.
// Ideally, we'd "skip" tile_sbrows where dependencies are missing,
// and resume them later as dependencies are met. This also would
// solve the broadcast() below and allow us to use signal(). However,
// for now, we use linear dependency tracking because it's simpler.
if ((progress = atomic_load(&ts->progress)) < sby) {
pthread_mutex_lock(&ts->tile_thread.lock);
while ((progress = atomic_load(&ts->progress)) < sby)
pthread_cond_wait(&ts->tile_thread.cond,
&ts->tile_thread.lock);
pthread_mutex_unlock(&ts->tile_thread.lock);
}
if (progress == TILE_ERROR) continue;
// we need to interleave sbrow decoding for all tile cols in a
// tile row, since otherwise subsequent threads will be blocked
// waiting for the post-filter to complete
t->ts = ts;
t->by = sby << f->sb_shift;
const int error = dav1d_decode_tile_sbrow(t);
progress = error ? TILE_ERROR : 1 + sby;
tc->ts = ts;
tc->by = sby << f->sb_shift;
const int uses_2pass = c->n_fc > 1;
tc->frame_thread.pass = !uses_2pass ? 0 :
1 + (t->type == DAV1D_TASK_TYPE_TILE_RECONSTRUCTION);
if (!error) error = dav1d_decode_tile_sbrow(tc);
const int progress = error ? TILE_ERROR : 1 + sby;
// signal progress
pthread_mutex_lock(&ts->tile_thread.lock);
atomic_store(&ts->progress, progress);
pthread_cond_broadcast(&ts->tile_thread.cond);
pthread_mutex_unlock(&ts->tile_thread.lock);
}
}
return NULL;
}
static inline int handle_abortion(Dav1dPostFilterContext *const pf,
Dav1dContext *const c,
struct PostFilterThreadData *const pftd)
{
const int flush = atomic_load_explicit(c->flush, memory_order_acquire);
if (flush) {
pthread_mutex_lock(&pf->td.lock);
pf->flushed = 0;
pthread_mutex_unlock(&pf->td.lock);
}
for (unsigned i = 0; i < c->n_fc; i++) {
Dav1dFrameContext *const f = &c->fc[i];
int send_signal;
if (flush) // TODO before merge, see if this can be safely merged
send_signal = f->lf.thread.done != 1 && f->lf.thread.num_tasks != 0;
else
send_signal = f->lf.thread.done == -1;
for (int j = 0; send_signal && j < f->lf.thread.num_tasks; j++) {
Dav1dTask *const t = &f->lf.thread.tasks[j];
if (t->status == DAV1D_TASK_RUNNING ||
(t->status == DAV1D_TASK_DONE && t->start != -1))
send_signal = 0;
}
if (send_signal) {
if (!flush) {
Dav1dTask **pt = &pftd->tasks;
while (*pt) {
if ((*pt)->frame_idx == i)
*pt = (*pt)->next_exec;
else
pt = &(*pt)->next_exec;
atomic_fetch_or(&f->task_thread.error, error);
if (((sby + 1) << f->sb_shift) < ts->tiling.row_end) {
t->sby++;
t->deps_skip = 0;
if (!check_tile(t, f, uses_2pass)) {
atomic_store(&ts->progress[p], progress);
reset_task_cur_async(ttd, t->frame_idx, c->n_fc);
if (!atomic_fetch_or(&ttd->cond_signaled, 1))
pthread_cond_signal(&ttd->cond);
goto found_unlocked;
}
pthread_mutex_lock(&ttd->lock);
atomic_store(&ts->progress[p], progress);
reset_task_cur(c, ttd, t->frame_idx);
insert_task(f, t, 0);
} else {
pthread_mutex_lock(&ttd->lock);
atomic_store(&ts->progress[p], progress);
reset_task_cur(c, ttd, t->frame_idx);
error = atomic_load(&f->task_thread.error);
if (f->frame_hdr->refresh_context &&
tc->frame_thread.pass <= 1 && f->task_thread.update_set &&
f->frame_hdr->tiling.update == tile_idx)
{
if (!error)
dav1d_cdf_thread_update(f->frame_hdr, f->out_cdf.data.cdf,
&f->ts[f->frame_hdr->tiling.update].cdf);
if (c->n_fc > 1)
atomic_store(f->out_cdf.progress, error ? TILE_ERROR : 1);
}
if (!--f->task_thread.task_counter && f->task_thread.done[0] &&
(!uses_2pass || f->task_thread.done[1]))
{
dav1d_decode_frame_exit(f, error ? -1 : 0);
f->n_tile_data = 0;
pthread_cond_signal(&f->task_thread.cond);
}
assert(f->task_thread.task_counter >= 0);
if (!atomic_fetch_or(&ttd->cond_signaled, 1))
pthread_cond_signal(&ttd->cond);
}
f->lf.thread.done = 1;
pthread_cond_signal(&f->lf.thread.cond);
continue;
}
}
if (flush) {
pthread_mutex_lock(&pf->td.lock);
pf->flushed = 1;
pthread_cond_signal(&pf->td.cond);
pthread_mutex_unlock(&pf->td.lock);
}
return !flush;
}
void *dav1d_postfilter_task(void *data) {
Dav1dPostFilterContext *const pf = data;
Dav1dContext *const c = pf->c;
struct PostFilterThreadData *pftd = &c->postfilter_thread;
dav1d_set_thread_name("dav1d-postfilter");
int exec = 1;
pthread_mutex_lock(&pftd->lock);
for (;;) {
if (!exec && !pf->die)
pthread_cond_wait(&pftd->cond, &pftd->lock);
if (!(exec = handle_abortion(pf, c, pftd))) continue;
if (pf->die) break;
Dav1dTask *const t = pftd->tasks;
if (!t) { exec = 0; continue; }
pftd->tasks = t->next_exec;
t->status = DAV1D_TASK_RUNNING;
pthread_mutex_unlock(&pftd->lock);
Dav1dFrameContext *const f = &c->fc[t->frame_idx];
t->fn(f, t->sby);
exec = 1;
pthread_mutex_lock(&pftd->lock);
if (t->next_deps[0])
update_task(t->next_deps[0], 0, f);
if (t->next_deps[1])
update_task(t->next_deps[1], 1, f);
t->status = DAV1D_TASK_DONE;
if (!t->next_deps[0]) {
const enum PlaneType progress_plane_type =
c->n_fc > 1 && f->frame_hdr->refresh_context ?
PLANE_TYPE_Y : PLANE_TYPE_ALL;
const int y = (t->sby + 1) * f->sb_step * 4;
dav1d_thread_picture_signal(&f->sr_cur, y, progress_plane_type);
if (t->sby + 1 == f->sbh) {
f->lf.thread.done = 1;
pthread_cond_signal(&f->lf.thread.cond);
case DAV1D_TASK_TYPE_DEBLOCK_COLS:
if (!atomic_load(&f->task_thread.error))
f->bd_fn.filter_sbrow_deblock_cols(f, sby);
if (ensure_progress(ttd, f, t, DAV1D_TASK_TYPE_DEBLOCK_ROWS,
&f->frame_thread.deblock_progress,
&t->deblock_progress)) continue;
// fall-through
case DAV1D_TASK_TYPE_DEBLOCK_ROWS:
if (!atomic_load(&f->task_thread.error))
f->bd_fn.filter_sbrow_deblock_rows(f, sby);
// signal deblock progress
if (f->frame_hdr->loopfilter.level_y[0] ||
f->frame_hdr->loopfilter.level_y[1])
{
error = atomic_load(&f->task_thread.error);
atomic_store(&f->frame_thread.deblock_progress,
error ? TILE_ERROR : sby + 1);
reset_task_cur_async(ttd, t->frame_idx, c->n_fc);
if (!atomic_fetch_or(&ttd->cond_signaled, 1))
pthread_cond_signal(&ttd->cond);
}
// fall-through
case DAV1D_TASK_TYPE_CDEF:
if (f->seq_hdr->cdef) {
// cdef caches top (pre-cdef) buffers internally and therefore
// needs to be vertically linear
if (ensure_progress(ttd, f, t, DAV1D_TASK_TYPE_CDEF,
&f->frame_thread.cdef_progress,
&t->cdef_progress)) continue;
if (!atomic_load(&f->task_thread.error))
f->bd_fn.filter_sbrow_cdef(f, sby);
// signal cdef progress
error = atomic_load(&f->task_thread.error);
atomic_store(&f->frame_thread.cdef_progress,
error ? TILE_ERROR : sby + 1);
reset_task_cur_async(ttd, t->frame_idx, c->n_fc);
if (!atomic_fetch_or(&ttd->cond_signaled, 1))
pthread_cond_signal(&ttd->cond);
}
// fall-through
case DAV1D_TASK_TYPE_SUPER_RESOLUTION:
if (f->frame_hdr->width[0] != f->frame_hdr->width[1])
if (!atomic_load(&f->task_thread.error))
f->bd_fn.filter_sbrow_resize(f, sby);
// fall-through
case DAV1D_TASK_TYPE_LOOP_RESTORATION:
// lr is the last step before signaling frame completion, and
// therefore needs to be done vertically linear
if (ensure_progress(ttd, f, t, DAV1D_TASK_TYPE_LOOP_RESTORATION,
&f->frame_thread.lr_progress,
&t->lr_progress)) continue;
if (!atomic_load(&f->task_thread.error) && f->lf.restore_planes)
f->bd_fn.filter_sbrow_lr(f, sby);
// fall-through
case DAV1D_TASK_TYPE_ENTROPY_PROGRESS:
// dummy to convert tile to frame
break;
default: abort();
}
t->start = -1;
// if task completed [typically LR], signal picture progress as per below
const int uses_2pass = c->n_fc > 1;
const enum PlaneType progress_plane_type =
t->type == DAV1D_TASK_TYPE_ENTROPY_PROGRESS ? PLANE_TYPE_BLOCK :
c->n_fc > 1 ? PLANE_TYPE_Y : PLANE_TYPE_ALL;
const int sbh = f->sbh;
pthread_mutex_lock(&ttd->lock);
error = atomic_load(&f->task_thread.error);
const unsigned y = error ? FRAME_ERROR :
sby + 1 == sbh ? UINT_MAX : (unsigned)(sby + 1) * f->sb_step * 4;
if (c->n_fc > 1 && f->sr_cur.p.data[0] /* upon flush, this can be free'ed already */) {
if (!uses_2pass || t->type == DAV1D_TASK_TYPE_ENTROPY_PROGRESS)
atomic_store(&f->sr_cur.progress[0], y);
if (!uses_2pass || t->type != DAV1D_TASK_TYPE_ENTROPY_PROGRESS)
atomic_store(&f->sr_cur.progress[1], y);
}
const int progress = error ? TILE_ERROR : sby + 1;
if (progress_plane_type == PLANE_TYPE_BLOCK)
f->frame_thread.entropy_progress = progress;
else
atomic_store(&f->frame_thread.lr_progress, progress);
if (sby + 1 == sbh)
f->task_thread.done[progress_plane_type == PLANE_TYPE_BLOCK] = 1;
if (!--f->task_thread.task_counter &&
f->task_thread.done[0] && (!uses_2pass || f->task_thread.done[1]))
{
dav1d_decode_frame_exit(f, error ? -1 : 0);
f->n_tile_data = 0;
pthread_cond_signal(&f->task_thread.cond);
}
reset_task_cur(c, ttd, t->frame_idx);
}
pthread_mutex_unlock(&pftd->lock);
pthread_mutex_unlock(&ttd->lock);
return NULL;
}
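The cond_signaled flag used throughout dav1d_worker_task() above cuts down on redundant condition-variable signals: only the first producer after a wait actually signals, and a worker clears the flag before parking so the next piece of progress wakes somebody again. A minimal sketch of the handshake under assumed field names (the real fields live in struct TaskThreadData):

#include <pthread.h>
#include <stdatomic.h>

struct wake_flag {
    atomic_int     cond_signaled;
    pthread_cond_t cond;
};

/* Later producers see the flag already set and skip the signal entirely. */
static void wake_one(struct wake_flag *w) {
    if (!atomic_fetch_or(&w->cond_signaled, 1))
        pthread_cond_signal(&w->cond);
}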

33
third_party/dav1d/src/thread_task.h vendored
View file

@ -35,33 +35,16 @@
#define FRAME_ERROR (UINT_MAX - 1)
#define TILE_ERROR (INT_MAX - 1)
enum TaskStatus {
DAV1D_TASK_DEFAULT,
DAV1D_TASK_READY,
DAV1D_TASK_RUNNING,
DAV1D_TASK_DONE,
};
// these functions assume the task scheduling lock is already taken
int dav1d_task_create_tile_sbrow(Dav1dFrameContext *f, int pass, int cond_signal);
void dav1d_task_frame_init(Dav1dFrameContext *f);
struct Dav1dTask {
enum TaskStatus status; // task status
int start; // frame thread start flag
unsigned frame_idx; // frame thread id
int frame_id; // frame ordering
int sby; // sbrow
filter_sbrow_fn fn; // task work
Dav1dTask *last_deps[2]; // dependencies
Dav1dTask *next_deps[2]; // dependant tasks
Dav1dTask *next_exec; // tasks scheduling
};
int dav1d_task_create_filter_sbrow(Dav1dFrameContext *f);
void dav1d_task_schedule(struct PostFilterThreadData *pftd, Dav1dTask *t);
void *dav1d_frame_task(void *data);
void *dav1d_tile_task(void *data);
void *dav1d_postfilter_task(void *data);
void *dav1d_worker_task(void *data);
int dav1d_decode_frame_init(Dav1dFrameContext *f);
int dav1d_decode_frame_main(Dav1dFrameContext *f);
void dav1d_decode_frame_exit(Dav1dFrameContext *f, int retval);
int dav1d_decode_frame(Dav1dFrameContext *f);
int dav1d_decode_tile_sbrow(Dav1dTileContext *t);
int dav1d_decode_tile_sbrow(Dav1dTaskContext *t);
#endif /* DAV1D_SRC_THREAD_TASK_H */

1184
third_party/dav1d/src/x86/cdef16_avx2.asm vendored

File diff suppressed because it is too large. Load diff

910
third_party/dav1d/src/x86/cdef16_sse.asm vendored
View file

@ -1,3 +1,5 @@
; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; Copyright (c) 2017-2021, The rav1e contributors
; Copyright (c) 2021, Nathan Egge
; All rights reserved.
@ -28,10 +30,33 @@
SECTION_RODATA
%macro DUP8 1-*
%rep %0
times 8 dw %1
%rotate 1
%endrep
%endmacro
pri_taps: DUP8 4, 2, 3, 3
dir_table: db 1 * 32 + 0, 2 * 32 + 0
db 1 * 32 + 0, 2 * 32 - 2
db -1 * 32 + 2, -2 * 32 + 4
db 0 * 32 + 2, -1 * 32 + 4
db 0 * 32 + 2, 0 * 32 + 4
db 0 * 32 + 2, 1 * 32 + 4
db 1 * 32 + 2, 2 * 32 + 4
db 1 * 32 + 0, 2 * 32 + 2
db 1 * 32 + 0, 2 * 32 + 0
db 1 * 32 + 0, 2 * 32 - 2
db -1 * 32 + 2, -2 * 32 + 4
db 0 * 32 + 2, -1 * 32 + 4
dir_shift: times 4 dw 0x4000
times 4 dw 0x1000
pw_128: times 4 dw 128
pw_2048: times 8 dw 2048
pw_m16384: times 8 dw -16384
cextern cdef_dir_8bpc_ssse3.main
cextern cdef_dir_8bpc_sse4.main
@ -47,6 +72,891 @@ SECTION .text
%endrep
%endmacro
%if ARCH_X86_32
DECLARE_REG_TMP 5, 3
%elif WIN64
DECLARE_REG_TMP 7, 4
%else
DECLARE_REG_TMP 7, 8
%endif
%macro CDEF_FILTER 2 ; w, h
%if ARCH_X86_64
DEFINE_ARGS dst, stride, tmp, pridmp, pri, sec, dir
mova m8, [base+pw_2048]
%else
DEFINE_ARGS dst, pridmp, tmp, sec, pri, _, dir
%define m8 [base+pw_2048]
%define m9 [rsp+16*1+gprsize]
%define m10 [rsp+16*2+gprsize]
%endif
movifnidn prid, r4m
movifnidn secd, r5m
test prid, prid
jz .sec_only
movd m6, r4m
%if ARCH_X86_32
mov [rsp+24], pridmpd
%endif
bsr pridmpd, prid
lea tmpd, [priq*4]
cmp dword r9m, 0x3ff ; if (bpc == 10)
cmove prid, tmpd ; pri <<= 2
mov tmpd, r7m ; damping
mov dird, r6m
and prid, 16
pshufb m6, m7 ; splat
lea dirq, [base+dir_table+dirq*2]
lea priq, [base+pri_taps+priq*2]
test secd, secd
jz .pri_only
mova [rsp], m6
movd m6, secd
tzcnt secd, secd
sub pridmpd, tmpd
sub tmpd, secd
pshufb m6, m7
xor secd, secd
neg pridmpd
cmovs pridmpd, secd
%if ARCH_X86_32
mov [pri_shift+4], secd
mov [sec_shift+4], secd
%endif
mov [pri_shift+0], pridmpq
mov [sec_shift+0], tmpq
lea tmpq, [px]
%if WIN64
movaps r4m, m9
movaps r6m, m10
%elif ARCH_X86_32
mov pridmpd, [rsp+24]
%endif
%rep %1*%2/8
call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri_sec
%endrep
%if WIN64
movaps m9, r4m
movaps m10, r6m
%endif
jmp .end
.pri_only:
sub tmpd, pridmpd
cmovs tmpd, secd
%if ARCH_X86_32
mov pridmpd, [rsp+24]
mov [pri_shift+4], secd
%endif
mov [pri_shift+0], tmpq
lea tmpq, [px]
%rep %1*%2/8
call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri
%endrep
.end:
RET
.sec_only:
mov tmpd, r7m ; damping
movd m6, r5m
tzcnt secd, secd
mov dird, r6m
pshufb m6, m7
sub tmpd, secd
lea dirq, [base+dir_table+dirq*2]
%if ARCH_X86_32
mov [sec_shift+4], prid
%endif
mov [sec_shift+0], tmpq
lea tmpq, [px]
%rep %1*%2/8
call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).sec
%endrep
jmp .end
%if %1 == %2
DEFINE_ARGS dst, stride, tmp, off, pri, _, dir
ALIGN function_align
.pri:
movsx offq, byte [dirq+4] ; off_k0
%if %1 == 4
movq m1, [dstq+strideq*0]
movhps m1, [dstq+strideq*1]
movq m2, [tmpq+offq+32*0] ; k0p0
movhps m2, [tmpq+offq+32*1]
neg offq
movq m3, [tmpq+offq+32*0] ; k0p1
movhps m3, [tmpq+offq+32*1]
%else
mova m1, [dstq]
movu m2, [tmpq+offq]
neg offq
movu m3, [tmpq+offq]
%endif
movsx offq, byte [dirq+5] ; off_k1
psubw m2, m1 ; diff_k0p0
psubw m3, m1 ; diff_k0p1
pabsw m4, m2 ; adiff_k0p0
psrlw m5, m4, [pri_shift+gprsize]
psubusw m0, m6, m5
pabsw m5, m3 ; adiff_k0p1
pminsw m0, m4
psrlw m4, m5, [pri_shift+gprsize]
psignw m0, m2 ; constrain(diff_k0p0)
psubusw m2, m6, m4
pminsw m2, m5
%if %1 == 4
movq m4, [tmpq+offq+32*0] ; k1p0
movhps m4, [tmpq+offq+32*1]
neg offq
movq m5, [tmpq+offq+32*0] ; k1p1
movhps m5, [tmpq+offq+32*1]
%else
movu m4, [tmpq+offq]
neg offq
movu m5, [tmpq+offq]
%endif
psubw m4, m1 ; diff_k1p0
psubw m5, m1 ; diff_k1p1
psignw m2, m3 ; constrain(diff_k0p1)
pabsw m3, m4 ; adiff_k1p0
paddw m0, m2 ; constrain(diff_k0)
psrlw m2, m3, [pri_shift+gprsize]
psubusw m7, m6, m2
pabsw m2, m5 ; adiff_k1p1
pminsw m7, m3
psrlw m3, m2, [pri_shift+gprsize]
psignw m7, m4 ; constrain(diff_k1p0)
psubusw m4, m6, m3
pminsw m4, m2
psignw m4, m5 ; constrain(diff_k1p1)
paddw m7, m4 ; constrain(diff_k1)
pmullw m0, [priq+16*0] ; pri_tap_k0
pmullw m7, [priq+16*1] ; pri_tap_k1
paddw m0, m7 ; sum
psraw m2, m0, 15
paddw m0, m2
pmulhrsw m0, m8
paddw m0, m1
%if %1 == 4
add tmpq, 32*2
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
lea dstq, [dstq+strideq*2]
%else
add tmpq, 32
mova [dstq], m0
add dstq, strideq
%endif
ret
ALIGN function_align
.sec:
movsx offq, byte [dirq+8] ; off1_k0
%if %1 == 4
movq m1, [dstq+strideq*0]
movhps m1, [dstq+strideq*1]
movq m2, [tmpq+offq+32*0] ; k0s0
movhps m2, [tmpq+offq+32*1]
neg offq
movq m3, [tmpq+offq+32*0] ; k0s1
movhps m3, [tmpq+offq+32*1]
%else
mova m1, [dstq]
movu m2, [tmpq+offq]
neg offq
movu m3, [tmpq+offq]
%endif
movsx offq, byte [dirq+0] ; off2_k0
psubw m2, m1 ; diff_k0s0
psubw m3, m1 ; diff_k0s1
pabsw m4, m2 ; adiff_k0s0
psrlw m5, m4, [sec_shift+gprsize]
psubusw m0, m6, m5
pabsw m5, m3 ; adiff_k0s1
pminsw m0, m4
psrlw m4, m5, [sec_shift+gprsize]
psignw m0, m2 ; constrain(diff_k0s0)
psubusw m2, m6, m4
pminsw m2, m5
%if %1 == 4
movq m4, [tmpq+offq+32*0] ; k0s2
movhps m4, [tmpq+offq+32*1]
neg offq
movq m5, [tmpq+offq+32*0] ; k0s3
movhps m5, [tmpq+offq+32*1]
%else
movu m4, [tmpq+offq]
neg offq
movu m5, [tmpq+offq]
%endif
movsx offq, byte [dirq+9] ; off1_k1
psubw m4, m1 ; diff_k0s2
psubw m5, m1 ; diff_k0s3
psignw m2, m3 ; constrain(diff_k0s1)
pabsw m3, m4 ; adiff_k0s2
paddw m0, m2
psrlw m2, m3, [sec_shift+gprsize]
psubusw m7, m6, m2
pabsw m2, m5 ; adiff_k0s3
pminsw m7, m3
psrlw m3, m2, [sec_shift+gprsize]
psignw m7, m4 ; constrain(diff_k0s2)
psubusw m4, m6, m3
pminsw m4, m2
%if %1 == 4
movq m2, [tmpq+offq+32*0] ; k1s0
movhps m2, [tmpq+offq+32*1]
neg offq
movq m3, [tmpq+offq+32*0] ; k1s1
movhps m3, [tmpq+offq+32*1]
%else
movu m2, [tmpq+offq]
neg offq
movu m3, [tmpq+offq]
%endif
movsx offq, byte [dirq+1] ; off2_k1
paddw m0, m7
psignw m4, m5 ; constrain(diff_k0s3)
paddw m0, m4 ; constrain(diff_k0)
psubw m2, m1 ; diff_k1s0
psubw m3, m1 ; diff_k1s1
paddw m0, m0 ; sec_tap_k0
pabsw m4, m2 ; adiff_k1s0
psrlw m5, m4, [sec_shift+gprsize]
psubusw m7, m6, m5
pabsw m5, m3 ; adiff_k1s1
pminsw m7, m4
psrlw m4, m5, [sec_shift+gprsize]
psignw m7, m2 ; constrain(diff_k1s0)
psubusw m2, m6, m4
pminsw m2, m5
%if %1 == 4
movq m4, [tmpq+offq+32*0] ; k1s2
movhps m4, [tmpq+offq+32*1]
neg offq
movq m5, [tmpq+offq+32*0] ; k1s3
movhps m5, [tmpq+offq+32*1]
%else
movu m4, [tmpq+offq]
neg offq
movu m5, [tmpq+offq]
%endif
paddw m0, m7
psubw m4, m1 ; diff_k1s2
psubw m5, m1 ; diff_k1s3
psignw m2, m3 ; constrain(diff_k1s1)
pabsw m3, m4 ; adiff_k1s2
paddw m0, m2
psrlw m2, m3, [sec_shift+gprsize]
psubusw m7, m6, m2
pabsw m2, m5 ; adiff_k1s3
pminsw m7, m3
psrlw m3, m2, [sec_shift+gprsize]
psignw m7, m4 ; constrain(diff_k1s2)
psubusw m4, m6, m3
pminsw m4, m2
paddw m0, m7
psignw m4, m5 ; constrain(diff_k1s3)
paddw m0, m4 ; sum
psraw m2, m0, 15
paddw m0, m2
pmulhrsw m0, m8
paddw m0, m1
%if %1 == 4
add tmpq, 32*2
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
lea dstq, [dstq+strideq*2]
%else
add tmpq, 32
mova [dstq], m0
add dstq, strideq
%endif
ret
ALIGN function_align
.pri_sec:
movsx offq, byte [dirq+8] ; off2_k0
%if %1 == 4
movq m1, [dstq+strideq*0]
movhps m1, [dstq+strideq*1]
movq m2, [tmpq+offq+32*0] ; k0s0
movhps m2, [tmpq+offq+32*1]
neg offq
movq m3, [tmpq+offq+32*0] ; k0s1
movhps m3, [tmpq+offq+32*1]
%else
mova m1, [dstq]
movu m2, [tmpq+offq]
neg offq
movu m3, [tmpq+offq]
%endif
movsx offq, byte [dirq+0] ; off3_k0
pabsw m4, m2
%if ARCH_X86_64
pabsw m10, m3
pmaxsw m9, m2, m3
pminsw m10, m4
%else
pabsw m7, m3
pmaxsw m5, m2, m3
pminsw m4, m7
mova m9, m5
mova m10, m4
%endif
psubw m2, m1 ; diff_k0s0
psubw m3, m1 ; diff_k0s1
pabsw m4, m2 ; adiff_k0s0
psrlw m5, m4, [sec_shift+gprsize]
psubusw m0, m6, m5
pabsw m5, m3 ; adiff_k0s1
pminsw m0, m4
psrlw m4, m5, [sec_shift+gprsize]
psignw m0, m2 ; constrain(diff_k0s0)
psubusw m2, m6, m4
pminsw m2, m5
%if %1 == 4
movq m4, [tmpq+offq+32*0] ; k0s2
movhps m4, [tmpq+offq+32*1]
neg offq
movq m5, [tmpq+offq+32*0] ; k0s3
movhps m5, [tmpq+offq+32*1]
%else
movu m4, [tmpq+offq]
neg offq
movu m5, [tmpq+offq]
%endif
movsx offq, byte [dirq+9] ; off2_k1
pabsw m7, m4
psignw m2, m3
pabsw m3, m5 ; constrain(diff_k0s1)
%if ARCH_X86_64
pmaxsw m9, m4
pminsw m10, m7
pmaxsw m9, m5
pminsw m10, m3
%else
pminsw m7, m10
pminsw m7, m3
pmaxsw m3, m9, m4
pmaxsw m3, m5
mova m10, m7
mova m9, m3
%endif
psubw m4, m1 ; diff_k0s2
psubw m5, m1 ; diff_k0s3
paddw m0, m2
pabsw m3, m4 ; adiff_k0s2
psrlw m2, m3, [sec_shift+gprsize]
psubusw m7, m6, m2
pabsw m2, m5 ; adiff_k0s3
pminsw m7, m3
psrlw m3, m2, [sec_shift+gprsize]
psignw m7, m4 ; constrain(diff_k0s2)
psubusw m4, m6, m3
pminsw m4, m2
%if %1 == 4
movq m2, [tmpq+offq+32*0] ; k1s0
movhps m2, [tmpq+offq+32*1]
neg offq
movq m3, [tmpq+offq+32*0] ; k1s1
movhps m3, [tmpq+offq+32*1]
%else
movu m2, [tmpq+offq]
neg offq
movu m3, [tmpq+offq]
%endif
movsx offq, byte [dirq+1] ; off3_k1
paddw m0, m7
pabsw m7, m2
psignw m4, m5 ; constrain(diff_k0s3)
pabsw m5, m3
%if ARCH_X86_64
pmaxsw m9, m2
pminsw m10, m7
pmaxsw m9, m3
pminsw m10, m5
%else
pminsw m7, m10
pminsw m7, m5
pmaxsw m5, m9, m2
pmaxsw m5, m3
mova m10, m7
mova m9, m5
%endif
paddw m0, m4 ; constrain(diff_k0)
psubw m2, m1 ; diff_k1s0
psubw m3, m1 ; diff_k1s1
paddw m0, m0 ; sec_tap_k0
pabsw m4, m2 ; adiff_k1s0
psrlw m5, m4, [sec_shift+gprsize]
psubusw m7, m6, m5
pabsw m5, m3 ; adiff_k1s1
pminsw m7, m4
psrlw m4, m5, [sec_shift+gprsize]
psignw m7, m2 ; constrain(diff_k1s0)
psubusw m2, m6, m4
pminsw m2, m5
%if %1 == 4
movq m4, [tmpq+offq+32*0] ; k1s2
movhps m4, [tmpq+offq+32*1]
neg offq
movq m5, [tmpq+offq+32*0] ; k1s3
movhps m5, [tmpq+offq+32*1]
%else
movu m4, [tmpq+offq]
neg offq
movu m5, [tmpq+offq]
%endif
movsx offq, byte [dirq+4] ; off1_k0
paddw m0, m7
pabsw m7, m4
psignw m2, m3 ; constrain(diff_k1s1)
pabsw m3, m5
%if ARCH_X86_64
pmaxsw m9, m4
pminsw m10, m7
pmaxsw m9, m5
pminsw m10, m3
%else
pminsw m7, m10
pminsw m7, m3
pmaxsw m3, m9, m4
pmaxsw m3, m5
mova m10, m7
mova m9, m3
%endif
psubw m4, m1 ; diff_k1s2
psubw m5, m1 ; diff_k1s3
pabsw m3, m4 ; adiff_k1s2
paddw m0, m2
psrlw m2, m3, [sec_shift+gprsize]
psubusw m7, m6, m2
pabsw m2, m5 ; adiff_k1s3
pminsw m7, m3
psrlw m3, m2, [sec_shift+gprsize]
psignw m7, m4 ; constrain(diff_k1s2)
psubusw m4, m6, m3
pminsw m4, m2
paddw m0, m7
%if %1 == 4
movq m2, [tmpq+offq+32*0] ; k0p0
movhps m2, [tmpq+offq+32*1]
neg offq
movq m3, [tmpq+offq+32*0] ; k0p1
movhps m3, [tmpq+offq+32*1]
%else
movu m2, [tmpq+offq]
neg offq
movu m3, [tmpq+offq]
%endif
movsx offq, byte [dirq+5] ; off1_k1
pabsw m7, m2
psignw m4, m5 ; constrain(diff_k1s3)
pabsw m5, m3
%if ARCH_X86_64
pmaxsw m9, m2
pminsw m10, m7
pmaxsw m9, m3
pminsw m10, m5
%else
pminsw m7, m10
pminsw m7, m5
pmaxsw m5, m9, m2
pmaxsw m5, m3
mova m10, m7
mova m9, m5
%endif
psubw m2, m1 ; diff_k0p0
psubw m3, m1 ; diff_k0p1
paddw m0, m4
pabsw m4, m2 ; adiff_k0p0
psrlw m5, m4, [pri_shift+gprsize]
psubusw m7, [rsp+gprsize], m5
pabsw m5, m3 ; adiff_k0p1
pminsw m7, m4
psrlw m4, m5, [pri_shift+gprsize]
psignw m7, m2 ; constrain(diff_k0p0)
psubusw m2, [rsp+gprsize], m4
pminsw m2, m5
%if %1 == 4
movq m4, [tmpq+offq+32*0] ; k1p0
movhps m4, [tmpq+offq+32*1]
neg offq
movq m5, [tmpq+offq+32*0] ; k1p1
movhps m5, [tmpq+offq+32*1]
%else
movu m4, [tmpq+offq]
neg offq
movu m5, [tmpq+offq]
%endif
psignw m2, m3 ; constrain(diff_k0p1)
pabsw m3, m4
paddw m7, m2 ; constrain(diff_k0)
pabsw m2, m5
%if ARCH_X86_64
pmaxsw m9, m4
pminsw m10, m3
pmaxsw m9, m5
pminsw m10, m2
%else
pminsw m3, m10
pminsw m3, m2
pmaxsw m2, m9, m4
pmaxsw m2, m5
mova m10, m3
mova m9, m2
%endif
psubw m4, m1 ; diff_k1p0
psubw m5, m1 ; diff_k1p1
pabsw m3, m4 ; adiff_k1p0
pmullw m7, [priq+16*0] ; pri_tap_k0
paddw m0, m7
psrlw m2, m3, [pri_shift+gprsize]
psubusw m7, [rsp+16*0+gprsize], m2
pabsw m2, m5 ; adiff_k1p1
pminsw m7, m3
psrlw m3, m2, [pri_shift+gprsize]
psignw m7, m4 ; constrain(diff_k1p0)
psubusw m4, [rsp+16*0+gprsize], m3
pminsw m4, m2
psignw m4, m5 ; constrain(diff_k1p1)
paddw m7, m4 ; constrain(diff_k1)
pmullw m7, [priq+16*1] ; pri_tap_k1
paddw m0, m7 ; sum
psraw m2, m0, 15
paddw m0, m2
pmulhrsw m0, m8
paddw m0, m1
%if ARCH_X86_64
pmaxsw m9, m1
pminsw m0, m9
%else
pmaxsw m2, m9, m1
pminsw m0, m2
%endif
pminsw m1, m10
pmaxsw m0, m1
%if %1 == 4
add tmpq, 32*2
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
lea dstq, [dstq+strideq*2]
%else
add tmpq, 32
mova [dstq], m0
add dstq, strideq
%endif
ret
%endif
%endmacro
INIT_XMM ssse3
%if ARCH_X86_64
cglobal cdef_filter_4x4_16bpc, 4, 8, 9, 32*10, dst, stride, left, top, pri, sec, edge
%define px rsp+32*4
%else
cglobal cdef_filter_4x4_16bpc, 2, 7, 8, -32*11, dst, stride, edge, top, left
%define px rsp+32*5
%endif
%define base t0-dir_table
%define pri_shift px-16*6
%define sec_shift px-16*5
mov edged, r8m
LEA t0, dir_table
movu m0, [dstq+strideq*0]
movu m1, [dstq+strideq*1]
lea t1, [dstq+strideq*2]
movu m2, [t1 +strideq*0]
movu m3, [t1 +strideq*1]
movddup m7, [base+pw_m16384]
mova [px+32*0+0], m0
mova [px+32*1+0], m1
mova [px+32*2+0], m2
mova [px+32*3+0], m3
test edgeb, 4 ; HAVE_TOP
jz .no_top
movifnidn topq, topmp
movu m0, [topq+strideq*0]
movu m1, [topq+strideq*1]
mova [px-32*2+0], m0
mova [px-32*1+0], m1
test edgeb, 1 ; HAVE_LEFT
jz .top_no_left
movd m0, [topq+strideq*0-4]
movd m1, [topq+strideq*1-4]
movd [px-32*2-4], m0
movd [px-32*1-4], m1
jmp .top_done
.no_top:
mova [px-32*2+0], m7
mova [px-32*1+0], m7
.top_no_left:
movd [px-32*2-4], m7
movd [px-32*1-4], m7
.top_done:
test edgeb, 8 ; HAVE_BOTTOM
jz .no_bottom
lea r3, [dstq+strideq*4]
movu m0, [r3+strideq*0]
movu m1, [r3+strideq*1]
mova [px+32*4+0], m0
mova [px+32*5+0], m1
test edgeb, 1 ; HAVE_LEFT
jz .bottom_no_left
movd m0, [r3+strideq*0-4]
movd m1, [r3+strideq*1-4]
movd [px+32*4-4], m0
movd [px+32*5-4], m1
jmp .bottom_done
.no_bottom:
mova [px+32*4+0], m7
mova [px+32*5+0], m7
.bottom_no_left:
movd [px+32*4-4], m7
movd [px+32*5-4], m7
.bottom_done:
test edgeb, 1 ; HAVE_LEFT
jz .no_left
movifnidn leftq, r2mp
movd m0, [leftq+4*0]
movd m1, [leftq+4*1]
movd m2, [leftq+4*2]
movd m3, [leftq+4*3]
movd [px+32*0-4], m0
movd [px+32*1-4], m1
movd [px+32*2-4], m2
movd [px+32*3-4], m3
jmp .left_done
.no_left:
REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3
.left_done:
test edgeb, 2 ; HAVE_RIGHT
jnz .padding_done
REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5
.padding_done:
CDEF_FILTER 4, 4
%if ARCH_X86_64
cglobal cdef_filter_4x8_16bpc, 4, 8, 9, 32*14, dst, stride, left, top, pri, sec, edge
%else
cglobal cdef_filter_4x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left
%endif
mov edged, r8m
LEA t0, dir_table
movu m0, [dstq+strideq*0]
movu m1, [dstq+strideq*1]
lea t1, [dstq+strideq*2]
movu m2, [t1 +strideq*0]
movu m3, [t1 +strideq*1]
lea t1, [t1 +strideq*2]
movu m4, [t1 +strideq*0]
movu m5, [t1 +strideq*1]
lea t1, [t1 +strideq*2]
movu m6, [t1 +strideq*0]
movu m7, [t1 +strideq*1]
mova [px+32*0+0], m0
mova [px+32*1+0], m1
mova [px+32*2+0], m2
mova [px+32*3+0], m3
mova [px+32*4+0], m4
mova [px+32*5+0], m5
mova [px+32*6+0], m6
mova [px+32*7+0], m7
movddup m7, [base+pw_m16384]
test edgeb, 4 ; HAVE_TOP
jz .no_top
movifnidn topq, topmp
movu m0, [topq+strideq*0]
movu m1, [topq+strideq*1]
mova [px-32*2+0], m0
mova [px-32*1+0], m1
test edgeb, 1 ; HAVE_LEFT
jz .top_no_left
movd m0, [topq+strideq*0-4]
movd m1, [topq+strideq*1-4]
movd [px-32*2-4], m0
movd [px-32*1-4], m1
jmp .top_done
.no_top:
mova [px-32*2+0], m7
mova [px-32*1+0], m7
.top_no_left:
movd [px-32*2-4], m7
movd [px-32*1-4], m7
.top_done:
test edgeb, 8 ; HAVE_BOTTOM
jz .no_bottom
lea r3, [dstq+strideq*8]
movu m0, [r3+strideq*0]
movu m1, [r3+strideq*1]
mova [px+32*8+0], m0
mova [px+32*9+0], m1
test edgeb, 1 ; HAVE_LEFT
jz .bottom_no_left
movd m0, [r3+strideq*0-4]
movd m1, [r3+strideq*1-4]
movd [px+32*8-4], m0
movd [px+32*9-4], m1
jmp .bottom_done
.no_bottom:
mova [px+32*8+0], m7
mova [px+32*9+0], m7
.bottom_no_left:
movd [px+32*8-4], m7
movd [px+32*9-4], m7
.bottom_done:
test edgeb, 1 ; HAVE_LEFT
jz .no_left
movifnidn leftq, r2mp
movd m0, [leftq+4*0]
movd m1, [leftq+4*1]
movd m2, [leftq+4*2]
movd m3, [leftq+4*3]
movd [px+32*0-4], m0
movd [px+32*1-4], m1
movd [px+32*2-4], m2
movd [px+32*3-4], m3
movd m0, [leftq+4*4]
movd m1, [leftq+4*5]
movd m2, [leftq+4*6]
movd m3, [leftq+4*7]
movd [px+32*4-4], m0
movd [px+32*5-4], m1
movd [px+32*6-4], m2
movd [px+32*7-4], m3
jmp .left_done
.no_left:
REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3, 4, 5, 6, 7
.left_done:
test edgeb, 2 ; HAVE_RIGHT
jnz .padding_done
REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
.padding_done:
CDEF_FILTER 4, 8
%if ARCH_X86_64
cglobal cdef_filter_8x8_16bpc, 4, 8, 9, 32*14, dst, stride, left, top, pri, sec, edge
%else
cglobal cdef_filter_8x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left
%endif
mov edged, r8m
LEA t0, dir_table
mova m0, [dstq+strideq*0+ 0]
movd m1, [dstq+strideq*0+16]
mova m2, [dstq+strideq*1+ 0]
movd m3, [dstq+strideq*1+16]
lea t1, [dstq+strideq*2]
mova m4, [t1 +strideq*0+ 0]
movd m5, [t1 +strideq*0+16]
mova m6, [t1 +strideq*1+ 0]
movd m7, [t1 +strideq*1+16]
lea t1, [t1 +strideq*2]
mova [px+32*0+ 0], m0
movd [px+32*0+16], m1
mova [px+32*1+ 0], m2
movd [px+32*1+16], m3
mova [px+32*2+ 0], m4
movd [px+32*2+16], m5
mova [px+32*3+ 0], m6
movd [px+32*3+16], m7
mova m0, [t1 +strideq*0+ 0]
movd m1, [t1 +strideq*0+16]
mova m2, [t1 +strideq*1+ 0]
movd m3, [t1 +strideq*1+16]
lea t1, [t1 +strideq*2]
mova m4, [t1 +strideq*0+ 0]
movd m5, [t1 +strideq*0+16]
mova m6, [t1 +strideq*1+ 0]
movd m7, [t1 +strideq*1+16]
mova [px+32*4+ 0], m0
movd [px+32*4+16], m1
mova [px+32*5+ 0], m2
movd [px+32*5+16], m3
mova [px+32*6+ 0], m4
movd [px+32*6+16], m5
mova [px+32*7+ 0], m6
movd [px+32*7+16], m7
movddup m7, [base+pw_m16384]
test edgeb, 4 ; HAVE_TOP
jz .no_top
movifnidn topq, topmp
mova m0, [topq+strideq*0+ 0]
mova m1, [topq+strideq*0+16]
mova m2, [topq+strideq*1+ 0]
mova m3, [topq+strideq*1+16]
mova [px-32*2+ 0], m0
movd [px-32*2+16], m1
mova [px-32*1+ 0], m2
movd [px-32*1+16], m3
test edgeb, 1 ; HAVE_LEFT
jz .top_no_left
movd m0, [topq+strideq*0-4]
movd m1, [topq+strideq*1-4]
movd [px-32*2-4], m0
movd [px-32*1-4], m1
jmp .top_done
.no_top:
mova [px-32*2+ 0], m7
movd [px-32*2+16], m7
mova [px-32*1+ 0], m7
movd [px-32*1+16], m7
.top_no_left:
movd [px-32*2- 4], m7
movd [px-32*1- 4], m7
.top_done:
test edgeb, 8 ; HAVE_BOTTOM
jz .no_bottom
lea r3, [dstq+strideq*8]
mova m0, [r3+strideq*0+ 0]
movd m1, [r3+strideq*0+16]
mova m2, [r3+strideq*1+ 0]
movd m3, [r3+strideq*1+16]
mova [px+32*8+ 0], m0
movd [px+32*8+16], m1
mova [px+32*9+ 0], m2
movd [px+32*9+16], m3
test edgeb, 1 ; HAVE_LEFT
jz .bottom_no_left
movd m0, [r3+strideq*0-4]
movd m1, [r3+strideq*1-4]
movd [px+32*8- 4], m0
movd [px+32*9- 4], m1
jmp .bottom_done
.no_bottom:
mova [px+32*8+ 0], m7
movd [px+32*8+16], m7
mova [px+32*9+ 0], m7
movd [px+32*9+16], m7
.bottom_no_left:
movd [px+32*8- 4], m7
movd [px+32*9- 4], m7
.bottom_done:
test edgeb, 1 ; HAVE_LEFT
jz .no_left
movifnidn leftq, r2mp
movd m0, [leftq+4*0]
movd m1, [leftq+4*1]
movd m2, [leftq+4*2]
movd m3, [leftq+4*3]
movd [px+32*0- 4], m0
movd [px+32*1- 4], m1
movd [px+32*2- 4], m2
movd [px+32*3- 4], m3
movd m0, [leftq+4*4]
movd m1, [leftq+4*5]
movd m2, [leftq+4*6]
movd m3, [leftq+4*7]
movd [px+32*4- 4], m0
movd [px+32*5- 4], m1
movd [px+32*6- 4], m2
movd [px+32*7- 4], m3
jmp .left_done
.no_left:
REPX {movd [px+32*x- 4], m7}, 0, 1, 2, 3, 4, 5, 6, 7
.left_done:
test edgeb, 2 ; HAVE_RIGHT
jnz .padding_done
REPX {movd [px+32*x+16], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
.padding_done:
CDEF_FILTER 8, 8
%macro CDEF_DIR 0
%if ARCH_X86_64
cglobal cdef_dir_16bpc, 4, 7, 16, src, stride, var, bdmax

4
third_party/dav1d/src/x86/cdef_avx2.asm vendored
View file

@ -472,7 +472,6 @@ cglobal cdef_filter_%1x%2_8bpc, 4, 9, 0, dst, stride, left, top, \
movd xm1, secdmpd
lzcnt secdmpd, secdmpd
add secdmpd, dampingd
cmovs secdmpd, zerod
mov [rsp+8], secdmpq ; sec_shift
DEFINE_ARGS dst, stride, left, top, pri, secdmp, table, pridmp
@ -552,7 +551,6 @@ cglobal cdef_filter_%1x%2_8bpc, 4, 9, 0, dst, stride, left, top, \
movd xm1, secdmpd
lzcnt secdmpd, secdmpd
add secdmpd, dampingd
cmovs secdmpd, zerod
mov [rsp+8], secdmpq ; sec_shift
DEFINE_ARGS dst, stride, left, top, _, secdmp, table
lea tableq, [tap_table]
@ -1481,7 +1479,6 @@ cglobal cdef_filter_%1x%2_8bpc, 4, 9, 0, dst, stride, left, top, \
movd xm1, secdmpd
lzcnt secdmpd, secdmpd
add secdmpd, dampingd
cmovs secdmpd, zerod
mov [rsp+8], secdmpq ; sec_shift
DEFINE_ARGS dst, stride, pridmp, table, pri, secdmp, stride3
@ -1556,7 +1553,6 @@ cglobal cdef_filter_%1x%2_8bpc, 4, 9, 0, dst, stride, left, top, \
movd xm1, secdmpd
lzcnt secdmpd, secdmpd
add secdmpd, dampingd
cmovs secdmpd, zerod
mov [rsp+8], secdmpq ; sec_shift
DEFINE_ARGS dst, stride, _, table, _, secdmp, stride3
lea tableq, [tap_table]

6
third_party/dav1d/src/x86/cdef_init_tmpl.c vendored
View file

@ -46,9 +46,9 @@ decl_cdef_dir_fn(BF(dav1d_cdef_dir, ssse3));
COLD void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
#if BITDEPTH == 8
if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
#if BITDEPTH == 8
c->fb[0] = BF(dav1d_cdef_filter_8x8, sse2);
c->fb[1] = BF(dav1d_cdef_filter_4x8, sse2);
c->fb[2] = BF(dav1d_cdef_filter_4x4, sse2);
@ -57,11 +57,9 @@ COLD void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
c->dir = BF(dav1d_cdef_dir, ssse3);
#if BITDEPTH == 8
c->fb[0] = BF(dav1d_cdef_filter_8x8, ssse3);
c->fb[1] = BF(dav1d_cdef_filter_4x8, ssse3);
c->fb[2] = BF(dav1d_cdef_filter_4x4, ssse3);
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
@ -77,9 +75,7 @@ COLD void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
c->dir = BF(dav1d_cdef_dir, avx2);
c->fb[0] = BF(dav1d_cdef_filter_8x8, avx2);
#if BITDEPTH == 8
c->fb[1] = BF(dav1d_cdef_filter_4x8, avx2);
#endif
c->fb[2] = BF(dav1d_cdef_filter_4x4, avx2);
if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;

5
third_party/dav1d/src/x86/cdef_sse.asm vendored
View file

@ -566,7 +566,7 @@ cglobal cdef_filter_%1x%2_8bpc, 2, 7, 8, - 7 * 16 - (%2+4)*32, \
test secd, secd
jz .pri_only
movd m10, r5m
bsr secd, secd
tzcnt secd, secd
and prid, 1
sub pridmpd, dampingd
sub secd, dampingd
@ -575,7 +575,6 @@ cglobal cdef_filter_%1x%2_8bpc, 2, 7, 8, - 7 * 16 - (%2+4)*32, \
neg pridmpd
cmovs pridmpd, dampingd
neg secd
cmovs secd, dampingd
PSHUFB_0 m1, m7
PSHUFB_0 m10, m7
%if ARCH_X86_64
@ -697,7 +696,7 @@ cglobal cdef_filter_%1x%2_8bpc, 2, 7, 8, - 7 * 16 - (%2+4)*32, \
DEFINE_ARGS dst, stride, sec, damping, dir, tap, zero
%endif
movd m1, r5m
bsr secd, secd
tzcnt secd, secd
mov dird, r6m
xor zerod, zerod
sub dampingd, secd

File diff suppressed because it is too large. Load diff

3450
third_party/dav1d/src/x86/film_grain16_sse.asm vendored Normal file

File diff suppressed because it is too large. Load diff

205
third_party/dav1d/src/x86/film_grain_avx2.asm vendored
View file

@ -1,4 +1,4 @@
; Copyright © 2019, VideoLAN and dav1d authors
; Copyright © 2019-2021, VideoLAN and dav1d authors
; Copyright © 2019, Two Orioles, LLC
; All rights reserved.
;
@ -38,7 +38,8 @@ byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
pw_seed_xor: times 2 dw 0xb524
times 2 dw 0x49d8
pd_m65536: dd ~0xffff
pb_23_22: times 2 db 23, 22
pb_23_22: db 23, 22
times 3 db 0, 32
pb_1: times 4 db 1
hmul_bits: dw 32768, 16384, 8192, 4096
round: dw 2048, 1024, 512
@ -47,24 +48,25 @@ round_vals: dw 32, 64, 128, 256, 512
max: dw 255, 240, 235
min: dw 0, 16
pb_27_17_17_27: db 27, 17, 17, 27
times 2 db 0, 32
pw_1: dw 1
%macro JMP_TABLE 1-*
%xdefine %1_table %%table
%xdefine %%base %1_table
%xdefine %%prefix mangle(private_prefix %+ _%1)
%macro JMP_TABLE 2-*
%xdefine %1_8bpc_%2_table %%table
%xdefine %%base %1_8bpc_%2_table
%xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
%%table:
%rep %0 - 1
dd %%prefix %+ .ar%2 - %%base
%rep %0 - 2
dd %%prefix %+ .ar%3 - %%base
%rotate 1
%endrep
%endmacro
ALIGN 4
JMP_TABLE generate_grain_y_avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420_avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_422_avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_444_avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_y, avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420, avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_422, avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_444, avx2, 0, 1, 2, 3
struc FGData
.seed: resd 1
@ -90,8 +92,16 @@ cextern gaussian_sequence
SECTION .text
%macro REPX 2-*
%xdefine %%f(x) %1
%rep %0 - 1
%rotate 1
%%f(%1)
%endrep
%endmacro
INIT_XMM avx2
cglobal generate_grain_y, 2, 9, 16, buf, fg_data
cglobal generate_grain_y_8bpc, 2, 9, 16, buf, fg_data
lea r4, [pb_mask]
%define base r4-pb_mask
movq xm1, [base+rnd_next_upperbit_mask]
@ -132,8 +142,8 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
; auto-regression code
movsxd r2, [fg_dataq+FGData.ar_coeff_lag]
movsxd r2, [base+generate_grain_y_avx2_table+r2*4]
lea r2, [r2+base+generate_grain_y_avx2_table]
movsxd r2, [base+generate_grain_y_8bpc_avx2_table+r2*4]
lea r2, [r2+base+generate_grain_y_8bpc_avx2_table]
jmp r2
.ar1:
@ -420,7 +430,7 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
INIT_XMM avx2
cglobal generate_grain_uv_%1, 4, 10, 16, buf, bufy, fg_data, uv
cglobal generate_grain_uv_%1_8bpc, 4, 10, 16, buf, bufy, fg_data, uv
lea r4, [pb_mask]
%define base r4-pb_mask
movq xm1, [base+rnd_next_upperbit_mask]
@ -478,8 +488,8 @@ cglobal generate_grain_uv_%1, 4, 10, 16, buf, bufy, fg_data, uv
; auto-regression code
movsxd r5, [fg_dataq+FGData.ar_coeff_lag]
movsxd r5, [base+generate_grain_uv_%1_avx2_table+r5*4]
lea r5, [r5+base+generate_grain_uv_%1_avx2_table]
movsxd r5, [base+generate_grain_uv_%1_8bpc_avx2_table+r5*4]
lea r5, [r5+base+generate_grain_uv_%1_8bpc_avx2_table]
jmp r5
.ar0:
@ -975,7 +985,7 @@ generate_grain_uv_fn 422, 1, 0
generate_grain_uv_fn 444, 0, 0
INIT_YMM avx2
cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut
cglobal fgy_32x32xn_8bpc, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut
pcmpeqw m10, m10
psrld m10, 24
mov r7d, [fg_dataq+FGData.scaling_shift]
@ -1092,12 +1102,12 @@ cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut
jz .loop_x
; r8m = sbym
movd xm15, [pb_27_17_17_27]
movq xm15, [pb_27_17_17_27]
cmp dword r8m, 0
jne .loop_x_hv_overlap
; horizontal overlap (without vertical overlap)
movd xm14, [pw_1024]
movq xm14, [pw_1024]
.loop_x_h_overlap:
mov r6d, seed
or seed, 0xEFF4
@ -1156,8 +1166,7 @@ cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut
pmaddubsw xm4, xm15, xm4
pmulhrsw xm4, xm14
packsswb xm4, xm4
vpblendw xm4, xm3, 11111110b
vpblendd m3, m4, 00001111b
vpblendd m3, m3, m4, 00000001b
pcmpgtb m7, m2, m3
punpcklbw m2, m3, m7
punpckhbw m3, m7
@ -1329,7 +1338,7 @@ cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut
; back to .loop_x_v_overlap, and instead always fall-through to
; h+v overlap
movd xm15, [pb_27_17_17_27]
movq xm15, [pb_27_17_17_27]
.loop_x_hv_overlap:
vpbroadcastw m8, [pb_27_17_17_27]
@ -1409,10 +1418,8 @@ cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut
pmulhrsw xm7, xm14
packsswb xm4, xm4
packsswb xm7, xm7
vpblendw xm4, xm3, 11111110b
vpblendw xm7, xm6, 11111110b
vpblendd m3, m4, 00001111b
vpblendd m6, m7, 00001111b
vpblendd m3, m4, 00000001b
vpblendd m6, m7, 00000001b
; followed by v interpolation (top | cur -> cur)
punpckhbw m7, m6, m3
punpcklbw m6, m3
@ -1461,10 +1468,8 @@ cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut
RET
%macro FGUV_FN 3 ; name, ss_hor, ss_ver
cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
grain_lut, h, sby, luma, lstride, uv_pl, is_id
pcmpeqw m10, m10
psrld m10, 24
mov r7d, [fg_dataq+FGData.scaling_shift]
lea r8, [pb_mask]
%define base r8-pb_mask
@ -1490,10 +1495,15 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
%else
vpbroadcastd m14, [pw_1024]
%if %2
vpbroadcastd m15, [pb_23_22]
vpbroadcastq m15, [pb_23_22]
%else
vpbroadcastd xm15, [pb_27_17_17_27]
vpbroadcastq xm15, [pb_27_17_17_27]
%endif
%endif
%if %3
vpbroadcastw m10, [pb_23_22]
%elif %2
mova m10, [pb_8x_27_17_8x_17_27]
%endif
mov overlapd, [fg_dataq+FGData.overlap_flag]
@ -1593,16 +1603,13 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
; scaling[luma_src]
pcmpeqw m3, m3
pcmpeqw m9, m9
vpgatherdd m8, [scalingq+m4], m3
vpgatherdd m4, [scalingq+m5], m9
vpgatherdd m8, [scalingq-3+m4], m3
vpgatherdd m4, [scalingq-3+m5], m9
pcmpeqw m3, m3
pcmpeqw m9, m9
vpgatherdd m5, [scalingq+m6], m3
vpgatherdd m6, [scalingq+m7], m9
pand m8, m10
pand m4, m10
pand m5, m10
pand m6, m10
vpgatherdd m5, [scalingq-3+m6], m3
vpgatherdd m6, [scalingq-3+m7], m9
REPX {psrld x, 24}, m8, m4, m5, m6
packusdw m8, m4
packusdw m5, m6
@ -1743,16 +1750,13 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
; scaling[luma_src]
pcmpeqw m3, m3
pcmpeqw m9, m9
vpgatherdd m8, [scalingq+m4], m3
vpgatherdd m4, [scalingq+m5], m9
vpgatherdd m8, [scalingq-3+m4], m3
vpgatherdd m4, [scalingq-3+m5], m9
pcmpeqw m3, m3
pcmpeqw m9, m9
vpgatherdd m5, [scalingq+m6], m3
vpgatherdd m6, [scalingq+m7], m9
pand m8, m10
pand m4, m10
pand m5, m10
pand m6, m10
vpgatherdd m5, [scalingq-3+m6], m3
vpgatherdd m6, [scalingq-3+m7], m9
REPX {psrld x, 24}, m8, m4, m5, m6
packusdw m8, m4
packusdw m5, m6
@ -1763,7 +1767,7 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
; grain = grain_lut[offy+y][offx+x]
%if %2
%if %1
vpbroadcastd m6, [pb_23_22] ; FIXME
vpbroadcastq m6, [pb_23_22]
%endif
movu xm3, [grain_lutq+offxyq+ 0]
movd xm4, [grain_lutq+left_offxyq+ 0]
@ -1778,12 +1782,10 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
pmulhrsw m4, m14
%endif
packsswb m4, m4
pcmpeqw m6, m6 ; FIXME
psrldq m6, 15 ; FIXME
vpblendvb m3, m3, m4, m6
vpblendd m3, m3, m4, 00010001b
%else
%if %1
vpbroadcastd xm6, [pb_27_17_17_27]
movq xm6, [pb_27_17_17_27]
%endif
movu m3, [grain_lutq+offxyq]
movd xm4, [grain_lutq+left_offxyq]
@ -1796,9 +1798,7 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
pmulhrsw xm4, xm14
%endif
packsswb xm4, xm4
pcmpeqw xm6, xm6
psrldq xm6, 14
vpblendvb m3, m3, m4, m6
vpblendd m3, m3, m4, 00000001b
%endif
pcmpgtb m7, m2, m3
punpcklbw m2, m3, m7
@ -1915,7 +1915,7 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
mov hd, hm
mov grain_lutq, grain_lutmp
%if %2 == 0
vbroadcasti128 m1, [pb_8x_27_17_8x_17_27]
vbroadcasti128 m10, [pb_8x_27_17_8x_17_27]
%endif
%%loop_y_v_overlap:
; src
@ -1966,16 +1966,13 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
; scaling[luma_src]
pcmpeqw m3, m3
pcmpeqw m9, m9
vpgatherdd m8, [scalingq+m4], m3
vpgatherdd m4, [scalingq+m5], m9
vpgatherdd m8, [scalingq-3+m4], m3
vpgatherdd m4, [scalingq-3+m5], m9
pcmpeqw m3, m3
pcmpeqw m9, m9
vpgatherdd m5, [scalingq+m6], m3
vpgatherdd m6, [scalingq+m7], m9
pand m8, m10
pand m4, m10
pand m5, m10
pand m6, m10
vpgatherdd m5, [scalingq-3+m6], m3
vpgatherdd m6, [scalingq-3+m7], m9
REPX {psrld x, 24}, m8, m4, m5, m6
packusdw m8, m4
packusdw m5, m6
@ -1988,7 +1985,6 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
; grain = grain_lut[offy+y][offx+x]
%if %3 == 0
%if %2
mova m6, [pb_8x_27_17_8x_17_27]
movu xm3, [grain_lutq+offxyq]
movu xm4, [grain_lutq+top_offxyq]
vinserti128 m3, [grain_lutq+offxyq+82], 1
@ -1999,13 +1995,8 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
%endif
punpckhbw m9, m4, m3
punpcklbw m4, m3
%if %2
pmaddubsw m9, m6, m9
pmaddubsw m4, m6, m4
%else
pmaddubsw m9, m1, m9
pmaddubsw m4, m1, m4
%endif
pmaddubsw m9, m10, m9
pmaddubsw m4, m10, m4
%if %1
pmulhrsw m9, [pw_1024]
pmulhrsw m4, [pw_1024]
@ -2015,19 +2006,15 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
%endif
packsswb m3, m4, m9
%else
%if %1
vpbroadcastd m6, [pb_23_22]
%endif
movq xm3, [grain_lutq+offxyq]
movq xm4, [grain_lutq+top_offxyq]
vinserti128 m3, [grain_lutq+offxyq+8], 1
vinserti128 m4, [grain_lutq+top_offxyq+8], 1
punpcklbw m4, m3
pmaddubsw m4, m10, m4
%if %1
pmaddubsw m4, m6, m4
pmulhrsw m4, [pw_1024]
%else
pmaddubsw m4, m15, m4
pmulhrsw m4, m14
%endif
packsswb m4, m4
@ -2084,7 +2071,7 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
%endif
add grain_lutq, 82<<%2
%if %2 == 0
vbroadcasti128 m1, [pb_8x_27_17_8x_17_27+16]
vbroadcasti128 m10, [pb_8x_27_17_8x_17_27+16]
btc hd, 16
jnc %%loop_y_v_overlap
%endif
@ -2139,7 +2126,7 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
mov hd, hm
mov grain_lutq, grain_lutmp
%if %2 == 0
vbroadcasti128 m1, [pb_8x_27_17_8x_17_27]
vbroadcasti128 m10, [pb_8x_27_17_8x_17_27]
%endif
%%loop_y_hv_overlap:
; src
@ -2190,16 +2177,13 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
; scaling[src]
pcmpeqw m9, m9
pcmpeqw m3, m3
vpgatherdd m8, [scalingq+m4], m9
vpgatherdd m4, [scalingq+m5], m3
vpgatherdd m8, [scalingq-3+m4], m9
vpgatherdd m4, [scalingq-3+m5], m3
pcmpeqw m9, m9
pcmpeqw m3, m3
vpgatherdd m5, [scalingq+m6], m9
vpgatherdd m6, [scalingq+m7], m3
pand m8, m10
pand m4, m10
pand m5, m10
pand m6, m10
vpgatherdd m5, [scalingq-3+m6], m9
vpgatherdd m6, [scalingq-3+m7], m3
REPX {psrld x, 24}, m8, m4, m5, m6
packusdw m8, m4
packusdw m5, m6
@ -2212,9 +2196,9 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
; grain = grain_lut[offy+y][offx+x]
%if %1
%if %2
vpbroadcastd m9, [pb_23_22]
vpbroadcastq m9, [pb_23_22]
%else
vpbroadcastd xm9, [pb_27_17_17_27]
vpbroadcastq xm9, [pb_27_17_17_27]
%endif
%endif
@ -2252,7 +2236,7 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
%else
punpcklbw m7, m6
%endif
punpcklwd m4, m7
punpcklqdq m4, m7
%if %1
pmaddubsw m4, m9, m4
pmulhrsw m4, [pw_1024]
@ -2261,18 +2245,17 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
pmulhrsw m4, m14
%endif
packsswb m4, m4
pcmpeqw m9, m9 ; this is kind of ugly
psrldq m9, 15
vpblendvb m3, m3, m4, m9
psrldq m4, 1
vpblendd m3, m4, 00010001b
psrldq m4, 4
%if %3
shufpd m9, m9, m9, 1110b ; clear upper lane
vpblendd m6, m6, m4, 00000001b
%else
vpblendd m6, m6, m4, 00010001b
%endif
vpblendvb m6, m6, m4, m9
%else
punpcklbw xm4, xm3
punpcklbw xm7, xm6
punpckldq xm4, xm7
punpcklqdq xm4, xm7
%if %1
pmaddubsw xm4, xm9, xm4
pmulhrsw xm4, [pw_1024]
@ -2281,23 +2264,19 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
pmulhrsw xm4, xm14
%endif
packsswb xm4, xm4
pcmpeqw xm9, xm9 ; this is kind of ugly
psrldq xm9, 14
vpblendvb m3, m3, m4, m9
psrldq xm4, 2
vpblendvb m6, m6, m4, m9
vpblendd m3, m3, m4, 00000001b
psrldq xm4, 4
vpblendd m6, m6, m4, 00000001b
%endif
; followed by v interpolation (top | cur -> cur)
%if %3
vpermq m9, m3, q3120
punpcklbw m6, m9
pmaddubsw m6, m10, m6
%if %1
vpbroadcastd m9, [pb_23_22]
pmaddubsw m6, m9, m6
pmulhrsw m6, [pw_1024]
%else
pmaddubsw m6, m15, m6
pmulhrsw m6, m14
%endif
packsswb m6, m6
@ -2306,14 +2285,8 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
%else
punpckhbw m9, m6, m3
punpcklbw m6, m3
%if %2
mova m3, [pb_8x_27_17_8x_17_27]
pmaddubsw m9, m3, m9
pmaddubsw m6, m3, m6
%else
pmaddubsw m9, m1, m9
pmaddubsw m6, m1, m6
%endif
pmaddubsw m9, m10, m9
pmaddubsw m6, m10, m6
%if %1
pmulhrsw m9, [pw_1024]
pmulhrsw m6, [pw_1024]
@ -2373,7 +2346,7 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
jg %%loop_y_h_overlap
%else
je %%end_y_hv_overlap
vbroadcasti128 m1, [pb_8x_27_17_8x_17_27+16]
vbroadcasti128 m10, [pb_8x_27_17_8x_17_27+16]
btc hd, 16
jnc %%loop_y_hv_overlap
jmp %%loop_y_h_overlap

third_party/dav1d/src/x86/film_grain_init_tmpl.c

@ -1,5 +1,5 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018-2021, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
@ -28,64 +28,48 @@
#include "src/cpu.h"
#include "src/film_grain.h"
decl_generate_grain_y_fn(dav1d_generate_grain_y_ssse3);
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_ssse3);
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_422_ssse3);
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_444_ssse3);
decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_ssse3);
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_ssse3);
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i422_ssse3);
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i444_ssse3);
decl_generate_grain_y_fn(BF(dav1d_generate_grain_y, ssse3));
decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_420, ssse3));
decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_422, ssse3));
decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_444, ssse3));
decl_fgy_32x32xn_fn(BF(dav1d_fgy_32x32xn, ssse3));
decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i420, ssse3));
decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i422, ssse3));
decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i444, ssse3));
decl_generate_grain_y_fn(dav1d_generate_grain_y_avx2);
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_avx2);
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_422_avx2);
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_444_avx2);
decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_avx2);
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_avx2);
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i422_avx2);
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i444_avx2);
decl_generate_grain_y_fn(dav1d_generate_grain_y_16bpc_avx2);
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_16bpc_avx2);
decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_16bpc_avx2);
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_16bpc_avx2);
decl_generate_grain_y_fn(BF(dav1d_generate_grain_y, avx2));
decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_420, avx2));
decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_422, avx2));
decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_444, avx2));
decl_fgy_32x32xn_fn(BF(dav1d_fgy_32x32xn, avx2));
decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i420, avx2));
decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i422, avx2));
decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i444, avx2));
COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
#if BITDEPTH == 8
c->generate_grain_y = dav1d_generate_grain_y_ssse3;
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_generate_grain_uv_420_ssse3;
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_generate_grain_uv_422_ssse3;
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_generate_grain_uv_444_ssse3;
c->fgy_32x32xn = dav1d_fgy_32x32xn_ssse3;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_ssse3;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_fguv_32x32xn_i422_ssse3;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_fguv_32x32xn_i444_ssse3;
#endif
c->generate_grain_y = BF(dav1d_generate_grain_y, ssse3);
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, ssse3);
c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, ssse3);
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, ssse3);
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, ssse3);
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, ssse3);
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, ssse3);
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, ssse3);
#if ARCH_X86_64
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
#if BITDEPTH == 8
c->generate_grain_y = dav1d_generate_grain_y_avx2;
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_generate_grain_uv_420_avx2;
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_generate_grain_uv_422_avx2;
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_generate_grain_uv_444_avx2;
c->fgy_32x32xn = dav1d_fgy_32x32xn_avx2;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_avx2;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_fguv_32x32xn_i422_avx2;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_fguv_32x32xn_i444_avx2;
#else
c->generate_grain_y = dav1d_generate_grain_y_16bpc_avx2;
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] =
dav1d_generate_grain_uv_420_16bpc_avx2;
c->fgy_32x32xn = dav1d_fgy_32x32xn_16bpc_avx2;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] =
dav1d_fguv_32x32xn_i420_16bpc_avx2;
#endif
c->generate_grain_y = BF(dav1d_generate_grain_y, avx2);
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, avx2);
c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx2);
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx2);
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, avx2);
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, avx2);
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx2);
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx2);
#endif
}

503 third_party/dav1d/src/x86/film_grain_sse.asm (vendored)

@ -1,4 +1,4 @@
; Copyright © 2019, VideoLAN and dav1d authors
; Copyright © 2019-2021, VideoLAN and dav1d authors
; Copyright © 2019, Two Orioles, LLC
; All rights reserved.
;
@ -29,14 +29,18 @@
SECTION_RODATA
pw_1024: times 8 dw 1024
pb_27_17_17_27: db 27, 17, 17, 27
times 6 db 0, 32
pb_23_22_h: db 23, 22
times 7 db 0, 32
pb_27_17: times 8 db 27, 17
pb_17_27: times 8 db 17, 27
pb_23_22: times 8 db 23, 22
pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
pw_seed_xor: times 2 dw 0xb524
times 2 dw 0x49d8
pb_23_22: times 2 db 23, 22
pb_1: times 4 db 1
hmul_bits: dw 32768, 16384, 8192, 4096
round: dw 2048, 1024, 512
@ -46,23 +50,21 @@ max: dw 255, 240, 235
min: dw 0, 16
pw_1: dw 1
%define pb_27_17_17_27 pb_17_27 - 2
%macro JMP_TABLE 1-*
%xdefine %1_table %%table
%xdefine %%base %1_table
%xdefine %%prefix mangle(private_prefix %+ _%1)
%macro JMP_TABLE 2-*
%xdefine %1_8bpc_%2_table %%table
%xdefine %%base %1_8bpc_%2_table
%xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
%%table:
%rep %0 - 1
dd %%prefix %+ .ar%2 - %%base
%rep %0 - 2
dd %%prefix %+ .ar%3 - %%base
%rotate 1
%endrep
%endmacro
JMP_TABLE generate_grain_y_ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420_ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_422_ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_444_ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_y, ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420, ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_422, ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_444, ssse3, 0, 1, 2, 3
struc FGData
.seed: resd 1
@ -88,6 +90,20 @@ cextern gaussian_sequence
SECTION .text
%macro REPX 2-*
%xdefine %%f(x) %1
%rep %0 - 1
%rotate 1
%%f(%1)
%endrep
%endmacro
%if ARCH_X86_32
%define PIC_ptr(a) base+a
%else
%define PIC_ptr(a) a
%endif
%macro SCRATCH 3
%if ARCH_X86_32
mova [rsp+%3*mmsize], m%1
@ -98,7 +114,7 @@ SECTION .text
%endmacro
INIT_XMM ssse3
cglobal generate_grain_y, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data
cglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data
LEA r4, $$
%define base r4-$$
movq m1, [base+rnd_next_upperbit_mask]
@ -164,8 +180,8 @@ cglobal generate_grain_y, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data
; auto-regression code
movsxd r2, [fg_dataq+FGData.ar_coeff_lag]
movsxd r2, [base+generate_grain_y_ssse3_table+r2*4]
lea r2, [r2+base+generate_grain_y_ssse3_table]
movsxd r2, [base+generate_grain_y_8bpc_ssse3_table+r2*4]
lea r2, [r2+base+generate_grain_y_8bpc_ssse3_table]
jmp r2
.ar1:
@ -507,7 +523,7 @@ cglobal generate_grain_y, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data
%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
INIT_XMM ssse3
cglobal generate_grain_uv_%1, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv
cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv
movifnidn r2, r2mp
movifnidn r3, r3mp
LEA r4, $$
@ -606,8 +622,8 @@ cglobal generate_grain_uv_%1, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv
; auto-regression code
movsxd r5, [fg_dataq+FGData.ar_coeff_lag]
movsxd r5, [base+generate_grain_uv_%1_ssse3_table+r5*4]
lea r5, [r5+base+generate_grain_uv_%1_ssse3_table]
movsxd r5, [base+generate_grain_uv_%1_8bpc_ssse3_table+r5*4]
lea r5, [r5+base+generate_grain_uv_%1_8bpc_ssse3_table]
jmp r5
.ar0:
@ -1284,7 +1300,7 @@ INIT_XMM ssse3
; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby)
%if ARCH_X86_32
%if STACK_ALIGNMENT < mmsize
cglobal fgy_32x32xn, 0, 7, 16, 0 - (6 * mmsize + (9 + 3) * gprsize), \
cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (5 * mmsize + 16 * gprsize), \
dst, src, scaling, unused1, fg_data, picptr, unused2
; copy stack arguments to new position post-alignment, so that we
; don't have to keep the old stack location in a separate register
@ -1295,43 +1311,41 @@ cglobal fgy_32x32xn, 0, 7, 16, 0 - (6 * mmsize + (9 + 3) * gprsize), \
mov r4, r7m
mov r5, r8m
mov [rsp+6*mmsize+ 3*gprsize], r0
mov [rsp+6*mmsize+ 5*gprsize], r1
mov [rsp+6*mmsize+ 7*gprsize], r2
mov [rsp+6*mmsize+ 9*gprsize], r3
mov [rsp+6*mmsize+10*gprsize], r4
mov [rsp+6*mmsize+11*gprsize], r5
mov [rsp+5*mmsize+ 4*gprsize], r0
mov [rsp+5*mmsize+ 6*gprsize], r1
mov [rsp+5*mmsize+ 8*gprsize], r2
mov [rsp+5*mmsize+10*gprsize], r3
mov [rsp+5*mmsize+11*gprsize], r4
mov [rsp+5*mmsize+12*gprsize], r5
%else
cglobal fgy_32x32xn, 0, 7, 16, 6 * mmsize + (3 + 1) * gprsize, \
cglobal fgy_32x32xn_8bpc, 0, 7, 16, 5 * mmsize + 4 * gprsize, \
dst, src, scaling, unused1, fg_data, picptr, unused2
%endif
mov srcq, srcm
mov fg_dataq, r3m
mov scalingq, r5m
%if STACK_ALIGNMENT < mmsize
%define r0m [rsp+6*mmsize+ 3*gprsize]
%define r1m [rsp+6*mmsize+ 4*gprsize]
%define r2m [rsp+6*mmsize+ 5*gprsize]
%define r3m [rsp+6*mmsize+ 6*gprsize]
%define r4m [rsp+6*mmsize+ 7*gprsize]
%define r5m [rsp+6*mmsize+ 8*gprsize]
%define r6m [rsp+6*mmsize+ 9*gprsize]
%define r7m [rsp+6*mmsize+10*gprsize]
%define r8m [rsp+6*mmsize+11*gprsize]
%define r0m [rsp+5*mmsize+ 4*gprsize]
%define r1m [rsp+5*mmsize+ 5*gprsize]
%define r2m [rsp+5*mmsize+ 6*gprsize]
%define r3m [rsp+5*mmsize+ 7*gprsize]
%define r4m [rsp+5*mmsize+ 8*gprsize]
%define r5m [rsp+5*mmsize+ 9*gprsize]
%define r6m [rsp+5*mmsize+10*gprsize]
%define r7m [rsp+5*mmsize+11*gprsize]
%define r8m [rsp+5*mmsize+12*gprsize]
%endif
LEA r5, pb_mask
%define base r5-pb_mask
mov r5m, picptrq
%else
cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
lea r7, [pb_mask]
%define base r7-pb_mask
%endif
mov r6d, [fg_dataq+FGData.scaling_shift]
movd m3, [base+mul_bits+r6*2-14]
mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
pcmpeqw m2, m2
psrldq m2, 14
movd m4, [base+max+r6*4]
movd m5, [base+min+r6*2]
punpcklwd m3, m3
@ -1340,10 +1354,9 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
pshufd m3, m3, q0000
pshufd m4, m4, q0000
pshufd m5, m5, q0000
SCRATCH 2, 10, 0
SCRATCH 3, 11, 1
SCRATCH 4, 12, 2
SCRATCH 5, 13, 3
SCRATCH 3, 11, 0
SCRATCH 4, 12, 1
SCRATCH 5, 13, 2
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
@ -1356,9 +1369,9 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
test overlapd, overlapd
jz .no_vertical_overlap
mova m6, [base+pw_1024]
movd m7, [base+pb_27_17_17_27]
SCRATCH 6, 14, 4
SCRATCH 7, 15, 5
mova m7, [base+pb_27_17_17_27]
SCRATCH 6, 14, 3
SCRATCH 7, 15, 4
test sbyd, sbyd
jnz .vertical_overlap
; fall-through
@ -1445,16 +1458,13 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
; scaling[src]
%if ARCH_X86_32
vpgatherdw m4, m0, scalingq, r0, r5, m3
vpgatherdw m5, m1, scalingq, r0, r5, m3
vpgatherdw m4, m0, scalingq-1, r0, r5, m3
vpgatherdw m5, m1, scalingq-1, r0, r5, m3
%else
vpgatherdw m4, m0, scalingq, r12, r13, m3
vpgatherdw m5, m1, scalingq, r12, r13, m3
vpgatherdw m4, m0, scalingq-1, r12, r13, m3
vpgatherdw m5, m1, scalingq-1, r12, r13, m3
%endif
pcmpeqw m3, m3
psrlw m3, 8
pand m4, m3
pand m5, m3
REPX {psrlw x, 8}, m4, m5
; grain = grain_lut[offy+y][offx+x]
movu m3, [grain_lutq+offxyq]
@ -1504,7 +1514,7 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
jz .loop_x_odd
%if ARCH_X86_32
add dword [rsp+6*mmsize+1*gprsize], 16
add dword [rsp+5*mmsize+1*gprsize], 16
%else
add r11d, 16 ; top_offxyd
%endif
@ -1525,7 +1535,7 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3
add offxyd, 16 ; left_offxyd
mov [rsp+6*mmsize+0*gprsize], offxyd
mov [rsp+5*mmsize+0*gprsize], offxyd
DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3
@ -1578,21 +1588,18 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
; scaling[src]
%if ARCH_X86_32
vpgatherdw m4, m0, scalingq, r0, r5, m3
vpgatherdw m5, m1, scalingq, r0, r5, m3
vpgatherdw m4, m0, scalingq-1, r0, r5, m3
vpgatherdw m5, m1, scalingq-1, r0, r5, m3
%else
vpgatherdw m4, m0, scalingq, r12, r13, m3
vpgatherdw m5, m1, scalingq, r12, r13, m3
vpgatherdw m4, m0, scalingq-1, r12, r13, m3
vpgatherdw m5, m1, scalingq-1, r12, r13, m3
%endif
pcmpeqw m3, m3
psrlw m3, 8
pand m4, m3
pand m5, m3
REPX {psrlw x, 8}, m4, m5
; grain = grain_lut[offy+y][offx+x]
movu m3, [grain_lutq+offxyq]
%if ARCH_X86_32
mov r5, [rsp+6*mmsize+0*gprsize]
mov r5, [rsp+5*mmsize+0*gprsize]
movd m7, [grain_lutq+r5]
%else
movd m7, [grain_lutq+left_offxyq]
@ -1601,9 +1608,7 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
pmaddubsw m6, m15, m7
pmulhrsw m6, m14
packsswb m6, m6
pand m6, m10
pandn m7, m10, m3
por m6, m7
shufps m6, m3, q3210
pcmpgtb m2, m6
punpcklbw m7, m6, m2
punpckhbw m6, m2
@ -1649,7 +1654,7 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
test dword r8m, 2 ; have_top_overlap
jz .loop_x_odd
%if ARCH_X86_32
add dword [rsp+6*mmsize+1*gprsize], 16
add dword [rsp+5*mmsize+1*gprsize], 16
%else
add r11d, 16 ; top_offxyd
%endif
@ -1754,7 +1759,7 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
movzx top_offxyd, offxyw
%if ARCH_X86_32
mov [rsp+6*mmsize+1*gprsize], top_offxyd
mov [rsp+5*mmsize+1*gprsize], top_offxyd
DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif
@ -1764,7 +1769,7 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
%if ARCH_X86_32
mov r5, r5m
lea r5, [base+pb_27_17]
mov [rsp+5*mmsize+8], r5
mov [rsp+5*mmsize+12], r5
%else
mova m8, [pb_27_17]
%endif
@ -1779,21 +1784,18 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
; scaling[src]
%if ARCH_X86_32
vpgatherdw m4, m0, scalingq, r0, r5, m3
vpgatherdw m5, m1, scalingq, r0, r5, m3
vpgatherdw m4, m0, scalingq-1, r0, r5, m3
vpgatherdw m5, m1, scalingq-1, r0, r5, m3
%else
vpgatherdw m4, m0, scalingq, r12, r13, m3
vpgatherdw m5, m1, scalingq, r12, r13, m3
vpgatherdw m4, m0, scalingq-1, r12, r13, m3
vpgatherdw m5, m1, scalingq-1, r12, r13, m3
%endif
pcmpeqw m3, m3
psrlw m3, 8
pand m4, m3
pand m5, m3
REPX {psrlw x, 8}, m4, m5
; grain = grain_lut[offy+y][offx+x]
movu m3, [grain_lutq+offxyq]
%if ARCH_X86_32
mov r5, [rsp+6*mmsize+1*gprsize]
mov r5, [rsp+5*mmsize+1*gprsize]
movu m7, [grain_lutq+r5]
%else
movu m7, [grain_lutq+top_offxyq]
@ -1801,7 +1803,7 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
punpckhbw m6, m7, m3
punpcklbw m7, m3
%if ARCH_X86_32
mov r5, [rsp+5*mmsize+8]
mov r5, [rsp+5*mmsize+12]
pmaddubsw m3, [r5], m6
pmaddubsw m6, [r5], m7
%else
@ -1833,7 +1835,7 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
mova [dstq+srcq], m0
%if ARCH_X86_32
add dword [rsp+5*mmsize+8], mmsize
add dword [rsp+5*mmsize+12], mmsize
%else
mova m8, [pb_17_27]
%endif
@ -1864,7 +1866,7 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
jc .loop_x_hv_overlap
add offxyd, 16
%if ARCH_X86_32
add dword [rsp+6*mmsize+1*gprsize], 16
add dword [rsp+5*mmsize+1*gprsize], 16
%else
add top_offxyd, 16
%endif
@ -1874,16 +1876,16 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
%if ARCH_X86_32
mov r5, r5m
lea r5, [base+pb_27_17]
mov [rsp+5*mmsize+8], r5
mov [rsp+5*mmsize+12], r5
DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak
mov r5, [rsp+6*mmsize+1*gprsize]
mov r5, [rsp+5*mmsize+1*gprsize]
mov r4, offxyd
add r5, 16
add r4, 16
mov [rsp+6*mmsize+2*gprsize], r5 ; topleft_offxy
mov [rsp+6*mmsize+0*gprsize], r4 ; left_offxy
mov [rsp+5*mmsize+2*gprsize], r5 ; topleft_offxy
mov [rsp+5*mmsize+0*gprsize], r4 ; left_offxy
DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak
@ -1937,7 +1939,7 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
movzx r5, offxyw ; top_offxy
mov [rsp+6*mmsize+1*gprsize], r5
mov [rsp+5*mmsize+1*gprsize], r5
%else
DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
h, offxy, see, left_offxy, top_offxy, topleft_offxy
@ -1952,10 +1954,10 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
; grain = grain_lut[offy+y][offx+x]
movu m3, [grain_lutq+offxyq]
%if ARCH_X86_32
mov r5, [rsp+6*mmsize+1*gprsize] ; top_offxy
mov r0, [rsp+6*mmsize+0*gprsize] ; left_offxy
mov r5, [rsp+5*mmsize+1*gprsize] ; top_offxy
mov r0, [rsp+5*mmsize+0*gprsize] ; left_offxy
movu m6, [grain_lutq+r5]
mov r5, [rsp+6*mmsize+2*gprsize] ; topleft_offxy
mov r5, [rsp+5*mmsize+2*gprsize] ; topleft_offxy
movd m4, [grain_lutq+r0]
movd m7, [grain_lutq+r5]
%else
@ -1972,17 +1974,13 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
pmulhrsw m4, m14
packsswb m2, m2
packsswb m4, m4
pand m2, m10
pand m4, m10
pandn m7, m10, m3
pandn m3, m10, m6
por m7, m2
por m3, m4
shufps m2, m3, q3210
shufps m4, m6, q3210
; followed by v interpolation (top | cur -> cur)
punpckhbw m4, m3, m7
punpcklbw m3, m7
punpcklbw m3, m4, m2
punpckhbw m4, m2
%if ARCH_X86_32
mov r5, [rsp+5*mmsize+8]
mov r5, [rsp+5*mmsize+12]
pmaddubsw m7, [r5], m4
pmaddubsw m4, [r5], m3
%else
@ -2004,16 +2002,13 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
; scaling[src]
%if ARCH_X86_32
vpgatherdw m5, m0, scalingq, r0, r5, m7
vpgatherdw m6, m1, scalingq, r0, r5, m7
vpgatherdw m5, m0, scalingq-1, r0, r5, m7
vpgatherdw m6, m1, scalingq-1, r0, r5, m7
%else
vpgatherdw m5, m0, scalingq, r13, r14, m7
vpgatherdw m6, m1, scalingq, r13, r14, m7
vpgatherdw m5, m0, scalingq-1, r13, r14, m7
vpgatherdw m6, m1, scalingq-1, r13, r14, m7
%endif
pcmpeqw m7, m7
psrlw m7, 8
pand m5, m7
pand m6, m7
REPX {psrlw x, 8}, m5, m6
; noise = round2(scaling[src] * grain, scaling_shift)
pmullw m3, m5
@ -2033,7 +2028,7 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
mova [dstq+srcq], m0
%if ARCH_X86_32
add dword [rsp+5*mmsize+8], mmsize
add dword [rsp+5*mmsize+12], mmsize
%else
mova m8, [pb_17_27]
%endif
@ -2063,7 +2058,7 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
xor dword r8m, 4
add offxyd, 16
%if ARCH_X86_32
add dword [rsp+6*mmsize+1*gprsize], 16
add dword [rsp+5*mmsize+1*gprsize], 16
%else
add top_offxyd, 16
%endif
@ -2079,61 +2074,60 @@ INIT_XMM ssse3
; sby, luma, lstride, uv_pl, is_id)
%if STACK_ALIGNMENT < mmsize
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8
cglobal fguv_32x32xn_i%1, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \
cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 0 - (7 * mmsize + (13 + 3) * gprsize), \
tmp, src, scaling, h, fg_data, picptr, unused
mov r0, r0m
mov r1, r2m
mov r2, r4m
mov r3, r6m
mov r4, r7m
mov [rsp+8*mmsize+3*gprsize], r0
mov [rsp+8*mmsize+5*gprsize], r1
mov [rsp+8*mmsize+7*gprsize], r2
mov [rsp+8*mmsize+9*gprsize], r3
mov [rsp+8*mmsize+10*gprsize], r4
mov [rsp+7*mmsize+3*gprsize], r0
mov [rsp+7*mmsize+5*gprsize], r1
mov [rsp+7*mmsize+7*gprsize], r2
mov [rsp+7*mmsize+9*gprsize], r3
mov [rsp+7*mmsize+10*gprsize], r4
mov r0, r8m
mov r1, r9m
mov r2, r10m
mov r4, r11m
mov r3, r12m
mov [rsp+8*mmsize+11*gprsize], r0
mov [rsp+8*mmsize+12*gprsize], r1
mov [rsp+8*mmsize+13*gprsize], r2
mov [rsp+8*mmsize+14*gprsize], r4
mov [rsp+7*mmsize+11*gprsize], r0
mov [rsp+7*mmsize+12*gprsize], r1
mov [rsp+7*mmsize+13*gprsize], r2
mov [rsp+7*mmsize+14*gprsize], r4
%else
cglobal fguv_32x32xn_i%1, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 7 * mmsize + (4) * gprsize, \
tmp, src, scaling, h, fg_data, picptr, unused
%endif
mov srcq, srcm
mov fg_dataq, r3m
mov scalingq, r5m
%if STACK_ALIGNMENT < mmsize
%define r0m [rsp+8*mmsize+ 3*gprsize]
%define r1m [rsp+8*mmsize+ 4*gprsize]
%define r2m [rsp+8*mmsize+ 5*gprsize]
%define r3m [rsp+8*mmsize+ 6*gprsize]
%define r4m [rsp+8*mmsize+ 7*gprsize]
%define r5m [rsp+8*mmsize+ 8*gprsize]
%define r6m [rsp+8*mmsize+ 9*gprsize]
%define r7m [rsp+8*mmsize+10*gprsize]
%define r8m [rsp+8*mmsize+11*gprsize]
%define r9m [rsp+8*mmsize+12*gprsize]
%define r10m [rsp+8*mmsize+13*gprsize]
%define r11m [rsp+8*mmsize+14*gprsize]
%define r12m [rsp+8*mmsize+15*gprsize]
%define r0m [rsp+7*mmsize+ 3*gprsize]
%define r1m [rsp+7*mmsize+ 4*gprsize]
%define r2m [rsp+7*mmsize+ 5*gprsize]
%define r3m [rsp+7*mmsize+ 6*gprsize]
%define r4m [rsp+7*mmsize+ 7*gprsize]
%define r5m [rsp+7*mmsize+ 8*gprsize]
%define r6m [rsp+7*mmsize+ 9*gprsize]
%define r7m [rsp+7*mmsize+10*gprsize]
%define r8m [rsp+7*mmsize+11*gprsize]
%define r9m [rsp+7*mmsize+12*gprsize]
%define r10m [rsp+7*mmsize+13*gprsize]
%define r11m [rsp+7*mmsize+14*gprsize]
%define r12m [rsp+7*mmsize+15*gprsize]
%endif
LEA r5, pb_mask
%define base r5-pb_mask
mov r5m, r5
%else
cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
grain_lut, tmp, sby, luma, lstride, uv_pl, is_id
lea r8, [pb_mask]
%define base r8-pb_mask
%endif
mov r6d, [fg_dataq+FGData.scaling_shift]
pcmpeqw m2, m2
movd m3, [base+mul_bits+r6*2-14]
mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
lea tmpd, [r6d*2]
@ -2145,17 +2139,15 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
movd m5, [base+min+r6*2]
cmovne r6d, tmpd
movd m4, [base+max+r6*2]
psrldq m2, 14+%2
punpcklwd m3, m3
punpcklwd m5, m5
punpcklwd m4, m4
pshufd m3, m3, q0000
pshufd m5, m5, q0000
pshufd m4, m4, q0000
SCRATCH 2, 10, 0
SCRATCH 3, 11, 1
SCRATCH 4, 12, 2
SCRATCH 5, 13, 3
SCRATCH 3, 11, 0
SCRATCH 4, 12, 1
SCRATCH 5, 13, 2
cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
jne .csfl
@ -2177,8 +2169,8 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
punpcklwd m7, m7
pshufd m6, m6, q0000
pshufd m7, m7, q0000
SCRATCH 6, 14, 4
SCRATCH 7, 15, 5
SCRATCH 6, 14, 3
SCRATCH 7, 15, 4
%endif
mov sbyd, r8m
@ -2187,22 +2179,21 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
jz %%no_vertical_overlap
%if ARCH_X86_32
%if %2
movd m1, [base+pb_23_22]
mova m1, [base+pb_23_22_h]
%else
movd m1, [base+pb_27_17_17_27]
mova m1, [base+pb_27_17_17_27]
%endif
mova m0, [base+pw_1024]
%else
%if %2
movd m1, [pb_23_22]
mova m1, [pb_23_22_h]
%else
movd m1, [pb_27_17_17_27]
mova m1, [pb_27_17_17_27]
%endif
mova m0, [pw_1024]
%endif
pshufd m1, m1, q0000
SCRATCH 0, 8, 6
SCRATCH 1, 9, 7
SCRATCH 0, 8, 5
SCRATCH 1, 9, 6
test sbyd, sbyd
jnz %%vertical_overlap
; fall-through
@ -2347,16 +2338,13 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
; scaling[luma_src]
%if ARCH_X86_32
vpgatherdw m7, m4, scalingq, r0, r5
vpgatherdw m5, m6, scalingq, r0, r5
vpgatherdw m7, m4, scalingq-1, r0, r5
vpgatherdw m5, m6, scalingq-1, r0, r5
%else
vpgatherdw m7, m4, scalingq, r12, r2
vpgatherdw m5, m6, scalingq, r12, r2
vpgatherdw m7, m4, scalingq-1, r12, r2
vpgatherdw m5, m6, scalingq-1, r12, r2
%endif
pcmpeqw m1, m1
psrlw m1, 8
pand m7, m1
pand m5, m1
REPX {psrlw x, 8}, m7, m5
; unpack chroma_source
punpckhbw m1, m0, m2
@ -2426,7 +2414,7 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
%if %2 == 0
; adjust top_offxy
%if ARCH_X86_32
add dword [rsp+8*mmsize+1*gprsize], 16
add dword [rsp+7*mmsize+1*gprsize], 16
%else
add r11d, 16
%endif
@ -2450,9 +2438,9 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
%if ARCH_X86_32
%if %2
lea r6, [offxyd+16]
mov [rsp+8*mmsize+0*gprsize], r6
mov [rsp+7*mmsize+0*gprsize], r6
%else
mov [rsp+8*mmsize+0*gprsize], offxyd
mov [rsp+7*mmsize+0*gprsize], offxyd
%endif
DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut
@ -2558,36 +2546,31 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
; scaling[luma_src]
%if ARCH_X86_32
vpgatherdw m7, m4, scalingq, r0, r5
vpgatherdw m5, m6, scalingq, r0, r5
vpgatherdw m7, m4, scalingq-1, r0, r5
vpgatherdw m5, m6, scalingq-1, r0, r5
%else
vpgatherdw m7, m4, scalingq, r12, r2
vpgatherdw m5, m6, scalingq, r12, r2
vpgatherdw m7, m4, scalingq-1, r12, r2
vpgatherdw m5, m6, scalingq-1, r12, r2
%endif
pcmpeqw m1, m1
psrlw m1, 8
pand m7, m1
pand m5, m1
REPX {psrlw x, 8}, m7, m5
; unpack chroma_source
punpckhbw m1, m0, m2
punpcklbw m0, m2 ; m0-1: src as word
; grain = grain_lut[offy+y][offx+x]
movu m3, [grain_lutq+offxyq+ 0]
movu m4, [grain_lutq+offxyq+ 0]
%if ARCH_X86_32
mov r0, [rsp+8*mmsize+0*gprsize]
movd m4, [grain_lutq+r0+ 0]
mov r0, [rsp+7*mmsize+0*gprsize]
movd m2, [grain_lutq+r0+ 0]
%else
movd m4, [grain_lutq+left_offxyq+ 0]
movd m2, [grain_lutq+left_offxyq+ 0]
%endif
punpcklbw m2, m4, m3
pmaddubsw m4, m9, m2
pmulhrsw m4, m8
packsswb m4, m4
pand m4, m10
pandn m2, m10, m3
por m3, m4, m2
punpcklbw m2, m4
pmaddubsw m3, m9, m2
pmulhrsw m3, m8
packsswb m3, m3
shufps m3, m4, q3210
pxor m4, m4
pcmpgtb m4, m3
punpcklbw m2, m3, m4
@ -2652,7 +2635,7 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
xor dword r8m, 4
; adjust top_offxyd
%if ARCH_X86_32
add dword [rsp+8*mmsize+1*gprsize], 16
add dword [rsp+7*mmsize+1*gprsize], 16
%else
add r11d, 16
%endif
@ -2780,7 +2763,7 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
movzx top_offxyd, offxyw
shr offxyd, 16
%if ARCH_X86_32
mov [rsp+8*mmsize+1*gprsize], top_offxyd
mov [rsp+7*mmsize+1*gprsize], top_offxyd
DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
%endif
@ -2790,9 +2773,11 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
mov grain_lutq, grain_lutmp
%if ARCH_X86_32
mov r5, r5m
mova m1, [base+pb_27_17]
%endif
%if %3
mova m1, [PIC_ptr(pb_23_22)]
%else
mova m1, [pb_27_17]
mova m1, [PIC_ptr(pb_27_17)]
%endif
%%loop_y_v_overlap:
%if ARCH_X86_32
@ -2848,34 +2833,26 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
; scaling[luma_src]
%if ARCH_X86_32
vpgatherdw m7, m4, scalingq, r0, r5
vpgatherdw m5, m6, scalingq, r0, r5
vpgatherdw m7, m4, scalingq-1, r0, r5
vpgatherdw m5, m6, scalingq-1, r0, r5
%else
vpgatherdw m7, m4, scalingq, r12, r2
vpgatherdw m5, m6, scalingq, r12, r2
vpgatherdw m7, m4, scalingq-1, r12, r2
vpgatherdw m5, m6, scalingq-1, r12, r2
%endif
pcmpeqw m4, m4
psrlw m4, 8
pand m7, m4
pand m5, m4
REPX {psrlw x, 8}, m7, m5
; grain = grain_lut[offy+y][offx+x]
movu m3, [grain_lutq+offxyq]
%if ARCH_X86_32
mov r0, [rsp+8*mmsize+1*gprsize]
mov r0, [rsp+7*mmsize+1*gprsize]
movu m4, [grain_lutq+r0]
%else
movu m4, [grain_lutq+top_offxyq]
%endif
punpckhbw m6, m4, m3
punpcklbw m4, m3
%if %3
pmaddubsw m2, m9, m6
pmaddubsw m3, m9, m4
%else
pmaddubsw m2, m1, m6
pmaddubsw m3, m1, m4
%endif
pmulhrsw m2, m8
pmulhrsw m3, m8
packsswb m3, m2
@ -2928,10 +2905,8 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
btc hd, 16
%if ARCH_X86_32
mov r5, r5m
mova m1, [base+pb_17_27]
%else
mova m1, [pb_17_27]
%endif
mova m1, [PIC_ptr(pb_17_27)]
jnc %%loop_y_v_overlap
%endif
jmp %%loop_y
@ -2963,7 +2938,7 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
; h+v overlap
%else
%if ARCH_X86_32
add dword [rsp+8*mmsize+1*gprsize], 16
add dword [rsp+7*mmsize+1*gprsize], 16
%else
add top_offxyd, 16
%endif
@ -2976,15 +2951,15 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
%if ARCH_X86_32
DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused
mov r6, [rsp+8*mmsize+1*gprsize]
mov r6, [rsp+7*mmsize+1*gprsize]
%if %2
lea r0, [r3d+16]
add r6, 16
mov [rsp+8*mmsize+0*gprsize], r0 ; left_offxy
mov [rsp+7*mmsize+0*gprsize], r0 ; left_offxy
%else
mov [rsp+8*mmsize+0*gprsize], r3 ; left_offxy
mov [rsp+7*mmsize+0*gprsize], r3 ; left_offxy
%endif
mov [rsp+8*mmsize+2*gprsize], r6 ; topleft_offxy
mov [rsp+7*mmsize+2*gprsize], r6 ; topleft_offxy
DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused
@ -3048,18 +3023,55 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
movzx top_offxyd, offxyw
shr offxyd, 16
%if ARCH_X86_32
mov [rsp+8*mmsize+1*gprsize], top_offxyd
mov [rsp+7*mmsize+1*gprsize], top_offxyd
%endif
mov hd, r7m
mov grain_lutq, grain_lutmp
%if ARCH_X86_32
mov r5, r5m
mova m3, [base+pb_27_17]
%endif
%if %3
mova m3, [PIC_ptr(pb_23_22)]
%else
mova m3, [pb_27_17]
mova m3, [PIC_ptr(pb_27_17)]
%endif
%%loop_y_hv_overlap:
; grain = grain_lut[offy+y][offx+x]
%if ARCH_X86_32
mov r0, [rsp+7*mmsize+2*gprsize] ; topleft_offxy
mov r5, [rsp+7*mmsize+1*gprsize] ; top_offxy
movd m1, [grain_lutq+r0]
mov r0, [rsp+7*mmsize+0*gprsize] ; left_offxy
%else
movd m1, [grain_lutq+topleft_offxyq]
%endif
movu m2, [grain_lutq+offxyq]
%if ARCH_X86_32
movu m6, [grain_lutq+r5]
movd m4, [grain_lutq+r0]
%else
movu m6, [grain_lutq+top_offxyq]
movd m4, [grain_lutq+left_offxyq]
%endif
; do h interpolation first (so top | top/left -> top, left | cur -> cur)
punpcklbw m1, m6
punpcklbw m4, m2
pmaddubsw m0, m9, m1
pmaddubsw m1, m9, m4
REPX {pmulhrsw x, m8}, m0, m1
packsswb m0, m1
shufps m4, m0, m2, q3232
shufps m0, m6, q3210
; followed by v interpolation (top | cur -> cur)
punpcklbw m2, m0, m4
punpckhbw m0, m4
pmaddubsw m4, m3, m0
pmaddubsw m1, m3, m2
pmulhrsw m4, m8
pmulhrsw m1, m8
packsswb m1, m4
; src
%if ARCH_X86_32
DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
@ -3116,69 +3128,20 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
; scaling[src]
%if ARCH_X86_32
vpgatherdw m7, m4, scalingq, r0, r5
vpgatherdw m5, m6, scalingq, r0, r5
vpgatherdw m7, m4, scalingq-1, r0, r5
vpgatherdw m5, m6, scalingq-1, r0, r5
%else
movd m1, [grain_lutq+topleft_offxyq]
%if %3
vpgatherdw m7, m4, scalingq, r2, r12
vpgatherdw m5, m6, scalingq, r2, r12
vpgatherdw m7, m4, scalingq-1, r2, r12
vpgatherdw m5, m6, scalingq-1, r2, r12
%else
vpgatherdw m7, m4, scalingq, r2, r13
vpgatherdw m5, m6, scalingq, r2, r13
vpgatherdw m7, m4, scalingq-1, r2, r13
vpgatherdw m5, m6, scalingq-1, r2, r13
%endif
%endif
pcmpeqw m2, m2
psrlw m2, 8
pand m7, m2
pand m5, m2
REPX {psrlw x, 8}, m7, m5
; grain = grain_lut[offy+y][offx+x]
%if ARCH_X86_32
mov r0, [rsp+8*mmsize+2*gprsize] ; topleft_offxy
mov r5, [rsp+8*mmsize+1*gprsize] ; top_offxy
movd m1, [grain_lutq+r0]
mov r0, [rsp+8*mmsize+0*gprsize] ; left_offxy
%endif
movu m2, [grain_lutq+offxyq]
%if ARCH_X86_32
movu m6, [grain_lutq+r5]
movd m4, [grain_lutq+r0]
%else
movu m6, [grain_lutq+top_offxyq]
movd m4, [grain_lutq+left_offxyq]
%endif
; do h interpolation first (so top | top/left -> top, left | cur -> cur)
punpcklbw m1, m6
punpcklbw m4, m2
%if %2
punpcklwd m4, m1
%else
punpckldq m4, m1
%endif
pmaddubsw m1, m9, m4
pmulhrsw m1, m8
packsswb m1, m1
pandn m4, m10, m2
pandn m2, m10, m6
psrldq m6, m1, 2-%2
pand m1, m10
pand m6, m10
por m4, m1
por m2, m6
; followed by v interpolation (top | cur -> cur)
punpckhbw m1, m2, m4
punpcklbw m2, m4
%if %3
pmaddubsw m4, m9, m1
pmaddubsw m1, m9, m2
%else
pmaddubsw m4, m3, m1
pmaddubsw m1, m3, m2
%endif
pmulhrsw m4, m8
pmulhrsw m1, m8
packsswb m1, m4
; unpack grain
pxor m4, m4
pcmpgtb m4, m1
punpcklbw m2, m1, m4
@ -3229,10 +3192,8 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
jle %%end_y_hv_overlap
%if ARCH_X86_32
mov r5, r5m
mova m3, [base+pb_17_27]
%else
mova m3, [pb_17_27]
%endif
mova m3, [PIC_ptr(pb_17_27)]
btc hd, 16
jnc %%loop_y_hv_overlap
%if ARCH_X86_64
@ -3268,7 +3229,7 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
jmp %%loop_x_hv_overlap
%else
%if ARCH_X86_32
add dword [rsp+8*mmsize+1*gprsize], 16
add dword [rsp+7*mmsize+1*gprsize], 16
%else
add top_offxyd, 16
%endif

1206 third_party/dav1d/src/x86/ipred16_avx2.asm (vendored)

The diff for this file is not shown because of its size.

1931 third_party/dav1d/src/x86/ipred16_sse.asm (vendored, new file)

The diff for this file is not shown because of its size.

48 third_party/dav1d/src/x86/ipred_avx2.asm (vendored)

@ -1,4 +1,4 @@
; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018-2021, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
@ -141,7 +141,7 @@ pw_512: times 2 dw 512
%macro JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - 2*4)
%xdefine %%base mangle(private_prefix %+ _%1_%2)
%xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
%%table:
%rep %0 - 2
dd %%base %+ .%3 - (%%table - 2*4)
@ -178,7 +178,7 @@ cextern filter_intra_taps
SECTION .text
INIT_YMM avx2
cglobal ipred_dc_top, 3, 7, 6, dst, stride, tl, w, h
cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h
lea r5, [ipred_dc_left_avx2_table]
tzcnt wd, wm
inc tlq
@ -196,7 +196,7 @@ cglobal ipred_dc_top, 3, 7, 6, dst, stride, tl, w, h
add wq, r5
jmp r6
cglobal ipred_dc_left, 3, 7, 6, dst, stride, tl, w, h, stride3
cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
mov hd, hm ; zero upper half
tzcnt r6d, hd
sub tlq, hq
@ -235,7 +235,7 @@ cglobal ipred_dc_left, 3, 7, 6, dst, stride, tl, w, h, stride3
mova m1, m0
jmp wq
cglobal ipred_dc, 3, 7, 6, dst, stride, tl, w, h, stride3
cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
movifnidn hd, hm
movifnidn wd, wm
tzcnt r6d, hd
@ -446,7 +446,7 @@ ALIGN function_align
jg .s64
RET
cglobal ipred_dc_128, 2, 7, 6, dst, stride, tl, w, h, stride3
cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
lea r5, [ipred_dc_splat_avx2_table]
tzcnt wd, wm
movifnidn hd, hm
@ -457,7 +457,7 @@ cglobal ipred_dc_128, 2, 7, 6, dst, stride, tl, w, h, stride3
lea stride3q, [strideq*3]
jmp wq
cglobal ipred_v, 3, 7, 6, dst, stride, tl, w, h, stride3
cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
lea r5, [ipred_dc_splat_avx2_table]
tzcnt wd, wm
movu m0, [tlq+ 1]
@ -486,7 +486,7 @@ ALIGN function_align
%endmacro
INIT_XMM avx2
cglobal ipred_h, 3, 6, 4, dst, stride, tl, w, h, stride3
cglobal ipred_h_8bpc, 3, 6, 4, dst, stride, tl, w, h, stride3
lea r5, [ipred_h_avx2_table]
tzcnt wd, wm
movifnidn hd, hm
@ -543,7 +543,7 @@ INIT_YMM avx2
vpblendvb m0, m5, m0, m1
%endmacro
cglobal ipred_paeth, 3, 6, 9, dst, stride, tl, w, h
cglobal ipred_paeth_8bpc, 3, 6, 9, dst, stride, tl, w, h
%define base r5-ipred_paeth_avx2_table
lea r5, [ipred_paeth_avx2_table]
tzcnt wd, wm
@ -677,7 +677,7 @@ ALIGN function_align
packuswb m0, m1
%endmacro
cglobal ipred_smooth_v, 3, 7, 0, dst, stride, tl, w, h, weights
cglobal ipred_smooth_v_8bpc, 3, 7, 0, dst, stride, tl, w, h, weights
%define base r6-ipred_smooth_v_avx2_table
lea r6, [ipred_smooth_v_avx2_table]
tzcnt wd, wm
@ -835,7 +835,7 @@ ALIGN function_align
ALLOC_STACK %1, %3
%endmacro
cglobal ipred_smooth_h, 3, 7, 0, dst, stride, tl, w, h
cglobal ipred_smooth_h_8bpc, 3, 7, 0, dst, stride, tl, w, h
%define base r6-ipred_smooth_h_avx2_table
lea r6, [ipred_smooth_h_avx2_table]
mov wd, wm
@ -1045,7 +1045,7 @@ ALIGN function_align
packuswb m0, m1
%endmacro
cglobal ipred_smooth, 3, 7, 0, dst, stride, tl, w, h, v_weights
cglobal ipred_smooth_8bpc, 3, 7, 0, dst, stride, tl, w, h, v_weights
%define base r6-ipred_smooth_avx2_table
lea r6, [ipred_smooth_avx2_table]
mov wd, wm
@ -1315,7 +1315,7 @@ ALIGN function_align
sub r3, hq
ret
cglobal ipred_z1, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
cglobal ipred_z1_8bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
%assign org_stack_offset stack_offset
lea r6, [ipred_z1_avx2_table]
tzcnt wd, wm
@ -2144,7 +2144,7 @@ ALIGN function_align
.w64_end:
RET
cglobal ipred_z2, 3, 10, 16, 224, dst, stride, tl, w, h, angle, dx, dy
cglobal ipred_z2_8bpc, 3, 10, 16, 224, dst, stride, tl, w, h, angle, dx, dy
%define base r9-z_filter_t0
lea r9, [ipred_z2_avx2_table]
tzcnt wd, wm
@ -3000,7 +3000,7 @@ ALIGN function_align
movu [rsp+97], m0
jmp .w32_filter_above
cglobal ipred_z3, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase
cglobal ipred_z3_8bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase
%assign org_stack_offset stack_offset
lea r6, [ipred_z3_avx2_table]
tzcnt hd, hm
@ -4211,7 +4211,7 @@ ALIGN function_align
; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___
; 5 8 8 i
cglobal ipred_filter, 3, 7, 0, dst, stride, tl, w, h, filter
cglobal ipred_filter_8bpc, 3, 7, 0, dst, stride, tl, w, h, filter
%define base r6-ipred_filter_avx2_table
lea r6, [filter_intra_taps]
tzcnt wd, wm
@ -4435,7 +4435,7 @@ DECLARE_REG_TMP 7
paddw m%1, m0
%endmacro
cglobal ipred_cfl_top, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
lea t0, [ipred_cfl_left_avx2_table]
tzcnt wd, wm
inc tlq
@ -4454,7 +4454,7 @@ cglobal ipred_cfl_top, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
movifnidn acq, acmp
jmp r6
cglobal ipred_cfl_left, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
mov hd, hm ; zero upper half
tzcnt r6d, hd
sub tlq, hq
@ -4488,7 +4488,7 @@ cglobal ipred_cfl_left, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
vpbroadcastw m0, xm0
jmp wq
cglobal ipred_cfl, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
movifnidn hd, hm
movifnidn wd, wm
tzcnt r6d, hd
@ -4692,7 +4692,7 @@ ALIGN function_align
jg .s32_loop
RET
cglobal ipred_cfl_128, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
lea t0, [ipred_cfl_splat_avx2_table]
tzcnt wd, wm
movifnidn hd, hm
@ -4702,7 +4702,7 @@ cglobal ipred_cfl_128, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
movifnidn acq, acmp
jmp wq
cglobal ipred_cfl_ac_420, 4, 9, 5, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
cglobal ipred_cfl_ac_420_8bpc, 4, 9, 5, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
movifnidn hpadd, hpadm
movifnidn wd, wm
mov hd, hm
@ -4883,7 +4883,7 @@ cglobal ipred_cfl_ac_420, 4, 9, 5, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
jg .sub_loop
RET
cglobal ipred_cfl_ac_422, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
cglobal ipred_cfl_ac_422_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
movifnidn hpadd, hpadm
movifnidn wd, wm
mov hd, hm
@ -5076,7 +5076,7 @@ cglobal ipred_cfl_ac_422, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
jg .sub_loop
RET
cglobal ipred_cfl_ac_444, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
cglobal ipred_cfl_ac_444_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
movifnidn hpadd, hpadm
movifnidn wd, wm
mov hd, hm
@ -5306,7 +5306,7 @@ cglobal ipred_cfl_ac_444, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
jg .sub_loop
RET
cglobal pal_pred, 4, 6, 5, dst, stride, pal, idx, w, h
cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
vbroadcasti128 m4, [palq]
lea r2, [pal_pred_avx2_table]
tzcnt wd, wm

26 third_party/dav1d/src/x86/ipred_init_tmpl.c (vendored)

@ -1,5 +1,5 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018-2021, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
@ -28,19 +28,11 @@
#include "src/cpu.h"
#include "src/ipred.h"
#if BITDEPTH == 8
#define decl_fn(type, name) \
decl_##type##_fn(dav1d_##name##_ssse3); \
decl_##type##_fn(dav1d_##name##_avx2)
decl_##type##_fn(BF(dav1d_##name, ssse3)); \
decl_##type##_fn(BF(dav1d_##name, avx2))
#define init_fn(type0, type1, name, suffix) \
c->type0[type1] = dav1d_##name##_##suffix
#else
#define decl_fn(type, name) \
decl_##type##_fn(dav1d_##name##_16bpc_ssse3); \
decl_##type##_fn(dav1d_##name##_16bpc_avx2)
#define init_fn(type0, type1, name, suffix) \
c->type0[type1] = dav1d_##name##_16bpc_##suffix
#endif
c->type0[type1] = BF(dav1d_##name, suffix)
#define init_angular_ipred_fn(type, name, suffix) \
init_fn(intra_pred, type, name, suffix)
@ -80,7 +72,6 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
#if BITDEPTH == 8
init_angular_ipred_fn(DC_PRED, ipred_dc, ssse3);
init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, ssse3);
init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, ssse3);
@ -102,8 +93,7 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c
init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I422 - 1, ipred_cfl_ac_422, ssse3);
init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, ssse3);
c->pal_pred = dav1d_pal_pred_ssse3;
#endif
c->pal_pred = BF(dav1d_pal_pred, ssse3);
#if ARCH_X86_64
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
@ -130,12 +120,8 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c
init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I420 - 1, ipred_cfl_ac_420, avx2);
init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I422 - 1, ipred_cfl_ac_422, avx2);
#if BITDEPTH == 8
init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, avx2);
c->pal_pred = dav1d_pal_pred_avx2;
#else
c->pal_pred = dav1d_pal_pred_16bpc_avx2;
#endif
c->pal_pred = BF(dav1d_pal_pred, avx2);
#endif
}

48 third_party/dav1d/src/x86/ipred_sse.asm (vendored)

@ -1,4 +1,4 @@
; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018-2021, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
@ -74,7 +74,7 @@ pd_32768 : times 1 dd 32768
%macro JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - 2*4)
%xdefine %%base mangle(private_prefix %+ _%1_%2)
%xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
%%table:
%rep %0 - 2
dd %%base %+ .%3 - (%%table - 2*4)
@ -156,7 +156,7 @@ SECTION .text
%endmacro
INIT_XMM ssse3
cglobal ipred_h, 3, 6, 2, dst, stride, tl, w, h, stride3
cglobal ipred_h_8bpc, 3, 6, 2, dst, stride, tl, w, h, stride3
LEA r5, ipred_h_ssse3_table
tzcnt wd, wm
movifnidn hd, hm
@ -179,7 +179,7 @@ cglobal ipred_h, 3, 6, 2, dst, stride, tl, w, h, stride3
;int dav1d_ipred_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
cglobal ipred_v, 3, 7, 6, dst, stride, tl, w, h, stride3
cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
LEA r5, ipred_dc_splat_ssse3_table
tzcnt wd, wm
movu m0, [tlq+ 1]
@ -196,7 +196,7 @@ cglobal ipred_v, 3, 7, 6, dst, stride, tl, w, h, stride3
;int dav1d_ipred_dc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
cglobal ipred_dc, 3, 7, 6, dst, stride, tl, w, h, stride3
cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
movifnidn hd, hm
movifnidn wd, wm
tzcnt r6d, hd
@ -438,7 +438,7 @@ ALIGN function_align
;int dav1d_ipred_dc_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
cglobal ipred_dc_left, 3, 7, 6, dst, stride, tl, w, h, stride3
cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
LEA r5, ipred_dc_left_ssse3_table
mov hd, hm ; zero upper half
tzcnt r6d, hd
@ -488,7 +488,7 @@ cglobal ipred_dc_left, 3, 7, 6, dst, stride, tl, w, h, stride3
;int dav1d_ipred_dc_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
cglobal ipred_dc_128, 2, 7, 6, dst, stride, tl, w, h, stride3
cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
LEA r5, ipred_dc_splat_ssse3_table
tzcnt wd, wm
movifnidn hd, hm
@ -505,7 +505,7 @@ cglobal ipred_dc_128, 2, 7, 6, dst, stride, tl, w, h, stride3
;int dav1d_ipred_dc_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
cglobal ipred_dc_top, 3, 7, 6, dst, stride, tl, w, h
cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h
LEA r5, ipred_dc_left_ssse3_table
tzcnt wd, wm
inc tlq
@ -540,7 +540,7 @@ cglobal ipred_dc_top, 3, 7, 6, dst, stride, tl, w, h
packuswb m6, m0
%endmacro
cglobal ipred_smooth_v, 3, 7, 7, dst, stride, tl, w, h, weights
cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights
%define base r6-ipred_smooth_v_ssse3_table
LEA r6, ipred_smooth_v_ssse3_table
tzcnt wd, wm
@ -701,7 +701,7 @@ ALIGN function_align
;int dav1d_ipred_smooth_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
cglobal ipred_smooth_h, 3, 7, 8, dst, stride, tl, w, h
cglobal ipred_smooth_h_8bpc, 3, 7, 8, dst, stride, tl, w, h
%define base r6-ipred_smooth_h_ssse3_table
LEA r6, ipred_smooth_h_ssse3_table
mov wd, wm
@ -958,7 +958,7 @@ ALIGN function_align
mova m5, [rsp+16*%12] ; recovery
%endmacro
cglobal ipred_smooth, 3, 7, 8, -13*16, dst, stride, tl, w, h, v_weights
cglobal ipred_smooth_8bpc, 3, 7, 8, -13*16, dst, stride, tl, w, h, v_weights
%define base r6-ipred_smooth_ssse3_table
mov wd, wm
mov hd, hm
@ -1194,7 +1194,7 @@ ALIGN function_align
;int dav1d_pal_pred_ssse3(pixel *dst, const ptrdiff_t stride, const uint16_t *const pal,
; const uint8_t *idx, const int w, const int h);
;---------------------------------------------------------------------------------------
cglobal pal_pred, 4, 6, 5, dst, stride, pal, idx, w, h
cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
mova m4, [palq]
LEA r2, pal_pred_ssse3_table
tzcnt wd, wm
@ -1295,7 +1295,7 @@ DECLARE_REG_TMP 7
DECLARE_REG_TMP 5
%endif
cglobal ipred_cfl, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
movifnidn wd, wm
movifnidn hd, hm
tzcnt r6d, hd
@ -1535,7 +1535,7 @@ ALIGN function_align
;void dav1d_ipred_cfl_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int16_t *ac, const int alpha);
;---------------------------------------------------------------------------------------
cglobal ipred_cfl_left, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
mov hd, hm ; zero upper half
tzcnt r6d, hd
sub tlq, hq
@ -1576,7 +1576,7 @@ cglobal ipred_cfl_left, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
;void dav1d_ipred_cfl_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int16_t *ac, const int alpha);
;---------------------------------------------------------------------------------------
cglobal ipred_cfl_top, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
LEA t0, ipred_cfl_left_ssse3_table
tzcnt wd, wm
inc tlq
@ -1600,7 +1600,7 @@ cglobal ipred_cfl_top, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
;void dav1d_ipred_cfl_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int16_t *ac, const int alpha);
;---------------------------------------------------------------------------------------
cglobal ipred_cfl_128, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
tzcnt wd, wm
movifnidn hd, hm
LEA r6, ipred_cfl_splat_ssse3_table
@ -1615,11 +1615,11 @@ cglobal ipred_cfl_128, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
%endmacro
%if ARCH_X86_64
cglobal ipred_cfl_ac_420, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak
cglobal ipred_cfl_ac_420_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak
DECLARE_REG_TMP 7
movddup m2, [pb_2]
%else
cglobal ipred_cfl_ac_420, 4, 7, 7, ac, y, stride, wpad, hpad, w, h
cglobal ipred_cfl_ac_420_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h
DECLARE_REG_TMP 4
%define ac_bakq acmp
mov t0d, 0x02020202
@ -1855,10 +1855,10 @@ DECLARE_REG_TMP 4
RET
%if ARCH_X86_64
cglobal ipred_cfl_ac_422, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak
cglobal ipred_cfl_ac_422_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak
movddup m2, [pb_4]
%else
cglobal ipred_cfl_ac_422, 4, 7, 7, ac, y, stride, wpad, hpad, w, h
cglobal ipred_cfl_ac_422_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h
mov t0d, 0x04040404
movd m2, t0d
pshufd m2, m2, q0000
@ -2128,10 +2128,10 @@ cglobal ipred_cfl_ac_422, 4, 7, 7, ac, y, stride, wpad, hpad, w, h
RET
%if ARCH_X86_64
cglobal ipred_cfl_ac_444, 4, 8, 7, -4*16, ac, y, stride, wpad, hpad, w, h, ac_bak
cglobal ipred_cfl_ac_444_8bpc, 4, 8, 7, -4*16, ac, y, stride, wpad, hpad, w, h, ac_bak
movddup m2, [pb_4]
%else
cglobal ipred_cfl_ac_444, 4, 7, 7, -5*16, ac, y, stride, wpad, hpad, w, h
cglobal ipred_cfl_ac_444_8bpc, 4, 7, 7, -5*16, ac, y, stride, wpad, hpad, w, h
%define ac_bakq [rsp+16*4]
mov t0d, 0x04040404
movd m2, t0d
@ -2769,7 +2769,7 @@ cglobal ipred_cfl_ac_444, 4, 7, 7, -5*16, ac, y, stride, wpad, hpad, w, h
BLEND m1, m0, m5
%endmacro
cglobal ipred_paeth, 3, 6, 8, -7*16, dst, stride, tl, w, h
cglobal ipred_paeth_8bpc, 3, 6, 8, -7*16, dst, stride, tl, w, h
%define base r5-ipred_paeth_ssse3_table
tzcnt wd, wm
movifnidn hd, hm
@ -2937,7 +2937,7 @@ ALIGN function_align
packuswb m%1, m%1
%endmacro
cglobal ipred_filter, 3, 7, 8, dst, stride, tl, w, h, filter
cglobal ipred_filter_8bpc, 3, 7, 8, dst, stride, tl, w, h, filter
%define base r6-$$
LEA r6, $$
tzcnt wd, wm

160
third_party/dav1d/src/x86/itx16_avx2.asm (vendored)

@ -105,32 +105,32 @@ cextern pw_16384
cextern pw_2896x8
cextern pd_2048
cextern idct_4x8_internal_avx2.main
cextern idct_4x16_internal_avx2.main
cextern idct_8x8_internal_avx2.main
cextern idct_8x16_internal_avx2.main
cextern idct_16x4_internal_avx2.main
cextern idct_16x8_internal_avx2.main
cextern idct_16x16_internal_avx2.main
cextern inv_txfm_add_dct_dct_8x32_avx2.main
cextern inv_txfm_add_dct_dct_8x32_avx2.main_fast
cextern inv_txfm_add_dct_dct_16x32_avx2.main_oddhalf
cextern inv_txfm_add_dct_dct_16x32_avx2.main_oddhalf_fast
cextern inv_txfm_add_dct_dct_16x64_avx2.main_part1
cextern inv_txfm_add_dct_dct_16x64_avx2.main_part2_internal
cextern idct_4x8_internal_8bpc_avx2.main
cextern idct_4x16_internal_8bpc_avx2.main
cextern idct_8x8_internal_8bpc_avx2.main
cextern idct_8x16_internal_8bpc_avx2.main
cextern idct_16x4_internal_8bpc_avx2.main
cextern idct_16x8_internal_8bpc_avx2.main
cextern idct_16x16_internal_8bpc_avx2.main
cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main
cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main_fast
cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf
cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf_fast
cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part1
cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part2_internal
cextern iadst_4x4_internal_avx2.main
cextern iadst_4x8_internal_avx2.main_pass2
cextern iadst_4x16_internal_avx2.main2
cextern iadst_8x4_internal_avx2.main
cextern iadst_8x8_internal_avx2.main_pass2
cextern iadst_8x16_internal_avx2.main
cextern iadst_8x16_internal_avx2.main_pass2_end
cextern iadst_16x4_internal_avx2.main
cextern iadst_16x8_internal_avx2.main
cextern iadst_16x8_internal_avx2.main_pass2_end
cextern iadst_16x16_internal_avx2.main
cextern iadst_16x16_internal_avx2.main_pass2_end
cextern iadst_4x4_internal_8bpc_avx2.main
cextern iadst_4x8_internal_8bpc_avx2.main_pass2
cextern iadst_4x16_internal_8bpc_avx2.main2
cextern iadst_8x4_internal_8bpc_avx2.main
cextern iadst_8x8_internal_8bpc_avx2.main_pass2
cextern iadst_8x16_internal_8bpc_avx2.main
cextern iadst_8x16_internal_8bpc_avx2.main_pass2_end
cextern iadst_16x4_internal_8bpc_avx2.main
cextern iadst_16x8_internal_8bpc_avx2.main
cextern iadst_16x8_internal_8bpc_avx2.main_pass2_end
cextern iadst_16x16_internal_8bpc_avx2.main
cextern iadst_16x16_internal_8bpc_avx2.main_pass2_end
SECTION .text
@ -384,7 +384,7 @@ cglobal iadst_4x4_internal_16bpc, 0, 7, 6, dst, stride, c, eob, tx2
.pass2:
lea rax, [deint_shuf+128]
vextracti128 xm1, m0, 1
call m(iadst_4x4_internal).main
call m(iadst_4x4_internal_8bpc).main
.end:
vpbroadcastd xm4, [pw_2048]
movq xm2, [dstq+strideq*0]
@ -457,7 +457,7 @@ cglobal iflipadst_4x4_internal_16bpc, 0, 7, 6, dst, stride, c, eob, tx2
.pass2:
lea rax, [deint_shuf+128]
vextracti128 xm1, m0, 1
call m(iadst_4x4_internal).main
call m(iadst_4x4_internal_8bpc).main
vpbroadcastd xm4, [pw_2048]
movq xm3, [dstq+strideq*1]
movhps xm3, [dstq+strideq*0]
@ -607,7 +607,7 @@ cglobal idct_4x8_internal_16bpc, 0, 7, 8, dst, stride, c, eob, tx2
punpckldq m0, m2 ; 0 1
vextracti128 xm2, m0, 1 ; 4 5
vextracti128 xm3, m1, 1 ; 6 7
call m(idct_4x8_internal).main
call m(idct_4x8_internal_8bpc).main
vpbroadcastd xm4, [pw_2048]
REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3
lea r3, [strideq*3]
@ -697,7 +697,7 @@ ALIGN function_align
vextracti128 xm3, m5, 1 ; 6 7
pshufd xm4, xm4, q1032 ; 1 0
pshufd xm5, xm5, q1032 ; 3 2
jmp m(iadst_4x8_internal).main_pass2
jmp m(iadst_4x8_internal_8bpc).main_pass2
ALIGN function_align
.main:
vbroadcasti128 m0, [cq+16*0]
@ -934,7 +934,7 @@ cglobal idct_4x16_internal_16bpc, 0, 7, 11, dst, stride, c, eob, tx2
vextracti128 xm3, m1, 1 ; 6 7
vextracti128 xm6, m4, 1 ; c d
vextracti128 xm7, m5, 1 ; e f
call m(idct_4x16_internal).main
call m(idct_4x16_internal_8bpc).main
vpbroadcastd m9, [pw_2048]
vinserti128 m0, m0, xm1, 1 ; 0 1 3 2
vinserti128 m1, m2, xm3, 1 ; 4 5 7 6
@ -1054,7 +1054,7 @@ ALIGN function_align
vinserti128 m0, xm3, 1 ; 0 3 2 1
vperm2i128 m3, m2, m4, 0x31 ; c f e d ; ????
vinserti128 m2, xm4, 1 ; b 8 9 a
call m(iadst_4x16_internal).main2
call m(iadst_4x16_internal_8bpc).main2
vpbroadcastd m5, [pw_2896x8]
paddsw m1, m2, m4
psubsw m2, m4
@ -1434,7 +1434,7 @@ ALIGN function_align
vinserti128 m0, xm2, 1
pshufb m0, m4
pshufb m1, m4
jmp m(iadst_8x4_internal).main
jmp m(iadst_8x4_internal_8bpc).main
ALIGN function_align
.main:
vpbroadcastd m1, [pd_2896]
@ -1636,7 +1636,7 @@ cglobal idct_8x8_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2
jmp tx2q
.pass2:
call .transpose_8x8_packed
call m(idct_8x8_internal).main
call m(idct_8x8_internal_8bpc).main
vpbroadcastd m12, [pw_2048]
vpermq m0, m0, q3120
vpermq m1, m1, q2031
@ -1754,7 +1754,7 @@ cglobal iadst_8x8_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2
call m(idct_8x8_internal_16bpc).transpose_8x8_packed
pshufd m4, m0, q1032
pshufd m5, m1, q1032
call m(iadst_8x8_internal).main_pass2
call m(iadst_8x8_internal_8bpc).main_pass2
vpbroadcastd m5, [pw_2048]
vpbroadcastd xm12, [pw_4096]
psubw m12, m5
@ -1814,7 +1814,7 @@ cglobal iflipadst_8x8_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2
call m(idct_8x8_internal_16bpc).transpose_8x8_packed
pshufd m4, m0, q1032
pshufd m5, m1, q1032
call m(iadst_8x8_internal).main_pass2
call m(iadst_8x8_internal_8bpc).main_pass2
vpbroadcastd m12, [pw_2048]
vpbroadcastd xm5, [pw_4096]
psubw m12, m5
@ -1971,7 +1971,7 @@ cglobal idct_8x16_internal_16bpc, 0, 7, 16, dst, stride, c, eob, tx2
jmp tx2q
.pass2:
call .transpose
call m(idct_8x16_internal).main
call m(idct_8x16_internal_8bpc).main
vpbroadcastd m12, [pw_2048]
REPX {vpermq x, x, q3120}, m0, m2, m4, m6
REPX {vpermq x, x, q2031}, m1, m3, m5, m7
@ -2167,8 +2167,8 @@ cglobal iadst_8x16_internal_16bpc, 0, 7, 16, dst, stride, c, eob, tx2
jmp tx2q
.pass2:
call m(idct_8x16_internal_16bpc).transpose
call m(iadst_8x16_internal).main
call m(iadst_8x16_internal).main_pass2_end
call m(iadst_8x16_internal_8bpc).main
call m(iadst_8x16_internal_8bpc).main_pass2_end
vpbroadcastd m8, [pw_2048]
vpbroadcastd xm12, [pw_4096]
REPX {vpermq x, x, q2031}, m0, m1, m2, m3
@ -2232,8 +2232,8 @@ cglobal iflipadst_8x16_internal_16bpc, 0, 7, 16, dst, stride, c, eob, tx2
jmp tx2q
.pass2:
call m(idct_8x16_internal_16bpc).transpose
call m(iadst_8x16_internal).main
call m(iadst_8x16_internal).main_pass2_end
call m(iadst_8x16_internal_8bpc).main
call m(iadst_8x16_internal_8bpc).main_pass2_end
vpbroadcastd m12, [pw_2048]
vpbroadcastd xm13, [pw_4096]
mova m11, m0
@ -2458,7 +2458,7 @@ cglobal idct_16x4_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2
.pass2:
call .transpose_4x16_packed
lea rax, [deint_shuf+128]
call m(idct_16x4_internal).main
call m(idct_16x4_internal_8bpc).main
.end:
vpbroadcastd m4, [pw_2048]
REPX {pmulhrsw x, m4}, m0, m1, m2, m3
@ -2517,7 +2517,7 @@ cglobal iadst_16x4_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2
.pass2:
call m(idct_16x4_internal_16bpc).transpose_4x16_packed
lea rax, [deint_shuf+128]
call m(iadst_16x4_internal).main
call m(iadst_16x4_internal_8bpc).main
jmp m(idct_16x4_internal_16bpc).end
ALIGN function_align
.main:
@ -2596,7 +2596,7 @@ cglobal iflipadst_16x4_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2
.pass2:
call m(idct_16x4_internal_16bpc).transpose_4x16_packed
lea rax, [deint_shuf+128]
call m(iadst_16x4_internal).main
call m(iadst_16x4_internal_8bpc).main
vpbroadcastd m4, [pw_2048]
pmulhrsw m5, m3, m4
pmulhrsw m6, m2, m4
@ -2712,7 +2712,7 @@ cglobal idct_16x8_internal_16bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
jmp tx2q
.pass2:
call .transpose
call m(idct_16x8_internal).main
call m(idct_16x8_internal_8bpc).main
vpbroadcastd m10, [pw_2048]
.end:
pmulhrsw m0, m10
@ -2827,8 +2827,8 @@ cglobal iadst_16x8_internal_16bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
jmp tx2q
.pass2:
call m(idct_16x8_internal_16bpc).transpose
call m(iadst_16x8_internal).main
call m(iadst_16x8_internal).main_pass2_end
call m(iadst_16x8_internal_8bpc).main
call m(iadst_16x8_internal_8bpc).main_pass2_end
vpbroadcastd m10, [pw_2048]
pxor m11, m11
psubw m11, m10
@ -3039,8 +3039,8 @@ cglobal iflipadst_16x8_internal_16bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
jmp m(iadst_16x8_internal_16bpc).pass1_end
.pass2:
call m(idct_16x8_internal_16bpc).transpose
call m(iadst_16x8_internal).main
call m(iadst_16x8_internal).main_pass2_end
call m(iadst_16x8_internal_8bpc).main
call m(iadst_16x8_internal_8bpc).main_pass2_end
vpbroadcastd m10, [pw_2048]
pxor m11, m11
psubw m11, m10
@ -3216,7 +3216,7 @@ cglobal idct_16x16_internal_16bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
call .transpose
lea rax, [pw_5+128]
mova [rsp], m15
call m(idct_16x16_internal).main
call m(idct_16x16_internal_8bpc).main
mova m1, [rsp+32*1]
.end:
call .write_16x16
@ -3450,8 +3450,8 @@ cglobal iadst_16x16_internal_16bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
call m(idct_16x16_internal_16bpc).transpose
lea rax, [pw_5+128]
mova [rsp], m15
call m(iadst_16x16_internal).main
call m(iadst_16x16_internal).main_pass2_end
call m(iadst_16x16_internal_8bpc).main
call m(iadst_16x16_internal_8bpc).main_pass2_end
mova [rsp+32*0], m8
mova [rsp+32*2], m12
mova [rsp+32*3], m13
@ -3582,8 +3582,8 @@ cglobal iflipadst_16x16_internal_16bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx
call m(idct_16x16_internal_16bpc).transpose
lea rax, [pw_5+128]
mova [rsp], m15
call m(iadst_16x16_internal).main
call m(iadst_16x16_internal).main_pass2_end
call m(iadst_16x16_internal_8bpc).main
call m(iadst_16x16_internal_8bpc).main_pass2_end
mova [rsp+32*3], m3
mova [rsp+32*2], m2
mova [rsp+32*0], m0
@ -3740,7 +3740,7 @@ cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 0, dst, stride, c, eob
vpbroadcastd m10, [pw_2048]
lea rax, [deint_shuf+128]
REPX {mova x, m4}, m5, m6, m7
call m(inv_txfm_add_dct_dct_8x32).main_fast
call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
jmp .end
.eob107:
mova [rsp+32*3], m3
@ -3778,7 +3778,7 @@ cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 0, dst, stride, c, eob
lea rax, [deint_shuf+128]
mova m11, [rsp+32*3] ; out13 out15
vpbroadcastd m10, [pw_2048]
call m(inv_txfm_add_dct_dct_8x32).main
call m(inv_txfm_add_dct_dct_8x32_8bpc).main
.end: ; [rsp+0*32] = m12
vpbroadcastd m12, [pw_2048]
mov cq, r4
@ -4294,7 +4294,7 @@ cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 0, dst, stride, c, eob
RET
ALIGN function_align
.pass2:
call m(idct_16x8_internal).main
call m(idct_16x8_internal_8bpc).main
REPX {pmulhrsw x, m11}, m0, m1, m2, m3
call m(idct_16x8_internal_16bpc).write_16x4_start
pmulhrsw m0, m11, m4
@ -4404,7 +4404,7 @@ cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 0, dst, stride, c, eob
mova m3, [r4+32*3]
.fast:
lea rax, [pw_5+128]
call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
pxor m8, m8
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
jmp .idct16
@ -4456,7 +4456,7 @@ cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 0, dst, stride, c, eob
mova m6, [r4-32*2]
mova m7, [r4-32*1]
lea rax, [pw_5 + 128]
call m(inv_txfm_add_dct_dct_16x32).main_oddhalf
call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
lea r3, [rsp+32*8]
mova m8, [r3+32*0]
mova m9, [r3+32*1]
@ -4477,7 +4477,7 @@ cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 0, dst, stride, c, eob
mova m6, [r3-32*2]
mova m7, [r3-32*1]
mova [rsp], m15
call m(idct_16x16_internal).main
call m(idct_16x16_internal_8bpc).main
imul r2, strideq, 19
lea r3, [strideq*3]
add r2, dstq
@ -4711,7 +4711,7 @@ cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 0, dst, stride, c, eob
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp]
lea rax, [pw_5+128]
mov r7, dstq
call m(idct_16x16_internal).main
call m(idct_16x16_internal_8bpc).main
call .write_16x16
mova m0, [r5+32*3]
mova m1, [r5+32*2]
@ -4750,7 +4750,7 @@ cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 0, dst, stride, c, eob
call .transpose_16x16
lea rax, [pw_5+128]
mov r7, dstq
call m(idct_16x16_internal).main
call m(idct_16x16_internal_8bpc).main
call .write_16x16
mova m0, [r5+32*3]
mova m1, [r5+32*2]
@ -4764,7 +4764,7 @@ cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 0, dst, stride, c, eob
call .transpose_16x16
.end:
lea dstq, [r7+32]
call m(idct_16x16_internal).main
call m(idct_16x16_internal_8bpc).main
call .write_16x16
RET
ALIGN function_align
@ -5124,7 +5124,7 @@ ALIGN function_align
mova m13, [r3+32*51] ; 27
mova m14, [r3+32*53] ; 29
mova m15, [r3+32*55] ; 31
jmp m(inv_txfm_add_dct_dct_16x32).main_oddhalf
jmp m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
ALIGN function_align
.pass2_evenhalf:
mova m0, [r3+32* 0] ; 0
@ -5144,7 +5144,7 @@ ALIGN function_align
mova m14, [r3+32*52] ; 28
mova m15, [r3+32*54] ; 30
mova [rsp+gprsize], m15
jmp m(idct_16x16_internal).main
jmp m(idct_16x16_internal_8bpc).main
cglobal inv_txfm_add_identity_identity_32x32_16bpc, 4, 8, 8, dst, stride, c, eob
%undef cmp
@ -5300,7 +5300,7 @@ cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 0, dst, stride, c, eob
pxor m8, m8
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
mova [rsp], m8
call m(idct_16x16_internal).main
call m(idct_16x16_internal_8bpc).main
mova m1, [rsp+32*1]
lea r4, [rsp+32*38]
mova [r4-32*4], m0
@ -5330,7 +5330,7 @@ cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 0, dst, stride, c, eob
mova m7, [rsp+32*32] ; in30
lea r5, [r4+32*16]
add r4, 32*8
call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
mova m0, [rsp+32* 3] ; in1
mova m1, [rsp+32*33] ; in31
mova m2, [rsp+32*19] ; in17
@ -5342,7 +5342,7 @@ cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 0, dst, stride, c, eob
lea rax, [idct64_mul - 8]
add r4, 32*16
add r5, 32*32
call m(inv_txfm_add_dct_dct_16x64).main_part1
call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
mova m0, [rsp+32* 7] ; in5
mova m1, [rsp+32*29] ; in27
mova m2, [rsp+32*23] ; in21
@ -5354,7 +5354,7 @@ cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 0, dst, stride, c, eob
add rax, 8
add r4, 32*8
sub r5, 32*8
call m(inv_txfm_add_dct_dct_16x64).main_part1
call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
lea r8, [strideq*4]
lea r9, [strideq*5]
lea r3, [r9+strideq*1] ; stride*6
@ -5449,7 +5449,7 @@ ALIGN function_align
lea r2, [dstq+r7]
.main_part2_pass2_loop:
vpbroadcastd m14, [pw_m2896_2896]
call m(inv_txfm_add_dct_dct_16x64).main_part2_internal
call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_internal
vpbroadcastd m14, [pw_2048]
IDCT64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*8, r7*8
IDCT64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*8, r7*8
@ -5648,7 +5648,7 @@ cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 0, dst, stride, c, eob
pxor m8, m8
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
mova [rsp], m8
call m(idct_16x16_internal).main
call m(idct_16x16_internal_8bpc).main
mova m1, [rsp+32*1]
lea r4, [rsp+32*70]
mova [r4-32*4], m0
@ -5678,7 +5678,7 @@ cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 0, dst, stride, c, eob
mova m7, [r10+32*56] ; in30
lea r5, [r4+32*16]
add r4, 32*8
call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
mova m0, [r10+32* 3] ; in1
mova m1, [r10+32*57] ; in31
mova m2, [r10+32*35] ; in17
@ -5690,7 +5690,7 @@ cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 0, dst, stride, c, eob
lea rax, [idct64_mul - 8]
add r4, 32*16
add r5, 32*32
call m(inv_txfm_add_dct_dct_16x64).main_part1
call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
mova m0, [r10+32* 7] ; in5
mova m1, [r10+32*53] ; in27
mova m2, [r10+32*39] ; in21
@ -5702,7 +5702,7 @@ cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 0, dst, stride, c, eob
add rax, 8
add r4, 32*8
sub r5, 32*8
call m(inv_txfm_add_dct_dct_16x64).main_part1
call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part2_pass2
add r10, 32*8
sub r4, 32*98 ; rsp+32*16
@ -5877,7 +5877,7 @@ cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 0, dst, stride, c, eob
mova m15, [r7+32*3]
sub r7, 32*24
mova [rsp], m15
call m(idct_16x16_internal).main
call m(idct_16x16_internal_8bpc).main
mova m1, [rsp+32*1]
call m(inv_txfm_add_dct_dct_32x16_16bpc).write_16x16
add r5, 32
@ -6109,7 +6109,7 @@ cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 0, dst, stride, c, eob
mova m13, [r7-32* 1]
mova m14, [r7+32* 1]
mova m15, [r7+32* 3]
call m(inv_txfm_add_dct_dct_16x32).main_oddhalf
call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
mova m0, [r7-32*100]
mova m1, [r7-32*98]
mova m2, [r7-32*96]
@ -6128,7 +6128,7 @@ cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 0, dst, stride, c, eob
mova m15, [r7+32* 2]
add r7, 32*8
mova [rsp], m15
call m(idct_16x16_internal).main
call m(idct_16x16_internal_8bpc).main
call m(inv_txfm_add_dct_dct_16x32_16bpc).pass2_end
sub dstq, r3
lea r2, [r2+r3+32]
@ -6248,7 +6248,7 @@ cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 0, dst, stride, c, eob
pxor m8, m8
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
mova [rsp], m8
call m(idct_16x16_internal).main
call m(idct_16x16_internal_8bpc).main
mova m1, [rsp+32*1]
mova [r4-32*4], m0
mova [r4-32*3], m1
@ -6277,7 +6277,7 @@ cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 0, dst, stride, c, eob
mova m7, [r10+32* 2] ; in30
lea r5, [r4+32*16]
add r4, 32*8
call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
mova m0, [r10-32*99] ; in1
mova m1, [r10+32* 3] ; in31
mova m2, [r10-32*35] ; in17
@ -6289,7 +6289,7 @@ cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 0, dst, stride, c, eob
lea rax, [idct64_mul - 8]
add r4, 32*16
add r5, 32*32
call m(inv_txfm_add_dct_dct_16x64).main_part1
call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
mova m0, [r10-32*95] ; in5
mova m1, [r10-32* 1] ; in27
mova m2, [r10-32*31] ; in21
@ -6301,7 +6301,7 @@ cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 0, dst, stride, c, eob
add rax, 8
add r4, 32*8
sub r5, 32*8
call m(inv_txfm_add_dct_dct_16x64).main_part1
call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part2_pass2
add r10, 32*8
sub dstq, r8

8066
third_party/dav1d/src/x86/itx16_sse.asm (vendored)

Diff not shown because of its large size.

442
third_party/dav1d/src/x86/itx_avx2.asm (vendored)

Diff not shown because of its large size.

180
third_party/dav1d/src/x86/itx_init_tmpl.c (vendored)

@ -1,5 +1,5 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018-2021, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
@ -29,78 +29,57 @@
#include "src/itx.h"
#define decl_itx2_fns(w, h, opt) \
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_##w##x##h##_##opt); \
decl_itx_fn(dav1d_inv_txfm_add_identity_identity_##w##x##h##_##opt)
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt))
#define decl_itx12_fns(w, h, opt) \
decl_itx2_fns(w, h, opt); \
decl_itx_fn(dav1d_inv_txfm_add_dct_adst_##w##x##h##_##opt); \
decl_itx_fn(dav1d_inv_txfm_add_dct_flipadst_##w##x##h##_##opt); \
decl_itx_fn(dav1d_inv_txfm_add_dct_identity_##w##x##h##_##opt); \
decl_itx_fn(dav1d_inv_txfm_add_adst_dct_##w##x##h##_##opt); \
decl_itx_fn(dav1d_inv_txfm_add_adst_adst_##w##x##h##_##opt); \
decl_itx_fn(dav1d_inv_txfm_add_adst_flipadst_##w##x##h##_##opt); \
decl_itx_fn(dav1d_inv_txfm_add_flipadst_dct_##w##x##h##_##opt); \
decl_itx_fn(dav1d_inv_txfm_add_flipadst_adst_##w##x##h##_##opt); \
decl_itx_fn(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h##_##opt); \
decl_itx_fn(dav1d_inv_txfm_add_identity_dct_##w##x##h##_##opt)
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt))
#define decl_itx16_fns(w, h, opt) \
decl_itx12_fns(w, h, opt); \
decl_itx_fn(dav1d_inv_txfm_add_adst_identity_##w##x##h##_##opt); \
decl_itx_fn(dav1d_inv_txfm_add_flipadst_identity_##w##x##h##_##opt); \
decl_itx_fn(dav1d_inv_txfm_add_identity_adst_##w##x##h##_##opt); \
decl_itx_fn(dav1d_inv_txfm_add_identity_flipadst_##w##x##h##_##opt)
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt))
#define decl_itx17_fns(w, h, opt) \
decl_itx16_fns(w, h, opt); \
decl_itx_fn(dav1d_inv_txfm_add_wht_wht_##w##x##h##_##opt)
decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))
#define avx2_fns(avx2) \
decl_itx17_fns( 4, 4, avx2); \
decl_itx16_fns( 4, 8, avx2); \
decl_itx16_fns( 4, 16, avx2); \
decl_itx16_fns( 8, 4, avx2); \
decl_itx16_fns( 8, 8, avx2); \
decl_itx16_fns( 8, 16, avx2); \
decl_itx2_fns ( 8, 32, avx2); \
decl_itx16_fns(16, 4, avx2); \
decl_itx16_fns(16, 8, avx2); \
decl_itx12_fns(16, 16, avx2); \
decl_itx2_fns (16, 32, avx2); \
decl_itx2_fns (32, 8, avx2); \
decl_itx2_fns (32, 16, avx2); \
decl_itx2_fns (32, 32, avx2); \
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_16x64_##avx2); \
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_32x64_##avx2); \
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_##avx2); \
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_##avx2); \
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_##avx2)
avx2_fns(avx2);
avx2_fns(16bpc_avx2);
decl_itx17_fns( 4, 4, ssse3);
decl_itx16_fns( 4, 8, ssse3);
decl_itx16_fns( 8, 4, ssse3);
decl_itx16_fns( 8, 8, ssse3);
decl_itx16_fns( 4, 16, ssse3);
decl_itx16_fns(16, 4, ssse3);
decl_itx16_fns( 8, 16, ssse3);
decl_itx16_fns(16, 8, ssse3);
decl_itx12_fns(16, 16, ssse3);
decl_itx2_fns ( 8, 32, ssse3);
decl_itx2_fns (32, 8, ssse3);
decl_itx2_fns (16, 32, ssse3);
decl_itx2_fns (32, 16, ssse3);
decl_itx2_fns (32, 32, ssse3);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_16x64_ssse3);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_32x64_ssse3);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_ssse3);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_ssse3);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_ssse3);
#define decl_itx_fns(ext) \
decl_itx17_fns( 4, 4, ext); \
decl_itx16_fns( 4, 8, ext); \
decl_itx16_fns( 4, 16, ext); \
decl_itx16_fns( 8, 4, ext); \
decl_itx16_fns( 8, 8, ext); \
decl_itx16_fns( 8, 16, ext); \
decl_itx2_fns ( 8, 32, ext); \
decl_itx16_fns(16, 4, ext); \
decl_itx16_fns(16, 8, ext); \
decl_itx12_fns(16, 16, ext); \
decl_itx2_fns (16, 32, ext); \
decl_itx2_fns (32, 8, ext); \
decl_itx2_fns (32, 16, ext); \
decl_itx2_fns (32, 32, ext); \
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x64, ext)); \
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x64, ext)); \
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, ext)); \
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, ext)); \
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, ext))
decl_itx_fns(avx2);
decl_itx_fns(sse4);
decl_itx_fns(ssse3);
decl_itx_fn(dav1d_inv_txfm_add_wht_wht_4x4_16bpc_sse2);
COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c,
@ -108,7 +87,7 @@ COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c,
{
#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
dav1d_inv_txfm_add_##type##_##w##x##h##_##ext
BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
#define assign_itx1_fn(pfx, w, h, ext) \
assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext)
@ -146,7 +125,7 @@ COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c,
if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
#if BITDEPTH == 16
assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, 16bpc_sse2);
assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, sse2);
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
@ -173,38 +152,59 @@ COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c,
assign_itx1_fn ( , 64, 64, ssse3);
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
#if BITDEPTH == 16
if (bpc <= 10) {
assign_itx16_fn(, 4, 4, sse4);
assign_itx16_fn(R, 4, 8, sse4);
assign_itx16_fn(R, 4, 16, sse4);
assign_itx16_fn(R, 8, 4, sse4);
assign_itx16_fn(, 8, 8, sse4);
assign_itx16_fn(R, 8, 16, sse4);
assign_itx16_fn(R, 16, 4, sse4);
assign_itx16_fn(R, 16, 8, sse4);
assign_itx12_fn(, 16, 16, sse4);
assign_itx2_fn (R, 8, 32, sse4);
assign_itx2_fn (R, 32, 8, sse4);
assign_itx2_fn (R, 16, 32, sse4);
assign_itx2_fn (R, 32, 16, sse4);
assign_itx2_fn (, 32, 32, sse4);
assign_itx1_fn (R, 16, 64, sse4);
assign_itx1_fn (R, 32, 64, sse4);
assign_itx1_fn (R, 64, 16, sse4);
assign_itx1_fn (R, 64, 32, sse4);
assign_itx1_fn (, 64, 64, sse4);
}
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
#if ARCH_X86_64 && BITDEPTH == 16
assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, 16bpc_avx2);
#if ARCH_X86_64
assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, avx2);
#endif
if (bpc > 10) return;
#if ARCH_X86_64
#if BITDEPTH == 8
#define SUFFIX avx2
#else
#define SUFFIX 16bpc_avx2
#endif
assign_itx17_fn( , 4, 4, SUFFIX);
assign_itx16_fn(R, 4, 8, SUFFIX);
assign_itx16_fn(R, 4, 16, SUFFIX);
assign_itx16_fn(R, 8, 4, SUFFIX);
assign_itx16_fn( , 8, 8, SUFFIX);
assign_itx16_fn(R, 8, 16, SUFFIX);
assign_itx2_fn (R, 8, 32, SUFFIX);
assign_itx16_fn(R, 16, 4, SUFFIX);
assign_itx16_fn(R, 16, 8, SUFFIX);
assign_itx12_fn( , 16, 16, SUFFIX);
assign_itx2_fn (R, 16, 32, SUFFIX);
assign_itx1_fn (R, 16, 64, SUFFIX);
assign_itx2_fn (R, 32, 8, SUFFIX);
assign_itx2_fn (R, 32, 16, SUFFIX);
assign_itx2_fn ( , 32, 32, SUFFIX);
assign_itx1_fn (R, 32, 64, SUFFIX);
assign_itx1_fn (R, 64, 16, SUFFIX);
assign_itx1_fn (R, 64, 32, SUFFIX);
assign_itx1_fn ( , 64, 64, SUFFIX);
assign_itx17_fn( , 4, 4, avx2);
assign_itx16_fn(R, 4, 8, avx2);
assign_itx16_fn(R, 4, 16, avx2);
assign_itx16_fn(R, 8, 4, avx2);
assign_itx16_fn( , 8, 8, avx2);
assign_itx16_fn(R, 8, 16, avx2);
assign_itx2_fn (R, 8, 32, avx2);
assign_itx16_fn(R, 16, 4, avx2);
assign_itx16_fn(R, 16, 8, avx2);
assign_itx12_fn( , 16, 16, avx2);
assign_itx2_fn (R, 16, 32, avx2);
assign_itx1_fn (R, 16, 64, avx2);
assign_itx2_fn (R, 32, 8, avx2);
assign_itx2_fn (R, 32, 16, avx2);
assign_itx2_fn ( , 32, 32, avx2);
assign_itx1_fn (R, 32, 64, avx2);
assign_itx1_fn (R, 64, 16, avx2);
assign_itx1_fn (R, 64, 32, avx2);
assign_itx1_fn ( , 64, 64, avx2);
#endif
}

1155
third_party/dav1d/src/x86/itx_sse.asm (vendored)

Diff not shown because of its large size.


@ -623,9 +623,7 @@ SECTION .text
paddw m8, m5 ; p6*7+p3+p1+q0
paddw m8, m10 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
psrlw m10, m8, 4
pand m10, m1
pandn m9, m1, m2
por m10, m9
vpblendvb m10, m2, m10, m1
%ifidn %2, v
mova [tmpq+strideq*2], m10 ; p5
%else
@ -638,9 +636,7 @@ SECTION .text
paddw m8, m6
psubw m8, m10
psrlw m10, m8, 4
pand m10, m1
pandn m9, m1, m7
por m10, m9
vpblendvb m10, m7, m10, m1
%ifidn %2, v
mova [tmpq+stride3q], m10 ; p4
%else
@ -653,9 +649,7 @@ SECTION .text
psubw m8, m2
paddw m8, m10
psrlw m10, m8, 4
pand m10, m1
pandn m9, m1, m11
por m10, m9
vpblendvb m10, m11, m10, m1
%ifidn %2, v
mova [tmpq+strideq*4], m10 ; p3
lea tmpq, [dstq+strideq*4]
@ -669,9 +663,7 @@ SECTION .text
paddw m8, m15
psubw m8, m10
psrlw m10, m8, 4
pand m10, m1
pandn m9, m1, m13
por m10, m9
vpblendvb m10, m13, m10, m1
mova [rsp+1*32], m10 ; don't clobber p2/m13
; sub p6/p3, add p0/q4
@ -684,9 +676,7 @@ SECTION .text
%endif
psubw m8, m10
psrlw m10, m8, 4
pand m10, m1
pandn m9, m1, m3
por m10, m9
vpblendvb m10, m3, m10, m1
mova [rsp+2*32], m10 ; don't clobber p1/m3
; sub p6/p2, add q0/q5
@ -699,9 +689,7 @@ SECTION .text
%endif
psubw m8, m10
psrlw m10, m8, 4
pand m10, m1
pandn m9, m1, m4
por m10, m9
vpblendvb m10, m4, m10, m1
mova [rsp+3*32], m10 ; don't clobber p0/m4
; sub p6/p1, add q1/q6
@ -715,9 +703,7 @@ SECTION .text
paddw m8, m0
psubw m8, m10
psrlw m10, m8, 4
pand m10, m1
pandn m9, m1, m5
por m10, m9
vpblendvb m10, m5, m10, m1
mova [rsp+4*32], m10 ; don't clobber q0/m5
; sub p5/p0, add q2/q6
@ -726,9 +712,7 @@ SECTION .text
paddw m8, m0
psubw m8, m10
psrlw m10, m8, 4
pand m10, m1
pandn m9, m1, m6
por m2, m10, m9 ; don't clobber q1/m6
vpblendvb m2, m6, m10, m1 ; don't clobber q1/m6
; sub p4/q0, add q3/q6
paddw m8, m15
@ -736,9 +720,7 @@ SECTION .text
paddw m8, m0
psubw m8, m10
psrlw m10, m8, 4
pand m10, m1
pandn m9, m1, m14
por m7, m10, m9 ; don't clobber q2/m14
vpblendvb m7, m14, m10, m1 ; don't clobber q2/m14
; sub p3/q1, add q4/q6
%ifidn %2, v
@ -750,9 +732,7 @@ SECTION .text
paddw m8, m0
psubw m8, m10
psrlw m10, m8, 4
pand m10, m1
pandn m9, m1, m15
por m10, m9
vpblendvb m10, m15, m10, m1
%ifidn %2, v
mova [tmpq+mstrideq], m10 ; q3
%else
@ -769,13 +749,12 @@ SECTION .text
paddw m8, m0
psubw m8, m10
psrlw m10, m8, 4
pand m10, m1
%ifidn %2, v
pandn m9, m1, [tmpq+strideq*0]
mova m9, [tmpq+strideq*0]
%else
pandn m9, m1, [rsp+10*32]
mova m9, [rsp+10*32]
%endif
por m10, m9
vpblendvb m10, m9, m10, m1
%ifidn %2, v
mova [tmpq+strideq*0], m10 ; q4
%else
@ -790,11 +769,11 @@ SECTION .text
psrlw m10, m8, 4
pand m10, m1
%ifidn %2, v
pandn m9, m1, [tmpq+strideq*1]
mova m9, [tmpq+strideq*1]
%else
pandn m9, m1, [rsp+11*32]
mova m9, [rsp+11*32]
%endif
por m10, m9
vpblendvb m10, m9, m10, m1
%ifidn %2, v
mova [tmpq+strideq*1], m10 ; q5
%else
@ -859,14 +838,12 @@ SECTION .text
paddw m2, m0
pmulhrsw m2, [pw_4096]
REPX {pand x, m9}, m7, m8, m10, m11, m1, m2
REPX {pandn x, m9, x}, m13, m3, m4, m5, m6, m14
por m13, m7
por m3, m8
por m4, m10
por m5, m11
por m6, m1
por m14, m2
vpblendvb m13, m13, m7, m9
vpblendvb m3, m3, m8, m9
vpblendvb m4, m4, m10, m9
vpblendvb m5, m5, m11, m9
vpblendvb m6, m6, m1, m9
vpblendvb m14, m14, m2, m9
%ifidn %2, v
mova [tmpq+strideq*1], m13 ; p2
@ -984,12 +961,10 @@ SECTION .text
paddw m8, m14
pmulhrsw m8, [pw_4096]
REPX {pand x, m9}, m2, m10, m11, m8
REPX {pandn x, m9, x}, m3, m4, m5, m6
por m3, m2
por m4, m10
por m5, m11
por m6, m8
vpblendvb m3, m3, m2, m9
vpblendvb m4, m4, m10, m9
vpblendvb m5, m5, m11, m9
vpblendvb m6, m6, m8, m9
%ifidn %2, v
mova [tmpq+strideq*2], m3 ; p1

1996
third_party/dav1d/src/x86/loopfilter16_sse.asm (vendored)

Diff not shown because of its large size.

100
third_party/dav1d/src/x86/loopfilter_avx2.asm (vendored)

@ -1,4 +1,4 @@
; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018-2021, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
@ -822,9 +822,7 @@ SECTION .text
pmulhrsw m8, m10, [pw_2048]
pmulhrsw m9, m11, [pw_2048]
packuswb m8, m9
pand m8, m1
pandn m9, m1, m7
por m8, m9
vpblendvb m8, m7, m8, m1
%ifidn %2, v
mova [tmpq+stride3q], m8 ; p4
%else
@ -850,9 +848,7 @@ SECTION .text
pmulhrsw m8, m10, [pw_2048]
pmulhrsw m9, m11, [pw_2048]
packuswb m8, m9
pand m8, m1
pandn m9, m1, m12
por m8, m9
vpblendvb m8, m12, m8, m1
%ifidn %2, v
mova [tmpq+strideq*4], m8 ; p3
%else
@ -878,9 +874,7 @@ SECTION .text
pmulhrsw m8, m10, [pw_2048]
pmulhrsw m9, m11, [pw_2048]
packuswb m8, m9
pand m8, m1
pandn m9, m1, m13
por m8, m9
vpblendvb m8, m13, m8, m1
mova [rsp+6*32], m8 ; don't clobber p2/m13 since we need it in F
; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E
@ -910,9 +904,7 @@ SECTION .text
pmulhrsw m8, m10, [pw_2048]
pmulhrsw m9, m11, [pw_2048]
packuswb m8, m9
pand m8, m1
pandn m9, m1, m3
por m8, m9
vpblendvb m8, m3, m8, m1
mova [rsp+8*32], m8 ; don't clobber p1/m3 since we need it in G
; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F
@ -940,9 +932,7 @@ SECTION .text
pmulhrsw m0, m10, [pw_2048]
pmulhrsw m8, m11, [pw_2048]
packuswb m0, m8
pand m0, m1
pandn m8, m1, m4
por m0, m8
vpblendvb m0, m4, m0, m1
mova [rsp+6*32], m0 ; don't clobber p0/m4 since we need it in H
; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G
@ -966,9 +956,7 @@ SECTION .text
pmulhrsw m8, m10, [pw_2048]
pmulhrsw m9, m11, [pw_2048]
packuswb m8, m9
pand m8, m1
pandn m9, m1, m5
por m8, m9
vpblendvb m8, m5, m8, m1
mova [rsp+8*32], m8 ; don't clobber q0/m5 since we need it in I
; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H
@ -985,9 +973,7 @@ SECTION .text
pmulhrsw m2, m10, [pw_2048]
pmulhrsw m9, m11, [pw_2048]
packuswb m2, m9
pand m2, m1
pandn m9, m1, m6
por m2, m9 ; don't clobber q1/m6 since we need it in K
vpblendvb m2, m6, m2, m1 ; don't clobber q1/m6 since we need it in K
; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I
; write +2
@ -1003,9 +989,7 @@ SECTION .text
pmulhrsw m7, m10, [pw_2048]
pmulhrsw m9, m11, [pw_2048]
packuswb m7, m9
pand m7, m1
pandn m9, m1, m14
por m7, m9 ; don't clobber q2/m14 since we need it in K
vpblendvb m7, m14, m7, m1 ; don't clobber q2/m14 since we need it in K
; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J
; write +3
@ -1021,9 +1005,7 @@ SECTION .text
pmulhrsw m8, m10, [pw_2048]
pmulhrsw m9, m11, [pw_2048]
packuswb m8, m9
pand m8, m1
pandn m9, m1, m15
por m8, m9
vpblendvb m8, m15, m8, m1
%ifidn %2, v
mova [tmpq+mstrideq], m8 ; q3
%else
@ -1044,13 +1026,12 @@ SECTION .text
pmulhrsw m8, m10, [pw_2048]
pmulhrsw m9, m11, [pw_2048]
packuswb m8, m9
pand m8, m1
%ifidn %2, v
pandn m9, m1, [tmpq+strideq*0]
mova m9, [tmpq+strideq*0]
%else
pandn m9, m1, [rsp+15*32]
mova m9, [rsp+15*32]
%endif
por m8, m9
vpblendvb m8, m9, m8, m1
%ifidn %2, v
mova [tmpq+strideq*0], m8 ; q4
%else
@ -1070,13 +1051,12 @@ SECTION .text
pmulhrsw m10, [pw_2048]
pmulhrsw m11, [pw_2048]
packuswb m10, m11
pand m10, m1
%ifidn %2, v
pandn m11, m1, [tmpq+strideq*1]
mova m11, [tmpq+strideq*1]
%else
pandn m11, m1, [rsp+16*32]
mova m11, [rsp+16*32]
%endif
por m10, m11
vpblendvb m10, m11, m10, m1
%ifidn %2, v
mova [tmpq+strideq*1], m10 ; q5
%else
@ -1109,9 +1089,7 @@ SECTION .text
psrlw m8, m2, 3
psrlw m11, m7, 3
packuswb m8, m11
pand m8, m9
pandn m11, m9, m13
por m10, m8, m11 ; p2
vpblendvb m10, m13, m8, m9 ; p2
%ifidn %2, v
mova [tmpq+strideq*1], m10 ; p2
%endif
@ -1129,9 +1107,7 @@ SECTION .text
psrlw m8, m2, 3
psrlw m11, m7, 3
packuswb m8, m11
pand m8, m9
pandn m11, m9, m3
por m8, m11 ; p1
vpblendvb m8, m3, m8, m9 ; p1
%ifidn %2, v
mova [tmpq+strideq*2], m8 ; p1
%else
@ -1151,9 +1127,7 @@ SECTION .text
psrlw m8, m2, 3
psrlw m11, m7, 3
packuswb m8, m11
pand m8, m9
pandn m11, m9, m4
por m8, m11 ; p0
vpblendvb m8, m4, m8, m9 ; p0
%ifidn %2, v
mova [tmpq+stride3q ], m8 ; p0
%else
@ -1175,9 +1149,7 @@ SECTION .text
psrlw m8, m2, 3
psrlw m11, m7, 3
packuswb m8, m11
pand m8, m9
pandn m11, m9, m5
por m11, m8, m11 ; q0
vpblendvb m11, m5, m8, m9 ; q0
%ifidn %2, v
mova [dstq+strideq*0], m11 ; q0
%endif
@ -1195,9 +1167,7 @@ SECTION .text
psrlw m8, m2, 3
psrlw m13, m7, 3
packuswb m8, m13
pand m8, m9
pandn m13, m9, m6
por m13, m8, m13 ; q1
vpblendvb m13, m6, m8, m9 ; q1
%ifidn %2, v
mova [dstq+strideq*1], m13 ; q1
%endif
@ -1217,9 +1187,7 @@ SECTION .text
psrlw m2, 3
psrlw m7, 3
packuswb m2, m7
pand m2, m9
pandn m7, m9, m14
por m2, m7 ; q2
vpblendvb m2, m14, m2, m9 ; q2
%ifidn %2, v
mova [dstq+strideq*2], m2 ; q2
%else
@ -1380,9 +1348,7 @@ SECTION .text
pmulhrsw m2, m0, [pw_4096]
pmulhrsw m12, m1, [pw_4096]
packuswb m2, m12
pand m2, m9
pandn m12, m9, m3
por m2, m12
vpblendvb m2, m3, m2, m9
%ifidn %2, v
mova [tmpq+strideq*2], m2 ; p1
%endif
@ -1400,9 +1366,7 @@ SECTION .text
pmulhrsw m12, m0, [pw_4096]
pmulhrsw m13, m1, [pw_4096]
packuswb m12, m13
pand m12, m9
pandn m13, m9, m4
por m12, m13
vpblendvb m12, m4, m12, m9
%ifidn %2, v
mova [tmpq+stride3q], m12 ; p0
%endif
@ -1418,9 +1382,7 @@ SECTION .text
pmulhrsw m14, m0, [pw_4096]
pmulhrsw m13, m1, [pw_4096]
packuswb m14, m13
pand m14, m9
pandn m13, m9, m5
por m14, m13
vpblendvb m14, m5, m14, m9
%ifidn %2, v
mova [dstq+strideq*0], m14 ; q0
%endif
@ -1436,9 +1398,7 @@ SECTION .text
pmulhrsw m0, [pw_4096]
pmulhrsw m1, [pw_4096]
packuswb m0, m1
pand m0, m9
pandn m9, m6
por m0, m9
vpblendvb m0, m6, m0, m9
%ifidn %2, v
mova [dstq+strideq*1], m0 ; q1
%else
@ -1457,7 +1417,7 @@ SECTION .text
%endmacro
INIT_YMM avx2
cglobal lpf_v_sb_y, 7, 10, 16, 32 * 11, \
cglobal lpf_v_sb_y_8bpc, 7, 10, 16, 32 * 11, \
dst, stride, mask, l, l_stride, lut, \
w, stride3, mstride, tmp
shl l_strideq, 2
@ -1495,7 +1455,7 @@ cglobal lpf_v_sb_y, 7, 10, 16, 32 * 11, \
RET
INIT_YMM avx2
cglobal lpf_h_sb_y, 7, 10, 16, 32 * 21, \
cglobal lpf_h_sb_y_8bpc, 7, 10, 16, 32 * 21, \
dst, stride, mask, l, l_stride, lut, \
h, stride3, l_stride3, tmp
shl l_strideq, 2
@ -1535,7 +1495,7 @@ cglobal lpf_h_sb_y, 7, 10, 16, 32 * 21, \
RET
INIT_YMM avx2
cglobal lpf_v_sb_uv, 7, 10, 16, \
cglobal lpf_v_sb_uv_8bpc, 7, 10, 16, \
dst, stride, mask, l, l_stride, lut, \
w, stride3, mstride, tmp
shl l_strideq, 2
@ -1566,7 +1526,7 @@ cglobal lpf_v_sb_uv, 7, 10, 16, \
RET
INIT_YMM avx2
cglobal lpf_h_sb_uv, 7, 10, 16, \
cglobal lpf_h_sb_uv_8bpc, 7, 10, 16, \
dst, stride, mask, l, l_stride, lut, \
h, stride3, l_stride3, tmp
shl l_strideq, 2


@ -1,5 +1,5 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018-2021, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
@ -29,48 +29,30 @@
#include "src/loopfilter.h"
#define decl_loopfilter_sb_fns(ext) \
decl_loopfilter_sb_fn(dav1d_lpf_h_sb_y_##ext); \
decl_loopfilter_sb_fn(dav1d_lpf_v_sb_y_##ext); \
decl_loopfilter_sb_fn(dav1d_lpf_h_sb_uv_##ext); \
decl_loopfilter_sb_fn(dav1d_lpf_v_sb_uv_##ext)
decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, ext)); \
decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, ext)); \
decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, ext)); \
decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, ext))
decl_loopfilter_sb_fns(ssse3);
decl_loopfilter_sb_fns(avx2);
decl_loopfilter_sb_fns(16bpc_ssse3);
decl_loopfilter_sb_fns(16bpc_avx2);
COLD void bitfn(dav1d_loop_filter_dsp_init_x86)(Dav1dLoopFilterDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
#if BITDEPTH == 8
c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_ssse3;
c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_ssse3;
c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_ssse3;
c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_ssse3;
#else
#if ARCH_X86_64
c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_16bpc_ssse3;
c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_16bpc_ssse3;
c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_16bpc_ssse3;
c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_16bpc_ssse3;
#endif
#endif
c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, ssse3);
c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, ssse3);
c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, ssse3);
c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, ssse3);
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
#if ARCH_X86_64
#if BITDEPTH == 8
c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_avx2;
c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_avx2;
c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_avx2;
c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_avx2;
#else
c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_16bpc_avx2;
c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_16bpc_avx2;
c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_16bpc_avx2;
c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_16bpc_avx2;
#endif
c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, avx2);
c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, avx2);
c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, avx2);
c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, avx2);
#endif
}

18
third_party/dav1d/src/x86/loopfilter_sse.asm (vendored)

@ -1,4 +1,4 @@
; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018-2021, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
@ -1977,11 +1977,11 @@ SECTION .text
INIT_XMM ssse3
%if ARCH_X86_64
cglobal lpf_v_sb_y, 7, 11, 16, 16 * 15, \
cglobal lpf_v_sb_y_8bpc, 7, 11, 16, 16 * 15, \
dst, stride, mask, l, l_stride, lut, \
w, stride3, mstride, tmp, mask_bits
%else
cglobal lpf_v_sb_y, 6, 7, 8, -16 * (26 + copy_args), \
cglobal lpf_v_sb_y_8bpc, 6, 7, 8, -16 * (26 + copy_args), \
dst, stride, mask, l, l_stride, lut, mask_bits
RELOC_ARGS w
SETUP_PIC
@ -2075,11 +2075,11 @@ cglobal lpf_v_sb_y, 6, 7, 8, -16 * (26 + copy_args), \
INIT_XMM ssse3
%if ARCH_X86_64
cglobal lpf_h_sb_y, 7, 11, 16, 16 * 26, \
cglobal lpf_h_sb_y_8bpc, 7, 11, 16, 16 * 26, \
dst, stride, mask, l, l_stride, lut, \
h, stride3, l_stride3, tmp, mask_bits
%else
cglobal lpf_h_sb_y, 6, 7, 8, -16 * (39 + copy_args), \
cglobal lpf_h_sb_y_8bpc, 6, 7, 8, -16 * (39 + copy_args), \
dst, stride, mask, l, l_stride, lut, mask_bits
RELOC_ARGS h
SETUP_PIC
@ -2179,11 +2179,11 @@ cglobal lpf_h_sb_y, 6, 7, 8, -16 * (39 + copy_args), \
INIT_XMM ssse3
%if ARCH_X86_64
cglobal lpf_v_sb_uv, 7, 11, 16, 3 * 16, \
cglobal lpf_v_sb_uv_8bpc, 7, 11, 16, 3 * 16, \
dst, stride, mask, l, l_stride, lut, \
w, stride3, mstride, tmp, mask_bits
%else
cglobal lpf_v_sb_uv, 6, 7, 8, -16 * (12 + copy_args), \
cglobal lpf_v_sb_uv_8bpc, 6, 7, 8, -16 * (12 + copy_args), \
dst, stride, mask, l, l_stride, lut, mask_bits
RELOC_ARGS w
SETUP_PIC
@ -2261,11 +2261,11 @@ cglobal lpf_v_sb_uv, 6, 7, 8, -16 * (12 + copy_args), \
INIT_XMM ssse3
%if ARCH_X86_64
cglobal lpf_h_sb_uv, 7, 11, 16, 16 * 3, \
cglobal lpf_h_sb_uv_8bpc, 7, 11, 16, 16 * 3, \
dst, stride, mask, l, l_stride, lut, \
h, stride3, l_stride3, tmp, mask_bits
%else
cglobal lpf_h_sb_uv, 6, 7, 8, -16 * (13 + copy_args), \
cglobal lpf_h_sb_uv_8bpc, 6, 7, 8, -16 * (13 + copy_args), \
dst, stride, mask, l, l_stride, lut, mask_bits
RELOC_ARGS h
SETUP_PIC


@ -662,7 +662,7 @@ ALIGN function_align
jl .v_loop
ret
cglobal sgr_filter_5x5_16bpc, 5, 14, 16, 400*24+16, dst, dst_stride, left, lpf, \
cglobal sgr_filter_5x5_16bpc, 5, 14, 15, 400*24+16, dst, dst_stride, left, lpf, \
lpf_stride, w, edge, params, h
movifnidn wd, wm
mov paramsq, paramsmp
@ -680,13 +680,12 @@ cglobal sgr_filter_5x5_16bpc, 5, 14, 16, 400*24+16, dst, dst_stride, left, lpf,
lea t3, [rsp+wq*2+400*12+16]
vpbroadcastd m11, [pd_0xf00800a4]
lea t4, [rsp+wq+400*20+16]
vpbroadcastd m12, [pw_256]
mova xm12, [sgr_lshuf5]
neg wq
vpbroadcastd m13, [pd_34816] ; (1 << 11) + (1 << 15)
pxor m6, m6
vpbroadcastd m14, [pw_1023]
psllw m7, 4
mova xm15, [sgr_lshuf5]
test edgeb, 4 ; LR_HAVE_TOP
jz .no_top
call .h_top
@ -786,7 +785,7 @@ cglobal sgr_filter_5x5_16bpc, 5, 14, 16, 400*24+16, dst, dst_stride, left, lpf,
jmp .h_main
.h_extend_left:
mova xm4, [lpfq+wq]
pshufb xm4, xm15
pshufb xm4, xm12
vinserti128 m4, [lpfq+wq+10], 1
jmp .h_main
.h_top:
@ -867,7 +866,7 @@ ALIGN function_align
jmp .hv_main
.hv_extend_left:
mova xm4, [lpfq+wq]
pshufb xm4, xm15
pshufb xm4, xm12
vinserti128 m4, [lpfq+wq+10], 1
jmp .hv_main
.hv_bottom:
@ -945,13 +944,12 @@ ALIGN function_align
paddusw m4, m11
paddusw m5, m11
psrad m3, m4, 20 ; min(z, 255) - 256
vpgatherdd m2, [r13+m3*4], m4
vpgatherdd m2, [r13+m3*4], m4 ; x
psrad m4, m5, 20
vpgatherdd m3, [r13+m4*4], m5
pmulld m0, m2
pmulld m1, m3
packssdw m2, m3
psubw m2, m12, m2 ; a
paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15)
paddd m1, m13
mova [t4+r10+4], m2
@ -1015,13 +1013,12 @@ ALIGN function_align
paddusw m4, m11
paddusw m5, m11
psrad m3, m4, 20 ; min(z, 255) - 256
vpgatherdd m2, [r13+m3*4], m4
vpgatherdd m2, [r13+m3*4], m4 ; x
psrad m4, m5, 20
vpgatherdd m3, [r13+m4*4], m5
pmulld m0, m2
pmulld m1, m3
packssdw m2, m3
psubw m2, m12, m2 ; a
paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15)
paddd m1, m13
mova [t4+r10+4], m2
@ -1098,15 +1095,13 @@ ALIGN function_align
pmaddwd m3, m1
vinserti128 m1, m4, xm5, 1
vperm2i128 m4, m5, 0x31
paddd m2, m1 ; a * src + b + (1 << 8)
paddd m3, m4
psrld m2, 9
psrld m3, 9
packssdw m2, m3
psllw m1, m0, 4
psubw m2, m1
pmulhrsw m2, m7
paddw m0, m2
psubd m1, m2 ; b - a * src + (1 << 8)
psubd m4, m3
psrad m1, 9
psrad m4, 9
packssdw m1, m4
pmulhrsw m1, m7
paddw m0, m1
pmaxsw m0, m6
pminsw m0, m14
mova [dstq+r10], m0
@ -1130,15 +1125,13 @@ ALIGN function_align
pmaddwd m3, m1
vinserti128 m1, m4, xm5, 1
vperm2i128 m4, m5, 0x31
paddd m2, m1 ; a * src + b + (1 <<7)
paddd m3, m4
psrld m2, 8
psrld m3, 8
packssdw m2, m3
psllw m1, m0, 4
psubw m2, m1
pmulhrsw m2, m7
paddw m0, m2
psubd m1, m2 ; b - a * src + (1 << 7)
psubd m4, m3
psrad m1, 8
psrad m4, 8
packssdw m1, m4
pmulhrsw m1, m7
paddw m0, m1
pmaxsw m0, m6
pminsw m0, m14
mova [dstq+r10], m0
@ -1147,7 +1140,7 @@ ALIGN function_align
add dstq, dst_strideq
ret
cglobal sgr_filter_3x3_16bpc, 5, 14, 15, 400*42+8, dst, dst_stride, left, lpf, \
cglobal sgr_filter_3x3_16bpc, 5, 14, 14, 400*42+8, dst, dst_stride, left, lpf, \
lpf_stride, w, edge, params, h
movifnidn wd, wm
mov paramsq, paramsmp
@ -1166,11 +1159,10 @@ cglobal sgr_filter_3x3_16bpc, 5, 14, 15, 400*42+8, dst, dst_stride, left, lpf, \
lea t4, [rsp+wq+400*32+8]
vpbroadcastd m11, [pd_34816]
neg wq
vpbroadcastd m12, [pw_256]
mova xm12, [sgr_lshuf3]
pxor m6, m6
vpbroadcastd m13, [pw_1023]
psllw m7, 4
mova xm14, [sgr_lshuf3]
test edgeb, 4 ; LR_HAVE_TOP
jz .no_top
call .h_top
@ -1268,7 +1260,7 @@ cglobal sgr_filter_3x3_16bpc, 5, 14, 15, 400*42+8, dst, dst_stride, left, lpf, \
jmp .h_main
.h_extend_left:
mova xm4, [lpfq+wq]
pshufb xm4, xm14
pshufb xm4, xm12
vinserti128 m4, [lpfq+wq+12], 1
jmp .h_main
.h_top:
@ -1318,7 +1310,7 @@ ALIGN function_align
jmp .hv0_main
.hv0_extend_left:
mova xm4, [lpfq+wq]
pshufb xm4, xm14
pshufb xm4, xm12
vinserti128 m4, [lpfq+wq+12], 1
jmp .hv0_main
.hv0_bottom:
@ -1388,7 +1380,7 @@ ALIGN function_align
paddusw m4, m10
paddusw m5, m10
psrad m3, m4, 20 ; min(z, 255) - 256
vpgatherdd m2, [r13+m3*4], m4
vpgatherdd m2, [r13+m3*4], m4 ; x
psrad m4, m5, 20
vpgatherdd m3, [r13+m4*4], m5
pmulld m0, m2
@ -1396,7 +1388,6 @@ ALIGN function_align
packssdw m2, m3
paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
paddd m1, m11
psubw m2, m12, m2
psrld m0, 12
psrld m1, 12
mova [t4+r10*1+400*0+ 4], m2
@ -1420,7 +1411,7 @@ ALIGN function_align
jmp .hv1_main
.hv1_extend_left:
mova xm4, [lpfq+wq]
pshufb xm4, xm14
pshufb xm4, xm12
vinserti128 m4, [lpfq+wq+12], 1
jmp .hv1_main
.hv1_bottom:
@ -1484,7 +1475,7 @@ ALIGN function_align
paddusw m4, m10
paddusw m5, m10
psrad m3, m4, 20 ; min(z, 255) - 256
vpgatherdd m2, [r13+m3*4], m4
vpgatherdd m2, [r13+m3*4], m4 ; x
psrad m4, m5, 20
vpgatherdd m3, [r13+m4*4], m5
pmulld m0, m2
@ -1492,7 +1483,6 @@ ALIGN function_align
packssdw m2, m3
paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
paddd m1, m11
psubw m2, m12, m2
psrld m0, 12
psrld m1, 12
mova [t4+r10*1+400*2 +4], m2
@ -1548,7 +1538,7 @@ ALIGN function_align
paddusw m4, m10
paddusw m5, m10
psrad m3, m4, 20 ; min(z, 255) - 256
vpgatherdd m2, [r13+m3*4], m4
vpgatherdd m2, [r13+m3*4], m4 ; x
psrad m4, m5, 20
vpgatherdd m3, [r13+m4*4], m5
pmulld m0, m2
@ -1556,7 +1546,6 @@ ALIGN function_align
packssdw m2, m3
paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
paddd m1, m11
psubw m2, m12, m2
psrld m0, 12
psrld m1, 12
mova [t4+r10*1+400*0+ 4], m2
@ -1606,7 +1595,7 @@ ALIGN function_align
paddusw m4, m10
paddusw m5, m10
psrad m3, m4, 20 ; min(z, 255) - 256
vpgatherdd m2, [r13+m3*4], m4
vpgatherdd m2, [r13+m3*4], m4 ; x
psrad m4, m5, 20
vpgatherdd m3, [r13+m4*4], m5
pmulld m0, m2
@ -1614,7 +1603,6 @@ ALIGN function_align
packssdw m2, m3
paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
paddd m1, m11
psubw m2, m12, m2
psrld m0, 12
psrld m1, 12
mova [t4+r10*1+400*2+ 4], m2
@ -1700,15 +1688,13 @@ ALIGN function_align
pmaddwd m3, m1
vinserti128 m1, m4, xm5, 1
vperm2i128 m4, m5, 0x31
paddd m2, m1 ; a * src + b + (1 << 8)
paddd m3, m4
psrld m2, 9
psrld m3, 9
packssdw m2, m3
psllw m1, m0, 4
psubw m2, m1
pmulhrsw m2, m7
paddw m0, m2
psubd m1, m2 ; b - a * src + (1 << 8)
psubd m4, m3
psrad m1, 9
psrad m4, 9
packssdw m1, m4
pmulhrsw m1, m7
paddw m0, m1
pmaxsw m0, m6
pminsw m0, m13
mova [dstq+r10], m0
@ -1756,15 +1742,13 @@ ALIGN function_align
pmaddwd m3, m1
vinserti128 m1, m4, xm5, 1
vperm2i128 m4, m5, 0x31
paddd m2, m1 ; a * src + b + (1 << 8)
paddd m3, m4
psrld m2, 9
psrld m3, 9
packssdw m2, m3
psllw m1, m0, 4
psubw m2, m1
pmulhrsw m2, m7
paddw m0, m2
psubd m1, m2 ; b - a * src + (1 << 8)
psubd m4, m3
psrad m1, 9
psrad m4, 9
packssdw m1, m4
pmulhrsw m1, m7
paddw m0, m1
pmaxsw m0, m6
pminsw m0, m13
mova [dstq+r10], m0
@ -1786,7 +1770,7 @@ cglobal sgr_filter_mix_16bpc, 5, 14, 16, 400*66+8, dst, dst_stride, left, lpf, \
lea t1, [rsp+wq+12]
vpbroadcastd m10, [pd_34816]
add dstq, wq
vpbroadcastd m11, [pw_256]
vpbroadcastd m11, [pd_4096]
lea t3, [rsp+wq*2+400*24+8]
vpbroadcastd m12, [pd_0xf00801c7]
lea t4, [rsp+wq+400*52+8]
@ -2048,7 +2032,7 @@ ALIGN function_align
paddusw m4, m12
paddusw m5, m12
psrad m3, m4, 20 ; min(z3, 255) - 256
vpgatherdd m2, [r13+m3*4], m4
vpgatherdd m2, [r13+m3*4], m4 ; x3
psrad m4, m5, 20
vpgatherdd m3, [r13+m4*4], m5
pmulld m0, m2
@ -2056,7 +2040,6 @@ ALIGN function_align
packssdw m2, m3
paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
paddd m1, m10
psubw m2, m11, m2
psrld m0, 12
psrld m1, 12
mova [t4+r10*1+400*2+ 4], m2
@ -2154,7 +2137,7 @@ ALIGN function_align
paddusw m2, m12
paddusw m3, m12
psrad m7, m2, 20 ; min(z3, 255) - 256
vpgatherdd m6, [r13+m7*4], m2
vpgatherdd m6, [r13+m7*4], m2 ; x3
psrad m2, m3, 20
vpgatherdd m7, [r13+m2*4], m3
pmulld m0, m6
@ -2162,7 +2145,6 @@ ALIGN function_align
pmulld m7, m1
paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
paddd m7, m10
psubw m6, m11, m6
psrld m0, 12
psrld m7, 12
paddw m1, m8, [t2+r10+400*0]
@ -2207,7 +2189,7 @@ ALIGN function_align
paddusw m2, m4
paddusw m3, m4
psrad m5, m2, 20 ; min(z5, 255) - 256
vpgatherdd m4, [r13+m5*4], m2
vpgatherdd m4, [r13+m5*4], m2 ; x5
psrad m2, m3, 20
vpgatherdd m5, [r13+m2*4], m3
pmulld m0, m4
@ -2215,7 +2197,6 @@ ALIGN function_align
packssdw m4, m5
paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
paddd m1, m10
psubw m4, m11, m4
psrld m0, 12
psrld m1, 12
mova [t4+r10*1+400*0+ 4], m4
@ -2271,7 +2252,7 @@ ALIGN function_align
paddusw m4, m12
paddusw m5, m12
psrad m3, m4, 20 ; min(z3, 255) - 256
vpgatherdd m2, [r13+m3*4], m4
vpgatherdd m2, [r13+m3*4], m4 ; x3
psrad m4, m5, 20
vpgatherdd m3, [r13+m4*4], m5
pmulld m0, m2
@ -2279,7 +2260,6 @@ ALIGN function_align
packssdw m2, m3
paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
paddd m1, m10
psubw m2, m11, m2
psrld m0, 12
psrld m1, 12
mova m3, [t1+r10+400*0]
@ -2341,7 +2321,7 @@ ALIGN function_align
paddusw m4, m12
paddusw m5, m12
psrad m3, m4, 20 ; min(z3, 255) - 256
vpgatherdd m2, [r13+m3*4], m4
vpgatherdd m2, [r13+m3*4], m4 ; x3
psrad m4, m5, 20
vpgatherdd m3, [r13+m4*4], m5
pmulld m0, m2
@ -2349,7 +2329,6 @@ ALIGN function_align
packssdw m2, m3
paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
paddd m1, m10
psubw m2, m11, m2
psrld m0, 12
psrld m8, m1, 12
mova [t4+r10*1+400*4+4], m2
@ -2396,7 +2375,7 @@ ALIGN function_align
paddusw m2, m4
paddusw m3, m4
psrad m5, m2, 20 ; min(z5, 255) - 256
vpgatherdd m4, [r13+m5*4], m2
vpgatherdd m4, [r13+m5*4], m2 ; x5
psrad m2, m3, 20
vpgatherdd m5, [r13+m2*4], m3
pmulld m0, m4
@ -2404,7 +2383,6 @@ ALIGN function_align
packssdw m4, m5
paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
paddd m1, m10
psubw m4, m11, m4
psrld m0, 12
psrld m1, 12
mova [t4+r10*1+400*0+ 4], m4
@ -2508,16 +2486,13 @@ ALIGN function_align
pmaddwd m2, m4 ; a5 * src
pmaddwd m3, m4 ; a3 * src
pslld m4, 13
psubd m0, m4
psubd m1, m4
paddd m0, m2 ; a5 * src + b5 + (1 << 8) - (src << 13)
paddd m1, m3 ; a3 * src + b3 + (1 << 8) - (src << 13)
psubd m0, m2 ; b5 - a5 * src + (1 << 8)
psubd m1, m3 ; b3 - a3 * src + (1 << 8)
psrld m0, 9
pslld m1, 7
pblendw m0, m1, 0xaa
pmaddwd m0, m15
vpbroadcastd m1, [pd_4096]
paddd m4, m1
paddd m4, m11
paddd m0, m4
psrad m0, 7
vextracti128 xm1, m0, 1
@ -2551,22 +2526,19 @@ ALIGN function_align
mova [t3+r10*2+400*20], m5
mova [t3+r10*2+400*24], m4
pmovzxwd m4, [dstq+r10]
pmovzxwd m0, [t4+r10*1+400* 6]
pmovzxwd m2, [t4+r10*1+400* 6]
pmovzxwd m3, xm3
pmaddwd m0, m4 ; a5 * src
mova m0, [t3+r10*2+400*12]
pmaddwd m2, m4 ; a5 * src
pmaddwd m3, m4 ; a3 * src
pslld m4, 12
psubd m2, m4, [t3+r10*2+400*12]
paddd m4, m4
psubd m1, m4
psubd m0, m2 ; a5 * src + b5 + (1 << 8) - (src << 13)
paddd m1, m3 ; a3 * src + b3 + (1 << 8) - (src << 13)
pslld m4, 13
psubd m0, m2 ; b5 - a5 * src + (1 << 8)
psubd m1, m3 ; b3 - a3 * src + (1 << 8)
psrld m0, 8
pslld m1, 7
pblendw m0, m1, 0xaa
pmaddwd m0, m15
vpbroadcastd m1, [pd_4096]
paddd m4, m1
paddd m4, m11
paddd m0, m4
psrad m0, 7
vextracti128 xm1, m0, 1

Diff not shown because of its large size.


@ -79,14 +79,6 @@ pd_0xf00800a4: dd 0xf00800a4
SECTION .text
%macro REPX 2-*
%xdefine %%f(x) %1
%rep %0 - 1
%rotate 1
%%f(%1)
%endrep
%endmacro
DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; ring buffer pointers
INIT_YMM avx2
@ -111,6 +103,8 @@ cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf,
add dstq, wq
vpbroadcastd m15, [fltq+20] ; y2 y3
neg wq
psllw m14, 5
psllw m15, 5
test edgeb, 4 ; LR_HAVE_TOP
jz .no_top
call .h_top
@ -357,9 +351,7 @@ ALIGN function_align
mova m3, [t3+r10*2+32]
mova m5, [t5+r10*2+32]
paddw m5, [t1+r10*2+32]
psrad m0, 11
psrad m4, 11
packssdw m0, m4
packuswb m0, m4
paddw m4, m1, [t6+r10*2+32]
mova [t0+r10*2+32], m1
punpcklwd m1, m2, m3
@ -372,9 +364,9 @@ ALIGN function_align
pmaddwd m4, m14
paddd m1, m3
paddd m2, m4
psrad m1, 11
psrad m2, 11
packssdw m1, m2
packuswb m1, m2
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1
mova [dstq+r10], m0
add r10, 32
@ -423,9 +415,10 @@ ALIGN function_align
paddd m2, m6
paddd m1, m5
paddd m3, m7
REPX {psrad x, 11}, m0, m2, m1, m3
packssdw m0, m2
packssdw m1, m3
packuswb m0, m2
packuswb m1, m3
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1
mova [dstq+r10], m0
add r10, 32
@ -459,6 +452,8 @@ cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
add dstq, wq
vpbroadcastd m15, [fltq+20] ; y2 y3
neg wq
psllw m14, 5
psllw m15, 5
test edgeb, 4 ; LR_HAVE_TOP
jz .no_top
call .h_top
@ -661,9 +656,7 @@ ALIGN function_align
mova m2, [t3+r10*2+32]
paddw m2, [t1+r10*2+32]
mova m3, [t2+r10*2+32]
psrad m0, 11
psrad m4, 11
packssdw m0, m4
packuswb m0, m4
paddw m4, m1, [t4+r10*2+32]
mova [t0+r10*2+32], m1
punpcklwd m1, m2, m3
@ -676,9 +669,9 @@ ALIGN function_align
pmaddwd m4, m14
paddd m1, m3
paddd m2, m4
psrad m1, 11
psrad m2, 11
packssdw m1, m2
packuswb m1, m2
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1
mova [dstq+r10], m0
add r10, 32
@ -720,9 +713,10 @@ ALIGN function_align
paddd m2, m6
paddd m1, m5
paddd m3, m7
REPX {psrad x, 11}, m0, m2, m1, m3
packssdw m0, m2
packssdw m1, m3
packuswb m0, m2
packuswb m1, m3
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1
mova [dstq+r10], m0
add r10, 32
@ -1003,7 +997,7 @@ ALIGN function_align
paddusw m4, m13
paddusw m5, m13
psrad m3, m4, 20 ; min(z, 255) - 256
vpgatherdd m2, [r12+m3*4], m4
vpgatherdd m2, [r12+m3*4], m4 ; x
psrad m4, m5, 20
vpgatherdd m3, [r12+m4*4], m5
pmulld m0, m2
@ -1063,7 +1057,7 @@ ALIGN function_align
paddusw m4, m13
paddusw m5, m13
psrad m3, m4, 20 ; min(z, 255) - 256
vpgatherdd m2, [r12+m3*4], m4
vpgatherdd m2, [r12+m3*4], m4 ; x
psrad m4, m5, 20
vpgatherdd m3, [r12+m4*4], m5
pmulld m0, m2
@ -1096,12 +1090,9 @@ ALIGN function_align
pslld m3, 2
paddd m2, m0 ; ab 565
paddd m3, m1
; a = 4096 - (ab & 4095) = -(ab | ~4095), so by
; using OR instead of AND for the masking we get
; the subtraction for free (with a negated result)
por m0, m15, m2 ; -a
psrld m2, 12 ; b
por m1, m15, m3
pandn m0, m15, m2 ; a
psrld m2, 12 ; b
pandn m1, m15, m3
psrld m3, 12
mova [t3+r10*4+400*4+ 0], m0
mova [t3+r10*4+400*8+ 0], m2
@ -1126,11 +1117,11 @@ ALIGN function_align
pslld m3, 2
paddd m2, m0
paddd m3, m1
por m0, m15, m2
pandn m0, m15, m2
psrld m2, 12
por m1, m15, m3
pandn m1, m15, m3
psrld m3, 12
paddd m4, m0, [t3+r10*4+400*4+ 0] ; -a
paddd m4, m0, [t3+r10*4+400*4+ 0] ; a
paddd m5, m1, [t3+r10*4+400*4+32]
mova [t3+r10*4+400*4+ 0], m0
mova [t3+r10*4+400*4+32], m1
@ -1140,16 +1131,14 @@ ALIGN function_align
mova [t3+r10*4+400*8+32], m3
pmovzxbd m2, [dstq+r10+0]
pmovzxbd m3, [dstq+r10+8]
pmaddwd m4, m2 ; -a * src
pmaddwd m4, m2 ; a * src
pmaddwd m5, m3
packssdw m2, m3
psubd m0, m4 ; a * src + b + (1 << 8)
psubd m0, m4 ; b - a * src + (1 << 8)
psubd m1, m5
psrld m0, 9
psrld m1, 9
psrad m0, 9
psrad m1, 9
packssdw m0, m1
psllw m1, m2, 4
psubw m0, m1
pmulhrsw m0, m7
paddw m0, m2
vextracti128 xm1, m0, 1
@ -1166,18 +1155,16 @@ ALIGN function_align
.n1_loop:
pmovzxbd m2, [dstq+r10+0]
pmovzxbd m3, [dstq+r10+8]
pmaddwd m4, m2, [t3+r10*4+400*4+ 0] ; -a * src
pmaddwd m4, m2, [t3+r10*4+400*4+ 0] ; a * src
pmaddwd m5, m3, [t3+r10*4+400*4+32]
mova m0, [t3+r10*4+400*8+ 0] ; b
mova m1, [t3+r10*4+400*8+32]
packssdw m2, m3
psubd m0, m4 ; a * src + b + (1 << 7)
psubd m0, m4 ; b - a * src + (1 << 7)
psubd m1, m5
psrld m0, 8
psrld m1, 8
psrad m0, 8
psrad m1, 8
packssdw m0, m1
psllw m1, m2, 4
psubw m0, m1
pmulhrsw m0, m7
paddw m0, m2
vextracti128 xm1, m0, 1
@ -1509,31 +1496,29 @@ ALIGN function_align
paddd m5, m5
psubd m5, m4
mova [t5+r10*4+32], m5
por m4, m14, m0
pandn m4, m14, m0
psrld m0, 12
paddd m3, m5
por m5, m14, m2
pandn m5, m14, m2
psrld m2, 12
paddd m4, m5 ; -a
por m5, m14, m1
paddd m4, m5 ; a
pandn m5, m14, m1
psrld m1, 12
paddd m0, m2 ; b + (1 << 8)
por m2, m14, m3
paddd m0, m2 ; b + (1 << 8)
pandn m2, m14, m3
psrld m3, 12
paddd m5, m2
pmovzxbd m2, [dstq+r10+0]
paddd m1, m3
pmovzxbd m3, [dstq+r10+8]
pmaddwd m4, m2 ; -a * src
pmaddwd m4, m2 ; a * src
pmaddwd m5, m3
packssdw m2, m3
psubd m0, m4 ; a * src + b + (1 << 8)
psubd m0, m4 ; b - a * src + (1 << 8)
psubd m1, m5
psrld m0, 9
psrld m1, 9
psrad m0, 9
psrad m1, 9
packssdw m0, m1
psllw m1, m2, 4
psubw m0, m1
pmulhrsw m0, m7
paddw m0, m2
vextracti128 xm1, m0, 1
@ -1908,7 +1893,7 @@ ALIGN function_align
vpgatherdd m2, [r12+m3*4], m6
psrad m6, m7, 20
vpgatherdd m3, [r12+m6*4], m7
vpbroadcastd m6, [base+pd_34816]
vpbroadcastd m6, [base+pd_34816] ; x3
pmulld m0, m2
vpbroadcastd m7, [base+pd_m4096]
pmulld m1, m3
@ -1918,12 +1903,12 @@ ALIGN function_align
pand m7, m1
por m0, m2 ; a3 | (b3 << 12)
por m7, m3
paddw m1, m8, [t2+r10*2+400*0]
paddd m2, m4, [t2+r10*2+400*2]
paddd m3, m5, [t2+r10*2+400*4]
paddw m1, [t1+r10*2+400*0]
paddd m2, [t1+r10*2+400*2]
paddd m3, [t1+r10*2+400*4]
paddw m1, m8, [t2+r10*2+400*0]
paddd m2, m4, [t2+r10*2+400*2]
paddd m3, m5, [t2+r10*2+400*4]
paddw m1, [t1+r10*2+400*0]
paddd m2, [t1+r10*2+400*2]
paddd m3, [t1+r10*2+400*4]
mova [t2+r10*2+400*0], m8
mova [t2+r10*2+400*2], m4
mova [t2+r10*2+400*4], m5
@ -1949,7 +1934,7 @@ ALIGN function_align
paddusw m2, m4
paddusw m3, m4
psrad m5, m2, 20 ; min(z5, 255) - 256
vpgatherdd m4, [r12+m5*4], m2
vpgatherdd m4, [r12+m5*4], m2 ; x5
psrad m2, m3, 20
vpgatherdd m5, [r12+m2*4], m3
pmulld m0, m4
@ -2006,7 +1991,7 @@ ALIGN function_align
paddusw m4, m2
paddusw m5, m2
psrad m3, m4, 20 ; min(z3, 255) - 256
vpgatherdd m2, [r12+m3*4], m4
vpgatherdd m2, [r12+m3*4], m4 ; x3
psrad m4, m5, 20
vpgatherdd m3, [r12+m4*4], m5
pmulld m0, m2
@ -2023,7 +2008,7 @@ ALIGN function_align
mova [t3+r10*4+400*8+ 8], m2
mova [t3+r10*4+400*0+ 8], m3
mova [t3+r10*4+400*0+40], m4
paddw m2, m2 ; cc5
paddw m2, m2 ; cc5
paddd m3, m3
paddd m4, m4
mova [t1+r10*2+400*0], m2
@ -2066,7 +2051,7 @@ ALIGN function_align
paddusw m4, m2
paddusw m5, m2
psrad m3, m4, 20 ; min(z3, 255) - 256
vpgatherdd m2, [r12+m3*4], m4
vpgatherdd m2, [r12+m3*4], m4 ; x3
psrad m4, m5, 20
vpgatherdd m3, [r12+m4*4], m5
vpbroadcastd m4, [base+pd_34816]
@ -2112,7 +2097,7 @@ ALIGN function_align
paddusw m2, m4
paddusw m3, m4
psrad m5, m2, 20 ; min(z5, 255) - 256
vpgatherdd m4, [r12+m5*4], m2
vpgatherdd m4, [r12+m5*4], m2 ; x5
psrad m2, m3, 20
vpgatherdd m5, [r12+m2*4], m3
pmulld m0, m4
@ -2154,7 +2139,7 @@ ALIGN function_align
paddd m3, m3 ; ab3[ 0] 222
psubd m2, m4 ; ab3[-1] 343
mova [t3+r10*4+400*20], m3
por m0, m6, m1 ; a5 565
pandn m0, m6, m1 ; a5 565
mova [t3+r10*4+400*24], m2
psrld m1, 12 ; b5 565
mova [t3+r10*4+400*12], m0
@ -2175,11 +2160,11 @@ ALIGN function_align
paddd m0, m4
pslld m4, 2
paddd m4, m0
por m0, m6, m4
pandn m0, m6, m4
psrld m4, 12
paddd m2, m0, [t3+r10*4+400*12] ; -a5
paddd m2, m0, [t3+r10*4+400*12] ; a5
mova [t3+r10*4+400*12], m0
paddd m0, m4, [t3+r10*4+400*16] ; b5 + (1 << 8)
paddd m0, m4, [t3+r10*4+400*16] ; b5 + (1 << 8)
mova [t3+r10*4+400*16], m4
mova m3, [t3+r10*4+400*4+0]
paddd m3, [t3+r10*4+400*4+8]
@ -2192,27 +2177,24 @@ ALIGN function_align
psubd m5, m3 ; ab3[ 1] 343
mova [t3+r10*4+400*24], m5
paddd m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343
por m3, m6, m1
pandn m3, m6, m1
psrld m1, 12
por m5, m6, m4
pandn m5, m6, m4
psrld m4, 12
paddd m3, m5 ; -a3
paddd m1, m4 ; b3 + (1 << 8)
paddd m3, m5 ; a3
paddd m1, m4 ; b3 + (1 << 8)
pmovzxbd m4, [dstq+r10]
pmaddwd m2, m4 ; -a5 * src
pmaddwd m3, m4 ; -a3 * src
pslld m4, 13
psubd m0, m4
psubd m1, m4
psubd m0, m2 ; a5 * src + b5 + (1 << 8)
psubd m1, m3 ; a3 * src + b3 + (1 << 8)
pmaddwd m2, m4 ; a5 * src
pmaddwd m3, m4 ; a3 * src
psubd m0, m2 ; b5 - a5 * src + (1 << 8)
psubd m1, m3 ; b3 - a3 * src + (1 << 8)
psrld m0, 9
pslld m1, 7
pblendw m0, m1, 0xaa
pmaddwd m0, m15
psubd m4, m6
paddd m0, m4
psubd m0, m6
psrad m0, 13
paddd m0, m4
vextracti128 xm1, m0, 1
packssdw xm0, xm1
packuswb xm0, xm0
@ -2236,9 +2218,9 @@ ALIGN function_align
psubd m5, m3 ; ab3[ 1] 343
mova [t3+r10*4+400*28], m5
paddd m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343
por m3, m6, m1
pandn m3, m6, m1
psrld m1, 12
por m5, m6, m4
pandn m5, m6, m4
psrld m4, 12
paddd m3, m5 ; -a3
paddd m1, m4 ; b3 + (1 << 8)
@ -2246,19 +2228,15 @@ ALIGN function_align
pmaddwd m2, m4, [t3+r10*4+400*12] ; -a5 * src
mova m0, [t3+r10*4+400*16] ; b5 + (1 << 7)
pmaddwd m3, m4 ; -a3 * src
pslld m4, 12
psubd m0, m4
paddd m4, m4
psubd m1, m4
psubd m0, m2 ; a5 * src + b5 + (1 << 7)
psubd m1, m3 ; a3 * src + b3 + (1 << 8)
psrld m0, 8
pslld m1, 7
pblendw m0, m1, 0xaa
pmaddwd m0, m15
psubd m4, m6
paddd m0, m4
psubd m0, m6
psrad m0, 13
paddd m0, m4
vextracti128 xm1, m0, 1
packssdw xm0, xm1
packuswb xm0, xm0

View file

@ -39,152 +39,12 @@ decl_lr_filter_fn(BF(dav1d_sgr_filter_5x5, ext)); \
decl_lr_filter_fn(BF(dav1d_sgr_filter_3x3, ext)); \
decl_lr_filter_fn(BF(dav1d_sgr_filter_mix, ext))
/* FIXME: Replace with a port of the AVX2 code */
#define SGR_FILTER_OLD(ext) \
void BF(dav1d_sgr_box3_h, ext)(int32_t *sumsq, int16_t *sum, \
const pixel (*left)[4], \
const pixel *src, const ptrdiff_t stride, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void BF(dav1d_sgr_box3_v, ext)(int32_t *sumsq, int16_t *sum, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void BF(dav1d_sgr_calc_ab1, ext)(int32_t *a, int16_t *b, \
const int w, const int h, const unsigned s); \
void BF(dav1d_sgr_finish_filter1, ext)(int16_t *tmp, \
const pixel *src, const ptrdiff_t stride, \
const int32_t *a, const int16_t *b, \
const int w, const int h); \
\
/* filter with a 3x3 box (radius=1) */ \
static void BF(dav1d_sgr_filter1, ext)(int16_t *tmp, \
const pixel *src, const ptrdiff_t stride, \
const pixel (*left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, const int strength, \
const enum LrEdgeFlags edges) \
{ \
ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \
int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \
ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \
int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \
\
BF(dav1d_sgr_box3_h, ext)(sumsq, sum, left, src, stride, w, h, edges); \
if (edges & LR_HAVE_TOP) \
BF(dav1d_sgr_box3_h, ext)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
NULL, lpf, lpf_stride, w, 2, edges); \
\
if (edges & LR_HAVE_BOTTOM) \
BF(dav1d_sgr_box3_h, ext)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
lpf_stride, w, 2, edges); \
\
BF(dav1d_sgr_box3_v, ext)(sumsq, sum, w, h, edges); \
BF(dav1d_sgr_calc_ab1, ext)(a, b, w, h, strength); \
BF(dav1d_sgr_finish_filter1, ext)(tmp, src, stride, a, b, w, h); \
} \
\
void BF(dav1d_sgr_box5_h, ext)(int32_t *sumsq, int16_t *sum, \
const pixel (*left)[4], \
const pixel *src, const ptrdiff_t stride, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void BF(dav1d_sgr_box5_v, ext)(int32_t *sumsq, int16_t *sum, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void BF(dav1d_sgr_calc_ab2, ext)(int32_t *a, int16_t *b, \
const int w, const int h, const int strength); \
void BF(dav1d_sgr_finish_filter2, ext)(int16_t *tmp, \
const pixel *src, const ptrdiff_t stride, \
const int32_t *a, const int16_t *b, \
const int w, const int h); \
\
/* filter with a 5x5 box (radius=2) */ \
static void BF(dav1d_sgr_filter2, ext)(int16_t *tmp, \
const pixel *src, const ptrdiff_t stride, \
const pixel (*left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, const int strength, \
const enum LrEdgeFlags edges) \
{ \
ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \
int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \
ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \
int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \
\
BF(dav1d_sgr_box5_h, ext)(sumsq, sum, left, src, stride, w, h, edges); \
if (edges & LR_HAVE_TOP) \
BF(dav1d_sgr_box5_h, ext)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
NULL, lpf, lpf_stride, w, 2, edges); \
\
if (edges & LR_HAVE_BOTTOM) \
BF(dav1d_sgr_box5_h, ext)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
lpf_stride, w, 2, edges); \
\
BF(dav1d_sgr_box5_v, ext)(sumsq, sum, w, h, edges); \
BF(dav1d_sgr_calc_ab2, ext)(a, b, w, h, strength); \
BF(dav1d_sgr_finish_filter2, ext)(tmp, src, stride, a, b, w, h); \
} \
\
void BF(dav1d_sgr_weighted1, ext)(pixel *dst, const ptrdiff_t stride, \
const int16_t *t1, const int w, const int h, \
const int wt); \
void BF(dav1d_sgr_weighted2, ext)(pixel *dst, const ptrdiff_t stride, \
const int16_t *t1, const int16_t *t2, \
const int w, const int h, \
const uint32_t wt); \
\
static void BF(sgr_filter_5x5, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
const pixel (*const left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, \
const LooprestorationParams *const params, \
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) \
{ \
ALIGN_STK_32(int16_t, tmp, 64 * 384,); \
BF(dav1d_sgr_filter2, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \
w, h, params->sgr.s0, edges); \
BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, params->sgr.w0); \
} \
static void BF(sgr_filter_3x3, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
const pixel (*const left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, \
const LooprestorationParams *const params, \
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) \
{ \
ALIGN_STK_32(int16_t, tmp, 64 * 384,); \
BF(dav1d_sgr_filter1, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \
w, h, params->sgr.s1, edges); \
BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, params->sgr.w1); \
} \
static void BF(sgr_filter_mix, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
const pixel (*const left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, \
const LooprestorationParams *const params, \
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) \
{ \
ALIGN_STK_32(int16_t, tmp1, 64 * 384,); \
ALIGN_STK_32(int16_t, tmp2, 64 * 384,); \
BF(dav1d_sgr_filter2, ext)(tmp1, dst, dst_stride, left, lpf, lpf_stride, \
w, h, params->sgr.s0, edges); \
BF(dav1d_sgr_filter1, ext)(tmp2, dst, dst_stride, left, lpf, lpf_stride, \
w, h, params->sgr.s1, edges); \
const uint32_t wt = (params->sgr.w1 << 16) | (uint16_t) params->sgr.w0; \
BF(dav1d_sgr_weighted2, ext)(dst, dst_stride, tmp1, tmp2, w, h, wt); \
}
decl_wiener_filter_fns(sse2);
decl_wiener_filter_fns(ssse3);
decl_wiener_filter_fns(avx2);
decl_sgr_filter_fns(ssse3);
decl_sgr_filter_fns(avx2);
#if BITDEPTH == 8
SGR_FILTER_OLD(ssse3)
#endif
COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c,
const int bpc)
{
@ -199,11 +59,11 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPCont
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
c->wiener[0] = BF(dav1d_wiener_filter7, ssse3);
c->wiener[1] = BF(dav1d_wiener_filter5, ssse3);
#if BITDEPTH == 8
c->sgr[0] = BF(sgr_filter_5x5, ssse3);
c->sgr[1] = BF(sgr_filter_3x3, ssse3);
c->sgr[2] = BF(sgr_filter_mix, ssse3);
#endif
if (bpc <= 10) {
c->sgr[0] = BF(dav1d_sgr_filter_5x5, ssse3);
c->sgr[1] = BF(dav1d_sgr_filter_3x3, ssse3);
c->sgr[2] = BF(dav1d_sgr_filter_mix, ssse3);
}
#if ARCH_X86_64
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;

Diff not shown because of its large size.

1964 third_party/dav1d/src/x86/mc16_avx2.asm vendored

Diff not shown because of its large size.

636 third_party/dav1d/src/x86/mc16_sse.asm vendored
View file

@ -41,6 +41,9 @@ blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
spel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
rescale_mul: dd 0, 1, 2, 3
resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7
db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
pw_2: times 8 dw 2
pw_16: times 4 dw 16
@ -54,6 +57,8 @@ pw_8192: times 8 dw 8192
pw_27615: times 8 dw 27615
pw_32766: times 8 dw 32766
pw_m512: times 8 dw -512
pd_63: times 4 dd 63
pd_64: times 4 dd 64
pd_512: times 4 dd 512
pd_65538: times 2 dd 65538
@ -65,6 +70,12 @@ put_8tap_h_rnd: dd 34, 34, 40, 40
prep_8tap_1d_rnd: times 2 dd 8 - (8192 << 4)
prep_8tap_2d_rnd: times 4 dd 32 - (8192 << 5)
warp8x8_shift: dd 11, 13
warp8x8_rnd1: dd 1024, 1024, 4096, 4096
warp8x8_rnd2: times 4 dw 4096
times 4 dw 16384
warp8x8t_rnd: times 2 dd 16384 - (8192 << 15)
%macro BIDIR_JMP_TABLE 2-*
%xdefine %1_%2_table (%%table - 2*%3)
%xdefine %%base %1_%2_table
@ -105,6 +116,9 @@ BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128
cextern mc_subpel_filters
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
cextern mc_warp_filter
cextern resize_filter
SECTION .text
%macro REPX 2-*
@ -2526,6 +2540,398 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my
RET
%undef tmp
%if ARCH_X86_64
; warp8x8t spills one less xmm register than warp8x8 on WIN64, compensate that
; by allocating 16 bytes more stack space so that stack offsets match up.
%if WIN64 && STACK_ALIGNMENT == 16
%assign stksz 16*14
%else
%assign stksz 16*13
%endif
cglobal warp_affine_8x8t_16bpc, 4, 13, 9, stksz, dst, ds, src, ss, delta, \
mx, tmp, alpha, beta, \
filter, my, gamma, cnt
%assign stack_size_padded_8x8t stack_size_padded
%else
cglobal warp_affine_8x8t_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \
filter, mx, my
%define m8 [esp+16*13]
%define m9 [esp+16*14]
%define cntd dword [esp+4*63]
%define dstq tmpq
%define dsq 0
%if STACK_ALIGNMENT < 16
%define dstm [esp+4*65]
%define dsm [esp+4*66]
%else
%define dstm r0m
%define dsm r1m
%endif
%endif
%define base filterq-$$
mov t0d, r7m
LEA filterq, $$
shr t0d, 11
%if ARCH_X86_64
movddup m8, [base+warp8x8t_rnd]
%else
movddup m1, [base+warp8x8t_rnd]
mov r1, r1m
add r1, r1
mova m8, m1
mov r1m, r1 ; ds *= 2
%endif
call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main
jmp .start
.loop:
%if ARCH_X86_64
lea dstq, [dstq+dsq*4]
%else
add dstq, dsm
mov dstm, dstq
%endif
call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main2
.start:
%if ARCH_X86_32
mov dstq, dstm
%endif
paddd m1, m8
paddd m2, m8
psrad m1, 15
psrad m2, 15
packssdw m1, m2
mova [dstq+dsq*0], m1
call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main3
%if ARCH_X86_32
mov dstq, dstm
add dstq, dsm
%endif
paddd m1, m8
paddd m2, m8
psrad m1, 15
psrad m2, 15
packssdw m1, m2
mova [dstq+dsq*2], m1
dec cntd
jg .loop
RET
%if ARCH_X86_64
cglobal warp_affine_8x8_16bpc, 4, 13, 10, 16*13, dst, ds, src, ss, delta, \
mx, tmp, alpha, beta, \
filter, my, gamma, cnt
ASSERT stack_size_padded == stack_size_padded_8x8t
%else
cglobal warp_affine_8x8_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \
filter, mx, my
%endif
mov t0d, r7m
LEA filterq, $$
shr t0d, 11
%if ARCH_X86_64
movddup m8, [base+warp8x8_rnd2+t0*8]
movd m9, r7m ; pixel_max
pshufb m9, [base+pw_256]
%else
movddup m1, [base+warp8x8_rnd2+t0*8]
movd m2, r7m ; pixel_max
pshufb m2, [base+pw_256]
mova m8, m1
mova m9, m2
%endif
call .main
jmp .start
.loop:
%if ARCH_X86_64
lea dstq, [dstq+dsq*2]
%else
add dstq, dsm
mov dstm, dstq
%endif
call .main2
.start:
%if ARCH_X86_32
mov dstq, dstm
%endif
psrad m1, 16
psrad m2, 16
packssdw m1, m2
pmaxsw m1, m6
pmulhrsw m1, m8
pminsw m1, m9
mova [dstq+dsq*0], m1
call .main3
%if ARCH_X86_32
mov dstq, dstm
add dstq, dsm
%endif
psrad m1, 16
psrad m2, 16
packssdw m1, m2
pmaxsw m1, m6
pmulhrsw m1, m8
pminsw m1, m9
mova [dstq+dsq*1], m1
dec cntd
jg .loop
RET
ALIGN function_align
.main:
; Stack args offset by one (r4m -> r5m etc.) due to call
%if WIN64
mov deltaq, r5m
mov mxd, r6m
%endif
movd m0, [base+warp8x8_shift+t0*4]
movddup m7, [base+warp8x8_rnd1+t0*8]
add filterq, mc_warp_filter-$$
%if ARCH_X86_64
movsx alphad, word [deltaq+2*0]
movsx betad, word [deltaq+2*1]
movsx gammad, word [deltaq+2*2]
movsx deltad, word [deltaq+2*3]
lea tmpq, [ssq*3]
add mxd, 512+(64<<10)
sub srcq, tmpq ; src -= ss*3
imul tmpd, alphad, -7
mov myd, r7m
add betad, tmpd ; beta -= alpha*7
imul tmpd, gammad, -7
add myd, 512+(64<<10)
mov cntd, 4
add deltad, tmpd ; delta -= gamma*7
%else
%if STACK_ALIGNMENT < 16
%assign stack_offset stack_offset - gprsize
%endif
mov r3d, r5m ; abcd
%if STACK_ALIGNMENT < 16
mov r0, r1m ; dst
mov r1, r2m ; ds
mov [esp+gprsize+4*65], r0
mov [esp+gprsize+4*66], r1
%endif
movsx alphad, word [r3+2*0]
movsx r2d, word [r3+2*1]
movsx gammad, word [r3+2*2]
movsx r3d, word [r3+2*3]
imul r5d, alphad, -7
add r2d, r5d ; beta -= alpha*7
imul r5d, gammad, -7
mov [esp+gprsize+4*60], r2d
add r3d, r5d ; delta -= gamma*7
mov [esp+gprsize+4*61], r3d
mov r3d, r4m ; ss
mov srcq, r3m
mov mxd, r6m
mov myd, r7m
mov dword [esp+gprsize+4*63], 4 ; cnt
mov [esp+gprsize+4*62], r3
lea r3, [r3*3]
add mxd, 512+(64<<10)
add myd, 512+(64<<10)
sub srcq, r3 ; src -= ss*3
%if STACK_ALIGNMENT < 16
%assign stack_offset stack_offset + gprsize
%endif
%endif
mova [rsp+gprsize], m0
pxor m6, m6
call .h
mova m5, m0
call .h
punpcklwd m1, m5, m0 ; 01
punpckhwd m5, m0
mova [rsp+gprsize+16* 1], m1
mova [rsp+gprsize+16* 4], m5
mova m5, m0
call .h
punpcklwd m1, m5, m0 ; 12
punpckhwd m5, m0
mova [rsp+gprsize+16* 7], m1
mova [rsp+gprsize+16*10], m5
mova m5, m0
call .h
punpcklwd m1, m5, m0 ; 23
punpckhwd m5, m0
mova [rsp+gprsize+16* 2], m1
mova [rsp+gprsize+16* 5], m5
mova m5, m0
call .h
punpcklwd m1, m5, m0 ; 34
punpckhwd m5, m0
mova [rsp+gprsize+16* 8], m1
mova [rsp+gprsize+16*11], m5
mova m5, m0
call .h
punpcklwd m1, m5, m0 ; 45
punpckhwd m5, m0
mova [rsp+gprsize+16* 3], m1
mova [rsp+gprsize+16* 6], m5
mova m5, m0
call .h
punpcklwd m1, m5, m0 ; 56
punpckhwd m5, m0
mova [rsp+gprsize+16* 9], m1
mova [rsp+gprsize+16*12], m5
mova m5, m0
.main2:
call .h
%macro WARP_V 6 ; 01l, 23l, 45l, 01h, 23h, 45h
lea tmpd, [myq+gammaq]
shr myd, 10
movq m4, [filterq+myq*8] ; a
lea myd, [tmpq+gammaq]
shr tmpd, 10
movq m2, [filterq+tmpq*8] ; b
lea tmpd, [myq+gammaq]
shr myd, 10
movq m3, [filterq+myq*8] ; c
lea myd, [tmpq+gammaq]
shr tmpd, 10
movq m1, [filterq+tmpq*8] ; d
lea tmpd, [myq+gammaq]
shr myd, 10
punpcklwd m4, m2
punpcklwd m3, m1
punpckldq m2, m4, m3
punpckhdq m4, m3
punpcklbw m1, m6, m2 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8
pmaddwd m1, [rsp+gprsize+16*%1]
punpckhbw m3, m6, m2 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8
mova m2, [rsp+gprsize+16*%2]
pmaddwd m3, m2
mova [rsp+gprsize+16*%1], m2
paddd m1, m3
punpcklbw m3, m6, m4 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8
mova m2, [rsp+gprsize+16*%3]
pmaddwd m3, m2
mova [rsp+gprsize+16*%2], m2
paddd m1, m3
punpcklwd m3, m5, m0 ; 67
punpckhbw m2, m6, m4 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8
pmaddwd m2, m3
mova [rsp+gprsize+16*%3], m3
paddd m1, m2
movq m4, [filterq+myq*8] ; e
lea myd, [tmpq+gammaq]
shr tmpd, 10
movq m3, [filterq+tmpq*8] ; f
lea tmpd, [myq+gammaq]
shr myd, 10
movq m2, [filterq+myq*8] ; g
%if ARCH_X86_64
lea myd, [tmpq+deltaq] ; my += delta
%else
mov myd, [esp+gprsize+4*61]
add myd, tmpd
%endif
shr tmpd, 10
punpcklwd m4, m3
movq m3, [filterq+tmpq*8] ; h
punpcklwd m2, m3
punpckldq m3, m4, m2
punpckhdq m4, m2
punpcklbw m2, m6, m3 ; e0 e1 f0 f1 g0 g1 h0 h1 << 8
pmaddwd m2, [rsp+gprsize+16*%4]
punpckhbw m6, m3 ; e2 e3 f2 f3 g2 g3 h2 h3 << 8
mova m3, [rsp+gprsize+16*%5]
pmaddwd m6, m3
mova [rsp+gprsize+16*%4], m3
pxor m3, m3
paddd m2, m6
punpcklbw m3, m4 ; e4 e5 f4 f5 g4 g5 h4 h5 << 8
mova m6, [rsp+gprsize+16*%6]
pmaddwd m3, m6
mova [rsp+gprsize+16*%5], m6
punpckhwd m5, m0
pxor m6, m6
paddd m2, m3
punpckhbw m3, m6, m4 ; e6 e7 f6 f7 g6 g7 h6 h7 << 8
pmaddwd m3, m5
mova [rsp+gprsize+16*%6], m5
mova m5, m0
paddd m2, m3
%endmacro
WARP_V 1, 2, 3, 4, 5, 6
ret
.main3:
call .h
WARP_V 7, 8, 9, 10, 11, 12
ret
ALIGN function_align
.h:
lea tmpd, [mxq+alphaq]
shr mxd, 10
movq m3, [filterq+mxq*8]
punpcklbw m0, m6, m3
movu m3, [srcq-6]
pmaddwd m0, m3 ; 0
lea mxd, [tmpq+alphaq]
shr tmpd, 10
movq m3, [filterq+tmpq*8]
punpcklbw m2, m6, m3
movu m3, [srcq-4]
pmaddwd m2, m3 ; 1
lea tmpd, [mxq+alphaq]
shr mxd, 10
movq m3, [filterq+mxq*8]
phaddd m0, m2 ; 0 1
punpcklbw m2, m6, m3
movu m3, [srcq-2]
pmaddwd m2, m3 ; 2
lea mxd, [tmpq+alphaq]
shr tmpd, 10
movq m3, [filterq+tmpq*8]
punpcklbw m1, m6, m3
movu m3, [srcq+0]
pmaddwd m1, m3 ; 3
lea tmpd, [mxq+alphaq]
shr mxd, 10
movq m3, [filterq+mxq*8]
phaddd m2, m1 ; 2 3
punpcklbw m1, m6, m3
movu m3, [srcq+2]
pmaddwd m1, m3 ; 4
lea mxd, [tmpq+alphaq]
shr tmpd, 10
movq m3, [filterq+tmpq*8]
phaddd m0, m2 ; 0 1 2 3
punpcklbw m2, m6, m3
movu m3, [srcq+4]
pmaddwd m2, m3 ; 5
lea tmpd, [mxq+alphaq]
shr mxd, 10
movq m3, [filterq+mxq*8]
phaddd m1, m2 ; 4 5
punpcklbw m2, m6, m3
movu m3, [srcq+6]
pmaddwd m2, m3 ; 6
%if ARCH_X86_64
lea mxd, [tmpq+betaq] ; mx += beta
%else
mov mxd, [esp+gprsize*2+4*60]
add mxd, tmpd
%endif
shr tmpd, 10
movq m3, [filterq+tmpq*8]
punpcklbw m4, m6, m3
movu m3, [srcq+8]
%if ARCH_X86_64
add srcq, ssq
%else
add srcq, [esp+gprsize*2+4*62]
%endif
pmaddwd m3, m4 ; 7
phaddd m2, m3 ; 6 7
phaddd m1, m2 ; 4 5 6 7
paddd m0, m7
paddd m1, m7
psrad m0, [rsp+gprsize*2]
psrad m1, [rsp+gprsize*2]
packssdw m0, m1
ret
%macro BIDIR_FN 0
call .main
jmp wq
@ -4142,3 +4548,233 @@ cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, \
%undef reg_dstride
%undef reg_blkm
%undef reg_tmp
%macro SCRATCH 3
%if ARCH_X86_32
mova [rsp+%3*mmsize], m%1
%define m%2 [rsp+%3*mmsize]
%else
SWAP %1, %2
%endif
%endmacro
%if ARCH_X86_64
cglobal resize_16bpc, 0, 12, 16, 1*16, dst, dst_stride, src, src_stride, \
dst_w, h, src_w, dx, mx0, pxmax
%elif STACK_ALIGNMENT >= 16
cglobal resize_16bpc, 0, 7, 8, 6*16, dst, dst_stride, src, src_stride, \
dst_w, h, src_w, dx, mx0, pxmax
%else
cglobal resize_16bpc, 0, 6, 8, 6*16, dst, dst_stride, src, src_stride, \
dst_w, h, src_w, dx, mx0, pxmax
%endif
movifnidn dstq, dstmp
movifnidn srcq, srcmp
%if STACK_ALIGNMENT >= 16
movifnidn dst_wd, dst_wm
%endif
%if ARCH_X86_64
movifnidn hd, hm
%endif
sub dword mx0m, 4<<14
sub dword src_wm, 8
movd m4, pxmaxm
movd m7, dxm
movd m6, mx0m
movd m5, src_wm
punpcklwd m4, m4
pshufd m4, m4, q0000
pshufd m7, m7, q0000
pshufd m6, m6, q0000
pshufd m5, m5, q0000
mova [rsp+16*3*ARCH_X86_32], m4
%if ARCH_X86_64
DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr
LEA r7, $$
%define base r7-$$
%else
DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x
%define hd dword r5m
%if STACK_ALIGNMENT >= 16
LEA r6, $$
%define base r6-$$
%else
LEA r4, $$
%define base r4-$$
%endif
%endif
%if ARCH_X86_64
mova m12, [base+pd_64]
mova m11, [base+pd_63]
%else
%define m12 [base+pd_64]
%define m11 [base+pd_63]
%endif
pmaddwd m4, m7, [base+rescale_mul] ; dx*[0,1,2,3]
pslld m7, 2 ; dx*4
pslld m5, 14
paddd m6, m4 ; mx+[0..3]*dx
SCRATCH 7, 15, 0
SCRATCH 6, 14, 1
SCRATCH 5, 13, 2
pxor m1, m1
.loop_y:
xor xd, xd
mova m0, m14 ; per-line working version of mx
.loop_x:
pcmpgtd m1, m0
pandn m1, m0
psrad m2, m0, 8 ; filter offset (unmasked)
pcmpgtd m3, m13, m1
pand m1, m3
pandn m3, m13
por m1, m3
psubd m3, m0, m1 ; pshufb offset
psrad m1, 14 ; clipped src_x offset
psrad m3, 14 ; pshufb edge_emu offset
pand m2, m11 ; filter offset (masked)
; load source pixels
%if ARCH_X86_64
movd r8d, m1
pshuflw m1, m1, q3232
movd r9d, m1
punpckhqdq m1, m1
movd r10d, m1
psrlq m1, 32
movd r11d, m1
movu m4, [srcq+r8*2]
movu m5, [srcq+r9*2]
movu m6, [srcq+r10*2]
movu m7, [srcq+r11*2]
; if no emulation is required, we don't need to shuffle or emulate edges
packssdw m3, m3
movq r11, m3
test r11, r11
jz .filter
movsx r8, r11w
sar r11, 16
movsx r9, r11w
sar r11, 16
movsx r10, r11w
sar r11, 16
movu m1, [base+resize_shuf+8+r8*2]
movu m3, [base+resize_shuf+8+r9*2]
movu m8, [base+resize_shuf+8+r10*2]
movu m9, [base+resize_shuf+8+r11*2]
pshufb m4, m1
pshufb m5, m3
pshufb m6, m8
pshufb m7, m9
.filter:
movd r8d, m2
pshuflw m2, m2, q3232
movd r9d, m2
punpckhqdq m2, m2
movd r10d, m2
psrlq m2, 32
movd r11d, m2
movq m8, [base+resize_filter+r8*8]
movq m2, [base+resize_filter+r9*8]
pxor m9, m9
punpcklbw m1, m9, m8
punpcklbw m3, m9, m2
psraw m1, 8
psraw m3, 8
movq m10, [base+resize_filter+r10*8]
movq m2, [base+resize_filter+r11*8]
punpcklbw m8, m9, m10
punpcklbw m9, m2
psraw m8, 8
psraw m9, 8
pmaddwd m4, m1
pmaddwd m5, m3
pmaddwd m6, m8
pmaddwd m7, m9
phaddd m4, m5
%else
movd r3, m1
pshuflw m1, m1, q3232
movd r1, m1
punpckhqdq m1, m1
movu m4, [srcq+r3*2]
movu m5, [srcq+r1*2]
movd r3, m1
psrlq m1, 32
movd r1, m1
movu m6, [srcq+r3*2]
movu m7, [srcq+r1*2]
; if no emulation is required, we don't need to shuffle or emulate edges
pxor m1, m1
pcmpeqb m1, m3
pmovmskb r3d, m1
cmp r3d, 0xffff
je .filter
movd r3, m3
movu m1, [base+resize_shuf+8+r3*2]
pshuflw m3, m3, q3232
movd r1, m3
pshufb m4, m1
movu m1, [base+resize_shuf+8+r1*2]
punpckhqdq m3, m3
movd r3, m3
pshufb m5, m1
movu m1, [base+resize_shuf+8+r3*2]
psrlq m3, 32
movd r1, m3
pshufb m6, m1
movu m1, [base+resize_shuf+8+r1*2]
pshufb m7, m1
.filter:
mova [esp+4*16], m6
mova [esp+5*16], m7
movd r3, m2
pshuflw m2, m2, q3232
movd r1, m2
movq m6, [base+resize_filter+r3*8]
movq m7, [base+resize_filter+r1*8]
pxor m3, m3
punpcklbw m1, m3, m6
punpcklbw m3, m7
psraw m1, 8
psraw m3, 8
pmaddwd m4, m1
pmaddwd m5, m3
punpckhqdq m2, m2
movd r3, m2
psrlq m2, 32
movd r1, m2
phaddd m4, m5
movq m2, [base+resize_filter+r3*8]
movq m5, [base+resize_filter+r1*8]
mova m6, [esp+4*16]
mova m7, [esp+5*16]
pxor m3, m3
punpcklbw m1, m3, m2
punpcklbw m3, m5
psraw m1, 8
psraw m3, 8
pmaddwd m6, m1
pmaddwd m7, m3
%endif
phaddd m6, m7
phaddd m4, m6
pxor m1, m1
psubd m2, m12, m4
psrad m2, 7
packssdw m2, m2
pmaxsw m2, m1
pminsw m2, [rsp+16*3*ARCH_X86_32]
movq [dstq+xq*2], m2
paddd m0, m15
add xd, 4
%if STACK_ALIGNMENT >= 16
cmp xd, dst_wd
%else
cmp xd, dst_wm
%endif
jl .loop_x
add dstq, dst_stridemp
add srcq, src_stridemp
dec hd
jg .loop_y
RET

212 third_party/dav1d/src/x86/mc_avx2.asm vendored
View file

@ -1,5 +1,5 @@
; Copyright © 2018-2020, VideoLAN and dav1d authors
; Copyright © 2018-2020, Two Orioles, LLC
; Copyright © 2018-2021, VideoLAN and dav1d authors
; Copyright © 2018-2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
@ -69,7 +69,6 @@ bdct_lb_dw: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 1
wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7
resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7
db 7, 7, 7, 7, 7, 7, 7, 7
wm_420_sign: dd 0x01020102, 0x01010101
wm_422_sign: dd 0x80808080, 0x7f7f7f7f
@ -110,7 +109,7 @@ cextern resize_filter
%endmacro
%macro HV_JMP_TABLE 5-*
%xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3)
%xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
%xdefine %%base %1_%3
%assign %%types %4
%if %%types & 1
@ -141,68 +140,68 @@ cextern resize_filter
%endif
%endmacro
%macro BIDIR_JMP_TABLE 1-*
%xdefine %1_table (%%table - 2*%2)
%xdefine %%base %1_table
%xdefine %%prefix mangle(private_prefix %+ _%1)
%macro BIDIR_JMP_TABLE 2-*
%xdefine %1_%2_table (%%table - 2*%3)
%xdefine %%base %1_%2_table
%xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
%%table:
%rep %0 - 1
dd %%prefix %+ .w%2 - %%base
%rep %0 - 2
dd %%prefix %+ .w%3 - %%base
%rotate 1
%endrep
%endmacro
%macro SCALED_JMP_TABLE 1-*
%xdefine %1_table (%%table - %2)
%xdefine %%base mangle(private_prefix %+ _%1)
%macro SCALED_JMP_TABLE 2-*
%xdefine %1_%2_table (%%table - %3)
%xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
%%table:
%rep %0 - 1
dw %%base %+ .w%2 - %%base
%rep %0 - 2
dw %%base %+ .w%3 - %%base
%rotate 1
%endrep
%rotate 1
%rotate 2
%%dy_1024:
%xdefine %1_dy1_table (%%dy_1024 - %2)
%rep %0 - 1
dw %%base %+ .dy1_w%2 - %%base
%xdefine %1_%2_dy1_table (%%dy_1024 - %3)
%rep %0 - 2
dw %%base %+ .dy1_w%3 - %%base
%rotate 1
%endrep
%rotate 1
%rotate 2
%%dy_2048:
%xdefine %1_dy2_table (%%dy_2048 - %2)
%rep %0 - 1
dw %%base %+ .dy2_w%2 - %%base
%xdefine %1_%2_dy2_table (%%dy_2048 - %3)
%rep %0 - 2
dw %%base %+ .dy2_w%3 - %%base
%rotate 1
%endrep
%endmacro
%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_avx2.put)
%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_avx2.prep)
%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_8bpc_avx2.put)
%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_8bpc_avx2.prep)
%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE put_8tap_scaled_avx2, 2, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE prep_8tap_scaled_avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE avg_avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg_avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask_avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420_avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422_avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444_avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend_avx2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_v_avx2, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h_avx2, 2, 4, 8, 16, 32, 32, 32
BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE put_8tap_scaled, avx2, 2, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE prep_8tap_scaled, avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE avg, avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg, avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask, avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend, avx2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_v, avx2, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h, avx2, 2, 4, 8, 16, 32, 32, 32
SECTION .text
INIT_XMM avx2
cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
movifnidn mxyd, r6m ; mx
lea r7, [put_avx2]
tzcnt wd, wm
@ -769,7 +768,7 @@ INIT_YMM avx2
%endif
RET
cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
movifnidn mxyd, r5m ; mx
lea r6, [prep%+SUFFIX]
tzcnt wd, wm
@ -1439,7 +1438,7 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
%assign FILTER_SHARP (2*15 << 16) | 3*15
%macro FN 4 ; fn, type, type_h, type_v
cglobal %1_%2
cglobal %1_%2_8bpc
mov t0d, FILTER_%3
%ifidn %3, %4
mov t1d, t0d
@ -1447,7 +1446,7 @@ cglobal %1_%2
mov t1d, FILTER_%4
%endif
%ifnidn %2, regular ; skip the jump in the last filter
jmp mangle(private_prefix %+ _%1 %+ SUFFIX)
jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX)
%endif
%endmacro
@ -1458,7 +1457,6 @@ DECLARE_REG_TMP 7, 8
%endif
%define PUT_8TAP_FN FN put_8tap,
PUT_8TAP_FN sharp, SHARP, SHARP
PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH
PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
@ -1469,7 +1467,7 @@ PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
PUT_8TAP_FN regular, REGULAR, REGULAR
cglobal put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
imul mxd, mxm, 0x010101
add mxd, t0d ; 8tap_h, mx, 4tap_h
imul myd, mym, 0x010101
@ -2124,7 +2122,6 @@ DECLARE_REG_TMP 6, 7
%endif
%define PREP_8TAP_FN FN prep_8tap,
PREP_8TAP_FN sharp, SHARP, SHARP
PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
@ -2135,7 +2132,7 @@ PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
PREP_8TAP_FN regular, REGULAR, REGULAR
cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
imul mxd, mxm, 0x010101
add mxd, t0d ; 8tap_h, mx, 4tap_h
imul myd, mym, 0x010101
@ -2725,26 +2722,26 @@ cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
%ifidn %1, put
%assign isprep 0
%if required_stack_alignment <= STACK_ALIGNMENT
cglobal put_8tap_scaled, 4, 15, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy
cglobal put_8tap_scaled_8bpc, 4, 15, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy
%else
cglobal put_8tap_scaled, 4, 14, 16, 128, dst, ds, src, ss, w, h, mx, my, dx, dy
cglobal put_8tap_scaled_8bpc, 4, 14, 16, 128, dst, ds, src, ss, w, h, mx, my, dx, dy
%endif
%xdefine base_reg r12
%define rndshift 10
%else
%assign isprep 1
%if required_stack_alignment <= STACK_ALIGNMENT
cglobal prep_8tap_scaled, 4, 15, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
cglobal prep_8tap_scaled_8bpc, 4, 15, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
%xdefine tmp_stridem r14q
%else
cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
%define tmp_stridem qword [rsp+120]
%endif
%xdefine base_reg r11
%define rndshift 6
%endif
lea base_reg, [%1_8tap_scaled_avx2]
%define base base_reg-%1_8tap_scaled_avx2
lea base_reg, [%1_8tap_scaled_8bpc_avx2]
%define base base_reg-%1_8tap_scaled_8bpc_avx2
tzcnt wd, wm
vpbroadcastd m8, dxm
%if isprep && UNIX64
@ -2817,7 +2814,7 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
dec srcq
movd xm15, t0d
punpckldq m8, m9, m8
paddd m14, m8 ; mx+dx*[0-1]
paddd m14, m8 ; mx+dx*[0,1]
vpbroadcastd m11, [base+pd_0x4000]
vpbroadcastd xm15, xm15
pand m8, m14, m10
@ -2868,8 +2865,7 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
lea r4d, [t1+r4]
cmovnz r6q, [base+subpel_filters+r4*8]
movq xm11, r6q
punpcklbw xm11, xm11
psraw xm11, 8
pmovsxbw xm11, xm11
pshufd xm8, xm11, q0000
pshufd xm9, xm11, q1111
pshufd xm10, xm11, q2222
@ -2997,8 +2993,7 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
lea r4d, [t1+r4]
cmovnz r6q, [base+subpel_filters+r4*8]
movq xm10, r6q
punpcklbw xm10, xm10
psraw xm10, 8
pmovsxbw xm10, xm10
pshufd xm7, xm10, q0000
pshufd xm8, xm10, q1111
pshufd xm9, xm10, q2222
@ -3172,9 +3167,8 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
lea r4d, [t1+r4]
cmovnz r6q, [base+subpel_filters+r4*8]
movq xm11, r6q
punpcklbw xm11, xm11
psraw xm11, 8
vinserti128 m11, xm11, 1
punpcklqdq xm11, xm11
pmovsxbw m11, xm11
pshufd m8, m11, q0000
pshufd m9, m11, q1111
pmaddwd m4, m0, m8
@ -3320,8 +3314,7 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
vpbroadcastq m2, [srcq+ssq*1]
add srcq, ss3q
movq xm10, r4q
punpcklbw xm10, xm10
psraw xm10, 8
pmovsxbw xm10, xm10
vpblendd m15, m7, 0xaa
pblendvb m15, m11, m8
pshufd xm8, xm10, q0000
@ -3417,9 +3410,8 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
punpcklqdq m15, m15
pblendvb m15, m11, m8
movq xm10, r4q
punpcklbw xm10, xm10
psraw xm10, 8
vinserti128 m10, xm10, 1
punpcklqdq xm10, xm10
pmovsxbw m10, xm10
pshufb m2, m14
pshufb m3, m14
pshufb m4, m14
@ -3526,8 +3518,7 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
vpbroadcastd m15, xm15
paddd m14, m8 ; mx+dx*[0-7]
movq xm0, r4q
punpcklbw xm0, xm0
psraw xm0, 8
pmovsxbw xm0, xm0
mova [rsp+96], xm0
jmp .dy1_hloop
.dy1_hloop_prep:
@ -3695,8 +3686,7 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
pmaddubsw m0, m15
pmaddubsw m1, m15
movq xm11, r4q
punpcklbw xm11, xm11
psraw xm11, 8
pmovsxbw xm11, xm11
phaddw m0, m1
pmulhrsw m0, m12 ; 0 2 _ 4 1 3 _ 5
pshufd xm8, xm11, q0000
@ -3792,9 +3782,8 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
pmaddubsw xm1, xm15
pmaddubsw m3, m15
movq xm11, r4q
punpcklbw xm11, xm11
psraw xm11, 8
vinserti128 m11, xm11, 1
punpcklqdq xm11, xm11
pmovsxbw m11, xm11
phaddw m0, m2
phaddw m1, m3
pmulhrsw m0, m12 ; 0 2 _ 4
@ -3889,8 +3878,7 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
vpbroadcastd m15, xm15
paddd m14, m8 ; mx+dx*[0-7]
movq xm0, r4q
punpcklbw xm0, xm0
psraw xm0, 8
pmovsxbw xm0, xm0
mova [rsp+0x50], xm0
jmp .dy2_hloop
.dy2_hloop_prep:
@ -4025,10 +4013,10 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
%endmacro
%macro BILIN_SCALED_FN 1
cglobal %1_bilin_scaled
cglobal %1_bilin_scaled_8bpc
mov t0d, (5*15 << 16) | 5*15
mov t1d, t0d
jmp mangle(private_prefix %+ _%1_8tap_scaled %+ SUFFIX)
jmp mangle(private_prefix %+ _%1_8tap_scaled_8bpc %+ SUFFIX)
%endmacro
%if WIN64
@ -4113,11 +4101,11 @@ MC_8TAP_SCALED prep
paddd m%1, m0, m%2
%endmacro
cglobal warp_affine_8x8t, 0, 14, 0, tmp, ts
cglobal warp_affine_8x8t_8bpc, 0, 14, 0, tmp, ts
%if WIN64
sub rsp, 0xa0
%endif
call mangle(private_prefix %+ _warp_affine_8x8_avx2).main
call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).main
.loop:
psrad m7, 13
psrad m0, 13
@ -4127,13 +4115,13 @@ cglobal warp_affine_8x8t, 0, 14, 0, tmp, ts
mova [tmpq+tsq*0], xm7
vextracti128 [tmpq+tsq*2], m7, 1
dec r4d
jz mangle(private_prefix %+ _warp_affine_8x8_avx2).end
call mangle(private_prefix %+ _warp_affine_8x8_avx2).main2
jz mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).end
call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).main2
lea tmpq, [tmpq+tsq*4]
jmp .loop
cglobal warp_affine_8x8, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \
beta, filter, tmp1, delta, my, gamma
cglobal warp_affine_8x8_8bpc, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \
beta, filter, tmp1, delta, my, gamma
%if WIN64
sub rsp, 0xa0
%assign xmm_regs_used 16
@ -4389,7 +4377,7 @@ ALIGN function_align
add tmp2q, %1*32
%endmacro
cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-avg %+ SUFFIX %+ _table
lea r6, [avg %+ SUFFIX %+ _table]
tzcnt wd, wm
@ -4419,7 +4407,7 @@ cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
%define W_AVG_INC_PTR AVG_INC_PTR
cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-w_avg %+ SUFFIX %+ _table
lea r6, [w_avg %+ SUFFIX %+ _table]
tzcnt wd, wm
@ -4469,7 +4457,7 @@ cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
add tmp1q, %1*32
%endmacro
cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
cglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-mask %+ SUFFIX %+ _table
lea r7, [mask %+ SUFFIX %+ _table]
tzcnt wd, wm
@ -4512,7 +4500,7 @@ cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
packuswb m%1, m1
%endmacro
cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask
cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
%define base r6-blend_avx2_table
lea r6, [blend_avx2_table]
tzcnt wd, wm
@ -4629,7 +4617,7 @@ ALIGN function_align
jg .w32
RET
cglobal blend_v, 3, 6, 6, dst, ds, tmp, w, h, mask
cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
%define base r5-blend_v_avx2_table
lea r5, [blend_v_avx2_table]
tzcnt wd, wm
@ -4740,7 +4728,7 @@ ALIGN function_align
jg .w32_loop
RET
cglobal blend_h, 4, 7, 6, dst, ds, tmp, w, h, mask
cglobal blend_h_8bpc, 4, 7, 6, dst, ds, tmp, w, h, mask
%define base r5-blend_h_avx2_table
lea r5, [blend_h_avx2_table]
mov r6d, wd
@ -4866,7 +4854,7 @@ ALIGN function_align
jl .w32_loop0
RET
cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
cglobal emu_edge_8bpc, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
bottomext, rightext
; we assume that the buffer (stride) is larger than width, so we can
; safely overwrite by a few bytes
@ -5053,8 +5041,8 @@ cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
.end:
RET
cglobal resize, 6, 14, 16, dst, dst_stride, src, src_stride, \
dst_w, h, src_w, dx, mx0
cglobal resize_8bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \
dst_w, h, src_w, dx, mx0
sub dword mx0m, 4<<14
sub dword src_wm, 8
vpbroadcastd m5, dxm
@ -5117,27 +5105,23 @@ cglobal resize, 6, 14, 16, dst, dst_stride, src, src_stride, \
vptest m1, m1
jz .filter
movd r8d, xm1
pextrd r9d, xm1, 1
pextrd r10d, xm1, 2
pextrd r11d, xm1, 3
movsxd r8, r8d
movsxd r9, r9d
movsxd r10, r10d
movsxd r11, r11d
movq r9, xm1
pextrq r11, xm1, 1
movsxd r8, r9d
sar r9, 32
movsxd r10, r11d
sar r11, 32
vextracti128 xm1, m1, 1
movq xm14, [base+resize_shuf+4+r8]
movq xm0, [base+resize_shuf+4+r10]
movhps xm14, [base+resize_shuf+4+r9]
movhps xm0, [base+resize_shuf+4+r11]
movd r8d, xm1
pextrd r9d, xm1, 1
pextrd r10d, xm1, 2
pextrd r11d, xm1, 3
movsxd r8, r8d
movsxd r9, r9d
movsxd r10, r10d
movsxd r11, r11d
movq r9, xm1
pextrq r11, xm1, 1
movsxd r8, r9d
sar r9, 32
movsxd r10, r11d
sar r11, 32
vinserti128 m14, [base+resize_shuf+4+r8], 1
vinserti128 m0, [base+resize_shuf+4+r10], 1
vpbroadcastq m10, [base+resize_shuf+4+r9]
@ -5191,7 +5175,7 @@ cglobal resize, 6, 14, 16, dst, dst_stride, src, src_stride, \
jg .loop_y
RET
cglobal w_mask_420, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_420_avx2_table
lea r7, [w_mask_420_avx2_table]
tzcnt wd, wm
@ -5397,7 +5381,7 @@ cglobal w_mask_420, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
jg .w128_loop
RET
cglobal w_mask_422, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_422_avx2_table
lea r7, [w_mask_422_avx2_table]
tzcnt wd, wm
@ -5570,7 +5554,7 @@ cglobal w_mask_422, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
jg .w128_loop
RET
cglobal w_mask_444, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3
cglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_444_avx2_table
lea r7, [w_mask_444_avx2_table]
tzcnt wd, wm

48 third_party/dav1d/src/x86/mc_avx512.asm vendored
View file

@ -146,7 +146,7 @@ cextern mc_subpel_filters
%endmacro
%macro HV_JMP_TABLE 5-*
%xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3)
%xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
%xdefine %%base %1_%3
%assign %%types %4
%if %%types & 1
@ -177,30 +177,30 @@ cextern mc_subpel_filters
%endif
%endmacro
%macro BIDIR_JMP_TABLE 1-*
%xdefine %1_table (%%table - 2*%2)
%xdefine %%base %1_table
%xdefine %%prefix mangle(private_prefix %+ _%1)
%macro BIDIR_JMP_TABLE 2-*
%xdefine %1_%2_table (%%table - 2*%3)
%xdefine %%base %1_%2_table
%xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
%%table:
%rep %0 - 1
dd %%prefix %+ .w%2 - %%base
%rep %0 - 2
dd %%prefix %+ .w%3 - %%base
%rotate 1
%endrep
%endmacro
%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_avx512icl.prep)
%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_8bpc_avx512icl.prep)
%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 8tap, avx512icl, 7, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE avg_avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg_avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask_avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420_avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422_avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444_avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, avx512icl, 4, 8, 16, 32, 64, 128
SECTION .text
@ -221,7 +221,7 @@ INIT_ZMM cpuname
DECLARE_REG_TMP 3, 5, 6
INIT_ZMM avx512icl
cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
movifnidn mxyd, r5m ; mx
lea t2, [prep_avx512icl]
tzcnt wd, wm
@ -772,7 +772,7 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
%assign FILTER_SHARP (2*15 << 16) | 3*15
%macro FN 4 ; fn, type, type_h, type_v
cglobal %1_%2
cglobal %1_%2_8bpc
mov t0d, FILTER_%3
%ifidn %3, %4
mov t1d, t0d
@ -780,7 +780,7 @@ cglobal %1_%2
mov t1d, FILTER_%4
%endif
%ifnidn %2, regular ; skip the jump in the last filter
jmp mangle(private_prefix %+ _%1 %+ SUFFIX)
jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX)
%endif
%endmacro
@ -829,7 +829,7 @@ PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
PREP_8TAP_FN regular, REGULAR, REGULAR
cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
imul mxd, mxm, 0x010101
add mxd, t0d ; 8tap_h, mx, 4tap_h
imul myd, mym, 0x010101
@ -1753,7 +1753,7 @@ cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
add tmp2q, %1*mmsize
%endmacro
cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-avg_avx512icl_table
lea r6, [avg_avx512icl_table]
tzcnt wd, wm
@ -1783,7 +1783,7 @@ cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
%define W_AVG_INC_PTR AVG_INC_PTR
cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-w_avg_avx512icl_table
lea r6, [w_avg_avx512icl_table]
tzcnt wd, wm
@ -1837,7 +1837,7 @@ cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
add tmp1q, %1*64
%endmacro
cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
cglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-mask_avx512icl_table
lea r7, [mask_avx512icl_table]
tzcnt wd, wm
@ -1877,7 +1877,7 @@ cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
packuswb m%1, m1
%endmacro
cglobal w_mask_420, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_420_avx512icl_table
lea r7, [w_mask_420_avx512icl_table]
tzcnt wd, wm
@ -2070,7 +2070,7 @@ cglobal w_mask_420, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
jg .w128_loop
RET
cglobal w_mask_422, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_422_avx512icl_table
lea r7, [w_mask_422_avx512icl_table]
tzcnt wd, wm
@ -2243,7 +2243,7 @@ cglobal w_mask_422, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
jg .w128_loop
RET
cglobal w_mask_444, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3
cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_444_avx512icl_table
lea r7, [w_mask_444_avx512icl_table]
tzcnt wd, wm

146 third_party/dav1d/src/x86/mc_init_tmpl.c vendored
View file

@ -1,6 +1,6 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* Copyright © 2018-2021, VideoLAN and dav1d authors
* Copyright © 2018-2021, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -28,35 +28,19 @@
#include "src/cpu.h"
#include "src/mc.h"
#if BITDEPTH == 8
#define decl_fn(type, name) \
decl_##type##_fn(name##_sse2); \
decl_##type##_fn(name##_ssse3); \
decl_##type##_fn(name##_avx2); \
decl_##type##_fn(name##_avx512icl);
decl_##type##_fn(BF(name, sse2)); \
decl_##type##_fn(BF(name, ssse3)); \
decl_##type##_fn(BF(name, avx2)); \
decl_##type##_fn(BF(name, avx512icl));
#define init_mc_fn(type, name, suffix) \
c->mc[type] = dav1d_put_##name##_##suffix
c->mc[type] = BF(dav1d_put_##name, suffix)
#define init_mct_fn(type, name, suffix) \
c->mct[type] = dav1d_prep_##name##_##suffix
c->mct[type] = BF(dav1d_prep_##name, suffix)
#define init_mc_scaled_fn(type, name, suffix) \
c->mc_scaled[type] = dav1d_put_##name##_##suffix
c->mc_scaled[type] = BF(dav1d_put_##name, suffix)
#define init_mct_scaled_fn(type, name, suffix) \
c->mct_scaled[type] = dav1d_prep_##name##_##suffix
#else
#define decl_fn(type, name) \
decl_##type##_fn(name##_16bpc_sse2); \
decl_##type##_fn(name##_16bpc_ssse3); \
decl_##type##_fn(name##_16bpc_avx2); \
decl_##type##_fn(name##_16bpc_avx512icl);
#define init_mc_fn(type, name, suffix) \
c->mc[type] = dav1d_put_##name##_16bpc_##suffix
#define init_mct_fn(type, name, suffix) \
c->mct[type] = dav1d_prep_##name##_16bpc_##suffix
#define init_mc_scaled_fn(type, name, suffix) \
c->mc_scaled[type] = dav1d_put_##name##_16bpc_##suffix
#define init_mct_scaled_fn(type, name, suffix) \
c->mct_scaled[type] = dav1d_prep_##name##_16bpc_##suffix
#endif
c->mct_scaled[type] = BF(dav1d_prep_##name, suffix)
decl_fn(mc, dav1d_put_8tap_regular);
decl_fn(mc, dav1d_put_8tap_regular_smooth);
@ -113,14 +97,13 @@ decl_fn(blend_dir, dav1d_blend_v);
decl_fn(blend_dir, dav1d_blend_h);
decl_fn(warp8x8, dav1d_warp_affine_8x8);
decl_warp8x8_fn(dav1d_warp_affine_8x8_sse4);
decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, sse4));
decl_fn(warp8x8t, dav1d_warp_affine_8x8t);
decl_warp8x8t_fn(dav1d_warp_affine_8x8t_sse4);
decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, sse4));
decl_fn(emu_edge, dav1d_emu_edge);
decl_resize_fn(dav1d_resize_avx2);
decl_resize_fn(dav1d_resize_ssse3);
decl_fn(resize, dav1d_resize);
COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
@ -140,8 +123,8 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, sse2);
init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, sse2);
c->warp8x8 = dav1d_warp_affine_8x8_sse2;
c->warp8x8t = dav1d_warp_affine_8x8t_sse2;
c->warp8x8 = BF(dav1d_warp_affine_8x8, sse2);
c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse2);
#endif
if(!(flags & DAV1D_X86_CPU_FLAG_SSSE3))
@ -193,40 +176,26 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3);
#endif
#if BITDEPTH == 8
c->avg = dav1d_avg_ssse3;
c->w_avg = dav1d_w_avg_ssse3;
c->mask = dav1d_mask_ssse3;
c->w_mask[2] = dav1d_w_mask_420_ssse3;
c->blend = dav1d_blend_ssse3;
c->blend_v = dav1d_blend_v_ssse3;
c->blend_h = dav1d_blend_h_ssse3;
c->warp8x8 = dav1d_warp_affine_8x8_ssse3;
c->warp8x8t = dav1d_warp_affine_8x8t_ssse3;
c->emu_edge = dav1d_emu_edge_ssse3;
c->resize = dav1d_resize_ssse3;
#else
c->avg = dav1d_avg_16bpc_ssse3;
c->w_avg = dav1d_w_avg_16bpc_ssse3;
c->mask = dav1d_mask_16bpc_ssse3;
c->w_mask[0] = dav1d_w_mask_444_16bpc_ssse3;
c->w_mask[1] = dav1d_w_mask_422_16bpc_ssse3;
c->w_mask[2] = dav1d_w_mask_420_16bpc_ssse3;
c->blend = dav1d_blend_16bpc_ssse3;
c->blend_v = dav1d_blend_v_16bpc_ssse3;
c->blend_h = dav1d_blend_h_16bpc_ssse3;
c->emu_edge = dav1d_emu_edge_16bpc_ssse3;
#endif
c->avg = BF(dav1d_avg, ssse3);
c->w_avg = BF(dav1d_w_avg, ssse3);
c->mask = BF(dav1d_mask, ssse3);
c->w_mask[0] = BF(dav1d_w_mask_444, ssse3);
c->w_mask[1] = BF(dav1d_w_mask_422, ssse3);
c->w_mask[2] = BF(dav1d_w_mask_420, ssse3);
c->blend = BF(dav1d_blend, ssse3);
c->blend_v = BF(dav1d_blend_v, ssse3);
c->blend_h = BF(dav1d_blend_h, ssse3);
c->warp8x8 = BF(dav1d_warp_affine_8x8, ssse3);
c->warp8x8t = BF(dav1d_warp_affine_8x8t, ssse3);
c->emu_edge = BF(dav1d_emu_edge, ssse3);
c->resize = BF(dav1d_resize, ssse3);
if(!(flags & DAV1D_X86_CPU_FLAG_SSE41))
return;
#if BITDEPTH == 8
c->warp8x8 = dav1d_warp_affine_8x8_sse4;
c->warp8x8t = dav1d_warp_affine_8x8t_sse4;
c->warp8x8 = BF(dav1d_warp_affine_8x8, sse4);
c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse4);
#endif
#if ARCH_X86_64
@ -255,7 +224,6 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2);
init_mct_fn(FILTER_2D_BILINEAR, bilin, avx2);
#if BITDEPTH == 8
init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2);
init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2);
@ -278,35 +246,19 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2);
init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2);
c->avg = dav1d_avg_avx2;
c->w_avg = dav1d_w_avg_avx2;
c->mask = dav1d_mask_avx2;
c->w_mask[0] = dav1d_w_mask_444_avx2;
c->w_mask[1] = dav1d_w_mask_422_avx2;
c->w_mask[2] = dav1d_w_mask_420_avx2;
c->blend = dav1d_blend_avx2;
c->blend_v = dav1d_blend_v_avx2;
c->blend_h = dav1d_blend_h_avx2;
c->warp8x8 = dav1d_warp_affine_8x8_avx2;
c->warp8x8t = dav1d_warp_affine_8x8t_avx2;
c->emu_edge = dav1d_emu_edge_avx2;
c->resize = dav1d_resize_avx2;
#else
c->avg = dav1d_avg_16bpc_avx2;
c->w_avg = dav1d_w_avg_16bpc_avx2;
c->mask = dav1d_mask_16bpc_avx2;
c->w_mask[0] = dav1d_w_mask_444_16bpc_avx2;
c->w_mask[1] = dav1d_w_mask_422_16bpc_avx2;
c->w_mask[2] = dav1d_w_mask_420_16bpc_avx2;
c->blend = dav1d_blend_16bpc_avx2;
c->blend_v = dav1d_blend_v_16bpc_avx2;
c->blend_h = dav1d_blend_h_16bpc_avx2;
c->warp8x8 = dav1d_warp_affine_8x8_16bpc_avx2;
c->warp8x8t = dav1d_warp_affine_8x8t_16bpc_avx2;
c->emu_edge = dav1d_emu_edge_16bpc_avx2;
#endif
c->avg = BF(dav1d_avg, avx2);
c->w_avg = BF(dav1d_w_avg, avx2);
c->mask = BF(dav1d_mask, avx2);
c->w_mask[0] = BF(dav1d_w_mask_444, avx2);
c->w_mask[1] = BF(dav1d_w_mask_422, avx2);
c->w_mask[2] = BF(dav1d_w_mask_420, avx2);
c->blend = BF(dav1d_blend, avx2);
c->blend_v = BF(dav1d_blend_v, avx2);
c->blend_h = BF(dav1d_blend_h, avx2);
c->warp8x8 = BF(dav1d_warp_affine_8x8, avx2);
c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx2);
c->emu_edge = BF(dav1d_emu_edge, avx2);
c->resize = BF(dav1d_resize, avx2);
if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL))
return;
@ -323,12 +275,12 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx512icl);
init_mct_fn(FILTER_2D_BILINEAR, bilin, avx512icl);
c->avg = dav1d_avg_avx512icl;
c->w_avg = dav1d_w_avg_avx512icl;
c->mask = dav1d_mask_avx512icl;
c->w_mask[0] = dav1d_w_mask_444_avx512icl;
c->w_mask[1] = dav1d_w_mask_422_avx512icl;
c->w_mask[2] = dav1d_w_mask_420_avx512icl;
c->avg = BF(dav1d_avg, avx512icl);
c->w_avg = BF(dav1d_w_avg, avx512icl);
c->mask = BF(dav1d_mask, avx512icl);
c->w_mask[0] = BF(dav1d_w_mask_444, avx512icl);
c->w_mask[1] = BF(dav1d_w_mask_422, avx512icl);
c->w_mask[2] = BF(dav1d_w_mask_420, avx512icl);
#endif
#endif
}
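As an aside for readers of the init changes above: BF() is the bit-depth naming helper that appends the suffix now carried by the renamed assembly entry points (put_bilin_8bpc, avg_8bpc, and so on), so a single init path serves both bitdepth template instantiations. A minimal C sketch of what it presumably expands to (the real definition lives elsewhere in the dav1d tree and is not part of this diff):

#if BITDEPTH == 8
#define BF(name, suffix) name##_8bpc_##suffix   /* e.g. BF(dav1d_avg, ssse3) -> dav1d_avg_8bpc_ssse3 */
#else
#define BF(name, suffix) name##_16bpc_##suffix  /* e.g. BF(dav1d_avg, avx2)  -> dav1d_avg_16bpc_avx2 */
#endif

Under an expansion along those lines, the separate #if BITDEPTH == 8 / #else blocks removed above collapse into the single BF()-based assignments seen in the new code.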

945 third_party/dav1d/src/x86/mc_sse.asm vendored

Diff not shown because of its large size.

169 third_party/dav1d/src/x86/refmvs.asm vendored Normal file
View file
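Before the new refmvs.asm listing below: its splat_mv entry points replicate a single 12-byte refmvs_block record across bw4 consecutive 4x4 block positions in each of bh4 rows, dispatching on bw4 through the jump tables. A rough C sketch of that behavior, inferred from the signature comment in the file (the struct layout here is an assumption for illustration, not taken from this diff):

#include <stdint.h>

/* Assumed 12-byte record, matching the three 4-byte lanes (m0/m1/m2)
   the assembly rotates through; dav1d's real refmvs_block differs in detail. */
typedef struct { int32_t v[3]; } refmvs_block_sketch;

static void splat_mv_sketch(refmvs_block_sketch **rr,
                            const refmvs_block_sketch *a,
                            int bx4, int bw4, int bh4)
{
    for (int y = 0; y < bh4; y++) {            /* one row of 4x4 units per iteration */
        refmvs_block_sketch *r = rr[y] + bx4;  /* start of the run in this row */
        for (int x = 0; x < bw4; x++)
            r[x] = *a;                         /* copy the same record bw4 times */
    }
}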

@ -0,0 +1,169 @@
; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
SECTION_RODATA 32
%macro JMP_TABLE 2-*
%xdefine %%prefix mangle(private_prefix %+ _%1)
%1_table:
%xdefine %%base %1_table
%rep %0 - 1
dd %%prefix %+ .w%2 - %%base
%rotate 1
%endrep
%endmacro
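; For example, "JMP_TABLE splat_mv_sse2, 1, 2, 4, 8, 16, 32" emits a
; splat_mv_sse2_table of dword offsets (relative to the table itself) to the
; .w1 ... .w32 labels of the function; the code below indexes the table by
; log2(bw4) and adds the table base back to form the indirect jump target.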
%if ARCH_X86_64
splat_mv_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3
db 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7
JMP_TABLE splat_mv_avx2, 1, 2, 4, 8, 16, 32
%endif
JMP_TABLE splat_mv_sse2, 1, 2, 4, 8, 16, 32
SECTION .text
INIT_XMM sse2
; refmvs_block **rr, refmvs_block *a, int bx4, int bw4, int bh4
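; Writes the single 12-byte refmvs_block at *a into bw4 consecutive block
; columns (starting at column bx4) of each of the bh4 rows addressed through
; the rr[] row-pointer array.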
cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
add bx4d, bw4d
tzcnt bw4d, bw4d
mova m2, [aq]
LEA aq, splat_mv_sse2_table
lea bx4q, [bx4q*3-32]
movsxd bw4q, [aq+bw4q*4]
movifnidn bh4d, bh4m
pshufd m0, m2, q0210
pshufd m1, m2, q1021
pshufd m2, m2, q2102
add bw4q, aq
.loop:
mov aq, [rrq]
add rrq, gprsize
lea aq, [aq+bx4q*4]
jmp bw4q
.w32:
mova [aq-16*16], m0
mova [aq-16*15], m1
mova [aq-16*14], m2
mova [aq-16*13], m0
mova [aq-16*12], m1
mova [aq-16*11], m2
mova [aq-16*10], m0
mova [aq-16* 9], m1
mova [aq-16* 8], m2
mova [aq-16* 7], m0
mova [aq-16* 6], m1
mova [aq-16* 5], m2
.w16:
mova [aq-16* 4], m0
mova [aq-16* 3], m1
mova [aq-16* 2], m2
mova [aq-16* 1], m0
mova [aq+16* 0], m1
mova [aq+16* 1], m2
.w8:
mova [aq+16* 2], m0
mova [aq+16* 3], m1
mova [aq+16* 4], m2
.w4:
mova [aq+16* 5], m0
mova [aq+16* 6], m1
mova [aq+16* 7], m2
dec bh4d
jg .loop
RET
.w2:
movu [aq+104], m0
movq [aq+120], m1
dec bh4d
jg .loop
RET
.w1:
movq [aq+116], m0
movd [aq+124], m2
dec bh4d
jg .loop
RET
%if ARCH_X86_64
INIT_YMM avx2
cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
add bx4d, bw4d
tzcnt bw4d, bw4d
vbroadcasti128 m0, [aq]
lea aq, [splat_mv_avx2_table]
lea bx4q, [bx4q*3-32]
movsxd bw4q, [aq+bw4q*4]
pshufb m0, [splat_mv_shuf]
movifnidn bh4d, bh4m
pshufd m1, m0, q2102
pshufd m2, m0, q1021
add bw4q, aq
.loop:
mov aq, [rrq]
add rrq, gprsize
lea aq, [aq+bx4q*4]
jmp bw4q
.w32:
mova [aq-32*8], m0
mova [aq-32*7], m1
mova [aq-32*6], m2
mova [aq-32*5], m0
mova [aq-32*4], m1
mova [aq-32*3], m2
.w16:
mova [aq-32*2], m0
mova [aq-32*1], m1
mova [aq+32*0], m2
.w8:
mova [aq+32*1], m0
mova [aq+32*2], m1
mova [aq+32*3], m2
dec bh4d
jg .loop
RET
.w4:
movu [aq+ 80], m0
mova [aq+112], xm1
dec bh4d
jg .loop
RET
.w2:
movu [aq+104], xm0
movq [aq+120], xm2
dec bh4d
jg .loop
RET
.w1:
movq [aq+116], xm0
movd [aq+124], xm1
dec bh4d
jg .loop
RET
%endif

46
third_party/dav1d/src/x86/refmvs_init.c vendored Normal file

@ -0,0 +1,46 @@
/*
* Copyright © 2021, VideoLAN and dav1d authors
* Copyright © 2021, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/cpu.h"
#include "src/refmvs.h"
decl_splat_mv_fn(dav1d_splat_mv_sse2);
decl_splat_mv_fn(dav1d_splat_mv_avx2);
COLD void dav1d_refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
c->splat_mv = dav1d_splat_mv_sse2;
#if ARCH_X86_64
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
c->splat_mv = dav1d_splat_mv_avx2;
#endif
}

201
third_party/dav1d/tests/checkasm/arm/checkasm_32.S vendored Normal file

@ -0,0 +1,201 @@
/******************************************************************************
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2015 Martin Storsjo
* Copyright © 2015 Janne Grunau
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define PRIVATE_PREFIX checkasm_
#include "src/arm/asm.S"
#include "src/arm/32/util.S"
const register_init, align=3
.quad 0x21f86d66c8ca00ce
.quad 0x75b6ba21077c48ad
.quad 0xed56bb2dcb3c7736
.quad 0x8bda43d3fd1a7e06
.quad 0xb64a9c9e5d318408
.quad 0xdf9a54b303f1d3a3
.quad 0x4a75479abd64e097
.quad 0x249214109d5d1c88
endconst
const error_message_fpscr
.asciz "failed to preserve register FPSCR, changed bits: %x"
error_message_gpr:
.asciz "failed to preserve register r%d"
error_message_vfp:
.asciz "failed to preserve register d%d"
error_message_stack:
.asciz "failed to preserve stack"
endconst
@ max number of args used by any asm function.
#define MAX_ARGS 15
#define ARG_STACK 4*(MAX_ARGS - 4)
@ Align the used stack space to 8 to preserve the stack alignment.
@ +8 for stack canary reference.
#define ARG_STACK_A (((ARG_STACK + pushed + 7) & ~7) - pushed + 8)
.macro clobbercheck variant
.equ pushed, 4*9
function checked_call_\variant, export=1
push {r4-r11, lr}
.ifc \variant, vfp
vpush {d8-d15}
fmrx r4, FPSCR
push {r4}
.equ pushed, pushed + 16*4 + 4
.endif
movrel r12, register_init
.ifc \variant, vfp
vldm r12, {d8-d15}
.endif
ldm r12, {r4-r11}
sub sp, sp, #ARG_STACK_A
.equ pos, 0
.rept MAX_ARGS-4
ldr r12, [sp, #ARG_STACK_A + pushed + 8 + pos]
str r12, [sp, #pos]
.equ pos, pos + 4
.endr
@ For stack overflows, the callee is free to overwrite the parameters
@ that were passed on the stack (if any), so we can only check after
@ that point. First figure out how many parameters the function
@ really took on the stack:
ldr r12, [sp, #ARG_STACK_A + pushed + 8 + 4*(MAX_ARGS-4)]
@ Load the first non-parameter value from the stack, that should be
@ left untouched by the function. Store a copy of it inverted, so that
@ e.g. overwriting everything with zero would be noticed.
ldr r12, [sp, r12, lsl #2]
mvn r12, r12
str r12, [sp, #ARG_STACK_A - 4]
mov r12, r0
mov r0, r2
mov r1, r3
ldrd r2, r3, [sp, #ARG_STACK_A + pushed]
@ Call the target function
blx r12
@ Load the number of stack parameters, stack canary and its reference
ldr r12, [sp, #ARG_STACK_A + pushed + 8 + 4*(MAX_ARGS-4)]
ldr r2, [sp, r12, lsl #2]
ldr r3, [sp, #ARG_STACK_A - 4]
add sp, sp, #ARG_STACK_A
push {r0, r1}
mvn r3, r3
cmp r2, r3
bne 5f
movrel r12, register_init
.ifc \variant, vfp
.macro check_reg_vfp, dreg, offset
ldrd r2, r3, [r12, #8 * (\offset)]
vmov r0, lr, \dreg
eor r2, r2, r0
eor r3, r3, lr
orrs r2, r2, r3
bne 4f
.endm
.irp n, 8, 9, 10, 11, 12, 13, 14, 15
@ keep track of the checked double/SIMD register
mov r1, #\n
check_reg_vfp d\n, \n-8
.endr
.purgem check_reg_vfp
fmrx r1, FPSCR
ldr r3, [sp, #8]
eor r1, r1, r3
@ Ignore changes in bits 0-4 and 7
bic r1, r1, #0x9f
@ Ignore changes in the topmost 5 bits
bics r1, r1, #0xf8000000
bne 3f
.endif
@ keep track of the checked GPR
mov r1, #4
.macro check_reg reg1, reg2=
ldrd r2, r3, [r12], #8
eors r2, r2, \reg1
bne 2f
add r1, r1, #1
.ifnb \reg2
eors r3, r3, \reg2
bne 2f
.endif
add r1, r1, #1
.endm
check_reg r4, r5
check_reg r6, r7
@ r9 is a volatile register in the ios ABI
#ifdef __APPLE__
check_reg r8
#else
check_reg r8, r9
#endif
check_reg r10, r11
.purgem check_reg
b 0f
5:
movrel r0, error_message_stack
b 1f
4:
movrel r0, error_message_vfp
b 1f
3:
movrel r0, error_message_fpscr
b 1f
2:
movrel r0, error_message_gpr
1:
#ifdef PREFIX
bl _checkasm_fail_func
#else
bl checkasm_fail_func
#endif
0:
pop {r0, r1}
.ifc \variant, vfp
pop {r2}
fmxr FPSCR, r2
vpop {d8-d15}
.endif
pop {r4-r11, pc}
endfunc
.endm
clobbercheck vfp

211
third_party/dav1d/tests/checkasm/arm/checkasm_64.S vendored Normal file

@ -0,0 +1,211 @@
/******************************************************************************
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2015 Martin Storsjo
* Copyright © 2015 Janne Grunau
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define PRIVATE_PREFIX checkasm_
#include "src/arm/asm.S"
#include "src/arm/64/util.S"
const register_init, align=4
.quad 0x21f86d66c8ca00ce
.quad 0x75b6ba21077c48ad
.quad 0xed56bb2dcb3c7736
.quad 0x8bda43d3fd1a7e06
.quad 0xb64a9c9e5d318408
.quad 0xdf9a54b303f1d3a3
.quad 0x4a75479abd64e097
.quad 0x249214109d5d1c88
.quad 0x1a1b2550a612b48c
.quad 0x79445c159ce79064
.quad 0x2eed899d5a28ddcd
.quad 0x86b2536fcd8cf636
.quad 0xb0856806085e7943
.quad 0x3f2bf84fc0fcca4e
.quad 0xacbd382dcf5b8de2
.quad 0xd229e1f5b281303f
.quad 0x71aeaff20b095fd9
.quad 0xab63e2e11fa38ed9
endconst
const error_message_register
.asciz "failed to preserve register"
error_message_stack:
.asciz "stack clobbered"
endconst
// max number of args used by any asm function.
#define MAX_ARGS 15
#define CLOBBER_STACK ((8*MAX_ARGS + 15) & ~15)
function stack_clobber, export=1
mov x3, sp
mov x2, #CLOBBER_STACK
1:
stp x0, x1, [sp, #-16]!
subs x2, x2, #16
b.gt 1b
mov sp, x3
ret
endfunc
// + 16 for stack canary reference
#define ARG_STACK ((8*(MAX_ARGS - 8) + 15) & ~15 + 16)
function checked_call, export=1
stp x29, x30, [sp, #-16]!
mov x29, sp
stp x19, x20, [sp, #-16]!
stp x21, x22, [sp, #-16]!
stp x23, x24, [sp, #-16]!
stp x25, x26, [sp, #-16]!
stp x27, x28, [sp, #-16]!
stp d8, d9, [sp, #-16]!
stp d10, d11, [sp, #-16]!
stp d12, d13, [sp, #-16]!
stp d14, d15, [sp, #-16]!
movrel x9, register_init
ldp d8, d9, [x9], #16
ldp d10, d11, [x9], #16
ldp d12, d13, [x9], #16
ldp d14, d15, [x9], #16
ldp x19, x20, [x9], #16
ldp x21, x22, [x9], #16
ldp x23, x24, [x9], #16
ldp x25, x26, [x9], #16
ldp x27, x28, [x9], #16
sub sp, sp, #ARG_STACK
.equ pos, 0
.rept MAX_ARGS-8
// Skip the first 8 args, that are loaded into registers
ldr x9, [x29, #16 + 8*8 + pos]
str x9, [sp, #pos]
.equ pos, pos + 8
.endr
// Fill x8-x17 with garbage. This doesn't have to be preserved,
// but avoids relying on them having any particular value.
movrel x9, register_init
ldp x10, x11, [x9], #32
ldp x12, x13, [x9], #32
ldp x14, x15, [x9], #32
ldp x16, x17, [x9], #32
ldp x8, x9, [x9]
// For stack overflows, the callee is free to overwrite the parameters
// that were passed on the stack (if any), so we can only check after
// that point. First figure out how many parameters the function
// really took on the stack:
ldr w2, [x29, #16 + 8*8 + (MAX_ARGS-8)*8]
// Load the first non-parameter value from the stack, that should be
// left untouched by the function. Store a copy of it inverted, so that
// e.g. overwriting everything with zero would be noticed.
ldr x2, [sp, x2, lsl #3]
mvn x2, x2
str x2, [sp, #ARG_STACK-8]
// Load the in-register arguments
mov x12, x0
ldp x0, x1, [x29, #16]
ldp x2, x3, [x29, #32]
ldp x4, x5, [x29, #48]
ldp x6, x7, [x29, #64]
// Call the target function
blr x12
// Load the number of stack parameters, stack canary and its reference
ldr w2, [x29, #16 + 8*8 + (MAX_ARGS-8)*8]
ldr x2, [sp, x2, lsl #3]
ldr x3, [sp, #ARG_STACK-8]
add sp, sp, #ARG_STACK
stp x0, x1, [sp, #-16]!
mvn x3, x3
cmp x2, x3
b.ne 2f
movrel x9, register_init
movi v3.8h, #0
.macro check_reg_neon reg1, reg2
ldr q1, [x9], #16
uzp1 v2.2d, v\reg1\().2d, v\reg2\().2d
eor v1.16b, v1.16b, v2.16b
orr v3.16b, v3.16b, v1.16b
.endm
check_reg_neon 8, 9
check_reg_neon 10, 11
check_reg_neon 12, 13
check_reg_neon 14, 15
uqxtn v3.8b, v3.8h
umov x3, v3.d[0]
.macro check_reg reg1, reg2
ldp x0, x1, [x9], #16
eor x0, x0, \reg1
eor x1, x1, \reg2
orr x3, x3, x0
orr x3, x3, x1
.endm
check_reg x19, x20
check_reg x21, x22
check_reg x23, x24
check_reg x25, x26
check_reg x27, x28
cbz x3, 0f
movrel x0, error_message_register
b 1f
2:
movrel x0, error_message_stack
1:
#ifdef PREFIX
bl _checkasm_fail_func
#else
bl checkasm_fail_func
#endif
0:
ldp x0, x1, [sp], #16
ldp d14, d15, [sp], #16
ldp d12, d13, [sp], #16
ldp d10, d11, [sp], #16
ldp d8, d9, [sp], #16
ldp x27, x28, [sp], #16
ldp x25, x26, [sp], #16
ldp x23, x24, [sp], #16
ldp x21, x22, [sp], #16
ldp x19, x20, [sp], #16
ldp x29, x30, [sp], #16
ret
endfunc

150
third_party/dav1d/tests/checkasm/cdef.c vendored Normal file

@ -0,0 +1,150 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "tests/checkasm/checkasm.h"
#include <string.h>
#include <stdio.h>
#include "common/dump.h"
#include "src/levels.h"
#include "src/cdef.h"
static int to_binary(int x) { /* 0-15 -> 0000-1111 */
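    /* e.g. x = 0xa (binary 1010) is returned as the decimal number 1010 */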
return (x & 1) + 5 * (x & 2) + 25 * (x & 4) + 125 * (x & 8);
}
static void init_tmp(pixel *buf, int n, const int bitdepth_max) {
const int fill_type = rnd() & 7;
if (fill_type == 0)
while (n--) /* check for cdef_filter underflows */
*buf++ = rnd() & 1;
else if (fill_type == 1)
while (n--) /* check for cdef_filter overflows */
*buf++ = bitdepth_max - (rnd() & 1);
else
while (n--)
*buf++ = rnd() & bitdepth_max;
}
static void check_cdef_filter(const cdef_fn fn, const int w, const int h) {
ALIGN_STK_64(pixel, c_src, 16 * 10 + 16, ), *const c_dst = c_src + 8;
ALIGN_STK_64(pixel, a_src, 16 * 10 + 16, ), *const a_dst = a_src + 8;
ALIGN_STK_64(pixel, top_buf, 16 * 2 + 16, ), *const top = top_buf + 8;
ALIGN_STK_16(pixel, left, 8,[2]);
const ptrdiff_t stride = 16 * sizeof(pixel);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel (*left)[2],
const pixel *top, int pri_strength, int sec_strength,
int dir, int damping, enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX);
if (check_func(fn, "cdef_filter_%dx%d_%dbpc", w, h, BITDEPTH)) {
for (int dir = 0; dir < 8; dir++) {
for (enum CdefEdgeFlags edges = 0x0; edges <= 0xf; edges++) {
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
const int bitdepth_max = 0xff;
#endif
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
init_tmp(c_src, 16 * 10 + 16, bitdepth_max);
init_tmp(top_buf, 16 * 2 + 16, bitdepth_max);
init_tmp((pixel *) left, 8 * 2, bitdepth_max);
memcpy(a_src, c_src, (16 * 10 + 16) * sizeof(pixel));
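                /* Derive strengths roughly the way a decoder splits a 6-bit
                 * cdef strength value: the upper bits form the primary
                 * strength, the low 2 bits the secondary strength (3 is
                 * promoted to 4), both scaled up for higher bitdepths. */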
const int lvl = 1 + (rnd() % 62);
const int damping = 3 + (rnd() & 3) + bitdepth_min_8 - (w == 4 || (rnd() & 1));
int pri_strength = (lvl >> 2) << bitdepth_min_8;
int sec_strength = lvl & 3;
sec_strength += sec_strength == 3;
sec_strength <<= bitdepth_min_8;
call_ref(c_dst, stride, left, top, pri_strength, sec_strength,
dir, damping, edges HIGHBD_TAIL_SUFFIX);
call_new(a_dst, stride, left, top, pri_strength, sec_strength,
dir, damping, edges HIGHBD_TAIL_SUFFIX);
if (checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst")) {
fprintf(stderr, "strength = %d:%d, dir = %d, damping = %d, edges = %04d\n",
pri_strength, sec_strength, dir, damping, to_binary(edges));
return;
}
if (dir == 7 && (edges == 0x5 || edges == 0xa || edges == 0xf)) {
/* Benchmark a fixed set of cases to get consistent results:
* 1) top/left edges and pri_strength only
* 2) bottom/right edges and sec_strength only
* 3) all edges and both pri_strength and sec_strength
*/
pri_strength = (edges & 1) << bitdepth_min_8;
sec_strength = (edges & 2) << bitdepth_min_8;
bench_new(a_dst, stride, left, top, pri_strength, sec_strength,
dir, damping, edges HIGHBD_TAIL_SUFFIX);
}
}
}
}
}
static void check_cdef_direction(const cdef_dir_fn fn) {
ALIGN_STK_64(pixel, src, 8 * 8,);
declare_func(int, pixel *src, ptrdiff_t dst_stride, unsigned *var
HIGHBD_DECL_SUFFIX);
if (check_func(fn, "cdef_dir_%dbpc", BITDEPTH)) {
unsigned c_var, a_var;
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
const int bitdepth_max = 0xff;
#endif
init_tmp(src, 64, bitdepth_max);
const int c_dir = call_ref(src, 8 * sizeof(pixel), &c_var HIGHBD_TAIL_SUFFIX);
const int a_dir = call_new(src, 8 * sizeof(pixel), &a_var HIGHBD_TAIL_SUFFIX);
if (c_var != a_var || c_dir != a_dir) {
if (fail()) {
hex_fdump(stderr, src, 8 * sizeof(pixel), 8, 8, "src");
fprintf(stderr, "c_dir %d a_dir %d\n", c_dir, a_dir);
}
}
bench_new(src, 8 * sizeof(pixel), &a_var HIGHBD_TAIL_SUFFIX);
}
report("cdef_dir");
}
void bitfn(checkasm_check_cdef)(void) {
Dav1dCdefDSPContext c;
bitfn(dav1d_cdef_dsp_init)(&c);
check_cdef_direction(c.dir);
check_cdef_filter(c.fb[0], 8, 8);
check_cdef_filter(c.fb[1], 4, 8);
check_cdef_filter(c.fb[2], 4, 4);
report("cdef_filter");
}

874
third_party/dav1d/tests/checkasm/checkasm.c vendored Normal file

@ -0,0 +1,874 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "tests/checkasm/checkasm.h"
#include <math.h>
#include <stdarg.h>
#include <stdio.h>
#include <string.h>
#include "src/cpu.h"
#ifdef _WIN32
#include <windows.h>
#define COLOR_RED FOREGROUND_RED
#define COLOR_GREEN FOREGROUND_GREEN
#define COLOR_YELLOW (FOREGROUND_RED|FOREGROUND_GREEN)
static unsigned get_seed(void) {
return GetTickCount();
}
#else
#include <unistd.h>
#include <signal.h>
#include <time.h>
#ifdef __APPLE__
#include <mach/mach_time.h>
#endif
#define COLOR_RED 1
#define COLOR_GREEN 2
#define COLOR_YELLOW 3
static unsigned get_seed(void) {
#ifdef __APPLE__
return (unsigned) mach_absolute_time();
#elif defined(HAVE_CLOCK_GETTIME)
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return (unsigned) (1000000000ULL * ts.tv_sec + ts.tv_nsec);
#endif
}
#endif
/* List of tests to invoke */
static const struct {
const char *name;
void (*func)(void);
} tests[] = {
{ "msac", checkasm_check_msac },
{ "refmvs", checkasm_check_refmvs },
#if CONFIG_8BPC
{ "cdef_8bpc", checkasm_check_cdef_8bpc },
{ "filmgrain_8bpc", checkasm_check_filmgrain_8bpc },
{ "ipred_8bpc", checkasm_check_ipred_8bpc },
{ "itx_8bpc", checkasm_check_itx_8bpc },
{ "loopfilter_8bpc", checkasm_check_loopfilter_8bpc },
{ "looprestoration_8bpc", checkasm_check_looprestoration_8bpc },
{ "mc_8bpc", checkasm_check_mc_8bpc },
#endif
#if CONFIG_16BPC
{ "cdef_16bpc", checkasm_check_cdef_16bpc },
{ "filmgrain_16bpc", checkasm_check_filmgrain_16bpc },
{ "ipred_16bpc", checkasm_check_ipred_16bpc },
{ "itx_16bpc", checkasm_check_itx_16bpc },
{ "loopfilter_16bpc", checkasm_check_loopfilter_16bpc },
{ "looprestoration_16bpc", checkasm_check_looprestoration_16bpc },
{ "mc_16bpc", checkasm_check_mc_16bpc },
#endif
{ 0 }
};
/* List of cpu flags to check */
static const struct {
const char *name;
const char *suffix;
unsigned flag;
} cpus[] = {
#if ARCH_X86
{ "SSE2", "sse2", DAV1D_X86_CPU_FLAG_SSE2 },
{ "SSSE3", "ssse3", DAV1D_X86_CPU_FLAG_SSSE3 },
{ "SSE4.1", "sse4", DAV1D_X86_CPU_FLAG_SSE41 },
{ "AVX2", "avx2", DAV1D_X86_CPU_FLAG_AVX2 },
{ "AVX-512 (Ice Lake)", "avx512icl", DAV1D_X86_CPU_FLAG_AVX512ICL },
#elif ARCH_AARCH64 || ARCH_ARM
{ "NEON", "neon", DAV1D_ARM_CPU_FLAG_NEON },
#elif ARCH_PPC64LE
{ "VSX", "vsx", DAV1D_PPC_CPU_FLAG_VSX },
#endif
{ 0 }
};
typedef struct CheckasmFuncVersion {
struct CheckasmFuncVersion *next;
void *func;
int ok;
unsigned cpu;
int iterations;
uint64_t cycles;
} CheckasmFuncVersion;
/* Binary search tree node */
typedef struct CheckasmFunc {
struct CheckasmFunc *child[2];
CheckasmFuncVersion versions;
uint8_t color; /* 0 = red, 1 = black */
char name[];
} CheckasmFunc;
/* Internal state */
static struct {
CheckasmFunc *funcs;
CheckasmFunc *current_func;
CheckasmFuncVersion *current_func_ver;
const char *current_test_name;
const char *bench_pattern;
size_t bench_pattern_len;
int num_checked;
int num_failed;
int nop_time;
unsigned cpu_flag;
const char *cpu_flag_name;
const char *test_name;
unsigned seed;
int bench_c;
int verbose;
int function_listing;
#if ARCH_X86_64
void (*simd_warmup)(void);
#endif
} state;
/* float compare support code */
typedef union {
float f;
uint32_t i;
} intfloat;
static uint32_t xs_state[4];
static void xor128_srand(unsigned seed) {
xs_state[0] = seed;
xs_state[1] = ( seed & 0xffff0000) | (~seed & 0x0000ffff);
xs_state[2] = (~seed & 0xffff0000) | ( seed & 0x0000ffff);
xs_state[3] = ~seed;
}
// xor128 from Marsaglia, George (July 2003). "Xorshift RNGs".
// Journal of Statistical Software. 8 (14).
// doi:10.18637/jss.v008.i14.
int xor128_rand(void) {
const uint32_t x = xs_state[0];
const uint32_t t = x ^ (x << 11);
xs_state[0] = xs_state[1];
xs_state[1] = xs_state[2];
xs_state[2] = xs_state[3];
uint32_t w = xs_state[3];
w = (w ^ (w >> 19)) ^ (t ^ (t >> 8));
xs_state[3] = w;
return w >> 1;
}
static int is_negative(const intfloat u) {
return u.i >> 31;
}
int float_near_ulp(const float a, const float b, const unsigned max_ulp) {
intfloat x, y;
x.f = a;
y.f = b;
if (is_negative(x) != is_negative(y)) {
// handle -0.0 == +0.0
return a == b;
}
if (llabs((int64_t)x.i - y.i) <= max_ulp)
return 1;
return 0;
}
int float_near_ulp_array(const float *const a, const float *const b,
const unsigned max_ulp, const int len)
{
for (int i = 0; i < len; i++)
if (!float_near_ulp(a[i], b[i], max_ulp))
return 0;
return 1;
}
int float_near_abs_eps(const float a, const float b, const float eps) {
return fabsf(a - b) < eps;
}
int float_near_abs_eps_array(const float *const a, const float *const b,
const float eps, const int len)
{
for (int i = 0; i < len; i++)
if (!float_near_abs_eps(a[i], b[i], eps))
return 0;
return 1;
}
int float_near_abs_eps_ulp(const float a, const float b, const float eps,
const unsigned max_ulp)
{
return float_near_ulp(a, b, max_ulp) || float_near_abs_eps(a, b, eps);
}
int float_near_abs_eps_array_ulp(const float *const a, const float *const b,
const float eps, const unsigned max_ulp,
const int len)
{
for (int i = 0; i < len; i++)
if (!float_near_abs_eps_ulp(a[i], b[i], eps, max_ulp))
return 0;
return 1;
}
/* Print colored text to stderr if the terminal supports it */
static void color_printf(const int color, const char *const fmt, ...) {
static int8_t use_color = -1;
va_list arg;
#ifdef _WIN32
static HANDLE con;
static WORD org_attributes;
if (use_color < 0) {
CONSOLE_SCREEN_BUFFER_INFO con_info;
con = GetStdHandle(STD_ERROR_HANDLE);
if (con && con != INVALID_HANDLE_VALUE &&
GetConsoleScreenBufferInfo(con, &con_info))
{
org_attributes = con_info.wAttributes;
use_color = 1;
} else
use_color = 0;
}
if (use_color)
SetConsoleTextAttribute(con, (org_attributes & 0xfff0) |
(color & 0x0f));
#else
if (use_color < 0) {
const char *const term = getenv("TERM");
use_color = term && strcmp(term, "dumb") && isatty(2);
}
if (use_color)
fprintf(stderr, "\x1b[%d;3%dm", (color & 0x08) >> 3, color & 0x07);
#endif
va_start(arg, fmt);
vfprintf(stderr, fmt, arg);
va_end(arg);
if (use_color) {
#ifdef _WIN32
SetConsoleTextAttribute(con, org_attributes);
#else
fprintf(stderr, "\x1b[0m");
#endif
}
}
/* Deallocate a tree */
static void destroy_func_tree(CheckasmFunc *const f) {
if (f) {
CheckasmFuncVersion *v = f->versions.next;
while (v) {
CheckasmFuncVersion *next = v->next;
free(v);
v = next;
}
destroy_func_tree(f->child[0]);
destroy_func_tree(f->child[1]);
free(f);
}
}
/* Allocate a zero-initialized block, clean up and exit on failure */
static void *checkasm_malloc(const size_t size) {
void *const ptr = calloc(1, size);
if (!ptr) {
fprintf(stderr, "checkasm: malloc failed\n");
destroy_func_tree(state.funcs);
exit(1);
}
return ptr;
}
/* Get the suffix of the specified cpu flag */
static const char *cpu_suffix(const unsigned cpu) {
for (int i = (int)(sizeof(cpus) / sizeof(*cpus)) - 2; i >= 0; i--)
if (cpu & cpus[i].flag)
return cpus[i].suffix;
return "c";
}
#ifdef readtime
static int cmp_nop(const void *a, const void *b) {
return *(const uint16_t*)a - *(const uint16_t*)b;
}
/* Measure the overhead of the timing code (in decicycles) */
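/* The middle 5000 of the 10000 sorted samples are summed and divided by 500,
 * i.e. 10x their mean, hence the decicycle unit. */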
static int measure_nop_time(void) {
uint16_t nops[10000];
int nop_sum = 0;
for (int i = 0; i < 10000; i++) {
uint64_t t = readtime();
nops[i] = (uint16_t) (readtime() - t);
}
qsort(nops, 10000, sizeof(uint16_t), cmp_nop);
for (int i = 2500; i < 7500; i++)
nop_sum += nops[i];
return nop_sum / 500;
}
/* Print benchmark results */
static void print_benchs(const CheckasmFunc *const f) {
if (f) {
print_benchs(f->child[0]);
/* Only print functions with at least one assembly version */
if (state.bench_c || f->versions.cpu || f->versions.next) {
const CheckasmFuncVersion *v = &f->versions;
do {
if (v->iterations) {
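                    /* each iteration recorded by bench_new() times 4 calls
                     * of the function, hence the division by 4 */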
const int decicycles = (int) (10*v->cycles/v->iterations -
state.nop_time) / 4;
printf("%s_%s: %d.%d\n", f->name, cpu_suffix(v->cpu),
decicycles/10, decicycles%10);
}
} while ((v = v->next));
}
print_benchs(f->child[1]);
}
}
#endif
static void print_functions(const CheckasmFunc *const f) {
if (f) {
print_functions(f->child[0]);
printf("%s\n", f->name);
print_functions(f->child[1]);
}
}
#define is_digit(x) ((x) >= '0' && (x) <= '9')
/* ASCIIbetical sort except preserving natural order for numbers */
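/* e.g. "w8" sorts before "w16" even though '8' > '1' in plain ASCII order */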
static int cmp_func_names(const char *a, const char *b) {
const char *const start = a;
int ascii_diff, digit_diff;
for (; !(ascii_diff = *(const unsigned char*)a -
*(const unsigned char*)b) && *a; a++, b++);
for (; is_digit(*a) && is_digit(*b); a++, b++);
if (a > start && is_digit(a[-1]) &&
(digit_diff = is_digit(*a) - is_digit(*b)))
{
return digit_diff;
}
return ascii_diff;
}
/* Perform a tree rotation in the specified direction and return the new root */
static CheckasmFunc *rotate_tree(CheckasmFunc *const f, const int dir) {
CheckasmFunc *const r = f->child[dir^1];
f->child[dir^1] = r->child[dir];
r->child[dir] = f;
r->color = f->color;
f->color = 0;
return r;
}
#define is_red(f) ((f) && !(f)->color)
/* Balance a left-leaning red-black tree at the specified node */
static void balance_tree(CheckasmFunc **const root) {
CheckasmFunc *const f = *root;
if (is_red(f->child[0]) && is_red(f->child[1])) {
f->color ^= 1;
f->child[0]->color = f->child[1]->color = 1;
}
else if (!is_red(f->child[0]) && is_red(f->child[1]))
*root = rotate_tree(f, 0); /* Rotate left */
else if (is_red(f->child[0]) && is_red(f->child[0]->child[0]))
*root = rotate_tree(f, 1); /* Rotate right */
}
/* Get a node with the specified name, creating it if it doesn't exist */
static CheckasmFunc *get_func(CheckasmFunc **const root, const char *const name) {
CheckasmFunc *f = *root;
if (f) {
/* Search the tree for a matching node */
const int cmp = cmp_func_names(name, f->name);
if (cmp) {
f = get_func(&f->child[cmp > 0], name);
/* Rebalance the tree on the way up if a new node was inserted */
if (!f->versions.func)
balance_tree(root);
}
} else {
/* Allocate and insert a new node into the tree */
const size_t name_length = strlen(name) + 1;
f = *root = checkasm_malloc(offsetof(CheckasmFunc, name) + name_length);
memcpy(f->name, name, name_length);
}
return f;
}
checkasm_context checkasm_context_buf;
/* Crash handling: attempt to catch crashes and handle them
* gracefully instead of just aborting abruptly. */
#ifdef _WIN32
static LONG NTAPI signal_handler(EXCEPTION_POINTERS *const e) {
const char *err;
switch (e->ExceptionRecord->ExceptionCode) {
case EXCEPTION_FLT_DIVIDE_BY_ZERO:
case EXCEPTION_INT_DIVIDE_BY_ZERO:
err = "fatal arithmetic error";
break;
case EXCEPTION_ILLEGAL_INSTRUCTION:
case EXCEPTION_PRIV_INSTRUCTION:
err = "illegal instruction";
break;
case EXCEPTION_ACCESS_VIOLATION:
case EXCEPTION_ARRAY_BOUNDS_EXCEEDED:
case EXCEPTION_DATATYPE_MISALIGNMENT:
case EXCEPTION_IN_PAGE_ERROR:
case EXCEPTION_STACK_OVERFLOW:
err = "segmentation fault";
break;
default:
return EXCEPTION_CONTINUE_SEARCH;
}
RemoveVectoredExceptionHandler(signal_handler);
checkasm_fail_func(err);
checkasm_load_context();
return EXCEPTION_CONTINUE_EXECUTION; /* never reached, but shuts up gcc */
}
#else
static void signal_handler(const int s) {
checkasm_set_signal_handler_state(0);
checkasm_fail_func(s == SIGFPE ? "fatal arithmetic error" :
s == SIGILL ? "illegal instruction" :
"segmentation fault");
checkasm_load_context();
}
#endif
/* Perform tests and benchmarks for the specified
* cpu flag if supported by the host */
static void check_cpu_flag(const char *const name, unsigned flag) {
const unsigned old_cpu_flag = state.cpu_flag;
flag |= old_cpu_flag;
dav1d_set_cpu_flags_mask(flag);
state.cpu_flag = dav1d_get_cpu_flags();
if (!flag || state.cpu_flag != old_cpu_flag) {
state.cpu_flag_name = name;
for (int i = 0; tests[i].func; i++) {
if (state.test_name && strcmp(tests[i].name, state.test_name))
continue;
xor128_srand(state.seed);
state.current_test_name = tests[i].name;
tests[i].func();
}
}
}
/* Print the name of the current CPU flag, but only do it once */
static void print_cpu_name(void) {
if (state.cpu_flag_name) {
color_printf(COLOR_YELLOW, "%s:\n", state.cpu_flag_name);
state.cpu_flag_name = NULL;
}
}
int main(int argc, char *argv[]) {
state.seed = get_seed();
while (argc > 1) {
if (!strncmp(argv[1], "--help", 6)) {
fprintf(stdout,
"checkasm [options] <random seed>\n"
" <random seed> Numeric value to seed the rng\n"
"Options:\n"
" --test=<test_name> Test only <test_name>\n"
" --bench=<pattern> Test and benchmark the functions matching <pattern>\n"
" --list-functions List available functions\n"
" --list-tests List available tests\n"
" --bench-c Benchmark the C-only functions\n"
" --verbose -v Print failures verbosely\n");
return 0;
} else if (!strncmp(argv[1], "--bench-c", 9)) {
state.bench_c = 1;
} else if (!strncmp(argv[1], "--bench", 7)) {
#ifndef readtime
fprintf(stderr,
"checkasm: --bench is not supported on your system\n");
return 1;
#endif
if (argv[1][7] == '=') {
state.bench_pattern = argv[1] + 8;
state.bench_pattern_len = strlen(state.bench_pattern);
} else
state.bench_pattern = "";
} else if (!strncmp(argv[1], "--test=", 7)) {
state.test_name = argv[1] + 7;
} else if (!strcmp(argv[1], "--list-functions")) {
state.function_listing = 1;
} else if (!strcmp(argv[1], "--list-tests")) {
for (int i = 0; tests[i].name; i++)
printf("%s\n", tests[i].name);
return 0;
} else if (!strcmp(argv[1], "--verbose") || !strcmp(argv[1], "-v")) {
state.verbose = 1;
} else {
state.seed = (unsigned) strtoul(argv[1], NULL, 10);
}
argc--;
argv++;
}
dav1d_init_cpu();
#ifdef readtime
if (state.bench_pattern) {
static int testing = 0;
checkasm_save_context();
if (!testing) {
checkasm_set_signal_handler_state(1);
testing = 1;
readtime();
checkasm_set_signal_handler_state(0);
} else {
fprintf(stderr, "checkasm: unable to access cycle counter\n");
return 1;
}
}
#endif
int ret = 0;
if (!state.function_listing) {
fprintf(stderr, "checkasm: using random seed %u\n", state.seed);
#if ARCH_X86_64
void checkasm_warmup_avx2(void);
void checkasm_warmup_avx512(void);
const unsigned cpu_flags = dav1d_get_cpu_flags();
if (cpu_flags & DAV1D_X86_CPU_FLAG_AVX512ICL)
state.simd_warmup = checkasm_warmup_avx512;
else if (cpu_flags & DAV1D_X86_CPU_FLAG_AVX2)
state.simd_warmup = checkasm_warmup_avx2;
checkasm_simd_warmup();
#endif
}
check_cpu_flag(NULL, 0);
if (state.function_listing) {
print_functions(state.funcs);
} else {
for (int i = 0; cpus[i].flag; i++)
check_cpu_flag(cpus[i].name, cpus[i].flag);
if (!state.num_checked) {
fprintf(stderr, "checkasm: no tests to perform\n");
} else if (state.num_failed) {
fprintf(stderr, "checkasm: %d of %d tests have failed\n",
state.num_failed, state.num_checked);
ret = 1;
} else {
fprintf(stderr, "checkasm: all %d tests passed\n", state.num_checked);
#ifdef readtime
if (state.bench_pattern) {
state.nop_time = measure_nop_time();
printf("nop: %d.%d\n", state.nop_time/10, state.nop_time%10);
print_benchs(state.funcs);
}
#endif
}
}
destroy_func_tree(state.funcs);
return ret;
}
/* Decide whether or not the specified function needs to be tested and
* allocate/initialize data structures if needed. Returns a pointer to a
* reference function if the function should be tested, otherwise NULL */
void *checkasm_check_func(void *const func, const char *const name, ...) {
char name_buf[256];
va_list arg;
va_start(arg, name);
const int name_length = vsnprintf(name_buf, sizeof(name_buf), name, arg);
va_end(arg);
if (!func || name_length <= 0 || (size_t)name_length >= sizeof(name_buf))
return NULL;
state.current_func = get_func(&state.funcs, name_buf);
if (state.function_listing) /* Save function names without running tests */
return NULL;
state.funcs->color = 1;
CheckasmFuncVersion *v = &state.current_func->versions;
void *ref = func;
if (v->func) {
CheckasmFuncVersion *prev;
do {
/* Only test functions that haven't already been tested */
if (v->func == func)
return NULL;
if (v->ok)
ref = v->func;
prev = v;
} while ((v = v->next));
v = prev->next = checkasm_malloc(sizeof(CheckasmFuncVersion));
}
v->func = func;
v->ok = 1;
v->cpu = state.cpu_flag;
state.current_func_ver = v;
xor128_srand(state.seed);
if (state.cpu_flag || state.bench_c)
state.num_checked++;
return ref;
}
/* Decide whether or not the current function needs to be benchmarked */
int checkasm_bench_func(void) {
return !state.num_failed && state.bench_pattern &&
!strncmp(state.current_func->name, state.bench_pattern,
state.bench_pattern_len);
}
/* Indicate that the current test has failed, return whether verbose printing
* is requested. */
int checkasm_fail_func(const char *const msg, ...) {
if (state.current_func_ver && state.current_func_ver->cpu &&
state.current_func_ver->ok)
{
va_list arg;
print_cpu_name();
fprintf(stderr, " %s_%s (", state.current_func->name,
cpu_suffix(state.current_func_ver->cpu));
va_start(arg, msg);
vfprintf(stderr, msg, arg);
va_end(arg);
fprintf(stderr, ")\n");
state.current_func_ver->ok = 0;
state.num_failed++;
}
return state.verbose;
}
/* Update benchmark results of the current function */
void checkasm_update_bench(const int iterations, const uint64_t cycles) {
state.current_func_ver->iterations += iterations;
state.current_func_ver->cycles += cycles;
}
/* Print the outcome of all tests performed since
* the last time this function was called */
void checkasm_report(const char *const name, ...) {
static int prev_checked, prev_failed;
static size_t max_length;
if (state.num_checked > prev_checked) {
int pad_length = (int) max_length + 4;
va_list arg;
print_cpu_name();
pad_length -= fprintf(stderr, " - %s.", state.current_test_name);
va_start(arg, name);
pad_length -= vfprintf(stderr, name, arg);
va_end(arg);
fprintf(stderr, "%*c", imax(pad_length, 0) + 2, '[');
if (state.num_failed == prev_failed)
color_printf(COLOR_GREEN, "OK");
else
color_printf(COLOR_RED, "FAILED");
fprintf(stderr, "]\n");
prev_checked = state.num_checked;
prev_failed = state.num_failed;
} else if (!state.cpu_flag) {
/* Calculate the amount of padding required
* to make the output vertically aligned */
size_t length = strlen(state.current_test_name);
va_list arg;
va_start(arg, name);
length += vsnprintf(NULL, 0, name, arg);
va_end(arg);
if (length > max_length)
max_length = length;
}
}
void checkasm_set_signal_handler_state(const int enabled) {
#ifdef _WIN32
#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
if (enabled)
AddVectoredExceptionHandler(0, signal_handler);
else
RemoveVectoredExceptionHandler(signal_handler);
#endif
#else
void (*const handler)(int) = enabled ? signal_handler : SIG_DFL;
signal(SIGBUS, handler);
signal(SIGFPE, handler);
signal(SIGILL, handler);
signal(SIGSEGV, handler);
#endif
}
static int check_err(const char *const file, const int line,
const char *const name, const int w, const int h,
int *const err)
{
if (*err)
return 0;
if (!checkasm_fail_func("%s:%d", file, line))
return 1;
*err = 1;
fprintf(stderr, "%s (%dx%d):\n", name, w, h);
return 0;
}
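/* Compare a w x h block of buf1 against buf2 and, when padding is non-zero,
 * also verify that the border around the (align_w/align_h-aligned) block was
 * left untouched; on mismatch both blocks are printed together with an
 * 'x'/'.' difference map. */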
#define DEF_CHECKASM_CHECK_FUNC(type, fmt) \
int checkasm_check_##type(const char *const file, const int line, \
const type *buf1, ptrdiff_t stride1, \
const type *buf2, ptrdiff_t stride2, \
const int w, int h, const char *const name, \
const int align_w, const int align_h, \
const int padding) \
{ \
int aligned_w = (w + align_w - 1) & ~(align_w - 1); \
int aligned_h = (h + align_h - 1) & ~(align_h - 1); \
int err = 0; \
stride1 /= sizeof(*buf1); \
stride2 /= sizeof(*buf2); \
int y = 0; \
for (y = 0; y < h; y++) \
if (memcmp(&buf1[y*stride1], &buf2[y*stride2], w*sizeof(*buf1))) \
break; \
if (y != h) { \
if (check_err(file, line, name, w, h, &err)) \
return 1; \
for (y = 0; y < h; y++) { \
for (int x = 0; x < w; x++) \
fprintf(stderr, " " fmt, buf1[x]); \
fprintf(stderr, " "); \
for (int x = 0; x < w; x++) \
fprintf(stderr, " " fmt, buf2[x]); \
fprintf(stderr, " "); \
for (int x = 0; x < w; x++) \
fprintf(stderr, "%c", buf1[x] != buf2[x] ? 'x' : '.'); \
buf1 += stride1; \
buf2 += stride2; \
fprintf(stderr, "\n"); \
} \
buf1 -= h*stride1; \
buf2 -= h*stride2; \
} \
for (y = -padding; y < 0; y++) \
if (memcmp(&buf1[y*stride1 - padding], &buf2[y*stride2 - padding], \
(w + 2*padding)*sizeof(*buf1))) { \
if (check_err(file, line, name, w, h, &err)) \
return 1; \
fprintf(stderr, " overwrite above\n"); \
break; \
} \
for (y = aligned_h; y < aligned_h + padding; y++) \
if (memcmp(&buf1[y*stride1 - padding], &buf2[y*stride2 - padding], \
(w + 2*padding)*sizeof(*buf1))) { \
if (check_err(file, line, name, w, h, &err)) \
return 1; \
fprintf(stderr, " overwrite below\n"); \
break; \
} \
for (y = 0; y < h; y++) \
if (memcmp(&buf1[y*stride1 - padding], &buf2[y*stride2 - padding], \
padding*sizeof(*buf1))) { \
if (check_err(file, line, name, w, h, &err)) \
return 1; \
fprintf(stderr, " overwrite left\n"); \
break; \
} \
for (y = 0; y < h; y++) \
if (memcmp(&buf1[y*stride1 + aligned_w], &buf2[y*stride2 + aligned_w], \
padding*sizeof(*buf1))) { \
if (check_err(file, line, name, w, h, &err)) \
return 1; \
fprintf(stderr, " overwrite right\n"); \
break; \
} \
return err; \
}
DEF_CHECKASM_CHECK_FUNC(int8_t, "%4d")
DEF_CHECKASM_CHECK_FUNC(int16_t, "%6d")
DEF_CHECKASM_CHECK_FUNC(int32_t, "%9d")
DEF_CHECKASM_CHECK_FUNC(uint8_t, "%02x")
DEF_CHECKASM_CHECK_FUNC(uint16_t, "%04x")
DEF_CHECKASM_CHECK_FUNC(uint32_t, "%08x")
#if ARCH_X86_64
void checkasm_simd_warmup(void)
{
if (state.simd_warmup)
state.simd_warmup();
}
#endif

350
third_party/dav1d/tests/checkasm/checkasm.h vendored Normal file

@ -0,0 +1,350 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef DAV1D_TESTS_CHECKASM_CHECKASM_H
#define DAV1D_TESTS_CHECKASM_CHECKASM_H
#include "config.h"
#include <stdint.h>
#include <stdlib.h>
#if ARCH_X86_64 && defined(_WIN32)
/* setjmp/longjmp on 64-bit Windows will try to use SEH to unwind the stack,
* which doesn't work for assembly functions without unwind information. */
#include <windows.h>
#define checkasm_context CONTEXT
#define checkasm_save_context() RtlCaptureContext(&checkasm_context_buf)
#define checkasm_load_context() RtlRestoreContext(&checkasm_context_buf, NULL)
#else
#include <setjmp.h>
#define checkasm_context jmp_buf
#define checkasm_save_context() setjmp(checkasm_context_buf)
#define checkasm_load_context() longjmp(checkasm_context_buf, 1)
#endif
#include "include/common/attributes.h"
#include "include/common/bitdepth.h"
#include "include/common/intops.h"
int xor128_rand(void);
#define rnd xor128_rand
#define decl_check_bitfns(name) \
name##_8bpc(void); \
name##_16bpc(void)
void checkasm_check_msac(void);
void checkasm_check_refmvs(void);
decl_check_bitfns(void checkasm_check_cdef);
decl_check_bitfns(void checkasm_check_filmgrain);
decl_check_bitfns(void checkasm_check_ipred);
decl_check_bitfns(void checkasm_check_itx);
decl_check_bitfns(void checkasm_check_loopfilter);
decl_check_bitfns(void checkasm_check_looprestoration);
decl_check_bitfns(void checkasm_check_mc);
void *checkasm_check_func(void *func, const char *name, ...);
int checkasm_bench_func(void);
int checkasm_fail_func(const char *msg, ...);
void checkasm_update_bench(int iterations, uint64_t cycles);
void checkasm_report(const char *name, ...);
void checkasm_set_signal_handler_state(int enabled);
extern checkasm_context checkasm_context_buf;
/* float compare utilities */
int float_near_ulp(float a, float b, unsigned max_ulp);
int float_near_abs_eps(float a, float b, float eps);
int float_near_abs_eps_ulp(float a, float b, float eps, unsigned max_ulp);
int float_near_ulp_array(const float *a, const float *b, unsigned max_ulp,
int len);
int float_near_abs_eps_array(const float *a, const float *b, float eps,
int len);
int float_near_abs_eps_array_ulp(const float *a, const float *b, float eps,
unsigned max_ulp, int len);
#define BENCH_RUNS (1 << 12) /* Trade-off between accuracy and speed */
/* Decide whether or not the specified function needs to be tested */
#define check_func(func, ...)\
(func_ref = checkasm_check_func((func_new = func), __VA_ARGS__))
/* Declare the function prototype. The first argument is the return value,
* the remaining arguments are the function parameters. Naming parameters
* is optional. */
#define declare_func(ret, ...)\
declare_new(ret, __VA_ARGS__)\
void *func_ref, *func_new;\
typedef ret func_type(__VA_ARGS__);\
checkasm_save_context()
/* Indicate that the current test has failed */
#define fail() checkasm_fail_func("%s:%d", __FILE__, __LINE__)
/* Print the test outcome */
#define report checkasm_report
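/* A typical test (see e.g. tests/checkasm/cdef.c in this patch) follows the
 * pattern sketched below; c->some_fn and the argument lists are placeholders:
 *
 *     declare_func(void, pixel *dst, ptrdiff_t stride);
 *     if (check_func(c->some_fn, "some_fn_%dbpc", BITDEPTH)) {
 *         call_ref(dst_c, stride);          // run the C reference
 *         call_new(dst_a, stride);          // run the asm version under test
 *         if (memcmp(dst_c, dst_a, size)) fail();
 *         bench_new(dst_a, stride);         // optionally benchmark it
 *     }
 *     report("some_fn");
 */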
/* Call the reference function */
#define call_ref(...)\
(checkasm_set_signal_handler_state(1),\
((func_type *)func_ref)(__VA_ARGS__));\
checkasm_set_signal_handler_state(0)
#if HAVE_ASM
#if ARCH_X86
#if defined(_MSC_VER) && !defined(__clang__)
#include <intrin.h>
#define readtime() (_mm_lfence(), __rdtsc())
#else
static inline uint64_t readtime(void) {
uint32_t eax, edx;
__asm__ __volatile__("lfence\nrdtsc" : "=a"(eax), "=d"(edx));
return (((uint64_t)edx) << 32) | eax;
}
#define readtime readtime
#endif
#elif (ARCH_AARCH64 || ARCH_ARM) && defined(__APPLE__)
#include <mach/mach_time.h>
#define readtime() mach_absolute_time()
#elif ARCH_AARCH64
#ifdef _MSC_VER
#include <windows.h>
#define readtime() (_InstructionSynchronizationBarrier(), ReadTimeStampCounter())
#else
static inline uint64_t readtime(void) {
uint64_t cycle_counter;
/* This requires enabling user mode access to the cycle counter (which
* can only be done from kernel space).
* This could also read cntvct_el0 instead of pmccntr_el0; that register
* might also be readable (depending on kernel version), but it has much
* worse precision (it's a fixed 50 MHz timer). */
__asm__ __volatile__("isb\nmrs %0, pmccntr_el0"
: "=r"(cycle_counter)
:: "memory");
return cycle_counter;
}
#define readtime readtime
#endif
#elif ARCH_ARM && !defined(_MSC_VER) && __ARM_ARCH >= 7
static inline uint64_t readtime(void) {
uint32_t cycle_counter;
/* This requires enabling user mode access to the cycle counter (which
* can only be done from kernel space). */
__asm__ __volatile__("isb\nmrc p15, 0, %0, c9, c13, 0"
: "=r"(cycle_counter)
:: "memory");
return cycle_counter;
}
#define readtime readtime
#elif ARCH_PPC64LE
static inline uint64_t readtime(void) {
uint32_t tbu, tbl, temp;
__asm__ __volatile__(
"1:\n"
"mfspr %2,269\n"
"mfspr %0,268\n"
"mfspr %1,269\n"
"cmpw %2,%1\n"
"bne 1b\n"
: "=r"(tbl), "=r"(tbu), "=r"(temp)
:
: "cc");
return (((uint64_t)tbu) << 32) | (uint64_t)tbl;
}
#define readtime readtime
#endif
/* Verifies that clobbered callee-saved registers
* are properly saved and restored */
void checkasm_checked_call(void *func, ...);
#if ARCH_X86_64
/* Evil hack: detect incorrect assumptions that 32-bit ints are zero-extended
* to 64-bit. This is done by clobbering the stack with junk around the stack
* pointer and calling the assembly function through checked_call() with added
* dummy arguments which forces all real arguments to be passed on the stack
* and not in registers. For 32-bit arguments the upper half of the 64-bit
* register locations on the stack will now contain junk which will cause
* misbehaving functions to either produce incorrect output or segfault. Note
* that even though this works extremely well in practice, it's technically
 * not guaranteed and false negatives are theoretically possible, but there
* can never be any false positives. */
void checkasm_stack_clobber(uint64_t clobber, ...);
/* YMM and ZMM registers on x86 are turned off to save power when they haven't
* been used for some period of time. When they are used there will be a
* "warmup" period during which performance will be reduced and inconsistent
* which is problematic when trying to benchmark individual functions. We can
 * work around this by periodically issuing "dummy" instructions that use
* those registers to keep them powered on. */
void checkasm_simd_warmup(void);
#define declare_new(ret, ...)\
ret (*checked_call)(void *, int, int, int, int, int, __VA_ARGS__,\
int, int, int, int, int, int, int, int,\
int, int, int, int, int, int, int) =\
(void *)checkasm_checked_call;
#define CLOB (UINT64_C(0xdeadbeefdeadbeef))
#ifdef _WIN32
#define STACKARGS 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0
#else
#define STACKARGS 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0
#endif
#define call_new(...)\
(checkasm_set_signal_handler_state(1),\
checkasm_simd_warmup(),\
checkasm_stack_clobber(CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB),\
checked_call(func_new, 0, 0, 0, 0, 0, __VA_ARGS__, STACKARGS));\
checkasm_set_signal_handler_state(0)
#elif ARCH_X86_32
#define declare_new(ret, ...)\
ret (*checked_call)(void *, __VA_ARGS__, int, int, int, int, int, int,\
int, int, int, int, int, int, int, int, int) =\
(void *)checkasm_checked_call;
#define call_new(...)\
(checkasm_set_signal_handler_state(1),\
checked_call(func_new, __VA_ARGS__, 15, 14, 13, 12,\
11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1));\
checkasm_set_signal_handler_state(0)
#elif ARCH_ARM
/* Use a dummy argument, to offset the real parameters by 2, not only 1.
* This makes sure that potential 8-byte-alignment of parameters is kept
* the same even when the extra parameters have been removed. */
void checkasm_checked_call_vfp(void *func, int dummy, ...);
#define declare_new(ret, ...)\
ret (*checked_call)(void *, int dummy, __VA_ARGS__,\
int, int, int, int, int, int, int, int,\
int, int, int, int, int, int, int) =\
(void *)checkasm_checked_call_vfp;
#define call_new(...)\
(checkasm_set_signal_handler_state(1),\
checked_call(func_new, 0, __VA_ARGS__, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0));\
checkasm_set_signal_handler_state(0)
#elif ARCH_AARCH64 && !defined(__APPLE__)
void checkasm_stack_clobber(uint64_t clobber, ...);
#define declare_new(ret, ...)\
ret (*checked_call)(void *, int, int, int, int, int, int, int,\
__VA_ARGS__, int, int, int, int, int, int, int, int,\
int, int, int, int, int, int, int) =\
(void *)checkasm_checked_call;
#define CLOB (UINT64_C(0xdeadbeefdeadbeef))
#define call_new(...)\
(checkasm_set_signal_handler_state(1),\
checkasm_stack_clobber(CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
CLOB, CLOB, CLOB, CLOB, CLOB),\
checked_call(func_new, 0, 0, 0, 0, 0, 0, 0, __VA_ARGS__,\
7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0));\
checkasm_set_signal_handler_state(0)
#else
#define declare_new(ret, ...)
#define call_new(...)\
(checkasm_set_signal_handler_state(1),\
((func_type *)func_new)(__VA_ARGS__));\
checkasm_set_signal_handler_state(0)
#endif
#else /* HAVE_ASM */
#define declare_new(ret, ...)
/* Call the function */
#define call_new(...)\
(checkasm_set_signal_handler_state(1),\
((func_type *)func_new)(__VA_ARGS__));\
checkasm_set_signal_handler_state(0)
#endif /* HAVE_ASM */
/* Benchmark the function */
#ifdef readtime
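/* Each iteration times 4 back-to-back calls; the first sample is discarded and
 * any sample exceeding 4x the running average is treated as an outlier
 * (interrupt, context switch, ...) and not accumulated. */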
#define bench_new(...)\
do {\
if (checkasm_bench_func()) {\
func_type *tfunc = func_new;\
checkasm_set_signal_handler_state(1);\
uint64_t tsum = 0;\
int tcount = 0;\
for (int ti = 0; ti < BENCH_RUNS; ti++) {\
uint64_t t = readtime();\
tfunc(__VA_ARGS__);\
tfunc(__VA_ARGS__);\
tfunc(__VA_ARGS__);\
tfunc(__VA_ARGS__);\
t = readtime() - t;\
if (t*tcount <= tsum*4 && ti > 0) {\
tsum += t;\
tcount++;\
}\
}\
checkasm_set_signal_handler_state(0);\
checkasm_update_bench(tcount, tsum);\
} else {\
call_new(__VA_ARGS__);\
}\
} while (0)
#else
#define bench_new(...) do {} while (0)
#endif
#define PIXEL_RECT(name, w, h) \
ALIGN_STK_64(pixel, name##_buf, ((h)+32)*((w)+64) + 64,); \
ptrdiff_t name##_stride = sizeof(pixel)*((w)+64); \
(void)name##_stride; \
pixel *name = name##_buf + ((w)+64)*16 + 64
#define CLEAR_PIXEL_RECT(name) \
memset(name##_buf, 0x99, sizeof(name##_buf)) \
#define DECL_CHECKASM_CHECK_FUNC(type) \
int checkasm_check_##type(const char *const file, const int line, \
const type *const buf1, const ptrdiff_t stride1, \
const type *const buf2, const ptrdiff_t stride2, \
const int w, const int h, const char *const name, \
const int align_w, const int align_h, \
const int padding)
DECL_CHECKASM_CHECK_FUNC(int8_t);
DECL_CHECKASM_CHECK_FUNC(int16_t);
DECL_CHECKASM_CHECK_FUNC(int32_t);
DECL_CHECKASM_CHECK_FUNC(uint8_t);
DECL_CHECKASM_CHECK_FUNC(uint16_t);
DECL_CHECKASM_CHECK_FUNC(uint32_t);
#define CONCAT(a,b) a ## b
#define checkasm_check2(prefix, ...) CONCAT(checkasm_check_, prefix)(__FILE__, __LINE__, __VA_ARGS__)
#define checkasm_check(prefix, ...) checkasm_check2(prefix, __VA_ARGS__, 0, 0, 0)
#ifdef BITDEPTH
#define checkasm_check_pixel(...) checkasm_check(PIXEL_TYPE, __VA_ARGS__)
#define checkasm_check_pixel_padded(...) checkasm_check2(PIXEL_TYPE, __VA_ARGS__, 1, 1, 8)
#define checkasm_check_pixel_padded_align(...) checkasm_check2(PIXEL_TYPE, __VA_ARGS__, 8)
#define checkasm_check_coef(...) checkasm_check(COEF_TYPE, __VA_ARGS__)
#endif
#endif /* DAV1D_TESTS_CHECKASM_CHECKASM_H */
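To make the macro machinery above easier to follow, here is a minimal sketch of how a bitdepth-templated checkasm test typically combines these pieces; the ExampleDsp type and its add_w4 field are hypothetical placeholders, not real dav1d entry points.
/* Hypothetical usage sketch of the checkasm harness macros above. */
static void check_example(const ExampleDsp *const dsp) {
    PIXEL_RECT(c_dst, 4, 4);  /* padded reference destination; c_dst_stride is defined too */
    PIXEL_RECT(a_dst, 4, 4);  /* padded destination for the asm version */
    declare_func(void, pixel *dst, ptrdiff_t stride, int w, int h);
    if (check_func(dsp->add_w4, "add_w%d_%dbpc", 4, BITDEPTH)) {
        CLEAR_PIXEL_RECT(c_dst);
        CLEAR_PIXEL_RECT(a_dst);
        call_ref(c_dst, c_dst_stride, 4, 4);  /* C reference implementation */
        call_new(a_dst, a_dst_stride, 4, 4);  /* asm implementation under test */
        checkasm_check_pixel_padded(c_dst, c_dst_stride,
                                    a_dst, a_dst_stride, 4, 4, "dst");
        bench_new(a_dst, a_dst_stride, 4, 4); /* cycle timing when benchmarking is enabled */
    }
    report("example");
}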

401
third_party/dav1d/tests/checkasm/filmgrain.c vendored Normal file

@ -0,0 +1,401 @@
/*
* Copyright © 2019, VideoLAN and dav1d authors
* Copyright © 2019, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "tests/checkasm/checkasm.h"
#include <string.h>
#include "src/levels.h"
#include "src/film_grain.h"
#define UNIT_TEST 1
#include "src/fg_apply_tmpl.c"
#if BITDEPTH == 8
#define checkasm_check_entry(...) checkasm_check(int8_t, __VA_ARGS__)
#else
#define checkasm_check_entry(...) checkasm_check(int16_t, __VA_ARGS__)
#endif
static const char ss_name[][4] = {
[DAV1D_PIXEL_LAYOUT_I420 - 1] = "420",
[DAV1D_PIXEL_LAYOUT_I422 - 1] = "422",
[DAV1D_PIXEL_LAYOUT_I444 - 1] = "444",
};
static void check_gen_grny(const Dav1dFilmGrainDSPContext *const dsp) {
entry grain_lut_c[GRAIN_HEIGHT][GRAIN_WIDTH];
entry grain_lut_a[GRAIN_HEIGHT + 1][GRAIN_WIDTH];
declare_func(void, entry grain_lut[][GRAIN_WIDTH],
const Dav1dFilmGrainData *data HIGHBD_DECL_SUFFIX);
for (int i = 0; i < 4; i++) {
if (check_func(dsp->generate_grain_y, "gen_grain_y_ar%d_%dbpc", i, BITDEPTH)) {
ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 1,);
fg_data[0].seed = rnd() & 0xFFFF;
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#endif
fg_data[0].grain_scale_shift = rnd() & 3;
fg_data[0].ar_coeff_shift = (rnd() & 3) + 6;
fg_data[0].ar_coeff_lag = i;
const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1);
for (int n = 0; n < num_y_pos; n++)
fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128;
call_ref(grain_lut_c, fg_data HIGHBD_TAIL_SUFFIX);
call_new(grain_lut_a, fg_data HIGHBD_TAIL_SUFFIX);
checkasm_check_entry(grain_lut_c[0], sizeof(entry) * GRAIN_WIDTH,
grain_lut_a[0], sizeof(entry) * GRAIN_WIDTH,
GRAIN_WIDTH, GRAIN_HEIGHT, "grain_lut");
bench_new(grain_lut_a, fg_data HIGHBD_TAIL_SUFFIX);
}
}
report("gen_grain_y");
}
static void check_gen_grnuv(const Dav1dFilmGrainDSPContext *const dsp) {
entry grain_lut_y[GRAIN_HEIGHT + 1][GRAIN_WIDTH];
entry grain_lut_c[GRAIN_HEIGHT][GRAIN_WIDTH];
entry grain_lut_a[GRAIN_HEIGHT + 1][GRAIN_WIDTH];
declare_func(void, entry grain_lut[][GRAIN_WIDTH],
const entry grain_lut_y[][GRAIN_WIDTH],
const Dav1dFilmGrainData *data, intptr_t uv HIGHBD_DECL_SUFFIX);
for (int layout_idx = 0; layout_idx < 3; layout_idx++) {
const enum Dav1dPixelLayout layout = layout_idx + 1;
const int ss_x = layout != DAV1D_PIXEL_LAYOUT_I444;
const int ss_y = layout == DAV1D_PIXEL_LAYOUT_I420;
for (int i = 0; i < 4; i++) {
if (check_func(dsp->generate_grain_uv[layout_idx],
"gen_grain_uv_ar%d_%dbpc_%s",
i, BITDEPTH, ss_name[layout_idx]))
{
ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 1,);
fg_data[0].seed = rnd() & 0xFFFF;
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#endif
fg_data[0].num_y_points = rnd() & 1;
fg_data[0].grain_scale_shift = rnd() & 3;
fg_data[0].ar_coeff_shift = (rnd() & 3) + 6;
fg_data[0].ar_coeff_lag = i;
const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1);
for (int n = 0; n < num_y_pos; n++)
fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128;
dsp->generate_grain_y(grain_lut_y, fg_data HIGHBD_TAIL_SUFFIX);
const int uv = rnd() & 1;
const int num_uv_pos = num_y_pos + !!fg_data[0].num_y_points;
for (int n = 0; n < num_uv_pos; n++)
fg_data[0].ar_coeffs_uv[uv][n] = (rnd() & 0xff) - 128;
if (!fg_data[0].num_y_points)
fg_data[0].ar_coeffs_uv[uv][num_uv_pos] = 0;
memset(grain_lut_c, 0xff, sizeof(grain_lut_c));
memset(grain_lut_a, 0xff, sizeof(grain_lut_a));
call_ref(grain_lut_c, grain_lut_y, fg_data, uv HIGHBD_TAIL_SUFFIX);
call_new(grain_lut_a, grain_lut_y, fg_data, uv HIGHBD_TAIL_SUFFIX);
int w = ss_x ? 44 : GRAIN_WIDTH;
int h = ss_y ? 38 : GRAIN_HEIGHT;
checkasm_check_entry(grain_lut_c[0], sizeof(entry) * GRAIN_WIDTH,
grain_lut_a[0], sizeof(entry) * GRAIN_WIDTH,
w, h, "grain_lut");
bench_new(grain_lut_a, grain_lut_y, fg_data, uv HIGHBD_TAIL_SUFFIX);
}
}
}
report("gen_grain_uv");
}
static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
PIXEL_RECT(c_dst, 128, 32);
PIXEL_RECT(a_dst, 128, 32);
PIXEL_RECT(src, 128, 32);
const ptrdiff_t stride = c_dst_stride;
declare_func(void, pixel *dst_row, const pixel *src_row, ptrdiff_t stride,
const Dav1dFilmGrainData *data, size_t pw,
const uint8_t scaling[SCALING_SIZE],
const entry grain_lut[][GRAIN_WIDTH],
int bh, int row_num HIGHBD_DECL_SUFFIX);
if (check_func(dsp->fgy_32x32xn, "fgy_32x32xn_%dbpc", BITDEPTH)) {
ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 16,);
fg_data[0].seed = rnd() & 0xFFFF;
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
const int bitdepth_max = 0xff;
#endif
uint8_t scaling[SCALING_SIZE];
entry grain_lut[GRAIN_HEIGHT + 1][GRAIN_WIDTH];
fg_data[0].grain_scale_shift = rnd() & 3;
fg_data[0].ar_coeff_shift = (rnd() & 3) + 6;
fg_data[0].ar_coeff_lag = rnd() & 3;
const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1);
for (int n = 0; n < num_y_pos; n++)
fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128;
dsp->generate_grain_y(grain_lut, fg_data HIGHBD_TAIL_SUFFIX);
fg_data[0].num_y_points = 2 + (rnd() % 13);
const int pad = 0xff / fg_data[0].num_y_points;
for (int n = 0; n < fg_data[0].num_y_points; n++) {
fg_data[0].y_points[n][0] = 0xff * n / fg_data[0].num_y_points;
fg_data[0].y_points[n][0] += rnd() % pad;
fg_data[0].y_points[n][1] = rnd() & 0xff;
}
generate_scaling(bitdepth_from_max(bitdepth_max), fg_data[0].y_points,
fg_data[0].num_y_points, scaling);
fg_data[0].clip_to_restricted_range = rnd() & 1;
fg_data[0].scaling_shift = (rnd() & 3) + 8;
for (fg_data[0].overlap_flag = 0; fg_data[0].overlap_flag <= 1;
fg_data[0].overlap_flag++)
{
for (int i = 0; i <= 2 * fg_data[0].overlap_flag; i++) {
int w, h, row_num;
if (fg_data[0].overlap_flag) {
w = 35 + (rnd() % 93);
if (i == 0) {
row_num = 0;
h = 1 + (rnd() % 31);
} else {
row_num = 1 + (rnd() & 0x7ff);
if (i == 1) {
h = 3 + (rnd() % 30);
} else {
h = 1 + (rnd() & 1);
}
}
} else {
w = 1 + (rnd() & 127);
h = 1 + (rnd() & 31);
row_num = rnd() & 0x7ff;
}
for (int y = 0; y < 32; y++) {
// Src pixels past the right edge can be uninitialized
for (int x = 0; x < 128; x++)
src[y * PXSTRIDE(stride) + x] = rnd();
for (int x = 0; x < w; x++)
src[y * PXSTRIDE(stride) + x] &= bitdepth_max;
}
CLEAR_PIXEL_RECT(c_dst);
CLEAR_PIXEL_RECT(a_dst);
call_ref(c_dst, src, stride, fg_data, w, scaling, grain_lut, h,
row_num HIGHBD_TAIL_SUFFIX);
call_new(a_dst, src, stride, fg_data, w, scaling, grain_lut, h,
row_num HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel_padded_align(c_dst, stride, a_dst, stride,
w, h, "dst", 32, 2);
}
}
fg_data[0].overlap_flag = 1;
for (int y = 0; y < 32; y++) {
// Make sure all pixels are in range
for (int x = 0; x < 128; x++)
src[y * PXSTRIDE(stride) + x] &= bitdepth_max;
}
bench_new(a_dst, src, stride, fg_data, 64, scaling, grain_lut, 32,
1 HIGHBD_TAIL_SUFFIX);
}
report("fgy_32x32xn");
}
static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
PIXEL_RECT(c_dst, 128, 32);
PIXEL_RECT(a_dst, 128, 32);
PIXEL_RECT(src, 128, 32);
PIXEL_RECT(luma_src, 128, 32);
const ptrdiff_t lstride = luma_src_stride;
declare_func(void, pixel *dst_row, const pixel *src_row, ptrdiff_t stride,
const Dav1dFilmGrainData *data, size_t pw,
const uint8_t scaling[SCALING_SIZE],
const entry grain_lut[][GRAIN_WIDTH], int bh, int row_num,
const pixel *luma_row, ptrdiff_t luma_stride, int uv_pl,
int is_identity HIGHBD_DECL_SUFFIX);
for (int layout_idx = 0; layout_idx < 3; layout_idx++) {
const enum Dav1dPixelLayout layout = layout_idx + 1;
const int ss_x = layout != DAV1D_PIXEL_LAYOUT_I444;
const int ss_y = layout == DAV1D_PIXEL_LAYOUT_I420;
const ptrdiff_t stride = c_dst_stride;
for (int csfl = 0; csfl <= 1; csfl++) {
if (check_func(dsp->fguv_32x32xn[layout_idx],
"fguv_32x32xn_%dbpc_%s_csfl%d",
BITDEPTH, ss_name[layout_idx], csfl))
{
ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 1,);
fg_data[0].seed = rnd() & 0xFFFF;
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
const int bitdepth_max = 0xff;
#endif
const int uv_pl = rnd() & 1;
const int is_identity = rnd() & 1;
uint8_t scaling[SCALING_SIZE];
entry grain_lut[2][GRAIN_HEIGHT + 1][GRAIN_WIDTH];
fg_data[0].grain_scale_shift = rnd() & 3;
fg_data[0].ar_coeff_shift = (rnd() & 3) + 6;
fg_data[0].ar_coeff_lag = rnd() & 3;
const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1);
for (int n = 0; n < num_y_pos; n++)
fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128;
const int num_uv_pos = num_y_pos + 1;
for (int n = 0; n < num_uv_pos; n++)
fg_data[0].ar_coeffs_uv[uv_pl][n] = (rnd() & 0xff) - 128;
dsp->generate_grain_y(grain_lut[0], fg_data HIGHBD_TAIL_SUFFIX);
dsp->generate_grain_uv[layout_idx](grain_lut[1], grain_lut[0],
fg_data, uv_pl HIGHBD_TAIL_SUFFIX);
if (csfl) {
fg_data[0].num_y_points = 2 + (rnd() % 13);
const int pad = 0xff / fg_data[0].num_y_points;
for (int n = 0; n < fg_data[0].num_y_points; n++) {
fg_data[0].y_points[n][0] = 0xff * n / fg_data[0].num_y_points;
fg_data[0].y_points[n][0] += rnd() % pad;
fg_data[0].y_points[n][1] = rnd() & 0xff;
}
generate_scaling(bitdepth_from_max(bitdepth_max), fg_data[0].y_points,
fg_data[0].num_y_points, scaling);
} else {
fg_data[0].num_uv_points[uv_pl] = 2 + (rnd() % 9);
const int pad = 0xff / fg_data[0].num_uv_points[uv_pl];
for (int n = 0; n < fg_data[0].num_uv_points[uv_pl]; n++) {
fg_data[0].uv_points[uv_pl][n][0] = 0xff * n / fg_data[0].num_uv_points[uv_pl];
fg_data[0].uv_points[uv_pl][n][0] += rnd() % pad;
fg_data[0].uv_points[uv_pl][n][1] = rnd() & 0xff;
}
generate_scaling(bitdepth_from_max(bitdepth_max), fg_data[0].uv_points[uv_pl],
fg_data[0].num_uv_points[uv_pl], scaling);
fg_data[0].uv_mult[uv_pl] = (rnd() & 0xff) - 128;
fg_data[0].uv_luma_mult[uv_pl] = (rnd() & 0xff) - 128;
fg_data[0].uv_offset[uv_pl] = (rnd() & 0x1ff) - 256;
}
fg_data[0].clip_to_restricted_range = rnd() & 1;
fg_data[0].scaling_shift = (rnd() & 3) + 8;
fg_data[0].chroma_scaling_from_luma = csfl;
for (fg_data[0].overlap_flag = 0; fg_data[0].overlap_flag <= 1;
fg_data[0].overlap_flag++)
{
for (int i = 0; i <= 2 * fg_data[0].overlap_flag; i++) {
int w, h, row_num;
if (fg_data[0].overlap_flag) {
w = (36 >> ss_x) + (rnd() % (92 >> ss_x));
if (i == 0) {
row_num = 0;
h = 1 + (rnd() & (31 >> ss_y));
} else {
row_num = 1 + (rnd() & 0x7ff);
if (i == 1) {
h = (ss_y ? 2 : 3) + (rnd() % (ss_y ? 15 : 30));
} else {
h = ss_y ? 1 : 1 + (rnd() & 1);
}
}
} else {
w = 1 + (rnd() & (127 >> ss_x));
h = 1 + (rnd() & (31 >> ss_y));
row_num = rnd() & 0x7ff;
}
for (int y = 0; y < 32; y++) {
// Src pixels past the right edge can be uninitialized
for (int x = 0; x < 128; x++) {
src[y * PXSTRIDE(stride) + x] = rnd();
luma_src[y * PXSTRIDE(lstride) + x] = rnd();
}
for (int x = 0; x < w; x++)
src[y * PXSTRIDE(stride) + x] &= bitdepth_max;
for (int x = 0; x < (w << ss_x); x++)
luma_src[y * PXSTRIDE(lstride) + x] &= bitdepth_max;
}
CLEAR_PIXEL_RECT(c_dst);
CLEAR_PIXEL_RECT(a_dst);
call_ref(c_dst, src, stride, fg_data, w, scaling, grain_lut[1], h,
row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
call_new(a_dst, src, stride, fg_data, w, scaling, grain_lut[1], h,
row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel_padded_align(c_dst, stride,
a_dst, stride,
w, h, "dst",
32 >> ss_x, 2);
}
}
fg_data[0].overlap_flag = 1;
for (int y = 0; y < 32; y++) {
// Make sure all pixels are in range
for (int x = 0; x < 128; x++) {
src[y * PXSTRIDE(stride) + x] &= bitdepth_max;
luma_src[y * PXSTRIDE(lstride) + x] &= bitdepth_max;
}
}
bench_new(a_dst, src, stride, fg_data, 32, scaling, grain_lut[1], 16,
1, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
}
}
}
report("fguv_32x32xn");
}
void bitfn(checkasm_check_filmgrain)(void) {
Dav1dFilmGrainDSPContext c;
bitfn(dav1d_film_grain_dsp_init)(&c);
check_gen_grny(&c);
check_gen_grnuv(&c);
check_fgy_sbrow(&c);
check_fguv_sbrow(&c);
}
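For reference when reading the loops above, the AR coefficient count depends only on the lag: num_y_pos = 2 * lag * (lag + 1), i.e. 0, 4, 12 or 24 luma coefficients for lags 0 through 3, and chroma uses one extra coefficient whenever luma grain is present (num_uv_pos = num_y_pos + !!num_y_points). A small stand-alone sketch of that arithmetic:
#include <stdio.h>
/* Sketch: the AR coefficient counts exercised by the film grain tests above. */
int main(void) {
    for (int lag = 0; lag <= 3; lag++) {
        const int num_y_pos = 2 * lag * (lag + 1);  /* 0, 4, 12, 24 */
        printf("lag=%d: luma=%d chroma<=%d\n", lag, num_y_pos, num_y_pos + 1);
    }
    return 0;
}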

289
third_party/dav1d/tests/checkasm/ipred.c vendored Normal file

@ -0,0 +1,289 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "tests/checkasm/checkasm.h"
#include "src/ipred.h"
#include "src/levels.h"
#include <stdio.h>
static const char *const intra_pred_mode_names[N_IMPL_INTRA_PRED_MODES] = {
[DC_PRED] = "dc",
[DC_128_PRED] = "dc_128",
[TOP_DC_PRED] = "dc_top",
[LEFT_DC_PRED] = "dc_left",
[HOR_PRED] = "h",
[VERT_PRED] = "v",
[PAETH_PRED] = "paeth",
[SMOOTH_PRED] = "smooth",
[SMOOTH_V_PRED] = "smooth_v",
[SMOOTH_H_PRED] = "smooth_h",
[Z1_PRED] = "z1",
[Z2_PRED] = "z2",
[Z3_PRED] = "z3",
[FILTER_PRED] = "filter"
};
static const char *const cfl_ac_names[3] = { "420", "422", "444" };
static const char *const cfl_pred_mode_names[DC_128_PRED + 1] = {
[DC_PRED] = "cfl",
[DC_128_PRED] = "cfl_128",
[TOP_DC_PRED] = "cfl_top",
[LEFT_DC_PRED] = "cfl_left",
};
static const uint8_t z_angles[27] = {
3, 6, 9,
14, 17, 20, 23, 26, 29, 32,
36, 39, 42, 45, 48, 51, 54,
58, 61, 64, 67, 70, 73, 76,
81, 84, 87
};
static void check_intra_pred(Dav1dIntraPredDSPContext *const c) {
PIXEL_RECT(c_dst, 64, 64);
PIXEL_RECT(a_dst, 64, 64);
ALIGN_STK_64(pixel, topleft_buf, 257,);
pixel *const topleft = topleft_buf + 128;
declare_func(void, pixel *dst, ptrdiff_t stride, const pixel *topleft,
int width, int height, int angle, int max_width, int max_height
HIGHBD_DECL_SUFFIX);
for (int mode = 0; mode < N_IMPL_INTRA_PRED_MODES; mode++) {
int bpc_min = BITDEPTH, bpc_max = BITDEPTH;
if (mode == FILTER_PRED && BITDEPTH == 16) {
bpc_min = 10;
bpc_max = 12;
}
for (int bpc = bpc_min; bpc <= bpc_max; bpc += 2)
for (int w = 4; w <= (mode == FILTER_PRED ? 32 : 64); w <<= 1)
if (check_func(c->intra_pred[mode], "intra_pred_%s_w%d_%dbpc",
intra_pred_mode_names[mode], w, bpc))
{
for (int h = imax(w / 4, 4); h <= imin(w * 4,
(mode == FILTER_PRED ? 32 : 64)); h <<= 1)
{
const ptrdiff_t stride = c_dst_stride;
int a = 0, maxw = 0, maxh = 0;
if (mode >= Z1_PRED && mode <= Z3_PRED) { /* angle */
a = (90 * (mode - Z1_PRED) + z_angles[rnd() % 27]) |
(rnd() & 0x600);
if (mode == Z2_PRED) {
maxw = rnd(), maxh = rnd();
maxw = 1 + (maxw & (maxw & 4096 ? 4095 : w - 1));
maxh = 1 + (maxh & (maxh & 4096 ? 4095 : h - 1));
}
} else if (mode == FILTER_PRED) /* filter_idx */
a = (rnd() % 5) | (rnd() & ~511);
int bitdepth_max;
if (bpc == 16)
bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
else
bitdepth_max = (1 << bpc) - 1;
for (int i = -h * 2; i <= w * 2; i++)
topleft[i] = rnd() & bitdepth_max;
CLEAR_PIXEL_RECT(c_dst);
CLEAR_PIXEL_RECT(a_dst);
call_ref(c_dst, stride, topleft, w, h, a, maxw, maxh
HIGHBD_TAIL_SUFFIX);
call_new(a_dst, stride, topleft, w, h, a, maxw, maxh
HIGHBD_TAIL_SUFFIX);
if (checkasm_check_pixel_padded(c_dst, stride,
a_dst, stride,
w, h, "dst"))
{
if (mode == Z1_PRED || mode == Z3_PRED)
fprintf(stderr, "angle = %d (0x%03x)\n",
a & 0x1ff, a & 0x600);
else if (mode == Z2_PRED)
fprintf(stderr, "angle = %d (0x%03x), "
"max_width = %d, max_height = %d\n",
a & 0x1ff, a & 0x600, maxw, maxh);
else if (mode == FILTER_PRED)
fprintf(stderr, "filter_idx = %d\n", a & 0x1ff);
}
bench_new(a_dst, stride, topleft, w, h, a, 128, 128
HIGHBD_TAIL_SUFFIX);
}
}
}
report("intra_pred");
}
static void check_cfl_ac(Dav1dIntraPredDSPContext *const c) {
ALIGN_STK_64(int16_t, c_dst, 32 * 32,);
ALIGN_STK_64(int16_t, a_dst, 32 * 32,);
ALIGN_STK_64(pixel, luma, 32 * 32,);
declare_func(void, int16_t *ac, const pixel *y, ptrdiff_t stride,
int w_pad, int h_pad, int cw, int ch);
for (int layout = 1; layout <= DAV1D_PIXEL_LAYOUT_I444; layout++) {
const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
const int h_step = 2 >> ss_hor, v_step = 2 >> ss_ver;
for (int w = 4; w <= (32 >> ss_hor); w <<= 1)
if (check_func(c->cfl_ac[layout - 1], "cfl_ac_%s_w%d_%dbpc",
cfl_ac_names[layout - 1], w, BITDEPTH))
{
for (int h = imax(w / 4, 4);
h <= imin(w * 4, (32 >> ss_ver)); h <<= 1)
{
const ptrdiff_t stride = 32 * sizeof(pixel);
for (int w_pad = imax((w >> 2) - h_step, 0);
w_pad >= 0; w_pad -= h_step)
{
for (int h_pad = imax((h >> 2) - v_step, 0);
h_pad >= 0; h_pad -= v_step)
{
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
const int bitdepth_max = 0xff;
#endif
for (int y = 0; y < (h << ss_ver); y++)
for (int x = 0; x < (w << ss_hor); x++)
luma[y * 32 + x] = rnd() & bitdepth_max;
call_ref(c_dst, luma, stride, w_pad, h_pad, w, h);
call_new(a_dst, luma, stride, w_pad, h_pad, w, h);
checkasm_check(int16_t, c_dst, w * sizeof(*c_dst),
a_dst, w * sizeof(*a_dst),
w, h, "dst");
}
}
bench_new(a_dst, luma, stride, 0, 0, w, h);
}
}
}
report("cfl_ac");
}
static void check_cfl_pred(Dav1dIntraPredDSPContext *const c) {
ALIGN_STK_64(pixel, c_dst, 32 * 32,);
ALIGN_STK_64(pixel, a_dst, 32 * 32,);
ALIGN_STK_64(int16_t, ac, 32 * 32,);
ALIGN_STK_64(pixel, topleft_buf, 257,);
pixel *const topleft = topleft_buf + 128;
declare_func(void, pixel *dst, ptrdiff_t stride, const pixel *topleft,
int width, int height, const int16_t *ac, int alpha
HIGHBD_DECL_SUFFIX);
for (int mode = 0; mode <= DC_128_PRED; mode += 1 + 2 * !mode)
for (int w = 4; w <= 32; w <<= 1)
if (check_func(c->cfl_pred[mode], "cfl_pred_%s_w%d_%dbpc",
cfl_pred_mode_names[mode], w, BITDEPTH))
{
for (int h = imax(w / 4, 4); h <= imin(w * 4, 32); h <<= 1)
{
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
const int bitdepth_max = 0xff;
#endif
const ptrdiff_t stride = w * sizeof(pixel);
int alpha = ((rnd() & 15) + 1) * (1 - (rnd() & 2));
for (int i = -h * 2; i <= w * 2; i++)
topleft[i] = rnd() & bitdepth_max;
int luma_avg = w * h >> 1;
for (int i = 0; i < w * h; i++)
luma_avg += ac[i] = rnd() & (bitdepth_max << 3);
luma_avg /= w * h;
for (int i = 0; i < w * h; i++)
ac[i] -= luma_avg;
call_ref(c_dst, stride, topleft, w, h, ac, alpha
HIGHBD_TAIL_SUFFIX);
call_new(a_dst, stride, topleft, w, h, ac, alpha
HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel(c_dst, stride, a_dst, stride,
w, h, "dst");
bench_new(a_dst, stride, topleft, w, h, ac, alpha
HIGHBD_TAIL_SUFFIX);
}
}
report("cfl_pred");
}
static void check_pal_pred(Dav1dIntraPredDSPContext *const c) {
ALIGN_STK_64(pixel, c_dst, 64 * 64,);
ALIGN_STK_64(pixel, a_dst, 64 * 64,);
ALIGN_STK_64(uint8_t, idx, 64 * 64,);
ALIGN_STK_16(uint16_t, pal, 8,);
declare_func(void, pixel *dst, ptrdiff_t stride, const uint16_t *pal,
const uint8_t *idx, int w, int h);
for (int w = 4; w <= 64; w <<= 1)
if (check_func(c->pal_pred, "pal_pred_w%d_%dbpc", w, BITDEPTH))
for (int h = imax(w / 4, 4); h <= imin(w * 4, 64); h <<= 1)
{
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
const int bitdepth_max = 0xff;
#endif
const ptrdiff_t stride = w * sizeof(pixel);
for (int i = 0; i < 8; i++)
pal[i] = rnd() & bitdepth_max;
for (int i = 0; i < w * h; i++)
idx[i] = rnd() & 7;
call_ref(c_dst, stride, pal, idx, w, h);
call_new(a_dst, stride, pal, idx, w, h);
checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst");
bench_new(a_dst, stride, pal, idx, w, h);
}
report("pal_pred");
}
void bitfn(checkasm_check_ipred)(void) {
Dav1dIntraPredDSPContext c;
bitfn(dav1d_intra_pred_dsp_init)(&c);
check_intra_pred(&c);
check_cfl_ac(&c);
check_cfl_pred(&c);
check_pal_pred(&c);
}
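One note on the packed angle argument used throughout check_intra_pred() above: the low 9 bits carry the actual value (the directional angle for z1/z2/z3, or the filter index for filter prediction), while bits 9-10 (the 0x600 mask) carry extra flag bits that the prediction functions interpret themselves, which is why failures print the two parts separately. A sketch of that packing, with the flag semantics deliberately left opaque:
/* Sketch of the angle packing used by check_intra_pred(); the meaning of the
 * flag bits is owned by the ipred DSP code and not spelled out here. */
static inline int pack_angle(const int angle, const int flags) {
    return (angle & 0x1ff) | (flags & 0x600);
}
static inline int unpack_angle(const int a) { return a & 0x1ff; }
static inline int unpack_flags(const int a) { return a & 0x600; }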

313
third_party/dav1d/tests/checkasm/itx.c vendored Normal file

@ -0,0 +1,313 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "tests/checkasm/checkasm.h"
#include <math.h>
#include "src/itx.h"
#include "src/levels.h"
#include "src/scan.h"
#include "src/tables.h"
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
#ifndef M_SQRT1_2
#define M_SQRT1_2 0.707106781186547524401
#endif
enum Tx1D { DCT, ADST, FLIPADST, IDENTITY, WHT };
static const uint8_t itx_1d_types[N_TX_TYPES_PLUS_LL][2] = {
[DCT_DCT] = { DCT, DCT },
[ADST_DCT] = { DCT, ADST },
[DCT_ADST] = { ADST, DCT },
[ADST_ADST] = { ADST, ADST },
[FLIPADST_DCT] = { DCT, FLIPADST },
[DCT_FLIPADST] = { FLIPADST, DCT },
[FLIPADST_FLIPADST] = { FLIPADST, FLIPADST },
[ADST_FLIPADST] = { FLIPADST, ADST },
[FLIPADST_ADST] = { ADST, FLIPADST },
[IDTX] = { IDENTITY, IDENTITY },
[V_DCT] = { IDENTITY, DCT },
[H_DCT] = { DCT, IDENTITY },
[V_ADST] = { IDENTITY, ADST },
[H_ADST] = { ADST, IDENTITY },
[V_FLIPADST] = { IDENTITY, FLIPADST },
[H_FLIPADST] = { FLIPADST, IDENTITY },
[WHT_WHT] = { WHT, WHT },
};
static const char *const itx_1d_names[5] = {
[DCT] = "dct",
[ADST] = "adst",
[FLIPADST] = "flipadst",
[IDENTITY] = "identity",
[WHT] = "wht"
};
static const double scaling_factors[9] = {
4.0000, /* 4x4 */
4.0000 * M_SQRT1_2, /* 4x8 8x4 */
2.0000, /* 4x16 8x8 16x4 */
2.0000 * M_SQRT1_2, /* 8x16 16x8 */
1.0000, /* 8x32 16x16 32x8 */
0.5000 * M_SQRT1_2, /* 16x32 32x16 */
0.2500, /* 16x64 32x32 64x16 */
0.1250 * M_SQRT1_2, /* 32x64 64x32 */
0.0625, /* 64x64 */
};
/* FIXME: Ensure that those forward transforms are similar to the real AV1
* transforms. The FLIPADST currently uses the ADST forward transform for
* example which is obviously "incorrect", but we're just using it for now
* since it does produce coefficients in the correct range at least. */
/* DCT-II */
static void fdct_1d(double *const out, const double *const in, const int sz) {
for (int i = 0; i < sz; i++) {
out[i] = 0.0;
for (int j = 0; j < sz; j++)
out[i] += in[j] * cos(M_PI * (2 * j + 1) * i / (sz * 2.0));
}
out[0] *= M_SQRT1_2;
}
/* See "Towards jointly optimal spatial prediction and adaptive transform in
* video/image coding", by J. Han, A. Saxena, and K. Rose
* IEEE Proc. ICASSP, pp. 726-729, Mar. 2010.
* and "A Butterfly Structured Design of The Hybrid Transform Coding Scheme",
* by Jingning Han, Yaowu Xu, and Debargha Mukherjee
* http://research.google.com/pubs/archive/41418.pdf
*/
static void fadst_1d(double *const out, const double *const in, const int sz) {
for (int i = 0; i < sz; i++) {
out[i] = 0.0;
for (int j = 0; j < sz; j++)
out[i] += in[j] * sin(M_PI *
(sz == 4 ? ( j + 1) * (2 * i + 1) / (8.0 + 1.0) :
(2 * j + 1) * (2 * i + 1) / (sz * 4.0)));
}
}
static void fwht4_1d(double *const out, const double *const in)
{
const double t0 = in[0] + in[1];
const double t3 = in[3] - in[2];
const double t4 = (t0 - t3) * 0.5;
const double t1 = t4 - in[1];
const double t2 = t4 - in[2];
out[0] = t0 - t2;
out[1] = t2;
out[2] = t3 + t1;
out[3] = t1;
}
static int copy_subcoefs(coef *coeff,
const enum RectTxfmSize tx, const enum TxfmType txtp,
const int sw, const int sh, const int subsh)
{
/* copy the topleft coefficients such that the return value (being the
* coefficient scantable index for the eob token) guarantees that only
* the topleft $sub out of $sz (where $sz >= $sub) coefficients in both
* dimensions are non-zero. This leads to branching to specific optimized
* simd versions (e.g. dc-only) so that we get full asm coverage in this
* test */
const enum TxClass tx_class = dav1d_tx_type_class[txtp];
const uint16_t *const scan = dav1d_scans[tx];
const int sub_high = subsh > 0 ? subsh * 8 - 1 : 0;
const int sub_low = subsh > 1 ? sub_high - 8 : 0;
int n, eob;
for (n = 0, eob = 0; n < sw * sh; n++) {
int rc, rcx, rcy;
if (tx_class == TX_CLASS_2D)
rc = scan[n], rcx = rc % sh, rcy = rc / sh;
else if (tx_class == TX_CLASS_H)
rcx = n % sh, rcy = n / sh, rc = n;
else /* tx_class == TX_CLASS_V */
rcx = n / sw, rcy = n % sw, rc = rcy * sh + rcx;
/* Pick a random eob within this sub-itx */
if (rcx > sub_high || rcy > sub_high) {
break; /* upper boundary */
} else if (!eob && (rcx > sub_low || rcy > sub_low))
eob = n; /* lower boundary */
}
if (eob)
eob += rnd() % (n - eob - 1);
if (tx_class == TX_CLASS_2D)
for (n = eob + 1; n < sw * sh; n++)
coeff[scan[n]] = 0;
else if (tx_class == TX_CLASS_H)
for (n = eob + 1; n < sw * sh; n++)
coeff[n] = 0;
else /* tx_class == TX_CLASS_V */ {
for (int rcx = eob / sw, rcy = eob % sw; rcx < sh; rcx++, rcy = -1)
while (++rcy < sw)
coeff[rcy * sh + rcx] = 0;
n = sw * sh;
}
for (; n < 32 * 32; n++)
coeff[n] = rnd();
return eob;
}
static int ftx(coef *const buf, const enum RectTxfmSize tx,
const enum TxfmType txtp, const int w, const int h,
const int subsh, const int bitdepth_max)
{
double out[64 * 64], temp[64 * 64];
const double scale = scaling_factors[ctz(w * h) - 4];
const int sw = imin(w, 32), sh = imin(h, 32);
for (int i = 0; i < h; i++) {
double in[64], temp_out[64];
for (int i = 0; i < w; i++)
in[i] = (rnd() & (2 * bitdepth_max + 1)) - bitdepth_max;
switch (itx_1d_types[txtp][0]) {
case DCT:
fdct_1d(temp_out, in, w);
break;
case ADST:
case FLIPADST:
fadst_1d(temp_out, in, w);
break;
case WHT:
fwht4_1d(temp_out, in);
break;
case IDENTITY:
memcpy(temp_out, in, w * sizeof(*temp_out));
break;
}
for (int j = 0; j < w; j++)
temp[j * h + i] = temp_out[j] * scale;
}
for (int i = 0; i < w; i++) {
switch (itx_1d_types[txtp][1]) {
case DCT:
fdct_1d(&out[i * h], &temp[i * h], h);
break;
case ADST:
case FLIPADST:
fadst_1d(&out[i * h], &temp[i * h], h);
break;
case WHT:
fwht4_1d(&out[i * h], &temp[i * h]);
break;
case IDENTITY:
memcpy(&out[i * h], &temp[i * h], h * sizeof(*out));
break;
}
}
for (int y = 0; y < sh; y++)
for (int x = 0; x < sw; x++)
buf[y * sw + x] = (coef) (out[y * w + x] + 0.5);
return copy_subcoefs(buf, tx, txtp, sw, sh, subsh);
}
static void check_itxfm_add(Dav1dInvTxfmDSPContext *const c,
const enum RectTxfmSize tx)
{
ALIGN_STK_64(coef, coeff, 2, [32 * 32]);
ALIGN_STK_64(pixel, c_dst, 64 * 64,);
ALIGN_STK_64(pixel, a_dst, 64 * 64,);
static const uint8_t subsh_iters[5] = { 2, 2, 3, 5, 5 };
const int w = dav1d_txfm_dimensions[tx].w * 4;
const int h = dav1d_txfm_dimensions[tx].h * 4;
const int subsh_max = subsh_iters[imax(dav1d_txfm_dimensions[tx].lw,
dav1d_txfm_dimensions[tx].lh)];
#if BITDEPTH == 16
const int bpc_min = 10, bpc_max = 12;
#else
const int bpc_min = 8, bpc_max = 8;
#endif
declare_func(void, pixel *dst, ptrdiff_t dst_stride, coef *coeff,
int eob HIGHBD_DECL_SUFFIX);
for (int bpc = bpc_min; bpc <= bpc_max; bpc += 2) {
bitfn(dav1d_itx_dsp_init)(c, bpc);
for (enum TxfmType txtp = 0; txtp < N_TX_TYPES_PLUS_LL; txtp++)
for (int subsh = 0; subsh < subsh_max; subsh++)
if (check_func(c->itxfm_add[tx][txtp],
"inv_txfm_add_%dx%d_%s_%s_%d_%dbpc",
w, h, itx_1d_names[itx_1d_types[txtp][0]],
itx_1d_names[itx_1d_types[txtp][1]], subsh,
bpc))
{
const int bitdepth_max = (1 << bpc) - 1;
const int eob = ftx(coeff[0], tx, txtp, w, h, subsh, bitdepth_max);
memcpy(coeff[1], coeff[0], sizeof(*coeff));
for (int j = 0; j < w * h; j++)
c_dst[j] = a_dst[j] = rnd() & bitdepth_max;
call_ref(c_dst, w * sizeof(*c_dst), coeff[0], eob
HIGHBD_TAIL_SUFFIX);
call_new(a_dst, w * sizeof(*c_dst), coeff[1], eob
HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel(c_dst, w * sizeof(*c_dst),
a_dst, w * sizeof(*a_dst),
w, h, "dst");
if (memcmp(coeff[0], coeff[1], sizeof(*coeff)))
fail();
bench_new(a_dst, w * sizeof(*c_dst), coeff[0], eob
HIGHBD_TAIL_SUFFIX);
}
}
report("add_%dx%d", w, h);
}
void bitfn(checkasm_check_itx)(void) {
static const uint8_t txfm_size_order[N_RECT_TX_SIZES] = {
TX_4X4, RTX_4X8, RTX_4X16,
RTX_8X4, TX_8X8, RTX_8X16, RTX_8X32,
RTX_16X4, RTX_16X8, TX_16X16, RTX_16X32, RTX_16X64,
RTX_32X8, RTX_32X16, TX_32X32, RTX_32X64,
RTX_64X16, RTX_64X32, TX_64X64
};
/* Zero unused function pointer elements. */
Dav1dInvTxfmDSPContext c = { { { 0 } } };
for (int i = 0; i < N_RECT_TX_SIZES; i++)
check_itxfm_add(&c, txfm_size_order[i]);
}
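The scaling_factors table above is indexed purely by transform area, idx = ctz(w * h) - 4, so all sizes with the same pixel count share one scale. A short sketch with a few worked values:
/* Sketch: reproduce the scaling_factors index used by ftx() above.
 * 4x4 -> ctz(16) - 4 = 0 (4.0), 16x8 -> ctz(128) - 4 = 3 (2.0 * M_SQRT1_2),
 * 32x32 -> ctz(1024) - 4 = 6 (0.25), 64x64 -> ctz(4096) - 4 = 8 (0.0625). */
static int scaling_factor_idx(const int w, const int h) {
    return ctz(w * h) - 4;  /* ctz() is dav1d's count-trailing-zeros helper */
}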

203
third_party/dav1d/tests/checkasm/loopfilter.c vendored Normal file

@ -0,0 +1,203 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "tests/checkasm/checkasm.h"
#include <string.h>
#include "src/levels.h"
#include "src/loopfilter.h"
static void init_lpf_border(pixel *const dst, const ptrdiff_t stride,
int E, int I, const int bitdepth_max)
{
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
const int F = 1 << bitdepth_min_8;
E <<= bitdepth_min_8;
I <<= bitdepth_min_8;
const int filter_type = rnd() % 4;
const int edge_diff = rnd() % ((E + 2) * 4) - 2 * (E + 2);
switch (filter_type) {
case 0: // random, unfiltered
for (int i = -8; i < 8; i++)
dst[i * stride] = rnd() & bitdepth_max;
break;
case 1: // long flat
dst[-8 * stride] = rnd() & bitdepth_max;
dst[+7 * stride] = rnd() & bitdepth_max;
dst[+0 * stride] = rnd() & bitdepth_max;
dst[-1 * stride] = iclip_pixel(dst[+0 * stride] + edge_diff);
for (int i = 1; i < 7; i++) {
dst[-(1 + i) * stride] = iclip_pixel(dst[-1 * stride] +
rnd() % (2 * (F + 1)) - (F + 1));
dst[+(0 + i) * stride] = iclip_pixel(dst[+0 * stride] +
rnd() % (2 * (F + 1)) - (F + 1));
}
break;
case 2: // short flat
for (int i = 4; i < 8; i++) {
dst[-(1 + i) * stride] = rnd() & bitdepth_max;
dst[+(0 + i) * stride] = rnd() & bitdepth_max;
}
dst[+0 * stride] = rnd() & bitdepth_max;
dst[-1 * stride] = iclip_pixel(dst[+0 * stride] + edge_diff);
for (int i = 1; i < 4; i++) {
dst[-(1 + i) * stride] = iclip_pixel(dst[-1 * stride] +
rnd() % (2 * (F + 1)) - (F + 1));
dst[+(0 + i) * stride] = iclip_pixel(dst[+0 * stride] +
rnd() % (2 * (F + 1)) - (F + 1));
}
break;
case 3: // normal or hev
for (int i = 4; i < 8; i++) {
dst[-(1 + i) * stride] = rnd() & bitdepth_max;
dst[+(0 + i) * stride] = rnd() & bitdepth_max;
}
dst[+0 * stride] = rnd() & bitdepth_max;
dst[-1 * stride] = iclip_pixel(dst[+0 * stride] + edge_diff);
for (int i = 1; i < 4; i++) {
dst[-(1 + i) * stride] = iclip_pixel(dst[-(0 + i) * stride] +
rnd() % (2 * (I + 1)) - (I + 1));
dst[+(0 + i) * stride] = iclip_pixel(dst[+(i - 1) * stride] +
rnd() % (2 * (I + 1)) - (I + 1));
}
break;
}
}
static void check_lpf_sb(loopfilter_sb_fn fn, const char *const name,
const int n_blks, const int lf_idx,
const int is_chroma, const int dir)
{
ALIGN_STK_64(pixel, c_dst_mem, 128 * 16,);
ALIGN_STK_64(pixel, a_dst_mem, 128 * 16,);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const uint32_t *mask,
const uint8_t (*l)[4], ptrdiff_t b4_stride,
const Av1FilterLUT *lut, int w HIGHBD_DECL_SUFFIX);
pixel *a_dst, *c_dst;
ptrdiff_t stride, b4_stride;
int w, h;
if (dir) {
a_dst = a_dst_mem + 128 * 8;
c_dst = c_dst_mem + 128 * 8;
w = 128;
h = 16;
b4_stride = 32;
} else {
a_dst = a_dst_mem + 8;
c_dst = c_dst_mem + 8;
w = 16;
h = 128;
b4_stride = 2;
}
stride = w * sizeof(pixel);
Av1FilterLUT lut;
const int sharp = rnd() & 7;
for (int level = 0; level < 64; level++) {
int limit = level;
if (sharp > 0) {
limit >>= (sharp + 3) >> 2;
limit = imin(limit, 9 - sharp);
}
limit = imax(limit, 1);
lut.i[level] = limit;
lut.e[level] = 2 * (level + 2) + limit;
}
lut.sharp[0] = (sharp + 3) >> 2;
lut.sharp[1] = sharp ? 9 - sharp : 0xff;
const int n_strengths = is_chroma ? 2 : 3;
for (int i = 0; i < n_strengths; i++) {
if (check_func(fn, "%s_w%d_%dbpc", name,
is_chroma ? 4 + 2 * i : 4 << i, BITDEPTH))
{
uint32_t vmask[4] = { 0 };
uint8_t l[32 * 2][4];
for (int j = 0; j < n_blks; j++) {
const int idx = rnd() % (i + 2);
if (idx) vmask[idx - 1] |= 1U << j;
if (dir) {
l[j][lf_idx] = rnd() & 63;
l[j + 32][lf_idx] = rnd() & 63;
} else {
l[j * 2][lf_idx] = rnd() & 63;
l[j * 2 + 1][lf_idx] = rnd() & 63;
}
}
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
const int bitdepth_max = 0xff;
#endif
for (int i = 0; i < 4 * n_blks; i++) {
const int x = i >> 2;
int L;
if (dir) {
L = l[32 + x][lf_idx] ? l[32 + x][lf_idx] : l[x][lf_idx];
} else {
L = l[2 * x + 1][lf_idx] ? l[2 * x + 1][lf_idx] : l[2 * x][lf_idx];
}
init_lpf_border(c_dst + i * (dir ? 1 : 16), dir ? 128 : 1,
lut.e[L], lut.i[L], bitdepth_max);
}
memcpy(a_dst_mem, c_dst_mem, 128 * sizeof(pixel) * 16);
call_ref(c_dst, stride,
vmask, (const uint8_t(*)[4]) &l[dir ? 32 : 1][lf_idx], b4_stride,
&lut, n_blks HIGHBD_TAIL_SUFFIX);
call_new(a_dst, stride,
vmask, (const uint8_t(*)[4]) &l[dir ? 32 : 1][lf_idx], b4_stride,
&lut, n_blks HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel(c_dst_mem, stride, a_dst_mem, stride,
w, h, "dst");
bench_new(a_dst, stride,
vmask, (const uint8_t(*)[4]) &l[dir ? 32 : 1][lf_idx], b4_stride,
&lut, n_blks HIGHBD_TAIL_SUFFIX);
}
}
report(name);
}
void bitfn(checkasm_check_loopfilter)(void) {
Dav1dLoopFilterDSPContext c;
bitfn(dav1d_loop_filter_dsp_init)(&c);
check_lpf_sb(c.loop_filter_sb[0][0], "lpf_h_sb_y", 32, 0, 0, 0);
check_lpf_sb(c.loop_filter_sb[0][1], "lpf_v_sb_y", 32, 1, 0, 1);
check_lpf_sb(c.loop_filter_sb[1][0], "lpf_h_sb_uv", 16, 2, 1, 0);
check_lpf_sb(c.loop_filter_sb[1][1], "lpf_v_sb_uv", 16, 2, 1, 1);
}
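The Av1FilterLUT setup in check_lpf_sb() above mirrors the AV1 loop-filter parameter derivation; as a worked instance of the loop, with sharp = 4 and level = 32 it yields

    limit = imax(imin(32 >> ((4 + 3) >> 2), 9 - 4), 1) = 5
    E     = 2 * (32 + 2) + 5 = 73
    sharp[0] = (4 + 3) >> 2 = 1,  sharp[1] = 9 - 4 = 5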

202
third_party/dav1d/tests/checkasm/looprestoration.c vendored Normal file

@ -0,0 +1,202 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "tests/checkasm/checkasm.h"
#include <stdio.h>
#include <string.h>
#include "src/levels.h"
#include "src/looprestoration.h"
#include "src/tables.h"
static int to_binary(int x) { /* 0-15 -> 0000-1111 */
return (x & 1) + 5 * (x & 2) + 25 * (x & 4) + 125 * (x & 8);
}
static void init_tmp(pixel *buf, const ptrdiff_t stride,
const int w, const int h, const int bitdepth_max)
{
const int noise_mask = bitdepth_max >> 4;
const int x_off = rnd() & 7, y_off = rnd() & 7;
for (int y = 0; y < h; y++) {
for (int x = 0; x < w; x++) {
buf[x] = (((x + x_off) ^ (y + y_off)) & 8 ? bitdepth_max : 0) ^
(rnd() & noise_mask);
}
buf += PXSTRIDE(stride);
}
}
static void check_wiener(Dav1dLoopRestorationDSPContext *const c, const int bpc) {
ALIGN_STK_64(pixel, c_src, 448 * 64,), *const c_dst = c_src + 32;
ALIGN_STK_64(pixel, a_src, 448 * 64,), *const a_dst = a_src + 32;
ALIGN_STK_64(pixel, edge_buf, 448 * 8,), *const h_edge = edge_buf + 32;
pixel left[64][4];
LooprestorationParams params;
int16_t (*const filter)[8] = params.filter;
declare_func(void, pixel *dst, ptrdiff_t dst_stride,
const pixel (*const left)[4],
const pixel *lpf, ptrdiff_t lpf_stride,
int w, int h, const LooprestorationParams *params,
enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);
for (int t = 0; t < 2; t++) {
if (check_func(c->wiener[t], "wiener_%dtap_%dbpc", t ? 5 : 7, bpc)) {
filter[0][0] = filter[0][6] = t ? 0 : (rnd() & 15) - 5;
filter[0][1] = filter[0][5] = (rnd() & 31) - 23;
filter[0][2] = filter[0][4] = (rnd() & 63) - 17;
filter[0][3] = -(filter[0][0] + filter[0][1] + filter[0][2]) * 2;
#if BITDEPTH != 8
filter[0][3] += 128;
#endif
filter[1][0] = filter[1][6] = t ? 0 : (rnd() & 15) - 5;
filter[1][1] = filter[1][5] = (rnd() & 31) - 23;
filter[1][2] = filter[1][4] = (rnd() & 63) - 17;
filter[1][3] = 128 - (filter[1][0] + filter[1][1] + filter[1][2]) * 2;
const int base_w = 1 + (rnd() % 384);
const int base_h = 1 + (rnd() & 63);
const int bitdepth_max = (1 << bpc) - 1;
init_tmp(c_src, 448 * sizeof(pixel), 448, 64, bitdepth_max);
init_tmp(edge_buf, 448 * sizeof(pixel), 448, 8, bitdepth_max);
init_tmp((pixel *) left, 4 * sizeof(pixel), 4, 64, bitdepth_max);
for (enum LrEdgeFlags edges = 0; edges <= 0xf; edges++) {
const int w = edges & LR_HAVE_RIGHT ? 256 : base_w;
const int h = edges & LR_HAVE_BOTTOM ? 64 : base_h;
memcpy(a_src, c_src, 448 * 64 * sizeof(pixel));
call_ref(c_dst, 448 * sizeof(pixel), left,
h_edge, 448 * sizeof(pixel),
w, h, &params, edges HIGHBD_TAIL_SUFFIX);
call_new(a_dst, 448 * sizeof(pixel), left,
h_edge, 448 * sizeof(pixel),
w, h, &params, edges HIGHBD_TAIL_SUFFIX);
if (checkasm_check_pixel(c_dst, 448 * sizeof(pixel),
a_dst, 448 * sizeof(pixel),
w, h, "dst"))
{
fprintf(stderr, "size = %dx%d, edges = %04d\n",
w, h, to_binary(edges));
break;
}
}
bench_new(a_dst, 448 * sizeof(pixel), left,
h_edge, 448 * sizeof(pixel),
256, 64, &params, 0xf HIGHBD_TAIL_SUFFIX);
}
}
}
static void check_sgr(Dav1dLoopRestorationDSPContext *const c, const int bpc) {
ALIGN_STK_64(pixel, c_src, 448 * 64,), *const c_dst = c_src + 32;
ALIGN_STK_64(pixel, a_src, 448 * 64,), *const a_dst = a_src + 32;
ALIGN_STK_64(pixel, edge_buf, 448 * 8,), *const h_edge = edge_buf + 32;
pixel left[64][4];
LooprestorationParams params;
declare_func(void, pixel *dst, ptrdiff_t dst_stride,
const pixel (*const left)[4],
const pixel *lpf, ptrdiff_t lpf_stride,
int w, int h, const LooprestorationParams *params,
enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);
static const struct { char name[4]; uint8_t idx; } sgr_data[3] = {
{ "5x5", 14 },
{ "3x3", 10 },
{ "mix", 0 },
};
for (int i = 0; i < 3; i++) {
if (check_func(c->sgr[i], "sgr_%s_%dbpc", sgr_data[i].name, bpc)) {
const uint16_t *const sgr_params = dav1d_sgr_params[sgr_data[i].idx];
params.sgr.s0 = sgr_params[0];
params.sgr.s1 = sgr_params[1];
params.sgr.w0 = sgr_params[0] ? (rnd() & 127) - 96 : 0;
params.sgr.w1 = (sgr_params[1] ? 160 - (rnd() & 127) : 33) - params.sgr.w0;
const int base_w = 1 + (rnd() % 384);
const int base_h = 1 + (rnd() & 63);
const int bitdepth_max = (1 << bpc) - 1;
init_tmp(c_src, 448 * sizeof(pixel), 448, 64, bitdepth_max);
init_tmp(edge_buf, 448 * sizeof(pixel), 448, 8, bitdepth_max);
init_tmp((pixel *) left, 4 * sizeof(pixel), 4, 64, bitdepth_max);
for (enum LrEdgeFlags edges = 0; edges <= 0xf; edges++) {
const int w = edges & LR_HAVE_RIGHT ? 256 : base_w;
const int h = edges & LR_HAVE_BOTTOM ? 64 : base_h;
memcpy(a_src, c_src, 448 * 64 * sizeof(pixel));
call_ref(c_dst, 448 * sizeof(pixel), left,
h_edge, 448 * sizeof(pixel),
w, h, &params, edges HIGHBD_TAIL_SUFFIX);
call_new(a_dst, 448 * sizeof(pixel), left,
h_edge, 448 * sizeof(pixel),
w, h, &params, edges HIGHBD_TAIL_SUFFIX);
if (checkasm_check_pixel(c_dst, 448 * sizeof(pixel),
a_dst, 448 * sizeof(pixel),
w, h, "dst"))
{
fprintf(stderr, "size = %dx%d, edges = %04d\n",
w, h, to_binary(edges));
break;
}
}
bench_new(a_dst, 448 * sizeof(pixel), left,
h_edge, 448 * sizeof(pixel),
256, 64, &params, 0xf HIGHBD_TAIL_SUFFIX);
}
}
}
void bitfn(checkasm_check_looprestoration)(void) {
#if BITDEPTH == 16
const int bpc_min = 10, bpc_max = 12;
#else
const int bpc_min = 8, bpc_max = 8;
#endif
for (int bpc = bpc_min; bpc <= bpc_max; bpc += 2) {
Dav1dLoopRestorationDSPContext c;
bitfn(dav1d_loop_restoration_dsp_init)(&c, bpc);
check_wiener(&c, bpc);
}
report("wiener");
for (int bpc = bpc_min; bpc <= bpc_max; bpc += 2) {
Dav1dLoopRestorationDSPContext c;
bitfn(dav1d_loop_restoration_dsp_init)(&c, bpc);
check_sgr(&c, bpc);
}
report("sgr");
}
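One detail worth spelling out in the Wiener coefficient setup above: the taps are symmetric (f6 = f0, f5 = f1, f4 = f2), so the overall gain is

    2 * (f0 + f1 + f2) + f3

One filter row (filter[1], and filter[0] as well when bpc > 8) sets f3 = 128 - 2 * (f0 + f1 + f2), making its taps sum to 128, i.e. unit gain in 1/128 steps, while filter[0] at 8 bpc omits the 128 bias so its taps sum to 0, presumably to keep the centre tap inside the narrower range the 8-bit SIMD paths expect.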

756
third_party/dav1d/tests/checkasm/mc.c vendored Normal file

@ -0,0 +1,756 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "tests/checkasm/checkasm.h"
#include "src/levels.h"
#include "src/mc.h"
static const char *const filter_names[] = {
"8tap_regular", "8tap_regular_smooth", "8tap_regular_sharp",
"8tap_sharp_regular", "8tap_sharp_smooth", "8tap_sharp",
"8tap_smooth_regular", "8tap_smooth", "8tap_smooth_sharp",
"bilinear"
};
static const char *const mxy_names[] = { "0", "h", "v", "hv" };
static const char *const scaled_paths[] = { "", "_dy1", "_dy2" };
static int mc_h_next(const int h) {
switch (h) {
case 4:
case 8:
case 16:
return (h * 3) >> 1;
case 6:
case 12:
case 24:
return (h & (h - 1)) * 2;
default:
return h * 2;
}
}
static void check_mc(Dav1dMCDSPContext *const c) {
ALIGN_STK_64(pixel, src_buf, 135 * 135,);
ALIGN_STK_64(pixel, c_dst, 128 * 128,);
ALIGN_STK_64(pixel, a_dst, 128 * 128,);
const pixel *src = src_buf + 135 * 3 + 3;
const ptrdiff_t src_stride = 135 * sizeof(pixel);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *src,
ptrdiff_t src_stride, int w, int h, int mx, int my
HIGHBD_DECL_SUFFIX);
for (int filter = 0; filter < N_2D_FILTERS; filter++)
for (int w = 2; w <= 128; w <<= 1) {
const ptrdiff_t dst_stride = w * sizeof(pixel);
for (int mxy = 0; mxy < 4; mxy++)
if (check_func(c->mc[filter], "mc_%s_w%d_%s_%dbpc",
filter_names[filter], w, mxy_names[mxy], BITDEPTH))
{
const int h_min = w <= 32 ? 2 : w / 4;
const int h_max = imax(imin(w * 4, 128), 32);
for (int h = h_min; h <= h_max; h = mc_h_next(h)) {
const int mx = (mxy & 1) ? rnd() % 15 + 1 : 0;
const int my = (mxy & 2) ? rnd() % 15 + 1 : 0;
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
const int bitdepth_max = 0xff;
#endif
for (int i = 0; i < 135 * 135; i++)
src_buf[i] = rnd() & bitdepth_max;
call_ref(c_dst, dst_stride, src, src_stride, w, h,
mx, my HIGHBD_TAIL_SUFFIX);
call_new(a_dst, dst_stride, src, src_stride, w, h,
mx, my HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel(c_dst, dst_stride,
a_dst, dst_stride,
w, h, "dst");
if (filter == FILTER_2D_8TAP_REGULAR ||
filter == FILTER_2D_BILINEAR)
{
bench_new(a_dst, dst_stride, src, src_stride, w, h,
mx, my HIGHBD_TAIL_SUFFIX);
}
}
}
}
report("mc");
}
/* Generate worst case input in the topleft corner, randomize the rest */
static void generate_mct_input(pixel *const buf, const int bitdepth_max) {
static const int8_t pattern[8] = { -1, 0, -1, 0, 0, -1, 0, -1 };
const int sign = -(rnd() & 1);
for (int y = 0; y < 135; y++)
for (int x = 0; x < 135; x++)
buf[135*y+x] = ((x | y) < 8 ? (pattern[x] ^ pattern[y] ^ sign)
: rnd()) & bitdepth_max;
}
static void check_mct(Dav1dMCDSPContext *const c) {
ALIGN_STK_64(pixel, src_buf, 135 * 135,);
ALIGN_STK_64(int16_t, c_tmp, 128 * 128,);
ALIGN_STK_64(int16_t, a_tmp, 128 * 128,);
const pixel *src = src_buf + 135 * 3 + 3;
const ptrdiff_t src_stride = 135 * sizeof(pixel);
declare_func(void, int16_t *tmp, const pixel *src, ptrdiff_t src_stride,
int w, int h, int mx, int my HIGHBD_DECL_SUFFIX);
for (int filter = 0; filter < N_2D_FILTERS; filter++)
for (int w = 4; w <= 128; w <<= 1)
for (int mxy = 0; mxy < 4; mxy++)
if (check_func(c->mct[filter], "mct_%s_w%d_%s_%dbpc",
filter_names[filter], w, mxy_names[mxy], BITDEPTH))
for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1)
{
const int mx = (mxy & 1) ? rnd() % 15 + 1 : 0;
const int my = (mxy & 2) ? rnd() % 15 + 1 : 0;
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
const int bitdepth_max = 0xff;
#endif
generate_mct_input(src_buf, bitdepth_max);
call_ref(c_tmp, src, src_stride, w, h,
mx, my HIGHBD_TAIL_SUFFIX);
call_new(a_tmp, src, src_stride, w, h,
mx, my HIGHBD_TAIL_SUFFIX);
checkasm_check(int16_t, c_tmp, w * sizeof(*c_tmp),
a_tmp, w * sizeof(*a_tmp),
w, h, "tmp");
if (filter == FILTER_2D_8TAP_REGULAR ||
filter == FILTER_2D_BILINEAR)
{
bench_new(a_tmp, src, src_stride, w, h,
mx, my HIGHBD_TAIL_SUFFIX);
}
}
report("mct");
}
static void check_mc_scaled(Dav1dMCDSPContext *const c) {
ALIGN_STK_64(pixel, src_buf, 263 * 263,);
ALIGN_STK_64(pixel, c_dst, 128 * 128,);
ALIGN_STK_64(pixel, a_dst, 128 * 128,);
const pixel *src = src_buf + 263 * 3 + 3;
const ptrdiff_t src_stride = 263 * sizeof(pixel);
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
const int bitdepth_max = 0xff;
#endif
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *src,
ptrdiff_t src_stride, int w, int h,
int mx, int my, int dx, int dy HIGHBD_DECL_SUFFIX);
for (int filter = 0; filter < N_2D_FILTERS; filter++)
for (int w = 2; w <= 128; w <<= 1) {
const ptrdiff_t dst_stride = w * sizeof(pixel);
for (int p = 0; p < 3; ++p) {
if (check_func(c->mc_scaled[filter], "mc_scaled_%s_w%d%s_%dbpc",
filter_names[filter], w, scaled_paths[p], BITDEPTH))
{
const int h_min = w <= 32 ? 2 : w / 4;
const int h_max = imax(imin(w * 4, 128), 32);
for (int h = h_min; h <= h_max; h = mc_h_next(h)) {
const int mx = rnd() % 1024;
const int my = rnd() % 1024;
const int dx = rnd() % 2048 + 1;
const int dy = !p
? rnd() % 2048 + 1
: p << 10; // ystep=1.0 and ystep=2.0 paths
for (int k = 0; k < 263 * 263; k++)
src_buf[k] = rnd() & bitdepth_max;
call_ref(c_dst, dst_stride, src, src_stride,
w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
call_new(a_dst, dst_stride, src, src_stride,
w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel(c_dst, dst_stride,
a_dst, dst_stride, w, h, "dst");
if (filter == FILTER_2D_8TAP_REGULAR ||
filter == FILTER_2D_BILINEAR)
bench_new(a_dst, dst_stride, src, src_stride,
w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
}
}
}
}
report("mc_scaled");
}
static void check_mct_scaled(Dav1dMCDSPContext *const c) {
ALIGN_STK_64(pixel, src_buf, 263 * 263,);
ALIGN_STK_64(int16_t, c_tmp, 128 * 128,);
ALIGN_STK_64(int16_t, a_tmp, 128 * 128,);
const pixel *src = src_buf + 263 * 3 + 3;
const ptrdiff_t src_stride = 263 * sizeof(pixel);
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
const int bitdepth_max = 0xff;
#endif
declare_func(void, int16_t *tmp, const pixel *src, ptrdiff_t src_stride,
int w, int h, int mx, int my, int dx, int dy HIGHBD_DECL_SUFFIX);
for (int filter = 0; filter < N_2D_FILTERS; filter++)
for (int w = 4; w <= 128; w <<= 1)
for (int p = 0; p < 3; ++p) {
if (check_func(c->mct_scaled[filter], "mct_scaled_%s_w%d%s_%dbpc",
filter_names[filter], w, scaled_paths[p], BITDEPTH))
{
const int h_min = imax(w / 4, 4);
const int h_max = imin(w * 4, 128);
for (int h = h_min; h <= h_max; h = mc_h_next(h)) {
const int mx = rnd() % 1024;
const int my = rnd() % 1024;
const int dx = rnd() % 2048 + 1;
const int dy = !p
? rnd() % 2048 + 1
: p << 10; // ystep=1.0 and ystep=2.0 paths
for (int k = 0; k < 263 * 263; k++)
src_buf[k] = rnd() & bitdepth_max;
call_ref(c_tmp, src, src_stride,
w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
call_new(a_tmp, src, src_stride,
w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
checkasm_check(int16_t, c_tmp, w * sizeof(*c_tmp),
a_tmp, w * sizeof(*a_tmp),
w, h, "tmp");
if (filter == FILTER_2D_8TAP_REGULAR ||
filter == FILTER_2D_BILINEAR)
bench_new(a_tmp, src, src_stride,
w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
}
}
}
report("mct_scaled");
}
static void init_tmp(Dav1dMCDSPContext *const c, pixel *const buf,
int16_t (*const tmp)[128 * 128], const int bitdepth_max)
{
for (int i = 0; i < 2; i++) {
generate_mct_input(buf, bitdepth_max);
c->mct[FILTER_2D_8TAP_SHARP](tmp[i], buf + 135 * 3 + 3,
135 * sizeof(pixel), 128, 128,
8, 8 HIGHBD_TAIL_SUFFIX);
}
}
static void check_avg(Dav1dMCDSPContext *const c) {
ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
ALIGN_STK_64(pixel, c_dst, 135 * 135,);
ALIGN_STK_64(pixel, a_dst, 128 * 128,);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
const int16_t *tmp2, int w, int h HIGHBD_DECL_SUFFIX);
for (int w = 4; w <= 128; w <<= 1)
if (check_func(c->avg, "avg_w%d_%dbpc", w, BITDEPTH)) {
ptrdiff_t dst_stride = w * sizeof(pixel);
for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1)
{
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
const int bitdepth_max = 0xff;
#endif
init_tmp(c, c_dst, tmp, bitdepth_max);
call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX);
call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
w, h, "dst");
bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX);
}
}
report("avg");
}
static void check_w_avg(Dav1dMCDSPContext *const c) {
ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
ALIGN_STK_64(pixel, c_dst, 135 * 135,);
ALIGN_STK_64(pixel, a_dst, 128 * 128,);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
const int16_t *tmp2, int w, int h, int weight HIGHBD_DECL_SUFFIX);
for (int w = 4; w <= 128; w <<= 1)
if (check_func(c->w_avg, "w_avg_w%d_%dbpc", w, BITDEPTH)) {
ptrdiff_t dst_stride = w * sizeof(pixel);
for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1)
{
int weight = rnd() % 15 + 1;
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
const int bitdepth_max = 0xff;
#endif
init_tmp(c, c_dst, tmp, bitdepth_max);
call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX);
call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
w, h, "dst");
bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX);
}
}
report("w_avg");
}
static void check_mask(Dav1dMCDSPContext *const c) {
ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
ALIGN_STK_64(pixel, c_dst, 135 * 135,);
ALIGN_STK_64(pixel, a_dst, 128 * 128,);
ALIGN_STK_64(uint8_t, mask, 128 * 128,);
for (int i = 0; i < 128 * 128; i++)
mask[i] = rnd() % 65;
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
const int16_t *tmp2, int w, int h, const uint8_t *mask
HIGHBD_DECL_SUFFIX);
for (int w = 4; w <= 128; w <<= 1)
if (check_func(c->mask, "mask_w%d_%dbpc", w, BITDEPTH)) {
ptrdiff_t dst_stride = w * sizeof(pixel);
for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1)
{
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
const int bitdepth_max = 0xff;
#endif
init_tmp(c, c_dst, tmp, bitdepth_max);
call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX);
call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
w, h, "dst");
bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX);
}
}
report("mask");
}
static void check_w_mask(Dav1dMCDSPContext *const c) {
ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
ALIGN_STK_64(pixel, c_dst, 135 * 135,);
ALIGN_STK_64(pixel, a_dst, 128 * 128,);
ALIGN_STK_64(uint8_t, c_mask, 128 * 128,);
ALIGN_STK_64(uint8_t, a_mask, 128 * 128,);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
const int16_t *tmp2, int w, int h, uint8_t *mask, int sign
HIGHBD_DECL_SUFFIX);
static const uint16_t ss[] = { 444, 422, 420 };
static const uint8_t ss_hor[] = { 0, 1, 1 };
static const uint8_t ss_ver[] = { 0, 0, 1 };
for (int i = 0; i < 3; i++)
for (int w = 4; w <= 128; w <<= 1)
if (check_func(c->w_mask[i], "w_mask_%d_w%d_%dbpc", ss[i], w,
BITDEPTH))
{
ptrdiff_t dst_stride = w * sizeof(pixel);
for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1)
{
int sign = rnd() & 1;
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
const int bitdepth_max = 0xff;
#endif
init_tmp(c, c_dst, tmp, bitdepth_max);
call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h,
c_mask, sign HIGHBD_TAIL_SUFFIX);
call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h,
a_mask, sign HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel(c_dst, dst_stride,
a_dst, dst_stride,
w, h, "dst");
checkasm_check(uint8_t, c_mask, w >> ss_hor[i],
a_mask, w >> ss_hor[i],
w >> ss_hor[i], h >> ss_ver[i],
"mask");
bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h,
a_mask, sign HIGHBD_TAIL_SUFFIX);
}
}
report("w_mask");
}
static void check_blend(Dav1dMCDSPContext *const c) {
ALIGN_STK_64(pixel, tmp, 32 * 32,);
ALIGN_STK_64(pixel, c_dst, 32 * 32,);
ALIGN_STK_64(pixel, a_dst, 32 * 32,);
ALIGN_STK_64(uint8_t, mask, 32 * 32,);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
int w, int h, const uint8_t *mask);
for (int w = 4; w <= 32; w <<= 1) {
const ptrdiff_t dst_stride = w * sizeof(pixel);
if (check_func(c->blend, "blend_w%d_%dbpc", w, BITDEPTH))
for (int h = imax(w / 2, 4); h <= imin(w * 2, 32); h <<= 1) {
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
const int bitdepth_max = 0xff;
#endif
for (int i = 0; i < 32 * 32; i++) {
tmp[i] = rnd() & bitdepth_max;
mask[i] = rnd() % 65;
}
for (int i = 0; i < w * h; i++)
c_dst[i] = a_dst[i] = rnd() & bitdepth_max;
call_ref(c_dst, dst_stride, tmp, w, h, mask);
call_new(a_dst, dst_stride, tmp, w, h, mask);
checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
w, h, "dst");
bench_new(a_dst, dst_stride, tmp, w, h, mask);
}
}
report("blend");
}
static void check_blend_v(Dav1dMCDSPContext *const c) {
ALIGN_STK_64(pixel, tmp, 32 * 128,);
ALIGN_STK_64(pixel, c_dst, 32 * 128,);
ALIGN_STK_64(pixel, a_dst, 32 * 128,);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
int w, int h);
for (int w = 2; w <= 32; w <<= 1) {
const ptrdiff_t dst_stride = w * sizeof(pixel);
if (check_func(c->blend_v, "blend_v_w%d_%dbpc", w, BITDEPTH))
for (int h = 2; h <= (w == 2 ? 64 : 128); h <<= 1) {
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
const int bitdepth_max = 0xff;
#endif
for (int i = 0; i < w * h; i++)
c_dst[i] = a_dst[i] = rnd() & bitdepth_max;
for (int i = 0; i < 32 * 128; i++)
tmp[i] = rnd() & bitdepth_max;
call_ref(c_dst, dst_stride, tmp, w, h);
call_new(a_dst, dst_stride, tmp, w, h);
checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
w, h, "dst");
bench_new(a_dst, dst_stride, tmp, w, h);
}
}
report("blend_v");
}
static void check_blend_h(Dav1dMCDSPContext *const c) {
ALIGN_STK_64(pixel, tmp, 128 * 32,);
ALIGN_STK_64(pixel, c_dst, 128 * 32,);
ALIGN_STK_64(pixel, a_dst, 128 * 32,);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
int w, int h);
for (int w = 2; w <= 128; w <<= 1) {
const ptrdiff_t dst_stride = w * sizeof(pixel);
if (check_func(c->blend_h, "blend_h_w%d_%dbpc", w, BITDEPTH))
for (int h = (w == 128 ? 4 : 2); h <= 32; h <<= 1) {
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
const int bitdepth_max = 0xff;
#endif
for (int i = 0; i < w * h; i++)
c_dst[i] = a_dst[i] = rnd() & bitdepth_max;
for (int i = 0; i < 128 * 32; i++)
tmp[i] = rnd() & bitdepth_max;
call_ref(c_dst, dst_stride, tmp, w, h);
call_new(a_dst, dst_stride, tmp, w, h);
checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
w, h, "dst");
bench_new(a_dst, dst_stride, tmp, w, h);
}
}
report("blend_h");
}
static void check_warp8x8(Dav1dMCDSPContext *const c) {
ALIGN_STK_64(pixel, src_buf, 15 * 15,);
ALIGN_STK_64(pixel, c_dst, 8 * 8,);
ALIGN_STK_64(pixel, a_dst, 8 * 8,);
int16_t abcd[4];
const pixel *src = src_buf + 15 * 3 + 3;
const ptrdiff_t dst_stride = 8 * sizeof(pixel);
const ptrdiff_t src_stride = 15 * sizeof(pixel);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *src,
ptrdiff_t src_stride, const int16_t *abcd, int mx, int my
HIGHBD_DECL_SUFFIX);
if (check_func(c->warp8x8, "warp_8x8_%dbpc", BITDEPTH)) {
const int mx = (rnd() & 0x1fff) - 0xa00;
const int my = (rnd() & 0x1fff) - 0xa00;
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
const int bitdepth_max = 0xff;
#endif
for (int i = 0; i < 4; i++)
abcd[i] = (rnd() & 0x1fff) - 0xa00;
for (int i = 0; i < 15 * 15; i++)
src_buf[i] = rnd() & bitdepth_max;
call_ref(c_dst, dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
call_new(a_dst, dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
8, 8, "dst");
bench_new(a_dst, dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
}
report("warp8x8");
}
static void check_warp8x8t(Dav1dMCDSPContext *const c) {
ALIGN_STK_64(pixel, src_buf, 15 * 15,);
ALIGN_STK_64(int16_t, c_tmp, 8 * 8,);
ALIGN_STK_64(int16_t, a_tmp, 8 * 8,);
int16_t abcd[4];
const pixel *src = src_buf + 15 * 3 + 3;
const ptrdiff_t src_stride = 15 * sizeof(pixel);
declare_func(void, int16_t *tmp, ptrdiff_t tmp_stride, const pixel *src,
ptrdiff_t src_stride, const int16_t *abcd, int mx, int my
HIGHBD_DECL_SUFFIX);
if (check_func(c->warp8x8t, "warp_8x8t_%dbpc", BITDEPTH)) {
const int mx = (rnd() & 0x1fff) - 0xa00;
const int my = (rnd() & 0x1fff) - 0xa00;
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
const int bitdepth_max = 0xff;
#endif
for (int i = 0; i < 4; i++)
abcd[i] = (rnd() & 0x1fff) - 0xa00;
for (int i = 0; i < 15 * 15; i++)
src_buf[i] = rnd() & bitdepth_max;
call_ref(c_tmp, 8, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
call_new(a_tmp, 8, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
checkasm_check(int16_t, c_tmp, 8 * sizeof(*c_tmp),
a_tmp, 8 * sizeof(*a_tmp),
8, 8, "tmp");
bench_new(a_tmp, 8, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
}
report("warp8x8t");
}
enum EdgeFlags {
HAVE_TOP = 1,
HAVE_BOTTOM = 2,
HAVE_LEFT = 4,
HAVE_RIGHT = 8,
};
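/* Pick a source size (iw/ih) and a block offset (x/y) such that neighbouring
 * pixels are available exactly on the sides named by `edge`; the remaining
 * sides fall outside the source and must be synthesized by emu_edge. */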
static void random_offset_for_edge(int *const x, int *const y,
const int bw, const int bh,
int *const iw, int *const ih,
const enum EdgeFlags edge)
{
#define set_off(edge1, edge2, pos, dim) \
*i##dim = edge & (HAVE_##edge1 | HAVE_##edge2) ? 160 : 1 + (rnd() % (b##dim - 2)); \
switch (edge & (HAVE_##edge1 | HAVE_##edge2)) { \
case HAVE_##edge1 | HAVE_##edge2: \
assert(b##dim <= *i##dim); \
*pos = rnd() % (*i##dim - b##dim + 1); \
break; \
case HAVE_##edge1: \
*pos = (*i##dim - b##dim) + 1 + (rnd() % (b##dim - 1)); \
break; \
case HAVE_##edge2: \
*pos = -(1 + (rnd() % (b##dim - 1))); \
break; \
case 0: \
assert(b##dim - 1 > *i##dim); \
*pos = -(1 + (rnd() % (b##dim - *i##dim - 1))); \
break; \
}
set_off(LEFT, RIGHT, x, w);
set_off(TOP, BOTTOM, y, h);
}
static void check_emuedge(Dav1dMCDSPContext *const c) {
ALIGN_STK_64(pixel, c_dst, 135 * 192,);
ALIGN_STK_64(pixel, a_dst, 135 * 192,);
ALIGN_STK_64(pixel, src, 160 * 160,);
for (int i = 0; i < 160 * 160; i++)
src[i] = rnd() & ((1U << BITDEPTH) - 1);
declare_func(void, intptr_t bw, intptr_t bh, intptr_t iw, intptr_t ih,
intptr_t x, intptr_t y,
pixel *dst, ptrdiff_t dst_stride,
const pixel *src, ptrdiff_t src_stride);
int x, y, iw, ih;
for (int w = 4; w <= 128; w <<= 1)
if (check_func(c->emu_edge, "emu_edge_w%d_%dbpc", w, BITDEPTH)) {
for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1) {
// we skip 0xf, since it implies that we don't need emu_edge
for (enum EdgeFlags edge = 0; edge < 0xf; edge++) {
const int bw = w + (rnd() & 7);
const int bh = h + (rnd() & 7);
random_offset_for_edge(&x, &y, bw, bh, &iw, &ih, edge);
call_ref(bw, bh, iw, ih, x, y,
c_dst, 192 * sizeof(pixel), src, 160 * sizeof(pixel));
call_new(bw, bh, iw, ih, x, y,
a_dst, 192 * sizeof(pixel), src, 160 * sizeof(pixel));
checkasm_check_pixel(c_dst, 192 * sizeof(pixel),
a_dst, 192 * sizeof(pixel),
bw, bh, "dst");
}
}
for (enum EdgeFlags edge = 1; edge < 0xf; edge <<= 1) {
random_offset_for_edge(&x, &y, w + 7, w + 7, &iw, &ih, edge);
bench_new(w + 7, w + 7, iw, ih, x, y,
a_dst, 192 * sizeof(pixel), src, 160 * sizeof(pixel));
}
}
report("emu_edge");
}
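/* Computes the initial 14-bit fixed-point source position for the horizontal
 * resize filter: the sampling grid is centred and the accumulated step
 * rounding error is split between the two ends of the row, in line with the
 * AV1 superres derivation; only the low 14 fractional bits are returned. */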
static int get_upscale_x0(const int in_w, const int out_w, const int step) {
const int err = out_w * step - (in_w << 14);
const int x0 = (-((out_w - in_w) << 13) + (out_w >> 1)) / out_w + 128 - (err >> 1);
return x0 & 0x3fff;
}
static void check_resize(Dav1dMCDSPContext *const c) {
ALIGN_STK_64(pixel, c_dst, 1024 * 64,);
ALIGN_STK_64(pixel, a_dst, 1024 * 64,);
ALIGN_STK_64(pixel, src, 512 * 64,);
const int height = 64;
const int max_src_width = 512;
const ptrdiff_t dst_stride = 1024 * sizeof(pixel);
const ptrdiff_t src_stride = 512 * sizeof(pixel);
declare_func(void, pixel *dst, ptrdiff_t dst_stride,
const pixel *src, ptrdiff_t src_stride,
int dst_w, int h, int src_w, int dx, int mx0
HIGHBD_DECL_SUFFIX);
if (check_func(c->resize, "resize_%dbpc", BITDEPTH)) {
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
const int bitdepth_max = 0xff;
#endif
for (int i = 0; i < max_src_width * height; i++)
src[i] = rnd() & bitdepth_max;
const int w_den = 9 + (rnd() & 7);
const int src_w = 16 + (rnd() % (max_src_width - 16 + 1));
const int dst_w = w_den * src_w >> 3;
#define scale_fac(ref_sz, this_sz) \
((((ref_sz) << 14) + ((this_sz) >> 1)) / (this_sz))
const int dx = scale_fac(src_w, dst_w);
#undef scale_fac
const int mx0 = get_upscale_x0(src_w, dst_w, dx);
call_ref(c_dst, dst_stride, src, src_stride,
dst_w, height, src_w, dx, mx0 HIGHBD_TAIL_SUFFIX);
call_new(a_dst, dst_stride, src, src_stride,
dst_w, height, src_w, dx, mx0 HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
dst_w, height, "dst");
bench_new(a_dst, dst_stride, src, src_stride,
512, height, 512 * 8 / w_den, dx, mx0 HIGHBD_TAIL_SUFFIX);
}
report("resize");
}
void bitfn(checkasm_check_mc)(void) {
Dav1dMCDSPContext c;
bitfn(dav1d_mc_dsp_init)(&c);
check_mc(&c);
check_mct(&c);
check_mc_scaled(&c);
check_mct_scaled(&c);
check_avg(&c);
check_w_avg(&c);
check_mask(&c);
check_w_mask(&c);
check_blend(&c);
check_blend_v(&c);
check_blend_h(&c);
check_warp8x8(&c);
check_warp8x8t(&c);
check_emuedge(&c);
check_resize(&c);
}

third_party/dav1d/tests/checkasm/msac.c vendored Normal file

@ -0,0 +1,293 @@
/*
* Copyright © 2019, VideoLAN and dav1d authors
* Copyright © 2019, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "tests/checkasm/checkasm.h"
#include "src/cpu.h"
#include "src/msac.h"
#include <stdio.h>
#include <string.h>
#define BUF_SIZE 8192
/* The normal code doesn't use function pointers */
typedef unsigned (*decode_symbol_adapt_fn)(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
typedef unsigned (*decode_adapt_fn)(MsacContext *s, uint16_t *cdf);
typedef unsigned (*decode_bool_equi_fn)(MsacContext *s);
typedef unsigned (*decode_bool_fn)(MsacContext *s, unsigned f);
typedef struct {
decode_symbol_adapt_fn decode_symbol_adapt4;
decode_symbol_adapt_fn decode_symbol_adapt8;
decode_symbol_adapt_fn decode_symbol_adapt16;
decode_adapt_fn decode_bool_adapt;
decode_bool_equi_fn decode_bool_equi;
decode_bool_fn decode_bool;
decode_adapt_fn decode_hi_tok;
} MsacDSPContext;
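/* Table of entry points under test: filled in checkasm_check_msac() with the
 * C implementations and overridden with the NEON/SSE2/AVX2 versions when the
 * corresponding CPU flags are set. */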
static void randomize_cdf(uint16_t *const cdf, const int n) {
int i;
for (i = 15; i > n; i--)
cdf[i] = rnd(); // padding
cdf[i] = 0; // count
do {
cdf[i - 1] = cdf[i] + rnd() % (32768 - cdf[i] - i) + 1;
} while (--i > 0);
}
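/* Builds a random but valid dav1d CDF: 15-bit values, strictly decreasing
 * from cdf[0] down to cdf[n] = 0, which doubles as the adaptation counter;
 * the slots above n are filled with random padding. */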
/* memcmp() on structs can have weird behavior due to padding etc. */
static int msac_cmp(const MsacContext *const a, const MsacContext *const b) {
return a->buf_pos != b->buf_pos || a->buf_end != b->buf_end ||
a->dif != b->dif || a->rng != b->rng || a->cnt != b->cnt ||
a->allow_update_cdf != b->allow_update_cdf;
}
static void msac_dump(unsigned c_res, unsigned a_res,
const MsacContext *const a, const MsacContext *const b,
const uint16_t *const cdf_a, const uint16_t *const cdf_b,
const int num_cdf)
{
if (c_res != a_res)
fprintf(stderr, "c_res %u a_res %u\n", c_res, a_res);
if (a->buf_pos != b->buf_pos)
fprintf(stderr, "buf_pos %p vs %p\n", a->buf_pos, b->buf_pos);
if (a->buf_end != b->buf_end)
fprintf(stderr, "buf_end %p vs %p\n", a->buf_end, b->buf_end);
if (a->dif != b->dif)
fprintf(stderr, "dif %zx vs %zx\n", a->dif, b->dif);
if (a->rng != b->rng)
fprintf(stderr, "rng %u vs %u\n", a->rng, b->rng);
if (a->cnt != b->cnt)
fprintf(stderr, "cnt %d vs %d\n", a->cnt, b->cnt);
if (a->allow_update_cdf != b->allow_update_cdf)
fprintf(stderr, "allow_update_cdf %d vs %d\n",
a->allow_update_cdf, b->allow_update_cdf);
if (num_cdf && memcmp(cdf_a, cdf_b, sizeof(*cdf_a) * (num_cdf + 1))) {
fprintf(stderr, "cdf:\n");
for (int i = 0; i <= num_cdf; i++)
fprintf(stderr, " %5u", cdf_a[i]);
fprintf(stderr, "\n");
for (int i = 0; i <= num_cdf; i++)
fprintf(stderr, " %5u", cdf_b[i]);
fprintf(stderr, "\n");
for (int i = 0; i <= num_cdf; i++)
fprintf(stderr, " %c", cdf_a[i] != cdf_b[i] ? 'x' : '.');
fprintf(stderr, "\n");
}
}
#define CHECK_SYMBOL_ADAPT(n, n_min, n_max) do { \
if (check_func(c->decode_symbol_adapt##n, \
"msac_decode_symbol_adapt%d", n)) \
{ \
for (int cdf_update = 0; cdf_update <= 1; cdf_update++) { \
for (int ns = n_min; ns <= n_max; ns++) { \
dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update); \
s_a = s_c; \
randomize_cdf(cdf[0], ns); \
memcpy(cdf[1], cdf[0], sizeof(*cdf)); \
for (int i = 0; i < 64; i++) { \
unsigned c_res = call_ref(&s_c, cdf[0], ns); \
unsigned a_res = call_new(&s_a, cdf[1], ns); \
if (c_res != a_res || msac_cmp(&s_c, &s_a) || \
memcmp(cdf[0], cdf[1], sizeof(**cdf) * (ns + 1))) \
{ \
if (fail()) \
msac_dump(c_res, a_res, &s_c, &s_a, \
cdf[0], cdf[1], ns); \
} \
} \
if (cdf_update && ns == n - 1) \
bench_new(&s_a, cdf[1], ns); \
} \
} \
} \
} while (0)
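/* For each adaptation mode and each symbol count ns, the macro above decodes
 * 64 symbols with the C and asm implementations starting from identical
 * contexts and fails if the return values, the MsacContext state or the
 * updated CDF ever diverge; only the ns == n - 1 case with adaptation
 * enabled is benchmarked. */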
static void check_decode_symbol(MsacDSPContext *const c, uint8_t *const buf) {
ALIGN_STK_32(uint16_t, cdf, 2, [16]);
MsacContext s_c, s_a;
declare_func(unsigned, MsacContext *s, uint16_t *cdf, size_t n_symbols);
CHECK_SYMBOL_ADAPT( 4, 1, 4);
CHECK_SYMBOL_ADAPT( 8, 1, 7);
CHECK_SYMBOL_ADAPT(16, 3, 15);
report("decode_symbol");
}
static void check_decode_bool_adapt(MsacDSPContext *const c, uint8_t *const buf) {
MsacContext s_c, s_a;
declare_func(unsigned, MsacContext *s, uint16_t *cdf);
if (check_func(c->decode_bool_adapt, "msac_decode_bool_adapt")) {
uint16_t cdf[2][2];
for (int cdf_update = 0; cdf_update <= 1; cdf_update++) {
dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update);
s_a = s_c;
cdf[0][0] = cdf[1][0] = rnd() % 32767 + 1;
cdf[0][1] = cdf[1][1] = 0;
for (int i = 0; i < 64; i++) {
unsigned c_res = call_ref(&s_c, cdf[0]);
unsigned a_res = call_new(&s_a, cdf[1]);
if (c_res != a_res || msac_cmp(&s_c, &s_a) ||
memcmp(cdf[0], cdf[1], sizeof(*cdf)))
{
if (fail())
msac_dump(c_res, a_res, &s_c, &s_a, cdf[0], cdf[1], 1);
}
}
if (cdf_update)
bench_new(&s_a, cdf[1]);
}
}
}
static void check_decode_bool_equi(MsacDSPContext *const c, uint8_t *const buf) {
MsacContext s_c, s_a;
declare_func(unsigned, MsacContext *s);
if (check_func(c->decode_bool_equi, "msac_decode_bool_equi")) {
dav1d_msac_init(&s_c, buf, BUF_SIZE, 1);
s_a = s_c;
for (int i = 0; i < 64; i++) {
unsigned c_res = call_ref(&s_c);
unsigned a_res = call_new(&s_a);
if (c_res != a_res || msac_cmp(&s_c, &s_a)) {
if (fail())
msac_dump(c_res, a_res, &s_c, &s_a, NULL, NULL, 0);
}
}
bench_new(&s_a);
}
}
static void check_decode_bool(MsacDSPContext *const c, uint8_t *const buf) {
MsacContext s_c, s_a;
declare_func(unsigned, MsacContext *s, unsigned f);
if (check_func(c->decode_bool, "msac_decode_bool")) {
dav1d_msac_init(&s_c, buf, BUF_SIZE, 1);
s_a = s_c;
for (int i = 0; i < 64; i++) {
const unsigned f = rnd() & 0x7fff;
unsigned c_res = call_ref(&s_c, f);
unsigned a_res = call_new(&s_a, f);
if (c_res != a_res || msac_cmp(&s_c, &s_a)) {
if (fail())
msac_dump(c_res, a_res, &s_c, &s_a, NULL, NULL, 0);
}
}
bench_new(&s_a, 16384);
}
}
static void check_decode_bool_funcs(MsacDSPContext *const c, uint8_t *const buf) {
check_decode_bool_adapt(c, buf);
check_decode_bool_equi(c, buf);
check_decode_bool(c, buf);
report("decode_bool");
}
static void check_decode_hi_tok(MsacDSPContext *const c, uint8_t *const buf) {
ALIGN_STK_16(uint16_t, cdf, 2, [16]);
MsacContext s_c, s_a;
declare_func(unsigned, MsacContext *s, uint16_t *cdf);
if (check_func(c->decode_hi_tok, "msac_decode_hi_tok")) {
for (int cdf_update = 0; cdf_update <= 1; cdf_update++) {
dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update);
s_a = s_c;
randomize_cdf(cdf[0], 3);
memcpy(cdf[1], cdf[0], sizeof(*cdf));
for (int i = 0; i < 64; i++) {
unsigned c_res = call_ref(&s_c, cdf[0]);
unsigned a_res = call_new(&s_a, cdf[1]);
if (c_res != a_res || msac_cmp(&s_c, &s_a) ||
memcmp(cdf[0], cdf[1], sizeof(*cdf)))
{
if (fail())
msac_dump(c_res, a_res, &s_c, &s_a, cdf[0], cdf[1], 3);
break;
}
}
if (cdf_update)
bench_new(&s_a, cdf[1]);
}
}
report("decode_hi_tok");
}
void checkasm_check_msac(void) {
MsacDSPContext c;
c.decode_symbol_adapt4 = dav1d_msac_decode_symbol_adapt_c;
c.decode_symbol_adapt8 = dav1d_msac_decode_symbol_adapt_c;
c.decode_symbol_adapt16 = dav1d_msac_decode_symbol_adapt_c;
c.decode_bool_adapt = dav1d_msac_decode_bool_adapt_c;
c.decode_bool_equi = dav1d_msac_decode_bool_equi_c;
c.decode_bool = dav1d_msac_decode_bool_c;
c.decode_hi_tok = dav1d_msac_decode_hi_tok_c;
#if (ARCH_AARCH64 || ARCH_ARM) && HAVE_ASM
if (dav1d_get_cpu_flags() & DAV1D_ARM_CPU_FLAG_NEON) {
c.decode_symbol_adapt4 = dav1d_msac_decode_symbol_adapt4_neon;
c.decode_symbol_adapt8 = dav1d_msac_decode_symbol_adapt8_neon;
c.decode_symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_neon;
c.decode_bool_adapt = dav1d_msac_decode_bool_adapt_neon;
c.decode_bool_equi = dav1d_msac_decode_bool_equi_neon;
c.decode_bool = dav1d_msac_decode_bool_neon;
c.decode_hi_tok = dav1d_msac_decode_hi_tok_neon;
}
#elif ARCH_X86 && HAVE_ASM
if (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_SSE2) {
c.decode_symbol_adapt4 = dav1d_msac_decode_symbol_adapt4_sse2;
c.decode_symbol_adapt8 = dav1d_msac_decode_symbol_adapt8_sse2;
c.decode_symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2;
c.decode_bool_adapt = dav1d_msac_decode_bool_adapt_sse2;
c.decode_bool_equi = dav1d_msac_decode_bool_equi_sse2;
c.decode_bool = dav1d_msac_decode_bool_sse2;
c.decode_hi_tok = dav1d_msac_decode_hi_tok_sse2;
}
#if ARCH_X86_64
if (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_AVX2) {
c.decode_symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2;
}
#endif
#endif
uint8_t buf[BUF_SIZE];
for (int i = 0; i < BUF_SIZE; i++)
buf[i] = rnd();
check_decode_symbol(&c, buf);
check_decode_bool_funcs(&c, buf);
check_decode_hi_tok(&c, buf);
}

third_party/dav1d/tests/checkasm/refmvs.c vendored Normal file

@ -0,0 +1,78 @@
/*
* Copyright © 2021, VideoLAN and dav1d authors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "tests/checkasm/checkasm.h"
#include "src/refmvs.h"
static void check_splat_mv(const Dav1dRefmvsDSPContext *const c) {
ALIGN_STK_64(refmvs_block, c_buf, 32 * 32,);
ALIGN_STK_64(refmvs_block, a_buf, 32 * 32,);
refmvs_block *c_dst[32];
refmvs_block *a_dst[32];
const size_t stride = 32 * sizeof(refmvs_block);
for (int i = 0; i < 32; i++) {
c_dst[i] = c_buf + 32 * i;
a_dst[i] = a_buf + 32 * i;
}
declare_func(void, refmvs_block **rr, const refmvs_block *rmv,
int bx4, int bw4, int bh4);
for (int w = 1; w <= 32; w *= 2) {
if (check_func(c->splat_mv, "splat_mv_w%d", w)) {
const int h_min = imax(w / 4, 1);
const int h_max = imin(w * 4, 32);
const int w_uint32 = w * sizeof(refmvs_block) / sizeof(uint32_t);
for (int h = h_min; h <= h_max; h *= 2) {
const int offset = (w * rnd()) & 31;
union {
refmvs_block rmv;
uint32_t u32[3];
} ALIGN(tmp, 16);
tmp.u32[0] = rnd();
tmp.u32[1] = rnd();
tmp.u32[2] = rnd();
call_ref(c_dst, &tmp.rmv, offset, w, h);
call_new(a_dst, &tmp.rmv, offset, w, h);
checkasm_check(uint32_t, (uint32_t*)(c_buf + offset), stride,
(uint32_t*)(a_buf + offset), stride,
w_uint32, h, "dst");
bench_new(a_dst, &tmp.rmv, 0, w, h);
}
}
}
report("splat_mv");
}
void checkasm_check_refmvs(void) {
Dav1dRefmvsDSPContext c;
dav1d_refmvs_dsp_init(&c);
check_splat_mv(&c);
}

third_party/dav1d/tests/checkasm/x86/checkasm.asm vendored Normal file

@ -0,0 +1,370 @@
; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%undef private_prefix
%define private_prefix checkasm
%include "ext/x86/x86inc.asm"
SECTION_RODATA 16
%if ARCH_X86_64
; just random numbers to reduce the chance of incidental match
%if WIN64
x6: dq 0x1a1b2550a612b48c,0x79445c159ce79064
x7: dq 0x2eed899d5a28ddcd,0x86b2536fcd8cf636
x8: dq 0xb0856806085e7943,0x3f2bf84fc0fcca4e
x9: dq 0xacbd382dcf5b8de2,0xd229e1f5b281303f
x10: dq 0x71aeaff20b095fd9,0xab63e2e11fa38ed9
x11: dq 0x89b0c0765892729a,0x77d410d5c42c882d
x12: dq 0xc45ea11a955d8dd5,0x24b3c1d2a024048b
x13: dq 0x2e8ec680de14b47c,0xdd7b8919edd42786
x14: dq 0x135ce6888fa02cbf,0x11e53e2b2ac655ef
x15: dq 0x011ff554472a7a10,0x6de8f4c914c334d5
n7: dq 0x21f86d66c8ca00ce
n8: dq 0x75b6ba21077c48ad
%endif
n9: dq 0xed56bb2dcb3c7736
n10: dq 0x8bda43d3fd1a7e06
n11: dq 0xb64a9c9e5d318408
n12: dq 0xdf9a54b303f1d3a3
n13: dq 0x4a75479abd64e097
n14: dq 0x249214109d5d1c88
%endif
errmsg_stack: db "stack corruption", 0
SECTION .text
cextern fail_func
; max number of args used by any asm function.
; (max_args % 4) must equal 3 for stack alignment
%define max_args 15
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; int checkasm_stack_clobber(uint64_t clobber, ...)
;-----------------------------------------------------------------------------
cglobal stack_clobber, 1, 2
; Clobber the stack with junk below the stack pointer
%define argsize (max_args+6)*8
SUB rsp, argsize
mov r1, argsize-8
.loop:
mov [rsp+r1], r0
sub r1, 8
jge .loop
ADD rsp, argsize
RET
%if WIN64
%assign free_regs 7
%define stack_param rsp+32 ; shadow space
%define num_stack_params rsp+stack_offset+22*8
DECLARE_REG_TMP 4
%else
%assign free_regs 9
%define stack_param rsp
%define num_stack_params rsp+stack_offset+16*8
DECLARE_REG_TMP 7
%endif
;-----------------------------------------------------------------------------
; void checkasm_checked_call(void *func, ...)
;-----------------------------------------------------------------------------
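; Overview: arguments are reloaded from the stack (to catch callees that
; assume 32-bit ints arrive zero-extended), the registers the ABI requires to
; be preserved (and xmm6-15 on WIN64) are seeded with the constants above,
; canaries are written above the stack parameters, and after the call any
; clobbered register or canary is reported through fail_func().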
INIT_XMM
cglobal checked_call, 2, 15, 16, max_args*8+64+8
mov t0, r0
; All arguments have been pushed on the stack instead of registers in
; order to test for incorrect assumptions that 32-bit ints are
; zero-extended to 64-bit.
mov r0, r6mp
mov r1, r7mp
mov r2, r8mp
mov r3, r9mp
%if UNIX64
mov r4, r10mp
mov r5, r11mp
%else ; WIN64
; Move possible floating-point arguments to the correct registers
movq m0, r0
movq m1, r1
movq m2, r2
movq m3, r3
%assign i 6
%rep 16-6
mova m %+ i, [x %+ i]
%assign i i+1
%endrep
%endif
; write stack canaries to the area above parameters passed on the stack
mov r9d, [num_stack_params]
mov r8, [rsp+stack_offset] ; return address
not r8
%assign i 0
%rep 8 ; 64 bytes
mov [stack_param+(r9+i)*8], r8
%assign i i+1
%endrep
dec r9d
jl .stack_setup_done ; no stack parameters
.copy_stack_parameter:
mov r8, [stack_param+stack_offset+7*8+r9*8]
mov [stack_param+r9*8], r8
dec r9d
jge .copy_stack_parameter
.stack_setup_done:
%assign i 14
%rep 15-free_regs
mov r %+ i, [n %+ i]
%assign i i-1
%endrep
call t0
; check for stack corruption
mov r0d, [num_stack_params]
mov r3, [rsp+stack_offset]
mov r4, [stack_param+r0*8]
not r3
xor r4, r3
%assign i 1
%rep 6
mov r5, [stack_param+(r0+i)*8]
xor r5, r3
or r4, r5
%assign i i+1
%endrep
xor r3, [stack_param+(r0+7)*8]
lea r0, [errmsg_stack]
or r4, r3
jnz .fail
; check for failure to preserve registers
%assign i 14
%rep 15-free_regs
cmp r %+ i, [r0-errmsg_stack+n %+ i]
setne r4b
lea r3d, [r4+r3*2]
%assign i i-1
%endrep
%if WIN64
lea r0, [rsp+60] ; account for shadow space
mov r5, r0
test r3d, r3d
jz .gpr_ok
%else
test r3d, r3d
jz .ok
lea r0, [rsp+28]
%endif
%assign i free_regs
%rep 15-free_regs
%if i < 10
mov dword [r0], " r0" + (i << 16)
lea r4, [r0+3]
%else
mov dword [r0], " r10" + ((i - 10) << 24)
lea r4, [r0+4]
%endif
test r3b, 1 << (i - free_regs)
cmovnz r0, r4
%assign i i+1
%endrep
%if WIN64 ; xmm registers
.gpr_ok:
%assign i 6
%rep 16-6
pxor m %+ i, [x %+ i]
%assign i i+1
%endrep
packsswb m6, m7
packsswb m8, m9
packsswb m10, m11
packsswb m12, m13
packsswb m14, m15
packsswb m6, m6
packsswb m8, m10
packsswb m12, m14
packsswb m6, m6
packsswb m8, m12
packsswb m6, m8
pxor m7, m7
pcmpeqb m6, m7
pmovmskb r3d, m6
cmp r3d, 0xffff
je .xmm_ok
mov r7d, " xmm"
%assign i 6
%rep 16-6
mov [r0+0], r7d
%if i < 10
mov byte [r0+4], "0" + i
lea r4, [r0+5]
%else
mov word [r0+4], "10" + ((i - 10) << 8)
lea r4, [r0+6]
%endif
test r3d, 1 << i
cmovz r0, r4
%assign i i+1
%endrep
.xmm_ok:
cmp r0, r5
je .ok
mov byte [r0], 0
lea r0, [r5-28]
%else
mov byte [r0], 0
mov r0, rsp
%endif
mov dword [r0+ 0], "fail"
mov dword [r0+ 4], "ed t"
mov dword [r0+ 8], "o pr"
mov dword [r0+12], "eser"
mov dword [r0+16], "ve r"
mov dword [r0+20], "egis"
mov dword [r0+24], "ter:"
.fail:
; Call fail_func() with a descriptive message to mark it as a failure.
; Save the return value located in rdx:rax first to prevent clobbering.
mov r9, rax
mov r10, rdx
xor eax, eax
call fail_func
mov rdx, r10
mov rax, r9
.ok:
RET
; trigger a warmup of vector units
%macro WARMUP 0
cglobal warmup, 0, 0
xorps m0, m0
mulps m0, m0
RET
%endmacro
INIT_YMM avx2
WARMUP
INIT_ZMM avx512
WARMUP
%else
; just random numbers to reduce the chance of incidental match
%assign n3 0x6549315c
%assign n4 0xe02f3e23
%assign n5 0xb78d0d1d
%assign n6 0x33627ba7
;-----------------------------------------------------------------------------
; void checkasm_checked_call(void *func, ...)
;-----------------------------------------------------------------------------
cglobal checked_call, 1, 7
mov r3, [esp+stack_offset] ; return address
mov r1, [esp+stack_offset+17*4] ; num_stack_params
mov r2, 27
not r3
sub r2, r1
.push_canary:
push r3
dec r2
jg .push_canary
.push_parameter:
push dword [esp+32*4]
dec r1
jg .push_parameter
mov r3, n3
mov r4, n4
mov r5, n5
mov r6, n6
call r0
; check for failure to preserve registers
cmp r3, n3
setne r3h
cmp r4, n4
setne r3b
shl r3d, 16
cmp r5, n5
setne r3h
cmp r6, n6
setne r3b
test r3, r3
jz .gpr_ok
lea r1, [esp+16]
mov dword [r1+ 0], "fail"
mov dword [r1+ 4], "ed t"
mov dword [r1+ 8], "o pr"
mov dword [r1+12], "eser"
mov dword [r1+16], "ve r"
mov dword [r1+20], "egis"
mov dword [r1+24], "ter:"
lea r4, [r1+28]
%assign i 3
%rep 4
mov dword [r4], " r0" + (i << 16)
lea r5, [r4+3]
test r3, 1 << ((6 - i) * 8)
cmovnz r4, r5
%assign i i+1
%endrep
mov byte [r4], 0
jmp .fail
.gpr_ok:
; check for stack corruption
mov r3, [esp+48*4] ; num_stack_params
mov r6, [esp+31*4] ; return address
mov r4, [esp+r3*4]
sub r3, 26
not r6
xor r4, r6
.check_canary:
mov r5, [esp+(r3+27)*4]
xor r5, r6
or r4, r5
inc r3
jl .check_canary
test r4, r4
jz .ok
LEA r1, errmsg_stack
.fail:
mov r3, eax
mov r4, edx
mov [esp], r1
call fail_func
mov edx, r4
mov eax, r3
.ok:
add esp, 27*4
RET
%endif ; ARCH_X86_64

third_party/dav1d/tests/header_test.c vendored Normal file

@ -0,0 +1,33 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include DAV1D_TEST_HEADER
int main(void)
{
return 0;
}


@ -106,23 +106,23 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
unsigned h = djb_xor(ptr, 32);
unsigned seed = h;
unsigned probability = h > (RAND_MAX >> 5) ? RAND_MAX >> 5 : h;
int n_frame_threads = (h & 0xf) + 1;
int n_tile_threads = ((h >> 4) & 0x7) + 1;
if (n_frame_threads > 5) n_frame_threads = 1;
if (n_tile_threads > 3) n_tile_threads = 1;
int max_frame_delay = (h & 0xf) + 1;
int n_threads = ((h >> 4) & 0x7) + 1;
if (max_frame_delay > 5) max_frame_delay = 1;
if (n_threads > 3) n_threads = 1;
#endif
ptr += 32; // skip ivf header
dav1d_default_settings(&settings);
#ifdef DAV1D_MT_FUZZING
settings.n_frame_threads = settings.n_tile_threads = 2;
settings.max_frame_delay = settings.n_threads = 4;
#elif defined(DAV1D_ALLOC_FAIL)
settings.n_frame_threads = n_frame_threads;
settings.n_tile_threads = n_tile_threads;
settings.max_frame_delay = max_frame_delay;
settings.n_threads = n_threads;
dav1d_setup_alloc_fail(seed, probability);
#else
settings.n_frame_threads = settings.n_tile_threads = 1;
settings.max_frame_delay = settings.n_threads = 1;
#endif
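/* Note: the old per-frame/per-tile thread counts are replaced by a single
 * n_threads worker-pool size plus max_frame_delay, which bounds how many
 * frames may be in flight. A minimal non-fuzzing setup (hypothetical example,
 * not part of this file) might look like:
 *
 *     Dav1dSettings s;
 *     dav1d_default_settings(&s);
 *     s.n_threads = 4;        // one pool shared by frame and tile tasks
 *     s.max_frame_delay = 1;  // lowest latency; 0 lets dav1d choose
 */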
#if defined(DAV1D_FUZZ_MAX_SIZE)
settings.frame_size_limit = DAV1D_FUZZ_MAX_SIZE;

third_party/dav1d/tests/meson.build vendored Normal file

@ -0,0 +1,152 @@
# Copyright © 2018, VideoLAN and dav1d authors
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Build definition for the dav1d tests
#
# Leave subdir if tests are disabled
if not get_option('enable_tests')
subdir_done()
endif
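# The checkasm test binary: the bitdepth-templated sources are compiled once
# per entry in dav1d_bitdepths into static helper libraries, and their objects
# are linked together with libdav1d's objects (plus the per-arch assembly
# driver) into a single executable that doubles as a test and a benchmark.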
if is_asm_enabled
checkasm_sources = files(
'checkasm/checkasm.c',
'checkasm/msac.c',
'checkasm/refmvs.c',
)
checkasm_tmpl_sources = files(
'checkasm/cdef.c',
'checkasm/filmgrain.c',
'checkasm/ipred.c',
'checkasm/itx.c',
'checkasm/loopfilter.c',
'checkasm/looprestoration.c',
'checkasm/mc.c',
)
checkasm_bitdepth_objs = []
foreach bitdepth : dav1d_bitdepths
checkasm_bitdepth_lib = static_library(
'checkasm_bitdepth_@0@'.format(bitdepth),
checkasm_tmpl_sources,
include_directories: dav1d_inc_dirs,
c_args: ['-DBITDEPTH=@0@'.format(bitdepth), stackalign_flag],
install: false,
build_by_default: false,
)
checkasm_bitdepth_objs += checkasm_bitdepth_lib.extract_all_objects(recursive: true)
endforeach
checkasm_asm_objs = []
checkasm_asm_sources = []
if host_machine.cpu_family() == 'aarch64' or host_machine.cpu() == 'arm64'
checkasm_asm_sources += files('checkasm/arm/checkasm_64.S')
elif host_machine.cpu_family().startswith('arm')
checkasm_asm_sources += files('checkasm/arm/checkasm_32.S')
elif host_machine.cpu_family().startswith('x86')
checkasm_asm_objs += nasm_gen.process(files('checkasm/x86/checkasm.asm'))
endif
if use_gaspp
checkasm_asm_objs += gaspp_gen.process(checkasm_asm_sources)
else
checkasm_sources += checkasm_asm_sources
endif
checkasm = executable('checkasm',
checkasm_sources,
checkasm_asm_objs,
objects: [
checkasm_bitdepth_objs,
libdav1d.extract_all_objects(recursive: true),
],
include_directories: dav1d_inc_dirs,
c_args: [stackalign_flag, stackrealign_flag],
build_by_default: false,
dependencies : [
thread_dependency,
rt_dependency,
libdl_dependency,
libm_dependency,
],
)
test('checkasm', checkasm, suite: 'checkasm', timeout: 180, is_parallel: false)
benchmark('checkasm', checkasm, suite: 'checkasm', timeout: 3600, args: '--bench')
endif
c99_extension_flag = cc.first_supported_argument(
'-Werror=c11-extensions',
'-Werror=c99-c11-compat',
'-Wc11-extensions',
'-Wc99-c11-compat',
)
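# Each public API header is compiled on its own into a tiny executable with
# the strictest supported C99/C11-compatibility flag, which catches headers
# that are not self-contained or that rely on C11-only constructs.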
# dav1d_api_headers
foreach header : dav1d_api_headers
target = header + '_test'
header_test_exe = executable(target,
'header_test.c',
include_directories: dav1d_inc_dirs,
c_args: ['-DDAV1D_TEST_HEADER="@0@"'.format(header), c99_extension_flag],
build_by_default: true
)
test(target, header_test_exe, suite: 'headers')
endforeach
# fuzzing binaries
subdir('libfuzzer')
# seek stress test binary, depends on dav1d cli tool
if get_option('enable_tools')
seek_stress_sources = files('seek_stress.c')
seek_stress = executable('seek_stress',
seek_stress_sources, rev_target,
objects: [
dav1d.extract_objects('dav1d_cli_parse.c'),
dav1d_input_objs.extract_objects('input/input.c', 'input/ivf.c'),
],
include_directories: [dav1d_inc_dirs, include_directories('../tools')],
link_with: libdav1d,
dependencies: [
thread_dependency,
rt_dependency,
getopt_dependency,
libm_dependency,
],
)
endif
# Include dav1d test data repository with additional tests
if get_option('testdata_tests')
subdir('dav1d-test-data')
endif

Some files were not shown because too many files changed in this diff.