Mirror of https://github.com/mozilla/gecko-dev.git
Bug 1734058 - Update dav1d to new version f52aee04fbd711cddab23d0aa9b196e9c963e7b8 from 2021-10-04 21:58:36. r=mjf,haik
This is a fairly significant update, so it required a few changes to Gecko code, but I've commented on the interesting details, so they should be easy to find.

Differential Revision: https://phabricator.services.mozilla.com/D129465
This commit is contained in:
Parent: 5ac2b54c29
Commit: 874adf9b96
@@ -37,13 +37,8 @@ RefPtr<MediaDataDecoder::InitPromise> DAV1DDecoder::Init() {
  } else if (mInfo.mDisplay.width >= 1024) {
    decoder_threads = 4;
  }
-  settings.n_frame_threads =
+  settings.n_threads =
      static_cast<int>(std::min(decoder_threads, GetNumberOfProcessors()));
-  // There is not much improvement with more than 2 tile threads at least with
-  // the content being currently served. The ideal number of tile thread would
-  // much the tile count of the content. Maybe dav1d can help to do that in the
-  // future.
-  settings.n_tile_threads = 2;

  int res = dav1d_open(&mContext, &settings);
  if (res < 0) {
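For context, this dav1d update collapses the old n_frame_threads/n_tile_threads knobs into one n_threads pool (and adds max_frame_delay). A minimal sketch of opening a decoder against the new API follows; the fixed thread count of 4 is an arbitrary illustration, not what Gecko computes:

#include <dav1d/dav1d.h>

/* Minimal sketch of the post-update threading API: one unified thread pool
 * instead of separate frame/tile thread counts. n_threads = 0 would let
 * dav1d pick a value itself. */
static Dav1dContext *open_decoder_sketch(void) {
    Dav1dSettings settings;
    dav1d_default_settings(&settings);
    settings.n_threads = 4;       /* replaces n_frame_threads / n_tile_threads */
    settings.max_frame_delay = 1; /* 1 = low latency, 0 = auto */

    Dav1dContext *ctx = NULL;
    if (dav1d_open(&ctx, &settings) < 0)
        return NULL;
    return ctx;
}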
@@ -581,6 +581,7 @@ class Dav1dDecoder final : AVIFDecoderInterface {
  Dav1dSettings settings;
  dav1d_default_settings(&settings);
  settings.all_layers = 0;
+  settings.max_frame_delay = 1;
  // TODO: tune settings a la DAV1DDecoder for AV1 (Bug 1681816)

  return dav1d_open(&mContext, &settings);
@@ -71,6 +71,7 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
SOURCES += [
'../../../third_party/dav1d/src/x86/cpu.c',
'../../../third_party/dav1d/src/x86/msac_init.c',
+'../../../third_party/dav1d/src/x86/refmvs_init.c',
]

EXPORTS.dav1d += [
@@ -88,12 +89,10 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
'../../../third_party/dav1d/src/x86/cdef_avx512.asm',
'../../../third_party/dav1d/src/x86/film_grain16_avx2.asm',
'../../../third_party/dav1d/src/x86/film_grain_avx2.asm',
'../../../third_party/dav1d/src/x86/ipred16_avx2.asm',
'../../../third_party/dav1d/src/x86/ipred_avx2.asm',
'../../../third_party/dav1d/src/x86/itx16_avx2.asm',
'../../../third_party/dav1d/src/x86/itx_avx2.asm',
'../../../third_party/dav1d/src/x86/loopfilter16_avx2.asm',
'../../../third_party/dav1d/src/x86/loopfilter16_sse.asm',
'../../../third_party/dav1d/src/x86/loopfilter_avx2.asm',
'../../../third_party/dav1d/src/x86/looprestoration16_avx2.asm',
'../../../third_party/dav1d/src/x86/looprestoration_avx2.asm',
@@ -106,16 +105,21 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
'../../../third_party/dav1d/src/x86/cdef16_sse.asm',
'../../../third_party/dav1d/src/x86/cdef_sse.asm',
'../../../third_party/dav1d/src/x86/cpuid.asm',
'../../../third_party/dav1d/src/x86/film_grain16_sse.asm',
'../../../third_party/dav1d/src/x86/film_grain_sse.asm',
'../../../third_party/dav1d/src/x86/ipred16_avx2.asm',
'../../../third_party/dav1d/src/x86/ipred16_sse.asm',
'../../../third_party/dav1d/src/x86/ipred_sse.asm',
'../../../third_party/dav1d/src/x86/itx16_sse.asm',
'../../../third_party/dav1d/src/x86/itx_sse.asm',
'../../../third_party/dav1d/src/x86/loopfilter16_sse.asm',
'../../../third_party/dav1d/src/x86/loopfilter_sse.asm',
'../../../third_party/dav1d/src/x86/looprestoration16_sse.asm', # moved from autovendored
'../../../third_party/dav1d/src/x86/looprestoration_sse.asm',
'../../../third_party/dav1d/src/x86/mc16_sse.asm',
'../../../third_party/dav1d/src/x86/mc_sse.asm',
'../../../third_party/dav1d/src/x86/msac.asm',
'../../../third_party/dav1d/src/x86/refmvs.asm',
]

# BITDEPTH
@@ -148,6 +152,7 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64':
SOURCES += [
'../../../third_party/dav1d/src/arm/cpu.c',
'../../../third_party/dav1d/src/arm/refmvs_init.c',
]
EXPORTS += [
'../../../third_party/dav1d/src/arm/asm-offsets.h',
@@ -203,6 +208,7 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64':
'../../../third_party/dav1d/src/arm/64/mc.S',
'../../../third_party/dav1d/src/arm/64/mc16.S',
'../../../third_party/dav1d/src/arm/64/msac.S',
+'../../../third_party/dav1d/src/arm/64/refmvs.S',
]
elif CONFIG['CPU_ARCH'] == 'arm':
SOURCES += [
@@ -224,6 +230,7 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64':
'../../../third_party/dav1d/src/arm/32/mc.S',
'../../../third_party/dav1d/src/arm/32/mc16.S',
'../../../third_party/dav1d/src/arm/32/msac.S',
+'../../../third_party/dav1d/src/arm/32/refmvs.S',
]

if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
@@ -20,11 +20,11 @@ origin:

# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
-release: commit ddbbfde198aced0d02ea739c320d754d43406f7b (2021-06-12T07:58:29.000+00:00).
+release: commit f52aee04fbd711cddab23d0aa9b196e9c963e7b8 (2021-10-04T21:58:36.000+00:00).

# Revision to pull in
# Must be a long or short commit SHA (long preferred)
-revision: ddbbfde198aced0d02ea739c320d754d43406f7b
+revision: f52aee04fbd711cddab23d0aa9b196e9c963e7b8

# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/
@@ -1,2 +1,2 @@
/* auto-generated, do not edit */
-#define DAV1D_VERSION "0.9.0-24-gddbbfde"
+#define DAV1D_VERSION "f52aee04fbd711cddab23d0aa9b196e9c963e7b8"
@@ -122,6 +122,7 @@ static const char SandboxPolicyContent[] = R"SANDBOX_LITERAL(
(sysctl-name "hw.activecpu")
(sysctl-name "hw.byteorder")
(sysctl-name "hw.pagesize_compat")
+(sysctl-name "hw.logicalcpu")
(sysctl-name "hw.logicalcpu_max")
(sysctl-name "hw.physicalcpu_max")
(sysctl-name "hw.busfrequency_compat")
@@ -1,3 +1,29 @@
+Changes for 0.9.2 'Golden Eagle':
+---------------------------------
+
+0.9.2 is a small update of dav1d on the 0.9.x branch:
+ - x86: SSE4 optimizations of inverse transforms for 10bit for all sizes
+ - x86: mc.resize optimizations with AVX2/SSSE3 for 10/12b
+ - x86: SSSE3 optimizations for cdef_filter in 10/12b and mc_w_mask_422/444 in 8b
+ - ARM NEON optimizations for FilmGrain Gen_grain functions
+ - Optimizations for splat_mv in SSE2/AVX2 and NEON
+ - x86: SGR improvements for SSSE3 CPUs
+ - x86: AVX2 optimizations for cfl_ac
+
+
+Changes for 0.9.1 'Golden Eagle':
+---------------------------------
+
+0.9.1 is a middle-size revision of dav1d, adding notably 10b acceleration for SSSE3:
+ - 10/12b SSSE3 optimizations for mc (avg, w_avg, mask, w_mask, emu_edge),
+   prep/put_bilin, prep/put_8tap, ipred (dc/h/v, paeth, smooth, pal, filter), wiener,
+   sgr (10b), warp8x8, deblock, film_grain, cfl_ac/pred for 32bit and 64bit x86 processors
+ - Film grain NEON for fguv 10/12b, fgy/fguv 8b and fgy/fguv 10/12 arm32
+ - Fixes for filmgrain on ARM
+ - itx 10bit optimizations for 4x4/x8/x16, 8x4/x8/x16 for SSE4
+ - Misc improvements on SSE2, SSE4
+
+
Changes for 0.9.0 'Golden Eagle':
---------------------------------
@@ -36,16 +36,16 @@ The plan is the following:
7. Make high bit-depth fast on mobile, by writing asm for ARMv8 chips.
8. Make it fast on older mobile, by writing asm for ARMv7 chips,
9. Make high bit-depth fast on older mobile, by writing asm for ARMv7 chips,
+10. Make high bit-depth fast on desktop, by writing asm for AVX2 chips,
+11. Make high bit-depth fast on older desktop, by writing asm for SSSE3+ chips,

### On-going
-10. Improve C code base with [various tweaks](https://code.videolan.org/videolan/dav1d/wikis/task-list),
-11. Accelerate for less common architectures, like PPC, SSE2 or AVX-512.
-12. Make high bit-depth fast on desktop, by writing asm for AVX2 chips,
+12. Improve C code base with [various tweaks](https://code.videolan.org/videolan/dav1d/wikis/task-list),
+13. Accelerate for less common architectures, like PPC, SSE2 or AVX-512.
+14. Improve threading.

### After
-13. Make high bit-depth fast on older desktop, by writing asm for SSSE3+ chips,
-14. Use more GPU decoding, when possible.
-15. Improve threading.
+15. Use more GPU decoding, when possible.

# Contribute

@@ -60,7 +60,7 @@ Our contributions guidelines are quite strict. We want to build a coherent codeb

Notably, the codebase is in pure C and asm.

-We are on IRC, on the **#dav1d** channel on [*Libera.chat*](http://libera.chat/). If you do not have an IRC Client at hand, use [KiwiIRC Web Interface](https://kiwiirc.com/nextclient/#ircs://irc.libera.chat/#dav1d).
+We are on IRC, on the **#dav1d** channel on [*Libera.chat*](http://libera.chat/). If you do not have an IRC Client at hand, use [IRC Web Interface](https://web.libera.chat/#dav1d).

See the [contributions document](CONTRIBUTING.md).
@@ -33,6 +33,14 @@
#include <stddef.h>
#include <assert.h>

+#ifndef __has_attribute
+#define __has_attribute(x) 0
+#endif
+
+#ifndef __has_feature
+#define __has_feature(x) 0
+#endif
+
#ifdef __GNUC__
#define ATTR_ALIAS __attribute__((may_alias))
#define ATTR_FORMAT_PRINTF(fmt, attr) __attribute__((__format__(__printf__, fmt, attr)))
@@ -93,9 +101,11 @@
 */
#ifdef _MSC_VER
#define NOINLINE __declspec(noinline)
-#else /* !_MSC_VER */
+#elif __has_attribute(noclone)
+#define NOINLINE __attribute__((noinline, noclone))
+#else
#define NOINLINE __attribute__((noinline))
-#endif /* !_MSC_VER */
+#endif

#ifdef __clang__
#define NO_SANITIZE(x) __attribute__((no_sanitize(x)))
@@ -160,10 +170,6 @@ static inline int clzll(const unsigned long long mask) {
}
#endif /* !_MSC_VER */

-#ifndef __has_feature
-#define __has_feature(x) 0
-#endif
-
#ifndef static_assert
#define CHECK_OFFSET(type, field, name) \
struct check_##type##_##field { int x[(name == offsetof(type, field)) ? 1 : -1]; }
@@ -172,4 +178,10 @@ static inline int clzll(const unsigned long long mask) {
static_assert(name == offsetof(type, field), #field)
#endif

+#ifdef _MSC_VER
+#define PACKED(...) __pragma(pack(push, 1)) __VA_ARGS__ __pragma(pack(pop))
+#else
+#define PACKED(...) __VA_ARGS__ __attribute__((__packed__))
+#endif
+
#endif /* DAV1D_COMMON_ATTRIBUTES_H */
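The new PACKED() wrapper hides the MSVC/GCC divergence in struct packing. A hypothetical use, following dav1d's own wrapping style (the struct here is illustrative, not a real dav1d type):

#include <stdint.h>

/* Hypothetical illustration of the PACKED() macro added above: the same
 * declaration expands to __pragma(pack(push, 1)) ... __pragma(pack(pop))
 * under MSVC and to __attribute__((__packed__)) elsewhere. */
PACKED(typedef struct example_pair {
    int16_t y, x;
}) example_pair; /* sizeof(example_pair) == 4, no padding on either compiler */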
@@ -41,6 +41,8 @@ typedef unsigned int atomic_uint;
#define atomic_load_explicit(p_a, mo) __atomic_load_n(p_a, mo)
#define atomic_fetch_add(p_a, inc) __atomic_fetch_add(p_a, inc, __ATOMIC_SEQ_CST)
#define atomic_fetch_sub(p_a, dec) __atomic_fetch_sub(p_a, dec, __ATOMIC_SEQ_CST)
+#define atomic_exchange(p_a, v) __atomic_exchange_n(p_a, v, __ATOMIC_SEQ_CST)
+#define atomic_fetch_or(p_a, v) __atomic_fetch_or(p_a, v, __ATOMIC_SEQ_CST)

#endif /* !defined(__cplusplus) */
@@ -41,8 +41,8 @@

#include "common/attributes.h"

-typedef volatile LONG __declspec(align(32)) atomic_int;
-typedef volatile ULONG __declspec(align(32)) atomic_uint;
+typedef volatile LONG atomic_int;
+typedef volatile ULONG atomic_uint;

typedef enum {
memory_order_relaxed,
@@ -52,6 +52,7 @@ typedef enum {
#define atomic_init(p_a, v) do { *(p_a) = (v); } while(0)
#define atomic_store(p_a, v) InterlockedExchange((LONG*)p_a, v)
#define atomic_load(p_a) InterlockedCompareExchange((LONG*)p_a, 0, 0)
+#define atomic_exchange(p_a, v) InterlockedExchange(p_a, v)
#define atomic_load_explicit(p_a, mo) atomic_load(p_a)

/*
@@ -60,6 +61,7 @@ typedef enum {
 */
#define atomic_fetch_add(p_a, inc) InterlockedExchangeAdd(p_a, inc)
#define atomic_fetch_sub(p_a, dec) InterlockedExchangeAdd(p_a, -(dec))
+#define atomic_fetch_or(p_a, v) InterlockedOr(p_a, v)

#endif /* ! stdatomic.h */
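These compat shims let the same C11-style atomics code build with MSVC (Interlocked*) or GCC/Clang (__atomic_*) when <stdatomic.h> is unavailable. A small usage sketch of the two operations this update adds to both headers:

/* Sketch: code written against the C11 names compiles against either shim.
 * atomic_fetch_or() and atomic_exchange() are the two new entry points. */
static atomic_uint pending; /* zero-initialized */

static void set_flag(unsigned bit) {
    atomic_fetch_or(&pending, 1u << bit);        /* InterlockedOr / __atomic_fetch_or */
}

static unsigned take_all(void) {
    return atomic_exchange(&pending, 0);         /* InterlockedExchange / __atomic_exchange_n */
}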
@@ -43,9 +43,8 @@ extern "C" {
typedef struct Dav1dContext Dav1dContext;
typedef struct Dav1dRef Dav1dRef;

-#define DAV1D_MAX_FRAME_THREADS 256
-#define DAV1D_MAX_TILE_THREADS 64
-#define DAV1D_MAX_POSTFILTER_THREADS 256
+#define DAV1D_MAX_THREADS 256
+#define DAV1D_MAX_FRAME_DELAY 256

typedef struct Dav1dLogger {
void *cookie; ///< Custom data to pass to the callback.
@@ -60,16 +59,15 @@ typedef struct Dav1dLogger {
} Dav1dLogger;

typedef struct Dav1dSettings {
-int n_frame_threads;
-int n_tile_threads;
+int n_threads; ///< number of threads (0 = auto)
+int max_frame_delay; ///< Set to 1 for low-latency decoding (0 = auto)
int apply_grain;
int operating_point; ///< select an operating point for scalable AV1 bitstreams (0 - 31)
int all_layers; ///< output all spatial layers of a scalable AV1 biststream
unsigned frame_size_limit; ///< maximum frame size, in pixels (0 = unlimited)
Dav1dPicAllocator allocator; ///< Picture allocator callback.
Dav1dLogger logger; ///< Logger callback.
-int n_postfilter_threads;
-uint8_t reserved[28]; ///< reserved for future use
+uint8_t reserved[32]; ///< reserved for future use
} Dav1dSettings;

/**
@@ -105,7 +103,12 @@ DAV1D_API int dav1d_open(Dav1dContext **c_out, const Dav1dSettings *s);
 * @param buf The data to be parser.
 * @param sz Size of the data.
 *
- * @return 0 on success, or < 0 (a negative DAV1D_ERR code) on error.
+ * @return
+ *         0: Success, and out is filled with the parsed Sequence Header
+ *            OBU parameters.
+ *         DAV1D_ERR(ENOENT): No Sequence Header OBUs were found in the buffer.
+ *         other negative DAV1D_ERR codes: Invalid data in the buffer, invalid passed-in
+ *            arguments, and other errors during parsing.
 *
 * @note It is safe to feed this function data containing other OBUs than a
 *       Sequence Header, as they will simply be ignored. If there is more than
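The return contract documented above distinguishes "no Sequence Header found" from hard errors; a sketch of a caller honoring it (the function name is illustrative):

#include <errno.h>
#include <dav1d/dav1d.h>

/* Sketch: probe a buffer per the documented return codes of
 * dav1d_parse_sequence_header(). */
static int contains_av1_seq_hdr(const uint8_t *buf, size_t sz) {
    Dav1dSequenceHeader seq;
    int res = dav1d_parse_sequence_header(&seq, buf, sz);
    if (res == 0)
        return 1;                  /* parsed a Sequence Header OBU */
    if (res == DAV1D_ERR(ENOENT))
        return 0;                  /* valid data, but no Sequence Header OBU */
    return res;                    /* invalid data or arguments */
}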
@@ -23,14 +23,14 @@
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

project('dav1d', ['c'],
-    version: '0.9.0',
+    version: '0.9.2',
    default_options: ['c_std=c99',
                      'warning_level=2',
                      'buildtype=release',
                      'b_ndebug=if-release'],
    meson_version: '>= 0.49.0')

-dav1d_soname_version = '5.1.0'
+dav1d_soname_version = '6.0.0'
dav1d_api_version_array = dav1d_soname_version.split('.')
dav1d_api_version_major = dav1d_api_version_array[0]
dav1d_api_version_minor = dav1d_api_version_array[1]
@@ -173,16 +173,16 @@ libm_dependency = cc.find_library('m', required: false)

# Header checks

-stdatomic_dependency = []
+stdatomic_dependencies = []
if not cc.check_header('stdatomic.h')
    if cc.get_id() == 'msvc'
        # we have a custom replacement for MSVC
-        stdatomic_dependency = declare_dependency(
+        stdatomic_dependencies += declare_dependency(
            include_directories : include_directories('include/compat/msvc'),
        )
    elif cc.compiles('''int main() { int v = 0; return __atomic_fetch_add(&v, 1, __ATOMIC_SEQ_CST); }''',
                     name : 'GCC-style atomics', args : test_args)
-        stdatomic_dependency = declare_dependency(
+        stdatomic_dependencies += declare_dependency(
            include_directories : include_directories('include/compat/gcc'),
        )
    else
@@ -190,6 +190,11 @@ if not cc.check_header('stdatomic.h')
    endif
endif

+if host_machine.cpu_family().startswith('wasm')
+    # enable atomics + bulk-memory features
+    stdatomic_dependencies += thread_dependency.partial_dependency(compile_args: true)
+endif
+
if cc.check_header('unistd.h')
    cdata.set('HAVE_UNISTD_H', 1)
endif
@@ -247,6 +252,7 @@ if cc.get_argument_syntax() != 'msvc'
        '-Wno-maybe-uninitialized',
        '-Wno-missing-field-initializers',
        '-Wno-unused-parameter',
+        '-Wstrict-prototypes',
        '-Werror=missing-prototypes',
        '-Wshorten-64-to-32',
    ]
@@ -369,11 +375,18 @@ if host_machine.cpu_family().startswith('x86')
    cdata_asm.set10('ARCH_X86_64', host_machine.cpu_family() == 'x86_64')
    cdata_asm.set10('ARCH_X86_32', host_machine.cpu_family() == 'x86')
    cdata_asm.set10('PIC', true)
+
+    # Convert SSE asm into (128-bit) AVX when compiler flags are set to use AVX instructions
+    cdata_asm.set10('FORCE_VEX_ENCODING', cc.get_define('__AVX__') != '')
endif

cdata.set10('ARCH_PPC64LE', host_machine.cpu() == 'ppc64le')

-if cc.symbols_have_underscore_prefix()
+# meson's cc.symbols_have_underscore_prefix() is unfortunately unrelieably
+# when additional flags like '-fprofile-instr-generate' are passed via CFLAGS
+# see following meson issue https://github.com/mesonbuild/meson/issues/5482
+if (host_machine.system() == 'darwin' or
+   (host_machine.system() == 'windows' and host_machine.cpu_family() == 'x86'))
    cdata.set10('PREFIX', true)
    cdata_asm.set10('PREFIX', true)
endif
(Diffs for two large files not shown because of their size.)
@@ -0,0 +1,97 @@
/*
 * Copyright © 2021, VideoLAN and dav1d authors
 * Copyright © 2021, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

// void dav1d_splat_mv_neon(refmvs_block **rr, const refmvs_block *rmv,
//                          int bx4, int bw4, int bh4)

function splat_mv_neon, export=1
push {r4, lr}
vld1.8 {q3}, [r1]
ldr r4, [sp, #8]
clz r3, r3
adr lr, L(splat_tbl)
sub r3, r3, #26
vext.8 q2, q3, q3, #12
ldr r3, [lr, r3, lsl #2]
add r2, r2, r2, lsl #1
vext.8 q0, q2, q3, #4
add r3, lr, r3
vext.8 q1, q2, q3, #8
lsl r2, r2, #2
vext.8 q2, q2, q3, #12
vmov q3, q0
1:
ldr r1, [r0], #4
subs r4, r4, #1
add r1, r1, r2
bx r3

.align 2
L(splat_tbl):
.word 320f - L(splat_tbl) + CONFIG_THUMB
.word 160f - L(splat_tbl) + CONFIG_THUMB
.word 80f - L(splat_tbl) + CONFIG_THUMB
.word 40f - L(splat_tbl) + CONFIG_THUMB
.word 20f - L(splat_tbl) + CONFIG_THUMB
.word 10f - L(splat_tbl) + CONFIG_THUMB

10:
vst1.8 {d0}, [r1]
vstr s2, [r1, #8]
bgt 1b
pop {r4, pc}
20:
vst1.8 {q0}, [r1]
vstr d2, [r1, #16]
bgt 1b
pop {r4, pc}
40:
vst1.8 {q0, q1}, [r1]!
vst1.8 {q2}, [r1]
bgt 1b
pop {r4, pc}
320:
vst1.8 {q0, q1}, [r1]!
vst1.8 {q2, q3}, [r1]!
vst1.8 {q1, q2}, [r1]!
vst1.8 {q0, q1}, [r1]!
vst1.8 {q2, q3}, [r1]!
vst1.8 {q1, q2}, [r1]!
160:
vst1.8 {q0, q1}, [r1]!
vst1.8 {q2, q3}, [r1]!
vst1.8 {q1, q2}, [r1]!
80:
vst1.8 {q0, q1}, [r1]!
vst1.8 {q2, q3}, [r1]!
vst1.8 {q1, q2}, [r1]
bgt 1b
pop {r4, pc}
endfunc
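For orientation, splat_mv broadcasts a single 12-byte refmvs_block into bw4 consecutive slots on each of bh4 rows, starting bx4 entries in. A rough byte-level C rendering of that behaviour (a sketch, not dav1d's actual reference implementation or types):

#include <stdint.h>
#include <string.h>

/* Rough C equivalent of the NEON splat_mv above: each refmvs_block entry
 * is 12 bytes, hence the *12 scaling of bx4 done with add/lsl in the asm. */
static void splat_mv_sketch(uint8_t **rr, const uint8_t rmv[12],
                            int bx4, int bw4, int bh4) {
    for (int y = 0; y < bh4; y++) {
        uint8_t *out = rr[y] + 12 * (size_t)bx4;
        for (int x = 0; x < bw4; x++, out += 12)
            memcpy(out, rmv, 12);
    }
}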
@@ -186,32 +186,53 @@ endfunc
add x0, x0, #GRAIN_WIDTH-32
.endm

-.macro get_grain_2 dst
+function get_grain_2_neon
increment_seed 2
read_rand x14, 11, 1
read_rand x15, 11, 0
add x14, x3, x14, lsl #1
add x15, x3, x15, lsl #1
-ld1 {\dst\().h}[0], [x14]
-ld1 {\dst\().h}[1], [x15]
-srshl v0.4h, \dst\().4h, v31.4h
-xtn \dst\().8b, v0.8h
+ld1 {v0.h}[0], [x14]
+ld1 {v0.h}[1], [x15]
+srshl v0.4h, v0.4h, v31.4h
+xtn v0.8b, v0.8h
+ret
+endfunc
+
+.macro get_grain_2 dst
+bl get_grain_2_neon
+.ifnc \dst, v0
+mov \dst\().8b, v0.8b
+.endif
.endm

// w15 holds the number of entries to produce
-// w14 holds the previous output entry
+// w14, w16 and w17 hold the previous output entries
// v0 holds the vector of produced entries
// v1 holds the input vector of sums from above
-function output_lag1_neon
+.macro output_lag n
+function output_lag\n\()_neon
1:
read_shift_rand x13, 11
mov w11, v1.s[0]
ldrsh w12, [x3, x13, lsl #1]
ext v0.16b, v0.16b, v0.16b, #1
-madd w14, w14, w4, w11 // sum (above) + *coeff * prev output
-add w14, w14, w8 // 1 << (ar_coeff_shift - 1)
+.if \n == 1
+madd w11, w14, w4, w11 // sum (above) + *coeff * prev output
+.elseif \n == 2
+madd w11, w16, w4, w11 // sum (above) + *coeff * prev output 1
+madd w11, w14, w17, w11 // += *coeff * prev output 2
+mov w16, w14
+.else
+madd w11, w17, w4, w11 // sum (above) + *coeff * prev output 1
+madd w11, w16, w20, w11 // sum (above) + *coeff * prev output 2
+madd w11, w14, w21, w11 // += *coeff * prev output 3
+mov w17, w16
+mov w16, w14
+.endif
+add w14, w11, w8 // 1 << (ar_coeff_shift - 1)
add w12, w12, w10 // 1 << (4 + grain_scale_shift - 1)
asr w14, w14, w7 // >> ar_coeff_shift
add w12, w12, w10
asr w12, w12, w9 // >> (4 + grain_scale_shift)
add w14, w14, w12
cmp w14, w5
@@ -224,6 +245,12 @@ function output_lag1_neon
b.gt 1b
ret
endfunc
+.endm
+
+output_lag 1
+output_lag 2
+output_lag 3
+

function sum_lag1_above_neon
smull v2.8h, v3.8b, v28.8b
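The output_lag macro above unifies the three AR lag variants. In C terms the per-sample recursion looks roughly like this (a sketch with illustrative names, mirroring the asm comments; the noise term is assumed already scaled by the grain_scale_shift path):

/* Rough C view of output_lag\n: combine the precomputed above-row sum with
 * the weighted previous `lag` outputs, round/shift, add scaled noise, clamp. */
static int output_lag_sketch(int sum_above, const int prev[3],
                             const int coeff[3], int lag, int noise,
                             int ar_coeff_shift, int grain_min, int grain_max) {
    int sum = sum_above;
    for (int i = 0; i < lag; i++)
        sum += coeff[i] * prev[i];
    int grain = noise + ((sum + (1 << (ar_coeff_shift - 1))) >> ar_coeff_shift);
    if (grain > grain_max) grain = grain_max;
    if (grain < grain_min) grain = grain_min;
    return grain;
}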
@@ -243,10 +270,8 @@ function sum_lag1_above_neon
ret
endfunc

-.macro sum_lag1_func type, uv_layout, edge, elems=16
-function sum_\type\()_lag1_\edge\()_neon
-str x30, [sp, #-16]!
-bl sum_lag1_above_neon
+.macro sum_lag_n_body lag, type, uv_layout, edge, elems, store, uv_coeff
+bl sum_\lag\()_above_neon
.ifc \type, uv_420
add x12, x19, #GRAIN_WIDTH
ld1 {v22.16b, v23.16b}, [x19], #32
@@ -257,35 +282,41 @@ function sum_\type\()_lag1_\edge\()_neon
saddlp v25.8h, v25.16b
add v22.8h, v22.8h, v24.8h
add v23.8h, v23.8h, v25.8h
-rshrn v0.8b, v22.8h, #2
-rshrn2 v0.16b, v23.8h, #2
+rshrn v0.8b, v22.8h, #2
+rshrn2 v0.16b, v23.8h, #2
.endif
.ifc \type, uv_422
ld1 {v22.16b, v23.16b}, [x19], #32
saddlp v22.8h, v22.16b
saddlp v23.8h, v23.16b
-rshrn v0.8b, v22.8h, #1
-rshrn2 v0.16b, v23.8h, #1
+rshrn v0.8b, v22.8h, #1
+rshrn2 v0.16b, v23.8h, #1
.endif
.ifc \type, uv_444
ld1 {v0.16b}, [x19], #16
.endif
.if \uv_layout
+.ifnb \uv_coeff
+dup v1.16b, \uv_coeff
+smull v2.8h, v0.8b, v1.8b
+smull2 v3.8h, v0.16b, v1.16b
+.else
smull v2.8h, v0.8b, v30.8b
smull2 v3.8h, v0.16b, v30.16b
+.endif
saddw v4.4s, v4.4s, v2.4h
saddw2 v5.4s, v5.4s, v2.8h
saddw v6.4s, v6.4s, v3.4h
saddw2 v7.4s, v7.4s, v3.8h
.endif
.if \uv_layout && \elems == 16
-b sum_lag1_y_\edge\()_start
+b sum_\lag\()_y_\edge\()_start
.elseif \uv_layout == 444 && \elems == 15
-b sum_lag1_y_\edge\()_start
+b sum_\lag\()_y_\edge\()_start
.elseif \uv_layout == 422 && \elems == 9
-b sum_lag1_uv_420_\edge\()_start
+b sum_\lag\()_uv_420_\edge\()_start
.else
-sum_lag1_\type\()_\edge\()_start:
+sum_\lag\()_\type\()_\edge\()_start:
.ifc \edge, left
increment_seed 4
read_rand x12, 11, 3
@@ -301,28 +332,34 @@ sum_lag1_\type\()_\edge\()_start:
srshl v0.8h, v0.8h, v31.8h
xtn2 v0.16b, v0.8h
ext v4.16b, v4.16b, v4.16b, #12
+.ifc \lag, lag3
+smov w17, v0.b[13]
+.endif
+.ifnc \lag, lag1
+smov w16, v0.b[14]
+.endif
smov w14, v0.b[15]

mov v1.16b, v4.16b
mov w15, #1
-bl output_lag1_neon
+bl output_\lag\()_neon
.else
increment_seed 4, shift=0
mov v1.16b, v4.16b
mov w15, #4
-bl output_lag1_neon
+bl output_\lag\()_neon
.endif

increment_seed 4, shift=0
mov v1.16b, v5.16b
mov w15, #4
-bl output_lag1_neon
+bl output_\lag\()_neon

increment_seed 4, shift=0
mov v1.16b, v6.16b
.if \elems == 9
mov w15, #1
-bl output_lag1_neon
+bl output_\lag\()_neon
lsr w2, w2, #3

read_rand x12, 11, 2
@@ -339,14 +376,14 @@ sum_lag1_\type\()_\edge\()_start:
ext v0.16b, v0.16b, v1.16b, #7
.else
mov w15, #4
-bl output_lag1_neon
+bl output_\lag\()_neon

increment_seed 4, shift=0
mov v1.16b, v7.16b

.ifc \edge, right
mov w15, #3
-bl output_lag1_neon
+bl output_\lag\()_neon
read_shift_rand x15, 11
add x15, x3, x15, lsl #1
ld1 {v1.h}[0], [x15]
@@ -354,12 +391,21 @@ sum_lag1_\type\()_\edge\()_start:
ext v0.16b, v0.16b, v1.16b, #1
.else
mov w15, #4
-bl output_lag1_neon
+bl output_\lag\()_neon
.endif
.endif
+.if \store
st1 {v0.16b}, [x0], #16
+.endif
ldr x30, [sp], #16
ret
.endif
+.endm
+
+.macro sum_lag1_func type, uv_layout, edge, elems=16
+function sum_\type\()_lag1_\edge\()_neon
+str x30, [sp, #-16]!
+sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems, store=0
endfunc
.endm

@@ -400,34 +446,6 @@ sum_lag1_func uv_420, 420, right, 9
sum_lag1 uv_420, \dst, \left, \mid, \right, \edge
.endm

-// w15 holds the number of entries to produce
-// w14 and w16 hold the previous output entries
-// v0 holds the vector of produced entries
-// v1 holds the input vector of sums from above
-function output_lag2_neon
-1:
-read_shift_rand x13, 11
-mov w11, v1.s[0]
-ldrsh w12, [x3, x13, lsl #1]
-ext v0.16b, v0.16b, v0.16b, #1
-madd w11, w16, w4, w11 // sum (above) + *coeff * prev output 1
-madd w11, w14, w17, w11 // += *coeff * prev output 2
-mov w16, w14
-add w14, w11, w8 // 1 << (ar_coeff_shift - 1)
-asr w14, w14, w7 // >> ar_coeff_shift
-add w12, w12, w10
-asr w12, w12, w9 // >> (4 + grain_scale_shift)
-add w14, w14, w12
-cmp w14, w5
-csel w14, w14, w5, le
-cmp w14, w6
-csel w14, w14, w6, ge
-subs w15, w15, #1
-ext v1.16b, v1.16b, v1.16b, #4
-ins v0.b[15], w14
-b.gt 1b
-ret
-endfunc

function sum_lag2_above_neon
sub x12, x0, #2*GRAIN_WIDTH - 16
@@ -530,123 +548,7 @@ function sum_\type\()_lag2_\edge\()_neon
ld1 {v17.16b}, [x12] // load the previous block right above
ld1 {v20.16b}, [x13]
.endif
-bl sum_lag2_above_neon
-.ifc \type, uv_420
-add x12, x19, #GRAIN_WIDTH
-ld1 {v22.16b, v23.16b}, [x19], #32
-ld1 {v24.16b, v25.16b}, [x12]
-saddlp v22.8h, v22.16b
-saddlp v23.8h, v23.16b
-saddlp v24.8h, v24.16b
-saddlp v25.8h, v25.16b
-add v22.8h, v22.8h, v24.8h
-add v23.8h, v23.8h, v25.8h
-rshrn v0.8b, v22.8h, #2
-rshrn2 v0.16b, v23.8h, #2
-.endif
-.ifc \type, uv_422
-ld1 {v22.16b, v23.16b}, [x19], #32
-saddlp v22.8h, v22.16b
-saddlp v23.8h, v23.16b
-rshrn v0.8b, v22.8h, #1
-rshrn2 v0.16b, v23.8h, #1
-.endif
-.ifc \type, uv_444
-ld1 {v0.16b}, [x19], #16
-.endif
-.if \uv_layout
-dup v1.16b, v30.b[12]
-smull v2.8h, v0.8b, v1.8b
-smull2 v3.8h, v0.16b, v1.16b
-saddw v4.4s, v4.4s, v2.4h
-saddw2 v5.4s, v5.4s, v2.8h
-saddw v6.4s, v6.4s, v3.4h
-saddw2 v7.4s, v7.4s, v3.8h
-.endif
-.if \uv_layout && \elems == 16
-b sum_lag2_y_\edge\()_start
-.elseif \uv_layout == 444 && \elems == 15
-b sum_lag2_y_\edge\()_start
-.elseif \uv_layout == 422 && \elems == 9
-b sum_lag2_uv_420_\edge\()_start
-.else
-sum_lag2_\type\()_\edge\()_start:
-.ifc \edge, left
-increment_seed 4
-read_rand x12, 11, 3
-read_rand x13, 11, 2
-read_rand x14, 11, 1
-add x12, x3, x12, lsl #1
-add x13, x3, x13, lsl #1
-add x14, x3, x14, lsl #1
-ld1 {v0.h}[5], [x12]
-ld1 {v0.h}[6], [x13]
-ld1 {v0.h}[7], [x14]
-lsl x2, x2, #1 // shift back the state as if we'd done increment_seed with shift=0
-srshl v0.8h, v0.8h, v31.8h
-xtn2 v0.16b, v0.8h
-ext v4.16b, v4.16b, v4.16b, #12
-smov w16, v0.b[14]
-smov w14, v0.b[15]
-
-mov v1.16b, v4.16b
-mov w15, #1
-bl output_lag2_neon
-.else
-increment_seed 4, shift=0
-mov v1.16b, v4.16b
-mov w15, #4
-bl output_lag2_neon
-.endif
-
-increment_seed 4, shift=0
-mov v1.16b, v5.16b
-mov w15, #4
-bl output_lag2_neon
-
-increment_seed 4, shift=0
-mov v1.16b, v6.16b
-.if \elems == 9
-mov w15, #1
-bl output_lag2_neon
-lsr w2, w2, #3
-
-read_rand x12, 11, 2
-read_rand x13, 11, 1
-read_rand x14, 11, 0
-add x12, x3, x12, lsl #1
-add x13, x3, x13, lsl #1
-add x14, x3, x14, lsl #1
-ld1 {v1.h}[0], [x12]
-ld1 {v1.h}[1], [x13]
-ld1 {v1.h}[2], [x14]
-srshl v1.4h, v1.4h, v31.4h
-xtn v1.8b, v1.8h
-ext v0.16b, v0.16b, v1.16b, #7
-.else
-mov w15, #4
-bl output_lag2_neon
-
-increment_seed 4, shift=0
-mov v1.16b, v7.16b
-
-.ifc \edge, right
-mov w15, #3
-bl output_lag2_neon
-read_shift_rand x15, 11
-add x15, x3, x15, lsl #1
-ld1 {v1.h}[0], [x15]
-srshl v1.4h, v1.4h, v31.4h
-ext v0.16b, v0.16b, v1.16b, #1
-.else
-mov w15, #4
-bl output_lag2_neon
-.endif
-.endif
-st1 {v0.16b}, [x0], #16
-ldr x30, [sp], #16
-ret
-.endif
+sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[12]
endfunc
.endm

@@ -664,37 +566,6 @@ sum_lag2_func uv_420, 420, mid
sum_lag2_func uv_420, 420, right, 9


-// w15 holds the number of entries to produce
-// w14, w16 and w17 hold the previous output entries
-// v0 holds the vector of produced entries
-// v1 holds the input vector of sums from above
-function output_lag3_neon
-1:
-read_shift_rand x13, 11
-mov w11, v1.s[0]
-ldrsh w12, [x3, x13, lsl #1]
-ext v0.16b, v0.16b, v0.16b, #1
-madd w11, w17, w4, w11 // sum (above) + *coeff * prev output 1
-madd w11, w16, w20, w11 // sum (above) + *coeff * prev output 2
-madd w11, w14, w21, w11 // += *coeff * prev output 3
-mov w17, w16
-mov w16, w14
-add w14, w11, w8 // 1 << (ar_coeff_shift - 1)
-asr w14, w14, w7 // >> ar_coeff_shift
-add w12, w12, w10
-asr w12, w12, w9 // >> (4 + grain_scale_shift)
-add w14, w14, w12
-cmp w14, w5
-csel w14, w14, w5, le
-cmp w14, w6
-csel w14, w14, w6, ge
-subs w15, w15, #1
-ext v1.16b, v1.16b, v1.16b, #4
-ins v0.b[15], w14
-b.gt 1b
-ret
-endfunc

function sum_lag3_above_neon
sub x11, x0, #3*GRAIN_WIDTH - 16
sub x12, x0, #2*GRAIN_WIDTH - 16
@@ -890,124 +761,7 @@ function sum_\type\()_lag3_\edge\()_neon
ld1 {v17.16b}, [x12]
ld1 {v20.16b}, [x13]
.endif
-bl sum_lag3_above_neon
-.ifc \type, uv_420
-add x12, x19, #GRAIN_WIDTH
-ld1 {v22.16b, v23.16b}, [x19], #32
-ld1 {v24.16b, v25.16b}, [x12]
-saddlp v22.8h, v22.16b
-saddlp v23.8h, v23.16b
-saddlp v24.8h, v24.16b
-saddlp v25.8h, v25.16b
-add v22.8h, v22.8h, v24.8h
-add v23.8h, v23.8h, v25.8h
-rshrn v0.8b, v22.8h, #2
-rshrn2 v0.16b, v23.8h, #2
-.endif
-.ifc \type, uv_422
-ld1 {v22.16b, v23.16b}, [x19], #32
-saddlp v22.8h, v22.16b
-saddlp v23.8h, v23.16b
-rshrn v0.8b, v22.8h, #1
-rshrn2 v0.16b, v23.8h, #1
-.endif
-.ifc \type, uv_444
-ld1 {v0.16b}, [x19], #16
-.endif
-.if \uv_layout
-dup v1.16b, v30.b[8]
-smull v2.8h, v0.8b, v1.8b
-smull2 v3.8h, v0.16b, v1.16b
-saddw v4.4s, v4.4s, v2.4h
-saddw2 v5.4s, v5.4s, v2.8h
-saddw v6.4s, v6.4s, v3.4h
-saddw2 v7.4s, v7.4s, v3.8h
-.endif
-.if \uv_layout && \elems == 16
-b sum_lag3_y_\edge\()_start
-.elseif \uv_layout == 444 && \elems == 15
-b sum_lag3_y_\edge\()_start
-.elseif \uv_layout == 422 && \elems == 9
-b sum_lag3_uv_420_\edge\()_start
-.else
-sum_lag3_\type\()_\edge\()_start:
-.ifc \edge, left
-increment_seed 4
-read_rand x12, 11, 3
-read_rand x13, 11, 2
-read_rand x14, 11, 1
-add x12, x3, x12, lsl #1
-add x13, x3, x13, lsl #1
-add x14, x3, x14, lsl #1
-ld1 {v0.h}[5], [x12]
-ld1 {v0.h}[6], [x13]
-ld1 {v0.h}[7], [x14]
-lsl x2, x2, #1 // shift back the state as if we'd done increment_seed with shift=0
-srshl v0.8h, v0.8h, v31.8h
-xtn2 v0.16b, v0.8h
-ext v4.16b, v4.16b, v4.16b, #12
-smov w17, v0.b[13]
-smov w16, v0.b[14]
-smov w14, v0.b[15]
-
-mov v1.16b, v4.16b
-mov w15, #1
-bl output_lag3_neon
-.else
-increment_seed 4, shift=0
-mov v1.16b, v4.16b
-mov w15, #4
-bl output_lag3_neon
-.endif
-
-increment_seed 4, shift=0
-mov v1.16b, v5.16b
-mov w15, #4
-bl output_lag3_neon
-
-increment_seed 4, shift=0
-mov v1.16b, v6.16b
-.if \elems == 9
-mov w15, #1
-bl output_lag3_neon
-lsr w2, w2, #3
-
-read_rand x12, 11, 2
-read_rand x13, 11, 1
-read_rand x14, 11, 0
-add x12, x3, x12, lsl #1
-add x13, x3, x13, lsl #1
-add x14, x3, x14, lsl #1
-ld1 {v1.h}[0], [x12]
-ld1 {v1.h}[1], [x13]
-ld1 {v1.h}[2], [x14]
-srshl v1.4h, v1.4h, v31.4h
-xtn v1.8b, v1.8h
-ext v0.16b, v0.16b, v1.16b, #7
-.else
-mov w15, #4
-bl output_lag3_neon
-
-increment_seed 4, shift=0
-mov v1.16b, v7.16b
-
-.ifc \edge, right
-mov w15, #3
-bl output_lag3_neon
-read_shift_rand x15, 11
-add x15, x3, x15, lsl #1
-ld1 {v1.h}[0], [x15]
-srshl v1.4h, v1.4h, v31.4h
-ext v0.16b, v0.16b, v1.16b, #1
-.else
-mov w15, #4
-bl output_lag3_neon
-.endif
-.endif
-st1 {v0.16b}, [x0], #16
-ldr x30, [sp], #16
-ret
-.endif
+sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[8]
endfunc
.endm

@@ -1061,7 +815,6 @@ function get_grain_row_44_neon
endfunc

function add_uv_444_coeff_lag0_neon
-str x30, [sp, #-16]!
add_coeff_lag0_start:
smull v2.8h, v0.8b, v27.8b
smull2 v3.8h, v0.16b, v27.16b
@@ -1071,20 +824,18 @@ add_coeff_lag0_start:
saddw2 v3.8h, v3.8h, v1.16b
sqxtn v2.8b, v2.8h
sqxtn2 v2.16b, v3.8h
-ldr x30, [sp], #16
ret
endfunc

function add_uv_420_coeff_lag0_neon
-str x30, [sp, #-16]!
ld1 {v4.16b, v5.16b}, [x19], #32
ld1 {v6.16b, v7.16b}, [x12], #32
saddlp v4.8h, v4.16b
saddlp v5.8h, v5.16b
saddlp v6.8h, v6.16b
saddlp v7.8h, v7.16b
-add v4.8h, v4.8h, v6.8h
-add v5.8h, v5.8h, v7.8h
+add v4.8h, v4.8h, v6.8h
+add v5.8h, v5.8h, v7.8h
rshrn v4.8b, v4.8h, #2
rshrn2 v4.16b, v5.8h, #2
and v0.16b, v4.16b, v0.16b
@@ -1092,7 +843,6 @@ function add_uv_420_coeff_lag0_neon
endfunc

function add_uv_422_coeff_lag0_neon
-str x30, [sp, #-16]!
ld1 {v4.16b, v5.16b}, [x19], #32
saddlp v4.8h, v4.16b
saddlp v5.8h, v5.16b
@@ -1153,8 +903,6 @@ function generate_grain_\type\()_8bpc_neon, export=1

br x16

-ret
-
L(generate_grain_\type\()_lag0):
.ifc \type, y
mov w1, #GRAIN_HEIGHT
@@ -1208,15 +956,15 @@ L(generate_grain_\type\()_lag1):
ld1r {v28.16b}, [x4], #1 // ar_coeffs_y[1]
ld1r {v29.16b}, [x4] // ar_coeffs_y[2]
.ifc \type, y
-ldrsb w4, [x4, #1] // ar_coeffs_y[4]
+ldrsb w4, [x4, #1] // ar_coeffs_y[3]
.else
add x4, x4, #2
.endif

mov w1, #3
.ifc \type, uv_444
-ld1r {v30.16b}, [x4] // ar_coeffs_uv[5]
-ldursb w4, [x4, #-1] // ar_coeffs_uv[4]
+ld1r {v30.16b}, [x4] // ar_coeffs_uv[4]
+ldursb w4, [x4, #-1] // ar_coeffs_uv[3]
.endif
bl generate_grain_rows_neon

@@ -1270,8 +1018,7 @@ L(generate_grain_\type\()_lag2):
ret

L(generate_grain_\type\()_lag3):
-ldr q29, [x4] // ar_coeffs_y[0-15]
-ldr q30, [x4, #16] // ar_coeffs_y[16-23], ar_coeffs_uv[16-24]
+ld1 {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
stp d8, d9, [sp, #16]
stp d10, d11, [sp, #32]
stp d12, d13, [sp, #48]
@@ -1377,8 +1124,6 @@ function generate_grain_\type\()_8bpc_neon, export=1

br x16

-ret
-
L(generate_grain_\type\()_lag0):
dup v28.8h, w7
ld1r {v27.16b}, [x4] // ar_coeffs_uv[0]
@@ -1423,8 +1168,8 @@ L(generate_grain_\type\()_lag1):
add x4, x4, #2

mov w1, #3
-ld1r {v30.16b}, [x4] // ar_coeffs_uv[5]
-ldursb w4, [x4, #-1] // ar_coeffs_uv[4]
+ld1r {v30.16b}, [x4] // ar_coeffs_uv[4]
+ldursb w4, [x4, #-1] // ar_coeffs_uv[3]
bl generate_grain_rows_44_neon

set_height w1, \type
(Diff for one large file not shown because of its size.)
@@ -0,0 +1,85 @@
/*
 * Copyright © 2021, VideoLAN and dav1d authors
 * Copyright © 2021, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

// void dav1d_splat_mv_neon(refmvs_block **rr, const refmvs_block *rmv,
//                          int bx4, int bw4, int bh4)

function splat_mv_neon, export=1
ld1 {v3.16b}, [x1]
clz w3, w3
adr x5, L(splat_tbl)
sub w3, w3, #26
ext v2.16b, v3.16b, v3.16b, #12
ldrh w3, [x5, w3, uxtw #1]
add w2, w2, w2, lsl #1
ext v0.16b, v2.16b, v3.16b, #4
sub x3, x5, w3, uxtw
ext v1.16b, v2.16b, v3.16b, #8
lsl w2, w2, #2
ext v2.16b, v2.16b, v3.16b, #12
1:
ldr x1, [x0], #8
subs w4, w4, #1
add x1, x1, x2
br x3

10:
st1 {v0.8b}, [x1]
str s2, [x1, #8]
b.gt 1b
ret
20:
st1 {v0.16b}, [x1]
str d1, [x1, #16]
b.gt 1b
ret
320:
st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
160:
st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
80:
st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
40:
st1 {v0.16b, v1.16b, v2.16b}, [x1]
b.gt 1b
ret

L(splat_tbl):
.hword L(splat_tbl) - 320b
.hword L(splat_tbl) - 160b
.hword L(splat_tbl) - 80b
.hword L(splat_tbl) - 40b
.hword L(splat_tbl) - 20b
.hword L(splat_tbl) - 10b
endfunc

@@ -31,8 +31,6 @@
#include "src/film_grain.h"
#include "asm-offsets.h"

#if ARCH_AARCH64

CHECK_OFFSET(Dav1dFilmGrainData, seed, FGD_SEED);
CHECK_OFFSET(Dav1dFilmGrainData, ar_coeff_lag, FGD_AR_COEFF_LAG);
CHECK_OFFSET(Dav1dFilmGrainData, ar_coeffs_y, FGD_AR_COEFFS_Y);
@@ -60,7 +58,6 @@ void BF(dav1d_generate_grain_uv_ ## suff, neon)(entry buf[][GRAIN_WIDTH], \
GEN_GRAIN_UV(420);
GEN_GRAIN_UV(422);
GEN_GRAIN_UV(444);
#endif

// Use ptrdiff_t instead of int for the last few parameters, to get the
// same layout of parameters on the stack across platforms.
@@ -209,12 +206,10 @@ COLD void bitfn(dav1d_film_grain_dsp_init_arm)(Dav1dFilmGrainDSPContext *const c

if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;

#if ARCH_AARCH64 && BITDEPTH == 8
c->generate_grain_y = BF(dav1d_generate_grain_y, neon);
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, neon);
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, neon);
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, neon);
#endif

c->fgy_32x32xn = fgy_32x32xn_neon;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_neon;

@@ -0,0 +1,39 @@
/*
 * Copyright © 2021, VideoLAN and dav1d authors
 * Copyright © 2021, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/cpu.h"
#include "src/refmvs.h"

decl_splat_mv_fn(dav1d_splat_mv_neon);

COLD void dav1d_refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();

if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;

c->splat_mv = dav1d_splat_mv_neon;
}

@@ -113,7 +113,7 @@ cdef_filter_block_c(pixel *dst, const ptrdiff_t dst_stride,
const int pri_tap = 4 - ((pri_strength >> bitdepth_min_8) & 1);
const int pri_shift = imax(0, damping - ulog2(pri_strength));
if (sec_strength) {
-const int sec_shift = imax(0, damping - ulog2(sec_strength));
+const int sec_shift = damping - ulog2(sec_strength);
do {
for (int x = 0; x < w; x++) {
const int px = dst[x];
@@ -180,7 +180,7 @@ cdef_filter_block_c(pixel *dst, const ptrdiff_t dst_stride,
}
} else { // sec_strength only
assert(sec_strength);
-const int sec_shift = imax(0, damping - ulog2(sec_strength));
+const int sec_shift = damping - ulog2(sec_strength);
do {
for (int x = 0; x < w; x++) {
const int px = dst[x];

@@ -4096,16 +4096,15 @@ void dav1d_cdf_thread_copy(CdfContext *const dst, const CdfThreadContext *const
}

int dav1d_cdf_thread_alloc(Dav1dContext *const c, CdfThreadContext *const cdf,
-                           struct thread_data *const t)
+                           const int have_frame_mt)
{
cdf->ref = dav1d_ref_create_using_pool(c->cdf_pool,
                                       sizeof(CdfContext) + sizeof(atomic_uint));
if (!cdf->ref) return DAV1D_ERR(ENOMEM);
cdf->data.cdf = cdf->ref->data;
-if (t) {
+if (have_frame_mt) {
cdf->progress = (atomic_uint *) &cdf->data.cdf[1];
atomic_init(cdf->progress, 0);
-cdf->t = t;
}
return 0;
}
@@ -4123,22 +4122,3 @@ void dav1d_cdf_thread_unref(CdfThreadContext *const cdf) {
dav1d_ref_dec(&cdf->ref);
memset(cdf, 0, sizeof(*cdf));
}
-
-void dav1d_cdf_thread_wait(CdfThreadContext *const cdf) {
-if (!cdf->t) return;
-
-if (atomic_load(cdf->progress)) return;
-pthread_mutex_lock(&cdf->t->lock);
-while (!atomic_load(cdf->progress))
-pthread_cond_wait(&cdf->t->cond, &cdf->t->lock);
-pthread_mutex_unlock(&cdf->t->lock);
-}
-
-void dav1d_cdf_thread_signal(CdfThreadContext *const cdf) {
-if (!cdf->t) return;
-
-pthread_mutex_lock(&cdf->t->lock);
-atomic_store(cdf->progress, 1);
-pthread_cond_broadcast(&cdf->t->cond);
-pthread_mutex_unlock(&cdf->t->lock);
-}
@@ -135,23 +135,16 @@ typedef struct CdfThreadContext {
CdfContext *cdf; // if ref != NULL
unsigned qcat; // if ref == NULL, from static CDF tables
} data;
-struct thread_data *t;
atomic_uint *progress;
} CdfThreadContext;

void dav1d_cdf_thread_init_static(CdfThreadContext *cdf, int qidx);
int dav1d_cdf_thread_alloc(Dav1dContext *c, CdfThreadContext *cdf,
-                           struct thread_data *t);
+                           const int have_frame_mt);
void dav1d_cdf_thread_copy(CdfContext *dst, const CdfThreadContext *src);
void dav1d_cdf_thread_ref(CdfThreadContext *dst, CdfThreadContext *src);
void dav1d_cdf_thread_unref(CdfThreadContext *cdf);
void dav1d_cdf_thread_update(const Dav1dFrameHeader *hdr, CdfContext *dst,
                             const CdfContext *src);
-
-/*
- * These are binary signals (so a signal is either "done" or "not done").
- */
-void dav1d_cdf_thread_wait(CdfThreadContext *cdf);
-void dav1d_cdf_thread_signal(CdfThreadContext *cdf);

#endif /* DAV1D_SRC_CDF_H */
@@ -29,6 +29,17 @@
#include <stdint.h>

#include "src/cpu.h"
+#include "src/log.h"
+
+#ifdef _WIN32
+#include <windows.h>
+#elif defined(__linux__)
+#include <sched.h>
+#include <unistd.h>
+#elif defined(__APPLE__)
+#include <sys/sysctl.h>
+#include <sys/types.h>
+#endif

static unsigned flags = 0;

@@ -61,3 +72,36 @@ COLD unsigned dav1d_get_cpu_flags(void) {
COLD void dav1d_set_cpu_flags_mask(const unsigned mask) {
flags_mask = mask;
}
+
+COLD int dav1d_num_logical_processors(Dav1dContext *const c) {
+#ifdef _WIN32
+#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+GROUP_AFFINITY affinity;
+if (GetThreadGroupAffinity(GetCurrentThread(), &affinity)) {
+int num_processors = 1;
+while (affinity.Mask &= affinity.Mask - 1)
+num_processors++;
+return num_processors;
+}
+#else
+SYSTEM_INFO system_info;
+GetNativeSystemInfo(&system_info);
+return system_info.dwNumberOfProcessors;
+#endif
+#elif defined(__linux__)
+#ifdef CPU_COUNT
+cpu_set_t affinity;
+if (!sched_getaffinity(0, sizeof(affinity), &affinity))
+return CPU_COUNT(&affinity);
+#else
+return (int)sysconf(_SC_NPROCESSORS_ONLN);
+#endif
+#elif defined(__APPLE__)
+int num_processors;
+size_t length = sizeof(num_processors);
+if (!sysctlbyname("hw.logicalcpu", &num_processors, &length, NULL, 0))
+return num_processors;
+#endif
+dav1d_log(c, "Unable to detect thread count, defaulting to single-threaded mode\n");
+return 1;
+}
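This new helper backs the n_threads = 0 auto mode (and is why the macOS sandbox policy above gained hw.logicalcpu). A sketch of how the resolved count might feed the settings; the clamp to DAV1D_MAX_THREADS and the exact call site in dav1d_open() are assumptions for illustration:

/* Sketch: resolving the automatic thread count inside the library when the
 * caller leaves s->n_threads at 0. */
int n_threads = s->n_threads;
if (!n_threads) {
    n_threads = dav1d_num_logical_processors(c);
    if (n_threads > DAV1D_MAX_THREADS) n_threads = DAV1D_MAX_THREADS;
}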
|
@ -33,6 +33,7 @@
|
|||
#include "common/attributes.h"
|
||||
|
||||
#include "dav1d/common.h"
|
||||
#include "dav1d/dav1d.h"
|
||||
|
||||
#if ARCH_AARCH64 || ARCH_ARM
|
||||
#include "src/arm/cpu.h"
|
||||
|
@ -45,5 +46,6 @@
|
|||
void dav1d_init_cpu(void);
|
||||
unsigned dav1d_get_cpu_flags(void);
|
||||
DAV1D_API void dav1d_set_cpu_flags_mask(unsigned mask);
|
||||
int dav1d_num_logical_processors(Dav1dContext *c);
|
||||
|
||||
#endif /* DAV1D_SRC_CPU_H */
|
||||
|
|
(Diff for one large file not shown because of its size.)
@ -79,6 +79,11 @@
|
|||
%define mangle(x) x
|
||||
%endif
|
||||
|
||||
; Use VEX-encoding even in non-AVX functions
|
||||
%ifndef FORCE_VEX_ENCODING
|
||||
%define FORCE_VEX_ENCODING 0
|
||||
%endif
|
||||
|
||||
%macro SECTION_RODATA 0-1 16
|
||||
%ifidn __OUTPUT_FORMAT__,win32
|
||||
SECTION .rdata align=%1
|
||||
|
@ -1008,7 +1013,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
|
|||
%endmacro
|
||||
|
||||
%macro INIT_XMM 0-1+
|
||||
%assign avx_enabled 0
|
||||
%assign avx_enabled FORCE_VEX_ENCODING
|
||||
%define RESET_MM_PERMUTATION INIT_XMM %1
|
||||
%define mmsize 16
|
||||
%define mova movdqa
|
||||
|
@ -1339,26 +1344,50 @@ INIT_XMM
|
|||
%elif %0 >= 9
|
||||
__instr %6, %7, %8, %9
|
||||
%elif %0 == 8
|
||||
%if avx_enabled && %5
|
||||
%if avx_enabled && __sizeofreg >= 16 && %4 == 0
|
||||
%xdefine __src1 %7
|
||||
%xdefine __src2 %8
|
||||
%ifnum regnumof%7
|
||||
%ifnum regnumof%8
|
||||
%if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32
|
||||
; Most VEX-encoded instructions require an additional byte to encode when
|
||||
; src2 is a high register (e.g. m8..15). If the instruction is commutative
|
||||
; we can swap src1 and src2 when doing so reduces the instruction length.
|
||||
%xdefine __src1 %8
|
||||
%xdefine __src2 %7
|
||||
%if %5
|
||||
%ifnum regnumof%7
|
||||
%ifnum regnumof%8
|
||||
%if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32
|
||||
; Most VEX-encoded instructions require an additional byte to encode when
|
||||
; src2 is a high register (e.g. m8..15). If the instruction is commutative
|
||||
; we can swap src1 and src2 when doing so reduces the instruction length.
|
||||
%xdefine __src1 %8
|
||||
%xdefine __src2 %7
|
||||
%endif
|
||||
%endif
|
||||
%elifnum regnumof%8 ; put memory operands in src2 when possible
|
||||
%xdefine __src1 %8
|
||||
%xdefine __src2 %7
|
||||
%else
|
||||
%assign __emulate_avx 1
|
||||
%endif
|
||||
%elifnnum regnumof%7
|
||||
; EVEX allows imm8 shift instructions to be used with memory operands,
|
||||
; but VEX does not. This handles those special cases.
|
||||
%ifnnum %8
|
||||
%assign __emulate_avx 1
|
||||
%elif notcpuflag(avx512)
|
||||
%assign __emulate_avx 1
|
||||
%endif
|
||||
%endif
|
||||
__instr %6, __src1, __src2
|
||||
%if __emulate_avx ; a separate load is required
|
||||
%if %3
|
||||
vmovaps %6, %7
|
||||
%else
|
||||
vmovdqa %6, %7
|
||||
%endif
|
||||
__instr %6, %8
|
||||
%else
|
||||
__instr %6, __src1, __src2
|
||||
%endif
|
||||
%else
|
||||
__instr %6, %7, %8
|
||||
%endif
|
||||
%elif %0 == 7
|
||||
%if avx_enabled && %5
|
||||
%if avx_enabled && __sizeofreg >= 16 && %5
|
||||
%xdefine __src1 %6
|
||||
%xdefine __src2 %7
|
||||
%ifnum regnumof%6
|
||||
|
|
|
@ -34,8 +34,7 @@
typedef struct Dav1dFrameContext Dav1dFrameContext;
typedef struct Dav1dTileState Dav1dTileState;
typedef struct Dav1dTileContext Dav1dTileContext;
typedef struct Dav1dPostFilterContext Dav1dPostFilterContext;
typedef struct Dav1dTaskContext Dav1dTaskContext;
typedef struct Dav1dTask Dav1dTask;

#include "common/attributes.h"

@ -78,8 +77,8 @@ struct Dav1dContext {
Dav1dFrameContext *fc;
unsigned n_fc;

Dav1dPostFilterContext *pfc;
unsigned n_pfc;
Dav1dTaskContext *tc;
unsigned n_tc;

// cache of OBUs that make up a single frame before we submit them
// to a frame worker to be decoded

@ -112,14 +111,20 @@ struct Dav1dContext {
unsigned next;
} frame_thread;

// postfilter threading (refer to pfc[] for per_thread thingies)
struct PostFilterThreadData {
// task threading (refer to tc[] for per_thread thingies)
struct TaskThreadData {
pthread_mutex_t lock;
pthread_cond_t cond;
struct Dav1dTask *tasks;
int frame_cnt;
atomic_uint first;
unsigned cur;
// This is used for delayed reset of the task cur pointer when
// such operation is needed but the thread doesn't enter a critical
// section (typically when executing the next sbrow task locklessly).
// See src/thread_task.c:reset_task_cur().
atomic_uint reset_task_cur;
atomic_int cond_signaled;
int inited;
} postfilter_thread;
} task_thread;

// reference/entropy state
Dav1dMemPool *segmap_pool;

@ -134,6 +139,7 @@ struct Dav1dContext {
CdfThreadContext cdf[8];

Dav1dDSPContext dsp[3 /* 8, 10, 12 bits/component */];
Dav1dRefmvsDSPContext refmvs_dsp;

// tree to keep track of which edges are available
struct {

@ -159,6 +165,29 @@ struct Dav1dContext {
Dav1dMemPool *picture_pool;
};

enum TaskType {
DAV1D_TASK_TYPE_INIT,
DAV1D_TASK_TYPE_TILE_ENTROPY,
DAV1D_TASK_TYPE_TILE_RECONSTRUCTION,
DAV1D_TASK_TYPE_DEBLOCK_COLS,
DAV1D_TASK_TYPE_DEBLOCK_ROWS,
DAV1D_TASK_TYPE_CDEF,
DAV1D_TASK_TYPE_SUPER_RESOLUTION,
DAV1D_TASK_TYPE_LOOP_RESTORATION,
DAV1D_TASK_TYPE_ENTROPY_PROGRESS,
};

struct Dav1dTask {
unsigned frame_idx; // frame thread id
enum TaskType type; // task work
int sby; // sbrow

// task dependencies
int recon_progress, deblock_progress, cdef_progress, lr_progress;
int deps_skip;
struct Dav1dTask *next; // only used in task queue
};

struct Dav1dFrameContext {
Dav1dRef *seq_hdr_ref;
Dav1dSequenceHeader *seq_hdr;

@ -188,8 +217,6 @@ struct Dav1dFrameContext {
int resize_step[2 /* y, uv */], resize_start[2 /* y, uv */];

const Dav1dContext *c;
Dav1dTileContext *tc;
int n_tc;
Dav1dTileState *ts;
int n_ts;
const Dav1dDSPContext *dsp;

@ -197,7 +224,8 @@ struct Dav1dFrameContext {
recon_b_intra_fn recon_b_intra;
recon_b_inter_fn recon_b_inter;
filter_sbrow_fn filter_sbrow;
filter_sbrow_fn filter_sbrow_deblock;
filter_sbrow_fn filter_sbrow_deblock_cols;
filter_sbrow_fn filter_sbrow_deblock_rows;
filter_sbrow_fn filter_sbrow_cdef;
filter_sbrow_fn filter_sbrow_resize;
filter_sbrow_fn filter_sbrow_lr;

@ -218,8 +246,9 @@ struct Dav1dFrameContext {
int bitdepth_max;

struct {
struct thread_data td;
int pass, die;
int next_tile_row[2 /* 0: reconstruction, 1: entropy */];
int entropy_progress;
atomic_int deblock_progress, cdef_progress, lr_progress; // in sby units
// indexed using t->by * f->b4_stride + t->bx
Av1Block *b;
struct CodedBlockInfo {

@ -243,7 +272,8 @@ struct Dav1dFrameContext {
Av1Restoration *lr_mask;
int top_pre_cdef_toggle;
int mask_sz /* w*h */, lr_mask_sz, cdef_line_sz[2] /* stride */;
int lr_line_sz, re_sz /* h */;
size_t lr_plane_sz; /* w*sbh*4*is_sb128 if n_tc > 1, else w*12 */
int re_sz /* h */;
ALIGN(Av1FilterLUT lim_lut, 16);
int last_sharpness;
uint8_t lvl[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */];

@ -253,32 +283,34 @@ struct Dav1dFrameContext {
pixel *lr_lpf_line[3 /* plane */];

// in-loop filter per-frame state keeping
int tile_row; // for carry-over at tile row edges
uint8_t *start_of_tile_row;
int start_of_tile_row_sz;
pixel *p[3], *sr_p[3];
Av1Filter *mask_ptr, *prev_mask_ptr;
int restore_planes; // enum LrRestorePlanes

struct {
pthread_cond_t cond;
struct PostFilterThreadData *pftd;
struct Dav1dTask *tasks;
int num_tasks;
int npf;
int done;
int inited;
} thread;
} lf;

struct {
pthread_cond_t cond;
struct TaskThreadData *ttd;
struct Dav1dTask *tasks, *tile_tasks[2], init_task;
int num_tasks, num_tile_tasks;
int done[2];
int update_set; // whether we need to update CDF reference
atomic_int error;
int task_counter;
struct Dav1dTask *task_head, *task_tail;
// Points to the task directly before the cur pointer in the queue.
// This cur pointer is theoretical here, we actually keep track of the
// "prev_t" variable. This is needed to not loose the tasks in
// [head;cur-1] when picking one for execution.
struct Dav1dTask *task_cur_prev;
} task_thread;

// threading (refer to tc[] for per-thread things)
struct FrameTileThreadData {
uint64_t available;
pthread_mutex_t lock;
pthread_cond_t cond, icond;
int tasks_left, num_tasks;
int (*task_idx_to_sby_and_tile_idx)[2];
int titsati_sz, titsati_init[2];
uint16_t titsati_index_rows[1 + DAV1D_MAX_TILE_ROWS];
int inited;
int (*lowest_pixel_mem)[7][2];
int lowest_pixel_mem_sz;
} tile_thread;
};

@ -291,15 +323,16 @@ struct Dav1dTileState {
int col, row; // in tile units
} tiling;

atomic_int progress; // in sby units, TILE_ERROR after a decoding error
struct {
pthread_mutex_t lock;
pthread_cond_t cond;
} tile_thread;
// in sby units, TILE_ERROR after a decoding error
atomic_int progress[2 /* 0: reconstruction, 1: entropy */];
struct {
uint8_t *pal_idx;
coef *cf;
} frame_thread;
} frame_thread[2 /* 0: reconstruction, 1: entropy */];

// in fullpel units, [0] = Y, [1] = UV, used for progress requirements
// each entry is one tile-sbrow; middle index is refidx
int (*lowest_pixel)[7][2];

uint16_t dqmem[DAV1D_MAX_SEGMENTS][3 /* plane */][2 /* dc/ac */];
const uint16_t (*dq)[3][2];

@ -312,7 +345,8 @@ struct Dav1dTileState {
Av1RestorationUnit *lr_ref[3];
};

struct Dav1dTileContext {
struct Dav1dTaskContext {
const Dav1dContext *c;
const Dav1dFrameContext *f;
Dav1dTileState *ts;
int bx, by;

@ -375,18 +409,16 @@ struct Dav1dTileContext {
// keeps it accessible
enum Filter2d tl_4x4_filter;

struct {
int pass;
} frame_thread;
struct {
struct thread_data td;
struct TaskThreadData *ttd;
struct FrameTileThreadData *fttd;
int flushed;
int die;
} tile_thread;
};

struct Dav1dPostFilterContext {
Dav1dContext *c;
struct thread_data td;
int flushed;
int die;
} task_thread;
};

#endif /* DAV1D_SRC_INTERNAL_H */
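The structural change above replaces the old per-purpose threads (frame, tile, postfilter) with one pool of Dav1dTaskContext workers pulling typed Dav1dTask entries from a shared queue. A simplified sketch of that pattern, under the assumption that it mirrors the shape (not the detail) of dav1d's thread_task.c; the task/queue/worker names are illustrative:

    #include <pthread.h>
    #include <stddef.h>

    /* Illustrative task/queue types mirroring struct Dav1dTask above. */
    struct task { int type, sby; struct task *next; };
    struct queue {
        pthread_mutex_t lock;
        pthread_cond_t cond;
        struct task *head;
        int die;
    };

    static void *worker(void *arg) {
        struct queue *const q = arg;
        pthread_mutex_lock(&q->lock);
        while (!q->die) {
            struct task *const t = q->head;
            if (!t) { // nothing runnable: sleep until work is queued
                pthread_cond_wait(&q->cond, &q->lock);
                continue;
            }
            q->head = t->next;
            pthread_mutex_unlock(&q->lock);
            // execute t->type (tile entropy/reconstruction, deblock cols/rows,
            // cdef, super-resolution, loop restoration) for sbrow t->sby
            pthread_mutex_lock(&q->lock);
        }
        pthread_mutex_unlock(&q->lock);
        return NULL;
    }

One worker type executing every stage is what lets n_threads replace the separate frame/tile/postfilter thread counts in the public API.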
@ -31,6 +31,7 @@
#include <stdint.h>

#include "dav1d/headers.h"
#include "common/attributes.h"

enum ObuMetaType {
OBU_META_HDR_CLL = 1,
@ -35,8 +35,11 @@
#include "src/internal.h"
#include "src/levels.h"

void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *f,
pixel *const p[3], Av1Filter *lflvl,
int sby, int start_of_tile_row);
void bytefn(dav1d_loopfilter_sbrow_cols)(const Dav1dFrameContext *f,
pixel *const p[3], Av1Filter *lflvl,
int sby, int start_of_tile_row);
void bytefn(dav1d_loopfilter_sbrow_rows)(const Dav1dFrameContext *f,
pixel *const p[3], Av1Filter *lflvl,
int sby);

#endif /* DAV1D_SRC_LF_APPLY_H */
@ -170,13 +170,12 @@ static inline void filter_plane_rows_uv(const Dav1dFrameContext *const f,
}
}

void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *const f,
pixel *const p[3], Av1Filter *const lflvl,
int sby, const int start_of_tile_row)
void bytefn(dav1d_loopfilter_sbrow_cols)(const Dav1dFrameContext *const f,
pixel *const p[3], Av1Filter *const lflvl,
int sby, const int start_of_tile_row)
{
int x, have_left;
// Don't filter outside the frame
const int have_top = sby > 0;
const int is_sb64 = !f->seq_hdr->sb128;
const int starty4 = (sby & is_sb64) << 4;
const int sbsz = 32 >> is_sb64;

@ -271,13 +270,6 @@ void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *const f,
imin(32, f->w4 - x * 32), starty4, endy4);
}

level_ptr = f->lf.level + f->b4_stride * sby * sbsz;
for (ptr = p[0], x = 0; x < f->sb128w; x++, ptr += 128, level_ptr += 32) {
filter_plane_rows_y(f, have_top, level_ptr, f->b4_stride,
lflvl[x].filter_y[1], ptr, f->cur.stride[0],
imin(32, f->w4 - x * 32), starty4, endy4);
}

if (!f->frame_hdr->loopfilter.level_u && !f->frame_hdr->loopfilter.level_v)
return;

@ -292,7 +284,35 @@ void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *const f,
(imin(32, f->w4 - x * 32) + ss_hor) >> ss_hor,
starty4 >> ss_ver, uv_endy4, ss_ver);
}
}

void bytefn(dav1d_loopfilter_sbrow_rows)(const Dav1dFrameContext *const f,
pixel *const p[3], Av1Filter *const lflvl,
int sby)
{
int x;
// Don't filter outside the frame
const int have_top = sby > 0;
const int is_sb64 = !f->seq_hdr->sb128;
const int starty4 = (sby & is_sb64) << 4;
const int sbsz = 32 >> is_sb64;
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const unsigned endy4 = starty4 + imin(f->h4 - sby * sbsz, sbsz);
const unsigned uv_endy4 = (endy4 + ss_ver) >> ss_ver;

pixel *ptr;
uint8_t (*level_ptr)[4] = f->lf.level + f->b4_stride * sby * sbsz;
for (ptr = p[0], x = 0; x < f->sb128w; x++, ptr += 128, level_ptr += 32) {
filter_plane_rows_y(f, have_top, level_ptr, f->b4_stride,
lflvl[x].filter_y[1], ptr, f->cur.stride[0],
imin(32, f->w4 - x * 32), starty4, endy4);
}

if (!f->frame_hdr->loopfilter.level_u && !f->frame_hdr->loopfilter.level_v)
return;

ptrdiff_t uv_off;
level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver);
for (uv_off = 0, x = 0; x < f->sb128w;
x++, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor)
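Splitting the single deblock sbrow function into a cols pass and a rows pass is what allows the scheduler to expose deblocking as the two separate DAV1D_TASK_TYPE_DEBLOCK_COLS/_ROWS stages declared in internal.h. A hedged sketch of how the two function pointers on Dav1dFrameContext would compose (the real dispatch lives in dav1d's task scheduler; this wrapper is illustrative):

    // Vertical-edge (cols) filtering of an sbrow must complete before
    // horizontal-edge (rows) filtering of that same sbrow, but as separate
    // tasks the two stages can pipeline across different sbrows.
    void filter_sbrow_deblock(Dav1dFrameContext *const f, const int sby) {
        f->filter_sbrow_deblock_cols(f, sby); // vertical edges first
        f->filter_sbrow_deblock_rows(f, sby); // then horizontal edges
    }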
@ -63,9 +63,8 @@ COLD const char *dav1d_version(void) {
}

COLD void dav1d_default_settings(Dav1dSettings *const s) {
s->n_frame_threads = 1;
s->n_tile_threads = 1;
s->n_postfilter_threads = 1;
s->n_threads = 0;
s->max_frame_delay = 0;
s->apply_grain = 1;
s->allocator.cookie = NULL;
s->allocator.alloc_picture_callback = dav1d_default_picture_alloc;

@ -101,12 +100,10 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {

validate_input_or_ret(c_out != NULL, DAV1D_ERR(EINVAL));
validate_input_or_ret(s != NULL, DAV1D_ERR(EINVAL));
validate_input_or_ret(s->n_postfilter_threads >= 1 &&
s->n_postfilter_threads <= DAV1D_MAX_POSTFILTER_THREADS, DAV1D_ERR(EINVAL));
validate_input_or_ret(s->n_tile_threads >= 1 &&
s->n_tile_threads <= DAV1D_MAX_TILE_THREADS, DAV1D_ERR(EINVAL));
validate_input_or_ret(s->n_frame_threads >= 1 &&
s->n_frame_threads <= DAV1D_MAX_FRAME_THREADS, DAV1D_ERR(EINVAL));
validate_input_or_ret(s->n_threads >= 0 &&
s->n_threads <= DAV1D_MAX_THREADS, DAV1D_ERR(EINVAL));
validate_input_or_ret(s->max_frame_delay >= 0 &&
s->max_frame_delay <= DAV1D_MAX_FRAME_DELAY, DAV1D_ERR(EINVAL));
validate_input_or_ret(s->allocator.alloc_picture_callback != NULL,
DAV1D_ERR(EINVAL));
validate_input_or_ret(s->allocator.release_picture_callback != NULL,
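The three per-purpose thread counts collapse into a single n_threads knob plus a max_frame_delay bound, where 0 means "derive automatically" (the derivation follows just below in dav1d_open). A minimal caller-side sketch; the settings fields and entry points are taken from this diff, while the chosen values and error handling are illustrative:

    #include <dav1d/dav1d.h>

    int open_decoder(Dav1dContext **const c) {
        Dav1dSettings s;
        dav1d_default_settings(&s); // n_threads = 0 (auto), max_frame_delay = 0 (auto)
        s.n_threads = 8;            // one shared worker pool, no per-purpose split
        s.max_frame_delay = 1;      // bound frame parallelism to reduce latency
        return dav1d_open(c, &s);   // negative on error, e.g. out-of-range settings
    }

With n_threads = 8 and max_frame_delay = 0, the derivation below yields n_tc = 8 workers and n_fc = min(8, 8) = 8 frame contexts; setting max_frame_delay = 1 instead clamps n_fc to 1.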
@ -166,44 +163,28 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
c->flush = &c->flush_mem;
atomic_init(c->flush, 0);

c->n_pfc = s->n_postfilter_threads;
c->n_fc = s->n_frame_threads;
c->fc = dav1d_alloc_aligned(sizeof(*c->fc) * s->n_frame_threads, 32);
if (!c->fc) goto error;
memset(c->fc, 0, sizeof(*c->fc) * s->n_frame_threads);
c->n_tc = s->n_threads ? s->n_threads :
iclip(dav1d_num_logical_processors(c), 1, DAV1D_MAX_THREADS);
c->n_fc = s->max_frame_delay ? umin(s->max_frame_delay, c->n_tc) :
umin(c->n_tc, 8);

if (c->n_pfc > 1) {
c->pfc = dav1d_alloc_aligned(sizeof(*c->pfc) * s->n_postfilter_threads, 32);
if (!c->pfc) goto error;
memset(c->pfc, 0, sizeof(*c->pfc) * s->n_postfilter_threads);
if (pthread_mutex_init(&c->postfilter_thread.lock, NULL)) goto error;
if (pthread_cond_init(&c->postfilter_thread.cond, NULL)) {
pthread_mutex_destroy(&c->postfilter_thread.lock);
c->fc = dav1d_alloc_aligned(sizeof(*c->fc) * c->n_fc, 32);
if (!c->fc) goto error;
memset(c->fc, 0, sizeof(*c->fc) * c->n_fc);

c->tc = dav1d_alloc_aligned(sizeof(*c->tc) * c->n_tc, 64);
if (!c->tc) goto error;
memset(c->tc, 0, sizeof(*c->tc) * c->n_tc);
if (c->n_tc > 1) {
if (pthread_mutex_init(&c->task_thread.lock, NULL)) goto error;
if (pthread_cond_init(&c->task_thread.cond, NULL)) {
pthread_mutex_destroy(&c->task_thread.lock);
goto error;
}
c->postfilter_thread.inited = 1;
for (int n = 0; n < s->n_frame_threads; n++) {
Dav1dFrameContext *const f = &c->fc[n];
if (pthread_cond_init(&f->lf.thread.cond, NULL)) goto error;
f->lf.thread.pftd = &c->postfilter_thread;
f->lf.thread.done = 1;
f->lf.thread.inited = 1;
}
for (int n = 0; n < s->n_postfilter_threads; ++n) {
Dav1dPostFilterContext *const pf = &c->pfc[n];
pf->c = c;
if (pthread_mutex_init(&pf->td.lock, NULL)) goto error;
if (pthread_cond_init(&pf->td.cond, NULL)) {
pthread_mutex_destroy(&pf->td.lock);
goto error;
}
if (pthread_create(&pf->td.thread, &thread_attr, dav1d_postfilter_task, pf)) {
pthread_cond_destroy(&c->postfilter_thread.cond);
pthread_mutex_destroy(&c->postfilter_thread.lock);
goto error;
}
pf->td.inited = 1;
}
c->task_thread.cur = c->n_fc;
atomic_init(&c->task_thread.reset_task_cur, UINT_MAX);
atomic_init(&c->task_thread.cond_signaled, 0);
c->task_thread.inited = 1;
}

if (c->n_fc > 1) {
@ -211,61 +192,37 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
calloc(c->n_fc, sizeof(*c->frame_thread.out_delayed));
if (!c->frame_thread.out_delayed) goto error;
}
for (int n = 0; n < s->n_frame_threads; n++) {
for (unsigned n = 0; n < c->n_fc; n++) {
Dav1dFrameContext *const f = &c->fc[n];
if (c->n_tc > 1)
if (pthread_cond_init(&f->task_thread.cond, NULL)) goto error;
f->c = c;
f->task_thread.ttd = &c->task_thread;
f->lf.last_sharpness = -1;
f->n_tc = s->n_tile_threads;
f->tc = dav1d_alloc_aligned(sizeof(*f->tc) * s->n_tile_threads, 64);
if (!f->tc) goto error;
memset(f->tc, 0, sizeof(*f->tc) * s->n_tile_threads);
if (f->n_tc > 1) {
if (pthread_mutex_init(&f->tile_thread.lock, NULL)) goto error;
if (pthread_cond_init(&f->tile_thread.cond, NULL)) {
pthread_mutex_destroy(&f->tile_thread.lock);
goto error;
}
if (pthread_cond_init(&f->tile_thread.icond, NULL)) {
pthread_mutex_destroy(&f->tile_thread.lock);
pthread_cond_destroy(&f->tile_thread.cond);
goto error;
}
f->tile_thread.inited = 1;
}
for (int m = 0; m < s->n_tile_threads; m++) {
Dav1dTileContext *const t = &f->tc[m];
t->f = f;
memset(t->cf_16bpc, 0, sizeof(t->cf_16bpc));
if (f->n_tc > 1) {
if (pthread_mutex_init(&t->tile_thread.td.lock, NULL)) goto error;
if (pthread_cond_init(&t->tile_thread.td.cond, NULL)) {
pthread_mutex_destroy(&t->tile_thread.td.lock);
goto error;
}
t->tile_thread.fttd = &f->tile_thread;
if (pthread_create(&t->tile_thread.td.thread, &thread_attr, dav1d_tile_task, t)) {
pthread_cond_destroy(&t->tile_thread.td.cond);
pthread_mutex_destroy(&t->tile_thread.td.lock);
goto error;
}
t->tile_thread.td.inited = 1;
}
}
dav1d_refmvs_init(&f->rf);
if (c->n_fc > 1) {
if (pthread_mutex_init(&f->frame_thread.td.lock, NULL)) goto error;
if (pthread_cond_init(&f->frame_thread.td.cond, NULL)) {
pthread_mutex_destroy(&f->frame_thread.td.lock);
}

for (unsigned m = 0; m < c->n_tc; m++) {
Dav1dTaskContext *const t = &c->tc[m];
t->f = &c->fc[0];
t->task_thread.ttd = &c->task_thread;
t->c = c;
memset(t->cf_16bpc, 0, sizeof(t->cf_16bpc));
if (c->n_tc > 1) {
if (pthread_mutex_init(&t->task_thread.td.lock, NULL)) goto error;
if (pthread_cond_init(&t->task_thread.td.cond, NULL)) {
pthread_mutex_destroy(&t->task_thread.td.lock);
goto error;
}
if (pthread_create(&f->frame_thread.td.thread, &thread_attr, dav1d_frame_task, f)) {
pthread_cond_destroy(&f->frame_thread.td.cond);
pthread_mutex_destroy(&f->frame_thread.td.lock);
if (pthread_create(&t->task_thread.td.thread, &thread_attr, dav1d_worker_task, t)) {
pthread_cond_destroy(&t->task_thread.td.cond);
pthread_mutex_destroy(&t->task_thread.td.lock);
goto error;
}
f->frame_thread.td.inited = 1;
t->task_thread.td.inited = 1;
}
}
dav1d_refmvs_dsp_init(&c->refmvs_dsp);

// intra edge tree
c->intra_edge.root[BL_128X128] = &c->intra_edge.branch_sb128[0].node;

@ -297,6 +254,7 @@ int dav1d_parse_sequence_header(Dav1dSequenceHeader *const out,

Dav1dSettings s;
dav1d_default_settings(&s);
s.n_threads = 1;
s.logger.callback = NULL;

Dav1dContext *c;

@ -318,7 +276,7 @@ int dav1d_parse_sequence_header(Dav1dSequenceHeader *const out,
}

if (!c->seq_hdr) {
res = DAV1D_ERR(EINVAL);
res = DAV1D_ERR(ENOENT);
goto error;
}
@ -394,15 +352,23 @@ static int drain_picture(Dav1dContext *const c, Dav1dPicture *const out) {
do {
const unsigned next = c->frame_thread.next;
Dav1dFrameContext *const f = &c->fc[next];
pthread_mutex_lock(&f->frame_thread.td.lock);
pthread_mutex_lock(&c->task_thread.lock);
while (f->n_tile_data > 0)
pthread_cond_wait(&f->frame_thread.td.cond,
&f->frame_thread.td.lock);
pthread_mutex_unlock(&f->frame_thread.td.lock);
pthread_cond_wait(&f->task_thread.cond,
&f->task_thread.ttd->lock);
Dav1dThreadPicture *const out_delayed =
&c->frame_thread.out_delayed[next];
if (out_delayed->p.data[0] || atomic_load(&f->task_thread.error)) {
if (atomic_load(&c->task_thread.first) + 1U < c->n_fc)
atomic_fetch_add(&c->task_thread.first, 1U);
else
atomic_store(&c->task_thread.first, 0);
if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
c->task_thread.cur--;
}
if (++c->frame_thread.next == c->n_fc)
c->frame_thread.next = 0;
pthread_mutex_unlock(&c->task_thread.lock);
if (out_delayed->p.data[0]) {
const unsigned progress =
atomic_load_explicit(&out_delayed->progress[1],
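Draining walks frame_thread.out_delayed as a ring buffer indexed by frame_thread.next and wrapping at n_fc; each slot holds at most one finished-but-undelivered picture. A minimal stand-alone sketch of that access pattern, with illustrative types:

    // Ring of n slots: 'next' always names the oldest in-flight entry.
    // Advancing past the end wraps to slot 0, as in drain_picture above.
    typedef struct { void *data; } slot;

    static void *drain_one(slot *const ring, unsigned *const next,
                           const unsigned n) {
        void *const out = ring[*next].data; // may be NULL if nothing finished
        ring[*next].data = NULL;
        if (++*next == n) *next = 0;        // wrap-around
        return out;
    }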
@ -509,51 +475,43 @@ void dav1d_flush(Dav1dContext *const c) {
dav1d_ref_dec(&c->content_light_ref);
dav1d_ref_dec(&c->itut_t35_ref);

if (c->n_fc == 1 && c->n_pfc == 1) return;
if (c->n_fc == 1 && c->n_tc == 1) return;
atomic_store(c->flush, 1);

// stop running tasks in worker threads
if (c->n_tc > 1) {
pthread_mutex_lock(&c->task_thread.lock);
for (unsigned i = 0; i < c->n_tc; i++) {
Dav1dTaskContext *const tc = &c->tc[i];
while (!tc->task_thread.flushed) {
pthread_cond_wait(&tc->task_thread.td.cond, &c->task_thread.lock);
}
}
for (unsigned i = 0; i < c->n_fc; i++) {
c->fc[i].task_thread.task_head = NULL;
c->fc[i].task_thread.task_tail = NULL;
c->fc[i].task_thread.task_cur_prev = NULL;
}
atomic_init(&c->task_thread.first, 0);
c->task_thread.cur = c->n_fc;
atomic_store(&c->task_thread.reset_task_cur, UINT_MAX);
atomic_store(&c->task_thread.cond_signaled, 0);
pthread_mutex_unlock(&c->task_thread.lock);
}

// wait for threads to complete flushing
if (c->n_pfc > 1)
pthread_mutex_lock(&c->postfilter_thread.lock);
atomic_store(c->flush, 1);
if (c->n_pfc > 1) {
pthread_cond_broadcast(&c->postfilter_thread.cond);
pthread_mutex_unlock(&c->postfilter_thread.lock);
}
if (c->n_fc == 1) goto skip_ft_flush;
for (unsigned n = 0, next = c->frame_thread.next; n < c->n_fc; n++, next++) {
if (next == c->n_fc) next = 0;
Dav1dFrameContext *const f = &c->fc[next];
pthread_mutex_lock(&f->frame_thread.td.lock);
if (f->n_tile_data > 0) {
while (f->n_tile_data > 0)
pthread_cond_wait(&f->frame_thread.td.cond,
&f->frame_thread.td.lock);
assert(!f->cur.data[0]);
}
pthread_mutex_unlock(&f->frame_thread.td.lock);
Dav1dThreadPicture *const out_delayed =
&c->frame_thread.out_delayed[next];
if (out_delayed->p.data[0])
dav1d_thread_picture_unref(out_delayed);
}
c->frame_thread.next = 0;
skip_ft_flush:
if (c->n_pfc > 1) {
for (unsigned i = 0; i < c->n_pfc; ++i) {
Dav1dPostFilterContext *const pf = &c->pfc[i];
pthread_mutex_lock(&pf->td.lock);
if (!pf->flushed)
pthread_cond_wait(&pf->td.cond, &pf->td.lock);
pf->flushed = 0;
pthread_mutex_unlock(&pf->td.lock);
}
pthread_mutex_lock(&c->postfilter_thread.lock);
c->postfilter_thread.tasks = NULL;
pthread_mutex_unlock(&c->postfilter_thread.lock);
for (unsigned i = 0; i < c->n_fc; ++i) {
freep(&c->fc[i].lf.thread.tasks);
c->fc[i].lf.thread.num_tasks = 0;
if (c->n_fc > 1) {
for (unsigned n = 0, next = c->frame_thread.next; n < c->n_fc; n++, next++) {
if (next == c->n_fc) next = 0;
Dav1dFrameContext *const f = &c->fc[next];
dav1d_decode_frame_exit(f, -1);
f->n_tile_data = 0;
Dav1dThreadPicture *out_delayed = &c->frame_thread.out_delayed[next];
if (out_delayed->p.data[0]) {
dav1d_thread_picture_unref(out_delayed);
}
}
c->frame_thread.next = 0;
}
atomic_store(c->flush, 0);
}
@ -569,82 +527,44 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {

if (flush) dav1d_flush(c);

if (c->pfc) {
struct PostFilterThreadData *pftd = &c->postfilter_thread;
if (pftd->inited) {
pthread_mutex_lock(&pftd->lock);
for (unsigned n = 0; n < c->n_pfc && c->pfc[n].td.inited; n++)
c->pfc[n].die = 1;
pthread_cond_broadcast(&pftd->cond);
pthread_mutex_unlock(&pftd->lock);
for (unsigned n = 0; n < c->n_pfc && c->pfc[n].td.inited; n++) {
pthread_join(c->pfc[n].td.thread, NULL);
pthread_cond_destroy(&c->pfc[n].td.cond);
pthread_mutex_destroy(&c->pfc[n].td.lock);
if (c->tc) {
struct TaskThreadData *ttd = &c->task_thread;
if (ttd->inited) {
pthread_mutex_lock(&ttd->lock);
for (unsigned n = 0; n < c->n_tc && c->tc[n].task_thread.td.inited; n++)
c->tc[n].task_thread.die = 1;
pthread_cond_broadcast(&ttd->cond);
pthread_mutex_unlock(&ttd->lock);
for (unsigned n = 0; n < c->n_tc; n++) {
Dav1dTaskContext *const pf = &c->tc[n];
if (!pf->task_thread.td.inited) break;
pthread_join(pf->task_thread.td.thread, NULL);
pthread_cond_destroy(&pf->task_thread.td.cond);
pthread_mutex_destroy(&pf->task_thread.td.lock);
}
pthread_cond_destroy(&pftd->cond);
pthread_mutex_destroy(&pftd->lock);
pthread_cond_destroy(&ttd->cond);
pthread_mutex_destroy(&ttd->lock);
}
dav1d_free_aligned(c->pfc);
dav1d_free_aligned(c->tc);
}

for (unsigned n = 0; c->fc && n < c->n_fc; n++) {
Dav1dFrameContext *const f = &c->fc[n];

// clean-up threading stuff
if (c->n_fc > 1 && f->frame_thread.td.inited) {
pthread_mutex_lock(&f->frame_thread.td.lock);
f->frame_thread.die = 1;
pthread_cond_signal(&f->frame_thread.td.cond);
pthread_mutex_unlock(&f->frame_thread.td.lock);
pthread_join(f->frame_thread.td.thread, NULL);
if (c->n_fc > 1) {
freep(&f->tile_thread.lowest_pixel_mem);
freep(&f->frame_thread.b);
dav1d_freep_aligned(&f->frame_thread.pal_idx);
dav1d_freep_aligned(&f->frame_thread.cf);
freep(&f->frame_thread.tile_start_off);
dav1d_freep_aligned(&f->frame_thread.pal);
freep(&f->frame_thread.cbi);
pthread_mutex_destroy(&f->frame_thread.td.lock);
pthread_cond_destroy(&f->frame_thread.td.cond);
}
if (f->n_tc > 1 && f->tc && f->tile_thread.inited) {
pthread_mutex_lock(&f->tile_thread.lock);
for (int m = 0; m < f->n_tc; m++) {
Dav1dTileContext *const t = &f->tc[m];
t->tile_thread.die = 1;
// mark not created tile threads as available
if (!t->tile_thread.td.inited)
f->tile_thread.available |= 1ULL<<m;
}
pthread_cond_broadcast(&f->tile_thread.cond);
while (f->tile_thread.available != ~0ULL >> (64 - f->n_tc))
pthread_cond_wait(&f->tile_thread.icond,
&f->tile_thread.lock);
pthread_mutex_unlock(&f->tile_thread.lock);
for (int m = 0; m < f->n_tc; m++) {
Dav1dTileContext *const t = &f->tc[m];
if (f->n_tc > 1 && t->tile_thread.td.inited) {
pthread_join(t->tile_thread.td.thread, NULL);
pthread_mutex_destroy(&t->tile_thread.td.lock);
pthread_cond_destroy(&t->tile_thread.td.cond);
}
}
pthread_mutex_destroy(&f->tile_thread.lock);
pthread_cond_destroy(&f->tile_thread.cond);
pthread_cond_destroy(&f->tile_thread.icond);
freep(&f->tile_thread.task_idx_to_sby_and_tile_idx);
}
for (int m = 0; f->ts && m < f->n_ts; m++) {
Dav1dTileState *const ts = &f->ts[m];
pthread_cond_destroy(&ts->tile_thread.cond);
pthread_mutex_destroy(&ts->tile_thread.lock);
}
if (f->lf.thread.inited) {
freep(&f->lf.thread.tasks);
pthread_cond_destroy(&f->lf.thread.cond);
pthread_cond_destroy(&f->task_thread.cond);
}
freep(&f->task_thread.tasks);
freep(&f->task_thread.tile_tasks[0]);
dav1d_free_aligned(f->ts);
dav1d_free_aligned(f->tc);
dav1d_free_aligned(f->ipred_edge[0]);
free(f->a);
free(f->tile);

@ -652,6 +572,7 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
free(f->lf.lr_mask);
free(f->lf.level);
free(f->lf.tx_lpf_right_edge[0]);
free(f->lf.start_of_tile_row);
dav1d_refmvs_clear(&f->rf);
dav1d_free_aligned(f->lf.cdef_line_buf);
dav1d_free_aligned(f->lf.lr_lpf_line[0]);
@ -382,11 +382,11 @@ selfguided_filter(coef *dst, const pixel *src, const ptrdiff_t src_stride,

const unsigned p = imax(a * n - b * b, 0);
const unsigned z = (p * s + (1 << 19)) >> 20;
const unsigned x = dav1d_sgr_x_by_x[imin(z, 255)];
const unsigned x = dav1d_sgr_x_by_x[umin(z, 255)];

// This is where we invert A and B, so that B is of size coef.
AA[i] = (x * BB[i] * sgr_one_by_x + (1 << 11)) >> 12;
BB[i] = 256 - x;
BB[i] = x;
}
AA += step * REST_UNIT_STRIDE;
BB += step * REST_UNIT_STRIDE;

@ -403,7 +403,7 @@ selfguided_filter(coef *dst, const pixel *src, const ptrdiff_t src_stride,
for (int i = 0; i < w; i++) {
const int a = SIX_NEIGHBORS(B, i);
const int b = SIX_NEIGHBORS(A, i);
dst[i] = (a * src[i] + b + (1 << 8)) >> 9;
dst[i] = (b - a * src[i] + (1 << 8)) >> 9;
}
dst += 384 /* Maximum restoration width is 384 (256 * 1.5) */;
src += REST_UNIT_STRIDE;

@ -412,7 +412,7 @@ selfguided_filter(coef *dst, const pixel *src, const ptrdiff_t src_stride,
for (int i = 0; i < w; i++) {
const int a = B[i] * 6 + (B[i - 1] + B[i + 1]) * 5;
const int b = A[i] * 6 + (A[i - 1] + A[i + 1]) * 5;
dst[i] = (a * src[i] + b + (1 << 7)) >> 8;
dst[i] = (b - a * src[i] + (1 << 7)) >> 8;
}
dst += 384 /* Maximum restoration width is 384 (256 * 1.5) */;
src += REST_UNIT_STRIDE;

@ -423,7 +423,7 @@ selfguided_filter(coef *dst, const pixel *src, const ptrdiff_t src_stride,
for (int i = 0; i < w; i++) {
const int a = SIX_NEIGHBORS(B, i);
const int b = SIX_NEIGHBORS(A, i);
dst[i] = (a * src[i] + b + (1 << 8)) >> 9;
dst[i] = (b - a * src[i] + (1 << 8)) >> 9;
}
}
#undef SIX_NEIGHBORS

@ -436,7 +436,7 @@ selfguided_filter(coef *dst, const pixel *src, const ptrdiff_t src_stride,
for (int i = 0; i < w; i++) {
const int a = EIGHT_NEIGHBORS(B, i);
const int b = EIGHT_NEIGHBORS(A, i);
dst[i] = (a * src[i] + b + (1 << 8)) >> 9;
dst[i] = (b - a * src[i] + (1 << 8)) >> 9;
}
dst += 384;
src += REST_UNIT_STRIDE;

@ -468,9 +468,8 @@ static void sgr_5x5_c(pixel *p, const ptrdiff_t p_stride,
const int w0 = params->sgr.w0;
for (int j = 0; j < h; j++) {
for (int i = 0; i < w; i++) {
const int u = (p[i] << 4);
const int v = (u << 7) + w0 * (dst[j * 384 + i] - u);
p[i] = iclip_pixel((v + (1 << 10)) >> 11);
const int v = w0 * dst[j * 384 + i];
p[i] = iclip_pixel(p[i] + ((v + (1 << 10)) >> 11));
}
p += PXSTRIDE(p_stride);
}

@ -492,9 +491,8 @@ static void sgr_3x3_c(pixel *p, const ptrdiff_t p_stride,
const int w1 = params->sgr.w1;
for (int j = 0; j < h; j++) {
for (int i = 0; i < w; i++) {
const int u = (p[i] << 4);
const int v = (u << 7) + w1 * (dst[j * 384 + i] - u);
p[i] = iclip_pixel((v + (1 << 10)) >> 11);
const int v = w1 * dst[j * 384 + i];
p[i] = iclip_pixel(p[i] + ((v + (1 << 10)) >> 11));
}
p += PXSTRIDE(p_stride);
}

@ -520,10 +518,8 @@ static void sgr_mix_c(pixel *p, const ptrdiff_t p_stride,
const int w1 = params->sgr.w1;
for (int j = 0; j < h; j++) {
for (int i = 0; i < w; i++) {
const int u = (p[i] << 4);
const int v = (u << 7) + w0 * (dst0[j * 384 + i] - u) +
w1 * (dst1[j * 384 + i] - u);
p[i] = iclip_pixel((v + (1 << 10)) >> 11);
const int v = w0 * dst0[j * 384 + i] + w1 * dst1[j * 384 + i];
p[i] = iclip_pixel(p[i] + ((v + (1 << 10)) >> 11));
}
p += PXSTRIDE(p_stride);
}
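One reading of the selfguided_filter change: with BB[i] = x instead of 256 - x, dst now appears to hold a weighted difference (b - a*src) rather than an absolute filtered value, so the sgr_* blends reduce to adding a scaled residual to the source pixel. A stand-alone sketch of the new blend under that assumption (new_blend is an illustrative name, not a dav1d function, and the residual relation is approximate):

    // Old form blended an absolute filter output f against source p (Q4)
    // with weight w (Q7): out = ((p << 11) + w*(f - (p << 4)) + 1024) >> 11.
    // New form: dst already stores a residual r ~ f - (p << 4), so the
    // blend becomes a plain add of the weighted residual:
    static int new_blend(const int p, const int r, const int w) {
        return p + ((w * r + (1 << 10)) >> 11); // caller clips to pixel range
    }

Working in residuals keeps the intermediate dst values small enough for the coef type, which is presumably why A and B are inverted earlier in the function.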
@ -48,7 +48,7 @@ static void backup_lpf(const Dav1dFrameContext *const f,
const pixel *src, const ptrdiff_t src_stride,
const int ss_ver, const int sb128,
int row, const int row_h, const int src_w,
const int h, const int ss_hor, const int pft)
const int h, const int ss_hor)
{
const int dst_w = f->frame_hdr->super_res.enabled ?
(f->frame_hdr->width[1] + ss_hor) >> ss_hor : src_w;

@ -57,7 +57,7 @@ static void backup_lpf(const Dav1dFrameContext *const f,
int stripe_h = (64 - 8 * !row) >> ss_ver;
src += (stripe_h - 2) * PXSTRIDE(src_stride);

if (!pft) {
if (f->c->n_tc == 1) {
if (row) {
const int top = 4 << sb128;
// Copy the top part of the stored loop filtered pixels from the

@ -108,14 +108,15 @@ static void backup_lpf(const Dav1dFrameContext *const f,
void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
/*const*/ pixel *const src[3], const int sby)
{
const int pft = f->c->n_pfc > 1;
const int have_tt = f->c->n_tc > 1;
const int offset = 8 * !!sby;
const ptrdiff_t *const src_stride = f->cur.stride;
const ptrdiff_t lr_stride = ((f->sr_cur.p.p.w + 31) & ~31) * sizeof(pixel);
const ptrdiff_t tt_off = have_tt * sby * (4 << f->seq_hdr->sb128) * PXSTRIDE(lr_stride);
pixel *const dst[3] = {
f->lf.lr_lpf_line[0] + pft * sby * (4 << f->seq_hdr->sb128) * PXSTRIDE(lr_stride),
f->lf.lr_lpf_line[1] + pft * sby * (4 << f->seq_hdr->sb128) * PXSTRIDE(lr_stride),
f->lf.lr_lpf_line[2] + pft * sby * (4 << f->seq_hdr->sb128) * PXSTRIDE(lr_stride)
f->lf.lr_lpf_line[0] + tt_off,
f->lf.lr_lpf_line[1] + tt_off,
f->lf.lr_lpf_line[2] + tt_off
};

// TODO Also check block level restore type to reduce copying.

@ -128,7 +129,7 @@ void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset;
backup_lpf(f, dst[0], lr_stride,
src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0],
0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0, pft);
0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0);
}
if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) {
const int ss_ver = f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;

@ -141,12 +142,12 @@ void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
if (restore_planes & LR_RESTORE_U) {
backup_lpf(f, dst[1], lr_stride,
src[1] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor, pft);
ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor);
}
if (restore_planes & LR_RESTORE_V) {
backup_lpf(f, dst[2], lr_stride,
src[2] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor, pft);
ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor);
}
}
}

@ -162,7 +163,9 @@ static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
const ptrdiff_t p_stride = f->sr_cur.p.stride[chroma];
const ptrdiff_t lpf_stride = sizeof(pixel) * ((f->sr_cur.p.p.w + 31) & ~31);
const int sby = (y + (y ? 8 << ss_ver : 0)) >> (6 - ss_ver + f->seq_hdr->sb128);
const pixel *lpf = f->lf.lr_lpf_line[plane] + (f->c->n_pfc > 1) * (sby * (4 << f->seq_hdr->sb128) - 4) * PXSTRIDE(lpf_stride) + x;
const int have_tt = f->c->n_tc > 1;
const pixel *lpf = f->lf.lr_lpf_line[plane] +
have_tt * (sby * (4 << f->seq_hdr->sb128) - 4) * PXSTRIDE(lpf_stride) + x;

// The first stripe of the frame is shorter by 8 luma pixel rows.
int stripe_h = imin((64 - 8 * !y) >> ss_ver, row_h - y);
@ -92,6 +92,7 @@ if is_asm_enabled

libdav1d_sources += files(
'arm/cpu.c',
'arm/refmvs_init.c',
)
libdav1d_tmpl_sources += files(
'arm/cdef_init_tmpl.c',

@ -109,6 +110,7 @@ if is_asm_enabled
'arm/64/itx.S',
'arm/64/looprestoration_common.S',
'arm/64/msac.S',
'arm/64/refmvs.S',
)

if dav1d_bitdepths.contains('8')

@ -139,6 +141,7 @@ if is_asm_enabled
'arm/32/itx.S',
'arm/32/looprestoration_common.S',
'arm/32/msac.S',
'arm/32/refmvs.S',
)

if dav1d_bitdepths.contains('8')

@ -175,6 +178,7 @@ if is_asm_enabled
libdav1d_sources += files(
'x86/cpu.c',
'x86/msac_init.c',
'x86/refmvs_init.c',
)

libdav1d_tmpl_sources += files(

@ -191,10 +195,12 @@ if is_asm_enabled
libdav1d_sources_asm = files(
'x86/cpuid.asm',
'x86/msac.asm',
'x86/refmvs.asm',
'x86/cdef_avx2.asm',
'x86/itx_avx2.asm',
'x86/looprestoration_avx2.asm',
'x86/cdef_sse.asm',
'x86/itx_sse.asm',
)

if dav1d_bitdepths.contains('8')

@ -207,7 +213,6 @@ if is_asm_enabled
'x86/loopfilter_avx2.asm',
'x86/film_grain_sse.asm',
'x86/ipred_sse.asm',
'x86/itx_sse.asm',
'x86/loopfilter_sse.asm',
'x86/looprestoration_sse.asm',
'x86/mc_sse.asm',

@ -224,6 +229,8 @@ if is_asm_enabled
'x86/looprestoration16_avx2.asm',
'x86/mc16_avx2.asm',
'x86/cdef16_sse.asm',
'x86/film_grain16_sse.asm',
'x86/ipred16_sse.asm',
'x86/itx16_sse.asm',
'x86/loopfilter16_sse.asm',
'x86/looprestoration16_sse.asm',

@ -280,11 +287,11 @@ libdav1d_entrypoints_objs = static_library('dav1d_entrypoint',
rev_target, config_h_target,

include_directories : dav1d_inc_dirs,
dependencies: [stdatomic_dependency],
dependencies: [stdatomic_dependencies],
c_args : [stackalign_flag, stackrealign_flag, api_export_flags],
install : false,
build_by_default : false,
).extract_all_objects()
).extract_all_objects(recursive: true)

# Helper library for each bitdepth
libdav1d_bitdepth_objs = []

@ -293,11 +300,11 @@ foreach bitdepth : dav1d_bitdepths
'dav1d_bitdepth_@0@'.format(bitdepth),
libdav1d_tmpl_sources, config_h_target,
include_directories: dav1d_inc_dirs,
dependencies : [stdatomic_dependency],
dependencies : [stdatomic_dependencies],
c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + stackalign_flag,
install : false,
build_by_default : false,
).extract_all_objects()
).extract_all_objects(recursive: true)
endforeach

# Helper library for each bitdepth and architecture-specific flags

@ -306,11 +313,11 @@ foreach bitdepth : dav1d_bitdepths
'dav1d_arch_bitdepth_@0@'.format(bitdepth),
libdav1d_arch_tmpl_sources, config_h_target,
include_directories: dav1d_inc_dirs,
dependencies : [stdatomic_dependency],
dependencies : [stdatomic_dependencies],
c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + stackalign_flag + arch_flags,
install : false,
build_by_default : false,
).extract_all_objects()
).extract_all_objects(recursive: true)
endforeach

# The final dav1d library

@ -332,7 +339,7 @@ libdav1d = library('dav1d',

include_directories : dav1d_inc_dirs,
dependencies : [
stdatomic_dependency,
stdatomic_dependencies,
thread_dependency,
thread_compat_dep,
libdl_dependency,
@ -1547,18 +1547,26 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
dav1d_data_props_copy(&c->out.m, &in->m);
c->event_flags |= dav1d_picture_get_event_flags(&c->refs[c->frame_hdr->existing_frame_idx].p);
} else {
pthread_mutex_lock(&c->task_thread.lock);
// need to append this to the frame output queue
const unsigned next = c->frame_thread.next++;
if (c->frame_thread.next == c->n_fc)
c->frame_thread.next = 0;

Dav1dFrameContext *const f = &c->fc[next];
pthread_mutex_lock(&f->frame_thread.td.lock);
while (f->n_tile_data > 0)
pthread_cond_wait(&f->frame_thread.td.cond,
&f->frame_thread.td.lock);
pthread_cond_wait(&f->task_thread.cond,
&f->task_thread.ttd->lock);
Dav1dThreadPicture *const out_delayed =
&c->frame_thread.out_delayed[next];
if (out_delayed->p.data[0] || atomic_load(&f->task_thread.error)) {
if (atomic_load(&c->task_thread.first) + 1U < c->n_fc)
atomic_fetch_add(&c->task_thread.first, 1U);
else
atomic_store(&c->task_thread.first, 0);
if (c->task_thread.cur < c->n_fc)
c->task_thread.cur--;
}
if (out_delayed->p.data[0]) {
const unsigned progress = atomic_load_explicit(&out_delayed->progress[1],
memory_order_relaxed);

@ -1572,7 +1580,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
&c->refs[c->frame_hdr->existing_frame_idx].p);
out_delayed->visible = 1;
dav1d_data_props_copy(&out_delayed->p.m, &in->m);
pthread_mutex_unlock(&f->frame_thread.td.lock);
pthread_mutex_unlock(&c->task_thread.lock);
}
if (c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr->frame_type == DAV1D_FRAME_TYPE_KEY) {
const int r = c->frame_hdr->existing_frame_idx;
@ -176,7 +176,7 @@ int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f
const int bpc)
{
Dav1dThreadPicture *const p = &f->sr_cur;
p->t = c->n_fc > 1 ? &f->frame_thread.td : NULL;
const int have_frame_mt = c->n_fc > 1;

const int res =
picture_alloc_with_edges(c, &p->p, f->frame_hdr->width[1], f->frame_hdr->height,

@ -186,7 +186,7 @@ int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f
c->mastering_display, c->mastering_display_ref,
c->itut_t35, c->itut_t35_ref,
bpc, &f->tile[0].data.m, &c->allocator,
p->t != NULL ? sizeof(atomic_int) * 2 : 0,
have_frame_mt ? sizeof(atomic_int) * 2 : 0,
(void **) &p->progress);
if (res) return res;

@ -198,7 +198,7 @@ int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f
c->frame_flags = 0;

p->visible = f->frame_hdr->show_frame;
if (p->t) {
if (have_frame_mt) {
atomic_init(&p->progress[0], 0);
atomic_init(&p->progress[1], 0);
}

@ -254,7 +254,6 @@ void dav1d_thread_picture_ref(Dav1dThreadPicture *const dst,
const Dav1dThreadPicture *const src)
{
dav1d_picture_ref(&dst->p, &src->p);
dst->t = src->t;
dst->visible = src->visible;
dst->progress = src->progress;
dst->flags = src->flags;

@ -279,54 +278,9 @@ void dav1d_picture_unref_internal(Dav1dPicture *const p) {
void dav1d_thread_picture_unref(Dav1dThreadPicture *const p) {
dav1d_picture_unref_internal(&p->p);

p->t = NULL;
p->progress = NULL;
}

int dav1d_thread_picture_wait(const Dav1dThreadPicture *const p,
int y_unclipped, const enum PlaneType plane_type)
{
assert(plane_type != PLANE_TYPE_ALL);

if (!p->t)
return 0;

// convert to luma units; include plane delay from loopfilters; clip
const int ss_ver = p->p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
y_unclipped *= 1 << (plane_type & ss_ver); // we rely here on PLANE_TYPE_UV being 1
y_unclipped += (plane_type != PLANE_TYPE_BLOCK) * 8; // delay imposed by loopfilter
const unsigned y = iclip(y_unclipped, 1, p->p.p.h);
atomic_uint *const progress = &p->progress[plane_type != PLANE_TYPE_BLOCK];
unsigned state;

if ((state = atomic_load_explicit(progress, memory_order_acquire)) >= y)
return state == FRAME_ERROR;

pthread_mutex_lock(&p->t->lock);
while ((state = atomic_load_explicit(progress, memory_order_relaxed)) < y)
pthread_cond_wait(&p->t->cond, &p->t->lock);
pthread_mutex_unlock(&p->t->lock);
return state == FRAME_ERROR;
}

void dav1d_thread_picture_signal(const Dav1dThreadPicture *const p,
const int y, // in pixel units
const enum PlaneType plane_type)
{
assert(plane_type != PLANE_TYPE_UV);

if (!p->t)
return;

pthread_mutex_lock(&p->t->lock);
if (plane_type != PLANE_TYPE_Y)
atomic_store(&p->progress[0], y);
if (plane_type != PLANE_TYPE_BLOCK)
atomic_store(&p->progress[1], y);
pthread_cond_broadcast(&p->t->cond);
pthread_mutex_unlock(&p->t->lock);
}

enum Dav1dEventFlags dav1d_picture_get_event_flags(const Dav1dThreadPicture *const p) {
if (!p->flags)
return 0;
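With dav1d_thread_picture_wait/_signal removed, the per-picture mutex/cond pair goes away and cross-frame synchronization moves into the task scheduler; the two progress[] atomics ([0] block data, [1] pixel data) remain the shared state. A minimal sketch of the lock-free publish/consume idiom, under the assumption that the scheduler requeues a task rather than blocking (these helpers are illustrative, not dav1d functions):

    #include <stdatomic.h>

    // Producer: publish that rows [0, y) of this plane type are final.
    static void publish_progress(atomic_uint *const progress, const unsigned y) {
        atomic_store_explicit(progress, y, memory_order_release);
    }

    // Consumer: poll whether row y is ready; a task scheduler would
    // requeue the dependent task instead of blocking on a condvar.
    static int row_ready(atomic_uint *const progress, const unsigned y) {
        return atomic_load_explicit(progress, memory_order_acquire) >= y;
    }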
@ -52,7 +52,6 @@ typedef struct Dav1dThreadPicture {
Dav1dPicture p;
int visible;
enum PictureFlags flags;
struct thread_data *t;
// [0] block data (including segmentation map and motion vectors)
// [1] pixel data
atomic_uint *progress;

@ -91,31 +90,6 @@ void dav1d_thread_picture_unref(Dav1dThreadPicture *p);
*/
void dav1d_picture_move_ref(Dav1dPicture *dst, Dav1dPicture *src);

/**
* Wait for picture to reach a certain stage.
*
* y is in full-pixel units. If pt is not UV, this is in luma
* units, else it is in chroma units.
* plane_type is used to determine how many pixels delay are
* introduced by loopfilter processes.
*
* Returns 0 on success, and 1 if there was an error while decoding p
*/
int dav1d_thread_picture_wait(const Dav1dThreadPicture *p, int y,
enum PlaneType plane_type);

/**
* Signal decoding progress.
*
* y is in full-pixel luma units. FRAME_ERROR is used to signal a decoding
* error to frames using this frame as reference frame.
* plane_type denotes whether we have completed block data (pass 1;
* PLANE_TYPE_BLOCK), pixel data (pass 2, PLANE_TYPE_Y) or both (no
* 2-pass decoding; PLANE_TYPE_ALL).
*/
void dav1d_thread_picture_signal(const Dav1dThreadPicture *p, int y,
enum PlaneType plane_type);

int dav1d_default_picture_alloc(Dav1dPicture *p, void *cookie);
void dav1d_default_picture_release(Dav1dPicture *p, void *cookie);
void dav1d_picture_unref_internal(Dav1dPicture *p);
@ -37,12 +37,12 @@
#define DEBUG_B_PIXELS 0

#define decl_recon_b_intra_fn(name) \
void (name)(Dav1dTileContext *t, enum BlockSize bs, \
void (name)(Dav1dTaskContext *t, enum BlockSize bs, \
enum EdgeFlags intra_edge_flags, const Av1Block *b)
typedef decl_recon_b_intra_fn(*recon_b_intra_fn);

#define decl_recon_b_inter_fn(name) \
int (name)(Dav1dTileContext *t, enum BlockSize bs, const Av1Block *b)
int (name)(Dav1dTaskContext *t, enum BlockSize bs, const Av1Block *b)
typedef decl_recon_b_inter_fn(*recon_b_inter_fn);

#define decl_filter_sbrow_fn(name) \

@ -50,11 +50,11 @@ void (name)(Dav1dFrameContext *f, int sby)
typedef decl_filter_sbrow_fn(*filter_sbrow_fn);

#define decl_backup_ipred_edge_fn(name) \
void (name)(Dav1dTileContext *t)
void (name)(Dav1dTaskContext *t)
typedef decl_backup_ipred_edge_fn(*backup_ipred_edge_fn);

#define decl_read_coef_blocks_fn(name) \
void (name)(Dav1dTileContext *t, enum BlockSize bs, const Av1Block *b)
void (name)(Dav1dTaskContext *t, enum BlockSize bs, const Av1Block *b)
typedef decl_read_coef_blocks_fn(*read_coef_blocks_fn);

decl_recon_b_intra_fn(dav1d_recon_b_intra_8bpc);

@ -65,8 +65,10 @@ decl_recon_b_inter_fn(dav1d_recon_b_inter_16bpc);

decl_filter_sbrow_fn(dav1d_filter_sbrow_8bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_16bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_8bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_16bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_cols_8bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_cols_16bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_rows_8bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_rows_16bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_cdef_8bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_cdef_16bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_resize_8bpc);
@ -318,7 +318,7 @@ static inline unsigned get_lo_ctx(const uint8_t *const levels,
|
|||
return offset + (mag > 512 ? 4 : (mag + 64) >> 7);
|
||||
}
|
||||
|
||||
static int decode_coefs(Dav1dTileContext *const t,
|
||||
static int decode_coefs(Dav1dTaskContext *const t,
|
||||
uint8_t *const a, uint8_t *const l,
|
||||
const enum RectTxfmSize tx, const enum BlockSize bs,
|
||||
const Av1Block *const b, const int intra,
|
||||
|
@ -719,7 +719,7 @@ static int decode_coefs(Dav1dTileContext *const t,
|
|||
return eob;
|
||||
}
|
||||
|
||||
static void read_coef_tree(Dav1dTileContext *const t,
|
||||
static void read_coef_tree(Dav1dTaskContext *const t,
|
||||
const enum BlockSize bs, const Av1Block *const b,
|
||||
const enum RectTxfmSize ytx, const int depth,
|
||||
const uint16_t *const tx_split,
|
||||
|
@ -768,15 +768,16 @@ static void read_coef_tree(Dav1dTileContext *const t,
|
|||
coef *cf;
|
||||
struct CodedBlockInfo *cbi;
|
||||
|
||||
if (f->frame_thread.pass) {
|
||||
assert(ts->frame_thread.cf);
|
||||
cf = ts->frame_thread.cf;
|
||||
ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
|
||||
if (t->frame_thread.pass) {
|
||||
const int p = t->frame_thread.pass & 1;
|
||||
assert(ts->frame_thread[p].cf);
|
||||
cf = ts->frame_thread[p].cf;
|
||||
ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
|
||||
cbi = &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
|
||||
} else {
|
||||
cf = bitfn(t->cf);
|
||||
}
|
||||
if (f->frame_thread.pass != 2) {
|
||||
if (t->frame_thread.pass != 2) {
|
||||
eob = decode_coefs(t, &t->a->lcoef[bx4], &t->l.lcoef[by4],
|
||||
ytx, bs, b, 0, 0, cf, &txtp, &cf_ctx);
|
||||
if (DEBUG_BLOCK_INFO)
|
||||
|
@ -798,7 +799,7 @@ static void read_coef_tree(Dav1dTileContext *const t,
|
|||
uint8_t *txtp_map = &t->txtp_map[by4 * 32 + bx4];
|
||||
case_set_upto16(txw,,,);
|
||||
#undef set_ctx
|
||||
if (f->frame_thread.pass == 1) {
|
||||
if (t->frame_thread.pass == 1) {
|
||||
cbi->eob[0] = eob;
|
||||
cbi->txtp[0] = txtp;
|
||||
}
|
||||
|
@ -806,7 +807,7 @@ static void read_coef_tree(Dav1dTileContext *const t,
|
|||
eob = cbi->eob[0];
|
||||
txtp = cbi->txtp[0];
|
||||
}
|
||||
if (!(f->frame_thread.pass & 1)) {
|
||||
if (!(t->frame_thread.pass & 1)) {
|
||||
assert(dst);
|
||||
if (eob >= 0) {
|
||||
if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
|
||||
|
@ -820,7 +821,7 @@ static void read_coef_tree(Dav1dTileContext *const t,
|
|||
}
|
||||
}
|
||||
|
||||
void bytefn(dav1d_read_coef_blocks)(Dav1dTileContext *const t,
|
||||
void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
|
||||
const enum BlockSize bs, const Av1Block *const b)
|
||||
{
|
||||
const Dav1dFrameContext *const f = t->f;
|
||||
|
@ -855,7 +856,7 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTileContext *const t,
|
|||
Dav1dTileState *const ts = t->ts;
|
||||
const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
|
||||
const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
|
||||
assert(f->frame_thread.pass == 1);
|
||||
assert(t->frame_thread.pass == 1);
|
||||
assert(!b->skip);
|
||||
const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
|
||||
const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->intra ? b->tx : b->max_ytx];
|
||||
|
@ -884,12 +885,12 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTileContext *const t,
|
|||
const int eob = cbi[t->bx].eob[0] =
|
||||
decode_coefs(t, &t->a->lcoef[bx4 + x],
|
||||
&t->l.lcoef[by4 + y], b->tx, bs, b, 1,
|
||||
0, ts->frame_thread.cf, &txtp, &cf_ctx);
|
||||
0, ts->frame_thread[1].cf, &txtp, &cf_ctx);
|
||||
if (DEBUG_BLOCK_INFO)
|
||||
printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
|
||||
b->tx, txtp, eob, ts->msac.rng);
|
||||
cbi[t->bx].txtp[0] = txtp;
|
||||
ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
|
||||
ts->frame_thread[1].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
|
||||
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
|
||||
rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
|
||||
#define default_memset(dir, diridx, off, sz) \
|
||||
|
@ -927,14 +928,14 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTileContext *const t,
|
|||
const int eob = cbi[t->bx].eob[1 + pl] =
|
||||
decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
|
||||
&t->l.ccoef[pl][cby4 + y], b->uvtx, bs,
|
||||
b, b->intra, 1 + pl, ts->frame_thread.cf,
|
||||
b, b->intra, 1 + pl, ts->frame_thread[1].cf,
|
||||
&txtp, &cf_ctx);
if (DEBUG_BLOCK_INFO)
printf("Post-uv-cf-blk[pl=%d,tx=%d,"
"txtp=%d,eob=%d]: r=%d\n",
pl, b->uvtx, txtp, eob, ts->msac.rng);
cbi[t->bx].txtp[1 + pl] = txtp;
ts->frame_thread.cf += uv_t_dim->w * uv_t_dim->h * 16;
ts->frame_thread[1].cf += uv_t_dim->w * uv_t_dim->h * 16;
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
#define default_memset(dir, diridx, off, sz) \

@ -956,7 +957,7 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTileContext *const t,
}
}

static int mc(Dav1dTileContext *const t,
static int mc(Dav1dTaskContext *const t,
pixel *const dst8, int16_t *const dst16, const ptrdiff_t dst_stride,
const int bw4, const int bh4,
const int bx, const int by, const int pl,

@ -979,11 +980,6 @@ static int mc(Dav1dTileContext *const t,
int w, h;

if (refp->p.data[0] != f->cur.data[0]) { // i.e. not for intrabc
if (dav1d_thread_picture_wait(refp, dy + bh4 * v_mul + !!my * 4,
PLANE_TYPE_Y + !!pl))
{
return -1;
}
w = (f->cur.p.w + ss_hor) >> ss_hor;
h = (f->cur.p.h + ss_ver) >> ss_ver;
} else {

@ -1034,8 +1030,6 @@ static int mc(Dav1dTileContext *const t,
const int bottom =
((pos_y + (bh4 * v_mul - 1) * f->svc[refidx][1].step) >> 10) + 1;

if (dav1d_thread_picture_wait(refp, bottom + 4, PLANE_TYPE_Y + !!pl))
return -1;
if (DEBUG_BLOCK_INFO)
printf("Off %dx%d [%d,%d,%d], size %dx%d [%d,%d]\n",
left, top, orig_pos_x, f->svc[refidx][0].scale, refidx,

@ -1077,7 +1071,7 @@ static int mc(Dav1dTileContext *const t,
return 0;
}

static int obmc(Dav1dTileContext *const t,
static int obmc(Dav1dTaskContext *const t,
pixel *const dst, const ptrdiff_t dst_stride,
const uint8_t *const b_dim, const int pl,
const int bx4, const int by4, const int w4, const int h4)

@ -1138,7 +1132,7 @@ static int obmc(Dav1dTileContext *const t,
return 0;
}

static int warp_affine(Dav1dTileContext *const t,
static int warp_affine(Dav1dTaskContext *const t,
pixel *dst8, int16_t *dst16, const ptrdiff_t dstride,
const uint8_t *const b_dim, const int pl,
const Dav1dThreadPicture *const refp,

@ -1176,11 +1170,6 @@ static int warp_affine(Dav1dTileContext *const t,
const pixel *ref_ptr;
ptrdiff_t ref_stride = refp->p.stride[!!pl];

if (dav1d_thread_picture_wait(refp, dy + 4 + 8,
PLANE_TYPE_Y + !!pl))
{
return -1;
}
if (dx < 3 || dx + 8 + 4 > width || dy < 3 || dy + 8 + 4 > height) {
pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
f->dsp->mc.emu_edge(15, 15, width, height, dx - 3, dy - 3,

@ -1204,7 +1193,7 @@ static int warp_affine(Dav1dTileContext *const t,
return 0;
}

void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize bs,
void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize bs,
const enum EdgeFlags intra_edge_flags,
const Av1Block *const b)
{

@ -1239,14 +1228,15 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
pixel *dst = ((pixel *) f->cur.data[0]) +
4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
const uint8_t *pal_idx;
if (f->frame_thread.pass) {
assert(ts->frame_thread.pal_idx);
pal_idx = ts->frame_thread.pal_idx;
ts->frame_thread.pal_idx += bw4 * bh4 * 16;
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1;
assert(ts->frame_thread[p].pal_idx);
pal_idx = ts->frame_thread[p].pal_idx;
ts->frame_thread[p].pal_idx += bw4 * bh4 * 16;
} else {
pal_idx = t->scratch.pal_idx;
}
const uint16_t *const pal = f->frame_thread.pass ?
const uint16_t *const pal = t->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
((t->bx >> 1) + (t->by & 1))][0] : t->scratch.pal[0];
f->dsp->ipred.pal_pred(dst, f->cur.stride[0], pal,
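The hunks above all apply the same mechanical change: per-pass decoding state that used to hang off the frame context is now selected through the task context's own pass number, with the two passes (1 = entropy, 2 = reconstruction) sharing a two-element state array indexed by `pass & 1`. A minimal standalone sketch of the idiom, with hypothetical type and helper names that are not part of the patch:

#include <stdint.h>

/* Per-pass cursor state, mirroring ts->frame_thread[p] above. Pass 1 maps
 * to slot 1 and pass 2 to slot 0 via (pass & 1); pass 0 means single-pass
 * decoding, which uses per-task scratch instead. */
struct pass_state { const uint8_t *pal_idx; };

static const uint8_t *next_pal_idx(struct pass_state ft[2], int pass,
                                   const uint8_t *scratch, int n)
{
    if (!pass) return scratch;              /* single-pass: task scratch */
    struct pass_state *const s = &ft[pass & 1];
    const uint8_t *const ret = s->pal_idx;
    s->pal_idx += n;                        /* bump the per-pass cursor */
    return ret;
}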
@ -1323,9 +1313,10 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
coef *cf;
int eob;
enum TxfmType txtp;
if (f->frame_thread.pass) {
cf = ts->frame_thread.cf;
ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1;
cf = ts->frame_thread[p].cf;
ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
const struct CodedBlockInfo *const cbi =
&f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
eob = cbi->eob[0];

@ -1362,7 +1353,7 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
hex_dump(dst, f->cur.stride[0],
t_dim->w * 4, t_dim->h * 4, "recon");
}
} else if (!f->frame_thread.pass) {
} else if (!t->frame_thread.pass) {
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir lcoef, off, mul * 0x40)
case_set_upto16(t_dim->h, l., 1, by4 + y);

@ -1435,12 +1426,13 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
(t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
const uint16_t (*pal)[8];
const uint8_t *pal_idx;
if (f->frame_thread.pass) {
assert(ts->frame_thread.pal_idx);
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1;
assert(ts->frame_thread[p].pal_idx);
pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
((t->bx >> 1) + (t->by & 1))];
pal_idx = ts->frame_thread.pal_idx;
ts->frame_thread.pal_idx += cbw4 * cbh4 * 16;
pal_idx = ts->frame_thread[p].pal_idx;
ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 16;
} else {
pal = t->scratch.pal;
pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16];

@ -1545,9 +1537,10 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
enum TxfmType txtp;
int eob;
coef *cf;
if (f->frame_thread.pass) {
cf = ts->frame_thread.cf;
ts->frame_thread.cf += uv_t_dim->w * uv_t_dim->h * 16;
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1;
cf = ts->frame_thread[p].cf;
ts->frame_thread[p].cf += uv_t_dim->w * uv_t_dim->h * 16;
const struct CodedBlockInfo *const cbi =
&f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
eob = cbi->eob[pl + 1];

@ -1587,7 +1580,7 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
hex_dump(dst, stride, uv_t_dim->w * 4,
uv_t_dim->h * 4, "recon");
}
} else if (!f->frame_thread.pass) {
} else if (!t->frame_thread.pass) {
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir ccoef[pl], off, mul * 0x40)
case_set_upto16(uv_t_dim->h, l., 1, cby4 + y);

@ -1604,7 +1597,7 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
}
}

int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize bs,
int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize bs,
const Av1Block *const b)
{
Dav1dTileState *const ts = t->ts;

@ -1719,7 +1712,7 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
r[-1][t->bx - 1].mv.mv[0],
&f->refp[r[-1][t->bx - 1].ref.ref[0] - 1],
r[-1][t->bx - 1].ref.ref[0] - 1,
f->frame_thread.pass != 2 ? t->tl_4x4_filter :
t->frame_thread.pass != 2 ? t->tl_4x4_filter :
f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d);
if (res) return res;
}

@ -1735,7 +1728,7 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
t->by, 1 + pl, r[0][t->bx - 1].mv.mv[0],
&f->refp[r[0][t->bx - 1].ref.ref[0] - 1],
r[0][t->bx - 1].ref.ref[0] - 1,
f->frame_thread.pass != 2 ? left_filter_2d :
t->frame_thread.pass != 2 ? left_filter_2d :
f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d);
if (res) return res;
}

@ -1750,7 +1743,7 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
1 + pl, r[-1][t->bx].mv.mv[0],
&f->refp[r[-1][t->bx].ref.ref[0] - 1],
r[-1][t->bx].ref.ref[0] - 1,
f->frame_thread.pass != 2 ? top_filter_2d :
t->frame_thread.pass != 2 ? top_filter_2d :
f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d);
if (res) return res;
}

@ -1994,9 +1987,10 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
coef *cf;
int eob;
enum TxfmType txtp;
if (f->frame_thread.pass) {
cf = ts->frame_thread.cf;
ts->frame_thread.cf += uvtx->w * uvtx->h * 16;
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1;
cf = ts->frame_thread[p].cf;
ts->frame_thread[p].cf += uvtx->w * uvtx->h * 16;
const struct CodedBlockInfo *const cbi =
&f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
eob = cbi->eob[1 + pl];

@ -2051,7 +2045,21 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
return 0;
}

void bytefn(dav1d_filter_sbrow_deblock)(Dav1dFrameContext*const f, const int sby) {
void bytefn(dav1d_filter_sbrow_deblock_cols)(Dav1dFrameContext*const f, const int sby) {
const int y = sby * f->sb_step * 4;
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
pixel *const p[3] = {
f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
};
Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
if (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1])
bytefn(dav1d_loopfilter_sbrow_cols)(f, p, mask, sby,
f->lf.start_of_tile_row[sby]);
}

void bytefn(dav1d_filter_sbrow_deblock_rows)(Dav1dFrameContext*const f, const int sby) {
const int y = sby * f->sb_step * 4;
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
pixel *const p[3] = {

@ -2061,10 +2069,7 @@ void bytefn(dav1d_filter_sbrow_deblock)(Dav1dFrameContext*const f, const int sby
};
Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
if (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1]) {
int start_of_tile_row = 0;
if (f->frame_hdr->tiling.row_start_sb[f->lf.tile_row] == sby)
start_of_tile_row = f->lf.tile_row++;
bytefn(dav1d_loopfilter_sbrow)(f, p, mask, sby, start_of_tile_row);
bytefn(dav1d_loopfilter_sbrow_rows)(f, p, mask, sby);
}
if (f->lf.restore_planes) {
// Store loop filtered pixels required by loop restoration

@ -2145,7 +2150,8 @@ void bytefn(dav1d_filter_sbrow_lr)(Dav1dFrameContext *const f, const int sby) {
}

void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
bytefn(dav1d_filter_sbrow_deblock)(f, sby);
bytefn(dav1d_filter_sbrow_deblock_cols)(f, sby);
bytefn(dav1d_filter_sbrow_deblock_rows)(f, sby);
if (f->seq_hdr->cdef)
bytefn(dav1d_filter_sbrow_cdef)(f, sby);
if (f->frame_hdr->width[0] != f->frame_hdr->width[1])

@ -2154,7 +2160,7 @@ void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
bytefn(dav1d_filter_sbrow_lr)(f, sby);
}

void bytefn(dav1d_backup_ipred_edge)(Dav1dTileContext *const t) {
void bytefn(dav1d_backup_ipred_edge)(Dav1dTaskContext *const t) {
const Dav1dFrameContext *const f = t->f;
Dav1dTileState *const ts = t->ts;
const int sby = t->by >> f->sb_shift;
@ -35,6 +35,7 @@
#include "common/intops.h"

#include "src/env.h"
#include "src/mem.h"
#include "src/refmvs.h"

static void add_spatial_candidate(refmvs_candidate *const mvstack, int *const cnt,

@ -652,11 +653,14 @@ void dav1d_refmvs_find(const refmvs_tile *const rt,
void dav1d_refmvs_tile_sbrow_init(refmvs_tile *const rt, const refmvs_frame *const rf,
const int tile_col_start4, const int tile_col_end4,
const int tile_row_start4, const int tile_row_end4,
const int sby, int tile_row_idx)
const int sby, int tile_row_idx, const int pass)
{
if (rf->n_tile_threads == 1) tile_row_idx = 0;
rt->rp_proj = &rf->rp_proj[16 * rf->rp_stride * tile_row_idx];
refmvs_block *r = &rf->r[35 * rf->r_stride * tile_row_idx];
const int uses_2pass = rf->n_tile_threads > 1 && rf->n_frame_threads > 1;
const ptrdiff_t pass_off = (uses_2pass && pass == 2) ?
35 * rf->r_stride * rf->n_tile_rows : 0;
refmvs_block *r = &rf->r[35 * rf->r_stride * tile_row_idx + pass_off];
const int sbsz = rf->sbsz;
const int off = (sbsz * sby) & 16;
for (int i = 0; i < sbsz; i++, r += rf->r_stride)
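For clarity, the buffer math from the hunk above pulled out into a hypothetical standalone helper (not part of the patch): when both tile threading and frame threading are active, pass 2 addresses a second copy of the per-tile-row `refmvs_block` rows, placed immediately after the first copy in the same allocation.

#include <stddef.h>

/* Mirrors the allocation of 35 * r_stride * n_tile_rows * (1 + uses_2pass)
 * refmvs_block entries: pass 2 starts at the second half so the entropy and
 * reconstruction passes never write the same rows. */
static ptrdiff_t refmvs_pass_offset(int n_tile_threads, int n_frame_threads,
                                    int pass, ptrdiff_t r_stride,
                                    int n_tile_rows)
{
    const int uses_2pass = n_tile_threads > 1 && n_frame_threads > 1;
    return (uses_2pass && pass == 2) ? 35 * r_stride * n_tile_rows : 0;
}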
@ -805,7 +809,7 @@ int dav1d_refmvs_init_frame(refmvs_frame *const rf,
refmvs_temporal_block *const rp,
const unsigned ref_ref_poc[7][7],
/*const*/ refmvs_temporal_block *const rp_ref[7],
const int n_tile_threads)
const int n_tile_threads, const int n_frame_threads)
{
rf->sbsz = 16 << seq_hdr->sb128;
rf->frm_hdr = frm_hdr;

@ -817,21 +821,23 @@ int dav1d_refmvs_init_frame(refmvs_frame *const rf,
const ptrdiff_t r_stride = ((frm_hdr->width[0] + 127) & ~127) >> 2;
const int n_tile_rows = n_tile_threads > 1 ? frm_hdr->tiling.rows : 1;
if (r_stride != rf->r_stride || n_tile_rows != rf->n_tile_rows) {
if (rf->r) free(rf->r);
rf->r = malloc(sizeof(*rf->r) * 35 * r_stride * n_tile_rows);
if (rf->r) dav1d_freep_aligned(&rf->r);
const int uses_2pass = n_tile_threads > 1 && n_frame_threads > 1;
rf->r = dav1d_alloc_aligned(sizeof(*rf->r) * 35 * r_stride * n_tile_rows * (1 + uses_2pass), 64);
if (!rf->r) return DAV1D_ERR(ENOMEM);
rf->r_stride = r_stride;
}

const ptrdiff_t rp_stride = r_stride >> 1;
if (rp_stride != rf->rp_stride || n_tile_rows != rf->n_tile_rows) {
if (rf->rp_proj) free(rf->rp_proj);
rf->rp_proj = malloc(sizeof(*rf->rp_proj) * 16 * rp_stride * n_tile_rows);
if (rf->rp_proj) dav1d_freep_aligned(&rf->rp_proj);
rf->rp_proj = dav1d_alloc_aligned(sizeof(*rf->rp_proj) * 16 * rp_stride * n_tile_rows, 64);
if (!rf->rp_proj) return DAV1D_ERR(ENOMEM);
rf->rp_stride = rp_stride;
}
rf->n_tile_rows = n_tile_rows;
rf->n_tile_threads = n_tile_threads;
rf->n_frame_threads = n_frame_threads;
rf->rp = rp;
rf->rp_ref = rp_ref;
const unsigned poc = frm_hdr->frame_offset;

@ -902,6 +908,29 @@ void dav1d_refmvs_init(refmvs_frame *const rf) {
}

void dav1d_refmvs_clear(refmvs_frame *const rf) {
if (rf->r) free(rf->r);
if (rf->rp_proj) free(rf->rp_proj);
if (rf->r) dav1d_freep_aligned(&rf->r);
if (rf->rp_proj) dav1d_freep_aligned(&rf->rp_proj);
}

static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv,
const int bx4, const int bw4, int bh4)
{
do {
refmvs_block *const r = *rr++ + bx4;
for (int x = 0; x < bw4; x++)
r[x] = *rmv;
} while (--bh4);
}

COLD void dav1d_refmvs_dsp_init(Dav1dRefmvsDSPContext *const c)
{
c->splat_mv = splat_mv_c;

#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
dav1d_refmvs_dsp_init_arm(c);
#elif ARCH_X86
dav1d_refmvs_dsp_init_x86(c);
#endif
#endif
}
@ -35,7 +35,6 @@
#include "common/intops.h"

#include "src/intra_edge.h"
#include "src/levels.h"
#include "src/tables.h"

#define INVALID_MV 0x80008000

@ -55,11 +54,11 @@ typedef union refmvs_mvpair {
uint64_t n;
} refmvs_mvpair;

typedef struct refmvs_block {
PACKED(typedef struct refmvs_block {
refmvs_mvpair mv;
refmvs_refpair ref;
uint8_t bs, mf; // 1 = globalmv+affine, 2 = newmv
} refmvs_block;
}) ALIGN(refmvs_block, 4);

typedef struct refmvs_frame {
const Dav1dFrameHeader *frm_hdr;
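`refmvs_block` is now explicitly packed and 4-byte aligned, which is what lets the new `splat_mv` DSP routines fill an array of them with fixed-size aligned stores. A hedged compile-time check one could add, assuming an 8-byte `refmvs_mvpair` and a 2-byte `refmvs_refpair` (which would make the packed struct 12 bytes rather than the 16 an unpacked layout would round up to); this is an illustration, not part of the patch:

#include <assert.h>

/* Hypothetical guards: the splat routines depend on the packed 12-byte,
 * 4-byte-aligned layout of refmvs_block declared above. */
static_assert(sizeof(refmvs_block) == 12, "refmvs_block must stay packed");
static_assert(_Alignof(refmvs_block) == 4, "refmvs_block must be 4-byte aligned");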
@ -80,7 +79,7 @@ typedef struct refmvs_frame {

refmvs_block *r; // 35 x r_stride memory
ptrdiff_t r_stride;
int n_tile_rows, n_tile_threads;
int n_tile_rows, n_tile_threads, n_frame_threads;
} refmvs_frame;

typedef struct refmvs_tile {

@ -97,6 +96,14 @@ typedef struct refmvs_candidate {
int weight;
} refmvs_candidate;

#define decl_splat_mv_fn(name) \
void (name)(refmvs_block **rr, const refmvs_block *rmv, int bx4, int bw4, int bh4)
typedef decl_splat_mv_fn(*splat_mv_fn);

typedef struct Dav1dRefmvsDSPContext {
splat_mv_fn splat_mv;
} Dav1dRefmvsDSPContext;

// call once per frame thread
void dav1d_refmvs_init(refmvs_frame *rf);
void dav1d_refmvs_clear(refmvs_frame *rf);

@ -109,7 +116,7 @@ int dav1d_refmvs_init_frame(refmvs_frame *rf,
refmvs_temporal_block *rp,
const unsigned ref_ref_poc[7][7],
/*const*/ refmvs_temporal_block *const rp_ref[7],
int n_tile_threads);
int n_tile_threads, int n_frame_threads);

// initialize temporal MVs; this can be done in any configuration, e.g. one
// tile/sbrow at a time, where col_{start,end}8 are the tile boundaries; or

@ -129,7 +136,7 @@ void dav1d_refmvs_save_tmvs(const refmvs_tile *rt,
void dav1d_refmvs_tile_sbrow_init(refmvs_tile *rt, const refmvs_frame *rf,
int tile_col_start4, int tile_col_end4,
int tile_row_start4, int tile_row_end4,
int sby, int tile_row_idx);
int sby, int tile_row_idx, int pass);

// call for each block
void dav1d_refmvs_find(const refmvs_tile *rt,

@ -137,97 +144,8 @@ void dav1d_refmvs_find(const refmvs_tile *rt,
int *ctx, const refmvs_refpair ref, enum BlockSize bs,
enum EdgeFlags edge_flags, int by4, int bx4);

static inline void splat_oneref_mv(refmvs_tile *const rt,
const int by4, const int bx4,
const enum BlockSize bs,
const enum InterPredMode mode,
const int ref, const mv mv,
const int is_interintra)
{
const int bw4 = dav1d_block_dimensions[bs][0];
int bh4 = dav1d_block_dimensions[bs][1];
refmvs_block **rr = &rt->r[(by4 & 31) + 5];

const refmvs_block tmpl = (refmvs_block) {
.ref.ref = { ref + 1, is_interintra ? 0 : -1 },
.mv.mv[0] = mv,
.bs = bs,
.mf = (mode == GLOBALMV && imin(bw4, bh4) >= 2) | ((mode == NEWMV) * 2),
};
do {
refmvs_block *r = *rr++ + bx4;
for (int x = 0; x < bw4; x++)
r[x] = tmpl;
} while (--bh4);
}

static inline void splat_intrabc_mv(refmvs_tile *const rt,
const int by4, const int bx4,
const enum BlockSize bs, const mv mv)
{
const int bw4 = dav1d_block_dimensions[bs][0];
int bh4 = dav1d_block_dimensions[bs][1];
refmvs_block **rr = &rt->r[(by4 & 31) + 5];

const refmvs_block tmpl = (refmvs_block) {
.ref.ref = { 0, -1 },
.mv.mv[0] = mv,
.bs = bs,
.mf = 0,
};
do {
refmvs_block *r = *rr++ + bx4;
for (int x = 0; x < bw4; x++) {
r[x] = tmpl;
}
} while (--bh4);
}

static inline void splat_tworef_mv(refmvs_tile *const rt,
const int by4, const int bx4,
const enum BlockSize bs,
const enum CompInterPredMode mode,
const refmvs_refpair ref,
const refmvs_mvpair mv)
{
const int bw4 = dav1d_block_dimensions[bs][0];
int bh4 = dav1d_block_dimensions[bs][1];
refmvs_block **rr = &rt->r[(by4 & 31) + 5];

assert(bw4 >= 2 && bh4 >= 2);
const refmvs_block tmpl = (refmvs_block) {
.ref.pair = ref.pair + 0x0101,
.mv = mv,
.bs = bs,
.mf = (mode == GLOBALMV_GLOBALMV) | !!((1 << mode) & (0xbc)) * 2,
};
do {
refmvs_block *r = *rr++ + bx4;
for (int x = 0; x < bw4; x++)
r[x] = tmpl;
} while (--bh4);
}

static inline void splat_intraref(refmvs_tile *const rt,
const int by4, const int bx4,
const enum BlockSize bs)
{
const int bw4 = dav1d_block_dimensions[bs][0];
int bh4 = dav1d_block_dimensions[bs][1];
refmvs_block **rr = &rt->r[(by4 & 31) + 5];

const refmvs_block tmpl = (refmvs_block) {
.ref.ref = { 0, -1 },
.mv.mv[0].n = INVALID_MV,
.bs = bs,
.mf = 0,
};
do {
refmvs_block *r = *rr++ + bx4;
for (int x = 0; x < bw4; x++) {
r[x] = tmpl;
}
} while (--bh4);
}
void dav1d_refmvs_dsp_init(Dav1dRefmvsDSPContext *dsp);
void dav1d_refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *dsp);
void dav1d_refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *dsp);

#endif /* DAV1D_SRC_REF_MVS_H */

@ -27,345 +27,661 @@

#include "config.h"

#include "common/frame.h"

#include "src/thread_task.h"

int dav1d_task_create_filter_sbrow(Dav1dFrameContext *const f) {
struct PostFilterThreadData *const pftd = f->lf.thread.pftd;
const int frame_idx = (int)(f - f->c->fc);
// This function resets the cur pointer to the first frame theoretically
// executable after a task completed (i.e. each time we update some progress or
// insert some tasks in the queue).
// When frame_idx is set, it can be either from a completed task, or from tasks
// inserted in the queue, in which case we have to make sure the cur pointer
// isn't past this insert.
// The special case where frame_idx is UINT_MAX is to handle the reset after
// completing a task and locklessly signaling progress. In this case we don't
// enter a critical section, which is needed for this function, so we set an
// atomic for a delayed handling, happening here. Meaning we can call this
// function without any actual update other than what's in the atomic, hence
// this special case.
static inline int reset_task_cur(const Dav1dContext *const c,
struct TaskThreadData *const ttd,
unsigned frame_idx)
{
const unsigned first = atomic_load(&ttd->first);
if (!ttd->cur && c->fc[first].task_thread.task_cur_prev == NULL)
return 0;
unsigned reset_frame_idx = atomic_exchange(&ttd->reset_task_cur, UINT_MAX);
if (reset_frame_idx != UINT_MAX) {
if (frame_idx == UINT_MAX) {
if (reset_frame_idx > first + ttd->cur)
return 0;
ttd->cur = reset_frame_idx - first;
goto cur_found;
}
} else if (frame_idx == UINT_MAX)
return 0;
if (frame_idx < first) frame_idx += c->n_fc;
const unsigned min_frame_idx = umin(reset_frame_idx, frame_idx);
const unsigned cur_frame_idx = first + ttd->cur;
if (ttd->cur < c->n_fc && cur_frame_idx < min_frame_idx)
return 0;
for (ttd->cur = min_frame_idx - first; ttd->cur < c->n_fc; ttd->cur++)
if (c->fc[(first + ttd->cur) % c->n_fc].task_thread.task_head)
break;
cur_found:
for (unsigned i = ttd->cur; i < c->n_fc; i++)
c->fc[(first + i) % c->n_fc].task_thread.task_cur_prev = NULL;
return 1;
}

static inline void reset_task_cur_async(struct TaskThreadData *const ttd,
unsigned frame_idx, unsigned n_frames)
{
if (frame_idx < (unsigned)atomic_load(&ttd->first)) frame_idx += n_frames;
unsigned last_idx = frame_idx;
do {
frame_idx = last_idx;
last_idx = atomic_exchange(&ttd->reset_task_cur, frame_idx);
} while (last_idx < frame_idx);
}
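The loop in `reset_task_cur_async` above is a small lock-free "atomic minimum": each thread exchanges its candidate into the slot and, if it displaced a smaller value, retries with that smaller value, so the slot converges to the minimum without a compare-and-swap loop. A standalone restatement with a hypothetical helper name:

#include <stdatomic.h>

/* Store min(*slot, v) into *slot without taking a lock. If our exchange
 * displaced a smaller value, put that value back and try again; the loop
 * terminates because the candidate strictly decreases. */
static void atomic_store_min(atomic_uint *slot, unsigned v)
{
    unsigned prev = atomic_exchange(slot, v);
    while (prev < v) {
        v = prev;
        prev = atomic_exchange(slot, v);
    }
}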
static void insert_tasks_between(Dav1dFrameContext *const f,
Dav1dTask *const first, Dav1dTask *const last,
Dav1dTask *const a, Dav1dTask *const b,
const int cond_signal)
{
struct TaskThreadData *const ttd = f->task_thread.ttd;
if (atomic_load(f->c->flush)) return;
assert(!a || a->next == b);
if (!a) f->task_thread.task_head = first;
else a->next = first;
if (!b) f->task_thread.task_tail = last;
last->next = b;
reset_task_cur(f->c, ttd, first->frame_idx);
if (cond_signal && !atomic_fetch_or(&ttd->cond_signaled, 1))
pthread_cond_signal(&ttd->cond);
}

static void insert_tasks(Dav1dFrameContext *const f,
Dav1dTask *const first, Dav1dTask *const last,
const int cond_signal)
{
// insert task back into task queue
Dav1dTask *t_ptr, *prev_t = NULL;
for (t_ptr = f->task_thread.task_head;
t_ptr; prev_t = t_ptr, t_ptr = t_ptr->next)
{
// entropy coding precedes other steps
if (t_ptr->type == DAV1D_TASK_TYPE_TILE_ENTROPY) {
if (first->type > DAV1D_TASK_TYPE_TILE_ENTROPY) continue;
// both are entropy
if (first->sby > t_ptr->sby) continue;
if (first->sby < t_ptr->sby) {
insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
return;
}
// same sby
} else {
if (first->type == DAV1D_TASK_TYPE_TILE_ENTROPY) {
insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
return;
}
if (first->sby > t_ptr->sby) continue;
if (first->sby < t_ptr->sby) {
insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
return;
}
// same sby
if (first->type > t_ptr->type) continue;
if (first->type < t_ptr->type) {
insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
return;
}
// same task type
}

// sort by tile-id
assert(first->type <= DAV1D_TASK_TYPE_TILE_RECONSTRUCTION);
assert(first->type == t_ptr->type);
assert(t_ptr->sby == first->sby);
const int p = first->type == DAV1D_TASK_TYPE_TILE_ENTROPY;
const int t_tile_idx = (int) (first - f->task_thread.tile_tasks[p]);
const int p_tile_idx = (int) (t_ptr - f->task_thread.tile_tasks[p]);
assert(t_tile_idx != p_tile_idx);
if (t_tile_idx > p_tile_idx) continue;
insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
return;
}
// append at the end
insert_tasks_between(f, first, last, prev_t, NULL, cond_signal);
}

static inline void insert_task(Dav1dFrameContext *const f,
Dav1dTask *const t, const int cond_signal)
{
insert_tasks(f, t, t, cond_signal);
}
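`insert_tasks` keeps the per-frame list sorted so that workers always pick the most pipeline-critical runnable task first. A compact restatement of that ordering as a hypothetical comparator (not part of the patch):

/* Returns nonzero if task a should run before task b in the frame's list:
 * entropy tasks first, then lower superblock rows, then earlier pipeline
 * stages (task type), with tile index as a stable tie-break. */
typedef struct { int is_entropy, sby, type, tile_idx; } task_key;

static int runs_before(const task_key *a, const task_key *b)
{
    if (a->is_entropy != b->is_entropy) return a->is_entropy;
    if (a->sby != b->sby) return a->sby < b->sby;
    if (a->type != b->type) return a->type < b->type;
    return a->tile_idx < b->tile_idx;
}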
static int create_filter_sbrow(Dav1dFrameContext *const f,
const int pass, Dav1dTask **res_t)
{
const int has_deblock = f->frame_hdr->loopfilter.level_y[0] ||
f->frame_hdr->loopfilter.level_y[1] ||
f->lf.restore_planes;
f->frame_hdr->loopfilter.level_y[1];
const int has_cdef = f->seq_hdr->cdef;
const int has_resize = f->frame_hdr->width[0] != f->frame_hdr->width[1];
const int has_lr = !!f->lf.restore_planes;
f->lf.thread.npf = has_deblock + has_cdef + has_resize + has_lr;
if (f->lf.thread.npf == 0) return 0;
const int has_lr = f->lf.restore_planes;

pthread_mutex_lock(&pftd->lock);

Dav1dTask *tasks = f->lf.thread.tasks;
int num_tasks = f->sbh * f->lf.thread.npf;
if (num_tasks > f->lf.thread.num_tasks) {
Dav1dTask *tasks = f->task_thread.tasks;
const int uses_2pass = f->c->n_fc > 1;
int num_tasks = f->sbh * (1 + uses_2pass);
if (num_tasks > f->task_thread.num_tasks) {
const size_t size = sizeof(Dav1dTask) * num_tasks;
tasks = realloc(f->lf.thread.tasks, size);
if (!tasks) {
pthread_mutex_unlock(&pftd->lock);
return -1;
}
tasks = realloc(f->task_thread.tasks, size);
if (!tasks) return -1;
memset(tasks, 0, size);
f->lf.thread.tasks = tasks;
f->lf.thread.num_tasks = num_tasks;
f->task_thread.tasks = tasks;
f->task_thread.num_tasks = num_tasks;
}
tasks += f->sbh * (pass & 1);

#define create_task(task, ready_cond, start_cond) \
do { \
t = &tasks[num_tasks++]; \
t->status = ready_cond ? DAV1D_TASK_READY : DAV1D_TASK_DEFAULT; \
t->start = start_cond; \
t->frame_id = frame_cnt; \
t->frame_idx = frame_idx; \
t->sby = sby; \
t->fn = f->bd_fn.filter_sbrow_##task; \
t->last_deps[0] = NULL; \
t->last_deps[1] = NULL; \
t->next_deps[0] = NULL; \
t->next_deps[1] = NULL; \
t->next_exec = NULL; \
} while (0)

Dav1dTask *last_sbrow_deblock = NULL;
Dav1dTask *last_sbrow_cdef = NULL;
Dav1dTask *last_sbrow_resize = NULL;
Dav1dTask *last_sbrow_lr = NULL;
num_tasks = 0;
const int frame_cnt = pftd->frame_cnt++;

for (int sby = 0; sby < f->sbh; ++sby) {
Dav1dTask *t;
Dav1dTask *last = NULL;
if (has_deblock) {
create_task(deblock, sby == 0, 0);
if (sby) {
t->last_deps[1] = last_sbrow_deblock;
last_sbrow_deblock->next_deps[1] = t;
}
last = t;
last_sbrow_deblock = t;
}
if (has_cdef) {
create_task(cdef, sby == 0 && !has_deblock, has_deblock);
if (has_deblock) {
t->last_deps[0] = last;
last->next_deps[0] = t;
}
if (sby) {
t->last_deps[1] = last_sbrow_cdef;
last_sbrow_cdef->next_deps[1] = t;
}
last = t;
last_sbrow_cdef = t;
};
if (has_resize) {
create_task(resize, sby == 0 && !last, !!last);
if (last) {
t->last_deps[0] = last;
last->next_deps[0] = t;
}
if (sby) {
t->last_deps[1] = last_sbrow_resize;
last_sbrow_resize->next_deps[1] = t;
}
last = t;
last_sbrow_resize = t;
}
if (has_lr) {
create_task(lr, sby == 0 && !last, !!last);
if (last) {
t->last_deps[0] = last;
last->next_deps[0] = t;
}
if (sby) {
t->last_deps[1] = last_sbrow_lr;
last_sbrow_lr->next_deps[1] = t;
}
last_sbrow_lr = t;
}
if (pass & 1) {
f->frame_thread.entropy_progress = 0;
} else {
atomic_store(&f->frame_thread.deblock_progress, 0);
atomic_store(&f->frame_thread.cdef_progress, 0);
atomic_store(&f->frame_thread.lr_progress, 0);
}
f->lf.thread.done = 0;
pthread_mutex_unlock(&pftd->lock);
f->frame_thread.next_tile_row[pass & 1] = 0;

Dav1dTask *t = &tasks[0];
t->sby = 0;
t->recon_progress = 1;
t->deblock_progress = 0;
t->cdef_progress = 0;
t->lr_progress = 0;
t->type = pass == 1 ? DAV1D_TASK_TYPE_ENTROPY_PROGRESS :
has_deblock ? DAV1D_TASK_TYPE_DEBLOCK_COLS :
has_lr /* i.e. LR backup */ ? DAV1D_TASK_TYPE_DEBLOCK_ROWS :
has_cdef ? DAV1D_TASK_TYPE_CDEF :
has_resize ? DAV1D_TASK_TYPE_SUPER_RESOLUTION :
DAV1D_TASK_TYPE_LOOP_RESTORATION;
t->frame_idx = (int)(f - f->c->fc);

*res_t = t;
return 0;
}

int dav1d_task_create_tile_sbrow(Dav1dFrameContext *const f, const int pass,
const int cond_signal)
{
Dav1dTask *tasks = f->task_thread.tile_tasks[0];
const int uses_2pass = f->c->n_fc > 1;
const int num_tasks = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;
int alloc_num_tasks = num_tasks * (1 + uses_2pass);
if (alloc_num_tasks > f->task_thread.num_tile_tasks) {
const size_t size = sizeof(Dav1dTask) * alloc_num_tasks;
tasks = realloc(f->task_thread.tile_tasks[0], size);
if (!tasks) return -1;
memset(tasks, 0, size);
f->task_thread.tile_tasks[0] = tasks;
f->task_thread.num_tile_tasks = alloc_num_tasks;
}
f->task_thread.tile_tasks[1] = tasks + num_tasks;
tasks += num_tasks * (pass & 1);

Dav1dTask *pf_t;
if (create_filter_sbrow(f, pass, &pf_t))
return -1;

Dav1dTask *prev_t = NULL;
for (int tile_idx = 0; tile_idx < num_tasks; tile_idx++) {
Dav1dTileState *const ts = &f->ts[tile_idx];
Dav1dTask *t = &tasks[tile_idx];
t->sby = ts->tiling.row_start >> f->sb_shift;
if (pf_t && t->sby) {
prev_t->next = pf_t;
prev_t = pf_t;
pf_t = NULL;
}
t->recon_progress = 0;
t->deblock_progress = 0;
t->cdef_progress = 0;
t->lr_progress = 0;
t->deps_skip = 0;
t->type = pass != 1 ? DAV1D_TASK_TYPE_TILE_RECONSTRUCTION :
DAV1D_TASK_TYPE_TILE_ENTROPY;
t->frame_idx = (int)(f - f->c->fc);
if (prev_t) prev_t->next = t;
prev_t = t;
}
if (pf_t) {
prev_t->next = pf_t;
prev_t = pf_t;
}
insert_tasks(f, &tasks[0], prev_t, cond_signal);
f->task_thread.done[pass & 1] = 0;

return 0;
}

void dav1d_task_schedule(struct PostFilterThreadData *const pftd,
Dav1dTask *const t)
void dav1d_task_frame_init(Dav1dFrameContext *const f) {
const Dav1dContext *const c = f->c;

// schedule init task, which will schedule the remaining tasks
Dav1dTask *const t = &f->task_thread.init_task;
t->type = DAV1D_TASK_TYPE_INIT;
t->frame_idx = (int)(f - c->fc);
t->sby = 0;
t->recon_progress = t->deblock_progress = 0;
t->cdef_progress = t->lr_progress = 0;
insert_task(f, t, 1);
}

static inline int ensure_progress(struct TaskThreadData *const ttd,
Dav1dFrameContext *const f,
Dav1dTask *const t, const enum TaskType type,
atomic_int *const state, int *const target)
{
Dav1dTask **pt = &pftd->tasks;
while (*pt &&
((*pt)->sby < t->sby ||
((*pt)->sby == t->sby && (*pt)->frame_id <= t->frame_id)))
pt = &(*pt)->next_exec;
t->next_exec = *pt;
*pt = t;
pthread_cond_signal(&pftd->cond);
// deblock_rows (non-LR portion) depends on deblock of previous sbrow,
// so ensure that completed. if not, re-add to task-queue; else, fall-through
int p1 = atomic_load(state);
if (p1 < t->sby) {
pthread_mutex_lock(&ttd->lock);
p1 = atomic_load(state);
if (p1 < t->sby) {
t->type = type;
t->deblock_progress = t->recon_progress = 0;
t->cdef_progress = t->lr_progress = 0;
*target = t->sby;
insert_task(f, t, 0);
return 1;
}
pthread_mutex_unlock(&ttd->lock);
}
return 0;
}
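`ensure_progress` uses a double-checked pattern: a lock-free peek at the progress atomic, then a re-check under the scheduler lock before converting the task and requeueing it rather than blocking a worker. A simplified sketch with hypothetical names (note the real function re-inserts the task while still holding the lock):

#include <pthread.h>
#include <stdatomic.h>

/* Returns 1 if progress is still short of `sby` even under the lock,
 * in which case the caller requeues the task instead of waiting. */
static int needs_requeue(atomic_int *progress, int sby, pthread_mutex_t *lock)
{
    if (atomic_load(progress) >= sby) return 0;       /* fast path, no lock */
    pthread_mutex_lock(lock);
    const int stalled = atomic_load(progress) < sby;  /* re-check under lock */
    pthread_mutex_unlock(lock);
    return stalled;
}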
static inline void update_task(Dav1dTask *const t, const int dep_type,
Dav1dFrameContext *const f)
static inline int check_tile(Dav1dTask *const t, Dav1dFrameContext *const f,
const int frame_mt)
{
if (!t->last_deps[!dep_type] ||
t->last_deps[!dep_type]->status == DAV1D_TASK_DONE)
{
t->status = DAV1D_TASK_READY;
if (t->start)
dav1d_task_schedule(f->lf.thread.pftd, t);
const int tp = t->type == DAV1D_TASK_TYPE_TILE_ENTROPY;
const int tile_idx = (int)(t - f->task_thread.tile_tasks[tp]);
Dav1dTileState *const ts = &f->ts[tile_idx];
const int p1 = atomic_load(&ts->progress[tp]);
if (p1 < t->sby) return 1;
int error = p1 == TILE_ERROR;
error |= atomic_fetch_or(&f->task_thread.error, error);
if (!error && frame_mt && !tp) {
const int p2 = atomic_load(&ts->progress[1]);
if (p2 <= t->sby) return 1;
error = p2 == TILE_ERROR;
error |= atomic_fetch_or(&f->task_thread.error, error);
}
if (!error && frame_mt && !IS_KEY_OR_INTRA(f->frame_hdr)) {
// check reference state
const Dav1dThreadPicture *p = &f->sr_cur;
const int ss_ver = p->p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const unsigned p_b = (t->sby + 1) << (f->sb_shift + 2);
const int tile_sby = t->sby - (ts->tiling.row_start >> f->sb_shift);
const int (*const lowest_px)[2] = ts->lowest_pixel[tile_sby];
for (int n = t->deps_skip; n < 7; n++, t->deps_skip++) {
unsigned lowest;
if (tp) {
// if temporal mv refs are disabled, we only need this
// for the primary ref; if segmentation is disabled, we
// don't even need that
lowest = p_b;
} else {
// +8 is postfilter-induced delay
const int y = lowest_px[n][0] == INT_MIN ? INT_MIN :
lowest_px[n][0] + 8;
const int uv = lowest_px[n][1] == INT_MIN ? INT_MIN :
lowest_px[n][1] * (1 << ss_ver) + 8;
const int max = imax(y, uv);
if (max == INT_MIN) continue;
lowest = iclip(max, 1, f->refp[n].p.p.h);
}
const unsigned p3 = atomic_load(&f->refp[n].progress[!tp]);
if (p3 < lowest) return 1;
atomic_fetch_or(&f->task_thread.error, p3 == FRAME_ERROR);
}
}
return 0;
}
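The reference-progress arithmetic in `check_tile` reads cleanly in isolation: each of the up to seven references must have decoded past the lowest pixel row this tile sbrow may fetch, plus an 8-row postfilter delay, with chroma rows scaled back to luma units. A hypothetical standalone helper with the same math (for example, with 4:2:0 subsampling, a luma lowest pixel of 120 and a chroma lowest pixel of 70 require max(120 + 8, 70 * 2 + 8) = 148 decoded rows, clipped to the reference height):

#include <limits.h>

/* 0 means "no dependency on this reference" (neither plane is touched). */
static unsigned required_ref_rows(int y_lowest, int uv_lowest,
                                  int ss_ver, int ref_h)
{
    /* +8 accounts for the postfilter-induced delay, as in check_tile(). */
    const int y  = y_lowest  == INT_MIN ? INT_MIN : y_lowest + 8;
    const int uv = uv_lowest == INT_MIN ? INT_MIN : uv_lowest * (1 << ss_ver) + 8;
    const int max = y > uv ? y : uv;
    if (max == INT_MIN) return 0;
    return (unsigned)(max < 1 ? 1 : max > ref_h ? ref_h : max);  /* iclip */
}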
void *dav1d_frame_task(void *const data) {
Dav1dFrameContext *const f = data;
void *dav1d_worker_task(void *data) {
Dav1dTaskContext *const tc = data;
const Dav1dContext *const c = tc->c;
struct TaskThreadData *const ttd = tc->task_thread.ttd;

dav1d_set_thread_name("dav1d-frame");
pthread_mutex_lock(&f->frame_thread.td.lock);
dav1d_set_thread_name("dav1d-worker");

pthread_mutex_lock(&ttd->lock);
for (;;) {
while (!f->n_tile_data && !f->frame_thread.die) {
pthread_cond_wait(&f->frame_thread.td.cond,
&f->frame_thread.td.lock);
}
if (f->frame_thread.die) break;
pthread_mutex_unlock(&f->frame_thread.td.lock);

if (dav1d_decode_frame(f))
memset(f->frame_thread.cf, 0,
(size_t)f->frame_thread.cf_sz * 128 * 128 / 2);

pthread_mutex_lock(&f->frame_thread.td.lock);
f->n_tile_data = 0;
pthread_cond_signal(&f->frame_thread.td.cond);
}
pthread_mutex_unlock(&f->frame_thread.td.lock);

return NULL;
}

void *dav1d_tile_task(void *const data) {
Dav1dTileContext *const t = data;
struct FrameTileThreadData *const fttd = t->tile_thread.fttd;
const Dav1dFrameContext *const f = t->f;
const int tile_thread_idx = (int) (t - f->tc);
const uint64_t mask = 1ULL << tile_thread_idx;

dav1d_set_thread_name("dav1d-tile");

for (;;) {
pthread_mutex_lock(&fttd->lock);
fttd->available |= mask;
int did_signal = 0;
while (!fttd->tasks_left && !t->tile_thread.die) {
if (!did_signal) {
did_signal = 1;
pthread_cond_signal(&fttd->icond);
Dav1dFrameContext *f;
Dav1dTask *t, *prev_t;
if (tc->task_thread.die) break;
if (atomic_load(c->flush)) goto park;
while (ttd->cur < c->n_fc) {
const unsigned first = atomic_load(&ttd->first);
f = &c->fc[(first + ttd->cur) % c->n_fc];
prev_t = f->task_thread.task_cur_prev;
t = prev_t ? prev_t->next : f->task_thread.task_head;
while (t) {
if (t->type == DAV1D_TASK_TYPE_INIT) {
const int p1 = f->in_cdf.progress ?
atomic_load(f->in_cdf.progress) : 1;
if (p1) {
atomic_fetch_or(&f->task_thread.error, p1 == TILE_ERROR);
goto found;
}
} else if (t->type <= DAV1D_TASK_TYPE_TILE_RECONSTRUCTION) {
// if not bottom sbrow of tile, this task will be re-added
// after it's finished
if (!check_tile(t, f, c->n_fc > 1))
goto found;
} else if (t->recon_progress) {
const int p = t->type == DAV1D_TASK_TYPE_ENTROPY_PROGRESS;
int error = atomic_load(&f->task_thread.error);
assert(!f->task_thread.done[p] || error);
const int tile_row_base = f->frame_hdr->tiling.cols *
f->frame_thread.next_tile_row[p];
if (p) {
const int p1 = f->frame_thread.entropy_progress;
if (p1 < t->sby) goto next;
atomic_fetch_or(&f->task_thread.error, p1 == TILE_ERROR);
}
for (int tc = 0; tc < f->frame_hdr->tiling.cols; tc++) {
Dav1dTileState *const ts = &f->ts[tile_row_base + tc];
const int p2 = atomic_load(&ts->progress[p]);
if (p2 < t->recon_progress) goto next;
atomic_fetch_or(&f->task_thread.error, p2 == TILE_ERROR);
}
if (!p) {
atomic_int *state = NULL;
int needed;
if (t->cdef_progress) {
state = &f->frame_thread.cdef_progress;
needed = t->cdef_progress;
} else if (t->lr_progress) {
state = &f->frame_thread.lr_progress;
needed = t->lr_progress;
}
if (state) {
const int p3 = atomic_load(state);
if (p3 < needed) goto next;
atomic_fetch_or(&f->task_thread.error, p3 == TILE_ERROR);
}
}
if (t->sby + 1 < f->sbh) {
// add sby+1 to list to replace this one
Dav1dTask *next_t = &t[1];
*next_t = *t;
next_t->sby++;
const int ntr = f->frame_thread.next_tile_row[p] + 1;
const int start = f->frame_hdr->tiling.row_start_sb[ntr];
if (next_t->sby == start)
f->frame_thread.next_tile_row[p] = ntr;
next_t->recon_progress = next_t->sby + 1;
if (t->type == DAV1D_TASK_TYPE_CDEF)
next_t->cdef_progress = next_t->sby;
else if (t->type == DAV1D_TASK_TYPE_LOOP_RESTORATION)
next_t->lr_progress = next_t->sby;
insert_task(f, next_t, 0);
}
goto found;
} else {
assert(!!t->deblock_progress + !!t->cdef_progress + !!t->lr_progress == 1);
atomic_int *state;
int needed;
if (t->deblock_progress) {
needed = t->deblock_progress;
state = &f->frame_thread.deblock_progress;
} else if (t->cdef_progress) {
needed = t->cdef_progress;
state = &f->frame_thread.cdef_progress;
} else {
assert(t->lr_progress);
needed = t->lr_progress;
state = &f->frame_thread.lr_progress;
}
const int p1 = atomic_load(state);
if (p1 >= needed) {
atomic_fetch_or(&f->task_thread.error, p1 == TILE_ERROR);
goto found;
}
}
next:
prev_t = t;
t = t->next;
f->task_thread.task_cur_prev = prev_t;
}
pthread_cond_wait(&fttd->cond, &fttd->lock);
ttd->cur++;
}
if (t->tile_thread.die) {
pthread_cond_signal(&fttd->icond);
pthread_mutex_unlock(&fttd->lock);
break;
}
fttd->available &= ~mask;
const int task_idx = fttd->num_tasks - fttd->tasks_left--;
pthread_mutex_unlock(&fttd->lock);
if (reset_task_cur(c, ttd, UINT_MAX)) continue;
park:
tc->task_thread.flushed = 1;
pthread_cond_signal(&tc->task_thread.td.cond);
// we want to be woken up next time progress is signaled
atomic_store(&ttd->cond_signaled, 0);
pthread_cond_wait(&ttd->cond, &ttd->lock);
tc->task_thread.flushed = 0;
reset_task_cur(c, ttd, UINT_MAX);
continue;

if (f->frame_thread.pass == 1 || f->n_tc >= f->frame_hdr->tiling.cols) {
// we can (or in fact, if >, we need to) do full tile decoding.
// loopfilter happens in the main thread
Dav1dTileState *const ts = t->ts = &f->ts[task_idx];
for (t->by = ts->tiling.row_start; t->by < ts->tiling.row_end;
t->by += f->sb_step)
{
const int error = dav1d_decode_tile_sbrow(t);
const int progress = error ? TILE_ERROR : 1 + (t->by >> f->sb_shift);
found:
// remove t from list
if (prev_t) prev_t->next = t->next;
else f->task_thread.task_head = t->next;
if (!t->next) f->task_thread.task_tail = prev_t;
if (!f->task_thread.task_head) ttd->cur++;
// we don't need to check cond_signaled here, since we found a task
// after the last signal so we want to re-signal the next waiting thread
// and again won't need to signal after that
atomic_store(&ttd->cond_signaled, 1);
pthread_cond_signal(&ttd->cond);
pthread_mutex_unlock(&ttd->lock);
found_unlocked:;
const int flush = atomic_load(c->flush);
int error = atomic_fetch_or(&f->task_thread.error, flush) | flush;

// signal progress
pthread_mutex_lock(&ts->tile_thread.lock);
atomic_store(&ts->progress, progress);
pthread_cond_signal(&ts->tile_thread.cond);
pthread_mutex_unlock(&ts->tile_thread.lock);
if (error) break;
// run it
tc->f = f;
int sby = t->sby;
switch (t->type) {
case DAV1D_TASK_TYPE_INIT: {
assert(c->n_fc > 1);
int res = -1;
if (!atomic_load(&f->task_thread.error))
res = dav1d_decode_frame_init(f);
pthread_mutex_lock(&ttd->lock);
if (f->frame_hdr->refresh_context && !f->task_thread.update_set) {
atomic_store(f->out_cdf.progress, res < 0 ? TILE_ERROR : 1);
}
} else {
const int sby = f->tile_thread.task_idx_to_sby_and_tile_idx[task_idx][0];
const int tile_idx = f->tile_thread.task_idx_to_sby_and_tile_idx[task_idx][1];
if (!res) {
assert(c->n_fc > 1);
for (int p = 1; p <= 2; p++) {
const int res = dav1d_task_create_tile_sbrow(f, p, 0);
if (res) {
// memory allocation failed
f->task_thread.done[2 - p] = 1;
atomic_store(&f->task_thread.error, 1);
f->task_thread.task_counter -= f->sbh +
f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;
atomic_store(&f->sr_cur.progress[p - 1], FRAME_ERROR);
if (p == 2 && f->task_thread.done[1]) {
assert(!f->task_thread.task_counter);
dav1d_decode_frame_exit(f, -1);
f->n_tile_data = 0;
pthread_cond_signal(&f->task_thread.cond);
}
}
}
} else {
// init failed, signal completion
atomic_store(&f->task_thread.error, 1);
f->task_thread.task_counter = 0;
f->task_thread.done[0] = 1;
f->task_thread.done[1] = 1;
atomic_store(&f->sr_cur.progress[0], FRAME_ERROR);
atomic_store(&f->sr_cur.progress[1], FRAME_ERROR);
dav1d_decode_frame_exit(f, -1);
f->n_tile_data = 0;
pthread_cond_signal(&f->task_thread.cond);
}
reset_task_cur(c, ttd, t->frame_idx);
continue;
}
case DAV1D_TASK_TYPE_TILE_ENTROPY:
case DAV1D_TASK_TYPE_TILE_RECONSTRUCTION: {
const int p = t->type == DAV1D_TASK_TYPE_TILE_ENTROPY;
const int tile_idx = (int)(t - f->task_thread.tile_tasks[p]);
Dav1dTileState *const ts = &f->ts[tile_idx];
int progress;
// the interleaved decoding can sometimes cause dependency issues
// if one part of the frame decodes significantly faster than others.
// Ideally, we'd "skip" tile_sbrows where dependencies are missing,
// and resume them later as dependencies are met. This also would
// solve the broadcast() below and allow us to use signal(). However,
// for now, we use linear dependency tracking because it's simpler.
if ((progress = atomic_load(&ts->progress)) < sby) {
pthread_mutex_lock(&ts->tile_thread.lock);
while ((progress = atomic_load(&ts->progress)) < sby)
pthread_cond_wait(&ts->tile_thread.cond,
&ts->tile_thread.lock);
pthread_mutex_unlock(&ts->tile_thread.lock);
}
if (progress == TILE_ERROR) continue;

// we need to interleave sbrow decoding for all tile cols in a
// tile row, since otherwise subsequent threads will be blocked
// waiting for the post-filter to complete
t->ts = ts;
t->by = sby << f->sb_shift;
const int error = dav1d_decode_tile_sbrow(t);
progress = error ? TILE_ERROR : 1 + sby;
tc->ts = ts;
tc->by = sby << f->sb_shift;
const int uses_2pass = c->n_fc > 1;
tc->frame_thread.pass = !uses_2pass ? 0 :
1 + (t->type == DAV1D_TASK_TYPE_TILE_RECONSTRUCTION);
if (!error) error = dav1d_decode_tile_sbrow(tc);
const int progress = error ? TILE_ERROR : 1 + sby;

// signal progress
pthread_mutex_lock(&ts->tile_thread.lock);
atomic_store(&ts->progress, progress);
pthread_cond_broadcast(&ts->tile_thread.cond);
pthread_mutex_unlock(&ts->tile_thread.lock);
}
}

return NULL;
}

static inline int handle_abortion(Dav1dPostFilterContext *const pf,
Dav1dContext *const c,
struct PostFilterThreadData *const pftd)
{
const int flush = atomic_load_explicit(c->flush, memory_order_acquire);
if (flush) {
pthread_mutex_lock(&pf->td.lock);
pf->flushed = 0;
pthread_mutex_unlock(&pf->td.lock);
}
for (unsigned i = 0; i < c->n_fc; i++) {
Dav1dFrameContext *const f = &c->fc[i];
int send_signal;
if (flush) // TODO before merge, see if this can be safely merged
send_signal = f->lf.thread.done != 1 && f->lf.thread.num_tasks != 0;
else
send_signal = f->lf.thread.done == -1;
for (int j = 0; send_signal && j < f->lf.thread.num_tasks; j++) {
Dav1dTask *const t = &f->lf.thread.tasks[j];
if (t->status == DAV1D_TASK_RUNNING ||
(t->status == DAV1D_TASK_DONE && t->start != -1))
send_signal = 0;
}
if (send_signal) {
if (!flush) {
Dav1dTask **pt = &pftd->tasks;
while (*pt) {
if ((*pt)->frame_idx == i)
*pt = (*pt)->next_exec;
else
pt = &(*pt)->next_exec;
atomic_fetch_or(&f->task_thread.error, error);
if (((sby + 1) << f->sb_shift) < ts->tiling.row_end) {
t->sby++;
t->deps_skip = 0;
if (!check_tile(t, f, uses_2pass)) {
atomic_store(&ts->progress[p], progress);
reset_task_cur_async(ttd, t->frame_idx, c->n_fc);
if (!atomic_fetch_or(&ttd->cond_signaled, 1))
pthread_cond_signal(&ttd->cond);
goto found_unlocked;
}
pthread_mutex_lock(&ttd->lock);
atomic_store(&ts->progress[p], progress);
reset_task_cur(c, ttd, t->frame_idx);
insert_task(f, t, 0);
} else {
pthread_mutex_lock(&ttd->lock);
atomic_store(&ts->progress[p], progress);
reset_task_cur(c, ttd, t->frame_idx);
error = atomic_load(&f->task_thread.error);
if (f->frame_hdr->refresh_context &&
tc->frame_thread.pass <= 1 && f->task_thread.update_set &&
f->frame_hdr->tiling.update == tile_idx)
{
if (!error)
dav1d_cdf_thread_update(f->frame_hdr, f->out_cdf.data.cdf,
&f->ts[f->frame_hdr->tiling.update].cdf);
if (c->n_fc > 1)
atomic_store(f->out_cdf.progress, error ? TILE_ERROR : 1);
}
if (!--f->task_thread.task_counter && f->task_thread.done[0] &&
(!uses_2pass || f->task_thread.done[1]))
{
dav1d_decode_frame_exit(f, error ? -1 : 0);
f->n_tile_data = 0;
pthread_cond_signal(&f->task_thread.cond);
}
assert(f->task_thread.task_counter >= 0);
if (!atomic_fetch_or(&ttd->cond_signaled, 1))
pthread_cond_signal(&ttd->cond);
}
f->lf.thread.done = 1;
pthread_cond_signal(&f->lf.thread.cond);
continue;
}
}
if (flush) {
pthread_mutex_lock(&pf->td.lock);
pf->flushed = 1;
pthread_cond_signal(&pf->td.cond);
pthread_mutex_unlock(&pf->td.lock);
}
return !flush;
}

void *dav1d_postfilter_task(void *data) {
Dav1dPostFilterContext *const pf = data;
Dav1dContext *const c = pf->c;
struct PostFilterThreadData *pftd = &c->postfilter_thread;

dav1d_set_thread_name("dav1d-postfilter");

int exec = 1;
pthread_mutex_lock(&pftd->lock);
for (;;) {
if (!exec && !pf->die)
pthread_cond_wait(&pftd->cond, &pftd->lock);
if (!(exec = handle_abortion(pf, c, pftd))) continue;
if (pf->die) break;

Dav1dTask *const t = pftd->tasks;
if (!t) { exec = 0; continue; }
pftd->tasks = t->next_exec;
t->status = DAV1D_TASK_RUNNING;

pthread_mutex_unlock(&pftd->lock);
Dav1dFrameContext *const f = &c->fc[t->frame_idx];
t->fn(f, t->sby);
exec = 1;
pthread_mutex_lock(&pftd->lock);

if (t->next_deps[0])
update_task(t->next_deps[0], 0, f);
if (t->next_deps[1])
update_task(t->next_deps[1], 1, f);
t->status = DAV1D_TASK_DONE;
if (!t->next_deps[0]) {
const enum PlaneType progress_plane_type =
c->n_fc > 1 && f->frame_hdr->refresh_context ?
PLANE_TYPE_Y : PLANE_TYPE_ALL;
const int y = (t->sby + 1) * f->sb_step * 4;
dav1d_thread_picture_signal(&f->sr_cur, y, progress_plane_type);
if (t->sby + 1 == f->sbh) {
f->lf.thread.done = 1;
pthread_cond_signal(&f->lf.thread.cond);
case DAV1D_TASK_TYPE_DEBLOCK_COLS:
if (!atomic_load(&f->task_thread.error))
f->bd_fn.filter_sbrow_deblock_cols(f, sby);
if (ensure_progress(ttd, f, t, DAV1D_TASK_TYPE_DEBLOCK_ROWS,
&f->frame_thread.deblock_progress,
&t->deblock_progress)) continue;
// fall-through
case DAV1D_TASK_TYPE_DEBLOCK_ROWS:
if (!atomic_load(&f->task_thread.error))
f->bd_fn.filter_sbrow_deblock_rows(f, sby);
// signal deblock progress
if (f->frame_hdr->loopfilter.level_y[0] ||
f->frame_hdr->loopfilter.level_y[1])
{
error = atomic_load(&f->task_thread.error);
atomic_store(&f->frame_thread.deblock_progress,
error ? TILE_ERROR : sby + 1);
reset_task_cur_async(ttd, t->frame_idx, c->n_fc);
if (!atomic_fetch_or(&ttd->cond_signaled, 1))
pthread_cond_signal(&ttd->cond);
}
// fall-through
case DAV1D_TASK_TYPE_CDEF:
if (f->seq_hdr->cdef) {
// cdef caches top (pre-cdef) buffers internally and therefore
// needs to be vertically linear
if (ensure_progress(ttd, f, t, DAV1D_TASK_TYPE_CDEF,
&f->frame_thread.cdef_progress,
&t->cdef_progress)) continue;
if (!atomic_load(&f->task_thread.error))
f->bd_fn.filter_sbrow_cdef(f, sby);
// signal cdef progress
error = atomic_load(&f->task_thread.error);
atomic_store(&f->frame_thread.cdef_progress,
error ? TILE_ERROR : sby + 1);
reset_task_cur_async(ttd, t->frame_idx, c->n_fc);
if (!atomic_fetch_or(&ttd->cond_signaled, 1))
pthread_cond_signal(&ttd->cond);
}
// fall-through
case DAV1D_TASK_TYPE_SUPER_RESOLUTION:
if (f->frame_hdr->width[0] != f->frame_hdr->width[1])
if (!atomic_load(&f->task_thread.error))
f->bd_fn.filter_sbrow_resize(f, sby);
// fall-through
case DAV1D_TASK_TYPE_LOOP_RESTORATION:
// lr is the last step before signaling frame completion, and
// therefore needs to be done vertically linear
if (ensure_progress(ttd, f, t, DAV1D_TASK_TYPE_LOOP_RESTORATION,
&f->frame_thread.lr_progress,
&t->lr_progress)) continue;
if (!atomic_load(&f->task_thread.error) && f->lf.restore_planes)
f->bd_fn.filter_sbrow_lr(f, sby);
// fall-through
case DAV1D_TASK_TYPE_ENTROPY_PROGRESS:
// dummy to convert tile to frame
break;
default: abort();
}
t->start = -1;
// if task completed [typically LR], signal picture progress as per below
const int uses_2pass = c->n_fc > 1;
const enum PlaneType progress_plane_type =
t->type == DAV1D_TASK_TYPE_ENTROPY_PROGRESS ? PLANE_TYPE_BLOCK :
c->n_fc > 1 ? PLANE_TYPE_Y : PLANE_TYPE_ALL;
const int sbh = f->sbh;
pthread_mutex_lock(&ttd->lock);
error = atomic_load(&f->task_thread.error);
const unsigned y = error ? FRAME_ERROR :
sby + 1 == sbh ? UINT_MAX : (unsigned)(sby + 1) * f->sb_step * 4;
if (c->n_fc > 1 && f->sr_cur.p.data[0] /* upon flush, this can be free'ed already */) {
if (!uses_2pass || t->type == DAV1D_TASK_TYPE_ENTROPY_PROGRESS)
atomic_store(&f->sr_cur.progress[0], y);
if (!uses_2pass || t->type != DAV1D_TASK_TYPE_ENTROPY_PROGRESS)
atomic_store(&f->sr_cur.progress[1], y);
}
const int progress = error ? TILE_ERROR : sby + 1;
if (progress_plane_type == PLANE_TYPE_BLOCK)
f->frame_thread.entropy_progress = progress;
else
atomic_store(&f->frame_thread.lr_progress, progress);
if (sby + 1 == sbh)
f->task_thread.done[progress_plane_type == PLANE_TYPE_BLOCK] = 1;
if (!--f->task_thread.task_counter &&
f->task_thread.done[0] && (!uses_2pass || f->task_thread.done[1]))
{
dav1d_decode_frame_exit(f, error ? -1 : 0);
f->n_tile_data = 0;
pthread_cond_signal(&f->task_thread.cond);
}
reset_task_cur(c, ttd, t->frame_idx);
}
pthread_mutex_unlock(&pftd->lock);
pthread_mutex_unlock(&ttd->lock);

return NULL;
}
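Picture progress at the end of the worker loop is published in pixel rows, with two sentinels: `UINT_MAX` for a completely decoded frame and `FRAME_ERROR` (`UINT_MAX - 1`, defined in thread_task.h below) for a failed one. A hypothetical helper restating that encoding:

#include <limits.h>

#define FRAME_ERROR (UINT_MAX - 1)

static unsigned progress_rows(int sby, int sbh, int sb_step, int error)
{
    if (error) return FRAME_ERROR;             /* decode failed */
    if (sby + 1 == sbh) return UINT_MAX;       /* whole frame done */
    return (unsigned)(sby + 1) * sb_step * 4;  /* pixel rows completed */
}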
@ -35,33 +35,16 @@
#define FRAME_ERROR (UINT_MAX - 1)
#define TILE_ERROR (INT_MAX - 1)

enum TaskStatus {
DAV1D_TASK_DEFAULT,
DAV1D_TASK_READY,
DAV1D_TASK_RUNNING,
DAV1D_TASK_DONE,
};
// these functions assume the task scheduling lock is already taken
int dav1d_task_create_tile_sbrow(Dav1dFrameContext *f, int pass, int cond_signal);
void dav1d_task_frame_init(Dav1dFrameContext *f);

struct Dav1dTask {
enum TaskStatus status; // task status
int start; // frame thread start flag
unsigned frame_idx; // frame thread id
int frame_id; // frame ordering
int sby; // sbrow
filter_sbrow_fn fn; // task work
Dav1dTask *last_deps[2]; // dependencies
Dav1dTask *next_deps[2]; // dependent tasks
Dav1dTask *next_exec; // tasks scheduling
};

int dav1d_task_create_filter_sbrow(Dav1dFrameContext *f);
void dav1d_task_schedule(struct PostFilterThreadData *pftd, Dav1dTask *t);

void *dav1d_frame_task(void *data);
void *dav1d_tile_task(void *data);
void *dav1d_postfilter_task(void *data);
void *dav1d_worker_task(void *data);

int dav1d_decode_frame_init(Dav1dFrameContext *f);
int dav1d_decode_frame_main(Dav1dFrameContext *f);
void dav1d_decode_frame_exit(Dav1dFrameContext *f, int retval);
int dav1d_decode_frame(Dav1dFrameContext *f);
int dav1d_decode_tile_sbrow(Dav1dTileContext *t);
int dav1d_decode_tile_sbrow(Dav1dTaskContext *t);

#endif /* DAV1D_SRC_THREAD_TASK_H */
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -1,3 +1,5 @@
|
|||
; Copyright © 2021, VideoLAN and dav1d authors
|
||||
; Copyright © 2021, Two Orioles, LLC
|
||||
; Copyright (c) 2017-2021, The rav1e contributors
|
||||
; Copyright (c) 2021, Nathan Egge
|
||||
; All rights reserved.
|
||||
|
@ -28,10 +30,33 @@
|
|||
|
||||
SECTION_RODATA
|
||||
|
||||
%macro DUP8 1-*
|
||||
%rep %0
|
||||
times 8 dw %1
|
||||
%rotate 1
|
||||
%endrep
|
||||
%endmacro
|
||||
|
||||
pri_taps: DUP8 4, 2, 3, 3
|
||||
dir_table: db 1 * 32 + 0, 2 * 32 + 0
|
||||
db 1 * 32 + 0, 2 * 32 - 2
|
||||
db -1 * 32 + 2, -2 * 32 + 4
|
||||
db 0 * 32 + 2, -1 * 32 + 4
|
||||
db 0 * 32 + 2, 0 * 32 + 4
|
||||
db 0 * 32 + 2, 1 * 32 + 4
|
||||
db 1 * 32 + 2, 2 * 32 + 4
|
||||
db 1 * 32 + 0, 2 * 32 + 2
|
||||
db 1 * 32 + 0, 2 * 32 + 0
|
||||
db 1 * 32 + 0, 2 * 32 - 2
|
||||
db -1 * 32 + 2, -2 * 32 + 4
|
||||
db 0 * 32 + 2, -1 * 32 + 4
|
||||
|
||||
dir_shift: times 4 dw 0x4000
|
||||
times 4 dw 0x1000
|
||||
|
||||
pw_128: times 4 dw 128
|
||||
pw_2048: times 8 dw 2048
|
||||
pw_m16384: times 8 dw -16384
|
||||
|
||||
cextern cdef_dir_8bpc_ssse3.main
|
||||
cextern cdef_dir_8bpc_sse4.main
|
||||
|
@ -47,6 +72,891 @@ SECTION .text
|
|||
%endrep
|
||||
%endmacro
|
||||
|
||||
%if ARCH_X86_32
|
||||
DECLARE_REG_TMP 5, 3
|
||||
%elif WIN64
|
||||
DECLARE_REG_TMP 7, 4
|
||||
%else
|
||||
DECLARE_REG_TMP 7, 8
|
||||
%endif
|
||||
|
||||
%macro CDEF_FILTER 2 ; w, h
|
||||
%if ARCH_X86_64
|
||||
DEFINE_ARGS dst, stride, tmp, pridmp, pri, sec, dir
|
||||
mova m8, [base+pw_2048]
|
||||
%else
|
||||
DEFINE_ARGS dst, pridmp, tmp, sec, pri, _, dir
|
||||
%define m8 [base+pw_2048]
|
||||
%define m9 [rsp+16*1+gprsize]
|
||||
%define m10 [rsp+16*2+gprsize]
|
||||
%endif
|
||||
movifnidn prid, r4m
|
||||
movifnidn secd, r5m
|
||||
test prid, prid
|
||||
jz .sec_only
|
||||
movd m6, r4m
|
||||
%if ARCH_X86_32
|
||||
mov [rsp+24], pridmpd
|
||||
%endif
|
||||
bsr pridmpd, prid
|
||||
lea tmpd, [priq*4]
|
||||
cmp dword r9m, 0x3ff ; if (bpc == 10)
|
||||
cmove prid, tmpd ; pri <<= 2
|
||||
mov tmpd, r7m ; damping
|
||||
mov dird, r6m
|
||||
and prid, 16
|
||||
pshufb m6, m7 ; splat
|
||||
lea dirq, [base+dir_table+dirq*2]
|
||||
lea priq, [base+pri_taps+priq*2]
|
||||
test secd, secd
|
||||
jz .pri_only
|
||||
mova [rsp], m6
|
||||
movd m6, secd
|
||||
tzcnt secd, secd
|
||||
sub pridmpd, tmpd
|
||||
sub tmpd, secd
|
||||
pshufb m6, m7
|
||||
xor secd, secd
|
||||
neg pridmpd
|
||||
cmovs pridmpd, secd
|
||||
%if ARCH_X86_32
|
||||
mov [pri_shift+4], secd
|
||||
mov [sec_shift+4], secd
|
||||
%endif
|
||||
mov [pri_shift+0], pridmpq
|
||||
mov [sec_shift+0], tmpq
|
||||
lea tmpq, [px]
|
||||
%if WIN64
|
||||
movaps r4m, m9
|
||||
movaps r6m, m10
|
||||
%elif ARCH_X86_32
|
||||
mov pridmpd, [rsp+24]
|
||||
%endif
|
||||
%rep %1*%2/8
|
||||
call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri_sec
|
||||
%endrep
|
||||
%if WIN64
|
||||
movaps m9, r4m
|
||||
movaps m10, r6m
|
||||
%endif
|
||||
jmp .end
|
||||
.pri_only:
|
||||
sub tmpd, pridmpd
|
||||
cmovs tmpd, secd
|
||||
%if ARCH_X86_32
|
||||
mov pridmpd, [rsp+24]
|
||||
mov [pri_shift+4], secd
|
||||
%endif
|
||||
mov [pri_shift+0], tmpq
|
||||
lea tmpq, [px]
|
||||
%rep %1*%2/8
|
||||
call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri
|
||||
%endrep
|
||||
.end:
|
||||
RET
|
||||
.sec_only:
|
||||
mov tmpd, r7m ; damping
|
||||
movd m6, r5m
|
||||
tzcnt secd, secd
|
||||
mov dird, r6m
|
||||
pshufb m6, m7
|
||||
sub tmpd, secd
|
||||
lea dirq, [base+dir_table+dirq*2]
|
||||
%if ARCH_X86_32
|
||||
mov [sec_shift+4], prid
|
||||
%endif
|
||||
mov [sec_shift+0], tmpq
|
||||
lea tmpq, [px]
|
||||
%rep %1*%2/8
|
||||
call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).sec
|
||||
%endrep
|
||||
jmp .end
|
||||
%if %1 == %2
|
||||
DEFINE_ARGS dst, stride, tmp, off, pri, _, dir
|
||||
ALIGN function_align
|
||||
.pri:
|
||||
movsx offq, byte [dirq+4] ; off_k0
|
||||
%if %1 == 4
|
||||
movq m1, [dstq+strideq*0]
|
||||
movhps m1, [dstq+strideq*1]
|
||||
movq m2, [tmpq+offq+32*0] ; k0p0
|
||||
movhps m2, [tmpq+offq+32*1]
|
||||
neg offq
|
||||
movq m3, [tmpq+offq+32*0] ; k0p1
|
||||
movhps m3, [tmpq+offq+32*1]
|
||||
%else
|
||||
mova m1, [dstq]
|
||||
movu m2, [tmpq+offq]
|
||||
neg offq
|
||||
movu m3, [tmpq+offq]
|
||||
%endif
|
||||
movsx offq, byte [dirq+5] ; off_k1
|
||||
psubw m2, m1 ; diff_k0p0
|
||||
psubw m3, m1 ; diff_k0p1
|
||||
pabsw m4, m2 ; adiff_k0p0
|
||||
psrlw m5, m4, [pri_shift+gprsize]
|
||||
psubusw m0, m6, m5
|
||||
pabsw m5, m3 ; adiff_k0p1
|
||||
pminsw m0, m4
|
||||
psrlw m4, m5, [pri_shift+gprsize]
|
||||
psignw m0, m2 ; constrain(diff_k0p0)
|
||||
psubusw m2, m6, m4
|
||||
pminsw m2, m5
|
||||
%if %1 == 4
|
||||
movq m4, [tmpq+offq+32*0] ; k1p0
|
||||
movhps m4, [tmpq+offq+32*1]
|
||||
neg offq
|
||||
movq m5, [tmpq+offq+32*0] ; k1p1
|
||||
movhps m5, [tmpq+offq+32*1]
|
||||
%else
|
||||
movu m4, [tmpq+offq]
|
||||
neg offq
|
||||
movu m5, [tmpq+offq]
|
||||
%endif
|
||||
psubw m4, m1 ; diff_k1p0
|
||||
psubw m5, m1 ; diff_k1p1
|
||||
psignw m2, m3 ; constrain(diff_k0p1)
|
||||
pabsw m3, m4 ; adiff_k1p0
|
||||
paddw m0, m2 ; constrain(diff_k0)
|
||||
psrlw m2, m3, [pri_shift+gprsize]
|
||||
psubusw m7, m6, m2
|
||||
pabsw m2, m5 ; adiff_k1p1
|
||||
pminsw m7, m3
|
||||
psrlw m3, m2, [pri_shift+gprsize]
|
||||
psignw m7, m4 ; constrain(diff_k1p0)
|
||||
psubusw m4, m6, m3
|
||||
pminsw m4, m2
|
||||
psignw m4, m5 ; constrain(diff_k1p1)
|
||||
paddw m7, m4 ; constrain(diff_k1)
|
||||
pmullw m0, [priq+16*0] ; pri_tap_k0
|
||||
pmullw m7, [priq+16*1] ; pri_tap_k1
|
||||
paddw m0, m7 ; sum
|
||||
psraw m2, m0, 15
|
||||
paddw m0, m2
|
||||
pmulhrsw m0, m8
|
||||
paddw m0, m1
|
||||
%if %1 == 4
|
||||
add tmpq, 32*2
|
||||
movq [dstq+strideq*0], m0
|
||||
movhps [dstq+strideq*1], m0
|
||||
lea dstq, [dstq+strideq*2]
|
||||
%else
|
||||
add tmpq, 32
|
||||
mova [dstq], m0
|
||||
add dstq, strideq
|
||||
%endif
|
||||
ret
|
||||
ALIGN function_align
|
||||
.sec:
|
||||
movsx offq, byte [dirq+8] ; off1_k0
|
||||
%if %1 == 4
|
||||
movq m1, [dstq+strideq*0]
|
||||
movhps m1, [dstq+strideq*1]
|
||||
movq m2, [tmpq+offq+32*0] ; k0s0
|
||||
movhps m2, [tmpq+offq+32*1]
|
||||
neg offq
|
||||
movq m3, [tmpq+offq+32*0] ; k0s1
|
||||
movhps m3, [tmpq+offq+32*1]
|
||||
%else
|
||||
mova m1, [dstq]
|
||||
movu m2, [tmpq+offq]
|
||||
neg offq
|
||||
movu m3, [tmpq+offq]
|
||||
%endif
|
||||
movsx offq, byte [dirq+0] ; off2_k0
|
||||
psubw m2, m1 ; diff_k0s0
|
||||
psubw m3, m1 ; diff_k0s1
|
||||
pabsw m4, m2 ; adiff_k0s0
|
||||
psrlw m5, m4, [sec_shift+gprsize]
|
||||
psubusw m0, m6, m5
|
||||
pabsw m5, m3 ; adiff_k0s1
|
||||
pminsw m0, m4
|
||||
psrlw m4, m5, [sec_shift+gprsize]
|
||||
psignw m0, m2 ; constrain(diff_k0s0)
|
||||
psubusw m2, m6, m4
|
||||
pminsw m2, m5
|
||||
%if %1 == 4
|
||||
movq m4, [tmpq+offq+32*0] ; k0s2
|
||||
movhps m4, [tmpq+offq+32*1]
|
||||
neg offq
|
||||
movq m5, [tmpq+offq+32*0] ; k0s3
|
||||
movhps m5, [tmpq+offq+32*1]
|
||||
%else
|
||||
movu m4, [tmpq+offq]
|
||||
neg offq
|
||||
movu m5, [tmpq+offq]
|
||||
%endif
|
||||
movsx offq, byte [dirq+9] ; off1_k1
|
||||
psubw m4, m1 ; diff_k0s2
|
||||
psubw m5, m1 ; diff_k0s3
|
||||
psignw m2, m3 ; constrain(diff_k0s1)
|
||||
pabsw m3, m4 ; adiff_k0s2
|
||||
paddw m0, m2
|
||||
psrlw m2, m3, [sec_shift+gprsize]
|
||||
psubusw m7, m6, m2
|
||||
pabsw m2, m5 ; adiff_k0s3
|
||||
pminsw m7, m3
|
||||
psrlw m3, m2, [sec_shift+gprsize]
|
||||
psignw m7, m4 ; constrain(diff_k0s2)
|
||||
psubusw m4, m6, m3
|
||||
pminsw m4, m2
|
||||
%if %1 == 4
|
||||
movq m2, [tmpq+offq+32*0] ; k1s0
|
||||
movhps m2, [tmpq+offq+32*1]
|
||||
neg offq
|
||||
movq m3, [tmpq+offq+32*0] ; k1s1
|
||||
movhps m3, [tmpq+offq+32*1]
|
||||
%else
|
||||
movu m2, [tmpq+offq]
|
||||
neg offq
|
||||
movu m3, [tmpq+offq]
|
||||
%endif
|
||||
movsx offq, byte [dirq+1] ; off2_k1
|
||||
paddw m0, m7
|
||||
psignw m4, m5 ; constrain(diff_k0s3)
|
||||
paddw m0, m4 ; constrain(diff_k0)
|
||||
psubw m2, m1 ; diff_k1s0
|
||||
psubw m3, m1 ; diff_k1s1
|
||||
paddw m0, m0 ; sec_tap_k0
|
||||
pabsw m4, m2 ; adiff_k1s0
|
||||
psrlw m5, m4, [sec_shift+gprsize]
|
||||
psubusw m7, m6, m5
|
||||
pabsw m5, m3 ; adiff_k1s1
|
||||
pminsw m7, m4
|
||||
psrlw m4, m5, [sec_shift+gprsize]
|
||||
psignw m7, m2 ; constrain(diff_k1s0)
|
||||
psubusw m2, m6, m4
|
||||
pminsw m2, m5
|
||||
%if %1 == 4
|
||||
movq m4, [tmpq+offq+32*0] ; k1s2
|
||||
movhps m4, [tmpq+offq+32*1]
|
||||
neg offq
|
||||
movq m5, [tmpq+offq+32*0] ; k1s3
|
||||
movhps m5, [tmpq+offq+32*1]
|
||||
%else
|
||||
movu m4, [tmpq+offq]
|
||||
neg offq
|
||||
movu m5, [tmpq+offq]
|
||||
%endif
|
||||
paddw m0, m7
|
||||
psubw m4, m1 ; diff_k1s2
|
||||
psubw m5, m1 ; diff_k1s3
|
||||
psignw m2, m3 ; constrain(diff_k1s1)
|
||||
pabsw m3, m4 ; adiff_k1s2
|
||||
paddw m0, m2
|
||||
psrlw m2, m3, [sec_shift+gprsize]
|
||||
psubusw m7, m6, m2
|
||||
pabsw m2, m5 ; adiff_k1s3
|
||||
pminsw m7, m3
|
||||
psrlw m3, m2, [sec_shift+gprsize]
|
||||
psignw m7, m4 ; constrain(diff_k1s2)
|
||||
psubusw m4, m6, m3
|
||||
pminsw m4, m2
|
||||
paddw m0, m7
|
||||
psignw m4, m5 ; constrain(diff_k1s3)
|
||||
paddw m0, m4 ; sum
|
||||
psraw m2, m0, 15
|
||||
paddw m0, m2
|
||||
pmulhrsw m0, m8
|
||||
paddw m0, m1
|
||||
%if %1 == 4
|
||||
add tmpq, 32*2
|
||||
movq [dstq+strideq*0], m0
|
||||
movhps [dstq+strideq*1], m0
|
||||
lea dstq, [dstq+strideq*2]
|
||||
%else
|
||||
add tmpq, 32
|
||||
mova [dstq], m0
|
||||
add dstq, strideq
|
||||
%endif
|
||||
ret
|
||||
ALIGN function_align
|
||||
.pri_sec:
|
||||
movsx offq, byte [dirq+8] ; off2_k0
|
||||
%if %1 == 4
|
||||
movq m1, [dstq+strideq*0]
|
||||
movhps m1, [dstq+strideq*1]
|
||||
movq m2, [tmpq+offq+32*0] ; k0s0
|
||||
movhps m2, [tmpq+offq+32*1]
|
||||
neg offq
|
||||
movq m3, [tmpq+offq+32*0] ; k0s1
|
||||
movhps m3, [tmpq+offq+32*1]
|
||||
%else
|
||||
mova m1, [dstq]
|
||||
movu m2, [tmpq+offq]
|
||||
neg offq
|
||||
movu m3, [tmpq+offq]
|
||||
%endif
|
||||
movsx offq, byte [dirq+0] ; off3_k0
|
||||
pabsw m4, m2
|
||||
%if ARCH_X86_64
|
||||
pabsw m10, m3
|
||||
pmaxsw m9, m2, m3
|
||||
pminsw m10, m4
|
||||
%else
|
||||
pabsw m7, m3
|
||||
pmaxsw m5, m2, m3
|
||||
pminsw m4, m7
|
||||
mova m9, m5
|
||||
mova m10, m4
|
||||
%endif
|
||||
psubw m2, m1 ; diff_k0s0
|
||||
psubw m3, m1 ; diff_k0s1
|
||||
pabsw m4, m2 ; adiff_k0s0
|
||||
psrlw m5, m4, [sec_shift+gprsize]
|
||||
psubusw m0, m6, m5
|
||||
pabsw m5, m3 ; adiff_k0s1
|
||||
pminsw m0, m4
|
||||
psrlw m4, m5, [sec_shift+gprsize]
|
||||
psignw m0, m2 ; constrain(diff_k0s0)
|
||||
psubusw m2, m6, m4
|
||||
pminsw m2, m5
|
||||
%if %1 == 4
|
||||
movq m4, [tmpq+offq+32*0] ; k0s2
|
||||
movhps m4, [tmpq+offq+32*1]
|
||||
neg offq
|
||||
movq m5, [tmpq+offq+32*0] ; k0s3
|
||||
movhps m5, [tmpq+offq+32*1]
|
||||
%else
|
||||
movu m4, [tmpq+offq]
|
||||
neg offq
|
||||
movu m5, [tmpq+offq]
|
||||
%endif
|
||||
movsx offq, byte [dirq+9] ; off2_k1
|
||||
pabsw m7, m4
|
||||
psignw m2, m3
|
||||
pabsw m3, m5 ; constrain(diff_k0s1)
|
||||
%if ARCH_X86_64
|
||||
pmaxsw m9, m4
|
||||
pminsw m10, m7
|
||||
pmaxsw m9, m5
|
||||
pminsw m10, m3
|
||||
%else
|
||||
pminsw m7, m10
|
||||
pminsw m7, m3
|
||||
pmaxsw m3, m9, m4
|
||||
pmaxsw m3, m5
|
||||
mova m10, m7
|
||||
mova m9, m3
|
||||
%endif
|
||||
psubw m4, m1 ; diff_k0s2
|
||||
psubw m5, m1 ; diff_k0s3
|
||||
paddw m0, m2
|
||||
pabsw m3, m4 ; adiff_k0s2
|
||||
psrlw m2, m3, [sec_shift+gprsize]
|
||||
psubusw m7, m6, m2
|
||||
pabsw m2, m5 ; adiff_k0s3
|
||||
pminsw m7, m3
|
||||
psrlw m3, m2, [sec_shift+gprsize]
|
||||
psignw m7, m4 ; constrain(diff_k0s2)
|
||||
psubusw m4, m6, m3
|
||||
pminsw m4, m2
|
||||
%if %1 == 4
|
||||
movq m2, [tmpq+offq+32*0] ; k1s0
|
||||
movhps m2, [tmpq+offq+32*1]
|
||||
neg offq
|
||||
movq m3, [tmpq+offq+32*0] ; k1s1
|
||||
movhps m3, [tmpq+offq+32*1]
|
||||
%else
|
||||
movu m2, [tmpq+offq]
|
||||
neg offq
|
||||
movu m3, [tmpq+offq]
|
||||
%endif
|
||||
movsx offq, byte [dirq+1] ; off3_k1
|
||||
paddw m0, m7
|
||||
pabsw m7, m2
|
||||
psignw m4, m5 ; constrain(diff_k0s3)
|
||||
pabsw m5, m3
|
||||
%if ARCH_X86_64
|
||||
pmaxsw m9, m2
|
||||
pminsw m10, m7
|
||||
pmaxsw m9, m3
|
||||
pminsw m10, m5
|
||||
%else
|
||||
pminsw m7, m10
|
||||
pminsw m7, m5
|
||||
pmaxsw m5, m9, m2
|
||||
pmaxsw m5, m3
|
||||
mova m10, m7
|
||||
mova m9, m5
|
||||
%endif
|
||||
paddw m0, m4 ; constrain(diff_k0)
|
||||
psubw m2, m1 ; diff_k1s0
|
||||
psubw m3, m1 ; diff_k1s1
|
||||
paddw m0, m0 ; sec_tap_k0
|
||||
pabsw m4, m2 ; adiff_k1s0
|
||||
psrlw m5, m4, [sec_shift+gprsize]
|
||||
psubusw m7, m6, m5
|
||||
pabsw m5, m3 ; adiff_k1s1
|
||||
pminsw m7, m4
|
||||
psrlw m4, m5, [sec_shift+gprsize]
|
||||
psignw m7, m2 ; constrain(diff_k1s0)
|
||||
psubusw m2, m6, m4
|
||||
pminsw m2, m5
|
||||
%if %1 == 4
|
||||
movq m4, [tmpq+offq+32*0] ; k1s2
|
||||
movhps m4, [tmpq+offq+32*1]
|
||||
neg offq
|
||||
movq m5, [tmpq+offq+32*0] ; k1s3
|
||||
movhps m5, [tmpq+offq+32*1]
|
||||
%else
|
||||
movu m4, [tmpq+offq]
|
||||
neg offq
|
||||
movu m5, [tmpq+offq]
|
||||
%endif
|
||||
movsx offq, byte [dirq+4] ; off1_k0
|
||||
paddw m0, m7
|
||||
pabsw m7, m4
|
||||
psignw m2, m3 ; constrain(diff_k1s1)
|
||||
pabsw m3, m5
|
||||
%if ARCH_X86_64
|
||||
pmaxsw m9, m4
|
||||
pminsw m10, m7
|
||||
pmaxsw m9, m5
|
||||
pminsw m10, m3
|
||||
%else
|
||||
pminsw m7, m10
|
||||
pminsw m7, m3
|
||||
pmaxsw m3, m9, m4
|
||||
pmaxsw m3, m5
|
||||
mova m10, m7
|
||||
mova m9, m3
|
||||
%endif
|
||||
psubw m4, m1 ; diff_k1s2
|
||||
psubw m5, m1 ; diff_k1s3
|
||||
pabsw m3, m4 ; adiff_k1s2
|
||||
paddw m0, m2
|
||||
psrlw m2, m3, [sec_shift+gprsize]
|
||||
psubusw m7, m6, m2
|
||||
pabsw m2, m5 ; adiff_k1s3
|
||||
pminsw m7, m3
|
||||
psrlw m3, m2, [sec_shift+gprsize]
|
||||
psignw m7, m4 ; constrain(diff_k1s2)
|
||||
psubusw m4, m6, m3
|
||||
pminsw m4, m2
|
||||
paddw m0, m7
|
||||
%if %1 == 4
|
||||
movq m2, [tmpq+offq+32*0] ; k0p0
|
||||
movhps m2, [tmpq+offq+32*1]
|
||||
neg offq
|
||||
movq m3, [tmpq+offq+32*0] ; k0p1
|
||||
movhps m3, [tmpq+offq+32*1]
|
||||
%else
|
||||
movu m2, [tmpq+offq]
|
||||
neg offq
|
||||
movu m3, [tmpq+offq]
|
||||
%endif
|
||||
movsx offq, byte [dirq+5] ; off1_k1
|
||||
pabsw m7, m2
|
||||
psignw m4, m5 ; constrain(diff_k1s3)
|
||||
pabsw m5, m3
|
||||
%if ARCH_X86_64
|
||||
pmaxsw m9, m2
|
||||
pminsw m10, m7
|
||||
pmaxsw m9, m3
|
||||
pminsw m10, m5
|
||||
%else
|
||||
pminsw m7, m10
|
||||
pminsw m7, m5
|
||||
pmaxsw m5, m9, m2
|
||||
pmaxsw m5, m3
|
||||
mova m10, m7
|
||||
mova m9, m5
|
||||
%endif
|
||||
psubw m2, m1 ; diff_k0p0
|
||||
psubw m3, m1 ; diff_k0p1
|
||||
paddw m0, m4
|
||||
pabsw m4, m2 ; adiff_k0p0
|
||||
psrlw m5, m4, [pri_shift+gprsize]
|
||||
psubusw m7, [rsp+gprsize], m5
|
||||
pabsw m5, m3 ; adiff_k0p1
|
||||
pminsw m7, m4
|
||||
psrlw m4, m5, [pri_shift+gprsize]
|
||||
psignw m7, m2 ; constrain(diff_k0p0)
|
||||
psubusw m2, [rsp+gprsize], m4
|
||||
pminsw m2, m5
|
||||
%if %1 == 4
|
||||
movq m4, [tmpq+offq+32*0] ; k1p0
|
||||
movhps m4, [tmpq+offq+32*1]
|
||||
neg offq
|
||||
movq m5, [tmpq+offq+32*0] ; k1p1
|
||||
movhps m5, [tmpq+offq+32*1]
|
||||
%else
|
||||
movu m4, [tmpq+offq]
|
||||
neg offq
|
||||
movu m5, [tmpq+offq]
|
||||
%endif
|
||||
psignw m2, m3 ; constrain(diff_k0p1)
|
||||
pabsw m3, m4
|
||||
paddw m7, m2 ; constrain(diff_k0)
|
||||
pabsw m2, m5
|
||||
%if ARCH_X86_64
|
||||
pmaxsw m9, m4
|
||||
pminsw m10, m3
|
||||
pmaxsw m9, m5
|
||||
pminsw m10, m2
|
||||
%else
|
||||
pminsw m3, m10
|
||||
pminsw m3, m2
|
||||
pmaxsw m2, m9, m4
|
||||
pmaxsw m2, m5
|
||||
mova m10, m3
|
||||
mova m9, m2
|
||||
%endif
|
||||
psubw m4, m1 ; diff_k1p0
|
||||
psubw m5, m1 ; diff_k1p1
|
||||
pabsw m3, m4 ; adiff_k1p0
|
||||
pmullw m7, [priq+16*0] ; pri_tap_k0
|
||||
paddw m0, m7
|
||||
psrlw m2, m3, [pri_shift+gprsize]
|
||||
psubusw m7, [rsp+16*0+gprsize], m2
|
||||
pabsw m2, m5 ; adiff_k1p1
|
||||
pminsw m7, m3
|
||||
psrlw m3, m2, [pri_shift+gprsize]
|
||||
psignw m7, m4 ; constrain(diff_k1p0)
|
||||
psubusw m4, [rsp+16*0+gprsize], m3
|
||||
pminsw m4, m2
|
||||
psignw m4, m5 ; constrain(diff_k1p1)
|
||||
paddw m7, m4 ; constrain(diff_k1)
|
||||
pmullw m7, [priq+16*1] ; pri_tap_k1
|
||||
paddw m0, m7 ; sum
|
||||
psraw m2, m0, 15
|
||||
paddw m0, m2
|
||||
pmulhrsw m0, m8
|
||||
paddw m0, m1
|
||||
%if ARCH_X86_64
|
||||
pmaxsw m9, m1
|
||||
pminsw m0, m9
|
||||
%else
|
||||
pmaxsw m2, m9, m1
|
||||
pminsw m0, m2
|
||||
%endif
|
||||
pminsw m1, m10
|
||||
pmaxsw m0, m1
|
||||
%if %1 == 4
|
||||
add tmpq, 32*2
|
||||
movq [dstq+strideq*0], m0
|
||||
movhps [dstq+strideq*1], m0
|
||||
lea dstq, [dstq+strideq*2]
|
||||
%else
|
||||
add tmpq, 32
|
||||
mova [dstq], m0
|
||||
add dstq, strideq
|
||||
%endif
|
||||
ret
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
INIT_XMM ssse3
|
||||
%if ARCH_X86_64
|
||||
cglobal cdef_filter_4x4_16bpc, 4, 8, 9, 32*10, dst, stride, left, top, pri, sec, edge
|
||||
%define px rsp+32*4
|
||||
%else
|
||||
cglobal cdef_filter_4x4_16bpc, 2, 7, 8, -32*11, dst, stride, edge, top, left
|
||||
%define px rsp+32*5
|
||||
%endif
|
||||
%define base t0-dir_table
|
||||
%define pri_shift px-16*6
|
||||
%define sec_shift px-16*5
|
||||
mov edged, r8m
|
||||
LEA t0, dir_table
|
||||
movu m0, [dstq+strideq*0]
|
||||
movu m1, [dstq+strideq*1]
|
||||
lea t1, [dstq+strideq*2]
|
||||
movu m2, [t1 +strideq*0]
|
||||
movu m3, [t1 +strideq*1]
|
||||
movddup m7, [base+pw_m16384]
|
||||
mova [px+32*0+0], m0
|
||||
mova [px+32*1+0], m1
|
||||
mova [px+32*2+0], m2
|
||||
mova [px+32*3+0], m3
|
||||
test edgeb, 4 ; HAVE_TOP
|
||||
jz .no_top
|
||||
movifnidn topq, topmp
|
||||
movu m0, [topq+strideq*0]
|
||||
movu m1, [topq+strideq*1]
|
||||
mova [px-32*2+0], m0
|
||||
mova [px-32*1+0], m1
|
||||
test edgeb, 1 ; HAVE_LEFT
|
||||
jz .top_no_left
|
||||
movd m0, [topq+strideq*0-4]
|
||||
movd m1, [topq+strideq*1-4]
|
||||
movd [px-32*2-4], m0
|
||||
movd [px-32*1-4], m1
|
||||
jmp .top_done
|
||||
.no_top:
|
||||
mova [px-32*2+0], m7
|
||||
mova [px-32*1+0], m7
|
||||
.top_no_left:
|
||||
movd [px-32*2-4], m7
|
||||
movd [px-32*1-4], m7
|
||||
.top_done:
|
||||
test edgeb, 8 ; HAVE_BOTTOM
|
||||
jz .no_bottom
|
||||
lea r3, [dstq+strideq*4]
|
||||
movu m0, [r3+strideq*0]
|
||||
movu m1, [r3+strideq*1]
|
||||
mova [px+32*4+0], m0
|
||||
mova [px+32*5+0], m1
|
||||
test edgeb, 1 ; HAVE_LEFT
|
||||
jz .bottom_no_left
|
||||
movd m0, [r3+strideq*0-4]
|
||||
movd m1, [r3+strideq*1-4]
|
||||
movd [px+32*4-4], m0
|
||||
movd [px+32*5-4], m1
|
||||
jmp .bottom_done
|
||||
.no_bottom:
|
||||
mova [px+32*4+0], m7
|
||||
mova [px+32*5+0], m7
|
||||
.bottom_no_left:
|
||||
movd [px+32*4-4], m7
|
||||
movd [px+32*5-4], m7
|
||||
.bottom_done:
|
||||
test edgeb, 1 ; HAVE_LEFT
|
||||
jz .no_left
|
||||
movifnidn leftq, r2mp
|
||||
movd m0, [leftq+4*0]
|
||||
movd m1, [leftq+4*1]
|
||||
movd m2, [leftq+4*2]
|
||||
movd m3, [leftq+4*3]
|
||||
movd [px+32*0-4], m0
|
||||
movd [px+32*1-4], m1
|
||||
movd [px+32*2-4], m2
|
||||
movd [px+32*3-4], m3
|
||||
jmp .left_done
|
||||
.no_left:
|
||||
REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3
|
||||
.left_done:
|
||||
test edgeb, 2 ; HAVE_RIGHT
|
||||
jnz .padding_done
|
||||
REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5
|
||||
.padding_done:
|
||||
CDEF_FILTER 4, 4
|
||||
|
||||
%if ARCH_X86_64
|
||||
cglobal cdef_filter_4x8_16bpc, 4, 8, 9, 32*14, dst, stride, left, top, pri, sec, edge
|
||||
%else
|
||||
cglobal cdef_filter_4x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left
|
||||
%endif
|
||||
mov edged, r8m
|
||||
LEA t0, dir_table
|
||||
movu m0, [dstq+strideq*0]
|
||||
movu m1, [dstq+strideq*1]
|
||||
lea t1, [dstq+strideq*2]
|
||||
movu m2, [t1 +strideq*0]
|
||||
movu m3, [t1 +strideq*1]
|
||||
lea t1, [t1 +strideq*2]
|
||||
movu m4, [t1 +strideq*0]
|
||||
movu m5, [t1 +strideq*1]
|
||||
lea t1, [t1 +strideq*2]
|
||||
movu m6, [t1 +strideq*0]
|
||||
movu m7, [t1 +strideq*1]
|
||||
mova [px+32*0+0], m0
|
||||
mova [px+32*1+0], m1
|
||||
mova [px+32*2+0], m2
|
||||
mova [px+32*3+0], m3
|
||||
mova [px+32*4+0], m4
|
||||
mova [px+32*5+0], m5
|
||||
mova [px+32*6+0], m6
|
||||
mova [px+32*7+0], m7
|
||||
movddup m7, [base+pw_m16384]
|
||||
test edgeb, 4 ; HAVE_TOP
|
||||
jz .no_top
|
||||
movifnidn topq, topmp
|
||||
movu m0, [topq+strideq*0]
|
||||
movu m1, [topq+strideq*1]
|
||||
mova [px-32*2+0], m0
|
||||
mova [px-32*1+0], m1
|
||||
test edgeb, 1 ; HAVE_LEFT
|
||||
jz .top_no_left
|
||||
movd m0, [topq+strideq*0-4]
|
||||
movd m1, [topq+strideq*1-4]
|
||||
movd [px-32*2-4], m0
|
||||
movd [px-32*1-4], m1
|
||||
jmp .top_done
|
||||
.no_top:
|
||||
mova [px-32*2+0], m7
|
||||
mova [px-32*1+0], m7
|
||||
.top_no_left:
|
||||
movd [px-32*2-4], m7
|
||||
movd [px-32*1-4], m7
|
||||
.top_done:
|
||||
test edgeb, 8 ; HAVE_BOTTOM
|
||||
jz .no_bottom
|
||||
lea r3, [dstq+strideq*8]
|
||||
movu m0, [r3+strideq*0]
|
||||
movu m1, [r3+strideq*1]
|
||||
mova [px+32*8+0], m0
|
||||
mova [px+32*9+0], m1
|
||||
test edgeb, 1 ; HAVE_LEFT
|
||||
jz .bottom_no_left
|
||||
movd m0, [r3+strideq*0-4]
|
||||
movd m1, [r3+strideq*1-4]
|
||||
movd [px+32*8-4], m0
|
||||
movd [px+32*9-4], m1
|
||||
jmp .bottom_done
|
||||
.no_bottom:
|
||||
mova [px+32*8+0], m7
|
||||
mova [px+32*9+0], m7
|
||||
.bottom_no_left:
|
||||
movd [px+32*8-4], m7
|
||||
movd [px+32*9-4], m7
|
||||
.bottom_done:
|
||||
test edgeb, 1 ; HAVE_LEFT
|
||||
jz .no_left
|
||||
movifnidn leftq, r2mp
|
||||
movd m0, [leftq+4*0]
|
||||
movd m1, [leftq+4*1]
|
||||
movd m2, [leftq+4*2]
|
||||
movd m3, [leftq+4*3]
|
||||
movd [px+32*0-4], m0
|
||||
movd [px+32*1-4], m1
|
||||
movd [px+32*2-4], m2
|
||||
movd [px+32*3-4], m3
|
||||
movd m0, [leftq+4*4]
|
||||
movd m1, [leftq+4*5]
|
||||
movd m2, [leftq+4*6]
|
||||
movd m3, [leftq+4*7]
|
||||
movd [px+32*4-4], m0
|
||||
movd [px+32*5-4], m1
|
||||
movd [px+32*6-4], m2
|
||||
movd [px+32*7-4], m3
|
||||
jmp .left_done
|
||||
.no_left:
|
||||
REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3, 4, 5, 6, 7
|
||||
.left_done:
|
||||
test edgeb, 2 ; HAVE_RIGHT
|
||||
jnz .padding_done
|
||||
REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
|
||||
.padding_done:
|
||||
CDEF_FILTER 4, 8
|
||||
|
||||
%if ARCH_X86_64
|
||||
cglobal cdef_filter_8x8_16bpc, 4, 8, 9, 32*14, dst, stride, left, top, pri, sec, edge
|
||||
%else
|
||||
cglobal cdef_filter_8x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left
|
||||
%endif
|
||||
mov edged, r8m
|
||||
LEA t0, dir_table
|
||||
mova m0, [dstq+strideq*0+ 0]
|
||||
movd m1, [dstq+strideq*0+16]
|
||||
mova m2, [dstq+strideq*1+ 0]
|
||||
movd m3, [dstq+strideq*1+16]
|
||||
lea t1, [dstq+strideq*2]
|
||||
mova m4, [t1 +strideq*0+ 0]
|
||||
movd m5, [t1 +strideq*0+16]
|
||||
mova m6, [t1 +strideq*1+ 0]
|
||||
movd m7, [t1 +strideq*1+16]
|
||||
lea t1, [t1 +strideq*2]
|
||||
mova [px+32*0+ 0], m0
|
||||
movd [px+32*0+16], m1
|
||||
mova [px+32*1+ 0], m2
|
||||
movd [px+32*1+16], m3
|
||||
mova [px+32*2+ 0], m4
|
||||
movd [px+32*2+16], m5
|
||||
mova [px+32*3+ 0], m6
|
||||
movd [px+32*3+16], m7
|
||||
mova m0, [t1 +strideq*0+ 0]
|
||||
movd m1, [t1 +strideq*0+16]
|
||||
mova m2, [t1 +strideq*1+ 0]
|
||||
movd m3, [t1 +strideq*1+16]
|
||||
lea t1, [t1 +strideq*2]
|
||||
mova m4, [t1 +strideq*0+ 0]
|
||||
movd m5, [t1 +strideq*0+16]
|
||||
mova m6, [t1 +strideq*1+ 0]
|
||||
movd m7, [t1 +strideq*1+16]
|
||||
mova [px+32*4+ 0], m0
|
||||
movd [px+32*4+16], m1
|
||||
mova [px+32*5+ 0], m2
|
||||
movd [px+32*5+16], m3
|
||||
mova [px+32*6+ 0], m4
|
||||
movd [px+32*6+16], m5
|
||||
mova [px+32*7+ 0], m6
|
||||
movd [px+32*7+16], m7
|
||||
movddup m7, [base+pw_m16384]
|
||||
test edgeb, 4 ; HAVE_TOP
|
||||
jz .no_top
|
||||
movifnidn topq, topmp
|
||||
mova m0, [topq+strideq*0+ 0]
|
||||
mova m1, [topq+strideq*0+16]
|
||||
mova m2, [topq+strideq*1+ 0]
|
||||
mova m3, [topq+strideq*1+16]
|
||||
mova [px-32*2+ 0], m0
|
||||
movd [px-32*2+16], m1
|
||||
mova [px-32*1+ 0], m2
|
||||
movd [px-32*1+16], m3
|
||||
test edgeb, 1 ; HAVE_LEFT
|
||||
jz .top_no_left
|
||||
movd m0, [topq+strideq*0-4]
|
||||
movd m1, [topq+strideq*1-4]
|
||||
movd [px-32*2-4], m0
|
||||
movd [px-32*1-4], m1
|
||||
jmp .top_done
|
||||
.no_top:
|
||||
mova [px-32*2+ 0], m7
|
||||
movd [px-32*2+16], m7
|
||||
mova [px-32*1+ 0], m7
|
||||
movd [px-32*1+16], m7
|
||||
.top_no_left:
|
||||
movd [px-32*2- 4], m7
|
||||
movd [px-32*1- 4], m7
|
||||
.top_done:
|
||||
test edgeb, 8 ; HAVE_BOTTOM
|
||||
jz .no_bottom
|
||||
lea r3, [dstq+strideq*8]
|
||||
mova m0, [r3+strideq*0+ 0]
|
||||
movd m1, [r3+strideq*0+16]
|
||||
mova m2, [r3+strideq*1+ 0]
|
||||
movd m3, [r3+strideq*1+16]
|
||||
mova [px+32*8+ 0], m0
|
||||
movd [px+32*8+16], m1
|
||||
mova [px+32*9+ 0], m2
|
||||
movd [px+32*9+16], m3
|
||||
test edgeb, 1 ; HAVE_LEFT
|
||||
jz .bottom_no_left
|
||||
movd m0, [r3+strideq*0-4]
|
||||
movd m1, [r3+strideq*1-4]
|
||||
movd [px+32*8- 4], m0
|
||||
movd [px+32*9- 4], m1
|
||||
jmp .bottom_done
|
||||
.no_bottom:
|
||||
mova [px+32*8+ 0], m7
|
||||
movd [px+32*8+16], m7
|
||||
mova [px+32*9+ 0], m7
|
||||
movd [px+32*9+16], m7
|
||||
.bottom_no_left:
|
||||
movd [px+32*8- 4], m7
|
||||
movd [px+32*9- 4], m7
|
||||
.bottom_done:
|
||||
test edgeb, 1 ; HAVE_LEFT
|
||||
jz .no_left
|
||||
movifnidn leftq, r2mp
|
||||
movd m0, [leftq+4*0]
|
||||
movd m1, [leftq+4*1]
|
||||
movd m2, [leftq+4*2]
|
||||
movd m3, [leftq+4*3]
|
||||
movd [px+32*0- 4], m0
|
||||
movd [px+32*1- 4], m1
|
||||
movd [px+32*2- 4], m2
|
||||
movd [px+32*3- 4], m3
|
||||
movd m0, [leftq+4*4]
|
||||
movd m1, [leftq+4*5]
|
||||
movd m2, [leftq+4*6]
|
||||
movd m3, [leftq+4*7]
|
||||
movd [px+32*4- 4], m0
|
||||
movd [px+32*5- 4], m1
|
||||
movd [px+32*6- 4], m2
|
||||
movd [px+32*7- 4], m3
|
||||
jmp .left_done
|
||||
.no_left:
|
||||
REPX {movd [px+32*x- 4], m7}, 0, 1, 2, 3, 4, 5, 6, 7
|
||||
.left_done:
|
||||
test edgeb, 2 ; HAVE_RIGHT
|
||||
jnz .padding_done
|
||||
REPX {movd [px+32*x+16], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
|
||||
.padding_done:
|
||||
CDEF_FILTER 8, 8
|
||||
|
||||
%macro CDEF_DIR 0
|
||||
%if ARCH_X86_64
|
||||
cglobal cdef_dir_16bpc, 4, 7, 16, src, stride, var, bdmax
|
||||
|
|
|
@ -472,7 +472,6 @@ cglobal cdef_filter_%1x%2_8bpc, 4, 9, 0, dst, stride, left, top, \
|
|||
movd xm1, secdmpd
|
||||
lzcnt secdmpd, secdmpd
|
||||
add secdmpd, dampingd
|
||||
cmovs secdmpd, zerod
|
||||
mov [rsp+8], secdmpq ; sec_shift
|
||||
|
||||
DEFINE_ARGS dst, stride, left, top, pri, secdmp, table, pridmp
|
||||
|
@ -552,7 +551,6 @@ cglobal cdef_filter_%1x%2_8bpc, 4, 9, 0, dst, stride, left, top, \
|
|||
movd xm1, secdmpd
|
||||
lzcnt secdmpd, secdmpd
|
||||
add secdmpd, dampingd
|
||||
cmovs secdmpd, zerod
|
||||
mov [rsp+8], secdmpq ; sec_shift
|
||||
DEFINE_ARGS dst, stride, left, top, _, secdmp, table
|
||||
lea tableq, [tap_table]
|
||||
|
@ -1481,7 +1479,6 @@ cglobal cdef_filter_%1x%2_8bpc, 4, 9, 0, dst, stride, left, top, \
|
|||
movd xm1, secdmpd
|
||||
lzcnt secdmpd, secdmpd
|
||||
add secdmpd, dampingd
|
||||
cmovs secdmpd, zerod
|
||||
mov [rsp+8], secdmpq ; sec_shift
|
||||
|
||||
DEFINE_ARGS dst, stride, pridmp, table, pri, secdmp, stride3
|
||||
|
@ -1556,7 +1553,6 @@ cglobal cdef_filter_%1x%2_8bpc, 4, 9, 0, dst, stride, left, top, \
|
|||
movd xm1, secdmpd
|
||||
lzcnt secdmpd, secdmpd
|
||||
add secdmpd, dampingd
|
||||
cmovs secdmpd, zerod
|
||||
mov [rsp+8], secdmpq ; sec_shift
|
||||
DEFINE_ARGS dst, stride, _, table, _, secdmp, stride3
|
||||
lea tableq, [tap_table]
|
||||
|
|
|
@ -46,9 +46,9 @@ decl_cdef_dir_fn(BF(dav1d_cdef_dir, ssse3));
|
|||
COLD void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
|
||||
const unsigned flags = dav1d_get_cpu_flags();
|
||||
|
||||
#if BITDEPTH == 8
|
||||
if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
|
||||
|
||||
#if BITDEPTH == 8
|
||||
c->fb[0] = BF(dav1d_cdef_filter_8x8, sse2);
|
||||
c->fb[1] = BF(dav1d_cdef_filter_4x8, sse2);
|
||||
c->fb[2] = BF(dav1d_cdef_filter_4x4, sse2);
|
||||
|
@ -57,11 +57,9 @@ COLD void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
|
|||
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
|
||||
|
||||
c->dir = BF(dav1d_cdef_dir, ssse3);
|
||||
#if BITDEPTH == 8
|
||||
c->fb[0] = BF(dav1d_cdef_filter_8x8, ssse3);
|
||||
c->fb[1] = BF(dav1d_cdef_filter_4x8, ssse3);
|
||||
c->fb[2] = BF(dav1d_cdef_filter_4x4, ssse3);
|
||||
#endif
|
||||
|
||||
if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
|
||||
|
||||
|
@ -77,9 +75,7 @@ COLD void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
|
|||
|
||||
c->dir = BF(dav1d_cdef_dir, avx2);
|
||||
c->fb[0] = BF(dav1d_cdef_filter_8x8, avx2);
|
||||
#if BITDEPTH == 8
|
||||
c->fb[1] = BF(dav1d_cdef_filter_4x8, avx2);
|
||||
#endif
|
||||
c->fb[2] = BF(dav1d_cdef_filter_4x4, avx2);
|
||||
|
||||
if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
|
||||
|
|
|
@ -566,7 +566,7 @@ cglobal cdef_filter_%1x%2_8bpc, 2, 7, 8, - 7 * 16 - (%2+4)*32, \
|
|||
test secd, secd
|
||||
jz .pri_only
|
||||
movd m10, r5m
|
||||
bsr secd, secd
|
||||
tzcnt secd, secd
|
||||
and prid, 1
|
||||
sub pridmpd, dampingd
|
||||
sub secd, dampingd
|
||||
|
@ -575,7 +575,6 @@ cglobal cdef_filter_%1x%2_8bpc, 2, 7, 8, - 7 * 16 - (%2+4)*32, \
|
|||
neg pridmpd
|
||||
cmovs pridmpd, dampingd
|
||||
neg secd
|
||||
cmovs secd, dampingd
|
||||
PSHUFB_0 m1, m7
|
||||
PSHUFB_0 m10, m7
|
||||
%if ARCH_X86_64
|
||||
|
@ -697,7 +696,7 @@ cglobal cdef_filter_%1x%2_8bpc, 2, 7, 8, - 7 * 16 - (%2+4)*32, \
|
|||
DEFINE_ARGS dst, stride, sec, damping, dir, tap, zero
|
||||
%endif
|
||||
movd m1, r5m
|
||||
bsr secd, secd
|
||||
tzcnt secd, secd
|
||||
mov dird, r6m
|
||||
xor zerod, zerod
|
||||
sub dampingd, secd
|
||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -1,4 +1,4 @@
|
|||
; Copyright © 2019, VideoLAN and dav1d authors
|
||||
; Copyright © 2019-2021, VideoLAN and dav1d authors
|
||||
; Copyright © 2019, Two Orioles, LLC
|
||||
; All rights reserved.
|
||||
;
|
||||
|
@ -38,7 +38,8 @@ byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
|
|||
pw_seed_xor: times 2 dw 0xb524
|
||||
times 2 dw 0x49d8
|
||||
pd_m65536: dd ~0xffff
|
||||
pb_23_22: times 2 db 23, 22
|
||||
pb_23_22: db 23, 22
|
||||
times 3 db 0, 32
|
||||
pb_1: times 4 db 1
|
||||
hmul_bits: dw 32768, 16384, 8192, 4096
|
||||
round: dw 2048, 1024, 512
|
||||
|
@ -47,24 +48,25 @@ round_vals: dw 32, 64, 128, 256, 512
|
|||
max: dw 255, 240, 235
|
||||
min: dw 0, 16
|
||||
pb_27_17_17_27: db 27, 17, 17, 27
|
||||
times 2 db 0, 32
|
||||
pw_1: dw 1
|
||||
|
||||
%macro JMP_TABLE 1-*
|
||||
%xdefine %1_table %%table
|
||||
%xdefine %%base %1_table
|
||||
%xdefine %%prefix mangle(private_prefix %+ _%1)
|
||||
%macro JMP_TABLE 2-*
|
||||
%xdefine %1_8bpc_%2_table %%table
|
||||
%xdefine %%base %1_8bpc_%2_table
|
||||
%xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
|
||||
%%table:
|
||||
%rep %0 - 1
|
||||
dd %%prefix %+ .ar%2 - %%base
|
||||
%rep %0 - 2
|
||||
dd %%prefix %+ .ar%3 - %%base
|
||||
%rotate 1
|
||||
%endrep
|
||||
%endmacro
|
||||
|
||||
ALIGN 4
|
||||
JMP_TABLE generate_grain_y_avx2, 0, 1, 2, 3
|
||||
JMP_TABLE generate_grain_uv_420_avx2, 0, 1, 2, 3
|
||||
JMP_TABLE generate_grain_uv_422_avx2, 0, 1, 2, 3
|
||||
JMP_TABLE generate_grain_uv_444_avx2, 0, 1, 2, 3
|
||||
JMP_TABLE generate_grain_y, avx2, 0, 1, 2, 3
|
||||
JMP_TABLE generate_grain_uv_420, avx2, 0, 1, 2, 3
|
||||
JMP_TABLE generate_grain_uv_422, avx2, 0, 1, 2, 3
|
||||
JMP_TABLE generate_grain_uv_444, avx2, 0, 1, 2, 3
|
||||
|
||||
struc FGData
|
||||
.seed: resd 1
|
||||
|
@ -90,8 +92,16 @@ cextern gaussian_sequence
|
|||
|
||||
SECTION .text
|
||||
|
||||
%macro REPX 2-*
|
||||
%xdefine %%f(x) %1
|
||||
%rep %0 - 1
|
||||
%rotate 1
|
||||
%%f(%1)
|
||||
%endrep
|
||||
%endmacro
|
||||
|
||||
INIT_XMM avx2
|
||||
cglobal generate_grain_y, 2, 9, 16, buf, fg_data
|
||||
cglobal generate_grain_y_8bpc, 2, 9, 16, buf, fg_data
|
||||
lea r4, [pb_mask]
|
||||
%define base r4-pb_mask
|
||||
movq xm1, [base+rnd_next_upperbit_mask]
|
||||
|
@ -132,8 +142,8 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
|
|||
|
||||
; auto-regression code
|
||||
movsxd r2, [fg_dataq+FGData.ar_coeff_lag]
|
||||
movsxd r2, [base+generate_grain_y_avx2_table+r2*4]
|
||||
lea r2, [r2+base+generate_grain_y_avx2_table]
|
||||
movsxd r2, [base+generate_grain_y_8bpc_avx2_table+r2*4]
|
||||
lea r2, [r2+base+generate_grain_y_8bpc_avx2_table]
|
||||
jmp r2
|
||||
|
||||
.ar1:
|
||||
|
@ -420,7 +430,7 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
|
|||
|
||||
%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
|
||||
INIT_XMM avx2
|
||||
cglobal generate_grain_uv_%1, 4, 10, 16, buf, bufy, fg_data, uv
|
||||
cglobal generate_grain_uv_%1_8bpc, 4, 10, 16, buf, bufy, fg_data, uv
|
||||
lea r4, [pb_mask]
|
||||
%define base r4-pb_mask
|
||||
movq xm1, [base+rnd_next_upperbit_mask]
|
||||
|
@ -478,8 +488,8 @@ cglobal generate_grain_uv_%1, 4, 10, 16, buf, bufy, fg_data, uv
|
|||
|
||||
; auto-regression code
|
||||
movsxd r5, [fg_dataq+FGData.ar_coeff_lag]
|
||||
movsxd r5, [base+generate_grain_uv_%1_avx2_table+r5*4]
|
||||
lea r5, [r5+base+generate_grain_uv_%1_avx2_table]
|
||||
movsxd r5, [base+generate_grain_uv_%1_8bpc_avx2_table+r5*4]
|
||||
lea r5, [r5+base+generate_grain_uv_%1_8bpc_avx2_table]
|
||||
jmp r5
|
||||
|
||||
.ar0:
|
||||
|
@ -975,7 +985,7 @@ generate_grain_uv_fn 422, 1, 0
|
|||
generate_grain_uv_fn 444, 0, 0
|
||||
|
||||
INIT_YMM avx2
|
||||
cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut
|
||||
cglobal fgy_32x32xn_8bpc, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut
|
||||
pcmpeqw m10, m10
|
||||
psrld m10, 24
|
||||
mov r7d, [fg_dataq+FGData.scaling_shift]
|
||||
|
@ -1092,12 +1102,12 @@ cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut
|
|||
jz .loop_x
|
||||
|
||||
; r8m = sbym
|
||||
movd xm15, [pb_27_17_17_27]
|
||||
movq xm15, [pb_27_17_17_27]
|
||||
cmp dword r8m, 0
|
||||
jne .loop_x_hv_overlap
|
||||
|
||||
; horizontal overlap (without vertical overlap)
|
||||
movd xm14, [pw_1024]
|
||||
movq xm14, [pw_1024]
|
||||
.loop_x_h_overlap:
|
||||
mov r6d, seed
|
||||
or seed, 0xEFF4
|
||||
|
@ -1156,8 +1166,7 @@ cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut
|
|||
pmaddubsw xm4, xm15, xm4
|
||||
pmulhrsw xm4, xm14
|
||||
packsswb xm4, xm4
|
||||
vpblendw xm4, xm3, 11111110b
|
||||
vpblendd m3, m4, 00001111b
|
||||
vpblendd m3, m3, m4, 00000001b
|
||||
pcmpgtb m7, m2, m3
|
||||
punpcklbw m2, m3, m7
|
||||
punpckhbw m3, m7
|
||||
|
@ -1329,7 +1338,7 @@ cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut
|
|||
; back to .loop_x_v_overlap, and instead always fall-through to
|
||||
; h+v overlap
|
||||
|
||||
movd xm15, [pb_27_17_17_27]
|
||||
movq xm15, [pb_27_17_17_27]
|
||||
.loop_x_hv_overlap:
|
||||
vpbroadcastw m8, [pb_27_17_17_27]
|
||||
|
||||
|
@ -1409,10 +1418,8 @@ cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut
|
|||
pmulhrsw xm7, xm14
|
||||
packsswb xm4, xm4
|
||||
packsswb xm7, xm7
|
||||
vpblendw xm4, xm3, 11111110b
|
||||
vpblendw xm7, xm6, 11111110b
|
||||
vpblendd m3, m4, 00001111b
|
||||
vpblendd m6, m7, 00001111b
|
||||
vpblendd m3, m4, 00000001b
|
||||
vpblendd m6, m7, 00000001b
|
||||
; followed by v interpolation (top | cur -> cur)
|
||||
punpckhbw m7, m6, m3
|
||||
punpcklbw m6, m3
|
||||
|
@ -1461,10 +1468,8 @@ cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut
|
|||
RET
|
||||
|
||||
%macro FGUV_FN 3 ; name, ss_hor, ss_ver
|
||||
cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
grain_lut, h, sby, luma, lstride, uv_pl, is_id
|
||||
pcmpeqw m10, m10
|
||||
psrld m10, 24
|
||||
mov r7d, [fg_dataq+FGData.scaling_shift]
|
||||
lea r8, [pb_mask]
|
||||
%define base r8-pb_mask
|
||||
|
@ -1490,10 +1495,15 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
|||
%else
|
||||
vpbroadcastd m14, [pw_1024]
|
||||
%if %2
|
||||
vpbroadcastd m15, [pb_23_22]
|
||||
vpbroadcastq m15, [pb_23_22]
|
||||
%else
|
||||
vpbroadcastd xm15, [pb_27_17_17_27]
|
||||
vpbroadcastq xm15, [pb_27_17_17_27]
|
||||
%endif
|
||||
%endif
|
||||
%if %3
|
||||
vpbroadcastw m10, [pb_23_22]
|
||||
%elif %2
|
||||
mova m10, [pb_8x_27_17_8x_17_27]
|
||||
%endif
|
||||
|
||||
mov overlapd, [fg_dataq+FGData.overlap_flag]
|
||||
|
@ -1593,16 +1603,13 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
|||
; scaling[luma_src]
|
||||
pcmpeqw m3, m3
|
||||
pcmpeqw m9, m9
|
||||
vpgatherdd m8, [scalingq+m4], m3
|
||||
vpgatherdd m4, [scalingq+m5], m9
|
||||
vpgatherdd m8, [scalingq-3+m4], m3
|
||||
vpgatherdd m4, [scalingq-3+m5], m9
|
||||
pcmpeqw m3, m3
|
||||
pcmpeqw m9, m9
|
||||
vpgatherdd m5, [scalingq+m6], m3
|
||||
vpgatherdd m6, [scalingq+m7], m9
|
||||
pand m8, m10
|
||||
pand m4, m10
|
||||
pand m5, m10
|
||||
pand m6, m10
|
||||
vpgatherdd m5, [scalingq-3+m6], m3
|
||||
vpgatherdd m6, [scalingq-3+m7], m9
|
||||
REPX {psrld x, 24}, m8, m4, m5, m6
|
||||
packusdw m8, m4
|
||||
packusdw m5, m6
|
||||
|
||||
|
@ -1743,16 +1750,13 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
|||
; scaling[luma_src]
|
||||
pcmpeqw m3, m3
|
||||
pcmpeqw m9, m9
|
||||
vpgatherdd m8, [scalingq+m4], m3
|
||||
vpgatherdd m4, [scalingq+m5], m9
|
||||
vpgatherdd m8, [scalingq-3+m4], m3
|
||||
vpgatherdd m4, [scalingq-3+m5], m9
|
||||
pcmpeqw m3, m3
|
||||
pcmpeqw m9, m9
|
||||
vpgatherdd m5, [scalingq+m6], m3
|
||||
vpgatherdd m6, [scalingq+m7], m9
|
||||
pand m8, m10
|
||||
pand m4, m10
|
||||
pand m5, m10
|
||||
pand m6, m10
|
||||
vpgatherdd m5, [scalingq-3+m6], m3
|
||||
vpgatherdd m6, [scalingq-3+m7], m9
|
||||
REPX {psrld x, 24}, m8, m4, m5, m6
|
||||
packusdw m8, m4
|
||||
packusdw m5, m6
|
||||
|
||||
|
@ -1763,7 +1767,7 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
|||
; grain = grain_lut[offy+y][offx+x]
|
||||
%if %2
|
||||
%if %1
|
||||
vpbroadcastd m6, [pb_23_22] ; FIXME
|
||||
vpbroadcastq m6, [pb_23_22]
|
||||
%endif
|
||||
movu xm3, [grain_lutq+offxyq+ 0]
|
||||
movd xm4, [grain_lutq+left_offxyq+ 0]
|
||||
|
@ -1778,12 +1782,10 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
|||
pmulhrsw m4, m14
|
||||
%endif
|
||||
packsswb m4, m4
|
||||
pcmpeqw m6, m6 ; FIXME
|
||||
psrldq m6, 15 ; FIXME
|
||||
vpblendvb m3, m3, m4, m6
|
||||
vpblendd m3, m3, m4, 00010001b
|
||||
%else
|
||||
%if %1
|
||||
vpbroadcastd xm6, [pb_27_17_17_27]
|
||||
movq xm6, [pb_27_17_17_27]
|
||||
%endif
|
||||
movu m3, [grain_lutq+offxyq]
|
||||
movd xm4, [grain_lutq+left_offxyq]
|
||||
|
@ -1796,9 +1798,7 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
|||
pmulhrsw xm4, xm14
|
||||
%endif
|
||||
packsswb xm4, xm4
|
||||
pcmpeqw xm6, xm6
|
||||
psrldq xm6, 14
|
||||
vpblendvb m3, m3, m4, m6
|
||||
vpblendd m3, m3, m4, 00000001b
|
||||
%endif
|
||||
pcmpgtb m7, m2, m3
|
||||
punpcklbw m2, m3, m7
|
||||
|
@ -1915,7 +1915,7 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
|||
mov hd, hm
|
||||
mov grain_lutq, grain_lutmp
|
||||
%if %2 == 0
|
||||
vbroadcasti128 m1, [pb_8x_27_17_8x_17_27]
|
||||
vbroadcasti128 m10, [pb_8x_27_17_8x_17_27]
|
||||
%endif
|
||||
%%loop_y_v_overlap:
|
||||
; src
|
||||
|
@ -1966,16 +1966,13 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
|||
; scaling[luma_src]
|
||||
pcmpeqw m3, m3
|
||||
pcmpeqw m9, m9
|
||||
vpgatherdd m8, [scalingq+m4], m3
|
||||
vpgatherdd m4, [scalingq+m5], m9
|
||||
vpgatherdd m8, [scalingq-3+m4], m3
|
||||
vpgatherdd m4, [scalingq-3+m5], m9
|
||||
pcmpeqw m3, m3
|
||||
pcmpeqw m9, m9
|
||||
vpgatherdd m5, [scalingq+m6], m3
|
||||
vpgatherdd m6, [scalingq+m7], m9
|
||||
pand m8, m10
|
||||
pand m4, m10
|
||||
pand m5, m10
|
||||
pand m6, m10
|
||||
vpgatherdd m5, [scalingq-3+m6], m3
|
||||
vpgatherdd m6, [scalingq-3+m7], m9
|
||||
REPX {psrld x, 24}, m8, m4, m5, m6
|
||||
packusdw m8, m4
|
||||
packusdw m5, m6
|
||||
|
||||
|
@ -1988,7 +1985,6 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
|||
; grain = grain_lut[offy+y][offx+x]
|
||||
%if %3 == 0
|
||||
%if %2
|
||||
mova m6, [pb_8x_27_17_8x_17_27]
|
||||
movu xm3, [grain_lutq+offxyq]
|
||||
movu xm4, [grain_lutq+top_offxyq]
|
||||
vinserti128 m3, [grain_lutq+offxyq+82], 1
|
||||
|
@ -1999,13 +1995,8 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
|||
%endif
|
||||
punpckhbw m9, m4, m3
|
||||
punpcklbw m4, m3
|
||||
%if %2
|
||||
pmaddubsw m9, m6, m9
|
||||
pmaddubsw m4, m6, m4
|
||||
%else
|
||||
pmaddubsw m9, m1, m9
|
||||
pmaddubsw m4, m1, m4
|
||||
%endif
|
||||
pmaddubsw m9, m10, m9
|
||||
pmaddubsw m4, m10, m4
|
||||
%if %1
|
||||
pmulhrsw m9, [pw_1024]
|
||||
pmulhrsw m4, [pw_1024]
|
||||
|
@ -2015,19 +2006,15 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
|||
%endif
|
||||
packsswb m3, m4, m9
|
||||
%else
|
||||
%if %1
|
||||
vpbroadcastd m6, [pb_23_22]
|
||||
%endif
|
||||
movq xm3, [grain_lutq+offxyq]
|
||||
movq xm4, [grain_lutq+top_offxyq]
|
||||
vinserti128 m3, [grain_lutq+offxyq+8], 1
|
||||
vinserti128 m4, [grain_lutq+top_offxyq+8], 1
|
||||
punpcklbw m4, m3
|
||||
pmaddubsw m4, m10, m4
|
||||
%if %1
|
||||
pmaddubsw m4, m6, m4
|
||||
pmulhrsw m4, [pw_1024]
|
||||
%else
|
||||
pmaddubsw m4, m15, m4
|
||||
pmulhrsw m4, m14
|
||||
%endif
|
||||
packsswb m4, m4
|
||||
|
@ -2084,7 +2071,7 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
|||
%endif
|
||||
add grain_lutq, 82<<%2
|
||||
%if %2 == 0
|
||||
vbroadcasti128 m1, [pb_8x_27_17_8x_17_27+16]
|
||||
vbroadcasti128 m10, [pb_8x_27_17_8x_17_27+16]
|
||||
btc hd, 16
|
||||
jnc %%loop_y_v_overlap
|
||||
%endif
|
||||
|
@ -2139,7 +2126,7 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
|||
mov hd, hm
|
||||
mov grain_lutq, grain_lutmp
|
||||
%if %2 == 0
|
||||
vbroadcasti128 m1, [pb_8x_27_17_8x_17_27]
|
||||
vbroadcasti128 m10, [pb_8x_27_17_8x_17_27]
|
||||
%endif
|
||||
%%loop_y_hv_overlap:
|
||||
; src
|
||||
|
@ -2190,16 +2177,13 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
|||
; scaling[src]
|
||||
pcmpeqw m9, m9
|
||||
pcmpeqw m3, m3
|
||||
vpgatherdd m8, [scalingq+m4], m9
|
||||
vpgatherdd m4, [scalingq+m5], m3
|
||||
vpgatherdd m8, [scalingq-3+m4], m9
|
||||
vpgatherdd m4, [scalingq-3+m5], m3
|
||||
pcmpeqw m9, m9
|
||||
pcmpeqw m3, m3
|
||||
vpgatherdd m5, [scalingq+m6], m9
|
||||
vpgatherdd m6, [scalingq+m7], m3
|
||||
pand m8, m10
|
||||
pand m4, m10
|
||||
pand m5, m10
|
||||
pand m6, m10
|
||||
vpgatherdd m5, [scalingq-3+m6], m9
|
||||
vpgatherdd m6, [scalingq-3+m7], m3
|
||||
REPX {psrld x, 24}, m8, m4, m5, m6
|
||||
packusdw m8, m4
|
||||
packusdw m5, m6
|
||||
|
||||
|
@ -2212,9 +2196,9 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
|||
; grain = grain_lut[offy+y][offx+x]
|
||||
%if %1
|
||||
%if %2
|
||||
vpbroadcastd m9, [pb_23_22]
|
||||
vpbroadcastq m9, [pb_23_22]
|
||||
%else
|
||||
vpbroadcastd xm9, [pb_27_17_17_27]
|
||||
vpbroadcastq xm9, [pb_27_17_17_27]
|
||||
%endif
|
||||
%endif
|
||||
|
||||
|
@ -2252,7 +2236,7 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
|||
%else
|
||||
punpcklbw m7, m6
|
||||
%endif
|
||||
punpcklwd m4, m7
|
||||
punpcklqdq m4, m7
|
||||
%if %1
|
||||
pmaddubsw m4, m9, m4
|
||||
pmulhrsw m4, [pw_1024]
|
||||
|
@ -2261,18 +2245,17 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
|||
pmulhrsw m4, m14
|
||||
%endif
|
||||
packsswb m4, m4
|
||||
pcmpeqw m9, m9 ; this is kind of ugly
|
||||
psrldq m9, 15
|
||||
vpblendvb m3, m3, m4, m9
|
||||
psrldq m4, 1
|
||||
vpblendd m3, m4, 00010001b
|
||||
psrldq m4, 4
|
||||
%if %3
|
||||
shufpd m9, m9, m9, 1110b ; clear upper lane
|
||||
vpblendd m6, m6, m4, 00000001b
|
||||
%else
|
||||
vpblendd m6, m6, m4, 00010001b
|
||||
%endif
|
||||
vpblendvb m6, m6, m4, m9
|
||||
%else
|
||||
punpcklbw xm4, xm3
|
||||
punpcklbw xm7, xm6
|
||||
punpckldq xm4, xm7
|
||||
punpcklqdq xm4, xm7
|
||||
%if %1
|
||||
pmaddubsw xm4, xm9, xm4
|
||||
pmulhrsw xm4, [pw_1024]
|
||||
|
@ -2281,23 +2264,19 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
|||
pmulhrsw xm4, xm14
|
||||
%endif
|
||||
packsswb xm4, xm4
|
||||
pcmpeqw xm9, xm9 ; this is kind of ugly
|
||||
psrldq xm9, 14
|
||||
vpblendvb m3, m3, m4, m9
|
||||
psrldq xm4, 2
|
||||
vpblendvb m6, m6, m4, m9
|
||||
vpblendd m3, m3, m4, 00000001b
|
||||
psrldq xm4, 4
|
||||
vpblendd m6, m6, m4, 00000001b
|
||||
%endif
|
||||
|
||||
; followed by v interpolation (top | cur -> cur)
|
||||
%if %3
|
||||
vpermq m9, m3, q3120
|
||||
punpcklbw m6, m9
|
||||
pmaddubsw m6, m10, m6
|
||||
%if %1
|
||||
vpbroadcastd m9, [pb_23_22]
|
||||
pmaddubsw m6, m9, m6
|
||||
pmulhrsw m6, [pw_1024]
|
||||
%else
|
||||
pmaddubsw m6, m15, m6
|
||||
pmulhrsw m6, m14
|
||||
%endif
|
||||
packsswb m6, m6
|
||||
|
@ -2306,14 +2285,8 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
|||
%else
|
||||
punpckhbw m9, m6, m3
|
||||
punpcklbw m6, m3
|
||||
%if %2
|
||||
mova m3, [pb_8x_27_17_8x_17_27]
|
||||
pmaddubsw m9, m3, m9
|
||||
pmaddubsw m6, m3, m6
|
||||
%else
|
||||
pmaddubsw m9, m1, m9
|
||||
pmaddubsw m6, m1, m6
|
||||
%endif
|
||||
pmaddubsw m9, m10, m9
|
||||
pmaddubsw m6, m10, m6
|
||||
%if %1
|
||||
pmulhrsw m9, [pw_1024]
|
||||
pmulhrsw m6, [pw_1024]
|
||||
|
@ -2373,7 +2346,7 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
|||
jg %%loop_y_h_overlap
|
||||
%else
|
||||
je %%end_y_hv_overlap
|
||||
vbroadcasti128 m1, [pb_8x_27_17_8x_17_27+16]
|
||||
vbroadcasti128 m10, [pb_8x_27_17_8x_17_27+16]
|
||||
btc hd, 16
|
||||
jnc %%loop_y_hv_overlap
|
||||
jmp %%loop_y_h_overlap
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright © 2018, VideoLAN and dav1d authors
|
||||
* Copyright © 2018-2021, VideoLAN and dav1d authors
|
||||
* Copyright © 2018, Two Orioles, LLC
|
||||
* All rights reserved.
|
||||
*
|
||||
|
@ -28,64 +28,48 @@
|
|||
#include "src/cpu.h"
|
||||
#include "src/film_grain.h"
|
||||
|
||||
decl_generate_grain_y_fn(dav1d_generate_grain_y_ssse3);
|
||||
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_ssse3);
|
||||
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_422_ssse3);
|
||||
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_444_ssse3);
|
||||
decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_ssse3);
|
||||
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_ssse3);
|
||||
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i422_ssse3);
|
||||
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i444_ssse3);
|
||||
decl_generate_grain_y_fn(BF(dav1d_generate_grain_y, ssse3));
|
||||
decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_420, ssse3));
|
||||
decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_422, ssse3));
|
||||
decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_444, ssse3));
|
||||
decl_fgy_32x32xn_fn(BF(dav1d_fgy_32x32xn, ssse3));
|
||||
decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i420, ssse3));
|
||||
decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i422, ssse3));
|
||||
decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i444, ssse3));
|
||||
|
||||
decl_generate_grain_y_fn(dav1d_generate_grain_y_avx2);
|
||||
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_avx2);
|
||||
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_422_avx2);
|
||||
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_444_avx2);
|
||||
decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_avx2);
|
||||
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_avx2);
|
||||
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i422_avx2);
|
||||
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i444_avx2);
|
||||
|
||||
decl_generate_grain_y_fn(dav1d_generate_grain_y_16bpc_avx2);
|
||||
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_16bpc_avx2);
|
||||
decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_16bpc_avx2);
|
||||
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_16bpc_avx2);
|
||||
decl_generate_grain_y_fn(BF(dav1d_generate_grain_y, avx2));
|
||||
decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_420, avx2));
|
||||
decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_422, avx2));
|
||||
decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_444, avx2));
|
||||
decl_fgy_32x32xn_fn(BF(dav1d_fgy_32x32xn, avx2));
|
||||
decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i420, avx2));
|
||||
decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i422, avx2));
|
||||
decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i444, avx2));
|
||||
|
||||
COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c) {
|
||||
const unsigned flags = dav1d_get_cpu_flags();
|
||||
|
||||
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
|
||||
|
||||
#if BITDEPTH == 8
|
||||
c->generate_grain_y = dav1d_generate_grain_y_ssse3;
|
||||
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_generate_grain_uv_420_ssse3;
|
||||
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_generate_grain_uv_422_ssse3;
|
||||
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_generate_grain_uv_444_ssse3;
|
||||
c->fgy_32x32xn = dav1d_fgy_32x32xn_ssse3;
|
||||
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_ssse3;
|
||||
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_fguv_32x32xn_i422_ssse3;
|
||||
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_fguv_32x32xn_i444_ssse3;
|
||||
#endif
|
||||
c->generate_grain_y = BF(dav1d_generate_grain_y, ssse3);
|
||||
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, ssse3);
|
||||
c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, ssse3);
|
||||
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, ssse3);
|
||||
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, ssse3);
|
||||
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, ssse3);
|
||||
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, ssse3);
|
||||
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, ssse3);
|
||||
|
||||
#if ARCH_X86_64
|
||||
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
|
||||
|
||||
#if BITDEPTH == 8
|
||||
c->generate_grain_y = dav1d_generate_grain_y_avx2;
|
||||
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_generate_grain_uv_420_avx2;
|
||||
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_generate_grain_uv_422_avx2;
|
||||
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_generate_grain_uv_444_avx2;
|
||||
c->fgy_32x32xn = dav1d_fgy_32x32xn_avx2;
|
||||
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_avx2;
|
||||
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_fguv_32x32xn_i422_avx2;
|
||||
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_fguv_32x32xn_i444_avx2;
|
||||
#else
|
||||
c->generate_grain_y = dav1d_generate_grain_y_16bpc_avx2;
|
||||
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] =
|
||||
dav1d_generate_grain_uv_420_16bpc_avx2;
|
||||
c->fgy_32x32xn = dav1d_fgy_32x32xn_16bpc_avx2;
|
||||
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] =
|
||||
dav1d_fguv_32x32xn_i420_16bpc_avx2;
|
||||
#endif
|
||||
c->generate_grain_y = BF(dav1d_generate_grain_y, avx2);
|
||||
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, avx2);
|
||||
c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx2);
|
||||
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx2);
|
||||
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, avx2);
|
||||
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, avx2);
|
||||
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx2);
|
||||
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx2);
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
; Copyright © 2019, VideoLAN and dav1d authors
|
||||
; Copyright © 2019-2021, VideoLAN and dav1d authors
|
||||
; Copyright © 2019, Two Orioles, LLC
|
||||
; All rights reserved.
|
||||
;
|
||||
|
@ -29,14 +29,18 @@
|
|||
SECTION_RODATA
|
||||
|
||||
pw_1024: times 8 dw 1024
|
||||
pb_27_17_17_27: db 27, 17, 17, 27
|
||||
times 6 db 0, 32
|
||||
pb_23_22_h: db 23, 22
|
||||
times 7 db 0, 32
|
||||
pb_27_17: times 8 db 27, 17
|
||||
pb_17_27: times 8 db 17, 27
|
||||
pb_23_22: times 8 db 23, 22
|
||||
pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
|
||||
rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
|
||||
byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
|
||||
pw_seed_xor: times 2 dw 0xb524
|
||||
times 2 dw 0x49d8
|
||||
pb_23_22: times 2 db 23, 22
|
||||
pb_1: times 4 db 1
|
||||
hmul_bits: dw 32768, 16384, 8192, 4096
|
||||
round: dw 2048, 1024, 512
|
||||
|
@ -46,23 +50,21 @@ max: dw 255, 240, 235
|
|||
min: dw 0, 16
|
||||
pw_1: dw 1
|
||||
|
||||
%define pb_27_17_17_27 pb_17_27 - 2
|
||||
|
||||
%macro JMP_TABLE 1-*
|
||||
%xdefine %1_table %%table
|
||||
%xdefine %%base %1_table
|
||||
%xdefine %%prefix mangle(private_prefix %+ _%1)
|
||||
%macro JMP_TABLE 2-*
|
||||
%xdefine %1_8bpc_%2_table %%table
|
||||
%xdefine %%base %1_8bpc_%2_table
|
||||
%xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
|
||||
%%table:
|
||||
%rep %0 - 1
|
||||
dd %%prefix %+ .ar%2 - %%base
|
||||
%rep %0 - 2
|
||||
dd %%prefix %+ .ar%3 - %%base
|
||||
%rotate 1
|
||||
%endrep
|
||||
%endmacro
|
||||
|
||||
JMP_TABLE generate_grain_y_ssse3, 0, 1, 2, 3
|
||||
JMP_TABLE generate_grain_uv_420_ssse3, 0, 1, 2, 3
|
||||
JMP_TABLE generate_grain_uv_422_ssse3, 0, 1, 2, 3
|
||||
JMP_TABLE generate_grain_uv_444_ssse3, 0, 1, 2, 3
|
||||
JMP_TABLE generate_grain_y, ssse3, 0, 1, 2, 3
|
||||
JMP_TABLE generate_grain_uv_420, ssse3, 0, 1, 2, 3
|
||||
JMP_TABLE generate_grain_uv_422, ssse3, 0, 1, 2, 3
|
||||
JMP_TABLE generate_grain_uv_444, ssse3, 0, 1, 2, 3
|
||||
|
||||
struc FGData
|
||||
.seed: resd 1
|
||||
|
@ -88,6 +90,20 @@ cextern gaussian_sequence
|
|||
|
||||
SECTION .text
|
||||
|
||||
%macro REPX 2-*
|
||||
%xdefine %%f(x) %1
|
||||
%rep %0 - 1
|
||||
%rotate 1
|
||||
%%f(%1)
|
||||
%endrep
|
||||
%endmacro
|
||||
|
||||
%if ARCH_X86_32
|
||||
%define PIC_ptr(a) base+a
|
||||
%else
|
||||
%define PIC_ptr(a) a
|
||||
%endif
|
||||
|
||||
%macro SCRATCH 3
|
||||
%if ARCH_X86_32
|
||||
mova [rsp+%3*mmsize], m%1
|
||||
|
@ -98,7 +114,7 @@ SECTION .text
|
|||
%endmacro
|
||||
|
||||
INIT_XMM ssse3
|
||||
cglobal generate_grain_y, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data
|
||||
cglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data
|
||||
LEA r4, $$
|
||||
%define base r4-$$
|
||||
movq m1, [base+rnd_next_upperbit_mask]
|
||||
|
@ -164,8 +180,8 @@ cglobal generate_grain_y, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data
|
|||
|
||||
; auto-regression code
|
||||
movsxd r2, [fg_dataq+FGData.ar_coeff_lag]
|
||||
movsxd r2, [base+generate_grain_y_ssse3_table+r2*4]
|
||||
lea r2, [r2+base+generate_grain_y_ssse3_table]
|
||||
movsxd r2, [base+generate_grain_y_8bpc_ssse3_table+r2*4]
|
||||
lea r2, [r2+base+generate_grain_y_8bpc_ssse3_table]
|
||||
jmp r2
|
||||
|
||||
.ar1:
|
||||
|
@ -507,7 +523,7 @@ cglobal generate_grain_y, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data

%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
INIT_XMM ssse3
cglobal generate_grain_uv_%1, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv
cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv
movifnidn r2, r2mp
movifnidn r3, r3mp
LEA r4, $$
@ -606,8 +622,8 @@ cglobal generate_grain_uv_%1, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv
; auto-regression code
movsxd r5, [fg_dataq+FGData.ar_coeff_lag]
movsxd r5, [base+generate_grain_uv_%1_ssse3_table+r5*4]
lea r5, [r5+base+generate_grain_uv_%1_ssse3_table]
movsxd r5, [base+generate_grain_uv_%1_8bpc_ssse3_table+r5*4]
lea r5, [r5+base+generate_grain_uv_%1_8bpc_ssse3_table]
jmp r5

.ar0:
@ -1284,7 +1300,7 @@ INIT_XMM ssse3
; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby)
%if ARCH_X86_32
%if STACK_ALIGNMENT < mmsize
cglobal fgy_32x32xn, 0, 7, 16, 0 - (6 * mmsize + (9 + 3) * gprsize), \
cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (5 * mmsize + 16 * gprsize), \
dst, src, scaling, unused1, fg_data, picptr, unused2
; copy stack arguments to new position post-alignment, so that we
; don't have to keep the old stack location in a separate register
@ -1295,43 +1311,41 @@ cglobal fgy_32x32xn, 0, 7, 16, 0 - (6 * mmsize + (9 + 3) * gprsize), \
mov r4, r7m
mov r5, r8m

mov [rsp+6*mmsize+ 3*gprsize], r0
mov [rsp+6*mmsize+ 5*gprsize], r1
mov [rsp+6*mmsize+ 7*gprsize], r2
mov [rsp+6*mmsize+ 9*gprsize], r3
mov [rsp+6*mmsize+10*gprsize], r4
mov [rsp+6*mmsize+11*gprsize], r5
mov [rsp+5*mmsize+ 4*gprsize], r0
mov [rsp+5*mmsize+ 6*gprsize], r1
mov [rsp+5*mmsize+ 8*gprsize], r2
mov [rsp+5*mmsize+10*gprsize], r3
mov [rsp+5*mmsize+11*gprsize], r4
mov [rsp+5*mmsize+12*gprsize], r5
%else
cglobal fgy_32x32xn, 0, 7, 16, 6 * mmsize + (3 + 1) * gprsize, \
cglobal fgy_32x32xn_8bpc, 0, 7, 16, 5 * mmsize + 4 * gprsize, \
dst, src, scaling, unused1, fg_data, picptr, unused2
%endif
mov srcq, srcm
mov fg_dataq, r3m
mov scalingq, r5m
%if STACK_ALIGNMENT < mmsize
%define r0m [rsp+6*mmsize+ 3*gprsize]
%define r1m [rsp+6*mmsize+ 4*gprsize]
%define r2m [rsp+6*mmsize+ 5*gprsize]
%define r3m [rsp+6*mmsize+ 6*gprsize]
%define r4m [rsp+6*mmsize+ 7*gprsize]
%define r5m [rsp+6*mmsize+ 8*gprsize]
%define r6m [rsp+6*mmsize+ 9*gprsize]
%define r7m [rsp+6*mmsize+10*gprsize]
%define r8m [rsp+6*mmsize+11*gprsize]
%define r0m [rsp+5*mmsize+ 4*gprsize]
%define r1m [rsp+5*mmsize+ 5*gprsize]
%define r2m [rsp+5*mmsize+ 6*gprsize]
%define r3m [rsp+5*mmsize+ 7*gprsize]
%define r4m [rsp+5*mmsize+ 8*gprsize]
%define r5m [rsp+5*mmsize+ 9*gprsize]
%define r6m [rsp+5*mmsize+10*gprsize]
%define r7m [rsp+5*mmsize+11*gprsize]
%define r8m [rsp+5*mmsize+12*gprsize]
%endif
LEA r5, pb_mask
%define base r5-pb_mask
mov r5m, picptrq
%else
cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
lea r7, [pb_mask]
%define base r7-pb_mask
%endif
mov r6d, [fg_dataq+FGData.scaling_shift]
movd m3, [base+mul_bits+r6*2-14]
mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
pcmpeqw m2, m2
psrldq m2, 14
movd m4, [base+max+r6*4]
movd m5, [base+min+r6*2]
punpcklwd m3, m3
@ -1340,10 +1354,9 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
pshufd m3, m3, q0000
pshufd m4, m4, q0000
pshufd m5, m5, q0000
SCRATCH 2, 10, 0
SCRATCH 3, 11, 1
SCRATCH 4, 12, 2
SCRATCH 5, 13, 3
SCRATCH 3, 11, 0
SCRATCH 4, 12, 1
SCRATCH 5, 13, 2

%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
@ -1356,9 +1369,9 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
test overlapd, overlapd
jz .no_vertical_overlap
mova m6, [base+pw_1024]
movd m7, [base+pb_27_17_17_27]
SCRATCH 6, 14, 4
SCRATCH 7, 15, 5
mova m7, [base+pb_27_17_17_27]
SCRATCH 6, 14, 3
SCRATCH 7, 15, 4
test sbyd, sbyd
jnz .vertical_overlap
; fall-through
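The pb_27_17_17_27 and pw_1024 constants loaded above drive the overlapped-block blend of AV1 film grain synthesis: adjacent 32x32 grain blocks are cross-faded over two rows or columns with weights (27,17) then (17,27), rounded at 5 bits, and pmaddubsw plus pmulhrsw-by-1024 computes exactly that rounded shift. A minimal C sketch of the scalar blend, assuming the spec weights:

    /* round2(old*w0 + new*w1, 5) with a signed-byte clamp, as the
     * pmaddubsw/pmulhrsw/packsswb sequence computes per lane. */
    static int blend_grain(int old_g, int new_g, int w0, int w1) {
        int g = (old_g * w0 + new_g * w1 + 16) >> 5;
        return g < -128 ? -128 : g > 127 ? 127 : g;
    }
    /* overlap row 0: blend_grain(top, cur, 27, 17); row 1: weights (17, 27) */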
@ -1445,16 +1458,13 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut

; scaling[src]
%if ARCH_X86_32
vpgatherdw m4, m0, scalingq, r0, r5, m3
vpgatherdw m5, m1, scalingq, r0, r5, m3
vpgatherdw m4, m0, scalingq-1, r0, r5, m3
vpgatherdw m5, m1, scalingq-1, r0, r5, m3
%else
vpgatherdw m4, m0, scalingq, r12, r13, m3
vpgatherdw m5, m1, scalingq, r12, r13, m3
vpgatherdw m4, m0, scalingq-1, r12, r13, m3
vpgatherdw m5, m1, scalingq-1, r12, r13, m3
%endif
pcmpeqw m3, m3
psrlw m3, 8
pand m4, m3
pand m5, m3
REPX {psrlw x, 8}, m4, m5

; grain = grain_lut[offy+y][offx+x]
movu m3, [grain_lutq+offxyq]
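The scalingq-1 adjustment above is the interesting part of this hunk: vpgatherdw fetches 16 bits per lane, so gathering from scaling - 1 + idx leaves scaling[idx] in the high byte of each word, and the new REPX {psrlw x, 8} replaces the old pcmpeqw/psrlw/pand masking sequence. A small C model of the trick, assuming in-bounds reads:

    #include <stdint.h>
    /* Word-sized gather of a byte table, biased by -1 so the wanted
     * byte lands in bits 15:8; one shift then isolates it. */
    static unsigned gather_scaling(const uint8_t *scaling, unsigned idx) {
        const uint8_t *p = scaling - 1 + idx;
        unsigned w = (unsigned)p[0] | ((unsigned)p[1] << 8);
        return w >> 8; /* == scaling[idx] */
    }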
@ -1504,7 +1514,7 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
jz .loop_x_odd

%if ARCH_X86_32
add dword [rsp+6*mmsize+1*gprsize], 16
add dword [rsp+5*mmsize+1*gprsize], 16
%else
add r11d, 16 ; top_offxyd
%endif
@ -1525,7 +1535,7 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3

add offxyd, 16 ; left_offxyd
mov [rsp+6*mmsize+0*gprsize], offxyd
mov [rsp+5*mmsize+0*gprsize], offxyd

DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3

@ -1578,21 +1588,18 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut

; scaling[src]
%if ARCH_X86_32
vpgatherdw m4, m0, scalingq, r0, r5, m3
vpgatherdw m5, m1, scalingq, r0, r5, m3
vpgatherdw m4, m0, scalingq-1, r0, r5, m3
vpgatherdw m5, m1, scalingq-1, r0, r5, m3
%else
vpgatherdw m4, m0, scalingq, r12, r13, m3
vpgatherdw m5, m1, scalingq, r12, r13, m3
vpgatherdw m4, m0, scalingq-1, r12, r13, m3
vpgatherdw m5, m1, scalingq-1, r12, r13, m3
%endif
pcmpeqw m3, m3
psrlw m3, 8
pand m4, m3
pand m5, m3
REPX {psrlw x, 8}, m4, m5

; grain = grain_lut[offy+y][offx+x]
movu m3, [grain_lutq+offxyq]
%if ARCH_X86_32
mov r5, [rsp+6*mmsize+0*gprsize]
mov r5, [rsp+5*mmsize+0*gprsize]
movd m7, [grain_lutq+r5]
%else
movd m7, [grain_lutq+left_offxyq]
@ -1601,9 +1608,7 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
pmaddubsw m6, m15, m7
pmulhrsw m6, m14
packsswb m6, m6
pand m6, m10
pandn m7, m10, m3
por m6, m7
shufps m6, m3, q3210
pcmpgtb m2, m6
punpcklbw m7, m6, m2
punpckhbw m6, m2
@ -1649,7 +1654,7 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
test dword r8m, 2 ; have_top_overlap
jz .loop_x_odd
%if ARCH_X86_32
add dword [rsp+6*mmsize+1*gprsize], 16
add dword [rsp+5*mmsize+1*gprsize], 16
%else
add r11d, 16 ; top_offxyd
%endif
@ -1754,7 +1759,7 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut

movzx top_offxyd, offxyw
%if ARCH_X86_32
mov [rsp+6*mmsize+1*gprsize], top_offxyd
mov [rsp+5*mmsize+1*gprsize], top_offxyd

DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif
@ -1764,7 +1769,7 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
%if ARCH_X86_32
mov r5, r5m
lea r5, [base+pb_27_17]
mov [rsp+5*mmsize+8], r5
mov [rsp+5*mmsize+12], r5
%else
mova m8, [pb_27_17]
%endif
@ -1779,21 +1784,18 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut

; scaling[src]
%if ARCH_X86_32
vpgatherdw m4, m0, scalingq, r0, r5, m3
vpgatherdw m5, m1, scalingq, r0, r5, m3
vpgatherdw m4, m0, scalingq-1, r0, r5, m3
vpgatherdw m5, m1, scalingq-1, r0, r5, m3
%else
vpgatherdw m4, m0, scalingq, r12, r13, m3
vpgatherdw m5, m1, scalingq, r12, r13, m3
vpgatherdw m4, m0, scalingq-1, r12, r13, m3
vpgatherdw m5, m1, scalingq-1, r12, r13, m3
%endif
pcmpeqw m3, m3
psrlw m3, 8
pand m4, m3
pand m5, m3
REPX {psrlw x, 8}, m4, m5

; grain = grain_lut[offy+y][offx+x]
movu m3, [grain_lutq+offxyq]
%if ARCH_X86_32
mov r5, [rsp+6*mmsize+1*gprsize]
mov r5, [rsp+5*mmsize+1*gprsize]
movu m7, [grain_lutq+r5]
%else
movu m7, [grain_lutq+top_offxyq]
@ -1801,7 +1803,7 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
punpckhbw m6, m7, m3
punpcklbw m7, m3
%if ARCH_X86_32
mov r5, [rsp+5*mmsize+8]
mov r5, [rsp+5*mmsize+12]
pmaddubsw m3, [r5], m6
pmaddubsw m6, [r5], m7
%else
@ -1833,7 +1835,7 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
mova [dstq+srcq], m0

%if ARCH_X86_32
add dword [rsp+5*mmsize+8], mmsize
add dword [rsp+5*mmsize+12], mmsize
%else
mova m8, [pb_17_27]
%endif
@ -1864,7 +1866,7 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
jc .loop_x_hv_overlap
add offxyd, 16
%if ARCH_X86_32
add dword [rsp+6*mmsize+1*gprsize], 16
add dword [rsp+5*mmsize+1*gprsize], 16
%else
add top_offxyd, 16
%endif
@ -1874,16 +1876,16 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
%if ARCH_X86_32
mov r5, r5m
lea r5, [base+pb_27_17]
mov [rsp+5*mmsize+8], r5
mov [rsp+5*mmsize+12], r5

DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak

mov r5, [rsp+6*mmsize+1*gprsize]
mov r5, [rsp+5*mmsize+1*gprsize]
mov r4, offxyd
add r5, 16
add r4, 16
mov [rsp+6*mmsize+2*gprsize], r5 ; topleft_offxy
mov [rsp+6*mmsize+0*gprsize], r4 ; left_offxy
mov [rsp+5*mmsize+2*gprsize], r5 ; topleft_offxy
mov [rsp+5*mmsize+0*gprsize], r4 ; left_offxy

DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak

@ -1937,7 +1939,7 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut

movzx r5, offxyw ; top_offxy
mov [rsp+6*mmsize+1*gprsize], r5
mov [rsp+5*mmsize+1*gprsize], r5
%else
DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
h, offxy, see, left_offxy, top_offxy, topleft_offxy
@ -1952,10 +1954,10 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
; grain = grain_lut[offy+y][offx+x]
movu m3, [grain_lutq+offxyq]
%if ARCH_X86_32
mov r5, [rsp+6*mmsize+1*gprsize] ; top_offxy
mov r0, [rsp+6*mmsize+0*gprsize] ; left_offxy
mov r5, [rsp+5*mmsize+1*gprsize] ; top_offxy
mov r0, [rsp+5*mmsize+0*gprsize] ; left_offxy
movu m6, [grain_lutq+r5]
mov r5, [rsp+6*mmsize+2*gprsize] ; topleft_offxy
mov r5, [rsp+5*mmsize+2*gprsize] ; topleft_offxy
movd m4, [grain_lutq+r0]
movd m7, [grain_lutq+r5]
%else
@ -1972,17 +1974,13 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
pmulhrsw m4, m14
packsswb m2, m2
packsswb m4, m4
pand m2, m10
pand m4, m10
pandn m7, m10, m3
pandn m3, m10, m6
por m7, m2
por m3, m4
shufps m2, m3, q3210
shufps m4, m6, q3210
; followed by v interpolation (top | cur -> cur)
punpckhbw m4, m3, m7
punpcklbw m3, m7
punpcklbw m3, m4, m2
punpckhbw m4, m2
%if ARCH_X86_32
mov r5, [rsp+5*mmsize+8]
mov r5, [rsp+5*mmsize+12]
pmaddubsw m7, [r5], m4
pmaddubsw m4, [r5], m3
%else
@ -2004,16 +2002,13 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut

; scaling[src]
%if ARCH_X86_32
vpgatherdw m5, m0, scalingq, r0, r5, m7
vpgatherdw m6, m1, scalingq, r0, r5, m7
vpgatherdw m5, m0, scalingq-1, r0, r5, m7
vpgatherdw m6, m1, scalingq-1, r0, r5, m7
%else
vpgatherdw m5, m0, scalingq, r13, r14, m7
vpgatherdw m6, m1, scalingq, r13, r14, m7
vpgatherdw m5, m0, scalingq-1, r13, r14, m7
vpgatherdw m6, m1, scalingq-1, r13, r14, m7
%endif
pcmpeqw m7, m7
psrlw m7, 8
pand m5, m7
pand m6, m7
REPX {psrlw x, 8}, m5, m6

; noise = round2(scaling[src] * grain, scaling_shift)
pmullw m3, m5
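The comment above is the whole algorithm in one line; a scalar C sketch of what the surrounding pmullw/pmulhrsw block computes per pixel, with mn and mx standing for the clip range loaded from the min/max tables earlier:

    #include <stdint.h>
    static int round2(int x, unsigned shift) {
        return (x + (1 << (shift - 1))) >> shift;
    }
    static uint8_t apply_grain(uint8_t src, int grain,
                               const uint8_t scaling[256],
                               unsigned scaling_shift, int mn, int mx) {
        int noise = round2(scaling[src] * grain, scaling_shift);
        int px = src + noise;
        return (uint8_t)(px < mn ? mn : px > mx ? mx : px);
    }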
@ -2033,7 +2028,7 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
mova [dstq+srcq], m0

%if ARCH_X86_32
add dword [rsp+5*mmsize+8], mmsize
add dword [rsp+5*mmsize+12], mmsize
%else
mova m8, [pb_17_27]
%endif
@ -2063,7 +2058,7 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
xor dword r8m, 4
add offxyd, 16
%if ARCH_X86_32
add dword [rsp+6*mmsize+1*gprsize], 16
add dword [rsp+5*mmsize+1*gprsize], 16
%else
add top_offxyd, 16
%endif
@ -2079,61 +2074,60 @@ INIT_XMM ssse3
; sby, luma, lstride, uv_pl, is_id)
%if STACK_ALIGNMENT < mmsize
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8
cglobal fguv_32x32xn_i%1, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \
cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 0 - (7 * mmsize + (13 + 3) * gprsize), \
tmp, src, scaling, h, fg_data, picptr, unused
mov r0, r0m
mov r1, r2m
mov r2, r4m
mov r3, r6m
mov r4, r7m
mov [rsp+8*mmsize+3*gprsize], r0
mov [rsp+8*mmsize+5*gprsize], r1
mov [rsp+8*mmsize+7*gprsize], r2
mov [rsp+8*mmsize+9*gprsize], r3
mov [rsp+8*mmsize+10*gprsize], r4
mov [rsp+7*mmsize+3*gprsize], r0
mov [rsp+7*mmsize+5*gprsize], r1
mov [rsp+7*mmsize+7*gprsize], r2
mov [rsp+7*mmsize+9*gprsize], r3
mov [rsp+7*mmsize+10*gprsize], r4

mov r0, r8m
mov r1, r9m
mov r2, r10m
mov r4, r11m
mov r3, r12m
mov [rsp+8*mmsize+11*gprsize], r0
mov [rsp+8*mmsize+12*gprsize], r1
mov [rsp+8*mmsize+13*gprsize], r2
mov [rsp+8*mmsize+14*gprsize], r4
mov [rsp+7*mmsize+11*gprsize], r0
mov [rsp+7*mmsize+12*gprsize], r1
mov [rsp+7*mmsize+13*gprsize], r2
mov [rsp+7*mmsize+14*gprsize], r4
%else
cglobal fguv_32x32xn_i%1, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 7 * mmsize + (4) * gprsize, \
tmp, src, scaling, h, fg_data, picptr, unused
%endif
mov srcq, srcm
mov fg_dataq, r3m
mov scalingq, r5m
%if STACK_ALIGNMENT < mmsize
%define r0m [rsp+8*mmsize+ 3*gprsize]
%define r1m [rsp+8*mmsize+ 4*gprsize]
%define r2m [rsp+8*mmsize+ 5*gprsize]
%define r3m [rsp+8*mmsize+ 6*gprsize]
%define r4m [rsp+8*mmsize+ 7*gprsize]
%define r5m [rsp+8*mmsize+ 8*gprsize]
%define r6m [rsp+8*mmsize+ 9*gprsize]
%define r7m [rsp+8*mmsize+10*gprsize]
%define r8m [rsp+8*mmsize+11*gprsize]
%define r9m [rsp+8*mmsize+12*gprsize]
%define r10m [rsp+8*mmsize+13*gprsize]
%define r11m [rsp+8*mmsize+14*gprsize]
%define r12m [rsp+8*mmsize+15*gprsize]
%define r0m [rsp+7*mmsize+ 3*gprsize]
%define r1m [rsp+7*mmsize+ 4*gprsize]
%define r2m [rsp+7*mmsize+ 5*gprsize]
%define r3m [rsp+7*mmsize+ 6*gprsize]
%define r4m [rsp+7*mmsize+ 7*gprsize]
%define r5m [rsp+7*mmsize+ 8*gprsize]
%define r6m [rsp+7*mmsize+ 9*gprsize]
%define r7m [rsp+7*mmsize+10*gprsize]
%define r8m [rsp+7*mmsize+11*gprsize]
%define r9m [rsp+7*mmsize+12*gprsize]
%define r10m [rsp+7*mmsize+13*gprsize]
%define r11m [rsp+7*mmsize+14*gprsize]
%define r12m [rsp+7*mmsize+15*gprsize]
%endif
LEA r5, pb_mask
%define base r5-pb_mask
mov r5m, r5
%else
cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
grain_lut, tmp, sby, luma, lstride, uv_pl, is_id
lea r8, [pb_mask]
%define base r8-pb_mask
%endif
mov r6d, [fg_dataq+FGData.scaling_shift]
pcmpeqw m2, m2
movd m3, [base+mul_bits+r6*2-14]
mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
lea tmpd, [r6d*2]
@ -2145,17 +2139,15 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
movd m5, [base+min+r6*2]
cmovne r6d, tmpd
movd m4, [base+max+r6*2]
psrldq m2, 14+%2
punpcklwd m3, m3
punpcklwd m5, m5
punpcklwd m4, m4
pshufd m3, m3, q0000
pshufd m5, m5, q0000
pshufd m4, m4, q0000
SCRATCH 2, 10, 0
SCRATCH 3, 11, 1
SCRATCH 4, 12, 2
SCRATCH 5, 13, 3
SCRATCH 3, 11, 0
SCRATCH 4, 12, 1
SCRATCH 5, 13, 2

cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
jne .csfl
@ -2177,8 +2169,8 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
punpcklwd m7, m7
pshufd m6, m6, q0000
pshufd m7, m7, q0000
SCRATCH 6, 14, 4
SCRATCH 7, 15, 5
SCRATCH 6, 14, 3
SCRATCH 7, 15, 4
%endif

mov sbyd, r8m
@ -2187,22 +2179,21 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
jz %%no_vertical_overlap
%if ARCH_X86_32
%if %2
movd m1, [base+pb_23_22]
mova m1, [base+pb_23_22_h]
%else
movd m1, [base+pb_27_17_17_27]
mova m1, [base+pb_27_17_17_27]
%endif
mova m0, [base+pw_1024]
%else
%if %2
movd m1, [pb_23_22]
mova m1, [pb_23_22_h]
%else
movd m1, [pb_27_17_17_27]
mova m1, [pb_27_17_17_27]
%endif
mova m0, [pw_1024]
%endif
pshufd m1, m1, q0000
SCRATCH 0, 8, 6
SCRATCH 1, 9, 7
SCRATCH 0, 8, 5
SCRATCH 1, 9, 6
test sbyd, sbyd
jnz %%vertical_overlap
; fall-through
@ -2347,16 +2338,13 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \

; scaling[luma_src]
%if ARCH_X86_32
vpgatherdw m7, m4, scalingq, r0, r5
vpgatherdw m5, m6, scalingq, r0, r5
vpgatherdw m7, m4, scalingq-1, r0, r5
vpgatherdw m5, m6, scalingq-1, r0, r5
%else
vpgatherdw m7, m4, scalingq, r12, r2
vpgatherdw m5, m6, scalingq, r12, r2
vpgatherdw m7, m4, scalingq-1, r12, r2
vpgatherdw m5, m6, scalingq-1, r12, r2
%endif
pcmpeqw m1, m1
psrlw m1, 8
pand m7, m1
pand m5, m1
REPX {psrlw x, 8}, m7, m5

; unpack chroma_source
punpckhbw m1, m0, m2
@ -2426,7 +2414,7 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
%if %2 == 0
; adjust top_offxy
%if ARCH_X86_32
add dword [rsp+8*mmsize+1*gprsize], 16
add dword [rsp+7*mmsize+1*gprsize], 16
%else
add r11d, 16
%endif
@ -2450,9 +2438,9 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
%if ARCH_X86_32
%if %2
lea r6, [offxyd+16]
mov [rsp+8*mmsize+0*gprsize], r6
mov [rsp+7*mmsize+0*gprsize], r6
%else
mov [rsp+8*mmsize+0*gprsize], offxyd
mov [rsp+7*mmsize+0*gprsize], offxyd
%endif

DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut
@ -2558,36 +2546,31 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \

; scaling[luma_src]
%if ARCH_X86_32
vpgatherdw m7, m4, scalingq, r0, r5
vpgatherdw m5, m6, scalingq, r0, r5
vpgatherdw m7, m4, scalingq-1, r0, r5
vpgatherdw m5, m6, scalingq-1, r0, r5
%else
vpgatherdw m7, m4, scalingq, r12, r2
vpgatherdw m5, m6, scalingq, r12, r2
vpgatherdw m7, m4, scalingq-1, r12, r2
vpgatherdw m5, m6, scalingq-1, r12, r2
%endif
pcmpeqw m1, m1
psrlw m1, 8
pand m7, m1
pand m5, m1
REPX {psrlw x, 8}, m7, m5

; unpack chroma_source
punpckhbw m1, m0, m2
punpcklbw m0, m2 ; m0-1: src as word

; grain = grain_lut[offy+y][offx+x]
movu m3, [grain_lutq+offxyq+ 0]
movu m4, [grain_lutq+offxyq+ 0]
%if ARCH_X86_32
mov r0, [rsp+8*mmsize+0*gprsize]
movd m4, [grain_lutq+r0+ 0]
mov r0, [rsp+7*mmsize+0*gprsize]
movd m2, [grain_lutq+r0+ 0]
%else
movd m4, [grain_lutq+left_offxyq+ 0]
movd m2, [grain_lutq+left_offxyq+ 0]
%endif
punpcklbw m2, m4, m3
pmaddubsw m4, m9, m2
pmulhrsw m4, m8
packsswb m4, m4
pand m4, m10
pandn m2, m10, m3
por m3, m4, m2
punpcklbw m2, m4
pmaddubsw m3, m9, m2
pmulhrsw m3, m8
packsswb m3, m3
shufps m3, m4, q3210
pxor m4, m4
pcmpgtb m4, m3
punpcklbw m2, m3, m4
@ -2652,7 +2635,7 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
xor dword r8m, 4
; adjust top_offxyd
%if ARCH_X86_32
add dword [rsp+8*mmsize+1*gprsize], 16
add dword [rsp+7*mmsize+1*gprsize], 16
%else
add r11d, 16
%endif
@ -2780,7 +2763,7 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
movzx top_offxyd, offxyw
shr offxyd, 16
%if ARCH_X86_32
mov [rsp+8*mmsize+1*gprsize], top_offxyd
mov [rsp+7*mmsize+1*gprsize], top_offxyd

DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
%endif
@ -2790,9 +2773,11 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
mov grain_lutq, grain_lutmp
%if ARCH_X86_32
mov r5, r5m
mova m1, [base+pb_27_17]
%endif
%if %3
mova m1, [PIC_ptr(pb_23_22)]
%else
mova m1, [pb_27_17]
mova m1, [PIC_ptr(pb_27_17)]
%endif
%%loop_y_v_overlap:
%if ARCH_X86_32
@ -2848,34 +2833,26 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \

; scaling[luma_src]
%if ARCH_X86_32
vpgatherdw m7, m4, scalingq, r0, r5
vpgatherdw m5, m6, scalingq, r0, r5
vpgatherdw m7, m4, scalingq-1, r0, r5
vpgatherdw m5, m6, scalingq-1, r0, r5
%else
vpgatherdw m7, m4, scalingq, r12, r2
vpgatherdw m5, m6, scalingq, r12, r2
vpgatherdw m7, m4, scalingq-1, r12, r2
vpgatherdw m5, m6, scalingq-1, r12, r2
%endif
pcmpeqw m4, m4
psrlw m4, 8
pand m7, m4
pand m5, m4
REPX {psrlw x, 8}, m7, m5

; grain = grain_lut[offy+y][offx+x]
movu m3, [grain_lutq+offxyq]
%if ARCH_X86_32
mov r0, [rsp+8*mmsize+1*gprsize]
mov r0, [rsp+7*mmsize+1*gprsize]
movu m4, [grain_lutq+r0]
%else
movu m4, [grain_lutq+top_offxyq]
%endif
punpckhbw m6, m4, m3
punpcklbw m4, m3
%if %3
pmaddubsw m2, m9, m6
pmaddubsw m3, m9, m4
%else
pmaddubsw m2, m1, m6
pmaddubsw m3, m1, m4
%endif
pmulhrsw m2, m8
pmulhrsw m3, m8
packsswb m3, m2
@ -2928,10 +2905,8 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
btc hd, 16
%if ARCH_X86_32
mov r5, r5m
mova m1, [base+pb_17_27]
%else
mova m1, [pb_17_27]
%endif
mova m1, [PIC_ptr(pb_17_27)]
jnc %%loop_y_v_overlap
%endif
jmp %%loop_y
@ -2963,7 +2938,7 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
; h+v overlap
%else
%if ARCH_X86_32
add dword [rsp+8*mmsize+1*gprsize], 16
add dword [rsp+7*mmsize+1*gprsize], 16
%else
add top_offxyd, 16
%endif
@ -2976,15 +2951,15 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
%if ARCH_X86_32
DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused

mov r6, [rsp+8*mmsize+1*gprsize]
mov r6, [rsp+7*mmsize+1*gprsize]
%if %2
lea r0, [r3d+16]
add r6, 16
mov [rsp+8*mmsize+0*gprsize], r0 ; left_offxy
mov [rsp+7*mmsize+0*gprsize], r0 ; left_offxy
%else
mov [rsp+8*mmsize+0*gprsize], r3 ; left_offxy
mov [rsp+7*mmsize+0*gprsize], r3 ; left_offxy
%endif
mov [rsp+8*mmsize+2*gprsize], r6 ; topleft_offxy
mov [rsp+7*mmsize+2*gprsize], r6 ; topleft_offxy

DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused

@ -3048,18 +3023,55 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
movzx top_offxyd, offxyw
shr offxyd, 16
%if ARCH_X86_32
mov [rsp+8*mmsize+1*gprsize], top_offxyd
mov [rsp+7*mmsize+1*gprsize], top_offxyd
%endif

mov hd, r7m
mov grain_lutq, grain_lutmp
%if ARCH_X86_32
mov r5, r5m
mova m3, [base+pb_27_17]
%endif
%if %3
mova m3, [PIC_ptr(pb_23_22)]
%else
mova m3, [pb_27_17]
mova m3, [PIC_ptr(pb_27_17)]
%endif
%%loop_y_hv_overlap:
; grain = grain_lut[offy+y][offx+x]
%if ARCH_X86_32
mov r0, [rsp+7*mmsize+2*gprsize] ; topleft_offxy
mov r5, [rsp+7*mmsize+1*gprsize] ; top_offxy
movd m1, [grain_lutq+r0]
mov r0, [rsp+7*mmsize+0*gprsize] ; left_offxy
%else
movd m1, [grain_lutq+topleft_offxyq]
%endif
movu m2, [grain_lutq+offxyq]
%if ARCH_X86_32
movu m6, [grain_lutq+r5]
movd m4, [grain_lutq+r0]
%else
movu m6, [grain_lutq+top_offxyq]
movd m4, [grain_lutq+left_offxyq]
%endif
; do h interpolation first (so top | top/left -> top, left | cur -> cur)
punpcklbw m1, m6
punpcklbw m4, m2
pmaddubsw m0, m9, m1
pmaddubsw m1, m9, m4
REPX {pmulhrsw x, m8}, m0, m1
packsswb m0, m1
shufps m4, m0, m2, q3232
shufps m0, m6, q3210
; followed by v interpolation (top | cur -> cur)
punpcklbw m2, m0, m4
punpckhbw m0, m4
pmaddubsw m4, m3, m0
pmaddubsw m1, m3, m2
pmulhrsw m4, m8
pmulhrsw m1, m8
packsswb m1, m4

; src
%if ARCH_X86_32
DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
@ -3116,69 +3128,20 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \

; scaling[src]
%if ARCH_X86_32
vpgatherdw m7, m4, scalingq, r0, r5
vpgatherdw m5, m6, scalingq, r0, r5
vpgatherdw m7, m4, scalingq-1, r0, r5
vpgatherdw m5, m6, scalingq-1, r0, r5
%else
movd m1, [grain_lutq+topleft_offxyq]
%if %3
vpgatherdw m7, m4, scalingq, r2, r12
vpgatherdw m5, m6, scalingq, r2, r12
vpgatherdw m7, m4, scalingq-1, r2, r12
vpgatherdw m5, m6, scalingq-1, r2, r12
%else
vpgatherdw m7, m4, scalingq, r2, r13
vpgatherdw m5, m6, scalingq, r2, r13
vpgatherdw m7, m4, scalingq-1, r2, r13
vpgatherdw m5, m6, scalingq-1, r2, r13
%endif
%endif
pcmpeqw m2, m2
psrlw m2, 8
pand m7, m2
pand m5, m2
REPX {psrlw x, 8}, m7, m5

; grain = grain_lut[offy+y][offx+x]
%if ARCH_X86_32
mov r0, [rsp+8*mmsize+2*gprsize] ; topleft_offxy
mov r5, [rsp+8*mmsize+1*gprsize] ; top_offxy
movd m1, [grain_lutq+r0]
mov r0, [rsp+8*mmsize+0*gprsize] ; left_offxy
%endif
movu m2, [grain_lutq+offxyq]
%if ARCH_X86_32
movu m6, [grain_lutq+r5]
movd m4, [grain_lutq+r0]
%else
movu m6, [grain_lutq+top_offxyq]
movd m4, [grain_lutq+left_offxyq]
%endif
; do h interpolation first (so top | top/left -> top, left | cur -> cur)
punpcklbw m1, m6
punpcklbw m4, m2
%if %2
punpcklwd m4, m1
%else
punpckldq m4, m1
%endif
pmaddubsw m1, m9, m4
pmulhrsw m1, m8
packsswb m1, m1
pandn m4, m10, m2
pandn m2, m10, m6
psrldq m6, m1, 2-%2
pand m1, m10
pand m6, m10
por m4, m1
por m2, m6
; followed by v interpolation (top | cur -> cur)
punpckhbw m1, m2, m4
punpcklbw m2, m4
%if %3
pmaddubsw m4, m9, m1
pmaddubsw m1, m9, m2
%else
pmaddubsw m4, m3, m1
pmaddubsw m1, m3, m2
%endif
pmulhrsw m4, m8
pmulhrsw m1, m8
packsswb m1, m4
; unpack grain
pxor m4, m4
pcmpgtb m4, m1
punpcklbw m2, m1, m4
@ -3229,10 +3192,8 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
jle %%end_y_hv_overlap
%if ARCH_X86_32
mov r5, r5m
mova m3, [base+pb_17_27]
%else
mova m3, [pb_17_27]
%endif
mova m3, [PIC_ptr(pb_17_27)]
btc hd, 16
jnc %%loop_y_hv_overlap
%if ARCH_X86_64
@ -3268,7 +3229,7 @@ cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
jmp %%loop_x_hv_overlap
%else
%if ARCH_X86_32
add dword [rsp+8*mmsize+1*gprsize], 16
add dword [rsp+7*mmsize+1*gprsize], 16
%else
add top_offxyd, 16
%endif
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,4 +1,4 @@
; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018-2021, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
@ -141,7 +141,7 @@ pw_512: times 2 dw 512

%macro JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - 2*4)
%xdefine %%base mangle(private_prefix %+ _%1_%2)
%xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
%%table:
%rep %0 - 2
dd %%base %+ .%3 - (%%table - 2*4)
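The (%%table - 2*4) bias is a small indexing trick: these tables are indexed by tzcnt of the block width, whose smallest value is 4 (tzcnt = 2), so pre-subtracting two 4-byte entries lets the tzcnt result index the table directly. A sketch of the same idea in C, names illustrative:

    /* widths 4..64 give tzcnt(w) = 2..6; bias the base by two entries
     * so the lookup needs no subtraction, mirroring (%%table - 2*4). */
    static const int entries[5]; /* .w4 .. .w64 offsets */
    #define TABLE_BASE (entries - 2)
    /* lookup: TABLE_BASE[tzcnt(width)] */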
@ -178,7 +178,7 @@ cextern filter_intra_taps
SECTION .text

INIT_YMM avx2
cglobal ipred_dc_top, 3, 7, 6, dst, stride, tl, w, h
cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h
lea r5, [ipred_dc_left_avx2_table]
tzcnt wd, wm
inc tlq
@ -196,7 +196,7 @@ cglobal ipred_dc_top, 3, 7, 6, dst, stride, tl, w, h
add wq, r5
jmp r6

cglobal ipred_dc_left, 3, 7, 6, dst, stride, tl, w, h, stride3
cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
mov hd, hm ; zero upper half
tzcnt r6d, hd
sub tlq, hq
@ -235,7 +235,7 @@ cglobal ipred_dc_left, 3, 7, 6, dst, stride, tl, w, h, stride3
mova m1, m0
jmp wq

cglobal ipred_dc, 3, 7, 6, dst, stride, tl, w, h, stride3
cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
movifnidn hd, hm
movifnidn wd, wm
tzcnt r6d, hd
@ -446,7 +446,7 @@ ALIGN function_align
jg .s64
RET

cglobal ipred_dc_128, 2, 7, 6, dst, stride, tl, w, h, stride3
cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
lea r5, [ipred_dc_splat_avx2_table]
tzcnt wd, wm
movifnidn hd, hm
@ -457,7 +457,7 @@ cglobal ipred_dc_128, 2, 7, 6, dst, stride, tl, w, h, stride3
lea stride3q, [strideq*3]
jmp wq

cglobal ipred_v, 3, 7, 6, dst, stride, tl, w, h, stride3
cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
lea r5, [ipred_dc_splat_avx2_table]
tzcnt wd, wm
movu m0, [tlq+ 1]
@ -486,7 +486,7 @@ ALIGN function_align
%endmacro

INIT_XMM avx2
cglobal ipred_h, 3, 6, 4, dst, stride, tl, w, h, stride3
cglobal ipred_h_8bpc, 3, 6, 4, dst, stride, tl, w, h, stride3
lea r5, [ipred_h_avx2_table]
tzcnt wd, wm
movifnidn hd, hm
@ -543,7 +543,7 @@ INIT_YMM avx2
vpblendvb m0, m5, m0, m1
%endmacro

cglobal ipred_paeth, 3, 6, 9, dst, stride, tl, w, h
cglobal ipred_paeth_8bpc, 3, 6, 9, dst, stride, tl, w, h
%define base r5-ipred_paeth_avx2_table
lea r5, [ipred_paeth_avx2_table]
tzcnt wd, wm
@ -677,7 +677,7 @@ ALIGN function_align
packuswb m0, m1
%endmacro

cglobal ipred_smooth_v, 3, 7, 0, dst, stride, tl, w, h, weights
cglobal ipred_smooth_v_8bpc, 3, 7, 0, dst, stride, tl, w, h, weights
%define base r6-ipred_smooth_v_avx2_table
lea r6, [ipred_smooth_v_avx2_table]
tzcnt wd, wm
@ -835,7 +835,7 @@ ALIGN function_align
ALLOC_STACK %1, %3
%endmacro

cglobal ipred_smooth_h, 3, 7, 0, dst, stride, tl, w, h
cglobal ipred_smooth_h_8bpc, 3, 7, 0, dst, stride, tl, w, h
%define base r6-ipred_smooth_h_avx2_table
lea r6, [ipred_smooth_h_avx2_table]
mov wd, wm
@ -1045,7 +1045,7 @@ ALIGN function_align
packuswb m0, m1
%endmacro

cglobal ipred_smooth, 3, 7, 0, dst, stride, tl, w, h, v_weights
cglobal ipred_smooth_8bpc, 3, 7, 0, dst, stride, tl, w, h, v_weights
%define base r6-ipred_smooth_avx2_table
lea r6, [ipred_smooth_avx2_table]
mov wd, wm
@ -1315,7 +1315,7 @@ ALIGN function_align
sub r3, hq
ret

cglobal ipred_z1, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
cglobal ipred_z1_8bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
%assign org_stack_offset stack_offset
lea r6, [ipred_z1_avx2_table]
tzcnt wd, wm
@ -2144,7 +2144,7 @@ ALIGN function_align
.w64_end:
RET

cglobal ipred_z2, 3, 10, 16, 224, dst, stride, tl, w, h, angle, dx, dy
cglobal ipred_z2_8bpc, 3, 10, 16, 224, dst, stride, tl, w, h, angle, dx, dy
%define base r9-z_filter_t0
lea r9, [ipred_z2_avx2_table]
tzcnt wd, wm
@ -3000,7 +3000,7 @@ ALIGN function_align
movu [rsp+97], m0
jmp .w32_filter_above

cglobal ipred_z3, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase
cglobal ipred_z3_8bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase
%assign org_stack_offset stack_offset
lea r6, [ipred_z3_avx2_table]
tzcnt hd, hm
@ -4211,7 +4211,7 @@ ALIGN function_align
; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___
; 5 8 8 i

cglobal ipred_filter, 3, 7, 0, dst, stride, tl, w, h, filter
cglobal ipred_filter_8bpc, 3, 7, 0, dst, stride, tl, w, h, filter
%define base r6-ipred_filter_avx2_table
lea r6, [filter_intra_taps]
tzcnt wd, wm
@ -4435,7 +4435,7 @@ DECLARE_REG_TMP 7
paddw m%1, m0
%endmacro

cglobal ipred_cfl_top, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
lea t0, [ipred_cfl_left_avx2_table]
tzcnt wd, wm
inc tlq
@ -4454,7 +4454,7 @@ cglobal ipred_cfl_top, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
movifnidn acq, acmp
jmp r6

cglobal ipred_cfl_left, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
mov hd, hm ; zero upper half
tzcnt r6d, hd
sub tlq, hq
@ -4488,7 +4488,7 @@ cglobal ipred_cfl_left, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
vpbroadcastw m0, xm0
jmp wq

cglobal ipred_cfl, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
movifnidn hd, hm
movifnidn wd, wm
tzcnt r6d, hd
@ -4692,7 +4692,7 @@ ALIGN function_align
jg .s32_loop
RET

cglobal ipred_cfl_128, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
lea t0, [ipred_cfl_splat_avx2_table]
tzcnt wd, wm
movifnidn hd, hm
@ -4702,7 +4702,7 @@ cglobal ipred_cfl_128, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
movifnidn acq, acmp
jmp wq

cglobal ipred_cfl_ac_420, 4, 9, 5, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
cglobal ipred_cfl_ac_420_8bpc, 4, 9, 5, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
movifnidn hpadd, hpadm
movifnidn wd, wm
mov hd, hm
@ -4883,7 +4883,7 @@ cglobal ipred_cfl_ac_420, 4, 9, 5, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
jg .sub_loop
RET

cglobal ipred_cfl_ac_422, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
cglobal ipred_cfl_ac_422_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
movifnidn hpadd, hpadm
movifnidn wd, wm
mov hd, hm
@ -5076,7 +5076,7 @@ cglobal ipred_cfl_ac_422, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
jg .sub_loop
RET

cglobal ipred_cfl_ac_444, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
cglobal ipred_cfl_ac_444_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
movifnidn hpadd, hpadm
movifnidn wd, wm
mov hd, hm
@ -5306,7 +5306,7 @@ cglobal ipred_cfl_ac_444, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
jg .sub_loop
RET

cglobal pal_pred, 4, 6, 5, dst, stride, pal, idx, w, h
cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
vbroadcasti128 m4, [palq]
lea r2, [pal_pred_avx2_table]
tzcnt wd, wm
@ -1,5 +1,5 @@
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018-2021, VideoLAN and dav1d authors
 * Copyright © 2018, Two Orioles, LLC
 * All rights reserved.
 *
@ -28,19 +28,11 @@
#include "src/cpu.h"
#include "src/ipred.h"

#if BITDEPTH == 8
#define decl_fn(type, name) \
decl_##type##_fn(dav1d_##name##_ssse3); \
decl_##type##_fn(dav1d_##name##_avx2)
decl_##type##_fn(BF(dav1d_##name, ssse3)); \
decl_##type##_fn(BF(dav1d_##name, avx2))
#define init_fn(type0, type1, name, suffix) \
c->type0[type1] = dav1d_##name##_##suffix
#else
#define decl_fn(type, name) \
decl_##type##_fn(dav1d_##name##_16bpc_ssse3); \
decl_##type##_fn(dav1d_##name##_16bpc_avx2)
#define init_fn(type0, type1, name, suffix) \
c->type0[type1] = dav1d_##name##_16bpc_##suffix
#endif
c->type0[type1] = BF(dav1d_##name, suffix)

#define init_angular_ipred_fn(type, name, suffix) \
init_fn(intra_pred, type, name, suffix)
@ -80,7 +72,6 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c

if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;

#if BITDEPTH == 8
init_angular_ipred_fn(DC_PRED, ipred_dc, ssse3);
init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, ssse3);
init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, ssse3);
@ -102,8 +93,7 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c
init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I422 - 1, ipred_cfl_ac_422, ssse3);
init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, ssse3);

c->pal_pred = dav1d_pal_pred_ssse3;
#endif
c->pal_pred = BF(dav1d_pal_pred, ssse3);

#if ARCH_X86_64
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
@ -130,12 +120,8 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c
init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I420 - 1, ipred_cfl_ac_420, avx2);
init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I422 - 1, ipred_cfl_ac_422, avx2);
#if BITDEPTH == 8
init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, avx2);

c->pal_pred = dav1d_pal_pred_avx2;
#else
c->pal_pred = dav1d_pal_pred_16bpc_avx2;
#endif
c->pal_pred = BF(dav1d_pal_pred, avx2);
#endif
}
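BF() is what deletes the duplicated #if BITDEPTH == 8 blocks above: it splices the bitdepth and the ISA suffix onto a symbol in one place. A sketch of what it presumably expands to (the real definition lives elsewhere in the dav1d tree):

    /* Sketch only; dav1d defines the real BF() in its common headers. */
    #if BITDEPTH == 8
    #define BF(name, suffix) name##_8bpc_##suffix
    #else
    #define BF(name, suffix) name##_16bpc_##suffix
    #endif
    /* BF(dav1d_pal_pred, ssse3) -> dav1d_pal_pred_8bpc_ssse3 */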
@ -1,4 +1,4 @@
; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018-2021, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
@ -74,7 +74,7 @@ pd_32768 : times 1 dd 32768

%macro JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - 2*4)
%xdefine %%base mangle(private_prefix %+ _%1_%2)
%xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
%%table:
%rep %0 - 2
dd %%base %+ .%3 - (%%table - 2*4)
@ -156,7 +156,7 @@ SECTION .text
%endmacro

INIT_XMM ssse3
cglobal ipred_h, 3, 6, 2, dst, stride, tl, w, h, stride3
cglobal ipred_h_8bpc, 3, 6, 2, dst, stride, tl, w, h, stride3
LEA r5, ipred_h_ssse3_table
tzcnt wd, wm
movifnidn hd, hm
@ -179,7 +179,7 @@ cglobal ipred_h, 3, 6, 2, dst, stride, tl, w, h, stride3
;int dav1d_ipred_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
cglobal ipred_v, 3, 7, 6, dst, stride, tl, w, h, stride3
cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
LEA r5, ipred_dc_splat_ssse3_table
tzcnt wd, wm
movu m0, [tlq+ 1]
@ -196,7 +196,7 @@ cglobal ipred_v, 3, 7, 6, dst, stride, tl, w, h, stride3
;int dav1d_ipred_dc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
cglobal ipred_dc, 3, 7, 6, dst, stride, tl, w, h, stride3
cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
movifnidn hd, hm
movifnidn wd, wm
tzcnt r6d, hd
@ -438,7 +438,7 @@ ALIGN function_align
;int dav1d_ipred_dc_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
cglobal ipred_dc_left, 3, 7, 6, dst, stride, tl, w, h, stride3
cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
LEA r5, ipred_dc_left_ssse3_table
mov hd, hm ; zero upper half
tzcnt r6d, hd
@ -488,7 +488,7 @@ cglobal ipred_dc_left, 3, 7, 6, dst, stride, tl, w, h, stride3
;int dav1d_ipred_dc_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
cglobal ipred_dc_128, 2, 7, 6, dst, stride, tl, w, h, stride3
cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
LEA r5, ipred_dc_splat_ssse3_table
tzcnt wd, wm
movifnidn hd, hm
@ -505,7 +505,7 @@ cglobal ipred_dc_128, 2, 7, 6, dst, stride, tl, w, h, stride3
;int dav1d_ipred_dc_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
cglobal ipred_dc_top, 3, 7, 6, dst, stride, tl, w, h
cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h
LEA r5, ipred_dc_left_ssse3_table
tzcnt wd, wm
inc tlq
@ -540,7 +540,7 @@ cglobal ipred_dc_top, 3, 7, 6, dst, stride, tl, w, h
packuswb m6, m0
%endmacro

cglobal ipred_smooth_v, 3, 7, 7, dst, stride, tl, w, h, weights
cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights
%define base r6-ipred_smooth_v_ssse3_table
LEA r6, ipred_smooth_v_ssse3_table
tzcnt wd, wm
@ -701,7 +701,7 @@ ALIGN function_align
;int dav1d_ipred_smooth_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
cglobal ipred_smooth_h, 3, 7, 8, dst, stride, tl, w, h
cglobal ipred_smooth_h_8bpc, 3, 7, 8, dst, stride, tl, w, h
%define base r6-ipred_smooth_h_ssse3_table
LEA r6, ipred_smooth_h_ssse3_table
mov wd, wm
@ -958,7 +958,7 @@ ALIGN function_align
mova m5, [rsp+16*%12] ; recovery
%endmacro

cglobal ipred_smooth, 3, 7, 8, -13*16, dst, stride, tl, w, h, v_weights
cglobal ipred_smooth_8bpc, 3, 7, 8, -13*16, dst, stride, tl, w, h, v_weights
%define base r6-ipred_smooth_ssse3_table
mov wd, wm
mov hd, hm
@ -1194,7 +1194,7 @@ ALIGN function_align
;int dav1d_pal_pred_ssse3(pixel *dst, const ptrdiff_t stride, const uint16_t *const pal,
; const uint8_t *idx, const int w, const int h);
;---------------------------------------------------------------------------------------
cglobal pal_pred, 4, 6, 5, dst, stride, pal, idx, w, h
cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
mova m4, [palq]
LEA r2, pal_pred_ssse3_table
tzcnt wd, wm
@ -1295,7 +1295,7 @@ DECLARE_REG_TMP 7
DECLARE_REG_TMP 5
%endif

cglobal ipred_cfl, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
movifnidn wd, wm
movifnidn hd, hm
tzcnt r6d, hd
@ -1535,7 +1535,7 @@ ALIGN function_align
;void dav1d_ipred_cfl_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int16_t *ac, const int alpha);
;---------------------------------------------------------------------------------------
cglobal ipred_cfl_left, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
mov hd, hm ; zero upper half
tzcnt r6d, hd
sub tlq, hq
@ -1576,7 +1576,7 @@ cglobal ipred_cfl_left, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
;void dav1d_ipred_cfl_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int16_t *ac, const int alpha);
;---------------------------------------------------------------------------------------
cglobal ipred_cfl_top, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
LEA t0, ipred_cfl_left_ssse3_table
tzcnt wd, wm
inc tlq
@ -1600,7 +1600,7 @@ cglobal ipred_cfl_top, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
;void dav1d_ipred_cfl_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int16_t *ac, const int alpha);
;---------------------------------------------------------------------------------------
cglobal ipred_cfl_128, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
tzcnt wd, wm
movifnidn hd, hm
LEA r6, ipred_cfl_splat_ssse3_table
@ -1615,11 +1615,11 @@ cglobal ipred_cfl_128, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
%endmacro

%if ARCH_X86_64
cglobal ipred_cfl_ac_420, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak
cglobal ipred_cfl_ac_420_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak
DECLARE_REG_TMP 7
movddup m2, [pb_2]
%else
cglobal ipred_cfl_ac_420, 4, 7, 7, ac, y, stride, wpad, hpad, w, h
cglobal ipred_cfl_ac_420_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h
DECLARE_REG_TMP 4
%define ac_bakq acmp
mov t0d, 0x02020202
@ -1855,10 +1855,10 @@ DECLARE_REG_TMP 4
RET

%if ARCH_X86_64
cglobal ipred_cfl_ac_422, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak
cglobal ipred_cfl_ac_422_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak
movddup m2, [pb_4]
%else
cglobal ipred_cfl_ac_422, 4, 7, 7, ac, y, stride, wpad, hpad, w, h
cglobal ipred_cfl_ac_422_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h
mov t0d, 0x04040404
movd m2, t0d
pshufd m2, m2, q0000
@ -2128,10 +2128,10 @@ cglobal ipred_cfl_ac_422, 4, 7, 7, ac, y, stride, wpad, hpad, w, h
RET

%if ARCH_X86_64
cglobal ipred_cfl_ac_444, 4, 8, 7, -4*16, ac, y, stride, wpad, hpad, w, h, ac_bak
cglobal ipred_cfl_ac_444_8bpc, 4, 8, 7, -4*16, ac, y, stride, wpad, hpad, w, h, ac_bak
movddup m2, [pb_4]
%else
cglobal ipred_cfl_ac_444, 4, 7, 7, -5*16, ac, y, stride, wpad, hpad, w, h
cglobal ipred_cfl_ac_444_8bpc, 4, 7, 7, -5*16, ac, y, stride, wpad, hpad, w, h
%define ac_bakq [rsp+16*4]
mov t0d, 0x04040404
movd m2, t0d
@ -2769,7 +2769,7 @@ cglobal ipred_cfl_ac_444, 4, 7, 7, -5*16, ac, y, stride, wpad, hpad, w, h
BLEND m1, m0, m5
%endmacro

cglobal ipred_paeth, 3, 6, 8, -7*16, dst, stride, tl, w, h
cglobal ipred_paeth_8bpc, 3, 6, 8, -7*16, dst, stride, tl, w, h
%define base r5-ipred_paeth_ssse3_table
tzcnt wd, wm
movifnidn hd, hm
@ -2937,7 +2937,7 @@ ALIGN function_align
packuswb m%1, m%1
%endmacro

cglobal ipred_filter, 3, 7, 8, dst, stride, tl, w, h, filter
cglobal ipred_filter_8bpc, 3, 7, 8, dst, stride, tl, w, h, filter
%define base r6-$$
LEA r6, $$
tzcnt wd, wm
@ -105,32 +105,32 @@ cextern pw_16384
cextern pw_2896x8
cextern pd_2048

cextern idct_4x8_internal_avx2.main
cextern idct_4x16_internal_avx2.main
cextern idct_8x8_internal_avx2.main
cextern idct_8x16_internal_avx2.main
cextern idct_16x4_internal_avx2.main
cextern idct_16x8_internal_avx2.main
cextern idct_16x16_internal_avx2.main
cextern inv_txfm_add_dct_dct_8x32_avx2.main
cextern inv_txfm_add_dct_dct_8x32_avx2.main_fast
cextern inv_txfm_add_dct_dct_16x32_avx2.main_oddhalf
cextern inv_txfm_add_dct_dct_16x32_avx2.main_oddhalf_fast
cextern inv_txfm_add_dct_dct_16x64_avx2.main_part1
cextern inv_txfm_add_dct_dct_16x64_avx2.main_part2_internal
cextern idct_4x8_internal_8bpc_avx2.main
cextern idct_4x16_internal_8bpc_avx2.main
cextern idct_8x8_internal_8bpc_avx2.main
cextern idct_8x16_internal_8bpc_avx2.main
cextern idct_16x4_internal_8bpc_avx2.main
cextern idct_16x8_internal_8bpc_avx2.main
cextern idct_16x16_internal_8bpc_avx2.main
cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main
cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main_fast
cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf
cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf_fast
cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part1
cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part2_internal

cextern iadst_4x4_internal_avx2.main
cextern iadst_4x8_internal_avx2.main_pass2
cextern iadst_4x16_internal_avx2.main2
cextern iadst_8x4_internal_avx2.main
cextern iadst_8x8_internal_avx2.main_pass2
cextern iadst_8x16_internal_avx2.main
cextern iadst_8x16_internal_avx2.main_pass2_end
cextern iadst_16x4_internal_avx2.main
cextern iadst_16x8_internal_avx2.main
cextern iadst_16x8_internal_avx2.main_pass2_end
cextern iadst_16x16_internal_avx2.main
cextern iadst_16x16_internal_avx2.main_pass2_end
cextern iadst_4x4_internal_8bpc_avx2.main
cextern iadst_4x8_internal_8bpc_avx2.main_pass2
cextern iadst_4x16_internal_8bpc_avx2.main2
cextern iadst_8x4_internal_8bpc_avx2.main
cextern iadst_8x8_internal_8bpc_avx2.main_pass2
cextern iadst_8x16_internal_8bpc_avx2.main
cextern iadst_8x16_internal_8bpc_avx2.main_pass2_end
cextern iadst_16x4_internal_8bpc_avx2.main
cextern iadst_16x8_internal_8bpc_avx2.main
cextern iadst_16x8_internal_8bpc_avx2.main_pass2_end
cextern iadst_16x16_internal_8bpc_avx2.main
cextern iadst_16x16_internal_8bpc_avx2.main_pass2_end

SECTION .text
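These cextern renames are the cross-file face of the same scheme: the 16bpc transform code reuses the 8bpc second-pass routines, so every call m(...) and jmp m(...) site in the hunks below picks up the _8bpc tag in lockstep, e.g. call m(iadst_4x4_internal).main becoming call m(iadst_4x4_internal_8bpc).main.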
@ -384,7 +384,7 @@ cglobal iadst_4x4_internal_16bpc, 0, 7, 6, dst, stride, c, eob, tx2
.pass2:
lea rax, [deint_shuf+128]
vextracti128 xm1, m0, 1
call m(iadst_4x4_internal).main
call m(iadst_4x4_internal_8bpc).main
.end:
vpbroadcastd xm4, [pw_2048]
movq xm2, [dstq+strideq*0]
@ -457,7 +457,7 @@ cglobal iflipadst_4x4_internal_16bpc, 0, 7, 6, dst, stride, c, eob, tx2
.pass2:
lea rax, [deint_shuf+128]
vextracti128 xm1, m0, 1
call m(iadst_4x4_internal).main
call m(iadst_4x4_internal_8bpc).main
vpbroadcastd xm4, [pw_2048]
movq xm3, [dstq+strideq*1]
movhps xm3, [dstq+strideq*0]
@ -607,7 +607,7 @@ cglobal idct_4x8_internal_16bpc, 0, 7, 8, dst, stride, c, eob, tx2
punpckldq m0, m2 ; 0 1
vextracti128 xm2, m0, 1 ; 4 5
vextracti128 xm3, m1, 1 ; 6 7
call m(idct_4x8_internal).main
call m(idct_4x8_internal_8bpc).main
vpbroadcastd xm4, [pw_2048]
REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3
lea r3, [strideq*3]
@ -697,7 +697,7 @@ ALIGN function_align
vextracti128 xm3, m5, 1 ; 6 7
pshufd xm4, xm4, q1032 ; 1 0
pshufd xm5, xm5, q1032 ; 3 2
jmp m(iadst_4x8_internal).main_pass2
jmp m(iadst_4x8_internal_8bpc).main_pass2
ALIGN function_align
.main:
vbroadcasti128 m0, [cq+16*0]
@ -934,7 +934,7 @@ cglobal idct_4x16_internal_16bpc, 0, 7, 11, dst, stride, c, eob, tx2
vextracti128 xm3, m1, 1 ; 6 7
vextracti128 xm6, m4, 1 ; c d
vextracti128 xm7, m5, 1 ; e f
call m(idct_4x16_internal).main
call m(idct_4x16_internal_8bpc).main
vpbroadcastd m9, [pw_2048]
vinserti128 m0, m0, xm1, 1 ; 0 1 3 2
vinserti128 m1, m2, xm3, 1 ; 4 5 7 6
@ -1054,7 +1054,7 @@ ALIGN function_align
vinserti128 m0, xm3, 1 ; 0 3 2 1
vperm2i128 m3, m2, m4, 0x31 ; c f e d ; ????
vinserti128 m2, xm4, 1 ; b 8 9 a
call m(iadst_4x16_internal).main2
call m(iadst_4x16_internal_8bpc).main2
vpbroadcastd m5, [pw_2896x8]
paddsw m1, m2, m4
psubsw m2, m4
@ -1434,7 +1434,7 @@ ALIGN function_align
vinserti128 m0, xm2, 1
pshufb m0, m4
pshufb m1, m4
jmp m(iadst_8x4_internal).main
jmp m(iadst_8x4_internal_8bpc).main
ALIGN function_align
.main:
vpbroadcastd m1, [pd_2896]
@ -1636,7 +1636,7 @@ cglobal idct_8x8_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2
jmp tx2q
.pass2:
call .transpose_8x8_packed
call m(idct_8x8_internal).main
call m(idct_8x8_internal_8bpc).main
vpbroadcastd m12, [pw_2048]
vpermq m0, m0, q3120
vpermq m1, m1, q2031
@ -1754,7 +1754,7 @@ cglobal iadst_8x8_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2
call m(idct_8x8_internal_16bpc).transpose_8x8_packed
pshufd m4, m0, q1032
pshufd m5, m1, q1032
call m(iadst_8x8_internal).main_pass2
call m(iadst_8x8_internal_8bpc).main_pass2
vpbroadcastd m5, [pw_2048]
vpbroadcastd xm12, [pw_4096]
psubw m12, m5
@ -1814,7 +1814,7 @@ cglobal iflipadst_8x8_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2
call m(idct_8x8_internal_16bpc).transpose_8x8_packed
pshufd m4, m0, q1032
pshufd m5, m1, q1032
call m(iadst_8x8_internal).main_pass2
call m(iadst_8x8_internal_8bpc).main_pass2
vpbroadcastd m12, [pw_2048]
vpbroadcastd xm5, [pw_4096]
psubw m12, m5
@ -1971,7 +1971,7 @@ cglobal idct_8x16_internal_16bpc, 0, 7, 16, dst, stride, c, eob, tx2
jmp tx2q
.pass2:
call .transpose
call m(idct_8x16_internal).main
call m(idct_8x16_internal_8bpc).main
vpbroadcastd m12, [pw_2048]
REPX {vpermq x, x, q3120}, m0, m2, m4, m6
REPX {vpermq x, x, q2031}, m1, m3, m5, m7
@ -2167,8 +2167,8 @@ cglobal iadst_8x16_internal_16bpc, 0, 7, 16, dst, stride, c, eob, tx2
jmp tx2q
.pass2:
call m(idct_8x16_internal_16bpc).transpose
call m(iadst_8x16_internal).main
call m(iadst_8x16_internal).main_pass2_end
call m(iadst_8x16_internal_8bpc).main
call m(iadst_8x16_internal_8bpc).main_pass2_end
vpbroadcastd m8, [pw_2048]
vpbroadcastd xm12, [pw_4096]
REPX {vpermq x, x, q2031}, m0, m1, m2, m3
@ -2232,8 +2232,8 @@ cglobal iflipadst_8x16_internal_16bpc, 0, 7, 16, dst, stride, c, eob, tx2
jmp tx2q
.pass2:
call m(idct_8x16_internal_16bpc).transpose
call m(iadst_8x16_internal).main
call m(iadst_8x16_internal).main_pass2_end
call m(iadst_8x16_internal_8bpc).main
call m(iadst_8x16_internal_8bpc).main_pass2_end
vpbroadcastd m12, [pw_2048]
vpbroadcastd xm13, [pw_4096]
mova m11, m0
@ -2458,7 +2458,7 @@ cglobal idct_16x4_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2
.pass2:
call .transpose_4x16_packed
lea rax, [deint_shuf+128]
call m(idct_16x4_internal).main
call m(idct_16x4_internal_8bpc).main
.end:
vpbroadcastd m4, [pw_2048]
REPX {pmulhrsw x, m4}, m0, m1, m2, m3
@ -2517,7 +2517,7 @@ cglobal iadst_16x4_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2
.pass2:
call m(idct_16x4_internal_16bpc).transpose_4x16_packed
lea rax, [deint_shuf+128]
call m(iadst_16x4_internal).main
call m(iadst_16x4_internal_8bpc).main
jmp m(idct_16x4_internal_16bpc).end
ALIGN function_align
.main:
@ -2596,7 +2596,7 @@ cglobal iflipadst_16x4_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2
.pass2:
call m(idct_16x4_internal_16bpc).transpose_4x16_packed
lea rax, [deint_shuf+128]
call m(iadst_16x4_internal).main
call m(iadst_16x4_internal_8bpc).main
vpbroadcastd m4, [pw_2048]
pmulhrsw m5, m3, m4
pmulhrsw m6, m2, m4
@ -2712,7 +2712,7 @@ cglobal idct_16x8_internal_16bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
jmp tx2q
.pass2:
call .transpose
call m(idct_16x8_internal).main
call m(idct_16x8_internal_8bpc).main
vpbroadcastd m10, [pw_2048]
.end:
pmulhrsw m0, m10
@ -2827,8 +2827,8 @@ cglobal iadst_16x8_internal_16bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
jmp tx2q
.pass2:
call m(idct_16x8_internal_16bpc).transpose
call m(iadst_16x8_internal).main
call m(iadst_16x8_internal).main_pass2_end
call m(iadst_16x8_internal_8bpc).main
call m(iadst_16x8_internal_8bpc).main_pass2_end
vpbroadcastd m10, [pw_2048]
pxor m11, m11
psubw m11, m10
|
||||
|
@ -3039,8 +3039,8 @@ cglobal iflipadst_16x8_internal_16bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
|
|||
jmp m(iadst_16x8_internal_16bpc).pass1_end
|
||||
.pass2:
|
||||
call m(idct_16x8_internal_16bpc).transpose
|
||||
call m(iadst_16x8_internal).main
|
||||
call m(iadst_16x8_internal).main_pass2_end
|
||||
call m(iadst_16x8_internal_8bpc).main
|
||||
call m(iadst_16x8_internal_8bpc).main_pass2_end
|
||||
vpbroadcastd m10, [pw_2048]
|
||||
pxor m11, m11
|
||||
psubw m11, m10
|
||||
|
@ -3216,7 +3216,7 @@ cglobal idct_16x16_internal_16bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
|
|||
call .transpose
|
||||
lea rax, [pw_5+128]
|
||||
mova [rsp], m15
|
||||
call m(idct_16x16_internal).main
|
||||
call m(idct_16x16_internal_8bpc).main
|
||||
mova m1, [rsp+32*1]
|
||||
.end:
|
||||
call .write_16x16
|
||||
|
@ -3450,8 +3450,8 @@ cglobal iadst_16x16_internal_16bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
|
|||
call m(idct_16x16_internal_16bpc).transpose
|
||||
lea rax, [pw_5+128]
|
||||
mova [rsp], m15
|
||||
call m(iadst_16x16_internal).main
|
||||
call m(iadst_16x16_internal).main_pass2_end
|
||||
call m(iadst_16x16_internal_8bpc).main
|
||||
call m(iadst_16x16_internal_8bpc).main_pass2_end
|
||||
mova [rsp+32*0], m8
|
||||
mova [rsp+32*2], m12
|
||||
mova [rsp+32*3], m13
|
||||
|
@ -3582,8 +3582,8 @@ cglobal iflipadst_16x16_internal_16bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx
|
|||
call m(idct_16x16_internal_16bpc).transpose
|
||||
lea rax, [pw_5+128]
|
||||
mova [rsp], m15
|
||||
call m(iadst_16x16_internal).main
|
||||
call m(iadst_16x16_internal).main_pass2_end
|
||||
call m(iadst_16x16_internal_8bpc).main
|
||||
call m(iadst_16x16_internal_8bpc).main_pass2_end
|
||||
mova [rsp+32*3], m3
|
||||
mova [rsp+32*2], m2
|
||||
mova [rsp+32*0], m0
|
||||
|
@ -3740,7 +3740,7 @@ cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 0, dst, stride, c, eob
|
|||
vpbroadcastd m10, [pw_2048]
|
||||
lea rax, [deint_shuf+128]
|
||||
REPX {mova x, m4}, m5, m6, m7
|
||||
call m(inv_txfm_add_dct_dct_8x32).main_fast
|
||||
call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
|
||||
jmp .end
|
||||
.eob107:
|
||||
mova [rsp+32*3], m3
|
||||
|
@ -3778,7 +3778,7 @@ cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 0, dst, stride, c, eob
|
|||
lea rax, [deint_shuf+128]
|
||||
mova m11, [rsp+32*3] ; out13 out15
|
||||
vpbroadcastd m10, [pw_2048]
|
||||
call m(inv_txfm_add_dct_dct_8x32).main
|
||||
call m(inv_txfm_add_dct_dct_8x32_8bpc).main
|
||||
.end: ; [rsp+0*32] = m12
|
||||
vpbroadcastd m12, [pw_2048]
|
||||
mov cq, r4
|
||||
|
@ -4294,7 +4294,7 @@ cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 0, dst, stride, c, eob
|
|||
RET
|
||||
ALIGN function_align
|
||||
.pass2:
|
||||
call m(idct_16x8_internal).main
|
||||
call m(idct_16x8_internal_8bpc).main
|
||||
REPX {pmulhrsw x, m11}, m0, m1, m2, m3
|
||||
call m(idct_16x8_internal_16bpc).write_16x4_start
|
||||
pmulhrsw m0, m11, m4
|
||||
|
@ -4404,7 +4404,7 @@ cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 0, dst, stride, c, eob
|
|||
mova m3, [r4+32*3]
|
||||
.fast:
|
||||
lea rax, [pw_5+128]
|
||||
call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
|
||||
call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
|
||||
pxor m8, m8
|
||||
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
|
||||
jmp .idct16
|
||||
|
@ -4456,7 +4456,7 @@ cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 0, dst, stride, c, eob
|
|||
mova m6, [r4-32*2]
|
||||
mova m7, [r4-32*1]
|
||||
lea rax, [pw_5 + 128]
|
||||
call m(inv_txfm_add_dct_dct_16x32).main_oddhalf
|
||||
call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
|
||||
lea r3, [rsp+32*8]
|
||||
mova m8, [r3+32*0]
|
||||
mova m9, [r3+32*1]
|
||||
|
@ -4477,7 +4477,7 @@ cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 0, dst, stride, c, eob
|
|||
mova m6, [r3-32*2]
|
||||
mova m7, [r3-32*1]
|
||||
mova [rsp], m15
|
||||
call m(idct_16x16_internal).main
|
||||
call m(idct_16x16_internal_8bpc).main
|
||||
imul r2, strideq, 19
|
||||
lea r3, [strideq*3]
|
||||
add r2, dstq
|
||||
|
@ -4711,7 +4711,7 @@ cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 0, dst, stride, c, eob
|
|||
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp]
|
||||
lea rax, [pw_5+128]
|
||||
mov r7, dstq
|
||||
call m(idct_16x16_internal).main
|
||||
call m(idct_16x16_internal_8bpc).main
|
||||
call .write_16x16
|
||||
mova m0, [r5+32*3]
|
||||
mova m1, [r5+32*2]
|
||||
|
@ -4750,7 +4750,7 @@ cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 0, dst, stride, c, eob
|
|||
call .transpose_16x16
|
||||
lea rax, [pw_5+128]
|
||||
mov r7, dstq
|
||||
call m(idct_16x16_internal).main
|
||||
call m(idct_16x16_internal_8bpc).main
|
||||
call .write_16x16
|
||||
mova m0, [r5+32*3]
|
||||
mova m1, [r5+32*2]
|
||||
|
@ -4764,7 +4764,7 @@ cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 0, dst, stride, c, eob
|
|||
call .transpose_16x16
|
||||
.end:
|
||||
lea dstq, [r7+32]
|
||||
call m(idct_16x16_internal).main
|
||||
call m(idct_16x16_internal_8bpc).main
|
||||
call .write_16x16
|
||||
RET
|
||||
ALIGN function_align
|
||||
|
@ -5124,7 +5124,7 @@ ALIGN function_align
|
|||
mova m13, [r3+32*51] ; 27
|
||||
mova m14, [r3+32*53] ; 29
|
||||
mova m15, [r3+32*55] ; 31
|
||||
jmp m(inv_txfm_add_dct_dct_16x32).main_oddhalf
|
||||
jmp m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
|
||||
ALIGN function_align
|
||||
.pass2_evenhalf:
|
||||
mova m0, [r3+32* 0] ; 0
|
||||
|
@ -5144,7 +5144,7 @@ ALIGN function_align
|
|||
mova m14, [r3+32*52] ; 28
|
||||
mova m15, [r3+32*54] ; 30
|
||||
mova [rsp+gprsize], m15
|
||||
jmp m(idct_16x16_internal).main
|
||||
jmp m(idct_16x16_internal_8bpc).main
|
||||
|
||||
cglobal inv_txfm_add_identity_identity_32x32_16bpc, 4, 8, 8, dst, stride, c, eob
|
||||
%undef cmp
|
||||
|
@ -5300,7 +5300,7 @@ cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 0, dst, stride, c, eob
|
|||
pxor m8, m8
|
||||
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
|
||||
mova [rsp], m8
|
||||
call m(idct_16x16_internal).main
|
||||
call m(idct_16x16_internal_8bpc).main
|
||||
mova m1, [rsp+32*1]
|
||||
lea r4, [rsp+32*38]
|
||||
mova [r4-32*4], m0
|
||||
|
@ -5330,7 +5330,7 @@ cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 0, dst, stride, c, eob
|
|||
mova m7, [rsp+32*32] ; in30
|
||||
lea r5, [r4+32*16]
|
||||
add r4, 32*8
|
||||
call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
|
||||
call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
|
||||
mova m0, [rsp+32* 3] ; in1
|
||||
mova m1, [rsp+32*33] ; in31
|
||||
mova m2, [rsp+32*19] ; in17
|
||||
|
@ -5342,7 +5342,7 @@ cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 0, dst, stride, c, eob
|
|||
lea rax, [idct64_mul - 8]
|
||||
add r4, 32*16
|
||||
add r5, 32*32
|
||||
call m(inv_txfm_add_dct_dct_16x64).main_part1
|
||||
call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
|
||||
mova m0, [rsp+32* 7] ; in5
|
||||
mova m1, [rsp+32*29] ; in27
|
||||
mova m2, [rsp+32*23] ; in21
|
||||
|
@ -5354,7 +5354,7 @@ cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 0, dst, stride, c, eob
|
|||
add rax, 8
|
||||
add r4, 32*8
|
||||
sub r5, 32*8
|
||||
call m(inv_txfm_add_dct_dct_16x64).main_part1
|
||||
call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
|
||||
lea r8, [strideq*4]
|
||||
lea r9, [strideq*5]
|
||||
lea r3, [r9+strideq*1] ; stride*6
|
||||
|
@ -5449,7 +5449,7 @@ ALIGN function_align
|
|||
lea r2, [dstq+r7]
|
||||
.main_part2_pass2_loop:
|
||||
vpbroadcastd m14, [pw_m2896_2896]
|
||||
call m(inv_txfm_add_dct_dct_16x64).main_part2_internal
|
||||
call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_internal
|
||||
vpbroadcastd m14, [pw_2048]
|
||||
IDCT64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*8, r7*8
|
||||
IDCT64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*8, r7*8
|
||||
|
@ -5648,7 +5648,7 @@ cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 0, dst, stride, c, eob
|
|||
pxor m8, m8
|
||||
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
|
||||
mova [rsp], m8
|
||||
call m(idct_16x16_internal).main
|
||||
call m(idct_16x16_internal_8bpc).main
|
||||
mova m1, [rsp+32*1]
|
||||
lea r4, [rsp+32*70]
|
||||
mova [r4-32*4], m0
|
||||
|
@ -5678,7 +5678,7 @@ cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 0, dst, stride, c, eob
|
|||
mova m7, [r10+32*56] ; in30
|
||||
lea r5, [r4+32*16]
|
||||
add r4, 32*8
|
||||
call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
|
||||
call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
|
||||
mova m0, [r10+32* 3] ; in1
|
||||
mova m1, [r10+32*57] ; in31
|
||||
mova m2, [r10+32*35] ; in17
|
||||
|
@ -5690,7 +5690,7 @@ cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 0, dst, stride, c, eob
|
|||
lea rax, [idct64_mul - 8]
|
||||
add r4, 32*16
|
||||
add r5, 32*32
|
||||
call m(inv_txfm_add_dct_dct_16x64).main_part1
|
||||
call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
|
||||
mova m0, [r10+32* 7] ; in5
|
||||
mova m1, [r10+32*53] ; in27
|
||||
mova m2, [r10+32*39] ; in21
|
||||
|
@ -5702,7 +5702,7 @@ cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 0, dst, stride, c, eob
|
|||
add rax, 8
|
||||
add r4, 32*8
|
||||
sub r5, 32*8
|
||||
call m(inv_txfm_add_dct_dct_16x64).main_part1
|
||||
call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
|
||||
call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part2_pass2
|
||||
add r10, 32*8
|
||||
sub r4, 32*98 ; rsp+32*16
|
||||
|
@ -5877,7 +5877,7 @@ cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 0, dst, stride, c, eob
|
|||
mova m15, [r7+32*3]
|
||||
sub r7, 32*24
|
||||
mova [rsp], m15
|
||||
call m(idct_16x16_internal).main
|
||||
call m(idct_16x16_internal_8bpc).main
|
||||
mova m1, [rsp+32*1]
|
||||
call m(inv_txfm_add_dct_dct_32x16_16bpc).write_16x16
|
||||
add r5, 32
|
||||
|
@ -6109,7 +6109,7 @@ cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 0, dst, stride, c, eob
|
|||
mova m13, [r7-32* 1]
|
||||
mova m14, [r7+32* 1]
|
||||
mova m15, [r7+32* 3]
|
||||
call m(inv_txfm_add_dct_dct_16x32).main_oddhalf
|
||||
call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
|
||||
mova m0, [r7-32*100]
|
||||
mova m1, [r7-32*98]
|
||||
mova m2, [r7-32*96]
|
||||
|
@ -6128,7 +6128,7 @@ cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 0, dst, stride, c, eob
|
|||
mova m15, [r7+32* 2]
|
||||
add r7, 32*8
|
||||
mova [rsp], m15
|
||||
call m(idct_16x16_internal).main
|
||||
call m(idct_16x16_internal_8bpc).main
|
||||
call m(inv_txfm_add_dct_dct_16x32_16bpc).pass2_end
|
||||
sub dstq, r3
|
||||
lea r2, [r2+r3+32]
|
||||
|
@ -6248,7 +6248,7 @@ cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 0, dst, stride, c, eob
|
|||
pxor m8, m8
|
||||
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
|
||||
mova [rsp], m8
|
||||
call m(idct_16x16_internal).main
|
||||
call m(idct_16x16_internal_8bpc).main
|
||||
mova m1, [rsp+32*1]
|
||||
mova [r4-32*4], m0
|
||||
mova [r4-32*3], m1
|
||||
|
@ -6277,7 +6277,7 @@ cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 0, dst, stride, c, eob
|
|||
mova m7, [r10+32* 2] ; in30
|
||||
lea r5, [r4+32*16]
|
||||
add r4, 32*8
|
||||
call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
|
||||
call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
|
||||
mova m0, [r10-32*99] ; in1
|
||||
mova m1, [r10+32* 3] ; in31
|
||||
mova m2, [r10-32*35] ; in17
|
||||
|
@ -6289,7 +6289,7 @@ cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 0, dst, stride, c, eob
|
|||
lea rax, [idct64_mul - 8]
|
||||
add r4, 32*16
|
||||
add r5, 32*32
|
||||
call m(inv_txfm_add_dct_dct_16x64).main_part1
|
||||
call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
|
||||
mova m0, [r10-32*95] ; in5
|
||||
mova m1, [r10-32* 1] ; in27
|
||||
mova m2, [r10-32*31] ; in21
|
||||
|
@ -6301,7 +6301,7 @@ cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 0, dst, stride, c, eob
|
|||
add rax, 8
|
||||
add r4, 32*8
|
||||
sub r5, 32*8
|
||||
call m(inv_txfm_add_dct_dct_16x64).main_part1
|
||||
call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
|
||||
call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part2_pass2
|
||||
add r10, 32*8
|
||||
sub dstq, r8
|
||||
|
|
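The change running through all of the hunks above is mechanical: the 16bpc transforms perform their second pass by calling into the existing 8-bit routines, and those entry points now carry an explicit _8bpc suffix (for example, m(idct_16x16_internal).main becomes m(idct_16x16_internal_8bpc).main), freeing the unsuffixed names for bit-depth-qualified use.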
[File diff not shown because of its large size.]
[File diff not shown because of its large size.]
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
 * Copyright © 2018, Two Orioles, LLC
 * All rights reserved.
 *
@@ -29,78 +29,57 @@
 #include "src/itx.h"

 #define decl_itx2_fns(w, h, opt) \
-decl_itx_fn(dav1d_inv_txfm_add_dct_dct_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_identity_identity_##w##x##h##_##opt)
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt))

 #define decl_itx12_fns(w, h, opt) \
 decl_itx2_fns(w, h, opt); \
-decl_itx_fn(dav1d_inv_txfm_add_dct_adst_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_dct_flipadst_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_dct_identity_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_adst_dct_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_adst_adst_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_adst_flipadst_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_flipadst_dct_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_flipadst_adst_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_identity_dct_##w##x##h##_##opt)
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt))

 #define decl_itx16_fns(w, h, opt) \
 decl_itx12_fns(w, h, opt); \
-decl_itx_fn(dav1d_inv_txfm_add_adst_identity_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_flipadst_identity_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_identity_adst_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_identity_flipadst_##w##x##h##_##opt)
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt))

 #define decl_itx17_fns(w, h, opt) \
 decl_itx16_fns(w, h, opt); \
-decl_itx_fn(dav1d_inv_txfm_add_wht_wht_##w##x##h##_##opt)
+decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))

-#define avx2_fns(avx2) \
-decl_itx17_fns( 4, 4, avx2); \
-decl_itx16_fns( 4, 8, avx2); \
-decl_itx16_fns( 4, 16, avx2); \
-decl_itx16_fns( 8, 4, avx2); \
-decl_itx16_fns( 8, 8, avx2); \
-decl_itx16_fns( 8, 16, avx2); \
-decl_itx2_fns ( 8, 32, avx2); \
-decl_itx16_fns(16, 4, avx2); \
-decl_itx16_fns(16, 8, avx2); \
-decl_itx12_fns(16, 16, avx2); \
-decl_itx2_fns (16, 32, avx2); \
-decl_itx2_fns (32, 8, avx2); \
-decl_itx2_fns (32, 16, avx2); \
-decl_itx2_fns (32, 32, avx2); \
-decl_itx_fn(dav1d_inv_txfm_add_dct_dct_16x64_##avx2); \
-decl_itx_fn(dav1d_inv_txfm_add_dct_dct_32x64_##avx2); \
-decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_##avx2); \
-decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_##avx2); \
-decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_##avx2)
-
-avx2_fns(avx2);
-avx2_fns(16bpc_avx2);
-
-decl_itx17_fns( 4, 4, ssse3);
-decl_itx16_fns( 4, 8, ssse3);
-decl_itx16_fns( 8, 4, ssse3);
-decl_itx16_fns( 8, 8, ssse3);
-decl_itx16_fns( 4, 16, ssse3);
-decl_itx16_fns(16, 4, ssse3);
-decl_itx16_fns( 8, 16, ssse3);
-decl_itx16_fns(16, 8, ssse3);
-decl_itx12_fns(16, 16, ssse3);
-decl_itx2_fns ( 8, 32, ssse3);
-decl_itx2_fns (32, 8, ssse3);
-decl_itx2_fns (16, 32, ssse3);
-decl_itx2_fns (32, 16, ssse3);
-decl_itx2_fns (32, 32, ssse3);
-
-decl_itx_fn(dav1d_inv_txfm_add_dct_dct_16x64_ssse3);
-decl_itx_fn(dav1d_inv_txfm_add_dct_dct_32x64_ssse3);
-decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_ssse3);
-decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_ssse3);
-decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_ssse3);
+#define decl_itx_fns(ext) \
+decl_itx17_fns( 4, 4, ext); \
+decl_itx16_fns( 4, 8, ext); \
+decl_itx16_fns( 4, 16, ext); \
+decl_itx16_fns( 8, 4, ext); \
+decl_itx16_fns( 8, 8, ext); \
+decl_itx16_fns( 8, 16, ext); \
+decl_itx2_fns ( 8, 32, ext); \
+decl_itx16_fns(16, 4, ext); \
+decl_itx16_fns(16, 8, ext); \
+decl_itx12_fns(16, 16, ext); \
+decl_itx2_fns (16, 32, ext); \
+decl_itx2_fns (32, 8, ext); \
+decl_itx2_fns (32, 16, ext); \
+decl_itx2_fns (32, 32, ext); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x64, ext)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x64, ext)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, ext)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, ext)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, ext))
+
+decl_itx_fns(avx2);
+decl_itx_fns(sse4);
+decl_itx_fns(ssse3);
 decl_itx_fn(dav1d_inv_txfm_add_wht_wht_4x4_16bpc_sse2);

 COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c,
@@ -108,7 +87,7 @@ COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c,
 {
 #define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
 c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
-dav1d_inv_txfm_add_##type##_##w##x##h##_##ext
+BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)

 #define assign_itx1_fn(pfx, w, h, ext) \
 assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext)
@@ -146,7 +125,7 @@ COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c,
 if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;

 #if BITDEPTH == 16
-assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, 16bpc_sse2);
+assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, sse2);
 #endif

 if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
@@ -173,38 +152,59 @@ COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c,
 assign_itx1_fn ( , 64, 64, ssse3);
 #endif

+if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
+
+#if BITDEPTH == 16
+if (bpc <= 10) {
+assign_itx16_fn(, 4, 4, sse4);
+assign_itx16_fn(R, 4, 8, sse4);
+assign_itx16_fn(R, 4, 16, sse4);
+assign_itx16_fn(R, 8, 4, sse4);
+assign_itx16_fn(, 8, 8, sse4);
+assign_itx16_fn(R, 8, 16, sse4);
+assign_itx16_fn(R, 16, 4, sse4);
+assign_itx16_fn(R, 16, 8, sse4);
+assign_itx12_fn(, 16, 16, sse4);
+assign_itx2_fn (R, 8, 32, sse4);
+assign_itx2_fn (R, 32, 8, sse4);
+assign_itx2_fn (R, 16, 32, sse4);
+assign_itx2_fn (R, 32, 16, sse4);
+assign_itx2_fn (, 32, 32, sse4);
+assign_itx1_fn (R, 16, 64, sse4);
+assign_itx1_fn (R, 32, 64, sse4);
+assign_itx1_fn (R, 64, 16, sse4);
+assign_itx1_fn (R, 64, 32, sse4);
+assign_itx1_fn (, 64, 64, sse4);
+}
+#endif
+
 if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;

-#if ARCH_X86_64 && BITDEPTH == 16
-assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, 16bpc_avx2);
+#if ARCH_X86_64
+assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, avx2);
 #endif

+if (bpc > 10) return;
+
 #if ARCH_X86_64
-#if BITDEPTH == 8
-#define SUFFIX avx2
-#else
-#define SUFFIX 16bpc_avx2
-#endif
-assign_itx17_fn( , 4, 4, SUFFIX);
-assign_itx16_fn(R, 4, 8, SUFFIX);
-assign_itx16_fn(R, 4, 16, SUFFIX);
-assign_itx16_fn(R, 8, 4, SUFFIX);
-assign_itx16_fn( , 8, 8, SUFFIX);
-assign_itx16_fn(R, 8, 16, SUFFIX);
-assign_itx2_fn (R, 8, 32, SUFFIX);
-assign_itx16_fn(R, 16, 4, SUFFIX);
-assign_itx16_fn(R, 16, 8, SUFFIX);
-assign_itx12_fn( , 16, 16, SUFFIX);
-assign_itx2_fn (R, 16, 32, SUFFIX);
-assign_itx1_fn (R, 16, 64, SUFFIX);
-assign_itx2_fn (R, 32, 8, SUFFIX);
-assign_itx2_fn (R, 32, 16, SUFFIX);
-assign_itx2_fn ( , 32, 32, SUFFIX);
-assign_itx1_fn (R, 32, 64, SUFFIX);
-assign_itx1_fn (R, 64, 16, SUFFIX);
-assign_itx1_fn (R, 64, 32, SUFFIX);
-assign_itx1_fn ( , 64, 64, SUFFIX);
+assign_itx17_fn( , 4, 4, avx2);
+assign_itx16_fn(R, 4, 8, avx2);
+assign_itx16_fn(R, 4, 16, avx2);
+assign_itx16_fn(R, 8, 4, avx2);
+assign_itx16_fn( , 8, 8, avx2);
+assign_itx16_fn(R, 8, 16, avx2);
+assign_itx2_fn (R, 8, 32, avx2);
+assign_itx16_fn(R, 16, 4, avx2);
+assign_itx16_fn(R, 16, 8, avx2);
+assign_itx12_fn( , 16, 16, avx2);
+assign_itx2_fn (R, 16, 32, avx2);
+assign_itx1_fn (R, 16, 64, avx2);
+assign_itx2_fn (R, 32, 8, avx2);
+assign_itx2_fn (R, 32, 16, avx2);
+assign_itx2_fn ( , 32, 32, avx2);
+assign_itx1_fn (R, 32, 64, avx2);
+assign_itx1_fn (R, 64, 16, avx2);
+assign_itx1_fn (R, 64, 32, avx2);
+assign_itx1_fn ( , 64, 64, avx2);
 #endif
 }
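For readers following the macro rewrite above, here is a minimal sketch (an assumption, not part of this patch) of the BF() helper the new declarations rely on; dav1d defines it in its common bit-depth header. It glues the bit-depth and ISA suffix onto a function name at compile time, so one template covers both bit-depth builds:

/* Hedged sketch of BF(); the real definition lives in dav1d's common
 * bit-depth header, not in this file. */
#if BITDEPTH == 8
#define BF(name, suffix) name##_8bpc_##suffix
#else
#define BF(name, suffix) name##_16bpc_##suffix
#endif
/* e.g. BF(dav1d_inv_txfm_add_dct_dct_8x8, avx2) expands to
 * dav1d_inv_txfm_add_dct_dct_8x8_8bpc_avx2 or ..._16bpc_avx2. */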
[File diff not shown because of its large size.]
@@ -623,9 +623,7 @@ SECTION .text
 paddw m8, m5 ; p6*7+p3+p1+q0
 paddw m8, m10 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
 psrlw m10, m8, 4
-pand m10, m1
-pandn m9, m1, m2
-por m10, m9
+vpblendvb m10, m2, m10, m1
 %ifidn %2, v
 mova [tmpq+strideq*2], m10 ; p5
 %else
@@ -638,9 +636,7 @@ SECTION .text
 paddw m8, m6
 psubw m8, m10
 psrlw m10, m8, 4
-pand m10, m1
-pandn m9, m1, m7
-por m10, m9
+vpblendvb m10, m7, m10, m1
 %ifidn %2, v
 mova [tmpq+stride3q], m10 ; p4
 %else
@@ -653,9 +649,7 @@ SECTION .text
 psubw m8, m2
 paddw m8, m10
 psrlw m10, m8, 4
-pand m10, m1
-pandn m9, m1, m11
-por m10, m9
+vpblendvb m10, m11, m10, m1
 %ifidn %2, v
 mova [tmpq+strideq*4], m10 ; p3
 lea tmpq, [dstq+strideq*4]
@@ -669,9 +663,7 @@ SECTION .text
 paddw m8, m15
 psubw m8, m10
 psrlw m10, m8, 4
-pand m10, m1
-pandn m9, m1, m13
-por m10, m9
+vpblendvb m10, m13, m10, m1
 mova [rsp+1*32], m10 ; don't clobber p2/m13

 ; sub p6/p3, add p0/q4
@@ -684,9 +676,7 @@ SECTION .text
 %endif
 psubw m8, m10
 psrlw m10, m8, 4
-pand m10, m1
-pandn m9, m1, m3
-por m10, m9
+vpblendvb m10, m3, m10, m1
 mova [rsp+2*32], m10 ; don't clobber p1/m3

 ; sub p6/p2, add q0/q5
@@ -699,9 +689,7 @@ SECTION .text
 %endif
 psubw m8, m10
 psrlw m10, m8, 4
-pand m10, m1
-pandn m9, m1, m4
-por m10, m9
+vpblendvb m10, m4, m10, m1
 mova [rsp+3*32], m10 ; don't clobber p0/m4

 ; sub p6/p1, add q1/q6
@@ -715,9 +703,7 @@ SECTION .text
 paddw m8, m0
 psubw m8, m10
 psrlw m10, m8, 4
-pand m10, m1
-pandn m9, m1, m5
-por m10, m9
+vpblendvb m10, m5, m10, m1
 mova [rsp+4*32], m10 ; don't clobber q0/m5

 ; sub p5/p0, add q2/q6
@@ -726,9 +712,7 @@ SECTION .text
 paddw m8, m0
 psubw m8, m10
 psrlw m10, m8, 4
-pand m10, m1
-pandn m9, m1, m6
-por m2, m10, m9 ; don't clobber q1/m6
+vpblendvb m2, m6, m10, m1 ; don't clobber q1/m6

 ; sub p4/q0, add q3/q6
 paddw m8, m15
@@ -736,9 +720,7 @@ SECTION .text
 paddw m8, m0
 psubw m8, m10
 psrlw m10, m8, 4
-pand m10, m1
-pandn m9, m1, m14
-por m7, m10, m9 ; don't clobber q2/m14
+vpblendvb m7, m14, m10, m1 ; don't clobber q2/m14

 ; sub p3/q1, add q4/q6
 %ifidn %2, v
@@ -750,9 +732,7 @@ SECTION .text
 paddw m8, m0
 psubw m8, m10
 psrlw m10, m8, 4
-pand m10, m1
-pandn m9, m1, m15
-por m10, m9
+vpblendvb m10, m15, m10, m1
 %ifidn %2, v
 mova [tmpq+mstrideq], m10 ; q3
 %else
@@ -769,13 +749,12 @@ SECTION .text
 paddw m8, m0
 psubw m8, m10
 psrlw m10, m8, 4
-pand m10, m1
 %ifidn %2, v
-pandn m9, m1, [tmpq+strideq*0]
+mova m9, [tmpq+strideq*0]
 %else
-pandn m9, m1, [rsp+10*32]
+mova m9, [rsp+10*32]
 %endif
-por m10, m9
+vpblendvb m10, m9, m10, m1
 %ifidn %2, v
 mova [tmpq+strideq*0], m10 ; q4
 %else
@@ -790,11 +769,11 @@ SECTION .text
 psrlw m10, m8, 4
 pand m10, m1
 %ifidn %2, v
-pandn m9, m1, [tmpq+strideq*1]
+mova m9, [tmpq+strideq*1]
 %else
-pandn m9, m1, [rsp+11*32]
+mova m9, [rsp+11*32]
 %endif
-por m10, m9
+vpblendvb m10, m9, m10, m1
 %ifidn %2, v
 mova [tmpq+strideq*1], m10 ; q5
 %else
@@ -859,14 +838,12 @@ SECTION .text
 paddw m2, m0
 pmulhrsw m2, [pw_4096]

-REPX {pand x, m9}, m7, m8, m10, m11, m1, m2
-REPX {pandn x, m9, x}, m13, m3, m4, m5, m6, m14
-por m13, m7
-por m3, m8
-por m4, m10
-por m5, m11
-por m6, m1
-por m14, m2
+vpblendvb m13, m13, m7, m9
+vpblendvb m3, m3, m8, m9
+vpblendvb m4, m4, m10, m9
+vpblendvb m5, m5, m11, m9
+vpblendvb m6, m6, m1, m9
+vpblendvb m14, m14, m2, m9

 %ifidn %2, v
 mova [tmpq+strideq*1], m13 ; p2
@@ -984,12 +961,10 @@ SECTION .text
 paddw m8, m14
 pmulhrsw m8, [pw_4096]

-REPX {pand x, m9}, m2, m10, m11, m8
-REPX {pandn x, m9, x}, m3, m4, m5, m6
-por m3, m2
-por m4, m10
-por m5, m11
-por m6, m8
+vpblendvb m3, m3, m2, m9
+vpblendvb m4, m4, m10, m9
+vpblendvb m5, m5, m11, m9
+vpblendvb m6, m6, m8, m9

 %ifidn %2, v
 mova [tmpq+strideq*2], m3 ; p1
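The three-instruction select replaced throughout the hunks above is the classic SIMD mask-blend; a minimal scalar model in C of why the substitution is safe (helper name is ours, for illustration):

#include <stdint.h>

/* When every mask byte is 0x00 or 0xFF, as SIMD compares produce, the
 * pand/pandn/por sequence and a single vpblendvb -- which selects by each
 * mask byte's top bit -- compute identical results. */
static inline uint8_t blend_byte(uint8_t filtered, uint8_t original,
                                 uint8_t mask) {
    return (uint8_t)((filtered & mask) | (original & (uint8_t)~mask));
}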
[File diff not shown because of its large size.]
@@ -1,4 +1,4 @@
-; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018-2021, VideoLAN and dav1d authors
 ; Copyright © 2018, Two Orioles, LLC
 ; All rights reserved.
 ;
@@ -822,9 +822,7 @@ SECTION .text
 pmulhrsw m8, m10, [pw_2048]
 pmulhrsw m9, m11, [pw_2048]
 packuswb m8, m9
-pand m8, m1
-pandn m9, m1, m7
-por m8, m9
+vpblendvb m8, m7, m8, m1
 %ifidn %2, v
 mova [tmpq+stride3q], m8 ; p4
 %else
@@ -850,9 +848,7 @@ SECTION .text
 pmulhrsw m8, m10, [pw_2048]
 pmulhrsw m9, m11, [pw_2048]
 packuswb m8, m9
-pand m8, m1
-pandn m9, m1, m12
-por m8, m9
+vpblendvb m8, m12, m8, m1
 %ifidn %2, v
 mova [tmpq+strideq*4], m8 ; p3
 %else
@@ -878,9 +874,7 @@ SECTION .text
 pmulhrsw m8, m10, [pw_2048]
 pmulhrsw m9, m11, [pw_2048]
 packuswb m8, m9
-pand m8, m1
-pandn m9, m1, m13
-por m8, m9
+vpblendvb m8, m13, m8, m1
 mova [rsp+6*32], m8 ; don't clobber p2/m13 since we need it in F

 ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E
@@ -910,9 +904,7 @@ SECTION .text
 pmulhrsw m8, m10, [pw_2048]
 pmulhrsw m9, m11, [pw_2048]
 packuswb m8, m9
-pand m8, m1
-pandn m9, m1, m3
-por m8, m9
+vpblendvb m8, m3, m8, m1
 mova [rsp+8*32], m8 ; don't clobber p1/m3 since we need it in G

 ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F
@@ -940,9 +932,7 @@ SECTION .text
 pmulhrsw m0, m10, [pw_2048]
 pmulhrsw m8, m11, [pw_2048]
 packuswb m0, m8
-pand m0, m1
-pandn m8, m1, m4
-por m0, m8
+vpblendvb m0, m4, m0, m1
 mova [rsp+6*32], m0 ; don't clobber p0/m4 since we need it in H

 ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G
@@ -966,9 +956,7 @@ SECTION .text
 pmulhrsw m8, m10, [pw_2048]
 pmulhrsw m9, m11, [pw_2048]
 packuswb m8, m9
-pand m8, m1
-pandn m9, m1, m5
-por m8, m9
+vpblendvb m8, m5, m8, m1
 mova [rsp+8*32], m8 ; don't clobber q0/m5 since we need it in I

 ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H
@@ -985,9 +973,7 @@ SECTION .text
 pmulhrsw m2, m10, [pw_2048]
 pmulhrsw m9, m11, [pw_2048]
 packuswb m2, m9
-pand m2, m1
-pandn m9, m1, m6
-por m2, m9 ; don't clobber q1/m6 since we need it in K
+vpblendvb m2, m6, m2, m1 ; don't clobber q1/m6 since we need it in K

 ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I
 ; write +2
@@ -1003,9 +989,7 @@ SECTION .text
 pmulhrsw m7, m10, [pw_2048]
 pmulhrsw m9, m11, [pw_2048]
 packuswb m7, m9
-pand m7, m1
-pandn m9, m1, m14
-por m7, m9 ; don't clobber q2/m14 since we need it in K
+vpblendvb m7, m14, m7, m1 ; don't clobber q2/m14 since we need it in K

 ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J
 ; write +3
@@ -1021,9 +1005,7 @@ SECTION .text
 pmulhrsw m8, m10, [pw_2048]
 pmulhrsw m9, m11, [pw_2048]
 packuswb m8, m9
-pand m8, m1
-pandn m9, m1, m15
-por m8, m9
+vpblendvb m8, m15, m8, m1
 %ifidn %2, v
 mova [tmpq+mstrideq], m8 ; q3
 %else
@@ -1044,13 +1026,12 @@ SECTION .text
 pmulhrsw m8, m10, [pw_2048]
 pmulhrsw m9, m11, [pw_2048]
 packuswb m8, m9
-pand m8, m1
 %ifidn %2, v
-pandn m9, m1, [tmpq+strideq*0]
+mova m9, [tmpq+strideq*0]
 %else
-pandn m9, m1, [rsp+15*32]
+mova m9, [rsp+15*32]
 %endif
-por m8, m9
+vpblendvb m8, m9, m8, m1
 %ifidn %2, v
 mova [tmpq+strideq*0], m8 ; q4
 %else
@@ -1070,13 +1051,12 @@ SECTION .text
 pmulhrsw m10, [pw_2048]
 pmulhrsw m11, [pw_2048]
 packuswb m10, m11
-pand m10, m1
 %ifidn %2, v
-pandn m11, m1, [tmpq+strideq*1]
+mova m11, [tmpq+strideq*1]
 %else
-pandn m11, m1, [rsp+16*32]
+mova m11, [rsp+16*32]
 %endif
-por m10, m11
+vpblendvb m10, m11, m10, m1
 %ifidn %2, v
 mova [tmpq+strideq*1], m10 ; q5
 %else
@@ -1109,9 +1089,7 @@ SECTION .text
 psrlw m8, m2, 3
 psrlw m11, m7, 3
 packuswb m8, m11
-pand m8, m9
-pandn m11, m9, m13
-por m10, m8, m11 ; p2
+vpblendvb m10, m13, m8, m9 ; p2
 %ifidn %2, v
 mova [tmpq+strideq*1], m10 ; p2
 %endif
@@ -1129,9 +1107,7 @@ SECTION .text
 psrlw m8, m2, 3
 psrlw m11, m7, 3
 packuswb m8, m11
-pand m8, m9
-pandn m11, m9, m3
-por m8, m11 ; p1
+vpblendvb m8, m3, m8, m9 ; p1
 %ifidn %2, v
 mova [tmpq+strideq*2], m8 ; p1
 %else
@@ -1151,9 +1127,7 @@ SECTION .text
 psrlw m8, m2, 3
 psrlw m11, m7, 3
 packuswb m8, m11
-pand m8, m9
-pandn m11, m9, m4
-por m8, m11 ; p0
+vpblendvb m8, m4, m8, m9 ; p0
 %ifidn %2, v
 mova [tmpq+stride3q ], m8 ; p0
 %else
@@ -1175,9 +1149,7 @@ SECTION .text
 psrlw m8, m2, 3
 psrlw m11, m7, 3
 packuswb m8, m11
-pand m8, m9
-pandn m11, m9, m5
-por m11, m8, m11 ; q0
+vpblendvb m11, m5, m8, m9 ; q0
 %ifidn %2, v
 mova [dstq+strideq*0], m11 ; q0
 %endif
@@ -1195,9 +1167,7 @@ SECTION .text
 psrlw m8, m2, 3
 psrlw m13, m7, 3
 packuswb m8, m13
-pand m8, m9
-pandn m13, m9, m6
-por m13, m8, m13 ; q1
+vpblendvb m13, m6, m8, m9 ; q1
 %ifidn %2, v
 mova [dstq+strideq*1], m13 ; q1
 %endif
@@ -1217,9 +1187,7 @@ SECTION .text
 psrlw m2, 3
 psrlw m7, 3
 packuswb m2, m7
-pand m2, m9
-pandn m7, m9, m14
-por m2, m7 ; q2
+vpblendvb m2, m14, m2, m9 ; q2
 %ifidn %2, v
 mova [dstq+strideq*2], m2 ; q2
 %else
@@ -1380,9 +1348,7 @@ SECTION .text
 pmulhrsw m2, m0, [pw_4096]
 pmulhrsw m12, m1, [pw_4096]
 packuswb m2, m12
-pand m2, m9
-pandn m12, m9, m3
-por m2, m12
+vpblendvb m2, m3, m2, m9
 %ifidn %2, v
 mova [tmpq+strideq*2], m2 ; p1
 %endif
@@ -1400,9 +1366,7 @@ SECTION .text
 pmulhrsw m12, m0, [pw_4096]
 pmulhrsw m13, m1, [pw_4096]
 packuswb m12, m13
-pand m12, m9
-pandn m13, m9, m4
-por m12, m13
+vpblendvb m12, m4, m12, m9
 %ifidn %2, v
 mova [tmpq+stride3q], m12 ; p0
 %endif
@@ -1418,9 +1382,7 @@ SECTION .text
 pmulhrsw m14, m0, [pw_4096]
 pmulhrsw m13, m1, [pw_4096]
 packuswb m14, m13
-pand m14, m9
-pandn m13, m9, m5
-por m14, m13
+vpblendvb m14, m5, m14, m9
 %ifidn %2, v
 mova [dstq+strideq*0], m14 ; q0
 %endif
@@ -1436,9 +1398,7 @@ SECTION .text
 pmulhrsw m0, [pw_4096]
 pmulhrsw m1, [pw_4096]
 packuswb m0, m1
-pand m0, m9
-pandn m9, m6
-por m0, m9
+vpblendvb m0, m6, m0, m9
 %ifidn %2, v
 mova [dstq+strideq*1], m0 ; q1
 %else
@@ -1457,7 +1417,7 @@ SECTION .text
 %endmacro

 INIT_YMM avx2
-cglobal lpf_v_sb_y, 7, 10, 16, 32 * 11, \
+cglobal lpf_v_sb_y_8bpc, 7, 10, 16, 32 * 11, \
 dst, stride, mask, l, l_stride, lut, \
 w, stride3, mstride, tmp
 shl l_strideq, 2
@@ -1495,7 +1455,7 @@ cglobal lpf_v_sb_y, 7, 10, 16, 32 * 11, \
 RET

 INIT_YMM avx2
-cglobal lpf_h_sb_y, 7, 10, 16, 32 * 21, \
+cglobal lpf_h_sb_y_8bpc, 7, 10, 16, 32 * 21, \
 dst, stride, mask, l, l_stride, lut, \
 h, stride3, l_stride3, tmp
 shl l_strideq, 2
@@ -1535,7 +1495,7 @@ cglobal lpf_h_sb_y, 7, 10, 16, 32 * 21, \
 RET

 INIT_YMM avx2
-cglobal lpf_v_sb_uv, 7, 10, 16, \
+cglobal lpf_v_sb_uv_8bpc, 7, 10, 16, \
 dst, stride, mask, l, l_stride, lut, \
 w, stride3, mstride, tmp
 shl l_strideq, 2
@@ -1566,7 +1526,7 @@ cglobal lpf_v_sb_uv, 7, 10, 16, \
 RET

 INIT_YMM avx2
-cglobal lpf_h_sb_uv, 7, 10, 16, \
+cglobal lpf_h_sb_uv_8bpc, 7, 10, 16, \
 dst, stride, mask, l, l_stride, lut, \
 h, stride3, l_stride3, tmp
 shl l_strideq, 2
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
 * Copyright © 2018, Two Orioles, LLC
 * All rights reserved.
 *
@@ -29,48 +29,30 @@
 #include "src/loopfilter.h"

 #define decl_loopfilter_sb_fns(ext) \
-decl_loopfilter_sb_fn(dav1d_lpf_h_sb_y_##ext); \
-decl_loopfilter_sb_fn(dav1d_lpf_v_sb_y_##ext); \
-decl_loopfilter_sb_fn(dav1d_lpf_h_sb_uv_##ext); \
-decl_loopfilter_sb_fn(dav1d_lpf_v_sb_uv_##ext)
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, ext)); \
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, ext)); \
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, ext)); \
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, ext))

 decl_loopfilter_sb_fns(ssse3);
 decl_loopfilter_sb_fns(avx2);
-decl_loopfilter_sb_fns(16bpc_ssse3);
-decl_loopfilter_sb_fns(16bpc_avx2);

 COLD void bitfn(dav1d_loop_filter_dsp_init_x86)(Dav1dLoopFilterDSPContext *const c) {
 const unsigned flags = dav1d_get_cpu_flags();

 if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;

-#if BITDEPTH == 8
-c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_ssse3;
-c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_ssse3;
-c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_ssse3;
-c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_ssse3;
-#else
-#if ARCH_X86_64
-c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_16bpc_ssse3;
-c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_16bpc_ssse3;
-c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_16bpc_ssse3;
-c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_16bpc_ssse3;
-#endif
-#endif
+c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, ssse3);
+c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, ssse3);
+c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, ssse3);
+c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, ssse3);

 if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;

 #if ARCH_X86_64
-#if BITDEPTH == 8
-c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_avx2;
-c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_avx2;
-c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_avx2;
-c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_avx2;
-#else
-c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_16bpc_avx2;
-c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_16bpc_avx2;
-c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_16bpc_avx2;
-c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_16bpc_avx2;
-#endif
+c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, avx2);
+c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, avx2);
+c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, avx2);
+c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, avx2);
 #endif
 }
@@ -1,4 +1,4 @@
-; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018-2021, VideoLAN and dav1d authors
 ; Copyright © 2018, Two Orioles, LLC
 ; All rights reserved.
 ;
@@ -1977,11 +1977,11 @@ SECTION .text

 INIT_XMM ssse3
 %if ARCH_X86_64
-cglobal lpf_v_sb_y, 7, 11, 16, 16 * 15, \
+cglobal lpf_v_sb_y_8bpc, 7, 11, 16, 16 * 15, \
 dst, stride, mask, l, l_stride, lut, \
 w, stride3, mstride, tmp, mask_bits
 %else
-cglobal lpf_v_sb_y, 6, 7, 8, -16 * (26 + copy_args), \
+cglobal lpf_v_sb_y_8bpc, 6, 7, 8, -16 * (26 + copy_args), \
 dst, stride, mask, l, l_stride, lut, mask_bits
 RELOC_ARGS w
 SETUP_PIC
@@ -2075,11 +2075,11 @@ cglobal lpf_v_sb_y, 6, 7, 8, -16 * (26 + copy_args), \

 INIT_XMM ssse3
 %if ARCH_X86_64
-cglobal lpf_h_sb_y, 7, 11, 16, 16 * 26, \
+cglobal lpf_h_sb_y_8bpc, 7, 11, 16, 16 * 26, \
 dst, stride, mask, l, l_stride, lut, \
 h, stride3, l_stride3, tmp, mask_bits
 %else
-cglobal lpf_h_sb_y, 6, 7, 8, -16 * (39 + copy_args), \
+cglobal lpf_h_sb_y_8bpc, 6, 7, 8, -16 * (39 + copy_args), \
 dst, stride, mask, l, l_stride, lut, mask_bits
 RELOC_ARGS h
 SETUP_PIC
@@ -2179,11 +2179,11 @@ cglobal lpf_h_sb_y, 6, 7, 8, -16 * (39 + copy_args), \

 INIT_XMM ssse3
 %if ARCH_X86_64
-cglobal lpf_v_sb_uv, 7, 11, 16, 3 * 16, \
+cglobal lpf_v_sb_uv_8bpc, 7, 11, 16, 3 * 16, \
 dst, stride, mask, l, l_stride, lut, \
 w, stride3, mstride, tmp, mask_bits
 %else
-cglobal lpf_v_sb_uv, 6, 7, 8, -16 * (12 + copy_args), \
+cglobal lpf_v_sb_uv_8bpc, 6, 7, 8, -16 * (12 + copy_args), \
 dst, stride, mask, l, l_stride, lut, mask_bits
 RELOC_ARGS w
 SETUP_PIC
@@ -2261,11 +2261,11 @@ cglobal lpf_v_sb_uv, 6, 7, 8, -16 * (12 + copy_args), \

 INIT_XMM ssse3
 %if ARCH_X86_64
-cglobal lpf_h_sb_uv, 7, 11, 16, 16 * 3, \
+cglobal lpf_h_sb_uv_8bpc, 7, 11, 16, 16 * 3, \
 dst, stride, mask, l, l_stride, lut, \
 h, stride3, l_stride3, tmp, mask_bits
 %else
-cglobal lpf_h_sb_uv, 6, 7, 8, -16 * (13 + copy_args), \
+cglobal lpf_h_sb_uv_8bpc, 6, 7, 8, -16 * (13 + copy_args), \
 dst, stride, mask, l, l_stride, lut, mask_bits
 RELOC_ARGS h
 SETUP_PIC
@@ -662,7 +662,7 @@ ALIGN function_align
 jl .v_loop
 ret

-cglobal sgr_filter_5x5_16bpc, 5, 14, 16, 400*24+16, dst, dst_stride, left, lpf, \
+cglobal sgr_filter_5x5_16bpc, 5, 14, 15, 400*24+16, dst, dst_stride, left, lpf, \
 lpf_stride, w, edge, params, h
 movifnidn wd, wm
 mov paramsq, paramsmp
@@ -680,13 +680,12 @@ cglobal sgr_filter_5x5_16bpc, 5, 14, 16, 400*24+16, dst, dst_stride, left, lpf,
 lea t3, [rsp+wq*2+400*12+16]
 vpbroadcastd m11, [pd_0xf00800a4]
 lea t4, [rsp+wq+400*20+16]
-vpbroadcastd m12, [pw_256]
+mova xm12, [sgr_lshuf5]
 neg wq
 vpbroadcastd m13, [pd_34816] ; (1 << 11) + (1 << 15)
 pxor m6, m6
 vpbroadcastd m14, [pw_1023]
 psllw m7, 4
-mova xm15, [sgr_lshuf5]
 test edgeb, 4 ; LR_HAVE_TOP
 jz .no_top
 call .h_top
@@ -786,7 +785,7 @@ cglobal sgr_filter_5x5_16bpc, 5, 14, 16, 400*24+16, dst, dst_stride, left, lpf,
 jmp .h_main
 .h_extend_left:
 mova xm4, [lpfq+wq]
-pshufb xm4, xm15
+pshufb xm4, xm12
 vinserti128 m4, [lpfq+wq+10], 1
 jmp .h_main
 .h_top:
@@ -867,7 +866,7 @@ ALIGN function_align
 jmp .hv_main
 .hv_extend_left:
 mova xm4, [lpfq+wq]
-pshufb xm4, xm15
+pshufb xm4, xm12
 vinserti128 m4, [lpfq+wq+10], 1
 jmp .hv_main
 .hv_bottom:
@@ -945,13 +944,12 @@ ALIGN function_align
 paddusw m4, m11
 paddusw m5, m11
 psrad m3, m4, 20 ; min(z, 255) - 256
-vpgatherdd m2, [r13+m3*4], m4
+vpgatherdd m2, [r13+m3*4], m4 ; x
 psrad m4, m5, 20
 vpgatherdd m3, [r13+m4*4], m5
 pmulld m0, m2
 pmulld m1, m3
 packssdw m2, m3
-psubw m2, m12, m2 ; a
 paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15)
 paddd m1, m13
 mova [t4+r10+4], m2
@@ -1015,13 +1013,12 @@ ALIGN function_align
 paddusw m4, m11
 paddusw m5, m11
 psrad m3, m4, 20 ; min(z, 255) - 256
-vpgatherdd m2, [r13+m3*4], m4
+vpgatherdd m2, [r13+m3*4], m4 ; x
 psrad m4, m5, 20
 vpgatherdd m3, [r13+m4*4], m5
 pmulld m0, m2
 pmulld m1, m3
 packssdw m2, m3
-psubw m2, m12, m2 ; a
 paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15)
 paddd m1, m13
 mova [t4+r10+4], m2
@@ -1098,15 +1095,13 @@ ALIGN function_align
 pmaddwd m3, m1
 vinserti128 m1, m4, xm5, 1
 vperm2i128 m4, m5, 0x31
-paddd m2, m1 ; a * src + b + (1 << 8)
-paddd m3, m4
-psrld m2, 9
-psrld m3, 9
-packssdw m2, m3
-psllw m1, m0, 4
-psubw m2, m1
-pmulhrsw m2, m7
-paddw m0, m2
+psubd m1, m2 ; b - a * src + (1 << 8)
+psubd m4, m3
+psrad m1, 9
+psrad m4, 9
+packssdw m1, m4
+pmulhrsw m1, m7
+paddw m0, m1
 pmaxsw m0, m6
 pminsw m0, m14
 mova [dstq+r10], m0
@@ -1130,15 +1125,13 @@ ALIGN function_align
 pmaddwd m3, m1
 vinserti128 m1, m4, xm5, 1
 vperm2i128 m4, m5, 0x31
-paddd m2, m1 ; a * src + b + (1 << 7)
-paddd m3, m4
-psrld m2, 8
-psrld m3, 8
-packssdw m2, m3
-psllw m1, m0, 4
-psubw m2, m1
-pmulhrsw m2, m7
-paddw m0, m2
+psubd m1, m2 ; b - a * src + (1 << 7)
+psubd m4, m3
+psrad m1, 8
+psrad m4, 8
+packssdw m1, m4
+pmulhrsw m1, m7
+paddw m0, m1
 pmaxsw m0, m6
 pminsw m0, m14
 mova [dstq+r10], m0
@@ -1147,7 +1140,7 @@ ALIGN function_align
 add dstq, dst_strideq
 ret

-cglobal sgr_filter_3x3_16bpc, 5, 14, 15, 400*42+8, dst, dst_stride, left, lpf, \
+cglobal sgr_filter_3x3_16bpc, 5, 14, 14, 400*42+8, dst, dst_stride, left, lpf, \
 lpf_stride, w, edge, params, h
 movifnidn wd, wm
 mov paramsq, paramsmp
@@ -1166,11 +1159,10 @@ cglobal sgr_filter_3x3_16bpc, 5, 14, 15, 400*42+8, dst, dst_stride, left, lpf, \
 lea t4, [rsp+wq+400*32+8]
 vpbroadcastd m11, [pd_34816]
 neg wq
-vpbroadcastd m12, [pw_256]
+mova xm12, [sgr_lshuf3]
 pxor m6, m6
 vpbroadcastd m13, [pw_1023]
 psllw m7, 4
-mova xm14, [sgr_lshuf3]
 test edgeb, 4 ; LR_HAVE_TOP
 jz .no_top
 call .h_top
@@ -1268,7 +1260,7 @@ cglobal sgr_filter_3x3_16bpc, 5, 14, 15, 400*42+8, dst, dst_stride, left, lpf, \
 jmp .h_main
 .h_extend_left:
 mova xm4, [lpfq+wq]
-pshufb xm4, xm14
+pshufb xm4, xm12
 vinserti128 m4, [lpfq+wq+12], 1
 jmp .h_main
 .h_top:
@@ -1318,7 +1310,7 @@ ALIGN function_align
 jmp .hv0_main
 .hv0_extend_left:
 mova xm4, [lpfq+wq]
-pshufb xm4, xm14
+pshufb xm4, xm12
 vinserti128 m4, [lpfq+wq+12], 1
 jmp .hv0_main
 .hv0_bottom:
@@ -1388,7 +1380,7 @@ ALIGN function_align
 paddusw m4, m10
 paddusw m5, m10
 psrad m3, m4, 20 ; min(z, 255) - 256
-vpgatherdd m2, [r13+m3*4], m4
+vpgatherdd m2, [r13+m3*4], m4 ; x
 psrad m4, m5, 20
 vpgatherdd m3, [r13+m4*4], m5
 pmulld m0, m2
@@ -1396,7 +1388,6 @@ ALIGN function_align
 packssdw m2, m3
 paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
 paddd m1, m11
-psubw m2, m12, m2
 psrld m0, 12
 psrld m1, 12
 mova [t4+r10*1+400*0+ 4], m2
@@ -1420,7 +1411,7 @@ ALIGN function_align
 jmp .hv1_main
 .hv1_extend_left:
 mova xm4, [lpfq+wq]
-pshufb xm4, xm14
+pshufb xm4, xm12
 vinserti128 m4, [lpfq+wq+12], 1
 jmp .hv1_main
 .hv1_bottom:
@@ -1484,7 +1475,7 @@ ALIGN function_align
 paddusw m4, m10
 paddusw m5, m10
 psrad m3, m4, 20 ; min(z, 255) - 256
-vpgatherdd m2, [r13+m3*4], m4
+vpgatherdd m2, [r13+m3*4], m4 ; x
 psrad m4, m5, 20
 vpgatherdd m3, [r13+m4*4], m5
 pmulld m0, m2
@@ -1492,7 +1483,6 @@ ALIGN function_align
 packssdw m2, m3
 paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
 paddd m1, m11
-psubw m2, m12, m2
 psrld m0, 12
 psrld m1, 12
 mova [t4+r10*1+400*2 +4], m2
@@ -1548,7 +1538,7 @@ ALIGN function_align
 paddusw m4, m10
 paddusw m5, m10
 psrad m3, m4, 20 ; min(z, 255) - 256
-vpgatherdd m2, [r13+m3*4], m4
+vpgatherdd m2, [r13+m3*4], m4 ; x
 psrad m4, m5, 20
 vpgatherdd m3, [r13+m4*4], m5
 pmulld m0, m2
@@ -1556,7 +1546,6 @@ ALIGN function_align
 packssdw m2, m3
 paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
 paddd m1, m11
-psubw m2, m12, m2
 psrld m0, 12
 psrld m1, 12
 mova [t4+r10*1+400*0+ 4], m2
@@ -1606,7 +1595,7 @@ ALIGN function_align
 paddusw m4, m10
 paddusw m5, m10
 psrad m3, m4, 20 ; min(z, 255) - 256
-vpgatherdd m2, [r13+m3*4], m4
+vpgatherdd m2, [r13+m3*4], m4 ; x
 psrad m4, m5, 20
 vpgatherdd m3, [r13+m4*4], m5
 pmulld m0, m2
@@ -1614,7 +1603,6 @@ ALIGN function_align
 packssdw m2, m3
 paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
 paddd m1, m11
-psubw m2, m12, m2
 psrld m0, 12
 psrld m1, 12
 mova [t4+r10*1+400*2+ 4], m2
@@ -1700,15 +1688,13 @@ ALIGN function_align
 pmaddwd m3, m1
 vinserti128 m1, m4, xm5, 1
 vperm2i128 m4, m5, 0x31
-paddd m2, m1 ; a * src + b + (1 << 8)
-paddd m3, m4
-psrld m2, 9
-psrld m3, 9
-packssdw m2, m3
-psllw m1, m0, 4
-psubw m2, m1
-pmulhrsw m2, m7
-paddw m0, m2
+psubd m1, m2 ; b - a * src + (1 << 8)
+psubd m4, m3
+psrad m1, 9
+psrad m4, 9
+packssdw m1, m4
+pmulhrsw m1, m7
+paddw m0, m1
 pmaxsw m0, m6
 pminsw m0, m13
 mova [dstq+r10], m0
@@ -1756,15 +1742,13 @@ ALIGN function_align
 pmaddwd m3, m1
 vinserti128 m1, m4, xm5, 1
 vperm2i128 m4, m5, 0x31
-paddd m2, m1 ; a * src + b + (1 << 8)
-paddd m3, m4
-psrld m2, 9
-psrld m3, 9
-packssdw m2, m3
-psllw m1, m0, 4
-psubw m2, m1
-pmulhrsw m2, m7
-paddw m0, m2
+psubd m1, m2 ; b - a * src + (1 << 8)
+psubd m4, m3
+psrad m1, 9
+psrad m4, 9
+packssdw m1, m4
+pmulhrsw m1, m7
+paddw m0, m1
 pmaxsw m0, m6
 pminsw m0, m13
 mova [dstq+r10], m0
@@ -1786,7 +1770,7 @@ cglobal sgr_filter_mix_16bpc, 5, 14, 16, 400*66+8, dst, dst_stride, left, lpf, \
 lea t1, [rsp+wq+12]
 vpbroadcastd m10, [pd_34816]
 add dstq, wq
-vpbroadcastd m11, [pw_256]
+vpbroadcastd m11, [pd_4096]
 lea t3, [rsp+wq*2+400*24+8]
 vpbroadcastd m12, [pd_0xf00801c7]
 lea t4, [rsp+wq+400*52+8]
@@ -2048,7 +2032,7 @@ ALIGN function_align
 paddusw m4, m12
 paddusw m5, m12
 psrad m3, m4, 20 ; min(z3, 255) - 256
-vpgatherdd m2, [r13+m3*4], m4
+vpgatherdd m2, [r13+m3*4], m4 ; x3
 psrad m4, m5, 20
 vpgatherdd m3, [r13+m4*4], m5
 pmulld m0, m2
@@ -2056,7 +2040,6 @@ ALIGN function_align
 packssdw m2, m3
 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
 paddd m1, m10
-psubw m2, m11, m2
 psrld m0, 12
 psrld m1, 12
 mova [t4+r10*1+400*2+ 4], m2
@@ -2154,7 +2137,7 @@ ALIGN function_align
 paddusw m2, m12
 paddusw m3, m12
 psrad m7, m2, 20 ; min(z3, 255) - 256
-vpgatherdd m6, [r13+m7*4], m2
+vpgatherdd m6, [r13+m7*4], m2 ; x3
 psrad m2, m3, 20
 vpgatherdd m7, [r13+m2*4], m3
 pmulld m0, m6
@@ -2162,7 +2145,6 @@ ALIGN function_align
 pmulld m7, m1
 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
 paddd m7, m10
-psubw m6, m11, m6
 psrld m0, 12
 psrld m7, 12
 paddw m1, m8, [t2+r10+400*0]
@@ -2207,7 +2189,7 @@ ALIGN function_align
 paddusw m2, m4
 paddusw m3, m4
 psrad m5, m2, 20 ; min(z5, 255) - 256
-vpgatherdd m4, [r13+m5*4], m2
+vpgatherdd m4, [r13+m5*4], m2 ; x5
 psrad m2, m3, 20
 vpgatherdd m5, [r13+m2*4], m3
 pmulld m0, m4
@@ -2215,7 +2197,6 @@ ALIGN function_align
 packssdw m4, m5
 paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
 paddd m1, m10
-psubw m4, m11, m4
 psrld m0, 12
 psrld m1, 12
 mova [t4+r10*1+400*0+ 4], m4
@@ -2271,7 +2252,7 @@ ALIGN function_align
 paddusw m4, m12
 paddusw m5, m12
 psrad m3, m4, 20 ; min(z3, 255) - 256
-vpgatherdd m2, [r13+m3*4], m4
+vpgatherdd m2, [r13+m3*4], m4 ; x3
 psrad m4, m5, 20
 vpgatherdd m3, [r13+m4*4], m5
 pmulld m0, m2
@@ -2279,7 +2260,6 @@ ALIGN function_align
 packssdw m2, m3
 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
 paddd m1, m10
-psubw m2, m11, m2
 psrld m0, 12
 psrld m1, 12
 mova m3, [t1+r10+400*0]
@@ -2341,7 +2321,7 @@ ALIGN function_align
 paddusw m4, m12
 paddusw m5, m12
 psrad m3, m4, 20 ; min(z3, 255) - 256
-vpgatherdd m2, [r13+m3*4], m4
+vpgatherdd m2, [r13+m3*4], m4 ; x3
 psrad m4, m5, 20
 vpgatherdd m3, [r13+m4*4], m5
 pmulld m0, m2
@@ -2349,7 +2329,6 @@ ALIGN function_align
 packssdw m2, m3
 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
 paddd m1, m10
-psubw m2, m11, m2
 psrld m0, 12
 psrld m8, m1, 12
 mova [t4+r10*1+400*4+4], m2
@@ -2396,7 +2375,7 @@ ALIGN function_align
 paddusw m2, m4
 paddusw m3, m4
 psrad m5, m2, 20 ; min(z5, 255) - 256
-vpgatherdd m4, [r13+m5*4], m2
+vpgatherdd m4, [r13+m5*4], m2 ; x5
 psrad m2, m3, 20
 vpgatherdd m5, [r13+m2*4], m3
 pmulld m0, m4
@@ -2404,7 +2383,6 @@ ALIGN function_align
 packssdw m4, m5
 paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
 paddd m1, m10
-psubw m4, m11, m4
 psrld m0, 12
 psrld m1, 12
 mova [t4+r10*1+400*0+ 4], m4
@@ -2508,16 +2486,13 @@ ALIGN function_align
 pmaddwd m2, m4 ; a5 * src
 pmaddwd m3, m4 ; a3 * src
 pslld m4, 13
-psubd m0, m4
-psubd m1, m4
-paddd m0, m2 ; a5 * src + b5 + (1 << 8) - (src << 13)
-paddd m1, m3 ; a3 * src + b3 + (1 << 8) - (src << 13)
+psubd m0, m2 ; b5 - a5 * src + (1 << 8)
+psubd m1, m3 ; b3 - a3 * src + (1 << 8)
 psrld m0, 9
 pslld m1, 7
 pblendw m0, m1, 0xaa
 pmaddwd m0, m15
-vpbroadcastd m1, [pd_4096]
-paddd m4, m1
+paddd m4, m11
 paddd m0, m4
 psrad m0, 7
 vextracti128 xm1, m0, 1
@@ -2551,22 +2526,19 @@ ALIGN function_align
 mova [t3+r10*2+400*20], m5
 mova [t3+r10*2+400*24], m4
 pmovzxwd m4, [dstq+r10]
-pmovzxwd m0, [t4+r10*1+400* 6]
+pmovzxwd m2, [t4+r10*1+400* 6]
 pmovzxwd m3, xm3
-pmaddwd m0, m4 ; a5 * src
+mova m0, [t3+r10*2+400*12]
+pmaddwd m2, m4 ; a5 * src
 pmaddwd m3, m4 ; a3 * src
-pslld m4, 12
-psubd m2, m4, [t3+r10*2+400*12]
-paddd m4, m4
-psubd m1, m4
-psubd m0, m2 ; a5 * src + b5 + (1 << 8) - (src << 13)
-paddd m1, m3 ; a3 * src + b3 + (1 << 8) - (src << 13)
+pslld m4, 13
+psubd m0, m2 ; b5 - a5 * src + (1 << 8)
+psubd m1, m3 ; b3 - a3 * src + (1 << 8)
 psrld m0, 8
 pslld m1, 7
 pblendw m0, m1, 0xaa
 pmaddwd m0, m15
-vpbroadcastd m1, [pd_4096]
-paddd m4, m1
+paddd m4, m11
 paddd m0, m4
 psrad m0, 7
 vextracti128 xm1, m0, 1
Diff between files not shown because of its large size
@ -79,14 +79,6 @@ pd_0xf00800a4: dd 0xf00800a4

SECTION .text

%macro REPX 2-*
%xdefine %%f(x) %1
%rep %0 - 1
%rotate 1
%%f(%1)
%endrep
%endmacro

DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; ring buffer pointers

INIT_YMM avx2

@ -111,6 +103,8 @@ cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf,
add dstq, wq
vpbroadcastd m15, [fltq+20] ; y2 y3
neg wq
psllw m14, 5
psllw m15, 5
test edgeb, 4 ; LR_HAVE_TOP
jz .no_top
call .h_top

@ -357,9 +351,7 @@ ALIGN function_align
mova m3, [t3+r10*2+32]
mova m5, [t5+r10*2+32]
paddw m5, [t1+r10*2+32]
psrad m0, 11
psrad m4, 11
packssdw m0, m4
packuswb m0, m4
paddw m4, m1, [t6+r10*2+32]
mova [t0+r10*2+32], m1
punpcklwd m1, m2, m3

@ -372,9 +364,9 @@ ALIGN function_align
pmaddwd m4, m14
paddd m1, m3
paddd m2, m4
psrad m1, 11
psrad m2, 11
packssdw m1, m2
packuswb m1, m2
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1
mova [dstq+r10], m0
add r10, 32

@ -423,9 +415,10 @@ ALIGN function_align
paddd m2, m6
paddd m1, m5
paddd m3, m7
REPX {psrad x, 11}, m0, m2, m1, m3
packssdw m0, m2
packssdw m1, m3
packuswb m0, m2
packuswb m1, m3
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1
mova [dstq+r10], m0
add r10, 32

@ -459,6 +452,8 @@ cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
add dstq, wq
vpbroadcastd m15, [fltq+20] ; y2 y3
neg wq
psllw m14, 5
psllw m15, 5
test edgeb, 4 ; LR_HAVE_TOP
jz .no_top
call .h_top

@ -661,9 +656,7 @@ ALIGN function_align
mova m2, [t3+r10*2+32]
paddw m2, [t1+r10*2+32]
mova m3, [t2+r10*2+32]
psrad m0, 11
psrad m4, 11
packssdw m0, m4
packuswb m0, m4
paddw m4, m1, [t4+r10*2+32]
mova [t0+r10*2+32], m1
punpcklwd m1, m2, m3

@ -676,9 +669,9 @@ ALIGN function_align
pmaddwd m4, m14
paddd m1, m3
paddd m2, m4
psrad m1, 11
psrad m2, 11
packssdw m1, m2
packuswb m1, m2
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1
mova [dstq+r10], m0
add r10, 32

@ -720,9 +713,10 @@ ALIGN function_align
paddd m2, m6
paddd m1, m5
paddd m3, m7
REPX {psrad x, 11}, m0, m2, m1, m3
packssdw m0, m2
packssdw m1, m3
packuswb m0, m2
packuswb m1, m3
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1
mova [dstq+r10], m0
add r10, 32

@ -1003,7 +997,7 @@ ALIGN function_align
paddusw m4, m13
paddusw m5, m13
psrad m3, m4, 20 ; min(z, 255) - 256
vpgatherdd m2, [r12+m3*4], m4
vpgatherdd m2, [r12+m3*4], m4 ; x
psrad m4, m5, 20
vpgatherdd m3, [r12+m4*4], m5
pmulld m0, m2

@ -1063,7 +1057,7 @@ ALIGN function_align
paddusw m4, m13
paddusw m5, m13
psrad m3, m4, 20 ; min(z, 255) - 256
vpgatherdd m2, [r12+m3*4], m4
vpgatherdd m2, [r12+m3*4], m4 ; x
psrad m4, m5, 20
vpgatherdd m3, [r12+m4*4], m5
pmulld m0, m2

@ -1096,12 +1090,9 @@ ALIGN function_align
pslld m3, 2
paddd m2, m0 ; ab 565
paddd m3, m1
; a = 4096 - (ab & 4095) = -(ab | ~4095), so by
; using OR instead of AND for the masking we get
; the subtraction for free (with a negated result)
por m0, m15, m2 ; -a
psrld m2, 12 ; b
por m1, m15, m3
pandn m0, m15, m2 ; a
psrld m2, 12 ; b
pandn m1, m15, m3
psrld m3, 12
mova [t3+r10*4+400*4+ 0], m0
mova [t3+r10*4+400*8+ 0], m2
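The hunk above replaces the OR-based masking with PANDN: the removed comment's identity a = 4096 - (ab & 4095) = -(ab | ~4095) made OR yield the subtraction for free at the cost of keeping a negated, while PANDN (~mask & ab) keeps the unnegated low bits instead, which matches the comment flips from `-a` to `a` and from `a * src + b` to `b - a * src` in the hunks that follow. A small standalone self-check of the two bit identities (a sketch, not dav1d code):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    for (uint32_t ab = 0; ab < (1u << 20); ab++) {
        /* removed variant: OR with ~4095 is the negation of 4096 - (ab & 4095) */
        assert(-(int32_t)(ab | ~4095u) == (int32_t)(4096 - (ab & 4095)));
        /* new variant: PANDN with the same mask keeps ab & 4095 unnegated */
        assert((~0xfffff000u & ab) == (ab & 4095));
    }
    return 0;
}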
@ -1126,11 +1117,11 @@ ALIGN function_align
pslld m3, 2
paddd m2, m0
paddd m3, m1
por m0, m15, m2
pandn m0, m15, m2
psrld m2, 12
por m1, m15, m3
pandn m1, m15, m3
psrld m3, 12
paddd m4, m0, [t3+r10*4+400*4+ 0] ; -a
paddd m4, m0, [t3+r10*4+400*4+ 0] ; a
paddd m5, m1, [t3+r10*4+400*4+32]
mova [t3+r10*4+400*4+ 0], m0
mova [t3+r10*4+400*4+32], m1

@ -1140,16 +1131,14 @@ ALIGN function_align
mova [t3+r10*4+400*8+32], m3
pmovzxbd m2, [dstq+r10+0]
pmovzxbd m3, [dstq+r10+8]
pmaddwd m4, m2 ; -a * src
pmaddwd m4, m2 ; a * src
pmaddwd m5, m3
packssdw m2, m3
psubd m0, m4 ; a * src + b + (1 << 8)
psubd m0, m4 ; b - a * src + (1 << 8)
psubd m1, m5
psrld m0, 9
psrld m1, 9
psrad m0, 9
psrad m1, 9
packssdw m0, m1
psllw m1, m2, 4
psubw m0, m1
pmulhrsw m0, m7
paddw m0, m2
vextracti128 xm1, m0, 1

@ -1166,18 +1155,16 @@ ALIGN function_align
.n1_loop:
pmovzxbd m2, [dstq+r10+0]
pmovzxbd m3, [dstq+r10+8]
pmaddwd m4, m2, [t3+r10*4+400*4+ 0] ; -a * src
pmaddwd m4, m2, [t3+r10*4+400*4+ 0] ; a * src
pmaddwd m5, m3, [t3+r10*4+400*4+32]
mova m0, [t3+r10*4+400*8+ 0] ; b
mova m1, [t3+r10*4+400*8+32]
packssdw m2, m3
psubd m0, m4 ; a * src + b + (1 << 7)
psubd m0, m4 ; b - a * src + (1 << 7)
psubd m1, m5
psrld m0, 8
psrld m1, 8
psrad m0, 8
psrad m1, 8
packssdw m0, m1
psllw m1, m2, 4
psubw m0, m1
pmulhrsw m0, m7
paddw m0, m2
vextracti128 xm1, m0, 1

@ -1509,31 +1496,29 @@ ALIGN function_align
paddd m5, m5
psubd m5, m4
mova [t5+r10*4+32], m5
por m4, m14, m0
pandn m4, m14, m0
psrld m0, 12
paddd m3, m5
por m5, m14, m2
pandn m5, m14, m2
psrld m2, 12
paddd m4, m5 ; -a
por m5, m14, m1
paddd m4, m5 ; a
pandn m5, m14, m1
psrld m1, 12
paddd m0, m2 ; b + (1 << 8)
por m2, m14, m3
paddd m0, m2 ; b + (1 << 8)
pandn m2, m14, m3
psrld m3, 12
paddd m5, m2
pmovzxbd m2, [dstq+r10+0]
paddd m1, m3
pmovzxbd m3, [dstq+r10+8]
pmaddwd m4, m2 ; -a * src
pmaddwd m4, m2 ; a * src
pmaddwd m5, m3
packssdw m2, m3
psubd m0, m4 ; a * src + b + (1 << 8)
psubd m0, m4 ; b - a * src + (1 << 8)
psubd m1, m5
psrld m0, 9
psrld m1, 9
psrad m0, 9
psrad m1, 9
packssdw m0, m1
psllw m1, m2, 4
psubw m0, m1
pmulhrsw m0, m7
paddw m0, m2
vextracti128 xm1, m0, 1

@ -1908,7 +1893,7 @@ ALIGN function_align
vpgatherdd m2, [r12+m3*4], m6
psrad m6, m7, 20
vpgatherdd m3, [r12+m6*4], m7
vpbroadcastd m6, [base+pd_34816]
vpbroadcastd m6, [base+pd_34816] ; x3
pmulld m0, m2
vpbroadcastd m7, [base+pd_m4096]
pmulld m1, m3

@ -1918,12 +1903,12 @@ ALIGN function_align
pand m7, m1
por m0, m2 ; a3 | (b3 << 12)
por m7, m3
paddw m1, m8, [t2+r10*2+400*0]
paddd m2, m4, [t2+r10*2+400*2]
paddd m3, m5, [t2+r10*2+400*4]
paddw m1, [t1+r10*2+400*0]
paddd m2, [t1+r10*2+400*2]
paddd m3, [t1+r10*2+400*4]
paddw m1, m8, [t2+r10*2+400*0]
paddd m2, m4, [t2+r10*2+400*2]
paddd m3, m5, [t2+r10*2+400*4]
paddw m1, [t1+r10*2+400*0]
paddd m2, [t1+r10*2+400*2]
paddd m3, [t1+r10*2+400*4]
mova [t2+r10*2+400*0], m8
mova [t2+r10*2+400*2], m4
mova [t2+r10*2+400*4], m5

@ -1949,7 +1934,7 @@ ALIGN function_align
paddusw m2, m4
paddusw m3, m4
psrad m5, m2, 20 ; min(z5, 255) - 256
vpgatherdd m4, [r12+m5*4], m2
vpgatherdd m4, [r12+m5*4], m2 ; x5
psrad m2, m3, 20
vpgatherdd m5, [r12+m2*4], m3
pmulld m0, m4

@ -2006,7 +1991,7 @@ ALIGN function_align
paddusw m4, m2
paddusw m5, m2
psrad m3, m4, 20 ; min(z3, 255) - 256
vpgatherdd m2, [r12+m3*4], m4
vpgatherdd m2, [r12+m3*4], m4 ; x3
psrad m4, m5, 20
vpgatherdd m3, [r12+m4*4], m5
pmulld m0, m2

@ -2023,7 +2008,7 @@ ALIGN function_align
mova [t3+r10*4+400*8+ 8], m2
mova [t3+r10*4+400*0+ 8], m3
mova [t3+r10*4+400*0+40], m4
paddw m2, m2 ; cc5
paddw m2, m2 ; cc5
paddd m3, m3
paddd m4, m4
mova [t1+r10*2+400*0], m2

@ -2066,7 +2051,7 @@ ALIGN function_align
paddusw m4, m2
paddusw m5, m2
psrad m3, m4, 20 ; min(z3, 255) - 256
vpgatherdd m2, [r12+m3*4], m4
vpgatherdd m2, [r12+m3*4], m4 ; x3
psrad m4, m5, 20
vpgatherdd m3, [r12+m4*4], m5
vpbroadcastd m4, [base+pd_34816]

@ -2112,7 +2097,7 @@ ALIGN function_align
paddusw m2, m4
paddusw m3, m4
psrad m5, m2, 20 ; min(z5, 255) - 256
vpgatherdd m4, [r12+m5*4], m2
vpgatherdd m4, [r12+m5*4], m2 ; x5
psrad m2, m3, 20
vpgatherdd m5, [r12+m2*4], m3
pmulld m0, m4

@ -2154,7 +2139,7 @@ ALIGN function_align
paddd m3, m3 ; ab3[ 0] 222
psubd m2, m4 ; ab3[-1] 343
mova [t3+r10*4+400*20], m3
por m0, m6, m1 ; a5 565
pandn m0, m6, m1 ; a5 565
mova [t3+r10*4+400*24], m2
psrld m1, 12 ; b5 565
mova [t3+r10*4+400*12], m0

@ -2175,11 +2160,11 @@ ALIGN function_align
paddd m0, m4
pslld m4, 2
paddd m4, m0
por m0, m6, m4
pandn m0, m6, m4
psrld m4, 12
paddd m2, m0, [t3+r10*4+400*12] ; -a5
paddd m2, m0, [t3+r10*4+400*12] ; a5
mova [t3+r10*4+400*12], m0
paddd m0, m4, [t3+r10*4+400*16] ; b5 + (1 << 8)
paddd m0, m4, [t3+r10*4+400*16] ; b5 + (1 << 8)
mova [t3+r10*4+400*16], m4
mova m3, [t3+r10*4+400*4+0]
paddd m3, [t3+r10*4+400*4+8]

@ -2192,27 +2177,24 @@ ALIGN function_align
psubd m5, m3 ; ab3[ 1] 343
mova [t3+r10*4+400*24], m5
paddd m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343
por m3, m6, m1
pandn m3, m6, m1
psrld m1, 12
por m5, m6, m4
pandn m5, m6, m4
psrld m4, 12
paddd m3, m5 ; -a3
paddd m1, m4 ; b3 + (1 << 8)
paddd m3, m5 ; a3
paddd m1, m4 ; b3 + (1 << 8)
pmovzxbd m4, [dstq+r10]
pmaddwd m2, m4 ; -a5 * src
pmaddwd m3, m4 ; -a3 * src
pslld m4, 13
psubd m0, m4
psubd m1, m4
psubd m0, m2 ; a5 * src + b5 + (1 << 8)
psubd m1, m3 ; a3 * src + b3 + (1 << 8)
pmaddwd m2, m4 ; a5 * src
pmaddwd m3, m4 ; a3 * src
psubd m0, m2 ; b5 - a5 * src + (1 << 8)
psubd m1, m3 ; b3 - a3 * src + (1 << 8)
psrld m0, 9
pslld m1, 7
pblendw m0, m1, 0xaa
pmaddwd m0, m15
psubd m4, m6
paddd m0, m4
psubd m0, m6
psrad m0, 13
paddd m0, m4
vextracti128 xm1, m0, 1
packssdw xm0, xm1
packuswb xm0, xm0

@ -2236,9 +2218,9 @@ ALIGN function_align
psubd m5, m3 ; ab3[ 1] 343
mova [t3+r10*4+400*28], m5
paddd m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343
por m3, m6, m1
pandn m3, m6, m1
psrld m1, 12
por m5, m6, m4
pandn m5, m6, m4
psrld m4, 12
paddd m3, m5 ; -a3
paddd m1, m4 ; b3 + (1 << 8)

@ -2246,19 +2228,15 @@ ALIGN function_align
pmaddwd m2, m4, [t3+r10*4+400*12] ; -a5 * src
mova m0, [t3+r10*4+400*16] ; b5 + (1 << 7)
pmaddwd m3, m4 ; -a3 * src
pslld m4, 12
psubd m0, m4
paddd m4, m4
psubd m1, m4
psubd m0, m2 ; a5 * src + b5 + (1 << 7)
psubd m1, m3 ; a3 * src + b3 + (1 << 8)
psrld m0, 8
pslld m1, 7
pblendw m0, m1, 0xaa
pmaddwd m0, m15
psubd m4, m6
paddd m0, m4
psubd m0, m6
psrad m0, 13
paddd m0, m4
vextracti128 xm1, m0, 1
packssdw xm0, xm1
packuswb xm0, xm0
@ -39,152 +39,12 @@ decl_lr_filter_fn(BF(dav1d_sgr_filter_5x5, ext)); \
decl_lr_filter_fn(BF(dav1d_sgr_filter_3x3, ext)); \
decl_lr_filter_fn(BF(dav1d_sgr_filter_mix, ext))

/* FIXME: Replace with a port of the AVX2 code */
#define SGR_FILTER_OLD(ext) \
void BF(dav1d_sgr_box3_h, ext)(int32_t *sumsq, int16_t *sum, \
const pixel (*left)[4], \
const pixel *src, const ptrdiff_t stride, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void BF(dav1d_sgr_box3_v, ext)(int32_t *sumsq, int16_t *sum, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void BF(dav1d_sgr_calc_ab1, ext)(int32_t *a, int16_t *b, \
const int w, const int h, const unsigned s); \
void BF(dav1d_sgr_finish_filter1, ext)(int16_t *tmp, \
const pixel *src, const ptrdiff_t stride, \
const int32_t *a, const int16_t *b, \
const int w, const int h); \
\
/* filter with a 3x3 box (radius=1) */ \
static void BF(dav1d_sgr_filter1, ext)(int16_t *tmp, \
const pixel *src, const ptrdiff_t stride, \
const pixel (*left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, const int strength, \
const enum LrEdgeFlags edges) \
{ \
ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \
int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \
ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \
int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \
\
BF(dav1d_sgr_box3_h, ext)(sumsq, sum, left, src, stride, w, h, edges); \
if (edges & LR_HAVE_TOP) \
BF(dav1d_sgr_box3_h, ext)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
NULL, lpf, lpf_stride, w, 2, edges); \
\
if (edges & LR_HAVE_BOTTOM) \
BF(dav1d_sgr_box3_h, ext)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
lpf_stride, w, 2, edges); \
\
BF(dav1d_sgr_box3_v, ext)(sumsq, sum, w, h, edges); \
BF(dav1d_sgr_calc_ab1, ext)(a, b, w, h, strength); \
BF(dav1d_sgr_finish_filter1, ext)(tmp, src, stride, a, b, w, h); \
} \
\
void BF(dav1d_sgr_box5_h, ext)(int32_t *sumsq, int16_t *sum, \
const pixel (*left)[4], \
const pixel *src, const ptrdiff_t stride, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void BF(dav1d_sgr_box5_v, ext)(int32_t *sumsq, int16_t *sum, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void BF(dav1d_sgr_calc_ab2, ext)(int32_t *a, int16_t *b, \
const int w, const int h, const int strength); \
void BF(dav1d_sgr_finish_filter2, ext)(int16_t *tmp, \
const pixel *src, const ptrdiff_t stride, \
const int32_t *a, const int16_t *b, \
const int w, const int h); \
\
/* filter with a 5x5 box (radius=2) */ \
static void BF(dav1d_sgr_filter2, ext)(int16_t *tmp, \
const pixel *src, const ptrdiff_t stride, \
const pixel (*left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, const int strength, \
const enum LrEdgeFlags edges) \
{ \
ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \
int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \
ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \
int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \
\
BF(dav1d_sgr_box5_h, ext)(sumsq, sum, left, src, stride, w, h, edges); \
if (edges & LR_HAVE_TOP) \
BF(dav1d_sgr_box5_h, ext)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
NULL, lpf, lpf_stride, w, 2, edges); \
\
if (edges & LR_HAVE_BOTTOM) \
BF(dav1d_sgr_box5_h, ext)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
lpf_stride, w, 2, edges); \
\
BF(dav1d_sgr_box5_v, ext)(sumsq, sum, w, h, edges); \
BF(dav1d_sgr_calc_ab2, ext)(a, b, w, h, strength); \
BF(dav1d_sgr_finish_filter2, ext)(tmp, src, stride, a, b, w, h); \
} \
\
void BF(dav1d_sgr_weighted1, ext)(pixel *dst, const ptrdiff_t stride, \
const int16_t *t1, const int w, const int h, \
const int wt); \
void BF(dav1d_sgr_weighted2, ext)(pixel *dst, const ptrdiff_t stride, \
const int16_t *t1, const int16_t *t2, \
const int w, const int h, \
const uint32_t wt); \
\
static void BF(sgr_filter_5x5, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
const pixel (*const left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, \
const LooprestorationParams *const params, \
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) \
{ \
ALIGN_STK_32(int16_t, tmp, 64 * 384,); \
BF(dav1d_sgr_filter2, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \
w, h, params->sgr.s0, edges); \
BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, params->sgr.w0); \
} \
static void BF(sgr_filter_3x3, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
const pixel (*const left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, \
const LooprestorationParams *const params, \
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) \
{ \
ALIGN_STK_32(int16_t, tmp, 64 * 384,); \
BF(dav1d_sgr_filter1, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \
w, h, params->sgr.s1, edges); \
BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, params->sgr.w1); \
} \
static void BF(sgr_filter_mix, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
const pixel (*const left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, \
const LooprestorationParams *const params, \
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) \
{ \
ALIGN_STK_32(int16_t, tmp1, 64 * 384,); \
ALIGN_STK_32(int16_t, tmp2, 64 * 384,); \
BF(dav1d_sgr_filter2, ext)(tmp1, dst, dst_stride, left, lpf, lpf_stride, \
w, h, params->sgr.s0, edges); \
BF(dav1d_sgr_filter1, ext)(tmp2, dst, dst_stride, left, lpf, lpf_stride, \
w, h, params->sgr.s1, edges); \
const uint32_t wt = (params->sgr.w1 << 16) | (uint16_t) params->sgr.w0; \
BF(dav1d_sgr_weighted2, ext)(dst, dst_stride, tmp1, tmp2, w, h, wt); \
}

decl_wiener_filter_fns(sse2);
decl_wiener_filter_fns(ssse3);
decl_wiener_filter_fns(avx2);
decl_sgr_filter_fns(ssse3);
decl_sgr_filter_fns(avx2);

#if BITDEPTH == 8
SGR_FILTER_OLD(ssse3)
#endif

COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c,
const int bpc)
{

@ -199,11 +59,11 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPCont
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
c->wiener[0] = BF(dav1d_wiener_filter7, ssse3);
c->wiener[1] = BF(dav1d_wiener_filter5, ssse3);
#if BITDEPTH == 8
c->sgr[0] = BF(sgr_filter_5x5, ssse3);
c->sgr[1] = BF(sgr_filter_3x3, ssse3);
c->sgr[2] = BF(sgr_filter_mix, ssse3);
#endif
if (bpc <= 10) {
c->sgr[0] = BF(dav1d_sgr_filter_5x5, ssse3);
c->sgr[1] = BF(dav1d_sgr_filter_3x3, ssse3);
c->sgr[2] = BF(dav1d_sgr_filter_mix, ssse3);
}

#if ARCH_X86_64
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
Diff between files not shown because of its large size
Diff between files not shown because of its large size
@ -41,6 +41,9 @@ blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
spel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
rescale_mul: dd 0, 1, 2, 3
resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7
db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15

pw_2: times 8 dw 2
pw_16: times 4 dw 16

@ -54,6 +57,8 @@ pw_8192: times 8 dw 8192
pw_27615: times 8 dw 27615
pw_32766: times 8 dw 32766
pw_m512: times 8 dw -512
pd_63: times 4 dd 63
pd_64: times 4 dd 64
pd_512: times 4 dd 512
pd_65538: times 2 dd 65538

@ -65,6 +70,12 @@ put_8tap_h_rnd: dd 34, 34, 40, 40
prep_8tap_1d_rnd: times 2 dd 8 - (8192 << 4)
prep_8tap_2d_rnd: times 4 dd 32 - (8192 << 5)

warp8x8_shift: dd 11, 13
warp8x8_rnd1: dd 1024, 1024, 4096, 4096
warp8x8_rnd2: times 4 dw 4096
times 4 dw 16384
warp8x8t_rnd: times 2 dd 16384 - (8192 << 15)

%macro BIDIR_JMP_TABLE 2-*
%xdefine %1_%2_table (%%table - 2*%3)
%xdefine %%base %1_%2_table

@ -105,6 +116,9 @@ BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128
cextern mc_subpel_filters
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)

cextern mc_warp_filter
cextern resize_filter

SECTION .text

%macro REPX 2-*

@ -2526,6 +2540,398 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my
RET
%undef tmp

%if ARCH_X86_64
; warp8x8t spills one less xmm register than warp8x8 on WIN64, compensate that
; by allocating 16 bytes more stack space so that stack offsets match up.
%if WIN64 && STACK_ALIGNMENT == 16
%assign stksz 16*14
%else
%assign stksz 16*13
%endif
cglobal warp_affine_8x8t_16bpc, 4, 13, 9, stksz, dst, ds, src, ss, delta, \
mx, tmp, alpha, beta, \
filter, my, gamma, cnt
%assign stack_size_padded_8x8t stack_size_padded
%else
cglobal warp_affine_8x8t_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \
filter, mx, my
%define m8 [esp+16*13]
%define m9 [esp+16*14]
%define cntd dword [esp+4*63]
%define dstq tmpq
%define dsq 0
%if STACK_ALIGNMENT < 16
%define dstm [esp+4*65]
%define dsm [esp+4*66]
%else
%define dstm r0m
%define dsm r1m
%endif
%endif
%define base filterq-$$
mov t0d, r7m
LEA filterq, $$
shr t0d, 11
%if ARCH_X86_64
movddup m8, [base+warp8x8t_rnd]
%else
movddup m1, [base+warp8x8t_rnd]
mov r1, r1m
add r1, r1
mova m8, m1
mov r1m, r1 ; ds *= 2
%endif
call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main
jmp .start
.loop:
%if ARCH_X86_64
lea dstq, [dstq+dsq*4]
%else
add dstq, dsm
mov dstm, dstq
%endif
call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main2
.start:
%if ARCH_X86_32
mov dstq, dstm
%endif
paddd m1, m8
paddd m2, m8
psrad m1, 15
psrad m2, 15
packssdw m1, m2
mova [dstq+dsq*0], m1
call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main3
%if ARCH_X86_32
mov dstq, dstm
add dstq, dsm
%endif
paddd m1, m8
paddd m2, m8
psrad m1, 15
psrad m2, 15
packssdw m1, m2
mova [dstq+dsq*2], m1
dec cntd
jg .loop
RET

%if ARCH_X86_64
cglobal warp_affine_8x8_16bpc, 4, 13, 10, 16*13, dst, ds, src, ss, delta, \
mx, tmp, alpha, beta, \
filter, my, gamma, cnt
ASSERT stack_size_padded == stack_size_padded_8x8t
%else
cglobal warp_affine_8x8_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \
filter, mx, my
%endif
mov t0d, r7m
LEA filterq, $$
shr t0d, 11
%if ARCH_X86_64
movddup m8, [base+warp8x8_rnd2+t0*8]
movd m9, r7m ; pixel_max
pshufb m9, [base+pw_256]
%else
movddup m1, [base+warp8x8_rnd2+t0*8]
movd m2, r7m ; pixel_max
pshufb m2, [base+pw_256]
mova m8, m1
mova m9, m2
%endif
call .main
jmp .start
.loop:
%if ARCH_X86_64
lea dstq, [dstq+dsq*2]
%else
add dstq, dsm
mov dstm, dstq
%endif
call .main2
.start:
%if ARCH_X86_32
mov dstq, dstm
%endif
psrad m1, 16
psrad m2, 16
packssdw m1, m2
pmaxsw m1, m6
pmulhrsw m1, m8
pminsw m1, m9
mova [dstq+dsq*0], m1
call .main3
%if ARCH_X86_32
mov dstq, dstm
add dstq, dsm
%endif
psrad m1, 16
psrad m2, 16
packssdw m1, m2
pmaxsw m1, m6
pmulhrsw m1, m8
pminsw m1, m9
mova [dstq+dsq*1], m1
dec cntd
jg .loop
RET
ALIGN function_align
.main:
; Stack args offset by one (r4m -> r5m etc.) due to call
%if WIN64
mov deltaq, r5m
mov mxd, r6m
%endif
movd m0, [base+warp8x8_shift+t0*4]
movddup m7, [base+warp8x8_rnd1+t0*8]
add filterq, mc_warp_filter-$$
%if ARCH_X86_64
movsx alphad, word [deltaq+2*0]
movsx betad, word [deltaq+2*1]
movsx gammad, word [deltaq+2*2]
movsx deltad, word [deltaq+2*3]
lea tmpq, [ssq*3]
add mxd, 512+(64<<10)
sub srcq, tmpq ; src -= ss*3
imul tmpd, alphad, -7
mov myd, r7m
add betad, tmpd ; beta -= alpha*7
imul tmpd, gammad, -7
add myd, 512+(64<<10)
mov cntd, 4
add deltad, tmpd ; delta -= gamma*7
%else
%if STACK_ALIGNMENT < 16
%assign stack_offset stack_offset - gprsize
%endif
mov r3d, r5m ; abcd
%if STACK_ALIGNMENT < 16
mov r0, r1m ; dst
mov r1, r2m ; ds
mov [esp+gprsize+4*65], r0
mov [esp+gprsize+4*66], r1
%endif
movsx alphad, word [r3+2*0]
movsx r2d, word [r3+2*1]
movsx gammad, word [r3+2*2]
movsx r3d, word [r3+2*3]
imul r5d, alphad, -7
add r2d, r5d ; beta -= alpha*7
imul r5d, gammad, -7
mov [esp+gprsize+4*60], r2d
add r3d, r5d ; delta -= gamma*7
mov [esp+gprsize+4*61], r3d
mov r3d, r4m ; ss
mov srcq, r3m
mov mxd, r6m
mov myd, r7m
mov dword [esp+gprsize+4*63], 4 ; cnt
mov [esp+gprsize+4*62], r3
lea r3, [r3*3]
add mxd, 512+(64<<10)
add myd, 512+(64<<10)
sub srcq, r3 ; src -= ss*3
%if STACK_ALIGNMENT < 16
%assign stack_offset stack_offset + gprsize
%endif
%endif
mova [rsp+gprsize], m0
pxor m6, m6
call .h
mova m5, m0
call .h
punpcklwd m1, m5, m0 ; 01
punpckhwd m5, m0
mova [rsp+gprsize+16* 1], m1
mova [rsp+gprsize+16* 4], m5
mova m5, m0
call .h
punpcklwd m1, m5, m0 ; 12
punpckhwd m5, m0
mova [rsp+gprsize+16* 7], m1
mova [rsp+gprsize+16*10], m5
mova m5, m0
call .h
punpcklwd m1, m5, m0 ; 23
punpckhwd m5, m0
mova [rsp+gprsize+16* 2], m1
mova [rsp+gprsize+16* 5], m5
mova m5, m0
call .h
punpcklwd m1, m5, m0 ; 34
punpckhwd m5, m0
mova [rsp+gprsize+16* 8], m1
mova [rsp+gprsize+16*11], m5
mova m5, m0
call .h
punpcklwd m1, m5, m0 ; 45
punpckhwd m5, m0
mova [rsp+gprsize+16* 3], m1
mova [rsp+gprsize+16* 6], m5
mova m5, m0
call .h
punpcklwd m1, m5, m0 ; 56
punpckhwd m5, m0
mova [rsp+gprsize+16* 9], m1
mova [rsp+gprsize+16*12], m5
mova m5, m0
.main2:
call .h
%macro WARP_V 6 ; 01l, 23l, 45l, 01h, 23h, 45h
lea tmpd, [myq+gammaq]
shr myd, 10
movq m4, [filterq+myq*8] ; a
lea myd, [tmpq+gammaq]
shr tmpd, 10
movq m2, [filterq+tmpq*8] ; b
lea tmpd, [myq+gammaq]
shr myd, 10
movq m3, [filterq+myq*8] ; c
lea myd, [tmpq+gammaq]
shr tmpd, 10
movq m1, [filterq+tmpq*8] ; d
lea tmpd, [myq+gammaq]
shr myd, 10
punpcklwd m4, m2
punpcklwd m3, m1
punpckldq m2, m4, m3
punpckhdq m4, m3
punpcklbw m1, m6, m2 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8
pmaddwd m1, [rsp+gprsize+16*%1]
punpckhbw m3, m6, m2 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8
mova m2, [rsp+gprsize+16*%2]
pmaddwd m3, m2
mova [rsp+gprsize+16*%1], m2
paddd m1, m3
punpcklbw m3, m6, m4 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8
mova m2, [rsp+gprsize+16*%3]
pmaddwd m3, m2
mova [rsp+gprsize+16*%2], m2
paddd m1, m3
punpcklwd m3, m5, m0 ; 67
punpckhbw m2, m6, m4 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8
pmaddwd m2, m3
mova [rsp+gprsize+16*%3], m3
paddd m1, m2
movq m4, [filterq+myq*8] ; e
lea myd, [tmpq+gammaq]
shr tmpd, 10
movq m3, [filterq+tmpq*8] ; f
lea tmpd, [myq+gammaq]
shr myd, 10
movq m2, [filterq+myq*8] ; g
%if ARCH_X86_64
lea myd, [tmpq+deltaq] ; my += delta
%else
mov myd, [esp+gprsize+4*61]
add myd, tmpd
%endif
shr tmpd, 10
punpcklwd m4, m3
movq m3, [filterq+tmpq*8] ; h
punpcklwd m2, m3
punpckldq m3, m4, m2
punpckhdq m4, m2
punpcklbw m2, m6, m3 ; e0 e1 f0 f1 g0 g1 h0 h1 << 8
pmaddwd m2, [rsp+gprsize+16*%4]
punpckhbw m6, m3 ; e2 e3 f2 f3 g2 g3 h2 h3 << 8
mova m3, [rsp+gprsize+16*%5]
pmaddwd m6, m3
mova [rsp+gprsize+16*%4], m3
pxor m3, m3
paddd m2, m6
punpcklbw m3, m4 ; e4 e5 f4 f5 g4 g5 h4 h5 << 8
mova m6, [rsp+gprsize+16*%6]
pmaddwd m3, m6
mova [rsp+gprsize+16*%5], m6
punpckhwd m5, m0
pxor m6, m6
paddd m2, m3
punpckhbw m3, m6, m4 ; e6 e7 f6 f7 g6 g7 h6 h7 << 8
pmaddwd m3, m5
mova [rsp+gprsize+16*%6], m5
mova m5, m0
paddd m2, m3
%endmacro
WARP_V 1, 2, 3, 4, 5, 6
ret
.main3:
call .h
WARP_V 7, 8, 9, 10, 11, 12
ret
ALIGN function_align
.h:
lea tmpd, [mxq+alphaq]
shr mxd, 10
movq m3, [filterq+mxq*8]
punpcklbw m0, m6, m3
movu m3, [srcq-6]
pmaddwd m0, m3 ; 0
lea mxd, [tmpq+alphaq]
shr tmpd, 10
movq m3, [filterq+tmpq*8]
punpcklbw m2, m6, m3
movu m3, [srcq-4]
pmaddwd m2, m3 ; 1
lea tmpd, [mxq+alphaq]
shr mxd, 10
movq m3, [filterq+mxq*8]
phaddd m0, m2 ; 0 1
punpcklbw m2, m6, m3
movu m3, [srcq-2]
pmaddwd m2, m3 ; 2
lea mxd, [tmpq+alphaq]
shr tmpd, 10
movq m3, [filterq+tmpq*8]
punpcklbw m1, m6, m3
movu m3, [srcq+0]
pmaddwd m1, m3 ; 3
lea tmpd, [mxq+alphaq]
shr mxd, 10
movq m3, [filterq+mxq*8]
phaddd m2, m1 ; 2 3
punpcklbw m1, m6, m3
movu m3, [srcq+2]
pmaddwd m1, m3 ; 4
lea mxd, [tmpq+alphaq]
shr tmpd, 10
movq m3, [filterq+tmpq*8]
phaddd m0, m2 ; 0 1 2 3
punpcklbw m2, m6, m3
movu m3, [srcq+4]
pmaddwd m2, m3 ; 5
lea tmpd, [mxq+alphaq]
shr mxd, 10
movq m3, [filterq+mxq*8]
phaddd m1, m2 ; 4 5
punpcklbw m2, m6, m3
movu m3, [srcq+6]
pmaddwd m2, m3 ; 6
%if ARCH_X86_64
lea mxd, [tmpq+betaq] ; mx += beta
%else
mov mxd, [esp+gprsize*2+4*60]
add mxd, tmpd
%endif
shr tmpd, 10
movq m3, [filterq+tmpq*8]
punpcklbw m4, m6, m3
movu m3, [srcq+8]
%if ARCH_X86_64
add srcq, ssq
%else
add srcq, [esp+gprsize*2+4*62]
%endif
pmaddwd m3, m4 ; 7
phaddd m2, m3 ; 6 7
phaddd m1, m2 ; 4 5 6 7
paddd m0, m7
paddd m1, m7
psrad m0, [rsp+gprsize*2]
psrad m1, [rsp+gprsize*2]
packssdw m0, m1
ret
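The `.h` helper above walks eight horizontal taps, stepping the filter index by alpha per column (`shr mxd, 10` selects the filter) and then adding the pre-reduced beta (`beta -= alpha*7` in `.main`) so that one final add both rewinds the column stepping and advances to the next row. A scalar sketch of that index stepping, with a hypothetical `idx` output standing in for the actual filtering:

/* Hypothetical scalar model of the warp filter-index stepping; the real
 * code also fetches taps from mc_warp_filter and does the multiply-adds. */
static void warp_h_indices(int mx, int alpha, int beta, int idx[8][8])
{
    for (int y = 0; y < 8; y++) {
        int m = mx;
        for (int x = 0; x < 8; x++) {
            idx[y][x] = m >> 10; /* shr mxd, 10 */
            m += alpha;          /* lea mxd, [tmpq+alphaq] */
        }
        mx += beta; /* the asm folds the -7*alpha rewind into beta up front */
    }
}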
%macro BIDIR_FN 0
call .main
jmp wq

@ -4142,3 +4548,233 @@ cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, \
%undef reg_dstride
%undef reg_blkm
%undef reg_tmp

%macro SCRATCH 3
%if ARCH_X86_32
mova [rsp+%3*mmsize], m%1
%define m%2 [rsp+%3*mmsize]
%else
SWAP %1, %2
%endif
%endmacro

%if ARCH_X86_64
cglobal resize_16bpc, 0, 12, 16, 1*16, dst, dst_stride, src, src_stride, \
dst_w, h, src_w, dx, mx0, pxmax
%elif STACK_ALIGNMENT >= 16
cglobal resize_16bpc, 0, 7, 8, 6*16, dst, dst_stride, src, src_stride, \
dst_w, h, src_w, dx, mx0, pxmax
%else
cglobal resize_16bpc, 0, 6, 8, 6*16, dst, dst_stride, src, src_stride, \
dst_w, h, src_w, dx, mx0, pxmax
%endif
movifnidn dstq, dstmp
movifnidn srcq, srcmp
%if STACK_ALIGNMENT >= 16
movifnidn dst_wd, dst_wm
%endif
%if ARCH_X86_64
movifnidn hd, hm
%endif
sub dword mx0m, 4<<14
sub dword src_wm, 8
movd m4, pxmaxm
movd m7, dxm
movd m6, mx0m
movd m5, src_wm
punpcklwd m4, m4
pshufd m4, m4, q0000
pshufd m7, m7, q0000
pshufd m6, m6, q0000
pshufd m5, m5, q0000
mova [rsp+16*3*ARCH_X86_32], m4
%if ARCH_X86_64
DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr
LEA r7, $$
%define base r7-$$
%else
DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x
%define hd dword r5m
%if STACK_ALIGNMENT >= 16
LEA r6, $$
%define base r6-$$
%else
LEA r4, $$
%define base r4-$$
%endif
%endif
%if ARCH_X86_64
mova m12, [base+pd_64]
mova m11, [base+pd_63]
%else
%define m12 [base+pd_64]
%define m11 [base+pd_63]
%endif
pmaddwd m4, m7, [base+rescale_mul] ; dx*[0,1,2,3]
pslld m7, 2 ; dx*4
pslld m5, 14
paddd m6, m4 ; mx+[0..3]*dx
SCRATCH 7, 15, 0
SCRATCH 6, 14, 1
SCRATCH 5, 13, 2
pxor m1, m1
.loop_y:
xor xd, xd
mova m0, m14 ; per-line working version of mx
.loop_x:
pcmpgtd m1, m0
pandn m1, m0
psrad m2, m0, 8 ; filter offset (unmasked)
pcmpgtd m3, m13, m1
pand m1, m3
pandn m3, m13
por m1, m3
psubd m3, m0, m1 ; pshufb offset
psrad m1, 14 ; clipped src_x offset
psrad m3, 14 ; pshufb edge_emu offset
pand m2, m11 ; filter offset (masked)
; load source pixels
%if ARCH_X86_64
movd r8d, m1
pshuflw m1, m1, q3232
movd r9d, m1
punpckhqdq m1, m1
movd r10d, m1
psrlq m1, 32
movd r11d, m1
movu m4, [srcq+r8*2]
movu m5, [srcq+r9*2]
movu m6, [srcq+r10*2]
movu m7, [srcq+r11*2]
; if no emulation is required, we don't need to shuffle or emulate edges
packssdw m3, m3
movq r11, m3
test r11, r11
jz .filter
movsx r8, r11w
sar r11, 16
movsx r9, r11w
sar r11, 16
movsx r10, r11w
sar r11, 16
movu m1, [base+resize_shuf+8+r8*2]
movu m3, [base+resize_shuf+8+r9*2]
movu m8, [base+resize_shuf+8+r10*2]
movu m9, [base+resize_shuf+8+r11*2]
pshufb m4, m1
pshufb m5, m3
pshufb m6, m8
pshufb m7, m9
.filter:
movd r8d, m2
pshuflw m2, m2, q3232
movd r9d, m2
punpckhqdq m2, m2
movd r10d, m2
psrlq m2, 32
movd r11d, m2
movq m8, [base+resize_filter+r8*8]
movq m2, [base+resize_filter+r9*8]
pxor m9, m9
punpcklbw m1, m9, m8
punpcklbw m3, m9, m2
psraw m1, 8
psraw m3, 8
movq m10, [base+resize_filter+r10*8]
movq m2, [base+resize_filter+r11*8]
punpcklbw m8, m9, m10
punpcklbw m9, m2
psraw m8, 8
psraw m9, 8
pmaddwd m4, m1
pmaddwd m5, m3
pmaddwd m6, m8
pmaddwd m7, m9
phaddd m4, m5
%else
movd r3, m1
pshuflw m1, m1, q3232
movd r1, m1
punpckhqdq m1, m1
movu m4, [srcq+r3*2]
movu m5, [srcq+r1*2]
movd r3, m1
psrlq m1, 32
movd r1, m1
movu m6, [srcq+r3*2]
movu m7, [srcq+r1*2]
; if no emulation is required, we don't need to shuffle or emulate edges
pxor m1, m1
pcmpeqb m1, m3
pmovmskb r3d, m1
cmp r3d, 0xffff
je .filter
movd r3, m3
movu m1, [base+resize_shuf+8+r3*2]
pshuflw m3, m3, q3232
movd r1, m3
pshufb m4, m1
movu m1, [base+resize_shuf+8+r1*2]
punpckhqdq m3, m3
movd r3, m3
pshufb m5, m1
movu m1, [base+resize_shuf+8+r3*2]
psrlq m3, 32
movd r1, m3
pshufb m6, m1
movu m1, [base+resize_shuf+8+r1*2]
pshufb m7, m1
.filter:
mova [esp+4*16], m6
mova [esp+5*16], m7
movd r3, m2
pshuflw m2, m2, q3232
movd r1, m2
movq m6, [base+resize_filter+r3*8]
movq m7, [base+resize_filter+r1*8]
pxor m3, m3
punpcklbw m1, m3, m6
punpcklbw m3, m7
psraw m1, 8
psraw m3, 8
pmaddwd m4, m1
pmaddwd m5, m3
punpckhqdq m2, m2
movd r3, m2
psrlq m2, 32
movd r1, m2
phaddd m4, m5
movq m2, [base+resize_filter+r3*8]
movq m5, [base+resize_filter+r1*8]
mova m6, [esp+4*16]
mova m7, [esp+5*16]
pxor m3, m3
punpcklbw m1, m3, m2
punpcklbw m3, m5
psraw m1, 8
psraw m3, 8
pmaddwd m6, m1
pmaddwd m7, m3
%endif
phaddd m6, m7
phaddd m4, m6
pxor m1, m1
psubd m2, m12, m4
psrad m2, 7
packssdw m2, m2
pmaxsw m2, m1
pminsw m2, [rsp+16*3*ARCH_X86_32]
movq [dstq+xq*2], m2
paddd m0, m15
add xd, 4
%if STACK_ALIGNMENT >= 16
cmp xd, dst_wd
%else
cmp xd, dst_wm
%endif
jl .loop_x
add dstq, dst_stridemp
add srcq, src_stridemp
dec hd
jg .loop_y
RET
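The x-loop above keeps mx in fixed point: bits 14 and up give the (clipped) source column, bits 8..13 select one of 64 filter phases (`psrad m2, m0, 8` masked with pd_63), and mx advances by dx per output pixel. A simplified scalar equivalent, assuming a hypothetical `filter` table with 8 taps per phase and ignoring the edge emulation and the final rounding/clamping to pxmax:

#include <stdint.h>

/* Simplified scalar model of the horizontal resize loop (hypothetical names). */
static void resize_row(int32_t *dst, const uint16_t *src, int dst_w,
                       int mx, int dx, const int16_t (*filter)[8])
{
    for (int x = 0; x < dst_w; x++, mx += dx) {
        const int src_x = mx >> 14;             /* clipped in the real code */
        const int16_t *const f = filter[(mx >> 8) & 63];
        int32_t sum = 0;
        for (int k = 0; k < 8; k++)
            sum += f[k] * src[src_x + k];
        dst[x] = sum; /* the asm additionally rounds, shifts and clamps */
    }
}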
@ -1,5 +1,5 @@
; Copyright © 2018-2020, VideoLAN and dav1d authors
; Copyright © 2018-2020, Two Orioles, LLC
; Copyright © 2018-2021, VideoLAN and dav1d authors
; Copyright © 2018-2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without

@ -69,7 +69,6 @@ bdct_lb_dw: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 1
wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7
resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7
db 7, 7, 7, 7, 7, 7, 7, 7

wm_420_sign: dd 0x01020102, 0x01010101
wm_422_sign: dd 0x80808080, 0x7f7f7f7f

@ -110,7 +109,7 @@ cextern resize_filter
%endmacro

%macro HV_JMP_TABLE 5-*
%xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3)
%xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
%xdefine %%base %1_%3
%assign %%types %4
%if %%types & 1

@ -141,68 +140,68 @@ cextern resize_filter
%endif
%endmacro

%macro BIDIR_JMP_TABLE 1-*
%xdefine %1_table (%%table - 2*%2)
%xdefine %%base %1_table
%xdefine %%prefix mangle(private_prefix %+ _%1)
%macro BIDIR_JMP_TABLE 2-*
%xdefine %1_%2_table (%%table - 2*%3)
%xdefine %%base %1_%2_table
%xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
%%table:
%rep %0 - 1
dd %%prefix %+ .w%2 - %%base
%rep %0 - 2
dd %%prefix %+ .w%3 - %%base
%rotate 1
%endrep
%endmacro

%macro SCALED_JMP_TABLE 1-*
%xdefine %1_table (%%table - %2)
%xdefine %%base mangle(private_prefix %+ _%1)
%macro SCALED_JMP_TABLE 2-*
%xdefine %1_%2_table (%%table - %3)
%xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
%%table:
%rep %0 - 1
dw %%base %+ .w%2 - %%base
%rep %0 - 2
dw %%base %+ .w%3 - %%base
%rotate 1
%endrep
%rotate 1
%rotate 2
%%dy_1024:
%xdefine %1_dy1_table (%%dy_1024 - %2)
%rep %0 - 1
dw %%base %+ .dy1_w%2 - %%base
%xdefine %1_%2_dy1_table (%%dy_1024 - %3)
%rep %0 - 2
dw %%base %+ .dy1_w%3 - %%base
%rotate 1
%endrep
%rotate 1
%rotate 2
%%dy_2048:
%xdefine %1_dy2_table (%%dy_2048 - %2)
%rep %0 - 1
dw %%base %+ .dy2_w%2 - %%base
%xdefine %1_%2_dy2_table (%%dy_2048 - %3)
%rep %0 - 2
dw %%base %+ .dy2_w%3 - %%base
%rotate 1
%endrep
%endmacro

%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_avx2.put)
%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_avx2.prep)
%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_8bpc_avx2.put)
%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_8bpc_avx2.prep)

%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX

BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE put_8tap_scaled_avx2, 2, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE prep_8tap_scaled_avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE avg_avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg_avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask_avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420_avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422_avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444_avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend_avx2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_v_avx2, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h_avx2, 2, 4, 8, 16, 32, 32, 32
BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE put_8tap_scaled, avx2, 2, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE prep_8tap_scaled, avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE avg, avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg, avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask, avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend, avx2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_v, avx2, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h, avx2, 2, 4, 8, 16, 32, 32, 32
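These macros now mangle an `_8bpc` suffix into the symbol names while building the same width-indexed jump tables, which callers enter via `tzcnt wd, wm` and `jmp wq`. A rough C analogy of that dispatch (illustrative only; the real tables store label offsets relative to a base, not function pointers):

typedef void (*mc_fn)(void *dst, const void *src);

/* Widths are powers of two, so the trailing-zero count maps 4, 8, 16, ...
 * onto consecutive slots; the "- 2*%3" bias in the macro plays the same
 * role as subtracting ctz(min_w) here. */
static void dispatch(const mc_fn *table, int w, int min_w,
                     void *dst, const void *src)
{
    table[__builtin_ctz(w) - __builtin_ctz(min_w)](dst, src);
}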
|
||||
SECTION .text
|
||||
|
||||
INIT_XMM avx2
|
||||
cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
|
||||
cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
|
||||
movifnidn mxyd, r6m ; mx
|
||||
lea r7, [put_avx2]
|
||||
tzcnt wd, wm
|
||||
|
@ -769,7 +768,7 @@ INIT_YMM avx2
|
|||
%endif
|
||||
RET
|
||||
|
||||
cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
|
||||
cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
|
||||
movifnidn mxyd, r5m ; mx
|
||||
lea r6, [prep%+SUFFIX]
|
||||
tzcnt wd, wm
|
||||
|
@ -1439,7 +1438,7 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
|
|||
%assign FILTER_SHARP (2*15 << 16) | 3*15
|
||||
|
||||
%macro FN 4 ; fn, type, type_h, type_v
|
||||
cglobal %1_%2
|
||||
cglobal %1_%2_8bpc
|
||||
mov t0d, FILTER_%3
|
||||
%ifidn %3, %4
|
||||
mov t1d, t0d
|
||||
|
@ -1447,7 +1446,7 @@ cglobal %1_%2
|
|||
mov t1d, FILTER_%4
|
||||
%endif
|
||||
%ifnidn %2, regular ; skip the jump in the last filter
|
||||
jmp mangle(private_prefix %+ _%1 %+ SUFFIX)
|
||||
jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX)
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
|
@ -1458,7 +1457,6 @@ DECLARE_REG_TMP 7, 8
|
|||
%endif
|
||||
|
||||
%define PUT_8TAP_FN FN put_8tap,
|
||||
|
||||
PUT_8TAP_FN sharp, SHARP, SHARP
|
||||
PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH
|
||||
PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
|
||||
|
@ -1469,7 +1467,7 @@ PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
|
|||
PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
|
||||
PUT_8TAP_FN regular, REGULAR, REGULAR
|
||||
|
||||
cglobal put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
|
||||
cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
|
||||
imul mxd, mxm, 0x010101
|
||||
add mxd, t0d ; 8tap_h, mx, 4tap_h
|
||||
imul myd, mym, 0x010101
|
||||
|
@ -2124,7 +2122,6 @@ DECLARE_REG_TMP 6, 7
|
|||
%endif
|
||||
|
||||
%define PREP_8TAP_FN FN prep_8tap,
|
||||
|
||||
PREP_8TAP_FN sharp, SHARP, SHARP
|
||||
PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
|
||||
PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
|
||||
|
@ -2135,7 +2132,7 @@ PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
|
|||
PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
|
||||
PREP_8TAP_FN regular, REGULAR, REGULAR
|
||||
|
||||
cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
|
||||
cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
|
||||
imul mxd, mxm, 0x010101
|
||||
add mxd, t0d ; 8tap_h, mx, 4tap_h
|
||||
imul myd, mym, 0x010101
|
||||
|
@ -2725,26 +2722,26 @@ cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
|
|||
%ifidn %1, put
|
||||
%assign isprep 0
|
||||
%if required_stack_alignment <= STACK_ALIGNMENT
|
||||
cglobal put_8tap_scaled, 4, 15, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy
|
||||
cglobal put_8tap_scaled_8bpc, 4, 15, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy
|
||||
%else
|
||||
cglobal put_8tap_scaled, 4, 14, 16, 128, dst, ds, src, ss, w, h, mx, my, dx, dy
|
||||
cglobal put_8tap_scaled_8bpc, 4, 14, 16, 128, dst, ds, src, ss, w, h, mx, my, dx, dy
|
||||
%endif
|
||||
%xdefine base_reg r12
|
||||
%define rndshift 10
|
||||
%else
|
||||
%assign isprep 1
|
||||
%if required_stack_alignment <= STACK_ALIGNMENT
|
||||
cglobal prep_8tap_scaled, 4, 15, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
|
||||
cglobal prep_8tap_scaled_8bpc, 4, 15, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
|
||||
%xdefine tmp_stridem r14q
|
||||
%else
|
||||
cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
|
||||
cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
|
||||
%define tmp_stridem qword [rsp+120]
|
||||
%endif
|
||||
%xdefine base_reg r11
|
||||
%define rndshift 6
|
||||
%endif
|
||||
lea base_reg, [%1_8tap_scaled_avx2]
|
||||
%define base base_reg-%1_8tap_scaled_avx2
|
||||
lea base_reg, [%1_8tap_scaled_8bpc_avx2]
|
||||
%define base base_reg-%1_8tap_scaled_8bpc_avx2
|
||||
tzcnt wd, wm
|
||||
vpbroadcastd m8, dxm
|
||||
%if isprep && UNIX64
|
||||
|
@ -2817,7 +2814,7 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
|
|||
dec srcq
|
||||
movd xm15, t0d
|
||||
punpckldq m8, m9, m8
|
||||
paddd m14, m8 ; mx+dx*[0-1]
|
||||
paddd m14, m8 ; mx+dx*[0,1]
|
||||
vpbroadcastd m11, [base+pd_0x4000]
|
||||
vpbroadcastd xm15, xm15
|
||||
pand m8, m14, m10
|
||||
|
@ -2868,8 +2865,7 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
|
|||
lea r4d, [t1+r4]
|
||||
cmovnz r6q, [base+subpel_filters+r4*8]
|
||||
movq xm11, r6q
|
||||
punpcklbw xm11, xm11
|
||||
psraw xm11, 8
|
||||
pmovsxbw xm11, xm11
|
||||
pshufd xm8, xm11, q0000
|
||||
pshufd xm9, xm11, q1111
|
||||
pshufd xm10, xm11, q2222
|
||||
|
@ -2997,8 +2993,7 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
|
|||
lea r4d, [t1+r4]
|
||||
cmovnz r6q, [base+subpel_filters+r4*8]
|
||||
movq xm10, r6q
|
||||
punpcklbw xm10, xm10
|
||||
psraw xm10, 8
|
||||
pmovsxbw xm10, xm10
|
||||
pshufd xm7, xm10, q0000
|
||||
pshufd xm8, xm10, q1111
|
||||
pshufd xm9, xm10, q2222
|
||||
|
@ -3172,9 +3167,8 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
|
|||
lea r4d, [t1+r4]
|
||||
cmovnz r6q, [base+subpel_filters+r4*8]
|
||||
movq xm11, r6q
|
||||
punpcklbw xm11, xm11
|
||||
psraw xm11, 8
|
||||
vinserti128 m11, xm11, 1
|
||||
punpcklqdq xm11, xm11
|
||||
pmovsxbw m11, xm11
|
||||
pshufd m8, m11, q0000
|
||||
pshufd m9, m11, q1111
|
||||
pmaddwd m4, m0, m8
|
||||
|
@ -3320,8 +3314,7 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
|
|||
vpbroadcastq m2, [srcq+ssq*1]
|
||||
add srcq, ss3q
|
||||
movq xm10, r4q
|
||||
punpcklbw xm10, xm10
|
||||
psraw xm10, 8
|
||||
pmovsxbw xm10, xm10
|
||||
vpblendd m15, m7, 0xaa
|
||||
pblendvb m15, m11, m8
|
||||
pshufd xm8, xm10, q0000
|
||||
|
@ -3417,9 +3410,8 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
|
|||
punpcklqdq m15, m15
|
||||
pblendvb m15, m11, m8
|
||||
movq xm10, r4q
|
||||
punpcklbw xm10, xm10
|
||||
psraw xm10, 8
|
||||
vinserti128 m10, xm10, 1
|
||||
punpcklqdq xm10, xm10
|
||||
pmovsxbw m10, xm10
|
||||
pshufb m2, m14
|
||||
pshufb m3, m14
|
||||
pshufb m4, m14
|
||||
|
@ -3526,8 +3518,7 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
|
|||
vpbroadcastd m15, xm15
|
||||
paddd m14, m8 ; mx+dx*[0-7]
|
||||
movq xm0, r4q
|
||||
punpcklbw xm0, xm0
|
||||
psraw xm0, 8
|
||||
pmovsxbw xm0, xm0
|
||||
mova [rsp+96], xm0
|
||||
jmp .dy1_hloop
|
||||
.dy1_hloop_prep:
|
||||
|
@ -3695,8 +3686,7 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
|
|||
pmaddubsw m0, m15
|
||||
pmaddubsw m1, m15
|
||||
movq xm11, r4q
|
||||
punpcklbw xm11, xm11
|
||||
psraw xm11, 8
|
||||
pmovsxbw xm11, xm11
|
||||
phaddw m0, m1
|
||||
pmulhrsw m0, m12 ; 0 2 _ 4 1 3 _ 5
|
||||
pshufd xm8, xm11, q0000
|
||||
|
@ -3792,9 +3782,8 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
|
|||
pmaddubsw xm1, xm15
|
||||
pmaddubsw m3, m15
|
||||
movq xm11, r4q
|
||||
punpcklbw xm11, xm11
|
||||
psraw xm11, 8
|
||||
vinserti128 m11, xm11, 1
|
||||
punpcklqdq xm11, xm11
|
||||
pmovsxbw m11, xm11
|
||||
phaddw m0, m2
|
||||
phaddw m1, m3
|
||||
pmulhrsw m0, m12 ; 0 2 _ 4
|
||||
|
@ -3889,8 +3878,7 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
|
|||
vpbroadcastd m15, xm15
|
||||
paddd m14, m8 ; mx+dx*[0-7]
|
||||
movq xm0, r4q
|
||||
punpcklbw xm0, xm0
|
||||
psraw xm0, 8
|
||||
pmovsxbw xm0, xm0
|
||||
mova [rsp+0x50], xm0
|
||||
jmp .dy2_hloop
|
||||
.dy2_hloop_prep:
|
||||
|
@ -4025,10 +4013,10 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
|
|||
%endmacro
|
||||
|
||||
%macro BILIN_SCALED_FN 1
|
||||
cglobal %1_bilin_scaled
|
||||
cglobal %1_bilin_scaled_8bpc
|
||||
mov t0d, (5*15 << 16) | 5*15
|
||||
mov t1d, t0d
|
||||
jmp mangle(private_prefix %+ _%1_8tap_scaled %+ SUFFIX)
|
||||
jmp mangle(private_prefix %+ _%1_8tap_scaled_8bpc %+ SUFFIX)
|
||||
%endmacro
|
||||
|
||||
%if WIN64
|
||||
|
@ -4113,11 +4101,11 @@ MC_8TAP_SCALED prep
|
|||
paddd m%1, m0, m%2
|
||||
%endmacro
|
||||
|
||||
cglobal warp_affine_8x8t, 0, 14, 0, tmp, ts
|
||||
cglobal warp_affine_8x8t_8bpc, 0, 14, 0, tmp, ts
|
||||
%if WIN64
|
||||
sub rsp, 0xa0
|
||||
%endif
|
||||
call mangle(private_prefix %+ _warp_affine_8x8_avx2).main
|
||||
call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).main
|
||||
.loop:
|
||||
psrad m7, 13
|
||||
psrad m0, 13
|
||||
|
@ -4127,13 +4115,13 @@ cglobal warp_affine_8x8t, 0, 14, 0, tmp, ts
|
|||
mova [tmpq+tsq*0], xm7
|
||||
vextracti128 [tmpq+tsq*2], m7, 1
|
||||
dec r4d
|
||||
jz mangle(private_prefix %+ _warp_affine_8x8_avx2).end
|
||||
call mangle(private_prefix %+ _warp_affine_8x8_avx2).main2
|
||||
jz mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).end
|
||||
call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).main2
|
||||
lea tmpq, [tmpq+tsq*4]
|
||||
jmp .loop
|
||||
|
||||
cglobal warp_affine_8x8, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \
|
||||
beta, filter, tmp1, delta, my, gamma
|
||||
cglobal warp_affine_8x8_8bpc, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \
|
||||
beta, filter, tmp1, delta, my, gamma
|
||||
%if WIN64
|
||||
sub rsp, 0xa0
|
||||
%assign xmm_regs_used 16
|
||||
|
@ -4389,7 +4377,7 @@ ALIGN function_align
|
|||
add tmp2q, %1*32
|
||||
%endmacro
|
||||
|
||||
cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
|
||||
cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
|
||||
%define base r6-avg %+ SUFFIX %+ _table
|
||||
lea r6, [avg %+ SUFFIX %+ _table]
|
||||
tzcnt wd, wm
|
||||
|
@ -4419,7 +4407,7 @@ cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
|
|||
|
||||
%define W_AVG_INC_PTR AVG_INC_PTR
|
||||
|
||||
cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
|
||||
cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
|
||||
%define base r6-w_avg %+ SUFFIX %+ _table
|
||||
lea r6, [w_avg %+ SUFFIX %+ _table]
|
||||
tzcnt wd, wm
|
||||
|
@ -4469,7 +4457,7 @@ cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
|
|||
add tmp1q, %1*32
|
||||
%endmacro
|
||||
|
||||
cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
|
||||
cglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
|
||||
%define base r7-mask %+ SUFFIX %+ _table
|
||||
lea r7, [mask %+ SUFFIX %+ _table]
|
||||
tzcnt wd, wm
|
||||
|
@ -4512,7 +4500,7 @@ cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
|
|||
packuswb m%1, m1
|
||||
%endmacro
|
||||
|
||||
cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask
|
||||
cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
|
||||
%define base r6-blend_avx2_table
|
||||
lea r6, [blend_avx2_table]
|
||||
tzcnt wd, wm
|
||||
|
@ -4629,7 +4617,7 @@ ALIGN function_align
|
|||
jg .w32
|
||||
RET
|
||||
|
||||
cglobal blend_v, 3, 6, 6, dst, ds, tmp, w, h, mask
|
||||
cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
|
||||
%define base r5-blend_v_avx2_table
|
||||
lea r5, [blend_v_avx2_table]
|
||||
tzcnt wd, wm
|
||||
|
@ -4740,7 +4728,7 @@ ALIGN function_align
    jg .w32_loop
    RET

cglobal blend_h, 4, 7, 6, dst, ds, tmp, w, h, mask
cglobal blend_h_8bpc, 4, 7, 6, dst, ds, tmp, w, h, mask
    %define base r5-blend_h_avx2_table
    lea r5, [blend_h_avx2_table]
    mov r6d, wd

@ -4866,7 +4854,7 @@ ALIGN function_align
    jl .w32_loop0
    RET

cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
cglobal emu_edge_8bpc, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
                                  bottomext, rightext
    ; we assume that the buffer (stride) is larger than width, so we can
    ; safely overwrite by a few bytes

@ -5053,8 +5041,8 @@ cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
.end:
    RET

cglobal resize, 6, 14, 16, dst, dst_stride, src, src_stride, \
                           dst_w, h, src_w, dx, mx0
cglobal resize_8bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \
                                dst_w, h, src_w, dx, mx0
    sub dword mx0m, 4<<14
    sub dword src_wm, 8
    vpbroadcastd m5, dxm

@ -5117,27 +5105,23 @@ cglobal resize, 6, 14, 16, dst, dst_stride, src, src_stride, \
    vptest m1, m1
    jz .filter

    movd r8d, xm1
    pextrd r9d, xm1, 1
    pextrd r10d, xm1, 2
    pextrd r11d, xm1, 3
    movsxd r8, r8d
    movsxd r9, r9d
    movsxd r10, r10d
    movsxd r11, r11d
    movq r9, xm1
    pextrq r11, xm1, 1
    movsxd r8, r9d
    sar r9, 32
    movsxd r10, r11d
    sar r11, 32
    vextracti128 xm1, m1, 1
    movq xm14, [base+resize_shuf+4+r8]
    movq xm0, [base+resize_shuf+4+r10]
    movhps xm14, [base+resize_shuf+4+r9]
    movhps xm0, [base+resize_shuf+4+r11]
    movd r8d, xm1
    pextrd r9d, xm1, 1
    pextrd r10d, xm1, 2
    pextrd r11d, xm1, 3
    movsxd r8, r8d
    movsxd r9, r9d
    movsxd r10, r10d
    movsxd r11, r11d
    movq r9, xm1
    pextrq r11, xm1, 1
    movsxd r8, r9d
    sar r9, 32
    movsxd r10, r11d
    sar r11, 32
    vinserti128 m14, [base+resize_shuf+4+r8], 1
    vinserti128 m0, [base+resize_shuf+4+r10], 1
    vpbroadcastq m10, [base+resize_shuf+4+r9]

@ -5191,7 +5175,7 @@ cglobal resize, 6, 14, 16, dst, dst_stride, src, src_stride, \
    jg .loop_y
    RET

cglobal w_mask_420, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
    %define base r7-w_mask_420_avx2_table
    lea r7, [w_mask_420_avx2_table]
    tzcnt wd, wm

@ -5397,7 +5381,7 @@ cglobal w_mask_420, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
    jg .w128_loop
    RET

cglobal w_mask_422, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
    %define base r7-w_mask_422_avx2_table
    lea r7, [w_mask_422_avx2_table]
    tzcnt wd, wm

@ -5570,7 +5554,7 @@ cglobal w_mask_422, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
    jg .w128_loop
    RET

cglobal w_mask_444, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3
cglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3
    %define base r7-w_mask_444_avx2_table
    lea r7, [w_mask_444_avx2_table]
    tzcnt wd, wm

@ -146,7 +146,7 @@ cextern mc_subpel_filters
%endmacro

%macro HV_JMP_TABLE 5-*
    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3)
    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
    %xdefine %%base %1_%3
    %assign %%types %4
    %if %%types & 1

@ -177,30 +177,30 @@ cextern mc_subpel_filters
    %endif
%endmacro

%macro BIDIR_JMP_TABLE 1-*
    %xdefine %1_table (%%table - 2*%2)
    %xdefine %%base %1_table
    %xdefine %%prefix mangle(private_prefix %+ _%1)
%macro BIDIR_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - 2*%3)
    %xdefine %%base %1_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
    %%table:
    %rep %0 - 1
        dd %%prefix %+ .w%2 - %%base
    %rep %0 - 2
        dd %%prefix %+ .w%3 - %%base
        %rotate 1
    %endrep
%endmacro

%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_avx512icl.prep)
%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_8bpc_avx512icl.prep)

%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX

BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 8tap, avx512icl, 7, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE avg_avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg_avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask_avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420_avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422_avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444_avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, avx512icl, 4, 8, 16, 32, 64, 128

SECTION .text

@ -221,7 +221,7 @@ INIT_ZMM cpuname
DECLARE_REG_TMP 3, 5, 6

INIT_ZMM avx512icl
cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
    movifnidn mxyd, r5m ; mx
    lea t2, [prep_avx512icl]
    tzcnt wd, wm

@ -772,7 +772,7 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
%assign FILTER_SHARP (2*15 << 16) | 3*15

%macro FN 4 ; fn, type, type_h, type_v
cglobal %1_%2
cglobal %1_%2_8bpc
    mov t0d, FILTER_%3
%ifidn %3, %4
    mov t1d, t0d

@ -780,7 +780,7 @@ cglobal %1_%2
    mov t1d, FILTER_%4
%endif
%ifnidn %2, regular ; skip the jump in the last filter
    jmp mangle(private_prefix %+ _%1 %+ SUFFIX)
    jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX)
%endif
%endmacro

@ -829,7 +829,7 @@ PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
PREP_8TAP_FN regular, REGULAR, REGULAR

cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
    imul mxd, mxm, 0x010101
    add mxd, t0d ; 8tap_h, mx, 4tap_h
    imul myd, mym, 0x010101

@ -1753,7 +1753,7 @@ cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
    add tmp2q, %1*mmsize
%endmacro

cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
    %define base r6-avg_avx512icl_table
    lea r6, [avg_avx512icl_table]
    tzcnt wd, wm

@ -1783,7 +1783,7 @@ cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3

%define W_AVG_INC_PTR AVG_INC_PTR

cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
    %define base r6-w_avg_avx512icl_table
    lea r6, [w_avg_avx512icl_table]
    tzcnt wd, wm

@ -1837,7 +1837,7 @@ cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
    add tmp1q, %1*64
%endmacro

cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
cglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
    %define base r7-mask_avx512icl_table
    lea r7, [mask_avx512icl_table]
    tzcnt wd, wm

@ -1877,7 +1877,7 @@ cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
    packuswb m%1, m1
%endmacro

cglobal w_mask_420, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
    %define base r7-w_mask_420_avx512icl_table
    lea r7, [w_mask_420_avx512icl_table]
    tzcnt wd, wm

@ -2070,7 +2070,7 @@ cglobal w_mask_420, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
    jg .w128_loop
    RET

cglobal w_mask_422, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
    %define base r7-w_mask_422_avx512icl_table
    lea r7, [w_mask_422_avx512icl_table]
    tzcnt wd, wm

@ -2243,7 +2243,7 @@ cglobal w_mask_422, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
    jg .w128_loop
    RET

cglobal w_mask_444, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3
cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3
    %define base r7-w_mask_444_avx512icl_table
    lea r7, [w_mask_444_avx512icl_table]
    tzcnt wd, wm

@ -1,6 +1,6 @@
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Two Orioles, LLC
 * Copyright © 2018-2021, VideoLAN and dav1d authors
 * Copyright © 2018-2021, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without

@ -28,35 +28,19 @@
#include "src/cpu.h"
#include "src/mc.h"

#if BITDEPTH == 8
#define decl_fn(type, name) \
    decl_##type##_fn(name##_sse2); \
    decl_##type##_fn(name##_ssse3); \
    decl_##type##_fn(name##_avx2); \
    decl_##type##_fn(name##_avx512icl);
    decl_##type##_fn(BF(name, sse2)); \
    decl_##type##_fn(BF(name, ssse3)); \
    decl_##type##_fn(BF(name, avx2)); \
    decl_##type##_fn(BF(name, avx512icl));
#define init_mc_fn(type, name, suffix) \
    c->mc[type] = dav1d_put_##name##_##suffix
    c->mc[type] = BF(dav1d_put_##name, suffix)
#define init_mct_fn(type, name, suffix) \
    c->mct[type] = dav1d_prep_##name##_##suffix
    c->mct[type] = BF(dav1d_prep_##name, suffix)
#define init_mc_scaled_fn(type, name, suffix) \
    c->mc_scaled[type] = dav1d_put_##name##_##suffix
    c->mc_scaled[type] = BF(dav1d_put_##name, suffix)
#define init_mct_scaled_fn(type, name, suffix) \
    c->mct_scaled[type] = dav1d_prep_##name##_##suffix
#else
#define decl_fn(type, name) \
    decl_##type##_fn(name##_16bpc_sse2); \
    decl_##type##_fn(name##_16bpc_ssse3); \
    decl_##type##_fn(name##_16bpc_avx2); \
    decl_##type##_fn(name##_16bpc_avx512icl);
#define init_mc_fn(type, name, suffix) \
    c->mc[type] = dav1d_put_##name##_16bpc_##suffix
#define init_mct_fn(type, name, suffix) \
    c->mct[type] = dav1d_prep_##name##_16bpc_##suffix
#define init_mc_scaled_fn(type, name, suffix) \
    c->mc_scaled[type] = dav1d_put_##name##_16bpc_##suffix
#define init_mct_scaled_fn(type, name, suffix) \
    c->mct_scaled[type] = dav1d_prep_##name##_16bpc_##suffix
#endif
    c->mct_scaled[type] = BF(dav1d_prep_##name, suffix)

decl_fn(mc, dav1d_put_8tap_regular);
decl_fn(mc, dav1d_put_8tap_regular_smooth);

@ -113,14 +97,13 @@ decl_fn(blend_dir, dav1d_blend_v);
decl_fn(blend_dir, dav1d_blend_h);

decl_fn(warp8x8, dav1d_warp_affine_8x8);
decl_warp8x8_fn(dav1d_warp_affine_8x8_sse4);
decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, sse4));
decl_fn(warp8x8t, dav1d_warp_affine_8x8t);
decl_warp8x8t_fn(dav1d_warp_affine_8x8t_sse4);
decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, sse4));

decl_fn(emu_edge, dav1d_emu_edge);

decl_resize_fn(dav1d_resize_avx2);
decl_resize_fn(dav1d_resize_ssse3);
decl_fn(resize, dav1d_resize);

COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
    const unsigned flags = dav1d_get_cpu_flags();

@ -140,8 +123,8 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
    init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, sse2);
    init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, sse2);

    c->warp8x8 = dav1d_warp_affine_8x8_sse2;
    c->warp8x8t = dav1d_warp_affine_8x8t_sse2;
    c->warp8x8 = BF(dav1d_warp_affine_8x8, sse2);
    c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse2);
#endif

    if(!(flags & DAV1D_X86_CPU_FLAG_SSSE3))

@ -193,40 +176,26 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
    init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3);
#endif

#if BITDEPTH == 8
    c->avg = dav1d_avg_ssse3;
    c->w_avg = dav1d_w_avg_ssse3;
    c->mask = dav1d_mask_ssse3;
    c->w_mask[2] = dav1d_w_mask_420_ssse3;
    c->blend = dav1d_blend_ssse3;
    c->blend_v = dav1d_blend_v_ssse3;
    c->blend_h = dav1d_blend_h_ssse3;

    c->warp8x8 = dav1d_warp_affine_8x8_ssse3;
    c->warp8x8t = dav1d_warp_affine_8x8t_ssse3;

    c->emu_edge = dav1d_emu_edge_ssse3;
    c->resize = dav1d_resize_ssse3;
#else
    c->avg = dav1d_avg_16bpc_ssse3;
    c->w_avg = dav1d_w_avg_16bpc_ssse3;
    c->mask = dav1d_mask_16bpc_ssse3;
    c->w_mask[0] = dav1d_w_mask_444_16bpc_ssse3;
    c->w_mask[1] = dav1d_w_mask_422_16bpc_ssse3;
    c->w_mask[2] = dav1d_w_mask_420_16bpc_ssse3;
    c->blend = dav1d_blend_16bpc_ssse3;
    c->blend_v = dav1d_blend_v_16bpc_ssse3;
    c->blend_h = dav1d_blend_h_16bpc_ssse3;

    c->emu_edge = dav1d_emu_edge_16bpc_ssse3;
#endif
    c->avg = BF(dav1d_avg, ssse3);
    c->w_avg = BF(dav1d_w_avg, ssse3);
    c->mask = BF(dav1d_mask, ssse3);
    c->w_mask[0] = BF(dav1d_w_mask_444, ssse3);
    c->w_mask[1] = BF(dav1d_w_mask_422, ssse3);
    c->w_mask[2] = BF(dav1d_w_mask_420, ssse3);
    c->blend = BF(dav1d_blend, ssse3);
    c->blend_v = BF(dav1d_blend_v, ssse3);
    c->blend_h = BF(dav1d_blend_h, ssse3);
    c->warp8x8 = BF(dav1d_warp_affine_8x8, ssse3);
    c->warp8x8t = BF(dav1d_warp_affine_8x8t, ssse3);
    c->emu_edge = BF(dav1d_emu_edge, ssse3);
    c->resize = BF(dav1d_resize, ssse3);

    if(!(flags & DAV1D_X86_CPU_FLAG_SSE41))
        return;

#if BITDEPTH == 8
    c->warp8x8 = dav1d_warp_affine_8x8_sse4;
    c->warp8x8t = dav1d_warp_affine_8x8t_sse4;
    c->warp8x8 = BF(dav1d_warp_affine_8x8, sse4);
    c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse4);
#endif

#if ARCH_X86_64

@ -255,7 +224,6 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
    init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2);
    init_mct_fn(FILTER_2D_BILINEAR, bilin, avx2);

#if BITDEPTH == 8
    init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2);
    init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
    init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2);

@ -278,35 +246,19 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
    init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2);
    init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2);

    c->avg = dav1d_avg_avx2;
    c->w_avg = dav1d_w_avg_avx2;
    c->mask = dav1d_mask_avx2;
    c->w_mask[0] = dav1d_w_mask_444_avx2;
    c->w_mask[1] = dav1d_w_mask_422_avx2;
    c->w_mask[2] = dav1d_w_mask_420_avx2;
    c->blend = dav1d_blend_avx2;
    c->blend_v = dav1d_blend_v_avx2;
    c->blend_h = dav1d_blend_h_avx2;

    c->warp8x8 = dav1d_warp_affine_8x8_avx2;
    c->warp8x8t = dav1d_warp_affine_8x8t_avx2;

    c->emu_edge = dav1d_emu_edge_avx2;
    c->resize = dav1d_resize_avx2;
#else
    c->avg = dav1d_avg_16bpc_avx2;
    c->w_avg = dav1d_w_avg_16bpc_avx2;
    c->mask = dav1d_mask_16bpc_avx2;
    c->w_mask[0] = dav1d_w_mask_444_16bpc_avx2;
    c->w_mask[1] = dav1d_w_mask_422_16bpc_avx2;
    c->w_mask[2] = dav1d_w_mask_420_16bpc_avx2;
    c->blend = dav1d_blend_16bpc_avx2;
    c->blend_v = dav1d_blend_v_16bpc_avx2;
    c->blend_h = dav1d_blend_h_16bpc_avx2;
    c->warp8x8 = dav1d_warp_affine_8x8_16bpc_avx2;
    c->warp8x8t = dav1d_warp_affine_8x8t_16bpc_avx2;
    c->emu_edge = dav1d_emu_edge_16bpc_avx2;
#endif
    c->avg = BF(dav1d_avg, avx2);
    c->w_avg = BF(dav1d_w_avg, avx2);
    c->mask = BF(dav1d_mask, avx2);
    c->w_mask[0] = BF(dav1d_w_mask_444, avx2);
    c->w_mask[1] = BF(dav1d_w_mask_422, avx2);
    c->w_mask[2] = BF(dav1d_w_mask_420, avx2);
    c->blend = BF(dav1d_blend, avx2);
    c->blend_v = BF(dav1d_blend_v, avx2);
    c->blend_h = BF(dav1d_blend_h, avx2);
    c->warp8x8 = BF(dav1d_warp_affine_8x8, avx2);
    c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx2);
    c->emu_edge = BF(dav1d_emu_edge, avx2);
    c->resize = BF(dav1d_resize, avx2);

    if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL))
        return;

@ -323,12 +275,12 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
    init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx512icl);
    init_mct_fn(FILTER_2D_BILINEAR, bilin, avx512icl);

    c->avg = dav1d_avg_avx512icl;
    c->w_avg = dav1d_w_avg_avx512icl;
    c->mask = dav1d_mask_avx512icl;
    c->w_mask[0] = dav1d_w_mask_444_avx512icl;
    c->w_mask[1] = dav1d_w_mask_422_avx512icl;
    c->w_mask[2] = dav1d_w_mask_420_avx512icl;
    c->avg = BF(dav1d_avg, avx512icl);
    c->w_avg = BF(dav1d_w_avg, avx512icl);
    c->mask = BF(dav1d_mask, avx512icl);
    c->w_mask[0] = BF(dav1d_w_mask_444, avx512icl);
    c->w_mask[1] = BF(dav1d_w_mask_422, avx512icl);
    c->w_mask[2] = BF(dav1d_w_mask_420, avx512icl);
#endif
#endif
}

[Diff between files not shown due to its large size]

@ -0,0 +1,169 @@
; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 32

%macro JMP_TABLE 2-*
    %xdefine %%prefix mangle(private_prefix %+ _%1)
    %1_table:
    %xdefine %%base %1_table
    %rep %0 - 1
        dd %%prefix %+ .w%2 - %%base
        %rotate 1
    %endrep
%endmacro

%if ARCH_X86_64
splat_mv_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3
               db 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7

JMP_TABLE splat_mv_avx2, 1, 2, 4, 8, 16, 32
%endif
JMP_TABLE splat_mv_sse2, 1, 2, 4, 8, 16, 32

SECTION .text

INIT_XMM sse2
; refmvs_block **rr, refmvs_block *a, int bx4, int bw4, int bh4
cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
    add bx4d, bw4d
    tzcnt bw4d, bw4d
    mova m2, [aq]
    LEA aq, splat_mv_sse2_table
    lea bx4q, [bx4q*3-32]
    movsxd bw4q, [aq+bw4q*4]
    movifnidn bh4d, bh4m
    pshufd m0, m2, q0210
    pshufd m1, m2, q1021
    pshufd m2, m2, q2102
    add bw4q, aq
.loop:
    mov aq, [rrq]
    add rrq, gprsize
    lea aq, [aq+bx4q*4]
    jmp bw4q
.w32:
    mova [aq-16*16], m0
    mova [aq-16*15], m1
    mova [aq-16*14], m2
    mova [aq-16*13], m0
    mova [aq-16*12], m1
    mova [aq-16*11], m2
    mova [aq-16*10], m0
    mova [aq-16* 9], m1
    mova [aq-16* 8], m2
    mova [aq-16* 7], m0
    mova [aq-16* 6], m1
    mova [aq-16* 5], m2
.w16:
    mova [aq-16* 4], m0
    mova [aq-16* 3], m1
    mova [aq-16* 2], m2
    mova [aq-16* 1], m0
    mova [aq+16* 0], m1
    mova [aq+16* 1], m2
.w8:
    mova [aq+16* 2], m0
    mova [aq+16* 3], m1
    mova [aq+16* 4], m2
.w4:
    mova [aq+16* 5], m0
    mova [aq+16* 6], m1
    mova [aq+16* 7], m2
    dec bh4d
    jg .loop
    RET
.w2:
    movu [aq+104], m0
    movq [aq+120], m1
    dec bh4d
    jg .loop
    RET
.w1:
    movq [aq+116], m0
    movd [aq+124], m2
    dec bh4d
    jg .loop
    RET

%if ARCH_X86_64
INIT_YMM avx2
cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
    add bx4d, bw4d
    tzcnt bw4d, bw4d
    vbroadcasti128 m0, [aq]
    lea aq, [splat_mv_avx2_table]
    lea bx4q, [bx4q*3-32]
    movsxd bw4q, [aq+bw4q*4]
    pshufb m0, [splat_mv_shuf]
    movifnidn bh4d, bh4m
    pshufd m1, m0, q2102
    pshufd m2, m0, q1021
    add bw4q, aq
.loop:
    mov aq, [rrq]
    add rrq, gprsize
    lea aq, [aq+bx4q*4]
    jmp bw4q
.w32:
    mova [aq-32*8], m0
    mova [aq-32*7], m1
    mova [aq-32*6], m2
    mova [aq-32*5], m0
    mova [aq-32*4], m1
    mova [aq-32*3], m2
.w16:
    mova [aq-32*2], m0
    mova [aq-32*1], m1
    mova [aq+32*0], m2
.w8:
    mova [aq+32*1], m0
    mova [aq+32*2], m1
    mova [aq+32*3], m2
    dec bh4d
    jg .loop
    RET
.w4:
    movu [aq+ 80], m0
    mova [aq+112], xm1
    dec bh4d
    jg .loop
    RET
.w2:
    movu [aq+104], xm0
    movq [aq+120], xm2
    dec bh4d
    jg .loop
    RET
.w1:
    movq [aq+116], xm0
    movd [aq+124], xm1
    dec bh4d
    jg .loop
    RET
%endif

@ -0,0 +1,46 @@
/*
 * Copyright © 2021, VideoLAN and dav1d authors
 * Copyright © 2021, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/cpu.h"
#include "src/refmvs.h"

decl_splat_mv_fn(dav1d_splat_mv_sse2);
decl_splat_mv_fn(dav1d_splat_mv_avx2);

COLD void dav1d_refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *const c) {
    const unsigned flags = dav1d_get_cpu_flags();

    if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;

    c->splat_mv = dav1d_splat_mv_sse2;

#if ARCH_X86_64
    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;

    c->splat_mv = dav1d_splat_mv_avx2;
#endif
}

@ -0,0 +1,201 @@
/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2015 Martin Storsjo
 * Copyright © 2015 Janne Grunau
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#define PRIVATE_PREFIX checkasm_

#include "src/arm/asm.S"
#include "src/arm/32/util.S"

const register_init, align=3
    .quad 0x21f86d66c8ca00ce
    .quad 0x75b6ba21077c48ad
    .quad 0xed56bb2dcb3c7736
    .quad 0x8bda43d3fd1a7e06
    .quad 0xb64a9c9e5d318408
    .quad 0xdf9a54b303f1d3a3
    .quad 0x4a75479abd64e097
    .quad 0x249214109d5d1c88
endconst

const error_message_fpscr
    .asciz "failed to preserve register FPSCR, changed bits: %x"
error_message_gpr:
    .asciz "failed to preserve register r%d"
error_message_vfp:
    .asciz "failed to preserve register d%d"
error_message_stack:
    .asciz "failed to preserve stack"
endconst

@ max number of args used by any asm function.
#define MAX_ARGS 15

#define ARG_STACK 4*(MAX_ARGS - 4)

@ Align the used stack space to 8 to preserve the stack alignment.
@ +8 for stack canary reference.
#define ARG_STACK_A (((ARG_STACK + pushed + 7) & ~7) - pushed + 8)

.macro clobbercheck variant
.equ pushed, 4*9
function checked_call_\variant, export=1
    push {r4-r11, lr}
.ifc \variant, vfp
    vpush {d8-d15}
    fmrx r4, FPSCR
    push {r4}
.equ pushed, pushed + 16*4 + 4
.endif

    movrel r12, register_init
.ifc \variant, vfp
    vldm r12, {d8-d15}
.endif
    ldm r12, {r4-r11}

    sub sp, sp, #ARG_STACK_A
.equ pos, 0
.rept MAX_ARGS-4
    ldr r12, [sp, #ARG_STACK_A + pushed + 8 + pos]
    str r12, [sp, #pos]
.equ pos, pos + 4
.endr

    @ For stack overflows, the callee is free to overwrite the parameters
    @ that were passed on the stack (if any), so we can only check after
    @ that point. First figure out how many parameters the function
    @ really took on the stack:
    ldr r12, [sp, #ARG_STACK_A + pushed + 8 + 4*(MAX_ARGS-4)]
    @ Load the first non-parameter value from the stack, that should be
    @ left untouched by the function. Store a copy of it inverted, so that
    @ e.g. overwriting everything with zero would be noticed.
    ldr r12, [sp, r12, lsl #2]
    mvn r12, r12
    str r12, [sp, #ARG_STACK_A - 4]

    mov r12, r0
    mov r0, r2
    mov r1, r3
    ldrd r2, r3, [sp, #ARG_STACK_A + pushed]
    @ Call the target function
    blx r12

    @ Load the number of stack parameters, stack canary and its reference
    ldr r12, [sp, #ARG_STACK_A + pushed + 8 + 4*(MAX_ARGS-4)]
    ldr r2, [sp, r12, lsl #2]
    ldr r3, [sp, #ARG_STACK_A - 4]

    add sp, sp, #ARG_STACK_A
    push {r0, r1}

    mvn r3, r3
    cmp r2, r3
    bne 5f

    movrel r12, register_init
.ifc \variant, vfp
.macro check_reg_vfp, dreg, offset
    ldrd r2, r3, [r12, #8 * (\offset)]
    vmov r0, lr, \dreg
    eor r2, r2, r0
    eor r3, r3, lr
    orrs r2, r2, r3
    bne 4f
.endm

.irp n, 8, 9, 10, 11, 12, 13, 14, 15
    @ keep track of the checked double/SIMD register
    mov r1, #\n
    check_reg_vfp d\n, \n-8
.endr
.purgem check_reg_vfp

    fmrx r1, FPSCR
    ldr r3, [sp, #8]
    eor r1, r1, r3
    @ Ignore changes in bits 0-4 and 7
    bic r1, r1, #0x9f
    @ Ignore changes in the topmost 5 bits
    bics r1, r1, #0xf8000000
    bne 3f
.endif

    @ keep track of the checked GPR
    mov r1, #4
.macro check_reg reg1, reg2=
    ldrd r2, r3, [r12], #8
    eors r2, r2, \reg1
    bne 2f
    add r1, r1, #1
.ifnb \reg2
    eors r3, r3, \reg2
    bne 2f
.endif
    add r1, r1, #1
.endm
    check_reg r4, r5
    check_reg r6, r7
@ r9 is a volatile register in the ios ABI
#ifdef __APPLE__
    check_reg r8
#else
    check_reg r8, r9
#endif
    check_reg r10, r11
.purgem check_reg

    b 0f
5:
    movrel r0, error_message_stack
    b 1f
4:
    movrel r0, error_message_vfp
    b 1f
3:
    movrel r0, error_message_fpscr
    b 1f
2:
    movrel r0, error_message_gpr
1:
#ifdef PREFIX
    bl _checkasm_fail_func
#else
    bl checkasm_fail_func
#endif
0:
    pop {r0, r1}
.ifc \variant, vfp
    pop {r2}
    fmxr FPSCR, r2
    vpop {d8-d15}
.endif
    pop {r4-r11, pc}
endfunc
.endm

clobbercheck vfp

@ -0,0 +1,211 @@
/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2015 Martin Storsjo
 * Copyright © 2015 Janne Grunau
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#define PRIVATE_PREFIX checkasm_

#include "src/arm/asm.S"
#include "src/arm/64/util.S"

const register_init, align=4
    .quad 0x21f86d66c8ca00ce
    .quad 0x75b6ba21077c48ad
    .quad 0xed56bb2dcb3c7736
    .quad 0x8bda43d3fd1a7e06
    .quad 0xb64a9c9e5d318408
    .quad 0xdf9a54b303f1d3a3
    .quad 0x4a75479abd64e097
    .quad 0x249214109d5d1c88
    .quad 0x1a1b2550a612b48c
    .quad 0x79445c159ce79064
    .quad 0x2eed899d5a28ddcd
    .quad 0x86b2536fcd8cf636
    .quad 0xb0856806085e7943
    .quad 0x3f2bf84fc0fcca4e
    .quad 0xacbd382dcf5b8de2
    .quad 0xd229e1f5b281303f
    .quad 0x71aeaff20b095fd9
    .quad 0xab63e2e11fa38ed9
endconst


const error_message_register
    .asciz "failed to preserve register"
error_message_stack:
    .asciz "stack clobbered"
endconst


// max number of args used by any asm function.
#define MAX_ARGS 15

#define CLOBBER_STACK ((8*MAX_ARGS + 15) & ~15)

function stack_clobber, export=1
    mov x3, sp
    mov x2, #CLOBBER_STACK
1:
    stp x0, x1, [sp, #-16]!
    subs x2, x2, #16
    b.gt 1b
    mov sp, x3
    ret
endfunc

// + 16 for stack canary reference
#define ARG_STACK ((8*(MAX_ARGS - 8) + 15) & ~15 + 16)

function checked_call, export=1
    stp x29, x30, [sp, #-16]!
    mov x29, sp
    stp x19, x20, [sp, #-16]!
    stp x21, x22, [sp, #-16]!
    stp x23, x24, [sp, #-16]!
    stp x25, x26, [sp, #-16]!
    stp x27, x28, [sp, #-16]!
    stp d8, d9, [sp, #-16]!
    stp d10, d11, [sp, #-16]!
    stp d12, d13, [sp, #-16]!
    stp d14, d15, [sp, #-16]!

    movrel x9, register_init
    ldp d8, d9, [x9], #16
    ldp d10, d11, [x9], #16
    ldp d12, d13, [x9], #16
    ldp d14, d15, [x9], #16
    ldp x19, x20, [x9], #16
    ldp x21, x22, [x9], #16
    ldp x23, x24, [x9], #16
    ldp x25, x26, [x9], #16
    ldp x27, x28, [x9], #16

    sub sp, sp, #ARG_STACK
.equ pos, 0
.rept MAX_ARGS-8
    // Skip the first 8 args, that are loaded into registers
    ldr x9, [x29, #16 + 8*8 + pos]
    str x9, [sp, #pos]
.equ pos, pos + 8
.endr

    // Fill x8-x17 with garbage. This doesn't have to be preserved,
    // but avoids relying on them having any particular value.
    movrel x9, register_init
    ldp x10, x11, [x9], #32
    ldp x12, x13, [x9], #32
    ldp x14, x15, [x9], #32
    ldp x16, x17, [x9], #32
    ldp x8, x9, [x9]

    // For stack overflows, the callee is free to overwrite the parameters
    // that were passed on the stack (if any), so we can only check after
    // that point. First figure out how many parameters the function
    // really took on the stack:
    ldr w2, [x29, #16 + 8*8 + (MAX_ARGS-8)*8]
    // Load the first non-parameter value from the stack, that should be
    // left untouched by the function. Store a copy of it inverted, so that
    // e.g. overwriting everything with zero would be noticed.
    ldr x2, [sp, x2, lsl #3]
    mvn x2, x2
    str x2, [sp, #ARG_STACK-8]

    // Load the in-register arguments
    mov x12, x0
    ldp x0, x1, [x29, #16]
    ldp x2, x3, [x29, #32]
    ldp x4, x5, [x29, #48]
    ldp x6, x7, [x29, #64]
    // Call the target function
    blr x12

    // Load the number of stack parameters, stack canary and its reference
    ldr w2, [x29, #16 + 8*8 + (MAX_ARGS-8)*8]
    ldr x2, [sp, x2, lsl #3]
    ldr x3, [sp, #ARG_STACK-8]

    add sp, sp, #ARG_STACK
    stp x0, x1, [sp, #-16]!

    mvn x3, x3
    cmp x2, x3
    b.ne 2f

    movrel x9, register_init
    movi v3.8h, #0

.macro check_reg_neon reg1, reg2
    ldr q1, [x9], #16
    uzp1 v2.2d, v\reg1\().2d, v\reg2\().2d
    eor v1.16b, v1.16b, v2.16b
    orr v3.16b, v3.16b, v1.16b
.endm
    check_reg_neon 8, 9
    check_reg_neon 10, 11
    check_reg_neon 12, 13
    check_reg_neon 14, 15
    uqxtn v3.8b, v3.8h
    umov x3, v3.d[0]

.macro check_reg reg1, reg2
    ldp x0, x1, [x9], #16
    eor x0, x0, \reg1
    eor x1, x1, \reg2
    orr x3, x3, x0
    orr x3, x3, x1
.endm
    check_reg x19, x20
    check_reg x21, x22
    check_reg x23, x24
    check_reg x25, x26
    check_reg x27, x28

    cbz x3, 0f

    movrel x0, error_message_register
    b 1f
2:
    movrel x0, error_message_stack
1:
#ifdef PREFIX
    bl _checkasm_fail_func
#else
    bl checkasm_fail_func
#endif
0:
    ldp x0, x1, [sp], #16
    ldp d14, d15, [sp], #16
    ldp d12, d13, [sp], #16
    ldp d10, d11, [sp], #16
    ldp d8, d9, [sp], #16
    ldp x27, x28, [sp], #16
    ldp x25, x26, [sp], #16
    ldp x23, x24, [sp], #16
    ldp x21, x22, [sp], #16
    ldp x19, x20, [sp], #16
    ldp x29, x30, [sp], #16
    ret
endfunc

@ -0,0 +1,150 @@
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "tests/checkasm/checkasm.h"

#include <string.h>
#include <stdio.h>

#include "common/dump.h"

#include "src/levels.h"
#include "src/cdef.h"

static int to_binary(int x) { /* 0-15 -> 0000-1111 */
    return (x & 1) + 5 * (x & 2) + 25 * (x & 4) + 125 * (x & 8);
}
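
/* Illustrative note: to_binary(0x5) == 101 and to_binary(0xf) == 1111, so the
 * decimal digits of the result spell out the binary digits of the 4-bit edges
 * mask, which is what lets the %04d format in check_cdef_filter below print
 * the mask as e.g. "0101". */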

static void init_tmp(pixel *buf, int n, const int bitdepth_max) {
    const int fill_type = rnd() & 7;
    if (fill_type == 0)
        while (n--) /* check for cdef_filter underflows */
            *buf++ = rnd() & 1;
    else if (fill_type == 1)
        while (n--) /* check for cdef_filter overflows */
            *buf++ = bitdepth_max - (rnd() & 1);
    else
        while (n--)
            *buf++ = rnd() & bitdepth_max;
}

static void check_cdef_filter(const cdef_fn fn, const int w, const int h) {
    ALIGN_STK_64(pixel, c_src, 16 * 10 + 16, ), *const c_dst = c_src + 8;
    ALIGN_STK_64(pixel, a_src, 16 * 10 + 16, ), *const a_dst = a_src + 8;
    ALIGN_STK_64(pixel, top_buf, 16 * 2 + 16, ), *const top = top_buf + 8;
    ALIGN_STK_16(pixel, left, 8,[2]);
    const ptrdiff_t stride = 16 * sizeof(pixel);

    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel (*left)[2],
                 const pixel *top, int pri_strength, int sec_strength,
                 int dir, int damping, enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX);

    if (check_func(fn, "cdef_filter_%dx%d_%dbpc", w, h, BITDEPTH)) {
        for (int dir = 0; dir < 8; dir++) {
            for (enum CdefEdgeFlags edges = 0x0; edges <= 0xf; edges++) {
#if BITDEPTH == 16
                const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
                const int bitdepth_max = 0xff;
#endif
                const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;

                init_tmp(c_src, 16 * 10 + 16, bitdepth_max);
                init_tmp(top_buf, 16 * 2 + 16, bitdepth_max);
                init_tmp((pixel *) left, 8 * 2, bitdepth_max);
                memcpy(a_src, c_src, (16 * 10 + 16) * sizeof(pixel));

                const int lvl = 1 + (rnd() % 62);
                const int damping = 3 + (rnd() & 3) + bitdepth_min_8 - (w == 4 || (rnd() & 1));
                int pri_strength = (lvl >> 2) << bitdepth_min_8;
                int sec_strength = lvl & 3;
                sec_strength += sec_strength == 3;
                sec_strength <<= bitdepth_min_8;
                call_ref(c_dst, stride, left, top, pri_strength, sec_strength,
                         dir, damping, edges HIGHBD_TAIL_SUFFIX);
                call_new(a_dst, stride, left, top, pri_strength, sec_strength,
                         dir, damping, edges HIGHBD_TAIL_SUFFIX);
                if (checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst")) {
                    fprintf(stderr, "strength = %d:%d, dir = %d, damping = %d, edges = %04d\n",
                            pri_strength, sec_strength, dir, damping, to_binary(edges));
                    return;
                }
                if (dir == 7 && (edges == 0x5 || edges == 0xa || edges == 0xf)) {
                    /* Benchmark a fixed set of cases to get consistent results:
                     * 1) top/left edges and pri_strength only
                     * 2) bottom/right edges and sec_strength only
                     * 3) all edges and both pri_strength and sec_strength
                     */
                    pri_strength = (edges & 1) << bitdepth_min_8;
                    sec_strength = (edges & 2) << bitdepth_min_8;
                    bench_new(a_dst, stride, left, top, pri_strength, sec_strength,
                              dir, damping, edges HIGHBD_TAIL_SUFFIX);
                }
            }
        }
    }
}

static void check_cdef_direction(const cdef_dir_fn fn) {
    ALIGN_STK_64(pixel, src, 8 * 8,);

    declare_func(int, pixel *src, ptrdiff_t dst_stride, unsigned *var
                 HIGHBD_DECL_SUFFIX);

    if (check_func(fn, "cdef_dir_%dbpc", BITDEPTH)) {
        unsigned c_var, a_var;
#if BITDEPTH == 16
        const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
        const int bitdepth_max = 0xff;
#endif
        init_tmp(src, 64, bitdepth_max);

        const int c_dir = call_ref(src, 8 * sizeof(pixel), &c_var HIGHBD_TAIL_SUFFIX);
        const int a_dir = call_new(src, 8 * sizeof(pixel), &a_var HIGHBD_TAIL_SUFFIX);
        if (c_var != a_var || c_dir != a_dir) {
            if (fail()) {
                hex_fdump(stderr, src, 8 * sizeof(pixel), 8, 8, "src");
                fprintf(stderr, "c_dir %d a_dir %d\n", c_dir, a_dir);
            }
        }
        bench_new(src, 8 * sizeof(pixel), &a_var HIGHBD_TAIL_SUFFIX);
    }
    report("cdef_dir");
}

void bitfn(checkasm_check_cdef)(void) {
    Dav1dCdefDSPContext c;
    bitfn(dav1d_cdef_dsp_init)(&c);

    check_cdef_direction(c.dir);

    check_cdef_filter(c.fb[0], 8, 8);
    check_cdef_filter(c.fb[1], 4, 8);
    check_cdef_filter(c.fb[2], 4, 4);
    report("cdef_filter");
}

@ -0,0 +1,874 @@
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include "tests/checkasm/checkasm.h"

#include <math.h>
#include <stdarg.h>
#include <stdio.h>
#include <string.h>

#include "src/cpu.h"

#ifdef _WIN32
#include <windows.h>
#define COLOR_RED    FOREGROUND_RED
#define COLOR_GREEN  FOREGROUND_GREEN
#define COLOR_YELLOW (FOREGROUND_RED|FOREGROUND_GREEN)

static unsigned get_seed(void) {
    return GetTickCount();
}
#else
#include <unistd.h>
#include <signal.h>
#include <time.h>
#ifdef __APPLE__
#include <mach/mach_time.h>
#endif
#define COLOR_RED    1
#define COLOR_GREEN  2
#define COLOR_YELLOW 3

static unsigned get_seed(void) {
#ifdef __APPLE__
    return (unsigned) mach_absolute_time();
#elif defined(HAVE_CLOCK_GETTIME)
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (unsigned) (1000000000ULL * ts.tv_sec + ts.tv_nsec);
#endif
}
#endif

/* List of tests to invoke */
static const struct {
    const char *name;
    void (*func)(void);
} tests[] = {
    { "msac", checkasm_check_msac },
    { "refmvs", checkasm_check_refmvs },
#if CONFIG_8BPC
    { "cdef_8bpc", checkasm_check_cdef_8bpc },
    { "filmgrain_8bpc", checkasm_check_filmgrain_8bpc },
    { "ipred_8bpc", checkasm_check_ipred_8bpc },
    { "itx_8bpc", checkasm_check_itx_8bpc },
    { "loopfilter_8bpc", checkasm_check_loopfilter_8bpc },
    { "looprestoration_8bpc", checkasm_check_looprestoration_8bpc },
    { "mc_8bpc", checkasm_check_mc_8bpc },
#endif
#if CONFIG_16BPC
    { "cdef_16bpc", checkasm_check_cdef_16bpc },
    { "filmgrain_16bpc", checkasm_check_filmgrain_16bpc },
    { "ipred_16bpc", checkasm_check_ipred_16bpc },
    { "itx_16bpc", checkasm_check_itx_16bpc },
    { "loopfilter_16bpc", checkasm_check_loopfilter_16bpc },
    { "looprestoration_16bpc", checkasm_check_looprestoration_16bpc },
    { "mc_16bpc", checkasm_check_mc_16bpc },
#endif
    { 0 }
};

/* List of cpu flags to check */
static const struct {
    const char *name;
    const char *suffix;
    unsigned flag;
} cpus[] = {
#if ARCH_X86
    { "SSE2", "sse2", DAV1D_X86_CPU_FLAG_SSE2 },
    { "SSSE3", "ssse3", DAV1D_X86_CPU_FLAG_SSSE3 },
    { "SSE4.1", "sse4", DAV1D_X86_CPU_FLAG_SSE41 },
    { "AVX2", "avx2", DAV1D_X86_CPU_FLAG_AVX2 },
    { "AVX-512 (Ice Lake)", "avx512icl", DAV1D_X86_CPU_FLAG_AVX512ICL },
#elif ARCH_AARCH64 || ARCH_ARM
    { "NEON", "neon", DAV1D_ARM_CPU_FLAG_NEON },
#elif ARCH_PPC64LE
    { "VSX", "vsx", DAV1D_PPC_CPU_FLAG_VSX },
#endif
    { 0 }
};

typedef struct CheckasmFuncVersion {
    struct CheckasmFuncVersion *next;
    void *func;
    int ok;
    unsigned cpu;
    int iterations;
    uint64_t cycles;
} CheckasmFuncVersion;

/* Binary search tree node */
typedef struct CheckasmFunc {
    struct CheckasmFunc *child[2];
    CheckasmFuncVersion versions;
    uint8_t color; /* 0 = red, 1 = black */
    char name[];
} CheckasmFunc;

/* Internal state */
static struct {
    CheckasmFunc *funcs;
    CheckasmFunc *current_func;
    CheckasmFuncVersion *current_func_ver;
    const char *current_test_name;
    const char *bench_pattern;
    size_t bench_pattern_len;
    int num_checked;
    int num_failed;
    int nop_time;
    unsigned cpu_flag;
    const char *cpu_flag_name;
    const char *test_name;
    unsigned seed;
    int bench_c;
    int verbose;
    int function_listing;
#if ARCH_X86_64
    void (*simd_warmup)(void);
#endif
} state;

/* float compare support code */
typedef union {
    float f;
    uint32_t i;
} intfloat;

static uint32_t xs_state[4];

static void xor128_srand(unsigned seed) {
    xs_state[0] = seed;
    xs_state[1] = ( seed & 0xffff0000) | (~seed & 0x0000ffff);
    xs_state[2] = (~seed & 0xffff0000) | ( seed & 0x0000ffff);
    xs_state[3] = ~seed;
}
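
/* Note: mixing the seed with its complement keeps the state non-zero even for
 * seed == 0 (xs_state[1] becomes 0x0000ffff), which a xorshift generator
 * requires in order not to get stuck at an all-zero state. */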

// xor128 from Marsaglia, George (July 2003). "Xorshift RNGs".
// Journal of Statistical Software. 8 (14).
// doi:10.18637/jss.v008.i14.
int xor128_rand(void) {
    const uint32_t x = xs_state[0];
    const uint32_t t = x ^ (x << 11);

    xs_state[0] = xs_state[1];
    xs_state[1] = xs_state[2];
    xs_state[2] = xs_state[3];
    uint32_t w = xs_state[3];

    w = (w ^ (w >> 19)) ^ (t ^ (t >> 8));
    xs_state[3] = w;

    return w >> 1;
}
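
/* The top bit is discarded (w >> 1), so rnd() always yields a non-negative
 * 31-bit value; expressions such as rnd() % 62 in the tests therefore never
 * see a negative operand. */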

static int is_negative(const intfloat u) {
    return u.i >> 31;
}

int float_near_ulp(const float a, const float b, const unsigned max_ulp) {
    intfloat x, y;

    x.f = a;
    y.f = b;

    if (is_negative(x) != is_negative(y)) {
        // handle -0.0 == +0.0
        return a == b;
    }

    if (llabs((int64_t)x.i - y.i) <= max_ulp)
        return 1;

    return 0;
}
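
/* For same-sign IEEE-754 floats, reinterpreting the bit patterns as integers
 * makes adjacent representable values differ by exactly 1, so e.g.
 * float_near_ulp(1.0f, nextafterf(1.0f, 2.0f), 1) holds. */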

int float_near_ulp_array(const float *const a, const float *const b,
                         const unsigned max_ulp, const int len)
{
    for (int i = 0; i < len; i++)
        if (!float_near_ulp(a[i], b[i], max_ulp))
            return 0;

    return 1;
}

int float_near_abs_eps(const float a, const float b, const float eps) {
    return fabsf(a - b) < eps;
}
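
/* The absolute-epsilon check complements the ULP check: near 0.0f the ULP
 * spacing becomes tiny, so two results that are numerically close can still
 * be many ULPs apart; float_near_abs_eps_ulp below accepts either criterion. */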

int float_near_abs_eps_array(const float *const a, const float *const b,
                             const float eps, const int len)
{
    for (int i = 0; i < len; i++)
        if (!float_near_abs_eps(a[i], b[i], eps))
            return 0;

    return 1;
}

int float_near_abs_eps_ulp(const float a, const float b, const float eps,
                           const unsigned max_ulp)
{
    return float_near_ulp(a, b, max_ulp) || float_near_abs_eps(a, b, eps);
}

int float_near_abs_eps_array_ulp(const float *const a, const float *const b,
                                 const float eps, const unsigned max_ulp,
                                 const int len)
{
    for (int i = 0; i < len; i++)
        if (!float_near_abs_eps_ulp(a[i], b[i], eps, max_ulp))
            return 0;

    return 1;
}

/* Print colored text to stderr if the terminal supports it */
static void color_printf(const int color, const char *const fmt, ...) {
    static int8_t use_color = -1;
    va_list arg;

#ifdef _WIN32
    static HANDLE con;
    static WORD org_attributes;

    if (use_color < 0) {
        CONSOLE_SCREEN_BUFFER_INFO con_info;
        con = GetStdHandle(STD_ERROR_HANDLE);
        if (con && con != INVALID_HANDLE_VALUE &&
            GetConsoleScreenBufferInfo(con, &con_info))
        {
            org_attributes = con_info.wAttributes;
            use_color = 1;
        } else
            use_color = 0;
    }
    if (use_color)
        SetConsoleTextAttribute(con, (org_attributes & 0xfff0) |
                                     (color & 0x0f));
#else
    if (use_color < 0) {
        const char *const term = getenv("TERM");
        use_color = term && strcmp(term, "dumb") && isatty(2);
    }
    if (use_color)
        fprintf(stderr, "\x1b[%d;3%dm", (color & 0x08) >> 3, color & 0x07);
#endif

    va_start(arg, fmt);
    vfprintf(stderr, fmt, arg);
    va_end(arg);

    if (use_color) {
#ifdef _WIN32
        SetConsoleTextAttribute(con, org_attributes);
#else
        fprintf(stderr, "\x1b[0m");
#endif
    }
}

/* Deallocate a tree */
static void destroy_func_tree(CheckasmFunc *const f) {
    if (f) {
        CheckasmFuncVersion *v = f->versions.next;
        while (v) {
            CheckasmFuncVersion *next = v->next;
            free(v);
            v = next;
        }

        destroy_func_tree(f->child[0]);
        destroy_func_tree(f->child[1]);
        free(f);
    }
}

/* Allocate a zero-initialized block, clean up and exit on failure */
static void *checkasm_malloc(const size_t size) {
    void *const ptr = calloc(1, size);
    if (!ptr) {
        fprintf(stderr, "checkasm: malloc failed\n");
        destroy_func_tree(state.funcs);
        exit(1);
    }
    return ptr;
}

/* Get the suffix of the specified cpu flag */
static const char *cpu_suffix(const unsigned cpu) {
    for (int i = (int)(sizeof(cpus) / sizeof(*cpus)) - 2; i >= 0; i--)
        if (cpu & cpus[i].flag)
            return cpus[i].suffix;

    return "c";
}
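
/* cpus[] is ordered from oldest to newest instruction set, so iterating
 * backwards (the "- 2" skips the {0} terminator) returns the suffix of the
 * newest set in the flag mask, e.g. "avx2" for SSE2|SSSE3|SSE41|AVX2;
 * plain C gets the fallback suffix "c". */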

#ifdef readtime
static int cmp_nop(const void *a, const void *b) {
    return *(const uint16_t*)a - *(const uint16_t*)b;
}

/* Measure the overhead of the timing code (in decicycles) */
static int measure_nop_time(void) {
    uint16_t nops[10000];
    int nop_sum = 0;

    for (int i = 0; i < 10000; i++) {
        uint64_t t = readtime();
        nops[i] = (uint16_t) (readtime() - t);
    }

    qsort(nops, 10000, sizeof(uint16_t), cmp_nop);
    for (int i = 2500; i < 7500; i++)
        nop_sum += nops[i];

    return nop_sum / 500;
}
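
/* The middle 5000 of the 10000 sorted samples are summed, so dividing by 500
 * rather than 5000 keeps one extra decimal digit in integer math: the result
 * is ten times the average back-to-back readtime() delta, i.e. decicycles. */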
|
||||
|
||||
/* Print benchmark results */
|
||||
static void print_benchs(const CheckasmFunc *const f) {
|
||||
if (f) {
|
||||
print_benchs(f->child[0]);
|
||||
|
||||
/* Only print functions with at least one assembly version */
|
||||
if (state.bench_c || f->versions.cpu || f->versions.next) {
|
||||
const CheckasmFuncVersion *v = &f->versions;
|
||||
do {
|
||||
if (v->iterations) {
|
||||
const int decicycles = (int) (10*v->cycles/v->iterations -
|
||||
state.nop_time) / 4;
|
||||
printf("%s_%s: %d.%d\n", f->name, cpu_suffix(v->cpu),
|
||||
decicycles/10, decicycles%10);
|
||||
}
|
||||
} while ((v = v->next));
|
||||
}
|
||||
|
||||
print_benchs(f->child[1]);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
static void print_functions(const CheckasmFunc *const f) {
|
||||
if (f) {
|
||||
print_functions(f->child[0]);
|
||||
printf("%s\n", f->name);
|
||||
print_functions(f->child[1]);
|
||||
}
|
||||
}
|
||||
|
||||
#define is_digit(x) ((x) >= '0' && (x) <= '9')
|
||||
|
||||
/* ASCIIbetical sort except preserving natural order for numbers */
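/* (For example, "mc_w4" sorts before "mc_w16" here, whereas plain strcmp()
 * would put "mc_w16" first because '1' < '4' in ASCII.) */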
static int cmp_func_names(const char *a, const char *b) {
    const char *const start = a;
    int ascii_diff, digit_diff;

    for (; !(ascii_diff = *(const unsigned char*)a -
             *(const unsigned char*)b) && *a; a++, b++);
    for (; is_digit(*a) && is_digit(*b); a++, b++);

    if (a > start && is_digit(a[-1]) &&
        (digit_diff = is_digit(*a) - is_digit(*b)))
    {
        return digit_diff;
    }

    return ascii_diff;
}

/* Perform a tree rotation in the specified direction and return the new root */
static CheckasmFunc *rotate_tree(CheckasmFunc *const f, const int dir) {
    CheckasmFunc *const r = f->child[dir^1];
    f->child[dir^1] = r->child[dir];
    r->child[dir] = f;
    r->color = f->color;
    f->color = 0;
    return r;
}

#define is_red(f) ((f) && !(f)->color)

/* Balance a left-leaning red-black tree at the specified node */
static void balance_tree(CheckasmFunc **const root) {
    CheckasmFunc *const f = *root;

    if (is_red(f->child[0]) && is_red(f->child[1])) {
        f->color ^= 1;
        f->child[0]->color = f->child[1]->color = 1;
    }
    else if (!is_red(f->child[0]) && is_red(f->child[1]))
        *root = rotate_tree(f, 0); /* Rotate left */
    else if (is_red(f->child[0]) && is_red(f->child[0]->child[0]))
        *root = rotate_tree(f, 1); /* Rotate right */
}

/* Get a node with the specified name, creating it if it doesn't exist */
static CheckasmFunc *get_func(CheckasmFunc **const root, const char *const name) {
    CheckasmFunc *f = *root;

    if (f) {
        /* Search the tree for a matching node */
        const int cmp = cmp_func_names(name, f->name);
        if (cmp) {
            f = get_func(&f->child[cmp > 0], name);

            /* Rebalance the tree on the way up if a new node was inserted */
            if (!f->versions.func)
                balance_tree(root);
        }
    } else {
        /* Allocate and insert a new node into the tree */
        const size_t name_length = strlen(name) + 1;
        f = *root = checkasm_malloc(offsetof(CheckasmFunc, name) + name_length);
        memcpy(f->name, name, name_length);
    }

    return f;
}

checkasm_context checkasm_context_buf;

/* Crash handling: attempt to catch crashes and handle them
 * gracefully instead of just aborting abruptly. */
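/* (Mechanism sketch: declare_func() saves an execution context via
 * checkasm_save_context(); when a test crashes, the handler below records
 * the failure and resumes at that context via checkasm_load_context().) */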
#ifdef _WIN32
static LONG NTAPI signal_handler(EXCEPTION_POINTERS *const e) {
    const char *err;
    switch (e->ExceptionRecord->ExceptionCode) {
    case EXCEPTION_FLT_DIVIDE_BY_ZERO:
    case EXCEPTION_INT_DIVIDE_BY_ZERO:
        err = "fatal arithmetic error";
        break;
    case EXCEPTION_ILLEGAL_INSTRUCTION:
    case EXCEPTION_PRIV_INSTRUCTION:
        err = "illegal instruction";
        break;
    case EXCEPTION_ACCESS_VIOLATION:
    case EXCEPTION_ARRAY_BOUNDS_EXCEEDED:
    case EXCEPTION_DATATYPE_MISALIGNMENT:
    case EXCEPTION_IN_PAGE_ERROR:
    case EXCEPTION_STACK_OVERFLOW:
        err = "segmentation fault";
        break;
    default:
        return EXCEPTION_CONTINUE_SEARCH;
    }
    RemoveVectoredExceptionHandler(signal_handler);
    checkasm_fail_func(err);
    checkasm_load_context();
    return EXCEPTION_CONTINUE_EXECUTION; /* never reached, but shuts up gcc */
}
#else
static void signal_handler(const int s) {
    checkasm_set_signal_handler_state(0);
    checkasm_fail_func(s == SIGFPE ? "fatal arithmetic error" :
                       s == SIGILL ? "illegal instruction" :
                                     "segmentation fault");
    checkasm_load_context();
}
#endif

/* Perform tests and benchmarks for the specified
 * cpu flag if supported by the host */
static void check_cpu_flag(const char *const name, unsigned flag) {
    const unsigned old_cpu_flag = state.cpu_flag;

    flag |= old_cpu_flag;
    dav1d_set_cpu_flags_mask(flag);
    state.cpu_flag = dav1d_get_cpu_flags();

    if (!flag || state.cpu_flag != old_cpu_flag) {
        state.cpu_flag_name = name;
        for (int i = 0; tests[i].func; i++) {
            if (state.test_name && strcmp(tests[i].name, state.test_name))
                continue;
            xor128_srand(state.seed);
            state.current_test_name = tests[i].name;
            tests[i].func();
        }
    }
}

/* Print the name of the current CPU flag, but only do it once */
static void print_cpu_name(void) {
    if (state.cpu_flag_name) {
        color_printf(COLOR_YELLOW, "%s:\n", state.cpu_flag_name);
        state.cpu_flag_name = NULL;
    }
}

int main(int argc, char *argv[]) {
    state.seed = get_seed();

    while (argc > 1) {
        if (!strncmp(argv[1], "--help", 6)) {
            fprintf(stdout,
                    "checkasm [options] <random seed>\n"
                    "    <random seed>       Numeric value to seed the rng\n"
                    "Options:\n"
                    "    --test=<test_name>  Test only <test_name>\n"
                    "    --bench=<pattern>   Test and benchmark the functions matching <pattern>\n"
                    "    --list-functions    List available functions\n"
                    "    --list-tests        List available tests\n"
                    "    --bench-c           Benchmark the C-only functions\n"
                    "    --verbose -v        Print failures verbosely\n");
            return 0;
        } else if (!strncmp(argv[1], "--bench-c", 9)) {
            state.bench_c = 1;
        } else if (!strncmp(argv[1], "--bench", 7)) {
#ifndef readtime
            fprintf(stderr,
                    "checkasm: --bench is not supported on your system\n");
            return 1;
#endif
            if (argv[1][7] == '=') {
                state.bench_pattern = argv[1] + 8;
                state.bench_pattern_len = strlen(state.bench_pattern);
            } else
                state.bench_pattern = "";
        } else if (!strncmp(argv[1], "--test=", 7)) {
            state.test_name = argv[1] + 7;
        } else if (!strcmp(argv[1], "--list-functions")) {
            state.function_listing = 1;
        } else if (!strcmp(argv[1], "--list-tests")) {
            for (int i = 0; tests[i].name; i++)
                printf("%s\n", tests[i].name);
            return 0;
        } else if (!strcmp(argv[1], "--verbose") || !strcmp(argv[1], "-v")) {
            state.verbose = 1;
        } else {
            state.seed = (unsigned) strtoul(argv[1], NULL, 10);
        }

        argc--;
        argv++;
    }

    dav1d_init_cpu();

#ifdef readtime
    if (state.bench_pattern) {
        static int testing = 0;
        checkasm_save_context();
        if (!testing) {
            checkasm_set_signal_handler_state(1);
            testing = 1;
            readtime();
            checkasm_set_signal_handler_state(0);
        } else {
            fprintf(stderr, "checkasm: unable to access cycle counter\n");
            return 1;
        }
    }
#endif

    int ret = 0;

    if (!state.function_listing) {
        fprintf(stderr, "checkasm: using random seed %u\n", state.seed);
#if ARCH_X86_64
        void checkasm_warmup_avx2(void);
        void checkasm_warmup_avx512(void);
        const unsigned cpu_flags = dav1d_get_cpu_flags();
        if (cpu_flags & DAV1D_X86_CPU_FLAG_AVX512ICL)
            state.simd_warmup = checkasm_warmup_avx512;
        else if (cpu_flags & DAV1D_X86_CPU_FLAG_AVX2)
            state.simd_warmup = checkasm_warmup_avx2;
        checkasm_simd_warmup();
#endif
    }

    check_cpu_flag(NULL, 0);

    if (state.function_listing) {
        print_functions(state.funcs);
    } else {
        for (int i = 0; cpus[i].flag; i++)
            check_cpu_flag(cpus[i].name, cpus[i].flag);
        if (!state.num_checked) {
            fprintf(stderr, "checkasm: no tests to perform\n");
        } else if (state.num_failed) {
            fprintf(stderr, "checkasm: %d of %d tests have failed\n",
                    state.num_failed, state.num_checked);
            ret = 1;
        } else {
            fprintf(stderr, "checkasm: all %d tests passed\n", state.num_checked);
#ifdef readtime
            if (state.bench_pattern) {
                state.nop_time = measure_nop_time();
                printf("nop: %d.%d\n", state.nop_time/10, state.nop_time%10);
                print_benchs(state.funcs);
            }
#endif
        }
    }

    destroy_func_tree(state.funcs);
    return ret;
}

/* Decide whether or not the specified function needs to be tested and
 * allocate/initialize data structures if needed. Returns a pointer to a
 * reference function if the function should be tested, otherwise NULL */
void *checkasm_check_func(void *const func, const char *const name, ...) {
    char name_buf[256];
    va_list arg;

    va_start(arg, name);
    const int name_length = vsnprintf(name_buf, sizeof(name_buf), name, arg);
    va_end(arg);

    if (!func || name_length <= 0 || (size_t)name_length >= sizeof(name_buf))
        return NULL;

    state.current_func = get_func(&state.funcs, name_buf);

    if (state.function_listing) /* Save function names without running tests */
        return NULL;

    state.funcs->color = 1;
    CheckasmFuncVersion *v = &state.current_func->versions;
    void *ref = func;

    if (v->func) {
        CheckasmFuncVersion *prev;
        do {
            /* Only test functions that haven't already been tested */
            if (v->func == func)
                return NULL;

            if (v->ok)
                ref = v->func;

            prev = v;
        } while ((v = v->next));

        v = prev->next = checkasm_malloc(sizeof(CheckasmFuncVersion));
    }

    v->func = func;
    v->ok = 1;
    v->cpu = state.cpu_flag;
    state.current_func_ver = v;
    xor128_srand(state.seed);

    if (state.cpu_flag || state.bench_c)
        state.num_checked++;

    return ref;
}

/* Decide whether or not the current function needs to be benchmarked */
int checkasm_bench_func(void) {
    return !state.num_failed && state.bench_pattern &&
           !strncmp(state.current_func->name, state.bench_pattern,
                    state.bench_pattern_len);
}

/* Indicate that the current test has failed, return whether verbose printing
 * is requested. */
int checkasm_fail_func(const char *const msg, ...) {
    if (state.current_func_ver && state.current_func_ver->cpu &&
        state.current_func_ver->ok)
    {
        va_list arg;

        print_cpu_name();
        fprintf(stderr, "   %s_%s (", state.current_func->name,
                cpu_suffix(state.current_func_ver->cpu));
        va_start(arg, msg);
        vfprintf(stderr, msg, arg);
        va_end(arg);
        fprintf(stderr, ")\n");

        state.current_func_ver->ok = 0;
        state.num_failed++;
    }
    return state.verbose;
}

/* Update benchmark results of the current function */
void checkasm_update_bench(const int iterations, const uint64_t cycles) {
    state.current_func_ver->iterations += iterations;
    state.current_func_ver->cycles += cycles;
}

/* Print the outcome of all tests performed since
 * the last time this function was called */
void checkasm_report(const char *const name, ...) {
    static int prev_checked, prev_failed;
    static size_t max_length;

    if (state.num_checked > prev_checked) {
        int pad_length = (int) max_length + 4;
        va_list arg;

        print_cpu_name();
        pad_length -= fprintf(stderr, " - %s.", state.current_test_name);
        va_start(arg, name);
        pad_length -= vfprintf(stderr, name, arg);
        va_end(arg);
        fprintf(stderr, "%*c", imax(pad_length, 0) + 2, '[');

        if (state.num_failed == prev_failed)
            color_printf(COLOR_GREEN, "OK");
        else
            color_printf(COLOR_RED, "FAILED");
        fprintf(stderr, "]\n");

        prev_checked = state.num_checked;
        prev_failed = state.num_failed;
    } else if (!state.cpu_flag) {
        /* Calculate the amount of padding required
         * to make the output vertically aligned */
        size_t length = strlen(state.current_test_name);
        va_list arg;

        va_start(arg, name);
        length += vsnprintf(NULL, 0, name, arg);
        va_end(arg);

        if (length > max_length)
            max_length = length;
    }
}

void checkasm_set_signal_handler_state(const int enabled) {
#ifdef _WIN32
#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
    if (enabled)
        AddVectoredExceptionHandler(0, signal_handler);
    else
        RemoveVectoredExceptionHandler(signal_handler);
#endif
#else
    void (*const handler)(int) = enabled ? signal_handler : SIG_DFL;
    signal(SIGBUS,  handler);
    signal(SIGFPE,  handler);
    signal(SIGILL,  handler);
    signal(SIGSEGV, handler);
#endif
}
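
/* (check_err below is a helper for the buffer comparisons: it flags a
 * failure once per call site and returns nonzero when verbose dumping is
 * disabled, so the caller can bail out instead of printing whole buffers.) */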

static int check_err(const char *const file, const int line,
                     const char *const name, const int w, const int h,
                     int *const err)
{
    if (*err)
        return 0;
    if (!checkasm_fail_func("%s:%d", file, line))
        return 1;
    *err = 1;
    fprintf(stderr, "%s (%dx%d):\n", name, w, h);
    return 0;
}

#define DEF_CHECKASM_CHECK_FUNC(type, fmt) \
int checkasm_check_##type(const char *const file, const int line, \
                          const type *buf1, ptrdiff_t stride1, \
                          const type *buf2, ptrdiff_t stride2, \
                          const int w, int h, const char *const name, \
                          const int align_w, const int align_h, \
                          const int padding) \
{ \
    int aligned_w = (w + align_w - 1) & ~(align_w - 1); \
    int aligned_h = (h + align_h - 1) & ~(align_h - 1); \
    int err = 0; \
    stride1 /= sizeof(*buf1); \
    stride2 /= sizeof(*buf2); \
    int y = 0; \
    for (y = 0; y < h; y++) \
        if (memcmp(&buf1[y*stride1], &buf2[y*stride2], w*sizeof(*buf1))) \
            break; \
    if (y != h) { \
        if (check_err(file, line, name, w, h, &err)) \
            return 1; \
        for (y = 0; y < h; y++) { \
            for (int x = 0; x < w; x++) \
                fprintf(stderr, " " fmt, buf1[x]); \
            fprintf(stderr, "    "); \
            for (int x = 0; x < w; x++) \
                fprintf(stderr, " " fmt, buf2[x]); \
            fprintf(stderr, "    "); \
            for (int x = 0; x < w; x++) \
                fprintf(stderr, "%c", buf1[x] != buf2[x] ? 'x' : '.'); \
            buf1 += stride1; \
            buf2 += stride2; \
            fprintf(stderr, "\n"); \
        } \
        buf1 -= h*stride1; \
        buf2 -= h*stride2; \
    } \
    for (y = -padding; y < 0; y++) \
        if (memcmp(&buf1[y*stride1 - padding], &buf2[y*stride2 - padding], \
                   (w + 2*padding)*sizeof(*buf1))) { \
            if (check_err(file, line, name, w, h, &err)) \
                return 1; \
            fprintf(stderr, " overwrite above\n"); \
            break; \
        } \
    for (y = aligned_h; y < aligned_h + padding; y++) \
        if (memcmp(&buf1[y*stride1 - padding], &buf2[y*stride2 - padding], \
                   (w + 2*padding)*sizeof(*buf1))) { \
            if (check_err(file, line, name, w, h, &err)) \
                return 1; \
            fprintf(stderr, " overwrite below\n"); \
            break; \
        } \
    for (y = 0; y < h; y++) \
        if (memcmp(&buf1[y*stride1 - padding], &buf2[y*stride2 - padding], \
                   padding*sizeof(*buf1))) { \
            if (check_err(file, line, name, w, h, &err)) \
                return 1; \
            fprintf(stderr, " overwrite left\n"); \
            break; \
        } \
    for (y = 0; y < h; y++) \
        if (memcmp(&buf1[y*stride1 + aligned_w], &buf2[y*stride2 + aligned_w], \
                   padding*sizeof(*buf1))) { \
            if (check_err(file, line, name, w, h, &err)) \
                return 1; \
            fprintf(stderr, " overwrite right\n"); \
            break; \
        } \
    return err; \
}

DEF_CHECKASM_CHECK_FUNC(int8_t,   "%4d")
DEF_CHECKASM_CHECK_FUNC(int16_t,  "%6d")
DEF_CHECKASM_CHECK_FUNC(int32_t,  "%9d")
DEF_CHECKASM_CHECK_FUNC(uint8_t,  "%02x")
DEF_CHECKASM_CHECK_FUNC(uint16_t, "%04x")
DEF_CHECKASM_CHECK_FUNC(uint32_t, "%08x")

#if ARCH_X86_64
void checkasm_simd_warmup(void)
{
    if (state.simd_warmup)
        state.simd_warmup();
}
#endif
@ -0,0 +1,350 @@
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV1D_TESTS_CHECKASM_CHECKASM_H
#define DAV1D_TESTS_CHECKASM_CHECKASM_H

#include "config.h"

#include <stdint.h>
#include <stdlib.h>

#if ARCH_X86_64 && defined(_WIN32)
/* setjmp/longjmp on 64-bit Windows will try to use SEH to unwind the stack,
 * which doesn't work for assembly functions without unwind information. */
#include <windows.h>
#define checkasm_context CONTEXT
#define checkasm_save_context() RtlCaptureContext(&checkasm_context_buf)
#define checkasm_load_context() RtlRestoreContext(&checkasm_context_buf, NULL)
#else
#include <setjmp.h>
#define checkasm_context jmp_buf
#define checkasm_save_context() setjmp(checkasm_context_buf)
#define checkasm_load_context() longjmp(checkasm_context_buf, 1)
#endif

#include "include/common/attributes.h"
#include "include/common/bitdepth.h"
#include "include/common/intops.h"

int xor128_rand(void);
#define rnd xor128_rand

#define decl_check_bitfns(name) \
name##_8bpc(void); \
name##_16bpc(void)

void checkasm_check_msac(void);
void checkasm_check_refmvs(void);
decl_check_bitfns(void checkasm_check_cdef);
decl_check_bitfns(void checkasm_check_filmgrain);
decl_check_bitfns(void checkasm_check_ipred);
decl_check_bitfns(void checkasm_check_itx);
decl_check_bitfns(void checkasm_check_loopfilter);
decl_check_bitfns(void checkasm_check_looprestoration);
decl_check_bitfns(void checkasm_check_mc);

void *checkasm_check_func(void *func, const char *name, ...);
int checkasm_bench_func(void);
int checkasm_fail_func(const char *msg, ...);
void checkasm_update_bench(int iterations, uint64_t cycles);
void checkasm_report(const char *name, ...);
void checkasm_set_signal_handler_state(int enabled);
extern checkasm_context checkasm_context_buf;

/* float compare utilities */
int float_near_ulp(float a, float b, unsigned max_ulp);
int float_near_abs_eps(float a, float b, float eps);
int float_near_abs_eps_ulp(float a, float b, float eps, unsigned max_ulp);
int float_near_ulp_array(const float *a, const float *b, unsigned max_ulp,
                         int len);
int float_near_abs_eps_array(const float *a, const float *b, float eps,
                             int len);
int float_near_abs_eps_array_ulp(const float *a, const float *b, float eps,
                                 unsigned max_ulp, int len);

#define BENCH_RUNS (1 << 12) /* Trade-off between accuracy and speed */

/* Decide whether or not the specified function needs to be tested */
#define check_func(func, ...)\
    (func_ref = checkasm_check_func((func_new = func), __VA_ARGS__))
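
/* A typical test built from these macros looks roughly like this (a sketch;
 * dst_c, dst_a, w, h and some_fn are illustrative names, not real API):
 *
 *     declare_func(void, pixel *dst, ptrdiff_t stride);
 *     if (check_func(c->some_fn, "some_fn_%dbpc", BITDEPTH)) {
 *         call_ref(dst_c, stride);
 *         call_new(dst_a, stride);
 *         checkasm_check_pixel(dst_c, stride, dst_a, stride, w, h, "dst");
 *         bench_new(dst_a, stride);
 *     }
 *     report("some_fn");
 */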

/* Declare the function prototype. The first argument is the return value,
 * the remaining arguments are the function parameters. Naming parameters
 * is optional. */
#define declare_func(ret, ...)\
    declare_new(ret, __VA_ARGS__)\
    void *func_ref, *func_new;\
    typedef ret func_type(__VA_ARGS__);\
    checkasm_save_context()

/* Indicate that the current test has failed */
#define fail() checkasm_fail_func("%s:%d", __FILE__, __LINE__)

/* Print the test outcome */
#define report checkasm_report

/* Call the reference function */
#define call_ref(...)\
    (checkasm_set_signal_handler_state(1),\
     ((func_type *)func_ref)(__VA_ARGS__));\
    checkasm_set_signal_handler_state(0)

#if HAVE_ASM
#if ARCH_X86
#if defined(_MSC_VER) && !defined(__clang__)
#include <intrin.h>
#define readtime() (_mm_lfence(), __rdtsc())
#else
static inline uint64_t readtime(void) {
    uint32_t eax, edx;
    __asm__ __volatile__("lfence\nrdtsc" : "=a"(eax), "=d"(edx));
    return (((uint64_t)edx) << 32) | eax;
}
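/* (In both variants the lfence serializes execution so that rdtsc cannot be
 * reordered relative to the code being measured.) */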
#define readtime readtime
#endif
#elif (ARCH_AARCH64 || ARCH_ARM) && defined(__APPLE__)
#include <mach/mach_time.h>
#define readtime() mach_absolute_time()
#elif ARCH_AARCH64
#ifdef _MSC_VER
#include <windows.h>
#define readtime() (_InstructionSynchronizationBarrier(), ReadTimeStampCounter())
#else
static inline uint64_t readtime(void) {
    uint64_t cycle_counter;
    /* This requires enabling user mode access to the cycle counter (which
     * can only be done from kernel space).
     * This could also read cntvct_el0 instead of pmccntr_el0; that register
     * might also be readable (depending on kernel version), but it has much
     * worse precision (it's a fixed 50 MHz timer). */
    __asm__ __volatile__("isb\nmrs %0, pmccntr_el0"
                         : "=r"(cycle_counter)
                         :: "memory");
    return cycle_counter;
}
#define readtime readtime
#endif
#elif ARCH_ARM && !defined(_MSC_VER) && __ARM_ARCH >= 7
static inline uint64_t readtime(void) {
    uint32_t cycle_counter;
    /* This requires enabling user mode access to the cycle counter (which
     * can only be done from kernel space). */
    __asm__ __volatile__("isb\nmrc p15, 0, %0, c9, c13, 0"
                         : "=r"(cycle_counter)
                         :: "memory");
    return cycle_counter;
}
#define readtime readtime
#elif ARCH_PPC64LE
static inline uint64_t readtime(void) {
    uint32_t tbu, tbl, temp;

    __asm__ __volatile__(
        "1:\n"
        "mfspr %2,269\n"
        "mfspr %0,268\n"
        "mfspr %1,269\n"
        "cmpw %2,%1\n"
        "bne 1b\n"
        : "=r"(tbl), "=r"(tbu), "=r"(temp)
        :
        : "cc");

    return (((uint64_t)tbu) << 32) | (uint64_t)tbl;
}
#define readtime readtime
#endif

/* Verifies that clobbered callee-saved registers
 * are properly saved and restored */
void checkasm_checked_call(void *func, ...);

#if ARCH_X86_64
/* Evil hack: detect incorrect assumptions that 32-bit ints are zero-extended
 * to 64-bit. This is done by clobbering the stack with junk around the stack
 * pointer and calling the assembly function through checked_call() with added
 * dummy arguments which force all real arguments to be passed on the stack
 * and not in registers. For 32-bit arguments the upper half of the 64-bit
 * register locations on the stack will now contain junk which will cause
 * misbehaving functions to either produce incorrect output or segfault. Note
 * that even though this works extremely well in practice, it's technically
 * not guaranteed and false negatives are theoretically possible, but there
 * can never be any false positives. */
void checkasm_stack_clobber(uint64_t clobber, ...);
/* YMM and ZMM registers on x86 are turned off to save power when they haven't
 * been used for some period of time. When they are used there will be a
 * "warmup" period during which performance will be reduced and inconsistent
 * which is problematic when trying to benchmark individual functions. We can
 * work around this by periodically issuing "dummy" instructions that use
 * those registers to keep them powered on. */
void checkasm_simd_warmup(void);
#define declare_new(ret, ...)\
    ret (*checked_call)(void *, int, int, int, int, int, __VA_ARGS__,\
                        int, int, int, int, int, int, int, int,\
                        int, int, int, int, int, int, int) =\
    (void *)checkasm_checked_call;
#define CLOB (UINT64_C(0xdeadbeefdeadbeef))
#ifdef _WIN32
#define STACKARGS 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0
#else
#define STACKARGS 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0
#endif
#define call_new(...)\
    (checkasm_set_signal_handler_state(1),\
     checkasm_simd_warmup(),\
     checkasm_stack_clobber(CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
                            CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
                            CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB),\
     checked_call(func_new, 0, 0, 0, 0, 0, __VA_ARGS__, STACKARGS));\
    checkasm_set_signal_handler_state(0)
#elif ARCH_X86_32
#define declare_new(ret, ...)\
    ret (*checked_call)(void *, __VA_ARGS__, int, int, int, int, int, int,\
                        int, int, int, int, int, int, int, int, int) =\
    (void *)checkasm_checked_call;
#define call_new(...)\
    (checkasm_set_signal_handler_state(1),\
     checked_call(func_new, __VA_ARGS__, 15, 14, 13, 12,\
                  11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1));\
    checkasm_set_signal_handler_state(0)
#elif ARCH_ARM
/* Use a dummy argument to offset the real parameters by 2 rather than only 1.
 * This makes sure that potential 8-byte-alignment of parameters is kept
 * the same even when the extra parameters have been removed. */
void checkasm_checked_call_vfp(void *func, int dummy, ...);
#define declare_new(ret, ...)\
    ret (*checked_call)(void *, int dummy, __VA_ARGS__,\
                        int, int, int, int, int, int, int, int,\
                        int, int, int, int, int, int, int) =\
    (void *)checkasm_checked_call_vfp;
#define call_new(...)\
    (checkasm_set_signal_handler_state(1),\
     checked_call(func_new, 0, __VA_ARGS__, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0));\
    checkasm_set_signal_handler_state(0)
#elif ARCH_AARCH64 && !defined(__APPLE__)
void checkasm_stack_clobber(uint64_t clobber, ...);
#define declare_new(ret, ...)\
    ret (*checked_call)(void *, int, int, int, int, int, int, int,\
                        __VA_ARGS__, int, int, int, int, int, int, int, int,\
                        int, int, int, int, int, int, int) =\
    (void *)checkasm_checked_call;
#define CLOB (UINT64_C(0xdeadbeefdeadbeef))
#define call_new(...)\
    (checkasm_set_signal_handler_state(1),\
     checkasm_stack_clobber(CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
                            CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
                            CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
                            CLOB, CLOB, CLOB, CLOB, CLOB),\
     checked_call(func_new, 0, 0, 0, 0, 0, 0, 0, __VA_ARGS__,\
                  7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0));\
    checkasm_set_signal_handler_state(0)
#else
#define declare_new(ret, ...)
#define call_new(...)\
    (checkasm_set_signal_handler_state(1),\
     ((func_type *)func_new)(__VA_ARGS__));\
    checkasm_set_signal_handler_state(0)
#endif
#else /* HAVE_ASM */
#define declare_new(ret, ...)
/* Call the function */
#define call_new(...)\
    (checkasm_set_signal_handler_state(1),\
     ((func_type *)func_new)(__VA_ARGS__));\
    checkasm_set_signal_handler_state(0)
#endif /* HAVE_ASM */

/* Benchmark the function */
#ifdef readtime
#define bench_new(...)\
    do {\
        if (checkasm_bench_func()) {\
            func_type *tfunc = func_new;\
            checkasm_set_signal_handler_state(1);\
            uint64_t tsum = 0;\
            int tcount = 0;\
            for (int ti = 0; ti < BENCH_RUNS; ti++) {\
                uint64_t t = readtime();\
                tfunc(__VA_ARGS__);\
                tfunc(__VA_ARGS__);\
                tfunc(__VA_ARGS__);\
                tfunc(__VA_ARGS__);\
                t = readtime() - t;\
                if (t*tcount <= tsum*4 && ti > 0) {\
                    tsum += t;\
                    tcount++;\
                }\
            }\
            checkasm_set_signal_handler_state(0);\
            checkasm_update_bench(tcount, tsum);\
        } else {\
            call_new(__VA_ARGS__);\
        }\
    } while (0)
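/* (Each sample above times four back-to-back calls; the first, cache-cold
 * iteration and any sample more than four times the running average, e.g.
 * one hit by an interrupt, are discarded.) */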
#else
#define bench_new(...) do {} while (0)
#endif


#define PIXEL_RECT(name, w, h) \
    ALIGN_STK_64(pixel, name##_buf, ((h)+32)*((w)+64) + 64,); \
    ptrdiff_t name##_stride = sizeof(pixel)*((w)+64); \
    (void)name##_stride; \
    pixel *name = name##_buf + ((w)+64)*16 + 64
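
/* (The rect starts 16 rows down and 64 pixels in, so guard bands surround it;
 * CLEAR_PIXEL_RECT fills the whole buffer with 0x99, letting the padded
 * checks detect stray writes outside the rect.) */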

#define CLEAR_PIXEL_RECT(name) \
    memset(name##_buf, 0x99, sizeof(name##_buf)) \

#define DECL_CHECKASM_CHECK_FUNC(type) \
int checkasm_check_##type(const char *const file, const int line, \
                          const type *const buf1, const ptrdiff_t stride1, \
                          const type *const buf2, const ptrdiff_t stride2, \
                          const int w, const int h, const char *const name, \
                          const int align_w, const int align_h, \
                          const int padding)

DECL_CHECKASM_CHECK_FUNC(int8_t);
DECL_CHECKASM_CHECK_FUNC(int16_t);
DECL_CHECKASM_CHECK_FUNC(int32_t);
DECL_CHECKASM_CHECK_FUNC(uint8_t);
DECL_CHECKASM_CHECK_FUNC(uint16_t);
DECL_CHECKASM_CHECK_FUNC(uint32_t);

#define CONCAT(a,b) a ## b

#define checkasm_check2(prefix, ...) CONCAT(checkasm_check_, prefix)(__FILE__, __LINE__, __VA_ARGS__)
#define checkasm_check(prefix, ...) checkasm_check2(prefix, __VA_ARGS__, 0, 0, 0)

#ifdef BITDEPTH
#define checkasm_check_pixel(...) checkasm_check(PIXEL_TYPE, __VA_ARGS__)
#define checkasm_check_pixel_padded(...) checkasm_check2(PIXEL_TYPE, __VA_ARGS__, 1, 1, 8)
#define checkasm_check_pixel_padded_align(...) checkasm_check2(PIXEL_TYPE, __VA_ARGS__, 8)
#define checkasm_check_coef(...) checkasm_check(COEF_TYPE, __VA_ARGS__)
#endif

#endif /* DAV1D_TESTS_CHECKASM_CHECKASM_H */
@ -0,0 +1,401 @@
/*
 * Copyright © 2019, VideoLAN and dav1d authors
 * Copyright © 2019, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "tests/checkasm/checkasm.h"

#include <string.h>

#include "src/levels.h"
#include "src/film_grain.h"
#define UNIT_TEST 1
#include "src/fg_apply_tmpl.c"

#if BITDEPTH == 8
#define checkasm_check_entry(...) checkasm_check(int8_t, __VA_ARGS__)
#else
#define checkasm_check_entry(...) checkasm_check(int16_t, __VA_ARGS__)
#endif

static const char ss_name[][4] = {
    [DAV1D_PIXEL_LAYOUT_I420 - 1] = "420",
    [DAV1D_PIXEL_LAYOUT_I422 - 1] = "422",
    [DAV1D_PIXEL_LAYOUT_I444 - 1] = "444",
};

static void check_gen_grny(const Dav1dFilmGrainDSPContext *const dsp) {
    entry grain_lut_c[GRAIN_HEIGHT][GRAIN_WIDTH];
    entry grain_lut_a[GRAIN_HEIGHT + 1][GRAIN_WIDTH];

    declare_func(void, entry grain_lut[][GRAIN_WIDTH],
                 const Dav1dFilmGrainData *data HIGHBD_DECL_SUFFIX);

    for (int i = 0; i < 4; i++) {
        if (check_func(dsp->generate_grain_y, "gen_grain_y_ar%d_%dbpc", i, BITDEPTH)) {
            ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 1,);
            fg_data[0].seed = rnd() & 0xFFFF;

#if BITDEPTH == 16
            const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#endif

            fg_data[0].grain_scale_shift = rnd() & 3;
            fg_data[0].ar_coeff_shift = (rnd() & 3) + 6;
            fg_data[0].ar_coeff_lag = i;
            const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1);
            for (int n = 0; n < num_y_pos; n++)
                fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128;

            call_ref(grain_lut_c, fg_data HIGHBD_TAIL_SUFFIX);
            call_new(grain_lut_a, fg_data HIGHBD_TAIL_SUFFIX);
            checkasm_check_entry(grain_lut_c[0], sizeof(entry) * GRAIN_WIDTH,
                                 grain_lut_a[0], sizeof(entry) * GRAIN_WIDTH,
                                 GRAIN_WIDTH, GRAIN_HEIGHT, "grain_lut");

            bench_new(grain_lut_a, fg_data HIGHBD_TAIL_SUFFIX);
        }
    }

    report("gen_grain_y");
}

static void check_gen_grnuv(const Dav1dFilmGrainDSPContext *const dsp) {
    entry grain_lut_y[GRAIN_HEIGHT + 1][GRAIN_WIDTH];
    entry grain_lut_c[GRAIN_HEIGHT][GRAIN_WIDTH];
    entry grain_lut_a[GRAIN_HEIGHT + 1][GRAIN_WIDTH];

    declare_func(void, entry grain_lut[][GRAIN_WIDTH],
                 const entry grain_lut_y[][GRAIN_WIDTH],
                 const Dav1dFilmGrainData *data, intptr_t uv HIGHBD_DECL_SUFFIX);

    for (int layout_idx = 0; layout_idx < 3; layout_idx++) {
        const enum Dav1dPixelLayout layout = layout_idx + 1;
        const int ss_x = layout != DAV1D_PIXEL_LAYOUT_I444;
        const int ss_y = layout == DAV1D_PIXEL_LAYOUT_I420;

        for (int i = 0; i < 4; i++) {
            if (check_func(dsp->generate_grain_uv[layout_idx],
                           "gen_grain_uv_ar%d_%dbpc_%s",
                           i, BITDEPTH, ss_name[layout_idx]))
            {
                ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 1,);
                fg_data[0].seed = rnd() & 0xFFFF;

#if BITDEPTH == 16
                const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#endif

                fg_data[0].num_y_points = rnd() & 1;
                fg_data[0].grain_scale_shift = rnd() & 3;
                fg_data[0].ar_coeff_shift = (rnd() & 3) + 6;
                fg_data[0].ar_coeff_lag = i;
                const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1);
                for (int n = 0; n < num_y_pos; n++)
                    fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128;
                dsp->generate_grain_y(grain_lut_y, fg_data HIGHBD_TAIL_SUFFIX);

                const int uv = rnd() & 1;
                const int num_uv_pos = num_y_pos + !!fg_data[0].num_y_points;
                for (int n = 0; n < num_uv_pos; n++)
                    fg_data[0].ar_coeffs_uv[uv][n] = (rnd() & 0xff) - 128;
                if (!fg_data[0].num_y_points)
                    fg_data[0].ar_coeffs_uv[uv][num_uv_pos] = 0;
                memset(grain_lut_c, 0xff, sizeof(grain_lut_c));
                memset(grain_lut_a, 0xff, sizeof(grain_lut_a));
                call_ref(grain_lut_c, grain_lut_y, fg_data, uv HIGHBD_TAIL_SUFFIX);
                call_new(grain_lut_a, grain_lut_y, fg_data, uv HIGHBD_TAIL_SUFFIX);
                int w = ss_x ? 44 : GRAIN_WIDTH;
                int h = ss_y ? 38 : GRAIN_HEIGHT;
                checkasm_check_entry(grain_lut_c[0], sizeof(entry) * GRAIN_WIDTH,
                                     grain_lut_a[0], sizeof(entry) * GRAIN_WIDTH,
                                     w, h, "grain_lut");

                bench_new(grain_lut_a, grain_lut_y, fg_data, uv HIGHBD_TAIL_SUFFIX);
            }
        }
    }

    report("gen_grain_uv");
}

static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
    PIXEL_RECT(c_dst, 128, 32);
    PIXEL_RECT(a_dst, 128, 32);
    PIXEL_RECT(src, 128, 32);
    const ptrdiff_t stride = c_dst_stride;

    declare_func(void, pixel *dst_row, const pixel *src_row, ptrdiff_t stride,
                 const Dav1dFilmGrainData *data, size_t pw,
                 const uint8_t scaling[SCALING_SIZE],
                 const entry grain_lut[][GRAIN_WIDTH],
                 int bh, int row_num HIGHBD_DECL_SUFFIX);

    if (check_func(dsp->fgy_32x32xn, "fgy_32x32xn_%dbpc", BITDEPTH)) {
        ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 16,);
        fg_data[0].seed = rnd() & 0xFFFF;

#if BITDEPTH == 16
        const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
        const int bitdepth_max = 0xff;
#endif

        uint8_t scaling[SCALING_SIZE];
        entry grain_lut[GRAIN_HEIGHT + 1][GRAIN_WIDTH];
        fg_data[0].grain_scale_shift = rnd() & 3;
        fg_data[0].ar_coeff_shift = (rnd() & 3) + 6;
        fg_data[0].ar_coeff_lag = rnd() & 3;
        const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1);
        for (int n = 0; n < num_y_pos; n++)
            fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128;
        dsp->generate_grain_y(grain_lut, fg_data HIGHBD_TAIL_SUFFIX);

        fg_data[0].num_y_points = 2 + (rnd() % 13);
        const int pad = 0xff / fg_data[0].num_y_points;
        for (int n = 0; n < fg_data[0].num_y_points; n++) {
            fg_data[0].y_points[n][0] = 0xff * n / fg_data[0].num_y_points;
            fg_data[0].y_points[n][0] += rnd() % pad;
            fg_data[0].y_points[n][1] = rnd() & 0xff;
        }
        generate_scaling(bitdepth_from_max(bitdepth_max), fg_data[0].y_points,
                         fg_data[0].num_y_points, scaling);

        fg_data[0].clip_to_restricted_range = rnd() & 1;
        fg_data[0].scaling_shift = (rnd() & 3) + 8;
        for (fg_data[0].overlap_flag = 0; fg_data[0].overlap_flag <= 1;
             fg_data[0].overlap_flag++)
        {
            for (int i = 0; i <= 2 * fg_data[0].overlap_flag; i++) {
                int w, h, row_num;
                if (fg_data[0].overlap_flag) {
                    w = 35 + (rnd() % 93);
                    if (i == 0) {
                        row_num = 0;
                        h = 1 + (rnd() % 31);
                    } else {
                        row_num = 1 + (rnd() & 0x7ff);
                        if (i == 1) {
                            h = 3 + (rnd() % 30);
                        } else {
                            h = 1 + (rnd() & 1);
                        }
                    }
                } else {
                    w = 1 + (rnd() & 127);
                    h = 1 + (rnd() & 31);
                    row_num = rnd() & 0x7ff;
                }

                for (int y = 0; y < 32; y++) {
                    // Src pixels past the right edge can be uninitialized
                    for (int x = 0; x < 128; x++)
                        src[y * PXSTRIDE(stride) + x] = rnd();
                    for (int x = 0; x < w; x++)
                        src[y * PXSTRIDE(stride) + x] &= bitdepth_max;
                }

                CLEAR_PIXEL_RECT(c_dst);
                CLEAR_PIXEL_RECT(a_dst);
                call_ref(c_dst, src, stride, fg_data, w, scaling, grain_lut, h,
                         row_num HIGHBD_TAIL_SUFFIX);
                call_new(a_dst, src, stride, fg_data, w, scaling, grain_lut, h,
                         row_num HIGHBD_TAIL_SUFFIX);

                checkasm_check_pixel_padded_align(c_dst, stride, a_dst, stride,
                                                  w, h, "dst", 32, 2);
            }
        }
        fg_data[0].overlap_flag = 1;
        for (int y = 0; y < 32; y++) {
            // Make sure all pixels are in range
            for (int x = 0; x < 128; x++)
                src[y * PXSTRIDE(stride) + x] &= bitdepth_max;
        }
        bench_new(a_dst, src, stride, fg_data, 64, scaling, grain_lut, 32,
                  1 HIGHBD_TAIL_SUFFIX);
    }

    report("fgy_32x32xn");
}

static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
    PIXEL_RECT(c_dst, 128, 32);
    PIXEL_RECT(a_dst, 128, 32);
    PIXEL_RECT(src, 128, 32);
    PIXEL_RECT(luma_src, 128, 32);
    const ptrdiff_t lstride = luma_src_stride;

    declare_func(void, pixel *dst_row, const pixel *src_row, ptrdiff_t stride,
                 const Dav1dFilmGrainData *data, size_t pw,
                 const uint8_t scaling[SCALING_SIZE],
                 const entry grain_lut[][GRAIN_WIDTH], int bh, int row_num,
                 const pixel *luma_row, ptrdiff_t luma_stride, int uv_pl,
                 int is_identity HIGHBD_DECL_SUFFIX);

    for (int layout_idx = 0; layout_idx < 3; layout_idx++) {
        const enum Dav1dPixelLayout layout = layout_idx + 1;
        const int ss_x = layout != DAV1D_PIXEL_LAYOUT_I444;
        const int ss_y = layout == DAV1D_PIXEL_LAYOUT_I420;
        const ptrdiff_t stride = c_dst_stride;

        for (int csfl = 0; csfl <= 1; csfl++) {
            if (check_func(dsp->fguv_32x32xn[layout_idx],
                           "fguv_32x32xn_%dbpc_%s_csfl%d",
                           BITDEPTH, ss_name[layout_idx], csfl))
            {
                ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 1,);

                fg_data[0].seed = rnd() & 0xFFFF;

#if BITDEPTH == 16
                const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
                const int bitdepth_max = 0xff;
#endif
                const int uv_pl = rnd() & 1;
                const int is_identity = rnd() & 1;

                uint8_t scaling[SCALING_SIZE];
                entry grain_lut[2][GRAIN_HEIGHT + 1][GRAIN_WIDTH];
                fg_data[0].grain_scale_shift = rnd() & 3;
                fg_data[0].ar_coeff_shift = (rnd() & 3) + 6;
                fg_data[0].ar_coeff_lag = rnd() & 3;
                const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1);
                for (int n = 0; n < num_y_pos; n++)
                    fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128;
                const int num_uv_pos = num_y_pos + 1;
                for (int n = 0; n < num_uv_pos; n++)
                    fg_data[0].ar_coeffs_uv[uv_pl][n] = (rnd() & 0xff) - 128;
                dsp->generate_grain_y(grain_lut[0], fg_data HIGHBD_TAIL_SUFFIX);
                dsp->generate_grain_uv[layout_idx](grain_lut[1], grain_lut[0],
                                                   fg_data, uv_pl HIGHBD_TAIL_SUFFIX);

                if (csfl) {
                    fg_data[0].num_y_points = 2 + (rnd() % 13);
                    const int pad = 0xff / fg_data[0].num_y_points;
                    for (int n = 0; n < fg_data[0].num_y_points; n++) {
                        fg_data[0].y_points[n][0] = 0xff * n / fg_data[0].num_y_points;
                        fg_data[0].y_points[n][0] += rnd() % pad;
                        fg_data[0].y_points[n][1] = rnd() & 0xff;
                    }
                    generate_scaling(bitdepth_from_max(bitdepth_max), fg_data[0].y_points,
                                     fg_data[0].num_y_points, scaling);
                } else {
                    fg_data[0].num_uv_points[uv_pl] = 2 + (rnd() % 9);
                    const int pad = 0xff / fg_data[0].num_uv_points[uv_pl];
                    for (int n = 0; n < fg_data[0].num_uv_points[uv_pl]; n++) {
                        fg_data[0].uv_points[uv_pl][n][0] = 0xff * n / fg_data[0].num_uv_points[uv_pl];
                        fg_data[0].uv_points[uv_pl][n][0] += rnd() % pad;
                        fg_data[0].uv_points[uv_pl][n][1] = rnd() & 0xff;
                    }
                    generate_scaling(bitdepth_from_max(bitdepth_max), fg_data[0].uv_points[uv_pl],
                                     fg_data[0].num_uv_points[uv_pl], scaling);

                    fg_data[0].uv_mult[uv_pl] = (rnd() & 0xff) - 128;
                    fg_data[0].uv_luma_mult[uv_pl] = (rnd() & 0xff) - 128;
                    fg_data[0].uv_offset[uv_pl] = (rnd() & 0x1ff) - 256;
                }

                fg_data[0].clip_to_restricted_range = rnd() & 1;
                fg_data[0].scaling_shift = (rnd() & 3) + 8;
                fg_data[0].chroma_scaling_from_luma = csfl;
                for (fg_data[0].overlap_flag = 0; fg_data[0].overlap_flag <= 1;
                     fg_data[0].overlap_flag++)
                {
                    for (int i = 0; i <= 2 * fg_data[0].overlap_flag; i++) {
                        int w, h, row_num;
                        if (fg_data[0].overlap_flag) {
                            w = (36 >> ss_x) + (rnd() % (92 >> ss_x));
                            if (i == 0) {
                                row_num = 0;
                                h = 1 + (rnd() & (31 >> ss_y));
                            } else {
                                row_num = 1 + (rnd() & 0x7ff);
                                if (i == 1) {
                                    h = (ss_y ? 2 : 3) + (rnd() % (ss_y ? 15 : 30));
                                } else {
                                    h = ss_y ? 1 : 1 + (rnd() & 1);
                                }
                            }
                        } else {
                            w = 1 + (rnd() & (127 >> ss_x));
                            h = 1 + (rnd() & (31 >> ss_y));
                            row_num = rnd() & 0x7ff;
                        }

                        for (int y = 0; y < 32; y++) {
                            // Src pixels past the right edge can be uninitialized
                            for (int x = 0; x < 128; x++) {
                                src[y * PXSTRIDE(stride) + x] = rnd();
                                luma_src[y * PXSTRIDE(lstride) + x] = rnd();
                            }
                            for (int x = 0; x < w; x++)
                                src[y * PXSTRIDE(stride) + x] &= bitdepth_max;
                            for (int x = 0; x < (w << ss_x); x++)
                                luma_src[y * PXSTRIDE(lstride) + x] &= bitdepth_max;
                        }

                        CLEAR_PIXEL_RECT(c_dst);
                        CLEAR_PIXEL_RECT(a_dst);
                        call_ref(c_dst, src, stride, fg_data, w, scaling, grain_lut[1], h,
                                 row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
                        call_new(a_dst, src, stride, fg_data, w, scaling, grain_lut[1], h,
                                 row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);

                        checkasm_check_pixel_padded_align(c_dst, stride,
                                                          a_dst, stride,
                                                          w, h, "dst",
                                                          32 >> ss_x, 2);
                    }
                }

                fg_data[0].overlap_flag = 1;
                for (int y = 0; y < 32; y++) {
                    // Make sure all pixels are in range
                    for (int x = 0; x < 128; x++) {
                        src[y * PXSTRIDE(stride) + x] &= bitdepth_max;
                        luma_src[y * PXSTRIDE(lstride) + x] &= bitdepth_max;
                    }
                }
                bench_new(a_dst, src, stride, fg_data, 32, scaling, grain_lut[1], 16,
                          1, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
            }
        }
    }

    report("fguv_32x32xn");
}

void bitfn(checkasm_check_filmgrain)(void) {
    Dav1dFilmGrainDSPContext c;

    bitfn(dav1d_film_grain_dsp_init)(&c);

    check_gen_grny(&c);
    check_gen_grnuv(&c);
    check_fgy_sbrow(&c);
    check_fguv_sbrow(&c);
}
@ -0,0 +1,289 @@
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "tests/checkasm/checkasm.h"
#include "src/ipred.h"
#include "src/levels.h"

#include <stdio.h>

static const char *const intra_pred_mode_names[N_IMPL_INTRA_PRED_MODES] = {
    [DC_PRED]       = "dc",
    [DC_128_PRED]   = "dc_128",
    [TOP_DC_PRED]   = "dc_top",
    [LEFT_DC_PRED]  = "dc_left",
    [HOR_PRED]      = "h",
    [VERT_PRED]     = "v",
    [PAETH_PRED]    = "paeth",
    [SMOOTH_PRED]   = "smooth",
    [SMOOTH_V_PRED] = "smooth_v",
    [SMOOTH_H_PRED] = "smooth_h",
    [Z1_PRED]       = "z1",
    [Z2_PRED]       = "z2",
    [Z3_PRED]       = "z3",
    [FILTER_PRED]   = "filter"
};

static const char *const cfl_ac_names[3] = { "420", "422", "444" };

static const char *const cfl_pred_mode_names[DC_128_PRED + 1] = {
    [DC_PRED]      = "cfl",
    [DC_128_PRED]  = "cfl_128",
    [TOP_DC_PRED]  = "cfl_top",
    [LEFT_DC_PRED] = "cfl_left",
};

static const uint8_t z_angles[27] = {
     3,  6,  9,
    14, 17, 20, 23, 26, 29, 32,
    36, 39, 42, 45, 48, 51, 54,
    58, 61, 64, 67, 70, 73, 76,
    81, 84, 87
};
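/* (Per-zone angle offsets: the test below builds its angle parameter as
 * 90 * zone + offset, keeping the prediction angle in the low nine bits;
 * the extra bits or'ed in via (rnd() & 0x600) carry additional flags, which
 * the failure dump prints separately from a & 0x1ff.) */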

static void check_intra_pred(Dav1dIntraPredDSPContext *const c) {
    PIXEL_RECT(c_dst, 64, 64);
    PIXEL_RECT(a_dst, 64, 64);
    ALIGN_STK_64(pixel, topleft_buf, 257,);
    pixel *const topleft = topleft_buf + 128;

    declare_func(void, pixel *dst, ptrdiff_t stride, const pixel *topleft,
                 int width, int height, int angle, int max_width, int max_height
                 HIGHBD_DECL_SUFFIX);

    for (int mode = 0; mode < N_IMPL_INTRA_PRED_MODES; mode++) {
        int bpc_min = BITDEPTH, bpc_max = BITDEPTH;
        if (mode == FILTER_PRED && BITDEPTH == 16) {
            bpc_min = 10;
            bpc_max = 12;
        }
        for (int bpc = bpc_min; bpc <= bpc_max; bpc += 2)
        for (int w = 4; w <= (mode == FILTER_PRED ? 32 : 64); w <<= 1)
            if (check_func(c->intra_pred[mode], "intra_pred_%s_w%d_%dbpc",
                           intra_pred_mode_names[mode], w, bpc))
            {
                for (int h = imax(w / 4, 4); h <= imin(w * 4,
                    (mode == FILTER_PRED ? 32 : 64)); h <<= 1)
                {
                    const ptrdiff_t stride = c_dst_stride;

                    int a = 0, maxw = 0, maxh = 0;
                    if (mode >= Z1_PRED && mode <= Z3_PRED) { /* angle */
                        a = (90 * (mode - Z1_PRED) + z_angles[rnd() % 27]) |
                            (rnd() & 0x600);
                        if (mode == Z2_PRED) {
                            maxw = rnd(), maxh = rnd();
                            maxw = 1 + (maxw & (maxw & 4096 ? 4095 : w - 1));
                            maxh = 1 + (maxh & (maxh & 4096 ? 4095 : h - 1));
                        }
                    } else if (mode == FILTER_PRED) /* filter_idx */
                        a = (rnd() % 5) | (rnd() & ~511);

                    int bitdepth_max;
                    if (bpc == 16)
                        bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
                    else
                        bitdepth_max = (1 << bpc) - 1;

                    for (int i = -h * 2; i <= w * 2; i++)
                        topleft[i] = rnd() & bitdepth_max;

                    CLEAR_PIXEL_RECT(c_dst);
                    CLEAR_PIXEL_RECT(a_dst);
                    call_ref(c_dst, stride, topleft, w, h, a, maxw, maxh
                             HIGHBD_TAIL_SUFFIX);
                    call_new(a_dst, stride, topleft, w, h, a, maxw, maxh
                             HIGHBD_TAIL_SUFFIX);
                    if (checkasm_check_pixel_padded(c_dst, stride,
                                                    a_dst, stride,
                                                    w, h, "dst"))
                    {
                        if (mode == Z1_PRED || mode == Z3_PRED)
                            fprintf(stderr, "angle = %d (0x%03x)\n",
                                    a & 0x1ff, a & 0x600);
                        else if (mode == Z2_PRED)
                            fprintf(stderr, "angle = %d (0x%03x), "
                                    "max_width = %d, max_height = %d\n",
                                    a & 0x1ff, a & 0x600, maxw, maxh);
                        else if (mode == FILTER_PRED)
                            fprintf(stderr, "filter_idx = %d\n", a & 0x1ff);
                    }

                    bench_new(a_dst, stride, topleft, w, h, a, 128, 128
                              HIGHBD_TAIL_SUFFIX);
                }
            }
    }
    report("intra_pred");
}

static void check_cfl_ac(Dav1dIntraPredDSPContext *const c) {
    ALIGN_STK_64(int16_t, c_dst, 32 * 32,);
    ALIGN_STK_64(int16_t, a_dst, 32 * 32,);
    ALIGN_STK_64(pixel, luma, 32 * 32,);

    declare_func(void, int16_t *ac, const pixel *y, ptrdiff_t stride,
                 int w_pad, int h_pad, int cw, int ch);

    for (int layout = 1; layout <= DAV1D_PIXEL_LAYOUT_I444; layout++) {
        const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
        const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
        const int h_step = 2 >> ss_hor, v_step = 2 >> ss_ver;
        for (int w = 4; w <= (32 >> ss_hor); w <<= 1)
            if (check_func(c->cfl_ac[layout - 1], "cfl_ac_%s_w%d_%dbpc",
                           cfl_ac_names[layout - 1], w, BITDEPTH))
            {
                for (int h = imax(w / 4, 4);
                     h <= imin(w * 4, (32 >> ss_ver)); h <<= 1)
                {
                    const ptrdiff_t stride = 32 * sizeof(pixel);
                    for (int w_pad = imax((w >> 2) - h_step, 0);
                         w_pad >= 0; w_pad -= h_step)
                    {
                        for (int h_pad = imax((h >> 2) - v_step, 0);
                             h_pad >= 0; h_pad -= v_step)
                        {
#if BITDEPTH == 16
                            const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
                            const int bitdepth_max = 0xff;
#endif
                            for (int y = 0; y < (h << ss_ver); y++)
                                for (int x = 0; x < (w << ss_hor); x++)
                                    luma[y * 32 + x] = rnd() & bitdepth_max;

                            call_ref(c_dst, luma, stride, w_pad, h_pad, w, h);
                            call_new(a_dst, luma, stride, w_pad, h_pad, w, h);
                            checkasm_check(int16_t, c_dst, w * sizeof(*c_dst),
                                           a_dst, w * sizeof(*a_dst),
                                           w, h, "dst");
                        }
                    }

                    bench_new(a_dst, luma, stride, 0, 0, w, h);
                }
            }
    }
    report("cfl_ac");
}

static void check_cfl_pred(Dav1dIntraPredDSPContext *const c) {
    ALIGN_STK_64(pixel, c_dst, 32 * 32,);
    ALIGN_STK_64(pixel, a_dst, 32 * 32,);
    ALIGN_STK_64(int16_t, ac, 32 * 32,);
    ALIGN_STK_64(pixel, topleft_buf, 257,);
    pixel *const topleft = topleft_buf + 128;

    declare_func(void, pixel *dst, ptrdiff_t stride, const pixel *topleft,
                 int width, int height, const int16_t *ac, int alpha
                 HIGHBD_DECL_SUFFIX);

    for (int mode = 0; mode <= DC_128_PRED; mode += 1 + 2 * !mode)
        for (int w = 4; w <= 32; w <<= 1)
            if (check_func(c->cfl_pred[mode], "cfl_pred_%s_w%d_%dbpc",
                           cfl_pred_mode_names[mode], w, BITDEPTH))
            {
                for (int h = imax(w / 4, 4); h <= imin(w * 4, 32); h <<= 1)
                {
#if BITDEPTH == 16
                    const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
                    const int bitdepth_max = 0xff;
#endif

                    const ptrdiff_t stride = w * sizeof(pixel);

                    int alpha = ((rnd() & 15) + 1) * (1 - (rnd() & 2));

                    for (int i = -h * 2; i <= w * 2; i++)
                        topleft[i] = rnd() & bitdepth_max;

                    int luma_avg = w * h >> 1;
                    for (int i = 0; i < w * h; i++)
                        luma_avg += ac[i] = rnd() & (bitdepth_max << 3);
                    luma_avg /= w * h;
                    for (int i = 0; i < w * h; i++)
                        ac[i] -= luma_avg;
|
||||
|
||||
call_ref(c_dst, stride, topleft, w, h, ac, alpha
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
call_new(a_dst, stride, topleft, w, h, ac, alpha
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
checkasm_check_pixel(c_dst, stride, a_dst, stride,
|
||||
w, h, "dst");
|
||||
|
||||
bench_new(a_dst, stride, topleft, w, h, ac, alpha
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
}
|
||||
}
|
||||
report("cfl_pred");
|
||||
}
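
/* Note: the average-subtraction above normalizes ac[] to (roughly) zero
 * mean before the call, which appears to match the contract of the CfL
 * prediction functions: they expect downsampled luma samples with their DC
 * component already removed, scaled by the signed alpha. */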

static void check_pal_pred(Dav1dIntraPredDSPContext *const c) {
    ALIGN_STK_64(pixel, c_dst, 64 * 64,);
    ALIGN_STK_64(pixel, a_dst, 64 * 64,);
    ALIGN_STK_64(uint8_t, idx, 64 * 64,);
    ALIGN_STK_16(uint16_t, pal, 8,);

    declare_func(void, pixel *dst, ptrdiff_t stride, const uint16_t *pal,
                 const uint8_t *idx, int w, int h);

    for (int w = 4; w <= 64; w <<= 1)
        if (check_func(c->pal_pred, "pal_pred_w%d_%dbpc", w, BITDEPTH))
            for (int h = imax(w / 4, 4); h <= imin(w * 4, 64); h <<= 1)
            {
#if BITDEPTH == 16
                const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
                const int bitdepth_max = 0xff;
#endif
                const ptrdiff_t stride = w * sizeof(pixel);

                for (int i = 0; i < 8; i++)
                    pal[i] = rnd() & bitdepth_max;

                for (int i = 0; i < w * h; i++)
                    idx[i] = rnd() & 7;

                call_ref(c_dst, stride, pal, idx, w, h);
                call_new(a_dst, stride, pal, idx, w, h);
                checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst");

                bench_new(a_dst, stride, pal, idx, w, h);
            }
    report("pal_pred");
}

void bitfn(checkasm_check_ipred)(void) {
    Dav1dIntraPredDSPContext c;
    bitfn(dav1d_intra_pred_dsp_init)(&c);

    check_intra_pred(&c);
    check_cfl_ac(&c);
    check_cfl_pred(&c);
    check_pal_pred(&c);
}

@ -0,0 +1,313 @@

/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "tests/checkasm/checkasm.h"

#include <math.h>

#include "src/itx.h"
#include "src/levels.h"
#include "src/scan.h"
#include "src/tables.h"

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
#ifndef M_SQRT1_2
#define M_SQRT1_2 0.707106781186547524401
#endif

enum Tx1D { DCT, ADST, FLIPADST, IDENTITY, WHT };

static const uint8_t itx_1d_types[N_TX_TYPES_PLUS_LL][2] = {
    [DCT_DCT]           = { DCT,      DCT      },
    [ADST_DCT]          = { DCT,      ADST     },
    [DCT_ADST]          = { ADST,     DCT      },
    [ADST_ADST]         = { ADST,     ADST     },
    [FLIPADST_DCT]      = { DCT,      FLIPADST },
    [DCT_FLIPADST]      = { FLIPADST, DCT      },
    [FLIPADST_FLIPADST] = { FLIPADST, FLIPADST },
    [ADST_FLIPADST]     = { FLIPADST, ADST     },
    [FLIPADST_ADST]     = { ADST,     FLIPADST },
    [IDTX]              = { IDENTITY, IDENTITY },
    [V_DCT]             = { IDENTITY, DCT      },
    [H_DCT]             = { DCT,      IDENTITY },
    [V_ADST]            = { IDENTITY, ADST     },
    [H_ADST]            = { ADST,     IDENTITY },
    [V_FLIPADST]        = { IDENTITY, FLIPADST },
    [H_FLIPADST]        = { FLIPADST, IDENTITY },
    [WHT_WHT]           = { WHT,      WHT      },
};

static const char *const itx_1d_names[5] = {
    [DCT]      = "dct",
    [ADST]     = "adst",
    [FLIPADST] = "flipadst",
    [IDENTITY] = "identity",
    [WHT]      = "wht"
};

static const double scaling_factors[9] = {
    4.0000,             /* 4x4               */
    4.0000 * M_SQRT1_2, /* 4x8   8x4         */
    2.0000,             /* 4x16  8x8   16x4  */
    2.0000 * M_SQRT1_2, /* 8x16  16x8        */
    1.0000,             /* 8x32  16x16 32x8  */
    0.5000 * M_SQRT1_2, /* 16x32 32x16       */
    0.2500,             /* 16x64 32x32 64x16 */
    0.1250 * M_SQRT1_2, /* 32x64 64x32       */
    0.0625,             /* 64x64             */
};

/* FIXME: Ensure that those forward transforms are similar to the real AV1
 * transforms. The FLIPADST currently uses the ADST forward transform for
 * example which is obviously "incorrect", but we're just using it for now
 * since it does produce coefficients in the correct range at least. */

/* DCT-II */
static void fdct_1d(double *const out, const double *const in, const int sz) {
    for (int i = 0; i < sz; i++) {
        out[i] = 0.0;
        for (int j = 0; j < sz; j++)
            out[i] += in[j] * cos(M_PI * (2 * j + 1) * i / (sz * 2.0));
    }
    out[0] *= M_SQRT1_2;
}
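
/* For reference, fdct_1d() above computes the (unnormalized) DCT-II,
 *
 *   out[i] = sum_{j=0}^{N-1} in[j] * cos(pi * (2*j + 1) * i / (2*N)),
 *
 * with the DC term scaled by 1/sqrt(2) so that all basis vectors carry
 * equal energy; the per-size normalization is applied later through
 * scaling_factors[]. */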

/* See "Towards jointly optimal spatial prediction and adaptive transform in
 * video/image coding", by J. Han, A. Saxena, and K. Rose
 * IEEE Proc. ICASSP, pp. 726-729, Mar. 2010.
 * and "A Butterfly Structured Design of The Hybrid Transform Coding Scheme",
 * by Jingning Han, Yaowu Xu, and Debargha Mukherjee
 * http://research.google.com/pubs/archive/41418.pdf
 */
static void fadst_1d(double *const out, const double *const in, const int sz) {
    for (int i = 0; i < sz; i++) {
        out[i] = 0.0;
        for (int j = 0; j < sz; j++)
            out[i] += in[j] * sin(M_PI *
                (sz == 4 ? (    j + 1) * (2 * i + 1) / (8.0  + 1.0) :
                           (2 * j + 1) * (2 * i + 1) / (sz * 4.0)));
    }
}

static void fwht4_1d(double *const out, const double *const in)
{
    const double t0 = in[0] + in[1];
    const double t3 = in[3] - in[2];
    const double t4 = (t0 - t3) * 0.5;
    const double t1 = t4 - in[1];
    const double t2 = t4 - in[2];
    out[0] = t0 - t2;
    out[1] = t2;
    out[2] = t3 + t1;
    out[3] = t1;
}
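
/* fwht4_1d() mirrors the lifting structure of AV1's lossless 4-point
 * Walsh-Hadamard transform: a chain of adds, subtracts and a single
 * halving, e.g. out[1] = (in[0] + in[1] + in[2] - in[3]) / 2 - in[2],
 * which keeps the forward/inverse pair exactly invertible in integer
 * arithmetic. */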

static int copy_subcoefs(coef *coeff,
                         const enum RectTxfmSize tx, const enum TxfmType txtp,
                         const int sw, const int sh, const int subsh)
{
    /* copy the topleft coefficients such that the return value (being the
     * coefficient scantable index for the eob token) guarantees that only
     * the topleft $sub out of $sz (where $sz >= $sub) coefficients in both
     * dimensions are non-zero. This leads to branching to specific optimized
     * simd versions (e.g. dc-only) so that we get full asm coverage in this
     * test */

    const enum TxClass tx_class = dav1d_tx_type_class[txtp];
    const uint16_t *const scan = dav1d_scans[tx];
    const int sub_high = subsh > 0 ? subsh * 8 - 1 : 0;
    const int sub_low  = subsh > 1 ? sub_high - 8 : 0;
    int n, eob;

    for (n = 0, eob = 0; n < sw * sh; n++) {
        int rc, rcx, rcy;
        if (tx_class == TX_CLASS_2D)
            rc = scan[n], rcx = rc % sh, rcy = rc / sh;
        else if (tx_class == TX_CLASS_H)
            rcx = n % sh, rcy = n / sh, rc = n;
        else /* tx_class == TX_CLASS_V */
            rcx = n / sw, rcy = n % sw, rc = rcy * sh + rcx;

        /* Pick a random eob within this sub-itx */
        if (rcx > sub_high || rcy > sub_high) {
            break; /* upper boundary */
        } else if (!eob && (rcx > sub_low || rcy > sub_low))
            eob = n; /* lower boundary */
    }

    if (eob)
        eob += rnd() % (n - eob - 1);
    if (tx_class == TX_CLASS_2D)
        for (n = eob + 1; n < sw * sh; n++)
            coeff[scan[n]] = 0;
    else if (tx_class == TX_CLASS_H)
        for (n = eob + 1; n < sw * sh; n++)
            coeff[n] = 0;
    else /* tx_class == TX_CLASS_V */ {
        for (int rcx = eob / sw, rcy = eob % sw; rcx < sh; rcx++, rcy = -1)
            while (++rcy < sw)
                coeff[rcy * sh + rcx] = 0;
        n = sw * sh;
    }
    for (; n < 32 * 32; n++)
        coeff[n] = rnd();
    return eob;
}
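
/* Worked example (hypothetical sizes): for a 16x16 2D transform with
 * subsh == 1, sub_high == 7 and sub_low == 0, so the scan order is walked
 * until a coefficient falls outside the top-left 8x8 region; eob is then
 * randomized inside that band and everything past it is zeroed, steering
 * the inverse transform into its 8x8-subset (or, for subsh == 0, dc-only)
 * shortcut paths. */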

static int ftx(coef *const buf, const enum RectTxfmSize tx,
               const enum TxfmType txtp, const int w, const int h,
               const int subsh, const int bitdepth_max)
{
    double out[64 * 64], temp[64 * 64];
    const double scale = scaling_factors[ctz(w * h) - 4];
    const int sw = imin(w, 32), sh = imin(h, 32);

    for (int i = 0; i < h; i++) {
        double in[64], temp_out[64];

        for (int i = 0; i < w; i++)
            in[i] = (rnd() & (2 * bitdepth_max + 1)) - bitdepth_max;

        switch (itx_1d_types[txtp][0]) {
        case DCT:
            fdct_1d(temp_out, in, w);
            break;
        case ADST:
        case FLIPADST:
            fadst_1d(temp_out, in, w);
            break;
        case WHT:
            fwht4_1d(temp_out, in);
            break;
        case IDENTITY:
            memcpy(temp_out, in, w * sizeof(*temp_out));
            break;
        }

        for (int j = 0; j < w; j++)
            temp[j * h + i] = temp_out[j] * scale;
    }

    for (int i = 0; i < w; i++) {
        /* second pass uses the vertical 1-D type */
        switch (itx_1d_types[txtp][1]) {
        case DCT:
            fdct_1d(&out[i * h], &temp[i * h], h);
            break;
        case ADST:
        case FLIPADST:
            fadst_1d(&out[i * h], &temp[i * h], h);
            break;
        case WHT:
            fwht4_1d(&out[i * h], &temp[i * h]);
            break;
        case IDENTITY:
            memcpy(&out[i * h], &temp[i * h], h * sizeof(*out));
            break;
        }
    }

    for (int y = 0; y < sh; y++)
        for (int x = 0; x < sw; x++)
            buf[y * sw + x] = (coef) (out[y * w + x] + 0.5);

    return copy_subcoefs(buf, tx, txtp, sw, sh, subsh);
}

static void check_itxfm_add(Dav1dInvTxfmDSPContext *const c,
                            const enum RectTxfmSize tx)
{
    ALIGN_STK_64(coef, coeff, 2, [32 * 32]);
    ALIGN_STK_64(pixel, c_dst, 64 * 64,);
    ALIGN_STK_64(pixel, a_dst, 64 * 64,);

    static const uint8_t subsh_iters[5] = { 2, 2, 3, 5, 5 };

    const int w = dav1d_txfm_dimensions[tx].w * 4;
    const int h = dav1d_txfm_dimensions[tx].h * 4;
    const int subsh_max = subsh_iters[imax(dav1d_txfm_dimensions[tx].lw,
                                           dav1d_txfm_dimensions[tx].lh)];
#if BITDEPTH == 16
    const int bpc_min = 10, bpc_max = 12;
#else
    const int bpc_min = 8, bpc_max = 8;
#endif

    declare_func(void, pixel *dst, ptrdiff_t dst_stride, coef *coeff,
                 int eob HIGHBD_DECL_SUFFIX);

    for (int bpc = bpc_min; bpc <= bpc_max; bpc += 2) {
        bitfn(dav1d_itx_dsp_init)(c, bpc);
        for (enum TxfmType txtp = 0; txtp < N_TX_TYPES_PLUS_LL; txtp++)
            for (int subsh = 0; subsh < subsh_max; subsh++)
                if (check_func(c->itxfm_add[tx][txtp],
                               "inv_txfm_add_%dx%d_%s_%s_%d_%dbpc",
                               w, h, itx_1d_names[itx_1d_types[txtp][0]],
                               itx_1d_names[itx_1d_types[txtp][1]], subsh,
                               bpc))
                {
                    const int bitdepth_max = (1 << bpc) - 1;
                    const int eob = ftx(coeff[0], tx, txtp, w, h, subsh, bitdepth_max);
                    memcpy(coeff[1], coeff[0], sizeof(*coeff));

                    for (int j = 0; j < w * h; j++)
                        c_dst[j] = a_dst[j] = rnd() & bitdepth_max;

                    call_ref(c_dst, w * sizeof(*c_dst), coeff[0], eob
                             HIGHBD_TAIL_SUFFIX);
                    call_new(a_dst, w * sizeof(*c_dst), coeff[1], eob
                             HIGHBD_TAIL_SUFFIX);

                    checkasm_check_pixel(c_dst, w * sizeof(*c_dst),
                                         a_dst, w * sizeof(*a_dst),
                                         w, h, "dst");
                    if (memcmp(coeff[0], coeff[1], sizeof(*coeff)))
                        fail();

                    bench_new(a_dst, w * sizeof(*c_dst), coeff[0], eob
                              HIGHBD_TAIL_SUFFIX);
                }
    }
    report("add_%dx%d", w, h);
}
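
/* Note: the coefficient buffer is duplicated because the inverse transform
 * consumes it (the decoder relies on the coefficients being zeroed again
 * afterwards), so the C reference and the asm each get their own copy; the
 * memcmp() above then checks that the asm leaves its buffer in exactly the
 * same state as the C code did. */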

void bitfn(checkasm_check_itx)(void) {
    static const uint8_t txfm_size_order[N_RECT_TX_SIZES] = {
        TX_4X4,    RTX_4X8,   RTX_4X16,
        RTX_8X4,   TX_8X8,    RTX_8X16,  RTX_8X32,
        RTX_16X4,  RTX_16X8,  TX_16X16,  RTX_16X32, RTX_16X64,
        RTX_32X8,  RTX_32X16, TX_32X32,  RTX_32X64,
        RTX_64X16, RTX_64X32, TX_64X64
    };

    /* Zero unused function pointer elements. */
    Dav1dInvTxfmDSPContext c = { { { 0 } } };

    for (int i = 0; i < N_RECT_TX_SIZES; i++)
        check_itxfm_add(&c, txfm_size_order[i]);
}

@ -0,0 +1,203 @@

/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "tests/checkasm/checkasm.h"

#include <string.h>

#include "src/levels.h"
#include "src/loopfilter.h"

static void init_lpf_border(pixel *const dst, const ptrdiff_t stride,
                            int E, int I, const int bitdepth_max)
{
    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
    const int F = 1 << bitdepth_min_8;
    E <<= bitdepth_min_8;
    I <<= bitdepth_min_8;

    const int filter_type = rnd() % 4;
    const int edge_diff = rnd() % ((E + 2) * 4) - 2 * (E + 2);
    switch (filter_type) {
    case 0: // random, unfiltered
        for (int i = -8; i < 8; i++)
            dst[i * stride] = rnd() & bitdepth_max;
        break;
    case 1: // long flat
        dst[-8 * stride] = rnd() & bitdepth_max;
        dst[+7 * stride] = rnd() & bitdepth_max;
        dst[+0 * stride] = rnd() & bitdepth_max;
        dst[-1 * stride] = iclip_pixel(dst[+0 * stride] + edge_diff);
        for (int i = 1; i < 7; i++) {
            dst[-(1 + i) * stride] = iclip_pixel(dst[-1 * stride] +
                                                 rnd() % (2 * (F + 1)) - (F + 1));
            dst[+(0 + i) * stride] = iclip_pixel(dst[+0 * stride] +
                                                 rnd() % (2 * (F + 1)) - (F + 1));
        }
        break;
    case 2: // short flat
        for (int i = 4; i < 8; i++) {
            dst[-(1 + i) * stride] = rnd() & bitdepth_max;
            dst[+(0 + i) * stride] = rnd() & bitdepth_max;
        }
        dst[+0 * stride] = rnd() & bitdepth_max;
        dst[-1 * stride] = iclip_pixel(dst[+0 * stride] + edge_diff);
        for (int i = 1; i < 4; i++) {
            dst[-(1 + i) * stride] = iclip_pixel(dst[-1 * stride] +
                                                 rnd() % (2 * (F + 1)) - (F + 1));
            dst[+(0 + i) * stride] = iclip_pixel(dst[+0 * stride] +
                                                 rnd() % (2 * (F + 1)) - (F + 1));
        }
        break;
    case 3: // normal or hev
        for (int i = 4; i < 8; i++) {
            dst[-(1 + i) * stride] = rnd() & bitdepth_max;
            dst[+(0 + i) * stride] = rnd() & bitdepth_max;
        }
        dst[+0 * stride] = rnd() & bitdepth_max;
        dst[-1 * stride] = iclip_pixel(dst[+0 * stride] + edge_diff);
        for (int i = 1; i < 4; i++) {
            dst[-(1 + i) * stride] = iclip_pixel(dst[-(0 + i) * stride] +
                                                 rnd() % (2 * (I + 1)) - (I + 1));
            dst[+(0 + i) * stride] = iclip_pixel(dst[+(i - 1) * stride] +
                                                 rnd() % (2 * (I + 1)) - (I + 1));
        }
        break;
    }
}
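
/* The four profiles above target the branches of the loop filter's
 * strength decision: case 0 leaves random pixels that should mostly stay
 * unfiltered, cases 1 and 2 build wide and narrow flat runs (deltas within
 * F) around a controlled step at the edge to hit the long and short
 * flat-filter paths, and case 3 keeps neighbour deltas within I to hit the
 * normal / high-edge-variance path. */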

static void check_lpf_sb(loopfilter_sb_fn fn, const char *const name,
                         const int n_blks, const int lf_idx,
                         const int is_chroma, const int dir)
{
    ALIGN_STK_64(pixel, c_dst_mem, 128 * 16,);
    ALIGN_STK_64(pixel, a_dst_mem, 128 * 16,);

    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const uint32_t *mask,
                 const uint8_t (*l)[4], ptrdiff_t b4_stride,
                 const Av1FilterLUT *lut, int w HIGHBD_DECL_SUFFIX);

    pixel *a_dst, *c_dst;
    ptrdiff_t stride, b4_stride;
    int w, h;
    if (dir) {
        a_dst = a_dst_mem + 128 * 8;
        c_dst = c_dst_mem + 128 * 8;
        w = 128;
        h = 16;
        b4_stride = 32;
    } else {
        a_dst = a_dst_mem + 8;
        c_dst = c_dst_mem + 8;
        w = 16;
        h = 128;
        b4_stride = 2;
    }
    stride = w * sizeof(pixel);

    Av1FilterLUT lut;
    const int sharp = rnd() & 7;
    for (int level = 0; level < 64; level++) {
        int limit = level;

        if (sharp > 0) {
            limit >>= (sharp + 3) >> 2;
            limit = imin(limit, 9 - sharp);
        }
        limit = imax(limit, 1);

        lut.i[level] = limit;
        lut.e[level] = 2 * (level + 2) + limit;
    }
    lut.sharp[0] = (sharp + 3) >> 2;
    lut.sharp[1] = sharp ? 9 - sharp : 0xff;

    const int n_strengths = is_chroma ? 2 : 3;
    for (int i = 0; i < n_strengths; i++) {
        if (check_func(fn, "%s_w%d_%dbpc", name,
                       is_chroma ? 4 + 2 * i : 4 << i, BITDEPTH))
        {
            uint32_t vmask[4] = { 0 };
            uint8_t l[32 * 2][4];

            for (int j = 0; j < n_blks; j++) {
                const int idx = rnd() % (i + 2);
                if (idx) vmask[idx - 1] |= 1U << j;
                if (dir) {
                    l[j][lf_idx] = rnd() & 63;
                    l[j + 32][lf_idx] = rnd() & 63;
                } else {
                    l[j * 2][lf_idx] = rnd() & 63;
                    l[j * 2 + 1][lf_idx] = rnd() & 63;
                }
            }
#if BITDEPTH == 16
            const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
            const int bitdepth_max = 0xff;
#endif

            for (int i = 0; i < 4 * n_blks; i++) {
                const int x = i >> 2;
                int L;
                if (dir) {
                    L = l[32 + x][lf_idx] ? l[32 + x][lf_idx] : l[x][lf_idx];
                } else {
                    L = l[2 * x + 1][lf_idx] ? l[2 * x + 1][lf_idx] : l[2 * x][lf_idx];
                }
                init_lpf_border(c_dst + i * (dir ? 1 : 16), dir ? 128 : 1,
                                lut.e[L], lut.i[L], bitdepth_max);
            }
            memcpy(a_dst_mem, c_dst_mem, 128 * sizeof(pixel) * 16);

            call_ref(c_dst, stride,
                     vmask, (const uint8_t(*)[4]) &l[dir ? 32 : 1][lf_idx], b4_stride,
                     &lut, n_blks HIGHBD_TAIL_SUFFIX);
            call_new(a_dst, stride,
                     vmask, (const uint8_t(*)[4]) &l[dir ? 32 : 1][lf_idx], b4_stride,
                     &lut, n_blks HIGHBD_TAIL_SUFFIX);

            checkasm_check_pixel(c_dst_mem, stride, a_dst_mem, stride,
                                 w, h, "dst");
            bench_new(a_dst, stride,
                      vmask, (const uint8_t(*)[4]) &l[dir ? 32 : 1][lf_idx], b4_stride,
                      &lut, n_blks HIGHBD_TAIL_SUFFIX);
        }
    }
    report(name);
}

void bitfn(checkasm_check_loopfilter)(void) {
    Dav1dLoopFilterDSPContext c;

    bitfn(dav1d_loop_filter_dsp_init)(&c);

    check_lpf_sb(c.loop_filter_sb[0][0], "lpf_h_sb_y", 32, 0, 0, 0);
    check_lpf_sb(c.loop_filter_sb[0][1], "lpf_v_sb_y", 32, 1, 0, 1);
    check_lpf_sb(c.loop_filter_sb[1][0], "lpf_h_sb_uv", 16, 2, 1, 0);
    check_lpf_sb(c.loop_filter_sb[1][1], "lpf_v_sb_uv", 16, 2, 1, 1);
}

@ -0,0 +1,202 @@

/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "tests/checkasm/checkasm.h"

#include <stdio.h>
#include <string.h>

#include "src/levels.h"
#include "src/looprestoration.h"
#include "src/tables.h"

static int to_binary(int x) { /* 0-15 -> 0000-1111 */
    return (x & 1) + 5 * (x & 2) + 25 * (x & 4) + 125 * (x & 8);
}
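
/* Example: to_binary() renders a 4-bit LR_HAVE_* edge mask as a decimal
 * number whose digits read as binary -- bit k contributes 10^k (1 * 1,
 * 5 * 2, 25 * 4, 125 * 8) -- so to_binary(0x5) == 101 and
 * to_binary(0xf) == 1111, which is what the "%04d" failure messages below
 * print. */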

static void init_tmp(pixel *buf, const ptrdiff_t stride,
                     const int w, const int h, const int bitdepth_max)
{
    const int noise_mask = bitdepth_max >> 4;
    const int x_off = rnd() & 7, y_off = rnd() & 7;

    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            buf[x] = (((x + x_off) ^ (y + y_off)) & 8 ? bitdepth_max : 0) ^
                     (rnd() & noise_mask);
        }
        buf += PXSTRIDE(stride);
    }
}
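
/* init_tmp() fills the plane with an 8x8 checkerboard of the two bit-depth
 * extremes at a random phase, plus noise in the low bits: the resulting
 * maximal-contrast edges are meant to stress the intermediate precision of
 * the wiener and sgr filters far harder than uniform random data would. */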

static void check_wiener(Dav1dLoopRestorationDSPContext *const c, const int bpc) {
    ALIGN_STK_64(pixel, c_src, 448 * 64,), *const c_dst = c_src + 32;
    ALIGN_STK_64(pixel, a_src, 448 * 64,), *const a_dst = a_src + 32;
    ALIGN_STK_64(pixel, edge_buf, 448 * 8,), *const h_edge = edge_buf + 32;
    pixel left[64][4];
    LooprestorationParams params;
    int16_t (*const filter)[8] = params.filter;

    declare_func(void, pixel *dst, ptrdiff_t dst_stride,
                 const pixel (*const left)[4],
                 const pixel *lpf, ptrdiff_t lpf_stride,
                 int w, int h, const LooprestorationParams *params,
                 enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);

    for (int t = 0; t < 2; t++) {
        if (check_func(c->wiener[t], "wiener_%dtap_%dbpc", t ? 5 : 7, bpc)) {
            filter[0][0] = filter[0][6] = t ? 0 : (rnd() & 15) - 5;
            filter[0][1] = filter[0][5] = (rnd() & 31) - 23;
            filter[0][2] = filter[0][4] = (rnd() & 63) - 17;
            filter[0][3] = -(filter[0][0] + filter[0][1] + filter[0][2]) * 2;
#if BITDEPTH != 8
            filter[0][3] += 128;
#endif

            filter[1][0] = filter[1][6] = t ? 0 : (rnd() & 15) - 5;
            filter[1][1] = filter[1][5] = (rnd() & 31) - 23;
            filter[1][2] = filter[1][4] = (rnd() & 63) - 17;
            filter[1][3] = 128 - (filter[1][0] + filter[1][1] + filter[1][2]) * 2;

            const int base_w = 1 + (rnd() % 384);
            const int base_h = 1 + (rnd() & 63);
            const int bitdepth_max = (1 << bpc) - 1;

            init_tmp(c_src, 448 * sizeof(pixel), 448, 64, bitdepth_max);
            init_tmp(edge_buf, 448 * sizeof(pixel), 448, 8, bitdepth_max);
            init_tmp((pixel *) left, 4 * sizeof(pixel), 4, 64, bitdepth_max);

            for (enum LrEdgeFlags edges = 0; edges <= 0xf; edges++) {
                const int w = edges & LR_HAVE_RIGHT ? 256 : base_w;
                const int h = edges & LR_HAVE_BOTTOM ? 64 : base_h;

                memcpy(a_src, c_src, 448 * 64 * sizeof(pixel));

                call_ref(c_dst, 448 * sizeof(pixel), left,
                         h_edge, 448 * sizeof(pixel),
                         w, h, &params, edges HIGHBD_TAIL_SUFFIX);
                call_new(a_dst, 448 * sizeof(pixel), left,
                         h_edge, 448 * sizeof(pixel),
                         w, h, &params, edges HIGHBD_TAIL_SUFFIX);
                if (checkasm_check_pixel(c_dst, 448 * sizeof(pixel),
                                         a_dst, 448 * sizeof(pixel),
                                         w, h, "dst"))
                {
                    fprintf(stderr, "size = %dx%d, edges = %04d\n",
                            w, h, to_binary(edges));
                    break;
                }
            }
            bench_new(a_dst, 448 * sizeof(pixel), left,
                      h_edge, 448 * sizeof(pixel),
                      256, 64, &params, 0xf HIGHBD_TAIL_SUFFIX);
        }
    }
}

static void check_sgr(Dav1dLoopRestorationDSPContext *const c, const int bpc) {
    ALIGN_STK_64(pixel, c_src, 448 * 64,), *const c_dst = c_src + 32;
    ALIGN_STK_64(pixel, a_src, 448 * 64,), *const a_dst = a_src + 32;
    ALIGN_STK_64(pixel, edge_buf, 448 * 8,), *const h_edge = edge_buf + 32;
    pixel left[64][4];
    LooprestorationParams params;

    declare_func(void, pixel *dst, ptrdiff_t dst_stride,
                 const pixel (*const left)[4],
                 const pixel *lpf, ptrdiff_t lpf_stride,
                 int w, int h, const LooprestorationParams *params,
                 enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);

    static const struct { char name[4]; uint8_t idx; } sgr_data[3] = {
        { "5x5", 14 },
        { "3x3", 10 },
        { "mix", 0 },
    };

    for (int i = 0; i < 3; i++) {
        if (check_func(c->sgr[i], "sgr_%s_%dbpc", sgr_data[i].name, bpc)) {
            const uint16_t *const sgr_params = dav1d_sgr_params[sgr_data[i].idx];
            params.sgr.s0 = sgr_params[0];
            params.sgr.s1 = sgr_params[1];
            params.sgr.w0 = sgr_params[0] ? (rnd() & 127) - 96 : 0;
            params.sgr.w1 = (sgr_params[1] ? 160 - (rnd() & 127) : 33) - params.sgr.w0;

            const int base_w = 1 + (rnd() % 384);
            const int base_h = 1 + (rnd() & 63);
            const int bitdepth_max = (1 << bpc) - 1;

            init_tmp(c_src, 448 * sizeof(pixel), 448, 64, bitdepth_max);
            init_tmp(edge_buf, 448 * sizeof(pixel), 448, 8, bitdepth_max);
            init_tmp((pixel *) left, 4 * sizeof(pixel), 4, 64, bitdepth_max);

            for (enum LrEdgeFlags edges = 0; edges <= 0xf; edges++) {
                const int w = edges & LR_HAVE_RIGHT ? 256 : base_w;
                const int h = edges & LR_HAVE_BOTTOM ? 64 : base_h;

                memcpy(a_src, c_src, 448 * 64 * sizeof(pixel));

                call_ref(c_dst, 448 * sizeof(pixel), left,
                         h_edge, 448 * sizeof(pixel),
                         w, h, &params, edges HIGHBD_TAIL_SUFFIX);
                call_new(a_dst, 448 * sizeof(pixel), left,
                         h_edge, 448 * sizeof(pixel),
                         w, h, &params, edges HIGHBD_TAIL_SUFFIX);
                if (checkasm_check_pixel(c_dst, 448 * sizeof(pixel),
                                         a_dst, 448 * sizeof(pixel),
                                         w, h, "dst"))
                {
                    fprintf(stderr, "size = %dx%d, edges = %04d\n",
                            w, h, to_binary(edges));
                    break;
                }
            }
            bench_new(a_dst, 448 * sizeof(pixel), left,
                      h_edge, 448 * sizeof(pixel),
                      256, 64, &params, 0xf HIGHBD_TAIL_SUFFIX);
        }
    }
}

void bitfn(checkasm_check_looprestoration)(void) {
#if BITDEPTH == 16
    const int bpc_min = 10, bpc_max = 12;
#else
    const int bpc_min = 8, bpc_max = 8;
#endif
    for (int bpc = bpc_min; bpc <= bpc_max; bpc += 2) {
        Dav1dLoopRestorationDSPContext c;
        bitfn(dav1d_loop_restoration_dsp_init)(&c, bpc);
        check_wiener(&c, bpc);
    }
    report("wiener");
    for (int bpc = bpc_min; bpc <= bpc_max; bpc += 2) {
        Dav1dLoopRestorationDSPContext c;
        bitfn(dav1d_loop_restoration_dsp_init)(&c, bpc);
        check_sgr(&c, bpc);
    }
    report("sgr");
}

@ -0,0 +1,756 @@

/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "tests/checkasm/checkasm.h"

#include "src/levels.h"
#include "src/mc.h"

static const char *const filter_names[] = {
    "8tap_regular",        "8tap_regular_smooth", "8tap_regular_sharp",
    "8tap_sharp_regular",  "8tap_sharp_smooth",   "8tap_sharp",
    "8tap_smooth_regular", "8tap_smooth",         "8tap_smooth_sharp",
    "bilinear"
};

static const char *const mxy_names[] = { "0", "h", "v", "hv" };
static const char *const scaled_paths[] = { "", "_dy1", "_dy2" };

static int mc_h_next(const int h) {
    switch (h) {
    case 4:
    case 8:
    case 16:
        return (h * 3) >> 1;
    case 6:
    case 12:
    case 24:
        return (h & (h - 1)) * 2;
    default:
        return h * 2;
    }
}
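
/* Starting from the minimum, mc_h_next() steps h through
 * 2, 4, 6, 8, 12, 16, 24, 32, 64, 128: the power-of-two heights plus the
 * 1.5x intermediates up to 24, so the oddly-proportioned block sizes get
 * covered without iterating over every possible height. */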

static void check_mc(Dav1dMCDSPContext *const c) {
    ALIGN_STK_64(pixel, src_buf, 135 * 135,);
    ALIGN_STK_64(pixel, c_dst, 128 * 128,);
    ALIGN_STK_64(pixel, a_dst, 128 * 128,);
    const pixel *src = src_buf + 135 * 3 + 3;
    const ptrdiff_t src_stride = 135 * sizeof(pixel);

    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *src,
                 ptrdiff_t src_stride, int w, int h, int mx, int my
                 HIGHBD_DECL_SUFFIX);

    for (int filter = 0; filter < N_2D_FILTERS; filter++)
        for (int w = 2; w <= 128; w <<= 1) {
            const ptrdiff_t dst_stride = w * sizeof(pixel);
            for (int mxy = 0; mxy < 4; mxy++)
                if (check_func(c->mc[filter], "mc_%s_w%d_%s_%dbpc",
                               filter_names[filter], w, mxy_names[mxy], BITDEPTH))
                {
                    const int h_min = w <= 32 ? 2 : w / 4;
                    const int h_max = imax(imin(w * 4, 128), 32);
                    for (int h = h_min; h <= h_max; h = mc_h_next(h)) {
                        const int mx = (mxy & 1) ? rnd() % 15 + 1 : 0;
                        const int my = (mxy & 2) ? rnd() % 15 + 1 : 0;
#if BITDEPTH == 16
                        const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
                        const int bitdepth_max = 0xff;
#endif

                        for (int i = 0; i < 135 * 135; i++)
                            src_buf[i] = rnd() & bitdepth_max;

                        call_ref(c_dst, dst_stride, src, src_stride, w, h,
                                 mx, my HIGHBD_TAIL_SUFFIX);
                        call_new(a_dst, dst_stride, src, src_stride, w, h,
                                 mx, my HIGHBD_TAIL_SUFFIX);
                        checkasm_check_pixel(c_dst, dst_stride,
                                             a_dst, dst_stride,
                                             w, h, "dst");

                        if (filter == FILTER_2D_8TAP_REGULAR ||
                            filter == FILTER_2D_BILINEAR)
                        {
                            bench_new(a_dst, dst_stride, src, src_stride, w, h,
                                      mx, my HIGHBD_TAIL_SUFFIX);
                        }
                    }
                }
        }
    report("mc");
}

/* Generate worst case input in the topleft corner, randomize the rest */
static void generate_mct_input(pixel *const buf, const int bitdepth_max) {
    static const int8_t pattern[8] = { -1, 0, -1, 0, 0, -1, 0, -1 };
    const int sign = -(rnd() & 1);

    for (int y = 0; y < 135; y++)
        for (int x = 0; x < 135; x++)
            buf[135*y+x] = ((x | y) < 8 ? (pattern[x] ^ pattern[y] ^ sign)
                                        : rnd()) & bitdepth_max;
}
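
/* In the 8x8 corner every sample ends up as either 0 or bitdepth_max
 * (pattern[x] ^ pattern[y] ^ sign is all-zeroes or all-ones before the
 * mask), giving maximal-amplitude edges under the 8-tap filters -- which is
 * where intermediate overflow in the mct path would show up first. */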

static void check_mct(Dav1dMCDSPContext *const c) {
    ALIGN_STK_64(pixel, src_buf, 135 * 135,);
    ALIGN_STK_64(int16_t, c_tmp, 128 * 128,);
    ALIGN_STK_64(int16_t, a_tmp, 128 * 128,);
    const pixel *src = src_buf + 135 * 3 + 3;
    const ptrdiff_t src_stride = 135 * sizeof(pixel);

    declare_func(void, int16_t *tmp, const pixel *src, ptrdiff_t src_stride,
                 int w, int h, int mx, int my HIGHBD_DECL_SUFFIX);

    for (int filter = 0; filter < N_2D_FILTERS; filter++)
        for (int w = 4; w <= 128; w <<= 1)
            for (int mxy = 0; mxy < 4; mxy++)
                if (check_func(c->mct[filter], "mct_%s_w%d_%s_%dbpc",
                               filter_names[filter], w, mxy_names[mxy], BITDEPTH))
                    for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1)
                    {
                        const int mx = (mxy & 1) ? rnd() % 15 + 1 : 0;
                        const int my = (mxy & 2) ? rnd() % 15 + 1 : 0;
#if BITDEPTH == 16
                        const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
                        const int bitdepth_max = 0xff;
#endif
                        generate_mct_input(src_buf, bitdepth_max);

                        call_ref(c_tmp, src, src_stride, w, h,
                                 mx, my HIGHBD_TAIL_SUFFIX);
                        call_new(a_tmp, src, src_stride, w, h,
                                 mx, my HIGHBD_TAIL_SUFFIX);
                        checkasm_check(int16_t, c_tmp, w * sizeof(*c_tmp),
                                       a_tmp, w * sizeof(*a_tmp),
                                       w, h, "tmp");

                        if (filter == FILTER_2D_8TAP_REGULAR ||
                            filter == FILTER_2D_BILINEAR)
                        {
                            bench_new(a_tmp, src, src_stride, w, h,
                                      mx, my HIGHBD_TAIL_SUFFIX);
                        }
                    }
    report("mct");
}

static void check_mc_scaled(Dav1dMCDSPContext *const c) {
    ALIGN_STK_64(pixel, src_buf, 263 * 263,);
    ALIGN_STK_64(pixel, c_dst, 128 * 128,);
    ALIGN_STK_64(pixel, a_dst, 128 * 128,);
    const pixel *src = src_buf + 263 * 3 + 3;
    const ptrdiff_t src_stride = 263 * sizeof(pixel);
#if BITDEPTH == 16
    const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
    const int bitdepth_max = 0xff;
#endif

    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *src,
                 ptrdiff_t src_stride, int w, int h,
                 int mx, int my, int dx, int dy HIGHBD_DECL_SUFFIX);

    for (int filter = 0; filter < N_2D_FILTERS; filter++)
        for (int w = 2; w <= 128; w <<= 1) {
            const ptrdiff_t dst_stride = w * sizeof(pixel);
            for (int p = 0; p < 3; ++p) {
                if (check_func(c->mc_scaled[filter], "mc_scaled_%s_w%d%s_%dbpc",
                               filter_names[filter], w, scaled_paths[p], BITDEPTH))
                {
                    const int h_min = w <= 32 ? 2 : w / 4;
                    const int h_max = imax(imin(w * 4, 128), 32);
                    for (int h = h_min; h <= h_max; h = mc_h_next(h)) {
                        const int mx = rnd() % 1024;
                        const int my = rnd() % 1024;
                        const int dx = rnd() % 2048 + 1;
                        const int dy = !p
                            ? rnd() % 2048 + 1
                            : p << 10; // ystep=1.0 and ystep=2.0 paths

                        for (int k = 0; k < 263 * 263; k++)
                            src_buf[k] = rnd() & bitdepth_max;

                        call_ref(c_dst, dst_stride, src, src_stride,
                                 w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
                        call_new(a_dst, dst_stride, src, src_stride,
                                 w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
                        checkasm_check_pixel(c_dst, dst_stride,
                                             a_dst, dst_stride, w, h, "dst");

                        if (filter == FILTER_2D_8TAP_REGULAR ||
                            filter == FILTER_2D_BILINEAR)
                            bench_new(a_dst, dst_stride, src, src_stride,
                                      w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
                    }
                }
            }
        }
    report("mc_scaled");
}

static void check_mct_scaled(Dav1dMCDSPContext *const c) {
    ALIGN_STK_64(pixel, src_buf, 263 * 263,);
    ALIGN_STK_64(int16_t, c_tmp, 128 * 128,);
    ALIGN_STK_64(int16_t, a_tmp, 128 * 128,);
    const pixel *src = src_buf + 263 * 3 + 3;
    const ptrdiff_t src_stride = 263 * sizeof(pixel);
#if BITDEPTH == 16
    const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
    const int bitdepth_max = 0xff;
#endif

    declare_func(void, int16_t *tmp, const pixel *src, ptrdiff_t src_stride,
                 int w, int h, int mx, int my, int dx, int dy HIGHBD_DECL_SUFFIX);

    for (int filter = 0; filter < N_2D_FILTERS; filter++)
        for (int w = 4; w <= 128; w <<= 1)
            for (int p = 0; p < 3; ++p) {
                if (check_func(c->mct_scaled[filter], "mct_scaled_%s_w%d%s_%dbpc",
                               filter_names[filter], w, scaled_paths[p], BITDEPTH))
                {
                    const int h_min = imax(w / 4, 4);
                    const int h_max = imin(w * 4, 128);
                    for (int h = h_min; h <= h_max; h = mc_h_next(h)) {
                        const int mx = rnd() % 1024;
                        const int my = rnd() % 1024;
                        const int dx = rnd() % 2048 + 1;
                        const int dy = !p
                            ? rnd() % 2048 + 1
                            : p << 10; // ystep=1.0 and ystep=2.0 paths

                        for (int k = 0; k < 263 * 263; k++)
                            src_buf[k] = rnd() & bitdepth_max;

                        call_ref(c_tmp, src, src_stride,
                                 w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
                        call_new(a_tmp, src, src_stride,
                                 w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
                        checkasm_check(int16_t, c_tmp, w * sizeof(*c_tmp),
                                       a_tmp, w * sizeof(*a_tmp),
                                       w, h, "tmp");

                        if (filter == FILTER_2D_8TAP_REGULAR ||
                            filter == FILTER_2D_BILINEAR)
                            bench_new(a_tmp, src, src_stride,
                                      w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
                    }
                }
            }
    report("mct_scaled");
}

static void init_tmp(Dav1dMCDSPContext *const c, pixel *const buf,
                     int16_t (*const tmp)[128 * 128], const int bitdepth_max)
{
    for (int i = 0; i < 2; i++) {
        generate_mct_input(buf, bitdepth_max);
        c->mct[FILTER_2D_8TAP_SHARP](tmp[i], buf + 135 * 3 + 3,
                                     135 * sizeof(pixel), 128, 128,
                                     8, 8 HIGHBD_TAIL_SUFFIX);
    }
}

static void check_avg(Dav1dMCDSPContext *const c) {
    ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
    ALIGN_STK_64(pixel, c_dst, 135 * 135,);
    ALIGN_STK_64(pixel, a_dst, 128 * 128,);

    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
                 const int16_t *tmp2, int w, int h HIGHBD_DECL_SUFFIX);

    for (int w = 4; w <= 128; w <<= 1)
        if (check_func(c->avg, "avg_w%d_%dbpc", w, BITDEPTH)) {
            ptrdiff_t dst_stride = w * sizeof(pixel);
            for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1)
            {
#if BITDEPTH == 16
                const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
                const int bitdepth_max = 0xff;
#endif

                init_tmp(c, c_dst, tmp, bitdepth_max);
                call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX);
                call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX);
                checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
                                     w, h, "dst");

                bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX);
            }
        }
    report("avg");
}

static void check_w_avg(Dav1dMCDSPContext *const c) {
    ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
    ALIGN_STK_64(pixel, c_dst, 135 * 135,);
    ALIGN_STK_64(pixel, a_dst, 128 * 128,);

    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
                 const int16_t *tmp2, int w, int h, int weight HIGHBD_DECL_SUFFIX);

    for (int w = 4; w <= 128; w <<= 1)
        if (check_func(c->w_avg, "w_avg_w%d_%dbpc", w, BITDEPTH)) {
            ptrdiff_t dst_stride = w * sizeof(pixel);
            for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1)
            {
                int weight = rnd() % 15 + 1;
#if BITDEPTH == 16
                const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
                const int bitdepth_max = 0xff;
#endif
                init_tmp(c, c_dst, tmp, bitdepth_max);

                call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX);
                call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX);
                checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
                                     w, h, "dst");

                bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX);
            }
        }
    report("w_avg");
}

static void check_mask(Dav1dMCDSPContext *const c) {
    ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
    ALIGN_STK_64(pixel, c_dst, 135 * 135,);
    ALIGN_STK_64(pixel, a_dst, 128 * 128,);
    ALIGN_STK_64(uint8_t, mask, 128 * 128,);

    for (int i = 0; i < 128 * 128; i++)
        mask[i] = rnd() % 65;

    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
                 const int16_t *tmp2, int w, int h, const uint8_t *mask
                 HIGHBD_DECL_SUFFIX);

    for (int w = 4; w <= 128; w <<= 1)
        if (check_func(c->mask, "mask_w%d_%dbpc", w, BITDEPTH)) {
            ptrdiff_t dst_stride = w * sizeof(pixel);
            for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1)
            {
#if BITDEPTH == 16
                const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
                const int bitdepth_max = 0xff;
#endif
                init_tmp(c, c_dst, tmp, bitdepth_max);
                call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX);
                call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX);
                checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
                                     w, h, "dst");

                bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX);
            }
        }
    report("mask");
}

static void check_w_mask(Dav1dMCDSPContext *const c) {
    ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
    ALIGN_STK_64(pixel, c_dst, 135 * 135,);
    ALIGN_STK_64(pixel, a_dst, 128 * 128,);
    ALIGN_STK_64(uint8_t, c_mask, 128 * 128,);
    ALIGN_STK_64(uint8_t, a_mask, 128 * 128,);

    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
                 const int16_t *tmp2, int w, int h, uint8_t *mask, int sign
                 HIGHBD_DECL_SUFFIX);

    static const uint16_t ss[] = { 444, 422, 420 };
    static const uint8_t ss_hor[] = { 0, 1, 1 };
    static const uint8_t ss_ver[] = { 0, 0, 1 };

    for (int i = 0; i < 3; i++)
        for (int w = 4; w <= 128; w <<= 1)
            if (check_func(c->w_mask[i], "w_mask_%d_w%d_%dbpc", ss[i], w,
                           BITDEPTH))
            {
                ptrdiff_t dst_stride = w * sizeof(pixel);
                for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1)
                {
                    int sign = rnd() & 1;
#if BITDEPTH == 16
                    const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
                    const int bitdepth_max = 0xff;
#endif
                    init_tmp(c, c_dst, tmp, bitdepth_max);

                    call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h,
                             c_mask, sign HIGHBD_TAIL_SUFFIX);
                    call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h,
                             a_mask, sign HIGHBD_TAIL_SUFFIX);
                    checkasm_check_pixel(c_dst, dst_stride,
                                         a_dst, dst_stride,
                                         w, h, "dst");
                    checkasm_check(uint8_t, c_mask, w >> ss_hor[i],
                                   a_mask, w >> ss_hor[i],
                                   w >> ss_hor[i], h >> ss_ver[i],
                                   "mask");

                    bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h,
                              a_mask, sign HIGHBD_TAIL_SUFFIX);
                }
            }
    report("w_mask");
}

static void check_blend(Dav1dMCDSPContext *const c) {
    ALIGN_STK_64(pixel, tmp, 32 * 32,);
    ALIGN_STK_64(pixel, c_dst, 32 * 32,);
    ALIGN_STK_64(pixel, a_dst, 32 * 32,);
    ALIGN_STK_64(uint8_t, mask, 32 * 32,);

    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
                 int w, int h, const uint8_t *mask);

    for (int w = 4; w <= 32; w <<= 1) {
        const ptrdiff_t dst_stride = w * sizeof(pixel);
        if (check_func(c->blend, "blend_w%d_%dbpc", w, BITDEPTH))
            for (int h = imax(w / 2, 4); h <= imin(w * 2, 32); h <<= 1) {
#if BITDEPTH == 16
                const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
                const int bitdepth_max = 0xff;
#endif
                for (int i = 0; i < 32 * 32; i++) {
                    tmp[i] = rnd() & bitdepth_max;
                    mask[i] = rnd() % 65;
                }
                for (int i = 0; i < w * h; i++)
                    c_dst[i] = a_dst[i] = rnd() & bitdepth_max;

                call_ref(c_dst, dst_stride, tmp, w, h, mask);
                call_new(a_dst, dst_stride, tmp, w, h, mask);
                checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
                                     w, h, "dst");

                bench_new(a_dst, dst_stride, tmp, w, h, mask);
            }
    }
    report("blend");
}

static void check_blend_v(Dav1dMCDSPContext *const c) {
    ALIGN_STK_64(pixel, tmp, 32 * 128,);
    ALIGN_STK_64(pixel, c_dst, 32 * 128,);
    ALIGN_STK_64(pixel, a_dst, 32 * 128,);

    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
                 int w, int h);

    for (int w = 2; w <= 32; w <<= 1) {
        const ptrdiff_t dst_stride = w * sizeof(pixel);
        if (check_func(c->blend_v, "blend_v_w%d_%dbpc", w, BITDEPTH))
            for (int h = 2; h <= (w == 2 ? 64 : 128); h <<= 1) {
#if BITDEPTH == 16
                const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
                const int bitdepth_max = 0xff;
#endif

                for (int i = 0; i < w * h; i++)
                    c_dst[i] = a_dst[i] = rnd() & bitdepth_max;
                for (int i = 0; i < 32 * 128; i++)
                    tmp[i] = rnd() & bitdepth_max;

                call_ref(c_dst, dst_stride, tmp, w, h);
                call_new(a_dst, dst_stride, tmp, w, h);
                checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
                                     w, h, "dst");

                bench_new(a_dst, dst_stride, tmp, w, h);
            }
    }
    report("blend_v");
}

static void check_blend_h(Dav1dMCDSPContext *const c) {
    ALIGN_STK_64(pixel, tmp, 128 * 32,);
    ALIGN_STK_64(pixel, c_dst, 128 * 32,);
    ALIGN_STK_64(pixel, a_dst, 128 * 32,);

    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
                 int w, int h);

    for (int w = 2; w <= 128; w <<= 1) {
        const ptrdiff_t dst_stride = w * sizeof(pixel);
        if (check_func(c->blend_h, "blend_h_w%d_%dbpc", w, BITDEPTH))
            for (int h = (w == 128 ? 4 : 2); h <= 32; h <<= 1) {
#if BITDEPTH == 16
                const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
                const int bitdepth_max = 0xff;
#endif
                for (int i = 0; i < w * h; i++)
                    c_dst[i] = a_dst[i] = rnd() & bitdepth_max;
                for (int i = 0; i < 128 * 32; i++)
                    tmp[i] = rnd() & bitdepth_max;

                call_ref(c_dst, dst_stride, tmp, w, h);
                call_new(a_dst, dst_stride, tmp, w, h);
                checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
                                     w, h, "dst");

                bench_new(a_dst, dst_stride, tmp, w, h);
            }
    }
    report("blend_h");
}

static void check_warp8x8(Dav1dMCDSPContext *const c) {
    ALIGN_STK_64(pixel, src_buf, 15 * 15,);
    ALIGN_STK_64(pixel, c_dst, 8 * 8,);
    ALIGN_STK_64(pixel, a_dst, 8 * 8,);
    int16_t abcd[4];
    const pixel *src = src_buf + 15 * 3 + 3;
    const ptrdiff_t dst_stride = 8 * sizeof(pixel);
    const ptrdiff_t src_stride = 15 * sizeof(pixel);

    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *src,
                 ptrdiff_t src_stride, const int16_t *abcd, int mx, int my
                 HIGHBD_DECL_SUFFIX);

    if (check_func(c->warp8x8, "warp_8x8_%dbpc", BITDEPTH)) {
        const int mx = (rnd() & 0x1fff) - 0xa00;
        const int my = (rnd() & 0x1fff) - 0xa00;
#if BITDEPTH == 16
        const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
        const int bitdepth_max = 0xff;
#endif

        for (int i = 0; i < 4; i++)
            abcd[i] = (rnd() & 0x1fff) - 0xa00;

        for (int i = 0; i < 15 * 15; i++)
            src_buf[i] = rnd() & bitdepth_max;

        call_ref(c_dst, dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
        call_new(a_dst, dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
        checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
                             8, 8, "dst");

        bench_new(a_dst, dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
    }
    report("warp8x8");
}

static void check_warp8x8t(Dav1dMCDSPContext *const c) {
    ALIGN_STK_64(pixel, src_buf, 15 * 15,);
    ALIGN_STK_64(int16_t, c_tmp, 8 * 8,);
    ALIGN_STK_64(int16_t, a_tmp, 8 * 8,);
    int16_t abcd[4];
    const pixel *src = src_buf + 15 * 3 + 3;
    const ptrdiff_t src_stride = 15 * sizeof(pixel);

    declare_func(void, int16_t *tmp, ptrdiff_t tmp_stride, const pixel *src,
                 ptrdiff_t src_stride, const int16_t *abcd, int mx, int my
                 HIGHBD_DECL_SUFFIX);

    if (check_func(c->warp8x8t, "warp_8x8t_%dbpc", BITDEPTH)) {
        const int mx = (rnd() & 0x1fff) - 0xa00;
        const int my = (rnd() & 0x1fff) - 0xa00;
#if BITDEPTH == 16
        const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
        const int bitdepth_max = 0xff;
#endif

        for (int i = 0; i < 4; i++)
            abcd[i] = (rnd() & 0x1fff) - 0xa00;

        for (int i = 0; i < 15 * 15; i++)
            src_buf[i] = rnd() & bitdepth_max;

        call_ref(c_tmp, 8, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
        call_new(a_tmp, 8, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
        checkasm_check(int16_t, c_tmp, 8 * sizeof(*c_tmp),
                       a_tmp, 8 * sizeof(*a_tmp),
                       8, 8, "tmp");

        bench_new(a_tmp, 8, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX);
    }
    report("warp8x8t");
}

enum EdgeFlags {
    HAVE_TOP = 1,
    HAVE_BOTTOM = 2,
    HAVE_LEFT = 4,
    HAVE_RIGHT = 8,
};

static void random_offset_for_edge(int *const x, int *const y,
                                   const int bw, const int bh,
                                   int *const iw, int *const ih,
                                   const enum EdgeFlags edge)
{
#define set_off(edge1, edge2, pos, dim) \
    *i##dim = edge & (HAVE_##edge1 | HAVE_##edge2) ? 160 : 1 + (rnd() % (b##dim - 2)); \
    switch (edge & (HAVE_##edge1 | HAVE_##edge2)) { \
    case HAVE_##edge1 | HAVE_##edge2: \
        assert(b##dim <= *i##dim); \
        *pos = rnd() % (*i##dim - b##dim + 1); \
        break; \
    case HAVE_##edge1: \
        *pos = (*i##dim - b##dim) + 1 + (rnd() % (b##dim - 1)); \
        break; \
    case HAVE_##edge2: \
        *pos = -(1 + (rnd() % (b##dim - 1))); \
        break; \
    case 0: \
        assert(b##dim - 1 > *i##dim); \
        *pos = -(1 + (rnd() % (b##dim - *i##dim - 1))); \
        break; \
    }
    set_off(LEFT, RIGHT, x, w);
    set_off(TOP, BOTTOM, y, h);
}
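
/* For each axis, set_off() picks a source size and block position matching
 * the edge flags: with both edges available the block lies fully inside the
 * source; with only one it overhangs the opposite side; with neither the
 * block is larger than the source and overhangs both sides, so every output
 * pixel requires edge extension. */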

static void check_emuedge(Dav1dMCDSPContext *const c) {
    ALIGN_STK_64(pixel, c_dst, 135 * 192,);
    ALIGN_STK_64(pixel, a_dst, 135 * 192,);
    ALIGN_STK_64(pixel, src, 160 * 160,);

    for (int i = 0; i < 160 * 160; i++)
        src[i] = rnd() & ((1U << BITDEPTH) - 1);

    declare_func(void, intptr_t bw, intptr_t bh, intptr_t iw, intptr_t ih,
                 intptr_t x, intptr_t y,
                 pixel *dst, ptrdiff_t dst_stride,
                 const pixel *src, ptrdiff_t src_stride);

    int x, y, iw, ih;
    for (int w = 4; w <= 128; w <<= 1)
        if (check_func(c->emu_edge, "emu_edge_w%d_%dbpc", w, BITDEPTH)) {
            for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1) {
                // we skip 0xf, since it implies that we don't need emu_edge
                for (enum EdgeFlags edge = 0; edge < 0xf; edge++) {
                    const int bw = w + (rnd() & 7);
                    const int bh = h + (rnd() & 7);
                    random_offset_for_edge(&x, &y, bw, bh, &iw, &ih, edge);
                    call_ref(bw, bh, iw, ih, x, y,
                             c_dst, 192 * sizeof(pixel), src, 160 * sizeof(pixel));
                    call_new(bw, bh, iw, ih, x, y,
                             a_dst, 192 * sizeof(pixel), src, 160 * sizeof(pixel));
                    checkasm_check_pixel(c_dst, 192 * sizeof(pixel),
                                         a_dst, 192 * sizeof(pixel),
                                         bw, bh, "dst");
                }
            }
            for (enum EdgeFlags edge = 1; edge < 0xf; edge <<= 1) {
                random_offset_for_edge(&x, &y, w + 7, w + 7, &iw, &ih, edge);
                bench_new(w + 7, w + 7, iw, ih, x, y,
                          a_dst, 192 * sizeof(pixel), src, 160 * sizeof(pixel));
            }
        }
    report("emu_edge");
}

static int get_upscale_x0(const int in_w, const int out_w, const int step) {
    const int err = out_w * step - (in_w << 14);
    const int x0 = (-((out_w - in_w) << 13) + (out_w >> 1)) / out_w + 128 - (err >> 1);
    return x0 & 0x3fff;
}
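
/* dx and mx0 are 14-bit fixed-point quantities: dx is the source step per
 * destination pixel (see scale_fac() below, dx = round((src_w << 14) / dst_w)),
 * and get_upscale_x0() centres the sampling grid, folding half the
 * accumulated rounding error err = dst_w * dx - (src_w << 14) into the
 * initial subpel offset and keeping only its fractional 14 bits. */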

static void check_resize(Dav1dMCDSPContext *const c) {
    ALIGN_STK_64(pixel, c_dst, 1024 * 64,);
    ALIGN_STK_64(pixel, a_dst, 1024 * 64,);
    ALIGN_STK_64(pixel, src, 512 * 64,);

    const int height = 64;
    const int max_src_width = 512;
    const ptrdiff_t dst_stride = 1024 * sizeof(pixel);
    const ptrdiff_t src_stride = 512 * sizeof(pixel);

    declare_func(void, pixel *dst, ptrdiff_t dst_stride,
                 const pixel *src, ptrdiff_t src_stride,
                 int dst_w, int src_w, int h, int dx, int mx0
                 HIGHBD_DECL_SUFFIX);

    if (check_func(c->resize, "resize_%dbpc", BITDEPTH)) {
#if BITDEPTH == 16
        const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
        const int bitdepth_max = 0xff;
#endif

        for (int i = 0; i < max_src_width * height; i++)
            src[i] = rnd() & bitdepth_max;

        const int w_den = 9 + (rnd() & 7);
        const int src_w = 16 + (rnd() % (max_src_width - 16 + 1));
        const int dst_w = w_den * src_w >> 3;
#define scale_fac(ref_sz, this_sz) \
    ((((ref_sz) << 14) + ((this_sz) >> 1)) / (this_sz))
        const int dx = scale_fac(src_w, dst_w);
#undef scale_fac
        const int mx0 = get_upscale_x0(src_w, dst_w, dx);

        call_ref(c_dst, dst_stride, src, src_stride,
                 dst_w, height, src_w, dx, mx0 HIGHBD_TAIL_SUFFIX);
        call_new(a_dst, dst_stride, src, src_stride,
                 dst_w, height, src_w, dx, mx0 HIGHBD_TAIL_SUFFIX);
        checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
                             dst_w, height, "dst");

        bench_new(a_dst, dst_stride, src, src_stride,
                  512, height, 512 * 8 / w_den, dx, mx0 HIGHBD_TAIL_SUFFIX);
    }

    report("resize");
}

void bitfn(checkasm_check_mc)(void) {
    Dav1dMCDSPContext c;
    bitfn(dav1d_mc_dsp_init)(&c);

    check_mc(&c);
    check_mct(&c);
    check_mc_scaled(&c);
    check_mct_scaled(&c);
    check_avg(&c);
    check_w_avg(&c);
    check_mask(&c);
    check_w_mask(&c);
    check_blend(&c);
    check_blend_v(&c);
    check_blend_h(&c);
    check_warp8x8(&c);
    check_warp8x8t(&c);
    check_emuedge(&c);
    check_resize(&c);
}
@ -0,0 +1,293 @@
/*
 * Copyright © 2019, VideoLAN and dav1d authors
 * Copyright © 2019, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "tests/checkasm/checkasm.h"

#include "src/cpu.h"
#include "src/msac.h"

#include <stdio.h>
#include <string.h>

#define BUF_SIZE 8192

/* The normal code doesn't use function pointers */
typedef unsigned (*decode_symbol_adapt_fn)(MsacContext *s, uint16_t *cdf,
                                           size_t n_symbols);
typedef unsigned (*decode_adapt_fn)(MsacContext *s, uint16_t *cdf);
typedef unsigned (*decode_bool_equi_fn)(MsacContext *s);
typedef unsigned (*decode_bool_fn)(MsacContext *s, unsigned f);

typedef struct {
    decode_symbol_adapt_fn decode_symbol_adapt4;
    decode_symbol_adapt_fn decode_symbol_adapt8;
    decode_symbol_adapt_fn decode_symbol_adapt16;
    decode_adapt_fn decode_bool_adapt;
    decode_bool_equi_fn decode_bool_equi;
    decode_bool_fn decode_bool;
    decode_adapt_fn decode_hi_tok;
} MsacDSPContext;

static void randomize_cdf(uint16_t *const cdf, const int n) {
    int i;
    for (i = 15; i > n; i--)
        cdf[i] = rnd(); // padding
    cdf[i] = 0;         // count
    do {
        cdf[i - 1] = cdf[i] + rnd() % (32768 - cdf[i] - i) + 1;
    } while (--i > 0);
}
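
For orientation: dav1d stores CDFs as monotonically decreasing 15-bit values, and the slot after the last threshold doubles as the adaptation counter. An illustration of my own (values invented) of what randomize_cdf(cdf, 3) can produce:

#include <stdint.h>

/* Illustrative values only: one possible result of randomize_cdf(cdf, 3). */
static const uint16_t example_cdf[16] = {
    27311, 14090, 522, /* cdf[0] > cdf[1] > cdf[2] > 0, all below 32768 */
    0,                 /* cdf[3]: starts at 0, doubles as the adaptation count */
    /* cdf[4] .. cdf[15]: random padding; results must not depend on it */
};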

/* memcmp() on structs can have weird behavior due to padding etc. */
static int msac_cmp(const MsacContext *const a, const MsacContext *const b) {
    return a->buf_pos != b->buf_pos || a->buf_end != b->buf_end ||
           a->dif != b->dif || a->rng != b->rng || a->cnt != b->cnt ||
           a->allow_update_cdf != b->allow_update_cdf;
}

static void msac_dump(unsigned c_res, unsigned a_res,
                      const MsacContext *const a, const MsacContext *const b,
                      const uint16_t *const cdf_a, const uint16_t *const cdf_b,
                      const int num_cdf)
{
    if (c_res != a_res)
        fprintf(stderr, "c_res %u a_res %u\n", c_res, a_res);
    if (a->buf_pos != b->buf_pos)
        fprintf(stderr, "buf_pos %p vs %p\n", a->buf_pos, b->buf_pos);
    if (a->buf_end != b->buf_end)
        fprintf(stderr, "buf_end %p vs %p\n", a->buf_end, b->buf_end);
    if (a->dif != b->dif)
        fprintf(stderr, "dif %zx vs %zx\n", a->dif, b->dif);
    if (a->rng != b->rng)
        fprintf(stderr, "rng %u vs %u\n", a->rng, b->rng);
    if (a->cnt != b->cnt)
        fprintf(stderr, "cnt %d vs %d\n", a->cnt, b->cnt);
    if (a->allow_update_cdf != b->allow_update_cdf)
        fprintf(stderr, "allow_update_cdf %d vs %d\n",
                a->allow_update_cdf, b->allow_update_cdf);
    if (num_cdf && memcmp(cdf_a, cdf_b, sizeof(*cdf_a) * (num_cdf + 1))) {
        fprintf(stderr, "cdf:\n");
        for (int i = 0; i <= num_cdf; i++)
            fprintf(stderr, " %5u", cdf_a[i]);
        fprintf(stderr, "\n");
        for (int i = 0; i <= num_cdf; i++)
            fprintf(stderr, " %5u", cdf_b[i]);
        fprintf(stderr, "\n");
        for (int i = 0; i <= num_cdf; i++)
            fprintf(stderr, " %c", cdf_a[i] != cdf_b[i] ? 'x' : '.');
        fprintf(stderr, "\n");
    }
}

#define CHECK_SYMBOL_ADAPT(n, n_min, n_max) do {                          \
    if (check_func(c->decode_symbol_adapt##n,                             \
                   "msac_decode_symbol_adapt%d", n))                      \
    {                                                                     \
        for (int cdf_update = 0; cdf_update <= 1; cdf_update++) {         \
            for (int ns = n_min; ns <= n_max; ns++) {                     \
                dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update);        \
                s_a = s_c;                                                \
                randomize_cdf(cdf[0], ns);                                \
                memcpy(cdf[1], cdf[0], sizeof(*cdf));                     \
                for (int i = 0; i < 64; i++) {                            \
                    unsigned c_res = call_ref(&s_c, cdf[0], ns);          \
                    unsigned a_res = call_new(&s_a, cdf[1], ns);          \
                    if (c_res != a_res || msac_cmp(&s_c, &s_a) ||         \
                        memcmp(cdf[0], cdf[1], sizeof(**cdf) * (ns + 1))) \
                    {                                                     \
                        if (fail())                                       \
                            msac_dump(c_res, a_res, &s_c, &s_a,           \
                                      cdf[0], cdf[1], ns);                \
                    }                                                     \
                }                                                         \
                if (cdf_update && ns == n - 1)                            \
                    bench_new(&s_a, cdf[1], ns);                          \
            }                                                             \
        }                                                                 \
    }                                                                     \
} while (0)

static void check_decode_symbol(MsacDSPContext *const c, uint8_t *const buf) {
    ALIGN_STK_32(uint16_t, cdf, 2, [16]);
    MsacContext s_c, s_a;

    declare_func(unsigned, MsacContext *s, uint16_t *cdf, size_t n_symbols);
    CHECK_SYMBOL_ADAPT( 4, 1, 4);
    CHECK_SYMBOL_ADAPT( 8, 1, 7);
    CHECK_SYMBOL_ADAPT(16, 3, 15);
    report("decode_symbol");
}

static void check_decode_bool_adapt(MsacDSPContext *const c, uint8_t *const buf) {
    MsacContext s_c, s_a;

    declare_func(unsigned, MsacContext *s, uint16_t *cdf);
    if (check_func(c->decode_bool_adapt, "msac_decode_bool_adapt")) {
        uint16_t cdf[2][2];
        for (int cdf_update = 0; cdf_update <= 1; cdf_update++) {
            dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update);
            s_a = s_c;
            cdf[0][0] = cdf[1][0] = rnd() % 32767 + 1;
            cdf[0][1] = cdf[1][1] = 0;
            for (int i = 0; i < 64; i++) {
                unsigned c_res = call_ref(&s_c, cdf[0]);
                unsigned a_res = call_new(&s_a, cdf[1]);
                if (c_res != a_res || msac_cmp(&s_c, &s_a) ||
                    memcmp(cdf[0], cdf[1], sizeof(*cdf)))
                {
                    if (fail())
                        msac_dump(c_res, a_res, &s_c, &s_a, cdf[0], cdf[1], 1);
                }
            }
            if (cdf_update)
                bench_new(&s_a, cdf[1]);
        }
    }
}

static void check_decode_bool_equi(MsacDSPContext *const c, uint8_t *const buf) {
    MsacContext s_c, s_a;

    declare_func(unsigned, MsacContext *s);
    if (check_func(c->decode_bool_equi, "msac_decode_bool_equi")) {
        dav1d_msac_init(&s_c, buf, BUF_SIZE, 1);
        s_a = s_c;
        for (int i = 0; i < 64; i++) {
            unsigned c_res = call_ref(&s_c);
            unsigned a_res = call_new(&s_a);
            if (c_res != a_res || msac_cmp(&s_c, &s_a)) {
                if (fail())
                    msac_dump(c_res, a_res, &s_c, &s_a, NULL, NULL, 0);
            }
        }
        bench_new(&s_a);
    }
}

static void check_decode_bool(MsacDSPContext *const c, uint8_t *const buf) {
    MsacContext s_c, s_a;

    declare_func(unsigned, MsacContext *s, unsigned f);
    if (check_func(c->decode_bool, "msac_decode_bool")) {
        dav1d_msac_init(&s_c, buf, BUF_SIZE, 1);
        s_a = s_c;
        for (int i = 0; i < 64; i++) {
            const unsigned f = rnd() & 0x7fff;
            unsigned c_res = call_ref(&s_c, f);
            unsigned a_res = call_new(&s_a, f);
            if (c_res != a_res || msac_cmp(&s_c, &s_a)) {
                if (fail())
                    msac_dump(c_res, a_res, &s_c, &s_a, NULL, NULL, 0);
            }
        }
        bench_new(&s_a, 16384);
    }
}
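
One detail worth spelling out: f is a 15-bit fixed-point probability (the correctness loop draws it as rnd() & 0x7fff), so the bench value 16384 sits exactly at the midpoint, the same distribution that decode_bool_equi hard-codes. A trivial illustration of mine, not part of the file:

#include <stdio.h>

int main(void) {
    const unsigned f = 16384;          /* the bench_new() argument above */
    printf("p = %.3f\n", f / 32768.0); /* prints p = 0.500 */
    return 0;
}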

static void check_decode_bool_funcs(MsacDSPContext *const c, uint8_t *const buf) {
    check_decode_bool_adapt(c, buf);
    check_decode_bool_equi(c, buf);
    check_decode_bool(c, buf);
    report("decode_bool");
}

static void check_decode_hi_tok(MsacDSPContext *const c, uint8_t *const buf) {
    ALIGN_STK_16(uint16_t, cdf, 2, [16]);
    MsacContext s_c, s_a;

    declare_func(unsigned, MsacContext *s, uint16_t *cdf);
    if (check_func(c->decode_hi_tok, "msac_decode_hi_tok")) {
        for (int cdf_update = 0; cdf_update <= 1; cdf_update++) {
            dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update);
            s_a = s_c;
            randomize_cdf(cdf[0], 3);
            memcpy(cdf[1], cdf[0], sizeof(*cdf));
            for (int i = 0; i < 64; i++) {
                unsigned c_res = call_ref(&s_c, cdf[0]);
                unsigned a_res = call_new(&s_a, cdf[1]);
                if (c_res != a_res || msac_cmp(&s_c, &s_a) ||
                    memcmp(cdf[0], cdf[1], sizeof(*cdf)))
                {
                    if (fail())
                        msac_dump(c_res, a_res, &s_c, &s_a, cdf[0], cdf[1], 3);
                    break;
                }
            }
            if (cdf_update)
                bench_new(&s_a, cdf[1]);
        }
    }
    report("decode_hi_tok");
}

void checkasm_check_msac(void) {
    MsacDSPContext c;
    c.decode_symbol_adapt4  = dav1d_msac_decode_symbol_adapt_c;
    c.decode_symbol_adapt8  = dav1d_msac_decode_symbol_adapt_c;
    c.decode_symbol_adapt16 = dav1d_msac_decode_symbol_adapt_c;
    c.decode_bool_adapt     = dav1d_msac_decode_bool_adapt_c;
    c.decode_bool_equi      = dav1d_msac_decode_bool_equi_c;
    c.decode_bool           = dav1d_msac_decode_bool_c;
    c.decode_hi_tok         = dav1d_msac_decode_hi_tok_c;

#if (ARCH_AARCH64 || ARCH_ARM) && HAVE_ASM
    if (dav1d_get_cpu_flags() & DAV1D_ARM_CPU_FLAG_NEON) {
        c.decode_symbol_adapt4  = dav1d_msac_decode_symbol_adapt4_neon;
        c.decode_symbol_adapt8  = dav1d_msac_decode_symbol_adapt8_neon;
        c.decode_symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_neon;
        c.decode_bool_adapt     = dav1d_msac_decode_bool_adapt_neon;
        c.decode_bool_equi      = dav1d_msac_decode_bool_equi_neon;
        c.decode_bool           = dav1d_msac_decode_bool_neon;
        c.decode_hi_tok         = dav1d_msac_decode_hi_tok_neon;
    }
#elif ARCH_X86 && HAVE_ASM
    if (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_SSE2) {
        c.decode_symbol_adapt4  = dav1d_msac_decode_symbol_adapt4_sse2;
        c.decode_symbol_adapt8  = dav1d_msac_decode_symbol_adapt8_sse2;
        c.decode_symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2;
        c.decode_bool_adapt     = dav1d_msac_decode_bool_adapt_sse2;
        c.decode_bool_equi      = dav1d_msac_decode_bool_equi_sse2;
        c.decode_bool           = dav1d_msac_decode_bool_sse2;
        c.decode_hi_tok         = dav1d_msac_decode_hi_tok_sse2;
    }

#if ARCH_X86_64
    if (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_AVX2) {
        c.decode_symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2;
    }
#endif
#endif

    uint8_t buf[BUF_SIZE];
    for (int i = 0; i < BUF_SIZE; i++)
        buf[i] = rnd();

    check_decode_symbol(&c, buf);
    check_decode_bool_funcs(&c, buf);
    check_decode_hi_tok(&c, buf);
}
@ -0,0 +1,78 @@
/*
 * Copyright © 2021, VideoLAN and dav1d authors
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "tests/checkasm/checkasm.h"
#include "src/refmvs.h"

static void check_splat_mv(const Dav1dRefmvsDSPContext *const c) {
    ALIGN_STK_64(refmvs_block, c_buf, 32 * 32,);
    ALIGN_STK_64(refmvs_block, a_buf, 32 * 32,);
    refmvs_block *c_dst[32];
    refmvs_block *a_dst[32];
    const size_t stride = 32 * sizeof(refmvs_block);

    for (int i = 0; i < 32; i++) {
        c_dst[i] = c_buf + 32 * i;
        a_dst[i] = a_buf + 32 * i;
    }

    declare_func(void, refmvs_block **rr, const refmvs_block *rmv,
                 int bx4, int bw4, int bh4);

    for (int w = 1; w <= 32; w *= 2) {
        if (check_func(c->splat_mv, "splat_mv_w%d", w)) {
            const int h_min = imax(w / 4, 1);
            const int h_max = imin(w * 4, 32);
            const int w_uint32 = w * sizeof(refmvs_block) / sizeof(uint32_t);
            for (int h = h_min; h <= h_max; h *= 2) {
                const int offset = (w * rnd()) & 31;
                union {
                    refmvs_block rmv;
                    uint32_t u32[3];
                } ALIGN(tmp, 16);
                tmp.u32[0] = rnd();
                tmp.u32[1] = rnd();
                tmp.u32[2] = rnd();

                call_ref(c_dst, &tmp.rmv, offset, w, h);
                call_new(a_dst, &tmp.rmv, offset, w, h);
                checkasm_check(uint32_t, (uint32_t*)(c_buf + offset), stride,
                               (uint32_t*)(a_buf + offset), stride,
                               w_uint32, h, "dst");

                bench_new(a_dst, &tmp.rmv, 0, w, h);
            }
        }
    }
    report("splat_mv");
}
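
The union above relies on refmvs_block packing down to exactly three 32-bit words. A hypothetical compile-time guard for that assumption (not in the patch; dav1d builds as C99, hence the negative-array-size trick rather than static_assert):

#include <stdint.h>
#include "src/refmvs.h"

/* Fails to compile if refmvs_block is not exactly 12 bytes. */
typedef char assert_refmvs_block_is_three_words
    [sizeof(refmvs_block) == 3 * sizeof(uint32_t) ? 1 : -1];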

void checkasm_check_refmvs(void) {
    Dav1dRefmvsDSPContext c;
    dav1d_refmvs_dsp_init(&c);

    check_splat_mv(&c);
}
@ -0,0 +1,370 @@
; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%undef private_prefix
%define private_prefix checkasm
%include "ext/x86/x86inc.asm"

SECTION_RODATA 16

%if ARCH_X86_64
; just random numbers to reduce the chance of incidental match
%if WIN64
x6:  dq 0x1a1b2550a612b48c,0x79445c159ce79064
x7:  dq 0x2eed899d5a28ddcd,0x86b2536fcd8cf636
x8:  dq 0xb0856806085e7943,0x3f2bf84fc0fcca4e
x9:  dq 0xacbd382dcf5b8de2,0xd229e1f5b281303f
x10: dq 0x71aeaff20b095fd9,0xab63e2e11fa38ed9
x11: dq 0x89b0c0765892729a,0x77d410d5c42c882d
x12: dq 0xc45ea11a955d8dd5,0x24b3c1d2a024048b
x13: dq 0x2e8ec680de14b47c,0xdd7b8919edd42786
x14: dq 0x135ce6888fa02cbf,0x11e53e2b2ac655ef
x15: dq 0x011ff554472a7a10,0x6de8f4c914c334d5
n7:  dq 0x21f86d66c8ca00ce
n8:  dq 0x75b6ba21077c48ad
%endif
n9:  dq 0xed56bb2dcb3c7736
n10: dq 0x8bda43d3fd1a7e06
n11: dq 0xb64a9c9e5d318408
n12: dq 0xdf9a54b303f1d3a3
n13: dq 0x4a75479abd64e097
n14: dq 0x249214109d5d1c88
%endif

errmsg_stack: db "stack corruption", 0

SECTION .text

cextern fail_func

; max number of args used by any asm function.
; (max_args % 4) must equal 3 for stack alignment
%define max_args 15

%if ARCH_X86_64

;-----------------------------------------------------------------------------
; int checkasm_stack_clobber(uint64_t clobber, ...)
;-----------------------------------------------------------------------------
cglobal stack_clobber, 1, 2
    ; Clobber the stack with junk below the stack pointer
    %define argsize (max_args+6)*8
    SUB rsp, argsize
    mov r1, argsize-8
.loop:
    mov [rsp+r1], r0
    sub r1, 8
    jge .loop
    ADD rsp, argsize
    RET

%if WIN64
    %assign free_regs 7
    %define stack_param rsp+32 ; shadow space
    %define num_stack_params rsp+stack_offset+22*8
    DECLARE_REG_TMP 4
%else
    %assign free_regs 9
    %define stack_param rsp
    %define num_stack_params rsp+stack_offset+16*8
    DECLARE_REG_TMP 7
%endif

;-----------------------------------------------------------------------------
; void checkasm_checked_call(void *func, ...)
;-----------------------------------------------------------------------------
INIT_XMM
cglobal checked_call, 2, 15, 16, max_args*8+64+8
    mov t0, r0

    ; All arguments have been pushed on the stack instead of registers in
    ; order to test for incorrect assumptions that 32-bit ints are
    ; zero-extended to 64-bit.
    mov r0, r6mp
    mov r1, r7mp
    mov r2, r8mp
    mov r3, r9mp
%if UNIX64
    mov r4, r10mp
    mov r5, r11mp
%else ; WIN64
    ; Move possible floating-point arguments to the correct registers
    movq m0, r0
    movq m1, r1
    movq m2, r2
    movq m3, r3

    %assign i 6
    %rep 16-6
        mova m %+ i, [x %+ i]
        %assign i i+1
    %endrep
%endif

    ; write stack canaries to the area above parameters passed on the stack
    mov r9d, [num_stack_params]
    mov r8, [rsp+stack_offset] ; return address
    not r8
%assign i 0
%rep 8 ; 64 bytes
    mov [stack_param+(r9+i)*8], r8
    %assign i i+1
%endrep
    dec r9d
    jl .stack_setup_done ; no stack parameters
.copy_stack_parameter:
    mov r8, [stack_param+stack_offset+7*8+r9*8]
    mov [stack_param+r9*8], r8
    dec r9d
    jge .copy_stack_parameter
.stack_setup_done:

%assign i 14
%rep 15-free_regs
    mov r %+ i, [n %+ i]
    %assign i i-1
%endrep
    call t0

    ; check for stack corruption
    mov r0d, [num_stack_params]
    mov r3, [rsp+stack_offset]
    mov r4, [stack_param+r0*8]
    not r3
    xor r4, r3
%assign i 1
%rep 6
    mov r5, [stack_param+(r0+i)*8]
    xor r5, r3
    or  r4, r5
    %assign i i+1
%endrep
    xor r3, [stack_param+(r0+7)*8]
    lea r0, [errmsg_stack]
    or  r4, r3
    jnz .fail

    ; check for failure to preserve registers
%assign i 14
%rep 15-free_regs
    cmp r %+ i, [r0-errmsg_stack+n %+ i]
    setne r4b
    lea r3d, [r4+r3*2]
    %assign i i-1
%endrep
%if WIN64
    lea r0, [rsp+60] ; account for shadow space
    mov r5, r0
    test r3d, r3d
    jz .gpr_ok
%else
    test r3d, r3d
    jz .ok
    lea r0, [rsp+28]
%endif
%assign i free_regs
%rep 15-free_regs
%if i < 10
    mov dword [r0], " r0" + (i << 16)
    lea r4, [r0+3]
%else
    mov dword [r0], " r10" + ((i - 10) << 24)
    lea r4, [r0+4]
%endif
    test r3b, 1 << (i - free_regs)
    cmovnz r0, r4
    %assign i i+1
%endrep
%if WIN64 ; xmm registers
.gpr_ok:
    %assign i 6
    %rep 16-6
        pxor m %+ i, [x %+ i]
        %assign i i+1
    %endrep
    packsswb  m6, m7
    packsswb  m8, m9
    packsswb m10, m11
    packsswb m12, m13
    packsswb m14, m15
    packsswb  m6, m6
    packsswb  m8, m10
    packsswb m12, m14
    packsswb  m6, m6
    packsswb  m8, m12
    packsswb  m6, m8
    pxor      m7, m7
    pcmpeqb   m6, m7
    pmovmskb r3d, m6
    cmp      r3d, 0xffff
    je .xmm_ok
    mov r7d, " xmm"
    %assign i 6
    %rep 16-6
        mov [r0+0], r7d
        %if i < 10
            mov byte [r0+4], "0" + i
            lea r4, [r0+5]
        %else
            mov word [r0+4], "10" + ((i - 10) << 8)
            lea r4, [r0+6]
        %endif
        test r3d, 1 << i
        cmovz r0, r4
        %assign i i+1
    %endrep
.xmm_ok:
    cmp r0, r5
    je .ok
    mov byte [r0], 0
    lea r0, [r5-28]
%else
    mov byte [r0], 0
    mov r0, rsp
%endif
    mov dword [r0+ 0], "fail"
    mov dword [r0+ 4], "ed t"
    mov dword [r0+ 8], "o pr"
    mov dword [r0+12], "eser"
    mov dword [r0+16], "ve r"
    mov dword [r0+20], "egis"
    mov dword [r0+24], "ter:"
.fail:
    ; Call fail_func() with a descriptive message to mark it as a failure.
    ; Save the return value located in rdx:rax first to prevent clobbering.
    mov r9, rax
    mov r10, rdx
    xor eax, eax
    call fail_func
    mov rdx, r10
    mov rax, r9
.ok:
    RET
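
To restate the canary scheme in portable terms: the words just above the stack parameters are filled with a value derived from the inverted return address, and any mismatch after the call is reported as corruption. A self-contained C analogue of the same idea (all names invented for illustration):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Deliberately buggy callee: writes one byte past its argument block. */
static void overwrites_its_args(uint8_t *args, size_t n) {
    memset(args, 0, n + 1);
}

int main(void) {
    uint8_t buf[24];
    const uint8_t canary = 0xA5;
    memset(buf, canary, sizeof(buf)); /* canary fill above the args */
    overwrites_its_args(buf, 16);     /* args occupy buf[0..15] */
    for (size_t i = 16; i < sizeof(buf); i++)
        if (buf[i] != canary)
            return puts("stack corruption"), 1;
    return puts("ok"), 0;
}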

; trigger a warmup of vector units
%macro WARMUP 0
cglobal warmup, 0, 0
    xorps m0, m0
    mulps m0, m0
    RET
%endmacro

INIT_YMM avx2
WARMUP
INIT_ZMM avx512
WARMUP

%else

; just random numbers to reduce the chance of incidental match
%assign n3 0x6549315c
%assign n4 0xe02f3e23
%assign n5 0xb78d0d1d
%assign n6 0x33627ba7

;-----------------------------------------------------------------------------
; void checkasm_checked_call(void *func, ...)
;-----------------------------------------------------------------------------
cglobal checked_call, 1, 7
    mov r3, [esp+stack_offset]      ; return address
    mov r1, [esp+stack_offset+17*4] ; num_stack_params
    mov r2, 27
    not r3
    sub r2, r1
.push_canary:
    push r3
    dec r2
    jg .push_canary
.push_parameter:
    push dword [esp+32*4]
    dec r1
    jg .push_parameter
    mov r3, n3
    mov r4, n4
    mov r5, n5
    mov r6, n6
    call r0

    ; check for failure to preserve registers
    cmp r3, n3
    setne r3h
    cmp r4, n4
    setne r3b
    shl r3d, 16
    cmp r5, n5
    setne r3h
    cmp r6, n6
    setne r3b
    test r3, r3
    jz .gpr_ok
    lea r1, [esp+16]
    mov dword [r1+ 0], "fail"
    mov dword [r1+ 4], "ed t"
    mov dword [r1+ 8], "o pr"
    mov dword [r1+12], "eser"
    mov dword [r1+16], "ve r"
    mov dword [r1+20], "egis"
    mov dword [r1+24], "ter:"
    lea r4, [r1+28]
%assign i 3
%rep 4
    mov dword [r4], " r0" + (i << 16)
    lea r5, [r4+3]
    test r3, 1 << ((6 - i) * 8)
    cmovnz r4, r5
    %assign i i+1
%endrep
    mov byte [r4], 0
    jmp .fail
.gpr_ok:
    ; check for stack corruption
    mov r3, [esp+48*4] ; num_stack_params
    mov r6, [esp+31*4] ; return address
    mov r4, [esp+r3*4]
    sub r3, 26
    not r6
    xor r4, r6
.check_canary:
    mov r5, [esp+(r3+27)*4]
    xor r5, r6
    or  r4, r5
    inc r3
    jl .check_canary
    test r4, r4
    jz .ok
    LEA r1, errmsg_stack
.fail:
    mov r3, eax
    mov r4, edx
    mov [esp], r1
    call fail_func
    mov edx, r4
    mov eax, r3
.ok:
    add esp, 27*4
    RET

%endif ; ARCH_X86_64
@ -0,0 +1,33 @@
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include DAV1D_TEST_HEADER

int main(void)
{
    return 0;
}
@ -106,23 +106,23 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
    unsigned h = djb_xor(ptr, 32);
    unsigned seed = h;
    unsigned probability = h > (RAND_MAX >> 5) ? RAND_MAX >> 5 : h;
    int n_frame_threads = (h & 0xf) + 1;
    int n_tile_threads = ((h >> 4) & 0x7) + 1;
    if (n_frame_threads > 5) n_frame_threads = 1;
    if (n_tile_threads > 3) n_tile_threads = 1;
    int max_frame_delay = (h & 0xf) + 1;
    int n_threads = ((h >> 4) & 0x7) + 1;
    if (max_frame_delay > 5) max_frame_delay = 1;
    if (n_threads > 3) n_threads = 1;
#endif
    ptr += 32; // skip ivf header

    dav1d_default_settings(&settings);

#ifdef DAV1D_MT_FUZZING
    settings.n_frame_threads = settings.n_tile_threads = 2;
    settings.max_frame_delay = settings.n_threads = 4;
#elif defined(DAV1D_ALLOC_FAIL)
    settings.n_frame_threads = n_frame_threads;
    settings.n_tile_threads = n_tile_threads;
    settings.max_frame_delay = max_frame_delay;
    settings.n_threads = n_threads;
    dav1d_setup_alloc_fail(seed, probability);
#else
    settings.n_frame_threads = settings.n_tile_threads = 1;
    settings.max_frame_delay = settings.n_threads = 1;
#endif
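
The renames reflect dav1d's new unified threading model: the separate frame and tile pools are replaced by one worker pool sized by n_threads, while max_frame_delay independently bounds how many frames may be in flight. A hedged configuration sketch (4 and 1 are arbitrary example values):

#include <dav1d/dav1d.h>

/* Sketch of the new knobs introduced by this dav1d version. */
static void configure_threading(Dav1dSettings *const settings) {
    dav1d_default_settings(settings);
    settings->n_threads = 4;       /* single pool for frame and tile work */
    settings->max_frame_delay = 1; /* bound decode latency in frames */
}
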
#if defined(DAV1D_FUZZ_MAX_SIZE)
    settings.frame_size_limit = DAV1D_FUZZ_MAX_SIZE;
@ -0,0 +1,152 @@
# Copyright © 2018, VideoLAN and dav1d authors
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#
# Build definition for the dav1d tests
#

# Leave subdir if tests are disabled
if not get_option('enable_tests')
    subdir_done()
endif

if is_asm_enabled
    checkasm_sources = files(
        'checkasm/checkasm.c',
        'checkasm/msac.c',
        'checkasm/refmvs.c',
    )

    checkasm_tmpl_sources = files(
        'checkasm/cdef.c',
        'checkasm/filmgrain.c',
        'checkasm/ipred.c',
        'checkasm/itx.c',
        'checkasm/loopfilter.c',
        'checkasm/looprestoration.c',
        'checkasm/mc.c',
    )

    checkasm_bitdepth_objs = []
    foreach bitdepth : dav1d_bitdepths
        checkasm_bitdepth_lib = static_library(
            'checkasm_bitdepth_@0@'.format(bitdepth),
            checkasm_tmpl_sources,
            include_directories: dav1d_inc_dirs,
            c_args: ['-DBITDEPTH=@0@'.format(bitdepth), stackalign_flag],
            install: false,
            build_by_default: false,
        )
        checkasm_bitdepth_objs += checkasm_bitdepth_lib.extract_all_objects(recursive: true)
    endforeach

    checkasm_asm_objs = []
    checkasm_asm_sources = []
    if host_machine.cpu_family() == 'aarch64' or host_machine.cpu() == 'arm64'
        checkasm_asm_sources += files('checkasm/arm/checkasm_64.S')
    elif host_machine.cpu_family().startswith('arm')
        checkasm_asm_sources += files('checkasm/arm/checkasm_32.S')
    elif host_machine.cpu_family().startswith('x86')
        checkasm_asm_objs += nasm_gen.process(files('checkasm/x86/checkasm.asm'))
    endif

    if use_gaspp
        checkasm_asm_objs += gaspp_gen.process(checkasm_asm_sources)
    else
        checkasm_sources += checkasm_asm_sources
    endif

    checkasm = executable('checkasm',
        checkasm_sources,
        checkasm_asm_objs,

        objects: [
            checkasm_bitdepth_objs,
            libdav1d.extract_all_objects(recursive: true),
        ],

        include_directories: dav1d_inc_dirs,
        c_args: [stackalign_flag, stackrealign_flag],
        build_by_default: false,
        dependencies : [
            thread_dependency,
            rt_dependency,
            libdl_dependency,
            libm_dependency,
        ],
    )

    test('checkasm', checkasm, suite: 'checkasm', timeout: 180, is_parallel: false)
    benchmark('checkasm', checkasm, suite: 'checkasm', timeout: 3600, args: '--bench')
endif

c99_extension_flag = cc.first_supported_argument(
    '-Werror=c11-extensions',
    '-Werror=c99-c11-compat',
    '-Wc11-extensions',
    '-Wc99-c11-compat',
)

# dav1d_api_headers
foreach header : dav1d_api_headers
    target = header + '_test'

    header_test_exe = executable(target,
        'header_test.c',
        include_directories: dav1d_inc_dirs,
        c_args: ['-DDAV1D_TEST_HEADER="@0@"'.format(header), c99_extension_flag],
        build_by_default: true
    )

    test(target, header_test_exe, suite: 'headers')
endforeach


# fuzzing binaries
subdir('libfuzzer')

# seek stress test binary, depends on dav1d cli tool
if get_option('enable_tools')
    seek_stress_sources = files('seek_stress.c')
    seek_stress = executable('seek_stress',
        seek_stress_sources, rev_target,
        objects: [
            dav1d.extract_objects('dav1d_cli_parse.c'),
            dav1d_input_objs.extract_objects('input/input.c', 'input/ivf.c'),
        ],
        include_directories: [dav1d_inc_dirs, include_directories('../tools')],
        link_with: libdav1d,
        dependencies: [
            thread_dependency,
            rt_dependency,
            getopt_dependency,
            libm_dependency,
        ],
    )
endif

# Include dav1d test data repository with additional tests
if get_option('testdata_tests')
    subdir('dav1d-test-data')
endif