Bug 1620471 - Update libdav1d to 0.6.0 r=dminor

Differential Revision: https://phabricator.services.mozilla.com/D67511

--HG--
extra : moz-landing-system : lando
Michael Froman 2020-03-20 15:17:26 +00:00
Parent 05300c66f1
Commit 4ef4d2b585
104 changed files with 18352 additions and 4698 deletions


@ -2,7 +2,7 @@ This directory contains build files for dav1d. The actual library
source is in $TOPSRCDIR/third_party/dav1d/
Any patches or additional configuration to be applied to the
upstream source should be kept here in the media/libaom
upstream source should be kept here in the media/libdav1d
directory.
To update the library source and build config files, execute
@ -13,8 +13,35 @@ To update to a specific upstream git tag or commit, use
./mach vendor dav1d -r <commit>
The upstream git repository is https://aomedia.googlesource.com/aom
The upstream git repository is https://code.videolan.org/videolan/dav1d
To update to a fork, use
./mach vendor dav1d --repo <repository url> [-r <commit>]
The rough steps are:
- Execute ./mach vendor dav1d -r {tag-name} # ex: ./mach vendor dav1d -r 0.6.0
- Update moz.build with any added or removed files, checking against
third_party/dav1d/src/meson.build (confirm with the diff; note the
empty .asm file used on x86_64)
- Build a stand-alone libdav1d following the steps here:
https://code.videolan.org/videolan/dav1d#compile
- Copy vcs_version.h from the local build/include/vcs_version.h
to media/libdav1d/vcs_version.h
- Copy version.h from the local build/include/dav1d/version.h to
media/libdav1d/version.h
- Update dav1d.rc:
- update the API_VERSION_NUMBER and API_VERSION_NUMBER_STR defines to
match the 'dav1d_soname_version' field in
third_party/dav1d/meson.build.
- update the PROJECT_VERSION_NUMBER and PROJECT_VERSION_NUMBER_STR
defines to match the new project version from the git tag (or from
the project version found in third_party/dav1d/meson.build).
- Add new options, if any, in moz.build or config.h
Tips:
- If you see build failures in build-linux64-base-toolchains (or
similar jobs), dav1d may now require a higher minimum nasm version
than our base toolchains currently support. Filing a bug to bump the
minimum nasm version will probably be necessary.
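
Condensed into a rough shell sketch of the steps above (hedged: the
0.6.0 tag, the scratch build directory and the relative paths are
illustrative and assume you start at the top of the source tree):

  # Vendor the new upstream revision into third_party/dav1d
  ./mach vendor dav1d -r 0.6.0

  # Stand-alone dav1d build, only to generate the version headers
  cd third_party/dav1d
  mkdir -p build && cd build
  meson .. && ninja

  # Copy the generated headers into media/libdav1d
  cp include/vcs_version.h ../../../media/libdav1d/vcs_version.h
  cp include/dav1d/version.h ../../../media/libdav1d/version.h

  # Then update media/libdav1d/moz.build, dav1d.rc and config.h by
  # hand, as described in the steps above.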


@ -88,11 +88,13 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
'../../../third_party/dav1d/src/x86/loopfilter.asm',
'../../../third_party/dav1d/src/x86/looprestoration.asm',
'../../../third_party/dav1d/src/x86/mc.asm',
'../../../third_party/dav1d/src/x86/msac_init.c',
]
SOURCES += [
'../../../third_party/dav1d/src/x86/cdef_sse.asm',
'../../../third_party/dav1d/src/x86/cpuid.asm',
'../../../third_party/dav1d/src/x86/film_grain_ssse3.asm',
'../../../third_party/dav1d/src/x86/ipred_ssse3.asm',
'../../../third_party/dav1d/src/x86/itx_ssse3.asm',
'../../../third_party/dav1d/src/x86/loopfilter_ssse3.asm',
@ -192,11 +194,18 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64':
if CONFIG['CPU_ARCH'] == 'aarch64':
SOURCES += [
'../../../third_party/dav1d/src/arm/64/cdef.S',
'../../../third_party/dav1d/src/arm/64/cdef16.S',
'../../../third_party/dav1d/src/arm/64/cdef_tmpl.S',
'../../../third_party/dav1d/src/arm/64/ipred.S',
'../../../third_party/dav1d/src/arm/64/itx.S',
'../../../third_party/dav1d/src/arm/64/loopfilter.S',
'../../../third_party/dav1d/src/arm/64/loopfilter16.S',
'../../../third_party/dav1d/src/arm/64/looprestoration.S',
'../../../third_party/dav1d/src/arm/64/looprestoration16.S',
'../../../third_party/dav1d/src/arm/64/looprestoration_common.S',
'../../../third_party/dav1d/src/arm/64/looprestoration_tmpl.S',
'../../../third_party/dav1d/src/arm/64/mc.S',
'../../../third_party/dav1d/src/arm/64/mc16.S',
'../../../third_party/dav1d/src/arm/64/msac.S',
]
elif CONFIG['CPU_ARCH'] == 'arm':


@ -1,7 +1,7 @@
#define API_VERSION_NUMBER 3,1,0,0
#define API_VERSION_NUMBER_STR "3.1.0"
#define PROJECT_VERSION_NUMBER 0,5,2,0
#define PROJECT_VERSION_NUMBER_STR "0.5.2"
#define API_VERSION_NUMBER 4,0,0,0
#define API_VERSION_NUMBER_STR "4.0.0"
#define PROJECT_VERSION_NUMBER 0,6,0,0
#define PROJECT_VERSION_NUMBER_STR "0.6.0"
#include <windows.h>
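
The four defines above track third_party/dav1d/meson.build; when
updating, one quick (purely illustrative) way to read off the expected
values is:

  grep -E "^[[:space:]]*version:|^dav1d_soname_version" third_party/dav1d/meson.build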


@ -79,6 +79,7 @@ SOURCES += [
'../../third_party/dav1d/src/dequant_tables.c',
'../../third_party/dav1d/src/getbits.c',
'../../third_party/dav1d/src/intra_edge.c',
'../../third_party/dav1d/src/itx_1d.c',
'../../third_party/dav1d/src/lf_mask.c',
'../../third_party/dav1d/src/log.c',
'../../third_party/dav1d/src/msac.c',
@ -167,6 +168,7 @@ EXPORTS.dav1d.src += [
'../../third_party/dav1d/src/ipred.h',
'../../third_party/dav1d/src/ipred_prepare.h',
'../../third_party/dav1d/src/itx.h',
'../../third_party/dav1d/src/itx_1d.h',
'../../third_party/dav1d/src/lf_apply.h',
'../../third_party/dav1d/src/loopfilter.h',
'../../third_party/dav1d/src/looprestoration.h',


@ -20,7 +20,7 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: commit 39667c751d427e447cbe8be783cfecd296659e24 (2019-12-02T18:19:06.000+01:00).
release: commit efd9e5518e0ed5114f8b4579debd7ee6dbede21f (2020-03-06T00:16:53.000+01:00).
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/


@ -1,2 +1,2 @@
/* auto-generated, do not edit */
#define DAV1D_VERSION "0.5.2-0-g39667c7"
#define DAV1D_VERSION "0.6.0-0-gefd9e55"


@ -27,8 +27,8 @@
#ifndef DAV1D_VERSION_H
#define DAV1D_VERSION_H
#define DAV1D_API_VERSION_MAJOR 3
#define DAV1D_API_VERSION_MINOR 1
#define DAV1D_API_VERSION_MAJOR 4
#define DAV1D_API_VERSION_MINOR 0
#define DAV1D_API_VERSION_PATCH 0
#endif /* DAV1D_VERSION_H */

22  third_party/dav1d/.gitlab-ci.yml  vendored

@ -38,7 +38,7 @@ build-debian:
image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
stage: build
tags:
- debian
- avx2
- amd64
script:
- meson build --buildtype release --werror
@ -173,7 +173,7 @@ build-win-arm64:
build-debian-aarch64:
stage: build
image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
image: registry.videolan.org/dav1d-debian-buster-aarch64:20200218203017
tags:
- aarch64
- debian
@ -184,7 +184,7 @@ build-debian-aarch64:
build-debian-aarch64-clang-5:
stage: build
image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
image: registry.videolan.org/dav1d-debian-buster-aarch64:20200218203017
tags:
- aarch64
- debian
@ -203,7 +203,7 @@ build-macos:
- cd build && meson test -v
build-debian-werror:
image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
image: registry.videolan.org/dav1d-debian-buster-aarch64:20200218203017
stage: build
tags:
- aarch64
@ -219,7 +219,7 @@ build-debian-armv7:
- armv7
- debian
script:
- meson build --buildtype debugoptimized --werror
- linux32 meson build --buildtype debugoptimized --werror
- ninja -C build
- cd build && meson test -v
@ -230,13 +230,13 @@ build-debian-armv7-clang-5:
- armv7
- debian
script:
- env CC=clang-5.0 CFLAGS='-integrated-as' meson build --buildtype release
- env CC=clang-5.0 CFLAGS='-integrated-as' linux32 meson build --buildtype release
- ninja -C build
- cd build && meson test -v
build-ubuntu-snap:
stage: build
image: registry.videolan.org:5000/dav1d-ubuntu-bionic:20190221154127
image: registry.videolan.org/dav1d-ubuntu-bionic:20200121182340
tags:
- debian
- amd64
@ -292,7 +292,7 @@ test-debian-unaligned-stack:
stage: test
needs: ["build-debian"]
tags:
- debian
- avx2
- amd64
cache:
key: testdata.git-20190215
@ -382,7 +382,7 @@ test-win64:
stage: test
needs: ["build-win64"]
tags:
- debian
- avx2
- amd64
cache:
key: testdata.git-20190215
@ -403,7 +403,7 @@ test-win64:
dependencies: []
test-debian-aarch64:
image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
image: registry.videolan.org/dav1d-debian-buster-aarch64:20200218203017
stage: test
needs: ["build-debian-aarch64"]
tags:
@ -464,7 +464,7 @@ test-debian-armv7-clang-5:
- test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master
- test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git
- git clone cache/dav1d-test-data.git tests/dav1d-test-data
- env CC=clang-5.0 CFLAGS='-integrated-as' meson build --buildtype release
- env CC=clang-5.0 CFLAGS='-integrated-as' linux32 meson build --buildtype release
-Dtestdata_tests=true
-Dlogging=false
- ninja -C build

33  third_party/dav1d/NEWS  vendored

@ -1,3 +1,26 @@
Changes for 0.6.0 'Gyrfalcon':
------------------------------
0.6.0 is a major release for dav1d:
- New ARM64 optimizations for the 10/12bit depth:
- mc_avg, mc_w_avg, mc_mask
- mc_put/mc_prep 8tap/bilin
- mc_warp_8x8
- mc_w_mask
- mc_blend
- wiener
- SGR
- loopfilter
- cdef
- New AVX-512 optimizations for prep_bilin, prep_8tap, cdef_filter, mc_avg/w_avg/mask
- New SSSE3 optimizations for film grain
- New AVX2 optimizations for msac_adapt16
- Fix rare mismatches against the reference decoder, notably because of clipping
- Improvements on ARM64 on msac, cdef and looprestoration optimizations
- Improvements on AVX2 optimizations for cdef_filter
- Improvements in the C version for itxfm, cdef_filter
Changes for 0.5.2 'Asiatic Cheetah':
------------------------------------
@ -32,7 +55,7 @@ and improving speed significantly:
- NEON optimizations for CDEF and warp on ARM32
- SSE2 optimizations for MSAC hi_tok decoding
- SSSE3 optimizations for deblocking loopfilters and warp_affine
- AVX-2 optimizations for film grain and ipred_z2
- AVX2 optimizations for film grain and ipred_z2
- SSE4 optimizations for warp_affine
- VSX optimizations for wiener
- Fix inverse transform overflows in x86 and NEON asm
@ -81,7 +104,7 @@ Changes for 0.2.2 (0.3.0-rc) 'Antelope':
-----------------------------
- Large improvement on MSAC decoding with SSE, bringing 4-6% speed increase
The impact is important on SSSE3, SSE4 and AVX-2 cpus
The impact is important on SSSE3, SSE4 and AVX2 cpus
- SSSE3 optimizations for all blocks size in itx
- SSSE3 optimizations for ipred_paeth and ipred_cfl (420, 422 and 444)
- Speed improvements on CDEF for SSE4 CPUs
@ -93,7 +116,7 @@ Changes for 0.2.1 'Antelope':
----------------------------
- SSSE3 optimization for cdef_dir
- AVX-2 improvements of the existing CDEF optimizations
- AVX2 improvements of the existing CDEF optimizations
- NEON improvements of the existing CDEF and wiener optimizations
- Clarification about the numbering/versionning scheme
@ -103,7 +126,7 @@ Changes for 0.2.0 'Antelope':
- ARM64 and ARM optimizations using NEON instructions
- SSSE3 optimizations for both 32 and 64bits
- More AVX-2 assembly, reaching almost completion
- More AVX2 assembly, reaching almost completion
- Fix installation of includes
- Rewrite inverse transforms to avoid overflows
- Snap packaging for Linux
@ -118,6 +141,6 @@ Initial release of dav1d, the fast and small AV1 decoder.
- Support for all features of the AV1 bitstream
- Support for all bitdepth, 8, 10 and 12bits
- Support for all chroma subsamplings 4:2:0, 4:2:2, 4:4:4 *and* grayscale
- Full acceleration for AVX-2 64bits processors, making it the fastest decoder
- Full acceleration for AVX2 64bits processors, making it the fastest decoder
- Partial acceleration for SSSE3 processors
- Partial acceleration for NEON processors

2  third_party/dav1d/README.md  vendored

@ -73,7 +73,7 @@ The [VideoLAN Code of Conduct](https://wiki.videolan.org/CoC) applies to this pr
# Compile
1. Install [Meson](https://mesonbuild.com/) (0.47 or higher), [Ninja](https://ninja-build.org/), and, for x86\* targets, [nasm](https://nasm.us/) (2.13.02 or higher)
1. Install [Meson](https://mesonbuild.com/) (0.47 or higher), [Ninja](https://ninja-build.org/), and, for x86\* targets, [nasm](https://nasm.us/) (2.14 or higher)
2. Run `mkdir build && cd build` to create a build directory and enter it
3. Run `meson ..` to configure meson, add `--default-library=static` if static linking is desired
4. Run `ninja` to compile
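
In command form, the numbered steps above are roughly the following (a
plain release build; the optional test invocation mirrors what the CI
jobs in .gitlab-ci.yml do):

  mkdir build && cd build
  meson ..        # add --default-library=static for static linking
  ninja
  meson test -v   # optional: run the checks, as the CI jobs do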

14  third_party/dav1d/include/common/attributes.h  vendored

@ -43,15 +43,18 @@
#endif
#if ARCH_X86_64
/* x86-64 needs 32-byte alignment for AVX2. */
/* x86-64 needs 32- and 64-byte alignment for AVX2 and AVX-512. */
#define ALIGN_64_VAL 64
#define ALIGN_32_VAL 32
#define ALIGN_16_VAL 16
#elif ARCH_X86_32 || ARCH_ARM || ARCH_AARCH64 || ARCH_PPC64LE
/* ARM doesn't benefit from anything more than 16-byte alignment. */
#define ALIGN_64_VAL 16
#define ALIGN_32_VAL 16
#define ALIGN_16_VAL 16
#else
/* No need for extra alignment on platforms without assembly. */
#define ALIGN_64_VAL 8
#define ALIGN_32_VAL 8
#define ALIGN_16_VAL 8
#endif
@ -76,9 +79,10 @@
* becomes:
* ALIGN_STK_$align(uint8_t, var, 1, [2][3][4])
*/
#define ALIGN_STK_64(type, var, sz1d, sznd) \
ALIGN(type var[sz1d]sznd, ALIGN_64_VAL)
#define ALIGN_STK_32(type, var, sz1d, sznd) \
ALIGN(type var[sz1d]sznd, ALIGN_32_VAL)
// as long as stack is itself 16-byte aligned, this works (win64, gcc)
#define ALIGN_STK_16(type, var, sz1d, sznd) \
ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
@ -92,6 +96,12 @@
#define NOINLINE __attribute__((noinline))
#endif /* !_MSC_VER */
#ifdef __clang__
#define NO_SANITIZE(x) __attribute__((no_sanitize(x)))
#else
#define NO_SANITIZE(x)
#endif
#if defined(NDEBUG) && (defined(__GNUC__) || defined(__clang__))
#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
#elif defined(NDEBUG) && defined(_MSC_VER)

13  third_party/dav1d/include/common/bitdepth.h  vendored

@ -31,6 +31,8 @@
#include <stdint.h>
#include <string.h>
#include "common/attributes.h"
#if !defined(BITDEPTH)
typedef void pixel;
typedef void coef;
@ -47,12 +49,14 @@ typedef int16_t coef;
#define iclip_pixel iclip_u8
#define PIX_HEX_FMT "%02x"
#define bitfn(x) x##_8bpc
#define PXSTRIDE(x) x
#define BF(x, suffix) x##_8bpc_##suffix
#define PXSTRIDE(x) (x)
#define highbd_only(x)
#define HIGHBD_DECL_SUFFIX /* nothing */
#define HIGHBD_CALL_SUFFIX /* nothing */
#define HIGHBD_TAIL_SUFFIX /* nothing */
#define bitdepth_from_max(x) 8
#define BITDEPTH_MAX 0xff
#elif BITDEPTH == 16
typedef uint16_t pixel;
typedef int32_t coef;
@ -69,8 +73,13 @@ static inline void pixel_set(pixel *const dst, const int val, const int num) {
#define HIGHBD_CALL_SUFFIX , f->bitdepth_max
#define HIGHBD_TAIL_SUFFIX , bitdepth_max
#define bitdepth_from_max(bitdepth_max) (32 - clz(bitdepth_max))
#define BITDEPTH_MAX bitdepth_max
#define bitfn(x) x##_16bpc
#define PXSTRIDE(x) (x >> 1)
#define BF(x, suffix) x##_16bpc_##suffix
static inline ptrdiff_t PXSTRIDE(const ptrdiff_t x) {
assert(!(x & 1));
return x >> 1;
}
#define highbd_only(x) x
#else
#error invalid value for bitdepth

12  third_party/dav1d/include/dav1d/headers.h  vendored

@ -318,8 +318,8 @@ typedef struct Dav1dFilmGrainData {
int scaling_shift;
int ar_coeff_lag;
int8_t ar_coeffs_y[24];
int8_t ar_coeffs_uv[2][25];
int ar_coeff_shift;
int8_t ar_coeffs_uv[2][25 + 3 /* padding for alignment purposes */];
uint64_t ar_coeff_shift;
int grain_scale_shift;
int uv_mult[2];
int uv_luma_mult[2];
@ -329,13 +329,13 @@ typedef struct Dav1dFilmGrainData {
} Dav1dFilmGrainData;
typedef struct Dav1dFrameHeader {
struct {
Dav1dFilmGrainData data;
int present, update;
} film_grain; ///< film grain parameters
enum Dav1dFrameType frame_type; ///< type of the picture
int width[2 /* { coded_width, superresolution_upscaled_width } */], height;
int frame_offset; ///< frame number
struct {
int present, update;
Dav1dFilmGrainData data;
} film_grain; ///< film grain parameters
int temporal_id, spatial_id; ///< spatial and temporal id of the frame for SVC
int show_existing_frame;

39  third_party/dav1d/meson.build  vendored

@ -23,14 +23,14 @@
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
project('dav1d', ['c'],
version: '0.5.2',
version: '0.6.0',
default_options: ['c_std=c99',
'warning_level=2',
'buildtype=release',
'b_ndebug=if-release'],
meson_version: '>= 0.47.0')
dav1d_soname_version = '3.1.0'
dav1d_soname_version = '4.0.0'
dav1d_api_version_array = dav1d_soname_version.split('.')
dav1d_api_version_major = dav1d_api_version_array[0]
dav1d_api_version_minor = dav1d_api_version_array[1]
@ -84,13 +84,15 @@ test_args = []
optional_arguments = []
# Define _POSIX_C_SOURCE to POSIX.1–2001 (IEEE Std 1003.1-2001)
test_args += '-D_POSIX_C_SOURCE=200112L'
add_project_arguments('-D_POSIX_C_SOURCE=200112L', language: 'c')
if host_machine.system() == 'darwin'
if host_machine.system() == 'linux'
test_args += '-D_GNU_SOURCE'
add_project_arguments('-D_GNU_SOURCE', language: 'c')
elif host_machine.system() == 'darwin'
test_args += '-D_DARWIN_C_SOURCE'
add_project_arguments('-D_DARWIN_C_SOURCE', language: 'c')
else
test_args += '-D_POSIX_C_SOURCE=200112L'
add_project_arguments('-D_POSIX_C_SOURCE=200112L', language: 'c')
endif
if host_machine.system() == 'windows'
@ -131,6 +133,15 @@ else
endif
endif
libdl_dependency = []
if host_machine.system() == 'linux'
libdl_dependency = cc.find_library('dl', required : false)
if cc.has_function('dlsym', prefix : '#include <dlfcn.h>', args : test_args, dependencies : libdl_dependency)
cdata.set('HAVE_DLSYM', 1)
endif
endif
# Header checks
stdatomic_dependency = []
@ -257,12 +268,12 @@ if host_machine.cpu_family().startswith('x86')
if get_option('stack_alignment') > 0
stack_alignment = get_option('stack_alignment')
elif host_machine.cpu_family() == 'x86_64'
if cc.has_argument('-mpreferred-stack-boundary=5')
stackalign_flag = ['-mpreferred-stack-boundary=5']
if cc.has_argument('-mpreferred-stack-boundary=6')
stackalign_flag = ['-mpreferred-stack-boundary=6']
stackrealign_flag = ['-mincoming-stack-boundary=4']
stack_alignment = 32
elif cc.has_argument('-mstack-alignment=32')
stackalign_flag = ['-mstack-alignment=32']
elif cc.has_argument('-mstack-alignment=64')
stackalign_flag = ['-mstack-alignment=64']
stackrealign_flag = ['-mstackrealign']
stack_alignment = 32
else
@ -364,8 +375,8 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86')
out = nasm_r.stdout().strip().split()
if out[1].to_lower() == 'version'
if out[2].version_compare('<2.13.02')
error('nasm 2.13.02 or later is required, found nasm @0@'.format(out[2]))
if out[2].version_compare('<2.14')
error('nasm 2.14 or later is required, found nasm @0@'.format(out[2]))
endif
else
error('unexpected nasm version string: @0@'.format(nasm_r.stdout()))
@ -390,7 +401,7 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86')
depfile: '@BASENAME@.obj.ndep',
arguments: [
'-f', nasm_format,
'-I', '@SOURCE_DIR@/src/',
'-I', '@0@/src/'.format(meson.current_source_dir()),
'-I', '@0@/'.format(meson.current_build_dir()),
'-MQ', '@OUTPUT@', '-MF', '@DEPFILE@',
'@EXTRA_ARGS@',
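
Note the nasm floor above moving from 2.13.02 to 2.14, matching the
README change earlier in this patch; a quick local sanity check before
building (illustrative) is simply:

  nasm -v   # should report version 2.14 or newer for x86/x86_64 asm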

376  third_party/dav1d/src/arm/32/cdef.S  vendored

@ -148,20 +148,22 @@
.endif
.endm
// void dav1d_cdef_paddingX_neon(uint16_t *tmp, const pixel *src,
// ptrdiff_t src_stride, const pixel (*left)[2],
// /*const*/ pixel *const top[2], int h,
// enum CdefEdgeFlags edges);
// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,
// ptrdiff_t src_stride, const pixel (*left)[2],
// const pixel *const top, int h,
// enum CdefEdgeFlags edges);
// n1 = s0/d0
// w1 = d0/q0
// n2 = s4/d2
// w2 = d2/q1
.macro padding_func w, stride, n1, w1, n2, w2, align
function cdef_padding\w\()_neon, export=1
function cdef_padding\w\()_8bpc_neon, export=1
push {r4-r7,lr}
ldrd r4, r5, [sp, #20]
ldr r6, [sp, #28]
cmp r6, #0xf // fully edged
beq cdef_padding\w\()_edged_8bpc_neon
vmov.i16 q3, #0x8000
tst r6, #4 // CDEF_HAVE_TOP
bne 1f
@ -175,10 +177,9 @@ function cdef_padding\w\()_neon, export=1
b 3f
1:
// CDEF_HAVE_TOP
ldr r7, [r4]
ldr lr, [r4, #4]
add r7, r4, r2
sub r0, r0, #2*(2*\stride)
pad_top_bottom r7, lr, \w, \stride, \n1, \w1, \n2, \w2, \align, 0
pad_top_bottom r4, r7, \w, \stride, \n1, \w1, \n2, \w2, \align, 0
// Middle section
3:
@ -267,6 +268,65 @@ endfunc
padding_func 8, 16, d0, q0, d2, q1, 128
padding_func 4, 8, s0, d0, s4, d2, 64
// void cdef_paddingX_edged_8bpc_neon(uint16_t *tmp, const pixel *src,
// ptrdiff_t src_stride, const pixel (*left)[2],
// const pixel *const top, int h,
// enum CdefEdgeFlags edges);
.macro padding_func_edged w, stride, reg, align
function cdef_padding\w\()_edged_8bpc_neon
sub r0, r0, #(2*\stride)
ldrh r12, [r4, #-2]
vldr \reg, [r4]
add r7, r4, r2
strh r12, [r0, #-2]
ldrh r12, [r4, #\w]
vstr \reg, [r0]
strh r12, [r0, #\w]
ldrh r12, [r7, #-2]
vldr \reg, [r7]
strh r12, [r0, #\stride-2]
ldrh r12, [r7, #\w]
vstr \reg, [r0, #\stride]
strh r12, [r0, #\stride+\w]
add r0, r0, #2*\stride
0:
ldrh r12, [r3], #2
vldr \reg, [r1]
str r12, [r0, #-2]
ldrh r12, [r1, #\w]
add r1, r1, r2
subs r5, r5, #1
vstr \reg, [r0]
str r12, [r0, #\w]
add r0, r0, #\stride
bgt 0b
ldrh r12, [r1, #-2]
vldr \reg, [r1]
add r7, r1, r2
strh r12, [r0, #-2]
ldrh r12, [r1, #\w]
vstr \reg, [r0]
strh r12, [r0, #\w]
ldrh r12, [r7, #-2]
vldr \reg, [r7]
strh r12, [r0, #\stride-2]
ldrh r12, [r7, #\w]
vstr \reg, [r0, #\stride]
strh r12, [r0, #\stride+\w]
pop {r4-r7,pc}
endfunc
.endm
padding_func_edged 8, 16, d0, 64
padding_func_edged 4, 8, s0, 32
.macro dir_table w, stride
const directions\w
.byte -1 * \stride + 1, -2 * \stride + 2
@ -311,14 +371,13 @@ endconst
vld1.16 {\d22}, [r9] // p1
.endif
.endm
.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap
cmp \threshold, #0
.macro handle_pixel s1, s2, thresh_vec, shift, tap, min
.if \min
vmin.u16 q2, q2, \s1
vmax.s16 q3, q3, \s1
vmin.u16 q2, q2, \s2
vmax.s16 q3, q3, \s2
beq 3f
.endif
vabd.u16 q8, q0, \s1 // abs(diff)
vabd.u16 q11, q0, \s2 // abs(diff)
vshl.u16 q9, q8, \shift // abs(diff) >> shift
@ -326,7 +385,7 @@ endconst
vqsub.u16 q9, \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift))
vqsub.u16 q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift))
vsub.i16 q10, \s1, q0 // diff = p0 - px
vsub.u16 q13, \s2, q0 // diff = p1 - px
vsub.i16 q13, \s2, q0 // diff = p1 - px
vneg.s16 q8, q9 // -clip
vneg.s16 q11, q12 // -clip
vmin.s16 q10, q10, q9 // imin(diff, clip)
@ -336,36 +395,44 @@ endconst
vmax.s16 q13, q13, q11 // constrain() = imax(imin(diff, clip), -clip)
vmla.i16 q1, q10, q9 // sum += taps[k] * constrain()
vmla.i16 q1, q13, q9 // sum += taps[k] * constrain()
3:
.endm
// void dav1d_cdef_filterX_neon(pixel *dst, ptrdiff_t dst_stride,
// const uint16_t *tmp, int pri_strength,
// int sec_strength, int dir, int damping, int h);
.macro filter w
function cdef_filter\w\()_neon, export=1
push {r4-r9,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #92]
ldrd r6, r7, [sp, #100]
// void dav1d_cdef_filterX_8bpc_neon(pixel *dst, ptrdiff_t dst_stride,
// const uint16_t *tmp, int pri_strength,
// int sec_strength, int dir, int damping,
// int h, size_t edges);
.macro filter_func w, pri, sec, min, suffix
function cdef_filter\w\suffix\()_neon
cmp r8, #0xf
beq cdef_filter\w\suffix\()_edged_neon
.if \pri
movrel_local r8, pri_taps
and r9, r3, #1
add r8, r8, r9, lsl #1
.endif
movrel_local r9, directions\w
add r5, r9, r5, lsl #1
vmov.u16 d17, #15
vdup.16 d16, r6 // damping
.if \pri
vdup.16 q5, r3 // threshold
.endif
.if \sec
vdup.16 q7, r4 // threshold
.endif
vmov.16 d8[0], r3
vmov.16 d8[1], r4
vclz.i16 d8, d8 // clz(threshold)
vsub.i16 d8, d17, d8 // ulog2(threshold)
vqsub.u16 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold))
vneg.s16 d8, d8 // -shift
.if \sec
vdup.16 q6, d8[1]
.endif
.if \pri
vdup.16 q4, d8[0]
.endif
1:
.if \w == 8
@ -377,47 +444,64 @@ function cdef_filter\w\()_neon, export=1
.endif
vmov.u16 q1, #0 // sum
.if \min
vmov.u16 q2, q0 // min
vmov.u16 q3, q0 // max
.endif
// Instead of loading sec_taps 2, 1 from memory, just set it
// to 2 initially and decrease for the second round.
// This is also used as loop counter.
mov lr, #2 // sec_taps[0]
2:
.if \pri
ldrsb r9, [r5] // off1
load_px d28, d29, d30, d31, \w
.endif
.if \sec
add r5, r5, #4 // +2*2
ldrsb r9, [r5] // off2
.endif
.if \pri
ldrb r12, [r8] // *pri_taps
handle_pixel q14, q15, r3, q5, q4, r12
handle_pixel q14, q15, q5, q4, r12, \min
.endif
.if \sec
load_px d28, d29, d30, d31, \w
add r5, r5, #8 // +2*4
ldrsb r9, [r5] // off3
handle_pixel q14, q15, r4, q7, q6, lr
handle_pixel q14, q15, q7, q6, lr, \min
load_px d28, d29, d30, d31, \w
handle_pixel q14, q15, r4, q7, q6, lr
handle_pixel q14, q15, q7, q6, lr, \min
sub r5, r5, #11 // x8 -= 2*(2+4); x8 += 1;
sub r5, r5, #11 // r5 -= 2*(2+4); r5 += 1;
.else
add r5, r5, #1 // r5 += 1
.endif
subs lr, lr, #1 // sec_tap-- (value)
.if \pri
add r8, r8, #1 // pri_taps++ (pointer)
.endif
bne 2b
vshr.s16 q14, q1, #15 // -(sum < 0)
vadd.i16 q1, q1, q14 // sum - (sum < 0)
vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4
vadd.i16 q0, q0, q1 // px + (8 + sum ...) >> 4
.if \min
vmin.s16 q0, q0, q3
vmax.s16 q0, q0, q2 // iclip(px + .., min, max)
.endif
vmovn.u16 d0, q0
.if \w == 8
add r2, r2, #2*16 // tmp += tmp_stride
@ -430,9 +514,11 @@ function cdef_filter\w\()_neon, export=1
vst1.32 {d0[1]}, [r0, :32], r1
.endif
// Reset pri_taps/sec_taps back to the original point
// Reset pri_taps and directions back to the original point
sub r5, r5, #2
.if \pri
sub r8, r8, #2
.endif
bgt 1b
vpop {q4-q7}
@ -440,9 +526,237 @@ function cdef_filter\w\()_neon, export=1
endfunc
.endm
.macro filter w
filter_func \w, pri=1, sec=0, min=0, suffix=_pri
filter_func \w, pri=0, sec=1, min=0, suffix=_sec
filter_func \w, pri=1, sec=1, min=1, suffix=_pri_sec
function cdef_filter\w\()_8bpc_neon, export=1
push {r4-r9,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #92]
ldrd r6, r7, [sp, #100]
ldr r8, [sp, #108]
cmp r3, #0 // pri_strength
bne 1f
b cdef_filter\w\()_sec_neon // only sec
1:
cmp r4, #0 // sec_strength
bne 1f
b cdef_filter\w\()_pri_neon // only pri
1:
b cdef_filter\w\()_pri_sec_neon // both pri and sec
endfunc
.endm
filter 8
filter 4
.macro load_px_8 d11, d12, d21, d22, w
.if \w == 8
add r6, r2, r9 // x + off
sub r9, r2, r9 // x - off
vld1.8 {\d11}, [r6] // p0
add r6, r6, #16 // += stride
vld1.8 {\d21}, [r9] // p1
add r9, r9, #16 // += stride
vld1.8 {\d12}, [r6] // p0
vld1.8 {\d22}, [r9] // p1
.else
add r6, r2, r9 // x + off
sub r9, r2, r9 // x - off
vld1.32 {\d11[0]}, [r6] // p0
add r6, r6, #8 // += stride
vld1.32 {\d21[0]}, [r9] // p1
add r9, r9, #8 // += stride
vld1.32 {\d11[1]}, [r6] // p0
add r6, r6, #8 // += stride
vld1.32 {\d21[1]}, [r9] // p1
add r9, r9, #8 // += stride
vld1.32 {\d12[0]}, [r6] // p0
add r6, r6, #8 // += stride
vld1.32 {\d22[0]}, [r9] // p1
add r9, r9, #8 // += stride
vld1.32 {\d12[1]}, [r6] // p0
vld1.32 {\d22[1]}, [r9] // p1
.endif
.endm
.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min
.if \min
vmin.u8 q3, q3, \s1
vmax.u8 q4, q4, \s1
vmin.u8 q3, q3, \s2
vmax.u8 q4, q4, \s2
.endif
vabd.u8 q8, q0, \s1 // abs(diff)
vabd.u8 q11, q0, \s2 // abs(diff)
vshl.u8 q9, q8, \shift // abs(diff) >> shift
vshl.u8 q12, q11, \shift // abs(diff) >> shift
vqsub.u8 q9, \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift))
vqsub.u8 q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift))
vcgt.u8 q10, q0, \s1 // px > p0
vcgt.u8 q13, q0, \s2 // px > p1
vmin.u8 q9, q9, q8 // imin(abs(diff), clip)
vmin.u8 q12, q12, q11 // imin(abs(diff), clip)
vneg.s8 q8, q9 // -imin()
vneg.s8 q11, q12 // -imin()
vbsl q10, q8, q9 // constrain() = imax(imin(diff, clip), -clip)
vdup.8 d18, \tap // taps[k]
vbsl q13, q11, q12 // constrain() = imax(imin(diff, clip), -clip)
vmlal.s8 q1, d20, d18 // sum += taps[k] * constrain()
vmlal.s8 q1, d26, d18 // sum += taps[k] * constrain()
vmlal.s8 q2, d21, d18 // sum += taps[k] * constrain()
vmlal.s8 q2, d27, d18 // sum += taps[k] * constrain()
.endm
// void cdef_filterX_edged_neon(pixel *dst, ptrdiff_t dst_stride,
// const uint16_t *tmp, int pri_strength,
// int sec_strength, int dir, int damping,
// int h, size_t edges);
.macro filter_func_8 w, pri, sec, min, suffix
function cdef_filter\w\suffix\()_edged_neon
.if \pri
movrel_local r8, pri_taps
and r9, r3, #1
add r8, r8, r9, lsl #1
.endif
movrel_local r9, directions\w
add r5, r9, r5, lsl #1
vmov.u8 d17, #7
vdup.8 d16, r6 // damping
vmov.8 d8[0], r3
vmov.8 d8[1], r4
vclz.i8 d8, d8 // clz(threshold)
vsub.i8 d8, d17, d8 // ulog2(threshold)
vqsub.u8 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold))
vneg.s8 d8, d8 // -shift
.if \sec
vdup.8 q6, d8[1]
.endif
.if \pri
vdup.8 q5, d8[0]
.endif
1:
.if \w == 8
add r12, r2, #16
vld1.8 {d0}, [r2, :64] // px
vld1.8 {d1}, [r12, :64] // px
.else
add r12, r2, #8
vld1.32 {d0[0]}, [r2, :32] // px
add r9, r2, #2*8
vld1.32 {d0[1]}, [r12, :32] // px
add r12, r12, #2*8
vld1.32 {d1[0]}, [r9, :32] // px
vld1.32 {d1[1]}, [r12, :32] // px
.endif
vmov.u8 q1, #0 // sum
vmov.u8 q2, #0 // sum
.if \min
vmov.u16 q3, q0 // min
vmov.u16 q4, q0 // max
.endif
// Instead of loading sec_taps 2, 1 from memory, just set it
// to 2 initially and decrease for the second round.
// This is also used as loop counter.
mov lr, #2 // sec_taps[0]
2:
.if \pri
ldrsb r9, [r5] // off1
load_px_8 d28, d29, d30, d31, \w
.endif
.if \sec
add r5, r5, #4 // +2*2
ldrsb r9, [r5] // off2
.endif
.if \pri
ldrb r12, [r8] // *pri_taps
vdup.8 q7, r3 // threshold
handle_pixel_8 q14, q15, q7, q5, r12, \min
.endif
.if \sec
load_px_8 d28, d29, d30, d31, \w
add r5, r5, #8 // +2*4
ldrsb r9, [r5] // off3
vdup.8 q7, r4 // threshold
handle_pixel_8 q14, q15, q7, q6, lr, \min
load_px_8 d28, d29, d30, d31, \w
handle_pixel_8 q14, q15, q7, q6, lr, \min
sub r5, r5, #11 // r5 -= 2*(2+4); r5 += 1;
.else
add r5, r5, #1 // r5 += 1
.endif
subs lr, lr, #1 // sec_tap-- (value)
.if \pri
add r8, r8, #1 // pri_taps++ (pointer)
.endif
bne 2b
vshr.s16 q14, q1, #15 // -(sum < 0)
vshr.s16 q15, q2, #15 // -(sum < 0)
vadd.i16 q1, q1, q14 // sum - (sum < 0)
vadd.i16 q2, q2, q15 // sum - (sum < 0)
vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4
vrshr.s16 q2, q2, #4 // (8 + sum - (sum < 0)) >> 4
vaddw.u8 q1, q1, d0 // px + (8 + sum ...) >> 4
vaddw.u8 q2, q2, d1 // px + (8 + sum ...) >> 4
vqmovun.s16 d0, q1
vqmovun.s16 d1, q2
.if \min
vmin.u8 q0, q0, q4
vmax.u8 q0, q0, q3 // iclip(px + .., min, max)
.endif
.if \w == 8
vst1.8 {d0}, [r0, :64], r1
add r2, r2, #2*16 // tmp += 2*tmp_stride
subs r7, r7, #2 // h -= 2
vst1.8 {d1}, [r0, :64], r1
.else
vst1.32 {d0[0]}, [r0, :32], r1
add r2, r2, #4*8 // tmp += 4*tmp_stride
vst1.32 {d0[1]}, [r0, :32], r1
subs r7, r7, #4 // h -= 4
vst1.32 {d1[0]}, [r0, :32], r1
vst1.32 {d1[1]}, [r0, :32], r1
.endif
// Reset pri_taps and directions back to the original point
sub r5, r5, #2
.if \pri
sub r8, r8, #2
.endif
bgt 1b
vpop {q4-q7}
pop {r4-r9,pc}
endfunc
.endm
.macro filter_8 w
filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri
filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec
filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec
.endm
filter_8 8
filter_8 4
const div_table, align=4
.short 840, 420, 280, 210, 168, 140, 120, 105
endconst
@ -451,9 +765,9 @@ const alt_fact, align=4
.short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
endconst
// int dav1d_cdef_find_dir_neon(const pixel *img, const ptrdiff_t stride,
// unsigned *const var)
function cdef_find_dir_neon, export=1
// int dav1d_cdef_find_dir_8bpc_neon(const pixel *img, const ptrdiff_t stride,
// unsigned *const var)
function cdef_find_dir_8bpc_neon, export=1
push {lr}
vpush {q4-q7}
sub sp, sp, #32 // cost

14  third_party/dav1d/src/arm/32/loopfilter.S  vendored

@ -143,8 +143,8 @@ function lpf_8_wd\wd\()_neon
vaddw.s8 q1, q1, d4
vmov.i8 d7, #3
vqmovn.s16 d2, q1 // f
vqadd.s8 d4, d6, d2 // imin(f + 4, 128)
vqadd.s8 d5, d7, d2 // imin(f + 3, 128)
vqadd.s8 d4, d6, d2 // imin(f + 4, 127)
vqadd.s8 d5, d7, d2 // imin(f + 3, 127)
vshr.s8 d4, d4, #3 // f1
vshr.s8 d5, d5, #3 // f2
vmovl.u8 q1, d23 // p0
@ -734,13 +734,13 @@ function lpf_h_16_8_neon
bx r12
endfunc
// void dav1d_lpf_v_sb_y_neon(pixel *dst, const ptrdiff_t stride,
// const uint32_t *const vmask,
// const uint8_t (*l)[4], ptrdiff_t b4_stride,
// const Av1FilterLUT *lut, const int w)
// void dav1d_lpf_v_sb_y_8bpc_neon(pixel *dst, const ptrdiff_t stride,
// const uint32_t *const vmask,
// const uint8_t (*l)[4], ptrdiff_t b4_stride,
// const Av1FilterLUT *lut, const int w)
.macro lpf_func dir, type
function lpf_\dir\()_sb_\type\()_neon, export=1
function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100]


@ -28,11 +28,11 @@
#include "src/arm/asm.S"
#include "util.S"
// void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4],
// const pixel *src, ptrdiff_t stride,
// const int16_t fh[7], const intptr_t w,
// int h, enum LrEdgeFlags edges);
function wiener_filter_h_neon, export=1
// void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4],
// const pixel *src, ptrdiff_t stride,
// const int16_t fh[7], const intptr_t w,
// int h, enum LrEdgeFlags edges);
function wiener_filter_h_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4}
ldrd r4, r5, [sp, #52]
@ -367,11 +367,11 @@ L(variable_shift_tbl):
.purgem filter_4
endfunc
// void dav1d_wiener_filter_v_neon(pixel *dst, ptrdiff_t stride,
// const int16_t *mid, int w, int h,
// const int16_t fv[7], enum LrEdgeFlags edges,
// ptrdiff_t mid_stride);
function wiener_filter_v_neon, export=1
// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride,
// const int16_t *mid, int w, int h,
// const int16_t fv[7], enum LrEdgeFlags edges,
// ptrdiff_t mid_stride);
function wiener_filter_v_8bpc_neon, export=1
push {r4-r7,lr}
ldrd r4, r5, [sp, #20]
ldrd r6, r7, [sp, #28]
@ -548,9 +548,9 @@ function wiener_filter_v_neon, export=1
.purgem filter
endfunc
// void dav1d_copy_narrow_neon(pixel *dst, ptrdiff_t stride,
// const pixel *src, int w, int h);
function copy_narrow_neon, export=1
// void dav1d_copy_narrow_8bpc_neon(pixel *dst, ptrdiff_t stride,
// const pixel *src, int w, int h);
function copy_narrow_8bpc_neon, export=1
push {r4,lr}
ldr r4, [sp, #8]
adr r12, L(copy_narrow_tbl)
@ -687,12 +687,12 @@ endfunc
#define SUM_STRIDE (384+16)
// void dav1d_sgr_box3_h_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box3_h_neon, export=1
// void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box3_h_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100]
@ -925,11 +925,11 @@ L(box3_variable_shift_tbl):
vmull.u8 q6, d9, d9
add3 4
subs r5, r5, #4
vst1.16 {d6}, [r1, :64]!
vst1.16 {d14}, [r11, :64]!
vst1.32 {q12}, [r0, :128]!
vst1.32 {q8}, [r10, :128]!
subs r5, r5, #4
ble 9f
vext.8 q0, q0, q0, #4
vext.8 q1, q1, q2, #8
@ -961,12 +961,12 @@ L(box3_variable_shift_tbl):
.purgem add3
endfunc
// void dav1d_sgr_box5_h_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box5_h_neon, export=1
// void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box5_h_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100]
@ -1038,7 +1038,7 @@ function sgr_box5_h_neon, export=1
b 2f
0:
// !LR_HAVE_LEFT, fill q1 with the leftmost byte
// and shift q0 to have 2x the first byte at the front.
// and shift q0 to have 3x the first byte at the front.
vdup.8 q1, d0[0]
vdup.8 q5, d8[0]
// Move r3 back to account for the last 3 bytes we loaded before,
@ -1215,11 +1215,11 @@ L(box5_variable_shift_tbl):
vmull.u8 q6, d9, d9
add5 4
subs r5, r5, #4
vst1.16 {d6}, [r1, :64]!
vst1.16 {d14}, [r11, :64]!
vst1.32 {q12}, [r0, :128]!
vst1.32 {q10}, [r10, :128]!
subs r5, r5, #4
ble 9f
vext.8 q0, q0, q0, #4
vext.8 q1, q1, q2, #8
@ -1661,11 +1661,11 @@ endfunc
#define FILTER_OUT_STRIDE 384
// void dav1d_sgr_finish_filter1_neon(coef *tmp,
// const pixel *src, const ptrdiff_t stride,
// const int32_t *a, const int16_t *b,
// const int w, const int h);
function sgr_finish_filter1_neon, export=1
// void dav1d_sgr_finish_filter1_8bpc_neon(int16_t *tmp,
// const pixel *src, const ptrdiff_t stride,
// const int32_t *a, const int16_t *b,
// const int w, const int h);
function sgr_finish_filter1_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100]
@ -1765,11 +1765,11 @@ function sgr_finish_filter1_neon, export=1
pop {r4-r11,pc}
endfunc
// void dav1d_sgr_finish_filter2_neon(coef *tmp,
// const pixel *src, const ptrdiff_t stride,
// const int32_t *a, const int16_t *b,
// const int w, const int h);
function sgr_finish_filter2_neon, export=1
// void dav1d_sgr_finish_filter2_8bpc_neon(int16_t *tmp,
// const pixel *src, const ptrdiff_t stride,
// const int32_t *a, const int16_t *b,
// const int w, const int h);
function sgr_finish_filter2_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100]
@ -1925,11 +1925,11 @@ function sgr_finish_filter2_neon, export=1
pop {r4-r11,pc}
endfunc
// void dav1d_sgr_weighted1_neon(pixel *dst, const ptrdiff_t dst_stride,
// const pixel *src, const ptrdiff_t src_stride,
// const coef *t1, const int w, const int h,
// const int wt);
function sgr_weighted1_neon, export=1
// void dav1d_sgr_weighted1_8bpc_neon(pixel *dst, const ptrdiff_t dst_stride,
// const pixel *src, const ptrdiff_t src_stride,
// const int16_t *t1, const int w, const int h,
// const int wt);
function sgr_weighted1_8bpc_neon, export=1
push {r4-r9,lr}
ldrd r4, r5, [sp, #28]
ldrd r6, r7, [sp, #36]
@ -2009,12 +2009,12 @@ function sgr_weighted1_neon, export=1
pop {r4-r9,pc}
endfunc
// void dav1d_sgr_weighted2_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *src, const ptrdiff_t src_stride,
// const coef *t1, const coef *t2,
// const int w, const int h,
// const int16_t wt[2]);
function sgr_weighted2_neon, export=1
// void dav1d_sgr_weighted2_8bpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *src, const ptrdiff_t src_stride,
// const int16_t *t1, const int16_t *t2,
// const int w, const int h,
// const int16_t wt[2]);
function sgr_weighted2_8bpc_neon, export=1
push {r4-r11,lr}
ldrd r4, r5, [sp, #36]
ldrd r6, r7, [sp, #44]

28  third_party/dav1d/src/arm/32/mc.S  vendored

@ -753,7 +753,7 @@ L(blend_v_tbl):
add r12, r0, r1
lsl r1, r1, #1
vsub.i8 d5, d22, d4
sub r1, r1, #3
sub r1, r1, #2
4:
vld1.u8 {d2}, [r2, :64]!
vld1.32 {d0[]}, [r0, :32]
@ -764,10 +764,8 @@ L(blend_v_tbl):
vrshrn.i16 d20, q3, #6
vst1.16 {d20[0]}, [r0, :16]!
vst1.16 {d20[2]}, [r12, :16]!
vst1.8 {d20[2]}, [r0]!
vst1.8 {d20[6]}, [r12]!
add r0, r0, r1
add r12, r12, r1
vst1.8 {d20[2]}, [r0], r1
vst1.8 {d20[6]}, [r12], r1
bgt 4b
pop {r4-r5,pc}
80:
@ -776,7 +774,7 @@ L(blend_v_tbl):
add r12, r0, r1
lsl r1, r1, #1
vsub.i8 d17, d16, d2
sub r1, r1, #6
sub r1, r1, #4
8:
vld1.u8 {d4, d5}, [r2, :128]!
vld1.u8 {d0}, [r0, :64]
@ -790,10 +788,8 @@ L(blend_v_tbl):
vrshrn.i16 d23, q10, #6
vst1.32 {d22[0]}, [r0, :32]!
vst1.32 {d23[0]}, [r12, :32]!
vst1.16 {d22[2]}, [r0, :16]!
vst1.16 {d23[2]}, [r12, :16]!
add r0, r0, r1
add r12, r12, r1
vst1.16 {d22[2]}, [r0, :16], r1
vst1.16 {d23[2]}, [r12, :16], r1
bgt 8b
pop {r4-r5,pc}
160:
@ -802,7 +798,7 @@ L(blend_v_tbl):
add r12, r0, r1
lsl r1, r1, #1
vsub.i8 q11, q12, q14
sub r1, r1, #12
sub r1, r1, #8
16:
vld1.u8 {q1, q2}, [r2, :128]!
vld1.u8 {q0}, [r0, :128]
@ -822,20 +818,18 @@ L(blend_v_tbl):
vrshrn.i16 d21, q8, #6
vst1.u8 {d18}, [r0, :64]!
vst1.u8 {d20}, [r12, :64]!
vst1.32 {d19[0]}, [r0, :32]!
vst1.32 {d21[0]}, [r12, :32]!
add r0, r0, r1
add r12, r12, r1
vst1.32 {d19[0]}, [r0, :32], r1
vst1.32 {d21[0]}, [r12, :32], r1
bgt 16b
pop {r4-r5,pc}
320:
vmov.i8 q10, #64
vld1.u8 {q2, q3}, [r5, :128]
vsub.i8 q11, q10, q2
vsub.i8 q12, q10, q3
vsub.i8 d24, d20, d6
32:
vld1.u8 {q8, q9}, [r2, :128]!
vld1.u8 {q0, q1}, [r0, :128]
vld1.u8 {d0, d1, d2}, [r0, :64]
subs r4, r4, #1
vmull.u8 q15, d16, d4
vmlal.u8 q15, d0, d22

558  third_party/dav1d/src/arm/64/cdef.S  vendored

@ -27,6 +27,7 @@
#include "src/arm/asm.S"
#include "util.S"
#include "cdef_tmpl.S"
.macro pad_top_bottom s1, s2, w, stride, rn, rw, ret
tst w6, #1 // CDEF_HAVE_LEFT
@ -137,13 +138,15 @@
.endif
.endm
// void dav1d_cdef_paddingX_neon(uint16_t *tmp, const pixel *src,
// ptrdiff_t src_stride, const pixel (*left)[2],
// /*const*/ pixel *const top[2], int h,
// enum CdefEdgeFlags edges);
// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,
// ptrdiff_t src_stride, const pixel (*left)[2],
// const pixel *const top, int h,
// enum CdefEdgeFlags edges);
.macro padding_func w, stride, rn, rw
function cdef_padding\w\()_neon, export=1
function cdef_padding\w\()_8bpc_neon, export=1
cmp w6, #0xf // fully edged
b.eq cdef_padding\w\()_edged_8bpc_neon
movi v30.8h, #0x80, lsl #8
mov v31.16b, v30.16b
sub x0, x0, #2*(2*\stride+2)
@ -157,9 +160,8 @@ function cdef_padding\w\()_neon, export=1
b 3f
1:
// CDEF_HAVE_TOP
ldr x8, [x4]
ldr x9, [x4, #8]
pad_top_bottom x8, x9, \w, \stride, \rn, \rw, 0
add x9, x4, x2
pad_top_bottom x4, x9, \w, \stride, \rn, \rw, 0
// Middle section
3:
@ -242,358 +244,274 @@ endfunc
padding_func 8, 16, d, q
padding_func 4, 8, s, d
.macro dir_table w, stride
const directions\w
.byte -1 * \stride + 1, -2 * \stride + 2
.byte 0 * \stride + 1, -1 * \stride + 2
.byte 0 * \stride + 1, 0 * \stride + 2
.byte 0 * \stride + 1, 1 * \stride + 2
.byte 1 * \stride + 1, 2 * \stride + 2
.byte 1 * \stride + 0, 2 * \stride + 1
.byte 1 * \stride + 0, 2 * \stride + 0
.byte 1 * \stride + 0, 2 * \stride - 1
// Repeated, to avoid & 7
.byte -1 * \stride + 1, -2 * \stride + 2
.byte 0 * \stride + 1, -1 * \stride + 2
.byte 0 * \stride + 1, 0 * \stride + 2
.byte 0 * \stride + 1, 1 * \stride + 2
.byte 1 * \stride + 1, 2 * \stride + 2
.byte 1 * \stride + 0, 2 * \stride + 1
endconst
// void cdef_paddingX_edged_8bpc_neon(uint8_t *tmp, const pixel *src,
// ptrdiff_t src_stride, const pixel (*left)[2],
// const pixel *const top, int h,
// enum CdefEdgeFlags edges);
.macro padding_func_edged w, stride, reg
function cdef_padding\w\()_edged_8bpc_neon, export=1
sub x4, x4, #2
sub x0, x0, #(2*\stride+2)
.if \w == 4
ldr d0, [x4]
ldr d1, [x4, x2]
st1 {v0.8b, v1.8b}, [x0], #16
.else
add x9, x4, x2
ldr d0, [x4]
ldr s1, [x4, #8]
ldr d2, [x9]
ldr s3, [x9, #8]
str d0, [x0]
str s1, [x0, #8]
str d2, [x0, #\stride]
str s3, [x0, #\stride+8]
add x0, x0, #2*\stride
.endif
0:
ld1 {v0.h}[0], [x3], #2
ldr h2, [x1, #\w]
load_n_incr v1, x1, x2, \w
subs w5, w5, #1
str h0, [x0]
stur \reg\()1, [x0, #2]
str h2, [x0, #2+\w]
add x0, x0, #\stride
b.gt 0b
sub x1, x1, #2
.if \w == 4
ldr d0, [x1]
ldr d1, [x1, x2]
st1 {v0.8b, v1.8b}, [x0], #16
.else
add x9, x1, x2
ldr d0, [x1]
ldr s1, [x1, #8]
ldr d2, [x9]
ldr s3, [x9, #8]
str d0, [x0]
str s1, [x0, #8]
str d2, [x0, #\stride]
str s3, [x0, #\stride+8]
.endif
ret
endfunc
.endm
dir_table 8, 16
dir_table 4, 8
padding_func_edged 8, 16, d
padding_func_edged 4, 8, s
const pri_taps
.byte 4, 2, 3, 3
endconst
tables
.macro load_px d1, d2, w
filter 8, 8
filter 4, 8
find_dir 8
.macro load_px_8 d1, d2, w
.if \w == 8
add x6, x2, w9, sxtb #1 // x + off
sub x9, x2, w9, sxtb #1 // x - off
ld1 {\d1\().8h}, [x6] // p0
ld1 {\d2\().8h}, [x9] // p1
.else
add x6, x2, w9, sxtb #1 // x + off
sub x9, x2, w9, sxtb #1 // x - off
ld1 {\d1\().4h}, [x6] // p0
add x6, x6, #2*8 // += stride
ld1 {\d2\().4h}, [x9] // p1
add x9, x9, #2*8 // += stride
add x6, x2, w9, sxtb // x + off
sub x9, x2, w9, sxtb // x - off
ld1 {\d1\().d}[0], [x6] // p0
add x6, x6, #16 // += stride
ld1 {\d2\().d}[0], [x9] // p1
add x9, x9, #16 // += stride
ld1 {\d1\().d}[1], [x6] // p0
ld1 {\d2\().d}[1], [x9] // p1
ld1 {\d2\().d}[1], [x9] // p0
.else
add x6, x2, w9, sxtb // x + off
sub x9, x2, w9, sxtb // x - off
ld1 {\d1\().s}[0], [x6] // p0
add x6, x6, #8 // += stride
ld1 {\d2\().s}[0], [x9] // p1
add x9, x9, #8 // += stride
ld1 {\d1\().s}[1], [x6] // p0
add x6, x6, #8 // += stride
ld1 {\d2\().s}[1], [x9] // p1
add x9, x9, #8 // += stride
ld1 {\d1\().s}[2], [x6] // p0
add x6, x6, #8 // += stride
ld1 {\d2\().s}[2], [x9] // p1
add x9, x9, #8 // += stride
ld1 {\d1\().s}[3], [x6] // p0
ld1 {\d2\().s}[3], [x9] // p1
.endif
.endm
.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap
umin v2.8h, v2.8h, \s1\().8h
smax v3.8h, v3.8h, \s1\().8h
umin v2.8h, v2.8h, \s2\().8h
smax v3.8h, v3.8h, \s2\().8h
cbz \threshold, 3f
uabd v16.8h, v0.8h, \s1\().8h // abs(diff)
uabd v20.8h, v0.8h, \s2\().8h // abs(diff)
ushl v17.8h, v16.8h, \shift // abs(diff) >> shift
ushl v21.8h, v20.8h, \shift // abs(diff) >> shift
uqsub v17.8h, \thresh_vec, v17.8h // clip = imax(0, threshold - (abs(diff) >> shift))
uqsub v21.8h, \thresh_vec, v21.8h // clip = imax(0, threshold - (abs(diff) >> shift))
sub v18.8h, \s1\().8h, v0.8h // diff = p0 - px
sub v22.8h, \s2\().8h, v0.8h // diff = p1 - px
neg v16.8h, v17.8h // -clip
neg v20.8h, v21.8h // -clip
smin v18.8h, v18.8h, v17.8h // imin(diff, clip)
smin v22.8h, v22.8h, v21.8h // imin(diff, clip)
dup v19.8h, \tap // taps[k]
smax v18.8h, v18.8h, v16.8h // constrain() = imax(imin(diff, clip), -clip)
smax v22.8h, v22.8h, v20.8h // constrain() = imax(imin(diff, clip), -clip)
mla v1.8h, v18.8h, v19.8h // sum += taps[k] * constrain()
mla v1.8h, v22.8h, v19.8h // sum += taps[k] * constrain()
3:
.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min
.if \min
umin v3.16b, v3.16b, \s1\().16b
umax v4.16b, v4.16b, \s1\().16b
umin v3.16b, v3.16b, \s2\().16b
umax v4.16b, v4.16b, \s2\().16b
.endif
uabd v16.16b, v0.16b, \s1\().16b // abs(diff)
uabd v20.16b, v0.16b, \s2\().16b // abs(diff)
ushl v17.16b, v16.16b, \shift // abs(diff) >> shift
ushl v21.16b, v20.16b, \shift // abs(diff) >> shift
uqsub v17.16b, \thresh_vec, v17.16b // clip = imax(0, threshold - (abs(diff) >> shift))
uqsub v21.16b, \thresh_vec, v21.16b // clip = imax(0, threshold - (abs(diff) >> shift))
cmhi v18.16b, v0.16b, \s1\().16b // px > p0
cmhi v22.16b, v0.16b, \s2\().16b // px > p1
umin v17.16b, v17.16b, v16.16b // imin(abs(diff), clip)
umin v21.16b, v21.16b, v20.16b // imin(abs(diff), clip)
dup v19.16b, \tap // taps[k]
neg v16.16b, v17.16b // -imin()
neg v20.16b, v21.16b // -imin()
bsl v18.16b, v16.16b, v17.16b // constrain() = apply_sign()
bsl v22.16b, v20.16b, v21.16b // constrain() = apply_sign()
smlal v1.8h, v18.8b, v19.8b // sum += taps[k] * constrain()
smlal v1.8h, v22.8b, v19.8b // sum += taps[k] * constrain()
smlal2 v2.8h, v18.16b, v19.16b // sum += taps[k] * constrain()
smlal2 v2.8h, v22.16b, v19.16b // sum += taps[k] * constrain()
.endm
// void dav1d_cdef_filterX_neon(pixel *dst, ptrdiff_t dst_stride,
// const uint16_t *tmp, int pri_strength,
// int sec_strength, int dir, int damping, int h);
.macro filter w
function cdef_filter\w\()_neon, export=1
// void cdef_filterX_edged_8bpc_neon(pixel *dst, ptrdiff_t dst_stride,
// const uint8_t *tmp, int pri_strength,
// int sec_strength, int dir, int damping,
// int h);
.macro filter_func_8 w, pri, sec, min, suffix
function cdef_filter\w\suffix\()_edged_8bpc_neon
.if \pri
movrel x8, pri_taps
and w9, w3, #1
add x8, x8, w9, uxtw #1
.endif
movrel x9, directions\w
add x5, x9, w5, uxtw #1
movi v30.4h, #15
dup v28.4h, w6 // damping
movi v30.8b, #7
dup v28.8b, w6 // damping
dup v25.8h, w3 // threshold
dup v27.8h, w4 // threshold
trn1 v24.4h, v25.4h, v27.4h
clz v24.4h, v24.4h // clz(threshold)
sub v24.4h, v30.4h, v24.4h // ulog2(threshold)
uqsub v24.4h, v28.4h, v24.4h // shift = imax(0, damping - ulog2(threshold))
neg v24.4h, v24.4h // -shift
dup v26.8h, v24.h[1]
dup v24.8h, v24.h[0]
.if \pri
dup v25.16b, w3 // threshold
.endif
.if \sec
dup v27.16b, w4 // threshold
.endif
trn1 v24.8b, v25.8b, v27.8b
clz v24.8b, v24.8b // clz(threshold)
sub v24.8b, v30.8b, v24.8b // ulog2(threshold)
uqsub v24.8b, v28.8b, v24.8b // shift = imax(0, damping - ulog2(threshold))
neg v24.8b, v24.8b // -shift
.if \sec
dup v26.16b, v24.b[1]
.endif
.if \pri
dup v24.16b, v24.b[0]
.endif
1:
.if \w == 8
ld1 {v0.8h}, [x2] // px
.else
add x12, x2, #2*8
ld1 {v0.4h}, [x2] // px
add x12, x2, #16
ld1 {v0.d}[0], [x2] // px
ld1 {v0.d}[1], [x12] // px
.else
add x12, x2, #1*8
add x13, x2, #2*8
add x14, x2, #3*8
ld1 {v0.s}[0], [x2] // px
ld1 {v0.s}[1], [x12] // px
ld1 {v0.s}[2], [x13] // px
ld1 {v0.s}[3], [x14] // px
.endif
movi v1.8h, #0 // sum
mov v2.16b, v0.16b // min
mov v3.16b, v0.16b // max
movi v2.8h, #0 // sum
.if \min
mov v3.16b, v0.16b // min
mov v4.16b, v0.16b // max
.endif
// Instead of loading sec_taps 2, 1 from memory, just set it
// to 2 initially and decrease for the second round.
// This is also used as loop counter.
mov w11, #2 // sec_taps[0]
2:
.if \pri
ldrb w9, [x5] // off1
load_px v4, v5, \w
add x5, x5, #4 // +2*2
ldrb w9, [x5] // off2
load_px v6, v7, \w
ldrb w10, [x8] // *pri_taps
handle_pixel v4, v5, w3, v25.8h, v24.8h, w10
add x5, x5, #8 // +2*4
ldrb w9, [x5] // off3
load_px v4, v5, \w
handle_pixel v6, v7, w4, v27.8h, v26.8h, w11
handle_pixel v4, v5, w4, v27.8h, v26.8h, w11
sub x5, x5, #11 // x8 -= 2*(2+4); x8 += 1;
subs w11, w11, #1 // sec_tap-- (value)
add x8, x8, #1 // pri_taps++ (pointer)
b.ne 2b
sshr v4.8h, v1.8h, #15 // -(sum < 0)
add v1.8h, v1.8h, v4.8h // sum - (sum < 0)
srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4
add v0.8h, v0.8h, v1.8h // px + (8 + sum ...) >> 4
smin v0.8h, v0.8h, v3.8h
smax v0.8h, v0.8h, v2.8h // iclip(px + .., min, max)
xtn v0.8b, v0.8h
.if \w == 8
add x2, x2, #2*16 // tmp += tmp_stride
subs w7, w7, #1 // h--
st1 {v0.8b}, [x0], x1
.else
st1 {v0.s}[0], [x0], x1
add x2, x2, #2*16 // tmp += 2*tmp_stride
subs w7, w7, #2 // h -= 2
st1 {v0.s}[1], [x0], x1
load_px_8 v5, v6, \w
.endif
// Reset pri_taps/sec_taps back to the original point
.if \sec
add x5, x5, #4 // +2*2
ldrb w9, [x5] // off2
load_px_8 v28, v29, \w
.endif
.if \pri
ldrb w10, [x8] // *pri_taps
handle_pixel_8 v5, v6, v25.16b, v24.16b, w10, \min
.endif
.if \sec
add x5, x5, #8 // +2*4
ldrb w9, [x5] // off3
load_px_8 v5, v6, \w
handle_pixel_8 v28, v29, v27.16b, v26.16b, w11, \min
handle_pixel_8 v5, v6, v27.16b, v26.16b, w11, \min
sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1;
.else
add x5, x5, #1 // x5 += 1
.endif
subs w11, w11, #1 // sec_tap-- (value)
.if \pri
add x8, x8, #1 // pri_taps++ (pointer)
.endif
b.ne 2b
sshr v5.8h, v1.8h, #15 // -(sum < 0)
sshr v6.8h, v2.8h, #15 // -(sum < 0)
add v1.8h, v1.8h, v5.8h // sum - (sum < 0)
add v2.8h, v2.8h, v6.8h // sum - (sum < 0)
srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4
srshr v2.8h, v2.8h, #4 // (8 + sum - (sum < 0)) >> 4
uaddw v1.8h, v1.8h, v0.8b // px + (8 + sum ...) >> 4
uaddw2 v2.8h, v2.8h, v0.16b // px + (8 + sum ...) >> 4
sqxtun v0.8b, v1.8h
sqxtun2 v0.16b, v2.8h
.if \min
umin v0.16b, v0.16b, v4.16b
umax v0.16b, v0.16b, v3.16b // iclip(px + .., min, max)
.endif
.if \w == 8
st1 {v0.d}[0], [x0], x1
add x2, x2, #2*16 // tmp += 2*tmp_stride
subs w7, w7, #2 // h -= 2
st1 {v0.d}[1], [x0], x1
.else
st1 {v0.s}[0], [x0], x1
add x2, x2, #4*8 // tmp += 4*tmp_stride
st1 {v0.s}[1], [x0], x1
subs w7, w7, #4 // h -= 4
st1 {v0.s}[2], [x0], x1
st1 {v0.s}[3], [x0], x1
.endif
// Reset pri_taps and directions back to the original point
sub x5, x5, #2
.if \pri
sub x8, x8, #2
.endif
b.gt 1b
ret
endfunc
.endm
filter 8
filter 4
const div_table
.short 840, 420, 280, 210, 168, 140, 120, 105
endconst
const alt_fact
.short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
endconst
// int dav1d_cdef_find_dir_neon(const pixel *img, const ptrdiff_t stride,
// unsigned *const var)
function cdef_find_dir_neon, export=1
sub sp, sp, #32 // cost
mov w3, #8
movi v31.16b, #128
movi v30.16b, #0
movi v1.8h, #0 // v0-v1 sum_diag[0]
movi v3.8h, #0 // v2-v3 sum_diag[1]
movi v5.8h, #0 // v4-v5 sum_hv[0-1]
movi v7.8h, #0 // v6-v7 sum_alt[0]
movi v17.8h, #0 // v16-v17 sum_alt[1]
movi v18.8h, #0 // v18-v19 sum_alt[2]
movi v19.8h, #0
movi v21.8h, #0 // v20-v21 sum_alt[3]
.irpc i, 01234567
ld1 {v26.8b}, [x0], x1
usubl v26.8h, v26.8b, v31.8b
addv h25, v26.8h // [y]
rev64 v27.8h, v26.8h
addp v28.8h, v26.8h, v30.8h // [(x >> 1)]
add v5.8h, v5.8h, v26.8h // sum_hv[1]
ext v27.16b, v27.16b, v27.16b, #8 // [-x]
rev64 v29.4h, v28.4h // [-(x >> 1)]
ins v4.h[\i], v25.h[0] // sum_hv[0]
.if \i == 0
mov v0.16b, v26.16b // sum_diag[0]
mov v2.16b, v27.16b // sum_diag[1]
mov v6.16b, v28.16b // sum_alt[0]
mov v16.16b, v29.16b // sum_alt[1]
.else
ext v22.16b, v30.16b, v26.16b, #(16-2*\i)
ext v23.16b, v26.16b, v30.16b, #(16-2*\i)
ext v24.16b, v30.16b, v27.16b, #(16-2*\i)
ext v25.16b, v27.16b, v30.16b, #(16-2*\i)
add v0.8h, v0.8h, v22.8h // sum_diag[0]
add v1.8h, v1.8h, v23.8h // sum_diag[0]
add v2.8h, v2.8h, v24.8h // sum_diag[1]
add v3.8h, v3.8h, v25.8h // sum_diag[1]
ext v22.16b, v30.16b, v28.16b, #(16-2*\i)
ext v23.16b, v28.16b, v30.16b, #(16-2*\i)
ext v24.16b, v30.16b, v29.16b, #(16-2*\i)
ext v25.16b, v29.16b, v30.16b, #(16-2*\i)
add v6.8h, v6.8h, v22.8h // sum_alt[0]
add v7.4h, v7.4h, v23.4h // sum_alt[0]
add v16.8h, v16.8h, v24.8h // sum_alt[1]
add v17.4h, v17.4h, v25.4h // sum_alt[1]
.endif
.if \i < 6
ext v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2)))
ext v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2)))
add v18.8h, v18.8h, v22.8h // sum_alt[2]
add v19.4h, v19.4h, v23.4h // sum_alt[2]
.else
add v18.8h, v18.8h, v26.8h // sum_alt[2]
.endif
.if \i == 0
mov v20.16b, v26.16b // sum_alt[3]
.elseif \i == 1
add v20.8h, v20.8h, v26.8h // sum_alt[3]
.else
ext v24.16b, v30.16b, v26.16b, #(16-2*(\i/2))
ext v25.16b, v26.16b, v30.16b, #(16-2*(\i/2))
add v20.8h, v20.8h, v24.8h // sum_alt[3]
add v21.4h, v21.4h, v25.4h // sum_alt[3]
.endif
.endr
movi v31.4s, #105
smull v26.4s, v4.4h, v4.4h // sum_hv[0]*sum_hv[0]
smlal2 v26.4s, v4.8h, v4.8h
smull v27.4s, v5.4h, v5.4h // sum_hv[1]*sum_hv[1]
smlal2 v27.4s, v5.8h, v5.8h
mul v26.4s, v26.4s, v31.4s // cost[2] *= 105
mul v27.4s, v27.4s, v31.4s // cost[6] *= 105
addv s4, v26.4s // cost[2]
addv s5, v27.4s // cost[6]
rev64 v1.8h, v1.8h
rev64 v3.8h, v3.8h
ext v1.16b, v1.16b, v1.16b, #10 // sum_diag[0][14-n]
ext v3.16b, v3.16b, v3.16b, #10 // sum_diag[1][14-n]
str s4, [sp, #2*4] // cost[2]
str s5, [sp, #6*4] // cost[6]
movrel x4, div_table
ld1 {v31.8h}, [x4]
smull v22.4s, v0.4h, v0.4h // sum_diag[0]*sum_diag[0]
smull2 v23.4s, v0.8h, v0.8h
smlal v22.4s, v1.4h, v1.4h
smlal2 v23.4s, v1.8h, v1.8h
smull v24.4s, v2.4h, v2.4h // sum_diag[1]*sum_diag[1]
smull2 v25.4s, v2.8h, v2.8h
smlal v24.4s, v3.4h, v3.4h
smlal2 v25.4s, v3.8h, v3.8h
uxtl v30.4s, v31.4h // div_table
uxtl2 v31.4s, v31.8h
mul v22.4s, v22.4s, v30.4s // cost[0]
mla v22.4s, v23.4s, v31.4s // cost[0]
mul v24.4s, v24.4s, v30.4s // cost[4]
mla v24.4s, v25.4s, v31.4s // cost[4]
addv s0, v22.4s // cost[0]
addv s2, v24.4s // cost[4]
movrel x5, alt_fact
ld1 {v29.4h, v30.4h, v31.4h}, [x5]// div_table[2*m+1] + 105
str s0, [sp, #0*4] // cost[0]
str s2, [sp, #4*4] // cost[4]
uxtl v29.4s, v29.4h // div_table[2*m+1] + 105
uxtl v30.4s, v30.4h
uxtl v31.4s, v31.4h
.macro cost_alt d1, d2, s1, s2, s3, s4
smull v22.4s, \s1\().4h, \s1\().4h // sum_alt[n]*sum_alt[n]
smull2 v23.4s, \s1\().8h, \s1\().8h
smull v24.4s, \s2\().4h, \s2\().4h
smull v25.4s, \s3\().4h, \s3\().4h // sum_alt[n]*sum_alt[n]
smull2 v26.4s, \s3\().8h, \s3\().8h
smull v27.4s, \s4\().4h, \s4\().4h
mul v22.4s, v22.4s, v29.4s // sum_alt[n]^2*fact
mla v22.4s, v23.4s, v30.4s
mla v22.4s, v24.4s, v31.4s
mul v25.4s, v25.4s, v29.4s // sum_alt[n]^2*fact
mla v25.4s, v26.4s, v30.4s
mla v25.4s, v27.4s, v31.4s
addv \d1, v22.4s // *cost_ptr
addv \d2, v25.4s // *cost_ptr
.macro filter_8 w
filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri
filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec
filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec
.endm
cost_alt s6, s16, v6, v7, v16, v17 // cost[1], cost[3]
cost_alt s18, s20, v18, v19, v20, v21 // cost[5], cost[7]
str s6, [sp, #1*4] // cost[1]
str s16, [sp, #3*4] // cost[3]
mov w0, #0 // best_dir
mov w1, v0.s[0] // best_cost
mov w3, #1 // n
str s18, [sp, #5*4] // cost[5]
str s20, [sp, #7*4] // cost[7]
mov w4, v6.s[0]
.macro find_best s1, s2, s3
.ifnb \s2
mov w5, \s2\().s[0]
.endif
cmp w4, w1 // cost[n] > best_cost
csel w0, w3, w0, gt // best_dir = n
csel w1, w4, w1, gt // best_cost = cost[n]
.ifnb \s2
add w3, w3, #1 // n++
cmp w5, w1 // cost[n] > best_cost
mov w4, \s3\().s[0]
csel w0, w3, w0, gt // best_dir = n
csel w1, w5, w1, gt // best_cost = cost[n]
add w3, w3, #1 // n++
.endif
.endm
find_best v6, v4, v16
find_best v16, v2, v18
find_best v18, v5, v20
find_best v20
eor w3, w0, #4 // best_dir ^4
ldr w4, [sp, w3, uxtw #2]
sub w1, w1, w4 // best_cost - cost[best_dir ^ 4]
lsr w1, w1, #10
str w1, [x2] // *var
add sp, sp, #32
ret
endfunc
filter_8 8
filter_8 4

228  third_party/dav1d/src/arm/64/cdef16.S  vendored (new file)

@ -0,0 +1,228 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2020, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
#include "cdef_tmpl.S"
.macro pad_top_bot_16 s1, s2, w, stride, reg, ret
tst w6, #1 // CDEF_HAVE_LEFT
b.eq 2f
// CDEF_HAVE_LEFT
sub \s1, \s1, #4
sub \s2, \s2, #4
tst w6, #2 // CDEF_HAVE_RIGHT
b.eq 1f
// CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
ldr \reg\()0, [\s1]
ldr d1, [\s1, #2*\w]
ldr \reg\()2, [\s2]
ldr d3, [\s2, #2*\w]
str \reg\()0, [x0]
str d1, [x0, #2*\w]
add x0, x0, #2*\stride
str \reg\()2, [x0]
str d3, [x0, #2*\w]
.if \ret
ret
.else
add x0, x0, #2*\stride
b 3f
.endif
1:
// CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
ldr \reg\()0, [\s1]
ldr s1, [\s1, #2*\w]
ldr \reg\()2, [\s2]
ldr s3, [\s2, #2*\w]
str \reg\()0, [x0]
str s1, [x0, #2*\w]
str s31, [x0, #2*\w+4]
add x0, x0, #2*\stride
str \reg\()2, [x0]
str s3, [x0, #2*\w]
str s31, [x0, #2*\w+4]
.if \ret
ret
.else
add x0, x0, #2*\stride
b 3f
.endif
2:
// !CDEF_HAVE_LEFT
tst w6, #2 // CDEF_HAVE_RIGHT
b.eq 1f
// !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
ldr \reg\()0, [\s1]
ldr s1, [\s1, #2*\w]
ldr \reg\()2, [\s2]
ldr s3, [\s2, #2*\w]
str s31, [x0]
stur \reg\()0, [x0, #4]
str s1, [x0, #4+2*\w]
add x0, x0, #2*\stride
str s31, [x0]
stur \reg\()2, [x0, #4]
str s3, [x0, #4+2*\w]
.if \ret
ret
.else
add x0, x0, #2*\stride
b 3f
.endif
1:
// !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
ldr \reg\()0, [\s1]
ldr \reg\()1, [\s2]
str s31, [x0]
stur \reg\()0, [x0, #4]
str s31, [x0, #4+2*\w]
add x0, x0, #2*\stride
str s31, [x0]
stur \reg\()1, [x0, #4]
str s31, [x0, #4+2*\w]
.if \ret
ret
.else
add x0, x0, #2*\stride
.endif
3:
.endm
.macro load_n_incr_16 dst, src, incr, w
.if \w == 4
ld1 {\dst\().4h}, [\src], \incr
.else
ld1 {\dst\().8h}, [\src], \incr
.endif
.endm
// void dav1d_cdef_paddingX_16bpc_neon(uint16_t *tmp, const pixel *src,
// ptrdiff_t src_stride, const pixel (*left)[2],
// const pixel *const top, int h,
// enum CdefEdgeFlags edges);
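For orientation, here is a rough scalar C sketch of what the padding routine below does, pieced together from the edge-flag tests and the 0x8000 fill above; the function name is illustrative, and the real routine reads the left columns from the separate (*left)[2] array and the top rows from `top` rather than from a single plane as this sketch pretends:

#include <stdint.h>
#include <stddef.h>

enum { CDEF_HAVE_LEFT = 1, CDEF_HAVE_RIGHT = 2, CDEF_HAVE_TOP = 4, CDEF_HAVE_BOTTOM = 8 };

/* tmp and src point at the top-left pixel of the block; the caller must
 * make the 2-element border around both addressable. */
static void cdef_padding_sketch(uint16_t *tmp, ptrdiff_t tmp_stride,
                                const uint16_t *src, ptrdiff_t src_stride,
                                int w, int h, int edges)
{
    for (int y = -2; y < h + 2; y++)
        for (int x = -2; x < w + 2; x++) {
            const int have =
                (x >= 0 || (edges & CDEF_HAVE_LEFT))  &&
                (x <  w || (edges & CDEF_HAVE_RIGHT)) &&
                (y >= 0 || (edges & CDEF_HAVE_TOP))   &&
                (y <  h || (edges & CDEF_HAVE_BOTTOM));
            /* missing-edge positions get the 0x8000 sentinel the assembly
             * preloads into v30/v31 */
            tmp[y * tmp_stride + x] =
                have ? src[y * src_stride + x] : 0x8000;
        }
}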
.macro padding_func_16 w, stride, reg
function cdef_padding\w\()_16bpc_neon, export=1
movi v30.8h, #0x80, lsl #8
mov v31.16b, v30.16b
sub x0, x0, #2*(2*\stride+2)
tst w6, #4 // CDEF_HAVE_TOP
b.ne 1f
// !CDEF_HAVE_TOP
st1 {v30.8h, v31.8h}, [x0], #32
.if \w == 8
st1 {v30.8h, v31.8h}, [x0], #32
.endif
b 3f
1:
// CDEF_HAVE_TOP
add x9, x4, x2
pad_top_bot_16 x4, x9, \w, \stride, \reg, 0
// Middle section
3:
tst w6, #1 // CDEF_HAVE_LEFT
b.eq 2f
// CDEF_HAVE_LEFT
tst w6, #2 // CDEF_HAVE_RIGHT
b.eq 1f
// CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
0:
ld1 {v0.s}[0], [x3], #4
ldr s2, [x1, #2*\w]
load_n_incr_16 v1, x1, x2, \w
subs w5, w5, #1
str s0, [x0]
stur \reg\()1, [x0, #4]
str s2, [x0, #4+2*\w]
add x0, x0, #2*\stride
b.gt 0b
b 3f
1:
// CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
ld1 {v0.s}[0], [x3], #4
load_n_incr_16 v1, x1, x2, \w
subs w5, w5, #1
str s0, [x0]
stur \reg\()1, [x0, #4]
str s31, [x0, #4+2*\w]
add x0, x0, #2*\stride
b.gt 1b
b 3f
2:
tst w6, #2 // CDEF_HAVE_RIGHT
b.eq 1f
// !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
0:
ldr s1, [x1, #2*\w]
load_n_incr_16 v0, x1, x2, \w
subs w5, w5, #1
str s31, [x0]
stur \reg\()0, [x0, #4]
str s1, [x0, #4+2*\w]
add x0, x0, #2*\stride
b.gt 0b
b 3f
1:
// !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
load_n_incr_16 v0, x1, x2, \w
subs w5, w5, #1
str s31, [x0]
stur \reg\()0, [x0, #4]
str s31, [x0, #4+2*\w]
add x0, x0, #2*\stride
b.gt 1b
3:
tst w6, #8 // CDEF_HAVE_BOTTOM
b.ne 1f
// !CDEF_HAVE_BOTTOM
st1 {v30.8h, v31.8h}, [x0], #32
.if \w == 8
st1 {v30.8h, v31.8h}, [x0], #32
.endif
ret
1:
// CDEF_HAVE_BOTTOM
add x9, x1, x2
pad_top_bot_16 x1, x9, \w, \stride, \reg, 1
endfunc
.endm
padding_func_16 8, 16, q
padding_func_16 4, 8, d
tables
filter 8, 16
filter 4, 16
find_dir 16

third_party/dav1d/src/arm/64/cdef_tmpl.S (vendored, new file, 482 lines)

@ -0,0 +1,482 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2020, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
.macro dir_table w, stride
const directions\w
.byte -1 * \stride + 1, -2 * \stride + 2
.byte 0 * \stride + 1, -1 * \stride + 2
.byte 0 * \stride + 1, 0 * \stride + 2
.byte 0 * \stride + 1, 1 * \stride + 2
.byte 1 * \stride + 1, 2 * \stride + 2
.byte 1 * \stride + 0, 2 * \stride + 1
.byte 1 * \stride + 0, 2 * \stride + 0
.byte 1 * \stride + 0, 2 * \stride - 1
// Repeated, to avoid & 7
.byte -1 * \stride + 1, -2 * \stride + 2
.byte 0 * \stride + 1, -1 * \stride + 2
.byte 0 * \stride + 1, 0 * \stride + 2
.byte 0 * \stride + 1, 1 * \stride + 2
.byte 1 * \stride + 1, 2 * \stride + 2
.byte 1 * \stride + 0, 2 * \stride + 1
endconst
.endm
.macro tables
dir_table 8, 16
dir_table 4, 8
const pri_taps
.byte 4, 2, 3, 3
endconst
.endm
.macro load_px d1, d2, w
.if \w == 8
add x6, x2, w9, sxtb #1 // x + off
sub x9, x2, w9, sxtb #1 // x - off
ld1 {\d1\().8h}, [x6] // p0
ld1 {\d2\().8h}, [x9] // p1
.else
add x6, x2, w9, sxtb #1 // x + off
sub x9, x2, w9, sxtb #1 // x - off
ld1 {\d1\().4h}, [x6] // p0
add x6, x6, #2*8 // += stride
ld1 {\d2\().4h}, [x9] // p1
add x9, x9, #2*8 // += stride
ld1 {\d1\().d}[1], [x6] // p0
ld1 {\d2\().d}[1], [x9] // p1
.endif
.endm
.macro handle_pixel s1, s2, thresh_vec, shift, tap, min
.if \min
umin v2.8h, v2.8h, \s1\().8h
smax v3.8h, v3.8h, \s1\().8h
umin v2.8h, v2.8h, \s2\().8h
smax v3.8h, v3.8h, \s2\().8h
.endif
uabd v16.8h, v0.8h, \s1\().8h // abs(diff)
uabd v20.8h, v0.8h, \s2\().8h // abs(diff)
ushl v17.8h, v16.8h, \shift // abs(diff) >> shift
ushl v21.8h, v20.8h, \shift // abs(diff) >> shift
uqsub v17.8h, \thresh_vec, v17.8h // clip = imax(0, threshold - (abs(diff) >> shift))
uqsub v21.8h, \thresh_vec, v21.8h // clip = imax(0, threshold - (abs(diff) >> shift))
sub v18.8h, \s1\().8h, v0.8h // diff = p0 - px
sub v22.8h, \s2\().8h, v0.8h // diff = p1 - px
neg v16.8h, v17.8h // -clip
neg v20.8h, v21.8h // -clip
smin v18.8h, v18.8h, v17.8h // imin(diff, clip)
smin v22.8h, v22.8h, v21.8h // imin(diff, clip)
dup v19.8h, \tap // taps[k]
smax v18.8h, v18.8h, v16.8h // constrain() = imax(imin(diff, clip), -clip)
smax v22.8h, v22.8h, v20.8h // constrain() = imax(imin(diff, clip), -clip)
mla v1.8h, v18.8h, v19.8h // sum += taps[k] * constrain()
mla v1.8h, v22.8h, v19.8h // sum += taps[k] * constrain()
.endm
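The comments in handle_pixel spell out the CDEF constrain step; in scalar C it reduces to the following sketch (helper names are illustrative):

static inline int imin(int a, int b) { return a < b ? a : b; }
static inline int imax(int a, int b) { return a > b ? a : b; }

/* clip       = imax(0, threshold - (abs(diff) >> shift))
 * constrain() = imax(imin(diff, clip), -clip)            */
static inline int constrain(int diff, int threshold, int shift)
{
    const int adiff = diff < 0 ? -diff : diff;
    const int clip  = imax(0, threshold - (adiff >> shift));
    return imax(imin(diff, clip), -clip);
}

/* per tap, as in the mla at the end of handle_pixel:
 *   sum += taps[k] * constrain(p - px, strength, shift);  */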
// void dav1d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride,
// const uint16_t *tmp, int pri_strength,
// int sec_strength, int dir, int damping,
// int h, size_t edges);
.macro filter_func w, bpc, pri, sec, min, suffix
function cdef_filter\w\suffix\()_\bpc\()bpc_neon
.if \bpc == 8
ldr w8, [sp] // bitdepth_max
cmp w8, #0xf
b.eq cdef_filter\w\suffix\()_edged_8bpc_neon
.endif
.if \pri
.if \bpc == 16
ldr w9, [sp, #8] // bitdepth_max
clz w9, w9
sub w9, w9, #24 // -bitdepth_min_8
neg w9, w9 // bitdepth_min_8
.endif
movrel x8, pri_taps
.if \bpc == 16
lsr w9, w3, w9 // pri_strength >> bitdepth_min_8
and w9, w9, #1 // (pri_strength >> bitdepth_min_8) & 1
.else
and w9, w3, #1
.endif
add x8, x8, w9, uxtw #1
.endif
movrel x9, directions\w
add x5, x9, w5, uxtw #1
movi v30.4h, #15
dup v28.4h, w6 // damping
.if \pri
dup v25.8h, w3 // threshold
.endif
.if \sec
dup v27.8h, w4 // threshold
.endif
trn1 v24.4h, v25.4h, v27.4h
clz v24.4h, v24.4h // clz(threshold)
sub v24.4h, v30.4h, v24.4h // ulog2(threshold)
uqsub v24.4h, v28.4h, v24.4h // shift = imax(0, damping - ulog2(threshold))
neg v24.4h, v24.4h // -shift
.if \sec
dup v26.8h, v24.h[1]
.endif
.if \pri
dup v24.8h, v24.h[0]
.endif
1:
.if \w == 8
ld1 {v0.8h}, [x2] // px
.else
add x12, x2, #2*8
ld1 {v0.4h}, [x2] // px
ld1 {v0.d}[1], [x12] // px
.endif
movi v1.8h, #0 // sum
.if \min
mov v2.16b, v0.16b // min
mov v3.16b, v0.16b // max
.endif
// Instead of loading sec_taps 2, 1 from memory, just set it
// to 2 initially and decrease for the second round.
// This is also used as loop counter.
mov w11, #2 // sec_taps[0]
2:
.if \pri
ldrb w9, [x5] // off1
load_px v4, v5, \w
.endif
.if \sec
add x5, x5, #4 // +2*2
ldrb w9, [x5] // off2
load_px v6, v7, \w
.endif
.if \pri
ldrb w10, [x8] // *pri_taps
handle_pixel v4, v5, v25.8h, v24.8h, w10, \min
.endif
.if \sec
add x5, x5, #8 // +2*4
ldrb w9, [x5] // off3
load_px v4, v5, \w
handle_pixel v6, v7, v27.8h, v26.8h, w11, \min
handle_pixel v4, v5, v27.8h, v26.8h, w11, \min
sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1;
.else
add x5, x5, #1 // x5 += 1
.endif
subs w11, w11, #1 // sec_tap-- (value)
.if \pri
add x8, x8, #1 // pri_taps++ (pointer)
.endif
b.ne 2b
sshr v4.8h, v1.8h, #15 // -(sum < 0)
add v1.8h, v1.8h, v4.8h // sum - (sum < 0)
srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4
add v0.8h, v0.8h, v1.8h // px + (8 + sum ...) >> 4
.if \min
smin v0.8h, v0.8h, v3.8h
smax v0.8h, v0.8h, v2.8h // iclip(px + .., min, max)
.endif
.if \bpc == 8
xtn v0.8b, v0.8h
.endif
.if \w == 8
add x2, x2, #2*16 // tmp += tmp_stride
subs w7, w7, #1 // h--
.if \bpc == 8
st1 {v0.8b}, [x0], x1
.else
st1 {v0.8h}, [x0], x1
.endif
.else
.if \bpc == 8
st1 {v0.s}[0], [x0], x1
.else
st1 {v0.d}[0], [x0], x1
.endif
add x2, x2, #2*16 // tmp += 2*tmp_stride
subs w7, w7, #2 // h -= 2
.if \bpc == 8
st1 {v0.s}[1], [x0], x1
.else
st1 {v0.d}[1], [x0], x1
.endif
.endif
// Reset pri_taps and directions back to the original point
sub x5, x5, #2
.if \pri
sub x8, x8, #2
.endif
b.gt 1b
ret
endfunc
.endm
.macro filter w, bpc
filter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri
filter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec
filter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec
function cdef_filter\w\()_\bpc\()bpc_neon, export=1
cbnz w3, 1f // pri_strength
b cdef_filter\w\()_sec_\bpc\()bpc_neon // only sec
1:
cbnz w4, 1f // sec_strength
b cdef_filter\w\()_pri_\bpc\()bpc_neon // only pri
1:
b cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // both pri and sec
endfunc
.endm
const div_table
.short 840, 420, 280, 210, 168, 140, 120, 105
endconst
const alt_fact
.short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
endconst
.macro cost_alt d1, d2, s1, s2, s3, s4
smull v22.4s, \s1\().4h, \s1\().4h // sum_alt[n]*sum_alt[n]
smull2 v23.4s, \s1\().8h, \s1\().8h
smull v24.4s, \s2\().4h, \s2\().4h
smull v25.4s, \s3\().4h, \s3\().4h // sum_alt[n]*sum_alt[n]
smull2 v26.4s, \s3\().8h, \s3\().8h
smull v27.4s, \s4\().4h, \s4\().4h
mul v22.4s, v22.4s, v29.4s // sum_alt[n]^2*fact
mla v22.4s, v23.4s, v30.4s
mla v22.4s, v24.4s, v31.4s
mul v25.4s, v25.4s, v29.4s // sum_alt[n]^2*fact
mla v25.4s, v26.4s, v30.4s
mla v25.4s, v27.4s, v31.4s
addv \d1, v22.4s // *cost_ptr
addv \d2, v25.4s // *cost_ptr
.endm
.macro find_best s1, s2, s3
.ifnb \s2
mov w5, \s2\().s[0]
.endif
cmp w4, w1 // cost[n] > best_cost
csel w0, w3, w0, gt // best_dir = n
csel w1, w4, w1, gt // best_cost = cost[n]
.ifnb \s2
add w3, w3, #1 // n++
cmp w5, w1 // cost[n] > best_cost
mov w4, \s3\().s[0]
csel w0, w3, w0, gt // best_dir = n
csel w1, w5, w1, gt // best_cost = cost[n]
add w3, w3, #1 // n++
.endif
.endm
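Taken together, find_best and the epilogue of find_dir below implement a plain argmax over the eight directional costs plus the variance computation noted in the comments; a scalar sketch (function name illustrative):

static int cdef_pick_dir_sketch(const unsigned cost[8], unsigned *var)
{
    int best_dir = 0;
    unsigned best_cost = cost[0];
    for (int n = 1; n < 8; n++)
        if (cost[n] > best_cost) {   /* cost[n] > best_cost   */
            best_dir  = n;           /* best_dir = n          */
            best_cost = cost[n];     /* best_cost = cost[n]   */
        }
    *var = (best_cost - cost[best_dir ^ 4]) >> 10;
    return best_dir;
}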
// int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride,
// unsigned *const var)
.macro find_dir bpc
function cdef_find_dir_\bpc\()bpc_neon, export=1
.if \bpc == 16
str d8, [sp, #-0x10]!
clz w3, w3 // clz(bitdepth_max)
sub w3, w3, #24 // -bitdepth_min_8
dup v8.8h, w3
.endif
sub sp, sp, #32 // cost
mov w3, #8
.if \bpc == 8
movi v31.16b, #128
.else
movi v31.8h, #128
.endif
movi v30.16b, #0
movi v1.8h, #0 // v0-v1 sum_diag[0]
movi v3.8h, #0 // v2-v3 sum_diag[1]
movi v5.8h, #0 // v4-v5 sum_hv[0-1]
movi v7.8h, #0 // v6-v7 sum_alt[0]
movi v17.8h, #0 // v16-v17 sum_alt[1]
movi v18.8h, #0 // v18-v19 sum_alt[2]
movi v19.8h, #0
movi v21.8h, #0 // v20-v21 sum_alt[3]
.irpc i, 01234567
.if \bpc == 8
ld1 {v26.8b}, [x0], x1
usubl v26.8h, v26.8b, v31.8b
.else
ld1 {v26.8h}, [x0], x1
ushl v26.8h, v26.8h, v8.8h
sub v26.8h, v26.8h, v31.8h
.endif
addv h25, v26.8h // [y]
rev64 v27.8h, v26.8h
addp v28.8h, v26.8h, v30.8h // [(x >> 1)]
add v5.8h, v5.8h, v26.8h // sum_hv[1]
ext v27.16b, v27.16b, v27.16b, #8 // [-x]
rev64 v29.4h, v28.4h // [-(x >> 1)]
ins v4.h[\i], v25.h[0] // sum_hv[0]
.if \i == 0
mov v0.16b, v26.16b // sum_diag[0]
mov v2.16b, v27.16b // sum_diag[1]
mov v6.16b, v28.16b // sum_alt[0]
mov v16.16b, v29.16b // sum_alt[1]
.else
ext v22.16b, v30.16b, v26.16b, #(16-2*\i)
ext v23.16b, v26.16b, v30.16b, #(16-2*\i)
ext v24.16b, v30.16b, v27.16b, #(16-2*\i)
ext v25.16b, v27.16b, v30.16b, #(16-2*\i)
add v0.8h, v0.8h, v22.8h // sum_diag[0]
add v1.8h, v1.8h, v23.8h // sum_diag[0]
add v2.8h, v2.8h, v24.8h // sum_diag[1]
add v3.8h, v3.8h, v25.8h // sum_diag[1]
ext v22.16b, v30.16b, v28.16b, #(16-2*\i)
ext v23.16b, v28.16b, v30.16b, #(16-2*\i)
ext v24.16b, v30.16b, v29.16b, #(16-2*\i)
ext v25.16b, v29.16b, v30.16b, #(16-2*\i)
add v6.8h, v6.8h, v22.8h // sum_alt[0]
add v7.4h, v7.4h, v23.4h // sum_alt[0]
add v16.8h, v16.8h, v24.8h // sum_alt[1]
add v17.4h, v17.4h, v25.4h // sum_alt[1]
.endif
.if \i < 6
ext v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2)))
ext v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2)))
add v18.8h, v18.8h, v22.8h // sum_alt[2]
add v19.4h, v19.4h, v23.4h // sum_alt[2]
.else
add v18.8h, v18.8h, v26.8h // sum_alt[2]
.endif
.if \i == 0
mov v20.16b, v26.16b // sum_alt[3]
.elseif \i == 1
add v20.8h, v20.8h, v26.8h // sum_alt[3]
.else
ext v24.16b, v30.16b, v26.16b, #(16-2*(\i/2))
ext v25.16b, v26.16b, v30.16b, #(16-2*(\i/2))
add v20.8h, v20.8h, v24.8h // sum_alt[3]
add v21.4h, v21.4h, v25.4h // sum_alt[3]
.endif
.endr
movi v31.4s, #105
smull v26.4s, v4.4h, v4.4h // sum_hv[0]*sum_hv[0]
smlal2 v26.4s, v4.8h, v4.8h
smull v27.4s, v5.4h, v5.4h // sum_hv[1]*sum_hv[1]
smlal2 v27.4s, v5.8h, v5.8h
mul v26.4s, v26.4s, v31.4s // cost[2] *= 105
mul v27.4s, v27.4s, v31.4s // cost[6] *= 105
addv s4, v26.4s // cost[2]
addv s5, v27.4s // cost[6]
rev64 v1.8h, v1.8h
rev64 v3.8h, v3.8h
ext v1.16b, v1.16b, v1.16b, #10 // sum_diag[0][14-n]
ext v3.16b, v3.16b, v3.16b, #10 // sum_diag[1][14-n]
str s4, [sp, #2*4] // cost[2]
str s5, [sp, #6*4] // cost[6]
movrel x4, div_table
ld1 {v31.8h}, [x4]
smull v22.4s, v0.4h, v0.4h // sum_diag[0]*sum_diag[0]
smull2 v23.4s, v0.8h, v0.8h
smlal v22.4s, v1.4h, v1.4h
smlal2 v23.4s, v1.8h, v1.8h
smull v24.4s, v2.4h, v2.4h // sum_diag[1]*sum_diag[1]
smull2 v25.4s, v2.8h, v2.8h
smlal v24.4s, v3.4h, v3.4h
smlal2 v25.4s, v3.8h, v3.8h
uxtl v30.4s, v31.4h // div_table
uxtl2 v31.4s, v31.8h
mul v22.4s, v22.4s, v30.4s // cost[0]
mla v22.4s, v23.4s, v31.4s // cost[0]
mul v24.4s, v24.4s, v30.4s // cost[4]
mla v24.4s, v25.4s, v31.4s // cost[4]
addv s0, v22.4s // cost[0]
addv s2, v24.4s // cost[4]
movrel x5, alt_fact
ld1 {v29.4h, v30.4h, v31.4h}, [x5]// div_table[2*m+1] + 105
str s0, [sp, #0*4] // cost[0]
str s2, [sp, #4*4] // cost[4]
uxtl v29.4s, v29.4h // div_table[2*m+1] + 105
uxtl v30.4s, v30.4h
uxtl v31.4s, v31.4h
cost_alt s6, s16, v6, v7, v16, v17 // cost[1], cost[3]
cost_alt s18, s20, v18, v19, v20, v21 // cost[5], cost[7]
str s6, [sp, #1*4] // cost[1]
str s16, [sp, #3*4] // cost[3]
mov w0, #0 // best_dir
mov w1, v0.s[0] // best_cost
mov w3, #1 // n
str s18, [sp, #5*4] // cost[5]
str s20, [sp, #7*4] // cost[7]
mov w4, v6.s[0]
find_best v6, v4, v16
find_best v16, v2, v18
find_best v18, v5, v20
find_best v20
eor w3, w0, #4 // best_dir ^4
ldr w4, [sp, w3, uxtw #2]
sub w1, w1, w4 // best_cost - cost[best_dir ^ 4]
lsr w1, w1, #10
str w1, [x2] // *var
add sp, sp, #32
.if \bpc == 16
ldr d8, [sp], 0x10
.endif
ret
endfunc
.endm

third_party/dav1d/src/arm/64/itx.S (vendored, 366 lines changed)

@ -161,31 +161,6 @@ endconst
.endif
.endm
.macro scale_wide sz, c, r0, r1, r2 r3, r4, r5, r6, r7
smull_sz v2, v3, \r0, \c, \sz
smull_sz v4, v5, \r1, \c, \sz
smull_sz v6, v7, \r2, \c, \sz
rshrn_sz \r0, v2, v3, #12, \sz
smull_sz v2, v3, \r3, \c, \sz
rshrn_sz \r1, v4, v5, #12, \sz
.ifnb \r4
smull_sz v4, v5, \r4, \c, \sz
.endif
rshrn_sz \r2, v6, v7, #12, \sz
.ifnb \r4
smull_sz v6, v7, \r5, \c, \sz
.endif
rshrn_sz \r3, v2, v3, #12, \sz
.ifnb \r4
smull_sz v2, v3, \r6, \c, \sz
rshrn_sz \r4, v4, v5, #12, \sz
smull_sz v4, v5, \r7, \c, \sz
rshrn_sz \r5, v6, v7, #12, \sz
rshrn_sz \r6, v2, v3, #12, \sz
rshrn_sz \r7, v4, v5, #12, \sz
.endif
.endm
.macro load_add_store load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src, shiftbits=4
.ifnb \load
ld1 {\load}, [\src], x1
@ -599,41 +574,40 @@ function inv_flipadst_8x4_neon
endfunc
function inv_identity_4x4_neon
mov w16, #5793
mov w16, #(5793-4096)*8
dup v0.4h, w16
smull v4.4s, v16.4h, v0.h[0]
smull v5.4s, v17.4h, v0.h[0]
smull v6.4s, v18.4h, v0.h[0]
smull v7.4s, v19.4h, v0.h[0]
rshrn v16.4h, v4.4s, #12
rshrn v17.4h, v5.4s, #12
rshrn v18.4h, v6.4s, #12
rshrn v19.4h, v7.4s, #12
sqrdmulh v4.4h, v16.4h, v0.h[0]
sqrdmulh v5.4h, v17.4h, v0.h[0]
sqrdmulh v6.4h, v18.4h, v0.h[0]
sqrdmulh v7.4h, v19.4h, v0.h[0]
sqadd v16.4h, v16.4h, v4.4h
sqadd v17.4h, v17.4h, v5.4h
sqadd v18.4h, v18.4h, v6.4h
sqadd v19.4h, v19.4h, v7.4h
ret
endfunc
function inv_identity_8x4_neon
mov w16, #5793
mov w16, #(5793-4096)*8
dup v0.4h, w16
smull v2.4s, v16.4h, v0.h[0]
smull2 v3.4s, v16.8h, v0.h[0]
smull v4.4s, v17.4h, v0.h[0]
smull2 v5.4s, v17.8h, v0.h[0]
rshrn v16.4h, v2.4s, #12
rshrn2 v16.8h, v3.4s, #12
smull v6.4s, v18.4h, v0.h[0]
smull2 v7.4s, v18.8h, v0.h[0]
rshrn v17.4h, v4.4s, #12
rshrn2 v17.8h, v5.4s, #12
smull v2.4s, v19.4h, v0.h[0]
smull2 v3.4s, v19.8h, v0.h[0]
rshrn v18.4h, v6.4s, #12
rshrn2 v18.8h, v7.4s, #12
rshrn v19.4h, v2.4s, #12
rshrn2 v19.8h, v3.4s, #12
sqrdmulh v4.8h, v16.8h, v0.h[0]
sqrdmulh v5.8h, v17.8h, v0.h[0]
sqrdmulh v6.8h, v18.8h, v0.h[0]
sqrdmulh v7.8h, v19.8h, v0.h[0]
sqadd v16.8h, v16.8h, v4.8h
sqadd v17.8h, v17.8h, v5.8h
sqadd v18.8h, v18.8h, v6.8h
sqadd v19.8h, v19.8h, v7.8h
ret
endfunc
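The switch from the smull/rshrn pair with constant 5793 to sqrdmulh with (5793-4096)*8 plus a saturating add is a rounding trick: sqrdmulh returns round(a*b / 2^15) with saturation, so encoding only the fractional part of 5793/4096 and adding the input back reproduces round(x*5793/4096) while gaining saturation. A scalar sketch of that reading (helper names are illustrative):

#include <stdint.h>

/* sqrdmulh on int16 lanes: saturating (a * b * 2 + (1 << 15)) >> 16 */
static int16_t sqrdmulh16(int16_t a, int16_t b)
{
    int32_t r = ((int32_t)a * b * 2 + (1 << 15)) >> 16;
    return r > INT16_MAX ? INT16_MAX : r < INT16_MIN ? INT16_MIN : (int16_t)r;
}

/* round(x * 5793 / 4096) == x + round(x * (5793 - 4096) / 4096), and
 * sqrdmulh(x, (5793 - 4096) * 8) computes exactly that second term
 * (13576 * 2 / 65536 == 1697 / 4096).  The assembly does the final add
 * with sqadd so the sum saturates instead of wrapping. */
static int identity4_scale_sketch(int16_t x)
{
    return x + sqrdmulh16(x, (5793 - 4096) * 8);
}

The same pattern, with the doubled constant 2*(5793-4096)*8, shows up in the 8x16/4x16 identity functions further down.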
.macro identity_8x4_shift1 r0, r1, r2, r3, c
.irp i, \r0\().8h, \r1\().8h, \r2\().8h, \r3\().8h
sqrdmulh v2.8h, \i, \c
srhadd \i, \i, v2.8h
.endr
.endm
function inv_txfm_add_wht_wht_4x4_neon, export=1
mov x15, x30
movi v31.8h, #0
@ -877,30 +851,31 @@ function inv_flipadst_4x8_neon
endfunc
function inv_identity_8x8_neon
shl v16.8h, v16.8h, #1
shl v17.8h, v17.8h, #1
shl v18.8h, v18.8h, #1
shl v19.8h, v19.8h, #1
shl v20.8h, v20.8h, #1
shl v21.8h, v21.8h, #1
shl v22.8h, v22.8h, #1
shl v23.8h, v23.8h, #1
sqshl v16.8h, v16.8h, #1
sqshl v17.8h, v17.8h, #1
sqshl v18.8h, v18.8h, #1
sqshl v19.8h, v19.8h, #1
sqshl v20.8h, v20.8h, #1
sqshl v21.8h, v21.8h, #1
sqshl v22.8h, v22.8h, #1
sqshl v23.8h, v23.8h, #1
ret
endfunc
function inv_identity_4x8_neon
shl v16.4h, v16.4h, #1
shl v17.4h, v17.4h, #1
shl v18.4h, v18.4h, #1
shl v19.4h, v19.4h, #1
shl v20.4h, v20.4h, #1
shl v21.4h, v21.4h, #1
shl v22.4h, v22.4h, #1
shl v23.4h, v23.4h, #1
sqshl v16.4h, v16.4h, #1
sqshl v17.4h, v17.4h, #1
sqshl v18.4h, v18.4h, #1
sqshl v19.4h, v19.4h, #1
sqshl v20.4h, v20.4h, #1
sqshl v21.4h, v21.4h, #1
sqshl v22.4h, v22.4h, #1
sqshl v23.4h, v23.4h, #1
ret
endfunc
function inv_txfm_add_8x8_neon
.macro def_fn_8x8_base variant
function inv_txfm_\variant\()add_8x8_neon
movi v28.8h, #0
movi v29.8h, #0
movi v30.8h, #0
@ -910,6 +885,9 @@ function inv_txfm_add_8x8_neon
ld1 {v20.8h,v21.8h,v22.8h,v23.8h}, [x2]
st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2]
.ifc \variant, identity_
// The identity shl #1 and downshift srshr #1 cancel out
.else
blr x4
srshr v16.8h, v16.8h, #1
@ -920,6 +898,7 @@ function inv_txfm_add_8x8_neon
srshr v21.8h, v21.8h, #1
srshr v22.8h, v22.8h, #1
srshr v23.8h, v23.8h, #1
.endif
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
@ -928,6 +907,10 @@ function inv_txfm_add_8x8_neon
load_add_store_8x8 x0, x7
br x15
endfunc
.endm
def_fn_8x8_base
def_fn_8x8_base identity_
.macro def_fn_8x8 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_neon, export=1
@ -936,9 +919,13 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
idct_dc 8, 8, 1
.endif
adr x4, inv_\txfm1\()_8x8_neon
adr x5, inv_\txfm2\()_8x8_neon
.ifc \txfm1, identity
b inv_txfm_identity_add_8x8_neon
.else
adr x4, inv_\txfm1\()_8x8_neon
b inv_txfm_add_8x8_neon
.endif
endfunc
.endm
@ -1083,9 +1070,12 @@ def_fns_48 8, 4
rshrn_sz v27, v6, v7, #12, \sz // t14a
smull_smlsl v4, v5, v29, v23, v0.h[2], v0.h[3], \sz // -> t13a
neg v29\sz, v29\sz
smull_smlsl v6, v7, v29, v23, v0.h[3], v0.h[2], \sz // -> t10a
smull_smlal v6, v7, v29, v23, v0.h[3], v0.h[2], \sz // -> t10a
rshrn_sz v29, v4, v5, #12, \sz // t13a
neg v6.4s, v6.4s
.ifc \sz, .8h
neg v7.4s, v7.4s
.endif
rshrn_sz v23, v6, v7, #12, \sz // t10a
sqsub v2\sz, v17\sz, v19\sz // t11a
@ -1333,27 +1323,59 @@ function inv_flipadst_4x16_neon
endfunc
function inv_identity_8x16_neon
mov w16, #2*5793
mov w16, #2*(5793-4096)*8
dup v0.4h, w16
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
smull v2.4s, v\i\().4h, v0.h[0]
smull2 v3.4s, v\i\().8h, v0.h[0]
rshrn v\i\().4h, v2.4s, #12
rshrn2 v\i\().8h, v3.4s, #12
sqrdmulh v2.8h, v\i\().8h, v0.h[0]
sqadd v\i\().8h, v\i\().8h, v\i\().8h
sqadd v\i\().8h, v\i\().8h, v2.8h
.endr
ret
endfunc
function inv_identity_4x16_neon
mov w16, #2*5793
mov w16, #2*(5793-4096)*8
dup v0.4h, w16
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
smull v2.4s, v\i\().4h, v0.h[0]
rshrn v\i\().4h, v2.4s, #12
sqrdmulh v2.4h, v\i\().4h, v0.h[0]
sqadd v\i\().4h, v\i\().4h, v\i\().4h
sqadd v\i\().4h, v\i\().4h, v2.4h
.endr
ret
endfunc
.macro identity_8x16_shift2 c
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
sqrdmulh v2.8h, \i, \c
sshr v2.8h, v2.8h, #1
srhadd \i, \i, v2.8h
.endr
.endm
.macro identity_8x16_shift1 c
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
sqrdmulh v2.8h, \i, \c
srshr v2.8h, v2.8h, #1
sqadd \i, \i, v2.8h
.endr
.endm
.macro identity_8x8_shift1 c
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
sqrdmulh v2.8h, \i, \c
srshr v2.8h, v2.8h, #1
sqadd \i, \i, v2.8h
.endr
.endm
.macro identity_8x8 c
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
sqrdmulh v2.8h, \i, \c
sqadd \i, \i, \i
sqadd \i, \i, v2.8h
.endr
.endm
function inv_txfm_horz_16x8_neon
mov x14, x30
movi v7.8h, #0
@ -1375,6 +1397,26 @@ function inv_txfm_horz_16x8_neon
br x14
endfunc
function inv_txfm_horz_identity_16x8_neon
mov x14, x30
movi v7.8h, #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
ld1 {v\i\().8h}, [x7]
st1 {v7.8h}, [x7], x8
.endr
mov w16, #2*(5793-4096)*8
dup v0.4h, w16
identity_8x16_shift2 v0.h[0]
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5
.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
st1 {v\i\().8h}, [x6], #16
.endr
br x14
endfunc
function inv_txfm_horz_scale_16x8_neon
mov x14, x30
movi v7.8h, #0
@ -1421,7 +1463,7 @@ function inv_txfm_add_16x16_neon
.endif
add x7, x2, #(\i*2)
mov x8, #16*2
bl inv_txfm_horz_16x8_neon
blr x9
.endr
b 2f
1:
@ -1449,7 +1491,12 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
idct_dc 16, 16, 2
.endif
.ifc \txfm1, identity
adr x9, inv_txfm_horz_identity_16x8_neon
.else
adr x9, inv_txfm_horz_16x8_neon
adr x4, inv_\txfm1\()_8x16_neon
.endif
adr x5, inv_\txfm2\()_8x16_neon
mov x13, #\eob_half
b inv_txfm_add_16x16_neon
@ -1469,12 +1516,35 @@ def_fn_16x16 flipadst, adst, 36
def_fn_16x16 flipadst, flipadst, 36
def_fn_16x16 identity, dct, 8
function inv_txfm_add_16x4_neon
.macro def_fn_416_base variant
function inv_txfm_\variant\()add_16x4_neon
mov x15, x30
movi v4.8h, #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
ld1 {v\i\().4h}, [x2]
.ifc \variant, identity_
.irp i, v16.4h, v17.4h, v18.4h, v19.4h
ld1 {\i}, [x2]
st1 {v4.4h}, [x2], #8
.endr
.irp i, v16.d, v17.d, v18.d, v19.d
ld1 {\i}[1], [x2]
st1 {v4.4h}, [x2], #8
.endr
mov w16, #2*(5793-4096)*8
dup v0.4h, w16
.irp i, v20.4h, v21.4h, v22.4h, v23.4h
ld1 {\i}, [x2]
st1 {v4.4h}, [x2], #8
.endr
.irp i, v20.d, v21.d, v22.d, v23.d
ld1 {\i}[1], [x2]
st1 {v4.4h}, [x2], #8
.endr
identity_8x16_shift1 v0.h[0]
.else
.irp i, v16.4h, v17.4h, v18.4h, v19.4h, v20.4h, v21.4h, v22.4h, v23.4h, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h
ld1 {\i}, [x2]
st1 {v4.4h}, [x2], #8
.endr
@ -1484,14 +1554,21 @@ function inv_txfm_add_16x4_neon
ins v17.d[1], v21.d[0]
ins v18.d[1], v22.d[0]
ins v19.d[1], v23.d[0]
.irp i, 16, 17, 18, 19
srshr v\i\().8h, v\i\().8h, #1
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
srshr \i, \i, #1
.endr
.endif
transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
blr x5
mov x6, x0
load_add_store_8x4 x6, x7
.ifc \variant, identity_
mov v16.16b, v20.16b
mov v17.16b, v21.16b
mov v18.16b, v22.16b
mov v19.16b, v23.16b
.else
ins v24.d[1], v28.d[0]
ins v25.d[1], v29.d[0]
ins v26.d[1], v30.d[0]
@ -1500,6 +1577,7 @@ function inv_txfm_add_16x4_neon
srshr v17.8h, v25.8h, #1
srshr v18.8h, v26.8h, #1
srshr v19.8h, v27.8h, #1
.endif
transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
blr x5
add x6, x0, #8
@ -1508,7 +1586,7 @@ function inv_txfm_add_16x4_neon
br x15
endfunc
function inv_txfm_add_4x16_neon
function inv_txfm_\variant\()add_4x16_neon
mov x15, x30
movi v2.8h, #0
@ -1517,8 +1595,17 @@ function inv_txfm_add_4x16_neon
b.lt 1f
add x6, x2, #16
.irp i, 16, 17, 18, 19
ld1 {v\i\().8h}, [x6]
.ifc \variant, identity_
.irp i, v24.8h, v25.8h, v26.8h, v27.8h
ld1 {\i}, [x6]
st1 {v2.8h}, [x6], x11
.endr
mov w16, #(5793-4096)*8
dup v0.4h, w16
identity_8x4_shift1 v24, v25, v26, v27, v0.h[0]
.else
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
ld1 {\i}, [x6]
st1 {v2.8h}, [x6], x11
.endr
blr x4
@ -1526,6 +1613,7 @@ function inv_txfm_add_4x16_neon
srshr v25.8h, v17.8h, #1
srshr v26.8h, v18.8h, #1
srshr v27.8h, v19.8h, #1
.endif
transpose_4x8h v24, v25, v26, v27, v4, v5, v6, v7
ins v28.d[0], v24.d[1]
ins v29.d[0], v25.d[1]
@ -1534,19 +1622,25 @@ function inv_txfm_add_4x16_neon
b 2f
1:
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
movi v\i\().4h, #0
.irp i, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h
movi \i, #0
.endr
2:
movi v2.8h, #0
.irp i, 16, 17, 18, 19
ld1 {v\i\().8h}, [x2]
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
ld1 {\i}, [x2]
st1 {v2.8h}, [x2], x11
.endr
.ifc \variant, identity_
mov w16, #(5793-4096)*8
dup v0.4h, w16
identity_8x4_shift1 v16, v17, v18, v19, v0.h[0]
.else
blr x4
.irp i, 16, 17, 18, 19
srshr v\i\().8h, v\i\().8h, #1
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
srshr \i, \i, #1
.endr
.endif
transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
ins v20.d[0], v16.d[1]
ins v21.d[0], v17.d[1]
@ -1559,6 +1653,10 @@ function inv_txfm_add_4x16_neon
br x15
endfunc
.endm
def_fn_416_base
def_fn_416_base identity_
.macro def_fn_416 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1
@ -1573,7 +1671,11 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1
adr x4, inv_\txfm1\()_4x\w\()_neon
adr x5, inv_\txfm2\()_8x\h\()_neon
.endif
.ifc \txfm1, identity
b inv_txfm_identity_add_\w\()x\h\()_neon
.else
b inv_txfm_add_\w\()x\h\()_neon
.endif
endfunc
.endm
@ -1600,24 +1702,31 @@ def_fns_416 4, 16
def_fns_416 16, 4
function inv_txfm_add_16x8_neon
.macro def_fn_816_base variant
function inv_txfm_\variant\()add_16x8_neon
mov x15, x30
movi v4.8h, #0
mov w16, #2896*8
dup v0.4h, w16
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
ld1 {v\i\().8h}, [x2]
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
ld1 {\i}, [x2]
st1 {v4.8h}, [x2], #16
.endr
scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
.ifc \variant, identity_
mov w16, #2*(5793-4096)*8
dup v0.4h, w16
identity_8x16_shift1 v0.h[0]
.else
blr x4
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
srshr v\i\().8h, v\i\().8h, #1
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
srshr \i, \i, #1
.endr
.endif
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
blr x5
@ -1625,6 +1734,16 @@ function inv_txfm_add_16x8_neon
mov x6, x0
load_add_store_8x8 x6, x7
.ifc \variant, identity_
mov v16.16b, v24.16b
mov v17.16b, v25.16b
mov v18.16b, v26.16b
mov v19.16b, v27.16b
mov v20.16b, v28.16b
mov v21.16b, v29.16b
mov v22.16b, v30.16b
mov v23.16b, v31.16b
.else
srshr v16.8h, v24.8h, #1
srshr v17.8h, v25.8h, #1
srshr v18.8h, v26.8h, #1
@ -1633,6 +1752,7 @@ function inv_txfm_add_16x8_neon
srshr v21.8h, v29.8h, #1
srshr v22.8h, v30.8h, #1
srshr v23.8h, v31.8h, #1
.endif
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
@ -1644,7 +1764,7 @@ function inv_txfm_add_16x8_neon
br x15
endfunc
function inv_txfm_add_8x16_neon
function inv_txfm_\variant\()add_8x16_neon
mov x15, x30
movi v4.8h, #0
mov w16, #2896*8
@ -1655,8 +1775,16 @@ function inv_txfm_add_8x16_neon
b.lt 1f
add x6, x2, #16
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
ld1 {v\i\().8h}, [x6]
.ifc \variant, identity_
.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
ld1 {\i}, [x6]
st1 {v4.8h}, [x6], x11
.endr
scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
// The identity shl #1 and downshift srshr #1 cancel out
.else
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
ld1 {\i}, [x6]
st1 {v4.8h}, [x6], x11
.endr
scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
@ -1670,13 +1798,14 @@ function inv_txfm_add_8x16_neon
srshr v29.8h, v21.8h, #1
srshr v30.8h, v22.8h, #1
srshr v31.8h, v23.8h, #1
.endif
transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
b 2f
1:
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
movi v\i\().8h, #0
.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
movi \i, #0
.endr
2:
@ -1684,16 +1813,20 @@ function inv_txfm_add_8x16_neon
mov w16, #2896*8
dup v0.4h, w16
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
ld1 {v\i\().8h}, [x2]
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
ld1 {\i}, [x2]
st1 {v4.8h}, [x2], x11
.endr
scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
.ifc \variant, identity_
// The identity shl #1 and downshift srshr #1 cancel out
.else
blr x4
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
srshr v\i\().8h, v\i\().8h, #1
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
srshr \i, \i, #1
.endr
.endif
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
@ -1703,6 +1836,10 @@ function inv_txfm_add_8x16_neon
br x15
endfunc
.endm
def_fn_816_base
def_fn_816_base identity_
.macro def_fn_816 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1
@ -1714,7 +1851,11 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1
.if \w == 8
mov x13, #\eob_half
.endif
.ifc \txfm1, identity
b inv_txfm_identity_add_\w\()x\h\()_neon
.else
b inv_txfm_add_\w\()x\h\()_neon
.endif
endfunc
.endm
@ -2120,7 +2261,7 @@ endfunc
.macro def_identity_1632 w, h, wshort, hshort
function inv_txfm_add_identity_identity_\w\()x\h\()_neon, export=1
mov w16, #2896*8
mov w17, #2*5793
mov w17, #2*(5793-4096)*8
dup v1.4h, w16
movi v0.8h, #0
mov v1.h[1], w17
@ -2140,12 +2281,11 @@ function inv_txfm_add_identity_identity_\w\()x\h\()_neon, export=1
.if \w == 16
// 16x32
scale_wide .8h, v1.h[1], v16, v17, v18, v19, v20, v21, v22, v23
shift_8_regs srshr, 1
identity_8x8_shift1 v1.h[1]
.else
// 32x16
shift_8_regs shl, 1
scale_wide .8h, v1.h[1], v16, v17, v18, v19, v20, v21, v22, v23
shift_8_regs sqshl, 1
identity_8x8 v1.h[1]
.endif
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5

third_party/dav1d/src/arm/64/loopfilter.S (vendored, 14 lines changed)

@ -151,8 +151,8 @@ function lpf_16_wd\wd\()_neon
movi v7.16b, #3
sqxtn v2.8b, v2.8h // f
sqxtn2 v2.16b, v3.8h
sqadd v4.16b, v6.16b, v2.16b // imin(f + 4, 128)
sqadd v5.16b, v7.16b, v2.16b // imin(f + 3, 128)
sqadd v4.16b, v6.16b, v2.16b // imin(f + 4, 127)
sqadd v5.16b, v7.16b, v2.16b // imin(f + 3, 127)
sshr v4.16b, v4.16b, #3 // f1
sshr v5.16b, v5.16b, #3 // f2
uxtl v2.8h, v23.8b // p0
@ -981,13 +981,13 @@ function lpf_h_16_16_neon
br x15
endfunc
// void dav1d_lpf_v_sb_y_neon(pixel *dst, const ptrdiff_t stride,
// const uint32_t *const vmask,
// const uint8_t (*l)[4], ptrdiff_t b4_stride,
// const Av1FilterLUT *lut, const int w)
// void dav1d_lpf_v_sb_y_8bpc_neon(pixel *dst, const ptrdiff_t stride,
// const uint32_t *const vmask,
// const uint8_t (*l)[4], ptrdiff_t b4_stride,
// const Av1FilterLUT *lut, const int w)
.macro lpf_func dir, type
function lpf_\dir\()_sb_\type\()_neon, export=1
function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1
mov x11, x30
stp d8, d9, [sp, #-0x40]!
stp d10, d11, [sp, #0x10]

third_party/dav1d/src/arm/64/loopfilter16.S (vendored, new file, 907 lines)

@ -0,0 +1,907 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2020, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
.macro loop_filter wd
function lpf_8_wd\wd\()_neon
uabd v0.8h, v22.8h, v23.8h // abs(p1 - p0)
uabd v1.8h, v25.8h, v24.8h // abs(q1 - q0)
uabd v2.8h, v23.8h, v24.8h // abs(p0 - q0)
uabd v3.8h, v22.8h, v25.8h // abs(p1 - q1)
.if \wd >= 6
uabd v4.8h, v21.8h, v22.8h // abs(p2 - p1)
uabd v5.8h, v26.8h, v25.8h // abs(q2 - q1)
.endif
.if \wd >= 8
uabd v6.8h, v20.8h, v21.8h // abs(p3 - p2)
uabd v7.8h, v27.8h, v26.8h // abs(q3 - q2)
.endif
.if \wd >= 6
umax v4.8h, v4.8h, v5.8h
.endif
uqadd v2.8h, v2.8h, v2.8h // abs(p0 - q0) * 2
.if \wd >= 8
umax v6.8h, v6.8h, v7.8h
.endif
ushr v3.8h, v3.8h, #1
.if \wd >= 8
umax v4.8h, v4.8h, v6.8h
.endif
.if \wd >= 6
and v4.16b, v4.16b, v14.16b
.endif
umax v0.8h, v0.8h, v1.8h // max(abs(p1 - p0), abs(q1 - q0))
uqadd v2.8h, v2.8h, v3.8h // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
.if \wd >= 6
umax v4.8h, v0.8h, v4.8h
cmhs v1.8h, v11.8h, v4.8h // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) <= I
.else
cmhs v1.8h, v11.8h, v0.8h // max(abs(p1 - p0), abs(q1 - q0)) <= I
.endif
cmhs v2.8h, v10.8h, v2.8h // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E
and v1.16b, v1.16b, v2.16b // fm
and v1.16b, v1.16b, v13.16b // fm && wd >= 4
.if \wd >= 6
and v14.16b, v14.16b, v1.16b // fm && wd > 4
.endif
.if \wd >= 16
and v15.16b, v15.16b, v1.16b // fm && wd == 16
.endif
mov x16, v1.d[0]
mov x17, v1.d[1]
adds x16, x16, x17
b.eq 9f // if (!fm || wd < 4) return;
.if \wd >= 6
movi v10.8h, #1
uabd v2.8h, v21.8h, v23.8h // abs(p2 - p0)
uabd v3.8h, v22.8h, v23.8h // abs(p1 - p0)
uabd v4.8h, v25.8h, v24.8h // abs(q1 - q0)
uabd v5.8h, v26.8h, v24.8h // abs(q2 - q0)
dup v9.8h, w9 // bitdepth_min_8
.if \wd >= 8
uabd v6.8h, v20.8h, v23.8h // abs(p3 - p0)
uabd v7.8h, v27.8h, v24.8h // abs(q3 - q0)
.endif
umax v2.8h, v2.8h, v3.8h
umax v4.8h, v4.8h, v5.8h
.if \wd >= 8
umax v6.8h, v6.8h, v7.8h
.endif
umax v2.8h, v2.8h, v4.8h
ushl v10.8h, v10.8h, v9.8h // F = 1 << bitdepth_min_8
.if \wd >= 8
umax v2.8h, v2.8h, v6.8h
.endif
.if \wd == 16
uabd v3.8h, v17.8h, v23.8h // abs(p6 - p0)
uabd v4.8h, v18.8h, v23.8h // abs(p5 - p0)
uabd v5.8h, v19.8h, v23.8h // abs(p4 - p0)
.endif
cmhs v2.8h, v10.8h, v2.8h // flat8in
.if \wd == 16
uabd v6.8h, v28.8h, v24.8h // abs(q4 - q0)
uabd v7.8h, v29.8h, v24.8h // abs(q5 - q0)
uabd v8.8h, v30.8h, v24.8h // abs(q6 - q0)
.endif
and v14.16b, v2.16b, v14.16b // flat8in && fm && wd > 4
bic v1.16b, v1.16b, v14.16b // fm && wd >= 4 && !flat8in
.if \wd == 16
umax v3.8h, v3.8h, v4.8h
umax v5.8h, v5.8h, v6.8h
.endif
mov x16, v1.d[0]
mov x17, v1.d[1]
.if \wd == 16
umax v7.8h, v7.8h, v8.8h
umax v3.8h, v3.8h, v5.8h
umax v3.8h, v3.8h, v7.8h
cmhs v3.8h, v10.8h, v3.8h // flat8out
.endif
adds x16, x16, x17
.if \wd == 16
and v15.16b, v15.16b, v3.16b // flat8out && fm && wd == 16
and v15.16b, v15.16b, v14.16b // flat8out && flat8in && fm && wd == 16
bic v14.16b, v14.16b, v15.16b // flat8in && fm && wd >= 4 && !flat8out
.endif
b.eq 1f // skip wd == 4 case
.endif
dup v3.8h, w8 // bitdepth_max
sub v2.8h, v22.8h, v25.8h // p1 - q1
ushr v3.8h, v3.8h, #1 // 128 << bitdepth_min_8 - 1
cmhi v0.8h, v0.8h, v12.8h // hev
not v9.16b, v3.16b // - 128 * (1 << bitdepth_min_8)
smin v2.8h, v2.8h, v3.8h // iclip_diff(p1 - q1)
smax v2.8h, v2.8h, v9.8h // iclip_diff(p1 - q1)
and v4.16b, v2.16b, v0.16b // if (hev) iclip_diff(p1 - q1)
sub v2.8h, v24.8h, v23.8h
movi v5.8h, #3
bic v0.16b, v1.16b, v0.16b // (fm && wd >= 4 && !hev)
mul v2.8h, v2.8h, v5.8h
movi v6.8h, #4
add v2.8h, v2.8h, v4.8h
smin v2.8h, v2.8h, v3.8h // f = iclip_diff()
movi v7.8h, #3
smax v2.8h, v2.8h, v9.8h // f = iclip_diff()
sqadd v4.8h, v6.8h, v2.8h // f + 4
sqadd v5.8h, v7.8h, v2.8h // f + 3
smin v4.8h, v4.8h, v3.8h // imin(f + 4, 128 << bitdepth_min_8 - 1)
smin v5.8h, v5.8h, v3.8h // imin(f + 3, 128 << bitdepth_min_8 - 1)
sshr v4.8h, v4.8h, #3 // f1
sshr v5.8h, v5.8h, #3 // f2
movi v9.8h, #0
dup v3.8h, w8 // bitdepth_max
sqadd v2.8h, v23.8h, v5.8h // p0 + f2
sqsub v6.8h, v24.8h, v4.8h // q0 - f1
srshr v4.8h, v4.8h, #1 // (f1 + 1) >> 1
smin v2.8h, v2.8h, v3.8h // out p0 = iclip_pixel()
smin v6.8h, v6.8h, v3.8h // out q0 = iclip_pixel()
smax v2.8h, v2.8h, v9.8h // out p0 = iclip_pixel()
smax v6.8h, v6.8h, v9.8h // out q0 = iclip_pixel()
bit v23.16b, v2.16b, v1.16b // if (fm && wd >= 4)
bit v24.16b, v6.16b, v1.16b // if (fm && wd >= 4)
sqadd v2.8h, v22.8h, v4.8h // p1 + f
sqsub v6.8h, v25.8h, v4.8h // q1 - f
smin v2.8h, v2.8h, v3.8h // out p1 = iclip_pixel()
smin v6.8h, v6.8h, v3.8h // out q1 = iclip_pixel()
smax v2.8h, v2.8h, v9.8h // out p1 = iclip_pixel()
smax v6.8h, v6.8h, v9.8h // out q1 = iclip_pixel()
bit v22.16b, v2.16b, v0.16b // if (fm && wd >= 4 && !hev)
bit v25.16b, v6.16b, v0.16b // if (fm && wd >= 4 && !hev)
1:
.if \wd == 6
mov x16, v14.d[0]
mov x17, v14.d[1]
adds x16, x16, x17
b.eq 2f // skip if there's no flat8in
add v0.8h, v21.8h, v21.8h // p2 * 2
add v2.8h, v21.8h, v22.8h // p2 + p1
add v4.8h, v22.8h, v23.8h // p1 + p0
add v6.8h, v23.8h, v24.8h // p0 + q0
add v8.8h, v0.8h, v2.8h
add v10.8h, v4.8h, v6.8h
add v12.8h, v24.8h, v25.8h // q0 + q1
add v8.8h, v8.8h, v10.8h
sub v12.8h, v12.8h, v0.8h
add v10.8h, v25.8h, v26.8h // q1 + q2
urshr v0.8h, v8.8h, #3 // out p1
add v8.8h, v8.8h, v12.8h
sub v10.8h, v10.8h, v2.8h
add v12.8h, v26.8h, v26.8h // q2 + q2
urshr v1.8h, v8.8h, #3 // out p0
add v8.8h, v8.8h, v10.8h
sub v12.8h, v12.8h, v4.8h
urshr v2.8h, v8.8h, #3 // out q0
bit v22.16b, v0.16b, v14.16b // p1 if (flat8in)
add v8.8h, v8.8h, v12.8h
bit v23.16b, v1.16b, v14.16b // p0 if (flat8in)
urshr v3.8h, v8.8h, #3 // out q1
bit v24.16b, v2.16b, v14.16b // q0 if (flat8in)
bit v25.16b, v3.16b, v14.16b // q1 if (flat8in)
.elseif \wd >= 8
mov x16, v14.d[0]
mov x17, v14.d[1]
adds x16, x16, x17
.if \wd == 8
b.eq 8f // skip if there's no flat8in
.else
b.eq 2f // skip if there's no flat8in
.endif
add v0.8h, v20.8h, v21.8h // p3 + p2
add v2.8h, v22.8h, v25.8h // p1 + q1
add v4.8h, v20.8h, v22.8h // p3 + p1
add v6.8h, v23.8h, v26.8h // p0 + q2
add v8.8h, v0.8h, v0.8h // 2 * (p3 + p2)
add v9.8h, v23.8h, v24.8h // p0 + q0
add v8.8h, v8.8h, v4.8h // + p3 + p1
sub v2.8h, v2.8h, v0.8h // p1 + q1 - p3 - p2
add v8.8h, v8.8h, v9.8h // + p0 + q0
sub v6.8h, v6.8h, v4.8h // p0 + q2 - p3 - p1
urshr v10.8h, v8.8h, #3 // out p2
add v8.8h, v8.8h, v2.8h
add v0.8h, v20.8h, v23.8h // p3 + p0
add v2.8h, v24.8h, v27.8h // q0 + q3
urshr v11.8h, v8.8h, #3 // out p1
add v8.8h, v8.8h, v6.8h
sub v2.8h, v2.8h, v0.8h // q0 + q3 - p3 - p0
add v4.8h, v21.8h, v24.8h // p2 + q0
add v6.8h, v25.8h, v27.8h // q1 + q3
urshr v12.8h, v8.8h, #3 // out p0
add v8.8h, v8.8h, v2.8h
sub v6.8h, v6.8h, v4.8h // q1 + q3 - p2 - q0
add v0.8h, v22.8h, v25.8h // p1 + q1
add v2.8h, v26.8h, v27.8h // q2 + q3
urshr v13.8h, v8.8h, #3 // out q0
add v8.8h, v8.8h, v6.8h
sub v2.8h, v2.8h, v0.8h // q2 + q3 - p1 - q1
urshr v0.8h, v8.8h, #3 // out q1
add v8.8h, v8.8h, v2.8h
bit v21.16b, v10.16b, v14.16b
bit v22.16b, v11.16b, v14.16b
bit v23.16b, v12.16b, v14.16b
urshr v1.8h, v8.8h, #3 // out q2
bit v24.16b, v13.16b, v14.16b
bit v25.16b, v0.16b, v14.16b
bit v26.16b, v1.16b, v14.16b
.endif
2:
.if \wd == 16
mov x16, v15.d[0]
mov x17, v15.d[1]
adds x16, x16, x17
b.ne 1f // check if flat8out is needed
mov x16, v14.d[0]
mov x17, v14.d[1]
adds x16, x16, x17
b.eq 8f // if there was no flat8in, just write the inner 4 pixels
b 7f // if flat8in was used, write the inner 6 pixels
1:
add v2.8h, v17.8h, v17.8h // p6 + p6
add v4.8h, v17.8h, v18.8h // p6 + p5
add v6.8h, v17.8h, v19.8h // p6 + p4
add v8.8h, v17.8h, v20.8h // p6 + p3
add v12.8h, v2.8h, v4.8h
add v10.8h, v6.8h, v8.8h
add v6.8h, v17.8h, v21.8h // p6 + p2
add v12.8h, v12.8h, v10.8h
add v8.8h, v17.8h, v22.8h // p6 + p1
add v10.8h, v18.8h, v23.8h // p5 + p0
add v6.8h, v6.8h, v8.8h
add v8.8h, v19.8h, v24.8h // p4 + q0
add v12.8h, v12.8h, v6.8h
add v10.8h, v10.8h, v8.8h
add v6.8h, v20.8h, v25.8h // p3 + q1
add v12.8h, v12.8h, v10.8h
sub v6.8h, v6.8h, v2.8h
add v2.8h, v21.8h, v26.8h // p2 + q2
urshr v0.8h, v12.8h, #4 // out p5
add v12.8h, v12.8h, v6.8h // - (p6 + p6) + (p3 + q1)
sub v2.8h, v2.8h, v4.8h
add v4.8h, v22.8h, v27.8h // p1 + q3
add v6.8h, v17.8h, v19.8h // p6 + p4
urshr v1.8h, v12.8h, #4 // out p4
add v12.8h, v12.8h, v2.8h // - (p6 + p5) + (p2 + q2)
sub v4.8h, v4.8h, v6.8h
add v6.8h, v23.8h, v28.8h // p0 + q4
add v8.8h, v17.8h, v20.8h // p6 + p3
urshr v2.8h, v12.8h, #4 // out p3
add v12.8h, v12.8h, v4.8h // - (p6 + p4) + (p1 + q3)
sub v6.8h, v6.8h, v8.8h
add v8.8h, v24.8h, v29.8h // q0 + q5
add v4.8h, v17.8h, v21.8h // p6 + p2
urshr v3.8h, v12.8h, #4 // out p2
add v12.8h, v12.8h, v6.8h // - (p6 + p3) + (p0 + q4)
sub v8.8h, v8.8h, v4.8h
add v6.8h, v25.8h, v30.8h // q1 + q6
add v10.8h, v17.8h, v22.8h // p6 + p1
urshr v4.8h, v12.8h, #4 // out p1
add v12.8h, v12.8h, v8.8h // - (p6 + p2) + (q0 + q5)
sub v6.8h, v6.8h, v10.8h
add v8.8h, v26.8h, v30.8h // q2 + q6
bif v0.16b, v18.16b, v15.16b // out p5
add v10.8h, v18.8h, v23.8h // p5 + p0
urshr v5.8h, v12.8h, #4 // out p0
add v12.8h, v12.8h, v6.8h // - (p6 + p1) + (q1 + q6)
sub v8.8h, v8.8h, v10.8h
add v10.8h, v27.8h, v30.8h // q3 + q6
bif v1.16b, v19.16b, v15.16b // out p4
add v18.8h, v19.8h, v24.8h // p4 + q0
urshr v6.8h, v12.8h, #4 // out q0
add v12.8h, v12.8h, v8.8h // - (p5 + p0) + (q2 + q6)
sub v10.8h, v10.8h, v18.8h
add v8.8h, v28.8h, v30.8h // q4 + q6
bif v2.16b, v20.16b, v15.16b // out p3
add v18.8h, v20.8h, v25.8h // p3 + q1
urshr v7.8h, v12.8h, #4 // out q1
add v12.8h, v12.8h, v10.8h // - (p4 + q0) + (q3 + q6)
sub v18.8h, v8.8h, v18.8h
add v10.8h, v29.8h, v30.8h // q5 + q6
bif v3.16b, v21.16b, v15.16b // out p2
add v20.8h, v21.8h, v26.8h // p2 + q2
urshr v8.8h, v12.8h, #4 // out q2
add v12.8h, v12.8h, v18.8h // - (p3 + q1) + (q4 + q6)
sub v10.8h, v10.8h, v20.8h
add v18.8h, v30.8h, v30.8h // q6 + q6
bif v4.16b, v22.16b, v15.16b // out p1
add v20.8h, v22.8h, v27.8h // p1 + q3
urshr v9.8h, v12.8h, #4 // out q3
add v12.8h, v12.8h, v10.8h // - (p2 + q2) + (q5 + q6)
sub v18.8h, v18.8h, v20.8h
bif v5.16b, v23.16b, v15.16b // out p0
urshr v10.8h, v12.8h, #4 // out q4
add v12.8h, v12.8h, v18.8h // - (p1 + q3) + (q6 + q6)
urshr v11.8h, v12.8h, #4 // out q5
bif v6.16b, v24.16b, v15.16b // out q0
bif v7.16b, v25.16b, v15.16b // out q1
bif v8.16b, v26.16b, v15.16b // out q2
bif v9.16b, v27.16b, v15.16b // out q3
bif v10.16b, v28.16b, v15.16b // out q4
bif v11.16b, v29.16b, v15.16b // out q5
.endif
ret
.if \wd == 16
7:
// Return to a shorter epilogue, writing only the inner 6 pixels
br x13
.endif
.if \wd >= 8
8:
// Return to a shorter epilogue, writing only the inner 4 pixels
br x14
.endif
9:
// Return directly without writing back any pixels
br x15
endfunc
.endm
loop_filter 16
loop_filter 8
loop_filter 6
loop_filter 4
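The wd == 4 core shared by these instantiations follows the standard AV1 narrow filter, as the inline comments indicate; a scalar sketch of that path (helper and function names are illustrative, and the fm/hev decisions are assumed to have been made already):

static inline int iclip(int v, int lo, int hi)
{
    return v < lo ? lo : v > hi ? hi : v;
}

static void lpf_wd4_sketch(int *p1, int *p0, int *q0, int *q1,
                           int hev, int bitdepth_min_8)
{
    const int F   = 128 << bitdepth_min_8;          /* clip half-range */
    const int max = (256 << bitdepth_min_8) - 1;    /* bitdepth_max    */
    int f = 3 * (*q0 - *p0);
    if (hev) f += iclip(*p1 - *q1, -F, F - 1);      /* iclip_diff(p1 - q1) */
    f = iclip(f, -F, F - 1);                        /* f = iclip_diff()    */
    const int f1 = iclip(f + 4, -F, F - 1) >> 3;    /* imin(f + 4, ...)    */
    const int f2 = iclip(f + 3, -F, F - 1) >> 3;    /* imin(f + 3, ...)    */
    *p0 = iclip(*p0 + f2, 0, max);                  /* out p0 */
    *q0 = iclip(*q0 - f1, 0, max);                  /* out q0 */
    if (!hev) {
        const int f3 = (f1 + 1) >> 1;               /* (f1 + 1) >> 1 */
        *p1 = iclip(*p1 + f3, 0, max);              /* out p1 */
        *q1 = iclip(*q1 - f3, 0, max);              /* out q1 */
    }
}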
.macro lpf_8_wd16
adr x13, 7f
adr x14, 8f
bl lpf_8_wd16_neon
.endm
.macro lpf_8_wd8
adr x14, 8f
bl lpf_8_wd8_neon
.endm
.macro lpf_8_wd6
bl lpf_8_wd6_neon
.endm
.macro lpf_8_wd4
bl lpf_8_wd4_neon
.endm
function lpf_v_4_8_neon
mov x15, x30
sub x16, x0, x1, lsl #1
ld1 {v22.8h}, [x16], x1 // p1
ld1 {v24.8h}, [x0], x1 // q0
ld1 {v23.8h}, [x16], x1 // p0
ld1 {v25.8h}, [x0], x1 // q1
sub x0, x0, x1, lsl #1
lpf_8_wd4
sub x16, x0, x1, lsl #1
st1 {v22.8h}, [x16], x1 // p1
st1 {v24.8h}, [x0], x1 // q0
st1 {v23.8h}, [x16], x1 // p0
st1 {v25.8h}, [x0], x1 // q1
sub x0, x0, x1, lsl #1
br x15
endfunc
function lpf_h_4_8_neon
mov x15, x30
sub x16, x0, #4
add x0, x16, x1, lsl #2
ld1 {v22.d}[0], [x16], x1
ld1 {v22.d}[1], [x0], x1
ld1 {v23.d}[0], [x16], x1
ld1 {v23.d}[1], [x0], x1
ld1 {v24.d}[0], [x16], x1
ld1 {v24.d}[1], [x0], x1
ld1 {v25.d}[0], [x16], x1
ld1 {v25.d}[1], [x0], x1
add x0, x0, #4
transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
lpf_8_wd4
sub x16, x0, x1, lsl #3
sub x16, x16, #4
transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
add x0, x16, x1, lsl #2
st1 {v22.d}[0], [x16], x1
st1 {v22.d}[1], [x0], x1
st1 {v23.d}[0], [x16], x1
st1 {v23.d}[1], [x0], x1
st1 {v24.d}[0], [x16], x1
st1 {v24.d}[1], [x0], x1
st1 {v25.d}[0], [x16], x1
st1 {v25.d}[1], [x0], x1
add x0, x0, #4
br x15
endfunc
function lpf_v_6_8_neon
mov x15, x30
sub x16, x0, x1, lsl #1
sub x16, x16, x1
ld1 {v21.8h}, [x16], x1 // p2
ld1 {v24.8h}, [x0], x1 // q0
ld1 {v22.8h}, [x16], x1 // p1
ld1 {v25.8h}, [x0], x1 // q1
ld1 {v23.8h}, [x16], x1 // p0
ld1 {v26.8h}, [x0], x1 // q2
sub x0, x0, x1, lsl #1
sub x0, x0, x1
lpf_8_wd6
sub x16, x0, x1, lsl #1
st1 {v22.8h}, [x16], x1 // p1
st1 {v24.8h}, [x0], x1 // q0
st1 {v23.8h}, [x16], x1 // p0
st1 {v25.8h}, [x0], x1 // q1
sub x0, x0, x1, lsl #1
br x15
endfunc
function lpf_h_6_8_neon
mov x15, x30
sub x16, x0, #8
add x0, x16, x1, lsl #2
ld1 {v20.8h}, [x16], x1
ld1 {v24.8h}, [x0], x1
ld1 {v21.8h}, [x16], x1
ld1 {v25.8h}, [x0], x1
ld1 {v22.8h}, [x16], x1
ld1 {v26.8h}, [x0], x1
ld1 {v23.8h}, [x16], x1
ld1 {v27.8h}, [x0], x1
add x0, x0, #8
transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
lpf_8_wd6
sub x16, x0, x1, lsl #3
sub x16, x16, #4
transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
add x0, x16, x1, lsl #2
st1 {v22.d}[0], [x16], x1
st1 {v22.d}[1], [x0], x1
st1 {v23.d}[0], [x16], x1
st1 {v23.d}[1], [x0], x1
st1 {v24.d}[0], [x16], x1
st1 {v24.d}[1], [x0], x1
st1 {v25.d}[0], [x16], x1
st1 {v25.d}[1], [x0], x1
add x0, x0, #4
br x15
endfunc
function lpf_v_8_8_neon
mov x15, x30
sub x16, x0, x1, lsl #2
ld1 {v20.8h}, [x16], x1 // p3
ld1 {v24.8h}, [x0], x1 // q0
ld1 {v21.8h}, [x16], x1 // p2
ld1 {v25.8h}, [x0], x1 // q1
ld1 {v22.8h}, [x16], x1 // p1
ld1 {v26.8h}, [x0], x1 // q2
ld1 {v23.8h}, [x16], x1 // p0
ld1 {v27.8h}, [x0], x1 // q3
sub x0, x0, x1, lsl #2
lpf_8_wd8
sub x16, x0, x1, lsl #1
sub x16, x16, x1
st1 {v21.8h}, [x16], x1 // p2
st1 {v24.8h}, [x0], x1 // q0
st1 {v22.8h}, [x16], x1 // p1
st1 {v25.8h}, [x0], x1 // q1
st1 {v23.8h}, [x16], x1 // p0
st1 {v26.8h}, [x0], x1 // q2
sub x0, x0, x1, lsl #1
sub x0, x0, x1
br x15
8:
sub x16, x0, x1, lsl #1
st1 {v22.8h}, [x16], x1 // p1
st1 {v24.8h}, [x0], x1 // q0
st1 {v23.8h}, [x16], x1 // p0
st1 {v25.8h}, [x0], x1 // q1
sub x0, x0, x1, lsl #1
br x15
endfunc
function lpf_h_8_8_neon
mov x15, x30
sub x16, x0, #8
add x0, x16, x1, lsl #2
ld1 {v20.8h}, [x16], x1
ld1 {v24.8h}, [x0], x1
ld1 {v21.8h}, [x16], x1
ld1 {v25.8h}, [x0], x1
ld1 {v22.8h}, [x16], x1
ld1 {v26.8h}, [x0], x1
ld1 {v23.8h}, [x16], x1
ld1 {v27.8h}, [x0], x1
add x0, x0, #8
transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
lpf_8_wd8
sub x16, x0, x1, lsl #3
sub x16, x16, #8
transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
add x0, x16, x1, lsl #2
st1 {v20.8h}, [x16], x1
st1 {v24.8h}, [x0], x1
st1 {v21.8h}, [x16], x1
st1 {v25.8h}, [x0], x1
st1 {v22.8h}, [x16], x1
st1 {v26.8h}, [x0], x1
st1 {v23.8h}, [x16], x1
st1 {v27.8h}, [x0], x1
add x0, x0, #8
br x15
8:
sub x16, x0, x1, lsl #3
sub x16, x16, #4
transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
add x0, x16, x1, lsl #2
st1 {v22.d}[0], [x16], x1
st1 {v22.d}[1], [x0], x1
st1 {v23.d}[0], [x16], x1
st1 {v23.d}[1], [x0], x1
st1 {v24.d}[0], [x16], x1
st1 {v24.d}[1], [x0], x1
st1 {v25.d}[0], [x16], x1
st1 {v25.d}[1], [x0], x1
add x0, x0, #4
br x15
endfunc
function lpf_v_16_8_neon
mov x15, x30
sub x16, x0, x1, lsl #3
add x16, x16, x1
ld1 {v17.8h}, [x16], x1 // p6
ld1 {v24.8h}, [x0], x1 // q0
ld1 {v18.8h}, [x16], x1 // p5
ld1 {v25.8h}, [x0], x1 // q1
ld1 {v19.8h}, [x16], x1 // p4
ld1 {v26.8h}, [x0], x1 // q2
ld1 {v20.8h}, [x16], x1 // p3
ld1 {v27.8h}, [x0], x1 // q3
ld1 {v21.8h}, [x16], x1 // p2
ld1 {v28.8h}, [x0], x1 // q4
ld1 {v22.8h}, [x16], x1 // p1
ld1 {v29.8h}, [x0], x1 // q5
ld1 {v23.8h}, [x16], x1 // p0
ld1 {v30.8h}, [x0], x1 // q6
sub x0, x0, x1, lsl #3
add x0, x0, x1
lpf_8_wd16
sub x16, x0, x1, lsl #2
sub x16, x16, x1, lsl #1
st1 {v0.8h}, [x16], x1 // p5
st1 {v6.8h}, [x0], x1 // q0
st1 {v1.8h}, [x16], x1 // p4
st1 {v7.8h}, [x0], x1 // q1
st1 {v2.8h}, [x16], x1 // p3
st1 {v8.8h}, [x0], x1 // q2
st1 {v3.8h}, [x16], x1 // p2
st1 {v9.8h}, [x0], x1 // q3
st1 {v4.8h}, [x16], x1 // p1
st1 {v10.8h}, [x0], x1 // q4
st1 {v5.8h}, [x16], x1 // p0
st1 {v11.8h}, [x0], x1 // q5
sub x0, x0, x1, lsl #2
sub x0, x0, x1, lsl #1
br x15
7:
sub x16, x0, x1
sub x16, x16, x1, lsl #1
st1 {v21.8h}, [x16], x1 // p2
st1 {v24.8h}, [x0], x1 // q0
st1 {v22.8h}, [x16], x1 // p1
st1 {v25.8h}, [x0], x1 // q1
st1 {v23.8h}, [x16], x1 // p0
st1 {v26.8h}, [x0], x1 // q2
sub x0, x0, x1, lsl #1
sub x0, x0, x1
br x15
8:
sub x16, x0, x1, lsl #1
st1 {v22.8h}, [x16], x1 // p1
st1 {v24.8h}, [x0], x1 // q0
st1 {v23.8h}, [x16], x1 // p0
st1 {v25.8h}, [x0], x1 // q1
sub x0, x0, x1, lsl #1
br x15
endfunc
function lpf_h_16_8_neon
mov x15, x30
sub x16, x0, #16
ld1 {v16.8h}, [x16], x1
ld1 {v24.8h}, [x0], x1
ld1 {v17.8h}, [x16], x1
ld1 {v25.8h}, [x0], x1
ld1 {v18.8h}, [x16], x1
ld1 {v26.8h}, [x0], x1
ld1 {v19.8h}, [x16], x1
ld1 {v27.8h}, [x0], x1
ld1 {v20.8h}, [x16], x1
ld1 {v28.8h}, [x0], x1
ld1 {v21.8h}, [x16], x1
ld1 {v29.8h}, [x0], x1
ld1 {v22.8h}, [x16], x1
ld1 {v30.8h}, [x0], x1
ld1 {v23.8h}, [x16], x1
ld1 {v31.8h}, [x0], x1
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
lpf_8_wd16
sub x0, x0, x1, lsl #3
sub x16, x0, #16
transpose_8x8h v16, v17, v0, v1, v2, v3, v4, v5, v18, v19
transpose_8x8h v6, v7, v8, v9, v10, v11, v30, v31, v18, v19
st1 {v16.8h}, [x16], x1
st1 {v6.8h}, [x0], x1
st1 {v17.8h}, [x16], x1
st1 {v7.8h}, [x0], x1
st1 {v0.8h}, [x16], x1
st1 {v8.8h}, [x0], x1
st1 {v1.8h}, [x16], x1
st1 {v9.8h}, [x0], x1
st1 {v2.8h}, [x16], x1
st1 {v10.8h}, [x0], x1
st1 {v3.8h}, [x16], x1
st1 {v11.8h}, [x0], x1
st1 {v4.8h}, [x16], x1
st1 {v30.8h}, [x0], x1
st1 {v5.8h}, [x16], x1
st1 {v31.8h}, [x0], x1
br x15
7:
sub x16, x0, x1, lsl #3
sub x16, x16, #8
transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
add x0, x16, x1, lsl #2
st1 {v20.8h}, [x16], x1
st1 {v24.8h}, [x0], x1
st1 {v21.8h}, [x16], x1
st1 {v25.8h}, [x0], x1
st1 {v22.8h}, [x16], x1
st1 {v26.8h}, [x0], x1
st1 {v23.8h}, [x16], x1
st1 {v27.8h}, [x0], x1
add x0, x0, #8
br x15
8:
sub x16, x0, x1, lsl #3
sub x16, x16, #4
transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
add x0, x16, x1, lsl #2
st1 {v22.d}[0], [x16], x1
st1 {v22.d}[1], [x0], x1
st1 {v23.d}[0], [x16], x1
st1 {v23.d}[1], [x0], x1
st1 {v24.d}[0], [x16], x1
st1 {v24.d}[1], [x0], x1
st1 {v25.d}[0], [x16], x1
st1 {v25.d}[1], [x0], x1
add x0, x0, #4
br x15
endfunc
// void dav1d_lpf_v_sb_y_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const uint32_t *const vmask,
// const uint8_t (*l)[4], ptrdiff_t b4_stride,
// const Av1FilterLUT *lut, const int w,
// const int bitdepth_max)
.macro lpf_func dir, type
function lpf_\dir\()_sb_\type\()_16bpc_neon, export=1
mov x11, x30
mov w8, w7 // bitdepth_max
clz w9, w8
mov w10, #24
sub w9, w10, w9 // bitdepth_min_8
stp d8, d9, [sp, #-0x40]!
stp d10, d11, [sp, #0x10]
stp d12, d13, [sp, #0x20]
stp d14, d15, [sp, #0x30]
ldp w6, w7, [x2] // vmask[0], vmask[1]
.ifc \type, y
ldr w2, [x2, #8] // vmask[2]
.endif
add x5, x5, #128 // Move to sharp part of lut
.ifc \type, y
orr w7, w7, w2 // vmask[1] |= vmask[2]
.endif
.ifc \dir, v
sub x4, x3, x4, lsl #2
.else
sub x3, x3, #4
lsl x4, x4, #2
.endif
orr w6, w6, w7 // vmask[0] |= vmask[1]
1:
tst w6, #0x0f
.ifc \dir, v
ld1 {v0.8b}, [x4], #8
ld1 {v1.8b}, [x3], #8
.else
ld2 {v0.s,v1.s}[0], [x3], x4
ld2 {v0.s,v1.s}[1], [x3], x4
.endif
b.eq 7f // if (!(vm & bits)) continue;
ld1r {v5.8b}, [x5] // sharp[0]
add x5, x5, #8
movi v2.2s, #0xff
dup v13.2s, w6 // vmask[0]
dup v31.8h, w9 // bitdepth_min_8
and v0.8b, v0.8b, v2.8b // Keep only lowest byte in each 32 bit word
and v1.8b, v1.8b, v2.8b
cmtst v3.8b, v1.8b, v2.8b // Check for nonzero values in l[0][0]
movi v4.8b, #1
ld1r {v6.8b}, [x5] // sharp[1]
sub x5, x5, #8
bif v1.8b, v0.8b, v3.8b // if (!l[0][0]) L = l[offset][0]
mul v1.2s, v1.2s, v4.2s // L
.ifc \type, y
dup v15.2s, w2 // vmask[2]
.endif
cmtst v2.2s, v1.2s, v2.2s // L != 0
dup v14.2s, w7 // vmask[1]
mov x16, v2.d[0]
cmp x16, #0
b.eq 7f // if (!L) continue;
neg v5.8b, v5.8b // -sharp[0]
movrel x16, word_12
ushr v12.8b, v1.8b, #4 // H
ld1 {v16.2s}, [x16]
sshl v3.8b, v1.8b, v5.8b // L >> sharp[0]
.ifc \type, y
cmtst v15.2s, v15.2s, v16.2s // if (vmask[2] & bits)
.endif
movi v7.8b, #2
umin v3.8b, v3.8b, v6.8b // imin(L >> sharp[0], sharp[1])
add v0.8b, v1.8b, v7.8b // L + 2
umax v11.8b, v3.8b, v4.8b // imax(imin(), 1) = limit = I
add v0.8b, v0.8b, v0.8b // 2*(L + 2)
cmtst v14.2s, v14.2s, v16.2s // if (vmask[1] & bits)
uxtl v12.8h, v12.8b
add v10.8b, v0.8b, v11.8b // 2*(L + 2) + limit = E
cmtst v13.2s, v13.2s, v16.2s // if (vmask[0] & bits)
uxtl v11.8h, v11.8b
uxtl v10.8h, v10.8b
and v13.8b, v13.8b, v2.8b // vmask[0] &= L != 0
sxtl v14.8h, v14.8b
sxtl v13.8h, v13.8b
.ifc \type, y
sxtl v15.8h, v15.8b
.endif
ushl v12.8h, v12.8h, v31.8h
ushl v11.8h, v11.8h, v31.8h
ushl v10.8h, v10.8h, v31.8h
.ifc \type, y
tst w2, #0x0f
b.eq 2f
// wd16
bl lpf_\dir\()_16_8_neon
b 8f
2:
.endif
tst w7, #0x0f
b.eq 3f
.ifc \type, y
// wd8
bl lpf_\dir\()_8_8_neon
.else
// wd6
bl lpf_\dir\()_6_8_neon
.endif
b 8f
3:
// wd4
bl lpf_\dir\()_4_8_neon
.ifc \dir, h
b 8f
7:
// For dir h, the functions above increment x0.
// If the whole function is skipped, increment it here instead.
add x0, x0, x1, lsl #3
.else
7:
.endif
8:
lsr w6, w6, #2 // vmask[0] >>= 2
lsr w7, w7, #2 // vmask[1] >>= 2
.ifc \type, y
lsr w2, w2, #2 // vmask[2] >>= 2
.endif
.ifc \dir, v
add x0, x0, #16
.else
// For dir h, x0 is returned incremented
.endif
cbnz w6, 1b
ldp d14, d15, [sp, #0x30]
ldp d12, d13, [sp, #0x20]
ldp d10, d11, [sp, #0x10]
ldp d8, d9, [sp], 0x40
br x11
endfunc
.endm
lpf_func v, y
lpf_func h, y
lpf_func v, uv
lpf_func h, uv
const word_12
.word 1, 2
endconst

third_party/dav1d/src/arm/64/looprestoration.S (vendored, 895 lines changed)

@ -28,11 +28,11 @@
#include "src/arm/asm.S"
#include "util.S"
// void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4],
// const pixel *src, ptrdiff_t stride,
// const int16_t fh[7], const intptr_t w,
// int h, enum LrEdgeFlags edges);
function wiener_filter_h_neon, export=1
// void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4],
// const pixel *src, ptrdiff_t stride,
// const int16_t fh[7], const intptr_t w,
// int h, enum LrEdgeFlags edges);
function wiener_filter_h_8bpc_neon, export=1
mov w8, w5
ld1 {v0.8h}, [x4]
mov w9, #(1 << 14) - (1 << 2)
@ -306,11 +306,11 @@ L(variable_shift_tbl):
.purgem filter
endfunc
// void dav1d_wiener_filter_v_neon(pixel *dst, ptrdiff_t stride,
// const int16_t *mid, int w, int h,
// const int16_t fv[7], enum LrEdgeFlags edges,
// ptrdiff_t mid_stride);
function wiener_filter_v_neon, export=1
// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride,
// const int16_t *mid, int w, int h,
// const int16_t fv[7], enum LrEdgeFlags edges,
// ptrdiff_t mid_stride);
function wiener_filter_v_8bpc_neon, export=1
mov w8, w4
ld1 {v0.8h}, [x5]
movi v1.8h, #128
@ -482,9 +482,9 @@ function wiener_filter_v_neon, export=1
.purgem filter
endfunc
// void dav1d_copy_narrow_neon(pixel *dst, ptrdiff_t stride,
// const pixel *src, int w, int h);
function copy_narrow_neon, export=1
// void dav1d_copy_narrow_8bpc_neon(pixel *dst, ptrdiff_t stride,
// const pixel *src, int w, int h);
function copy_narrow_8bpc_neon, export=1
adr x5, L(copy_narrow_tbl)
ldrh w6, [x5, w3, uxtw #1]
sub x5, x5, w6, uxth
@ -617,12 +617,14 @@ endfunc
#define SUM_STRIDE (384+16)
// void dav1d_sgr_box3_h_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box3_h_neon, export=1
#include "looprestoration_tmpl.S"
// void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box3_h_8bpc_neon, export=1
add w5, w5, #2 // w += 2
// Set up pointers for reading/writing alternate rows
@ -844,11 +846,11 @@ L(box3_variable_shift_tbl):
umull2 v6.8h, v4.16b, v4.16b
add3 4
subs w5, w5, #4
st1 {v3.4h}, [x1], #8
st1 {v7.4h}, [x11], #8
st1 {v26.4s}, [x0], #16
st1 {v28.4s}, [x10], #16
subs w5, w5, #4
b.le 9f
ext v0.16b, v0.16b, v0.16b, #4
ext v4.16b, v4.16b, v4.16b, #4
@ -879,12 +881,12 @@ L(box3_variable_shift_tbl):
.purgem add3
endfunc
// void dav1d_sgr_box5_h_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box5_h_neon, export=1
// void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box5_h_8bpc_neon, export=1
add w5, w5, #2 // w += 2
// Set up pointers for reading/writing alternate rows
@ -950,7 +952,7 @@ function sgr_box5_h_neon, export=1
b 2f
0:
// !LR_HAVE_LEFT, fill v1 with the leftmost byte
// and shift v0 to have 2x the first byte at the front.
// and shift v0 to have 3x the first byte at the front.
dup v1.16b, v0.b[0]
dup v5.16b, v4.b[0]
// Move x3 back to account for the last 3 bytes we loaded before,
@ -1114,11 +1116,11 @@ L(box5_variable_shift_tbl):
umull2 v6.8h, v4.16b, v4.16b
add5 4
subs w5, w5, #4
st1 {v3.4h}, [x1], #8
st1 {v7.4h}, [x11], #8
st1 {v26.4s}, [x0], #16
st1 {v28.4s}, [x10], #16
subs w5, w5, #4
b.le 9f
ext v0.16b, v0.16b, v0.16b, #4
ext v1.16b, v1.16b, v2.16b, #8
@ -1147,839 +1149,4 @@ L(box5_variable_shift_tbl):
.purgem add5
endfunc
// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box3_v_neon, export=1
add w10, w3, #2 // Number of output rows to move back
mov w11, w3 // Number of input rows to move back
add w2, w2, #2 // Actual summed width
mov x7, #(4*SUM_STRIDE) // sumsq stride
mov x8, #(2*SUM_STRIDE) // sum stride
sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride
sub x1, x1, #(2*SUM_STRIDE) // sum -= stride
tst w4, #4 // LR_HAVE_TOP
b.eq 0f
// If have top, read from row -2.
sub x5, x0, #(4*SUM_STRIDE)
sub x6, x1, #(2*SUM_STRIDE)
add w11, w11, #2
b 1f
0:
// !LR_HAVE_TOP
// If we don't have top, read from row 0 even if
// we start writing to row -1.
add x5, x0, #(4*SUM_STRIDE)
add x6, x1, #(2*SUM_STRIDE)
1:
tst w4, #8 // LR_HAVE_BOTTOM
b.eq 1f
// LR_HAVE_BOTTOM
add w3, w3, #2 // Sum all h+2 lines with the main loop
add w11, w11, #2
1:
mov w9, w3 // Backup of h for next loops
1:
// Start of horizontal loop; start one vertical filter slice.
// Start loading rows into v16-v21 and v24-v26 taking top
// padding into consideration.
tst w4, #4 // LR_HAVE_TOP
ld1 {v16.4s, v17.4s}, [x5], x7
ld1 {v24.8h}, [x6], x8
b.eq 2f
// LR_HAVE_TOP
ld1 {v18.4s, v19.4s}, [x5], x7
ld1 {v25.8h}, [x6], x8
ld1 {v20.4s, v21.4s}, [x5], x7
ld1 {v26.8h}, [x6], x8
b 3f
2: // !LR_HAVE_TOP
mov v18.16b, v16.16b
mov v19.16b, v17.16b
mov v25.16b, v24.16b
mov v20.16b, v16.16b
mov v21.16b, v17.16b
mov v26.16b, v24.16b
3:
subs w3, w3, #1
.macro add3
add v16.4s, v16.4s, v18.4s
add v17.4s, v17.4s, v19.4s
add v24.8h, v24.8h, v25.8h
add v16.4s, v16.4s, v20.4s
add v17.4s, v17.4s, v21.4s
add v24.8h, v24.8h, v26.8h
st1 {v16.4s, v17.4s}, [x0], x7
st1 {v24.8h}, [x1], x8
.endm
add3
mov v16.16b, v18.16b
mov v17.16b, v19.16b
mov v24.16b, v25.16b
mov v18.16b, v20.16b
mov v19.16b, v21.16b
mov v25.16b, v26.16b
b.le 4f
ld1 {v20.4s, v21.4s}, [x5], x7
ld1 {v26.8h}, [x6], x8
b 3b
4:
tst w4, #8 // LR_HAVE_BOTTOM
b.ne 5f
// !LR_HAVE_BOTTOM
// Produce two more rows, extending the already loaded rows.
add3
mov v16.16b, v18.16b
mov v17.16b, v19.16b
mov v24.16b, v25.16b
add3
5: // End of one vertical slice.
subs w2, w2, #8
b.le 0f
// Move pointers back up to the top and loop horizontally.
// Input pointers
msub x5, x7, x11, x5
msub x6, x8, x11, x6
// Output pointers
msub x0, x7, x10, x0
msub x1, x8, x10, x1
add x0, x0, #32
add x1, x1, #16
add x5, x5, #32
add x6, x6, #16
mov w3, w9
b 1b
0:
ret
.purgem add3
endfunc
// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box5_v_neon, export=1
add w10, w3, #2 // Number of output rows to move back
mov w11, w3 // Number of input rows to move back
add w2, w2, #8 // Actual summed width
mov x7, #(4*SUM_STRIDE) // sumsq stride
mov x8, #(2*SUM_STRIDE) // sum stride
sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride
sub x1, x1, #(2*SUM_STRIDE) // sum -= stride
tst w4, #4 // LR_HAVE_TOP
b.eq 0f
// If have top, read from row -2.
sub x5, x0, #(4*SUM_STRIDE)
sub x6, x1, #(2*SUM_STRIDE)
add w11, w11, #2
b 1f
0:
// !LR_HAVE_TOP
// If we don't have top, read from row 0 even if
// we start writing to row -1.
add x5, x0, #(4*SUM_STRIDE)
add x6, x1, #(2*SUM_STRIDE)
1:
tst w4, #8 // LR_HAVE_BOTTOM
b.eq 0f
// LR_HAVE_BOTTOM
add w3, w3, #2 // Handle h+2 lines with the main loop
add w11, w11, #2
b 1f
0:
// !LR_HAVE_BOTTOM
sub w3, w3, #1 // Handle h-1 lines with the main loop
1:
mov w9, w3 // Backup of h for next loops
1:
// Start of horizontal loop; start one vertical filter slice.
// Start loading rows into v16-v25 and v26-v30 taking top
// padding into consideration.
tst w4, #4 // LR_HAVE_TOP
ld1 {v16.4s, v17.4s}, [x5], x7
ld1 {v26.8h}, [x6], x8
b.eq 2f
// LR_HAVE_TOP
ld1 {v20.4s, v21.4s}, [x5], x7
ld1 {v28.8h}, [x6], x8
mov v18.16b, v16.16b
mov v19.16b, v17.16b
mov v27.16b, v26.16b
ld1 {v22.4s, v23.4s}, [x5], x7
ld1 {v29.8h}, [x6], x8
b 3f
2: // !LR_HAVE_TOP
mov v18.16b, v16.16b
mov v19.16b, v17.16b
mov v27.16b, v26.16b
mov v20.16b, v16.16b
mov v21.16b, v17.16b
mov v28.16b, v26.16b
mov v22.16b, v16.16b
mov v23.16b, v17.16b
mov v29.16b, v26.16b
3:
cbz w3, 4f
ld1 {v24.4s, v25.4s}, [x5], x7
ld1 {v30.8h}, [x6], x8
3:
// Start of vertical loop
subs w3, w3, #2
.macro add5
add v16.4s, v16.4s, v18.4s
add v17.4s, v17.4s, v19.4s
add v26.8h, v26.8h, v27.8h
add v0.4s, v20.4s, v22.4s
add v1.4s, v21.4s, v23.4s
add v2.8h, v28.8h, v29.8h
add v16.4s, v16.4s, v24.4s
add v17.4s, v17.4s, v25.4s
add v26.8h, v26.8h, v30.8h
add v16.4s, v16.4s, v0.4s
add v17.4s, v17.4s, v1.4s
add v26.8h, v26.8h, v2.8h
st1 {v16.4s, v17.4s}, [x0], x7
st1 {v26.8h}, [x1], x8
.endm
add5
.macro shift2
mov v16.16b, v20.16b
mov v17.16b, v21.16b
mov v26.16b, v28.16b
mov v18.16b, v22.16b
mov v19.16b, v23.16b
mov v27.16b, v29.16b
mov v20.16b, v24.16b
mov v21.16b, v25.16b
mov v28.16b, v30.16b
.endm
shift2
add x0, x0, x7
add x1, x1, x8
b.le 5f
ld1 {v22.4s, v23.4s}, [x5], x7
ld1 {v29.8h}, [x6], x8
ld1 {v24.4s, v25.4s}, [x5], x7
ld1 {v30.8h}, [x6], x8
b 3b
4:
// h == 1, !LR_HAVE_BOTTOM.
// Pad the last row with the only content row, and add.
mov v24.16b, v22.16b
mov v25.16b, v23.16b
mov v30.16b, v29.16b
add5
shift2
add x0, x0, x7
add x1, x1, x8
add5
b 6f
5:
tst w4, #8 // LR_HAVE_BOTTOM
b.ne 6f
// !LR_HAVE_BOTTOM
cbnz w3, 5f
// The intended three edge rows left; output the one at h-2 and
// the past edge one at h.
ld1 {v22.4s, v23.4s}, [x5], x7
ld1 {v29.8h}, [x6], x8
// Pad the past-edge row from the last content row.
mov v24.16b, v22.16b
mov v25.16b, v23.16b
mov v30.16b, v29.16b
add5
shift2
add x0, x0, x7
add x1, x1, x8
// The last two rows are already padded properly here.
add5
b 6f
5:
// w3 == -1, two rows left, output one.
// Pad the last two rows from the mid one.
mov v22.16b, v20.16b
mov v23.16b, v21.16b
mov v29.16b, v28.16b
mov v24.16b, v20.16b
mov v25.16b, v21.16b
mov v30.16b, v28.16b
add5
add x0, x0, x7
add x1, x1, x8
b 6f
6: // End of one vertical slice.
subs w2, w2, #8
b.le 0f
// Move pointers back up to the top and loop horizontally.
// Input pointers
msub x5, x7, x11, x5
msub x6, x8, x11, x6
// Output pointers
msub x0, x7, x10, x0
msub x1, x8, x10, x1
add x0, x0, #32
add x1, x1, #16
add x5, x5, #32
add x6, x6, #16
mov w3, w9
b 1b
0:
ret
.purgem add5
endfunc
// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
// const int w, const int h, const int strength);
// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
// const int w, const int h, const int strength);
function sgr_calc_ab1_neon, export=1
add x3, x3, #2 // h += 2
movi v31.4s, #9 // n
mov x5, #455
mov x8, #SUM_STRIDE
b sgr_calc_ab_neon
endfunc
function sgr_calc_ab2_neon, export=1
add x3, x3, #3 // h += 3
asr x3, x3, #1 // h /= 2
movi v31.4s, #25 // n
mov x5, #164
mov x8, #(2*SUM_STRIDE)
endfunc
function sgr_calc_ab_neon
movrel x12, X(sgr_x_by_x)
ld1 {v16.16b, v17.16b, v18.16b}, [x12]
movi v19.16b, #5
movi v20.8b, #55 // idx of last 5
movi v21.8b, #72 // idx of last 4
movi v22.8b, #101 // idx of last 3
movi v23.8b, #169 // idx of last 2
movi v24.8b, #254 // idx of last 1
add x2, x2, #2 // w += 2
add x7, x2, #7
bic x7, x7, #7 // aligned w
sub x7, x8, x7 // increment between rows
movi v29.8h, #1, lsl #8
dup v28.4s, w4
dup v30.4s, w5 // one_by_x
sub x0, x0, #(4*(SUM_STRIDE))
sub x1, x1, #(2*(SUM_STRIDE))
mov x6, x2 // backup of w
sub v16.16b, v16.16b, v19.16b
sub v17.16b, v17.16b, v19.16b
sub v18.16b, v18.16b, v19.16b
1:
subs x2, x2, #8
ld1 {v0.4s, v1.4s}, [x0] // a
ld1 {v2.8h}, [x1] // b
mul v0.4s, v0.4s, v31.4s // a * n
mul v1.4s, v1.4s, v31.4s // a * n
umull v3.4s, v2.4h, v2.4h // b * b
umull2 v4.4s, v2.8h, v2.8h // b * b
uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0)
uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0)
mul v0.4s, v0.4s, v28.4s // p * s
mul v1.4s, v1.4s, v28.4s // p * s
uqshrn v0.4h, v0.4s, #16
uqshrn2 v0.8h, v1.4s, #16
uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)
cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
cmhi v5.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
add v25.8b, v25.8b, v26.8b
cmhi v6.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
add v27.8b, v27.8b, v5.8b
add v6.8b, v6.8b, v19.8b
add v25.8b, v25.8b, v27.8b
add v1.8b, v1.8b, v6.8b
add v1.8b, v1.8b, v25.8b
uxtl v1.8h, v1.8b // x
umull v3.4s, v1.4h, v2.4h // x * BB[i]
umull2 v4.4s, v1.8h, v2.8h // x * BB[i]
mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
srshr v3.4s, v3.4s, #12 // AA[i]
srshr v4.4s, v4.4s, #12 // AA[i]
sub v2.8h, v29.8h, v1.8h // 256 - x
st1 {v3.4s, v4.4s}, [x0], #32
st1 {v2.8h}, [x1], #16
b.gt 1b
subs x3, x3, #1
b.le 0f
add x0, x0, x7, lsl #2
add x1, x1, x7, lsl #1
mov x2, x6
b 1b
0:
ret
endfunc
#define FILTER_OUT_STRIDE 384
// void dav1d_sgr_finish_filter1_neon(coef *tmp,
// const pixel *src, const ptrdiff_t stride,
// const int32_t *a, const int16_t *b,
// const int w, const int h);
function sgr_finish_filter1_neon, export=1
sub x7, x3, #(4*SUM_STRIDE)
add x8, x3, #(4*SUM_STRIDE)
sub x9, x4, #(2*SUM_STRIDE)
add x10, x4, #(2*SUM_STRIDE)
mov x11, #SUM_STRIDE
mov x12, #FILTER_OUT_STRIDE
add x13, x5, #7
bic x13, x13, #7 // Aligned width
sub x2, x2, x13
sub x12, x12, x13
sub x11, x11, x13
sub x11, x11, #4 // We read 4 extra elements from a
sub x14, x11, #4 // We read 8 extra elements from b
mov x13, x5
movi v6.8h, #3
movi v7.4s, #3
1:
ld1 {v0.8h, v1.8h}, [x9], #32
ld1 {v2.8h, v3.8h}, [x4], #32
ld1 {v4.8h, v5.8h}, [x10], #32
ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48
ld1 {v19.4s, v20.4s, v21.4s}, [x3], #48
ld1 {v22.4s, v23.4s, v24.4s}, [x8], #48
2:
subs x5, x5, #8
ext v25.16b, v0.16b, v1.16b, #2 // -stride
ext v26.16b, v2.16b, v3.16b, #2 // 0
ext v27.16b, v4.16b, v5.16b, #2 // +stride
ext v28.16b, v0.16b, v1.16b, #4 // +1-stride
ext v29.16b, v2.16b, v3.16b, #4 // +1
ext v30.16b, v4.16b, v5.16b, #4 // +1+stride
add v2.8h, v2.8h, v25.8h // -1, -stride
add v26.8h, v26.8h, v27.8h // 0, +stride
add v0.8h, v0.8h, v28.8h // -1-stride, +1-stride
add v2.8h, v2.8h, v26.8h
add v4.8h, v4.8h, v30.8h // -1+stride, +1+stride
add v2.8h, v2.8h, v29.8h // +1
add v0.8h, v0.8h, v4.8h
ext v25.16b, v16.16b, v17.16b, #4 // -stride
ext v26.16b, v17.16b, v18.16b, #4
shl v2.8h, v2.8h, #2
ext v27.16b, v16.16b, v17.16b, #8 // +1-stride
ext v28.16b, v17.16b, v18.16b, #8
ext v29.16b, v19.16b, v20.16b, #4 // 0
ext v30.16b, v20.16b, v21.16b, #4
mla v2.8h, v0.8h, v6.8h // * 3 -> a
add v25.4s, v25.4s, v19.4s // -stride, -1
add v26.4s, v26.4s, v20.4s
add v16.4s, v16.4s, v27.4s // -1-stride, +1-stride
add v17.4s, v17.4s, v28.4s
ext v27.16b, v19.16b, v20.16b, #8 // +1
ext v28.16b, v20.16b, v21.16b, #8
add v16.4s, v16.4s, v22.4s // -1+stride
add v17.4s, v17.4s, v23.4s
add v29.4s, v29.4s, v27.4s // 0, +1
add v30.4s, v30.4s, v28.4s
add v25.4s, v25.4s, v29.4s
add v26.4s, v26.4s, v30.4s
ext v27.16b, v22.16b, v23.16b, #4 // +stride
ext v28.16b, v23.16b, v24.16b, #4
ext v29.16b, v22.16b, v23.16b, #8 // +1+stride
ext v30.16b, v23.16b, v24.16b, #8
ld1 {v19.8b}, [x1], #8 // src
add v25.4s, v25.4s, v27.4s // +stride
add v26.4s, v26.4s, v28.4s
add v16.4s, v16.4s, v29.4s // +1+stride
add v17.4s, v17.4s, v30.4s
shl v25.4s, v25.4s, #2
shl v26.4s, v26.4s, #2
mla v25.4s, v16.4s, v7.4s // * 3 -> b
mla v26.4s, v17.4s, v7.4s
uxtl v19.8h, v19.8b // src
mov v0.16b, v1.16b
umlal v25.4s, v2.4h, v19.4h // b + a * src
umlal2 v26.4s, v2.8h, v19.8h
mov v2.16b, v3.16b
rshrn v25.4h, v25.4s, #9
rshrn2 v25.8h, v26.4s, #9
mov v4.16b, v5.16b
st1 {v25.8h}, [x0], #16
b.le 3f
mov v16.16b, v18.16b
mov v19.16b, v21.16b
mov v22.16b, v24.16b
ld1 {v1.8h}, [x9], #16
ld1 {v3.8h}, [x4], #16
ld1 {v5.8h}, [x10], #16
ld1 {v17.4s, v18.4s}, [x7], #32
ld1 {v20.4s, v21.4s}, [x3], #32
ld1 {v23.4s, v24.4s}, [x8], #32
b 2b
3:
subs x6, x6, #1
b.le 0f
mov x5, x13
add x0, x0, x12, lsl #1
add x1, x1, x2
add x3, x3, x11, lsl #2
add x7, x7, x11, lsl #2
add x8, x8, x11, lsl #2
add x4, x4, x14, lsl #1
add x9, x9, x14, lsl #1
add x10, x10, x14, lsl #1
b 1b
0:
ret
endfunc
// void dav1d_sgr_finish_filter2_neon(coef *tmp,
// const pixel *src, const ptrdiff_t stride,
// const int32_t *a, const int16_t *b,
// const int w, const int h);
function sgr_finish_filter2_neon, export=1
add x7, x3, #(4*(SUM_STRIDE))
sub x3, x3, #(4*(SUM_STRIDE))
add x8, x4, #(2*(SUM_STRIDE))
sub x4, x4, #(2*(SUM_STRIDE))
mov x9, #(2*SUM_STRIDE)
mov x10, #FILTER_OUT_STRIDE
add x11, x5, #7
bic x11, x11, #7 // Aligned width
sub x2, x2, x11
sub x10, x10, x11
sub x9, x9, x11
sub x9, x9, #4 // We read 4 extra elements from a
sub x12, x9, #4 // We read 8 extra elements from b
mov x11, x5
movi v4.8h, #5
movi v5.4s, #5
movi v6.8h, #6
movi v7.4s, #6
1:
ld1 {v0.8h, v1.8h}, [x4], #32
ld1 {v2.8h, v3.8h}, [x8], #32
ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48
ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48
2:
subs x5, x5, #8
ext v24.16b, v0.16b, v1.16b, #4 // +1-stride
ext v25.16b, v2.16b, v3.16b, #4 // +1+stride
ext v22.16b, v0.16b, v1.16b, #2 // -stride
ext v23.16b, v2.16b, v3.16b, #2 // +stride
add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride
add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride
add v2.8h, v22.8h, v23.8h // -stride, +stride
add v0.8h, v0.8h, v25.8h
ext v22.16b, v16.16b, v17.16b, #4 // -stride
ext v23.16b, v17.16b, v18.16b, #4
ext v24.16b, v19.16b, v20.16b, #4 // +stride
ext v25.16b, v20.16b, v21.16b, #4
ext v26.16b, v16.16b, v17.16b, #8 // +1-stride
ext v27.16b, v17.16b, v18.16b, #8
ext v28.16b, v19.16b, v20.16b, #8 // +1+stride
ext v29.16b, v20.16b, v21.16b, #8
mul v0.8h, v0.8h, v4.8h // * 5
mla v0.8h, v2.8h, v6.8h // * 6
ld1 {v31.8b}, [x1], #8
add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride
add v17.4s, v17.4s, v27.4s
add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride
add v20.4s, v20.4s, v29.4s
add v16.4s, v16.4s, v19.4s
add v17.4s, v17.4s, v20.4s
add v22.4s, v22.4s, v24.4s // -stride, +stride
add v23.4s, v23.4s, v25.4s
// This is, surprisingly, faster than other variants where the
// mul+mla pairs are further apart, on Cortex A53.
mul v16.4s, v16.4s, v5.4s // * 5
mla v16.4s, v22.4s, v7.4s // * 6
mul v17.4s, v17.4s, v5.4s // * 5
mla v17.4s, v23.4s, v7.4s // * 6
uxtl v31.8h, v31.8b
umlal v16.4s, v0.4h, v31.4h // b + a * src
umlal2 v17.4s, v0.8h, v31.8h
mov v0.16b, v1.16b
rshrn v16.4h, v16.4s, #9
rshrn2 v16.8h, v17.4s, #9
mov v2.16b, v3.16b
st1 {v16.8h}, [x0], #16
b.le 3f
mov v16.16b, v18.16b
mov v19.16b, v21.16b
ld1 {v1.8h}, [x4], #16
ld1 {v3.8h}, [x8], #16
ld1 {v17.4s, v18.4s}, [x3], #32
ld1 {v20.4s, v21.4s}, [x7], #32
b 2b
3:
subs x6, x6, #1
b.le 0f
mov x5, x11
add x0, x0, x10, lsl #1
add x1, x1, x2
add x3, x3, x9, lsl #2
add x7, x7, x9, lsl #2
add x4, x4, x12, lsl #1
add x8, x8, x12, lsl #1
mov x13, x3
mov x14, x4
ld1 {v0.8h, v1.8h}, [x4], #32
ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48
4:
subs x5, x5, #8
ext v23.16b, v0.16b, v1.16b, #4 // +1
ext v22.16b, v0.16b, v1.16b, #2 // 0
add v0.8h, v0.8h, v23.8h // -1, +1
ext v24.16b, v16.16b, v17.16b, #4 // 0
ext v25.16b, v17.16b, v18.16b, #4
ext v26.16b, v16.16b, v17.16b, #8 // +1
ext v27.16b, v17.16b, v18.16b, #8
mul v2.8h, v22.8h, v6.8h // * 6
mla v2.8h, v0.8h, v4.8h // * 5 -> a
ld1 {v31.8b}, [x1], #8
add v16.4s, v16.4s, v26.4s // -1, +1
add v17.4s, v17.4s, v27.4s
uxtl v31.8h, v31.8b
// This is, surprisingly, faster than other variants where the
// mul+mla pairs are further apart, on Cortex A53.
mul v24.4s, v24.4s, v7.4s // * 6
mla v24.4s, v16.4s, v5.4s // * 5 -> b
mul v25.4s, v25.4s, v7.4s // * 6
mla v25.4s, v17.4s, v5.4s // * 5 -> b
umlal v24.4s, v2.4h, v31.4h // b + a * src
umlal2 v25.4s, v2.8h, v31.8h
mov v0.16b, v1.16b
rshrn v24.4h, v24.4s, #8
rshrn2 v24.8h, v25.4s, #8
mov v16.16b, v18.16b
st1 {v24.8h}, [x0], #16
b.le 5f
ld1 {v1.8h}, [x4], #16
ld1 {v17.4s, v18.4s}, [x3], #32
b 4b
5:
subs x6, x6, #1
b.le 0f
mov x5, x11
add x0, x0, x10, lsl #1
add x1, x1, x2
mov x3, x13 // Rewind x3/x4 to where they started
mov x4, x14
b 1b
0:
ret
endfunc
// void dav1d_sgr_weighted1_neon(pixel *dst, const ptrdiff_t dst_stride,
// const pixel *src, const ptrdiff_t src_stride,
// const coef *t1, const int w, const int h,
// const int wt);
function sgr_weighted1_neon, export=1
dup v31.8h, w7
cmp x6, #2
add x9, x0, x1
add x10, x2, x3
add x11, x4, #2*FILTER_OUT_STRIDE
mov x7, #(4*FILTER_OUT_STRIDE)
lsl x1, x1, #1
lsl x3, x3, #1
add x8, x5, #7
bic x8, x8, #7 // Aligned width
sub x1, x1, x8
sub x3, x3, x8
sub x7, x7, x8, lsl #1
mov x8, x5
b.lt 2f
1:
ld1 {v0.8b}, [x2], #8
ld1 {v4.8b}, [x10], #8
ld1 {v1.8h}, [x4], #16
ld1 {v5.8h}, [x11], #16
subs x5, x5, #8
ushll v0.8h, v0.8b, #4 // u
ushll v4.8h, v4.8b, #4 // u
sub v1.8h, v1.8h, v0.8h // t1 - u
sub v5.8h, v5.8h, v4.8h // t1 - u
ushll v2.4s, v0.4h, #7 // u << 7
ushll2 v3.4s, v0.8h, #7 // u << 7
ushll v6.4s, v4.4h, #7 // u << 7
ushll2 v7.4s, v4.8h, #7 // u << 7
smlal v2.4s, v1.4h, v31.4h // v
smlal2 v3.4s, v1.8h, v31.8h // v
smlal v6.4s, v5.4h, v31.4h // v
smlal2 v7.4s, v5.8h, v31.8h // v
rshrn v2.4h, v2.4s, #11
rshrn2 v2.8h, v3.4s, #11
rshrn v6.4h, v6.4s, #11
rshrn2 v6.8h, v7.4s, #11
sqxtun v2.8b, v2.8h
sqxtun v6.8b, v6.8h
st1 {v2.8b}, [x0], #8
st1 {v6.8b}, [x9], #8
b.gt 1b
sub x6, x6, #2
cmp x6, #1
b.lt 0f
mov x5, x8
add x0, x0, x1
add x9, x9, x1
add x2, x2, x3
add x10, x10, x3
add x4, x4, x7
add x11, x11, x7
b.eq 2f
b 1b
2:
ld1 {v0.8b}, [x2], #8
ld1 {v1.8h}, [x4], #16
subs x5, x5, #8
ushll v0.8h, v0.8b, #4 // u
sub v1.8h, v1.8h, v0.8h // t1 - u
ushll v2.4s, v0.4h, #7 // u << 7
ushll2 v3.4s, v0.8h, #7 // u << 7
smlal v2.4s, v1.4h, v31.4h // v
smlal2 v3.4s, v1.8h, v31.8h // v
rshrn v2.4h, v2.4s, #11
rshrn2 v2.8h, v3.4s, #11
sqxtun v2.8b, v2.8h
st1 {v2.8b}, [x0], #8
b.gt 2b
0:
ret
endfunc
// void dav1d_sgr_weighted2_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *src, const ptrdiff_t src_stride,
// const coef *t1, const coef *t2,
// const int w, const int h,
// const int16_t wt[2]);
function sgr_weighted2_neon, export=1
ldr x8, [sp]
cmp x7, #2
add x10, x0, x1
add x11, x2, x3
add x12, x4, #2*FILTER_OUT_STRIDE
add x13, x5, #2*FILTER_OUT_STRIDE
ld2r {v30.8h, v31.8h}, [x8] // wt[0], wt[1]
mov x8, #4*FILTER_OUT_STRIDE
lsl x1, x1, #1
lsl x3, x3, #1
add x9, x6, #7
bic x9, x9, #7 // Aligned width
sub x1, x1, x9
sub x3, x3, x9
sub x8, x8, x9, lsl #1
mov x9, x6
b.lt 2f
1:
ld1 {v0.8b}, [x2], #8
ld1 {v16.8b}, [x11], #8
ld1 {v1.8h}, [x4], #16
ld1 {v17.8h}, [x12], #16
ld1 {v2.8h}, [x5], #16
ld1 {v18.8h}, [x13], #16
subs x6, x6, #8
ushll v0.8h, v0.8b, #4 // u
ushll v16.8h, v16.8b, #4 // u
sub v1.8h, v1.8h, v0.8h // t1 - u
sub v2.8h, v2.8h, v0.8h // t2 - u
sub v17.8h, v17.8h, v16.8h // t1 - u
sub v18.8h, v18.8h, v16.8h // t2 - u
ushll v3.4s, v0.4h, #7 // u << 7
ushll2 v4.4s, v0.8h, #7 // u << 7
ushll v19.4s, v16.4h, #7 // u << 7
ushll2 v20.4s, v16.8h, #7 // u << 7
smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
smlal v19.4s, v17.4h, v30.4h // wt[0] * (t1 - u)
smlal v19.4s, v18.4h, v31.4h // wt[1] * (t2 - u)
smlal2 v20.4s, v17.8h, v30.8h // wt[0] * (t1 - u)
smlal2 v20.4s, v18.8h, v31.8h // wt[1] * (t2 - u)
rshrn v3.4h, v3.4s, #11
rshrn2 v3.8h, v4.4s, #11
rshrn v19.4h, v19.4s, #11
rshrn2 v19.8h, v20.4s, #11
sqxtun v3.8b, v3.8h
sqxtun v19.8b, v19.8h
st1 {v3.8b}, [x0], #8
st1 {v19.8b}, [x10], #8
b.gt 1b
subs x7, x7, #2
cmp x7, #1
b.lt 0f
mov x6, x9
add x0, x0, x1
add x10, x10, x1
add x2, x2, x3
add x11, x11, x3
add x4, x4, x8
add x12, x12, x8
add x5, x5, x8
add x13, x13, x8
b.eq 2f
b 1b
2:
ld1 {v0.8b}, [x2], #8
ld1 {v1.8h}, [x4], #16
ld1 {v2.8h}, [x5], #16
subs x6, x6, #8
ushll v0.8h, v0.8b, #4 // u
sub v1.8h, v1.8h, v0.8h // t1 - u
sub v2.8h, v2.8h, v0.8h // t2 - u
ushll v3.4s, v0.4h, #7 // u << 7
ushll2 v4.4s, v0.8h, #7 // u << 7
smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
rshrn v3.4h, v3.4s, #11
rshrn2 v3.8h, v4.4s, #11
sqxtun v3.8b, v3.8h
st1 {v3.8b}, [x0], #8
b.gt 1b
0:
ret
endfunc
sgr_funcs 8

third_party/dav1d/src/arm/64/looprestoration16.S (vendored, new file, 1239 lines; diff not shown due to its size)

third_party/dav1d/src/arm/64/looprestoration_common.S (vendored, new file, 432 lines)

@ -0,0 +1,432 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
#define SUM_STRIDE (384+16)
// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box3_v_neon, export=1
add w10, w3, #2 // Number of output rows to move back
mov w11, w3 // Number of input rows to move back
add w2, w2, #2 // Actual summed width
mov x7, #(4*SUM_STRIDE) // sumsq stride
mov x8, #(2*SUM_STRIDE) // sum stride
sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride
sub x1, x1, #(2*SUM_STRIDE) // sum -= stride
tst w4, #4 // LR_HAVE_TOP
b.eq 0f
// If have top, read from row -2.
sub x5, x0, #(4*SUM_STRIDE)
sub x6, x1, #(2*SUM_STRIDE)
add w11, w11, #2
b 1f
0:
// !LR_HAVE_TOP
// If we don't have top, read from row 0 even if
// we start writing to row -1.
add x5, x0, #(4*SUM_STRIDE)
add x6, x1, #(2*SUM_STRIDE)
1:
tst w4, #8 // LR_HAVE_BOTTOM
b.eq 1f
// LR_HAVE_BOTTOM
add w3, w3, #2 // Sum all h+2 lines with the main loop
add w11, w11, #2
1:
mov w9, w3 // Backup of h for next loops
1:
// Start of horizontal loop; start one vertical filter slice.
// Start loading rows into v16-v21 and v24-v26 taking top
// padding into consideration.
tst w4, #4 // LR_HAVE_TOP
ld1 {v16.4s, v17.4s}, [x5], x7
ld1 {v24.8h}, [x6], x8
b.eq 2f
// LR_HAVE_TOP
ld1 {v18.4s, v19.4s}, [x5], x7
ld1 {v25.8h}, [x6], x8
ld1 {v20.4s, v21.4s}, [x5], x7
ld1 {v26.8h}, [x6], x8
b 3f
2: // !LR_HAVE_TOP
mov v18.16b, v16.16b
mov v19.16b, v17.16b
mov v25.16b, v24.16b
mov v20.16b, v16.16b
mov v21.16b, v17.16b
mov v26.16b, v24.16b
3:
subs w3, w3, #1
.macro add3
add v16.4s, v16.4s, v18.4s
add v17.4s, v17.4s, v19.4s
add v24.8h, v24.8h, v25.8h
add v16.4s, v16.4s, v20.4s
add v17.4s, v17.4s, v21.4s
add v24.8h, v24.8h, v26.8h
st1 {v16.4s, v17.4s}, [x0], x7
st1 {v24.8h}, [x1], x8
.endm
add3
mov v16.16b, v18.16b
mov v17.16b, v19.16b
mov v24.16b, v25.16b
mov v18.16b, v20.16b
mov v19.16b, v21.16b
mov v25.16b, v26.16b
b.le 4f
ld1 {v20.4s, v21.4s}, [x5], x7
ld1 {v26.8h}, [x6], x8
b 3b
4:
tst w4, #8 // LR_HAVE_BOTTOM
b.ne 5f
// !LR_HAVE_BOTTOM
// Produce two more rows, extending the already loaded rows.
add3
mov v16.16b, v18.16b
mov v17.16b, v19.16b
mov v24.16b, v25.16b
add3
5: // End of one vertical slice.
subs w2, w2, #8
b.le 0f
// Move pointers back up to the top and loop horizontally.
// Input pointers
msub x5, x7, x11, x5
msub x6, x8, x11, x6
// Output pointers
msub x0, x7, x10, x0
msub x1, x8, x10, x1
add x0, x0, #32
add x1, x1, #16
add x5, x5, #32
add x6, x6, #16
mov w3, w9
b 1b
0:
ret
.purgem add3
endfunc
// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box5_v_neon, export=1
add w10, w3, #2 // Number of output rows to move back
mov w11, w3 // Number of input rows to move back
add w2, w2, #8 // Actual summed width
mov x7, #(4*SUM_STRIDE) // sumsq stride
mov x8, #(2*SUM_STRIDE) // sum stride
sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride
sub x1, x1, #(2*SUM_STRIDE) // sum -= stride
tst w4, #4 // LR_HAVE_TOP
b.eq 0f
// If have top, read from row -2.
sub x5, x0, #(4*SUM_STRIDE)
sub x6, x1, #(2*SUM_STRIDE)
add w11, w11, #2
b 1f
0:
// !LR_HAVE_TOP
// If we don't have top, read from row 0 even if
// we start writing to row -1.
add x5, x0, #(4*SUM_STRIDE)
add x6, x1, #(2*SUM_STRIDE)
1:
tst w4, #8 // LR_HAVE_BOTTOM
b.eq 0f
// LR_HAVE_BOTTOM
add w3, w3, #2 // Handle h+2 lines with the main loop
add w11, w11, #2
b 1f
0:
// !LR_HAVE_BOTTOM
sub w3, w3, #1 // Handle h-1 lines with the main loop
1:
mov w9, w3 // Backup of h for next loops
1:
// Start of horizontal loop; start one vertical filter slice.
// Start loading rows into v16-v25 and v26-v30 taking top
// padding into consideration.
tst w4, #4 // LR_HAVE_TOP
ld1 {v16.4s, v17.4s}, [x5], x7
ld1 {v26.8h}, [x6], x8
b.eq 2f
// LR_HAVE_TOP
ld1 {v20.4s, v21.4s}, [x5], x7
ld1 {v28.8h}, [x6], x8
mov v18.16b, v16.16b
mov v19.16b, v17.16b
mov v27.16b, v26.16b
ld1 {v22.4s, v23.4s}, [x5], x7
ld1 {v29.8h}, [x6], x8
b 3f
2: // !LR_HAVE_TOP
mov v18.16b, v16.16b
mov v19.16b, v17.16b
mov v27.16b, v26.16b
mov v20.16b, v16.16b
mov v21.16b, v17.16b
mov v28.16b, v26.16b
mov v22.16b, v16.16b
mov v23.16b, v17.16b
mov v29.16b, v26.16b
3:
cbz w3, 4f
ld1 {v24.4s, v25.4s}, [x5], x7
ld1 {v30.8h}, [x6], x8
3:
// Start of vertical loop
subs w3, w3, #2
.macro add5
add v16.4s, v16.4s, v18.4s
add v17.4s, v17.4s, v19.4s
add v26.8h, v26.8h, v27.8h
add v0.4s, v20.4s, v22.4s
add v1.4s, v21.4s, v23.4s
add v2.8h, v28.8h, v29.8h
add v16.4s, v16.4s, v24.4s
add v17.4s, v17.4s, v25.4s
add v26.8h, v26.8h, v30.8h
add v16.4s, v16.4s, v0.4s
add v17.4s, v17.4s, v1.4s
add v26.8h, v26.8h, v2.8h
st1 {v16.4s, v17.4s}, [x0], x7
st1 {v26.8h}, [x1], x8
.endm
add5
.macro shift2
mov v16.16b, v20.16b
mov v17.16b, v21.16b
mov v26.16b, v28.16b
mov v18.16b, v22.16b
mov v19.16b, v23.16b
mov v27.16b, v29.16b
mov v20.16b, v24.16b
mov v21.16b, v25.16b
mov v28.16b, v30.16b
.endm
shift2
add x0, x0, x7
add x1, x1, x8
b.le 5f
ld1 {v22.4s, v23.4s}, [x5], x7
ld1 {v29.8h}, [x6], x8
ld1 {v24.4s, v25.4s}, [x5], x7
ld1 {v30.8h}, [x6], x8
b 3b
4:
// h == 1, !LR_HAVE_BOTTOM.
// Pad the last row with the only content row, and add.
mov v24.16b, v22.16b
mov v25.16b, v23.16b
mov v30.16b, v29.16b
add5
shift2
add x0, x0, x7
add x1, x1, x8
add5
b 6f
5:
tst w4, #8 // LR_HAVE_BOTTOM
b.ne 6f
// !LR_HAVE_BOTTOM
cbnz w3, 5f
// The intended three edge rows left; output the one at h-2 and
// the past edge one at h.
ld1 {v22.4s, v23.4s}, [x5], x7
ld1 {v29.8h}, [x6], x8
// Pad the past-edge row from the last content row.
mov v24.16b, v22.16b
mov v25.16b, v23.16b
mov v30.16b, v29.16b
add5
shift2
add x0, x0, x7
add x1, x1, x8
// The last two rows are already padded properly here.
add5
b 6f
5:
// w3 == -1, two rows left, output one.
// Pad the last two rows from the mid one.
mov v22.16b, v20.16b
mov v23.16b, v21.16b
mov v29.16b, v28.16b
mov v24.16b, v20.16b
mov v25.16b, v21.16b
mov v30.16b, v28.16b
add5
add x0, x0, x7
add x1, x1, x8
b 6f
6: // End of one vertical slice.
subs w2, w2, #8
b.le 0f
// Move pointers back up to the top and loop horizontally.
// Input pointers
msub x5, x7, x11, x5
msub x6, x8, x11, x6
// Output pointers
msub x0, x7, x10, x0
msub x1, x8, x10, x1
add x0, x0, #32
add x1, x1, #16
add x5, x5, #32
add x6, x6, #16
mov w3, w9
b 1b
0:
ret
.purgem add5
endfunc
// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
// const int w, const int h, const int strength,
// const int bitdepth_max);
// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
// const int w, const int h, const int strength,
// const int bitdepth_max);
function sgr_calc_ab1_neon, export=1
clz w9, w5
add x3, x3, #2 // h += 2
movi v31.4s, #9 // n
mov x5, #455
mov x8, #SUM_STRIDE
b sgr_calc_ab_neon
endfunc
function sgr_calc_ab2_neon, export=1
clz w9, w5
add x3, x3, #3 // h += 3
asr x3, x3, #1 // h /= 2
movi v31.4s, #25 // n
mov x5, #164
mov x8, #(2*SUM_STRIDE)
endfunc
function sgr_calc_ab_neon
sub w9, w9, #24 // -bitdepth_min_8
movrel x12, X(sgr_x_by_x)
ld1 {v16.16b, v17.16b, v18.16b}, [x12]
dup v6.8h, w9 // -bitdepth_min_8
movi v19.16b, #5
movi v20.8b, #55 // idx of last 5
movi v21.8b, #72 // idx of last 4
movi v22.8b, #101 // idx of last 3
movi v23.8b, #169 // idx of last 2
movi v24.8b, #254 // idx of last 1
saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8
add x2, x2, #2 // w += 2
add x7, x2, #7
bic x7, x7, #7 // aligned w
sub x7, x8, x7 // increment between rows
movi v29.8h, #1, lsl #8
dup v28.4s, w4
dup v30.4s, w5 // one_by_x
sub x0, x0, #(4*(SUM_STRIDE))
sub x1, x1, #(2*(SUM_STRIDE))
mov x6, x2 // backup of w
sub v16.16b, v16.16b, v19.16b
sub v17.16b, v17.16b, v19.16b
sub v18.16b, v18.16b, v19.16b
1:
subs x2, x2, #8
ld1 {v0.4s, v1.4s}, [x0] // a
ld1 {v2.8h}, [x1] // b
srshl v0.4s, v0.4s, v7.4s
srshl v1.4s, v1.4s, v7.4s
srshl v4.8h, v2.8h, v6.8h
mul v0.4s, v0.4s, v31.4s // a * n
mul v1.4s, v1.4s, v31.4s // a * n
umull v3.4s, v4.4h, v4.4h // b * b
umull2 v4.4s, v4.8h, v4.8h // b * b
uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0)
uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0)
mul v0.4s, v0.4s, v28.4s // p * s
mul v1.4s, v1.4s, v28.4s // p * s
uqshrn v0.4h, v0.4s, #16
uqshrn2 v0.8h, v1.4s, #16
uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)
cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
add v25.8b, v25.8b, v26.8b
cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
add v27.8b, v27.8b, v4.8b
add v5.8b, v5.8b, v19.8b
add v25.8b, v25.8b, v27.8b
add v1.8b, v1.8b, v5.8b
add v1.8b, v1.8b, v25.8b
uxtl v1.8h, v1.8b // x
umull v3.4s, v1.4h, v2.4h // x * BB[i]
umull2 v4.4s, v1.8h, v2.8h // x * BB[i]
mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
srshr v3.4s, v3.4s, #12 // AA[i]
srshr v4.4s, v4.4s, #12 // AA[i]
sub v2.8h, v29.8h, v1.8h // 256 - x
st1 {v3.4s, v4.4s}, [x0], #32
st1 {v2.8h}, [x1], #16
b.gt 1b
subs x3, x3, #1
b.le 0f
add x0, x0, x7, lsl #2
add x1, x1, x7, lsl #1
mov x2, x6
b 1b
0:
ret
endfunc

third_party/dav1d/src/arm/64/looprestoration_tmpl.S (vendored, new file, 597 lines)

@ -0,0 +1,597 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#define FILTER_OUT_STRIDE 384
.macro sgr_funcs bpc
// void dav1d_sgr_finish_filter1_Xbpc_neon(int16_t *tmp,
// const pixel *src, const ptrdiff_t stride,
// const int32_t *a, const int16_t *b,
// const int w, const int h);
function sgr_finish_filter1_\bpc\()bpc_neon, export=1
sub x7, x3, #(4*SUM_STRIDE)
add x8, x3, #(4*SUM_STRIDE)
sub x9, x4, #(2*SUM_STRIDE)
add x10, x4, #(2*SUM_STRIDE)
mov x11, #SUM_STRIDE
mov x12, #FILTER_OUT_STRIDE
add x13, x5, #7
bic x13, x13, #7 // Aligned width
.if \bpc == 8
sub x2, x2, x13
.else
sub x2, x2, x13, lsl #1
.endif
sub x12, x12, x13
sub x11, x11, x13
sub x11, x11, #4 // We read 4 extra elements from a
sub x14, x11, #4 // We read 8 extra elements from b
mov x13, x5
movi v6.8h, #3
movi v7.4s, #3
1:
ld1 {v0.8h, v1.8h}, [x9], #32
ld1 {v2.8h, v3.8h}, [x4], #32
ld1 {v4.8h, v5.8h}, [x10], #32
ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48
ld1 {v19.4s, v20.4s, v21.4s}, [x3], #48
ld1 {v22.4s, v23.4s, v24.4s}, [x8], #48
2:
subs x5, x5, #8
ext v25.16b, v0.16b, v1.16b, #2 // -stride
ext v26.16b, v2.16b, v3.16b, #2 // 0
ext v27.16b, v4.16b, v5.16b, #2 // +stride
ext v28.16b, v0.16b, v1.16b, #4 // +1-stride
ext v29.16b, v2.16b, v3.16b, #4 // +1
ext v30.16b, v4.16b, v5.16b, #4 // +1+stride
add v2.8h, v2.8h, v25.8h // -1, -stride
add v26.8h, v26.8h, v27.8h // 0, +stride
add v0.8h, v0.8h, v28.8h // -1-stride, +1-stride
add v2.8h, v2.8h, v26.8h
add v4.8h, v4.8h, v30.8h // -1+stride, +1+stride
add v2.8h, v2.8h, v29.8h // +1
add v0.8h, v0.8h, v4.8h
ext v25.16b, v16.16b, v17.16b, #4 // -stride
ext v26.16b, v17.16b, v18.16b, #4
shl v2.8h, v2.8h, #2
ext v27.16b, v16.16b, v17.16b, #8 // +1-stride
ext v28.16b, v17.16b, v18.16b, #8
ext v29.16b, v19.16b, v20.16b, #4 // 0
ext v30.16b, v20.16b, v21.16b, #4
mla v2.8h, v0.8h, v6.8h // * 3 -> a
add v25.4s, v25.4s, v19.4s // -stride, -1
add v26.4s, v26.4s, v20.4s
add v16.4s, v16.4s, v27.4s // -1-stride, +1-stride
add v17.4s, v17.4s, v28.4s
ext v27.16b, v19.16b, v20.16b, #8 // +1
ext v28.16b, v20.16b, v21.16b, #8
add v16.4s, v16.4s, v22.4s // -1+stride
add v17.4s, v17.4s, v23.4s
add v29.4s, v29.4s, v27.4s // 0, +1
add v30.4s, v30.4s, v28.4s
add v25.4s, v25.4s, v29.4s
add v26.4s, v26.4s, v30.4s
ext v27.16b, v22.16b, v23.16b, #4 // +stride
ext v28.16b, v23.16b, v24.16b, #4
ext v29.16b, v22.16b, v23.16b, #8 // +1+stride
ext v30.16b, v23.16b, v24.16b, #8
.if \bpc == 8
ld1 {v19.8b}, [x1], #8 // src
.else
ld1 {v19.8h}, [x1], #16 // src
.endif
add v25.4s, v25.4s, v27.4s // +stride
add v26.4s, v26.4s, v28.4s
add v16.4s, v16.4s, v29.4s // +1+stride
add v17.4s, v17.4s, v30.4s
shl v25.4s, v25.4s, #2
shl v26.4s, v26.4s, #2
mla v25.4s, v16.4s, v7.4s // * 3 -> b
mla v26.4s, v17.4s, v7.4s
.if \bpc == 8
uxtl v19.8h, v19.8b // src
.endif
mov v0.16b, v1.16b
umlal v25.4s, v2.4h, v19.4h // b + a * src
umlal2 v26.4s, v2.8h, v19.8h
mov v2.16b, v3.16b
rshrn v25.4h, v25.4s, #9
rshrn2 v25.8h, v26.4s, #9
mov v4.16b, v5.16b
st1 {v25.8h}, [x0], #16
b.le 3f
mov v16.16b, v18.16b
mov v19.16b, v21.16b
mov v22.16b, v24.16b
ld1 {v1.8h}, [x9], #16
ld1 {v3.8h}, [x4], #16
ld1 {v5.8h}, [x10], #16
ld1 {v17.4s, v18.4s}, [x7], #32
ld1 {v20.4s, v21.4s}, [x3], #32
ld1 {v23.4s, v24.4s}, [x8], #32
b 2b
3:
subs x6, x6, #1
b.le 0f
mov x5, x13
add x0, x0, x12, lsl #1
add x1, x1, x2
add x3, x3, x11, lsl #2
add x7, x7, x11, lsl #2
add x8, x8, x11, lsl #2
add x4, x4, x14, lsl #1
add x9, x9, x14, lsl #1
add x10, x10, x14, lsl #1
b 1b
0:
ret
endfunc
// void dav1d_sgr_finish_filter2_Xbpc_neon(int16_t *tmp,
// const pixel *src, const ptrdiff_t stride,
// const int32_t *a, const int16_t *b,
// const int w, const int h);
function sgr_finish_filter2_\bpc\()bpc_neon, export=1
add x7, x3, #(4*(SUM_STRIDE))
sub x3, x3, #(4*(SUM_STRIDE))
add x8, x4, #(2*(SUM_STRIDE))
sub x4, x4, #(2*(SUM_STRIDE))
mov x9, #(2*SUM_STRIDE)
mov x10, #FILTER_OUT_STRIDE
add x11, x5, #7
bic x11, x11, #7 // Aligned width
.if \bpc == 8
sub x2, x2, x11
.else
sub x2, x2, x11, lsl #1
.endif
sub x10, x10, x11
sub x9, x9, x11
sub x9, x9, #4 // We read 4 extra elements from a
sub x12, x9, #4 // We read 8 extra elements from b
mov x11, x5
movi v4.8h, #5
movi v5.4s, #5
movi v6.8h, #6
movi v7.4s, #6
1:
ld1 {v0.8h, v1.8h}, [x4], #32
ld1 {v2.8h, v3.8h}, [x8], #32
ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48
ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48
2:
subs x5, x5, #8
ext v24.16b, v0.16b, v1.16b, #4 // +1-stride
ext v25.16b, v2.16b, v3.16b, #4 // +1+stride
ext v22.16b, v0.16b, v1.16b, #2 // -stride
ext v23.16b, v2.16b, v3.16b, #2 // +stride
add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride
add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride
add v2.8h, v22.8h, v23.8h // -stride, +stride
add v0.8h, v0.8h, v25.8h
ext v22.16b, v16.16b, v17.16b, #4 // -stride
ext v23.16b, v17.16b, v18.16b, #4
ext v24.16b, v19.16b, v20.16b, #4 // +stride
ext v25.16b, v20.16b, v21.16b, #4
ext v26.16b, v16.16b, v17.16b, #8 // +1-stride
ext v27.16b, v17.16b, v18.16b, #8
ext v28.16b, v19.16b, v20.16b, #8 // +1+stride
ext v29.16b, v20.16b, v21.16b, #8
mul v0.8h, v0.8h, v4.8h // * 5
mla v0.8h, v2.8h, v6.8h // * 6
.if \bpc == 8
ld1 {v31.8b}, [x1], #8
.else
ld1 {v31.8h}, [x1], #16
.endif
add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride
add v17.4s, v17.4s, v27.4s
add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride
add v20.4s, v20.4s, v29.4s
add v16.4s, v16.4s, v19.4s
add v17.4s, v17.4s, v20.4s
add v22.4s, v22.4s, v24.4s // -stride, +stride
add v23.4s, v23.4s, v25.4s
// This is, surprisingly, faster than other variants where the
// mul+mla pairs are further apart, on Cortex A53.
mul v16.4s, v16.4s, v5.4s // * 5
mla v16.4s, v22.4s, v7.4s // * 6
mul v17.4s, v17.4s, v5.4s // * 5
mla v17.4s, v23.4s, v7.4s // * 6
.if \bpc == 8
uxtl v31.8h, v31.8b
.endif
umlal v16.4s, v0.4h, v31.4h // b + a * src
umlal2 v17.4s, v0.8h, v31.8h
mov v0.16b, v1.16b
rshrn v16.4h, v16.4s, #9
rshrn2 v16.8h, v17.4s, #9
mov v2.16b, v3.16b
st1 {v16.8h}, [x0], #16
b.le 3f
mov v16.16b, v18.16b
mov v19.16b, v21.16b
ld1 {v1.8h}, [x4], #16
ld1 {v3.8h}, [x8], #16
ld1 {v17.4s, v18.4s}, [x3], #32
ld1 {v20.4s, v21.4s}, [x7], #32
b 2b
3:
subs x6, x6, #1
b.le 0f
mov x5, x11
add x0, x0, x10, lsl #1
add x1, x1, x2
add x3, x3, x9, lsl #2
add x7, x7, x9, lsl #2
add x4, x4, x12, lsl #1
add x8, x8, x12, lsl #1
mov x13, x3
mov x14, x4
ld1 {v0.8h, v1.8h}, [x4], #32
ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48
4:
subs x5, x5, #8
ext v23.16b, v0.16b, v1.16b, #4 // +1
ext v22.16b, v0.16b, v1.16b, #2 // 0
add v0.8h, v0.8h, v23.8h // -1, +1
ext v24.16b, v16.16b, v17.16b, #4 // 0
ext v25.16b, v17.16b, v18.16b, #4
ext v26.16b, v16.16b, v17.16b, #8 // +1
ext v27.16b, v17.16b, v18.16b, #8
mul v2.8h, v22.8h, v6.8h // * 6
mla v2.8h, v0.8h, v4.8h // * 5 -> a
.if \bpc == 8
ld1 {v31.8b}, [x1], #8
.else
ld1 {v31.8h}, [x1], #16
.endif
add v16.4s, v16.4s, v26.4s // -1, +1
add v17.4s, v17.4s, v27.4s
.if \bpc == 8
uxtl v31.8h, v31.8b
.endif
// This is, surprisingly, faster than other variants where the
// mul+mla pairs are further apart, on Cortex A53.
mul v24.4s, v24.4s, v7.4s // * 6
mla v24.4s, v16.4s, v5.4s // * 5 -> b
mul v25.4s, v25.4s, v7.4s // * 6
mla v25.4s, v17.4s, v5.4s // * 5 -> b
umlal v24.4s, v2.4h, v31.4h // b + a * src
umlal2 v25.4s, v2.8h, v31.8h
mov v0.16b, v1.16b
rshrn v24.4h, v24.4s, #8
rshrn2 v24.8h, v25.4s, #8
mov v16.16b, v18.16b
st1 {v24.8h}, [x0], #16
b.le 5f
ld1 {v1.8h}, [x4], #16
ld1 {v17.4s, v18.4s}, [x3], #32
b 4b
5:
subs x6, x6, #1
b.le 0f
mov x5, x11
add x0, x0, x10, lsl #1
add x1, x1, x2
mov x3, x13 // Rewind x3/x4 to where they started
mov x4, x14
b 1b
0:
ret
endfunc
// void dav1d_sgr_weighted1_Xbpc_neon(pixel *dst, const ptrdiff_t dst_stride,
// const pixel *src, const ptrdiff_t src_stride,
// const int16_t *t1, const int w, const int h,
// const int wt, const int bitdepth_max);
function sgr_weighted1_\bpc\()bpc_neon, export=1
.if \bpc == 16
ldr w8, [sp]
.endif
dup v31.8h, w7
cmp x6, #2
.if \bpc == 16
dup v30.8h, w8
.endif
add x9, x0, x1
add x10, x2, x3
add x11, x4, #2*FILTER_OUT_STRIDE
mov x7, #(4*FILTER_OUT_STRIDE)
lsl x1, x1, #1
lsl x3, x3, #1
add x8, x5, #7
bic x8, x8, #7 // Aligned width
.if \bpc == 8
sub x1, x1, x8
sub x3, x3, x8
.else
sub x1, x1, x8, lsl #1
sub x3, x3, x8, lsl #1
.endif
sub x7, x7, x8, lsl #1
mov x8, x5
b.lt 2f
1:
.if \bpc == 8
ld1 {v0.8b}, [x2], #8
ld1 {v4.8b}, [x10], #8
.else
ld1 {v0.8h}, [x2], #16
ld1 {v4.8h}, [x10], #16
.endif
ld1 {v1.8h}, [x4], #16
ld1 {v5.8h}, [x11], #16
subs x5, x5, #8
.if \bpc == 8
ushll v0.8h, v0.8b, #4 // u
ushll v4.8h, v4.8b, #4 // u
.else
shl v0.8h, v0.8h, #4 // u
shl v4.8h, v4.8h, #4 // u
.endif
sub v1.8h, v1.8h, v0.8h // t1 - u
sub v5.8h, v5.8h, v4.8h // t1 - u
ushll v2.4s, v0.4h, #7 // u << 7
ushll2 v3.4s, v0.8h, #7 // u << 7
ushll v6.4s, v4.4h, #7 // u << 7
ushll2 v7.4s, v4.8h, #7 // u << 7
smlal v2.4s, v1.4h, v31.4h // v
smlal2 v3.4s, v1.8h, v31.8h // v
smlal v6.4s, v5.4h, v31.4h // v
smlal2 v7.4s, v5.8h, v31.8h // v
.if \bpc == 8
rshrn v2.4h, v2.4s, #11
rshrn2 v2.8h, v3.4s, #11
rshrn v6.4h, v6.4s, #11
rshrn2 v6.8h, v7.4s, #11
sqxtun v2.8b, v2.8h
sqxtun v6.8b, v6.8h
st1 {v2.8b}, [x0], #8
st1 {v6.8b}, [x9], #8
.else
sqrshrun v2.4h, v2.4s, #11
sqrshrun2 v2.8h, v3.4s, #11
sqrshrun v6.4h, v6.4s, #11
sqrshrun2 v6.8h, v7.4s, #11
umin v2.8h, v2.8h, v30.8h
umin v6.8h, v6.8h, v30.8h
st1 {v2.8h}, [x0], #16
st1 {v6.8h}, [x9], #16
.endif
b.gt 1b
sub x6, x6, #2
cmp x6, #1
b.lt 0f
mov x5, x8
add x0, x0, x1
add x9, x9, x1
add x2, x2, x3
add x10, x10, x3
add x4, x4, x7
add x11, x11, x7
b.eq 2f
b 1b
2:
.if \bpc == 8
ld1 {v0.8b}, [x2], #8
.else
ld1 {v0.8h}, [x2], #16
.endif
ld1 {v1.8h}, [x4], #16
subs x5, x5, #8
.if \bpc == 8
ushll v0.8h, v0.8b, #4 // u
.else
shl v0.8h, v0.8h, #4 // u
.endif
sub v1.8h, v1.8h, v0.8h // t1 - u
ushll v2.4s, v0.4h, #7 // u << 7
ushll2 v3.4s, v0.8h, #7 // u << 7
smlal v2.4s, v1.4h, v31.4h // v
smlal2 v3.4s, v1.8h, v31.8h // v
.if \bpc == 8
rshrn v2.4h, v2.4s, #11
rshrn2 v2.8h, v3.4s, #11
sqxtun v2.8b, v2.8h
st1 {v2.8b}, [x0], #8
.else
sqrshrun v2.4h, v2.4s, #11
sqrshrun2 v2.8h, v3.4s, #11
umin v2.8h, v2.8h, v30.8h
st1 {v2.8h}, [x0], #16
.endif
b.gt 2b
0:
ret
endfunc
// void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *src, const ptrdiff_t src_stride,
// const int16_t *t1, const int16_t *t2,
// const int w, const int h,
// const int16_t wt[2]);
function sgr_weighted2_\bpc\()bpc_neon, export=1
.if \bpc == 8
ldr x8, [sp]
.else
ldp x8, x9, [sp]
.endif
cmp x7, #2
add x10, x0, x1
add x11, x2, x3
add x12, x4, #2*FILTER_OUT_STRIDE
add x13, x5, #2*FILTER_OUT_STRIDE
ld2r {v30.8h, v31.8h}, [x8] // wt[0], wt[1]
.if \bpc == 16
dup v29.8h, w9
.endif
mov x8, #4*FILTER_OUT_STRIDE
lsl x1, x1, #1
lsl x3, x3, #1
add x9, x6, #7
bic x9, x9, #7 // Aligned width
.if \bpc == 8
sub x1, x1, x9
sub x3, x3, x9
.else
sub x1, x1, x9, lsl #1
sub x3, x3, x9, lsl #1
.endif
sub x8, x8, x9, lsl #1
mov x9, x6
b.lt 2f
1:
.if \bpc == 8
ld1 {v0.8b}, [x2], #8
ld1 {v16.8b}, [x11], #8
.else
ld1 {v0.8h}, [x2], #16
ld1 {v16.8h}, [x11], #16
.endif
ld1 {v1.8h}, [x4], #16
ld1 {v17.8h}, [x12], #16
ld1 {v2.8h}, [x5], #16
ld1 {v18.8h}, [x13], #16
subs x6, x6, #8
.if \bpc == 8
ushll v0.8h, v0.8b, #4 // u
ushll v16.8h, v16.8b, #4 // u
.else
shl v0.8h, v0.8h, #4 // u
shl v16.8h, v16.8h, #4 // u
.endif
sub v1.8h, v1.8h, v0.8h // t1 - u
sub v2.8h, v2.8h, v0.8h // t2 - u
sub v17.8h, v17.8h, v16.8h // t1 - u
sub v18.8h, v18.8h, v16.8h // t2 - u
ushll v3.4s, v0.4h, #7 // u << 7
ushll2 v4.4s, v0.8h, #7 // u << 7
ushll v19.4s, v16.4h, #7 // u << 7
ushll2 v20.4s, v16.8h, #7 // u << 7
smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
smlal v19.4s, v17.4h, v30.4h // wt[0] * (t1 - u)
smlal v19.4s, v18.4h, v31.4h // wt[1] * (t2 - u)
smlal2 v20.4s, v17.8h, v30.8h // wt[0] * (t1 - u)
smlal2 v20.4s, v18.8h, v31.8h // wt[1] * (t2 - u)
.if \bpc == 8
rshrn v3.4h, v3.4s, #11
rshrn2 v3.8h, v4.4s, #11
rshrn v19.4h, v19.4s, #11
rshrn2 v19.8h, v20.4s, #11
sqxtun v3.8b, v3.8h
sqxtun v19.8b, v19.8h
st1 {v3.8b}, [x0], #8
st1 {v19.8b}, [x10], #8
.else
sqrshrun v3.4h, v3.4s, #11
sqrshrun2 v3.8h, v4.4s, #11
sqrshrun v19.4h, v19.4s, #11
sqrshrun2 v19.8h, v20.4s, #11
umin v3.8h, v3.8h, v29.8h
umin v19.8h, v19.8h, v29.8h
st1 {v3.8h}, [x0], #16
st1 {v19.8h}, [x10], #16
.endif
b.gt 1b
subs x7, x7, #2
cmp x7, #1
b.lt 0f
mov x6, x9
add x0, x0, x1
add x10, x10, x1
add x2, x2, x3
add x11, x11, x3
add x4, x4, x8
add x12, x12, x8
add x5, x5, x8
add x13, x13, x8
b.eq 2f
b 1b
2:
.if \bpc == 8
ld1 {v0.8b}, [x2], #8
.else
ld1 {v0.8h}, [x2], #16
.endif
ld1 {v1.8h}, [x4], #16
ld1 {v2.8h}, [x5], #16
subs x6, x6, #8
.if \bpc == 8
ushll v0.8h, v0.8b, #4 // u
.else
shl v0.8h, v0.8h, #4 // u
.endif
sub v1.8h, v1.8h, v0.8h // t1 - u
sub v2.8h, v2.8h, v0.8h // t2 - u
ushll v3.4s, v0.4h, #7 // u << 7
ushll2 v4.4s, v0.8h, #7 // u << 7
smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
.if \bpc == 8
rshrn v3.4h, v3.4s, #11
rshrn2 v3.8h, v4.4s, #11
sqxtun v3.8b, v3.8h
st1 {v3.8b}, [x0], #8
.else
sqrshrun v3.4h, v3.4s, #11
sqrshrun2 v3.8h, v4.4s, #11
umin v3.8h, v3.8h, v29.8h
st1 {v3.8h}, [x0], #16
.endif
b.gt 1b
0:
ret
endfunc
.endm

third_party/dav1d/src/arm/64/mc.S (vendored, 237 lines changed)

@ -29,14 +29,7 @@
#include "src/arm/asm.S"
#include "util.S"
.macro avg dst, t0, t1
ld1 {\t0\().8h}, [x2], 16
ld1 {\t1\().8h}, [x3], 16
add \t0\().8h, \t0\().8h, \t1\().8h
sqrshrun \dst\().8b, \t0\().8h, #5
.endm
.macro avg16 dst, t0, t1, t2, t3
.macro avg dst, t0, t1, t2, t3
ld1 {\t0\().8h,\t1\().8h}, [x2], 32
ld1 {\t2\().8h,\t3\().8h}, [x3], 32
add \t0\().8h, \t0\().8h, \t2\().8h
@ -45,16 +38,7 @@
sqrshrun2 \dst\().16b, \t1\().8h, #5
.endm
.macro w_avg dst, t0, t1
ld1 {\t0\().8h}, [x2], 16
ld1 {\t1\().8h}, [x3], 16
sub \t0\().8h, \t1\().8h, \t0\().8h
sqdmulh \t0\().8h, \t0\().8h, v30.8h
add \t0\().8h, \t1\().8h, \t0\().8h
sqrshrun \dst\().8b, \t0\().8h, #4
.endm
.macro w_avg16 dst, t0, t1, t2, t3
.macro w_avg dst, t0, t1, t2, t3
ld1 {\t0\().8h,\t1\().8h}, [x2], 32
ld1 {\t2\().8h,\t3\().8h}, [x3], 32
sub \t0\().8h, \t2\().8h, \t0\().8h
@ -67,19 +51,7 @@
sqrshrun2 \dst\().16b, \t1\().8h, #4
.endm
.macro mask dst, t0, t1
ld1 {v30.8b}, [x6], 8
ld1 {\t0\().8h}, [x2], 16
mul v30.8b, v30.8b, v31.8b
ld1 {\t1\().8h}, [x3], 16
shll v30.8h, v30.8b, #8
sub \t0\().8h, \t1\().8h, \t0\().8h
sqdmulh \t0\().8h, \t0\().8h, v30.8h
add \t0\().8h, \t1\().8h, \t0\().8h
sqrshrun \dst\().8b, \t0\().8h, #4
.endm
.macro mask16 dst, t0, t1, t2, t3
.macro mask dst, t0, t1, t2, t3
ld1 {v30.16b}, [x6], 16
ld1 {\t0\().8h,\t1\().8h}, [x2], 32
mul v30.16b, v30.16b, v31.16b
@ -109,113 +81,108 @@ function \type\()_8bpc_neon, export=1
.endif
adr x7, L(\type\()_tbl)
sub w4, w4, #24
\type v4, v0, v1
ldrh w4, [x7, x4, lsl #1]
\type v5, v2, v3
\type v4, v0, v1, v2, v3
sub x7, x7, w4, uxtw
br x7
40:
add x7, x0, x1
lsl x1, x1, #1
4:
cmp w5, #4
st1 {v4.s}[0], [x0], x1
st1 {v4.s}[1], [x0], x1
st1 {v5.s}[0], [x0], x1
st1 {v5.s}[1], [x0], x1
st1 {v4.s}[1], [x7], x1
st1 {v4.s}[2], [x0], x1
st1 {v4.s}[3], [x7], x1
b.eq 0f
\type v6, v0, v1
\type v7, v2, v3
\type v5, v0, v1, v2, v3
cmp w5, #8
st1 {v6.s}[0], [x0], x1
st1 {v6.s}[1], [x0], x1
st1 {v7.s}[0], [x0], x1
st1 {v7.s}[1], [x0], x1
b.eq 0f
\type v4, v0, v1
\type v5, v2, v3
st1 {v4.s}[0], [x0], x1
st1 {v4.s}[1], [x0], x1
\type v6, v0, v1
st1 {v5.s}[0], [x0], x1
st1 {v5.s}[1], [x0], x1
\type v7, v2, v3
st1 {v6.s}[0], [x0], x1
st1 {v6.s}[1], [x0], x1
st1 {v7.s}[0], [x0], x1
st1 {v7.s}[1], [x0], x1
st1 {v5.s}[1], [x7], x1
st1 {v5.s}[2], [x0], x1
st1 {v5.s}[3], [x7], x1
b.eq 0f
\type v4, v0, v1, v2, v3
st1 {v4.s}[0], [x0], x1
st1 {v4.s}[1], [x7], x1
\type v5, v0, v1, v2, v3
st1 {v4.s}[2], [x0], x1
st1 {v4.s}[3], [x7], x1
st1 {v5.s}[0], [x0], x1
st1 {v5.s}[1], [x7], x1
st1 {v5.s}[2], [x0], x1
st1 {v5.s}[3], [x7], x1
ret
80:
add x7, x0, x1
lsl x1, x1, #1
8:
st1 {v4.8b}, [x0], x1
\type v6, v0, v1
st1 {v5.8b}, [x0], x1
\type v7, v0, v1
st1 {v6.8b}, [x0], x1
st1 {v4.d}[0], [x0], x1
\type v5, v0, v1, v2, v3
st1 {v4.d}[1], [x7], x1
st1 {v5.d}[0], [x0], x1
subs w5, w5, #4
st1 {v7.8b}, [x0], x1
st1 {v5.d}[1], [x7], x1
b.le 0f
\type v4, v0, v1
\type v5, v2, v3
\type v4, v0, v1, v2, v3
b 8b
160:
trn1 v4.2d, v4.2d, v5.2d
16:
\type\()16 v5, v0, v1, v2, v3
\type v5, v0, v1, v2, v3
st1 {v4.16b}, [x0], x1
\type\()16 v6, v0, v1, v2, v3
\type v6, v0, v1, v2, v3
st1 {v5.16b}, [x0], x1
\type\()16 v7, v0, v1, v2, v3
\type v7, v0, v1, v2, v3
st1 {v6.16b}, [x0], x1
subs w5, w5, #4
st1 {v7.16b}, [x0], x1
b.le 0f
\type\()16 v4, v0, v1, v2, v3
\type v4, v0, v1, v2, v3
b 16b
320:
trn1 v4.2d, v4.2d, v5.2d
add x7, x0, x1
lsl x1, x1, #1
32:
\type\()16 v5, v0, v1, v2, v3
\type\()16 v6, v0, v1, v2, v3
\type v5, v0, v1, v2, v3
\type v6, v0, v1, v2, v3
st1 {v4.16b,v5.16b}, [x0], x1
\type\()16 v7, v0, v1, v2, v3
\type v7, v0, v1, v2, v3
subs w5, w5, #2
st1 {v6.16b,v7.16b}, [x7], x1
b.le 0f
\type\()16 v4, v0, v1, v2, v3
\type v4, v0, v1, v2, v3
b 32b
640:
trn1 v4.2d, v4.2d, v5.2d
add x7, x0, x1
lsl x1, x1, #1
64:
\type\()16 v5, v0, v1, v2, v3
\type\()16 v6, v0, v1, v2, v3
\type\()16 v7, v0, v1, v2, v3
\type\()16 v16, v0, v1, v2, v3
\type\()16 v17, v0, v1, v2, v3
\type v5, v0, v1, v2, v3
\type v6, v0, v1, v2, v3
\type v7, v0, v1, v2, v3
\type v16, v0, v1, v2, v3
\type v17, v0, v1, v2, v3
st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
\type\()16 v18, v0, v1, v2, v3
\type\()16 v19, v0, v1, v2, v3
\type v18, v0, v1, v2, v3
\type v19, v0, v1, v2, v3
subs w5, w5, #2
st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
b.le 0f
\type\()16 v4, v0, v1, v2, v3
\type v4, v0, v1, v2, v3
b 64b
1280:
trn1 v4.2d, v4.2d, v5.2d
add x7, x0, #64
128:
\type\()16 v5, v0, v1, v2, v3
\type\()16 v6, v0, v1, v2, v3
\type\()16 v7, v0, v1, v2, v3
\type\()16 v16, v0, v1, v2, v3
\type\()16 v17, v0, v1, v2, v3
\type v5, v0, v1, v2, v3
\type v6, v0, v1, v2, v3
\type v7, v0, v1, v2, v3
\type v16, v0, v1, v2, v3
\type v17, v0, v1, v2, v3
st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
\type\()16 v18, v0, v1, v2, v3
\type\()16 v19, v0, v1, v2, v3
\type v18, v0, v1, v2, v3
\type v19, v0, v1, v2, v3
subs w5, w5, #1
st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
b.le 0f
\type\()16 v4, v0, v1, v2, v3
\type v4, v0, v1, v2, v3
b 128b
0:
ret
@ -223,9 +190,9 @@ L(\type\()_tbl):
.hword L(\type\()_tbl) - 1280b
.hword L(\type\()_tbl) - 640b
.hword L(\type\()_tbl) - 320b
.hword L(\type\()_tbl) - 160b
.hword L(\type\()_tbl) - 8b
.hword L(\type\()_tbl) - 4b
.hword L(\type\()_tbl) - 16b
.hword L(\type\()_tbl) - 80b
.hword L(\type\()_tbl) - 40b
endfunc
.endm
@ -464,10 +431,10 @@ function blend_8bpc_neon, export=1
sub x6, x6, w3, uxtw
movi v4.16b, #64
add x8, x0, x1
lsl w1, w1, #1
lsl x1, x1, #1
br x6
4:
ld1 {v2.d}[0], [x5], #8
ld1 {v2.8b}, [x5], #8
ld1 {v1.d}[0], [x2], #8
ld1 {v0.s}[0], [x0]
subs w4, w4, #2
@ -481,8 +448,8 @@ function blend_8bpc_neon, export=1
b.gt 4b
ret
8:
ld1 {v2.2d}, [x5], #16
ld1 {v1.2d}, [x2], #16
ld1 {v2.16b}, [x5], #16
ld1 {v1.16b}, [x2], #16
ld1 {v0.d}[0], [x0]
ld1 {v0.d}[1], [x8]
sub v3.16b, v4.16b, v2.16b
@ -498,13 +465,13 @@ function blend_8bpc_neon, export=1
b.gt 8b
ret
16:
ld1 {v1.2d, v2.2d}, [x5], #32
ld1 {v5.2d, v6.2d}, [x2], #32
ld1 {v0.2d}, [x0]
ld1 {v1.16b, v2.16b}, [x5], #32
ld1 {v5.16b, v6.16b}, [x2], #32
ld1 {v0.16b}, [x0]
subs w4, w4, #2
sub v7.16b, v4.16b, v1.16b
sub v20.16b, v4.16b, v2.16b
ld1 {v3.2d}, [x8]
ld1 {v3.16b}, [x8]
umull v16.8h, v5.8b, v1.8b
umlal v16.8h, v0.8b, v7.8b
umull2 v17.8h, v5.16b, v1.16b
@ -517,16 +484,16 @@ function blend_8bpc_neon, export=1
rshrn2 v18.16b, v17.8h, #6
rshrn v19.8b, v21.8h, #6
rshrn2 v19.16b, v22.8h, #6
st1 {v18.2d}, [x0], x1
st1 {v19.2d}, [x8], x1
st1 {v18.16b}, [x0], x1
st1 {v19.16b}, [x8], x1
b.gt 16b
ret
32:
ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x5], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x2], #64
ld1 {v20.2d, v21.2d}, [x0]
ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], #64
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
ld1 {v20.16b, v21.16b}, [x0]
subs w4, w4, #2
ld1 {v22.2d, v23.2d}, [x8]
ld1 {v22.16b, v23.16b}, [x8]
sub v5.16b, v4.16b, v0.16b
sub v6.16b, v4.16b, v1.16b
sub v30.16b, v4.16b, v2.16b
@ -555,8 +522,8 @@ function blend_8bpc_neon, export=1
rshrn2 v27.16b, v1.8h, #6
rshrn v28.8b, v29.8h, #6
rshrn2 v28.16b, v21.8h, #6
st1 {v24.2d, v25.2d}, [x0], x1
st1 {v27.2d, v28.2d}, [x8], x1
st1 {v24.16b, v25.16b}, [x0], x1
st1 {v27.16b, v28.16b}, [x8], x1
b.gt 32b
ret
L(blend_tbl):
@ -567,7 +534,7 @@ L(blend_tbl):
endfunc
function blend_h_8bpc_neon, export=1
adr x6, L(blend_h_tbl)
adr x6, L(blend_h_tbl)
movrel x5, X(obmc_masks)
add x5, x5, w4, uxtw
sub w4, w4, w4, lsr #2
@ -596,7 +563,7 @@ function blend_h_8bpc_neon, export=1
ret
4:
ld2r {v0.8b, v1.8b}, [x5], #2
ld1 {v2.2s}, [x2], #8
ld1 {v2.8b}, [x2], #8
subs w4, w4, #2
ext v0.8b, v0.8b, v1.8b, #4
ld1 {v3.s}[0], [x0]
@ -742,8 +709,8 @@ function blend_v_8bpc_neon, export=1
ret
40:
ld1r {v0.2s}, [x5]
sub x1, x1, #2
sub v1.8b, v4.8b, v0.8b
sub x1, x1, #3
4:
ld1 {v2.8b}, [x2], #8
ld1 {v3.s}[0], [x0]
@ -754,16 +721,14 @@ function blend_v_8bpc_neon, export=1
rshrn v5.8b, v5.8h, #6
st1 {v5.h}[0], [x0], #2
st1 {v5.h}[2], [x8], #2
st1 {v5.b}[2], [x0], #1
st1 {v5.b}[6], [x8], #1
add x0, x0, x1
add x8, x8, x1
st1 {v5.b}[2], [x0], x1
st1 {v5.b}[6], [x8], x1
b.gt 4b
ret
80:
ld1r {v0.2d}, [x5]
sub x1, x1, #4
sub v1.16b, v4.16b, v0.16b
sub x1, x1, #6
8:
ld1 {v2.16b}, [x2], #16
ld1 {v3.d}[0], [x0]
@ -777,16 +742,14 @@ function blend_v_8bpc_neon, export=1
rshrn2 v7.16b, v6.8h, #6
st1 {v7.s}[0], [x0], #4
st1 {v7.s}[2], [x8], #4
st1 {v7.h}[2], [x0], #2
st1 {v7.h}[6], [x8], #2
add x0, x0, x1
add x8, x8, x1
st1 {v7.h}[2], [x0], x1
st1 {v7.h}[6], [x8], x1
b.gt 8b
ret
160:
ld1 {v0.16b}, [x5]
sub x1, x1, #8
sub v2.16b, v4.16b, v0.16b
sub x1, x1, #12
16:
ld1 {v5.16b, v6.16b}, [x2], #32
ld1 {v7.16b}, [x0]
@ -806,17 +769,15 @@ function blend_v_8bpc_neon, export=1
rshrn2 v22.16b, v21.8h, #6
st1 {v19.8b}, [x0], #8
st1 {v22.8b}, [x8], #8
st1 {v19.s}[2], [x0], #4
st1 {v22.s}[2], [x8], #4
add x0, x0, x1
add x8, x8, x1
st1 {v19.s}[2], [x0], x1
st1 {v22.s}[2], [x8], x1
b.gt 16b
ret
320:
ld1 {v0.16b, v1.16b}, [x5]
sub x1, x1, #16
sub v2.16b, v4.16b, v0.16b
sub v3.16b, v4.16b, v1.16b
sub x1, x1, #24
sub v3.8b, v4.8b, v1.8b
32:
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
ld1 {v5.16b, v6.16b}, [x0]
@ -828,30 +789,22 @@ function blend_v_8bpc_neon, export=1
umlal2 v23.8h, v5.16b, v2.16b
umull v28.8h, v17.8b, v1.8b
umlal v28.8h, v6.8b, v3.8b
umull2 v29.8h, v17.16b, v1.16b
umlal2 v29.8h, v6.16b, v3.16b
umull v30.8h, v18.8b, v0.8b
umlal v30.8h, v20.8b, v2.8b
umull2 v31.8h, v18.16b, v0.16b
umlal2 v31.8h, v20.16b, v2.16b
umull v25.8h, v19.8b, v1.8b
umlal v25.8h, v21.8b, v3.8b
umull2 v26.8h, v19.16b, v1.16b
umlal2 v26.8h, v21.16b, v3.16b
rshrn v24.8b, v22.8h, #6
rshrn2 v24.16b, v23.8h, #6
rshrn v28.8b, v28.8h, #6
rshrn2 v28.16b, v29.8h, #6
rshrn v30.8b, v30.8h, #6
rshrn2 v30.16b, v31.8h, #6
rshrn v27.8b, v25.8h, #6
rshrn2 v27.16b, v26.8h, #6
st1 {v24.16b}, [x0], #16
st1 {v30.16b}, [x8], #16
st1 {v28.8b}, [x0], #8
st1 {v27.8b}, [x8], #8
add x0, x0, x1
add x8, x8, x1
st1 {v28.8b}, [x0], x1
st1 {v27.8b}, [x8], x1
b.gt 32b
ret
L(blend_v_tbl):
@ -2106,9 +2059,9 @@ L(\type\()_8tap_filter_2):
st1 {v3.4h}, [\ds2], \d_strd
.endif
b.le 0f
mov v16.16b, v18.16b
mov v17.16b, v28.16b
mov v18.16b, v29.16b
mov v16.8b, v18.8b
mov v17.8b, v28.8b
mov v18.8b, v29.8b
b 4b
480: // 4x8, 4x16, 4x32 hv
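For reference, the blend kernels earlier in this file (blend, blend_h, blend_v) all reduce to the same per-pixel formula that the movi #64 / sub / umull / umlal / rshrn #6 sequences implement. A scalar sketch with illustrative names only:

#include <stdint.h>

/* Scalar model of the NEON blend: tmp is the coded block, dst the existing
 * pixels, m a 0..64 mask (64 means take tmp entirely). rshrn #6 is the
 * rounding right shift, hence the +32 here. */
static inline uint8_t blend_px(const uint8_t dst, const uint8_t tmp, const uint8_t m) {
    return (uint8_t)((tmp * m + dst * (64 - m) + 32) >> 6);
}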

third_party/dav1d/src/arm/64/mc16.S (vendored, new file, 3409 lines): diff not shown because of its size

third_party/dav1d/src/arm/64/msac.S (vendored, 33 changed lines)

@ -110,25 +110,10 @@ endconst
.endif
.endm
.macro umull_n d0, d1, d2, d3, s0, s1, s2, s3, n
umull \d0\().4s, \s0\().4h, \s2\().4h
.if \n >= 8
umull2 \d1\().4s, \s0\().8h, \s2\().8h
.endif
.macro sqdmulh_n d0, d1, s0, s1, s2, s3, sz, n
sqdmulh \d0\sz, \s0\sz, \s2\sz
.if \n == 16
umull \d2\().4s, \s1\().4h, \s3\().4h
umull2 \d3\().4s, \s1\().8h, \s3\().8h
.endif
.endm
.macro shrn_n d0, d1, s0, s1, s2, s3, shift, n
shrn \d0\().4h, \s0\().4s, \shift
.if \n >= 8
shrn2 \d0\().8h, \s1\().4s, \shift
.endif
.if \n == 16
shrn \d1\().4h, \s2\().4s, \shift
shrn2 \d1\().8h, \s3\().4s, \shift
sqdmulh \d1\sz, \s1\sz, \s3\sz
.endif
.endm
@ -149,17 +134,19 @@ function msac_decode_symbol_adapt4_neon, export=1
ld1_n v0, v1, x1, \sz, \n // cdf
ld1r {v4\sz}, [x8] // rng
movrel x9, coeffs, 30
movi v31\sz, #0x7f, lsl #8 // 0x7f00
sub x9, x9, x2, lsl #1
ushr_n v2, v3, v0, v1, #6, \sz, \n // cdf >> EC_PROB_SHIFT
mvni v30\sz, #0x3f // 0xffc0
and v7\szb, v4\szb, v31\szb // rng & 0x7f00
str h4, [sp, #14] // store original u = s->rng
ushr v4\sz, v4\sz, #8 // r = rng >> 8
and_n v2, v3, v0, v1, v30, v30, \szb, \n // cdf & 0xffc0
umull_n v16, v17, v18, v19, v4, v4, v2, v3, \n // r * (cdf >> EC_PROB_SHIFT)
ld1_n v4, v5, x9, \sz, \n // EC_MIN_PROB * (n_symbols - ret)
shrn_n v2, v3, v16, v17, v18, v19, #1, \n // v >>= 7 - EC_PROB_SHIFT
sqdmulh_n v6, v7, v2, v3, v7, v7, \sz, \n // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
add x8, x0, #DIF + 6
add_n v4, v5, v2, v3, v4, v5, \sz, \n // v += EC_MIN_PROB * (n_symbols - ret)
add_n v4, v5, v2, v3, v4, v5, \sz, \n // v = cdf + EC_MIN_PROB * (n_symbols - ret)
add_n v4, v5, v6, v7, v4, v5, \sz, \n // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
ld1r {v6.8h}, [x8] // dif >> (EC_WIN_SIZE - 16)
movrel x8, bits
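The inline comments in this hunk describe the quantity the new sqdmulh path computes. As a scalar sketch, assuming EC_PROB_SHIFT and EC_MIN_PROB take dav1d's usual values of 6 and 4:

#include <stdint.h>

#define EC_PROB_SHIFT 6   /* assumed, per the "cdf >> EC_PROB_SHIFT" comments */
#define EC_MIN_PROB   4   /* assumed minimum per-symbol probability */

/* Candidate range partition for symbol index `ret` out of `n_symbols`,
 * following the comments above:
 *   v = ((cdf >> EC_PROB_SHIFT) * (rng >> 8)) >> 1
 *       + EC_MIN_PROB * (n_symbols - ret)                                */
static unsigned symbol_bound(const unsigned rng, const uint16_t cdf,
                             const int n_symbols, const int ret)
{
    const unsigned r = rng >> 8;
    return (((cdf >> EC_PROB_SHIFT) * r) >> 1) + EC_MIN_PROB * (n_symbols - ret);
}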

third_party/dav1d/src/arm/cdef_init_tmpl.c (vendored, 62 changed lines)

@ -27,42 +27,46 @@
#include "src/cpu.h"
#include "src/cdef.h"
#if BITDEPTH == 8
decl_cdef_dir_fn(dav1d_cdef_find_dir_neon);
#if BITDEPTH == 8 || ARCH_AARCH64
decl_cdef_dir_fn(BF(dav1d_cdef_find_dir, neon));
void dav1d_cdef_padding4_neon(uint16_t *tmp, const pixel *src,
ptrdiff_t src_stride, const pixel (*left)[2],
/*const*/ pixel *const top[2], int h,
enum CdefEdgeFlags edges);
void dav1d_cdef_padding8_neon(uint16_t *tmp, const pixel *src,
ptrdiff_t src_stride, const pixel (*left)[2],
/*const*/ pixel *const top[2], int h,
enum CdefEdgeFlags edges);
void BF(dav1d_cdef_padding4, neon)(uint16_t *tmp, const pixel *src,
ptrdiff_t src_stride, const pixel (*left)[2],
const pixel *const top, int h,
enum CdefEdgeFlags edges);
void BF(dav1d_cdef_padding8, neon)(uint16_t *tmp, const pixel *src,
ptrdiff_t src_stride, const pixel (*left)[2],
const pixel *const top, int h,
enum CdefEdgeFlags edges);
void dav1d_cdef_filter4_neon(pixel *dst, ptrdiff_t dst_stride,
const uint16_t *tmp, int pri_strength,
int sec_strength, int dir, int damping, int h);
void dav1d_cdef_filter8_neon(pixel *dst, ptrdiff_t dst_stride,
const uint16_t *tmp, int pri_strength,
int sec_strength, int dir, int damping, int h);
// Passing edges to this function, to allow it to switch to a more
// optimized version for fully edged cases. Using size_t for edges,
// to avoid ABI differences for passing more than one argument on the stack.
void BF(dav1d_cdef_filter4, neon)(pixel *dst, ptrdiff_t dst_stride,
const uint16_t *tmp, int pri_strength,
int sec_strength, int dir, int damping, int h,
size_t edges HIGHBD_DECL_SUFFIX);
void BF(dav1d_cdef_filter8, neon)(pixel *dst, ptrdiff_t dst_stride,
const uint16_t *tmp, int pri_strength,
int sec_strength, int dir, int damping, int h,
size_t edges HIGHBD_DECL_SUFFIX);
#define DEFINE_FILTER(w, h, tmp_stride) \
static void \
cdef_filter_##w##x##h##_neon(pixel *dst, \
const ptrdiff_t stride, \
const pixel (*left)[2], \
/*const*/ pixel *const top[2], \
const int pri_strength, \
const int sec_strength, \
const int dir, \
const int damping, \
const enum CdefEdgeFlags edges) \
const pixel (*left)[2], const pixel *const top, \
const int pri_strength, const int sec_strength, \
const int dir, const int damping, \
const enum CdefEdgeFlags edges \
HIGHBD_DECL_SUFFIX) \
{ \
ALIGN_STK_16(uint16_t, tmp_buf, 12*tmp_stride + 8,); \
ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,); \
uint16_t *tmp = tmp_buf + 2 * tmp_stride + 8; \
dav1d_cdef_padding##w##_neon(tmp, dst, stride, left, top, h, edges); \
dav1d_cdef_filter##w##_neon(dst, stride, tmp, pri_strength, \
sec_strength, dir, damping, h); \
BF(dav1d_cdef_padding##w, neon)(tmp, dst, stride, left, top, h, edges); \
BF(dav1d_cdef_filter##w, neon)(dst, stride, tmp, pri_strength, \
sec_strength, dir, damping, h, edges \
HIGHBD_TAIL_SUFFIX); \
}
DEFINE_FILTER(8, 8, 16)
@ -76,8 +80,8 @@ COLD void bitfn(dav1d_cdef_dsp_init_arm)(Dav1dCdefDSPContext *const c) {
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
#if BITDEPTH == 8
c->dir = dav1d_cdef_find_dir_neon;
#if BITDEPTH == 8 || ARCH_AARCH64
c->dir = BF(dav1d_cdef_find_dir, neon);
c->fb[0] = cdef_filter_8x8_neon;
c->fb[1] = cdef_filter_4x8_neon;
c->fb[2] = cdef_filter_4x4_neon;


@ -28,20 +28,20 @@
#include "src/cpu.h"
#include "src/loopfilter.h"
decl_loopfilter_sb_fn(dav1d_lpf_h_sb_y_neon);
decl_loopfilter_sb_fn(dav1d_lpf_v_sb_y_neon);
decl_loopfilter_sb_fn(dav1d_lpf_h_sb_uv_neon);
decl_loopfilter_sb_fn(dav1d_lpf_v_sb_uv_neon);
decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, neon));
decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, neon));
decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, neon));
decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, neon));
COLD void bitfn(dav1d_loop_filter_dsp_init_arm)(Dav1dLoopFilterDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
#if BITDEPTH == 8
c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_neon;
c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_neon;
c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_neon;
c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_neon;
#if BITDEPTH == 8 || ARCH_AARCH64
c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, neon);
c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, neon);
c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, neon);
c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, neon);
#endif
}


@ -29,19 +29,26 @@
#include "src/looprestoration.h"
#include "src/tables.h"
#if BITDEPTH == 8
// This calculates things slightly differently than the reference C version.
// This version calculates roughly this:
#if BITDEPTH == 8 || ARCH_AARCH64
// The 8bpc version calculates things slightly differently than the reference
// C version. That version calculates roughly this:
// int16_t sum = 0;
// for (int i = 0; i < 7; i++)
// sum += src[idx] * fh[i];
// int16_t sum2 = (src[x] << 7) - (1 << (BITDEPTH + 6)) + rounding_off_h;
// int16_t sum2 = (src[x] << 7) - (1 << (bitdepth + 6)) + rounding_off_h;
// sum = iclip(sum + sum2, INT16_MIN, INT16_MAX) >> round_bits_h;
// sum += 2048;
void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4],
const pixel *src, ptrdiff_t stride,
const int16_t fh[7], const intptr_t w,
int h, enum LrEdgeFlags edges);
// sum += 1 << (bitdepth + 6 - round_bits_h);
// Compared to the reference C version, this is the output of the first pass
// _subtracted_ by 1 << (bitdepth + 6 - round_bits_h) = 2048, i.e.
// with round_offset precompensated.
// The 16bpc version calculates things pretty much the same way as the
// reference C version, but with the end result subtracted by
// 1 << (bitdepth + 6 - round_bits_h).
void BF(dav1d_wiener_filter_h, neon)(int16_t *dst, const pixel (*left)[4],
const pixel *src, ptrdiff_t stride,
const int16_t fh[7], const intptr_t w,
int h, enum LrEdgeFlags edges
HIGHBD_DECL_SUFFIX);
// This calculates things slightly differently than the reference C version.
// This version calculates roughly this:
// fv[3] += 128;
@ -50,217 +57,242 @@ void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4],
// sum += mid[idx] * fv[i];
// sum = (sum + rounding_off_v) >> round_bits_v;
// This function assumes that the width is a multiple of 8.
void dav1d_wiener_filter_v_neon(pixel *dst, ptrdiff_t stride,
const int16_t *mid, int w, int h,
const int16_t fv[7], enum LrEdgeFlags edges,
ptrdiff_t mid_stride);
void dav1d_copy_narrow_neon(pixel *dst, ptrdiff_t stride,
const pixel *src, int w, int h);
void BF(dav1d_wiener_filter_v, neon)(pixel *dst, ptrdiff_t stride,
const int16_t *mid, int w, int h,
const int16_t fv[7], enum LrEdgeFlags edges,
ptrdiff_t mid_stride HIGHBD_DECL_SUFFIX);
void BF(dav1d_copy_narrow, neon)(pixel *dst, ptrdiff_t stride,
const pixel *src, int w, int h);
static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
const pixel (*const left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h, const int16_t fh[7],
const int16_t fv[7], const enum LrEdgeFlags edges)
const int16_t fv[7], const enum LrEdgeFlags edges
HIGHBD_DECL_SUFFIX)
{
ALIGN_STK_16(int16_t, mid, 68 * 384,);
int mid_stride = (w + 7) & ~7;
// Horizontal filter
dav1d_wiener_filter_h_neon(&mid[2 * mid_stride], left, dst, dst_stride,
fh, w, h, edges);
BF(dav1d_wiener_filter_h, neon)(&mid[2 * mid_stride], left, dst, dst_stride,
fh, w, h, edges HIGHBD_TAIL_SUFFIX);
if (edges & LR_HAVE_TOP)
dav1d_wiener_filter_h_neon(mid, NULL, lpf, lpf_stride,
fh, w, 2, edges);
BF(dav1d_wiener_filter_h, neon)(mid, NULL, lpf, lpf_stride,
fh, w, 2, edges HIGHBD_TAIL_SUFFIX);
if (edges & LR_HAVE_BOTTOM)
dav1d_wiener_filter_h_neon(&mid[(2 + h) * mid_stride], NULL,
lpf + 6 * PXSTRIDE(lpf_stride), lpf_stride,
fh, w, 2, edges);
BF(dav1d_wiener_filter_h, neon)(&mid[(2 + h) * mid_stride], NULL,
lpf + 6 * PXSTRIDE(lpf_stride),
lpf_stride, fh, w, 2, edges
HIGHBD_TAIL_SUFFIX);
// Vertical filter
if (w >= 8)
dav1d_wiener_filter_v_neon(dst, dst_stride, &mid[2*mid_stride],
w & ~7, h, fv, edges, mid_stride * sizeof(*mid));
BF(dav1d_wiener_filter_v, neon)(dst, dst_stride, &mid[2*mid_stride],
w & ~7, h, fv, edges,
mid_stride * sizeof(*mid)
HIGHBD_TAIL_SUFFIX);
if (w & 7) {
// For uneven widths, do a full 8 pixel wide filtering into a temp
// buffer and copy out the narrow slice of pixels separately into dest.
ALIGN_STK_16(pixel, tmp, 64 * 8,);
dav1d_wiener_filter_v_neon(tmp, w & 7, &mid[2*mid_stride + (w & ~7)],
w & 7, h, fv, edges, mid_stride * sizeof(*mid));
dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, tmp, w & 7, h);
BF(dav1d_wiener_filter_v, neon)(tmp, (w & 7) * sizeof(pixel),
&mid[2*mid_stride + (w & ~7)],
w & 7, h, fv, edges,
mid_stride * sizeof(*mid)
HIGHBD_TAIL_SUFFIX);
BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, tmp, w & 7, h);
}
}
void dav1d_sgr_box3_h_neon(int32_t *sumsq, int16_t *sum,
const pixel (*left)[4],
const pixel *src, const ptrdiff_t stride,
const int w, const int h,
const enum LrEdgeFlags edges);
void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum,
const pixel (*left)[4],
const pixel *src, const ptrdiff_t stride,
const int w, const int h,
const enum LrEdgeFlags edges);
void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
const int w, const int h,
const enum LrEdgeFlags edges);
void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
const int w, const int h, const int strength);
void dav1d_sgr_finish_filter1_neon(coef *tmp,
const pixel *src, const ptrdiff_t stride,
const int32_t *a, const int16_t *b,
const int w, const int h);
const int w, const int h, const int strength,
const int bitdepth_max);
void BF(dav1d_sgr_finish_filter1, neon)(int16_t *tmp,
const pixel *src, const ptrdiff_t stride,
const int32_t *a, const int16_t *b,
const int w, const int h);
/* filter with a 3x3 box (radius=1) */
static void dav1d_sgr_filter1_neon(coef *tmp,
static void dav1d_sgr_filter1_neon(int16_t *tmp,
const pixel *src, const ptrdiff_t stride,
const pixel (*left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h, const int strength,
const enum LrEdgeFlags edges)
const enum LrEdgeFlags edges
HIGHBD_DECL_SUFFIX)
{
ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
dav1d_sgr_box3_h_neon(sumsq, sum, left, src, stride, w, h, edges);
BF(dav1d_sgr_box3_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
if (edges & LR_HAVE_TOP)
dav1d_sgr_box3_h_neon(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
NULL, lpf, lpf_stride, w, 2, edges);
BF(dav1d_sgr_box3_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
NULL, lpf, lpf_stride, w, 2, edges);
if (edges & LR_HAVE_BOTTOM)
dav1d_sgr_box3_h_neon(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
NULL, lpf + 6 * PXSTRIDE(lpf_stride),
lpf_stride, w, 2, edges);
BF(dav1d_sgr_box3_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
NULL, lpf + 6 * PXSTRIDE(lpf_stride),
lpf_stride, w, 2, edges);
dav1d_sgr_box3_v_neon(sumsq, sum, w, h, edges);
dav1d_sgr_calc_ab1_neon(a, b, w, h, strength);
dav1d_sgr_finish_filter1_neon(tmp, src, stride, a, b, w, h);
dav1d_sgr_calc_ab1_neon(a, b, w, h, strength, BITDEPTH_MAX);
BF(dav1d_sgr_finish_filter1, neon)(tmp, src, stride, a, b, w, h);
}
void dav1d_sgr_box5_h_neon(int32_t *sumsq, int16_t *sum,
const pixel (*left)[4],
const pixel *src, const ptrdiff_t stride,
const int w, const int h,
const enum LrEdgeFlags edges);
void BF(dav1d_sgr_box5_h, neon)(int32_t *sumsq, int16_t *sum,
const pixel (*left)[4],
const pixel *src, const ptrdiff_t stride,
const int w, const int h,
const enum LrEdgeFlags edges);
void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
const int w, const int h,
const enum LrEdgeFlags edges);
void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
const int w, const int h, const int strength);
void dav1d_sgr_finish_filter2_neon(coef *tmp,
const pixel *src, const ptrdiff_t stride,
const int32_t *a, const int16_t *b,
const int w, const int h);
const int w, const int h, const int strength,
const int bitdepth_max);
void BF(dav1d_sgr_finish_filter2, neon)(int16_t *tmp,
const pixel *src, const ptrdiff_t stride,
const int32_t *a, const int16_t *b,
const int w, const int h);
/* filter with a 5x5 box (radius=2) */
static void dav1d_sgr_filter2_neon(coef *tmp,
static void dav1d_sgr_filter2_neon(int16_t *tmp,
const pixel *src, const ptrdiff_t stride,
const pixel (*left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h, const int strength,
const enum LrEdgeFlags edges)
const enum LrEdgeFlags edges
HIGHBD_DECL_SUFFIX)
{
ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
dav1d_sgr_box5_h_neon(sumsq, sum, left, src, stride, w, h, edges);
BF(dav1d_sgr_box5_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
if (edges & LR_HAVE_TOP)
dav1d_sgr_box5_h_neon(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
NULL, lpf, lpf_stride, w, 2, edges);
BF(dav1d_sgr_box5_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
NULL, lpf, lpf_stride, w, 2, edges);
if (edges & LR_HAVE_BOTTOM)
dav1d_sgr_box5_h_neon(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
NULL, lpf + 6 * PXSTRIDE(lpf_stride),
lpf_stride, w, 2, edges);
BF(dav1d_sgr_box5_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
NULL, lpf + 6 * PXSTRIDE(lpf_stride),
lpf_stride, w, 2, edges);
dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges);
dav1d_sgr_calc_ab2_neon(a, b, w, h, strength);
dav1d_sgr_finish_filter2_neon(tmp, src, stride, a, b, w, h);
dav1d_sgr_calc_ab2_neon(a, b, w, h, strength, BITDEPTH_MAX);
BF(dav1d_sgr_finish_filter2, neon)(tmp, src, stride, a, b, w, h);
}
void dav1d_sgr_weighted1_neon(pixel *dst, const ptrdiff_t dst_stride,
const pixel *src, const ptrdiff_t src_stride,
const coef *t1, const int w, const int h,
const int wt);
void dav1d_sgr_weighted2_neon(pixel *dst, const ptrdiff_t dst_stride,
const pixel *src, const ptrdiff_t src_stride,
const coef *t1, const coef *t2,
const int w, const int h,
const int16_t wt[2]);
void BF(dav1d_sgr_weighted1, neon)(pixel *dst, const ptrdiff_t dst_stride,
const pixel *src, const ptrdiff_t src_stride,
const int16_t *t1, const int w, const int h,
const int wt HIGHBD_DECL_SUFFIX);
void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride,
const pixel *src, const ptrdiff_t src_stride,
const int16_t *t1, const int16_t *t2,
const int w, const int h,
const int16_t wt[2] HIGHBD_DECL_SUFFIX);
static void sgr_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
const pixel (*const left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h, const int sgr_idx,
const int16_t sgr_wt[7], const enum LrEdgeFlags edges)
const int16_t sgr_wt[7], const enum LrEdgeFlags edges
HIGHBD_DECL_SUFFIX)
{
if (!dav1d_sgr_params[sgr_idx][0]) {
ALIGN_STK_16(coef, tmp, 64 * 384,);
ALIGN_STK_16(int16_t, tmp, 64 * 384,);
dav1d_sgr_filter1_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
w, h, dav1d_sgr_params[sgr_idx][3], edges);
w, h, dav1d_sgr_params[sgr_idx][3], edges
HIGHBD_TAIL_SUFFIX);
if (w >= 8)
dav1d_sgr_weighted1_neon(dst, dst_stride, dst, dst_stride,
tmp, w & ~7, h, (1 << 7) - sgr_wt[1]);
BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride,
tmp, w & ~7, h, (1 << 7) - sgr_wt[1]
HIGHBD_TAIL_SUFFIX);
if (w & 7) {
// For uneven widths, do a full 8 pixel wide filtering into a temp
// buffer and copy out the narrow slice of pixels separately into
// dest.
ALIGN_STK_16(pixel, stripe, 64 * 8,);
dav1d_sgr_weighted1_neon(stripe, w & 7, dst + (w & ~7), dst_stride,
tmp + (w & ~7), w & 7, h,
(1 << 7) - sgr_wt[1]);
dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, stripe,
w & 7, h);
BF(dav1d_sgr_weighted1, neon)(stripe, (w & 7) * sizeof(pixel),
dst + (w & ~7), dst_stride,
tmp + (w & ~7), w & 7, h,
(1 << 7) - sgr_wt[1]
HIGHBD_TAIL_SUFFIX);
BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, stripe,
w & 7, h);
}
} else if (!dav1d_sgr_params[sgr_idx][1]) {
ALIGN_STK_16(coef, tmp, 64 * 384,);
ALIGN_STK_16(int16_t, tmp, 64 * 384,);
dav1d_sgr_filter2_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
w, h, dav1d_sgr_params[sgr_idx][2], edges);
w, h, dav1d_sgr_params[sgr_idx][2], edges
HIGHBD_TAIL_SUFFIX);
if (w >= 8)
dav1d_sgr_weighted1_neon(dst, dst_stride, dst, dst_stride,
tmp, w & ~7, h, sgr_wt[0]);
BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride,
tmp, w & ~7, h, sgr_wt[0]
HIGHBD_TAIL_SUFFIX);
if (w & 7) {
// For uneven widths, do a full 8 pixel wide filtering into a temp
// buffer and copy out the narrow slice of pixels separately into
// dest.
ALIGN_STK_16(pixel, stripe, 64 * 8,);
dav1d_sgr_weighted1_neon(stripe, w & 7, dst + (w & ~7), dst_stride,
tmp + (w & ~7), w & 7, h, sgr_wt[0]);
dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, stripe,
w & 7, h);
BF(dav1d_sgr_weighted1, neon)(stripe, (w & 7) * sizeof(pixel),
dst + (w & ~7), dst_stride,
tmp + (w & ~7), w & 7, h, sgr_wt[0]
HIGHBD_TAIL_SUFFIX);
BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, stripe,
w & 7, h);
}
} else {
ALIGN_STK_16(coef, tmp1, 64 * 384,);
ALIGN_STK_16(coef, tmp2, 64 * 384,);
ALIGN_STK_16(int16_t, tmp1, 64 * 384,);
ALIGN_STK_16(int16_t, tmp2, 64 * 384,);
dav1d_sgr_filter2_neon(tmp1, dst, dst_stride, left, lpf, lpf_stride,
w, h, dav1d_sgr_params[sgr_idx][2], edges);
w, h, dav1d_sgr_params[sgr_idx][2], edges
HIGHBD_TAIL_SUFFIX);
dav1d_sgr_filter1_neon(tmp2, dst, dst_stride, left, lpf, lpf_stride,
w, h, dav1d_sgr_params[sgr_idx][3], edges);
w, h, dav1d_sgr_params[sgr_idx][3], edges
HIGHBD_TAIL_SUFFIX);
const int16_t wt[2] = { sgr_wt[0], 128 - sgr_wt[0] - sgr_wt[1] };
if (w >= 8)
dav1d_sgr_weighted2_neon(dst, dst_stride, dst, dst_stride,
tmp1, tmp2, w & ~7, h, wt);
BF(dav1d_sgr_weighted2, neon)(dst, dst_stride, dst, dst_stride,
tmp1, tmp2, w & ~7, h, wt
HIGHBD_TAIL_SUFFIX);
if (w & 7) {
// For uneven widths, do a full 8 pixel wide filtering into a temp
// buffer and copy out the narrow slice of pixels separately into
// dest.
ALIGN_STK_16(pixel, stripe, 64 * 8,);
dav1d_sgr_weighted2_neon(stripe, w & 7, dst + (w & ~7), dst_stride,
tmp1 + (w & ~7), tmp2 + (w & ~7),
w & 7, h, wt);
dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, stripe,
w & 7, h);
BF(dav1d_sgr_weighted2, neon)(stripe, (w & 7) * sizeof(pixel),
dst + (w & ~7), dst_stride,
tmp1 + (w & ~7), tmp2 + (w & ~7),
w & 7, h, wt HIGHBD_TAIL_SUFFIX);
BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, stripe,
w & 7, h);
}
}
}
#endif // BITDEPTH == 8
COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c) {
COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c, int bpc) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
#if BITDEPTH == 8
#if BITDEPTH == 8 || ARCH_AARCH64
c->wiener = wiener_filter_neon;
c->selfguided = sgr_filter_neon;
if (bpc <= 10)
c->selfguided = sgr_filter_neon;
#endif
}
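The "uneven widths" comments repeated above describe one pattern: run the NEON kernel (which always produces blocks of 8 columns) over the aligned part, then once more into a scratch buffer whose narrow tail is copied out. A self-contained sketch with a placeholder kernel, not dav1d's API:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

typedef uint8_t pixel;

/* Placeholder "filter": always writes exactly 8 output columns per row. */
static void kernel8(pixel *dst, const ptrdiff_t dst_stride,
                    const pixel *src, const ptrdiff_t src_stride, const int h)
{
    for (int y = 0; y < h; y++)
        memcpy(dst + y * dst_stride, src + y * src_stride, 8);
}

/* Assumes h <= 64 (stripe height) and that src has at least 8 readable
 * columns at the tail, as the padded stripes above do. */
static void filter_with_tail(pixel *dst, const ptrdiff_t stride,
                             const pixel *src, const ptrdiff_t src_stride,
                             const int w, const int h)
{
    for (int x = 0; x + 8 <= w; x += 8)            /* multiple-of-8 part */
        kernel8(dst + x, stride, src + x, src_stride, h);
    if (w & 7) {                                   /* 1..7 leftover columns */
        pixel tmp[64 * 8];                         /* scratch rows, 8 wide */
        kernel8(tmp, 8, src + (w & ~7), src_stride, h);
        for (int y = 0; y < h; y++)                /* copy only the narrow slice */
            memcpy(dst + y * stride + (w & ~7), tmp + y * 8,
                   (size_t)(w & 7) * sizeof(pixel));
    }
}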

third_party/dav1d/src/arm/mc_init_tmpl.c (vendored, 90 changed lines)

@ -30,52 +30,52 @@
#include "src/mc.h"
#include "src/cpu.h"
decl_mc_fn(dav1d_put_8tap_regular_8bpc_neon);
decl_mc_fn(dav1d_put_8tap_regular_smooth_8bpc_neon);
decl_mc_fn(dav1d_put_8tap_regular_sharp_8bpc_neon);
decl_mc_fn(dav1d_put_8tap_smooth_8bpc_neon);
decl_mc_fn(dav1d_put_8tap_smooth_regular_8bpc_neon);
decl_mc_fn(dav1d_put_8tap_smooth_sharp_8bpc_neon);
decl_mc_fn(dav1d_put_8tap_sharp_8bpc_neon);
decl_mc_fn(dav1d_put_8tap_sharp_regular_8bpc_neon);
decl_mc_fn(dav1d_put_8tap_sharp_smooth_8bpc_neon);
decl_mc_fn(dav1d_put_bilin_8bpc_neon);
decl_mc_fn(BF(dav1d_put_8tap_regular, neon));
decl_mc_fn(BF(dav1d_put_8tap_regular_smooth, neon));
decl_mc_fn(BF(dav1d_put_8tap_regular_sharp, neon));
decl_mc_fn(BF(dav1d_put_8tap_smooth, neon));
decl_mc_fn(BF(dav1d_put_8tap_smooth_regular, neon));
decl_mc_fn(BF(dav1d_put_8tap_smooth_sharp, neon));
decl_mc_fn(BF(dav1d_put_8tap_sharp, neon));
decl_mc_fn(BF(dav1d_put_8tap_sharp_regular, neon));
decl_mc_fn(BF(dav1d_put_8tap_sharp_smooth, neon));
decl_mc_fn(BF(dav1d_put_bilin, neon));
decl_mct_fn(dav1d_prep_8tap_regular_8bpc_neon);
decl_mct_fn(dav1d_prep_8tap_regular_smooth_8bpc_neon);
decl_mct_fn(dav1d_prep_8tap_regular_sharp_8bpc_neon);
decl_mct_fn(dav1d_prep_8tap_smooth_8bpc_neon);
decl_mct_fn(dav1d_prep_8tap_smooth_regular_8bpc_neon);
decl_mct_fn(dav1d_prep_8tap_smooth_sharp_8bpc_neon);
decl_mct_fn(dav1d_prep_8tap_sharp_8bpc_neon);
decl_mct_fn(dav1d_prep_8tap_sharp_regular_8bpc_neon);
decl_mct_fn(dav1d_prep_8tap_sharp_smooth_8bpc_neon);
decl_mct_fn(dav1d_prep_bilin_8bpc_neon);
decl_mct_fn(BF(dav1d_prep_8tap_regular, neon));
decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, neon));
decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, neon));
decl_mct_fn(BF(dav1d_prep_8tap_smooth, neon));
decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, neon));
decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, neon));
decl_mct_fn(BF(dav1d_prep_8tap_sharp, neon));
decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, neon));
decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, neon));
decl_mct_fn(BF(dav1d_prep_bilin, neon));
decl_avg_fn(dav1d_avg_8bpc_neon);
decl_w_avg_fn(dav1d_w_avg_8bpc_neon);
decl_mask_fn(dav1d_mask_8bpc_neon);
decl_blend_fn(dav1d_blend_8bpc_neon);
decl_blend_dir_fn(dav1d_blend_h_8bpc_neon);
decl_blend_dir_fn(dav1d_blend_v_8bpc_neon);
decl_avg_fn(BF(dav1d_avg, neon));
decl_w_avg_fn(BF(dav1d_w_avg, neon));
decl_mask_fn(BF(dav1d_mask, neon));
decl_blend_fn(BF(dav1d_blend, neon));
decl_blend_dir_fn(BF(dav1d_blend_h, neon));
decl_blend_dir_fn(BF(dav1d_blend_v, neon));
decl_w_mask_fn(dav1d_w_mask_444_8bpc_neon);
decl_w_mask_fn(dav1d_w_mask_422_8bpc_neon);
decl_w_mask_fn(dav1d_w_mask_420_8bpc_neon);
decl_w_mask_fn(BF(dav1d_w_mask_444, neon));
decl_w_mask_fn(BF(dav1d_w_mask_422, neon));
decl_w_mask_fn(BF(dav1d_w_mask_420, neon));
decl_warp8x8_fn(dav1d_warp_affine_8x8_8bpc_neon);
decl_warp8x8t_fn(dav1d_warp_affine_8x8t_8bpc_neon);
decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, neon));
decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, neon));
void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
#define init_mc_fn(type, name, suffix) \
c->mc[type] = dav1d_put_##name##_8bpc_##suffix
c->mc[type] = BF(dav1d_put_##name, suffix)
#define init_mct_fn(type, name, suffix) \
c->mct[type] = dav1d_prep_##name##_8bpc_##suffix
c->mct[type] = BF(dav1d_prep_##name, suffix)
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
#if BITDEPTH == 8
#if BITDEPTH == 8 || ARCH_AARCH64
init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, neon);
init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon);
@ -98,16 +98,16 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, neon);
init_mct_fn(FILTER_2D_BILINEAR, bilin, neon);
c->avg = dav1d_avg_8bpc_neon;
c->w_avg = dav1d_w_avg_8bpc_neon;
c->mask = dav1d_mask_8bpc_neon;
c->blend = dav1d_blend_8bpc_neon;
c->blend_h = dav1d_blend_h_8bpc_neon;
c->blend_v = dav1d_blend_v_8bpc_neon;
c->w_mask[0] = dav1d_w_mask_444_8bpc_neon;
c->w_mask[1] = dav1d_w_mask_422_8bpc_neon;
c->w_mask[2] = dav1d_w_mask_420_8bpc_neon;
c->warp8x8 = dav1d_warp_affine_8x8_8bpc_neon;
c->warp8x8t = dav1d_warp_affine_8x8t_8bpc_neon;
c->avg = BF(dav1d_avg, neon);
c->w_avg = BF(dav1d_w_avg, neon);
c->mask = BF(dav1d_mask, neon);
c->blend = BF(dav1d_blend, neon);
c->blend_h = BF(dav1d_blend_h, neon);
c->blend_v = BF(dav1d_blend_v, neon);
c->w_mask[0] = BF(dav1d_w_mask_444, neon);
c->w_mask[1] = BF(dav1d_w_mask_422, neon);
c->w_mask[2] = BF(dav1d_w_mask_420, neon);
c->warp8x8 = BF(dav1d_warp_affine_8x8, neon);
c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon);
#endif
}

third_party/dav1d/src/cdef.h (vendored, 2 changed lines)

@ -52,7 +52,7 @@ typedef const void *const_left_pixel_row_2px;
// order to get access to pre-filter top pixels, use $top.
#define decl_cdef_fn(name) \
void (name)(pixel *dst, ptrdiff_t stride, const_left_pixel_row_2px left, \
/*const*/ pixel *const top[2], int pri_strength, int sec_strength, \
const pixel *top, int pri_strength, int sec_strength, \
int dir, int damping, enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX)
typedef decl_cdef_fn(*cdef_fn);

third_party/dav1d/src/cdef_apply_tmpl.c (vendored, 109 changed lines)

@ -39,24 +39,28 @@ enum Backup2x8Flags {
BACKUP_2X8_UV = 1 << 1,
};
static void backup2lines(pixel *const dst[3][2],
/*const*/ pixel *const src[3],
const ptrdiff_t src_stride[2], int y_off, int w,
static void backup2lines(pixel *const dst[3], /*const*/ pixel *const src[3],
const ptrdiff_t stride[2],
const enum Dav1dPixelLayout layout)
{
pixel_copy(dst[0][0], src[0] + (y_off - 2) * PXSTRIDE(src_stride[0]), w);
pixel_copy(dst[0][1], src[0] + (y_off - 1) * PXSTRIDE(src_stride[0]), w);
const ptrdiff_t y_stride = PXSTRIDE(stride[0]);
if (y_stride < 0)
pixel_copy(dst[0] + y_stride, src[0] + 7 * y_stride, -2 * y_stride);
else
pixel_copy(dst[0], src[0] + 6 * y_stride, 2 * y_stride);
if (layout == DAV1D_PIXEL_LAYOUT_I400) return;
const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
w >>= ss_hor;
y_off >>= ss_ver;
pixel_copy(dst[1][0], src[1] + (y_off - 2) * PXSTRIDE(src_stride[1]), w);
pixel_copy(dst[1][1], src[1] + (y_off - 1) * PXSTRIDE(src_stride[1]), w);
pixel_copy(dst[2][0], src[2] + (y_off - 2) * PXSTRIDE(src_stride[1]), w);
pixel_copy(dst[2][1], src[2] + (y_off - 1) * PXSTRIDE(src_stride[1]), w);
if (layout != DAV1D_PIXEL_LAYOUT_I400) {
const ptrdiff_t uv_stride = PXSTRIDE(stride[1]);
if (uv_stride < 0) {
const int uv_off = layout == DAV1D_PIXEL_LAYOUT_I420 ? 3 : 7;
pixel_copy(dst[1] + uv_stride, src[1] + uv_off * uv_stride, -2 * uv_stride);
pixel_copy(dst[2] + uv_stride, src[2] + uv_off * uv_stride, -2 * uv_stride);
} else {
const int uv_off = layout == DAV1D_PIXEL_LAYOUT_I420 ? 2 : 6;
pixel_copy(dst[1], src[1] + uv_off * uv_stride, 2 * uv_stride);
pixel_copy(dst[2], src[2] + uv_off * uv_stride, 2 * uv_stride);
}
}
}
static void backup2x8(pixel dst[3][8][2],
@ -105,7 +109,6 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
const int damping = f->frame_hdr->cdef.damping + bitdepth_min_8;
const enum Dav1dPixelLayout layout = f->cur.p.layout;
const int uv_idx = DAV1D_PIXEL_LAYOUT_I444 - layout;
const int has_chroma = layout != DAV1D_PIXEL_LAYOUT_I400;
const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
@ -114,19 +117,16 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
const int by_idx = by & 30;
if (by + 2 >= f->bh) edges &= ~CDEF_HAVE_BOTTOM;
if (edges & CDEF_HAVE_BOTTOM) {
// backup pre-filter data for next iteration
backup2lines(f->lf.cdef_line[!tf], ptrs, f->cur.stride,
8, f->bw * 4, layout);
}
if (edges & CDEF_HAVE_BOTTOM) // backup pre-filter data for next iteration
backup2lines(f->lf.cdef_line[!tf], ptrs, f->cur.stride, layout);
pixel lr_bak[2 /* idx */][3 /* plane */][8 /* y */][2 /* x */];
ALIGN_STK_16(pixel, lr_bak, 2 /* idx */, [3 /* plane */][8 /* y */][2 /* x */]);
pixel *iptrs[3] = { ptrs[0], ptrs[1], ptrs[2] };
edges &= ~CDEF_HAVE_LEFT;
edges |= CDEF_HAVE_RIGHT;
enum Backup2x8Flags prev_flag = 0;
for (int sbx = 0, last_skip = 1; sbx < sb64w; sbx++, edges |= CDEF_HAVE_LEFT) {
const int sb128x = sbx >>1;
const int sb128x = sbx >> 1;
const int sb64_idx = ((by & sbsz) >> 3) + (sbx & 1);
const int cdef_idx = lflvl[sb128x].cdef_idx[sb64_idx];
if (cdef_idx == -1 ||
@ -141,6 +141,16 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
const int uv_lvl = f->frame_hdr->cdef.uv_strength[cdef_idx];
const enum Backup2x8Flags flag = !!y_lvl + (!!uv_lvl << 1);
const int y_pri_lvl = (y_lvl >> 2) << bitdepth_min_8;
int y_sec_lvl = y_lvl & 3;
y_sec_lvl += y_sec_lvl == 3;
y_sec_lvl <<= bitdepth_min_8;
const int uv_pri_lvl = (uv_lvl >> 2) << bitdepth_min_8;
int uv_sec_lvl = uv_lvl & 3;
uv_sec_lvl += uv_sec_lvl == 3;
uv_sec_lvl <<= bitdepth_min_8;
pixel *bptrs[3] = { iptrs[0], iptrs[1], iptrs[2] };
for (int bx = sbx * sbsz; bx < imin((sbx + 1) * sbsz, f->bw);
bx += 2, edges |= CDEF_HAVE_LEFT)
@ -169,41 +179,32 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
backup2x8(lr_bak[!bit], bptrs, f->cur.stride, 8, layout, flag);
}
// the actual filter
const int y_pri_lvl = (y_lvl >> 2) << bitdepth_min_8;
int y_sec_lvl = y_lvl & 3;
y_sec_lvl += y_sec_lvl == 3;
y_sec_lvl <<= bitdepth_min_8;
const int uv_pri_lvl = (uv_lvl >> 2) << bitdepth_min_8;
int uv_sec_lvl = uv_lvl & 3;
uv_sec_lvl += uv_sec_lvl == 3;
uv_sec_lvl <<= bitdepth_min_8;
int dir;
unsigned variance;
const int dir = dsp->cdef.dir(bptrs[0], f->cur.stride[0],
&variance HIGHBD_CALL_SUFFIX);
if (y_lvl) {
if (y_pri_lvl || uv_pri_lvl)
dir = dsp->cdef.dir(bptrs[0], f->cur.stride[0],
&variance HIGHBD_CALL_SUFFIX);
if (y_pri_lvl) {
const int adj_y_pri_lvl = adjust_strength(y_pri_lvl, variance);
if (adj_y_pri_lvl || y_sec_lvl)
dsp->cdef.fb[0](bptrs[0], f->cur.stride[0], lr_bak[bit][0],
&f->lf.cdef_line[tf][0][bx * 4],
adj_y_pri_lvl, y_sec_lvl, dir,
damping, edges HIGHBD_CALL_SUFFIX);
} else if (y_sec_lvl)
dsp->cdef.fb[0](bptrs[0], f->cur.stride[0], lr_bak[bit][0],
(pixel *const [2]) {
&f->lf.cdef_line[tf][0][0][bx * 4],
&f->lf.cdef_line[tf][0][1][bx * 4],
},
adjust_strength(y_pri_lvl, variance),
y_sec_lvl, y_pri_lvl ? dir : 0,
&f->lf.cdef_line[tf][0][bx * 4],
0, y_sec_lvl, 0,
damping, edges HIGHBD_CALL_SUFFIX);
}
if (uv_lvl && has_chroma) {
const int uvdir =
f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I422 ? dir :
((uint8_t[]) { 7, 0, 2, 4, 5, 6, 6, 6 })[dir];
if (uv_lvl) {
assert(layout != DAV1D_PIXEL_LAYOUT_I400);
const int uvdir = uv_pri_lvl ? layout == DAV1D_PIXEL_LAYOUT_I422 ?
((const uint8_t[]) { 7, 0, 2, 4, 5, 6, 6, 6 })[dir] : dir : 0;
for (int pl = 1; pl <= 2; pl++) {
dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.stride[1],
lr_bak[bit][pl],
(pixel *const [2]) {
&f->lf.cdef_line[tf][pl][0][bx * 4 >> ss_hor],
&f->lf.cdef_line[tf][pl][1][bx * 4 >> ss_hor],
},
uv_pri_lvl, uv_sec_lvl,
uv_pri_lvl ? uvdir : 0,
dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.stride[1], lr_bak[bit][pl],
&f->lf.cdef_line[tf][pl][bx * 4 >> ss_hor],
uv_pri_lvl, uv_sec_lvl, uvdir,
damping - 1, edges HIGHBD_CALL_SUFFIX);
}
}
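The reworked backup2lines() above folds the two per-row copies into one memcpy whose source and length depend on the sign of the picture stride. A standalone sketch of that idiom, with illustrative names and dst assumed to be a two-row buffer laid out with the same stride sign:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

typedef uint8_t pixel;

/* Back up rows 6 and 7 of an 8-row band in one copy. With a negative stride
 * the rows sit bottom-up in memory, so the copy starts at the lower address
 * (row 7) and the destination is offset by one row. */
static void backup_two_rows(pixel *const dst, const pixel *const src,
                            const ptrdiff_t stride)
{
    if (stride < 0)
        memcpy(dst + stride, src + 7 * stride, (size_t)(-2 * stride));
    else
        memcpy(dst, src + 6 * stride, (size_t)(2 * stride));
}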

third_party/dav1d/src/cdef_tmpl.c (vendored, 193 changed lines)

@ -32,29 +32,30 @@
#include "common/intops.h"
#include "src/cdef.h"
#include "src/tables.h"
static inline int constrain(const int diff, const int threshold,
const int damping)
const int shift)
{
if (!threshold) return 0;
const int shift = imax(0, damping - ulog2(threshold));
return apply_sign(imin(abs(diff), imax(0, threshold - (abs(diff) >> shift))),
diff);
const int adiff = abs(diff);
return apply_sign(imin(adiff, imax(0, threshold - (adiff >> shift))), diff);
}
static inline void fill(uint16_t *tmp, const ptrdiff_t stride,
static inline void fill(int16_t *tmp, const ptrdiff_t stride,
const int w, const int h)
{
/* Use a value that's a large positive number when interpreted as unsigned,
* and a large negative number when interpreted as signed. */
for (int y = 0; y < h; y++) {
for (int x = 0; x < w; x++)
tmp[x] = INT16_MAX;
tmp[x] = INT16_MIN;
tmp += stride;
}
}
static void padding(uint16_t *tmp, const ptrdiff_t tmp_stride,
static void padding(int16_t *tmp, const ptrdiff_t tmp_stride,
const pixel *src, const ptrdiff_t src_stride,
const pixel (*left)[2], pixel *const top[2],
const pixel (*left)[2], const pixel *top,
const int w, const int h,
const enum CdefEdgeFlags edges)
{
@ -77,9 +78,11 @@ static void padding(uint16_t *tmp, const ptrdiff_t tmp_stride,
x_end -= 2;
}
for (int y = y_start; y < 0; y++)
for (int y = y_start; y < 0; y++) {
for (int x = x_start; x < x_end; x++)
tmp[x + y * tmp_stride] = top[y & 1][x];
tmp[x + y * tmp_stride] = top[x];
top += PXSTRIDE(src_stride);
}
for (int y = 0; y < h; y++)
for (int x = x_start; x < 0; x++)
tmp[x + y * tmp_stride] = left[y][2 + x];
@ -93,75 +96,113 @@ static void padding(uint16_t *tmp, const ptrdiff_t tmp_stride,
static NOINLINE void
cdef_filter_block_c(pixel *dst, const ptrdiff_t dst_stride,
const pixel (*left)[2], /*const*/ pixel *const top[2],
const int w, const int h, const int pri_strength,
const int sec_strength, const int dir,
const int damping, const enum CdefEdgeFlags edges
HIGHBD_DECL_SUFFIX)
const pixel (*left)[2], const pixel *const top,
const int pri_strength, const int sec_strength,
const int dir, const int damping, const int w, int h,
const enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
static const int8_t cdef_directions[8 /* dir */][2 /* pass */] = {
{ -1 * 12 + 1, -2 * 12 + 2 },
{ 0 * 12 + 1, -1 * 12 + 2 },
{ 0 * 12 + 1, 0 * 12 + 2 },
{ 0 * 12 + 1, 1 * 12 + 2 },
{ 1 * 12 + 1, 2 * 12 + 2 },
{ 1 * 12 + 0, 2 * 12 + 1 },
{ 1 * 12 + 0, 2 * 12 + 0 },
{ 1 * 12 + 0, 2 * 12 - 1 }
};
const ptrdiff_t tmp_stride = 12;
assert((w == 4 || w == 8) && (h == 4 || h == 8));
uint16_t tmp_buf[144]; // 12*12 is the maximum value of tmp_stride * (h + 4)
uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2;
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
const int pri_tap = 4 - ((pri_strength >> bitdepth_min_8) & 1);
int16_t tmp_buf[144]; // 12*12 is the maximum value of tmp_stride * (h + 4)
int16_t *tmp = tmp_buf + 2 * tmp_stride + 2;
padding(tmp, tmp_stride, dst, dst_stride, left, top, w, h, edges);
// run actual filter
for (int y = 0; y < h; y++) {
for (int x = 0; x < w; x++) {
int sum = 0;
const int px = dst[x];
int max = px, min = px;
int pri_tap_k = pri_tap;
for (int k = 0; k < 2; k++) {
const int off1 = cdef_directions[dir][k];
const int p0 = tmp[x + off1];
const int p1 = tmp[x - off1];
sum += pri_tap_k * constrain(p0 - px, pri_strength, damping);
sum += pri_tap_k * constrain(p1 - px, pri_strength, damping);
// if pri_tap_k == 4 then it becomes 2 else it remains 3
pri_tap_k -= (pri_tap_k << 1) - 6;
if (p0 != INT16_MAX) max = imax(p0, max);
if (p1 != INT16_MAX) max = imax(p1, max);
min = imin(p0, min);
min = imin(p1, min);
const int off2 = cdef_directions[(dir + 2) & 7][k];
const int s0 = tmp[x + off2];
const int s1 = tmp[x - off2];
const int off3 = cdef_directions[(dir + 6) & 7][k];
const int s2 = tmp[x + off3];
const int s3 = tmp[x - off3];
if (s0 != INT16_MAX) max = imax(s0, max);
if (s1 != INT16_MAX) max = imax(s1, max);
if (s2 != INT16_MAX) max = imax(s2, max);
if (s3 != INT16_MAX) max = imax(s3, max);
min = imin(s0, min);
min = imin(s1, min);
min = imin(s2, min);
min = imin(s3, min);
// sec_tap starts at 2 and becomes 1
const int sec_tap = 2 - k;
sum += sec_tap * constrain(s0 - px, sec_strength, damping);
sum += sec_tap * constrain(s1 - px, sec_strength, damping);
sum += sec_tap * constrain(s2 - px, sec_strength, damping);
sum += sec_tap * constrain(s3 - px, sec_strength, damping);
}
dst[x] = iclip(px + ((8 + sum - (sum < 0)) >> 4), min, max);
if (pri_strength) {
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
const int pri_tap = 4 - ((pri_strength >> bitdepth_min_8) & 1);
const int pri_shift = imax(0, damping - ulog2(pri_strength));
if (sec_strength) {
const int sec_shift = imax(0, damping - ulog2(sec_strength));
do {
for (int x = 0; x < w; x++) {
const int px = dst[x];
int sum = 0;
int max = px, min = px;
int pri_tap_k = pri_tap;
for (int k = 0; k < 2; k++) {
const int off1 = dav1d_cdef_directions[dir + 2][k]; // dir
const int p0 = tmp[x + off1];
const int p1 = tmp[x - off1];
sum += pri_tap_k * constrain(p0 - px, pri_strength, pri_shift);
sum += pri_tap_k * constrain(p1 - px, pri_strength, pri_shift);
// if pri_tap_k == 4 then it becomes 2 else it remains 3
pri_tap_k = (pri_tap_k & 3) | 2;
min = umin(p0, min);
max = imax(p0, max);
min = umin(p1, min);
max = imax(p1, max);
const int off2 = dav1d_cdef_directions[dir + 4][k]; // dir + 2
const int off3 = dav1d_cdef_directions[dir + 0][k]; // dir - 2
const int s0 = tmp[x + off2];
const int s1 = tmp[x - off2];
const int s2 = tmp[x + off3];
const int s3 = tmp[x - off3];
// sec_tap starts at 2 and becomes 1
const int sec_tap = 2 - k;
sum += sec_tap * constrain(s0 - px, sec_strength, sec_shift);
sum += sec_tap * constrain(s1 - px, sec_strength, sec_shift);
sum += sec_tap * constrain(s2 - px, sec_strength, sec_shift);
sum += sec_tap * constrain(s3 - px, sec_strength, sec_shift);
min = umin(s0, min);
max = imax(s0, max);
min = umin(s1, min);
max = imax(s1, max);
min = umin(s2, min);
max = imax(s2, max);
min = umin(s3, min);
max = imax(s3, max);
}
dst[x] = iclip(px + ((sum - (sum < 0) + 8) >> 4), min, max);
}
dst += PXSTRIDE(dst_stride);
tmp += tmp_stride;
} while (--h);
} else { // pri_strength only
do {
for (int x = 0; x < w; x++) {
const int px = dst[x];
int sum = 0;
int pri_tap_k = pri_tap;
for (int k = 0; k < 2; k++) {
const int off = dav1d_cdef_directions[dir + 2][k]; // dir
const int p0 = tmp[x + off];
const int p1 = tmp[x - off];
sum += pri_tap_k * constrain(p0 - px, pri_strength, pri_shift);
sum += pri_tap_k * constrain(p1 - px, pri_strength, pri_shift);
pri_tap_k = (pri_tap_k & 3) | 2;
}
dst[x] = px + ((sum - (sum < 0) + 8) >> 4);
}
dst += PXSTRIDE(dst_stride);
tmp += tmp_stride;
} while (--h);
}
dst += PXSTRIDE(dst_stride);
tmp += tmp_stride;
} else { // sec_strength only
assert(sec_strength);
const int sec_shift = imax(0, damping - ulog2(sec_strength));
do {
for (int x = 0; x < w; x++) {
const int px = dst[x];
int sum = 0;
for (int k = 0; k < 2; k++) {
const int off1 = dav1d_cdef_directions[dir + 4][k]; // dir + 2
const int off2 = dav1d_cdef_directions[dir + 0][k]; // dir - 2
const int s0 = tmp[x + off1];
const int s1 = tmp[x - off1];
const int s2 = tmp[x + off2];
const int s3 = tmp[x - off2];
const int sec_tap = 2 - k;
sum += sec_tap * constrain(s0 - px, sec_strength, sec_shift);
sum += sec_tap * constrain(s1 - px, sec_strength, sec_shift);
sum += sec_tap * constrain(s2 - px, sec_strength, sec_shift);
sum += sec_tap * constrain(s3 - px, sec_strength, sec_shift);
}
dst[x] = px + ((sum - (sum < 0) + 8) >> 4);
}
dst += PXSTRIDE(dst_stride);
tmp += tmp_stride;
} while (--h);
}
}
@ -169,7 +210,7 @@ cdef_filter_block_c(pixel *dst, const ptrdiff_t dst_stride,
static void cdef_filter_block_##w##x##h##_c(pixel *const dst, \
const ptrdiff_t stride, \
const pixel (*left)[2], \
/*const*/ pixel *const top[2], \
const pixel *const top, \
const int pri_strength, \
const int sec_strength, \
const int dir, \
@ -177,8 +218,8 @@ static void cdef_filter_block_##w##x##h##_c(pixel *const dst, \
const enum CdefEdgeFlags edges \
HIGHBD_DECL_SUFFIX) \
{ \
cdef_filter_block_c(dst, stride, left, top, w, h, pri_strength, sec_strength, \
dir, damping, edges HIGHBD_TAIL_SUFFIX); \
cdef_filter_block_c(dst, stride, left, top, pri_strength, sec_strength, \
dir, damping, w, h, edges HIGHBD_TAIL_SUFFIX); \
}
cdef_fn(4, 4);
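A scalar sketch of constrain() as used above, with the shift hoisted out of the pixel loop (shift = max(0, damping - ulog2(strength))); the helpers are reimplemented so the example stands alone:

#include <stdlib.h>

static int ulog2(unsigned v) { int n = 0; while (v >>= 1) n++; return n; }

/* The callers above only invoke this with a nonzero strength, so the
 * threshold check is kept purely for the standalone example. */
static int constrain(const int diff, const int threshold, const int shift)
{
    if (!threshold) return 0;
    const int adiff = abs(diff);
    const int lim   = threshold - (adiff >> shift);
    const int clip  = lim > 0 ? lim : 0;             /* imax(0, ...)   */
    const int mag   = adiff < clip ? adiff : clip;   /* imin(adiff, ...) */
    return diff < 0 ? -mag : mag;                    /* apply_sign     */
}

/* Example: damping 5, pri_strength 4 -> shift = 5 - ulog2(4) = 3;
 * constrain(13, 4, 3) = min(13, max(0, 4 - (13 >> 3))) = 3. */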

third_party/dav1d/src/cpu.c (vendored, 33 changed lines)

@ -30,24 +30,27 @@
#include "src/cpu.h"
static unsigned flags = 0;
#if ARCH_X86
/* Disable AVX-512 by default for the time being */
static unsigned flags_mask = ~DAV1D_X86_CPU_FLAG_AVX512ICL;
#else
static unsigned flags_mask = -1;
#endif
COLD void dav1d_init_cpu(void) {
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
flags = dav1d_get_cpu_flags_arm();
#elif ARCH_PPC64LE
flags = dav1d_get_cpu_flags_ppc();
#elif ARCH_X86
flags = dav1d_get_cpu_flags_x86();
#endif
#endif
}
COLD unsigned dav1d_get_cpu_flags(void) {
static unsigned flags;
static uint8_t checked = 0;
if (!checked) {
#if (ARCH_AARCH64 || ARCH_ARM) && HAVE_ASM
flags = dav1d_get_cpu_flags_arm();
#elif ARCH_PPC64LE && HAVE_ASM
flags = dav1d_get_cpu_flags_ppc();
#elif ARCH_X86 && HAVE_ASM
flags = dav1d_get_cpu_flags_x86();
#else
flags = 0;
#endif
checked = 1;
}
return flags & flags_mask;
}

third_party/dav1d/src/cpu.h (vendored, 3 changed lines)

@ -42,7 +42,8 @@
#include "src/x86/cpu.h"
#endif
void dav1d_init_cpu(void);
unsigned dav1d_get_cpu_flags(void);
DAV1D_API void dav1d_set_cpu_flags_mask(const unsigned mask);
DAV1D_API void dav1d_set_cpu_flags_mask(unsigned mask);
#endif /* DAV1D_SRC_CPU_H */
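Since dav1d_get_cpu_flags() now returns flags & flags_mask (see the cpu.c hunk above), the exported mask setter can be used to force the C code paths. A hypothetical caller, for illustration only:

#include "src/cpu.h"

/* Clearing the mask makes dav1d_get_cpu_flags() report no SIMD capabilities,
 * so every DSP init falls back to the C functions. */
static void force_c_paths(void)
{
    dav1d_init_cpu();              /* one-time detection, added in this patch */
    dav1d_set_cpu_flags_mask(0);
}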

third_party/dav1d/src/decode.c (vendored, 87 changed lines)

@ -627,8 +627,8 @@ static void read_vartx_tree(Dav1dTileContext *const t,
// var-tx tree coding
b->tx_split[0] = b->tx_split[1] = 0;
b->max_ytx = dav1d_max_txfm_size_for_bs[bs][0];
if (f->frame_hdr->segmentation.lossless[b->seg_id] ||
b->max_ytx == TX_4X4)
if (!b->skip && (f->frame_hdr->segmentation.lossless[b->seg_id] ||
b->max_ytx == TX_4X4))
{
b->max_ytx = b->uvtx = TX_4X4;
if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE) {
@ -645,8 +645,6 @@ static void read_vartx_tree(Dav1dTileContext *const t,
case_set(bh4, l., 1, by4);
case_set(bw4, a->, 0, bx4);
#undef set_ctx
} else {
assert(f->frame_hdr->txfm_mode == DAV1D_TX_LARGEST);
}
b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.layout];
} else {
@ -1878,10 +1876,11 @@ static int decode_b(Dav1dTileContext *const t,
b->inter_mode == (is_comp ? GLOBALMV_GLOBALMV : GLOBALMV);
const uint8_t (*const lf_lvls)[8][2] = (const uint8_t (*)[8][2])
&ts->lflvl[b->seg_id][0][b->ref[0] + 1][!is_globalmv];
dav1d_create_lf_mask_inter(t->lf_mask, f->lf.level, f->b4_stride,
lf_lvls, t->bx, t->by, f->w4, f->h4,
b->skip, bs, b->tx_split, b->uvtx,
f->cur.p.layout,
dav1d_create_lf_mask_inter(t->lf_mask, f->lf.level, f->b4_stride, lf_lvls,
t->bx, t->by, f->w4, f->h4, b->skip, bs,
f->frame_hdr->segmentation.lossless[b->seg_id] ?
(enum RectTxfmSize) TX_4X4 : b->max_ytx,
b->tx_split, b->uvtx, f->cur.p.layout,
&t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
@ -2350,7 +2349,7 @@ static void setup_tile(Dav1dTileState *const ts,
// Reference Restoration Unit (used for exp coding)
int sb_idx, unit_idx;
if (f->frame_hdr->super_res.enabled) {
if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
// vertical components only
sb_idx = (ts->tiling.row_start >> 5) * f->sr_sb128w;
unit_idx = (ts->tiling.row_start & 16) >> 3;
@ -2363,7 +2362,7 @@ static void setup_tile(Dav1dTileState *const ts,
if (!((f->lf.restore_planes >> p) & 1U))
continue;
if (f->frame_hdr->super_res.enabled) {
if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
const int ss_hor = p && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int d = f->frame_hdr->super_res.width_scale_denominator;
const int unit_size_log2 = f->frame_hdr->restoration.unit_size[!!p];
@ -2543,7 +2542,7 @@ int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) {
const enum Dav1dRestorationType frame_type = f->frame_hdr->restoration.type[p];
if (f->frame_hdr->super_res.enabled) {
if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
const int w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
const int n_units = imax(1, (w + half_unit) >> unit_size_log2);
@ -2763,24 +2762,42 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
}
// update allocation of block contexts for above
const int line_sz = (int)f->b4_stride << hbd;
if (line_sz != f->lf.line_sz) {
dav1d_freep_aligned(&f->lf.cdef_line[0][0][0]);
uint8_t *ptr = dav1d_alloc_aligned(line_sz * 4 * 12, 32);
const ptrdiff_t y_stride = f->cur.stride[0], uv_stride = f->cur.stride[1];
if (y_stride != f->lf.cdef_line_sz[0] || uv_stride != f->lf.cdef_line_sz[1]) {
dav1d_free_aligned(f->lf.cdef_line_buf);
size_t alloc_sz = 64;
alloc_sz += (y_stride < 0 ? -y_stride : y_stride ) * 4;
alloc_sz += (uv_stride < 0 ? -uv_stride : uv_stride) * 8;
uint8_t *ptr = f->lf.cdef_line_buf = dav1d_alloc_aligned(alloc_sz, 32);
if (!ptr) {
f->lf.line_sz = 0;
f->lf.cdef_line_sz[0] = f->lf.cdef_line_sz[1] = 0;
goto error;
}
for (int pl = 0; pl <= 2; pl++) {
f->lf.cdef_line[0][pl][0] = ptr + line_sz * 4 * 0;
f->lf.cdef_line[0][pl][1] = ptr + line_sz * 4 * 1;
f->lf.cdef_line[1][pl][0] = ptr + line_sz * 4 * 2;
f->lf.cdef_line[1][pl][1] = ptr + line_sz * 4 * 3;
ptr += line_sz * 4 * 4;
ptr += 32;
if (y_stride < 0) {
f->lf.cdef_line[0][0] = ptr - y_stride * 1;
f->lf.cdef_line[1][0] = ptr - y_stride * 3;
ptr -= y_stride * 4;
} else {
f->lf.cdef_line[0][0] = ptr + y_stride * 0;
f->lf.cdef_line[1][0] = ptr + y_stride * 2;
ptr += y_stride * 4;
}
if (uv_stride < 0) {
f->lf.cdef_line[0][1] = ptr - uv_stride * 1;
f->lf.cdef_line[0][2] = ptr - uv_stride * 3;
f->lf.cdef_line[1][1] = ptr - uv_stride * 5;
f->lf.cdef_line[1][2] = ptr - uv_stride * 7;
} else {
f->lf.cdef_line[0][1] = ptr + uv_stride * 0;
f->lf.cdef_line[0][2] = ptr + uv_stride * 2;
f->lf.cdef_line[1][1] = ptr + uv_stride * 4;
f->lf.cdef_line[1][2] = ptr + uv_stride * 6;
}
f->lf.line_sz = line_sz;
f->lf.cdef_line_sz[0] = (int) y_stride;
f->lf.cdef_line_sz[1] = (int) uv_stride;
}
const int lr_line_sz = ((f->sr_cur.p.p.w + 31) & ~31) << hbd;
@ -2944,14 +2961,19 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
}
}
// init loopfilter pointers
/* Init loopfilter pointers. Increasing NULL pointers is technically UB,
* so just point the chroma pointers in 4:0:0 to the luma plane here to
* avoid having additional in-loop branches in various places. We never
* dereference those pointers so it doesn't really matter what they
* point at, as long as the pointers are valid. */
const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
f->lf.mask_ptr = f->lf.mask;
f->lf.p[0] = f->cur.data[0];
f->lf.p[1] = f->cur.data[1];
f->lf.p[2] = f->cur.data[2];
f->lf.p[1] = f->cur.data[has_chroma ? 1 : 0];
f->lf.p[2] = f->cur.data[has_chroma ? 2 : 0];
f->lf.sr_p[0] = f->sr_cur.p.data[0];
f->lf.sr_p[1] = f->sr_cur.p.data[1];
f->lf.sr_p[2] = f->sr_cur.p.data[2];
f->lf.sr_p[1] = f->sr_cur.p.data[has_chroma ? 1 : 0];
f->lf.sr_p[2] = f->sr_cur.p.data[has_chroma ? 2 : 0];
f->lf.tile_row = 1;
dav1d_cdf_thread_wait(&f->in_cdf);
@ -3220,7 +3242,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
dav1d_intra_pred_dsp_init_##bd##bpc(&dsp->ipred); \
dav1d_itx_dsp_init_##bd##bpc(&dsp->itx); \
dav1d_loop_filter_dsp_init_##bd##bpc(&dsp->lf); \
dav1d_loop_restoration_dsp_init_##bd##bpc(&dsp->lr); \
dav1d_loop_restoration_dsp_init_##bd##bpc(&dsp->lr, bpc); \
dav1d_mc_dsp_init_##bd##bpc(&dsp->mc); \
dav1d_film_grain_dsp_init_##bd##bpc(&dsp->fg); \
break
@ -3301,7 +3323,8 @@ int dav1d_submit_frame(Dav1dContext *const c) {
}
f->gmv_warp_allowed[i] = f->frame_hdr->gmv[i].type > DAV1D_WM_TYPE_TRANSLATION &&
!f->frame_hdr->force_integer_mv &&
!dav1d_get_shear_params(&f->frame_hdr->gmv[i]);
!dav1d_get_shear_params(&f->frame_hdr->gmv[i]) &&
!f->svc[i][0].scale;
}
}
@ -3338,14 +3361,14 @@ int dav1d_submit_frame(Dav1dContext *const c) {
res = dav1d_thread_picture_alloc(c, f, bpc);
if (res < 0) goto error;
if (f->frame_hdr->super_res.enabled) {
if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
res = dav1d_picture_alloc_copy(c, &f->cur, f->frame_hdr->width[0], &f->sr_cur.p);
if (res < 0) goto error;
} else {
dav1d_picture_ref(&f->cur, &f->sr_cur.p);
}
if (f->frame_hdr->super_res.enabled) {
if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
f->resize_step[0] = scale_fac(f->cur.p.w, f->sr_cur.p.p.w);
const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int in_cw = (f->cur.p.w + ss_hor) >> ss_hor;
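The cdef_line allocation hunk above sizes the buffer from the signed picture strides and anchors the row pointers at the end of each block when a stride is negative. The idea in isolation, as a hypothetical helper rather than the dav1d code:

#include <stdint.h>
#include <stdlib.h>

/* Lay out n rows inside one allocation for a signed stride. For stride < 0,
 * rows[k] + x (0 <= x < -stride) must still land inside the buffer, so the
 * pointers are anchored at the end of the block instead of the start.
 * Consecutive rows still differ by exactly `stride`. */
static uint8_t *alloc_rows(uint8_t *rows[], const int n, const ptrdiff_t stride)
{
    const size_t row_sz = (size_t)(stride < 0 ? -stride : stride);
    uint8_t *const buf = malloc(row_sz * n);
    if (!buf) return NULL;
    for (int k = 0; k < n; k++)
        rows[k] = stride < 0 ? buf + row_sz * n + (k + 1) * stride
                             : buf + k * stride;
    return buf; /* caller frees */
}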

third_party/dav1d/src/ext/x86/x86inc.asm (vendored, 55 changed lines)

@ -651,8 +651,10 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%if WIN64 == 0
%macro WIN64_SPILL_XMM 1
%assign xmm_regs_used %1
%endmacro
%macro WIN64_RESTORE_XMM 0
%assign xmm_regs_used 0
%endmacro
%macro WIN64_PUSH_XMM 0
%endmacro
@ -824,33 +826,34 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
; cpuflags
%assign cpuflags_mmx (1<<0)
%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx
%assign cpuflags_3dnow (1<<2) | cpuflags_mmx
%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
%assign cpuflags_sse (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2 (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
%assign cpuflags_lzcnt (1<<7) | cpuflags_sse2
%assign cpuflags_sse3 (1<<8) | cpuflags_sse2
%assign cpuflags_ssse3 (1<<9) | cpuflags_sse3
%assign cpuflags_sse4 (1<<10)| cpuflags_ssse3
%assign cpuflags_sse42 (1<<11)| cpuflags_sse4
%assign cpuflags_aesni (1<<12)| cpuflags_sse42
%assign cpuflags_gfni (1<<13)| cpuflags_sse42
%assign cpuflags_avx (1<<14)| cpuflags_sse42
%assign cpuflags_xop (1<<15)| cpuflags_avx
%assign cpuflags_fma4 (1<<16)| cpuflags_avx
%assign cpuflags_fma3 (1<<17)| cpuflags_avx
%assign cpuflags_bmi1 (1<<18)| cpuflags_avx|cpuflags_lzcnt
%assign cpuflags_bmi2 (1<<19)| cpuflags_bmi1
%assign cpuflags_avx2 (1<<20)| cpuflags_fma3|cpuflags_bmi2
%assign cpuflags_avx512 (1<<21)| cpuflags_avx2 ; F, CD, BW, DQ, VL
%assign cpuflags_mmx (1<<0)
%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx
%assign cpuflags_3dnow (1<<2) | cpuflags_mmx
%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
%assign cpuflags_sse (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2 (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
%assign cpuflags_lzcnt (1<<7) | cpuflags_sse2
%assign cpuflags_sse3 (1<<8) | cpuflags_sse2
%assign cpuflags_ssse3 (1<<9) | cpuflags_sse3
%assign cpuflags_sse4 (1<<10) | cpuflags_ssse3
%assign cpuflags_sse42 (1<<11) | cpuflags_sse4
%assign cpuflags_aesni (1<<12) | cpuflags_sse42
%assign cpuflags_gfni (1<<13) | cpuflags_sse42
%assign cpuflags_avx (1<<14) | cpuflags_sse42
%assign cpuflags_xop (1<<15) | cpuflags_avx
%assign cpuflags_fma4 (1<<16) | cpuflags_avx
%assign cpuflags_fma3 (1<<17) | cpuflags_avx
%assign cpuflags_bmi1 (1<<18) | cpuflags_avx|cpuflags_lzcnt
%assign cpuflags_bmi2 (1<<19) | cpuflags_bmi1
%assign cpuflags_avx2 (1<<20) | cpuflags_fma3|cpuflags_bmi2
%assign cpuflags_avx512 (1<<21) | cpuflags_avx2 ; F, CD, BW, DQ, VL
%assign cpuflags_avx512icl (1<<22) | cpuflags_avx512|cpuflags_gfni ; VNNI, IFMA, VBMI, VBMI2, VPOPCNTDQ, BITALG, VAES, VPCLMULQDQ
%assign cpuflags_cache32 (1<<22)
%assign cpuflags_cache64 (1<<23)
%assign cpuflags_aligned (1<<24) ; not a cpu feature, but a function variant
%assign cpuflags_atom (1<<25)
%assign cpuflags_cache32 (1<<23)
%assign cpuflags_cache64 (1<<24)
%assign cpuflags_aligned (1<<25) ; not a cpu feature, but a function variant
%assign cpuflags_atom (1<<26)
; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)

third_party/dav1d/src/fg_apply_tmpl.c (vendored, 31 changed lines)

@ -122,17 +122,32 @@ void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp,
// TODO: eliminate in favor of per-plane refs
assert(out->stride[0] == in->stride[0]);
if (!data->num_y_points) {
memcpy(out->data[0], in->data[0], out->p.h * out->stride[0]);
const ptrdiff_t stride = out->stride[0];
const ptrdiff_t sz = out->p.h * stride;
if (sz < 0)
memcpy((uint8_t*) out->data[0] + sz - stride,
(uint8_t*) in->data[0] + sz - stride, -sz);
else
memcpy(out->data[0], in->data[0], sz);
}
if (in->p.layout != DAV1D_PIXEL_LAYOUT_I400) {
if (in->p.layout != DAV1D_PIXEL_LAYOUT_I400 && !data->chroma_scaling_from_luma) {
assert(out->stride[1] == in->stride[1]);
for (int i = 0; i < 2; i++) {
if (!data->num_uv_points[i] && !data->chroma_scaling_from_luma) {
const int suby = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
memcpy(out->data[1+i], in->data[1+i],
(out->p.h >> suby) * out->stride[1]);
}
const int ss_ver = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
const ptrdiff_t stride = out->stride[1];
const ptrdiff_t sz = (out->p.h * stride) >> ss_ver;
if (sz < 0) {
if (!data->num_uv_points[0])
memcpy((uint8_t*) out->data[1] + sz - stride,
(uint8_t*) in->data[1] + sz - stride, -sz);
if (!data->num_uv_points[1])
memcpy((uint8_t*) out->data[2] + sz - stride,
(uint8_t*) in->data[2] + sz - stride, -sz);
} else {
if (!data->num_uv_points[0])
memcpy(out->data[1], in->data[1], sz);
if (!data->num_uv_points[1])
memcpy(out->data[2], in->data[2], sz);
}
}
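The new copy paths above are needed because out->stride[0] may be negative (bottom-up frame storage): then sz = h * stride is negative as well, data[0] points at the row with the highest address, and the copy has to start at data[0] + sz - stride (the lowest address of the plane) and span -sz bytes. A minimal standalone sketch of that plane copy (helper name is illustrative, not dav1d API):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Copy one 8-bit plane of h rows whose stride may be positive (top-down)
 * or negative (bottom-up), mirroring the logic in dav1d_apply_grain above. */
static void copy_plane(uint8_t *dst, const uint8_t *src,
                       const ptrdiff_t stride, const int h)
{
    const ptrdiff_t sz = h * stride;
    if (sz < 0) /* bottom-up: dst/src point at the last row in memory */
        memcpy(dst + sz - stride, src + sz - stride, -sz);
    else
        memcpy(dst, src, sz);
}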

third_party/dav1d/src/film_grain_tmpl.c (vendored)

@ -43,7 +43,7 @@ static inline int get_random_number(const int bits, unsigned *const state) {
return (*state >> (16 - bits)) & ((1 << bits) - 1);
}
static inline int round2(const int x, const int shift) {
static inline int round2(const int x, const uint64_t shift) {
return (x + ((1 << shift) >> 1)) >> shift;
}

third_party/dav1d/src/internal.h (vendored)

@ -216,12 +216,14 @@ struct Dav1dFrameContext {
Av1Filter *mask;
Av1Restoration *lr_mask;
int top_pre_cdef_toggle;
int mask_sz /* w*h */, lr_mask_sz, line_sz /* w */, lr_line_sz, re_sz /* h */;
int mask_sz /* w*h */, lr_mask_sz, cdef_line_sz[2] /* stride */;
int lr_line_sz, re_sz /* h */;
ALIGN(Av1FilterLUT lim_lut, 16);
int last_sharpness;
uint8_t lvl[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */];
uint8_t *tx_lpf_right_edge[2];
pixel *cdef_line[2 /* pre, post */][3 /* plane */][2 /* y */];
uint8_t *cdef_line_buf;
pixel *cdef_line[2 /* pre, post */][3 /* plane */];
pixel *lr_lpf_line[3 /* plane */];
// in-loop filter per-frame state keeping
@ -288,7 +290,7 @@ struct Dav1dTileContext {
uint16_t al_pal[2 /* a/l */][32 /* bx/y4 */][3 /* plane */][8 /* palette_idx */];
uint8_t pal_sz_uv[2 /* a/l */][32 /* bx4/by4 */];
uint8_t txtp_map[32 * 32]; // inter-only
ALIGN(union, 32) {
ALIGN(union, 64) {
struct {
union {
uint8_t lap_8bpc [128 * 32];

third_party/dav1d/src/ipred_prepare.h (vendored)

@ -66,7 +66,7 @@
* range, in the following order:
* - [0] will be the top/left edge pixel;
* - [1..w] will be the top edge pixels (1 being left-most, w being right-most);
* - [w+1..w*w] will be the top/right edge pixels;
* - [w+1..2*w] will be the top/right edge pixels;
* - [-1..-w] will be the left edge pixels (-1 being top-most, -w being bottom-
* most);
* - [-w-1..-2*w] will be the bottom/left edge pixels.
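Concretely, for a w = 4 block the topleft pointer documented above reaches 2*w pixels on either side of it: topleft[0] is the corner, topleft[1..4] the top edge, topleft[5..8] the top/right extension, topleft[-1..-4] the left edge and topleft[-5..-8] the bottom/left extension. A small sketch that only walks that layout (the helper and the uint8_t pixel type are illustrative):

#include <stdint.h>
#include <stdio.h>

/* Print the prepared edge buffer for a w x w block; "topleft" must already
 * be filled as described in the comment above. */
static void dump_edges(const uint8_t *const topleft, const int w) {
    printf("corner     : %d\n", topleft[0]);
    for (int x = 1; x <= w; x++)
        printf("top[%d]     : %d\n", x - 1, topleft[x]);      /* [1..w]       */
    for (int x = w + 1; x <= 2 * w; x++)
        printf("topright[%d]: %d\n", x - w - 1, topleft[x]);  /* [w+1..2*w]   */
    for (int y = 1; y <= w; y++)
        printf("left[%d]    : %d\n", y - 1, topleft[-y]);     /* [-1..-w]     */
    for (int y = w + 1; y <= 2 * w; y++)
        printf("botleft[%d] : %d\n", y - w - 1, topleft[-y]); /* [-w-1..-2*w] */
}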

third_party/dav1d/src/itx_1d.c (vendored)

@ -1,6 +1,6 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* Copyright © 2018-2019, VideoLAN and dav1d authors
* Copyright © 2018-2019, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -30,7 +30,9 @@
#include <stddef.h>
#include <stdint.h>
#include "common/attributes.h"
#include "common/intops.h"
#include "src/itx_1d.h"
#define CLIP(a) iclip(a, min, max)
@ -60,41 +62,62 @@
* wrap around.
*/
static void NOINLINE
inv_dct4_1d(const coef *const in, const ptrdiff_t in_s,
coef *const out, const ptrdiff_t out_s, const int max)
static NOINLINE void
inv_dct4_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max, const int tx64)
{
const int min = -max - 1;
const int in0 = in[0 * in_s], in1 = in[1 * in_s];
const int in2 = in[2 * in_s], in3 = in[3 * in_s];
assert(stride > 0);
const int in0 = c[0 * stride], in1 = c[1 * stride];
int t0 = ((in0 + in2) * 181 + 128) >> 8;
int t1 = ((in0 - in2) * 181 + 128) >> 8;
int t2 = ((in1 * 1567 - in3 * (3784 - 4096) + 2048) >> 12) - in3;
int t3 = ((in1 * (3784 - 4096) + in3 * 1567 + 2048) >> 12) + in1;
int t0, t1, t2, t3;
if (tx64) {
t0 = t1 = (in0 * 181 + 128) >> 8;
t2 = (in1 * 1567 + 2048) >> 12;
t3 = (in1 * 3784 + 2048) >> 12;
} else {
const int in2 = c[2 * stride], in3 = c[3 * stride];
out[0 * out_s] = CLIP(t0 + t3);
out[1 * out_s] = CLIP(t1 + t2);
out[2 * out_s] = CLIP(t1 - t2);
out[3 * out_s] = CLIP(t0 - t3);
t0 = ((in0 + in2) * 181 + 128) >> 8;
t1 = ((in0 - in2) * 181 + 128) >> 8;
t2 = ((in1 * 1567 - in3 * (3784 - 4096) + 2048) >> 12) - in3;
t3 = ((in1 * (3784 - 4096) + in3 * 1567 + 2048) >> 12) + in1;
}
c[0 * stride] = CLIP(t0 + t3);
c[1 * stride] = CLIP(t1 + t2);
c[2 * stride] = CLIP(t1 - t2);
c[3 * stride] = CLIP(t0 - t3);
}
static void NOINLINE
inv_dct8_1d(const coef *const in, const ptrdiff_t in_s,
coef *const out, const ptrdiff_t out_s, const int max)
void dav1d_inv_dct4_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
{
const int min = -max - 1;
coef tmp[4];
inv_dct4_1d_internal_c(c, stride, min, max, 0);
}
inv_dct4_1d(in, in_s * 2, tmp, 1, max);
static NOINLINE void
inv_dct8_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max, const int tx64)
{
assert(stride > 0);
inv_dct4_1d_internal_c(c, stride << 1, min, max, tx64);
const int in1 = in[1 * in_s], in3 = in[3 * in_s];
const int in5 = in[5 * in_s], in7 = in[7 * in_s];
const int in1 = c[1 * stride], in3 = c[3 * stride];
int t4a = ((in1 * 799 - in7 * (4017 - 4096) + 2048) >> 12) - in7;
int t5a = (in5 * 1703 - in3 * 1138 + 1024) >> 11;
int t6a = (in5 * 1138 + in3 * 1703 + 1024) >> 11;
int t7a = ((in1 * (4017 - 4096) + in7 * 799 + 2048) >> 12) + in1;
int t4a, t5a, t6a, t7a;
if (tx64) {
t4a = (in1 * 799 + 2048) >> 12;
t5a = (in3 * -2276 + 2048) >> 12;
t6a = (in3 * 3406 + 2048) >> 12;
t7a = (in1 * 4017 + 2048) >> 12;
} else {
const int in5 = c[5 * stride], in7 = c[7 * stride];
t4a = ((in1 * 799 - in7 * (4017 - 4096) + 2048) >> 12) - in7;
t5a = (in5 * 1703 - in3 * 1138 + 1024) >> 11;
t6a = (in5 * 1138 + in3 * 1703 + 1024) >> 11;
t7a = ((in1 * (4017 - 4096) + in7 * 799 + 2048) >> 12) + in1;
}
int t4 = CLIP(t4a + t5a);
t5a = CLIP(t4a - t5a);
@ -104,38 +127,60 @@ inv_dct8_1d(const coef *const in, const ptrdiff_t in_s,
int t5 = ((t6a - t5a) * 181 + 128) >> 8;
int t6 = ((t6a + t5a) * 181 + 128) >> 8;
out[0 * out_s] = CLIP(tmp[0] + t7);
out[1 * out_s] = CLIP(tmp[1] + t6);
out[2 * out_s] = CLIP(tmp[2] + t5);
out[3 * out_s] = CLIP(tmp[3] + t4);
out[4 * out_s] = CLIP(tmp[3] - t4);
out[5 * out_s] = CLIP(tmp[2] - t5);
out[6 * out_s] = CLIP(tmp[1] - t6);
out[7 * out_s] = CLIP(tmp[0] - t7);
const int t0 = c[0 * stride];
const int t1 = c[2 * stride];
const int t2 = c[4 * stride];
const int t3 = c[6 * stride];
c[0 * stride] = CLIP(t0 + t7);
c[1 * stride] = CLIP(t1 + t6);
c[2 * stride] = CLIP(t2 + t5);
c[3 * stride] = CLIP(t3 + t4);
c[4 * stride] = CLIP(t3 - t4);
c[5 * stride] = CLIP(t2 - t5);
c[6 * stride] = CLIP(t1 - t6);
c[7 * stride] = CLIP(t0 - t7);
}
static void NOINLINE
inv_dct16_1d(const coef *const in, const ptrdiff_t in_s,
coef *const out, const ptrdiff_t out_s, const int max)
void dav1d_inv_dct8_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
{
const int min = -max - 1;
coef tmp[8];
inv_dct8_1d_internal_c(c, stride, min, max, 0);
}
inv_dct8_1d(in, in_s * 2, tmp, 1, max);
static NOINLINE void
inv_dct16_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max, int tx64)
{
assert(stride > 0);
inv_dct8_1d_internal_c(c, stride << 1, min, max, tx64);
const int in1 = in[ 1 * in_s], in3 = in[ 3 * in_s];
const int in5 = in[ 5 * in_s], in7 = in[ 7 * in_s];
const int in9 = in[ 9 * in_s], in11 = in[11 * in_s];
const int in13 = in[13 * in_s], in15 = in[15 * in_s];
const int in1 = c[1 * stride], in3 = c[3 * stride];
const int in5 = c[5 * stride], in7 = c[7 * stride];
int t8a = ((in1 * 401 - in15 * (4076 - 4096) + 2048) >> 12) - in15;
int t15a = ((in1 * (4076 - 4096) + in15 * 401 + 2048) >> 12) + in1;
int t9a = (in9 * 1583 - in7 * 1299 + 1024) >> 11;
int t14a = (in9 * 1299 + in7 * 1583 + 1024) >> 11;
int t10a = ((in5 * 1931 - in11 * (3612 - 4096) + 2048) >> 12) - in11;
int t13a = ((in5 * (3612 - 4096) + in11 * 1931 + 2048) >> 12) + in5;
int t11a = ((in13 * (3920 - 4096) - in3 * 1189 + 2048) >> 12) + in13;
int t12a = ((in13 * 1189 + in3 * (3920 - 4096) + 2048) >> 12) + in3;
int t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;
if (tx64) {
t8a = (in1 * 401 + 2048) >> 12;
t9a = (in7 * -2598 + 2048) >> 12;
t10a = (in5 * 1931 + 2048) >> 12;
t11a = (in3 * -1189 + 2048) >> 12;
t12a = (in3 * 3920 + 2048) >> 12;
t13a = (in5 * 3612 + 2048) >> 12;
t14a = (in7 * 3166 + 2048) >> 12;
t15a = (in1 * 4076 + 2048) >> 12;
} else {
const int in9 = c[ 9 * stride], in11 = c[11 * stride];
const int in13 = c[13 * stride], in15 = c[15 * stride];
t8a = ((in1 * 401 - in15 * (4076 - 4096) + 2048) >> 12) - in15;
t9a = (in9 * 1583 - in7 * 1299 + 1024) >> 11;
t10a = ((in5 * 1931 - in11 * (3612 - 4096) + 2048) >> 12) - in11;
t11a = ((in13 * (3920 - 4096) - in3 * 1189 + 2048) >> 12) + in13;
t12a = ((in13 * 1189 + in3 * (3920 - 4096) + 2048) >> 12) + in3;
t13a = ((in5 * (3612 - 4096) + in11 * 1931 + 2048) >> 12) + in5;
t14a = (in9 * 1299 + in7 * 1583 + 1024) >> 11;
t15a = ((in1 * (4076 - 4096) + in15 * 401 + 2048) >> 12) + in1;
}
int t8 = CLIP(t8a + t9a);
int t9 = CLIP(t8a - t9a);
@ -165,58 +210,93 @@ inv_dct16_1d(const coef *const in, const ptrdiff_t in_s,
t11 = ((t12a - t11a) * 181 + 128) >> 8;
t12 = ((t12a + t11a) * 181 + 128) >> 8;
out[ 0 * out_s] = CLIP(tmp[0] + t15a);
out[ 1 * out_s] = CLIP(tmp[1] + t14);
out[ 2 * out_s] = CLIP(tmp[2] + t13a);
out[ 3 * out_s] = CLIP(tmp[3] + t12);
out[ 4 * out_s] = CLIP(tmp[4] + t11);
out[ 5 * out_s] = CLIP(tmp[5] + t10a);
out[ 6 * out_s] = CLIP(tmp[6] + t9);
out[ 7 * out_s] = CLIP(tmp[7] + t8a);
out[ 8 * out_s] = CLIP(tmp[7] - t8a);
out[ 9 * out_s] = CLIP(tmp[6] - t9);
out[10 * out_s] = CLIP(tmp[5] - t10a);
out[11 * out_s] = CLIP(tmp[4] - t11);
out[12 * out_s] = CLIP(tmp[3] - t12);
out[13 * out_s] = CLIP(tmp[2] - t13a);
out[14 * out_s] = CLIP(tmp[1] - t14);
out[15 * out_s] = CLIP(tmp[0] - t15a);
const int t0 = c[ 0 * stride];
const int t1 = c[ 2 * stride];
const int t2 = c[ 4 * stride];
const int t3 = c[ 6 * stride];
const int t4 = c[ 8 * stride];
const int t5 = c[10 * stride];
const int t6 = c[12 * stride];
const int t7 = c[14 * stride];
c[ 0 * stride] = CLIP(t0 + t15a);
c[ 1 * stride] = CLIP(t1 + t14);
c[ 2 * stride] = CLIP(t2 + t13a);
c[ 3 * stride] = CLIP(t3 + t12);
c[ 4 * stride] = CLIP(t4 + t11);
c[ 5 * stride] = CLIP(t5 + t10a);
c[ 6 * stride] = CLIP(t6 + t9);
c[ 7 * stride] = CLIP(t7 + t8a);
c[ 8 * stride] = CLIP(t7 - t8a);
c[ 9 * stride] = CLIP(t6 - t9);
c[10 * stride] = CLIP(t5 - t10a);
c[11 * stride] = CLIP(t4 - t11);
c[12 * stride] = CLIP(t3 - t12);
c[13 * stride] = CLIP(t2 - t13a);
c[14 * stride] = CLIP(t1 - t14);
c[15 * stride] = CLIP(t0 - t15a);
}
static void NOINLINE
inv_dct32_1d(const coef *const in, const ptrdiff_t in_s,
coef *const out, const ptrdiff_t out_s, const int max)
void dav1d_inv_dct16_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
{
const int min = -max - 1;
coef tmp[16];
inv_dct16_1d_internal_c(c, stride, min, max, 0);
}
inv_dct16_1d(in, in_s * 2, tmp, 1, max);
static NOINLINE void
inv_dct32_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max, const int tx64)
{
assert(stride > 0);
inv_dct16_1d_internal_c(c, stride << 1, min, max, tx64);
const int in1 = in[ 1 * in_s], in3 = in[ 3 * in_s];
const int in5 = in[ 5 * in_s], in7 = in[ 7 * in_s];
const int in9 = in[ 9 * in_s], in11 = in[11 * in_s];
const int in13 = in[13 * in_s], in15 = in[15 * in_s];
const int in17 = in[17 * in_s], in19 = in[19 * in_s];
const int in21 = in[21 * in_s], in23 = in[23 * in_s];
const int in25 = in[25 * in_s], in27 = in[27 * in_s];
const int in29 = in[29 * in_s], in31 = in[31 * in_s];
const int in1 = c[ 1 * stride], in3 = c[ 3 * stride];
const int in5 = c[ 5 * stride], in7 = c[ 7 * stride];
const int in9 = c[ 9 * stride], in11 = c[11 * stride];
const int in13 = c[13 * stride], in15 = c[15 * stride];
int t16a = ((in1 * 201 - in31 * (4091 - 4096) + 2048) >> 12) - in31;
int t31a = ((in1 * (4091 - 4096) + in31 * 201 + 2048) >> 12) + in1;
int t17a = ((in17 * (3035 - 4096) - in15 * 2751 + 2048) >> 12) + in17;
int t30a = ((in17 * 2751 + in15 * (3035 - 4096) + 2048) >> 12) + in15;
int t18a = ((in9 * 1751 - in23 * (3703 - 4096) + 2048) >> 12) - in23;
int t29a = ((in9 * (3703 - 4096) + in23 * 1751 + 2048) >> 12) + in9;
int t19a = ((in25 * (3857 - 4096) - in7 * 1380 + 2048) >> 12) + in25;
int t28a = ((in25 * 1380 + in7 * (3857 - 4096) + 2048) >> 12) + in7;
int t20a = ((in5 * 995 - in27 * (3973 - 4096) + 2048) >> 12) - in27;
int t27a = ((in5 * (3973 - 4096) + in27 * 995 + 2048) >> 12) + in5;
int t21a = ((in21 * (3513 - 4096) - in11 * 2106 + 2048) >> 12) + in21;
int t26a = ((in21 * 2106 + in11 * (3513 - 4096) + 2048) >> 12) + in11;
int t22a = (in13 * 1220 - in19 * 1645 + 1024) >> 11;
int t25a = (in13 * 1645 + in19 * 1220 + 1024) >> 11;
int t23a = ((in29 * (4052 - 4096) - in3 * 601 + 2048) >> 12) + in29;
int t24a = ((in29 * 601 + in3 * (4052 - 4096) + 2048) >> 12) + in3;
int t16a, t17a, t18a, t19a, t20a, t21a, t22a, t23a;
int t24a, t25a, t26a, t27a, t28a, t29a, t30a, t31a;
if (tx64) {
t16a = (in1 * 201 + 2048) >> 12;
t17a = (in15 * -2751 + 2048) >> 12;
t18a = (in9 * 1751 + 2048) >> 12;
t19a = (in7 * -1380 + 2048) >> 12;
t20a = (in5 * 995 + 2048) >> 12;
t21a = (in11 * -2106 + 2048) >> 12;
t22a = (in13 * 2440 + 2048) >> 12;
t23a = (in3 * -601 + 2048) >> 12;
t24a = (in3 * 4052 + 2048) >> 12;
t25a = (in13 * 3290 + 2048) >> 12;
t26a = (in11 * 3513 + 2048) >> 12;
t27a = (in5 * 3973 + 2048) >> 12;
t28a = (in7 * 3857 + 2048) >> 12;
t29a = (in9 * 3703 + 2048) >> 12;
t30a = (in15 * 3035 + 2048) >> 12;
t31a = (in1 * 4091 + 2048) >> 12;
} else {
const int in17 = c[17 * stride], in19 = c[19 * stride];
const int in21 = c[21 * stride], in23 = c[23 * stride];
const int in25 = c[25 * stride], in27 = c[27 * stride];
const int in29 = c[29 * stride], in31 = c[31 * stride];
t16a = ((in1 * 201 - in31 * (4091 - 4096) + 2048) >> 12) - in31;
t17a = ((in17 * (3035 - 4096) - in15 * 2751 + 2048) >> 12) + in17;
t18a = ((in9 * 1751 - in23 * (3703 - 4096) + 2048) >> 12) - in23;
t19a = ((in25 * (3857 - 4096) - in7 * 1380 + 2048) >> 12) + in25;
t20a = ((in5 * 995 - in27 * (3973 - 4096) + 2048) >> 12) - in27;
t21a = ((in21 * (3513 - 4096) - in11 * 2106 + 2048) >> 12) + in21;
t22a = (in13 * 1220 - in19 * 1645 + 1024) >> 11;
t23a = ((in29 * (4052 - 4096) - in3 * 601 + 2048) >> 12) + in29;
t24a = ((in29 * 601 + in3 * (4052 - 4096) + 2048) >> 12) + in3;
t25a = (in13 * 1645 + in19 * 1220 + 1024) >> 11;
t26a = ((in21 * 2106 + in11 * (3513 - 4096) + 2048) >> 12) + in11;
t27a = ((in5 * (3973 - 4096) + in27 * 995 + 2048) >> 12) + in5;
t28a = ((in25 * 1380 + in7 * (3857 - 4096) + 2048) >> 12) + in7;
t29a = ((in9 * (3703 - 4096) + in23 * 1751 + 2048) >> 12) + in9;
t30a = ((in17 * 2751 + in15 * (3035 - 4096) + 2048) >> 12) + in15;
t31a = ((in1 * (4091 - 4096) + in31 * 201 + 2048) >> 12) + in1;
}
int t16 = CLIP(t16a + t17a);
int t17 = CLIP(t16a - t17a);
@ -296,98 +376,110 @@ inv_dct32_1d(const coef *const in, const ptrdiff_t in_s,
t23a = ((t24 - t23 ) * 181 + 128) >> 8;
t24a = ((t24 + t23 ) * 181 + 128) >> 8;
out[ 0 * out_s] = CLIP(tmp[ 0] + t31);
out[ 1 * out_s] = CLIP(tmp[ 1] + t30a);
out[ 2 * out_s] = CLIP(tmp[ 2] + t29);
out[ 3 * out_s] = CLIP(tmp[ 3] + t28a);
out[ 4 * out_s] = CLIP(tmp[ 4] + t27);
out[ 5 * out_s] = CLIP(tmp[ 5] + t26a);
out[ 6 * out_s] = CLIP(tmp[ 6] + t25);
out[ 7 * out_s] = CLIP(tmp[ 7] + t24a);
out[ 8 * out_s] = CLIP(tmp[ 8] + t23a);
out[ 9 * out_s] = CLIP(tmp[ 9] + t22);
out[10 * out_s] = CLIP(tmp[10] + t21a);
out[11 * out_s] = CLIP(tmp[11] + t20);
out[12 * out_s] = CLIP(tmp[12] + t19a);
out[13 * out_s] = CLIP(tmp[13] + t18);
out[14 * out_s] = CLIP(tmp[14] + t17a);
out[15 * out_s] = CLIP(tmp[15] + t16);
out[16 * out_s] = CLIP(tmp[15] - t16);
out[17 * out_s] = CLIP(tmp[14] - t17a);
out[18 * out_s] = CLIP(tmp[13] - t18);
out[19 * out_s] = CLIP(tmp[12] - t19a);
out[20 * out_s] = CLIP(tmp[11] - t20);
out[21 * out_s] = CLIP(tmp[10] - t21a);
out[22 * out_s] = CLIP(tmp[ 9] - t22);
out[23 * out_s] = CLIP(tmp[ 8] - t23a);
out[24 * out_s] = CLIP(tmp[ 7] - t24a);
out[25 * out_s] = CLIP(tmp[ 6] - t25);
out[26 * out_s] = CLIP(tmp[ 5] - t26a);
out[27 * out_s] = CLIP(tmp[ 4] - t27);
out[28 * out_s] = CLIP(tmp[ 3] - t28a);
out[29 * out_s] = CLIP(tmp[ 2] - t29);
out[30 * out_s] = CLIP(tmp[ 1] - t30a);
out[31 * out_s] = CLIP(tmp[ 0] - t31);
const int t0 = c[ 0 * stride];
const int t1 = c[ 2 * stride];
const int t2 = c[ 4 * stride];
const int t3 = c[ 6 * stride];
const int t4 = c[ 8 * stride];
const int t5 = c[10 * stride];
const int t6 = c[12 * stride];
const int t7 = c[14 * stride];
const int t8 = c[16 * stride];
const int t9 = c[18 * stride];
const int t10 = c[20 * stride];
const int t11 = c[22 * stride];
const int t12 = c[24 * stride];
const int t13 = c[26 * stride];
const int t14 = c[28 * stride];
const int t15 = c[30 * stride];
c[ 0 * stride] = CLIP(t0 + t31);
c[ 1 * stride] = CLIP(t1 + t30a);
c[ 2 * stride] = CLIP(t2 + t29);
c[ 3 * stride] = CLIP(t3 + t28a);
c[ 4 * stride] = CLIP(t4 + t27);
c[ 5 * stride] = CLIP(t5 + t26a);
c[ 6 * stride] = CLIP(t6 + t25);
c[ 7 * stride] = CLIP(t7 + t24a);
c[ 8 * stride] = CLIP(t8 + t23a);
c[ 9 * stride] = CLIP(t9 + t22);
c[10 * stride] = CLIP(t10 + t21a);
c[11 * stride] = CLIP(t11 + t20);
c[12 * stride] = CLIP(t12 + t19a);
c[13 * stride] = CLIP(t13 + t18);
c[14 * stride] = CLIP(t14 + t17a);
c[15 * stride] = CLIP(t15 + t16);
c[16 * stride] = CLIP(t15 - t16);
c[17 * stride] = CLIP(t14 - t17a);
c[18 * stride] = CLIP(t13 - t18);
c[19 * stride] = CLIP(t12 - t19a);
c[20 * stride] = CLIP(t11 - t20);
c[21 * stride] = CLIP(t10 - t21a);
c[22 * stride] = CLIP(t9 - t22);
c[23 * stride] = CLIP(t8 - t23a);
c[24 * stride] = CLIP(t7 - t24a);
c[25 * stride] = CLIP(t6 - t25);
c[26 * stride] = CLIP(t5 - t26a);
c[27 * stride] = CLIP(t4 - t27);
c[28 * stride] = CLIP(t3 - t28a);
c[29 * stride] = CLIP(t2 - t29);
c[30 * stride] = CLIP(t1 - t30a);
c[31 * stride] = CLIP(t0 - t31);
}
static void NOINLINE
inv_dct64_1d(const coef *const in, const ptrdiff_t in_s,
coef *const out, const ptrdiff_t out_s, const int max)
void dav1d_inv_dct32_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
{
const int min = -max - 1;
coef tmp[32];
inv_dct32_1d_internal_c(c, stride, min, max, 0);
}
inv_dct32_1d(in, in_s * 2, tmp, 1, max);
void dav1d_inv_dct64_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
{
assert(stride > 0);
inv_dct32_1d_internal_c(c, stride << 1, min, max, 1);
const int in1 = in[ 1 * in_s], in3 = in[ 3 * in_s];
const int in5 = in[ 5 * in_s], in7 = in[ 7 * in_s];
const int in9 = in[ 9 * in_s], in11 = in[11 * in_s];
const int in13 = in[13 * in_s], in15 = in[15 * in_s];
const int in17 = in[17 * in_s], in19 = in[19 * in_s];
const int in21 = in[21 * in_s], in23 = in[23 * in_s];
const int in25 = in[25 * in_s], in27 = in[27 * in_s];
const int in29 = in[29 * in_s], in31 = in[31 * in_s];
const int in33 = in[33 * in_s], in35 = in[35 * in_s];
const int in37 = in[37 * in_s], in39 = in[39 * in_s];
const int in41 = in[41 * in_s], in43 = in[43 * in_s];
const int in45 = in[45 * in_s], in47 = in[47 * in_s];
const int in49 = in[49 * in_s], in51 = in[51 * in_s];
const int in53 = in[53 * in_s], in55 = in[55 * in_s];
const int in57 = in[57 * in_s], in59 = in[59 * in_s];
const int in61 = in[61 * in_s], in63 = in[63 * in_s];
const int in1 = c[ 1 * stride], in3 = c[ 3 * stride];
const int in5 = c[ 5 * stride], in7 = c[ 7 * stride];
const int in9 = c[ 9 * stride], in11 = c[11 * stride];
const int in13 = c[13 * stride], in15 = c[15 * stride];
const int in17 = c[17 * stride], in19 = c[19 * stride];
const int in21 = c[21 * stride], in23 = c[23 * stride];
const int in25 = c[25 * stride], in27 = c[27 * stride];
const int in29 = c[29 * stride], in31 = c[31 * stride];
int t32a = ((in1 * 101 - in63 * (4095 - 4096) + 2048) >> 12) - in63;
int t33a = ((in33 * (2967 - 4096) - in31 * 2824 + 2048) >> 12) + in33;
int t34a = ((in17 * 1660 - in47 * (3745 - 4096) + 2048) >> 12) - in47;
int t35a = (in49 * 1911 - in15 * 737 + 1024) >> 11;
int t36a = ((in9 * 897 - in55 * (3996 - 4096) + 2048) >> 12) - in55;
int t37a = ((in41 * (3461 - 4096) - in23 * 2191 + 2048) >> 12) + in41;
int t38a = ((in25 * 2359 - in39 * (3349 - 4096) + 2048) >> 12) - in39;
int t39a = (in57 * 2018 - in7 * 350 + 1024) >> 11;
int t40a = ((in5 * 501 - in59 * (4065 - 4096) + 2048) >> 12) - in59;
int t41a = ((in37 * (3229 - 4096) - in27 * 2520 + 2048) >> 12) + in37;
int t42a = ((in21 * 2019 - in43 * (3564 - 4096) + 2048) >> 12) - in43;
int t43a = (in53 * 1974 - in11 * 546 + 1024) >> 11;
int t44a = ((in13 * 1285 - in51 * (3889 - 4096) + 2048) >> 12) - in51;
int t45a = ((in45 * (3659 - 4096) - in19 * 1842 + 2048) >> 12) + in45;
int t46a = ((in29 * 2675 - in35 * (3102 - 4096) + 2048) >> 12) - in35;
int t47a = ((in61 * (4085 - 4096) - in3 * 301 + 2048) >> 12) + in61;
int t48a = ((in61 * 301 + in3 * (4085 - 4096) + 2048) >> 12) + in3;
int t49a = ((in29 * (3102 - 4096) + in35 * 2675 + 2048) >> 12) + in29;
int t50a = ((in45 * 1842 + in19 * (3659 - 4096) + 2048) >> 12) + in19;
int t51a = ((in13 * (3889 - 4096) + in51 * 1285 + 2048) >> 12) + in13;
int t52a = (in53 * 546 + in11 * 1974 + 1024) >> 11;
int t53a = ((in21 * (3564 - 4096) + in43 * 2019 + 2048) >> 12) + in21;
int t54a = ((in37 * 2520 + in27 * (3229 - 4096) + 2048) >> 12) + in27;
int t55a = ((in5 * (4065 - 4096) + in59 * 501 + 2048) >> 12) + in5;
int t56a = (in57 * 350 + in7 * 2018 + 1024) >> 11;
int t57a = ((in25 * (3349 - 4096) + in39 * 2359 + 2048) >> 12) + in25;
int t58a = ((in41 * 2191 + in23 * (3461 - 4096) + 2048) >> 12) + in23;
int t59a = ((in9 * (3996 - 4096) + in55 * 897 + 2048) >> 12) + in9;
int t60a = (in49 * 737 + in15 * 1911 + 1024) >> 11;
int t61a = ((in17 * (3745 - 4096) + in47 * 1660 + 2048) >> 12) + in17;
int t62a = ((in33 * 2824 + in31 * (2967 - 4096) + 2048) >> 12) + in31;
int t63a = ((in1 * (4095 - 4096) + in63 * 101 + 2048) >> 12) + in1;
int t32a = (in1 * 101 + 2048) >> 12;
int t33a = (in31 * -2824 + 2048) >> 12;
int t34a = (in17 * 1660 + 2048) >> 12;
int t35a = (in15 * -1474 + 2048) >> 12;
int t36a = (in9 * 897 + 2048) >> 12;
int t37a = (in23 * -2191 + 2048) >> 12;
int t38a = (in25 * 2359 + 2048) >> 12;
int t39a = (in7 * -700 + 2048) >> 12;
int t40a = (in5 * 501 + 2048) >> 12;
int t41a = (in27 * -2520 + 2048) >> 12;
int t42a = (in21 * 2019 + 2048) >> 12;
int t43a = (in11 * -1092 + 2048) >> 12;
int t44a = (in13 * 1285 + 2048) >> 12;
int t45a = (in19 * -1842 + 2048) >> 12;
int t46a = (in29 * 2675 + 2048) >> 12;
int t47a = (in3 * -301 + 2048) >> 12;
int t48a = (in3 * 4085 + 2048) >> 12;
int t49a = (in29 * 3102 + 2048) >> 12;
int t50a = (in19 * 3659 + 2048) >> 12;
int t51a = (in13 * 3889 + 2048) >> 12;
int t52a = (in11 * 3948 + 2048) >> 12;
int t53a = (in21 * 3564 + 2048) >> 12;
int t54a = (in27 * 3229 + 2048) >> 12;
int t55a = (in5 * 4065 + 2048) >> 12;
int t56a = (in7 * 4036 + 2048) >> 12;
int t57a = (in25 * 3349 + 2048) >> 12;
int t58a = (in23 * 3461 + 2048) >> 12;
int t59a = (in9 * 3996 + 2048) >> 12;
int t60a = (in15 * 3822 + 2048) >> 12;
int t61a = (in17 * 3745 + 2048) >> 12;
int t62a = (in31 * 2967 + 2048) >> 12;
int t63a = (in1 * 4095 + 2048) >> 12;
int t32 = CLIP(t32a + t33a);
int t33 = CLIP(t32a - t33a);
@ -589,76 +681,111 @@ inv_dct64_1d(const coef *const in, const ptrdiff_t in_s,
t54 = ((t41a + t54a) * 181 + 128) >> 8;
t55a = ((t40 + t55 ) * 181 + 128) >> 8;
out[ 0 * out_s] = CLIP(tmp[ 0] + t63a);
out[ 1 * out_s] = CLIP(tmp[ 1] + t62);
out[ 2 * out_s] = CLIP(tmp[ 2] + t61a);
out[ 3 * out_s] = CLIP(tmp[ 3] + t60);
out[ 4 * out_s] = CLIP(tmp[ 4] + t59a);
out[ 5 * out_s] = CLIP(tmp[ 5] + t58);
out[ 6 * out_s] = CLIP(tmp[ 6] + t57a);
out[ 7 * out_s] = CLIP(tmp[ 7] + t56);
out[ 8 * out_s] = CLIP(tmp[ 8] + t55a);
out[ 9 * out_s] = CLIP(tmp[ 9] + t54);
out[10 * out_s] = CLIP(tmp[10] + t53a);
out[11 * out_s] = CLIP(tmp[11] + t52);
out[12 * out_s] = CLIP(tmp[12] + t51a);
out[13 * out_s] = CLIP(tmp[13] + t50);
out[14 * out_s] = CLIP(tmp[14] + t49a);
out[15 * out_s] = CLIP(tmp[15] + t48);
out[16 * out_s] = CLIP(tmp[16] + t47);
out[17 * out_s] = CLIP(tmp[17] + t46a);
out[18 * out_s] = CLIP(tmp[18] + t45);
out[19 * out_s] = CLIP(tmp[19] + t44a);
out[20 * out_s] = CLIP(tmp[20] + t43);
out[21 * out_s] = CLIP(tmp[21] + t42a);
out[22 * out_s] = CLIP(tmp[22] + t41);
out[23 * out_s] = CLIP(tmp[23] + t40a);
out[24 * out_s] = CLIP(tmp[24] + t39);
out[25 * out_s] = CLIP(tmp[25] + t38a);
out[26 * out_s] = CLIP(tmp[26] + t37);
out[27 * out_s] = CLIP(tmp[27] + t36a);
out[28 * out_s] = CLIP(tmp[28] + t35);
out[29 * out_s] = CLIP(tmp[29] + t34a);
out[30 * out_s] = CLIP(tmp[30] + t33);
out[31 * out_s] = CLIP(tmp[31] + t32a);
out[32 * out_s] = CLIP(tmp[31] - t32a);
out[33 * out_s] = CLIP(tmp[30] - t33);
out[34 * out_s] = CLIP(tmp[29] - t34a);
out[35 * out_s] = CLIP(tmp[28] - t35);
out[36 * out_s] = CLIP(tmp[27] - t36a);
out[37 * out_s] = CLIP(tmp[26] - t37);
out[38 * out_s] = CLIP(tmp[25] - t38a);
out[39 * out_s] = CLIP(tmp[24] - t39);
out[40 * out_s] = CLIP(tmp[23] - t40a);
out[41 * out_s] = CLIP(tmp[22] - t41);
out[42 * out_s] = CLIP(tmp[21] - t42a);
out[43 * out_s] = CLIP(tmp[20] - t43);
out[44 * out_s] = CLIP(tmp[19] - t44a);
out[45 * out_s] = CLIP(tmp[18] - t45);
out[46 * out_s] = CLIP(tmp[17] - t46a);
out[47 * out_s] = CLIP(tmp[16] - t47);
out[48 * out_s] = CLIP(tmp[15] - t48);
out[49 * out_s] = CLIP(tmp[14] - t49a);
out[50 * out_s] = CLIP(tmp[13] - t50);
out[51 * out_s] = CLIP(tmp[12] - t51a);
out[52 * out_s] = CLIP(tmp[11] - t52);
out[53 * out_s] = CLIP(tmp[10] - t53a);
out[54 * out_s] = CLIP(tmp[ 9] - t54);
out[55 * out_s] = CLIP(tmp[ 8] - t55a);
out[56 * out_s] = CLIP(tmp[ 7] - t56);
out[57 * out_s] = CLIP(tmp[ 6] - t57a);
out[58 * out_s] = CLIP(tmp[ 5] - t58);
out[59 * out_s] = CLIP(tmp[ 4] - t59a);
out[60 * out_s] = CLIP(tmp[ 3] - t60);
out[61 * out_s] = CLIP(tmp[ 2] - t61a);
out[62 * out_s] = CLIP(tmp[ 1] - t62);
out[63 * out_s] = CLIP(tmp[ 0] - t63a);
const int t0 = c[ 0 * stride];
const int t1 = c[ 2 * stride];
const int t2 = c[ 4 * stride];
const int t3 = c[ 6 * stride];
const int t4 = c[ 8 * stride];
const int t5 = c[10 * stride];
const int t6 = c[12 * stride];
const int t7 = c[14 * stride];
const int t8 = c[16 * stride];
const int t9 = c[18 * stride];
const int t10 = c[20 * stride];
const int t11 = c[22 * stride];
const int t12 = c[24 * stride];
const int t13 = c[26 * stride];
const int t14 = c[28 * stride];
const int t15 = c[30 * stride];
const int t16 = c[32 * stride];
const int t17 = c[34 * stride];
const int t18 = c[36 * stride];
const int t19 = c[38 * stride];
const int t20 = c[40 * stride];
const int t21 = c[42 * stride];
const int t22 = c[44 * stride];
const int t23 = c[46 * stride];
const int t24 = c[48 * stride];
const int t25 = c[50 * stride];
const int t26 = c[52 * stride];
const int t27 = c[54 * stride];
const int t28 = c[56 * stride];
const int t29 = c[58 * stride];
const int t30 = c[60 * stride];
const int t31 = c[62 * stride];
c[ 0 * stride] = CLIP(t0 + t63a);
c[ 1 * stride] = CLIP(t1 + t62);
c[ 2 * stride] = CLIP(t2 + t61a);
c[ 3 * stride] = CLIP(t3 + t60);
c[ 4 * stride] = CLIP(t4 + t59a);
c[ 5 * stride] = CLIP(t5 + t58);
c[ 6 * stride] = CLIP(t6 + t57a);
c[ 7 * stride] = CLIP(t7 + t56);
c[ 8 * stride] = CLIP(t8 + t55a);
c[ 9 * stride] = CLIP(t9 + t54);
c[10 * stride] = CLIP(t10 + t53a);
c[11 * stride] = CLIP(t11 + t52);
c[12 * stride] = CLIP(t12 + t51a);
c[13 * stride] = CLIP(t13 + t50);
c[14 * stride] = CLIP(t14 + t49a);
c[15 * stride] = CLIP(t15 + t48);
c[16 * stride] = CLIP(t16 + t47);
c[17 * stride] = CLIP(t17 + t46a);
c[18 * stride] = CLIP(t18 + t45);
c[19 * stride] = CLIP(t19 + t44a);
c[20 * stride] = CLIP(t20 + t43);
c[21 * stride] = CLIP(t21 + t42a);
c[22 * stride] = CLIP(t22 + t41);
c[23 * stride] = CLIP(t23 + t40a);
c[24 * stride] = CLIP(t24 + t39);
c[25 * stride] = CLIP(t25 + t38a);
c[26 * stride] = CLIP(t26 + t37);
c[27 * stride] = CLIP(t27 + t36a);
c[28 * stride] = CLIP(t28 + t35);
c[29 * stride] = CLIP(t29 + t34a);
c[30 * stride] = CLIP(t30 + t33);
c[31 * stride] = CLIP(t31 + t32a);
c[32 * stride] = CLIP(t31 - t32a);
c[33 * stride] = CLIP(t30 - t33);
c[34 * stride] = CLIP(t29 - t34a);
c[35 * stride] = CLIP(t28 - t35);
c[36 * stride] = CLIP(t27 - t36a);
c[37 * stride] = CLIP(t26 - t37);
c[38 * stride] = CLIP(t25 - t38a);
c[39 * stride] = CLIP(t24 - t39);
c[40 * stride] = CLIP(t23 - t40a);
c[41 * stride] = CLIP(t22 - t41);
c[42 * stride] = CLIP(t21 - t42a);
c[43 * stride] = CLIP(t20 - t43);
c[44 * stride] = CLIP(t19 - t44a);
c[45 * stride] = CLIP(t18 - t45);
c[46 * stride] = CLIP(t17 - t46a);
c[47 * stride] = CLIP(t16 - t47);
c[48 * stride] = CLIP(t15 - t48);
c[49 * stride] = CLIP(t14 - t49a);
c[50 * stride] = CLIP(t13 - t50);
c[51 * stride] = CLIP(t12 - t51a);
c[52 * stride] = CLIP(t11 - t52);
c[53 * stride] = CLIP(t10 - t53a);
c[54 * stride] = CLIP(t9 - t54);
c[55 * stride] = CLIP(t8 - t55a);
c[56 * stride] = CLIP(t7 - t56);
c[57 * stride] = CLIP(t6 - t57a);
c[58 * stride] = CLIP(t5 - t58);
c[59 * stride] = CLIP(t4 - t59a);
c[60 * stride] = CLIP(t3 - t60);
c[61 * stride] = CLIP(t2 - t61a);
c[62 * stride] = CLIP(t1 - t62);
c[63 * stride] = CLIP(t0 - t63a);
}
static void NOINLINE
inv_adst4_1d(const coef *const in, const ptrdiff_t in_s,
coef *const out, const ptrdiff_t out_s, const int range)
static NOINLINE void
inv_adst4_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s,
const int min, const int max,
int32_t *const out, const ptrdiff_t out_s)
{
assert(in_s > 0 && out_s != 0);
const int in0 = in[0 * in_s], in1 = in[1 * in_s];
const int in2 = in[2 * in_s], in3 = in[3 * in_s];
@ -674,11 +801,12 @@ inv_adst4_1d(const coef *const in, const ptrdiff_t in_s,
in0 + in2 - in1;
}
static void NOINLINE
inv_adst8_1d(const coef *const in, const ptrdiff_t in_s,
coef *const out, const ptrdiff_t out_s, const int max)
static NOINLINE void
inv_adst8_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s,
const int min, const int max,
int32_t *const out, const ptrdiff_t out_s)
{
const int min = -max - 1;
assert(in_s > 0 && out_s != 0);
const int in0 = in[0 * in_s], in1 = in[1 * in_s];
const int in2 = in[2 * in_s], in3 = in[3 * in_s];
const int in4 = in[4 * in_s], in5 = in[5 * in_s];
@ -707,15 +835,14 @@ inv_adst8_1d(const coef *const in, const ptrdiff_t in_s,
t6a = (((3784 - 4096) * t7 - 1567 * t6 + 2048) >> 12) + t7;
t7a = (( 1567 * t7 + (3784 - 4096) * t6 + 2048) >> 12) + t6;
out[0 * out_s] = CLIP( t0 + t2);
out[7 * out_s] = CLIP(-(t1 + t3));
t2 = CLIP( t0 - t2);
t3 = CLIP( t1 - t3);
out[1 * out_s] = CLIP(-(t4a + t6a));
out[6 * out_s] = CLIP( t5a + t7a );
t6 = CLIP( t4a - t6a );
t7 = CLIP( t5a - t7a );
out[0 * out_s] = CLIP(t0 + t2 );
out[7 * out_s] = -CLIP(t1 + t3 );
t2 = CLIP(t0 - t2 );
t3 = CLIP(t1 - t3 );
out[1 * out_s] = -CLIP(t4a + t6a);
out[6 * out_s] = CLIP(t5a + t7a);
t6 = CLIP(t4a - t6a);
t7 = CLIP(t5a - t7a);
out[3 * out_s] = -(((t2 + t3) * 181 + 128) >> 8);
out[4 * out_s] = ((t2 - t3) * 181 + 128) >> 8;
@ -723,11 +850,12 @@ inv_adst8_1d(const coef *const in, const ptrdiff_t in_s,
out[5 * out_s] = -(((t6 - t7) * 181 + 128) >> 8);
}
static void NOINLINE
inv_adst16_1d(const coef *const in, const ptrdiff_t in_s,
coef *const out, const ptrdiff_t out_s, const int max)
static NOINLINE void
inv_adst16_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s,
const int min, const int max,
int32_t *const out, const ptrdiff_t out_s)
{
const int min = -max - 1;
assert(in_s > 0 && out_s != 0);
const int in0 = in[ 0 * in_s], in1 = in[ 1 * in_s];
const int in2 = in[ 2 * in_s], in3 = in[ 3 * in_s];
const int in4 = in[ 4 * in_s], in5 = in[ 5 * in_s];
@ -806,22 +934,22 @@ inv_adst16_1d(const coef *const in, const ptrdiff_t in_s,
t14 = ((t15a * (3784 - 4096) - t14a * 1567 + 2048) >> 12) + t15a;
t15 = ((t15a * 1567 + t14a * (3784 - 4096) + 2048) >> 12) + t14a;
out[ 0 * out_s] = CLIP( t0 + t2 );
out[15 * out_s] = CLIP(-(t1 + t3) );
t2a = CLIP( t0 - t2 );
t3a = CLIP( t1 - t3 );
out[ 3 * out_s] = CLIP(-(t4a + t6a) );
out[12 * out_s] = CLIP( t5a + t7a );
t6 = CLIP( t4a - t6a );
t7 = CLIP( t5a - t7a );
out[ 1 * out_s] = CLIP(-(t8a + t10a));
out[14 * out_s] = CLIP( t9a + t11a );
t10 = CLIP( t8a - t10a );
t11 = CLIP( t9a - t11a );
out[ 2 * out_s] = CLIP( t12 + t14 );
out[13 * out_s] = CLIP(-(t13 + t15) );
t14a = CLIP( t12 - t14 );
t15a = CLIP( t13 - t15 );
out[ 0 * out_s] = CLIP(t0 + t2 );
out[15 * out_s] = -CLIP(t1 + t3 );
t2a = CLIP(t0 - t2 );
t3a = CLIP(t1 - t3 );
out[ 3 * out_s] = -CLIP(t4a + t6a );
out[12 * out_s] = CLIP(t5a + t7a );
t6 = CLIP(t4a - t6a );
t7 = CLIP(t5a - t7a );
out[ 1 * out_s] = -CLIP(t8a + t10a);
out[14 * out_s] = CLIP(t9a + t11a);
t10 = CLIP(t8a - t10a);
t11 = CLIP(t9a - t11a);
out[ 2 * out_s] = CLIP(t12 + t14 );
out[13 * out_s] = -CLIP(t13 + t15 );
t14a = CLIP(t12 - t14 );
t15a = CLIP(t13 - t15 );
out[ 7 * out_s] = -(((t2a + t3a) * 181 + 128) >> 8);
out[ 8 * out_s] = ((t2a - t3a) * 181 + 128) >> 8;
@ -833,67 +961,74 @@ inv_adst16_1d(const coef *const in, const ptrdiff_t in_s,
out[10 * out_s] = ((t14a - t15a) * 181 + 128) >> 8;
}
#define flip_inv_adst(sz) \
static void inv_flipadst##sz##_1d(const coef *const in, const ptrdiff_t in_s, \
coef *const out, const ptrdiff_t out_s, const int range) \
#define inv_adst_1d(sz) \
void dav1d_inv_adst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \
const int min, const int max) \
{ \
inv_adst##sz##_1d(in, in_s, &out[(sz - 1) * out_s], -out_s, range); \
inv_adst##sz##_1d_internal_c(c, stride, min, max, c, stride); \
} \
void dav1d_inv_flipadst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \
const int min, const int max) \
{ \
inv_adst##sz##_1d_internal_c(c, stride, min, max, \
&c[(sz - 1) * stride], -stride); \
}
flip_inv_adst(4)
flip_inv_adst(8)
flip_inv_adst(16)
inv_adst_1d( 4)
inv_adst_1d( 8)
inv_adst_1d(16)
#undef flip_inv_adst
#undef inv_adst_1d
static void NOINLINE
inv_identity4_1d(const coef *const in, const ptrdiff_t in_s,
coef *const out, const ptrdiff_t out_s, const int range)
void dav1d_inv_identity4_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
{
for (int i = 0; i < 4; i++)
out[out_s * i] = in[in_s * i] + ((in[in_s * i] * 1697 + 2048) >> 12);
assert(stride > 0);
for (int i = 0; i < 4; i++) {
const int in = c[stride * i];
c[stride * i] = in + ((in * 1697 + 2048) >> 12);
}
}
static void NOINLINE
inv_identity8_1d(const coef *const in, const ptrdiff_t in_s,
coef *const out, const ptrdiff_t out_s, const int range)
void dav1d_inv_identity8_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
{
assert(stride > 0);
for (int i = 0; i < 8; i++)
out[out_s * i] = in[in_s * i] * 2;
c[stride * i] *= 2;
}
static void NOINLINE
inv_identity16_1d(const coef *const in, const ptrdiff_t in_s,
coef *const out, const ptrdiff_t out_s, const int range)
void dav1d_inv_identity16_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
{
for (int i = 0; i < 16; i++)
out[out_s * i] = 2 * in[in_s * i] + ((in[in_s * i] * 1697 + 1024) >> 11);
assert(stride > 0);
for (int i = 0; i < 16; i++) {
const int in = c[stride * i];
c[stride * i] = 2 * in + ((in * 1697 + 1024) >> 11);
}
}
static void NOINLINE
inv_identity32_1d(const coef *const in, const ptrdiff_t in_s,
coef *const out, const ptrdiff_t out_s, const int range)
void dav1d_inv_identity32_1d_c(int32_t *const c, const ptrdiff_t stride,
const int min, const int max)
{
assert(stride > 0);
for (int i = 0; i < 32; i++)
out[out_s * i] = in[in_s * i] * 4;
c[stride * i] *= 4;
}
static void NOINLINE
inv_wht4_1d(const coef *const in, const ptrdiff_t in_s,
coef *const out, const ptrdiff_t out_s,
const int pass)
{
const int sh = 2 * !pass;
const int in0 = in[0 * in_s] >> sh, in1 = in[1 * in_s] >> sh;
const int in2 = in[2 * in_s] >> sh, in3 = in[3 * in_s] >> sh;
void dav1d_inv_wht4_1d_c(int32_t *const c, const ptrdiff_t stride) {
assert(stride > 0);
const int in0 = c[0 * stride], in1 = c[1 * stride];
const int in2 = c[2 * stride], in3 = c[3 * stride];
const int t0 = in0 + in1;
const int t2 = in2 - in3;
const int t4 = (t0 - t2) >> 1;
const int t3 = t4 - in3;
const int t1 = t4 - in1;
out[0 * out_s] = t0 - t3;
out[1 * out_s] = t3;
out[2 * out_s] = t1;
out[3 * out_s] = t2 + t1;
c[0 * stride] = t0 - t3;
c[1 * stride] = t3;
c[2 * stride] = t1;
c[3 * stride] = t2 + t1;
}

third_party/dav1d/src/itx_1d.h (vendored, new file)

@ -0,0 +1,59 @@
/*
* Copyright © 2018-2019, VideoLAN and dav1d authors
* Copyright © 2018-2019, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <stddef.h>
#include <stdint.h>
#ifndef DAV1D_SRC_ITX_1D_H
#define DAV1D_SRC_ITX_1D_H
#define decl_itx_1d_fn(name) \
void (name)(int32_t *c, ptrdiff_t stride, int min, int max)
typedef decl_itx_1d_fn(*itx_1d_fn);
decl_itx_1d_fn(dav1d_inv_dct4_1d_c);
decl_itx_1d_fn(dav1d_inv_dct8_1d_c);
decl_itx_1d_fn(dav1d_inv_dct16_1d_c);
decl_itx_1d_fn(dav1d_inv_dct32_1d_c);
decl_itx_1d_fn(dav1d_inv_dct64_1d_c);
decl_itx_1d_fn(dav1d_inv_adst4_1d_c);
decl_itx_1d_fn(dav1d_inv_adst8_1d_c);
decl_itx_1d_fn(dav1d_inv_adst16_1d_c);
decl_itx_1d_fn(dav1d_inv_flipadst4_1d_c);
decl_itx_1d_fn(dav1d_inv_flipadst8_1d_c);
decl_itx_1d_fn(dav1d_inv_flipadst16_1d_c);
decl_itx_1d_fn(dav1d_inv_identity4_1d_c);
decl_itx_1d_fn(dav1d_inv_identity8_1d_c);
decl_itx_1d_fn(dav1d_inv_identity16_1d_c);
decl_itx_1d_fn(dav1d_inv_identity32_1d_c);
void dav1d_inv_wht4_1d_c(int32_t *c, ptrdiff_t stride);
#endif /* DAV1D_SRC_ITX_1D_H */
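All of the 1-D transforms declared above now run in place on int32_t coefficients, taking a stride and a clip range instead of separate in/out buffers. A minimal sketch of how a caller drives them for a 4x4 inverse DCT (the clip values are the 8 bpc ones and the inter-pass rounding that inv_txfm_add_c performs is omitted, so this is illustrative only):

#include <stdint.h>
#include "src/itx_1d.h" /* assumes the dav1d source root on the include path */

static void idct4x4_sketch(int32_t c[4 * 4]) {
    const int min = INT16_MIN, max = INT16_MAX;
    for (int y = 0; y < 4; y++)          /* row pass: stride 1 inside a row   */
        dav1d_inv_dct4_1d_c(&c[y * 4], 1, min, max);
    for (int x = 0; x < 4; x++)          /* column pass: stride = block width */
        dav1d_inv_dct4_1d_c(&c[x], 4, min, max);
}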

third_party/dav1d/src/itx_tmpl.c (vendored)

@ -1,6 +1,6 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* Copyright © 2018-2019, VideoLAN and dav1d authors
* Copyright © 2018-2019, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -35,78 +35,68 @@
#include "common/intops.h"
#include "src/itx.h"
#include "src/itx_1d.h"
#include "src/itx_1d.c"
typedef void (*itx_1d_fn)(const coef *in, ptrdiff_t in_s,
coef *out, ptrdiff_t out_s, const int range);
static void NOINLINE
inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
coef *const coeff, const int eob,
const int w, const int h, const int shift,
static NOINLINE void
inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
const int eob, const int w, const int h, const int shift,
const itx_1d_fn first_1d_fn, const itx_1d_fn second_1d_fn,
const int has_dconly HIGHBD_DECL_SUFFIX)
{
int i, j;
assert((h >= 4 && h <= 64) && (w >= 4 && w <= 64));
assert(w >= 4 && w <= 64);
assert(h >= 4 && h <= 64);
assert(eob >= 0);
const int is_rect2 = w * 2 == h || h * 2 == w;
const int bitdepth = bitdepth_from_max(bitdepth_max);
const int rnd = (1 << shift) >> 1;
if (has_dconly && eob == 0) {
if (eob < has_dconly) {
int dc = coeff[0];
coeff[0] = 0;
if (is_rect2)
dc = (dc * 2896 + 2048) >> 12;
dc = (dc * 2896 + 2048) >> 12;
dc = (dc * 181 + 128) >> 8;
dc = (dc * 181 + 128) >> 8;
dc = (dc + rnd) >> shift;
dc = (dc * 2896 + 2048) >> 12;
dc = (dc + 8) >> 4;
for (j = 0; j < h; j++)
for (i = 0; i < w; i++)
dst[i + j * PXSTRIDE(stride)] =
iclip_pixel(dst[i + j * PXSTRIDE(stride)] + dc);
dc = (dc * 181 + 128 + 2048) >> 12;
for (int y = 0; y < h; y++, dst += PXSTRIDE(stride))
for (int x = 0; x < w; x++)
dst[x] = iclip_pixel(dst[x] + dc);
return;
}
assert(eob > 0 || (eob == 0 && !has_dconly));
const ptrdiff_t sh = imin(h, 32), sw = imin(w, 32);
// Maximum value for h and w is 64
coef tmp[4096 /* w * h */], out[64 /* h */], in_mem[64 /* w */];
const int row_clip_max = (1 << (bitdepth + 8 - 1)) - 1;
const int col_clip_max = (1 << (imax(bitdepth + 6, 16) - 1)) -1;
if (w != sw) memset(&in_mem[sw], 0, (w - sw) * sizeof(*in_mem));
for (i = 0; i < sh; i++) {
if (w != sw || is_rect2) {
for (j = 0; j < sw; j++) {
in_mem[j] = coeff[i + j * sh];
if (is_rect2)
in_mem[j] = (in_mem[j] * 2896 + 2048) >> 12;
}
first_1d_fn(in_mem, 1, &tmp[i * w], 1, row_clip_max);
} else {
first_1d_fn(&coeff[i], sh, &tmp[i * w], 1, row_clip_max);
}
for (j = 0; j < w; j++)
const int sh = imin(h, 32), sw = imin(w, 32);
#if BITDEPTH == 8
tmp[i * w + j] = (tmp[i * w + j] + rnd) >> shift;
const int row_clip_min = INT16_MIN;
const int col_clip_min = INT16_MIN;
#else
tmp[i * w + j] = iclip((tmp[i * w + j] + rnd) >> shift,
-col_clip_max - 1, col_clip_max);
const int row_clip_min = (int) ((unsigned) ~bitdepth_max << 7);
const int col_clip_min = (int) ((unsigned) ~bitdepth_max << 5);
#endif
const int row_clip_max = ~row_clip_min;
const int col_clip_max = ~col_clip_min;
int32_t tmp[64 * 64], *c = tmp;
for (int y = 0; y < sh; y++, c += w) {
if (is_rect2)
for (int x = 0; x < sw; x++)
c[x] = (coeff[y + x * sh] * 181 + 128) >> 8;
else
for (int x = 0; x < sw; x++)
c[x] = coeff[y + x * sh];
first_1d_fn(c, 1, row_clip_min, row_clip_max);
}
if (h != sh) memset(&tmp[sh * w], 0, w * (h - sh) * sizeof(*tmp));
for (i = 0; i < w; i++) {
second_1d_fn(&tmp[i], w, out, 1, col_clip_max);
for (j = 0; j < h; j++)
dst[i + j * PXSTRIDE(stride)] =
iclip_pixel(dst[i + j * PXSTRIDE(stride)] +
((out[j] + 8) >> 4));
}
memset(coeff, 0, sizeof(*coeff) * sh * sw);
memset(coeff, 0, sizeof(*coeff) * sw * sh);
for (int i = 0; i < w * sh; i++)
tmp[i] = iclip((tmp[i] + rnd) >> shift, col_clip_min, col_clip_max);
for (int x = 0; x < w; x++)
second_1d_fn(&tmp[x], w, col_clip_min, col_clip_max);
c = tmp;
for (int y = 0; y < h; y++, dst += PXSTRIDE(stride))
for (int x = 0; x < w; x++)
dst[x] = iclip_pixel(dst[x] + ((*c++ + 8) >> 4));
}
#define inv_txfm_fn(type1, type2, w, h, shift, has_dconly) \
@ -118,8 +108,8 @@ inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
HIGHBD_DECL_SUFFIX) \
{ \
inv_txfm_add_c(dst, stride, coeff, eob, w, h, shift, \
inv_##type1##w##_1d, inv_##type2##h##_1d, has_dconly \
HIGHBD_TAIL_SUFFIX); \
dav1d_inv_##type1##w##_1d_c, dav1d_inv_##type2##h##_1d_c, \
has_dconly HIGHBD_TAIL_SUFFIX); \
}
#define inv_txfm_fn64(w, h, shift) \
@ -173,23 +163,21 @@ static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
coef *const coeff, const int eob
HIGHBD_DECL_SUFFIX)
{
const int bitdepth = bitdepth_from_max(bitdepth_max);
const int col_clip_max = (1 << (imax(bitdepth + 6, 16) - 1)) -1;
const int col_clip_min = -col_clip_max - 1;
coef tmp[4 * 4], out[4];
for (int i = 0; i < 4; i++)
inv_wht4_1d(&coeff[i], 4, &tmp[i * 4], 1, 0);
for (int k = 0; k < 4 * 4; k++)
tmp[k] = iclip(tmp[k], col_clip_min, col_clip_max);
for (int i = 0; i < 4; i++) {
inv_wht4_1d(&tmp[i], 4, out, 1, 1);
for (int j = 0; j < 4; j++)
dst[i + j * PXSTRIDE(stride)] =
iclip_pixel(dst[i + j * PXSTRIDE(stride)] + out[j]);
int32_t tmp[4 * 4], *c = tmp;
for (int y = 0; y < 4; y++, c += 4) {
for (int x = 0; x < 4; x++)
c[x] = coeff[y + x * 4] >> 2;
dav1d_inv_wht4_1d_c(c, 1);
}
memset(coeff, 0, sizeof(*coeff) * 4 * 4);
for (int x = 0; x < 4; x++)
dav1d_inv_wht4_1d_c(&tmp[x], 4);
c = tmp;
for (int y = 0; y < 4; y++, dst += PXSTRIDE(stride))
for (int x = 0; x < 4; x++)
dst[x] = iclip_pixel(dst[x] + *c++);
}
COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c) {
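Two of the constant changes in inv_txfm_add_c above can be checked by hand. The 1/sqrt(2) scaling keeps its value: 2896/4096 = 181/256 ≈ 0.70703, so (x * 181 + 128) >> 8 replaces (x * 2896 + 2048) >> 12 with a smaller multiplier. The new clip bounds are the old ranges written as a bit trick; a short check for 10 bpc (bitdepth_max = 1023):

#include <stdio.h>

int main(void) {
    const int bitdepth_max = 1023; /* 10 bpc */
    const int row_clip_min = (int) ((unsigned) ~bitdepth_max << 7);
    const int col_clip_min = (int) ((unsigned) ~bitdepth_max << 5);
    /* prints -131072 131071 -32768 32767, i.e. the same +/-2^17 and +/-2^15
     * ranges the old (1 << (bitdepth + 8 - 1)) - 1 and
     * (1 << (imax(bitdepth + 6, 16) - 1)) - 1 expressions gave for 10-bit */
    printf("%d %d %d %d\n", row_clip_min, ~row_clip_min,
                            col_clip_min, ~col_clip_min);
    return 0;
}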

third_party/dav1d/src/lf_mask.c (vendored)

@ -43,8 +43,8 @@ static void decomp_tx(uint8_t (*const txa)[2 /* txsz, step */][32 /* y */][32 /*
const uint16_t *const tx_masks)
{
const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[from];
const int is_split =
depth > 1 ? 0 : (tx_masks[depth] >> (y_off * 4 + x_off)) & 1;
const int is_split = (from == (int) TX_4X4 || depth > 1) ? 0 :
(tx_masks[depth] >> (y_off * 4 + x_off)) & 1;
if (is_split) {
const enum RectTxfmSize sub = t_dim->sub;
@ -350,6 +350,7 @@ void dav1d_create_lf_mask_inter(Av1Filter *const lflvl,
const int bx, const int by,
const int iw, const int ih,
const int skip, const enum BlockSize bs,
const enum RectTxfmSize max_ytx,
const uint16_t *const tx_masks,
const enum RectTxfmSize uvtx,
const enum Dav1dPixelLayout layout,
@ -373,7 +374,7 @@ void dav1d_create_lf_mask_inter(Av1Filter *const lflvl,
}
mask_edges_inter(lflvl->filter_y, by4, bx4, bw4, bh4, skip,
dav1d_max_txfm_size_for_bs[bs][0], tx_masks, ay, ly);
max_ytx, tx_masks, ay, ly);
}
if (!auv) return;

third_party/dav1d/src/lf_mask.h (vendored)

@ -72,8 +72,8 @@ void dav1d_create_lf_mask_inter(Av1Filter *lflvl, uint8_t (*level_cache)[4],
const ptrdiff_t b4_stride,
const uint8_t (*level)[8][2], int bx, int by,
int iw, int ih, int skip_inter,
enum BlockSize bs, const uint16_t *tx_mask,
enum RectTxfmSize uvtx,
enum BlockSize bs, enum RectTxfmSize max_ytx,
const uint16_t *tx_mask, enum RectTxfmSize uvtx,
enum Dav1dPixelLayout layout, uint8_t *ay,
uint8_t *ly, uint8_t *auv, uint8_t *luv);
void dav1d_calc_eih(Av1FilterLUT *lim_lut, int filter_sharpness);

third_party/dav1d/src/lib.c (vendored)

@ -31,12 +31,17 @@
#include <errno.h>
#include <string.h>
#ifdef __linux__
#include <dlfcn.h>
#endif
#include "dav1d/dav1d.h"
#include "dav1d/data.h"
#include "common/mem.h"
#include "common/validate.h"
#include "src/cpu.h"
#include "src/fg_apply.h"
#include "src/internal.h"
#include "src/log.h"
@ -47,10 +52,11 @@
#include "src/wedge.h"
static COLD void init_internal(void) {
dav1d_init_wedge_masks();
dav1d_init_cpu();
dav1d_init_interintra_masks();
dav1d_init_qm_tables();
dav1d_init_thread();
dav1d_init_wedge_masks();
}
COLD const char *dav1d_version(void) {
@ -73,6 +79,22 @@ COLD void dav1d_default_settings(Dav1dSettings *const s) {
static void close_internal(Dav1dContext **const c_out, int flush);
NO_SANITIZE("cfi-icall") // CFI is broken with dlsym()
static COLD size_t get_stack_size_internal(const pthread_attr_t *const thread_attr) {
#if defined(__linux__) && defined(HAVE_DLSYM)
/* glibc has an issue where the size of the TLS is subtracted from the stack
* size instead of allocated separately. As a result the specified stack
* size may be insufficient when used in an application with large amounts
* of TLS data. The following is a workaround to compensate for that.
* See https://sourceware.org/bugzilla/show_bug.cgi?id=11787 */
size_t (*const get_minstack)(const pthread_attr_t*) =
dlsym(RTLD_DEFAULT, "__pthread_get_minstack");
if (get_minstack)
return get_minstack(thread_attr) - PTHREAD_STACK_MIN;
#endif
return 0;
}
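Reduced to a self-contained sketch, the stack-size workaround above looks like this (only __pthread_get_minstack, RTLD_DEFAULT and PTHREAD_STACK_MIN are real glibc/POSIX names; the helper itself is illustrative):

#define _GNU_SOURCE /* for RTLD_DEFAULT */
#include <dlfcn.h>
#include <limits.h>
#include <pthread.h>
#include <stddef.h>

/* Ask glibc how much stack it reserves internally for TLS and add that on
 * top of the stack size the application actually wants for its threads. */
static size_t thread_stack_size(const pthread_attr_t *const attr,
                                const size_t wanted)
{
    size_t (*const get_minstack)(const pthread_attr_t*) =
        (size_t (*)(const pthread_attr_t*)) dlsym(RTLD_DEFAULT, "__pthread_get_minstack");
    return get_minstack ? wanted + get_minstack(attr) - PTHREAD_STACK_MIN
                        : wanted;
}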
COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
static pthread_once_t initted = PTHREAD_ONCE_INIT;
pthread_once(&initted, init_internal);
@ -92,7 +114,9 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
pthread_attr_t thread_attr;
if (pthread_attr_init(&thread_attr)) return DAV1D_ERR(ENOMEM);
pthread_attr_setstacksize(&thread_attr, 1024 * 1024);
size_t stack_size = 1024 * 1024 + get_stack_size_internal(&thread_attr);
pthread_attr_setstacksize(&thread_attr, stack_size);
Dav1dContext *const c = *c_out = dav1d_alloc_aligned(sizeof(*c), 32);
if (!c) goto error;
@ -124,17 +148,15 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
memset(c->fc, 0, sizeof(*c->fc) * s->n_frame_threads);
if (c->n_fc > 1) {
c->frame_thread.out_delayed =
malloc(sizeof(*c->frame_thread.out_delayed) * c->n_fc);
calloc(c->n_fc, sizeof(*c->frame_thread.out_delayed));
if (!c->frame_thread.out_delayed) goto error;
memset(c->frame_thread.out_delayed, 0,
sizeof(*c->frame_thread.out_delayed) * c->n_fc);
}
for (int n = 0; n < s->n_frame_threads; n++) {
Dav1dFrameContext *const f = &c->fc[n];
f->c = c;
f->lf.last_sharpness = -1;
f->n_tc = s->n_tile_threads;
f->tc = dav1d_alloc_aligned(sizeof(*f->tc) * s->n_tile_threads, 32);
f->tc = dav1d_alloc_aligned(sizeof(*f->tc) * s->n_tile_threads, 64);
if (!f->tc) goto error;
memset(f->tc, 0, sizeof(*f->tc) * s->n_tile_threads);
if (f->n_tc > 1) {
@ -512,7 +534,7 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
free(f->lf.level);
free(f->lf.tx_lpf_right_edge[0]);
if (f->libaom_cm) dav1d_free_ref_mv_common(f->libaom_cm);
dav1d_free_aligned(f->lf.cdef_line[0][0][0]);
dav1d_free_aligned(f->lf.cdef_line_buf);
dav1d_free_aligned(f->lf.lr_lpf_line[0]);
}
dav1d_free_aligned(c->fc);

third_party/dav1d/src/looprestoration.h (vendored)

@ -72,8 +72,8 @@ typedef struct Dav1dLoopRestorationDSPContext {
selfguided_fn selfguided;
} Dav1dLoopRestorationDSPContext;
bitfn_decls(void dav1d_loop_restoration_dsp_init, Dav1dLoopRestorationDSPContext *c);
bitfn_decls(void dav1d_loop_restoration_dsp_init_arm, Dav1dLoopRestorationDSPContext *c);
bitfn_decls(void dav1d_loop_restoration_dsp_init, Dav1dLoopRestorationDSPContext *c, int bpc);
bitfn_decls(void dav1d_loop_restoration_dsp_init_arm, Dav1dLoopRestorationDSPContext *c, int bpc);
bitfn_decls(void dav1d_loop_restoration_dsp_init_x86, Dav1dLoopRestorationDSPContext *c);
bitfn_decls(void dav1d_loop_restoration_dsp_init_ppc, Dav1dLoopRestorationDSPContext *c);


@ -573,13 +573,13 @@ static void selfguided_c(pixel *p, const ptrdiff_t p_stride,
}
}
COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c) {
COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c, int bpc) {
c->wiener = wiener_c;
c->selfguided = selfguided_c;
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
bitfn(dav1d_loop_restoration_dsp_init_arm)(c);
bitfn(dav1d_loop_restoration_dsp_init_arm)(c, bpc);
#elif ARCH_PPC64LE
bitfn(dav1d_loop_restoration_dsp_init_ppc)(c);
#elif ARCH_X86

third_party/dav1d/src/meson.build (vendored)

@ -35,6 +35,7 @@ libdav1d_sources = files(
'dequant_tables.c',
'getbits.c',
'intra_edge.c',
'itx_1d.c',
'lf_mask.c',
'log.c',
'msac.c',
@ -101,22 +102,47 @@ if is_asm_enabled
)
if host_machine.cpu_family() == 'aarch64'
libdav1d_sources += files(
'arm/64/cdef.S',
'arm/64/ipred.S',
'arm/64/itx.S',
'arm/64/loopfilter.S',
'arm/64/looprestoration.S',
'arm/64/mc.S',
'arm/64/looprestoration_common.S',
'arm/64/msac.S',
)
if dav1d_bitdepths.contains('8')
libdav1d_sources += files(
'arm/64/cdef.S',
'arm/64/ipred.S',
'arm/64/itx.S',
'arm/64/loopfilter.S',
'arm/64/looprestoration.S',
'arm/64/mc.S',
)
endif
if dav1d_bitdepths.contains('16')
libdav1d_sources += files(
'arm/64/cdef16.S',
'arm/64/loopfilter16.S',
'arm/64/looprestoration16.S',
'arm/64/mc16.S',
)
endif
elif host_machine.cpu_family().startswith('arm')
libdav1d_sources += files(
'arm/32/cdef.S',
'arm/32/ipred.S',
'arm/32/loopfilter.S',
'arm/32/looprestoration.S',
'arm/32/mc.S',
)
if dav1d_bitdepths.contains('8')
libdav1d_sources += files(
'arm/32/cdef.S',
'arm/32/ipred.S',
'arm/32/loopfilter.S',
'arm/32/looprestoration.S',
'arm/32/mc.S',
)
endif
if dav1d_bitdepths.contains('16')
libdav1d_sources += files(
)
endif
endif
elif host_machine.cpu_family().startswith('x86')
@ -124,6 +150,12 @@ if is_asm_enabled
'x86/cpu.c',
)
if host_machine.cpu_family() == 'x86_64'
libdav1d_sources += files(
'x86/msac_init.c',
)
endif
libdav1d_tmpl_sources += files(
'x86/cdef_init_tmpl.c',
'x86/film_grain_init_tmpl.c',
@ -150,6 +182,7 @@ if is_asm_enabled
'x86/looprestoration.asm',
'x86/mc.asm',
'x86/cdef_sse.asm',
'x86/film_grain_ssse3.asm',
'x86/ipred_ssse3.asm',
'x86/itx_ssse3.asm',
'x86/loopfilter_ssse3.asm',
@ -278,6 +311,7 @@ libdav1d = library('dav1d',
stdatomic_dependency,
thread_dependency,
thread_compat_dep,
libdl_dependency,
],
c_args : [stackalign_flag, api_export_flags],
version : dav1d_soname_version,
@ -285,6 +319,10 @@ libdav1d = library('dav1d',
install : true,
)
dav1d_dep = declare_dependency(link_with: libdav1d,
include_directories : include_directories('../include/dav1d')
)
#
# Generate pkg-config .pc file
#

third_party/dav1d/src/msac.c (vendored)

@ -196,5 +196,12 @@ void dav1d_msac_init(MsacContext *const s, const uint8_t *const data,
s->rng = 0x8000;
s->cnt = -15;
s->allow_update_cdf = !disable_cdf_update_flag;
#if ARCH_X86_64 && HAVE_ASM
s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt_c;
dav1d_msac_init_x86(s);
#endif
ctx_refill(s);
}

third_party/dav1d/src/msac.h (vendored)

@ -31,7 +31,7 @@
#include <stdint.h>
#include <stdlib.h>
#include "common/attributes.h"
#include "common/intops.h"
typedef size_t ec_win;
@ -42,6 +42,10 @@ typedef struct MsacContext {
unsigned rng;
int cnt;
int allow_update_cdf;
#if ARCH_X86_64 && HAVE_ASM
unsigned (*symbol_adapt16)(struct MsacContext *s, uint16_t *cdf, size_t n_symbols);
#endif
} MsacContext;
#if HAVE_ASM

third_party/dav1d/src/obu.c (vendored)

@ -917,10 +917,9 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
hdr->skip_mode_allowed = 0;
if (hdr->switchable_comp_refs && hdr->frame_type & 1 && seqhdr->order_hint) {
const unsigned poc = hdr->frame_offset;
unsigned off_before[2] = { 0xFFFFFFFF, 0xFFFFFFFF };
unsigned off_before = 0xFFFFFFFFU;
int off_after = -1;
int off_before_idx[2], off_after_idx;
off_before_idx[0] = 0;
int off_before_idx, off_after_idx;
for (int i = 0; i < 7; i++) {
if (!c->refs[hdr->refidx[i]].p.p.data[0]) return DAV1D_ERR(EINVAL);
const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset;
@ -933,36 +932,42 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
off_after = refpoc;
off_after_idx = i;
}
} else if (diff < 0) {
if (off_before[0] == 0xFFFFFFFFU ||
get_poc_diff(seqhdr->order_hint_n_bits,
refpoc, off_before[0]) > 0)
{
off_before[1] = off_before[0];
off_before[0] = refpoc;
off_before_idx[1] = off_before_idx[0];
off_before_idx[0] = i;
} else if (refpoc != off_before[0] &&
(off_before[1] == 0xFFFFFFFFU ||
get_poc_diff(seqhdr->order_hint_n_bits,
refpoc, off_before[1]) > 0))
{
off_before[1] = refpoc;
off_before_idx[1] = i;
}
} else if (diff < 0 && (off_before == 0xFFFFFFFFU ||
get_poc_diff(seqhdr->order_hint_n_bits,
refpoc, off_before) > 0))
{
off_before = refpoc;
off_before_idx = i;
}
}
if (off_before[0] != 0xFFFFFFFFU && off_after != -1) {
hdr->skip_mode_refs[0] = imin(off_before_idx[0], off_after_idx);
hdr->skip_mode_refs[1] = imax(off_before_idx[0], off_after_idx);
hdr->skip_mode_allowed = 1;
} else if (off_before[0] != 0xFFFFFFFFU &&
off_before[1] != 0xFFFFFFFFU)
{
hdr->skip_mode_refs[0] = imin(off_before_idx[0], off_before_idx[1]);
hdr->skip_mode_refs[1] = imax(off_before_idx[0], off_before_idx[1]);
if (off_before != 0xFFFFFFFFU && off_after != -1) {
hdr->skip_mode_refs[0] = imin(off_before_idx, off_after_idx);
hdr->skip_mode_refs[1] = imax(off_before_idx, off_after_idx);
hdr->skip_mode_allowed = 1;
} else if (off_before != 0xFFFFFFFFU) {
unsigned off_before2 = 0xFFFFFFFFU;
int off_before2_idx;
for (int i = 0; i < 7; i++) {
if (!c->refs[hdr->refidx[i]].p.p.data[0]) return DAV1D_ERR(EINVAL);
const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset;
if (get_poc_diff(seqhdr->order_hint_n_bits,
refpoc, off_before) < 0) {
if (off_before2 == 0xFFFFFFFFU ||
get_poc_diff(seqhdr->order_hint_n_bits,
refpoc, off_before2) > 0)
{
off_before2 = refpoc;
off_before2_idx = i;
}
}
}
if (off_before2 != 0xFFFFFFFFU) {
hdr->skip_mode_refs[0] = imin(off_before_idx, off_before2_idx);
hdr->skip_mode_refs[1] = imax(off_before_idx, off_before2_idx);
hdr->skip_mode_allowed = 1;
}
}
}
hdr->skip_mode_enabled = hdr->skip_mode_allowed ? dav1d_get_bits(gb, 1) : 0;

third_party/dav1d/src/ppc/cdef_init_tmpl.c (vendored)

@ -53,7 +53,7 @@ static inline i16x8 vconstrain(const i16x8 diff, const int16_t threshold,
static inline void copy4xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
const uint8_t *src, const ptrdiff_t src_stride,
const uint8_t (*left)[2], uint8_t *const top[2],
const uint8_t (*left)[2], const uint8_t *const top,
const int w, const int h,
const enum CdefEdgeFlags edges)
{
@ -70,8 +70,8 @@ static inline void copy4xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
l1 = fill;
y_start = 0;
} else {
l0 = u8h_to_u16(vec_vsx_ld(0, top[0] - 2));
l1 = u8h_to_u16(vec_vsx_ld(0, top[1] - 2));
l0 = u8h_to_u16(vec_vsx_ld(0, top + 0 * src_stride - 2));
l1 = u8h_to_u16(vec_vsx_ld(0, top + 1 * src_stride - 2));
}
vec_st(l0, 0, tmp - 2 * 8);
@ -115,7 +115,7 @@ static inline void copy4xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
static inline void copy8xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
const uint8_t *src, const ptrdiff_t src_stride,
const uint8_t (*left)[2], uint8_t *const top[2],
const uint8_t (*left)[2], const uint8_t *const top,
const int w, const int h,
const enum CdefEdgeFlags edges)
{
@ -134,8 +134,8 @@ static inline void copy8xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
l1l = fill;
y_start = 0;
} else {
u8x16 l0 = vec_vsx_ld(0, top[0] - 2);
u8x16 l1 = vec_vsx_ld(0, top[1] - 2);
u8x16 l0 = vec_vsx_ld(0, top + 0 * src_stride - 2);
u8x16 l1 = vec_vsx_ld(0, top + 1 * src_stride - 2);
l0h = u8h_to_u16(l0);
l0l = u8l_to_u16(l0);
l1h = u8h_to_u16(l1);
@ -275,7 +275,7 @@ static inline i16x8 max_mask(i16x8 a, i16x8 b) {
static inline void
filter_4xN(pixel *dst, const ptrdiff_t dst_stride,
const pixel (*left)[2], /*const*/ pixel *const top[2],
const pixel (*left)[2], const pixel *const top,
const int w, const int h, const int pri_strength,
const int sec_strength, const int dir,
const int damping, const enum CdefEdgeFlags edges,
@ -364,7 +364,7 @@ filter_4xN(pixel *dst, const ptrdiff_t dst_stride,
static inline void
filter_8xN(pixel *dst, const ptrdiff_t dst_stride,
const pixel (*left)[2], /*const*/ pixel *const top[2],
const pixel (*left)[2], const pixel *const top,
const int w, const int h, const int pri_strength,
const int sec_strength, const int dir,
const int damping, const enum CdefEdgeFlags edges,
@ -456,7 +456,7 @@ filter_8xN(pixel *dst, const ptrdiff_t dst_stride,
static void cdef_filter_##w##x##h##_vsx(pixel *const dst, \
const ptrdiff_t dst_stride, \
const pixel (*left)[2], \
/*const*/ pixel *const top[2], \
const pixel *const top, \
const int pri_strength, \
const int sec_strength, \
const int dir, \

third_party/dav1d/src/recon_tmpl.c (vendored): 34 lines changed

@ -70,10 +70,10 @@ static inline unsigned get_skip_ctx(const TxfmInfo *const t_dim,
const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
const int not_one_blk = b_dim[2] - (!!b_dim[2] && ss_hor) > t_dim->lw ||
b_dim[3] - (!!b_dim[3] && ss_ver) > t_dim->lh;
int ca, cl;
unsigned ca, cl;
#define MERGE_CTX(dir, type, mask) \
c##dir = !!((*(const type *) dir) & mask); \
#define MERGE_CTX(dir, type, no_val) \
c##dir = *(const type *) dir != no_val; \
break
switch (t_dim->lw) {
@ -83,17 +83,17 @@ static inline unsigned get_skip_ctx(const TxfmInfo *const t_dim,
* and will therefore complain about the use of uninitialized variables
* when compiled in debug mode if we put the default case at the end. */
default: assert(0); /* fall-through */
case TX_4X4: MERGE_CTX(a, uint8_t, 0x3F);
case TX_8X8: MERGE_CTX(a, uint16_t, 0x3F3F);
case TX_16X16: MERGE_CTX(a, uint32_t, 0x3F3F3F3FU);
case TX_32X32: MERGE_CTX(a, uint64_t, 0x3F3F3F3F3F3F3F3FULL);
case TX_4X4: MERGE_CTX(a, uint8_t, 0x40);
case TX_8X8: MERGE_CTX(a, uint16_t, 0x4040);
case TX_16X16: MERGE_CTX(a, uint32_t, 0x40404040U);
case TX_32X32: MERGE_CTX(a, uint64_t, 0x4040404040404040ULL);
}
switch (t_dim->lh) {
default: assert(0); /* fall-through */
case TX_4X4: MERGE_CTX(l, uint8_t, 0x3F);
case TX_8X8: MERGE_CTX(l, uint16_t, 0x3F3F);
case TX_16X16: MERGE_CTX(l, uint32_t, 0x3F3F3F3FU);
case TX_32X32: MERGE_CTX(l, uint64_t, 0x3F3F3F3F3F3F3F3FULL);
case TX_4X4: MERGE_CTX(l, uint8_t, 0x40);
case TX_8X8: MERGE_CTX(l, uint16_t, 0x4040);
case TX_16X16: MERGE_CTX(l, uint32_t, 0x40404040U);
case TX_32X32: MERGE_CTX(l, uint64_t, 0x4040404040404040ULL);
}
#undef MERGE_CTX
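(Note on the MERGE_CTX rewrite above: it changes the encoding assumed for the per-4x4 coefficient-context bytes. Instead of masking out the low six magnitude bits (0x3F), a byte equal to 0x40 now appears to mean "no coefficients", so "any coefficients along this edge" becomes a plain inequality against a replicated 0x40 pattern. A hedged scalar illustration; the helper name and includes are ours, not dav1d's.)

#include <stdint.h>
#include <string.h>

/* Two adjacent 4x4 context bytes (one 8x8 transform edge): coefficients
 * are present iff either byte differs from the 0x40 sentinel. */
static inline unsigned edge_has_coefs_8x8(const uint8_t *const ctx) {
    uint16_t v;
    memcpy(&v, ctx, sizeof(v));
    return v != 0x4040;
}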
@ -352,13 +352,17 @@ static int decode_coefs(Dav1dTileContext *const t,
if (lossless) {
assert(t_dim->max == TX_4X4);
*txtp = WHT_WHT;
} else if (!f->frame_hdr->segmentation.qidx[b->seg_id] ||
t_dim->max + intra >= TX_64X64)
{
} else if (t_dim->max + intra >= TX_64X64) {
*txtp = DCT_DCT;
} else if (chroma) {
// inferred from either the luma txtp (inter) or a LUT (intra)
*txtp = intra ? dav1d_txtp_from_uvmode[b->uv_mode] :
get_uv_inter_txtp(t_dim, *txtp);
} else if (!f->frame_hdr->segmentation.qidx[b->seg_id]) {
// In libaom, lossless is checked by a literal qidx == 0, but not all
// such blocks are actually lossless. The remainder gets an implicit
// transform type (for luma)
*txtp = DCT_DCT;
} else {
unsigned idx;
if (intra) {
@ -1993,7 +1997,7 @@ void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
bytefn(dav1d_cdef_brow)(f, f->lf.p, f->lf.mask_ptr, sby * sbsz,
imin(sby * sbsz + n_blks, f->bh));
}
if (f->frame_hdr->super_res.enabled) {
if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) {
const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;

third_party/dav1d/src/ref_mvs.c (vendored): 5 lines changed

@ -2094,10 +2094,7 @@ void dav1d_init_ref_mv_tile_row(AV1_COMMON *cm,
AV1_COMMON *dav1d_alloc_ref_mv_common(void);
AV1_COMMON *dav1d_alloc_ref_mv_common(void) {
AV1_COMMON *cm = malloc(sizeof(*cm));
if (!cm) return NULL;
memset(cm, 0, sizeof(*cm));
return cm;
return calloc(1, sizeof(AV1_COMMON));
}
void dav1d_free_ref_mv_common(AV1_COMMON *cm);

third_party/dav1d/src/tables.c (vendored): 15 lines changed

@ -397,6 +397,21 @@ const Dav1dWarpedMotionParams dav1d_default_wm_params = {
.delta = 0,
};
const int8_t dav1d_cdef_directions[2 + 8 + 2 /* dir */][2 /* pass */] = {
{ 1 * 12 + 0, 2 * 12 + 0 }, // 6
{ 1 * 12 + 0, 2 * 12 - 1 }, // 7
{ -1 * 12 + 1, -2 * 12 + 2 }, // 0
{ 0 * 12 + 1, -1 * 12 + 2 }, // 1
{ 0 * 12 + 1, 0 * 12 + 2 }, // 2
{ 0 * 12 + 1, 1 * 12 + 2 }, // 3
{ 1 * 12 + 1, 2 * 12 + 2 }, // 4
{ 1 * 12 + 0, 2 * 12 + 1 }, // 5
{ 1 * 12 + 0, 2 * 12 + 0 }, // 6
{ 1 * 12 + 0, 2 * 12 - 1 }, // 7
{ -1 * 12 + 1, -2 * 12 + 2 }, // 0
{ 0 * 12 + 1, -1 * 12 + 2 }, // 1
};
const int16_t dav1d_sgr_params[16][4] = { // r0, r1, e0, e1
{ 2, 1, 140, 3236 }, { 2, 1, 112, 2158 }, { 2, 1, 93, 1618 },
{ 2, 1, 80, 1438 }, { 2, 1, 70, 1295 }, { 2, 1, 58, 1177 },
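(Note on the direction table added above: it is deliberately over-long. The eight real directions are padded with two wrap-around entries on each side, so neighbour lookups can index dir plus an offset without a modulo. A hedged sketch of the access pattern this layout serves; the real call sites are in the cdef filter code and may differ.)

/* dir is in [0, 7], off in {-2, 0, +2}; the +2 bias keeps every
 * combination inside the padded 12-entry table. */
static inline const int8_t *cdef_dir_offsets(const int dir, const int off) {
    return dav1d_cdef_directions[2 + dir + off];
}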

third_party/dav1d/src/tables.h (vendored): 2 lines changed

@ -105,6 +105,8 @@ static const unsigned interintra_allowed_mask =
extern const Dav1dWarpedMotionParams dav1d_default_wm_params;
extern const int8_t dav1d_cdef_directions[12][2];
extern const int16_t dav1d_sgr_params[16][4];
extern const uint8_t dav1d_sgr_x_by_x[256];

third_party/dav1d/src/thread.h (vendored): 6 lines changed

@ -30,6 +30,7 @@
#if defined(_WIN32)
#include <limits.h>
#include <windows.h>
#define PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT
@ -72,9 +73,10 @@ static inline int pthread_attr_destroy(pthread_attr_t *const attr) {
}
static inline int pthread_attr_setstacksize(pthread_attr_t *const attr,
const unsigned stack_size)
const size_t stack_size)
{
attr->stack_size = stack_size;
if (stack_size > UINT_MAX) return 1;
attr->stack_size = (unsigned) stack_size;
return 0;
}

third_party/dav1d/src/x86/cdef.asm (vendored): 1772 lines changed
Diff not shown because of its size.

third_party/dav1d/src/x86/cdef_init_tmpl.c (vendored): 33 lines changed

@ -28,20 +28,16 @@
#include "src/cpu.h"
#include "src/cdef.h"
decl_cdef_fn(dav1d_cdef_filter_8x8_avx2);
decl_cdef_fn(dav1d_cdef_filter_8x8_sse4);
decl_cdef_fn(dav1d_cdef_filter_8x8_ssse3);
decl_cdef_fn(dav1d_cdef_filter_8x8_sse2);
#define decl_cdef_size_fn(sz) \
decl_cdef_fn(dav1d_cdef_filter_##sz##_avx512icl); \
decl_cdef_fn(dav1d_cdef_filter_##sz##_avx2); \
decl_cdef_fn(dav1d_cdef_filter_##sz##_sse4); \
decl_cdef_fn(dav1d_cdef_filter_##sz##_ssse3); \
decl_cdef_fn(dav1d_cdef_filter_##sz##_sse2)
decl_cdef_fn(dav1d_cdef_filter_4x8_avx2);
decl_cdef_fn(dav1d_cdef_filter_4x8_sse4);
decl_cdef_fn(dav1d_cdef_filter_4x8_ssse3);
decl_cdef_fn(dav1d_cdef_filter_4x8_sse2);
decl_cdef_fn(dav1d_cdef_filter_4x4_avx2);
decl_cdef_fn(dav1d_cdef_filter_4x4_sse4);
decl_cdef_fn(dav1d_cdef_filter_4x4_ssse3);
decl_cdef_fn(dav1d_cdef_filter_4x4_sse2);
decl_cdef_size_fn(4x4);
decl_cdef_size_fn(4x8);
decl_cdef_size_fn(8x8);
decl_cdef_dir_fn(dav1d_cdef_dir_avx2);
decl_cdef_dir_fn(dav1d_cdef_dir_sse4);
@ -76,12 +72,21 @@ COLD void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
c->fb[2] = dav1d_cdef_filter_4x4_sse4;
#endif
#if ARCH_X86_64
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
#if BITDEPTH == 8 && ARCH_X86_64
#if BITDEPTH == 8
c->dir = dav1d_cdef_dir_avx2;
c->fb[0] = dav1d_cdef_filter_8x8_avx2;
c->fb[1] = dav1d_cdef_filter_4x8_avx2;
c->fb[2] = dav1d_cdef_filter_4x4_avx2;
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
#if BITDEPTH == 8
c->fb[2] = dav1d_cdef_filter_4x4_avx512icl;
#endif
#endif
}

third_party/dav1d/src/x86/cdef_sse.asm (vendored): 39 lines changed

@ -364,26 +364,19 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*%3, \
.body_done:
; top
%if ARCH_X86_64
DEFINE_ARGS dst, stride, left, top2, pri, sec, stride3, top1, edge
%else
DEFINE_ARGS dst, stride, left, top2, stride3, top1, edge
%endif
LOAD_ARG32 top
test edged, 4 ; have_top
jz .no_top
mov top1q, [top2q+0*gprsize]
mov top2q, [top2q+1*gprsize]
test edged, 1 ; have_left
jz .top_no_left
test edged, 2 ; have_right
jz .top_no_right
%if %1 == 4
PMOVZXBW m0, [top1q-2]
PMOVZXBW m1, [top2q-2]
PMOVZXBW m0, [topq+strideq*0-2]
PMOVZXBW m1, [topq+strideq*1-2]
%else
movu m0, [top1q-4]
movu m1, [top2q-4]
movu m0, [topq+strideq*0-4]
movu m1, [topq+strideq*1-4]
punpckhbw m2, m0, m15
punpcklbw m0, m15
punpckhbw m3, m1, m15
@ -396,13 +389,13 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*%3, \
jmp .top_done
.top_no_right:
%if %1 == 4
PMOVZXBW m0, [top1q-%1]
PMOVZXBW m1, [top2q-%1]
PMOVZXBW m0, [topq+strideq*0-%1]
PMOVZXBW m1, [topq+strideq*1-%1]
movu [px-2*%3-4*2], m0
movu [px-1*%3-4*2], m1
%else
movu m0, [top1q-%1]
movu m1, [top2q-%2]
movu m0, [topq+strideq*0-%1]
movu m1, [topq+strideq*1-%2]
punpckhbw m2, m0, m15
punpcklbw m0, m15
punpckhbw m3, m1, m15
@ -419,11 +412,11 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*%3, \
test edged, 2 ; have_right
jz .top_no_left_right
%if %1 == 4
PMOVZXBW m0, [top1q]
PMOVZXBW m1, [top2q]
PMOVZXBW m0, [topq+strideq*0]
PMOVZXBW m1, [topq+strideq*1]
%else
movu m0, [top1q]
movu m1, [top2q]
movu m0, [topq+strideq*0]
movu m1, [topq+strideq*1]
punpckhbw m2, m0, m15
punpcklbw m0, m15
punpckhbw m3, m1, m15
@ -437,8 +430,8 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*%3, \
mov dword [px-1*%3-4], OUT_OF_BOUNDS
jmp .top_done
.top_no_left_right:
PMOVZXBW m0, [top1q], %1 == 4
PMOVZXBW m1, [top2q], %1 == 4
PMOVZXBW m0, [topq+strideq*0], %1 == 4
PMOVZXBW m1, [topq+strideq*1], %1 == 4
mova [px-2*%3], m0
mova [px-1*%3], m1
mov dword [px-2*%3+%1*2], OUT_OF_BOUNDS
@ -630,9 +623,9 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*%3, \
sub secdmpd, dampingd
xor dampingd, dampingd
neg pridmpd
cmovl pridmpd, dampingd
cmovs pridmpd, dampingd
neg secdmpd
cmovl secdmpd, dampingd
cmovs secdmpd, dampingd
%if ARCH_X86_64
mov [rsp+ 0], pridmpq ; pri_shift
mov [rsp+16], secdmpq ; sec_shift

third_party/dav1d/src/x86/cpu.c (vendored): 55 lines changed

@ -33,37 +33,44 @@
#include "src/x86/cpu.h"
void dav1d_cpu_cpuid(uint32_t *info, int leaf);
uint64_t dav1d_cpu_xgetbv(int xcr);
typedef struct {
uint32_t eax, ebx, ecx, edx;
} CpuidRegisters;
void dav1d_cpu_cpuid(CpuidRegisters *regs, unsigned leaf, unsigned subleaf);
uint64_t dav1d_cpu_xgetbv(unsigned xcr);
#define X(reg, mask) (((reg) & (mask)) == (mask))
COLD unsigned dav1d_get_cpu_flags_x86(void) {
uint32_t info[4] = {0}, n_ids;
CpuidRegisters r = { 0 };
dav1d_cpu_cpuid(&r, 0, 0);
const unsigned max_leaf = r.eax;
unsigned flags = 0;
dav1d_cpu_cpuid(info, 0);
n_ids = info[0];
if (n_ids >= 1) {
dav1d_cpu_cpuid(info, 1);
if (info[3] & (1 << 25)) flags |= DAV1D_X86_CPU_FLAG_SSE;
if (info[3] & (1 << 26)) flags |= DAV1D_X86_CPU_FLAG_SSE2;
if (info[2] & (1 << 0)) flags |= DAV1D_X86_CPU_FLAG_SSE3;
if (info[2] & (1 << 9)) flags |= DAV1D_X86_CPU_FLAG_SSSE3;
if (info[2] & (1 << 19)) flags |= DAV1D_X86_CPU_FLAG_SSE41;
if (info[2] & (1 << 20)) flags |= DAV1D_X86_CPU_FLAG_SSE42;
if (max_leaf >= 1) {
dav1d_cpu_cpuid(&r, 1, 0);
if (X(r.edx, 0x06008000)) /* CMOV/SSE/SSE2 */ {
flags |= DAV1D_X86_CPU_FLAG_SSE2;
if (X(r.ecx, 0x00000201)) /* SSE3/SSSE3 */ {
flags |= DAV1D_X86_CPU_FLAG_SSSE3;
if (X(r.ecx, 0x00080000)) /* SSE4.1 */
flags |= DAV1D_X86_CPU_FLAG_SSE41;
}
}
#if ARCH_X86_64
/* We only support >128-bit SIMD on x86-64. */
if (info[2] & (1 << 27)) /* OSXSAVE */ {
uint64_t xcr = dav1d_cpu_xgetbv(0);
if ((xcr & 0x00000006) == 0x00000006) /* XMM/YMM */ {
if (info[2] & (1 << 28)) flags |= DAV1D_X86_CPU_FLAG_AVX;
if (n_ids >= 7) {
dav1d_cpu_cpuid(info, 7);
if ((info[1] & 0x00000128) == 0x00000128)
if (X(r.ecx, 0x18000000)) /* OSXSAVE/AVX */ {
const uint64_t xcr0 = dav1d_cpu_xgetbv(0);
if (X(xcr0, 0x00000006)) /* XMM/YMM */ {
if (max_leaf >= 7) {
dav1d_cpu_cpuid(&r, 7, 0);
if (X(r.ebx, 0x00000128)) /* BMI1/BMI2/AVX2 */ {
flags |= DAV1D_X86_CPU_FLAG_AVX2;
if ((xcr & 0x000000e0) == 0x000000e0) /* ZMM/OPMASK */ {
if ((info[1] & 0xd0030000) == 0xd0030000)
flags |= DAV1D_X86_CPU_FLAG_AVX512;
if (X(xcr0, 0x000000e0)) /* ZMM/OPMASK */ {
if (X(r.ebx, 0xd0230000) && X(r.ecx, 0x00005f42))
flags |= DAV1D_X86_CPU_FLAG_AVX512ICL;
}
}
}
}
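(Note on the rewritten detection above: it folds several CPUID bit tests into single masked compares via the X(reg, mask) helper, which requires all bits in the mask rather than any one of them. A small worked example of the idiom; the function name is ours, the bit positions follow the documented CPUID leaf 7 layout.)

#define X(reg, mask) (((reg) & (mask)) == (mask))

/* CPUID.(EAX=7,ECX=0):EBX: bit 3 = BMI1, bit 5 = AVX2, bit 8 = BMI2,
 * so 0x00000128 demands all three features at once. */
static int has_avx2_bmi(const unsigned ebx) {
    return X(ebx, 0x00000128); /* (1 << 3) | (1 << 5) | (1 << 8) */
}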

third_party/dav1d/src/x86/cpu.h (vendored): 15 lines changed

@ -29,15 +29,12 @@
#define DAV1D_SRC_X86_CPU_H
enum CpuFlags {
DAV1D_X86_CPU_FLAG_SSE = 1 << 0,
DAV1D_X86_CPU_FLAG_SSE2 = 1 << 1,
DAV1D_X86_CPU_FLAG_SSE3 = 1 << 2,
DAV1D_X86_CPU_FLAG_SSSE3 = 1 << 3,
DAV1D_X86_CPU_FLAG_SSE41 = 1 << 4,
DAV1D_X86_CPU_FLAG_SSE42 = 1 << 5,
DAV1D_X86_CPU_FLAG_AVX = 1 << 6,
DAV1D_X86_CPU_FLAG_AVX2 = 1 << 7,
DAV1D_X86_CPU_FLAG_AVX512 = 1 << 8, /* F + CD + BW + DQ + VL */
DAV1D_X86_CPU_FLAG_SSE2 = 1 << 0,
DAV1D_X86_CPU_FLAG_SSSE3 = 1 << 1,
DAV1D_X86_CPU_FLAG_SSE41 = 1 << 2,
DAV1D_X86_CPU_FLAG_AVX2 = 1 << 3,
DAV1D_X86_CPU_FLAG_AVX512ICL = 1 << 4, /* F/CD/BW/DQ/VL/VNNI/IFMA/VBMI/VBMI2/
* VPOPCNTDQ/BITALG/GFNI/VAES/VPCLMULQDQ */
};
unsigned dav1d_get_cpu_flags_x86(void);

third_party/dav1d/src/x86/cpuid.asm (vendored): 10 lines changed

@ -27,12 +27,12 @@
SECTION .text
cglobal cpu_cpuid, 0, 5, 0, info, leaf
mov r4, infomp
cglobal cpu_cpuid, 0, 5, 0, regs, leaf, subleaf
mov r4, regsmp
mov eax, leafm
xor ecx, ecx
mov ecx, subleafm
%if ARCH_X86_64
push rbx
mov r5, rbx
%endif
cpuid
mov [r4+4*0], eax
@ -40,7 +40,7 @@ cglobal cpu_cpuid, 0, 5, 0, info, leaf
mov [r4+4*2], ecx
mov [r4+4*3], edx
%if ARCH_X86_64
pop rbx
mov rbx, r5
%endif
RET

third_party/dav1d/src/x86/film_grain.asm (vendored): 77 lines changed

@ -44,6 +44,7 @@ round_vals: dw 32, 64, 128, 256, 512
max: dw 255, 240, 235
min: dw 0, 16
pb_27_17_17_27: db 27, 17, 17, 27
pw_1: dw 1
%macro JMP_TABLE 1-*
%xdefine %1_table %%table
@ -56,6 +57,7 @@ pb_27_17_17_27: db 27, 17, 17, 27
%endrep
%endmacro
ALIGN 4
JMP_TABLE generate_grain_y_avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420_avx2, 0, 1, 2, 3
@ -69,8 +71,8 @@ struc FGData
.scaling_shift: resd 1
.ar_coeff_lag: resd 1
.ar_coeffs_y: resb 24
.ar_coeffs_uv: resb 2 * 26 ; includes padding
.ar_coeff_shift: resd 1
.ar_coeffs_uv: resb 2 * 28 ; includes padding
.ar_coeff_shift: resq 1
.grain_scale_shift: resd 1
.uv_mult: resd 2
.uv_luma_mult: resd 2
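(Note on the struc update above: it widens ar_coeffs_uv to two 28-byte rows and makes ar_coeff_shift a quadword, keeping the asm offsets in sync with the C-side film grain data; it also explains why the uv plane index is scaled by 28 further down (imul uvd, 28). A hedged sketch of the corresponding C fields as we understand the 0.6.0 layout; check include/dav1d/headers.h in the dav1d tree for the authoritative definition.)

#include <stdint.h>

/* Excerpt only, not the full film grain data struct. */
typedef struct {
    int8_t   ar_coeffs_y[24];
    int8_t   ar_coeffs_uv[2][25 + 3 /* padded to 28 bytes per plane */];
    uint64_t ar_coeff_shift;    /* widened to 64 bits, hence resq above */
} FilmGrainDataExcerpt;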
@ -169,9 +171,9 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
movsx val0d, byte [bufq+xq]
add val3d, val0d
cmp val3d, maxd
cmovg val3d, maxd
cmovns val3d, maxd
cmp val3d, mind
cmovl val3d, mind
cmovs val3d, mind
mov byte [bufq+xq], val3b
; keep val3d in-place as left for next x iteration
inc xq
@ -190,18 +192,19 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
.ar2:
DEFINE_ARGS buf, fg_data, shift
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
movd xm14, [base+hmul_bits-10+shiftq*2]
vpbroadcastw xm14, [base+round_vals-12+shiftq*2]
movq xm15, [base+byte_blend+1]
pmovsxbw xm8, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7
movd xm9, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11
pmovsxbw xm9, xm9
DEFINE_ARGS buf, h, x
DEFINE_ARGS buf, fg_data, h, x
pshufd xm12, xm9, q0000
pshufd xm13, xm9, q1111
pshufd xm11, xm8, q3333
pshufd xm10, xm8, q2222
pshufd xm9, xm8, q1111
pshufd xm8, xm8, q0000
pmovzxwd xm14, xm14
sub bufq, 82*73-(82*3+79)
mov hd, 70
.y_loop_ar2:
@ -233,6 +236,7 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
paddd xm4, xm6
paddd xm2, xm7
paddd xm2, xm4
paddd xm2, xm14
movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5]
.x_loop_ar2_inner:
@ -241,9 +245,8 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
paddd xm3, xm2
psrldq xm1, 4 ; y=0,x=0
psrldq xm2, 4 ; shift top to next pixel
psrad xm3, 5
packssdw xm3, xm3
pmulhrsw xm3, xm14
psrad xm3, [fg_dataq+FGData.ar_coeff_shift]
; don't packssdw since we only care about one value
paddw xm3, xm1
packsswb xm3, xm3
pextrb [bufq+xq], xm3, 0
@ -274,7 +277,7 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
ALLOC_STACK 16*12
%endif
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
movd xm14, [base+hmul_bits-10+shiftq*2]
vpbroadcastw xm14, [base+round_vals-12+shiftq*2]
movq xm15, [base+byte_blend]
pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-7
pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_y+ 8] ; cf8-15
@ -288,10 +291,11 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
pshufd xm8, xm1, q3333
pshufd xm1, xm1, q0000
pshufd xm3, xm2, q1111
psrldq xm13, xm2, 10
pinsrw xm2, [pw_1], 5
pshufd xm4, xm2, q2222
psrldq xm5, xm2, 10
pshufd xm2, xm2, q0000
pinsrw xm5, [base+round_vals+shiftq*2-10], 3
pinsrw xm13, [base+round_vals+shiftq*2-10], 3
mova [rsp+ 0*16], xm0
mova [rsp+ 1*16], xm9
mova [rsp+ 2*16], xm10
@ -303,9 +307,7 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
mova [rsp+ 8*16], xm2
mova [rsp+ 9*16], xm3
mova [rsp+10*16], xm4
mova [rsp+11*16], xm5
pxor xm13, xm13
DEFINE_ARGS buf, h, x
DEFINE_ARGS buf, fg_data, h, x
sub bufq, 82*73-(82*3+79)
mov hd, 70
.y_loop_ar3:
@ -374,7 +376,7 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
punpcklwd xm6, xm7
punpcklwd xm8, xm9
punpcklwd xm5, xm13
punpcklwd xm5, xm14
pmaddwd xm6, [rsp+ 8*16]
pmaddwd xm8, [rsp+ 9*16]
pmaddwd xm5, [rsp+10*16]
@ -385,14 +387,13 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4]
.x_loop_ar3_inner:
pmovsxbw xm2, xm1
pmaddwd xm2, [rsp+16*11]
pmaddwd xm2, xm13
pshufd xm3, xm2, q1111
paddd xm2, xm3 ; left+cur
paddd xm2, xm0 ; add top
psrldq xm0, 4
psrad xm2, 5
packssdw xm2, xm2
pmulhrsw xm2, xm14
psrad xm2, [fg_dataq+FGData.ar_coeff_shift]
; don't packssdw since we only care about one value
packsswb xm2, xm2
pextrb [bufq+xq], xm2, 0
pslldq xm2, 3
@ -468,7 +469,7 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
.ar0:
INIT_YMM avx2
DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
imul uvd, 25
imul uvd, 28
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
movd xm3, [base+hmul_bits+shiftq*2]
@ -538,7 +539,7 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
.ar1:
INIT_XMM avx2
DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift
imul uvd, 25
imul uvd, 28
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
@ -584,9 +585,9 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
movsx val0d, byte [bufq+xq]
add val3d, val0d
cmp val3d, maxd
cmovg val3d, maxd
cmovns val3d, maxd
cmp val3d, mind
cmovl val3d, mind
cmovs val3d, mind
mov byte [bufq+xq], val3b
; keep val3d in-place as left for next x iteration
inc xq
@ -605,18 +606,17 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
.ar2:
DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
imul uvd, 25
movd xm15, [base+hmul_bits-10+shiftq*2]
imul uvd, 28
vpbroadcastw xm15, [base+round_vals-12+shiftq*2]
pmovsxbw xm8, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-7
pmovsxbw xm9, [fg_dataq+FGData.ar_coeffs_uv+uvq+8] ; cf8-12
pinsrw xm9, [base+pw_1], 5
vpbroadcastw xm7, [base+hmul_bits+4]
vpbroadcastd xm6, [base+pb_1]
DEFINE_ARGS buf, bufy, h, x
DEFINE_ARGS buf, bufy, fg_data, h, unused, x
pshufd xm12, xm9, q0000
pshufd xm13, xm9, q1111
pshufd xm14, xm9, q2222
pxor xm10, xm10
vpblendw xm14, xm10, 10101010b
pshufd xm11, xm8, q3333
pshufd xm10, xm8, q2222
pshufd xm9, xm8, q1111
@ -660,7 +660,7 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
pmaddubsw xm3, xm6, xm3
paddw xm0, xm3
pmulhrsw xm0, xm7
punpcklwd xm0, xm0
punpcklwd xm0, xm15
pmaddwd xm0, xm14
paddd xm2, xm0
@ -670,9 +670,7 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
pmaddwd xm3, xm0, xm13
paddd xm3, xm2
psrldq xm2, 4 ; shift top to next pixel
psrad xm3, 5
packssdw xm3, xm3
pmulhrsw xm3, xm15
psrad xm3, [fg_dataq+FGData.ar_coeff_shift]
pslldq xm3, 2
psrldq xm0, 2
paddw xm3, xm0
@ -698,8 +696,8 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
%assign stack_size_padded (stack_size_padded+16*12)
%assign stack_size (stack_size+16*12)
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
imul uvd, 25
movd xm14, [base+hmul_bits-10+shiftq*2]
imul uvd, 28
vpbroadcastw xm14, [base+round_vals-12+shiftq*2]
pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-7
pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 8] ; cf8-15
pmovsxbw xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-23
@ -719,6 +717,7 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
psrldq xm5, xm2, 10
pshufd xm2, xm2, q0000
pinsrw xm5, [base+round_vals+shiftq*2-10], 3
pmovzxwd xm14, xm14
mova [rsp+ 0*16], xm0
mova [rsp+ 1*16], xm9
mova [rsp+ 2*16], xm10
@ -733,7 +732,7 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
mova [rsp+11*16], xm5
vpbroadcastd xm13, [base+pb_1]
vpbroadcastw xm15, [base+hmul_bits+4]
DEFINE_ARGS buf, bufy, h, x
DEFINE_ARGS buf, bufy, fg_data, h, unused, x
sub bufq, 82*38+44-(82*3+41)
add bufyq, 79+82*3
mov hd, 35
@ -817,6 +816,7 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
paddd xm0, xm6
paddd xm8, xm5
paddd xm0, xm8
paddd xm0, xm14
movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4]
.x_loop_ar3_inner:
@ -826,9 +826,8 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
paddd xm2, xm3 ; left+cur
paddd xm2, xm0 ; add top
psrldq xm0, 4
psrad xm2, 5
packssdw xm2, xm2
pmulhrsw xm2, xm14
psrad xm2, [fg_dataq+FGData.ar_coeff_shift]
; don't packssdw, we only care about one value
pslldq xm2, 6
vpblendw xm1, xm2, 1000b
packsswb xm1, xm1

third_party/dav1d/src/x86/film_grain_init_tmpl.c (vendored)

@ -28,6 +28,11 @@
#include "src/cpu.h"
#include "src/film_grain.h"
decl_generate_grain_y_fn(dav1d_generate_grain_y_ssse3);
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_ssse3);
decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_ssse3);
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_ssse3);
decl_generate_grain_y_fn(dav1d_generate_grain_y_avx2);
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_avx2);
decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_avx2);
@ -36,6 +41,15 @@ decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_avx2);
COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
#if BITDEPTH == 8
c->generate_grain_y = dav1d_generate_grain_y_ssse3;
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_generate_grain_uv_420_ssse3;
c->fgy_32x32xn = dav1d_fgy_32x32xn_ssse3;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_ssse3;
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
#if BITDEPTH == 8 && ARCH_X86_64

third_party/dav1d/src/x86/film_grain_ssse3.asm (vendored, new file): 2938 lines
Diff not shown because of its size.

third_party/dav1d/src/x86/ipred.asm (vendored): 46 lines changed

@ -308,7 +308,7 @@ ALIGN function_align
mov r6d, 0x5556
mov r2d, 0x3334
cmp hd, 32
cmovz r6d, r2d
cmove r6d, r2d
movd xm1, r6d
pmulhuw xm0, xm1
.w8_end:
@ -1441,7 +1441,7 @@ ALIGN function_align
mov r3d, 9
mov tlq, rsp
cmp hd, 4
cmova maxbased, r3d
cmovne maxbased, r3d
vextracti128 xm1, m0, 1
packuswb xm0, xm1
mova [tlq], xm0
@ -1628,8 +1628,8 @@ ALIGN function_align
sar r5d, 1
mov tlq, rsp
add r5d, 17 ; w*2 + (filter_strength == 3)
cmp hd, 8
cmova maxbased, r5d
cmp hd, 16
cmovns maxbased, r5d
mov [tlq+r5], r3b
vextracti128 xm0, m1, 1
packuswb xm0, xm1
@ -1745,8 +1745,8 @@ ALIGN function_align
sar r5d, 1
mov tlq, rsp
add r5d, 33
cmp hd, 16
cmova maxbased, r5d
cmp hd, 32
cmovns maxbased, r5d
mov [tlq+r5], r3b
packuswb m0, m1
vpermq m0, m0, q3120
@ -1812,7 +1812,7 @@ ALIGN function_align
lea r3d, [hq+31]
mov maxbased, 63
cmp hd, 32
cmovb maxbased, r3d
cmovs maxbased, r3d
test angled, 0x400 ; !enable_intra_edge_filter
jnz .w32_main
vbroadcasti128 m0, [pb_0to15]
@ -1889,8 +1889,8 @@ ALIGN function_align
mov tlq, rsp
mov [tlq+65], r3b
mov r3d, 65
cmp hd, 32
cmova maxbased, r3d
cmp hd, 64
cmove maxbased, r3d
packuswb m0, m2
packuswb m1, m6
mova [tlq+ 0], m0
@ -2294,7 +2294,7 @@ ALIGN function_align
cmp hd, 16
movu xm2, [rsp+49]
vinserti128 m2, [rsp+43], 1
cmovl r5d, hd
cmovs r5d, hd
xor r5d, 15 ; h == 16 ? 5 : 15 - h
movd xm0, r5d
vbroadcasti128 m1, [base+z_filter_s+12]
@ -2501,7 +2501,7 @@ ALIGN function_align
.w8_filter_left_h16:
mov r5d, 10
cmp hd, 16
cmovl r5d, hd
cmovs r5d, hd
xor r5d, 15 ; h == 16 ? 5 : 15 - h
movd xm0, r5d
vpbroadcastb m0, xm0
@ -2742,7 +2742,7 @@ ALIGN function_align
.w16_filter_left_h16:
mov r5d, 10
cmp hd, 16
cmovl r5d, hd
cmovs r5d, hd
xor r5d, 15 ; h == 16 ? 5 : 15 - h
movd xm0, r5d
vpbroadcastb m0, xm0
@ -3115,7 +3115,7 @@ ALIGN function_align
mov r4d, 9
lea tlq, [rsp+15]
cmp wd, 4
cmova maxbased, r4d
cmovne maxbased, r4d
vextracti128 xm1, m0, 1
packuswb xm0, xm1
mova [rsp], xm0
@ -3321,8 +3321,8 @@ ALIGN function_align
sar r5d, 1
lea tlq, [rsp+31]
add r5d, 17
cmp wd, 8
cmova maxbased, r5d
cmp wd, 16
cmovns maxbased, r5d
neg r5
mov [tlq+r5], r4b
vextracti128 xm1, m0, 1
@ -3385,7 +3385,7 @@ ALIGN function_align
sub org_wd, 8
lea r2, [strideq*3]
lea r6, [dstq+org_wq]
cmovg dstq, r6
cmovns dstq, r6
punpcklwd xm1, xm2, xm0
punpckhwd xm2, xm0
lea r6, [dstq+strideq*4]
@ -3493,8 +3493,8 @@ ALIGN function_align
sar r5d, 1
lea tlq, [rsp+63]
add r5d, 33
cmp wd, 16
cmova maxbased, r5d
cmp wd, 32
cmovns maxbased, r5d
neg r5
mov [tlq+r5], r4b
packuswb m0, m1
@ -3563,7 +3563,7 @@ ALIGN function_align
sub org_wd, 8
lea r2, [strideq*3]
lea r6, [dstq+org_wq]
cmovg dstq, r6
cmovns dstq, r6
punpcklbw m1, m2, m0
punpckhbw m2, m0
lea r3, [strideq*5]
@ -3652,7 +3652,7 @@ ALIGN function_align
movu xm11, [tlq-66] ; 56-63
vinserti128 m11, [tlq-52], 1 ; 40-47
sub r4d, wd ; 21-w
cmovg r5d, r4d
cmovns r5d, r4d
movu xm12, [tlq-58] ; 48-55
vinserti128 m12, [tlq-44], 1 ; 32-39
sub r4d, 8 ; 13-w
@ -3721,8 +3721,8 @@ ALIGN function_align
lea tlq, [rsp+95]
mov [tlq-65], r4b
mov r4d, 65
cmp wd, 32
cmova maxbased, r4d
cmp wd, 64
cmove maxbased, r4d
packuswb m0, m2
packuswb m1, m6
mova [tlq-63], m0
@ -4553,7 +4553,7 @@ ALIGN function_align
mov r6d, 0x5556
mov r2d, 0x3334
cmp hd, 32
cmovz r6d, r2d
cmove r6d, r2d
movd xm1, r6d
pmulhuw xm0, xm1
.w8_end:

third_party/dav1d/src/x86/itx.asm (vendored): 226 lines changed

@ -60,7 +60,6 @@ pw_16384: times 2 dw 16384
pw_1697x16: times 2 dw 1697*16
pw_1697x8: times 2 dw 1697*8
pw_2896x8: times 2 dw 2896*8
pw_5793x4: times 2 dw 5793*4
pd_2048: dd 2048
@ -393,7 +392,7 @@ ALIGN function_align
pmulhrsw m0, [cq]
vpbroadcastd m1, [o(pw_1697x8)]
pmulhrsw m1, m0
paddw m0, m1
paddsw m0, m1
punpcklwd m0, m0
punpckhdq m1, m0, m0
punpckldq m0, m0
@ -405,7 +404,7 @@ ALIGN function_align
vpbroadcastd m2, [o(pw_2896x8)]
packusdw m0, m0
pmulhrsw m1, m0
paddw m0, m1
paddsw m0, m1
pmulhrsw m0, m2
mova m1, m0
jmp m(iadst_4x4_internal).end
@ -561,8 +560,8 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
vpbroadcastd m3, [o(pw_1697x8)]
pmulhrsw m2, m3, m0
pmulhrsw m3, m1
paddw m0, m2
paddw m1, m3
paddsw m0, m2
paddsw m1, m3
punpckhwd m2, m0, m1
punpcklwd m0, m1
punpckhwd m1, m0, m2
@ -572,8 +571,8 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
vpbroadcastd m3, [o(pw_1697x8)]
pmulhrsw m2, m3, m0
pmulhrsw m3, m1
paddw m0, m2
paddw m1, m3
paddsw m0, m2
paddsw m1, m3
jmp m(iadst_4x4_internal).end
%macro WRITE_4X8 2 ; coefs[1-2]
@ -626,7 +625,7 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
punpckldq xm0, xm1
pmulhrsw xm0, xm2
pmulhrsw xm3, xm0
paddw xm0, xm3
paddsw xm0, xm3
pmulhrsw xm0, xm2
pmulhrsw xm0, xm4
vpbroadcastq m0, xm0
@ -907,8 +906,8 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
punpckhwd m1, m2
pmulhrsw m2, m4, m0
pmulhrsw m4, m1
paddw m0, m2
paddw m1, m4
paddsw m0, m2
paddsw m1, m4
jmp tx2q
.pass2:
vpbroadcastd m4, [o(pw_4096)]
@ -925,8 +924,8 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpbroadcastd m3, [o(pw_2048)]
pmulhrsw m0, m1
pmulhrsw m2, m0
paddw m0, m0
paddw m0, m2
paddsw m0, m0
paddsw m0, m2
pmulhrsw m3, m0
punpcklwd m1, m3, m3
punpckhwd m3, m3
@ -941,15 +940,16 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
movd xm1, [cq+32*2]
punpcklwd xm1, [cq+32*3]
vpbroadcastd xm2, [o(pw_1697x8)]
vpbroadcastd xm3, [o(pw_16384)]
vpbroadcastd xm4, [o(pw_2896x8)]
vpbroadcastd xm3, [o(pw_2896x8)]
vpbroadcastd xm4, [o(pw_2048)]
punpckldq xm0, xm1
pcmpeqw xm1, xm1
pmulhrsw xm2, xm0
paddw xm0, xm2
pcmpeqw xm1, xm0
pxor xm0, xm1
pavgw xm0, xm2
pmulhrsw xm0, xm3
psrlw xm3, 3 ; pw_2048
pmulhrsw xm0, xm4
pmulhrsw xm0, xm3
vpbroadcastq m0, xm0
mova m1, m0
mova m2, m0
@ -1283,26 +1283,33 @@ cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
mova m3, [cq+32*0]
mova m2, [cq+32*1]
mova m4, [cq+32*2]
mova m0, [cq+32*3]
vpbroadcastd m5, [o(pw_1697x8)]
mova m5, [cq+32*3]
vpbroadcastd m8, [o(pw_1697x8)]
pcmpeqw m0, m0 ; -1
punpcklwd m1, m3, m2
punpckhwd m3, m2
punpcklwd m2, m4, m0
punpckhwd m4, m0
pmulhrsw m0, m5, m1
pmulhrsw m6, m5, m2
pmulhrsw m7, m5, m3
pmulhrsw m5, m4
paddw m1, m0
paddw m2, m6
paddw m3, m7
paddw m4, m5
vpbroadcastd m5, [o(pw_16384)]
punpcklwd m2, m4, m5
punpckhwd m4, m5
pmulhrsw m5, m8, m1
pmulhrsw m6, m8, m2
pmulhrsw m7, m8, m3
pmulhrsw m8, m4
pcmpeqw m9, m0, m1 ; we want to do a signed avg, but pavgw is
pxor m1, m9 ; unsigned. as long as both signs are equal
pcmpeqw m9, m0, m2 ; it still works, but if the input is -1 the
pxor m2, m9 ; pmulhrsw result will become 0 which causes
pcmpeqw m9, m0, m3 ; pavgw to output -32768 instead of 0 unless
pxor m3, m9 ; we explicitly deal with that case here.
pcmpeqw m0, m4
pxor m4, m0
pavgw m1, m5
pavgw m2, m6
pavgw m3, m7
pavgw m4, m8
punpckldq m0, m1, m2
punpckhdq m1, m2
punpckldq m2, m3, m4
punpckhdq m3, m4
REPX {pmulhrsw x, m5}, m0, m1, m2, m3
jmp tx2q
.pass2:
vpbroadcastd m8, [o(pw_1697x16)]
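(Note on the comment block above: pavgw computes an unsigned rounded average, yet it can stand in for the signed average of x and pmulhrsw(x, 1697*16) because the two inputs always share a sign, with the single exception of x == -1, whose pmulhrsw result is 0; the pcmpeqw/pxor pair patches that case by flipping -1 lanes to 0 first. A scalar model of why the same-sign case works; two's-complement wrap assumed, and this is our illustration rather than dav1d code.)

#include <stdint.h>

/* One pavgw lane: unsigned rounded average of the 16-bit lane patterns. */
static int16_t pavgw_lane(const int16_t a, const int16_t b) {
    const unsigned au = (uint16_t)a, bu = (uint16_t)b;
    return (int16_t)(uint16_t)((au + bu + 1) >> 1);
}

/* When a and b have the same sign, the 0x10000 biases of the unsigned
 * reinterpretation either both vanish or sum to 0x20000, which halves to
 * 0x10000 and drops out of the low 16 bits, so the lane pattern equals
 * the signed rounded average (a + b + 1) >> 1. */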
@ -1311,11 +1318,11 @@ cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
pmulhrsw m6, m8, m1
pmulhrsw m7, m8, m2
pmulhrsw m8, m3
REPX {paddw x, x}, m0, m1, m2, m3
paddw m0, m4
paddw m1, m6
paddw m2, m7
paddw m3, m8
REPX {paddsw x, x}, m0, m1, m2, m3
paddsw m0, m4
paddsw m1, m6
paddsw m2, m7
paddsw m3, m8
jmp m(iadst_4x16_internal).end2
%macro WRITE_8X4 4-7 strideq*1, strideq*2, r3, ; coefs[1-2], tmp[1-2], off[1-3]
@ -1353,7 +1360,7 @@ cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
vpbroadcastd xm3, [o(pw_2048)]
pmulhrsw xm1, xm0
pmulhrsw xm2, xm1
paddw xm1, xm2
paddsw xm1, xm2
pmulhrsw xm1, xm3
punpcklwd xm1, xm1
punpckldq xm0, xm1, xm1
@ -1369,7 +1376,7 @@ cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
vpbroadcastd xm3, [o(pw_2048)]
packusdw xm0, xm1
pmulhrsw xm0, xm2
paddw xm0, xm0
paddsw xm0, xm0
pmulhrsw xm0, xm2
pmulhrsw xm0, xm3
vinserti128 m0, m0, xm0, 1
@ -1447,7 +1454,7 @@ cglobal iadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
punpckhwd m2, m0, m1
punpcklwd m0, m1
pxor m3, m3
psubw m3, m2
psubsw m3, m2
punpckhwd m1, m0, m3
punpcklwd m0, m3
jmp tx2q
@ -1492,7 +1499,7 @@ cglobal iflipadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
punpckhwd m1, m3, m2
punpcklwd m3, m2
pxor m0, m0
psubw m0, m1
psubsw m0, m1
punpckhwd m1, m0, m3
punpcklwd m0, m3
jmp tx2q
@ -1520,15 +1527,15 @@ cglobal iidentity_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
pmulhrsw m2, m3
punpcklwd m0, m1, m2
punpckhwd m1, m2
paddw m0, m0
paddw m1, m1
paddsw m0, m0
paddsw m1, m1
jmp tx2q
.pass2:
vpbroadcastd m3, [o(pw_1697x8)]
pmulhrsw m2, m3, m0
pmulhrsw m3, m1
paddw m0, m2
paddw m1, m3
paddsw m0, m2
paddsw m1, m3
jmp m(iadst_8x4_internal).end
%macro INV_TXFM_8X8_FN 2-3 -1 ; type1, type2, fast_thresh
@ -1796,8 +1803,8 @@ cglobal iidentity_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
pmulhrsw m7, m1
psrlw m1, 3 ; pw_2048
pmulhrsw m2, m7
paddw m7, m7
paddw m7, m2
paddsw m7, m7
paddsw m7, m2
pmulhrsw m7, m1
punpcklwd m5, m7, m7
punpckhwd m7, m7
@ -2120,12 +2127,12 @@ INV_TXFM_8X16_FN identity, identity
%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16394]
pmulhrsw m%2, m%3, m%1
%if %0 == 4 ; if we're going to downshift by 1 doing so here eliminates the paddw
%if %0 == 4 ; if downshifting by 1
pmulhrsw m%2, m%4
%else
paddw m%1, m%1
paddsw m%1, m%1
%endif
paddw m%1, m%2
paddsw m%1, m%2
%endmacro
cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
@ -2201,7 +2208,7 @@ cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
pmulhrsw xm3, xm0
psrlw xm0, 3 ; pw_2048
pmulhrsw xm1, xm3
paddw xm3, xm1
paddsw xm3, xm1
pmulhrsw xm3, xm0
punpcklwd xm3, xm3
punpckldq xm1, xm3, xm3
@ -2228,7 +2235,7 @@ cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
vpbroadcastd m1, [o(pw_2896x8)]
pmulhrsw m4, m0
pmulhrsw m4, m5
paddw m0, m4
paddsw m0, m4
psrlw m5, 3 ; pw_2048
pmulhrsw m0, m1
pmulhrsw m0, m5
@ -2503,10 +2510,10 @@ cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
pmulhrsw m6, m7, m3
pmulhrsw m7, m4
REPX {pmulhrsw x, m8}, m0, m5, m6, m7
paddw m1, m0
paddw m2, m5
paddw m3, m6
paddw m4, m7
paddsw m1, m0
paddsw m2, m5
paddsw m3, m6
paddsw m4, m7
punpcklqdq m0, m1, m2
punpckhqdq m1, m2
punpcklqdq m2, m3, m4
@ -2518,10 +2525,10 @@ cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
pmulhrsw m5, m7, m1
pmulhrsw m6, m7, m2
pmulhrsw m7, m3
paddw m0, m4
paddw m1, m5
paddw m2, m6
paddw m3, m7
paddsw m0, m4
paddsw m1, m5
paddsw m2, m6
paddsw m3, m7
jmp m(iadst_16x4_internal).end
%macro INV_TXFM_16X8_FN 2-3 -1 ; type1, type2, fast_thresh
@ -2581,7 +2588,7 @@ cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
pmulhrsw m0, m4
pmulhrsw m5, m0
pmulhrsw m5, m2
paddw m0, m5
paddsw m0, m5
psrlw m2, 3 ; pw_2048
pmulhrsw m0, m4
pmulhrsw m0, m2
@ -2903,7 +2910,7 @@ cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
vpbroadcastd m3, [o(pw_2896x8)]
pmulhrsw m3, [cq]
vpbroadcastd m0, [o(pw_8192)]
vpbroadcastd m1, [o(pw_5793x4)]
vpbroadcastd m1, [o(pw_1697x16)]
vpbroadcastw m4, [o(deint_shuf)] ; pb_0_1
pcmpeqb m5, m5
pxor m6, m6
@ -2911,8 +2918,7 @@ cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
paddb m5, m5 ; pb_m2
pmulhrsw m3, m0
psrlw m0, 2 ; pw_2048
psllw m3, 2
pmulhrsw m3, m1
IDTX16 3, 1, 1
pmulhrsw m3, m0
mov r3d, 8
.loop:
@ -2954,17 +2960,15 @@ cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
punpcklwd m1, m3
vpbroadcastd m3, [o(pw_1697x16)]
punpcklwd m2, m4
vpbroadcastd m4, [o(pw_8192)]
vpbroadcastd m4, [o(pw_2896x8)]
punpckldq m1, m2
vpbroadcastd m2, [o(pw_2896x8)]
vpbroadcastd m2, [o(pw_2048)]
punpcklqdq m0, m1
pmulhrsw m3, m0
paddw m0, m0
paddw m0, m3
psraw m3, 1
pavgw m0, m3
pmulhrsw m0, m4
psrlw m4, 2 ; pw_2048
pmulhrsw m0, m2
pmulhrsw m0, m4
mov r3d, 8
jmp m(inv_txfm_add_identity_dct_16x4).end
%endif
@ -3385,6 +3389,12 @@ cglobal iflipadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
WRITE_16X2 7, [rsp+32*2], 0, 1, strideq*2, r3
jmp m(idct_16x16_internal).end3
%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16
pmulhrsw m%2, m%3, m%1
psraw m%2, 1
pavgw m%1, m%2 ; signs are guaranteed to be equal
%endmacro
INV_TXFM_16X16_FN identity, dct, 15
INV_TXFM_16X16_FN identity, identity
@ -3419,22 +3429,17 @@ cglobal iidentity_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
vinserti128 m13, [cq+16*13], 1
mova xm14, [cq-16* 1]
vinserti128 m14, [cq+16*15], 1
REPX {IDTX16 x, 6, 7}, 0, 15, 1, 8, 2, 9, 3, \
REPX {IDTX16B x, 6, 7}, 0, 15, 1, 8, 2, 9, 3, \
10, 4, 11, 5, 12, 13, 14
mova xm6, [cq-16* 4]
vinserti128 m6, [cq+16*12], 1
mova [rsp], m1
IDTX16 6, 1, 7
mova xm1, [cq-16* 2]
vinserti128 m1, [cq+16*14], 1
pmulhrsw m7, m1
paddw m1, m1
paddw m7, m1
vpbroadcastd m1, [o(pw_8192)]
REPX {pmulhrsw x, m1}, m0, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
pmulhrsw m1, [rsp]
mova [rsp], m0
IDTX16B 6, 0, 7
mova xm0, [cq-16* 2]
vinserti128 m0, [cq+16*14], 1
pmulhrsw m7, m0
psraw m7, 1
pavgw m7, m0
jmp m(idct_16x16_internal).pass1_end3
ALIGN function_align
.pass2:
@ -3447,8 +3452,8 @@ ALIGN function_align
IDTX16 0, 1, 15
mova m1, [rsp+32*0]
pmulhrsw m15, m1
paddw m1, m1
paddw m15, m1
paddsw m1, m1
paddsw m15, m1
jmp m(idct_16x16_internal).end
%define o_base iadst4_dconly2a + 128
@ -3963,7 +3968,7 @@ cglobal inv_txfm_add_identity_identity_8x32, 4, 5, 11, dst, stride, c, eob
vinserti128 m6, m6, [cq+16* 9], 1
vinserti128 m7, m7, [cq+16*13], 1
REPX {mova [cq+32*x], m8}, -4, -2, 0, 2, 4, 6
REPX {paddw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {paddsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
call .transpose8x8
REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
WRITE_8X4 0, 4, 8, 10, strideq*8, strideq*4, r4*4
@ -4572,12 +4577,12 @@ ALIGN function_align
IDCT32_PASS1_END 1, 9, 6, 7
ret
cglobal inv_txfm_add_identity_identity_16x32, 4, 5, 12, dst, stride, c, eob
cglobal inv_txfm_add_identity_identity_16x32, 4, 5, 13, dst, stride, c, eob
%undef cmp
lea rax, [o_base]
vpbroadcastd m9, [o(pw_2896x8)]
vpbroadcastd m10, [o(pw_5793x4)]
vpbroadcastd m11, [o(pw_5)]
vpbroadcastd m10, [o(pw_1697x16)]
vpbroadcastd m12, [o(pw_8192)]
cmp eobd, 43 ; if (eob > 43)
setg r4b ; iteration_count++
cmp eobd, 150 ; if (eob > 150)
@ -4586,6 +4591,7 @@ cglobal inv_txfm_add_identity_identity_16x32, 4, 5, 12, dst, stride, c, eob
adc r4b, al ; iteration_count++
lea r3, [strideq*3]
mov rax, cq
paddw m11, m12, m12 ; pw_16384
.loop:
mova xm0, [cq+64* 0]
mova xm1, [cq+64* 1]
@ -4604,11 +4610,9 @@ cglobal inv_txfm_add_identity_identity_16x32, 4, 5, 12, dst, stride, c, eob
vinserti128 m6, m6, [cq+64*14], 1
vinserti128 m7, m7, [cq+64*15], 1
REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
REPX {IDTX16 x, 8, 10, 11}, 0, 1, 2, 3, 4, 5, 6, 7
call m(inv_txfm_add_identity_identity_8x32).transpose8x8
REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {paddw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
WRITE_16X2 2, 3, 0, 1, strideq*2, r3
lea dstq, [dstq+strideq*4]
@ -4622,13 +4626,13 @@ cglobal inv_txfm_add_identity_identity_16x32, 4, 5, 12, dst, stride, c, eob
pxor m0, m0
mov r0d, 8
cmp cq, rax
jg .zero_loop
ja .zero_loop
.zero_loop_half:
mova [rax+64*0], m0
mova [rax+64*1], m0
mova [rax+64*2], m0
mova [rax+64*3], m0
add rax, 64*4
mova [rax-64*2], m0
mova [rax-64*1], m0
sub r0d, 2
jg .zero_loop_half
RET
@ -4646,7 +4650,7 @@ cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 12, dst, stride, c, eob
%undef cmp
lea rax, [o_base]
vpbroadcastd m9, [o(pw_2896x8)]
vpbroadcastd m10, [o(pw_1697x8)]
vpbroadcastd m10, [o(pw_1697x16)]
vpbroadcastd m11, [o(pw_2048)]
cmp eobd, 35 ; if (eob > 35)
setg r4b ; iteration_count++
@ -4674,24 +4678,9 @@ cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 12, dst, stride, c, eob
vinserti128 m6, m6, [cq+32*14], 1
vinserti128 m7, m7, [cq+32*15], 1
REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
REPX {paddsw x, x }, m0, m1, m2, m3, m4, m5, m6, m7
call m(inv_txfm_add_identity_identity_8x32).transpose8x8
pmulhrsw m8, m10, m0
paddw m0, m8
pmulhrsw m8, m10, m1
paddw m1, m8
pmulhrsw m8, m10, m2
paddw m2, m8
pmulhrsw m8, m10, m3
paddw m3, m8
pmulhrsw m8, m10, m4
paddw m4, m8
pmulhrsw m8, m10, m5
paddw m5, m8
pmulhrsw m8, m10, m6
paddw m6, m8
pmulhrsw m8, m10, m7
paddw m7, m8
REPX {IDTX16 x, 8, 10}, 0, 1, 2, 3, 4, 5, 6, 7
REPX {pmulhrsw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
WRITE_16X2 2, 3, 0, 1, strideq*2, r3
@ -4708,20 +4697,17 @@ cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 12, dst, stride, c, eob
lea dstq, [r5+16]
jmp .loop
.ret:
sub cq, 32
sub cd, eax
pxor m0, m0
mov r0d, 4
mov r1d, 8
cmp cq, rax
cmovg r0d, r1d
add cd, 384
.zero_loop:
mova [rax+32*0], m0
mova [rax+32*1], m0
mova [rax+32*2], m0
mova [rax+32*3], m0
add rax, 32*4
dec r0d
jg .zero_loop
sub cd, 128
jge .zero_loop
RET
cglobal inv_txfm_add_dct_dct_32x32, 4, 4, 0, dst, stride, c, eob
@ -4859,7 +4845,7 @@ cglobal inv_txfm_add_dct_dct_32x32, 4, 4, 0, dst, stride, c, eob
call m(inv_txfm_add_dct_dct_16x32).pass2_end
lea tmp3q, [tmp1q-32*32]
cmp tmp2q, tmp3q
jl .ret
jb .ret
sub tmp2q, 32*32
sub dstq, r3
lea r2, [r2+r3+16]

third_party/dav1d/src/x86/itx_ssse3.asm (vendored): 732 lines changed
Diff not shown because of its size.

third_party/dav1d/src/x86/looprestoration.asm (vendored): 24 lines changed

@ -347,7 +347,7 @@ cglobal sgr_box3_h, 8, 11, 8, sumsq, sum, left, src, stride, w, h, edge, x, xlim
punpckhbw xm0, xm1
; when we reach this, xm0 contains left two px in highest words
cmp xq, -16
cmp xd, -16
jle .loop_x
.partial_load_and_extend:
vpbroadcastb m3, [srcq-1]
@ -396,17 +396,17 @@ cglobal sgr_box3_h, 8, 11, 8, sumsq, sum, left, src, stride, w, h, edge, x, xlim
; else if x < xlimd we extend from previous load (this implies have_right=0)
; else we are done
cmp xq, -16
cmp xd, -16
jle .loop_x
test xq, xq
test xd, xd
jl .partial_load_and_extend
cmp xq, xlimq
cmp xd, xlimd
jl .right_extend
add sumsqq, (384+16)*4
add sumq, (384+16)*2
add srcq, strideq
dec hd
dec hd
jg .loop_y
RET
@ -418,7 +418,7 @@ cglobal sgr_box3_v, 5, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr,
shr ylimd, 2
sub ylimd, 2 ; -2 if have_bottom=0, else 0
.loop_x:
lea yd, [hd+ylimd+2]
lea yd, [hq+ylimq+2]
lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
lea sum_ptrq, [sumq+xq*2+2-(384+16)*2]
test edged, 4 ; have_top
@ -720,9 +720,9 @@ cglobal sgr_box5_h, 8, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xli
punpckhbw xm0, xm1
; when we reach this, xm0 contains left two px in highest words
cmp xq, -16
cmp xd, -16
jle .loop_x
test xq, xq
test xd, xd
jge .right_extend
.partial_load_and_extend:
vpbroadcastb m3, [srcq-1]
@ -781,11 +781,11 @@ cglobal sgr_box5_h, 8, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xli
; else if x < xlimd we extend from previous load (this implies have_right=0)
; else we are done
cmp xq, -16
cmp xd, -16
jle .loop_x
test xq, xq
test xd, xd
jl .partial_load_and_extend
cmp xq, xlimq
cmp xd, xlimd
jl .right_extend
add sumsqq, (384+16)*4
@ -803,7 +803,7 @@ cglobal sgr_box5_v, 5, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr,
shr ylimd, 2
sub ylimd, 3 ; -3 if have_bottom=0, else -1
.loop_x:
lea yd, [hd+ylimd+2]
lea yd, [hq+ylimq+2]
lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
lea sum_ptrq, [sumq+xq*2+2-(384+16)*2]
test edged, 4 ; have_top

third_party/dav1d/src/x86/looprestoration_ssse3.asm (vendored)

@ -725,7 +725,7 @@ cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
punpckhbw xm0, xm1
; when we reach this, m0 contains left two px in highest words
cmp xq, -8
cmp xd, -8
jle .loop_x
.partial_load_and_extend:
movd m3, [srcq-4]
@ -1299,9 +1299,9 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
punpckhbw m0, m1
; when we reach this, m0 contains left two px in highest words
cmp xq, -8
cmp xd, -8
jle .loop_x
test xq, xq
test xd, xd
jge .right_extend
.partial_load_and_extend:
XCHG_PIC_REG
@ -1394,11 +1394,11 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
; else if x < xlimd we extend from previous load (this implies have_right=0)
; else we are done
cmp xq, -8
cmp xd, -8
jle .loop_x
test xq, xq
test xd, xd
jl .partial_load_and_extend
cmp xq, xlimq
cmp xd, xlimd
jl .right_extend
add sumsqq, (384+16)*4

third_party/dav1d/src/x86/mc.asm (vendored): 3631 lines changed
Diff not shown because of its size.

third_party/dav1d/src/x86/mc_init_tmpl.c (vendored): 43 lines changed

@ -49,36 +49,52 @@ decl_mc_fn(dav1d_put_8tap_sharp_smooth_ssse3);
decl_mc_fn(dav1d_put_bilin_avx2);
decl_mc_fn(dav1d_put_bilin_ssse3);
decl_mct_fn(dav1d_prep_8tap_regular_avx512icl);
decl_mct_fn(dav1d_prep_8tap_regular_avx2);
decl_mct_fn(dav1d_prep_8tap_regular_ssse3);
decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx512icl);
decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx2);
decl_mct_fn(dav1d_prep_8tap_regular_smooth_ssse3);
decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx512icl);
decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx2);
decl_mct_fn(dav1d_prep_8tap_regular_sharp_ssse3);
decl_mct_fn(dav1d_prep_8tap_smooth_avx512icl);
decl_mct_fn(dav1d_prep_8tap_smooth_avx2);
decl_mct_fn(dav1d_prep_8tap_smooth_ssse3);
decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx512icl);
decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx2);
decl_mct_fn(dav1d_prep_8tap_smooth_regular_ssse3);
decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx512icl);
decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx2);
decl_mct_fn(dav1d_prep_8tap_smooth_sharp_ssse3);
decl_mct_fn(dav1d_prep_8tap_sharp_avx512icl);
decl_mct_fn(dav1d_prep_8tap_sharp_avx2);
decl_mct_fn(dav1d_prep_8tap_sharp_ssse3);
decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx512icl);
decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx2);
decl_mct_fn(dav1d_prep_8tap_sharp_regular_ssse3);
decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx512icl);
decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx2);
decl_mct_fn(dav1d_prep_8tap_sharp_smooth_ssse3);
decl_mct_fn(dav1d_prep_bilin_avx512icl);
decl_mct_fn(dav1d_prep_bilin_avx2);
decl_mct_fn(dav1d_prep_bilin_ssse3);
decl_avg_fn(dav1d_avg_avx512icl);
decl_avg_fn(dav1d_avg_avx2);
decl_avg_fn(dav1d_avg_ssse3);
decl_w_avg_fn(dav1d_w_avg_avx512icl);
decl_w_avg_fn(dav1d_w_avg_avx2);
decl_w_avg_fn(dav1d_w_avg_ssse3);
decl_mask_fn(dav1d_mask_avx512icl);
decl_mask_fn(dav1d_mask_avx2);
decl_mask_fn(dav1d_mask_ssse3);
decl_w_mask_fn(dav1d_w_mask_420_avx512icl);
decl_w_mask_fn(dav1d_w_mask_420_avx2);
decl_w_mask_fn(dav1d_w_mask_420_ssse3);
decl_w_mask_fn(dav1d_w_mask_422_avx512icl);
decl_w_mask_fn(dav1d_w_mask_422_avx2);
decl_w_mask_fn(dav1d_w_mask_444_avx512icl);
decl_w_mask_fn(dav1d_w_mask_444_avx2);
decl_blend_fn(dav1d_blend_avx2);
decl_blend_fn(dav1d_blend_ssse3);
@ -162,10 +178,11 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
c->warp8x8t = dav1d_warp_affine_8x8t_sse4;
#endif
#if ARCH_X86_64
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2))
return;
#if BITDEPTH == 8 && ARCH_X86_64
#if BITDEPTH == 8
init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2);
init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2);
@ -202,5 +219,29 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
c->warp8x8t = dav1d_warp_affine_8x8t_avx2;
c->emu_edge = dav1d_emu_edge_avx2;
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL))
return;
#if BITDEPTH == 8
init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx512icl);
init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl);
init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx512icl);
init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx512icl);
init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx512icl);
init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx512icl);
init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx512icl);
init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx512icl);
init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx512icl);
init_mct_fn(FILTER_2D_BILINEAR, bilin, avx512icl);
c->avg = dav1d_avg_avx512icl;
c->w_avg = dav1d_w_avg_avx512icl;
c->mask = dav1d_mask_avx512icl;
c->w_mask[0] = dav1d_w_mask_444_avx512icl;
c->w_mask[1] = dav1d_w_mask_422_avx512icl;
c->w_mask[2] = dav1d_w_mask_420_avx512icl;
#endif
#endif
}

third_party/dav1d/src/x86/mc_ssse3.asm (vendored): 88 lines changed

@ -1425,7 +1425,7 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
jmp wq
.h_w2:
%if ARCH_X86_32
and mxd, 0xff
and mxd, 0x7f
%else
movzx mxd, mxb
%endif
@ -1455,7 +1455,7 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
RET
.h_w4:
%if ARCH_X86_32
and mxd, 0xff
and mxd, 0x7f
%else
movzx mxd, mxb
%endif
@ -1564,16 +1564,16 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
%if ARCH_X86_32
movzx mxd, ssb
shr ssd, 16
cmp hd, 4
cmovle ssd, mxd
cmp hd, 6
cmovs ssd, mxd
lea ssq, [base_reg+ssq*8+subpel_filters-put_ssse3]
%else
%assign stack_offset org_stack_offset
WIN64_SPILL_XMM 16
movzx mxd, myb
shr myd, 16
cmp hd, 4
cmovle myd, mxd
cmp hd, 6
cmovs myd, mxd
lea myq, [base_reg+myq*8+subpel_filters-put_ssse3]
%endif
tzcnt r6d, wd
@ -1850,14 +1850,18 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
%assign stack_offset org_stack_offset
cmp wd, 4
jg .hv_w8
and mxd, 0xff
%if ARCH_X86_32
and mxd, 0x7f
%else
movzx mxd, mxb
%endif
dec srcq
movd m1, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
%if ARCH_X86_32
movzx mxd, ssb
shr ssd, 16
cmp hd, 4
cmovle ssd, mxd
cmp hd, 6
cmovs ssd, mxd
movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
W32_RESTORE_SSQ
lea r6, [ssq*3]
@ -1886,8 +1890,8 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
%else
movzx mxd, myb
shr myd, 16
cmp hd, 4
cmovle myd, mxd
cmp hd, 6
cmovs myd, mxd
movq m0, [base_reg+myq*8+subpel_filters-put_ssse3]
ALLOC_STACK mmsize*14, 14
lea ss3q, [ssq*3]
@ -2202,8 +2206,8 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
movq m1, [base_reg+mxq*8+subpel_filters-put_ssse3]
movzx mxd, ssb
shr ssd, 16
cmp hd, 4
cmovle ssd, mxd
cmp hd, 6
cmovs ssd, mxd
movq m5, [base_reg+ssq*8+subpel_filters-put_ssse3]
mov ssq, ssmp
ALLOC_STACK -mmsize*13
@ -2243,8 +2247,8 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
movq m0, [base_reg+mxq*8+subpel_filters-put_ssse3]
movzx mxd, myb
shr myd, 16
cmp hd, 4
cmovle myd, mxd
cmp hd, 6
cmovs myd, mxd
movq m1, [base_reg+myq*8+subpel_filters-put_ssse3]
pshufd subpelh0, m0, q0000
pshufd subpelh1, m0, q1111
@ -2511,7 +2515,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
jmp wq
.h_w4:
%if ARCH_X86_32
and mxd, 0xff
and mxd, 0x7f
%else
movzx mxd, mxb
%endif
@ -2635,15 +2639,15 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
.v:
%if ARCH_X86_32
mov mxd, myd
and mxd, 0xff
and mxd, 0x7f
%else
%assign stack_offset org_stack_offset
WIN64_SPILL_XMM 16
movzx mxd, myb
%endif
shr myd, 16
cmp hd, 4
cmovle myd, mxd
cmp hd, 6
cmovs myd, mxd
lea myq, [base_reg+myq*8+subpel_filters-prep_ssse3]
mova m2, [base+pw_512]
psrlw m2, m2, 1 ; 0x0100
@ -2849,14 +2853,14 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%assign stack_offset org_stack_offset
cmp wd, 4
jg .hv_w8
and mxd, 0xff
and mxd, 0x7f
movd m1, [base_reg+mxq*8+subpel_filters-prep_ssse3+2]
%if ARCH_X86_32
mov mxd, myd
and mxd, 0xff
shr myd, 16
cmp hd, 4
cmovle myd, mxd
and mxd, 0x7f
cmp hd, 6
cmovs myd, mxd
movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
mov r5, r2; use as new base
%define base_reg r5
@ -2885,8 +2889,8 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%else
movzx mxd, myb
shr myd, 16
cmp hd, 4
cmovle myd, mxd
cmp hd, 6
cmovs myd, mxd
movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
ALLOC_STACK mmsize*14, 14
lea stride3q, [strideq*3]
@ -3101,11 +3105,11 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%define accuv0 [rsp+mmsize*11]
%define accuv1 [rsp+mmsize*12]
movq m1, [base_reg+mxq*8+subpel_filters-prep_ssse3]
movzx mxd, myw
and mxd, 0xff
mov mxd, myd
shr myd, 16
cmp hd, 4
cmovle myd, mxd
and mxd, 0x7f
cmp hd, 6
cmovs myd, mxd
movq m5, [base_reg+myq*8+subpel_filters-prep_ssse3]
ALLOC_STACK -mmsize*13
%if STACK_ALIGNMENT < mmsize
@ -3150,8 +3154,8 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
movq m0, [base_reg+mxq*8+subpel_filters-prep_ssse3]
movzx mxd, myb
shr myd, 16
cmp hd, 4
cmovle myd, mxd
cmp hd, 6
cmovs myd, mxd
movq m1, [base_reg+myq*8+subpel_filters-prep_ssse3]
pshufd subpelh0, m0, q0000
pshufd subpelh1, m0, q1111
@ -4743,9 +4747,9 @@ cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \
xor reg_zero, reg_zero
lea reg_tmp, [ihq-1]
cmp yq, ihq
cmovl reg_tmp, yq
cmovs reg_tmp, yq
test yq, yq
cmovl reg_tmp, reg_zero
cmovs reg_tmp, reg_zero
%if ARCH_X86_64
imul reg_tmp, sstrideq
add srcq, reg_tmp
@ -4758,9 +4762,9 @@ cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \
; ref += iclip(x, 0, iw - 1)
lea reg_tmp, [iwq-1]
cmp xq, iwq
cmovl reg_tmp, xq
cmovs reg_tmp, xq
test xq, xq
cmovl reg_tmp, reg_zero
cmovs reg_tmp, reg_zero
add reg_src, reg_tmp
%if ARCH_X86_32
mov srcm, reg_src
@ -4773,7 +4777,7 @@ cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \
lea reg_bottomext, [yq+bhq]
sub reg_bottomext, ihq
lea r3, [bhq-1]
cmovl reg_bottomext, reg_zero
cmovs reg_bottomext, reg_zero
;
DEFINE_ARGS bw, bh, iw, ih, x, \
@ -4782,9 +4786,9 @@ cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \
; top_ext = iclip(-y, 0, bh - 1)
neg topextq
cmovl topextq, reg_zero
cmovs topextq, reg_zero
cmp reg_bottomext, bhq
cmovge reg_bottomext, r3
cmovns reg_bottomext, r3
cmp topextq, bhq
cmovg topextq, r3
%if ARCH_X86_32
@ -4796,7 +4800,7 @@ cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \
lea reg_rightext, [xq+bwq]
sub reg_rightext, iwq
lea r2, [bwq-1]
cmovl reg_rightext, reg_zero
cmovs reg_rightext, reg_zero
DEFINE_ARGS bw, bh, iw, ih, leftext, \
topext, dst, dstride, src, sstride, \
@ -4804,14 +4808,14 @@ cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \
; left_ext = iclip(-x, 0, bw - 1)
neg leftextq
cmovl leftextq, reg_zero
cmovs leftextq, reg_zero
cmp reg_rightext, bwq
cmovge reg_rightext, r2
cmovns reg_rightext, r2
%if ARCH_X86_32
mov r3m, r1
%endif
cmp leftextq, bwq
cmovge leftextq, r2
cmovns leftextq, r2
%undef reg_zero
%undef reg_tmp

third_party/dav1d/src/x86/msac.asm (vendored): 90 lines changed

@ -67,7 +67,7 @@ struc msac
.update_cdf: resd 1
endstruc
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
%define m(x, y) mangle(private_prefix %+ _ %+ x %+ y)
SECTION .text
@ -167,7 +167,7 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6
%endif
lea t5, [t2+gprsize]
cmp t5, rcx
jg .refill_eob
ja .refill_eob
mov t2, [t2]
lea ecx, [t1+23]
add t1d, 16
@ -195,7 +195,7 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6
sub ecx, t1d ; c
.refill_eob_loop:
cmp t2, t5
jge .refill_eob_end ; eob reached
jae .refill_eob_end ; eob reached
movzx t1d, byte [t2]
inc t2
shl t1, cl
@ -240,7 +240,7 @@ cglobal msac_decode_symbol_adapt8, 0, 6, 6
pcmpeqw m1, m2
pmovmskb eax, m1
test t3d, t3d
jz m(msac_decode_symbol_adapt4).renorm
jz m(msac_decode_symbol_adapt4, SUFFIX).renorm
movzx t3d, word [t1+t4*2]
pcmpeqw m2, m2
mov t2d, t3d
@ -257,7 +257,7 @@ cglobal msac_decode_symbol_adapt8, 0, 6, 6
paddw m0, m2
mova [t1], m0
mov [t1+t4*2], t2w
jmp m(msac_decode_symbol_adapt4).renorm
jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm
cglobal msac_decode_symbol_adapt16, 0, 6, 6
DECODE_SYMBOL_ADAPT_INIT
@ -330,7 +330,7 @@ cglobal msac_decode_symbol_adapt16, 0, 6, 6
%if WIN64
add rsp, 48
%endif
jmp m(msac_decode_symbol_adapt4).renorm2
jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm2
cglobal msac_decode_bool_adapt, 0, 6, 0
movifnidn t1, r1mp
@ -366,7 +366,7 @@ cglobal msac_decode_bool_adapt, 0, 6, 0
%endif
not t4
test t3d, t3d
jz m(msac_decode_symbol_adapt4).renorm3
jz m(msac_decode_symbol_adapt4, SUFFIX).renorm3
%if UNIX64 == 0
push t6
%endif
@ -390,13 +390,13 @@ cglobal msac_decode_bool_adapt, 0, 6, 0
%if WIN64
mov t1d, [t7+msac.cnt]
pop t6
jmp m(msac_decode_symbol_adapt4).renorm4
jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm4
%else
%if ARCH_X86_64 == 0
pop t5
pop t6
%endif
jmp m(msac_decode_symbol_adapt4).renorm3
jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3
%endif
cglobal msac_decode_bool_equi, 0, 6, 0
@ -418,7 +418,7 @@ cglobal msac_decode_bool_equi, 0, 6, 0
%if ARCH_X86_64 == 0
movzx eax, al
%endif
jmp m(msac_decode_symbol_adapt4).renorm3
jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3
cglobal msac_decode_bool, 0, 6, 0
movifnidn t0, r0mp
@ -442,7 +442,7 @@ cglobal msac_decode_bool, 0, 6, 0
%if ARCH_X86_64 == 0
movzx eax, al
%endif
jmp m(msac_decode_symbol_adapt4).renorm3
jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3
%macro HI_TOK 1 ; update_cdf
%if ARCH_X86_64 == 0
@ -598,3 +598,71 @@ cglobal msac_decode_hi_tok, 0, 7 + ARCH_X86_64, 6
HI_TOK 1
.no_update_cdf:
HI_TOK 0
%if ARCH_X86_64
INIT_YMM avx2
cglobal msac_decode_symbol_adapt16, 3, 6, 6
lea rax, [pw_0xff00]
vpbroadcastw m2, [t0+msac.rng]
mova m0, [t1]
vpbroadcastw m3, [t0+msac.dif+6]
vbroadcasti128 m4, [rax]
mov t3d, [t0+msac.update_cdf]
mov t4d, t2d
not t2
%if STACK_ALIGNMENT < 32
mov r5, rsp
%if WIN64
and rsp, ~31
sub rsp, 40
%else
and r5, ~31
%define buf r5-32
%endif
%elif WIN64
sub rsp, 64
%else
%define buf rsp-56
%endif
psrlw m1, m0, 6
movd [buf-4], xm2
pand m2, m4
psllw m1, 7
pmulhuw m1, m2
paddw m1, [rax+t2*2]
mova [buf], m1
pmaxuw m1, m3
pcmpeqw m1, m3
pmovmskb eax, m1
test t3d, t3d
jz .renorm
movzx t3d, word [t1+t4*2]
pcmpeqw m2, m2
lea t2d, [t3+80]
shr t2d, 4
cmp t3d, 32
adc t3d, 0
movd xm3, t2d
pavgw m2, m1
psubw m2, m0
psubw m0, m1
psraw m2, xm3
paddw m0, m2
mova [t1], m0
mov [t1+t4*2], t3w
.renorm:
tzcnt eax, eax
mov t4, [t0+msac.dif]
movzx t1d, word [buf+rax-0]
movzx t2d, word [buf+rax-2]
shr eax, 1
%if WIN64
%if STACK_ALIGNMENT < 32
mov rsp, r5
%else
add rsp, 64
%endif
%endif
vzeroupper
jmp m(msac_decode_symbol_adapt4, _sse2).renorm2
%endif

third_party/dav1d/src/x86/msac.h (vendored)

@ -39,10 +39,13 @@ unsigned dav1d_msac_decode_bool_equi_sse2(MsacContext *s);
unsigned dav1d_msac_decode_bool_sse2(MsacContext *s, unsigned f);
unsigned dav1d_msac_decode_hi_tok_sse2(MsacContext *s, uint16_t *cdf);
/* Needed for checkasm */
unsigned dav1d_msac_decode_symbol_adapt16_avx2(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
#if ARCH_X86_64 || defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_sse2
#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_sse2
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2
#define dav1d_msac_decode_hi_tok dav1d_msac_decode_hi_tok_sse2
#endif
@ -50,4 +53,12 @@ unsigned dav1d_msac_decode_hi_tok_sse2(MsacContext *s, uint16_t *cdf);
#define dav1d_msac_decode_bool_equi dav1d_msac_decode_bool_equi_sse2
#define dav1d_msac_decode_bool dav1d_msac_decode_bool_sse2
#if ARCH_X86_64
#define dav1d_msac_decode_symbol_adapt16(ctx, cdf, symb) ((ctx)->symbol_adapt16(ctx, cdf, symb))
#elif defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2
#endif
void dav1d_msac_init_x86(MsacContext *const s);
#endif /* DAV1D_SRC_X86_MSAC_H */

third_party/dav1d/src/x86/msac_init.c (vendored, new file)

@ -0,0 +1,42 @@
/*
* Copyright © 2020, VideoLAN and dav1d authors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/cpu.h"
#include "src/msac.h"
#include "src/x86/msac.h"
void dav1d_msac_init_x86(MsacContext *const s) {
const unsigned flags = dav1d_get_cpu_flags();
if (flags & DAV1D_X86_CPU_FLAG_SSE2) {
s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2;
}
if (flags & DAV1D_X86_CPU_FLAG_AVX2) {
s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2;
}
}
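
The new msac_init.c, together with the ARCH_X86_64 branch in msac.h above, is a small runtime-dispatch pattern: dav1d_msac_decode_symbol_adapt16() is routed through a per-context function pointer that CPU detection fills in once, so the AVX2 version can be selected without branching on every call. A minimal standalone sketch of the same idea, using hypothetical names rather than dav1d's real API:

    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical context holding one pointer per specialised routine. */
    typedef struct Ctx {
        unsigned (*symbol_adapt16)(struct Ctx *c, uint16_t *cdf, size_t n);
    } Ctx;

    /* Portable fallback and a stand-in for the AVX2 build. */
    static unsigned adapt16_c(Ctx *c, uint16_t *cdf, size_t n)    { (void)c; (void)cdf; (void)n; return 0; }
    static unsigned adapt16_avx2(Ctx *c, uint16_t *cdf, size_t n) { (void)c; (void)cdf; (void)n; return 0; }

    /* Placeholder CPU probe; dav1d uses dav1d_get_cpu_flags() here. */
    static int have_avx2(void) { return 0; }

    /* Mirrors dav1d_msac_init_x86: choose the best routine up front. */
    static void ctx_init(Ctx *c) {
        c->symbol_adapt16 = adapt16_c;
        if (have_avx2())
            c->symbol_adapt16 = adapt16_avx2;
    }

    /* Call sites hide the indirection behind a macro, as msac.h does. */
    #define decode_symbol_adapt16(c, cdf, n) ((c)->symbol_adapt16(c, cdf, n))

    int main(void) {
        Ctx c;
        uint16_t cdf[16] = { 0 };
        ctx_init(&c);
        return (int)decode_symbol_adapt16(&c, cdf, 16);
    }

Only the 16-symbol variant goes through the pointer; the adapt4/adapt8 entry points keep their direct SSE2 #define aliases because no AVX2 replacement exists for them in this update.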

third_party/dav1d/tests/checkasm/cdef.c (vendored)

@ -35,72 +35,82 @@
#include "src/levels.h"
#include "src/cdef.h"
static void init_tmp(pixel *buf, int n, const int bitdepth_max) {
while (n--)
*buf++ = rnd() & bitdepth_max;
static int to_binary(int x) { /* 0-15 -> 0000-1111 */
return (x & 1) + 5 * (x & 2) + 25 * (x & 4) + 125 * (x & 8);
}
static void check_cdef_filter(const cdef_fn fn, const int w, const int h,
const char *const name)
{
ALIGN_STK_32(pixel, src, 10 * 16 + 8, );
ALIGN_STK_32(pixel, c_src, 10 * 16 + 8, ), *const c_src_ptr = c_src + 8;
ALIGN_STK_32(pixel, a_src, 10 * 16 + 8, ), *const a_src_ptr = a_src + 8;
ALIGN_STK_32(pixel, top, 16 * 2 + 8, ), *const top_ptr = top + 8;
pixel left[8][2];
static void init_tmp(pixel *buf, int n, const int bitdepth_max) {
const int fill_type = rnd() & 7;
if (fill_type == 0)
while (n--) /* check for cdef_filter underflows */
*buf++ = rnd() & 1;
else if (fill_type == 1)
while (n--) /* check for cdef_filter overflows */
*buf++ = bitdepth_max - (rnd() & 1);
else
while (n--)
*buf++ = rnd() & bitdepth_max;
}
static void check_cdef_filter(const cdef_fn fn, const int w, const int h) {
ALIGN_STK_64(pixel, c_src, 16 * 10 + 16, ), *const c_dst = c_src + 8;
ALIGN_STK_64(pixel, a_src, 16 * 10 + 16, ), *const a_dst = a_src + 8;
ALIGN_STK_64(pixel, top_buf, 16 * 2 + 16, ), *const top = top_buf + 8;
ALIGN_STK_16(pixel, left, 8,[2]);
const ptrdiff_t stride = 16 * sizeof(pixel);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel (*left)[2],
pixel *const top[2], int pri_strength, int sec_strength,
const pixel *top, int pri_strength, int sec_strength,
int dir, int damping, enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX);
if (check_func(fn, "%s_%dbpc", name, BITDEPTH)) {
if (check_func(fn, "cdef_filter_%dx%d_%dbpc", w, h, BITDEPTH)) {
for (int dir = 0; dir < 8; dir++) {
for (enum CdefEdgeFlags edges = 0; edges <= 0xf; edges++) {
for (enum CdefEdgeFlags edges = 0x0; edges <= 0xf; edges++) {
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
const int bitdepth_max = 0xff;
#endif
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
init_tmp(src, 10 * 16 + 8, bitdepth_max);
init_tmp(top, 16 * 2 + 8, bitdepth_max);
init_tmp((pixel *) left,8 * 2, bitdepth_max);
memcpy(a_src, src, (10 * 16 + 8) * sizeof(pixel));
memcpy(c_src, src, (10 * 16 + 8) * sizeof(pixel));
init_tmp(c_src, 16 * 10 + 16, bitdepth_max);
init_tmp(top_buf, 16 * 2 + 16, bitdepth_max);
init_tmp((pixel *) left, 8 * 2, bitdepth_max);
memcpy(a_src, c_src, (16 * 10 + 16) * sizeof(pixel));
const int lvl = 1 + (rnd() % 62);
const int damping = 3 + (rnd() & 3) + bitdepth_min_8 - (w == 4 || (rnd() & 1));
const int pri_strength = (lvl >> 2) << bitdepth_min_8;
int pri_strength = (lvl >> 2) << bitdepth_min_8;
int sec_strength = lvl & 3;
sec_strength += sec_strength == 3;
sec_strength <<= bitdepth_min_8;
call_ref(c_src_ptr, 16 * sizeof(pixel), left,
(pixel *[2]) { top_ptr, top_ptr + 16 },
pri_strength, sec_strength, dir, damping, edges
HIGHBD_TAIL_SUFFIX);
call_new(a_src_ptr, 16 * sizeof(pixel), left,
(pixel *[2]) { top_ptr, top_ptr + 16 },
pri_strength, sec_strength, dir, damping, edges
HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel(c_src, 16 * sizeof(pixel),
a_src, 16 * sizeof(pixel),
16, 10, "src");
checkasm_check_pixel(c_src + 16 * 10, 16 * sizeof(pixel),
a_src + 16 * 10, 16 * sizeof(pixel),
8, 1, "src last row");
bench_new(a_src_ptr, 16 * sizeof(pixel), left,
(pixel *[2]) { top_ptr, top_ptr + 16 },
pri_strength, sec_strength, dir, damping, edges
HIGHBD_TAIL_SUFFIX);
call_ref(c_dst, stride, left, top, pri_strength, sec_strength,
dir, damping, edges HIGHBD_TAIL_SUFFIX);
call_new(a_dst, stride, left, top, pri_strength, sec_strength,
dir, damping, edges HIGHBD_TAIL_SUFFIX);
if (checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst")) {
fprintf(stderr, "strength = %d:%d, dir = %d, damping = %d, edges = %04d\n",
pri_strength, sec_strength, dir, damping, to_binary(edges));
return;
}
if (dir == 7 && (edges == 0x5 || edges == 0xa || edges == 0xf)) {
/* Benchmark a fixed set of cases to get consistent results:
* 1) top/left edges and pri_strength only
* 2) bottom/right edges and sec_strength only
* 3) all edges and both pri_strength and sec_strength
*/
pri_strength = (edges & 1) << bitdepth_min_8;
sec_strength = (edges & 2) << bitdepth_min_8;
bench_new(a_dst, stride, left, top, pri_strength, sec_strength,
dir, damping, edges HIGHBD_TAIL_SUFFIX);
}
}
}
}
report(name);
}
static void check_cdef_direction(const cdef_dir_fn fn) {
ALIGN_STK_32(pixel, src, 8 * 8,);
ALIGN_STK_64(pixel, src, 8 * 8,);
declare_func(int, pixel *src, ptrdiff_t dst_stride, unsigned *var
HIGHBD_DECL_SUFFIX);
@ -129,11 +139,12 @@ static void check_cdef_direction(const cdef_dir_fn fn) {
void bitfn(checkasm_check_cdef)(void) {
Dav1dCdefDSPContext c;
bitfn(dav1d_cdef_dsp_init)(&c);
check_cdef_direction(c.dir);
check_cdef_filter(c.fb[0], 8, 8, "cdef_filter_8x8");
check_cdef_filter(c.fb[1], 4, 8, "cdef_filter_4x8");
check_cdef_filter(c.fb[2], 4, 4, "cdef_filter_4x4");
check_cdef_filter(c.fb[0], 8, 8);
check_cdef_filter(c.fb[1], 4, 8);
check_cdef_filter(c.fb[2], 4, 4);
report("cdef_filter");
}
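
The to_binary() helper added at the top of this cdef.c change exists only for the failure message: it turns the 4-bit edge mask into a decimal number whose digits spell out the binary value, so "edges = %04d" prints 1010 for 0xa. The arithmetic is easy to sanity-check in isolation:

    #include <assert.h>

    /* Same trick as checkasm's to_binary(): bit 0 contributes 1, bit 1
     * contributes 10, bit 2 contributes 100 and bit 3 contributes 1000. */
    static int to_binary(int x) {
        return (x & 1) + 5 * (x & 2) + 25 * (x & 4) + 125 * (x & 8);
    }

    int main(void) {
        assert(to_binary(0x0) == 0);    /* 0b0000 */
        assert(to_binary(0x5) == 101);  /* 0b0101 */
        assert(to_binary(0xa) == 1010); /* 0b1010 */
        assert(to_binary(0xf) == 1111); /* 0b1111 */
        return 0;
    }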

third_party/dav1d/tests/checkasm/checkasm.c (vendored)

@ -98,19 +98,15 @@ static const struct {
unsigned flag;
} cpus[] = {
#if ARCH_X86
{ "SSE", "sse", DAV1D_X86_CPU_FLAG_SSE },
{ "SSE2", "sse2", DAV1D_X86_CPU_FLAG_SSE2 },
{ "SSE3", "sse3", DAV1D_X86_CPU_FLAG_SSE3 },
{ "SSSE3", "ssse3", DAV1D_X86_CPU_FLAG_SSSE3 },
{ "SSE4.1", "sse4", DAV1D_X86_CPU_FLAG_SSE41 },
{ "SSE4.2", "sse42", DAV1D_X86_CPU_FLAG_SSE42 },
{ "AVX", "avx", DAV1D_X86_CPU_FLAG_AVX },
{ "AVX2", "avx2", DAV1D_X86_CPU_FLAG_AVX2 },
{ "AVX-512", "avx512", DAV1D_X86_CPU_FLAG_AVX512 },
{ "SSE2", "sse2", DAV1D_X86_CPU_FLAG_SSE2 },
{ "SSSE3", "ssse3", DAV1D_X86_CPU_FLAG_SSSE3 },
{ "SSE4.1", "sse4", DAV1D_X86_CPU_FLAG_SSE41 },
{ "AVX2", "avx2", DAV1D_X86_CPU_FLAG_AVX2 },
{ "AVX-512 (Ice Lake)", "avx512icl", DAV1D_X86_CPU_FLAG_AVX512ICL },
#elif ARCH_AARCH64 || ARCH_ARM
{ "NEON", "neon", DAV1D_ARM_CPU_FLAG_NEON },
{ "NEON", "neon", DAV1D_ARM_CPU_FLAG_NEON },
#elif ARCH_PPC64LE
{ "VSX", "vsx", DAV1D_PPC_CPU_FLAG_VSX },
{ "VSX", "vsx", DAV1D_PPC_CPU_FLAG_VSX },
#endif
{ 0 }
};
@ -150,6 +146,9 @@ static struct {
int bench_c;
int verbose;
int function_listing;
#if ARCH_X86_64
void (*simd_warmup)(void);
#endif
} state;
/* float compare support code */
@ -569,13 +568,26 @@ int main(int argc, char *argv[]) {
fprintf(stderr, "checkasm: using random seed %u\n", state.seed);
dav1d_init_cpu();
#if ARCH_X86_64
void checkasm_warmup_avx2(void);
void checkasm_warmup_avx512(void);
unsigned cpu_flags = dav1d_get_cpu_flags();
if (cpu_flags & DAV1D_X86_CPU_FLAG_AVX512ICL)
state.simd_warmup = checkasm_warmup_avx512;
else if (cpu_flags & DAV1D_X86_CPU_FLAG_AVX2)
state.simd_warmup = checkasm_warmup_avx2;
else
state.simd_warmup = NULL;
checkasm_simd_warmup();
#endif
check_cpu_flag(NULL, 0);
if (state.function_listing) {
print_functions(state.funcs);
} else {
for (int i = 0; cpus[i].flag; i++)
check_cpu_flag(cpus[i].name, cpus[i].flag);
if (!state.num_checked) {
fprintf(stderr, "checkasm: no tests to perform\n");
} else if (state.num_failed) {
@ -774,3 +786,11 @@ DEF_CHECKASM_CHECK_FUNC(uint8_t, "%02x")
DEF_CHECKASM_CHECK_FUNC(uint16_t, "%04x")
DEF_CHECKASM_CHECK_FUNC(int16_t, "%6d")
DEF_CHECKASM_CHECK_FUNC(int32_t, "%9d")
#if ARCH_X86_64
void checkasm_simd_warmup(void)
{
if (state.simd_warmup)
state.simd_warmup();
}
#endif

third_party/dav1d/tests/checkasm/checkasm.h (vendored)

@ -193,12 +193,20 @@ void checkasm_checked_call(void *func, ...);
* not guaranteed and false negatives is theoretically possible, but there
* can never be any false positives. */
void checkasm_stack_clobber(uint64_t clobber, ...);
/* YMM and ZMM registers on x86 are turned off to save power when they haven't
* been used for some period of time. When they are used there will be a
* "warmup" period during which performance will be reduced and inconsistent
* which is problematic when trying to benchmark individual functions. We can
* work around this by periodically issuing "dummy" instructions that uses
* those registers to keep them powered on. */
void checkasm_simd_warmup(void);
#define declare_new(ret, ...)\
ret (*checked_call)(void *, int, int, int, int, int, __VA_ARGS__) =\
(void *)checkasm_checked_call;
#define CLOB (UINT64_C(0xdeadbeefdeadbeef))
#define call_new(...)\
(checkasm_set_signal_handler_state(1),\
checkasm_simd_warmup(),\
checkasm_stack_clobber(CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB),\
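
The new comment and checkasm_simd_warmup() hook address the fact that idle YMM/ZMM units are powered down to save energy: the first wide-vector instructions after a quiet period run slower and less consistently, which skews per-function benchmarks. call_new() therefore issues a dummy vector operation before each measured call. The real warmup lives in hand-written assembly selected by CPU flag (see the WARMUP macro further down), but the idea can be sketched with intrinsics, assuming an AVX-capable build (-mavx or similar):

    #include <immintrin.h>

    /* Execute one 256-bit multiply so the YMM units leave their
     * low-power state before timing starts; the result is unused. */
    static void simd_warmup_avx(void) {
        __m256 v = _mm256_set1_ps(1.0f);
        v = _mm256_mul_ps(v, v);
        volatile float sink = _mm256_cvtss_f32(v); /* keep the multiply from being optimised away */
        (void)sink;
    }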

third_party/dav1d/tests/checkasm/filmgrain.c (vendored)

@ -49,29 +49,29 @@ static void check_gen_grny(const Dav1dFilmGrainDSPContext *const dsp) {
for (int i = 0; i < 4; i++) {
if (check_func(dsp->generate_grain_y, "gen_grain_y_ar%d_%dbpc", i, BITDEPTH)) {
Dav1dFilmGrainData fg_data;
fg_data.seed = rnd() & 0xFFFF;
ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 1,);
fg_data[0].seed = rnd() & 0xFFFF;
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#endif
fg_data.grain_scale_shift = rnd() & 3;
fg_data.ar_coeff_shift = (rnd() & 3) + 6;
fg_data.ar_coeff_lag = i;
const int num_y_pos = 2 * fg_data.ar_coeff_lag * (fg_data.ar_coeff_lag + 1);
fg_data[0].grain_scale_shift = rnd() & 3;
fg_data[0].ar_coeff_shift = (rnd() & 3) + 6;
fg_data[0].ar_coeff_lag = i;
const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1);
for (int n = 0; n < num_y_pos; n++)
fg_data.ar_coeffs_y[n] = (rnd() & 0xff) - 128;
fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128;
call_ref(grain_lut_c, &fg_data HIGHBD_TAIL_SUFFIX);
call_new(grain_lut_a, &fg_data HIGHBD_TAIL_SUFFIX);
call_ref(grain_lut_c, fg_data HIGHBD_TAIL_SUFFIX);
call_new(grain_lut_a, fg_data HIGHBD_TAIL_SUFFIX);
if (memcmp(grain_lut_c, grain_lut_a,
GRAIN_WIDTH * GRAIN_HEIGHT * sizeof(entry)))
{
fail();
}
bench_new(grain_lut_a, &fg_data HIGHBD_TAIL_SUFFIX);
bench_new(grain_lut_a, fg_data HIGHBD_TAIL_SUFFIX);
}
}
@ -97,38 +97,38 @@ static void check_gen_grnuv(const Dav1dFilmGrainDSPContext *const dsp) {
"gen_grain_uv_ar%d_%dbpc_%s",
i, BITDEPTH, ss_name[layout_idx]))
{
Dav1dFilmGrainData fg_data;
fg_data.seed = rnd() & 0xFFFF;
ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 1,);
fg_data[0].seed = rnd() & 0xFFFF;
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#endif
fg_data.num_y_points = rnd() & 1;
fg_data.grain_scale_shift = rnd() & 3;
fg_data.ar_coeff_shift = (rnd() & 3) + 6;
fg_data.ar_coeff_lag = i;
const int num_y_pos = 2 * fg_data.ar_coeff_lag * (fg_data.ar_coeff_lag + 1);
fg_data[0].num_y_points = rnd() & 1;
fg_data[0].grain_scale_shift = rnd() & 3;
fg_data[0].ar_coeff_shift = (rnd() & 3) + 6;
fg_data[0].ar_coeff_lag = i;
const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1);
for (int n = 0; n < num_y_pos; n++)
fg_data.ar_coeffs_y[n] = (rnd() & 0xff) - 128;
dsp->generate_grain_y(grain_lut_y, &fg_data HIGHBD_TAIL_SUFFIX);
fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128;
dsp->generate_grain_y(grain_lut_y, fg_data HIGHBD_TAIL_SUFFIX);
const int uv = rnd() & 1;
const int num_uv_pos = num_y_pos + !!fg_data.num_y_points;
const int num_uv_pos = num_y_pos + !!fg_data[0].num_y_points;
for (int n = 0; n < num_uv_pos; n++)
fg_data.ar_coeffs_uv[uv][n] = (rnd() & 0xff) - 128;
if (!fg_data.num_y_points)
fg_data.ar_coeffs_uv[uv][num_uv_pos] = 0;
fg_data[0].ar_coeffs_uv[uv][n] = (rnd() & 0xff) - 128;
if (!fg_data[0].num_y_points)
fg_data[0].ar_coeffs_uv[uv][num_uv_pos] = 0;
memset(grain_lut_c, 0xff, sizeof(grain_lut_c));
memset(grain_lut_a, 0xff, sizeof(grain_lut_a));
call_ref(grain_lut_c, grain_lut_y, &fg_data, uv HIGHBD_TAIL_SUFFIX);
call_new(grain_lut_a, grain_lut_y, &fg_data, uv HIGHBD_TAIL_SUFFIX);
call_ref(grain_lut_c, grain_lut_y, fg_data, uv HIGHBD_TAIL_SUFFIX);
call_new(grain_lut_a, grain_lut_y, fg_data, uv HIGHBD_TAIL_SUFFIX);
int diff = 0, w = ss_x ? 44 : GRAIN_WIDTH;
for (int y = 0; y < (ss_y ? 38 : GRAIN_HEIGHT); y++)
diff |= memcmp(grain_lut_a[y], grain_lut_c[y], w * sizeof(entry));
if (diff) fail();
bench_new(grain_lut_a, grain_lut_y, &fg_data, uv HIGHBD_TAIL_SUFFIX);
bench_new(grain_lut_a, grain_lut_y, fg_data, uv HIGHBD_TAIL_SUFFIX);
}
}
}
@ -137,9 +137,9 @@ static void check_gen_grnuv(const Dav1dFilmGrainDSPContext *const dsp) {
}
static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
ALIGN_STK_32(pixel, c_dst, 128 * 32,);
ALIGN_STK_32(pixel, a_dst, 128 * 32,);
ALIGN_STK_32(pixel, src, 128 * 32,);
ALIGN_STK_64(pixel, c_dst, 128 * 32,);
ALIGN_STK_64(pixel, a_dst, 128 * 32,);
ALIGN_STK_64(pixel, src, 128 * 32,);
const ptrdiff_t stride = 128 * sizeof(pixel);
declare_func(void, pixel *dst_row, const pixel *src_row, ptrdiff_t stride,
@ -149,8 +149,8 @@ static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
int bh, int row_num HIGHBD_DECL_SUFFIX);
if (check_func(dsp->fgy_32x32xn, "fgy_32x32xn_%dbpc", BITDEPTH)) {
Dav1dFilmGrainData fg_data;
fg_data.seed = rnd() & 0xFFFF;
ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 16,);
fg_data[0].seed = rnd() & 0xFFFF;
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
@ -160,23 +160,23 @@ static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
uint8_t scaling[SCALING_SIZE];
entry grain_lut[GRAIN_HEIGHT + 1][GRAIN_WIDTH];
fg_data.grain_scale_shift = rnd() & 3;
fg_data.ar_coeff_shift = (rnd() & 3) + 6;
fg_data.ar_coeff_lag = rnd() & 3;
const int num_y_pos = 2 * fg_data.ar_coeff_lag * (fg_data.ar_coeff_lag + 1);
fg_data[0].grain_scale_shift = rnd() & 3;
fg_data[0].ar_coeff_shift = (rnd() & 3) + 6;
fg_data[0].ar_coeff_lag = rnd() & 3;
const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1);
for (int n = 0; n < num_y_pos; n++)
fg_data.ar_coeffs_y[n] = (rnd() & 0xff) - 128;
dsp->generate_grain_y(grain_lut, &fg_data HIGHBD_TAIL_SUFFIX);
fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128;
dsp->generate_grain_y(grain_lut, fg_data HIGHBD_TAIL_SUFFIX);
fg_data.num_y_points = 2 + (rnd() % 13);
const int pad = 0xff / fg_data.num_y_points;
for (int n = 0; n < fg_data.num_y_points; n++) {
fg_data.y_points[n][0] = 0xff * n / fg_data.num_y_points;
fg_data.y_points[n][0] += rnd() % pad;
fg_data.y_points[n][1] = rnd() & 0xff;
fg_data[0].num_y_points = 2 + (rnd() % 13);
const int pad = 0xff / fg_data[0].num_y_points;
for (int n = 0; n < fg_data[0].num_y_points; n++) {
fg_data[0].y_points[n][0] = 0xff * n / fg_data[0].num_y_points;
fg_data[0].y_points[n][0] += rnd() % pad;
fg_data[0].y_points[n][1] = rnd() & 0xff;
}
generate_scaling(bitdepth_from_max(bitdepth_max), fg_data.y_points,
fg_data.num_y_points, scaling);
generate_scaling(bitdepth_from_max(bitdepth_max), fg_data[0].y_points,
fg_data[0].num_y_points, scaling);
const int w = 1 + (rnd() & 127);
const int h = 1 + (rnd() & 31);
@ -186,20 +186,20 @@ static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
src[y * PXSTRIDE(stride) + x] = rnd() & bitdepth_max;
const int row_num = rnd() & 1 ? rnd() & 0x7ff : 0;
fg_data.clip_to_restricted_range = rnd() & 1;
fg_data.scaling_shift = (rnd() & 3) + 8;
for (fg_data.overlap_flag = 0; fg_data.overlap_flag <= 1;
fg_data.overlap_flag++)
fg_data[0].clip_to_restricted_range = rnd() & 1;
fg_data[0].scaling_shift = (rnd() & 3) + 8;
for (fg_data[0].overlap_flag = 0; fg_data[0].overlap_flag <= 1;
fg_data[0].overlap_flag++)
{
call_ref(c_dst, src, stride, &fg_data, w, scaling, grain_lut, h,
call_ref(c_dst, src, stride, fg_data, w, scaling, grain_lut, h,
row_num HIGHBD_TAIL_SUFFIX);
call_new(a_dst, src, stride, &fg_data, w, scaling, grain_lut, h,
call_new(a_dst, src, stride, fg_data, w, scaling, grain_lut, h,
row_num HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst");
}
fg_data.overlap_flag = 1;
bench_new(a_dst, src, stride, &fg_data, 64, scaling, grain_lut, 32,
fg_data[0].overlap_flag = 1;
bench_new(a_dst, src, stride, fg_data, 64, scaling, grain_lut, 32,
row_num HIGHBD_TAIL_SUFFIX);
}
@ -207,10 +207,10 @@ static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
}
static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
ALIGN_STK_32(pixel, c_dst, 128 * 32,);
ALIGN_STK_32(pixel, a_dst, 128 * 32,);
ALIGN_STK_32(pixel, src, 128 * 32,);
ALIGN_STK_32(pixel, luma_src, 128 * 32,);
ALIGN_STK_64(pixel, c_dst, 128 * 32,);
ALIGN_STK_64(pixel, a_dst, 128 * 32,);
ALIGN_STK_64(pixel, src, 128 * 32,);
ALIGN_STK_64(pixel, luma_src, 128 * 32,);
const ptrdiff_t lstride = 128 * sizeof(pixel);
declare_func(void, pixel *dst_row, const pixel *src_row, ptrdiff_t stride,
@ -231,9 +231,9 @@ static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
"fguv_32x32xn_%dbpc_%s_csfl%d",
BITDEPTH, ss_name[layout_idx], csfl))
{
Dav1dFilmGrainData fg_data;
ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 1,);
fg_data.seed = rnd() & 0xFFFF;
fg_data[0].seed = rnd() & 0xFFFF;
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
@ -245,15 +245,18 @@ static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
uint8_t scaling[SCALING_SIZE];
entry grain_lut[2][GRAIN_HEIGHT + 1][GRAIN_WIDTH];
fg_data.grain_scale_shift = rnd() & 3;
fg_data.ar_coeff_shift = (rnd() & 3) + 6;
fg_data.ar_coeff_lag = rnd() & 3;
const int num_y_pos = 2 * fg_data.ar_coeff_lag * (fg_data.ar_coeff_lag + 1);
fg_data[0].grain_scale_shift = rnd() & 3;
fg_data[0].ar_coeff_shift = (rnd() & 3) + 6;
fg_data[0].ar_coeff_lag = rnd() & 3;
const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1);
for (int n = 0; n < num_y_pos; n++)
fg_data.ar_coeffs_y[n] = (rnd() & 0xff) - 128;
dsp->generate_grain_y(grain_lut[0], &fg_data HIGHBD_TAIL_SUFFIX);
fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128;
const int num_uv_pos = num_y_pos + 1;
for (int n = 0; n < num_uv_pos; n++)
fg_data[0].ar_coeffs_uv[uv_pl][n] = (rnd() & 0xff) - 128;
dsp->generate_grain_y(grain_lut[0], fg_data HIGHBD_TAIL_SUFFIX);
dsp->generate_grain_uv[layout_idx](grain_lut[1], grain_lut[0],
&fg_data, uv_pl HIGHBD_TAIL_SUFFIX);
fg_data, uv_pl HIGHBD_TAIL_SUFFIX);
const int w = 1 + (rnd() & (127 >> ss_x));
const int h = 1 + (rnd() & (31 >> ss_y));
@ -268,47 +271,47 @@ static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
const int row_num = rnd() & 1 ? rnd() & 0x7ff : 0;
if (csfl) {
fg_data.num_y_points = 2 + (rnd() % 13);
const int pad = 0xff / fg_data.num_y_points;
for (int n = 0; n < fg_data.num_y_points; n++) {
fg_data.y_points[n][0] = 0xff * n / fg_data.num_y_points;
fg_data.y_points[n][0] += rnd() % pad;
fg_data.y_points[n][1] = rnd() & 0xff;
fg_data[0].num_y_points = 2 + (rnd() % 13);
const int pad = 0xff / fg_data[0].num_y_points;
for (int n = 0; n < fg_data[0].num_y_points; n++) {
fg_data[0].y_points[n][0] = 0xff * n / fg_data[0].num_y_points;
fg_data[0].y_points[n][0] += rnd() % pad;
fg_data[0].y_points[n][1] = rnd() & 0xff;
}
generate_scaling(bitdepth_from_max(bitdepth_max), fg_data.y_points,
fg_data.num_y_points, scaling);
generate_scaling(bitdepth_from_max(bitdepth_max), fg_data[0].y_points,
fg_data[0].num_y_points, scaling);
} else {
fg_data.num_uv_points[uv_pl] = 2 + (rnd() % 9);
const int pad = 0xff / fg_data.num_uv_points[uv_pl];
for (int n = 0; n < fg_data.num_uv_points[uv_pl]; n++) {
fg_data.uv_points[uv_pl][n][0] = 0xff * n / fg_data.num_uv_points[uv_pl];
fg_data.uv_points[uv_pl][n][0] += rnd() % pad;
fg_data.uv_points[uv_pl][n][1] = rnd() & 0xff;
fg_data[0].num_uv_points[uv_pl] = 2 + (rnd() % 9);
const int pad = 0xff / fg_data[0].num_uv_points[uv_pl];
for (int n = 0; n < fg_data[0].num_uv_points[uv_pl]; n++) {
fg_data[0].uv_points[uv_pl][n][0] = 0xff * n / fg_data[0].num_uv_points[uv_pl];
fg_data[0].uv_points[uv_pl][n][0] += rnd() % pad;
fg_data[0].uv_points[uv_pl][n][1] = rnd() & 0xff;
}
generate_scaling(bitdepth_from_max(bitdepth_max), fg_data.uv_points[uv_pl],
fg_data.num_uv_points[uv_pl], scaling);
generate_scaling(bitdepth_from_max(bitdepth_max), fg_data[0].uv_points[uv_pl],
fg_data[0].num_uv_points[uv_pl], scaling);
fg_data.uv_mult[uv_pl] = (rnd() & 0xff) - 128;
fg_data.uv_luma_mult[uv_pl] = (rnd() & 0xff) - 128;
fg_data.uv_offset[uv_pl] = (rnd() & 0x1ff) - 256;
fg_data[0].uv_mult[uv_pl] = (rnd() & 0xff) - 128;
fg_data[0].uv_luma_mult[uv_pl] = (rnd() & 0xff) - 128;
fg_data[0].uv_offset[uv_pl] = (rnd() & 0x1ff) - 256;
}
fg_data.clip_to_restricted_range = rnd() & 1;
fg_data.scaling_shift = (rnd() & 3) + 8;
fg_data.chroma_scaling_from_luma = csfl;
for (fg_data.overlap_flag = 0; fg_data.overlap_flag <= 1;
fg_data.overlap_flag++)
fg_data[0].clip_to_restricted_range = rnd() & 1;
fg_data[0].scaling_shift = (rnd() & 3) + 8;
fg_data[0].chroma_scaling_from_luma = csfl;
for (fg_data[0].overlap_flag = 0; fg_data[0].overlap_flag <= 1;
fg_data[0].overlap_flag++)
{
call_ref(c_dst, src, stride, &fg_data, w, scaling, grain_lut[1], h,
call_ref(c_dst, src, stride, fg_data, w, scaling, grain_lut[1], h,
row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
call_new(a_dst, src, stride, &fg_data, w, scaling, grain_lut[1], h,
call_new(a_dst, src, stride, fg_data, w, scaling, grain_lut[1], h,
row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst");
}
fg_data.overlap_flag = 1;
bench_new(a_dst, src, stride, &fg_data, 32, scaling, grain_lut[1], 16,
fg_data[0].overlap_flag = 1;
bench_new(a_dst, src, stride, fg_data, 32, scaling, grain_lut[1], 16,
row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
}
}

third_party/dav1d/tests/checkasm/ipred.c (vendored)

@ -66,9 +66,9 @@ static const uint8_t z_angles[27] = {
};
static void check_intra_pred(Dav1dIntraPredDSPContext *const c) {
ALIGN_STK_32(pixel, c_dst, 64 * 64,);
ALIGN_STK_32(pixel, a_dst, 64 * 64,);
ALIGN_STK_32(pixel, topleft_buf, 257,);
ALIGN_STK_64(pixel, c_dst, 64 * 64,);
ALIGN_STK_64(pixel, a_dst, 64 * 64,);
ALIGN_STK_64(pixel, topleft_buf, 257,);
pixel *const topleft = topleft_buf + 128;
declare_func(void, pixel *dst, ptrdiff_t stride, const pixel *topleft,
@ -132,9 +132,9 @@ static void check_intra_pred(Dav1dIntraPredDSPContext *const c) {
}
static void check_cfl_ac(Dav1dIntraPredDSPContext *const c) {
ALIGN_STK_32(int16_t, c_dst, 32 * 32,);
ALIGN_STK_32(int16_t, a_dst, 32 * 32,);
ALIGN_STK_32(pixel, luma, 32 * 32,);
ALIGN_STK_64(int16_t, c_dst, 32 * 32,);
ALIGN_STK_64(int16_t, a_dst, 32 * 32,);
ALIGN_STK_64(pixel, luma, 32 * 32,);
declare_func(void, int16_t *ac, const pixel *y, ptrdiff_t stride,
int w_pad, int h_pad, int cw, int ch);
@ -175,10 +175,10 @@ static void check_cfl_ac(Dav1dIntraPredDSPContext *const c) {
}
static void check_cfl_pred(Dav1dIntraPredDSPContext *const c) {
ALIGN_STK_32(pixel, c_dst, 32 * 32,);
ALIGN_STK_32(pixel, a_dst, 32 * 32,);
ALIGN_STK_32(int16_t, ac, 32 * 32,);
ALIGN_STK_32(pixel, topleft_buf, 257,);
ALIGN_STK_64(pixel, c_dst, 32 * 32,);
ALIGN_STK_64(pixel, a_dst, 32 * 32,);
ALIGN_STK_64(int16_t, ac, 32 * 32,);
ALIGN_STK_64(pixel, topleft_buf, 257,);
pixel *const topleft = topleft_buf + 128;
declare_func(void, pixel *dst, ptrdiff_t stride, const pixel *topleft,
@ -227,9 +227,9 @@ static void check_cfl_pred(Dav1dIntraPredDSPContext *const c) {
}
static void check_pal_pred(Dav1dIntraPredDSPContext *const c) {
ALIGN_STK_32(pixel, c_dst, 64 * 64,);
ALIGN_STK_32(pixel, a_dst, 64 * 64,);
ALIGN_STK_32(uint8_t, idx, 64 * 64,);
ALIGN_STK_64(pixel, c_dst, 64 * 64,);
ALIGN_STK_64(pixel, a_dst, 64 * 64,);
ALIGN_STK_64(uint8_t, idx, 64 * 64,);
ALIGN_STK_16(uint16_t, pal, 8,);
declare_func(void, pixel *dst, ptrdiff_t stride, const uint16_t *pal,

third_party/dav1d/tests/checkasm/itx.c (vendored)

@ -226,9 +226,9 @@ void bitfn(checkasm_check_itx)(void) {
Dav1dInvTxfmDSPContext c;
bitfn(dav1d_itx_dsp_init)(&c);
ALIGN_STK_32(coef, coeff, 2, [32 * 32]);
ALIGN_STK_32(pixel, c_dst, 64 * 64,);
ALIGN_STK_32(pixel, a_dst, 64 * 64,);
ALIGN_STK_64(coef, coeff, 2, [32 * 32]);
ALIGN_STK_64(pixel, c_dst, 64 * 64,);
ALIGN_STK_64(pixel, a_dst, 64 * 64,);
static const uint8_t txfm_size_order[N_RECT_TX_SIZES] = {
TX_4X4, RTX_4X8, RTX_4X16,

third_party/dav1d/tests/checkasm/loopfilter.c (vendored)

@ -95,8 +95,8 @@ static void check_lpf_sb(loopfilter_sb_fn fn, const char *const name,
const int n_blks, const int lf_idx,
const int is_chroma, const int dir)
{
ALIGN_STK_32(pixel, c_dst_mem, 128 * 16,);
ALIGN_STK_32(pixel, a_dst_mem, 128 * 16,);
ALIGN_STK_64(pixel, c_dst_mem, 128 * 16,);
ALIGN_STK_64(pixel, a_dst_mem, 128 * 16,);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const uint32_t *mask,
const uint8_t (*l)[4], ptrdiff_t b4_stride,

third_party/dav1d/tests/checkasm/looprestoration.c (vendored)

@ -43,10 +43,10 @@ static void init_tmp(pixel *buf, const ptrdiff_t stride,
}
}
static void check_wiener(Dav1dLoopRestorationDSPContext *const c) {
ALIGN_STK_32(pixel, c_dst, 448 * 64,);
ALIGN_STK_32(pixel, a_dst, 448 * 64,);
ALIGN_STK_32(pixel, h_edge, 448 * 8,);
static void check_wiener(Dav1dLoopRestorationDSPContext *const c, const int bpc) {
ALIGN_STK_64(pixel, c_dst, 448 * 64,);
ALIGN_STK_64(pixel, a_dst, 448 * 64,);
ALIGN_STK_64(pixel, h_edge, 448 * 8,);
pixel left[64][4];
declare_func(void, pixel *dst, ptrdiff_t dst_stride,
@ -58,7 +58,7 @@ static void check_wiener(Dav1dLoopRestorationDSPContext *const c) {
for (int pl = 0; pl < 2; pl++) {
if (check_func(c->wiener, "wiener_%s_%dbpc",
pl ? "chroma" : "luma", BITDEPTH))
pl ? "chroma" : "luma", bpc))
{
int16_t filter[2][3], filter_v[7], filter_h[7];
@ -81,11 +81,7 @@ static void check_wiener(Dav1dLoopRestorationDSPContext *const c) {
const int base_w = 1 + (rnd() % 384);
const int base_h = 1 + (rnd() & 63);
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
const int bitdepth_max = 0xff;
#endif
const int bitdepth_max = (1 << bpc) - 1;
init_tmp(c_dst, 448 * sizeof(pixel), 448, 64, bitdepth_max);
init_tmp(h_edge, 448 * sizeof(pixel), 448, 8, bitdepth_max);
@ -112,13 +108,12 @@ static void check_wiener(Dav1dLoopRestorationDSPContext *const c) {
256, 64, filter_h, filter_v, 0xf HIGHBD_TAIL_SUFFIX);
}
}
report("wiener");
}
static void check_sgr(Dav1dLoopRestorationDSPContext *const c) {
ALIGN_STK_32(pixel, c_dst, 448 * 64,);
ALIGN_STK_32(pixel, a_dst, 448 * 64,);
ALIGN_STK_32(pixel, h_edge, 448 * 8,);
static void check_sgr(Dav1dLoopRestorationDSPContext *const c, const int bpc) {
ALIGN_STK_64(pixel, c_dst, 448 * 64,);
ALIGN_STK_64(pixel, a_dst, 448 * 64,);
ALIGN_STK_64(pixel, h_edge, 448 * 8,);
pixel left[64][4];
declare_func(void, pixel *dst, ptrdiff_t dst_stride,
@ -130,7 +125,7 @@ static void check_sgr(Dav1dLoopRestorationDSPContext *const c) {
for (int sgr_idx = 14; sgr_idx >= 6; sgr_idx -= 4) {
if (check_func(c->selfguided, "selfguided_%s_%dbpc",
sgr_idx == 6 ? "mix" : sgr_idx == 10 ? "3x3" : "5x5", BITDEPTH))
sgr_idx == 6 ? "mix" : sgr_idx == 10 ? "3x3" : "5x5", bpc))
{
int16_t sgr_wt[2];
@ -140,11 +135,7 @@ static void check_sgr(Dav1dLoopRestorationDSPContext *const c) {
const int base_w = 1 + (rnd() % 384);
const int base_h = 1 + (rnd() & 63);
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
const int bitdepth_max = 0xff;
#endif
const int bitdepth_max = (1 << bpc) - 1;
init_tmp(c_dst, 448 * sizeof(pixel), 448, 64, bitdepth_max);
init_tmp(h_edge, 448 * sizeof(pixel), 448, 8, bitdepth_max);
@ -171,14 +162,24 @@ static void check_sgr(Dav1dLoopRestorationDSPContext *const c) {
256, 64, sgr_idx, sgr_wt, 0xf HIGHBD_TAIL_SUFFIX);
}
}
report("sgr");
}
void bitfn(checkasm_check_looprestoration)(void) {
Dav1dLoopRestorationDSPContext c;
bitfn(dav1d_loop_restoration_dsp_init)(&c);
check_wiener(&c);
check_sgr(&c);
#if BITDEPTH == 16
const int bpc_min = 10, bpc_max = 12;
#else
const int bpc_min = 8, bpc_max = 8;
#endif
for (int bpc = bpc_min; bpc <= bpc_max; bpc += 2) {
Dav1dLoopRestorationDSPContext c;
bitfn(dav1d_loop_restoration_dsp_init)(&c, bpc);
check_wiener(&c, bpc);
}
report("wiener");
for (int bpc = bpc_min; bpc <= bpc_max; bpc += 2) {
Dav1dLoopRestorationDSPContext c;
bitfn(dav1d_loop_restoration_dsp_init)(&c, bpc);
check_sgr(&c, bpc);
}
report("sgr");
}

third_party/dav1d/tests/checkasm/mc.c (vendored)

@ -55,9 +55,9 @@ static int mc_h_next(const int h) {
}
static void check_mc(Dav1dMCDSPContext *const c) {
ALIGN_STK_32(pixel, src_buf, 135 * 135,);
ALIGN_STK_32(pixel, c_dst, 128 * 128,);
ALIGN_STK_32(pixel, a_dst, 128 * 128,);
ALIGN_STK_64(pixel, src_buf, 135 * 135,);
ALIGN_STK_64(pixel, c_dst, 128 * 128,);
ALIGN_STK_64(pixel, a_dst, 128 * 128,);
const pixel *src = src_buf + 135 * 3 + 3;
const ptrdiff_t src_stride = 135 * sizeof(pixel);
@ -118,9 +118,9 @@ static void generate_mct_input(pixel *const buf, const int bitdepth_max) {
}
static void check_mct(Dav1dMCDSPContext *const c) {
ALIGN_STK_32(pixel, src_buf, 135 * 135,);
ALIGN_STK_32(int16_t, c_tmp, 128 * 128,);
ALIGN_STK_32(int16_t, a_tmp, 128 * 128,);
ALIGN_STK_64(pixel, src_buf, 135 * 135,);
ALIGN_STK_64(int16_t, c_tmp, 128 * 128,);
ALIGN_STK_64(int16_t, a_tmp, 128 * 128,);
const pixel *src = src_buf + 135 * 3 + 3;
const ptrdiff_t src_stride = 135 * sizeof(pixel);
@ -173,9 +173,9 @@ static void init_tmp(Dav1dMCDSPContext *const c, pixel *const buf,
}
static void check_avg(Dav1dMCDSPContext *const c) {
ALIGN_STK_32(int16_t, tmp, 2, [128 * 128]);
ALIGN_STK_32(pixel, c_dst, 135 * 135,);
ALIGN_STK_32(pixel, a_dst, 128 * 128,);
ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
ALIGN_STK_64(pixel, c_dst, 135 * 135,);
ALIGN_STK_64(pixel, a_dst, 128 * 128,);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
const int16_t *tmp2, int w, int h HIGHBD_DECL_SUFFIX);
@ -204,9 +204,9 @@ static void check_avg(Dav1dMCDSPContext *const c) {
}
static void check_w_avg(Dav1dMCDSPContext *const c) {
ALIGN_STK_32(int16_t, tmp, 2, [128 * 128]);
ALIGN_STK_32(pixel, c_dst, 135 * 135,);
ALIGN_STK_32(pixel, a_dst, 128 * 128,);
ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
ALIGN_STK_64(pixel, c_dst, 135 * 135,);
ALIGN_STK_64(pixel, a_dst, 128 * 128,);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
const int16_t *tmp2, int w, int h, int weight HIGHBD_DECL_SUFFIX);
@ -236,10 +236,10 @@ static void check_w_avg(Dav1dMCDSPContext *const c) {
}
static void check_mask(Dav1dMCDSPContext *const c) {
ALIGN_STK_32(int16_t, tmp, 2, [128 * 128]);
ALIGN_STK_32(pixel, c_dst, 135 * 135,);
ALIGN_STK_32(pixel, a_dst, 128 * 128,);
ALIGN_STK_32(uint8_t, mask, 128 * 128,);
ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
ALIGN_STK_64(pixel, c_dst, 135 * 135,);
ALIGN_STK_64(pixel, a_dst, 128 * 128,);
ALIGN_STK_64(uint8_t, mask, 128 * 128,);
for (int i = 0; i < 128 * 128; i++)
mask[i] = rnd() % 65;
@ -271,11 +271,11 @@ static void check_mask(Dav1dMCDSPContext *const c) {
}
static void check_w_mask(Dav1dMCDSPContext *const c) {
ALIGN_STK_32(int16_t, tmp, 2, [128 * 128]);
ALIGN_STK_32(pixel, c_dst, 135 * 135,);
ALIGN_STK_32(pixel, a_dst, 128 * 128,);
ALIGN_STK_32(uint8_t, c_mask, 128 * 128,);
ALIGN_STK_32(uint8_t, a_mask, 128 * 128,);
ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
ALIGN_STK_64(pixel, c_dst, 135 * 135,);
ALIGN_STK_64(pixel, a_dst, 128 * 128,);
ALIGN_STK_64(uint8_t, c_mask, 128 * 128,);
ALIGN_STK_64(uint8_t, a_mask, 128 * 128,);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
const int16_t *tmp2, int w, int h, uint8_t *mask, int sign
@ -321,10 +321,10 @@ static void check_w_mask(Dav1dMCDSPContext *const c) {
}
static void check_blend(Dav1dMCDSPContext *const c) {
ALIGN_STK_32(pixel, tmp, 32 * 32,);
ALIGN_STK_32(pixel, c_dst, 32 * 32,);
ALIGN_STK_32(pixel, a_dst, 32 * 32,);
ALIGN_STK_32(uint8_t, mask, 32 * 32,);
ALIGN_STK_64(pixel, tmp, 32 * 32,);
ALIGN_STK_64(pixel, c_dst, 32 * 32,);
ALIGN_STK_64(pixel, a_dst, 32 * 32,);
ALIGN_STK_64(uint8_t, mask, 32 * 32,);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
int w, int h, const uint8_t *mask);
@ -357,9 +357,9 @@ static void check_blend(Dav1dMCDSPContext *const c) {
}
static void check_blend_v(Dav1dMCDSPContext *const c) {
ALIGN_STK_32(pixel, tmp, 32 * 128,);
ALIGN_STK_32(pixel, c_dst, 32 * 128,);
ALIGN_STK_32(pixel, a_dst, 32 * 128,);
ALIGN_STK_64(pixel, tmp, 32 * 128,);
ALIGN_STK_64(pixel, c_dst, 32 * 128,);
ALIGN_STK_64(pixel, a_dst, 32 * 128,);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
int w, int h);
@ -391,9 +391,9 @@ static void check_blend_v(Dav1dMCDSPContext *const c) {
}
static void check_blend_h(Dav1dMCDSPContext *const c) {
ALIGN_STK_32(pixel, tmp, 128 * 32,);
ALIGN_STK_32(pixel, c_dst, 128 * 32,);
ALIGN_STK_32(pixel, a_dst, 128 * 32,);
ALIGN_STK_64(pixel, tmp, 128 * 32,);
ALIGN_STK_64(pixel, c_dst, 128 * 32,);
ALIGN_STK_64(pixel, a_dst, 128 * 32,);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
int w, int h);
@ -424,9 +424,9 @@ static void check_blend_h(Dav1dMCDSPContext *const c) {
}
static void check_warp8x8(Dav1dMCDSPContext *const c) {
ALIGN_STK_32(pixel, src_buf, 15 * 15,);
ALIGN_STK_32(pixel, c_dst, 8 * 8,);
ALIGN_STK_32(pixel, a_dst, 8 * 8,);
ALIGN_STK_64(pixel, src_buf, 15 * 15,);
ALIGN_STK_64(pixel, c_dst, 8 * 8,);
ALIGN_STK_64(pixel, a_dst, 8 * 8,);
int16_t abcd[4];
const pixel *src = src_buf + 15 * 3 + 3;
const ptrdiff_t dst_stride = 8 * sizeof(pixel);
@ -462,9 +462,9 @@ static void check_warp8x8(Dav1dMCDSPContext *const c) {
}
static void check_warp8x8t(Dav1dMCDSPContext *const c) {
ALIGN_STK_32(pixel, src_buf, 15 * 15,);
ALIGN_STK_32(int16_t, c_tmp, 8 * 8,);
ALIGN_STK_32(int16_t, a_tmp, 8 * 8,);
ALIGN_STK_64(pixel, src_buf, 15 * 15,);
ALIGN_STK_64(int16_t, c_tmp, 8 * 8,);
ALIGN_STK_64(int16_t, a_tmp, 8 * 8,);
int16_t abcd[4];
const pixel *src = src_buf + 15 * 3 + 3;
const ptrdiff_t src_stride = 15 * sizeof(pixel);
@ -534,9 +534,9 @@ static void random_offset_for_edge(int *const x, int *const y,
}
static void check_emuedge(Dav1dMCDSPContext *const c) {
ALIGN_STK_32(pixel, c_dst, 135 * 192,);
ALIGN_STK_32(pixel, a_dst, 135 * 192,);
ALIGN_STK_32(pixel, src, 160 * 160,);
ALIGN_STK_64(pixel, c_dst, 135 * 192,);
ALIGN_STK_64(pixel, a_dst, 135 * 192,);
ALIGN_STK_64(pixel, src, 160 * 160,);
for (int i = 0; i < 160 * 160; i++)
src[i] = rnd() & ((1U << BITDEPTH) - 1);

third_party/dav1d/tests/checkasm/msac.c (vendored)

@ -258,6 +258,12 @@ void checkasm_check_msac(void) {
c.bool = dav1d_msac_decode_bool_sse2;
c.hi_tok = dav1d_msac_decode_hi_tok_sse2;
}
#if ARCH_X86_64
if (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_AVX2) {
c.symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2;
}
#endif
#endif
uint8_t buf[BUF_SIZE];

third_party/dav1d/tests/checkasm/x86/checkasm.asm (vendored)

@ -170,6 +170,19 @@ cglobal checked_call, 2,15,16,max_args*8+8
.ok:
RET
; trigger a warmup of vector units
%macro WARMUP 0
cglobal warmup, 0, 0
xorps m0, m0
mulps m0, m0
RET
%endmacro
INIT_YMM avx2
WARMUP
INIT_ZMM avx512
WARMUP
%else
; just random numbers to reduce the chance of incidental match

third_party/dav1d/tests/libfuzzer/meson.build (vendored, new file)

@ -0,0 +1,98 @@
# Copyright © 2020, VideoLAN and dav1d authors
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Build definition for the dav1d fuzzing binaries
#
dav1d_fuzzer_sources = files('dav1d_fuzzer.c')
fuzzer_ldflags = []
fuzzer_link_lang = {}
if get_option('fuzzer_ldflags') != ''
fuzzer_ldflags += [get_option('fuzzer_ldflags')]
endif
if fuzzing_engine == 'none'
dav1d_fuzzer_sources += files('main.c')
elif fuzzing_engine == 'libfuzzer'
fuzzer_ldflags += ['-fsanitize=fuzzer']
elif fuzzing_engine == 'oss-fuzz'
# libFuzzingEngine needs c++
add_languages('cpp')
fuzzer_link_lang = {'link_language': 'cpp'}
endif
dav1d_fuzzer = executable('dav1d_fuzzer',
dav1d_fuzzer_sources,
include_directories: dav1d_inc_dirs,
c_args: [stackalign_flag, stackrealign_flag],
link_args: fuzzer_ldflags,
link_with : libdav1d,
build_by_default: true,
dependencies : [thread_dependency],
kwargs: fuzzer_link_lang
)
dav1d_fuzzer_mt = executable('dav1d_fuzzer_mt',
dav1d_fuzzer_sources,
include_directories: dav1d_inc_dirs,
c_args: [stackalign_flag, stackrealign_flag, '-DDAV1D_MT_FUZZING'],
link_args: fuzzer_ldflags,
link_with : libdav1d,
build_by_default: true,
dependencies : [thread_dependency],
kwargs: fuzzer_link_lang
)
objcopy = find_program('objcopy',
required: false)
if (objcopy.found() and
not get_option('b_lto') and
get_option('default_library') == 'static' and
cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args))
libdav1d_af = custom_target('libdav1d_af',
input: libdav1d,
output: 'libdav1d_af.a',
depends: libdav1d,
command: [objcopy,
'--redefine-sym', 'malloc=__wrap_malloc',
'--redefine-sym', 'posix_memalign=__wrap_posix_memalign',
'--redefine-sym', 'pthread_create=__wrap_pthread_create',
'--redefine-sym', 'pthread_cond_init=__wrap_pthread_cond_init',
'--redefine-sym', 'pthread_mutex_init=__wrap_pthread_mutex_init',
'@INPUT@', '@OUTPUT@'])
dav1d_fuzzer_mem = executable('dav1d_fuzzer_mem',
dav1d_fuzzer_sources + ['alloc_fail.c'],
include_directories: dav1d_inc_dirs,
c_args: [stackalign_flag, stackrealign_flag, '-DDAV1D_ALLOC_FAIL'],
link_args: fuzzer_ldflags + [join_paths(libdav1d_af.full_path())],
link_depends: libdav1d_af,
build_by_default: false,
dependencies : [thread_dependency],
kwargs: fuzzer_link_lang
)
endif

third_party/dav1d/tests/meson.build (vendored)

@ -90,76 +90,20 @@ if is_asm_enabled
include_directories: dav1d_inc_dirs,
c_args: [stackalign_flag, stackrealign_flag],
build_by_default: false,
dependencies : [thread_dependency, rt_dependency, m_lib],
dependencies : [
thread_dependency,
rt_dependency,
libdl_dependency,
m_lib,
],
)
test('checkasm', checkasm, is_parallel: false)
endif
dav1d_fuzzer_sources = files('libfuzzer/dav1d_fuzzer.c')
fuzzer_ldflags = []
if get_option('fuzzer_ldflags') != ''
fuzzer_ldflags += [get_option('fuzzer_ldflags')]
endif
if fuzzing_engine == 'none'
dav1d_fuzzer_sources += files('libfuzzer/main.c')
elif fuzzing_engine == 'libfuzzer'
fuzzer_ldflags += ['-fsanitize=fuzzer']
elif fuzzing_engine == 'oss-fuzz'
# libFuzzingEngine needs libc++
fuzzer_ldflags += ['-lc++']
endif
dav1d_fuzzer = executable('dav1d_fuzzer',
dav1d_fuzzer_sources,
include_directories: dav1d_inc_dirs,
c_args: [stackalign_flag, stackrealign_flag],
link_args: fuzzer_ldflags,
link_with : libdav1d,
build_by_default: true,
dependencies : [thread_dependency],
)
dav1d_fuzzer_mt = executable('dav1d_fuzzer_mt',
dav1d_fuzzer_sources,
include_directories: dav1d_inc_dirs,
c_args: [stackalign_flag, stackrealign_flag, '-DDAV1D_MT_FUZZING'],
link_args: fuzzer_ldflags,
link_with : libdav1d,
build_by_default: true,
dependencies : [thread_dependency],
)
objcopy = find_program('objcopy',
required: false)
if (objcopy.found() and
not get_option('b_lto') and
get_option('default_library') == 'static' and
cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args))
libdav1d_af = custom_target('libdav1d_af',
input: libdav1d,
output: 'libdav1d_af.a',
depends: libdav1d,
command: [objcopy,
'--redefine-sym', 'malloc=__wrap_malloc',
'--redefine-sym', 'posix_memalign=__wrap_posix_memalign',
'--redefine-sym', 'pthread_create=__wrap_pthread_create',
'--redefine-sym', 'pthread_cond_init=__wrap_pthread_cond_init',
'--redefine-sym', 'pthread_mutex_init=__wrap_pthread_mutex_init',
'@INPUT@', '@OUTPUT@'])
dav1d_fuzzer_mem = executable('dav1d_fuzzer_mem',
dav1d_fuzzer_sources + ['libfuzzer/alloc_fail.c'],
include_directories: dav1d_inc_dirs,
c_args: [stackalign_flag, stackrealign_flag, '-DDAV1D_ALLOC_FAIL'],
link_args: fuzzer_ldflags + [join_paths(libdav1d_af.full_path())],
link_depends: libdav1d_af,
build_by_default: false,
dependencies : [thread_dependency],
)
# fuzzing binaries
if meson.version().version_compare('>=0.49')
subdir('libfuzzer')
endif
# Include dav1d test data repository with additional tests

third_party/dav1d/tools/dav1d.c (vendored)

@ -113,18 +113,24 @@ static void synchronize(const int realtime, const unsigned cache,
static void print_stats(const int istty, const unsigned n, const unsigned num,
const uint64_t elapsed, const double i_fps)
{
if (istty) fputs("\r", stderr);
const double d_fps = 1e9 * n / elapsed;
const double speed = d_fps / i_fps;
if (num == 0xFFFFFFFF) {
fprintf(stderr, "Decoded %u frames", n);
} else {
fprintf(stderr, "Decoded %u/%u frames (%.1lf%%)", n, num,
100.0 * n / num);
char buf[80], *b = buf, *const end = buf + 80;
if (istty)
*b++ = '\r';
if (num == 0xFFFFFFFF)
b += snprintf(b, end - b, "Decoded %u frames", n);
else
b += snprintf(b, end - b, "Decoded %u/%u frames (%.1lf%%)",
n, num, 100.0 * n / num);
if (i_fps && b < end) {
const double d_fps = 1e9 * n / elapsed;
const double speed = d_fps / i_fps;
b += snprintf(b, end - b, " - %.2lf/%.2lf fps (%.2lfx)",
d_fps, i_fps, speed);
}
if (i_fps)
fprintf(stderr, " - %.2lf/%.2lf fps (%.2lfx)", d_fps, i_fps, speed);
if (!istty) fputs("\n", stderr);
if (!istty)
strcpy(b > end - 2 ? end - 2 : b, "\n");
fputs(buf, stderr);
}
int main(const int argc, char *const *const argv) {
@ -149,8 +155,6 @@ int main(const int argc, char *const *const argv) {
return EXIT_FAILURE;
}
init_demuxers();
init_muxers();
parse(argc, argv, &cli_settings, &lib_settings);
if ((res = input_open(&in, cli_settings.demuxer,
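
print_stats() is reworked to assemble the whole status line in a stack buffer and emit it with a single fputs(): the fps/speed figures are only appended when an input frame rate is known, the leading '\r' keeps a tty rewriting the same line, and non-tty output gets one trailing newline. A reduced sketch of that build-then-write approach, with simplified arguments rather than the tool's real ones:

    #include <stdio.h>
    #include <string.h>

    /* Build a one-line progress report, then write it in one call. */
    static void print_progress(int istty, unsigned done, unsigned total) {
        char buf[80], *b = buf, *const end = buf + sizeof(buf);
        if (istty)
            *b++ = '\r';          /* overwrite the previous status line */
        b += snprintf(b, end - b, "Decoded %u/%u frames", done, total);
        if (!istty)
            strcpy(b > end - 2 ? end - 2 : b, "\n");
        fputs(buf, stderr);
    }

The clamp before strcpy() mirrors the patch: snprintf() returns the length it would have written, so b can end up pointing past the buffer when the text is truncated and must not be used unchecked.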

third_party/dav1d/tools/dav1d_cli_parse.c (vendored)

@ -86,7 +86,7 @@ static const struct option long_opts[] = {
#define ALLOWED_CPU_MASKS " or 'neon'"
#elif ARCH_X86
#define ALLOWED_CPU_MASKS \
", 'sse2', 'ssse3', 'sse41', 'avx2' or 'avx512'"
", 'sse2', 'ssse3', 'sse41', 'avx2', 'avx512' or 'avx512icl'"
#else
#define ALLOWED_CPU_MASKS "not yet implemented for this architecture"
#endif
@ -176,15 +176,11 @@ typedef struct EnumParseTable {
#if ARCH_X86
enum CpuMask {
X86_CPU_MASK_SSE = DAV1D_X86_CPU_FLAG_SSE,
X86_CPU_MASK_SSE2 = DAV1D_X86_CPU_FLAG_SSE2 | X86_CPU_MASK_SSE,
X86_CPU_MASK_SSE3 = DAV1D_X86_CPU_FLAG_SSE3 | X86_CPU_MASK_SSE2,
X86_CPU_MASK_SSSE3 = DAV1D_X86_CPU_FLAG_SSSE3 | X86_CPU_MASK_SSE3,
X86_CPU_MASK_SSE41 = DAV1D_X86_CPU_FLAG_SSE41 | X86_CPU_MASK_SSSE3,
X86_CPU_MASK_SSE42 = DAV1D_X86_CPU_FLAG_SSE42 | X86_CPU_MASK_SSE41,
X86_CPU_MASK_AVX = DAV1D_X86_CPU_FLAG_AVX | X86_CPU_MASK_SSE42,
X86_CPU_MASK_AVX2 = DAV1D_X86_CPU_FLAG_AVX2 | X86_CPU_MASK_AVX,
X86_CPU_MASK_AVX512 = DAV1D_X86_CPU_FLAG_AVX512 | X86_CPU_MASK_AVX2,
X86_CPU_MASK_SSE2 = DAV1D_X86_CPU_FLAG_SSE2,
X86_CPU_MASK_SSSE3 = DAV1D_X86_CPU_FLAG_SSSE3 | X86_CPU_MASK_SSE2,
X86_CPU_MASK_SSE41 = DAV1D_X86_CPU_FLAG_SSE41 | X86_CPU_MASK_SSSE3,
X86_CPU_MASK_AVX2 = DAV1D_X86_CPU_FLAG_AVX2 | X86_CPU_MASK_SSE41,
X86_CPU_MASK_AVX512ICL = DAV1D_X86_CPU_FLAG_AVX512ICL | X86_CPU_MASK_AVX2,
};
#endif
@ -192,11 +188,11 @@ static const EnumParseTable cpu_mask_tbl[] = {
#if ARCH_AARCH64 || ARCH_ARM
{ "neon", DAV1D_ARM_CPU_FLAG_NEON },
#elif ARCH_X86
{ "sse2", X86_CPU_MASK_SSE2 },
{ "ssse3", X86_CPU_MASK_SSSE3 },
{ "sse41", X86_CPU_MASK_SSE41 },
{ "avx2", X86_CPU_MASK_AVX2 },
{ "avx512", X86_CPU_MASK_AVX512 },
{ "sse2", X86_CPU_MASK_SSE2 },
{ "ssse3", X86_CPU_MASK_SSSE3 },
{ "sse41", X86_CPU_MASK_SSE41 },
{ "avx2", X86_CPU_MASK_AVX2 },
{ "avx512icl", X86_CPU_MASK_AVX512ICL },
#endif
{ 0 },
};

third_party/dav1d/tools/input/input.c (vendored)

@ -43,21 +43,15 @@ struct DemuxerContext {
const Demuxer *impl;
};
#define MAX_NUM_DEMUXERS 3
static const Demuxer *demuxers[MAX_NUM_DEMUXERS];
static int num_demuxers = 0;
#define register_demuxer(impl) { \
extern const Demuxer impl; \
assert(num_demuxers < MAX_NUM_DEMUXERS); \
demuxers[num_demuxers++] = &impl; \
}
void init_demuxers(void) {
register_demuxer(ivf_demuxer);
register_demuxer(annexb_demuxer);
register_demuxer(section5_demuxer);
}
extern const Demuxer ivf_demuxer;
extern const Demuxer annexb_demuxer;
extern const Demuxer section5_demuxer;
static const Demuxer *const demuxers[] = {
&ivf_demuxer,
&annexb_demuxer,
&section5_demuxer,
NULL
};
int input_open(DemuxerContext **const c_out,
const char *const name, const char *const filename,
@ -68,19 +62,19 @@ int input_open(DemuxerContext **const c_out,
int res, i;
if (name) {
for (i = 0; i < num_demuxers; i++) {
for (i = 0; demuxers[i]; i++) {
if (!strcmp(demuxers[i]->name, name)) {
impl = demuxers[i];
break;
}
}
if (i == num_demuxers) {
if (!demuxers[i]) {
fprintf(stderr, "Failed to find demuxer named \"%s\"\n", name);
return DAV1D_ERR(ENOPROTOOPT);
}
} else {
int probe_sz = 0;
for (i = 0; i < num_demuxers; i++)
for (i = 0; demuxers[i]; i++)
probe_sz = imax(probe_sz, demuxers[i]->probe_sz);
uint8_t *const probe_data = malloc(probe_sz);
if (!probe_data) {
@ -96,14 +90,14 @@ int input_open(DemuxerContext **const c_out,
return errno ? DAV1D_ERR(errno) : DAV1D_ERR(EIO);
}
for (i = 0; i < num_demuxers; i++) {
for (i = 0; demuxers[i]; i++) {
if (demuxers[i]->probe(probe_data)) {
impl = demuxers[i];
break;
}
}
free(probe_data);
if (i == num_demuxers) {
if (!demuxers[i]) {
fprintf(stderr,
"Failed to probe demuxer for file %s\n",
filename);
@ -111,11 +105,10 @@ int input_open(DemuxerContext **const c_out,
}
}
if (!(c = malloc(sizeof(DemuxerContext) + impl->priv_data_size))) {
if (!(c = calloc(1, sizeof(DemuxerContext) + impl->priv_data_size))) {
fprintf(stderr, "Failed to allocate memory\n");
return DAV1D_ERR(ENOMEM);
}
memset(c, 0, sizeof(DemuxerContext) + impl->priv_data_size);
c->impl = impl;
c->data = (DemuxerPriv *) &c[1];
if ((res = impl->open(c->data, filename, fps, num_frames, timebase)) < 0) {
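
input.c also replaces the init_demuxers()/register_demuxer() machinery with a static, NULL-terminated table, so both the by-name lookup and the probe loop simply walk the array until the sentinel and there is no registration call to forget. The shape of that pattern in isolation, with a made-up demuxer entry rather than dav1d's real ones (IVF files do begin with the "DKIF" magic):

    #include <string.h>

    /* Hypothetical stand-in for dav1d's Demuxer vtable. */
    typedef struct Demuxer {
        const char *name;
        int (*probe)(const unsigned char *data);
    } Demuxer;

    static int probe_ivf(const unsigned char *d) { return !memcmp(d, "DKIF", 4); }

    static const Demuxer ivf_demuxer = { "ivf", probe_ivf };

    /* NULL-terminated registry: no init function, no element count to keep in sync. */
    static const Demuxer *const demuxers[] = { &ivf_demuxer, NULL };

    static const Demuxer *find_demuxer(const char *name) {
        for (int i = 0; demuxers[i]; i++)
            if (!strcmp(demuxers[i]->name, name))
                return demuxers[i];
        return NULL; /* hit the sentinel: unknown name */
    }

    int main(void) {
        return find_demuxer("ivf") ? 0 : 1;
    }

In the patch, input_open() does the same walk twice: once to match an explicitly requested demuxer name and once calling probe() on the first bytes of the file.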

Some files were not shown because too many files changed in this diff.