зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1620471 - Update libdav1d to 0.6.0 r=dminor
Differential Revision: https://phabricator.services.mozilla.com/D67511 --HG-- extra : moz-landing-system : lando
This commit is contained in:
Родитель
05300c66f1
Коммит
4ef4d2b585
|
@ -2,7 +2,7 @@ This directory contains build files for dav1d. The actual library
|
|||
source is in $TOPSRCDIR/third_party/dav1d/
|
||||
|
||||
Any patches or additional configuration to be applied to the
|
||||
upstream source should be kept here in the media/libaom
|
||||
upstream source should be kept here in the media/libdav1d
|
||||
directory.
|
||||
|
||||
To update the library source and build config files, execute
|
||||
|
@ -13,8 +13,35 @@ To update to a specific upstream git tag or commit, use
|
|||
|
||||
./mach vendor dav1d -r <commit>
|
||||
|
||||
The upstream git repository is https://aomedia.googlesource.com/aom
|
||||
The upstream git repository is https://code.videolan.org/videolan/dav1d
|
||||
|
||||
To update to a fork, use
|
||||
|
||||
./mach vendor dav1d --repo <repository url> [-r <commit>]
|
||||
|
||||
|
||||
The rough steps are:
|
||||
- Execute ./mach vendor dav1d -r {tag-name} # ex: ./mach vendor dav1d -r 0.6.0
|
||||
- Update moz.build with the new files, check the
|
||||
third_party/dav1d/src/meson.build (confirm with the diff) (note the
|
||||
empty .asm file in x86_64)
|
||||
- Build a stand-alone libdav1d following the steps here:
|
||||
https://code.videolan.org/videolan/dav1d#compile
|
||||
- Copy vcs_version.h from the local build/include/vcs_version.h
|
||||
to media/libdav1d/vcs_version.h
|
||||
- Copy version.h from local build/include/dav1di/version.h to
|
||||
media/libdav1d/version.h
|
||||
- Update dav1d.rc:
|
||||
- update the API_VERSION_NUMBER, API_VERSION_NUMBER_STR, defines to
|
||||
match the 'dav1d_soname_version' field in
|
||||
third_party/dav1d/meson.build.
|
||||
- update the PROJECT_VERSION_NUMBER, PROJECT_VERSION_NUMBER_STR
|
||||
defines to match the new project versions from the git tag (or from
|
||||
the project version found in third_party/dav1d/meson.build).
|
||||
- Add new options, if any, in moz.build or config.h
|
||||
|
||||
Tips:
|
||||
- If you see build failures in build-linux64-base-toolchains (or
|
||||
similar jobs) dav1d may now require a higher minimum nasm version
|
||||
than our base toolchains currently support. A bug updating the
|
||||
minimum nasm version will probably be necessary.
|
||||
|
|
|
@ -88,11 +88,13 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
|
|||
'../../../third_party/dav1d/src/x86/loopfilter.asm',
|
||||
'../../../third_party/dav1d/src/x86/looprestoration.asm',
|
||||
'../../../third_party/dav1d/src/x86/mc.asm',
|
||||
'../../../third_party/dav1d/src/x86/msac_init.c',
|
||||
]
|
||||
|
||||
SOURCES += [
|
||||
'../../../third_party/dav1d/src/x86/cdef_sse.asm',
|
||||
'../../../third_party/dav1d/src/x86/cpuid.asm',
|
||||
'../../../third_party/dav1d/src/x86/film_grain_ssse3.asm',
|
||||
'../../../third_party/dav1d/src/x86/ipred_ssse3.asm',
|
||||
'../../../third_party/dav1d/src/x86/itx_ssse3.asm',
|
||||
'../../../third_party/dav1d/src/x86/loopfilter_ssse3.asm',
|
||||
|
@ -192,11 +194,18 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64':
|
|||
if CONFIG['CPU_ARCH'] == 'aarch64':
|
||||
SOURCES += [
|
||||
'../../../third_party/dav1d/src/arm/64/cdef.S',
|
||||
'../../../third_party/dav1d/src/arm/64/cdef16.S',
|
||||
'../../../third_party/dav1d/src/arm/64/cdef_tmpl.S',
|
||||
'../../../third_party/dav1d/src/arm/64/ipred.S',
|
||||
'../../../third_party/dav1d/src/arm/64/itx.S',
|
||||
'../../../third_party/dav1d/src/arm/64/loopfilter.S',
|
||||
'../../../third_party/dav1d/src/arm/64/loopfilter16.S',
|
||||
'../../../third_party/dav1d/src/arm/64/looprestoration.S',
|
||||
'../../../third_party/dav1d/src/arm/64/looprestoration16.S',
|
||||
'../../../third_party/dav1d/src/arm/64/looprestoration_common.S',
|
||||
'../../../third_party/dav1d/src/arm/64/looprestoration_tmpl.S',
|
||||
'../../../third_party/dav1d/src/arm/64/mc.S',
|
||||
'../../../third_party/dav1d/src/arm/64/mc16.S',
|
||||
'../../../third_party/dav1d/src/arm/64/msac.S',
|
||||
]
|
||||
elif CONFIG['CPU_ARCH'] == 'arm':
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
#define API_VERSION_NUMBER 3,1,0,0
|
||||
#define API_VERSION_NUMBER_STR "3.1.0"
|
||||
#define PROJECT_VERSION_NUMBER 0,5,2,0
|
||||
#define PROJECT_VERSION_NUMBER_STR "0.5.2"
|
||||
#define API_VERSION_NUMBER 4,0,0,0
|
||||
#define API_VERSION_NUMBER_STR "4.0.0"
|
||||
#define PROJECT_VERSION_NUMBER 0,6,0,0
|
||||
#define PROJECT_VERSION_NUMBER_STR "0.6.0"
|
||||
|
||||
#include <windows.h>
|
||||
|
||||
|
|
|
@ -79,6 +79,7 @@ SOURCES += [
|
|||
'../../third_party/dav1d/src/dequant_tables.c',
|
||||
'../../third_party/dav1d/src/getbits.c',
|
||||
'../../third_party/dav1d/src/intra_edge.c',
|
||||
'../../third_party/dav1d/src/itx_1d.c',
|
||||
'../../third_party/dav1d/src/lf_mask.c',
|
||||
'../../third_party/dav1d/src/log.c',
|
||||
'../../third_party/dav1d/src/msac.c',
|
||||
|
@ -167,6 +168,7 @@ EXPORTS.dav1d.src += [
|
|||
'../../third_party/dav1d/src/ipred.h',
|
||||
'../../third_party/dav1d/src/ipred_prepare.h',
|
||||
'../../third_party/dav1d/src/itx.h',
|
||||
'../../third_party/dav1d/src/itx_1d.h',
|
||||
'../../third_party/dav1d/src/lf_apply.h',
|
||||
'../../third_party/dav1d/src/loopfilter.h',
|
||||
'../../third_party/dav1d/src/looprestoration.h',
|
||||
|
|
|
@ -20,7 +20,7 @@ origin:
|
|||
|
||||
# Human-readable identifier for this version/release
|
||||
# Generally "version NNN", "tag SSS", "bookmark SSS"
|
||||
release: commit 39667c751d427e447cbe8be783cfecd296659e24 (2019-12-02T18:19:06.000+01:00).
|
||||
release: commit efd9e5518e0ed5114f8b4579debd7ee6dbede21f (2020-03-06T00:16:53.000+01:00).
|
||||
|
||||
# The package's license, where possible using the mnemonic from
|
||||
# https://spdx.org/licenses/
|
||||
|
|
|
@ -1,2 +1,2 @@
|
|||
/* auto-generated, do not edit */
|
||||
#define DAV1D_VERSION "0.5.2-0-g39667c7"
|
||||
#define DAV1D_VERSION "0.6.0-0-gefd9e55"
|
||||
|
|
|
@ -27,8 +27,8 @@
|
|||
#ifndef DAV1D_VERSION_H
|
||||
#define DAV1D_VERSION_H
|
||||
|
||||
#define DAV1D_API_VERSION_MAJOR 3
|
||||
#define DAV1D_API_VERSION_MINOR 1
|
||||
#define DAV1D_API_VERSION_MAJOR 4
|
||||
#define DAV1D_API_VERSION_MINOR 0
|
||||
#define DAV1D_API_VERSION_PATCH 0
|
||||
|
||||
#endif /* DAV1D_VERSION_H */
|
||||
|
|
|
@ -38,7 +38,7 @@ build-debian:
|
|||
image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
|
||||
stage: build
|
||||
tags:
|
||||
- debian
|
||||
- avx2
|
||||
- amd64
|
||||
script:
|
||||
- meson build --buildtype release --werror
|
||||
|
@ -173,7 +173,7 @@ build-win-arm64:
|
|||
|
||||
build-debian-aarch64:
|
||||
stage: build
|
||||
image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
|
||||
image: registry.videolan.org/dav1d-debian-buster-aarch64:20200218203017
|
||||
tags:
|
||||
- aarch64
|
||||
- debian
|
||||
|
@ -184,7 +184,7 @@ build-debian-aarch64:
|
|||
|
||||
build-debian-aarch64-clang-5:
|
||||
stage: build
|
||||
image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
|
||||
image: registry.videolan.org/dav1d-debian-buster-aarch64:20200218203017
|
||||
tags:
|
||||
- aarch64
|
||||
- debian
|
||||
|
@ -203,7 +203,7 @@ build-macos:
|
|||
- cd build && meson test -v
|
||||
|
||||
build-debian-werror:
|
||||
image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
|
||||
image: registry.videolan.org/dav1d-debian-buster-aarch64:20200218203017
|
||||
stage: build
|
||||
tags:
|
||||
- aarch64
|
||||
|
@ -219,7 +219,7 @@ build-debian-armv7:
|
|||
- armv7
|
||||
- debian
|
||||
script:
|
||||
- meson build --buildtype debugoptimized --werror
|
||||
- linux32 meson build --buildtype debugoptimized --werror
|
||||
- ninja -C build
|
||||
- cd build && meson test -v
|
||||
|
||||
|
@ -230,13 +230,13 @@ build-debian-armv7-clang-5:
|
|||
- armv7
|
||||
- debian
|
||||
script:
|
||||
- env CC=clang-5.0 CFLAGS='-integrated-as' meson build --buildtype release
|
||||
- env CC=clang-5.0 CFLAGS='-integrated-as' linux32 meson build --buildtype release
|
||||
- ninja -C build
|
||||
- cd build && meson test -v
|
||||
|
||||
build-ubuntu-snap:
|
||||
stage: build
|
||||
image: registry.videolan.org:5000/dav1d-ubuntu-bionic:20190221154127
|
||||
image: registry.videolan.org/dav1d-ubuntu-bionic:20200121182340
|
||||
tags:
|
||||
- debian
|
||||
- amd64
|
||||
|
@ -292,7 +292,7 @@ test-debian-unaligned-stack:
|
|||
stage: test
|
||||
needs: ["build-debian"]
|
||||
tags:
|
||||
- debian
|
||||
- avx2
|
||||
- amd64
|
||||
cache:
|
||||
key: testdata.git-20190215
|
||||
|
@ -382,7 +382,7 @@ test-win64:
|
|||
stage: test
|
||||
needs: ["build-win64"]
|
||||
tags:
|
||||
- debian
|
||||
- avx2
|
||||
- amd64
|
||||
cache:
|
||||
key: testdata.git-20190215
|
||||
|
@ -403,7 +403,7 @@ test-win64:
|
|||
dependencies: []
|
||||
|
||||
test-debian-aarch64:
|
||||
image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
|
||||
image: registry.videolan.org/dav1d-debian-buster-aarch64:20200218203017
|
||||
stage: test
|
||||
needs: ["build-debian-aarch64"]
|
||||
tags:
|
||||
|
@ -464,7 +464,7 @@ test-debian-armv7-clang-5:
|
|||
- test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master
|
||||
- test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git
|
||||
- git clone cache/dav1d-test-data.git tests/dav1d-test-data
|
||||
- env CC=clang-5.0 CFLAGS='-integrated-as' meson build --buildtype release
|
||||
- env CC=clang-5.0 CFLAGS='-integrated-as' linux32 meson build --buildtype release
|
||||
-Dtestdata_tests=true
|
||||
-Dlogging=false
|
||||
- ninja -C build
|
||||
|
|
|
@ -1,3 +1,26 @@
|
|||
Changes for 0.6.0 'Gyrfalcon':
|
||||
------------------------------
|
||||
|
||||
0.6.0 is a major release for dav1d:
|
||||
- New ARM64 optimizations for the 10/12bit depth:
|
||||
- mc_avg, mc_w_avg, mc_mask
|
||||
- mc_put/mc_prep 8tap/bilin
|
||||
- mc_warp_8x8
|
||||
- mc_w_mask
|
||||
- mc_blend
|
||||
- wiener
|
||||
- SGR
|
||||
- loopfilter
|
||||
- cdef
|
||||
- New AVX-512 optimizations for prep_bilin, prep_8tap, cdef_filter, mc_avg/w_avg/mask
|
||||
- New SSSE3 optimizations for film grain
|
||||
- New AVX2 optimizations for msac_adapt16
|
||||
- Fix rare mismatches against the reference decoder, notably because of clipping
|
||||
- Improvements on ARM64 on msac, cdef and looprestoration optimizations
|
||||
- Improvements on AVX2 optimizations for cdef_filter
|
||||
- Improvements in the C version for itxfm, cdef_filter
|
||||
|
||||
|
||||
Changes for 0.5.2 'Asiatic Cheetah':
|
||||
------------------------------------
|
||||
|
||||
|
@ -32,7 +55,7 @@ and improving speed significantly:
|
|||
- NEON optimizations for CDEF and warp on ARM32
|
||||
- SSE2 optimizations for MSAC hi_tok decoding
|
||||
- SSSE3 optimizations for deblocking loopfilters and warp_affine
|
||||
- AVX-2 optimizations for film grain and ipred_z2
|
||||
- AVX2 optimizations for film grain and ipred_z2
|
||||
- SSE4 optimizations for warp_affine
|
||||
- VSX optimizations for wiener
|
||||
- Fix inverse transform overflows in x86 and NEON asm
|
||||
|
@ -81,7 +104,7 @@ Changes for 0.2.2 (0.3.0-rc) 'Antelope':
|
|||
-----------------------------
|
||||
|
||||
- Large improvement on MSAC decoding with SSE, bringing 4-6% speed increase
|
||||
The impact is important on SSSE3, SSE4 and AVX-2 cpus
|
||||
The impact is important on SSSE3, SSE4 and AVX2 cpus
|
||||
- SSSE3 optimizations for all blocks size in itx
|
||||
- SSSE3 optimizations for ipred_paeth and ipred_cfl (420, 422 and 444)
|
||||
- Speed improvements on CDEF for SSE4 CPUs
|
||||
|
@ -93,7 +116,7 @@ Changes for 0.2.1 'Antelope':
|
|||
----------------------------
|
||||
|
||||
- SSSE3 optimization for cdef_dir
|
||||
- AVX-2 improvements of the existing CDEF optimizations
|
||||
- AVX2 improvements of the existing CDEF optimizations
|
||||
- NEON improvements of the existing CDEF and wiener optimizations
|
||||
- Clarification about the numbering/versionning scheme
|
||||
|
||||
|
@ -103,7 +126,7 @@ Changes for 0.2.0 'Antelope':
|
|||
|
||||
- ARM64 and ARM optimizations using NEON instructions
|
||||
- SSSE3 optimizations for both 32 and 64bits
|
||||
- More AVX-2 assembly, reaching almost completion
|
||||
- More AVX2 assembly, reaching almost completion
|
||||
- Fix installation of includes
|
||||
- Rewrite inverse transforms to avoid overflows
|
||||
- Snap packaging for Linux
|
||||
|
@ -118,6 +141,6 @@ Initial release of dav1d, the fast and small AV1 decoder.
|
|||
- Support for all features of the AV1 bitstream
|
||||
- Support for all bitdepth, 8, 10 and 12bits
|
||||
- Support for all chroma subsamplings 4:2:0, 4:2:2, 4:4:4 *and* grayscale
|
||||
- Full acceleration for AVX-2 64bits processors, making it the fastest decoder
|
||||
- Full acceleration for AVX2 64bits processors, making it the fastest decoder
|
||||
- Partial acceleration for SSSE3 processors
|
||||
- Partial acceleration for NEON processors
|
||||
|
|
|
@ -73,7 +73,7 @@ The [VideoLAN Code of Conduct](https://wiki.videolan.org/CoC) applies to this pr
|
|||
|
||||
# Compile
|
||||
|
||||
1. Install [Meson](https://mesonbuild.com/) (0.47 or higher), [Ninja](https://ninja-build.org/), and, for x86\* targets, [nasm](https://nasm.us/) (2.13.02 or higher)
|
||||
1. Install [Meson](https://mesonbuild.com/) (0.47 or higher), [Ninja](https://ninja-build.org/), and, for x86\* targets, [nasm](https://nasm.us/) (2.14 or higher)
|
||||
2. Run `mkdir build && cd build` to create a build directory and enter it
|
||||
3. Run `meson ..` to configure meson, add `--default-library=static` if static linking is desired
|
||||
4. Run `ninja` to compile
|
||||
|
|
|
@ -43,15 +43,18 @@
|
|||
#endif
|
||||
|
||||
#if ARCH_X86_64
|
||||
/* x86-64 needs 32-byte alignment for AVX2. */
|
||||
/* x86-64 needs 32- and 64-byte alignment for AVX2 and AVX-512. */
|
||||
#define ALIGN_64_VAL 64
|
||||
#define ALIGN_32_VAL 32
|
||||
#define ALIGN_16_VAL 16
|
||||
#elif ARCH_X86_32 || ARCH_ARM || ARCH_AARCH64 || ARCH_PPC64LE
|
||||
/* ARM doesn't benefit from anything more than 16-byte alignment. */
|
||||
#define ALIGN_64_VAL 16
|
||||
#define ALIGN_32_VAL 16
|
||||
#define ALIGN_16_VAL 16
|
||||
#else
|
||||
/* No need for extra alignment on platforms without assembly. */
|
||||
#define ALIGN_64_VAL 8
|
||||
#define ALIGN_32_VAL 8
|
||||
#define ALIGN_16_VAL 8
|
||||
#endif
|
||||
|
@ -76,9 +79,10 @@
|
|||
* becomes:
|
||||
* ALIGN_STK_$align(uint8_t, var, 1, [2][3][4])
|
||||
*/
|
||||
#define ALIGN_STK_64(type, var, sz1d, sznd) \
|
||||
ALIGN(type var[sz1d]sznd, ALIGN_64_VAL)
|
||||
#define ALIGN_STK_32(type, var, sz1d, sznd) \
|
||||
ALIGN(type var[sz1d]sznd, ALIGN_32_VAL)
|
||||
// as long as stack is itself 16-byte aligned, this works (win64, gcc)
|
||||
#define ALIGN_STK_16(type, var, sz1d, sznd) \
|
||||
ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
|
||||
|
||||
|
@ -92,6 +96,12 @@
|
|||
#define NOINLINE __attribute__((noinline))
|
||||
#endif /* !_MSC_VER */
|
||||
|
||||
#ifdef __clang__
|
||||
#define NO_SANITIZE(x) __attribute__((no_sanitize(x)))
|
||||
#else
|
||||
#define NO_SANITIZE(x)
|
||||
#endif
|
||||
|
||||
#if defined(NDEBUG) && (defined(__GNUC__) || defined(__clang__))
|
||||
#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
|
||||
#elif defined(NDEBUG) && defined(_MSC_VER)
|
||||
|
|
|
@ -31,6 +31,8 @@
|
|||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "common/attributes.h"
|
||||
|
||||
#if !defined(BITDEPTH)
|
||||
typedef void pixel;
|
||||
typedef void coef;
|
||||
|
@ -47,12 +49,14 @@ typedef int16_t coef;
|
|||
#define iclip_pixel iclip_u8
|
||||
#define PIX_HEX_FMT "%02x"
|
||||
#define bitfn(x) x##_8bpc
|
||||
#define PXSTRIDE(x) x
|
||||
#define BF(x, suffix) x##_8bpc_##suffix
|
||||
#define PXSTRIDE(x) (x)
|
||||
#define highbd_only(x)
|
||||
#define HIGHBD_DECL_SUFFIX /* nothing */
|
||||
#define HIGHBD_CALL_SUFFIX /* nothing */
|
||||
#define HIGHBD_TAIL_SUFFIX /* nothing */
|
||||
#define bitdepth_from_max(x) 8
|
||||
#define BITDEPTH_MAX 0xff
|
||||
#elif BITDEPTH == 16
|
||||
typedef uint16_t pixel;
|
||||
typedef int32_t coef;
|
||||
|
@ -69,8 +73,13 @@ static inline void pixel_set(pixel *const dst, const int val, const int num) {
|
|||
#define HIGHBD_CALL_SUFFIX , f->bitdepth_max
|
||||
#define HIGHBD_TAIL_SUFFIX , bitdepth_max
|
||||
#define bitdepth_from_max(bitdepth_max) (32 - clz(bitdepth_max))
|
||||
#define BITDEPTH_MAX bitdepth_max
|
||||
#define bitfn(x) x##_16bpc
|
||||
#define PXSTRIDE(x) (x >> 1)
|
||||
#define BF(x, suffix) x##_16bpc_##suffix
|
||||
static inline ptrdiff_t PXSTRIDE(const ptrdiff_t x) {
|
||||
assert(!(x & 1));
|
||||
return x >> 1;
|
||||
}
|
||||
#define highbd_only(x) x
|
||||
#else
|
||||
#error invalid value for bitdepth
|
||||
|
|
|
@ -318,8 +318,8 @@ typedef struct Dav1dFilmGrainData {
|
|||
int scaling_shift;
|
||||
int ar_coeff_lag;
|
||||
int8_t ar_coeffs_y[24];
|
||||
int8_t ar_coeffs_uv[2][25];
|
||||
int ar_coeff_shift;
|
||||
int8_t ar_coeffs_uv[2][25 + 3 /* padding for alignment purposes */];
|
||||
uint64_t ar_coeff_shift;
|
||||
int grain_scale_shift;
|
||||
int uv_mult[2];
|
||||
int uv_luma_mult[2];
|
||||
|
@ -329,13 +329,13 @@ typedef struct Dav1dFilmGrainData {
|
|||
} Dav1dFilmGrainData;
|
||||
|
||||
typedef struct Dav1dFrameHeader {
|
||||
struct {
|
||||
Dav1dFilmGrainData data;
|
||||
int present, update;
|
||||
} film_grain; ///< film grain parameters
|
||||
enum Dav1dFrameType frame_type; ///< type of the picture
|
||||
int width[2 /* { coded_width, superresolution_upscaled_width } */], height;
|
||||
int frame_offset; ///< frame number
|
||||
struct {
|
||||
int present, update;
|
||||
Dav1dFilmGrainData data;
|
||||
} film_grain; ///< film grain parameters
|
||||
int temporal_id, spatial_id; ///< spatial and temporal id of the frame for SVC
|
||||
|
||||
int show_existing_frame;
|
||||
|
|
|
@ -23,14 +23,14 @@
|
|||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
project('dav1d', ['c'],
|
||||
version: '0.5.2',
|
||||
version: '0.6.0',
|
||||
default_options: ['c_std=c99',
|
||||
'warning_level=2',
|
||||
'buildtype=release',
|
||||
'b_ndebug=if-release'],
|
||||
meson_version: '>= 0.47.0')
|
||||
|
||||
dav1d_soname_version = '3.1.0'
|
||||
dav1d_soname_version = '4.0.0'
|
||||
dav1d_api_version_array = dav1d_soname_version.split('.')
|
||||
dav1d_api_version_major = dav1d_api_version_array[0]
|
||||
dav1d_api_version_minor = dav1d_api_version_array[1]
|
||||
|
@ -84,13 +84,15 @@ test_args = []
|
|||
|
||||
optional_arguments = []
|
||||
|
||||
# Define _POSIX_C_SOURCE to POSIX.1–2001 (IEEE Std 1003.1-2001)
|
||||
test_args += '-D_POSIX_C_SOURCE=200112L'
|
||||
add_project_arguments('-D_POSIX_C_SOURCE=200112L', language: 'c')
|
||||
|
||||
if host_machine.system() == 'darwin'
|
||||
if host_machine.system() == 'linux'
|
||||
test_args += '-D_GNU_SOURCE'
|
||||
add_project_arguments('-D_GNU_SOURCE', language: 'c')
|
||||
elif host_machine.system() == 'darwin'
|
||||
test_args += '-D_DARWIN_C_SOURCE'
|
||||
add_project_arguments('-D_DARWIN_C_SOURCE', language: 'c')
|
||||
else
|
||||
test_args += '-D_POSIX_C_SOURCE=200112L'
|
||||
add_project_arguments('-D_POSIX_C_SOURCE=200112L', language: 'c')
|
||||
endif
|
||||
|
||||
if host_machine.system() == 'windows'
|
||||
|
@ -131,6 +133,15 @@ else
|
|||
endif
|
||||
endif
|
||||
|
||||
libdl_dependency = []
|
||||
if host_machine.system() == 'linux'
|
||||
libdl_dependency = cc.find_library('dl', required : false)
|
||||
if cc.has_function('dlsym', prefix : '#include <dlfcn.h>', args : test_args, dependencies : libdl_dependency)
|
||||
cdata.set('HAVE_DLSYM', 1)
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
# Header checks
|
||||
|
||||
stdatomic_dependency = []
|
||||
|
@ -257,12 +268,12 @@ if host_machine.cpu_family().startswith('x86')
|
|||
if get_option('stack_alignment') > 0
|
||||
stack_alignment = get_option('stack_alignment')
|
||||
elif host_machine.cpu_family() == 'x86_64'
|
||||
if cc.has_argument('-mpreferred-stack-boundary=5')
|
||||
stackalign_flag = ['-mpreferred-stack-boundary=5']
|
||||
if cc.has_argument('-mpreferred-stack-boundary=6')
|
||||
stackalign_flag = ['-mpreferred-stack-boundary=6']
|
||||
stackrealign_flag = ['-mincoming-stack-boundary=4']
|
||||
stack_alignment = 32
|
||||
elif cc.has_argument('-mstack-alignment=32')
|
||||
stackalign_flag = ['-mstack-alignment=32']
|
||||
elif cc.has_argument('-mstack-alignment=64')
|
||||
stackalign_flag = ['-mstack-alignment=64']
|
||||
stackrealign_flag = ['-mstackrealign']
|
||||
stack_alignment = 32
|
||||
else
|
||||
|
@ -364,8 +375,8 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86')
|
|||
|
||||
out = nasm_r.stdout().strip().split()
|
||||
if out[1].to_lower() == 'version'
|
||||
if out[2].version_compare('<2.13.02')
|
||||
error('nasm 2.13.02 or later is required, found nasm @0@'.format(out[2]))
|
||||
if out[2].version_compare('<2.14')
|
||||
error('nasm 2.14 or later is required, found nasm @0@'.format(out[2]))
|
||||
endif
|
||||
else
|
||||
error('unexpected nasm version string: @0@'.format(nasm_r.stdout()))
|
||||
|
@ -390,7 +401,7 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86')
|
|||
depfile: '@BASENAME@.obj.ndep',
|
||||
arguments: [
|
||||
'-f', nasm_format,
|
||||
'-I', '@SOURCE_DIR@/src/',
|
||||
'-I', '@0@/src/'.format(meson.current_source_dir()),
|
||||
'-I', '@0@/'.format(meson.current_build_dir()),
|
||||
'-MQ', '@OUTPUT@', '-MF', '@DEPFILE@',
|
||||
'@EXTRA_ARGS@',
|
||||
|
|
|
@ -148,20 +148,22 @@
|
|||
.endif
|
||||
.endm
|
||||
|
||||
// void dav1d_cdef_paddingX_neon(uint16_t *tmp, const pixel *src,
|
||||
// ptrdiff_t src_stride, const pixel (*left)[2],
|
||||
// /*const*/ pixel *const top[2], int h,
|
||||
// enum CdefEdgeFlags edges);
|
||||
// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,
|
||||
// ptrdiff_t src_stride, const pixel (*left)[2],
|
||||
// const pixel *const top, int h,
|
||||
// enum CdefEdgeFlags edges);
|
||||
|
||||
// n1 = s0/d0
|
||||
// w1 = d0/q0
|
||||
// n2 = s4/d2
|
||||
// w2 = d2/q1
|
||||
.macro padding_func w, stride, n1, w1, n2, w2, align
|
||||
function cdef_padding\w\()_neon, export=1
|
||||
function cdef_padding\w\()_8bpc_neon, export=1
|
||||
push {r4-r7,lr}
|
||||
ldrd r4, r5, [sp, #20]
|
||||
ldr r6, [sp, #28]
|
||||
cmp r6, #0xf // fully edged
|
||||
beq cdef_padding\w\()_edged_8bpc_neon
|
||||
vmov.i16 q3, #0x8000
|
||||
tst r6, #4 // CDEF_HAVE_TOP
|
||||
bne 1f
|
||||
|
@ -175,10 +177,9 @@ function cdef_padding\w\()_neon, export=1
|
|||
b 3f
|
||||
1:
|
||||
// CDEF_HAVE_TOP
|
||||
ldr r7, [r4]
|
||||
ldr lr, [r4, #4]
|
||||
add r7, r4, r2
|
||||
sub r0, r0, #2*(2*\stride)
|
||||
pad_top_bottom r7, lr, \w, \stride, \n1, \w1, \n2, \w2, \align, 0
|
||||
pad_top_bottom r4, r7, \w, \stride, \n1, \w1, \n2, \w2, \align, 0
|
||||
|
||||
// Middle section
|
||||
3:
|
||||
|
@ -267,6 +268,65 @@ endfunc
|
|||
padding_func 8, 16, d0, q0, d2, q1, 128
|
||||
padding_func 4, 8, s0, d0, s4, d2, 64
|
||||
|
||||
// void cdef_paddingX_edged_8bpc_neon(uint16_t *tmp, const pixel *src,
|
||||
// ptrdiff_t src_stride, const pixel (*left)[2],
|
||||
// const pixel *const top, int h,
|
||||
// enum CdefEdgeFlags edges);
|
||||
|
||||
.macro padding_func_edged w, stride, reg, align
|
||||
function cdef_padding\w\()_edged_8bpc_neon
|
||||
sub r0, r0, #(2*\stride)
|
||||
|
||||
ldrh r12, [r4, #-2]
|
||||
vldr \reg, [r4]
|
||||
add r7, r4, r2
|
||||
strh r12, [r0, #-2]
|
||||
ldrh r12, [r4, #\w]
|
||||
vstr \reg, [r0]
|
||||
strh r12, [r0, #\w]
|
||||
|
||||
ldrh r12, [r7, #-2]
|
||||
vldr \reg, [r7]
|
||||
strh r12, [r0, #\stride-2]
|
||||
ldrh r12, [r7, #\w]
|
||||
vstr \reg, [r0, #\stride]
|
||||
strh r12, [r0, #\stride+\w]
|
||||
add r0, r0, #2*\stride
|
||||
|
||||
0:
|
||||
ldrh r12, [r3], #2
|
||||
vldr \reg, [r1]
|
||||
str r12, [r0, #-2]
|
||||
ldrh r12, [r1, #\w]
|
||||
add r1, r1, r2
|
||||
subs r5, r5, #1
|
||||
vstr \reg, [r0]
|
||||
str r12, [r0, #\w]
|
||||
add r0, r0, #\stride
|
||||
bgt 0b
|
||||
|
||||
ldrh r12, [r1, #-2]
|
||||
vldr \reg, [r1]
|
||||
add r7, r1, r2
|
||||
strh r12, [r0, #-2]
|
||||
ldrh r12, [r1, #\w]
|
||||
vstr \reg, [r0]
|
||||
strh r12, [r0, #\w]
|
||||
|
||||
ldrh r12, [r7, #-2]
|
||||
vldr \reg, [r7]
|
||||
strh r12, [r0, #\stride-2]
|
||||
ldrh r12, [r7, #\w]
|
||||
vstr \reg, [r0, #\stride]
|
||||
strh r12, [r0, #\stride+\w]
|
||||
|
||||
pop {r4-r7,pc}
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
padding_func_edged 8, 16, d0, 64
|
||||
padding_func_edged 4, 8, s0, 32
|
||||
|
||||
.macro dir_table w, stride
|
||||
const directions\w
|
||||
.byte -1 * \stride + 1, -2 * \stride + 2
|
||||
|
@ -311,14 +371,13 @@ endconst
|
|||
vld1.16 {\d22}, [r9] // p1
|
||||
.endif
|
||||
.endm
|
||||
.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap
|
||||
cmp \threshold, #0
|
||||
.macro handle_pixel s1, s2, thresh_vec, shift, tap, min
|
||||
.if \min
|
||||
vmin.u16 q2, q2, \s1
|
||||
vmax.s16 q3, q3, \s1
|
||||
vmin.u16 q2, q2, \s2
|
||||
vmax.s16 q3, q3, \s2
|
||||
|
||||
beq 3f
|
||||
.endif
|
||||
vabd.u16 q8, q0, \s1 // abs(diff)
|
||||
vabd.u16 q11, q0, \s2 // abs(diff)
|
||||
vshl.u16 q9, q8, \shift // abs(diff) >> shift
|
||||
|
@ -326,7 +385,7 @@ endconst
|
|||
vqsub.u16 q9, \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift))
|
||||
vqsub.u16 q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift))
|
||||
vsub.i16 q10, \s1, q0 // diff = p0 - px
|
||||
vsub.u16 q13, \s2, q0 // diff = p1 - px
|
||||
vsub.i16 q13, \s2, q0 // diff = p1 - px
|
||||
vneg.s16 q8, q9 // -clip
|
||||
vneg.s16 q11, q12 // -clip
|
||||
vmin.s16 q10, q10, q9 // imin(diff, clip)
|
||||
|
@ -336,36 +395,44 @@ endconst
|
|||
vmax.s16 q13, q13, q11 // constrain() = imax(imin(diff, clip), -clip)
|
||||
vmla.i16 q1, q10, q9 // sum += taps[k] * constrain()
|
||||
vmla.i16 q1, q13, q9 // sum += taps[k] * constrain()
|
||||
3:
|
||||
.endm
|
||||
|
||||
// void dav1d_cdef_filterX_neon(pixel *dst, ptrdiff_t dst_stride,
|
||||
// const uint16_t *tmp, int pri_strength,
|
||||
// int sec_strength, int dir, int damping, int h);
|
||||
.macro filter w
|
||||
function cdef_filter\w\()_neon, export=1
|
||||
push {r4-r9,lr}
|
||||
vpush {q4-q7}
|
||||
ldrd r4, r5, [sp, #92]
|
||||
ldrd r6, r7, [sp, #100]
|
||||
// void dav1d_cdef_filterX_8bpc_neon(pixel *dst, ptrdiff_t dst_stride,
|
||||
// const uint16_t *tmp, int pri_strength,
|
||||
// int sec_strength, int dir, int damping,
|
||||
// int h, size_t edges);
|
||||
.macro filter_func w, pri, sec, min, suffix
|
||||
function cdef_filter\w\suffix\()_neon
|
||||
cmp r8, #0xf
|
||||
beq cdef_filter\w\suffix\()_edged_neon
|
||||
.if \pri
|
||||
movrel_local r8, pri_taps
|
||||
and r9, r3, #1
|
||||
add r8, r8, r9, lsl #1
|
||||
.endif
|
||||
movrel_local r9, directions\w
|
||||
add r5, r9, r5, lsl #1
|
||||
vmov.u16 d17, #15
|
||||
vdup.16 d16, r6 // damping
|
||||
|
||||
.if \pri
|
||||
vdup.16 q5, r3 // threshold
|
||||
.endif
|
||||
.if \sec
|
||||
vdup.16 q7, r4 // threshold
|
||||
.endif
|
||||
vmov.16 d8[0], r3
|
||||
vmov.16 d8[1], r4
|
||||
vclz.i16 d8, d8 // clz(threshold)
|
||||
vsub.i16 d8, d17, d8 // ulog2(threshold)
|
||||
vqsub.u16 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold))
|
||||
vneg.s16 d8, d8 // -shift
|
||||
.if \sec
|
||||
vdup.16 q6, d8[1]
|
||||
.endif
|
||||
.if \pri
|
||||
vdup.16 q4, d8[0]
|
||||
.endif
|
||||
|
||||
1:
|
||||
.if \w == 8
|
||||
|
@ -377,47 +444,64 @@ function cdef_filter\w\()_neon, export=1
|
|||
.endif
|
||||
|
||||
vmov.u16 q1, #0 // sum
|
||||
.if \min
|
||||
vmov.u16 q2, q0 // min
|
||||
vmov.u16 q3, q0 // max
|
||||
.endif
|
||||
|
||||
// Instead of loading sec_taps 2, 1 from memory, just set it
|
||||
// to 2 initially and decrease for the second round.
|
||||
// This is also used as loop counter.
|
||||
mov lr, #2 // sec_taps[0]
|
||||
|
||||
2:
|
||||
.if \pri
|
||||
ldrsb r9, [r5] // off1
|
||||
|
||||
load_px d28, d29, d30, d31, \w
|
||||
.endif
|
||||
|
||||
.if \sec
|
||||
add r5, r5, #4 // +2*2
|
||||
ldrsb r9, [r5] // off2
|
||||
.endif
|
||||
|
||||
.if \pri
|
||||
ldrb r12, [r8] // *pri_taps
|
||||
|
||||
handle_pixel q14, q15, r3, q5, q4, r12
|
||||
handle_pixel q14, q15, q5, q4, r12, \min
|
||||
.endif
|
||||
|
||||
.if \sec
|
||||
load_px d28, d29, d30, d31, \w
|
||||
|
||||
add r5, r5, #8 // +2*4
|
||||
ldrsb r9, [r5] // off3
|
||||
|
||||
handle_pixel q14, q15, r4, q7, q6, lr
|
||||
handle_pixel q14, q15, q7, q6, lr, \min
|
||||
|
||||
load_px d28, d29, d30, d31, \w
|
||||
|
||||
handle_pixel q14, q15, r4, q7, q6, lr
|
||||
handle_pixel q14, q15, q7, q6, lr, \min
|
||||
|
||||
sub r5, r5, #11 // x8 -= 2*(2+4); x8 += 1;
|
||||
sub r5, r5, #11 // r5 -= 2*(2+4); r5 += 1;
|
||||
.else
|
||||
add r5, r5, #1 // r5 += 1
|
||||
.endif
|
||||
subs lr, lr, #1 // sec_tap-- (value)
|
||||
.if \pri
|
||||
add r8, r8, #1 // pri_taps++ (pointer)
|
||||
.endif
|
||||
bne 2b
|
||||
|
||||
vshr.s16 q14, q1, #15 // -(sum < 0)
|
||||
vadd.i16 q1, q1, q14 // sum - (sum < 0)
|
||||
vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4
|
||||
vadd.i16 q0, q0, q1 // px + (8 + sum ...) >> 4
|
||||
.if \min
|
||||
vmin.s16 q0, q0, q3
|
||||
vmax.s16 q0, q0, q2 // iclip(px + .., min, max)
|
||||
.endif
|
||||
vmovn.u16 d0, q0
|
||||
.if \w == 8
|
||||
add r2, r2, #2*16 // tmp += tmp_stride
|
||||
|
@ -430,9 +514,11 @@ function cdef_filter\w\()_neon, export=1
|
|||
vst1.32 {d0[1]}, [r0, :32], r1
|
||||
.endif
|
||||
|
||||
// Reset pri_taps/sec_taps back to the original point
|
||||
// Reset pri_taps and directions back to the original point
|
||||
sub r5, r5, #2
|
||||
.if \pri
|
||||
sub r8, r8, #2
|
||||
.endif
|
||||
|
||||
bgt 1b
|
||||
vpop {q4-q7}
|
||||
|
@ -440,9 +526,237 @@ function cdef_filter\w\()_neon, export=1
|
|||
endfunc
|
||||
.endm
|
||||
|
||||
.macro filter w
|
||||
filter_func \w, pri=1, sec=0, min=0, suffix=_pri
|
||||
filter_func \w, pri=0, sec=1, min=0, suffix=_sec
|
||||
filter_func \w, pri=1, sec=1, min=1, suffix=_pri_sec
|
||||
|
||||
function cdef_filter\w\()_8bpc_neon, export=1
|
||||
push {r4-r9,lr}
|
||||
vpush {q4-q7}
|
||||
ldrd r4, r5, [sp, #92]
|
||||
ldrd r6, r7, [sp, #100]
|
||||
ldr r8, [sp, #108]
|
||||
cmp r3, #0 // pri_strength
|
||||
bne 1f
|
||||
b cdef_filter\w\()_sec_neon // only sec
|
||||
1:
|
||||
cmp r4, #0 // sec_strength
|
||||
bne 1f
|
||||
b cdef_filter\w\()_pri_neon // only pri
|
||||
1:
|
||||
b cdef_filter\w\()_pri_sec_neon // both pri and sec
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
filter 8
|
||||
filter 4
|
||||
|
||||
.macro load_px_8 d11, d12, d21, d22, w
|
||||
.if \w == 8
|
||||
add r6, r2, r9 // x + off
|
||||
sub r9, r2, r9 // x - off
|
||||
vld1.8 {\d11}, [r6] // p0
|
||||
add r6, r6, #16 // += stride
|
||||
vld1.8 {\d21}, [r9] // p1
|
||||
add r9, r9, #16 // += stride
|
||||
vld1.8 {\d12}, [r6] // p0
|
||||
vld1.8 {\d22}, [r9] // p1
|
||||
.else
|
||||
add r6, r2, r9 // x + off
|
||||
sub r9, r2, r9 // x - off
|
||||
vld1.32 {\d11[0]}, [r6] // p0
|
||||
add r6, r6, #8 // += stride
|
||||
vld1.32 {\d21[0]}, [r9] // p1
|
||||
add r9, r9, #8 // += stride
|
||||
vld1.32 {\d11[1]}, [r6] // p0
|
||||
add r6, r6, #8 // += stride
|
||||
vld1.32 {\d21[1]}, [r9] // p1
|
||||
add r9, r9, #8 // += stride
|
||||
vld1.32 {\d12[0]}, [r6] // p0
|
||||
add r6, r6, #8 // += stride
|
||||
vld1.32 {\d22[0]}, [r9] // p1
|
||||
add r9, r9, #8 // += stride
|
||||
vld1.32 {\d12[1]}, [r6] // p0
|
||||
vld1.32 {\d22[1]}, [r9] // p1
|
||||
.endif
|
||||
.endm
|
||||
.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min
|
||||
.if \min
|
||||
vmin.u8 q3, q3, \s1
|
||||
vmax.u8 q4, q4, \s1
|
||||
vmin.u8 q3, q3, \s2
|
||||
vmax.u8 q4, q4, \s2
|
||||
.endif
|
||||
vabd.u8 q8, q0, \s1 // abs(diff)
|
||||
vabd.u8 q11, q0, \s2 // abs(diff)
|
||||
vshl.u8 q9, q8, \shift // abs(diff) >> shift
|
||||
vshl.u8 q12, q11, \shift // abs(diff) >> shift
|
||||
vqsub.u8 q9, \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift))
|
||||
vqsub.u8 q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift))
|
||||
vcgt.u8 q10, q0, \s1 // px > p0
|
||||
vcgt.u8 q13, q0, \s2 // px > p1
|
||||
vmin.u8 q9, q9, q8 // imin(abs(diff), clip)
|
||||
vmin.u8 q12, q12, q11 // imin(abs(diff), clip)
|
||||
vneg.s8 q8, q9 // -imin()
|
||||
vneg.s8 q11, q12 // -imin()
|
||||
vbsl q10, q8, q9 // constrain() = imax(imin(diff, clip), -clip)
|
||||
vdup.8 d18, \tap // taps[k]
|
||||
vbsl q13, q11, q12 // constrain() = imax(imin(diff, clip), -clip)
|
||||
vmlal.s8 q1, d20, d18 // sum += taps[k] * constrain()
|
||||
vmlal.s8 q1, d26, d18 // sum += taps[k] * constrain()
|
||||
vmlal.s8 q2, d21, d18 // sum += taps[k] * constrain()
|
||||
vmlal.s8 q2, d27, d18 // sum += taps[k] * constrain()
|
||||
.endm
|
||||
|
||||
// void cdef_filterX_edged_neon(pixel *dst, ptrdiff_t dst_stride,
|
||||
// const uint16_t *tmp, int pri_strength,
|
||||
// int sec_strength, int dir, int damping,
|
||||
// int h, size_t edges);
|
||||
.macro filter_func_8 w, pri, sec, min, suffix
|
||||
function cdef_filter\w\suffix\()_edged_neon
|
||||
.if \pri
|
||||
movrel_local r8, pri_taps
|
||||
and r9, r3, #1
|
||||
add r8, r8, r9, lsl #1
|
||||
.endif
|
||||
movrel_local r9, directions\w
|
||||
add r5, r9, r5, lsl #1
|
||||
vmov.u8 d17, #7
|
||||
vdup.8 d16, r6 // damping
|
||||
|
||||
vmov.8 d8[0], r3
|
||||
vmov.8 d8[1], r4
|
||||
vclz.i8 d8, d8 // clz(threshold)
|
||||
vsub.i8 d8, d17, d8 // ulog2(threshold)
|
||||
vqsub.u8 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold))
|
||||
vneg.s8 d8, d8 // -shift
|
||||
.if \sec
|
||||
vdup.8 q6, d8[1]
|
||||
.endif
|
||||
.if \pri
|
||||
vdup.8 q5, d8[0]
|
||||
.endif
|
||||
|
||||
1:
|
||||
.if \w == 8
|
||||
add r12, r2, #16
|
||||
vld1.8 {d0}, [r2, :64] // px
|
||||
vld1.8 {d1}, [r12, :64] // px
|
||||
.else
|
||||
add r12, r2, #8
|
||||
vld1.32 {d0[0]}, [r2, :32] // px
|
||||
add r9, r2, #2*8
|
||||
vld1.32 {d0[1]}, [r12, :32] // px
|
||||
add r12, r12, #2*8
|
||||
vld1.32 {d1[0]}, [r9, :32] // px
|
||||
vld1.32 {d1[1]}, [r12, :32] // px
|
||||
.endif
|
||||
|
||||
vmov.u8 q1, #0 // sum
|
||||
vmov.u8 q2, #0 // sum
|
||||
.if \min
|
||||
vmov.u16 q3, q0 // min
|
||||
vmov.u16 q4, q0 // max
|
||||
.endif
|
||||
|
||||
// Instead of loading sec_taps 2, 1 from memory, just set it
|
||||
// to 2 initially and decrease for the second round.
|
||||
// This is also used as loop counter.
|
||||
mov lr, #2 // sec_taps[0]
|
||||
|
||||
2:
|
||||
.if \pri
|
||||
ldrsb r9, [r5] // off1
|
||||
|
||||
load_px_8 d28, d29, d30, d31, \w
|
||||
.endif
|
||||
|
||||
.if \sec
|
||||
add r5, r5, #4 // +2*2
|
||||
ldrsb r9, [r5] // off2
|
||||
.endif
|
||||
|
||||
.if \pri
|
||||
ldrb r12, [r8] // *pri_taps
|
||||
vdup.8 q7, r3 // threshold
|
||||
|
||||
handle_pixel_8 q14, q15, q7, q5, r12, \min
|
||||
.endif
|
||||
|
||||
.if \sec
|
||||
load_px_8 d28, d29, d30, d31, \w
|
||||
|
||||
add r5, r5, #8 // +2*4
|
||||
ldrsb r9, [r5] // off3
|
||||
|
||||
vdup.8 q7, r4 // threshold
|
||||
|
||||
handle_pixel_8 q14, q15, q7, q6, lr, \min
|
||||
|
||||
load_px_8 d28, d29, d30, d31, \w
|
||||
|
||||
handle_pixel_8 q14, q15, q7, q6, lr, \min
|
||||
|
||||
sub r5, r5, #11 // r5 -= 2*(2+4); r5 += 1;
|
||||
.else
|
||||
add r5, r5, #1 // r5 += 1
|
||||
.endif
|
||||
subs lr, lr, #1 // sec_tap-- (value)
|
||||
.if \pri
|
||||
add r8, r8, #1 // pri_taps++ (pointer)
|
||||
.endif
|
||||
bne 2b
|
||||
|
||||
vshr.s16 q14, q1, #15 // -(sum < 0)
|
||||
vshr.s16 q15, q2, #15 // -(sum < 0)
|
||||
vadd.i16 q1, q1, q14 // sum - (sum < 0)
|
||||
vadd.i16 q2, q2, q15 // sum - (sum < 0)
|
||||
vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4
|
||||
vrshr.s16 q2, q2, #4 // (8 + sum - (sum < 0)) >> 4
|
||||
vaddw.u8 q1, q1, d0 // px + (8 + sum ...) >> 4
|
||||
vaddw.u8 q2, q2, d1 // px + (8 + sum ...) >> 4
|
||||
vqmovun.s16 d0, q1
|
||||
vqmovun.s16 d1, q2
|
||||
.if \min
|
||||
vmin.u8 q0, q0, q4
|
||||
vmax.u8 q0, q0, q3 // iclip(px + .., min, max)
|
||||
.endif
|
||||
.if \w == 8
|
||||
vst1.8 {d0}, [r0, :64], r1
|
||||
add r2, r2, #2*16 // tmp += 2*tmp_stride
|
||||
subs r7, r7, #2 // h -= 2
|
||||
vst1.8 {d1}, [r0, :64], r1
|
||||
.else
|
||||
vst1.32 {d0[0]}, [r0, :32], r1
|
||||
add r2, r2, #4*8 // tmp += 4*tmp_stride
|
||||
vst1.32 {d0[1]}, [r0, :32], r1
|
||||
subs r7, r7, #4 // h -= 4
|
||||
vst1.32 {d1[0]}, [r0, :32], r1
|
||||
vst1.32 {d1[1]}, [r0, :32], r1
|
||||
.endif
|
||||
|
||||
// Reset pri_taps and directions back to the original point
|
||||
sub r5, r5, #2
|
||||
.if \pri
|
||||
sub r8, r8, #2
|
||||
.endif
|
||||
|
||||
bgt 1b
|
||||
vpop {q4-q7}
|
||||
pop {r4-r9,pc}
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
.macro filter_8 w
|
||||
filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri
|
||||
filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec
|
||||
filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec
|
||||
.endm
|
||||
|
||||
filter_8 8
|
||||
filter_8 4
|
||||
|
||||
const div_table, align=4
|
||||
.short 840, 420, 280, 210, 168, 140, 120, 105
|
||||
endconst
|
||||
|
@ -451,9 +765,9 @@ const alt_fact, align=4
|
|||
.short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
|
||||
endconst
|
||||
|
||||
// int dav1d_cdef_find_dir_neon(const pixel *img, const ptrdiff_t stride,
|
||||
// unsigned *const var)
|
||||
function cdef_find_dir_neon, export=1
|
||||
// int dav1d_cdef_find_dir_8bpc_neon(const pixel *img, const ptrdiff_t stride,
|
||||
// unsigned *const var)
|
||||
function cdef_find_dir_8bpc_neon, export=1
|
||||
push {lr}
|
||||
vpush {q4-q7}
|
||||
sub sp, sp, #32 // cost
|
||||
|
|
|
@ -143,8 +143,8 @@ function lpf_8_wd\wd\()_neon
|
|||
vaddw.s8 q1, q1, d4
|
||||
vmov.i8 d7, #3
|
||||
vqmovn.s16 d2, q1 // f
|
||||
vqadd.s8 d4, d6, d2 // imin(f + 4, 128)
|
||||
vqadd.s8 d5, d7, d2 // imin(f + 3, 128)
|
||||
vqadd.s8 d4, d6, d2 // imin(f + 4, 127)
|
||||
vqadd.s8 d5, d7, d2 // imin(f + 3, 127)
|
||||
vshr.s8 d4, d4, #3 // f1
|
||||
vshr.s8 d5, d5, #3 // f2
|
||||
vmovl.u8 q1, d23 // p0
|
||||
|
@ -734,13 +734,13 @@ function lpf_h_16_8_neon
|
|||
bx r12
|
||||
endfunc
|
||||
|
||||
// void dav1d_lpf_v_sb_y_neon(pixel *dst, const ptrdiff_t stride,
|
||||
// const uint32_t *const vmask,
|
||||
// const uint8_t (*l)[4], ptrdiff_t b4_stride,
|
||||
// const Av1FilterLUT *lut, const int w)
|
||||
// void dav1d_lpf_v_sb_y_8bpc_neon(pixel *dst, const ptrdiff_t stride,
|
||||
// const uint32_t *const vmask,
|
||||
// const uint8_t (*l)[4], ptrdiff_t b4_stride,
|
||||
// const Av1FilterLUT *lut, const int w)
|
||||
|
||||
.macro lpf_func dir, type
|
||||
function lpf_\dir\()_sb_\type\()_neon, export=1
|
||||
function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1
|
||||
push {r4-r11,lr}
|
||||
vpush {q4-q7}
|
||||
ldrd r4, r5, [sp, #100]
|
||||
|
|
|
@ -28,11 +28,11 @@
|
|||
#include "src/arm/asm.S"
|
||||
#include "util.S"
|
||||
|
||||
// void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4],
|
||||
// const pixel *src, ptrdiff_t stride,
|
||||
// const int16_t fh[7], const intptr_t w,
|
||||
// int h, enum LrEdgeFlags edges);
|
||||
function wiener_filter_h_neon, export=1
|
||||
// void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4],
|
||||
// const pixel *src, ptrdiff_t stride,
|
||||
// const int16_t fh[7], const intptr_t w,
|
||||
// int h, enum LrEdgeFlags edges);
|
||||
function wiener_filter_h_8bpc_neon, export=1
|
||||
push {r4-r11,lr}
|
||||
vpush {q4}
|
||||
ldrd r4, r5, [sp, #52]
|
||||
|
@ -367,11 +367,11 @@ L(variable_shift_tbl):
|
|||
.purgem filter_4
|
||||
endfunc
|
||||
|
||||
// void dav1d_wiener_filter_v_neon(pixel *dst, ptrdiff_t stride,
|
||||
// const int16_t *mid, int w, int h,
|
||||
// const int16_t fv[7], enum LrEdgeFlags edges,
|
||||
// ptrdiff_t mid_stride);
|
||||
function wiener_filter_v_neon, export=1
|
||||
// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride,
|
||||
// const int16_t *mid, int w, int h,
|
||||
// const int16_t fv[7], enum LrEdgeFlags edges,
|
||||
// ptrdiff_t mid_stride);
|
||||
function wiener_filter_v_8bpc_neon, export=1
|
||||
push {r4-r7,lr}
|
||||
ldrd r4, r5, [sp, #20]
|
||||
ldrd r6, r7, [sp, #28]
|
||||
|
@ -548,9 +548,9 @@ function wiener_filter_v_neon, export=1
|
|||
.purgem filter
|
||||
endfunc
|
||||
|
||||
// void dav1d_copy_narrow_neon(pixel *dst, ptrdiff_t stride,
|
||||
// const pixel *src, int w, int h);
|
||||
function copy_narrow_neon, export=1
|
||||
// void dav1d_copy_narrow_8bpc_neon(pixel *dst, ptrdiff_t stride,
|
||||
// const pixel *src, int w, int h);
|
||||
function copy_narrow_8bpc_neon, export=1
|
||||
push {r4,lr}
|
||||
ldr r4, [sp, #8]
|
||||
adr r12, L(copy_narrow_tbl)
|
||||
|
@ -687,12 +687,12 @@ endfunc
|
|||
|
||||
#define SUM_STRIDE (384+16)
|
||||
|
||||
// void dav1d_sgr_box3_h_neon(int32_t *sumsq, int16_t *sum,
|
||||
// const pixel (*left)[4],
|
||||
// const pixel *src, const ptrdiff_t stride,
|
||||
// const int w, const int h,
|
||||
// const enum LrEdgeFlags edges);
|
||||
function sgr_box3_h_neon, export=1
|
||||
// void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
|
||||
// const pixel (*left)[4],
|
||||
// const pixel *src, const ptrdiff_t stride,
|
||||
// const int w, const int h,
|
||||
// const enum LrEdgeFlags edges);
|
||||
function sgr_box3_h_8bpc_neon, export=1
|
||||
push {r4-r11,lr}
|
||||
vpush {q4-q7}
|
||||
ldrd r4, r5, [sp, #100]
|
||||
|
@ -925,11 +925,11 @@ L(box3_variable_shift_tbl):
|
|||
vmull.u8 q6, d9, d9
|
||||
|
||||
add3 4
|
||||
subs r5, r5, #4
|
||||
vst1.16 {d6}, [r1, :64]!
|
||||
vst1.16 {d14}, [r11, :64]!
|
||||
vst1.32 {q12}, [r0, :128]!
|
||||
vst1.32 {q8}, [r10, :128]!
|
||||
subs r5, r5, #4
|
||||
ble 9f
|
||||
vext.8 q0, q0, q0, #4
|
||||
vext.8 q1, q1, q2, #8
|
||||
|
@ -961,12 +961,12 @@ L(box3_variable_shift_tbl):
|
|||
.purgem add3
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_box5_h_neon(int32_t *sumsq, int16_t *sum,
|
||||
// const pixel (*left)[4],
|
||||
// const pixel *src, const ptrdiff_t stride,
|
||||
// const int w, const int h,
|
||||
// const enum LrEdgeFlags edges);
|
||||
function sgr_box5_h_neon, export=1
|
||||
// void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
|
||||
// const pixel (*left)[4],
|
||||
// const pixel *src, const ptrdiff_t stride,
|
||||
// const int w, const int h,
|
||||
// const enum LrEdgeFlags edges);
|
||||
function sgr_box5_h_8bpc_neon, export=1
|
||||
push {r4-r11,lr}
|
||||
vpush {q4-q7}
|
||||
ldrd r4, r5, [sp, #100]
|
||||
|
@ -1038,7 +1038,7 @@ function sgr_box5_h_neon, export=1
|
|||
b 2f
|
||||
0:
|
||||
// !LR_HAVE_LEFT, fill q1 with the leftmost byte
|
||||
// and shift q0 to have 2x the first byte at the front.
|
||||
// and shift q0 to have 3x the first byte at the front.
|
||||
vdup.8 q1, d0[0]
|
||||
vdup.8 q5, d8[0]
|
||||
// Move r3 back to account for the last 3 bytes we loaded before,
|
||||
|
@ -1215,11 +1215,11 @@ L(box5_variable_shift_tbl):
|
|||
vmull.u8 q6, d9, d9
|
||||
|
||||
add5 4
|
||||
subs r5, r5, #4
|
||||
vst1.16 {d6}, [r1, :64]!
|
||||
vst1.16 {d14}, [r11, :64]!
|
||||
vst1.32 {q12}, [r0, :128]!
|
||||
vst1.32 {q10}, [r10, :128]!
|
||||
subs r5, r5, #4
|
||||
ble 9f
|
||||
vext.8 q0, q0, q0, #4
|
||||
vext.8 q1, q1, q2, #8
|
||||
|
@ -1661,11 +1661,11 @@ endfunc
|
|||
|
||||
#define FILTER_OUT_STRIDE 384
|
||||
|
||||
// void dav1d_sgr_finish_filter1_neon(coef *tmp,
|
||||
// const pixel *src, const ptrdiff_t stride,
|
||||
// const int32_t *a, const int16_t *b,
|
||||
// const int w, const int h);
|
||||
function sgr_finish_filter1_neon, export=1
|
||||
// void dav1d_sgr_finish_filter1_8bpc_neon(int16_t *tmp,
|
||||
// const pixel *src, const ptrdiff_t stride,
|
||||
// const int32_t *a, const int16_t *b,
|
||||
// const int w, const int h);
|
||||
function sgr_finish_filter1_8bpc_neon, export=1
|
||||
push {r4-r11,lr}
|
||||
vpush {q4-q7}
|
||||
ldrd r4, r5, [sp, #100]
|
||||
|
@ -1765,11 +1765,11 @@ function sgr_finish_filter1_neon, export=1
|
|||
pop {r4-r11,pc}
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_finish_filter2_neon(coef *tmp,
|
||||
// const pixel *src, const ptrdiff_t stride,
|
||||
// const int32_t *a, const int16_t *b,
|
||||
// const int w, const int h);
|
||||
function sgr_finish_filter2_neon, export=1
|
||||
// void dav1d_sgr_finish_filter2_8bpc_neon(int16_t *tmp,
|
||||
// const pixel *src, const ptrdiff_t stride,
|
||||
// const int32_t *a, const int16_t *b,
|
||||
// const int w, const int h);
|
||||
function sgr_finish_filter2_8bpc_neon, export=1
|
||||
push {r4-r11,lr}
|
||||
vpush {q4-q7}
|
||||
ldrd r4, r5, [sp, #100]
|
||||
|
@ -1925,11 +1925,11 @@ function sgr_finish_filter2_neon, export=1
|
|||
pop {r4-r11,pc}
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_weighted1_neon(pixel *dst, const ptrdiff_t dst_stride,
|
||||
// const pixel *src, const ptrdiff_t src_stride,
|
||||
// const coef *t1, const int w, const int h,
|
||||
// const int wt);
|
||||
function sgr_weighted1_neon, export=1
|
||||
// void dav1d_sgr_weighted1_8bpc_neon(pixel *dst, const ptrdiff_t dst_stride,
|
||||
// const pixel *src, const ptrdiff_t src_stride,
|
||||
// const int16_t *t1, const int w, const int h,
|
||||
// const int wt);
|
||||
function sgr_weighted1_8bpc_neon, export=1
|
||||
push {r4-r9,lr}
|
||||
ldrd r4, r5, [sp, #28]
|
||||
ldrd r6, r7, [sp, #36]
|
||||
|
@ -2009,12 +2009,12 @@ function sgr_weighted1_neon, export=1
|
|||
pop {r4-r9,pc}
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_weighted2_neon(pixel *dst, const ptrdiff_t stride,
|
||||
// const pixel *src, const ptrdiff_t src_stride,
|
||||
// const coef *t1, const coef *t2,
|
||||
// const int w, const int h,
|
||||
// const int16_t wt[2]);
|
||||
function sgr_weighted2_neon, export=1
|
||||
// void dav1d_sgr_weighted2_8bpc_neon(pixel *dst, const ptrdiff_t stride,
|
||||
// const pixel *src, const ptrdiff_t src_stride,
|
||||
// const int16_t *t1, const int16_t *t2,
|
||||
// const int w, const int h,
|
||||
// const int16_t wt[2]);
|
||||
function sgr_weighted2_8bpc_neon, export=1
|
||||
push {r4-r11,lr}
|
||||
ldrd r4, r5, [sp, #36]
|
||||
ldrd r6, r7, [sp, #44]
|
||||
|
|
|
@ -753,7 +753,7 @@ L(blend_v_tbl):
|
|||
add r12, r0, r1
|
||||
lsl r1, r1, #1
|
||||
vsub.i8 d5, d22, d4
|
||||
sub r1, r1, #3
|
||||
sub r1, r1, #2
|
||||
4:
|
||||
vld1.u8 {d2}, [r2, :64]!
|
||||
vld1.32 {d0[]}, [r0, :32]
|
||||
|
@ -764,10 +764,8 @@ L(blend_v_tbl):
|
|||
vrshrn.i16 d20, q3, #6
|
||||
vst1.16 {d20[0]}, [r0, :16]!
|
||||
vst1.16 {d20[2]}, [r12, :16]!
|
||||
vst1.8 {d20[2]}, [r0]!
|
||||
vst1.8 {d20[6]}, [r12]!
|
||||
add r0, r0, r1
|
||||
add r12, r12, r1
|
||||
vst1.8 {d20[2]}, [r0], r1
|
||||
vst1.8 {d20[6]}, [r12], r1
|
||||
bgt 4b
|
||||
pop {r4-r5,pc}
|
||||
80:
|
||||
|
@ -776,7 +774,7 @@ L(blend_v_tbl):
|
|||
add r12, r0, r1
|
||||
lsl r1, r1, #1
|
||||
vsub.i8 d17, d16, d2
|
||||
sub r1, r1, #6
|
||||
sub r1, r1, #4
|
||||
8:
|
||||
vld1.u8 {d4, d5}, [r2, :128]!
|
||||
vld1.u8 {d0}, [r0, :64]
|
||||
|
@ -790,10 +788,8 @@ L(blend_v_tbl):
|
|||
vrshrn.i16 d23, q10, #6
|
||||
vst1.32 {d22[0]}, [r0, :32]!
|
||||
vst1.32 {d23[0]}, [r12, :32]!
|
||||
vst1.16 {d22[2]}, [r0, :16]!
|
||||
vst1.16 {d23[2]}, [r12, :16]!
|
||||
add r0, r0, r1
|
||||
add r12, r12, r1
|
||||
vst1.16 {d22[2]}, [r0, :16], r1
|
||||
vst1.16 {d23[2]}, [r12, :16], r1
|
||||
bgt 8b
|
||||
pop {r4-r5,pc}
|
||||
160:
|
||||
|
@ -802,7 +798,7 @@ L(blend_v_tbl):
|
|||
add r12, r0, r1
|
||||
lsl r1, r1, #1
|
||||
vsub.i8 q11, q12, q14
|
||||
sub r1, r1, #12
|
||||
sub r1, r1, #8
|
||||
16:
|
||||
vld1.u8 {q1, q2}, [r2, :128]!
|
||||
vld1.u8 {q0}, [r0, :128]
|
||||
|
@ -822,20 +818,18 @@ L(blend_v_tbl):
|
|||
vrshrn.i16 d21, q8, #6
|
||||
vst1.u8 {d18}, [r0, :64]!
|
||||
vst1.u8 {d20}, [r12, :64]!
|
||||
vst1.32 {d19[0]}, [r0, :32]!
|
||||
vst1.32 {d21[0]}, [r12, :32]!
|
||||
add r0, r0, r1
|
||||
add r12, r12, r1
|
||||
vst1.32 {d19[0]}, [r0, :32], r1
|
||||
vst1.32 {d21[0]}, [r12, :32], r1
|
||||
bgt 16b
|
||||
pop {r4-r5,pc}
|
||||
320:
|
||||
vmov.i8 q10, #64
|
||||
vld1.u8 {q2, q3}, [r5, :128]
|
||||
vsub.i8 q11, q10, q2
|
||||
vsub.i8 q12, q10, q3
|
||||
vsub.i8 d24, d20, d6
|
||||
32:
|
||||
vld1.u8 {q8, q9}, [r2, :128]!
|
||||
vld1.u8 {q0, q1}, [r0, :128]
|
||||
vld1.u8 {d0, d1, d2}, [r0, :64]
|
||||
subs r4, r4, #1
|
||||
vmull.u8 q15, d16, d4
|
||||
vmlal.u8 q15, d0, d22
|
||||
|
|
|
@ -27,6 +27,7 @@
|
|||
|
||||
#include "src/arm/asm.S"
|
||||
#include "util.S"
|
||||
#include "cdef_tmpl.S"
|
||||
|
||||
.macro pad_top_bottom s1, s2, w, stride, rn, rw, ret
|
||||
tst w6, #1 // CDEF_HAVE_LEFT
|
||||
|
@ -137,13 +138,15 @@
|
|||
.endif
|
||||
.endm
|
||||
|
||||
// void dav1d_cdef_paddingX_neon(uint16_t *tmp, const pixel *src,
|
||||
// ptrdiff_t src_stride, const pixel (*left)[2],
|
||||
// /*const*/ pixel *const top[2], int h,
|
||||
// enum CdefEdgeFlags edges);
|
||||
// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,
|
||||
// ptrdiff_t src_stride, const pixel (*left)[2],
|
||||
// const pixel *const top, int h,
|
||||
// enum CdefEdgeFlags edges);
|
||||
|
||||
.macro padding_func w, stride, rn, rw
|
||||
function cdef_padding\w\()_neon, export=1
|
||||
function cdef_padding\w\()_8bpc_neon, export=1
|
||||
cmp w6, #0xf // fully edged
|
||||
b.eq cdef_padding\w\()_edged_8bpc_neon
|
||||
movi v30.8h, #0x80, lsl #8
|
||||
mov v31.16b, v30.16b
|
||||
sub x0, x0, #2*(2*\stride+2)
|
||||
|
@ -157,9 +160,8 @@ function cdef_padding\w\()_neon, export=1
|
|||
b 3f
|
||||
1:
|
||||
// CDEF_HAVE_TOP
|
||||
ldr x8, [x4]
|
||||
ldr x9, [x4, #8]
|
||||
pad_top_bottom x8, x9, \w, \stride, \rn, \rw, 0
|
||||
add x9, x4, x2
|
||||
pad_top_bottom x4, x9, \w, \stride, \rn, \rw, 0
|
||||
|
||||
// Middle section
|
||||
3:
|
||||
|
@ -242,358 +244,274 @@ endfunc
|
|||
padding_func 8, 16, d, q
|
||||
padding_func 4, 8, s, d
|
||||
|
||||
.macro dir_table w, stride
|
||||
const directions\w
|
||||
.byte -1 * \stride + 1, -2 * \stride + 2
|
||||
.byte 0 * \stride + 1, -1 * \stride + 2
|
||||
.byte 0 * \stride + 1, 0 * \stride + 2
|
||||
.byte 0 * \stride + 1, 1 * \stride + 2
|
||||
.byte 1 * \stride + 1, 2 * \stride + 2
|
||||
.byte 1 * \stride + 0, 2 * \stride + 1
|
||||
.byte 1 * \stride + 0, 2 * \stride + 0
|
||||
.byte 1 * \stride + 0, 2 * \stride - 1
|
||||
// Repeated, to avoid & 7
|
||||
.byte -1 * \stride + 1, -2 * \stride + 2
|
||||
.byte 0 * \stride + 1, -1 * \stride + 2
|
||||
.byte 0 * \stride + 1, 0 * \stride + 2
|
||||
.byte 0 * \stride + 1, 1 * \stride + 2
|
||||
.byte 1 * \stride + 1, 2 * \stride + 2
|
||||
.byte 1 * \stride + 0, 2 * \stride + 1
|
||||
endconst
|
||||
// void cdef_paddingX_edged_8bpc_neon(uint8_t *tmp, const pixel *src,
|
||||
// ptrdiff_t src_stride, const pixel (*left)[2],
|
||||
// const pixel *const top, int h,
|
||||
// enum CdefEdgeFlags edges);
|
||||
|
||||
.macro padding_func_edged w, stride, reg
|
||||
function cdef_padding\w\()_edged_8bpc_neon, export=1
|
||||
sub x4, x4, #2
|
||||
sub x0, x0, #(2*\stride+2)
|
||||
|
||||
.if \w == 4
|
||||
ldr d0, [x4]
|
||||
ldr d1, [x4, x2]
|
||||
st1 {v0.8b, v1.8b}, [x0], #16
|
||||
.else
|
||||
add x9, x4, x2
|
||||
ldr d0, [x4]
|
||||
ldr s1, [x4, #8]
|
||||
ldr d2, [x9]
|
||||
ldr s3, [x9, #8]
|
||||
str d0, [x0]
|
||||
str s1, [x0, #8]
|
||||
str d2, [x0, #\stride]
|
||||
str s3, [x0, #\stride+8]
|
||||
add x0, x0, #2*\stride
|
||||
.endif
|
||||
|
||||
0:
|
||||
ld1 {v0.h}[0], [x3], #2
|
||||
ldr h2, [x1, #\w]
|
||||
load_n_incr v1, x1, x2, \w
|
||||
subs w5, w5, #1
|
||||
str h0, [x0]
|
||||
stur \reg\()1, [x0, #2]
|
||||
str h2, [x0, #2+\w]
|
||||
add x0, x0, #\stride
|
||||
b.gt 0b
|
||||
|
||||
sub x1, x1, #2
|
||||
.if \w == 4
|
||||
ldr d0, [x1]
|
||||
ldr d1, [x1, x2]
|
||||
st1 {v0.8b, v1.8b}, [x0], #16
|
||||
.else
|
||||
add x9, x1, x2
|
||||
ldr d0, [x1]
|
||||
ldr s1, [x1, #8]
|
||||
ldr d2, [x9]
|
||||
ldr s3, [x9, #8]
|
||||
str d0, [x0]
|
||||
str s1, [x0, #8]
|
||||
str d2, [x0, #\stride]
|
||||
str s3, [x0, #\stride+8]
|
||||
.endif
|
||||
ret
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
dir_table 8, 16
|
||||
dir_table 4, 8
|
||||
padding_func_edged 8, 16, d
|
||||
padding_func_edged 4, 8, s
|
||||
|
||||
const pri_taps
|
||||
.byte 4, 2, 3, 3
|
||||
endconst
|
||||
tables
|
||||
|
||||
.macro load_px d1, d2, w
|
||||
filter 8, 8
|
||||
filter 4, 8
|
||||
|
||||
find_dir 8
|
||||
|
||||
.macro load_px_8 d1, d2, w
|
||||
.if \w == 8
|
||||
add x6, x2, w9, sxtb #1 // x + off
|
||||
sub x9, x2, w9, sxtb #1 // x - off
|
||||
ld1 {\d1\().8h}, [x6] // p0
|
||||
ld1 {\d2\().8h}, [x9] // p1
|
||||
.else
|
||||
add x6, x2, w9, sxtb #1 // x + off
|
||||
sub x9, x2, w9, sxtb #1 // x - off
|
||||
ld1 {\d1\().4h}, [x6] // p0
|
||||
add x6, x6, #2*8 // += stride
|
||||
ld1 {\d2\().4h}, [x9] // p1
|
||||
add x9, x9, #2*8 // += stride
|
||||
add x6, x2, w9, sxtb // x + off
|
||||
sub x9, x2, w9, sxtb // x - off
|
||||
ld1 {\d1\().d}[0], [x6] // p0
|
||||
add x6, x6, #16 // += stride
|
||||
ld1 {\d2\().d}[0], [x9] // p1
|
||||
add x9, x9, #16 // += stride
|
||||
ld1 {\d1\().d}[1], [x6] // p0
|
||||
ld1 {\d2\().d}[1], [x9] // p1
|
||||
ld1 {\d2\().d}[1], [x9] // p0
|
||||
.else
|
||||
add x6, x2, w9, sxtb // x + off
|
||||
sub x9, x2, w9, sxtb // x - off
|
||||
ld1 {\d1\().s}[0], [x6] // p0
|
||||
add x6, x6, #8 // += stride
|
||||
ld1 {\d2\().s}[0], [x9] // p1
|
||||
add x9, x9, #8 // += stride
|
||||
ld1 {\d1\().s}[1], [x6] // p0
|
||||
add x6, x6, #8 // += stride
|
||||
ld1 {\d2\().s}[1], [x9] // p1
|
||||
add x9, x9, #8 // += stride
|
||||
ld1 {\d1\().s}[2], [x6] // p0
|
||||
add x6, x6, #8 // += stride
|
||||
ld1 {\d2\().s}[2], [x9] // p1
|
||||
add x9, x9, #8 // += stride
|
||||
ld1 {\d1\().s}[3], [x6] // p0
|
||||
ld1 {\d2\().s}[3], [x9] // p1
|
||||
.endif
|
||||
.endm
|
||||
.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap
|
||||
umin v2.8h, v2.8h, \s1\().8h
|
||||
smax v3.8h, v3.8h, \s1\().8h
|
||||
umin v2.8h, v2.8h, \s2\().8h
|
||||
smax v3.8h, v3.8h, \s2\().8h
|
||||
|
||||
cbz \threshold, 3f
|
||||
uabd v16.8h, v0.8h, \s1\().8h // abs(diff)
|
||||
uabd v20.8h, v0.8h, \s2\().8h // abs(diff)
|
||||
ushl v17.8h, v16.8h, \shift // abs(diff) >> shift
|
||||
ushl v21.8h, v20.8h, \shift // abs(diff) >> shift
|
||||
uqsub v17.8h, \thresh_vec, v17.8h // clip = imax(0, threshold - (abs(diff) >> shift))
|
||||
uqsub v21.8h, \thresh_vec, v21.8h // clip = imax(0, threshold - (abs(diff) >> shift))
|
||||
sub v18.8h, \s1\().8h, v0.8h // diff = p0 - px
|
||||
sub v22.8h, \s2\().8h, v0.8h // diff = p1 - px
|
||||
neg v16.8h, v17.8h // -clip
|
||||
neg v20.8h, v21.8h // -clip
|
||||
smin v18.8h, v18.8h, v17.8h // imin(diff, clip)
|
||||
smin v22.8h, v22.8h, v21.8h // imin(diff, clip)
|
||||
dup v19.8h, \tap // taps[k]
|
||||
smax v18.8h, v18.8h, v16.8h // constrain() = imax(imin(diff, clip), -clip)
|
||||
smax v22.8h, v22.8h, v20.8h // constrain() = imax(imin(diff, clip), -clip)
|
||||
mla v1.8h, v18.8h, v19.8h // sum += taps[k] * constrain()
|
||||
mla v1.8h, v22.8h, v19.8h // sum += taps[k] * constrain()
|
||||
3:
|
||||
.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min
|
||||
.if \min
|
||||
umin v3.16b, v3.16b, \s1\().16b
|
||||
umax v4.16b, v4.16b, \s1\().16b
|
||||
umin v3.16b, v3.16b, \s2\().16b
|
||||
umax v4.16b, v4.16b, \s2\().16b
|
||||
.endif
|
||||
uabd v16.16b, v0.16b, \s1\().16b // abs(diff)
|
||||
uabd v20.16b, v0.16b, \s2\().16b // abs(diff)
|
||||
ushl v17.16b, v16.16b, \shift // abs(diff) >> shift
|
||||
ushl v21.16b, v20.16b, \shift // abs(diff) >> shift
|
||||
uqsub v17.16b, \thresh_vec, v17.16b // clip = imax(0, threshold - (abs(diff) >> shift))
|
||||
uqsub v21.16b, \thresh_vec, v21.16b // clip = imax(0, threshold - (abs(diff) >> shift))
|
||||
cmhi v18.16b, v0.16b, \s1\().16b // px > p0
|
||||
cmhi v22.16b, v0.16b, \s2\().16b // px > p1
|
||||
umin v17.16b, v17.16b, v16.16b // imin(abs(diff), clip)
|
||||
umin v21.16b, v21.16b, v20.16b // imin(abs(diff), clip)
|
||||
dup v19.16b, \tap // taps[k]
|
||||
neg v16.16b, v17.16b // -imin()
|
||||
neg v20.16b, v21.16b // -imin()
|
||||
bsl v18.16b, v16.16b, v17.16b // constrain() = apply_sign()
|
||||
bsl v22.16b, v20.16b, v21.16b // constrain() = apply_sign()
|
||||
smlal v1.8h, v18.8b, v19.8b // sum += taps[k] * constrain()
|
||||
smlal v1.8h, v22.8b, v19.8b // sum += taps[k] * constrain()
|
||||
smlal2 v2.8h, v18.16b, v19.16b // sum += taps[k] * constrain()
|
||||
smlal2 v2.8h, v22.16b, v19.16b // sum += taps[k] * constrain()
|
||||
.endm
|
||||
|
||||
// void dav1d_cdef_filterX_neon(pixel *dst, ptrdiff_t dst_stride,
|
||||
// const uint16_t *tmp, int pri_strength,
|
||||
// int sec_strength, int dir, int damping, int h);
|
||||
.macro filter w
|
||||
function cdef_filter\w\()_neon, export=1
|
||||
// void cdef_filterX_edged_8bpc_neon(pixel *dst, ptrdiff_t dst_stride,
|
||||
// const uint8_t *tmp, int pri_strength,
|
||||
// int sec_strength, int dir, int damping,
|
||||
// int h);
|
||||
.macro filter_func_8 w, pri, sec, min, suffix
|
||||
function cdef_filter\w\suffix\()_edged_8bpc_neon
|
||||
.if \pri
|
||||
movrel x8, pri_taps
|
||||
and w9, w3, #1
|
||||
add x8, x8, w9, uxtw #1
|
||||
.endif
|
||||
movrel x9, directions\w
|
||||
add x5, x9, w5, uxtw #1
|
||||
movi v30.4h, #15
|
||||
dup v28.4h, w6 // damping
|
||||
movi v30.8b, #7
|
||||
dup v28.8b, w6 // damping
|
||||
|
||||
dup v25.8h, w3 // threshold
|
||||
dup v27.8h, w4 // threshold
|
||||
trn1 v24.4h, v25.4h, v27.4h
|
||||
clz v24.4h, v24.4h // clz(threshold)
|
||||
sub v24.4h, v30.4h, v24.4h // ulog2(threshold)
|
||||
uqsub v24.4h, v28.4h, v24.4h // shift = imax(0, damping - ulog2(threshold))
|
||||
neg v24.4h, v24.4h // -shift
|
||||
dup v26.8h, v24.h[1]
|
||||
dup v24.8h, v24.h[0]
|
||||
.if \pri
|
||||
dup v25.16b, w3 // threshold
|
||||
.endif
|
||||
.if \sec
|
||||
dup v27.16b, w4 // threshold
|
||||
.endif
|
||||
trn1 v24.8b, v25.8b, v27.8b
|
||||
clz v24.8b, v24.8b // clz(threshold)
|
||||
sub v24.8b, v30.8b, v24.8b // ulog2(threshold)
|
||||
uqsub v24.8b, v28.8b, v24.8b // shift = imax(0, damping - ulog2(threshold))
|
||||
neg v24.8b, v24.8b // -shift
|
||||
.if \sec
|
||||
dup v26.16b, v24.b[1]
|
||||
.endif
|
||||
.if \pri
|
||||
dup v24.16b, v24.b[0]
|
||||
.endif
|
||||
|
||||
1:
|
||||
.if \w == 8
|
||||
ld1 {v0.8h}, [x2] // px
|
||||
.else
|
||||
add x12, x2, #2*8
|
||||
ld1 {v0.4h}, [x2] // px
|
||||
add x12, x2, #16
|
||||
ld1 {v0.d}[0], [x2] // px
|
||||
ld1 {v0.d}[1], [x12] // px
|
||||
.else
|
||||
add x12, x2, #1*8
|
||||
add x13, x2, #2*8
|
||||
add x14, x2, #3*8
|
||||
ld1 {v0.s}[0], [x2] // px
|
||||
ld1 {v0.s}[1], [x12] // px
|
||||
ld1 {v0.s}[2], [x13] // px
|
||||
ld1 {v0.s}[3], [x14] // px
|
||||
.endif
|
||||
|
||||
movi v1.8h, #0 // sum
|
||||
mov v2.16b, v0.16b // min
|
||||
mov v3.16b, v0.16b // max
|
||||
movi v2.8h, #0 // sum
|
||||
.if \min
|
||||
mov v3.16b, v0.16b // min
|
||||
mov v4.16b, v0.16b // max
|
||||
.endif
|
||||
|
||||
// Instead of loading sec_taps 2, 1 from memory, just set it
|
||||
// to 2 initially and decrease for the second round.
|
||||
// This is also used as loop counter.
|
||||
mov w11, #2 // sec_taps[0]
|
||||
|
||||
2:
|
||||
.if \pri
|
||||
ldrb w9, [x5] // off1
|
||||
|
||||
load_px v4, v5, \w
|
||||
|
||||
add x5, x5, #4 // +2*2
|
||||
ldrb w9, [x5] // off2
|
||||
load_px v6, v7, \w
|
||||
|
||||
ldrb w10, [x8] // *pri_taps
|
||||
|
||||
handle_pixel v4, v5, w3, v25.8h, v24.8h, w10
|
||||
|
||||
add x5, x5, #8 // +2*4
|
||||
ldrb w9, [x5] // off3
|
||||
load_px v4, v5, \w
|
||||
|
||||
handle_pixel v6, v7, w4, v27.8h, v26.8h, w11
|
||||
|
||||
handle_pixel v4, v5, w4, v27.8h, v26.8h, w11
|
||||
|
||||
sub x5, x5, #11 // x8 -= 2*(2+4); x8 += 1;
|
||||
subs w11, w11, #1 // sec_tap-- (value)
|
||||
add x8, x8, #1 // pri_taps++ (pointer)
|
||||
b.ne 2b
|
||||
|
||||
sshr v4.8h, v1.8h, #15 // -(sum < 0)
|
||||
add v1.8h, v1.8h, v4.8h // sum - (sum < 0)
|
||||
srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4
|
||||
add v0.8h, v0.8h, v1.8h // px + (8 + sum ...) >> 4
|
||||
smin v0.8h, v0.8h, v3.8h
|
||||
smax v0.8h, v0.8h, v2.8h // iclip(px + .., min, max)
|
||||
xtn v0.8b, v0.8h
|
||||
.if \w == 8
|
||||
add x2, x2, #2*16 // tmp += tmp_stride
|
||||
subs w7, w7, #1 // h--
|
||||
st1 {v0.8b}, [x0], x1
|
||||
.else
|
||||
st1 {v0.s}[0], [x0], x1
|
||||
add x2, x2, #2*16 // tmp += 2*tmp_stride
|
||||
subs w7, w7, #2 // h -= 2
|
||||
st1 {v0.s}[1], [x0], x1
|
||||
load_px_8 v5, v6, \w
|
||||
.endif
|
||||
|
||||
// Reset pri_taps/sec_taps back to the original point
|
||||
.if \sec
|
||||
add x5, x5, #4 // +2*2
|
||||
ldrb w9, [x5] // off2
|
||||
load_px_8 v28, v29, \w
|
||||
.endif
|
||||
|
||||
.if \pri
|
||||
ldrb w10, [x8] // *pri_taps
|
||||
|
||||
handle_pixel_8 v5, v6, v25.16b, v24.16b, w10, \min
|
||||
.endif
|
||||
|
||||
.if \sec
|
||||
add x5, x5, #8 // +2*4
|
||||
ldrb w9, [x5] // off3
|
||||
load_px_8 v5, v6, \w
|
||||
|
||||
handle_pixel_8 v28, v29, v27.16b, v26.16b, w11, \min
|
||||
|
||||
handle_pixel_8 v5, v6, v27.16b, v26.16b, w11, \min
|
||||
|
||||
sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1;
|
||||
.else
|
||||
add x5, x5, #1 // x5 += 1
|
||||
.endif
|
||||
subs w11, w11, #1 // sec_tap-- (value)
|
||||
.if \pri
|
||||
add x8, x8, #1 // pri_taps++ (pointer)
|
||||
.endif
|
||||
b.ne 2b
|
||||
|
||||
sshr v5.8h, v1.8h, #15 // -(sum < 0)
|
||||
sshr v6.8h, v2.8h, #15 // -(sum < 0)
|
||||
add v1.8h, v1.8h, v5.8h // sum - (sum < 0)
|
||||
add v2.8h, v2.8h, v6.8h // sum - (sum < 0)
|
||||
srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4
|
||||
srshr v2.8h, v2.8h, #4 // (8 + sum - (sum < 0)) >> 4
|
||||
uaddw v1.8h, v1.8h, v0.8b // px + (8 + sum ...) >> 4
|
||||
uaddw2 v2.8h, v2.8h, v0.16b // px + (8 + sum ...) >> 4
|
||||
sqxtun v0.8b, v1.8h
|
||||
sqxtun2 v0.16b, v2.8h
|
||||
.if \min
|
||||
umin v0.16b, v0.16b, v4.16b
|
||||
umax v0.16b, v0.16b, v3.16b // iclip(px + .., min, max)
|
||||
.endif
|
||||
.if \w == 8
|
||||
st1 {v0.d}[0], [x0], x1
|
||||
add x2, x2, #2*16 // tmp += 2*tmp_stride
|
||||
subs w7, w7, #2 // h -= 2
|
||||
st1 {v0.d}[1], [x0], x1
|
||||
.else
|
||||
st1 {v0.s}[0], [x0], x1
|
||||
add x2, x2, #4*8 // tmp += 4*tmp_stride
|
||||
st1 {v0.s}[1], [x0], x1
|
||||
subs w7, w7, #4 // h -= 4
|
||||
st1 {v0.s}[2], [x0], x1
|
||||
st1 {v0.s}[3], [x0], x1
|
||||
.endif
|
||||
|
||||
// Reset pri_taps and directions back to the original point
|
||||
sub x5, x5, #2
|
||||
.if \pri
|
||||
sub x8, x8, #2
|
||||
.endif
|
||||
|
||||
b.gt 1b
|
||||
ret
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
filter 8
|
||||
filter 4
|
||||
|
||||
const div_table
|
||||
.short 840, 420, 280, 210, 168, 140, 120, 105
|
||||
endconst
|
||||
|
||||
const alt_fact
|
||||
.short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
|
||||
endconst
|
||||
|
||||
// int dav1d_cdef_find_dir_neon(const pixel *img, const ptrdiff_t stride,
|
||||
// unsigned *const var)
|
||||
function cdef_find_dir_neon, export=1
|
||||
sub sp, sp, #32 // cost
|
||||
mov w3, #8
|
||||
movi v31.16b, #128
|
||||
movi v30.16b, #0
|
||||
movi v1.8h, #0 // v0-v1 sum_diag[0]
|
||||
movi v3.8h, #0 // v2-v3 sum_diag[1]
|
||||
movi v5.8h, #0 // v4-v5 sum_hv[0-1]
|
||||
movi v7.8h, #0 // v6-v7 sum_alt[0]
|
||||
movi v17.8h, #0 // v16-v17 sum_alt[1]
|
||||
movi v18.8h, #0 // v18-v19 sum_alt[2]
|
||||
movi v19.8h, #0
|
||||
movi v21.8h, #0 // v20-v21 sum_alt[3]
|
||||
|
||||
.irpc i, 01234567
|
||||
ld1 {v26.8b}, [x0], x1
|
||||
usubl v26.8h, v26.8b, v31.8b
|
||||
|
||||
addv h25, v26.8h // [y]
|
||||
rev64 v27.8h, v26.8h
|
||||
addp v28.8h, v26.8h, v30.8h // [(x >> 1)]
|
||||
add v5.8h, v5.8h, v26.8h // sum_hv[1]
|
||||
ext v27.16b, v27.16b, v27.16b, #8 // [-x]
|
||||
rev64 v29.4h, v28.4h // [-(x >> 1)]
|
||||
ins v4.h[\i], v25.h[0] // sum_hv[0]
|
||||
|
||||
.if \i == 0
|
||||
mov v0.16b, v26.16b // sum_diag[0]
|
||||
mov v2.16b, v27.16b // sum_diag[1]
|
||||
mov v6.16b, v28.16b // sum_alt[0]
|
||||
mov v16.16b, v29.16b // sum_alt[1]
|
||||
.else
|
||||
ext v22.16b, v30.16b, v26.16b, #(16-2*\i)
|
||||
ext v23.16b, v26.16b, v30.16b, #(16-2*\i)
|
||||
ext v24.16b, v30.16b, v27.16b, #(16-2*\i)
|
||||
ext v25.16b, v27.16b, v30.16b, #(16-2*\i)
|
||||
add v0.8h, v0.8h, v22.8h // sum_diag[0]
|
||||
add v1.8h, v1.8h, v23.8h // sum_diag[0]
|
||||
add v2.8h, v2.8h, v24.8h // sum_diag[1]
|
||||
add v3.8h, v3.8h, v25.8h // sum_diag[1]
|
||||
ext v22.16b, v30.16b, v28.16b, #(16-2*\i)
|
||||
ext v23.16b, v28.16b, v30.16b, #(16-2*\i)
|
||||
ext v24.16b, v30.16b, v29.16b, #(16-2*\i)
|
||||
ext v25.16b, v29.16b, v30.16b, #(16-2*\i)
|
||||
add v6.8h, v6.8h, v22.8h // sum_alt[0]
|
||||
add v7.4h, v7.4h, v23.4h // sum_alt[0]
|
||||
add v16.8h, v16.8h, v24.8h // sum_alt[1]
|
||||
add v17.4h, v17.4h, v25.4h // sum_alt[1]
|
||||
.endif
|
||||
.if \i < 6
|
||||
ext v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2)))
|
||||
ext v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2)))
|
||||
add v18.8h, v18.8h, v22.8h // sum_alt[2]
|
||||
add v19.4h, v19.4h, v23.4h // sum_alt[2]
|
||||
.else
|
||||
add v18.8h, v18.8h, v26.8h // sum_alt[2]
|
||||
.endif
|
||||
.if \i == 0
|
||||
mov v20.16b, v26.16b // sum_alt[3]
|
||||
.elseif \i == 1
|
||||
add v20.8h, v20.8h, v26.8h // sum_alt[3]
|
||||
.else
|
||||
ext v24.16b, v30.16b, v26.16b, #(16-2*(\i/2))
|
||||
ext v25.16b, v26.16b, v30.16b, #(16-2*(\i/2))
|
||||
add v20.8h, v20.8h, v24.8h // sum_alt[3]
|
||||
add v21.4h, v21.4h, v25.4h // sum_alt[3]
|
||||
.endif
|
||||
.endr
|
||||
|
||||
movi v31.4s, #105
|
||||
|
||||
smull v26.4s, v4.4h, v4.4h // sum_hv[0]*sum_hv[0]
|
||||
smlal2 v26.4s, v4.8h, v4.8h
|
||||
smull v27.4s, v5.4h, v5.4h // sum_hv[1]*sum_hv[1]
|
||||
smlal2 v27.4s, v5.8h, v5.8h
|
||||
mul v26.4s, v26.4s, v31.4s // cost[2] *= 105
|
||||
mul v27.4s, v27.4s, v31.4s // cost[6] *= 105
|
||||
addv s4, v26.4s // cost[2]
|
||||
addv s5, v27.4s // cost[6]
|
||||
|
||||
rev64 v1.8h, v1.8h
|
||||
rev64 v3.8h, v3.8h
|
||||
ext v1.16b, v1.16b, v1.16b, #10 // sum_diag[0][14-n]
|
||||
ext v3.16b, v3.16b, v3.16b, #10 // sum_diag[1][14-n]
|
||||
|
||||
str s4, [sp, #2*4] // cost[2]
|
||||
str s5, [sp, #6*4] // cost[6]
|
||||
|
||||
movrel x4, div_table
|
||||
ld1 {v31.8h}, [x4]
|
||||
|
||||
smull v22.4s, v0.4h, v0.4h // sum_diag[0]*sum_diag[0]
|
||||
smull2 v23.4s, v0.8h, v0.8h
|
||||
smlal v22.4s, v1.4h, v1.4h
|
||||
smlal2 v23.4s, v1.8h, v1.8h
|
||||
smull v24.4s, v2.4h, v2.4h // sum_diag[1]*sum_diag[1]
|
||||
smull2 v25.4s, v2.8h, v2.8h
|
||||
smlal v24.4s, v3.4h, v3.4h
|
||||
smlal2 v25.4s, v3.8h, v3.8h
|
||||
uxtl v30.4s, v31.4h // div_table
|
||||
uxtl2 v31.4s, v31.8h
|
||||
mul v22.4s, v22.4s, v30.4s // cost[0]
|
||||
mla v22.4s, v23.4s, v31.4s // cost[0]
|
||||
mul v24.4s, v24.4s, v30.4s // cost[4]
|
||||
mla v24.4s, v25.4s, v31.4s // cost[4]
|
||||
addv s0, v22.4s // cost[0]
|
||||
addv s2, v24.4s // cost[4]
|
||||
|
||||
movrel x5, alt_fact
|
||||
ld1 {v29.4h, v30.4h, v31.4h}, [x5]// div_table[2*m+1] + 105
|
||||
|
||||
str s0, [sp, #0*4] // cost[0]
|
||||
str s2, [sp, #4*4] // cost[4]
|
||||
|
||||
uxtl v29.4s, v29.4h // div_table[2*m+1] + 105
|
||||
uxtl v30.4s, v30.4h
|
||||
uxtl v31.4s, v31.4h
|
||||
|
||||
.macro cost_alt d1, d2, s1, s2, s3, s4
|
||||
smull v22.4s, \s1\().4h, \s1\().4h // sum_alt[n]*sum_alt[n]
|
||||
smull2 v23.4s, \s1\().8h, \s1\().8h
|
||||
smull v24.4s, \s2\().4h, \s2\().4h
|
||||
smull v25.4s, \s3\().4h, \s3\().4h // sum_alt[n]*sum_alt[n]
|
||||
smull2 v26.4s, \s3\().8h, \s3\().8h
|
||||
smull v27.4s, \s4\().4h, \s4\().4h
|
||||
mul v22.4s, v22.4s, v29.4s // sum_alt[n]^2*fact
|
||||
mla v22.4s, v23.4s, v30.4s
|
||||
mla v22.4s, v24.4s, v31.4s
|
||||
mul v25.4s, v25.4s, v29.4s // sum_alt[n]^2*fact
|
||||
mla v25.4s, v26.4s, v30.4s
|
||||
mla v25.4s, v27.4s, v31.4s
|
||||
addv \d1, v22.4s // *cost_ptr
|
||||
addv \d2, v25.4s // *cost_ptr
|
||||
.macro filter_8 w
|
||||
filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri
|
||||
filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec
|
||||
filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec
|
||||
.endm
|
||||
cost_alt s6, s16, v6, v7, v16, v17 // cost[1], cost[3]
|
||||
cost_alt s18, s20, v18, v19, v20, v21 // cost[5], cost[7]
|
||||
str s6, [sp, #1*4] // cost[1]
|
||||
str s16, [sp, #3*4] // cost[3]
|
||||
|
||||
mov w0, #0 // best_dir
|
||||
mov w1, v0.s[0] // best_cost
|
||||
mov w3, #1 // n
|
||||
|
||||
str s18, [sp, #5*4] // cost[5]
|
||||
str s20, [sp, #7*4] // cost[7]
|
||||
|
||||
mov w4, v6.s[0]
|
||||
|
||||
.macro find_best s1, s2, s3
|
||||
.ifnb \s2
|
||||
mov w5, \s2\().s[0]
|
||||
.endif
|
||||
cmp w4, w1 // cost[n] > best_cost
|
||||
csel w0, w3, w0, gt // best_dir = n
|
||||
csel w1, w4, w1, gt // best_cost = cost[n]
|
||||
.ifnb \s2
|
||||
add w3, w3, #1 // n++
|
||||
cmp w5, w1 // cost[n] > best_cost
|
||||
mov w4, \s3\().s[0]
|
||||
csel w0, w3, w0, gt // best_dir = n
|
||||
csel w1, w5, w1, gt // best_cost = cost[n]
|
||||
add w3, w3, #1 // n++
|
||||
.endif
|
||||
.endm
|
||||
find_best v6, v4, v16
|
||||
find_best v16, v2, v18
|
||||
find_best v18, v5, v20
|
||||
find_best v20
|
||||
|
||||
eor w3, w0, #4 // best_dir ^4
|
||||
ldr w4, [sp, w3, uxtw #2]
|
||||
sub w1, w1, w4 // best_cost - cost[best_dir ^ 4]
|
||||
lsr w1, w1, #10
|
||||
str w1, [x2] // *var
|
||||
|
||||
add sp, sp, #32
|
||||
ret
|
||||
endfunc
|
||||
filter_8 8
|
||||
filter_8 4
|
||||
|
|
|
@ -0,0 +1,228 @@
|
|||
/*
|
||||
* Copyright © 2018, VideoLAN and dav1d authors
|
||||
* Copyright © 2020, Martin Storsjo
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "src/arm/asm.S"
|
||||
#include "util.S"
|
||||
#include "cdef_tmpl.S"
|
||||
|
||||
.macro pad_top_bot_16 s1, s2, w, stride, reg, ret
|
||||
tst w6, #1 // CDEF_HAVE_LEFT
|
||||
b.eq 2f
|
||||
// CDEF_HAVE_LEFT
|
||||
sub \s1, \s1, #4
|
||||
sub \s2, \s2, #4
|
||||
tst w6, #2 // CDEF_HAVE_RIGHT
|
||||
b.eq 1f
|
||||
// CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
|
||||
ldr \reg\()0, [\s1]
|
||||
ldr d1, [\s1, #2*\w]
|
||||
ldr \reg\()2, [\s2]
|
||||
ldr d3, [\s2, #2*\w]
|
||||
str \reg\()0, [x0]
|
||||
str d1, [x0, #2*\w]
|
||||
add x0, x0, #2*\stride
|
||||
str \reg\()2, [x0]
|
||||
str d3, [x0, #2*\w]
|
||||
.if \ret
|
||||
ret
|
||||
.else
|
||||
add x0, x0, #2*\stride
|
||||
b 3f
|
||||
.endif
|
||||
|
||||
1:
|
||||
// CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
|
||||
ldr \reg\()0, [\s1]
|
||||
ldr s1, [\s1, #2*\w]
|
||||
ldr \reg\()2, [\s2]
|
||||
ldr s3, [\s2, #2*\w]
|
||||
str \reg\()0, [x0]
|
||||
str s1, [x0, #2*\w]
|
||||
str s31, [x0, #2*\w+4]
|
||||
add x0, x0, #2*\stride
|
||||
str \reg\()2, [x0]
|
||||
str s3, [x0, #2*\w]
|
||||
str s31, [x0, #2*\w+4]
|
||||
.if \ret
|
||||
ret
|
||||
.else
|
||||
add x0, x0, #2*\stride
|
||||
b 3f
|
||||
.endif
|
||||
|
||||
2:
|
||||
// !CDEF_HAVE_LEFT
|
||||
tst w6, #2 // CDEF_HAVE_RIGHT
|
||||
b.eq 1f
|
||||
// !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
|
||||
ldr \reg\()0, [\s1]
|
||||
ldr s1, [\s1, #2*\w]
|
||||
ldr \reg\()2, [\s2]
|
||||
ldr s3, [\s2, #2*\w]
|
||||
str s31, [x0]
|
||||
stur \reg\()0, [x0, #4]
|
||||
str s1, [x0, #4+2*\w]
|
||||
add x0, x0, #2*\stride
|
||||
str s31, [x0]
|
||||
stur \reg\()2, [x0, #4]
|
||||
str s3, [x0, #4+2*\w]
|
||||
.if \ret
|
||||
ret
|
||||
.else
|
||||
add x0, x0, #2*\stride
|
||||
b 3f
|
||||
.endif
|
||||
|
||||
1:
|
||||
// !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
|
||||
ldr \reg\()0, [\s1]
|
||||
ldr \reg\()1, [\s2]
|
||||
str s31, [x0]
|
||||
stur \reg\()0, [x0, #4]
|
||||
str s31, [x0, #4+2*\w]
|
||||
add x0, x0, #2*\stride
|
||||
str s31, [x0]
|
||||
stur \reg\()1, [x0, #4]
|
||||
str s31, [x0, #4+2*\w]
|
||||
.if \ret
|
||||
ret
|
||||
.else
|
||||
add x0, x0, #2*\stride
|
||||
.endif
|
||||
3:
|
||||
.endm
|
||||
|
||||
.macro load_n_incr_16 dst, src, incr, w
|
||||
.if \w == 4
|
||||
ld1 {\dst\().4h}, [\src], \incr
|
||||
.else
|
||||
ld1 {\dst\().8h}, [\src], \incr
|
||||
.endif
|
||||
.endm
|
||||
|
||||
// void dav1d_cdef_paddingX_16bpc_neon(uint16_t *tmp, const pixel *src,
|
||||
// ptrdiff_t src_stride, const pixel (*left)[2],
|
||||
// const pixel *const top, int h,
|
||||
// enum CdefEdgeFlags edges);
|
||||
|
||||
.macro padding_func_16 w, stride, reg
|
||||
function cdef_padding\w\()_16bpc_neon, export=1
|
||||
movi v30.8h, #0x80, lsl #8
|
||||
mov v31.16b, v30.16b
|
||||
sub x0, x0, #2*(2*\stride+2)
|
||||
tst w6, #4 // CDEF_HAVE_TOP
|
||||
b.ne 1f
|
||||
// !CDEF_HAVE_TOP
|
||||
st1 {v30.8h, v31.8h}, [x0], #32
|
||||
.if \w == 8
|
||||
st1 {v30.8h, v31.8h}, [x0], #32
|
||||
.endif
|
||||
b 3f
|
||||
1:
|
||||
// CDEF_HAVE_TOP
|
||||
add x9, x4, x2
|
||||
pad_top_bot_16 x4, x9, \w, \stride, \reg, 0
|
||||
|
||||
// Middle section
|
||||
3:
|
||||
tst w6, #1 // CDEF_HAVE_LEFT
|
||||
b.eq 2f
|
||||
// CDEF_HAVE_LEFT
|
||||
tst w6, #2 // CDEF_HAVE_RIGHT
|
||||
b.eq 1f
|
||||
// CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
|
||||
0:
|
||||
ld1 {v0.s}[0], [x3], #4
|
||||
ldr s2, [x1, #2*\w]
|
||||
load_n_incr_16 v1, x1, x2, \w
|
||||
subs w5, w5, #1
|
||||
str s0, [x0]
|
||||
stur \reg\()1, [x0, #4]
|
||||
str s2, [x0, #4+2*\w]
|
||||
add x0, x0, #2*\stride
|
||||
b.gt 0b
|
||||
b 3f
|
||||
1:
|
||||
// CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
|
||||
ld1 {v0.s}[0], [x3], #4
|
||||
load_n_incr_16 v1, x1, x2, \w
|
||||
subs w5, w5, #1
|
||||
str s0, [x0]
|
||||
stur \reg\()1, [x0, #4]
|
||||
str s31, [x0, #4+2*\w]
|
||||
add x0, x0, #2*\stride
|
||||
b.gt 1b
|
||||
b 3f
|
||||
2:
|
||||
tst w6, #2 // CDEF_HAVE_RIGHT
|
||||
b.eq 1f
|
||||
// !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
|
||||
0:
|
||||
ldr s1, [x1, #2*\w]
|
||||
load_n_incr_16 v0, x1, x2, \w
|
||||
subs w5, w5, #1
|
||||
str s31, [x0]
|
||||
stur \reg\()0, [x0, #4]
|
||||
str s1, [x0, #4+2*\w]
|
||||
add x0, x0, #2*\stride
|
||||
b.gt 0b
|
||||
b 3f
|
||||
1:
|
||||
// !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
|
||||
load_n_incr_16 v0, x1, x2, \w
|
||||
subs w5, w5, #1
|
||||
str s31, [x0]
|
||||
stur \reg\()0, [x0, #4]
|
||||
str s31, [x0, #4+2*\w]
|
||||
add x0, x0, #2*\stride
|
||||
b.gt 1b
|
||||
|
||||
3:
|
||||
tst w6, #8 // CDEF_HAVE_BOTTOM
|
||||
b.ne 1f
|
||||
// !CDEF_HAVE_BOTTOM
|
||||
st1 {v30.8h, v31.8h}, [x0], #32
|
||||
.if \w == 8
|
||||
st1 {v30.8h, v31.8h}, [x0], #32
|
||||
.endif
|
||||
ret
|
||||
1:
|
||||
// CDEF_HAVE_BOTTOM
|
||||
add x9, x1, x2
|
||||
pad_top_bot_16 x1, x9, \w, \stride, \reg, 1
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
padding_func_16 8, 16, q
|
||||
padding_func_16 4, 8, d
|
||||
|
||||
tables
|
||||
|
||||
filter 8, 16
|
||||
filter 4, 16
|
||||
|
||||
find_dir 16
|
|
@ -0,0 +1,482 @@
|
|||
/*
|
||||
* Copyright © 2018, VideoLAN and dav1d authors
|
||||
* Copyright © 2020, Martin Storsjo
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "src/arm/asm.S"
|
||||
#include "util.S"
|
||||
|
||||
.macro dir_table w, stride
|
||||
const directions\w
|
||||
.byte -1 * \stride + 1, -2 * \stride + 2
|
||||
.byte 0 * \stride + 1, -1 * \stride + 2
|
||||
.byte 0 * \stride + 1, 0 * \stride + 2
|
||||
.byte 0 * \stride + 1, 1 * \stride + 2
|
||||
.byte 1 * \stride + 1, 2 * \stride + 2
|
||||
.byte 1 * \stride + 0, 2 * \stride + 1
|
||||
.byte 1 * \stride + 0, 2 * \stride + 0
|
||||
.byte 1 * \stride + 0, 2 * \stride - 1
|
||||
// Repeated, to avoid & 7
|
||||
.byte -1 * \stride + 1, -2 * \stride + 2
|
||||
.byte 0 * \stride + 1, -1 * \stride + 2
|
||||
.byte 0 * \stride + 1, 0 * \stride + 2
|
||||
.byte 0 * \stride + 1, 1 * \stride + 2
|
||||
.byte 1 * \stride + 1, 2 * \stride + 2
|
||||
.byte 1 * \stride + 0, 2 * \stride + 1
|
||||
endconst
|
||||
.endm
|
||||
|
||||
.macro tables
|
||||
dir_table 8, 16
|
||||
dir_table 4, 8
|
||||
|
||||
const pri_taps
|
||||
.byte 4, 2, 3, 3
|
||||
endconst
|
||||
.endm
|
||||
|
||||
.macro load_px d1, d2, w
|
||||
.if \w == 8
|
||||
add x6, x2, w9, sxtb #1 // x + off
|
||||
sub x9, x2, w9, sxtb #1 // x - off
|
||||
ld1 {\d1\().8h}, [x6] // p0
|
||||
ld1 {\d2\().8h}, [x9] // p1
|
||||
.else
|
||||
add x6, x2, w9, sxtb #1 // x + off
|
||||
sub x9, x2, w9, sxtb #1 // x - off
|
||||
ld1 {\d1\().4h}, [x6] // p0
|
||||
add x6, x6, #2*8 // += stride
|
||||
ld1 {\d2\().4h}, [x9] // p1
|
||||
add x9, x9, #2*8 // += stride
|
||||
ld1 {\d1\().d}[1], [x6] // p0
|
||||
ld1 {\d2\().d}[1], [x9] // p1
|
||||
.endif
|
||||
.endm
|
||||
.macro handle_pixel s1, s2, thresh_vec, shift, tap, min
|
||||
.if \min
|
||||
umin v2.8h, v2.8h, \s1\().8h
|
||||
smax v3.8h, v3.8h, \s1\().8h
|
||||
umin v2.8h, v2.8h, \s2\().8h
|
||||
smax v3.8h, v3.8h, \s2\().8h
|
||||
.endif
|
||||
uabd v16.8h, v0.8h, \s1\().8h // abs(diff)
|
||||
uabd v20.8h, v0.8h, \s2\().8h // abs(diff)
|
||||
ushl v17.8h, v16.8h, \shift // abs(diff) >> shift
|
||||
ushl v21.8h, v20.8h, \shift // abs(diff) >> shift
|
||||
uqsub v17.8h, \thresh_vec, v17.8h // clip = imax(0, threshold - (abs(diff) >> shift))
|
||||
uqsub v21.8h, \thresh_vec, v21.8h // clip = imax(0, threshold - (abs(diff) >> shift))
|
||||
sub v18.8h, \s1\().8h, v0.8h // diff = p0 - px
|
||||
sub v22.8h, \s2\().8h, v0.8h // diff = p1 - px
|
||||
neg v16.8h, v17.8h // -clip
|
||||
neg v20.8h, v21.8h // -clip
|
||||
smin v18.8h, v18.8h, v17.8h // imin(diff, clip)
|
||||
smin v22.8h, v22.8h, v21.8h // imin(diff, clip)
|
||||
dup v19.8h, \tap // taps[k]
|
||||
smax v18.8h, v18.8h, v16.8h // constrain() = imax(imin(diff, clip), -clip)
|
||||
smax v22.8h, v22.8h, v20.8h // constrain() = imax(imin(diff, clip), -clip)
|
||||
mla v1.8h, v18.8h, v19.8h // sum += taps[k] * constrain()
|
||||
mla v1.8h, v22.8h, v19.8h // sum += taps[k] * constrain()
|
||||
.endm
|
||||
|
||||
// void dav1d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride,
|
||||
// const uint16_t *tmp, int pri_strength,
|
||||
// int sec_strength, int dir, int damping,
|
||||
// int h, size_t edges);
|
||||
.macro filter_func w, bpc, pri, sec, min, suffix
|
||||
function cdef_filter\w\suffix\()_\bpc\()bpc_neon
|
||||
.if \bpc == 8
|
||||
ldr w8, [sp] // bitdepth_max
|
||||
cmp w8, #0xf
|
||||
b.eq cdef_filter\w\suffix\()_edged_8bpc_neon
|
||||
.endif
|
||||
.if \pri
|
||||
.if \bpc == 16
|
||||
ldr w9, [sp, #8] // bitdepth_max
|
||||
clz w9, w9
|
||||
sub w9, w9, #24 // -bitdepth_min_8
|
||||
neg w9, w9 // bitdepth_min_8
|
||||
.endif
|
||||
movrel x8, pri_taps
|
||||
.if \bpc == 16
|
||||
lsr w9, w3, w9 // pri_strength >> bitdepth_min_8
|
||||
and w9, w9, #1 // (pri_strength >> bitdepth_min_8) & 1
|
||||
.else
|
||||
and w9, w3, #1
|
||||
.endif
|
||||
add x8, x8, w9, uxtw #1
|
||||
.endif
|
||||
movrel x9, directions\w
|
||||
add x5, x9, w5, uxtw #1
|
||||
movi v30.4h, #15
|
||||
dup v28.4h, w6 // damping
|
||||
|
||||
.if \pri
|
||||
dup v25.8h, w3 // threshold
|
||||
.endif
|
||||
.if \sec
|
||||
dup v27.8h, w4 // threshold
|
||||
.endif
|
||||
trn1 v24.4h, v25.4h, v27.4h
|
||||
clz v24.4h, v24.4h // clz(threshold)
|
||||
sub v24.4h, v30.4h, v24.4h // ulog2(threshold)
|
||||
uqsub v24.4h, v28.4h, v24.4h // shift = imax(0, damping - ulog2(threshold))
|
||||
neg v24.4h, v24.4h // -shift
|
||||
.if \sec
|
||||
dup v26.8h, v24.h[1]
|
||||
.endif
|
||||
.if \pri
|
||||
dup v24.8h, v24.h[0]
|
||||
.endif
|
||||
|
||||
1:
|
||||
.if \w == 8
|
||||
ld1 {v0.8h}, [x2] // px
|
||||
.else
|
||||
add x12, x2, #2*8
|
||||
ld1 {v0.4h}, [x2] // px
|
||||
ld1 {v0.d}[1], [x12] // px
|
||||
.endif
|
||||
|
||||
movi v1.8h, #0 // sum
|
||||
.if \min
|
||||
mov v2.16b, v0.16b // min
|
||||
mov v3.16b, v0.16b // max
|
||||
.endif
|
||||
|
||||
// Instead of loading sec_taps 2, 1 from memory, just set it
|
||||
// to 2 initially and decrease for the second round.
|
||||
// This is also used as loop counter.
|
||||
mov w11, #2 // sec_taps[0]
|
||||
|
||||
2:
|
||||
.if \pri
|
||||
ldrb w9, [x5] // off1
|
||||
|
||||
load_px v4, v5, \w
|
||||
.endif
|
||||
|
||||
.if \sec
|
||||
add x5, x5, #4 // +2*2
|
||||
ldrb w9, [x5] // off2
|
||||
load_px v6, v7, \w
|
||||
.endif
|
||||
|
||||
.if \pri
|
||||
ldrb w10, [x8] // *pri_taps
|
||||
|
||||
handle_pixel v4, v5, v25.8h, v24.8h, w10, \min
|
||||
.endif
|
||||
|
||||
.if \sec
|
||||
add x5, x5, #8 // +2*4
|
||||
ldrb w9, [x5] // off3
|
||||
load_px v4, v5, \w
|
||||
|
||||
handle_pixel v6, v7, v27.8h, v26.8h, w11, \min
|
||||
|
||||
handle_pixel v4, v5, v27.8h, v26.8h, w11, \min
|
||||
|
||||
sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1;
|
||||
.else
|
||||
add x5, x5, #1 // x5 += 1
|
||||
.endif
|
||||
subs w11, w11, #1 // sec_tap-- (value)
|
||||
.if \pri
|
||||
add x8, x8, #1 // pri_taps++ (pointer)
|
||||
.endif
|
||||
b.ne 2b
|
||||
|
||||
sshr v4.8h, v1.8h, #15 // -(sum < 0)
|
||||
add v1.8h, v1.8h, v4.8h // sum - (sum < 0)
|
||||
srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4
|
||||
add v0.8h, v0.8h, v1.8h // px + (8 + sum ...) >> 4
|
||||
.if \min
|
||||
smin v0.8h, v0.8h, v3.8h
|
||||
smax v0.8h, v0.8h, v2.8h // iclip(px + .., min, max)
|
||||
.endif
|
||||
.if \bpc == 8
|
||||
xtn v0.8b, v0.8h
|
||||
.endif
|
||||
.if \w == 8
|
||||
add x2, x2, #2*16 // tmp += tmp_stride
|
||||
subs w7, w7, #1 // h--
|
||||
.if \bpc == 8
|
||||
st1 {v0.8b}, [x0], x1
|
||||
.else
|
||||
st1 {v0.8h}, [x0], x1
|
||||
.endif
|
||||
.else
|
||||
.if \bpc == 8
|
||||
st1 {v0.s}[0], [x0], x1
|
||||
.else
|
||||
st1 {v0.d}[0], [x0], x1
|
||||
.endif
|
||||
add x2, x2, #2*16 // tmp += 2*tmp_stride
|
||||
subs w7, w7, #2 // h -= 2
|
||||
.if \bpc == 8
|
||||
st1 {v0.s}[1], [x0], x1
|
||||
.else
|
||||
st1 {v0.d}[1], [x0], x1
|
||||
.endif
|
||||
.endif
|
||||
|
||||
// Reset pri_taps and directions back to the original point
|
||||
sub x5, x5, #2
|
||||
.if \pri
|
||||
sub x8, x8, #2
|
||||
.endif
|
||||
|
||||
b.gt 1b
|
||||
ret
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
.macro filter w, bpc
|
||||
filter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri
|
||||
filter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec
|
||||
filter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec
|
||||
|
||||
function cdef_filter\w\()_\bpc\()bpc_neon, export=1
|
||||
cbnz w3, 1f // pri_strength
|
||||
b cdef_filter\w\()_sec_\bpc\()bpc_neon // only sec
|
||||
1:
|
||||
cbnz w4, 1f // sec_strength
|
||||
b cdef_filter\w\()_pri_\bpc\()bpc_neon // only pri
|
||||
1:
|
||||
b cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // both pri and sec
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
const div_table
|
||||
.short 840, 420, 280, 210, 168, 140, 120, 105
|
||||
endconst
|
||||
|
||||
const alt_fact
|
||||
.short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
|
||||
endconst
|
||||
|
||||
.macro cost_alt d1, d2, s1, s2, s3, s4
|
||||
smull v22.4s, \s1\().4h, \s1\().4h // sum_alt[n]*sum_alt[n]
|
||||
smull2 v23.4s, \s1\().8h, \s1\().8h
|
||||
smull v24.4s, \s2\().4h, \s2\().4h
|
||||
smull v25.4s, \s3\().4h, \s3\().4h // sum_alt[n]*sum_alt[n]
|
||||
smull2 v26.4s, \s3\().8h, \s3\().8h
|
||||
smull v27.4s, \s4\().4h, \s4\().4h
|
||||
mul v22.4s, v22.4s, v29.4s // sum_alt[n]^2*fact
|
||||
mla v22.4s, v23.4s, v30.4s
|
||||
mla v22.4s, v24.4s, v31.4s
|
||||
mul v25.4s, v25.4s, v29.4s // sum_alt[n]^2*fact
|
||||
mla v25.4s, v26.4s, v30.4s
|
||||
mla v25.4s, v27.4s, v31.4s
|
||||
addv \d1, v22.4s // *cost_ptr
|
||||
addv \d2, v25.4s // *cost_ptr
|
||||
.endm
|
||||
|
||||
.macro find_best s1, s2, s3
|
||||
.ifnb \s2
|
||||
mov w5, \s2\().s[0]
|
||||
.endif
|
||||
cmp w4, w1 // cost[n] > best_cost
|
||||
csel w0, w3, w0, gt // best_dir = n
|
||||
csel w1, w4, w1, gt // best_cost = cost[n]
|
||||
.ifnb \s2
|
||||
add w3, w3, #1 // n++
|
||||
cmp w5, w1 // cost[n] > best_cost
|
||||
mov w4, \s3\().s[0]
|
||||
csel w0, w3, w0, gt // best_dir = n
|
||||
csel w1, w5, w1, gt // best_cost = cost[n]
|
||||
add w3, w3, #1 // n++
|
||||
.endif
|
||||
.endm
|
||||
|
||||
// int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride,
|
||||
// unsigned *const var)
|
||||
.macro find_dir bpc
|
||||
function cdef_find_dir_\bpc\()bpc_neon, export=1
|
||||
.if \bpc == 16
|
||||
str d8, [sp, #-0x10]!
|
||||
clz w3, w3 // clz(bitdepth_max)
|
||||
sub w3, w3, #24 // -bitdepth_min_8
|
||||
dup v8.8h, w3
|
||||
.endif
|
||||
sub sp, sp, #32 // cost
|
||||
mov w3, #8
|
||||
.if \bpc == 8
|
||||
movi v31.16b, #128
|
||||
.else
|
||||
movi v31.8h, #128
|
||||
.endif
|
||||
movi v30.16b, #0
|
||||
movi v1.8h, #0 // v0-v1 sum_diag[0]
|
||||
movi v3.8h, #0 // v2-v3 sum_diag[1]
|
||||
movi v5.8h, #0 // v4-v5 sum_hv[0-1]
|
||||
movi v7.8h, #0 // v6-v7 sum_alt[0]
|
||||
movi v17.8h, #0 // v16-v17 sum_alt[1]
|
||||
movi v18.8h, #0 // v18-v19 sum_alt[2]
|
||||
movi v19.8h, #0
|
||||
movi v21.8h, #0 // v20-v21 sum_alt[3]
|
||||
|
||||
.irpc i, 01234567
|
||||
.if \bpc == 8
|
||||
ld1 {v26.8b}, [x0], x1
|
||||
usubl v26.8h, v26.8b, v31.8b
|
||||
.else
|
||||
ld1 {v26.8h}, [x0], x1
|
||||
ushl v26.8h, v26.8h, v8.8h
|
||||
sub v26.8h, v26.8h, v31.8h
|
||||
.endif
|
||||
|
||||
addv h25, v26.8h // [y]
|
||||
rev64 v27.8h, v26.8h
|
||||
addp v28.8h, v26.8h, v30.8h // [(x >> 1)]
|
||||
add v5.8h, v5.8h, v26.8h // sum_hv[1]
|
||||
ext v27.16b, v27.16b, v27.16b, #8 // [-x]
|
||||
rev64 v29.4h, v28.4h // [-(x >> 1)]
|
||||
ins v4.h[\i], v25.h[0] // sum_hv[0]
|
||||
|
||||
.if \i == 0
|
||||
mov v0.16b, v26.16b // sum_diag[0]
|
||||
mov v2.16b, v27.16b // sum_diag[1]
|
||||
mov v6.16b, v28.16b // sum_alt[0]
|
||||
mov v16.16b, v29.16b // sum_alt[1]
|
||||
.else
|
||||
ext v22.16b, v30.16b, v26.16b, #(16-2*\i)
|
||||
ext v23.16b, v26.16b, v30.16b, #(16-2*\i)
|
||||
ext v24.16b, v30.16b, v27.16b, #(16-2*\i)
|
||||
ext v25.16b, v27.16b, v30.16b, #(16-2*\i)
|
||||
add v0.8h, v0.8h, v22.8h // sum_diag[0]
|
||||
add v1.8h, v1.8h, v23.8h // sum_diag[0]
|
||||
add v2.8h, v2.8h, v24.8h // sum_diag[1]
|
||||
add v3.8h, v3.8h, v25.8h // sum_diag[1]
|
||||
ext v22.16b, v30.16b, v28.16b, #(16-2*\i)
|
||||
ext v23.16b, v28.16b, v30.16b, #(16-2*\i)
|
||||
ext v24.16b, v30.16b, v29.16b, #(16-2*\i)
|
||||
ext v25.16b, v29.16b, v30.16b, #(16-2*\i)
|
||||
add v6.8h, v6.8h, v22.8h // sum_alt[0]
|
||||
add v7.4h, v7.4h, v23.4h // sum_alt[0]
|
||||
add v16.8h, v16.8h, v24.8h // sum_alt[1]
|
||||
add v17.4h, v17.4h, v25.4h // sum_alt[1]
|
||||
.endif
|
||||
.if \i < 6
|
||||
ext v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2)))
|
||||
ext v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2)))
|
||||
add v18.8h, v18.8h, v22.8h // sum_alt[2]
|
||||
add v19.4h, v19.4h, v23.4h // sum_alt[2]
|
||||
.else
|
||||
add v18.8h, v18.8h, v26.8h // sum_alt[2]
|
||||
.endif
|
||||
.if \i == 0
|
||||
mov v20.16b, v26.16b // sum_alt[3]
|
||||
.elseif \i == 1
|
||||
add v20.8h, v20.8h, v26.8h // sum_alt[3]
|
||||
.else
|
||||
ext v24.16b, v30.16b, v26.16b, #(16-2*(\i/2))
|
||||
ext v25.16b, v26.16b, v30.16b, #(16-2*(\i/2))
|
||||
add v20.8h, v20.8h, v24.8h // sum_alt[3]
|
||||
add v21.4h, v21.4h, v25.4h // sum_alt[3]
|
||||
.endif
|
||||
.endr
|
||||
|
||||
movi v31.4s, #105
|
||||
|
||||
smull v26.4s, v4.4h, v4.4h // sum_hv[0]*sum_hv[0]
|
||||
smlal2 v26.4s, v4.8h, v4.8h
|
||||
smull v27.4s, v5.4h, v5.4h // sum_hv[1]*sum_hv[1]
|
||||
smlal2 v27.4s, v5.8h, v5.8h
|
||||
mul v26.4s, v26.4s, v31.4s // cost[2] *= 105
|
||||
mul v27.4s, v27.4s, v31.4s // cost[6] *= 105
|
||||
addv s4, v26.4s // cost[2]
|
||||
addv s5, v27.4s // cost[6]
|
||||
|
||||
rev64 v1.8h, v1.8h
|
||||
rev64 v3.8h, v3.8h
|
||||
ext v1.16b, v1.16b, v1.16b, #10 // sum_diag[0][14-n]
|
||||
ext v3.16b, v3.16b, v3.16b, #10 // sum_diag[1][14-n]
|
||||
|
||||
str s4, [sp, #2*4] // cost[2]
|
||||
str s5, [sp, #6*4] // cost[6]
|
||||
|
||||
movrel x4, div_table
|
||||
ld1 {v31.8h}, [x4]
|
||||
|
||||
smull v22.4s, v0.4h, v0.4h // sum_diag[0]*sum_diag[0]
|
||||
smull2 v23.4s, v0.8h, v0.8h
|
||||
smlal v22.4s, v1.4h, v1.4h
|
||||
smlal2 v23.4s, v1.8h, v1.8h
|
||||
smull v24.4s, v2.4h, v2.4h // sum_diag[1]*sum_diag[1]
|
||||
smull2 v25.4s, v2.8h, v2.8h
|
||||
smlal v24.4s, v3.4h, v3.4h
|
||||
smlal2 v25.4s, v3.8h, v3.8h
|
||||
uxtl v30.4s, v31.4h // div_table
|
||||
uxtl2 v31.4s, v31.8h
|
||||
mul v22.4s, v22.4s, v30.4s // cost[0]
|
||||
mla v22.4s, v23.4s, v31.4s // cost[0]
|
||||
mul v24.4s, v24.4s, v30.4s // cost[4]
|
||||
mla v24.4s, v25.4s, v31.4s // cost[4]
|
||||
addv s0, v22.4s // cost[0]
|
||||
addv s2, v24.4s // cost[4]
|
||||
|
||||
movrel x5, alt_fact
|
||||
ld1 {v29.4h, v30.4h, v31.4h}, [x5]// div_table[2*m+1] + 105
|
||||
|
||||
str s0, [sp, #0*4] // cost[0]
|
||||
str s2, [sp, #4*4] // cost[4]
|
||||
|
||||
uxtl v29.4s, v29.4h // div_table[2*m+1] + 105
|
||||
uxtl v30.4s, v30.4h
|
||||
uxtl v31.4s, v31.4h
|
||||
|
||||
cost_alt s6, s16, v6, v7, v16, v17 // cost[1], cost[3]
|
||||
cost_alt s18, s20, v18, v19, v20, v21 // cost[5], cost[7]
|
||||
str s6, [sp, #1*4] // cost[1]
|
||||
str s16, [sp, #3*4] // cost[3]
|
||||
|
||||
mov w0, #0 // best_dir
|
||||
mov w1, v0.s[0] // best_cost
|
||||
mov w3, #1 // n
|
||||
|
||||
str s18, [sp, #5*4] // cost[5]
|
||||
str s20, [sp, #7*4] // cost[7]
|
||||
|
||||
mov w4, v6.s[0]
|
||||
|
||||
find_best v6, v4, v16
|
||||
find_best v16, v2, v18
|
||||
find_best v18, v5, v20
|
||||
find_best v20
|
||||
|
||||
eor w3, w0, #4 // best_dir ^4
|
||||
ldr w4, [sp, w3, uxtw #2]
|
||||
sub w1, w1, w4 // best_cost - cost[best_dir ^ 4]
|
||||
lsr w1, w1, #10
|
||||
str w1, [x2] // *var
|
||||
|
||||
add sp, sp, #32
|
||||
.if \bpc == 16
|
||||
ldr d8, [sp], 0x10
|
||||
.endif
|
||||
ret
|
||||
endfunc
|
||||
.endm
|
|
@ -161,31 +161,6 @@ endconst
|
|||
.endif
|
||||
.endm
|
||||
|
||||
.macro scale_wide sz, c, r0, r1, r2 r3, r4, r5, r6, r7
|
||||
smull_sz v2, v3, \r0, \c, \sz
|
||||
smull_sz v4, v5, \r1, \c, \sz
|
||||
smull_sz v6, v7, \r2, \c, \sz
|
||||
rshrn_sz \r0, v2, v3, #12, \sz
|
||||
smull_sz v2, v3, \r3, \c, \sz
|
||||
rshrn_sz \r1, v4, v5, #12, \sz
|
||||
.ifnb \r4
|
||||
smull_sz v4, v5, \r4, \c, \sz
|
||||
.endif
|
||||
rshrn_sz \r2, v6, v7, #12, \sz
|
||||
.ifnb \r4
|
||||
smull_sz v6, v7, \r5, \c, \sz
|
||||
.endif
|
||||
rshrn_sz \r3, v2, v3, #12, \sz
|
||||
.ifnb \r4
|
||||
smull_sz v2, v3, \r6, \c, \sz
|
||||
rshrn_sz \r4, v4, v5, #12, \sz
|
||||
smull_sz v4, v5, \r7, \c, \sz
|
||||
rshrn_sz \r5, v6, v7, #12, \sz
|
||||
rshrn_sz \r6, v2, v3, #12, \sz
|
||||
rshrn_sz \r7, v4, v5, #12, \sz
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro load_add_store load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src, shiftbits=4
|
||||
.ifnb \load
|
||||
ld1 {\load}, [\src], x1
|
||||
|
@ -599,41 +574,40 @@ function inv_flipadst_8x4_neon
|
|||
endfunc
|
||||
|
||||
function inv_identity_4x4_neon
|
||||
mov w16, #5793
|
||||
mov w16, #(5793-4096)*8
|
||||
dup v0.4h, w16
|
||||
smull v4.4s, v16.4h, v0.h[0]
|
||||
smull v5.4s, v17.4h, v0.h[0]
|
||||
smull v6.4s, v18.4h, v0.h[0]
|
||||
smull v7.4s, v19.4h, v0.h[0]
|
||||
rshrn v16.4h, v4.4s, #12
|
||||
rshrn v17.4h, v5.4s, #12
|
||||
rshrn v18.4h, v6.4s, #12
|
||||
rshrn v19.4h, v7.4s, #12
|
||||
sqrdmulh v4.4h, v16.4h, v0.h[0]
|
||||
sqrdmulh v5.4h, v17.4h, v0.h[0]
|
||||
sqrdmulh v6.4h, v18.4h, v0.h[0]
|
||||
sqrdmulh v7.4h, v19.4h, v0.h[0]
|
||||
sqadd v16.4h, v16.4h, v4.4h
|
||||
sqadd v17.4h, v17.4h, v5.4h
|
||||
sqadd v18.4h, v18.4h, v6.4h
|
||||
sqadd v19.4h, v19.4h, v7.4h
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function inv_identity_8x4_neon
|
||||
mov w16, #5793
|
||||
mov w16, #(5793-4096)*8
|
||||
dup v0.4h, w16
|
||||
smull v2.4s, v16.4h, v0.h[0]
|
||||
smull2 v3.4s, v16.8h, v0.h[0]
|
||||
smull v4.4s, v17.4h, v0.h[0]
|
||||
smull2 v5.4s, v17.8h, v0.h[0]
|
||||
rshrn v16.4h, v2.4s, #12
|
||||
rshrn2 v16.8h, v3.4s, #12
|
||||
smull v6.4s, v18.4h, v0.h[0]
|
||||
smull2 v7.4s, v18.8h, v0.h[0]
|
||||
rshrn v17.4h, v4.4s, #12
|
||||
rshrn2 v17.8h, v5.4s, #12
|
||||
smull v2.4s, v19.4h, v0.h[0]
|
||||
smull2 v3.4s, v19.8h, v0.h[0]
|
||||
rshrn v18.4h, v6.4s, #12
|
||||
rshrn2 v18.8h, v7.4s, #12
|
||||
rshrn v19.4h, v2.4s, #12
|
||||
rshrn2 v19.8h, v3.4s, #12
|
||||
sqrdmulh v4.8h, v16.8h, v0.h[0]
|
||||
sqrdmulh v5.8h, v17.8h, v0.h[0]
|
||||
sqrdmulh v6.8h, v18.8h, v0.h[0]
|
||||
sqrdmulh v7.8h, v19.8h, v0.h[0]
|
||||
sqadd v16.8h, v16.8h, v4.8h
|
||||
sqadd v17.8h, v17.8h, v5.8h
|
||||
sqadd v18.8h, v18.8h, v6.8h
|
||||
sqadd v19.8h, v19.8h, v7.8h
|
||||
ret
|
||||
endfunc
|
||||
|
||||
.macro identity_8x4_shift1 r0, r1, r2, r3, c
|
||||
.irp i, \r0\().8h, \r1\().8h, \r2\().8h, \r3\().8h
|
||||
sqrdmulh v2.8h, \i, \c
|
||||
srhadd \i, \i, v2.8h
|
||||
.endr
|
||||
.endm
|
||||
|
||||
function inv_txfm_add_wht_wht_4x4_neon, export=1
|
||||
mov x15, x30
|
||||
movi v31.8h, #0
|
||||
|
@ -877,30 +851,31 @@ function inv_flipadst_4x8_neon
|
|||
endfunc
|
||||
|
||||
function inv_identity_8x8_neon
|
||||
shl v16.8h, v16.8h, #1
|
||||
shl v17.8h, v17.8h, #1
|
||||
shl v18.8h, v18.8h, #1
|
||||
shl v19.8h, v19.8h, #1
|
||||
shl v20.8h, v20.8h, #1
|
||||
shl v21.8h, v21.8h, #1
|
||||
shl v22.8h, v22.8h, #1
|
||||
shl v23.8h, v23.8h, #1
|
||||
sqshl v16.8h, v16.8h, #1
|
||||
sqshl v17.8h, v17.8h, #1
|
||||
sqshl v18.8h, v18.8h, #1
|
||||
sqshl v19.8h, v19.8h, #1
|
||||
sqshl v20.8h, v20.8h, #1
|
||||
sqshl v21.8h, v21.8h, #1
|
||||
sqshl v22.8h, v22.8h, #1
|
||||
sqshl v23.8h, v23.8h, #1
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function inv_identity_4x8_neon
|
||||
shl v16.4h, v16.4h, #1
|
||||
shl v17.4h, v17.4h, #1
|
||||
shl v18.4h, v18.4h, #1
|
||||
shl v19.4h, v19.4h, #1
|
||||
shl v20.4h, v20.4h, #1
|
||||
shl v21.4h, v21.4h, #1
|
||||
shl v22.4h, v22.4h, #1
|
||||
shl v23.4h, v23.4h, #1
|
||||
sqshl v16.4h, v16.4h, #1
|
||||
sqshl v17.4h, v17.4h, #1
|
||||
sqshl v18.4h, v18.4h, #1
|
||||
sqshl v19.4h, v19.4h, #1
|
||||
sqshl v20.4h, v20.4h, #1
|
||||
sqshl v21.4h, v21.4h, #1
|
||||
sqshl v22.4h, v22.4h, #1
|
||||
sqshl v23.4h, v23.4h, #1
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function inv_txfm_add_8x8_neon
|
||||
.macro def_fn_8x8_base variant
|
||||
function inv_txfm_\variant\()add_8x8_neon
|
||||
movi v28.8h, #0
|
||||
movi v29.8h, #0
|
||||
movi v30.8h, #0
|
||||
|
@ -910,6 +885,9 @@ function inv_txfm_add_8x8_neon
|
|||
ld1 {v20.8h,v21.8h,v22.8h,v23.8h}, [x2]
|
||||
st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2]
|
||||
|
||||
.ifc \variant, identity_
|
||||
// The identity shl #1 and downshift srshr #1 cancel out
|
||||
.else
|
||||
blr x4
|
||||
|
||||
srshr v16.8h, v16.8h, #1
|
||||
|
@ -920,6 +898,7 @@ function inv_txfm_add_8x8_neon
|
|||
srshr v21.8h, v21.8h, #1
|
||||
srshr v22.8h, v22.8h, #1
|
||||
srshr v23.8h, v23.8h, #1
|
||||
.endif
|
||||
|
||||
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
|
||||
|
||||
|
@ -928,6 +907,10 @@ function inv_txfm_add_8x8_neon
|
|||
load_add_store_8x8 x0, x7
|
||||
br x15
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
def_fn_8x8_base
|
||||
def_fn_8x8_base identity_
|
||||
|
||||
.macro def_fn_8x8 txfm1, txfm2
|
||||
function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_neon, export=1
|
||||
|
@ -936,9 +919,13 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_neon, export=1
|
|||
.ifc \txfm1\()_\txfm2, dct_dct
|
||||
idct_dc 8, 8, 1
|
||||
.endif
|
||||
adr x4, inv_\txfm1\()_8x8_neon
|
||||
adr x5, inv_\txfm2\()_8x8_neon
|
||||
.ifc \txfm1, identity
|
||||
b inv_txfm_identity_add_8x8_neon
|
||||
.else
|
||||
adr x4, inv_\txfm1\()_8x8_neon
|
||||
b inv_txfm_add_8x8_neon
|
||||
.endif
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
|
@ -1083,9 +1070,12 @@ def_fns_48 8, 4
|
|||
rshrn_sz v27, v6, v7, #12, \sz // t14a
|
||||
|
||||
smull_smlsl v4, v5, v29, v23, v0.h[2], v0.h[3], \sz // -> t13a
|
||||
neg v29\sz, v29\sz
|
||||
smull_smlsl v6, v7, v29, v23, v0.h[3], v0.h[2], \sz // -> t10a
|
||||
smull_smlal v6, v7, v29, v23, v0.h[3], v0.h[2], \sz // -> t10a
|
||||
rshrn_sz v29, v4, v5, #12, \sz // t13a
|
||||
neg v6.4s, v6.4s
|
||||
.ifc \sz, .8h
|
||||
neg v7.4s, v7.4s
|
||||
.endif
|
||||
rshrn_sz v23, v6, v7, #12, \sz // t10a
|
||||
|
||||
sqsub v2\sz, v17\sz, v19\sz // t11a
|
||||
|
@ -1333,27 +1323,59 @@ function inv_flipadst_4x16_neon
|
|||
endfunc
|
||||
|
||||
function inv_identity_8x16_neon
|
||||
mov w16, #2*5793
|
||||
mov w16, #2*(5793-4096)*8
|
||||
dup v0.4h, w16
|
||||
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
||||
smull v2.4s, v\i\().4h, v0.h[0]
|
||||
smull2 v3.4s, v\i\().8h, v0.h[0]
|
||||
rshrn v\i\().4h, v2.4s, #12
|
||||
rshrn2 v\i\().8h, v3.4s, #12
|
||||
sqrdmulh v2.8h, v\i\().8h, v0.h[0]
|
||||
sqadd v\i\().8h, v\i\().8h, v\i\().8h
|
||||
sqadd v\i\().8h, v\i\().8h, v2.8h
|
||||
.endr
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function inv_identity_4x16_neon
|
||||
mov w16, #2*5793
|
||||
mov w16, #2*(5793-4096)*8
|
||||
dup v0.4h, w16
|
||||
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
||||
smull v2.4s, v\i\().4h, v0.h[0]
|
||||
rshrn v\i\().4h, v2.4s, #12
|
||||
sqrdmulh v2.4h, v\i\().4h, v0.h[0]
|
||||
sqadd v\i\().4h, v\i\().4h, v\i\().4h
|
||||
sqadd v\i\().4h, v\i\().4h, v2.4h
|
||||
.endr
|
||||
ret
|
||||
endfunc
|
||||
|
||||
.macro identity_8x16_shift2 c
|
||||
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
|
||||
sqrdmulh v2.8h, \i, \c
|
||||
sshr v2.8h, v2.8h, #1
|
||||
srhadd \i, \i, v2.8h
|
||||
.endr
|
||||
.endm
|
||||
|
||||
.macro identity_8x16_shift1 c
|
||||
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
|
||||
sqrdmulh v2.8h, \i, \c
|
||||
srshr v2.8h, v2.8h, #1
|
||||
sqadd \i, \i, v2.8h
|
||||
.endr
|
||||
.endm
|
||||
|
||||
.macro identity_8x8_shift1 c
|
||||
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
|
||||
sqrdmulh v2.8h, \i, \c
|
||||
srshr v2.8h, v2.8h, #1
|
||||
sqadd \i, \i, v2.8h
|
||||
.endr
|
||||
.endm
|
||||
|
||||
.macro identity_8x8 c
|
||||
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
|
||||
sqrdmulh v2.8h, \i, \c
|
||||
sqadd \i, \i, \i
|
||||
sqadd \i, \i, v2.8h
|
||||
.endr
|
||||
.endm
|
||||
|
||||
function inv_txfm_horz_16x8_neon
|
||||
mov x14, x30
|
||||
movi v7.8h, #0
|
||||
|
@ -1375,6 +1397,26 @@ function inv_txfm_horz_16x8_neon
|
|||
br x14
|
||||
endfunc
|
||||
|
||||
function inv_txfm_horz_identity_16x8_neon
|
||||
mov x14, x30
|
||||
movi v7.8h, #0
|
||||
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
||||
ld1 {v\i\().8h}, [x7]
|
||||
st1 {v7.8h}, [x7], x8
|
||||
.endr
|
||||
mov w16, #2*(5793-4096)*8
|
||||
dup v0.4h, w16
|
||||
identity_8x16_shift2 v0.h[0]
|
||||
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
|
||||
transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5
|
||||
|
||||
.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
|
||||
st1 {v\i\().8h}, [x6], #16
|
||||
.endr
|
||||
|
||||
br x14
|
||||
endfunc
|
||||
|
||||
function inv_txfm_horz_scale_16x8_neon
|
||||
mov x14, x30
|
||||
movi v7.8h, #0
|
||||
|
@ -1421,7 +1463,7 @@ function inv_txfm_add_16x16_neon
|
|||
.endif
|
||||
add x7, x2, #(\i*2)
|
||||
mov x8, #16*2
|
||||
bl inv_txfm_horz_16x8_neon
|
||||
blr x9
|
||||
.endr
|
||||
b 2f
|
||||
1:
|
||||
|
@ -1449,7 +1491,12 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_neon, export=1
|
|||
.ifc \txfm1\()_\txfm2, dct_dct
|
||||
idct_dc 16, 16, 2
|
||||
.endif
|
||||
.ifc \txfm1, identity
|
||||
adr x9, inv_txfm_horz_identity_16x8_neon
|
||||
.else
|
||||
adr x9, inv_txfm_horz_16x8_neon
|
||||
adr x4, inv_\txfm1\()_8x16_neon
|
||||
.endif
|
||||
adr x5, inv_\txfm2\()_8x16_neon
|
||||
mov x13, #\eob_half
|
||||
b inv_txfm_add_16x16_neon
|
||||
|
@ -1469,12 +1516,35 @@ def_fn_16x16 flipadst, adst, 36
|
|||
def_fn_16x16 flipadst, flipadst, 36
|
||||
def_fn_16x16 identity, dct, 8
|
||||
|
||||
function inv_txfm_add_16x4_neon
|
||||
.macro def_fn_416_base variant
|
||||
function inv_txfm_\variant\()add_16x4_neon
|
||||
mov x15, x30
|
||||
movi v4.8h, #0
|
||||
|
||||
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
||||
ld1 {v\i\().4h}, [x2]
|
||||
.ifc \variant, identity_
|
||||
.irp i, v16.4h, v17.4h, v18.4h, v19.4h
|
||||
ld1 {\i}, [x2]
|
||||
st1 {v4.4h}, [x2], #8
|
||||
.endr
|
||||
.irp i, v16.d, v17.d, v18.d, v19.d
|
||||
ld1 {\i}[1], [x2]
|
||||
st1 {v4.4h}, [x2], #8
|
||||
.endr
|
||||
mov w16, #2*(5793-4096)*8
|
||||
dup v0.4h, w16
|
||||
.irp i, v20.4h, v21.4h, v22.4h, v23.4h
|
||||
ld1 {\i}, [x2]
|
||||
st1 {v4.4h}, [x2], #8
|
||||
.endr
|
||||
.irp i, v20.d, v21.d, v22.d, v23.d
|
||||
ld1 {\i}[1], [x2]
|
||||
st1 {v4.4h}, [x2], #8
|
||||
.endr
|
||||
|
||||
identity_8x16_shift1 v0.h[0]
|
||||
.else
|
||||
.irp i, v16.4h, v17.4h, v18.4h, v19.4h, v20.4h, v21.4h, v22.4h, v23.4h, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h
|
||||
ld1 {\i}, [x2]
|
||||
st1 {v4.4h}, [x2], #8
|
||||
.endr
|
||||
|
||||
|
@ -1484,14 +1554,21 @@ function inv_txfm_add_16x4_neon
|
|||
ins v17.d[1], v21.d[0]
|
||||
ins v18.d[1], v22.d[0]
|
||||
ins v19.d[1], v23.d[0]
|
||||
.irp i, 16, 17, 18, 19
|
||||
srshr v\i\().8h, v\i\().8h, #1
|
||||
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
|
||||
srshr \i, \i, #1
|
||||
.endr
|
||||
.endif
|
||||
transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
|
||||
blr x5
|
||||
mov x6, x0
|
||||
load_add_store_8x4 x6, x7
|
||||
|
||||
.ifc \variant, identity_
|
||||
mov v16.16b, v20.16b
|
||||
mov v17.16b, v21.16b
|
||||
mov v18.16b, v22.16b
|
||||
mov v19.16b, v23.16b
|
||||
.else
|
||||
ins v24.d[1], v28.d[0]
|
||||
ins v25.d[1], v29.d[0]
|
||||
ins v26.d[1], v30.d[0]
|
||||
|
@ -1500,6 +1577,7 @@ function inv_txfm_add_16x4_neon
|
|||
srshr v17.8h, v25.8h, #1
|
||||
srshr v18.8h, v26.8h, #1
|
||||
srshr v19.8h, v27.8h, #1
|
||||
.endif
|
||||
transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
|
||||
blr x5
|
||||
add x6, x0, #8
|
||||
|
@ -1508,7 +1586,7 @@ function inv_txfm_add_16x4_neon
|
|||
br x15
|
||||
endfunc
|
||||
|
||||
function inv_txfm_add_4x16_neon
|
||||
function inv_txfm_\variant\()add_4x16_neon
|
||||
mov x15, x30
|
||||
movi v2.8h, #0
|
||||
|
||||
|
@ -1517,8 +1595,17 @@ function inv_txfm_add_4x16_neon
|
|||
b.lt 1f
|
||||
|
||||
add x6, x2, #16
|
||||
.irp i, 16, 17, 18, 19
|
||||
ld1 {v\i\().8h}, [x6]
|
||||
.ifc \variant, identity_
|
||||
.irp i, v24.8h, v25.8h, v26.8h, v27.8h
|
||||
ld1 {\i}, [x6]
|
||||
st1 {v2.8h}, [x6], x11
|
||||
.endr
|
||||
mov w16, #(5793-4096)*8
|
||||
dup v0.4h, w16
|
||||
identity_8x4_shift1 v24, v25, v26, v27, v0.h[0]
|
||||
.else
|
||||
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
|
||||
ld1 {\i}, [x6]
|
||||
st1 {v2.8h}, [x6], x11
|
||||
.endr
|
||||
blr x4
|
||||
|
@ -1526,6 +1613,7 @@ function inv_txfm_add_4x16_neon
|
|||
srshr v25.8h, v17.8h, #1
|
||||
srshr v26.8h, v18.8h, #1
|
||||
srshr v27.8h, v19.8h, #1
|
||||
.endif
|
||||
transpose_4x8h v24, v25, v26, v27, v4, v5, v6, v7
|
||||
ins v28.d[0], v24.d[1]
|
||||
ins v29.d[0], v25.d[1]
|
||||
|
@ -1534,19 +1622,25 @@ function inv_txfm_add_4x16_neon
|
|||
|
||||
b 2f
|
||||
1:
|
||||
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
|
||||
movi v\i\().4h, #0
|
||||
.irp i, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h
|
||||
movi \i, #0
|
||||
.endr
|
||||
2:
|
||||
movi v2.8h, #0
|
||||
.irp i, 16, 17, 18, 19
|
||||
ld1 {v\i\().8h}, [x2]
|
||||
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
|
||||
ld1 {\i}, [x2]
|
||||
st1 {v2.8h}, [x2], x11
|
||||
.endr
|
||||
.ifc \variant, identity_
|
||||
mov w16, #(5793-4096)*8
|
||||
dup v0.4h, w16
|
||||
identity_8x4_shift1 v16, v17, v18, v19, v0.h[0]
|
||||
.else
|
||||
blr x4
|
||||
.irp i, 16, 17, 18, 19
|
||||
srshr v\i\().8h, v\i\().8h, #1
|
||||
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
|
||||
srshr \i, \i, #1
|
||||
.endr
|
||||
.endif
|
||||
transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
|
||||
ins v20.d[0], v16.d[1]
|
||||
ins v21.d[0], v17.d[1]
|
||||
|
@ -1559,6 +1653,10 @@ function inv_txfm_add_4x16_neon
|
|||
|
||||
br x15
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
def_fn_416_base
|
||||
def_fn_416_base identity_
|
||||
|
||||
.macro def_fn_416 w, h, txfm1, txfm2, eob_half
|
||||
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1
|
||||
|
@ -1573,7 +1671,11 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1
|
|||
adr x4, inv_\txfm1\()_4x\w\()_neon
|
||||
adr x5, inv_\txfm2\()_8x\h\()_neon
|
||||
.endif
|
||||
.ifc \txfm1, identity
|
||||
b inv_txfm_identity_add_\w\()x\h\()_neon
|
||||
.else
|
||||
b inv_txfm_add_\w\()x\h\()_neon
|
||||
.endif
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
|
@ -1600,24 +1702,31 @@ def_fns_416 4, 16
|
|||
def_fns_416 16, 4
|
||||
|
||||
|
||||
function inv_txfm_add_16x8_neon
|
||||
.macro def_fn_816_base variant
|
||||
function inv_txfm_\variant\()add_16x8_neon
|
||||
mov x15, x30
|
||||
movi v4.8h, #0
|
||||
mov w16, #2896*8
|
||||
dup v0.4h, w16
|
||||
|
||||
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
||||
ld1 {v\i\().8h}, [x2]
|
||||
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
|
||||
ld1 {\i}, [x2]
|
||||
st1 {v4.8h}, [x2], #16
|
||||
.endr
|
||||
|
||||
scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
|
||||
scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
|
||||
.ifc \variant, identity_
|
||||
mov w16, #2*(5793-4096)*8
|
||||
dup v0.4h, w16
|
||||
identity_8x16_shift1 v0.h[0]
|
||||
.else
|
||||
blr x4
|
||||
|
||||
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
|
||||
srshr v\i\().8h, v\i\().8h, #1
|
||||
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
|
||||
srshr \i, \i, #1
|
||||
.endr
|
||||
.endif
|
||||
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
|
||||
|
||||
blr x5
|
||||
|
@ -1625,6 +1734,16 @@ function inv_txfm_add_16x8_neon
|
|||
mov x6, x0
|
||||
load_add_store_8x8 x6, x7
|
||||
|
||||
.ifc \variant, identity_
|
||||
mov v16.16b, v24.16b
|
||||
mov v17.16b, v25.16b
|
||||
mov v18.16b, v26.16b
|
||||
mov v19.16b, v27.16b
|
||||
mov v20.16b, v28.16b
|
||||
mov v21.16b, v29.16b
|
||||
mov v22.16b, v30.16b
|
||||
mov v23.16b, v31.16b
|
||||
.else
|
||||
srshr v16.8h, v24.8h, #1
|
||||
srshr v17.8h, v25.8h, #1
|
||||
srshr v18.8h, v26.8h, #1
|
||||
|
@ -1633,6 +1752,7 @@ function inv_txfm_add_16x8_neon
|
|||
srshr v21.8h, v29.8h, #1
|
||||
srshr v22.8h, v30.8h, #1
|
||||
srshr v23.8h, v31.8h, #1
|
||||
.endif
|
||||
|
||||
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
|
||||
|
||||
|
@ -1644,7 +1764,7 @@ function inv_txfm_add_16x8_neon
|
|||
br x15
|
||||
endfunc
|
||||
|
||||
function inv_txfm_add_8x16_neon
|
||||
function inv_txfm_\variant\()add_8x16_neon
|
||||
mov x15, x30
|
||||
movi v4.8h, #0
|
||||
mov w16, #2896*8
|
||||
|
@ -1655,8 +1775,16 @@ function inv_txfm_add_8x16_neon
|
|||
b.lt 1f
|
||||
|
||||
add x6, x2, #16
|
||||
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
|
||||
ld1 {v\i\().8h}, [x6]
|
||||
.ifc \variant, identity_
|
||||
.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
|
||||
ld1 {\i}, [x6]
|
||||
st1 {v4.8h}, [x6], x11
|
||||
.endr
|
||||
scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
|
||||
// The identity shl #1 and downshift srshr #1 cancel out
|
||||
.else
|
||||
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
|
||||
ld1 {\i}, [x6]
|
||||
st1 {v4.8h}, [x6], x11
|
||||
.endr
|
||||
scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
|
||||
|
@ -1670,13 +1798,14 @@ function inv_txfm_add_8x16_neon
|
|||
srshr v29.8h, v21.8h, #1
|
||||
srshr v30.8h, v22.8h, #1
|
||||
srshr v31.8h, v23.8h, #1
|
||||
.endif
|
||||
transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
|
||||
|
||||
b 2f
|
||||
|
||||
1:
|
||||
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
|
||||
movi v\i\().8h, #0
|
||||
.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
|
||||
movi \i, #0
|
||||
.endr
|
||||
|
||||
2:
|
||||
|
@ -1684,16 +1813,20 @@ function inv_txfm_add_8x16_neon
|
|||
mov w16, #2896*8
|
||||
dup v0.4h, w16
|
||||
|
||||
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
|
||||
ld1 {v\i\().8h}, [x2]
|
||||
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
|
||||
ld1 {\i}, [x2]
|
||||
st1 {v4.8h}, [x2], x11
|
||||
.endr
|
||||
scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
|
||||
.ifc \variant, identity_
|
||||
// The identity shl #1 and downshift srshr #1 cancel out
|
||||
.else
|
||||
blr x4
|
||||
|
||||
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
|
||||
srshr v\i\().8h, v\i\().8h, #1
|
||||
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
|
||||
srshr \i, \i, #1
|
||||
.endr
|
||||
.endif
|
||||
|
||||
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
|
||||
|
||||
|
@ -1703,6 +1836,10 @@ function inv_txfm_add_8x16_neon
|
|||
|
||||
br x15
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
def_fn_816_base
|
||||
def_fn_816_base identity_
|
||||
|
||||
.macro def_fn_816 w, h, txfm1, txfm2, eob_half
|
||||
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1
|
||||
|
@ -1714,7 +1851,11 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1
|
|||
.if \w == 8
|
||||
mov x13, #\eob_half
|
||||
.endif
|
||||
.ifc \txfm1, identity
|
||||
b inv_txfm_identity_add_\w\()x\h\()_neon
|
||||
.else
|
||||
b inv_txfm_add_\w\()x\h\()_neon
|
||||
.endif
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
|
@ -2120,7 +2261,7 @@ endfunc
|
|||
.macro def_identity_1632 w, h, wshort, hshort
|
||||
function inv_txfm_add_identity_identity_\w\()x\h\()_neon, export=1
|
||||
mov w16, #2896*8
|
||||
mov w17, #2*5793
|
||||
mov w17, #2*(5793-4096)*8
|
||||
dup v1.4h, w16
|
||||
movi v0.8h, #0
|
||||
mov v1.h[1], w17
|
||||
|
@ -2140,12 +2281,11 @@ function inv_txfm_add_identity_identity_\w\()x\h\()_neon, export=1
|
|||
|
||||
.if \w == 16
|
||||
// 16x32
|
||||
scale_wide .8h, v1.h[1], v16, v17, v18, v19, v20, v21, v22, v23
|
||||
shift_8_regs srshr, 1
|
||||
identity_8x8_shift1 v1.h[1]
|
||||
.else
|
||||
// 32x16
|
||||
shift_8_regs shl, 1
|
||||
scale_wide .8h, v1.h[1], v16, v17, v18, v19, v20, v21, v22, v23
|
||||
shift_8_regs sqshl, 1
|
||||
identity_8x8 v1.h[1]
|
||||
.endif
|
||||
|
||||
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
|
||||
|
|
|
@ -151,8 +151,8 @@ function lpf_16_wd\wd\()_neon
|
|||
movi v7.16b, #3
|
||||
sqxtn v2.8b, v2.8h // f
|
||||
sqxtn2 v2.16b, v3.8h
|
||||
sqadd v4.16b, v6.16b, v2.16b // imin(f + 4, 128)
|
||||
sqadd v5.16b, v7.16b, v2.16b // imin(f + 3, 128)
|
||||
sqadd v4.16b, v6.16b, v2.16b // imin(f + 4, 127)
|
||||
sqadd v5.16b, v7.16b, v2.16b // imin(f + 3, 127)
|
||||
sshr v4.16b, v4.16b, #3 // f1
|
||||
sshr v5.16b, v5.16b, #3 // f2
|
||||
uxtl v2.8h, v23.8b // p0
|
||||
|
@ -981,13 +981,13 @@ function lpf_h_16_16_neon
br x15
endfunc

// void dav1d_lpf_v_sb_y_neon(pixel *dst, const ptrdiff_t stride,
// const uint32_t *const vmask,
// const uint8_t (*l)[4], ptrdiff_t b4_stride,
// const Av1FilterLUT *lut, const int w)
// void dav1d_lpf_v_sb_y_8bpc_neon(pixel *dst, const ptrdiff_t stride,
// const uint32_t *const vmask,
// const uint8_t (*l)[4], ptrdiff_t b4_stride,
// const Av1FilterLUT *lut, const int w)

.macro lpf_func dir, type
function lpf_\dir\()_sb_\type\()_neon, export=1
function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1
mov x11, x30
stp d8, d9, [sp, #-0x40]!
stp d10, d11, [sp, #0x10]

@ -0,0 +1,907 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2020, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "src/arm/asm.S"
#include "util.S"

.macro loop_filter wd
function lpf_8_wd\wd\()_neon
uabd v0.8h, v22.8h, v23.8h // abs(p1 - p0)
uabd v1.8h, v25.8h, v24.8h // abs(q1 - q0)
uabd v2.8h, v23.8h, v24.8h // abs(p0 - q0)
uabd v3.8h, v22.8h, v25.8h // abs(p1 - q1)
.if \wd >= 6
uabd v4.8h, v21.8h, v22.8h // abs(p2 - p1)
uabd v5.8h, v26.8h, v25.8h // abs(q2 - q1)
.endif
.if \wd >= 8
uabd v6.8h, v20.8h, v21.8h // abs(p3 - p2)
uabd v7.8h, v27.8h, v26.8h // abs(q3 - q2)
.endif
.if \wd >= 6
umax v4.8h, v4.8h, v5.8h
.endif
uqadd v2.8h, v2.8h, v2.8h // abs(p0 - q0) * 2
.if \wd >= 8
umax v6.8h, v6.8h, v7.8h
.endif
ushr v3.8h, v3.8h, #1
.if \wd >= 8
umax v4.8h, v4.8h, v6.8h
.endif
.if \wd >= 6
and v4.16b, v4.16b, v14.16b
.endif
umax v0.8h, v0.8h, v1.8h // max(abs(p1 - p0), abs(q1 - q0))
uqadd v2.8h, v2.8h, v3.8h // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
.if \wd >= 6
umax v4.8h, v0.8h, v4.8h
cmhs v1.8h, v11.8h, v4.8h // max(abs(p1 - p0), abs(q1 - q0), abs(p2 - p1), abs(q2 - q1), ...) <= I
.else
cmhs v1.8h, v11.8h, v0.8h // max(abs(p1 - p0), abs(q1 - q0)) <= I
.endif
cmhs v2.8h, v10.8h, v2.8h // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E
and v1.16b, v1.16b, v2.16b // fm
and v1.16b, v1.16b, v13.16b // fm && wd >= 4
.if \wd >= 6
and v14.16b, v14.16b, v1.16b // fm && wd > 4
.endif
.if \wd >= 16
and v15.16b, v15.16b, v1.16b // fm && wd == 16
.endif

mov x16, v1.d[0]
mov x17, v1.d[1]
adds x16, x16, x17
b.eq 9f // if (!fm || wd < 4) return;

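// For reference, a scalar sketch of the filter-enable decision computed so
// far, assembled from the inline comments above (not code from this patch):
//   fm = abs(p1 - p0) <= I && abs(q1 - q0) <= I &&
//        2 * abs(p0 - q0) + (abs(p1 - q1) >> 1) <= E;
//   if (wd >= 6) fm &= abs(p2 - p1) <= I && abs(q2 - q1) <= I;
//   if (wd >= 8) fm &= abs(p3 - p2) <= I && abs(q3 - q2) <= I;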
.if \wd >= 6
|
||||
movi v10.8h, #1
|
||||
uabd v2.8h, v21.8h, v23.8h // abs(p2 - p0)
|
||||
uabd v3.8h, v22.8h, v23.8h // abs(p1 - p0)
|
||||
uabd v4.8h, v25.8h, v24.8h // abs(q1 - q0)
|
||||
uabd v5.8h, v26.8h, v24.8h // abs(q2 - q0)
|
||||
dup v9.8h, w9 // bitdepth_min_8
|
||||
.if \wd >= 8
|
||||
uabd v6.8h, v20.8h, v23.8h // abs(p3 - p0)
|
||||
uabd v7.8h, v27.8h, v24.8h // abs(q3 - q0)
|
||||
.endif
|
||||
umax v2.8h, v2.8h, v3.8h
|
||||
umax v4.8h, v4.8h, v5.8h
|
||||
.if \wd >= 8
|
||||
umax v6.8h, v6.8h, v7.8h
|
||||
.endif
|
||||
umax v2.8h, v2.8h, v4.8h
|
||||
ushl v10.8h, v10.8h, v9.8h // F = 1 << bitdepth_min_8
|
||||
.if \wd >= 8
|
||||
umax v2.8h, v2.8h, v6.8h
|
||||
.endif
|
||||
|
||||
.if \wd == 16
|
||||
uabd v3.8h, v17.8h, v23.8h // abs(p6 - p0)
|
||||
uabd v4.8h, v18.8h, v23.8h // abs(p5 - p0)
|
||||
uabd v5.8h, v19.8h, v23.8h // abs(p4 - p0)
|
||||
.endif
|
||||
cmhs v2.8h, v10.8h, v2.8h // flat8in
|
||||
.if \wd == 16
|
||||
uabd v6.8h, v28.8h, v24.8h // abs(q4 - q0)
|
||||
uabd v7.8h, v29.8h, v24.8h // abs(q5 - q0)
|
||||
uabd v8.8h, v30.8h, v24.8h // abs(q6 - q0)
|
||||
.endif
|
||||
and v14.16b, v2.16b, v14.16b // flat8in && fm && wd > 4
|
||||
bic v1.16b, v1.16b, v14.16b // fm && wd >= 4 && !flat8in
|
||||
.if \wd == 16
|
||||
umax v3.8h, v3.8h, v4.8h
|
||||
umax v5.8h, v5.8h, v6.8h
|
||||
.endif
|
||||
mov x16, v1.d[0]
|
||||
mov x17, v1.d[1]
|
||||
.if \wd == 16
|
||||
umax v7.8h, v7.8h, v8.8h
|
||||
umax v3.8h, v3.8h, v5.8h
|
||||
umax v3.8h, v3.8h, v7.8h
|
||||
cmhs v3.8h, v10.8h, v3.8h // flat8out
|
||||
.endif
|
||||
adds x16, x16, x17
|
||||
.if \wd == 16
|
||||
and v15.16b, v15.16b, v3.16b // flat8out && fm && wd == 16
|
||||
and v15.16b, v15.16b, v14.16b // flat8out && flat8in && fm && wd == 16
|
||||
bic v14.16b, v14.16b, v15.16b // flat8in && fm && wd >= 4 && !flat8out
|
||||
.endif
|
||||
b.eq 1f // skip wd == 4 case
|
||||
.endif
|
||||
|
||||
dup v3.8h, w8 // bitdepth_max
|
||||
sub v2.8h, v22.8h, v25.8h // p1 - q1
|
||||
ushr v3.8h, v3.8h, #1 // 128 << bitdepth_min_8 - 1
|
||||
cmhi v0.8h, v0.8h, v12.8h // hev
|
||||
not v9.16b, v3.16b // - 128 * (1 << bitdepth_min_8)
|
||||
smin v2.8h, v2.8h, v3.8h // iclip_diff(p1 - q1)
|
||||
smax v2.8h, v2.8h, v9.8h // iclip_diff(p1 - q1)
|
||||
and v4.16b, v2.16b, v0.16b // if (hev) iclip_diff(p1 - q1)
|
||||
sub v2.8h, v24.8h, v23.8h
|
||||
movi v5.8h, #3
|
||||
bic v0.16b, v1.16b, v0.16b // (fm && wd >= 4 && !hev)
|
||||
mul v2.8h, v2.8h, v5.8h
|
||||
movi v6.8h, #4
|
||||
add v2.8h, v2.8h, v4.8h
|
||||
smin v2.8h, v2.8h, v3.8h // f = iclip_diff()
|
||||
movi v7.8h, #3
|
||||
smax v2.8h, v2.8h, v9.8h // f = iclip_diff()
|
||||
sqadd v4.8h, v6.8h, v2.8h // f + 4
|
||||
sqadd v5.8h, v7.8h, v2.8h // f + 3
|
||||
smin v4.8h, v4.8h, v3.8h // imin(f + 4, 128 << bitdepth_min_8 - 1)
|
||||
smin v5.8h, v5.8h, v3.8h // imin(f + 3, 128 << bitdepth_min_8 - 1)
|
||||
sshr v4.8h, v4.8h, #3 // f1
|
||||
sshr v5.8h, v5.8h, #3 // f2
|
||||
movi v9.8h, #0
|
||||
dup v3.8h, w8 // bitdepth_max
|
||||
sqadd v2.8h, v23.8h, v5.8h // p0 + f2
|
||||
sqsub v6.8h, v24.8h, v4.8h // q0 - f1
|
||||
srshr v4.8h, v4.8h, #1 // (f1 + 1) >> 1
|
||||
smin v2.8h, v2.8h, v3.8h // out p0 = iclip_pixel()
|
||||
smin v6.8h, v6.8h, v3.8h // out q0 = iclip_pixel()
|
||||
smax v2.8h, v2.8h, v9.8h // out p0 = iclip_pixel()
|
||||
smax v6.8h, v6.8h, v9.8h // out q0 = iclip_pixel()
|
||||
bit v23.16b, v2.16b, v1.16b // if (fm && wd >= 4)
|
||||
bit v24.16b, v6.16b, v1.16b // if (fm && wd >= 4)
|
||||
sqadd v2.8h, v22.8h, v4.8h // p1 + f
|
||||
sqsub v6.8h, v25.8h, v4.8h // q1 - f
|
||||
smin v2.8h, v2.8h, v3.8h // out p1 = iclip_pixel()
|
||||
smin v6.8h, v6.8h, v3.8h // out q1 = iclip_pixel()
|
||||
smax v2.8h, v2.8h, v9.8h // out p1 = iclip_pixel()
|
||||
smax v6.8h, v6.8h, v9.8h // out q1 = iclip_pixel()
|
||||
bit v22.16b, v2.16b, v0.16b // if (fm && wd >= 4 && !hev)
|
||||
bit v25.16b, v6.16b, v0.16b // if (fm && wd >= 4 && !hev)
|
||||
1:
|
||||
|
||||
.if \wd == 6
|
||||
mov x16, v14.d[0]
|
||||
mov x17, v14.d[1]
|
||||
adds x16, x16, x17
|
||||
b.eq 2f // skip if there's no flat8in
|
||||
|
||||
add v0.8h, v21.8h, v21.8h // p2 * 2
|
||||
add v2.8h, v21.8h, v22.8h // p2 + p1
|
||||
add v4.8h, v22.8h, v23.8h // p1 + p0
|
||||
add v6.8h, v23.8h, v24.8h // p0 + q0
|
||||
add v8.8h, v0.8h, v2.8h
|
||||
add v10.8h, v4.8h, v6.8h
|
||||
add v12.8h, v24.8h, v25.8h // q0 + q1
|
||||
add v8.8h, v8.8h, v10.8h
|
||||
sub v12.8h, v12.8h, v0.8h
|
||||
add v10.8h, v25.8h, v26.8h // q1 + q2
|
||||
urshr v0.8h, v8.8h, #3 // out p1
|
||||
|
||||
add v8.8h, v8.8h, v12.8h
|
||||
sub v10.8h, v10.8h, v2.8h
|
||||
add v12.8h, v26.8h, v26.8h // q2 + q2
|
||||
urshr v1.8h, v8.8h, #3 // out p0
|
||||
|
||||
add v8.8h, v8.8h, v10.8h
|
||||
sub v12.8h, v12.8h, v4.8h
|
||||
urshr v2.8h, v8.8h, #3 // out q0
|
||||
|
||||
bit v22.16b, v0.16b, v14.16b // p1 if (flat8in)
|
||||
add v8.8h, v8.8h, v12.8h
|
||||
bit v23.16b, v1.16b, v14.16b // p0 if (flat8in)
|
||||
urshr v3.8h, v8.8h, #3 // out q1
|
||||
bit v24.16b, v2.16b, v14.16b // q0 if (flat8in)
|
||||
bit v25.16b, v3.16b, v14.16b // q1 if (flat8in)
|
||||
.elseif \wd >= 8
|
||||
mov x16, v14.d[0]
|
||||
mov x17, v14.d[1]
|
||||
adds x16, x16, x17
|
||||
.if \wd == 8
|
||||
b.eq 8f // skip if there's no flat8in
|
||||
.else
|
||||
b.eq 2f // skip if there's no flat8in
|
||||
.endif
|
||||
|
||||
add v0.8h, v20.8h, v21.8h // p3 + p2
|
||||
add v2.8h, v22.8h, v25.8h // p1 + q1
|
||||
add v4.8h, v20.8h, v22.8h // p3 + p1
|
||||
add v6.8h, v23.8h, v26.8h // p0 + q2
|
||||
add v8.8h, v0.8h, v0.8h // 2 * (p3 + p2)
|
||||
add v9.8h, v23.8h, v24.8h // p0 + q0
|
||||
add v8.8h, v8.8h, v4.8h // + p3 + p1
|
||||
sub v2.8h, v2.8h, v0.8h // p1 + q1 - p3 - p2
|
||||
add v8.8h, v8.8h, v9.8h // + p0 + q0
|
||||
sub v6.8h, v6.8h, v4.8h // p0 + q2 - p3 - p1
|
||||
urshr v10.8h, v8.8h, #3 // out p2
|
||||
|
||||
add v8.8h, v8.8h, v2.8h
|
||||
add v0.8h, v20.8h, v23.8h // p3 + p0
|
||||
add v2.8h, v24.8h, v27.8h // q0 + q3
|
||||
urshr v11.8h, v8.8h, #3 // out p1
|
||||
|
||||
add v8.8h, v8.8h, v6.8h
|
||||
sub v2.8h, v2.8h, v0.8h // q0 + q3 - p3 - p0
|
||||
add v4.8h, v21.8h, v24.8h // p2 + q0
|
||||
add v6.8h, v25.8h, v27.8h // q1 + q3
|
||||
urshr v12.8h, v8.8h, #3 // out p0
|
||||
|
||||
add v8.8h, v8.8h, v2.8h
|
||||
sub v6.8h, v6.8h, v4.8h // q1 + q3 - p2 - q0
|
||||
add v0.8h, v22.8h, v25.8h // p1 + q1
|
||||
add v2.8h, v26.8h, v27.8h // q2 + q3
|
||||
urshr v13.8h, v8.8h, #3 // out q0
|
||||
|
||||
add v8.8h, v8.8h, v6.8h
|
||||
sub v2.8h, v2.8h, v0.8h // q2 + q3 - p1 - q1
|
||||
urshr v0.8h, v8.8h, #3 // out q1
|
||||
|
||||
add v8.8h, v8.8h, v2.8h
|
||||
|
||||
bit v21.16b, v10.16b, v14.16b
|
||||
bit v22.16b, v11.16b, v14.16b
|
||||
bit v23.16b, v12.16b, v14.16b
|
||||
urshr v1.8h, v8.8h, #3 // out q2
|
||||
bit v24.16b, v13.16b, v14.16b
|
||||
bit v25.16b, v0.16b, v14.16b
|
||||
bit v26.16b, v1.16b, v14.16b
|
||||
.endif
|
||||
2:
|
||||
.if \wd == 16
|
||||
mov x16, v15.d[0]
|
||||
mov x17, v15.d[1]
|
||||
adds x16, x16, x17
|
||||
b.ne 1f // check if flat8out is needed
|
||||
mov x16, v14.d[0]
|
||||
mov x17, v14.d[1]
|
||||
adds x16, x16, x17
|
||||
b.eq 8f // if there was no flat8in, just write the inner 4 pixels
|
||||
b 7f // if flat8in was used, write the inner 6 pixels
|
||||
1:
|
||||
|
||||
add v2.8h, v17.8h, v17.8h // p6 + p6
|
||||
add v4.8h, v17.8h, v18.8h // p6 + p5
|
||||
add v6.8h, v17.8h, v19.8h // p6 + p4
|
||||
add v8.8h, v17.8h, v20.8h // p6 + p3
|
||||
add v12.8h, v2.8h, v4.8h
|
||||
add v10.8h, v6.8h, v8.8h
|
||||
add v6.8h, v17.8h, v21.8h // p6 + p2
|
||||
add v12.8h, v12.8h, v10.8h
|
||||
add v8.8h, v17.8h, v22.8h // p6 + p1
|
||||
add v10.8h, v18.8h, v23.8h // p5 + p0
|
||||
add v6.8h, v6.8h, v8.8h
|
||||
add v8.8h, v19.8h, v24.8h // p4 + q0
|
||||
add v12.8h, v12.8h, v6.8h
|
||||
add v10.8h, v10.8h, v8.8h
|
||||
add v6.8h, v20.8h, v25.8h // p3 + q1
|
||||
add v12.8h, v12.8h, v10.8h
|
||||
sub v6.8h, v6.8h, v2.8h
|
||||
add v2.8h, v21.8h, v26.8h // p2 + q2
|
||||
urshr v0.8h, v12.8h, #4 // out p5
|
||||
add v12.8h, v12.8h, v6.8h // - (p6 + p6) + (p3 + q1)
|
||||
sub v2.8h, v2.8h, v4.8h
|
||||
add v4.8h, v22.8h, v27.8h // p1 + q3
|
||||
add v6.8h, v17.8h, v19.8h // p6 + p4
|
||||
urshr v1.8h, v12.8h, #4 // out p4
|
||||
add v12.8h, v12.8h, v2.8h // - (p6 + p5) + (p2 + q2)
|
||||
sub v4.8h, v4.8h, v6.8h
|
||||
add v6.8h, v23.8h, v28.8h // p0 + q4
|
||||
add v8.8h, v17.8h, v20.8h // p6 + p3
|
||||
urshr v2.8h, v12.8h, #4 // out p3
|
||||
add v12.8h, v12.8h, v4.8h // - (p6 + p4) + (p1 + q3)
|
||||
sub v6.8h, v6.8h, v8.8h
|
||||
add v8.8h, v24.8h, v29.8h // q0 + q5
|
||||
add v4.8h, v17.8h, v21.8h // p6 + p2
|
||||
urshr v3.8h, v12.8h, #4 // out p2
|
||||
add v12.8h, v12.8h, v6.8h // - (p6 + p3) + (p0 + q4)
|
||||
sub v8.8h, v8.8h, v4.8h
|
||||
add v6.8h, v25.8h, v30.8h // q1 + q6
|
||||
add v10.8h, v17.8h, v22.8h // p6 + p1
|
||||
urshr v4.8h, v12.8h, #4 // out p1
|
||||
add v12.8h, v12.8h, v8.8h // - (p6 + p2) + (q0 + q5)
|
||||
sub v6.8h, v6.8h, v10.8h
|
||||
add v8.8h, v26.8h, v30.8h // q2 + q6
|
||||
bif v0.16b, v18.16b, v15.16b // out p5
|
||||
add v10.8h, v18.8h, v23.8h // p5 + p0
|
||||
urshr v5.8h, v12.8h, #4 // out p0
|
||||
add v12.8h, v12.8h, v6.8h // - (p6 + p1) + (q1 + q6)
|
||||
sub v8.8h, v8.8h, v10.8h
|
||||
add v10.8h, v27.8h, v30.8h // q3 + q6
|
||||
bif v1.16b, v19.16b, v15.16b // out p4
|
||||
add v18.8h, v19.8h, v24.8h // p4 + q0
|
||||
urshr v6.8h, v12.8h, #4 // out q0
|
||||
add v12.8h, v12.8h, v8.8h // - (p5 + p0) + (q2 + q6)
|
||||
sub v10.8h, v10.8h, v18.8h
|
||||
add v8.8h, v28.8h, v30.8h // q4 + q6
|
||||
bif v2.16b, v20.16b, v15.16b // out p3
|
||||
add v18.8h, v20.8h, v25.8h // p3 + q1
|
||||
urshr v7.8h, v12.8h, #4 // out q1
|
||||
add v12.8h, v12.8h, v10.8h // - (p4 + q0) + (q3 + q6)
|
||||
sub v18.8h, v8.8h, v18.8h
|
||||
add v10.8h, v29.8h, v30.8h // q5 + q6
|
||||
bif v3.16b, v21.16b, v15.16b // out p2
|
||||
add v20.8h, v21.8h, v26.8h // p2 + q2
|
||||
urshr v8.8h, v12.8h, #4 // out q2
|
||||
add v12.8h, v12.8h, v18.8h // - (p3 + q1) + (q4 + q6)
|
||||
sub v10.8h, v10.8h, v20.8h
|
||||
add v18.8h, v30.8h, v30.8h // q6 + q6
|
||||
bif v4.16b, v22.16b, v15.16b // out p1
|
||||
add v20.8h, v22.8h, v27.8h // p1 + q3
|
||||
urshr v9.8h, v12.8h, #4 // out q3
|
||||
add v12.8h, v12.8h, v10.8h // - (p2 + q2) + (q5 + q6)
|
||||
sub v18.8h, v18.8h, v20.8h
|
||||
bif v5.16b, v23.16b, v15.16b // out p0
|
||||
urshr v10.8h, v12.8h, #4 // out q4
|
||||
add v12.8h, v12.8h, v18.8h // - (p1 + q3) + (q6 + q6)
|
||||
urshr v11.8h, v12.8h, #4 // out q5
|
||||
bif v6.16b, v24.16b, v15.16b // out q0
|
||||
bif v7.16b, v25.16b, v15.16b // out q1
|
||||
bif v8.16b, v26.16b, v15.16b // out q2
|
||||
bif v9.16b, v27.16b, v15.16b // out q3
|
||||
bif v10.16b, v28.16b, v15.16b // out q4
|
||||
bif v11.16b, v29.16b, v15.16b // out q5
|
||||
.endif
|
||||
|
||||
ret
|
||||
.if \wd == 16
|
||||
7:
|
||||
// Return to a shorter epilogue, writing only the inner 6 pixels
|
||||
br x13
|
||||
.endif
|
||||
.if \wd >= 8
|
||||
8:
|
||||
// Return to a shorter epilogue, writing only the inner 4 pixels
|
||||
br x14
|
||||
.endif
|
||||
9:
|
||||
// Return directly without writing back any pixels
|
||||
br x15
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
loop_filter 16
|
||||
loop_filter 8
|
||||
loop_filter 6
|
||||
loop_filter 4
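// For reference, a scalar sketch of the wd == 4 inner filter implemented in
// loop_filter above, assembled from its inline comments (an assumption:
// iclip_diff clips to +/-(128 << bitdepth_min_8), iclip_pixel to
// [0, bitdepth_max], as in the dav1d C code):
//   f  = iclip_diff(3 * (q0 - p0) + (hev ? iclip_diff(p1 - q1) : 0));
//   f1 = imin(f + 4, (128 << bitdepth_min_8) - 1) >> 3;
//   f2 = imin(f + 3, (128 << bitdepth_min_8) - 1) >> 3;
//   p0 = iclip_pixel(p0 + f2);
//   q0 = iclip_pixel(q0 - f1);
//   if (!hev) { p1 = iclip_pixel(p1 + ((f1 + 1) >> 1));
//               q1 = iclip_pixel(q1 - ((f1 + 1) >> 1)); }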

.macro lpf_8_wd16
adr x13, 7f
adr x14, 8f
bl lpf_8_wd16_neon
.endm

.macro lpf_8_wd8
adr x14, 8f
bl lpf_8_wd8_neon
.endm

.macro lpf_8_wd6
bl lpf_8_wd6_neon
.endm

.macro lpf_8_wd4
bl lpf_8_wd4_neon
.endm

function lpf_v_4_8_neon
|
||||
mov x15, x30
|
||||
sub x16, x0, x1, lsl #1
|
||||
ld1 {v22.8h}, [x16], x1 // p1
|
||||
ld1 {v24.8h}, [x0], x1 // q0
|
||||
ld1 {v23.8h}, [x16], x1 // p0
|
||||
ld1 {v25.8h}, [x0], x1 // q1
|
||||
sub x0, x0, x1, lsl #1
|
||||
|
||||
lpf_8_wd4
|
||||
|
||||
sub x16, x0, x1, lsl #1
|
||||
st1 {v22.8h}, [x16], x1 // p1
|
||||
st1 {v24.8h}, [x0], x1 // q0
|
||||
st1 {v23.8h}, [x16], x1 // p0
|
||||
st1 {v25.8h}, [x0], x1 // q1
|
||||
sub x0, x0, x1, lsl #1
|
||||
br x15
|
||||
endfunc
|
||||
|
||||
function lpf_h_4_8_neon
|
||||
mov x15, x30
|
||||
sub x16, x0, #4
|
||||
add x0, x16, x1, lsl #2
|
||||
ld1 {v22.d}[0], [x16], x1
|
||||
ld1 {v22.d}[1], [x0], x1
|
||||
ld1 {v23.d}[0], [x16], x1
|
||||
ld1 {v23.d}[1], [x0], x1
|
||||
ld1 {v24.d}[0], [x16], x1
|
||||
ld1 {v24.d}[1], [x0], x1
|
||||
ld1 {v25.d}[0], [x16], x1
|
||||
ld1 {v25.d}[1], [x0], x1
|
||||
add x0, x0, #4
|
||||
|
||||
transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
|
||||
|
||||
lpf_8_wd4
|
||||
|
||||
sub x16, x0, x1, lsl #3
|
||||
sub x16, x16, #4
|
||||
transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
|
||||
add x0, x16, x1, lsl #2
|
||||
|
||||
st1 {v22.d}[0], [x16], x1
|
||||
st1 {v22.d}[1], [x0], x1
|
||||
st1 {v23.d}[0], [x16], x1
|
||||
st1 {v23.d}[1], [x0], x1
|
||||
st1 {v24.d}[0], [x16], x1
|
||||
st1 {v24.d}[1], [x0], x1
|
||||
st1 {v25.d}[0], [x16], x1
|
||||
st1 {v25.d}[1], [x0], x1
|
||||
add x0, x0, #4
|
||||
br x15
|
||||
endfunc
|
||||
|
||||
function lpf_v_6_8_neon
|
||||
mov x15, x30
|
||||
sub x16, x0, x1, lsl #1
|
||||
sub x16, x16, x1
|
||||
ld1 {v21.8h}, [x16], x1 // p2
|
||||
ld1 {v24.8h}, [x0], x1 // q0
|
||||
ld1 {v22.8h}, [x16], x1 // p1
|
||||
ld1 {v25.8h}, [x0], x1 // q1
|
||||
ld1 {v23.8h}, [x16], x1 // p0
|
||||
ld1 {v26.8h}, [x0], x1 // q2
|
||||
sub x0, x0, x1, lsl #1
|
||||
sub x0, x0, x1
|
||||
|
||||
lpf_8_wd6
|
||||
|
||||
sub x16, x0, x1, lsl #1
|
||||
st1 {v22.8h}, [x16], x1 // p1
|
||||
st1 {v24.8h}, [x0], x1 // q0
|
||||
st1 {v23.8h}, [x16], x1 // p0
|
||||
st1 {v25.8h}, [x0], x1 // q1
|
||||
sub x0, x0, x1, lsl #1
|
||||
br x15
|
||||
endfunc
|
||||
|
||||
function lpf_h_6_8_neon
|
||||
mov x15, x30
|
||||
sub x16, x0, #8
|
||||
add x0, x16, x1, lsl #2
|
||||
ld1 {v20.8h}, [x16], x1
|
||||
ld1 {v24.8h}, [x0], x1
|
||||
ld1 {v21.8h}, [x16], x1
|
||||
ld1 {v25.8h}, [x0], x1
|
||||
ld1 {v22.8h}, [x16], x1
|
||||
ld1 {v26.8h}, [x0], x1
|
||||
ld1 {v23.8h}, [x16], x1
|
||||
ld1 {v27.8h}, [x0], x1
|
||||
add x0, x0, #8
|
||||
|
||||
transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
|
||||
|
||||
lpf_8_wd6
|
||||
|
||||
sub x16, x0, x1, lsl #3
|
||||
sub x16, x16, #4
|
||||
transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
|
||||
add x0, x16, x1, lsl #2
|
||||
|
||||
st1 {v22.d}[0], [x16], x1
|
||||
st1 {v22.d}[1], [x0], x1
|
||||
st1 {v23.d}[0], [x16], x1
|
||||
st1 {v23.d}[1], [x0], x1
|
||||
st1 {v24.d}[0], [x16], x1
|
||||
st1 {v24.d}[1], [x0], x1
|
||||
st1 {v25.d}[0], [x16], x1
|
||||
st1 {v25.d}[1], [x0], x1
|
||||
add x0, x0, #4
|
||||
br x15
|
||||
endfunc
|
||||
|
||||
function lpf_v_8_8_neon
|
||||
mov x15, x30
|
||||
sub x16, x0, x1, lsl #2
|
||||
ld1 {v20.8h}, [x16], x1 // p3
|
||||
ld1 {v24.8h}, [x0], x1 // q0
|
||||
ld1 {v21.8h}, [x16], x1 // p2
|
||||
ld1 {v25.8h}, [x0], x1 // q1
|
||||
ld1 {v22.8h}, [x16], x1 // p1
|
||||
ld1 {v26.8h}, [x0], x1 // q2
|
||||
ld1 {v23.8h}, [x16], x1 // p0
|
||||
ld1 {v27.8h}, [x0], x1 // q3
|
||||
sub x0, x0, x1, lsl #2
|
||||
|
||||
lpf_8_wd8
|
||||
|
||||
sub x16, x0, x1, lsl #1
|
||||
sub x16, x16, x1
|
||||
st1 {v21.8h}, [x16], x1 // p2
|
||||
st1 {v24.8h}, [x0], x1 // q0
|
||||
st1 {v22.8h}, [x16], x1 // p1
|
||||
st1 {v25.8h}, [x0], x1 // q1
|
||||
st1 {v23.8h}, [x16], x1 // p0
|
||||
st1 {v26.8h}, [x0], x1 // q2
|
||||
sub x0, x0, x1, lsl #1
|
||||
sub x0, x0, x1
|
||||
br x15
|
||||
|
||||
8:
|
||||
sub x16, x0, x1, lsl #1
|
||||
st1 {v22.8h}, [x16], x1 // p1
|
||||
st1 {v24.8h}, [x0], x1 // q0
|
||||
st1 {v23.8h}, [x16], x1 // p0
|
||||
st1 {v25.8h}, [x0], x1 // q1
|
||||
sub x0, x0, x1, lsl #1
|
||||
br x15
|
||||
endfunc
|
||||
|
||||
function lpf_h_8_8_neon
|
||||
mov x15, x30
|
||||
sub x16, x0, #8
|
||||
add x0, x16, x1, lsl #2
|
||||
ld1 {v20.8h}, [x16], x1
|
||||
ld1 {v24.8h}, [x0], x1
|
||||
ld1 {v21.8h}, [x16], x1
|
||||
ld1 {v25.8h}, [x0], x1
|
||||
ld1 {v22.8h}, [x16], x1
|
||||
ld1 {v26.8h}, [x0], x1
|
||||
ld1 {v23.8h}, [x16], x1
|
||||
ld1 {v27.8h}, [x0], x1
|
||||
add x0, x0, #8
|
||||
|
||||
transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
|
||||
|
||||
lpf_8_wd8
|
||||
|
||||
sub x16, x0, x1, lsl #3
|
||||
sub x16, x16, #8
|
||||
transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
|
||||
add x0, x16, x1, lsl #2
|
||||
|
||||
st1 {v20.8h}, [x16], x1
|
||||
st1 {v24.8h}, [x0], x1
|
||||
st1 {v21.8h}, [x16], x1
|
||||
st1 {v25.8h}, [x0], x1
|
||||
st1 {v22.8h}, [x16], x1
|
||||
st1 {v26.8h}, [x0], x1
|
||||
st1 {v23.8h}, [x16], x1
|
||||
st1 {v27.8h}, [x0], x1
|
||||
add x0, x0, #8
|
||||
br x15
|
||||
8:
|
||||
sub x16, x0, x1, lsl #3
|
||||
sub x16, x16, #4
|
||||
transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
|
||||
add x0, x16, x1, lsl #2
|
||||
|
||||
st1 {v22.d}[0], [x16], x1
|
||||
st1 {v22.d}[1], [x0], x1
|
||||
st1 {v23.d}[0], [x16], x1
|
||||
st1 {v23.d}[1], [x0], x1
|
||||
st1 {v24.d}[0], [x16], x1
|
||||
st1 {v24.d}[1], [x0], x1
|
||||
st1 {v25.d}[0], [x16], x1
|
||||
st1 {v25.d}[1], [x0], x1
|
||||
add x0, x0, #4
|
||||
br x15
|
||||
endfunc
|
||||
|
||||
function lpf_v_16_8_neon
|
||||
mov x15, x30
|
||||
|
||||
sub x16, x0, x1, lsl #3
|
||||
add x16, x16, x1
|
||||
ld1 {v17.8h}, [x16], x1 // p6
|
||||
ld1 {v24.8h}, [x0], x1 // q0
|
||||
ld1 {v18.8h}, [x16], x1 // p5
|
||||
ld1 {v25.8h}, [x0], x1 // q1
|
||||
ld1 {v19.8h}, [x16], x1 // p4
|
||||
ld1 {v26.8h}, [x0], x1 // q2
|
||||
ld1 {v20.8h}, [x16], x1 // p3
|
||||
ld1 {v27.8h}, [x0], x1 // q3
|
||||
ld1 {v21.8h}, [x16], x1 // p2
|
||||
ld1 {v28.8h}, [x0], x1 // q4
|
||||
ld1 {v22.8h}, [x16], x1 // p1
|
||||
ld1 {v29.8h}, [x0], x1 // q5
|
||||
ld1 {v23.8h}, [x16], x1 // p0
|
||||
ld1 {v30.8h}, [x0], x1 // q6
|
||||
sub x0, x0, x1, lsl #3
|
||||
add x0, x0, x1
|
||||
|
||||
lpf_8_wd16
|
||||
|
||||
sub x16, x0, x1, lsl #2
|
||||
sub x16, x16, x1, lsl #1
|
||||
st1 {v0.8h}, [x16], x1 // p5
|
||||
st1 {v6.8h}, [x0], x1 // q0
|
||||
st1 {v1.8h}, [x16], x1 // p4
|
||||
st1 {v7.8h}, [x0], x1 // q1
|
||||
st1 {v2.8h}, [x16], x1 // p3
|
||||
st1 {v8.8h}, [x0], x1 // q2
|
||||
st1 {v3.8h}, [x16], x1 // p2
|
||||
st1 {v9.8h}, [x0], x1 // q3
|
||||
st1 {v4.8h}, [x16], x1 // p1
|
||||
st1 {v10.8h}, [x0], x1 // q4
|
||||
st1 {v5.8h}, [x16], x1 // p0
|
||||
st1 {v11.8h}, [x0], x1 // q5
|
||||
sub x0, x0, x1, lsl #2
|
||||
sub x0, x0, x1, lsl #1
|
||||
br x15
|
||||
7:
|
||||
sub x16, x0, x1
|
||||
sub x16, x16, x1, lsl #1
|
||||
st1 {v21.8h}, [x16], x1 // p2
|
||||
st1 {v24.8h}, [x0], x1 // q0
|
||||
st1 {v22.8h}, [x16], x1 // p1
|
||||
st1 {v25.8h}, [x0], x1 // q1
|
||||
st1 {v23.8h}, [x16], x1 // p0
|
||||
st1 {v26.8h}, [x0], x1 // q2
|
||||
sub x0, x0, x1, lsl #1
|
||||
sub x0, x0, x1
|
||||
br x15
|
||||
|
||||
8:
|
||||
sub x16, x0, x1, lsl #1
|
||||
st1 {v22.8h}, [x16], x1 // p1
|
||||
st1 {v24.8h}, [x0], x1 // q0
|
||||
st1 {v23.8h}, [x16], x1 // p0
|
||||
st1 {v25.8h}, [x0], x1 // q1
|
||||
sub x0, x0, x1, lsl #1
|
||||
br x15
|
||||
endfunc
|
||||
|
||||
function lpf_h_16_8_neon
|
||||
mov x15, x30
|
||||
sub x16, x0, #16
|
||||
ld1 {v16.8h}, [x16], x1
|
||||
ld1 {v24.8h}, [x0], x1
|
||||
ld1 {v17.8h}, [x16], x1
|
||||
ld1 {v25.8h}, [x0], x1
|
||||
ld1 {v18.8h}, [x16], x1
|
||||
ld1 {v26.8h}, [x0], x1
|
||||
ld1 {v19.8h}, [x16], x1
|
||||
ld1 {v27.8h}, [x0], x1
|
||||
ld1 {v20.8h}, [x16], x1
|
||||
ld1 {v28.8h}, [x0], x1
|
||||
ld1 {v21.8h}, [x16], x1
|
||||
ld1 {v29.8h}, [x0], x1
|
||||
ld1 {v22.8h}, [x16], x1
|
||||
ld1 {v30.8h}, [x0], x1
|
||||
ld1 {v23.8h}, [x16], x1
|
||||
ld1 {v31.8h}, [x0], x1
|
||||
|
||||
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
|
||||
transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
|
||||
|
||||
lpf_8_wd16
|
||||
|
||||
sub x0, x0, x1, lsl #3
|
||||
sub x16, x0, #16
|
||||
|
||||
transpose_8x8h v16, v17, v0, v1, v2, v3, v4, v5, v18, v19
|
||||
transpose_8x8h v6, v7, v8, v9, v10, v11, v30, v31, v18, v19
|
||||
|
||||
st1 {v16.8h}, [x16], x1
|
||||
st1 {v6.8h}, [x0], x1
|
||||
st1 {v17.8h}, [x16], x1
|
||||
st1 {v7.8h}, [x0], x1
|
||||
st1 {v0.8h}, [x16], x1
|
||||
st1 {v8.8h}, [x0], x1
|
||||
st1 {v1.8h}, [x16], x1
|
||||
st1 {v9.8h}, [x0], x1
|
||||
st1 {v2.8h}, [x16], x1
|
||||
st1 {v10.8h}, [x0], x1
|
||||
st1 {v3.8h}, [x16], x1
|
||||
st1 {v11.8h}, [x0], x1
|
||||
st1 {v4.8h}, [x16], x1
|
||||
st1 {v30.8h}, [x0], x1
|
||||
st1 {v5.8h}, [x16], x1
|
||||
st1 {v31.8h}, [x0], x1
|
||||
br x15
|
||||
|
||||
7:
|
||||
sub x16, x0, x1, lsl #3
|
||||
sub x16, x16, #8
|
||||
transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
|
||||
add x0, x16, x1, lsl #2
|
||||
|
||||
st1 {v20.8h}, [x16], x1
|
||||
st1 {v24.8h}, [x0], x1
|
||||
st1 {v21.8h}, [x16], x1
|
||||
st1 {v25.8h}, [x0], x1
|
||||
st1 {v22.8h}, [x16], x1
|
||||
st1 {v26.8h}, [x0], x1
|
||||
st1 {v23.8h}, [x16], x1
|
||||
st1 {v27.8h}, [x0], x1
|
||||
add x0, x0, #8
|
||||
br x15
|
||||
8:
|
||||
sub x16, x0, x1, lsl #3
|
||||
sub x16, x16, #4
|
||||
transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
|
||||
add x0, x16, x1, lsl #2
|
||||
|
||||
st1 {v22.d}[0], [x16], x1
|
||||
st1 {v22.d}[1], [x0], x1
|
||||
st1 {v23.d}[0], [x16], x1
|
||||
st1 {v23.d}[1], [x0], x1
|
||||
st1 {v24.d}[0], [x16], x1
|
||||
st1 {v24.d}[1], [x0], x1
|
||||
st1 {v25.d}[0], [x16], x1
|
||||
st1 {v25.d}[1], [x0], x1
|
||||
add x0, x0, #4
|
||||
br x15
|
||||
endfunc

// void dav1d_lpf_v_sb_y_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const uint32_t *const vmask,
// const uint8_t (*l)[4], ptrdiff_t b4_stride,
// const Av1FilterLUT *lut, const int w,
// const int bitdepth_max)

.macro lpf_func dir, type
function lpf_\dir\()_sb_\type\()_16bpc_neon, export=1
mov x11, x30
mov w8, w7 // bitdepth_max
clz w9, w8
mov w10, #24
sub w9, w10, w9 // bitdepth_min_8
stp d8, d9, [sp, #-0x40]!
stp d10, d11, [sp, #0x10]
stp d12, d13, [sp, #0x20]
stp d14, d15, [sp, #0x30]
ldp w6, w7, [x2] // vmask[0], vmask[1]
.ifc \type, y
ldr w2, [x2, #8] // vmask[2]
.endif
add x5, x5, #128 // Move to sharp part of lut
.ifc \type, y
orr w7, w7, w2 // vmask[1] |= vmask[2]
.endif
.ifc \dir, v
sub x4, x3, x4, lsl #2
.else
sub x3, x3, #4
lsl x4, x4, #2
.endif
orr w6, w6, w7 // vmask[0] |= vmask[1]

1:
|
||||
tst w6, #0x0f
|
||||
.ifc \dir, v
|
||||
ld1 {v0.8b}, [x4], #8
|
||||
ld1 {v1.8b}, [x3], #8
|
||||
.else
|
||||
ld2 {v0.s,v1.s}[0], [x3], x4
|
||||
ld2 {v0.s,v1.s}[1], [x3], x4
|
||||
.endif
|
||||
b.eq 7f // if (!(vm & bits)) continue;
|
||||
|
||||
ld1r {v5.8b}, [x5] // sharp[0]
|
||||
add x5, x5, #8
|
||||
movi v2.2s, #0xff
|
||||
dup v13.2s, w6 // vmask[0]
|
||||
dup v31.8h, w9 // bitdepth_min_8
|
||||
|
||||
and v0.8b, v0.8b, v2.8b // Keep only lowest byte in each 32 bit word
|
||||
and v1.8b, v1.8b, v2.8b
|
||||
cmtst v3.8b, v1.8b, v2.8b // Check for nonzero values in l[0][0]
|
||||
movi v4.8b, #1
|
||||
ld1r {v6.8b}, [x5] // sharp[1]
|
||||
sub x5, x5, #8
|
||||
bif v1.8b, v0.8b, v3.8b // if (!l[0][0]) L = l[offset][0]
|
||||
mul v1.2s, v1.2s, v4.2s // L
|
||||
.ifc \type, y
|
||||
dup v15.2s, w2 // vmask[2]
|
||||
.endif
|
||||
cmtst v2.2s, v1.2s, v2.2s // L != 0
|
||||
dup v14.2s, w7 // vmask[1]
|
||||
mov x16, v2.d[0]
|
||||
cmp x16, #0
|
||||
b.eq 7f // if (!L) continue;
|
||||
neg v5.8b, v5.8b // -sharp[0]
|
||||
movrel x16, word_12
|
||||
ushr v12.8b, v1.8b, #4 // H
|
||||
ld1 {v16.2s}, [x16]
|
||||
sshl v3.8b, v1.8b, v5.8b // L >> sharp[0]
|
||||
.ifc \type, y
|
||||
cmtst v15.2s, v15.2s, v16.2s // if (vmask[2] & bits)
|
||||
.endif
|
||||
movi v7.8b, #2
|
||||
umin v3.8b, v3.8b, v6.8b // imin(L >> sharp[0], sharp[1])
|
||||
add v0.8b, v1.8b, v7.8b // L + 2
|
||||
umax v11.8b, v3.8b, v4.8b // imax(imin(), 1) = limit = I
|
||||
add v0.8b, v0.8b, v0.8b // 2*(L + 2)
|
||||
cmtst v14.2s, v14.2s, v16.2s // if (vmask[1] & bits)
|
||||
uxtl v12.8h, v12.8b
|
||||
add v10.8b, v0.8b, v11.8b // 2*(L + 2) + limit = E
|
||||
cmtst v13.2s, v13.2s, v16.2s // if (vmask[0] & bits)
|
||||
uxtl v11.8h, v11.8b
|
||||
uxtl v10.8h, v10.8b
|
||||
and v13.8b, v13.8b, v2.8b // vmask[0] &= L != 0
|
||||
sxtl v14.8h, v14.8b
|
||||
sxtl v13.8h, v13.8b
|
||||
.ifc \type, y
|
||||
sxtl v15.8h, v15.8b
|
||||
.endif
|
||||
ushl v12.8h, v12.8h, v31.8h
|
||||
ushl v11.8h, v11.8h, v31.8h
|
||||
ushl v10.8h, v10.8h, v31.8h
|
||||
|
||||
.ifc \type, y
|
||||
tst w2, #0x0f
|
||||
b.eq 2f
|
||||
// wd16
|
||||
bl lpf_\dir\()_16_8_neon
|
||||
b 8f
|
||||
2:
|
||||
.endif
|
||||
tst w7, #0x0f
|
||||
b.eq 3f
|
||||
.ifc \type, y
|
||||
// wd8
|
||||
bl lpf_\dir\()_8_8_neon
|
||||
.else
|
||||
// wd6
|
||||
bl lpf_\dir\()_6_8_neon
|
||||
.endif
|
||||
b 8f
|
||||
3:
|
||||
// wd4
|
||||
bl lpf_\dir\()_4_8_neon
|
||||
.ifc \dir, h
|
||||
b 8f
|
||||
7:
|
||||
// For dir h, the functions above increment x0.
|
||||
// If the whole function is skipped, increment it here instead.
|
||||
add x0, x0, x1, lsl #3
|
||||
.else
|
||||
7:
|
||||
.endif
|
||||
8:
|
||||
lsr w6, w6, #2 // vmask[0] >>= 2
|
||||
lsr w7, w7, #2 // vmask[1] >>= 2
|
||||
.ifc \type, y
|
||||
lsr w2, w2, #2 // vmask[2] >>= 2
|
||||
.endif
|
||||
.ifc \dir, v
|
||||
add x0, x0, #16
|
||||
.else
|
||||
// For dir h, x0 is returned incremented
|
||||
.endif
|
||||
cbnz w6, 1b
|
||||
|
||||
ldp d14, d15, [sp, #0x30]
|
||||
ldp d12, d13, [sp, #0x20]
|
||||
ldp d10, d11, [sp, #0x10]
|
||||
ldp d8, d9, [sp], 0x40
|
||||
br x11

endfunc
.endm

lpf_func v, y
lpf_func h, y
lpf_func v, uv
lpf_func h, uv

const word_12
.word 1, 2
endconst

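For reference, lpf_func derives the per-edge thresholds from the per-block
filter level L and the sharpness entries of the lut, as its inline comments
spell out. A scalar sketch (imin/imax as in the dav1d C code; sharp[0] and
sharp[1] stand for the two sharpness fields the code loads from the lut):

    const int L = l[0][0] ? l[0][0] : l[offset][0];       // filter level
    const int H = L >> 4;                                  // hev threshold
    const int I = imax(imin(L >> sharp[0], sharp[1]), 1); // limit
    const int E = 2 * (L + 2) + I;                         // edge limit
    // for 16 bpc, H, I and E are then shifted left by bitdepth_min_8

If L is zero for a whole group of columns, the edge is skipped entirely
(the "if (!L) continue;" early-out in the function above).
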
@ -28,11 +28,11 @@
#include "src/arm/asm.S"
#include "util.S"

// void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4],
// const pixel *src, ptrdiff_t stride,
// const int16_t fh[7], const intptr_t w,
// int h, enum LrEdgeFlags edges);
function wiener_filter_h_neon, export=1
// void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4],
// const pixel *src, ptrdiff_t stride,
// const int16_t fh[7], const intptr_t w,
// int h, enum LrEdgeFlags edges);
function wiener_filter_h_8bpc_neon, export=1
mov w8, w5
ld1 {v0.8h}, [x4]
mov w9, #(1 << 14) - (1 << 2)

@ -306,11 +306,11 @@ L(variable_shift_tbl):
.purgem filter
endfunc

// void dav1d_wiener_filter_v_neon(pixel *dst, ptrdiff_t stride,
// const int16_t *mid, int w, int h,
// const int16_t fv[7], enum LrEdgeFlags edges,
// ptrdiff_t mid_stride);
function wiener_filter_v_neon, export=1
// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride,
// const int16_t *mid, int w, int h,
// const int16_t fv[7], enum LrEdgeFlags edges,
// ptrdiff_t mid_stride);
function wiener_filter_v_8bpc_neon, export=1
mov w8, w4
ld1 {v0.8h}, [x5]
movi v1.8h, #128

@ -482,9 +482,9 @@ function wiener_filter_v_neon, export=1
.purgem filter
endfunc

// void dav1d_copy_narrow_neon(pixel *dst, ptrdiff_t stride,
// const pixel *src, int w, int h);
function copy_narrow_neon, export=1
// void dav1d_copy_narrow_8bpc_neon(pixel *dst, ptrdiff_t stride,
// const pixel *src, int w, int h);
function copy_narrow_8bpc_neon, export=1
adr x5, L(copy_narrow_tbl)
ldrh w6, [x5, w3, uxtw #1]
sub x5, x5, w6, uxth

@ -617,12 +617,14 @@ endfunc

#define SUM_STRIDE (384+16)

// void dav1d_sgr_box3_h_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box3_h_neon, export=1
#include "looprestoration_tmpl.S"

// void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box3_h_8bpc_neon, export=1
add w5, w5, #2 // w += 2

// Set up pointers for reading/writing alternate rows

@ -844,11 +846,11 @@ L(box3_variable_shift_tbl):
umull2 v6.8h, v4.16b, v4.16b

add3 4
subs w5, w5, #4
st1 {v3.4h}, [x1], #8
st1 {v7.4h}, [x11], #8
st1 {v26.4s}, [x0], #16
st1 {v28.4s}, [x10], #16
subs w5, w5, #4
b.le 9f
ext v0.16b, v0.16b, v0.16b, #4
ext v4.16b, v4.16b, v4.16b, #4

@ -879,12 +881,12 @@ L(box3_variable_shift_tbl):
.purgem add3
endfunc

// void dav1d_sgr_box5_h_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box5_h_neon, export=1
// void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box5_h_8bpc_neon, export=1
add w5, w5, #2 // w += 2

// Set up pointers for reading/writing alternate rows

@ -950,7 +952,7 @@ function sgr_box5_h_neon, export=1
b 2f
0:
// !LR_HAVE_LEFT, fill v1 with the leftmost byte
// and shift v0 to have 2x the first byte at the front.
// and shift v0 to have 3x the first byte at the front.
dup v1.16b, v0.b[0]
dup v5.16b, v4.b[0]
// Move x3 back to account for the last 3 bytes we loaded before,

@ -1114,11 +1116,11 @@ L(box5_variable_shift_tbl):
umull2 v6.8h, v4.16b, v4.16b

add5 4
subs w5, w5, #4
st1 {v3.4h}, [x1], #8
st1 {v7.4h}, [x11], #8
st1 {v26.4s}, [x0], #16
st1 {v28.4s}, [x10], #16
subs w5, w5, #4
b.le 9f
ext v0.16b, v0.16b, v0.16b, #4
ext v1.16b, v1.16b, v2.16b, #8

@ -1147,839 +1149,4 @@ L(box5_variable_shift_tbl):
.purgem add5
endfunc

// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box3_v_neon, export=1
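// For reference, a sketch of what this function computes, inferred from its
// structure rather than stated in this patch: each output element is the
// sum of three vertically adjacent inputs, with the h+2 summed rows padded
// at the edges according to the LR_HAVE_TOP/LR_HAVE_BOTTOM flags:
//   sumsq_out[y][x] = sumsq[y - 1][x] + sumsq[y][x] + sumsq[y + 1][x];
//   sum_out[y][x]   = sum[y - 1][x]   + sum[y][x]   + sum[y + 1][x];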
add w10, w3, #2 // Number of output rows to move back
|
||||
mov w11, w3 // Number of input rows to move back
|
||||
add w2, w2, #2 // Actual summed width
|
||||
mov x7, #(4*SUM_STRIDE) // sumsq stride
|
||||
mov x8, #(2*SUM_STRIDE) // sum stride
|
||||
sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride
|
||||
sub x1, x1, #(2*SUM_STRIDE) // sum -= stride
|
||||
|
||||
tst w4, #4 // LR_HAVE_TOP
|
||||
b.eq 0f
|
||||
// If have top, read from row -2.
|
||||
sub x5, x0, #(4*SUM_STRIDE)
|
||||
sub x6, x1, #(2*SUM_STRIDE)
|
||||
add w11, w11, #2
|
||||
b 1f
|
||||
0:
|
||||
// !LR_HAVE_TOP
|
||||
// If we don't have top, read from row 0 even if
|
||||
// we start writing to row -1.
|
||||
add x5, x0, #(4*SUM_STRIDE)
|
||||
add x6, x1, #(2*SUM_STRIDE)
|
||||
1:
|
||||
|
||||
tst w4, #8 // LR_HAVE_BOTTOM
|
||||
b.eq 1f
|
||||
// LR_HAVE_BOTTOM
|
||||
add w3, w3, #2 // Sum all h+2 lines with the main loop
|
||||
add w11, w11, #2
|
||||
1:
|
||||
mov w9, w3 // Backup of h for next loops
|
||||
|
||||
1:
|
||||
// Start of horizontal loop; start one vertical filter slice.
|
||||
// Start loading rows into v16-v21 and v24-v26 taking top
|
||||
// padding into consideration.
|
||||
tst w4, #4 // LR_HAVE_TOP
|
||||
ld1 {v16.4s, v17.4s}, [x5], x7
|
||||
ld1 {v24.8h}, [x6], x8
|
||||
b.eq 2f
|
||||
// LR_HAVE_TOP
|
||||
ld1 {v18.4s, v19.4s}, [x5], x7
|
||||
ld1 {v25.8h}, [x6], x8
|
||||
ld1 {v20.4s, v21.4s}, [x5], x7
|
||||
ld1 {v26.8h}, [x6], x8
|
||||
b 3f
|
||||
2: // !LR_HAVE_TOP
|
||||
mov v18.16b, v16.16b
|
||||
mov v19.16b, v17.16b
|
||||
mov v25.16b, v24.16b
|
||||
mov v20.16b, v16.16b
|
||||
mov v21.16b, v17.16b
|
||||
mov v26.16b, v24.16b
|
||||
|
||||
3:
|
||||
subs w3, w3, #1
|
||||
.macro add3
|
||||
add v16.4s, v16.4s, v18.4s
|
||||
add v17.4s, v17.4s, v19.4s
|
||||
add v24.8h, v24.8h, v25.8h
|
||||
add v16.4s, v16.4s, v20.4s
|
||||
add v17.4s, v17.4s, v21.4s
|
||||
add v24.8h, v24.8h, v26.8h
|
||||
st1 {v16.4s, v17.4s}, [x0], x7
|
||||
st1 {v24.8h}, [x1], x8
|
||||
.endm
|
||||
add3
|
||||
mov v16.16b, v18.16b
|
||||
mov v17.16b, v19.16b
|
||||
mov v24.16b, v25.16b
|
||||
mov v18.16b, v20.16b
|
||||
mov v19.16b, v21.16b
|
||||
mov v25.16b, v26.16b
|
||||
b.le 4f
|
||||
ld1 {v20.4s, v21.4s}, [x5], x7
|
||||
ld1 {v26.8h}, [x6], x8
|
||||
b 3b
|
||||
|
||||
4:
|
||||
tst w4, #8 // LR_HAVE_BOTTOM
|
||||
b.ne 5f
|
||||
// !LR_HAVE_BOTTOM
|
||||
// Produce two more rows, extending the already loaded rows.
|
||||
add3
|
||||
mov v16.16b, v18.16b
|
||||
mov v17.16b, v19.16b
|
||||
mov v24.16b, v25.16b
|
||||
add3
|
||||
|
||||
5: // End of one vertical slice.
|
||||
subs w2, w2, #8
|
||||
b.le 0f
|
||||
// Move pointers back up to the top and loop horizontally.
|
||||
// Input pointers
|
||||
msub x5, x7, x11, x5
|
||||
msub x6, x8, x11, x6
|
||||
// Output pointers
|
||||
msub x0, x7, x10, x0
|
||||
msub x1, x8, x10, x1
|
||||
add x0, x0, #32
|
||||
add x1, x1, #16
|
||||
add x5, x5, #32
|
||||
add x6, x6, #16
|
||||
mov w3, w9
|
||||
b 1b
|
||||
|
||||
0:
|
||||
ret
|
||||
.purgem add3
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
|
||||
// const int w, const int h,
|
||||
// const enum LrEdgeFlags edges);
|
||||
function sgr_box5_v_neon, export=1
|
||||
add w10, w3, #2 // Number of output rows to move back
|
||||
mov w11, w3 // Number of input rows to move back
|
||||
add w2, w2, #8 // Actual summed width
|
||||
mov x7, #(4*SUM_STRIDE) // sumsq stride
|
||||
mov x8, #(2*SUM_STRIDE) // sum stride
|
||||
sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride
|
||||
sub x1, x1, #(2*SUM_STRIDE) // sum -= stride
|
||||
|
||||
tst w4, #4 // LR_HAVE_TOP
|
||||
b.eq 0f
|
||||
// If have top, read from row -2.
|
||||
sub x5, x0, #(4*SUM_STRIDE)
|
||||
sub x6, x1, #(2*SUM_STRIDE)
|
||||
add w11, w11, #2
|
||||
b 1f
|
||||
0:
|
||||
// !LR_HAVE_TOP
|
||||
// If we don't have top, read from row 0 even if
|
||||
// we start writing to row -1.
|
||||
add x5, x0, #(4*SUM_STRIDE)
|
||||
add x6, x1, #(2*SUM_STRIDE)
|
||||
1:
|
||||
|
||||
tst w4, #8 // LR_HAVE_BOTTOM
|
||||
b.eq 0f
|
||||
// LR_HAVE_BOTTOM
|
||||
add w3, w3, #2 // Handle h+2 lines with the main loop
|
||||
add w11, w11, #2
|
||||
b 1f
|
||||
0:
|
||||
// !LR_HAVE_BOTTOM
|
||||
sub w3, w3, #1 // Handle h-1 lines with the main loop
|
||||
1:
|
||||
mov w9, w3 // Backup of h for next loops
|
||||
|
||||
1:
|
||||
// Start of horizontal loop; start one vertical filter slice.
|
||||
// Start loading rows into v16-v25 and v26-v30 taking top
|
||||
// padding into consideration.
|
||||
tst w4, #4 // LR_HAVE_TOP
|
||||
ld1 {v16.4s, v17.4s}, [x5], x7
|
||||
ld1 {v26.8h}, [x6], x8
|
||||
b.eq 2f
|
||||
// LR_HAVE_TOP
|
||||
ld1 {v20.4s, v21.4s}, [x5], x7
|
||||
ld1 {v28.8h}, [x6], x8
|
||||
mov v18.16b, v16.16b
|
||||
mov v19.16b, v17.16b
|
||||
mov v27.16b, v26.16b
|
||||
ld1 {v22.4s, v23.4s}, [x5], x7
|
||||
ld1 {v29.8h}, [x6], x8
|
||||
b 3f
|
||||
2: // !LR_HAVE_TOP
|
||||
mov v18.16b, v16.16b
|
||||
mov v19.16b, v17.16b
|
||||
mov v27.16b, v26.16b
|
||||
mov v20.16b, v16.16b
|
||||
mov v21.16b, v17.16b
|
||||
mov v28.16b, v26.16b
|
||||
mov v22.16b, v16.16b
|
||||
mov v23.16b, v17.16b
|
||||
mov v29.16b, v26.16b
|
||||
|
||||
3:
|
||||
cbz w3, 4f
|
||||
ld1 {v24.4s, v25.4s}, [x5], x7
|
||||
ld1 {v30.8h}, [x6], x8
|
||||
|
||||
3:
|
||||
// Start of vertical loop
|
||||
subs w3, w3, #2
|
||||
.macro add5
|
||||
add v16.4s, v16.4s, v18.4s
|
||||
add v17.4s, v17.4s, v19.4s
|
||||
add v26.8h, v26.8h, v27.8h
|
||||
add v0.4s, v20.4s, v22.4s
|
||||
add v1.4s, v21.4s, v23.4s
|
||||
add v2.8h, v28.8h, v29.8h
|
||||
add v16.4s, v16.4s, v24.4s
|
||||
add v17.4s, v17.4s, v25.4s
|
||||
add v26.8h, v26.8h, v30.8h
|
||||
add v16.4s, v16.4s, v0.4s
|
||||
add v17.4s, v17.4s, v1.4s
|
||||
add v26.8h, v26.8h, v2.8h
|
||||
st1 {v16.4s, v17.4s}, [x0], x7
|
||||
st1 {v26.8h}, [x1], x8
|
||||
.endm
|
||||
add5
|
||||
.macro shift2
|
||||
mov v16.16b, v20.16b
|
||||
mov v17.16b, v21.16b
|
||||
mov v26.16b, v28.16b
|
||||
mov v18.16b, v22.16b
|
||||
mov v19.16b, v23.16b
|
||||
mov v27.16b, v29.16b
|
||||
mov v20.16b, v24.16b
|
||||
mov v21.16b, v25.16b
|
||||
mov v28.16b, v30.16b
|
||||
.endm
|
||||
shift2
|
||||
add x0, x0, x7
|
||||
add x1, x1, x8
|
||||
b.le 5f
|
||||
ld1 {v22.4s, v23.4s}, [x5], x7
|
||||
ld1 {v29.8h}, [x6], x8
|
||||
ld1 {v24.4s, v25.4s}, [x5], x7
|
||||
ld1 {v30.8h}, [x6], x8
|
||||
b 3b
|
||||
|
||||
4:
|
||||
// h == 1, !LR_HAVE_BOTTOM.
|
||||
// Pad the last row with the only content row, and add.
|
||||
mov v24.16b, v22.16b
|
||||
mov v25.16b, v23.16b
|
||||
mov v30.16b, v29.16b
|
||||
add5
|
||||
shift2
|
||||
add x0, x0, x7
|
||||
add x1, x1, x8
|
||||
add5
|
||||
b 6f
|
||||
|
||||
5:
|
||||
tst w4, #8 // LR_HAVE_BOTTOM
|
||||
b.ne 6f
|
||||
// !LR_HAVE_BOTTOM
|
||||
cbnz w3, 5f
|
||||
// The intended three edge rows left; output the one at h-2 and
|
||||
// the past edge one at h.
|
||||
ld1 {v22.4s, v23.4s}, [x5], x7
|
||||
ld1 {v29.8h}, [x6], x8
|
||||
// Pad the past-edge row from the last content row.
|
||||
mov v24.16b, v22.16b
|
||||
mov v25.16b, v23.16b
|
||||
mov v30.16b, v29.16b
|
||||
add5
|
||||
shift2
|
||||
add x0, x0, x7
|
||||
add x1, x1, x8
|
||||
// The last two rows are already padded properly here.
|
||||
add5
|
||||
b 6f
|
||||
|
||||
5:
|
||||
// w3 == -1, two rows left, output one.
|
||||
// Pad the last two rows from the mid one.
|
||||
mov v22.16b, v20.16b
|
||||
mov v23.16b, v21.16b
|
||||
mov v29.16b, v28.16b
|
||||
mov v24.16b, v20.16b
|
||||
mov v25.16b, v21.16b
|
||||
mov v30.16b, v28.16b
|
||||
add5
|
||||
add x0, x0, x7
|
||||
add x1, x1, x8
|
||||
b 6f
|
||||
|
||||
6: // End of one vertical slice.
|
||||
subs w2, w2, #8
|
||||
b.le 0f
|
||||
// Move pointers back up to the top and loop horizontally.
|
||||
// Input pointers
|
||||
msub x5, x7, x11, x5
|
||||
msub x6, x8, x11, x6
|
||||
// Output pointers
|
||||
msub x0, x7, x10, x0
|
||||
msub x1, x8, x10, x1
|
||||
add x0, x0, #32
|
||||
add x1, x1, #16
|
||||
add x5, x5, #32
|
||||
add x6, x6, #16
|
||||
mov w3, w9
|
||||
b 1b
|
||||
|
||||
0:
|
||||
ret
|
||||
.purgem add5
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
|
||||
// const int w, const int h, const int strength);
|
||||
// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
|
||||
// const int w, const int h, const int strength);
|
||||
function sgr_calc_ab1_neon, export=1
|
||||
add x3, x3, #2 // h += 2
|
||||
movi v31.4s, #9 // n
|
||||
mov x5, #455
|
||||
mov x8, #SUM_STRIDE
|
||||
b sgr_calc_ab_neon
|
||||
endfunc
|
||||
|
||||
function sgr_calc_ab2_neon, export=1
|
||||
add x3, x3, #3 // h += 3
|
||||
asr x3, x3, #1 // h /= 2
|
||||
movi v31.4s, #25 // n
|
||||
mov x5, #164
|
||||
mov x8, #(2*SUM_STRIDE)
|
||||
endfunc
|
||||
|
||||
function sgr_calc_ab_neon
|
||||
movrel x12, X(sgr_x_by_x)
|
||||
ld1 {v16.16b, v17.16b, v18.16b}, [x12]
|
||||
movi v19.16b, #5
|
||||
movi v20.8b, #55 // idx of last 5
|
||||
movi v21.8b, #72 // idx of last 4
|
||||
movi v22.8b, #101 // idx of last 3
|
||||
movi v23.8b, #169 // idx of last 2
|
||||
movi v24.8b, #254 // idx of last 1
|
||||
add x2, x2, #2 // w += 2
|
||||
add x7, x2, #7
|
||||
bic x7, x7, #7 // aligned w
|
||||
sub x7, x8, x7 // increment between rows
|
||||
movi v29.8h, #1, lsl #8
|
||||
dup v28.4s, w4
|
||||
dup v30.4s, w5 // one_by_x
|
||||
sub x0, x0, #(4*(SUM_STRIDE))
|
||||
sub x1, x1, #(2*(SUM_STRIDE))
|
||||
mov x6, x2 // backup of w
|
||||
sub v16.16b, v16.16b, v19.16b
|
||||
sub v17.16b, v17.16b, v19.16b
|
||||
sub v18.16b, v18.16b, v19.16b
|
||||
1:
|
||||
subs x2, x2, #8
|
||||
ld1 {v0.4s, v1.4s}, [x0] // a
|
||||
ld1 {v2.8h}, [x1] // b
|
||||
mul v0.4s, v0.4s, v31.4s // a * n
|
||||
mul v1.4s, v1.4s, v31.4s // a * n
|
||||
umull v3.4s, v2.4h, v2.4h // b * b
|
||||
umull2 v4.4s, v2.8h, v2.8h // b * b
|
||||
uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0)
|
||||
uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0)
|
||||
mul v0.4s, v0.4s, v28.4s // p * s
|
||||
mul v1.4s, v1.4s, v28.4s // p * s
|
||||
uqshrn v0.4h, v0.4s, #16
|
||||
uqshrn2 v0.8h, v1.4s, #16
|
||||
uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)
|
||||
|
||||
cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
|
||||
cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
|
||||
tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
|
||||
cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
|
||||
cmhi v5.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
|
||||
add v25.8b, v25.8b, v26.8b
|
||||
cmhi v6.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
|
||||
add v27.8b, v27.8b, v5.8b
|
||||
add v6.8b, v6.8b, v19.8b
|
||||
add v25.8b, v25.8b, v27.8b
|
||||
add v1.8b, v1.8b, v6.8b
|
||||
add v1.8b, v1.8b, v25.8b
|
||||
uxtl v1.8h, v1.8b // x
|
||||
|
||||
umull v3.4s, v1.4h, v2.4h // x * BB[i]
|
||||
umull2 v4.4s, v1.8h, v2.8h // x * BB[i]
|
||||
mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
|
||||
mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
|
||||
srshr v3.4s, v3.4s, #12 // AA[i]
|
||||
srshr v4.4s, v4.4s, #12 // AA[i]
|
||||
sub v2.8h, v29.8h, v1.8h // 256 - x
|
||||
|
||||
st1 {v3.4s, v4.4s}, [x0], #32
|
||||
st1 {v2.8h}, [x1], #16
|
||||
b.gt 1b
|
||||
|
||||
subs x3, x3, #1
|
||||
b.le 0f
|
||||
add x0, x0, x7, lsl #2
|
||||
add x1, x1, x7, lsl #1
|
||||
mov x2, x6
|
||||
b 1b
|
||||
0:
|
||||
ret
|
||||
endfunc
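// For reference, a scalar sketch of sgr_calc_ab's per-element math, pieced
// together from the inline comments above (the exact rounding in the z step
// is an assumption; n, s and one_by_x are the constants set up in
// sgr_calc_ab1/sgr_calc_ab2):
//   p  = imax(a * n - b * b, 0);
//   z  = imin((p * s + (1 << 19)) >> 20, 255);
//   x  = dav1d_sgr_x_by_x[z];
//   a' = (x * b * one_by_x + (1 << 11)) >> 12;  // written back to a
//   b' = 256 - x;                               // written back to b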
|
||||
|
||||
#define FILTER_OUT_STRIDE 384
|
||||
|
||||
// void dav1d_sgr_finish_filter1_neon(coef *tmp,
|
||||
// const pixel *src, const ptrdiff_t stride,
|
||||
// const int32_t *a, const int16_t *b,
|
||||
// const int w, const int h);
|
||||
function sgr_finish_filter1_neon, export=1
|
||||
sub x7, x3, #(4*SUM_STRIDE)
|
||||
add x8, x3, #(4*SUM_STRIDE)
|
||||
sub x9, x4, #(2*SUM_STRIDE)
|
||||
add x10, x4, #(2*SUM_STRIDE)
|
||||
mov x11, #SUM_STRIDE
|
||||
mov x12, #FILTER_OUT_STRIDE
|
||||
add x13, x5, #7
|
||||
bic x13, x13, #7 // Aligned width
|
||||
sub x2, x2, x13
|
||||
sub x12, x12, x13
|
||||
sub x11, x11, x13
|
||||
sub x11, x11, #4 // We read 4 extra elements from a
|
||||
sub x14, x11, #4 // We read 8 extra elements from b
|
||||
mov x13, x5
|
||||
movi v6.8h, #3
|
||||
movi v7.4s, #3
|
||||
1:
|
||||
ld1 {v0.8h, v1.8h}, [x9], #32
|
||||
ld1 {v2.8h, v3.8h}, [x4], #32
|
||||
ld1 {v4.8h, v5.8h}, [x10], #32
|
||||
ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48
|
||||
ld1 {v19.4s, v20.4s, v21.4s}, [x3], #48
|
||||
ld1 {v22.4s, v23.4s, v24.4s}, [x8], #48
|
||||
|
||||
2:
|
||||
subs x5, x5, #8
|
||||
ext v25.16b, v0.16b, v1.16b, #2 // -stride
|
||||
ext v26.16b, v2.16b, v3.16b, #2 // 0
|
||||
ext v27.16b, v4.16b, v5.16b, #2 // +stride
|
||||
ext v28.16b, v0.16b, v1.16b, #4 // +1-stride
|
||||
ext v29.16b, v2.16b, v3.16b, #4 // +1
|
||||
ext v30.16b, v4.16b, v5.16b, #4 // +1+stride
|
||||
add v2.8h, v2.8h, v25.8h // -1, -stride
|
||||
add v26.8h, v26.8h, v27.8h // 0, +stride
|
||||
add v0.8h, v0.8h, v28.8h // -1-stride, +1-stride
|
||||
add v2.8h, v2.8h, v26.8h
|
||||
add v4.8h, v4.8h, v30.8h // -1+stride, +1+stride
|
||||
add v2.8h, v2.8h, v29.8h // +1
|
||||
add v0.8h, v0.8h, v4.8h
|
||||
|
||||
ext v25.16b, v16.16b, v17.16b, #4 // -stride
|
||||
ext v26.16b, v17.16b, v18.16b, #4
|
||||
shl v2.8h, v2.8h, #2
|
||||
ext v27.16b, v16.16b, v17.16b, #8 // +1-stride
|
||||
ext v28.16b, v17.16b, v18.16b, #8
|
||||
ext v29.16b, v19.16b, v20.16b, #4 // 0
|
||||
ext v30.16b, v20.16b, v21.16b, #4
|
||||
mla v2.8h, v0.8h, v6.8h // * 3 -> a
|
||||
add v25.4s, v25.4s, v19.4s // -stride, -1
|
||||
add v26.4s, v26.4s, v20.4s
|
||||
add v16.4s, v16.4s, v27.4s // -1-stride, +1-stride
|
||||
add v17.4s, v17.4s, v28.4s
|
||||
ext v27.16b, v19.16b, v20.16b, #8 // +1
|
||||
ext v28.16b, v20.16b, v21.16b, #8
|
||||
add v16.4s, v16.4s, v22.4s // -1+stride
|
||||
add v17.4s, v17.4s, v23.4s
|
||||
add v29.4s, v29.4s, v27.4s // 0, +1
|
||||
add v30.4s, v30.4s, v28.4s
|
||||
add v25.4s, v25.4s, v29.4s
|
||||
add v26.4s, v26.4s, v30.4s
|
||||
ext v27.16b, v22.16b, v23.16b, #4 // +stride
|
||||
ext v28.16b, v23.16b, v24.16b, #4
|
||||
ext v29.16b, v22.16b, v23.16b, #8 // +1+stride
|
||||
ext v30.16b, v23.16b, v24.16b, #8
|
||||
ld1 {v19.8b}, [x1], #8 // src
|
||||
add v25.4s, v25.4s, v27.4s // +stride
|
||||
add v26.4s, v26.4s, v28.4s
|
||||
add v16.4s, v16.4s, v29.4s // +1+stride
|
||||
add v17.4s, v17.4s, v30.4s
|
||||
shl v25.4s, v25.4s, #2
|
||||
shl v26.4s, v26.4s, #2
|
||||
mla v25.4s, v16.4s, v7.4s // * 3 -> b
|
||||
mla v26.4s, v17.4s, v7.4s
|
||||
uxtl v19.8h, v19.8b // src
|
||||
mov v0.16b, v1.16b
|
||||
umlal v25.4s, v2.4h, v19.4h // b + a * src
|
||||
umlal2 v26.4s, v2.8h, v19.8h
|
||||
mov v2.16b, v3.16b
|
||||
rshrn v25.4h, v25.4s, #9
|
||||
rshrn2 v25.8h, v26.4s, #9
|
||||
mov v4.16b, v5.16b
|
||||
st1 {v25.8h}, [x0], #16
|
||||
|
||||
b.le 3f
|
||||
mov v16.16b, v18.16b
|
||||
mov v19.16b, v21.16b
|
||||
mov v22.16b, v24.16b
|
||||
ld1 {v1.8h}, [x9], #16
|
||||
ld1 {v3.8h}, [x4], #16
|
||||
ld1 {v5.8h}, [x10], #16
|
||||
ld1 {v17.4s, v18.4s}, [x7], #32
|
||||
ld1 {v20.4s, v21.4s}, [x3], #32
|
||||
ld1 {v23.4s, v24.4s}, [x8], #32
|
||||
b 2b
|
||||
|
||||
3:
|
||||
subs x6, x6, #1
|
||||
b.le 0f
|
||||
mov x5, x13
|
||||
add x0, x0, x12, lsl #1
|
||||
add x1, x1, x2
|
||||
add x3, x3, x11, lsl #2
|
||||
add x7, x7, x11, lsl #2
|
||||
add x8, x8, x11, lsl #2
|
||||
add x4, x4, x14, lsl #1
|
||||
add x9, x9, x14, lsl #1
|
||||
add x10, x10, x14, lsl #1
|
||||
b 1b
|
||||
0:
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_finish_filter2_neon(coef *tmp,
|
||||
// const pixel *src, const ptrdiff_t stride,
|
||||
// const int32_t *a, const int16_t *b,
|
||||
// const int w, const int h);
|
||||
function sgr_finish_filter2_neon, export=1
|
||||
add x7, x3, #(4*(SUM_STRIDE))
|
||||
sub x3, x3, #(4*(SUM_STRIDE))
|
||||
add x8, x4, #(2*(SUM_STRIDE))
|
||||
sub x4, x4, #(2*(SUM_STRIDE))
|
||||
mov x9, #(2*SUM_STRIDE)
|
||||
mov x10, #FILTER_OUT_STRIDE
|
||||
add x11, x5, #7
|
||||
bic x11, x11, #7 // Aligned width
|
||||
sub x2, x2, x11
|
||||
sub x10, x10, x11
|
||||
sub x9, x9, x11
|
||||
sub x9, x9, #4 // We read 4 extra elements from a
|
||||
sub x12, x9, #4 // We read 8 extra elements from b
|
||||
mov x11, x5
|
||||
movi v4.8h, #5
|
||||
movi v5.4s, #5
|
||||
movi v6.8h, #6
|
||||
movi v7.4s, #6
|
||||
1:
|
||||
ld1 {v0.8h, v1.8h}, [x4], #32
|
||||
ld1 {v2.8h, v3.8h}, [x8], #32
|
||||
ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48
|
||||
ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48
|
||||
|
||||
2:
|
||||
subs x5, x5, #8
|
||||
ext v24.16b, v0.16b, v1.16b, #4 // +1-stride
|
||||
ext v25.16b, v2.16b, v3.16b, #4 // +1+stride
|
||||
ext v22.16b, v0.16b, v1.16b, #2 // -stride
|
||||
ext v23.16b, v2.16b, v3.16b, #2 // +stride
|
||||
add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride
|
||||
add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride
|
||||
add v2.8h, v22.8h, v23.8h // -stride, +stride
|
||||
add v0.8h, v0.8h, v25.8h
|
||||
|
||||
ext v22.16b, v16.16b, v17.16b, #4 // -stride
|
||||
ext v23.16b, v17.16b, v18.16b, #4
|
||||
ext v24.16b, v19.16b, v20.16b, #4 // +stride
|
||||
ext v25.16b, v20.16b, v21.16b, #4
|
||||
ext v26.16b, v16.16b, v17.16b, #8 // +1-stride
|
||||
ext v27.16b, v17.16b, v18.16b, #8
|
||||
ext v28.16b, v19.16b, v20.16b, #8 // +1+stride
|
||||
ext v29.16b, v20.16b, v21.16b, #8
|
||||
mul v0.8h, v0.8h, v4.8h // * 5
|
||||
mla v0.8h, v2.8h, v6.8h // * 6
|
||||
ld1 {v31.8b}, [x1], #8
|
||||
add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride
|
||||
add v17.4s, v17.4s, v27.4s
|
||||
add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride
|
||||
add v20.4s, v20.4s, v29.4s
|
||||
add v16.4s, v16.4s, v19.4s
|
||||
add v17.4s, v17.4s, v20.4s
|
||||
|
||||
add v22.4s, v22.4s, v24.4s // -stride, +stride
|
||||
add v23.4s, v23.4s, v25.4s
|
||||
// This is, surprisingly, faster than other variants where the
|
||||
// mul+mla pairs are further apart, on Cortex A53.
|
||||
mul v16.4s, v16.4s, v5.4s // * 5
|
||||
mla v16.4s, v22.4s, v7.4s // * 6
|
||||
mul v17.4s, v17.4s, v5.4s // * 5
|
||||
mla v17.4s, v23.4s, v7.4s // * 6
|
||||
|
||||
uxtl v31.8h, v31.8b
|
||||
umlal v16.4s, v0.4h, v31.4h // b + a * src
|
||||
umlal2 v17.4s, v0.8h, v31.8h
|
||||
mov v0.16b, v1.16b
|
||||
rshrn v16.4h, v16.4s, #9
|
||||
rshrn2 v16.8h, v17.4s, #9
|
||||
mov v2.16b, v3.16b
|
||||
st1 {v16.8h}, [x0], #16
|
||||
|
||||
b.le 3f
|
||||
mov v16.16b, v18.16b
|
||||
mov v19.16b, v21.16b
|
||||
ld1 {v1.8h}, [x4], #16
|
||||
ld1 {v3.8h}, [x8], #16
|
||||
ld1 {v17.4s, v18.4s}, [x3], #32
|
||||
ld1 {v20.4s, v21.4s}, [x7], #32
|
||||
b 2b
|
||||
|
||||
3:
|
||||
subs x6, x6, #1
|
||||
b.le 0f
|
||||
mov x5, x11
|
||||
add x0, x0, x10, lsl #1
|
||||
add x1, x1, x2
|
||||
add x3, x3, x9, lsl #2
|
||||
add x7, x7, x9, lsl #2
|
||||
add x4, x4, x12, lsl #1
|
||||
add x8, x8, x12, lsl #1
|
||||
mov x13, x3
|
||||
mov x14, x4
|
||||
|
||||
ld1 {v0.8h, v1.8h}, [x4], #32
|
||||
ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48
|
||||
|
||||
4:
|
||||
subs x5, x5, #8
|
||||
ext v23.16b, v0.16b, v1.16b, #4 // +1
|
||||
ext v22.16b, v0.16b, v1.16b, #2 // 0
|
||||
add v0.8h, v0.8h, v23.8h // -1, +1
|
||||
|
||||
ext v24.16b, v16.16b, v17.16b, #4 // 0
|
||||
ext v25.16b, v17.16b, v18.16b, #4
|
||||
ext v26.16b, v16.16b, v17.16b, #8 // +1
|
||||
ext v27.16b, v17.16b, v18.16b, #8
|
||||
mul v2.8h, v22.8h, v6.8h // * 6
|
||||
mla v2.8h, v0.8h, v4.8h // * 5 -> a
|
||||
ld1 {v31.8b}, [x1], #8
|
||||
add v16.4s, v16.4s, v26.4s // -1, +1
|
||||
add v17.4s, v17.4s, v27.4s
|
||||
uxtl v31.8h, v31.8b
|
||||
// This is, surprisingly, faster than other variants where the
|
||||
// mul+mla pairs are further apart, on Cortex A53.
|
||||
mul v24.4s, v24.4s, v7.4s // * 6
|
||||
mla v24.4s, v16.4s, v5.4s // * 5 -> b
|
||||
mul v25.4s, v25.4s, v7.4s // * 6
|
||||
mla v25.4s, v17.4s, v5.4s // * 5 -> b
|
||||
|
||||
umlal v24.4s, v2.4h, v31.4h // b + a * src
|
||||
umlal2 v25.4s, v2.8h, v31.8h
|
||||
mov v0.16b, v1.16b
|
||||
rshrn v24.4h, v24.4s, #8
|
||||
rshrn2 v24.8h, v25.4s, #8
|
||||
mov v16.16b, v18.16b
|
||||
st1 {v24.8h}, [x0], #16
|
||||
|
||||
b.le 5f
|
||||
ld1 {v1.8h}, [x4], #16
|
||||
ld1 {v17.4s, v18.4s}, [x3], #32
|
||||
b 4b
|
||||
|
||||
5:
|
||||
subs x6, x6, #1
|
||||
b.le 0f
|
||||
mov x5, x11
|
||||
add x0, x0, x10, lsl #1
|
||||
add x1, x1, x2
|
||||
mov x3, x13 // Rewind x3/x4 to where they started
|
||||
mov x4, x14
|
||||
b 1b
|
||||
0:
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_weighted1_neon(pixel *dst, const ptrdiff_t dst_stride,
|
||||
// const pixel *src, const ptrdiff_t src_stride,
|
||||
// const coef *t1, const int w, const int h,
|
||||
// const int wt);
|
||||
function sgr_weighted1_neon, export=1
|
||||
dup v31.8h, w7
|
||||
cmp x6, #2
|
||||
add x9, x0, x1
|
||||
add x10, x2, x3
|
||||
add x11, x4, #2*FILTER_OUT_STRIDE
|
||||
mov x7, #(4*FILTER_OUT_STRIDE)
|
||||
lsl x1, x1, #1
|
||||
lsl x3, x3, #1
|
||||
add x8, x5, #7
|
||||
bic x8, x8, #7 // Aligned width
|
||||
sub x1, x1, x8
|
||||
sub x3, x3, x8
|
||||
sub x7, x7, x8, lsl #1
|
||||
mov x8, x5
|
||||
b.lt 2f
|
||||
1:
|
||||
ld1 {v0.8b}, [x2], #8
|
||||
ld1 {v4.8b}, [x10], #8
|
||||
ld1 {v1.8h}, [x4], #16
|
||||
ld1 {v5.8h}, [x11], #16
|
||||
subs x5, x5, #8
|
||||
ushll v0.8h, v0.8b, #4 // u
|
||||
ushll v4.8h, v4.8b, #4 // u
|
||||
sub v1.8h, v1.8h, v0.8h // t1 - u
|
||||
sub v5.8h, v5.8h, v4.8h // t1 - u
|
||||
ushll v2.4s, v0.4h, #7 // u << 7
|
||||
ushll2 v3.4s, v0.8h, #7 // u << 7
|
||||
ushll v6.4s, v4.4h, #7 // u << 7
|
||||
ushll2 v7.4s, v4.8h, #7 // u << 7
|
||||
smlal v2.4s, v1.4h, v31.4h // v
|
||||
smlal2 v3.4s, v1.8h, v31.8h // v
|
||||
smlal v6.4s, v5.4h, v31.4h // v
|
||||
smlal2 v7.4s, v5.8h, v31.8h // v
|
||||
rshrn v2.4h, v2.4s, #11
|
||||
rshrn2 v2.8h, v3.4s, #11
|
||||
rshrn v6.4h, v6.4s, #11
|
||||
rshrn2 v6.8h, v7.4s, #11
|
||||
sqxtun v2.8b, v2.8h
|
||||
sqxtun v6.8b, v6.8h
|
||||
st1 {v2.8b}, [x0], #8
|
||||
st1 {v6.8b}, [x9], #8
|
||||
b.gt 1b
|
||||
|
||||
sub x6, x6, #2
|
||||
cmp x6, #1
|
||||
b.lt 0f
|
||||
mov x5, x8
|
||||
add x0, x0, x1
|
||||
add x9, x9, x1
|
||||
add x2, x2, x3
|
||||
add x10, x10, x3
|
||||
add x4, x4, x7
|
||||
add x11, x11, x7
|
||||
b.eq 2f
|
||||
b 1b
|
||||
|
||||
2:
|
||||
ld1 {v0.8b}, [x2], #8
|
||||
ld1 {v1.8h}, [x4], #16
|
||||
subs x5, x5, #8
|
||||
ushll v0.8h, v0.8b, #4 // u
|
||||
sub v1.8h, v1.8h, v0.8h // t1 - u
|
||||
ushll v2.4s, v0.4h, #7 // u << 7
|
||||
ushll2 v3.4s, v0.8h, #7 // u << 7
|
||||
smlal v2.4s, v1.4h, v31.4h // v
|
||||
smlal2 v3.4s, v1.8h, v31.8h // v
|
||||
rshrn v2.4h, v2.4s, #11
|
||||
rshrn2 v2.8h, v3.4s, #11
|
||||
sqxtun v2.8b, v2.8h
|
||||
st1 {v2.8b}, [x0], #8
|
||||
b.gt 2b
|
||||
0:
|
||||
ret
|
||||
endfunc
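// For reference, a scalar sketch of the per-pixel blend in sgr_weighted1,
// derived from its inline comments (an assumption: iclip_u8 is the usual
// clip to [0, 255], matching the sqxtun saturation):
//   u      = src[x] << 4;
//   v      = (u << 7) + wt * (t1[x] - u);
//   dst[x] = iclip_u8((v + (1 << 10)) >> 11);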
|
||||
|
||||
// void dav1d_sgr_weighted2_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *src, const ptrdiff_t src_stride,
// const coef *t1, const coef *t2,
// const int w, const int h,
// const int16_t wt[2]);
function sgr_weighted2_neon, export=1
ldr x8, [sp]
cmp x7, #2
add x10, x0, x1
add x11, x2, x3
add x12, x4, #2*FILTER_OUT_STRIDE
add x13, x5, #2*FILTER_OUT_STRIDE
ld2r {v30.8h, v31.8h}, [x8] // wt[0], wt[1]
mov x8, #4*FILTER_OUT_STRIDE
lsl x1, x1, #1
lsl x3, x3, #1
add x9, x6, #7
bic x9, x9, #7 // Aligned width
sub x1, x1, x9
sub x3, x3, x9
sub x8, x8, x9, lsl #1
mov x9, x6
b.lt 2f
1:
ld1 {v0.8b}, [x2], #8
ld1 {v16.8b}, [x11], #8
ld1 {v1.8h}, [x4], #16
ld1 {v17.8h}, [x12], #16
ld1 {v2.8h}, [x5], #16
ld1 {v18.8h}, [x13], #16
subs x6, x6, #8
ushll v0.8h, v0.8b, #4 // u
ushll v16.8h, v16.8b, #4 // u
sub v1.8h, v1.8h, v0.8h // t1 - u
sub v2.8h, v2.8h, v0.8h // t2 - u
sub v17.8h, v17.8h, v16.8h // t1 - u
sub v18.8h, v18.8h, v16.8h // t2 - u
ushll v3.4s, v0.4h, #7 // u << 7
ushll2 v4.4s, v0.8h, #7 // u << 7
ushll v19.4s, v16.4h, #7 // u << 7
ushll2 v20.4s, v16.8h, #7 // u << 7
smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
smlal v19.4s, v17.4h, v30.4h // wt[0] * (t1 - u)
smlal v19.4s, v18.4h, v31.4h // wt[1] * (t2 - u)
smlal2 v20.4s, v17.8h, v30.8h // wt[0] * (t1 - u)
smlal2 v20.4s, v18.8h, v31.8h // wt[1] * (t2 - u)
rshrn v3.4h, v3.4s, #11
rshrn2 v3.8h, v4.4s, #11
rshrn v19.4h, v19.4s, #11
rshrn2 v19.8h, v20.4s, #11
sqxtun v3.8b, v3.8h
sqxtun v19.8b, v19.8h
st1 {v3.8b}, [x0], #8
st1 {v19.8b}, [x10], #8
b.gt 1b

subs x7, x7, #2
cmp x7, #1
b.lt 0f
mov x6, x9
add x0, x0, x1
add x10, x10, x1
add x2, x2, x3
add x11, x11, x3
add x4, x4, x8
add x12, x12, x8
add x5, x5, x8
add x13, x13, x8
b.eq 2f
b 1b

2:
ld1 {v0.8b}, [x2], #8
ld1 {v1.8h}, [x4], #16
ld1 {v2.8h}, [x5], #16
subs x6, x6, #8
ushll v0.8h, v0.8b, #4 // u
sub v1.8h, v1.8h, v0.8h // t1 - u
sub v2.8h, v2.8h, v0.8h // t2 - u
ushll v3.4s, v0.4h, #7 // u << 7
ushll2 v4.4s, v0.8h, #7 // u << 7
smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
rshrn v3.4h, v3.4s, #11
rshrn2 v3.8h, v4.4s, #11
sqxtun v3.8b, v3.8h
st1 {v3.8b}, [x0], #8
b.gt 1b
0:
ret
endfunc
sgr_funcs 8
@ -0,0 +1,432 @@
/*
|
||||
* Copyright © 2018, VideoLAN and dav1d authors
|
||||
* Copyright © 2018, Martin Storsjo
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "src/arm/asm.S"
|
||||
#include "util.S"
|
||||
|
||||
#define SUM_STRIDE (384+16)
|
||||
|
||||
// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
|
||||
// const int w, const int h,
|
||||
// const enum LrEdgeFlags edges);
|
||||
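// A rough reading of the function below: each output row of sumsq/sum is
// the sum of a sliding three-row window of the input, with the nearest
// valid row replicated whenever LR_HAVE_TOP/LR_HAVE_BOTTOM is absent.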
function sgr_box3_v_neon, export=1
|
||||
add w10, w3, #2 // Number of output rows to move back
|
||||
mov w11, w3 // Number of input rows to move back
|
||||
add w2, w2, #2 // Actual summed width
|
||||
mov x7, #(4*SUM_STRIDE) // sumsq stride
|
||||
mov x8, #(2*SUM_STRIDE) // sum stride
|
||||
sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride
|
||||
sub x1, x1, #(2*SUM_STRIDE) // sum -= stride
|
||||
|
||||
tst w4, #4 // LR_HAVE_TOP
|
||||
b.eq 0f
|
||||
// If have top, read from row -2.
|
||||
sub x5, x0, #(4*SUM_STRIDE)
|
||||
sub x6, x1, #(2*SUM_STRIDE)
|
||||
add w11, w11, #2
|
||||
b 1f
|
||||
0:
|
||||
// !LR_HAVE_TOP
|
||||
// If we don't have top, read from row 0 even if
|
||||
// we start writing to row -1.
|
||||
add x5, x0, #(4*SUM_STRIDE)
|
||||
add x6, x1, #(2*SUM_STRIDE)
|
||||
1:
|
||||
|
||||
tst w4, #8 // LR_HAVE_BOTTOM
|
||||
b.eq 1f
|
||||
// LR_HAVE_BOTTOM
|
||||
add w3, w3, #2 // Sum all h+2 lines with the main loop
|
||||
add w11, w11, #2
|
||||
1:
|
||||
mov w9, w3 // Backup of h for next loops
|
||||
|
||||
1:
|
||||
// Start of horizontal loop; start one vertical filter slice.
|
||||
// Start loading rows into v16-v21 and v24-v26 taking top
|
||||
// padding into consideration.
|
||||
tst w4, #4 // LR_HAVE_TOP
|
||||
ld1 {v16.4s, v17.4s}, [x5], x7
|
||||
ld1 {v24.8h}, [x6], x8
|
||||
b.eq 2f
|
||||
// LR_HAVE_TOP
|
||||
ld1 {v18.4s, v19.4s}, [x5], x7
|
||||
ld1 {v25.8h}, [x6], x8
|
||||
ld1 {v20.4s, v21.4s}, [x5], x7
|
||||
ld1 {v26.8h}, [x6], x8
|
||||
b 3f
|
||||
2: // !LR_HAVE_TOP
|
||||
mov v18.16b, v16.16b
|
||||
mov v19.16b, v17.16b
|
||||
mov v25.16b, v24.16b
|
||||
mov v20.16b, v16.16b
|
||||
mov v21.16b, v17.16b
|
||||
mov v26.16b, v24.16b
|
||||
|
||||
3:
|
||||
subs w3, w3, #1
|
||||
.macro add3
|
||||
add v16.4s, v16.4s, v18.4s
|
||||
add v17.4s, v17.4s, v19.4s
|
||||
add v24.8h, v24.8h, v25.8h
|
||||
add v16.4s, v16.4s, v20.4s
|
||||
add v17.4s, v17.4s, v21.4s
|
||||
add v24.8h, v24.8h, v26.8h
|
||||
st1 {v16.4s, v17.4s}, [x0], x7
|
||||
st1 {v24.8h}, [x1], x8
|
||||
.endm
|
||||
add3
|
||||
mov v16.16b, v18.16b
|
||||
mov v17.16b, v19.16b
|
||||
mov v24.16b, v25.16b
|
||||
mov v18.16b, v20.16b
|
||||
mov v19.16b, v21.16b
|
||||
mov v25.16b, v26.16b
|
||||
b.le 4f
|
||||
ld1 {v20.4s, v21.4s}, [x5], x7
|
||||
ld1 {v26.8h}, [x6], x8
|
||||
b 3b
|
||||
|
||||
4:
|
||||
tst w4, #8 // LR_HAVE_BOTTOM
|
||||
b.ne 5f
|
||||
// !LR_HAVE_BOTTOM
|
||||
// Produce two more rows, extending the already loaded rows.
|
||||
add3
|
||||
mov v16.16b, v18.16b
|
||||
mov v17.16b, v19.16b
|
||||
mov v24.16b, v25.16b
|
||||
add3
|
||||
|
||||
5: // End of one vertical slice.
|
||||
subs w2, w2, #8
|
||||
b.le 0f
|
||||
// Move pointers back up to the top and loop horizontally.
|
||||
// Input pointers
|
||||
msub x5, x7, x11, x5
|
||||
msub x6, x8, x11, x6
|
||||
// Output pointers
|
||||
msub x0, x7, x10, x0
|
||||
msub x1, x8, x10, x1
|
||||
add x0, x0, #32
|
||||
add x1, x1, #16
|
||||
add x5, x5, #32
|
||||
add x6, x6, #16
|
||||
mov w3, w9
|
||||
b 1b
|
||||
|
||||
0:
|
||||
ret
|
||||
.purgem add3
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
|
||||
// const int w, const int h,
|
||||
// const enum LrEdgeFlags edges);
|
||||
function sgr_box5_v_neon, export=1
|
||||
add w10, w3, #2 // Number of output rows to move back
|
||||
mov w11, w3 // Number of input rows to move back
|
||||
add w2, w2, #8 // Actual summed width
|
||||
mov x7, #(4*SUM_STRIDE) // sumsq stride
|
||||
mov x8, #(2*SUM_STRIDE) // sum stride
|
||||
sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride
|
||||
sub x1, x1, #(2*SUM_STRIDE) // sum -= stride
|
||||
|
||||
tst w4, #4 // LR_HAVE_TOP
|
||||
b.eq 0f
|
||||
// If have top, read from row -2.
|
||||
sub x5, x0, #(4*SUM_STRIDE)
|
||||
sub x6, x1, #(2*SUM_STRIDE)
|
||||
add w11, w11, #2
|
||||
b 1f
|
||||
0:
|
||||
// !LR_HAVE_TOP
|
||||
// If we don't have top, read from row 0 even if
|
||||
// we start writing to row -1.
|
||||
add x5, x0, #(4*SUM_STRIDE)
|
||||
add x6, x1, #(2*SUM_STRIDE)
|
||||
1:
|
||||
|
||||
tst w4, #8 // LR_HAVE_BOTTOM
|
||||
b.eq 0f
|
||||
// LR_HAVE_BOTTOM
|
||||
add w3, w3, #2 // Handle h+2 lines with the main loop
|
||||
add w11, w11, #2
|
||||
b 1f
|
||||
0:
|
||||
// !LR_HAVE_BOTTOM
|
||||
sub w3, w3, #1 // Handle h-1 lines with the main loop
|
||||
1:
|
||||
mov w9, w3 // Backup of h for next loops
|
||||
|
||||
1:
|
||||
// Start of horizontal loop; start one vertical filter slice.
|
||||
// Start loading rows into v16-v25 and v26-v30 taking top
|
||||
// padding into consideration.
|
||||
tst w4, #4 // LR_HAVE_TOP
|
||||
ld1 {v16.4s, v17.4s}, [x5], x7
|
||||
ld1 {v26.8h}, [x6], x8
|
||||
b.eq 2f
|
||||
// LR_HAVE_TOP
|
||||
ld1 {v20.4s, v21.4s}, [x5], x7
|
||||
ld1 {v28.8h}, [x6], x8
|
||||
mov v18.16b, v16.16b
|
||||
mov v19.16b, v17.16b
|
||||
mov v27.16b, v26.16b
|
||||
ld1 {v22.4s, v23.4s}, [x5], x7
|
||||
ld1 {v29.8h}, [x6], x8
|
||||
b 3f
|
||||
2: // !LR_HAVE_TOP
|
||||
mov v18.16b, v16.16b
|
||||
mov v19.16b, v17.16b
|
||||
mov v27.16b, v26.16b
|
||||
mov v20.16b, v16.16b
|
||||
mov v21.16b, v17.16b
|
||||
mov v28.16b, v26.16b
|
||||
mov v22.16b, v16.16b
|
||||
mov v23.16b, v17.16b
|
||||
mov v29.16b, v26.16b
|
||||
|
||||
3:
|
||||
cbz w3, 4f
|
||||
ld1 {v24.4s, v25.4s}, [x5], x7
|
||||
ld1 {v30.8h}, [x6], x8
|
||||
|
||||
3:
|
||||
// Start of vertical loop
|
||||
subs w3, w3, #2
|
||||
.macro add5
|
||||
add v16.4s, v16.4s, v18.4s
|
||||
add v17.4s, v17.4s, v19.4s
|
||||
add v26.8h, v26.8h, v27.8h
|
||||
add v0.4s, v20.4s, v22.4s
|
||||
add v1.4s, v21.4s, v23.4s
|
||||
add v2.8h, v28.8h, v29.8h
|
||||
add v16.4s, v16.4s, v24.4s
|
||||
add v17.4s, v17.4s, v25.4s
|
||||
add v26.8h, v26.8h, v30.8h
|
||||
add v16.4s, v16.4s, v0.4s
|
||||
add v17.4s, v17.4s, v1.4s
|
||||
add v26.8h, v26.8h, v2.8h
|
||||
st1 {v16.4s, v17.4s}, [x0], x7
|
||||
st1 {v26.8h}, [x1], x8
|
||||
.endm
|
||||
add5
|
||||
.macro shift2
|
||||
mov v16.16b, v20.16b
|
||||
mov v17.16b, v21.16b
|
||||
mov v26.16b, v28.16b
|
||||
mov v18.16b, v22.16b
|
||||
mov v19.16b, v23.16b
|
||||
mov v27.16b, v29.16b
|
||||
mov v20.16b, v24.16b
|
||||
mov v21.16b, v25.16b
|
||||
mov v28.16b, v30.16b
|
||||
.endm
|
||||
shift2
|
||||
add x0, x0, x7
|
||||
add x1, x1, x8
|
||||
b.le 5f
|
||||
ld1 {v22.4s, v23.4s}, [x5], x7
|
||||
ld1 {v29.8h}, [x6], x8
|
||||
ld1 {v24.4s, v25.4s}, [x5], x7
|
||||
ld1 {v30.8h}, [x6], x8
|
||||
b 3b
|
||||
|
||||
4:
|
||||
// h == 1, !LR_HAVE_BOTTOM.
|
||||
// Pad the last row with the only content row, and add.
|
||||
mov v24.16b, v22.16b
|
||||
mov v25.16b, v23.16b
|
||||
mov v30.16b, v29.16b
|
||||
add5
|
||||
shift2
|
||||
add x0, x0, x7
|
||||
add x1, x1, x8
|
||||
add5
|
||||
b 6f
|
||||
|
||||
5:
|
||||
tst w4, #8 // LR_HAVE_BOTTOM
|
||||
b.ne 6f
|
||||
// !LR_HAVE_BOTTOM
|
||||
cbnz w3, 5f
|
||||
// The intended three edge rows left; output the one at h-2 and
|
||||
// the past edge one at h.
|
||||
ld1 {v22.4s, v23.4s}, [x5], x7
|
||||
ld1 {v29.8h}, [x6], x8
|
||||
// Pad the past-edge row from the last content row.
|
||||
mov v24.16b, v22.16b
|
||||
mov v25.16b, v23.16b
|
||||
mov v30.16b, v29.16b
|
||||
add5
|
||||
shift2
|
||||
add x0, x0, x7
|
||||
add x1, x1, x8
|
||||
// The last two rows are already padded properly here.
|
||||
add5
|
||||
b 6f
|
||||
|
||||
5:
|
||||
// w3 == -1, two rows left, output one.
|
||||
// Pad the last two rows from the mid one.
|
||||
mov v22.16b, v20.16b
|
||||
mov v23.16b, v21.16b
|
||||
mov v29.16b, v28.16b
|
||||
mov v24.16b, v20.16b
|
||||
mov v25.16b, v21.16b
|
||||
mov v30.16b, v28.16b
|
||||
add5
|
||||
add x0, x0, x7
|
||||
add x1, x1, x8
|
||||
b 6f
|
||||
|
||||
6: // End of one vertical slice.
|
||||
subs w2, w2, #8
|
||||
b.le 0f
|
||||
// Move pointers back up to the top and loop horizontally.
|
||||
// Input pointers
|
||||
msub x5, x7, x11, x5
|
||||
msub x6, x8, x11, x6
|
||||
// Output pointers
|
||||
msub x0, x7, x10, x0
|
||||
msub x1, x8, x10, x1
|
||||
add x0, x0, #32
|
||||
add x1, x1, #16
|
||||
add x5, x5, #32
|
||||
add x6, x6, #16
|
||||
mov w3, w9
|
||||
b 1b
|
||||
|
||||
0:
|
||||
ret
|
||||
.purgem add5
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
|
||||
// const int w, const int h, const int strength,
|
||||
// const int bitdepth_max);
|
||||
// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
|
||||
// const int w, const int h, const int strength,
|
||||
// const int bitdepth_max);
|
||||
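// Both entry points continue in sgr_calc_ab_neon below, which computes
// roughly the following per coefficient pair (a sketch of the annotated
// steps, ignoring the bitdepth prescaling done via the srshl pair):
//   p = imax(a * n - b * b, 0);
//   z = imin((p * strength) >> 20, 255);   // saturating, rounded
//   x = sgr_x_by_x[z];
//   a_out = (x * b * one_by_x + (1 << 11)) >> 12;
//   b_out = 256 - x;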
function sgr_calc_ab1_neon, export=1
|
||||
clz w9, w5
|
||||
add x3, x3, #2 // h += 2
|
||||
movi v31.4s, #9 // n
|
||||
mov x5, #455
|
||||
mov x8, #SUM_STRIDE
|
||||
b sgr_calc_ab_neon
|
||||
endfunc
|
||||
|
||||
function sgr_calc_ab2_neon, export=1
|
||||
clz w9, w5
|
||||
add x3, x3, #3 // h += 3
|
||||
asr x3, x3, #1 // h /= 2
|
||||
movi v31.4s, #25 // n
|
||||
mov x5, #164
|
||||
mov x8, #(2*SUM_STRIDE)
|
||||
endfunc
|
||||
|
||||
function sgr_calc_ab_neon
|
||||
sub w9, w9, #24 // -bitdepth_min_8
|
||||
movrel x12, X(sgr_x_by_x)
|
||||
ld1 {v16.16b, v17.16b, v18.16b}, [x12]
|
||||
dup v6.8h, w9 // -bitdepth_min_8
|
||||
movi v19.16b, #5
|
||||
movi v20.8b, #55 // idx of last 5
|
||||
movi v21.8b, #72 // idx of last 4
|
||||
movi v22.8b, #101 // idx of last 3
|
||||
movi v23.8b, #169 // idx of last 2
|
||||
movi v24.8b, #254 // idx of last 1
|
||||
saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8
|
||||
add x2, x2, #2 // w += 2
|
||||
add x7, x2, #7
|
||||
bic x7, x7, #7 // aligned w
|
||||
sub x7, x8, x7 // increment between rows
|
||||
movi v29.8h, #1, lsl #8
|
||||
dup v28.4s, w4
|
||||
dup v30.4s, w5 // one_by_x
|
||||
sub x0, x0, #(4*(SUM_STRIDE))
|
||||
sub x1, x1, #(2*(SUM_STRIDE))
|
||||
mov x6, x2 // backup of w
|
||||
sub v16.16b, v16.16b, v19.16b
|
||||
sub v17.16b, v17.16b, v19.16b
|
||||
sub v18.16b, v18.16b, v19.16b
|
||||
1:
|
||||
subs x2, x2, #8
|
||||
ld1 {v0.4s, v1.4s}, [x0] // a
|
||||
ld1 {v2.8h}, [x1] // b
|
||||
srshl v0.4s, v0.4s, v7.4s
|
||||
srshl v1.4s, v1.4s, v7.4s
|
||||
srshl v4.8h, v2.8h, v6.8h
|
||||
mul v0.4s, v0.4s, v31.4s // a * n
|
||||
mul v1.4s, v1.4s, v31.4s // a * n
|
||||
umull v3.4s, v4.4h, v4.4h // b * b
|
||||
umull2 v4.4s, v4.8h, v4.8h // b * b
|
||||
uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0)
|
||||
uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0)
|
||||
mul v0.4s, v0.4s, v28.4s // p * s
|
||||
mul v1.4s, v1.4s, v28.4s // p * s
|
||||
uqshrn v0.4h, v0.4s, #16
|
||||
uqshrn2 v0.8h, v1.4s, #16
|
||||
uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)
|
||||
|
||||
cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
|
||||
cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
|
||||
tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
|
||||
cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
|
||||
cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
|
||||
add v25.8b, v25.8b, v26.8b
|
||||
cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
|
||||
add v27.8b, v27.8b, v4.8b
|
||||
add v5.8b, v5.8b, v19.8b
|
||||
add v25.8b, v25.8b, v27.8b
|
||||
add v1.8b, v1.8b, v5.8b
|
||||
add v1.8b, v1.8b, v25.8b
|
||||
uxtl v1.8h, v1.8b // x
|
||||
|
||||
umull v3.4s, v1.4h, v2.4h // x * BB[i]
|
||||
umull2 v4.4s, v1.8h, v2.8h // x * BB[i]
|
||||
mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
|
||||
mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
|
||||
srshr v3.4s, v3.4s, #12 // AA[i]
|
||||
srshr v4.4s, v4.4s, #12 // AA[i]
|
||||
sub v2.8h, v29.8h, v1.8h // 256 - x
|
||||
|
||||
st1 {v3.4s, v4.4s}, [x0], #32
|
||||
st1 {v2.8h}, [x1], #16
|
||||
b.gt 1b
|
||||
|
||||
subs x3, x3, #1
|
||||
b.le 0f
|
||||
add x0, x0, x7, lsl #2
|
||||
add x1, x1, x7, lsl #1
|
||||
mov x2, x6
|
||||
b 1b
|
||||
0:
|
||||
ret
|
||||
endfunc
|
|
@ -0,0 +1,597 @@
|
|||
/*
|
||||
* Copyright © 2018, VideoLAN and dav1d authors
|
||||
* Copyright © 2018, Martin Storsjo
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "src/arm/asm.S"
|
||||
|
||||
#define FILTER_OUT_STRIDE 384
|
||||
|
||||
.macro sgr_funcs bpc
|
||||
// void dav1d_sgr_finish_filter1_Xbpc_neon(int16_t *tmp,
|
||||
// const pixel *src, const ptrdiff_t stride,
|
||||
// const int32_t *a, const int16_t *b,
|
||||
// const int w, const int h);
|
||||
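// A rough reading of the function below: both the 16-bit and the 32-bit
// plane are filtered with the 3x3 kernel [3 4 3; 4 4 4; 3 4 3], and the
// result is combined as approximately (b + a * src + (1 << 8)) >> 9.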
function sgr_finish_filter1_\bpc\()bpc_neon, export=1
|
||||
sub x7, x3, #(4*SUM_STRIDE)
|
||||
add x8, x3, #(4*SUM_STRIDE)
|
||||
sub x9, x4, #(2*SUM_STRIDE)
|
||||
add x10, x4, #(2*SUM_STRIDE)
|
||||
mov x11, #SUM_STRIDE
|
||||
mov x12, #FILTER_OUT_STRIDE
|
||||
add x13, x5, #7
|
||||
bic x13, x13, #7 // Aligned width
|
||||
.if \bpc == 8
|
||||
sub x2, x2, x13
|
||||
.else
|
||||
sub x2, x2, x13, lsl #1
|
||||
.endif
|
||||
sub x12, x12, x13
|
||||
sub x11, x11, x13
|
||||
sub x11, x11, #4 // We read 4 extra elements from a
|
||||
sub x14, x11, #4 // We read 8 extra elements from b
|
||||
mov x13, x5
|
||||
movi v6.8h, #3
|
||||
movi v7.4s, #3
|
||||
1:
|
||||
ld1 {v0.8h, v1.8h}, [x9], #32
|
||||
ld1 {v2.8h, v3.8h}, [x4], #32
|
||||
ld1 {v4.8h, v5.8h}, [x10], #32
|
||||
ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48
|
||||
ld1 {v19.4s, v20.4s, v21.4s}, [x3], #48
|
||||
ld1 {v22.4s, v23.4s, v24.4s}, [x8], #48
|
||||
|
||||
2:
|
||||
subs x5, x5, #8
|
||||
ext v25.16b, v0.16b, v1.16b, #2 // -stride
|
||||
ext v26.16b, v2.16b, v3.16b, #2 // 0
|
||||
ext v27.16b, v4.16b, v5.16b, #2 // +stride
|
||||
ext v28.16b, v0.16b, v1.16b, #4 // +1-stride
|
||||
ext v29.16b, v2.16b, v3.16b, #4 // +1
|
||||
ext v30.16b, v4.16b, v5.16b, #4 // +1+stride
|
||||
add v2.8h, v2.8h, v25.8h // -1, -stride
|
||||
add v26.8h, v26.8h, v27.8h // 0, +stride
|
||||
add v0.8h, v0.8h, v28.8h // -1-stride, +1-stride
|
||||
add v2.8h, v2.8h, v26.8h
|
||||
add v4.8h, v4.8h, v30.8h // -1+stride, +1+stride
|
||||
add v2.8h, v2.8h, v29.8h // +1
|
||||
add v0.8h, v0.8h, v4.8h
|
||||
|
||||
ext v25.16b, v16.16b, v17.16b, #4 // -stride
|
||||
ext v26.16b, v17.16b, v18.16b, #4
|
||||
shl v2.8h, v2.8h, #2
|
||||
ext v27.16b, v16.16b, v17.16b, #8 // +1-stride
|
||||
ext v28.16b, v17.16b, v18.16b, #8
|
||||
ext v29.16b, v19.16b, v20.16b, #4 // 0
|
||||
ext v30.16b, v20.16b, v21.16b, #4
|
||||
mla v2.8h, v0.8h, v6.8h // * 3 -> a
|
||||
add v25.4s, v25.4s, v19.4s // -stride, -1
|
||||
add v26.4s, v26.4s, v20.4s
|
||||
add v16.4s, v16.4s, v27.4s // -1-stride, +1-stride
|
||||
add v17.4s, v17.4s, v28.4s
|
||||
ext v27.16b, v19.16b, v20.16b, #8 // +1
|
||||
ext v28.16b, v20.16b, v21.16b, #8
|
||||
add v16.4s, v16.4s, v22.4s // -1+stride
|
||||
add v17.4s, v17.4s, v23.4s
|
||||
add v29.4s, v29.4s, v27.4s // 0, +1
|
||||
add v30.4s, v30.4s, v28.4s
|
||||
add v25.4s, v25.4s, v29.4s
|
||||
add v26.4s, v26.4s, v30.4s
|
||||
ext v27.16b, v22.16b, v23.16b, #4 // +stride
|
||||
ext v28.16b, v23.16b, v24.16b, #4
|
||||
ext v29.16b, v22.16b, v23.16b, #8 // +1+stride
|
||||
ext v30.16b, v23.16b, v24.16b, #8
|
||||
.if \bpc == 8
|
||||
ld1 {v19.8b}, [x1], #8 // src
|
||||
.else
|
||||
ld1 {v19.8h}, [x1], #16 // src
|
||||
.endif
|
||||
add v25.4s, v25.4s, v27.4s // +stride
|
||||
add v26.4s, v26.4s, v28.4s
|
||||
add v16.4s, v16.4s, v29.4s // +1+stride
|
||||
add v17.4s, v17.4s, v30.4s
|
||||
shl v25.4s, v25.4s, #2
|
||||
shl v26.4s, v26.4s, #2
|
||||
mla v25.4s, v16.4s, v7.4s // * 3 -> b
|
||||
mla v26.4s, v17.4s, v7.4s
|
||||
.if \bpc == 8
|
||||
uxtl v19.8h, v19.8b // src
|
||||
.endif
|
||||
mov v0.16b, v1.16b
|
||||
umlal v25.4s, v2.4h, v19.4h // b + a * src
|
||||
umlal2 v26.4s, v2.8h, v19.8h
|
||||
mov v2.16b, v3.16b
|
||||
rshrn v25.4h, v25.4s, #9
|
||||
rshrn2 v25.8h, v26.4s, #9
|
||||
mov v4.16b, v5.16b
|
||||
st1 {v25.8h}, [x0], #16
|
||||
|
||||
b.le 3f
|
||||
mov v16.16b, v18.16b
|
||||
mov v19.16b, v21.16b
|
||||
mov v22.16b, v24.16b
|
||||
ld1 {v1.8h}, [x9], #16
|
||||
ld1 {v3.8h}, [x4], #16
|
||||
ld1 {v5.8h}, [x10], #16
|
||||
ld1 {v17.4s, v18.4s}, [x7], #32
|
||||
ld1 {v20.4s, v21.4s}, [x3], #32
|
||||
ld1 {v23.4s, v24.4s}, [x8], #32
|
||||
b 2b
|
||||
|
||||
3:
|
||||
subs x6, x6, #1
|
||||
b.le 0f
|
||||
mov x5, x13
|
||||
add x0, x0, x12, lsl #1
|
||||
add x1, x1, x2
|
||||
add x3, x3, x11, lsl #2
|
||||
add x7, x7, x11, lsl #2
|
||||
add x8, x8, x11, lsl #2
|
||||
add x4, x4, x14, lsl #1
|
||||
add x9, x9, x14, lsl #1
|
||||
add x10, x10, x14, lsl #1
|
||||
b 1b
|
||||
0:
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_finish_filter2_Xbpc_neon(int16_t *tmp,
|
||||
// const pixel *src, const ptrdiff_t stride,
|
||||
// const int32_t *a, const int16_t *b,
|
||||
// const int w, const int h);
|
||||
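// A rough reading of the function below: on full rows the corner taps are
// weighted by 5 and the vertical neighbours by 6, while the intermediate
// rows handled in the label-4 loop use the horizontal [5 6 5] pattern;
// the combine is b + a * src, rounding-shifted right by 9 and 8 bits
// respectively.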
function sgr_finish_filter2_\bpc\()bpc_neon, export=1
|
||||
add x7, x3, #(4*(SUM_STRIDE))
|
||||
sub x3, x3, #(4*(SUM_STRIDE))
|
||||
add x8, x4, #(2*(SUM_STRIDE))
|
||||
sub x4, x4, #(2*(SUM_STRIDE))
|
||||
mov x9, #(2*SUM_STRIDE)
|
||||
mov x10, #FILTER_OUT_STRIDE
|
||||
add x11, x5, #7
|
||||
bic x11, x11, #7 // Aligned width
|
||||
.if \bpc == 8
|
||||
sub x2, x2, x11
|
||||
.else
|
||||
sub x2, x2, x11, lsl #1
|
||||
.endif
|
||||
sub x10, x10, x11
|
||||
sub x9, x9, x11
|
||||
sub x9, x9, #4 // We read 4 extra elements from a
|
||||
sub x12, x9, #4 // We read 8 extra elements from b
|
||||
mov x11, x5
|
||||
movi v4.8h, #5
|
||||
movi v5.4s, #5
|
||||
movi v6.8h, #6
|
||||
movi v7.4s, #6
|
||||
1:
|
||||
ld1 {v0.8h, v1.8h}, [x4], #32
|
||||
ld1 {v2.8h, v3.8h}, [x8], #32
|
||||
ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48
|
||||
ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48
|
||||
|
||||
2:
|
||||
subs x5, x5, #8
|
||||
ext v24.16b, v0.16b, v1.16b, #4 // +1-stride
|
||||
ext v25.16b, v2.16b, v3.16b, #4 // +1+stride
|
||||
ext v22.16b, v0.16b, v1.16b, #2 // -stride
|
||||
ext v23.16b, v2.16b, v3.16b, #2 // +stride
|
||||
add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride
|
||||
add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride
|
||||
add v2.8h, v22.8h, v23.8h // -stride, +stride
|
||||
add v0.8h, v0.8h, v25.8h
|
||||
|
||||
ext v22.16b, v16.16b, v17.16b, #4 // -stride
|
||||
ext v23.16b, v17.16b, v18.16b, #4
|
||||
ext v24.16b, v19.16b, v20.16b, #4 // +stride
|
||||
ext v25.16b, v20.16b, v21.16b, #4
|
||||
ext v26.16b, v16.16b, v17.16b, #8 // +1-stride
|
||||
ext v27.16b, v17.16b, v18.16b, #8
|
||||
ext v28.16b, v19.16b, v20.16b, #8 // +1+stride
|
||||
ext v29.16b, v20.16b, v21.16b, #8
|
||||
mul v0.8h, v0.8h, v4.8h // * 5
|
||||
mla v0.8h, v2.8h, v6.8h // * 6
|
||||
.if \bpc == 8
|
||||
ld1 {v31.8b}, [x1], #8
|
||||
.else
|
||||
ld1 {v31.8h}, [x1], #16
|
||||
.endif
|
||||
add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride
|
||||
add v17.4s, v17.4s, v27.4s
|
||||
add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride
|
||||
add v20.4s, v20.4s, v29.4s
|
||||
add v16.4s, v16.4s, v19.4s
|
||||
add v17.4s, v17.4s, v20.4s
|
||||
|
||||
add v22.4s, v22.4s, v24.4s // -stride, +stride
|
||||
add v23.4s, v23.4s, v25.4s
|
||||
// This is, surprisingly, faster than other variants where the
|
||||
// mul+mla pairs are further apart, on Cortex A53.
|
||||
mul v16.4s, v16.4s, v5.4s // * 5
|
||||
mla v16.4s, v22.4s, v7.4s // * 6
|
||||
mul v17.4s, v17.4s, v5.4s // * 5
|
||||
mla v17.4s, v23.4s, v7.4s // * 6
|
||||
|
||||
.if \bpc == 8
|
||||
uxtl v31.8h, v31.8b
|
||||
.endif
|
||||
umlal v16.4s, v0.4h, v31.4h // b + a * src
|
||||
umlal2 v17.4s, v0.8h, v31.8h
|
||||
mov v0.16b, v1.16b
|
||||
rshrn v16.4h, v16.4s, #9
|
||||
rshrn2 v16.8h, v17.4s, #9
|
||||
mov v2.16b, v3.16b
|
||||
st1 {v16.8h}, [x0], #16
|
||||
|
||||
b.le 3f
|
||||
mov v16.16b, v18.16b
|
||||
mov v19.16b, v21.16b
|
||||
ld1 {v1.8h}, [x4], #16
|
||||
ld1 {v3.8h}, [x8], #16
|
||||
ld1 {v17.4s, v18.4s}, [x3], #32
|
||||
ld1 {v20.4s, v21.4s}, [x7], #32
|
||||
b 2b
|
||||
|
||||
3:
|
||||
subs x6, x6, #1
|
||||
b.le 0f
|
||||
mov x5, x11
|
||||
add x0, x0, x10, lsl #1
|
||||
add x1, x1, x2
|
||||
add x3, x3, x9, lsl #2
|
||||
add x7, x7, x9, lsl #2
|
||||
add x4, x4, x12, lsl #1
|
||||
add x8, x8, x12, lsl #1
|
||||
mov x13, x3
|
||||
mov x14, x4
|
||||
|
||||
ld1 {v0.8h, v1.8h}, [x4], #32
|
||||
ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48
|
||||
|
||||
4:
|
||||
subs x5, x5, #8
|
||||
ext v23.16b, v0.16b, v1.16b, #4 // +1
|
||||
ext v22.16b, v0.16b, v1.16b, #2 // 0
|
||||
add v0.8h, v0.8h, v23.8h // -1, +1
|
||||
|
||||
ext v24.16b, v16.16b, v17.16b, #4 // 0
|
||||
ext v25.16b, v17.16b, v18.16b, #4
|
||||
ext v26.16b, v16.16b, v17.16b, #8 // +1
|
||||
ext v27.16b, v17.16b, v18.16b, #8
|
||||
mul v2.8h, v22.8h, v6.8h // * 6
|
||||
mla v2.8h, v0.8h, v4.8h // * 5 -> a
|
||||
.if \bpc == 8
|
||||
ld1 {v31.8b}, [x1], #8
|
||||
.else
|
||||
ld1 {v31.8h}, [x1], #16
|
||||
.endif
|
||||
add v16.4s, v16.4s, v26.4s // -1, +1
|
||||
add v17.4s, v17.4s, v27.4s
|
||||
.if \bpc == 8
|
||||
uxtl v31.8h, v31.8b
|
||||
.endif
|
||||
// This is, surprisingly, faster than other variants where the
|
||||
// mul+mla pairs are further apart, on Cortex A53.
|
||||
mul v24.4s, v24.4s, v7.4s // * 6
|
||||
mla v24.4s, v16.4s, v5.4s // * 5 -> b
|
||||
mul v25.4s, v25.4s, v7.4s // * 6
|
||||
mla v25.4s, v17.4s, v5.4s // * 5 -> b
|
||||
|
||||
umlal v24.4s, v2.4h, v31.4h // b + a * src
|
||||
umlal2 v25.4s, v2.8h, v31.8h
|
||||
mov v0.16b, v1.16b
|
||||
rshrn v24.4h, v24.4s, #8
|
||||
rshrn2 v24.8h, v25.4s, #8
|
||||
mov v16.16b, v18.16b
|
||||
st1 {v24.8h}, [x0], #16
|
||||
|
||||
b.le 5f
|
||||
ld1 {v1.8h}, [x4], #16
|
||||
ld1 {v17.4s, v18.4s}, [x3], #32
|
||||
b 4b
|
||||
|
||||
5:
|
||||
subs x6, x6, #1
|
||||
b.le 0f
|
||||
mov x5, x11
|
||||
add x0, x0, x10, lsl #1
|
||||
add x1, x1, x2
|
||||
mov x3, x13 // Rewind x3/x4 to where they started
|
||||
mov x4, x14
|
||||
b 1b
|
||||
0:
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_weighted1_Xbpc_neon(pixel *dst, const ptrdiff_t dst_stride,
|
||||
// const pixel *src, const ptrdiff_t src_stride,
|
||||
// const int16_t *t1, const int w, const int h,
|
||||
// const int wt, const int bitdepth_max);
|
||||
function sgr_weighted1_\bpc\()bpc_neon, export=1
|
||||
.if \bpc == 16
|
||||
ldr w8, [sp]
|
||||
.endif
|
||||
dup v31.8h, w7
|
||||
cmp x6, #2
|
||||
.if \bpc == 16
|
||||
dup v30.8h, w8
|
||||
.endif
|
||||
add x9, x0, x1
|
||||
add x10, x2, x3
|
||||
add x11, x4, #2*FILTER_OUT_STRIDE
|
||||
mov x7, #(4*FILTER_OUT_STRIDE)
|
||||
lsl x1, x1, #1
|
||||
lsl x3, x3, #1
|
||||
add x8, x5, #7
|
||||
bic x8, x8, #7 // Aligned width
|
||||
.if \bpc == 8
|
||||
sub x1, x1, x8
|
||||
sub x3, x3, x8
|
||||
.else
|
||||
sub x1, x1, x8, lsl #1
|
||||
sub x3, x3, x8, lsl #1
|
||||
.endif
|
||||
sub x7, x7, x8, lsl #1
|
||||
mov x8, x5
|
||||
b.lt 2f
|
||||
1:
|
||||
.if \bpc == 8
|
||||
ld1 {v0.8b}, [x2], #8
|
||||
ld1 {v4.8b}, [x10], #8
|
||||
.else
|
||||
ld1 {v0.8h}, [x2], #16
|
||||
ld1 {v4.8h}, [x10], #16
|
||||
.endif
|
||||
ld1 {v1.8h}, [x4], #16
|
||||
ld1 {v5.8h}, [x11], #16
|
||||
subs x5, x5, #8
|
||||
.if \bpc == 8
|
||||
ushll v0.8h, v0.8b, #4 // u
|
||||
ushll v4.8h, v4.8b, #4 // u
|
||||
.else
|
||||
shl v0.8h, v0.8h, #4 // u
|
||||
shl v4.8h, v4.8h, #4 // u
|
||||
.endif
|
||||
sub v1.8h, v1.8h, v0.8h // t1 - u
|
||||
sub v5.8h, v5.8h, v4.8h // t1 - u
|
||||
ushll v2.4s, v0.4h, #7 // u << 7
|
||||
ushll2 v3.4s, v0.8h, #7 // u << 7
|
||||
ushll v6.4s, v4.4h, #7 // u << 7
|
||||
ushll2 v7.4s, v4.8h, #7 // u << 7
|
||||
smlal v2.4s, v1.4h, v31.4h // v
|
||||
smlal2 v3.4s, v1.8h, v31.8h // v
|
||||
smlal v6.4s, v5.4h, v31.4h // v
|
||||
smlal2 v7.4s, v5.8h, v31.8h // v
|
||||
.if \bpc == 8
|
||||
rshrn v2.4h, v2.4s, #11
|
||||
rshrn2 v2.8h, v3.4s, #11
|
||||
rshrn v6.4h, v6.4s, #11
|
||||
rshrn2 v6.8h, v7.4s, #11
|
||||
sqxtun v2.8b, v2.8h
|
||||
sqxtun v6.8b, v6.8h
|
||||
st1 {v2.8b}, [x0], #8
|
||||
st1 {v6.8b}, [x9], #8
|
||||
.else
|
||||
sqrshrun v2.4h, v2.4s, #11
|
||||
sqrshrun2 v2.8h, v3.4s, #11
|
||||
sqrshrun v6.4h, v6.4s, #11
|
||||
sqrshrun2 v6.8h, v7.4s, #11
|
||||
umin v2.8h, v2.8h, v30.8h
|
||||
umin v6.8h, v6.8h, v30.8h
|
||||
st1 {v2.8h}, [x0], #16
|
||||
st1 {v6.8h}, [x9], #16
|
||||
.endif
|
||||
b.gt 1b
|
||||
|
||||
sub x6, x6, #2
|
||||
cmp x6, #1
|
||||
b.lt 0f
|
||||
mov x5, x8
|
||||
add x0, x0, x1
|
||||
add x9, x9, x1
|
||||
add x2, x2, x3
|
||||
add x10, x10, x3
|
||||
add x4, x4, x7
|
||||
add x11, x11, x7
|
||||
b.eq 2f
|
||||
b 1b
|
||||
|
||||
2:
|
||||
.if \bpc == 8
|
||||
ld1 {v0.8b}, [x2], #8
|
||||
.else
|
||||
ld1 {v0.8h}, [x2], #16
|
||||
.endif
|
||||
ld1 {v1.8h}, [x4], #16
|
||||
subs x5, x5, #8
|
||||
.if \bpc == 8
|
||||
ushll v0.8h, v0.8b, #4 // u
|
||||
.else
|
||||
shl v0.8h, v0.8h, #4 // u
|
||||
.endif
|
||||
sub v1.8h, v1.8h, v0.8h // t1 - u
|
||||
ushll v2.4s, v0.4h, #7 // u << 7
|
||||
ushll2 v3.4s, v0.8h, #7 // u << 7
|
||||
smlal v2.4s, v1.4h, v31.4h // v
|
||||
smlal2 v3.4s, v1.8h, v31.8h // v
|
||||
.if \bpc == 8
|
||||
rshrn v2.4h, v2.4s, #11
|
||||
rshrn2 v2.8h, v3.4s, #11
|
||||
sqxtun v2.8b, v2.8h
|
||||
st1 {v2.8b}, [x0], #8
|
||||
.else
|
||||
sqrshrun v2.4h, v2.4s, #11
|
||||
sqrshrun2 v2.8h, v3.4s, #11
|
||||
umin v2.8h, v2.8h, v30.8h
|
||||
st1 {v2.8h}, [x0], #16
|
||||
.endif
|
||||
b.gt 2b
|
||||
0:
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
|
||||
// const pixel *src, const ptrdiff_t src_stride,
|
||||
// const int16_t *t1, const int16_t *t2,
|
||||
// const int w, const int h,
|
||||
// const int16_t wt[2]);
|
||||
function sgr_weighted2_\bpc\()bpc_neon, export=1
|
||||
.if \bpc == 8
|
||||
ldr x8, [sp]
|
||||
.else
|
||||
ldp x8, x9, [sp]
|
||||
.endif
|
||||
cmp x7, #2
|
||||
add x10, x0, x1
|
||||
add x11, x2, x3
|
||||
add x12, x4, #2*FILTER_OUT_STRIDE
|
||||
add x13, x5, #2*FILTER_OUT_STRIDE
|
||||
ld2r {v30.8h, v31.8h}, [x8] // wt[0], wt[1]
|
||||
.if \bpc == 16
|
||||
dup v29.8h, w9
|
||||
.endif
|
||||
mov x8, #4*FILTER_OUT_STRIDE
|
||||
lsl x1, x1, #1
|
||||
lsl x3, x3, #1
|
||||
add x9, x6, #7
|
||||
bic x9, x9, #7 // Aligned width
|
||||
.if \bpc == 8
|
||||
sub x1, x1, x9
|
||||
sub x3, x3, x9
|
||||
.else
|
||||
sub x1, x1, x9, lsl #1
|
||||
sub x3, x3, x9, lsl #1
|
||||
.endif
|
||||
sub x8, x8, x9, lsl #1
|
||||
mov x9, x6
|
||||
b.lt 2f
|
||||
1:
|
||||
.if \bpc == 8
|
||||
ld1 {v0.8b}, [x2], #8
|
||||
ld1 {v16.8b}, [x11], #8
|
||||
.else
|
||||
ld1 {v0.8h}, [x2], #16
|
||||
ld1 {v16.8h}, [x11], #16
|
||||
.endif
|
||||
ld1 {v1.8h}, [x4], #16
|
||||
ld1 {v17.8h}, [x12], #16
|
||||
ld1 {v2.8h}, [x5], #16
|
||||
ld1 {v18.8h}, [x13], #16
|
||||
subs x6, x6, #8
|
||||
.if \bpc == 8
|
||||
ushll v0.8h, v0.8b, #4 // u
|
||||
ushll v16.8h, v16.8b, #4 // u
|
||||
.else
|
||||
shl v0.8h, v0.8h, #4 // u
|
||||
shl v16.8h, v16.8h, #4 // u
|
||||
.endif
|
||||
sub v1.8h, v1.8h, v0.8h // t1 - u
|
||||
sub v2.8h, v2.8h, v0.8h // t2 - u
|
||||
sub v17.8h, v17.8h, v16.8h // t1 - u
|
||||
sub v18.8h, v18.8h, v16.8h // t2 - u
|
||||
ushll v3.4s, v0.4h, #7 // u << 7
|
||||
ushll2 v4.4s, v0.8h, #7 // u << 7
|
||||
ushll v19.4s, v16.4h, #7 // u << 7
|
||||
ushll2 v20.4s, v16.8h, #7 // u << 7
|
||||
smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
|
||||
smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
|
||||
smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
|
||||
smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
|
||||
smlal v19.4s, v17.4h, v30.4h // wt[0] * (t1 - u)
|
||||
smlal v19.4s, v18.4h, v31.4h // wt[1] * (t2 - u)
|
||||
smlal2 v20.4s, v17.8h, v30.8h // wt[0] * (t1 - u)
|
||||
smlal2 v20.4s, v18.8h, v31.8h // wt[1] * (t2 - u)
|
||||
.if \bpc == 8
|
||||
rshrn v3.4h, v3.4s, #11
|
||||
rshrn2 v3.8h, v4.4s, #11
|
||||
rshrn v19.4h, v19.4s, #11
|
||||
rshrn2 v19.8h, v20.4s, #11
|
||||
sqxtun v3.8b, v3.8h
|
||||
sqxtun v19.8b, v19.8h
|
||||
st1 {v3.8b}, [x0], #8
|
||||
st1 {v19.8b}, [x10], #8
|
||||
.else
|
||||
sqrshrun v3.4h, v3.4s, #11
|
||||
sqrshrun2 v3.8h, v4.4s, #11
|
||||
sqrshrun v19.4h, v19.4s, #11
|
||||
sqrshrun2 v19.8h, v20.4s, #11
|
||||
umin v3.8h, v3.8h, v29.8h
|
||||
umin v19.8h, v19.8h, v29.8h
|
||||
st1 {v3.8h}, [x0], #16
|
||||
st1 {v19.8h}, [x10], #16
|
||||
.endif
|
||||
b.gt 1b
|
||||
|
||||
subs x7, x7, #2
|
||||
cmp x7, #1
|
||||
b.lt 0f
|
||||
mov x6, x9
|
||||
add x0, x0, x1
|
||||
add x10, x10, x1
|
||||
add x2, x2, x3
|
||||
add x11, x11, x3
|
||||
add x4, x4, x8
|
||||
add x12, x12, x8
|
||||
add x5, x5, x8
|
||||
add x13, x13, x8
|
||||
b.eq 2f
|
||||
b 1b
|
||||
|
||||
2:
|
||||
.if \bpc == 8
|
||||
ld1 {v0.8b}, [x2], #8
|
||||
.else
|
||||
ld1 {v0.8h}, [x2], #16
|
||||
.endif
|
||||
ld1 {v1.8h}, [x4], #16
|
||||
ld1 {v2.8h}, [x5], #16
|
||||
subs x6, x6, #8
|
||||
.if \bpc == 8
|
||||
ushll v0.8h, v0.8b, #4 // u
|
||||
.else
|
||||
shl v0.8h, v0.8h, #4 // u
|
||||
.endif
|
||||
sub v1.8h, v1.8h, v0.8h // t1 - u
|
||||
sub v2.8h, v2.8h, v0.8h // t2 - u
|
||||
ushll v3.4s, v0.4h, #7 // u << 7
|
||||
ushll2 v4.4s, v0.8h, #7 // u << 7
|
||||
smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
|
||||
smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
|
||||
smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
|
||||
smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
|
||||
.if \bpc == 8
|
||||
rshrn v3.4h, v3.4s, #11
|
||||
rshrn2 v3.8h, v4.4s, #11
|
||||
sqxtun v3.8b, v3.8h
|
||||
st1 {v3.8b}, [x0], #8
|
||||
.else
|
||||
sqrshrun v3.4h, v3.4s, #11
|
||||
sqrshrun2 v3.8h, v4.4s, #11
|
||||
umin v3.8h, v3.8h, v29.8h
|
||||
st1 {v3.8h}, [x0], #16
|
||||
.endif
|
||||
b.gt 1b
|
||||
0:
|
||||
ret
|
||||
endfunc
|
||||
.endm
|
|
@ -29,14 +29,7 @@
|
|||
#include "src/arm/asm.S"
|
||||
#include "util.S"
|
||||
|
||||
.macro avg dst, t0, t1
|
||||
ld1 {\t0\().8h}, [x2], 16
|
||||
ld1 {\t1\().8h}, [x3], 16
|
||||
add \t0\().8h, \t0\().8h, \t1\().8h
|
||||
sqrshrun \dst\().8b, \t0\().8h, #5
|
||||
.endm
|
||||
|
||||
.macro avg16 dst, t0, t1, t2, t3
|
||||
.macro avg dst, t0, t1, t2, t3
|
||||
ld1 {\t0\().8h,\t1\().8h}, [x2], 32
|
||||
ld1 {\t2\().8h,\t3\().8h}, [x3], 32
|
||||
add \t0\().8h, \t0\().8h, \t2\().8h
|
||||
|
@ -45,16 +38,7 @@
|
|||
sqrshrun2 \dst\().16b, \t1\().8h, #5
|
||||
.endm
|
||||
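// Functionally the widened avg above matches the 8-byte variant it
// replaces: each lane becomes roughly iclip_u8((t + u + 16) >> 5) via the
// sqrshrun pair; it just consumes two 8h vectors per source so that one
// invocation now produces a full 16-byte row.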
|
||||
.macro w_avg dst, t0, t1
|
||||
ld1 {\t0\().8h}, [x2], 16
|
||||
ld1 {\t1\().8h}, [x3], 16
|
||||
sub \t0\().8h, \t1\().8h, \t0\().8h
|
||||
sqdmulh \t0\().8h, \t0\().8h, v30.8h
|
||||
add \t0\().8h, \t1\().8h, \t0\().8h
|
||||
sqrshrun \dst\().8b, \t0\().8h, #4
|
||||
.endm
|
||||
|
||||
.macro w_avg16 dst, t0, t1, t2, t3
|
||||
.macro w_avg dst, t0, t1, t2, t3
|
||||
ld1 {\t0\().8h,\t1\().8h}, [x2], 32
|
||||
ld1 {\t2\().8h,\t3\().8h}, [x3], 32
|
||||
sub \t0\().8h, \t2\().8h, \t0\().8h
|
||||
|
@ -67,19 +51,7 @@
|
|||
sqrshrun2 \dst\().16b, \t1\().8h, #4
|
||||
.endm
|
||||
|
||||
.macro mask dst, t0, t1
|
||||
ld1 {v30.8b}, [x6], 8
|
||||
ld1 {\t0\().8h}, [x2], 16
|
||||
mul v30.8b, v30.8b, v31.8b
|
||||
ld1 {\t1\().8h}, [x3], 16
|
||||
shll v30.8h, v30.8b, #8
|
||||
sub \t0\().8h, \t1\().8h, \t0\().8h
|
||||
sqdmulh \t0\().8h, \t0\().8h, v30.8h
|
||||
add \t0\().8h, \t1\().8h, \t0\().8h
|
||||
sqrshrun \dst\().8b, \t0\().8h, #4
|
||||
.endm
|
||||
|
||||
.macro mask16 dst, t0, t1, t2, t3
|
||||
.macro mask dst, t0, t1, t2, t3
|
||||
ld1 {v30.16b}, [x6], 16
|
||||
ld1 {\t0\().8h,\t1\().8h}, [x2], 32
|
||||
mul v30.16b, v30.16b, v31.16b
|
||||
|
@ -109,113 +81,108 @@ function \type\()_8bpc_neon, export=1
|
|||
.endif
|
||||
adr x7, L(\type\()_tbl)
|
||||
sub w4, w4, #24
|
||||
\type v4, v0, v1
|
||||
ldrh w4, [x7, x4, lsl #1]
|
||||
\type v5, v2, v3
|
||||
\type v4, v0, v1, v2, v3
|
||||
sub x7, x7, w4, uxtw
|
||||
br x7
|
||||
40:
|
||||
add x7, x0, x1
|
||||
lsl x1, x1, #1
|
||||
4:
|
||||
cmp w5, #4
|
||||
st1 {v4.s}[0], [x0], x1
|
||||
st1 {v4.s}[1], [x0], x1
|
||||
st1 {v5.s}[0], [x0], x1
|
||||
st1 {v5.s}[1], [x0], x1
|
||||
st1 {v4.s}[1], [x7], x1
|
||||
st1 {v4.s}[2], [x0], x1
|
||||
st1 {v4.s}[3], [x7], x1
|
||||
b.eq 0f
|
||||
\type v6, v0, v1
|
||||
\type v7, v2, v3
|
||||
\type v5, v0, v1, v2, v3
|
||||
cmp w5, #8
|
||||
st1 {v6.s}[0], [x0], x1
|
||||
st1 {v6.s}[1], [x0], x1
|
||||
st1 {v7.s}[0], [x0], x1
|
||||
st1 {v7.s}[1], [x0], x1
|
||||
b.eq 0f
|
||||
\type v4, v0, v1
|
||||
\type v5, v2, v3
|
||||
st1 {v4.s}[0], [x0], x1
|
||||
st1 {v4.s}[1], [x0], x1
|
||||
\type v6, v0, v1
|
||||
st1 {v5.s}[0], [x0], x1
|
||||
st1 {v5.s}[1], [x0], x1
|
||||
\type v7, v2, v3
|
||||
st1 {v6.s}[0], [x0], x1
|
||||
st1 {v6.s}[1], [x0], x1
|
||||
st1 {v7.s}[0], [x0], x1
|
||||
st1 {v7.s}[1], [x0], x1
|
||||
st1 {v5.s}[1], [x7], x1
|
||||
st1 {v5.s}[2], [x0], x1
|
||||
st1 {v5.s}[3], [x7], x1
|
||||
b.eq 0f
|
||||
\type v4, v0, v1, v2, v3
|
||||
st1 {v4.s}[0], [x0], x1
|
||||
st1 {v4.s}[1], [x7], x1
|
||||
\type v5, v0, v1, v2, v3
|
||||
st1 {v4.s}[2], [x0], x1
|
||||
st1 {v4.s}[3], [x7], x1
|
||||
st1 {v5.s}[0], [x0], x1
|
||||
st1 {v5.s}[1], [x7], x1
|
||||
st1 {v5.s}[2], [x0], x1
|
||||
st1 {v5.s}[3], [x7], x1
|
||||
ret
|
||||
80:
|
||||
add x7, x0, x1
|
||||
lsl x1, x1, #1
|
||||
8:
|
||||
st1 {v4.8b}, [x0], x1
|
||||
\type v6, v0, v1
|
||||
st1 {v5.8b}, [x0], x1
|
||||
\type v7, v0, v1
|
||||
st1 {v6.8b}, [x0], x1
|
||||
st1 {v4.d}[0], [x0], x1
|
||||
\type v5, v0, v1, v2, v3
|
||||
st1 {v4.d}[1], [x7], x1
|
||||
st1 {v5.d}[0], [x0], x1
|
||||
subs w5, w5, #4
|
||||
st1 {v7.8b}, [x0], x1
|
||||
st1 {v5.d}[1], [x7], x1
|
||||
b.le 0f
|
||||
\type v4, v0, v1
|
||||
\type v5, v2, v3
|
||||
\type v4, v0, v1, v2, v3
|
||||
b 8b
|
||||
160:
|
||||
trn1 v4.2d, v4.2d, v5.2d
|
||||
16:
|
||||
\type\()16 v5, v0, v1, v2, v3
|
||||
\type v5, v0, v1, v2, v3
|
||||
st1 {v4.16b}, [x0], x1
|
||||
\type\()16 v6, v0, v1, v2, v3
|
||||
\type v6, v0, v1, v2, v3
|
||||
st1 {v5.16b}, [x0], x1
|
||||
\type\()16 v7, v0, v1, v2, v3
|
||||
\type v7, v0, v1, v2, v3
|
||||
st1 {v6.16b}, [x0], x1
|
||||
subs w5, w5, #4
|
||||
st1 {v7.16b}, [x0], x1
|
||||
b.le 0f
|
||||
\type\()16 v4, v0, v1, v2, v3
|
||||
\type v4, v0, v1, v2, v3
|
||||
b 16b
|
||||
320:
|
||||
trn1 v4.2d, v4.2d, v5.2d
|
||||
add x7, x0, x1
|
||||
lsl x1, x1, #1
|
||||
32:
|
||||
\type\()16 v5, v0, v1, v2, v3
|
||||
\type\()16 v6, v0, v1, v2, v3
|
||||
\type v5, v0, v1, v2, v3
|
||||
\type v6, v0, v1, v2, v3
|
||||
st1 {v4.16b,v5.16b}, [x0], x1
|
||||
\type\()16 v7, v0, v1, v2, v3
|
||||
\type v7, v0, v1, v2, v3
|
||||
subs w5, w5, #2
|
||||
st1 {v6.16b,v7.16b}, [x7], x1
|
||||
b.le 0f
|
||||
\type\()16 v4, v0, v1, v2, v3
|
||||
\type v4, v0, v1, v2, v3
|
||||
b 32b
|
||||
640:
|
||||
trn1 v4.2d, v4.2d, v5.2d
|
||||
add x7, x0, x1
|
||||
lsl x1, x1, #1
|
||||
64:
|
||||
\type\()16 v5, v0, v1, v2, v3
|
||||
\type\()16 v6, v0, v1, v2, v3
|
||||
\type\()16 v7, v0, v1, v2, v3
|
||||
\type\()16 v16, v0, v1, v2, v3
|
||||
\type\()16 v17, v0, v1, v2, v3
|
||||
\type v5, v0, v1, v2, v3
|
||||
\type v6, v0, v1, v2, v3
|
||||
\type v7, v0, v1, v2, v3
|
||||
\type v16, v0, v1, v2, v3
|
||||
\type v17, v0, v1, v2, v3
|
||||
st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
|
||||
\type\()16 v18, v0, v1, v2, v3
|
||||
\type\()16 v19, v0, v1, v2, v3
|
||||
\type v18, v0, v1, v2, v3
|
||||
\type v19, v0, v1, v2, v3
|
||||
subs w5, w5, #2
|
||||
st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
|
||||
b.le 0f
|
||||
\type\()16 v4, v0, v1, v2, v3
|
||||
\type v4, v0, v1, v2, v3
|
||||
b 64b
|
||||
1280:
|
||||
trn1 v4.2d, v4.2d, v5.2d
|
||||
add x7, x0, #64
|
||||
128:
|
||||
\type\()16 v5, v0, v1, v2, v3
|
||||
\type\()16 v6, v0, v1, v2, v3
|
||||
\type\()16 v7, v0, v1, v2, v3
|
||||
\type\()16 v16, v0, v1, v2, v3
|
||||
\type\()16 v17, v0, v1, v2, v3
|
||||
\type v5, v0, v1, v2, v3
|
||||
\type v6, v0, v1, v2, v3
|
||||
\type v7, v0, v1, v2, v3
|
||||
\type v16, v0, v1, v2, v3
|
||||
\type v17, v0, v1, v2, v3
|
||||
st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
|
||||
\type\()16 v18, v0, v1, v2, v3
|
||||
\type\()16 v19, v0, v1, v2, v3
|
||||
\type v18, v0, v1, v2, v3
|
||||
\type v19, v0, v1, v2, v3
|
||||
subs w5, w5, #1
|
||||
st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
|
||||
b.le 0f
|
||||
\type\()16 v4, v0, v1, v2, v3
|
||||
\type v4, v0, v1, v2, v3
|
||||
b 128b
|
||||
0:
|
||||
ret
|
||||
|
@ -223,9 +190,9 @@ L(\type\()_tbl):
|
|||
.hword L(\type\()_tbl) - 1280b
|
||||
.hword L(\type\()_tbl) - 640b
|
||||
.hword L(\type\()_tbl) - 320b
|
||||
.hword L(\type\()_tbl) - 160b
|
||||
.hword L(\type\()_tbl) - 8b
|
||||
.hword L(\type\()_tbl) - 4b
|
||||
.hword L(\type\()_tbl) - 16b
|
||||
.hword L(\type\()_tbl) - 80b
|
||||
.hword L(\type\()_tbl) - 40b
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
|
@ -464,10 +431,10 @@ function blend_8bpc_neon, export=1
|
|||
sub x6, x6, w3, uxtw
|
||||
movi v4.16b, #64
|
||||
add x8, x0, x1
|
||||
lsl w1, w1, #1
|
||||
lsl x1, x1, #1
|
||||
br x6
|
||||
4:
|
||||
ld1 {v2.d}[0], [x5], #8
|
||||
ld1 {v2.8b}, [x5], #8
|
||||
ld1 {v1.d}[0], [x2], #8
|
||||
ld1 {v0.s}[0], [x0]
|
||||
subs w4, w4, #2
|
||||
|
@ -481,8 +448,8 @@ function blend_8bpc_neon, export=1
|
|||
b.gt 4b
|
||||
ret
|
||||
8:
|
||||
ld1 {v2.2d}, [x5], #16
|
||||
ld1 {v1.2d}, [x2], #16
|
||||
ld1 {v2.16b}, [x5], #16
|
||||
ld1 {v1.16b}, [x2], #16
|
||||
ld1 {v0.d}[0], [x0]
|
||||
ld1 {v0.d}[1], [x8]
|
||||
sub v3.16b, v4.16b, v2.16b
|
||||
|
@ -498,13 +465,13 @@ function blend_8bpc_neon, export=1
|
|||
b.gt 8b
|
||||
ret
|
||||
16:
|
||||
ld1 {v1.2d, v2.2d}, [x5], #32
|
||||
ld1 {v5.2d, v6.2d}, [x2], #32
|
||||
ld1 {v0.2d}, [x0]
|
||||
ld1 {v1.16b, v2.16b}, [x5], #32
|
||||
ld1 {v5.16b, v6.16b}, [x2], #32
|
||||
ld1 {v0.16b}, [x0]
|
||||
subs w4, w4, #2
|
||||
sub v7.16b, v4.16b, v1.16b
|
||||
sub v20.16b, v4.16b, v2.16b
|
||||
ld1 {v3.2d}, [x8]
|
||||
ld1 {v3.16b}, [x8]
|
||||
umull v16.8h, v5.8b, v1.8b
|
||||
umlal v16.8h, v0.8b, v7.8b
|
||||
umull2 v17.8h, v5.16b, v1.16b
|
||||
|
@ -517,16 +484,16 @@ function blend_8bpc_neon, export=1
|
|||
rshrn2 v18.16b, v17.8h, #6
|
||||
rshrn v19.8b, v21.8h, #6
|
||||
rshrn2 v19.16b, v22.8h, #6
|
||||
st1 {v18.2d}, [x0], x1
|
||||
st1 {v19.2d}, [x8], x1
|
||||
st1 {v18.16b}, [x0], x1
|
||||
st1 {v19.16b}, [x8], x1
|
||||
b.gt 16b
|
||||
ret
|
||||
32:
|
||||
ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x5], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x2], #64
|
||||
ld1 {v20.2d, v21.2d}, [x0]
|
||||
ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], #64
|
||||
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
|
||||
ld1 {v20.16b, v21.16b}, [x0]
|
||||
subs w4, w4, #2
|
||||
ld1 {v22.2d, v23.2d}, [x8]
|
||||
ld1 {v22.16b, v23.16b}, [x8]
|
||||
sub v5.16b, v4.16b, v0.16b
|
||||
sub v6.16b, v4.16b, v1.16b
|
||||
sub v30.16b, v4.16b, v2.16b
|
||||
|
@ -555,8 +522,8 @@ function blend_8bpc_neon, export=1
|
|||
rshrn2 v27.16b, v1.8h, #6
|
||||
rshrn v28.8b, v29.8h, #6
|
||||
rshrn2 v28.16b, v21.8h, #6
|
||||
st1 {v24.2d, v25.2d}, [x0], x1
|
||||
st1 {v27.2d, v28.2d}, [x8], x1
|
||||
st1 {v24.16b, v25.16b}, [x0], x1
|
||||
st1 {v27.16b, v28.16b}, [x8], x1
|
||||
b.gt 32b
|
||||
ret
|
||||
L(blend_tbl):
|
||||
|
@ -567,7 +534,7 @@ L(blend_tbl):
|
|||
endfunc
|
||||
|
||||
function blend_h_8bpc_neon, export=1
|
||||
adr x6, L(blend_h_tbl)
|
||||
adr x6, L(blend_h_tbl)
|
||||
movrel x5, X(obmc_masks)
|
||||
add x5, x5, w4, uxtw
|
||||
sub w4, w4, w4, lsr #2
|
||||
|
@ -596,7 +563,7 @@ function blend_h_8bpc_neon, export=1
|
|||
ret
|
||||
4:
|
||||
ld2r {v0.8b, v1.8b}, [x5], #2
|
||||
ld1 {v2.2s}, [x2], #8
|
||||
ld1 {v2.8b}, [x2], #8
|
||||
subs w4, w4, #2
|
||||
ext v0.8b, v0.8b, v1.8b, #4
|
||||
ld1 {v3.s}[0], [x0]
|
||||
|
@ -742,8 +709,8 @@ function blend_v_8bpc_neon, export=1
|
|||
ret
|
||||
40:
|
||||
ld1r {v0.2s}, [x5]
|
||||
sub x1, x1, #2
|
||||
sub v1.8b, v4.8b, v0.8b
|
||||
sub x1, x1, #3
|
||||
4:
|
||||
ld1 {v2.8b}, [x2], #8
|
||||
ld1 {v3.s}[0], [x0]
|
||||
|
@ -754,16 +721,14 @@ function blend_v_8bpc_neon, export=1
|
|||
rshrn v5.8b, v5.8h, #6
|
||||
st1 {v5.h}[0], [x0], #2
|
||||
st1 {v5.h}[2], [x8], #2
|
||||
st1 {v5.b}[2], [x0], #1
|
||||
st1 {v5.b}[6], [x8], #1
|
||||
add x0, x0, x1
|
||||
add x8, x8, x1
|
||||
st1 {v5.b}[2], [x0], x1
|
||||
st1 {v5.b}[6], [x8], x1
|
||||
b.gt 4b
|
||||
ret
|
||||
80:
|
||||
ld1r {v0.2d}, [x5]
|
||||
sub x1, x1, #4
|
||||
sub v1.16b, v4.16b, v0.16b
|
||||
sub x1, x1, #6
|
||||
8:
|
||||
ld1 {v2.16b}, [x2], #16
|
||||
ld1 {v3.d}[0], [x0]
|
||||
|
@ -777,16 +742,14 @@ function blend_v_8bpc_neon, export=1
|
|||
rshrn2 v7.16b, v6.8h, #6
|
||||
st1 {v7.s}[0], [x0], #4
|
||||
st1 {v7.s}[2], [x8], #4
|
||||
st1 {v7.h}[2], [x0], #2
|
||||
st1 {v7.h}[6], [x8], #2
|
||||
add x0, x0, x1
|
||||
add x8, x8, x1
|
||||
st1 {v7.h}[2], [x0], x1
|
||||
st1 {v7.h}[6], [x8], x1
|
||||
b.gt 8b
|
||||
ret
|
||||
160:
|
||||
ld1 {v0.16b}, [x5]
|
||||
sub x1, x1, #8
|
||||
sub v2.16b, v4.16b, v0.16b
|
||||
sub x1, x1, #12
|
||||
16:
|
||||
ld1 {v5.16b, v6.16b}, [x2], #32
|
||||
ld1 {v7.16b}, [x0]
|
||||
|
@ -806,17 +769,15 @@ function blend_v_8bpc_neon, export=1
|
|||
rshrn2 v22.16b, v21.8h, #6
|
||||
st1 {v19.8b}, [x0], #8
|
||||
st1 {v22.8b}, [x8], #8
|
||||
st1 {v19.s}[2], [x0], #4
|
||||
st1 {v22.s}[2], [x8], #4
|
||||
add x0, x0, x1
|
||||
add x8, x8, x1
|
||||
st1 {v19.s}[2], [x0], x1
|
||||
st1 {v22.s}[2], [x8], x1
|
||||
b.gt 16b
|
||||
ret
|
||||
320:
|
||||
ld1 {v0.16b, v1.16b}, [x5]
|
||||
sub x1, x1, #16
|
||||
sub v2.16b, v4.16b, v0.16b
|
||||
sub v3.16b, v4.16b, v1.16b
|
||||
sub x1, x1, #24
|
||||
sub v3.8b, v4.8b, v1.8b
|
||||
32:
|
||||
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
|
||||
ld1 {v5.16b, v6.16b}, [x0]
|
||||
|
@ -828,30 +789,22 @@ function blend_v_8bpc_neon, export=1
|
|||
umlal2 v23.8h, v5.16b, v2.16b
|
||||
umull v28.8h, v17.8b, v1.8b
|
||||
umlal v28.8h, v6.8b, v3.8b
|
||||
umull2 v29.8h, v17.16b, v1.16b
|
||||
umlal2 v29.8h, v6.16b, v3.16b
|
||||
umull v30.8h, v18.8b, v0.8b
|
||||
umlal v30.8h, v20.8b, v2.8b
|
||||
umull2 v31.8h, v18.16b, v0.16b
|
||||
umlal2 v31.8h, v20.16b, v2.16b
|
||||
umull v25.8h, v19.8b, v1.8b
|
||||
umlal v25.8h, v21.8b, v3.8b
|
||||
umull2 v26.8h, v19.16b, v1.16b
|
||||
umlal2 v26.8h, v21.16b, v3.16b
|
||||
rshrn v24.8b, v22.8h, #6
|
||||
rshrn2 v24.16b, v23.8h, #6
|
||||
rshrn v28.8b, v28.8h, #6
|
||||
rshrn2 v28.16b, v29.8h, #6
|
||||
rshrn v30.8b, v30.8h, #6
|
||||
rshrn2 v30.16b, v31.8h, #6
|
||||
rshrn v27.8b, v25.8h, #6
|
||||
rshrn2 v27.16b, v26.8h, #6
|
||||
st1 {v24.16b}, [x0], #16
|
||||
st1 {v30.16b}, [x8], #16
|
||||
st1 {v28.8b}, [x0], #8
|
||||
st1 {v27.8b}, [x8], #8
|
||||
add x0, x0, x1
|
||||
add x8, x8, x1
|
||||
st1 {v28.8b}, [x0], x1
|
||||
st1 {v27.8b}, [x8], x1
|
||||
b.gt 32b
|
||||
ret
|
||||
L(blend_v_tbl):
|
||||
|
@ -2106,9 +2059,9 @@ L(\type\()_8tap_filter_2):
|
|||
st1 {v3.4h}, [\ds2], \d_strd
|
||||
.endif
|
||||
b.le 0f
|
||||
mov v16.16b, v18.16b
|
||||
mov v17.16b, v28.16b
|
||||
mov v18.16b, v29.16b
|
||||
mov v16.8b, v18.8b
|
||||
mov v17.8b, v28.8b
|
||||
mov v18.8b, v29.8b
|
||||
b 4b
|
||||
|
||||
480: // 4x8, 4x16, 4x32 hv
|
||||
|
@ -110,25 +110,10 @@ endconst
.endif
.endm

.macro umull_n d0, d1, d2, d3, s0, s1, s2, s3, n
umull \d0\().4s, \s0\().4h, \s2\().4h
.if \n >= 8
umull2 \d1\().4s, \s0\().8h, \s2\().8h
.endif
.macro sqdmulh_n d0, d1, s0, s1, s2, s3, sz, n
sqdmulh \d0\sz, \s0\sz, \s2\sz
.if \n == 16
umull \d2\().4s, \s1\().4h, \s3\().4h
umull2 \d3\().4s, \s1\().8h, \s3\().8h
.endif
.endm

.macro shrn_n d0, d1, s0, s1, s2, s3, shift, n
shrn \d0\().4h, \s0\().4s, \shift
.if \n >= 8
shrn2 \d0\().8h, \s1\().4s, \shift
.endif
.if \n == 16
shrn \d1\().4h, \s2\().4s, \shift
shrn2 \d1\().8h, \s3\().4s, \shift
sqdmulh \d1\sz, \s1\sz, \s3\sz
.endif
.endm

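// The sqdmulh_n macro above stands in for the removed umull_n/shrn_n
// pair: sqdmulh computes (2 * a * b) >> 16 with saturation, and the
// operands are pre-scaled (cdf & 0xffc0, rng & 0x7f00), which is what
// yields the ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1 value annotated
// in the function body below.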
@ -149,17 +134,19 @@ function msac_decode_symbol_adapt4_neon, export=1
ld1_n v0, v1, x1, \sz, \n // cdf
ld1r {v4\sz}, [x8] // rng
movrel x9, coeffs, 30
movi v31\sz, #0x7f, lsl #8 // 0x7f00
sub x9, x9, x2, lsl #1
ushr_n v2, v3, v0, v1, #6, \sz, \n // cdf >> EC_PROB_SHIFT
mvni v30\sz, #0x3f // 0xffc0
and v7\szb, v4\szb, v31\szb // rng & 0x7f00
str h4, [sp, #14] // store original u = s->rng
ushr v4\sz, v4\sz, #8 // r = rng >> 8
and_n v2, v3, v0, v1, v30, v30, \szb, \n // cdf & 0xffc0

umull_n v16, v17, v18, v19, v4, v4, v2, v3, \n // r * (cdf >> EC_PROB_SHIFT)
ld1_n v4, v5, x9, \sz, \n // EC_MIN_PROB * (n_symbols - ret)
shrn_n v2, v3, v16, v17, v18, v19, #1, \n // v >>= 7 - EC_PROB_SHIFT
sqdmulh_n v6, v7, v2, v3, v7, v7, \sz, \n // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
add x8, x0, #DIF + 6

add_n v4, v5, v2, v3, v4, v5, \sz, \n // v += EC_MIN_PROB * (n_symbols - ret)
add_n v4, v5, v2, v3, v4, v5, \sz, \n // v = cdf + EC_MIN_PROB * (n_symbols - ret)
add_n v4, v5, v6, v7, v4, v5, \sz, \n // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)

ld1r {v6.8h}, [x8] // dif >> (EC_WIN_SIZE - 16)
movrel x8, bits

@ -27,42 +27,46 @@
#include "src/cpu.h"
#include "src/cdef.h"

#if BITDEPTH == 8
decl_cdef_dir_fn(dav1d_cdef_find_dir_neon);
#if BITDEPTH == 8 || ARCH_AARCH64
decl_cdef_dir_fn(BF(dav1d_cdef_find_dir, neon));

void dav1d_cdef_padding4_neon(uint16_t *tmp, const pixel *src,
ptrdiff_t src_stride, const pixel (*left)[2],
/*const*/ pixel *const top[2], int h,
enum CdefEdgeFlags edges);
void dav1d_cdef_padding8_neon(uint16_t *tmp, const pixel *src,
ptrdiff_t src_stride, const pixel (*left)[2],
/*const*/ pixel *const top[2], int h,
enum CdefEdgeFlags edges);
void BF(dav1d_cdef_padding4, neon)(uint16_t *tmp, const pixel *src,
ptrdiff_t src_stride, const pixel (*left)[2],
const pixel *const top, int h,
enum CdefEdgeFlags edges);
void BF(dav1d_cdef_padding8, neon)(uint16_t *tmp, const pixel *src,
ptrdiff_t src_stride, const pixel (*left)[2],
const pixel *const top, int h,
enum CdefEdgeFlags edges);

void dav1d_cdef_filter4_neon(pixel *dst, ptrdiff_t dst_stride,
const uint16_t *tmp, int pri_strength,
int sec_strength, int dir, int damping, int h);
void dav1d_cdef_filter8_neon(pixel *dst, ptrdiff_t dst_stride,
const uint16_t *tmp, int pri_strength,
int sec_strength, int dir, int damping, int h);
// Passing edges to this function, to allow it to switch to a more
// optimized version for fully edged cases. Using size_t for edges,
// to avoid ABI differences for passing more than one argument on the stack.
void BF(dav1d_cdef_filter4, neon)(pixel *dst, ptrdiff_t dst_stride,
const uint16_t *tmp, int pri_strength,
int sec_strength, int dir, int damping, int h,
size_t edges HIGHBD_DECL_SUFFIX);
void BF(dav1d_cdef_filter8, neon)(pixel *dst, ptrdiff_t dst_stride,
const uint16_t *tmp, int pri_strength,
int sec_strength, int dir, int damping, int h,
size_t edges HIGHBD_DECL_SUFFIX);

#define DEFINE_FILTER(w, h, tmp_stride) \
static void \
cdef_filter_##w##x##h##_neon(pixel *dst, \
const ptrdiff_t stride, \
const pixel (*left)[2], \
/*const*/ pixel *const top[2], \
const int pri_strength, \
const int sec_strength, \
const int dir, \
const int damping, \
const enum CdefEdgeFlags edges) \
const pixel (*left)[2], const pixel *const top, \
const int pri_strength, const int sec_strength, \
const int dir, const int damping, \
const enum CdefEdgeFlags edges \
HIGHBD_DECL_SUFFIX) \
{ \
ALIGN_STK_16(uint16_t, tmp_buf, 12*tmp_stride + 8,); \
ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,); \
uint16_t *tmp = tmp_buf + 2 * tmp_stride + 8; \
dav1d_cdef_padding##w##_neon(tmp, dst, stride, left, top, h, edges); \
dav1d_cdef_filter##w##_neon(dst, stride, tmp, pri_strength, \
sec_strength, dir, damping, h); \
BF(dav1d_cdef_padding##w, neon)(tmp, dst, stride, left, top, h, edges); \
BF(dav1d_cdef_filter##w, neon)(dst, stride, tmp, pri_strength, \
sec_strength, dir, damping, h, edges \
HIGHBD_TAIL_SUFFIX); \
}

DEFINE_FILTER(8, 8, 16)
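/* BF(name, neon) resolves to the bitdepth-suffixed symbol, i.e.
 * name_8bpc_neon or name_16bpc_neon, so one wrapper file serves both
 * template builds. As a sketch, DEFINE_FILTER(8, 8, 16) then expands to
 * roughly the following in the 8bpc build, where the HIGHBD_* macros are
 * empty:
 *
 * static void cdef_filter_8x8_neon(pixel *dst, const ptrdiff_t stride,
 *                                  const pixel (*left)[2],
 *                                  const pixel *const top,
 *                                  const int pri_strength,
 *                                  const int sec_strength,
 *                                  const int dir, const int damping,
 *                                  const enum CdefEdgeFlags edges) {
 *     ALIGN_STK_16(uint16_t, tmp_buf, 12 * 16 + 8,);
 *     uint16_t *tmp = tmp_buf + 2 * 16 + 8;
 *     dav1d_cdef_padding8_8bpc_neon(tmp, dst, stride, left, top, 8, edges);
 *     dav1d_cdef_filter8_8bpc_neon(dst, stride, tmp, pri_strength,
 *                                  sec_strength, dir, damping, 8, edges);
 * }
 */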
@ -76,8 +80,8 @@ COLD void bitfn(dav1d_cdef_dsp_init_arm)(Dav1dCdefDSPContext *const c) {

if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;

#if BITDEPTH == 8
c->dir = dav1d_cdef_find_dir_neon;
#if BITDEPTH == 8 || ARCH_AARCH64
c->dir = BF(dav1d_cdef_find_dir, neon);
c->fb[0] = cdef_filter_8x8_neon;
c->fb[1] = cdef_filter_4x8_neon;
c->fb[2] = cdef_filter_4x4_neon;

@ -28,20 +28,20 @@
#include "src/cpu.h"
#include "src/loopfilter.h"

decl_loopfilter_sb_fn(dav1d_lpf_h_sb_y_neon);
decl_loopfilter_sb_fn(dav1d_lpf_v_sb_y_neon);
decl_loopfilter_sb_fn(dav1d_lpf_h_sb_uv_neon);
decl_loopfilter_sb_fn(dav1d_lpf_v_sb_uv_neon);
decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, neon));
decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, neon));
decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, neon));
decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, neon));

COLD void bitfn(dav1d_loop_filter_dsp_init_arm)(Dav1dLoopFilterDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();

if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;

#if BITDEPTH == 8
c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_neon;
c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_neon;
c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_neon;
c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_neon;
#if BITDEPTH == 8 || ARCH_AARCH64
c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, neon);
c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, neon);
c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, neon);
c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, neon);
#endif
}

@ -29,19 +29,26 @@
#include "src/looprestoration.h"
#include "src/tables.h"

#if BITDEPTH == 8
// This calculates things slightly differently than the reference C version.
// This version calculates roughly this:
#if BITDEPTH == 8 || ARCH_AARCH64
// The 8bpc version calculates things slightly differently than the reference
// C version. That version calculates roughly this:
// int16_t sum = 0;
// for (int i = 0; i < 7; i++)
// sum += src[idx] * fh[i];
// int16_t sum2 = (src[x] << 7) - (1 << (BITDEPTH + 6)) + rounding_off_h;
// int16_t sum2 = (src[x] << 7) - (1 << (bitdepth + 6)) + rounding_off_h;
// sum = iclip(sum + sum2, INT16_MIN, INT16_MAX) >> round_bits_h;
// sum += 2048;
void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4],
const pixel *src, ptrdiff_t stride,
const int16_t fh[7], const intptr_t w,
int h, enum LrEdgeFlags edges);
// sum += 1 << (bitdepth + 6 - round_bits_h);
// Compared to the reference C version, this is the output of the first pass
// _subtracted_ by 1 << (bitdepth + 6 - round_bits_h) = 2048, i.e.
// with round_offset precompensated.
// The 16bpc version calculates things pretty much the same way as the
// reference C version, but with the end result subtracted by
// 1 << (bitdepth + 6 - round_bits_h).
void BF(dav1d_wiener_filter_h, neon)(int16_t *dst, const pixel (*left)[4],
const pixel *src, ptrdiff_t stride,
const int16_t fh[7], const intptr_t w,
int h, enum LrEdgeFlags edges
HIGHBD_DECL_SUFFIX);
// This calculates things slightly differently than the reference C version.
// This version calculates roughly this:
// fv[3] += 128;
@ -50,217 +57,242 @@ void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4],
|
|||
// sum += mid[idx] * fv[i];
|
||||
// sum = (sum + rounding_off_v) >> round_bits_v;
|
||||
// This function assumes that the width is a multiple of 8.
|
||||
void dav1d_wiener_filter_v_neon(pixel *dst, ptrdiff_t stride,
|
||||
const int16_t *mid, int w, int h,
|
||||
const int16_t fv[7], enum LrEdgeFlags edges,
|
||||
ptrdiff_t mid_stride);
|
||||
void dav1d_copy_narrow_neon(pixel *dst, ptrdiff_t stride,
|
||||
const pixel *src, int w, int h);
|
||||
void BF(dav1d_wiener_filter_v, neon)(pixel *dst, ptrdiff_t stride,
|
||||
const int16_t *mid, int w, int h,
|
||||
const int16_t fv[7], enum LrEdgeFlags edges,
|
||||
ptrdiff_t mid_stride HIGHBD_DECL_SUFFIX);
|
||||
void BF(dav1d_copy_narrow, neon)(pixel *dst, ptrdiff_t stride,
|
||||
const pixel *src, int w, int h);
|
||||
|
||||
static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
|
||||
const pixel (*const left)[4],
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride,
|
||||
const int w, const int h, const int16_t fh[7],
|
||||
const int16_t fv[7], const enum LrEdgeFlags edges)
|
||||
const int16_t fv[7], const enum LrEdgeFlags edges
|
||||
HIGHBD_DECL_SUFFIX)
|
||||
{
|
||||
ALIGN_STK_16(int16_t, mid, 68 * 384,);
|
||||
int mid_stride = (w + 7) & ~7;
|
||||
|
||||
// Horizontal filter
|
||||
dav1d_wiener_filter_h_neon(&mid[2 * mid_stride], left, dst, dst_stride,
|
||||
fh, w, h, edges);
|
||||
BF(dav1d_wiener_filter_h, neon)(&mid[2 * mid_stride], left, dst, dst_stride,
|
||||
fh, w, h, edges HIGHBD_TAIL_SUFFIX);
|
||||
if (edges & LR_HAVE_TOP)
|
||||
dav1d_wiener_filter_h_neon(mid, NULL, lpf, lpf_stride,
|
||||
fh, w, 2, edges);
|
||||
BF(dav1d_wiener_filter_h, neon)(mid, NULL, lpf, lpf_stride,
|
||||
fh, w, 2, edges HIGHBD_TAIL_SUFFIX);
|
||||
if (edges & LR_HAVE_BOTTOM)
|
||||
dav1d_wiener_filter_h_neon(&mid[(2 + h) * mid_stride], NULL,
|
||||
lpf + 6 * PXSTRIDE(lpf_stride), lpf_stride,
|
||||
fh, w, 2, edges);
|
||||
BF(dav1d_wiener_filter_h, neon)(&mid[(2 + h) * mid_stride], NULL,
|
||||
lpf + 6 * PXSTRIDE(lpf_stride),
|
||||
lpf_stride, fh, w, 2, edges
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
|
||||
// Vertical filter
|
||||
if (w >= 8)
|
||||
dav1d_wiener_filter_v_neon(dst, dst_stride, &mid[2*mid_stride],
|
||||
w & ~7, h, fv, edges, mid_stride * sizeof(*mid));
|
||||
BF(dav1d_wiener_filter_v, neon)(dst, dst_stride, &mid[2*mid_stride],
|
||||
w & ~7, h, fv, edges,
|
||||
mid_stride * sizeof(*mid)
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
if (w & 7) {
|
||||
// For uneven widths, do a full 8 pixel wide filtering into a temp
|
||||
// buffer and copy out the narrow slice of pixels separately into dest.
|
||||
ALIGN_STK_16(pixel, tmp, 64 * 8,);
|
||||
dav1d_wiener_filter_v_neon(tmp, w & 7, &mid[2*mid_stride + (w & ~7)],
|
||||
w & 7, h, fv, edges, mid_stride * sizeof(*mid));
|
||||
dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, tmp, w & 7, h);
|
||||
BF(dav1d_wiener_filter_v, neon)(tmp, (w & 7) * sizeof(pixel),
|
||||
&mid[2*mid_stride + (w & ~7)],
|
||||
w & 7, h, fv, edges,
|
||||
mid_stride * sizeof(*mid)
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, tmp, w & 7, h);
|
||||
}
|
||||
}
|
||||
|
||||
void dav1d_sgr_box3_h_neon(int32_t *sumsq, int16_t *sum,
|
||||
const pixel (*left)[4],
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const int w, const int h,
|
||||
const enum LrEdgeFlags edges);
|
||||
void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum,
|
||||
const pixel (*left)[4],
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const int w, const int h,
|
||||
const enum LrEdgeFlags edges);
|
||||
void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
|
||||
const int w, const int h,
|
||||
const enum LrEdgeFlags edges);
|
||||
void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
|
||||
const int w, const int h, const int strength);
|
||||
void dav1d_sgr_finish_filter1_neon(coef *tmp,
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const int32_t *a, const int16_t *b,
|
||||
const int w, const int h);
|
||||
const int w, const int h, const int strength,
|
||||
const int bitdepth_max);
|
||||
void BF(dav1d_sgr_finish_filter1, neon)(int16_t *tmp,
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const int32_t *a, const int16_t *b,
|
||||
const int w, const int h);
|
||||
|
||||
/* filter with a 3x3 box (radius=1) */
|
||||
static void dav1d_sgr_filter1_neon(coef *tmp,
|
||||
static void dav1d_sgr_filter1_neon(int16_t *tmp,
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const pixel (*left)[4],
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride,
|
||||
const int w, const int h, const int strength,
|
||||
const enum LrEdgeFlags edges)
|
||||
const enum LrEdgeFlags edges
|
||||
HIGHBD_DECL_SUFFIX)
|
||||
{
|
||||
ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
|
||||
int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
|
||||
ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
|
||||
int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
|
||||
|
||||
dav1d_sgr_box3_h_neon(sumsq, sum, left, src, stride, w, h, edges);
|
||||
BF(dav1d_sgr_box3_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
|
||||
if (edges & LR_HAVE_TOP)
|
||||
dav1d_sgr_box3_h_neon(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
|
||||
NULL, lpf, lpf_stride, w, 2, edges);
|
||||
BF(dav1d_sgr_box3_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
|
||||
NULL, lpf, lpf_stride, w, 2, edges);
|
||||
|
||||
if (edges & LR_HAVE_BOTTOM)
|
||||
dav1d_sgr_box3_h_neon(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
|
||||
NULL, lpf + 6 * PXSTRIDE(lpf_stride),
|
||||
lpf_stride, w, 2, edges);
|
||||
BF(dav1d_sgr_box3_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
|
||||
NULL, lpf + 6 * PXSTRIDE(lpf_stride),
|
||||
lpf_stride, w, 2, edges);
|
||||
|
||||
dav1d_sgr_box3_v_neon(sumsq, sum, w, h, edges);
|
||||
dav1d_sgr_calc_ab1_neon(a, b, w, h, strength);
|
||||
dav1d_sgr_finish_filter1_neon(tmp, src, stride, a, b, w, h);
|
||||
dav1d_sgr_calc_ab1_neon(a, b, w, h, strength, BITDEPTH_MAX);
|
||||
BF(dav1d_sgr_finish_filter1, neon)(tmp, src, stride, a, b, w, h);
|
||||
}
|
||||
|
||||
void dav1d_sgr_box5_h_neon(int32_t *sumsq, int16_t *sum,
|
||||
const pixel (*left)[4],
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const int w, const int h,
|
||||
const enum LrEdgeFlags edges);
|
||||
void BF(dav1d_sgr_box5_h, neon)(int32_t *sumsq, int16_t *sum,
|
||||
const pixel (*left)[4],
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const int w, const int h,
|
||||
const enum LrEdgeFlags edges);
|
||||
void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
|
||||
const int w, const int h,
|
||||
const enum LrEdgeFlags edges);
|
||||
void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
|
||||
const int w, const int h, const int strength);
|
||||
void dav1d_sgr_finish_filter2_neon(coef *tmp,
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const int32_t *a, const int16_t *b,
|
||||
const int w, const int h);
|
||||
const int w, const int h, const int strength,
|
||||
const int bitdepth_max);
|
||||
void BF(dav1d_sgr_finish_filter2, neon)(int16_t *tmp,
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const int32_t *a, const int16_t *b,
|
||||
const int w, const int h);
|
||||
|
||||
/* filter with a 5x5 box (radius=2) */
|
||||
static void dav1d_sgr_filter2_neon(coef *tmp,
|
||||
static void dav1d_sgr_filter2_neon(int16_t *tmp,
|
||||
const pixel *src, const ptrdiff_t stride,
|
||||
const pixel (*left)[4],
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride,
|
||||
const int w, const int h, const int strength,
|
||||
const enum LrEdgeFlags edges)
|
||||
const enum LrEdgeFlags edges
|
||||
HIGHBD_DECL_SUFFIX)
|
||||
{
|
||||
ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
|
||||
int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
|
||||
ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
|
||||
int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
|
||||
|
||||
dav1d_sgr_box5_h_neon(sumsq, sum, left, src, stride, w, h, edges);
|
||||
BF(dav1d_sgr_box5_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
|
||||
if (edges & LR_HAVE_TOP)
|
||||
dav1d_sgr_box5_h_neon(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
|
||||
NULL, lpf, lpf_stride, w, 2, edges);
|
||||
BF(dav1d_sgr_box5_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
|
||||
NULL, lpf, lpf_stride, w, 2, edges);
|
||||
|
||||
if (edges & LR_HAVE_BOTTOM)
|
||||
dav1d_sgr_box5_h_neon(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
|
||||
NULL, lpf + 6 * PXSTRIDE(lpf_stride),
|
||||
lpf_stride, w, 2, edges);
|
||||
BF(dav1d_sgr_box5_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
|
||||
NULL, lpf + 6 * PXSTRIDE(lpf_stride),
|
||||
lpf_stride, w, 2, edges);
|
||||
|
||||
dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges);
|
||||
dav1d_sgr_calc_ab2_neon(a, b, w, h, strength);
|
||||
dav1d_sgr_finish_filter2_neon(tmp, src, stride, a, b, w, h);
|
||||
dav1d_sgr_calc_ab2_neon(a, b, w, h, strength, BITDEPTH_MAX);
|
||||
BF(dav1d_sgr_finish_filter2, neon)(tmp, src, stride, a, b, w, h);
|
||||
}
|
||||
|
||||
void dav1d_sgr_weighted1_neon(pixel *dst, const ptrdiff_t dst_stride,
|
||||
const pixel *src, const ptrdiff_t src_stride,
|
||||
const coef *t1, const int w, const int h,
|
||||
const int wt);
|
||||
void dav1d_sgr_weighted2_neon(pixel *dst, const ptrdiff_t dst_stride,
|
||||
const pixel *src, const ptrdiff_t src_stride,
|
||||
const coef *t1, const coef *t2,
|
||||
const int w, const int h,
|
||||
const int16_t wt[2]);
|
||||
void BF(dav1d_sgr_weighted1, neon)(pixel *dst, const ptrdiff_t dst_stride,
|
||||
const pixel *src, const ptrdiff_t src_stride,
|
||||
const int16_t *t1, const int w, const int h,
|
||||
const int wt HIGHBD_DECL_SUFFIX);
|
||||
void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride,
|
||||
const pixel *src, const ptrdiff_t src_stride,
|
||||
const int16_t *t1, const int16_t *t2,
|
||||
const int w, const int h,
|
||||
const int16_t wt[2] HIGHBD_DECL_SUFFIX);
|
||||
|
||||
static void sgr_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
|
||||
const pixel (*const left)[4],
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride,
|
||||
const int w, const int h, const int sgr_idx,
|
||||
const int16_t sgr_wt[7], const enum LrEdgeFlags edges)
|
||||
const int16_t sgr_wt[7], const enum LrEdgeFlags edges
|
||||
HIGHBD_DECL_SUFFIX)
|
||||
{
|
||||
if (!dav1d_sgr_params[sgr_idx][0]) {
|
||||
ALIGN_STK_16(coef, tmp, 64 * 384,);
|
||||
ALIGN_STK_16(int16_t, tmp, 64 * 384,);
|
||||
dav1d_sgr_filter1_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
|
||||
w, h, dav1d_sgr_params[sgr_idx][3], edges);
|
||||
w, h, dav1d_sgr_params[sgr_idx][3], edges
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
if (w >= 8)
|
||||
dav1d_sgr_weighted1_neon(dst, dst_stride, dst, dst_stride,
|
||||
tmp, w & ~7, h, (1 << 7) - sgr_wt[1]);
|
||||
BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride,
|
||||
tmp, w & ~7, h, (1 << 7) - sgr_wt[1]
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
if (w & 7) {
|
||||
// For uneven widths, do a full 8 pixel wide filtering into a temp
|
||||
// buffer and copy out the narrow slice of pixels separately into
|
||||
// dest.
|
||||
ALIGN_STK_16(pixel, stripe, 64 * 8,);
|
||||
dav1d_sgr_weighted1_neon(stripe, w & 7, dst + (w & ~7), dst_stride,
|
||||
tmp + (w & ~7), w & 7, h,
|
||||
(1 << 7) - sgr_wt[1]);
|
||||
dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, stripe,
|
||||
w & 7, h);
|
||||
BF(dav1d_sgr_weighted1, neon)(stripe, (w & 7) * sizeof(pixel),
|
||||
dst + (w & ~7), dst_stride,
|
||||
tmp + (w & ~7), w & 7, h,
|
||||
(1 << 7) - sgr_wt[1]
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, stripe,
|
||||
w & 7, h);
|
||||
}
|
||||
} else if (!dav1d_sgr_params[sgr_idx][1]) {
|
||||
ALIGN_STK_16(coef, tmp, 64 * 384,);
|
||||
ALIGN_STK_16(int16_t, tmp, 64 * 384,);
|
||||
dav1d_sgr_filter2_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
|
||||
w, h, dav1d_sgr_params[sgr_idx][2], edges);
|
||||
w, h, dav1d_sgr_params[sgr_idx][2], edges
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
if (w >= 8)
|
||||
dav1d_sgr_weighted1_neon(dst, dst_stride, dst, dst_stride,
|
||||
tmp, w & ~7, h, sgr_wt[0]);
|
||||
BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride,
|
||||
tmp, w & ~7, h, sgr_wt[0]
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
if (w & 7) {
|
||||
// For uneven widths, do a full 8 pixel wide filtering into a temp
|
||||
// buffer and copy out the narrow slice of pixels separately into
|
||||
// dest.
|
||||
ALIGN_STK_16(pixel, stripe, 64 * 8,);
|
||||
dav1d_sgr_weighted1_neon(stripe, w & 7, dst + (w & ~7), dst_stride,
|
||||
tmp + (w & ~7), w & 7, h, sgr_wt[0]);
|
||||
dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, stripe,
|
||||
w & 7, h);
|
||||
BF(dav1d_sgr_weighted1, neon)(stripe, (w & 7) * sizeof(pixel),
|
||||
dst + (w & ~7), dst_stride,
|
||||
tmp + (w & ~7), w & 7, h, sgr_wt[0]
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, stripe,
|
||||
w & 7, h);
|
||||
}
|
||||
} else {
|
||||
ALIGN_STK_16(coef, tmp1, 64 * 384,);
|
||||
ALIGN_STK_16(coef, tmp2, 64 * 384,);
|
||||
ALIGN_STK_16(int16_t, tmp1, 64 * 384,);
|
||||
ALIGN_STK_16(int16_t, tmp2, 64 * 384,);
|
||||
dav1d_sgr_filter2_neon(tmp1, dst, dst_stride, left, lpf, lpf_stride,
|
||||
w, h, dav1d_sgr_params[sgr_idx][2], edges);
|
||||
w, h, dav1d_sgr_params[sgr_idx][2], edges
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
dav1d_sgr_filter1_neon(tmp2, dst, dst_stride, left, lpf, lpf_stride,
|
||||
w, h, dav1d_sgr_params[sgr_idx][3], edges);
|
||||
w, h, dav1d_sgr_params[sgr_idx][3], edges
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
const int16_t wt[2] = { sgr_wt[0], 128 - sgr_wt[0] - sgr_wt[1] };
|
||||
if (w >= 8)
|
||||
dav1d_sgr_weighted2_neon(dst, dst_stride, dst, dst_stride,
|
||||
tmp1, tmp2, w & ~7, h, wt);
|
||||
BF(dav1d_sgr_weighted2, neon)(dst, dst_stride, dst, dst_stride,
|
||||
tmp1, tmp2, w & ~7, h, wt
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
if (w & 7) {
|
||||
// For uneven widths, do a full 8 pixel wide filtering into a temp
|
||||
// buffer and copy out the narrow slice of pixels separately into
|
||||
// dest.
|
||||
ALIGN_STK_16(pixel, stripe, 64 * 8,);
|
||||
dav1d_sgr_weighted2_neon(stripe, w & 7, dst + (w & ~7), dst_stride,
|
||||
tmp1 + (w & ~7), tmp2 + (w & ~7),
|
||||
w & 7, h, wt);
|
||||
dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, stripe,
|
||||
w & 7, h);
|
||||
BF(dav1d_sgr_weighted2, neon)(stripe, (w & 7) * sizeof(pixel),
|
||||
dst + (w & ~7), dst_stride,
|
||||
tmp1 + (w & ~7), tmp2 + (w & ~7),
|
||||
w & 7, h, wt HIGHBD_TAIL_SUFFIX);
|
||||
BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, stripe,
|
||||
w & 7, h);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // BITDEPTH == 8
|
||||
|
||||
COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c) {
|
||||
COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c, int bpc) {
|
||||
const unsigned flags = dav1d_get_cpu_flags();
|
||||
|
||||
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
|
||||
|
||||
#if BITDEPTH == 8
|
||||
#if BITDEPTH == 8 || ARCH_AARCH64
|
||||
c->wiener = wiener_filter_neon;
|
||||
c->selfguided = sgr_filter_neon;
|
||||
if (bpc <= 10)
|
||||
c->selfguided = sgr_filter_neon;
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -30,52 +30,52 @@
|
|||
#include "src/mc.h"
|
||||
#include "src/cpu.h"
|
||||
|
||||
decl_mc_fn(dav1d_put_8tap_regular_8bpc_neon);
|
||||
decl_mc_fn(dav1d_put_8tap_regular_smooth_8bpc_neon);
|
||||
decl_mc_fn(dav1d_put_8tap_regular_sharp_8bpc_neon);
|
||||
decl_mc_fn(dav1d_put_8tap_smooth_8bpc_neon);
|
||||
decl_mc_fn(dav1d_put_8tap_smooth_regular_8bpc_neon);
|
||||
decl_mc_fn(dav1d_put_8tap_smooth_sharp_8bpc_neon);
|
||||
decl_mc_fn(dav1d_put_8tap_sharp_8bpc_neon);
|
||||
decl_mc_fn(dav1d_put_8tap_sharp_regular_8bpc_neon);
|
||||
decl_mc_fn(dav1d_put_8tap_sharp_smooth_8bpc_neon);
|
||||
decl_mc_fn(dav1d_put_bilin_8bpc_neon);
|
||||
decl_mc_fn(BF(dav1d_put_8tap_regular, neon));
|
||||
decl_mc_fn(BF(dav1d_put_8tap_regular_smooth, neon));
|
||||
decl_mc_fn(BF(dav1d_put_8tap_regular_sharp, neon));
|
||||
decl_mc_fn(BF(dav1d_put_8tap_smooth, neon));
|
||||
decl_mc_fn(BF(dav1d_put_8tap_smooth_regular, neon));
|
||||
decl_mc_fn(BF(dav1d_put_8tap_smooth_sharp, neon));
|
||||
decl_mc_fn(BF(dav1d_put_8tap_sharp, neon));
|
||||
decl_mc_fn(BF(dav1d_put_8tap_sharp_regular, neon));
|
||||
decl_mc_fn(BF(dav1d_put_8tap_sharp_smooth, neon));
|
||||
decl_mc_fn(BF(dav1d_put_bilin, neon));
|
||||
|
||||
decl_mct_fn(dav1d_prep_8tap_regular_8bpc_neon);
|
||||
decl_mct_fn(dav1d_prep_8tap_regular_smooth_8bpc_neon);
|
||||
decl_mct_fn(dav1d_prep_8tap_regular_sharp_8bpc_neon);
|
||||
decl_mct_fn(dav1d_prep_8tap_smooth_8bpc_neon);
|
||||
decl_mct_fn(dav1d_prep_8tap_smooth_regular_8bpc_neon);
|
||||
decl_mct_fn(dav1d_prep_8tap_smooth_sharp_8bpc_neon);
|
||||
decl_mct_fn(dav1d_prep_8tap_sharp_8bpc_neon);
|
||||
decl_mct_fn(dav1d_prep_8tap_sharp_regular_8bpc_neon);
|
||||
decl_mct_fn(dav1d_prep_8tap_sharp_smooth_8bpc_neon);
|
||||
decl_mct_fn(dav1d_prep_bilin_8bpc_neon);
|
||||
decl_mct_fn(BF(dav1d_prep_8tap_regular, neon));
|
||||
decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, neon));
|
||||
decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, neon));
|
||||
decl_mct_fn(BF(dav1d_prep_8tap_smooth, neon));
|
||||
decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, neon));
|
||||
decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, neon));
|
||||
decl_mct_fn(BF(dav1d_prep_8tap_sharp, neon));
|
||||
decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, neon));
|
||||
decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, neon));
|
||||
decl_mct_fn(BF(dav1d_prep_bilin, neon));
|
||||
|
||||
decl_avg_fn(dav1d_avg_8bpc_neon);
|
||||
decl_w_avg_fn(dav1d_w_avg_8bpc_neon);
|
||||
decl_mask_fn(dav1d_mask_8bpc_neon);
|
||||
decl_blend_fn(dav1d_blend_8bpc_neon);
|
||||
decl_blend_dir_fn(dav1d_blend_h_8bpc_neon);
|
||||
decl_blend_dir_fn(dav1d_blend_v_8bpc_neon);
|
||||
decl_avg_fn(BF(dav1d_avg, neon));
|
||||
decl_w_avg_fn(BF(dav1d_w_avg, neon));
|
||||
decl_mask_fn(BF(dav1d_mask, neon));
|
||||
decl_blend_fn(BF(dav1d_blend, neon));
|
||||
decl_blend_dir_fn(BF(dav1d_blend_h, neon));
|
||||
decl_blend_dir_fn(BF(dav1d_blend_v, neon));
|
||||
|
||||
decl_w_mask_fn(dav1d_w_mask_444_8bpc_neon);
|
||||
decl_w_mask_fn(dav1d_w_mask_422_8bpc_neon);
|
||||
decl_w_mask_fn(dav1d_w_mask_420_8bpc_neon);
|
||||
decl_w_mask_fn(BF(dav1d_w_mask_444, neon));
|
||||
decl_w_mask_fn(BF(dav1d_w_mask_422, neon));
|
||||
decl_w_mask_fn(BF(dav1d_w_mask_420, neon));
|
||||
|
||||
decl_warp8x8_fn(dav1d_warp_affine_8x8_8bpc_neon);
|
||||
decl_warp8x8t_fn(dav1d_warp_affine_8x8t_8bpc_neon);
|
||||
decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, neon));
|
||||
decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, neon));
|
||||
|
||||
void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
|
||||
#define init_mc_fn(type, name, suffix) \
|
||||
c->mc[type] = dav1d_put_##name##_8bpc_##suffix
|
||||
c->mc[type] = BF(dav1d_put_##name, suffix)
|
||||
#define init_mct_fn(type, name, suffix) \
|
||||
c->mct[type] = dav1d_prep_##name##_8bpc_##suffix
|
||||
c->mct[type] = BF(dav1d_prep_##name, suffix)
|
||||
const unsigned flags = dav1d_get_cpu_flags();
|
||||
|
||||
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
|
||||
|
||||
#if BITDEPTH == 8
|
||||
#if BITDEPTH == 8 || ARCH_AARCH64
|
||||
init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, neon);
|
||||
init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
|
||||
init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon);
|
||||
|
@ -98,16 +98,16 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
|
|||
init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, neon);
|
||||
init_mct_fn(FILTER_2D_BILINEAR, bilin, neon);
|
||||
|
||||
c->avg = dav1d_avg_8bpc_neon;
|
||||
c->w_avg = dav1d_w_avg_8bpc_neon;
|
||||
c->mask = dav1d_mask_8bpc_neon;
|
||||
c->blend = dav1d_blend_8bpc_neon;
|
||||
c->blend_h = dav1d_blend_h_8bpc_neon;
|
||||
c->blend_v = dav1d_blend_v_8bpc_neon;
|
||||
c->w_mask[0] = dav1d_w_mask_444_8bpc_neon;
|
||||
c->w_mask[1] = dav1d_w_mask_422_8bpc_neon;
|
||||
c->w_mask[2] = dav1d_w_mask_420_8bpc_neon;
|
||||
c->warp8x8 = dav1d_warp_affine_8x8_8bpc_neon;
|
||||
c->warp8x8t = dav1d_warp_affine_8x8t_8bpc_neon;
|
||||
c->avg = BF(dav1d_avg, neon);
|
||||
c->w_avg = BF(dav1d_w_avg, neon);
|
||||
c->mask = BF(dav1d_mask, neon);
|
||||
c->blend = BF(dav1d_blend, neon);
|
||||
c->blend_h = BF(dav1d_blend_h, neon);
|
||||
c->blend_v = BF(dav1d_blend_v, neon);
|
||||
c->w_mask[0] = BF(dav1d_w_mask_444, neon);
|
||||
c->w_mask[1] = BF(dav1d_w_mask_422, neon);
|
||||
c->w_mask[2] = BF(dav1d_w_mask_420, neon);
|
||||
c->warp8x8 = BF(dav1d_warp_affine_8x8, neon);
|
||||
c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon);
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -52,7 +52,7 @@ typedef const void *const_left_pixel_row_2px;
|
|||
// order to get access to pre-filter top pixels, use $top.
|
||||
#define decl_cdef_fn(name) \
|
||||
void (name)(pixel *dst, ptrdiff_t stride, const_left_pixel_row_2px left, \
|
||||
/*const*/ pixel *const top[2], int pri_strength, int sec_strength, \
|
||||
const pixel *top, int pri_strength, int sec_strength, \
|
||||
int dir, int damping, enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX)
|
||||
typedef decl_cdef_fn(*cdef_fn);
|
||||
|
||||
|
|
|
@ -39,24 +39,28 @@ enum Backup2x8Flags {
|
|||
BACKUP_2X8_UV = 1 << 1,
|
||||
};
|
||||
|
||||
static void backup2lines(pixel *const dst[3][2],
|
||||
/*const*/ pixel *const src[3],
|
||||
const ptrdiff_t src_stride[2], int y_off, int w,
|
||||
static void backup2lines(pixel *const dst[3], /*const*/ pixel *const src[3],
|
||||
const ptrdiff_t stride[2],
|
||||
const enum Dav1dPixelLayout layout)
|
||||
{
|
||||
pixel_copy(dst[0][0], src[0] + (y_off - 2) * PXSTRIDE(src_stride[0]), w);
|
||||
pixel_copy(dst[0][1], src[0] + (y_off - 1) * PXSTRIDE(src_stride[0]), w);
|
||||
const ptrdiff_t y_stride = PXSTRIDE(stride[0]);
|
||||
if (y_stride < 0)
|
||||
pixel_copy(dst[0] + y_stride, src[0] + 7 * y_stride, -2 * y_stride);
|
||||
else
|
||||
pixel_copy(dst[0], src[0] + 6 * y_stride, 2 * y_stride);
|
||||
|
||||
if (layout == DAV1D_PIXEL_LAYOUT_I400) return;
|
||||
const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
|
||||
|
||||
w >>= ss_hor;
|
||||
y_off >>= ss_ver;
|
||||
pixel_copy(dst[1][0], src[1] + (y_off - 2) * PXSTRIDE(src_stride[1]), w);
|
||||
pixel_copy(dst[1][1], src[1] + (y_off - 1) * PXSTRIDE(src_stride[1]), w);
|
||||
pixel_copy(dst[2][0], src[2] + (y_off - 2) * PXSTRIDE(src_stride[1]), w);
|
||||
pixel_copy(dst[2][1], src[2] + (y_off - 1) * PXSTRIDE(src_stride[1]), w);
|
||||
if (layout != DAV1D_PIXEL_LAYOUT_I400) {
|
||||
const ptrdiff_t uv_stride = PXSTRIDE(stride[1]);
|
||||
if (uv_stride < 0) {
|
||||
const int uv_off = layout == DAV1D_PIXEL_LAYOUT_I420 ? 3 : 7;
|
||||
pixel_copy(dst[1] + uv_stride, src[1] + uv_off * uv_stride, -2 * uv_stride);
|
||||
pixel_copy(dst[2] + uv_stride, src[2] + uv_off * uv_stride, -2 * uv_stride);
|
||||
} else {
|
||||
const int uv_off = layout == DAV1D_PIXEL_LAYOUT_I420 ? 2 : 6;
|
||||
pixel_copy(dst[1], src[1] + uv_off * uv_stride, 2 * uv_stride);
|
||||
pixel_copy(dst[2], src[2] + uv_off * uv_stride, 2 * uv_stride);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void backup2x8(pixel dst[3][8][2],
|
||||
|
@ -105,7 +109,6 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
|
|||
const int damping = f->frame_hdr->cdef.damping + bitdepth_min_8;
|
||||
const enum Dav1dPixelLayout layout = f->cur.p.layout;
|
||||
const int uv_idx = DAV1D_PIXEL_LAYOUT_I444 - layout;
|
||||
const int has_chroma = layout != DAV1D_PIXEL_LAYOUT_I400;
|
||||
const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
|
||||
|
||||
|
@ -114,19 +117,16 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
|
|||
const int by_idx = by & 30;
|
||||
if (by + 2 >= f->bh) edges &= ~CDEF_HAVE_BOTTOM;
|
||||
|
||||
if (edges & CDEF_HAVE_BOTTOM) {
|
||||
// backup pre-filter data for next iteration
|
||||
backup2lines(f->lf.cdef_line[!tf], ptrs, f->cur.stride,
|
||||
8, f->bw * 4, layout);
|
||||
}
|
||||
if (edges & CDEF_HAVE_BOTTOM) // backup pre-filter data for next iteration
|
||||
backup2lines(f->lf.cdef_line[!tf], ptrs, f->cur.stride, layout);
|
||||
|
||||
pixel lr_bak[2 /* idx */][3 /* plane */][8 /* y */][2 /* x */];
|
||||
ALIGN_STK_16(pixel, lr_bak, 2 /* idx */, [3 /* plane */][8 /* y */][2 /* x */]);
|
||||
pixel *iptrs[3] = { ptrs[0], ptrs[1], ptrs[2] };
|
||||
edges &= ~CDEF_HAVE_LEFT;
|
||||
edges |= CDEF_HAVE_RIGHT;
|
||||
enum Backup2x8Flags prev_flag = 0;
|
||||
for (int sbx = 0, last_skip = 1; sbx < sb64w; sbx++, edges |= CDEF_HAVE_LEFT) {
|
||||
const int sb128x = sbx >>1;
|
||||
const int sb128x = sbx >> 1;
|
||||
const int sb64_idx = ((by & sbsz) >> 3) + (sbx & 1);
|
||||
const int cdef_idx = lflvl[sb128x].cdef_idx[sb64_idx];
|
||||
if (cdef_idx == -1 ||
|
||||
|
@ -141,6 +141,16 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
|
|||
const int uv_lvl = f->frame_hdr->cdef.uv_strength[cdef_idx];
|
||||
const enum Backup2x8Flags flag = !!y_lvl + (!!uv_lvl << 1);
|
||||
|
||||
const int y_pri_lvl = (y_lvl >> 2) << bitdepth_min_8;
|
||||
int y_sec_lvl = y_lvl & 3;
|
||||
y_sec_lvl += y_sec_lvl == 3;
|
||||
y_sec_lvl <<= bitdepth_min_8;
|
||||
|
||||
const int uv_pri_lvl = (uv_lvl >> 2) << bitdepth_min_8;
|
||||
int uv_sec_lvl = uv_lvl & 3;
|
||||
uv_sec_lvl += uv_sec_lvl == 3;
|
||||
uv_sec_lvl <<= bitdepth_min_8;
|
||||
|
||||
pixel *bptrs[3] = { iptrs[0], iptrs[1], iptrs[2] };
|
||||
for (int bx = sbx * sbsz; bx < imin((sbx + 1) * sbsz, f->bw);
|
||||
bx += 2, edges |= CDEF_HAVE_LEFT)
|
||||
|
@ -169,41 +179,32 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
|
|||
backup2x8(lr_bak[!bit], bptrs, f->cur.stride, 8, layout, flag);
|
||||
}
|
||||
|
||||
// the actual filter
|
||||
const int y_pri_lvl = (y_lvl >> 2) << bitdepth_min_8;
|
||||
int y_sec_lvl = y_lvl & 3;
|
||||
y_sec_lvl += y_sec_lvl == 3;
|
||||
y_sec_lvl <<= bitdepth_min_8;
|
||||
const int uv_pri_lvl = (uv_lvl >> 2) << bitdepth_min_8;
|
||||
int uv_sec_lvl = uv_lvl & 3;
|
||||
uv_sec_lvl += uv_sec_lvl == 3;
|
||||
uv_sec_lvl <<= bitdepth_min_8;
|
||||
int dir;
|
||||
unsigned variance;
|
||||
const int dir = dsp->cdef.dir(bptrs[0], f->cur.stride[0],
|
||||
&variance HIGHBD_CALL_SUFFIX);
|
||||
if (y_lvl) {
|
||||
if (y_pri_lvl || uv_pri_lvl)
|
||||
dir = dsp->cdef.dir(bptrs[0], f->cur.stride[0],
|
||||
&variance HIGHBD_CALL_SUFFIX);
|
||||
|
||||
if (y_pri_lvl) {
|
||||
const int adj_y_pri_lvl = adjust_strength(y_pri_lvl, variance);
|
||||
if (adj_y_pri_lvl || y_sec_lvl)
|
||||
dsp->cdef.fb[0](bptrs[0], f->cur.stride[0], lr_bak[bit][0],
|
||||
&f->lf.cdef_line[tf][0][bx * 4],
|
||||
adj_y_pri_lvl, y_sec_lvl, dir,
|
||||
damping, edges HIGHBD_CALL_SUFFIX);
|
||||
} else if (y_sec_lvl)
|
||||
dsp->cdef.fb[0](bptrs[0], f->cur.stride[0], lr_bak[bit][0],
|
||||
(pixel *const [2]) {
|
||||
&f->lf.cdef_line[tf][0][0][bx * 4],
|
||||
&f->lf.cdef_line[tf][0][1][bx * 4],
|
||||
},
|
||||
adjust_strength(y_pri_lvl, variance),
|
||||
y_sec_lvl, y_pri_lvl ? dir : 0,
|
||||
&f->lf.cdef_line[tf][0][bx * 4],
|
||||
0, y_sec_lvl, 0,
|
||||
damping, edges HIGHBD_CALL_SUFFIX);
|
||||
}
|
||||
if (uv_lvl && has_chroma) {
|
||||
const int uvdir =
|
||||
f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I422 ? dir :
|
||||
((uint8_t[]) { 7, 0, 2, 4, 5, 6, 6, 6 })[dir];
|
||||
if (uv_lvl) {
|
||||
assert(layout != DAV1D_PIXEL_LAYOUT_I400);
|
||||
const int uvdir = uv_pri_lvl ? layout == DAV1D_PIXEL_LAYOUT_I422 ?
|
||||
((const uint8_t[]) { 7, 0, 2, 4, 5, 6, 6, 6 })[dir] : dir : 0;
|
||||
for (int pl = 1; pl <= 2; pl++) {
|
||||
dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.stride[1],
|
||||
lr_bak[bit][pl],
|
||||
(pixel *const [2]) {
|
||||
&f->lf.cdef_line[tf][pl][0][bx * 4 >> ss_hor],
|
||||
&f->lf.cdef_line[tf][pl][1][bx * 4 >> ss_hor],
|
||||
},
|
||||
uv_pri_lvl, uv_sec_lvl,
|
||||
uv_pri_lvl ? uvdir : 0,
|
||||
dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.stride[1], lr_bak[bit][pl],
|
||||
&f->lf.cdef_line[tf][pl][bx * 4 >> ss_hor],
|
||||
uv_pri_lvl, uv_sec_lvl, uvdir,
|
||||
damping - 1, edges HIGHBD_CALL_SUFFIX);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -32,29 +32,30 @@
|
|||
#include "common/intops.h"
|
||||
|
||||
#include "src/cdef.h"
|
||||
#include "src/tables.h"
|
||||
|
||||
static inline int constrain(const int diff, const int threshold,
|
||||
const int damping)
|
||||
const int shift)
|
||||
{
|
||||
if (!threshold) return 0;
|
||||
const int shift = imax(0, damping - ulog2(threshold));
|
||||
return apply_sign(imin(abs(diff), imax(0, threshold - (abs(diff) >> shift))),
|
||||
diff);
|
||||
const int adiff = abs(diff);
|
||||
return apply_sign(imin(adiff, imax(0, threshold - (adiff >> shift))), diff);
|
||||
}
|
||||
|
||||
static inline void fill(uint16_t *tmp, const ptrdiff_t stride,
|
||||
static inline void fill(int16_t *tmp, const ptrdiff_t stride,
|
||||
const int w, const int h)
|
||||
{
|
||||
/* Use a value that's a large positive number when interpreted as unsigned,
|
||||
* and a large negative number when interpreted as signed. */
|
||||
for (int y = 0; y < h; y++) {
|
||||
for (int x = 0; x < w; x++)
|
||||
tmp[x] = INT16_MAX;
|
||||
tmp[x] = INT16_MIN;
|
||||
tmp += stride;
|
||||
}
|
||||
}
|
||||
|
||||
static void padding(uint16_t *tmp, const ptrdiff_t tmp_stride,
|
||||
static void padding(int16_t *tmp, const ptrdiff_t tmp_stride,
|
||||
const pixel *src, const ptrdiff_t src_stride,
|
||||
const pixel (*left)[2], pixel *const top[2],
|
||||
const pixel (*left)[2], const pixel *top,
|
||||
const int w, const int h,
|
||||
const enum CdefEdgeFlags edges)
|
||||
{
|
||||
|
@ -77,9 +78,11 @@ static void padding(uint16_t *tmp, const ptrdiff_t tmp_stride,
|
|||
x_end -= 2;
|
||||
}
|
||||
|
||||
for (int y = y_start; y < 0; y++)
|
||||
for (int y = y_start; y < 0; y++) {
|
||||
for (int x = x_start; x < x_end; x++)
|
||||
tmp[x + y * tmp_stride] = top[y & 1][x];
|
||||
tmp[x + y * tmp_stride] = top[x];
|
||||
top += PXSTRIDE(src_stride);
|
||||
}
|
||||
for (int y = 0; y < h; y++)
|
||||
for (int x = x_start; x < 0; x++)
|
||||
tmp[x + y * tmp_stride] = left[y][2 + x];
|
||||
|
@ -93,75 +96,113 @@ static void padding(uint16_t *tmp, const ptrdiff_t tmp_stride,
|
|||
|
||||
static NOINLINE void
|
||||
cdef_filter_block_c(pixel *dst, const ptrdiff_t dst_stride,
|
||||
const pixel (*left)[2], /*const*/ pixel *const top[2],
|
||||
const int w, const int h, const int pri_strength,
|
||||
const int sec_strength, const int dir,
|
||||
const int damping, const enum CdefEdgeFlags edges
|
||||
HIGHBD_DECL_SUFFIX)
|
||||
const pixel (*left)[2], const pixel *const top,
|
||||
const int pri_strength, const int sec_strength,
|
||||
const int dir, const int damping, const int w, int h,
|
||||
const enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX)
|
||||
{
|
||||
static const int8_t cdef_directions[8 /* dir */][2 /* pass */] = {
|
||||
{ -1 * 12 + 1, -2 * 12 + 2 },
|
||||
{ 0 * 12 + 1, -1 * 12 + 2 },
|
||||
{ 0 * 12 + 1, 0 * 12 + 2 },
|
||||
{ 0 * 12 + 1, 1 * 12 + 2 },
|
||||
{ 1 * 12 + 1, 2 * 12 + 2 },
|
||||
{ 1 * 12 + 0, 2 * 12 + 1 },
|
||||
{ 1 * 12 + 0, 2 * 12 + 0 },
|
||||
{ 1 * 12 + 0, 2 * 12 - 1 }
|
||||
};
|
||||
const ptrdiff_t tmp_stride = 12;
|
||||
assert((w == 4 || w == 8) && (h == 4 || h == 8));
|
||||
uint16_t tmp_buf[144]; // 12*12 is the maximum value of tmp_stride * (h + 4)
|
||||
uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2;
|
||||
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
|
||||
const int pri_tap = 4 - ((pri_strength >> bitdepth_min_8) & 1);
|
||||
int16_t tmp_buf[144]; // 12*12 is the maximum value of tmp_stride * (h + 4)
|
||||
int16_t *tmp = tmp_buf + 2 * tmp_stride + 2;
|
||||
|
||||
padding(tmp, tmp_stride, dst, dst_stride, left, top, w, h, edges);
|
||||
|
||||
// run actual filter
|
||||
for (int y = 0; y < h; y++) {
|
||||
for (int x = 0; x < w; x++) {
|
||||
int sum = 0;
|
||||
const int px = dst[x];
|
||||
int max = px, min = px;
|
||||
int pri_tap_k = pri_tap;
|
||||
for (int k = 0; k < 2; k++) {
|
||||
const int off1 = cdef_directions[dir][k];
|
||||
const int p0 = tmp[x + off1];
|
||||
const int p1 = tmp[x - off1];
|
||||
sum += pri_tap_k * constrain(p0 - px, pri_strength, damping);
|
||||
sum += pri_tap_k * constrain(p1 - px, pri_strength, damping);
|
||||
// if pri_tap_k == 4 then it becomes 2 else it remains 3
|
||||
pri_tap_k -= (pri_tap_k << 1) - 6;
|
||||
if (p0 != INT16_MAX) max = imax(p0, max);
|
||||
if (p1 != INT16_MAX) max = imax(p1, max);
|
||||
min = imin(p0, min);
|
||||
min = imin(p1, min);
|
||||
const int off2 = cdef_directions[(dir + 2) & 7][k];
|
||||
const int s0 = tmp[x + off2];
|
||||
const int s1 = tmp[x - off2];
|
||||
const int off3 = cdef_directions[(dir + 6) & 7][k];
|
||||
const int s2 = tmp[x + off3];
|
||||
const int s3 = tmp[x - off3];
|
||||
if (s0 != INT16_MAX) max = imax(s0, max);
|
||||
if (s1 != INT16_MAX) max = imax(s1, max);
|
||||
if (s2 != INT16_MAX) max = imax(s2, max);
|
||||
if (s3 != INT16_MAX) max = imax(s3, max);
|
||||
min = imin(s0, min);
|
||||
min = imin(s1, min);
|
||||
min = imin(s2, min);
|
||||
min = imin(s3, min);
|
||||
// sec_tap starts at 2 and becomes 1
|
||||
const int sec_tap = 2 - k;
|
||||
sum += sec_tap * constrain(s0 - px, sec_strength, damping);
|
||||
sum += sec_tap * constrain(s1 - px, sec_strength, damping);
|
||||
sum += sec_tap * constrain(s2 - px, sec_strength, damping);
|
||||
sum += sec_tap * constrain(s3 - px, sec_strength, damping);
|
||||
}
|
||||
dst[x] = iclip(px + ((8 + sum - (sum < 0)) >> 4), min, max);
|
||||
if (pri_strength) {
|
||||
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
|
||||
const int pri_tap = 4 - ((pri_strength >> bitdepth_min_8) & 1);
|
||||
const int pri_shift = imax(0, damping - ulog2(pri_strength));
|
||||
if (sec_strength) {
|
||||
const int sec_shift = imax(0, damping - ulog2(sec_strength));
|
||||
do {
|
||||
for (int x = 0; x < w; x++) {
|
||||
const int px = dst[x];
|
||||
int sum = 0;
|
||||
int max = px, min = px;
|
||||
int pri_tap_k = pri_tap;
|
||||
for (int k = 0; k < 2; k++) {
|
||||
const int off1 = dav1d_cdef_directions[dir + 2][k]; // dir
|
||||
const int p0 = tmp[x + off1];
|
||||
const int p1 = tmp[x - off1];
|
||||
sum += pri_tap_k * constrain(p0 - px, pri_strength, pri_shift);
|
||||
sum += pri_tap_k * constrain(p1 - px, pri_strength, pri_shift);
|
||||
// if pri_tap_k == 4 then it becomes 2 else it remains 3
|
||||
pri_tap_k = (pri_tap_k & 3) | 2;
|
||||
min = umin(p0, min);
|
||||
max = imax(p0, max);
|
||||
min = umin(p1, min);
|
||||
max = imax(p1, max);
|
||||
const int off2 = dav1d_cdef_directions[dir + 4][k]; // dir + 2
|
||||
const int off3 = dav1d_cdef_directions[dir + 0][k]; // dir - 2
|
||||
const int s0 = tmp[x + off2];
|
||||
const int s1 = tmp[x - off2];
|
||||
const int s2 = tmp[x + off3];
|
||||
const int s3 = tmp[x - off3];
|
||||
// sec_tap starts at 2 and becomes 1
|
||||
const int sec_tap = 2 - k;
|
||||
sum += sec_tap * constrain(s0 - px, sec_strength, sec_shift);
|
||||
sum += sec_tap * constrain(s1 - px, sec_strength, sec_shift);
|
||||
sum += sec_tap * constrain(s2 - px, sec_strength, sec_shift);
|
||||
sum += sec_tap * constrain(s3 - px, sec_strength, sec_shift);
|
||||
min = umin(s0, min);
|
||||
max = imax(s0, max);
|
||||
min = umin(s1, min);
|
||||
max = imax(s1, max);
|
||||
min = umin(s2, min);
|
||||
max = imax(s2, max);
|
||||
min = umin(s3, min);
|
||||
max = imax(s3, max);
|
||||
}
|
||||
dst[x] = iclip(px + ((sum - (sum < 0) + 8) >> 4), min, max);
|
||||
}
|
||||
dst += PXSTRIDE(dst_stride);
|
||||
tmp += tmp_stride;
|
||||
} while (--h);
|
||||
} else { // pri_strength only
|
||||
do {
|
||||
for (int x = 0; x < w; x++) {
|
||||
const int px = dst[x];
|
||||
int sum = 0;
|
||||
int pri_tap_k = pri_tap;
|
||||
for (int k = 0; k < 2; k++) {
|
||||
const int off = dav1d_cdef_directions[dir + 2][k]; // dir
|
||||
const int p0 = tmp[x + off];
|
||||
const int p1 = tmp[x - off];
|
||||
sum += pri_tap_k * constrain(p0 - px, pri_strength, pri_shift);
|
||||
sum += pri_tap_k * constrain(p1 - px, pri_strength, pri_shift);
|
||||
pri_tap_k = (pri_tap_k & 3) | 2;
|
||||
}
|
||||
dst[x] = px + ((sum - (sum < 0) + 8) >> 4);
|
||||
}
|
||||
dst += PXSTRIDE(dst_stride);
|
||||
tmp += tmp_stride;
|
||||
} while (--h);
|
||||
}
|
||||
dst += PXSTRIDE(dst_stride);
|
||||
tmp += tmp_stride;
|
||||
} else { // sec_strength only
|
||||
assert(sec_strength);
|
||||
const int sec_shift = imax(0, damping - ulog2(sec_strength));
|
||||
do {
|
||||
for (int x = 0; x < w; x++) {
|
||||
const int px = dst[x];
|
||||
int sum = 0;
|
||||
for (int k = 0; k < 2; k++) {
|
||||
const int off1 = dav1d_cdef_directions[dir + 4][k]; // dir + 2
|
||||
const int off2 = dav1d_cdef_directions[dir + 0][k]; // dir - 2
|
||||
const int s0 = tmp[x + off1];
|
||||
const int s1 = tmp[x - off1];
|
||||
const int s2 = tmp[x + off2];
|
||||
const int s3 = tmp[x - off2];
|
||||
const int sec_tap = 2 - k;
|
||||
sum += sec_tap * constrain(s0 - px, sec_strength, sec_shift);
|
||||
sum += sec_tap * constrain(s1 - px, sec_strength, sec_shift);
|
||||
sum += sec_tap * constrain(s2 - px, sec_strength, sec_shift);
|
||||
sum += sec_tap * constrain(s3 - px, sec_strength, sec_shift);
|
||||
}
|
||||
dst[x] = px + ((sum - (sum < 0) + 8) >> 4);
|
||||
}
|
||||
dst += PXSTRIDE(dst_stride);
|
||||
tmp += tmp_stride;
|
||||
} while (--h);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -169,7 +210,7 @@ cdef_filter_block_c(pixel *dst, const ptrdiff_t dst_stride,
|
|||
static void cdef_filter_block_##w##x##h##_c(pixel *const dst, \
|
||||
const ptrdiff_t stride, \
|
||||
const pixel (*left)[2], \
|
||||
/*const*/ pixel *const top[2], \
|
||||
const pixel *const top, \
|
||||
const int pri_strength, \
|
||||
const int sec_strength, \
|
||||
const int dir, \
|
||||
|
@ -177,8 +218,8 @@ static void cdef_filter_block_##w##x##h##_c(pixel *const dst, \
|
|||
const enum CdefEdgeFlags edges \
|
||||
HIGHBD_DECL_SUFFIX) \
|
||||
{ \
|
||||
cdef_filter_block_c(dst, stride, left, top, w, h, pri_strength, sec_strength, \
|
||||
dir, damping, edges HIGHBD_TAIL_SUFFIX); \
|
||||
cdef_filter_block_c(dst, stride, left, top, pri_strength, sec_strength, \
|
||||
dir, damping, w, h, edges HIGHBD_TAIL_SUFFIX); \
|
||||
}
|
||||
|
||||
cdef_fn(4, 4);
|
||||
|
|
|
@ -30,24 +30,27 @@
|
|||
|
||||
#include "src/cpu.h"
|
||||
|
||||
static unsigned flags = 0;
|
||||
#if ARCH_X86
|
||||
/* Disable AVX-512 by default for the time being */
|
||||
static unsigned flags_mask = ~DAV1D_X86_CPU_FLAG_AVX512ICL;
|
||||
#else
|
||||
static unsigned flags_mask = -1;
|
||||
#endif
|
||||
|
||||
COLD void dav1d_init_cpu(void) {
|
||||
#if HAVE_ASM
|
||||
#if ARCH_AARCH64 || ARCH_ARM
|
||||
flags = dav1d_get_cpu_flags_arm();
|
||||
#elif ARCH_PPC64LE
|
||||
flags = dav1d_get_cpu_flags_ppc();
|
||||
#elif ARCH_X86
|
||||
flags = dav1d_get_cpu_flags_x86();
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
COLD unsigned dav1d_get_cpu_flags(void) {
|
||||
static unsigned flags;
|
||||
static uint8_t checked = 0;
|
||||
|
||||
if (!checked) {
|
||||
#if (ARCH_AARCH64 || ARCH_ARM) && HAVE_ASM
|
||||
flags = dav1d_get_cpu_flags_arm();
|
||||
#elif ARCH_PPC64LE && HAVE_ASM
|
||||
flags = dav1d_get_cpu_flags_ppc();
|
||||
#elif ARCH_X86 && HAVE_ASM
|
||||
flags = dav1d_get_cpu_flags_x86();
|
||||
#else
|
||||
flags = 0;
|
||||
#endif
|
||||
checked = 1;
|
||||
}
|
||||
return flags & flags_mask;
|
||||
}
|
||||
|
||||
|
|
|
@ -42,7 +42,8 @@
|
|||
#include "src/x86/cpu.h"
|
||||
#endif
|
||||
|
||||
void dav1d_init_cpu(void);
|
||||
unsigned dav1d_get_cpu_flags(void);
|
||||
DAV1D_API void dav1d_set_cpu_flags_mask(const unsigned mask);
|
||||
DAV1D_API void dav1d_set_cpu_flags_mask(unsigned mask);
|
||||
|
||||
#endif /* DAV1D_SRC_CPU_H */
|
||||
|
|
|
@ -627,8 +627,8 @@ static void read_vartx_tree(Dav1dTileContext *const t,
|
|||
// var-tx tree coding
|
||||
b->tx_split[0] = b->tx_split[1] = 0;
|
||||
b->max_ytx = dav1d_max_txfm_size_for_bs[bs][0];
|
||||
if (f->frame_hdr->segmentation.lossless[b->seg_id] ||
|
||||
b->max_ytx == TX_4X4)
|
||||
if (!b->skip && (f->frame_hdr->segmentation.lossless[b->seg_id] ||
|
||||
b->max_ytx == TX_4X4))
|
||||
{
|
||||
b->max_ytx = b->uvtx = TX_4X4;
|
||||
if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE) {
|
||||
|
@ -645,8 +645,6 @@ static void read_vartx_tree(Dav1dTileContext *const t,
|
|||
case_set(bh4, l., 1, by4);
|
||||
case_set(bw4, a->, 0, bx4);
|
||||
#undef set_ctx
|
||||
} else {
|
||||
assert(f->frame_hdr->txfm_mode == DAV1D_TX_LARGEST);
|
||||
}
|
||||
b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.layout];
|
||||
} else {
|
||||
|
@ -1878,10 +1876,11 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
b->inter_mode == (is_comp ? GLOBALMV_GLOBALMV : GLOBALMV);
|
||||
const uint8_t (*const lf_lvls)[8][2] = (const uint8_t (*)[8][2])
|
||||
&ts->lflvl[b->seg_id][0][b->ref[0] + 1][!is_globalmv];
|
||||
dav1d_create_lf_mask_inter(t->lf_mask, f->lf.level, f->b4_stride,
|
||||
lf_lvls, t->bx, t->by, f->w4, f->h4,
|
||||
b->skip, bs, b->tx_split, b->uvtx,
|
||||
f->cur.p.layout,
|
||||
dav1d_create_lf_mask_inter(t->lf_mask, f->lf.level, f->b4_stride, lf_lvls,
|
||||
t->bx, t->by, f->w4, f->h4, b->skip, bs,
|
||||
f->frame_hdr->segmentation.lossless[b->seg_id] ?
|
||||
(enum RectTxfmSize) TX_4X4 : b->max_ytx,
|
||||
b->tx_split, b->uvtx, f->cur.p.layout,
|
||||
&t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
|
||||
has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
|
||||
has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
|
||||
|
@ -2350,7 +2349,7 @@ static void setup_tile(Dav1dTileState *const ts,
|
|||
|
||||
// Reference Restoration Unit (used for exp coding)
|
||||
int sb_idx, unit_idx;
|
||||
if (f->frame_hdr->super_res.enabled) {
|
||||
if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
|
||||
// vertical components only
|
||||
sb_idx = (ts->tiling.row_start >> 5) * f->sr_sb128w;
|
||||
unit_idx = (ts->tiling.row_start & 16) >> 3;
|
||||
|
@ -2363,7 +2362,7 @@ static void setup_tile(Dav1dTileState *const ts,
|
|||
if (!((f->lf.restore_planes >> p) & 1U))
|
||||
continue;
|
||||
|
||||
if (f->frame_hdr->super_res.enabled) {
|
||||
if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
|
||||
const int ss_hor = p && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
|
||||
const int d = f->frame_hdr->super_res.width_scale_denominator;
|
||||
const int unit_size_log2 = f->frame_hdr->restoration.unit_size[!!p];
|
||||
|
@ -2543,7 +2542,7 @@ int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) {
|
|||
|
||||
const enum Dav1dRestorationType frame_type = f->frame_hdr->restoration.type[p];
|
||||
|
||||
if (f->frame_hdr->super_res.enabled) {
|
||||
if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
|
||||
const int w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
|
||||
const int n_units = imax(1, (w + half_unit) >> unit_size_log2);
|
||||
|
||||
|
@ -2763,24 +2762,42 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
|
|||
}
|
||||
|
||||
// update allocation of block contexts for above
|
||||
const int line_sz = (int)f->b4_stride << hbd;
|
||||
if (line_sz != f->lf.line_sz) {
|
||||
dav1d_freep_aligned(&f->lf.cdef_line[0][0][0]);
|
||||
uint8_t *ptr = dav1d_alloc_aligned(line_sz * 4 * 12, 32);
|
||||
const ptrdiff_t y_stride = f->cur.stride[0], uv_stride = f->cur.stride[1];
|
||||
if (y_stride != f->lf.cdef_line_sz[0] || uv_stride != f->lf.cdef_line_sz[1]) {
|
||||
dav1d_free_aligned(f->lf.cdef_line_buf);
|
||||
size_t alloc_sz = 64;
|
||||
alloc_sz += (y_stride < 0 ? -y_stride : y_stride ) * 4;
|
||||
alloc_sz += (uv_stride < 0 ? -uv_stride : uv_stride) * 8;
|
||||
uint8_t *ptr = f->lf.cdef_line_buf = dav1d_alloc_aligned(alloc_sz, 32);
|
||||
if (!ptr) {
|
||||
f->lf.line_sz = 0;
|
||||
f->lf.cdef_line_sz[0] = f->lf.cdef_line_sz[1] = 0;
|
||||
goto error;
|
||||
}
|
||||
|
||||
for (int pl = 0; pl <= 2; pl++) {
|
||||
f->lf.cdef_line[0][pl][0] = ptr + line_sz * 4 * 0;
|
||||
f->lf.cdef_line[0][pl][1] = ptr + line_sz * 4 * 1;
|
||||
f->lf.cdef_line[1][pl][0] = ptr + line_sz * 4 * 2;
|
||||
f->lf.cdef_line[1][pl][1] = ptr + line_sz * 4 * 3;
|
||||
ptr += line_sz * 4 * 4;
|
||||
ptr += 32;
|
||||
if (y_stride < 0) {
|
||||
f->lf.cdef_line[0][0] = ptr - y_stride * 1;
|
||||
f->lf.cdef_line[1][0] = ptr - y_stride * 3;
|
||||
ptr -= y_stride * 4;
|
||||
} else {
|
||||
f->lf.cdef_line[0][0] = ptr + y_stride * 0;
|
||||
f->lf.cdef_line[1][0] = ptr + y_stride * 2;
|
||||
ptr += y_stride * 4;
|
||||
}
|
||||
if (uv_stride < 0) {
|
||||
f->lf.cdef_line[0][1] = ptr - uv_stride * 1;
|
||||
f->lf.cdef_line[0][2] = ptr - uv_stride * 3;
|
||||
f->lf.cdef_line[1][1] = ptr - uv_stride * 5;
|
||||
f->lf.cdef_line[1][2] = ptr - uv_stride * 7;
|
||||
} else {
|
||||
f->lf.cdef_line[0][1] = ptr + uv_stride * 0;
|
||||
f->lf.cdef_line[0][2] = ptr + uv_stride * 2;
|
||||
f->lf.cdef_line[1][1] = ptr + uv_stride * 4;
|
||||
f->lf.cdef_line[1][2] = ptr + uv_stride * 6;
|
||||
}
|
||||
|
||||
f->lf.line_sz = line_sz;
|
||||
f->lf.cdef_line_sz[0] = (int) y_stride;
|
||||
f->lf.cdef_line_sz[1] = (int) uv_stride;
|
||||
}
|
||||
|
||||
const int lr_line_sz = ((f->sr_cur.p.p.w + 31) & ~31) << hbd;
|
||||
|
@ -2944,14 +2961,19 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
|
|||
}
|
||||
}
|
||||
|
||||
// init loopfilter pointers
|
||||
/* Init loopfilter pointers. Increasing NULL pointers is technically UB,
|
||||
* so just point the chroma pointers in 4:0:0 to the luma plane here to
|
||||
* avoid having additional in-loop branches in various places. We never
|
||||
* dereference those pointers so it doesn't really matter what they
|
||||
* point at, as long as the pointers are valid. */
|
||||
const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
|
||||
f->lf.mask_ptr = f->lf.mask;
|
||||
f->lf.p[0] = f->cur.data[0];
|
||||
f->lf.p[1] = f->cur.data[1];
|
||||
f->lf.p[2] = f->cur.data[2];
|
||||
f->lf.p[1] = f->cur.data[has_chroma ? 1 : 0];
|
||||
f->lf.p[2] = f->cur.data[has_chroma ? 2 : 0];
|
||||
f->lf.sr_p[0] = f->sr_cur.p.data[0];
|
||||
f->lf.sr_p[1] = f->sr_cur.p.data[1];
|
||||
f->lf.sr_p[2] = f->sr_cur.p.data[2];
|
||||
f->lf.sr_p[1] = f->sr_cur.p.data[has_chroma ? 1 : 0];
|
||||
f->lf.sr_p[2] = f->sr_cur.p.data[has_chroma ? 2 : 0];
|
||||
f->lf.tile_row = 1;
|
||||
|
||||
dav1d_cdf_thread_wait(&f->in_cdf);
|
||||
|
@ -3220,7 +3242,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
|
|||
dav1d_intra_pred_dsp_init_##bd##bpc(&dsp->ipred); \
|
||||
dav1d_itx_dsp_init_##bd##bpc(&dsp->itx); \
|
||||
dav1d_loop_filter_dsp_init_##bd##bpc(&dsp->lf); \
|
||||
dav1d_loop_restoration_dsp_init_##bd##bpc(&dsp->lr); \
|
||||
dav1d_loop_restoration_dsp_init_##bd##bpc(&dsp->lr, bpc); \
|
||||
dav1d_mc_dsp_init_##bd##bpc(&dsp->mc); \
|
||||
dav1d_film_grain_dsp_init_##bd##bpc(&dsp->fg); \
|
||||
break
|
||||
|
@ -3301,7 +3323,8 @@ int dav1d_submit_frame(Dav1dContext *const c) {
|
|||
}
|
||||
f->gmv_warp_allowed[i] = f->frame_hdr->gmv[i].type > DAV1D_WM_TYPE_TRANSLATION &&
|
||||
!f->frame_hdr->force_integer_mv &&
|
||||
!dav1d_get_shear_params(&f->frame_hdr->gmv[i]);
|
||||
!dav1d_get_shear_params(&f->frame_hdr->gmv[i]) &&
|
||||
!f->svc[i][0].scale;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -3338,14 +3361,14 @@ int dav1d_submit_frame(Dav1dContext *const c) {
|
|||
res = dav1d_thread_picture_alloc(c, f, bpc);
|
||||
if (res < 0) goto error;
|
||||
|
||||
if (f->frame_hdr->super_res.enabled) {
|
||||
if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
|
||||
res = dav1d_picture_alloc_copy(c, &f->cur, f->frame_hdr->width[0], &f->sr_cur.p);
|
||||
if (res < 0) goto error;
|
||||
} else {
|
||||
dav1d_picture_ref(&f->cur, &f->sr_cur.p);
|
||||
}
|
||||
|
||||
if (f->frame_hdr->super_res.enabled) {
|
||||
if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
|
||||
f->resize_step[0] = scale_fac(f->cur.p.w, f->sr_cur.p.p.w);
|
||||
const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
|
||||
const int in_cw = (f->cur.p.w + ss_hor) >> ss_hor;
|
||||
|
|
|
@ -651,8 +651,10 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
|
|||
|
||||
%if WIN64 == 0
|
||||
%macro WIN64_SPILL_XMM 1
|
||||
%assign xmm_regs_used %1
|
||||
%endmacro
|
||||
%macro WIN64_RESTORE_XMM 0
|
||||
%assign xmm_regs_used 0
|
||||
%endmacro
|
||||
%macro WIN64_PUSH_XMM 0
|
||||
%endmacro
|
||||
|
@ -824,33 +826,34 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
|
|||
|
||||
; cpuflags
|
||||
|
||||
%assign cpuflags_mmx (1<<0)
|
||||
%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx
|
||||
%assign cpuflags_3dnow (1<<2) | cpuflags_mmx
|
||||
%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
|
||||
%assign cpuflags_sse (1<<4) | cpuflags_mmx2
|
||||
%assign cpuflags_sse2 (1<<5) | cpuflags_sse
|
||||
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
|
||||
%assign cpuflags_lzcnt (1<<7) | cpuflags_sse2
|
||||
%assign cpuflags_sse3 (1<<8) | cpuflags_sse2
|
||||
%assign cpuflags_ssse3 (1<<9) | cpuflags_sse3
|
||||
%assign cpuflags_sse4 (1<<10)| cpuflags_ssse3
|
||||
%assign cpuflags_sse42 (1<<11)| cpuflags_sse4
|
||||
%assign cpuflags_aesni (1<<12)| cpuflags_sse42
|
||||
%assign cpuflags_gfni (1<<13)| cpuflags_sse42
|
||||
%assign cpuflags_avx (1<<14)| cpuflags_sse42
|
||||
%assign cpuflags_xop (1<<15)| cpuflags_avx
|
||||
%assign cpuflags_fma4 (1<<16)| cpuflags_avx
|
||||
%assign cpuflags_fma3 (1<<17)| cpuflags_avx
|
||||
%assign cpuflags_bmi1 (1<<18)| cpuflags_avx|cpuflags_lzcnt
|
||||
%assign cpuflags_bmi2 (1<<19)| cpuflags_bmi1
|
||||
%assign cpuflags_avx2 (1<<20)| cpuflags_fma3|cpuflags_bmi2
|
||||
%assign cpuflags_avx512 (1<<21)| cpuflags_avx2 ; F, CD, BW, DQ, VL
|
||||
%assign cpuflags_mmx (1<<0)
|
||||
%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx
|
||||
%assign cpuflags_3dnow (1<<2) | cpuflags_mmx
|
||||
%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
|
||||
%assign cpuflags_sse (1<<4) | cpuflags_mmx2
|
||||
%assign cpuflags_sse2 (1<<5) | cpuflags_sse
|
||||
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
|
||||
%assign cpuflags_lzcnt (1<<7) | cpuflags_sse2
|
||||
%assign cpuflags_sse3 (1<<8) | cpuflags_sse2
|
||||
%assign cpuflags_ssse3 (1<<9) | cpuflags_sse3
|
||||
%assign cpuflags_sse4 (1<<10) | cpuflags_ssse3
|
||||
%assign cpuflags_sse42 (1<<11) | cpuflags_sse4
|
||||
%assign cpuflags_aesni (1<<12) | cpuflags_sse42
|
||||
%assign cpuflags_gfni (1<<13) | cpuflags_sse42
|
||||
%assign cpuflags_avx (1<<14) | cpuflags_sse42
|
||||
%assign cpuflags_xop (1<<15) | cpuflags_avx
|
||||
%assign cpuflags_fma4 (1<<16) | cpuflags_avx
|
||||
%assign cpuflags_fma3 (1<<17) | cpuflags_avx
|
||||
%assign cpuflags_bmi1 (1<<18) | cpuflags_avx|cpuflags_lzcnt
|
||||
%assign cpuflags_bmi2 (1<<19) | cpuflags_bmi1
|
||||
%assign cpuflags_avx2 (1<<20) | cpuflags_fma3|cpuflags_bmi2
|
||||
%assign cpuflags_avx512 (1<<21) | cpuflags_avx2 ; F, CD, BW, DQ, VL
|
||||
%assign cpuflags_avx512icl (1<<22) | cpuflags_avx512|cpuflags_gfni ; VNNI, IFMA, VBMI, VBMI2, VPOPCNTDQ, BITALG, VAES, VPCLMULQDQ
|
||||
|
||||
%assign cpuflags_cache32 (1<<22)
|
||||
%assign cpuflags_cache64 (1<<23)
|
||||
%assign cpuflags_aligned (1<<24) ; not a cpu feature, but a function variant
|
||||
%assign cpuflags_atom (1<<25)
|
||||
%assign cpuflags_cache32 (1<<23)
|
||||
%assign cpuflags_cache64 (1<<24)
|
||||
%assign cpuflags_aligned (1<<25) ; not a cpu feature, but a function variant
|
||||
%assign cpuflags_atom (1<<26)
|
||||
|
||||
; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
|
||||
%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
|
||||
|
|
|
@ -122,17 +122,32 @@ void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp,
|
|||
// TODO: eliminate in favor of per-plane refs
|
||||
assert(out->stride[0] == in->stride[0]);
|
||||
if (!data->num_y_points) {
|
||||
memcpy(out->data[0], in->data[0], out->p.h * out->stride[0]);
|
||||
const ptrdiff_t stride = out->stride[0];
|
||||
const ptrdiff_t sz = out->p.h * stride;
|
||||
if (sz < 0)
|
||||
memcpy((uint8_t*) out->data[0] + sz - stride,
|
||||
(uint8_t*) in->data[0] + sz - stride, -sz);
|
||||
else
|
||||
memcpy(out->data[0], in->data[0], sz);
|
||||
}
|
||||
|
||||
if (in->p.layout != DAV1D_PIXEL_LAYOUT_I400) {
|
||||
if (in->p.layout != DAV1D_PIXEL_LAYOUT_I400 && !data->chroma_scaling_from_luma) {
|
||||
assert(out->stride[1] == in->stride[1]);
|
||||
for (int i = 0; i < 2; i++) {
|
||||
if (!data->num_uv_points[i] && !data->chroma_scaling_from_luma) {
|
||||
const int suby = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
memcpy(out->data[1+i], in->data[1+i],
|
||||
(out->p.h >> suby) * out->stride[1]);
|
||||
}
|
||||
const int ss_ver = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
const ptrdiff_t stride = out->stride[1];
|
||||
const ptrdiff_t sz = (out->p.h * stride) >> ss_ver;
|
||||
if (sz < 0) {
|
||||
if (!data->num_uv_points[0])
|
||||
memcpy((uint8_t*) out->data[1] + sz - stride,
|
||||
(uint8_t*) in->data[1] + sz - stride, -sz);
|
||||
if (!data->num_uv_points[1])
|
||||
memcpy((uint8_t*) out->data[2] + sz - stride,
|
||||
(uint8_t*) in->data[2] + sz - stride, -sz);
|
||||
} else {
|
||||
if (!data->num_uv_points[0])
|
||||
memcpy(out->data[1], in->data[1], sz);
|
||||
if (!data->num_uv_points[1])
|
||||
memcpy(out->data[2], in->data[2], sz);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -43,7 +43,7 @@ static inline int get_random_number(const int bits, unsigned *const state) {
|
|||
return (*state >> (16 - bits)) & ((1 << bits) - 1);
|
||||
}
|
||||
|
||||
static inline int round2(const int x, const int shift) {
|
||||
static inline int round2(const int x, const uint64_t shift) {
|
||||
return (x + ((1 << shift) >> 1)) >> shift;
|
||||
}
|
||||
|
||||
|
|
|
@ -216,12 +216,14 @@ struct Dav1dFrameContext {
|
|||
Av1Filter *mask;
|
||||
Av1Restoration *lr_mask;
|
||||
int top_pre_cdef_toggle;
|
||||
int mask_sz /* w*h */, lr_mask_sz, line_sz /* w */, lr_line_sz, re_sz /* h */;
|
||||
int mask_sz /* w*h */, lr_mask_sz, cdef_line_sz[2] /* stride */;
|
||||
int lr_line_sz, re_sz /* h */;
|
||||
ALIGN(Av1FilterLUT lim_lut, 16);
|
||||
int last_sharpness;
|
||||
uint8_t lvl[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */];
|
||||
uint8_t *tx_lpf_right_edge[2];
|
||||
pixel *cdef_line[2 /* pre, post */][3 /* plane */][2 /* y */];
|
||||
uint8_t *cdef_line_buf;
|
||||
pixel *cdef_line[2 /* pre, post */][3 /* plane */];
|
||||
pixel *lr_lpf_line[3 /* plane */];
|
||||
|
||||
// in-loop filter per-frame state keeping
|
||||
|
@ -288,7 +290,7 @@ struct Dav1dTileContext {
|
|||
uint16_t al_pal[2 /* a/l */][32 /* bx/y4 */][3 /* plane */][8 /* palette_idx */];
|
||||
uint8_t pal_sz_uv[2 /* a/l */][32 /* bx4/by4 */];
|
||||
uint8_t txtp_map[32 * 32]; // inter-only
|
||||
ALIGN(union, 32) {
|
||||
ALIGN(union, 64) {
|
||||
struct {
|
||||
union {
|
||||
uint8_t lap_8bpc [128 * 32];
|
||||
|
|
|
@ -66,7 +66,7 @@
|
|||
* range, in the following order:
|
||||
* - [0] will be the top/left edge pixel;
|
||||
* - [1..w] will be the top edge pixels (1 being left-most, w being right-most);
|
||||
* - [w+1..w*w] will be the top/right edge pixels;
|
||||
* - [w+1..2*w] will be the top/right edge pixels;
|
||||
* - [-1..-w] will be the left edge pixels (-1 being top-most, -w being bottom-
|
||||
* most);
|
||||
* - [-w-1..-2*w] will be the bottom/left edge pixels.
|
||||
|
|
|
@ -1,6 +1,6 @@
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Two Orioles, LLC
 * Copyright © 2018-2019, VideoLAN and dav1d authors
 * Copyright © 2018-2019, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
@ -30,7 +30,9 @@
#include <stddef.h>
#include <stdint.h>

#include "common/attributes.h"
#include "common/intops.h"

#include "src/itx_1d.h"

#define CLIP(a) iclip(a, min, max)
@ -60,41 +62,62 @@
 * wrap around.
 */

static void NOINLINE
inv_dct4_1d(const coef *const in, const ptrdiff_t in_s,
            coef *const out, const ptrdiff_t out_s, const int max)
static NOINLINE void
inv_dct4_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
                       const int min, const int max, const int tx64)
{
    const int min = -max - 1;
    const int in0 = in[0 * in_s], in1 = in[1 * in_s];
    const int in2 = in[2 * in_s], in3 = in[3 * in_s];
    assert(stride > 0);
    const int in0 = c[0 * stride], in1 = c[1 * stride];

    int t0 = ((in0 + in2) * 181 + 128) >> 8;
    int t1 = ((in0 - in2) * 181 + 128) >> 8;
    int t2 = ((in1 * 1567 - in3 * (3784 - 4096) + 2048) >> 12) - in3;
    int t3 = ((in1 * (3784 - 4096) + in3 * 1567 + 2048) >> 12) + in1;
    int t0, t1, t2, t3;
    if (tx64) {
        t0 = t1 = (in0 * 181 + 128) >> 8;
        t2 = (in1 * 1567 + 2048) >> 12;
        t3 = (in1 * 3784 + 2048) >> 12;
    } else {
        const int in2 = c[2 * stride], in3 = c[3 * stride];

    out[0 * out_s] = CLIP(t0 + t3);
    out[1 * out_s] = CLIP(t1 + t2);
    out[2 * out_s] = CLIP(t1 - t2);
    out[3 * out_s] = CLIP(t0 - t3);
        t0 = ((in0 + in2) * 181 + 128) >> 8;
        t1 = ((in0 - in2) * 181 + 128) >> 8;
        t2 = ((in1 * 1567 - in3 * (3784 - 4096) + 2048) >> 12) - in3;
        t3 = ((in1 * (3784 - 4096) + in3 * 1567 + 2048) >> 12) + in1;
    }

    c[0 * stride] = CLIP(t0 + t3);
    c[1 * stride] = CLIP(t1 + t2);
    c[2 * stride] = CLIP(t1 - t2);
    c[3 * stride] = CLIP(t0 - t3);
}

static void NOINLINE
inv_dct8_1d(const coef *const in, const ptrdiff_t in_s,
            coef *const out, const ptrdiff_t out_s, const int max)
void dav1d_inv_dct4_1d_c(int32_t *const c, const ptrdiff_t stride,
                         const int min, const int max)
{
    const int min = -max - 1;
    coef tmp[4];
    inv_dct4_1d_internal_c(c, stride, min, max, 0);
}

    inv_dct4_1d(in, in_s * 2, tmp, 1, max);
static NOINLINE void
inv_dct8_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
                       const int min, const int max, const int tx64)
{
    assert(stride > 0);
    inv_dct4_1d_internal_c(c, stride << 1, min, max, tx64);

    const int in1 = in[1 * in_s], in3 = in[3 * in_s];
    const int in5 = in[5 * in_s], in7 = in[7 * in_s];
    const int in1 = c[1 * stride], in3 = c[3 * stride];

    int t4a = ((in1 * 799 - in7 * (4017 - 4096) + 2048) >> 12) - in7;
    int t5a = (in5 * 1703 - in3 * 1138 + 1024) >> 11;
    int t6a = (in5 * 1138 + in3 * 1703 + 1024) >> 11;
    int t7a = ((in1 * (4017 - 4096) + in7 * 799 + 2048) >> 12) + in1;
    int t4a, t5a, t6a, t7a;
    if (tx64) {
        t4a = (in1 * 799 + 2048) >> 12;
        t5a = (in3 * -2276 + 2048) >> 12;
        t6a = (in3 * 3406 + 2048) >> 12;
        t7a = (in1 * 4017 + 2048) >> 12;
    } else {
        const int in5 = c[5 * stride], in7 = c[7 * stride];

        t4a = ((in1 * 799 - in7 * (4017 - 4096) + 2048) >> 12) - in7;
        t5a = (in5 * 1703 - in3 * 1138 + 1024) >> 11;
        t6a = (in5 * 1138 + in3 * 1703 + 1024) >> 11;
        t7a = ((in1 * (4017 - 4096) + in7 * 799 + 2048) >> 12) + in1;
    }

    int t4 = CLIP(t4a + t5a);
    t5a = CLIP(t4a - t5a);
@ -104,38 +127,60 @@ inv_dct8_1d(const coef *const in, const ptrdiff_t in_s,
    int t5 = ((t6a - t5a) * 181 + 128) >> 8;
    int t6 = ((t6a + t5a) * 181 + 128) >> 8;

    out[0 * out_s] = CLIP(tmp[0] + t7);
    out[1 * out_s] = CLIP(tmp[1] + t6);
    out[2 * out_s] = CLIP(tmp[2] + t5);
    out[3 * out_s] = CLIP(tmp[3] + t4);
    out[4 * out_s] = CLIP(tmp[3] - t4);
    out[5 * out_s] = CLIP(tmp[2] - t5);
    out[6 * out_s] = CLIP(tmp[1] - t6);
    out[7 * out_s] = CLIP(tmp[0] - t7);
    const int t0 = c[0 * stride];
    const int t1 = c[2 * stride];
    const int t2 = c[4 * stride];
    const int t3 = c[6 * stride];

    c[0 * stride] = CLIP(t0 + t7);
    c[1 * stride] = CLIP(t1 + t6);
    c[2 * stride] = CLIP(t2 + t5);
    c[3 * stride] = CLIP(t3 + t4);
    c[4 * stride] = CLIP(t3 - t4);
    c[5 * stride] = CLIP(t2 - t5);
    c[6 * stride] = CLIP(t1 - t6);
    c[7 * stride] = CLIP(t0 - t7);
}

static void NOINLINE
inv_dct16_1d(const coef *const in, const ptrdiff_t in_s,
             coef *const out, const ptrdiff_t out_s, const int max)
void dav1d_inv_dct8_1d_c(int32_t *const c, const ptrdiff_t stride,
                         const int min, const int max)
{
    const int min = -max - 1;
    coef tmp[8];
    inv_dct8_1d_internal_c(c, stride, min, max, 0);
}

    inv_dct8_1d(in, in_s * 2, tmp, 1, max);
static NOINLINE void
inv_dct16_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
                        const int min, const int max, int tx64)
{
    assert(stride > 0);
    inv_dct8_1d_internal_c(c, stride << 1, min, max, tx64);

    const int in1 = in[ 1 * in_s], in3 = in[ 3 * in_s];
    const int in5 = in[ 5 * in_s], in7 = in[ 7 * in_s];
    const int in9 = in[ 9 * in_s], in11 = in[11 * in_s];
    const int in13 = in[13 * in_s], in15 = in[15 * in_s];
    const int in1 = c[1 * stride], in3 = c[3 * stride];
    const int in5 = c[5 * stride], in7 = c[7 * stride];

    int t8a = ((in1 * 401 - in15 * (4076 - 4096) + 2048) >> 12) - in15;
    int t15a = ((in1 * (4076 - 4096) + in15 * 401 + 2048) >> 12) + in1;
    int t9a = (in9 * 1583 - in7 * 1299 + 1024) >> 11;
    int t14a = (in9 * 1299 + in7 * 1583 + 1024) >> 11;
    int t10a = ((in5 * 1931 - in11 * (3612 - 4096) + 2048) >> 12) - in11;
    int t13a = ((in5 * (3612 - 4096) + in11 * 1931 + 2048) >> 12) + in5;
    int t11a = ((in13 * (3920 - 4096) - in3 * 1189 + 2048) >> 12) + in13;
    int t12a = ((in13 * 1189 + in3 * (3920 - 4096) + 2048) >> 12) + in3;
    int t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;
    if (tx64) {
        t8a = (in1 * 401 + 2048) >> 12;
        t9a = (in7 * -2598 + 2048) >> 12;
        t10a = (in5 * 1931 + 2048) >> 12;
        t11a = (in3 * -1189 + 2048) >> 12;
        t12a = (in3 * 3920 + 2048) >> 12;
        t13a = (in5 * 3612 + 2048) >> 12;
        t14a = (in7 * 3166 + 2048) >> 12;
        t15a = (in1 * 4076 + 2048) >> 12;
    } else {
        const int in9 = c[ 9 * stride], in11 = c[11 * stride];
        const int in13 = c[13 * stride], in15 = c[15 * stride];

        t8a = ((in1 * 401 - in15 * (4076 - 4096) + 2048) >> 12) - in15;
        t9a = (in9 * 1583 - in7 * 1299 + 1024) >> 11;
        t10a = ((in5 * 1931 - in11 * (3612 - 4096) + 2048) >> 12) - in11;
        t11a = ((in13 * (3920 - 4096) - in3 * 1189 + 2048) >> 12) + in13;
        t12a = ((in13 * 1189 + in3 * (3920 - 4096) + 2048) >> 12) + in3;
        t13a = ((in5 * (3612 - 4096) + in11 * 1931 + 2048) >> 12) + in5;
        t14a = (in9 * 1299 + in7 * 1583 + 1024) >> 11;
        t15a = ((in1 * (4076 - 4096) + in15 * 401 + 2048) >> 12) + in1;
    }

    int t8 = CLIP(t8a + t9a);
    int t9 = CLIP(t8a - t9a);
@ -165,58 +210,93 @@ inv_dct16_1d(const coef *const in, const ptrdiff_t in_s,
    t11 = ((t12a - t11a) * 181 + 128) >> 8;
    t12 = ((t12a + t11a) * 181 + 128) >> 8;

    out[ 0 * out_s] = CLIP(tmp[0] + t15a);
    out[ 1 * out_s] = CLIP(tmp[1] + t14);
    out[ 2 * out_s] = CLIP(tmp[2] + t13a);
    out[ 3 * out_s] = CLIP(tmp[3] + t12);
    out[ 4 * out_s] = CLIP(tmp[4] + t11);
    out[ 5 * out_s] = CLIP(tmp[5] + t10a);
    out[ 6 * out_s] = CLIP(tmp[6] + t9);
    out[ 7 * out_s] = CLIP(tmp[7] + t8a);
    out[ 8 * out_s] = CLIP(tmp[7] - t8a);
    out[ 9 * out_s] = CLIP(tmp[6] - t9);
    out[10 * out_s] = CLIP(tmp[5] - t10a);
    out[11 * out_s] = CLIP(tmp[4] - t11);
    out[12 * out_s] = CLIP(tmp[3] - t12);
    out[13 * out_s] = CLIP(tmp[2] - t13a);
    out[14 * out_s] = CLIP(tmp[1] - t14);
    out[15 * out_s] = CLIP(tmp[0] - t15a);
    const int t0 = c[ 0 * stride];
    const int t1 = c[ 2 * stride];
    const int t2 = c[ 4 * stride];
    const int t3 = c[ 6 * stride];
    const int t4 = c[ 8 * stride];
    const int t5 = c[10 * stride];
    const int t6 = c[12 * stride];
    const int t7 = c[14 * stride];

    c[ 0 * stride] = CLIP(t0 + t15a);
    c[ 1 * stride] = CLIP(t1 + t14);
    c[ 2 * stride] = CLIP(t2 + t13a);
    c[ 3 * stride] = CLIP(t3 + t12);
    c[ 4 * stride] = CLIP(t4 + t11);
    c[ 5 * stride] = CLIP(t5 + t10a);
    c[ 6 * stride] = CLIP(t6 + t9);
    c[ 7 * stride] = CLIP(t7 + t8a);
    c[ 8 * stride] = CLIP(t7 - t8a);
    c[ 9 * stride] = CLIP(t6 - t9);
    c[10 * stride] = CLIP(t5 - t10a);
    c[11 * stride] = CLIP(t4 - t11);
    c[12 * stride] = CLIP(t3 - t12);
    c[13 * stride] = CLIP(t2 - t13a);
    c[14 * stride] = CLIP(t1 - t14);
    c[15 * stride] = CLIP(t0 - t15a);
}

static void NOINLINE
inv_dct32_1d(const coef *const in, const ptrdiff_t in_s,
             coef *const out, const ptrdiff_t out_s, const int max)
void dav1d_inv_dct16_1d_c(int32_t *const c, const ptrdiff_t stride,
                          const int min, const int max)
{
    const int min = -max - 1;
    coef tmp[16];
    inv_dct16_1d_internal_c(c, stride, min, max, 0);
}

    inv_dct16_1d(in, in_s * 2, tmp, 1, max);
static NOINLINE void
inv_dct32_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
                        const int min, const int max, const int tx64)
{
    assert(stride > 0);
    inv_dct16_1d_internal_c(c, stride << 1, min, max, tx64);

    const int in1 = in[ 1 * in_s], in3 = in[ 3 * in_s];
    const int in5 = in[ 5 * in_s], in7 = in[ 7 * in_s];
    const int in9 = in[ 9 * in_s], in11 = in[11 * in_s];
    const int in13 = in[13 * in_s], in15 = in[15 * in_s];
    const int in17 = in[17 * in_s], in19 = in[19 * in_s];
    const int in21 = in[21 * in_s], in23 = in[23 * in_s];
    const int in25 = in[25 * in_s], in27 = in[27 * in_s];
    const int in29 = in[29 * in_s], in31 = in[31 * in_s];
    const int in1 = c[ 1 * stride], in3 = c[ 3 * stride];
    const int in5 = c[ 5 * stride], in7 = c[ 7 * stride];
    const int in9 = c[ 9 * stride], in11 = c[11 * stride];
    const int in13 = c[13 * stride], in15 = c[15 * stride];

    int t16a = ((in1 * 201 - in31 * (4091 - 4096) + 2048) >> 12) - in31;
    int t31a = ((in1 * (4091 - 4096) + in31 * 201 + 2048) >> 12) + in1;
    int t17a = ((in17 * (3035 - 4096) - in15 * 2751 + 2048) >> 12) + in17;
    int t30a = ((in17 * 2751 + in15 * (3035 - 4096) + 2048) >> 12) + in15;
    int t18a = ((in9 * 1751 - in23 * (3703 - 4096) + 2048) >> 12) - in23;
    int t29a = ((in9 * (3703 - 4096) + in23 * 1751 + 2048) >> 12) + in9;
    int t19a = ((in25 * (3857 - 4096) - in7 * 1380 + 2048) >> 12) + in25;
    int t28a = ((in25 * 1380 + in7 * (3857 - 4096) + 2048) >> 12) + in7;
    int t20a = ((in5 * 995 - in27 * (3973 - 4096) + 2048) >> 12) - in27;
    int t27a = ((in5 * (3973 - 4096) + in27 * 995 + 2048) >> 12) + in5;
    int t21a = ((in21 * (3513 - 4096) - in11 * 2106 + 2048) >> 12) + in21;
    int t26a = ((in21 * 2106 + in11 * (3513 - 4096) + 2048) >> 12) + in11;
    int t22a = (in13 * 1220 - in19 * 1645 + 1024) >> 11;
    int t25a = (in13 * 1645 + in19 * 1220 + 1024) >> 11;
    int t23a = ((in29 * (4052 - 4096) - in3 * 601 + 2048) >> 12) + in29;
    int t24a = ((in29 * 601 + in3 * (4052 - 4096) + 2048) >> 12) + in3;
    int t16a, t17a, t18a, t19a, t20a, t21a, t22a, t23a;
    int t24a, t25a, t26a, t27a, t28a, t29a, t30a, t31a;
    if (tx64) {
        t16a = (in1 * 201 + 2048) >> 12;
        t17a = (in15 * -2751 + 2048) >> 12;
        t18a = (in9 * 1751 + 2048) >> 12;
        t19a = (in7 * -1380 + 2048) >> 12;
        t20a = (in5 * 995 + 2048) >> 12;
        t21a = (in11 * -2106 + 2048) >> 12;
        t22a = (in13 * 2440 + 2048) >> 12;
        t23a = (in3 * -601 + 2048) >> 12;
        t24a = (in3 * 4052 + 2048) >> 12;
        t25a = (in13 * 3290 + 2048) >> 12;
        t26a = (in11 * 3513 + 2048) >> 12;
        t27a = (in5 * 3973 + 2048) >> 12;
        t28a = (in7 * 3857 + 2048) >> 12;
        t29a = (in9 * 3703 + 2048) >> 12;
        t30a = (in15 * 3035 + 2048) >> 12;
        t31a = (in1 * 4091 + 2048) >> 12;
    } else {
        const int in17 = c[17 * stride], in19 = c[19 * stride];
        const int in21 = c[21 * stride], in23 = c[23 * stride];
        const int in25 = c[25 * stride], in27 = c[27 * stride];
        const int in29 = c[29 * stride], in31 = c[31 * stride];

        t16a = ((in1 * 201 - in31 * (4091 - 4096) + 2048) >> 12) - in31;
        t17a = ((in17 * (3035 - 4096) - in15 * 2751 + 2048) >> 12) + in17;
        t18a = ((in9 * 1751 - in23 * (3703 - 4096) + 2048) >> 12) - in23;
        t19a = ((in25 * (3857 - 4096) - in7 * 1380 + 2048) >> 12) + in25;
        t20a = ((in5 * 995 - in27 * (3973 - 4096) + 2048) >> 12) - in27;
        t21a = ((in21 * (3513 - 4096) - in11 * 2106 + 2048) >> 12) + in21;
        t22a = (in13 * 1220 - in19 * 1645 + 1024) >> 11;
        t23a = ((in29 * (4052 - 4096) - in3 * 601 + 2048) >> 12) + in29;
        t24a = ((in29 * 601 + in3 * (4052 - 4096) + 2048) >> 12) + in3;
        t25a = (in13 * 1645 + in19 * 1220 + 1024) >> 11;
        t26a = ((in21 * 2106 + in11 * (3513 - 4096) + 2048) >> 12) + in11;
        t27a = ((in5 * (3973 - 4096) + in27 * 995 + 2048) >> 12) + in5;
        t28a = ((in25 * 1380 + in7 * (3857 - 4096) + 2048) >> 12) + in7;
        t29a = ((in9 * (3703 - 4096) + in23 * 1751 + 2048) >> 12) + in9;
        t30a = ((in17 * 2751 + in15 * (3035 - 4096) + 2048) >> 12) + in15;
        t31a = ((in1 * (4091 - 4096) + in31 * 201 + 2048) >> 12) + in1;
    }

    int t16 = CLIP(t16a + t17a);
    int t17 = CLIP(t16a - t17a);
@ -296,98 +376,110 @@ inv_dct32_1d(const coef *const in, const ptrdiff_t in_s,
    t23a = ((t24 - t23 ) * 181 + 128) >> 8;
    t24a = ((t24 + t23 ) * 181 + 128) >> 8;

    out[ 0 * out_s] = CLIP(tmp[ 0] + t31);
    out[ 1 * out_s] = CLIP(tmp[ 1] + t30a);
    out[ 2 * out_s] = CLIP(tmp[ 2] + t29);
    out[ 3 * out_s] = CLIP(tmp[ 3] + t28a);
    out[ 4 * out_s] = CLIP(tmp[ 4] + t27);
    out[ 5 * out_s] = CLIP(tmp[ 5] + t26a);
    out[ 6 * out_s] = CLIP(tmp[ 6] + t25);
    out[ 7 * out_s] = CLIP(tmp[ 7] + t24a);
    out[ 8 * out_s] = CLIP(tmp[ 8] + t23a);
    out[ 9 * out_s] = CLIP(tmp[ 9] + t22);
    out[10 * out_s] = CLIP(tmp[10] + t21a);
    out[11 * out_s] = CLIP(tmp[11] + t20);
    out[12 * out_s] = CLIP(tmp[12] + t19a);
    out[13 * out_s] = CLIP(tmp[13] + t18);
    out[14 * out_s] = CLIP(tmp[14] + t17a);
    out[15 * out_s] = CLIP(tmp[15] + t16);
    out[16 * out_s] = CLIP(tmp[15] - t16);
    out[17 * out_s] = CLIP(tmp[14] - t17a);
    out[18 * out_s] = CLIP(tmp[13] - t18);
    out[19 * out_s] = CLIP(tmp[12] - t19a);
    out[20 * out_s] = CLIP(tmp[11] - t20);
    out[21 * out_s] = CLIP(tmp[10] - t21a);
    out[22 * out_s] = CLIP(tmp[ 9] - t22);
    out[23 * out_s] = CLIP(tmp[ 8] - t23a);
    out[24 * out_s] = CLIP(tmp[ 7] - t24a);
    out[25 * out_s] = CLIP(tmp[ 6] - t25);
    out[26 * out_s] = CLIP(tmp[ 5] - t26a);
    out[27 * out_s] = CLIP(tmp[ 4] - t27);
    out[28 * out_s] = CLIP(tmp[ 3] - t28a);
    out[29 * out_s] = CLIP(tmp[ 2] - t29);
    out[30 * out_s] = CLIP(tmp[ 1] - t30a);
    out[31 * out_s] = CLIP(tmp[ 0] - t31);
    const int t0 = c[ 0 * stride];
    const int t1 = c[ 2 * stride];
    const int t2 = c[ 4 * stride];
    const int t3 = c[ 6 * stride];
    const int t4 = c[ 8 * stride];
    const int t5 = c[10 * stride];
    const int t6 = c[12 * stride];
    const int t7 = c[14 * stride];
    const int t8 = c[16 * stride];
    const int t9 = c[18 * stride];
    const int t10 = c[20 * stride];
    const int t11 = c[22 * stride];
    const int t12 = c[24 * stride];
    const int t13 = c[26 * stride];
    const int t14 = c[28 * stride];
    const int t15 = c[30 * stride];

    c[ 0 * stride] = CLIP(t0 + t31);
    c[ 1 * stride] = CLIP(t1 + t30a);
    c[ 2 * stride] = CLIP(t2 + t29);
    c[ 3 * stride] = CLIP(t3 + t28a);
    c[ 4 * stride] = CLIP(t4 + t27);
    c[ 5 * stride] = CLIP(t5 + t26a);
    c[ 6 * stride] = CLIP(t6 + t25);
    c[ 7 * stride] = CLIP(t7 + t24a);
    c[ 8 * stride] = CLIP(t8 + t23a);
    c[ 9 * stride] = CLIP(t9 + t22);
    c[10 * stride] = CLIP(t10 + t21a);
    c[11 * stride] = CLIP(t11 + t20);
    c[12 * stride] = CLIP(t12 + t19a);
    c[13 * stride] = CLIP(t13 + t18);
    c[14 * stride] = CLIP(t14 + t17a);
    c[15 * stride] = CLIP(t15 + t16);
    c[16 * stride] = CLIP(t15 - t16);
    c[17 * stride] = CLIP(t14 - t17a);
    c[18 * stride] = CLIP(t13 - t18);
    c[19 * stride] = CLIP(t12 - t19a);
    c[20 * stride] = CLIP(t11 - t20);
    c[21 * stride] = CLIP(t10 - t21a);
    c[22 * stride] = CLIP(t9 - t22);
    c[23 * stride] = CLIP(t8 - t23a);
    c[24 * stride] = CLIP(t7 - t24a);
    c[25 * stride] = CLIP(t6 - t25);
    c[26 * stride] = CLIP(t5 - t26a);
    c[27 * stride] = CLIP(t4 - t27);
    c[28 * stride] = CLIP(t3 - t28a);
    c[29 * stride] = CLIP(t2 - t29);
    c[30 * stride] = CLIP(t1 - t30a);
    c[31 * stride] = CLIP(t0 - t31);
}

static void NOINLINE
inv_dct64_1d(const coef *const in, const ptrdiff_t in_s,
             coef *const out, const ptrdiff_t out_s, const int max)
void dav1d_inv_dct32_1d_c(int32_t *const c, const ptrdiff_t stride,
                          const int min, const int max)
{
    const int min = -max - 1;
    coef tmp[32];
    inv_dct32_1d_internal_c(c, stride, min, max, 0);
}

    inv_dct32_1d(in, in_s * 2, tmp, 1, max);
void dav1d_inv_dct64_1d_c(int32_t *const c, const ptrdiff_t stride,
                          const int min, const int max)
{
    assert(stride > 0);
    inv_dct32_1d_internal_c(c, stride << 1, min, max, 1);

    const int in1 = in[ 1 * in_s], in3 = in[ 3 * in_s];
    const int in5 = in[ 5 * in_s], in7 = in[ 7 * in_s];
    const int in9 = in[ 9 * in_s], in11 = in[11 * in_s];
    const int in13 = in[13 * in_s], in15 = in[15 * in_s];
    const int in17 = in[17 * in_s], in19 = in[19 * in_s];
    const int in21 = in[21 * in_s], in23 = in[23 * in_s];
    const int in25 = in[25 * in_s], in27 = in[27 * in_s];
    const int in29 = in[29 * in_s], in31 = in[31 * in_s];
    const int in33 = in[33 * in_s], in35 = in[35 * in_s];
    const int in37 = in[37 * in_s], in39 = in[39 * in_s];
    const int in41 = in[41 * in_s], in43 = in[43 * in_s];
    const int in45 = in[45 * in_s], in47 = in[47 * in_s];
    const int in49 = in[49 * in_s], in51 = in[51 * in_s];
    const int in53 = in[53 * in_s], in55 = in[55 * in_s];
    const int in57 = in[57 * in_s], in59 = in[59 * in_s];
    const int in61 = in[61 * in_s], in63 = in[63 * in_s];
    const int in1 = c[ 1 * stride], in3 = c[ 3 * stride];
    const int in5 = c[ 5 * stride], in7 = c[ 7 * stride];
    const int in9 = c[ 9 * stride], in11 = c[11 * stride];
    const int in13 = c[13 * stride], in15 = c[15 * stride];
    const int in17 = c[17 * stride], in19 = c[19 * stride];
    const int in21 = c[21 * stride], in23 = c[23 * stride];
    const int in25 = c[25 * stride], in27 = c[27 * stride];
    const int in29 = c[29 * stride], in31 = c[31 * stride];

    int t32a = ((in1 * 101 - in63 * (4095 - 4096) + 2048) >> 12) - in63;
    int t33a = ((in33 * (2967 - 4096) - in31 * 2824 + 2048) >> 12) + in33;
    int t34a = ((in17 * 1660 - in47 * (3745 - 4096) + 2048) >> 12) - in47;
    int t35a = (in49 * 1911 - in15 * 737 + 1024) >> 11;
    int t36a = ((in9 * 897 - in55 * (3996 - 4096) + 2048) >> 12) - in55;
    int t37a = ((in41 * (3461 - 4096) - in23 * 2191 + 2048) >> 12) + in41;
    int t38a = ((in25 * 2359 - in39 * (3349 - 4096) + 2048) >> 12) - in39;
    int t39a = (in57 * 2018 - in7 * 350 + 1024) >> 11;
    int t40a = ((in5 * 501 - in59 * (4065 - 4096) + 2048) >> 12) - in59;
    int t41a = ((in37 * (3229 - 4096) - in27 * 2520 + 2048) >> 12) + in37;
    int t42a = ((in21 * 2019 - in43 * (3564 - 4096) + 2048) >> 12) - in43;
    int t43a = (in53 * 1974 - in11 * 546 + 1024) >> 11;
    int t44a = ((in13 * 1285 - in51 * (3889 - 4096) + 2048) >> 12) - in51;
    int t45a = ((in45 * (3659 - 4096) - in19 * 1842 + 2048) >> 12) + in45;
    int t46a = ((in29 * 2675 - in35 * (3102 - 4096) + 2048) >> 12) - in35;
    int t47a = ((in61 * (4085 - 4096) - in3 * 301 + 2048) >> 12) + in61;
    int t48a = ((in61 * 301 + in3 * (4085 - 4096) + 2048) >> 12) + in3;
    int t49a = ((in29 * (3102 - 4096) + in35 * 2675 + 2048) >> 12) + in29;
    int t50a = ((in45 * 1842 + in19 * (3659 - 4096) + 2048) >> 12) + in19;
    int t51a = ((in13 * (3889 - 4096) + in51 * 1285 + 2048) >> 12) + in13;
    int t52a = (in53 * 546 + in11 * 1974 + 1024) >> 11;
    int t53a = ((in21 * (3564 - 4096) + in43 * 2019 + 2048) >> 12) + in21;
    int t54a = ((in37 * 2520 + in27 * (3229 - 4096) + 2048) >> 12) + in27;
    int t55a = ((in5 * (4065 - 4096) + in59 * 501 + 2048) >> 12) + in5;
    int t56a = (in57 * 350 + in7 * 2018 + 1024) >> 11;
    int t57a = ((in25 * (3349 - 4096) + in39 * 2359 + 2048) >> 12) + in25;
    int t58a = ((in41 * 2191 + in23 * (3461 - 4096) + 2048) >> 12) + in23;
    int t59a = ((in9 * (3996 - 4096) + in55 * 897 + 2048) >> 12) + in9;
    int t60a = (in49 * 737 + in15 * 1911 + 1024) >> 11;
    int t61a = ((in17 * (3745 - 4096) + in47 * 1660 + 2048) >> 12) + in17;
    int t62a = ((in33 * 2824 + in31 * (2967 - 4096) + 2048) >> 12) + in31;
    int t63a = ((in1 * (4095 - 4096) + in63 * 101 + 2048) >> 12) + in1;
    int t32a = (in1 * 101 + 2048) >> 12;
    int t33a = (in31 * -2824 + 2048) >> 12;
    int t34a = (in17 * 1660 + 2048) >> 12;
    int t35a = (in15 * -1474 + 2048) >> 12;
    int t36a = (in9 * 897 + 2048) >> 12;
    int t37a = (in23 * -2191 + 2048) >> 12;
    int t38a = (in25 * 2359 + 2048) >> 12;
    int t39a = (in7 * -700 + 2048) >> 12;
    int t40a = (in5 * 501 + 2048) >> 12;
    int t41a = (in27 * -2520 + 2048) >> 12;
    int t42a = (in21 * 2019 + 2048) >> 12;
    int t43a = (in11 * -1092 + 2048) >> 12;
    int t44a = (in13 * 1285 + 2048) >> 12;
    int t45a = (in19 * -1842 + 2048) >> 12;
    int t46a = (in29 * 2675 + 2048) >> 12;
    int t47a = (in3 * -301 + 2048) >> 12;
    int t48a = (in3 * 4085 + 2048) >> 12;
    int t49a = (in29 * 3102 + 2048) >> 12;
    int t50a = (in19 * 3659 + 2048) >> 12;
    int t51a = (in13 * 3889 + 2048) >> 12;
    int t52a = (in11 * 3948 + 2048) >> 12;
    int t53a = (in21 * 3564 + 2048) >> 12;
    int t54a = (in27 * 3229 + 2048) >> 12;
    int t55a = (in5 * 4065 + 2048) >> 12;
    int t56a = (in7 * 4036 + 2048) >> 12;
    int t57a = (in25 * 3349 + 2048) >> 12;
    int t58a = (in23 * 3461 + 2048) >> 12;
    int t59a = (in9 * 3996 + 2048) >> 12;
    int t60a = (in15 * 3822 + 2048) >> 12;
    int t61a = (in17 * 3745 + 2048) >> 12;
    int t62a = (in31 * 2967 + 2048) >> 12;
    int t63a = (in1 * 4095 + 2048) >> 12;

    int t32 = CLIP(t32a + t33a);
    int t33 = CLIP(t32a - t33a);
@ -589,76 +681,111 @@ inv_dct64_1d(const coef *const in, const ptrdiff_t in_s,
    t54 = ((t41a + t54a) * 181 + 128) >> 8;
    t55a = ((t40 + t55 ) * 181 + 128) >> 8;

    out[ 0 * out_s] = CLIP(tmp[ 0] + t63a);
    out[ 1 * out_s] = CLIP(tmp[ 1] + t62);
    out[ 2 * out_s] = CLIP(tmp[ 2] + t61a);
    out[ 3 * out_s] = CLIP(tmp[ 3] + t60);
    out[ 4 * out_s] = CLIP(tmp[ 4] + t59a);
    out[ 5 * out_s] = CLIP(tmp[ 5] + t58);
    out[ 6 * out_s] = CLIP(tmp[ 6] + t57a);
    out[ 7 * out_s] = CLIP(tmp[ 7] + t56);
    out[ 8 * out_s] = CLIP(tmp[ 8] + t55a);
    out[ 9 * out_s] = CLIP(tmp[ 9] + t54);
    out[10 * out_s] = CLIP(tmp[10] + t53a);
    out[11 * out_s] = CLIP(tmp[11] + t52);
    out[12 * out_s] = CLIP(tmp[12] + t51a);
    out[13 * out_s] = CLIP(tmp[13] + t50);
    out[14 * out_s] = CLIP(tmp[14] + t49a);
    out[15 * out_s] = CLIP(tmp[15] + t48);
    out[16 * out_s] = CLIP(tmp[16] + t47);
    out[17 * out_s] = CLIP(tmp[17] + t46a);
    out[18 * out_s] = CLIP(tmp[18] + t45);
    out[19 * out_s] = CLIP(tmp[19] + t44a);
    out[20 * out_s] = CLIP(tmp[20] + t43);
    out[21 * out_s] = CLIP(tmp[21] + t42a);
    out[22 * out_s] = CLIP(tmp[22] + t41);
    out[23 * out_s] = CLIP(tmp[23] + t40a);
    out[24 * out_s] = CLIP(tmp[24] + t39);
    out[25 * out_s] = CLIP(tmp[25] + t38a);
    out[26 * out_s] = CLIP(tmp[26] + t37);
    out[27 * out_s] = CLIP(tmp[27] + t36a);
    out[28 * out_s] = CLIP(tmp[28] + t35);
    out[29 * out_s] = CLIP(tmp[29] + t34a);
    out[30 * out_s] = CLIP(tmp[30] + t33);
    out[31 * out_s] = CLIP(tmp[31] + t32a);
    out[32 * out_s] = CLIP(tmp[31] - t32a);
    out[33 * out_s] = CLIP(tmp[30] - t33);
    out[34 * out_s] = CLIP(tmp[29] - t34a);
    out[35 * out_s] = CLIP(tmp[28] - t35);
    out[36 * out_s] = CLIP(tmp[27] - t36a);
    out[37 * out_s] = CLIP(tmp[26] - t37);
    out[38 * out_s] = CLIP(tmp[25] - t38a);
    out[39 * out_s] = CLIP(tmp[24] - t39);
    out[40 * out_s] = CLIP(tmp[23] - t40a);
    out[41 * out_s] = CLIP(tmp[22] - t41);
    out[42 * out_s] = CLIP(tmp[21] - t42a);
    out[43 * out_s] = CLIP(tmp[20] - t43);
    out[44 * out_s] = CLIP(tmp[19] - t44a);
    out[45 * out_s] = CLIP(tmp[18] - t45);
    out[46 * out_s] = CLIP(tmp[17] - t46a);
    out[47 * out_s] = CLIP(tmp[16] - t47);
    out[48 * out_s] = CLIP(tmp[15] - t48);
    out[49 * out_s] = CLIP(tmp[14] - t49a);
    out[50 * out_s] = CLIP(tmp[13] - t50);
    out[51 * out_s] = CLIP(tmp[12] - t51a);
    out[52 * out_s] = CLIP(tmp[11] - t52);
    out[53 * out_s] = CLIP(tmp[10] - t53a);
    out[54 * out_s] = CLIP(tmp[ 9] - t54);
    out[55 * out_s] = CLIP(tmp[ 8] - t55a);
    out[56 * out_s] = CLIP(tmp[ 7] - t56);
    out[57 * out_s] = CLIP(tmp[ 6] - t57a);
    out[58 * out_s] = CLIP(tmp[ 5] - t58);
    out[59 * out_s] = CLIP(tmp[ 4] - t59a);
    out[60 * out_s] = CLIP(tmp[ 3] - t60);
    out[61 * out_s] = CLIP(tmp[ 2] - t61a);
    out[62 * out_s] = CLIP(tmp[ 1] - t62);
    out[63 * out_s] = CLIP(tmp[ 0] - t63a);
    const int t0 = c[ 0 * stride];
    const int t1 = c[ 2 * stride];
    const int t2 = c[ 4 * stride];
    const int t3 = c[ 6 * stride];
    const int t4 = c[ 8 * stride];
    const int t5 = c[10 * stride];
    const int t6 = c[12 * stride];
    const int t7 = c[14 * stride];
    const int t8 = c[16 * stride];
    const int t9 = c[18 * stride];
    const int t10 = c[20 * stride];
    const int t11 = c[22 * stride];
    const int t12 = c[24 * stride];
    const int t13 = c[26 * stride];
    const int t14 = c[28 * stride];
    const int t15 = c[30 * stride];
    const int t16 = c[32 * stride];
    const int t17 = c[34 * stride];
    const int t18 = c[36 * stride];
    const int t19 = c[38 * stride];
    const int t20 = c[40 * stride];
    const int t21 = c[42 * stride];
    const int t22 = c[44 * stride];
    const int t23 = c[46 * stride];
    const int t24 = c[48 * stride];
    const int t25 = c[50 * stride];
    const int t26 = c[52 * stride];
    const int t27 = c[54 * stride];
    const int t28 = c[56 * stride];
    const int t29 = c[58 * stride];
    const int t30 = c[60 * stride];
    const int t31 = c[62 * stride];

    c[ 0 * stride] = CLIP(t0 + t63a);
    c[ 1 * stride] = CLIP(t1 + t62);
    c[ 2 * stride] = CLIP(t2 + t61a);
    c[ 3 * stride] = CLIP(t3 + t60);
    c[ 4 * stride] = CLIP(t4 + t59a);
    c[ 5 * stride] = CLIP(t5 + t58);
    c[ 6 * stride] = CLIP(t6 + t57a);
    c[ 7 * stride] = CLIP(t7 + t56);
    c[ 8 * stride] = CLIP(t8 + t55a);
    c[ 9 * stride] = CLIP(t9 + t54);
    c[10 * stride] = CLIP(t10 + t53a);
    c[11 * stride] = CLIP(t11 + t52);
    c[12 * stride] = CLIP(t12 + t51a);
    c[13 * stride] = CLIP(t13 + t50);
    c[14 * stride] = CLIP(t14 + t49a);
    c[15 * stride] = CLIP(t15 + t48);
    c[16 * stride] = CLIP(t16 + t47);
    c[17 * stride] = CLIP(t17 + t46a);
    c[18 * stride] = CLIP(t18 + t45);
    c[19 * stride] = CLIP(t19 + t44a);
    c[20 * stride] = CLIP(t20 + t43);
    c[21 * stride] = CLIP(t21 + t42a);
    c[22 * stride] = CLIP(t22 + t41);
    c[23 * stride] = CLIP(t23 + t40a);
    c[24 * stride] = CLIP(t24 + t39);
    c[25 * stride] = CLIP(t25 + t38a);
    c[26 * stride] = CLIP(t26 + t37);
    c[27 * stride] = CLIP(t27 + t36a);
    c[28 * stride] = CLIP(t28 + t35);
    c[29 * stride] = CLIP(t29 + t34a);
    c[30 * stride] = CLIP(t30 + t33);
    c[31 * stride] = CLIP(t31 + t32a);
    c[32 * stride] = CLIP(t31 - t32a);
    c[33 * stride] = CLIP(t30 - t33);
    c[34 * stride] = CLIP(t29 - t34a);
    c[35 * stride] = CLIP(t28 - t35);
    c[36 * stride] = CLIP(t27 - t36a);
    c[37 * stride] = CLIP(t26 - t37);
    c[38 * stride] = CLIP(t25 - t38a);
    c[39 * stride] = CLIP(t24 - t39);
    c[40 * stride] = CLIP(t23 - t40a);
    c[41 * stride] = CLIP(t22 - t41);
    c[42 * stride] = CLIP(t21 - t42a);
    c[43 * stride] = CLIP(t20 - t43);
    c[44 * stride] = CLIP(t19 - t44a);
    c[45 * stride] = CLIP(t18 - t45);
    c[46 * stride] = CLIP(t17 - t46a);
    c[47 * stride] = CLIP(t16 - t47);
    c[48 * stride] = CLIP(t15 - t48);
    c[49 * stride] = CLIP(t14 - t49a);
    c[50 * stride] = CLIP(t13 - t50);
    c[51 * stride] = CLIP(t12 - t51a);
    c[52 * stride] = CLIP(t11 - t52);
    c[53 * stride] = CLIP(t10 - t53a);
    c[54 * stride] = CLIP(t9 - t54);
    c[55 * stride] = CLIP(t8 - t55a);
    c[56 * stride] = CLIP(t7 - t56);
    c[57 * stride] = CLIP(t6 - t57a);
    c[58 * stride] = CLIP(t5 - t58);
    c[59 * stride] = CLIP(t4 - t59a);
    c[60 * stride] = CLIP(t3 - t60);
    c[61 * stride] = CLIP(t2 - t61a);
    c[62 * stride] = CLIP(t1 - t62);
    c[63 * stride] = CLIP(t0 - t63a);
}

static void NOINLINE
inv_adst4_1d(const coef *const in, const ptrdiff_t in_s,
             coef *const out, const ptrdiff_t out_s, const int range)
static NOINLINE void
inv_adst4_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s,
                        const int min, const int max,
                        int32_t *const out, const ptrdiff_t out_s)
{
    assert(in_s > 0 && out_s != 0);
    const int in0 = in[0 * in_s], in1 = in[1 * in_s];
    const int in2 = in[2 * in_s], in3 = in[3 * in_s];
@ -674,11 +801,12 @@ inv_adst4_1d(const coef *const in, const ptrdiff_t in_s,
                     in0 + in2 - in1;
}

static void NOINLINE
inv_adst8_1d(const coef *const in, const ptrdiff_t in_s,
             coef *const out, const ptrdiff_t out_s, const int max)
static NOINLINE void
inv_adst8_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s,
                        const int min, const int max,
                        int32_t *const out, const ptrdiff_t out_s)
{
    const int min = -max - 1;
    assert(in_s > 0 && out_s != 0);
    const int in0 = in[0 * in_s], in1 = in[1 * in_s];
    const int in2 = in[2 * in_s], in3 = in[3 * in_s];
    const int in4 = in[4 * in_s], in5 = in[5 * in_s];
@ -707,15 +835,14 @@ inv_adst8_1d(const coef *const in, const ptrdiff_t in_s,
    t6a = (((3784 - 4096) * t7 - 1567 * t6 + 2048) >> 12) + t7;
    t7a = (( 1567 * t7 + (3784 - 4096) * t6 + 2048) >> 12) + t6;

    out[0 * out_s] = CLIP( t0 + t2);
    out[7 * out_s] = CLIP(-(t1 + t3));
    t2 = CLIP( t0 - t2);
    t3 = CLIP( t1 - t3);

    out[1 * out_s] = CLIP(-(t4a + t6a));
    out[6 * out_s] = CLIP( t5a + t7a );
    t6 = CLIP( t4a - t6a );
    t7 = CLIP( t5a - t7a );
    out[0 * out_s] = CLIP(t0 + t2 );
    out[7 * out_s] = -CLIP(t1 + t3 );
    t2 = CLIP(t0 - t2 );
    t3 = CLIP(t1 - t3 );
    out[1 * out_s] = -CLIP(t4a + t6a);
    out[6 * out_s] = CLIP(t5a + t7a);
    t6 = CLIP(t4a - t6a);
    t7 = CLIP(t5a - t7a);

    out[3 * out_s] = -(((t2 + t3) * 181 + 128) >> 8);
    out[4 * out_s] = ((t2 - t3) * 181 + 128) >> 8;
@ -723,11 +850,12 @@ inv_adst8_1d(const coef *const in, const ptrdiff_t in_s,
    out[5 * out_s] = -(((t6 - t7) * 181 + 128) >> 8);
}

static void NOINLINE
inv_adst16_1d(const coef *const in, const ptrdiff_t in_s,
              coef *const out, const ptrdiff_t out_s, const int max)
static NOINLINE void
inv_adst16_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s,
                         const int min, const int max,
                         int32_t *const out, const ptrdiff_t out_s)
{
    const int min = -max - 1;
    assert(in_s > 0 && out_s != 0);
    const int in0 = in[ 0 * in_s], in1 = in[ 1 * in_s];
    const int in2 = in[ 2 * in_s], in3 = in[ 3 * in_s];
    const int in4 = in[ 4 * in_s], in5 = in[ 5 * in_s];
@ -806,22 +934,22 @@ inv_adst16_1d(const coef *const in, const ptrdiff_t in_s,
    t14 = ((t15a * (3784 - 4096) - t14a * 1567 + 2048) >> 12) + t15a;
    t15 = ((t15a * 1567 + t14a * (3784 - 4096) + 2048) >> 12) + t14a;

    out[ 0 * out_s] = CLIP( t0 + t2 );
    out[15 * out_s] = CLIP(-(t1 + t3) );
    t2a = CLIP( t0 - t2 );
    t3a = CLIP( t1 - t3 );
    out[ 3 * out_s] = CLIP(-(t4a + t6a) );
    out[12 * out_s] = CLIP( t5a + t7a );
    t6 = CLIP( t4a - t6a );
    t7 = CLIP( t5a - t7a );
    out[ 1 * out_s] = CLIP(-(t8a + t10a));
    out[14 * out_s] = CLIP( t9a + t11a );
    t10 = CLIP( t8a - t10a );
    t11 = CLIP( t9a - t11a );
    out[ 2 * out_s] = CLIP( t12 + t14 );
    out[13 * out_s] = CLIP(-(t13 + t15) );
    t14a = CLIP( t12 - t14 );
    t15a = CLIP( t13 - t15 );
    out[ 0 * out_s] = CLIP(t0 + t2 );
    out[15 * out_s] = -CLIP(t1 + t3 );
    t2a = CLIP(t0 - t2 );
    t3a = CLIP(t1 - t3 );
    out[ 3 * out_s] = -CLIP(t4a + t6a );
    out[12 * out_s] = CLIP(t5a + t7a );
    t6 = CLIP(t4a - t6a );
    t7 = CLIP(t5a - t7a );
    out[ 1 * out_s] = -CLIP(t8a + t10a);
    out[14 * out_s] = CLIP(t9a + t11a);
    t10 = CLIP(t8a - t10a);
    t11 = CLIP(t9a - t11a);
    out[ 2 * out_s] = CLIP(t12 + t14 );
    out[13 * out_s] = -CLIP(t13 + t15 );
    t14a = CLIP(t12 - t14 );
    t15a = CLIP(t13 - t15 );

    out[ 7 * out_s] = -(((t2a + t3a) * 181 + 128) >> 8);
    out[ 8 * out_s] = ((t2a - t3a) * 181 + 128) >> 8;
@ -833,67 +961,74 @@ inv_adst16_1d(const coef *const in, const ptrdiff_t in_s,
    out[10 * out_s] = ((t14a - t15a) * 181 + 128) >> 8;
}

#define flip_inv_adst(sz) \
static void inv_flipadst##sz##_1d(const coef *const in, const ptrdiff_t in_s, \
                                  coef *const out, const ptrdiff_t out_s, const int range) \
#define inv_adst_1d(sz) \
void dav1d_inv_adst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \
                               const int min, const int max) \
{ \
    inv_adst##sz##_1d(in, in_s, &out[(sz - 1) * out_s], -out_s, range); \
    inv_adst##sz##_1d_internal_c(c, stride, min, max, c, stride); \
} \
void dav1d_inv_flipadst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \
                                   const int min, const int max) \
{ \
    inv_adst##sz##_1d_internal_c(c, stride, min, max, \
                                 &c[(sz - 1) * stride], -stride); \
}

flip_inv_adst(4)
flip_inv_adst(8)
flip_inv_adst(16)
inv_adst_1d( 4)
inv_adst_1d( 8)
inv_adst_1d(16)

#undef flip_inv_adst
#undef inv_adst_1d
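For reference, the flipadst half of the macro expands (shown here for sz = 4) to a call that hands the shared ADST kernel a reversed output view, so the flipped transform costs nothing extra:

void dav1d_inv_flipadst4_1d_c(int32_t *const c, const ptrdiff_t stride,
                              const int min, const int max)
{
    /* write results from the last element backwards via a negative stride */
    inv_adst4_1d_internal_c(c, stride, min, max,
                            &c[(4 - 1) * stride], -stride);
}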
static void NOINLINE
inv_identity4_1d(const coef *const in, const ptrdiff_t in_s,
                 coef *const out, const ptrdiff_t out_s, const int range)
void dav1d_inv_identity4_1d_c(int32_t *const c, const ptrdiff_t stride,
                              const int min, const int max)
{
    for (int i = 0; i < 4; i++)
        out[out_s * i] = in[in_s * i] + ((in[in_s * i] * 1697 + 2048) >> 12);
    assert(stride > 0);
    for (int i = 0; i < 4; i++) {
        const int in = c[stride * i];
        c[stride * i] = in + ((in * 1697 + 2048) >> 12);
    }
}

static void NOINLINE
inv_identity8_1d(const coef *const in, const ptrdiff_t in_s,
                 coef *const out, const ptrdiff_t out_s, const int range)
void dav1d_inv_identity8_1d_c(int32_t *const c, const ptrdiff_t stride,
                              const int min, const int max)
{
    assert(stride > 0);
    for (int i = 0; i < 8; i++)
        out[out_s * i] = in[in_s * i] * 2;
        c[stride * i] *= 2;
}

static void NOINLINE
inv_identity16_1d(const coef *const in, const ptrdiff_t in_s,
                  coef *const out, const ptrdiff_t out_s, const int range)
void dav1d_inv_identity16_1d_c(int32_t *const c, const ptrdiff_t stride,
                               const int min, const int max)
{
    for (int i = 0; i < 16; i++)
        out[out_s * i] = 2 * in[in_s * i] + ((in[in_s * i] * 1697 + 1024) >> 11);
    assert(stride > 0);
    for (int i = 0; i < 16; i++) {
        const int in = c[stride * i];
        c[stride * i] = 2 * in + ((in * 1697 + 1024) >> 11);
    }
}

static void NOINLINE
inv_identity32_1d(const coef *const in, const ptrdiff_t in_s,
                  coef *const out, const ptrdiff_t out_s, const int range)
void dav1d_inv_identity32_1d_c(int32_t *const c, const ptrdiff_t stride,
                               const int min, const int max)
{
    assert(stride > 0);
    for (int i = 0; i < 32; i++)
        out[out_s * i] = in[in_s * i] * 4;
        c[stride * i] *= 4;
}

static void NOINLINE
inv_wht4_1d(const coef *const in, const ptrdiff_t in_s,
            coef *const out, const ptrdiff_t out_s,
            const int pass)
{
    const int sh = 2 * !pass;
    const int in0 = in[0 * in_s] >> sh, in1 = in[1 * in_s] >> sh;
    const int in2 = in[2 * in_s] >> sh, in3 = in[3 * in_s] >> sh;
void dav1d_inv_wht4_1d_c(int32_t *const c, const ptrdiff_t stride) {
    assert(stride > 0);
    const int in0 = c[0 * stride], in1 = c[1 * stride];
    const int in2 = c[2 * stride], in3 = c[3 * stride];

    const int t0 = in0 + in1;
    const int t2 = in2 - in3;
    const int t4 = (t0 - t2) >> 1;
    const int t3 = t4 - in3;
    const int t1 = t4 - in1;

    out[0 * out_s] = t0 - t3;
    out[1 * out_s] = t3;
    out[2 * out_s] = t1;
    out[3 * out_s] = t2 + t1;
    c[0 * stride] = t0 - t3;
    c[1 * stride] = t3;
    c[2 * stride] = t1;
    c[3 * stride] = t2 + t1;
}
@ -0,0 +1,59 @@
/*
 * Copyright © 2018-2019, VideoLAN and dav1d authors
 * Copyright © 2018-2019, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <stddef.h>
#include <stdint.h>

#ifndef DAV1D_SRC_ITX_1D_H
#define DAV1D_SRC_ITX_1D_H

#define decl_itx_1d_fn(name) \
void (name)(int32_t *c, ptrdiff_t stride, int min, int max)
typedef decl_itx_1d_fn(*itx_1d_fn);

decl_itx_1d_fn(dav1d_inv_dct4_1d_c);
decl_itx_1d_fn(dav1d_inv_dct8_1d_c);
decl_itx_1d_fn(dav1d_inv_dct16_1d_c);
decl_itx_1d_fn(dav1d_inv_dct32_1d_c);
decl_itx_1d_fn(dav1d_inv_dct64_1d_c);

decl_itx_1d_fn(dav1d_inv_adst4_1d_c);
decl_itx_1d_fn(dav1d_inv_adst8_1d_c);
decl_itx_1d_fn(dav1d_inv_adst16_1d_c);

decl_itx_1d_fn(dav1d_inv_flipadst4_1d_c);
decl_itx_1d_fn(dav1d_inv_flipadst8_1d_c);
decl_itx_1d_fn(dav1d_inv_flipadst16_1d_c);

decl_itx_1d_fn(dav1d_inv_identity4_1d_c);
decl_itx_1d_fn(dav1d_inv_identity8_1d_c);
decl_itx_1d_fn(dav1d_inv_identity16_1d_c);
decl_itx_1d_fn(dav1d_inv_identity32_1d_c);

void dav1d_inv_wht4_1d_c(int32_t *c, ptrdiff_t stride);

#endif /* DAV1D_SRC_ITX_1D_H */
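A hedged sketch of how these entry points compose into a separable 2-D inverse transform: rows first at stride 1, then columns at the row width. The INT16 clip range and the missing inter-pass rounding are simplifications, not dav1d's actual driver (see inv_txfm_add_c below):

#include <stdint.h>
#include "src/itx_1d.h"

/* In-place 4x4 inverse DCT, omitting dav1d's inter-pass rounding. */
static void inv_dct_4x4_sketch(int32_t c[4 * 4]) {
    for (int y = 0; y < 4; y++)   /* one pass per row, elements adjacent */
        dav1d_inv_dct4_1d_c(&c[y * 4], 1, INT16_MIN, INT16_MAX);
    for (int x = 0; x < 4; x++)   /* one pass per column, stride = width */
        dav1d_inv_dct4_1d_c(&c[x], 4, INT16_MIN, INT16_MAX);
}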
@ -1,6 +1,6 @@
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Two Orioles, LLC
 * Copyright © 2018-2019, VideoLAN and dav1d authors
 * Copyright © 2018-2019, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
@ -35,78 +35,68 @@
#include "common/intops.h"

#include "src/itx.h"
#include "src/itx_1d.h"

#include "src/itx_1d.c"

typedef void (*itx_1d_fn)(const coef *in, ptrdiff_t in_s,
                          coef *out, ptrdiff_t out_s, const int range);

static void NOINLINE
inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
               coef *const coeff, const int eob,
               const int w, const int h, const int shift,
static NOINLINE void
inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
               const int eob, const int w, const int h, const int shift,
               const itx_1d_fn first_1d_fn, const itx_1d_fn second_1d_fn,
               const int has_dconly HIGHBD_DECL_SUFFIX)
{
    int i, j;
    assert((h >= 4 && h <= 64) && (w >= 4 && w <= 64));
    assert(w >= 4 && w <= 64);
    assert(h >= 4 && h <= 64);
    assert(eob >= 0);

    const int is_rect2 = w * 2 == h || h * 2 == w;
    const int bitdepth = bitdepth_from_max(bitdepth_max);
    const int rnd = (1 << shift) >> 1;

    if (has_dconly && eob == 0) {
    if (eob < has_dconly) {
        int dc = coeff[0];
        coeff[0] = 0;
        if (is_rect2)
            dc = (dc * 2896 + 2048) >> 12;
        dc = (dc * 2896 + 2048) >> 12;
            dc = (dc * 181 + 128) >> 8;
        dc = (dc * 181 + 128) >> 8;
        dc = (dc + rnd) >> shift;
        dc = (dc * 2896 + 2048) >> 12;
        dc = (dc + 8) >> 4;
        for (j = 0; j < h; j++)
            for (i = 0; i < w; i++)
                dst[i + j * PXSTRIDE(stride)] =
                    iclip_pixel(dst[i + j * PXSTRIDE(stride)] + dc);
        dc = (dc * 181 + 128 + 2048) >> 12;
        for (int y = 0; y < h; y++, dst += PXSTRIDE(stride))
            for (int x = 0; x < w; x++)
                dst[x] = iclip_pixel(dst[x] + dc);
        return;
    }
    assert(eob > 0 || (eob == 0 && !has_dconly));

    const ptrdiff_t sh = imin(h, 32), sw = imin(w, 32);
    // Maximum value for h and w is 64
    coef tmp[4096 /* w * h */], out[64 /* h */], in_mem[64 /* w */];
    const int row_clip_max = (1 << (bitdepth + 8 - 1)) - 1;
    const int col_clip_max = (1 << (imax(bitdepth + 6, 16) - 1)) -1;

    if (w != sw) memset(&in_mem[sw], 0, (w - sw) * sizeof(*in_mem));
    for (i = 0; i < sh; i++) {
        if (w != sw || is_rect2) {
            for (j = 0; j < sw; j++) {
                in_mem[j] = coeff[i + j * sh];
                if (is_rect2)
                    in_mem[j] = (in_mem[j] * 2896 + 2048) >> 12;
            }
            first_1d_fn(in_mem, 1, &tmp[i * w], 1, row_clip_max);
        } else {
            first_1d_fn(&coeff[i], sh, &tmp[i * w], 1, row_clip_max);
        }
        for (j = 0; j < w; j++)
    const int sh = imin(h, 32), sw = imin(w, 32);
#if BITDEPTH == 8
            tmp[i * w + j] = (tmp[i * w + j] + rnd) >> shift;
    const int row_clip_min = INT16_MIN;
    const int col_clip_min = INT16_MIN;
#else
            tmp[i * w + j] = iclip((tmp[i * w + j] + rnd) >> shift,
                                   -col_clip_max - 1, col_clip_max);
    const int row_clip_min = (int) ((unsigned) ~bitdepth_max << 7);
    const int col_clip_min = (int) ((unsigned) ~bitdepth_max << 5);
#endif
    const int row_clip_max = ~row_clip_min;
    const int col_clip_max = ~col_clip_min;

    int32_t tmp[64 * 64], *c = tmp;
    for (int y = 0; y < sh; y++, c += w) {
        if (is_rect2)
            for (int x = 0; x < sw; x++)
                c[x] = (coeff[y + x * sh] * 181 + 128) >> 8;
        else
            for (int x = 0; x < sw; x++)
                c[x] = coeff[y + x * sh];
        first_1d_fn(c, 1, row_clip_min, row_clip_max);
    }

    if (h != sh) memset(&tmp[sh * w], 0, w * (h - sh) * sizeof(*tmp));
    for (i = 0; i < w; i++) {
        second_1d_fn(&tmp[i], w, out, 1, col_clip_max);
        for (j = 0; j < h; j++)
            dst[i + j * PXSTRIDE(stride)] =
                iclip_pixel(dst[i + j * PXSTRIDE(stride)] +
                            ((out[j] + 8) >> 4));
    }
    memset(coeff, 0, sizeof(*coeff) * sh * sw);
    memset(coeff, 0, sizeof(*coeff) * sw * sh);
    for (int i = 0; i < w * sh; i++)
        tmp[i] = iclip((tmp[i] + rnd) >> shift, col_clip_min, col_clip_max);

    for (int x = 0; x < w; x++)
        second_1d_fn(&tmp[x], w, col_clip_min, col_clip_max);

    c = tmp;
    for (int y = 0; y < h; y++, dst += PXSTRIDE(stride))
        for (int x = 0; x < w; x++)
            dst[x] = iclip_pixel(dst[x] + ((*c++ + 8) >> 4));
}
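The dc-only constants above are all the same 1/sqrt(2) ~ 181/256 approximation at different precisions (2896 = 16 * 181), and the final (dc * 181 + 128 + 2048) >> 12 fuses the >> 8 scale with the trailing (dc + 8) >> 4 down-shift. A quick standalone check on one sample value (illustrative, not a proof of bit-exactness for all inputs):

#include <assert.h>

int main(void) {
    assert(2896 == 16 * 181);  /* same sqrt(1/2) approximation, rescaled */
    const int dc = 1000;
    const int fused = (dc * 181 + 128 + 2048) >> 12;
    const int split = (((dc * 181 + 128) >> 8) + 8) >> 4;
    assert(fused == split);    /* the two formulations agree here */
    return 0;
}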
#define inv_txfm_fn(type1, type2, w, h, shift, has_dconly) \

@ -118,8 +108,8 @@ inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
                       HIGHBD_DECL_SUFFIX) \
{ \
    inv_txfm_add_c(dst, stride, coeff, eob, w, h, shift, \
                   inv_##type1##w##_1d, inv_##type2##h##_1d, has_dconly \
                   HIGHBD_TAIL_SUFFIX); \
                   dav1d_inv_##type1##w##_1d_c, dav1d_inv_##type2##h##_1d_c, \
                   has_dconly HIGHBD_TAIL_SUFFIX); \
}

#define inv_txfm_fn64(w, h, shift) \
@ -173,23 +163,21 @@ static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
                                       coef *const coeff, const int eob
                                       HIGHBD_DECL_SUFFIX)
{
    const int bitdepth = bitdepth_from_max(bitdepth_max);
    const int col_clip_max = (1 << (imax(bitdepth + 6, 16) - 1)) -1;
    const int col_clip_min = -col_clip_max - 1;
    coef tmp[4 * 4], out[4];

    for (int i = 0; i < 4; i++)
        inv_wht4_1d(&coeff[i], 4, &tmp[i * 4], 1, 0);
    for (int k = 0; k < 4 * 4; k++)
        tmp[k] = iclip(tmp[k], col_clip_min, col_clip_max);

    for (int i = 0; i < 4; i++) {
        inv_wht4_1d(&tmp[i], 4, out, 1, 1);
        for (int j = 0; j < 4; j++)
            dst[i + j * PXSTRIDE(stride)] =
                iclip_pixel(dst[i + j * PXSTRIDE(stride)] + out[j]);
    int32_t tmp[4 * 4], *c = tmp;
    for (int y = 0; y < 4; y++, c += 4) {
        for (int x = 0; x < 4; x++)
            c[x] = coeff[y + x * 4] >> 2;
        dav1d_inv_wht4_1d_c(c, 1);
    }
    memset(coeff, 0, sizeof(*coeff) * 4 * 4);

    for (int x = 0; x < 4; x++)
        dav1d_inv_wht4_1d_c(&tmp[x], 4);

    c = tmp;
    for (int y = 0; y < 4; y++, dst += PXSTRIDE(stride))
        for (int x = 0; x < 4; x++)
            dst[x] = iclip_pixel(dst[x] + *c++);
}

COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c) {
@ -43,8 +43,8 @@ static void decomp_tx(uint8_t (*const txa)[2 /* txsz, step */][32 /* y */][32 /*
                      const uint16_t *const tx_masks)
{
    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[from];
    const int is_split =
        depth > 1 ? 0 : (tx_masks[depth] >> (y_off * 4 + x_off)) & 1;
    const int is_split = (from == (int) TX_4X4 || depth > 1) ? 0 :
        (tx_masks[depth] >> (y_off * 4 + x_off)) & 1;

    if (is_split) {
        const enum RectTxfmSize sub = t_dim->sub;
@ -350,6 +350,7 @@ void dav1d_create_lf_mask_inter(Av1Filter *const lflvl,
                                const int bx, const int by,
                                const int iw, const int ih,
                                const int skip, const enum BlockSize bs,
                                const enum RectTxfmSize max_ytx,
                                const uint16_t *const tx_masks,
                                const enum RectTxfmSize uvtx,
                                const enum Dav1dPixelLayout layout,

@ -373,7 +374,7 @@ void dav1d_create_lf_mask_inter(Av1Filter *const lflvl,
        }

        mask_edges_inter(lflvl->filter_y, by4, bx4, bw4, bh4, skip,
                         dav1d_max_txfm_size_for_bs[bs][0], tx_masks, ay, ly);
                         max_ytx, tx_masks, ay, ly);
    }

    if (!auv) return;
@ -72,8 +72,8 @@ void dav1d_create_lf_mask_inter(Av1Filter *lflvl, uint8_t (*level_cache)[4],
                                const ptrdiff_t b4_stride,
                                const uint8_t (*level)[8][2], int bx, int by,
                                int iw, int ih, int skip_inter,
                                enum BlockSize bs, const uint16_t *tx_mask,
                                enum RectTxfmSize uvtx,
                                enum BlockSize bs, enum RectTxfmSize max_ytx,
                                const uint16_t *tx_mask, enum RectTxfmSize uvtx,
                                enum Dav1dPixelLayout layout, uint8_t *ay,
                                uint8_t *ly, uint8_t *auv, uint8_t *luv);
void dav1d_calc_eih(Av1FilterLUT *lim_lut, int filter_sharpness);
@ -31,12 +31,17 @@
#include <errno.h>
#include <string.h>

#ifdef __linux__
#include <dlfcn.h>
#endif

#include "dav1d/dav1d.h"
#include "dav1d/data.h"

#include "common/mem.h"
#include "common/validate.h"

#include "src/cpu.h"
#include "src/fg_apply.h"
#include "src/internal.h"
#include "src/log.h"

@ -47,10 +52,11 @@
#include "src/wedge.h"

static COLD void init_internal(void) {
    dav1d_init_wedge_masks();
    dav1d_init_cpu();
    dav1d_init_interintra_masks();
    dav1d_init_qm_tables();
    dav1d_init_thread();
    dav1d_init_wedge_masks();
}

COLD const char *dav1d_version(void) {
@ -73,6 +79,22 @@ COLD void dav1d_default_settings(Dav1dSettings *const s) {

static void close_internal(Dav1dContext **const c_out, int flush);

NO_SANITIZE("cfi-icall") // CFI is broken with dlsym()
static COLD size_t get_stack_size_internal(const pthread_attr_t *const thread_attr) {
#if defined(__linux__) && defined(HAVE_DLSYM)
    /* glibc has an issue where the size of the TLS is subtracted from the stack
     * size instead of allocated separately. As a result the specified stack
     * size may be insufficient when used in an application with large amounts
     * of TLS data. The following is a workaround to compensate for that.
     * See https://sourceware.org/bugzilla/show_bug.cgi?id=11787 */
    size_t (*const get_minstack)(const pthread_attr_t*) =
        dlsym(RTLD_DEFAULT, "__pthread_get_minstack");
    if (get_minstack)
        return get_minstack(thread_attr) - PTHREAD_STACK_MIN;
#endif
    return 0;
}

COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
    static pthread_once_t initted = PTHREAD_ONCE_INIT;
    pthread_once(&initted, init_internal);
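pthread_once() guarantees init_internal() runs exactly once even when several threads race into dav1d_open(); the pattern in isolation, with illustrative names:

#include <pthread.h>
#include <stdio.h>

static pthread_once_t once = PTHREAD_ONCE_INIT;

static void init(void) {
    puts("runs exactly once, even with concurrent callers");
}

void open_context(void) {       /* hypothetical stand-in for dav1d_open */
    pthread_once(&once, init);  /* later calls are no-ops */
}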
@ -92,7 +114,9 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {

    pthread_attr_t thread_attr;
    if (pthread_attr_init(&thread_attr)) return DAV1D_ERR(ENOMEM);
    pthread_attr_setstacksize(&thread_attr, 1024 * 1024);
    size_t stack_size = 1024 * 1024 + get_stack_size_internal(&thread_attr);

    pthread_attr_setstacksize(&thread_attr, stack_size);

    Dav1dContext *const c = *c_out = dav1d_alloc_aligned(sizeof(*c), 32);
    if (!c) goto error;
@ -124,17 +148,15 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
    memset(c->fc, 0, sizeof(*c->fc) * s->n_frame_threads);
    if (c->n_fc > 1) {
        c->frame_thread.out_delayed =
            malloc(sizeof(*c->frame_thread.out_delayed) * c->n_fc);
            calloc(c->n_fc, sizeof(*c->frame_thread.out_delayed));
        if (!c->frame_thread.out_delayed) goto error;
        memset(c->frame_thread.out_delayed, 0,
               sizeof(*c->frame_thread.out_delayed) * c->n_fc);
    }
    for (int n = 0; n < s->n_frame_threads; n++) {
        Dav1dFrameContext *const f = &c->fc[n];
        f->c = c;
        f->lf.last_sharpness = -1;
        f->n_tc = s->n_tile_threads;
        f->tc = dav1d_alloc_aligned(sizeof(*f->tc) * s->n_tile_threads, 32);
        f->tc = dav1d_alloc_aligned(sizeof(*f->tc) * s->n_tile_threads, 64);
        if (!f->tc) goto error;
        memset(f->tc, 0, sizeof(*f->tc) * s->n_tile_threads);
        if (f->n_tc > 1) {

@ -512,7 +534,7 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
        free(f->lf.level);
        free(f->lf.tx_lpf_right_edge[0]);
        if (f->libaom_cm) dav1d_free_ref_mv_common(f->libaom_cm);
        dav1d_free_aligned(f->lf.cdef_line[0][0][0]);
        dav1d_free_aligned(f->lf.cdef_line_buf);
        dav1d_free_aligned(f->lf.lr_lpf_line[0]);
    }
    dav1d_free_aligned(c->fc);
@ -72,8 +72,8 @@ typedef struct Dav1dLoopRestorationDSPContext {
    selfguided_fn selfguided;
} Dav1dLoopRestorationDSPContext;

bitfn_decls(void dav1d_loop_restoration_dsp_init, Dav1dLoopRestorationDSPContext *c);
bitfn_decls(void dav1d_loop_restoration_dsp_init_arm, Dav1dLoopRestorationDSPContext *c);
bitfn_decls(void dav1d_loop_restoration_dsp_init, Dav1dLoopRestorationDSPContext *c, int bpc);
bitfn_decls(void dav1d_loop_restoration_dsp_init_arm, Dav1dLoopRestorationDSPContext *c, int bpc);
bitfn_decls(void dav1d_loop_restoration_dsp_init_x86, Dav1dLoopRestorationDSPContext *c);
bitfn_decls(void dav1d_loop_restoration_dsp_init_ppc, Dav1dLoopRestorationDSPContext *c);
@ -573,13 +573,13 @@ static void selfguided_c(pixel *p, const ptrdiff_t p_stride,
    }
}

COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c) {
COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c, int bpc) {
    c->wiener = wiener_c;
    c->selfguided = selfguided_c;

#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
    bitfn(dav1d_loop_restoration_dsp_init_arm)(c);
    bitfn(dav1d_loop_restoration_dsp_init_arm)(c, bpc);
#elif ARCH_PPC64LE
    bitfn(dav1d_loop_restoration_dsp_init_ppc)(c);
#elif ARCH_X86
@ -35,6 +35,7 @@ libdav1d_sources = files(
    'dequant_tables.c',
    'getbits.c',
    'intra_edge.c',
    'itx_1d.c',
    'lf_mask.c',
    'log.c',
    'msac.c',
@ -101,22 +102,47 @@ if is_asm_enabled
        )
        if host_machine.cpu_family() == 'aarch64'
            libdav1d_sources += files(
                'arm/64/cdef.S',
                'arm/64/ipred.S',
                'arm/64/itx.S',
                'arm/64/loopfilter.S',
                'arm/64/looprestoration.S',
                'arm/64/mc.S',
                'arm/64/looprestoration_common.S',
                'arm/64/msac.S',
            )

            if dav1d_bitdepths.contains('8')
                libdav1d_sources += files(
                    'arm/64/cdef.S',
                    'arm/64/ipred.S',
                    'arm/64/itx.S',
                    'arm/64/loopfilter.S',
                    'arm/64/looprestoration.S',
                    'arm/64/mc.S',
                )
            endif

            if dav1d_bitdepths.contains('16')
                libdav1d_sources += files(
                    'arm/64/cdef16.S',
                    'arm/64/loopfilter16.S',
                    'arm/64/looprestoration16.S',
                    'arm/64/mc16.S',
                )
            endif
        elif host_machine.cpu_family().startswith('arm')
            libdav1d_sources += files(
                'arm/32/cdef.S',
                'arm/32/ipred.S',
                'arm/32/loopfilter.S',
                'arm/32/looprestoration.S',
                'arm/32/mc.S',
            )

            if dav1d_bitdepths.contains('8')
                libdav1d_sources += files(
                    'arm/32/cdef.S',
                    'arm/32/ipred.S',
                    'arm/32/loopfilter.S',
                    'arm/32/looprestoration.S',
                    'arm/32/mc.S',
                )
            endif

            if dav1d_bitdepths.contains('16')
                libdav1d_sources += files(
                )
            endif
        endif
    elif host_machine.cpu_family().startswith('x86')
@ -124,6 +150,12 @@ if is_asm_enabled
|
|||
'x86/cpu.c',
|
||||
)
|
||||
|
||||
if host_machine.cpu_family() == 'x86_64'
|
||||
libdav1d_sources += files(
|
||||
'x86/msac_init.c',
|
||||
)
|
||||
endif
|
||||
|
||||
libdav1d_tmpl_sources += files(
|
||||
'x86/cdef_init_tmpl.c',
|
||||
'x86/film_grain_init_tmpl.c',
|
||||
|
@ -150,6 +182,7 @@ if is_asm_enabled
|
|||
'x86/looprestoration.asm',
|
||||
'x86/mc.asm',
|
||||
'x86/cdef_sse.asm',
|
||||
'x86/film_grain_ssse3.asm',
|
||||
'x86/ipred_ssse3.asm',
|
||||
'x86/itx_ssse3.asm',
|
||||
'x86/loopfilter_ssse3.asm',
|
||||
|
@ -278,6 +311,7 @@ libdav1d = library('dav1d',
|
|||
stdatomic_dependency,
|
||||
thread_dependency,
|
||||
thread_compat_dep,
|
||||
libdl_dependency,
|
||||
],
|
||||
c_args : [stackalign_flag, api_export_flags],
|
||||
version : dav1d_soname_version,
|
||||
|
@ -285,6 +319,10 @@ libdav1d = library('dav1d',
|
|||
install : true,
|
||||
)
|
||||
|
||||
dav1d_dep = declare_dependency(link_with: libdav1d,
|
||||
include_directories : include_directories('../include/dav1d')
|
||||
)
|
||||
|
||||
#
|
||||
# Generate pkg-config .pc file
|
||||
#
|
||||
|
|
|
@ -196,5 +196,12 @@ void dav1d_msac_init(MsacContext *const s, const uint8_t *const data,
s->rng = 0x8000;
s->cnt = -15;
s->allow_update_cdf = !disable_cdf_update_flag;

#if ARCH_X86_64 && HAVE_ASM
s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt_c;

dav1d_msac_init_x86(s);
#endif

ctx_refill(s);
}
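A compact model of the dispatch set up above: the C fallback is installed unconditionally, then dav1d_msac_init_x86() may overwrite the pointer. Only the symbol_adapt16 field name comes from the diff; the rest is illustrative:

    #include <stdint.h>
    #include <stddef.h>

    typedef struct Msac Msac;
    struct Msac {
        unsigned (*symbol_adapt16)(Msac *s, uint16_t *cdf, size_t n_symbols);
    };

    static unsigned adapt16_c(Msac *s, uint16_t *cdf, size_t n_symbols) {
        (void)s; (void)cdf; (void)n_symbols;
        return 0; /* stand-in for the scalar decoder */
    }

    static void msac_init_model(Msac *s) {
        s->symbol_adapt16 = adapt16_c;
        /* a CPU-specific init would swap in a SIMD implementation here
         * when the required extension is detected at runtime */
    }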
@ -31,7 +31,7 @@
#include <stdint.h>
#include <stdlib.h>

#include "common/attributes.h"
#include "common/intops.h"

typedef size_t ec_win;

@ -42,6 +42,10 @@ typedef struct MsacContext {
unsigned rng;
int cnt;
int allow_update_cdf;

#if ARCH_X86_64 && HAVE_ASM
unsigned (*symbol_adapt16)(struct MsacContext *s, uint16_t *cdf, size_t n_symbols);
#endif
} MsacContext;

#if HAVE_ASM
@ -917,10 +917,9 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
hdr->skip_mode_allowed = 0;
if (hdr->switchable_comp_refs && hdr->frame_type & 1 && seqhdr->order_hint) {
const unsigned poc = hdr->frame_offset;
unsigned off_before[2] = { 0xFFFFFFFF, 0xFFFFFFFF };
unsigned off_before = 0xFFFFFFFFU;
int off_after = -1;
int off_before_idx[2], off_after_idx;
off_before_idx[0] = 0;
int off_before_idx, off_after_idx;
for (int i = 0; i < 7; i++) {
if (!c->refs[hdr->refidx[i]].p.p.data[0]) return DAV1D_ERR(EINVAL);
const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset;

@ -933,36 +932,42 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
off_after = refpoc;
off_after_idx = i;
}
} else if (diff < 0) {
if (off_before[0] == 0xFFFFFFFFU ||
get_poc_diff(seqhdr->order_hint_n_bits,
refpoc, off_before[0]) > 0)
{
off_before[1] = off_before[0];
off_before[0] = refpoc;
off_before_idx[1] = off_before_idx[0];
off_before_idx[0] = i;
} else if (refpoc != off_before[0] &&
(off_before[1] == 0xFFFFFFFFU ||
get_poc_diff(seqhdr->order_hint_n_bits,
refpoc, off_before[1]) > 0))
{
off_before[1] = refpoc;
off_before_idx[1] = i;
}
} else if (diff < 0 && (off_before == 0xFFFFFFFFU ||
get_poc_diff(seqhdr->order_hint_n_bits,
refpoc, off_before) > 0))
{
off_before = refpoc;
off_before_idx = i;
}
}

if (off_before[0] != 0xFFFFFFFFU && off_after != -1) {
hdr->skip_mode_refs[0] = imin(off_before_idx[0], off_after_idx);
hdr->skip_mode_refs[1] = imax(off_before_idx[0], off_after_idx);
hdr->skip_mode_allowed = 1;
} else if (off_before[0] != 0xFFFFFFFFU &&
off_before[1] != 0xFFFFFFFFU)
{
hdr->skip_mode_refs[0] = imin(off_before_idx[0], off_before_idx[1]);
hdr->skip_mode_refs[1] = imax(off_before_idx[0], off_before_idx[1]);
if (off_before != 0xFFFFFFFFU && off_after != -1) {
hdr->skip_mode_refs[0] = imin(off_before_idx, off_after_idx);
hdr->skip_mode_refs[1] = imax(off_before_idx, off_after_idx);
hdr->skip_mode_allowed = 1;
} else if (off_before != 0xFFFFFFFFU) {
unsigned off_before2 = 0xFFFFFFFFU;
int off_before2_idx;
for (int i = 0; i < 7; i++) {
if (!c->refs[hdr->refidx[i]].p.p.data[0]) return DAV1D_ERR(EINVAL);
const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset;
if (get_poc_diff(seqhdr->order_hint_n_bits,
refpoc, off_before) < 0) {
if (off_before2 == 0xFFFFFFFFU ||
get_poc_diff(seqhdr->order_hint_n_bits,
refpoc, off_before2) > 0)
{
off_before2 = refpoc;
off_before2_idx = i;
}
}
}

if (off_before2 != 0xFFFFFFFFU) {
hdr->skip_mode_refs[0] = imin(off_before_idx, off_before2_idx);
hdr->skip_mode_refs[1] = imax(off_before_idx, off_before2_idx);
hdr->skip_mode_allowed = 1;
}
}
}
hdr->skip_mode_enabled = hdr->skip_mode_allowed ? dav1d_get_bits(gb, 1) : 0;
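A hedged C model of the selection logic after this rewrite, with poc_diff() standing in for get_poc_diff() and plain ints for order hints: take the nearest future reference plus the nearest past one; failing that, fall back to the two nearest past references, found in a second pass.

    /* Sketch: refpoc[] holds the 7 reference order hints, cur the current
     * frame's. Returns 1 and fills out[2] with the chosen ref indices. */
    static int poc_diff(int a, int b) { return a - b; }

    static int pick_skip_refs(int cur, const int refpoc[7], int out[2]) {
        int before_idx = -1, after_idx = -1;
        for (int i = 0; i < 7; i++) {
            const int d = poc_diff(refpoc[i], cur);
            if (d > 0 && (after_idx < 0 ||
                          poc_diff(refpoc[i], refpoc[after_idx]) < 0))
                after_idx = i;  /* nearest ref after cur */
            else if (d < 0 && (before_idx < 0 ||
                               poc_diff(refpoc[i], refpoc[before_idx]) > 0))
                before_idx = i; /* nearest ref before cur */
        }
        if (before_idx < 0) return 0;
        if (after_idx < 0) { /* second pass: next-nearest past ref */
            for (int i = 0; i < 7; i++)
                if (poc_diff(refpoc[i], refpoc[before_idx]) < 0 &&
                    (after_idx < 0 ||
                     poc_diff(refpoc[i], refpoc[after_idx]) > 0))
                    after_idx = i;
            if (after_idx < 0) return 0;
        }
        out[0] = before_idx < after_idx ? before_idx : after_idx;
        out[1] = before_idx < after_idx ? after_idx : before_idx;
        return 1;
    }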
@ -53,7 +53,7 @@ static inline i16x8 vconstrain(const i16x8 diff, const int16_t threshold,

static inline void copy4xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
const uint8_t *src, const ptrdiff_t src_stride,
const uint8_t (*left)[2], uint8_t *const top[2],
const uint8_t (*left)[2], const uint8_t *const top,
const int w, const int h,
const enum CdefEdgeFlags edges)
{

@ -70,8 +70,8 @@ static inline void copy4xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
l1 = fill;
y_start = 0;
} else {
l0 = u8h_to_u16(vec_vsx_ld(0, top[0] - 2));
l1 = u8h_to_u16(vec_vsx_ld(0, top[1] - 2));
l0 = u8h_to_u16(vec_vsx_ld(0, top + 0 * src_stride - 2));
l1 = u8h_to_u16(vec_vsx_ld(0, top + 1 * src_stride - 2));
}

vec_st(l0, 0, tmp - 2 * 8);

@ -115,7 +115,7 @@ static inline void copy4xN(uint16_t *tmp, const ptrdiff_t tmp_stride,

static inline void copy8xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
const uint8_t *src, const ptrdiff_t src_stride,
const uint8_t (*left)[2], uint8_t *const top[2],
const uint8_t (*left)[2], const uint8_t *const top,
const int w, const int h,
const enum CdefEdgeFlags edges)
{

@ -134,8 +134,8 @@ static inline void copy8xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
l1l = fill;
y_start = 0;
} else {
u8x16 l0 = vec_vsx_ld(0, top[0] - 2);
u8x16 l1 = vec_vsx_ld(0, top[1] - 2);
u8x16 l0 = vec_vsx_ld(0, top + 0 * src_stride - 2);
u8x16 l1 = vec_vsx_ld(0, top + 1 * src_stride - 2);
l0h = u8h_to_u16(l0);
l0l = u8l_to_u16(l0);
l1h = u8h_to_u16(l1);

@ -275,7 +275,7 @@ static inline i16x8 max_mask(i16x8 a, i16x8 b) {

static inline void
filter_4xN(pixel *dst, const ptrdiff_t dst_stride,
const pixel (*left)[2], /*const*/ pixel *const top[2],
const pixel (*left)[2], const pixel *const top,
const int w, const int h, const int pri_strength,
const int sec_strength, const int dir,
const int damping, const enum CdefEdgeFlags edges,

@ -364,7 +364,7 @@ filter_4xN(pixel *dst, const ptrdiff_t dst_stride,

static inline void
filter_8xN(pixel *dst, const ptrdiff_t dst_stride,
const pixel (*left)[2], /*const*/ pixel *const top[2],
const pixel (*left)[2], const pixel *const top,
const int w, const int h, const int pri_strength,
const int sec_strength, const int dir,
const int damping, const enum CdefEdgeFlags edges,

@ -456,7 +456,7 @@ filter_8xN(pixel *dst, const ptrdiff_t dst_stride,
static void cdef_filter_##w##x##h##_vsx(pixel *const dst, \
const ptrdiff_t dst_stride, \
const pixel (*left)[2], \
/*const*/ pixel *const top[2], \
const pixel *const top, \
const int pri_strength, \
const int sec_strength, \
const int dir, \
@ -70,10 +70,10 @@ static inline unsigned get_skip_ctx(const TxfmInfo *const t_dim,
const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
const int not_one_blk = b_dim[2] - (!!b_dim[2] && ss_hor) > t_dim->lw ||
b_dim[3] - (!!b_dim[3] && ss_ver) > t_dim->lh;
int ca, cl;
unsigned ca, cl;

#define MERGE_CTX(dir, type, mask) \
c##dir = !!((*(const type *) dir) & mask); \
#define MERGE_CTX(dir, type, no_val) \
c##dir = *(const type *) dir != no_val; \
break

switch (t_dim->lw) {

@ -83,17 +83,17 @@ static inline unsigned get_skip_ctx(const TxfmInfo *const t_dim,
* and will therefore complain about the use of uninitialized variables
* when compiled in debug mode if we put the default case at the end. */
default: assert(0); /* fall-through */
case TX_4X4: MERGE_CTX(a, uint8_t, 0x3F);
case TX_8X8: MERGE_CTX(a, uint16_t, 0x3F3F);
case TX_16X16: MERGE_CTX(a, uint32_t, 0x3F3F3F3FU);
case TX_32X32: MERGE_CTX(a, uint64_t, 0x3F3F3F3F3F3F3F3FULL);
case TX_4X4: MERGE_CTX(a, uint8_t, 0x40);
case TX_8X8: MERGE_CTX(a, uint16_t, 0x4040);
case TX_16X16: MERGE_CTX(a, uint32_t, 0x40404040U);
case TX_32X32: MERGE_CTX(a, uint64_t, 0x4040404040404040ULL);
}
switch (t_dim->lh) {
default: assert(0); /* fall-through */
case TX_4X4: MERGE_CTX(l, uint8_t, 0x3F);
case TX_8X8: MERGE_CTX(l, uint16_t, 0x3F3F);
case TX_16X16: MERGE_CTX(l, uint32_t, 0x3F3F3F3FU);
case TX_32X32: MERGE_CTX(l, uint64_t, 0x3F3F3F3F3F3F3F3FULL);
case TX_4X4: MERGE_CTX(l, uint8_t, 0x40);
case TX_8X8: MERGE_CTX(l, uint16_t, 0x4040);
case TX_16X16: MERGE_CTX(l, uint32_t, 0x40404040U);
case TX_32X32: MERGE_CTX(l, uint64_t, 0x4040404040404040ULL);
}
#undef MERGE_CTX
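A small model of the widened sentinel test introduced above, assuming 0x40 is a byte value that never occurs in live context entries (the diff encodes that assumption in its no_val constants):

    #include <stdint.h>
    #include <string.h>

    /* Test four context bytes at once: nonzero iff any byte differs from
     * the 0x40 sentinel. memcpy is the portable spelling of the
     * type-punned load the macro performs with a cast. */
    static unsigned any_ctx_set4(const uint8_t *dir) {
        uint32_t v;
        memcpy(&v, dir, sizeof(v));
        return v != 0x40404040U;
    }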
@ -352,13 +352,17 @@ static int decode_coefs(Dav1dTileContext *const t,
if (lossless) {
assert(t_dim->max == TX_4X4);
*txtp = WHT_WHT;
} else if (!f->frame_hdr->segmentation.qidx[b->seg_id] ||
t_dim->max + intra >= TX_64X64)
{
} else if (t_dim->max + intra >= TX_64X64) {
*txtp = DCT_DCT;
} else if (chroma) {
// inferred from either the luma txtp (inter) or a LUT (intra)
*txtp = intra ? dav1d_txtp_from_uvmode[b->uv_mode] :
get_uv_inter_txtp(t_dim, *txtp);
} else if (!f->frame_hdr->segmentation.qidx[b->seg_id]) {
// In libaom, lossless is checked by a literal qidx == 0, but not all
// such blocks are actually lossless. The remainder gets an implicit
// transform type (for luma)
*txtp = DCT_DCT;
} else {
unsigned idx;
if (intra) {
@ -1993,7 +1997,7 @@ void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
bytefn(dav1d_cdef_brow)(f, f->lf.p, f->lf.mask_ptr, sby * sbsz,
imin(sby * sbsz + n_blks, f->bh));
}
if (f->frame_hdr->super_res.enabled) {
if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) {
const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
@ -2094,10 +2094,7 @@ void dav1d_init_ref_mv_tile_row(AV1_COMMON *cm,

AV1_COMMON *dav1d_alloc_ref_mv_common(void);
AV1_COMMON *dav1d_alloc_ref_mv_common(void) {
AV1_COMMON *cm = malloc(sizeof(*cm));
if (!cm) return NULL;
memset(cm, 0, sizeof(*cm));
return cm;
return calloc(1, sizeof(AV1_COMMON));
}

void dav1d_free_ref_mv_common(AV1_COMMON *cm);
@ -397,6 +397,21 @@ const Dav1dWarpedMotionParams dav1d_default_wm_params = {
.delta = 0,
};

const int8_t dav1d_cdef_directions[2 + 8 + 2 /* dir */][2 /* pass */] = {
{  1 * 12 + 0,  2 * 12 + 0 }, // 6
{  1 * 12 + 0,  2 * 12 - 1 }, // 7
{ -1 * 12 + 1, -2 * 12 + 2 }, // 0
{  0 * 12 + 1, -1 * 12 + 2 }, // 1
{  0 * 12 + 1,  0 * 12 + 2 }, // 2
{  0 * 12 + 1,  1 * 12 + 2 }, // 3
{  1 * 12 + 1,  2 * 12 + 2 }, // 4
{  1 * 12 + 0,  2 * 12 + 1 }, // 5
{  1 * 12 + 0,  2 * 12 + 0 }, // 6
{  1 * 12 + 0,  2 * 12 - 1 }, // 7
{ -1 * 12 + 1, -2 * 12 + 2 }, // 0
{  0 * 12 + 1, -1 * 12 + 2 }, // 1
};

const int16_t dav1d_sgr_params[16][4] = { // r0, r1, e0, e1
{ 2, 1, 140, 3236 }, { 2, 1, 112, 2158 }, { 2, 1, 93, 1618 },
{ 2, 1,  80, 1438 }, { 2, 1,  70, 1295 }, { 2, 1, 58, 1177 },
@ -105,6 +105,8 @@ static const unsigned interintra_allowed_mask =

extern const Dav1dWarpedMotionParams dav1d_default_wm_params;

extern const int8_t dav1d_cdef_directions[12][2];

extern const int16_t dav1d_sgr_params[16][4];
extern const uint8_t dav1d_sgr_x_by_x[256];
@ -30,6 +30,7 @@

#if defined(_WIN32)

#include <limits.h>
#include <windows.h>

#define PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT

@ -72,8 +73,10 @@ static inline int pthread_attr_destroy(pthread_attr_t *const attr) {
}

static inline int pthread_attr_setstacksize(pthread_attr_t *const attr,
const unsigned stack_size)
const size_t stack_size)
{
attr->stack_size = stack_size;
if (stack_size > UINT_MAX) return 1;
attr->stack_size = (unsigned) stack_size;
return 0;
}
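A usage sketch of the stricter Win32 shim above, with the attr type reduced to its one relevant field; the bound check mirrors the hunk, since the stack size ultimately has to fit the unsigned value Win32 thread creation accepts:

    #include <limits.h>
    #include <stddef.h>

    typedef struct { unsigned stack_size; } attr_sketch_t;

    /* Fail loudly instead of silently truncating an oversized request. */
    static int setstacksize_sketch(attr_sketch_t *attr, size_t stack_size) {
        if (stack_size > UINT_MAX) return 1;
        attr->stack_size = (unsigned) stack_size;
        return 0;
    }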
(Diff for this file not shown because of its size.)
@ -28,20 +28,16 @@

#include "src/cpu.h"
#include "src/cdef.h"

decl_cdef_fn(dav1d_cdef_filter_8x8_avx2);
decl_cdef_fn(dav1d_cdef_filter_8x8_sse4);
decl_cdef_fn(dav1d_cdef_filter_8x8_ssse3);
decl_cdef_fn(dav1d_cdef_filter_8x8_sse2);
#define decl_cdef_size_fn(sz) \
decl_cdef_fn(dav1d_cdef_filter_##sz##_avx512icl); \
decl_cdef_fn(dav1d_cdef_filter_##sz##_avx2); \
decl_cdef_fn(dav1d_cdef_filter_##sz##_sse4); \
decl_cdef_fn(dav1d_cdef_filter_##sz##_ssse3); \
decl_cdef_fn(dav1d_cdef_filter_##sz##_sse2)

decl_cdef_fn(dav1d_cdef_filter_4x8_avx2);
decl_cdef_fn(dav1d_cdef_filter_4x8_sse4);
decl_cdef_fn(dav1d_cdef_filter_4x8_ssse3);
decl_cdef_fn(dav1d_cdef_filter_4x8_sse2);

decl_cdef_fn(dav1d_cdef_filter_4x4_avx2);
decl_cdef_fn(dav1d_cdef_filter_4x4_sse4);
decl_cdef_fn(dav1d_cdef_filter_4x4_ssse3);
decl_cdef_fn(dav1d_cdef_filter_4x4_sse2);
decl_cdef_size_fn(4x4);
decl_cdef_size_fn(4x8);
decl_cdef_size_fn(8x8);

decl_cdef_dir_fn(dav1d_cdef_dir_avx2);
decl_cdef_dir_fn(dav1d_cdef_dir_sse4);

@ -76,12 +72,21 @@ COLD void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
c->fb[2] = dav1d_cdef_filter_4x4_sse4;
#endif

#if ARCH_X86_64
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;

#if BITDEPTH == 8 && ARCH_X86_64
#if BITDEPTH == 8
c->dir = dav1d_cdef_dir_avx2;
c->fb[0] = dav1d_cdef_filter_8x8_avx2;
c->fb[1] = dav1d_cdef_filter_4x8_avx2;
c->fb[2] = dav1d_cdef_filter_4x4_avx2;
#endif

if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;

#if BITDEPTH == 8
c->fb[2] = dav1d_cdef_filter_4x4_avx512icl;
#endif

#endif
}
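For reference, decl_cdef_size_fn(4x4) from the hunk above expands (written out by hand) to the per-ISA declarations it replaces:

    /* Hand-expanded decl_cdef_size_fn(4x4): */
    decl_cdef_fn(dav1d_cdef_filter_4x4_avx512icl);
    decl_cdef_fn(dav1d_cdef_filter_4x4_avx2);
    decl_cdef_fn(dav1d_cdef_filter_4x4_sse4);
    decl_cdef_fn(dav1d_cdef_filter_4x4_ssse3);
    decl_cdef_fn(dav1d_cdef_filter_4x4_sse2);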
@ -364,26 +364,19 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*%3, \
.body_done:

; top
%if ARCH_X86_64
DEFINE_ARGS dst, stride, left, top2, pri, sec, stride3, top1, edge
%else
DEFINE_ARGS dst, stride, left, top2, stride3, top1, edge
%endif
LOAD_ARG32 top
test edged, 4 ; have_top
jz .no_top
mov top1q, [top2q+0*gprsize]
mov top2q, [top2q+1*gprsize]
test edged, 1 ; have_left
jz .top_no_left
test edged, 2 ; have_right
jz .top_no_right
%if %1 == 4
PMOVZXBW m0, [top1q-2]
PMOVZXBW m1, [top2q-2]
PMOVZXBW m0, [topq+strideq*0-2]
PMOVZXBW m1, [topq+strideq*1-2]
%else
movu m0, [top1q-4]
movu m1, [top2q-4]
movu m0, [topq+strideq*0-4]
movu m1, [topq+strideq*1-4]
punpckhbw m2, m0, m15
punpcklbw m0, m15
punpckhbw m3, m1, m15

@ -396,13 +389,13 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*%3, \
jmp .top_done
.top_no_right:
%if %1 == 4
PMOVZXBW m0, [top1q-%1]
PMOVZXBW m1, [top2q-%1]
PMOVZXBW m0, [topq+strideq*0-%1]
PMOVZXBW m1, [topq+strideq*1-%1]
movu [px-2*%3-4*2], m0
movu [px-1*%3-4*2], m1
%else
movu m0, [top1q-%1]
movu m1, [top2q-%2]
movu m0, [topq+strideq*0-%1]
movu m1, [topq+strideq*1-%2]
punpckhbw m2, m0, m15
punpcklbw m0, m15
punpckhbw m3, m1, m15

@ -419,11 +412,11 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*%3, \
test edged, 2 ; have_right
jz .top_no_left_right
%if %1 == 4
PMOVZXBW m0, [top1q]
PMOVZXBW m1, [top2q]
PMOVZXBW m0, [topq+strideq*0]
PMOVZXBW m1, [topq+strideq*1]
%else
movu m0, [top1q]
movu m1, [top2q]
movu m0, [topq+strideq*0]
movu m1, [topq+strideq*1]
punpckhbw m2, m0, m15
punpcklbw m0, m15
punpckhbw m3, m1, m15

@ -437,8 +430,8 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*%3, \
mov dword [px-1*%3-4], OUT_OF_BOUNDS
jmp .top_done
.top_no_left_right:
PMOVZXBW m0, [top1q], %1 == 4
PMOVZXBW m1, [top2q], %1 == 4
PMOVZXBW m0, [topq+strideq*0], %1 == 4
PMOVZXBW m1, [topq+strideq*1], %1 == 4
mova [px-2*%3], m0
mova [px-1*%3], m1
mov dword [px-2*%3+%1*2], OUT_OF_BOUNDS

@ -630,9 +623,9 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*%3, \
sub secdmpd, dampingd
xor dampingd, dampingd
neg pridmpd
cmovl pridmpd, dampingd
cmovs pridmpd, dampingd
neg secdmpd
cmovl secdmpd, dampingd
cmovs secdmpd, dampingd
%if ARCH_X86_64
mov [rsp+ 0], pridmpq ; pri_shift
mov [rsp+16], secdmpq ; sec_shift
@ -33,37 +33,44 @@

#include "src/x86/cpu.h"

void dav1d_cpu_cpuid(uint32_t *info, int leaf);
uint64_t dav1d_cpu_xgetbv(int xcr);
typedef struct {
uint32_t eax, ebx, ecx, edx;
} CpuidRegisters;

void dav1d_cpu_cpuid(CpuidRegisters *regs, unsigned leaf, unsigned subleaf);
uint64_t dav1d_cpu_xgetbv(unsigned xcr);

#define X(reg, mask) (((reg) & (mask)) == (mask))

COLD unsigned dav1d_get_cpu_flags_x86(void) {
uint32_t info[4] = {0}, n_ids;
CpuidRegisters r = { 0 };
dav1d_cpu_cpuid(&r, 0, 0);
const unsigned max_leaf = r.eax;
unsigned flags = 0;

dav1d_cpu_cpuid(info, 0);
n_ids = info[0];

if (n_ids >= 1) {
dav1d_cpu_cpuid(info, 1);
if (info[3] & (1 << 25)) flags |= DAV1D_X86_CPU_FLAG_SSE;
if (info[3] & (1 << 26)) flags |= DAV1D_X86_CPU_FLAG_SSE2;
if (info[2] & (1 << 0)) flags |= DAV1D_X86_CPU_FLAG_SSE3;
if (info[2] & (1 << 9)) flags |= DAV1D_X86_CPU_FLAG_SSSE3;
if (info[2] & (1 << 19)) flags |= DAV1D_X86_CPU_FLAG_SSE41;
if (info[2] & (1 << 20)) flags |= DAV1D_X86_CPU_FLAG_SSE42;
if (max_leaf >= 1) {
dav1d_cpu_cpuid(&r, 1, 0);
if (X(r.edx, 0x06008000)) /* CMOV/SSE/SSE2 */ {
flags |= DAV1D_X86_CPU_FLAG_SSE2;
if (X(r.ecx, 0x00000201)) /* SSE3/SSSE3 */ {
flags |= DAV1D_X86_CPU_FLAG_SSSE3;
if (X(r.ecx, 0x00080000)) /* SSE4.1 */
flags |= DAV1D_X86_CPU_FLAG_SSE41;
}
}
#if ARCH_X86_64
/* We only support >128-bit SIMD on x86-64. */
if (info[2] & (1 << 27)) /* OSXSAVE */ {
uint64_t xcr = dav1d_cpu_xgetbv(0);
if ((xcr & 0x00000006) == 0x00000006) /* XMM/YMM */ {
if (info[2] & (1 << 28)) flags |= DAV1D_X86_CPU_FLAG_AVX;
if (n_ids >= 7) {
dav1d_cpu_cpuid(info, 7);
if ((info[1] & 0x00000128) == 0x00000128)
if (X(r.ecx, 0x18000000)) /* OSXSAVE/AVX */ {
const uint64_t xcr0 = dav1d_cpu_xgetbv(0);
if (X(xcr0, 0x00000006)) /* XMM/YMM */ {
if (max_leaf >= 7) {
dav1d_cpu_cpuid(&r, 7, 0);
if (X(r.ebx, 0x00000128)) /* BMI1/BMI2/AVX2 */ {
flags |= DAV1D_X86_CPU_FLAG_AVX2;
if ((xcr & 0x000000e0) == 0x000000e0) /* ZMM/OPMASK */ {
if ((info[1] & 0xd0030000) == 0xd0030000)
flags |= DAV1D_X86_CPU_FLAG_AVX512;
if (X(xcr0, 0x000000e0)) /* ZMM/OPMASK */ {
if (X(r.ebx, 0xd0230000) && X(r.ecx, 0x00005f42))
flags |= DAV1D_X86_CPU_FLAG_AVX512ICL;
}
}
}
}
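A minimal, runnable model of the X() all-bits-set test used above; the mask shown is the real leaf-1 EDX combination from the hunk (CMOV bit 15, SSE bit 25, SSE2 bit 26):

    #include <stdint.h>

    #define X(reg, mask) (((reg) & (mask)) == (mask))

    int main(void) {
        const uint32_t edx = (1u << 15) | (1u << 25) | (1u << 26);
        /* passes only when every bit in the mask is set */
        return X(edx, 0x06008000u) ? 0 : 1;
    }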
@ -29,15 +29,12 @@
#define DAV1D_SRC_X86_CPU_H

enum CpuFlags {
DAV1D_X86_CPU_FLAG_SSE = 1 << 0,
DAV1D_X86_CPU_FLAG_SSE2 = 1 << 1,
DAV1D_X86_CPU_FLAG_SSE3 = 1 << 2,
DAV1D_X86_CPU_FLAG_SSSE3 = 1 << 3,
DAV1D_X86_CPU_FLAG_SSE41 = 1 << 4,
DAV1D_X86_CPU_FLAG_SSE42 = 1 << 5,
DAV1D_X86_CPU_FLAG_AVX = 1 << 6,
DAV1D_X86_CPU_FLAG_AVX2 = 1 << 7,
DAV1D_X86_CPU_FLAG_AVX512 = 1 << 8, /* F + CD + BW + DQ + VL */
DAV1D_X86_CPU_FLAG_SSE2 = 1 << 0,
DAV1D_X86_CPU_FLAG_SSSE3 = 1 << 1,
DAV1D_X86_CPU_FLAG_SSE41 = 1 << 2,
DAV1D_X86_CPU_FLAG_AVX2 = 1 << 3,
DAV1D_X86_CPU_FLAG_AVX512ICL = 1 << 4, /* F/CD/BW/DQ/VL/VNNI/IFMA/VBMI/VBMI2/
* VPOPCNTDQ/BITALG/GFNI/VAES/VPCLMULQDQ */
};

unsigned dav1d_get_cpu_flags_x86(void);
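The trimmed enum pairs with the tiered early-return style visible in the init functions elsewhere in this patch; a sketch of that pattern (kernel installation elided, and the tier ordering is an assumption of this sketch rather than a documented dav1d guarantee):

    enum { SSSE3 = 1 << 1, AVX2 = 1 << 3, AVX512ICL = 1 << 4 };

    static void dsp_init_sketch(unsigned flags) {
        if (!(flags & SSSE3)) return;
        /* ... install SSSE3 kernels ... */
        if (!(flags & AVX2)) return;
        /* ... install AVX2 kernels ... */
        if (!(flags & AVX512ICL)) return;
        /* ... install AVX-512 ICL kernels ... */
    }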
@ -27,12 +27,12 @@

SECTION .text

cglobal cpu_cpuid, 0, 5, 0, info, leaf
mov r4, infomp
cglobal cpu_cpuid, 0, 5, 0, regs, leaf, subleaf
mov r4, regsmp
mov eax, leafm
xor ecx, ecx
mov ecx, subleafm
%if ARCH_X86_64
push rbx
mov r5, rbx
%endif
cpuid
mov [r4+4*0], eax

@ -40,7 +40,7 @@ cglobal cpu_cpuid, 0, 5, 0, info, leaf
mov [r4+4*2], ecx
mov [r4+4*3], edx
%if ARCH_X86_64
pop rbx
mov rbx, r5
%endif
RET
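From the C side, the new subleaf argument matters because CPUID leaf 7 returns its base feature bits only when ECX is 0. A sketch using the prototype from the header change above (linking against dav1d's asm helper is assumed):

    #include <stdint.h>

    typedef struct { uint32_t eax, ebx, ecx, edx; } CpuidRegisters;
    void dav1d_cpu_cpuid(CpuidRegisters *regs, unsigned leaf, unsigned subleaf);

    static uint32_t leaf7_ebx(void) {
        CpuidRegisters r = { 0 };
        dav1d_cpu_cpuid(&r, 7, 0); /* subleaf 0: base feature flags */
        return r.ebx;              /* BMI1/BMI2/AVX2/AVX-512 bits */
    }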
@ -44,6 +44,7 @@ round_vals: dw 32, 64, 128, 256, 512
max: dw 255, 240, 235
min: dw 0, 16
pb_27_17_17_27: db 27, 17, 17, 27
pw_1: dw 1

%macro JMP_TABLE 1-*
%xdefine %1_table %%table

@ -56,6 +57,7 @@ pb_27_17_17_27: db 27, 17, 17, 27
%endrep
%endmacro

ALIGN 4
JMP_TABLE generate_grain_y_avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420_avx2, 0, 1, 2, 3

@ -69,8 +71,8 @@ struc FGData
.scaling_shift: resd 1
.ar_coeff_lag: resd 1
.ar_coeffs_y: resb 24
.ar_coeffs_uv: resb 2 * 26 ; includes padding
.ar_coeff_shift: resd 1
.ar_coeffs_uv: resb 2 * 28 ; includes padding
.ar_coeff_shift: resq 1
.grain_scale_shift: resd 1
.uv_mult: resd 2
.uv_luma_mult: resd 2

@ -169,9 +171,9 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
movsx val0d, byte [bufq+xq]
add val3d, val0d
cmp val3d, maxd
cmovg val3d, maxd
cmovns val3d, maxd
cmp val3d, mind
cmovl val3d, mind
cmovs val3d, mind
mov byte [bufq+xq], val3b
; keep val3d in-place as left for next x iteration
inc xq

@ -190,18 +192,19 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
.ar2:
DEFINE_ARGS buf, fg_data, shift
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
movd xm14, [base+hmul_bits-10+shiftq*2]
vpbroadcastw xm14, [base+round_vals-12+shiftq*2]
movq xm15, [base+byte_blend+1]
pmovsxbw xm8, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7
movd xm9, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11
pmovsxbw xm9, xm9
DEFINE_ARGS buf, h, x
DEFINE_ARGS buf, fg_data, h, x
pshufd xm12, xm9, q0000
pshufd xm13, xm9, q1111
pshufd xm11, xm8, q3333
pshufd xm10, xm8, q2222
pshufd xm9, xm8, q1111
pshufd xm8, xm8, q0000
pmovzxwd xm14, xm14
sub bufq, 82*73-(82*3+79)
mov hd, 70
.y_loop_ar2:

@ -233,6 +236,7 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
paddd xm4, xm6
paddd xm2, xm7
paddd xm2, xm4
paddd xm2, xm14

movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5]
.x_loop_ar2_inner:

@ -241,9 +245,8 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
paddd xm3, xm2
psrldq xm1, 4 ; y=0,x=0
psrldq xm2, 4 ; shift top to next pixel
psrad xm3, 5
packssdw xm3, xm3
pmulhrsw xm3, xm14
psrad xm3, [fg_dataq+FGData.ar_coeff_shift]
; don't packssdw since we only care about one value
paddw xm3, xm1
packsswb xm3, xm3
pextrb [bufq+xq], xm3, 0

@ -274,7 +277,7 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
ALLOC_STACK 16*12
%endif
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
movd xm14, [base+hmul_bits-10+shiftq*2]
vpbroadcastw xm14, [base+round_vals-12+shiftq*2]
movq xm15, [base+byte_blend]
pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-7
pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_y+ 8] ; cf8-15

@ -288,10 +291,11 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
pshufd xm8, xm1, q3333
pshufd xm1, xm1, q0000
pshufd xm3, xm2, q1111
psrldq xm13, xm2, 10
pinsrw xm2, [pw_1], 5
pshufd xm4, xm2, q2222
psrldq xm5, xm2, 10
pshufd xm2, xm2, q0000
pinsrw xm5, [base+round_vals+shiftq*2-10], 3
pinsrw xm13, [base+round_vals+shiftq*2-10], 3
mova [rsp+ 0*16], xm0
mova [rsp+ 1*16], xm9
mova [rsp+ 2*16], xm10

@ -303,9 +307,7 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
mova [rsp+ 8*16], xm2
mova [rsp+ 9*16], xm3
mova [rsp+10*16], xm4
mova [rsp+11*16], xm5
pxor xm13, xm13
DEFINE_ARGS buf, h, x
DEFINE_ARGS buf, fg_data, h, x
sub bufq, 82*73-(82*3+79)
mov hd, 70
.y_loop_ar3:

@ -374,7 +376,7 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data

punpcklwd xm6, xm7
punpcklwd xm8, xm9
punpcklwd xm5, xm13
punpcklwd xm5, xm14
pmaddwd xm6, [rsp+ 8*16]
pmaddwd xm8, [rsp+ 9*16]
pmaddwd xm5, [rsp+10*16]

@ -385,14 +387,13 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4]
.x_loop_ar3_inner:
pmovsxbw xm2, xm1
pmaddwd xm2, [rsp+16*11]
pmaddwd xm2, xm13
pshufd xm3, xm2, q1111
paddd xm2, xm3 ; left+cur
paddd xm2, xm0 ; add top
psrldq xm0, 4
psrad xm2, 5
packssdw xm2, xm2
pmulhrsw xm2, xm14
psrad xm2, [fg_dataq+FGData.ar_coeff_shift]
; don't packssdw since we only care about one value
packsswb xm2, xm2
pextrb [bufq+xq], xm2, 0
pslldq xm2, 3

@ -468,7 +469,7 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
.ar0:
INIT_YMM avx2
DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
imul uvd, 25
imul uvd, 28
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
movd xm3, [base+hmul_bits+shiftq*2]

@ -538,7 +539,7 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
.ar1:
INIT_XMM avx2
DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift
imul uvd, 25
imul uvd, 28
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]

@ -584,9 +585,9 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
movsx val0d, byte [bufq+xq]
add val3d, val0d
cmp val3d, maxd
cmovg val3d, maxd
cmovns val3d, maxd
cmp val3d, mind
cmovl val3d, mind
cmovs val3d, mind
mov byte [bufq+xq], val3b
; keep val3d in-place as left for next x iteration
inc xq

@ -605,18 +606,17 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
.ar2:
DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
imul uvd, 25
movd xm15, [base+hmul_bits-10+shiftq*2]
imul uvd, 28
vpbroadcastw xm15, [base+round_vals-12+shiftq*2]
pmovsxbw xm8, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-7
pmovsxbw xm9, [fg_dataq+FGData.ar_coeffs_uv+uvq+8] ; cf8-12
pinsrw xm9, [base+pw_1], 5
vpbroadcastw xm7, [base+hmul_bits+4]
vpbroadcastd xm6, [base+pb_1]
DEFINE_ARGS buf, bufy, h, x
DEFINE_ARGS buf, bufy, fg_data, h, unused, x
pshufd xm12, xm9, q0000
pshufd xm13, xm9, q1111
pshufd xm14, xm9, q2222
pxor xm10, xm10
vpblendw xm14, xm10, 10101010b
pshufd xm11, xm8, q3333
pshufd xm10, xm8, q2222
pshufd xm9, xm8, q1111

@ -660,7 +660,7 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
pmaddubsw xm3, xm6, xm3
paddw xm0, xm3
pmulhrsw xm0, xm7
punpcklwd xm0, xm0
punpcklwd xm0, xm15
pmaddwd xm0, xm14
paddd xm2, xm0

@ -670,9 +670,7 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
pmaddwd xm3, xm0, xm13
paddd xm3, xm2
psrldq xm2, 4 ; shift top to next pixel
psrad xm3, 5
packssdw xm3, xm3
pmulhrsw xm3, xm15
psrad xm3, [fg_dataq+FGData.ar_coeff_shift]
pslldq xm3, 2
psrldq xm0, 2
paddw xm3, xm0

@ -698,8 +696,8 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
%assign stack_size_padded (stack_size_padded+16*12)
%assign stack_size (stack_size+16*12)
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
imul uvd, 25
movd xm14, [base+hmul_bits-10+shiftq*2]
imul uvd, 28
vpbroadcastw xm14, [base+round_vals-12+shiftq*2]
pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-7
pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 8] ; cf8-15
pmovsxbw xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-23

@ -719,6 +717,7 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
psrldq xm5, xm2, 10
pshufd xm2, xm2, q0000
pinsrw xm5, [base+round_vals+shiftq*2-10], 3
pmovzxwd xm14, xm14
mova [rsp+ 0*16], xm0
mova [rsp+ 1*16], xm9
mova [rsp+ 2*16], xm10

@ -733,7 +732,7 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
mova [rsp+11*16], xm5
vpbroadcastd xm13, [base+pb_1]
vpbroadcastw xm15, [base+hmul_bits+4]
DEFINE_ARGS buf, bufy, h, x
DEFINE_ARGS buf, bufy, fg_data, h, unused, x
sub bufq, 82*38+44-(82*3+41)
add bufyq, 79+82*3
mov hd, 35

@ -817,6 +816,7 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
paddd xm0, xm6
paddd xm8, xm5
paddd xm0, xm8
paddd xm0, xm14

movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4]
.x_loop_ar3_inner:

@ -826,9 +826,8 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
paddd xm2, xm3 ; left+cur
paddd xm2, xm0 ; add top
psrldq xm0, 4
psrad xm2, 5
packssdw xm2, xm2
pmulhrsw xm2, xm14
psrad xm2, [fg_dataq+FGData.ar_coeff_shift]
; don't packssdw, we only care about one value
pslldq xm2, 6
vpblendw xm1, xm2, 1000b
packsswb xm1, xm1
@ -28,6 +28,11 @@
#include "src/cpu.h"
#include "src/film_grain.h"

decl_generate_grain_y_fn(dav1d_generate_grain_y_ssse3);
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_ssse3);
decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_ssse3);
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_ssse3);

decl_generate_grain_y_fn(dav1d_generate_grain_y_avx2);
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_avx2);
decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_avx2);

@ -36,6 +41,15 @@ decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_avx2);
COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();

if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;

#if BITDEPTH == 8
c->generate_grain_y = dav1d_generate_grain_y_ssse3;
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_generate_grain_uv_420_ssse3;
c->fgy_32x32xn = dav1d_fgy_32x32xn_ssse3;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_ssse3;
#endif

if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;

#if BITDEPTH == 8 && ARCH_X86_64
(Diff for this file not shown because of its size.)
@ -308,7 +308,7 @@ ALIGN function_align
mov r6d, 0x5556
mov r2d, 0x3334
cmp hd, 32
cmovz r6d, r2d
cmove r6d, r2d
movd xm1, r6d
pmulhuw xm0, xm1
.w8_end:

@ -1441,7 +1441,7 @@ ALIGN function_align
mov r3d, 9
mov tlq, rsp
cmp hd, 4
cmova maxbased, r3d
cmovne maxbased, r3d
vextracti128 xm1, m0, 1
packuswb xm0, xm1
mova [tlq], xm0

@ -1628,8 +1628,8 @@ ALIGN function_align
sar r5d, 1
mov tlq, rsp
add r5d, 17 ; w*2 + (filter_strength == 3)
cmp hd, 8
cmova maxbased, r5d
cmp hd, 16
cmovns maxbased, r5d
mov [tlq+r5], r3b
vextracti128 xm0, m1, 1
packuswb xm0, xm1

@ -1745,8 +1745,8 @@ ALIGN function_align
sar r5d, 1
mov tlq, rsp
add r5d, 33
cmp hd, 16
cmova maxbased, r5d
cmp hd, 32
cmovns maxbased, r5d
mov [tlq+r5], r3b
packuswb m0, m1
vpermq m0, m0, q3120

@ -1812,7 +1812,7 @@ ALIGN function_align
lea r3d, [hq+31]
mov maxbased, 63
cmp hd, 32
cmovb maxbased, r3d
cmovs maxbased, r3d
test angled, 0x400 ; !enable_intra_edge_filter
jnz .w32_main
vbroadcasti128 m0, [pb_0to15]

@ -1889,8 +1889,8 @@ ALIGN function_align
mov tlq, rsp
mov [tlq+65], r3b
mov r3d, 65
cmp hd, 32
cmova maxbased, r3d
cmp hd, 64
cmove maxbased, r3d
packuswb m0, m2
packuswb m1, m6
mova [tlq+ 0], m0

@ -2294,7 +2294,7 @@ ALIGN function_align
cmp hd, 16
movu xm2, [rsp+49]
vinserti128 m2, [rsp+43], 1
cmovl r5d, hd
cmovs r5d, hd
xor r5d, 15 ; h == 16 ? 5 : 15 - h
movd xm0, r5d
vbroadcasti128 m1, [base+z_filter_s+12]

@ -2501,7 +2501,7 @@ ALIGN function_align
.w8_filter_left_h16:
mov r5d, 10
cmp hd, 16
cmovl r5d, hd
cmovs r5d, hd
xor r5d, 15 ; h == 16 ? 5 : 15 - h
movd xm0, r5d
vpbroadcastb m0, xm0

@ -2742,7 +2742,7 @@ ALIGN function_align
.w16_filter_left_h16:
mov r5d, 10
cmp hd, 16
cmovl r5d, hd
cmovs r5d, hd
xor r5d, 15 ; h == 16 ? 5 : 15 - h
movd xm0, r5d
vpbroadcastb m0, xm0

@ -3115,7 +3115,7 @@ ALIGN function_align
mov r4d, 9
lea tlq, [rsp+15]
cmp wd, 4
cmova maxbased, r4d
cmovne maxbased, r4d
vextracti128 xm1, m0, 1
packuswb xm0, xm1
mova [rsp], xm0

@ -3321,8 +3321,8 @@ ALIGN function_align
sar r5d, 1
lea tlq, [rsp+31]
add r5d, 17
cmp wd, 8
cmova maxbased, r5d
cmp wd, 16
cmovns maxbased, r5d
neg r5
mov [tlq+r5], r4b
vextracti128 xm1, m0, 1

@ -3385,7 +3385,7 @@ ALIGN function_align
sub org_wd, 8
lea r2, [strideq*3]
lea r6, [dstq+org_wq]
cmovg dstq, r6
cmovns dstq, r6
punpcklwd xm1, xm2, xm0
punpckhwd xm2, xm0
lea r6, [dstq+strideq*4]

@ -3493,8 +3493,8 @@ ALIGN function_align
sar r5d, 1
lea tlq, [rsp+63]
add r5d, 33
cmp wd, 16
cmova maxbased, r5d
cmp wd, 32
cmovns maxbased, r5d
neg r5
mov [tlq+r5], r4b
packuswb m0, m1

@ -3563,7 +3563,7 @@ ALIGN function_align
sub org_wd, 8
lea r2, [strideq*3]
lea r6, [dstq+org_wq]
cmovg dstq, r6
cmovns dstq, r6
punpcklbw m1, m2, m0
punpckhbw m2, m0
lea r3, [strideq*5]

@ -3652,7 +3652,7 @@ ALIGN function_align
movu xm11, [tlq-66] ; 56-63
vinserti128 m11, [tlq-52], 1 ; 40-47
sub r4d, wd ; 21-w
cmovg r5d, r4d
cmovns r5d, r4d
movu xm12, [tlq-58] ; 48-55
vinserti128 m12, [tlq-44], 1 ; 32-39
sub r4d, 8 ; 13-w

@ -3721,8 +3721,8 @@ ALIGN function_align
lea tlq, [rsp+95]
mov [tlq-65], r4b
mov r4d, 65
cmp wd, 32
cmova maxbased, r4d
cmp wd, 64
cmove maxbased, r4d
packuswb m0, m2
packuswb m1, m6
mova [tlq-63], m0

@ -4553,7 +4553,7 @@ ALIGN function_align
mov r6d, 0x5556
mov r2d, 0x3334
cmp hd, 32
cmovz r6d, r2d
cmove r6d, r2d
movd xm1, r6d
pmulhuw xm0, xm1
.w8_end:
@ -60,7 +60,6 @@ pw_16384: times 2 dw 16384
pw_1697x16: times 2 dw 1697*16
pw_1697x8: times 2 dw 1697*8
pw_2896x8: times 2 dw 2896*8
pw_5793x4: times 2 dw 5793*4

pd_2048: dd 2048

@ -393,7 +392,7 @@ ALIGN function_align
pmulhrsw m0, [cq]
vpbroadcastd m1, [o(pw_1697x8)]
pmulhrsw m1, m0
paddw m0, m1
paddsw m0, m1
punpcklwd m0, m0
punpckhdq m1, m0, m0
punpckldq m0, m0

@ -405,7 +404,7 @@ ALIGN function_align
vpbroadcastd m2, [o(pw_2896x8)]
packusdw m0, m0
pmulhrsw m1, m0
paddw m0, m1
paddsw m0, m1
pmulhrsw m0, m2
mova m1, m0
jmp m(iadst_4x4_internal).end

@ -561,8 +560,8 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
vpbroadcastd m3, [o(pw_1697x8)]
pmulhrsw m2, m3, m0
pmulhrsw m3, m1
paddw m0, m2
paddw m1, m3
paddsw m0, m2
paddsw m1, m3
punpckhwd m2, m0, m1
punpcklwd m0, m1
punpckhwd m1, m0, m2

@ -572,8 +571,8 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
vpbroadcastd m3, [o(pw_1697x8)]
pmulhrsw m2, m3, m0
pmulhrsw m3, m1
paddw m0, m2
paddw m1, m3
paddsw m0, m2
paddsw m1, m3
jmp m(iadst_4x4_internal).end

%macro WRITE_4X8 2 ; coefs[1-2]

@ -626,7 +625,7 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
punpckldq xm0, xm1
pmulhrsw xm0, xm2
pmulhrsw xm3, xm0
paddw xm0, xm3
paddsw xm0, xm3
pmulhrsw xm0, xm2
pmulhrsw xm0, xm4
vpbroadcastq m0, xm0

@ -907,8 +906,8 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
punpckhwd m1, m2
pmulhrsw m2, m4, m0
pmulhrsw m4, m1
paddw m0, m2
paddw m1, m4
paddsw m0, m2
paddsw m1, m4
jmp tx2q
.pass2:
vpbroadcastd m4, [o(pw_4096)]

@ -925,8 +924,8 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpbroadcastd m3, [o(pw_2048)]
pmulhrsw m0, m1
pmulhrsw m2, m0
paddw m0, m0
paddw m0, m2
paddsw m0, m0
paddsw m0, m2
pmulhrsw m3, m0
punpcklwd m1, m3, m3
punpckhwd m3, m3

@ -941,15 +940,16 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
movd xm1, [cq+32*2]
punpcklwd xm1, [cq+32*3]
vpbroadcastd xm2, [o(pw_1697x8)]
vpbroadcastd xm3, [o(pw_16384)]
vpbroadcastd xm4, [o(pw_2896x8)]
vpbroadcastd xm3, [o(pw_2896x8)]
vpbroadcastd xm4, [o(pw_2048)]
punpckldq xm0, xm1
pcmpeqw xm1, xm1
pmulhrsw xm2, xm0
paddw xm0, xm2
pcmpeqw xm1, xm0
pxor xm0, xm1
pavgw xm0, xm2
pmulhrsw xm0, xm3
psrlw xm3, 3 ; pw_2048
pmulhrsw xm0, xm4
pmulhrsw xm0, xm3
vpbroadcastq m0, xm0
mova m1, m0
mova m2, m0

@ -1283,26 +1283,33 @@ cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
mova m3, [cq+32*0]
mova m2, [cq+32*1]
mova m4, [cq+32*2]
mova m0, [cq+32*3]
vpbroadcastd m5, [o(pw_1697x8)]
mova m5, [cq+32*3]
vpbroadcastd m8, [o(pw_1697x8)]
pcmpeqw m0, m0 ; -1
punpcklwd m1, m3, m2
punpckhwd m3, m2
punpcklwd m2, m4, m0
punpckhwd m4, m0
pmulhrsw m0, m5, m1
pmulhrsw m6, m5, m2
pmulhrsw m7, m5, m3
pmulhrsw m5, m4
paddw m1, m0
paddw m2, m6
paddw m3, m7
paddw m4, m5
vpbroadcastd m5, [o(pw_16384)]
punpcklwd m2, m4, m5
punpckhwd m4, m5
pmulhrsw m5, m8, m1
pmulhrsw m6, m8, m2
pmulhrsw m7, m8, m3
pmulhrsw m8, m4
pcmpeqw m9, m0, m1 ; we want to do a signed avg, but pavgw is
pxor m1, m9 ; unsigned. as long as both signs are equal
pcmpeqw m9, m0, m2 ; it still works, but if the input is -1 the
pxor m2, m9 ; pmulhrsw result will become 0 which causes
pcmpeqw m9, m0, m3 ; pavgw to output -32768 instead of 0 unless
pxor m3, m9 ; we explicitly deal with that case here.
pcmpeqw m0, m4
pxor m4, m0
pavgw m1, m5
pavgw m2, m6
pavgw m3, m7
pavgw m4, m8
punpckldq m0, m1, m2
punpckhdq m1, m2
punpckldq m2, m3, m4
punpckhdq m3, m4
REPX {pmulhrsw x, m5}, m0, m1, m2, m3
jmp tx2q
.pass2:
vpbroadcastd m8, [o(pw_1697x16)]
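The pcmpeqw/pxor/pavgw sequence above computes a rounded signed average with an unsigned instruction. A C model of why it works, assuming 16-bit lanes:

    #include <stdint.h>

    /* pavgw computes (a + b + 1) >> 1 on unsigned 16-bit lanes. When a
     * and b carry the same sign bit, the wrapped unsigned result equals
     * the signed (a + b + 1) >> 1; an input of -1 paired with a 0 result
     * is the one case the asm patches up with pcmpeqw/pxor. */
    static int16_t avg16_same_sign(int16_t a, int16_t b) {
        const uint32_t ua = (uint16_t)a, ub = (uint16_t)b;
        return (int16_t)(uint16_t)((ua + ub + 1) >> 1);
    }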
@ -1311,11 +1318,11 @@ cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
pmulhrsw m6, m8, m1
pmulhrsw m7, m8, m2
pmulhrsw m8, m3
REPX {paddw x, x}, m0, m1, m2, m3
paddw m0, m4
paddw m1, m6
paddw m2, m7
paddw m3, m8
REPX {paddsw x, x}, m0, m1, m2, m3
paddsw m0, m4
paddsw m1, m6
paddsw m2, m7
paddsw m3, m8
jmp m(iadst_4x16_internal).end2

%macro WRITE_8X4 4-7 strideq*1, strideq*2, r3, ; coefs[1-2], tmp[1-2], off[1-3]

@ -1353,7 +1360,7 @@ cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
vpbroadcastd xm3, [o(pw_2048)]
pmulhrsw xm1, xm0
pmulhrsw xm2, xm1
paddw xm1, xm2
paddsw xm1, xm2
pmulhrsw xm1, xm3
punpcklwd xm1, xm1
punpckldq xm0, xm1, xm1

@ -1369,7 +1376,7 @@ cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
vpbroadcastd xm3, [o(pw_2048)]
packusdw xm0, xm1
pmulhrsw xm0, xm2
paddw xm0, xm0
paddsw xm0, xm0
pmulhrsw xm0, xm2
pmulhrsw xm0, xm3
vinserti128 m0, m0, xm0, 1

@ -1447,7 +1454,7 @@ cglobal iadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
punpckhwd m2, m0, m1
punpcklwd m0, m1
pxor m3, m3
psubw m3, m2
psubsw m3, m2
punpckhwd m1, m0, m3
punpcklwd m0, m3
jmp tx2q

@ -1492,7 +1499,7 @@ cglobal iflipadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
punpckhwd m1, m3, m2
punpcklwd m3, m2
pxor m0, m0
psubw m0, m1
psubsw m0, m1
punpckhwd m1, m0, m3
punpcklwd m0, m3
jmp tx2q

@ -1520,15 +1527,15 @@ cglobal iidentity_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
pmulhrsw m2, m3
punpcklwd m0, m1, m2
punpckhwd m1, m2
paddw m0, m0
paddw m1, m1
paddsw m0, m0
paddsw m1, m1
jmp tx2q
.pass2:
vpbroadcastd m3, [o(pw_1697x8)]
pmulhrsw m2, m3, m0
pmulhrsw m3, m1
paddw m0, m2
paddw m1, m3
paddsw m0, m2
paddsw m1, m3
jmp m(iadst_8x4_internal).end

%macro INV_TXFM_8X8_FN 2-3 -1 ; type1, type2, fast_thresh

@ -1796,8 +1803,8 @@ cglobal iidentity_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
pmulhrsw m7, m1
psrlw m1, 3 ; pw_2048
pmulhrsw m2, m7
paddw m7, m7
paddw m7, m2
paddsw m7, m7
paddsw m7, m2
pmulhrsw m7, m1
punpcklwd m5, m7, m7
punpckhwd m7, m7

@ -2120,12 +2127,12 @@ INV_TXFM_8X16_FN identity, identity

%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16394]
pmulhrsw m%2, m%3, m%1
%if %0 == 4 ; if we're going to downshift by 1 doing so here eliminates the paddw
%if %0 == 4 ; if downshifting by 1
pmulhrsw m%2, m%4
%else
paddw m%1, m%1
paddsw m%1, m%1
%endif
paddw m%1, m%2
paddsw m%1, m%2
%endmacro

cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2

@ -2201,7 +2208,7 @@ cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
pmulhrsw xm3, xm0
psrlw xm0, 3 ; pw_2048
pmulhrsw xm1, xm3
paddw xm3, xm1
paddsw xm3, xm1
pmulhrsw xm3, xm0
punpcklwd xm3, xm3
punpckldq xm1, xm3, xm3

@ -2228,7 +2235,7 @@ cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
vpbroadcastd m1, [o(pw_2896x8)]
pmulhrsw m4, m0
pmulhrsw m4, m5
paddw m0, m4
paddsw m0, m4
psrlw m5, 3 ; pw_2048
pmulhrsw m0, m1
pmulhrsw m0, m5

@ -2503,10 +2510,10 @@ cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
pmulhrsw m6, m7, m3
pmulhrsw m7, m4
REPX {pmulhrsw x, m8}, m0, m5, m6, m7
paddw m1, m0
paddw m2, m5
paddw m3, m6
paddw m4, m7
paddsw m1, m0
paddsw m2, m5
paddsw m3, m6
paddsw m4, m7
punpcklqdq m0, m1, m2
punpckhqdq m1, m2
punpcklqdq m2, m3, m4

@ -2518,10 +2525,10 @@ cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
pmulhrsw m5, m7, m1
pmulhrsw m6, m7, m2
pmulhrsw m7, m3
paddw m0, m4
paddw m1, m5
paddw m2, m6
paddw m3, m7
paddsw m0, m4
paddsw m1, m5
paddsw m2, m6
paddsw m3, m7
jmp m(iadst_16x4_internal).end

%macro INV_TXFM_16X8_FN 2-3 -1 ; type1, type2, fast_thresh

@ -2581,7 +2588,7 @@ cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
pmulhrsw m0, m4
pmulhrsw m5, m0
pmulhrsw m5, m2
paddw m0, m5
paddsw m0, m5
psrlw m2, 3 ; pw_2048
pmulhrsw m0, m4
pmulhrsw m0, m2

@ -2903,7 +2910,7 @@ cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
vpbroadcastd m3, [o(pw_2896x8)]
pmulhrsw m3, [cq]
vpbroadcastd m0, [o(pw_8192)]
vpbroadcastd m1, [o(pw_5793x4)]
vpbroadcastd m1, [o(pw_1697x16)]
vpbroadcastw m4, [o(deint_shuf)] ; pb_0_1
pcmpeqb m5, m5
pxor m6, m6

@ -2911,8 +2918,7 @@ cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
paddb m5, m5 ; pb_m2
pmulhrsw m3, m0
psrlw m0, 2 ; pw_2048
psllw m3, 2
pmulhrsw m3, m1
IDTX16 3, 1, 1
pmulhrsw m3, m0
mov r3d, 8
.loop:

@ -2954,17 +2960,15 @@ cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
punpcklwd m1, m3
vpbroadcastd m3, [o(pw_1697x16)]
punpcklwd m2, m4
vpbroadcastd m4, [o(pw_8192)]
vpbroadcastd m4, [o(pw_2896x8)]
punpckldq m1, m2
vpbroadcastd m2, [o(pw_2896x8)]
vpbroadcastd m2, [o(pw_2048)]
punpcklqdq m0, m1
pmulhrsw m3, m0
paddw m0, m0
paddw m0, m3
psraw m3, 1
pavgw m0, m3
pmulhrsw m0, m4
psrlw m4, 2 ; pw_2048
pmulhrsw m0, m2
pmulhrsw m0, m4
mov r3d, 8
jmp m(inv_txfm_add_identity_dct_16x4).end
%endif

@ -3385,6 +3389,12 @@ cglobal iflipadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
WRITE_16X2 7, [rsp+32*2], 0, 1, strideq*2, r3
jmp m(idct_16x16_internal).end3

%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16
pmulhrsw m%2, m%3, m%1
psraw m%2, 1
pavgw m%1, m%2 ; signs are guaranteed to be equal
%endmacro

INV_TXFM_16X16_FN identity, dct, 15
INV_TXFM_16X16_FN identity, identity

@ -3419,22 +3429,17 @@ cglobal iidentity_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
vinserti128 m13, [cq+16*13], 1
mova xm14, [cq-16* 1]
vinserti128 m14, [cq+16*15], 1
REPX {IDTX16 x, 6, 7}, 0, 15, 1, 8, 2, 9, 3, \
REPX {IDTX16B x, 6, 7}, 0, 15, 1, 8, 2, 9, 3, \
10, 4, 11, 5, 12, 13, 14
mova xm6, [cq-16* 4]
vinserti128 m6, [cq+16*12], 1
mova [rsp], m1
IDTX16 6, 1, 7
mova xm1, [cq-16* 2]
vinserti128 m1, [cq+16*14], 1
pmulhrsw m7, m1
paddw m1, m1
paddw m7, m1
vpbroadcastd m1, [o(pw_8192)]
REPX {pmulhrsw x, m1}, m0, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
pmulhrsw m1, [rsp]
mova [rsp], m0
IDTX16B 6, 0, 7
mova xm0, [cq-16* 2]
vinserti128 m0, [cq+16*14], 1
pmulhrsw m7, m0
psraw m7, 1
pavgw m7, m0
jmp m(idct_16x16_internal).pass1_end3
ALIGN function_align
.pass2:

@ -3447,8 +3452,8 @@ ALIGN function_align
IDTX16 0, 1, 15
mova m1, [rsp+32*0]
pmulhrsw m15, m1
paddw m1, m1
paddw m15, m1
paddsw m1, m1
paddsw m15, m1
jmp m(idct_16x16_internal).end

%define o_base iadst4_dconly2a + 128

@ -3963,7 +3968,7 @@ cglobal inv_txfm_add_identity_identity_8x32, 4, 5, 11, dst, stride, c, eob
vinserti128 m6, m6, [cq+16* 9], 1
vinserti128 m7, m7, [cq+16*13], 1
REPX {mova [cq+32*x], m8}, -4, -2, 0, 2, 4, 6
REPX {paddw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {paddsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
call .transpose8x8
REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
WRITE_8X4 0, 4, 8, 10, strideq*8, strideq*4, r4*4

@ -4572,12 +4577,12 @@ ALIGN function_align
IDCT32_PASS1_END 1, 9, 6, 7
ret

cglobal inv_txfm_add_identity_identity_16x32, 4, 5, 12, dst, stride, c, eob
cglobal inv_txfm_add_identity_identity_16x32, 4, 5, 13, dst, stride, c, eob
%undef cmp
lea rax, [o_base]
vpbroadcastd m9, [o(pw_2896x8)]
vpbroadcastd m10, [o(pw_5793x4)]
vpbroadcastd m11, [o(pw_5)]
vpbroadcastd m10, [o(pw_1697x16)]
vpbroadcastd m12, [o(pw_8192)]
cmp eobd, 43 ; if (eob > 43)
setg r4b ; iteration_count++
cmp eobd, 150 ; if (eob > 150)

@ -4586,6 +4591,7 @@ cglobal inv_txfm_add_identity_identity_16x32, 4, 5, 13, dst, stride, c, eob
adc r4b, al ; iteration_count++
lea r3, [strideq*3]
mov rax, cq
paddw m11, m12, m12 ; pw_16384
.loop:
mova xm0, [cq+64* 0]
mova xm1, [cq+64* 1]

@ -4604,11 +4610,9 @@ cglobal inv_txfm_add_identity_identity_16x32, 4, 5, 13, dst, stride, c, eob
vinserti128 m6, m6, [cq+64*14], 1
vinserti128 m7, m7, [cq+64*15], 1
REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
REPX {IDTX16 x, 8, 10, 11}, 0, 1, 2, 3, 4, 5, 6, 7
call m(inv_txfm_add_identity_identity_8x32).transpose8x8
REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {paddw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
WRITE_16X2 2, 3, 0, 1, strideq*2, r3
lea dstq, [dstq+strideq*4]

@ -4622,13 +4626,13 @@ cglobal inv_txfm_add_identity_identity_16x32, 4, 5, 13, dst, stride, c, eob
pxor m0, m0
mov r0d, 8
cmp cq, rax
jg .zero_loop
ja .zero_loop
.zero_loop_half:
mova [rax+64*0], m0
mova [rax+64*1], m0
mova [rax+64*2], m0
mova [rax+64*3], m0
add rax, 64*4
mova [rax-64*2], m0
mova [rax-64*1], m0
sub r0d, 2
jg .zero_loop_half
RET

@ -4646,7 +4650,7 @@ cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 12, dst, stride, c, eob
%undef cmp
lea rax, [o_base]
vpbroadcastd m9, [o(pw_2896x8)]
vpbroadcastd m10, [o(pw_1697x8)]
vpbroadcastd m10, [o(pw_1697x16)]
vpbroadcastd m11, [o(pw_2048)]
cmp eobd, 35 ; if (eob > 35)
setg r4b ; iteration_count++

@ -4674,24 +4678,9 @@ cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 12, dst, stride, c, eob
vinserti128 m6, m6, [cq+32*14], 1
vinserti128 m7, m7, [cq+32*15], 1
REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
REPX {paddsw x, x }, m0, m1, m2, m3, m4, m5, m6, m7
call m(inv_txfm_add_identity_identity_8x32).transpose8x8
pmulhrsw m8, m10, m0
paddw m0, m8
pmulhrsw m8, m10, m1
paddw m1, m8
pmulhrsw m8, m10, m2
paddw m2, m8
pmulhrsw m8, m10, m3
paddw m3, m8
pmulhrsw m8, m10, m4
paddw m4, m8
pmulhrsw m8, m10, m5
paddw m5, m8
pmulhrsw m8, m10, m6
paddw m6, m8
pmulhrsw m8, m10, m7
paddw m7, m8
REPX {IDTX16 x, 8, 10}, 0, 1, 2, 3, 4, 5, 6, 7
REPX {pmulhrsw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
WRITE_16X2 2, 3, 0, 1, strideq*2, r3

@ -4708,20 +4697,17 @@ cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 12, dst, stride, c, eob
lea dstq, [r5+16]
jmp .loop
.ret:
sub cq, 32
sub cd, eax
pxor m0, m0
mov r0d, 4
mov r1d, 8
cmp cq, rax
cmovg r0d, r1d
add cd, 384
.zero_loop:
mova [rax+32*0], m0
mova [rax+32*1], m0
mova [rax+32*2], m0
mova [rax+32*3], m0
add rax, 32*4
dec r0d
jg .zero_loop
sub cd, 128
jge .zero_loop
RET

cglobal inv_txfm_add_dct_dct_32x32, 4, 4, 0, dst, stride, c, eob

@ -4859,7 +4845,7 @@ cglobal inv_txfm_add_dct_dct_32x32, 4, 4, 0, dst, stride, c, eob
call m(inv_txfm_add_dct_dct_16x32).pass2_end
lea tmp3q, [tmp1q-32*32]
cmp tmp2q, tmp3q
jl .ret
jb .ret
sub tmp2q, 32*32
sub dstq, r3
lea r2, [r2+r3+16]
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
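A pattern worth noting across these hunks: signed branches on pointer compares (jg, jl, jge) become their unsigned counterparts (ja, jb, jae). A minimal C sketch of the bug class this avoids (function names hypothetical, not from the source):

    #include <stdint.h>

    /* Hypothetical illustration: buffer-end checks must compare addresses
     * as unsigned. With a signed compare, a pointer with the top bit set
     * orders below a small one, so the branch goes the wrong way. */
    int past_end_signed(const char *p, const char *end) {
        return (int64_t)(uintptr_t)p > (int64_t)(uintptr_t)end; /* like jg */
    }
    int past_end_unsigned(const char *p, const char *end) {
        return (uintptr_t)p > (uintptr_t)end;                   /* like ja */
    }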
@@ -347,7 +347,7 @@ cglobal sgr_box3_h, 8, 11, 8, sumsq, sum, left, src, stride, w, h, edge, x, xlim
    punpckhbw xm0, xm1

    ; when we reach this, xm0 contains left two px in highest words
    cmp xq, -16
    cmp xd, -16
    jle .loop_x
.partial_load_and_extend:
    vpbroadcastb m3, [srcq-1]

@@ -396,17 +396,17 @@ cglobal sgr_box3_h, 8, 11, 8, sumsq, sum, left, src, stride, w, h, edge, x, xlim
    ; else if x < xlimd we extend from previous load (this implies have_right=0)
    ; else we are done

    cmp xq, -16
    cmp xd, -16
    jle .loop_x
    test xq, xq
    test xd, xd
    jl .partial_load_and_extend
    cmp xq, xlimq
    cmp xd, xlimd
    jl .right_extend

    add sumsqq, (384+16)*4
    add sumq, (384+16)*2
    add srcq, strideq
    dec hd
    dec hd
    jg .loop_y
    RET

@@ -418,7 +418,7 @@ cglobal sgr_box3_v, 5, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr,
    shr ylimd, 2
    sub ylimd, 2 ; -2 if have_bottom=0, else 0
.loop_x:
    lea yd, [hd+ylimd+2]
    lea yd, [hq+ylimq+2]
    lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
    lea sum_ptrq, [sumq+xq*2+2-(384+16)*2]
    test edged, 4 ; have_top

@@ -720,9 +720,9 @@ cglobal sgr_box5_h, 8, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xlim
    punpckhbw xm0, xm1

    ; when we reach this, xm0 contains left two px in highest words
    cmp xq, -16
    cmp xd, -16
    jle .loop_x
    test xq, xq
    test xd, xd
    jge .right_extend
.partial_load_and_extend:
    vpbroadcastb m3, [srcq-1]

@@ -781,11 +781,11 @@ cglobal sgr_box5_h, 8, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xlim
    ; else if x < xlimd we extend from previous load (this implies have_right=0)
    ; else we are done

    cmp xq, -16
    cmp xd, -16
    jle .loop_x
    test xq, xq
    test xd, xd
    jl .partial_load_and_extend
    cmp xq, xlimq
    cmp xd, xlimd
    jl .right_extend

    add sumsqq, (384+16)*4

@@ -803,7 +803,7 @@ cglobal sgr_box5_v, 5, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr,
    shr ylimd, 2
    sub ylimd, 3 ; -3 if have_bottom=0, else -1
.loop_x:
    lea yd, [hd+ylimd+2]
    lea yd, [hq+ylimq+2]
    lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
    lea sum_ptrq, [sumq+xq*2+2-(384+16)*2]
    test edged, 4 ; have_top


@@ -725,7 +725,7 @@ cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
    punpckhbw xm0, xm1

    ; when we reach this, m0 contains left two px in highest words
    cmp xq, -8
    cmp xd, -8
    jle .loop_x
.partial_load_and_extend:
    movd m3, [srcq-4]

@@ -1299,9 +1299,9 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
    punpckhbw m0, m1

    ; when we reach this, m0 contains left two px in highest words
    cmp xq, -8
    cmp xd, -8
    jle .loop_x
    test xq, xq
    test xd, xd
    jge .right_extend
.partial_load_and_extend:
    XCHG_PIC_REG

@@ -1394,11 +1394,11 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
    ; else if x < xlimd we extend from previous load (this implies have_right=0)
    ; else we are done

    cmp xq, -8
    cmp xd, -8
    jle .loop_x
    test xq, xq
    test xd, xd
    jl .partial_load_and_extend
    cmp xq, xlimq
    cmp xd, xlimd
    jl .right_extend

    add sumsqq, (384+16)*4

(File diff suppressed because it is too large)
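The "; else if x < xlimd ..." comments describe a three-way row tail; a rough C rendering of that control flow, under the assumption that x counts upward toward 0 in vector-width steps (helper names hypothetical):

    #define VEC 16  /* load width in pixels: 16 for AVX2, 8 for SSSE3 */
    static void load_full(int x), load_partial(int x), extend_right(int x); /* stubs */

    static void box_row_tail(int x, const int xlim) {
        for (;;) {
            if (x <= -VEC)     load_full(x);     /* jle .loop_x                 */
            else if (x < 0)    load_partial(x);  /* jl .partial_load_and_extend */
            else if (x < xlim) extend_right(x);  /* jl .right_extend            */
            else break;                          /* row done                    */
            x += VEC;
        }
    }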
@@ -49,36 +49,52 @@ decl_mc_fn(dav1d_put_8tap_sharp_smooth_ssse3);
decl_mc_fn(dav1d_put_bilin_avx2);
decl_mc_fn(dav1d_put_bilin_ssse3);

decl_mct_fn(dav1d_prep_8tap_regular_avx512icl);
decl_mct_fn(dav1d_prep_8tap_regular_avx2);
decl_mct_fn(dav1d_prep_8tap_regular_ssse3);
decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx512icl);
decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx2);
decl_mct_fn(dav1d_prep_8tap_regular_smooth_ssse3);
decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx512icl);
decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx2);
decl_mct_fn(dav1d_prep_8tap_regular_sharp_ssse3);
decl_mct_fn(dav1d_prep_8tap_smooth_avx512icl);
decl_mct_fn(dav1d_prep_8tap_smooth_avx2);
decl_mct_fn(dav1d_prep_8tap_smooth_ssse3);
decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx512icl);
decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx2);
decl_mct_fn(dav1d_prep_8tap_smooth_regular_ssse3);
decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx512icl);
decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx2);
decl_mct_fn(dav1d_prep_8tap_smooth_sharp_ssse3);
decl_mct_fn(dav1d_prep_8tap_sharp_avx512icl);
decl_mct_fn(dav1d_prep_8tap_sharp_avx2);
decl_mct_fn(dav1d_prep_8tap_sharp_ssse3);
decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx512icl);
decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx2);
decl_mct_fn(dav1d_prep_8tap_sharp_regular_ssse3);
decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx512icl);
decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx2);
decl_mct_fn(dav1d_prep_8tap_sharp_smooth_ssse3);
decl_mct_fn(dav1d_prep_bilin_avx512icl);
decl_mct_fn(dav1d_prep_bilin_avx2);
decl_mct_fn(dav1d_prep_bilin_ssse3);

decl_avg_fn(dav1d_avg_avx512icl);
decl_avg_fn(dav1d_avg_avx2);
decl_avg_fn(dav1d_avg_ssse3);
decl_w_avg_fn(dav1d_w_avg_avx512icl);
decl_w_avg_fn(dav1d_w_avg_avx2);
decl_w_avg_fn(dav1d_w_avg_ssse3);
decl_mask_fn(dav1d_mask_avx512icl);
decl_mask_fn(dav1d_mask_avx2);
decl_mask_fn(dav1d_mask_ssse3);
decl_w_mask_fn(dav1d_w_mask_420_avx512icl);
decl_w_mask_fn(dav1d_w_mask_420_avx2);
decl_w_mask_fn(dav1d_w_mask_420_ssse3);
decl_w_mask_fn(dav1d_w_mask_422_avx512icl);
decl_w_mask_fn(dav1d_w_mask_422_avx2);
decl_w_mask_fn(dav1d_w_mask_444_avx512icl);
decl_w_mask_fn(dav1d_w_mask_444_avx2);
decl_blend_fn(dav1d_blend_avx2);
decl_blend_fn(dav1d_blend_ssse3);

@@ -162,10 +178,11 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
    c->warp8x8t = dav1d_warp_affine_8x8t_sse4;
#endif

#if ARCH_X86_64
    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2))
        return;

#if BITDEPTH == 8 && ARCH_X86_64
#if BITDEPTH == 8
    init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2);
    init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
    init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2);

@@ -202,5 +219,29 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
    c->warp8x8t = dav1d_warp_affine_8x8t_avx2;

    c->emu_edge = dav1d_emu_edge_avx2;
#endif

    if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL))
        return;

#if BITDEPTH == 8
    init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx512icl);
    init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl);
    init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx512icl);
    init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx512icl);
    init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx512icl);
    init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx512icl);
    init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx512icl);
    init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx512icl);
    init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx512icl);
    init_mct_fn(FILTER_2D_BILINEAR, bilin, avx512icl);

    c->avg = dav1d_avg_avx512icl;
    c->w_avg = dav1d_w_avg_avx512icl;
    c->mask = dav1d_mask_avx512icl;
    c->w_mask[0] = dav1d_w_mask_444_avx512icl;
    c->w_mask[1] = dav1d_w_mask_422_avx512icl;
    c->w_mask[2] = dav1d_w_mask_420_avx512icl;
#endif
#endif
}
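The init routine follows dav1d's usual tiered pattern: each CPU-flag check either returns or falls through to the next ISA level, so a later tier overwrites the pointers installed by an earlier one. A condensed sketch under that reading (function name hypothetical, bodies elided):

    /* Sketch only; "..." stands for the init_mc_fn/init_mct_fn blocks above. */
    void mc_dsp_init_x86_sketch(Dav1dMCDSPContext *const c, const unsigned flags) {
        (void)c;
        if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
        /* ... install ssse3 entry points ... */
        if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
        /* ... avx2 entry points overwrite the ssse3 ones ... */
        if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
        /* ... avx512icl entry points win when the CPU supports them ... */
    }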
@@ -1425,7 +1425,7 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
    jmp wq
.h_w2:
%if ARCH_X86_32
    and mxd, 0xff
    and mxd, 0x7f
%else
    movzx mxd, mxb
%endif

@@ -1455,7 +1455,7 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
    RET
.h_w4:
%if ARCH_X86_32
    and mxd, 0xff
    and mxd, 0x7f
%else
    movzx mxd, mxb
%endif

@@ -1564,16 +1564,16 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
%if ARCH_X86_32
    movzx mxd, ssb
    shr ssd, 16
    cmp hd, 4
    cmovle ssd, mxd
    cmp hd, 6
    cmovs ssd, mxd
    lea ssq, [base_reg+ssq*8+subpel_filters-put_ssse3]
%else
%assign stack_offset org_stack_offset
    WIN64_SPILL_XMM 16
    movzx mxd, myb
    shr myd, 16
    cmp hd, 4
    cmovle myd, mxd
    cmp hd, 6
    cmovs myd, mxd
    lea myq, [base_reg+myq*8+subpel_filters-put_ssse3]
%endif
    tzcnt r6d, wd

@@ -1850,14 +1850,18 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
%assign stack_offset org_stack_offset
    cmp wd, 4
    jg .hv_w8
    and mxd, 0xff
%if ARCH_X86_32
    and mxd, 0x7f
%else
    movzx mxd, mxb
%endif
    dec srcq
    movd m1, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
%if ARCH_X86_32
    movzx mxd, ssb
    shr ssd, 16
    cmp hd, 4
    cmovle ssd, mxd
    cmp hd, 6
    cmovs ssd, mxd
    movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
    W32_RESTORE_SSQ
    lea r6, [ssq*3]

@@ -1886,8 +1890,8 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
%else
    movzx mxd, myb
    shr myd, 16
    cmp hd, 4
    cmovle myd, mxd
    cmp hd, 6
    cmovs myd, mxd
    movq m0, [base_reg+myq*8+subpel_filters-put_ssse3]
    ALLOC_STACK mmsize*14, 14
    lea ss3q, [ssq*3]

@@ -2202,8 +2206,8 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
    movq m1, [base_reg+mxq*8+subpel_filters-put_ssse3]
    movzx mxd, ssb
    shr ssd, 16
    cmp hd, 4
    cmovle ssd, mxd
    cmp hd, 6
    cmovs ssd, mxd
    movq m5, [base_reg+ssq*8+subpel_filters-put_ssse3]
    mov ssq, ssmp
    ALLOC_STACK -mmsize*13

@@ -2243,8 +2247,8 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
    movq m0, [base_reg+mxq*8+subpel_filters-put_ssse3]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 4
    cmovle myd, mxd
    cmp hd, 6
    cmovs myd, mxd
    movq m1, [base_reg+myq*8+subpel_filters-put_ssse3]
    pshufd subpelh0, m0, q0000
    pshufd subpelh1, m0, q1111

@@ -2511,7 +2515,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
    jmp wq
.h_w4:
%if ARCH_X86_32
    and mxd, 0xff
    and mxd, 0x7f
%else
    movzx mxd, mxb
%endif

@@ -2635,15 +2639,15 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
.v:
%if ARCH_X86_32
    mov mxd, myd
    and mxd, 0xff
    and mxd, 0x7f
%else
%assign stack_offset org_stack_offset
    WIN64_SPILL_XMM 16
    movzx mxd, myb
%endif
    shr myd, 16
    cmp hd, 4
    cmovle myd, mxd
    cmp hd, 6
    cmovs myd, mxd
    lea myq, [base_reg+myq*8+subpel_filters-prep_ssse3]
    mova m2, [base+pw_512]
    psrlw m2, m2, 1 ; 0x0100

@@ -2849,14 +2853,14 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%assign stack_offset org_stack_offset
    cmp wd, 4
    jg .hv_w8
    and mxd, 0xff
    and mxd, 0x7f
    movd m1, [base_reg+mxq*8+subpel_filters-prep_ssse3+2]
%if ARCH_X86_32
    mov mxd, myd
    and mxd, 0xff
    shr myd, 16
    cmp hd, 4
    cmovle myd, mxd
    and mxd, 0x7f
    cmp hd, 6
    cmovs myd, mxd
    movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
    mov r5, r2; use as new base
%define base_reg r5

@@ -2885,8 +2889,8 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%else
    movzx mxd, myb
    shr myd, 16
    cmp hd, 4
    cmovle myd, mxd
    cmp hd, 6
    cmovs myd, mxd
    movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
    ALLOC_STACK mmsize*14, 14
    lea stride3q, [strideq*3]

@@ -3101,11 +3105,11 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%define accuv0 [rsp+mmsize*11]
%define accuv1 [rsp+mmsize*12]
    movq m1, [base_reg+mxq*8+subpel_filters-prep_ssse3]
    movzx mxd, myw
    and mxd, 0xff
    mov mxd, myd
    shr myd, 16
    cmp hd, 4
    cmovle myd, mxd
    and mxd, 0x7f
    cmp hd, 6
    cmovs myd, mxd
    movq m5, [base_reg+myq*8+subpel_filters-prep_ssse3]
    ALLOC_STACK -mmsize*13
%if STACK_ALIGNMENT < mmsize

@@ -3150,8 +3154,8 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
    movq m0, [base_reg+mxq*8+subpel_filters-prep_ssse3]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 4
    cmovle myd, mxd
    cmp hd, 6
    cmovs myd, mxd
    movq m1, [base_reg+myq*8+subpel_filters-prep_ssse3]
    pshufd subpelh0, m0, q0000
    pshufd subpelh1, m0, q1111

@@ -4743,9 +4747,9 @@ cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \
    xor reg_zero, reg_zero
    lea reg_tmp, [ihq-1]
    cmp yq, ihq
    cmovl reg_tmp, yq
    cmovs reg_tmp, yq
    test yq, yq
    cmovl reg_tmp, reg_zero
    cmovs reg_tmp, reg_zero
%if ARCH_X86_64
    imul reg_tmp, sstrideq
    add srcq, reg_tmp

@@ -4758,9 +4762,9 @@ cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \
    ; ref += iclip(x, 0, iw - 1)
    lea reg_tmp, [iwq-1]
    cmp xq, iwq
    cmovl reg_tmp, xq
    cmovs reg_tmp, xq
    test xq, xq
    cmovl reg_tmp, reg_zero
    cmovs reg_tmp, reg_zero
    add reg_src, reg_tmp
%if ARCH_X86_32
    mov srcm, reg_src

@@ -4773,7 +4777,7 @@ cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \
    lea reg_bottomext, [yq+bhq]
    sub reg_bottomext, ihq
    lea r3, [bhq-1]
    cmovl reg_bottomext, reg_zero
    cmovs reg_bottomext, reg_zero
    ;

    DEFINE_ARGS bw, bh, iw, ih, x, \

@@ -4782,9 +4786,9 @@ cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \

    ; top_ext = iclip(-y, 0, bh - 1)
    neg topextq
    cmovl topextq, reg_zero
    cmovs topextq, reg_zero
    cmp reg_bottomext, bhq
    cmovge reg_bottomext, r3
    cmovns reg_bottomext, r3
    cmp topextq, bhq
    cmovg topextq, r3
%if ARCH_X86_32

@@ -4796,7 +4800,7 @@ cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \
    lea reg_rightext, [xq+bwq]
    sub reg_rightext, iwq
    lea r2, [bwq-1]
    cmovl reg_rightext, reg_zero
    cmovs reg_rightext, reg_zero

    DEFINE_ARGS bw, bh, iw, ih, leftext, \
                topext, dst, dstride, src, sstride, \

@@ -4804,14 +4808,14 @@ cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \

    ; left_ext = iclip(-x, 0, bw - 1)
    neg leftextq
    cmovl leftextq, reg_zero
    cmovs leftextq, reg_zero
    cmp reg_rightext, bwq
    cmovge reg_rightext, r2
    cmovns reg_rightext, r2
%if ARCH_X86_32
    mov r3m, r1
%endif
    cmp leftextq, bwq
    cmovge leftextq, r2
    cmovns leftextq, r2

%undef reg_zero
%undef reg_tmp
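Two scalar idioms recur above: "and mxd, 0x7f" masks the packed filter-index byte down to 7 bits, and the cmp/cmov pairs are branchless selects of the short-block vertical filter. A sketch of the condition-code change (names hypothetical):

    /* Old: h <= 4 (cmp hd, 4; cmovle). New: h - 6 < 0 (cmp hd, 6; cmovs).
     * Both pick the short filter for h <= 4 and the full one for h >= 8,
     * so they should agree for the block heights these paths handle. */
    static int select_vfilter(const int h, const int idx_small, const int idx_full) {
        return h < 6 ? idx_small : idx_full;
    }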
@@ -67,7 +67,7 @@ struc msac
    .update_cdf: resd 1
endstruc

%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
%define m(x, y) mangle(private_prefix %+ _ %+ x %+ y)

SECTION .text

@@ -167,7 +167,7 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6
%endif
    lea t5, [t2+gprsize]
    cmp t5, rcx
    jg .refill_eob
    ja .refill_eob
    mov t2, [t2]
    lea ecx, [t1+23]
    add t1d, 16

@@ -195,7 +195,7 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6
    sub ecx, t1d ; c
.refill_eob_loop:
    cmp t2, t5
    jge .refill_eob_end ; eob reached
    jae .refill_eob_end ; eob reached
    movzx t1d, byte [t2]
    inc t2
    shl t1, cl

@@ -240,7 +240,7 @@ cglobal msac_decode_symbol_adapt8, 0, 6, 6
    pcmpeqw m1, m2
    pmovmskb eax, m1
    test t3d, t3d
    jz m(msac_decode_symbol_adapt4).renorm
    jz m(msac_decode_symbol_adapt4, SUFFIX).renorm
    movzx t3d, word [t1+t4*2]
    pcmpeqw m2, m2
    mov t2d, t3d

@@ -257,7 +257,7 @@ cglobal msac_decode_symbol_adapt8, 0, 6, 6
    paddw m0, m2
    mova [t1], m0
    mov [t1+t4*2], t2w
    jmp m(msac_decode_symbol_adapt4).renorm
    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm

cglobal msac_decode_symbol_adapt16, 0, 6, 6
    DECODE_SYMBOL_ADAPT_INIT

@@ -330,7 +330,7 @@ cglobal msac_decode_symbol_adapt16, 0, 6, 6
%if WIN64
    add rsp, 48
%endif
    jmp m(msac_decode_symbol_adapt4).renorm2
    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm2

cglobal msac_decode_bool_adapt, 0, 6, 0
    movifnidn t1, r1mp

@@ -366,7 +366,7 @@ cglobal msac_decode_bool_adapt, 0, 6, 0
%endif
    not t4
    test t3d, t3d
    jz m(msac_decode_symbol_adapt4).renorm3
    jz m(msac_decode_symbol_adapt4, SUFFIX).renorm3
%if UNIX64 == 0
    push t6
%endif

@@ -390,13 +390,13 @@ cglobal msac_decode_bool_adapt, 0, 6, 0
%if WIN64
    mov t1d, [t7+msac.cnt]
    pop t6
    jmp m(msac_decode_symbol_adapt4).renorm4
    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm4
%else
%if ARCH_X86_64 == 0
    pop t5
    pop t6
%endif
    jmp m(msac_decode_symbol_adapt4).renorm3
    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3
%endif

cglobal msac_decode_bool_equi, 0, 6, 0

@@ -418,7 +418,7 @@ cglobal msac_decode_bool_equi, 0, 6, 0
%if ARCH_X86_64 == 0
    movzx eax, al
%endif
    jmp m(msac_decode_symbol_adapt4).renorm3
    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3

cglobal msac_decode_bool, 0, 6, 0
    movifnidn t0, r0mp

@@ -442,7 +442,7 @@ cglobal msac_decode_bool, 0, 6, 0
%if ARCH_X86_64 == 0
    movzx eax, al
%endif
    jmp m(msac_decode_symbol_adapt4).renorm3
    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3

%macro HI_TOK 1 ; update_cdf
%if ARCH_X86_64 == 0

@@ -598,3 +598,71 @@ cglobal msac_decode_hi_tok, 0, 7 + ARCH_X86_64, 6
    HI_TOK 1
.no_update_cdf:
    HI_TOK 0

%if ARCH_X86_64
INIT_YMM avx2
cglobal msac_decode_symbol_adapt16, 3, 6, 6
    lea rax, [pw_0xff00]
    vpbroadcastw m2, [t0+msac.rng]
    mova m0, [t1]
    vpbroadcastw m3, [t0+msac.dif+6]
    vbroadcasti128 m4, [rax]
    mov t3d, [t0+msac.update_cdf]
    mov t4d, t2d
    not t2
%if STACK_ALIGNMENT < 32
    mov r5, rsp
%if WIN64
    and rsp, ~31
    sub rsp, 40
%else
    and r5, ~31
%define buf r5-32
%endif
%elif WIN64
    sub rsp, 64
%else
%define buf rsp-56
%endif
    psrlw m1, m0, 6
    movd [buf-4], xm2
    pand m2, m4
    psllw m1, 7
    pmulhuw m1, m2
    paddw m1, [rax+t2*2]
    mova [buf], m1
    pmaxuw m1, m3
    pcmpeqw m1, m3
    pmovmskb eax, m1
    test t3d, t3d
    jz .renorm
    movzx t3d, word [t1+t4*2]
    pcmpeqw m2, m2
    lea t2d, [t3+80]
    shr t2d, 4
    cmp t3d, 32
    adc t3d, 0
    movd xm3, t2d
    pavgw m2, m1
    psubw m2, m0
    psubw m0, m1
    psraw m2, xm3
    paddw m0, m2
    mova [t1], m0
    mov [t1+t4*2], t3w
.renorm:
    tzcnt eax, eax
    mov t4, [t0+msac.dif]
    movzx t1d, word [buf+rax-0]
    movzx t2d, word [buf+rax-2]
    shr eax, 1
%if WIN64
%if STACK_ALIGNMENT < 32
    mov rsp, r5
%else
    add rsp, 64
%endif
%endif
    vzeroupper
    jmp m(msac_decode_symbol_adapt4, _sse2).renorm2
%endif
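In the new AVX2 path, "lea t2d, [t3+80] / shr t2d, 4" computes the adaptation rate ((count >> 4) + 5) and "cmp t3d, 32 / adc t3d, 0" bumps the count while saturating at 32. A rough scalar rendering of the rule the pavgw/psubw/psraw sequence vectorizes, following the AV1 CDF-adaptation scheme (my reconstruction, not copied from the source):

    #include <stdint.h>

    static void update_cdf_sketch(uint16_t *const cdf, const unsigned val,
                                  const unsigned n_symbols) {
        const unsigned count = cdf[n_symbols];
        const unsigned rate = (count >> 4) + 5;     /* lea t2d, [t3+80]; shr t2d, 4 */
        for (unsigned i = 0; i < n_symbols - 1; i++)
            if (i < val)
                cdf[i] += (32768 - cdf[i]) >> rate; /* raise symbols below val */
            else
                cdf[i] -= cdf[i] >> rate;           /* lower the rest */
        cdf[n_symbols] = count + (count < 32);      /* cmp t3d, 32; adc t3d, 0 */
    }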
@@ -39,10 +39,13 @@ unsigned dav1d_msac_decode_bool_equi_sse2(MsacContext *s);
unsigned dav1d_msac_decode_bool_sse2(MsacContext *s, unsigned f);
unsigned dav1d_msac_decode_hi_tok_sse2(MsacContext *s, uint16_t *cdf);

/* Needed for checkasm */
unsigned dav1d_msac_decode_symbol_adapt16_avx2(MsacContext *s, uint16_t *cdf,
                                               size_t n_symbols);

#if ARCH_X86_64 || defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_sse2
#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_sse2
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2
#define dav1d_msac_decode_hi_tok dav1d_msac_decode_hi_tok_sse2
#endif

@@ -50,4 +53,12 @@ unsigned dav1d_msac_decode_hi_tok_sse2(MsacContext *s, uint16_t *cdf);
#define dav1d_msac_decode_bool_equi dav1d_msac_decode_bool_equi_sse2
#define dav1d_msac_decode_bool dav1d_msac_decode_bool_sse2

#if ARCH_X86_64
#define dav1d_msac_decode_symbol_adapt16(ctx, cdf, symb) ((ctx)->symbol_adapt16(ctx, cdf, symb))
#elif defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2
#endif

void dav1d_msac_init_x86(MsacContext *const s);

#endif /* DAV1D_SRC_X86_MSAC_H */
@@ -0,0 +1,42 @@
/*
 * Copyright © 2020, VideoLAN and dav1d authors
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/cpu.h"
#include "src/msac.h"
#include "src/x86/msac.h"

void dav1d_msac_init_x86(MsacContext *const s) {
    const unsigned flags = dav1d_get_cpu_flags();

    if (flags & DAV1D_X86_CPU_FLAG_SSE2) {
        s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2;
    }

    if (flags & DAV1D_X86_CPU_FLAG_AVX2) {
        s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2;
    }
}
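Because the SSE2-vs-AVX2 choice for symbol_adapt16 is only known at runtime, the context carries a function pointer and the public name becomes a macro that calls through it (see msac.h above). A hedged sketch of how a caller sees this, assuming the generic msac init has already invoked the x86 hook:

    unsigned decode_one(MsacContext *const s, uint16_t *const cdf,
                        const size_t n_symbols) {
        /* on x86-64 this expands to s->symbol_adapt16(s, cdf, n_symbols) */
        return dav1d_msac_decode_symbol_adapt16(s, cdf, n_symbols);
    }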
@@ -35,72 +35,82 @@
#include "src/levels.h"
#include "src/cdef.h"

static void init_tmp(pixel *buf, int n, const int bitdepth_max) {
    while (n--)
        *buf++ = rnd() & bitdepth_max;
static int to_binary(int x) { /* 0-15 -> 0000-1111 */
    return (x & 1) + 5 * (x & 2) + 25 * (x & 4) + 125 * (x & 8);
}

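to_binary() renders a 4-bit edge mask as its binary digits in decimal, so the %04d in the failure message below prints e.g. edges = 5 as 0101. A worked check (my arithmetic, not from the source): for x = 13 = 0b1101,

    (13 & 1) + 5 * (13 & 2) + 25 * (13 & 4) + 125 * (13 & 8)
        = 1 + 5*0 + 25*4 + 125*8 = 1 + 0 + 100 + 1000 = 1101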
static void check_cdef_filter(const cdef_fn fn, const int w, const int h,
                              const char *const name)
{
    ALIGN_STK_32(pixel, src, 10 * 16 + 8, );
    ALIGN_STK_32(pixel, c_src, 10 * 16 + 8, ), *const c_src_ptr = c_src + 8;
    ALIGN_STK_32(pixel, a_src, 10 * 16 + 8, ), *const a_src_ptr = a_src + 8;
    ALIGN_STK_32(pixel, top, 16 * 2 + 8, ), *const top_ptr = top + 8;
    pixel left[8][2];
static void init_tmp(pixel *buf, int n, const int bitdepth_max) {
    const int fill_type = rnd() & 7;
    if (fill_type == 0)
        while (n--) /* check for cdef_filter underflows */
            *buf++ = rnd() & 1;
    else if (fill_type == 1)
        while (n--) /* check for cdef_filter overflows */
            *buf++ = bitdepth_max - (rnd() & 1);
    else
        while (n--)
            *buf++ = rnd() & bitdepth_max;
}

static void check_cdef_filter(const cdef_fn fn, const int w, const int h) {
    ALIGN_STK_64(pixel, c_src, 16 * 10 + 16, ), *const c_dst = c_src + 8;
    ALIGN_STK_64(pixel, a_src, 16 * 10 + 16, ), *const a_dst = a_src + 8;
    ALIGN_STK_64(pixel, top_buf, 16 * 2 + 16, ), *const top = top_buf + 8;
    ALIGN_STK_16(pixel, left, 8,[2]);
    const ptrdiff_t stride = 16 * sizeof(pixel);

    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel (*left)[2],
                 pixel *const top[2], int pri_strength, int sec_strength,
                 const pixel *top, int pri_strength, int sec_strength,
                 int dir, int damping, enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX);

    if (check_func(fn, "%s_%dbpc", name, BITDEPTH)) {
    if (check_func(fn, "cdef_filter_%dx%d_%dbpc", w, h, BITDEPTH)) {
        for (int dir = 0; dir < 8; dir++) {
            for (enum CdefEdgeFlags edges = 0; edges <= 0xf; edges++) {
            for (enum CdefEdgeFlags edges = 0x0; edges <= 0xf; edges++) {
#if BITDEPTH == 16
                const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#else
                const int bitdepth_max = 0xff;
#endif
                const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
                init_tmp(src, 10 * 16 + 8, bitdepth_max);
                init_tmp(top, 16 * 2 + 8, bitdepth_max);
                init_tmp((pixel *) left,8 * 2, bitdepth_max);

                memcpy(a_src, src, (10 * 16 + 8) * sizeof(pixel));
                memcpy(c_src, src, (10 * 16 + 8) * sizeof(pixel));
                init_tmp(c_src, 16 * 10 + 16, bitdepth_max);
                init_tmp(top_buf, 16 * 2 + 16, bitdepth_max);
                init_tmp((pixel *) left, 8 * 2, bitdepth_max);
                memcpy(a_src, c_src, (16 * 10 + 16) * sizeof(pixel));

                const int lvl = 1 + (rnd() % 62);
                const int damping = 3 + (rnd() & 3) + bitdepth_min_8 - (w == 4 || (rnd() & 1));
                const int pri_strength = (lvl >> 2) << bitdepth_min_8;
                int pri_strength = (lvl >> 2) << bitdepth_min_8;
                int sec_strength = lvl & 3;
                sec_strength += sec_strength == 3;
                sec_strength <<= bitdepth_min_8;
                call_ref(c_src_ptr, 16 * sizeof(pixel), left,
                         (pixel *[2]) { top_ptr, top_ptr + 16 },
                         pri_strength, sec_strength, dir, damping, edges
                         HIGHBD_TAIL_SUFFIX);
                call_new(a_src_ptr, 16 * sizeof(pixel), left,
                         (pixel *[2]) { top_ptr, top_ptr + 16 },
                         pri_strength, sec_strength, dir, damping, edges
                         HIGHBD_TAIL_SUFFIX);
                checkasm_check_pixel(c_src, 16 * sizeof(pixel),
                                     a_src, 16 * sizeof(pixel),
                                     16, 10, "src");
                checkasm_check_pixel(c_src + 16 * 10, 16 * sizeof(pixel),
                                     a_src + 16 * 10, 16 * sizeof(pixel),
                                     8, 1, "src last row");
                bench_new(a_src_ptr, 16 * sizeof(pixel), left,
                          (pixel *[2]) { top_ptr, top_ptr + 16 },
                          pri_strength, sec_strength, dir, damping, edges
                          HIGHBD_TAIL_SUFFIX);
                call_ref(c_dst, stride, left, top, pri_strength, sec_strength,
                         dir, damping, edges HIGHBD_TAIL_SUFFIX);
                call_new(a_dst, stride, left, top, pri_strength, sec_strength,
                         dir, damping, edges HIGHBD_TAIL_SUFFIX);
                if (checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst")) {
                    fprintf(stderr, "strength = %d:%d, dir = %d, damping = %d, edges = %04d\n",
                            pri_strength, sec_strength, dir, damping, to_binary(edges));
                    return;
                }
                if (dir == 7 && (edges == 0x5 || edges == 0xa || edges == 0xf)) {
                    /* Benchmark a fixed set of cases to get consistent results:
                     * 1) top/left edges and pri_strength only
                     * 2) bottom/right edges and sec_strength only
                     * 3) all edges and both pri_strength and sec_strength
                     */
                    pri_strength = (edges & 1) << bitdepth_min_8;
                    sec_strength = (edges & 2) << bitdepth_min_8;
                    bench_new(a_dst, stride, left, top, pri_strength, sec_strength,
                              dir, damping, edges HIGHBD_TAIL_SUFFIX);
                }
            }
        }
    }
    report(name);
}

static void check_cdef_direction(const cdef_dir_fn fn) {
    ALIGN_STK_32(pixel, src, 8 * 8,);
    ALIGN_STK_64(pixel, src, 8 * 8,);

    declare_func(int, pixel *src, ptrdiff_t dst_stride, unsigned *var
                 HIGHBD_DECL_SUFFIX);

@@ -129,11 +139,12 @@ static void check_cdef_direction(const cdef_dir_fn fn) {

void bitfn(checkasm_check_cdef)(void) {
    Dav1dCdefDSPContext c;

    bitfn(dav1d_cdef_dsp_init)(&c);

    check_cdef_direction(c.dir);
    check_cdef_filter(c.fb[0], 8, 8, "cdef_filter_8x8");
    check_cdef_filter(c.fb[1], 4, 8, "cdef_filter_4x8");
    check_cdef_filter(c.fb[2], 4, 4, "cdef_filter_4x4");

    check_cdef_filter(c.fb[0], 8, 8);
    check_cdef_filter(c.fb[1], 4, 8);
    check_cdef_filter(c.fb[2], 4, 4);
    report("cdef_filter");
}
@@ -98,19 +98,15 @@ static const struct {
    unsigned flag;
} cpus[] = {
#if ARCH_X86
    { "SSE",               "sse",       DAV1D_X86_CPU_FLAG_SSE },
    { "SSE2",              "sse2",      DAV1D_X86_CPU_FLAG_SSE2 },
    { "SSE3",              "sse3",      DAV1D_X86_CPU_FLAG_SSE3 },
    { "SSSE3",             "ssse3",     DAV1D_X86_CPU_FLAG_SSSE3 },
    { "SSE4.1",            "sse4",      DAV1D_X86_CPU_FLAG_SSE41 },
    { "SSE4.2",            "sse42",     DAV1D_X86_CPU_FLAG_SSE42 },
    { "AVX",               "avx",       DAV1D_X86_CPU_FLAG_AVX },
    { "AVX2",              "avx2",      DAV1D_X86_CPU_FLAG_AVX2 },
    { "AVX-512",           "avx512",    DAV1D_X86_CPU_FLAG_AVX512 },
    { "SSE2",               "sse2",      DAV1D_X86_CPU_FLAG_SSE2 },
    { "SSSE3",              "ssse3",     DAV1D_X86_CPU_FLAG_SSSE3 },
    { "SSE4.1",             "sse4",      DAV1D_X86_CPU_FLAG_SSE41 },
    { "AVX2",               "avx2",      DAV1D_X86_CPU_FLAG_AVX2 },
    { "AVX-512 (Ice Lake)", "avx512icl", DAV1D_X86_CPU_FLAG_AVX512ICL },
#elif ARCH_AARCH64 || ARCH_ARM
    { "NEON", "neon", DAV1D_ARM_CPU_FLAG_NEON },
    { "NEON", "neon", DAV1D_ARM_CPU_FLAG_NEON },
#elif ARCH_PPC64LE
    { "VSX", "vsx", DAV1D_PPC_CPU_FLAG_VSX },
    { "VSX", "vsx", DAV1D_PPC_CPU_FLAG_VSX },
#endif
    { 0 }
};

@@ -150,6 +146,9 @@ static struct {
    int bench_c;
    int verbose;
    int function_listing;
#if ARCH_X86_64
    void (*simd_warmup)(void);
#endif
} state;

/* float compare support code */

@@ -569,13 +568,26 @@ int main(int argc, char *argv[]) {

    fprintf(stderr, "checkasm: using random seed %u\n", state.seed);

    dav1d_init_cpu();
#if ARCH_X86_64
    void checkasm_warmup_avx2(void);
    void checkasm_warmup_avx512(void);
    unsigned cpu_flags = dav1d_get_cpu_flags();
    if (cpu_flags & DAV1D_X86_CPU_FLAG_AVX512ICL)
        state.simd_warmup = checkasm_warmup_avx512;
    else if (cpu_flags & DAV1D_X86_CPU_FLAG_AVX2)
        state.simd_warmup = checkasm_warmup_avx2;
    else
        state.simd_warmup = NULL;
    checkasm_simd_warmup();
#endif
    check_cpu_flag(NULL, 0);

    if (state.function_listing) {
        print_functions(state.funcs);
    } else {
        for (int i = 0; cpus[i].flag; i++)
            check_cpu_flag(cpus[i].name, cpus[i].flag);

        if (!state.num_checked) {
            fprintf(stderr, "checkasm: no tests to perform\n");
        } else if (state.num_failed) {

@@ -774,3 +786,11 @@ DEF_CHECKASM_CHECK_FUNC(uint8_t,  "%02x")
DEF_CHECKASM_CHECK_FUNC(uint16_t, "%04x")
DEF_CHECKASM_CHECK_FUNC(int16_t,  "%6d")
DEF_CHECKASM_CHECK_FUNC(int32_t,  "%9d")

#if ARCH_X86_64
void checkasm_simd_warmup(void)
{
    if (state.simd_warmup)
        state.simd_warmup();
}
#endif
@@ -193,12 +193,20 @@ void checkasm_checked_call(void *func, ...);
 * not guaranteed and false negatives are theoretically possible, but there
 * can never be any false positives. */
void checkasm_stack_clobber(uint64_t clobber, ...);
/* YMM and ZMM registers on x86 are turned off to save power when they haven't
 * been used for some period of time. When they are used there will be a
 * "warmup" period during which performance will be reduced and inconsistent,
 * which is problematic when trying to benchmark individual functions. We can
 * work around this by periodically issuing "dummy" instructions that use
 * those registers to keep them powered on. */
void checkasm_simd_warmup(void);
#define declare_new(ret, ...)\
    ret (*checked_call)(void *, int, int, int, int, int, __VA_ARGS__) =\
    (void *)checkasm_checked_call;
#define CLOB (UINT64_C(0xdeadbeefdeadbeef))
#define call_new(...)\
    (checkasm_set_signal_handler_state(1),\
     checkasm_simd_warmup(),\
     checkasm_stack_clobber(CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
                            CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
                            CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB),\
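dav1d's real warmup routines are the asm functions declared in checkasm.c above. A hypothetical C-intrinsics equivalent, purely to illustrate the idea of keeping the wide register file powered before a timed call:

    #include <immintrin.h>

    /* Hypothetical sketch, not dav1d's implementation. */
    void warmup_avx2_sketch(void) {
        volatile __m256i x = _mm256_set1_epi32(1);
        for (int i = 0; i < 64; i++)
            x = _mm256_add_epi32(x, x); /* keep the 256-bit units busy */
    }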
@@ -49,29 +49,29 @@ static void check_gen_grny(const Dav1dFilmGrainDSPContext *const dsp) {

    for (int i = 0; i < 4; i++) {
        if (check_func(dsp->generate_grain_y, "gen_grain_y_ar%d_%dbpc", i, BITDEPTH)) {
            Dav1dFilmGrainData fg_data;
            fg_data.seed = rnd() & 0xFFFF;
            ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 1,);
            fg_data[0].seed = rnd() & 0xFFFF;

#if BITDEPTH == 16
            const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#endif

            fg_data.grain_scale_shift = rnd() & 3;
            fg_data.ar_coeff_shift = (rnd() & 3) + 6;
            fg_data.ar_coeff_lag = i;
            const int num_y_pos = 2 * fg_data.ar_coeff_lag * (fg_data.ar_coeff_lag + 1);
            fg_data[0].grain_scale_shift = rnd() & 3;
            fg_data[0].ar_coeff_shift = (rnd() & 3) + 6;
            fg_data[0].ar_coeff_lag = i;
            const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1);
            for (int n = 0; n < num_y_pos; n++)
                fg_data.ar_coeffs_y[n] = (rnd() & 0xff) - 128;
                fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128;

            call_ref(grain_lut_c, &fg_data HIGHBD_TAIL_SUFFIX);
            call_new(grain_lut_a, &fg_data HIGHBD_TAIL_SUFFIX);
            call_ref(grain_lut_c, fg_data HIGHBD_TAIL_SUFFIX);
            call_new(grain_lut_a, fg_data HIGHBD_TAIL_SUFFIX);
            if (memcmp(grain_lut_c, grain_lut_a,
                       GRAIN_WIDTH * GRAIN_HEIGHT * sizeof(entry)))
            {
                fail();
            }

            bench_new(grain_lut_a, &fg_data HIGHBD_TAIL_SUFFIX);
            bench_new(grain_lut_a, fg_data HIGHBD_TAIL_SUFFIX);
        }
    }

@@ -97,38 +97,38 @@ static void check_gen_grnuv(const Dav1dFilmGrainDSPContext *const dsp) {
                           "gen_grain_uv_ar%d_%dbpc_%s",
                           i, BITDEPTH, ss_name[layout_idx]))
            {
                Dav1dFilmGrainData fg_data;
                fg_data.seed = rnd() & 0xFFFF;
                ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 1,);
                fg_data[0].seed = rnd() & 0xFFFF;

#if BITDEPTH == 16
                const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#endif

                fg_data.num_y_points = rnd() & 1;
                fg_data.grain_scale_shift = rnd() & 3;
                fg_data.ar_coeff_shift = (rnd() & 3) + 6;
                fg_data.ar_coeff_lag = i;
                const int num_y_pos = 2 * fg_data.ar_coeff_lag * (fg_data.ar_coeff_lag + 1);
                fg_data[0].num_y_points = rnd() & 1;
                fg_data[0].grain_scale_shift = rnd() & 3;
                fg_data[0].ar_coeff_shift = (rnd() & 3) + 6;
                fg_data[0].ar_coeff_lag = i;
                const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1);
                for (int n = 0; n < num_y_pos; n++)
                    fg_data.ar_coeffs_y[n] = (rnd() & 0xff) - 128;
                dsp->generate_grain_y(grain_lut_y, &fg_data HIGHBD_TAIL_SUFFIX);
                    fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128;
                dsp->generate_grain_y(grain_lut_y, fg_data HIGHBD_TAIL_SUFFIX);

                const int uv = rnd() & 1;
                const int num_uv_pos = num_y_pos + !!fg_data.num_y_points;
                const int num_uv_pos = num_y_pos + !!fg_data[0].num_y_points;
                for (int n = 0; n < num_uv_pos; n++)
                    fg_data.ar_coeffs_uv[uv][n] = (rnd() & 0xff) - 128;
                if (!fg_data.num_y_points)
                    fg_data.ar_coeffs_uv[uv][num_uv_pos] = 0;
                    fg_data[0].ar_coeffs_uv[uv][n] = (rnd() & 0xff) - 128;
                if (!fg_data[0].num_y_points)
                    fg_data[0].ar_coeffs_uv[uv][num_uv_pos] = 0;
                memset(grain_lut_c, 0xff, sizeof(grain_lut_c));
                memset(grain_lut_a, 0xff, sizeof(grain_lut_a));
                call_ref(grain_lut_c, grain_lut_y, &fg_data, uv HIGHBD_TAIL_SUFFIX);
                call_new(grain_lut_a, grain_lut_y, &fg_data, uv HIGHBD_TAIL_SUFFIX);
                call_ref(grain_lut_c, grain_lut_y, fg_data, uv HIGHBD_TAIL_SUFFIX);
                call_new(grain_lut_a, grain_lut_y, fg_data, uv HIGHBD_TAIL_SUFFIX);
                int diff = 0, w = ss_x ? 44 : GRAIN_WIDTH;
                for (int y = 0; y < (ss_y ? 38 : GRAIN_HEIGHT); y++)
                    diff |= memcmp(grain_lut_a[y], grain_lut_c[y], w * sizeof(entry));
                if (diff) fail();

                bench_new(grain_lut_a, grain_lut_y, &fg_data, uv HIGHBD_TAIL_SUFFIX);
                bench_new(grain_lut_a, grain_lut_y, fg_data, uv HIGHBD_TAIL_SUFFIX);
            }
        }
    }

@@ -137,9 +137,9 @@ static void check_gen_grnuv(const Dav1dFilmGrainDSPContext *const dsp) {
}

static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
    ALIGN_STK_32(pixel, c_dst, 128 * 32,);
    ALIGN_STK_32(pixel, a_dst, 128 * 32,);
    ALIGN_STK_32(pixel, src, 128 * 32,);
    ALIGN_STK_64(pixel, c_dst, 128 * 32,);
    ALIGN_STK_64(pixel, a_dst, 128 * 32,);
    ALIGN_STK_64(pixel, src, 128 * 32,);
    const ptrdiff_t stride = 128 * sizeof(pixel);

    declare_func(void, pixel *dst_row, const pixel *src_row, ptrdiff_t stride,

@@ -149,8 +149,8 @@ static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
                 int bh, int row_num HIGHBD_DECL_SUFFIX);

    if (check_func(dsp->fgy_32x32xn, "fgy_32x32xn_%dbpc", BITDEPTH)) {
        Dav1dFilmGrainData fg_data;
        fg_data.seed = rnd() & 0xFFFF;
        ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 16,);
        fg_data[0].seed = rnd() & 0xFFFF;

#if BITDEPTH == 16
        const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;

@@ -160,23 +160,23 @@ static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {

        uint8_t scaling[SCALING_SIZE];
        entry grain_lut[GRAIN_HEIGHT + 1][GRAIN_WIDTH];
        fg_data.grain_scale_shift = rnd() & 3;
        fg_data.ar_coeff_shift = (rnd() & 3) + 6;
        fg_data.ar_coeff_lag = rnd() & 3;
        const int num_y_pos = 2 * fg_data.ar_coeff_lag * (fg_data.ar_coeff_lag + 1);
        fg_data[0].grain_scale_shift = rnd() & 3;
        fg_data[0].ar_coeff_shift = (rnd() & 3) + 6;
        fg_data[0].ar_coeff_lag = rnd() & 3;
        const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1);
        for (int n = 0; n < num_y_pos; n++)
            fg_data.ar_coeffs_y[n] = (rnd() & 0xff) - 128;
        dsp->generate_grain_y(grain_lut, &fg_data HIGHBD_TAIL_SUFFIX);
            fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128;
        dsp->generate_grain_y(grain_lut, fg_data HIGHBD_TAIL_SUFFIX);

        fg_data.num_y_points = 2 + (rnd() % 13);
        const int pad = 0xff / fg_data.num_y_points;
        for (int n = 0; n < fg_data.num_y_points; n++) {
            fg_data.y_points[n][0] = 0xff * n / fg_data.num_y_points;
            fg_data.y_points[n][0] += rnd() % pad;
            fg_data.y_points[n][1] = rnd() & 0xff;
        fg_data[0].num_y_points = 2 + (rnd() % 13);
        const int pad = 0xff / fg_data[0].num_y_points;
        for (int n = 0; n < fg_data[0].num_y_points; n++) {
            fg_data[0].y_points[n][0] = 0xff * n / fg_data[0].num_y_points;
            fg_data[0].y_points[n][0] += rnd() % pad;
            fg_data[0].y_points[n][1] = rnd() & 0xff;
        }
        generate_scaling(bitdepth_from_max(bitdepth_max), fg_data.y_points,
                         fg_data.num_y_points, scaling);
        generate_scaling(bitdepth_from_max(bitdepth_max), fg_data[0].y_points,
                         fg_data[0].num_y_points, scaling);

        const int w = 1 + (rnd() & 127);
        const int h = 1 + (rnd() & 31);

@@ -186,20 +186,20 @@ static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
                src[y * PXSTRIDE(stride) + x] = rnd() & bitdepth_max;
        const int row_num = rnd() & 1 ? rnd() & 0x7ff : 0;

        fg_data.clip_to_restricted_range = rnd() & 1;
        fg_data.scaling_shift = (rnd() & 3) + 8;
        for (fg_data.overlap_flag = 0; fg_data.overlap_flag <= 1;
             fg_data.overlap_flag++)
        fg_data[0].clip_to_restricted_range = rnd() & 1;
        fg_data[0].scaling_shift = (rnd() & 3) + 8;
        for (fg_data[0].overlap_flag = 0; fg_data[0].overlap_flag <= 1;
             fg_data[0].overlap_flag++)
        {
            call_ref(c_dst, src, stride, &fg_data, w, scaling, grain_lut, h,
            call_ref(c_dst, src, stride, fg_data, w, scaling, grain_lut, h,
                     row_num HIGHBD_TAIL_SUFFIX);
            call_new(a_dst, src, stride, &fg_data, w, scaling, grain_lut, h,
            call_new(a_dst, src, stride, fg_data, w, scaling, grain_lut, h,
                     row_num HIGHBD_TAIL_SUFFIX);

            checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst");
        }
        fg_data.overlap_flag = 1;
        bench_new(a_dst, src, stride, &fg_data, 64, scaling, grain_lut, 32,
        fg_data[0].overlap_flag = 1;
        bench_new(a_dst, src, stride, fg_data, 64, scaling, grain_lut, 32,
                  row_num HIGHBD_TAIL_SUFFIX);
    }

@@ -207,10 +207,10 @@ static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
}

static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
    ALIGN_STK_32(pixel, c_dst, 128 * 32,);
    ALIGN_STK_32(pixel, a_dst, 128 * 32,);
    ALIGN_STK_32(pixel, src, 128 * 32,);
    ALIGN_STK_32(pixel, luma_src, 128 * 32,);
    ALIGN_STK_64(pixel, c_dst, 128 * 32,);
    ALIGN_STK_64(pixel, a_dst, 128 * 32,);
    ALIGN_STK_64(pixel, src, 128 * 32,);
    ALIGN_STK_64(pixel, luma_src, 128 * 32,);
    const ptrdiff_t lstride = 128 * sizeof(pixel);

    declare_func(void, pixel *dst_row, const pixel *src_row, ptrdiff_t stride,

@@ -231,9 +231,9 @@ static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
                           "fguv_32x32xn_%dbpc_%s_csfl%d",
                           BITDEPTH, ss_name[layout_idx], csfl))
            {
                Dav1dFilmGrainData fg_data;
                ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 1,);

                fg_data.seed = rnd() & 0xFFFF;
                fg_data[0].seed = rnd() & 0xFFFF;

#if BITDEPTH == 16
                const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;

@@ -245,15 +245,18 @@ static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {

                uint8_t scaling[SCALING_SIZE];
                entry grain_lut[2][GRAIN_HEIGHT + 1][GRAIN_WIDTH];
                fg_data.grain_scale_shift = rnd() & 3;
                fg_data.ar_coeff_shift = (rnd() & 3) + 6;
                fg_data.ar_coeff_lag = rnd() & 3;
                const int num_y_pos = 2 * fg_data.ar_coeff_lag * (fg_data.ar_coeff_lag + 1);
                fg_data[0].grain_scale_shift = rnd() & 3;
                fg_data[0].ar_coeff_shift = (rnd() & 3) + 6;
                fg_data[0].ar_coeff_lag = rnd() & 3;
                const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1);
                for (int n = 0; n < num_y_pos; n++)
                    fg_data.ar_coeffs_y[n] = (rnd() & 0xff) - 128;
                dsp->generate_grain_y(grain_lut[0], &fg_data HIGHBD_TAIL_SUFFIX);
                    fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128;
                const int num_uv_pos = num_y_pos + 1;
                for (int n = 0; n < num_uv_pos; n++)
                    fg_data[0].ar_coeffs_uv[uv_pl][n] = (rnd() & 0xff) - 128;
                dsp->generate_grain_y(grain_lut[0], fg_data HIGHBD_TAIL_SUFFIX);
                dsp->generate_grain_uv[layout_idx](grain_lut[1], grain_lut[0],
                                                   &fg_data, uv_pl HIGHBD_TAIL_SUFFIX);
                                                   fg_data, uv_pl HIGHBD_TAIL_SUFFIX);

                const int w = 1 + (rnd() & (127 >> ss_x));
                const int h = 1 + (rnd() & (31 >> ss_y));

@@ -268,47 +271,47 @@ static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
                const int row_num = rnd() & 1 ? rnd() & 0x7ff : 0;

                if (csfl) {
                    fg_data.num_y_points = 2 + (rnd() % 13);
                    const int pad = 0xff / fg_data.num_y_points;
                    for (int n = 0; n < fg_data.num_y_points; n++) {
                        fg_data.y_points[n][0] = 0xff * n / fg_data.num_y_points;
                        fg_data.y_points[n][0] += rnd() % pad;
                        fg_data.y_points[n][1] = rnd() & 0xff;
                    fg_data[0].num_y_points = 2 + (rnd() % 13);
                    const int pad = 0xff / fg_data[0].num_y_points;
                    for (int n = 0; n < fg_data[0].num_y_points; n++) {
                        fg_data[0].y_points[n][0] = 0xff * n / fg_data[0].num_y_points;
                        fg_data[0].y_points[n][0] += rnd() % pad;
                        fg_data[0].y_points[n][1] = rnd() & 0xff;
                    }
                    generate_scaling(bitdepth_from_max(bitdepth_max), fg_data.y_points,
                                     fg_data.num_y_points, scaling);
                    generate_scaling(bitdepth_from_max(bitdepth_max), fg_data[0].y_points,
                                     fg_data[0].num_y_points, scaling);
                } else {
                    fg_data.num_uv_points[uv_pl] = 2 + (rnd() % 9);
                    const int pad = 0xff / fg_data.num_uv_points[uv_pl];
                    for (int n = 0; n < fg_data.num_uv_points[uv_pl]; n++) {
                        fg_data.uv_points[uv_pl][n][0] = 0xff * n / fg_data.num_uv_points[uv_pl];
                        fg_data.uv_points[uv_pl][n][0] += rnd() % pad;
                        fg_data.uv_points[uv_pl][n][1] = rnd() & 0xff;
                    fg_data[0].num_uv_points[uv_pl] = 2 + (rnd() % 9);
                    const int pad = 0xff / fg_data[0].num_uv_points[uv_pl];
                    for (int n = 0; n < fg_data[0].num_uv_points[uv_pl]; n++) {
                        fg_data[0].uv_points[uv_pl][n][0] = 0xff * n / fg_data[0].num_uv_points[uv_pl];
                        fg_data[0].uv_points[uv_pl][n][0] += rnd() % pad;
                        fg_data[0].uv_points[uv_pl][n][1] = rnd() & 0xff;
                    }
                    generate_scaling(bitdepth_from_max(bitdepth_max), fg_data.uv_points[uv_pl],
                                     fg_data.num_uv_points[uv_pl], scaling);
                    generate_scaling(bitdepth_from_max(bitdepth_max), fg_data[0].uv_points[uv_pl],
                                     fg_data[0].num_uv_points[uv_pl], scaling);

                    fg_data.uv_mult[uv_pl] = (rnd() & 0xff) - 128;
                    fg_data.uv_luma_mult[uv_pl] = (rnd() & 0xff) - 128;
                    fg_data.uv_offset[uv_pl] = (rnd() & 0x1ff) - 256;
                    fg_data[0].uv_mult[uv_pl] = (rnd() & 0xff) - 128;
                    fg_data[0].uv_luma_mult[uv_pl] = (rnd() & 0xff) - 128;
                    fg_data[0].uv_offset[uv_pl] = (rnd() & 0x1ff) - 256;
                }

                fg_data.clip_to_restricted_range = rnd() & 1;
                fg_data.scaling_shift = (rnd() & 3) + 8;
                fg_data.chroma_scaling_from_luma = csfl;
                for (fg_data.overlap_flag = 0; fg_data.overlap_flag <= 1;
                     fg_data.overlap_flag++)
                fg_data[0].clip_to_restricted_range = rnd() & 1;
                fg_data[0].scaling_shift = (rnd() & 3) + 8;
                fg_data[0].chroma_scaling_from_luma = csfl;
                for (fg_data[0].overlap_flag = 0; fg_data[0].overlap_flag <= 1;
                     fg_data[0].overlap_flag++)
                {
                    call_ref(c_dst, src, stride, &fg_data, w, scaling, grain_lut[1], h,
                    call_ref(c_dst, src, stride, fg_data, w, scaling, grain_lut[1], h,
                             row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
                    call_new(a_dst, src, stride, &fg_data, w, scaling, grain_lut[1], h,
                    call_new(a_dst, src, stride, fg_data, w, scaling, grain_lut[1], h,
                             row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);

                    checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst");
                }

                fg_data.overlap_flag = 1;
                bench_new(a_dst, src, stride, &fg_data, 32, scaling, grain_lut[1], 16,
                fg_data[0].overlap_flag = 1;
                bench_new(a_dst, src, stride, fg_data, 32, scaling, grain_lut[1], 16,
                          row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
            }
        }
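The mechanical change through this file swaps a plain Dav1dFilmGrainData stack variable for a one-element array declared with ALIGN_STK_16, giving the struct the stack alignment the SIMD code can rely on; members are then reached through fg_data[0] and the array decays to the pointer the DSP functions take. A minimal sketch of the shape (the real macro lives in dav1d's common attribute header; use() is a hypothetical stand-in for the DSP calls):

    #define ALIGN_STK_16_SKETCH(type, name) \
        __attribute__((aligned(16))) type name[1]

    void fg_data_pattern(void) {
        ALIGN_STK_16_SKETCH(Dav1dFilmGrainData, fg_data);
        fg_data[0].seed = 0; /* members are reached through element 0   */
        use(fg_data);        /* the array decays to Dav1dFilmGrainData* */
    }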
@@ -66,9 +66,9 @@ static const uint8_t z_angles[27] = {
};

static void check_intra_pred(Dav1dIntraPredDSPContext *const c) {
    ALIGN_STK_32(pixel, c_dst, 64 * 64,);
    ALIGN_STK_32(pixel, a_dst, 64 * 64,);
    ALIGN_STK_32(pixel, topleft_buf, 257,);
    ALIGN_STK_64(pixel, c_dst, 64 * 64,);
    ALIGN_STK_64(pixel, a_dst, 64 * 64,);
    ALIGN_STK_64(pixel, topleft_buf, 257,);
    pixel *const topleft = topleft_buf + 128;

    declare_func(void, pixel *dst, ptrdiff_t stride, const pixel *topleft,

@@ -132,9 +132,9 @@ static void check_intra_pred(Dav1dIntraPredDSPContext *const c) {
}

static void check_cfl_ac(Dav1dIntraPredDSPContext *const c) {
    ALIGN_STK_32(int16_t, c_dst, 32 * 32,);
    ALIGN_STK_32(int16_t, a_dst, 32 * 32,);
    ALIGN_STK_32(pixel, luma, 32 * 32,);
    ALIGN_STK_64(int16_t, c_dst, 32 * 32,);
    ALIGN_STK_64(int16_t, a_dst, 32 * 32,);
    ALIGN_STK_64(pixel, luma, 32 * 32,);

    declare_func(void, int16_t *ac, const pixel *y, ptrdiff_t stride,
                 int w_pad, int h_pad, int cw, int ch);

@@ -175,10 +175,10 @@ static void check_cfl_ac(Dav1dIntraPredDSPContext *const c) {
}

static void check_cfl_pred(Dav1dIntraPredDSPContext *const c) {
    ALIGN_STK_32(pixel, c_dst, 32 * 32,);
    ALIGN_STK_32(pixel, a_dst, 32 * 32,);
    ALIGN_STK_32(int16_t, ac, 32 * 32,);
    ALIGN_STK_32(pixel, topleft_buf, 257,);
    ALIGN_STK_64(pixel, c_dst, 32 * 32,);
    ALIGN_STK_64(pixel, a_dst, 32 * 32,);
    ALIGN_STK_64(int16_t, ac, 32 * 32,);
    ALIGN_STK_64(pixel, topleft_buf, 257,);
    pixel *const topleft = topleft_buf + 128;

    declare_func(void, pixel *dst, ptrdiff_t stride, const pixel *topleft,

@@ -227,9 +227,9 @@ static void check_cfl_pred(Dav1dIntraPredDSPContext *const c) {
}

static void check_pal_pred(Dav1dIntraPredDSPContext *const c) {
    ALIGN_STK_32(pixel, c_dst, 64 * 64,);
    ALIGN_STK_32(pixel, a_dst, 64 * 64,);
    ALIGN_STK_32(uint8_t, idx, 64 * 64,);
    ALIGN_STK_64(pixel, c_dst, 64 * 64,);
    ALIGN_STK_64(pixel, a_dst, 64 * 64,);
    ALIGN_STK_64(uint8_t, idx, 64 * 64,);
    ALIGN_STK_16(uint16_t, pal, 8,);

    declare_func(void, pixel *dst, ptrdiff_t stride, const uint16_t *pal,
@@ -226,9 +226,9 @@ void bitfn(checkasm_check_itx)(void) {
    Dav1dInvTxfmDSPContext c;
    bitfn(dav1d_itx_dsp_init)(&c);

    ALIGN_STK_32(coef, coeff, 2, [32 * 32]);
    ALIGN_STK_32(pixel, c_dst, 64 * 64,);
    ALIGN_STK_32(pixel, a_dst, 64 * 64,);
    ALIGN_STK_64(coef, coeff, 2, [32 * 32]);
    ALIGN_STK_64(pixel, c_dst, 64 * 64,);
    ALIGN_STK_64(pixel, a_dst, 64 * 64,);

    static const uint8_t txfm_size_order[N_RECT_TX_SIZES] = {
        TX_4X4, RTX_4X8, RTX_4X16,
@@ -95,8 +95,8 @@ static void check_lpf_sb(loopfilter_sb_fn fn, const char *const name,
                         const int n_blks, const int lf_idx,
                         const int is_chroma, const int dir)
{
    ALIGN_STK_32(pixel, c_dst_mem, 128 * 16,);
    ALIGN_STK_32(pixel, a_dst_mem, 128 * 16,);
    ALIGN_STK_64(pixel, c_dst_mem, 128 * 16,);
    ALIGN_STK_64(pixel, a_dst_mem, 128 * 16,);

    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const uint32_t *mask,
                 const uint8_t (*l)[4], ptrdiff_t b4_stride,
@@ -43,10 +43,10 @@ static void init_tmp(pixel *buf, const ptrdiff_t stride,
     }
 }

-static void check_wiener(Dav1dLoopRestorationDSPContext *const c) {
-    ALIGN_STK_32(pixel, c_dst, 448 * 64,);
-    ALIGN_STK_32(pixel, a_dst, 448 * 64,);
-    ALIGN_STK_32(pixel, h_edge, 448 * 8,);
+static void check_wiener(Dav1dLoopRestorationDSPContext *const c, const int bpc) {
+    ALIGN_STK_64(pixel, c_dst, 448 * 64,);
+    ALIGN_STK_64(pixel, a_dst, 448 * 64,);
+    ALIGN_STK_64(pixel, h_edge, 448 * 8,);
     pixel left[64][4];

     declare_func(void, pixel *dst, ptrdiff_t dst_stride,
@@ -58,7 +58,7 @@ static void check_wiener(Dav1dLoopRestorationDSPContext *const c) {

     for (int pl = 0; pl < 2; pl++) {
         if (check_func(c->wiener, "wiener_%s_%dbpc",
-                       pl ? "chroma" : "luma", BITDEPTH))
+                       pl ? "chroma" : "luma", bpc))
         {
             int16_t filter[2][3], filter_v[7], filter_h[7];

@@ -81,11 +81,7 @@ static void check_wiener(Dav1dLoopRestorationDSPContext *const c) {

             const int base_w = 1 + (rnd() % 384);
             const int base_h = 1 + (rnd() & 63);
-#if BITDEPTH == 16
-            const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
-#else
-            const int bitdepth_max = 0xff;
-#endif
+            const int bitdepth_max = (1 << bpc) - 1;

             init_tmp(c_dst, 448 * sizeof(pixel), 448, 64, bitdepth_max);
             init_tmp(h_edge, 448 * sizeof(pixel), 448, 8, bitdepth_max);
@@ -112,13 +108,12 @@ static void check_wiener(Dav1dLoopRestorationDSPContext *const c) {
                       256, 64, filter_h, filter_v, 0xf HIGHBD_TAIL_SUFFIX);
         }
     }
-    report("wiener");
 }

-static void check_sgr(Dav1dLoopRestorationDSPContext *const c) {
-    ALIGN_STK_32(pixel, c_dst, 448 * 64,);
-    ALIGN_STK_32(pixel, a_dst, 448 * 64,);
-    ALIGN_STK_32(pixel, h_edge, 448 * 8,);
+static void check_sgr(Dav1dLoopRestorationDSPContext *const c, const int bpc) {
+    ALIGN_STK_64(pixel, c_dst, 448 * 64,);
+    ALIGN_STK_64(pixel, a_dst, 448 * 64,);
+    ALIGN_STK_64(pixel, h_edge, 448 * 8,);
     pixel left[64][4];

     declare_func(void, pixel *dst, ptrdiff_t dst_stride,
@@ -130,7 +125,7 @@ static void check_sgr(Dav1dLoopRestorationDSPContext *const c) {

     for (int sgr_idx = 14; sgr_idx >= 6; sgr_idx -= 4) {
         if (check_func(c->selfguided, "selfguided_%s_%dbpc",
-                       sgr_idx == 6 ? "mix" : sgr_idx == 10 ? "3x3" : "5x5", BITDEPTH))
+                       sgr_idx == 6 ? "mix" : sgr_idx == 10 ? "3x3" : "5x5", bpc))
         {
             int16_t sgr_wt[2];

@@ -140,11 +135,7 @@ static void check_sgr(Dav1dLoopRestorationDSPContext *const c) {

             const int base_w = 1 + (rnd() % 384);
             const int base_h = 1 + (rnd() & 63);
-#if BITDEPTH == 16
-            const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
-#else
-            const int bitdepth_max = 0xff;
-#endif
+            const int bitdepth_max = (1 << bpc) - 1;

             init_tmp(c_dst, 448 * sizeof(pixel), 448, 64, bitdepth_max);
             init_tmp(h_edge, 448 * sizeof(pixel), 448, 8, bitdepth_max);
@@ -171,14 +162,24 @@ static void check_sgr(Dav1dLoopRestorationDSPContext *const c) {
                       256, 64, sgr_idx, sgr_wt, 0xf HIGHBD_TAIL_SUFFIX);
         }
     }
-    report("sgr");
 }

 void bitfn(checkasm_check_looprestoration)(void) {
-    Dav1dLoopRestorationDSPContext c;
-
-    bitfn(dav1d_loop_restoration_dsp_init)(&c);
-
-    check_wiener(&c);
-    check_sgr(&c);
+#if BITDEPTH == 16
+    const int bpc_min = 10, bpc_max = 12;
+#else
+    const int bpc_min = 8, bpc_max = 8;
+#endif
+    for (int bpc = bpc_min; bpc <= bpc_max; bpc += 2) {
+        Dav1dLoopRestorationDSPContext c;
+        bitfn(dav1d_loop_restoration_dsp_init)(&c, bpc);
+        check_wiener(&c, bpc);
+    }
+    report("wiener");
+    for (int bpc = bpc_min; bpc <= bpc_max; bpc += 2) {
+        Dav1dLoopRestorationDSPContext c;
+        bitfn(dav1d_loop_restoration_dsp_init)(&c, bpc);
+        check_sgr(&c, bpc);
+    }
+    report("sgr");
 }
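Note on the hunks above: the loop-restoration tests now take the bit depth as a runtime bpc parameter instead of deriving it from the compile-time BITDEPTH constant, so a single 16-bit build exercises both the 10- and 12-bit code paths. A standalone sketch of the arithmetic (illustrative only, not dav1d API):

    #include <stdio.h>

    /* (1 << bpc) - 1 yields the max pixel value, reproducing the constants
     * the old "#if BITDEPTH == 16" block hardcoded: 0xff, 0x3ff, 0xfff.
     * The real tests loop over 10/12 in 16-bit builds and just 8 otherwise. */
    int main(void) {
        for (int bpc = 8; bpc <= 12; bpc += 2) {
            const int bitdepth_max = (1 << bpc) - 1;
            printf("bpc=%2d -> bitdepth_max=0x%x\n", bpc, bitdepth_max);
        }
        return 0;
    }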
@@ -55,9 +55,9 @@ static int mc_h_next(const int h) {
 }

 static void check_mc(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(pixel, src_buf, 135 * 135,);
-    ALIGN_STK_32(pixel, c_dst, 128 * 128,);
-    ALIGN_STK_32(pixel, a_dst, 128 * 128,);
+    ALIGN_STK_64(pixel, src_buf, 135 * 135,);
+    ALIGN_STK_64(pixel, c_dst, 128 * 128,);
+    ALIGN_STK_64(pixel, a_dst, 128 * 128,);
     const pixel *src = src_buf + 135 * 3 + 3;
     const ptrdiff_t src_stride = 135 * sizeof(pixel);

@@ -118,9 +118,9 @@ static void generate_mct_input(pixel *const buf, const int bitdepth_max) {
 }

 static void check_mct(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(pixel, src_buf, 135 * 135,);
-    ALIGN_STK_32(int16_t, c_tmp, 128 * 128,);
-    ALIGN_STK_32(int16_t, a_tmp, 128 * 128,);
+    ALIGN_STK_64(pixel, src_buf, 135 * 135,);
+    ALIGN_STK_64(int16_t, c_tmp, 128 * 128,);
+    ALIGN_STK_64(int16_t, a_tmp, 128 * 128,);
     const pixel *src = src_buf + 135 * 3 + 3;
     const ptrdiff_t src_stride = 135 * sizeof(pixel);

@@ -173,9 +173,9 @@ static void init_tmp(Dav1dMCDSPContext *const c, pixel *const buf,
 }

 static void check_avg(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(int16_t, tmp, 2, [128 * 128]);
-    ALIGN_STK_32(pixel, c_dst, 135 * 135,);
-    ALIGN_STK_32(pixel, a_dst, 128 * 128,);
+    ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
+    ALIGN_STK_64(pixel, c_dst, 135 * 135,);
+    ALIGN_STK_64(pixel, a_dst, 128 * 128,);

     declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
                  const int16_t *tmp2, int w, int h HIGHBD_DECL_SUFFIX);
@@ -204,9 +204,9 @@ static void check_avg(Dav1dMCDSPContext *const c) {
 }

 static void check_w_avg(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(int16_t, tmp, 2, [128 * 128]);
-    ALIGN_STK_32(pixel, c_dst, 135 * 135,);
-    ALIGN_STK_32(pixel, a_dst, 128 * 128,);
+    ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
+    ALIGN_STK_64(pixel, c_dst, 135 * 135,);
+    ALIGN_STK_64(pixel, a_dst, 128 * 128,);

     declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
                  const int16_t *tmp2, int w, int h, int weight HIGHBD_DECL_SUFFIX);
@@ -236,10 +236,10 @@ static void check_w_avg(Dav1dMCDSPContext *const c) {
 }

 static void check_mask(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(int16_t, tmp, 2, [128 * 128]);
-    ALIGN_STK_32(pixel, c_dst, 135 * 135,);
-    ALIGN_STK_32(pixel, a_dst, 128 * 128,);
-    ALIGN_STK_32(uint8_t, mask, 128 * 128,);
+    ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
+    ALIGN_STK_64(pixel, c_dst, 135 * 135,);
+    ALIGN_STK_64(pixel, a_dst, 128 * 128,);
+    ALIGN_STK_64(uint8_t, mask, 128 * 128,);

     for (int i = 0; i < 128 * 128; i++)
         mask[i] = rnd() % 65;
@@ -271,11 +271,11 @@ static void check_mask(Dav1dMCDSPContext *const c) {
 }

 static void check_w_mask(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(int16_t, tmp, 2, [128 * 128]);
-    ALIGN_STK_32(pixel, c_dst, 135 * 135,);
-    ALIGN_STK_32(pixel, a_dst, 128 * 128,);
-    ALIGN_STK_32(uint8_t, c_mask, 128 * 128,);
-    ALIGN_STK_32(uint8_t, a_mask, 128 * 128,);
+    ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
+    ALIGN_STK_64(pixel, c_dst, 135 * 135,);
+    ALIGN_STK_64(pixel, a_dst, 128 * 128,);
+    ALIGN_STK_64(uint8_t, c_mask, 128 * 128,);
+    ALIGN_STK_64(uint8_t, a_mask, 128 * 128,);

     declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
                  const int16_t *tmp2, int w, int h, uint8_t *mask, int sign
@@ -321,10 +321,10 @@ static void check_w_mask(Dav1dMCDSPContext *const c) {
 }

 static void check_blend(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(pixel, tmp, 32 * 32,);
-    ALIGN_STK_32(pixel, c_dst, 32 * 32,);
-    ALIGN_STK_32(pixel, a_dst, 32 * 32,);
-    ALIGN_STK_32(uint8_t, mask, 32 * 32,);
+    ALIGN_STK_64(pixel, tmp, 32 * 32,);
+    ALIGN_STK_64(pixel, c_dst, 32 * 32,);
+    ALIGN_STK_64(pixel, a_dst, 32 * 32,);
+    ALIGN_STK_64(uint8_t, mask, 32 * 32,);

     declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
                  int w, int h, const uint8_t *mask);
@@ -357,9 +357,9 @@ static void check_blend(Dav1dMCDSPContext *const c) {
 }

 static void check_blend_v(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(pixel, tmp, 32 * 128,);
-    ALIGN_STK_32(pixel, c_dst, 32 * 128,);
-    ALIGN_STK_32(pixel, a_dst, 32 * 128,);
+    ALIGN_STK_64(pixel, tmp, 32 * 128,);
+    ALIGN_STK_64(pixel, c_dst, 32 * 128,);
+    ALIGN_STK_64(pixel, a_dst, 32 * 128,);

     declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
                  int w, int h);
@@ -391,9 +391,9 @@ static void check_blend_v(Dav1dMCDSPContext *const c) {
 }

 static void check_blend_h(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(pixel, tmp, 128 * 32,);
-    ALIGN_STK_32(pixel, c_dst, 128 * 32,);
-    ALIGN_STK_32(pixel, a_dst, 128 * 32,);
+    ALIGN_STK_64(pixel, tmp, 128 * 32,);
+    ALIGN_STK_64(pixel, c_dst, 128 * 32,);
+    ALIGN_STK_64(pixel, a_dst, 128 * 32,);

     declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
                  int w, int h);
@@ -424,9 +424,9 @@ static void check_blend_h(Dav1dMCDSPContext *const c) {
 }

 static void check_warp8x8(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(pixel, src_buf, 15 * 15,);
-    ALIGN_STK_32(pixel, c_dst, 8 * 8,);
-    ALIGN_STK_32(pixel, a_dst, 8 * 8,);
+    ALIGN_STK_64(pixel, src_buf, 15 * 15,);
+    ALIGN_STK_64(pixel, c_dst, 8 * 8,);
+    ALIGN_STK_64(pixel, a_dst, 8 * 8,);
     int16_t abcd[4];
     const pixel *src = src_buf + 15 * 3 + 3;
     const ptrdiff_t dst_stride = 8 * sizeof(pixel);
@@ -462,9 +462,9 @@ static void check_warp8x8(Dav1dMCDSPContext *const c) {
 }

 static void check_warp8x8t(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(pixel, src_buf, 15 * 15,);
-    ALIGN_STK_32(int16_t, c_tmp, 8 * 8,);
-    ALIGN_STK_32(int16_t, a_tmp, 8 * 8,);
+    ALIGN_STK_64(pixel, src_buf, 15 * 15,);
+    ALIGN_STK_64(int16_t, c_tmp, 8 * 8,);
+    ALIGN_STK_64(int16_t, a_tmp, 8 * 8,);
     int16_t abcd[4];
     const pixel *src = src_buf + 15 * 3 + 3;
     const ptrdiff_t src_stride = 15 * sizeof(pixel);
@@ -534,9 +534,9 @@ static void random_offset_for_edge(int *const x, int *const y,
 }

 static void check_emuedge(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(pixel, c_dst, 135 * 192,);
-    ALIGN_STK_32(pixel, a_dst, 135 * 192,);
-    ALIGN_STK_32(pixel, src, 160 * 160,);
+    ALIGN_STK_64(pixel, c_dst, 135 * 192,);
+    ALIGN_STK_64(pixel, a_dst, 135 * 192,);
+    ALIGN_STK_64(pixel, src, 160 * 160,);

     for (int i = 0; i < 160 * 160; i++)
         src[i] = rnd() & ((1U << BITDEPTH) - 1);
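The blanket ALIGN_STK_32 -> ALIGN_STK_64 change across these test files appears to prepare for the AVX-512 code paths added in this release: a full ZMM load/store wants a 64-byte-aligned buffer. A rough C11 stand-in for what such a macro does (the real one lives in dav1d's common headers and takes a trailing subscript argument; names here are mine):

    #include <stdalign.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative sketch of a 64-byte-aligned stack buffer, so SIMD code
     * using 512-bit registers never faults or slows down on alignment. */
    #define ALIGN_STK_64_SKETCH(type, name, count) \
        alignas(64) type name[count]

    int main(void) {
        ALIGN_STK_64_SKETCH(uint8_t, dst, 128 * 128);
        dst[0] = 1; /* touch the buffer so it isn't optimized away */
        printf("aligned: %d\n", ((uintptr_t)dst & 63) == 0);
        return 0;
    }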
@@ -258,6 +258,12 @@ void checkasm_check_msac(void) {
         c.bool = dav1d_msac_decode_bool_sse2;
         c.hi_tok = dav1d_msac_decode_hi_tok_sse2;
     }
+
+#if ARCH_X86_64
+    if (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_AVX2) {
+        c.symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2;
+    }
+#endif
 #endif

     uint8_t buf[BUF_SIZE];
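The msac hunk follows dav1d's usual runtime-dispatch pattern: entry points start out pointing at the portable C implementation, and individual ones are overridden when the detected CPU flags allow. A self-contained sketch of the pattern (names and flag values are illustrative, not dav1d's):

    #include <stdio.h>

    typedef unsigned (*decode_fn)(unsigned state);

    static unsigned decode_c(unsigned state)    { return state + 1; }
    static unsigned decode_avx2(unsigned state) { return state + 2; }

    enum { CPU_FLAG_AVX2_SKETCH = 1 << 0 };

    typedef struct { decode_fn symbol_adapt16; } MsacDSPSketch;

    static void msac_dsp_init_sketch(MsacDSPSketch *const c, const unsigned flags) {
        c->symbol_adapt16 = decode_c;          /* portable fallback first */
        if (flags & CPU_FLAG_AVX2_SKETCH)
            c->symbol_adapt16 = decode_avx2;   /* override when supported */
    }

    int main(void) {
        MsacDSPSketch c;
        msac_dsp_init_sketch(&c, CPU_FLAG_AVX2_SKETCH);
        printf("%u\n", c.symbol_adapt16(0));   /* prints 2: AVX2 path chosen */
        return 0;
    }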
@@ -170,6 +170,19 @@ cglobal checked_call, 2,15,16,max_args*8+8
 .ok:
     RET

+; trigger a warmup of vector units
+%macro WARMUP 0
+cglobal warmup, 0, 0
+    xorps m0, m0
+    mulps m0, m0
+    RET
+%endmacro
+
+INIT_YMM avx2
+WARMUP
+INIT_ZMM avx512
+WARMUP
+
 %else

 ; just random numbers to reduce the chance of incidental match
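The WARMUP macro presumably exists because the wide vector units can power down when idle, so the first AVX2/AVX-512 instructions of a benchmark run slower than steady state; issuing a dummy multiply beforehand keeps that transition out of the timings. An equivalent sketch in C intrinsics (compile with -mavx2; the function name is mine, not checkasm's):

    #include <immintrin.h>

    /* One throwaway 256-bit multiply to wake the upper vector units before
     * any timed SIMD call; the volatile sink keeps the ops from being
     * eliminated. */
    static void warmup_avx2(void) {
        __m256 x = _mm256_setzero_ps();
        x = _mm256_mul_ps(x, x);
        volatile float sink = _mm256_cvtss_f32(x);
        (void)sink;
    }

    int main(void) {
        warmup_avx2();
        return 0;
    }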
@@ -0,0 +1,98 @@
+# Copyright © 2020, VideoLAN and dav1d authors
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#
+# Build definition for the dav1d fuzzing binaries
+#
+
+dav1d_fuzzer_sources = files('dav1d_fuzzer.c')
+fuzzer_ldflags = []
+fuzzer_link_lang = {}
+
+if get_option('fuzzer_ldflags') != ''
+    fuzzer_ldflags += [get_option('fuzzer_ldflags')]
+endif
+
+if fuzzing_engine == 'none'
+    dav1d_fuzzer_sources += files('main.c')
+elif fuzzing_engine == 'libfuzzer'
+    fuzzer_ldflags += ['-fsanitize=fuzzer']
+elif fuzzing_engine == 'oss-fuzz'
+    # libFuzzingEngine needs c++
+    add_languages('cpp')
+    fuzzer_link_lang = {'link_language': 'cpp'}
+endif
+
+dav1d_fuzzer = executable('dav1d_fuzzer',
+    dav1d_fuzzer_sources,
+    include_directories: dav1d_inc_dirs,
+    c_args: [stackalign_flag, stackrealign_flag],
+    link_args: fuzzer_ldflags,
+    link_with : libdav1d,
+    build_by_default: true,
+    dependencies : [thread_dependency],
+    kwargs: fuzzer_link_lang
+    )
+
+dav1d_fuzzer_mt = executable('dav1d_fuzzer_mt',
+    dav1d_fuzzer_sources,
+    include_directories: dav1d_inc_dirs,
+    c_args: [stackalign_flag, stackrealign_flag, '-DDAV1D_MT_FUZZING'],
+    link_args: fuzzer_ldflags,
+    link_with : libdav1d,
+    build_by_default: true,
+    dependencies : [thread_dependency],
+    kwargs: fuzzer_link_lang
+    )
+
+objcopy = find_program('objcopy',
+                       required: false)
+if (objcopy.found() and
+    not get_option('b_lto') and
+    get_option('default_library') == 'static' and
+    cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args))
+
+    libdav1d_af = custom_target('libdav1d_af',
+        input: libdav1d,
+        output: 'libdav1d_af.a',
+        depends: libdav1d,
+        command: [objcopy,
+                  '--redefine-sym', 'malloc=__wrap_malloc',
+                  '--redefine-sym', 'posix_memalign=__wrap_posix_memalign',
+                  '--redefine-sym', 'pthread_create=__wrap_pthread_create',
+                  '--redefine-sym', 'pthread_cond_init=__wrap_pthread_cond_init',
+                  '--redefine-sym', 'pthread_mutex_init=__wrap_pthread_mutex_init',
+                  '@INPUT@', '@OUTPUT@'])
+
+    dav1d_fuzzer_mem = executable('dav1d_fuzzer_mem',
+        dav1d_fuzzer_sources + ['alloc_fail.c'],
+        include_directories: dav1d_inc_dirs,
+        c_args: [stackalign_flag, stackrealign_flag, '-DDAV1D_ALLOC_FAIL'],
+        link_args: fuzzer_ldflags + [join_paths(libdav1d_af.full_path())],
+        link_depends: libdav1d_af,
+        build_by_default: false,
+        dependencies : [thread_dependency],
+        kwargs: fuzzer_link_lang
+        )
+endif
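The 'libfuzzer' and 'oss-fuzz' engines configured above both drive the harness through the standard libFuzzer entry point; dav1d_fuzzer.c implements it, and main.c supplies a main() only for the 'none' engine. The shape of that entry point, with the actual decoding elided (the body here is a placeholder, not the real harness):

    #include <stddef.h>
    #include <stdint.h>

    /* Called once per generated input; linking with -fsanitize=fuzzer (or
     * OSS-Fuzz's libFuzzingEngine) supplies the driving main(). */
    int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
        (void)data;
        (void)size;  /* the real harness feeds `data` to the AV1 decoder */
        return 0;    /* return value is ignored by libFuzzer */
    }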
@@ -90,76 +90,20 @@ if is_asm_enabled
         include_directories: dav1d_inc_dirs,
         c_args: [stackalign_flag, stackrealign_flag],
         build_by_default: false,
-        dependencies : [thread_dependency, rt_dependency, m_lib],
+        dependencies : [
+            thread_dependency,
+            rt_dependency,
+            libdl_dependency,
+            m_lib,
+            ],
         )

     test('checkasm', checkasm, is_parallel: false)
 endif

-dav1d_fuzzer_sources = files('libfuzzer/dav1d_fuzzer.c')
-fuzzer_ldflags = []
-
-if get_option('fuzzer_ldflags') != ''
-    fuzzer_ldflags += [get_option('fuzzer_ldflags')]
-endif
-
-if fuzzing_engine == 'none'
-    dav1d_fuzzer_sources += files('libfuzzer/main.c')
-elif fuzzing_engine == 'libfuzzer'
-    fuzzer_ldflags += ['-fsanitize=fuzzer']
-elif fuzzing_engine == 'oss-fuzz'
-    # libFuzzingEngine needs libc++
-    fuzzer_ldflags += ['-lc++']
-endif
-
-dav1d_fuzzer = executable('dav1d_fuzzer',
-    dav1d_fuzzer_sources,
-    include_directories: dav1d_inc_dirs,
-    c_args: [stackalign_flag, stackrealign_flag],
-    link_args: fuzzer_ldflags,
-    link_with : libdav1d,
-    build_by_default: true,
-    dependencies : [thread_dependency],
-    )
-
-dav1d_fuzzer_mt = executable('dav1d_fuzzer_mt',
-    dav1d_fuzzer_sources,
-    include_directories: dav1d_inc_dirs,
-    c_args: [stackalign_flag, stackrealign_flag, '-DDAV1D_MT_FUZZING'],
-    link_args: fuzzer_ldflags,
-    link_with : libdav1d,
-    build_by_default: true,
-    dependencies : [thread_dependency],
-    )
-
-objcopy = find_program('objcopy',
-                       required: false)
-if (objcopy.found() and
-    not get_option('b_lto') and
-    get_option('default_library') == 'static' and
-    cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args))
-
-    libdav1d_af = custom_target('libdav1d_af',
-        input: libdav1d,
-        output: 'libdav1d_af.a',
-        depends: libdav1d,
-        command: [objcopy,
-                  '--redefine-sym', 'malloc=__wrap_malloc',
-                  '--redefine-sym', 'posix_memalign=__wrap_posix_memalign',
-                  '--redefine-sym', 'pthread_create=__wrap_pthread_create',
-                  '--redefine-sym', 'pthread_cond_init=__wrap_pthread_cond_init',
-                  '--redefine-sym', 'pthread_mutex_init=__wrap_pthread_mutex_init',
-                  '@INPUT@', '@OUTPUT@'])
-
-    dav1d_fuzzer_mem = executable('dav1d_fuzzer_mem',
-        dav1d_fuzzer_sources + ['libfuzzer/alloc_fail.c'],
-        include_directories: dav1d_inc_dirs,
-        c_args: [stackalign_flag, stackrealign_flag, '-DDAV1D_ALLOC_FAIL'],
-        link_args: fuzzer_ldflags + [join_paths(libdav1d_af.full_path())],
-        link_depends: libdav1d_af,
-        build_by_default: false,
-        dependencies : [thread_dependency],
-        )
+# fuzzing binaries
+if meson.version().version_compare('>=0.49')
+    subdir('libfuzzer')
 endif

 # Include dav1d test data repository with additional tests
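The objcopy --redefine-sym block, carried over into the new libfuzzer/meson.build, renames allocation symbols inside a static libdav1d so the harness can interpose them; alloc_fail.c provides the real wrappers. An illustrative sketch of what such a wrapper can do (the counters and fail policy here are made up, not dav1d's):

    #include <stdlib.h>

    /* After objcopy, every malloc call inside libdav1d_af.a resolves to
     * __wrap_malloc; the wrapper itself still sees the real malloc, so it
     * can pass calls through until the chosen one, then inject an OOM. */
    static unsigned long alloc_seen, alloc_fail_after;

    void *__wrap_malloc(size_t sz) {
        if (alloc_fail_after && ++alloc_seen >= alloc_fail_after)
            return NULL;    /* simulated allocation failure */
        return malloc(sz);
    }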
@@ -113,18 +113,24 @@ static void synchronize(const int realtime, const unsigned cache,
 static void print_stats(const int istty, const unsigned n, const unsigned num,
                         const uint64_t elapsed, const double i_fps)
 {
-    if (istty) fputs("\r", stderr);
-    const double d_fps = 1e9 * n / elapsed;
-    const double speed = d_fps / i_fps;
-    if (num == 0xFFFFFFFF) {
-        fprintf(stderr, "Decoded %u frames", n);
-    } else {
-        fprintf(stderr, "Decoded %u/%u frames (%.1lf%%)", n, num,
-                100.0 * n / num);
-    }
-    if (i_fps)
-        fprintf(stderr, " - %.2lf/%.2lf fps (%.2lfx)", d_fps, i_fps, speed);
-    if (!istty) fputs("\n", stderr);
+    char buf[80], *b = buf, *const end = buf + 80;
+
+    if (istty)
+        *b++ = '\r';
+    if (num == 0xFFFFFFFF)
+        b += snprintf(b, end - b, "Decoded %u frames", n);
+    else
+        b += snprintf(b, end - b, "Decoded %u/%u frames (%.1lf%%)",
+                      n, num, 100.0 * n / num);
+    if (i_fps && b < end) {
+        const double d_fps = 1e9 * n / elapsed;
+        const double speed = d_fps / i_fps;
+        b += snprintf(b, end - b, " - %.2lf/%.2lf fps (%.2lfx)",
+                      d_fps, i_fps, speed);
+    }
+    if (!istty)
+        strcpy(b > end - 2 ? end - 2 : b, "\n");
+    fputs(buf, stderr);
 }

 int main(const int argc, char *const *const argv) {
@@ -149,8 +155,6 @@ int main(const int argc, char *const *const argv) {
         return EXIT_FAILURE;
     }

-    init_demuxers();
-    init_muxers();
     parse(argc, argv, &cli_settings, &lib_settings);

     if ((res = input_open(&in, cli_settings.demuxer,
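The print_stats rewrite formats the whole status line into one buffer and emits it with a single fputs, so a '\r'-refreshed progress line can't be torn by interleaved stderr writes, and it now also computes the fps figures only when i_fps is nonzero (avoiding a division by zero for speed). A trimmed, standalone sketch of the same pattern (not the real function):

    #include <stdio.h>

    static void status_line(const unsigned n, const unsigned num) {
        char buf[80], *b = buf, *const end = buf + sizeof(buf);
        *b++ = '\r';  /* rewind to column 0 on a terminal */
        b += snprintf(b, end - b, "Decoded %u/%u frames (%.1f%%)",
                      n, num, 100.0 * n / num);
        if (b < end)          /* snprintf reports the would-be length */
            fputs(buf, stderr);  /* one write: no torn output */
    }

    int main(void) {
        status_line(42, 100);
        fputs("\n", stderr);
        return 0;
    }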
@@ -86,7 +86,7 @@ static const struct option long_opts[] = {
 #define ALLOWED_CPU_MASKS " or 'neon'"
 #elif ARCH_X86
 #define ALLOWED_CPU_MASKS \
-    ", 'sse2', 'ssse3', 'sse41', 'avx2' or 'avx512'"
+    ", 'sse2', 'ssse3', 'sse41', 'avx2', 'avx512' or 'avx512icl'"
 #else
 #define ALLOWED_CPU_MASKS "not yet implemented for this architecture"
 #endif
@@ -176,15 +176,11 @@ typedef struct EnumParseTable {

 #if ARCH_X86
 enum CpuMask {
-    X86_CPU_MASK_SSE    = DAV1D_X86_CPU_FLAG_SSE,
-    X86_CPU_MASK_SSE2   = DAV1D_X86_CPU_FLAG_SSE2   | X86_CPU_MASK_SSE,
-    X86_CPU_MASK_SSE3   = DAV1D_X86_CPU_FLAG_SSE3   | X86_CPU_MASK_SSE2,
-    X86_CPU_MASK_SSSE3  = DAV1D_X86_CPU_FLAG_SSSE3  | X86_CPU_MASK_SSE3,
-    X86_CPU_MASK_SSE41  = DAV1D_X86_CPU_FLAG_SSE41  | X86_CPU_MASK_SSSE3,
-    X86_CPU_MASK_SSE42  = DAV1D_X86_CPU_FLAG_SSE42  | X86_CPU_MASK_SSE41,
-    X86_CPU_MASK_AVX    = DAV1D_X86_CPU_FLAG_AVX    | X86_CPU_MASK_SSE42,
-    X86_CPU_MASK_AVX2   = DAV1D_X86_CPU_FLAG_AVX2   | X86_CPU_MASK_AVX,
-    X86_CPU_MASK_AVX512 = DAV1D_X86_CPU_FLAG_AVX512 | X86_CPU_MASK_AVX2,
+    X86_CPU_MASK_SSE2      = DAV1D_X86_CPU_FLAG_SSE2,
+    X86_CPU_MASK_SSSE3     = DAV1D_X86_CPU_FLAG_SSSE3     | X86_CPU_MASK_SSE2,
+    X86_CPU_MASK_SSE41     = DAV1D_X86_CPU_FLAG_SSE41     | X86_CPU_MASK_SSSE3,
+    X86_CPU_MASK_AVX2      = DAV1D_X86_CPU_FLAG_AVX2      | X86_CPU_MASK_SSE41,
+    X86_CPU_MASK_AVX512ICL = DAV1D_X86_CPU_FLAG_AVX512ICL | X86_CPU_MASK_AVX2,
 };
 #endif

@@ -192,11 +188,11 @@ static const EnumParseTable cpu_mask_tbl[] = {
 #if ARCH_AARCH64 || ARCH_ARM
     { "neon", DAV1D_ARM_CPU_FLAG_NEON },
 #elif ARCH_X86
-    { "sse2",   X86_CPU_MASK_SSE2   },
-    { "ssse3",  X86_CPU_MASK_SSSE3  },
-    { "sse41",  X86_CPU_MASK_SSE41  },
-    { "avx2",   X86_CPU_MASK_AVX2   },
-    { "avx512", X86_CPU_MASK_AVX512 },
+    { "sse2",      X86_CPU_MASK_SSE2      },
+    { "ssse3",     X86_CPU_MASK_SSSE3     },
+    { "sse41",     X86_CPU_MASK_SSE41     },
+    { "avx2",      X86_CPU_MASK_AVX2      },
+    { "avx512icl", X86_CPU_MASK_AVX512ICL },
 #endif
     { 0 },
 };
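The reworked CpuMask enum drops tiers dav1d has no dedicated assembly for (SSE, SSE3, SSE42, AVX, plain AVX512) while keeping each remaining mask cumulative, so limiting dav1d to one tier still permits every tier below it. Illustrative flag values (not dav1d's actual constants) showing the arithmetic:

    #include <stdio.h>

    /* Each tier ORs its own flag with the previous mask, so e.g. an "avx2"
     * limit still enables the SSE kernels the AVX2 code assumes. */
    enum {
        FLAG_SSE2  = 1 << 0,
        FLAG_SSSE3 = 1 << 1,
        FLAG_SSE41 = 1 << 2,
        FLAG_AVX2  = 1 << 3,
    };
    enum {
        MASK_SSE2  = FLAG_SSE2,
        MASK_SSSE3 = FLAG_SSSE3 | MASK_SSE2,
        MASK_SSE41 = FLAG_SSE41 | MASK_SSSE3,
        MASK_AVX2  = FLAG_AVX2  | MASK_SSE41,
    };

    int main(void) {
        printf("avx2 mask = 0x%x\n", MASK_AVX2); /* 0xf: all four tiers */
        return 0;
    }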
@@ -43,21 +43,15 @@ struct DemuxerContext {
     const Demuxer *impl;
 };

-#define MAX_NUM_DEMUXERS 3
-static const Demuxer *demuxers[MAX_NUM_DEMUXERS];
-static int num_demuxers = 0;
-
-#define register_demuxer(impl) { \
-    extern const Demuxer impl; \
-    assert(num_demuxers < MAX_NUM_DEMUXERS); \
-    demuxers[num_demuxers++] = &impl; \
-}
-
-void init_demuxers(void) {
-    register_demuxer(ivf_demuxer);
-    register_demuxer(annexb_demuxer);
-    register_demuxer(section5_demuxer);
-}
+extern const Demuxer ivf_demuxer;
+extern const Demuxer annexb_demuxer;
+extern const Demuxer section5_demuxer;
+static const Demuxer *const demuxers[] = {
+    &ivf_demuxer,
+    &annexb_demuxer,
+    &section5_demuxer,
+    NULL
+};

 int input_open(DemuxerContext **const c_out,
                const char *const name, const char *const filename,
@@ -68,19 +62,19 @@ int input_open(DemuxerContext **const c_out,
     int res, i;

     if (name) {
-        for (i = 0; i < num_demuxers; i++) {
+        for (i = 0; demuxers[i]; i++) {
             if (!strcmp(demuxers[i]->name, name)) {
                 impl = demuxers[i];
                 break;
             }
         }
-        if (i == num_demuxers) {
+        if (!demuxers[i]) {
             fprintf(stderr, "Failed to find demuxer named \"%s\"\n", name);
             return DAV1D_ERR(ENOPROTOOPT);
         }
     } else {
         int probe_sz = 0;
-        for (i = 0; i < num_demuxers; i++)
+        for (i = 0; demuxers[i]; i++)
             probe_sz = imax(probe_sz, demuxers[i]->probe_sz);
         uint8_t *const probe_data = malloc(probe_sz);
         if (!probe_data) {
@@ -96,14 +90,14 @@ int input_open(DemuxerContext **const c_out,
             return errno ? DAV1D_ERR(errno) : DAV1D_ERR(EIO);
         }

-        for (i = 0; i < num_demuxers; i++) {
+        for (i = 0; demuxers[i]; i++) {
             if (demuxers[i]->probe(probe_data)) {
                 impl = demuxers[i];
                 break;
             }
         }
         free(probe_data);
-        if (i == num_demuxers) {
+        if (!demuxers[i]) {
             fprintf(stderr,
                     "Failed to probe demuxer for file %s\n",
                     filename);
@@ -111,11 +105,10 @@ int input_open(DemuxerContext **const c_out,
         }
     }

-    if (!(c = malloc(sizeof(DemuxerContext) + impl->priv_data_size))) {
+    if (!(c = calloc(1, sizeof(DemuxerContext) + impl->priv_data_size))) {
         fprintf(stderr, "Failed to allocate memory\n");
         return DAV1D_ERR(ENOMEM);
     }
-    memset(c, 0, sizeof(DemuxerContext) + impl->priv_data_size);
     c->impl = impl;
     c->data = (DemuxerPriv *) &c[1];
     if ((res = impl->open(c->data, filename, fps, num_frames, timebase)) < 0) {
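This demuxer rework (and the matching init_demuxers()/init_muxers() removal in dav1d.c above) replaces runtime registration with a const, NULL-terminated table: no init call, no MAX_NUM_DEMUXERS bound, and the loops simply walk until the sentinel. A reduced sketch of the pattern (types and entries here are illustrative):

    #include <stdio.h>
    #include <string.h>

    typedef struct { const char *name; } DemuxerSketch;

    static const DemuxerSketch ivf = { "ivf" }, annexb = { "annexb" };
    /* NULL sentinel terminates the table, so no separate count is needed. */
    static const DemuxerSketch *const demuxers_sketch[] = { &ivf, &annexb, NULL };

    static const DemuxerSketch *find_demuxer(const char *const name) {
        for (int i = 0; demuxers_sketch[i]; i++)
            if (!strcmp(demuxers_sketch[i]->name, name))
                return demuxers_sketch[i];
        return NULL; /* reached the sentinel: no match */
    }

    int main(void) {
        printf("%s\n", find_demuxer("ivf") ? "found" : "missing");
        return 0;
    }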