Mirror of https://github.com/mozilla/gecko-dev.git
Bug 1688992 - Update libdav1d to 0.8.2 for Firefox 88. r=dminor
Differential Revision: https://phabricator.services.mozilla.com/D106197
This commit is contained in:
Parent: a34308d565
Commit: 7c5470c9ff
@ -25,6 +25,10 @@ The rough steps are:
- Update ./moz.build and ./asm/moz.build to add new files and remove deleted ones using
  third_party/dav1d/src/meson.build as a guide (confirm with the diff) (note the
  empty .asm file in x86_64)
- Some files will be automatically added to the various autovendored_sources.mozbuild files.
  In the case of the asm dir, these may cause build failures on particular platforms which
  can be resolved by moving those out of autovendored_sources.mozbuild and into the regular
  moz.build which has a condition on CONFIG['CPU_ARCH'].
- Clone the tag from the dav1d repo and build a stand-alone libdav1d following the steps here:
  https://code.videolan.org/videolan/dav1d#compile
- Copy vcs_version.h from the local build/include/vcs_version.h

@ -1,4 +1,5 @@
sources = [
    '../../../third_party/dav1d/src/x86/cdef16_sse.asm',
    '../../../third_party/dav1d/src/x86/cdef_sse.asm',
    '../../../third_party/dav1d/src/x86/cpuid.asm',
    '../../../third_party/dav1d/src/x86/film_grain_ssse3.asm',

@ -83,6 +83,7 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
    # Empty file on all other archs. Nasm produces
    # an error when it compiles empty files.
    SOURCES += [
        '../../../third_party/dav1d/src/x86/cdef16_avx2.asm', # moved from autovendored
        '../../../third_party/dav1d/src/x86/cdef_avx2.asm',
        '../../../third_party/dav1d/src/x86/cdef_avx512.asm',
        '../../../third_party/dav1d/src/x86/film_grain.asm',

@ -90,6 +91,7 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
        '../../../third_party/dav1d/src/x86/itx.asm',
        '../../../third_party/dav1d/src/x86/loopfilter.asm',
        '../../../third_party/dav1d/src/x86/looprestoration.asm',
        '../../../third_party/dav1d/src/x86/looprestoration16_avx2.asm', # moved from autovendored
        '../../../third_party/dav1d/src/x86/mc_avx2.asm',
        '../../../third_party/dav1d/src/x86/mc_avx512.asm',
    ]

@ -185,7 +187,9 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64':
        '../../../third_party/dav1d/src/arm/32/cdef16.S',
        '../../../third_party/dav1d/src/arm/32/cdef_tmpl.S',
        '../../../third_party/dav1d/src/arm/32/ipred.S',
        '../../../third_party/dav1d/src/arm/32/ipred16.S',
        '../../../third_party/dav1d/src/arm/32/itx.S',
        '../../../third_party/dav1d/src/arm/32/itx16.S',
        '../../../third_party/dav1d/src/arm/32/loopfilter.S',
        '../../../third_party/dav1d/src/arm/32/loopfilter16.S',
        '../../../third_party/dav1d/src/arm/32/looprestoration.S',

@ -163,6 +163,7 @@ EXPORTS.dav1d += [
    '../../third_party/dav1d/include/common/attributes.h',
    '../../third_party/dav1d/include/common/bitdepth.h',
    '../../third_party/dav1d/include/common/dump.h',
    '../../third_party/dav1d/include/common/frame.h',
    '../../third_party/dav1d/include/common/intops.h',
    '../../third_party/dav1d/include/common/validate.h',
]

@ -20,11 +20,11 @@ origin:

  # Human-readable identifier for this version/release
  # Generally "version NNN", "tag SSS", "bookmark SSS"
  release: commit 6ed5fafb42c651c24b6a65fd4f50ed426fd72d65 (2021-01-01T21:36:25.000+01:00).
  release: commit f06148e7c755098666b9c0ed97a672a51785413a (2021-02-21T21:40:09.000+01:00).

  # Revision to pull in
  # Must be a long or short commit SHA (long preferred)
  revision: 6ed5fafb42c651c24b6a65fd4f50ed426fd72d65
  revision: f06148e7c755098666b9c0ed97a672a51785413a

  # The package's license, where possible using the mnemonic from
  # https://spdx.org/licenses/

@ -1,2 +1,2 @@
/* auto-generated, do not edit */
#define DAV1D_VERSION "0.8.1-0-g6ed5faf"
#define DAV1D_VERSION "0.8.2-0-gf06148e"

@ -29,6 +29,6 @@

#define DAV1D_API_VERSION_MAJOR 5
#define DAV1D_API_VERSION_MINOR 0
#define DAV1D_API_VERSION_PATCH 0
#define DAV1D_API_VERSION_PATCH 1

#endif /* DAV1D_VERSION_H */

@ -1,4 +1,25 @@
Changes for 0.8.1 'Eurasian hobby":
Changes for 0.8.2 'Eurasian hobby':
-----------------------------------

0.8.2 is a middle-size update of the 0.8.0 branch:
 - ARM32 optimizations for ipred and itx in 10/12bits,
   completing the 10b/12b work on ARM64 and ARM32
 - Give the post-filters their own threads
 - ARM64: rewrite the wiener functions
 - Speed up coefficient decoding, 0.5%-3% global decoding gain
 - x86 optimizations for CDEF_filter and wiener in 10/12bit
 - x86: rewrite the SGR AVX2 asm
 - x86: improve msac speed on SSE2+ machines
 - ARM32: improve speed of ipred and warp
 - ARM64: improve speed of ipred, cdef_dir, cdef_filter, warp_motion and itx16
 - ARM32/64: improve speed of looprestoration
 - Add seeking, pausing to the player
 - Update the player for rendering of 10b/12b
 - Misc speed improvements and fixes on all platforms
 - Add a xxh3 muxer in the dav1d application


Changes for 0.8.1 'Eurasian hobby':
-----------------------------------

0.8.1 is a minor update on 0.8.0:

@ -10,7 +31,7 @@ Changes for 0.8.1 'Eurasian hobby":
 - x86 optimizations for wiener in SSE2/SSSE3/AVX2


Changes for 0.8.0 'Eurasian hobby":
Changes for 0.8.0 'Eurasian hobby':
-----------------------------------

0.8.0 is a major update for dav1d:

@ -39,6 +39,11 @@
#include "dp_fifo.h"
#include "dp_renderer.h"

#define FRAME_OFFSET_TO_PTS(foff) \
    (uint64_t)(((foff) * rd_ctx->spf) * 1000000000.0 + .5)
#define TS_TO_PTS(ts) \
    (uint64_t)(((ts) * rd_ctx->timebase) * 1000000000.0 + .5)

// Selected renderer callbacks and cookie
static const Dav1dPlayRenderInfo *renderer_info = { NULL };

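For reference, a minimal standalone sketch of the conversion these two macros perform, with hypothetical spf/timebase parameters standing in for the rd_ctx fields of the same name (the example values are made up):

    #include <stdint.h>
    #include <stdio.h>

    // frame index * seconds-per-frame, scaled to nanoseconds and rounded
    static uint64_t frame_offset_to_pts(uint64_t foff, double spf) {
        return (uint64_t)(((double)foff * spf) * 1000000000.0 + .5);
    }

    // container timestamp * timebase, scaled to nanoseconds and rounded
    static uint64_t ts_to_pts(int64_t ts, double timebase) {
        return (uint64_t)(((double)ts * timebase) * 1000000000.0 + .5);
    }

    int main(void) {
        printf("%llu\n", (unsigned long long)frame_offset_to_pts(48, 1.0 / 24.0)); // ~2s in ns
        printf("%llu\n", (unsigned long long)ts_to_pts(90000, 1.0 / 90000.0));     // 1s in ns
        return 0;
    }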
@ -59,27 +64,43 @@ typedef struct render_context
|
|||
// Lock to protect access to the context structure
|
||||
SDL_mutex *lock;
|
||||
|
||||
// Timestamp of previous decoded frame
|
||||
int64_t last_pts;
|
||||
// Timestamp of current decoded frame
|
||||
int64_t current_pts;
|
||||
// Timestamp of last displayed frame (in timebase unit)
|
||||
int64_t last_ts;
|
||||
// Timestamp of last decoded frame (in timebase unit)
|
||||
int64_t current_ts;
|
||||
// Ticks when last frame was received
|
||||
uint32_t last_ticks;
|
||||
// PTS time base
|
||||
double timebase;
|
||||
// Seconds per frame
|
||||
double spf;
|
||||
// Number of frames
|
||||
uint32_t total;
|
||||
|
||||
// Fifo
|
||||
Dav1dPlayPtrFifo *fifo;
|
||||
|
||||
// Custom SDL2 event type
|
||||
uint32_t renderer_event_type;
|
||||
// Custom SDL2 event types
|
||||
uint32_t event_types;
|
||||
|
||||
// User pause state
|
||||
uint8_t user_paused;
|
||||
// Internal pause state
|
||||
uint8_t paused;
|
||||
// Start of internal pause state
|
||||
uint32_t pause_start;
|
||||
// Duration of internal pause state
|
||||
uint32_t pause_time;
|
||||
|
||||
// Seek accumulator
|
||||
int seek;
|
||||
|
||||
// Indicates if termination of the decoder thread was requested
|
||||
uint8_t dec_should_terminate;
|
||||
} Dav1dPlayRenderContext;
|
||||
|
||||
static void dp_settings_print_usage(const char *const app,
|
||||
const char *const reason, ...)
|
||||
const char *const reason, ...)
|
||||
{
|
||||
if (reason) {
|
||||
va_list args;
|
||||
|
@ -95,6 +116,7 @@ static void dp_settings_print_usage(const char *const app,
|
|||
" --untimed/-u: ignore PTS, render as fast as possible\n"
|
||||
" --framethreads $num: number of frame threads (default: 1)\n"
|
||||
" --tilethreads $num: number of tile threads (default: 1)\n"
|
||||
" --pfthreads $num: number of postfilter threads(default: 1)\n"
|
||||
" --highquality: enable high quality rendering\n"
|
||||
" --zerocopy/-z: enable zero copy upload path\n"
|
||||
" --gpugrain/-g: enable GPU grain synthesis\n"
|
||||
|
@ -115,7 +137,7 @@ static unsigned parse_unsigned(const char *const optarg, const int option,
|
|||
}
|
||||
|
||||
static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
|
||||
const int argc, char *const *const argv)
|
||||
const int argc, char *const *const argv)
|
||||
{
|
||||
int o;
|
||||
Dav1dPlaySettings *settings = &rd_ctx->settings;
|
||||
|
@ -127,6 +149,7 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
|
|||
enum {
|
||||
ARG_FRAME_THREADS = 256,
|
||||
ARG_TILE_THREADS,
|
||||
ARG_POSTFILTER_THREADS,
|
||||
ARG_HIGH_QUALITY,
|
||||
};
|
||||
|
||||
|
@ -137,6 +160,7 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
|
|||
{ "untimed", 0, NULL, 'u' },
|
||||
{ "framethreads", 1, NULL, ARG_FRAME_THREADS },
|
||||
{ "tilethreads", 1, NULL, ARG_TILE_THREADS },
|
||||
{ "pfthreads", 1, NULL, ARG_POSTFILTER_THREADS },
|
||||
{ "highquality", 0, NULL, ARG_HIGH_QUALITY },
|
||||
{ "zerocopy", 0, NULL, 'z' },
|
||||
{ "gpugrain", 0, NULL, 'g' },
|
||||
|
@ -175,6 +199,10 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
|
|||
lib_settings->n_tile_threads =
|
||||
parse_unsigned(optarg, ARG_TILE_THREADS, argv[0]);
|
||||
break;
|
||||
case ARG_POSTFILTER_THREADS:
|
||||
lib_settings->n_postfilter_threads =
|
||||
parse_unsigned(optarg, ARG_POSTFILTER_THREADS, argv[0]);
|
||||
break;
|
||||
default:
|
||||
dp_settings_print_usage(argv[0], NULL);
|
||||
}
|
||||
|
@ -213,16 +241,16 @@ static Dav1dPlayRenderContext *dp_rd_ctx_create(int argc, char **argv)
|
|||
Dav1dPlayRenderContext *rd_ctx;
|
||||
|
||||
// Alloc
|
||||
rd_ctx = malloc(sizeof(Dav1dPlayRenderContext));
|
||||
rd_ctx = calloc(1, sizeof(Dav1dPlayRenderContext));
|
||||
if (rd_ctx == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Register a custom event to notify our SDL main thread
|
||||
// about new frames
|
||||
rd_ctx->renderer_event_type = SDL_RegisterEvents(1);
|
||||
if (rd_ctx->renderer_event_type == UINT32_MAX) {
|
||||
fprintf(stderr, "Failure to create custom SDL event type!\n");
|
||||
rd_ctx->event_types = SDL_RegisterEvents(3);
|
||||
if (rd_ctx->event_types == UINT32_MAX) {
|
||||
fprintf(stderr, "Failure to create custom SDL event types!\n");
|
||||
free(rd_ctx);
|
||||
return NULL;
|
||||
}
|
||||
|
@ -265,24 +293,17 @@ static Dav1dPlayRenderContext *dp_rd_ctx_create(int argc, char **argv)
|
|||
return NULL;
|
||||
}
|
||||
|
||||
rd_ctx->last_pts = 0;
|
||||
rd_ctx->last_ticks = 0;
|
||||
rd_ctx->current_pts = 0;
|
||||
rd_ctx->timebase = 0;
|
||||
rd_ctx->dec_should_terminate = 0;
|
||||
|
||||
return rd_ctx;
|
||||
}
|
||||
|
||||
/**
|
||||
* Notify about new available frame
|
||||
* Notify about new event
|
||||
*/
|
||||
static void dp_rd_ctx_post_event(Dav1dPlayRenderContext *rd_ctx, uint32_t code)
|
||||
static void dp_rd_ctx_post_event(Dav1dPlayRenderContext *rd_ctx, uint32_t type)
|
||||
{
|
||||
SDL_Event event;
|
||||
SDL_zero(event);
|
||||
event.type = rd_ctx->renderer_event_type;
|
||||
event.user.code = code;
|
||||
event.type = type;
|
||||
SDL_PushEvent(&event);
|
||||
}
|
||||
|
||||
|
@ -294,10 +315,137 @@ static void dp_rd_ctx_post_event(Dav1dPlayRenderContext *rd_ctx, uint32_t code)
|
|||
* new picture.
|
||||
*/
|
||||
static void dp_rd_ctx_update_with_dav1d_picture(Dav1dPlayRenderContext *rd_ctx,
|
||||
Dav1dPicture *dav1d_pic)
|
||||
Dav1dPicture *dav1d_pic)
|
||||
{
|
||||
rd_ctx->current_ts = dav1d_pic->m.timestamp;
|
||||
renderer_info->update_frame(rd_ctx->rd_priv, dav1d_pic, &rd_ctx->settings);
|
||||
rd_ctx->current_pts = dav1d_pic->m.timestamp;
|
||||
}
|
||||
|
||||
/**
|
||||
* Toggle pause state
|
||||
*/
|
||||
static void dp_rd_ctx_toggle_pause(Dav1dPlayRenderContext *rd_ctx)
|
||||
{
|
||||
SDL_LockMutex(rd_ctx->lock);
|
||||
rd_ctx->user_paused = !rd_ctx->user_paused;
|
||||
if (rd_ctx->seek)
|
||||
goto out;
|
||||
rd_ctx->paused = rd_ctx->user_paused;
|
||||
uint32_t now = SDL_GetTicks();
|
||||
if (rd_ctx->paused)
|
||||
rd_ctx->pause_start = now;
|
||||
else {
|
||||
rd_ctx->pause_time += now - rd_ctx->pause_start;
|
||||
rd_ctx->pause_start = 0;
|
||||
rd_ctx->last_ticks = now;
|
||||
}
|
||||
out:
|
||||
SDL_UnlockMutex(rd_ctx->lock);
|
||||
}
|
||||
|
||||
/**
|
||||
* Query pause state
|
||||
*/
|
||||
static int dp_rd_ctx_is_paused(Dav1dPlayRenderContext *rd_ctx)
|
||||
{
|
||||
int ret;
|
||||
SDL_LockMutex(rd_ctx->lock);
|
||||
ret = rd_ctx->paused;
|
||||
SDL_UnlockMutex(rd_ctx->lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* Request seeking, in seconds
|
||||
*/
|
||||
static void dp_rd_ctx_seek(Dav1dPlayRenderContext *rd_ctx, int sec)
|
||||
{
|
||||
SDL_LockMutex(rd_ctx->lock);
|
||||
rd_ctx->seek += sec;
|
||||
if (!rd_ctx->paused)
|
||||
rd_ctx->pause_start = SDL_GetTicks();
|
||||
rd_ctx->paused = 1;
|
||||
SDL_UnlockMutex(rd_ctx->lock);
|
||||
}
|
||||
|
||||
static int decode_frame(Dav1dPicture **p, Dav1dContext *c,
|
||||
Dav1dData *data, DemuxerContext *in_ctx);
|
||||
static inline void destroy_pic(void *a);
|
||||
|
||||
/**
|
||||
* Seek the stream, if requested
|
||||
*/
|
||||
static int dp_rd_ctx_handle_seek(Dav1dPlayRenderContext *rd_ctx,
|
||||
DemuxerContext *in_ctx,
|
||||
Dav1dContext *c, Dav1dData *data)
|
||||
{
|
||||
int res = 0;
|
||||
SDL_LockMutex(rd_ctx->lock);
|
||||
if (!rd_ctx->seek)
|
||||
goto out;
|
||||
int64_t seek = rd_ctx->seek * 1000000000ULL;
|
||||
uint64_t pts = TS_TO_PTS(rd_ctx->current_ts);
|
||||
pts = ((int64_t)pts > -seek) ? pts + seek : 0;
|
||||
int end = pts >= FRAME_OFFSET_TO_PTS(rd_ctx->total);
|
||||
if (end)
|
||||
pts = FRAME_OFFSET_TO_PTS(rd_ctx->total - 1);
|
||||
uint64_t target_pts = pts;
|
||||
dav1d_flush(c);
|
||||
uint64_t shift = FRAME_OFFSET_TO_PTS(5);
|
||||
while (1) {
|
||||
if (shift > pts)
|
||||
shift = pts;
|
||||
if ((res = input_seek(in_ctx, pts - shift)))
|
||||
goto out;
|
||||
Dav1dSequenceHeader seq;
|
||||
uint64_t cur_pts;
|
||||
do {
|
||||
if ((res = input_read(in_ctx, data)))
|
||||
break;
|
||||
cur_pts = TS_TO_PTS(data->m.timestamp);
|
||||
res = dav1d_parse_sequence_header(&seq, data->data, data->sz);
|
||||
} while (res && cur_pts < pts);
|
||||
if (!res && cur_pts <= pts)
|
||||
break;
|
||||
if (shift > pts)
|
||||
shift = pts;
|
||||
pts -= shift;
|
||||
}
|
||||
if (!res) {
|
||||
pts = TS_TO_PTS(data->m.timestamp);
|
||||
while (pts < target_pts) {
|
||||
Dav1dPicture *p;
|
||||
if ((res = decode_frame(&p, c, data, in_ctx)))
|
||||
break;
|
||||
if (p) {
|
||||
pts = TS_TO_PTS(p->m.timestamp);
|
||||
if (pts < target_pts)
|
||||
destroy_pic(p);
|
||||
else {
|
||||
dp_fifo_push(rd_ctx->fifo, p);
|
||||
uint32_t type = rd_ctx->event_types + DAV1D_EVENT_SEEK_FRAME;
|
||||
dp_rd_ctx_post_event(rd_ctx, type);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!res) {
|
||||
rd_ctx->last_ts = data->m.timestamp - rd_ctx->spf / rd_ctx->timebase;
|
||||
rd_ctx->current_ts = data->m.timestamp;
|
||||
}
|
||||
}
|
||||
out:
|
||||
rd_ctx->paused = rd_ctx->user_paused;
|
||||
if (!rd_ctx->paused && rd_ctx->seek) {
|
||||
uint32_t now = SDL_GetTicks();
|
||||
rd_ctx->pause_time += now - rd_ctx->pause_start;
|
||||
rd_ctx->pause_start = 0;
|
||||
rd_ctx->last_ticks = now;
|
||||
}
|
||||
rd_ctx->seek = 0;
|
||||
SDL_UnlockMutex(rd_ctx->lock);
|
||||
if (res)
|
||||
fprintf(stderr, "Error seeking, aborting\n");
|
||||
return res;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -329,14 +477,15 @@ static int dp_rd_ctx_should_terminate(Dav1dPlayRenderContext *rd_ctx)
|
|||
*/
|
||||
static void dp_rd_ctx_render(Dav1dPlayRenderContext *rd_ctx)
|
||||
{
|
||||
SDL_LockMutex(rd_ctx->lock);
|
||||
// Calculate time since last frame was received
|
||||
uint32_t ticks_now = SDL_GetTicks();
|
||||
uint32_t ticks_diff = (rd_ctx->last_ticks != 0) ? ticks_now - rd_ctx->last_ticks : 0;
|
||||
|
||||
// Calculate when to display the frame
|
||||
int64_t pts_diff = rd_ctx->current_pts - rd_ctx->last_pts;
|
||||
int32_t wait_time = (pts_diff * rd_ctx->timebase) * 1000 - ticks_diff;
|
||||
rd_ctx->last_pts = rd_ctx->current_pts;
|
||||
int64_t ts_diff = rd_ctx->current_ts - rd_ctx->last_ts;
|
||||
int32_t pts_diff = (ts_diff * rd_ctx->timebase) * 1000.0 + .5;
|
||||
int32_t wait_time = pts_diff - ticks_diff;
|
||||
|
||||
// In untimed mode, simply don't wait
|
||||
if (rd_ctx->settings.untimed)
|
||||
|
@ -347,13 +496,59 @@ static void dp_rd_ctx_render(Dav1dPlayRenderContext *rd_ctx)
|
|||
// accurate player this would need to be done in a better way.
|
||||
if (wait_time > 0) {
|
||||
SDL_Delay(wait_time);
|
||||
} else if (wait_time < -10) { // Do not warn for minor time drifts
|
||||
fprintf(stderr, "Frame displayed %f seconds too late\n", wait_time/(float)1000);
|
||||
} else if (wait_time < -10 && !rd_ctx->paused) { // Do not warn for minor time drifts
|
||||
fprintf(stderr, "Frame displayed %f seconds too late\n", wait_time / 1000.0);
|
||||
}
|
||||
|
||||
renderer_info->render(rd_ctx->rd_priv, &rd_ctx->settings);
|
||||
|
||||
rd_ctx->last_ts = rd_ctx->current_ts;
|
||||
rd_ctx->last_ticks = SDL_GetTicks();
|
||||
|
||||
SDL_UnlockMutex(rd_ctx->lock);
|
||||
}
|
||||
|
||||
static int decode_frame(Dav1dPicture **p, Dav1dContext *c,
|
||||
Dav1dData *data, DemuxerContext *in_ctx)
|
||||
{
|
||||
int res;
|
||||
// Send data packets we got from the demuxer to dav1d
|
||||
if ((res = dav1d_send_data(c, data)) < 0) {
|
||||
// On EAGAIN, dav1d can not consume more data and
|
||||
// dav1d_get_picture needs to be called first, which
|
||||
// will happen below, so just keep going in that case
|
||||
// and do not error out.
|
||||
if (res != DAV1D_ERR(EAGAIN)) {
|
||||
dav1d_data_unref(data);
|
||||
goto err;
|
||||
}
|
||||
}
|
||||
*p = calloc(1, sizeof(**p));
|
||||
// Try to get a decoded frame
|
||||
if ((res = dav1d_get_picture(c, *p)) < 0) {
|
||||
// In all error cases, even EAGAIN, p needs to be freed as
|
||||
// it is never added to the queue and would leak.
|
||||
free(*p);
|
||||
*p = NULL;
|
||||
// On EAGAIN, it means dav1d has not enough data to decode
|
||||
// therefore this is not a decoding error but just means
|
||||
// we need to feed it more data, which happens in the next
|
||||
// run of the decoder loop.
|
||||
if (res != DAV1D_ERR(EAGAIN))
|
||||
goto err;
|
||||
}
|
||||
return data->sz == 0 ? input_read(in_ctx, data) : 0;
|
||||
err:
|
||||
fprintf(stderr, "Error decoding frame: %s\n",
|
||||
strerror(-res));
|
||||
return res;
|
||||
}
|
||||
|
||||
static inline void destroy_pic(void *a)
|
||||
{
|
||||
Dav1dPicture *p = (Dav1dPicture *)a;
|
||||
dav1d_picture_unref(p);
|
||||
free(p);
|
||||
}
|
||||
|
||||
/* Decoder thread "main" function */
|
||||
|
@ -366,10 +561,7 @@ static int decoder_thread_main(void *cookie)
|
|||
Dav1dData data;
|
||||
DemuxerContext *in_ctx = NULL;
|
||||
int res = 0;
|
||||
unsigned n_out = 0, total, timebase[2], fps[2];
|
||||
|
||||
// Store current ticks for stats calculation
|
||||
uint32_t decoder_start = SDL_GetTicks();
|
||||
unsigned total, timebase[2], fps[2];
|
||||
|
||||
Dav1dPlaySettings settings = rd_ctx->settings;
|
||||
|
||||
|
@ -382,8 +574,9 @@ static int decoder_thread_main(void *cookie)
|
|||
goto cleanup;
|
||||
}
|
||||
|
||||
double timebase_d = timebase[1]/(double)timebase[0];
|
||||
rd_ctx->timebase = timebase_d;
|
||||
rd_ctx->timebase = (double)timebase[1] / timebase[0];
|
||||
rd_ctx->spf = (double)fps[1] / fps[0];
|
||||
rd_ctx->total = total;
|
||||
|
||||
if ((res = dav1d_open(&c, &rd_ctx->lib_settings))) {
|
||||
fprintf(stderr, "Failed opening dav1d decoder\n");
|
||||
|
@ -398,55 +591,29 @@ static int decoder_thread_main(void *cookie)
|
|||
}
|
||||
|
||||
// Decoder loop
|
||||
do {
|
||||
if (dp_rd_ctx_should_terminate(rd_ctx))
|
||||
while (1) {
|
||||
if (dp_rd_ctx_should_terminate(rd_ctx) ||
|
||||
(res = dp_rd_ctx_handle_seek(rd_ctx, in_ctx, c, &data)) ||
|
||||
(res = decode_frame(&p, c, &data, in_ctx)))
|
||||
{
|
||||
break;
|
||||
|
||||
// Send data packets we got from the demuxer to dav1d
|
||||
if ((res = dav1d_send_data(c, &data)) < 0) {
|
||||
// On EAGAIN, dav1d can not consume more data and
|
||||
// dav1d_get_picture needs to be called first, which
|
||||
// will happen below, so just keep going in that case
|
||||
// and do not error out.
|
||||
if (res != DAV1D_ERR(EAGAIN)) {
|
||||
dav1d_data_unref(&data);
|
||||
fprintf(stderr, "Error decoding frame: %s\n",
|
||||
strerror(-res));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
p = calloc(1, sizeof(*p));
|
||||
|
||||
// Try to get a decoded frame
|
||||
if ((res = dav1d_get_picture(c, p)) < 0) {
|
||||
// In all error cases, even EAGAIN, p needs to be freed as
|
||||
// it is never added to the queue and would leak.
|
||||
free(p);
|
||||
|
||||
// On EAGAIN, it means dav1d has not enough data to decode
|
||||
// therefore this is not a decoding error but just means
|
||||
// we need to feed it more data, which happens in the next
|
||||
// run of this decoder loop.
|
||||
if (res != DAV1D_ERR(EAGAIN)) {
|
||||
fprintf(stderr, "Error decoding frame: %s\n",
|
||||
strerror(-res));
|
||||
break;
|
||||
}
|
||||
res = 0;
|
||||
} else {
|
||||
|
||||
else if (p) {
|
||||
// Queue frame
|
||||
dp_fifo_push(rd_ctx->fifo, p);
|
||||
dp_rd_ctx_post_event(rd_ctx, DAV1D_EVENT_NEW_FRAME);
|
||||
|
||||
n_out++;
|
||||
SDL_LockMutex(rd_ctx->lock);
|
||||
int seek = rd_ctx->seek;
|
||||
SDL_UnlockMutex(rd_ctx->lock);
|
||||
if (!seek) {
|
||||
dp_fifo_push(rd_ctx->fifo, p);
|
||||
uint32_t type = rd_ctx->event_types + DAV1D_EVENT_NEW_FRAME;
|
||||
dp_rd_ctx_post_event(rd_ctx, type);
|
||||
}
|
||||
}
|
||||
} while ((data.sz > 0 || !input_read(in_ctx, &data)));
|
||||
}
|
||||
|
||||
// Release remaining data
|
||||
if (data.sz > 0) dav1d_data_unref(&data);
|
||||
|
||||
if (data.sz > 0)
|
||||
dav1d_data_unref(&data);
|
||||
// Do not drain in case an error occured and caused us to leave the
|
||||
// decoding loop early.
|
||||
if (res < 0)
|
||||
|
@ -461,7 +628,6 @@ static int decoder_thread_main(void *cookie)
|
|||
do {
|
||||
if (dp_rd_ctx_should_terminate(rd_ctx))
|
||||
break;
|
||||
|
||||
p = calloc(1, sizeof(*p));
|
||||
res = dav1d_get_picture(c, p);
|
||||
if (res < 0) {
|
||||
|
@ -474,19 +640,13 @@ static int decoder_thread_main(void *cookie)
|
|||
} else {
|
||||
// Queue frame
|
||||
dp_fifo_push(rd_ctx->fifo, p);
|
||||
dp_rd_ctx_post_event(rd_ctx, DAV1D_EVENT_NEW_FRAME);
|
||||
|
||||
n_out++;
|
||||
uint32_t type = rd_ctx->event_types + DAV1D_EVENT_NEW_FRAME;
|
||||
dp_rd_ctx_post_event(rd_ctx, type);
|
||||
}
|
||||
} while (res != DAV1D_ERR(EAGAIN));
|
||||
|
||||
// Print stats
|
||||
uint32_t decoding_time_ms = SDL_GetTicks() - decoder_start;
|
||||
printf("Decoded %u frames in %d seconds, avg %.02f fps\n",
|
||||
n_out, decoding_time_ms/1000, n_out / (decoding_time_ms / 1000.0));
|
||||
|
||||
cleanup:
|
||||
dp_rd_ctx_post_event(rd_ctx, DAV1D_EVENT_DEC_QUIT);
|
||||
dp_rd_ctx_post_event(rd_ctx, rd_ctx->event_types + DAV1D_EVENT_DEC_QUIT);
|
||||
|
||||
if (in_ctx)
|
||||
input_close(in_ctx);
|
||||
|
@ -543,41 +703,84 @@ int main(int argc, char **argv)
|
|||
decoder_thread = SDL_CreateThread(decoder_thread_main, "Decoder thread", rd_ctx);
|
||||
|
||||
// Main loop
|
||||
#define NUM_MAX_EVENTS 8
|
||||
SDL_Event events[NUM_MAX_EVENTS];
|
||||
int num_frame_events = 0;
|
||||
uint32_t start_time = 0, n_out = 0;
|
||||
while (1) {
|
||||
|
||||
SDL_Event e;
|
||||
if (SDL_WaitEvent(&e)) {
|
||||
if (e.type == SDL_QUIT) {
|
||||
int num_events = 0;
|
||||
SDL_WaitEvent(NULL);
|
||||
while (num_events < NUM_MAX_EVENTS && SDL_PollEvent(&events[num_events++]))
|
||||
break;
|
||||
for (int i = 0; i < num_events; ++i) {
|
||||
SDL_Event *e = &events[i];
|
||||
if (e->type == SDL_QUIT) {
|
||||
dp_rd_ctx_request_shutdown(rd_ctx);
|
||||
} else if (e.type == SDL_WINDOWEVENT) {
|
||||
if (e.window.event == SDL_WINDOWEVENT_SIZE_CHANGED) {
|
||||
dp_fifo_flush(rd_ctx->fifo, destroy_pic);
|
||||
SDL_FlushEvent(rd_ctx->event_types + DAV1D_EVENT_NEW_FRAME);
|
||||
SDL_FlushEvent(rd_ctx->event_types + DAV1D_EVENT_SEEK_FRAME);
|
||||
num_frame_events = 0;
|
||||
} else if (e->type == SDL_WINDOWEVENT) {
|
||||
if (e->window.event == SDL_WINDOWEVENT_SIZE_CHANGED) {
|
||||
// TODO: Handle window resizes
|
||||
} else if(e->window.event == SDL_WINDOWEVENT_EXPOSED) {
|
||||
dp_rd_ctx_render(rd_ctx);
|
||||
}
|
||||
} else if (e.type == rd_ctx->renderer_event_type) {
|
||||
if (e.user.code == DAV1D_EVENT_NEW_FRAME) {
|
||||
// Dequeue frame and update the render context with it
|
||||
Dav1dPicture *p = dp_fifo_shift(rd_ctx->fifo);
|
||||
|
||||
// Do not update textures during termination
|
||||
if (!dp_rd_ctx_should_terminate(rd_ctx))
|
||||
dp_rd_ctx_update_with_dav1d_picture(rd_ctx, p);
|
||||
dav1d_picture_unref(p);
|
||||
free(p);
|
||||
} else if (e.user.code == DAV1D_EVENT_DEC_QUIT) {
|
||||
break;
|
||||
} else if (e->type == SDL_KEYDOWN) {
|
||||
SDL_KeyboardEvent *kbde = (SDL_KeyboardEvent *)e;
|
||||
if (kbde->keysym.sym == SDLK_SPACE) {
|
||||
dp_rd_ctx_toggle_pause(rd_ctx);
|
||||
} else if (kbde->keysym.sym == SDLK_LEFT ||
|
||||
kbde->keysym.sym == SDLK_RIGHT)
|
||||
{
|
||||
if (kbde->keysym.sym == SDLK_LEFT)
|
||||
dp_rd_ctx_seek(rd_ctx, -5);
|
||||
else if (kbde->keysym.sym == SDLK_RIGHT)
|
||||
dp_rd_ctx_seek(rd_ctx, +5);
|
||||
dp_fifo_flush(rd_ctx->fifo, destroy_pic);
|
||||
SDL_FlushEvent(rd_ctx->event_types + DAV1D_EVENT_NEW_FRAME);
|
||||
num_frame_events = 0;
|
||||
}
|
||||
} else if (e->type == rd_ctx->event_types + DAV1D_EVENT_NEW_FRAME) {
|
||||
num_frame_events++;
|
||||
// Store current ticks for stats calculation
|
||||
if (start_time == 0)
|
||||
start_time = SDL_GetTicks();
|
||||
} else if (e->type == rd_ctx->event_types + DAV1D_EVENT_SEEK_FRAME) {
|
||||
// Dequeue frame and update the render context with it
|
||||
Dav1dPicture *p = dp_fifo_shift(rd_ctx->fifo);
|
||||
// Do not update textures during termination
|
||||
if (!dp_rd_ctx_should_terminate(rd_ctx)) {
|
||||
dp_rd_ctx_update_with_dav1d_picture(rd_ctx, p);
|
||||
n_out++;
|
||||
}
|
||||
destroy_pic(p);
|
||||
} else if (e->type == rd_ctx->event_types + DAV1D_EVENT_DEC_QUIT) {
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
// Do not render during termination
|
||||
if (!dp_rd_ctx_should_terminate(rd_ctx))
|
||||
dp_rd_ctx_render(rd_ctx);
|
||||
if (num_frame_events && !dp_rd_ctx_is_paused(rd_ctx)) {
|
||||
// Dequeue frame and update the render context with it
|
||||
Dav1dPicture *p = dp_fifo_shift(rd_ctx->fifo);
|
||||
// Do not update textures during termination
|
||||
if (!dp_rd_ctx_should_terminate(rd_ctx)) {
|
||||
dp_rd_ctx_update_with_dav1d_picture(rd_ctx, p);
|
||||
dp_rd_ctx_render(rd_ctx);
|
||||
n_out++;
|
||||
}
|
||||
destroy_pic(p);
|
||||
num_frame_events--;
|
||||
}
|
||||
}
|
||||
|
||||
out:;
|
||||
// Print stats
|
||||
uint32_t time_ms = SDL_GetTicks() - start_time - rd_ctx->pause_time;
|
||||
printf("Decoded %u frames in %d seconds, avg %.02f fps\n",
|
||||
n_out, time_ms / 1000, n_out/ (time_ms / 1000.0));
|
||||
|
||||
int decoder_ret = 0;
|
||||
SDL_WaitThread(decoder_thread, &decoder_ret);
|
||||
|
||||
dp_rd_ctx_destroy(rd_ctx);
|
||||
|
||||
return decoder_ret;
|
||||
}
|
||||
|
|
|
@ -37,6 +37,8 @@ struct dp_fifo
    size_t capacity;
    size_t count;
    void **entries;
    int push_wait;
    int flush;
};


@ -54,6 +56,8 @@ Dav1dPlayPtrFifo *dp_fifo_create(size_t capacity)

    fifo->capacity = capacity;
    fifo->count = 0;
    fifo->push_wait = 0;
    fifo->flush = 0;

    fifo->lock = SDL_CreateMutex();
    if (fifo->lock == NULL) {

@ -90,8 +94,16 @@ void dp_fifo_destroy(Dav1dPlayPtrFifo *fifo)
void dp_fifo_push(Dav1dPlayPtrFifo *fifo, void *element)
{
    SDL_LockMutex(fifo->lock);
    while (fifo->count == fifo->capacity)
    while (fifo->count == fifo->capacity) {
        fifo->push_wait = 1;
        SDL_CondWait(fifo->cond_change, fifo->lock);
        fifo->push_wait = 0;
        if (fifo->flush) {
            SDL_CondSignal(fifo->cond_change);
            SDL_UnlockMutex(fifo->lock);
            return;
        }
    }
    fifo->entries[fifo->count++] = element;
    if (fifo->count == 1)
        SDL_CondSignal(fifo->cond_change);

@ -120,4 +132,16 @@ void *dp_fifo_shift(Dav1dPlayPtrFifo *fifo)
    return res;
}


void dp_fifo_flush(Dav1dPlayPtrFifo *fifo, void (*destroy_elem)(void *))
{
    SDL_LockMutex(fifo->lock);
    fifo->flush = 1;
    if (fifo->push_wait) {
        SDL_CondSignal(fifo->cond_change);
        SDL_CondWait(fifo->cond_change, fifo->lock);
    }
    while (fifo->count)
        destroy_elem(fifo->entries[--fifo->count]);
    fifo->flush = 0;
    SDL_UnlockMutex(fifo->lock);
}

@ -59,3 +59,5 @@ void *dp_fifo_shift(Dav1dPlayPtrFifo *fifo);
 * other thread will call dp_fifo_shift will lead to a deadlock.
 */
void dp_fifo_push(Dav1dPlayPtrFifo *fifo, void *element);

void dp_fifo_flush(Dav1dPlayPtrFifo *fifo, void (*destroy_elem)(void *));

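A minimal usage sketch of the new dp_fifo_flush entry point, under the assumption that every queued element was heap-allocated by the producer (the drop_pending helper and the free-based callback are illustrative, not part of the patch):

    #include <stdlib.h>
    #include "dp_fifo.h"

    // Called for each element still sitting in the FIFO when it is flushed.
    static void destroy_elem(void *element) {
        free(element);
    }

    // Empties the queue: wakes a producer blocked in dp_fifo_push (which then
    // returns without queueing), destroys the queued elements, resets the flag.
    static void drop_pending(Dav1dPlayPtrFifo *fifo) {
        dp_fifo_flush(fifo, destroy_elem);
    }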
@ -66,8 +66,11 @@ typedef struct {
#define WINDOW_WIDTH 910
#define WINDOW_HEIGHT 512

#define DAV1D_EVENT_NEW_FRAME 1
#define DAV1D_EVENT_DEC_QUIT 2
enum {
    DAV1D_EVENT_NEW_FRAME,
    DAV1D_EVENT_SEEK_FRAME,
    DAV1D_EVENT_DEC_QUIT
};

/**
 * Renderer info

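With the enum, the player registers one SDL event type per value and posts or filters them by offset from a base returned by SDL_RegisterEvents; a small sketch of that pattern (event_types mirrors the field added to the render context, the helper names are made up):

    #include <SDL.h>

    static uint32_t event_types;   // base value for the custom event types

    static int init_events(void) {
        event_types = SDL_RegisterEvents(3);       // reserves 3 consecutive types
        return event_types == UINT32_MAX ? -1 : 0;
    }

    static void post_event(int which) {            // which = DAV1D_EVENT_*
        SDL_Event event;
        SDL_zero(event);
        event.type = event_types + which;          // e.g. base + DAV1D_EVENT_NEW_FRAME
        SDL_PushEvent(&event);
    }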
@ -84,7 +87,7 @@ typedef struct rdr_info
    void (*destroy_renderer)(void *cookie);
    // Callback to the render function that renders a prevously sent frame
    void (*render)(void *cookie, const Dav1dPlaySettings *settings);
    // Callback to the send frame function
    // Callback to the send frame function, _may_ also unref dav1d_pic!
    int (*update_frame)(void *cookie, Dav1dPicture *dav1d_pic,
                        const Dav1dPlaySettings *settings);
    // Callback for alloc/release pictures (optional)

@ -30,7 +30,7 @@
|
|||
#include <assert.h>
|
||||
|
||||
#include <libplacebo/renderer.h>
|
||||
#include <libplacebo/utils/upload.h>
|
||||
#include <libplacebo/utils/dav1d.h>
|
||||
|
||||
#ifdef HAVE_PLACEBO_VULKAN
|
||||
# include <libplacebo/vulkan.h>
|
||||
|
@ -72,7 +72,7 @@ typedef struct renderer_priv_ctx
|
|||
// Lock protecting access to the texture
|
||||
SDL_mutex *lock;
|
||||
// Image to render, and planes backing them
|
||||
struct pl_image image;
|
||||
struct pl_frame image;
|
||||
const struct pl_tex *plane_tex[3];
|
||||
} Dav1dPlayRendererPrivateContext;
|
||||
|
||||
|
@ -319,22 +319,15 @@ static void placebo_render(void *cookie, const Dav1dPlaySettings *settings)
|
|||
if (settings->highquality)
|
||||
render_params = pl_render_default_params;
|
||||
|
||||
struct pl_render_target target;
|
||||
pl_render_target_from_swapchain(&target, &frame);
|
||||
target.profile = (struct pl_icc_profile) {
|
||||
.data = NULL,
|
||||
.len = 0,
|
||||
};
|
||||
|
||||
#if PL_API_VER >= 66
|
||||
pl_rect2df_aspect_copy(&target.dst_rect, &rd_priv_ctx->image.src_rect, 0.0);
|
||||
if (pl_render_target_partial(&target))
|
||||
pl_tex_clear(rd_priv_ctx->gpu, target.fbo, (float[4]){ 0.0 });
|
||||
#endif
|
||||
struct pl_frame target;
|
||||
pl_frame_from_swapchain(&target, &frame);
|
||||
pl_rect2df_aspect_copy(&target.crop, &rd_priv_ctx->image.crop, 0.0);
|
||||
if (pl_frame_is_cropped(&target))
|
||||
pl_tex_clear(rd_priv_ctx->gpu, frame.fbo, (float[4]){ 0.0 });
|
||||
|
||||
if (!pl_render_image(rd_priv_ctx->renderer, &rd_priv_ctx->image, &target, &render_params)) {
|
||||
fprintf(stderr, "Failed rendering frame!\n");
|
||||
pl_tex_clear(rd_priv_ctx->gpu, target.fbo, (float[4]){ 1.0 });
|
||||
pl_tex_clear(rd_priv_ctx->gpu, frame.fbo, (float[4]){ 1.0 });
|
||||
}
|
||||
|
||||
ok = pl_swapchain_submit_frame(rd_priv_ctx->swapchain);
|
||||
|
@ -351,320 +344,37 @@ static void placebo_render(void *cookie, const Dav1dPlaySettings *settings)
|
|||
static int placebo_upload_image(void *cookie, Dav1dPicture *dav1d_pic,
|
||||
const Dav1dPlaySettings *settings)
|
||||
{
|
||||
Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
|
||||
assert(rd_priv_ctx != NULL);
|
||||
Dav1dPlayRendererPrivateContext *p = cookie;
|
||||
assert(p != NULL);
|
||||
int ret = 0;
|
||||
|
||||
SDL_LockMutex(rd_priv_ctx->lock);
|
||||
if (!dav1d_pic)
|
||||
return ret;
|
||||
|
||||
if (dav1d_pic == NULL) {
|
||||
SDL_UnlockMutex(rd_priv_ctx->lock);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int width = dav1d_pic->p.w;
|
||||
int height = dav1d_pic->p.h;
|
||||
int sub_x = 0, sub_y = 0;
|
||||
int bytes = (dav1d_pic->p.bpc + 7) / 8; // rounded up
|
||||
enum pl_chroma_location chroma_loc = PL_CHROMA_UNKNOWN;
|
||||
|
||||
struct pl_image *image = &rd_priv_ctx->image;
|
||||
*image = (struct pl_image) {
|
||||
.num_planes = 3,
|
||||
.width = width,
|
||||
.height = height,
|
||||
.src_rect = {0, 0, width, height},
|
||||
|
||||
.repr = {
|
||||
.bits = {
|
||||
.sample_depth = bytes * 8,
|
||||
.color_depth = dav1d_pic->p.bpc,
|
||||
},
|
||||
},
|
||||
struct pl_dav1d_upload_params params = {
|
||||
.picture = dav1d_pic,
|
||||
.film_grain = settings->gpugrain,
|
||||
.gpu_allocated = settings->zerocopy,
|
||||
.asynchronous = true,
|
||||
};
|
||||
|
||||
// Figure out the correct plane dimensions/count
|
||||
switch (dav1d_pic->p.layout) {
|
||||
case DAV1D_PIXEL_LAYOUT_I400:
|
||||
image->num_planes = 1;
|
||||
break;
|
||||
case DAV1D_PIXEL_LAYOUT_I420:
|
||||
sub_x = sub_y = 1;
|
||||
break;
|
||||
case DAV1D_PIXEL_LAYOUT_I422:
|
||||
sub_x = 1;
|
||||
break;
|
||||
case DAV1D_PIXEL_LAYOUT_I444:
|
||||
break;
|
||||
}
|
||||
|
||||
// Set the right colorspace metadata etc.
|
||||
switch (dav1d_pic->seq_hdr->pri) {
|
||||
case DAV1D_COLOR_PRI_UNKNOWN: image->color.primaries = PL_COLOR_PRIM_UNKNOWN; break;
|
||||
case DAV1D_COLOR_PRI_BT709: image->color.primaries = PL_COLOR_PRIM_BT_709; break;
|
||||
case DAV1D_COLOR_PRI_BT470M: image->color.primaries = PL_COLOR_PRIM_BT_470M; break;
|
||||
case DAV1D_COLOR_PRI_BT470BG: image->color.primaries = PL_COLOR_PRIM_BT_601_625; break;
|
||||
case DAV1D_COLOR_PRI_BT601: image->color.primaries = PL_COLOR_PRIM_BT_601_625; break;
|
||||
case DAV1D_COLOR_PRI_BT2020: image->color.primaries = PL_COLOR_PRIM_BT_2020; break;
|
||||
|
||||
case DAV1D_COLOR_PRI_XYZ:
|
||||
// Handled below
|
||||
assert(dav1d_pic->seq_hdr->mtrx == DAV1D_MC_IDENTITY);
|
||||
break;
|
||||
|
||||
default:
|
||||
printf("warning: unknown dav1d color primaries %d.. ignoring, picture "
|
||||
"may be very incorrect\n", dav1d_pic->seq_hdr->pri);
|
||||
break;
|
||||
}
|
||||
|
||||
switch (dav1d_pic->seq_hdr->trc) {
|
||||
case DAV1D_TRC_BT709:
|
||||
case DAV1D_TRC_BT470M:
|
||||
case DAV1D_TRC_BT470BG:
|
||||
case DAV1D_TRC_BT601:
|
||||
case DAV1D_TRC_SMPTE240:
|
||||
case DAV1D_TRC_BT2020_10BIT:
|
||||
case DAV1D_TRC_BT2020_12BIT:
|
||||
// These all map to the effective "SDR" CRT-based EOTF, BT.1886
|
||||
image->color.transfer = PL_COLOR_TRC_BT_1886;
|
||||
break;
|
||||
|
||||
case DAV1D_TRC_UNKNOWN: image->color.transfer = PL_COLOR_TRC_UNKNOWN; break;
|
||||
case DAV1D_TRC_LINEAR: image->color.transfer = PL_COLOR_TRC_LINEAR; break;
|
||||
case DAV1D_TRC_SRGB: image->color.transfer = PL_COLOR_TRC_SRGB; break;
|
||||
case DAV1D_TRC_SMPTE2084: image->color.transfer = PL_COLOR_TRC_PQ; break;
|
||||
case DAV1D_TRC_HLG: image->color.transfer = PL_COLOR_TRC_HLG; break;
|
||||
|
||||
default:
|
||||
printf("warning: unknown dav1d color transfer %d.. ignoring, picture "
|
||||
"may be very incorrect\n", dav1d_pic->seq_hdr->trc);
|
||||
break;
|
||||
}
|
||||
|
||||
switch (dav1d_pic->seq_hdr->mtrx) {
|
||||
case DAV1D_MC_IDENTITY:
|
||||
// This is going to be either RGB or XYZ
|
||||
if (dav1d_pic->seq_hdr->pri == DAV1D_COLOR_PRI_XYZ) {
|
||||
image->repr.sys = PL_COLOR_SYSTEM_XYZ;
|
||||
} else {
|
||||
image->repr.sys = PL_COLOR_SYSTEM_RGB;
|
||||
}
|
||||
break;
|
||||
|
||||
case DAV1D_MC_UNKNOWN:
|
||||
// PL_COLOR_SYSTEM_UNKNOWN maps to RGB, so hard-code this one
|
||||
image->repr.sys = pl_color_system_guess_ycbcr(width, height);
|
||||
break;
|
||||
|
||||
case DAV1D_MC_BT709: image->repr.sys = PL_COLOR_SYSTEM_BT_709; break;
|
||||
case DAV1D_MC_BT601: image->repr.sys = PL_COLOR_SYSTEM_BT_601; break;
|
||||
case DAV1D_MC_SMPTE240: image->repr.sys = PL_COLOR_SYSTEM_SMPTE_240M; break;
|
||||
case DAV1D_MC_SMPTE_YCGCO: image->repr.sys = PL_COLOR_SYSTEM_YCGCO; break;
|
||||
case DAV1D_MC_BT2020_NCL: image->repr.sys = PL_COLOR_SYSTEM_BT_2020_NC; break;
|
||||
case DAV1D_MC_BT2020_CL: image->repr.sys = PL_COLOR_SYSTEM_BT_2020_C; break;
|
||||
|
||||
case DAV1D_MC_ICTCP:
|
||||
// This one is split up based on the actual HDR curve in use
|
||||
if (dav1d_pic->seq_hdr->trc == DAV1D_TRC_HLG) {
|
||||
image->repr.sys = PL_COLOR_SYSTEM_BT_2100_HLG;
|
||||
} else {
|
||||
image->repr.sys = PL_COLOR_SYSTEM_BT_2100_PQ;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
printf("warning: unknown dav1d color matrix %d.. ignoring, picture "
|
||||
"may be very incorrect\n", dav1d_pic->seq_hdr->mtrx);
|
||||
break;
|
||||
}
|
||||
|
||||
if (dav1d_pic->seq_hdr->color_range) {
|
||||
image->repr.levels = PL_COLOR_LEVELS_PC;
|
||||
} else {
|
||||
image->repr.levels = PL_COLOR_LEVELS_TV;
|
||||
}
|
||||
|
||||
switch (dav1d_pic->seq_hdr->chr) {
|
||||
case DAV1D_CHR_UNKNOWN: chroma_loc = PL_CHROMA_UNKNOWN; break;
|
||||
case DAV1D_CHR_VERTICAL: chroma_loc = PL_CHROMA_LEFT; break;
|
||||
case DAV1D_CHR_COLOCATED: chroma_loc = PL_CHROMA_TOP_LEFT; break;
|
||||
}
|
||||
|
||||
#if PL_API_VER >= 63
|
||||
if (settings->gpugrain && dav1d_pic->frame_hdr->film_grain.present) {
|
||||
Dav1dFilmGrainData *src = &dav1d_pic->frame_hdr->film_grain.data;
|
||||
struct pl_av1_grain_data *dst = &image->av1_grain;
|
||||
*dst = (struct pl_av1_grain_data) {
|
||||
.grain_seed = src->seed,
|
||||
.num_points_y = src->num_y_points,
|
||||
.chroma_scaling_from_luma = src->chroma_scaling_from_luma,
|
||||
.num_points_uv = { src->num_uv_points[0], src->num_uv_points[1] },
|
||||
.scaling_shift = src->scaling_shift,
|
||||
.ar_coeff_lag = src->ar_coeff_lag,
|
||||
.ar_coeff_shift = (int)src->ar_coeff_shift,
|
||||
.grain_scale_shift = src->grain_scale_shift,
|
||||
.uv_mult = { src->uv_mult[0], src->uv_mult[1] },
|
||||
.uv_mult_luma = { src->uv_luma_mult[0], src->uv_luma_mult[1] },
|
||||
.uv_offset = { src->uv_offset[0], src->uv_offset[1] },
|
||||
.overlap = src->overlap_flag,
|
||||
};
|
||||
|
||||
assert(sizeof(dst->points_y) == sizeof(src->y_points));
|
||||
assert(sizeof(dst->points_uv) == sizeof(src->uv_points));
|
||||
assert(sizeof(dst->ar_coeffs_y) == sizeof(src->ar_coeffs_y));
|
||||
memcpy(dst->points_y, src->y_points, sizeof(src->y_points));
|
||||
memcpy(dst->points_uv, src->uv_points, sizeof(src->uv_points));
|
||||
memcpy(dst->ar_coeffs_y, src->ar_coeffs_y, sizeof(src->ar_coeffs_y));
|
||||
|
||||
// this one has different row sizes for alignment
|
||||
for (int c = 0; c < 2; c++) {
|
||||
for (int i = 0; i < 25; i++)
|
||||
dst->ar_coeffs_uv[c][i] = src->ar_coeffs_uv[c][i];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// Upload the actual planes
|
||||
struct pl_plane_data data[3] = {
|
||||
{
|
||||
// Y plane
|
||||
.type = PL_FMT_UNORM,
|
||||
.width = width,
|
||||
.height = height,
|
||||
.pixel_stride = bytes,
|
||||
.row_stride = dav1d_pic->stride[0],
|
||||
.component_size = {bytes * 8},
|
||||
.component_map = {0},
|
||||
}, {
|
||||
// U plane
|
||||
.type = PL_FMT_UNORM,
|
||||
.width = width >> sub_x,
|
||||
.height = height >> sub_y,
|
||||
.pixel_stride = bytes,
|
||||
.row_stride = dav1d_pic->stride[1],
|
||||
.component_size = {bytes * 8},
|
||||
.component_map = {1},
|
||||
}, {
|
||||
// V plane
|
||||
.type = PL_FMT_UNORM,
|
||||
.width = width >> sub_x,
|
||||
.height = height >> sub_y,
|
||||
.pixel_stride = bytes,
|
||||
.row_stride = dav1d_pic->stride[1],
|
||||
.component_size = {bytes * 8},
|
||||
.component_map = {2},
|
||||
},
|
||||
};
|
||||
|
||||
bool ok = true;
|
||||
|
||||
for (int i = 0; i < image->num_planes; i++) {
|
||||
if (settings->zerocopy) {
|
||||
const struct pl_buf *buf = dav1d_pic->allocator_data;
|
||||
assert(buf);
|
||||
data[i].buf = buf;
|
||||
data[i].buf_offset = (uintptr_t) dav1d_pic->data[i] - (uintptr_t) buf->data;
|
||||
} else {
|
||||
data[i].pixels = dav1d_pic->data[i];
|
||||
}
|
||||
|
||||
ok &= pl_upload_plane(rd_priv_ctx->gpu, &image->planes[i], &rd_priv_ctx->plane_tex[i], &data[i]);
|
||||
}
|
||||
|
||||
// Apply the correct chroma plane shift. This has to be done after pl_upload_plane
|
||||
#if PL_API_VER >= 67
|
||||
pl_image_set_chroma_location(image, chroma_loc);
|
||||
#else
|
||||
pl_chroma_location_offset(chroma_loc, &image->planes[1].shift_x, &image->planes[1].shift_y);
|
||||
pl_chroma_location_offset(chroma_loc, &image->planes[2].shift_x, &image->planes[2].shift_y);
|
||||
#endif
|
||||
|
||||
if (!ok) {
|
||||
SDL_LockMutex(p->lock);
|
||||
if (!pl_upload_dav1dpicture(p->gpu, &p->image, p->plane_tex, ¶ms)) {
|
||||
fprintf(stderr, "Failed uploading planes!\n");
|
||||
*image = (struct pl_image) {0};
|
||||
p->image = (struct pl_frame) {0};
|
||||
ret = -1;
|
||||
}
|
||||
|
||||
SDL_UnlockMutex(rd_priv_ctx->lock);
|
||||
return !ok;
|
||||
SDL_UnlockMutex(p->lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Align to power of 2
|
||||
#define ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1))
|
||||
|
||||
static int placebo_alloc_pic(Dav1dPicture *const p, void *cookie)
|
||||
static int placebo_alloc_pic(Dav1dPicture *const pic, void *cookie)
|
||||
{
|
||||
Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
|
||||
assert(rd_priv_ctx != NULL);
|
||||
|
||||
SDL_LockMutex(rd_priv_ctx->lock);
|
||||
|
||||
const struct pl_gpu *gpu = rd_priv_ctx->gpu;
|
||||
int ret = DAV1D_ERR(ENOMEM);
|
||||
|
||||
// Copied from dav1d_default_picture_alloc
|
||||
const int hbd = p->p.bpc > 8;
|
||||
const int aligned_w = ALIGN2(p->p.w, 128);
|
||||
const int aligned_h = ALIGN2(p->p.h, 128);
|
||||
const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400;
|
||||
const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444;
|
||||
p->stride[0] = aligned_w << hbd;
|
||||
p->stride[1] = has_chroma ? (aligned_w >> ss_hor) << hbd : 0;
|
||||
|
||||
// Align strides up to multiples of the GPU performance hints
|
||||
p->stride[0] = ALIGN2(p->stride[0], gpu->limits.align_tex_xfer_stride);
|
||||
p->stride[1] = ALIGN2(p->stride[1], gpu->limits.align_tex_xfer_stride);
|
||||
|
||||
// Aligning offsets to 4 also implicity aligns to the texel size (1 or 2)
|
||||
size_t off_align = ALIGN2(gpu->limits.align_tex_xfer_offset, 4);
|
||||
const size_t y_sz = ALIGN2(p->stride[0] * aligned_h, off_align);
|
||||
const size_t uv_sz = ALIGN2(p->stride[1] * (aligned_h >> ss_ver), off_align);
|
||||
|
||||
// The extra DAV1D_PICTURE_ALIGNMENTs are to brute force plane alignment,
|
||||
// even in the case that the driver gives us insane alignments
|
||||
const size_t pic_size = y_sz + 2 * uv_sz;
|
||||
const size_t total_size = pic_size + DAV1D_PICTURE_ALIGNMENT * 4;
|
||||
|
||||
// Validate size limitations
|
||||
if (total_size > gpu->limits.max_xfer_size) {
|
||||
printf("alloc of %zu bytes exceeds limits\n", total_size);
|
||||
goto err;
|
||||
}
|
||||
|
||||
const struct pl_buf *buf = pl_buf_create(gpu, &(struct pl_buf_params) {
|
||||
.type = PL_BUF_TEX_TRANSFER,
|
||||
.host_mapped = true,
|
||||
.size = total_size,
|
||||
.memory_type = PL_BUF_MEM_HOST,
|
||||
.user_data = p,
|
||||
});
|
||||
|
||||
if (!buf) {
|
||||
printf("alloc of GPU mapped buffer failed\n");
|
||||
goto err;
|
||||
}
|
||||
|
||||
assert(buf->data);
|
||||
uintptr_t base = (uintptr_t) buf->data, data[3];
|
||||
data[0] = ALIGN2(base, DAV1D_PICTURE_ALIGNMENT);
|
||||
data[1] = ALIGN2(data[0] + y_sz, DAV1D_PICTURE_ALIGNMENT);
|
||||
data[2] = ALIGN2(data[1] + uv_sz, DAV1D_PICTURE_ALIGNMENT);
|
||||
|
||||
// Sanity check offset alignment for the sake of debugging
|
||||
if (data[0] - base != ALIGN2(data[0] - base, off_align) ||
|
||||
data[1] - base != ALIGN2(data[1] - base, off_align) ||
|
||||
data[2] - base != ALIGN2(data[2] - base, off_align))
|
||||
{
|
||||
printf("GPU buffer horribly misaligned, expect slowdown!\n");
|
||||
}
|
||||
|
||||
p->allocator_data = (void *) buf;
|
||||
p->data[0] = (void *) data[0];
|
||||
p->data[1] = (void *) data[1];
|
||||
p->data[2] = (void *) data[2];
|
||||
ret = 0;
|
||||
|
||||
// fall through
|
||||
err:
|
||||
int ret = pl_allocate_dav1dpicture(pic, rd_priv_ctx->gpu);
|
||||
SDL_UnlockMutex(rd_priv_ctx->lock);
|
||||
return ret;
|
||||
}
|
||||
|
@ -673,11 +383,9 @@ static void placebo_release_pic(Dav1dPicture *pic, void *cookie)
|
|||
{
|
||||
Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
|
||||
assert(rd_priv_ctx != NULL);
|
||||
assert(pic->allocator_data);
|
||||
|
||||
SDL_LockMutex(rd_priv_ctx->lock);
|
||||
const struct pl_gpu *gpu = rd_priv_ctx->gpu;
|
||||
pl_buf_destroy(gpu, (const struct pl_buf **) &pic->allocator_data);
|
||||
pl_release_dav1dpicture(pic, rd_priv_ctx->gpu);
|
||||
SDL_UnlockMutex(rd_priv_ctx->lock);
|
||||
}
|
||||
|
||||
|
@ -690,10 +398,7 @@ const Dav1dPlayRenderInfo rdr_placebo_vk = {
|
|||
.update_frame = placebo_upload_image,
|
||||
.alloc_pic = placebo_alloc_pic,
|
||||
.release_pic = placebo_release_pic,
|
||||
|
||||
# if PL_API_VER >= 63
|
||||
.supports_gpu_grain = 1,
|
||||
# endif
|
||||
};
|
||||
#else
|
||||
const Dav1dPlayRenderInfo rdr_placebo_vk = { NULL };
|
||||
|
@ -706,12 +411,7 @@ const Dav1dPlayRenderInfo rdr_placebo_gl = {
|
|||
.destroy_renderer = placebo_renderer_destroy,
|
||||
.render = placebo_render,
|
||||
.update_frame = placebo_upload_image,
|
||||
.alloc_pic = placebo_alloc_pic,
|
||||
.release_pic = placebo_release_pic,
|
||||
|
||||
# if PL_API_VER >= 63
|
||||
.supports_gpu_grain = 1,
|
||||
# endif
|
||||
};
|
||||
#else
|
||||
const Dav1dPlayRenderInfo rdr_placebo_gl = { NULL };
|
||||
|
|
|
@ -43,10 +43,10 @@ dav1dplay_sources = files(
sdl2_dependency = dependency('sdl2', version: '>= 2.0.1', required: true)

if sdl2_dependency.found()
    dav1dplay_deps = [sdl2_dependency]
    dav1dplay_deps = [sdl2_dependency, libm_dependency]
    dav1dplay_cflags = []

    placebo_dependency = dependency('libplacebo', version: '>= 1.18.0', required: false)
    placebo_dependency = dependency('libplacebo', version: '>= 3.110.0', required: false)

    if placebo_dependency.found()
        dav1dplay_deps += placebo_dependency
|
||||
|
|
|
@ -116,8 +116,8 @@
|
|||
# define dav1d_uninit(x) x
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#include <intrin.h>
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
#include <intrin.h>
|
||||
|
||||
static inline int ctz(const unsigned int mask) {
|
||||
unsigned long idx;
|
||||
|
|
|
@ -0,0 +1,45 @@
/*
 * Copyright © 2021, VideoLAN and dav1d authors
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV1D_COMMON_FRAME_H
#define DAV1D_COMMON_FRAME_H

/*
 * Checks whether Dav1dFrameType == INTER || == SWITCH
 * Both are defined as odd numbers {1, 3} and therefore have the LSB set.
 * See also: AV1 spec 6.8.2
 */
#define IS_INTER_OR_SWITCH(frame_header) \
    ((frame_header)->frame_type & 1)

/*
 * Checks whether Dav1dFrameType == KEY || == INTRA
 * See also: AV1 spec 6.8.2
 */
#define IS_KEY_OR_INTRA(frame_header) \
    (!IS_INTER_OR_SWITCH(frame_header))

#endif /* DAV1D_COMMON_FRAME_H */
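An illustrative check of the LSB trick the comment describes (the toy_frame_header struct stands in for dav1d's real frame header type; the numeric values mirror how AV1 orders the frame types: KEY=0, INTER=1, INTRA_ONLY=2, SWITCH=3):

    #include <assert.h>
    #include "common/frame.h"

    struct toy_frame_header { int frame_type; };

    int main(void) {
        struct toy_frame_header h;
        h.frame_type = 1; assert(IS_INTER_OR_SWITCH(&h)); // INTER
        h.frame_type = 3; assert(IS_INTER_OR_SWITCH(&h)); // SWITCH
        h.frame_type = 0; assert(IS_KEY_OR_INTRA(&h));    // KEY
        h.frame_type = 2; assert(IS_KEY_OR_INTRA(&h));    // INTRA_ONLY
        return 0;
    }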
|
|
@ -45,6 +45,7 @@ typedef struct Dav1dRef Dav1dRef;

#define DAV1D_MAX_FRAME_THREADS 256
#define DAV1D_MAX_TILE_THREADS 64
#define DAV1D_MAX_POSTFILTER_THREADS 256

typedef struct Dav1dLogger {
    void *cookie; ///< Custom data to pass to the callback.

@ -67,7 +68,8 @@ typedef struct Dav1dSettings {
    unsigned frame_size_limit; ///< maximum frame size, in pixels (0 = unlimited)
    Dav1dPicAllocator allocator; ///< Picture allocator callback.
    Dav1dLogger logger; ///< Logger callback.
    uint8_t reserved[32]; ///< reserved for future use
    int n_postfilter_threads;
    uint8_t reserved[28]; ///< reserved for future use
} Dav1dSettings;

/**
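For context, a minimal sketch of how an application would open a decoder with the new field (the thread counts are arbitrary example values):

    #include <dav1d/dav1d.h>

    Dav1dContext *open_decoder(void) {
        Dav1dSettings settings;
        dav1d_default_settings(&settings);
        settings.n_frame_threads = 2;
        settings.n_tile_threads = 2;
        settings.n_postfilter_threads = 2;   // knob added by this update

        Dav1dContext *c = NULL;
        if (dav1d_open(&c, &settings) < 0)
            return NULL;
        return c;
    }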
|
||||
|
|
|
@ -25,9 +25,7 @@
# Revision file (vcs_version.h) generation
dav1d_git_dir = join_paths(dav1d_src_root, '.git')
rev_target = vcs_tag(command: [
        'git', '--git-dir', dav1d_git_dir,
        'describe', '--tags', '--long',
        '--match', '?.*.*', '--always'
        'git', '--git-dir', dav1d_git_dir, 'describe', '--long', '--always'
    ],
    input: 'vcs_version.h.in',
    output: 'vcs_version.h'
|
||||
|
|
|
@ -1,4 +1,4 @@
# Copyright © 2018-2020, VideoLAN and dav1d authors
# Copyright © 2018-2021, VideoLAN and dav1d authors
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
|
||||
|
@ -23,14 +23,14 @@
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

project('dav1d', ['c'],
    version: '0.8.1',
    version: '0.8.2',
    default_options: ['c_std=c99',
                      'warning_level=2',
                      'buildtype=release',
                      'b_ndebug=if-release'],
    meson_version: '>= 0.49.0')

dav1d_soname_version = '5.0.0'
dav1d_soname_version = '5.0.1'
dav1d_api_version_array = dav1d_soname_version.split('.')
dav1d_api_version_major = dav1d_api_version_array[0]
dav1d_api_version_minor = dav1d_api_version_array[1]
|
||||
|
@ -128,7 +128,7 @@ if host_machine.system() == 'windows'
|
|||
rc_data.set('API_VERSION_MAJOR', dav1d_api_version_major)
|
||||
rc_data.set('API_VERSION_MINOR', dav1d_api_version_minor)
|
||||
rc_data.set('API_VERSION_REVISION', dav1d_api_version_revision)
|
||||
rc_data.set('COPYRIGHT_YEARS', '2020')
|
||||
rc_data.set('COPYRIGHT_YEARS', '2021')
|
||||
else
|
||||
thread_dependency = dependency('threads')
|
||||
thread_compat_dep = []
|
||||
|
@ -168,6 +168,8 @@ if host_machine.system() == 'linux'
|
|||
endif
|
||||
endif
|
||||
|
||||
libm_dependency = cc.find_library('m', required: false)
|
||||
|
||||
|
||||
# Header checks
|
||||
|
||||
|
@ -257,6 +259,7 @@ if cc.get_argument_syntax() != 'msvc'
|
|||
else
|
||||
optional_arguments += [
|
||||
'-wd4028', # parameter different from declaration
|
||||
'-wd4090', # broken with arrays of pointers
|
||||
'-wd4996' # use of POSIX functions
|
||||
]
|
||||
endif
|
||||
|
|
|
@ -53,3 +53,7 @@ option('fuzzer_ldflags',
|
|||
option('stack_alignment',
|
||||
type: 'integer',
|
||||
value: 0)
|
||||
|
||||
option('xxhash_muxer',
|
||||
type : 'feature',
|
||||
value : 'auto')
|
||||
|
|
|
@ -40,8 +40,7 @@ function ipred_dc_128_8bpc_neon, export=1
|
|||
adr r2, L(ipred_dc_128_tbl)
|
||||
sub r3, r3, #25
|
||||
ldr r3, [r2, r3, lsl #2]
|
||||
mov lr, #128
|
||||
vdup.8 q0, lr
|
||||
vmov.i8 q0, #128
|
||||
add r2, r2, r3
|
||||
add r12, r0, r1
|
||||
lsl r1, r1, #1
|
||||
|
@ -79,7 +78,7 @@ L(ipred_dc_128_tbl):
|
|||
bgt 16b
|
||||
pop {r4, pc}
|
||||
320:
|
||||
vdup.8 q1, lr
|
||||
vmov.i8 q1, #128
|
||||
32:
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
|
||||
|
@ -89,20 +88,18 @@ L(ipred_dc_128_tbl):
|
|||
bgt 32b
|
||||
pop {r4, pc}
|
||||
640:
|
||||
vdup.8 q1, lr
|
||||
vdup.8 q2, lr
|
||||
vdup.8 q3, lr
|
||||
vmov.i8 q1, #128
|
||||
sub r1, r1, #32
|
||||
64:
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
|
||||
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
|
||||
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
|
||||
subs r4, r4, #4
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
|
||||
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
|
||||
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
|
||||
bgt 64b
|
||||
pop {r4, pc}
|
||||
endfunc
|
||||
|
@ -401,19 +398,17 @@ L(ipred_dc_top_tbl):
|
|||
vrshrn.u16 d18, q0, #6
|
||||
vdup.8 q0, d18[0]
|
||||
vdup.8 q1, d18[0]
|
||||
vdup.8 q2, d18[0]
|
||||
vdup.8 q3, d18[0]
|
||||
sub r1, r1, #32
|
||||
64:
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
|
||||
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
|
||||
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
|
||||
subs r4, r4, #4
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
|
||||
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
|
||||
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
|
||||
bgt 64b
|
||||
pop {r4-r5, pc}
|
||||
endfunc
|
||||
|
@ -538,20 +533,18 @@ L(ipred_dc_left_h64):
|
|||
vdup.8 q0, d0[0]
|
||||
bx r3
|
||||
L(ipred_dc_left_w64):
|
||||
sub r1, r1, #32
|
||||
vmov.8 q1, q0
|
||||
vmov.8 q2, q0
|
||||
vmov.8 q3, q0
|
||||
sub r1, r1, #32
|
||||
1:
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
|
||||
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
|
||||
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
|
||||
subs r4, r4, #4
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
|
||||
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
|
||||
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
|
||||
bgt 1b
|
||||
pop {r4-r5, pc}
|
||||
endfunc
|
||||
|
@ -600,10 +593,10 @@ L(ipred_dc_tbl):
|
|||
L(ipred_dc_h4):
|
||||
vld1.32 {d0[]}, [r2, :32]!
|
||||
vpaddl.u8 d0, d0
|
||||
add r2, r2, #1
|
||||
vpadd.u16 d0, d0
|
||||
bx r3
|
||||
L(ipred_dc_w4):
|
||||
add r2, r2, #1
|
||||
vld1.32 {d1[]}, [r2]
|
||||
vadd.s16 d0, d0, d30
|
||||
vpaddl.u8 d1, d1
|
||||
|
@ -635,10 +628,10 @@ L(ipred_dc_h8):
|
|||
vld1.8 {d0}, [r2, :64]!
|
||||
vpaddl.u8 d0, d0
|
||||
vpadd.u16 d0, d0
|
||||
add r2, r2, #1
|
||||
vpadd.u16 d0, d0
|
||||
bx r3
|
||||
L(ipred_dc_w8):
|
||||
add r2, r2, #1
|
||||
vld1.8 {d2}, [r2]
|
||||
vadd.s16 d0, d0, d30
|
||||
vpaddl.u8 d2, d2
|
||||
|
@ -672,10 +665,10 @@ L(ipred_dc_h16):
|
|||
vaddl.u8 q0, d0, d1
|
||||
vadd.u16 d0, d0, d1
|
||||
vpadd.u16 d0, d0
|
||||
add r2, r2, #1
|
||||
vpadd.u16 d0, d0
|
||||
bx r3
|
||||
L(ipred_dc_w16):
|
||||
add r2, r2, #1
|
||||
vld1.8 {d2, d3}, [r2]
|
||||
vadd.s16 d0, d0, d30
|
||||
vaddl.u8 q1, d2, d3
|
||||
|
@ -712,10 +705,10 @@ L(ipred_dc_h32):
|
|||
vadd.u16 q0, q0, q1
|
||||
vadd.u16 d0, d0, d1
|
||||
vpadd.u16 d0, d0
|
||||
add r2, r2, #1
|
||||
vpadd.u16 d0, d0
|
||||
bx r3
|
||||
L(ipred_dc_w32):
|
||||
add r2, r2, #1
|
||||
vld1.8 {d2, d3, d4, d5}, [r2]
|
||||
vadd.s16 d0, d0, d30
|
||||
vaddl.u8 q1, d2, d3
|
||||
|
@ -760,10 +753,10 @@ L(ipred_dc_h64):
|
|||
vadd.u16 q0, q0, q1
|
||||
vadd.u16 d0, d0, d1
|
||||
vpadd.u16 d0, d0
|
||||
add r2, r2, #1
|
||||
vpadd.u16 d0, d0
|
||||
bx r3
|
||||
L(ipred_dc_w64):
|
||||
add r2, r2, #1
|
||||
vld1.8 {d2, d3, d4, d5}, [r2]!
|
||||
vadd.s16 d0, d0, d30
|
||||
vaddl.u8 q2, d4, d5
|
||||
|
@ -789,11 +782,11 @@ L(ipred_dc_w64):
|
|||
vadd.s16 d0, d0, d2
|
||||
vadd.s16 d0, d0, d3
|
||||
vshl.u16 d18, d0, d28
|
||||
beq 1f // h = 16/32
|
||||
beq 1f
|
||||
// h = 16/32
|
||||
movw lr, #(0x5556/2)
|
||||
movt lr, #(0x3334/2)
|
||||
mov r5, r4
|
||||
and r5, r5, #31
|
||||
and r5, r4, #31
|
||||
lsr lr, lr, r5
|
||||
vdup.16 d30, lr
|
||||
vqdmulh.s16 d18, d18, d30
|
||||
|
@ -801,18 +794,16 @@ L(ipred_dc_w64):
|
|||
sub r1, r1, #32
|
||||
vdup.8 q0, d18[0]
|
||||
vdup.8 q1, d18[0]
|
||||
vdup.8 q2, d18[0]
|
||||
vdup.8 q3, d18[0]
|
||||
2:
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
|
||||
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
|
||||
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
|
||||
subs r4, r4, #4
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
|
||||
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
|
||||
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
|
||||
bgt 2b
|
||||
pop {r4-r6, pc}
|
||||
endfunc
|
||||
|
@ -1444,6 +1435,8 @@ function ipred_filter_8bpc_neon, export=1
|
|||
vmovl.s8 q13, d28
|
||||
vmovl.s8 q14, d29
|
||||
add r8, r2, #1
|
||||
sub r2, r2, #2
|
||||
mov r7, #-2
|
||||
bx r5
|
||||
|
||||
.align 2
|
||||
|
@ -1455,8 +1448,6 @@ L(ipred_filter_tbl):
|
|||
|
||||
40:
|
||||
vld1.32 {d0[]}, [r8] // top (0-3)
|
||||
sub r2, r2, #2
|
||||
mov r7, #-2
|
||||
vmovl.u8 q0, d0 // top (0-3)
|
||||
4:
|
||||
vld1.32 {d2[]}, [r2], r7 // left (0-1) + topleft (2)
|
||||
|
@ -1473,13 +1464,11 @@ L(ipred_filter_tbl):
|
|||
vst1.32 {d4[0]}, [r0, :32], r1
|
||||
vmovl.u8 q0, d4
|
||||
vst1.32 {d4[1]}, [r6, :32], r1
|
||||
vext.8 q0, q0, q0, #8 // move top from [4-7] to [0-3]
|
||||
vmov d0, d1 // move top from [4-7] to [0-3]
|
||||
bgt 4b
|
||||
pop {r4-r8, pc}
|
||||
80:
|
||||
vld1.8 {d0}, [r8] // top (0-7)
|
||||
sub r2, r2, #2
|
||||
mov r7, #-2
|
||||
vmovl.u8 q0, d0 // top (0-7)
|
||||
8:
|
||||
vld1.32 {d2[]}, [r2], r7 // left (0-1) + topleft (2)
|
||||
|
@ -1503,16 +1492,14 @@ L(ipred_filter_tbl):
|
|||
vqrshrun.s16 d5, q3, #4
|
||||
vzip.32 d4, d5
|
||||
subs r4, r4, #2
|
||||
vst1.64 {d4}, [r0, :64], r1
|
||||
vst1.8 {d4}, [r0, :64], r1
|
||||
vmovl.u8 q0, d5
|
||||
vst1.64 {d5}, [r6, :64], r1
|
||||
vst1.8 {d5}, [r6, :64], r1
|
||||
bgt 8b
|
||||
pop {r4-r8, pc}
|
||||
160:
|
||||
320:
|
||||
vpush {q4-q5}
|
||||
sub r2, r2, #2
|
||||
mov r7, #-2
|
||||
sub r1, r1, r3
|
||||
mov lr, r3
|
||||
|
||||
|
@ -2003,10 +1990,10 @@ L(ipred_cfl_tbl):
|
|||
L(ipred_cfl_h4):
|
||||
vld1.32 {d0[]}, [r2, :32]!
|
||||
vpaddl.u8 d0, d0
|
||||
add r2, r2, #1
|
||||
vpadd.i16 d0, d0
|
||||
bx r12
|
||||
L(ipred_cfl_w4):
|
||||
add r2, r2, #1
|
||||
vld1.32 {d1[]}, [r2]
|
||||
vadd.i16 d0, d0, d16
|
||||
vpaddl.u8 d1, d1
|
||||
|
@ -2031,10 +2018,10 @@ L(ipred_cfl_h8):
|
|||
vld1.8 {d0}, [r2, :64]!
|
||||
vpaddl.u8 d0, d0
|
||||
vpadd.i16 d0, d0
|
||||
add r2, r2, #1
|
||||
vpadd.i16 d0, d0
|
||||
bx r12
|
||||
L(ipred_cfl_w8):
|
||||
add r2, r2, #1
|
||||
vld1.8 {d1}, [r2]
|
||||
vadd.i16 d0, d0, d16
|
||||
vpaddl.u8 d1, d1
|
||||
|
@ -2061,10 +2048,10 @@ L(ipred_cfl_h16):
|
|||
vaddl.u8 q0, d0, d1
|
||||
vadd.i16 d0, d0, d1
|
||||
vpadd.i16 d0, d0
|
||||
add r2, r2, #1
|
||||
vpadd.i16 d0, d0
|
||||
bx r12
|
||||
L(ipred_cfl_w16):
|
||||
add r2, r2, #1
|
||||
vld1.8 {q2}, [r2]
|
||||
vadd.i16 d0, d0, d16
|
||||
vaddl.u8 q2, d4, d5
|
||||
|
@ -2094,10 +2081,10 @@ L(ipred_cfl_h32):
|
|||
vadd.i16 q0, q2, q3
|
||||
vadd.i16 d0, d0, d1
|
||||
vpadd.i16 d0, d0
|
||||
add r2, r2, #1
|
||||
vpadd.i16 d0, d0
|
||||
bx r12
|
||||
L(ipred_cfl_w32):
|
||||
add r2, r2, #1
|
||||
vld1.8 {q2, q3}, [r2]
|
||||
vadd.i16 d0, d0, d16
|
||||
vaddl.u8 q2, d4, d5
|
||||
|
|
Diff for this file not shown because of its large size.
|
@ -706,7 +706,7 @@ def_fn_4x4 identity, flipadst
|
|||
vrshrn_8h \r14, \r15, q4, q5, #12 // t7a
|
||||
vmull_vmlal_8h q2, q3, \r10, \r11, \r6, \r7, d1[3], d1[2] // -> t6a
|
||||
vrshrn_8h \r6, \r7, q6, q7, #12 // t5a
|
||||
vrshrn_8h \r10, \r11, q2, q3, #12 // taa
|
||||
vrshrn_8h \r10, \r11, q2, q3, #12 // t6a
|
||||
|
||||
vqadd.s16 q2, \q1, \q3 // t4
|
||||
vqsub.s16 \q1, \q1, \q3 // t5a
|
||||
|
@ -1173,7 +1173,7 @@ function inv_dct_4h_x16_neon, export=1
|
|||
|
||||
vrshrn.i32 d6, q3, #12 // t11
|
||||
vrshrn.i32 d7, q4, #12 // t12
|
||||
vmull_vmlal q4, d25, d21, d0[0], d0[0] // -> t10a
|
||||
vmull_vmlal q4, d25, d21, d0[0], d0[0] // -> t13a
|
||||
vrshrn.i32 d4, q2, #12 // t10a
|
||||
vrshrn.i32 d5, q4, #12 // t13a
|
||||
|
||||
|
@ -1480,53 +1480,6 @@ function inv_txfm_add_vert_4x16_neon
|
|||
pop {pc}
|
||||
endfunc
|
||||
|
||||
.macro sub_sp_align space
|
||||
#if CONFIG_THUMB
|
||||
mov r7, sp
|
||||
and r7, r7, #15
|
||||
#else
|
||||
and r7, sp, #15
|
||||
#endif
|
||||
sub sp, sp, r7
|
||||
// Now the stack is aligned, store the amount of adjustment back
|
||||
// on the stack, as we don't want to waste a register as frame
|
||||
// pointer.
|
||||
str r7, [sp, #-16]!
|
||||
#ifdef _WIN32
|
||||
.if \space > 8192
|
||||
// Here, we'd need to touch two (or more) pages while decrementing
|
||||
// the stack pointer.
|
||||
.error "sub_sp_align doesn't support values over 8K at the moment"
|
||||
.elseif \space > 4096
|
||||
sub r7, sp, #4096
|
||||
ldr r12, [r7]
|
||||
sub r7, r7, #(\space - 4096)
|
||||
mov sp, r7
|
||||
.else
|
||||
sub sp, sp, #\space
|
||||
.endif
|
||||
#else
|
||||
.if \space >= 4096
|
||||
sub sp, sp, #(\space)/4096*4096
|
||||
.endif
|
||||
.if (\space % 4096) != 0
|
||||
sub sp, sp, #(\space)%4096
|
||||
.endif
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro add_sp_align space
|
||||
.if \space >= 4096
|
||||
add sp, sp, #(\space)/4096*4096
|
||||
.endif
|
||||
.if (\space % 4096) != 0
|
||||
add sp, sp, #(\space)%4096
|
||||
.endif
|
||||
ldr r7, [sp], #16
|
||||
// Add back the original stack adjustment
|
||||
add sp, sp, r7
|
||||
.endm
|
||||
|
||||
function inv_txfm_add_16x16_neon
|
||||
sub_sp_align 512
|
||||
ldrh r11, [r10], #2
|
||||
|
@ -3248,7 +3201,9 @@ function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1
|
|||
mov r8, #(32 - \i)
|
||||
cmp r3, r11
|
||||
blt 1f
|
||||
.if \i < 28
|
||||
ldrh r11, [r10], #2
|
||||
.endif
|
||||
.endif
|
||||
add r7, r2, #(\i*2)
|
||||
mov r8, #32*2
|
||||
|
@ -3304,7 +3259,7 @@ function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1
|
|||
add r6, r4, #(\i*64*2)
|
||||
mov r9, #-2 // shift
|
||||
bl inv_txfm_horz_dct_64x4_neon
|
||||
.if \i < 8
|
||||
.if \i < 12
|
||||
ldrh r11, [r10], #2
|
||||
.endif
|
||||
.endr
|
||||
|
@ -3353,7 +3308,9 @@ function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1
|
|||
mov r8, #(32 - \i)
|
||||
cmp r3, r11
|
||||
blt 1f
|
||||
.if \i < 28
|
||||
ldrh r11, [r10], #2
|
||||
.endif
|
||||
.endif
|
||||
add r7, r2, #(\i*2)
|
||||
mov r8, #32*2
|
||||
|
|
Diff for this file not shown because of its large size.
|
@ -141,13 +141,12 @@ function lpf_4_wd\wd\()_neon
|
|||
vmov.i16 d6, #3
|
||||
vbic d0, d1, d0 // (fm && wd >= 4 && !hev)
|
||||
vmul.i16 d2, d2, d6
|
||||
vmov.i16 d6, #4
|
||||
vmov.i16 d7, #4
|
||||
vadd.i16 d2, d2, d4
|
||||
vmin.s16 d2, d2, d3 // f = iclip_diff()
|
||||
vmov.i16 d7, #3
|
||||
vmax.s16 d2, d2, d9 // f = iclip_diff()
|
||||
vqadd.s16 d4, d6, d2 // f + 4
|
||||
vqadd.s16 d5, d7, d2 // f + 3
|
||||
vqadd.s16 d4, d7, d2 // f + 4
|
||||
vqadd.s16 d5, d6, d2 // f + 3
|
||||
vmin.s16 d4, d4, d3 // imin(f + 4, 128 << bitdepth_min_8 - 1)
|
||||
vmin.s16 d5, d5, d3 // imin(f + 3, 128 << bitdepth_min_8 - 1)
|
||||
vshr.s16 d4, d4, #3 // f1
|
||||
|
|
|
@ -28,15 +28,27 @@
|
|||
#include "src/arm/asm.S"
|
||||
#include "util.S"
|
||||
|
||||
const right_ext_mask_buf
|
||||
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
|
||||
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
|
||||
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
|
||||
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
|
||||
right_ext_mask:
|
||||
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
|
||||
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
|
||||
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
|
||||
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
|
||||
endconst
|
||||
|
||||
// void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4],
|
||||
// const pixel *src, ptrdiff_t stride,
|
||||
// const int16_t fh[8], intptr_t w,
|
||||
// int h, enum LrEdgeFlags edges);
|
||||
function wiener_filter_h_8bpc_neon, export=1
|
||||
push {r4-r11,lr}
|
||||
vpush {q4}
|
||||
ldrd r4, r5, [sp, #52]
|
||||
ldrd r6, r7, [sp, #60]
|
||||
vpush {q4-q7}
|
||||
ldrd r4, r5, [sp, #100]
|
||||
ldrd r6, r7, [sp, #108]
|
||||
mov r8, r5
|
||||
vld1.16 {q0}, [r4, :128]
|
||||
movw r9, #(1 << 14) - (1 << 2)
|
||||
|
@ -47,27 +59,19 @@ function wiener_filter_h_8bpc_neon, export=1
|
|||
bic r10, r10, #7
|
||||
lsl r10, r10, #1
|
||||
|
||||
// Clear the last unused element of q0, to allow filtering a single
|
||||
// pixel with one plain vmul+vpadd.
|
||||
mov r12, #0
|
||||
vmov.16 d1[3], r12
|
||||
|
||||
// Set up pointers for reading/writing alternate rows
|
||||
add r12, r0, r10
|
||||
lsl r10, r10, #1
|
||||
add lr, r2, r3
|
||||
lsl r3, r3, #1
|
||||
|
||||
// Subtract the width from mid_stride
|
||||
sub r10, r10, r5, lsl #1
|
||||
|
||||
// For w >= 8, we read (w+5)&~7+8 pixels, for w < 8 we read 16 pixels.
|
||||
cmp r5, #8
|
||||
add r11, r5, #13
|
||||
// Subtract the aligned width from mid_stride
|
||||
add r11, r5, #7
|
||||
bic r11, r11, #7
|
||||
bge 1f
|
||||
mov r11, #16
|
||||
1:
|
||||
sub r10, r10, r11, lsl #1
|
||||
|
||||
// Subtract the number of pixels read from the source stride
|
||||
add r11, r11, #8
|
||||
sub r3, r3, r11
|
||||
|
||||
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
|
||||
|
@ -131,47 +135,56 @@ function wiener_filter_h_8bpc_neon, export=1
|
|||
ldrb r11, [r2, r9]
|
||||
ldrb r9, [lr, r9]
|
||||
// Fill q12/q13 with the right padding pixel
|
||||
vdup.8 d24, r11
|
||||
vdup.8 d26, r9
|
||||
vmovl.u8 q12, d24
|
||||
vmovl.u8 q13, d26
|
||||
vdup.16 q12, r11
|
||||
vdup.16 q13, r9
|
||||
3: // !LR_HAVE_RIGHT
|
||||
// If we'll have to pad the right edge we need to quit early here.
|
||||
|
||||
// Check whether we need to pad the right edge
|
||||
cmp r5, #11
|
||||
bge 4f // If w >= 11, all used input pixels are valid
|
||||
cmp r5, #7
|
||||
bge 5f // If w >= 7, we can filter 4 pixels
|
||||
b 6f
|
||||
|
||||
// 1 <= w < 11, w+3 pixels valid in q1-q2. For w=9 or w=10,
|
||||
// this ends up called again; it's not strictly needed in those
|
||||
// cases (we pad enough here), but keeping the code as simple as possible.
|
||||
|
||||
// Insert padding in q1/2.h[w+3] onwards; fuse the +3 (*2) into the
|
||||
// buffer pointer.
|
||||
movrel_local r4, right_ext_mask, -6
|
||||
sub r4, r4, r5, lsl #1
|
||||
vld1.8 {q10, q11}, [r4]
|
||||
|
||||
vbit q1, q12, q10
|
||||
vbit q2, q12, q11
|
||||
vbit q8, q13, q10
|
||||
vbit q9, q13, q11
|
||||
|
||||
4: // Loop horizontally
|
||||
// This is tuned as some sort of compromise between Cortex A7, A8,
|
||||
// A9 and A53.
|
||||
vmul.s16 q3, q1, d0[0]
|
||||
vext.8 q10, q1, q2, #2
|
||||
vext.8 q11, q1, q2, #4
|
||||
vmla.s16 q3, q10, d0[1]
|
||||
vmla.s16 q3, q11, d0[2]
|
||||
vext.8 q10, q1, q2, #6
|
||||
vext.8 q11, q1, q2, #8
|
||||
vmla.s16 q3, q10, d0[3]
|
||||
vmla.s16 q3, q11, d1[0]
|
||||
vext.8 q10, q1, q2, #10
|
||||
vext.8 q11, q1, q2, #12
|
||||
vmla.s16 q3, q10, d1[1]
|
||||
vmla.s16 q3, q11, d1[2]
|
||||
vext.8 q5, q1, q2, #8
|
||||
vext.8 q10, q1, q2, #2
|
||||
vext.8 q6, q1, q2, #10
|
||||
vext.8 q7, q1, q2, #12
|
||||
vext.8 q4, q1, q2, #6
|
||||
vadd.i16 q5, q5, q11
|
||||
vadd.i16 q6, q6, q10
|
||||
vadd.i16 q7, q7, q1
|
||||
vmul.s16 q3, q4, d0[3]
|
||||
vmla.s16 q3, q5, d1[0]
|
||||
vmla.s16 q3, q6, d1[1]
|
||||
vmla.s16 q3, q7, d1[2]
|
||||
|
||||
vmul.s16 q10, q8, d0[0]
|
||||
vext.8 q11, q8, q9, #2
|
||||
vext.8 q4, q8, q9, #4
|
||||
vmla.s16 q10, q11, d0[1]
|
||||
vmla.s16 q10, q4, d0[2]
|
||||
vext.8 q11, q8, q9, #6
|
||||
vext.8 q4, q8, q9, #8
|
||||
vmla.s16 q10, q11, d0[3]
|
||||
vmla.s16 q10, q4, d1[0]
|
||||
vext.8 q11, q8, q9, #10
|
||||
vext.8 q6, q8, q9, #8
|
||||
vext.8 q11, q8, q9, #2
|
||||
vext.8 q7, q8, q9, #10
|
||||
vadd.i16 q6, q6, q4
|
||||
vext.8 q4, q8, q9, #12
|
||||
vmla.s16 q10, q11, d1[1]
|
||||
vext.8 q5, q8, q9, #6
|
||||
vadd.i16 q7, q7, q11
|
||||
vadd.i16 q4, q4, q8
|
||||
vmul.s16 q10, q5, d0[3]
|
||||
vmla.s16 q10, q6, d1[0]
|
||||
vmla.s16 q10, q7, d1[1]
|
||||
vmla.s16 q10, q4, d1[2]
|
||||
|
||||
vext.8 q1, q1, q2, #6
|
||||
|
@ -186,10 +199,10 @@ function wiener_filter_h_8bpc_neon, export=1
|
|||
vshr.s16 q10, q10, #3
|
||||
vadd.s16 q3, q3, q15
|
||||
vadd.s16 q10, q10, q15
|
||||
subs r5, r5, #8
|
||||
vst1.16 {q3}, [r0, :128]!
|
||||
vst1.16 {q10}, [r12, :128]!
|
||||
|
||||
subs r5, r5, #8
|
||||
ble 9f
|
||||
tst r7, #2 // LR_HAVE_RIGHT
|
||||
vmov q1, q2
|
||||
|
@ -201,145 +214,6 @@ function wiener_filter_h_8bpc_neon, export=1
|
|||
bne 4b // If we don't need to pad, just keep filtering.
|
||||
b 3b // If we need to pad, check how many pixels we have left.
|
||||
|
||||
5: // Filter 4 pixels, 7 <= w < 11
|
||||
.macro filter_4
|
||||
vext.8 d20, d2, d3, #2
|
||||
vext.8 d21, d2, d3, #4
|
||||
vext.8 d22, d2, d3, #6
|
||||
vext.8 d23, d3, d4, #2
|
||||
vext.8 d8, d3, d4, #4
|
||||
vmul.s16 d6, d2, d0[0]
|
||||
vmla.s16 d6, d20, d0[1]
|
||||
vmla.s16 d6, d21, d0[2]
|
||||
vmla.s16 d6, d22, d0[3]
|
||||
vmla.s16 d6, d3, d1[0]
|
||||
vmla.s16 d6, d23, d1[1]
|
||||
vmla.s16 d6, d8, d1[2]
|
||||
|
||||
vext.8 d20, d16, d17, #2
|
||||
vext.8 d21, d16, d17, #4
|
||||
vext.8 d22, d16, d17, #6
|
||||
vext.8 d23, d17, d18, #2
|
||||
vext.8 d8, d17, d18, #4
|
||||
vmul.s16 d7, d16, d0[0]
|
||||
vmla.s16 d7, d20, d0[1]
|
||||
vmla.s16 d7, d21, d0[2]
|
||||
vmla.s16 d7, d22, d0[3]
|
||||
vmla.s16 d7, d17, d1[0]
|
||||
vmla.s16 d7, d23, d1[1]
|
||||
vmla.s16 d7, d8, d1[2]
|
||||
|
||||
vext.8 d22, d2, d3, #6
|
||||
vext.8 d23, d16, d17, #6
|
||||
vshl.s16 q11, q11, #7
|
||||
vsub.s16 q11, q11, q14
|
||||
vqadd.s16 q3, q3, q11
|
||||
vshr.s16 q3, q3, #3
|
||||
vadd.s16 q3, q3, q15
|
||||
.endm
|
||||
filter_4
|
||||
vst1.16 {d6}, [r0, :64]!
|
||||
vst1.16 {d7}, [r12, :64]!
|
||||
|
||||
subs r5, r5, #4 // 3 <= w < 7
|
||||
vext.8 q1, q1, q2, #8
|
||||
vext.8 q2, q2, q2, #8
|
||||
vext.8 q8, q8, q9, #8
|
||||
vext.8 q9, q9, q9, #8
|
||||
|
||||
6: // Pad the right edge and filter the last few pixels.
|
||||
// w < 7, w+3 pixels valid in q1-q2
|
||||
cmp r5, #5
|
||||
blt 7f
|
||||
bgt 8f
|
||||
// w == 5, 8 pixels valid in q1, q2 invalid
|
||||
vmov q2, q12
|
||||
vmov q9, q13
|
||||
b 88f
|
||||
|
||||
7: // 1 <= w < 5, 4-7 pixels valid in q1
|
||||
sub r9, r5, #1
|
||||
// r9 = (pixels valid - 4)
|
||||
adr r11, L(variable_shift_tbl)
|
||||
ldr r9, [r11, r9, lsl #2]
|
||||
add r11, r11, r9
|
||||
vmov q2, q12
|
||||
vmov q9, q13
|
||||
bx r11
|
||||
|
||||
.align 2
|
||||
L(variable_shift_tbl):
|
||||
.word 44f - L(variable_shift_tbl) + CONFIG_THUMB
|
||||
.word 55f - L(variable_shift_tbl) + CONFIG_THUMB
|
||||
.word 66f - L(variable_shift_tbl) + CONFIG_THUMB
|
||||
.word 77f - L(variable_shift_tbl) + CONFIG_THUMB
|
||||
|
||||
44: // 4 pixels valid in d2/d16, fill d3/d17 with padding.
|
||||
vmov d3, d4
|
||||
vmov d17, d18
|
||||
b 88f
|
||||
// Shift q1 right, shifting out invalid pixels,
|
||||
// shift q1 left to the original offset, shifting in padding pixels.
|
||||
55: // 5 pixels valid
|
||||
vext.8 q1, q1, q1, #10
|
||||
vext.8 q1, q1, q2, #6
|
||||
vext.8 q8, q8, q8, #10
|
||||
vext.8 q8, q8, q9, #6
|
||||
b 88f
|
||||
66: // 6 pixels valid
|
||||
vext.8 q1, q1, q1, #12
|
||||
vext.8 q1, q1, q2, #4
|
||||
vext.8 q8, q8, q8, #12
|
||||
vext.8 q8, q8, q9, #4
|
||||
b 88f
|
||||
77: // 7 pixels valid
|
||||
vext.8 q1, q1, q1, #14
|
||||
vext.8 q1, q1, q2, #2
|
||||
vext.8 q8, q8, q8, #14
|
||||
vext.8 q8, q8, q9, #2
|
||||
b 88f
|
||||
|
||||
8: // w > 5, w == 6, 9 pixels valid in q1-q2, 1 pixel valid in q2
|
||||
vext.8 q2, q2, q2, #2
|
||||
vext.8 q2, q2, q12, #14
|
||||
vext.8 q9, q9, q9, #2
|
||||
vext.8 q9, q9, q13, #14
|
||||
|
||||
88:
|
||||
// w < 7, q1-q2 padded properly
|
||||
cmp r5, #4
|
||||
blt 888f
|
||||
|
||||
// w >= 4, filter 4 pixels
|
||||
filter_4
|
||||
vst1.16 {d6}, [r0, :64]!
|
||||
vst1.16 {d7}, [r12, :64]!
|
||||
subs r5, r5, #4 // 0 <= w < 4
|
||||
vext.8 q1, q1, q2, #8
|
||||
vext.8 q8, q8, q9, #8
|
||||
beq 9f
|
||||
888: // 1 <= w < 4, filter 1 pixel at a time
|
||||
vmul.s16 q3, q1, q0
|
||||
vmul.s16 q10, q8, q0
|
||||
vpadd.s16 d6, d6, d7
|
||||
vpadd.s16 d7, d20, d21
|
||||
vdup.16 d24, d2[3]
|
||||
vpadd.s16 d6, d6, d7
|
||||
vdup.16 d25, d16[3]
|
||||
vpadd.s16 d6, d6, d6
|
||||
vtrn.16 d24, d25
|
||||
vshl.s16 d24, d24, #7
|
||||
vsub.s16 d24, d24, d28
|
||||
vqadd.s16 d6, d6, d24
|
||||
vshr.s16 d6, d6, #3
|
||||
vadd.s16 d6, d6, d30
|
||||
vst1.s16 {d6[0]}, [r0, :16]!
|
||||
vst1.s16 {d6[1]}, [r12, :16]!
|
||||
subs r5, r5, #1
|
||||
vext.8 q1, q1, q2, #2
|
||||
vext.8 q8, q8, q9, #2
|
||||
bgt 888b
|
||||
|
||||
9:
|
||||
subs r6, r6, #2
|
||||
ble 0f
|
||||
|
@ -351,9 +225,8 @@ L(variable_shift_tbl):
|
|||
mov r5, r8
|
||||
b 1b
|
||||
0:
|
||||
vpop {q4}
|
||||
vpop {q4-q7}
|
||||
pop {r4-r11,pc}
|
||||
.purgem filter_4
|
||||
endfunc
|
||||
|
||||
// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride,
|
||||
|
@ -362,8 +235,9 @@ endfunc
|
|||
// ptrdiff_t mid_stride);
|
||||
function wiener_filter_v_8bpc_neon, export=1
|
||||
push {r4-r7,lr}
|
||||
ldrd r4, r5, [sp, #20]
|
||||
ldrd r6, r7, [sp, #28]
|
||||
vpush {q4-q6}
|
||||
ldrd r4, r5, [sp, #68]
|
||||
ldrd r6, r7, [sp, #76]
|
||||
mov lr, r4
|
||||
vld1.16 {q0}, [r5, :128]
|
||||
|
||||
|
@ -407,24 +281,21 @@ function wiener_filter_v_8bpc_neon, export=1
|
|||
// Interleaving the mul/mla chains actually hurts performance
|
||||
// significantly on Cortex A53, thus keeping mul/mla tightly
|
||||
// chained like this.
|
||||
vmull.s16 q2, d16, d0[0]
|
||||
vmlal.s16 q2, d18, d0[1]
|
||||
vmlal.s16 q2, d20, d0[2]
|
||||
vmlal.s16 q2, d22, d0[3]
|
||||
vmlal.s16 q2, d24, d1[0]
|
||||
vmlal.s16 q2, d26, d1[1]
|
||||
vmlal.s16 q2, d28, d1[2]
|
||||
vmull.s16 q3, d17, d0[0]
|
||||
vmlal.s16 q3, d19, d0[1]
|
||||
vmlal.s16 q3, d21, d0[2]
|
||||
vmlal.s16 q3, d23, d0[3]
|
||||
vmlal.s16 q3, d25, d1[0]
|
||||
vmlal.s16 q3, d27, d1[1]
|
||||
vmlal.s16 q3, d29, d1[2]
|
||||
vadd.i16 q4, q10, q12
|
||||
vadd.i16 q5, q9, q13
|
||||
vadd.i16 q6, q8, q14
|
||||
vmull.s16 q2, d22, d0[3]
|
||||
vmlal.s16 q2, d8, d1[0]
|
||||
vmlal.s16 q2, d10, d1[1]
|
||||
vmlal.s16 q2, d12, d1[2]
|
||||
vmull.s16 q3, d23, d0[3]
|
||||
vmlal.s16 q3, d9, d1[0]
|
||||
vmlal.s16 q3, d11, d1[1]
|
||||
vmlal.s16 q3, d13, d1[2]
|
||||
vqrshrun.s32 d4, q2, #11
|
||||
vqrshrun.s32 d5, q3, #11
|
||||
vqmovun.s16 d4, q2
|
||||
vst1.8 {d4}, [r0], r1
|
||||
vst1.8 {d4}, [r0, :64], r1
|
||||
.if \compare
|
||||
cmp r4, #4
|
||||
.else
|
||||
|
@ -529,147 +400,11 @@ function wiener_filter_v_8bpc_neon, export=1
|
|||
b 1b
|
||||
|
||||
0:
|
||||
vpop {q4-q6}
|
||||
pop {r4-r7,pc}
|
||||
.purgem filter
|
||||
endfunc
|
||||
|
||||
// void dav1d_copy_narrow_8bpc_neon(pixel *dst, ptrdiff_t stride,
|
||||
// const pixel *src, int w, int h);
|
||||
function copy_narrow_8bpc_neon, export=1
|
||||
push {r4,lr}
|
||||
ldr r4, [sp, #8]
|
||||
adr r12, L(copy_narrow_tbl)
|
||||
ldr r3, [r12, r3, lsl #2]
|
||||
add r12, r12, r3
|
||||
bx r12
|
||||
|
||||
.align 2
|
||||
L(copy_narrow_tbl):
|
||||
.word 0
|
||||
.word 10f - L(copy_narrow_tbl) + CONFIG_THUMB
|
||||
.word 20f - L(copy_narrow_tbl) + CONFIG_THUMB
|
||||
.word 30f - L(copy_narrow_tbl) + CONFIG_THUMB
|
||||
.word 40f - L(copy_narrow_tbl) + CONFIG_THUMB
|
||||
.word 50f - L(copy_narrow_tbl) + CONFIG_THUMB
|
||||
.word 60f - L(copy_narrow_tbl) + CONFIG_THUMB
|
||||
.word 70f - L(copy_narrow_tbl) + CONFIG_THUMB
|
||||
|
||||
10:
|
||||
add r3, r0, r1
|
||||
lsl r1, r1, #1
|
||||
18:
|
||||
subs r4, r4, #8
|
||||
blt 110f
|
||||
vld1.8 {d0}, [r2, :64]!
|
||||
vst1.8 {d0[0]}, [r0], r1
|
||||
vst1.8 {d0[1]}, [r3], r1
|
||||
vst1.8 {d0[2]}, [r0], r1
|
||||
vst1.8 {d0[3]}, [r3], r1
|
||||
vst1.8 {d0[4]}, [r0], r1
|
||||
vst1.8 {d0[5]}, [r3], r1
|
||||
vst1.8 {d0[6]}, [r0], r1
|
||||
vst1.8 {d0[7]}, [r3], r1
|
||||
ble 0f
|
||||
b 18b
|
||||
110:
|
||||
add r4, r4, #8
|
||||
asr r1, r1, #1
|
||||
11:
|
||||
subs r4, r4, #1
|
||||
vld1.8 {d0[]}, [r2]!
|
||||
vst1.8 {d0[0]}, [r0], r1
|
||||
bgt 11b
|
||||
0:
|
||||
pop {r4,pc}
|
||||
|
||||
20:
|
||||
add r3, r0, r1
|
||||
lsl r1, r1, #1
|
||||
24:
|
||||
subs r4, r4, #4
|
||||
blt 210f
|
||||
vld1.16 {d0}, [r2, :64]!
|
||||
vst1.16 {d0[0]}, [r0, :16], r1
|
||||
vst1.16 {d0[1]}, [r3, :16], r1
|
||||
vst1.16 {d0[2]}, [r0, :16], r1
|
||||
vst1.16 {d0[3]}, [r3, :16], r1
|
||||
ble 0f
|
||||
b 24b
|
||||
210:
|
||||
add r4, r4, #4
|
||||
asr r1, r1, #1
|
||||
22:
|
||||
subs r4, r4, #1
|
||||
vld1.16 {d0[]}, [r2, :16]!
|
||||
vst1.16 {d0[0]}, [r0, :16], r1
|
||||
bgt 22b
|
||||
0:
|
||||
pop {r4,pc}
|
||||
|
||||
30:
|
||||
ldrh r3, [r2]
|
||||
ldrb r12, [r2, #2]
|
||||
add r2, r2, #3
|
||||
subs r4, r4, #1
|
||||
strh r3, [r0]
|
||||
strb r12, [r0, #2]
|
||||
add r0, r0, r1
|
||||
bgt 30b
|
||||
pop {r4,pc}
|
||||
|
||||
40:
|
||||
add r3, r0, r1
|
||||
lsl r1, r1, #1
|
||||
42:
|
||||
subs r4, r4, #2
|
||||
blt 41f
|
||||
vld1.8 {d0}, [r2, :64]!
|
||||
vst1.32 {d0[0]}, [r0, :32], r1
|
||||
vst1.32 {d0[1]}, [r3, :32], r1
|
||||
ble 0f
|
||||
b 42b
|
||||
41:
|
||||
vld1.32 {d0[]}, [r2, :32]
|
||||
vst1.32 {d0[0]}, [r0, :32]
|
||||
0:
|
||||
pop {r4,pc}
|
||||
|
||||
50:
|
||||
ldr r3, [r2]
|
||||
ldrb r12, [r2, #4]
|
||||
add r2, r2, #5
|
||||
subs r4, r4, #1
|
||||
str r3, [r0]
|
||||
strb r12, [r0, #4]
|
||||
add r0, r0, r1
|
||||
bgt 50b
|
||||
pop {r4,pc}
|
||||
|
||||
60:
|
||||
ldr r3, [r2]
|
||||
ldrh r12, [r2, #4]
|
||||
add r2, r2, #6
|
||||
subs r4, r4, #1
|
||||
str r3, [r0]
|
||||
strh r12, [r0, #4]
|
||||
add r0, r0, r1
|
||||
bgt 60b
|
||||
pop {r4,pc}
|
||||
|
||||
70:
|
||||
ldr r3, [r2]
|
||||
ldrh r12, [r2, #4]
|
||||
ldrb lr, [r2, #6]
|
||||
add r2, r2, #7
|
||||
subs r4, r4, #1
|
||||
str r3, [r0]
|
||||
strh r12, [r0, #4]
|
||||
strb lr, [r0, #6]
|
||||
add r0, r0, r1
|
||||
bgt 70b
|
||||
pop {r4,pc}
|
||||
endfunc
|
||||
|
||||
#define SUM_STRIDE (384+16)
|
||||
|
||||
#include "looprestoration_tmpl.S"
|
||||
|
@ -694,25 +429,15 @@ function sgr_box3_h_8bpc_neon, export=1
|
|||
mov r9, #(2*2*SUM_STRIDE) // double sum stride
|
||||
|
||||
// Subtract the aligned width from the output stride.
|
||||
// With LR_HAVE_RIGHT, align to 8, without it, align to 4.
|
||||
tst r7, #2 // LR_HAVE_RIGHT
|
||||
bne 0f
|
||||
// !LR_HAVE_RIGHT
|
||||
add lr, r5, #3
|
||||
bic lr, lr, #3
|
||||
b 1f
|
||||
0:
|
||||
add lr, r5, #7
|
||||
bic lr, lr, #7
|
||||
1:
|
||||
sub r9, r9, lr, lsl #1
|
||||
|
||||
// Store the width for the vertical loop
|
||||
mov r8, r5
|
||||
|
||||
// Subtract the number of pixels read from the input from the stride
|
||||
add lr, r5, #14
|
||||
bic lr, lr, #7
|
||||
add lr, lr, #8
|
||||
sub r4, r4, lr
|
||||
|
||||
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
|
||||
|
@ -781,34 +506,30 @@ function sgr_box3_h_8bpc_neon, export=1
|
|||
// Restore r11 after using it for a temporary value
|
||||
add r11, r1, #(2*SUM_STRIDE)
|
||||
3: // !LR_HAVE_RIGHT
|
||||
// If we'll have to pad the right edge we need to quit early here.
|
||||
|
||||
// Check whether we need to pad the right edge
|
||||
cmp r5, #10
|
||||
bge 4f // If w >= 10, all used input pixels are valid
|
||||
cmp r5, #6
|
||||
bge 5f // If w >= 6, we can filter 4 pixels
|
||||
b 6f
|
||||
|
||||
// 1 <= w < 10, w pixels valid in q0. For w=9, this ends up called
|
||||
// again; it's not strictly needed in those cases (we pad enough here),
|
||||
// but keeping the code as simple as possible.
|
||||
|
||||
// Insert padding in q0/4.b[w] onwards
|
||||
movrel_local lr, right_ext_mask
|
||||
sub lr, lr, r5
|
||||
vld1.8 {q13}, [lr]
|
||||
|
||||
vbit q0, q14, q13
|
||||
vbit q4, q15, q13
|
||||
|
||||
// Update the precalculated squares
|
||||
vmull.u8 q1, d0, d0
|
||||
vmull.u8 q2, d1, d1
|
||||
vmull.u8 q5, d8, d8
|
||||
vmull.u8 q6, d9, d9
|
||||
|
||||
4: // Loop horizontally
|
||||
.macro vaddl_u16_n dst1, dst2, src1, src2, src3, src4, w
|
||||
vaddl.u16 \dst1, \src1, \src3
|
||||
.if \w > 4
|
||||
vaddl.u16 \dst2, \src2, \src4
|
||||
.endif
|
||||
.endm
|
||||
.macro vaddw_u16_n dst1, dst2, src1, src2, w
|
||||
vaddw.u16 \dst1, \dst1, \src1
|
||||
.if \w > 4
|
||||
vaddw.u16 \dst2, \dst2, \src2
|
||||
.endif
|
||||
.endm
|
||||
.macro vadd_i32_n dst1, dst2, src1, src2, w
|
||||
vadd.i32 \dst1, \dst1, \src1
|
||||
.if \w > 4
|
||||
vadd.i32 \dst2, \dst2, \src2
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro add3 w
|
||||
vext.8 d16, d0, d1, #1
|
||||
vext.8 d17, d0, d1, #2
|
||||
vext.8 d18, d8, d9, #1
|
||||
|
@ -823,19 +544,22 @@ function sgr_box3_h_8bpc_neon, export=1
|
|||
vext.8 q10, q5, q6, #2
|
||||
vext.8 q11, q5, q6, #4
|
||||
|
||||
vaddl_u16_n q12, q13, d2, d3, d16, d17, \w
|
||||
vaddw_u16_n q12, q13, d18, d19, \w
|
||||
vaddl.u16 q12, d2, d16
|
||||
vaddl.u16 q13, d3, d17
|
||||
vaddw.u16 q12, q12, d18
|
||||
vaddw.u16 q13, q13, d19
|
||||
|
||||
vaddl_u16_n q8, q9, d10, d11, d20, d21, \w
|
||||
vaddw_u16_n q8, q9, d22, d23, \w
|
||||
.endm
|
||||
add3 8
|
||||
vaddl.u16 q8, d10, d20
|
||||
vaddl.u16 q9, d11, d21
|
||||
vaddw.u16 q8, q8, d22
|
||||
vaddw.u16 q9, q9, d23
|
||||
|
||||
subs r5, r5, #8
|
||||
vst1.16 {q3}, [r1, :128]!
|
||||
vst1.16 {q7}, [r11, :128]!
|
||||
vst1.32 {q12, q13}, [r0, :128]!
|
||||
vst1.32 {q8, q9}, [r10, :128]!
|
||||
|
||||
subs r5, r5, #8
|
||||
ble 9f
|
||||
tst r7, #2 // LR_HAVE_RIGHT
|
||||
vld1.8 {d6}, [r3]!
|
||||
|
@ -850,86 +574,6 @@ function sgr_box3_h_8bpc_neon, export=1
|
|||
bne 4b // If we don't need to pad, just keep summing.
|
||||
b 3b // If we need to pad, check how many pixels we have left.
|
||||
|
||||
5: // Produce 4 pixels, 6 <= w < 10
|
||||
add3 4
|
||||
vst1.16 {d6}, [r1, :64]!
|
||||
vst1.16 {d14}, [r11, :64]!
|
||||
vst1.32 {q12}, [r0, :128]!
|
||||
vst1.32 {q8}, [r10, :128]!
|
||||
|
||||
subs r5, r5, #4 // 2 <= w < 6
|
||||
vext.8 q0, q0, q0, #4
|
||||
vext.8 q4, q4, q4, #4
|
||||
|
||||
6: // Pad the right edge and produce the last few pixels.
|
||||
// 2 <= w < 6, 2-5 pixels valid in q0
|
||||
sub lr, r5, #2
|
||||
// lr = (pixels valid - 2)
|
||||
adr r11, L(box3_variable_shift_tbl)
|
||||
ldr lr, [r11, lr, lsl #2]
|
||||
add r11, r11, lr
|
||||
bx r11
|
||||
|
||||
.align 2
|
||||
L(box3_variable_shift_tbl):
|
||||
.word 22f - L(box3_variable_shift_tbl) + CONFIG_THUMB
|
||||
.word 33f - L(box3_variable_shift_tbl) + CONFIG_THUMB
|
||||
.word 44f - L(box3_variable_shift_tbl) + CONFIG_THUMB
|
||||
.word 55f - L(box3_variable_shift_tbl) + CONFIG_THUMB
|
||||
|
||||
// Shift q0 right, shifting out invalid pixels,
|
||||
// shift q0 left to the original offset, shifting in padding pixels.
|
||||
22: // 2 pixels valid
|
||||
vext.8 q0, q0, q0, #2
|
||||
vext.8 q4, q4, q4, #2
|
||||
vext.8 q0, q0, q14, #14
|
||||
vext.8 q4, q4, q15, #14
|
||||
b 88f
|
||||
33: // 3 pixels valid
|
||||
vext.8 q0, q0, q0, #3
|
||||
vext.8 q4, q4, q4, #3
|
||||
vext.8 q0, q0, q14, #13
|
||||
vext.8 q4, q4, q15, #13
|
||||
b 88f
|
||||
44: // 4 pixels valid
|
||||
vext.8 q0, q0, q0, #4
|
||||
vext.8 q4, q4, q4, #4
|
||||
vext.8 q0, q0, q14, #12
|
||||
vext.8 q4, q4, q15, #12
|
||||
b 88f
|
||||
55: // 5 pixels valid
|
||||
vext.8 q0, q0, q0, #5
|
||||
vext.8 q4, q4, q4, #5
|
||||
vext.8 q0, q0, q14, #11
|
||||
vext.8 q4, q4, q15, #11
|
||||
|
||||
88:
|
||||
// Restore r11 after using it for a temporary value above
|
||||
add r11, r1, #(2*SUM_STRIDE)
|
||||
vmull.u8 q1, d0, d0
|
||||
vmull.u8 q2, d1, d1
|
||||
vmull.u8 q5, d8, d8
|
||||
vmull.u8 q6, d9, d9
|
||||
|
||||
add3 4
|
||||
subs r5, r5, #4
|
||||
vst1.16 {d6}, [r1, :64]!
|
||||
vst1.16 {d14}, [r11, :64]!
|
||||
vst1.32 {q12}, [r0, :128]!
|
||||
vst1.32 {q8}, [r10, :128]!
|
||||
ble 9f
|
||||
vext.8 q0, q0, q0, #4
|
||||
vext.8 q1, q1, q2, #8
|
||||
vext.8 q4, q4, q4, #4
|
||||
vext.8 q5, q5, q6, #8
|
||||
// Only one needed pixel left, but do a normal 4 pixel
|
||||
// addition anyway
|
||||
add3 4
|
||||
vst1.16 {d6}, [r1, :64]!
|
||||
vst1.16 {d14}, [r11, :64]!
|
||||
vst1.32 {q12}, [r0, :128]!
|
||||
vst1.32 {q8}, [r10, :128]!
|
||||
|
||||
9:
|
||||
subs r6, r6, #2
|
||||
ble 0f
|
||||
|
@ -945,7 +589,6 @@ L(box3_variable_shift_tbl):
|
|||
0:
|
||||
vpop {q4-q7}
|
||||
pop {r4-r11,pc}
|
||||
.purgem add3
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
|
||||
|
@ -968,23 +611,11 @@ function sgr_box5_h_8bpc_neon, export=1
|
|||
mov r9, #(2*2*SUM_STRIDE) // double sum stride
|
||||
|
||||
// Subtract the aligned width from the output stride.
|
||||
// With LR_HAVE_RIGHT, align to 8, without it, align to 4.
|
||||
// Subtract the number of pixels read from the input from the stride.
|
||||
tst r7, #2 // LR_HAVE_RIGHT
|
||||
bne 0f
|
||||
// !LR_HAVE_RIGHT
|
||||
add lr, r5, #3
|
||||
bic lr, lr, #3
|
||||
add r8, r5, #13
|
||||
b 1f
|
||||
0:
|
||||
add lr, r5, #7
|
||||
bic lr, lr, #7
|
||||
add r8, r5, #15
|
||||
1:
|
||||
sub r9, r9, lr, lsl #1
|
||||
bic r8, r8, #7
|
||||
sub r4, r4, r8
|
||||
add lr, lr, #8
|
||||
sub r4, r4, lr
|
||||
|
||||
// Store the width for the vertical loop
|
||||
mov r8, r5
|
||||
|
@ -1054,15 +685,31 @@ function sgr_box5_h_8bpc_neon, export=1
|
|||
// Restore r11 after using it for a temporary value
|
||||
add r11, r1, #(2*SUM_STRIDE)
|
||||
3: // !LR_HAVE_RIGHT
|
||||
// If we'll have to pad the right edge we need to quit early here.
|
||||
|
||||
// Check whether we need to pad the right edge
|
||||
cmp r5, #11
|
||||
bge 4f // If w >= 11, all used input pixels are valid
|
||||
cmp r5, #7
|
||||
bge 5f // If w >= 7, we can produce 4 pixels
|
||||
b 6f
|
||||
|
||||
// 1 <= w < 11, w+1 pixels valid in q0. For w=9 or w=10,
|
||||
// this ends up called again; it's not strictly needed in those
|
||||
// cases (we pad enough here), but keeping the code as simple as possible.
|
||||
|
||||
// Insert padding in q0/4.b[w+1] onwards; fuse the +1 into the
|
||||
// buffer pointer.
|
||||
movrel_local lr, right_ext_mask, -1
|
||||
sub lr, lr, r5
|
||||
vld1.8 {q13}, [lr]
|
||||
|
||||
vbit q0, q14, q13
|
||||
vbit q4, q15, q13
|
||||
|
||||
// Update the precalculated squares
|
||||
vmull.u8 q1, d0, d0
|
||||
vmull.u8 q2, d1, d1
|
||||
vmull.u8 q5, d8, d8
|
||||
vmull.u8 q6, d9, d9
|
||||
|
||||
4: // Loop horizontally
|
||||
.macro add5 w
|
||||
vext.8 d16, d0, d1, #1
|
||||
vext.8 d17, d0, d1, #2
|
||||
vext.8 d18, d0, d1, #3
|
||||
|
@ -1084,35 +731,33 @@ function sgr_box5_h_8bpc_neon, export=1
|
|||
vext.8 q9, q1, q2, #4
|
||||
vext.8 q10, q1, q2, #6
|
||||
vext.8 q11, q1, q2, #8
|
||||
vaddl_u16_n q12, q13, d2, d3, d16, d17, \w
|
||||
vaddl_u16_n q8, q9, d18, d19, d20, d21, \w
|
||||
vaddw_u16_n q12, q13, d22, d23, \w
|
||||
vadd_i32_n q12, q13, q8, q9, \w
|
||||
vaddl.u16 q12, d2, d16
|
||||
vaddl.u16 q13, d3, d17
|
||||
vaddl.u16 q8, d18, d20
|
||||
vaddl.u16 q9, d19, d21
|
||||
vaddw.u16 q12, q12, d22
|
||||
vaddw.u16 q13, q13, d23
|
||||
vadd.i32 q12, q12, q8
|
||||
vadd.i32 q13, q13, q9
|
||||
vext.8 q8, q5, q6, #2
|
||||
vext.8 q9, q5, q6, #4
|
||||
vext.8 q10, q5, q6, #6
|
||||
vext.8 q11, q5, q6, #8
|
||||
.if \w > 4
|
||||
vaddl_u16_n q1, q5, d10, d11, d16, d17, 8
|
||||
vaddl_u16_n q8, q9, d18, d19, d20, d21, 8
|
||||
vaddw_u16_n q1, q5, d22, d23, 8
|
||||
vaddl.u16 q1, d10, d16
|
||||
vaddl.u16 q5, d11, d17
|
||||
vaddl.u16 q8, d18, d20
|
||||
vaddl.u16 q9, d19, d21
|
||||
vaddw.u16 q1, q1, d22
|
||||
vaddw.u16 q5, q5, d23
|
||||
vadd.i32 q10, q1, q8
|
||||
vadd.i32 q11, q5, q9
|
||||
.else
|
||||
// Can't clobber q1/q5 if only doing 4 pixels
|
||||
vaddl.u16 q8, d10, d16
|
||||
vaddl.u16 q9, d18, d20
|
||||
vaddw.u16 q8, q8, d22
|
||||
vadd.i32 q10, q8, q9
|
||||
.endif
|
||||
.endm
|
||||
add5 8
|
||||
|
||||
subs r5, r5, #8
|
||||
vst1.16 {q3}, [r1, :128]!
|
||||
vst1.16 {q7}, [r11, :128]!
|
||||
vst1.32 {q12, q13}, [r0, :128]!
|
||||
vst1.32 {q10, q11}, [r10, :128]!
|
||||
|
||||
subs r5, r5, #8
|
||||
ble 9f
|
||||
tst r7, #2 // LR_HAVE_RIGHT
|
||||
vld1.8 {d6}, [r3]!
|
||||
|
@ -1126,98 +771,6 @@ function sgr_box5_h_8bpc_neon, export=1
|
|||
bne 4b // If we don't need to pad, just keep summing.
|
||||
b 3b // If we need to pad, check how many pixels we have left.
|
||||
|
||||
5: // Produce 4 pixels, 7 <= w < 11
|
||||
add5 4
|
||||
vst1.16 {d6}, [r1, :64]!
|
||||
vst1.16 {d14}, [r11, :64]!
|
||||
vst1.32 {q12}, [r0, :128]!
|
||||
vst1.32 {q10}, [r10, :128]!
|
||||
|
||||
subs r5, r5, #4 // 3 <= w < 7
|
||||
vext.8 q0, q0, q0, #4
|
||||
vext.8 q4, q4, q4, #4
|
||||
|
||||
6: // Pad the right edge and produce the last few pixels.
|
||||
// w < 7, w+1 pixels valid in q0/q4
|
||||
sub lr, r5, #1
|
||||
// lr = pixels valid - 2
|
||||
adr r11, L(box5_variable_shift_tbl)
|
||||
ldr lr, [r11, lr, lsl #2]
|
||||
add r11, r11, lr
|
||||
bx r11
|
||||
|
||||
.align 2
|
||||
L(box5_variable_shift_tbl):
|
||||
.word 22f - L(box5_variable_shift_tbl) + CONFIG_THUMB
|
||||
.word 33f - L(box5_variable_shift_tbl) + CONFIG_THUMB
|
||||
.word 44f - L(box5_variable_shift_tbl) + CONFIG_THUMB
|
||||
.word 55f - L(box5_variable_shift_tbl) + CONFIG_THUMB
|
||||
.word 66f - L(box5_variable_shift_tbl) + CONFIG_THUMB
|
||||
.word 77f - L(box5_variable_shift_tbl) + CONFIG_THUMB
|
||||
|
||||
// Shift q0 right, shifting out invalid pixels,
|
||||
// shift q0 left to the original offset, shifting in padding pixels.
|
||||
22: // 2 pixels valid
|
||||
vext.8 q0, q0, q0, #2
|
||||
vext.8 q4, q4, q4, #2
|
||||
vext.8 q0, q0, q14, #14
|
||||
vext.8 q4, q4, q15, #14
|
||||
b 88f
|
||||
33: // 3 pixels valid
|
||||
vext.8 q0, q0, q0, #3
|
||||
vext.8 q4, q4, q4, #3
|
||||
vext.8 q0, q0, q14, #13
|
||||
vext.8 q4, q4, q15, #13
|
||||
b 88f
|
||||
44: // 4 pixels valid
|
||||
vext.8 q0, q0, q0, #4
|
||||
vext.8 q4, q4, q4, #4
|
||||
vext.8 q0, q0, q14, #12
|
||||
vext.8 q4, q4, q15, #12
|
||||
b 88f
|
||||
55: // 5 pixels valid
|
||||
vext.8 q0, q0, q0, #5
|
||||
vext.8 q4, q4, q4, #5
|
||||
vext.8 q0, q0, q14, #11
|
||||
vext.8 q4, q4, q15, #11
|
||||
b 88f
|
||||
66: // 6 pixels valid
|
||||
vext.8 q0, q0, q0, #6
|
||||
vext.8 q4, q4, q4, #6
|
||||
vext.8 q0, q0, q14, #10
|
||||
vext.8 q4, q4, q15, #10
|
||||
b 88f
|
||||
77: // 7 pixels valid
|
||||
vext.8 q0, q0, q0, #7
|
||||
vext.8 q4, q4, q4, #7
|
||||
vext.8 q0, q0, q14, #9
|
||||
vext.8 q4, q4, q15, #9
|
||||
|
||||
88:
|
||||
// Restore r11 after using it for a temporary value above
|
||||
add r11, r1, #(2*SUM_STRIDE)
|
||||
vmull.u8 q1, d0, d0
|
||||
vmull.u8 q2, d1, d1
|
||||
vmull.u8 q5, d8, d8
|
||||
vmull.u8 q6, d9, d9
|
||||
|
||||
add5 4
|
||||
subs r5, r5, #4
|
||||
vst1.16 {d6}, [r1, :64]!
|
||||
vst1.16 {d14}, [r11, :64]!
|
||||
vst1.32 {q12}, [r0, :128]!
|
||||
vst1.32 {q10}, [r10, :128]!
|
||||
ble 9f
|
||||
vext.8 q0, q0, q0, #4
|
||||
vext.8 q1, q1, q2, #8
|
||||
vext.8 q4, q4, q4, #4
|
||||
vext.8 q5, q5, q6, #8
|
||||
add5 4
|
||||
vst1.16 {d6}, [r1, :64]!
|
||||
vst1.16 {d14}, [r11, :64]!
|
||||
vst1.32 {q12}, [r0, :128]!
|
||||
vst1.32 {q10}, [r10, :128]!
|
||||
|
||||
9:
|
||||
subs r6, r6, #2
|
||||
ble 0f
|
||||
|
@ -1233,7 +786,6 @@ L(box5_variable_shift_tbl):
|
|||
0:
|
||||
vpop {q4-q7}
|
||||
pop {r4-r11,pc}
|
||||
.purgem add5
|
||||
endfunc
|
||||
|
||||
sgr_funcs 8
|
||||
|
|
|
@ -28,6 +28,18 @@
|
|||
#include "src/arm/asm.S"
|
||||
#include "util.S"
|
||||
|
||||
const right_ext_mask_buf
|
||||
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
|
||||
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
|
||||
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
|
||||
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
|
||||
right_ext_mask:
|
||||
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
|
||||
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
|
||||
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
|
||||
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
|
||||
endconst
|
||||
|
||||
// void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4],
|
||||
// const pixel *src, ptrdiff_t stride,
|
||||
// const int16_t fh[7], const intptr_t w,
|
||||
|
@ -55,27 +67,19 @@ function wiener_filter_h_16bpc_neon, export=1
|
|||
bic r10, r10, #7
|
||||
lsl r10, r10, #1
|
||||
|
||||
// Clear the last unused element of q0, to allow filtering a single
|
||||
// pixel with one plain vmul+vpadd.
|
||||
mov r12, #0
|
||||
vmov.16 d1[3], r12
|
||||
|
||||
// Set up pointers for reading/writing alternate rows
|
||||
add r12, r0, r10
|
||||
lsl r10, r10, #1
|
||||
add lr, r2, r3
|
||||
lsl r3, r3, #1
|
||||
|
||||
// Subtract the width from mid_stride
|
||||
sub r10, r10, r5, lsl #1
|
||||
|
||||
// For w >= 8, we read (w+5)&~7+8 pixels, for w < 8 we read 16 pixels.
|
||||
cmp r5, #8
|
||||
add r11, r5, #13
|
||||
// Subtract the aligned width from mid_stride
|
||||
add r11, r5, #7
|
||||
bic r11, r11, #7
|
||||
bge 1f
|
||||
mov r11, #16
|
||||
1:
|
||||
sub r10, r10, r11, lsl #1
|
||||
|
||||
// Subtract the number of pixels read from the source stride
|
||||
add r11, r11, #8
|
||||
sub r3, r3, r11, lsl #1
|
||||
|
||||
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
|
||||
|
@ -143,54 +147,62 @@ function wiener_filter_h_16bpc_neon, export=1
|
|||
vdup.16 q11, r11
|
||||
vdup.16 q12, r9
|
||||
3: // !LR_HAVE_RIGHT
|
||||
// If we'll have to pad the right edge we need to quit early here.
|
||||
|
||||
// Check whether we need to pad the right edge
|
||||
cmp r5, #11
|
||||
bge 4f // If w >= 11, all used input pixels are valid
|
||||
cmp r5, #7
|
||||
bge 5f // If w >= 7, we can filter 4 pixels
|
||||
b 6f
|
||||
|
||||
// 1 <= w < 11, w+3 pixels valid in q2-q3. For w=9 or w=10,
|
||||
// this ends up called again; it's not strictly needed in those
|
||||
// cases (we pad enough here), but keeping the code as simple as possible.
|
||||
|
||||
// Insert padding in q2/3.h[w+3] onwards; fuse the +3 (*2) into the
|
||||
// buffer pointer.
|
||||
movrel_local r4, right_ext_mask, -6
|
||||
sub r4, r4, r5, lsl #1
|
||||
vld1.8 {q9, q10}, [r4]
|
||||
|
||||
vbit q2, q11, q9
|
||||
vbit q3, q11, q10
|
||||
vbit q4, q12, q9
|
||||
vbit q5, q12, q10
|
||||
|
||||
4: // Loop horizontally
|
||||
vext.8 q8, q2, q3, #2
|
||||
vext.8 q9, q2, q3, #4
|
||||
vext.8 q10, q2, q3, #6
|
||||
vmull.s16 q6, d4, d0[0]
|
||||
vmlal.s16 q6, d16, d0[1]
|
||||
vmlal.s16 q6, d18, d0[2]
|
||||
vmlal.s16 q6, d20, d0[3]
|
||||
vmull.s16 q7, d5, d0[0]
|
||||
vmlal.s16 q7, d17, d0[1]
|
||||
vmlal.s16 q7, d19, d0[2]
|
||||
vmlal.s16 q7, d21, d0[3]
|
||||
vext.8 q7, q2, q3, #4
|
||||
vext.8 q8, q2, q3, #8
|
||||
vext.8 q6, q2, q3, #2
|
||||
vext.8 q9, q2, q3, #10
|
||||
vext.8 q10, q2, q3, #12
|
||||
vadd.i16 q8, q8, q7
|
||||
vadd.i16 q9, q9, q6
|
||||
vext.8 q6, q2, q3, #12
|
||||
vext.8 q7, q2, q3, #6
|
||||
vadd.i16 q2, q2, q6
|
||||
vmull.s16 q6, d14, d0[3]
|
||||
vmlal.s16 q6, d16, d1[0]
|
||||
vmlal.s16 q6, d18, d1[1]
|
||||
vmlal.s16 q6, d20, d1[2]
|
||||
vmlal.s16 q6, d4, d1[2]
|
||||
vmull.s16 q7, d15, d0[3]
|
||||
vmlal.s16 q7, d17, d1[0]
|
||||
vmlal.s16 q7, d19, d1[1]
|
||||
vmlal.s16 q7, d21, d1[2]
|
||||
vext.8 q2, q4, q5, #2
|
||||
vext.8 q10, q4, q5, #6
|
||||
vmull.s16 q8, d8, d0[0]
|
||||
vmlal.s16 q8, d4, d0[1]
|
||||
vmlal.s16 q8, d20, d0[3]
|
||||
vmull.s16 q9, d9, d0[0]
|
||||
vmlal.s16 q9, d5, d0[1]
|
||||
vmlal.s16 q9, d21, d0[3]
|
||||
vext.8 q2, q4, q5, #4
|
||||
vmlal.s16 q7, d5, d1[2]
|
||||
|
||||
vext.8 q8, q4, q5, #4
|
||||
vext.8 q10, q4, q5, #8
|
||||
vmlal.s16 q8, d4, d0[2]
|
||||
vmlal.s16 q8, d20, d1[0]
|
||||
vmlal.s16 q9, d5, d0[2]
|
||||
vmlal.s16 q9, d21, d1[0]
|
||||
vext.8 q9, q4, q5, #2
|
||||
vext.8 q2, q4, q5, #10
|
||||
vext.8 q10, q4, q5, #12
|
||||
vadd.i16 q10, q10, q8
|
||||
vadd.i16 q2, q2, q9
|
||||
vext.8 q8, q4, q5, #12
|
||||
vext.8 q9, q4, q5, #6
|
||||
vadd.i16 q4, q4, q8
|
||||
vmull.s16 q8, d18, d0[3]
|
||||
vmlal.s16 q8, d20, d1[0]
|
||||
vmlal.s16 q8, d4, d1[1]
|
||||
vmlal.s16 q8, d20, d1[2]
|
||||
vmlal.s16 q8, d8, d1[2]
|
||||
vmull.s16 q9, d19, d0[3]
|
||||
vmlal.s16 q9, d21, d1[0]
|
||||
vmlal.s16 q9, d5, d1[1]
|
||||
vmlal.s16 q9, d21, d1[2]
|
||||
vmlal.s16 q9, d9, d1[2]
|
||||
|
||||
vmvn.i16 q10, #0x8000 // 0x7fff = (1 << 15) - 1
|
||||
vadd.i32 q6, q6, q14
|
||||
|
@ -209,10 +221,10 @@ function wiener_filter_h_16bpc_neon, export=1
|
|||
vmin.u16 q7, q7, q10
|
||||
vsub.i16 q6, q6, q15
|
||||
vsub.i16 q7, q7, q15
|
||||
subs r5, r5, #8
|
||||
vst1.16 {q6}, [r0, :128]!
|
||||
vst1.16 {q7}, [r12, :128]!
|
||||
|
||||
subs r5, r5, #8
|
||||
ble 9f
|
||||
tst r7, #2 // LR_HAVE_RIGHT
|
||||
vmov q2, q3
|
||||
|
@ -222,148 +234,6 @@ function wiener_filter_h_16bpc_neon, export=1
|
|||
bne 4b // If we don't need to pad, just keep filtering.
|
||||
b 3b // If we need to pad, check how many pixels we have left.
|
||||
|
||||
5: // Filter 4 pixels, 7 <= w < 11
|
||||
.macro filter_4
|
||||
vext.8 d18, d4, d5, #6
|
||||
vext.8 d16, d4, d5, #2
|
||||
vext.8 d17, d4, d5, #4
|
||||
vext.8 d19, d5, d6, #2
|
||||
vext.8 d20, d5, d6, #4
|
||||
vmull.s16 q6, d4, d0[0]
|
||||
vmlal.s16 q6, d16, d0[1]
|
||||
vmlal.s16 q6, d17, d0[2]
|
||||
vmlal.s16 q6, d18, d0[3]
|
||||
vmlal.s16 q6, d5, d1[0]
|
||||
vmlal.s16 q6, d19, d1[1]
|
||||
vmlal.s16 q6, d20, d1[2]
|
||||
|
||||
vext.8 d18, d8, d9, #6
|
||||
vext.8 d16, d8, d9, #2
|
||||
vext.8 d17, d8, d9, #4
|
||||
vext.8 d19, d9, d10, #2
|
||||
vext.8 d20, d9, d10, #4
|
||||
vmull.s16 q7, d8, d0[0]
|
||||
vmlal.s16 q7, d16, d0[1]
|
||||
vmlal.s16 q7, d17, d0[2]
|
||||
vmlal.s16 q7, d18, d0[3]
|
||||
vmlal.s16 q7, d9, d1[0]
|
||||
vmlal.s16 q7, d19, d1[1]
|
||||
vmlal.s16 q7, d20, d1[2]
|
||||
|
||||
vmvn.i16 q10, #0x8000 // 0x7fff = (1 << 15) - 1
|
||||
vadd.i32 q6, q6, q14
|
||||
vadd.i32 q7, q7, q14
|
||||
vrshl.s32 q6, q6, q13
|
||||
vrshl.s32 q7, q7, q13
|
||||
vqmovun.s32 d12, q6
|
||||
vqmovun.s32 d13, q7
|
||||
vmin.u16 q6, q6, q10
|
||||
vsub.i16 q6, q6, q15
|
||||
.endm
|
||||
filter_4
|
||||
vst1.16 {d12}, [r0, :64]!
|
||||
vst1.16 {d13}, [r12, :64]!
|
||||
|
||||
subs r5, r5, #4 // 3 <= w < 7
|
||||
vext.8 q2, q2, q3, #8
|
||||
vext.8 q3, q3, q3, #8
|
||||
vext.8 q4, q4, q5, #8
|
||||
vext.8 q5, q5, q5, #8
|
||||
|
||||
6: // Pad the right edge and filter the last few pixels.
|
||||
// w < 7, w+3 pixels valid in q2-q3
|
||||
cmp r5, #5
|
||||
blt 7f
|
||||
bgt 8f
|
||||
// w == 5, 8 pixels valid in q2, q3 invalid
|
||||
vmov q3, q11
|
||||
vmov q5, q12
|
||||
b 88f
|
||||
|
||||
7: // 1 <= w < 5, 4-7 pixels valid in q2
|
||||
sub r9, r5, #1
|
||||
// r9 = (pixels valid - 4)
|
||||
adr r11, L(variable_shift_tbl)
|
||||
ldr r9, [r11, r9, lsl #2]
|
||||
add r11, r11, r9
|
||||
vmov q3, q11
|
||||
vmov q5, q12
|
||||
bx r11
|
||||
|
||||
.align 2
|
||||
L(variable_shift_tbl):
|
||||
.word 44f - L(variable_shift_tbl) + CONFIG_THUMB
|
||||
.word 55f - L(variable_shift_tbl) + CONFIG_THUMB
|
||||
.word 66f - L(variable_shift_tbl) + CONFIG_THUMB
|
||||
.word 77f - L(variable_shift_tbl) + CONFIG_THUMB
|
||||
|
||||
44: // 4 pixels valid in q2/q4, fill the high half with padding.
|
||||
vmov d5, d6
|
||||
vmov d9, d10
|
||||
b 88f
|
||||
// Shift q2 right, shifting out invalid pixels,
|
||||
// shift q2 left to the original offset, shifting in padding pixels.
|
||||
55: // 5 pixels valid
|
||||
vext.8 q2, q2, q2, #10
|
||||
vext.8 q2, q2, q3, #6
|
||||
vext.8 q4, q4, q4, #10
|
||||
vext.8 q4, q4, q5, #6
|
||||
b 88f
|
||||
66: // 6 pixels valid
|
||||
vext.8 q2, q2, q2, #12
|
||||
vext.8 q2, q2, q3, #4
|
||||
vext.8 q4, q4, q4, #12
|
||||
vext.8 q4, q4, q5, #4
|
||||
b 88f
|
||||
77: // 7 pixels valid
|
||||
vext.8 q2, q2, q2, #14
|
||||
vext.8 q2, q2, q3, #2
|
||||
vext.8 q4, q4, q4, #14
|
||||
vext.8 q4, q4, q5, #2
|
||||
b 88f
|
||||
|
||||
8: // w > 5, w == 6, 9 pixels valid in q2-q3, 1 pixel valid in q3
|
||||
vext.8 q3, q3, q3, #2
|
||||
vext.8 q3, q3, q11, #14
|
||||
vext.8 q5, q5, q5, #2
|
||||
vext.8 q5, q5, q12, #14
|
||||
|
||||
88:
|
||||
// w < 7, q2-q3 padded properly
|
||||
cmp r5, #4
|
||||
blt 888f
|
||||
|
||||
// w >= 4, filter 4 pixels
|
||||
filter_4
|
||||
vst1.16 {d12}, [r0, :64]!
|
||||
vst1.16 {d13}, [r12, :64]!
|
||||
subs r5, r5, #4 // 0 <= w < 4
|
||||
vext.8 q2, q2, q3, #8
|
||||
vext.8 q4, q4, q5, #8
|
||||
beq 9f
|
||||
888: // 1 <= w < 4, filter 1 pixel at a time
|
||||
vmull.s16 q6, d4, d0
|
||||
vmull.s16 q7, d5, d1
|
||||
vmull.s16 q8, d8, d0
|
||||
vmull.s16 q9, d9, d1
|
||||
vadd.i32 q6, q7
|
||||
vadd.i32 q8, q9
|
||||
vpadd.i32 d12, d12, d13
|
||||
vpadd.i32 d13, d16, d17
|
||||
vpadd.i32 d12, d12, d13
|
||||
vadd.i32 d12, d12, d28
|
||||
vmvn.i16 d20, #0x8000 // 0x7fff = (1 << 15) - 1
|
||||
vrshl.s32 d12, d12, d26
|
||||
vqmovun.s32 d12, q6
|
||||
vmin.u16 d12, d12, d20
|
||||
vsub.i16 d12, d12, d30
|
||||
vst1.16 {d12[0]}, [r0, :16]!
|
||||
vst1.16 {d12[1]}, [r12, :16]!
|
||||
subs r5, r5, #1
|
||||
vext.8 q2, q2, q3, #2
|
||||
vext.8 q4, q4, q5, #2
|
||||
bgt 888b
|
||||
|
||||
9:
|
||||
subs r6, r6, #2
|
||||
ble 0f
|
||||
|
@ -377,7 +247,6 @@ L(variable_shift_tbl):
|
|||
0:
|
||||
vpop {q4-q7}
|
||||
pop {r4-r11,pc}
|
||||
.purgem filter_4
|
||||
endfunc
|
||||
|
||||
// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, ptrdiff_t stride,
|
||||
|
@ -457,7 +326,7 @@ function wiener_filter_v_16bpc_neon, export=1
|
|||
vqmovun.s32 d4, q2
|
||||
vqmovun.s32 d5, q3
|
||||
vmin.u16 q2, q2, q5 // bitdepth_max
|
||||
vst1.16 {q2}, [r0], r1
|
||||
vst1.16 {q2}, [r0, :128], r1
|
||||
.if \compare
|
||||
cmp r4, #4
|
||||
.else
|
||||
|
@ -567,143 +436,6 @@ function wiener_filter_v_16bpc_neon, export=1
|
|||
.purgem filter
|
||||
endfunc
|
||||
|
||||
// void dav1d_copy_narrow_16bpc_neon(pixel *dst, ptrdiff_t stride,
|
||||
// const pixel *src, int w, int h);
|
||||
function copy_narrow_16bpc_neon, export=1
|
||||
push {r4,lr}
|
||||
ldr r4, [sp, #8]
|
||||
adr r12, L(copy_narrow_tbl)
|
||||
ldr r3, [r12, r3, lsl #2]
|
||||
add r12, r12, r3
|
||||
bx r12
|
||||
|
||||
.align 2
|
||||
L(copy_narrow_tbl):
|
||||
.word 0
|
||||
.word 10f - L(copy_narrow_tbl) + CONFIG_THUMB
|
||||
.word 20f - L(copy_narrow_tbl) + CONFIG_THUMB
|
||||
.word 30f - L(copy_narrow_tbl) + CONFIG_THUMB
|
||||
.word 40f - L(copy_narrow_tbl) + CONFIG_THUMB
|
||||
.word 50f - L(copy_narrow_tbl) + CONFIG_THUMB
|
||||
.word 60f - L(copy_narrow_tbl) + CONFIG_THUMB
|
||||
.word 70f - L(copy_narrow_tbl) + CONFIG_THUMB
|
||||
|
||||
10:
|
||||
add r3, r0, r1
|
||||
lsl r1, r1, #1
|
||||
18:
|
||||
subs r4, r4, #8
|
||||
blt 110f
|
||||
vld1.16 {q0}, [r2, :128]!
|
||||
vst1.16 {d0[0]}, [r0, :16], r1
|
||||
vst1.16 {d0[1]}, [r3, :16], r1
|
||||
vst1.16 {d0[2]}, [r0, :16], r1
|
||||
vst1.16 {d0[3]}, [r3, :16], r1
|
||||
vst1.16 {d1[0]}, [r0, :16], r1
|
||||
vst1.16 {d1[1]}, [r3, :16], r1
|
||||
vst1.16 {d1[2]}, [r0, :16], r1
|
||||
vst1.16 {d1[3]}, [r3, :16], r1
|
||||
ble 0f
|
||||
b 18b
|
||||
110:
|
||||
add r4, r4, #8
|
||||
asr r1, r1, #1
|
||||
11:
|
||||
subs r4, r4, #1
|
||||
vld1.16 {d0[]}, [r2]!
|
||||
vst1.16 {d0[0]}, [r0], r1
|
||||
bgt 11b
|
||||
0:
|
||||
pop {r4,pc}
|
||||
|
||||
20:
|
||||
add r3, r0, r1
|
||||
lsl r1, r1, #1
|
||||
24:
|
||||
subs r4, r4, #4
|
||||
blt 210f
|
||||
vld1.32 {q0}, [r2, :128]!
|
||||
vst1.32 {d0[0]}, [r0, :32], r1
|
||||
vst1.32 {d0[1]}, [r3, :32], r1
|
||||
vst1.32 {d1[0]}, [r0, :32], r1
|
||||
vst1.32 {d1[1]}, [r3, :32], r1
|
||||
ble 0f
|
||||
b 24b
|
||||
210:
|
||||
add r4, r4, #4
|
||||
asr r1, r1, #1
|
||||
22:
|
||||
subs r4, r4, #1
|
||||
vld1.32 {d0[]}, [r2, :32]!
|
||||
vst1.32 {d0[0]}, [r0, :32], r1
|
||||
bgt 22b
|
||||
0:
|
||||
pop {r4,pc}
|
||||
|
||||
30:
|
||||
ldr r3, [r2]
|
||||
ldrh r12, [r2, #4]
|
||||
add r2, r2, #6
|
||||
subs r4, r4, #1
|
||||
str r3, [r0]
|
||||
strh r12, [r0, #4]
|
||||
add r0, r0, r1
|
||||
bgt 30b
|
||||
pop {r4,pc}
|
||||
|
||||
40:
|
||||
add r3, r0, r1
|
||||
lsl r1, r1, #1
|
||||
42:
|
||||
subs r4, r4, #2
|
||||
blt 41f
|
||||
vld1.16 {q0}, [r2, :128]!
|
||||
vst1.16 {d0}, [r0, :64], r1
|
||||
vst1.16 {d1}, [r3, :64], r1
|
||||
ble 0f
|
||||
b 42b
|
||||
41:
|
||||
vld1.16 {d0}, [r2, :64]
|
||||
vst1.16 {d0}, [r0, :64]
|
||||
0:
|
||||
pop {r4,pc}
|
||||
|
||||
50:
|
||||
vld1.16 {d0}, [r2]
|
||||
ldrh r12, [r2, #8]
|
||||
add r2, r2, #10
|
||||
subs r4, r4, #1
|
||||
vst1.16 {d0}, [r0]
|
||||
strh r12, [r0, #8]
|
||||
add r0, r0, r1
|
||||
bgt 50b
|
||||
pop {r4,pc}
|
||||
|
||||
60:
|
||||
vld1.16 {d0}, [r2]
|
||||
ldr r12, [r2, #8]
|
||||
add r2, r2, #12
|
||||
subs r4, r4, #1
|
||||
vst1.16 {d0}, [r0]
|
||||
str r12, [r0, #8]
|
||||
add r0, r0, r1
|
||||
bgt 60b
|
||||
pop {r4,pc}
|
||||
|
||||
70:
|
||||
vld1.16 {d0}, [r2]
|
||||
ldr r12, [r2, #8]
|
||||
ldrh lr, [r2, #12]
|
||||
add r2, r2, #14
|
||||
subs r4, r4, #1
|
||||
vst1.16 {d0}, [r0]
|
||||
str r12, [r0, #8]
|
||||
strh lr, [r0, #12]
|
||||
add r0, r0, r1
|
||||
bgt 70b
|
||||
pop {r4,pc}
|
||||
endfunc
|
||||
|
||||
#define SUM_STRIDE (384+16)
|
||||
|
||||
#include "looprestoration_tmpl.S"
|
||||
|
@ -728,25 +460,15 @@ function sgr_box3_h_16bpc_neon, export=1
|
|||
mov r9, #(2*2*SUM_STRIDE) // double sum stride
|
||||
|
||||
// Subtract the aligned width from the output stride.
|
||||
// With LR_HAVE_RIGHT, align to 8, without it, align to 4.
|
||||
tst r7, #2 // LR_HAVE_RIGHT
|
||||
bne 0f
|
||||
// !LR_HAVE_RIGHT
|
||||
add lr, r5, #3
|
||||
bic lr, lr, #3
|
||||
b 1f
|
||||
0:
|
||||
add lr, r5, #7
|
||||
bic lr, lr, #7
|
||||
1:
|
||||
sub r9, r9, lr, lsl #1
|
||||
|
||||
// Store the width for the vertical loop
|
||||
mov r8, r5
|
||||
|
||||
// Subtract the number of pixels read from the input from the stride
|
||||
add lr, r5, #14
|
||||
bic lr, lr, #7
|
||||
add lr, lr, #8
|
||||
sub r4, r4, lr, lsl #1
|
||||
|
||||
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
|
||||
|
@ -815,16 +537,26 @@ function sgr_box3_h_16bpc_neon, export=1
|
|||
// Restore r11 after using it for a temporary value
|
||||
add r11, r1, #(2*SUM_STRIDE)
|
||||
3: // !LR_HAVE_RIGHT
|
||||
// If we'll have to pad the right edge we need to quit early here.
|
||||
|
||||
// Check whether we need to pad the right edge
|
||||
cmp r5, #10
|
||||
bge 4f // If w >= 10, all used input pixels are valid
|
||||
cmp r5, #6
|
||||
bge 5f // If w >= 6, we can filter 4 pixels
|
||||
b 6f
|
||||
|
||||
// 1 <= w < 10, w pixels valid in q0-q1. For w=9, this ends up called
|
||||
// again; it's not strictly needed in those cases (we pad enough here),
|
||||
// but keeping the code as simple as possible.
|
||||
|
||||
// Insert padding in q0/1.h[w] onwards
|
||||
movrel_local lr, right_ext_mask
|
||||
sub lr, lr, r5, lsl #1
|
||||
vld1.8 {q12, q13}, [lr]
|
||||
|
||||
vbit q0, q14, q12
|
||||
vbit q1, q14, q13
|
||||
vbit q4, q15, q12
|
||||
vbit q5, q15, q13
|
||||
|
||||
4: // Loop horizontally
|
||||
.macro add3 w
|
||||
.if \w > 4
|
||||
vext.8 q8, q0, q1, #2
|
||||
vext.8 q10, q4, q5, #2
|
||||
vext.8 q9, q0, q1, #4
|
||||
|
@ -833,16 +565,6 @@ function sgr_box3_h_16bpc_neon, export=1
|
|||
vadd.i16 q3, q4, q10
|
||||
vadd.i16 q2, q2, q9
|
||||
vadd.i16 q3, q3, q11
|
||||
.else
|
||||
vext.8 d16, d0, d1, #2
|
||||
vext.8 d20, d8, d9, #2
|
||||
vext.8 d18, d0, d1, #4
|
||||
vext.8 d22, d8, d9, #4
|
||||
vadd.i16 d4, d0, d16
|
||||
vadd.i16 d6, d8, d20
|
||||
vadd.i16 d4, d4, d18
|
||||
vadd.i16 d6, d6, d22
|
||||
.endif
|
||||
|
||||
vmull.u16 q6, d0, d0
|
||||
vmlal.u16 q6, d16, d16
|
||||
|
@ -850,22 +572,18 @@ function sgr_box3_h_16bpc_neon, export=1
|
|||
vmull.u16 q12, d8, d8
|
||||
vmlal.u16 q12, d20, d20
|
||||
vmlal.u16 q12, d22, d22
|
||||
.if \w > 4
|
||||
vmull.u16 q7, d1, d1
|
||||
vmlal.u16 q7, d17, d17
|
||||
vmlal.u16 q7, d19, d19
|
||||
vmull.u16 q13, d9, d9
|
||||
vmlal.u16 q13, d21, d21
|
||||
vmlal.u16 q13, d23, d23
|
||||
.endif
|
||||
.endm
|
||||
add3 8
|
||||
subs r5, r5, #8
|
||||
vst1.16 {q2}, [r1, :128]!
|
||||
vst1.16 {q3}, [r11, :128]!
|
||||
vst1.32 {q6, q7}, [r0, :128]!
|
||||
vst1.32 {q12, q13}, [r10, :128]!
|
||||
|
||||
subs r5, r5, #8
|
||||
ble 9f
|
||||
tst r7, #2 // LR_HAVE_RIGHT
|
||||
vmov q0, q1
|
||||
|
@ -876,78 +594,6 @@ function sgr_box3_h_16bpc_neon, export=1
|
|||
bne 4b // If we don't need to pad, just keep summing.
|
||||
b 3b // If we need to pad, check how many pixels we have left.
|
||||
|
||||
5: // Produce 4 pixels, 6 <= w < 10
|
||||
add3 4
|
||||
vst1.16 {d4}, [r1, :64]!
|
||||
vst1.16 {d6}, [r11, :64]!
|
||||
vst1.32 {q6}, [r0, :128]!
|
||||
vst1.32 {q12}, [r10, :128]!
|
||||
|
||||
subs r5, r5, #4 // 2 <= w < 6
|
||||
vext.8 q0, q0, q1, #8
|
||||
vext.8 q4, q4, q5, #8
|
||||
|
||||
6: // Pad the right edge and produce the last few pixels.
|
||||
// 2 <= w < 6, 2-5 pixels valid in q0
|
||||
sub lr, r5, #2
|
||||
// lr = (pixels valid - 2)
|
||||
adr r11, L(box3_variable_shift_tbl)
|
||||
ldr lr, [r11, lr, lsl #2]
|
||||
add r11, r11, lr
|
||||
bx r11
|
||||
|
||||
.align 2
|
||||
L(box3_variable_shift_tbl):
|
||||
.word 22f - L(box3_variable_shift_tbl) + CONFIG_THUMB
|
||||
.word 33f - L(box3_variable_shift_tbl) + CONFIG_THUMB
|
||||
.word 44f - L(box3_variable_shift_tbl) + CONFIG_THUMB
|
||||
.word 55f - L(box3_variable_shift_tbl) + CONFIG_THUMB
|
||||
|
||||
// Shift q0 right, shifting out invalid pixels,
|
||||
// shift q0 left to the original offset, shifting in padding pixels.
|
||||
22: // 2 pixels valid
|
||||
vext.8 q0, q0, q0, #4
|
||||
vext.8 q4, q4, q4, #4
|
||||
vext.8 q0, q0, q14, #12
|
||||
vext.8 q4, q4, q15, #12
|
||||
b 88f
|
||||
33: // 3 pixels valid
|
||||
vext.8 q0, q0, q0, #6
|
||||
vext.8 q4, q4, q4, #6
|
||||
vext.8 q0, q0, q14, #10
|
||||
vext.8 q4, q4, q15, #10
|
||||
b 88f
|
||||
44: // 4 pixels valid
|
||||
vmov d1, d28
|
||||
vmov d9, d30
|
||||
b 88f
|
||||
55: // 5 pixels valid
|
||||
vext.8 q0, q0, q0, #10
|
||||
vext.8 q4, q4, q4, #10
|
||||
vext.8 q0, q0, q14, #6
|
||||
vext.8 q4, q4, q15, #6
|
||||
|
||||
88:
|
||||
// Restore r11 after using it for a temporary value above
|
||||
add r11, r1, #(2*SUM_STRIDE)
|
||||
|
||||
add3 4
|
||||
subs r5, r5, #4
|
||||
vst1.16 {d4}, [r1, :64]!
|
||||
vst1.16 {d6}, [r11, :64]!
|
||||
vst1.32 {q6}, [r0, :128]!
|
||||
vst1.32 {q12}, [r10, :128]!
|
||||
ble 9f
|
||||
vext.8 q0, q0, q0, #8
|
||||
vext.8 q4, q4, q4, #8
|
||||
// Only one needed pixel left, but do a normal 4 pixel
|
||||
// addition anyway
|
||||
add3 4
|
||||
vst1.16 {d4}, [r1, :64]!
|
||||
vst1.16 {d6}, [r11, :64]!
|
||||
vst1.32 {q6}, [r0, :128]!
|
||||
vst1.32 {q12}, [r10, :128]!
|
||||
|
||||
9:
|
||||
subs r6, r6, #2
|
||||
ble 0f
|
||||
|
@ -963,7 +609,6 @@ L(box3_variable_shift_tbl):
|
|||
0:
|
||||
vpop {q4-q7}
|
||||
pop {r4-r11,pc}
|
||||
.purgem add3
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
|
||||
|
@ -986,23 +631,11 @@ function sgr_box5_h_16bpc_neon, export=1
|
|||
mov r9, #(2*2*SUM_STRIDE) // double sum stride
|
||||
|
||||
// Subtract the aligned width from the output stride.
|
||||
// With LR_HAVE_RIGHT, align to 8, without it, align to 4.
|
||||
// Subtract the number of pixels read from the input from the stride.
|
||||
tst r7, #2 // LR_HAVE_RIGHT
|
||||
bne 0f
|
||||
// !LR_HAVE_RIGHT
|
||||
add lr, r5, #3
|
||||
bic lr, lr, #3
|
||||
add r8, r5, #13
|
||||
b 1f
|
||||
0:
|
||||
add lr, r5, #7
|
||||
bic lr, lr, #7
|
||||
add r8, r5, #15
|
||||
1:
|
||||
sub r9, r9, lr, lsl #1
|
||||
bic r8, r8, #7
|
||||
sub r4, r4, r8, lsl #1
|
||||
add lr, lr, #8
|
||||
sub r4, r4, lr, lsl #1
|
||||
|
||||
// Store the width for the vertical loop
|
||||
mov r8, r5
|
||||
|
@ -1072,16 +705,27 @@ function sgr_box5_h_16bpc_neon, export=1
|
|||
// Restore r11 after using it for a temporary value
|
||||
add r11, r1, #(2*SUM_STRIDE)
|
||||
3: // !LR_HAVE_RIGHT
|
||||
// If we'll have to pad the right edge we need to quit early here.
|
||||
|
||||
// Check whether we need to pad the right edge
|
||||
cmp r5, #11
|
||||
bge 4f // If w >= 11, all used input pixels are valid
|
||||
cmp r5, #7
|
||||
bge 5f // If w >= 7, we can produce 4 pixels
|
||||
b 6f
|
||||
|
||||
// 1 <= w < 11, w+1 pixels valid in q0-q1. For w=9 or w=10,
|
||||
// this ends up called again; it's not strictly needed in those
|
||||
// cases (we pad enough here), but it keeps the code as simple as possible.
|
||||
|
||||
// Insert padding in q0/1.h[w+1] onwards; fuse the +1 into the
|
||||
// buffer pointer.
|
||||
movrel_local lr, right_ext_mask, -2
|
||||
sub lr, lr, r5, lsl #1
|
||||
vld1.8 {q12, q13}, [lr]
|
||||
|
||||
vbit q0, q14, q12
|
||||
vbit q1, q14, q13
|
||||
vbit q4, q15, q12
|
||||
vbit q5, q15, q13
|
||||
|
||||
4: // Loop horizontally
|
||||
.macro add5 w
|
||||
.if \w > 4
|
||||
vext.8 q8, q0, q1, #2
|
||||
vext.8 q10, q4, q5, #2
|
||||
vext.8 q9, q0, q1, #4
|
||||
|
@ -1090,16 +734,6 @@ function sgr_box5_h_16bpc_neon, export=1
|
|||
vadd.i16 q3, q4, q10
|
||||
vadd.i16 q2, q2, q9
|
||||
vadd.i16 q3, q3, q11
|
||||
.else
|
||||
vext.8 d16, d0, d1, #2
|
||||
vext.8 d20, d8, d9, #2
|
||||
vext.8 d18, d0, d1, #4
|
||||
vext.8 d22, d8, d9, #4
|
||||
vadd.i16 d4, d0, d16
|
||||
vadd.i16 d6, d8, d20
|
||||
vadd.i16 d4, d4, d18
|
||||
vadd.i16 d6, d6, d22
|
||||
.endif
|
||||
|
||||
vmull.u16 q6, d0, d0
|
||||
vmlal.u16 q6, d16, d16
|
||||
|
@ -1107,16 +741,13 @@ function sgr_box5_h_16bpc_neon, export=1
|
|||
vmull.u16 q12, d8, d8
|
||||
vmlal.u16 q12, d20, d20
|
||||
vmlal.u16 q12, d22, d22
|
||||
.if \w > 4
|
||||
vmull.u16 q7, d1, d1
|
||||
vmlal.u16 q7, d17, d17
|
||||
vmlal.u16 q7, d19, d19
|
||||
vmull.u16 q13, d9, d9
|
||||
vmlal.u16 q13, d21, d21
|
||||
vmlal.u16 q13, d23, d23
|
||||
.endif
|
||||
|
||||
.if \w > 4
|
||||
vext.8 q8, q0, q1, #6
|
||||
vext.8 q10, q4, q5, #6
|
||||
vext.8 q9, q0, q1, #8
|
||||
|
@ -1125,35 +756,22 @@ function sgr_box5_h_16bpc_neon, export=1
|
|||
vadd.i16 q3, q3, q10
|
||||
vadd.i16 q2, q2, q9
|
||||
vadd.i16 q3, q3, q11
|
||||
.else
|
||||
vext.8 d16, d0, d1, #6
|
||||
// d18 would be equal to d1; using d1 instead
|
||||
vext.8 d20, d8, d9, #6
|
||||
// d22 would be equal to d9; using d9 instead
|
||||
vadd.i16 d4, d4, d16
|
||||
vadd.i16 d6, d6, d20
|
||||
vadd.i16 d4, d4, d1
|
||||
vadd.i16 d6, d6, d9
|
||||
.endif
|
||||
|
||||
vmlal.u16 q6, d16, d16
|
||||
vmlal.u16 q6, d1, d1
|
||||
vmlal.u16 q12, d20, d20
|
||||
vmlal.u16 q12, d9, d9
|
||||
.if \w > 4
|
||||
vmlal.u16 q7, d17, d17
|
||||
vmlal.u16 q7, d19, d19
|
||||
vmlal.u16 q13, d21, d21
|
||||
vmlal.u16 q13, d23, d23
|
||||
.endif
|
||||
.endm
|
||||
add5 8
|
||||
|
||||
subs r5, r5, #8
|
||||
vst1.16 {q2}, [r1, :128]!
|
||||
vst1.16 {q3}, [r11, :128]!
|
||||
vst1.32 {q6, q7}, [r0, :128]!
|
||||
vst1.32 {q12, q13}, [r10, :128]!
|
||||
|
||||
subs r5, r5, #8
|
||||
ble 9f
|
||||
tst r7, #2 // LR_HAVE_RIGHT
|
||||
vmov q0, q1
|
||||
|
@ -1163,92 +781,6 @@ function sgr_box5_h_16bpc_neon, export=1
|
|||
bne 4b // If we don't need to pad, just keep summing.
|
||||
b 3b // If we need to pad, check how many pixels we have left.
|
||||
|
||||
5: // Produce 4 pixels, 7 <= w < 11
|
||||
add5 4
|
||||
vst1.16 {d4}, [r1, :64]!
|
||||
vst1.16 {d6}, [r11, :64]!
|
||||
vst1.32 {q6}, [r0, :128]!
|
||||
vst1.32 {q12}, [r10, :128]!
|
||||
|
||||
subs r5, r5, #4 // 3 <= w < 7
|
||||
vext.8 q0, q0, q1, #8
|
||||
vext.8 q4, q4, q5, #8
|
||||
|
||||
6: // Pad the right edge and produce the last few pixels.
|
||||
// w < 7, w+1 pixels valid in q0/q4
|
||||
sub lr, r5, #1
|
||||
// lr = pixels valid - 2
|
||||
adr r11, L(box5_variable_shift_tbl)
|
||||
ldr lr, [r11, lr, lsl #2]
|
||||
vmov q1, q14
|
||||
vmov q5, q15
|
||||
add r11, r11, lr
|
||||
bx r11
|
||||
|
||||
.align 2
|
||||
L(box5_variable_shift_tbl):
|
||||
.word 22f - L(box5_variable_shift_tbl) + CONFIG_THUMB
|
||||
.word 33f - L(box5_variable_shift_tbl) + CONFIG_THUMB
|
||||
.word 44f - L(box5_variable_shift_tbl) + CONFIG_THUMB
|
||||
.word 55f - L(box5_variable_shift_tbl) + CONFIG_THUMB
|
||||
.word 66f - L(box5_variable_shift_tbl) + CONFIG_THUMB
|
||||
.word 77f - L(box5_variable_shift_tbl) + CONFIG_THUMB
|
||||
|
||||
// Shift q0 right, shifting out invalid pixels,
|
||||
// shift q0 left to the original offset, shifting in padding pixels.
|
||||
22: // 2 pixels valid
|
||||
vext.8 q0, q0, q0, #4
|
||||
vext.8 q4, q4, q4, #4
|
||||
vext.8 q0, q0, q14, #12
|
||||
vext.8 q4, q4, q15, #12
|
||||
b 88f
|
||||
33: // 3 pixels valid
|
||||
vext.8 q0, q0, q0, #6
|
||||
vext.8 q4, q4, q4, #6
|
||||
vext.8 q0, q0, q14, #10
|
||||
vext.8 q4, q4, q15, #10
|
||||
b 88f
|
||||
44: // 4 pixels valid
|
||||
vmov d1, d28
|
||||
vmov d9, d30
|
||||
b 88f
|
||||
55: // 5 pixels valid
|
||||
vext.8 q0, q0, q0, #10
|
||||
vext.8 q4, q4, q4, #10
|
||||
vext.8 q0, q0, q14, #6
|
||||
vext.8 q4, q4, q15, #6
|
||||
b 88f
|
||||
66: // 6 pixels valid
|
||||
vext.8 q0, q0, q0, #12
|
||||
vext.8 q4, q4, q4, #12
|
||||
vext.8 q0, q0, q14, #4
|
||||
vext.8 q4, q4, q15, #4
|
||||
b 88f
|
||||
77: // 7 pixels valid
|
||||
vext.8 q0, q0, q0, #14
|
||||
vext.8 q4, q4, q4, #14
|
||||
vext.8 q0, q0, q14, #2
|
||||
vext.8 q4, q4, q15, #2
|
||||
|
||||
88:
|
||||
// Restore r11 after using it for a temporary value above
|
||||
add r11, r1, #(2*SUM_STRIDE)
|
||||
|
||||
add5 4
|
||||
subs r5, r5, #4
|
||||
vst1.16 {d4}, [r1, :64]!
|
||||
vst1.16 {d6}, [r11, :64]!
|
||||
vst1.32 {q6}, [r0, :128]!
|
||||
vst1.32 {q12}, [r10, :128]!
|
||||
ble 9f
|
||||
vext.8 q0, q0, q1, #8
|
||||
vext.8 q4, q4, q5, #8
|
||||
add5 4
|
||||
vst1.16 {d4}, [r1, :64]!
|
||||
vst1.16 {d6}, [r11, :64]!
|
||||
vst1.32 {q6}, [r0, :128]!
|
||||
vst1.32 {q12}, [r10, :128]!
|
||||
|
||||
9:
|
||||
subs r6, r6, #2
|
||||
ble 0f
|
||||
|
@ -1264,7 +796,6 @@ L(box5_variable_shift_tbl):
|
|||
0:
|
||||
vpop {q4-q7}
|
||||
pop {r4-r11,pc}
|
||||
.purgem add5
|
||||
endfunc
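The box3/box5 variable-shift tables above implement right-edge padding branch-free with vext sequences. As a reference for what they compute, here is a scalar sketch, assuming (as in the NEON code) that the padding value is the replicated right-edge pixel held in q14/q15; the function name is hypothetical.

```c
#include <stdint.h>

/* Overwrite the entries past the last valid pixel of an 8-wide window with
 * the padding value; the jump-table cases above achieve the same thing with
 * a pair of vext instructions per possible 'valid' count. */
static void pad_right_edge(uint16_t win[8], int valid, uint16_t pad)
{
    for (int i = valid; i < 8; i++)
        win[i] = pad;
}
```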
|
||||
|
||||
sgr_funcs 16
|
||||
|
|
|
@ -389,8 +389,8 @@ function sgr_weighted1_\bpc\()bpc_neon, export=1
|
|||
vrshrn.i32 d21, q11, #11
|
||||
vqmovun.s16 d4, q2
|
||||
vqmovun.s16 d20, q10
|
||||
vst1.8 {d4}, [r0]!
|
||||
vst1.8 {d20}, [r9]!
|
||||
vst1.8 {d4}, [r0, :64]!
|
||||
vst1.8 {d20}, [r9, :64]!
|
||||
.else
|
||||
vqrshrun.s32 d4, q2, #11
|
||||
vqrshrun.s32 d5, q3, #11
|
||||
|
@ -398,8 +398,8 @@ function sgr_weighted1_\bpc\()bpc_neon, export=1
|
|||
vqrshrun.s32 d21, q11, #11
|
||||
vmin.u16 q2, q2, q14
|
||||
vmin.u16 q10, q10, q14
|
||||
vst1.16 {q2}, [r0]!
|
||||
vst1.16 {q10}, [r9]!
|
||||
vst1.16 {q2}, [r0, :128]!
|
||||
vst1.16 {q10}, [r9, :128]!
|
||||
.endif
|
||||
bgt 1b
|
||||
|
||||
|
@ -438,12 +438,12 @@ function sgr_weighted1_\bpc\()bpc_neon, export=1
|
|||
vrshrn.i32 d4, q2, #11
|
||||
vrshrn.i32 d5, q3, #11
|
||||
vqmovun.s16 d2, q2
|
||||
vst1.8 {d2}, [r0]!
|
||||
vst1.8 {d2}, [r0, :64]!
|
||||
.else
|
||||
vqrshrun.s32 d4, q2, #11
|
||||
vqrshrun.s32 d5, q3, #11
|
||||
vmin.u16 q2, q2, q14
|
||||
vst1.16 {q2}, [r0]!
|
||||
vst1.16 {q2}, [r0, :128]!
|
||||
.endif
|
||||
bgt 2b
|
||||
0:
|
||||
|
@ -531,8 +531,8 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1
|
|||
vrshrn.i32 d23, q8, #11
|
||||
vqmovun.s16 d6, q3
|
||||
vqmovun.s16 d22, q11
|
||||
vst1.8 {d6}, [r0]!
|
||||
vst1.8 {d22}, [r10]!
|
||||
vst1.8 {d6}, [r0, :64]!
|
||||
vst1.8 {d22}, [r10, :64]!
|
||||
.else
|
||||
vqrshrun.s32 d6, q3, #11
|
||||
vqrshrun.s32 d7, q0, #11
|
||||
|
@ -540,8 +540,8 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1
|
|||
vqrshrun.s32 d23, q8, #11
|
||||
vmin.u16 q3, q3, q14
|
||||
vmin.u16 q11, q11, q14
|
||||
vst1.16 {q3}, [r0]!
|
||||
vst1.16 {q11}, [r10]!
|
||||
vst1.16 {q3}, [r0, :128]!
|
||||
vst1.16 {q11}, [r10, :128]!
|
||||
.endif
|
||||
bgt 1b
|
||||
|
||||
|
@ -586,12 +586,12 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1
|
|||
vrshrn.i32 d6, q3, #11
|
||||
vrshrn.i32 d7, q0, #11
|
||||
vqmovun.s16 d6, q3
|
||||
vst1.8 {d6}, [r0]!
|
||||
vst1.8 {d6}, [r0, :64]!
|
||||
.else
|
||||
vqrshrun.s32 d6, q3, #11
|
||||
vqrshrun.s32 d7, q0, #11
|
||||
vmin.u16 q3, q3, q14
|
||||
vst1.16 {q3}, [r0]!
|
||||
vst1.16 {q3}, [r0, :128]!
|
||||
.endif
|
||||
bgt 1b
|
||||
0:
|
||||
|
|
|
@ -2966,8 +2966,8 @@ filter_fn prep, r0, r7, r1, r2, r3, r4, r5, r6, r8, r9, 6
|
|||
.endm
|
||||
|
||||
.macro load_filter_coef dst, src, inc
|
||||
vld1.8 {\dst}, [r12, :64]
|
||||
add \src, \src, \inc
|
||||
vld1.8 {\dst}, [r12, :64]
|
||||
.endm
|
||||
|
||||
.macro load_filter_row dst, src, inc
|
||||
|
@ -2978,72 +2978,57 @@ filter_fn prep, r0, r7, r1, r2, r3, r4, r5, r6, r8, r9, 6
|
|||
function warp_filter_horz_neon
|
||||
load_filter_ptr r5 // filter 0
|
||||
vld1.16 {q7}, [r2], r3
|
||||
vmov.i8 q6, #128
|
||||
|
||||
load_filter_coef d0, r5, r7 // filter 0
|
||||
vmovl.u8 q6, d14 // original pixels
|
||||
load_filter_row d2, r5, r7 // filter 1
|
||||
vmovl.u8 q7, d15 // original pixels
|
||||
load_filter_row d4, r5, r7 // filter 2
|
||||
vmovl.s8 q0, d0 // filter 0
|
||||
vext.8 q3, q6, q7, #2*1 // filter 1 pixels
|
||||
load_filter_row d1, r5, r7 // filter 1
|
||||
load_filter_row d2, r5, r7 // filter 2
|
||||
load_filter_ptr r5 // filter 3
|
||||
vmovl.s8 q1, d2 // filter 1
|
||||
vmul.i16 q5, q6, q0 // filter 0 output
|
||||
load_filter_coef d0, r5, r7 // filter 3
|
||||
vmovl.s8 q2, d4 // filter 2
|
||||
veor q7, q7, q6 // subtract by 128 to allow using vmull
|
||||
load_filter_coef d3, r5, r7 // filter 3
|
||||
vext.8 d12, d14, d15, #1 // filter 1 pixels
|
||||
vext.8 d13, d14, d15, #2 // filter 2 pixels
|
||||
load_filter_ptr r5 // filter 4
|
||||
vext.8 q4, q6, q7, #2*2 // filter 2 pixels
|
||||
vmul.i16 q3, q3, q1 // filter 1 output
|
||||
load_filter_coef d2, r5, r7 // filter 4
|
||||
vmul.i16 q4, q4, q2 // filter 2 output
|
||||
vext.8 q2, q6, q7, #2*3 // filter 3 pixels
|
||||
vmovl.s8 q0, d0 // filter 3
|
||||
vpaddl.s16 q5, q5 // pixel 0 (4x32)
|
||||
vpaddl.s16 q3, q3 // pixel 1 (4x32)
|
||||
vmul.i16 q0, q2, q0 // filter 3 output
|
||||
vmull.s8 q2, d14, d0 // filter 0 output
|
||||
vmull.s8 q3, d12, d1 // filter 1 output
|
||||
load_filter_coef d0, r5, r7 // filter 4
|
||||
load_filter_ptr r5 // filter 5
|
||||
vext.8 q2, q6, q7, #2*4 // filter 4 pixels
|
||||
vmovl.s8 q1, d2 // filter 4
|
||||
vpaddl.s16 q4, q4 // pixel 2 (4x32)
|
||||
vpadd.s32 d10, d10, d11 // pixel 0 (2x32)
|
||||
vpadd.s32 d11, d6, d7 // pixel 1 (2x32)
|
||||
load_filter_coef d6, r5, r7 // filter 5
|
||||
vmul.i16 q1, q2, q1 // filter 4 output
|
||||
vpadd.s32 d8, d8, d9 // pixel 2 (2x32)
|
||||
vext.8 d12, d14, d15, #3 // filter 3 pixels
|
||||
vmull.s8 q4, d13, d2 // filter 2 output
|
||||
vext.8 d13, d14, d15, #4 // filter 4 pixels
|
||||
vpadd.i16 d4, d4, d5 // pixel 0 (4x16)
|
||||
vpadd.i16 d5, d6, d7 // pixel 1 (4x16)
|
||||
load_filter_coef d1, r5, r7 // filter 5
|
||||
load_filter_ptr r5 // filter 6
|
||||
vpaddl.s16 q0, q0 // pixel 3 (4x32)
|
||||
vpadd.s32 d10, d10, d11 // pixel 0,1
|
||||
vext.8 q2, q6, q7, #2*5 // filter 5 pixels
|
||||
vmovl.s8 q3, d6 // filter 5
|
||||
vpaddl.s16 q1, q1 // pixel 4 (4x32)
|
||||
vpadd.s32 d9, d0, d1 // pixel 3 (2x32)
|
||||
vmull.s8 q5, d12, d3 // filter 3 output
|
||||
vext.8 d12, d14, d15, #5 // filter 5 pixels
|
||||
vmull.s8 q3, d13, d0 // filter 4 output
|
||||
load_filter_coef d0, r5, r7 // filter 6
|
||||
vmul.i16 q2, q2, q3 // filter 5 output
|
||||
vpadd.s32 d11, d8, d9 // pixel 2,3
|
||||
vext.8 d13, d14, d15, #6 // filter 6 pixels
|
||||
load_filter_ptr r5 // filter 7
|
||||
vpaddl.s16 q2, q2 // pixel 5 (4x32)
|
||||
vpadd.s32 d8, d2, d3 // pixel 4 (2x32)
|
||||
vext.8 q3, q6, q7, #2*6 // filter 6 pixels
|
||||
vmovl.s8 q0, d0 // filter 6
|
||||
vpadd.s32 d9, d4, d5 // pixel 5 (2x32)
|
||||
load_filter_coef d4, r5, r7 // filter 7
|
||||
vpadd.s32 d8, d8, d9 // pixel 4,5
|
||||
vext.8 q1, q6, q7, #2*7 // filter 7 pixels
|
||||
vmovl.s8 q2, d4 // filter 7
|
||||
vmul.i16 q3, q3, q0 // filter 6 output
|
||||
vmul.i16 q1, q1, q2 // filter 7 output
|
||||
vpadd.i16 d8, d8, d9 // pixel 2 (4x16)
|
||||
vpadd.i16 d9, d10, d11 // pixel 3 (4x16)
|
||||
vmull.s8 q5, d12, d1 // filter 5 output
|
||||
load_filter_coef d1, r5, r7 // filter 7
|
||||
vext.8 d14, d14, d15, #7 // filter 7 pixels
|
||||
vpadd.i16 d6, d6, d7 // pixel 4 (4x16)
|
||||
vpadd.i16 d10, d10, d11 // pixel 5 (4x16)
|
||||
vmull.s8 q6, d13, d0 // filter 6 output
|
||||
vmull.s8 q7, d14, d1 // filter 7 output
|
||||
|
||||
sub r5, r5, r7, lsl #3
|
||||
vpaddl.s16 q3, q3 // pixel 6 (4x32)
|
||||
vpaddl.s16 q1, q1 // pixel 7 (4x32)
|
||||
vpadd.s32 d6, d6, d7 // pixel 6 (2x32)
|
||||
vpadd.s32 d2, d2, d3 // pixel 7 (2x32)
|
||||
vpadd.s32 d9, d6, d2 // pixel 6,7
|
||||
|
||||
vpadd.i16 d4, d4, d5 // pixel 0,1 (2x16)
|
||||
vpadd.i16 d5, d8, d9 // pixel 2,3 (2x16)
|
||||
vpadd.i16 d12, d12, d13 // pixel 6 (4x16)
|
||||
vpadd.i16 d14, d14, d15 // pixel 7 (4x16)
|
||||
vpadd.i16 d6, d6, d10 // pixel 4,5 (2x16)
|
||||
vpadd.i16 d10, d12, d14 // pixel 6,7 (2x16)
|
||||
vpadd.i16 d4, d4, d5 // pixel 0-3
|
||||
vpadd.i16 d5, d6, d10 // pixel 4-7
|
||||
|
||||
add r5, r5, r8
|
||||
|
||||
vrshrn.s32 d10, q5, #3
|
||||
vrshrn.s32 d11, q4, #3
|
||||
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
|
@ -3074,23 +3059,23 @@ function warp_affine_8x8\t\()_8bpc_neon, export=1
|
|||
add r6, r6, #512
|
||||
|
||||
bl warp_filter_horz_neon
|
||||
vmov q8, q5
|
||||
vrshr.s16 q8, q2, #3
|
||||
bl warp_filter_horz_neon
|
||||
vmov q9, q5
|
||||
vrshr.s16 q9, q2, #3
|
||||
bl warp_filter_horz_neon
|
||||
vmov q10, q5
|
||||
vrshr.s16 q10, q2, #3
|
||||
bl warp_filter_horz_neon
|
||||
vmov q11, q5
|
||||
vrshr.s16 q11, q2, #3
|
||||
bl warp_filter_horz_neon
|
||||
vmov q12, q5
|
||||
vrshr.s16 q12, q2, #3
|
||||
bl warp_filter_horz_neon
|
||||
vmov q13, q5
|
||||
vrshr.s16 q13, q2, #3
|
||||
bl warp_filter_horz_neon
|
||||
vmov q14, q5
|
||||
vrshr.s16 q14, q2, #3
|
||||
|
||||
1:
|
||||
bl warp_filter_horz_neon
|
||||
vmov q15, q5
|
||||
vrshr.s16 q15, q2, #3
|
||||
|
||||
load_filter_row d8, r6, r9
|
||||
load_filter_row d9, r6, r9
|
||||
|
@ -3133,12 +3118,19 @@ function warp_affine_8x8\t\()_8bpc_neon, export=1
|
|||
vmlal.s16 q1, d29, d5
|
||||
vmlal.s16 q1, d31, d7
|
||||
|
||||
.ifb \t
|
||||
vmov.i16 q7, #128
|
||||
.else
|
||||
vmov.i16 q7, #0x800
|
||||
.endif
|
||||
|
||||
vmov q8, q9
|
||||
vmov q9, q10
|
||||
vqrshrn.s32 d0, q0, #\shift
|
||||
vmov q10, q11
|
||||
vqrshrn.s32 d1, q1, #\shift
|
||||
vmov q11, q12
|
||||
vadd.i16 q0, q0, q7
|
||||
vmov q12, q13
|
||||
.ifb \t
|
||||
vqmovun.s16 d0, q0
|
||||
|
|
|
@ -3154,8 +3154,8 @@ filter_fn prep, r0, r8, r1, r2, r3, r4, r5, r6, r7, r9, r10
|
|||
.endm
|
||||
|
||||
.macro load_filter_coef dst, src, inc
|
||||
vld1.8 {\dst}, [r12, :64]
|
||||
add \src, \src, \inc
|
||||
vld1.8 {\dst}, [r12, :64]
|
||||
.endm
|
||||
|
||||
.macro load_filter_row dst, src, inc
|
||||
|
|
|
@ -69,6 +69,56 @@
|
|||
#endif
|
||||
.endm
|
||||
|
||||
// This macro clobbers r7 (and r12 on windows) and stores data at the
|
||||
// bottom of the stack; sp is the start of the space allocated that
|
||||
// the caller can use.
|
||||
.macro sub_sp_align space
|
||||
#if CONFIG_THUMB
|
||||
mov r7, sp
|
||||
and r7, r7, #15
|
||||
#else
|
||||
and r7, sp, #15
|
||||
#endif
|
||||
sub sp, sp, r7
|
||||
// Now the stack is aligned, store the amount of adjustment back
|
||||
// on the stack, as we don't want to waste a register as frame
|
||||
// pointer.
|
||||
str r7, [sp, #-16]!
|
||||
#ifdef _WIN32
|
||||
.if \space > 8192
|
||||
// Here, we'd need to touch two (or more) pages while decrementing
|
||||
// the stack pointer.
|
||||
.error "sub_sp_align doesn't support values over 8K at the moment"
|
||||
.elseif \space > 4096
|
||||
sub r7, sp, #4096
|
||||
ldr r12, [r7]
|
||||
sub r7, r7, #(\space - 4096)
|
||||
mov sp, r7
|
||||
.else
|
||||
sub sp, sp, #\space
|
||||
.endif
|
||||
#else
|
||||
.if \space >= 4096
|
||||
sub sp, sp, #(\space)/4096*4096
|
||||
.endif
|
||||
.if (\space % 4096) != 0
|
||||
sub sp, sp, #(\space)%4096
|
||||
.endif
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro add_sp_align space
|
||||
.if \space >= 4096
|
||||
add sp, sp, #(\space)/4096*4096
|
||||
.endif
|
||||
.if (\space % 4096) != 0
|
||||
add sp, sp, #(\space)%4096
|
||||
.endif
|
||||
ldr r7, [sp], #16
|
||||
// Add back the original stack adjustment
|
||||
add sp, sp, r7
|
||||
.endm
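Expressed in C-like terms, the pair of macros above does roughly the following. This is a rough sketch only: a real stack pointer cannot be manipulated portably in C, the Windows page-touch probe for allocations above 4 KiB is omitted, and all names are illustrative.

```c
#include <stdint.h>
#include <string.h>

/* Model the alignment bookkeeping with an ordinary pointer instead of the
 * real stack pointer. The adjustment is stored at the bottom of a 16-byte
 * slot so no register has to be kept as a frame pointer, mirroring the
 * 'str r7, [sp, #-16]!' / 'ldr r7, [sp], #16' pair in the macros. */
typedef struct { uint8_t *sp; } fake_stack;

static uint8_t *sub_sp_align(fake_stack *s, size_t space)
{
    uintptr_t adjust = (uintptr_t)s->sp & 15;   /* bytes down to 16-byte alignment */
    s->sp -= adjust;                            /* align downwards                  */
    s->sp -= 16;
    memcpy(s->sp, &adjust, sizeof(adjust));     /* remember the adjustment          */
    s->sp -= space;                             /* reserve the caller's space       */
    return s->sp;                               /* start of the usable area         */
}

static void add_sp_align(fake_stack *s, size_t space)
{
    uintptr_t adjust;
    s->sp += space;
    memcpy(&adjust, s->sp, sizeof(adjust));
    s->sp += 16;
    s->sp += adjust;                            /* undo the alignment step          */
}
```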
|
||||
|
||||
.macro transpose_8x8b q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7
|
||||
vtrn.32 \q0, \q2
|
||||
vtrn.32 \q1, \q3
|
||||
|
@ -108,6 +158,14 @@
|
|||
vtrn.8 \r2, \r3
|
||||
.endm
|
||||
|
||||
.macro transpose_4x4s q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7
|
||||
vswp \r1, \r4 // vtrn.64 \q0, \q2
|
||||
vswp \r3, \r6 // vtrn.64 \q1, \q3
|
||||
|
||||
vtrn.32 \q0, \q1
|
||||
vtrn.32 \q2, \q3
|
||||
.endm
|
||||
|
||||
.macro transpose_4x4h q0, q1, r0, r1, r2, r3
|
||||
vtrn.32 \q0, \q1
|
||||
|
||||
|
|
|
@ -363,10 +363,8 @@ find_dir 8
|
|||
neg v20.16b, v21.16b // -imin()
|
||||
bsl v18.16b, v16.16b, v17.16b // constrain() = apply_sign()
|
||||
bsl v22.16b, v20.16b, v21.16b // constrain() = apply_sign()
|
||||
smlal v1.8h, v18.8b, v19.8b // sum += taps[k] * constrain()
|
||||
smlal v1.8h, v22.8b, v19.8b // sum += taps[k] * constrain()
|
||||
smlal2 v2.8h, v18.16b, v19.16b // sum += taps[k] * constrain()
|
||||
smlal2 v2.8h, v22.16b, v19.16b // sum += taps[k] * constrain()
|
||||
mla v1.16b, v18.16b, v19.16b // sum += taps[k] * constrain()
|
||||
mla v2.16b, v22.16b, v19.16b // sum += taps[k] * constrain()
|
||||
.endm
|
||||
|
||||
// void cdef_filterX_edged_8bpc_neon(pixel *dst, ptrdiff_t dst_stride,
|
||||
|
@ -418,8 +416,11 @@ function cdef_filter\w\suffix\()_edged_8bpc_neon
|
|||
ld1 {v0.s}[3], [x14] // px
|
||||
.endif
|
||||
|
||||
movi v1.8h, #0 // sum
|
||||
movi v2.8h, #0 // sum
|
||||
// We need 9 bits or two 8-bit accumulators to fit the sum.
|
||||
// Max of |sum|: 15*2*6(pri) + 4*4*3(sec) = 228.
|
||||
// Start sum at -1 instead of 0 to help handle rounding later.
|
||||
movi v1.16b, #255 // sum
|
||||
movi v2.16b, #0 // sum
|
||||
.if \min
|
||||
mov v3.16b, v0.16b // min
|
||||
mov v4.16b, v0.16b // max
|
||||
|
@ -468,16 +469,16 @@ function cdef_filter\w\suffix\()_edged_8bpc_neon
|
|||
.endif
|
||||
b.ne 2b
|
||||
|
||||
sshr v5.8h, v1.8h, #15 // -(sum < 0)
|
||||
sshr v6.8h, v2.8h, #15 // -(sum < 0)
|
||||
add v1.8h, v1.8h, v5.8h // sum - (sum < 0)
|
||||
add v2.8h, v2.8h, v6.8h // sum - (sum < 0)
|
||||
srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4
|
||||
srshr v2.8h, v2.8h, #4 // (8 + sum - (sum < 0)) >> 4
|
||||
uaddw v1.8h, v1.8h, v0.8b // px + (8 + sum ...) >> 4
|
||||
uaddw2 v2.8h, v2.8h, v0.16b // px + (8 + sum ...) >> 4
|
||||
sqxtun v0.8b, v1.8h
|
||||
sqxtun2 v0.16b, v2.8h
|
||||
// Perform halving adds since the value won't fit otherwise.
|
||||
// To handle the offset for negative values, use both halving w/ and w/o rounding.
|
||||
srhadd v5.16b, v1.16b, v2.16b // sum >> 1
|
||||
shadd v6.16b, v1.16b, v2.16b // (sum - 1) >> 1
|
||||
sshr v1.16b, v5.16b, #7 // sum < 0
|
||||
bsl v1.16b, v6.16b, v5.16b // (sum - (sum < 0)) >> 1
|
||||
|
||||
srshr v1.16b, v1.16b, #3 // (8 + sum - (sum < 0)) >> 4
|
||||
|
||||
usqadd v0.16b, v1.16b // px + (8 + sum ...) >> 4
|
||||
.if \min
|
||||
umin v0.16b, v0.16b, v4.16b
|
||||
umax v0.16b, v0.16b, v3.16b // iclip(px + .., min, max)
|
||||
|
|
|
@ -311,6 +311,30 @@ endconst
|
|||
.endif
|
||||
.endm
|
||||
|
||||
// Steps for loading and preparing each row
|
||||
.macro dir_load_step1 s1, bpc
|
||||
.if \bpc == 8
|
||||
ld1 {\s1\().8b}, [x0], x1
|
||||
.else
|
||||
ld1 {\s1\().8h}, [x0], x1
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro dir_load_step2 s1, bpc
|
||||
.if \bpc == 8
|
||||
usubl \s1\().8h, \s1\().8b, v31.8b
|
||||
.else
|
||||
ushl \s1\().8h, \s1\().8h, v8.8h
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro dir_load_step3 s1, bpc
|
||||
// Nothing for \bpc == 8
|
||||
.if \bpc != 8
|
||||
sub \s1\().8h, \s1\().8h, v31.8h
|
||||
.endif
|
||||
.endm
|
||||
|
||||
// int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride,
|
||||
// unsigned *const var)
|
||||
.macro find_dir bpc
|
||||
|
@ -333,21 +357,15 @@ function cdef_find_dir_\bpc\()bpc_neon, export=1
|
|||
movi v3.8h, #0 // v2-v3 sum_diag[1]
|
||||
movi v5.8h, #0 // v4-v5 sum_hv[0-1]
|
||||
movi v7.8h, #0 // v6-v7 sum_alt[0]
|
||||
dir_load_step1 v26, \bpc // Setup first row early
|
||||
movi v17.8h, #0 // v16-v17 sum_alt[1]
|
||||
movi v18.8h, #0 // v18-v19 sum_alt[2]
|
||||
dir_load_step2 v26, \bpc
|
||||
movi v19.8h, #0
|
||||
dir_load_step3 v26, \bpc
|
||||
movi v21.8h, #0 // v20-v21 sum_alt[3]
|
||||
|
||||
.irpc i, 01234567
|
||||
.if \bpc == 8
|
||||
ld1 {v26.8b}, [x0], x1
|
||||
usubl v26.8h, v26.8b, v31.8b
|
||||
.else
|
||||
ld1 {v26.8h}, [x0], x1
|
||||
ushl v26.8h, v26.8h, v8.8h
|
||||
sub v26.8h, v26.8h, v31.8h
|
||||
.endif
|
||||
|
||||
addv h25, v26.8h // [y]
|
||||
rev64 v27.8h, v26.8h
|
||||
addp v28.8h, v26.8h, v30.8h // [(x >> 1)]
|
||||
|
@ -355,30 +373,6 @@ function cdef_find_dir_\bpc\()bpc_neon, export=1
|
|||
ext v27.16b, v27.16b, v27.16b, #8 // [-x]
|
||||
rev64 v29.4h, v28.4h // [-(x >> 1)]
|
||||
ins v4.h[\i], v25.h[0] // sum_hv[0]
|
||||
|
||||
.if \i == 0
|
||||
mov v0.16b, v26.16b // sum_diag[0]
|
||||
mov v2.16b, v27.16b // sum_diag[1]
|
||||
mov v6.16b, v28.16b // sum_alt[0]
|
||||
mov v16.16b, v29.16b // sum_alt[1]
|
||||
.else
|
||||
ext v22.16b, v30.16b, v26.16b, #(16-2*\i)
|
||||
ext v23.16b, v26.16b, v30.16b, #(16-2*\i)
|
||||
ext v24.16b, v30.16b, v27.16b, #(16-2*\i)
|
||||
ext v25.16b, v27.16b, v30.16b, #(16-2*\i)
|
||||
add v0.8h, v0.8h, v22.8h // sum_diag[0]
|
||||
add v1.8h, v1.8h, v23.8h // sum_diag[0]
|
||||
add v2.8h, v2.8h, v24.8h // sum_diag[1]
|
||||
add v3.8h, v3.8h, v25.8h // sum_diag[1]
|
||||
ext v22.16b, v30.16b, v28.16b, #(16-2*\i)
|
||||
ext v23.16b, v28.16b, v30.16b, #(16-2*\i)
|
||||
ext v24.16b, v30.16b, v29.16b, #(16-2*\i)
|
||||
ext v25.16b, v29.16b, v30.16b, #(16-2*\i)
|
||||
add v6.8h, v6.8h, v22.8h // sum_alt[0]
|
||||
add v7.4h, v7.4h, v23.4h // sum_alt[0]
|
||||
add v16.8h, v16.8h, v24.8h // sum_alt[1]
|
||||
add v17.4h, v17.4h, v25.4h // sum_alt[1]
|
||||
.endif
|
||||
.if \i < 6
|
||||
ext v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2)))
|
||||
ext v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2)))
|
||||
|
@ -397,6 +391,41 @@ function cdef_find_dir_\bpc\()bpc_neon, export=1
|
|||
add v20.8h, v20.8h, v24.8h // sum_alt[3]
|
||||
add v21.4h, v21.4h, v25.4h // sum_alt[3]
|
||||
.endif
|
||||
.if \i == 0
|
||||
mov v0.16b, v26.16b // sum_diag[0]
|
||||
dir_load_step1 v26, \bpc
|
||||
mov v2.16b, v27.16b // sum_diag[1]
|
||||
dir_load_step2 v26, \bpc
|
||||
mov v6.16b, v28.16b // sum_alt[0]
|
||||
dir_load_step3 v26, \bpc
|
||||
mov v16.16b, v29.16b // sum_alt[1]
|
||||
.else
|
||||
ext v22.16b, v30.16b, v26.16b, #(16-2*\i)
|
||||
ext v23.16b, v26.16b, v30.16b, #(16-2*\i)
|
||||
ext v24.16b, v30.16b, v27.16b, #(16-2*\i)
|
||||
ext v25.16b, v27.16b, v30.16b, #(16-2*\i)
|
||||
.if \i != 7 // Nothing to load for the final row
|
||||
dir_load_step1 v26, \bpc // Start setting up the next row early.
|
||||
.endif
|
||||
add v0.8h, v0.8h, v22.8h // sum_diag[0]
|
||||
add v1.8h, v1.8h, v23.8h // sum_diag[0]
|
||||
add v2.8h, v2.8h, v24.8h // sum_diag[1]
|
||||
add v3.8h, v3.8h, v25.8h // sum_diag[1]
|
||||
.if \i != 7
|
||||
dir_load_step2 v26, \bpc
|
||||
.endif
|
||||
ext v22.16b, v30.16b, v28.16b, #(16-2*\i)
|
||||
ext v23.16b, v28.16b, v30.16b, #(16-2*\i)
|
||||
ext v24.16b, v30.16b, v29.16b, #(16-2*\i)
|
||||
ext v25.16b, v29.16b, v30.16b, #(16-2*\i)
|
||||
.if \i != 7
|
||||
dir_load_step3 v26, \bpc
|
||||
.endif
|
||||
add v6.8h, v6.8h, v22.8h // sum_alt[0]
|
||||
add v7.4h, v7.4h, v23.4h // sum_alt[0]
|
||||
add v16.8h, v16.8h, v24.8h // sum_alt[1]
|
||||
add v17.4h, v17.4h, v25.4h // sum_alt[1]
|
||||
.endif
|
||||
.endr
|
||||
|
||||
movi v31.4s, #105
|
||||
|
|
|
@ -502,9 +502,9 @@ L(ipred_dc_h4):
|
|||
ld1 {v0.s}[0], [x2], #4
|
||||
ins v0.s[1], wzr
|
||||
uaddlv h0, v0.8b
|
||||
add x2, x2, #1
|
||||
br x3
|
||||
L(ipred_dc_w4):
|
||||
add x2, x2, #1
|
||||
ld1 {v1.s}[0], [x2]
|
||||
ins v1.s[1], wzr
|
||||
add v0.4h, v0.4h, v16.4h
|
||||
|
@ -534,9 +534,9 @@ L(ipred_dc_w4):
|
|||
L(ipred_dc_h8):
|
||||
ld1 {v0.8b}, [x2], #8
|
||||
uaddlv h0, v0.8b
|
||||
add x2, x2, #1
|
||||
br x3
|
||||
L(ipred_dc_w8):
|
||||
add x2, x2, #1
|
||||
ld1 {v1.8b}, [x2]
|
||||
add v0.4h, v0.4h, v16.4h
|
||||
uaddlv h1, v1.8b
|
||||
|
@ -565,9 +565,9 @@ L(ipred_dc_w8):
|
|||
L(ipred_dc_h16):
|
||||
ld1 {v0.16b}, [x2], #16
|
||||
uaddlv h0, v0.16b
|
||||
add x2, x2, #1
|
||||
br x3
|
||||
L(ipred_dc_w16):
|
||||
add x2, x2, #1
|
||||
ld1 {v1.16b}, [x2]
|
||||
add v0.4h, v0.4h, v16.4h
|
||||
uaddlv h1, v1.16b
|
||||
|
@ -597,10 +597,10 @@ L(ipred_dc_h32):
|
|||
ld1 {v0.16b, v1.16b}, [x2], #32
|
||||
uaddlv h0, v0.16b
|
||||
uaddlv h1, v1.16b
|
||||
add x2, x2, #1
|
||||
add v0.4h, v0.4h, v1.4h
|
||||
br x3
|
||||
L(ipred_dc_w32):
|
||||
add x2, x2, #1
|
||||
ld1 {v1.16b, v2.16b}, [x2]
|
||||
add v0.4h, v0.4h, v16.4h
|
||||
uaddlv h1, v1.16b
|
||||
|
@ -637,10 +637,10 @@ L(ipred_dc_h64):
|
|||
uaddlv h3, v3.16b
|
||||
add v0.4h, v0.4h, v1.4h
|
||||
add v2.4h, v2.4h, v3.4h
|
||||
add x2, x2, #1
|
||||
add v0.4h, v0.4h, v2.4h
|
||||
br x3
|
||||
L(ipred_dc_w64):
|
||||
add x2, x2, #1
|
||||
ld1 {v1.16b, v2.16b, v3.16b, v4.16b}, [x2]
|
||||
add v0.4h, v0.4h, v16.4h
|
||||
uaddlv h1, v1.16b
|
||||
|
@ -1834,10 +1834,10 @@ function ipred_cfl_8bpc_neon, export=1
|
|||
L(ipred_cfl_h4):
|
||||
ld1 {v0.s}[0], [x2], #4
|
||||
ins v0.s[1], wzr
|
||||
add x2, x2, #1
|
||||
uaddlv h0, v0.8b
|
||||
br x9
|
||||
L(ipred_cfl_w4):
|
||||
add x2, x2, #1
|
||||
ld1 {v2.s}[0], [x2]
|
||||
ins v2.s[1], wzr
|
||||
add v0.4h, v0.4h, v16.4h
|
||||
|
@ -1860,9 +1860,9 @@ L(ipred_cfl_w4):
|
|||
L(ipred_cfl_h8):
|
||||
ld1 {v0.8b}, [x2], #8
|
||||
uaddlv h0, v0.8b
|
||||
add x2, x2, #1
|
||||
br x9
|
||||
L(ipred_cfl_w8):
|
||||
add x2, x2, #1
|
||||
ld1 {v2.8b}, [x2]
|
||||
add v0.4h, v0.4h, v16.4h
|
||||
uaddlv h2, v2.8b
|
||||
|
@ -1884,9 +1884,9 @@ L(ipred_cfl_w8):
|
|||
L(ipred_cfl_h16):
|
||||
ld1 {v0.16b}, [x2], #16
|
||||
uaddlv h0, v0.16b
|
||||
add x2, x2, #1
|
||||
br x9
|
||||
L(ipred_cfl_w16):
|
||||
add x2, x2, #1
|
||||
ld1 {v2.16b}, [x2]
|
||||
add v0.4h, v0.4h, v16.4h
|
||||
uaddlv h2, v2.16b
|
||||
|
@ -1909,10 +1909,10 @@ L(ipred_cfl_h32):
|
|||
ld1 {v2.16b, v3.16b}, [x2], #32
|
||||
uaddlv h2, v2.16b
|
||||
uaddlv h3, v3.16b
|
||||
add x2, x2, #1
|
||||
add v0.4h, v2.4h, v3.4h
|
||||
br x9
|
||||
L(ipred_cfl_w32):
|
||||
add x2, x2, #1
|
||||
ld1 {v2.16b, v3.16b}, [x2]
|
||||
add v0.4h, v0.4h, v16.4h
|
||||
uaddlv h2, v2.16b
|
||||
|
|
|
@ -562,9 +562,9 @@ function ipred_dc_16bpc_neon, export=1
|
|||
L(ipred_dc_h4):
|
||||
ld1 {v0.4h}, [x2], #8
|
||||
uaddlv s0, v0.4h
|
||||
add x2, x2, #2
|
||||
br x3
|
||||
L(ipred_dc_w4):
|
||||
add x2, x2, #2
|
||||
ld1 {v1.4h}, [x2]
|
||||
add v0.2s, v0.2s, v16.2s
|
||||
uaddlv s1, v1.4h
|
||||
|
@ -594,9 +594,9 @@ L(ipred_dc_w4):
|
|||
L(ipred_dc_h8):
|
||||
ld1 {v0.8h}, [x2], #16
|
||||
uaddlv s0, v0.8h
|
||||
add x2, x2, #2
|
||||
br x3
|
||||
L(ipred_dc_w8):
|
||||
add x2, x2, #2
|
||||
ld1 {v1.8h}, [x2]
|
||||
add v0.2s, v0.2s, v16.2s
|
||||
uaddlv s1, v1.8h
|
||||
|
@ -626,10 +626,10 @@ L(ipred_dc_w8):
|
|||
L(ipred_dc_h16):
|
||||
ld1 {v0.8h, v1.8h}, [x2], #32
|
||||
addp v0.8h, v0.8h, v1.8h
|
||||
add x2, x2, #2
|
||||
uaddlv s0, v0.8h
|
||||
br x3
|
||||
L(ipred_dc_w16):
|
||||
add x2, x2, #2
|
||||
ld1 {v1.8h, v2.8h}, [x2]
|
||||
add v0.2s, v0.2s, v16.2s
|
||||
addp v1.8h, v1.8h, v2.8h
|
||||
|
@ -663,10 +663,10 @@ L(ipred_dc_h32):
|
|||
addp v0.8h, v0.8h, v1.8h
|
||||
addp v2.8h, v2.8h, v3.8h
|
||||
addp v0.8h, v0.8h, v2.8h
|
||||
add x2, x2, #2
|
||||
uaddlv s0, v0.8h
|
||||
br x3
|
||||
L(ipred_dc_w32):
|
||||
add x2, x2, #2
|
||||
ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2]
|
||||
add v0.2s, v0.2s, v16.2s
|
||||
addp v1.8h, v1.8h, v2.8h
|
||||
|
@ -709,10 +709,10 @@ L(ipred_dc_h64):
|
|||
addp v0.8h, v0.8h, v2.8h
|
||||
addp v4.8h, v4.8h, v6.8h
|
||||
addp v0.8h, v0.8h, v4.8h
|
||||
add x2, x2, #2
|
||||
uaddlv s0, v0.8h
|
||||
br x3
|
||||
L(ipred_dc_w64):
|
||||
add x2, x2, #2
|
||||
ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64
|
||||
add v0.2s, v0.2s, v16.2s
|
||||
addp v1.8h, v1.8h, v2.8h
|
||||
|
@ -1382,7 +1382,9 @@ function ipred_filter_\bpc\()bpc_neon
|
|||
sxtl v21.8h, v21.8b
|
||||
sxtl v22.8h, v22.8b
|
||||
dup v31.8h, w8
|
||||
.if \bpc == 10
|
||||
movi v30.8h, #0
|
||||
.endif
|
||||
br x5
|
||||
40:
|
||||
ldur d0, [x2, #2] // top (0-3)
|
||||
|
@ -1421,7 +1423,6 @@ function ipred_filter_\bpc\()bpc_neon
|
|||
smin v2.8h, v2.8h, v31.8h
|
||||
subs w4, w4, #2
|
||||
st1 {v2.d}[0], [x0], x1
|
||||
uxtl v0.8h, v2.8b
|
||||
ext v0.16b, v2.16b, v2.16b, #8 // move top from [4-7] to [0-3]
|
||||
st1 {v2.d}[1], [x6], x1
|
||||
b.gt 4b
|
||||
|
@ -2143,9 +2144,9 @@ function ipred_cfl_16bpc_neon, export=1
|
|||
L(ipred_cfl_h4):
|
||||
ld1 {v0.4h}, [x2], #8
|
||||
uaddlv s0, v0.4h
|
||||
add x2, x2, #2
|
||||
br x9
|
||||
L(ipred_cfl_w4):
|
||||
add x2, x2, #2
|
||||
ld1 {v2.4h}, [x2]
|
||||
add v0.2s, v0.2s, v16.2s
|
||||
uaddlv s2, v2.4h
|
||||
|
@ -2168,9 +2169,9 @@ L(ipred_cfl_w4):
|
|||
L(ipred_cfl_h8):
|
||||
ld1 {v0.8h}, [x2], #16
|
||||
uaddlv s0, v0.8h
|
||||
add x2, x2, #2
|
||||
br x9
|
||||
L(ipred_cfl_w8):
|
||||
add x2, x2, #2
|
||||
ld1 {v2.8h}, [x2]
|
||||
add v0.2s, v0.2s, v16.2s
|
||||
uaddlv s2, v2.8h
|
||||
|
@ -2193,10 +2194,10 @@ L(ipred_cfl_w8):
|
|||
L(ipred_cfl_h16):
|
||||
ld1 {v2.8h, v3.8h}, [x2], #32
|
||||
addp v0.8h, v2.8h, v3.8h
|
||||
add x2, x2, #2
|
||||
uaddlv s0, v0.8h
|
||||
br x9
|
||||
L(ipred_cfl_w16):
|
||||
add x2, x2, #2
|
||||
ld1 {v2.8h, v3.8h}, [x2]
|
||||
add v0.2s, v0.2s, v16.2s
|
||||
addp v2.8h, v2.8h, v3.8h
|
||||
|
@ -2222,10 +2223,10 @@ L(ipred_cfl_h32):
|
|||
addp v2.8h, v2.8h, v3.8h
|
||||
addp v4.8h, v4.8h, v5.8h
|
||||
addp v0.8h, v2.8h, v4.8h
|
||||
add x2, x2, #2
|
||||
uaddlv s0, v0.8h
|
||||
br x9
|
||||
L(ipred_cfl_w32):
|
||||
add x2, x2, #2
|
||||
ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
|
||||
add v0.4s, v0.4s, v16.4s
|
||||
addp v2.8h, v2.8h, v3.8h
|
||||
|
|
|
@ -718,7 +718,7 @@ def_fn_4x4 identity, flipadst
|
|||
rshrn_sz \r7, v4, v5, #12, \sz // t7a
|
||||
smull_smlal v2, v3, \r5, \r3, v0.h[7], v0.h[6], \sz // -> t6a
|
||||
rshrn_sz \r3, v6, v7, #12, \sz // t5a
|
||||
rshrn_sz \r5, v2, v3, #12, \sz // taa
|
||||
rshrn_sz \r5, v2, v3, #12, \sz // t6a
|
||||
|
||||
sqadd v2\sz, \r1\sz, \r3\sz // t4
|
||||
sqsub \r1\sz, \r1\sz, \r3\sz // t5a
|
||||
|
@ -1085,7 +1085,7 @@ def_fns_48 8, 4
|
|||
|
||||
rshrn_sz v4, v4, v5, #12, \sz // t11
|
||||
rshrn_sz v5, v6, v7, #12, \sz // t12
|
||||
smull_smlal v6, v7, v25, v21, v0.h[0], v0.h[0], \sz // -> t10a
|
||||
smull_smlal v6, v7, v25, v21, v0.h[0], v0.h[0], \sz // -> t13a
|
||||
rshrn_sz v2, v2, v3, #12, \sz // t10a
|
||||
rshrn_sz v3, v6, v7, #12, \sz // t13a
|
||||
|
||||
|
@ -3002,29 +3002,6 @@ function inv_txfm_add_vert_dct_8x64_neon
|
|||
br x14
|
||||
endfunc
|
||||
|
||||
.macro sub_sp space
|
||||
#ifdef _WIN32
|
||||
.if \space > 8192
|
||||
// Here, we'd need to touch two (or more) pages while decrementing
|
||||
// the stack pointer.
|
||||
.error "sub_sp_align doesn't support values over 8K at the moment"
|
||||
.elseif \space > 4096
|
||||
sub x16, sp, #4096
|
||||
ldr xzr, [x16]
|
||||
sub sp, x16, #(\space - 4096)
|
||||
.else
|
||||
sub sp, sp, #\space
|
||||
.endif
|
||||
#else
|
||||
.if \space >= 4096
|
||||
sub sp, sp, #(\space)/4096*4096
|
||||
.endif
|
||||
.if (\space % 4096) != 0
|
||||
sub sp, sp, #(\space)%4096
|
||||
.endif
|
||||
#endif
|
||||
.endm
|
||||
|
||||
function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1
|
||||
idct_dc 64, 64, 2
|
||||
|
||||
|
@ -3149,7 +3126,9 @@ function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1
|
|||
mov w8, #(32 - \i)
|
||||
cmp w3, w12
|
||||
b.lt 1f
|
||||
.if \i < 24
|
||||
ldrh w12, [x13], #2
|
||||
.endif
|
||||
.endif
|
||||
add x7, x2, #(\i*2)
|
||||
mov x8, #32*2
|
||||
|
@ -3254,7 +3233,9 @@ function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1
|
|||
mov w8, #(32 - \i)
|
||||
cmp w3, w12
|
||||
b.lt 1f
|
||||
.if \i < 24
|
||||
ldrh w12, [x13], #2
|
||||
.endif
|
||||
.endif
|
||||
add x7, x2, #(\i*2)
|
||||
mov x8, #32*2
|
||||
|
|
|
@ -124,7 +124,7 @@ endconst
|
|||
.endif
|
||||
.endm
|
||||
|
||||
.macro load_add_store load, shift, addsrc, adddst, max, min, store, dst, src, shiftbits=4
|
||||
.macro load_add_store load, shift, addsrc, adddst, min, store, dst, src, shiftbits=4
|
||||
.ifnb \load
|
||||
ld1 {\load}, [\src], x1
|
||||
.endif
|
||||
|
@ -132,10 +132,7 @@ endconst
|
|||
srshr \shift, \shift, #\shiftbits
|
||||
.endif
|
||||
.ifnb \addsrc
|
||||
sqadd \adddst, \adddst, \addsrc
|
||||
.endif
|
||||
.ifnb \max
|
||||
smax \max, \max, v6.8h
|
||||
usqadd \adddst, \addsrc
|
||||
.endif
|
||||
.ifnb \min
|
||||
smin \min, \min, v7.8h
|
||||
|
@ -146,63 +143,57 @@ endconst
|
|||
.endm
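Several changes in this file (and in the cdef code above) replace a sqadd followed by an smax against zero with a single usqadd. An unsigned saturating accumulate of a signed value performs the add and the clamp at zero in one step, so only the smin against the bitdepth maximum remains separate. A scalar sketch for one 16-bit lane follows; the helper names are illustrative, not actual dav1d functions.

```c
#include <stdint.h>

/* Model of usqadd on a single 16-bit lane: the destination is unsigned, the
 * addend is signed, and the result saturates to [0, 0xffff]. */
static uint16_t usqadd_h(uint16_t dst, int16_t add)
{
    int32_t v = (int32_t)dst + add;
    if (v < 0)       v = 0;
    if (v > 0xffff)  v = 0xffff;
    return (uint16_t)v;
}

/* 10-bit reconstruction as done in the rewritten code: the clamp at zero
 * comes for free from usqadd, leaving only the smin against 0x3ff. */
static uint16_t recon_10bpc(uint16_t px, int16_t residual)
{
    uint16_t v = usqadd_h(px, residual);
    return v > 0x3ff ? 0x3ff : v;
}
```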
|
||||
.macro load_add_store_8x16 dst, src
|
||||
mov \src, \dst
|
||||
movi v6.8h, #0
|
||||
mvni v7.8h, #0xfc, lsl #8 // 0x3ff
|
||||
load_add_store v2.8h, v16.8h, , , , , , \dst, \src
|
||||
load_add_store v3.8h, v17.8h, , , , , , \dst, \src
|
||||
load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src
|
||||
load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src
|
||||
load_add_store v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src
|
||||
load_add_store v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src
|
||||
load_add_store v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src
|
||||
load_add_store v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src
|
||||
load_add_store v2.8h, v24.8h, v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src
|
||||
load_add_store v3.8h, v25.8h, v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src
|
||||
load_add_store v4.8h, v26.8h, v2.8h, v24.8h, v23.8h, v22.8h, v21.8h, \dst, \src
|
||||
load_add_store v5.8h, v27.8h, v3.8h, v25.8h, v24.8h, v23.8h, v22.8h, \dst, \src
|
||||
load_add_store v2.8h, v28.8h, v4.8h, v26.8h, v25.8h, v24.8h, v23.8h, \dst, \src
|
||||
load_add_store v3.8h, v29.8h, v5.8h, v27.8h, v26.8h, v25.8h, v24.8h, \dst, \src
|
||||
load_add_store v4.8h, v30.8h, v2.8h, v28.8h, v27.8h, v26.8h, v25.8h, \dst, \src
|
||||
load_add_store v5.8h, v31.8h, v3.8h, v29.8h, v28.8h, v27.8h, v26.8h, \dst, \src
|
||||
load_add_store , , v4.8h, v30.8h, v29.8h, v28.8h, v27.8h, \dst, \src
|
||||
load_add_store , , v5.8h, v31.8h, v30.8h, v29.8h, v28.8h, \dst, \src
|
||||
load_add_store , , , , v31.8h, v30.8h, v29.8h, \dst, \src
|
||||
load_add_store , , , , , v31.8h, v30.8h, \dst, \src
|
||||
load_add_store , , , , , , v31.8h, \dst, \src
|
||||
load_add_store v2.8h, v16.8h, , , , , \dst, \src
|
||||
load_add_store v3.8h, v17.8h, , , , , \dst, \src
|
||||
load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src
|
||||
load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src
|
||||
load_add_store v16.8h, v20.8h, v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src
|
||||
load_add_store v17.8h, v21.8h, v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src
|
||||
load_add_store v18.8h, v22.8h, v20.8h, v16.8h, v5.8h, v4.8h, \dst, \src
|
||||
load_add_store v19.8h, v23.8h, v21.8h, v17.8h, v16.8h, v5.8h, \dst, \src
|
||||
load_add_store v20.8h, v24.8h, v22.8h, v18.8h, v17.8h, v16.8h, \dst, \src
|
||||
load_add_store v21.8h, v25.8h, v23.8h, v19.8h, v18.8h, v17.8h, \dst, \src
|
||||
load_add_store v22.8h, v26.8h, v24.8h, v20.8h, v19.8h, v18.8h, \dst, \src
|
||||
load_add_store v23.8h, v27.8h, v25.8h, v21.8h, v20.8h, v19.8h, \dst, \src
|
||||
load_add_store v24.8h, v28.8h, v26.8h, v22.8h, v21.8h, v20.8h, \dst, \src
|
||||
load_add_store v25.8h, v29.8h, v27.8h, v23.8h, v22.8h, v21.8h, \dst, \src
|
||||
load_add_store v26.8h, v30.8h, v28.8h, v24.8h, v23.8h, v22.8h, \dst, \src
|
||||
load_add_store v27.8h, v31.8h, v29.8h, v25.8h, v24.8h, v23.8h, \dst, \src
|
||||
load_add_store , , v30.8h, v26.8h, v25.8h, v24.8h, \dst, \src
|
||||
load_add_store , , v31.8h, v27.8h, v26.8h, v25.8h, \dst, \src
|
||||
load_add_store , , , , v27.8h, v26.8h, \dst, \src
|
||||
load_add_store , , , , , v27.8h, \dst, \src
|
||||
.endm
|
||||
.macro load_add_store_8x8 dst, src, shiftbits=4
|
||||
mov \src, \dst
|
||||
movi v6.8h, #0
|
||||
mvni v7.8h, #0xfc, lsl #8 // 0x3ff
|
||||
load_add_store v2.8h, v16.8h, , , , , , \dst, \src, \shiftbits
|
||||
load_add_store v3.8h, v17.8h, , , , , , \dst, \src, \shiftbits
|
||||
load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src, \shiftbits
|
||||
load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src, \shiftbits
|
||||
load_add_store v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src, \shiftbits
|
||||
load_add_store v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits
|
||||
load_add_store v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
|
||||
load_add_store v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src, \shiftbits
|
||||
load_add_store , , v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src, \shiftbits
|
||||
load_add_store , , v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src, \shiftbits
|
||||
load_add_store , , , , v23.8h, v22.8h, v21.8h, \dst, \src, \shiftbits
|
||||
load_add_store , , , , , v23.8h, v22.8h, \dst, \src, \shiftbits
|
||||
load_add_store , , , , , , v23.8h, \dst, \src, \shiftbits
|
||||
load_add_store v2.8h, v16.8h, , , , , \dst, \src, \shiftbits
|
||||
load_add_store v3.8h, v17.8h, , , , , \dst, \src, \shiftbits
|
||||
load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src, \shiftbits
|
||||
load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src, \shiftbits
|
||||
load_add_store v16.8h, v20.8h, v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src, \shiftbits
|
||||
load_add_store v17.8h, v21.8h, v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src, \shiftbits
|
||||
load_add_store v18.8h, v22.8h, v20.8h, v16.8h, v5.8h, v4.8h, \dst, \src, \shiftbits
|
||||
load_add_store v19.8h, v23.8h, v21.8h, v17.8h, v16.8h, v5.8h, \dst, \src, \shiftbits
|
||||
load_add_store , , v22.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits
|
||||
load_add_store , , v23.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
|
||||
load_add_store , , , , v19.8h, v18.8h, \dst, \src, \shiftbits
|
||||
load_add_store , , , , , v19.8h, \dst, \src, \shiftbits
|
||||
.endm
|
||||
.macro load_add_store_8x4 dst, src, shiftbits=4
|
||||
mov \src, \dst
|
||||
movi v6.8h, #0
|
||||
mvni v7.8h, #0xfc, lsl #8 // 0x3ff
|
||||
load_add_store v2.8h, v16.8h, , , , , , \dst, \src, \shiftbits
|
||||
load_add_store v3.8h, v17.8h, , , , , , \dst, \src, \shiftbits
|
||||
load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src, \shiftbits
|
||||
load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src, \shiftbits
|
||||
load_add_store , , v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src, \shiftbits
|
||||
load_add_store , , v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits
|
||||
load_add_store , , , , v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
|
||||
load_add_store , , , , , v19.8h, v18.8h, \dst, \src, \shiftbits
|
||||
load_add_store , , , , , , v19.8h, \dst, \src, \shiftbits
|
||||
load_add_store v2.8h, v16.8h, , , , , \dst, \src, \shiftbits
|
||||
load_add_store v3.8h, v17.8h, , , , , \dst, \src, \shiftbits
|
||||
load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src, \shiftbits
|
||||
load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src, \shiftbits
|
||||
load_add_store , , v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src, \shiftbits
|
||||
load_add_store , , v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src, \shiftbits
|
||||
load_add_store , , , , v5.8h, v4.8h, \dst, \src, \shiftbits
|
||||
load_add_store , , , , , v5.8h, \dst, \src, \shiftbits
|
||||
.endm
|
||||
.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, max, min, store, dst, src
|
||||
.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, min, store, dst, src
|
||||
.ifnb \load
|
||||
ld1 {\load}[0], [\src], x1
|
||||
.endif
|
||||
|
@ -216,14 +207,11 @@ endconst
|
|||
ld1 {\load}[1], [\src], x1
|
||||
.endif
|
||||
.ifnb \addsrc
|
||||
sqadd \adddst, \adddst, \addsrc
|
||||
usqadd \adddst, \addsrc
|
||||
.endif
|
||||
.ifnb \store
|
||||
st1 {\store}[0], [\dst], x1
|
||||
.endif
|
||||
.ifnb \max
|
||||
smax \max, \max, v6.8h
|
||||
.endif
|
||||
.ifnb \min
|
||||
smin \min, \min, v7.8h
|
||||
.endif
|
||||
|
@ -233,37 +221,33 @@ endconst
|
|||
.endm
|
||||
.macro load_add_store_4x16 dst, src
|
||||
mov \src, \dst
|
||||
movi v6.8h, #0
|
||||
mvni v7.8h, #0xfc, lsl #8 // 0x3ff
|
||||
load_add_store4 v0.d, v17, v16, , , , , , , \dst, \src
|
||||
load_add_store4 v1.d, v19, v18, , , , , , , \dst, \src
|
||||
load_add_store4 v2.d, v21, v20, v16.8h, , , , , , \dst, \src
|
||||
load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h, , , , \dst, \src
|
||||
load_add_store4 v0.d, v25, v24, v20.8h, v1.8h, v18.8h, v16.8h, , , \dst, \src
|
||||
load_add_store4 v1.d, v27, v26, v22.8h, v2.8h, v20.8h, v18.8h, v16.8h, , \dst, \src
|
||||
load_add_store4 v2.d, v29, v28, v24.8h, v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src
|
||||
load_add_store4 v3.d, v31, v30, v26.8h, v0.8h, v24.8h, v22.8h, v20.8h, v18.d, \dst, \src
|
||||
load_add_store4 , , , v28.8h, v1.8h, v26.8h, v24.8h, v22.8h, v20.d, \dst, \src
|
||||
load_add_store4 , , , v30.8h, v2.8h, v28.8h, v26.8h, v24.8h, v22.d, \dst, \src
|
||||
load_add_store4 , , , , v3.8h, v30.8h, v28.8h, v26.8h, v24.d, \dst, \src
|
||||
load_add_store4 , , , , , , v30.8h, v28.8h, v26.d, \dst, \src
|
||||
load_add_store4 , , , , , , , v30.8h, v28.d, \dst, \src
|
||||
load_add_store4 , , , , , , , , v30.d, \dst, \src
|
||||
load_add_store4 v0.d, v17, v16, , , , , , \dst, \src
|
||||
load_add_store4 v1.d, v19, v18, , , , , , \dst, \src
|
||||
load_add_store4 v2.d, v21, v20, v16.8h, , , , , \dst, \src
|
||||
load_add_store4 v3.d, v23, v22, v18.8h, v16.8h, v0.8h, , , \dst, \src
|
||||
load_add_store4 v17.d, v25, v24, v20.8h, v18.8h, v1.8h, v0.8h, , \dst, \src
|
||||
load_add_store4 v19.d, v27, v26, v22.8h, v20.8h, v2.8h, v1.8h, v0.d, \dst, \src
|
||||
load_add_store4 v21.d, v29, v28, v24.8h, v22.8h, v3.8h, v2.8h, v1.d, \dst, \src
|
||||
load_add_store4 v23.d, v31, v30, v26.8h, v24.8h, v17.8h, v3.8h, v2.d, \dst, \src
|
||||
load_add_store4 , , , v28.8h, v26.8h, v19.8h, v17.8h, v3.d, \dst, \src
|
||||
load_add_store4 , , , v30.8h, v28.8h, v21.8h, v19.8h, v17.d, \dst, \src
|
||||
load_add_store4 , , , , v30.8h, v23.8h, v21.8h, v19.d, \dst, \src
|
||||
load_add_store4 , , , , , , v23.8h, v21.d, \dst, \src
|
||||
load_add_store4 , , , , , , , v23.d, \dst, \src
|
||||
.endm
|
||||
.macro load_add_store_4x8 dst, src
|
||||
mov \src, \dst
|
||||
movi v6.8h, #0
|
||||
mvni v7.8h, #0xfc, lsl #8 // 0x3ff
|
||||
load_add_store4 v0.d, v17, v16, , , , , , , \dst, \src
|
||||
load_add_store4 v1.d, v19, v18, , , , , , , \dst, \src
|
||||
load_add_store4 v2.d, v21, v20, v16.8h, , , , , , \dst, \src
|
||||
load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h, , , , \dst, \src
|
||||
load_add_store4 , , , v20.8h, v1.8h, v18.8h, v16.8h, , , \dst, \src
|
||||
load_add_store4 , , , v22.8h, v2.8h, v20.8h, v18.8h, v16.8h, , \dst, \src
|
||||
load_add_store4 , , , , v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src
|
||||
load_add_store4 , , , , , , v22.8h, v20.8h, v18.d, \dst, \src
|
||||
load_add_store4 , , , , , , , v22.8h, v20.d, \dst, \src
|
||||
load_add_store4 , , , , , , , , v22.d, \dst, \src
|
||||
load_add_store4 v0.d, v17, v16, , , , , , \dst, \src
|
||||
load_add_store4 v1.d, v19, v18, , , , , , \dst, \src
|
||||
load_add_store4 v2.d, v21, v20, v16.8h, , , , , \dst, \src
|
||||
load_add_store4 v3.d, v23, v22, v18.8h, v16.8h, v0.8h, , , \dst, \src
|
||||
load_add_store4 , , , v20.8h, v18.8h, v1.8h, v0.8h, , \dst, \src
|
||||
load_add_store4 , , , v22.8h, v20.8h, v2.8h, v1.8h, v0.d, \dst, \src
|
||||
load_add_store4 , , , , v22.8h, v3.8h, v2.8h, v1.d, \dst, \src
|
||||
load_add_store4 , , , , , , v3.8h, v2.d, \dst, \src
|
||||
load_add_store4 , , , , , , , v3.d, \dst, \src
|
||||
.endm
|
||||
|
||||
.macro idct_dc w, h, shift
|
||||
|
@ -291,7 +275,6 @@ endconst
|
|||
.endm
|
||||
|
||||
function idct_dc_w4_neon
|
||||
movi v30.8h, #0
|
||||
mvni v31.8h, #0xfc, lsl #8 // 0x3ff
|
||||
1:
|
||||
ld1 {v0.d}[0], [x0], x1
|
||||
|
@ -299,11 +282,9 @@ function idct_dc_w4_neon
|
|||
ld1 {v1.d}[0], [x0], x1
|
||||
subs w4, w4, #4
|
||||
ld1 {v1.d}[1], [x0], x1
|
||||
sqadd v0.8h, v0.8h, v16.8h
|
||||
usqadd v0.8h, v16.8h
|
||||
sub x0, x0, x1, lsl #2
|
||||
sqadd v1.8h, v1.8h, v16.8h
|
||||
smax v0.8h, v0.8h, v30.8h
|
||||
smax v1.8h, v1.8h, v30.8h
|
||||
usqadd v1.8h, v16.8h
|
||||
smin v0.8h, v0.8h, v31.8h
|
||||
st1 {v0.d}[0], [x0], x1
|
||||
smin v1.8h, v1.8h, v31.8h
|
||||
|
@ -315,23 +296,18 @@ function idct_dc_w4_neon
|
|||
endfunc
|
||||
|
||||
function idct_dc_w8_neon
|
||||
movi v30.8h, #0
|
||||
mvni v31.8h, #0xfc, lsl #8 // 0x3ff
|
||||
1:
|
||||
ld1 {v0.8h}, [x0], x1
|
||||
subs w4, w4, #4
|
||||
ld1 {v1.8h}, [x0], x1
|
||||
sqadd v0.8h, v0.8h, v16.8h
|
||||
usqadd v0.8h, v16.8h
|
||||
ld1 {v2.8h}, [x0], x1
|
||||
sqadd v1.8h, v1.8h, v16.8h
|
||||
usqadd v1.8h, v16.8h
|
||||
ld1 {v3.8h}, [x0], x1
|
||||
sqadd v2.8h, v2.8h, v16.8h
|
||||
sqadd v3.8h, v3.8h, v16.8h
|
||||
usqadd v2.8h, v16.8h
|
||||
usqadd v3.8h, v16.8h
|
||||
sub x0, x0, x1, lsl #2
|
||||
smax v0.8h, v0.8h, v30.8h
|
||||
smax v1.8h, v1.8h, v30.8h
|
||||
smax v2.8h, v2.8h, v30.8h
|
||||
smax v3.8h, v3.8h, v30.8h
|
||||
smin v0.8h, v0.8h, v31.8h
|
||||
smin v1.8h, v1.8h, v31.8h
|
||||
st1 {v0.8h}, [x0], x1
|
||||
|
@ -345,21 +321,16 @@ function idct_dc_w8_neon
|
|||
endfunc
|
||||
|
||||
function idct_dc_w16_neon
|
||||
movi v30.8h, #0
|
||||
mvni v31.8h, #0xfc, lsl #8 // 0x3ff
|
||||
1:
|
||||
ld1 {v0.8h, v1.8h}, [x0], x1
|
||||
subs w4, w4, #2
|
||||
ld1 {v2.8h, v3.8h}, [x0], x1
|
||||
sqadd v0.8h, v0.8h, v16.8h
|
||||
sqadd v1.8h, v1.8h, v16.8h
|
||||
usqadd v0.8h, v16.8h
|
||||
usqadd v1.8h, v16.8h
|
||||
sub x0, x0, x1, lsl #1
|
||||
sqadd v2.8h, v2.8h, v16.8h
|
||||
sqadd v3.8h, v3.8h, v16.8h
|
||||
smax v0.8h, v0.8h, v30.8h
|
||||
smax v1.8h, v1.8h, v30.8h
|
||||
smax v2.8h, v2.8h, v30.8h
|
||||
smax v3.8h, v3.8h, v30.8h
|
||||
usqadd v2.8h, v16.8h
|
||||
usqadd v3.8h, v16.8h
|
||||
smin v0.8h, v0.8h, v31.8h
|
||||
smin v1.8h, v1.8h, v31.8h
|
||||
smin v2.8h, v2.8h, v31.8h
|
||||
|
@ -371,19 +342,14 @@ function idct_dc_w16_neon
|
|||
endfunc
|
||||
|
||||
function idct_dc_w32_neon
|
||||
movi v30.8h, #0
|
||||
mvni v31.8h, #0xfc, lsl #8 // 0x3ff
|
||||
1:
|
||||
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
|
||||
subs w4, w4, #1
|
||||
sqadd v0.8h, v0.8h, v16.8h
|
||||
sqadd v1.8h, v1.8h, v16.8h
|
||||
sqadd v2.8h, v2.8h, v16.8h
|
||||
sqadd v3.8h, v3.8h, v16.8h
|
||||
smax v0.8h, v0.8h, v30.8h
|
||||
smax v1.8h, v1.8h, v30.8h
|
||||
smax v2.8h, v2.8h, v30.8h
|
||||
smax v3.8h, v3.8h, v30.8h
|
||||
usqadd v0.8h, v16.8h
|
||||
usqadd v1.8h, v16.8h
|
||||
usqadd v2.8h, v16.8h
|
||||
usqadd v3.8h, v16.8h
|
||||
smin v0.8h, v0.8h, v31.8h
|
||||
smin v1.8h, v1.8h, v31.8h
|
||||
smin v2.8h, v2.8h, v31.8h
|
||||
|
@ -394,30 +360,21 @@ function idct_dc_w32_neon
|
|||
endfunc
|
||||
|
||||
function idct_dc_w64_neon
|
||||
movi v30.8h, #0
|
||||
mvni v31.8h, #0xfc, lsl #8 // 0x3ff
|
||||
sub x1, x1, #64
|
||||
1:
|
||||
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
|
||||
subs w4, w4, #1
|
||||
sqadd v0.8h, v0.8h, v16.8h
|
||||
usqadd v0.8h, v16.8h
|
||||
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0]
|
||||
sqadd v1.8h, v1.8h, v16.8h
|
||||
usqadd v1.8h, v16.8h
|
||||
sub x0, x0, #64
|
||||
sqadd v2.8h, v2.8h, v16.8h
|
||||
sqadd v3.8h, v3.8h, v16.8h
|
||||
sqadd v4.8h, v4.8h, v16.8h
|
||||
sqadd v5.8h, v5.8h, v16.8h
|
||||
sqadd v6.8h, v6.8h, v16.8h
|
||||
sqadd v7.8h, v7.8h, v16.8h
|
||||
smax v0.8h, v0.8h, v30.8h
|
||||
smax v1.8h, v1.8h, v30.8h
|
||||
smax v2.8h, v2.8h, v30.8h
|
||||
smax v3.8h, v3.8h, v30.8h
|
||||
smax v4.8h, v4.8h, v30.8h
|
||||
smax v5.8h, v5.8h, v30.8h
|
||||
smax v6.8h, v6.8h, v30.8h
|
||||
smax v7.8h, v7.8h, v30.8h
|
||||
usqadd v2.8h, v16.8h
|
||||
usqadd v3.8h, v16.8h
|
||||
usqadd v4.8h, v16.8h
|
||||
usqadd v5.8h, v16.8h
|
||||
usqadd v6.8h, v16.8h
|
||||
usqadd v7.8h, v16.8h
|
||||
smin v0.8h, v0.8h, v31.8h
|
||||
smin v1.8h, v1.8h, v31.8h
|
||||
smin v2.8h, v2.8h, v31.8h
|
||||
|
@ -445,12 +402,12 @@ endfunc
|
|||
|
||||
.macro idct_4 r0, r1, r2, r3
|
||||
mul_mla v6, \r1, \r3, v0.s[3], v0.s[2]
|
||||
mul_mls v4, \r1, \r3, v0.s[2], v0.s[3]
|
||||
mul_mla v2, \r0, \r2, v0.s[0], v0.s[0]
|
||||
mul_mls v4, \r1, \r3, v0.s[2], v0.s[3]
|
||||
mul_mls v3, \r0, \r2, v0.s[0], v0.s[0]
|
||||
srshr v6.4s, v6.4s, #12
|
||||
srshr v7.4s, v4.4s, #12
|
||||
srshr v2.4s, v2.4s, #12
|
||||
srshr v7.4s, v4.4s, #12
|
||||
srshr v3.4s, v3.4s, #12
|
||||
sqadd \r0\().4s, v2.4s, v6.4s
|
||||
sqsub \r3\().4s, v2.4s, v6.4s
|
||||
|
@ -575,16 +532,14 @@ function inv_txfm_add_4x4_neon
|
|||
L(itx_4x4_end):
|
||||
mvni v31.8h, #0xfc, lsl #8 // 0x3ff
|
||||
sub x0, x0, x1, lsl #2
|
||||
sqadd v16.8h, v16.8h, v0.8h
|
||||
sqadd v18.8h, v18.8h, v1.8h
|
||||
smax v16.8h, v16.8h, v30.8h
|
||||
smax v18.8h, v18.8h, v30.8h
|
||||
smin v16.8h, v16.8h, v31.8h
|
||||
st1 {v16.d}[0], [x0], x1
|
||||
smin v18.8h, v18.8h, v31.8h
|
||||
st1 {v16.d}[1], [x0], x1
|
||||
st1 {v18.d}[0], [x0], x1
|
||||
st1 {v18.d}[1], [x0], x1
|
||||
usqadd v0.8h, v16.8h
|
||||
usqadd v1.8h, v18.8h
|
||||
smin v0.8h, v0.8h, v31.8h
|
||||
st1 {v0.d}[0], [x0], x1
|
||||
smin v1.8h, v1.8h, v31.8h
|
||||
st1 {v0.d}[1], [x0], x1
|
||||
st1 {v1.d}[0], [x0], x1
|
||||
st1 {v1.d}[1], [x0], x1
|
||||
|
||||
br x15
|
||||
endfunc
|
||||
|
@ -647,7 +602,7 @@ def_fn_4x4 identity, flipadst
|
|||
srshr \r1\().4s, v2.4s, #12 // t4a
|
||||
srshr \r7\().4s, v4.4s, #12 // t7a
|
||||
srshr \r3\().4s, v6.4s, #12 // t5a
|
||||
srshr \r5\().4s, v7.4s, #12 // taa
|
||||
srshr \r5\().4s, v7.4s, #12 // t6a
|
||||
|
||||
sqadd v2.4s, \r1\().4s, \r3\().4s // t4
|
||||
sqsub \r1\().4s, \r1\().4s, \r3\().4s // t5a
|
||||
|
@ -1052,7 +1007,7 @@ function inv_dct_4s_x16_neon
|
|||
|
||||
srshr v4.4s, v4.4s, #12 // t11
|
||||
srshr v5.4s, v6.4s, #12 // t12
|
||||
mul_mla v6, v25, v21, v0.s[0], v0.s[0] // -> t10a
|
||||
mul_mla v6, v25, v21, v0.s[0], v0.s[0] // -> t13a
|
||||
srshr v2.4s, v2.4s, #12 // t10a
|
||||
srshr v3.4s, v6.4s, #12 // t13a
|
||||
|
||||
|
@ -1488,10 +1443,10 @@ function inv_txfm_add_4x16_neon
|
|||
st1 {v2.4s}, [x6], x11
|
||||
.endr
|
||||
blr x4
|
||||
rshrn v28.4h, v16.4s, #1
|
||||
rshrn v29.4h, v17.4s, #1
|
||||
rshrn v30.4h, v18.4s, #1
|
||||
rshrn v31.4h, v19.4s, #1
|
||||
sqrshrn v28.4h, v16.4s, #1
|
||||
sqrshrn v29.4h, v17.4s, #1
|
||||
sqrshrn v30.4h, v18.4s, #1
|
||||
sqrshrn v31.4h, v19.4s, #1
|
||||
transpose_4x4h v28, v29, v30, v31, v4, v5, v6, v7
|
||||
|
||||
b 2f
|
||||
|
@ -1511,10 +1466,10 @@ function inv_txfm_add_4x16_neon
|
|||
st1 {v2.4s}, [x6], x11
|
||||
.endr
|
||||
blr x4
|
||||
rshrn v24.4h, v16.4s, #1
|
||||
rshrn v25.4h, v17.4s, #1
|
||||
rshrn v26.4h, v18.4s, #1
|
||||
rshrn v27.4h, v19.4s, #1
|
||||
sqrshrn v24.4h, v16.4s, #1
|
||||
sqrshrn v25.4h, v17.4s, #1
|
||||
sqrshrn v26.4h, v18.4s, #1
|
||||
sqrshrn v27.4h, v19.4s, #1
|
||||
transpose_4x4h v24, v25, v26, v27, v4, v5, v6, v7
|
||||
|
||||
b 2f
|
||||
|
@ -1533,10 +1488,10 @@ function inv_txfm_add_4x16_neon
|
|||
st1 {v2.4s}, [x6], x11
|
||||
.endr
|
||||
blr x4
|
||||
rshrn v20.4h, v16.4s, #1
|
||||
rshrn v21.4h, v17.4s, #1
|
||||
rshrn v22.4h, v18.4s, #1
|
||||
rshrn v23.4h, v19.4s, #1
|
||||
sqrshrn v20.4h, v16.4s, #1
|
||||
sqrshrn v21.4h, v17.4s, #1
|
||||
sqrshrn v22.4h, v18.4s, #1
|
||||
sqrshrn v23.4h, v19.4s, #1
|
||||
transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7
|
||||
|
||||
b 2f
|
||||
|
@ -1552,10 +1507,10 @@ function inv_txfm_add_4x16_neon
|
|||
st1 {v2.4s}, [x2], x11
|
||||
.endr
|
||||
blr x4
|
||||
rshrn v16.4h, v16.4s, #1
|
||||
rshrn v17.4h, v17.4s, #1
|
||||
rshrn v18.4h, v18.4s, #1
|
||||
rshrn v19.4h, v19.4s, #1
|
||||
sqrshrn v16.4h, v16.4s, #1
|
||||
sqrshrn v17.4h, v17.4s, #1
|
||||
sqrshrn v18.4h, v18.4s, #1
|
||||
sqrshrn v19.4h, v19.4s, #1
|
||||
transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
|
||||
|
||||
blr x5
|
||||
|
@ -2219,7 +2174,6 @@ function inv_txfm_add_vert_dct_8x32_neon
|
|||
|
||||
neg x9, x8
|
||||
mov x10, x6
|
||||
movi v0.8h, #0
|
||||
mvni v1.8h, #0xfc, lsl #8 // 0x3ff
|
||||
.macro combine r0, r1, r2, r3, op, stride
|
||||
ld1 {v5.8h}, [x7], \stride
|
||||
|
@ -2231,27 +2185,23 @@ function inv_txfm_add_vert_dct_8x32_neon
|
|||
ld1 {v4.8h}, [x10], x1
|
||||
srshr v5.8h, v5.8h, #4
|
||||
\op v6.8h, v6.8h, \r1
|
||||
sqadd v5.8h, v5.8h, v2.8h
|
||||
usqadd v2.8h, v5.8h
|
||||
srshr v6.8h, v6.8h, #4
|
||||
\op v7.8h, v7.8h, \r2
|
||||
smax v2.8h, v5.8h, v0.8h
|
||||
ld1 {v5.8h}, [x7], \stride
|
||||
sqadd v6.8h, v6.8h, v3.8h
|
||||
usqadd v3.8h, v6.8h
|
||||
smin v2.8h, v2.8h, v1.8h
|
||||
srshr v7.8h, v7.8h, #4
|
||||
\op v5.8h, v5.8h, \r3
|
||||
st1 {v2.8h}, [x6], x1
|
||||
ld1 {v2.8h}, [x10], x1
|
||||
smax v3.8h, v6.8h, v0.8h
|
||||
sqadd v7.8h, v7.8h, v4.8h
|
||||
usqadd v4.8h, v7.8h
|
||||
smin v3.8h, v3.8h, v1.8h
|
||||
srshr v5.8h, v5.8h, #4
|
||||
st1 {v3.8h}, [x6], x1
|
||||
smax v4.8h, v7.8h, v0.8h
|
||||
sqadd v5.8h, v5.8h, v2.8h
|
||||
usqadd v2.8h, v5.8h
|
||||
smin v4.8h, v4.8h, v1.8h
|
||||
st1 {v4.8h}, [x6], x1
|
||||
smax v2.8h, v5.8h, v0.8h
|
||||
smin v2.8h, v2.8h, v1.8h
|
||||
st1 {v2.8h}, [x6], x1
|
||||
.endm
|
||||
|
@ -2652,7 +2602,9 @@ function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1
|
|||
mov w8, #(16 - \i)
|
||||
cmp w3, w12
|
||||
b.lt 1f
|
||||
.if \i < 12
|
||||
ldrh w12, [x13], #2
|
||||
.endif
|
||||
.endif
|
||||
mov x8, #4*16
|
||||
bl inv_txfm_horz_scale_dct_32x4_neon
|
||||
|
@ -3195,7 +3147,6 @@ function inv_txfm_add_vert_dct_8x64_neon
|
|||
ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
|
||||
ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11
|
||||
|
||||
movi v6.8h, #0
|
||||
mvni v7.8h, #0xfc, lsl #8 // 0x3ff
|
||||
.macro add_dest_addsub src0, src1, src2, src3
|
||||
ld1 {v0.8h}, [x6], x1
|
||||
|
@ -3211,18 +3162,14 @@ function inv_txfm_add_vert_dct_8x64_neon
|
|||
srshr v4.8h, v4.8h, #4
|
||||
srshr v5.8h, v5.8h, #4
|
||||
srshr \src0, \src0, #4
|
||||
sqadd v0.8h, v0.8h, v4.8h
|
||||
usqadd v0.8h, v4.8h
|
||||
srshr \src2, \src2, #4
|
||||
sqadd v1.8h, v1.8h, \src0
|
||||
sqadd v2.8h, v2.8h, v5.8h
|
||||
smax v0.8h, v0.8h, v6.8h
|
||||
sqadd v3.8h, v3.8h, \src2
|
||||
smax v1.8h, v1.8h, v6.8h
|
||||
usqadd v1.8h, \src0
|
||||
usqadd v2.8h, v5.8h
|
||||
smin v0.8h, v0.8h, v7.8h
|
||||
smax v2.8h, v2.8h, v6.8h
|
||||
usqadd v3.8h, \src2
|
||||
smin v1.8h, v1.8h, v7.8h
|
||||
st1 {v0.8h}, [x6], x1
|
||||
smax v3.8h, v3.8h, v6.8h
|
||||
smin v2.8h, v2.8h, v7.8h
|
||||
st1 {v1.8h}, [x9], x10
|
||||
smin v3.8h, v3.8h, v7.8h
|
||||
|
@ -3240,29 +3187,6 @@ function inv_txfm_add_vert_dct_8x64_neon
|
|||
br x14
|
||||
endfunc
|
||||
|
||||
.macro sub_sp space
|
||||
#ifdef _WIN32
|
||||
.if \space > 8192
|
||||
// Here, we'd need to touch two (or more) pages while decrementing
|
||||
// the stack pointer.
|
||||
.error "sub_sp_align doesn't support values over 8K at the moment"
|
||||
.elseif \space > 4096
|
||||
sub x16, sp, #4096
|
||||
ldr xzr, [x16]
|
||||
sub sp, x16, #(\space - 4096)
|
||||
.else
|
||||
sub sp, sp, #\space
|
||||
.endif
|
||||
#else
|
||||
.if \space >= 4096
|
||||
sub sp, sp, #(\space)/4096*4096
|
||||
.endif
|
||||
.if (\space % 4096) != 0
|
||||
sub sp, sp, #(\space)%4096
|
||||
.endif
|
||||
#endif
|
||||
.endm
|
||||
|
||||
function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1
|
||||
idct_dc 64, 64, 2
|
||||
|
||||
|
@ -3492,7 +3416,9 @@ function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1
|
|||
mov w8, #(32 - \i)
|
||||
cmp w3, w12
|
||||
b.lt 1f
|
||||
.if \i < 28
|
||||
ldrh w12, [x13], #2
|
||||
.endif
|
||||
.endif
|
||||
add x7, x2, #(\i*4)
|
||||
mov x8, #32*4
|
||||
|
|
|
@ -132,12 +132,11 @@ function lpf_16_wd\wd\()_neon
|
|||
.endif
|
||||
b.eq 1f // skip wd == 4 case
|
||||
.endif
|
||||
|
||||
usubl v2.8h, v22.8b, v25.8b // p1 - q1
|
||||
usubl2 v3.8h, v22.16b, v25.16b
|
||||
movi v3.16b, #128
|
||||
eor v2.16b, v22.16b, v3.16b // p1 - 128
|
||||
eor v3.16b, v25.16b, v3.16b // q1 - 128
|
||||
cmhi v0.16b, v0.16b, v12.16b // hev
|
||||
sqxtn v2.8b, v2.8h // iclip_diff(p1 - q1)
|
||||
sqxtn2 v2.16b, v3.8h
|
||||
sqsub v2.16b, v2.16b, v3.16b // iclip_diff(p1 - q1)
|
||||
and v4.16b, v2.16b, v0.16b // if (hev) iclip_diff(p1 - q1)
|
||||
bic v0.16b, v1.16b, v0.16b // (fm && wd >= 4 && !hev)
|
||||
usubl v2.8h, v24.8b, v23.8b
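The rewritten iclip_diff(p1 - q1) above avoids widening to 16 bits by biasing both pixels with an XOR of 128 and using a saturating signed subtract. A one-lane C sketch of why that works (hypothetical helper name):

```c
#include <stdint.h>

/* (p ^ 0x80) reinterpreted as int8 equals p - 128, so a saturating signed
 * subtract of the two biased values yields clamp(p1 - q1, -128, 127), i.e.
 * iclip_diff() for 8 bpc, without ever widening to 16 bits. */
static int8_t iclip_diff_8bpc(uint8_t p1, uint8_t q1)
{
    int a = (int8_t)(p1 ^ 0x80);   /* p1 - 128          */
    int b = (int8_t)(q1 ^ 0x80);   /* q1 - 128          */
    int d = a - b;                 /* == p1 - q1        */
    if (d < -128) d = -128;        /* sqsub saturation  */
    if (d >  127) d =  127;
    return (int8_t)d;
}
```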
|
||||
|
@ -155,35 +154,23 @@ function lpf_16_wd\wd\()_neon
|
|||
sqadd v5.16b, v7.16b, v2.16b // imin(f + 3, 127)
|
||||
sshr v4.16b, v4.16b, #3 // f1
|
||||
sshr v5.16b, v5.16b, #3 // f2
|
||||
uxtl v2.8h, v23.8b // p0
|
||||
uxtl2 v3.8h, v23.16b
|
||||
uxtl v6.8h, v24.8b // q0
|
||||
uxtl2 v7.8h, v24.16b
|
||||
saddw v2.8h, v2.8h, v5.8b
|
||||
saddw2 v3.8h, v3.8h, v5.16b
|
||||
ssubw v6.8h, v6.8h, v4.8b
|
||||
ssubw2 v7.8h, v7.8h, v4.16b
|
||||
mov v2.16b, v23.16b // p0
|
||||
mov v3.16b, v24.16b // q0
|
||||
neg v6.16b, v4.16b // -f1
|
||||
srshr v4.16b, v4.16b, #1 // (f1 + 1) >> 1
|
||||
sqxtun v2.8b, v2.8h // out p0
|
||||
sqxtun2 v2.16b, v3.8h
|
||||
sqxtun v6.8b, v6.8h // out q0
|
||||
sqxtun2 v6.16b, v7.8h
|
||||
// p0 + f2, q0 - f1
|
||||
usqadd v2.16b, v5.16b // out p0
|
||||
usqadd v3.16b, v6.16b // out q0
|
||||
neg v6.16b, v4.16b // -((f1 + 1) >> 1)
|
||||
bit v23.16b, v2.16b, v1.16b // if (fm && wd >= 4)
|
||||
uxtl v2.8h, v22.8b // p1
|
||||
uxtl2 v3.8h, v22.16b
|
||||
bit v24.16b, v6.16b, v1.16b // if (fm && wd >= 4)
|
||||
uxtl v6.8h, v25.8b // q1
|
||||
uxtl2 v7.8h, v25.16b
|
||||
saddw v2.8h, v2.8h, v4.8b
|
||||
saddw2 v3.8h, v3.8h, v4.16b
|
||||
ssubw v6.8h, v6.8h, v4.8b
|
||||
ssubw2 v7.8h, v7.8h, v4.16b
|
||||
sqxtun v2.8b, v2.8h // out p1
|
||||
sqxtun2 v2.16b, v3.8h
|
||||
sqxtun v6.8b, v6.8h // out q1
|
||||
sqxtun2 v6.16b, v7.8h
|
||||
bit v24.16b, v3.16b, v1.16b // if (fm && wd >= 4)
|
||||
mov v2.16b, v22.16b // p1
|
||||
mov v3.16b, v25.16b // q1
|
||||
// p1 + ((f1 + 1) >> 1), q1 - ((f1 + 1) >> 1)
|
||||
usqadd v2.16b, v4.16b // out p1
|
||||
usqadd v3.16b, v6.16b // out q1
|
||||
bit v22.16b, v2.16b, v0.16b // if (fm && wd >= 4 && !hev)
|
||||
bit v25.16b, v6.16b, v0.16b // if (fm && wd >= 4 && !hev)
|
||||
bit v25.16b, v3.16b, v0.16b // if (fm && wd >= 4 && !hev)
|
||||
1:
|
||||
|
||||
.if \wd == 6
|
||||
|
|
|
@ -150,10 +150,9 @@ function lpf_8_wd\wd\()_neon
|
|||
movi v6.8h, #4
|
||||
add v2.8h, v2.8h, v4.8h
|
||||
smin v2.8h, v2.8h, v3.8h // f = iclip_diff()
|
||||
movi v7.8h, #3
|
||||
smax v2.8h, v2.8h, v9.8h // f = iclip_diff()
|
||||
sqadd v4.8h, v6.8h, v2.8h // f + 4
|
||||
sqadd v5.8h, v7.8h, v2.8h // f + 3
|
||||
sqadd v5.8h, v5.8h, v2.8h // f + 3
|
||||
smin v4.8h, v4.8h, v3.8h // imin(f + 4, 128 << bitdepth_min_8 - 1)
|
||||
smin v5.8h, v5.8h, v3.8h // imin(f + 3, 128 << bitdepth_min_8 - 1)
|
||||
sshr v4.8h, v4.8h, #3 // f1
|
||||
|
|
Diffs of two files not shown because of their large size.
|
@ -2180,16 +2180,7 @@ L(\type\()_8tap_filter_4):
|
|||
lsl \d_strd, \d_strd, #1
|
||||
lsl \s_strd, \s_strd, #1
|
||||
|
||||
ld1 {v28.8b, v29.8b}, [\src], \s_strd
|
||||
uxtl v28.8h, v28.8b
|
||||
uxtl v29.8h, v29.8b
|
||||
mul v24.8h, v28.8h, v0.h[0]
|
||||
.irpc i, 1234567
|
||||
ext v26.16b, v28.16b, v29.16b, #(2*\i)
|
||||
mla v24.8h, v26.8h, v0.h[\i]
|
||||
.endr
|
||||
srshr v16.8h, v24.8h, #2
|
||||
|
||||
bl L(\type\()_8tap_filter_8_first)
|
||||
bl L(\type\()_8tap_filter_8)
|
||||
mov v17.16b, v24.16b
|
||||
mov v18.16b, v25.16b
|
||||
|
@ -2267,16 +2258,7 @@ L(\type\()_8tap_filter_4):
|
|||
lsl \d_strd, \d_strd, #1
|
||||
lsl \s_strd, \s_strd, #1
|
||||
|
||||
ld1 {v28.8b, v29.8b}, [\src], \s_strd
|
||||
uxtl v28.8h, v28.8b
|
||||
uxtl v29.8h, v29.8b
|
||||
mul v24.8h, v28.8h, v0.h[0]
|
||||
.irpc i, 1234567
|
||||
ext v26.16b, v28.16b, v29.16b, #(2*\i)
|
||||
mla v24.8h, v26.8h, v0.h[\i]
|
||||
.endr
|
||||
srshr v16.8h, v24.8h, #2
|
||||
|
||||
bl L(\type\()_8tap_filter_8_first)
|
||||
bl L(\type\()_8tap_filter_8)
|
||||
mov v17.16b, v24.16b
|
||||
mov v18.16b, v25.16b
|
||||
|
@ -2363,6 +2345,28 @@ L(\type\()_8tap_filter_4):
|
|||
0:
|
||||
br x15
|
||||
|
||||
L(\type\()_8tap_filter_8_first):
|
||||
ld1 {v28.8b, v29.8b}, [\src], \s_strd
|
||||
uxtl v28.8h, v28.8b
|
||||
uxtl v29.8h, v29.8b
|
||||
mul v16.8h, v28.8h, v0.h[0]
|
||||
ext v24.16b, v28.16b, v29.16b, #(2*1)
|
||||
ext v25.16b, v28.16b, v29.16b, #(2*2)
|
||||
ext v26.16b, v28.16b, v29.16b, #(2*3)
|
||||
ext v27.16b, v28.16b, v29.16b, #(2*4)
|
||||
mla v16.8h, v24.8h, v0.h[1]
|
||||
mla v16.8h, v25.8h, v0.h[2]
|
||||
mla v16.8h, v26.8h, v0.h[3]
|
||||
mla v16.8h, v27.8h, v0.h[4]
|
||||
ext v24.16b, v28.16b, v29.16b, #(2*5)
|
||||
ext v25.16b, v28.16b, v29.16b, #(2*6)
|
||||
ext v26.16b, v28.16b, v29.16b, #(2*7)
|
||||
mla v16.8h, v24.8h, v0.h[5]
|
||||
mla v16.8h, v25.8h, v0.h[6]
|
||||
mla v16.8h, v26.8h, v0.h[7]
|
||||
srshr v16.8h, v16.8h, #2
|
||||
ret
|
||||
|
||||
L(\type\()_8tap_filter_8):
|
||||
ld1 {v28.8b, v29.8b}, [\sr2], \s_strd
|
||||
ld1 {v30.8b, v31.8b}, [\src], \s_strd
|
||||
|
@ -2916,8 +2920,8 @@ filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6
|
|||
|
||||
.macro load_filter_row dst, src, inc
|
||||
asr w13, \src, #10
|
||||
ldr \dst, [x11, w13, sxtw #3]
|
||||
add \src, \src, \inc
|
||||
ldr \dst, [x11, w13, sxtw #3]
|
||||
.endm
|
||||
|
||||
function warp_filter_horz_neon
|
||||
|
@ -2926,58 +2930,45 @@ function warp_filter_horz_neon
|
|||
ld1 {v16.8b, v17.8b}, [x2], x3
|
||||
|
||||
load_filter_row d0, w12, w7
|
||||
uxtl v16.8h, v16.8b
|
||||
load_filter_row d1, w12, w7
|
||||
uxtl v17.8h, v17.8b
|
||||
load_filter_row d2, w12, w7
|
||||
sxtl v0.8h, v0.8b
|
||||
load_filter_row d3, w12, w7
|
||||
sxtl v1.8h, v1.8b
|
||||
load_filter_row d4, w12, w7
|
||||
sxtl v2.8h, v2.8b
|
||||
load_filter_row d5, w12, w7
|
||||
sxtl v3.8h, v3.8b
|
||||
load_filter_row d6, w12, w7
|
||||
sxtl v4.8h, v4.8b
|
||||
// subtract by 128 to allow using smull
|
||||
eor v16.8b, v16.8b, v22.8b
|
||||
eor v17.8b, v17.8b, v22.8b
|
||||
load_filter_row d7, w12, w7
|
||||
sxtl v5.8h, v5.8b
|
||||
ext v18.16b, v16.16b, v17.16b, #2*1
|
||||
mul v23.8h, v16.8h, v0.8h
|
||||
sxtl v6.8h, v6.8b
|
||||
ext v19.16b, v16.16b, v17.16b, #2*2
|
||||
mul v18.8h, v18.8h, v1.8h
|
||||
sxtl v7.8h, v7.8b
|
||||
ext v20.16b, v16.16b, v17.16b, #2*3
|
||||
mul v19.8h, v19.8h, v2.8h
|
||||
ext v21.16b, v16.16b, v17.16b, #2*4
|
||||
saddlp v23.4s, v23.8h
|
||||
mul v20.8h, v20.8h, v3.8h
|
||||
ext v22.16b, v16.16b, v17.16b, #2*5
|
||||
saddlp v18.4s, v18.8h
|
||||
mul v21.8h, v21.8h, v4.8h
|
||||
saddlp v19.4s, v19.8h
|
||||
mul v22.8h, v22.8h, v5.8h
|
||||
saddlp v20.4s, v20.8h
|
||||
saddlp v21.4s, v21.8h
|
||||
saddlp v22.4s, v22.8h
|
||||
addp v18.4s, v23.4s, v18.4s
|
||||
ext v23.16b, v16.16b, v17.16b, #2*6
|
||||
addp v19.4s, v19.4s, v20.4s
|
||||
mul v23.8h, v23.8h, v6.8h
|
||||
ext v20.16b, v16.16b, v17.16b, #2*7
|
||||
mul v20.8h, v20.8h, v7.8h
|
||||
saddlp v23.4s, v23.8h
|
||||
addp v21.4s, v21.4s, v22.4s
|
||||
saddlp v20.4s, v20.8h
|
||||
addp v20.4s, v23.4s, v20.4s
|
||||
addp v18.4s, v18.4s, v19.4s
|
||||
addp v20.4s, v21.4s, v20.4s
|
||||
|
||||
ext v18.8b, v16.8b, v17.8b, #1
|
||||
ext v19.8b, v16.8b, v17.8b, #2
|
||||
smull v0.8h, v0.8b, v16.8b
|
||||
smull v1.8h, v1.8b, v18.8b
|
||||
ext v18.8b, v16.8b, v17.8b, #3
|
||||
ext v20.8b, v16.8b, v17.8b, #4
|
||||
smull v2.8h, v2.8b, v19.8b
|
||||
smull v3.8h, v3.8b, v18.8b
|
||||
ext v18.8b, v16.8b, v17.8b, #5
|
||||
ext v19.8b, v16.8b, v17.8b, #6
|
||||
smull v4.8h, v4.8b, v20.8b
|
||||
smull v5.8h, v5.8b, v18.8b
|
||||
ext v18.8b, v16.8b, v17.8b, #7
|
||||
smull v6.8h, v6.8b, v19.8b
|
||||
smull v7.8h, v7.8b, v18.8b
|
||||
|
||||
addp v0.8h, v0.8h, v1.8h
|
||||
addp v2.8h, v2.8h, v3.8h
|
||||
addp v4.8h, v4.8h, v5.8h
|
||||
addp v6.8h, v6.8h, v7.8h
|
||||
|
||||
addp v0.8h, v0.8h, v2.8h
|
||||
addp v4.8h, v4.8h, v6.8h
|
||||
|
||||
addp v0.8h, v0.8h, v4.8h
|
||||
|
||||
add w5, w5, w8
|
||||
|
||||
rshrn v16.4h, v18.4s, #3
|
||||
rshrn2 v16.8h, v20.4s, #3
|
||||
|
||||
ret
|
||||
endfunc
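The "subtract by 128 to allow using smull" trick above works because, for an 8-bit value, x ^ 0x80 reinterpreted as signed is exactly x - 128: for example 200 ^ 128 = 72, and 72 as int8 is 200 - 128, while 5 ^ 128 = 133, which as int8 is -123 = 5 - 128. Centering the unsigned pixels this way keeps every operand in signed 8-bit range so a single smull/smlal chain can do the multiply-accumulate; it introduces a constant bias of 128 times the sum of the filter coefficients, which the surrounding warp code compensates with a constant offset later on.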
|
||||
|
||||
|
@ -3002,25 +2993,32 @@ function warp_affine_8x8\t\()_8bpc_neon, export=1
|
|||
lsl x1, x1, #1
|
||||
.endif
|
||||
|
||||
movi v22.8b, #128
|
||||
.ifb \t
|
||||
movi v23.8h, #128
|
||||
.else
|
||||
movi v23.8h, #8, lsl #8
|
||||
.endif
|
||||
|
||||
bl warp_filter_horz_neon
|
||||
mov v24.16b, v16.16b
|
||||
srshr v24.8h, v0.8h, #3
|
||||
bl warp_filter_horz_neon
|
||||
mov v25.16b, v16.16b
|
||||
srshr v25.8h, v0.8h, #3
|
||||
bl warp_filter_horz_neon
|
||||
mov v26.16b, v16.16b
|
||||
srshr v26.8h, v0.8h, #3
|
||||
bl warp_filter_horz_neon
|
||||
mov v27.16b, v16.16b
|
||||
srshr v27.8h, v0.8h, #3
|
||||
bl warp_filter_horz_neon
|
||||
mov v28.16b, v16.16b
|
||||
srshr v28.8h, v0.8h, #3
|
||||
bl warp_filter_horz_neon
|
||||
mov v29.16b, v16.16b
|
||||
srshr v29.8h, v0.8h, #3
|
||||
bl warp_filter_horz_neon
|
||||
mov v30.16b, v16.16b
|
||||
srshr v30.8h, v0.8h, #3
|
||||
|
||||
1:
|
||||
add w14, w6, #512
|
||||
bl warp_filter_horz_neon
|
||||
mov v31.16b, v16.16b
|
||||
srshr v31.8h, v0.8h, #3
|
||||
|
||||
load_filter_row d0, w14, w9
|
||||
load_filter_row d1, w14, w9
|
||||
|
@ -3030,15 +3028,7 @@ function warp_affine_8x8\t\()_8bpc_neon, export=1
|
|||
load_filter_row d5, w14, w9
|
||||
load_filter_row d6, w14, w9
|
||||
load_filter_row d7, w14, w9
|
||||
transpose_8x8b v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
|
||||
sxtl v0.8h, v0.8b
|
||||
sxtl v1.8h, v1.8b
|
||||
sxtl v2.8h, v2.8b
|
||||
sxtl v3.8h, v3.8b
|
||||
sxtl v4.8h, v4.8b
|
||||
sxtl v5.8h, v5.8b
|
||||
sxtl v6.8h, v6.8b
|
||||
sxtl v7.8h, v7.8b
|
||||
transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl
|
||||
|
||||
// This ordering of smull/smlal/smull2/smlal2 is highly
|
||||
// beneficial for Cortex A53 here.
|
||||
|
@ -3066,6 +3056,7 @@ function warp_affine_8x8\t\()_8bpc_neon, export=1
|
|||
sqrshrn2 v16.8h, v17.4s, #\shift
|
||||
mov v27.16b, v28.16b
|
||||
mov v28.16b, v29.16b
|
||||
add v16.8h, v16.8h, v23.8h
|
||||
.ifb \t
|
||||
sqxtun v16.8b, v16.8h
|
||||
.endif
|
||||
|
|
|
@ -3188,8 +3188,8 @@ filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10
|
|||
|
||||
.macro load_filter_row dst, src, inc
|
||||
asr w13, \src, #10
|
||||
ldr \dst, [x11, w13, sxtw #3]
|
||||
add \src, \src, \inc
|
||||
ldr \dst, [x11, w13, sxtw #3]
|
||||
.endm
|
||||
|
||||
function warp_filter_horz_neon
|
||||
|
@ -3343,15 +3343,7 @@ function warp_affine_8x8\t\()_16bpc_neon, export=1
|
|||
load_filter_row d5, w14, w9
|
||||
load_filter_row d6, w14, w9
|
||||
load_filter_row d7, w14, w9
|
||||
transpose_8x8b v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
|
||||
sxtl v0.8h, v0.8b
|
||||
sxtl v1.8h, v1.8b
|
||||
sxtl v2.8h, v2.8b
|
||||
sxtl v3.8h, v3.8b
|
||||
sxtl v4.8h, v4.8b
|
||||
sxtl v5.8h, v5.8b
|
||||
sxtl v6.8h, v6.8b
|
||||
sxtl v7.8h, v7.8b
|
||||
transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl
|
||||
|
||||
// This ordering of smull/smlal/smull2/smlal2 is highly
|
||||
// beneficial for Cortex A53 here.
|
||||
|
|
|
@ -59,33 +59,65 @@
|
|||
#endif
|
||||
.endm
|
||||
|
||||
.macro transpose_8x8b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
|
||||
trn1 \t8\().8b, \r0\().8b, \r1\().8b
|
||||
trn2 \t9\().8b, \r0\().8b, \r1\().8b
|
||||
trn1 \r1\().8b, \r2\().8b, \r3\().8b
|
||||
trn2 \r3\().8b, \r2\().8b, \r3\().8b
|
||||
trn1 \r0\().8b, \r4\().8b, \r5\().8b
|
||||
trn2 \r5\().8b, \r4\().8b, \r5\().8b
|
||||
trn1 \r2\().8b, \r6\().8b, \r7\().8b
|
||||
trn2 \r7\().8b, \r6\().8b, \r7\().8b
|
||||
.macro sub_sp space
|
||||
#ifdef _WIN32
|
||||
.if \space > 8192
|
||||
// Here, we'd need to touch two (or more) pages while decrementing
|
||||
// the stack pointer.
|
||||
.error "sub_sp_align doesn't support values over 8K at the moment"
|
||||
.elseif \space > 4096
|
||||
sub x16, sp, #4096
|
||||
ldr xzr, [x16]
|
||||
sub sp, x16, #(\space - 4096)
|
||||
.else
|
||||
sub sp, sp, #\space
|
||||
.endif
|
||||
#else
|
||||
.if \space >= 4096
|
||||
sub sp, sp, #(\space)/4096*4096
|
||||
.endif
|
||||
.if (\space % 4096) != 0
|
||||
sub sp, sp, #(\space)%4096
|
||||
.endif
|
||||
#endif
|
||||
.endm
|
||||
|
||||
trn1 \r4\().4h, \r0\().4h, \r2\().4h
|
||||
trn2 \r2\().4h, \r0\().4h, \r2\().4h
|
||||
trn1 \r6\().4h, \r5\().4h, \r7\().4h
|
||||
trn2 \r7\().4h, \r5\().4h, \r7\().4h
|
||||
trn1 \r5\().4h, \t9\().4h, \r3\().4h
|
||||
trn2 \t9\().4h, \t9\().4h, \r3\().4h
|
||||
trn1 \r3\().4h, \t8\().4h, \r1\().4h
|
||||
trn2 \t8\().4h, \t8\().4h, \r1\().4h
|
||||
.macro transpose_8x8b_xtl r0, r1, r2, r3, r4, r5, r6, r7, xtl
|
||||
// a0 b0 a1 b1 a2 b2 a3 b3 a4 b4 a5 b5 a6 b6 a7 b7
|
||||
zip1 \r0\().16b, \r0\().16b, \r1\().16b
|
||||
// c0 d0 c1 d1 c2 d2 d3 d3 c4 d4 c5 d5 c6 d6 d7 d7
|
||||
zip1 \r2\().16b, \r2\().16b, \r3\().16b
|
||||
// e0 f0 e1 f1 e2 f2 e3 f3 e4 f4 e5 f5 e6 f6 e7 f7
|
||||
zip1 \r4\().16b, \r4\().16b, \r5\().16b
|
||||
// g0 h0 g1 h1 g2 h2 h3 h3 g4 h4 g5 h5 g6 h6 h7 h7
|
||||
zip1 \r6\().16b, \r6\().16b, \r7\().16b
|
||||
|
||||
trn1 \r0\().2s, \r3\().2s, \r4\().2s
|
||||
trn2 \r4\().2s, \r3\().2s, \r4\().2s
|
||||
trn1 \r1\().2s, \r5\().2s, \r6\().2s
|
||||
trn2 \r5\().2s, \r5\().2s, \r6\().2s
|
||||
trn2 \r6\().2s, \t8\().2s, \r2\().2s
|
||||
trn1 \r2\().2s, \t8\().2s, \r2\().2s
|
||||
trn1 \r3\().2s, \t9\().2s, \r7\().2s
|
||||
trn2 \r7\().2s, \t9\().2s, \r7\().2s
|
||||
// a0 b0 c0 d0 a2 b2 c2 d2 a4 b4 c4 d4 a6 b6 c6 d6
|
||||
trn1 \r1\().8h, \r0\().8h, \r2\().8h
|
||||
// a1 b1 c1 d1 a3 b3 c3 d3 a5 b5 c5 d5 a7 b7 c7 d7
|
||||
trn2 \r3\().8h, \r0\().8h, \r2\().8h
|
||||
// e0 f0 g0 h0 e2 f2 g2 h2 e4 f4 g4 h4 e6 f6 g6 h6
|
||||
trn1 \r5\().8h, \r4\().8h, \r6\().8h
|
||||
// e1 f1 g1 h1 e3 f3 g3 h3 e5 f5 g5 h5 e7 f7 g7 h7
|
||||
trn2 \r7\().8h, \r4\().8h, \r6\().8h
|
||||
|
||||
// a0 b0 c0 d0 e0 f0 g0 h0 a4 b4 c4 d4 e4 f4 g4 h4
|
||||
trn1 \r0\().4s, \r1\().4s, \r5\().4s
|
||||
// a2 b2 c2 d2 e2 f2 g2 h2 a6 b6 c6 d6 e6 f6 g6 h6
|
||||
trn2 \r2\().4s, \r1\().4s, \r5\().4s
|
||||
// a1 b1 c1 d1 e1 f1 g1 h1 a5 b5 c5 d5 e5 f5 g5 h5
|
||||
trn1 \r1\().4s, \r3\().4s, \r7\().4s
|
||||
// a3 b3 c3 d3 e3 f3 g3 h3 a7 b7 c7 d7 e7 f7 g7 h7
|
||||
trn2 \r3\().4s, \r3\().4s, \r7\().4s
|
||||
|
||||
\xtl\()2 \r4\().8h, \r0\().16b
|
||||
\xtl \r0\().8h, \r0\().8b
|
||||
\xtl\()2 \r6\().8h, \r2\().16b
|
||||
\xtl \r2\().8h, \r2\().8b
|
||||
\xtl\()2 \r5\().8h, \r1\().16b
|
||||
\xtl \r1\().8h, \r1\().8b
|
||||
\xtl\()2 \r7\().8h, \r3\().16b
|
||||
\xtl \r3\().8h, \r3\().8b
|
||||
.endm
|
||||
|
||||
.macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
|
||||
|
|
|
@ -55,7 +55,6 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_arm)(Dav1dIntraPredDSPContext *const c
|
|||
|
||||
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
|
||||
|
||||
#if BITDEPTH == 8 || ARCH_AARCH64
|
||||
c->intra_pred[DC_PRED] = BF(dav1d_ipred_dc, neon);
|
||||
c->intra_pred[DC_128_PRED] = BF(dav1d_ipred_dc_128, neon);
|
||||
c->intra_pred[TOP_DC_PRED] = BF(dav1d_ipred_dc_top, neon);
|
||||
|
@ -78,5 +77,4 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_arm)(Dav1dIntraPredDSPContext *const c
|
|||
c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_ipred_cfl_ac_444, neon);
|
||||
|
||||
c->pal_pred = BF(dav1d_pal_pred, neon);
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -119,7 +119,6 @@ COLD void bitfn(dav1d_itx_dsp_init_arm)(Dav1dInvTxfmDSPContext *const c, int bpc
|
|||
|
||||
if (bpc > 10) return;
|
||||
|
||||
#if ARCH_AARCH64 || BITDEPTH == 8
|
||||
assign_itx17_fn( , 4, 4, neon);
|
||||
assign_itx16_fn(R, 4, 8, neon);
|
||||
assign_itx16_fn(R, 4, 16, neon);
|
||||
|
@ -139,5 +138,4 @@ COLD void bitfn(dav1d_itx_dsp_init_arm)(Dav1dInvTxfmDSPContext *const c, int bpc
|
|||
assign_itx1_fn (R, 64, 16, neon);
|
||||
assign_itx1_fn (R, 64, 32, neon);
|
||||
assign_itx1_fn ( , 64, 64, neon);
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -27,7 +27,23 @@
|
|||
|
||||
#include "src/cpu.h"
|
||||
#include "src/looprestoration.h"
|
||||
#include "src/tables.h"
|
||||
|
||||
#if ARCH_AARCH64
|
||||
void BF(dav1d_wiener_filter7, neon)(pixel *p, const ptrdiff_t p_stride,
|
||||
const pixel (*left)[4],
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride,
|
||||
const int w, int h,
|
||||
const LooprestorationParams *const params,
|
||||
const enum LrEdgeFlags edges
|
||||
HIGHBD_DECL_SUFFIX);
|
||||
void BF(dav1d_wiener_filter5, neon)(pixel *p, const ptrdiff_t p_stride,
|
||||
const pixel (*left)[4],
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride,
|
||||
const int w, int h,
|
||||
const LooprestorationParams *const params,
|
||||
const enum LrEdgeFlags edges
|
||||
HIGHBD_DECL_SUFFIX);
|
||||
#else
|
||||
|
||||
// The 8bpc version calculates things slightly differently than the reference
|
||||
// C version. That version calculates roughly this:
|
||||
|
@ -59,16 +75,15 @@ void BF(dav1d_wiener_filter_v, neon)(pixel *dst, ptrdiff_t stride,
|
|||
const int16_t *mid, int w, int h,
|
||||
const int16_t fv[8], enum LrEdgeFlags edges,
|
||||
ptrdiff_t mid_stride HIGHBD_DECL_SUFFIX);
|
||||
void BF(dav1d_copy_narrow, neon)(pixel *dst, ptrdiff_t stride,
|
||||
const pixel *src, int w, int h);
|
||||
|
||||
static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
|
||||
const pixel (*const left)[4],
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride,
|
||||
const int w, const int h,
|
||||
const int16_t filter[2][8],
|
||||
const LooprestorationParams *const params,
|
||||
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
|
||||
{
|
||||
const int16_t (*const filter)[8] = params->filter;
|
||||
ALIGN_STK_16(int16_t, mid, 68 * 384,);
|
||||
int mid_stride = (w + 7) & ~7;
|
||||
|
||||
|
@ -86,23 +101,12 @@ static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
|
|||
HIGHBD_TAIL_SUFFIX);
|
||||
|
||||
// Vertical filter
|
||||
if (w >= 8)
|
||||
BF(dav1d_wiener_filter_v, neon)(dst, dst_stride, &mid[2*mid_stride],
|
||||
w & ~7, h, filter[1], edges,
|
||||
mid_stride * sizeof(*mid)
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
if (w & 7) {
|
||||
// For uneven widths, do a full 8 pixel wide filtering into a temp
|
||||
// buffer and copy out the narrow slice of pixels separately into dest.
|
||||
ALIGN_STK_16(pixel, tmp, 64 * 8,);
|
||||
BF(dav1d_wiener_filter_v, neon)(tmp, (w & 7) * sizeof(pixel),
|
||||
&mid[2*mid_stride + (w & ~7)],
|
||||
w & 7, h, filter[1], edges,
|
||||
mid_stride * sizeof(*mid)
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, tmp, w & 7, h);
|
||||
}
|
||||
BF(dav1d_wiener_filter_v, neon)(dst, dst_stride, &mid[2*mid_stride],
|
||||
w, h, filter[1], edges,
|
||||
mid_stride * sizeof(*mid)
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
}
|
||||
#endif
|
||||
|
||||
void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum,
|
||||
const pixel (*left)[4],
|
||||
|
@ -204,83 +208,50 @@ void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride,
|
|||
const int w, const int h,
|
||||
const int16_t wt[2] HIGHBD_DECL_SUFFIX);
|
||||
|
||||
static void sgr_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
|
||||
const pixel (*const left)[4],
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride,
|
||||
const int w, const int h, const int sgr_idx,
|
||||
const int16_t sgr_wt[7], const enum LrEdgeFlags edges
|
||||
HIGHBD_DECL_SUFFIX)
|
||||
static void sgr_filter_5x5_neon(pixel *const dst, const ptrdiff_t dst_stride,
|
||||
const pixel (*const left)[4],
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride,
|
||||
const int w, const int h,
|
||||
const LooprestorationParams *const params,
|
||||
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
|
||||
{
|
||||
if (!dav1d_sgr_params[sgr_idx][0]) {
|
||||
ALIGN_STK_16(int16_t, tmp, 64 * 384,);
|
||||
dav1d_sgr_filter1_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
|
||||
w, h, dav1d_sgr_params[sgr_idx][3], edges
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
if (w >= 8)
|
||||
BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride,
|
||||
tmp, w & ~7, h, (1 << 7) - sgr_wt[1]
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
if (w & 7) {
|
||||
// For uneven widths, do a full 8 pixel wide filtering into a temp
|
||||
// buffer and copy out the narrow slice of pixels separately into
|
||||
// dest.
|
||||
ALIGN_STK_16(pixel, stripe, 64 * 8,);
|
||||
BF(dav1d_sgr_weighted1, neon)(stripe, (w & 7) * sizeof(pixel),
|
||||
dst + (w & ~7), dst_stride,
|
||||
tmp + (w & ~7), w & 7, h,
|
||||
(1 << 7) - sgr_wt[1]
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, stripe,
|
||||
w & 7, h);
|
||||
}
|
||||
} else if (!dav1d_sgr_params[sgr_idx][1]) {
|
||||
ALIGN_STK_16(int16_t, tmp, 64 * 384,);
|
||||
dav1d_sgr_filter2_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
|
||||
w, h, dav1d_sgr_params[sgr_idx][2], edges
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
if (w >= 8)
|
||||
BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride,
|
||||
tmp, w & ~7, h, sgr_wt[0]
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
if (w & 7) {
|
||||
// For uneven widths, do a full 8 pixel wide filtering into a temp
|
||||
// buffer and copy out the narrow slice of pixels separately into
|
||||
// dest.
|
||||
ALIGN_STK_16(pixel, stripe, 64 * 8,);
|
||||
BF(dav1d_sgr_weighted1, neon)(stripe, (w & 7) * sizeof(pixel),
|
||||
dst + (w & ~7), dst_stride,
|
||||
tmp + (w & ~7), w & 7, h, sgr_wt[0]
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, stripe,
|
||||
w & 7, h);
|
||||
}
|
||||
} else {
|
||||
ALIGN_STK_16(int16_t, tmp1, 64 * 384,);
|
||||
ALIGN_STK_16(int16_t, tmp2, 64 * 384,);
|
||||
dav1d_sgr_filter2_neon(tmp1, dst, dst_stride, left, lpf, lpf_stride,
|
||||
w, h, dav1d_sgr_params[sgr_idx][2], edges
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
dav1d_sgr_filter1_neon(tmp2, dst, dst_stride, left, lpf, lpf_stride,
|
||||
w, h, dav1d_sgr_params[sgr_idx][3], edges
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
const int16_t wt[2] = { sgr_wt[0], 128 - sgr_wt[0] - sgr_wt[1] };
|
||||
if (w >= 8)
|
||||
BF(dav1d_sgr_weighted2, neon)(dst, dst_stride, dst, dst_stride,
|
||||
tmp1, tmp2, w & ~7, h, wt
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
if (w & 7) {
|
||||
// For uneven widths, do a full 8 pixel wide filtering into a temp
|
||||
// buffer and copy out the narrow slice of pixels separately into
|
||||
// dest.
|
||||
ALIGN_STK_16(pixel, stripe, 64 * 8,);
|
||||
BF(dav1d_sgr_weighted2, neon)(stripe, (w & 7) * sizeof(pixel),
|
||||
dst + (w & ~7), dst_stride,
|
||||
tmp1 + (w & ~7), tmp2 + (w & ~7),
|
||||
w & 7, h, wt HIGHBD_TAIL_SUFFIX);
|
||||
BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, stripe,
|
||||
w & 7, h);
|
||||
}
|
||||
}
|
||||
ALIGN_STK_16(int16_t, tmp, 64 * 384,);
|
||||
dav1d_sgr_filter2_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
|
||||
w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX);
|
||||
BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride,
|
||||
tmp, w, h, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
|
||||
}
|
||||
|
||||
static void sgr_filter_3x3_neon(pixel *const dst, const ptrdiff_t dst_stride,
|
||||
const pixel (*const left)[4],
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride,
|
||||
const int w, const int h,
|
||||
const LooprestorationParams *const params,
|
||||
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
|
||||
{
|
||||
ALIGN_STK_16(int16_t, tmp, 64 * 384,);
|
||||
dav1d_sgr_filter1_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
|
||||
w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX);
|
||||
BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride,
|
||||
tmp, w, h, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
|
||||
}
|
||||
|
||||
static void sgr_filter_mix_neon(pixel *const dst, const ptrdiff_t dst_stride,
|
||||
const pixel (*const left)[4],
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride,
|
||||
const int w, const int h,
|
||||
const LooprestorationParams *const params,
|
||||
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
|
||||
{
|
||||
ALIGN_STK_16(int16_t, tmp1, 64 * 384,);
|
||||
ALIGN_STK_16(int16_t, tmp2, 64 * 384,);
|
||||
dav1d_sgr_filter2_neon(tmp1, dst, dst_stride, left, lpf, lpf_stride,
|
||||
w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX);
|
||||
dav1d_sgr_filter1_neon(tmp2, dst, dst_stride, left, lpf, lpf_stride,
|
||||
w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX);
|
||||
const int16_t wt[2] = { params->sgr.w0, params->sgr.w1 };
|
||||
BF(dav1d_sgr_weighted2, neon)(dst, dst_stride, dst, dst_stride,
|
||||
tmp1, tmp2, w, h, wt HIGHBD_TAIL_SUFFIX);
|
||||
}
|
||||
|
||||
COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c, int bpc) {
|
||||
|
@ -288,7 +259,15 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPCont
|
|||
|
||||
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
|
||||
|
||||
#if ARCH_AARCH64
|
||||
c->wiener[0] = BF(dav1d_wiener_filter7, neon);
|
||||
c->wiener[1] = BF(dav1d_wiener_filter5, neon);
|
||||
#else
|
||||
c->wiener[0] = c->wiener[1] = wiener_filter_neon;
|
||||
if (bpc <= 10)
|
||||
c->selfguided = sgr_filter_neon;
|
||||
#endif
|
||||
if (bpc <= 10) {
|
||||
c->sgr[0] = sgr_filter_5x5_neon;
|
||||
c->sgr[1] = sgr_filter_3x3_neon;
|
||||
c->sgr[2] = sgr_filter_mix_neon;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -117,7 +117,7 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
|
|||
|
||||
for (int bit = 0, by = by_start; by < by_end; by += 2, edges |= CDEF_HAVE_TOP) {
|
||||
const int tf = f->lf.top_pre_cdef_toggle;
|
||||
const int by_idx = by & 30;
|
||||
const int by_idx = (by & 30) >> 1;
|
||||
if (by + 2 >= f->bh) edges &= ~CDEF_HAVE_BOTTOM;
|
||||
|
||||
if (edges & CDEF_HAVE_BOTTOM) // backup pre-filter data for next iteration
|
||||
|
@ -140,6 +140,11 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
|
|||
goto next_sb;
|
||||
}
|
||||
|
||||
// Create a complete 32-bit mask for the sb row ahead of time.
|
||||
const uint16_t (*noskip_row)[2] = &lflvl[sb128x].noskip_mask[by_idx];
|
||||
const unsigned noskip_mask = (unsigned) noskip_row[0][1] << 16 |
|
||||
noskip_row[0][0];
|
||||
|
||||
const int y_lvl = f->frame_hdr->cdef.y_strength[cdef_idx];
|
||||
const int uv_lvl = f->frame_hdr->cdef.uv_strength[cdef_idx];
|
||||
const enum Backup2x8Flags flag = !!y_lvl + (!!uv_lvl << 1);
|
||||
|
@ -162,11 +167,8 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
|
|||
|
||||
// check if this 8x8 block had any coded coefficients; if not,
|
||||
// go to the next block
|
||||
const unsigned bx_mask = 3U << (bx & 14);
|
||||
const int bx_idx = (bx & 16) >> 4;
|
||||
if (!((lflvl[sb128x].noskip_mask[by_idx + 0][bx_idx] |
|
||||
lflvl[sb128x].noskip_mask[by_idx + 1][bx_idx]) & bx_mask))
|
||||
{
|
||||
const uint32_t bx_mask = 3U << (bx & 30);
|
||||
if (!(noskip_mask & bx_mask)) {
|
||||
last_skip = 1;
|
||||
goto next_b;
|
||||
}
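To make the new skip test concrete (block position chosen arbitrarily): noskip_mask, assembled above from the two 16-bit halves, holds one bit per 4-pixel column of the 128-pixel superblock row. An 8x8 block at bx = 10 (in 4-pixel units) covers columns 10 and 11, so bx_mask = 3U << (10 & 30) = 0x0c00, and the block is only filtered if at least one of those two bits is set in noskip_mask.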
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright © 2018, VideoLAN and dav1d authors
|
||||
* Copyright © 2018-2021, VideoLAN and dav1d authors
|
||||
* Copyright © 2018, Two Orioles, LLC
|
||||
* All rights reserved.
|
||||
*
|
||||
|
@ -29,6 +29,8 @@
|
|||
|
||||
#include <string.h>
|
||||
|
||||
#include "common/frame.h"
|
||||
|
||||
#include "src/internal.h"
|
||||
#include "src/tables.h"
|
||||
|
||||
|
@ -4012,7 +4014,7 @@ void dav1d_cdf_thread_update(const Dav1dFrameHeader *const hdr,
|
|||
update_cdf_1d(11, m.txtp_inter2);
|
||||
update_bit_1d(4, m.txtp_inter3);
|
||||
|
||||
if (!(hdr->frame_type & 1)) {
|
||||
if (IS_KEY_OR_INTRA(hdr)) {
|
||||
update_bit_0d(m.intrabc);
|
||||
|
||||
update_cdf_1d(N_MV_JOINTS - 1, dmv.joint);
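The open-coded frame_type & 1 checks are replaced throughout by the IS_KEY_OR_INTRA and IS_INTER_OR_SWITCH helpers from the common/frame.h header included above. That header is not shown in this view; presumably it amounts to something like the following, which matches the parity test it replaces since DAV1D_FRAME_TYPE_INTER (1) and DAV1D_FRAME_TYPE_SWITCH (3) are the two odd frame types:

    #define IS_INTER_OR_SWITCH(hdr) \
        ((hdr)->frame_type == DAV1D_FRAME_TYPE_INTER || \
         (hdr)->frame_type == DAV1D_FRAME_TYPE_SWITCH)

    #define IS_KEY_OR_INTRA(hdr) \
        ((hdr)->frame_type == DAV1D_FRAME_TYPE_KEY || \
         (hdr)->frame_type == DAV1D_FRAME_TYPE_INTRAONLY)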
|
||||
|
|
|
@ -102,18 +102,6 @@ void dav1d_data_ref(Dav1dData *const dst, const Dav1dData *const src) {
|
|||
*dst = *src;
|
||||
}
|
||||
|
||||
void dav1d_data_move_ref(Dav1dData *const dst, Dav1dData *const src) {
|
||||
validate_input(dst != NULL);
|
||||
validate_input(dst->data == NULL);
|
||||
validate_input(src != NULL);
|
||||
|
||||
if (src->ref)
|
||||
validate_input(src->data != NULL);
|
||||
|
||||
*dst = *src;
|
||||
memset(src, 0, sizeof(*src));
|
||||
}
|
||||
|
||||
void dav1d_data_props_copy(Dav1dDataProps *const dst,
|
||||
const Dav1dDataProps *const src)
|
||||
{
|
||||
|
|
|
@ -32,11 +32,6 @@
|
|||
|
||||
void dav1d_data_ref(Dav1dData *dst, const Dav1dData *src);
|
||||
|
||||
/**
|
||||
* Move a data reference.
|
||||
*/
|
||||
void dav1d_data_move_ref(Dav1dData *dst, Dav1dData *src);
|
||||
|
||||
/**
|
||||
* Copy the source properties to the destination and increase the
|
||||
* user_data's reference count (if it's not NULL).
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright © 2018, VideoLAN and dav1d authors
|
||||
* Copyright © 2018-2021, VideoLAN and dav1d authors
|
||||
* Copyright © 2018, Two Orioles, LLC
|
||||
* All rights reserved.
|
||||
*
|
||||
|
@ -35,6 +35,7 @@
|
|||
|
||||
#include "dav1d/data.h"
|
||||
|
||||
#include "common/frame.h"
|
||||
#include "common/intops.h"
|
||||
|
||||
#include "src/ctx.h"
|
||||
|
@ -727,7 +728,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
case_set(bh4, l., 1, by4);
|
||||
case_set(bw4, a->, 0, bx4);
|
||||
#undef set_ctx
|
||||
if (f->frame_hdr->frame_type & 1) {
|
||||
if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
|
||||
refmvs_block *const r = &t->rt.r[(t->by & 31) + 5 + bh4 - 1][t->bx];
|
||||
for (int x = 0; x < bw4; x++) {
|
||||
r[x].ref.ref[0] = 0;
|
||||
|
@ -748,7 +749,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
#undef set_ctx
|
||||
}
|
||||
} else {
|
||||
if (f->frame_hdr->frame_type & 1 /* not intrabc */ &&
|
||||
if (IS_INTER_OR_SWITCH(f->frame_hdr) /* not intrabc */ &&
|
||||
b->comp_type == COMP_INTER_NONE && b->motion_mode == MM_WARP)
|
||||
{
|
||||
if (b->matrix[0] == SHRT_MIN) {
|
||||
|
@ -791,7 +792,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
case_set(bw4, a->, 0, bx4);
|
||||
#undef set_ctx
|
||||
|
||||
if (f->frame_hdr->frame_type & 1) {
|
||||
if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
|
||||
refmvs_block *const r = &t->rt.r[(t->by & 31) + 5 + bh4 - 1][t->bx];
|
||||
for (int x = 0; x < bw4; x++) {
|
||||
r[x].ref.ref[0] = b->ref[0] + 1;
|
||||
|
@ -1043,7 +1044,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
|
||||
if (b->skip_mode) {
|
||||
b->intra = 0;
|
||||
} else if (f->frame_hdr->frame_type & 1) {
|
||||
} else if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
|
||||
if (seg && (seg->ref >= 0 || seg->globalmv)) {
|
||||
b->intra = !seg->ref;
|
||||
} else {
|
||||
|
@ -1064,7 +1065,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
|
||||
// intra/inter-specific stuff
|
||||
if (b->intra) {
|
||||
uint16_t *const ymode_cdf = f->frame_hdr->frame_type & 1 ?
|
||||
uint16_t *const ymode_cdf = IS_INTER_OR_SWITCH(f->frame_hdr) ?
|
||||
ts->cdf.m.y_mode[dav1d_ymode_size_context[bs]] :
|
||||
ts->cdf.kfym[dav1d_intra_mode_context[t->a->mode[bx4]]]
|
||||
[dav1d_intra_mode_context[t->l.mode[by4]]];
|
||||
|
@ -1252,7 +1253,7 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
rep_macro(type, t->dir skip, off, mul * b->skip); \
|
||||
/* see aomedia bug 2183 for why we use luma coordinates here */ \
|
||||
rep_macro(type, t->pal_sz_uv[diridx], off, mul * (has_chroma ? b->pal_sz[1] : 0)); \
|
||||
if (f->frame_hdr->frame_type & 1) { \
|
||||
if (IS_INTER_OR_SWITCH(f->frame_hdr)) { \
|
||||
rep_macro(type, t->dir comp_type, off, mul * COMP_INTER_NONE); \
|
||||
rep_macro(type, t->dir ref[0], off, mul * ((uint8_t) -1)); \
|
||||
rep_macro(type, t->dir ref[1], off, mul * ((uint8_t) -1)); \
|
||||
|
@ -1293,10 +1294,10 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
}
|
||||
}
|
||||
}
|
||||
if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) {
|
||||
if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) {
|
||||
splat_intraref(&t->rt, t->by, t->bx, bs);
|
||||
}
|
||||
} else if (!(f->frame_hdr->frame_type & 1)) {
|
||||
} else if (IS_KEY_OR_INTRA(f->frame_hdr)) {
|
||||
// intra block copy
|
||||
refmvs_candidate mvstack[8];
|
||||
int n_mvs, ctx;
|
||||
|
@ -1984,10 +1985,10 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
#undef set_ctx
|
||||
}
|
||||
if (!b->skip) {
|
||||
uint16_t (*noskip_mask)[2] = &t->lf_mask->noskip_mask[by4];
|
||||
uint16_t (*noskip_mask)[2] = &t->lf_mask->noskip_mask[by4 >> 1];
|
||||
const unsigned mask = (~0U >> (32 - bw4)) << (bx4 & 15);
|
||||
const int bx_idx = (bx4 & 16) >> 4;
|
||||
for (int y = 0; y < bh4; y++, noskip_mask++) {
|
||||
for (int y = 0; y < bh4; y += 2, noskip_mask++) {
|
||||
(*noskip_mask)[bx_idx] |= mask;
|
||||
if (bw4 == 32) // this should be mask >> 16, but it's 0xffffffff anyway
|
||||
(*noskip_mask)[1] |= mask;
|
||||
|
@ -2484,15 +2485,12 @@ static void read_restoration_info(Dav1dTileContext *const t,
|
|||
lr->filter_h[1], lr->filter_h[2], ts->msac.rng);
|
||||
} else if (lr->type == DAV1D_RESTORATION_SGRPROJ) {
|
||||
const unsigned idx = dav1d_msac_decode_bools(&ts->msac, 4);
|
||||
const uint16_t *const sgr_params = dav1d_sgr_params[idx];
|
||||
lr->sgr_idx = idx;
|
||||
lr->sgr_weights[0] = dav1d_sgr_params[idx][0] ?
|
||||
dav1d_msac_decode_subexp(&ts->msac,
|
||||
ts->lr_ref[p]->sgr_weights[0] + 96, 128, 4) - 96 :
|
||||
0;
|
||||
lr->sgr_weights[1] = dav1d_sgr_params[idx][1] ?
|
||||
dav1d_msac_decode_subexp(&ts->msac,
|
||||
ts->lr_ref[p]->sgr_weights[1] + 32, 128, 4) - 32 :
|
||||
95;
|
||||
lr->sgr_weights[0] = sgr_params[0] ? dav1d_msac_decode_subexp(&ts->msac,
|
||||
ts->lr_ref[p]->sgr_weights[0] + 96, 128, 4) - 96 : 0;
|
||||
lr->sgr_weights[1] = sgr_params[1] ? dav1d_msac_decode_subexp(&ts->msac,
|
||||
ts->lr_ref[p]->sgr_weights[1] + 32, 128, 4) - 32 : 95;
|
||||
memcpy(lr->filter_v, ts->lr_ref[p]->filter_v, sizeof(lr->filter_v));
|
||||
memcpy(lr->filter_h, ts->lr_ref[p]->filter_h, sizeof(lr->filter_h));
|
||||
ts->lr_ref[p] = lr;
|
||||
|
@ -2513,20 +2511,20 @@ int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) {
|
|||
const int col_sb_start = f->frame_hdr->tiling.col_start_sb[tile_col];
|
||||
const int col_sb128_start = col_sb_start >> !f->seq_hdr->sb128;
|
||||
|
||||
if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) {
|
||||
if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) {
|
||||
dav1d_refmvs_tile_sbrow_init(&t->rt, &f->rf, ts->tiling.col_start,
|
||||
ts->tiling.col_end, ts->tiling.row_start,
|
||||
ts->tiling.row_end, t->by >> f->sb_shift,
|
||||
ts->tiling.row);
|
||||
}
|
||||
|
||||
reset_context(&t->l, !(f->frame_hdr->frame_type & 1), f->frame_thread.pass);
|
||||
reset_context(&t->l, IS_KEY_OR_INTRA(f->frame_hdr), f->frame_thread.pass);
|
||||
if (f->frame_thread.pass == 2) {
|
||||
for (t->bx = ts->tiling.col_start,
|
||||
t->a = f->a + col_sb128_start + tile_row * f->sb128w;
|
||||
t->bx < ts->tiling.col_end; t->bx += sb_step)
|
||||
{
|
||||
if (atomic_load_explicit(c->frame_thread.flush, memory_order_acquire))
|
||||
if (atomic_load_explicit(c->flush, memory_order_acquire))
|
||||
return 1;
|
||||
if (decode_sb(t, root_bl, c->intra_edge.root[root_bl]))
|
||||
return 1;
|
||||
|
@ -2557,7 +2555,7 @@ int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) {
|
|||
t->lf_mask = f->lf.mask + sb128y * f->sb128w + col_sb128_start;
|
||||
t->bx < ts->tiling.col_end; t->bx += sb_step)
|
||||
{
|
||||
if (atomic_load_explicit(c->frame_thread.flush, memory_order_acquire))
|
||||
if (atomic_load_explicit(c->flush, memory_order_acquire))
|
||||
return 1;
|
||||
if (root_bl == BL_128X128) {
|
||||
t->cur_sb_cdef_idx_ptr = t->lf_mask->cdef_idx;
|
||||
|
@ -2631,7 +2629,7 @@ int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) {
|
|||
}
|
||||
}
|
||||
|
||||
if (f->n_tc > 1 && f->frame_hdr->frame_type & 1) {
|
||||
if (f->n_tc > 1 && IS_INTER_OR_SWITCH(f->frame_hdr)) {
|
||||
dav1d_refmvs_save_tmvs(&t->rt,
|
||||
ts->tiling.col_start >> 1, ts->tiling.col_end >> 1,
|
||||
t->by >> 1, (t->by + sb_step) >> 1);
|
||||
|
@ -2859,7 +2857,9 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
|
|||
const int lr_line_sz = ((f->sr_cur.p.p.w + 31) & ~31) << hbd;
|
||||
if (lr_line_sz != f->lf.lr_line_sz) {
|
||||
dav1d_freep_aligned(&f->lf.lr_lpf_line[0]);
|
||||
uint8_t *lr_ptr = dav1d_alloc_aligned(lr_line_sz * 3 * 12, 32);
|
||||
const int num_lines = c->n_pfc > 1 ? f->sbh * (4 << f->seq_hdr->sb128) : 12;
|
||||
// lr simd may overread the input, so slightly over-allocate the lpf buffer
|
||||
uint8_t *lr_ptr = dav1d_alloc_aligned(lr_line_sz * num_lines * 3 + 64, 32);
|
||||
if (!lr_ptr) {
|
||||
f->lf.lr_line_sz = 0;
|
||||
goto error;
|
||||
|
@ -2867,7 +2867,7 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
|
|||
|
||||
for (int pl = 0; pl <= 2; pl++) {
|
||||
f->lf.lr_lpf_line[pl] = lr_ptr;
|
||||
lr_ptr += lr_line_sz * 12;
|
||||
lr_ptr += lr_line_sz * num_lines;
|
||||
}
|
||||
|
||||
f->lf.lr_line_sz = lr_line_sz;
|
||||
|
@ -2949,26 +2949,30 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
|
|||
}
|
||||
|
||||
// init ref mvs
|
||||
if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) {
|
||||
if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) {
|
||||
const int ret =
|
||||
dav1d_refmvs_init_frame(&f->rf, f->seq_hdr, f->frame_hdr,
|
||||
f->refpoc, f->mvs, f->refrefpoc, f->ref_mvs, f->n_tc);
|
||||
if (ret < 0) goto error;
|
||||
}
|
||||
|
||||
// create post-filtering tasks
|
||||
if (c->n_pfc > 1)
|
||||
if (dav1d_task_create_filter_sbrow(f))
|
||||
goto error;
|
||||
|
||||
retval = DAV1D_ERR(EINVAL);
|
||||
|
||||
// setup dequant tables
|
||||
init_quant_tables(f->seq_hdr, f->frame_hdr, f->frame_hdr->quant.yac, f->dq);
|
||||
if (f->frame_hdr->quant.qm)
|
||||
for (int j = 0; j < N_RECT_TX_SIZES; j++) {
|
||||
f->qm[0][j][0] = dav1d_qm_tbl[f->frame_hdr->quant.qm_y][0][j];
|
||||
f->qm[0][j][1] = dav1d_qm_tbl[f->frame_hdr->quant.qm_u][1][j];
|
||||
f->qm[0][j][2] = dav1d_qm_tbl[f->frame_hdr->quant.qm_v][1][j];
|
||||
for (int i = 0; i < N_RECT_TX_SIZES; i++) {
|
||||
f->qm[i][0] = dav1d_qm_tbl[f->frame_hdr->quant.qm_y][0][i];
|
||||
f->qm[i][1] = dav1d_qm_tbl[f->frame_hdr->quant.qm_u][1][i];
|
||||
f->qm[i][2] = dav1d_qm_tbl[f->frame_hdr->quant.qm_v][1][i];
|
||||
}
|
||||
for (int i = f->frame_hdr->quant.qm; i < 2; i++)
|
||||
for (int tx = 0; tx < N_RECT_TX_SIZES; tx++)
|
||||
for (int pl = 0; pl < 3; pl++)
|
||||
f->qm[i][tx][pl] = dav1d_qm_tbl[15][!!pl][tx];
|
||||
else
|
||||
memset(f->qm, 0, sizeof(f->qm));
|
||||
|
||||
// setup jnt_comp weights
|
||||
if (f->frame_hdr->switchable_comp_refs) {
|
||||
|
@ -3079,9 +3083,9 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
|
|||
f->frame_thread.pass == 1 ? PLANE_TYPE_BLOCK : PLANE_TYPE_Y;
|
||||
|
||||
for (int n = 0; n < f->sb128w * f->frame_hdr->tiling.rows; n++)
|
||||
reset_context(&f->a[n], !(f->frame_hdr->frame_type & 1), f->frame_thread.pass);
|
||||
reset_context(&f->a[n], IS_KEY_OR_INTRA(f->frame_hdr), f->frame_thread.pass);
|
||||
|
||||
if (f->n_tc == 1) {
|
||||
if (f->n_tc == 1 || (c->n_pfc > 1 && f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows == 1)) {
|
||||
Dav1dTileContext *const t = f->tc;
|
||||
|
||||
// no tile threading - we explicitly interleave tile/sbrow decoding
|
||||
|
@ -3108,18 +3112,31 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
|
|||
}
|
||||
for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) {
|
||||
t->ts = &f->ts[tile_row * f->frame_hdr->tiling.cols + tile_col];
|
||||
|
||||
if (dav1d_decode_tile_sbrow(t)) goto error;
|
||||
}
|
||||
if (f->frame_thread.pass <= 1 && f->frame_hdr->frame_type & 1) {
|
||||
if (f->frame_thread.pass <= 1 && IS_INTER_OR_SWITCH(f->frame_hdr)) {
|
||||
dav1d_refmvs_save_tmvs(&t->rt, 0, f->bw >> 1, t->by >> 1, by_end);
|
||||
}
|
||||
|
||||
// loopfilter + cdef + restoration
|
||||
if (f->frame_thread.pass != 1)
|
||||
f->bd_fn.filter_sbrow(f, sby);
|
||||
dav1d_thread_picture_signal(&f->sr_cur, (sby + 1) * f->sb_step * 4,
|
||||
progress_plane_type);
|
||||
if (f->frame_thread.pass != 1) {
|
||||
if (c->n_pfc == 1)
|
||||
f->bd_fn.filter_sbrow(f, sby);
|
||||
else {
|
||||
pthread_mutex_lock(&f->lf.thread.pftd->lock);
|
||||
if (f->lf.thread.npf != 0 && !f->lf.thread.done) {
|
||||
Dav1dTask *const t = &f->lf.thread.tasks[sby * f->lf.thread.npf];
|
||||
t->start = 1;
|
||||
if (t->status == DAV1D_TASK_READY)
|
||||
dav1d_task_schedule(f->lf.thread.pftd, t);
|
||||
}
|
||||
pthread_mutex_unlock(&f->lf.thread.pftd->lock);
|
||||
}
|
||||
}
|
||||
if (c->n_pfc == 1 || f->frame_thread.pass == 1 || f->lf.thread.npf == 0)
|
||||
dav1d_thread_picture_signal(&f->sr_cur,
|
||||
(sby + 1) * f->sb_step * 4,
|
||||
progress_plane_type);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
@ -3142,7 +3159,6 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
|
|||
pthread_cond_broadcast(&f->tile_thread.cond);
|
||||
pthread_mutex_unlock(&f->tile_thread.lock);
|
||||
|
||||
// loopfilter + cdef + restoration
|
||||
for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) {
|
||||
for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row];
|
||||
sby < f->frame_hdr->tiling.row_start_sb[tile_row + 1]; sby++)
|
||||
|
@ -3174,10 +3190,24 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
|
|||
}
|
||||
|
||||
// loopfilter + cdef + restoration
|
||||
if (f->frame_thread.pass != 1)
|
||||
f->bd_fn.filter_sbrow(f, sby);
|
||||
dav1d_thread_picture_signal(&f->sr_cur, (sby + 1) * f->sb_step * 4,
|
||||
progress_plane_type);
|
||||
if (f->frame_thread.pass != 1) {
|
||||
if (c->n_pfc == 1)
|
||||
f->bd_fn.filter_sbrow(f, sby);
|
||||
else {
|
||||
pthread_mutex_lock(&f->lf.thread.pftd->lock);
|
||||
if (f->lf.thread.npf != 0 && !f->lf.thread.done) {
|
||||
Dav1dTask *const t = &f->lf.thread.tasks[sby * f->lf.thread.npf];
|
||||
t->start = 1;
|
||||
if (t->status == DAV1D_TASK_READY)
|
||||
dav1d_task_schedule(f->lf.thread.pftd, t);
|
||||
}
|
||||
pthread_mutex_unlock(&f->lf.thread.pftd->lock);
|
||||
}
|
||||
}
|
||||
if (c->n_pfc == 1 || f->frame_thread.pass == 1 || f->lf.thread.npf == 0)
|
||||
dav1d_thread_picture_signal(&f->sr_cur,
|
||||
(sby + 1) * f->sb_step * 4,
|
||||
progress_plane_type);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -3222,6 +3252,17 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
|
|||
|
||||
retval = 0;
|
||||
error:
|
||||
if (c->n_pfc > 1) {
|
||||
pthread_mutex_lock(&f->lf.thread.pftd->lock);
|
||||
if (!f->lf.thread.done) {
|
||||
if (retval != 0) {
|
||||
f->lf.thread.done = -1;
|
||||
pthread_cond_signal(&f->lf.thread.pftd->cond);
|
||||
}
|
||||
pthread_cond_wait(&f->lf.thread.cond, &f->lf.thread.pftd->lock);
|
||||
}
|
||||
pthread_mutex_unlock(&f->lf.thread.pftd->lock);
|
||||
}
|
||||
dav1d_thread_picture_signal(&f->sr_cur, retval == 0 ? UINT_MAX : FRAME_ERROR,
|
||||
PLANE_TYPE_ALL);
|
||||
for (int i = 0; i < 7; i++) {
|
||||
|
@ -3329,6 +3370,10 @@ int dav1d_submit_frame(Dav1dContext *const c) {
|
|||
f->bd_fn.recon_b_inter = dav1d_recon_b_inter_##bd##bpc; \
|
||||
f->bd_fn.recon_b_intra = dav1d_recon_b_intra_##bd##bpc; \
|
||||
f->bd_fn.filter_sbrow = dav1d_filter_sbrow_##bd##bpc; \
|
||||
f->bd_fn.filter_sbrow_deblock = dav1d_filter_sbrow_deblock_##bd##bpc; \
|
||||
f->bd_fn.filter_sbrow_cdef = dav1d_filter_sbrow_cdef_##bd##bpc; \
|
||||
f->bd_fn.filter_sbrow_resize = dav1d_filter_sbrow_resize_##bd##bpc; \
|
||||
f->bd_fn.filter_sbrow_lr = dav1d_filter_sbrow_lr_##bd##bpc; \
|
||||
f->bd_fn.backup_ipred_edge = dav1d_backup_ipred_edge_##bd##bpc; \
|
||||
f->bd_fn.read_coef_blocks = dav1d_read_coef_blocks_##bd##bpc
|
||||
if (!f->seq_hdr->hbd) {
|
||||
|
@ -3343,7 +3388,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
|
|||
#undef assign_bitdepth_case
|
||||
|
||||
int ref_coded_width[7];
|
||||
if (f->frame_hdr->frame_type & 1) {
|
||||
if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
|
||||
if (f->frame_hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE) {
|
||||
const int pri_ref = f->frame_hdr->refidx[f->frame_hdr->primary_ref_frame];
|
||||
if (!c->refs[pri_ref].p.p.data[0]) {
|
||||
|
@ -3461,7 +3506,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
|
|||
f->bitdepth_max = (1 << f->cur.p.bpc) - 1;
|
||||
|
||||
// ref_mvs
|
||||
if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) {
|
||||
if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) {
|
||||
f->mvs_ref = dav1d_ref_create_using_pool(c->refmvs_pool,
|
||||
sizeof(*f->mvs) * f->sb128h * 16 * (f->b4_stride >> 1));
|
||||
if (!f->mvs_ref) {
|
||||
|
|
|
@ -29,7 +29,7 @@
|
|||
|
||||
#include "src/dequant_tables.h"
|
||||
|
||||
const uint16_t dav1d_dq_tbl[][QINDEX_RANGE][2] = {
|
||||
const uint16_t dav1d_dq_tbl[3][QINDEX_RANGE][2] = {
|
||||
{
|
||||
{ 4, 4, }, { 8, 8, }, { 8, 9, }, { 9, 10, },
|
||||
{ 10, 11, }, { 11, 12, }, { 12, 13, }, { 12, 14, },
|
||||
|
|
|
@ -32,6 +32,6 @@
|
|||
|
||||
#include "src/levels.h"
|
||||
|
||||
extern const uint16_t dav1d_dq_tbl[][QINDEX_RANGE][2];
|
||||
extern const uint16_t dav1d_dq_tbl[3][QINDEX_RANGE][2];
|
||||
|
||||
#endif /* DAV1D_SRC_DEQUANT_TABLES_H */
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
;*****************************************************************************
|
||||
;* x86inc.asm: x86 abstraction layer
|
||||
;*****************************************************************************
|
||||
;* Copyright (C) 2005-2020 x264 project
|
||||
;* Copyright (C) 2005-2021 x264 project
|
||||
;*
|
||||
;* Authors: Loren Merritt <lorenm@u.washington.edu>
|
||||
;* Henrik Gramner <henrik@gramner.com>
|
||||
|
@ -349,6 +349,28 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
|
|||
%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
|
||||
%define high_mm_regs (16*cpuflag(avx512))
|
||||
|
||||
; Large stack allocations on Windows need to use stack probing in order
|
||||
; to guarantee that all stack memory is committed before accessing it.
|
||||
; This is done by ensuring that the guard page(s) at the end of the
|
||||
; currently committed pages are touched prior to any pages beyond that.
|
||||
%if WIN64
|
||||
%assign STACK_PROBE_SIZE 8192
|
||||
%elifidn __OUTPUT_FORMAT__, win32
|
||||
%assign STACK_PROBE_SIZE 4096
|
||||
%else
|
||||
%assign STACK_PROBE_SIZE 0
|
||||
%endif
|
||||
|
||||
%macro PROBE_STACK 1 ; stack_size
|
||||
%if STACK_PROBE_SIZE
|
||||
%assign %%i STACK_PROBE_SIZE
|
||||
%rep %1 / STACK_PROBE_SIZE
|
||||
mov eax, [rsp-%%i]
|
||||
%assign %%i %%i+STACK_PROBE_SIZE
|
||||
%endrep
|
||||
%endif
|
||||
%endmacro
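A rough scalar model of what PROBE_STACK does before the large stack subtraction, for illustration only (the function and parameter names are mine, and it merely mirrors the asm rather than being useful C on its own): one read every STACK_PROBE_SIZE bytes below the current stack pointer, so each guard page is touched in order before memory beyond it is used.

    #include <stddef.h>

    /* models: %rep %1 / STACK_PROBE_SIZE ; mov eax, [rsp-%%i] ; %%i += STACK_PROBE_SIZE */
    static void probe_stack(const char *rsp, size_t stack_size, size_t probe_size) {
        for (size_t i = probe_size; i <= stack_size; i += probe_size)
            (void)*(volatile const char *)(rsp - i);
    }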
|
||||
|
||||
%macro ALLOC_STACK 0-2 0, 0 ; stack_size, n_xmm_regs (for win64 only)
|
||||
%ifnum %1
|
||||
%if %1 != 0
|
||||
|
@ -369,6 +391,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
|
|||
%if required_stack_alignment <= STACK_ALIGNMENT
|
||||
; maintain the current stack alignment
|
||||
%assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
|
||||
PROBE_STACK stack_size_padded
|
||||
SUB rsp, stack_size_padded
|
||||
%else
|
||||
%assign %%reg_num (regs_used - 1)
|
||||
|
@ -384,6 +407,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
|
|||
%xdefine rstkm rstk
|
||||
%endif
|
||||
%assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1))
|
||||
PROBE_STACK stack_size_padded
|
||||
mov rstk, rsp
|
||||
and rsp, ~(required_stack_alignment-1)
|
||||
sub rsp, stack_size_padded
|
||||
|
@ -1139,8 +1163,7 @@ INIT_XMM
|
|||
%endif
|
||||
%xdefine %%tmp %%f %+ 0
|
||||
%ifnum %%tmp
|
||||
RESET_MM_PERMUTATION
|
||||
AVX512_MM_PERMUTATION
|
||||
DEFINE_MMREGS mmtype
|
||||
%assign %%i 0
|
||||
%rep num_mmregs
|
||||
%xdefine %%tmp %%f %+ %%i
|
||||
|
|
|
@ -35,6 +35,8 @@
|
|||
typedef struct Dav1dFrameContext Dav1dFrameContext;
|
||||
typedef struct Dav1dTileState Dav1dTileState;
|
||||
typedef struct Dav1dTileContext Dav1dTileContext;
|
||||
typedef struct Dav1dPostFilterContext Dav1dPostFilterContext;
|
||||
typedef struct Dav1dTask Dav1dTask;
|
||||
|
||||
#include "common/attributes.h"
|
||||
|
||||
|
@ -76,6 +78,9 @@ struct Dav1dContext {
|
|||
Dav1dFrameContext *fc;
|
||||
unsigned n_fc;
|
||||
|
||||
Dav1dPostFilterContext *pfc;
|
||||
unsigned n_pfc;
|
||||
|
||||
// cache of OBUs that make up a single frame before we submit them
|
||||
// to a frame worker to be decoded
|
||||
struct Dav1dTileGroup *tile;
|
||||
|
@ -99,15 +104,23 @@ struct Dav1dContext {
|
|||
// decoded output picture queue
|
||||
Dav1dData in;
|
||||
Dav1dPicture out;
|
||||
// dummy is a pointer to prevent compiler errors about atomic_load()
|
||||
// not taking const arguments
|
||||
atomic_int flush_mem, *flush;
|
||||
struct {
|
||||
Dav1dThreadPicture *out_delayed;
|
||||
unsigned next;
|
||||
// dummy is a pointer to prevent compiler errors about atomic_load()
|
||||
// not taking const arguments; the const attribute is not taken
|
||||
// from pointers
|
||||
atomic_int flush_mem, *flush;
|
||||
} frame_thread;
|
||||
|
||||
// postfilter threading (refer to pfc[] for per_thread thingies)
|
||||
struct PostFilterThreadData {
|
||||
pthread_mutex_t lock;
|
||||
pthread_cond_t cond;
|
||||
struct Dav1dTask *tasks;
|
||||
int frame_cnt;
|
||||
int inited;
|
||||
} postfilter_thread;
|
||||
|
||||
// reference/entropy state
|
||||
Dav1dMemPool *segmap_pool;
|
||||
Dav1dMemPool *refmvs_pool;
|
||||
|
@ -182,6 +195,10 @@ struct Dav1dFrameContext {
|
|||
recon_b_intra_fn recon_b_intra;
|
||||
recon_b_inter_fn recon_b_inter;
|
||||
filter_sbrow_fn filter_sbrow;
|
||||
filter_sbrow_fn filter_sbrow_deblock;
|
||||
filter_sbrow_fn filter_sbrow_cdef;
|
||||
filter_sbrow_fn filter_sbrow_resize;
|
||||
filter_sbrow_fn filter_sbrow_lr;
|
||||
backup_ipred_edge_fn backup_ipred_edge;
|
||||
read_coef_blocks_fn read_coef_blocks;
|
||||
} bd_fn;
|
||||
|
@ -191,7 +208,7 @@ struct Dav1dFrameContext {
|
|||
ptrdiff_t b4_stride;
|
||||
int w4, h4, bw, bh, sb128w, sb128h, sbh, sb_shift, sb_step, sr_sb128w;
|
||||
uint16_t dq[DAV1D_MAX_SEGMENTS][3 /* plane */][2 /* dc/ac */];
|
||||
const uint8_t *qm[2 /* is_1d */][N_RECT_TX_SIZES][3 /* plane */];
|
||||
const uint8_t *qm[N_RECT_TX_SIZES][3 /* plane */];
|
||||
BlockContext *a;
|
||||
int a_sz /* w*tile_rows */;
|
||||
refmvs_frame rf;
|
||||
|
@ -238,6 +255,16 @@ struct Dav1dFrameContext {
|
|||
pixel *p[3], *sr_p[3];
|
||||
Av1Filter *mask_ptr, *prev_mask_ptr;
|
||||
int restore_planes; // enum LrRestorePlanes
|
||||
|
||||
struct {
|
||||
pthread_cond_t cond;
|
||||
struct PostFilterThreadData *pftd;
|
||||
struct Dav1dTask *tasks;
|
||||
int num_tasks;
|
||||
int npf;
|
||||
int done;
|
||||
int inited;
|
||||
} thread;
|
||||
} lf;
|
||||
|
||||
// threading (refer to tc[] for per-thread things)
|
||||
|
@ -353,4 +380,11 @@ struct Dav1dTileContext {
|
|||
} tile_thread;
|
||||
};
|
||||
|
||||
struct Dav1dPostFilterContext {
|
||||
Dav1dContext *c;
|
||||
struct thread_data td;
|
||||
int flushed;
|
||||
int die;
|
||||
};
|
||||
|
||||
#endif /* DAV1D_SRC_INTERNAL_H */
|
||||
|
|
|
@ -89,7 +89,7 @@ static inline void mask_edges_inter(uint16_t (*const masks)[32][3][2],
|
|||
const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[max_tx];
|
||||
int y, x;
|
||||
|
||||
uint8_t txa[2 /* edge */][2 /* txsz, step */][32 /* y */][32 /* x */];
|
||||
ALIGN_STK_16(uint8_t, txa, 2 /* edge */, [2 /* txsz, step */][32 /* y */][32 /* x */]);
|
||||
for (int y_off = 0, y = 0; y < h4; y += t_dim->h, y_off++)
|
||||
for (int x_off = 0, x = 0; x < w4; x += t_dim->w, x_off++)
|
||||
decomp_tx((uint8_t(*)[2][32][32]) &txa[0][0][y][x],
|
||||
|
|
|
@ -40,11 +40,11 @@ typedef struct Av1FilterLUT {
|
|||
} Av1FilterLUT;
|
||||
|
||||
typedef struct Av1RestorationUnit {
|
||||
enum Dav1dRestorationType type;
|
||||
uint8_t /* enum Dav1dRestorationType */ type;
|
||||
int8_t filter_h[3];
|
||||
int8_t filter_v[3];
|
||||
uint8_t sgr_idx;
|
||||
int16_t sgr_weights[2];
|
||||
int8_t sgr_weights[2];
|
||||
} Av1RestorationUnit;
|
||||
|
||||
// each struct describes one 128x128 area (1 or 4 SBs), pre-superres-scaling
|
||||
|
@ -53,7 +53,7 @@ typedef struct Av1Filter {
|
|||
uint16_t filter_y[2 /* 0=col, 1=row */][32][3][2];
|
||||
uint16_t filter_uv[2 /* 0=col, 1=row */][32][2][2];
|
||||
int8_t cdef_idx[4]; // -1 means "unset"
|
||||
uint16_t noskip_mask[32][2];
|
||||
uint16_t noskip_mask[16][2]; // for 8x8 blocks, but stored on a 4x8 basis
|
||||
} Av1Filter;
|
||||
|
||||
// each struct describes one 128x128 area (1 or 4 SBs), post-superres-scaling
|
||||
|
|
|
@ -65,6 +65,7 @@ COLD const char *dav1d_version(void) {
|
|||
COLD void dav1d_default_settings(Dav1dSettings *const s) {
|
||||
s->n_frame_threads = 1;
|
||||
s->n_tile_threads = 1;
|
||||
s->n_postfilter_threads = 1;
|
||||
s->apply_grain = 1;
|
||||
s->allocator.cookie = NULL;
|
||||
s->allocator.alloc_picture_callback = dav1d_default_picture_alloc;
|
||||
|
@ -100,6 +101,8 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
|
|||
|
||||
validate_input_or_ret(c_out != NULL, DAV1D_ERR(EINVAL));
|
||||
validate_input_or_ret(s != NULL, DAV1D_ERR(EINVAL));
|
||||
validate_input_or_ret(s->n_postfilter_threads >= 1 &&
|
||||
s->n_postfilter_threads <= DAV1D_MAX_POSTFILTER_THREADS, DAV1D_ERR(EINVAL));
|
||||
validate_input_or_ret(s->n_tile_threads >= 1 &&
|
||||
s->n_tile_threads <= DAV1D_MAX_TILE_THREADS, DAV1D_ERR(EINVAL));
|
||||
validate_input_or_ret(s->n_frame_threads >= 1 &&
|
||||
|
@ -136,9 +139,17 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
|
|||
{
|
||||
goto error;
|
||||
}
|
||||
if (c->allocator.alloc_picture_callback == dav1d_default_picture_alloc) {
|
||||
|
||||
if (c->allocator.alloc_picture_callback == dav1d_default_picture_alloc &&
|
||||
c->allocator.release_picture_callback == dav1d_default_picture_release)
|
||||
{
|
||||
if (c->allocator.cookie) goto error;
|
||||
if (dav1d_mem_pool_init(&c->picture_pool)) goto error;
|
||||
c->allocator.cookie = c->picture_pool;
|
||||
} else if (c->allocator.alloc_picture_callback == dav1d_default_picture_alloc ||
|
||||
c->allocator.release_picture_callback == dav1d_default_picture_release)
|
||||
{
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* On 32-bit systems extremely large frame sizes can cause overflows in
|
||||
|
@ -152,12 +163,49 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
|
|||
s->frame_size_limit, c->frame_size_limit);
|
||||
}
|
||||
|
||||
c->frame_thread.flush = &c->frame_thread.flush_mem;
|
||||
atomic_init(c->frame_thread.flush, 0);
|
||||
c->flush = &c->flush_mem;
|
||||
atomic_init(c->flush, 0);
|
||||
|
||||
c->n_pfc = s->n_postfilter_threads;
|
||||
c->n_fc = s->n_frame_threads;
|
||||
c->fc = dav1d_alloc_aligned(sizeof(*c->fc) * s->n_frame_threads, 32);
|
||||
if (!c->fc) goto error;
|
||||
memset(c->fc, 0, sizeof(*c->fc) * s->n_frame_threads);
|
||||
|
||||
if (c->n_pfc > 1) {
|
||||
c->pfc = dav1d_alloc_aligned(sizeof(*c->pfc) * s->n_postfilter_threads, 32);
|
||||
if (!c->pfc) goto error;
|
||||
memset(c->pfc, 0, sizeof(*c->pfc) * s->n_postfilter_threads);
|
||||
if (pthread_mutex_init(&c->postfilter_thread.lock, NULL)) goto error;
|
||||
if (pthread_cond_init(&c->postfilter_thread.cond, NULL)) {
|
||||
pthread_mutex_destroy(&c->postfilter_thread.lock);
|
||||
goto error;
|
||||
}
|
||||
c->postfilter_thread.inited = 1;
|
||||
for (int n = 0; n < s->n_frame_threads; n++) {
|
||||
Dav1dFrameContext *const f = &c->fc[n];
|
||||
if (pthread_cond_init(&f->lf.thread.cond, NULL)) goto error;
|
||||
f->lf.thread.pftd = &c->postfilter_thread;
|
||||
f->lf.thread.done = 1;
|
||||
f->lf.thread.inited = 1;
|
||||
}
|
||||
for (int n = 0; n < s->n_postfilter_threads; ++n) {
|
||||
Dav1dPostFilterContext *const pf = &c->pfc[n];
|
||||
pf->c = c;
|
||||
if (pthread_mutex_init(&pf->td.lock, NULL)) goto error;
|
||||
if (pthread_cond_init(&pf->td.cond, NULL)) {
|
||||
pthread_mutex_destroy(&pf->td.lock);
|
||||
goto error;
|
||||
}
|
||||
if (pthread_create(&pf->td.thread, &thread_attr, dav1d_postfilter_task, pf)) {
|
||||
pthread_cond_destroy(&c->postfilter_thread.cond);
|
||||
pthread_mutex_destroy(&c->postfilter_thread.lock);
|
||||
goto error;
|
||||
}
|
||||
pf->td.inited = 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (c->n_fc > 1) {
|
||||
c->frame_thread.out_delayed =
|
||||
calloc(c->n_fc, sizeof(*c->frame_thread.out_delayed));
|
||||
|
@@ -459,11 +507,17 @@ void dav1d_flush(Dav1dContext *const c) {
dav1d_ref_dec(&c->content_light_ref);
dav1d_ref_dec(&c->itut_t35_ref);

if (c->n_fc == 1) return;
if (c->n_fc == 1 && c->n_pfc == 1) return;

// mark each currently-running frame as flushing, so that we
// exit out as quickly as the running thread checks this flag
atomic_store(c->frame_thread.flush, 1);
// wait for threads to complete flushing
if (c->n_pfc > 1)
pthread_mutex_lock(&c->postfilter_thread.lock);
atomic_store(c->flush, 1);
if (c->n_pfc > 1) {
pthread_cond_broadcast(&c->postfilter_thread.cond);
pthread_mutex_unlock(&c->postfilter_thread.lock);
}
if (c->n_fc == 1) goto skip_ft_flush;
for (unsigned n = 0, next = c->frame_thread.next; n < c->n_fc; n++, next++) {
if (next == c->n_fc) next = 0;
Dav1dFrameContext *const f = &c->fc[next];

@@ -475,13 +529,31 @@ void dav1d_flush(Dav1dContext *const c) {
assert(!f->cur.data[0]);
}
pthread_mutex_unlock(&f->frame_thread.td.lock);
Dav1dThreadPicture *const out_delayed = &c->frame_thread.out_delayed[next];
Dav1dThreadPicture *const out_delayed =
&c->frame_thread.out_delayed[next];
if (out_delayed->p.data[0])
dav1d_thread_picture_unref(out_delayed);
}
atomic_store(c->frame_thread.flush, 0);

c->frame_thread.next = 0;
skip_ft_flush:
if (c->n_pfc > 1) {
for (unsigned i = 0; i < c->n_pfc; ++i) {
Dav1dPostFilterContext *const pf = &c->pfc[i];
pthread_mutex_lock(&pf->td.lock);
if (!pf->flushed)
pthread_cond_wait(&pf->td.cond, &pf->td.lock);
pf->flushed = 0;
pthread_mutex_unlock(&pf->td.lock);
}
pthread_mutex_lock(&c->postfilter_thread.lock);
c->postfilter_thread.tasks = NULL;
pthread_mutex_unlock(&c->postfilter_thread.lock);
for (unsigned i = 0; i < c->n_fc; ++i) {
freep(&c->fc[i].lf.thread.tasks);
c->fc[i].lf.thread.num_tasks = 0;
}
}
atomic_store(c->flush, 0);
}

COLD void dav1d_close(Dav1dContext **const c_out) {
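Note: the flush path above follows a broadcast-then-acknowledge shape: raise a shared flag under the task lock, wake every postfilter worker, then wait until each worker reports that it has drained. Below is a minimal self-contained sketch of that pattern with generic names (not dav1d's actual structs); only the flushing side is shown, and the workers are assumed to set `flushed` and signal their own condvar once they observe the flag.

#include <pthread.h>
#include <stdatomic.h>

typedef struct worker_ctx {
    pthread_mutex_t lock;
    pthread_cond_t cond;
    int flushed;          /* set by the worker when it has drained */
} worker_ctx;

typedef struct pool {
    pthread_mutex_t lock;
    pthread_cond_t cond;  /* workers sleep here waiting for tasks */
    atomic_int flush;
    worker_ctx *workers;
    int n_workers;
} pool;

static void pool_flush(pool *const p) {
    /* raise the flag under the pool lock so sleeping workers cannot miss it */
    pthread_mutex_lock(&p->lock);
    atomic_store(&p->flush, 1);
    pthread_cond_broadcast(&p->cond);
    pthread_mutex_unlock(&p->lock);

    /* wait for each worker to acknowledge, then rearm it for the next flush */
    for (int i = 0; i < p->n_workers; i++) {
        worker_ctx *const w = &p->workers[i];
        pthread_mutex_lock(&w->lock);
        while (!w->flushed)
            pthread_cond_wait(&w->cond, &w->lock);
        w->flushed = 0;
        pthread_mutex_unlock(&w->lock);
    }
    atomic_store(&p->flush, 0);
}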
@@ -495,6 +567,25 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {

if (flush) dav1d_flush(c);

if (c->pfc) {
struct PostFilterThreadData *pftd = &c->postfilter_thread;
if (pftd->inited) {
pthread_mutex_lock(&pftd->lock);
for (unsigned n = 0; n < c->n_pfc && c->pfc[n].td.inited; n++)
c->pfc[n].die = 1;
pthread_cond_broadcast(&pftd->cond);
pthread_mutex_unlock(&pftd->lock);
for (unsigned n = 0; n < c->n_pfc && c->pfc[n].td.inited; n++) {
pthread_join(c->pfc[n].td.thread, NULL);
pthread_cond_destroy(&c->pfc[n].td.cond);
pthread_mutex_destroy(&c->pfc[n].td.lock);
}
pthread_cond_destroy(&pftd->cond);
pthread_mutex_destroy(&pftd->lock);
}
dav1d_free_aligned(c->pfc);
}

for (unsigned n = 0; c->fc && n < c->n_fc; n++) {
Dav1dFrameContext *const f = &c->fc[n];

@@ -546,6 +637,10 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
pthread_cond_destroy(&ts->tile_thread.cond);
pthread_mutex_destroy(&ts->tile_thread.lock);
}
if (f->lf.thread.inited) {
freep(&f->lf.thread.tasks);
pthread_cond_destroy(&f->lf.thread.cond);
}
dav1d_free_aligned(f->ts);
dav1d_free_aligned(f->tc);
dav1d_free_aligned(f->ipred_edge[0]);

@@ -46,29 +46,32 @@ typedef const pixel (*const_left_pixel_row)[4];
typedef const void *const_left_pixel_row;
#endif

// Although the spec applies restoration filters over 4x4 blocks, the wiener
// filter can be applied to a bigger surface.
typedef union LooprestorationParams {
ALIGN(int16_t filter[2][8], 16);
struct {
uint32_t s0, s1;
int16_t w0, w1;
} sgr;
} LooprestorationParams;

// Although the spec applies restoration filters over 4x4 blocks,
// they can be applied to a bigger surface.
// * w is constrained by the restoration unit size (w <= 256)
// * h is constrained by the stripe height (h <= 64)
#define decl_wiener_filter_fn(name) \
// The filter functions are allowed to do aligned writes past the right
// edge of the buffer, aligned up to the minimum loop restoration unit size
// (which is 32 pixels for subsampled chroma and 64 pixels for luma).
#define decl_lr_filter_fn(name) \
void (name)(pixel *dst, ptrdiff_t dst_stride, \
const_left_pixel_row left, \
const pixel *lpf, ptrdiff_t lpf_stride, \
int w, int h, const int16_t filter[2][8], \
int w, int h, const LooprestorationParams *params, \
enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
typedef decl_wiener_filter_fn(*wienerfilter_fn);

#define decl_selfguided_filter_fn(name) \
void (name)(pixel *dst, ptrdiff_t dst_stride, \
const_left_pixel_row left, \
const pixel *lpf, ptrdiff_t lpf_stride, \
int w, int h, int sgr_idx, const int16_t sgr_w[2], \
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
typedef decl_selfguided_filter_fn(*selfguided_fn);
typedef decl_lr_filter_fn(*looprestorationfilter_fn);

typedef struct Dav1dLoopRestorationDSPContext {
wienerfilter_fn wiener[2]; /* 7-tap, 5-tap */
selfguided_fn selfguided;
looprestorationfilter_fn wiener[2]; /* 7-tap, 5-tap */
looprestorationfilter_fn sgr[3]; /* 5x5, 3x3, mix */
} Dav1dLoopRestorationDSPContext;

bitfn_decls(void dav1d_loop_restoration_dsp_init, Dav1dLoopRestorationDSPContext *c, int bpc);

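Note: as a rough illustration of the union above, the Wiener path fills the aligned `filter` taps while the self-guided path fills `sgr.s0/s1/w0/w1`, and the caller then indexes `sgr[]` by which strengths are nonzero. This is a sketch under those assumptions; the helper name and argument layout are made up for the example and rely on the declarations shown above.

static void fill_lr_params(LooprestorationParams *const params, const int wiener,
                           const int16_t taps[2][8],
                           const uint32_t strengths[2], const int16_t weights[2])
{
    if (wiener) {
        /* 7-/5-tap separable Wiener coefficients go into params->filter */
        for (int i = 0; i < 2; i++)
            for (int j = 0; j < 8; j++)
                params->filter[i][j] = taps[i][j];
    } else {
        /* self-guided: two strengths plus two mixing weights */
        params->sgr.s0 = strengths[0];
        params->sgr.s1 = strengths[1];
        params->sgr.w0 = weights[0];
        params->sgr.w1 = weights[1];
        /* slot selection used by the caller:
         * !!s0 + !!s1 * 2 - 1  ->  0 = 5x5, 1 = 3x3, 2 = mix */
    }
}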
@ -39,10 +39,10 @@
|
|||
|
||||
// TODO Reuse p when no padding is needed (add and remove lpf pixels in p)
|
||||
// TODO Chroma only requires 2 rows of padding.
|
||||
static void padding(pixel *dst, const pixel *p, const ptrdiff_t p_stride,
|
||||
const pixel (*left)[4],
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride,
|
||||
int unit_w, const int stripe_h, const enum LrEdgeFlags edges)
|
||||
static NOINLINE void
|
||||
padding(pixel *dst, const pixel *p, const ptrdiff_t p_stride,
|
||||
const pixel (*left)[4], const pixel *lpf, const ptrdiff_t lpf_stride,
|
||||
int unit_w, const int stripe_h, const enum LrEdgeFlags edges)
|
||||
{
|
||||
const int have_left = !!(edges & LR_HAVE_LEFT);
|
||||
const int have_right = !!(edges & LR_HAVE_RIGHT);
|
||||
|
@ -135,7 +135,7 @@ static void wiener_c(pixel *p, const ptrdiff_t p_stride,
|
|||
const pixel (*const left)[4],
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride,
|
||||
const int w, const int h,
|
||||
const int16_t filter[2][8],
|
||||
const LooprestorationParams *const params,
|
||||
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
|
||||
{
|
||||
// Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
|
||||
|
@ -150,6 +150,7 @@ static void wiener_c(pixel *p, const ptrdiff_t p_stride,
|
|||
uint16_t hor[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
|
||||
uint16_t *hor_ptr = hor;
|
||||
|
||||
const int16_t (*const filter)[8] = params->filter;
|
||||
const int bitdepth = bitdepth_from_max(bitdepth_max);
|
||||
const int round_bits_h = 3 + (bitdepth == 12) * 2;
|
||||
const int rounding_off_h = 1 << (round_bits_h - 1);
|
||||
|
@ -347,12 +348,12 @@ static void boxsum5(int32_t *sumsq, coef *sum, const pixel *const src,
|
|||
}
|
||||
}
|
||||
|
||||
static void selfguided_filter(coef *dst, const pixel *src,
|
||||
const ptrdiff_t src_stride, const int w,
|
||||
const int h, const int n, const int s
|
||||
HIGHBD_DECL_SUFFIX)
|
||||
static NOINLINE void
|
||||
selfguided_filter(coef *dst, const pixel *src, const ptrdiff_t src_stride,
|
||||
const int w, const int h, const int n, const unsigned s
|
||||
HIGHBD_DECL_SUFFIX)
|
||||
{
|
||||
const int sgr_one_by_x = n == 25 ? 164 : 455;
|
||||
const unsigned sgr_one_by_x = n == 25 ? 164 : 455;
|
||||
|
||||
// Selfguided filter is applied to a maximum stripe height of 64 + 3 pixels
|
||||
// of padding above and below
|
||||
|
@ -446,71 +447,93 @@ static void selfguided_filter(coef *dst, const pixel *src,
|
|||
#undef EIGHT_NEIGHBORS
|
||||
}
|
||||
|
||||
static void selfguided_c(pixel *p, const ptrdiff_t p_stride,
|
||||
const pixel (*const left)[4],
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride,
|
||||
const int w, const int h, const int sgr_idx,
|
||||
const int16_t sgr_w[2], const enum LrEdgeFlags edges
|
||||
HIGHBD_DECL_SUFFIX)
|
||||
static void sgr_5x5_c(pixel *p, const ptrdiff_t p_stride,
|
||||
const pixel (*const left)[4], const pixel *lpf,
|
||||
const ptrdiff_t lpf_stride, const int w, const int h,
|
||||
const LooprestorationParams *const params,
|
||||
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
|
||||
{
|
||||
// Selfguided filter is applied to a maximum stripe height of 64 + 3 pixels
|
||||
// of padding above and below
|
||||
pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
|
||||
|
||||
padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
|
||||
|
||||
// Selfguided filter outputs to a maximum stripe height of 64 and a
|
||||
// maximum restoration width of 384 (256 * 1.5)
|
||||
coef dst[64 * 384];
|
||||
|
||||
// both r1 and r0 can't be zero
|
||||
if (!dav1d_sgr_params[sgr_idx][0]) {
|
||||
const int s1 = dav1d_sgr_params[sgr_idx][3];
|
||||
selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 9, s1 HIGHBD_TAIL_SUFFIX);
|
||||
const int w1 = (1 << 7) - sgr_w[1];
|
||||
for (int j = 0; j < h; j++) {
|
||||
for (int i = 0; i < w; i++) {
|
||||
const int u = (p[i] << 4);
|
||||
const int v = (u << 7) + w1 * (dst[j * 384 + i] - u);
|
||||
p[i] = iclip_pixel((v + (1 << 10)) >> 11);
|
||||
}
|
||||
p += PXSTRIDE(p_stride);
|
||||
padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
|
||||
selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25,
|
||||
params->sgr.s0 HIGHBD_TAIL_SUFFIX);
|
||||
|
||||
const int w0 = params->sgr.w0;
|
||||
for (int j = 0; j < h; j++) {
|
||||
for (int i = 0; i < w; i++) {
|
||||
const int u = (p[i] << 4);
|
||||
const int v = (u << 7) + w0 * (dst[j * 384 + i] - u);
|
||||
p[i] = iclip_pixel((v + (1 << 10)) >> 11);
|
||||
}
|
||||
} else if (!dav1d_sgr_params[sgr_idx][1]) {
|
||||
const int s0 = dav1d_sgr_params[sgr_idx][2];
|
||||
selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25, s0 HIGHBD_TAIL_SUFFIX);
|
||||
const int w0 = sgr_w[0];
|
||||
for (int j = 0; j < h; j++) {
|
||||
for (int i = 0; i < w; i++) {
|
||||
const int u = (p[i] << 4);
|
||||
const int v = (u << 7) + w0 * (dst[j * 384 + i] - u);
|
||||
p[i] = iclip_pixel((v + (1 << 10)) >> 11);
|
||||
}
|
||||
p += PXSTRIDE(p_stride);
|
||||
p += PXSTRIDE(p_stride);
|
||||
}
|
||||
}
|
||||
|
||||
static void sgr_3x3_c(pixel *p, const ptrdiff_t p_stride,
|
||||
const pixel (*const left)[4], const pixel *lpf,
|
||||
const ptrdiff_t lpf_stride, const int w, const int h,
|
||||
const LooprestorationParams *const params,
|
||||
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
|
||||
{
|
||||
pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
|
||||
coef dst[64 * 384];
|
||||
|
||||
padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
|
||||
selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 9,
|
||||
params->sgr.s1 HIGHBD_TAIL_SUFFIX);
|
||||
|
||||
const int w1 = params->sgr.w1;
|
||||
for (int j = 0; j < h; j++) {
|
||||
for (int i = 0; i < w; i++) {
|
||||
const int u = (p[i] << 4);
|
||||
const int v = (u << 7) + w1 * (dst[j * 384 + i] - u);
|
||||
p[i] = iclip_pixel((v + (1 << 10)) >> 11);
|
||||
}
|
||||
} else {
|
||||
coef dst1[64 * 384];
|
||||
const int s0 = dav1d_sgr_params[sgr_idx][2];
|
||||
const int s1 = dav1d_sgr_params[sgr_idx][3];
|
||||
const int w0 = sgr_w[0];
|
||||
const int w1 = (1 << 7) - w0 - sgr_w[1];
|
||||
selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25, s0 HIGHBD_TAIL_SUFFIX);
|
||||
selfguided_filter(dst1, tmp, REST_UNIT_STRIDE, w, h, 9, s1 HIGHBD_TAIL_SUFFIX);
|
||||
for (int j = 0; j < h; j++) {
|
||||
for (int i = 0; i < w; i++) {
|
||||
const int u = (p[i] << 4);
|
||||
const int v = (u << 7) + w0 * (dst[j * 384 + i] - u) +
|
||||
w1 * (dst1[j * 384 + i] - u);
|
||||
p[i] = iclip_pixel((v + (1 << 10)) >> 11);
|
||||
}
|
||||
p += PXSTRIDE(p_stride);
|
||||
p += PXSTRIDE(p_stride);
|
||||
}
|
||||
}
|
||||
|
||||
static void sgr_mix_c(pixel *p, const ptrdiff_t p_stride,
|
||||
const pixel (*const left)[4], const pixel *lpf,
|
||||
const ptrdiff_t lpf_stride, const int w, const int h,
|
||||
const LooprestorationParams *const params,
|
||||
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
|
||||
{
|
||||
pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
|
||||
coef dst0[64 * 384];
|
||||
coef dst1[64 * 384];
|
||||
|
||||
padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
|
||||
selfguided_filter(dst0, tmp, REST_UNIT_STRIDE, w, h, 25,
|
||||
params->sgr.s0 HIGHBD_TAIL_SUFFIX);
|
||||
selfguided_filter(dst1, tmp, REST_UNIT_STRIDE, w, h, 9,
|
||||
params->sgr.s1 HIGHBD_TAIL_SUFFIX);
|
||||
|
||||
const int w0 = params->sgr.w0;
|
||||
const int w1 = params->sgr.w1;
|
||||
for (int j = 0; j < h; j++) {
|
||||
for (int i = 0; i < w; i++) {
|
||||
const int u = (p[i] << 4);
|
||||
const int v = (u << 7) + w0 * (dst0[j * 384 + i] - u) +
|
||||
w1 * (dst1[j * 384 + i] - u);
|
||||
p[i] = iclip_pixel((v + (1 << 10)) >> 11);
|
||||
}
|
||||
p += PXSTRIDE(p_stride);
|
||||
}
|
||||
}
|
||||
|
||||
COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c, int bpc) {
|
||||
c->wiener[0] = c->wiener[1] = wiener_c;
|
||||
c->selfguided = selfguided_c;
|
||||
c->sgr[0] = sgr_5x5_c;
|
||||
c->sgr[1] = sgr_3x3_c;
|
||||
c->sgr[2] = sgr_mix_c;
|
||||
|
||||
#if HAVE_ASM
|
||||
#if ARCH_AARCH64 || ARCH_ARM
|
||||
|
|
|
@ -48,31 +48,32 @@ static void backup_lpf(const Dav1dFrameContext *const f,
|
|||
const pixel *src, const ptrdiff_t src_stride,
|
||||
const int ss_ver, const int sb128,
|
||||
int row, const int row_h, const int src_w,
|
||||
const int h, const int ss_hor)
|
||||
const int h, const int ss_hor, const int pft)
|
||||
{
|
||||
const int dst_w = f->frame_hdr->super_res.enabled ?
|
||||
(f->frame_hdr->width[1] + ss_hor) >> ss_hor : src_w;
|
||||
|
||||
// The first stripe of the frame is shorter by 8 luma pixel rows.
|
||||
int stripe_h = (64 - 8 * !row) >> ss_ver;
|
||||
|
||||
if (row) {
|
||||
const int top = 4 << sb128;
|
||||
// Copy the top part of the stored loop filtered pixels from the
|
||||
// previous sb row needed above the first stripe of this sb row.
|
||||
pixel_copy(&dst[PXSTRIDE(dst_stride) * 0],
|
||||
&dst[PXSTRIDE(dst_stride) * top], dst_w);
|
||||
pixel_copy(&dst[PXSTRIDE(dst_stride) * 1],
|
||||
&dst[PXSTRIDE(dst_stride) * (top + 1)], dst_w);
|
||||
pixel_copy(&dst[PXSTRIDE(dst_stride) * 2],
|
||||
&dst[PXSTRIDE(dst_stride) * (top + 2)], dst_w);
|
||||
pixel_copy(&dst[PXSTRIDE(dst_stride) * 3],
|
||||
&dst[PXSTRIDE(dst_stride) * (top + 3)], dst_w);
|
||||
}
|
||||
|
||||
dst += 4 * PXSTRIDE(dst_stride);
|
||||
src += (stripe_h - 2) * PXSTRIDE(src_stride);
|
||||
|
||||
if (!pft) {
|
||||
if (row) {
|
||||
const int top = 4 << sb128;
|
||||
// Copy the top part of the stored loop filtered pixels from the
|
||||
// previous sb row needed above the first stripe of this sb row.
|
||||
pixel_copy(&dst[PXSTRIDE(dst_stride) * 0],
|
||||
&dst[PXSTRIDE(dst_stride) * top], dst_w);
|
||||
pixel_copy(&dst[PXSTRIDE(dst_stride) * 1],
|
||||
&dst[PXSTRIDE(dst_stride) * (top + 1)], dst_w);
|
||||
pixel_copy(&dst[PXSTRIDE(dst_stride) * 2],
|
||||
&dst[PXSTRIDE(dst_stride) * (top + 2)], dst_w);
|
||||
pixel_copy(&dst[PXSTRIDE(dst_stride) * 3],
|
||||
&dst[PXSTRIDE(dst_stride) * (top + 3)], dst_w);
|
||||
}
|
||||
dst += 4 * PXSTRIDE(dst_stride);
|
||||
}
|
||||
|
||||
if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
|
||||
while (row + stripe_h <= row_h) {
|
||||
const int n_lines = 4 - (row + stripe_h + 1 == h);
|
||||
|
@ -107,9 +108,15 @@ static void backup_lpf(const Dav1dFrameContext *const f,
|
|||
void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
|
||||
/*const*/ pixel *const src[3], const int sby)
|
||||
{
|
||||
const int pft = f->c->n_pfc > 1;
|
||||
const int offset = 8 * !!sby;
|
||||
const ptrdiff_t *const src_stride = f->cur.stride;
|
||||
const ptrdiff_t lr_stride = ((f->sr_cur.p.p.w + 31) & ~31) * sizeof(pixel);
|
||||
pixel *const dst[3] = {
|
||||
f->lf.lr_lpf_line[0] + pft * sby * (4 << f->seq_hdr->sb128) * PXSTRIDE(lr_stride),
|
||||
f->lf.lr_lpf_line[1] + pft * sby * (4 << f->seq_hdr->sb128) * PXSTRIDE(lr_stride),
|
||||
f->lf.lr_lpf_line[2] + pft * sby * (4 << f->seq_hdr->sb128) * PXSTRIDE(lr_stride)
|
||||
};
|
||||
|
||||
// TODO Also check block level restore type to reduce copying.
|
||||
const int restore_planes = f->lf.restore_planes;
|
||||
|
@ -119,9 +126,9 @@ void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
|
|||
const int w = f->bw << 2;
|
||||
const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h - 1);
|
||||
const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset;
|
||||
backup_lpf(f, f->lf.lr_lpf_line[0], lr_stride,
|
||||
backup_lpf(f, dst[0], lr_stride,
|
||||
src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0],
|
||||
0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0);
|
||||
0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0, pft);
|
||||
}
|
||||
if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) {
|
||||
const int ss_ver = f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
|
@ -130,18 +137,16 @@ void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
|
|||
const int w = f->bw << (2 - ss_hor);
|
||||
const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h - 1);
|
||||
const int offset_uv = offset >> ss_ver;
|
||||
const int y_stripe =
|
||||
(sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
|
||||
|
||||
const int y_stripe = (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
|
||||
if (restore_planes & LR_RESTORE_U) {
|
||||
backup_lpf(f, f->lf.lr_lpf_line[1], lr_stride,
|
||||
backup_lpf(f, dst[1], lr_stride,
|
||||
src[1] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
|
||||
ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor);
|
||||
ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor, pft);
|
||||
}
|
||||
if (restore_planes & LR_RESTORE_V) {
|
||||
backup_lpf(f, f->lf.lr_lpf_line[2], lr_stride,
|
||||
backup_lpf(f, dst[2], lr_stride,
|
||||
src[2] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
|
||||
ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor);
|
||||
ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor, pft);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -154,17 +159,18 @@ static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
|
|||
const Dav1dDSPContext *const dsp = f->dsp;
|
||||
const int chroma = !!plane;
|
||||
const int ss_ver = chroma & (f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420);
|
||||
const int sbrow_has_bottom = (edges & LR_HAVE_BOTTOM);
|
||||
const pixel *lpf = f->lf.lr_lpf_line[plane] + x;
|
||||
const ptrdiff_t p_stride = f->sr_cur.p.stride[chroma];
|
||||
const ptrdiff_t lpf_stride = sizeof(pixel) * ((f->sr_cur.p.p.w + 31) & ~31);
|
||||
const int sby = (y + (y ? 8 << ss_ver : 0)) >> (6 - ss_ver + f->seq_hdr->sb128);
|
||||
const pixel *lpf = f->lf.lr_lpf_line[plane] + (f->c->n_pfc > 1) * (sby * (4 << f->seq_hdr->sb128) - 4) * PXSTRIDE(lpf_stride) + x;
|
||||
|
||||
// The first stripe of the frame is shorter by 8 luma pixel rows.
|
||||
int stripe_h = imin((64 - 8 * !y) >> ss_ver, row_h - y);
|
||||
|
||||
ALIGN_STK_16(int16_t, filter, 2, [8]);
|
||||
wienerfilter_fn wiener_fn = NULL;
|
||||
looprestorationfilter_fn lr_fn;
|
||||
LooprestorationParams params;
|
||||
if (lr->type == DAV1D_RESTORATION_WIENER) {
|
||||
int16_t (*const filter)[8] = params.filter;
|
||||
filter[0][0] = filter[0][6] = lr->filter_h[0];
|
||||
filter[0][1] = filter[0][5] = lr->filter_h[1];
|
||||
filter[0][2] = filter[0][4] = lr->filter_h[2];
|
||||
|
@@ -180,25 +186,26 @@ static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
filter[1][2] = filter[1][4] = lr->filter_v[2];
filter[1][3] = 128 - (filter[1][0] + filter[1][1] + filter[1][2]) * 2;

wiener_fn = dsp->lr.wiener[!(filter[0][0] | filter[1][0])];
lr_fn = dsp->lr.wiener[!(filter[0][0] | filter[1][0])];
} else {
assert(lr->type == DAV1D_RESTORATION_SGRPROJ);
const uint16_t *const sgr_params = dav1d_sgr_params[lr->sgr_idx];
params.sgr.s0 = sgr_params[0];
params.sgr.s1 = sgr_params[1];
params.sgr.w0 = lr->sgr_weights[0];
params.sgr.w1 = 128 - (lr->sgr_weights[0] + lr->sgr_weights[1]);

lr_fn = dsp->lr.sgr[!!sgr_params[0] + !!sgr_params[1] * 2 - 1];
}

while (y + stripe_h <= row_h) {
// Change HAVE_BOTTOM bit in edges to (y + stripe_h != row_h)
edges ^= (-(y + stripe_h != row_h) ^ edges) & LR_HAVE_BOTTOM;
if (wiener_fn) {
wiener_fn(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
filter, edges HIGHBD_CALL_SUFFIX);
} else {
dsp->lr.selfguided(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
lr->sgr_idx, lr->sgr_weights, edges HIGHBD_CALL_SUFFIX);
}
// Change the HAVE_BOTTOM bit in edges to (sby + 1 != f->sbh || y + stripe_h != row_h)
edges ^= (-(sby + 1 != f->sbh || y + stripe_h != row_h) ^ edges) & LR_HAVE_BOTTOM;
lr_fn(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
&params, edges HIGHBD_CALL_SUFFIX);

left += stripe_h;
y += stripe_h;
if (y + stripe_h > row_h && sbrow_has_bottom) break;
p += stripe_h * PXSTRIDE(p_stride);
edges |= LR_HAVE_TOP;
stripe_h = imin(64 >> ss_ver, row_h - y);

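Note: the `edges ^= (-(cond) ^ edges) & LR_HAVE_BOTTOM` lines above use a branchless idiom to force a single flag bit to match a boolean. A tiny self-contained sketch of the same idiom, with illustrative names:

static inline unsigned set_flag(const unsigned flags, const unsigned bit,
                                const int cond)
{
    /* -(unsigned)cond is all-ones when cond is true and all-zeros otherwise,
     * so the masked xor forces exactly the `bit` position to equal `cond`
     * while leaving every other flag untouched */
    return flags ^ ((-(unsigned)cond ^ flags) & bit);
}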
@ -242,8 +249,7 @@ static void lr_sbrow(const Dav1dFrameContext *const f, pixel *p, const int y,
|
|||
pixel pre_lr_border[2][128 + 8 /* maximum sbrow height is 128 + 8 rows offset */][4];
|
||||
const Av1RestorationUnit *lr[2];
|
||||
|
||||
enum LrEdgeFlags edges = (y > 0 ? LR_HAVE_TOP : 0) | LR_HAVE_RIGHT |
|
||||
(row_h < h ? LR_HAVE_BOTTOM : 0);
|
||||
enum LrEdgeFlags edges = (y > 0 ? LR_HAVE_TOP : 0) | LR_HAVE_RIGHT;
|
||||
|
||||
int aligned_unit_pos = row_y & ~(unit_size - 1);
|
||||
if (aligned_unit_pos && aligned_unit_pos + half_unit_size > h)
|
||||
|
@ -281,11 +287,13 @@ void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3],
|
|||
const int offset_y = 8 * !!sby;
|
||||
const ptrdiff_t *const dst_stride = f->sr_cur.p.stride;
|
||||
const int restore_planes = f->lf.restore_planes;
|
||||
const int not_last = sby + 1 < f->sbh;
|
||||
|
||||
if (restore_planes & LR_RESTORE_Y) {
|
||||
const int h = f->sr_cur.p.p.h;
|
||||
const int w = f->sr_cur.p.p.w;
|
||||
const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h);
|
||||
const int next_row_y = (sby + 1) << (6 + f->seq_hdr->sb128);
|
||||
const int row_h = imin(next_row_y - 8 * not_last, h);
|
||||
const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset_y;
|
||||
lr_sbrow(f, dst[0] - offset_y * PXSTRIDE(dst_stride[0]), y_stripe, w,
|
||||
h, row_h, 0);
|
||||
|
@ -295,10 +303,10 @@ void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3],
|
|||
const int ss_hor = f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
|
||||
const int h = (f->sr_cur.p.p.h + ss_ver) >> ss_ver;
|
||||
const int w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
|
||||
const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h);
|
||||
const int next_row_y = (sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128);
|
||||
const int row_h = imin(next_row_y - (8 >> ss_ver) * not_last, h);
|
||||
const int offset_uv = offset_y >> ss_ver;
|
||||
const int y_stripe =
|
||||
(sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
|
||||
const int y_stripe = (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
|
||||
if (restore_planes & LR_RESTORE_U)
|
||||
lr_sbrow(f, dst[1] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe,
|
||||
w, h, row_h, 1);
|
||||
|
|
|
@@ -87,9 +87,15 @@ prep_c(int16_t *tmp, const pixel *src, const ptrdiff_t src_stride,
#define DAV1D_FILTER_8TAP_RND(src, x, F, stride, sh) \
((FILTER_8TAP(src, x, F, stride) + ((1 << (sh)) >> 1)) >> (sh))

#define DAV1D_FILTER_8TAP_RND2(src, x, F, stride, rnd, sh) \
((FILTER_8TAP(src, x, F, stride) + (rnd)) >> (sh))

#define DAV1D_FILTER_8TAP_CLIP(src, x, F, stride, sh) \
iclip_pixel(DAV1D_FILTER_8TAP_RND(src, x, F, stride, sh))

#define DAV1D_FILTER_8TAP_CLIP2(src, x, F, stride, rnd, sh) \
iclip_pixel(DAV1D_FILTER_8TAP_RND2(src, x, F, stride, rnd, sh))

#define GET_H_FILTER(mx) \
const int8_t *const fh = !(mx) ? NULL : w > 4 ? \
dav1d_mc_subpel_filters[filter_type & 3][(mx) - 1] : \

@@ -111,7 +117,7 @@ put_8tap_c(pixel *dst, ptrdiff_t dst_stride,
const int filter_type HIGHBD_DECL_SUFFIX)
{
const int intermediate_bits = get_intermediate_bits(bitdepth_max);
const int intermediate_rnd = (1 << intermediate_bits) >> 1;
const int intermediate_rnd = 32 + ((1 << (6 - intermediate_bits)) >> 1);

GET_FILTERS();
dst_stride = PXSTRIDE(dst_stride);

@@ -144,9 +150,8 @@ put_8tap_c(pixel *dst, ptrdiff_t dst_stride,
} else {
do {
for (int x = 0; x < w; x++) {
const int px = DAV1D_FILTER_8TAP_RND(src, x, fh, 1,
6 - intermediate_bits);
dst[x] = iclip_pixel((px + intermediate_rnd) >> intermediate_bits);
dst[x] = DAV1D_FILTER_8TAP_CLIP2(src, x, fh, 1,
intermediate_rnd, 6);
}

dst += dst_stride;

@@ -132,6 +132,8 @@ if is_asm_enabled
endif
elif host_machine.cpu_family().startswith('arm')
libdav1d_sources_asm = files(
# itx.S is used for both 8 and 16 bpc.
'arm/32/itx.S',
'arm/32/looprestoration_common.S',
'arm/32/msac.S',
)

@@ -140,7 +142,6 @@ if is_asm_enabled
libdav1d_sources_asm += files(
'arm/32/cdef.S',
'arm/32/ipred.S',
'arm/32/itx.S',
'arm/32/loopfilter.S',
'arm/32/looprestoration.S',
'arm/32/mc.S',

@@ -150,6 +151,8 @@ if is_asm_enabled
if dav1d_bitdepths.contains('16')
libdav1d_sources_asm += files(
'arm/32/cdef16.S',
'arm/32/ipred16.S',
'arm/32/itx16.S',
'arm/32/loopfilter16.S',
'arm/32/looprestoration16.S',
'arm/32/mc16.S',

@@ -183,20 +186,20 @@ if is_asm_enabled
libdav1d_sources_asm = files(
'x86/cpuid.asm',
'x86/msac.asm',
'x86/cdef_avx2.asm',
'x86/cdef_sse.asm',
)

if dav1d_bitdepths.contains('8')
libdav1d_sources_asm += files(
'x86/cdef_avx512.asm',
'x86/mc_avx512.asm',
'x86/cdef_avx2.asm',
'x86/mc_avx2.asm',
'x86/film_grain.asm',
'x86/ipred.asm',
'x86/itx.asm',
'x86/loopfilter.asm',
'x86/looprestoration.asm',
'x86/cdef_sse.asm',
'x86/film_grain_ssse3.asm',
'x86/ipred_ssse3.asm',
'x86/itx_ssse3.asm',

@@ -208,6 +211,9 @@ if is_asm_enabled

if dav1d_bitdepths.contains('16')
libdav1d_sources_asm += files(
'x86/cdef16_avx2.asm',
'x86/cdef16_sse.asm',
'x86/looprestoration16_avx2.asm',
)
endif

@@ -1,5 +1,5 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018-2021, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*

@@ -33,6 +33,7 @@

#include "dav1d/data.h"

#include "common/frame.h"
#include "common/intops.h"

#include "src/decode.h"
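Note: the new `common/frame.h` header itself is not part of this hunk; judging from the substitutions below, it presumably wraps the `frame_type & 1` parity test in named macros, roughly along these lines (a sketch, not the actual header contents):

#include "dav1d/headers.h"

/* inter and switch frames carry the odd frame_type values; key and
 * intra-only frames are everything else */
#define IS_INTER_OR_SWITCH(hdr) \
    ((hdr)->frame_type == DAV1D_FRAME_TYPE_INTER || \
     (hdr)->frame_type == DAV1D_FRAME_TYPE_SWITCH)
#define IS_KEY_OR_INTRA(hdr) (!IS_INTER_OR_SWITCH(hdr))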
@@ -406,7 +407,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
else
hdr->force_integer_mv = 0;

if (!(hdr->frame_type & 1))
if (IS_KEY_OR_INTRA(hdr))
hdr->force_integer_mv = 1;

if (seqhdr->frame_id_numbers_present)

@@ -420,7 +421,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
#endif
hdr->frame_offset = seqhdr->order_hint ?
dav1d_get_bits(gb, seqhdr->order_hint_n_bits) : 0;
hdr->primary_ref_frame = !hdr->error_resilient_mode && hdr->frame_type & 1 ?
hdr->primary_ref_frame = !hdr->error_resilient_mode && IS_INTER_OR_SWITCH(hdr) ?
dav1d_get_bits(gb, 3) : DAV1D_PRIMARY_REF_NONE;

if (seqhdr->decoder_model_info_present) {

@@ -439,9 +440,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
}
}

if (hdr->frame_type == DAV1D_FRAME_TYPE_KEY ||
hdr->frame_type == DAV1D_FRAME_TYPE_INTRA)
{
if (IS_KEY_OR_INTRA(hdr)) {
hdr->refresh_frame_flags = (hdr->frame_type == DAV1D_FRAME_TYPE_KEY &&
hdr->show_frame) ? 0xff : dav1d_get_bits(gb, 8);
if (hdr->refresh_frame_flags != 0xff && hdr->error_resilient_mode && seqhdr->order_hint)

@@ -569,7 +568,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
hdr->switchable_motion_mode = dav1d_get_bits(gb, 1);
hdr->use_ref_frame_mvs = !hdr->error_resilient_mode &&
seqhdr->ref_frame_mvs && seqhdr->order_hint &&
hdr->frame_type & 1 && dav1d_get_bits(gb, 1);
IS_INTER_OR_SWITCH(hdr) && dav1d_get_bits(gb, 1);
}
#if DEBUG_FRAME_HDR
printf("HDR: post-frametype-specific-bits: off=%td\n",

@@ -916,13 +915,13 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
printf("HDR: post-txfmmode: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
hdr->switchable_comp_refs = hdr->frame_type & 1 ? dav1d_get_bits(gb, 1) : 0;
hdr->switchable_comp_refs = IS_INTER_OR_SWITCH(hdr) ? dav1d_get_bits(gb, 1) : 0;
#if DEBUG_FRAME_HDR
printf("HDR: post-refmode: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
hdr->skip_mode_allowed = 0;
if (hdr->switchable_comp_refs && hdr->frame_type & 1 && seqhdr->order_hint) {
if (hdr->switchable_comp_refs && IS_INTER_OR_SWITCH(hdr) && seqhdr->order_hint) {
const unsigned poc = hdr->frame_offset;
unsigned off_before = 0xFFFFFFFFU;
int off_after = -1;

@@ -982,7 +981,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
printf("HDR: post-extskip: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
hdr->warp_motion = !hdr->error_resilient_mode && hdr->frame_type & 1 &&
hdr->warp_motion = !hdr->error_resilient_mode && IS_INTER_OR_SWITCH(hdr) &&
seqhdr->warped_motion && dav1d_get_bits(gb, 1);
#if DEBUG_FRAME_HDR
printf("HDR: post-warpmotionbit: off=%td\n",

@@ -997,7 +996,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
for (int i = 0; i < 7; i++)
hdr->gmv[i] = dav1d_default_wm_params;

if (hdr->frame_type & 1) {
if (IS_INTER_OR_SWITCH(hdr)) {
for (int i = 0; i < 7; i++) {
hdr->gmv[i].type = !dav1d_get_bits(gb, 1) ? DAV1D_WM_TYPE_IDENTITY :
dav1d_get_bits(gb, 1) ? DAV1D_WM_TYPE_ROT_ZOOM :

@@ -299,7 +299,6 @@ static inline void padding(uint8_t *dst, const uint8_t *p,
}
}


// FIXME Could split into luma and chroma specific functions,
// (since first and last tops are always 0 for chroma)
// FIXME Could implement a version that requires less temporary memory

@@ -309,9 +308,11 @@ static void wiener_filter_vsx(uint8_t *p, const ptrdiff_t p_stride,
const uint8_t *lpf,
const ptrdiff_t lpf_stride,
const int w, const int h,
const int16_t filter[2][8],
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
const int16_t (*const filter)[8] = params->filter;

// Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
// of padding above and below
ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);

@@ -320,7 +321,6 @@ static void wiener_filter_vsx(uint8_t *p, const ptrdiff_t p_stride,

wiener_filter_h_vsx(hor, tmp, filter[0], w, h);
wiener_filter_v_vsx(p, p_stride, hor, filter[1], w, h);

}
#endif

@@ -3066,7 +3066,6 @@ static const uint8_t qm_tbl_32x32_t[][2][528] = {
};

const uint8_t *dav1d_qm_tbl[16][2][N_RECT_TX_SIZES];
static uint8_t pb_32x32[32 * 32];
static uint8_t qm_tbl_4x4[15][2][16];
static uint8_t qm_tbl_4x8[15][2][32];
static uint8_t qm_tbl_4x16[15][2][64];

@@ -3145,8 +3144,5 @@ COLD void dav1d_init_qm_tables(void) {
dav1d_qm_tbl[i][j][RTX_16X64] = dav1d_qm_tbl[i][j][RTX_16X32];
}

memset(pb_32x32, 32, sizeof(pb_32x32));
for (int j = 0; j < 2; j++)
for (int k = 0; k < N_RECT_TX_SIZES; k++)
dav1d_qm_tbl[15][j][k] = pb_32x32;
// dav1d_qm_tbl[15][*][*] == NULL
}

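Note: with the flat level-15 matrix no longer materialized, `dav1d_qm_tbl[15][*][*]` stays NULL and consumers treat a missing table as "no scaling" (the reconstruction change elsewhere in this commit similarly selects `*txtp < IDTX ? f->qm[tx][plane] : NULL`). A hedged sketch of that consumer-side check; names are illustrative, not the actual decoder code:

static inline unsigned dequant_sketch(const unsigned dq, const uint8_t *const qm,
                                      const int rc, const unsigned tok)
{
    /* a NULL quantizer-matrix table (e.g. the flat level-15 case) means the
     * base dequant value is used unscaled */
    const unsigned d = qm ? (dq * qm[rc] + 16) >> 5 : dq;
    return d * tok;
}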
@@ -65,6 +65,14 @@ decl_recon_b_inter_fn(dav1d_recon_b_inter_16bpc);

decl_filter_sbrow_fn(dav1d_filter_sbrow_8bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_16bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_8bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_16bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_cdef_8bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_cdef_16bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_resize_8bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_resize_16bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_lr_8bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_lr_16bpc);

decl_backup_ipred_edge_fn(dav1d_backup_ipred_edge_8bpc);
decl_backup_ipred_edge_fn(dav1d_backup_ipred_edge_16bpc);

@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright © 2018, VideoLAN and dav1d authors
|
||||
* Copyright © 2018-2021, VideoLAN and dav1d authors
|
||||
* Copyright © 2018, Two Orioles, LLC
|
||||
* All rights reserved.
|
||||
*
|
||||
|
@ -33,6 +33,7 @@
|
|||
#include "common/attributes.h"
|
||||
#include "common/bitdepth.h"
|
||||
#include "common/dump.h"
|
||||
#include "common/frame.h"
|
||||
#include "common/intops.h"
|
||||
|
||||
#include "src/cdef_apply.h"
|
||||
|
@ -438,34 +439,39 @@ static int decode_coefs(Dav1dTileContext *const t,
|
|||
} else {
|
||||
eob = eob_bin;
|
||||
}
|
||||
assert(eob >= 0);
|
||||
|
||||
// base tokens
|
||||
uint16_t (*const eob_cdf)[4] = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma];
|
||||
uint16_t (*const hi_cdf)[4] = ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma];
|
||||
const uint16_t *const scan = dav1d_scans[tx][tx_class];
|
||||
int dc_tok;
|
||||
unsigned rc, dc_tok;
|
||||
|
||||
if (eob) {
|
||||
uint16_t (*const lo_cdf)[4] = ts->cdf.coef.base_tok[t_dim->ctx][chroma];
|
||||
uint8_t *const levels = t->scratch.levels; // bits 0-5: tok, 6-7: lo_tok
|
||||
const int sw = imin(t_dim->w, 8), sh = imin(t_dim->h, 8);
|
||||
const unsigned shift = 2 + imin(t_dim->lh, 3), mask = 4 * sh - 1;
|
||||
|
||||
/* eob */
|
||||
unsigned rc = scan[eob], x = rc >> shift, y = rc & mask;
|
||||
unsigned ctx = 1 + (eob > sw * sh * 2) + (eob > sw * sh * 4);
|
||||
int eob_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[ctx], 2);
|
||||
int tok = eob_tok + 1;
|
||||
int level_tok = tok * 0x41;
|
||||
unsigned mag;
|
||||
if (dbg)
|
||||
printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n",
|
||||
t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng);
|
||||
|
||||
#define DECODE_COEFS_CLASS(tx_class) \
|
||||
unsigned x, y; \
|
||||
if (tx_class == TX_CLASS_2D) \
|
||||
rc = scan[eob], x = rc >> shift, y = rc & mask; \
|
||||
else if (tx_class == TX_CLASS_H) \
|
||||
/* Transposing reduces the stride and padding requirements */ \
|
||||
x = eob & mask, y = eob >> shift, rc = eob; \
|
||||
else /* tx_class == TX_CLASS_V */ \
|
||||
x = eob & mask, y = eob >> shift, rc = (x << shift2) | y; \
|
||||
if (dbg) \
|
||||
printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
|
||||
t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng); \
|
||||
if (eob_tok == 2) { \
|
||||
ctx = (tx_class == TX_CLASS_2D ? (x | y) > 1 : \
|
||||
tx_class == TX_CLASS_H ? x != 0 : y != 0) ? 14 : 7; \
|
||||
ctx = (tx_class == TX_CLASS_2D ? (x | y) > 1 : y != 0) ? 14 : 7; \
|
||||
tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
|
||||
level_tok = tok + (3 << 6); \
|
||||
if (dbg) \
|
||||
|
@ -473,40 +479,46 @@ static int decode_coefs(Dav1dTileContext *const t,
|
|||
imin(t_dim->ctx, 3), chroma, ctx, eob, rc, tok, \
|
||||
ts->msac.rng); \
|
||||
} \
|
||||
cf[rc] = tok; \
|
||||
if (tx_class == TX_CLASS_H) \
|
||||
/* Transposing reduces the stride and padding requirements */ \
|
||||
levels[y * stride + x] = (uint8_t) level_tok; \
|
||||
else \
|
||||
levels[x * stride + y] = (uint8_t) level_tok; \
|
||||
cf[rc] = tok << 11; \
|
||||
levels[x * stride + y] = (uint8_t) level_tok; \
|
||||
for (int i = eob - 1; i > 0; i--) { /* ac */ \
|
||||
if (tx_class == TX_CLASS_H) \
|
||||
rc = i, x = rc & mask, y = rc >> shift; \
|
||||
else \
|
||||
rc = scan[i], x = rc >> shift, y = rc & mask; \
|
||||
unsigned rc_i; \
|
||||
if (tx_class == TX_CLASS_2D) \
|
||||
rc_i = scan[i], x = rc_i >> shift, y = rc_i & mask; \
|
||||
else if (tx_class == TX_CLASS_H) \
|
||||
x = i & mask, y = i >> shift, rc_i = i; \
|
||||
else /* tx_class == TX_CLASS_V */ \
|
||||
x = i & mask, y = i >> shift, rc_i = (x << shift2) | y; \
|
||||
assert(x < 32 && y < 32); \
|
||||
uint8_t *const level = levels + x * stride + y; \
|
||||
ctx = get_lo_ctx(level, tx_class, &mag, lo_ctx_offsets, x, y, stride); \
|
||||
if (tx_class == TX_CLASS_2D) \
|
||||
y |= x; \
|
||||
tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
|
||||
level_tok = tok * 0x41; \
|
||||
if (dbg) \
|
||||
printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
|
||||
t_dim->ctx, chroma, ctx, i, rc, tok, ts->msac.rng); \
|
||||
t_dim->ctx, chroma, ctx, i, rc_i, tok, ts->msac.rng); \
|
||||
if (tok == 3) { \
|
||||
mag &= 63; \
|
||||
ctx = (y > (tx_class == TX_CLASS_2D) ? 14 : 7) + \
|
||||
(mag > 12 ? 6 : (mag + 1) >> 1); \
|
||||
tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
|
||||
level_tok = tok + (3 << 6); \
|
||||
if (dbg) \
|
||||
printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
|
||||
imin(t_dim->ctx, 3), chroma, ctx, i, rc, tok, \
|
||||
imin(t_dim->ctx, 3), chroma, ctx, i, rc_i, tok, \
|
||||
ts->msac.rng); \
|
||||
*level = (uint8_t) (tok + (3 << 6)); \
|
||||
cf[rc_i] = (tok << 11) | rc; \
|
||||
rc = rc_i; \
|
||||
} else { \
|
||||
/* 0x1 for tok, 0x7ff as bitmask for rc, 0x41 for level_tok */ \
|
||||
tok *= 0x17ff41; \
|
||||
*level = (uint8_t) tok; \
|
||||
/* tok ? (tok << 11) | rc : 0 */ \
|
||||
tok = (tok >> 9) & (rc + ~0x7ffu); \
|
||||
if (tok) rc = rc_i; \
|
||||
cf[rc_i] = tok; \
|
||||
} \
|
||||
cf[rc] = tok; \
|
||||
*level = (uint8_t) level_tok; \
|
||||
} \
|
||||
/* dc */ \
|
||||
ctx = (tx_class == TX_CLASS_2D) ? 0 : \
|
||||
|
@ -528,27 +540,35 @@ static int decode_coefs(Dav1dTileContext *const t,
|
|||
} \
|
||||
break
|
||||
|
||||
const uint16_t *scan;
|
||||
switch (tx_class) {
|
||||
case TX_CLASS_2D: {
|
||||
const unsigned nonsquare_tx = tx >= RTX_4X8;
|
||||
const uint8_t (*const lo_ctx_offsets)[5] =
|
||||
dav1d_lo_ctx_offsets[nonsquare_tx + (tx & nonsquare_tx)];
|
||||
scan = dav1d_scans[tx];
|
||||
const ptrdiff_t stride = 4 * sh;
|
||||
const unsigned shift = t_dim->lh < 4 ? t_dim->lh + 2 : 5, shift2 = 0;
|
||||
const unsigned mask = 4 * sh - 1;
|
||||
memset(levels, 0, stride * (4 * sw + 2));
|
||||
DECODE_COEFS_CLASS(TX_CLASS_2D);
|
||||
}
|
||||
case TX_CLASS_H: {
|
||||
#define lo_ctx_offsets NULL
|
||||
const uint8_t (*const lo_ctx_offsets)[5] = NULL;
|
||||
const ptrdiff_t stride = 16;
|
||||
const unsigned shift = t_dim->lh + 2, shift2 = 0;
|
||||
const unsigned mask = 4 * sh - 1;
|
||||
memset(levels, 0, stride * (4 * sh + 2));
|
||||
DECODE_COEFS_CLASS(TX_CLASS_H);
|
||||
}
|
||||
case TX_CLASS_V: {
|
||||
const uint8_t (*const lo_ctx_offsets)[5] = NULL;
|
||||
const ptrdiff_t stride = 16;
|
||||
const unsigned shift = t_dim->lw + 2, shift2 = t_dim->lh + 2;
|
||||
const unsigned mask = 4 * sw - 1;
|
||||
memset(levels, 0, stride * (4 * sw + 2));
|
||||
DECODE_COEFS_CLASS(TX_CLASS_V);
|
||||
}
|
||||
#undef lo_ctx_offsets
|
||||
#undef DECODE_COEFS_CLASS
|
||||
default: assert(0);
|
||||
}
|
||||
|
@ -564,71 +584,137 @@ static int decode_coefs(Dav1dTileContext *const t,
|
|||
printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n",
|
||||
imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng);
|
||||
}
|
||||
rc = 0;
|
||||
}
|
||||
|
||||
// residual and sign
|
||||
int dc_sign = 1 << 6;
|
||||
const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
|
||||
const uint8_t *const qm_tbl = f->qm[lossless || is_1d || *txtp == IDTX][tx][plane];
|
||||
const uint8_t *const qm_tbl = *txtp < IDTX ? f->qm[tx][plane] : NULL;
|
||||
const int dq_shift = imax(0, t_dim->ctx - 2);
|
||||
const int bitdepth = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
|
||||
const int cf_max = (1 << (7 + bitdepth)) - 1;
|
||||
unsigned cul_level = 0;
|
||||
const unsigned cf_max = ~(~127U << (BITDEPTH == 8 ? 8 : f->cur.p.bpc));
|
||||
unsigned cul_level, dc_sign_level;
|
||||
|
||||
if (dc_tok) { // dc
|
||||
const int dc_sign_ctx = get_dc_sign_ctx(tx, a, l);
|
||||
uint16_t *const dc_sign_cdf =
|
||||
ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
|
||||
const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
|
||||
const unsigned dq = (dq_tbl[0] * qm_tbl[0] + 16) >> 5;
|
||||
if (dbg)
|
||||
printf("Post-dc_sign[%d][%d][%d]: r=%d\n",
|
||||
chroma, dc_sign_ctx, sign, ts->msac.rng);
|
||||
dc_sign = (sign - 1) & (2 << 6);
|
||||
if (!dc_tok) {
|
||||
cul_level = 0;
|
||||
dc_sign_level = 1 << 6;
|
||||
if (qm_tbl) goto ac_qm;
|
||||
goto ac_noqm;
|
||||
}
|
||||
|
||||
const int dc_sign_ctx = get_dc_sign_ctx(tx, a, l);
|
||||
uint16_t *const dc_sign_cdf = ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
|
||||
const int dc_sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
|
||||
if (dbg)
|
||||
printf("Post-dc_sign[%d][%d][%d]: r=%d\n",
|
||||
chroma, dc_sign_ctx, dc_sign, ts->msac.rng);
|
||||
|
||||
unsigned dc_dq = dq_tbl[0];
|
||||
dc_sign_level = (dc_sign - 1) & (2 << 6);
|
||||
|
||||
if (qm_tbl) {
|
||||
dc_dq = (dc_dq * qm_tbl[0] + 16) >> 5;
|
||||
|
||||
if (dc_tok == 15) {
|
||||
dc_tok += read_golomb(&ts->msac);
|
||||
dc_tok = read_golomb(&ts->msac) + 15;
|
||||
if (dbg)
|
||||
printf("Post-dc_residual[%d->%d]: r=%d\n",
|
||||
dc_tok - 15, dc_tok, ts->msac.rng);
|
||||
|
||||
dc_tok &= 0xfffff;
|
||||
dc_dq = (dc_dq * dc_tok) & 0xffffff;
|
||||
} else {
|
||||
dc_dq *= dc_tok;
|
||||
assert(dc_dq <= 0xffffff);
|
||||
}
|
||||
cul_level = dc_tok;
|
||||
dc_dq >>= dq_shift;
|
||||
cf[0] = (coef) (umin(dc_dq - dc_sign, cf_max) ^ -dc_sign);
|
||||
|
||||
cul_level += dc_tok;
|
||||
dc_tok = ((dq * dc_tok) & 0xffffff) >> dq_shift;
|
||||
cf[0] = imin(dc_tok - sign, cf_max) ^ -sign;
|
||||
}
|
||||
for (int i = 1; i <= eob; i++) { // ac
|
||||
const int rc = scan[i];
|
||||
int tok = cf[rc];
|
||||
if (!tok) continue;
|
||||
if (rc) ac_qm: {
|
||||
const unsigned ac_dq = dq_tbl[1];
|
||||
do {
|
||||
const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
|
||||
if (dbg)
|
||||
printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
|
||||
const unsigned rc_tok = cf[rc];
|
||||
unsigned tok, dq = (ac_dq * qm_tbl[rc] + 16) >> 5;
|
||||
|
||||
// sign
|
||||
const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
|
||||
const unsigned dq = (dq_tbl[1] * qm_tbl[rc] + 16) >> 5;
|
||||
if (dbg)
|
||||
printf("Post-sign[%d=%d=%d]: r=%d\n", i, rc, sign, ts->msac.rng);
|
||||
if (rc_tok >= (15 << 11)) {
|
||||
tok = read_golomb(&ts->msac) + 15;
|
||||
if (dbg)
|
||||
printf("Post-residual[%d=%d->%d]: r=%d\n",
|
||||
rc, tok - 15, tok, ts->msac.rng);
|
||||
|
||||
// residual
|
||||
if (tok == 15) {
|
||||
tok += read_golomb(&ts->msac);
|
||||
tok &= 0xfffff;
|
||||
dq = (dq * tok) & 0xffffff;
|
||||
} else {
|
||||
tok = rc_tok >> 11;
|
||||
dq *= tok;
|
||||
assert(dq <= 0xffffff);
|
||||
}
|
||||
cul_level += tok;
|
||||
dq >>= dq_shift;
|
||||
cf[rc] = (coef) (umin(dq - sign, cf_max) ^ -sign);
|
||||
|
||||
rc = rc_tok & 0x3ff;
|
||||
} while (rc);
|
||||
}
|
||||
} else {
|
||||
// non-qmatrix is the common case and allows for additional optimizations
|
||||
if (dc_tok == 15) {
|
||||
dc_tok = read_golomb(&ts->msac) + 15;
|
||||
if (dbg)
|
||||
printf("Post-residual[%d=%d=%d->%d]: r=%d\n",
|
||||
i, rc, tok - 15, tok, ts->msac.rng);
|
||||
printf("Post-dc_residual[%d->%d]: r=%d\n",
|
||||
dc_tok - 15, dc_tok, ts->msac.rng);
|
||||
|
||||
// coefficient parsing, see 5.11.39
|
||||
tok &= 0xfffff;
|
||||
dc_tok &= 0xfffff;
|
||||
dc_dq = ((dc_dq * dc_tok) & 0xffffff) >> dq_shift;
|
||||
dc_dq = umin(dc_dq - dc_sign, cf_max);
|
||||
} else {
|
||||
dc_dq = ((dc_dq * dc_tok) >> dq_shift) - dc_sign;
|
||||
assert(dc_dq <= cf_max);
|
||||
}
|
||||
cul_level = dc_tok;
|
||||
cf[0] = (coef) (dc_dq ^ -dc_sign);
|
||||
|
||||
// dequant, see 7.12.3
|
||||
cul_level += tok;
|
||||
tok = ((dq * tok) & 0xffffff) >> dq_shift;
|
||||
cf[rc] = imin(tok - sign, cf_max) ^ -sign;
|
||||
if (rc) ac_noqm: {
|
||||
const unsigned ac_dq = dq_tbl[1];
|
||||
do {
|
||||
const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
|
||||
if (dbg)
|
||||
printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
|
||||
const unsigned rc_tok = cf[rc];
|
||||
unsigned tok, dq;
|
||||
|
||||
// residual
|
||||
if (rc_tok >= (15 << 11)) {
|
||||
tok = read_golomb(&ts->msac) + 15;
|
||||
if (dbg)
|
||||
printf("Post-residual[%d=%d->%d]: r=%d\n",
|
||||
rc, tok - 15, tok, ts->msac.rng);
|
||||
|
||||
// coefficient parsing, see 5.11.39
|
||||
tok &= 0xfffff;
|
||||
|
||||
// dequant, see 7.12.3
|
||||
dq = ((ac_dq * tok) & 0xffffff) >> dq_shift;
|
||||
dq = umin(dq - sign, cf_max);
|
||||
} else {
|
||||
// cannot exceed cf_max, so we can avoid the clipping
|
||||
tok = rc_tok >> 11;
|
||||
dq = ((ac_dq * tok) >> dq_shift) - sign;
|
||||
assert(dq <= cf_max);
|
||||
}
|
||||
cul_level += tok;
|
||||
cf[rc] = (coef) (dq ^ -sign);
|
||||
|
||||
rc = rc_tok & 0x3ff; // next non-zero rc, zero if eob
|
||||
} while (rc);
|
||||
}
|
||||
}
|
||||
|
||||
// context
|
||||
*res_ctx = umin(cul_level, 63) | dc_sign;
|
||||
*res_ctx = umin(cul_level, 63) | dc_sign_level;
|
||||
|
||||
return eob;
|
||||
}
|
||||
|
@ -1544,7 +1630,7 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
|
|||
4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
|
||||
const ptrdiff_t uvdstoff =
|
||||
4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
|
||||
if (!(f->frame_hdr->frame_type & 1)) {
|
||||
if (IS_KEY_OR_INTRA(f->frame_hdr)) {
|
||||
// intrabc
|
||||
assert(!f->frame_hdr->super_res.enabled);
|
||||
res = mc(t, dst, NULL, f->cur.stride[0], bw4, bh4, t->bx, t->by, 0,
|
||||
|
@ -1965,74 +2051,107 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
|
|||
return 0;
|
||||
}
|
||||
|
||||
void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
|
||||
const int sbsz = f->sb_step, sbh = f->sbh;
|
||||
|
||||
if (f->frame_hdr->loopfilter.level_y[0] ||
|
||||
f->frame_hdr->loopfilter.level_y[1])
|
||||
{
|
||||
void bytefn(dav1d_filter_sbrow_deblock)(Dav1dFrameContext*const f, const int sby) {
|
||||
const int y = sby * f->sb_step * 4;
|
||||
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
pixel *const p[3] = {
|
||||
f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
|
||||
f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
|
||||
f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
|
||||
};
|
||||
Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
|
||||
if (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1]) {
|
||||
int start_of_tile_row = 0;
|
||||
if (f->frame_hdr->tiling.row_start_sb[f->lf.tile_row] == sby)
|
||||
start_of_tile_row = f->lf.tile_row++;
|
||||
bytefn(dav1d_loopfilter_sbrow)(f, f->lf.p, f->lf.mask_ptr, sby,
|
||||
start_of_tile_row);
|
||||
bytefn(dav1d_loopfilter_sbrow)(f, p, mask, sby, start_of_tile_row);
|
||||
}
|
||||
|
||||
if (f->lf.restore_planes) {
|
||||
// Store loop filtered pixels required by loop restoration
|
||||
bytefn(dav1d_lr_copy_lpf)(f, f->lf.p, sby);
|
||||
}
|
||||
if (f->seq_hdr->cdef) {
|
||||
if (sby) {
|
||||
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
pixel *p_up[3] = {
|
||||
f->lf.p[0] - 8 * PXSTRIDE(f->cur.stride[0]),
|
||||
f->lf.p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
|
||||
f->lf.p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
|
||||
};
|
||||
bytefn(dav1d_cdef_brow)(f, p_up, f->lf.prev_mask_ptr,
|
||||
sby * sbsz - 2, sby * sbsz);
|
||||
}
|
||||
const int n_blks = sbsz - 2 * (sby + 1 < sbh);
|
||||
bytefn(dav1d_cdef_brow)(f, f->lf.p, f->lf.mask_ptr, sby * sbsz,
|
||||
imin(sby * sbsz + n_blks, f->bh));
|
||||
}
|
||||
if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
|
||||
const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
|
||||
for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) {
|
||||
const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
const int h_start = 8 * !!sby >> ss_ver;
|
||||
const ptrdiff_t dst_stride = f->sr_cur.p.stride[!!pl];
|
||||
pixel *dst = f->lf.sr_p[pl] - h_start * PXSTRIDE(dst_stride);
|
||||
const ptrdiff_t src_stride = f->cur.stride[!!pl];
|
||||
const pixel *src = f->lf.p[pl] - h_start * PXSTRIDE(src_stride);
|
||||
const int h_end = 4 * (sbsz - 2 * (sby + 1 < sbh)) >> ss_ver;
|
||||
const int ss_hor = pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
|
||||
const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
|
||||
const int src_w = (4 * f->bw + ss_hor) >> ss_hor;
|
||||
const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver;
|
||||
|
||||
f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w,
|
||||
imin(img_h, h_end) + h_start, src_w,
|
||||
f->resize_step[!!pl], f->resize_start[!!pl]
|
||||
HIGHBD_CALL_SUFFIX);
|
||||
}
|
||||
}
|
||||
if (f->lf.restore_planes) {
|
||||
bytefn(dav1d_lr_sbrow)(f, f->lf.sr_p, sby);
|
||||
bytefn(dav1d_lr_copy_lpf)(f, p, sby);
|
||||
}
|
||||
}
|
||||
|
||||
void bytefn(dav1d_filter_sbrow_cdef)(Dav1dFrameContext *const f, const int sby) {
|
||||
const int sbsz = f->sb_step;
|
||||
const int y = sby * sbsz * 4;
|
||||
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
f->lf.p[0] += sbsz * 4 * PXSTRIDE(f->cur.stride[0]);
|
||||
f->lf.p[1] += sbsz * 4 * PXSTRIDE(f->cur.stride[1]) >> ss_ver;
|
||||
f->lf.p[2] += sbsz * 4 * PXSTRIDE(f->cur.stride[1]) >> ss_ver;
|
||||
f->lf.sr_p[0] += sbsz * 4 * PXSTRIDE(f->sr_cur.p.stride[0]);
|
||||
f->lf.sr_p[1] += sbsz * 4 * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver;
|
||||
f->lf.sr_p[2] += sbsz * 4 * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver;
|
||||
f->lf.prev_mask_ptr = f->lf.mask_ptr;
|
||||
if ((sby & 1) || f->seq_hdr->sb128) {
|
||||
f->lf.mask_ptr += f->sb128w;
|
||||
pixel *const p[3] = {
|
||||
f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
|
||||
f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
|
||||
f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
|
||||
};
|
||||
Av1Filter *prev_mask = f->lf.mask + ((sby - 1) >> !f->seq_hdr->sb128) * f->sb128w;
|
||||
Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
|
||||
const int start = sby * sbsz;
|
||||
if (sby) {
|
||||
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
pixel *p_up[3] = {
|
||||
p[0] - 8 * PXSTRIDE(f->cur.stride[0]),
|
||||
p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
|
||||
p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
|
||||
};
|
||||
bytefn(dav1d_cdef_brow)(f, p_up, prev_mask, start - 2, start);
|
||||
}
|
||||
const int n_blks = sbsz - 2 * (sby + 1 < f->sbh);
|
||||
const int end = imin(start + n_blks, f->bh);
|
||||
bytefn(dav1d_cdef_brow)(f, p, mask, start, end);
|
||||
}
|
||||
|
||||
void bytefn(dav1d_filter_sbrow_resize)(Dav1dFrameContext *const f, const int sby) {
|
||||
const int sbsz = f->sb_step;
|
||||
const int y = sby * sbsz * 4;
|
||||
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
const pixel *const p[3] = {
|
||||
f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
|
||||
f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
|
||||
f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
|
||||
};
|
||||
pixel *const sr_p[3] = {
|
||||
f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
|
||||
f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
|
||||
f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
|
||||
};
|
||||
const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
|
||||
for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) {
|
||||
const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
const int h_start = 8 * !!sby >> ss_ver;
|
||||
const ptrdiff_t dst_stride = f->sr_cur.p.stride[!!pl];
|
||||
pixel *dst = sr_p[pl] - h_start * PXSTRIDE(dst_stride);
|
||||
const ptrdiff_t src_stride = f->cur.stride[!!pl];
|
||||
const pixel *src = p[pl] - h_start * PXSTRIDE(src_stride);
|
||||
const int h_end = 4 * (sbsz - 2 * (sby + 1 < f->sbh)) >> ss_ver;
|
||||
const int ss_hor = pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
|
||||
const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
|
||||
const int src_w = (4 * f->bw + ss_hor) >> ss_hor;
|
||||
const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver;
|
||||
|
||||
f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w,
|
||||
imin(img_h, h_end) + h_start, src_w,
|
||||
f->resize_step[!!pl], f->resize_start[!!pl]
|
||||
HIGHBD_CALL_SUFFIX);
|
||||
}
|
||||
}
|
||||
|
||||
void bytefn(dav1d_filter_sbrow_lr)(Dav1dFrameContext *const f, const int sby) {
|
||||
const int y = sby * f->sb_step * 4;
|
||||
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
pixel *const sr_p[3] = {
|
||||
f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
|
||||
f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
|
||||
f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
|
||||
};
|
||||
bytefn(dav1d_lr_sbrow)(f, sr_p, sby);
|
||||
}
|
||||
|
||||
void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
|
||||
bytefn(dav1d_filter_sbrow_deblock)(f, sby);
|
||||
if (f->seq_hdr->cdef)
|
||||
bytefn(dav1d_filter_sbrow_cdef)(f, sby);
|
||||
if (f->frame_hdr->width[0] != f->frame_hdr->width[1])
|
||||
bytefn(dav1d_filter_sbrow_resize)(f, sby);
|
||||
if (f->lf.restore_planes)
|
||||
bytefn(dav1d_filter_sbrow_lr)(f, sby);
|
||||
}
|
||||
|
||||
void bytefn(dav1d_backup_ipred_edge)(Dav1dTileContext *const t) {
|
||||
|
|
|
@@ -51,12 +51,13 @@ static void add_spatial_candidate(refmvs_candidate *const mvstack, int *const cn
const mv cand_mv = ((b->mf & 1) && gmv[0].n != INVALID_MV) ?
gmv[0] : b->mv.mv[n];

*have_refmv_match = 1;
*have_newmv_match |= b->mf >> 1;

const int last = *cnt;
for (int m = 0; m < last; m++)
if (mvstack[m].mv.mv[0].n == cand_mv.n) {
mvstack[m].weight += weight;
*have_refmv_match = 1;
*have_newmv_match |= b->mf >> 1;
return;
}

@@ -65,8 +66,6 @@ static void add_spatial_candidate(refmvs_candidate *const mvstack, int *const cn
mvstack[last].weight = weight;
*cnt = last + 1;
}
*have_refmv_match = 1;
*have_newmv_match |= b->mf >> 1;
return;
}
}

@@ -76,12 +75,13 @@ static void add_spatial_candidate(refmvs_candidate *const mvstack, int *const cn
[1] = ((b->mf & 1) && gmv[1].n != INVALID_MV) ? gmv[1] : b->mv.mv[1],
}};

*have_refmv_match = 1;
*have_newmv_match |= b->mf >> 1;

const int last = *cnt;
for (int n = 0; n < last; n++)
if (mvstack[n].mv.n == cand_mv.n) {
mvstack[n].weight += weight;
*have_refmv_match = 1;
*have_newmv_match |= b->mf >> 1;
return;
}

@@ -90,8 +90,6 @@ static void add_spatial_candidate(refmvs_candidate *const mvstack, int *const cn
mvstack[last].weight = weight;
*cnt = last + 1;
}
*have_refmv_match = 1;
*have_newmv_match |= b->mf >> 1;
}
}

@ -30,19 +30,14 @@
|
|||
#include "common/attributes.h"
|
||||
#include "src/scan.h"
|
||||
|
||||
static const uint16_t ALIGN(av1_default_scan_4x4[], 32) = {
|
||||
static const uint16_t ALIGN(scan_4x4[], 32) = {
|
||||
0, 4, 1, 2,
|
||||
5, 8, 12, 9,
|
||||
6, 3, 7, 10,
|
||||
13, 14, 11, 15,
|
||||
};
|
||||
static const uint16_t ALIGN(av1_mrow_scan_4x4[], 32) = {
|
||||
0, 4, 8, 12,
|
||||
1, 5, 9, 13,
|
||||
2, 6, 10, 14,
|
||||
3, 7, 11, 15,
|
||||
};
|
||||
static const uint16_t ALIGN(av1_default_scan_4x8[], 32) = {
|
||||
|
||||
static const uint16_t ALIGN(scan_4x8[], 32) = {
|
||||
0, 8, 1, 16,
|
||||
9, 2, 24, 17,
|
||||
10, 3, 25, 18,
|
||||
|
@ -52,17 +47,8 @@ static const uint16_t ALIGN(av1_default_scan_4x8[], 32) = {
|
|||
14, 7, 29, 22,
|
||||
15, 30, 23, 31,
|
||||
};
|
||||
static const uint16_t ALIGN(av1_mrow_scan_4x8[], 32) = {
|
||||
0, 8, 16, 24,
|
||||
1, 9, 17, 25,
|
||||
2, 10, 18, 26,
|
||||
3, 11, 19, 27,
|
||||
4, 12, 20, 28,
|
||||
5, 13, 21, 29,
|
||||
6, 14, 22, 30,
|
||||
7, 15, 23, 31,
|
||||
};
|
||||
static const uint16_t ALIGN(av1_default_scan_4x16[], 32) = {
|
||||
|
||||
static const uint16_t ALIGN(scan_4x16[], 32) = {
|
||||
0, 16, 1, 32,
|
||||
17, 2, 48, 33,
|
||||
18, 3, 49, 34,
|
||||
|
@ -80,37 +66,15 @@ static const uint16_t ALIGN(av1_default_scan_4x16[], 32) = {
|
|||
30, 15, 61, 46,
|
||||
31, 62, 47, 63,
|
||||
};
|
||||
static const uint16_t ALIGN(av1_mrow_scan_4x16[], 32) = {
|
||||
0, 16, 32, 48,
|
||||
1, 17, 33, 49,
|
||||
2, 18, 34, 50,
|
||||
3, 19, 35, 51,
|
||||
4, 20, 36, 52,
|
||||
5, 21, 37, 53,
|
||||
6, 22, 38, 54,
|
||||
7, 23, 39, 55,
|
||||
8, 24, 40, 56,
|
||||
9, 25, 41, 57,
|
||||
10, 26, 42, 58,
|
||||
11, 27, 43, 59,
|
||||
12, 28, 44, 60,
|
||||
13, 29, 45, 61,
|
||||
14, 30, 46, 62,
|
||||
15, 31, 47, 63,
|
||||
};
|
||||
static const uint16_t ALIGN(av1_default_scan_8x4[], 32) = {
|
||||
|
||||
static const uint16_t ALIGN(scan_8x4[], 32) = {
|
||||
0, 1, 4, 2, 5, 8, 3, 6,
|
||||
9, 12, 7, 10, 13, 16, 11, 14,
|
||||
17, 20, 15, 18, 21, 24, 19, 22,
|
||||
25, 28, 23, 26, 29, 27, 30, 31,
|
||||
};
|
||||
static const uint16_t ALIGN(av1_mrow_scan_8x4[], 32) = {
|
||||
0, 4, 8, 12, 16, 20, 24, 28,
|
||||
1, 5, 9, 13, 17, 21, 25, 29,
|
||||
2, 6, 10, 14, 18, 22, 26, 30,
|
||||
3, 7, 11, 15, 19, 23, 27, 31,
|
||||
};
|
||||
static const uint16_t ALIGN(av1_default_scan_8x8[], 32) = {
|
||||
|
||||
static const uint16_t ALIGN(scan_8x8[], 32) = {
|
||||
0, 8, 1, 2, 9, 16, 24, 17,
|
||||
10, 3, 4, 11, 18, 25, 32, 40,
|
||||
33, 26, 19, 12, 5, 6, 13, 20,
|
||||
|
@ -120,17 +84,8 @@ static const uint16_t ALIGN(av1_default_scan_8x8[], 32) = {
|
|||
23, 31, 38, 45, 52, 59, 60, 53,
|
||||
46, 39, 47, 54, 61, 62, 55, 63,
|
||||
};
|
||||
static const uint16_t ALIGN(av1_mrow_scan_8x8[], 32) = {
|
||||
0, 8, 16, 24, 32, 40, 48, 56,
|
||||
1, 9, 17, 25, 33, 41, 49, 57,
|
||||
2, 10, 18, 26, 34, 42, 50, 58,
|
||||
3, 11, 19, 27, 35, 43, 51, 59,
|
||||
4, 12, 20, 28, 36, 44, 52, 60,
|
||||
5, 13, 21, 29, 37, 45, 53, 61,
|
||||
6, 14, 22, 30, 38, 46, 54, 62,
|
||||
7, 15, 23, 31, 39, 47, 55, 63,
|
||||
};
|
||||
static const uint16_t ALIGN(av1_default_scan_8x16[], 32) = {
|
||||
|
||||
static const uint16_t ALIGN(scan_8x16[], 32) = {
|
||||
0, 16, 1, 32, 17, 2, 48, 33,
|
||||
18, 3, 64, 49, 34, 19, 4, 80,
|
||||
65, 50, 35, 20, 5, 96, 81, 66,
|
||||
|
@ -148,25 +103,8 @@ static const uint16_t ALIGN(av1_default_scan_8x16[], 32) = {
|
|||
47, 123, 108, 93, 78, 63, 124, 109,
|
||||
94, 79, 125, 110, 95, 126, 111, 127,
|
||||
};
|
||||
static const uint16_t ALIGN(av1_mrow_scan_8x16[], 32) = {
|
||||
0, 16, 32, 48, 64, 80, 96, 112,
|
||||
1, 17, 33, 49, 65, 81, 97, 113,
|
||||
2, 18, 34, 50, 66, 82, 98, 114,
|
||||
3, 19, 35, 51, 67, 83, 99, 115,
|
||||
4, 20, 36, 52, 68, 84, 100, 116,
|
||||
5, 21, 37, 53, 69, 85, 101, 117,
|
||||
6, 22, 38, 54, 70, 86, 102, 118,
|
||||
7, 23, 39, 55, 71, 87, 103, 119,
|
||||
8, 24, 40, 56, 72, 88, 104, 120,
|
||||
9, 25, 41, 57, 73, 89, 105, 121,
|
||||
10, 26, 42, 58, 74, 90, 106, 122,
|
||||
11, 27, 43, 59, 75, 91, 107, 123,
|
||||
12, 28, 44, 60, 76, 92, 108, 124,
|
||||
13, 29, 45, 61, 77, 93, 109, 125,
|
||||
14, 30, 46, 62, 78, 94, 110, 126,
|
||||
15, 31, 47, 63, 79, 95, 111, 127,
|
||||
};
|
||||
static const uint16_t ALIGN(av1_default_scan_8x32[], 32) = {
|
||||
|
||||
static const uint16_t ALIGN(scan_8x32[], 32) = {
|
||||
0, 32, 1, 64, 33, 2, 96, 65,
|
||||
34, 3, 128, 97, 66, 35, 4, 160,
|
||||
129, 98, 67, 36, 5, 192, 161, 130,
|
||||
|
@ -200,19 +138,15 @@ static const uint16_t ALIGN(av1_default_scan_8x32[], 32) = {
|
|||
95, 251, 220, 189, 158, 127, 252, 221,
|
||||
190, 159, 253, 222, 191, 254, 223, 255,
|
||||
};
|
||||
static const uint16_t ALIGN(av1_default_scan_16x4[], 32) = {
|
||||
|
||||
static const uint16_t ALIGN(scan_16x4[], 32) = {
|
||||
0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14,
|
||||
17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 32, 27, 30,
|
||||
33, 36, 31, 34, 37, 40, 35, 38, 41, 44, 39, 42, 45, 48, 43, 46,
|
||||
49, 52, 47, 50, 53, 56, 51, 54, 57, 60, 55, 58, 61, 59, 62, 63,
|
||||
};
|
||||
static const uint16_t ALIGN(av1_mrow_scan_16x4[], 32) = {
|
||||
0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
|
||||
1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61,
|
||||
2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62,
|
||||
3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63,
|
||||
};
|
||||
static const uint16_t ALIGN(av1_default_scan_16x8[], 32) = {
|
||||
|
||||
static const uint16_t ALIGN(scan_16x8[], 32) = {
|
||||
0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, 5,
|
||||
12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, 21, 28,
|
||||
35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, 30, 37, 44,
|
||||
|
@ -222,17 +156,8 @@ static const uint16_t ALIGN(av1_default_scan_16x8[], 32) = {
|
|||
99, 106, 113, 120, 79, 86, 93, 100, 107, 114, 121, 87, 94, 101, 108, 115,
|
||||
122, 95, 102, 109, 116, 123, 103, 110, 117, 124, 111, 118, 125, 119, 126, 127,
|
||||
};
|
||||
static const uint16_t ALIGN(av1_mrow_scan_16x8[], 32) = {
|
||||
0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120,
|
||||
1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
|
||||
2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
|
||||
3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123,
|
||||
4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124,
|
||||
5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125,
|
||||
6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126,
|
||||
7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127,
|
||||
};
|
||||
static const uint16_t ALIGN(av1_default_scan_16x16[], 32) = {
|
||||
|
||||
static const uint16_t ALIGN(scan_16x16[], 32) = {
|
||||
0, 16, 1, 2, 17, 32, 48, 33, 18, 3, 4, 19, 34, 49, 64, 80,
|
||||
65, 50, 35, 20, 5, 6, 21, 36, 51, 66, 81, 96, 112, 97, 82, 67,
|
||||
52, 37, 22, 7, 8, 23, 38, 53, 68, 83, 98, 113, 128, 144, 129, 114,
|
||||
|
@ -250,43 +175,8 @@ static const uint16_t ALIGN(av1_default_scan_16x16[], 32) = {
|
|||
188, 173, 158, 143, 159, 174, 189, 204, 219, 234, 249, 250, 235, 220, 205, 190,
|
||||
175, 191, 206, 221, 236, 251, 252, 237, 222, 207, 223, 238, 253, 254, 239, 255,
|
||||
};
|
||||
static const uint16_t ALIGN(av1_mrow_scan_16x16[], 32) = {
|
||||
0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240,
|
||||
1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241,
|
||||
2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242,
|
||||
3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243,
|
||||
4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244,
|
||||
5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245,
|
||||
6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246,
|
||||
7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
|
||||
8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248,
|
||||
9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249,
|
||||
10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250,
|
||||
11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251,
|
||||
12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252,
|
||||
13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253,
|
||||
14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254,
|
||||
15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255,
|
||||
};
|
||||
static const uint16_t ALIGN(av1_mcol_scan_16x16[], 32) = {
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
|
||||
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
|
||||
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
|
||||
64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
|
||||
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
|
||||
96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
|
||||
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
|
||||
128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
|
||||
144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
|
||||
160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
|
||||
176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
|
||||
192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
|
||||
208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
|
||||
224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
|
||||
240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
|
||||
};
|
||||
static const uint16_t ALIGN(av1_default_scan_16x32[], 32) = {
|
||||
|
||||
static const uint16_t ALIGN(scan_16x32[], 32) = {
|
||||
0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, 160,
|
||||
129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, 162, 131,
|
||||
100, 69, 38, 7, 256, 225, 194, 163, 132, 101, 70, 39, 8, 288, 257, 226,
|
||||
|
@ -320,7 +210,8 @@ static const uint16_t ALIGN(av1_default_scan_16x32[], 32) = {
|
|||
380, 349, 318, 287, 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413, 382,
|
||||
351, 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510, 479, 511,
|
||||
};
|
||||
static const uint16_t ALIGN(av1_default_scan_32x8[], 32) = {
|
||||
|
||||
static const uint16_t ALIGN(scan_32x8[], 32) = {
|
||||
0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, 21, 28,
|
||||
35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80, 39, 46, 53, 60,
|
||||
67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89, 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98, 105, 112, 71, 78, 85, 92,
|
||||
|
@ -330,7 +221,8 @@ static const uint16_t ALIGN(av1_default_scan_32x8[], 32) = {
|
|||
195, 202, 209, 216, 175, 182, 189, 196, 203, 210, 217, 224, 183, 190, 197, 204, 211, 218, 225, 232, 191, 198, 205, 212, 219, 226, 233, 240, 199, 206, 213, 220,
|
||||
227, 234, 241, 248, 207, 214, 221, 228, 235, 242, 249, 215, 222, 229, 236, 243, 250, 223, 230, 237, 244, 251, 231, 238, 245, 252, 239, 246, 253, 247, 254, 255,
|
||||
};
|
||||
static const uint16_t ALIGN(av1_default_scan_32x16[], 32) = {
|
||||
|
||||
static const uint16_t ALIGN(scan_32x16[], 32) = {
|
||||
0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64, 5, 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22, 37, 52,
|
||||
67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 128, 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 10, 25, 40, 55, 70, 85, 100, 115, 130,
|
||||
145, 160, 11, 26, 41, 56, 71, 86, 101, 116, 131, 146, 161, 176, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177, 192, 13, 28, 43, 58, 73,
|
||||
|
@ -348,7 +240,8 @@ static const uint16_t ALIGN(av1_default_scan_32x16[], 32) = {
|
|||
381, 396, 411, 426, 441, 456, 471, 486, 501, 367, 382, 397, 412, 427, 442, 457, 472, 487, 502, 383, 398, 413, 428, 443, 458, 473, 488, 503, 399, 414, 429, 444,
|
||||
459, 474, 489, 504, 415, 430, 445, 460, 475, 490, 505, 431, 446, 461, 476, 491, 506, 447, 462, 477, 492, 507, 463, 478, 493, 508, 479, 494, 509, 495, 510, 511,
|
||||
};
|
||||
static const uint16_t ALIGN(av1_default_scan_32x32[], 32) = {
|
||||
|
||||
static const uint16_t ALIGN(scan_32x32[], 32) = {
|
||||
0, 32, 1, 2, 33, 64, 96, 65, 34, 3, 4, 35, 66, 97, 128, 160, 129, 98, 67, 36, 5, 6, 37, 68, 99, 130, 161, 192, 224, 193, 162, 131,
|
||||
100, 69, 38, 7, 8, 39, 70, 101, 132, 163, 194, 225, 256, 288, 257, 226, 195, 164, 133, 102, 71, 40, 9, 10, 41, 72, 103, 134, 165, 196, 227, 258,
|
||||
289, 320, 352, 321, 290, 259, 228, 197, 166, 135, 104, 73, 42, 11, 12, 43, 74, 105, 136, 167, 198, 229, 260, 291, 322, 353, 384, 416, 385, 354, 323, 292,
|
||||
|
@ -383,62 +276,24 @@ static const uint16_t ALIGN(av1_default_scan_32x32[], 32) = {
|
|||
892, 861, 830, 799, 831, 862, 893, 924, 955, 986, 1017, 1018, 987, 956, 925, 894, 863, 895, 926, 957, 988, 1019, 1020, 989, 958, 927, 959, 990, 1021, 1022, 991, 1023,
|
||||
};
|
||||
|
||||
const uint16_t *const dav1d_scans[N_RECT_TX_SIZES][3] = {
|
||||
[TX_4X4] = {
|
||||
[TX_CLASS_2D] = av1_default_scan_4x4,
|
||||
[TX_CLASS_V] = av1_mrow_scan_4x4,
|
||||
[TX_CLASS_H] = av1_mcol_scan_16x16,
|
||||
}, [TX_8X8] = {
|
||||
[TX_CLASS_2D] = av1_default_scan_8x8,
|
||||
[TX_CLASS_V] = av1_mrow_scan_8x8,
|
||||
[TX_CLASS_H] = av1_mcol_scan_16x16,
|
||||
}, [TX_16X16] = {
|
||||
[TX_CLASS_2D] = av1_default_scan_16x16,
|
||||
[TX_CLASS_V] = av1_mrow_scan_16x16,
|
||||
[TX_CLASS_H] = av1_mcol_scan_16x16,
|
||||
}, [TX_32X32] = {
|
||||
[TX_CLASS_2D] = av1_default_scan_32x32,
|
||||
}, [TX_64X64] = {
|
||||
[TX_CLASS_2D] = av1_default_scan_32x32,
|
||||
}, [RTX_4X8] = {
|
||||
[TX_CLASS_2D] = av1_default_scan_4x8,
|
||||
[TX_CLASS_V] = av1_mrow_scan_4x8,
|
||||
[TX_CLASS_H] = av1_mcol_scan_16x16,
|
||||
}, [RTX_8X4] = {
|
||||
[TX_CLASS_2D] = av1_default_scan_8x4,
|
||||
[TX_CLASS_V] = av1_mrow_scan_8x4,
|
||||
[TX_CLASS_H] = av1_mcol_scan_16x16,
|
||||
}, [RTX_8X16] = {
|
||||
[TX_CLASS_2D] = av1_default_scan_8x16,
|
||||
[TX_CLASS_V] = av1_mrow_scan_8x16,
|
||||
[TX_CLASS_H] = av1_mcol_scan_16x16,
|
||||
}, [RTX_16X8] = {
|
||||
[TX_CLASS_2D] = av1_default_scan_16x8,
|
||||
[TX_CLASS_V] = av1_mrow_scan_16x8,
|
||||
[TX_CLASS_H] = av1_mcol_scan_16x16,
|
||||
}, [RTX_16X32] = {
|
||||
[TX_CLASS_2D] = av1_default_scan_16x32,
|
||||
}, [RTX_32X16] = {
|
||||
[TX_CLASS_2D] = av1_default_scan_32x16,
|
||||
}, [RTX_32X64] = {
|
||||
[TX_CLASS_2D] = av1_default_scan_32x32,
|
||||
}, [RTX_64X32] = {
|
||||
[TX_CLASS_2D] = av1_default_scan_32x32,
|
||||
}, [RTX_4X16] = {
|
||||
[TX_CLASS_2D] = av1_default_scan_4x16,
|
||||
[TX_CLASS_V] = av1_mrow_scan_4x16,
|
||||
[TX_CLASS_H] = av1_mcol_scan_16x16,
|
||||
}, [RTX_16X4] = {
|
||||
[TX_CLASS_2D] = av1_default_scan_16x4,
|
||||
[TX_CLASS_V] = av1_mrow_scan_16x4,
|
||||
[TX_CLASS_H] = av1_mcol_scan_16x16,
|
||||
}, [RTX_8X32] = {
|
||||
[TX_CLASS_2D] = av1_default_scan_8x32,
|
||||
}, [RTX_32X8] = {
|
||||
[TX_CLASS_2D] = av1_default_scan_32x8,
|
||||
}, [RTX_16X64] = {
|
||||
[TX_CLASS_2D] = av1_default_scan_16x32,
|
||||
}, [RTX_64X16] = {
|
||||
[TX_CLASS_2D] = av1_default_scan_32x16,
|
||||
},
|
||||
const uint16_t *const dav1d_scans[N_RECT_TX_SIZES] = {
[ TX_4X4 ] = scan_4x4,
[ TX_8X8 ] = scan_8x8,
[ TX_16X16] = scan_16x16,
[ TX_32X32] = scan_32x32,
[ TX_64X64] = scan_32x32,
[RTX_4X8 ] = scan_4x8,
[RTX_8X4 ] = scan_8x4,
[RTX_8X16 ] = scan_8x16,
[RTX_16X8 ] = scan_16x8,
[RTX_16X32] = scan_16x32,
[RTX_32X16] = scan_32x16,
[RTX_32X64] = scan_32x32,
[RTX_64X32] = scan_32x32,
[RTX_4X16 ] = scan_4x16,
[RTX_16X4 ] = scan_16x4,
[RTX_8X32 ] = scan_8x32,
[RTX_32X8 ] = scan_32x8,
[RTX_16X64] = scan_16x32,
[RTX_64X16] = scan_32x16,
};

@ -32,6 +32,6 @@

#include "src/levels.h"

extern const uint16_t *const dav1d_scans[N_RECT_TX_SIZES][3];
extern const uint16_t *const dav1d_scans[N_RECT_TX_SIZES];

#endif /* DAV1D_SRC_SCAN_H */

@ -412,13 +412,11 @@ const int8_t dav1d_cdef_directions[2 + 8 + 2 /* dir */][2 /* pass */] = {
{ 0 * 12 + 1, -1 * 12 + 2 }, // 1
};

const int16_t dav1d_sgr_params[16][4] = { // r0, r1, e0, e1
{ 2, 1, 140, 3236 }, { 2, 1, 112, 2158 }, { 2, 1, 93, 1618 },
{ 2, 1, 80, 1438 }, { 2, 1, 70, 1295 }, { 2, 1, 58, 1177 },
{ 2, 1, 47, 1079 }, { 2, 1, 37, 996 }, { 2, 1, 30, 925 },
{ 2, 1, 25, 863 }, { 0, 1, -1, 2589 }, { 0, 1, -1, 1618 },
{ 0, 1, -1, 1177 }, { 0, 1, -1, 925 }, { 2, 0, 56, -1 },
{ 2, 0, 22, -1 },
const uint16_t ALIGN(dav1d_sgr_params[16][2], 4) = {
{ 140, 3236 }, { 112, 2158 }, { 93, 1618 }, { 80, 1438 },
{ 70, 1295 }, { 58, 1177 }, { 47, 1079 }, { 37, 996 },
{ 30, 925 }, { 25, 863 }, { 0, 2589 }, { 0, 1618 },
{ 0, 1177 }, { 0, 925 }, { 56, 0 }, { 22, 0 },
};

const uint8_t ALIGN(dav1d_sgr_x_by_x[256], 16) = {

@ -107,7 +107,7 @@ extern const Dav1dWarpedMotionParams dav1d_default_wm_params;

extern const int8_t dav1d_cdef_directions[12][2];

extern const int16_t dav1d_sgr_params[16][4];
extern const uint16_t dav1d_sgr_params[16][2];
extern const uint8_t dav1d_sgr_x_by_x[256];

extern const int8_t dav1d_mc_subpel_filters[5+ARCH_X86_64][15][8];

@ -169,6 +169,14 @@ static inline void dav1d_set_thread_name(const char *const name) {
pthread_setname_np(pthread_self(), "%s", (void*)name);
}

#elif defined(__HAIKU__)

#include <os/kernel/OS.h>

static inline void dav1d_set_thread_name(const char *const name) {
rename_thread(find_thread(NULL), name);
}

#else

#define dav1d_set_thread_name(name) do {} while (0)

@ -29,6 +29,140 @@
|
|||
|
||||
#include "src/thread_task.h"
|
||||
|
||||
int dav1d_task_create_filter_sbrow(Dav1dFrameContext *const f) {
|
||||
struct PostFilterThreadData *const pftd = f->lf.thread.pftd;
|
||||
const int frame_idx = (int)(f - f->c->fc);
|
||||
|
||||
const int has_deblock = f->frame_hdr->loopfilter.level_y[0] ||
|
||||
f->frame_hdr->loopfilter.level_y[1] ||
|
||||
f->lf.restore_planes;
|
||||
const int has_cdef = f->seq_hdr->cdef;
|
||||
const int has_resize = f->frame_hdr->width[0] != f->frame_hdr->width[1];
|
||||
const int has_lr = !!f->lf.restore_planes;
|
||||
f->lf.thread.npf = has_deblock + has_cdef + has_resize + has_lr;
|
||||
if (f->lf.thread.npf == 0) return 0;
|
||||
|
||||
pthread_mutex_lock(&pftd->lock);
|
||||
|
||||
Dav1dTask *tasks = f->lf.thread.tasks;
|
||||
int num_tasks = f->sbh * f->lf.thread.npf;
|
||||
if (num_tasks > f->lf.thread.num_tasks) {
|
||||
const size_t size = sizeof(Dav1dTask) * num_tasks;
|
||||
tasks = realloc(f->lf.thread.tasks, size);
|
||||
if (!tasks) {
|
||||
pthread_mutex_unlock(&pftd->lock);
|
||||
return -1;
|
||||
}
|
||||
memset(tasks, 0, size);
|
||||
f->lf.thread.tasks = tasks;
|
||||
f->lf.thread.num_tasks = num_tasks;
|
||||
}
|
||||
|
||||
#define create_task(task, ready_cond, start_cond) \
|
||||
do { \
|
||||
t = &tasks[num_tasks++]; \
|
||||
t->status = ready_cond ? DAV1D_TASK_READY : DAV1D_TASK_DEFAULT; \
|
||||
t->start = start_cond; \
|
||||
t->frame_id = frame_cnt; \
|
||||
t->frame_idx = frame_idx; \
|
||||
t->sby = sby; \
|
||||
t->fn = f->bd_fn.filter_sbrow_##task; \
|
||||
t->last_deps[0] = NULL; \
|
||||
t->last_deps[1] = NULL; \
|
||||
t->next_deps[0] = NULL; \
|
||||
t->next_deps[1] = NULL; \
|
||||
t->next_exec = NULL; \
|
||||
} while (0)
|
||||
|
||||
Dav1dTask *last_sbrow_deblock = NULL;
|
||||
Dav1dTask *last_sbrow_cdef = NULL;
|
||||
Dav1dTask *last_sbrow_resize = NULL;
|
||||
Dav1dTask *last_sbrow_lr = NULL;
|
||||
num_tasks = 0;
|
||||
const int frame_cnt = pftd->frame_cnt++;
|
||||
|
||||
for (int sby = 0; sby < f->sbh; ++sby) {
|
||||
Dav1dTask *t;
|
||||
Dav1dTask *last = NULL;
|
||||
if (has_deblock) {
|
||||
create_task(deblock, sby == 0, 0);
|
||||
if (sby) {
|
||||
t->last_deps[1] = last_sbrow_deblock;
|
||||
last_sbrow_deblock->next_deps[1] = t;
|
||||
}
|
||||
last = t;
|
||||
last_sbrow_deblock = t;
|
||||
}
|
||||
if (has_cdef) {
|
||||
create_task(cdef, sby == 0 && !has_deblock, has_deblock);
|
||||
if (has_deblock) {
|
||||
t->last_deps[0] = last;
|
||||
last->next_deps[0] = t;
|
||||
}
|
||||
if (sby) {
|
||||
t->last_deps[1] = last_sbrow_cdef;
|
||||
last_sbrow_cdef->next_deps[1] = t;
|
||||
}
|
||||
last = t;
|
||||
last_sbrow_cdef = t;
|
||||
};
|
||||
if (has_resize) {
|
||||
create_task(resize, sby == 0 && !last, !!last);
|
||||
if (last) {
|
||||
t->last_deps[0] = last;
|
||||
last->next_deps[0] = t;
|
||||
}
|
||||
if (sby) {
|
||||
t->last_deps[1] = last_sbrow_resize;
|
||||
last_sbrow_resize->next_deps[1] = t;
|
||||
}
|
||||
last = t;
|
||||
last_sbrow_resize = t;
|
||||
}
|
||||
if (has_lr) {
|
||||
create_task(lr, sby == 0 && !last, !!last);
|
||||
if (last) {
|
||||
t->last_deps[0] = last;
|
||||
last->next_deps[0] = t;
|
||||
}
|
||||
if (sby) {
|
||||
t->last_deps[1] = last_sbrow_lr;
|
||||
last_sbrow_lr->next_deps[1] = t;
|
||||
}
|
||||
last_sbrow_lr = t;
|
||||
}
|
||||
}
|
||||
f->lf.thread.done = 0;
|
||||
pthread_mutex_unlock(&pftd->lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void dav1d_task_schedule(struct PostFilterThreadData *const pftd,
Dav1dTask *const t)
{
Dav1dTask **pt = &pftd->tasks;
while (*pt &&
((*pt)->sby < t->sby ||
((*pt)->sby == t->sby && (*pt)->frame_id <= t->frame_id)))
pt = &(*pt)->next_exec;
t->next_exec = *pt;
*pt = t;
pthread_cond_signal(&pftd->cond);
}

static inline void update_task(Dav1dTask *const t, const int dep_type,
Dav1dFrameContext *const f)
{
if (!t->last_deps[!dep_type] ||
t->last_deps[!dep_type]->status == DAV1D_TASK_DONE)
{
t->status = DAV1D_TASK_READY;
if (t->start)
dav1d_task_schedule(f->lf.thread.pftd, t);
}
}

void *dav1d_frame_task(void *const data) {
|
||||
Dav1dFrameContext *const f = data;
|
||||
|
||||
|
@ -140,3 +274,98 @@ void *dav1d_tile_task(void *const data) {
|
|||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline int handle_abortion(Dav1dPostFilterContext *const pf,
|
||||
Dav1dContext *const c,
|
||||
struct PostFilterThreadData *const pftd)
|
||||
{
|
||||
const int flush = atomic_load_explicit(c->flush, memory_order_acquire);
|
||||
if (flush) {
|
||||
pthread_mutex_lock(&pf->td.lock);
|
||||
pf->flushed = 0;
|
||||
pthread_mutex_unlock(&pf->td.lock);
|
||||
}
|
||||
for (unsigned i = 0; i < c->n_fc; i++) {
|
||||
Dav1dFrameContext *const f = &c->fc[i];
|
||||
int send_signal;
|
||||
if (flush) // TODO before merge, see if this can be safely merged
|
||||
send_signal = f->lf.thread.done != 1 && f->lf.thread.num_tasks != 0;
|
||||
else
|
||||
send_signal = f->lf.thread.done == -1;
|
||||
for (int j = 0; send_signal && j < f->lf.thread.num_tasks; j++) {
|
||||
Dav1dTask *const t = &f->lf.thread.tasks[j];
|
||||
if (t->status == DAV1D_TASK_RUNNING ||
|
||||
(t->status == DAV1D_TASK_DONE && t->start != -1))
|
||||
send_signal = 0;
|
||||
}
|
||||
if (send_signal) {
|
||||
if (!flush) {
|
||||
Dav1dTask **pt = &pftd->tasks;
|
||||
while (*pt) {
|
||||
if ((*pt)->frame_idx == i)
|
||||
*pt = (*pt)->next_exec;
|
||||
else
|
||||
pt = &(*pt)->next_exec;
|
||||
}
|
||||
}
|
||||
f->lf.thread.done = 1;
|
||||
pthread_cond_signal(&f->lf.thread.cond);
|
||||
}
|
||||
}
|
||||
if (flush) {
|
||||
pthread_mutex_lock(&pf->td.lock);
|
||||
pf->flushed = 1;
|
||||
pthread_cond_signal(&pf->td.cond);
|
||||
pthread_mutex_unlock(&pf->td.lock);
|
||||
}
|
||||
return !flush;
|
||||
}
|
||||
|
||||
void *dav1d_postfilter_task(void *data) {
|
||||
Dav1dPostFilterContext *const pf = data;
|
||||
Dav1dContext *const c = pf->c;
|
||||
struct PostFilterThreadData *pftd = &c->postfilter_thread;
|
||||
|
||||
dav1d_set_thread_name("dav1d-postfilter");
|
||||
|
||||
int exec = 1;
|
||||
pthread_mutex_lock(&pftd->lock);
|
||||
for (;;) {
|
||||
if (!exec && !pf->die)
|
||||
pthread_cond_wait(&pftd->cond, &pftd->lock);
|
||||
if (!(exec = handle_abortion(pf, c, pftd))) continue;
|
||||
if (pf->die) break;
|
||||
|
||||
Dav1dTask *const t = pftd->tasks;
|
||||
if (!t) { exec = 0; continue; }
|
||||
pftd->tasks = t->next_exec;
|
||||
t->status = DAV1D_TASK_RUNNING;
|
||||
|
||||
pthread_mutex_unlock(&pftd->lock);
|
||||
Dav1dFrameContext *const f = &c->fc[t->frame_idx];
|
||||
t->fn(f, t->sby);
|
||||
exec = 1;
|
||||
pthread_mutex_lock(&pftd->lock);
|
||||
|
||||
if (t->next_deps[0])
|
||||
update_task(t->next_deps[0], 0, f);
|
||||
if (t->next_deps[1])
|
||||
update_task(t->next_deps[1], 1, f);
|
||||
t->status = DAV1D_TASK_DONE;
|
||||
if (!t->next_deps[0]) {
|
||||
const enum PlaneType progress_plane_type =
|
||||
c->n_fc > 1 && f->frame_hdr->refresh_context ?
|
||||
PLANE_TYPE_Y : PLANE_TYPE_ALL;
|
||||
const int y = (t->sby + 1) * f->sb_step * 4;
|
||||
dav1d_thread_picture_signal(&f->sr_cur, y, progress_plane_type);
|
||||
if (t->sby + 1 == f->sbh) {
|
||||
f->lf.thread.done = 1;
|
||||
pthread_cond_signal(&f->lf.thread.cond);
|
||||
}
|
||||
}
|
||||
t->start = -1;
|
||||
}
|
||||
pthread_mutex_unlock(&pftd->lock);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
|
|
@ -35,10 +35,33 @@
#define FRAME_ERROR (UINT_MAX - 1)
#define TILE_ERROR (INT_MAX - 1)

int dav1d_decode_frame(Dav1dFrameContext *f);
void *dav1d_frame_task(void *data);
enum TaskStatus {
DAV1D_TASK_DEFAULT,
DAV1D_TASK_READY,
DAV1D_TASK_RUNNING,
DAV1D_TASK_DONE,
};

int dav1d_decode_tile_sbrow(Dav1dTileContext *t);
struct Dav1dTask {
enum TaskStatus status; // task status
int start; // frame thread start flag
unsigned frame_idx; // frame thread id
int frame_id; // frame ordering
int sby; // sbrow
filter_sbrow_fn fn; // task work
Dav1dTask *last_deps[2]; // dependencies
Dav1dTask *next_deps[2]; // dependant tasks
Dav1dTask *next_exec; // tasks scheduling
};

int dav1d_task_create_filter_sbrow(Dav1dFrameContext *f);
void dav1d_task_schedule(struct PostFilterThreadData *pftd, Dav1dTask *t);

void *dav1d_frame_task(void *data);
void *dav1d_tile_task(void *data);
void *dav1d_postfilter_task(void *data);

int dav1d_decode_frame(Dav1dFrameContext *f);
int dav1d_decode_tile_sbrow(Dav1dTileContext *t);

#endif /* DAV1D_SRC_THREAD_TASK_H */

@ -45,41 +45,41 @@ enum WedgeDirectionType {
|
|||
};
|
||||
|
||||
typedef struct {
|
||||
enum WedgeDirectionType direction;
|
||||
int x_offset;
|
||||
int y_offset;
|
||||
uint8_t /* enum WedgeDirectionType */ direction;
|
||||
uint8_t x_offset;
|
||||
uint8_t y_offset;
|
||||
} wedge_code_type;
|
||||
|
||||
static const wedge_code_type wedge_codebook_16_hgtw[16] = {
|
||||
{ WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
|
||||
{ WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
|
||||
{ WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
|
||||
{ WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 4 },
|
||||
{ WEDGE_HORIZONTAL, 4, 6 }, { WEDGE_VERTICAL, 4, 4 },
|
||||
{ WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
|
||||
{ WEDGE_HORIZONTAL, 4, 6 }, { WEDGE_VERTICAL, 4, 4 },
|
||||
{ WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
|
||||
{ WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
|
||||
{ WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
|
||||
{ WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
|
||||
{ WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
|
||||
};
|
||||
|
||||
static const wedge_code_type wedge_codebook_16_hltw[16] = {
|
||||
{ WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
|
||||
{ WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
|
||||
{ WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
|
||||
{ WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 4, 4 },
|
||||
{ WEDGE_VERTICAL, 6, 4 }, { WEDGE_HORIZONTAL, 4, 4 },
|
||||
{ WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
|
||||
{ WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 4, 4 },
|
||||
{ WEDGE_VERTICAL, 6, 4 }, { WEDGE_HORIZONTAL, 4, 4 },
|
||||
{ WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
|
||||
{ WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
|
||||
{ WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
|
||||
{ WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
|
||||
{ WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
|
||||
};
|
||||
|
||||
static const wedge_code_type wedge_codebook_16_heqw[16] = {
|
||||
{ WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
|
||||
{ WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
|
||||
{ WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
|
||||
{ WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 6 },
|
||||
{ WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 6, 4 },
|
||||
{ WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
|
||||
{ WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 6, 4 },
|
||||
{ WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
|
||||
{ WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
|
||||
{ WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
|
||||
{ WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
|
||||
{ WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
|
||||
};
|
||||
|
||||
|
|
|
@ -0,0 +1,65 @@
|
|||
; Copyright (c) 2017-2021, The rav1e contributors
|
||||
; Copyright (c) 2021, Nathan Egge
|
||||
; All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions are met:
|
||||
;
|
||||
; 1. Redistributions of source code must retain the above copyright notice, this
|
||||
; list of conditions and the following disclaimer.
|
||||
;
|
||||
; 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
; this list of conditions and the following disclaimer in the documentation
|
||||
; and/or other materials provided with the distribution.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
%include "config.asm"
|
||||
%include "ext/x86/x86inc.asm"
|
||||
|
||||
%if ARCH_X86_64
|
||||
|
||||
SECTION .text
|
||||
|
||||
cextern cdef_dir_8bpc_avx2
|
||||
|
||||
INIT_YMM avx2
|
||||
cglobal cdef_dir_16bpc, 4, 4, 3, 32 + 8*8, src, ss, var, bdmax
|
||||
popcnt bdmaxd, bdmaxd
|
||||
movzx bdmaxq, bdmaxw
|
||||
sub bdmaxq, 8
|
||||
movq xm2, bdmaxq
|
||||
DEFINE_ARGS src, ss, var, ss3
|
||||
lea ss3q, [ssq*3]
|
||||
mova xm0, [srcq + ssq*0]
|
||||
mova xm1, [srcq + ssq*1]
|
||||
vinserti128 m0, [srcq + ssq*2], 1
|
||||
vinserti128 m1, [srcq + ss3q], 1
|
||||
psraw m0, xm2
|
||||
psraw m1, xm2
|
||||
vpackuswb m0, m1
|
||||
mova [rsp + 32 + 0*8], m0
|
||||
lea srcq, [srcq + ssq*4]
|
||||
mova xm0, [srcq + ssq*0]
|
||||
mova xm1, [srcq + ssq*1]
|
||||
vinserti128 m0, [srcq + ssq*2], 1
|
||||
vinserti128 m1, [srcq + ss3q], 1
|
||||
psraw m0, xm2
|
||||
psraw m1, xm2
|
||||
vpackuswb m0, m1
|
||||
mova [rsp + 32 + 4*8], m0
|
||||
lea srcq, [rsp + 32] ; WIN64 shadow space
|
||||
mov ssq, 8
|
||||
call mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX)
|
||||
RET
|
||||
|
||||
%endif ; ARCH_X86_64
|
|
@ -0,0 +1,93 @@
|
|||
; Copyright (c) 2017-2021, The rav1e contributors
|
||||
; Copyright (c) 2021, Nathan Egge
|
||||
; All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions are met:
|
||||
;
|
||||
; 1. Redistributions of source code must retain the above copyright notice, this
|
||||
; list of conditions and the following disclaimer.
|
||||
;
|
||||
; 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
; this list of conditions and the following disclaimer in the documentation
|
||||
; and/or other materials provided with the distribution.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
%include "config.asm"
|
||||
%include "ext/x86/x86inc.asm"
|
||||
|
||||
%ifn ARCH_X86_64
|
||||
SECTION_RODATA 16
|
||||
|
||||
pq_dir_shr: dq 2, 4
|
||||
%endif
|
||||
|
||||
SECTION .text
|
||||
|
||||
cextern cdef_dir_8bpc_ssse3
|
||||
|
||||
INIT_XMM ssse3
|
||||
cglobal cdef_dir_16bpc, 2, 4, 4, 32 + 8*8, src, ss, var, bdmax
|
||||
bsr bdmaxd, bdmaxm
|
||||
%if ARCH_X86_64
|
||||
movzx bdmaxq, bdmaxw
|
||||
sub bdmaxq, 7
|
||||
movq m4, bdmaxq
|
||||
%else
|
||||
push r4
|
||||
sub bdmaxd, 9
|
||||
LEA r4, pq_dir_shr
|
||||
movq m4, [r4 + bdmaxd*4]
|
||||
pop r4
|
||||
%endif
|
||||
DEFINE_ARGS src, ss, var, ss3
|
||||
lea ss3q, [ssq*3]
|
||||
mova m0, [srcq + ssq*0]
|
||||
mova m1, [srcq + ssq*1]
|
||||
mova m2, [srcq + ssq*2]
|
||||
mova m3, [srcq + ss3q]
|
||||
psraw m0, m4
|
||||
psraw m1, m4
|
||||
psraw m2, m4
|
||||
psraw m3, m4
|
||||
packuswb m0, m1
|
||||
packuswb m2, m3
|
||||
mova [rsp + 32 + 0*8], m0
|
||||
mova [rsp + 32 + 2*8], m2
|
||||
lea srcq, [srcq + ssq*4]
|
||||
mova m0, [srcq + ssq*0]
|
||||
mova m1, [srcq + ssq*1]
|
||||
mova m2, [srcq + ssq*2]
|
||||
mova m3, [srcq + ss3q]
|
||||
psraw m0, m4
|
||||
psraw m1, m4
|
||||
psraw m2, m4
|
||||
psraw m3, m4
|
||||
packuswb m0, m1
|
||||
packuswb m2, m3
|
||||
mova [rsp + 32 + 4*8], m0
|
||||
mova [rsp + 32 + 6*8], m2
|
||||
lea srcq, [rsp + 32] ; WIN64 shadow space
|
||||
mov ssq, 8
|
||||
%if ARCH_X86_64
|
||||
call mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX)
|
||||
%else
|
||||
movifnidn vard, varm
|
||||
push eax ; align stack
|
||||
push vard
|
||||
push ssd
|
||||
push srcd
|
||||
call mangle(private_prefix %+ _cdef_dir_8bpc)
|
||||
add esp, 0x10
|
||||
%endif
|
||||
RET
|
|
@ -39,7 +39,7 @@
|
|||
%endmacro
|
||||
|
||||
%macro CDEF_FILTER_JMP_TABLE 1
|
||||
JMP_TABLE cdef_filter_%1, \
|
||||
JMP_TABLE cdef_filter_%1_8bpc, \
|
||||
d6k0, d6k1, d7k0, d7k1, \
|
||||
d0k0, d0k1, d1k0, d1k1, d2k0, d2k1, d3k0, d3k1, \
|
||||
d4k0, d4k1, d5k0, d5k1, d6k0, d6k1, d7k0, d7k1, \
|
||||
|
@ -94,7 +94,7 @@ SECTION .text
|
|||
%macro PREP_REGS 2 ; w, h
|
||||
; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
|
||||
mov dird, r6m
|
||||
lea tableq, [cdef_filter_%1x%2_jmptable]
|
||||
lea tableq, [cdef_filter_%1x%2_8bpc_jmptable]
|
||||
lea dirq, [tableq+dirq*2*4]
|
||||
%if %1 == 4
|
||||
%if %2 == 4
|
||||
|
@ -397,7 +397,7 @@ SECTION .text
|
|||
|
||||
%macro CDEF_FILTER 2 ; w, h
|
||||
INIT_YMM avx2
|
||||
cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \
|
||||
cglobal cdef_filter_%1x%2_8bpc, 4, 9, 0, dst, stride, left, top, \
|
||||
pri, sec, dir, damping, edge
|
||||
%assign stack_offset_entry stack_offset
|
||||
mov edged, edgem
|
||||
|
@ -1592,7 +1592,7 @@ CDEF_FILTER 4, 8
|
|||
CDEF_FILTER 4, 4
|
||||
|
||||
INIT_YMM avx2
|
||||
cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
|
||||
cglobal cdef_dir_8bpc, 3, 4, 15, src, stride, var, stride3
|
||||
lea stride3q, [strideq*3]
|
||||
movq xm0, [srcq+strideq*0]
|
||||
movq xm1, [srcq+strideq*1]
|
||||
|
@ -1622,10 +1622,10 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
|
|||
psubw m3, m8
|
||||
|
||||
; shuffle registers to generate partial_sum_diag[0-1] together
|
||||
vpermq m7, m0, q1032
|
||||
vpermq m6, m1, q1032
|
||||
vpermq m5, m2, q1032
|
||||
vpermq m4, m3, q1032
|
||||
vperm2i128 m7, m0, m0, 0x01
|
||||
vperm2i128 m6, m1, m1, 0x01
|
||||
vperm2i128 m5, m2, m2, 0x01
|
||||
vperm2i128 m4, m3, m3, 0x01
|
||||
|
||||
; start with partial_sum_hv[0-1]
|
||||
paddw m8, m0, m1
|
||||
|
|
|
@ -109,7 +109,8 @@ DECLARE_REG_TMP 8, 5
|
|||
; 5e 5f 50 51 52 53 54 55
|
||||
|
||||
INIT_ZMM avx512icl
|
||||
cglobal cdef_filter_4x4, 4, 8, 13, dst, stride, left, top, pri, sec, dir, damping, edge
|
||||
cglobal cdef_filter_4x4_8bpc, 4, 8, 13, dst, stride, left, top, \
|
||||
pri, sec, dir, damping, edge
|
||||
%define base r7-edge_mask
|
||||
movq xmm0, [dstq+strideq*0]
|
||||
movhps xmm0, [dstq+strideq*1]
|
||||
|
@ -269,8 +270,7 @@ DECLARE_REG_TMP 2, 7
|
|||
; L8 L9 40 41 42 43 44 45 8e 8f 80 81 82 83 84 85
|
||||
; La Lb 50 51 52 53 54 55 9e 9f 90 91 92 93 94 95
|
||||
|
||||
cglobal cdef_filter_4x8, 4, 9, 22, dst, stride, left, top, \
|
||||
pri, sec, dir, damping, edge
|
||||
cglobal cdef_filter_4x8_8bpc, 4, 9, 22, dst, stride, left, top, pri, sec, dir, damping, edge
|
||||
%define base r8-edge_mask
|
||||
vpbroadcastd ym21, strided
|
||||
mov r6d, edgem
|
||||
|
@ -504,8 +504,8 @@ ALIGN function_align
|
|||
; 8e 8f 80 81 82 83 84 85 84 85 86 87 88 89 8a 8b
|
||||
; 9e 9f 90 91 92 93 94 95 94 95 96 97 98 99 9a 9b
|
||||
|
||||
cglobal cdef_filter_8x8, 4, 11, 32, 4*64, dst, stride, left, top, \
|
||||
pri, sec, dir, damping, edge
|
||||
cglobal cdef_filter_8x8_8bpc, 4, 11, 32, 4*64, dst, stride, left, top, \
|
||||
pri, sec, dir, damping, edge
|
||||
%define base r8-edge_mask
|
||||
mov r6d, edgem
|
||||
lea r10, [dstq+strideq*4-2]
|
||||
|
|
|
@ -28,20 +28,23 @@
|
|||
#include "src/cpu.h"
|
||||
#include "src/cdef.h"
|
||||
|
||||
#define decl_cdef_size_fn(sz) \
|
||||
decl_cdef_fn(dav1d_cdef_filter_##sz##_avx512icl); \
|
||||
decl_cdef_fn(dav1d_cdef_filter_##sz##_avx2); \
|
||||
decl_cdef_fn(dav1d_cdef_filter_##sz##_sse4); \
|
||||
decl_cdef_fn(dav1d_cdef_filter_##sz##_ssse3); \
|
||||
decl_cdef_fn(dav1d_cdef_filter_##sz##_sse2)
|
||||
#define decl_cdef_fns(ext) \
|
||||
decl_cdef_fn(BF(dav1d_cdef_filter_4x4, ext)); \
|
||||
decl_cdef_fn(BF(dav1d_cdef_filter_4x8, ext)); \
|
||||
decl_cdef_fn(BF(dav1d_cdef_filter_8x8, ext))
|
||||
|
||||
decl_cdef_size_fn(4x4);
|
||||
decl_cdef_size_fn(4x8);
|
||||
decl_cdef_size_fn(8x8);
|
||||
#if BITDEPTH == 8
|
||||
decl_cdef_fns(avx512icl);
|
||||
decl_cdef_fns(avx2);
|
||||
decl_cdef_fns(sse4);
|
||||
decl_cdef_fns(ssse3);
|
||||
decl_cdef_fns(sse2);
|
||||
|
||||
decl_cdef_dir_fn(dav1d_cdef_dir_avx2);
|
||||
decl_cdef_dir_fn(dav1d_cdef_dir_sse4);
|
||||
decl_cdef_dir_fn(dav1d_cdef_dir_ssse3);
|
||||
decl_cdef_dir_fn(BF(dav1d_cdef_dir, sse4));
|
||||
#endif
|
||||
|
||||
decl_cdef_dir_fn(BF(dav1d_cdef_dir, avx2));
|
||||
decl_cdef_dir_fn(BF(dav1d_cdef_dir, ssse3));
|
||||
|
||||
COLD void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
|
||||
const unsigned flags = dav1d_get_cpu_flags();
|
||||
|
@ -49,45 +52,47 @@ COLD void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
|
|||
if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
|
||||
|
||||
#if BITDEPTH == 8
|
||||
c->fb[0] = dav1d_cdef_filter_8x8_sse2;
|
||||
c->fb[1] = dav1d_cdef_filter_4x8_sse2;
|
||||
c->fb[2] = dav1d_cdef_filter_4x4_sse2;
|
||||
c->fb[0] = BF(dav1d_cdef_filter_8x8, sse2);
|
||||
c->fb[1] = BF(dav1d_cdef_filter_4x8, sse2);
|
||||
c->fb[2] = BF(dav1d_cdef_filter_4x4, sse2);
|
||||
#endif
|
||||
|
||||
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
|
||||
|
||||
c->dir = BF(dav1d_cdef_dir, ssse3);
|
||||
|
||||
#if BITDEPTH == 8
|
||||
c->dir = dav1d_cdef_dir_ssse3;
|
||||
c->fb[0] = dav1d_cdef_filter_8x8_ssse3;
|
||||
c->fb[1] = dav1d_cdef_filter_4x8_ssse3;
|
||||
c->fb[2] = dav1d_cdef_filter_4x4_ssse3;
|
||||
c->fb[0] = BF(dav1d_cdef_filter_8x8, ssse3);
|
||||
c->fb[1] = BF(dav1d_cdef_filter_4x8, ssse3);
|
||||
c->fb[2] = BF(dav1d_cdef_filter_4x4, ssse3);
|
||||
#endif
|
||||
|
||||
if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
|
||||
|
||||
#if BITDEPTH == 8
|
||||
c->dir = dav1d_cdef_dir_sse4;
|
||||
c->fb[0] = dav1d_cdef_filter_8x8_sse4;
|
||||
c->fb[1] = dav1d_cdef_filter_4x8_sse4;
|
||||
c->fb[2] = dav1d_cdef_filter_4x4_sse4;
|
||||
c->dir = BF(dav1d_cdef_dir, sse4);
|
||||
c->fb[0] = BF(dav1d_cdef_filter_8x8, sse4);
|
||||
c->fb[1] = BF(dav1d_cdef_filter_4x8, sse4);
|
||||
c->fb[2] = BF(dav1d_cdef_filter_4x4, sse4);
|
||||
#endif
|
||||
|
||||
#if ARCH_X86_64
|
||||
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
|
||||
|
||||
c->dir = BF(dav1d_cdef_dir, avx2);
|
||||
|
||||
#if BITDEPTH == 8
|
||||
c->dir = dav1d_cdef_dir_avx2;
|
||||
c->fb[0] = dav1d_cdef_filter_8x8_avx2;
|
||||
c->fb[1] = dav1d_cdef_filter_4x8_avx2;
|
||||
c->fb[2] = dav1d_cdef_filter_4x4_avx2;
|
||||
c->fb[0] = BF(dav1d_cdef_filter_8x8, avx2);
|
||||
c->fb[1] = BF(dav1d_cdef_filter_4x8, avx2);
|
||||
c->fb[2] = BF(dav1d_cdef_filter_4x4, avx2);
|
||||
#endif
|
||||
|
||||
if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
|
||||
|
||||
#if HAVE_AVX512ICL && BITDEPTH == 8
|
||||
c->fb[0] = dav1d_cdef_filter_8x8_avx512icl;
|
||||
c->fb[1] = dav1d_cdef_filter_4x8_avx512icl;
|
||||
c->fb[2] = dav1d_cdef_filter_4x4_avx512icl;
|
||||
c->fb[0] = BF(dav1d_cdef_filter_8x8, avx512icl);
|
||||
c->fb[1] = BF(dav1d_cdef_filter_4x8, avx512icl);
|
||||
c->fb[2] = BF(dav1d_cdef_filter_4x4, avx512icl);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
|
@ -249,13 +249,13 @@ SECTION .text
|
|||
|
||||
%macro CDEF_FILTER 2 ; w, h
|
||||
%if ARCH_X86_64
|
||||
cglobal cdef_filter_%1x%2, 4, 9, 16, 3 * 16 + (%2+4)*32, \
|
||||
dst, stride, left, top, pri, sec, edge, stride3, dst4
|
||||
cglobal cdef_filter_%1x%2_8bpc, 4, 9, 16, 3 * 16 + (%2+4)*32, \
|
||||
dst, stride, left, top, pri, sec, edge, stride3, dst4
|
||||
%define px rsp+3*16+2*32
|
||||
%define base 0
|
||||
%else
|
||||
cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*32, \
|
||||
dst, stride, left, edge, stride3
|
||||
cglobal cdef_filter_%1x%2_8bpc, 2, 7, 8, - 7 * 16 - (%2+4)*32, \
|
||||
dst, stride, left, edge, stride3
|
||||
%define topq r2
|
||||
%define dst4q r2
|
||||
LEA r5, tap_table
|
||||
|
@ -758,7 +758,7 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*32, \
|
|||
|
||||
%macro CDEF_DIR 0
|
||||
%if ARCH_X86_64
|
||||
cglobal cdef_dir, 3, 5, 16, 32, src, stride, var, stride3
|
||||
cglobal cdef_dir_8bpc, 3, 5, 16, 32, src, stride, var, stride3
|
||||
lea stride3q, [strideq*3]
|
||||
movq m1, [srcq+strideq*0]
|
||||
movhps m1, [srcq+strideq*1]
|
||||
|
@ -1030,7 +1030,7 @@ cglobal cdef_dir, 3, 5, 16, 32, src, stride, var, stride3
|
|||
shr r1d, 10
|
||||
mov [varq], r1d
|
||||
%else
|
||||
cglobal cdef_dir, 2, 4, 8, 96, src, stride, var, stride3
|
||||
cglobal cdef_dir_8bpc, 2, 4, 8, 96, src, stride, var, stride3
|
||||
%define base r2-shufw_6543210x
|
||||
LEA r2, shufw_6543210x
|
||||
pxor m0, m0
|
||||
|
|
|
@ -1170,7 +1170,7 @@ ALIGN function_align
|
|||
mova m9, [base+ipred_v_shuf]
|
||||
vbroadcasti128 m6, [base+smooth_weights+16*2]
|
||||
vbroadcasti128 m7, [base+smooth_weights+16*3]
|
||||
vpermq m8, m9, q1032
|
||||
vperm2i128 m8, m9, m9, 0x01
|
||||
paddw m0, m10, m3
|
||||
paddw m3, m11
|
||||
paddw m12, m0
|
||||
|
@ -4197,7 +4197,7 @@ ALIGN function_align
|
|||
pmaddubsw m%3, m5
|
||||
paddw m%1, m%3
|
||||
psraw m%1, 4
|
||||
vpermq m%3, m%1, q1032
|
||||
vperm2i128 m%3, m%1, m%1, 0x01
|
||||
packuswb m%1, m%3
|
||||
%endmacro
|
||||
|
||||
|
|
(file diff not shown because of its size)
|
@ -0,0 +1,480 @@
|
|||
; Copyright (c) 2017-2021, The rav1e contributors
|
||||
; Copyright (c) 2021, Nathan Egge
|
||||
; All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions are met:
|
||||
;
|
||||
; 1. Redistributions of source code must retain the above copyright notice, this
|
||||
; list of conditions and the following disclaimer.
|
||||
;
|
||||
; 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
; this list of conditions and the following disclaimer in the documentation
|
||||
; and/or other materials provided with the distribution.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
%include "config.asm"
|
||||
%include "ext/x86/x86inc.asm"
|
||||
|
||||
%if ARCH_X86_64
|
||||
|
||||
SECTION_RODATA 32
|
||||
|
||||
wiener5_shufB: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
|
||||
wiener5_shufC: db 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11, 14, 15, 12, 13
|
||||
wiener5_shufD: db 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1, 10, 11, -1, -1
|
||||
wiener5_l_shuf: db 4, 5, 4, 5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
||||
pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
||||
db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
||||
|
||||
wiener7_shufC: db 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9
|
||||
wiener7_shufD: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
|
||||
wiener7_shufE: db 8, 9, -1, -1, 10, 11, -1, -1, 12, 13, -1, -1, 14, 15, -1, -1
|
||||
rev_w: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
|
||||
rev_d: db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
|
||||
wiener7_l_shuf: db 6, 7, 6, 7, 6, 7, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
||||
db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
||||
|
||||
pq_3: dq (6 - 4) + 1
|
||||
pq_5: dq (6 - 2) + 1
|
||||
pd_65540: dd (1 << (8 + (6 - 4) + 6)) + (1 << (6 - 4))
|
||||
pd_262160: dd (1 << (8 + (6 - 2) + 6)) + (1 << (6 - 2))
|
||||
|
||||
pq_11: dq 12 - (6 - 4) + 1
|
||||
pq_9: dq 12 - (6 - 2) + 1
|
||||
nd_1047552: dd (1 << (12 - (6 - 4))) - (1 << (12 + 8))
|
||||
nd_1048320: dd (1 << (12 - (6 - 2))) - (1 << (12 + 8))
|
||||
|
||||
pb_wiener5_l: times 2 db 2, 3
|
||||
pb_wiener5_r: times 2 db -6, -5
|
||||
|
||||
pb_wiener7_l: times 2 db 4, 5
|
||||
pb_wiener7_m: times 2 db -4, -3
|
||||
pb_wiener7_r: times 2 db -8, -7
|
||||
|
||||
SECTION .text
|
||||
|
||||
INIT_YMM avx2
|
||||
cglobal wiener_filter5_h_16bpc, 6, 9, 14, dst, left, src, ss, f, w, h, edge, bdmax
|
||||
movifnidn wd, wm
|
||||
movifnidn hd, hm
|
||||
movifnidn edgeb, edgem
|
||||
vbroadcasti128 m6, [wiener5_shufB]
|
||||
vpbroadcastd m12, [fq + 2]
|
||||
vbroadcasti128 m7, [wiener5_shufC]
|
||||
vpbroadcastw m13, [fq + 6]
|
||||
vbroadcasti128 m8, [wiener5_shufD]
|
||||
popcnt bdmaxd, bdmaxm
|
||||
vpbroadcastd m9, [pd_65540]
|
||||
movq xm10, [pq_3]
|
||||
cmp bdmaxd, 10
|
||||
je .bits10
|
||||
vpbroadcastd m9, [pd_262160]
|
||||
movq xm10, [pq_5]
|
||||
.bits10:
|
||||
pxor m11, m11
|
||||
add wq, wq
|
||||
add srcq, wq
|
||||
add dstq, wq
|
||||
neg wq
|
||||
DEFINE_ARGS dst, left, src, ss, f, w, h, edge, x
|
||||
.v_loop:
|
||||
mov xq, wq
|
||||
test edgeb, 1 ; LR_HAVE_LEFT
|
||||
jz .h_extend_left
|
||||
test leftq, leftq
|
||||
jz .h_loop
|
||||
movd xm4, [leftq + 4]
|
||||
vpblendd m4, [srcq + xq - 4], 0xfe
|
||||
add leftq, 8
|
||||
jmp .h_main
|
||||
.h_extend_left:
|
||||
vbroadcasti128 m5, [srcq + xq]
|
||||
mova m4, [srcq + xq]
|
||||
palignr m4, m5, 12
|
||||
pshufb m4, [wiener5_l_shuf]
|
||||
jmp .h_main
|
||||
.h_loop:
|
||||
movu m4, [srcq + xq - 4]
|
||||
.h_main:
|
||||
movu m5, [srcq + xq + 4]
|
||||
test edgeb, 2 ; LR_HAVE_RIGHT
|
||||
jnz .h_have_right
|
||||
cmp xd, -36
|
||||
jl .h_have_right
|
||||
movd xm2, xd
|
||||
vpbroadcastd m0, [pb_wiener5_l]
|
||||
vpbroadcastd m1, [pb_wiener5_r]
|
||||
vpbroadcastb m2, xm2
|
||||
movu m3, [pb_0to31]
|
||||
psubb m0, m2
|
||||
psubb m1, m2
|
||||
pminub m0, m3
|
||||
pminub m1, m3
|
||||
pshufb m4, m0
|
||||
pshufb m5, m1
|
||||
.h_have_right:
|
||||
pshufb m0, m4, m6
|
||||
pshufb m2, m4, m7
|
||||
paddw m0, m2
|
||||
pmaddwd m0, m12
|
||||
pshufb m1, m5, m6
|
||||
pshufb m3, m5, m7
|
||||
paddw m1, m3
|
||||
pmaddwd m1, m12
|
||||
pshufb m4, m8
|
||||
pmaddwd m4, m13
|
||||
pshufb m5, m8
|
||||
pmaddwd m5, m13
|
||||
paddd m0, m4
|
||||
paddd m1, m5
|
||||
paddd m0, m9
|
||||
paddd m1, m9
|
||||
psrad m0, xm10
|
||||
psrad m1, xm10
|
||||
packssdw m0, m1
|
||||
pmaxsw m0, m11
|
||||
mova [dstq + xq], m0
|
||||
add xq, 32
|
||||
jl .h_loop
|
||||
add srcq, ssq
|
||||
add dstq, 384*2
|
||||
dec hd
|
||||
jg .v_loop
|
||||
RET
|
||||
|
||||
DECLARE_REG_TMP 8, 9, 10, 11, 12, 13, 14
|
||||
|
||||
INIT_YMM avx2
|
||||
cglobal wiener_filter5_v_16bpc, 6, 13, 12, dst, ds, mid, f, w, h, edge, bdmax
|
||||
movifnidn wd, wm
|
||||
movifnidn hd, hm
|
||||
movifnidn edgeb, edgem
|
||||
pxor m6, m6
|
||||
vpbroadcastd m7, [fq + 2]
|
||||
vpbroadcastd m8, [fq + 6]
|
||||
popcnt bdmaxd, bdmaxm
|
||||
vpbroadcastd m9, [nd_1047552]
|
||||
movq xm10, [pq_11]
|
||||
cmp bdmaxd, 10
|
||||
je .bits10
|
||||
vpbroadcastd m9, [nd_1048320]
|
||||
movq xm10, [pq_9]
|
||||
.bits10:
|
||||
vpbroadcastw m11, bdmaxm
|
||||
add wq, wq
|
||||
add midq, wq
|
||||
add dstq, wq
|
||||
neg wq
|
||||
DEFINE_ARGS dst, ds, mid, ms, w, h, edge, x
|
||||
mov msq, 2*384
|
||||
mov t0, midq
|
||||
lea t1, [t0 + msq]
|
||||
lea t2, [t1 + msq]
|
||||
lea t3, [t2 + msq]
|
||||
lea t4, [t3 + msq]
|
||||
test edgeb, 4 ; LR_HAVE_TOP
|
||||
jnz .have_top
|
||||
mov t0, t2
|
||||
mov t1, t2
|
||||
.have_top:
|
||||
test edgeb, 8 ; LR_HAVE_BOTTOM
|
||||
jnz .v_loop
|
||||
cmp hd, 2
|
||||
jg .v_loop
|
||||
cmp hd, 1
|
||||
jne .limit_v
|
||||
mov t3, t2
|
||||
.limit_v:
|
||||
mov t4, t3
|
||||
.v_loop:
|
||||
mov xq, wq
|
||||
.h_loop:
|
||||
mova m1, [t0 + xq]
|
||||
mova m2, [t1 + xq]
|
||||
mova m3, [t2 + xq]
|
||||
mova m4, [t3 + xq]
|
||||
mova m5, [t4 + xq]
|
||||
punpcklwd m0, m1, m2
|
||||
pmaddwd m0, m7
|
||||
punpckhwd m1, m2
|
||||
pmaddwd m1, m7
|
||||
punpcklwd m2, m5, m4
|
||||
pmaddwd m2, m7
|
||||
punpckhwd m5, m4
|
||||
pmaddwd m5, m7
|
||||
paddd m0, m2
|
||||
paddd m1, m5
|
||||
punpcklwd m2, m3, m6
|
||||
pmaddwd m2, m8
|
||||
punpckhwd m3, m6
|
||||
pmaddwd m3, m8
|
||||
paddd m0, m2
|
||||
paddd m1, m3
|
||||
paddd m0, m9
|
||||
paddd m1, m9
|
||||
psrad m0, xm10
|
||||
psrad m1, xm10
|
||||
packusdw m0, m1
|
||||
pminuw m0, m11
|
||||
mova [dstq + xq], m0
|
||||
add xq, 32
|
||||
jl .h_loop
|
||||
add dstq, dsq
|
||||
mov t0, t1
|
||||
mov t1, t2
|
||||
mov t2, t3
|
||||
mov t3, t4
|
||||
add t4, msq
|
||||
test edgeb, 8 ; LR_HAVE_BOTTOM
|
||||
jnz .have_bottom
|
||||
cmp hd, 3
|
||||
jg .have_bottom
|
||||
mov t4, t3
|
||||
.have_bottom:
|
||||
dec hd
|
||||
jg .v_loop
|
||||
RET
|
||||
|
||||
INIT_YMM avx2
|
||||
cglobal wiener_filter7_h_16bpc, 6, 10, 16, dst, left, src, ss, f, w, h, edge, bdmax, rh
|
||||
movifnidn wd, wm
|
||||
movifnidn hd, hm
|
||||
movifnidn edgeb, edgem
|
||||
vpbroadcastd m7, [fq]
|
||||
vpbroadcastd m8, [fq + 4]
|
||||
vbroadcasti128 m10, [rev_w]
|
||||
vbroadcasti128 m11, [wiener5_shufB]
|
||||
vbroadcasti128 m12, [wiener7_shufC]
|
||||
vbroadcasti128 m13, [wiener7_shufD]
|
||||
vbroadcasti128 m14, [wiener7_shufE]
|
||||
vbroadcasti128 m15, [rev_d]
|
||||
popcnt bdmaxd, bdmaxm
|
||||
vpbroadcastd m9, [pd_65540]
|
||||
mov rhq, [pq_3]
|
||||
cmp bdmaxd, 10
|
||||
je .bits10
|
||||
vpbroadcastd m9, [pd_262160]
|
||||
mov rhq, [pq_5]
|
||||
.bits10:
|
||||
add wq, wq
|
||||
add srcq, wq
|
||||
add dstq, wq
|
||||
neg wq
|
||||
DEFINE_ARGS dst, left, src, ss, f, w, h, edge, x, rh
|
||||
.v_loop:
|
||||
mov xq, wq
|
||||
test edgeb, 1 ; LR_HAVE_LEFT
|
||||
jz .h_extend_left
|
||||
test leftq, leftq
|
||||
jz .h_loop
|
||||
movq xm4, [leftq + 2]
|
||||
vpblendw xm4, [srcq + xq - 6], 0xf8
|
||||
vinserti128 m4, [srcq + xq + 10], 1
|
||||
add leftq, 8
|
||||
jmp .h_main
|
||||
.h_extend_left:
|
||||
vbroadcasti128 m5, [srcq + xq]
|
||||
mova m4, [srcq + xq]
|
||||
palignr m4, m5, 10
|
||||
pshufb m4, [wiener7_l_shuf]
|
||||
jmp .h_main
|
||||
.h_loop:
|
||||
movu m4, [srcq + xq - 6]
|
||||
.h_main:
|
||||
movu m5, [srcq + xq + 2]
|
||||
movu m6, [srcq + xq + 6]
|
||||
test edgeb, 2 ; LR_HAVE_RIGHT
|
||||
jnz .h_have_right
|
||||
cmp xd, -38
|
||||
jl .h_have_right
|
||||
movd xm3, xd
|
||||
vpbroadcastd m0, [pb_wiener7_l]
|
||||
vpbroadcastd m1, [pb_wiener7_m]
|
||||
vpbroadcastd m2, [pb_wiener7_r]
|
||||
vpbroadcastb m3, xm3
|
||||
psubb m0, m3
|
||||
psubb m1, m3
|
||||
psubb m2, m3
|
||||
movu m3, [pb_0to31]
|
||||
pminub m0, m3
|
||||
pminub m1, m3
|
||||
pminub m2, m3
|
||||
pshufb m4, m0
|
||||
pshufb m5, m1
|
||||
pshufb m6, m2
|
||||
cmp xd, -9*2
|
||||
jne .hack
|
||||
vpbroadcastw xm3, [srcq + xq + 16]
|
||||
vinserti128 m5, xm3, 1
|
||||
jmp .h_have_right
|
||||
.hack:
|
||||
cmp xd, -1*2
|
||||
jne .h_have_right
|
||||
vpbroadcastw xm5, [srcq + xq]
|
||||
.h_have_right:
|
||||
pshufb m6, m10
|
||||
pshufb m0, m4, m11
|
||||
pshufb m2, m5, m12
|
||||
paddw m0, m2
|
||||
pmaddwd m0, m7
|
||||
pshufb m2, m4, m13
|
||||
pshufb m4, m14
|
||||
paddw m2, m4
|
||||
pmaddwd m2, m8
|
||||
pshufb m1, m6, m11
|
||||
pshufb m5, m11
|
||||
pmaddwd m1, m7
|
||||
pmaddwd m5, m7
|
||||
pshufb m3, m6, m13
|
||||
pshufb m6, m14
|
||||
paddw m3, m6
|
||||
pmaddwd m3, m8
|
||||
paddd m0, m2
|
||||
paddd m1, m3
|
||||
pshufb m1, m15
|
||||
paddd m1, m5
|
||||
movq xm4, rhq
|
||||
pxor m5, m5
|
||||
paddd m0, m9
|
||||
paddd m1, m9
|
||||
psrad m0, xm4
|
||||
psrad m1, xm4
|
||||
packssdw m0, m1
|
||||
pmaxsw m0, m5
|
||||
mova [dstq + xq], m0
|
||||
add xq, 32
|
||||
jl .h_loop
|
||||
add srcq, ssq
|
||||
add dstq, 384*2
|
||||
dec hd
|
||||
jg .v_loop
|
||||
RET
|
||||
|
||||
INIT_YMM avx2
|
||||
cglobal wiener_filter7_v_16bpc, 6, 15, 13, dst, ds, mid, f, w, h, edge, bdmax
|
||||
movifnidn wd, wm
|
||||
movifnidn hd, hm
|
||||
movifnidn edgeb, edgem
|
||||
pxor m6, m6
|
||||
vpbroadcastd m7, [fq]
|
||||
vpbroadcastw m8, [fq + 4]
|
||||
vpbroadcastd m9, [fq + 6]
|
||||
popcnt bdmaxd, bdmaxm
|
||||
vpbroadcastd m10, [nd_1047552]
|
||||
movq xm11, [pq_11]
|
||||
cmp bdmaxd, 10
|
||||
je .bits10
|
||||
vpbroadcastd m10, [nd_1048320]
|
||||
movq xm11, [pq_9]
|
||||
.bits10:
|
||||
vpbroadcastw m12, bdmaxm
|
||||
add wq, wq
|
||||
add midq, wq
|
||||
add dstq, wq
|
||||
neg wq
|
||||
DEFINE_ARGS dst, ds, mid, ms, w, h, edge, x
|
||||
mov msq, 2*384
|
||||
mov t0, midq
|
||||
mov t1, t0
|
||||
lea t2, [t1 + msq]
|
||||
lea t3, [t2 + msq]
|
||||
lea t4, [t3 + msq]
|
||||
lea t5, [t4 + msq]
|
||||
lea t6, [t5 + msq]
|
||||
test edgeb, 4 ; LR_HAVE_TOP
|
||||
jnz .have_top
|
||||
mov t0, t3
|
||||
mov t1, t3
|
||||
mov t2, t3
|
||||
.have_top:
|
||||
cmp hd, 3
|
||||
jg .v_loop
|
||||
test edgeb, 8 ; LR_HAVE_BOTTOM
|
||||
jz .no_bottom0
|
||||
cmp hd, 1
|
||||
jg .v_loop
|
||||
jmp .h3
|
||||
.no_bottom0:
|
||||
cmp hd, 2
|
||||
je .h2
|
||||
jns .h3
|
||||
.h1:
|
||||
mov t4, t3
|
||||
.h2:
|
||||
mov t5, t4
|
||||
.h3:
|
||||
mov t6, t5
|
||||
.v_loop:
|
||||
mov xq, wq
|
||||
.h_loop:
|
||||
mova m1, [t0 + xq]
|
||||
mova m2, [t1 + xq]
|
||||
mova m3, [t5 + xq]
|
||||
mova m4, [t6 + xq]
|
||||
punpcklwd m0, m1, m2
|
||||
pmaddwd m0, m7
|
||||
punpckhwd m1, m2
|
||||
pmaddwd m1, m7
|
||||
punpcklwd m2, m4, m3
|
||||
pmaddwd m2, m7
|
||||
punpckhwd m4, m3
|
||||
pmaddwd m4, m7
|
||||
paddd m0, m2
|
||||
paddd m1, m4
|
||||
mova m3, [t2 + xq]
|
||||
mova m4, [t4 + xq]
|
||||
punpcklwd m2, m3, m4
|
||||
pmaddwd m2, m8
|
||||
punpckhwd m3, m4
|
||||
pmaddwd m3, m8
|
||||
paddd m0, m2
|
||||
paddd m1, m3
|
||||
mova m3, [t3 + xq]
|
||||
punpcklwd m2, m3, m6
|
||||
pmaddwd m2, m9
|
||||
punpckhwd m3, m6
|
||||
pmaddwd m3, m9
|
||||
paddd m0, m2
|
||||
paddd m1, m3
|
||||
paddd m0, m10
|
||||
paddd m1, m10
|
||||
psrad m0, xm11
|
||||
psrad m1, xm11
|
||||
packusdw m0, m1
|
||||
pminuw m0, m12
|
||||
mova [dstq + xq], m0
|
||||
add xq, 32
|
||||
jl .h_loop
|
||||
add dstq, dsq
|
||||
mov t0, t1
|
||||
mov t1, t2
|
||||
mov t2, t3
|
||||
mov t3, t4
|
||||
mov t4, t5
|
||||
mov t5, t6
|
||||
add t6, msq
|
||||
cmp hd, 4
|
||||
jg .next_row
|
||||
test edgeb, 8 ; LR_HAVE_BOTTOM
|
||||
jz .no_bottom
|
||||
cmp hd, 2
|
||||
jg .next_row
|
||||
.no_bottom:
|
||||
mov t6, t5
|
||||
.next_row:
|
||||
dec hd
|
||||
jg .v_loop
|
||||
RET
|
||||
|
||||
%endif ; ARCH_X86_64
|
|
@ -29,173 +29,235 @@
|
|||
#include "src/looprestoration.h"
|
||||
|
||||
#include "common/intops.h"
|
||||
#include "src/tables.h"
|
||||
|
||||
#define WIENER_FILTER(ext) \
|
||||
void dav1d_wiener_filter7_##ext(pixel *const dst, ptrdiff_t dst_stride, \
|
||||
const pixel (*left)[4], const pixel *lpf, \
|
||||
ptrdiff_t lpf_stride, int w, int h, \
|
||||
const int16_t filter[2][8], \
|
||||
enum LrEdgeFlags edges); \
|
||||
void dav1d_wiener_filter5_##ext(pixel *const dst, ptrdiff_t dst_stride, \
|
||||
const pixel (*left)[4], const pixel *lpf, \
|
||||
ptrdiff_t lpf_stride, int w, int h, \
|
||||
const int16_t filter[2][8], \
|
||||
enum LrEdgeFlags edges);
|
||||
#if BITDEPTH != 8
|
||||
#define decl_wiener_filter_fn(name, ext) \
|
||||
void BF(name##_h, ext)(int16_t *dst, const pixel (*left)[4], const pixel *src, \
|
||||
ptrdiff_t stride, const int16_t fh[7], const intptr_t w, \
|
||||
int h, enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); \
|
||||
void BF(name##_v, ext)(pixel *dst, ptrdiff_t stride, const int16_t *mid, \
|
||||
const int16_t fv[7], int w, int h, \
|
||||
enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); \
|
||||
static void BF(name, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
|
||||
const pixel (*const left)[4], \
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride, \
|
||||
const int w, const int h, const LooprestorationParams *params, \
|
||||
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) { \
|
||||
ALIGN_STK_64(int16_t, mid, 68 * 384,); \
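/* 68 rows of 384 int16_t: 2 rows of padding above and below a unit of up to 64 rows */ \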
|
||||
BF(name##_h, ext)(&mid[2*384], left, dst, dst_stride, params->filter[0], w, h, \
|
||||
edges HIGHBD_TAIL_SUFFIX); \
|
||||
if (edges & LR_HAVE_TOP) { \
|
||||
BF(name##_h, ext)(mid, NULL, lpf, lpf_stride, params->filter[0], w, 2, \
|
||||
edges HIGHBD_TAIL_SUFFIX); \
|
||||
} \
|
||||
if (edges & LR_HAVE_BOTTOM) { \
|
||||
BF(name##_h, ext)(&mid[(2 + h)*384], NULL, lpf + 6*PXSTRIDE(lpf_stride), \
|
||||
lpf_stride, params->filter[0], w, 2, edges HIGHBD_TAIL_SUFFIX); \
|
||||
} \
|
||||
BF(name##_v, ext)(dst, dst_stride, mid, params->filter[1], w, h, edges HIGHBD_TAIL_SUFFIX); \
|
||||
}
|
||||
#define decl_wiener_filter_fns(ext) \
|
||||
decl_wiener_filter_fn(dav1d_wiener_filter7, ext); \
|
||||
decl_wiener_filter_fn(dav1d_wiener_filter5, ext)
|
||||
#else
|
||||
#define decl_wiener_filter_fns(ext) \
|
||||
decl_lr_filter_fn(BF(dav1d_wiener_filter7, ext)); \
|
||||
decl_lr_filter_fn(BF(dav1d_wiener_filter5, ext))
|
||||
#endif
|
||||
|
||||
#define SGR_FILTER(ext) \
|
||||
void dav1d_sgr_box3_h_##ext(int32_t *sumsq, int16_t *sum, \
|
||||
const pixel (*left)[4], \
|
||||
const pixel *src, const ptrdiff_t stride, \
|
||||
const int w, const int h, \
|
||||
const enum LrEdgeFlags edges); \
|
||||
void dav1d_sgr_box3_v_##ext(int32_t *sumsq, int16_t *sum, \
|
||||
const int w, const int h, \
|
||||
const enum LrEdgeFlags edges); \
|
||||
void dav1d_sgr_calc_ab1_##ext(int32_t *a, int16_t *b, \
|
||||
const int w, const int h, const int strength); \
|
||||
void dav1d_sgr_finish_filter1_##ext(coef *tmp, \
|
||||
const pixel *src, const ptrdiff_t stride, \
|
||||
const int32_t *a, const int16_t *b, \
|
||||
const int w, const int h); \
|
||||
#define decl_sgr_filter_fns(ext) \
|
||||
void BF(dav1d_sgr_filter_5x5, ext)(pixel *dst, ptrdiff_t dst_stride, \
|
||||
const pixel (*left)[4], const pixel *lpf, \
|
||||
ptrdiff_t lpf_stride, int w, int h, \
|
||||
const LooprestorationParams *params, \
|
||||
enum LrEdgeFlags edges); \
|
||||
void BF(dav1d_sgr_filter_3x3, ext)(pixel *dst, ptrdiff_t dst_stride, \
|
||||
const pixel (*left)[4], const pixel *lpf, \
|
||||
ptrdiff_t lpf_stride, int w, int h, \
|
||||
const LooprestorationParams *params, \
|
||||
enum LrEdgeFlags edges); \
|
||||
void BF(dav1d_sgr_filter_mix, ext)(pixel *dst, ptrdiff_t dst_stride, \
|
||||
const pixel (*left)[4], const pixel *lpf, \
|
||||
ptrdiff_t lpf_stride, int w, int h, \
|
||||
const LooprestorationParams *params, \
|
||||
enum LrEdgeFlags edges);
|
||||
|
||||
/* FIXME: Replace with a port of the AVX2 code */
|
||||
#define SGR_FILTER_OLD(ext) \
|
||||
void BF(dav1d_sgr_box3_h, ext)(int32_t *sumsq, int16_t *sum, \
|
||||
const pixel (*left)[4], \
|
||||
const pixel *src, const ptrdiff_t stride, \
|
||||
const int w, const int h, \
|
||||
const enum LrEdgeFlags edges); \
|
||||
void BF(dav1d_sgr_box3_v, ext)(int32_t *sumsq, int16_t *sum, \
|
||||
const int w, const int h, \
|
||||
const enum LrEdgeFlags edges); \
|
||||
void BF(dav1d_sgr_calc_ab1, ext)(int32_t *a, int16_t *b, \
|
||||
const int w, const int h, const unsigned s); \
|
||||
void BF(dav1d_sgr_finish_filter1, ext)(coef *tmp, \
|
||||
const pixel *src, const ptrdiff_t stride, \
|
||||
const int32_t *a, const int16_t *b, \
|
||||
const int w, const int h); \
|
||||
\
|
||||
/* filter with a 3x3 box (radius=1) */ \
|
||||
static void dav1d_sgr_filter1_##ext(coef *tmp, \
|
||||
const pixel *src, const ptrdiff_t stride, \
|
||||
const pixel (*left)[4], \
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride, \
|
||||
const int w, const int h, const int strength, \
|
||||
const enum LrEdgeFlags edges) \
|
||||
static void BF(dav1d_sgr_filter1, ext)(coef *tmp, \
|
||||
const pixel *src, const ptrdiff_t stride, \
|
||||
const pixel (*left)[4], \
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride, \
|
||||
const int w, const int h, const int strength, \
|
||||
const enum LrEdgeFlags edges) \
|
||||
{ \
|
||||
ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \
|
||||
int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \
|
||||
ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \
|
||||
int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \
|
||||
\
|
||||
dav1d_sgr_box3_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \
|
||||
BF(dav1d_sgr_box3_h, ext)(sumsq, sum, left, src, stride, w, h, edges); \
|
||||
if (edges & LR_HAVE_TOP) \
|
||||
dav1d_sgr_box3_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
|
||||
NULL, lpf, lpf_stride, w, 2, edges); \
|
||||
BF(dav1d_sgr_box3_h, ext)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
|
||||
NULL, lpf, lpf_stride, w, 2, edges); \
|
||||
\
|
||||
if (edges & LR_HAVE_BOTTOM) \
|
||||
dav1d_sgr_box3_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
|
||||
NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
|
||||
lpf_stride, w, 2, edges); \
|
||||
BF(dav1d_sgr_box3_h, ext)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
|
||||
NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
|
||||
lpf_stride, w, 2, edges); \
|
||||
\
|
||||
dav1d_sgr_box3_v_##ext(sumsq, sum, w, h, edges); \
|
||||
dav1d_sgr_calc_ab1_##ext(a, b, w, h, strength); \
|
||||
dav1d_sgr_finish_filter1_##ext(tmp, src, stride, a, b, w, h); \
|
||||
BF(dav1d_sgr_box3_v, ext)(sumsq, sum, w, h, edges); \
|
||||
BF(dav1d_sgr_calc_ab1, ext)(a, b, w, h, strength); \
|
||||
BF(dav1d_sgr_finish_filter1, ext)(tmp, src, stride, a, b, w, h); \
|
||||
} \
|
||||
\
|
||||
void dav1d_sgr_box5_h_##ext(int32_t *sumsq, int16_t *sum, \
|
||||
const pixel (*left)[4], \
|
||||
const pixel *src, const ptrdiff_t stride, \
|
||||
const int w, const int h, \
|
||||
const enum LrEdgeFlags edges); \
|
||||
void dav1d_sgr_box5_v_##ext(int32_t *sumsq, int16_t *sum, \
|
||||
const int w, const int h, \
|
||||
const enum LrEdgeFlags edges); \
|
||||
void dav1d_sgr_calc_ab2_##ext(int32_t *a, int16_t *b, \
|
||||
const int w, const int h, const int strength); \
|
||||
void dav1d_sgr_finish_filter2_##ext(coef *tmp, \
|
||||
const pixel *src, const ptrdiff_t stride, \
|
||||
const int32_t *a, const int16_t *b, \
|
||||
const int w, const int h); \
|
||||
void BF(dav1d_sgr_box5_h, ext)(int32_t *sumsq, int16_t *sum, \
|
||||
const pixel (*left)[4], \
|
||||
const pixel *src, const ptrdiff_t stride, \
|
||||
const int w, const int h, \
|
||||
const enum LrEdgeFlags edges); \
|
||||
void BF(dav1d_sgr_box5_v, ext)(int32_t *sumsq, int16_t *sum, \
|
||||
const int w, const int h, \
|
||||
const enum LrEdgeFlags edges); \
|
||||
void BF(dav1d_sgr_calc_ab2, ext)(int32_t *a, int16_t *b, \
|
||||
const int w, const int h, const int strength); \
|
||||
void BF(dav1d_sgr_finish_filter2, ext)(coef *tmp, \
|
||||
const pixel *src, const ptrdiff_t stride, \
|
||||
const int32_t *a, const int16_t *b, \
|
||||
const int w, const int h); \
|
||||
\
|
||||
/* filter with a 5x5 box (radius=2) */ \
|
||||
static void dav1d_sgr_filter2_##ext(coef *tmp, \
|
||||
const pixel *src, const ptrdiff_t stride, \
|
||||
const pixel (*left)[4], \
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride, \
|
||||
const int w, const int h, const int strength, \
|
||||
const enum LrEdgeFlags edges) \
|
||||
static void BF(dav1d_sgr_filter2, ext)(coef *tmp, \
|
||||
const pixel *src, const ptrdiff_t stride, \
|
||||
const pixel (*left)[4], \
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride, \
|
||||
const int w, const int h, const int strength, \
|
||||
const enum LrEdgeFlags edges) \
|
||||
{ \
|
||||
ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \
|
||||
int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \
|
||||
ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \
|
||||
int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \
|
||||
\
|
||||
dav1d_sgr_box5_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \
|
||||
BF(dav1d_sgr_box5_h, ext)(sumsq, sum, left, src, stride, w, h, edges); \
|
||||
if (edges & LR_HAVE_TOP) \
|
||||
dav1d_sgr_box5_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
|
||||
NULL, lpf, lpf_stride, w, 2, edges); \
|
||||
BF(dav1d_sgr_box5_h, ext)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
|
||||
NULL, lpf, lpf_stride, w, 2, edges); \
|
||||
\
|
||||
if (edges & LR_HAVE_BOTTOM) \
|
||||
dav1d_sgr_box5_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
|
||||
NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
|
||||
lpf_stride, w, 2, edges); \
|
||||
BF(dav1d_sgr_box5_h, ext)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
|
||||
NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
|
||||
lpf_stride, w, 2, edges); \
|
||||
\
|
||||
dav1d_sgr_box5_v_##ext(sumsq, sum, w, h, edges); \
|
||||
dav1d_sgr_calc_ab2_##ext(a, b, w, h, strength); \
|
||||
dav1d_sgr_finish_filter2_##ext(tmp, src, stride, a, b, w, h); \
|
||||
BF(dav1d_sgr_box5_v, ext)(sumsq, sum, w, h, edges); \
|
||||
BF(dav1d_sgr_calc_ab2, ext)(a, b, w, h, strength); \
|
||||
BF(dav1d_sgr_finish_filter2, ext)(tmp, src, stride, a, b, w, h); \
|
||||
} \
|
||||
\
|
||||
void dav1d_sgr_weighted1_##ext(pixel *dst, const ptrdiff_t stride, \
|
||||
const coef *t1, const int w, const int h, \
|
||||
const int wt); \
|
||||
void dav1d_sgr_weighted2_##ext(pixel *dst, const ptrdiff_t stride, \
|
||||
const coef *t1, const coef *t2, \
|
||||
const int w, const int h, \
|
||||
const uint32_t wt); \
|
||||
void BF(dav1d_sgr_weighted1, ext)(pixel *dst, const ptrdiff_t stride, \
|
||||
const coef *t1, const int w, const int h, \
|
||||
const int wt); \
|
||||
void BF(dav1d_sgr_weighted2, ext)(pixel *dst, const ptrdiff_t stride, \
|
||||
const coef *t1, const coef *t2, \
|
||||
const int w, const int h, \
|
||||
const uint32_t wt); \
|
||||
\
|
||||
static void sgr_filter_##ext(pixel *const dst, const ptrdiff_t dst_stride, \
|
||||
const pixel (*const left)[4], \
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride, \
|
||||
const int w, const int h, const int sgr_idx, \
|
||||
const int16_t sgr_wt[7], const enum LrEdgeFlags edges) \
|
||||
static void BF(sgr_filter_5x5, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
|
||||
const pixel (*const left)[4], \
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride, \
|
||||
const int w, const int h, \
|
||||
const LooprestorationParams *const params, \
|
||||
const enum LrEdgeFlags edges) \
|
||||
{ \
|
||||
if (!dav1d_sgr_params[sgr_idx][0]) { \
|
||||
ALIGN_STK_32(coef, tmp, 64 * 384,); \
|
||||
dav1d_sgr_filter1_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \
|
||||
w, h, dav1d_sgr_params[sgr_idx][3], edges); \
|
||||
dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, (1 << 7) - sgr_wt[1]); \
|
||||
} else if (!dav1d_sgr_params[sgr_idx][1]) { \
|
||||
ALIGN_STK_32(coef, tmp, 64 * 384,); \
|
||||
dav1d_sgr_filter2_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \
|
||||
w, h, dav1d_sgr_params[sgr_idx][2], edges); \
|
||||
dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, sgr_wt[0]); \
|
||||
} else { \
|
||||
ALIGN_STK_32(coef, tmp1, 64 * 384,); \
|
||||
ALIGN_STK_32(coef, tmp2, 64 * 384,); \
|
||||
dav1d_sgr_filter2_##ext(tmp1, dst, dst_stride, left, lpf, lpf_stride, \
|
||||
w, h, dav1d_sgr_params[sgr_idx][2], edges); \
|
||||
dav1d_sgr_filter1_##ext(tmp2, dst, dst_stride, left, lpf, lpf_stride, \
|
||||
w, h, dav1d_sgr_params[sgr_idx][3], edges); \
|
||||
const uint32_t wt = ((128 - sgr_wt[0] - sgr_wt[1]) << 16) | (uint16_t) sgr_wt[0]; \
|
||||
dav1d_sgr_weighted2_##ext(dst, dst_stride, tmp1, tmp2, w, h, wt); \
|
||||
} \
|
||||
ALIGN_STK_32(coef, tmp, 64 * 384,); \
|
||||
BF(dav1d_sgr_filter2, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \
|
||||
w, h, params->sgr.s0, edges); \
|
||||
BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, params->sgr.w0); \
|
||||
} \
|
||||
static void BF(sgr_filter_3x3, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
|
||||
const pixel (*const left)[4], \
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride, \
|
||||
const int w, const int h, \
|
||||
const LooprestorationParams *const params, \
|
||||
const enum LrEdgeFlags edges) \
|
||||
{ \
|
||||
ALIGN_STK_32(coef, tmp, 64 * 384,); \
|
||||
BF(dav1d_sgr_filter1, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \
|
||||
w, h, params->sgr.s1, edges); \
|
||||
BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, params->sgr.w1); \
|
||||
} \
|
||||
static void BF(sgr_filter_mix, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
|
||||
const pixel (*const left)[4], \
|
||||
const pixel *lpf, const ptrdiff_t lpf_stride, \
|
||||
const int w, const int h, \
|
||||
const LooprestorationParams *const params, \
|
||||
const enum LrEdgeFlags edges) \
|
||||
{ \
|
||||
ALIGN_STK_32(coef, tmp1, 64 * 384,); \
|
||||
ALIGN_STK_32(coef, tmp2, 64 * 384,); \
|
||||
BF(dav1d_sgr_filter2, ext)(tmp1, dst, dst_stride, left, lpf, lpf_stride, \
|
||||
w, h, params->sgr.s0, edges); \
|
||||
BF(dav1d_sgr_filter1, ext)(tmp2, dst, dst_stride, left, lpf, lpf_stride, \
|
||||
w, h, params->sgr.s1, edges); \
|
||||
const uint32_t wt = (params->sgr.w1 << 16) | (uint16_t) params->sgr.w0; \
|
||||
BF(dav1d_sgr_weighted2, ext)(dst, dst_stride, tmp1, tmp2, w, h, wt); \
|
||||
}
|
||||
|
||||
#if BITDEPTH == 8
|
||||
WIENER_FILTER(sse2)
|
||||
WIENER_FILTER(ssse3)
|
||||
SGR_FILTER(ssse3)
|
||||
decl_wiener_filter_fns(sse2);
|
||||
decl_wiener_filter_fns(ssse3);
|
||||
SGR_FILTER_OLD(ssse3)
|
||||
# if ARCH_X86_64
|
||||
WIENER_FILTER(avx2)
|
||||
SGR_FILTER(avx2)
|
||||
decl_sgr_filter_fns(avx2)
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#if ARCH_X86_64
|
||||
decl_wiener_filter_fns(avx2);
|
||||
#endif
|
||||
|
||||
COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c) {
|
||||
const unsigned flags = dav1d_get_cpu_flags();
|
||||
|
||||
if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
|
||||
#if BITDEPTH == 8
|
||||
c->wiener[0] = dav1d_wiener_filter7_sse2;
|
||||
c->wiener[1] = dav1d_wiener_filter5_sse2;
|
||||
c->wiener[0] = BF(dav1d_wiener_filter7, sse2);
|
||||
c->wiener[1] = BF(dav1d_wiener_filter5, sse2);
|
||||
#endif
|
||||
|
||||
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
|
||||
#if BITDEPTH == 8
|
||||
c->wiener[0] = dav1d_wiener_filter7_ssse3;
|
||||
c->wiener[1] = dav1d_wiener_filter5_ssse3;
|
||||
c->selfguided = sgr_filter_ssse3;
|
||||
c->wiener[0] = BF(dav1d_wiener_filter7, ssse3);
|
||||
c->wiener[1] = BF(dav1d_wiener_filter5, ssse3);
|
||||
c->sgr[0] = BF(sgr_filter_5x5, ssse3);
|
||||
c->sgr[1] = BF(sgr_filter_3x3, ssse3);
|
||||
c->sgr[2] = BF(sgr_filter_mix, ssse3);
|
||||
#endif
|
||||
|
||||
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
|
||||
#if BITDEPTH == 8 && ARCH_X86_64
|
||||
c->wiener[0] = dav1d_wiener_filter7_avx2;
|
||||
c->wiener[1] = dav1d_wiener_filter5_avx2;
|
||||
c->selfguided = sgr_filter_avx2;
|
||||
#if ARCH_X86_64
|
||||
c->wiener[0] = BF(dav1d_wiener_filter7, avx2);
|
||||
c->wiener[1] = BF(dav1d_wiener_filter5, avx2);
|
||||
# if BITDEPTH == 8
|
||||
c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx2);
|
||||
c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx2);
|
||||
c->sgr[2] = BF(dav1d_sgr_filter_mix, avx2);
|
||||
# endif
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -97,8 +97,8 @@ SECTION .text
|
|||
%macro WIENER 0
|
||||
%if ARCH_X86_64
|
||||
DECLARE_REG_TMP 4, 10, 7, 11, 12, 13, 14 ; ring buffer pointers
|
||||
cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
|
||||
lpf_stride, w, edge, flt, h, x
|
||||
cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
|
||||
lpf_stride, w, edge, flt, h, x
|
||||
%define base 0
|
||||
mov fltq, fltmp
|
||||
mov edged, r8m
|
||||
|
@ -139,7 +139,7 @@ DECLARE_REG_TMP 4, 0, _, 5
|
|||
%define m11 [stk+96]
|
||||
%define stk_off 112
|
||||
%endif
|
||||
cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
|
||||
cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
|
||||
%define base r6-pb_right_ext_mask-21
|
||||
%define stk esp
|
||||
%define dstq leftq
|
||||
|
@ -245,7 +245,7 @@ cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
|
|||
add lpfq, [rsp+gprsize*1]
|
||||
call .hv_bottom
|
||||
.v1:
|
||||
call mangle(private_prefix %+ _wiener_filter7_ssse3).v
|
||||
call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
|
||||
RET
|
||||
.no_top:
|
||||
lea t3, [lpfq+lpf_strideq*4]
|
||||
|
@ -281,9 +281,9 @@ cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
|
|||
dec hd
|
||||
jnz .main
|
||||
.v3:
|
||||
call mangle(private_prefix %+ _wiener_filter7_ssse3).v
|
||||
call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
|
||||
.v2:
|
||||
call mangle(private_prefix %+ _wiener_filter7_ssse3).v
|
||||
call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
|
||||
jmp .v1
|
||||
.extend_right:
|
||||
movd m2, [lpfq-4]
|
||||
|
@ -685,8 +685,8 @@ ALIGN function_align
|
|||
%endif
|
||||
|
||||
%if ARCH_X86_64
|
||||
cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
|
||||
lpf_stride, w, edge, flt, h, x
|
||||
cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
|
||||
lpf_stride, w, edge, flt, h, x
|
||||
mov fltq, fltmp
|
||||
mov edged, r8m
|
||||
mov wd, wm
|
||||
|
@ -720,7 +720,7 @@ cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
|
|||
%define m11 [stk+80]
|
||||
%define stk_off 96
|
||||
%endif
|
||||
cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
|
||||
cglobal wiener_filter5_8bpc, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
|
||||
%define stk esp
|
||||
%define leftmp [stk+28]
|
||||
%define m8 [base+pw_m16380]
|
||||
|
@ -827,14 +827,14 @@ cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
|
|||
dec hd
|
||||
jnz .main
|
||||
.v2:
|
||||
call mangle(private_prefix %+ _wiener_filter5_ssse3).v
|
||||
call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
|
||||
add dstq, dst_strideq
|
||||
mov t4, t3
|
||||
mov t3, t2
|
||||
mov t2, t1
|
||||
movifnidn dstmp, dstq
|
||||
.v1:
|
||||
call mangle(private_prefix %+ _wiener_filter5_ssse3).v
|
||||
call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
|
||||
jmp .end
|
||||
.h:
|
||||
%define stk esp+4
|
||||
|
@ -873,7 +873,7 @@ cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
|
|||
jnz .h_have_right
|
||||
cmp xd, -17
|
||||
jl .h_have_right
|
||||
call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right
|
||||
call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
|
||||
.h_have_right:
|
||||
%macro %%h5 0
|
||||
%if cpuflag(ssse3)
|
||||
|
@ -991,7 +991,7 @@ ALIGN function_align
|
|||
jnz .hv_have_right
|
||||
cmp xd, -17
|
||||
jl .hv_have_right
|
||||
call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right
|
||||
call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
|
||||
.hv_have_right:
|
||||
%%h5
|
||||
mova m2, [t3+xq*2]
|
||||
|
@ -1161,7 +1161,7 @@ WIENER
|
|||
%endmacro
|
||||
|
||||
%if ARCH_X86_64
|
||||
cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
|
||||
cglobal sgr_box3_h_8bpc, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
|
||||
mov xlimd, edgem
|
||||
movifnidn xd, xm
|
||||
mov hd, hm
|
||||
|
@ -1170,7 +1170,7 @@ cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
|
|||
add xd, xlimd
|
||||
xor xlimd, 2 ; 2*!have_right
|
||||
%else
|
||||
cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
|
||||
cglobal sgr_box3_h_8bpc, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
|
||||
%define wq r0m
|
||||
%define xlimd r1m
|
||||
%define hd hmp
|
||||
|
@ -1287,10 +1287,10 @@ cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
|
|||
RET
|
||||
|
||||
%if ARCH_X86_64
|
||||
cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim
|
||||
cglobal sgr_box3_v_8bpc, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim
|
||||
movifnidn edged, edgem
|
||||
%else
|
||||
cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
|
||||
cglobal sgr_box3_v_8bpc, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
|
||||
%define sumsq_baseq dword [esp+0]
|
||||
%define sum_baseq dword [esp+4]
|
||||
%define ylimd dword [esp+8]
|
||||
|
@ -1383,7 +1383,7 @@ cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
|
|||
jl .loop_x
|
||||
RET
|
||||
|
||||
cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s
|
||||
cglobal sgr_calc_ab1_8bpc, 4, 7, 12, a, b, w, h, s
|
||||
movifnidn sd, sm
|
||||
sub aq, (384+16-1)*4
|
||||
sub bq, (384+16-1)*2
|
||||
|
@ -1463,8 +1463,8 @@ cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s
|
|||
RET
|
||||
|
||||
%if ARCH_X86_64
|
||||
cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
|
||||
tmp_base, src_base, a_base, b_base, x, y
|
||||
cglobal sgr_finish_filter1_8bpc, 5, 13, 16, t, src, stride, a, b, w, h, \
|
||||
tmp_base, src_base, a_base, b_base, x, y
|
||||
movifnidn wd, wm
|
||||
mov hd, hm
|
||||
mova m15, [pw_16]
|
||||
|
@ -1474,7 +1474,7 @@ cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
|
|||
mov b_baseq, bq
|
||||
xor xd, xd
|
||||
%else
|
||||
cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y
|
||||
cglobal sgr_finish_filter1_8bpc, 7, 7, 8, -144, t, src, stride, a, b, x, y
|
||||
%define tmp_baseq [esp+8]
|
||||
%define src_baseq [esp+12]
|
||||
%define a_baseq [esp+16]
|
||||
|
@ -1688,7 +1688,7 @@ cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y
|
|||
jl .loop_x
|
||||
RET
|
||||
|
||||
cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt
|
||||
cglobal sgr_weighted1_8bpc, 4, 7, 8, dst, stride, t, w, h, wt
|
||||
movifnidn hd, hm
|
||||
%if ARCH_X86_32
|
||||
SETUP_PIC r6, 0
|
||||
|
@ -1726,14 +1726,14 @@ cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt
|
|||
RET
|
||||
|
||||
%if ARCH_X86_64
|
||||
cglobal sgr_box5_h, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim
|
||||
cglobal sgr_box5_h_8bpc, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim
|
||||
mov edged, edgem
|
||||
movifnidn wd, wm
|
||||
mov hd, hm
|
||||
mova m10, [pb_0]
|
||||
mova m11, [pb_0_1]
|
||||
%else
|
||||
cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
|
||||
cglobal sgr_box5_h_8bpc, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
|
||||
%define edgeb byte edgem
|
||||
%define wd xd
|
||||
%define wq wd
|
||||
|
@ -1909,11 +1909,11 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
|
|||
RET
|
||||
|
||||
%if ARCH_X86_64
|
||||
cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
|
||||
cglobal sgr_box5_v_8bpc, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
|
||||
movifnidn edged, edgem
|
||||
mov ylimd, edged
|
||||
%else
|
||||
cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
|
||||
cglobal sgr_box5_v_8bpc, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
|
||||
%define wm [esp+0]
|
||||
%define hm [esp+4]
|
||||
%define edgem [esp+8]
|
||||
|
@ -2127,7 +2127,7 @@ cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
|
|||
jmp .sum_loop_y_noload
|
||||
%endif
|
||||
|
||||
cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s
|
||||
cglobal sgr_calc_ab2_8bpc, 4, 7, 11, a, b, w, h, s
|
||||
movifnidn sd, sm
|
||||
sub aq, (384+16-1)*4
|
||||
sub bq, (384+16-1)*2
|
||||
|
@ -2205,7 +2205,7 @@ cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s
|
|||
RET
|
||||
|
||||
%if ARCH_X86_64
|
||||
cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \
|
||||
cglobal sgr_finish_filter2_8bpc, 5, 13, 14, t, src, stride, a, b, w, h, \
|
||||
tmp_base, src_base, a_base, b_base, x, y
|
||||
movifnidn wd, wm
|
||||
mov hd, hm
|
||||
|
@ -2219,7 +2219,7 @@ cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \
|
|||
psrlw m11, m12, 1 ; pw_128
|
||||
pxor m13, m13
|
||||
%else
|
||||
cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y
|
||||
cglobal sgr_finish_filter2_8bpc, 6, 7, 8, t, src, stride, a, b, x, y
|
||||
%define tmp_baseq r0m
|
||||
%define src_baseq r1m
|
||||
%define a_baseq r3m
|
||||
|
@ -2378,7 +2378,7 @@ cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y
|
|||
RET
|
||||
|
||||
%undef t2
|
||||
cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt
|
||||
cglobal sgr_weighted2_8bpc, 4, 7, 12, dst, stride, t1, t2, w, h, wt
|
||||
movifnidn wd, wm
|
||||
movd m0, wtm
|
||||
%if ARCH_X86_64
|
||||
|
|
|
@ -3825,9 +3825,8 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
|
|||
pblendw m6, m7, 0xaa ; 67 89
|
||||
pmulhrsw m6, m12
|
||||
paddd m4, m5
|
||||
vpblendd m0, m1, m6, 0x0f
|
||||
vperm2i128 m0, m1, m6, 0x21 ; 45 67
|
||||
mova m1, m6
|
||||
vpermq m0, m0, q1032 ; 45 67
|
||||
pmaddwd m6, m0, m10
|
||||
pmaddwd m7, m1, m11
|
||||
paddd m4, m13
|
||||
|
|
|
@ -153,6 +153,7 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6
|
|||
.renorm4:
|
||||
bsr ecx, t2d
|
||||
xor ecx, 15 ; d
|
||||
.renorm5:
|
||||
shl t2d, cl
|
||||
shl t4, cl
|
||||
mov [t7+msac.rng], t2d
|
||||
|
@ -413,13 +414,20 @@ cglobal msac_decode_bool_equi, 0, 6, 0
|
|||
sub t2d, t1d ; r - v
|
||||
sub t4, rax ; dif - vw
|
||||
cmovb t2d, t1d
|
||||
mov t1d, [t0+msac.cnt]
|
||||
cmovb t4, t3
|
||||
movifnidn t7, t0
|
||||
mov ecx, 0xbfff
|
||||
setb al ; the upper 32 bits contain garbage but that's OK
|
||||
sub ecx, t2d
|
||||
not t4
|
||||
; In the case of this function, (d =) 16 - clz(v) = 2 - (v >> 14)
|
||||
; i.e. (0 <= d <= 2) and v < (3 << 14)
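; (0xbfff = (3 << 14) - 1, so (0xbfff - v) >> 14 == 2 - (v >> 14) for any v < 3 << 14,
;  letting the shr below stand in for the usual bsr-based renormalization)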
|
||||
shr ecx, 14 ; d
|
||||
%if ARCH_X86_64 == 0
|
||||
movzx eax, al
|
||||
%endif
|
||||
jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3
|
||||
jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm5
|
||||
|
||||
cglobal msac_decode_bool, 0, 6, 0
|
||||
movifnidn t0, r0mp
|
||||
|
|
|
@ -115,7 +115,7 @@ int float_near_abs_eps_array_ulp(const float *a, const float *b, float eps,
|
|||
|
||||
#if HAVE_ASM
|
||||
#if ARCH_X86
|
||||
#ifdef _MSC_VER
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
#include <intrin.h>
|
||||
#define readtime() (_mm_lfence(), __rdtsc())
|
||||
#else
|
||||
|
|
|
@ -138,14 +138,21 @@ static int copy_subcoefs(coef *coeff,
|
|||
* dimensions are non-zero. This leads to branching to specific optimized
|
||||
* simd versions (e.g. dc-only) so that we get full asm coverage in this
|
||||
* test */
|
||||
const uint16_t *const scan = dav1d_scans[tx][dav1d_tx_type_class[txtp]];
|
||||
|
||||
const enum TxClass tx_class = dav1d_tx_type_class[txtp];
|
||||
const uint16_t *const scan = dav1d_scans[tx];
|
||||
const int sub_high = subsh > 0 ? subsh * 8 - 1 : 0;
|
||||
const int sub_low = subsh > 1 ? sub_high - 8 : 0;
|
||||
int n, eob;
|
||||
|
||||
for (n = 0, eob = 0; n < sw * sh; n++) {
|
||||
const int rc = scan[n];
|
||||
const int rcx = rc % sh, rcy = rc / sh;
|
||||
int rc, rcx, rcy;
|
||||
if (tx_class == TX_CLASS_2D)
|
||||
rc = scan[n], rcx = rc % sh, rcy = rc / sh;
|
||||
else if (tx_class == TX_CLASS_H)
|
||||
rcx = n % sh, rcy = n / sh, rc = n;
|
||||
else /* tx_class == TX_CLASS_V */
|
||||
rcx = n / sw, rcy = n % sw, rc = rcy * sh + rcx;
|
||||
|
||||
/* Pick a random eob within this sub-itx */
|
||||
if (rcx > sub_high || rcy > sub_high) {
|
||||
|
@ -156,8 +163,18 @@ static int copy_subcoefs(coef *coeff,
|
|||
|
||||
if (eob)
|
||||
eob += rnd() % (n - eob - 1);
|
||||
for (n = eob + 1; n < sw * sh; n++)
|
||||
coeff[scan[n]] = 0;
|
||||
if (tx_class == TX_CLASS_2D)
|
||||
for (n = eob + 1; n < sw * sh; n++)
|
||||
coeff[scan[n]] = 0;
|
||||
else if (tx_class == TX_CLASS_H)
|
||||
for (n = eob + 1; n < sw * sh; n++)
|
||||
coeff[n] = 0;
|
||||
else /* tx_class == TX_CLASS_V */ {
|
||||
for (int rcx = eob / sw, rcy = eob % sw; rcx < sh; rcx++, rcy = -1)
|
||||
while (++rcy < sw)
|
||||
coeff[rcy * sh + rcx] = 0;
|
||||
n = sw * sh;
|
||||
}
|
||||
for (; n < 32 * 32; n++)
|
||||
coeff[n] = rnd();
|
||||
return eob;
|
||||
|
|
|
@ -41,24 +41,30 @@ static int to_binary(int x) { /* 0-15 -> 0000-1111 */
|
|||
static void init_tmp(pixel *buf, const ptrdiff_t stride,
|
||||
const int w, const int h, const int bitdepth_max)
|
||||
{
|
||||
const int noise_mask = bitdepth_max >> 4;
|
||||
const int x_off = rnd() & 7, y_off = rnd() & 7;
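/* 8x8 checkerboard of 0 / bitdepth_max at a random offset, XORed with low-amplitude noise */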
|
||||
|
||||
for (int y = 0; y < h; y++) {
|
||||
for (int x = 0; x < w; x++)
|
||||
buf[x] = rnd() & bitdepth_max;
|
||||
for (int x = 0; x < w; x++) {
|
||||
buf[x] = (((x + x_off) ^ (y + y_off)) & 8 ? bitdepth_max : 0) ^
|
||||
(rnd() & noise_mask);
|
||||
}
|
||||
buf += PXSTRIDE(stride);
|
||||
}
|
||||
}
|
||||
|
||||
static void check_wiener(Dav1dLoopRestorationDSPContext *const c, const int bpc) {
|
||||
ALIGN_STK_64(pixel, c_dst, 448 * 64,);
|
||||
ALIGN_STK_64(pixel, a_dst, 448 * 64,);
|
||||
ALIGN_STK_64(pixel, h_edge, 448 * 8,);
|
||||
ALIGN_STK_16(int16_t, filter, 2, [8]);
|
||||
ALIGN_STK_64(pixel, c_src, 448 * 64,), *const c_dst = c_src + 32;
|
||||
ALIGN_STK_64(pixel, a_src, 448 * 64,), *const a_dst = a_src + 32;
|
||||
ALIGN_STK_64(pixel, edge_buf, 448 * 8,), *const h_edge = edge_buf + 32;
|
||||
pixel left[64][4];
|
||||
LooprestorationParams params;
|
||||
int16_t (*const filter)[8] = params.filter;
|
||||
|
||||
declare_func(void, pixel *dst, ptrdiff_t dst_stride,
|
||||
const pixel (*const left)[4],
|
||||
const pixel *lpf, ptrdiff_t lpf_stride,
|
||||
int w, int h, const int16_t filter[2][8],
|
||||
int w, int h, const LooprestorationParams *params,
|
||||
enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);
|
||||
|
||||
for (int t = 0; t < 2; t++) {
|
||||
|
@ -80,24 +86,24 @@ static void check_wiener(Dav1dLoopRestorationDSPContext *const c, const int bpc)
|
|||
const int base_h = 1 + (rnd() & 63);
|
||||
const int bitdepth_max = (1 << bpc) - 1;
|
||||
|
||||
init_tmp(c_dst, 448 * sizeof(pixel), 448, 64, bitdepth_max);
|
||||
init_tmp(h_edge, 448 * sizeof(pixel), 448, 8, bitdepth_max);
|
||||
init_tmp(c_src, 448 * sizeof(pixel), 448, 64, bitdepth_max);
|
||||
init_tmp(edge_buf, 448 * sizeof(pixel), 448, 8, bitdepth_max);
|
||||
init_tmp((pixel *) left, 4 * sizeof(pixel), 4, 64, bitdepth_max);
|
||||
|
||||
for (enum LrEdgeFlags edges = 0; edges <= 0xf; edges++) {
|
||||
const int w = edges & LR_HAVE_RIGHT ? 256 : base_w;
|
||||
const int h = edges & LR_HAVE_BOTTOM ? 64 : base_h;
|
||||
|
||||
memcpy(a_dst, c_dst, 448 * 64 * sizeof(pixel));
|
||||
memcpy(a_src, c_src, 448 * 64 * sizeof(pixel));
|
||||
|
||||
call_ref(c_dst + 32, 448 * sizeof(pixel), left,
|
||||
h_edge + 32, 448 * sizeof(pixel),
|
||||
w, h, filter, edges HIGHBD_TAIL_SUFFIX);
|
||||
call_new(a_dst + 32, 448 * sizeof(pixel), left,
|
||||
h_edge + 32, 448 * sizeof(pixel),
|
||||
w, h, filter, edges HIGHBD_TAIL_SUFFIX);
|
||||
if (checkasm_check_pixel(c_dst + 32, 448 * sizeof(pixel),
|
||||
a_dst + 32, 448 * sizeof(pixel),
|
||||
call_ref(c_dst, 448 * sizeof(pixel), left,
|
||||
h_edge, 448 * sizeof(pixel),
|
||||
w, h, &params, edges HIGHBD_TAIL_SUFFIX);
|
||||
call_new(a_dst, 448 * sizeof(pixel), left,
|
||||
h_edge, 448 * sizeof(pixel),
|
||||
w, h, &params, edges HIGHBD_TAIL_SUFFIX);
|
||||
if (checkasm_check_pixel(c_dst, 448 * sizeof(pixel),
|
||||
a_dst, 448 * sizeof(pixel),
|
||||
w, h, "dst"))
|
||||
{
|
||||
fprintf(stderr, "size = %dx%d, edges = %04d\n",
|
||||
|
@ -105,63 +111,72 @@ static void check_wiener(Dav1dLoopRestorationDSPContext *const c, const int bpc)
|
|||
break;
|
||||
}
|
||||
}
|
||||
bench_new(a_dst + 32, 448 * sizeof(pixel), left,
|
||||
h_edge + 32, 448 * sizeof(pixel),
|
||||
256, 64, filter, 0xf HIGHBD_TAIL_SUFFIX);
|
||||
bench_new(a_dst, 448 * sizeof(pixel), left,
|
||||
h_edge, 448 * sizeof(pixel),
|
||||
256, 64, &params, 0xf HIGHBD_TAIL_SUFFIX);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void check_sgr(Dav1dLoopRestorationDSPContext *const c, const int bpc) {
|
||||
ALIGN_STK_64(pixel, c_dst, 448 * 64,);
|
||||
ALIGN_STK_64(pixel, a_dst, 448 * 64,);
|
||||
ALIGN_STK_64(pixel, h_edge, 448 * 8,);
|
||||
ALIGN_STK_64(pixel, c_src, 448 * 64,), *const c_dst = c_src + 32;
|
||||
ALIGN_STK_64(pixel, a_src, 448 * 64,), *const a_dst = a_src + 32;
|
||||
ALIGN_STK_64(pixel, edge_buf, 448 * 8,), *const h_edge = edge_buf + 32;
|
||||
pixel left[64][4];
|
||||
LooprestorationParams params;
|
||||
|
||||
declare_func(void, pixel *dst, ptrdiff_t dst_stride,
|
||||
const pixel (*const left)[4],
|
||||
const pixel *lpf, ptrdiff_t lpf_stride,
|
||||
int w, int h, int sgr_idx,
|
||||
const int16_t sgr_wt[7], enum LrEdgeFlags edges
|
||||
HIGHBD_DECL_SUFFIX);
|
||||
int w, int h, const LooprestorationParams *params,
|
||||
enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);
|
||||
|
||||
for (int sgr_idx = 14; sgr_idx >= 6; sgr_idx -= 4) {
|
||||
if (check_func(c->selfguided, "selfguided_%s_%dbpc",
|
||||
sgr_idx == 6 ? "mix" : sgr_idx == 10 ? "3x3" : "5x5", bpc))
|
||||
{
|
||||
int16_t sgr_wt[2];
|
||||
static const struct { char name[4]; uint8_t idx; } sgr_data[3] = {
|
||||
{ "5x5", 14 },
|
||||
{ "3x3", 10 },
|
||||
{ "mix", 0 },
|
||||
};
|
||||
|
||||
sgr_wt[0] = dav1d_sgr_params[sgr_idx][0] ? (rnd() & 127) - 96 : 0;
|
||||
sgr_wt[1] = dav1d_sgr_params[sgr_idx][1] ? (rnd() & 127) - 32 :
|
||||
iclip(128 - sgr_wt[0], -32, 95);
|
||||
for (int i = 0; i < 3; i++) {
|
||||
if (check_func(c->sgr[i], "sgr_%s_%dbpc", sgr_data[i].name, bpc)) {
|
||||
const uint16_t *const sgr_params = dav1d_sgr_params[sgr_data[i].idx];
|
||||
params.sgr.s0 = sgr_params[0];
|
||||
params.sgr.s1 = sgr_params[1];
|
||||
params.sgr.w0 = sgr_params[0] ? (rnd() & 127) - 96 : 0;
|
||||
params.sgr.w1 = (sgr_params[1] ? 160 - (rnd() & 127) : 33) - params.sgr.w0;
|
||||
|
||||
const int base_w = 1 + (rnd() % 384);
|
||||
const int base_h = 1 + (rnd() & 63);
|
||||
const int bitdepth_max = (1 << bpc) - 1;
|
||||
|
||||
init_tmp(c_dst, 448 * sizeof(pixel), 448, 64, bitdepth_max);
|
||||
init_tmp(h_edge, 448 * sizeof(pixel), 448, 8, bitdepth_max);
|
||||
init_tmp(c_src, 448 * sizeof(pixel), 448, 64, bitdepth_max);
|
||||
init_tmp(edge_buf, 448 * sizeof(pixel), 448, 8, bitdepth_max);
|
||||
init_tmp((pixel *) left, 4 * sizeof(pixel), 4, 64, bitdepth_max);
|
||||
|
||||
for (enum LrEdgeFlags edges = 0; edges <= 0xf; edges++) {
|
||||
const int w = edges & LR_HAVE_RIGHT ? 256 : base_w;
|
||||
const int h = edges & LR_HAVE_BOTTOM ? 64 : base_h;
|
||||
|
||||
memcpy(a_dst, c_dst, 448 * 64 * sizeof(pixel));
|
||||
memcpy(a_src, c_src, 448 * 64 * sizeof(pixel));
|
||||
|
||||
call_ref(c_dst + 32, 448 * sizeof(pixel), left,
|
||||
h_edge + 32, 448 * sizeof(pixel),
|
||||
w, h, sgr_idx, sgr_wt, edges HIGHBD_TAIL_SUFFIX);
|
||||
call_new(a_dst + 32, 448 * sizeof(pixel), left,
|
||||
h_edge + 32, 448 * sizeof(pixel),
|
||||
w, h, sgr_idx, sgr_wt, edges HIGHBD_TAIL_SUFFIX);
|
||||
checkasm_check_pixel(c_dst + 32, 448 * sizeof(pixel),
|
||||
a_dst + 32, 448 * sizeof(pixel),
|
||||
w, h, "dst");
|
||||
call_ref(c_dst, 448 * sizeof(pixel), left,
|
||||
h_edge, 448 * sizeof(pixel),
|
||||
w, h, &params, edges HIGHBD_TAIL_SUFFIX);
|
||||
call_new(a_dst, 448 * sizeof(pixel), left,
|
||||
h_edge, 448 * sizeof(pixel),
|
||||
w, h, &params, edges HIGHBD_TAIL_SUFFIX);
|
||||
if (checkasm_check_pixel(c_dst, 448 * sizeof(pixel),
|
||||
a_dst, 448 * sizeof(pixel),
|
||||
w, h, "dst"))
|
||||
{
|
||||
fprintf(stderr, "size = %dx%d, edges = %04d\n",
|
||||
w, h, to_binary(edges));
|
||||
break;
|
||||
}
|
||||
}
|
||||
bench_new(a_dst + 32, 448 * sizeof(pixel), left,
|
||||
h_edge + 32, 448 * sizeof(pixel),
|
||||
256, 64, sgr_idx, sgr_wt, 0xf HIGHBD_TAIL_SUFFIX);
|
||||
bench_new(a_dst, 448 * sizeof(pixel), left,
|
||||
h_edge, 448 * sizeof(pixel),
|
||||
256, 64, &params, 0xf HIGHBD_TAIL_SUFFIX);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -193,7 +193,6 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
|
|||
}
|
||||
|
||||
cleanup:
|
||||
dav1d_flush(ctx);
|
||||
dav1d_close(&ctx);
|
||||
end:
|
||||
return 0;
|
||||
|
|
|
@ -76,8 +76,6 @@ if is_asm_enabled
|
|||
checkasm_sources += checkasm_asm_sources
|
||||
endif
|
||||
|
||||
m_lib = cc.find_library('m', required: false)
|
||||
|
||||
checkasm = executable('checkasm',
|
||||
checkasm_sources,
|
||||
checkasm_asm_objs,
|
||||
|
@ -94,7 +92,7 @@ if is_asm_enabled
|
|||
thread_dependency,
|
||||
rt_dependency,
|
||||
libdl_dependency,
|
||||
m_lib,
|
||||
libm_dependency,
|
||||
],
|
||||
)
|
||||
|
||||
|
@ -127,6 +125,26 @@ endforeach
|
|||
# fuzzing binaries
|
||||
subdir('libfuzzer')
|
||||
|
||||
# seek stress test binary, depends on dav1d cli tool
|
||||
if get_option('enable_tools')
|
||||
seek_stress_sources = files('seek_stress.c')
|
||||
seek_stress = executable('seek_stress',
|
||||
seek_stress_sources, rev_target,
|
||||
objects: [
|
||||
dav1d.extract_objects('dav1d_cli_parse.c'),
|
||||
dav1d_input_objs.extract_objects('input/input.c', 'input/ivf.c'),
|
||||
],
|
||||
include_directories: [dav1d_inc_dirs, include_directories('../tools')],
|
||||
link_with: libdav1d,
|
||||
dependencies: [
|
||||
thread_dependency,
|
||||
rt_dependency,
|
||||
getopt_dependency,
|
||||
libm_dependency,
|
||||
],
|
||||
)
|
||||
endif
|
||||
|
||||
# Include dav1d test data repository with additional tests
|
||||
if get_option('testdata_tests')
|
||||
subdir('dav1d-test-data')
|
||||
|
|
|
@ -0,0 +1,243 @@
|
|||
/*
|
||||
* Copyright © 2020, VideoLAN and dav1d authors
|
||||
* Copyright © 2020, Two Orioles, LLC
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "vcs_version.h"
|
||||
#include "cli_config.h"
|
||||
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "dav1d/dav1d.h"
|
||||
#include "input/input.h"
|
||||
#include "input/demuxer.h"
|
||||
#include "dav1d_cli_parse.h"
|
||||
|
||||
#define NUM_RAND_SEEK 3
|
||||
#define NUM_REL_SEEK 4
|
||||
#define NUM_END_SEEK 2
|
||||
|
||||
const Demuxer annexb_demuxer = { .name = "" };
|
||||
const Demuxer section5_demuxer = { .name = "" };
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <windows.h>
|
||||
static unsigned get_seed(void) {
|
||||
return GetTickCount();
|
||||
}
|
||||
#else
|
||||
#ifdef __APPLE__
|
||||
#include <mach/mach_time.h>
|
||||
#else
|
||||
#include <time.h>
|
||||
#endif
|
||||
static unsigned get_seed(void) {
|
||||
#ifdef __APPLE__
|
||||
return (unsigned) mach_absolute_time();
|
||||
#elif defined(HAVE_CLOCK_GETTIME)
|
||||
struct timespec ts;
|
||||
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||
return (unsigned) (1000000000ULL * ts.tv_sec + ts.tv_nsec);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
static uint32_t xs_state[4];
|
||||
|
||||
static void xor128_srand(unsigned seed) {
|
||||
xs_state[0] = seed;
|
||||
xs_state[1] = ( seed & 0xffff0000) | (~seed & 0x0000ffff);
|
||||
xs_state[2] = (~seed & 0xffff0000) | ( seed & 0x0000ffff);
|
||||
xs_state[3] = ~seed;
|
||||
}
|
||||
|
||||
// xor128 from Marsaglia, George (July 2003). "Xorshift RNGs".
|
||||
// Journal of Statistical Software. 8 (14).
|
||||
// doi:10.18637/jss.v008.i14.
|
||||
static int xor128_rand(void) {
|
||||
const uint32_t x = xs_state[0];
|
||||
const uint32_t t = x ^ (x << 11);
|
||||
|
||||
xs_state[0] = xs_state[1];
|
||||
xs_state[1] = xs_state[2];
|
||||
xs_state[2] = xs_state[3];
|
||||
uint32_t w = xs_state[3];
|
||||
|
||||
w = (w ^ (w >> 19)) ^ (t ^ (t >> 8));
|
||||
xs_state[3] = w;
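// w >> 1 keeps the result within non-negative int range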
|
||||
|
||||
return w >> 1;
|
||||
}
|
||||
|
||||
static inline int decode_frame(Dav1dPicture *const p,
|
||||
Dav1dContext *const c, Dav1dData *const data)
|
||||
{
|
||||
int res;
|
||||
memset(p, 0, sizeof(*p));
|
||||
if ((res = dav1d_send_data(c, data)) < 0) {
|
||||
if (res != DAV1D_ERR(EAGAIN)) {
|
||||
fprintf(stderr, "Error decoding frame: %s\n",
|
||||
strerror(DAV1D_ERR(res)));
|
||||
return res;
|
||||
}
|
||||
}
|
||||
if ((res = dav1d_get_picture(c, p)) < 0) {
|
||||
if (res != DAV1D_ERR(EAGAIN)) {
|
||||
fprintf(stderr, "Error decoding frame: %s\n",
|
||||
strerror(DAV1D_ERR(res)));
|
||||
return res;
|
||||
}
|
||||
} else dav1d_picture_unref(p);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int decode_rand(DemuxerContext *const in, Dav1dContext *const c,
|
||||
Dav1dData *const data, const double fps)
|
||||
{
|
||||
int res = 0;
|
||||
Dav1dPicture p;
|
||||
const int num_frames = xor128_rand() % (int)(fps * 5);
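// decode a random number of frames, up to roughly five seconds' worth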
|
||||
for (int i = 0; i < num_frames; i++) {
|
||||
if ((res = decode_frame(&p, c, data))) break;
|
||||
if (input_read(in, data) || data->sz == 0) break;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
static int decode_all(DemuxerContext *const in,
|
||||
Dav1dContext *const c, Dav1dData *const data)
|
||||
{
|
||||
int res = 0;
|
||||
Dav1dPicture p;
|
||||
do { if ((res = decode_frame(&p, c, data))) break;
|
||||
} while (!input_read(in, data) && data->sz > 0);
|
||||
return res;
|
||||
}
|
||||
|
||||
static int seek(DemuxerContext *const in, Dav1dContext *const c,
|
||||
const uint64_t pts, Dav1dData *const data)
|
||||
{
|
||||
int res;
|
||||
if ((res = input_seek(in, pts))) return res;
|
||||
Dav1dSequenceHeader seq;
|
||||
do { if ((res = input_read(in, data))) break;
|
||||
} while (dav1d_parse_sequence_header(&seq, data->data, data->sz));
|
||||
dav1d_flush(c);
|
||||
return res;
|
||||
}
|
||||
|
||||
int main(const int argc, char *const *const argv) {
|
||||
const char *version = dav1d_version();
|
||||
if (strcmp(version, DAV1D_VERSION)) {
|
||||
fprintf(stderr, "Version mismatch (library: %s, executable: %s)\n",
|
||||
version, DAV1D_VERSION);
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
CLISettings cli_settings;
|
||||
Dav1dSettings lib_settings;
|
||||
DemuxerContext *in;
|
||||
Dav1dContext *c;
|
||||
Dav1dData data;
|
||||
unsigned total, i_fps[2], i_timebase[2];
|
||||
double timebase, spf, fps;
|
||||
uint64_t pts;
|
||||
|
||||
xor128_srand(get_seed());
|
||||
parse(argc, argv, &cli_settings, &lib_settings);
|
||||
|
||||
if (input_open(&in, "ivf", cli_settings.inputfile,
|
||||
i_fps, &total, i_timebase) < 0 ||
|
||||
!i_timebase[0] || !i_timebase[1] || !i_fps[0] || !i_fps[1])
|
||||
{
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
if (dav1d_open(&c, &lib_settings))
|
||||
return EXIT_FAILURE;
|
||||
|
||||
timebase = (double)i_timebase[1] / i_timebase[0];
|
||||
spf = (double)i_fps[1] / i_fps[0];
|
||||
fps = (double)i_fps[0] / i_fps[1];
|
||||
if (fps < 1) goto end;
|
||||
|
||||
#define FRAME_OFFSET_TO_PTS(foff) \
|
||||
(uint64_t)llround(((foff) * spf) * 1000000000.0)
|
||||
#define TS_TO_PTS(ts) \
|
||||
(uint64_t)llround(((ts) * timebase) * 1000000000.0)
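// both macros express a position as a pts in nanoseconds, either from a frame index (via spf) or from a container timestamp (via timebase)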
|
||||
|
||||
// seek at random pts
|
||||
for (int i = 0; i < NUM_RAND_SEEK; i++) {
|
||||
pts = FRAME_OFFSET_TO_PTS(xor128_rand() % total);
|
||||
if (seek(in, c, pts, &data)) continue;
|
||||
if (decode_rand(in, c, &data, fps)) goto end;
|
||||
}
|
||||
pts = TS_TO_PTS(data.m.timestamp);
|
||||
|
||||
// seek left / right randomly with random intervals within 1s
|
||||
for (int i = 0, tries = 0;
|
||||
i - tries < NUM_REL_SEEK && tries < NUM_REL_SEEK / 2;
|
||||
i++)
|
||||
{
|
||||
const int sign = xor128_rand() & 1 ? -1 : +1;
|
||||
const float diff = (xor128_rand() % 100) / 100.f;
|
||||
int64_t new_pts = pts + sign * FRAME_OFFSET_TO_PTS(diff * fps);
|
||||
const int64_t new_ts = llround(new_pts / (timebase * 1000000000.0));
|
||||
new_pts = TS_TO_PTS(new_ts);
|
||||
if (new_pts < 0 || (uint64_t)new_pts >= FRAME_OFFSET_TO_PTS(total)) {
|
||||
if (seek(in, c, FRAME_OFFSET_TO_PTS(total / 2), &data)) break;
|
||||
pts = TS_TO_PTS(data.m.timestamp);
|
||||
tries++;
|
||||
continue;
|
||||
}
|
||||
if (seek(in, c, new_pts, &data))
|
||||
if (seek(in, c, 0, &data)) goto end;
|
||||
if (decode_rand(in, c, &data, fps)) goto end;
|
||||
pts = TS_TO_PTS(data.m.timestamp);
|
||||
}
|
||||
|
||||
unsigned shift = 0;
|
||||
do {
|
||||
shift += 5;
|
||||
if (shift > total)
|
||||
shift = total;
|
||||
} while (seek(in, c, FRAME_OFFSET_TO_PTS(total - shift), &data));
|
||||
|
||||
// simulate seeking after the end of the file
|
||||
for (int i = 0; i < NUM_END_SEEK; i++) {
|
||||
if (seek(in, c, FRAME_OFFSET_TO_PTS(total - shift), &data)) goto end;
|
||||
if (decode_all(in, c, &data)) goto end;
|
||||
int num_flush = 1 + 64 + xor128_rand() % 64;
|
||||
while (num_flush--) dav1d_flush(c);
|
||||
}
|
||||
|
||||
end:
|
||||
input_close(in);
|
||||
dav1d_close(&c);
|
||||
return EXIT_SUCCESS;
|
||||
}
|
|
@ -197,7 +197,6 @@ int main(const int argc, char *const *const argv) {
|
|||
seq_skip);
|
||||
}
|
||||
|
||||
//getc(stdin);
|
||||
if (cli_settings.limit != 0 && cli_settings.limit < total)
|
||||
total = cli_settings.limit;
|
||||
|
||||
|
|
|
@ -26,6 +26,7 @@
|
|||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "cli_config.h"
|
||||
|
||||
#include <getopt.h>
|
||||
#include <limits.h>
|
||||
|
@ -51,6 +52,7 @@ enum {
|
|||
ARG_REALTIME_CACHE,
|
||||
ARG_FRAME_THREADS,
|
||||
ARG_TILE_THREADS,
|
||||
ARG_POSTFILTER_THREADS,
|
||||
ARG_VERIFY,
|
||||
ARG_FILM_GRAIN,
|
||||
ARG_OPPOINT,
|
||||
|
@ -73,6 +75,7 @@ static const struct option long_opts[] = {
|
|||
{ "realtimecache", 1, NULL, ARG_REALTIME_CACHE },
|
||||
{ "framethreads", 1, NULL, ARG_FRAME_THREADS },
|
||||
{ "tilethreads", 1, NULL, ARG_TILE_THREADS },
|
||||
{ "pfthreads", 1, NULL, ARG_POSTFILTER_THREADS },
|
||||
{ "verify", 1, NULL, ARG_VERIFY },
|
||||
{ "filmgrain", 1, NULL, ARG_FILM_GRAIN },
|
||||
{ "oppoint", 1, NULL, ARG_OPPOINT },
|
||||
|
@ -82,6 +85,12 @@ static const struct option long_opts[] = {
|
|||
{ NULL, 0, NULL, 0 },
|
||||
};
|
||||
|
||||
#if HAVE_XXHASH_H
|
||||
#define AVAILABLE_MUXERS "'md5', 'xxh3', 'yuv', 'yuv4mpeg2' or 'null'"
|
||||
#else
|
||||
#define AVAILABLE_MUXERS "'md5', 'yuv', 'yuv4mpeg2' or 'null'"
|
||||
#endif
|
||||
|
||||
#if ARCH_AARCH64 || ARCH_ARM
|
||||
#define ALLOWED_CPU_MASKS " or 'neon'"
|
||||
#elif ARCH_PPC64LE
|
||||
|
@ -107,7 +116,7 @@ static void usage(const char *const app, const char *const reason, ...) {
|
|||
" --input/-i $file: input file\n"
|
||||
" --output/-o $file: output file\n"
|
||||
" --demuxer $name: force demuxer type ('ivf', 'section5' or 'annexb'; default: detect from content)\n"
|
||||
" --muxer $name: force muxer type ('md5', 'yuv', 'yuv4mpeg2' or 'null'; default: detect from extension)\n"
|
||||
" --muxer $name: force muxer type (" AVAILABLE_MUXERS "; default: detect from extension)\n"
|
||||
" --quiet/-q: disable status messages\n"
|
||||
" --frametimes $file: dump frame times to file\n"
|
||||
" --limit/-l $num: stop decoding after $num frames\n"
|
||||
|
@ -117,7 +126,8 @@ static void usage(const char *const app, const char *const reason, ...) {
|
|||
" --version/-v: print version and exit\n"
|
||||
" --framethreads $num: number of frame threads (default: 1)\n"
|
||||
" --tilethreads $num: number of tile threads (default: 1)\n"
|
||||
" --filmgrain $num: enable film grain application (default: 1, except if muxer is md5)\n"
|
||||
" --pfthreads $num: number of postfilter threads (default: 1)\n"
|
||||
" --filmgrain $num: enable film grain application (default: 1, except if muxer is md5 or xxh3)\n"
|
||||
" --oppoint $num: select an operating point of a scalable AV1 bitstream (0 - 31)\n"
|
||||
" --alllayers $num: output all spatial layers of a scalable AV1 bitstream (default: 1)\n"
|
||||
" --sizelimit $num: stop decoding if the frame size exceeds the specified limit\n"
|
||||
|
@ -198,24 +208,26 @@ static const EnumParseTable cpu_mask_tbl[] = {
|
|||
{ "avx2", X86_CPU_MASK_AVX2 },
|
||||
{ "avx512icl", X86_CPU_MASK_AVX512ICL },
|
||||
#endif
|
||||
{ 0 },
|
||||
{ "none", 0 },
|
||||
};
|
||||
|
||||
#define ARRAY_SIZE(n) (sizeof(n)/sizeof(*(n)))
|
||||
|
||||
static unsigned parse_enum(char *optarg, const EnumParseTable *const tbl,
|
||||
const int option, const char *app)
|
||||
const int tbl_sz, const int option, const char *app)
|
||||
{
|
||||
char str[1024];
|
||||
|
||||
strcpy(str, "any of ");
|
||||
for (int n = 0; tbl[n].str; n++) {
|
||||
for (int n = 0; n < tbl_sz; n++) {
|
||||
if (!strcmp(tbl[n].str, optarg))
|
||||
return tbl[n].val;
|
||||
|
||||
if (n) {
|
||||
if (!tbl[n + 1].str)
|
||||
strcat(str, " or ");
|
||||
else
|
||||
if (n < tbl_sz - 1)
|
||||
strcat(str, ", ");
|
||||
else
|
||||
strcat(str, " or ");
|
||||
}
|
||||
strcat(str, tbl[n].str);
|
||||
}
|
||||
|
@ -295,6 +307,10 @@ void parse(const int argc, char *const *const argv,
|
|||
lib_settings->n_tile_threads =
|
||||
parse_unsigned(optarg, ARG_TILE_THREADS, argv[0]);
|
||||
break;
|
||||
case ARG_POSTFILTER_THREADS:
|
||||
lib_settings->n_postfilter_threads =
|
||||
parse_unsigned(optarg, ARG_POSTFILTER_THREADS, argv[0]);
|
||||
break;
|
||||
case ARG_VERIFY:
|
||||
cli_settings->verify = optarg;
|
||||
break;
|
||||
|
@ -325,7 +341,7 @@ void parse(const int argc, char *const *const argv,
|
|||
fprintf(stderr, "%s\n", dav1d_version());
|
||||
exit(0);
|
||||
case ARG_CPU_MASK:
|
||||
dav1d_set_cpu_flags_mask(parse_enum(optarg, cpu_mask_tbl,
|
||||
dav1d_set_cpu_flags_mask(parse_enum(optarg, cpu_mask_tbl, ARRAY_SIZE(cpu_mask_tbl),
|
||||
ARG_CPU_MASK, argv[0]));
|
||||
break;
|
||||
default:
|
||||
|
@ -338,8 +354,11 @@ void parse(const int argc, char *const *const argv,
|
|||
if (cli_settings->verify) {
|
||||
if (cli_settings->outputfile)
|
||||
usage(argv[0], "Verification (--verify) requires output file (-o/--output) to not set");
|
||||
if (cli_settings->muxer && !strcmp(cli_settings->muxer, "md5"))
|
||||
usage(argv[0], "Verification (--verify) requires the md5 muxer (--muxer md5)");
|
||||
if (cli_settings->muxer && strcmp(cli_settings->muxer, "md5") &&
|
||||
strcmp(cli_settings->muxer, "xxh3"))
|
||||
{
|
||||
usage(argv[0], "Verification (--verify) requires a checksum muxer (md5 or xxh3)");
|
||||
}
|
||||
|
||||
cli_settings->outputfile = "-";
|
||||
if (!cli_settings->muxer)
|
||||
|
@ -347,7 +366,8 @@ void parse(const int argc, char *const *const argv,
|
|||
}
|
||||
|
||||
if (!grain_specified && cli_settings->muxer &&
|
||||
!strcmp(cli_settings->muxer, "md5"))
|
||||
(!strcmp(cli_settings->muxer, "md5") ||
|
||||
!strcmp(cli_settings->muxer, "xxh3")))
|
||||
{
|
||||
lib_settings->apply_grain = 0;
|
||||
}
|
||||
|
|
|
@ -191,5 +191,6 @@ const Demuxer annexb_demuxer = {
|
|||
.probe_sz = PROBE_SIZE,
|
||||
.open = annexb_open,
|
||||
.read = annexb_read,
|
||||
.seek = NULL,
|
||||
.close = annexb_close,
|
||||
};
|
||||
|
|