Bug 1688992 - Update libdav1d to 0.8.2 for Firefox 88. r=dminor

Differential Revision: https://phabricator.services.mozilla.com/D106197
Jon Bauman 2021-02-24 23:05:38 +00:00
Parent a34308d565
Commit 7c5470c9ff
109 changed files: 14300 additions and 5741 deletions

View file

@ -25,6 +25,10 @@ The rough steps are:
- Update ./moz.build and ./asm/moz.build to add new files and remove deleted ones using
third_party/dav1d/src/meson.build as a guide (confirm with the diff) (note the
empty .asm file in x86_64)
- Some files will be automatically added to the various autovendored_sources.mozbuild files.
  In the case of the asm dir, these may cause build failures on particular platforms, which
  can be resolved by moving those files out of autovendored_sources.mozbuild and into the
  regular moz.build, which has a condition on CONFIG['CPU_ARCH'] (see the sketch after this list).
- Clone the tag from the dav1d repo and build a stand-alone libdav1d following the steps here:
https://code.videolan.org/videolan/dav1d#compile
- Copy vcs_version.h from the local build/include/vcs_version.h
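
As an illustration of the autovendored note above, this is roughly what the arch-guarded list in asm/moz.build looks like after such a move; the snippet is a sketch assembled from the hunks shown below, not the complete build file:

# Sketch only: an .asm file that breaks the build on some platforms is removed
# from autovendored_sources.mozbuild and listed under the CPU_ARCH guard instead.
if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
    SOURCES += [
        '../../../third_party/dav1d/src/x86/cdef16_avx2.asm',  # moved from autovendored
    ]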

View file

@ -1,4 +1,5 @@
sources = [
'../../../third_party/dav1d/src/x86/cdef16_sse.asm',
'../../../third_party/dav1d/src/x86/cdef_sse.asm',
'../../../third_party/dav1d/src/x86/cpuid.asm',
'../../../third_party/dav1d/src/x86/film_grain_ssse3.asm',

View file

@ -83,6 +83,7 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
# Empty file on all other archs. Nasm produces
# an error when it compiles empty files.
SOURCES += [
'../../../third_party/dav1d/src/x86/cdef16_avx2.asm', # moved from autovendored
'../../../third_party/dav1d/src/x86/cdef_avx2.asm',
'../../../third_party/dav1d/src/x86/cdef_avx512.asm',
'../../../third_party/dav1d/src/x86/film_grain.asm',
@ -90,6 +91,7 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
'../../../third_party/dav1d/src/x86/itx.asm',
'../../../third_party/dav1d/src/x86/loopfilter.asm',
'../../../third_party/dav1d/src/x86/looprestoration.asm',
'../../../third_party/dav1d/src/x86/looprestoration16_avx2.asm', # moved from autovendored
'../../../third_party/dav1d/src/x86/mc_avx2.asm',
'../../../third_party/dav1d/src/x86/mc_avx512.asm',
]
@ -185,7 +187,9 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64':
'../../../third_party/dav1d/src/arm/32/cdef16.S',
'../../../third_party/dav1d/src/arm/32/cdef_tmpl.S',
'../../../third_party/dav1d/src/arm/32/ipred.S',
'../../../third_party/dav1d/src/arm/32/ipred16.S',
'../../../third_party/dav1d/src/arm/32/itx.S',
'../../../third_party/dav1d/src/arm/32/itx16.S',
'../../../third_party/dav1d/src/arm/32/loopfilter.S',
'../../../third_party/dav1d/src/arm/32/loopfilter16.S',
'../../../third_party/dav1d/src/arm/32/looprestoration.S',

View file

@ -163,6 +163,7 @@ EXPORTS.dav1d += [
'../../third_party/dav1d/include/common/attributes.h',
'../../third_party/dav1d/include/common/bitdepth.h',
'../../third_party/dav1d/include/common/dump.h',
'../../third_party/dav1d/include/common/frame.h',
'../../third_party/dav1d/include/common/intops.h',
'../../third_party/dav1d/include/common/validate.h',
]

View file

@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: commit 6ed5fafb42c651c24b6a65fd4f50ed426fd72d65 (2021-01-01T21:36:25.000+01:00).
release: commit f06148e7c755098666b9c0ed97a672a51785413a (2021-02-21T21:40:09.000+01:00).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: 6ed5fafb42c651c24b6a65fd4f50ed426fd72d65
revision: f06148e7c755098666b9c0ed97a672a51785413a
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/

View file

@ -1,2 +1,2 @@
/* auto-generated, do not edit */
#define DAV1D_VERSION "0.8.1-0-g6ed5faf"
#define DAV1D_VERSION "0.8.2-0-gf06148e"

View file

@ -29,6 +29,6 @@
#define DAV1D_API_VERSION_MAJOR 5
#define DAV1D_API_VERSION_MINOR 0
#define DAV1D_API_VERSION_PATCH 0
#define DAV1D_API_VERSION_PATCH 1
#endif /* DAV1D_VERSION_H */

25
third_party/dav1d/NEWS vendored
View file

@ -1,4 +1,25 @@
Changes for 0.8.1 'Eurasian hobby":
Changes for 0.8.2 'Eurasian hobby':
-----------------------------------
0.8.2 is a middle-size update of the 0.8.0 branch:
- ARM32 optimizations for ipred and itx in 10/12bits,
completing the 10b/12b work on ARM64 and ARM32
- Give the post-filters their own threads
- ARM64: rewrite the wiener functions
- Speed up coefficient decoding, 0.5%-3% global decoding gain
- x86 optimizations for CDEF_filter and wiener in 10/12bit
- x86: rewrite the SGR AVX2 asm
- x86: improve msac speed on SSE2+ machines
- ARM32: improve speed of ipred and warp
- ARM64: improve speed of ipred, cdef_dir, cdef_filter, warp_motion and itx16
- ARM32/64: improve speed of looprestoration
- Add seeking, pausing to the player
- Update the player for rendering of 10b/12b
- Misc speed improvements and fixes on all platforms
- Add a xxh3 muxer in the dav1d application
Changes for 0.8.1 'Eurasian hobby':
-----------------------------------
0.8.1 is a minor update on 0.8.0:
@ -10,7 +31,7 @@ Changes for 0.8.1 'Eurasian hobby":
- x86 optimizations for wiener in SSE2/SSSE3/AVX2
Changes for 0.8.0 'Eurasian hobby":
Changes for 0.8.0 'Eurasian hobby':
-----------------------------------
0.8.0 is a major update for dav1d:

427
third_party/dav1d/examples/dav1dplay.c vendored
View file

@ -39,6 +39,11 @@
#include "dp_fifo.h"
#include "dp_renderer.h"
#define FRAME_OFFSET_TO_PTS(foff) \
(uint64_t)(((foff) * rd_ctx->spf) * 1000000000.0 + .5)
#define TS_TO_PTS(ts) \
(uint64_t)(((ts) * rd_ctx->timebase) * 1000000000.0 + .5)
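
/* Illustrative note, not part of the patch: with a 24/1 fps stream, spf = 1/24 s,
 * so FRAME_OFFSET_TO_PTS(48) = (uint64_t)(48 * (1.0/24) * 1e9 + .5) = 2000000000 ns (2 s);
 * with millisecond container timestamps, timebase = 1/1000 s, so
 * TS_TO_PTS(500) = (uint64_t)(500 * 0.001 * 1e9 + .5) = 500000000 ns (0.5 s). */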
// Selected renderer callbacks and cookie
static const Dav1dPlayRenderInfo *renderer_info = { NULL };
@ -59,27 +64,43 @@ typedef struct render_context
// Lock to protect access to the context structure
SDL_mutex *lock;
// Timestamp of previous decoded frame
int64_t last_pts;
// Timestamp of current decoded frame
int64_t current_pts;
// Timestamp of last displayed frame (in timebase unit)
int64_t last_ts;
// Timestamp of last decoded frame (in timebase unit)
int64_t current_ts;
// Ticks when last frame was received
uint32_t last_ticks;
// PTS time base
double timebase;
// Seconds per frame
double spf;
// Number of frames
uint32_t total;
// Fifo
Dav1dPlayPtrFifo *fifo;
// Custom SDL2 event type
uint32_t renderer_event_type;
// Custom SDL2 event types
uint32_t event_types;
// User pause state
uint8_t user_paused;
// Internal pause state
uint8_t paused;
// Start of internal pause state
uint32_t pause_start;
// Duration of internal pause state
uint32_t pause_time;
// Seek accumulator
int seek;
// Indicates if termination of the decoder thread was requested
uint8_t dec_should_terminate;
} Dav1dPlayRenderContext;
static void dp_settings_print_usage(const char *const app,
const char *const reason, ...)
const char *const reason, ...)
{
if (reason) {
va_list args;
@ -95,6 +116,7 @@ static void dp_settings_print_usage(const char *const app,
" --untimed/-u: ignore PTS, render as fast as possible\n"
" --framethreads $num: number of frame threads (default: 1)\n"
" --tilethreads $num: number of tile threads (default: 1)\n"
" --pfthreads $num: number of postfilter threads(default: 1)\n"
" --highquality: enable high quality rendering\n"
" --zerocopy/-z: enable zero copy upload path\n"
" --gpugrain/-g: enable GPU grain synthesis\n"
@ -115,7 +137,7 @@ static unsigned parse_unsigned(const char *const optarg, const int option,
}
static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
const int argc, char *const *const argv)
const int argc, char *const *const argv)
{
int o;
Dav1dPlaySettings *settings = &rd_ctx->settings;
@ -127,6 +149,7 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
enum {
ARG_FRAME_THREADS = 256,
ARG_TILE_THREADS,
ARG_POSTFILTER_THREADS,
ARG_HIGH_QUALITY,
};
@ -137,6 +160,7 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
{ "untimed", 0, NULL, 'u' },
{ "framethreads", 1, NULL, ARG_FRAME_THREADS },
{ "tilethreads", 1, NULL, ARG_TILE_THREADS },
{ "pfthreads", 1, NULL, ARG_POSTFILTER_THREADS },
{ "highquality", 0, NULL, ARG_HIGH_QUALITY },
{ "zerocopy", 0, NULL, 'z' },
{ "gpugrain", 0, NULL, 'g' },
@ -175,6 +199,10 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
lib_settings->n_tile_threads =
parse_unsigned(optarg, ARG_TILE_THREADS, argv[0]);
break;
case ARG_POSTFILTER_THREADS:
lib_settings->n_postfilter_threads =
parse_unsigned(optarg, ARG_POSTFILTER_THREADS, argv[0]);
break;
default:
dp_settings_print_usage(argv[0], NULL);
}
@ -213,16 +241,16 @@ static Dav1dPlayRenderContext *dp_rd_ctx_create(int argc, char **argv)
Dav1dPlayRenderContext *rd_ctx;
// Alloc
rd_ctx = malloc(sizeof(Dav1dPlayRenderContext));
rd_ctx = calloc(1, sizeof(Dav1dPlayRenderContext));
if (rd_ctx == NULL) {
return NULL;
}
// Register a custom event to notify our SDL main thread
// about new frames
rd_ctx->renderer_event_type = SDL_RegisterEvents(1);
if (rd_ctx->renderer_event_type == UINT32_MAX) {
fprintf(stderr, "Failure to create custom SDL event type!\n");
rd_ctx->event_types = SDL_RegisterEvents(3);
if (rd_ctx->event_types == UINT32_MAX) {
fprintf(stderr, "Failure to create custom SDL event types!\n");
free(rd_ctx);
return NULL;
}
@ -265,24 +293,17 @@ static Dav1dPlayRenderContext *dp_rd_ctx_create(int argc, char **argv)
return NULL;
}
rd_ctx->last_pts = 0;
rd_ctx->last_ticks = 0;
rd_ctx->current_pts = 0;
rd_ctx->timebase = 0;
rd_ctx->dec_should_terminate = 0;
return rd_ctx;
}
/**
* Notify about new available frame
* Notify about new event
*/
static void dp_rd_ctx_post_event(Dav1dPlayRenderContext *rd_ctx, uint32_t code)
static void dp_rd_ctx_post_event(Dav1dPlayRenderContext *rd_ctx, uint32_t type)
{
SDL_Event event;
SDL_zero(event);
event.type = rd_ctx->renderer_event_type;
event.user.code = code;
event.type = type;
SDL_PushEvent(&event);
}
@ -294,10 +315,137 @@ static void dp_rd_ctx_post_event(Dav1dPlayRenderContext *rd_ctx, uint32_t code)
* new picture.
*/
static void dp_rd_ctx_update_with_dav1d_picture(Dav1dPlayRenderContext *rd_ctx,
Dav1dPicture *dav1d_pic)
Dav1dPicture *dav1d_pic)
{
rd_ctx->current_ts = dav1d_pic->m.timestamp;
renderer_info->update_frame(rd_ctx->rd_priv, dav1d_pic, &rd_ctx->settings);
rd_ctx->current_pts = dav1d_pic->m.timestamp;
}
/**
* Toggle pause state
*/
static void dp_rd_ctx_toggle_pause(Dav1dPlayRenderContext *rd_ctx)
{
SDL_LockMutex(rd_ctx->lock);
rd_ctx->user_paused = !rd_ctx->user_paused;
if (rd_ctx->seek)
goto out;
rd_ctx->paused = rd_ctx->user_paused;
uint32_t now = SDL_GetTicks();
if (rd_ctx->paused)
rd_ctx->pause_start = now;
else {
rd_ctx->pause_time += now - rd_ctx->pause_start;
rd_ctx->pause_start = 0;
rd_ctx->last_ticks = now;
}
out:
SDL_UnlockMutex(rd_ctx->lock);
}
/**
* Query pause state
*/
static int dp_rd_ctx_is_paused(Dav1dPlayRenderContext *rd_ctx)
{
int ret;
SDL_LockMutex(rd_ctx->lock);
ret = rd_ctx->paused;
SDL_UnlockMutex(rd_ctx->lock);
return ret;
}
/**
* Request seeking, in seconds
*/
static void dp_rd_ctx_seek(Dav1dPlayRenderContext *rd_ctx, int sec)
{
SDL_LockMutex(rd_ctx->lock);
rd_ctx->seek += sec;
if (!rd_ctx->paused)
rd_ctx->pause_start = SDL_GetTicks();
rd_ctx->paused = 1;
SDL_UnlockMutex(rd_ctx->lock);
}
static int decode_frame(Dav1dPicture **p, Dav1dContext *c,
Dav1dData *data, DemuxerContext *in_ctx);
static inline void destroy_pic(void *a);
/**
* Seek the stream, if requested
*/
static int dp_rd_ctx_handle_seek(Dav1dPlayRenderContext *rd_ctx,
DemuxerContext *in_ctx,
Dav1dContext *c, Dav1dData *data)
{
int res = 0;
SDL_LockMutex(rd_ctx->lock);
if (!rd_ctx->seek)
goto out;
int64_t seek = rd_ctx->seek * 1000000000ULL;
uint64_t pts = TS_TO_PTS(rd_ctx->current_ts);
pts = ((int64_t)pts > -seek) ? pts + seek : 0;
int end = pts >= FRAME_OFFSET_TO_PTS(rd_ctx->total);
if (end)
pts = FRAME_OFFSET_TO_PTS(rd_ctx->total - 1);
uint64_t target_pts = pts;
dav1d_flush(c);
uint64_t shift = FRAME_OFFSET_TO_PTS(5);
while (1) {
if (shift > pts)
shift = pts;
if ((res = input_seek(in_ctx, pts - shift)))
goto out;
Dav1dSequenceHeader seq;
uint64_t cur_pts;
do {
if ((res = input_read(in_ctx, data)))
break;
cur_pts = TS_TO_PTS(data->m.timestamp);
res = dav1d_parse_sequence_header(&seq, data->data, data->sz);
} while (res && cur_pts < pts);
if (!res && cur_pts <= pts)
break;
if (shift > pts)
shift = pts;
pts -= shift;
}
if (!res) {
pts = TS_TO_PTS(data->m.timestamp);
while (pts < target_pts) {
Dav1dPicture *p;
if ((res = decode_frame(&p, c, data, in_ctx)))
break;
if (p) {
pts = TS_TO_PTS(p->m.timestamp);
if (pts < target_pts)
destroy_pic(p);
else {
dp_fifo_push(rd_ctx->fifo, p);
uint32_t type = rd_ctx->event_types + DAV1D_EVENT_SEEK_FRAME;
dp_rd_ctx_post_event(rd_ctx, type);
}
}
}
if (!res) {
rd_ctx->last_ts = data->m.timestamp - rd_ctx->spf / rd_ctx->timebase;
rd_ctx->current_ts = data->m.timestamp;
}
}
out:
rd_ctx->paused = rd_ctx->user_paused;
if (!rd_ctx->paused && rd_ctx->seek) {
uint32_t now = SDL_GetTicks();
rd_ctx->pause_time += now - rd_ctx->pause_start;
rd_ctx->pause_start = 0;
rd_ctx->last_ticks = now;
}
rd_ctx->seek = 0;
SDL_UnlockMutex(rd_ctx->lock);
if (res)
fprintf(stderr, "Error seeking, aborting\n");
return res;
}
/**
@ -329,14 +477,15 @@ static int dp_rd_ctx_should_terminate(Dav1dPlayRenderContext *rd_ctx)
*/
static void dp_rd_ctx_render(Dav1dPlayRenderContext *rd_ctx)
{
SDL_LockMutex(rd_ctx->lock);
// Calculate time since last frame was received
uint32_t ticks_now = SDL_GetTicks();
uint32_t ticks_diff = (rd_ctx->last_ticks != 0) ? ticks_now - rd_ctx->last_ticks : 0;
// Calculate when to display the frame
int64_t pts_diff = rd_ctx->current_pts - rd_ctx->last_pts;
int32_t wait_time = (pts_diff * rd_ctx->timebase) * 1000 - ticks_diff;
rd_ctx->last_pts = rd_ctx->current_pts;
int64_t ts_diff = rd_ctx->current_ts - rd_ctx->last_ts;
int32_t pts_diff = (ts_diff * rd_ctx->timebase) * 1000.0 + .5;
int32_t wait_time = pts_diff - ticks_diff;
// In untimed mode, simply don't wait
if (rd_ctx->settings.untimed)
@ -347,13 +496,59 @@ static void dp_rd_ctx_render(Dav1dPlayRenderContext *rd_ctx)
// accurate player this would need to be done in a better way.
if (wait_time > 0) {
SDL_Delay(wait_time);
} else if (wait_time < -10) { // Do not warn for minor time drifts
fprintf(stderr, "Frame displayed %f seconds too late\n", wait_time/(float)1000);
} else if (wait_time < -10 && !rd_ctx->paused) { // Do not warn for minor time drifts
fprintf(stderr, "Frame displayed %f seconds too late\n", wait_time / 1000.0);
}
renderer_info->render(rd_ctx->rd_priv, &rd_ctx->settings);
rd_ctx->last_ts = rd_ctx->current_ts;
rd_ctx->last_ticks = SDL_GetTicks();
SDL_UnlockMutex(rd_ctx->lock);
}
static int decode_frame(Dav1dPicture **p, Dav1dContext *c,
Dav1dData *data, DemuxerContext *in_ctx)
{
int res;
// Send data packets we got from the demuxer to dav1d
if ((res = dav1d_send_data(c, data)) < 0) {
// On EAGAIN, dav1d can not consume more data and
// dav1d_get_picture needs to be called first, which
// will happen below, so just keep going in that case
// and do not error out.
if (res != DAV1D_ERR(EAGAIN)) {
dav1d_data_unref(data);
goto err;
}
}
*p = calloc(1, sizeof(**p));
// Try to get a decoded frame
if ((res = dav1d_get_picture(c, *p)) < 0) {
// In all error cases, even EAGAIN, p needs to be freed as
// it is never added to the queue and would leak.
free(*p);
*p = NULL;
// On EAGAIN, it means dav1d has not enough data to decode
// therefore this is not a decoding error but just means
// we need to feed it more data, which happens in the next
// run of the decoder loop.
if (res != DAV1D_ERR(EAGAIN))
goto err;
}
return data->sz == 0 ? input_read(in_ctx, data) : 0;
err:
fprintf(stderr, "Error decoding frame: %s\n",
strerror(-res));
return res;
}
static inline void destroy_pic(void *a)
{
Dav1dPicture *p = (Dav1dPicture *)a;
dav1d_picture_unref(p);
free(p);
}
/* Decoder thread "main" function */
@ -366,10 +561,7 @@ static int decoder_thread_main(void *cookie)
Dav1dData data;
DemuxerContext *in_ctx = NULL;
int res = 0;
unsigned n_out = 0, total, timebase[2], fps[2];
// Store current ticks for stats calculation
uint32_t decoder_start = SDL_GetTicks();
unsigned total, timebase[2], fps[2];
Dav1dPlaySettings settings = rd_ctx->settings;
@ -382,8 +574,9 @@ static int decoder_thread_main(void *cookie)
goto cleanup;
}
double timebase_d = timebase[1]/(double)timebase[0];
rd_ctx->timebase = timebase_d;
rd_ctx->timebase = (double)timebase[1] / timebase[0];
rd_ctx->spf = (double)fps[1] / fps[0];
rd_ctx->total = total;
if ((res = dav1d_open(&c, &rd_ctx->lib_settings))) {
fprintf(stderr, "Failed opening dav1d decoder\n");
@ -398,55 +591,29 @@ static int decoder_thread_main(void *cookie)
}
// Decoder loop
do {
if (dp_rd_ctx_should_terminate(rd_ctx))
while (1) {
if (dp_rd_ctx_should_terminate(rd_ctx) ||
(res = dp_rd_ctx_handle_seek(rd_ctx, in_ctx, c, &data)) ||
(res = decode_frame(&p, c, &data, in_ctx)))
{
break;
// Send data packets we got from the demuxer to dav1d
if ((res = dav1d_send_data(c, &data)) < 0) {
// On EAGAIN, dav1d can not consume more data and
// dav1d_get_picture needs to be called first, which
// will happen below, so just keep going in that case
// and do not error out.
if (res != DAV1D_ERR(EAGAIN)) {
dav1d_data_unref(&data);
fprintf(stderr, "Error decoding frame: %s\n",
strerror(-res));
break;
}
}
p = calloc(1, sizeof(*p));
// Try to get a decoded frame
if ((res = dav1d_get_picture(c, p)) < 0) {
// In all error cases, even EAGAIN, p needs to be freed as
// it is never added to the queue and would leak.
free(p);
// On EAGAIN, it means dav1d has not enough data to decode
// therefore this is not a decoding error but just means
// we need to feed it more data, which happens in the next
// run of this decoder loop.
if (res != DAV1D_ERR(EAGAIN)) {
fprintf(stderr, "Error decoding frame: %s\n",
strerror(-res));
break;
}
res = 0;
} else {
else if (p) {
// Queue frame
dp_fifo_push(rd_ctx->fifo, p);
dp_rd_ctx_post_event(rd_ctx, DAV1D_EVENT_NEW_FRAME);
n_out++;
SDL_LockMutex(rd_ctx->lock);
int seek = rd_ctx->seek;
SDL_UnlockMutex(rd_ctx->lock);
if (!seek) {
dp_fifo_push(rd_ctx->fifo, p);
uint32_t type = rd_ctx->event_types + DAV1D_EVENT_NEW_FRAME;
dp_rd_ctx_post_event(rd_ctx, type);
}
}
} while ((data.sz > 0 || !input_read(in_ctx, &data)));
}
// Release remaining data
if (data.sz > 0) dav1d_data_unref(&data);
if (data.sz > 0)
dav1d_data_unref(&data);
// Do not drain in case an error occured and caused us to leave the
// decoding loop early.
if (res < 0)
@ -461,7 +628,6 @@ static int decoder_thread_main(void *cookie)
do {
if (dp_rd_ctx_should_terminate(rd_ctx))
break;
p = calloc(1, sizeof(*p));
res = dav1d_get_picture(c, p);
if (res < 0) {
@ -474,19 +640,13 @@ static int decoder_thread_main(void *cookie)
} else {
// Queue frame
dp_fifo_push(rd_ctx->fifo, p);
dp_rd_ctx_post_event(rd_ctx, DAV1D_EVENT_NEW_FRAME);
n_out++;
uint32_t type = rd_ctx->event_types + DAV1D_EVENT_NEW_FRAME;
dp_rd_ctx_post_event(rd_ctx, type);
}
} while (res != DAV1D_ERR(EAGAIN));
// Print stats
uint32_t decoding_time_ms = SDL_GetTicks() - decoder_start;
printf("Decoded %u frames in %d seconds, avg %.02f fps\n",
n_out, decoding_time_ms/1000, n_out / (decoding_time_ms / 1000.0));
cleanup:
dp_rd_ctx_post_event(rd_ctx, DAV1D_EVENT_DEC_QUIT);
dp_rd_ctx_post_event(rd_ctx, rd_ctx->event_types + DAV1D_EVENT_DEC_QUIT);
if (in_ctx)
input_close(in_ctx);
@ -543,41 +703,84 @@ int main(int argc, char **argv)
decoder_thread = SDL_CreateThread(decoder_thread_main, "Decoder thread", rd_ctx);
// Main loop
#define NUM_MAX_EVENTS 8
SDL_Event events[NUM_MAX_EVENTS];
int num_frame_events = 0;
uint32_t start_time = 0, n_out = 0;
while (1) {
SDL_Event e;
if (SDL_WaitEvent(&e)) {
if (e.type == SDL_QUIT) {
int num_events = 0;
SDL_WaitEvent(NULL);
while (num_events < NUM_MAX_EVENTS && SDL_PollEvent(&events[num_events++]))
break;
for (int i = 0; i < num_events; ++i) {
SDL_Event *e = &events[i];
if (e->type == SDL_QUIT) {
dp_rd_ctx_request_shutdown(rd_ctx);
} else if (e.type == SDL_WINDOWEVENT) {
if (e.window.event == SDL_WINDOWEVENT_SIZE_CHANGED) {
dp_fifo_flush(rd_ctx->fifo, destroy_pic);
SDL_FlushEvent(rd_ctx->event_types + DAV1D_EVENT_NEW_FRAME);
SDL_FlushEvent(rd_ctx->event_types + DAV1D_EVENT_SEEK_FRAME);
num_frame_events = 0;
} else if (e->type == SDL_WINDOWEVENT) {
if (e->window.event == SDL_WINDOWEVENT_SIZE_CHANGED) {
// TODO: Handle window resizes
} else if(e->window.event == SDL_WINDOWEVENT_EXPOSED) {
dp_rd_ctx_render(rd_ctx);
}
} else if (e.type == rd_ctx->renderer_event_type) {
if (e.user.code == DAV1D_EVENT_NEW_FRAME) {
// Dequeue frame and update the render context with it
Dav1dPicture *p = dp_fifo_shift(rd_ctx->fifo);
// Do not update textures during termination
if (!dp_rd_ctx_should_terminate(rd_ctx))
dp_rd_ctx_update_with_dav1d_picture(rd_ctx, p);
dav1d_picture_unref(p);
free(p);
} else if (e.user.code == DAV1D_EVENT_DEC_QUIT) {
break;
} else if (e->type == SDL_KEYDOWN) {
SDL_KeyboardEvent *kbde = (SDL_KeyboardEvent *)e;
if (kbde->keysym.sym == SDLK_SPACE) {
dp_rd_ctx_toggle_pause(rd_ctx);
} else if (kbde->keysym.sym == SDLK_LEFT ||
kbde->keysym.sym == SDLK_RIGHT)
{
if (kbde->keysym.sym == SDLK_LEFT)
dp_rd_ctx_seek(rd_ctx, -5);
else if (kbde->keysym.sym == SDLK_RIGHT)
dp_rd_ctx_seek(rd_ctx, +5);
dp_fifo_flush(rd_ctx->fifo, destroy_pic);
SDL_FlushEvent(rd_ctx->event_types + DAV1D_EVENT_NEW_FRAME);
num_frame_events = 0;
}
} else if (e->type == rd_ctx->event_types + DAV1D_EVENT_NEW_FRAME) {
num_frame_events++;
// Store current ticks for stats calculation
if (start_time == 0)
start_time = SDL_GetTicks();
} else if (e->type == rd_ctx->event_types + DAV1D_EVENT_SEEK_FRAME) {
// Dequeue frame and update the render context with it
Dav1dPicture *p = dp_fifo_shift(rd_ctx->fifo);
// Do not update textures during termination
if (!dp_rd_ctx_should_terminate(rd_ctx)) {
dp_rd_ctx_update_with_dav1d_picture(rd_ctx, p);
n_out++;
}
destroy_pic(p);
} else if (e->type == rd_ctx->event_types + DAV1D_EVENT_DEC_QUIT) {
goto out;
}
}
// Do not render during termination
if (!dp_rd_ctx_should_terminate(rd_ctx))
dp_rd_ctx_render(rd_ctx);
if (num_frame_events && !dp_rd_ctx_is_paused(rd_ctx)) {
// Dequeue frame and update the render context with it
Dav1dPicture *p = dp_fifo_shift(rd_ctx->fifo);
// Do not update textures during termination
if (!dp_rd_ctx_should_terminate(rd_ctx)) {
dp_rd_ctx_update_with_dav1d_picture(rd_ctx, p);
dp_rd_ctx_render(rd_ctx);
n_out++;
}
destroy_pic(p);
num_frame_events--;
}
}
out:;
// Print stats
uint32_t time_ms = SDL_GetTicks() - start_time - rd_ctx->pause_time;
printf("Decoded %u frames in %d seconds, avg %.02f fps\n",
n_out, time_ms / 1000, n_out/ (time_ms / 1000.0));
int decoder_ret = 0;
SDL_WaitThread(decoder_thread, &decoder_ret);
dp_rd_ctx_destroy(rd_ctx);
return decoder_ret;
}

28
third_party/dav1d/examples/dp_fifo.c vendored
View file

@ -37,6 +37,8 @@ struct dp_fifo
size_t capacity;
size_t count;
void **entries;
int push_wait;
int flush;
};
@ -54,6 +56,8 @@ Dav1dPlayPtrFifo *dp_fifo_create(size_t capacity)
fifo->capacity = capacity;
fifo->count = 0;
fifo->push_wait = 0;
fifo->flush = 0;
fifo->lock = SDL_CreateMutex();
if (fifo->lock == NULL) {
@ -90,8 +94,16 @@ void dp_fifo_destroy(Dav1dPlayPtrFifo *fifo)
void dp_fifo_push(Dav1dPlayPtrFifo *fifo, void *element)
{
SDL_LockMutex(fifo->lock);
while (fifo->count == fifo->capacity)
while (fifo->count == fifo->capacity) {
fifo->push_wait = 1;
SDL_CondWait(fifo->cond_change, fifo->lock);
fifo->push_wait = 0;
if (fifo->flush) {
SDL_CondSignal(fifo->cond_change);
SDL_UnlockMutex(fifo->lock);
return;
}
}
fifo->entries[fifo->count++] = element;
if (fifo->count == 1)
SDL_CondSignal(fifo->cond_change);
@ -120,4 +132,16 @@ void *dp_fifo_shift(Dav1dPlayPtrFifo *fifo)
return res;
}
void dp_fifo_flush(Dav1dPlayPtrFifo *fifo, void (*destroy_elem)(void *))
{
SDL_LockMutex(fifo->lock);
fifo->flush = 1;
if (fifo->push_wait) {
SDL_CondSignal(fifo->cond_change);
SDL_CondWait(fifo->cond_change, fifo->lock);
}
while (fifo->count)
destroy_elem(fifo->entries[--fifo->count]);
fifo->flush = 0;
SDL_UnlockMutex(fifo->lock);
}

2
third_party/dav1d/examples/dp_fifo.h vendored
View file

@ -59,3 +59,5 @@ void *dp_fifo_shift(Dav1dPlayPtrFifo *fifo);
* other thread will call dp_fifo_shift will lead to a deadlock.
*/
void dp_fifo_push(Dav1dPlayPtrFifo *fifo, void *element);
void dp_fifo_flush(Dav1dPlayPtrFifo *fifo, void (*destroy_elem)(void *));

9
third_party/dav1d/examples/dp_renderer.h vendored
View file

@ -66,8 +66,11 @@ typedef struct {
#define WINDOW_WIDTH 910
#define WINDOW_HEIGHT 512
#define DAV1D_EVENT_NEW_FRAME 1
#define DAV1D_EVENT_DEC_QUIT 2
enum {
DAV1D_EVENT_NEW_FRAME,
DAV1D_EVENT_SEEK_FRAME,
DAV1D_EVENT_DEC_QUIT
};
/**
* Renderer info
@ -84,7 +87,7 @@ typedef struct rdr_info
void (*destroy_renderer)(void *cookie);
// Callback to the render function that renders a prevously sent frame
void (*render)(void *cookie, const Dav1dPlaySettings *settings);
// Callback to the send frame function
// Callback to the send frame function, _may_ also unref dav1d_pic!
int (*update_frame)(void *cookie, Dav1dPicture *dav1d_pic,
const Dav1dPlaySettings *settings);
// Callback for alloc/release pictures (optional)

View file

@ -30,7 +30,7 @@
#include <assert.h>
#include <libplacebo/renderer.h>
#include <libplacebo/utils/upload.h>
#include <libplacebo/utils/dav1d.h>
#ifdef HAVE_PLACEBO_VULKAN
# include <libplacebo/vulkan.h>
@ -72,7 +72,7 @@ typedef struct renderer_priv_ctx
// Lock protecting access to the texture
SDL_mutex *lock;
// Image to render, and planes backing them
struct pl_image image;
struct pl_frame image;
const struct pl_tex *plane_tex[3];
} Dav1dPlayRendererPrivateContext;
@ -319,22 +319,15 @@ static void placebo_render(void *cookie, const Dav1dPlaySettings *settings)
if (settings->highquality)
render_params = pl_render_default_params;
struct pl_render_target target;
pl_render_target_from_swapchain(&target, &frame);
target.profile = (struct pl_icc_profile) {
.data = NULL,
.len = 0,
};
#if PL_API_VER >= 66
pl_rect2df_aspect_copy(&target.dst_rect, &rd_priv_ctx->image.src_rect, 0.0);
if (pl_render_target_partial(&target))
pl_tex_clear(rd_priv_ctx->gpu, target.fbo, (float[4]){ 0.0 });
#endif
struct pl_frame target;
pl_frame_from_swapchain(&target, &frame);
pl_rect2df_aspect_copy(&target.crop, &rd_priv_ctx->image.crop, 0.0);
if (pl_frame_is_cropped(&target))
pl_tex_clear(rd_priv_ctx->gpu, frame.fbo, (float[4]){ 0.0 });
if (!pl_render_image(rd_priv_ctx->renderer, &rd_priv_ctx->image, &target, &render_params)) {
fprintf(stderr, "Failed rendering frame!\n");
pl_tex_clear(rd_priv_ctx->gpu, target.fbo, (float[4]){ 1.0 });
pl_tex_clear(rd_priv_ctx->gpu, frame.fbo, (float[4]){ 1.0 });
}
ok = pl_swapchain_submit_frame(rd_priv_ctx->swapchain);
@ -351,320 +344,37 @@ static void placebo_render(void *cookie, const Dav1dPlaySettings *settings)
static int placebo_upload_image(void *cookie, Dav1dPicture *dav1d_pic,
const Dav1dPlaySettings *settings)
{
Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
assert(rd_priv_ctx != NULL);
Dav1dPlayRendererPrivateContext *p = cookie;
assert(p != NULL);
int ret = 0;
SDL_LockMutex(rd_priv_ctx->lock);
if (!dav1d_pic)
return ret;
if (dav1d_pic == NULL) {
SDL_UnlockMutex(rd_priv_ctx->lock);
return 0;
}
int width = dav1d_pic->p.w;
int height = dav1d_pic->p.h;
int sub_x = 0, sub_y = 0;
int bytes = (dav1d_pic->p.bpc + 7) / 8; // rounded up
enum pl_chroma_location chroma_loc = PL_CHROMA_UNKNOWN;
struct pl_image *image = &rd_priv_ctx->image;
*image = (struct pl_image) {
.num_planes = 3,
.width = width,
.height = height,
.src_rect = {0, 0, width, height},
.repr = {
.bits = {
.sample_depth = bytes * 8,
.color_depth = dav1d_pic->p.bpc,
},
},
struct pl_dav1d_upload_params params = {
.picture = dav1d_pic,
.film_grain = settings->gpugrain,
.gpu_allocated = settings->zerocopy,
.asynchronous = true,
};
// Figure out the correct plane dimensions/count
switch (dav1d_pic->p.layout) {
case DAV1D_PIXEL_LAYOUT_I400:
image->num_planes = 1;
break;
case DAV1D_PIXEL_LAYOUT_I420:
sub_x = sub_y = 1;
break;
case DAV1D_PIXEL_LAYOUT_I422:
sub_x = 1;
break;
case DAV1D_PIXEL_LAYOUT_I444:
break;
}
// Set the right colorspace metadata etc.
switch (dav1d_pic->seq_hdr->pri) {
case DAV1D_COLOR_PRI_UNKNOWN: image->color.primaries = PL_COLOR_PRIM_UNKNOWN; break;
case DAV1D_COLOR_PRI_BT709: image->color.primaries = PL_COLOR_PRIM_BT_709; break;
case DAV1D_COLOR_PRI_BT470M: image->color.primaries = PL_COLOR_PRIM_BT_470M; break;
case DAV1D_COLOR_PRI_BT470BG: image->color.primaries = PL_COLOR_PRIM_BT_601_625; break;
case DAV1D_COLOR_PRI_BT601: image->color.primaries = PL_COLOR_PRIM_BT_601_625; break;
case DAV1D_COLOR_PRI_BT2020: image->color.primaries = PL_COLOR_PRIM_BT_2020; break;
case DAV1D_COLOR_PRI_XYZ:
// Handled below
assert(dav1d_pic->seq_hdr->mtrx == DAV1D_MC_IDENTITY);
break;
default:
printf("warning: unknown dav1d color primaries %d.. ignoring, picture "
"may be very incorrect\n", dav1d_pic->seq_hdr->pri);
break;
}
switch (dav1d_pic->seq_hdr->trc) {
case DAV1D_TRC_BT709:
case DAV1D_TRC_BT470M:
case DAV1D_TRC_BT470BG:
case DAV1D_TRC_BT601:
case DAV1D_TRC_SMPTE240:
case DAV1D_TRC_BT2020_10BIT:
case DAV1D_TRC_BT2020_12BIT:
// These all map to the effective "SDR" CRT-based EOTF, BT.1886
image->color.transfer = PL_COLOR_TRC_BT_1886;
break;
case DAV1D_TRC_UNKNOWN: image->color.transfer = PL_COLOR_TRC_UNKNOWN; break;
case DAV1D_TRC_LINEAR: image->color.transfer = PL_COLOR_TRC_LINEAR; break;
case DAV1D_TRC_SRGB: image->color.transfer = PL_COLOR_TRC_SRGB; break;
case DAV1D_TRC_SMPTE2084: image->color.transfer = PL_COLOR_TRC_PQ; break;
case DAV1D_TRC_HLG: image->color.transfer = PL_COLOR_TRC_HLG; break;
default:
printf("warning: unknown dav1d color transfer %d.. ignoring, picture "
"may be very incorrect\n", dav1d_pic->seq_hdr->trc);
break;
}
switch (dav1d_pic->seq_hdr->mtrx) {
case DAV1D_MC_IDENTITY:
// This is going to be either RGB or XYZ
if (dav1d_pic->seq_hdr->pri == DAV1D_COLOR_PRI_XYZ) {
image->repr.sys = PL_COLOR_SYSTEM_XYZ;
} else {
image->repr.sys = PL_COLOR_SYSTEM_RGB;
}
break;
case DAV1D_MC_UNKNOWN:
// PL_COLOR_SYSTEM_UNKNOWN maps to RGB, so hard-code this one
image->repr.sys = pl_color_system_guess_ycbcr(width, height);
break;
case DAV1D_MC_BT709: image->repr.sys = PL_COLOR_SYSTEM_BT_709; break;
case DAV1D_MC_BT601: image->repr.sys = PL_COLOR_SYSTEM_BT_601; break;
case DAV1D_MC_SMPTE240: image->repr.sys = PL_COLOR_SYSTEM_SMPTE_240M; break;
case DAV1D_MC_SMPTE_YCGCO: image->repr.sys = PL_COLOR_SYSTEM_YCGCO; break;
case DAV1D_MC_BT2020_NCL: image->repr.sys = PL_COLOR_SYSTEM_BT_2020_NC; break;
case DAV1D_MC_BT2020_CL: image->repr.sys = PL_COLOR_SYSTEM_BT_2020_C; break;
case DAV1D_MC_ICTCP:
// This one is split up based on the actual HDR curve in use
if (dav1d_pic->seq_hdr->trc == DAV1D_TRC_HLG) {
image->repr.sys = PL_COLOR_SYSTEM_BT_2100_HLG;
} else {
image->repr.sys = PL_COLOR_SYSTEM_BT_2100_PQ;
}
break;
default:
printf("warning: unknown dav1d color matrix %d.. ignoring, picture "
"may be very incorrect\n", dav1d_pic->seq_hdr->mtrx);
break;
}
if (dav1d_pic->seq_hdr->color_range) {
image->repr.levels = PL_COLOR_LEVELS_PC;
} else {
image->repr.levels = PL_COLOR_LEVELS_TV;
}
switch (dav1d_pic->seq_hdr->chr) {
case DAV1D_CHR_UNKNOWN: chroma_loc = PL_CHROMA_UNKNOWN; break;
case DAV1D_CHR_VERTICAL: chroma_loc = PL_CHROMA_LEFT; break;
case DAV1D_CHR_COLOCATED: chroma_loc = PL_CHROMA_TOP_LEFT; break;
}
#if PL_API_VER >= 63
if (settings->gpugrain && dav1d_pic->frame_hdr->film_grain.present) {
Dav1dFilmGrainData *src = &dav1d_pic->frame_hdr->film_grain.data;
struct pl_av1_grain_data *dst = &image->av1_grain;
*dst = (struct pl_av1_grain_data) {
.grain_seed = src->seed,
.num_points_y = src->num_y_points,
.chroma_scaling_from_luma = src->chroma_scaling_from_luma,
.num_points_uv = { src->num_uv_points[0], src->num_uv_points[1] },
.scaling_shift = src->scaling_shift,
.ar_coeff_lag = src->ar_coeff_lag,
.ar_coeff_shift = (int)src->ar_coeff_shift,
.grain_scale_shift = src->grain_scale_shift,
.uv_mult = { src->uv_mult[0], src->uv_mult[1] },
.uv_mult_luma = { src->uv_luma_mult[0], src->uv_luma_mult[1] },
.uv_offset = { src->uv_offset[0], src->uv_offset[1] },
.overlap = src->overlap_flag,
};
assert(sizeof(dst->points_y) == sizeof(src->y_points));
assert(sizeof(dst->points_uv) == sizeof(src->uv_points));
assert(sizeof(dst->ar_coeffs_y) == sizeof(src->ar_coeffs_y));
memcpy(dst->points_y, src->y_points, sizeof(src->y_points));
memcpy(dst->points_uv, src->uv_points, sizeof(src->uv_points));
memcpy(dst->ar_coeffs_y, src->ar_coeffs_y, sizeof(src->ar_coeffs_y));
// this one has different row sizes for alignment
for (int c = 0; c < 2; c++) {
for (int i = 0; i < 25; i++)
dst->ar_coeffs_uv[c][i] = src->ar_coeffs_uv[c][i];
}
}
#endif
// Upload the actual planes
struct pl_plane_data data[3] = {
{
// Y plane
.type = PL_FMT_UNORM,
.width = width,
.height = height,
.pixel_stride = bytes,
.row_stride = dav1d_pic->stride[0],
.component_size = {bytes * 8},
.component_map = {0},
}, {
// U plane
.type = PL_FMT_UNORM,
.width = width >> sub_x,
.height = height >> sub_y,
.pixel_stride = bytes,
.row_stride = dav1d_pic->stride[1],
.component_size = {bytes * 8},
.component_map = {1},
}, {
// V plane
.type = PL_FMT_UNORM,
.width = width >> sub_x,
.height = height >> sub_y,
.pixel_stride = bytes,
.row_stride = dav1d_pic->stride[1],
.component_size = {bytes * 8},
.component_map = {2},
},
};
bool ok = true;
for (int i = 0; i < image->num_planes; i++) {
if (settings->zerocopy) {
const struct pl_buf *buf = dav1d_pic->allocator_data;
assert(buf);
data[i].buf = buf;
data[i].buf_offset = (uintptr_t) dav1d_pic->data[i] - (uintptr_t) buf->data;
} else {
data[i].pixels = dav1d_pic->data[i];
}
ok &= pl_upload_plane(rd_priv_ctx->gpu, &image->planes[i], &rd_priv_ctx->plane_tex[i], &data[i]);
}
// Apply the correct chroma plane shift. This has to be done after pl_upload_plane
#if PL_API_VER >= 67
pl_image_set_chroma_location(image, chroma_loc);
#else
pl_chroma_location_offset(chroma_loc, &image->planes[1].shift_x, &image->planes[1].shift_y);
pl_chroma_location_offset(chroma_loc, &image->planes[2].shift_x, &image->planes[2].shift_y);
#endif
if (!ok) {
SDL_LockMutex(p->lock);
if (!pl_upload_dav1dpicture(p->gpu, &p->image, p->plane_tex, &params)) {
fprintf(stderr, "Failed uploading planes!\n");
*image = (struct pl_image) {0};
p->image = (struct pl_frame) {0};
ret = -1;
}
SDL_UnlockMutex(rd_priv_ctx->lock);
return !ok;
SDL_UnlockMutex(p->lock);
return ret;
}
// Align to power of 2
#define ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1))
static int placebo_alloc_pic(Dav1dPicture *const p, void *cookie)
static int placebo_alloc_pic(Dav1dPicture *const pic, void *cookie)
{
Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
assert(rd_priv_ctx != NULL);
SDL_LockMutex(rd_priv_ctx->lock);
const struct pl_gpu *gpu = rd_priv_ctx->gpu;
int ret = DAV1D_ERR(ENOMEM);
// Copied from dav1d_default_picture_alloc
const int hbd = p->p.bpc > 8;
const int aligned_w = ALIGN2(p->p.w, 128);
const int aligned_h = ALIGN2(p->p.h, 128);
const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400;
const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444;
p->stride[0] = aligned_w << hbd;
p->stride[1] = has_chroma ? (aligned_w >> ss_hor) << hbd : 0;
// Align strides up to multiples of the GPU performance hints
p->stride[0] = ALIGN2(p->stride[0], gpu->limits.align_tex_xfer_stride);
p->stride[1] = ALIGN2(p->stride[1], gpu->limits.align_tex_xfer_stride);
// Aligning offsets to 4 also implicity aligns to the texel size (1 or 2)
size_t off_align = ALIGN2(gpu->limits.align_tex_xfer_offset, 4);
const size_t y_sz = ALIGN2(p->stride[0] * aligned_h, off_align);
const size_t uv_sz = ALIGN2(p->stride[1] * (aligned_h >> ss_ver), off_align);
// The extra DAV1D_PICTURE_ALIGNMENTs are to brute force plane alignment,
// even in the case that the driver gives us insane alignments
const size_t pic_size = y_sz + 2 * uv_sz;
const size_t total_size = pic_size + DAV1D_PICTURE_ALIGNMENT * 4;
// Validate size limitations
if (total_size > gpu->limits.max_xfer_size) {
printf("alloc of %zu bytes exceeds limits\n", total_size);
goto err;
}
const struct pl_buf *buf = pl_buf_create(gpu, &(struct pl_buf_params) {
.type = PL_BUF_TEX_TRANSFER,
.host_mapped = true,
.size = total_size,
.memory_type = PL_BUF_MEM_HOST,
.user_data = p,
});
if (!buf) {
printf("alloc of GPU mapped buffer failed\n");
goto err;
}
assert(buf->data);
uintptr_t base = (uintptr_t) buf->data, data[3];
data[0] = ALIGN2(base, DAV1D_PICTURE_ALIGNMENT);
data[1] = ALIGN2(data[0] + y_sz, DAV1D_PICTURE_ALIGNMENT);
data[2] = ALIGN2(data[1] + uv_sz, DAV1D_PICTURE_ALIGNMENT);
// Sanity check offset alignment for the sake of debugging
if (data[0] - base != ALIGN2(data[0] - base, off_align) ||
data[1] - base != ALIGN2(data[1] - base, off_align) ||
data[2] - base != ALIGN2(data[2] - base, off_align))
{
printf("GPU buffer horribly misaligned, expect slowdown!\n");
}
p->allocator_data = (void *) buf;
p->data[0] = (void *) data[0];
p->data[1] = (void *) data[1];
p->data[2] = (void *) data[2];
ret = 0;
// fall through
err:
int ret = pl_allocate_dav1dpicture(pic, rd_priv_ctx->gpu);
SDL_UnlockMutex(rd_priv_ctx->lock);
return ret;
}
@ -673,11 +383,9 @@ static void placebo_release_pic(Dav1dPicture *pic, void *cookie)
{
Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
assert(rd_priv_ctx != NULL);
assert(pic->allocator_data);
SDL_LockMutex(rd_priv_ctx->lock);
const struct pl_gpu *gpu = rd_priv_ctx->gpu;
pl_buf_destroy(gpu, (const struct pl_buf **) &pic->allocator_data);
pl_release_dav1dpicture(pic, rd_priv_ctx->gpu);
SDL_UnlockMutex(rd_priv_ctx->lock);
}
@ -690,10 +398,7 @@ const Dav1dPlayRenderInfo rdr_placebo_vk = {
.update_frame = placebo_upload_image,
.alloc_pic = placebo_alloc_pic,
.release_pic = placebo_release_pic,
# if PL_API_VER >= 63
.supports_gpu_grain = 1,
# endif
};
#else
const Dav1dPlayRenderInfo rdr_placebo_vk = { NULL };
@ -706,12 +411,7 @@ const Dav1dPlayRenderInfo rdr_placebo_gl = {
.destroy_renderer = placebo_renderer_destroy,
.render = placebo_render,
.update_frame = placebo_upload_image,
.alloc_pic = placebo_alloc_pic,
.release_pic = placebo_release_pic,
# if PL_API_VER >= 63
.supports_gpu_grain = 1,
# endif
};
#else
const Dav1dPlayRenderInfo rdr_placebo_gl = { NULL };

4
third_party/dav1d/examples/meson.build vendored
View file

@ -43,10 +43,10 @@ dav1dplay_sources = files(
sdl2_dependency = dependency('sdl2', version: '>= 2.0.1', required: true)
if sdl2_dependency.found()
dav1dplay_deps = [sdl2_dependency]
dav1dplay_deps = [sdl2_dependency, libm_dependency]
dav1dplay_cflags = []
placebo_dependency = dependency('libplacebo', version: '>= 1.18.0', required: false)
placebo_dependency = dependency('libplacebo', version: '>= 3.110.0', required: false)
if placebo_dependency.found()
dav1dplay_deps += placebo_dependency

View file

@ -116,8 +116,8 @@
# define dav1d_uninit(x) x
#endif
#ifdef _MSC_VER
#include <intrin.h>
#if defined(_MSC_VER) && !defined(__clang__)
#include <intrin.h>
static inline int ctz(const unsigned int mask) {
unsigned long idx;

45
third_party/dav1d/include/common/frame.h vendored (new file)
View file

@ -0,0 +1,45 @@
/*
* Copyright © 2021, VideoLAN and dav1d authors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef DAV1D_COMMON_FRAME_H
#define DAV1D_COMMON_FRAME_H
/*
* Checks whether Dav1dFrameType == INTER || == SWITCH
* Both are defined as odd numbers {1, 3} and therefore have the LSB set.
* See also: AV1 spec 6.8.2
*/
#define IS_INTER_OR_SWITCH(frame_header) \
((frame_header)->frame_type & 1)
/*
* Checks whether Dav1dFrameType == KEY || == INTRA
* See also: AV1 spec 6.8.2
*/
#define IS_KEY_OR_INTRA(frame_header) \
(!IS_INTER_OR_SWITCH(frame_header))
#endif /* DAV1D_COMMON_FRAME_H */
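
A brief usage sketch (hypothetical caller, not taken from this commit): Dav1dFrameHeader and the DAV1D_FRAME_TYPE_* values come from dav1d/headers.h, where KEY = 0, INTER = 1, INTRA = 2 and SWITCH = 3, so the single-bit test above cleanly splits the two groups.

#include "common/frame.h"
#include "dav1d/headers.h"

/* Hypothetical helper: only INTER and SWITCH frames can reference other
 * frames, so they are the only ones that need reference-MV setup. */
static int needs_ref_mvs(const Dav1dFrameHeader *hdr) {
    return IS_INTER_OR_SWITCH(hdr); /* same as (hdr->frame_type & 1) */
}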

4
third_party/dav1d/include/dav1d/dav1d.h vendored
View file

@ -45,6 +45,7 @@ typedef struct Dav1dRef Dav1dRef;
#define DAV1D_MAX_FRAME_THREADS 256
#define DAV1D_MAX_TILE_THREADS 64
#define DAV1D_MAX_POSTFILTER_THREADS 256
typedef struct Dav1dLogger {
void *cookie; ///< Custom data to pass to the callback.
@ -67,7 +68,8 @@ typedef struct Dav1dSettings {
unsigned frame_size_limit; ///< maximum frame size, in pixels (0 = unlimited)
Dav1dPicAllocator allocator; ///< Picture allocator callback.
Dav1dLogger logger; ///< Logger callback.
uint8_t reserved[32]; ///< reserved for future use
int n_postfilter_threads;
uint8_t reserved[28]; ///< reserved for future use
} Dav1dSettings;
/**
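
For context, a minimal caller-side sketch (not part of this patch) of how the new field is used; dav1d_default_settings() and dav1d_open() are the existing public entry points, and dav1dplay's new --pfthreads option feeds the same field:

#include <dav1d/dav1d.h>

/* Sketch: open a decoder with the new post-filter thread pool sized explicitly. */
static Dav1dContext *open_with_pf_threads(const int pf_threads) {
    Dav1dSettings s;
    dav1d_default_settings(&s);
    s.n_postfilter_threads = pf_threads; /* new field in this 0.8.2 update */
    Dav1dContext *c = NULL;
    return dav1d_open(&c, &s) < 0 ? NULL : c;
}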

4
third_party/dav1d/include/meson.build vendored
View file

@ -25,9 +25,7 @@
# Revision file (vcs_version.h) generation
dav1d_git_dir = join_paths(dav1d_src_root, '.git')
rev_target = vcs_tag(command: [
'git', '--git-dir', dav1d_git_dir,
'describe', '--tags', '--long',
'--match', '?.*.*', '--always'
'git', '--git-dir', dav1d_git_dir, 'describe', '--long', '--always'
],
input: 'vcs_version.h.in',
output: 'vcs_version.h'

11
third_party/dav1d/meson.build vendored
View file

@ -1,4 +1,4 @@
# Copyright © 2018-2020, VideoLAN and dav1d authors
# Copyright © 2018-2021, VideoLAN and dav1d authors
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@ -23,14 +23,14 @@
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
project('dav1d', ['c'],
version: '0.8.1',
version: '0.8.2',
default_options: ['c_std=c99',
'warning_level=2',
'buildtype=release',
'b_ndebug=if-release'],
meson_version: '>= 0.49.0')
dav1d_soname_version = '5.0.0'
dav1d_soname_version = '5.0.1'
dav1d_api_version_array = dav1d_soname_version.split('.')
dav1d_api_version_major = dav1d_api_version_array[0]
dav1d_api_version_minor = dav1d_api_version_array[1]
@ -128,7 +128,7 @@ if host_machine.system() == 'windows'
rc_data.set('API_VERSION_MAJOR', dav1d_api_version_major)
rc_data.set('API_VERSION_MINOR', dav1d_api_version_minor)
rc_data.set('API_VERSION_REVISION', dav1d_api_version_revision)
rc_data.set('COPYRIGHT_YEARS', '2020')
rc_data.set('COPYRIGHT_YEARS', '2021')
else
thread_dependency = dependency('threads')
thread_compat_dep = []
@ -168,6 +168,8 @@ if host_machine.system() == 'linux'
endif
endif
libm_dependency = cc.find_library('m', required: false)
# Header checks
@ -257,6 +259,7 @@ if cc.get_argument_syntax() != 'msvc'
else
optional_arguments += [
'-wd4028', # parameter different from declaration
'-wd4090', # broken with arrays of pointers
'-wd4996' # use of POSIX functions
]
endif

4
third_party/dav1d/meson_options.txt vendored
View file

@ -53,3 +53,7 @@ option('fuzzer_ldflags',
option('stack_alignment',
type: 'integer',
value: 0)
option('xxhash_muxer',
type : 'feature',
value : 'auto')

87
third_party/dav1d/src/arm/32/ipred.S vendored
View file

@ -40,8 +40,7 @@ function ipred_dc_128_8bpc_neon, export=1
adr r2, L(ipred_dc_128_tbl)
sub r3, r3, #25
ldr r3, [r2, r3, lsl #2]
mov lr, #128
vdup.8 q0, lr
vmov.i8 q0, #128
add r2, r2, r3
add r12, r0, r1
lsl r1, r1, #1
@ -79,7 +78,7 @@ L(ipred_dc_128_tbl):
bgt 16b
pop {r4, pc}
320:
vdup.8 q1, lr
vmov.i8 q1, #128
32:
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
@ -89,20 +88,18 @@ L(ipred_dc_128_tbl):
bgt 32b
pop {r4, pc}
640:
vdup.8 q1, lr
vdup.8 q2, lr
vdup.8 q3, lr
vmov.i8 q1, #128
sub r1, r1, #32
64:
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
bgt 64b
pop {r4, pc}
endfunc
@ -401,19 +398,17 @@ L(ipred_dc_top_tbl):
vrshrn.u16 d18, q0, #6
vdup.8 q0, d18[0]
vdup.8 q1, d18[0]
vdup.8 q2, d18[0]
vdup.8 q3, d18[0]
sub r1, r1, #32
64:
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
bgt 64b
pop {r4-r5, pc}
endfunc
@ -538,20 +533,18 @@ L(ipred_dc_left_h64):
vdup.8 q0, d0[0]
bx r3
L(ipred_dc_left_w64):
sub r1, r1, #32
vmov.8 q1, q0
vmov.8 q2, q0
vmov.8 q3, q0
sub r1, r1, #32
1:
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
bgt 1b
pop {r4-r5, pc}
endfunc
@ -600,10 +593,10 @@ L(ipred_dc_tbl):
L(ipred_dc_h4):
vld1.32 {d0[]}, [r2, :32]!
vpaddl.u8 d0, d0
add r2, r2, #1
vpadd.u16 d0, d0
bx r3
L(ipred_dc_w4):
add r2, r2, #1
vld1.32 {d1[]}, [r2]
vadd.s16 d0, d0, d30
vpaddl.u8 d1, d1
@ -635,10 +628,10 @@ L(ipred_dc_h8):
vld1.8 {d0}, [r2, :64]!
vpaddl.u8 d0, d0
vpadd.u16 d0, d0
add r2, r2, #1
vpadd.u16 d0, d0
bx r3
L(ipred_dc_w8):
add r2, r2, #1
vld1.8 {d2}, [r2]
vadd.s16 d0, d0, d30
vpaddl.u8 d2, d2
@ -672,10 +665,10 @@ L(ipred_dc_h16):
vaddl.u8 q0, d0, d1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
add r2, r2, #1
vpadd.u16 d0, d0
bx r3
L(ipred_dc_w16):
add r2, r2, #1
vld1.8 {d2, d3}, [r2]
vadd.s16 d0, d0, d30
vaddl.u8 q1, d2, d3
@ -712,10 +705,10 @@ L(ipred_dc_h32):
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
add r2, r2, #1
vpadd.u16 d0, d0
bx r3
L(ipred_dc_w32):
add r2, r2, #1
vld1.8 {d2, d3, d4, d5}, [r2]
vadd.s16 d0, d0, d30
vaddl.u8 q1, d2, d3
@ -760,10 +753,10 @@ L(ipred_dc_h64):
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
add r2, r2, #1
vpadd.u16 d0, d0
bx r3
L(ipred_dc_w64):
add r2, r2, #1
vld1.8 {d2, d3, d4, d5}, [r2]!
vadd.s16 d0, d0, d30
vaddl.u8 q2, d4, d5
@ -789,11 +782,11 @@ L(ipred_dc_w64):
vadd.s16 d0, d0, d2
vadd.s16 d0, d0, d3
vshl.u16 d18, d0, d28
beq 1f // h = 16/32
beq 1f
// h = 16/32
movw lr, #(0x5556/2)
movt lr, #(0x3334/2)
mov r5, r4
and r5, r5, #31
and r5, r4, #31
lsr lr, lr, r5
vdup.16 d30, lr
vqdmulh.s16 d18, d18, d30
@ -801,18 +794,16 @@ L(ipred_dc_w64):
sub r1, r1, #32
vdup.8 q0, d18[0]
vdup.8 q1, d18[0]
vdup.8 q2, d18[0]
vdup.8 q3, d18[0]
2:
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
bgt 2b
pop {r4-r6, pc}
endfunc
@ -1444,6 +1435,8 @@ function ipred_filter_8bpc_neon, export=1
vmovl.s8 q13, d28
vmovl.s8 q14, d29
add r8, r2, #1
sub r2, r2, #2
mov r7, #-2
bx r5
.align 2
@ -1455,8 +1448,6 @@ L(ipred_filter_tbl):
40:
vld1.32 {d0[]}, [r8] // top (0-3)
sub r2, r2, #2
mov r7, #-2
vmovl.u8 q0, d0 // top (0-3)
4:
vld1.32 {d2[]}, [r2], r7 // left (0-1) + topleft (2)
@ -1473,13 +1464,11 @@ L(ipred_filter_tbl):
vst1.32 {d4[0]}, [r0, :32], r1
vmovl.u8 q0, d4
vst1.32 {d4[1]}, [r6, :32], r1
vext.8 q0, q0, q0, #8 // move top from [4-7] to [0-3]
vmov d0, d1 // move top from [4-7] to [0-3]
bgt 4b
pop {r4-r8, pc}
80:
vld1.8 {d0}, [r8] // top (0-7)
sub r2, r2, #2
mov r7, #-2
vmovl.u8 q0, d0 // top (0-7)
8:
vld1.32 {d2[]}, [r2], r7 // left (0-1) + topleft (2)
@ -1503,16 +1492,14 @@ L(ipred_filter_tbl):
vqrshrun.s16 d5, q3, #4
vzip.32 d4, d5
subs r4, r4, #2
vst1.64 {d4}, [r0, :64], r1
vst1.8 {d4}, [r0, :64], r1
vmovl.u8 q0, d5
vst1.64 {d5}, [r6, :64], r1
vst1.8 {d5}, [r6, :64], r1
bgt 8b
pop {r4-r8, pc}
160:
320:
vpush {q4-q5}
sub r2, r2, #2
mov r7, #-2
sub r1, r1, r3
mov lr, r3
@ -2003,10 +1990,10 @@ L(ipred_cfl_tbl):
L(ipred_cfl_h4):
vld1.32 {d0[]}, [r2, :32]!
vpaddl.u8 d0, d0
add r2, r2, #1
vpadd.i16 d0, d0
bx r12
L(ipred_cfl_w4):
add r2, r2, #1
vld1.32 {d1[]}, [r2]
vadd.i16 d0, d0, d16
vpaddl.u8 d1, d1
@ -2031,10 +2018,10 @@ L(ipred_cfl_h8):
vld1.8 {d0}, [r2, :64]!
vpaddl.u8 d0, d0
vpadd.i16 d0, d0
add r2, r2, #1
vpadd.i16 d0, d0
bx r12
L(ipred_cfl_w8):
add r2, r2, #1
vld1.8 {d1}, [r2]
vadd.i16 d0, d0, d16
vpaddl.u8 d1, d1
@ -2061,10 +2048,10 @@ L(ipred_cfl_h16):
vaddl.u8 q0, d0, d1
vadd.i16 d0, d0, d1
vpadd.i16 d0, d0
add r2, r2, #1
vpadd.i16 d0, d0
bx r12
L(ipred_cfl_w16):
add r2, r2, #1
vld1.8 {q2}, [r2]
vadd.i16 d0, d0, d16
vaddl.u8 q2, d4, d5
@ -2094,10 +2081,10 @@ L(ipred_cfl_h32):
vadd.i16 q0, q2, q3
vadd.i16 d0, d0, d1
vpadd.i16 d0, d0
add r2, r2, #1
vpadd.i16 d0, d0
bx r12
L(ipred_cfl_w32):
add r2, r2, #1
vld1.8 {q2, q3}, [r2]
vadd.i16 d0, d0, d16
vaddl.u8 q2, d4, d5

3254
third_party/dav1d/src/arm/32/ipred16.S vendored (new file)

Diff not shown because of its large size.

57
third_party/dav1d/src/arm/32/itx.S vendored
View file

@ -706,7 +706,7 @@ def_fn_4x4 identity, flipadst
vrshrn_8h \r14, \r15, q4, q5, #12 // t7a
vmull_vmlal_8h q2, q3, \r10, \r11, \r6, \r7, d1[3], d1[2] // -> t6a
vrshrn_8h \r6, \r7, q6, q7, #12 // t5a
vrshrn_8h \r10, \r11, q2, q3, #12 // taa
vrshrn_8h \r10, \r11, q2, q3, #12 // t6a
vqadd.s16 q2, \q1, \q3 // t4
vqsub.s16 \q1, \q1, \q3 // t5a
@ -1173,7 +1173,7 @@ function inv_dct_4h_x16_neon, export=1
vrshrn.i32 d6, q3, #12 // t11
vrshrn.i32 d7, q4, #12 // t12
vmull_vmlal q4, d25, d21, d0[0], d0[0] // -> t10a
vmull_vmlal q4, d25, d21, d0[0], d0[0] // -> t13a
vrshrn.i32 d4, q2, #12 // t10a
vrshrn.i32 d5, q4, #12 // t13a
@ -1480,53 +1480,6 @@ function inv_txfm_add_vert_4x16_neon
pop {pc}
endfunc
.macro sub_sp_align space
#if CONFIG_THUMB
mov r7, sp
and r7, r7, #15
#else
and r7, sp, #15
#endif
sub sp, sp, r7
// Now the stack is aligned, store the amount of adjustment back
// on the stack, as we don't want to waste a register as frame
// pointer.
str r7, [sp, #-16]!
#ifdef _WIN32
.if \space > 8192
// Here, we'd need to touch two (or more) pages while decrementing
// the stack pointer.
.error "sub_sp_align doesn't support values over 8K at the moment"
.elseif \space > 4096
sub r7, sp, #4096
ldr r12, [r7]
sub r7, r7, #(\space - 4096)
mov sp, r7
.else
sub sp, sp, #\space
.endif
#else
.if \space >= 4096
sub sp, sp, #(\space)/4096*4096
.endif
.if (\space % 4096) != 0
sub sp, sp, #(\space)%4096
.endif
#endif
.endm
.macro add_sp_align space
.if \space >= 4096
add sp, sp, #(\space)/4096*4096
.endif
.if (\space % 4096) != 0
add sp, sp, #(\space)%4096
.endif
ldr r7, [sp], #16
// Add back the original stack adjustment
add sp, sp, r7
.endm
function inv_txfm_add_16x16_neon
sub_sp_align 512
ldrh r11, [r10], #2
@ -3248,7 +3201,9 @@ function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1
mov r8, #(32 - \i)
cmp r3, r11
blt 1f
.if \i < 28
ldrh r11, [r10], #2
.endif
.endif
add r7, r2, #(\i*2)
mov r8, #32*2
@ -3304,7 +3259,7 @@ function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1
add r6, r4, #(\i*64*2)
mov r9, #-2 // shift
bl inv_txfm_horz_dct_64x4_neon
.if \i < 8
.if \i < 12
ldrh r11, [r10], #2
.endif
.endr
@ -3353,7 +3308,9 @@ function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1
mov r8, #(32 - \i)
cmp r3, r11
blt 1f
.if \i < 28
ldrh r11, [r10], #2
.endif
.endif
add r7, r2, #(\i*2)
mov r8, #32*2

3428
third_party/dav1d/src/arm/32/itx16.S vendored (new file)

Diff not shown because of its large size.

7
third_party/dav1d/src/arm/32/loopfilter16.S vendored
View file

@ -141,13 +141,12 @@ function lpf_4_wd\wd\()_neon
vmov.i16 d6, #3
vbic d0, d1, d0 // (fm && wd >= 4 && !hev)
vmul.i16 d2, d2, d6
vmov.i16 d6, #4
vmov.i16 d7, #4
vadd.i16 d2, d2, d4
vmin.s16 d2, d2, d3 // f = iclip_diff()
vmov.i16 d7, #3
vmax.s16 d2, d2, d9 // f = iclip_diff()
vqadd.s16 d4, d6, d2 // f + 4
vqadd.s16 d5, d7, d2 // f + 3
vqadd.s16 d4, d7, d2 // f + 4
vqadd.s16 d5, d6, d2 // f + 3
vmin.s16 d4, d4, d3 // imin(f + 4, 128 << bitdepth_min_8 - 1)
vmin.s16 d5, d5, d3 // imin(f + 3, 128 << bitdepth_min_8 - 1)
vshr.s16 d4, d4, #3 // f1

748
third_party/dav1d/src/arm/32/looprestoration.S vendored
View file

@ -28,15 +28,27 @@
#include "src/arm/asm.S"
#include "util.S"
const right_ext_mask_buf
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
right_ext_mask:
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst
// void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4],
// const pixel *src, ptrdiff_t stride,
// const int16_t fh[8], intptr_t w,
// int h, enum LrEdgeFlags edges);
function wiener_filter_h_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4}
ldrd r4, r5, [sp, #52]
ldrd r6, r7, [sp, #60]
vpush {q4-q7}
ldrd r4, r5, [sp, #100]
ldrd r6, r7, [sp, #108]
mov r8, r5
vld1.16 {q0}, [r4, :128]
movw r9, #(1 << 14) - (1 << 2)
@ -47,27 +59,19 @@ function wiener_filter_h_8bpc_neon, export=1
bic r10, r10, #7
lsl r10, r10, #1
// Clear the last unused element of q0, to allow filtering a single
// pixel with one plain vmul+vpadd.
mov r12, #0
vmov.16 d1[3], r12
// Set up pointers for reading/writing alternate rows
add r12, r0, r10
lsl r10, r10, #1
add lr, r2, r3
lsl r3, r3, #1
// Subtract the width from mid_stride
sub r10, r10, r5, lsl #1
// For w >= 8, we read (w+5)&~7+8 pixels, for w < 8 we read 16 pixels.
cmp r5, #8
add r11, r5, #13
// Subtract the aligned width from mid_stride
add r11, r5, #7
bic r11, r11, #7
bge 1f
mov r11, #16
1:
sub r10, r10, r11, lsl #1
// Subtract the number of pixels read from the source stride
add r11, r11, #8
sub r3, r3, r11
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
@ -131,47 +135,56 @@ function wiener_filter_h_8bpc_neon, export=1
ldrb r11, [r2, r9]
ldrb r9, [lr, r9]
// Fill q12/q13 with the right padding pixel
vdup.8 d24, r11
vdup.8 d26, r9
vmovl.u8 q12, d24
vmovl.u8 q13, d26
vdup.16 q12, r11
vdup.16 q13, r9
3: // !LR_HAVE_RIGHT
// If we'll have to pad the right edge we need to quit early here.
// Check whether we need to pad the right edge
cmp r5, #11
bge 4f // If w >= 11, all used input pixels are valid
cmp r5, #7
bge 5f // If w >= 7, we can filter 4 pixels
b 6f
// 1 <= w < 11, w+3 pixels valid in q1-q2. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// Insert padding in q1/2.h[w+3] onwards; fuse the +3 (*2) into the
// buffer pointer.
movrel_local r4, right_ext_mask, -6
sub r4, r4, r5, lsl #1
vld1.8 {q10, q11}, [r4]
vbit q1, q12, q10
vbit q2, q12, q11
vbit q8, q13, q10
vbit q9, q13, q11
4: // Loop horizontally
// This is tuned as some sort of compromise between Cortex A7, A8,
// A9 and A53.
vmul.s16 q3, q1, d0[0]
vext.8 q10, q1, q2, #2
vext.8 q11, q1, q2, #4
vmla.s16 q3, q10, d0[1]
vmla.s16 q3, q11, d0[2]
vext.8 q10, q1, q2, #6
vext.8 q11, q1, q2, #8
vmla.s16 q3, q10, d0[3]
vmla.s16 q3, q11, d1[0]
vext.8 q10, q1, q2, #10
vext.8 q11, q1, q2, #12
vmla.s16 q3, q10, d1[1]
vmla.s16 q3, q11, d1[2]
vext.8 q5, q1, q2, #8
vext.8 q10, q1, q2, #2
vext.8 q6, q1, q2, #10
vext.8 q7, q1, q2, #12
vext.8 q4, q1, q2, #6
vadd.i16 q5, q5, q11
vadd.i16 q6, q6, q10
vadd.i16 q7, q7, q1
vmul.s16 q3, q4, d0[3]
vmla.s16 q3, q5, d1[0]
vmla.s16 q3, q6, d1[1]
vmla.s16 q3, q7, d1[2]
vmul.s16 q10, q8, d0[0]
vext.8 q11, q8, q9, #2
vext.8 q4, q8, q9, #4
vmla.s16 q10, q11, d0[1]
vmla.s16 q10, q4, d0[2]
vext.8 q11, q8, q9, #6
vext.8 q4, q8, q9, #8
vmla.s16 q10, q11, d0[3]
vmla.s16 q10, q4, d1[0]
vext.8 q11, q8, q9, #10
vext.8 q6, q8, q9, #8
vext.8 q11, q8, q9, #2
vext.8 q7, q8, q9, #10
vadd.i16 q6, q6, q4
vext.8 q4, q8, q9, #12
vmla.s16 q10, q11, d1[1]
vext.8 q5, q8, q9, #6
vadd.i16 q7, q7, q11
vadd.i16 q4, q4, q8
vmul.s16 q10, q5, d0[3]
vmla.s16 q10, q6, d1[0]
vmla.s16 q10, q7, d1[1]
vmla.s16 q10, q4, d1[2]
vext.8 q1, q1, q2, #6
@ -186,10 +199,10 @@ function wiener_filter_h_8bpc_neon, export=1
vshr.s16 q10, q10, #3
vadd.s16 q3, q3, q15
vadd.s16 q10, q10, q15
subs r5, r5, #8
vst1.16 {q3}, [r0, :128]!
vst1.16 {q10}, [r12, :128]!
subs r5, r5, #8
ble 9f
tst r7, #2 // LR_HAVE_RIGHT
vmov q1, q2
@ -201,145 +214,6 @@ function wiener_filter_h_8bpc_neon, export=1
bne 4b // If we don't need to pad, just keep filtering.
b 3b // If we need to pad, check how many pixels we have left.
5: // Filter 4 pixels, 7 <= w < 11
.macro filter_4
vext.8 d20, d2, d3, #2
vext.8 d21, d2, d3, #4
vext.8 d22, d2, d3, #6
vext.8 d23, d3, d4, #2
vext.8 d8, d3, d4, #4
vmul.s16 d6, d2, d0[0]
vmla.s16 d6, d20, d0[1]
vmla.s16 d6, d21, d0[2]
vmla.s16 d6, d22, d0[3]
vmla.s16 d6, d3, d1[0]
vmla.s16 d6, d23, d1[1]
vmla.s16 d6, d8, d1[2]
vext.8 d20, d16, d17, #2
vext.8 d21, d16, d17, #4
vext.8 d22, d16, d17, #6
vext.8 d23, d17, d18, #2
vext.8 d8, d17, d18, #4
vmul.s16 d7, d16, d0[0]
vmla.s16 d7, d20, d0[1]
vmla.s16 d7, d21, d0[2]
vmla.s16 d7, d22, d0[3]
vmla.s16 d7, d17, d1[0]
vmla.s16 d7, d23, d1[1]
vmla.s16 d7, d8, d1[2]
vext.8 d22, d2, d3, #6
vext.8 d23, d16, d17, #6
vshl.s16 q11, q11, #7
vsub.s16 q11, q11, q14
vqadd.s16 q3, q3, q11
vshr.s16 q3, q3, #3
vadd.s16 q3, q3, q15
.endm
filter_4
vst1.16 {d6}, [r0, :64]!
vst1.16 {d7}, [r12, :64]!
subs r5, r5, #4 // 3 <= w < 7
vext.8 q1, q1, q2, #8
vext.8 q2, q2, q2, #8
vext.8 q8, q8, q9, #8
vext.8 q9, q9, q9, #8
6: // Pad the right edge and filter the last few pixels.
// w < 7, w+3 pixels valid in q1-q2
cmp r5, #5
blt 7f
bgt 8f
// w == 5, 8 pixels valid in q1, q2 invalid
vmov q2, q12
vmov q9, q13
b 88f
7: // 1 <= w < 5, 4-7 pixels valid in q1
sub r9, r5, #1
// r9 = (pixels valid - 4)
adr r11, L(variable_shift_tbl)
ldr r9, [r11, r9, lsl #2]
add r11, r11, r9
vmov q2, q12
vmov q9, q13
bx r11
.align 2
L(variable_shift_tbl):
.word 44f - L(variable_shift_tbl) + CONFIG_THUMB
.word 55f - L(variable_shift_tbl) + CONFIG_THUMB
.word 66f - L(variable_shift_tbl) + CONFIG_THUMB
.word 77f - L(variable_shift_tbl) + CONFIG_THUMB
44: // 4 pixels valid in d2/d16, fill d3/d17 with padding.
vmov d3, d4
vmov d17, d18
b 88f
// Shift q1 right, shifting out invalid pixels,
// shift q1 left to the original offset, shifting in padding pixels.
55: // 5 pixels valid
vext.8 q1, q1, q1, #10
vext.8 q1, q1, q2, #6
vext.8 q8, q8, q8, #10
vext.8 q8, q8, q9, #6
b 88f
66: // 6 pixels valid
vext.8 q1, q1, q1, #12
vext.8 q1, q1, q2, #4
vext.8 q8, q8, q8, #12
vext.8 q8, q8, q9, #4
b 88f
77: // 7 pixels valid
vext.8 q1, q1, q1, #14
vext.8 q1, q1, q2, #2
vext.8 q8, q8, q8, #14
vext.8 q8, q8, q9, #2
b 88f
8: // w > 5, w == 6, 9 pixels valid in q1-q2, 1 pixel valid in q2
vext.8 q2, q2, q2, #2
vext.8 q2, q2, q12, #14
vext.8 q9, q9, q9, #2
vext.8 q9, q9, q13, #14
88:
// w < 7, q1-q2 padded properly
cmp r5, #4
blt 888f
// w >= 4, filter 4 pixels
filter_4
vst1.16 {d6}, [r0, :64]!
vst1.16 {d7}, [r12, :64]!
subs r5, r5, #4 // 0 <= w < 4
vext.8 q1, q1, q2, #8
vext.8 q8, q8, q9, #8
beq 9f
888: // 1 <= w < 4, filter 1 pixel at a time
vmul.s16 q3, q1, q0
vmul.s16 q10, q8, q0
vpadd.s16 d6, d6, d7
vpadd.s16 d7, d20, d21
vdup.16 d24, d2[3]
vpadd.s16 d6, d6, d7
vdup.16 d25, d16[3]
vpadd.s16 d6, d6, d6
vtrn.16 d24, d25
vshl.s16 d24, d24, #7
vsub.s16 d24, d24, d28
vqadd.s16 d6, d6, d24
vshr.s16 d6, d6, #3
vadd.s16 d6, d6, d30
vst1.s16 {d6[0]}, [r0, :16]!
vst1.s16 {d6[1]}, [r12, :16]!
subs r5, r5, #1
vext.8 q1, q1, q2, #2
vext.8 q8, q8, q9, #2
bgt 888b
9:
subs r6, r6, #2
ble 0f
@ -351,9 +225,8 @@ L(variable_shift_tbl):
mov r5, r8
b 1b
0:
vpop {q4}
vpop {q4-q7}
pop {r4-r11,pc}
.purgem filter_4
endfunc
// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride,
@ -362,8 +235,9 @@ endfunc
// ptrdiff_t mid_stride);
function wiener_filter_v_8bpc_neon, export=1
push {r4-r7,lr}
ldrd r4, r5, [sp, #20]
ldrd r6, r7, [sp, #28]
vpush {q4-q6}
ldrd r4, r5, [sp, #68]
ldrd r6, r7, [sp, #76]
mov lr, r4
vld1.16 {q0}, [r5, :128]
@ -407,24 +281,21 @@ function wiener_filter_v_8bpc_neon, export=1
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
vmull.s16 q2, d16, d0[0]
vmlal.s16 q2, d18, d0[1]
vmlal.s16 q2, d20, d0[2]
vmlal.s16 q2, d22, d0[3]
vmlal.s16 q2, d24, d1[0]
vmlal.s16 q2, d26, d1[1]
vmlal.s16 q2, d28, d1[2]
vmull.s16 q3, d17, d0[0]
vmlal.s16 q3, d19, d0[1]
vmlal.s16 q3, d21, d0[2]
vmlal.s16 q3, d23, d0[3]
vmlal.s16 q3, d25, d1[0]
vmlal.s16 q3, d27, d1[1]
vmlal.s16 q3, d29, d1[2]
vadd.i16 q4, q10, q12
vadd.i16 q5, q9, q13
vadd.i16 q6, q8, q14
vmull.s16 q2, d22, d0[3]
vmlal.s16 q2, d8, d1[0]
vmlal.s16 q2, d10, d1[1]
vmlal.s16 q2, d12, d1[2]
vmull.s16 q3, d23, d0[3]
vmlal.s16 q3, d9, d1[0]
vmlal.s16 q3, d11, d1[1]
vmlal.s16 q3, d13, d1[2]
vqrshrun.s32 d4, q2, #11
vqrshrun.s32 d5, q3, #11
vqmovun.s16 d4, q2
vst1.8 {d4}, [r0], r1
vst1.8 {d4}, [r0, :64], r1
.if \compare
cmp r4, #4
.else
@ -529,147 +400,11 @@ function wiener_filter_v_8bpc_neon, export=1
b 1b
0:
vpop {q4-q6}
pop {r4-r7,pc}
.purgem filter
endfunc
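The rewritten horizontal and vertical Wiener filters above appear to rely on the filter taps being symmetric (fv[k] == fv[6 - k]): the symmetric row (or pixel) pairs are pre-added with vadd.i16 so each output needs four multiplies instead of seven. A scalar sketch of that identity, illustration only and not part of the patch (wiener_v_sym is a hypothetical name):
#include <stdint.h>
/* Assuming a symmetric 7-tap filter (fv[0]==fv[6], fv[1]==fv[5], fv[2]==fv[4]),
 * the 7-term dot product collapses to 4 multiplies on pre-added pairs, which
 * is what the vadd.i16 q4/q5/q6 + vmull/vmlal sequence above computes. */
static int wiener_v_sym(const int16_t fv[8], const int16_t row[7])
{
    return fv[3] * row[3] +
           fv[4] * (row[2] + row[4]) +
           fv[5] * (row[1] + row[5]) +
           fv[6] * (row[0] + row[6]);
}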
// void dav1d_copy_narrow_8bpc_neon(pixel *dst, ptrdiff_t stride,
// const pixel *src, int w, int h);
function copy_narrow_8bpc_neon, export=1
push {r4,lr}
ldr r4, [sp, #8]
adr r12, L(copy_narrow_tbl)
ldr r3, [r12, r3, lsl #2]
add r12, r12, r3
bx r12
.align 2
L(copy_narrow_tbl):
.word 0
.word 10f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 20f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 30f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 40f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 50f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 60f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 70f - L(copy_narrow_tbl) + CONFIG_THUMB
10:
add r3, r0, r1
lsl r1, r1, #1
18:
subs r4, r4, #8
blt 110f
vld1.8 {d0}, [r2, :64]!
vst1.8 {d0[0]}, [r0], r1
vst1.8 {d0[1]}, [r3], r1
vst1.8 {d0[2]}, [r0], r1
vst1.8 {d0[3]}, [r3], r1
vst1.8 {d0[4]}, [r0], r1
vst1.8 {d0[5]}, [r3], r1
vst1.8 {d0[6]}, [r0], r1
vst1.8 {d0[7]}, [r3], r1
ble 0f
b 18b
110:
add r4, r4, #8
asr r1, r1, #1
11:
subs r4, r4, #1
vld1.8 {d0[]}, [r2]!
vst1.8 {d0[0]}, [r0], r1
bgt 11b
0:
pop {r4,pc}
20:
add r3, r0, r1
lsl r1, r1, #1
24:
subs r4, r4, #4
blt 210f
vld1.16 {d0}, [r2, :64]!
vst1.16 {d0[0]}, [r0, :16], r1
vst1.16 {d0[1]}, [r3, :16], r1
vst1.16 {d0[2]}, [r0, :16], r1
vst1.16 {d0[3]}, [r3, :16], r1
ble 0f
b 24b
210:
add r4, r4, #4
asr r1, r1, #1
22:
subs r4, r4, #1
vld1.16 {d0[]}, [r2, :16]!
vst1.16 {d0[0]}, [r0, :16], r1
bgt 22b
0:
pop {r4,pc}
30:
ldrh r3, [r2]
ldrb r12, [r2, #2]
add r2, r2, #3
subs r4, r4, #1
strh r3, [r0]
strb r12, [r0, #2]
add r0, r0, r1
bgt 30b
pop {r4,pc}
40:
add r3, r0, r1
lsl r1, r1, #1
42:
subs r4, r4, #2
blt 41f
vld1.8 {d0}, [r2, :64]!
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[1]}, [r3, :32], r1
ble 0f
b 42b
41:
vld1.32 {d0[]}, [r2, :32]
vst1.32 {d0[0]}, [r0, :32]
0:
pop {r4,pc}
50:
ldr r3, [r2]
ldrb r12, [r2, #4]
add r2, r2, #5
subs r4, r4, #1
str r3, [r0]
strb r12, [r0, #4]
add r0, r0, r1
bgt 50b
pop {r4,pc}
60:
ldr r3, [r2]
ldrh r12, [r2, #4]
add r2, r2, #6
subs r4, r4, #1
str r3, [r0]
strh r12, [r0, #4]
add r0, r0, r1
bgt 60b
pop {r4,pc}
70:
ldr r3, [r2]
ldrh r12, [r2, #4]
ldrb lr, [r2, #6]
add r2, r2, #7
subs r4, r4, #1
str r3, [r0]
strh r12, [r0, #4]
strb lr, [r0, #6]
add r0, r0, r1
bgt 70b
pop {r4,pc}
endfunc
#define SUM_STRIDE (384+16)
#include "looprestoration_tmpl.S"
@ -694,25 +429,15 @@ function sgr_box3_h_8bpc_neon, export=1
mov r9, #(2*2*SUM_STRIDE) // double sum stride
// Subtract the aligned width from the output stride.
// With LR_HAVE_RIGHT, align to 8, without it, align to 4.
tst r7, #2 // LR_HAVE_RIGHT
bne 0f
// !LR_HAVE_RIGHT
add lr, r5, #3
bic lr, lr, #3
b 1f
0:
add lr, r5, #7
bic lr, lr, #7
1:
sub r9, r9, lr, lsl #1
// Store the width for the vertical loop
mov r8, r5
// Subtract the number of pixels read from the input from the stride
add lr, r5, #14
bic lr, lr, #7
add lr, lr, #8
sub r4, r4, lr
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
@ -781,34 +506,30 @@ function sgr_box3_h_8bpc_neon, export=1
// Restore r11 after using it for a temporary value
add r11, r1, #(2*SUM_STRIDE)
3: // !LR_HAVE_RIGHT
// If we'll have to pad the right edge we need to quit early here.
// Check whether we need to pad the right edge
cmp r5, #10
bge 4f // If w >= 10, all used input pixels are valid
cmp r5, #6
bge 5f // If w >= 6, we can filter 4 pixels
b 6f
// 1 <= w < 10, w pixels valid in q0. For w=9, this ends up called
// again; it's not strictly needed in those cases (we pad enough here),
// but keeping the code as simple as possible.
// Insert padding in q0/4.b[w] onwards
movrel_local lr, right_ext_mask
sub lr, lr, r5
vld1.8 {q13}, [lr]
vbit q0, q14, q13
vbit q4, q15, q13
// Update the precalculated squares
vmull.u8 q1, d0, d0
vmull.u8 q2, d1, d1
vmull.u8 q5, d8, d8
vmull.u8 q6, d9, d9
4: // Loop horizontally
.macro vaddl_u16_n dst1, dst2, src1, src2, src3, src4, w
vaddl.u16 \dst1, \src1, \src3
.if \w > 4
vaddl.u16 \dst2, \src2, \src4
.endif
.endm
.macro vaddw_u16_n dst1, dst2, src1, src2, w
vaddw.u16 \dst1, \dst1, \src1
.if \w > 4
vaddw.u16 \dst2, \dst2, \src2
.endif
.endm
.macro vadd_i32_n dst1, dst2, src1, src2, w
vadd.i32 \dst1, \dst1, \src1
.if \w > 4
vadd.i32 \dst2, \dst2, \src2
.endif
.endm
.macro add3 w
vext.8 d16, d0, d1, #1
vext.8 d17, d0, d1, #2
vext.8 d18, d8, d9, #1
@ -823,19 +544,22 @@ function sgr_box3_h_8bpc_neon, export=1
vext.8 q10, q5, q6, #2
vext.8 q11, q5, q6, #4
vaddl_u16_n q12, q13, d2, d3, d16, d17, \w
vaddw_u16_n q12, q13, d18, d19, \w
vaddl.u16 q12, d2, d16
vaddl.u16 q13, d3, d17
vaddw.u16 q12, q12, d18
vaddw.u16 q13, q13, d19
vaddl_u16_n q8, q9, d10, d11, d20, d21, \w
vaddw_u16_n q8, q9, d22, d23, \w
.endm
add3 8
vaddl.u16 q8, d10, d20
vaddl.u16 q9, d11, d21
vaddw.u16 q8, q8, d22
vaddw.u16 q9, q9, d23
subs r5, r5, #8
vst1.16 {q3}, [r1, :128]!
vst1.16 {q7}, [r11, :128]!
vst1.32 {q12, q13}, [r0, :128]!
vst1.32 {q8, q9}, [r10, :128]!
subs r5, r5, #8
ble 9f
tst r7, #2 // LR_HAVE_RIGHT
vld1.8 {d6}, [r3]!
@ -850,86 +574,6 @@ function sgr_box3_h_8bpc_neon, export=1
bne 4b // If we don't need to pad, just keep summing.
b 3b // If we need to pad, check how many pixels we have left.
5: // Produce 4 pixels, 6 <= w < 10
add3 4
vst1.16 {d6}, [r1, :64]!
vst1.16 {d14}, [r11, :64]!
vst1.32 {q12}, [r0, :128]!
vst1.32 {q8}, [r10, :128]!
subs r5, r5, #4 // 2 <= w < 6
vext.8 q0, q0, q0, #4
vext.8 q4, q4, q4, #4
6: // Pad the right edge and produce the last few pixels.
// 2 <= w < 6, 2-5 pixels valid in q0
sub lr, r5, #2
// lr = (pixels valid - 2)
adr r11, L(box3_variable_shift_tbl)
ldr lr, [r11, lr, lsl #2]
add r11, r11, lr
bx r11
.align 2
L(box3_variable_shift_tbl):
.word 22f - L(box3_variable_shift_tbl) + CONFIG_THUMB
.word 33f - L(box3_variable_shift_tbl) + CONFIG_THUMB
.word 44f - L(box3_variable_shift_tbl) + CONFIG_THUMB
.word 55f - L(box3_variable_shift_tbl) + CONFIG_THUMB
// Shift q0 right, shifting out invalid pixels,
// shift q0 left to the original offset, shifting in padding pixels.
22: // 2 pixels valid
vext.8 q0, q0, q0, #2
vext.8 q4, q4, q4, #2
vext.8 q0, q0, q14, #14
vext.8 q4, q4, q15, #14
b 88f
33: // 3 pixels valid
vext.8 q0, q0, q0, #3
vext.8 q4, q4, q4, #3
vext.8 q0, q0, q14, #13
vext.8 q4, q4, q15, #13
b 88f
44: // 4 pixels valid
vext.8 q0, q0, q0, #4
vext.8 q4, q4, q4, #4
vext.8 q0, q0, q14, #12
vext.8 q4, q4, q15, #12
b 88f
55: // 5 pixels valid
vext.8 q0, q0, q0, #5
vext.8 q4, q4, q4, #5
vext.8 q0, q0, q14, #11
vext.8 q4, q4, q15, #11
88:
// Restore r11 after using it for a temporary value above
add r11, r1, #(2*SUM_STRIDE)
vmull.u8 q1, d0, d0
vmull.u8 q2, d1, d1
vmull.u8 q5, d8, d8
vmull.u8 q6, d9, d9
add3 4
subs r5, r5, #4
vst1.16 {d6}, [r1, :64]!
vst1.16 {d14}, [r11, :64]!
vst1.32 {q12}, [r0, :128]!
vst1.32 {q8}, [r10, :128]!
ble 9f
vext.8 q0, q0, q0, #4
vext.8 q1, q1, q2, #8
vext.8 q4, q4, q4, #4
vext.8 q5, q5, q6, #8
// Only one needed pixel left, but do a normal 4 pixel
// addition anyway
add3 4
vst1.16 {d6}, [r1, :64]!
vst1.16 {d14}, [r11, :64]!
vst1.32 {q12}, [r0, :128]!
vst1.32 {q8}, [r10, :128]!
9:
subs r6, r6, #2
ble 0f
@ -945,7 +589,6 @@ L(box3_variable_shift_tbl):
0:
vpop {q4-q7}
pop {r4-r11,pc}
.purgem add3
endfunc
// void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
@ -968,23 +611,11 @@ function sgr_box5_h_8bpc_neon, export=1
mov r9, #(2*2*SUM_STRIDE) // double sum stride
// Subtract the aligned width from the output stride.
// With LR_HAVE_RIGHT, align to 8, without it, align to 4.
// Subtract the number of pixels read from the input from the stride.
tst r7, #2 // LR_HAVE_RIGHT
bne 0f
// !LR_HAVE_RIGHT
add lr, r5, #3
bic lr, lr, #3
add r8, r5, #13
b 1f
0:
add lr, r5, #7
bic lr, lr, #7
add r8, r5, #15
1:
sub r9, r9, lr, lsl #1
bic r8, r8, #7
sub r4, r4, r8
add lr, lr, #8
sub r4, r4, lr
// Store the width for the vertical loop
mov r8, r5
@ -1054,15 +685,31 @@ function sgr_box5_h_8bpc_neon, export=1
// Restore r11 after using it for a temporary value
add r11, r1, #(2*SUM_STRIDE)
3: // !LR_HAVE_RIGHT
// If we'll have to pad the right edge we need to quit early here.
// Check whether we need to pad the right edge
cmp r5, #11
bge 4f // If w >= 11, all used input pixels are valid
cmp r5, #7
bge 5f // If w >= 7, we can produce 4 pixels
b 6f
// 1 <= w < 11, w+1 pixels valid in q0. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// Insert padding in q0/4.b[w+1] onwards; fuse the +1 into the
// buffer pointer.
movrel_local lr, right_ext_mask, -1
sub lr, lr, r5
vld1.8 {q13}, [lr]
vbit q0, q14, q13
vbit q4, q15, q13
// Update the precalculated squares
vmull.u8 q1, d0, d0
vmull.u8 q2, d1, d1
vmull.u8 q5, d8, d8
vmull.u8 q6, d9, d9
4: // Loop horizontally
.macro add5 w
vext.8 d16, d0, d1, #1
vext.8 d17, d0, d1, #2
vext.8 d18, d0, d1, #3
@ -1084,35 +731,33 @@ function sgr_box5_h_8bpc_neon, export=1
vext.8 q9, q1, q2, #4
vext.8 q10, q1, q2, #6
vext.8 q11, q1, q2, #8
vaddl_u16_n q12, q13, d2, d3, d16, d17, \w
vaddl_u16_n q8, q9, d18, d19, d20, d21, \w
vaddw_u16_n q12, q13, d22, d23, \w
vadd_i32_n q12, q13, q8, q9, \w
vaddl.u16 q12, d2, d16
vaddl.u16 q13, d3, d17
vaddl.u16 q8, d18, d20
vaddl.u16 q9, d19, d21
vaddw.u16 q12, q12, d22
vaddw.u16 q13, q13, d23
vadd.i32 q12, q12, q8
vadd.i32 q13, q13, q9
vext.8 q8, q5, q6, #2
vext.8 q9, q5, q6, #4
vext.8 q10, q5, q6, #6
vext.8 q11, q5, q6, #8
.if \w > 4
vaddl_u16_n q1, q5, d10, d11, d16, d17, 8
vaddl_u16_n q8, q9, d18, d19, d20, d21, 8
vaddw_u16_n q1, q5, d22, d23, 8
vaddl.u16 q1, d10, d16
vaddl.u16 q5, d11, d17
vaddl.u16 q8, d18, d20
vaddl.u16 q9, d19, d21
vaddw.u16 q1, q1, d22
vaddw.u16 q5, q5, d23
vadd.i32 q10, q1, q8
vadd.i32 q11, q5, q9
.else
// Can't clobber q1/q5 if only doing 4 pixels
vaddl.u16 q8, d10, d16
vaddl.u16 q9, d18, d20
vaddw.u16 q8, q8, d22
vadd.i32 q10, q8, q9
.endif
.endm
add5 8
subs r5, r5, #8
vst1.16 {q3}, [r1, :128]!
vst1.16 {q7}, [r11, :128]!
vst1.32 {q12, q13}, [r0, :128]!
vst1.32 {q10, q11}, [r10, :128]!
subs r5, r5, #8
ble 9f
tst r7, #2 // LR_HAVE_RIGHT
vld1.8 {d6}, [r3]!
@ -1126,98 +771,6 @@ function sgr_box5_h_8bpc_neon, export=1
bne 4b // If we don't need to pad, just keep summing.
b 3b // If we need to pad, check how many pixels we have left.
5: // Produce 4 pixels, 7 <= w < 11
add5 4
vst1.16 {d6}, [r1, :64]!
vst1.16 {d14}, [r11, :64]!
vst1.32 {q12}, [r0, :128]!
vst1.32 {q10}, [r10, :128]!
subs r5, r5, #4 // 3 <= w < 7
vext.8 q0, q0, q0, #4
vext.8 q4, q4, q4, #4
6: // Pad the right edge and produce the last few pixels.
// w < 7, w+1 pixels valid in q0/q4
sub lr, r5, #1
// lr = pixels valid - 2
adr r11, L(box5_variable_shift_tbl)
ldr lr, [r11, lr, lsl #2]
add r11, r11, lr
bx r11
.align 2
L(box5_variable_shift_tbl):
.word 22f - L(box5_variable_shift_tbl) + CONFIG_THUMB
.word 33f - L(box5_variable_shift_tbl) + CONFIG_THUMB
.word 44f - L(box5_variable_shift_tbl) + CONFIG_THUMB
.word 55f - L(box5_variable_shift_tbl) + CONFIG_THUMB
.word 66f - L(box5_variable_shift_tbl) + CONFIG_THUMB
.word 77f - L(box5_variable_shift_tbl) + CONFIG_THUMB
// Shift q0 right, shifting out invalid pixels,
// shift q0 left to the original offset, shifting in padding pixels.
22: // 2 pixels valid
vext.8 q0, q0, q0, #2
vext.8 q4, q4, q4, #2
vext.8 q0, q0, q14, #14
vext.8 q4, q4, q15, #14
b 88f
33: // 3 pixels valid
vext.8 q0, q0, q0, #3
vext.8 q4, q4, q4, #3
vext.8 q0, q0, q14, #13
vext.8 q4, q4, q15, #13
b 88f
44: // 4 pixels valid
vext.8 q0, q0, q0, #4
vext.8 q4, q4, q4, #4
vext.8 q0, q0, q14, #12
vext.8 q4, q4, q15, #12
b 88f
55: // 5 pixels valid
vext.8 q0, q0, q0, #5
vext.8 q4, q4, q4, #5
vext.8 q0, q0, q14, #11
vext.8 q4, q4, q15, #11
b 88f
66: // 6 pixels valid
vext.8 q0, q0, q0, #6
vext.8 q4, q4, q4, #6
vext.8 q0, q0, q14, #10
vext.8 q4, q4, q15, #10
b 88f
77: // 7 pixels valid
vext.8 q0, q0, q0, #7
vext.8 q4, q4, q4, #7
vext.8 q0, q0, q14, #9
vext.8 q4, q4, q15, #9
88:
// Restore r11 after using it for a temporary value above
add r11, r1, #(2*SUM_STRIDE)
vmull.u8 q1, d0, d0
vmull.u8 q2, d1, d1
vmull.u8 q5, d8, d8
vmull.u8 q6, d9, d9
add5 4
subs r5, r5, #4
vst1.16 {d6}, [r1, :64]!
vst1.16 {d14}, [r11, :64]!
vst1.32 {q12}, [r0, :128]!
vst1.32 {q10}, [r10, :128]!
ble 9f
vext.8 q0, q0, q0, #4
vext.8 q1, q1, q2, #8
vext.8 q4, q4, q4, #4
vext.8 q5, q5, q6, #8
add5 4
vst1.16 {d6}, [r1, :64]!
vst1.16 {d14}, [r11, :64]!
vst1.32 {q12}, [r0, :128]!
vst1.32 {q10}, [r10, :128]!
9:
subs r6, r6, #2
ble 0f
@ -1233,7 +786,6 @@ L(box5_variable_shift_tbl):
0:
vpop {q4-q7}
pop {r4-r11,pc}
.purgem add5
endfunc
sgr_funcs 8

third_party/dav1d/src/arm/32/looprestoration16.S (vendored)

@ -28,6 +28,18 @@
#include "src/arm/asm.S"
#include "util.S"
const right_ext_mask_buf
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
right_ext_mask:
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst
// void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4],
// const pixel *src, ptrdiff_t stride,
// const int16_t fh[7], const intptr_t w,
@ -55,27 +67,19 @@ function wiener_filter_h_16bpc_neon, export=1
bic r10, r10, #7
lsl r10, r10, #1
// Clear the last unused element of q0, to allow filtering a single
// pixel with one plain vmul+vpadd.
mov r12, #0
vmov.16 d1[3], r12
// Set up pointers for reading/writing alternate rows
add r12, r0, r10
lsl r10, r10, #1
add lr, r2, r3
lsl r3, r3, #1
// Subtract the width from mid_stride
sub r10, r10, r5, lsl #1
// For w >= 8, we read (w+5)&~7+8 pixels, for w < 8 we read 16 pixels.
cmp r5, #8
add r11, r5, #13
// Subtract the aligned width from mid_stride
add r11, r5, #7
bic r11, r11, #7
bge 1f
mov r11, #16
1:
sub r10, r10, r11, lsl #1
// Subtract the number of pixels read from the source stride
add r11, r11, #8
sub r3, r3, r11, lsl #1
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
@ -143,54 +147,62 @@ function wiener_filter_h_16bpc_neon, export=1
vdup.16 q11, r11
vdup.16 q12, r9
3: // !LR_HAVE_RIGHT
// If we'll have to pad the right edge we need to quit early here.
// Check whether we need to pad the right edge
cmp r5, #11
bge 4f // If w >= 11, all used input pixels are valid
cmp r5, #7
bge 5f // If w >= 7, we can filter 4 pixels
b 6f
// 1 <= w < 11, w+3 pixels valid in q2-q3. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// Insert padding in q2/3.h[w+3] onwards; fuse the +3 (*2) into the
// buffer pointer.
movrel_local r4, right_ext_mask, -6
sub r4, r4, r5, lsl #1
vld1.8 {q9, q10}, [r4]
vbit q2, q11, q9
vbit q3, q11, q10
vbit q4, q12, q9
vbit q5, q12, q10
4: // Loop horizontally
vext.8 q8, q2, q3, #2
vext.8 q9, q2, q3, #4
vext.8 q10, q2, q3, #6
vmull.s16 q6, d4, d0[0]
vmlal.s16 q6, d16, d0[1]
vmlal.s16 q6, d18, d0[2]
vmlal.s16 q6, d20, d0[3]
vmull.s16 q7, d5, d0[0]
vmlal.s16 q7, d17, d0[1]
vmlal.s16 q7, d19, d0[2]
vmlal.s16 q7, d21, d0[3]
vext.8 q7, q2, q3, #4
vext.8 q8, q2, q3, #8
vext.8 q6, q2, q3, #2
vext.8 q9, q2, q3, #10
vext.8 q10, q2, q3, #12
vadd.i16 q8, q8, q7
vadd.i16 q9, q9, q6
vext.8 q6, q2, q3, #12
vext.8 q7, q2, q3, #6
vadd.i16 q2, q2, q6
vmull.s16 q6, d14, d0[3]
vmlal.s16 q6, d16, d1[0]
vmlal.s16 q6, d18, d1[1]
vmlal.s16 q6, d20, d1[2]
vmlal.s16 q6, d4, d1[2]
vmull.s16 q7, d15, d0[3]
vmlal.s16 q7, d17, d1[0]
vmlal.s16 q7, d19, d1[1]
vmlal.s16 q7, d21, d1[2]
vext.8 q2, q4, q5, #2
vext.8 q10, q4, q5, #6
vmull.s16 q8, d8, d0[0]
vmlal.s16 q8, d4, d0[1]
vmlal.s16 q8, d20, d0[3]
vmull.s16 q9, d9, d0[0]
vmlal.s16 q9, d5, d0[1]
vmlal.s16 q9, d21, d0[3]
vext.8 q2, q4, q5, #4
vmlal.s16 q7, d5, d1[2]
vext.8 q8, q4, q5, #4
vext.8 q10, q4, q5, #8
vmlal.s16 q8, d4, d0[2]
vmlal.s16 q8, d20, d1[0]
vmlal.s16 q9, d5, d0[2]
vmlal.s16 q9, d21, d1[0]
vext.8 q9, q4, q5, #2
vext.8 q2, q4, q5, #10
vext.8 q10, q4, q5, #12
vadd.i16 q10, q10, q8
vadd.i16 q2, q2, q9
vext.8 q8, q4, q5, #12
vext.8 q9, q4, q5, #6
vadd.i16 q4, q4, q8
vmull.s16 q8, d18, d0[3]
vmlal.s16 q8, d20, d1[0]
vmlal.s16 q8, d4, d1[1]
vmlal.s16 q8, d20, d1[2]
vmlal.s16 q8, d8, d1[2]
vmull.s16 q9, d19, d0[3]
vmlal.s16 q9, d21, d1[0]
vmlal.s16 q9, d5, d1[1]
vmlal.s16 q9, d21, d1[2]
vmlal.s16 q9, d9, d1[2]
vmvn.i16 q10, #0x8000 // 0x7fff = (1 << 15) - 1
vadd.i32 q6, q6, q14
@ -209,10 +221,10 @@ function wiener_filter_h_16bpc_neon, export=1
vmin.u16 q7, q7, q10
vsub.i16 q6, q6, q15
vsub.i16 q7, q7, q15
subs r5, r5, #8
vst1.16 {q6}, [r0, :128]!
vst1.16 {q7}, [r12, :128]!
subs r5, r5, #8
ble 9f
tst r7, #2 // LR_HAVE_RIGHT
vmov q2, q3
@ -222,148 +234,6 @@ function wiener_filter_h_16bpc_neon, export=1
bne 4b // If we don't need to pad, just keep filtering.
b 3b // If we need to pad, check how many pixels we have left.
5: // Filter 4 pixels, 7 <= w < 11
.macro filter_4
vext.8 d18, d4, d5, #6
vext.8 d16, d4, d5, #2
vext.8 d17, d4, d5, #4
vext.8 d19, d5, d6, #2
vext.8 d20, d5, d6, #4
vmull.s16 q6, d4, d0[0]
vmlal.s16 q6, d16, d0[1]
vmlal.s16 q6, d17, d0[2]
vmlal.s16 q6, d18, d0[3]
vmlal.s16 q6, d5, d1[0]
vmlal.s16 q6, d19, d1[1]
vmlal.s16 q6, d20, d1[2]
vext.8 d18, d8, d9, #6
vext.8 d16, d8, d9, #2
vext.8 d17, d8, d9, #4
vext.8 d19, d9, d10, #2
vext.8 d20, d9, d10, #4
vmull.s16 q7, d8, d0[0]
vmlal.s16 q7, d16, d0[1]
vmlal.s16 q7, d17, d0[2]
vmlal.s16 q7, d18, d0[3]
vmlal.s16 q7, d9, d1[0]
vmlal.s16 q7, d19, d1[1]
vmlal.s16 q7, d20, d1[2]
vmvn.i16 q10, #0x8000 // 0x7fff = (1 << 15) - 1
vadd.i32 q6, q6, q14
vadd.i32 q7, q7, q14
vrshl.s32 q6, q6, q13
vrshl.s32 q7, q7, q13
vqmovun.s32 d12, q6
vqmovun.s32 d13, q7
vmin.u16 q6, q6, q10
vsub.i16 q6, q6, q15
.endm
filter_4
vst1.16 {d12}, [r0, :64]!
vst1.16 {d13}, [r12, :64]!
subs r5, r5, #4 // 3 <= w < 7
vext.8 q2, q2, q3, #8
vext.8 q3, q3, q3, #8
vext.8 q4, q4, q5, #8
vext.8 q5, q5, q5, #8
6: // Pad the right edge and filter the last few pixels.
// w < 7, w+3 pixels valid in q2-q3
cmp r5, #5
blt 7f
bgt 8f
// w == 5, 8 pixels valid in q2, q3 invalid
vmov q3, q11
vmov q5, q12
b 88f
7: // 1 <= w < 5, 4-7 pixels valid in q2
sub r9, r5, #1
// r9 = (pixels valid - 4)
adr r11, L(variable_shift_tbl)
ldr r9, [r11, r9, lsl #2]
add r11, r11, r9
vmov q3, q11
vmov q5, q12
bx r11
.align 2
L(variable_shift_tbl):
.word 44f - L(variable_shift_tbl) + CONFIG_THUMB
.word 55f - L(variable_shift_tbl) + CONFIG_THUMB
.word 66f - L(variable_shift_tbl) + CONFIG_THUMB
.word 77f - L(variable_shift_tbl) + CONFIG_THUMB
44: // 4 pixels valid in q2/q4, fill the high half with padding.
vmov d5, d6
vmov d9, d10
b 88f
// Shift q2 right, shifting out invalid pixels,
// shift q2 left to the original offset, shifting in padding pixels.
55: // 5 pixels valid
vext.8 q2, q2, q2, #10
vext.8 q2, q2, q3, #6
vext.8 q4, q4, q4, #10
vext.8 q4, q4, q5, #6
b 88f
66: // 6 pixels valid
vext.8 q2, q2, q2, #12
vext.8 q2, q2, q3, #4
vext.8 q4, q4, q4, #12
vext.8 q4, q4, q5, #4
b 88f
77: // 7 pixels valid
vext.8 q2, q2, q2, #14
vext.8 q2, q2, q3, #2
vext.8 q4, q4, q4, #14
vext.8 q4, q4, q5, #2
b 88f
8: // w > 5, w == 6, 9 pixels valid in q2-q3, 1 pixel valid in q3
vext.8 q3, q3, q3, #2
vext.8 q3, q3, q11, #14
vext.8 q5, q5, q5, #2
vext.8 q5, q5, q12, #14
88:
// w < 7, q2-q3 padded properly
cmp r5, #4
blt 888f
// w >= 4, filter 4 pixels
filter_4
vst1.16 {d12}, [r0, :64]!
vst1.16 {d13}, [r12, :64]!
subs r5, r5, #4 // 0 <= w < 4
vext.8 q2, q2, q3, #8
vext.8 q4, q4, q5, #8
beq 9f
888: // 1 <= w < 4, filter 1 pixel at a time
vmull.s16 q6, d4, d0
vmull.s16 q7, d5, d1
vmull.s16 q8, d8, d0
vmull.s16 q9, d9, d1
vadd.i32 q6, q7
vadd.i32 q8, q9
vpadd.i32 d12, d12, d13
vpadd.i32 d13, d16, d17
vpadd.i32 d12, d12, d13
vadd.i32 d12, d12, d28
vmvn.i16 d20, #0x8000 // 0x7fff = (1 << 15) - 1
vrshl.s32 d12, d12, d26
vqmovun.s32 d12, q6
vmin.u16 d12, d12, d20
vsub.i16 d12, d12, d30
vst1.16 {d12[0]}, [r0, :16]!
vst1.16 {d12[1]}, [r12, :16]!
subs r5, r5, #1
vext.8 q2, q2, q3, #2
vext.8 q4, q4, q5, #2
bgt 888b
9:
subs r6, r6, #2
ble 0f
@ -377,7 +247,6 @@ L(variable_shift_tbl):
0:
vpop {q4-q7}
pop {r4-r11,pc}
.purgem filter_4
endfunc
// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, ptrdiff_t stride,
@ -457,7 +326,7 @@ function wiener_filter_v_16bpc_neon, export=1
vqmovun.s32 d4, q2
vqmovun.s32 d5, q3
vmin.u16 q2, q2, q5 // bitdepth_max
vst1.16 {q2}, [r0], r1
vst1.16 {q2}, [r0, :128], r1
.if \compare
cmp r4, #4
.else
@ -567,143 +436,6 @@ function wiener_filter_v_16bpc_neon, export=1
.purgem filter
endfunc
// void dav1d_copy_narrow_16bpc_neon(pixel *dst, ptrdiff_t stride,
// const pixel *src, int w, int h);
function copy_narrow_16bpc_neon, export=1
push {r4,lr}
ldr r4, [sp, #8]
adr r12, L(copy_narrow_tbl)
ldr r3, [r12, r3, lsl #2]
add r12, r12, r3
bx r12
.align 2
L(copy_narrow_tbl):
.word 0
.word 10f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 20f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 30f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 40f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 50f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 60f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 70f - L(copy_narrow_tbl) + CONFIG_THUMB
10:
add r3, r0, r1
lsl r1, r1, #1
18:
subs r4, r4, #8
blt 110f
vld1.16 {q0}, [r2, :128]!
vst1.16 {d0[0]}, [r0, :16], r1
vst1.16 {d0[1]}, [r3, :16], r1
vst1.16 {d0[2]}, [r0, :16], r1
vst1.16 {d0[3]}, [r3, :16], r1
vst1.16 {d1[0]}, [r0, :16], r1
vst1.16 {d1[1]}, [r3, :16], r1
vst1.16 {d1[2]}, [r0, :16], r1
vst1.16 {d1[3]}, [r3, :16], r1
ble 0f
b 18b
110:
add r4, r4, #8
asr r1, r1, #1
11:
subs r4, r4, #1
vld1.16 {d0[]}, [r2]!
vst1.16 {d0[0]}, [r0], r1
bgt 11b
0:
pop {r4,pc}
20:
add r3, r0, r1
lsl r1, r1, #1
24:
subs r4, r4, #4
blt 210f
vld1.32 {q0}, [r2, :128]!
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[1]}, [r3, :32], r1
vst1.32 {d1[0]}, [r0, :32], r1
vst1.32 {d1[1]}, [r3, :32], r1
ble 0f
b 24b
210:
add r4, r4, #4
asr r1, r1, #1
22:
subs r4, r4, #1
vld1.32 {d0[]}, [r2, :32]!
vst1.32 {d0[0]}, [r0, :32], r1
bgt 22b
0:
pop {r4,pc}
30:
ldr r3, [r2]
ldrh r12, [r2, #4]
add r2, r2, #6
subs r4, r4, #1
str r3, [r0]
strh r12, [r0, #4]
add r0, r0, r1
bgt 30b
pop {r4,pc}
40:
add r3, r0, r1
lsl r1, r1, #1
42:
subs r4, r4, #2
blt 41f
vld1.16 {q0}, [r2, :128]!
vst1.16 {d0}, [r0, :64], r1
vst1.16 {d1}, [r3, :64], r1
ble 0f
b 42b
41:
vld1.16 {d0}, [r2, :64]
vst1.16 {d0}, [r0, :64]
0:
pop {r4,pc}
50:
vld1.16 {d0}, [r2]
ldrh r12, [r2, #8]
add r2, r2, #10
subs r4, r4, #1
vst1.16 {d0}, [r0]
strh r12, [r0, #8]
add r0, r0, r1
bgt 50b
pop {r4,pc}
60:
vld1.16 {d0}, [r2]
ldr r12, [r2, #8]
add r2, r2, #12
subs r4, r4, #1
vst1.16 {d0}, [r0]
str r12, [r0, #8]
add r0, r0, r1
bgt 60b
pop {r4,pc}
70:
vld1.16 {d0}, [r2]
ldr r12, [r2, #8]
ldrh lr, [r2, #12]
add r2, r2, #14
subs r4, r4, #1
vst1.16 {d0}, [r0]
str r12, [r0, #8]
strh lr, [r0, #12]
add r0, r0, r1
bgt 70b
pop {r4,pc}
endfunc
#define SUM_STRIDE (384+16)
#include "looprestoration_tmpl.S"
@ -728,25 +460,15 @@ function sgr_box3_h_16bpc_neon, export=1
mov r9, #(2*2*SUM_STRIDE) // double sum stride
// Subtract the aligned width from the output stride.
// With LR_HAVE_RIGHT, align to 8, without it, align to 4.
tst r7, #2 // LR_HAVE_RIGHT
bne 0f
// !LR_HAVE_RIGHT
add lr, r5, #3
bic lr, lr, #3
b 1f
0:
add lr, r5, #7
bic lr, lr, #7
1:
sub r9, r9, lr, lsl #1
// Store the width for the vertical loop
mov r8, r5
// Subtract the number of pixels read from the input from the stride
add lr, r5, #14
bic lr, lr, #7
add lr, lr, #8
sub r4, r4, lr, lsl #1
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
@ -815,16 +537,26 @@ function sgr_box3_h_16bpc_neon, export=1
// Restore r11 after using it for a temporary value
add r11, r1, #(2*SUM_STRIDE)
3: // !LR_HAVE_RIGHT
// If we'll have to pad the right edge we need to quit early here.
// Check whether we need to pad the right edge
cmp r5, #10
bge 4f // If w >= 10, all used input pixels are valid
cmp r5, #6
bge 5f // If w >= 6, we can filter 4 pixels
b 6f
// 1 <= w < 10, w pixels valid in q0-q1. For w=9, this ends up called
// again; it's not strictly needed in those cases (we pad enough here),
// but keeping the code as simple as possible.
// Insert padding in q0/1.h[w] onwards
movrel_local lr, right_ext_mask
sub lr, lr, r5, lsl #1
vld1.8 {q12, q13}, [lr]
vbit q0, q14, q12
vbit q1, q14, q13
vbit q4, q15, q12
vbit q5, q15, q13
4: // Loop horizontally
.macro add3 w
.if \w > 4
vext.8 q8, q0, q1, #2
vext.8 q10, q4, q5, #2
vext.8 q9, q0, q1, #4
@ -833,16 +565,6 @@ function sgr_box3_h_16bpc_neon, export=1
vadd.i16 q3, q4, q10
vadd.i16 q2, q2, q9
vadd.i16 q3, q3, q11
.else
vext.8 d16, d0, d1, #2
vext.8 d20, d8, d9, #2
vext.8 d18, d0, d1, #4
vext.8 d22, d8, d9, #4
vadd.i16 d4, d0, d16
vadd.i16 d6, d8, d20
vadd.i16 d4, d4, d18
vadd.i16 d6, d6, d22
.endif
vmull.u16 q6, d0, d0
vmlal.u16 q6, d16, d16
@ -850,22 +572,18 @@ function sgr_box3_h_16bpc_neon, export=1
vmull.u16 q12, d8, d8
vmlal.u16 q12, d20, d20
vmlal.u16 q12, d22, d22
.if \w > 4
vmull.u16 q7, d1, d1
vmlal.u16 q7, d17, d17
vmlal.u16 q7, d19, d19
vmull.u16 q13, d9, d9
vmlal.u16 q13, d21, d21
vmlal.u16 q13, d23, d23
.endif
.endm
add3 8
subs r5, r5, #8
vst1.16 {q2}, [r1, :128]!
vst1.16 {q3}, [r11, :128]!
vst1.32 {q6, q7}, [r0, :128]!
vst1.32 {q12, q13}, [r10, :128]!
subs r5, r5, #8
ble 9f
tst r7, #2 // LR_HAVE_RIGHT
vmov q0, q1
@ -876,78 +594,6 @@ function sgr_box3_h_16bpc_neon, export=1
bne 4b // If we don't need to pad, just keep summing.
b 3b // If we need to pad, check how many pixels we have left.
5: // Produce 4 pixels, 6 <= w < 10
add3 4
vst1.16 {d4}, [r1, :64]!
vst1.16 {d6}, [r11, :64]!
vst1.32 {q6}, [r0, :128]!
vst1.32 {q12}, [r10, :128]!
subs r5, r5, #4 // 2 <= w < 6
vext.8 q0, q0, q1, #8
vext.8 q4, q4, q5, #8
6: // Pad the right edge and produce the last few pixels.
// 2 <= w < 6, 2-5 pixels valid in q0
sub lr, r5, #2
// lr = (pixels valid - 2)
adr r11, L(box3_variable_shift_tbl)
ldr lr, [r11, lr, lsl #2]
add r11, r11, lr
bx r11
.align 2
L(box3_variable_shift_tbl):
.word 22f - L(box3_variable_shift_tbl) + CONFIG_THUMB
.word 33f - L(box3_variable_shift_tbl) + CONFIG_THUMB
.word 44f - L(box3_variable_shift_tbl) + CONFIG_THUMB
.word 55f - L(box3_variable_shift_tbl) + CONFIG_THUMB
// Shift q0 right, shifting out invalid pixels,
// shift q0 left to the original offset, shifting in padding pixels.
22: // 2 pixels valid
vext.8 q0, q0, q0, #4
vext.8 q4, q4, q4, #4
vext.8 q0, q0, q14, #12
vext.8 q4, q4, q15, #12
b 88f
33: // 3 pixels valid
vext.8 q0, q0, q0, #6
vext.8 q4, q4, q4, #6
vext.8 q0, q0, q14, #10
vext.8 q4, q4, q15, #10
b 88f
44: // 4 pixels valid
vmov d1, d28
vmov d9, d30
b 88f
55: // 5 pixels valid
vext.8 q0, q0, q0, #10
vext.8 q4, q4, q4, #10
vext.8 q0, q0, q14, #6
vext.8 q4, q4, q15, #6
88:
// Restore r11 after using it for a temporary value above
add r11, r1, #(2*SUM_STRIDE)
add3 4
subs r5, r5, #4
vst1.16 {d4}, [r1, :64]!
vst1.16 {d6}, [r11, :64]!
vst1.32 {q6}, [r0, :128]!
vst1.32 {q12}, [r10, :128]!
ble 9f
vext.8 q0, q0, q0, #8
vext.8 q4, q4, q4, #8
// Only one needed pixel left, but do a normal 4 pixel
// addition anyway
add3 4
vst1.16 {d4}, [r1, :64]!
vst1.16 {d6}, [r11, :64]!
vst1.32 {q6}, [r0, :128]!
vst1.32 {q12}, [r10, :128]!
9:
subs r6, r6, #2
ble 0f
@ -963,7 +609,6 @@ L(box3_variable_shift_tbl):
0:
vpop {q4-q7}
pop {r4-r11,pc}
.purgem add3
endfunc
// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
@ -986,23 +631,11 @@ function sgr_box5_h_16bpc_neon, export=1
mov r9, #(2*2*SUM_STRIDE) // double sum stride
// Subtract the aligned width from the output stride.
// With LR_HAVE_RIGHT, align to 8, without it, align to 4.
// Subtract the number of pixels read from the input from the stride.
tst r7, #2 // LR_HAVE_RIGHT
bne 0f
// !LR_HAVE_RIGHT
add lr, r5, #3
bic lr, lr, #3
add r8, r5, #13
b 1f
0:
add lr, r5, #7
bic lr, lr, #7
add r8, r5, #15
1:
sub r9, r9, lr, lsl #1
bic r8, r8, #7
sub r4, r4, r8, lsl #1
add lr, lr, #8
sub r4, r4, lr, lsl #1
// Store the width for the vertical loop
mov r8, r5
@ -1072,16 +705,27 @@ function sgr_box5_h_16bpc_neon, export=1
// Restore r11 after using it for a temporary value
add r11, r1, #(2*SUM_STRIDE)
3: // !LR_HAVE_RIGHT
// If we'll have to pad the right edge we need to quit early here.
// Check whether we need to pad the right edge
cmp r5, #11
bge 4f // If w >= 11, all used input pixels are valid
cmp r5, #7
bge 5f // If w >= 7, we can produce 4 pixels
b 6f
// 1 <= w < 11, w+1 pixels valid in q0-q1. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// Insert padding in q0/1.h[w+1] onwards; fuse the +1 into the
// buffer pointer.
movrel_local lr, right_ext_mask, -2
sub lr, lr, r5, lsl #1
vld1.8 {q12, q13}, [lr]
vbit q0, q14, q12
vbit q1, q14, q13
vbit q4, q15, q12
vbit q5, q15, q13
4: // Loop horizontally
.macro add5 w
.if \w > 4
vext.8 q8, q0, q1, #2
vext.8 q10, q4, q5, #2
vext.8 q9, q0, q1, #4
@ -1090,16 +734,6 @@ function sgr_box5_h_16bpc_neon, export=1
vadd.i16 q3, q4, q10
vadd.i16 q2, q2, q9
vadd.i16 q3, q3, q11
.else
vext.8 d16, d0, d1, #2
vext.8 d20, d8, d9, #2
vext.8 d18, d0, d1, #4
vext.8 d22, d8, d9, #4
vadd.i16 d4, d0, d16
vadd.i16 d6, d8, d20
vadd.i16 d4, d4, d18
vadd.i16 d6, d6, d22
.endif
vmull.u16 q6, d0, d0
vmlal.u16 q6, d16, d16
@ -1107,16 +741,13 @@ function sgr_box5_h_16bpc_neon, export=1
vmull.u16 q12, d8, d8
vmlal.u16 q12, d20, d20
vmlal.u16 q12, d22, d22
.if \w > 4
vmull.u16 q7, d1, d1
vmlal.u16 q7, d17, d17
vmlal.u16 q7, d19, d19
vmull.u16 q13, d9, d9
vmlal.u16 q13, d21, d21
vmlal.u16 q13, d23, d23
.endif
.if \w > 4
vext.8 q8, q0, q1, #6
vext.8 q10, q4, q5, #6
vext.8 q9, q0, q1, #8
@ -1125,35 +756,22 @@ function sgr_box5_h_16bpc_neon, export=1
vadd.i16 q3, q3, q10
vadd.i16 q2, q2, q9
vadd.i16 q3, q3, q11
.else
vext.8 d16, d0, d1, #6
// d18 would be equal to d1; using d1 instead
vext.8 d20, d8, d9, #6
// d22 would be equal to d9; using d9 instead
vadd.i16 d4, d4, d16
vadd.i16 d6, d6, d20
vadd.i16 d4, d4, d1
vadd.i16 d6, d6, d9
.endif
vmlal.u16 q6, d16, d16
vmlal.u16 q6, d1, d1
vmlal.u16 q12, d20, d20
vmlal.u16 q12, d9, d9
.if \w > 4
vmlal.u16 q7, d17, d17
vmlal.u16 q7, d19, d19
vmlal.u16 q13, d21, d21
vmlal.u16 q13, d23, d23
.endif
.endm
add5 8
subs r5, r5, #8
vst1.16 {q2}, [r1, :128]!
vst1.16 {q3}, [r11, :128]!
vst1.32 {q6, q7}, [r0, :128]!
vst1.32 {q12, q13}, [r10, :128]!
subs r5, r5, #8
ble 9f
tst r7, #2 // LR_HAVE_RIGHT
vmov q0, q1
@ -1163,92 +781,6 @@ function sgr_box5_h_16bpc_neon, export=1
bne 4b // If we don't need to pad, just keep summing.
b 3b // If we need to pad, check how many pixels we have left.
5: // Produce 4 pixels, 7 <= w < 11
add5 4
vst1.16 {d4}, [r1, :64]!
vst1.16 {d6}, [r11, :64]!
vst1.32 {q6}, [r0, :128]!
vst1.32 {q12}, [r10, :128]!
subs r5, r5, #4 // 3 <= w < 7
vext.8 q0, q0, q1, #8
vext.8 q4, q4, q5, #8
6: // Pad the right edge and produce the last few pixels.
// w < 7, w+1 pixels valid in q0/q4
sub lr, r5, #1
// lr = pixels valid - 2
adr r11, L(box5_variable_shift_tbl)
ldr lr, [r11, lr, lsl #2]
vmov q1, q14
vmov q5, q15
add r11, r11, lr
bx r11
.align 2
L(box5_variable_shift_tbl):
.word 22f - L(box5_variable_shift_tbl) + CONFIG_THUMB
.word 33f - L(box5_variable_shift_tbl) + CONFIG_THUMB
.word 44f - L(box5_variable_shift_tbl) + CONFIG_THUMB
.word 55f - L(box5_variable_shift_tbl) + CONFIG_THUMB
.word 66f - L(box5_variable_shift_tbl) + CONFIG_THUMB
.word 77f - L(box5_variable_shift_tbl) + CONFIG_THUMB
// Shift q0 right, shifting out invalid pixels,
// shift q0 left to the original offset, shifting in padding pixels.
22: // 2 pixels valid
vext.8 q0, q0, q0, #4
vext.8 q4, q4, q4, #4
vext.8 q0, q0, q14, #12
vext.8 q4, q4, q15, #12
b 88f
33: // 3 pixels valid
vext.8 q0, q0, q0, #6
vext.8 q4, q4, q4, #6
vext.8 q0, q0, q14, #10
vext.8 q4, q4, q15, #10
b 88f
44: // 4 pixels valid
vmov d1, d28
vmov d9, d30
b 88f
55: // 5 pixels valid
vext.8 q0, q0, q0, #10
vext.8 q4, q4, q4, #10
vext.8 q0, q0, q14, #6
vext.8 q4, q4, q15, #6
b 88f
66: // 6 pixels valid
vext.8 q0, q0, q0, #12
vext.8 q4, q4, q4, #12
vext.8 q0, q0, q14, #4
vext.8 q4, q4, q15, #4
b 88f
77: // 7 pixels valid
vext.8 q0, q0, q0, #14
vext.8 q4, q4, q4, #14
vext.8 q0, q0, q14, #2
vext.8 q4, q4, q15, #2
88:
// Restore r11 after using it for a temporary value above
add r11, r1, #(2*SUM_STRIDE)
add5 4
subs r5, r5, #4
vst1.16 {d4}, [r1, :64]!
vst1.16 {d6}, [r11, :64]!
vst1.32 {q6}, [r0, :128]!
vst1.32 {q12}, [r10, :128]!
ble 9f
vext.8 q0, q0, q1, #8
vext.8 q4, q4, q5, #8
add5 4
vst1.16 {d4}, [r1, :64]!
vst1.16 {d6}, [r11, :64]!
vst1.32 {q6}, [r0, :128]!
vst1.32 {q12}, [r10, :128]!
9:
subs r6, r6, #2
ble 0f
@ -1264,7 +796,6 @@ L(box5_variable_shift_tbl):
0:
vpop {q4-q7}
pop {r4-r11,pc}
.purgem add5
endfunc
sgr_funcs 16

third_party/dav1d/src/arm/32/looprestoration_tmpl.S (vendored)

@ -389,8 +389,8 @@ function sgr_weighted1_\bpc\()bpc_neon, export=1
vrshrn.i32 d21, q11, #11
vqmovun.s16 d4, q2
vqmovun.s16 d20, q10
vst1.8 {d4}, [r0]!
vst1.8 {d20}, [r9]!
vst1.8 {d4}, [r0, :64]!
vst1.8 {d20}, [r9, :64]!
.else
vqrshrun.s32 d4, q2, #11
vqrshrun.s32 d5, q3, #11
@ -398,8 +398,8 @@ function sgr_weighted1_\bpc\()bpc_neon, export=1
vqrshrun.s32 d21, q11, #11
vmin.u16 q2, q2, q14
vmin.u16 q10, q10, q14
vst1.16 {q2}, [r0]!
vst1.16 {q10}, [r9]!
vst1.16 {q2}, [r0, :128]!
vst1.16 {q10}, [r9, :128]!
.endif
bgt 1b
@ -438,12 +438,12 @@ function sgr_weighted1_\bpc\()bpc_neon, export=1
vrshrn.i32 d4, q2, #11
vrshrn.i32 d5, q3, #11
vqmovun.s16 d2, q2
vst1.8 {d2}, [r0]!
vst1.8 {d2}, [r0, :64]!
.else
vqrshrun.s32 d4, q2, #11
vqrshrun.s32 d5, q3, #11
vmin.u16 q2, q2, q14
vst1.16 {q2}, [r0]!
vst1.16 {q2}, [r0, :128]!
.endif
bgt 2b
0:
@ -531,8 +531,8 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1
vrshrn.i32 d23, q8, #11
vqmovun.s16 d6, q3
vqmovun.s16 d22, q11
vst1.8 {d6}, [r0]!
vst1.8 {d22}, [r10]!
vst1.8 {d6}, [r0, :64]!
vst1.8 {d22}, [r10, :64]!
.else
vqrshrun.s32 d6, q3, #11
vqrshrun.s32 d7, q0, #11
@ -540,8 +540,8 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1
vqrshrun.s32 d23, q8, #11
vmin.u16 q3, q3, q14
vmin.u16 q11, q11, q14
vst1.16 {q3}, [r0]!
vst1.16 {q11}, [r10]!
vst1.16 {q3}, [r0, :128]!
vst1.16 {q11}, [r10, :128]!
.endif
bgt 1b
@ -586,12 +586,12 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1
vrshrn.i32 d6, q3, #11
vrshrn.i32 d7, q0, #11
vqmovun.s16 d6, q3
vst1.8 {d6}, [r0]!
vst1.8 {d6}, [r0, :64]!
.else
vqrshrun.s32 d6, q3, #11
vqrshrun.s32 d7, q0, #11
vmin.u16 q3, q3, q14
vst1.16 {q3}, [r0]!
vst1.16 {q3}, [r0, :128]!
.endif
bgt 1b
0:

118 third_party/dav1d/src/arm/32/mc.S (vendored)

@ -2966,8 +2966,8 @@ filter_fn prep, r0, r7, r1, r2, r3, r4, r5, r6, r8, r9, 6
.endm
.macro load_filter_coef dst, src, inc
vld1.8 {\dst}, [r12, :64]
add \src, \src, \inc
vld1.8 {\dst}, [r12, :64]
.endm
.macro load_filter_row dst, src, inc
@ -2978,72 +2978,57 @@ filter_fn prep, r0, r7, r1, r2, r3, r4, r5, r6, r8, r9, 6
function warp_filter_horz_neon
load_filter_ptr r5 // filter 0
vld1.16 {q7}, [r2], r3
vmov.i8 q6, #128
load_filter_coef d0, r5, r7 // filter 0
vmovl.u8 q6, d14 // original pixels
load_filter_row d2, r5, r7 // filter 1
vmovl.u8 q7, d15 // original pixels
load_filter_row d4, r5, r7 // filter 2
vmovl.s8 q0, d0 // filter 0
vext.8 q3, q6, q7, #2*1 // filter 1 pixels
load_filter_row d1, r5, r7 // filter 1
load_filter_row d2, r5, r7 // filter 2
load_filter_ptr r5 // filter 3
vmovl.s8 q1, d2 // filter 1
vmul.i16 q5, q6, q0 // filter 0 output
load_filter_coef d0, r5, r7 // filter 3
vmovl.s8 q2, d4 // filter 2
veor q7, q7, q6 // subtract by 128 to allow using vmull
load_filter_coef d3, r5, r7 // filter 3
vext.8 d12, d14, d15, #1 // filter 1 pixels
vext.8 d13, d14, d15, #2 // filter 2 pixels
load_filter_ptr r5 // filter 4
vext.8 q4, q6, q7, #2*2 // filter 2 pixels
vmul.i16 q3, q3, q1 // filter 1 output
load_filter_coef d2, r5, r7 // filter 4
vmul.i16 q4, q4, q2 // filter 2 output
vext.8 q2, q6, q7, #2*3 // filter 3 pixels
vmovl.s8 q0, d0 // filter 3
vpaddl.s16 q5, q5 // pixel 0 (4x32)
vpaddl.s16 q3, q3 // pixel 1 (4x32)
vmul.i16 q0, q2, q0 // filter 3 output
vmull.s8 q2, d14, d0 // filter 0 output
vmull.s8 q3, d12, d1 // filter 1 output
load_filter_coef d0, r5, r7 // filter 4
load_filter_ptr r5 // filter 5
vext.8 q2, q6, q7, #2*4 // filter 4 pixels
vmovl.s8 q1, d2 // filter 4
vpaddl.s16 q4, q4 // pixel 2 (4x32)
vpadd.s32 d10, d10, d11 // pixel 0 (2x32)
vpadd.s32 d11, d6, d7 // pixel 1 (2x32)
load_filter_coef d6, r5, r7 // filter 5
vmul.i16 q1, q2, q1 // filter 4 output
vpadd.s32 d8, d8, d9 // pixel 2 (2x32)
vext.8 d12, d14, d15, #3 // filter 3 pixels
vmull.s8 q4, d13, d2 // filter 2 output
vext.8 d13, d14, d15, #4 // filter 4 pixels
vpadd.i16 d4, d4, d5 // pixel 0 (4x16)
vpadd.i16 d5, d6, d7 // pixel 1 (4x16)
load_filter_coef d1, r5, r7 // filter 5
load_filter_ptr r5 // filter 6
vpaddl.s16 q0, q0 // pixel 3 (4x32)
vpadd.s32 d10, d10, d11 // pixel 0,1
vext.8 q2, q6, q7, #2*5 // filter 5 pixels
vmovl.s8 q3, d6 // filter 5
vpaddl.s16 q1, q1 // pixel 4 (4x32)
vpadd.s32 d9, d0, d1 // pixel 3 (2x32)
vmull.s8 q5, d12, d3 // filter 3 output
vext.8 d12, d14, d15, #5 // filter 5 pixels
vmull.s8 q3, d13, d0 // filter 4 output
load_filter_coef d0, r5, r7 // filter 6
vmul.i16 q2, q2, q3 // filter 5 output
vpadd.s32 d11, d8, d9 // pixel 2,3
vext.8 d13, d14, d15, #6 // filter 6 pixels
load_filter_ptr r5 // filter 7
vpaddl.s16 q2, q2 // pixel 5 (4x32)
vpadd.s32 d8, d2, d3 // pixel 4 (2x32)
vext.8 q3, q6, q7, #2*6 // filter 6 pixels
vmovl.s8 q0, d0 // filter 6
vpadd.s32 d9, d4, d5 // pixel 5 (2x32)
load_filter_coef d4, r5, r7 // filter 7
vpadd.s32 d8, d8, d9 // pixel 4,5
vext.8 q1, q6, q7, #2*7 // filter 7 pixels
vmovl.s8 q2, d4 // filter 7
vmul.i16 q3, q3, q0 // filter 6 output
vmul.i16 q1, q1, q2 // filter 7 output
vpadd.i16 d8, d8, d9 // pixel 2 (4x16)
vpadd.i16 d9, d10, d11 // pixel 3 (4x16)
vmull.s8 q5, d12, d1 // filter 5 output
load_filter_coef d1, r5, r7 // filter 7
vext.8 d14, d14, d15, #7 // filter 7 pixels
vpadd.i16 d6, d6, d7 // pixel 4 (4x16)
vpadd.i16 d10, d10, d11 // pixel 5 (4x16)
vmull.s8 q6, d13, d0 // filter 6 output
vmull.s8 q7, d14, d1 // filter 7 output
sub r5, r5, r7, lsl #3
vpaddl.s16 q3, q3 // pixel 6 (4x32)
vpaddl.s16 q1, q1 // pixel 7 (4x32)
vpadd.s32 d6, d6, d7 // pixel 6 (2x32)
vpadd.s32 d2, d2, d3 // pixel 7 (2x32)
vpadd.s32 d9, d6, d2 // pixel 6,7
vpadd.i16 d4, d4, d5 // pixel 0,1 (2x16)
vpadd.i16 d5, d8, d9 // pixel 2,3 (2x16)
vpadd.i16 d12, d12, d13 // pixel 6 (4x16)
vpadd.i16 d14, d14, d15 // pixel 7 (4x16)
vpadd.i16 d6, d6, d10 // pixel 4,5 (2x16)
vpadd.i16 d10, d12, d14 // pixel 6,7 (2x16)
vpadd.i16 d4, d4, d5 // pixel 0-3
vpadd.i16 d5, d6, d10 // pixel 4-7
add r5, r5, r8
vrshrn.s32 d10, q5, #3
vrshrn.s32 d11, q4, #3
bx lr
endfunc
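The reworked horizontal warp filter above recentres the pixels before multiplying: veor with 128 turns the unsigned bytes into signed values in [-128, 127] so vmull.s8 can be used, and the constant bias this introduces is removed later by the vmov.i16 q7, #128 (or #0x800 on the prep path) plus vadd in warp_affine_8x8. A scalar sketch of the recentred multiply, illustration only and assuming the warp filter taps sum to 128 as in AV1 (warp_h_recentred is a hypothetical name):
#include <stdint.h>
/* (px ^ 0x80) equals px - 128 for a uint8_t, so the recentred dot product is
 * the true dot product minus 128 * sum(taps); with taps summing to 128 this
 * is a fixed bias that a single add can undo after the final shifts. */
static int warp_h_recentred(const int8_t taps[8], const uint8_t px[8])
{
    int sum = 0;
    for (int i = 0; i < 8; i++)
        sum += taps[i] * (int8_t)(px[i] ^ 0x80); /* fits vmull.s8 */
    return sum; /* == true sum - 128 * (sum of taps) */
}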
@ -3074,23 +3059,23 @@ function warp_affine_8x8\t\()_8bpc_neon, export=1
add r6, r6, #512
bl warp_filter_horz_neon
vmov q8, q5
vrshr.s16 q8, q2, #3
bl warp_filter_horz_neon
vmov q9, q5
vrshr.s16 q9, q2, #3
bl warp_filter_horz_neon
vmov q10, q5
vrshr.s16 q10, q2, #3
bl warp_filter_horz_neon
vmov q11, q5
vrshr.s16 q11, q2, #3
bl warp_filter_horz_neon
vmov q12, q5
vrshr.s16 q12, q2, #3
bl warp_filter_horz_neon
vmov q13, q5
vrshr.s16 q13, q2, #3
bl warp_filter_horz_neon
vmov q14, q5
vrshr.s16 q14, q2, #3
1:
bl warp_filter_horz_neon
vmov q15, q5
vrshr.s16 q15, q2, #3
load_filter_row d8, r6, r9
load_filter_row d9, r6, r9
@ -3133,12 +3118,19 @@ function warp_affine_8x8\t\()_8bpc_neon, export=1
vmlal.s16 q1, d29, d5
vmlal.s16 q1, d31, d7
.ifb \t
vmov.i16 q7, #128
.else
vmov.i16 q7, #0x800
.endif
vmov q8, q9
vmov q9, q10
vqrshrn.s32 d0, q0, #\shift
vmov q10, q11
vqrshrn.s32 d1, q1, #\shift
vmov q11, q12
vadd.i16 q0, q0, q7
vmov q12, q13
.ifb \t
vqmovun.s16 d0, q0

2 third_party/dav1d/src/arm/32/mc16.S (vendored)

@ -3154,8 +3154,8 @@ filter_fn prep, r0, r8, r1, r2, r3, r4, r5, r6, r7, r9, r10
.endm
.macro load_filter_coef dst, src, inc
vld1.8 {\dst}, [r12, :64]
add \src, \src, \inc
vld1.8 {\dst}, [r12, :64]
.endm
.macro load_filter_row dst, src, inc

58 third_party/dav1d/src/arm/32/util.S (vendored)

@ -69,6 +69,56 @@
#endif
.endm
// This macro clobbers r7 (and r12 on windows) and stores data at the
// bottom of the stack; sp is the start of the space allocated that
// the caller can use.
.macro sub_sp_align space
#if CONFIG_THUMB
mov r7, sp
and r7, r7, #15
#else
and r7, sp, #15
#endif
sub sp, sp, r7
// Now the stack is aligned, store the amount of adjustment back
// on the stack, as we don't want to waste a register as frame
// pointer.
str r7, [sp, #-16]!
#ifdef _WIN32
.if \space > 8192
// Here, we'd need to touch two (or more) pages while decrementing
// the stack pointer.
.error "sub_sp_align doesn't support values over 8K at the moment"
.elseif \space > 4096
sub r7, sp, #4096
ldr r12, [r7]
sub r7, r7, #(\space - 4096)
mov sp, r7
.else
sub sp, sp, #\space
.endif
#else
.if \space >= 4096
sub sp, sp, #(\space)/4096*4096
.endif
.if (\space % 4096) != 0
sub sp, sp, #(\space)%4096
.endif
#endif
.endm
.macro add_sp_align space
.if \space >= 4096
add sp, sp, #(\space)/4096*4096
.endif
.if (\space % 4096) != 0
add sp, sp, #(\space)%4096
.endif
ldr r7, [sp], #16
// Add back the original stack adjustment
add sp, sp, r7
.endm
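In rough C terms, sub_sp_align aligns the stack pointer down to 16 bytes, stashes the alignment amount in the 16-byte slot it reserves just below, and then allocates the requested space (touching the intermediate page first on Windows); add_sp_align releases the space and re-applies the stored adjustment. A hypothetical model, illustration only and not part of the patch:
#include <stdint.h>
/* Model of sub_sp_align (not the real implementation): returns the new sp
 * after aligning, saving the pad amount, and reserving `space` bytes. */
static unsigned char *sub_sp_align_model(unsigned char *sp, unsigned space)
{
    unsigned pad = (unsigned)((uintptr_t)sp & 15); /* and r7, sp, #15     */
    sp -= pad;                                     /* sub sp, sp, r7      */
    sp -= 16;
    *(unsigned *)sp = pad;                         /* str r7, [sp, #-16]! */
    return sp - space;                             /* sub sp, sp, #space  */
}
The caller then uses the returned pointer as the start of its scratch area, and add_sp_align reads the stored pad back to restore the original sp.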
.macro transpose_8x8b q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7
vtrn.32 \q0, \q2
vtrn.32 \q1, \q3
@ -108,6 +158,14 @@
vtrn.8 \r2, \r3
.endm
.macro transpose_4x4s q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7
vswp \r1, \r4 // vtrn.64 \q0, \q2
vswp \r3, \r6 // vtrn.64 \q1, \q3
vtrn.32 \q0, \q1
vtrn.32 \q2, \q3
.endm
.macro transpose_4x4h q0, q1, r0, r1, r2, r3
vtrn.32 \q0, \q1

33 third_party/dav1d/src/arm/64/cdef.S (vendored)

@ -363,10 +363,8 @@ find_dir 8
neg v20.16b, v21.16b // -imin()
bsl v18.16b, v16.16b, v17.16b // constrain() = apply_sign()
bsl v22.16b, v20.16b, v21.16b // constrain() = apply_sign()
smlal v1.8h, v18.8b, v19.8b // sum += taps[k] * constrain()
smlal v1.8h, v22.8b, v19.8b // sum += taps[k] * constrain()
smlal2 v2.8h, v18.16b, v19.16b // sum += taps[k] * constrain()
smlal2 v2.8h, v22.16b, v19.16b // sum += taps[k] * constrain()
mla v1.16b, v18.16b, v19.16b // sum += taps[k] * constrain()
mla v2.16b, v22.16b, v19.16b // sum += taps[k] * constrain()
.endm
// void cdef_filterX_edged_8bpc_neon(pixel *dst, ptrdiff_t dst_stride,
@ -418,8 +416,11 @@ function cdef_filter\w\suffix\()_edged_8bpc_neon
ld1 {v0.s}[3], [x14] // px
.endif
movi v1.8h, #0 // sum
movi v2.8h, #0 // sum
// We need 9 bits or two 8-bit accumulators to fit the sum.
// Max of |sum| > 15*2*6(pri) + 4*4*3(sec) = 228.
// Start sum at -1 instead of 0 to help handle rounding later.
movi v1.16b, #255 // sum
movi v2.16b, #0 // sum
.if \min
mov v3.16b, v0.16b // min
mov v4.16b, v0.16b // max
@ -468,16 +469,16 @@ function cdef_filter\w\suffix\()_edged_8bpc_neon
.endif
b.ne 2b
sshr v5.8h, v1.8h, #15 // -(sum < 0)
sshr v6.8h, v2.8h, #15 // -(sum < 0)
add v1.8h, v1.8h, v5.8h // sum - (sum < 0)
add v2.8h, v2.8h, v6.8h // sum - (sum < 0)
srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4
srshr v2.8h, v2.8h, #4 // (8 + sum - (sum < 0)) >> 4
uaddw v1.8h, v1.8h, v0.8b // px + (8 + sum ...) >> 4
uaddw2 v2.8h, v2.8h, v0.16b // px + (8 + sum ...) >> 4
sqxtun v0.8b, v1.8h
sqxtun2 v0.16b, v2.8h
// Perform halving adds since the value won't fit otherwise.
// To handle the offset for negative values, use both halving w/ and w/o rounding.
srhadd v5.16b, v1.16b, v2.16b // sum >> 1
shadd v6.16b, v1.16b, v2.16b // (sum - 1) >> 1
sshr v1.16b, v5.16b, #7 // sum < 0
bsl v1.16b, v6.16b, v5.16b // (sum - (sum < 0)) >> 1
srshr v1.16b, v1.16b, #3 // (8 + sum - (sum < 0)) >> 4
usqadd v0.16b, v1.16b // px + (8 + sum ...) >> 4
.if \min
umin v0.16b, v0.16b, v4.16b
umax v0.16b, v0.16b, v3.16b // iclip(px + .., min, max)
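The per-pixel result the edged CDEF filter computes is unchanged: px + ((8 + sum - (sum < 0)) >> 4), clipped to the running min/max; only the arithmetic moved from a 16-bit accumulator to two 8-bit ones (|sum| can reach 15*2*6 + 4*4*3 = 180 + 48 = 228, hence the pair) combined with halving adds. A scalar sketch of the rounding, illustration only and not part of the patch (cdef_round is a hypothetical name):
#include <stdint.h>
/* The value the NEON code reconstructs with srhadd/shadd/srshr/usqadd:
 * round sum toward zero by 16, add to the centre pixel, clip to [lo, hi]. */
static uint8_t cdef_round(uint8_t px, int sum, uint8_t lo, uint8_t hi)
{
    int v = px + ((8 + sum - (sum < 0)) >> 4);
    if (v < lo) v = lo;
    if (v > hi) v = hi;
    return (uint8_t)v;
}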

95 third_party/dav1d/src/arm/64/cdef_tmpl.S (vendored)

@ -311,6 +311,30 @@ endconst
.endif
.endm
// Steps for loading and preparing each row
.macro dir_load_step1 s1, bpc
.if \bpc == 8
ld1 {\s1\().8b}, [x0], x1
.else
ld1 {\s1\().8h}, [x0], x1
.endif
.endm
.macro dir_load_step2 s1, bpc
.if \bpc == 8
usubl \s1\().8h, \s1\().8b, v31.8b
.else
ushl \s1\().8h, \s1\().8h, v8.8h
.endif
.endm
.macro dir_load_step3 s1, bpc
// Nothing for \bpc == 8
.if \bpc != 8
sub \s1\().8h, \s1\().8h, v31.8h
.endif
.endm
// int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride,
// unsigned *const var)
.macro find_dir bpc
@ -333,21 +357,15 @@ function cdef_find_dir_\bpc\()bpc_neon, export=1
movi v3.8h, #0 // v2-v3 sum_diag[1]
movi v5.8h, #0 // v4-v5 sum_hv[0-1]
movi v7.8h, #0 // v6-v7 sum_alt[0]
dir_load_step1 v26, \bpc // Setup first row early
movi v17.8h, #0 // v16-v17 sum_alt[1]
movi v18.8h, #0 // v18-v19 sum_alt[2]
dir_load_step2 v26, \bpc
movi v19.8h, #0
dir_load_step3 v26, \bpc
movi v21.8h, #0 // v20-v21 sum_alt[3]
.irpc i, 01234567
.if \bpc == 8
ld1 {v26.8b}, [x0], x1
usubl v26.8h, v26.8b, v31.8b
.else
ld1 {v26.8h}, [x0], x1
ushl v26.8h, v26.8h, v8.8h
sub v26.8h, v26.8h, v31.8h
.endif
addv h25, v26.8h // [y]
rev64 v27.8h, v26.8h
addp v28.8h, v26.8h, v30.8h // [(x >> 1)]
@ -355,30 +373,6 @@ function cdef_find_dir_\bpc\()bpc_neon, export=1
ext v27.16b, v27.16b, v27.16b, #8 // [-x]
rev64 v29.4h, v28.4h // [-(x >> 1)]
ins v4.h[\i], v25.h[0] // sum_hv[0]
.if \i == 0
mov v0.16b, v26.16b // sum_diag[0]
mov v2.16b, v27.16b // sum_diag[1]
mov v6.16b, v28.16b // sum_alt[0]
mov v16.16b, v29.16b // sum_alt[1]
.else
ext v22.16b, v30.16b, v26.16b, #(16-2*\i)
ext v23.16b, v26.16b, v30.16b, #(16-2*\i)
ext v24.16b, v30.16b, v27.16b, #(16-2*\i)
ext v25.16b, v27.16b, v30.16b, #(16-2*\i)
add v0.8h, v0.8h, v22.8h // sum_diag[0]
add v1.8h, v1.8h, v23.8h // sum_diag[0]
add v2.8h, v2.8h, v24.8h // sum_diag[1]
add v3.8h, v3.8h, v25.8h // sum_diag[1]
ext v22.16b, v30.16b, v28.16b, #(16-2*\i)
ext v23.16b, v28.16b, v30.16b, #(16-2*\i)
ext v24.16b, v30.16b, v29.16b, #(16-2*\i)
ext v25.16b, v29.16b, v30.16b, #(16-2*\i)
add v6.8h, v6.8h, v22.8h // sum_alt[0]
add v7.4h, v7.4h, v23.4h // sum_alt[0]
add v16.8h, v16.8h, v24.8h // sum_alt[1]
add v17.4h, v17.4h, v25.4h // sum_alt[1]
.endif
.if \i < 6
ext v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2)))
ext v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2)))
@ -397,6 +391,41 @@ function cdef_find_dir_\bpc\()bpc_neon, export=1
add v20.8h, v20.8h, v24.8h // sum_alt[3]
add v21.4h, v21.4h, v25.4h // sum_alt[3]
.endif
.if \i == 0
mov v0.16b, v26.16b // sum_diag[0]
dir_load_step1 v26, \bpc
mov v2.16b, v27.16b // sum_diag[1]
dir_load_step2 v26, \bpc
mov v6.16b, v28.16b // sum_alt[0]
dir_load_step3 v26, \bpc
mov v16.16b, v29.16b // sum_alt[1]
.else
ext v22.16b, v30.16b, v26.16b, #(16-2*\i)
ext v23.16b, v26.16b, v30.16b, #(16-2*\i)
ext v24.16b, v30.16b, v27.16b, #(16-2*\i)
ext v25.16b, v27.16b, v30.16b, #(16-2*\i)
.if \i != 7 // Nothing to load for the final row
dir_load_step1 v26, \bpc // Start setting up the next row early.
.endif
add v0.8h, v0.8h, v22.8h // sum_diag[0]
add v1.8h, v1.8h, v23.8h // sum_diag[0]
add v2.8h, v2.8h, v24.8h // sum_diag[1]
add v3.8h, v3.8h, v25.8h // sum_diag[1]
.if \i != 7
dir_load_step2 v26, \bpc
.endif
ext v22.16b, v30.16b, v28.16b, #(16-2*\i)
ext v23.16b, v28.16b, v30.16b, #(16-2*\i)
ext v24.16b, v30.16b, v29.16b, #(16-2*\i)
ext v25.16b, v29.16b, v30.16b, #(16-2*\i)
.if \i != 7
dir_load_step3 v26, \bpc
.endif
add v6.8h, v6.8h, v22.8h // sum_alt[0]
add v7.4h, v7.4h, v23.4h // sum_alt[0]
add v16.8h, v16.8h, v24.8h // sum_alt[1]
add v17.4h, v17.4h, v25.4h // sum_alt[1]
.endif
.endr
movi v31.4s, #105

18 third_party/dav1d/src/arm/64/ipred.S (vendored)

@ -502,9 +502,9 @@ L(ipred_dc_h4):
ld1 {v0.s}[0], [x2], #4
ins v0.s[1], wzr
uaddlv h0, v0.8b
add x2, x2, #1
br x3
L(ipred_dc_w4):
add x2, x2, #1
ld1 {v1.s}[0], [x2]
ins v1.s[1], wzr
add v0.4h, v0.4h, v16.4h
@ -534,9 +534,9 @@ L(ipred_dc_w4):
L(ipred_dc_h8):
ld1 {v0.8b}, [x2], #8
uaddlv h0, v0.8b
add x2, x2, #1
br x3
L(ipred_dc_w8):
add x2, x2, #1
ld1 {v1.8b}, [x2]
add v0.4h, v0.4h, v16.4h
uaddlv h1, v1.8b
@ -565,9 +565,9 @@ L(ipred_dc_w8):
L(ipred_dc_h16):
ld1 {v0.16b}, [x2], #16
uaddlv h0, v0.16b
add x2, x2, #1
br x3
L(ipred_dc_w16):
add x2, x2, #1
ld1 {v1.16b}, [x2]
add v0.4h, v0.4h, v16.4h
uaddlv h1, v1.16b
@ -597,10 +597,10 @@ L(ipred_dc_h32):
ld1 {v0.16b, v1.16b}, [x2], #32
uaddlv h0, v0.16b
uaddlv h1, v1.16b
add x2, x2, #1
add v0.4h, v0.4h, v1.4h
br x3
L(ipred_dc_w32):
add x2, x2, #1
ld1 {v1.16b, v2.16b}, [x2]
add v0.4h, v0.4h, v16.4h
uaddlv h1, v1.16b
@ -637,10 +637,10 @@ L(ipred_dc_h64):
uaddlv h3, v3.16b
add v0.4h, v0.4h, v1.4h
add v2.4h, v2.4h, v3.4h
add x2, x2, #1
add v0.4h, v0.4h, v2.4h
br x3
L(ipred_dc_w64):
add x2, x2, #1
ld1 {v1.16b, v2.16b, v3.16b, v4.16b}, [x2]
add v0.4h, v0.4h, v16.4h
uaddlv h1, v1.16b
@ -1834,10 +1834,10 @@ function ipred_cfl_8bpc_neon, export=1
L(ipred_cfl_h4):
ld1 {v0.s}[0], [x2], #4
ins v0.s[1], wzr
add x2, x2, #1
uaddlv h0, v0.8b
br x9
L(ipred_cfl_w4):
add x2, x2, #1
ld1 {v2.s}[0], [x2]
ins v2.s[1], wzr
add v0.4h, v0.4h, v16.4h
@ -1860,9 +1860,9 @@ L(ipred_cfl_w4):
L(ipred_cfl_h8):
ld1 {v0.8b}, [x2], #8
uaddlv h0, v0.8b
add x2, x2, #1
br x9
L(ipred_cfl_w8):
add x2, x2, #1
ld1 {v2.8b}, [x2]
add v0.4h, v0.4h, v16.4h
uaddlv h2, v2.8b
@ -1884,9 +1884,9 @@ L(ipred_cfl_w8):
L(ipred_cfl_h16):
ld1 {v0.16b}, [x2], #16
uaddlv h0, v0.16b
add x2, x2, #1
br x9
L(ipred_cfl_w16):
add x2, x2, #1
ld1 {v2.16b}, [x2]
add v0.4h, v0.4h, v16.4h
uaddlv h2, v2.16b
@ -1909,10 +1909,10 @@ L(ipred_cfl_h32):
ld1 {v2.16b, v3.16b}, [x2], #32
uaddlv h2, v2.16b
uaddlv h3, v3.16b
add x2, x2, #1
add v0.4h, v2.4h, v3.4h
br x9
L(ipred_cfl_w32):
add x2, x2, #1
ld1 {v2.16b, v3.16b}, [x2]
add v0.4h, v0.4h, v16.4h
uaddlv h2, v2.16b

21 third_party/dav1d/src/arm/64/ipred16.S (vendored)

@ -562,9 +562,9 @@ function ipred_dc_16bpc_neon, export=1
L(ipred_dc_h4):
ld1 {v0.4h}, [x2], #8
uaddlv s0, v0.4h
add x2, x2, #2
br x3
L(ipred_dc_w4):
add x2, x2, #2
ld1 {v1.4h}, [x2]
add v0.2s, v0.2s, v16.2s
uaddlv s1, v1.4h
@ -594,9 +594,9 @@ L(ipred_dc_w4):
L(ipred_dc_h8):
ld1 {v0.8h}, [x2], #16
uaddlv s0, v0.8h
add x2, x2, #2
br x3
L(ipred_dc_w8):
add x2, x2, #2
ld1 {v1.8h}, [x2]
add v0.2s, v0.2s, v16.2s
uaddlv s1, v1.8h
@ -626,10 +626,10 @@ L(ipred_dc_w8):
L(ipred_dc_h16):
ld1 {v0.8h, v1.8h}, [x2], #32
addp v0.8h, v0.8h, v1.8h
add x2, x2, #2
uaddlv s0, v0.8h
br x3
L(ipred_dc_w16):
add x2, x2, #2
ld1 {v1.8h, v2.8h}, [x2]
add v0.2s, v0.2s, v16.2s
addp v1.8h, v1.8h, v2.8h
@ -663,10 +663,10 @@ L(ipred_dc_h32):
addp v0.8h, v0.8h, v1.8h
addp v2.8h, v2.8h, v3.8h
addp v0.8h, v0.8h, v2.8h
add x2, x2, #2
uaddlv s0, v0.8h
br x3
L(ipred_dc_w32):
add x2, x2, #2
ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2]
add v0.2s, v0.2s, v16.2s
addp v1.8h, v1.8h, v2.8h
@ -709,10 +709,10 @@ L(ipred_dc_h64):
addp v0.8h, v0.8h, v2.8h
addp v4.8h, v4.8h, v6.8h
addp v0.8h, v0.8h, v4.8h
add x2, x2, #2
uaddlv s0, v0.8h
br x3
L(ipred_dc_w64):
add x2, x2, #2
ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64
add v0.2s, v0.2s, v16.2s
addp v1.8h, v1.8h, v2.8h
@ -1382,7 +1382,9 @@ function ipred_filter_\bpc\()bpc_neon
sxtl v21.8h, v21.8b
sxtl v22.8h, v22.8b
dup v31.8h, w8
.if \bpc == 10
movi v30.8h, #0
.endif
br x5
40:
ldur d0, [x2, #2] // top (0-3)
@ -1421,7 +1423,6 @@ function ipred_filter_\bpc\()bpc_neon
smin v2.8h, v2.8h, v31.8h
subs w4, w4, #2
st1 {v2.d}[0], [x0], x1
uxtl v0.8h, v2.8b
ext v0.16b, v2.16b, v2.16b, #8 // move top from [4-7] to [0-3]
st1 {v2.d}[1], [x6], x1
b.gt 4b
@ -2143,9 +2144,9 @@ function ipred_cfl_16bpc_neon, export=1
L(ipred_cfl_h4):
ld1 {v0.4h}, [x2], #8
uaddlv s0, v0.4h
add x2, x2, #2
br x9
L(ipred_cfl_w4):
add x2, x2, #2
ld1 {v2.4h}, [x2]
add v0.2s, v0.2s, v16.2s
uaddlv s2, v2.4h
@ -2168,9 +2169,9 @@ L(ipred_cfl_w4):
L(ipred_cfl_h8):
ld1 {v0.8h}, [x2], #16
uaddlv s0, v0.8h
add x2, x2, #2
br x9
L(ipred_cfl_w8):
add x2, x2, #2
ld1 {v2.8h}, [x2]
add v0.2s, v0.2s, v16.2s
uaddlv s2, v2.8h
@ -2193,10 +2194,10 @@ L(ipred_cfl_w8):
L(ipred_cfl_h16):
ld1 {v2.8h, v3.8h}, [x2], #32
addp v0.8h, v2.8h, v3.8h
add x2, x2, #2
uaddlv s0, v0.8h
br x9
L(ipred_cfl_w16):
add x2, x2, #2
ld1 {v2.8h, v3.8h}, [x2]
add v0.2s, v0.2s, v16.2s
addp v2.8h, v2.8h, v3.8h
@ -2222,10 +2223,10 @@ L(ipred_cfl_h32):
addp v2.8h, v2.8h, v3.8h
addp v4.8h, v4.8h, v5.8h
addp v0.8h, v2.8h, v4.8h
add x2, x2, #2
uaddlv s0, v0.8h
br x9
L(ipred_cfl_w32):
add x2, x2, #2
ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
add v0.4s, v0.4s, v16.4s
addp v2.8h, v2.8h, v3.8h

31
third_party/dav1d/src/arm/64/itx.S (vendored)

@ -718,7 +718,7 @@ def_fn_4x4 identity, flipadst
rshrn_sz \r7, v4, v5, #12, \sz // t7a
smull_smlal v2, v3, \r5, \r3, v0.h[7], v0.h[6], \sz // -> t6a
rshrn_sz \r3, v6, v7, #12, \sz // t5a
rshrn_sz \r5, v2, v3, #12, \sz // taa
rshrn_sz \r5, v2, v3, #12, \sz // t6a
sqadd v2\sz, \r1\sz, \r3\sz // t4
sqsub \r1\sz, \r1\sz, \r3\sz // t5a
@ -1085,7 +1085,7 @@ def_fns_48 8, 4
rshrn_sz v4, v4, v5, #12, \sz // t11
rshrn_sz v5, v6, v7, #12, \sz // t12
smull_smlal v6, v7, v25, v21, v0.h[0], v0.h[0], \sz // -> t10a
smull_smlal v6, v7, v25, v21, v0.h[0], v0.h[0], \sz // -> t13a
rshrn_sz v2, v2, v3, #12, \sz // t10a
rshrn_sz v3, v6, v7, #12, \sz // t13a
@ -3002,29 +3002,6 @@ function inv_txfm_add_vert_dct_8x64_neon
br x14
endfunc
.macro sub_sp space
#ifdef _WIN32
.if \space > 8192
// Here, we'd need to touch two (or more) pages while decrementing
// the stack pointer.
.error "sub_sp_align doesn't support values over 8K at the moment"
.elseif \space > 4096
sub x16, sp, #4096
ldr xzr, [x16]
sub sp, x16, #(\space - 4096)
.else
sub sp, sp, #\space
.endif
#else
.if \space >= 4096
sub sp, sp, #(\space)/4096*4096
.endif
.if (\space % 4096) != 0
sub sp, sp, #(\space)%4096
.endif
#endif
.endm
function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1
idct_dc 64, 64, 2
@ -3149,7 +3126,9 @@ function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1
mov w8, #(32 - \i)
cmp w3, w12
b.lt 1f
.if \i < 24
ldrh w12, [x13], #2
.endif
.endif
add x7, x2, #(\i*2)
mov x8, #32*2
@ -3254,7 +3233,9 @@ function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1
mov w8, #(32 - \i)
cmp w3, w12
b.lt 1f
.if \i < 24
ldrh w12, [x13], #2
.endif
.endif
add x7, x2, #(\i*2)
mov x8, #32*2

330
third_party/dav1d/src/arm/64/itx16.S (vendored)

@ -124,7 +124,7 @@ endconst
.endif
.endm
.macro load_add_store load, shift, addsrc, adddst, max, min, store, dst, src, shiftbits=4
.macro load_add_store load, shift, addsrc, adddst, min, store, dst, src, shiftbits=4
.ifnb \load
ld1 {\load}, [\src], x1
.endif
@ -132,10 +132,7 @@ endconst
srshr \shift, \shift, #\shiftbits
.endif
.ifnb \addsrc
sqadd \adddst, \adddst, \addsrc
.endif
.ifnb \max
smax \max, \max, v6.8h
usqadd \adddst, \addsrc
.endif
.ifnb \min
smin \min, \min, v7.8h
@ -146,63 +143,57 @@ endconst
.endm
.macro load_add_store_8x16 dst, src
mov \src, \dst
movi v6.8h, #0
mvni v7.8h, #0xfc, lsl #8 // 0x3ff
load_add_store v2.8h, v16.8h, , , , , , \dst, \src
load_add_store v3.8h, v17.8h, , , , , , \dst, \src
load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src
load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src
load_add_store v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src
load_add_store v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src
load_add_store v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src
load_add_store v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src
load_add_store v2.8h, v24.8h, v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src
load_add_store v3.8h, v25.8h, v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src
load_add_store v4.8h, v26.8h, v2.8h, v24.8h, v23.8h, v22.8h, v21.8h, \dst, \src
load_add_store v5.8h, v27.8h, v3.8h, v25.8h, v24.8h, v23.8h, v22.8h, \dst, \src
load_add_store v2.8h, v28.8h, v4.8h, v26.8h, v25.8h, v24.8h, v23.8h, \dst, \src
load_add_store v3.8h, v29.8h, v5.8h, v27.8h, v26.8h, v25.8h, v24.8h, \dst, \src
load_add_store v4.8h, v30.8h, v2.8h, v28.8h, v27.8h, v26.8h, v25.8h, \dst, \src
load_add_store v5.8h, v31.8h, v3.8h, v29.8h, v28.8h, v27.8h, v26.8h, \dst, \src
load_add_store , , v4.8h, v30.8h, v29.8h, v28.8h, v27.8h, \dst, \src
load_add_store , , v5.8h, v31.8h, v30.8h, v29.8h, v28.8h, \dst, \src
load_add_store , , , , v31.8h, v30.8h, v29.8h, \dst, \src
load_add_store , , , , , v31.8h, v30.8h, \dst, \src
load_add_store , , , , , , v31.8h, \dst, \src
load_add_store v2.8h, v16.8h, , , , , \dst, \src
load_add_store v3.8h, v17.8h, , , , , \dst, \src
load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src
load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src
load_add_store v16.8h, v20.8h, v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src
load_add_store v17.8h, v21.8h, v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src
load_add_store v18.8h, v22.8h, v20.8h, v16.8h, v5.8h, v4.8h, \dst, \src
load_add_store v19.8h, v23.8h, v21.8h, v17.8h, v16.8h, v5.8h, \dst, \src
load_add_store v20.8h, v24.8h, v22.8h, v18.8h, v17.8h, v16.8h, \dst, \src
load_add_store v21.8h, v25.8h, v23.8h, v19.8h, v18.8h, v17.8h, \dst, \src
load_add_store v22.8h, v26.8h, v24.8h, v20.8h, v19.8h, v18.8h, \dst, \src
load_add_store v23.8h, v27.8h, v25.8h, v21.8h, v20.8h, v19.8h, \dst, \src
load_add_store v24.8h, v28.8h, v26.8h, v22.8h, v21.8h, v20.8h, \dst, \src
load_add_store v25.8h, v29.8h, v27.8h, v23.8h, v22.8h, v21.8h, \dst, \src
load_add_store v26.8h, v30.8h, v28.8h, v24.8h, v23.8h, v22.8h, \dst, \src
load_add_store v27.8h, v31.8h, v29.8h, v25.8h, v24.8h, v23.8h, \dst, \src
load_add_store , , v30.8h, v26.8h, v25.8h, v24.8h, \dst, \src
load_add_store , , v31.8h, v27.8h, v26.8h, v25.8h, \dst, \src
load_add_store , , , , v27.8h, v26.8h, \dst, \src
load_add_store , , , , , v27.8h, \dst, \src
.endm
.macro load_add_store_8x8 dst, src, shiftbits=4
mov \src, \dst
movi v6.8h, #0
mvni v7.8h, #0xfc, lsl #8 // 0x3ff
load_add_store v2.8h, v16.8h, , , , , , \dst, \src, \shiftbits
load_add_store v3.8h, v17.8h, , , , , , \dst, \src, \shiftbits
load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src, \shiftbits
load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src, \shiftbits
load_add_store v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src, \shiftbits
load_add_store v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits
load_add_store v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
load_add_store v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src, \shiftbits
load_add_store , , v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src, \shiftbits
load_add_store , , v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src, \shiftbits
load_add_store , , , , v23.8h, v22.8h, v21.8h, \dst, \src, \shiftbits
load_add_store , , , , , v23.8h, v22.8h, \dst, \src, \shiftbits
load_add_store , , , , , , v23.8h, \dst, \src, \shiftbits
load_add_store v2.8h, v16.8h, , , , , \dst, \src, \shiftbits
load_add_store v3.8h, v17.8h, , , , , \dst, \src, \shiftbits
load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src, \shiftbits
load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src, \shiftbits
load_add_store v16.8h, v20.8h, v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src, \shiftbits
load_add_store v17.8h, v21.8h, v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src, \shiftbits
load_add_store v18.8h, v22.8h, v20.8h, v16.8h, v5.8h, v4.8h, \dst, \src, \shiftbits
load_add_store v19.8h, v23.8h, v21.8h, v17.8h, v16.8h, v5.8h, \dst, \src, \shiftbits
load_add_store , , v22.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits
load_add_store , , v23.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
load_add_store , , , , v19.8h, v18.8h, \dst, \src, \shiftbits
load_add_store , , , , , v19.8h, \dst, \src, \shiftbits
.endm
.macro load_add_store_8x4 dst, src, shiftbits=4
mov \src, \dst
movi v6.8h, #0
mvni v7.8h, #0xfc, lsl #8 // 0x3ff
load_add_store v2.8h, v16.8h, , , , , , \dst, \src, \shiftbits
load_add_store v3.8h, v17.8h, , , , , , \dst, \src, \shiftbits
load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src, \shiftbits
load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src, \shiftbits
load_add_store , , v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src, \shiftbits
load_add_store , , v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits
load_add_store , , , , v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
load_add_store , , , , , v19.8h, v18.8h, \dst, \src, \shiftbits
load_add_store , , , , , , v19.8h, \dst, \src, \shiftbits
load_add_store v2.8h, v16.8h, , , , , \dst, \src, \shiftbits
load_add_store v3.8h, v17.8h, , , , , \dst, \src, \shiftbits
load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src, \shiftbits
load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src, \shiftbits
load_add_store , , v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src, \shiftbits
load_add_store , , v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src, \shiftbits
load_add_store , , , , v5.8h, v4.8h, \dst, \src, \shiftbits
load_add_store , , , , , v5.8h, \dst, \src, \shiftbits
.endm
.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, max, min, store, dst, src
.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, min, store, dst, src
.ifnb \load
ld1 {\load}[0], [\src], x1
.endif
@ -216,14 +207,11 @@ endconst
ld1 {\load}[1], [\src], x1
.endif
.ifnb \addsrc
sqadd \adddst, \adddst, \addsrc
usqadd \adddst, \addsrc
.endif
.ifnb \store
st1 {\store}[0], [\dst], x1
.endif
.ifnb \max
smax \max, \max, v6.8h
.endif
.ifnb \min
smin \min, \min, v7.8h
.endif
@ -233,37 +221,33 @@ endconst
.endm
.macro load_add_store_4x16 dst, src
mov \src, \dst
movi v6.8h, #0
mvni v7.8h, #0xfc, lsl #8 // 0x3ff
load_add_store4 v0.d, v17, v16, , , , , , , \dst, \src
load_add_store4 v1.d, v19, v18, , , , , , , \dst, \src
load_add_store4 v2.d, v21, v20, v16.8h, , , , , , \dst, \src
load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h, , , , \dst, \src
load_add_store4 v0.d, v25, v24, v20.8h, v1.8h, v18.8h, v16.8h, , , \dst, \src
load_add_store4 v1.d, v27, v26, v22.8h, v2.8h, v20.8h, v18.8h, v16.8h, , \dst, \src
load_add_store4 v2.d, v29, v28, v24.8h, v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src
load_add_store4 v3.d, v31, v30, v26.8h, v0.8h, v24.8h, v22.8h, v20.8h, v18.d, \dst, \src
load_add_store4 , , , v28.8h, v1.8h, v26.8h, v24.8h, v22.8h, v20.d, \dst, \src
load_add_store4 , , , v30.8h, v2.8h, v28.8h, v26.8h, v24.8h, v22.d, \dst, \src
load_add_store4 , , , , v3.8h, v30.8h, v28.8h, v26.8h, v24.d, \dst, \src
load_add_store4 , , , , , , v30.8h, v28.8h, v26.d, \dst, \src
load_add_store4 , , , , , , , v30.8h, v28.d, \dst, \src
load_add_store4 , , , , , , , , v30.d, \dst, \src
load_add_store4 v0.d, v17, v16, , , , , , \dst, \src
load_add_store4 v1.d, v19, v18, , , , , , \dst, \src
load_add_store4 v2.d, v21, v20, v16.8h, , , , , \dst, \src
load_add_store4 v3.d, v23, v22, v18.8h, v16.8h, v0.8h, , , \dst, \src
load_add_store4 v17.d, v25, v24, v20.8h, v18.8h, v1.8h, v0.8h, , \dst, \src
load_add_store4 v19.d, v27, v26, v22.8h, v20.8h, v2.8h, v1.8h, v0.d, \dst, \src
load_add_store4 v21.d, v29, v28, v24.8h, v22.8h, v3.8h, v2.8h, v1.d, \dst, \src
load_add_store4 v23.d, v31, v30, v26.8h, v24.8h, v17.8h, v3.8h, v2.d, \dst, \src
load_add_store4 , , , v28.8h, v26.8h, v19.8h, v17.8h, v3.d, \dst, \src
load_add_store4 , , , v30.8h, v28.8h, v21.8h, v19.8h, v17.d, \dst, \src
load_add_store4 , , , , v30.8h, v23.8h, v21.8h, v19.d, \dst, \src
load_add_store4 , , , , , , v23.8h, v21.d, \dst, \src
load_add_store4 , , , , , , , v23.d, \dst, \src
.endm
.macro load_add_store_4x8 dst, src
mov \src, \dst
movi v6.8h, #0
mvni v7.8h, #0xfc, lsl #8 // 0x3ff
load_add_store4 v0.d, v17, v16, , , , , , , \dst, \src
load_add_store4 v1.d, v19, v18, , , , , , , \dst, \src
load_add_store4 v2.d, v21, v20, v16.8h, , , , , , \dst, \src
load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h, , , , \dst, \src
load_add_store4 , , , v20.8h, v1.8h, v18.8h, v16.8h, , , \dst, \src
load_add_store4 , , , v22.8h, v2.8h, v20.8h, v18.8h, v16.8h, , \dst, \src
load_add_store4 , , , , v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src
load_add_store4 , , , , , , v22.8h, v20.8h, v18.d, \dst, \src
load_add_store4 , , , , , , , v22.8h, v20.d, \dst, \src
load_add_store4 , , , , , , , , v22.d, \dst, \src
load_add_store4 v0.d, v17, v16, , , , , , \dst, \src
load_add_store4 v1.d, v19, v18, , , , , , \dst, \src
load_add_store4 v2.d, v21, v20, v16.8h, , , , , \dst, \src
load_add_store4 v3.d, v23, v22, v18.8h, v16.8h, v0.8h, , , \dst, \src
load_add_store4 , , , v20.8h, v18.8h, v1.8h, v0.8h, , \dst, \src
load_add_store4 , , , v22.8h, v20.8h, v2.8h, v1.8h, v0.d, \dst, \src
load_add_store4 , , , , v22.8h, v3.8h, v2.8h, v1.d, \dst, \src
load_add_store4 , , , , , , v3.8h, v2.d, \dst, \src
load_add_store4 , , , , , , , v3.d, \dst, \src
.endm
.macro idct_dc w, h, shift
@ -291,7 +275,6 @@ endconst
.endm
function idct_dc_w4_neon
movi v30.8h, #0
mvni v31.8h, #0xfc, lsl #8 // 0x3ff
1:
ld1 {v0.d}[0], [x0], x1
@ -299,11 +282,9 @@ function idct_dc_w4_neon
ld1 {v1.d}[0], [x0], x1
subs w4, w4, #4
ld1 {v1.d}[1], [x0], x1
sqadd v0.8h, v0.8h, v16.8h
usqadd v0.8h, v16.8h
sub x0, x0, x1, lsl #2
sqadd v1.8h, v1.8h, v16.8h
smax v0.8h, v0.8h, v30.8h
smax v1.8h, v1.8h, v30.8h
usqadd v1.8h, v16.8h
smin v0.8h, v0.8h, v31.8h
st1 {v0.d}[0], [x0], x1
smin v1.8h, v1.8h, v31.8h
@ -315,23 +296,18 @@ function idct_dc_w4_neon
endfunc
function idct_dc_w8_neon
movi v30.8h, #0
mvni v31.8h, #0xfc, lsl #8 // 0x3ff
1:
ld1 {v0.8h}, [x0], x1
subs w4, w4, #4
ld1 {v1.8h}, [x0], x1
sqadd v0.8h, v0.8h, v16.8h
usqadd v0.8h, v16.8h
ld1 {v2.8h}, [x0], x1
sqadd v1.8h, v1.8h, v16.8h
usqadd v1.8h, v16.8h
ld1 {v3.8h}, [x0], x1
sqadd v2.8h, v2.8h, v16.8h
sqadd v3.8h, v3.8h, v16.8h
usqadd v2.8h, v16.8h
usqadd v3.8h, v16.8h
sub x0, x0, x1, lsl #2
smax v0.8h, v0.8h, v30.8h
smax v1.8h, v1.8h, v30.8h
smax v2.8h, v2.8h, v30.8h
smax v3.8h, v3.8h, v30.8h
smin v0.8h, v0.8h, v31.8h
smin v1.8h, v1.8h, v31.8h
st1 {v0.8h}, [x0], x1
@ -345,21 +321,16 @@ function idct_dc_w8_neon
endfunc
function idct_dc_w16_neon
movi v30.8h, #0
mvni v31.8h, #0xfc, lsl #8 // 0x3ff
1:
ld1 {v0.8h, v1.8h}, [x0], x1
subs w4, w4, #2
ld1 {v2.8h, v3.8h}, [x0], x1
sqadd v0.8h, v0.8h, v16.8h
sqadd v1.8h, v1.8h, v16.8h
usqadd v0.8h, v16.8h
usqadd v1.8h, v16.8h
sub x0, x0, x1, lsl #1
sqadd v2.8h, v2.8h, v16.8h
sqadd v3.8h, v3.8h, v16.8h
smax v0.8h, v0.8h, v30.8h
smax v1.8h, v1.8h, v30.8h
smax v2.8h, v2.8h, v30.8h
smax v3.8h, v3.8h, v30.8h
usqadd v2.8h, v16.8h
usqadd v3.8h, v16.8h
smin v0.8h, v0.8h, v31.8h
smin v1.8h, v1.8h, v31.8h
smin v2.8h, v2.8h, v31.8h
@ -371,19 +342,14 @@ function idct_dc_w16_neon
endfunc
function idct_dc_w32_neon
movi v30.8h, #0
mvni v31.8h, #0xfc, lsl #8 // 0x3ff
1:
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
subs w4, w4, #1
sqadd v0.8h, v0.8h, v16.8h
sqadd v1.8h, v1.8h, v16.8h
sqadd v2.8h, v2.8h, v16.8h
sqadd v3.8h, v3.8h, v16.8h
smax v0.8h, v0.8h, v30.8h
smax v1.8h, v1.8h, v30.8h
smax v2.8h, v2.8h, v30.8h
smax v3.8h, v3.8h, v30.8h
usqadd v0.8h, v16.8h
usqadd v1.8h, v16.8h
usqadd v2.8h, v16.8h
usqadd v3.8h, v16.8h
smin v0.8h, v0.8h, v31.8h
smin v1.8h, v1.8h, v31.8h
smin v2.8h, v2.8h, v31.8h
@ -394,30 +360,21 @@ function idct_dc_w32_neon
endfunc
function idct_dc_w64_neon
movi v30.8h, #0
mvni v31.8h, #0xfc, lsl #8 // 0x3ff
sub x1, x1, #64
1:
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
subs w4, w4, #1
sqadd v0.8h, v0.8h, v16.8h
usqadd v0.8h, v16.8h
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0]
sqadd v1.8h, v1.8h, v16.8h
usqadd v1.8h, v16.8h
sub x0, x0, #64
sqadd v2.8h, v2.8h, v16.8h
sqadd v3.8h, v3.8h, v16.8h
sqadd v4.8h, v4.8h, v16.8h
sqadd v5.8h, v5.8h, v16.8h
sqadd v6.8h, v6.8h, v16.8h
sqadd v7.8h, v7.8h, v16.8h
smax v0.8h, v0.8h, v30.8h
smax v1.8h, v1.8h, v30.8h
smax v2.8h, v2.8h, v30.8h
smax v3.8h, v3.8h, v30.8h
smax v4.8h, v4.8h, v30.8h
smax v5.8h, v5.8h, v30.8h
smax v6.8h, v6.8h, v30.8h
smax v7.8h, v7.8h, v30.8h
usqadd v2.8h, v16.8h
usqadd v3.8h, v16.8h
usqadd v4.8h, v16.8h
usqadd v5.8h, v16.8h
usqadd v6.8h, v16.8h
usqadd v7.8h, v16.8h
smin v0.8h, v0.8h, v31.8h
smin v1.8h, v1.8h, v31.8h
smin v2.8h, v2.8h, v31.8h
@ -445,12 +402,12 @@ endfunc
.macro idct_4 r0, r1, r2, r3
mul_mla v6, \r1, \r3, v0.s[3], v0.s[2]
mul_mls v4, \r1, \r3, v0.s[2], v0.s[3]
mul_mla v2, \r0, \r2, v0.s[0], v0.s[0]
mul_mls v4, \r1, \r3, v0.s[2], v0.s[3]
mul_mls v3, \r0, \r2, v0.s[0], v0.s[0]
srshr v6.4s, v6.4s, #12
srshr v7.4s, v4.4s, #12
srshr v2.4s, v2.4s, #12
srshr v7.4s, v4.4s, #12
srshr v3.4s, v3.4s, #12
sqadd \r0\().4s, v2.4s, v6.4s
sqsub \r3\().4s, v2.4s, v6.4s
@ -575,16 +532,14 @@ function inv_txfm_add_4x4_neon
L(itx_4x4_end):
mvni v31.8h, #0xfc, lsl #8 // 0x3ff
sub x0, x0, x1, lsl #2
sqadd v16.8h, v16.8h, v0.8h
sqadd v18.8h, v18.8h, v1.8h
smax v16.8h, v16.8h, v30.8h
smax v18.8h, v18.8h, v30.8h
smin v16.8h, v16.8h, v31.8h
st1 {v16.d}[0], [x0], x1
smin v18.8h, v18.8h, v31.8h
st1 {v16.d}[1], [x0], x1
st1 {v18.d}[0], [x0], x1
st1 {v18.d}[1], [x0], x1
usqadd v0.8h, v16.8h
usqadd v1.8h, v18.8h
smin v0.8h, v0.8h, v31.8h
st1 {v0.d}[0], [x0], x1
smin v1.8h, v1.8h, v31.8h
st1 {v0.d}[1], [x0], x1
st1 {v1.d}[0], [x0], x1
st1 {v1.d}[1], [x0], x1
br x15
endfunc
@ -647,7 +602,7 @@ def_fn_4x4 identity, flipadst
srshr \r1\().4s, v2.4s, #12 // t4a
srshr \r7\().4s, v4.4s, #12 // t7a
srshr \r3\().4s, v6.4s, #12 // t5a
srshr \r5\().4s, v7.4s, #12 // taa
srshr \r5\().4s, v7.4s, #12 // t6a
sqadd v2.4s, \r1\().4s, \r3\().4s // t4
sqsub \r1\().4s, \r1\().4s, \r3\().4s // t5a
@ -1052,7 +1007,7 @@ function inv_dct_4s_x16_neon
srshr v4.4s, v4.4s, #12 // t11
srshr v5.4s, v6.4s, #12 // t12
mul_mla v6, v25, v21, v0.s[0], v0.s[0] // -> t10a
mul_mla v6, v25, v21, v0.s[0], v0.s[0] // -> t13a
srshr v2.4s, v2.4s, #12 // t10a
srshr v3.4s, v6.4s, #12 // t13a
@ -1488,10 +1443,10 @@ function inv_txfm_add_4x16_neon
st1 {v2.4s}, [x6], x11
.endr
blr x4
rshrn v28.4h, v16.4s, #1
rshrn v29.4h, v17.4s, #1
rshrn v30.4h, v18.4s, #1
rshrn v31.4h, v19.4s, #1
sqrshrn v28.4h, v16.4s, #1
sqrshrn v29.4h, v17.4s, #1
sqrshrn v30.4h, v18.4s, #1
sqrshrn v31.4h, v19.4s, #1
transpose_4x4h v28, v29, v30, v31, v4, v5, v6, v7
b 2f
@ -1511,10 +1466,10 @@ function inv_txfm_add_4x16_neon
st1 {v2.4s}, [x6], x11
.endr
blr x4
rshrn v24.4h, v16.4s, #1
rshrn v25.4h, v17.4s, #1
rshrn v26.4h, v18.4s, #1
rshrn v27.4h, v19.4s, #1
sqrshrn v24.4h, v16.4s, #1
sqrshrn v25.4h, v17.4s, #1
sqrshrn v26.4h, v18.4s, #1
sqrshrn v27.4h, v19.4s, #1
transpose_4x4h v24, v25, v26, v27, v4, v5, v6, v7
b 2f
@ -1533,10 +1488,10 @@ function inv_txfm_add_4x16_neon
st1 {v2.4s}, [x6], x11
.endr
blr x4
rshrn v20.4h, v16.4s, #1
rshrn v21.4h, v17.4s, #1
rshrn v22.4h, v18.4s, #1
rshrn v23.4h, v19.4s, #1
sqrshrn v20.4h, v16.4s, #1
sqrshrn v21.4h, v17.4s, #1
sqrshrn v22.4h, v18.4s, #1
sqrshrn v23.4h, v19.4s, #1
transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7
b 2f
@ -1552,10 +1507,10 @@ function inv_txfm_add_4x16_neon
st1 {v2.4s}, [x2], x11
.endr
blr x4
rshrn v16.4h, v16.4s, #1
rshrn v17.4h, v17.4s, #1
rshrn v18.4h, v18.4s, #1
rshrn v19.4h, v19.4s, #1
sqrshrn v16.4h, v16.4s, #1
sqrshrn v17.4h, v17.4s, #1
sqrshrn v18.4h, v18.4s, #1
sqrshrn v19.4h, v19.4s, #1
transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
blr x5
@ -2219,7 +2174,6 @@ function inv_txfm_add_vert_dct_8x32_neon
neg x9, x8
mov x10, x6
movi v0.8h, #0
mvni v1.8h, #0xfc, lsl #8 // 0x3ff
.macro combine r0, r1, r2, r3, op, stride
ld1 {v5.8h}, [x7], \stride
@ -2231,27 +2185,23 @@ function inv_txfm_add_vert_dct_8x32_neon
ld1 {v4.8h}, [x10], x1
srshr v5.8h, v5.8h, #4
\op v6.8h, v6.8h, \r1
sqadd v5.8h, v5.8h, v2.8h
usqadd v2.8h, v5.8h
srshr v6.8h, v6.8h, #4
\op v7.8h, v7.8h, \r2
smax v2.8h, v5.8h, v0.8h
ld1 {v5.8h}, [x7], \stride
sqadd v6.8h, v6.8h, v3.8h
usqadd v3.8h, v6.8h
smin v2.8h, v2.8h, v1.8h
srshr v7.8h, v7.8h, #4
\op v5.8h, v5.8h, \r3
st1 {v2.8h}, [x6], x1
ld1 {v2.8h}, [x10], x1
smax v3.8h, v6.8h, v0.8h
sqadd v7.8h, v7.8h, v4.8h
usqadd v4.8h, v7.8h
smin v3.8h, v3.8h, v1.8h
srshr v5.8h, v5.8h, #4
st1 {v3.8h}, [x6], x1
smax v4.8h, v7.8h, v0.8h
sqadd v5.8h, v5.8h, v2.8h
usqadd v2.8h, v5.8h
smin v4.8h, v4.8h, v1.8h
st1 {v4.8h}, [x6], x1
smax v2.8h, v5.8h, v0.8h
smin v2.8h, v2.8h, v1.8h
st1 {v2.8h}, [x6], x1
.endm
@ -2652,7 +2602,9 @@ function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1
mov w8, #(16 - \i)
cmp w3, w12
b.lt 1f
.if \i < 12
ldrh w12, [x13], #2
.endif
.endif
mov x8, #4*16
bl inv_txfm_horz_scale_dct_32x4_neon
@ -3195,7 +3147,6 @@ function inv_txfm_add_vert_dct_8x64_neon
ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11
movi v6.8h, #0
mvni v7.8h, #0xfc, lsl #8 // 0x3ff
.macro add_dest_addsub src0, src1, src2, src3
ld1 {v0.8h}, [x6], x1
@ -3211,18 +3162,14 @@ function inv_txfm_add_vert_dct_8x64_neon
srshr v4.8h, v4.8h, #4
srshr v5.8h, v5.8h, #4
srshr \src0, \src0, #4
sqadd v0.8h, v0.8h, v4.8h
usqadd v0.8h, v4.8h
srshr \src2, \src2, #4
sqadd v1.8h, v1.8h, \src0
sqadd v2.8h, v2.8h, v5.8h
smax v0.8h, v0.8h, v6.8h
sqadd v3.8h, v3.8h, \src2
smax v1.8h, v1.8h, v6.8h
usqadd v1.8h, \src0
usqadd v2.8h, v5.8h
smin v0.8h, v0.8h, v7.8h
smax v2.8h, v2.8h, v6.8h
usqadd v3.8h, \src2
smin v1.8h, v1.8h, v7.8h
st1 {v0.8h}, [x6], x1
smax v3.8h, v3.8h, v6.8h
smin v2.8h, v2.8h, v7.8h
st1 {v1.8h}, [x9], x10
smin v3.8h, v3.8h, v7.8h
@ -3240,29 +3187,6 @@ function inv_txfm_add_vert_dct_8x64_neon
br x14
endfunc
.macro sub_sp space
#ifdef _WIN32
.if \space > 8192
// Here, we'd need to touch two (or more) pages while decrementing
// the stack pointer.
.error "sub_sp_align doesn't support values over 8K at the moment"
.elseif \space > 4096
sub x16, sp, #4096
ldr xzr, [x16]
sub sp, x16, #(\space - 4096)
.else
sub sp, sp, #\space
.endif
#else
.if \space >= 4096
sub sp, sp, #(\space)/4096*4096
.endif
.if (\space % 4096) != 0
sub sp, sp, #(\space)%4096
.endif
#endif
.endm
function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1
idct_dc 64, 64, 2
@ -3492,7 +3416,9 @@ function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1
mov w8, #(32 - \i)
cmp w3, w12
b.lt 1f
.if \i < 28
ldrh w12, [x13], #2
.endif
.endif
add x7, x2, #(\i*4)
mov x8, #32*4

49
third_party/dav1d/src/arm/64/loopfilter.S (vendored)

@ -132,12 +132,11 @@ function lpf_16_wd\wd\()_neon
.endif
b.eq 1f // skip wd == 4 case
.endif
usubl v2.8h, v22.8b, v25.8b // p1 - q1
usubl2 v3.8h, v22.16b, v25.16b
movi v3.16b, #128
eor v2.16b, v22.16b, v3.16b // p1 - 128
eor v3.16b, v25.16b, v3.16b // q1 - 128
cmhi v0.16b, v0.16b, v12.16b // hev
sqxtn v2.8b, v2.8h // iclip_diff(p1 - q1)
sqxtn2 v2.16b, v3.8h
sqsub v2.16b, v2.16b, v3.16b // iclip_diff(p1 - q1)
and v4.16b, v2.16b, v0.16b // if (hev) iclip_diff(p1 - q1)
bic v0.16b, v1.16b, v0.16b // (fm && wd >= 4 && !hev)
usubl v2.8h, v24.8b, v23.8b
@ -155,35 +154,23 @@ function lpf_16_wd\wd\()_neon
sqadd v5.16b, v7.16b, v2.16b // imin(f + 3, 127)
sshr v4.16b, v4.16b, #3 // f1
sshr v5.16b, v5.16b, #3 // f2
uxtl v2.8h, v23.8b // p0
uxtl2 v3.8h, v23.16b
uxtl v6.8h, v24.8b // q0
uxtl2 v7.8h, v24.16b
saddw v2.8h, v2.8h, v5.8b
saddw2 v3.8h, v3.8h, v5.16b
ssubw v6.8h, v6.8h, v4.8b
ssubw2 v7.8h, v7.8h, v4.16b
mov v2.16b, v23.16b // p0
mov v3.16b, v24.16b // q0
neg v6.16b, v4.16b // -f1
srshr v4.16b, v4.16b, #1 // (f1 + 1) >> 1
sqxtun v2.8b, v2.8h // out p0
sqxtun2 v2.16b, v3.8h
sqxtun v6.8b, v6.8h // out q0
sqxtun2 v6.16b, v7.8h
// p0 + f2, q0 - f1
usqadd v2.16b, v5.16b // out p0
usqadd v3.16b, v6.16b // out q0
neg v6.16b, v4.16b // -((f1 + 1) >> 1)
bit v23.16b, v2.16b, v1.16b // if (fm && wd >= 4)
uxtl v2.8h, v22.8b // p1
uxtl2 v3.8h, v22.16b
bit v24.16b, v6.16b, v1.16b // if (fm && wd >= 4)
uxtl v6.8h, v25.8b // q1
uxtl2 v7.8h, v25.16b
saddw v2.8h, v2.8h, v4.8b
saddw2 v3.8h, v3.8h, v4.16b
ssubw v6.8h, v6.8h, v4.8b
ssubw2 v7.8h, v7.8h, v4.16b
sqxtun v2.8b, v2.8h // out p1
sqxtun2 v2.16b, v3.8h
sqxtun v6.8b, v6.8h // out q1
sqxtun2 v6.16b, v7.8h
bit v24.16b, v3.16b, v1.16b // if (fm && wd >= 4)
mov v2.16b, v22.16b // p1
mov v3.16b, v25.16b // q1
// p1 + ((f1 + 1) >> 1), q1 - ((f1 + 1) >> 1)
usqadd v2.16b, v4.16b // out p1
usqadd v3.16b, v6.16b // out q1
bit v22.16b, v2.16b, v0.16b // if (fm && wd >= 4 && !hev)
bit v25.16b, v6.16b, v0.16b // if (fm && wd >= 4 && !hev)
bit v25.16b, v3.16b, v0.16b // if (fm && wd >= 4 && !hev)
1:
.if \wd == 6

3
third_party/dav1d/src/arm/64/loopfilter16.S (vendored)

@ -150,10 +150,9 @@ function lpf_8_wd\wd\()_neon
movi v6.8h, #4
add v2.8h, v2.8h, v4.8h
smin v2.8h, v2.8h, v3.8h // f = iclip_diff()
movi v7.8h, #3
smax v2.8h, v2.8h, v9.8h // f = iclip_diff()
sqadd v4.8h, v6.8h, v2.8h // f + 4
sqadd v5.8h, v7.8h, v2.8h // f + 3
sqadd v5.8h, v5.8h, v2.8h // f + 3
smin v4.8h, v4.8h, v3.8h // imin(f + 4, 128 << bitdepth_min_8 - 1)
smin v5.8h, v5.8h, v3.8h // imin(f + 3, 128 << bitdepth_min_8 - 1)
sshr v4.8h, v4.8h, #3 // f1

1750
third_party/dav1d/src/arm/64/looprestoration.S (vendored)

Diff not shown because of its large size.

1870
third_party/dav1d/src/arm/64/looprestoration16.S (vendored)

Diff not shown because of its large size.

151
third_party/dav1d/src/arm/64/mc.S (vendored)

@ -2180,16 +2180,7 @@ L(\type\()_8tap_filter_4):
lsl \d_strd, \d_strd, #1
lsl \s_strd, \s_strd, #1
ld1 {v28.8b, v29.8b}, [\src], \s_strd
uxtl v28.8h, v28.8b
uxtl v29.8h, v29.8b
mul v24.8h, v28.8h, v0.h[0]
.irpc i, 1234567
ext v26.16b, v28.16b, v29.16b, #(2*\i)
mla v24.8h, v26.8h, v0.h[\i]
.endr
srshr v16.8h, v24.8h, #2
bl L(\type\()_8tap_filter_8_first)
bl L(\type\()_8tap_filter_8)
mov v17.16b, v24.16b
mov v18.16b, v25.16b
@ -2267,16 +2258,7 @@ L(\type\()_8tap_filter_4):
lsl \d_strd, \d_strd, #1
lsl \s_strd, \s_strd, #1
ld1 {v28.8b, v29.8b}, [\src], \s_strd
uxtl v28.8h, v28.8b
uxtl v29.8h, v29.8b
mul v24.8h, v28.8h, v0.h[0]
.irpc i, 1234567
ext v26.16b, v28.16b, v29.16b, #(2*\i)
mla v24.8h, v26.8h, v0.h[\i]
.endr
srshr v16.8h, v24.8h, #2
bl L(\type\()_8tap_filter_8_first)
bl L(\type\()_8tap_filter_8)
mov v17.16b, v24.16b
mov v18.16b, v25.16b
@ -2363,6 +2345,28 @@ L(\type\()_8tap_filter_4):
0:
br x15
L(\type\()_8tap_filter_8_first):
ld1 {v28.8b, v29.8b}, [\src], \s_strd
uxtl v28.8h, v28.8b
uxtl v29.8h, v29.8b
mul v16.8h, v28.8h, v0.h[0]
ext v24.16b, v28.16b, v29.16b, #(2*1)
ext v25.16b, v28.16b, v29.16b, #(2*2)
ext v26.16b, v28.16b, v29.16b, #(2*3)
ext v27.16b, v28.16b, v29.16b, #(2*4)
mla v16.8h, v24.8h, v0.h[1]
mla v16.8h, v25.8h, v0.h[2]
mla v16.8h, v26.8h, v0.h[3]
mla v16.8h, v27.8h, v0.h[4]
ext v24.16b, v28.16b, v29.16b, #(2*5)
ext v25.16b, v28.16b, v29.16b, #(2*6)
ext v26.16b, v28.16b, v29.16b, #(2*7)
mla v16.8h, v24.8h, v0.h[5]
mla v16.8h, v25.8h, v0.h[6]
mla v16.8h, v26.8h, v0.h[7]
srshr v16.8h, v16.8h, #2
ret
L(\type\()_8tap_filter_8):
ld1 {v28.8b, v29.8b}, [\sr2], \s_strd
ld1 {v30.8b, v31.8b}, [\src], \s_strd
@ -2916,8 +2920,8 @@ filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6
.macro load_filter_row dst, src, inc
asr w13, \src, #10
ldr \dst, [x11, w13, sxtw #3]
add \src, \src, \inc
ldr \dst, [x11, w13, sxtw #3]
.endm
function warp_filter_horz_neon
@ -2926,58 +2930,45 @@ function warp_filter_horz_neon
ld1 {v16.8b, v17.8b}, [x2], x3
load_filter_row d0, w12, w7
uxtl v16.8h, v16.8b
load_filter_row d1, w12, w7
uxtl v17.8h, v17.8b
load_filter_row d2, w12, w7
sxtl v0.8h, v0.8b
load_filter_row d3, w12, w7
sxtl v1.8h, v1.8b
load_filter_row d4, w12, w7
sxtl v2.8h, v2.8b
load_filter_row d5, w12, w7
sxtl v3.8h, v3.8b
load_filter_row d6, w12, w7
sxtl v4.8h, v4.8b
// subtract by 128 to allow using smull
eor v16.8b, v16.8b, v22.8b
eor v17.8b, v17.8b, v22.8b
load_filter_row d7, w12, w7
sxtl v5.8h, v5.8b
ext v18.16b, v16.16b, v17.16b, #2*1
mul v23.8h, v16.8h, v0.8h
sxtl v6.8h, v6.8b
ext v19.16b, v16.16b, v17.16b, #2*2
mul v18.8h, v18.8h, v1.8h
sxtl v7.8h, v7.8b
ext v20.16b, v16.16b, v17.16b, #2*3
mul v19.8h, v19.8h, v2.8h
ext v21.16b, v16.16b, v17.16b, #2*4
saddlp v23.4s, v23.8h
mul v20.8h, v20.8h, v3.8h
ext v22.16b, v16.16b, v17.16b, #2*5
saddlp v18.4s, v18.8h
mul v21.8h, v21.8h, v4.8h
saddlp v19.4s, v19.8h
mul v22.8h, v22.8h, v5.8h
saddlp v20.4s, v20.8h
saddlp v21.4s, v21.8h
saddlp v22.4s, v22.8h
addp v18.4s, v23.4s, v18.4s
ext v23.16b, v16.16b, v17.16b, #2*6
addp v19.4s, v19.4s, v20.4s
mul v23.8h, v23.8h, v6.8h
ext v20.16b, v16.16b, v17.16b, #2*7
mul v20.8h, v20.8h, v7.8h
saddlp v23.4s, v23.8h
addp v21.4s, v21.4s, v22.4s
saddlp v20.4s, v20.8h
addp v20.4s, v23.4s, v20.4s
addp v18.4s, v18.4s, v19.4s
addp v20.4s, v21.4s, v20.4s
ext v18.8b, v16.8b, v17.8b, #1
ext v19.8b, v16.8b, v17.8b, #2
smull v0.8h, v0.8b, v16.8b
smull v1.8h, v1.8b, v18.8b
ext v18.8b, v16.8b, v17.8b, #3
ext v20.8b, v16.8b, v17.8b, #4
smull v2.8h, v2.8b, v19.8b
smull v3.8h, v3.8b, v18.8b
ext v18.8b, v16.8b, v17.8b, #5
ext v19.8b, v16.8b, v17.8b, #6
smull v4.8h, v4.8b, v20.8b
smull v5.8h, v5.8b, v18.8b
ext v18.8b, v16.8b, v17.8b, #7
smull v6.8h, v6.8b, v19.8b
smull v7.8h, v7.8b, v18.8b
addp v0.8h, v0.8h, v1.8h
addp v2.8h, v2.8h, v3.8h
addp v4.8h, v4.8h, v5.8h
addp v6.8h, v6.8h, v7.8h
addp v0.8h, v0.8h, v2.8h
addp v4.8h, v4.8h, v6.8h
addp v0.8h, v0.8h, v4.8h
add w5, w5, w8
rshrn v16.4h, v18.4s, #3
rshrn2 v16.8h, v20.4s, #3
ret
endfunc
@ -3002,25 +2993,32 @@ function warp_affine_8x8\t\()_8bpc_neon, export=1
lsl x1, x1, #1
.endif
movi v22.8b, #128
.ifb \t
movi v23.8h, #128
.else
movi v23.8h, #8, lsl #8
.endif
bl warp_filter_horz_neon
mov v24.16b, v16.16b
srshr v24.8h, v0.8h, #3
bl warp_filter_horz_neon
mov v25.16b, v16.16b
srshr v25.8h, v0.8h, #3
bl warp_filter_horz_neon
mov v26.16b, v16.16b
srshr v26.8h, v0.8h, #3
bl warp_filter_horz_neon
mov v27.16b, v16.16b
srshr v27.8h, v0.8h, #3
bl warp_filter_horz_neon
mov v28.16b, v16.16b
srshr v28.8h, v0.8h, #3
bl warp_filter_horz_neon
mov v29.16b, v16.16b
srshr v29.8h, v0.8h, #3
bl warp_filter_horz_neon
mov v30.16b, v16.16b
srshr v30.8h, v0.8h, #3
1:
add w14, w6, #512
bl warp_filter_horz_neon
mov v31.16b, v16.16b
srshr v31.8h, v0.8h, #3
load_filter_row d0, w14, w9
load_filter_row d1, w14, w9
@ -3030,15 +3028,7 @@ function warp_affine_8x8\t\()_8bpc_neon, export=1
load_filter_row d5, w14, w9
load_filter_row d6, w14, w9
load_filter_row d7, w14, w9
transpose_8x8b v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
sxtl v0.8h, v0.8b
sxtl v1.8h, v1.8b
sxtl v2.8h, v2.8b
sxtl v3.8h, v3.8b
sxtl v4.8h, v4.8b
sxtl v5.8h, v5.8b
sxtl v6.8h, v6.8b
sxtl v7.8h, v7.8b
transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl
// This ordering of smull/smlal/smull2/smlal2 is highly
// beneficial for Cortex A53 here.
@ -3066,6 +3056,7 @@ function warp_affine_8x8\t\()_8bpc_neon, export=1
sqrshrn2 v16.8h, v17.4s, #\shift
mov v27.16b, v28.16b
mov v28.16b, v29.16b
add v16.8h, v16.8h, v23.8h
.ifb \t
sqxtun v16.8b, v16.8h
.endif

12
third_party/dav1d/src/arm/64/mc16.S (vendored)

@ -3188,8 +3188,8 @@ filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10
.macro load_filter_row dst, src, inc
asr w13, \src, #10
ldr \dst, [x11, w13, sxtw #3]
add \src, \src, \inc
ldr \dst, [x11, w13, sxtw #3]
.endm
function warp_filter_horz_neon
@ -3343,15 +3343,7 @@ function warp_affine_8x8\t\()_16bpc_neon, export=1
load_filter_row d5, w14, w9
load_filter_row d6, w14, w9
load_filter_row d7, w14, w9
transpose_8x8b v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
sxtl v0.8h, v0.8b
sxtl v1.8h, v1.8b
sxtl v2.8h, v2.8b
sxtl v3.8h, v3.8b
sxtl v4.8h, v4.8b
sxtl v5.8h, v5.8b
sxtl v6.8h, v6.8b
sxtl v7.8h, v7.8b
transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl
// This ordering of smull/smlal/smull2/smlal2 is highly
// beneficial for Cortex A53 here.

82
third_party/dav1d/src/arm/64/util.S (vendored)

@ -59,33 +59,65 @@
#endif
.endm
.macro transpose_8x8b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
trn1 \t8\().8b, \r0\().8b, \r1\().8b
trn2 \t9\().8b, \r0\().8b, \r1\().8b
trn1 \r1\().8b, \r2\().8b, \r3\().8b
trn2 \r3\().8b, \r2\().8b, \r3\().8b
trn1 \r0\().8b, \r4\().8b, \r5\().8b
trn2 \r5\().8b, \r4\().8b, \r5\().8b
trn1 \r2\().8b, \r6\().8b, \r7\().8b
trn2 \r7\().8b, \r6\().8b, \r7\().8b
.macro sub_sp space
#ifdef _WIN32
.if \space > 8192
// Here, we'd need to touch two (or more) pages while decrementing
// the stack pointer.
.error "sub_sp_align doesn't support values over 8K at the moment"
.elseif \space > 4096
sub x16, sp, #4096
ldr xzr, [x16]
sub sp, x16, #(\space - 4096)
.else
sub sp, sp, #\space
.endif
#else
.if \space >= 4096
sub sp, sp, #(\space)/4096*4096
.endif
.if (\space % 4096) != 0
sub sp, sp, #(\space)%4096
.endif
#endif
.endm
trn1 \r4\().4h, \r0\().4h, \r2\().4h
trn2 \r2\().4h, \r0\().4h, \r2\().4h
trn1 \r6\().4h, \r5\().4h, \r7\().4h
trn2 \r7\().4h, \r5\().4h, \r7\().4h
trn1 \r5\().4h, \t9\().4h, \r3\().4h
trn2 \t9\().4h, \t9\().4h, \r3\().4h
trn1 \r3\().4h, \t8\().4h, \r1\().4h
trn2 \t8\().4h, \t8\().4h, \r1\().4h
.macro transpose_8x8b_xtl r0, r1, r2, r3, r4, r5, r6, r7, xtl
// a0 b0 a1 b1 a2 b2 a3 b3 a4 b4 a5 b5 a6 b6 a7 b7
zip1 \r0\().16b, \r0\().16b, \r1\().16b
// c0 d0 c1 d1 c2 d2 d3 d3 c4 d4 c5 d5 c6 d6 d7 d7
zip1 \r2\().16b, \r2\().16b, \r3\().16b
// e0 f0 e1 f1 e2 f2 e3 f3 e4 f4 e5 f5 e6 f6 e7 f7
zip1 \r4\().16b, \r4\().16b, \r5\().16b
// g0 h0 g1 h1 g2 h2 h3 h3 g4 h4 g5 h5 g6 h6 h7 h7
zip1 \r6\().16b, \r6\().16b, \r7\().16b
trn1 \r0\().2s, \r3\().2s, \r4\().2s
trn2 \r4\().2s, \r3\().2s, \r4\().2s
trn1 \r1\().2s, \r5\().2s, \r6\().2s
trn2 \r5\().2s, \r5\().2s, \r6\().2s
trn2 \r6\().2s, \t8\().2s, \r2\().2s
trn1 \r2\().2s, \t8\().2s, \r2\().2s
trn1 \r3\().2s, \t9\().2s, \r7\().2s
trn2 \r7\().2s, \t9\().2s, \r7\().2s
// a0 b0 c0 d0 a2 b2 c2 d2 a4 b4 c4 d4 a6 b6 c6 d6
trn1 \r1\().8h, \r0\().8h, \r2\().8h
// a1 b1 c1 d1 a3 b3 c3 d3 a5 b5 c5 d5 a7 b7 c7 d7
trn2 \r3\().8h, \r0\().8h, \r2\().8h
// e0 f0 g0 h0 e2 f2 g2 h2 e4 f4 g4 h4 e6 f6 g6 h6
trn1 \r5\().8h, \r4\().8h, \r6\().8h
// e1 f1 g1 h1 e3 f3 g3 h3 e5 f5 g5 h5 e7 f7 g7 h7
trn2 \r7\().8h, \r4\().8h, \r6\().8h
// a0 b0 c0 d0 e0 f0 g0 h0 a4 b4 c4 d4 e4 f4 g4 h4
trn1 \r0\().4s, \r1\().4s, \r5\().4s
// a2 b2 c2 d2 e2 f2 g2 h2 a6 b6 c6 d6 e6 f6 g6 h6
trn2 \r2\().4s, \r1\().4s, \r5\().4s
// a1 b1 c1 d1 e1 f1 g1 h1 a5 b5 c5 d5 e5 f5 g5 h5
trn1 \r1\().4s, \r3\().4s, \r7\().4s
// a3 b3 c3 d3 e3 f3 g3 h3 a7 b7 c7 d7 e7 f7 g7 h7
trn2 \r3\().4s, \r3\().4s, \r7\().4s
\xtl\()2 \r4\().8h, \r0\().16b
\xtl \r0\().8h, \r0\().8b
\xtl\()2 \r6\().8h, \r2\().16b
\xtl \r2\().8h, \r2\().8b
\xtl\()2 \r5\().8h, \r1\().16b
\xtl \r1\().8h, \r1\().8b
\xtl\()2 \r7\().8h, \r3\().16b
\xtl \r3\().8h, \r3\().8b
.endm
.macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, t8, t9

2
third_party/dav1d/src/arm/ipred_init_tmpl.c (vendored)

@ -55,7 +55,6 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_arm)(Dav1dIntraPredDSPContext *const c
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
#if BITDEPTH == 8 || ARCH_AARCH64
c->intra_pred[DC_PRED] = BF(dav1d_ipred_dc, neon);
c->intra_pred[DC_128_PRED] = BF(dav1d_ipred_dc_128, neon);
c->intra_pred[TOP_DC_PRED] = BF(dav1d_ipred_dc_top, neon);
@ -78,5 +77,4 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_arm)(Dav1dIntraPredDSPContext *const c
c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_ipred_cfl_ac_444, neon);
c->pal_pred = BF(dav1d_pal_pred, neon);
#endif
}

2
third_party/dav1d/src/arm/itx_init_tmpl.c (vendored)

@ -119,7 +119,6 @@ COLD void bitfn(dav1d_itx_dsp_init_arm)(Dav1dInvTxfmDSPContext *const c, int bpc
if (bpc > 10) return;
#if ARCH_AARCH64 || BITDEPTH == 8
assign_itx17_fn( , 4, 4, neon);
assign_itx16_fn(R, 4, 8, neon);
assign_itx16_fn(R, 4, 16, neon);
@ -139,5 +138,4 @@ COLD void bitfn(dav1d_itx_dsp_init_arm)(Dav1dInvTxfmDSPContext *const c, int bpc
assign_itx1_fn (R, 64, 16, neon);
assign_itx1_fn (R, 64, 32, neon);
assign_itx1_fn ( , 64, 64, neon);
#endif
}

third_party/dav1d/src/arm/looprestoration_init_tmpl.c (vendored)

@ -27,7 +27,23 @@
#include "src/cpu.h"
#include "src/looprestoration.h"
#include "src/tables.h"
#if ARCH_AARCH64
void BF(dav1d_wiener_filter7, neon)(pixel *p, const ptrdiff_t p_stride,
const pixel (*left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges
HIGHBD_DECL_SUFFIX);
void BF(dav1d_wiener_filter5, neon)(pixel *p, const ptrdiff_t p_stride,
const pixel (*left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges
HIGHBD_DECL_SUFFIX);
#else
// The 8bpc version calculates things slightly differently than the reference
// C version. That version calculates roughly this:
@ -59,16 +75,15 @@ void BF(dav1d_wiener_filter_v, neon)(pixel *dst, ptrdiff_t stride,
const int16_t *mid, int w, int h,
const int16_t fv[8], enum LrEdgeFlags edges,
ptrdiff_t mid_stride HIGHBD_DECL_SUFFIX);
void BF(dav1d_copy_narrow, neon)(pixel *dst, ptrdiff_t stride,
const pixel *src, int w, int h);
static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
const pixel (*const left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h,
const int16_t filter[2][8],
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
const int16_t (*const filter)[8] = params->filter;
ALIGN_STK_16(int16_t, mid, 68 * 384,);
int mid_stride = (w + 7) & ~7;
@ -86,23 +101,12 @@ static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
HIGHBD_TAIL_SUFFIX);
// Vertical filter
if (w >= 8)
BF(dav1d_wiener_filter_v, neon)(dst, dst_stride, &mid[2*mid_stride],
w & ~7, h, filter[1], edges,
mid_stride * sizeof(*mid)
HIGHBD_TAIL_SUFFIX);
if (w & 7) {
// For uneven widths, do a full 8 pixel wide filtering into a temp
// buffer and copy out the narrow slice of pixels separately into dest.
ALIGN_STK_16(pixel, tmp, 64 * 8,);
BF(dav1d_wiener_filter_v, neon)(tmp, (w & 7) * sizeof(pixel),
&mid[2*mid_stride + (w & ~7)],
w & 7, h, filter[1], edges,
mid_stride * sizeof(*mid)
HIGHBD_TAIL_SUFFIX);
BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, tmp, w & 7, h);
}
BF(dav1d_wiener_filter_v, neon)(dst, dst_stride, &mid[2*mid_stride],
w, h, filter[1], edges,
mid_stride * sizeof(*mid)
HIGHBD_TAIL_SUFFIX);
}
#endif
void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum,
const pixel (*left)[4],
@ -204,83 +208,50 @@ void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride,
const int w, const int h,
const int16_t wt[2] HIGHBD_DECL_SUFFIX);
static void sgr_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
const pixel (*const left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h, const int sgr_idx,
const int16_t sgr_wt[7], const enum LrEdgeFlags edges
HIGHBD_DECL_SUFFIX)
static void sgr_filter_5x5_neon(pixel *const dst, const ptrdiff_t dst_stride,
const pixel (*const left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
if (!dav1d_sgr_params[sgr_idx][0]) {
ALIGN_STK_16(int16_t, tmp, 64 * 384,);
dav1d_sgr_filter1_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
w, h, dav1d_sgr_params[sgr_idx][3], edges
HIGHBD_TAIL_SUFFIX);
if (w >= 8)
BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride,
tmp, w & ~7, h, (1 << 7) - sgr_wt[1]
HIGHBD_TAIL_SUFFIX);
if (w & 7) {
// For uneven widths, do a full 8 pixel wide filtering into a temp
// buffer and copy out the narrow slice of pixels separately into
// dest.
ALIGN_STK_16(pixel, stripe, 64 * 8,);
BF(dav1d_sgr_weighted1, neon)(stripe, (w & 7) * sizeof(pixel),
dst + (w & ~7), dst_stride,
tmp + (w & ~7), w & 7, h,
(1 << 7) - sgr_wt[1]
HIGHBD_TAIL_SUFFIX);
BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, stripe,
w & 7, h);
}
} else if (!dav1d_sgr_params[sgr_idx][1]) {
ALIGN_STK_16(int16_t, tmp, 64 * 384,);
dav1d_sgr_filter2_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
w, h, dav1d_sgr_params[sgr_idx][2], edges
HIGHBD_TAIL_SUFFIX);
if (w >= 8)
BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride,
tmp, w & ~7, h, sgr_wt[0]
HIGHBD_TAIL_SUFFIX);
if (w & 7) {
// For uneven widths, do a full 8 pixel wide filtering into a temp
// buffer and copy out the narrow slice of pixels separately into
// dest.
ALIGN_STK_16(pixel, stripe, 64 * 8,);
BF(dav1d_sgr_weighted1, neon)(stripe, (w & 7) * sizeof(pixel),
dst + (w & ~7), dst_stride,
tmp + (w & ~7), w & 7, h, sgr_wt[0]
HIGHBD_TAIL_SUFFIX);
BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, stripe,
w & 7, h);
}
} else {
ALIGN_STK_16(int16_t, tmp1, 64 * 384,);
ALIGN_STK_16(int16_t, tmp2, 64 * 384,);
dav1d_sgr_filter2_neon(tmp1, dst, dst_stride, left, lpf, lpf_stride,
w, h, dav1d_sgr_params[sgr_idx][2], edges
HIGHBD_TAIL_SUFFIX);
dav1d_sgr_filter1_neon(tmp2, dst, dst_stride, left, lpf, lpf_stride,
w, h, dav1d_sgr_params[sgr_idx][3], edges
HIGHBD_TAIL_SUFFIX);
const int16_t wt[2] = { sgr_wt[0], 128 - sgr_wt[0] - sgr_wt[1] };
if (w >= 8)
BF(dav1d_sgr_weighted2, neon)(dst, dst_stride, dst, dst_stride,
tmp1, tmp2, w & ~7, h, wt
HIGHBD_TAIL_SUFFIX);
if (w & 7) {
// For uneven widths, do a full 8 pixel wide filtering into a temp
// buffer and copy out the narrow slice of pixels separately into
// dest.
ALIGN_STK_16(pixel, stripe, 64 * 8,);
BF(dav1d_sgr_weighted2, neon)(stripe, (w & 7) * sizeof(pixel),
dst + (w & ~7), dst_stride,
tmp1 + (w & ~7), tmp2 + (w & ~7),
w & 7, h, wt HIGHBD_TAIL_SUFFIX);
BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, stripe,
w & 7, h);
}
}
ALIGN_STK_16(int16_t, tmp, 64 * 384,);
dav1d_sgr_filter2_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX);
BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride,
tmp, w, h, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
}
static void sgr_filter_3x3_neon(pixel *const dst, const ptrdiff_t dst_stride,
const pixel (*const left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
ALIGN_STK_16(int16_t, tmp, 64 * 384,);
dav1d_sgr_filter1_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX);
BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride,
tmp, w, h, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
}
static void sgr_filter_mix_neon(pixel *const dst, const ptrdiff_t dst_stride,
const pixel (*const left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
ALIGN_STK_16(int16_t, tmp1, 64 * 384,);
ALIGN_STK_16(int16_t, tmp2, 64 * 384,);
dav1d_sgr_filter2_neon(tmp1, dst, dst_stride, left, lpf, lpf_stride,
w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX);
dav1d_sgr_filter1_neon(tmp2, dst, dst_stride, left, lpf, lpf_stride,
w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX);
const int16_t wt[2] = { params->sgr.w0, params->sgr.w1 };
BF(dav1d_sgr_weighted2, neon)(dst, dst_stride, dst, dst_stride,
tmp1, tmp2, w, h, wt HIGHBD_TAIL_SUFFIX);
}
COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c, int bpc) {
@ -288,7 +259,15 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPCont
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
#if ARCH_AARCH64
c->wiener[0] = BF(dav1d_wiener_filter7, neon);
c->wiener[1] = BF(dav1d_wiener_filter5, neon);
#else
c->wiener[0] = c->wiener[1] = wiener_filter_neon;
if (bpc <= 10)
c->selfguided = sgr_filter_neon;
#endif
if (bpc <= 10) {
c->sgr[0] = sgr_filter_5x5_neon;
c->sgr[1] = sgr_filter_3x3_neon;
c->sgr[2] = sgr_filter_mix_neon;
}
}

14
third_party/dav1d/src/cdef_apply_tmpl.c (vendored)

@ -117,7 +117,7 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
for (int bit = 0, by = by_start; by < by_end; by += 2, edges |= CDEF_HAVE_TOP) {
const int tf = f->lf.top_pre_cdef_toggle;
const int by_idx = by & 30;
const int by_idx = (by & 30) >> 1;
if (by + 2 >= f->bh) edges &= ~CDEF_HAVE_BOTTOM;
if (edges & CDEF_HAVE_BOTTOM) // backup pre-filter data for next iteration
@ -140,6 +140,11 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
goto next_sb;
}
// Create a complete 32-bit mask for the sb row ahead of time.
const uint16_t (*noskip_row)[2] = &lflvl[sb128x].noskip_mask[by_idx];
const unsigned noskip_mask = (unsigned) noskip_row[0][1] << 16 |
noskip_row[0][0];
const int y_lvl = f->frame_hdr->cdef.y_strength[cdef_idx];
const int uv_lvl = f->frame_hdr->cdef.uv_strength[cdef_idx];
const enum Backup2x8Flags flag = !!y_lvl + (!!uv_lvl << 1);
@ -162,11 +167,8 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
// check if this 8x8 block had any coded coefficients; if not,
// go to the next block
const unsigned bx_mask = 3U << (bx & 14);
const int bx_idx = (bx & 16) >> 4;
if (!((lflvl[sb128x].noskip_mask[by_idx + 0][bx_idx] |
lflvl[sb128x].noskip_mask[by_idx + 1][bx_idx]) & bx_mask))
{
const uint32_t bx_mask = 3U << (bx & 30);
if (!(noskip_mask & bx_mask)) {
last_skip = 1;
goto next_b;
}

6
third_party/dav1d/src/cdf.c (vendored)

@ -1,5 +1,5 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018-2021, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
@ -29,6 +29,8 @@
#include <string.h>
#include "common/frame.h"
#include "src/internal.h"
#include "src/tables.h"
@ -4012,7 +4014,7 @@ void dav1d_cdf_thread_update(const Dav1dFrameHeader *const hdr,
update_cdf_1d(11, m.txtp_inter2);
update_bit_1d(4, m.txtp_inter3);
if (!(hdr->frame_type & 1)) {
if (IS_KEY_OR_INTRA(hdr)) {
update_bit_0d(m.intrabc);
update_cdf_1d(N_MV_JOINTS - 1, dmv.joint);

12
third_party/dav1d/src/data.c (vendored)

@ -102,18 +102,6 @@ void dav1d_data_ref(Dav1dData *const dst, const Dav1dData *const src) {
*dst = *src;
}
void dav1d_data_move_ref(Dav1dData *const dst, Dav1dData *const src) {
validate_input(dst != NULL);
validate_input(dst->data == NULL);
validate_input(src != NULL);
if (src->ref)
validate_input(src->data != NULL);
*dst = *src;
memset(src, 0, sizeof(*src));
}
void dav1d_data_props_copy(Dav1dDataProps *const dst,
const Dav1dDataProps *const src)
{

5
third_party/dav1d/src/data.h поставляемый
Просмотреть файл

@ -32,11 +32,6 @@
void dav1d_data_ref(Dav1dData *dst, const Dav1dData *src);
/**
* Move a data reference.
*/
void dav1d_data_move_ref(Dav1dData *dst, Dav1dData *src);
/**
* Copy the source properties to the destitionatin and increase the
* user_data's reference count (if it's not NULL).

145
third_party/dav1d/src/decode.c (vendored)

@ -1,5 +1,5 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018-2021, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
@ -35,6 +35,7 @@
#include "dav1d/data.h"
#include "common/frame.h"
#include "common/intops.h"
#include "src/ctx.h"
@ -727,7 +728,7 @@ static int decode_b(Dav1dTileContext *const t,
case_set(bh4, l., 1, by4);
case_set(bw4, a->, 0, bx4);
#undef set_ctx
if (f->frame_hdr->frame_type & 1) {
if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
refmvs_block *const r = &t->rt.r[(t->by & 31) + 5 + bh4 - 1][t->bx];
for (int x = 0; x < bw4; x++) {
r[x].ref.ref[0] = 0;
@ -748,7 +749,7 @@ static int decode_b(Dav1dTileContext *const t,
#undef set_ctx
}
} else {
if (f->frame_hdr->frame_type & 1 /* not intrabc */ &&
if (IS_INTER_OR_SWITCH(f->frame_hdr) /* not intrabc */ &&
b->comp_type == COMP_INTER_NONE && b->motion_mode == MM_WARP)
{
if (b->matrix[0] == SHRT_MIN) {
@ -791,7 +792,7 @@ static int decode_b(Dav1dTileContext *const t,
case_set(bw4, a->, 0, bx4);
#undef set_ctx
if (f->frame_hdr->frame_type & 1) {
if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
refmvs_block *const r = &t->rt.r[(t->by & 31) + 5 + bh4 - 1][t->bx];
for (int x = 0; x < bw4; x++) {
r[x].ref.ref[0] = b->ref[0] + 1;
@ -1043,7 +1044,7 @@ static int decode_b(Dav1dTileContext *const t,
if (b->skip_mode) {
b->intra = 0;
} else if (f->frame_hdr->frame_type & 1) {
} else if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
if (seg && (seg->ref >= 0 || seg->globalmv)) {
b->intra = !seg->ref;
} else {
@ -1064,7 +1065,7 @@ static int decode_b(Dav1dTileContext *const t,
// intra/inter-specific stuff
if (b->intra) {
uint16_t *const ymode_cdf = f->frame_hdr->frame_type & 1 ?
uint16_t *const ymode_cdf = IS_INTER_OR_SWITCH(f->frame_hdr) ?
ts->cdf.m.y_mode[dav1d_ymode_size_context[bs]] :
ts->cdf.kfym[dav1d_intra_mode_context[t->a->mode[bx4]]]
[dav1d_intra_mode_context[t->l.mode[by4]]];
@ -1252,7 +1253,7 @@ static int decode_b(Dav1dTileContext *const t,
rep_macro(type, t->dir skip, off, mul * b->skip); \
/* see aomedia bug 2183 for why we use luma coordinates here */ \
rep_macro(type, t->pal_sz_uv[diridx], off, mul * (has_chroma ? b->pal_sz[1] : 0)); \
if (f->frame_hdr->frame_type & 1) { \
if (IS_INTER_OR_SWITCH(f->frame_hdr)) { \
rep_macro(type, t->dir comp_type, off, mul * COMP_INTER_NONE); \
rep_macro(type, t->dir ref[0], off, mul * ((uint8_t) -1)); \
rep_macro(type, t->dir ref[1], off, mul * ((uint8_t) -1)); \
@ -1293,10 +1294,10 @@ static int decode_b(Dav1dTileContext *const t,
}
}
}
if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) {
if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) {
splat_intraref(&t->rt, t->by, t->bx, bs);
}
} else if (!(f->frame_hdr->frame_type & 1)) {
} else if (IS_KEY_OR_INTRA(f->frame_hdr)) {
// intra block copy
refmvs_candidate mvstack[8];
int n_mvs, ctx;
@ -1984,10 +1985,10 @@ static int decode_b(Dav1dTileContext *const t,
#undef set_ctx
}
if (!b->skip) {
uint16_t (*noskip_mask)[2] = &t->lf_mask->noskip_mask[by4];
uint16_t (*noskip_mask)[2] = &t->lf_mask->noskip_mask[by4 >> 1];
const unsigned mask = (~0U >> (32 - bw4)) << (bx4 & 15);
const int bx_idx = (bx4 & 16) >> 4;
for (int y = 0; y < bh4; y++, noskip_mask++) {
for (int y = 0; y < bh4; y += 2, noskip_mask++) {
(*noskip_mask)[bx_idx] |= mask;
if (bw4 == 32) // this should be mask >> 16, but it's 0xffffffff anyway
(*noskip_mask)[1] |= mask;
@ -2484,15 +2485,12 @@ static void read_restoration_info(Dav1dTileContext *const t,
lr->filter_h[1], lr->filter_h[2], ts->msac.rng);
} else if (lr->type == DAV1D_RESTORATION_SGRPROJ) {
const unsigned idx = dav1d_msac_decode_bools(&ts->msac, 4);
const uint16_t *const sgr_params = dav1d_sgr_params[idx];
lr->sgr_idx = idx;
lr->sgr_weights[0] = dav1d_sgr_params[idx][0] ?
dav1d_msac_decode_subexp(&ts->msac,
ts->lr_ref[p]->sgr_weights[0] + 96, 128, 4) - 96 :
0;
lr->sgr_weights[1] = dav1d_sgr_params[idx][1] ?
dav1d_msac_decode_subexp(&ts->msac,
ts->lr_ref[p]->sgr_weights[1] + 32, 128, 4) - 32 :
95;
lr->sgr_weights[0] = sgr_params[0] ? dav1d_msac_decode_subexp(&ts->msac,
ts->lr_ref[p]->sgr_weights[0] + 96, 128, 4) - 96 : 0;
lr->sgr_weights[1] = sgr_params[1] ? dav1d_msac_decode_subexp(&ts->msac,
ts->lr_ref[p]->sgr_weights[1] + 32, 128, 4) - 32 : 95;
memcpy(lr->filter_v, ts->lr_ref[p]->filter_v, sizeof(lr->filter_v));
memcpy(lr->filter_h, ts->lr_ref[p]->filter_h, sizeof(lr->filter_h));
ts->lr_ref[p] = lr;
@ -2513,20 +2511,20 @@ int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) {
const int col_sb_start = f->frame_hdr->tiling.col_start_sb[tile_col];
const int col_sb128_start = col_sb_start >> !f->seq_hdr->sb128;
if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) {
if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) {
dav1d_refmvs_tile_sbrow_init(&t->rt, &f->rf, ts->tiling.col_start,
ts->tiling.col_end, ts->tiling.row_start,
ts->tiling.row_end, t->by >> f->sb_shift,
ts->tiling.row);
}
reset_context(&t->l, !(f->frame_hdr->frame_type & 1), f->frame_thread.pass);
reset_context(&t->l, IS_KEY_OR_INTRA(f->frame_hdr), f->frame_thread.pass);
if (f->frame_thread.pass == 2) {
for (t->bx = ts->tiling.col_start,
t->a = f->a + col_sb128_start + tile_row * f->sb128w;
t->bx < ts->tiling.col_end; t->bx += sb_step)
{
if (atomic_load_explicit(c->frame_thread.flush, memory_order_acquire))
if (atomic_load_explicit(c->flush, memory_order_acquire))
return 1;
if (decode_sb(t, root_bl, c->intra_edge.root[root_bl]))
return 1;
@ -2557,7 +2555,7 @@ int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) {
t->lf_mask = f->lf.mask + sb128y * f->sb128w + col_sb128_start;
t->bx < ts->tiling.col_end; t->bx += sb_step)
{
if (atomic_load_explicit(c->frame_thread.flush, memory_order_acquire))
if (atomic_load_explicit(c->flush, memory_order_acquire))
return 1;
if (root_bl == BL_128X128) {
t->cur_sb_cdef_idx_ptr = t->lf_mask->cdef_idx;
@ -2631,7 +2629,7 @@ int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) {
}
}
if (f->n_tc > 1 && f->frame_hdr->frame_type & 1) {
if (f->n_tc > 1 && IS_INTER_OR_SWITCH(f->frame_hdr)) {
dav1d_refmvs_save_tmvs(&t->rt,
ts->tiling.col_start >> 1, ts->tiling.col_end >> 1,
t->by >> 1, (t->by + sb_step) >> 1);
@ -2859,7 +2857,9 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
const int lr_line_sz = ((f->sr_cur.p.p.w + 31) & ~31) << hbd;
if (lr_line_sz != f->lf.lr_line_sz) {
dav1d_freep_aligned(&f->lf.lr_lpf_line[0]);
uint8_t *lr_ptr = dav1d_alloc_aligned(lr_line_sz * 3 * 12, 32);
const int num_lines = c->n_pfc > 1 ? f->sbh * (4 << f->seq_hdr->sb128) : 12;
// lr simd may overread the input, so slightly over-allocate the lpf buffer
uint8_t *lr_ptr = dav1d_alloc_aligned(lr_line_sz * num_lines * 3 + 64, 32);
if (!lr_ptr) {
f->lf.lr_line_sz = 0;
goto error;
@ -2867,7 +2867,7 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
for (int pl = 0; pl <= 2; pl++) {
f->lf.lr_lpf_line[pl] = lr_ptr;
lr_ptr += lr_line_sz * 12;
lr_ptr += lr_line_sz * num_lines;
}
f->lf.lr_line_sz = lr_line_sz;
@ -2949,26 +2949,30 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
}
// init ref mvs
if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) {
if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) {
const int ret =
dav1d_refmvs_init_frame(&f->rf, f->seq_hdr, f->frame_hdr,
f->refpoc, f->mvs, f->refrefpoc, f->ref_mvs, f->n_tc);
if (ret < 0) goto error;
}
// create post-filtering tasks
if (c->n_pfc > 1)
if (dav1d_task_create_filter_sbrow(f))
goto error;
retval = DAV1D_ERR(EINVAL);
// setup dequant tables
init_quant_tables(f->seq_hdr, f->frame_hdr, f->frame_hdr->quant.yac, f->dq);
if (f->frame_hdr->quant.qm)
for (int j = 0; j < N_RECT_TX_SIZES; j++) {
f->qm[0][j][0] = dav1d_qm_tbl[f->frame_hdr->quant.qm_y][0][j];
f->qm[0][j][1] = dav1d_qm_tbl[f->frame_hdr->quant.qm_u][1][j];
f->qm[0][j][2] = dav1d_qm_tbl[f->frame_hdr->quant.qm_v][1][j];
for (int i = 0; i < N_RECT_TX_SIZES; i++) {
f->qm[i][0] = dav1d_qm_tbl[f->frame_hdr->quant.qm_y][0][i];
f->qm[i][1] = dav1d_qm_tbl[f->frame_hdr->quant.qm_u][1][i];
f->qm[i][2] = dav1d_qm_tbl[f->frame_hdr->quant.qm_v][1][i];
}
for (int i = f->frame_hdr->quant.qm; i < 2; i++)
for (int tx = 0; tx < N_RECT_TX_SIZES; tx++)
for (int pl = 0; pl < 3; pl++)
f->qm[i][tx][pl] = dav1d_qm_tbl[15][!!pl][tx];
else
memset(f->qm, 0, sizeof(f->qm));
// setup jnt_comp weights
if (f->frame_hdr->switchable_comp_refs) {
@ -3079,9 +3083,9 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
f->frame_thread.pass == 1 ? PLANE_TYPE_BLOCK : PLANE_TYPE_Y;
for (int n = 0; n < f->sb128w * f->frame_hdr->tiling.rows; n++)
reset_context(&f->a[n], !(f->frame_hdr->frame_type & 1), f->frame_thread.pass);
reset_context(&f->a[n], IS_KEY_OR_INTRA(f->frame_hdr), f->frame_thread.pass);
if (f->n_tc == 1) {
if (f->n_tc == 1 || (c->n_pfc > 1 && f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows == 1)) {
Dav1dTileContext *const t = f->tc;
// no tile threading - we explicitly interleave tile/sbrow decoding
@ -3108,18 +3112,31 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
}
for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) {
t->ts = &f->ts[tile_row * f->frame_hdr->tiling.cols + tile_col];
if (dav1d_decode_tile_sbrow(t)) goto error;
}
if (f->frame_thread.pass <= 1 && f->frame_hdr->frame_type & 1) {
if (f->frame_thread.pass <= 1 && IS_INTER_OR_SWITCH(f->frame_hdr)) {
dav1d_refmvs_save_tmvs(&t->rt, 0, f->bw >> 1, t->by >> 1, by_end);
}
// loopfilter + cdef + restoration
if (f->frame_thread.pass != 1)
f->bd_fn.filter_sbrow(f, sby);
dav1d_thread_picture_signal(&f->sr_cur, (sby + 1) * f->sb_step * 4,
progress_plane_type);
if (f->frame_thread.pass != 1) {
if (c->n_pfc == 1)
f->bd_fn.filter_sbrow(f, sby);
else {
pthread_mutex_lock(&f->lf.thread.pftd->lock);
if (f->lf.thread.npf != 0 && !f->lf.thread.done) {
Dav1dTask *const t = &f->lf.thread.tasks[sby * f->lf.thread.npf];
t->start = 1;
if (t->status == DAV1D_TASK_READY)
dav1d_task_schedule(f->lf.thread.pftd, t);
}
pthread_mutex_unlock(&f->lf.thread.pftd->lock);
}
}
if (c->n_pfc == 1 || f->frame_thread.pass == 1 || f->lf.thread.npf == 0)
dav1d_thread_picture_signal(&f->sr_cur,
(sby + 1) * f->sb_step * 4,
progress_plane_type);
}
}
} else {
@ -3142,7 +3159,6 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
pthread_cond_broadcast(&f->tile_thread.cond);
pthread_mutex_unlock(&f->tile_thread.lock);
// loopfilter + cdef + restoration
for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) {
for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row];
sby < f->frame_hdr->tiling.row_start_sb[tile_row + 1]; sby++)
@ -3174,10 +3190,24 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
}
// loopfilter + cdef + restoration
if (f->frame_thread.pass != 1)
f->bd_fn.filter_sbrow(f, sby);
dav1d_thread_picture_signal(&f->sr_cur, (sby + 1) * f->sb_step * 4,
progress_plane_type);
if (f->frame_thread.pass != 1) {
if (c->n_pfc == 1)
f->bd_fn.filter_sbrow(f, sby);
else {
pthread_mutex_lock(&f->lf.thread.pftd->lock);
if (f->lf.thread.npf != 0 && !f->lf.thread.done) {
Dav1dTask *const t = &f->lf.thread.tasks[sby * f->lf.thread.npf];
t->start = 1;
if (t->status == DAV1D_TASK_READY)
dav1d_task_schedule(f->lf.thread.pftd, t);
}
pthread_mutex_unlock(&f->lf.thread.pftd->lock);
}
}
if (c->n_pfc == 1 || f->frame_thread.pass == 1 || f->lf.thread.npf == 0)
dav1d_thread_picture_signal(&f->sr_cur,
(sby + 1) * f->sb_step * 4,
progress_plane_type);
}
}
@ -3222,6 +3252,17 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
retval = 0;
error:
if (c->n_pfc > 1) {
pthread_mutex_lock(&f->lf.thread.pftd->lock);
if (!f->lf.thread.done) {
if (retval != 0) {
f->lf.thread.done = -1;
pthread_cond_signal(&f->lf.thread.pftd->cond);
}
pthread_cond_wait(&f->lf.thread.cond, &f->lf.thread.pftd->lock);
}
pthread_mutex_unlock(&f->lf.thread.pftd->lock);
}
dav1d_thread_picture_signal(&f->sr_cur, retval == 0 ? UINT_MAX : FRAME_ERROR,
PLANE_TYPE_ALL);
for (int i = 0; i < 7; i++) {
@ -3329,6 +3370,10 @@ int dav1d_submit_frame(Dav1dContext *const c) {
f->bd_fn.recon_b_inter = dav1d_recon_b_inter_##bd##bpc; \
f->bd_fn.recon_b_intra = dav1d_recon_b_intra_##bd##bpc; \
f->bd_fn.filter_sbrow = dav1d_filter_sbrow_##bd##bpc; \
f->bd_fn.filter_sbrow_deblock = dav1d_filter_sbrow_deblock_##bd##bpc; \
f->bd_fn.filter_sbrow_cdef = dav1d_filter_sbrow_cdef_##bd##bpc; \
f->bd_fn.filter_sbrow_resize = dav1d_filter_sbrow_resize_##bd##bpc; \
f->bd_fn.filter_sbrow_lr = dav1d_filter_sbrow_lr_##bd##bpc; \
f->bd_fn.backup_ipred_edge = dav1d_backup_ipred_edge_##bd##bpc; \
f->bd_fn.read_coef_blocks = dav1d_read_coef_blocks_##bd##bpc
if (!f->seq_hdr->hbd) {
@ -3343,7 +3388,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
#undef assign_bitdepth_case
int ref_coded_width[7];
if (f->frame_hdr->frame_type & 1) {
if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
if (f->frame_hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE) {
const int pri_ref = f->frame_hdr->refidx[f->frame_hdr->primary_ref_frame];
if (!c->refs[pri_ref].p.p.data[0]) {
@ -3461,7 +3506,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
f->bitdepth_max = (1 << f->cur.p.bpc) - 1;
// ref_mvs
if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) {
if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) {
f->mvs_ref = dav1d_ref_create_using_pool(c->refmvs_pool,
sizeof(*f->mvs) * f->sb128h * 16 * (f->b4_stride >> 1));
if (!f->mvs_ref) {

third_party/dav1d/src/dequant_tables.c

@ -29,7 +29,7 @@
#include "src/dequant_tables.h"
const uint16_t dav1d_dq_tbl[][QINDEX_RANGE][2] = {
const uint16_t dav1d_dq_tbl[3][QINDEX_RANGE][2] = {
{
{ 4, 4, }, { 8, 8, }, { 8, 9, }, { 9, 10, },
{ 10, 11, }, { 11, 12, }, { 12, 13, }, { 12, 14, },

third_party/dav1d/src/dequant_tables.h

@ -32,6 +32,6 @@
#include "src/levels.h"
extern const uint16_t dav1d_dq_tbl[][QINDEX_RANGE][2];
extern const uint16_t dav1d_dq_tbl[3][QINDEX_RANGE][2];
#endif /* DAV1D_SRC_DEQUANT_TABLES_H */

third_party/dav1d/src/ext/x86/x86inc.asm

@ -1,7 +1,7 @@
;*****************************************************************************
;* x86inc.asm: x86 abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2020 x264 project
;* Copyright (C) 2005-2021 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Henrik Gramner <henrik@gramner.com>
@ -349,6 +349,28 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
%define high_mm_regs (16*cpuflag(avx512))
; Large stack allocations on Windows need to use stack probing in order
; to guarantee that all stack memory is committed before accessing it.
; This is done by ensuring that the guard page(s) at the end of the
; currently committed pages are touched prior to any pages beyond that.
%if WIN64
%assign STACK_PROBE_SIZE 8192
%elifidn __OUTPUT_FORMAT__, win32
%assign STACK_PROBE_SIZE 4096
%else
%assign STACK_PROBE_SIZE 0
%endif
%macro PROBE_STACK 1 ; stack_size
%if STACK_PROBE_SIZE
%assign %%i STACK_PROBE_SIZE
%rep %1 / STACK_PROBE_SIZE
mov eax, [rsp-%%i]
%assign %%i %%i+STACK_PROBE_SIZE
%endrep
%endif
%endmacro
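A minimal C analog of the probing idea (illustrative only, assuming a 4096-byte probe granularity): touch one byte per step so each guard page is committed in order before anything deeper is accessed, which is what the mov eax, [rsp-i] probes above do for the stack.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

enum { PROBE_STEP = 4096 };   /* assumed probe/page granularity */

static void probe_region(volatile uint8_t *const base, const size_t size) {
    for (size_t off = 0; off < size; off += PROBE_STEP)
        base[off] = 0;        /* commits the page containing base + off */
    if (size)
        base[size - 1] = 0;
}

int main(void) {
    const size_t size = 3 * PROBE_STEP + 123;
    uint8_t *const buf = malloc(size);
    if (!buf) return 1;
    probe_region(buf, size);
    printf("probed %zu bytes in %d-byte steps\n", size, PROBE_STEP);
    free(buf);
    return 0;
}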
%macro ALLOC_STACK 0-2 0, 0 ; stack_size, n_xmm_regs (for win64 only)
%ifnum %1
%if %1 != 0
@ -369,6 +391,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%if required_stack_alignment <= STACK_ALIGNMENT
; maintain the current stack alignment
%assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
PROBE_STACK stack_size_padded
SUB rsp, stack_size_padded
%else
%assign %%reg_num (regs_used - 1)
@ -384,6 +407,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%xdefine rstkm rstk
%endif
%assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1))
PROBE_STACK stack_size_padded
mov rstk, rsp
and rsp, ~(required_stack_alignment-1)
sub rsp, stack_size_padded
@ -1139,8 +1163,7 @@ INIT_XMM
%endif
%xdefine %%tmp %%f %+ 0
%ifnum %%tmp
RESET_MM_PERMUTATION
AVX512_MM_PERMUTATION
DEFINE_MMREGS mmtype
%assign %%i 0
%rep num_mmregs
%xdefine %%tmp %%f %+ %%i

third_party/dav1d/src/internal.h

@ -35,6 +35,8 @@
typedef struct Dav1dFrameContext Dav1dFrameContext;
typedef struct Dav1dTileState Dav1dTileState;
typedef struct Dav1dTileContext Dav1dTileContext;
typedef struct Dav1dPostFilterContext Dav1dPostFilterContext;
typedef struct Dav1dTask Dav1dTask;
#include "common/attributes.h"
@ -76,6 +78,9 @@ struct Dav1dContext {
Dav1dFrameContext *fc;
unsigned n_fc;
Dav1dPostFilterContext *pfc;
unsigned n_pfc;
// cache of OBUs that make up a single frame before we submit them
// to a frame worker to be decoded
struct Dav1dTileGroup *tile;
@ -99,15 +104,23 @@ struct Dav1dContext {
// decoded output picture queue
Dav1dData in;
Dav1dPicture out;
// dummy is a pointer to prevent compiler errors about atomic_load()
// not taking const arguments
atomic_int flush_mem, *flush;
struct {
Dav1dThreadPicture *out_delayed;
unsigned next;
// dummy is a pointer to prevent compiler errors about atomic_load()
// not taking const arguments; the const attribute is not taken
// from pointers
atomic_int flush_mem, *flush;
} frame_thread;
// postfilter threading (refer to pfc[] for per_thread thingies)
struct PostFilterThreadData {
pthread_mutex_t lock;
pthread_cond_t cond;
struct Dav1dTask *tasks;
int frame_cnt;
int inited;
} postfilter_thread;
// reference/entropy state
Dav1dMemPool *segmap_pool;
Dav1dMemPool *refmvs_pool;
@ -182,6 +195,10 @@ struct Dav1dFrameContext {
recon_b_intra_fn recon_b_intra;
recon_b_inter_fn recon_b_inter;
filter_sbrow_fn filter_sbrow;
filter_sbrow_fn filter_sbrow_deblock;
filter_sbrow_fn filter_sbrow_cdef;
filter_sbrow_fn filter_sbrow_resize;
filter_sbrow_fn filter_sbrow_lr;
backup_ipred_edge_fn backup_ipred_edge;
read_coef_blocks_fn read_coef_blocks;
} bd_fn;
@ -191,7 +208,7 @@ struct Dav1dFrameContext {
ptrdiff_t b4_stride;
int w4, h4, bw, bh, sb128w, sb128h, sbh, sb_shift, sb_step, sr_sb128w;
uint16_t dq[DAV1D_MAX_SEGMENTS][3 /* plane */][2 /* dc/ac */];
const uint8_t *qm[2 /* is_1d */][N_RECT_TX_SIZES][3 /* plane */];
const uint8_t *qm[N_RECT_TX_SIZES][3 /* plane */];
BlockContext *a;
int a_sz /* w*tile_rows */;
refmvs_frame rf;
@ -238,6 +255,16 @@ struct Dav1dFrameContext {
pixel *p[3], *sr_p[3];
Av1Filter *mask_ptr, *prev_mask_ptr;
int restore_planes; // enum LrRestorePlanes
struct {
pthread_cond_t cond;
struct PostFilterThreadData *pftd;
struct Dav1dTask *tasks;
int num_tasks;
int npf;
int done;
int inited;
} thread;
} lf;
// threading (refer to tc[] for per-thread things)
@ -353,4 +380,11 @@ struct Dav1dTileContext {
} tile_thread;
};
struct Dav1dPostFilterContext {
Dav1dContext *c;
struct thread_data td;
int flushed;
int die;
};
#endif /* DAV1D_SRC_INTERNAL_H */
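A deliberately simplified sketch (not dav1d's actual scheduler) of the producer/consumer pattern these structs support: decode threads publish per-sbrow post-filter work under postfilter_thread.lock and signal its cond, while Dav1dPostFilterContext workers wait on that condition variable and drain the tasks.

#include <pthread.h>
#include <stdio.h>

typedef struct Queue {
    pthread_mutex_t lock;
    pthread_cond_t cond;
    int pending;   /* sbrows waiting for post-filtering */
    int die;       /* no more work will be posted */
} Queue;

static void *worker(void *const arg) {
    Queue *const q = arg;
    pthread_mutex_lock(&q->lock);
    while (q->pending || !q->die) {
        if (!q->pending) {
            pthread_cond_wait(&q->cond, &q->lock);
            continue;
        }
        q->pending--;
        pthread_mutex_unlock(&q->lock);
        puts("filter one sbrow");   /* deblock/cdef/resize/lr would run here */
        pthread_mutex_lock(&q->lock);
    }
    pthread_mutex_unlock(&q->lock);
    return NULL;
}

int main(void) {
    Queue q = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0, 0 };
    pthread_t th;
    if (pthread_create(&th, NULL, worker, &q)) return 1;
    pthread_mutex_lock(&q.lock);
    q.pending = 3;                  /* three sbrows become ready */
    q.die = 1;                      /* and nothing after them */
    pthread_cond_broadcast(&q.cond);
    pthread_mutex_unlock(&q.lock);
    pthread_join(th, NULL);
    return 0;
}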

third_party/dav1d/src/lf_mask.c

@ -89,7 +89,7 @@ static inline void mask_edges_inter(uint16_t (*const masks)[32][3][2],
const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[max_tx];
int y, x;
uint8_t txa[2 /* edge */][2 /* txsz, step */][32 /* y */][32 /* x */];
ALIGN_STK_16(uint8_t, txa, 2 /* edge */, [2 /* txsz, step */][32 /* y */][32 /* x */]);
for (int y_off = 0, y = 0; y < h4; y += t_dim->h, y_off++)
for (int x_off = 0, x = 0; x < w4; x += t_dim->w, x_off++)
decomp_tx((uint8_t(*)[2][32][32]) &txa[0][0][y][x],

third_party/dav1d/src/lf_mask.h

@ -40,11 +40,11 @@ typedef struct Av1FilterLUT {
} Av1FilterLUT;
typedef struct Av1RestorationUnit {
enum Dav1dRestorationType type;
uint8_t /* enum Dav1dRestorationType */ type;
int8_t filter_h[3];
int8_t filter_v[3];
uint8_t sgr_idx;
int16_t sgr_weights[2];
int8_t sgr_weights[2];
} Av1RestorationUnit;
// each struct describes one 128x128 area (1 or 4 SBs), pre-superres-scaling
@ -53,7 +53,7 @@ typedef struct Av1Filter {
uint16_t filter_y[2 /* 0=col, 1=row */][32][3][2];
uint16_t filter_uv[2 /* 0=col, 1=row */][32][2][2];
int8_t cdef_idx[4]; // -1 means "unset"
uint16_t noskip_mask[32][2];
uint16_t noskip_mask[16][2]; // for 8x8 blocks, but stored on a 4x8 basis
} Av1Filter;
// each struct describes one 128x128 area (1 or 4 SBs), post-superres-scaling

third_party/dav1d/src/lib.c

@ -65,6 +65,7 @@ COLD const char *dav1d_version(void) {
COLD void dav1d_default_settings(Dav1dSettings *const s) {
s->n_frame_threads = 1;
s->n_tile_threads = 1;
s->n_postfilter_threads = 1;
s->apply_grain = 1;
s->allocator.cookie = NULL;
s->allocator.alloc_picture_callback = dav1d_default_picture_alloc;
@ -100,6 +101,8 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
validate_input_or_ret(c_out != NULL, DAV1D_ERR(EINVAL));
validate_input_or_ret(s != NULL, DAV1D_ERR(EINVAL));
validate_input_or_ret(s->n_postfilter_threads >= 1 &&
s->n_postfilter_threads <= DAV1D_MAX_POSTFILTER_THREADS, DAV1D_ERR(EINVAL));
validate_input_or_ret(s->n_tile_threads >= 1 &&
s->n_tile_threads <= DAV1D_MAX_TILE_THREADS, DAV1D_ERR(EINVAL));
validate_input_or_ret(s->n_frame_threads >= 1 &&
@ -136,9 +139,17 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
{
goto error;
}
if (c->allocator.alloc_picture_callback == dav1d_default_picture_alloc) {
if (c->allocator.alloc_picture_callback == dav1d_default_picture_alloc &&
c->allocator.release_picture_callback == dav1d_default_picture_release)
{
if (c->allocator.cookie) goto error;
if (dav1d_mem_pool_init(&c->picture_pool)) goto error;
c->allocator.cookie = c->picture_pool;
} else if (c->allocator.alloc_picture_callback == dav1d_default_picture_alloc ||
c->allocator.release_picture_callback == dav1d_default_picture_release)
{
goto error;
}
/* On 32-bit systems extremely large frame sizes can cause overflows in
@ -152,12 +163,49 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
s->frame_size_limit, c->frame_size_limit);
}
c->frame_thread.flush = &c->frame_thread.flush_mem;
atomic_init(c->frame_thread.flush, 0);
c->flush = &c->flush_mem;
atomic_init(c->flush, 0);
c->n_pfc = s->n_postfilter_threads;
c->n_fc = s->n_frame_threads;
c->fc = dav1d_alloc_aligned(sizeof(*c->fc) * s->n_frame_threads, 32);
if (!c->fc) goto error;
memset(c->fc, 0, sizeof(*c->fc) * s->n_frame_threads);
if (c->n_pfc > 1) {
c->pfc = dav1d_alloc_aligned(sizeof(*c->pfc) * s->n_postfilter_threads, 32);
if (!c->pfc) goto error;
memset(c->pfc, 0, sizeof(*c->pfc) * s->n_postfilter_threads);
if (pthread_mutex_init(&c->postfilter_thread.lock, NULL)) goto error;
if (pthread_cond_init(&c->postfilter_thread.cond, NULL)) {
pthread_mutex_destroy(&c->postfilter_thread.lock);
goto error;
}
c->postfilter_thread.inited = 1;
for (int n = 0; n < s->n_frame_threads; n++) {
Dav1dFrameContext *const f = &c->fc[n];
if (pthread_cond_init(&f->lf.thread.cond, NULL)) goto error;
f->lf.thread.pftd = &c->postfilter_thread;
f->lf.thread.done = 1;
f->lf.thread.inited = 1;
}
for (int n = 0; n < s->n_postfilter_threads; ++n) {
Dav1dPostFilterContext *const pf = &c->pfc[n];
pf->c = c;
if (pthread_mutex_init(&pf->td.lock, NULL)) goto error;
if (pthread_cond_init(&pf->td.cond, NULL)) {
pthread_mutex_destroy(&pf->td.lock);
goto error;
}
if (pthread_create(&pf->td.thread, &thread_attr, dav1d_postfilter_task, pf)) {
pthread_cond_destroy(&c->postfilter_thread.cond);
pthread_mutex_destroy(&c->postfilter_thread.lock);
goto error;
}
pf->td.inited = 1;
}
}
if (c->n_fc > 1) {
c->frame_thread.out_delayed =
calloc(c->n_fc, sizeof(*c->frame_thread.out_delayed));
@ -459,11 +507,17 @@ void dav1d_flush(Dav1dContext *const c) {
dav1d_ref_dec(&c->content_light_ref);
dav1d_ref_dec(&c->itut_t35_ref);
if (c->n_fc == 1) return;
if (c->n_fc == 1 && c->n_pfc == 1) return;
// mark each currently-running frame as flushing, so that we
// exit out as quickly as the running thread checks this flag
atomic_store(c->frame_thread.flush, 1);
// wait for threads to complete flushing
if (c->n_pfc > 1)
pthread_mutex_lock(&c->postfilter_thread.lock);
atomic_store(c->flush, 1);
if (c->n_pfc > 1) {
pthread_cond_broadcast(&c->postfilter_thread.cond);
pthread_mutex_unlock(&c->postfilter_thread.lock);
}
if (c->n_fc == 1) goto skip_ft_flush;
for (unsigned n = 0, next = c->frame_thread.next; n < c->n_fc; n++, next++) {
if (next == c->n_fc) next = 0;
Dav1dFrameContext *const f = &c->fc[next];
@ -475,13 +529,31 @@ void dav1d_flush(Dav1dContext *const c) {
assert(!f->cur.data[0]);
}
pthread_mutex_unlock(&f->frame_thread.td.lock);
Dav1dThreadPicture *const out_delayed = &c->frame_thread.out_delayed[next];
Dav1dThreadPicture *const out_delayed =
&c->frame_thread.out_delayed[next];
if (out_delayed->p.data[0])
dav1d_thread_picture_unref(out_delayed);
}
atomic_store(c->frame_thread.flush, 0);
c->frame_thread.next = 0;
skip_ft_flush:
if (c->n_pfc > 1) {
for (unsigned i = 0; i < c->n_pfc; ++i) {
Dav1dPostFilterContext *const pf = &c->pfc[i];
pthread_mutex_lock(&pf->td.lock);
if (!pf->flushed)
pthread_cond_wait(&pf->td.cond, &pf->td.lock);
pf->flushed = 0;
pthread_mutex_unlock(&pf->td.lock);
}
pthread_mutex_lock(&c->postfilter_thread.lock);
c->postfilter_thread.tasks = NULL;
pthread_mutex_unlock(&c->postfilter_thread.lock);
for (unsigned i = 0; i < c->n_fc; ++i) {
freep(&c->fc[i].lf.thread.tasks);
c->fc[i].lf.thread.num_tasks = 0;
}
}
atomic_store(c->flush, 0);
}
COLD void dav1d_close(Dav1dContext **const c_out) {
@ -495,6 +567,25 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
if (flush) dav1d_flush(c);
if (c->pfc) {
struct PostFilterThreadData *pftd = &c->postfilter_thread;
if (pftd->inited) {
pthread_mutex_lock(&pftd->lock);
for (unsigned n = 0; n < c->n_pfc && c->pfc[n].td.inited; n++)
c->pfc[n].die = 1;
pthread_cond_broadcast(&pftd->cond);
pthread_mutex_unlock(&pftd->lock);
for (unsigned n = 0; n < c->n_pfc && c->pfc[n].td.inited; n++) {
pthread_join(c->pfc[n].td.thread, NULL);
pthread_cond_destroy(&c->pfc[n].td.cond);
pthread_mutex_destroy(&c->pfc[n].td.lock);
}
pthread_cond_destroy(&pftd->cond);
pthread_mutex_destroy(&pftd->lock);
}
dav1d_free_aligned(c->pfc);
}
for (unsigned n = 0; c->fc && n < c->n_fc; n++) {
Dav1dFrameContext *const f = &c->fc[n];
@ -546,6 +637,10 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
pthread_cond_destroy(&ts->tile_thread.cond);
pthread_mutex_destroy(&ts->tile_thread.lock);
}
if (f->lf.thread.inited) {
freep(&f->lf.thread.tasks);
pthread_cond_destroy(&f->lf.thread.cond);
}
dav1d_free_aligned(f->ts);
dav1d_free_aligned(f->tc);
dav1d_free_aligned(f->ipred_edge[0]);

third_party/dav1d/src/looprestoration.h

@ -46,29 +46,32 @@ typedef const pixel (*const_left_pixel_row)[4];
typedef const void *const_left_pixel_row;
#endif
// Although the spec applies restoration filters over 4x4 blocks, the wiener
// filter can be applied to a bigger surface.
typedef union LooprestorationParams {
ALIGN(int16_t filter[2][8], 16);
struct {
uint32_t s0, s1;
int16_t w0, w1;
} sgr;
} LooprestorationParams;
// Although the spec applies restoration filters over 4x4 blocks,
// they can be applied to a bigger surface.
// * w is constrained by the restoration unit size (w <= 256)
// * h is constrained by the stripe height (h <= 64)
#define decl_wiener_filter_fn(name) \
// The filter functions are allowed to do aligned writes past the right
// edge of the buffer, aligned up to the minimum loop restoration unit size
// (which is 32 pixels for subsampled chroma and 64 pixels for luma).
#define decl_lr_filter_fn(name) \
void (name)(pixel *dst, ptrdiff_t dst_stride, \
const_left_pixel_row left, \
const pixel *lpf, ptrdiff_t lpf_stride, \
int w, int h, const int16_t filter[2][8], \
int w, int h, const LooprestorationParams *params, \
enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
typedef decl_wiener_filter_fn(*wienerfilter_fn);
#define decl_selfguided_filter_fn(name) \
void (name)(pixel *dst, ptrdiff_t dst_stride, \
const_left_pixel_row left, \
const pixel *lpf, ptrdiff_t lpf_stride, \
int w, int h, int sgr_idx, const int16_t sgr_w[2], \
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
typedef decl_selfguided_filter_fn(*selfguided_fn);
typedef decl_lr_filter_fn(*looprestorationfilter_fn);
typedef struct Dav1dLoopRestorationDSPContext {
wienerfilter_fn wiener[2]; /* 7-tap, 5-tap */
selfguided_fn selfguided;
looprestorationfilter_fn wiener[2]; /* 7-tap, 5-tap */
looprestorationfilter_fn sgr[3]; /* 5x5, 3x3, mix */
} Dav1dLoopRestorationDSPContext;
bitfn_decls(void dav1d_loop_restoration_dsp_init, Dav1dLoopRestorationDSPContext *c, int bpc);
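The two old entry-point shapes (a 2x8 Wiener tap table vs. sgr_idx plus weights) are folded into one LooprestorationParams argument, so every loop restoration function shares a signature and can live in the wiener[]/sgr[] tables above. A self-contained sketch of the union idea (simplified: the 16-byte alignment attribute is dropped and the values are made up):

#include <stdint.h>
#include <stdio.h>

typedef union Params {                /* simplified stand-in for LooprestorationParams */
    int16_t filter[2][8];             /* Wiener taps: [0] horizontal, [1] vertical */
    struct { uint32_t s0, s1; int16_t w0, w1; } sgr;  /* SGR strengths and weights */
} Params;

int main(void) {
    const Params p = { .sgr = { .s0 = 25, .s1 = 0, .w0 = 31, .w1 = 97 } };
    printf("sizeof(Params) = %zu bytes (the 2x8 int16_t table dominates)\n", sizeof(p));
    printf("sgr: s0=%u s1=%u w0=%d w1=%d\n",
           (unsigned)p.sgr.s0, (unsigned)p.sgr.s1, p.sgr.w0, p.sgr.w1);
    return 0;
}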

third_party/dav1d/src/looprestoration_tmpl.c

@ -39,10 +39,10 @@
// TODO Reuse p when no padding is needed (add and remove lpf pixels in p)
// TODO Chroma only requires 2 rows of padding.
static void padding(pixel *dst, const pixel *p, const ptrdiff_t p_stride,
const pixel (*left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
int unit_w, const int stripe_h, const enum LrEdgeFlags edges)
static NOINLINE void
padding(pixel *dst, const pixel *p, const ptrdiff_t p_stride,
const pixel (*left)[4], const pixel *lpf, const ptrdiff_t lpf_stride,
int unit_w, const int stripe_h, const enum LrEdgeFlags edges)
{
const int have_left = !!(edges & LR_HAVE_LEFT);
const int have_right = !!(edges & LR_HAVE_RIGHT);
@ -135,7 +135,7 @@ static void wiener_c(pixel *p, const ptrdiff_t p_stride,
const pixel (*const left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h,
const int16_t filter[2][8],
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
// Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
@ -150,6 +150,7 @@ static void wiener_c(pixel *p, const ptrdiff_t p_stride,
uint16_t hor[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
uint16_t *hor_ptr = hor;
const int16_t (*const filter)[8] = params->filter;
const int bitdepth = bitdepth_from_max(bitdepth_max);
const int round_bits_h = 3 + (bitdepth == 12) * 2;
const int rounding_off_h = 1 << (round_bits_h - 1);
@ -347,12 +348,12 @@ static void boxsum5(int32_t *sumsq, coef *sum, const pixel *const src,
}
}
static void selfguided_filter(coef *dst, const pixel *src,
const ptrdiff_t src_stride, const int w,
const int h, const int n, const int s
HIGHBD_DECL_SUFFIX)
static NOINLINE void
selfguided_filter(coef *dst, const pixel *src, const ptrdiff_t src_stride,
const int w, const int h, const int n, const unsigned s
HIGHBD_DECL_SUFFIX)
{
const int sgr_one_by_x = n == 25 ? 164 : 455;
const unsigned sgr_one_by_x = n == 25 ? 164 : 455;
// Selfguided filter is applied to a maximum stripe height of 64 + 3 pixels
// of padding above and below
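The 164 and 455 constants above are the rounded Q12 reciprocals of the box area n used by the two self-guided passes (25 for the 5x5 pass, 9 for the 3x3 pass); a quick check:

#include <stdio.h>
int main(void) {
    printf("n=25: (4096 + 12) / 25 = %d\n", (4096 + 25 / 2) / 25);  /* 164 */
    printf("n= 9: (4096 +  4) /  9 = %d\n", (4096 +  9 / 2) /  9);  /* 455 */
    return 0;
}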
@ -446,71 +447,93 @@ static void selfguided_filter(coef *dst, const pixel *src,
#undef EIGHT_NEIGHBORS
}
static void selfguided_c(pixel *p, const ptrdiff_t p_stride,
const pixel (*const left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h, const int sgr_idx,
const int16_t sgr_w[2], const enum LrEdgeFlags edges
HIGHBD_DECL_SUFFIX)
static void sgr_5x5_c(pixel *p, const ptrdiff_t p_stride,
const pixel (*const left)[4], const pixel *lpf,
const ptrdiff_t lpf_stride, const int w, const int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
// Selfguided filter is applied to a maximum stripe height of 64 + 3 pixels
// of padding above and below
pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
// Selfguided filter outputs to a maximum stripe height of 64 and a
// maximum restoration width of 384 (256 * 1.5)
coef dst[64 * 384];
// both r1 and r0 can't be zero
if (!dav1d_sgr_params[sgr_idx][0]) {
const int s1 = dav1d_sgr_params[sgr_idx][3];
selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 9, s1 HIGHBD_TAIL_SUFFIX);
const int w1 = (1 << 7) - sgr_w[1];
for (int j = 0; j < h; j++) {
for (int i = 0; i < w; i++) {
const int u = (p[i] << 4);
const int v = (u << 7) + w1 * (dst[j * 384 + i] - u);
p[i] = iclip_pixel((v + (1 << 10)) >> 11);
}
p += PXSTRIDE(p_stride);
padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25,
params->sgr.s0 HIGHBD_TAIL_SUFFIX);
const int w0 = params->sgr.w0;
for (int j = 0; j < h; j++) {
for (int i = 0; i < w; i++) {
const int u = (p[i] << 4);
const int v = (u << 7) + w0 * (dst[j * 384 + i] - u);
p[i] = iclip_pixel((v + (1 << 10)) >> 11);
}
} else if (!dav1d_sgr_params[sgr_idx][1]) {
const int s0 = dav1d_sgr_params[sgr_idx][2];
selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25, s0 HIGHBD_TAIL_SUFFIX);
const int w0 = sgr_w[0];
for (int j = 0; j < h; j++) {
for (int i = 0; i < w; i++) {
const int u = (p[i] << 4);
const int v = (u << 7) + w0 * (dst[j * 384 + i] - u);
p[i] = iclip_pixel((v + (1 << 10)) >> 11);
}
p += PXSTRIDE(p_stride);
p += PXSTRIDE(p_stride);
}
}
static void sgr_3x3_c(pixel *p, const ptrdiff_t p_stride,
const pixel (*const left)[4], const pixel *lpf,
const ptrdiff_t lpf_stride, const int w, const int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
coef dst[64 * 384];
padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 9,
params->sgr.s1 HIGHBD_TAIL_SUFFIX);
const int w1 = params->sgr.w1;
for (int j = 0; j < h; j++) {
for (int i = 0; i < w; i++) {
const int u = (p[i] << 4);
const int v = (u << 7) + w1 * (dst[j * 384 + i] - u);
p[i] = iclip_pixel((v + (1 << 10)) >> 11);
}
} else {
coef dst1[64 * 384];
const int s0 = dav1d_sgr_params[sgr_idx][2];
const int s1 = dav1d_sgr_params[sgr_idx][3];
const int w0 = sgr_w[0];
const int w1 = (1 << 7) - w0 - sgr_w[1];
selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25, s0 HIGHBD_TAIL_SUFFIX);
selfguided_filter(dst1, tmp, REST_UNIT_STRIDE, w, h, 9, s1 HIGHBD_TAIL_SUFFIX);
for (int j = 0; j < h; j++) {
for (int i = 0; i < w; i++) {
const int u = (p[i] << 4);
const int v = (u << 7) + w0 * (dst[j * 384 + i] - u) +
w1 * (dst1[j * 384 + i] - u);
p[i] = iclip_pixel((v + (1 << 10)) >> 11);
}
p += PXSTRIDE(p_stride);
p += PXSTRIDE(p_stride);
}
}
static void sgr_mix_c(pixel *p, const ptrdiff_t p_stride,
const pixel (*const left)[4], const pixel *lpf,
const ptrdiff_t lpf_stride, const int w, const int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
coef dst0[64 * 384];
coef dst1[64 * 384];
padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
selfguided_filter(dst0, tmp, REST_UNIT_STRIDE, w, h, 25,
params->sgr.s0 HIGHBD_TAIL_SUFFIX);
selfguided_filter(dst1, tmp, REST_UNIT_STRIDE, w, h, 9,
params->sgr.s1 HIGHBD_TAIL_SUFFIX);
const int w0 = params->sgr.w0;
const int w1 = params->sgr.w1;
for (int j = 0; j < h; j++) {
for (int i = 0; i < w; i++) {
const int u = (p[i] << 4);
const int v = (u << 7) + w0 * (dst0[j * 384 + i] - u) +
w1 * (dst1[j * 384 + i] - u);
p[i] = iclip_pixel((v + (1 << 10)) >> 11);
}
p += PXSTRIDE(p_stride);
}
}
COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c, int bpc) {
c->wiener[0] = c->wiener[1] = wiener_c;
c->selfguided = selfguided_c;
c->sgr[0] = sgr_5x5_c;
c->sgr[1] = sgr_3x3_c;
c->sgr[2] = sgr_mix_c;
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM

third_party/dav1d/src/lr_apply_tmpl.c

@ -48,31 +48,32 @@ static void backup_lpf(const Dav1dFrameContext *const f,
const pixel *src, const ptrdiff_t src_stride,
const int ss_ver, const int sb128,
int row, const int row_h, const int src_w,
const int h, const int ss_hor)
const int h, const int ss_hor, const int pft)
{
const int dst_w = f->frame_hdr->super_res.enabled ?
(f->frame_hdr->width[1] + ss_hor) >> ss_hor : src_w;
// The first stripe of the frame is shorter by 8 luma pixel rows.
int stripe_h = (64 - 8 * !row) >> ss_ver;
if (row) {
const int top = 4 << sb128;
// Copy the top part of the stored loop filtered pixels from the
// previous sb row needed above the first stripe of this sb row.
pixel_copy(&dst[PXSTRIDE(dst_stride) * 0],
&dst[PXSTRIDE(dst_stride) * top], dst_w);
pixel_copy(&dst[PXSTRIDE(dst_stride) * 1],
&dst[PXSTRIDE(dst_stride) * (top + 1)], dst_w);
pixel_copy(&dst[PXSTRIDE(dst_stride) * 2],
&dst[PXSTRIDE(dst_stride) * (top + 2)], dst_w);
pixel_copy(&dst[PXSTRIDE(dst_stride) * 3],
&dst[PXSTRIDE(dst_stride) * (top + 3)], dst_w);
}
dst += 4 * PXSTRIDE(dst_stride);
src += (stripe_h - 2) * PXSTRIDE(src_stride);
if (!pft) {
if (row) {
const int top = 4 << sb128;
// Copy the top part of the stored loop filtered pixels from the
// previous sb row needed above the first stripe of this sb row.
pixel_copy(&dst[PXSTRIDE(dst_stride) * 0],
&dst[PXSTRIDE(dst_stride) * top], dst_w);
pixel_copy(&dst[PXSTRIDE(dst_stride) * 1],
&dst[PXSTRIDE(dst_stride) * (top + 1)], dst_w);
pixel_copy(&dst[PXSTRIDE(dst_stride) * 2],
&dst[PXSTRIDE(dst_stride) * (top + 2)], dst_w);
pixel_copy(&dst[PXSTRIDE(dst_stride) * 3],
&dst[PXSTRIDE(dst_stride) * (top + 3)], dst_w);
}
dst += 4 * PXSTRIDE(dst_stride);
}
if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
while (row + stripe_h <= row_h) {
const int n_lines = 4 - (row + stripe_h + 1 == h);
@ -107,9 +108,15 @@ static void backup_lpf(const Dav1dFrameContext *const f,
void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
/*const*/ pixel *const src[3], const int sby)
{
const int pft = f->c->n_pfc > 1;
const int offset = 8 * !!sby;
const ptrdiff_t *const src_stride = f->cur.stride;
const ptrdiff_t lr_stride = ((f->sr_cur.p.p.w + 31) & ~31) * sizeof(pixel);
pixel *const dst[3] = {
f->lf.lr_lpf_line[0] + pft * sby * (4 << f->seq_hdr->sb128) * PXSTRIDE(lr_stride),
f->lf.lr_lpf_line[1] + pft * sby * (4 << f->seq_hdr->sb128) * PXSTRIDE(lr_stride),
f->lf.lr_lpf_line[2] + pft * sby * (4 << f->seq_hdr->sb128) * PXSTRIDE(lr_stride)
};
// TODO Also check block level restore type to reduce copying.
const int restore_planes = f->lf.restore_planes;
@ -119,9 +126,9 @@ void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
const int w = f->bw << 2;
const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h - 1);
const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset;
backup_lpf(f, f->lf.lr_lpf_line[0], lr_stride,
backup_lpf(f, dst[0], lr_stride,
src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0],
0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0);
0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0, pft);
}
if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) {
const int ss_ver = f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
@ -130,18 +137,16 @@ void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
const int w = f->bw << (2 - ss_hor);
const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h - 1);
const int offset_uv = offset >> ss_ver;
const int y_stripe =
(sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
const int y_stripe = (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
if (restore_planes & LR_RESTORE_U) {
backup_lpf(f, f->lf.lr_lpf_line[1], lr_stride,
backup_lpf(f, dst[1], lr_stride,
src[1] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor);
ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor, pft);
}
if (restore_planes & LR_RESTORE_V) {
backup_lpf(f, f->lf.lr_lpf_line[2], lr_stride,
backup_lpf(f, dst[2], lr_stride,
src[2] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor);
ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor, pft);
}
}
}
@ -154,17 +159,18 @@ static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
const Dav1dDSPContext *const dsp = f->dsp;
const int chroma = !!plane;
const int ss_ver = chroma & (f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420);
const int sbrow_has_bottom = (edges & LR_HAVE_BOTTOM);
const pixel *lpf = f->lf.lr_lpf_line[plane] + x;
const ptrdiff_t p_stride = f->sr_cur.p.stride[chroma];
const ptrdiff_t lpf_stride = sizeof(pixel) * ((f->sr_cur.p.p.w + 31) & ~31);
const int sby = (y + (y ? 8 << ss_ver : 0)) >> (6 - ss_ver + f->seq_hdr->sb128);
const pixel *lpf = f->lf.lr_lpf_line[plane] + (f->c->n_pfc > 1) * (sby * (4 << f->seq_hdr->sb128) - 4) * PXSTRIDE(lpf_stride) + x;
// The first stripe of the frame is shorter by 8 luma pixel rows.
int stripe_h = imin((64 - 8 * !y) >> ss_ver, row_h - y);
ALIGN_STK_16(int16_t, filter, 2, [8]);
wienerfilter_fn wiener_fn = NULL;
looprestorationfilter_fn lr_fn;
LooprestorationParams params;
if (lr->type == DAV1D_RESTORATION_WIENER) {
int16_t (*const filter)[8] = params.filter;
filter[0][0] = filter[0][6] = lr->filter_h[0];
filter[0][1] = filter[0][5] = lr->filter_h[1];
filter[0][2] = filter[0][4] = lr->filter_h[2];
@ -180,25 +186,26 @@ static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
filter[1][2] = filter[1][4] = lr->filter_v[2];
filter[1][3] = 128 - (filter[1][0] + filter[1][1] + filter[1][2]) * 2;
wiener_fn = dsp->lr.wiener[!(filter[0][0] | filter[1][0])];
lr_fn = dsp->lr.wiener[!(filter[0][0] | filter[1][0])];
} else {
assert(lr->type == DAV1D_RESTORATION_SGRPROJ);
const uint16_t *const sgr_params = dav1d_sgr_params[lr->sgr_idx];
params.sgr.s0 = sgr_params[0];
params.sgr.s1 = sgr_params[1];
params.sgr.w0 = lr->sgr_weights[0];
params.sgr.w1 = 128 - (lr->sgr_weights[0] + lr->sgr_weights[1]);
lr_fn = dsp->lr.sgr[!!sgr_params[0] + !!sgr_params[1] * 2 - 1];
}
while (y + stripe_h <= row_h) {
// Change HAVE_BOTTOM bit in edges to (y + stripe_h != row_h)
edges ^= (-(y + stripe_h != row_h) ^ edges) & LR_HAVE_BOTTOM;
if (wiener_fn) {
wiener_fn(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
filter, edges HIGHBD_CALL_SUFFIX);
} else {
dsp->lr.selfguided(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
lr->sgr_idx, lr->sgr_weights, edges HIGHBD_CALL_SUFFIX);
}
// Change the HAVE_BOTTOM bit in edges to (sby + 1 != f->sbh || y + stripe_h != row_h)
edges ^= (-(sby + 1 != f->sbh || y + stripe_h != row_h) ^ edges) & LR_HAVE_BOTTOM;
lr_fn(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
&params, edges HIGHBD_CALL_SUFFIX);
left += stripe_h;
y += stripe_h;
if (y + stripe_h > row_h && sbrow_has_bottom) break;
p += stripe_h * PXSTRIDE(p_stride);
edges |= LR_HAVE_TOP;
stripe_h = imin(64 >> ss_ver, row_h - y);
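The sgr[] index expression above, !!sgr_params[0] + !!sgr_params[1] * 2 - 1, maps which passes have a non-zero strength onto the three new entry points, and relies on the AV1 SGR presets never disabling both passes at once. A small check with made-up strength values:

#include <stdio.h>
int main(void) {
    const unsigned s[3][2] = { { 70, 0 }, { 0, 3236 }, { 140, 3236 } };  /* made-up s0/s1 */
    const char *const name[3] = { "sgr_5x5", "sgr_3x3", "sgr_mix" };
    for (int i = 0; i < 3; i++) {
        const int idx = !!s[i][0] + !!s[i][1] * 2 - 1;
        printf("s0=%-4u s1=%-4u -> sgr[%d] (%s)\n", s[i][0], s[i][1], idx, name[idx]);
    }
    return 0;
}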
@ -242,8 +249,7 @@ static void lr_sbrow(const Dav1dFrameContext *const f, pixel *p, const int y,
pixel pre_lr_border[2][128 + 8 /* maximum sbrow height is 128 + 8 rows offset */][4];
const Av1RestorationUnit *lr[2];
enum LrEdgeFlags edges = (y > 0 ? LR_HAVE_TOP : 0) | LR_HAVE_RIGHT |
(row_h < h ? LR_HAVE_BOTTOM : 0);
enum LrEdgeFlags edges = (y > 0 ? LR_HAVE_TOP : 0) | LR_HAVE_RIGHT;
int aligned_unit_pos = row_y & ~(unit_size - 1);
if (aligned_unit_pos && aligned_unit_pos + half_unit_size > h)
@ -281,11 +287,13 @@ void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3],
const int offset_y = 8 * !!sby;
const ptrdiff_t *const dst_stride = f->sr_cur.p.stride;
const int restore_planes = f->lf.restore_planes;
const int not_last = sby + 1 < f->sbh;
if (restore_planes & LR_RESTORE_Y) {
const int h = f->sr_cur.p.p.h;
const int w = f->sr_cur.p.p.w;
const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h);
const int next_row_y = (sby + 1) << (6 + f->seq_hdr->sb128);
const int row_h = imin(next_row_y - 8 * not_last, h);
const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset_y;
lr_sbrow(f, dst[0] - offset_y * PXSTRIDE(dst_stride[0]), y_stripe, w,
h, row_h, 0);
@ -295,10 +303,10 @@ void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3],
const int ss_hor = f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int h = (f->sr_cur.p.p.h + ss_ver) >> ss_ver;
const int w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h);
const int next_row_y = (sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128);
const int row_h = imin(next_row_y - (8 >> ss_ver) * not_last, h);
const int offset_uv = offset_y >> ss_ver;
const int y_stripe =
(sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
const int y_stripe = (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
if (restore_planes & LR_RESTORE_U)
lr_sbrow(f, dst[1] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe,
w, h, row_h, 1);

third_party/dav1d/src/mc_tmpl.c

@ -87,9 +87,15 @@ prep_c(int16_t *tmp, const pixel *src, const ptrdiff_t src_stride,
#define DAV1D_FILTER_8TAP_RND(src, x, F, stride, sh) \
((FILTER_8TAP(src, x, F, stride) + ((1 << (sh)) >> 1)) >> (sh))
#define DAV1D_FILTER_8TAP_RND2(src, x, F, stride, rnd, sh) \
((FILTER_8TAP(src, x, F, stride) + (rnd)) >> (sh))
#define DAV1D_FILTER_8TAP_CLIP(src, x, F, stride, sh) \
iclip_pixel(DAV1D_FILTER_8TAP_RND(src, x, F, stride, sh))
#define DAV1D_FILTER_8TAP_CLIP2(src, x, F, stride, rnd, sh) \
iclip_pixel(DAV1D_FILTER_8TAP_RND2(src, x, F, stride, rnd, sh))
#define GET_H_FILTER(mx) \
const int8_t *const fh = !(mx) ? NULL : w > 4 ? \
dav1d_mc_subpel_filters[filter_type & 3][(mx) - 1] : \
@ -111,7 +117,7 @@ put_8tap_c(pixel *dst, ptrdiff_t dst_stride,
const int filter_type HIGHBD_DECL_SUFFIX)
{
const int intermediate_bits = get_intermediate_bits(bitdepth_max);
const int intermediate_rnd = (1 << intermediate_bits) >> 1;
const int intermediate_rnd = 32 + ((1 << (6 - intermediate_bits)) >> 1);
GET_FILTERS();
dst_stride = PXSTRIDE(dst_stride);
@ -144,9 +150,8 @@ put_8tap_c(pixel *dst, ptrdiff_t dst_stride,
} else {
do {
for (int x = 0; x < w; x++) {
const int px = DAV1D_FILTER_8TAP_RND(src, x, fh, 1,
6 - intermediate_bits);
dst[x] = iclip_pixel((px + intermediate_rnd) >> intermediate_bits);
dst[x] = DAV1D_FILTER_8TAP_CLIP2(src, x, fh, 1,
intermediate_rnd, 6);
}
dst += dst_stride;
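The new intermediate_rnd folds the old two rounding steps (round down to intermediate precision, then round again when shifting back to pixel range) into a single add-and-shift by 6 for the horizontal-only path. A numeric check under the 8 bpc assumption (intermediate_bits == 4):

#include <stdio.h>
int main(void) {
    const int intermediate_bits = 4;                       /* 8 bpc */
    const int intermediate_rnd = 32 + ((1 << (6 - intermediate_bits)) >> 1);
    int mismatches = 0;
    for (int f = 0; f < (1 << 14); f++) {                  /* f stands in for FILTER_8TAP() */
        const int px      = (f + ((1 << (6 - intermediate_bits)) >> 1)) >> (6 - intermediate_bits);
        const int old_out = (px + ((1 << intermediate_bits) >> 1)) >> intermediate_bits;
        const int new_out = (f + intermediate_rnd) >> 6;
        mismatches += old_out != new_out;
    }
    printf("intermediate_rnd = %d, mismatches = %d\n", intermediate_rnd, mismatches);
    return 0;
}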

third_party/dav1d/src/meson.build

@ -132,6 +132,8 @@ if is_asm_enabled
endif
elif host_machine.cpu_family().startswith('arm')
libdav1d_sources_asm = files(
# itx.S is used for both 8 and 16 bpc.
'arm/32/itx.S',
'arm/32/looprestoration_common.S',
'arm/32/msac.S',
)
@ -140,7 +142,6 @@ if is_asm_enabled
libdav1d_sources_asm += files(
'arm/32/cdef.S',
'arm/32/ipred.S',
'arm/32/itx.S',
'arm/32/loopfilter.S',
'arm/32/looprestoration.S',
'arm/32/mc.S',
@ -150,6 +151,8 @@ if is_asm_enabled
if dav1d_bitdepths.contains('16')
libdav1d_sources_asm += files(
'arm/32/cdef16.S',
'arm/32/ipred16.S',
'arm/32/itx16.S',
'arm/32/loopfilter16.S',
'arm/32/looprestoration16.S',
'arm/32/mc16.S',
@ -183,20 +186,20 @@ if is_asm_enabled
libdav1d_sources_asm = files(
'x86/cpuid.asm',
'x86/msac.asm',
'x86/cdef_avx2.asm',
'x86/cdef_sse.asm',
)
if dav1d_bitdepths.contains('8')
libdav1d_sources_asm += files(
'x86/cdef_avx512.asm',
'x86/mc_avx512.asm',
'x86/cdef_avx2.asm',
'x86/mc_avx2.asm',
'x86/film_grain.asm',
'x86/ipred.asm',
'x86/itx.asm',
'x86/loopfilter.asm',
'x86/looprestoration.asm',
'x86/cdef_sse.asm',
'x86/film_grain_ssse3.asm',
'x86/ipred_ssse3.asm',
'x86/itx_ssse3.asm',
@ -208,6 +211,9 @@ if is_asm_enabled
if dav1d_bitdepths.contains('16')
libdav1d_sources_asm += files(
'x86/cdef16_avx2.asm',
'x86/cdef16_sse.asm',
'x86/looprestoration16_avx2.asm',
)
endif

third_party/dav1d/src/obu.c

@ -1,5 +1,5 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018-2021, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
@ -33,6 +33,7 @@
#include "dav1d/data.h"
#include "common/frame.h"
#include "common/intops.h"
#include "src/decode.h"
@ -406,7 +407,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
else
hdr->force_integer_mv = 0;
if (!(hdr->frame_type & 1))
if (IS_KEY_OR_INTRA(hdr))
hdr->force_integer_mv = 1;
if (seqhdr->frame_id_numbers_present)
@ -420,7 +421,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
#endif
hdr->frame_offset = seqhdr->order_hint ?
dav1d_get_bits(gb, seqhdr->order_hint_n_bits) : 0;
hdr->primary_ref_frame = !hdr->error_resilient_mode && hdr->frame_type & 1 ?
hdr->primary_ref_frame = !hdr->error_resilient_mode && IS_INTER_OR_SWITCH(hdr) ?
dav1d_get_bits(gb, 3) : DAV1D_PRIMARY_REF_NONE;
if (seqhdr->decoder_model_info_present) {
@ -439,9 +440,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
}
}
if (hdr->frame_type == DAV1D_FRAME_TYPE_KEY ||
hdr->frame_type == DAV1D_FRAME_TYPE_INTRA)
{
if (IS_KEY_OR_INTRA(hdr)) {
hdr->refresh_frame_flags = (hdr->frame_type == DAV1D_FRAME_TYPE_KEY &&
hdr->show_frame) ? 0xff : dav1d_get_bits(gb, 8);
if (hdr->refresh_frame_flags != 0xff && hdr->error_resilient_mode && seqhdr->order_hint)
@ -569,7 +568,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
hdr->switchable_motion_mode = dav1d_get_bits(gb, 1);
hdr->use_ref_frame_mvs = !hdr->error_resilient_mode &&
seqhdr->ref_frame_mvs && seqhdr->order_hint &&
hdr->frame_type & 1 && dav1d_get_bits(gb, 1);
IS_INTER_OR_SWITCH(hdr) && dav1d_get_bits(gb, 1);
}
#if DEBUG_FRAME_HDR
printf("HDR: post-frametype-specific-bits: off=%td\n",
@ -916,13 +915,13 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
printf("HDR: post-txfmmode: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
hdr->switchable_comp_refs = hdr->frame_type & 1 ? dav1d_get_bits(gb, 1) : 0;
hdr->switchable_comp_refs = IS_INTER_OR_SWITCH(hdr) ? dav1d_get_bits(gb, 1) : 0;
#if DEBUG_FRAME_HDR
printf("HDR: post-refmode: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
hdr->skip_mode_allowed = 0;
if (hdr->switchable_comp_refs && hdr->frame_type & 1 && seqhdr->order_hint) {
if (hdr->switchable_comp_refs && IS_INTER_OR_SWITCH(hdr) && seqhdr->order_hint) {
const unsigned poc = hdr->frame_offset;
unsigned off_before = 0xFFFFFFFFU;
int off_after = -1;
@ -982,7 +981,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
printf("HDR: post-extskip: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
hdr->warp_motion = !hdr->error_resilient_mode && hdr->frame_type & 1 &&
hdr->warp_motion = !hdr->error_resilient_mode && IS_INTER_OR_SWITCH(hdr) &&
seqhdr->warped_motion && dav1d_get_bits(gb, 1);
#if DEBUG_FRAME_HDR
printf("HDR: post-warpmotionbit: off=%td\n",
@ -997,7 +996,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
for (int i = 0; i < 7; i++)
hdr->gmv[i] = dav1d_default_wm_params;
if (hdr->frame_type & 1) {
if (IS_INTER_OR_SWITCH(hdr)) {
for (int i = 0; i < 7; i++) {
hdr->gmv[i].type = !dav1d_get_bits(gb, 1) ? DAV1D_WM_TYPE_IDENTITY :
dav1d_get_bits(gb, 1) ? DAV1D_WM_TYPE_ROT_ZOOM :


@ -299,7 +299,6 @@ static inline void padding(uint8_t *dst, const uint8_t *p,
}
}
// FIXME Could split into luma and chroma specific functions,
// (since first and last tops are always 0 for chroma)
// FIXME Could implement a version that requires less temporary memory
@ -309,9 +308,11 @@ static void wiener_filter_vsx(uint8_t *p, const ptrdiff_t p_stride,
const uint8_t *lpf,
const ptrdiff_t lpf_stride,
const int w, const int h,
const int16_t filter[2][8],
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
const int16_t (*const filter)[8] = params->filter;
// Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
// of padding above and below
ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
@ -320,7 +321,6 @@ static void wiener_filter_vsx(uint8_t *p, const ptrdiff_t p_stride,
wiener_filter_h_vsx(hor, tmp, filter[0], w, h);
wiener_filter_v_vsx(p, p_stride, hor, filter[1], w, h);
}
#endif

third_party/dav1d/src/qm.c

@ -3066,7 +3066,6 @@ static const uint8_t qm_tbl_32x32_t[][2][528] = {
};
const uint8_t *dav1d_qm_tbl[16][2][N_RECT_TX_SIZES];
static uint8_t pb_32x32[32 * 32];
static uint8_t qm_tbl_4x4[15][2][16];
static uint8_t qm_tbl_4x8[15][2][32];
static uint8_t qm_tbl_4x16[15][2][64];
@ -3145,8 +3144,5 @@ COLD void dav1d_init_qm_tables(void) {
dav1d_qm_tbl[i][j][RTX_16X64] = dav1d_qm_tbl[i][j][RTX_16X32];
}
memset(pb_32x32, 32, sizeof(pb_32x32));
for (int j = 0; j < 2; j++)
for (int k = 0; k < N_RECT_TX_SIZES; k++)
dav1d_qm_tbl[15][j][k] = pb_32x32;
// dav1d_qm_tbl[15][*][*] == NULL
}

third_party/dav1d/src/recon.h

@ -65,6 +65,14 @@ decl_recon_b_inter_fn(dav1d_recon_b_inter_16bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_8bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_16bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_8bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_16bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_cdef_8bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_cdef_16bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_resize_8bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_resize_16bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_lr_8bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_lr_16bpc);
decl_backup_ipred_edge_fn(dav1d_backup_ipred_edge_8bpc);
decl_backup_ipred_edge_fn(dav1d_backup_ipred_edge_16bpc);

third_party/dav1d/src/recon_tmpl.c

@ -1,5 +1,5 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018-2021, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
@ -33,6 +33,7 @@
#include "common/attributes.h"
#include "common/bitdepth.h"
#include "common/dump.h"
#include "common/frame.h"
#include "common/intops.h"
#include "src/cdef_apply.h"
@ -438,34 +439,39 @@ static int decode_coefs(Dav1dTileContext *const t,
} else {
eob = eob_bin;
}
assert(eob >= 0);
// base tokens
uint16_t (*const eob_cdf)[4] = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma];
uint16_t (*const hi_cdf)[4] = ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma];
const uint16_t *const scan = dav1d_scans[tx][tx_class];
int dc_tok;
unsigned rc, dc_tok;
if (eob) {
uint16_t (*const lo_cdf)[4] = ts->cdf.coef.base_tok[t_dim->ctx][chroma];
uint8_t *const levels = t->scratch.levels; // bits 0-5: tok, 6-7: lo_tok
const int sw = imin(t_dim->w, 8), sh = imin(t_dim->h, 8);
const unsigned shift = 2 + imin(t_dim->lh, 3), mask = 4 * sh - 1;
/* eob */
unsigned rc = scan[eob], x = rc >> shift, y = rc & mask;
unsigned ctx = 1 + (eob > sw * sh * 2) + (eob > sw * sh * 4);
int eob_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[ctx], 2);
int tok = eob_tok + 1;
int level_tok = tok * 0x41;
unsigned mag;
if (dbg)
printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n",
t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng);
#define DECODE_COEFS_CLASS(tx_class) \
unsigned x, y; \
if (tx_class == TX_CLASS_2D) \
rc = scan[eob], x = rc >> shift, y = rc & mask; \
else if (tx_class == TX_CLASS_H) \
/* Transposing reduces the stride and padding requirements */ \
x = eob & mask, y = eob >> shift, rc = eob; \
else /* tx_class == TX_CLASS_V */ \
x = eob & mask, y = eob >> shift, rc = (x << shift2) | y; \
if (dbg) \
printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng); \
if (eob_tok == 2) { \
ctx = (tx_class == TX_CLASS_2D ? (x | y) > 1 : \
tx_class == TX_CLASS_H ? x != 0 : y != 0) ? 14 : 7; \
ctx = (tx_class == TX_CLASS_2D ? (x | y) > 1 : y != 0) ? 14 : 7; \
tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
level_tok = tok + (3 << 6); \
if (dbg) \
@ -473,40 +479,46 @@ static int decode_coefs(Dav1dTileContext *const t,
imin(t_dim->ctx, 3), chroma, ctx, eob, rc, tok, \
ts->msac.rng); \
} \
cf[rc] = tok; \
if (tx_class == TX_CLASS_H) \
/* Transposing reduces the stride and padding requirements */ \
levels[y * stride + x] = (uint8_t) level_tok; \
else \
levels[x * stride + y] = (uint8_t) level_tok; \
cf[rc] = tok << 11; \
levels[x * stride + y] = (uint8_t) level_tok; \
for (int i = eob - 1; i > 0; i--) { /* ac */ \
if (tx_class == TX_CLASS_H) \
rc = i, x = rc & mask, y = rc >> shift; \
else \
rc = scan[i], x = rc >> shift, y = rc & mask; \
unsigned rc_i; \
if (tx_class == TX_CLASS_2D) \
rc_i = scan[i], x = rc_i >> shift, y = rc_i & mask; \
else if (tx_class == TX_CLASS_H) \
x = i & mask, y = i >> shift, rc_i = i; \
else /* tx_class == TX_CLASS_V */ \
x = i & mask, y = i >> shift, rc_i = (x << shift2) | y; \
assert(x < 32 && y < 32); \
uint8_t *const level = levels + x * stride + y; \
ctx = get_lo_ctx(level, tx_class, &mag, lo_ctx_offsets, x, y, stride); \
if (tx_class == TX_CLASS_2D) \
y |= x; \
tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
level_tok = tok * 0x41; \
if (dbg) \
printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
t_dim->ctx, chroma, ctx, i, rc, tok, ts->msac.rng); \
t_dim->ctx, chroma, ctx, i, rc_i, tok, ts->msac.rng); \
if (tok == 3) { \
mag &= 63; \
ctx = (y > (tx_class == TX_CLASS_2D) ? 14 : 7) + \
(mag > 12 ? 6 : (mag + 1) >> 1); \
tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
level_tok = tok + (3 << 6); \
if (dbg) \
printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
imin(t_dim->ctx, 3), chroma, ctx, i, rc, tok, \
imin(t_dim->ctx, 3), chroma, ctx, i, rc_i, tok, \
ts->msac.rng); \
*level = (uint8_t) (tok + (3 << 6)); \
cf[rc_i] = (tok << 11) | rc; \
rc = rc_i; \
} else { \
/* 0x1 for tok, 0x7ff as bitmask for rc, 0x41 for level_tok */ \
tok *= 0x17ff41; \
*level = (uint8_t) tok; \
/* tok ? (tok << 11) | rc : 0 */ \
tok = (tok >> 9) & (rc + ~0x7ffu); \
if (tok) rc = rc_i; \
cf[rc_i] = tok; \
} \
cf[rc] = tok; \
*level = (uint8_t) level_tok; \
} \
/* dc */ \
ctx = (tx_class == TX_CLASS_2D) ? 0 : \
@ -528,27 +540,35 @@ static int decode_coefs(Dav1dTileContext *const t,
} \
break
const uint16_t *scan;
switch (tx_class) {
case TX_CLASS_2D: {
const unsigned nonsquare_tx = tx >= RTX_4X8;
const uint8_t (*const lo_ctx_offsets)[5] =
dav1d_lo_ctx_offsets[nonsquare_tx + (tx & nonsquare_tx)];
scan = dav1d_scans[tx];
const ptrdiff_t stride = 4 * sh;
const unsigned shift = t_dim->lh < 4 ? t_dim->lh + 2 : 5, shift2 = 0;
const unsigned mask = 4 * sh - 1;
memset(levels, 0, stride * (4 * sw + 2));
DECODE_COEFS_CLASS(TX_CLASS_2D);
}
case TX_CLASS_H: {
#define lo_ctx_offsets NULL
const uint8_t (*const lo_ctx_offsets)[5] = NULL;
const ptrdiff_t stride = 16;
const unsigned shift = t_dim->lh + 2, shift2 = 0;
const unsigned mask = 4 * sh - 1;
memset(levels, 0, stride * (4 * sh + 2));
DECODE_COEFS_CLASS(TX_CLASS_H);
}
case TX_CLASS_V: {
const uint8_t (*const lo_ctx_offsets)[5] = NULL;
const ptrdiff_t stride = 16;
const unsigned shift = t_dim->lw + 2, shift2 = t_dim->lh + 2;
const unsigned mask = 4 * sw - 1;
memset(levels, 0, stride * (4 * sw + 2));
DECODE_COEFS_CLASS(TX_CLASS_V);
}
#undef lo_ctx_offsets
#undef DECODE_COEFS_CLASS
default: assert(0);
}
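During this first pass each nonzero coefficient slot now carries the token in its high bits and, in the low bits, the scan position of the next nonzero coefficient, so the dequant loop further down can hop from one nonzero entry to the next instead of re-walking the scan. A self-contained toy version of that packing (simplified masks, made-up positions and tokens):

#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint32_t cf[32] = { 0 };
    /* hypothetical nonzero AC coefficients, visited from the eob backwards */
    const unsigned pos[4] = { 17, 9, 5, 1 };
    const unsigned tok[4] = {  1, 3, 2, 4 };
    unsigned rc = 0;                       /* link carried over from the previous write */
    for (int i = 0; i < 4; i++) {
        cf[pos[i]] = (tok[i] << 11) | rc;  /* token in the high bits, link below */
        rc = pos[i];
    }
    /* dequant-style walk: follow the links through the nonzero positions only */
    while (rc) {
        const uint32_t packed = cf[rc];
        printf("rc=%2u tok=%u\n", rc, (unsigned)(packed >> 11));
        rc = packed & 0x3ff;               /* 0 terminates the chain */
    }
    return 0;
}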
@ -564,71 +584,137 @@ static int decode_coefs(Dav1dTileContext *const t,
printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n",
imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng);
}
rc = 0;
}
// residual and sign
int dc_sign = 1 << 6;
const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
const uint8_t *const qm_tbl = f->qm[lossless || is_1d || *txtp == IDTX][tx][plane];
const uint8_t *const qm_tbl = *txtp < IDTX ? f->qm[tx][plane] : NULL;
const int dq_shift = imax(0, t_dim->ctx - 2);
const int bitdepth = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
const int cf_max = (1 << (7 + bitdepth)) - 1;
unsigned cul_level = 0;
const unsigned cf_max = ~(~127U << (BITDEPTH == 8 ? 8 : f->cur.p.bpc));
unsigned cul_level, dc_sign_level;
if (dc_tok) { // dc
const int dc_sign_ctx = get_dc_sign_ctx(tx, a, l);
uint16_t *const dc_sign_cdf =
ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
const unsigned dq = (dq_tbl[0] * qm_tbl[0] + 16) >> 5;
if (dbg)
printf("Post-dc_sign[%d][%d][%d]: r=%d\n",
chroma, dc_sign_ctx, sign, ts->msac.rng);
dc_sign = (sign - 1) & (2 << 6);
if (!dc_tok) {
cul_level = 0;
dc_sign_level = 1 << 6;
if (qm_tbl) goto ac_qm;
goto ac_noqm;
}
const int dc_sign_ctx = get_dc_sign_ctx(tx, a, l);
uint16_t *const dc_sign_cdf = ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
const int dc_sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
if (dbg)
printf("Post-dc_sign[%d][%d][%d]: r=%d\n",
chroma, dc_sign_ctx, dc_sign, ts->msac.rng);
unsigned dc_dq = dq_tbl[0];
dc_sign_level = (dc_sign - 1) & (2 << 6);
if (qm_tbl) {
dc_dq = (dc_dq * qm_tbl[0] + 16) >> 5;
if (dc_tok == 15) {
dc_tok += read_golomb(&ts->msac);
dc_tok = read_golomb(&ts->msac) + 15;
if (dbg)
printf("Post-dc_residual[%d->%d]: r=%d\n",
dc_tok - 15, dc_tok, ts->msac.rng);
dc_tok &= 0xfffff;
dc_dq = (dc_dq * dc_tok) & 0xffffff;
} else {
dc_dq *= dc_tok;
assert(dc_dq <= 0xffffff);
}
cul_level = dc_tok;
dc_dq >>= dq_shift;
cf[0] = (coef) (umin(dc_dq - dc_sign, cf_max) ^ -dc_sign);
cul_level += dc_tok;
dc_tok = ((dq * dc_tok) & 0xffffff) >> dq_shift;
cf[0] = imin(dc_tok - sign, cf_max) ^ -sign;
}
for (int i = 1; i <= eob; i++) { // ac
const int rc = scan[i];
int tok = cf[rc];
if (!tok) continue;
if (rc) ac_qm: {
const unsigned ac_dq = dq_tbl[1];
do {
const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
if (dbg)
printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
const unsigned rc_tok = cf[rc];
unsigned tok, dq = (ac_dq * qm_tbl[rc] + 16) >> 5;
// sign
const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
const unsigned dq = (dq_tbl[1] * qm_tbl[rc] + 16) >> 5;
if (dbg)
printf("Post-sign[%d=%d=%d]: r=%d\n", i, rc, sign, ts->msac.rng);
if (rc_tok >= (15 << 11)) {
tok = read_golomb(&ts->msac) + 15;
if (dbg)
printf("Post-residual[%d=%d->%d]: r=%d\n",
rc, tok - 15, tok, ts->msac.rng);
// residual
if (tok == 15) {
tok += read_golomb(&ts->msac);
tok &= 0xfffff;
dq = (dq * tok) & 0xffffff;
} else {
tok = rc_tok >> 11;
dq *= tok;
assert(dq <= 0xffffff);
}
cul_level += tok;
dq >>= dq_shift;
cf[rc] = (coef) (umin(dq - sign, cf_max) ^ -sign);
rc = rc_tok & 0x3ff;
} while (rc);
}
} else {
// non-qmatrix is the common case and allows for additional optimizations
if (dc_tok == 15) {
dc_tok = read_golomb(&ts->msac) + 15;
if (dbg)
printf("Post-residual[%d=%d=%d->%d]: r=%d\n",
i, rc, tok - 15, tok, ts->msac.rng);
printf("Post-dc_residual[%d->%d]: r=%d\n",
dc_tok - 15, dc_tok, ts->msac.rng);
// coefficient parsing, see 5.11.39
tok &= 0xfffff;
dc_tok &= 0xfffff;
dc_dq = ((dc_dq * dc_tok) & 0xffffff) >> dq_shift;
dc_dq = umin(dc_dq - dc_sign, cf_max);
} else {
dc_dq = ((dc_dq * dc_tok) >> dq_shift) - dc_sign;
assert(dc_dq <= cf_max);
}
cul_level = dc_tok;
cf[0] = (coef) (dc_dq ^ -dc_sign);
// dequant, see 7.12.3
cul_level += tok;
tok = ((dq * tok) & 0xffffff) >> dq_shift;
cf[rc] = imin(tok - sign, cf_max) ^ -sign;
if (rc) ac_noqm: {
const unsigned ac_dq = dq_tbl[1];
do {
const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
if (dbg)
printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
const unsigned rc_tok = cf[rc];
unsigned tok, dq;
// residual
if (rc_tok >= (15 << 11)) {
tok = read_golomb(&ts->msac) + 15;
if (dbg)
printf("Post-residual[%d=%d->%d]: r=%d\n",
rc, tok - 15, tok, ts->msac.rng);
// coefficient parsing, see 5.11.39
tok &= 0xfffff;
// dequant, see 7.12.3
dq = ((ac_dq * tok) & 0xffffff) >> dq_shift;
dq = umin(dq - sign, cf_max);
} else {
// cannot exceed cf_max, so we can avoid the clipping
tok = rc_tok >> 11;
dq = ((ac_dq * tok) >> dq_shift) - sign;
assert(dq <= cf_max);
}
cul_level += tok;
cf[rc] = (coef) (dq ^ -sign);
rc = rc_tok & 0x3ff; // next non-zero rc, zero if eob
} while (rc);
}
}
// context
*res_ctx = umin(cul_level, 63) | dc_sign;
*res_ctx = umin(cul_level, 63) | dc_sign_level;
return eob;
}
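A quick check (illustration only) that the bit-trick form of cf_max above matches the old (1 << (7 + bitdepth)) - 1 for the supported bitdepths:

#include <stdio.h>
int main(void) {
    for (int bitdepth = 8; bitdepth <= 12; bitdepth += 2) {
        const unsigned a = ~(~127U << bitdepth);
        const unsigned b = (1u << (7 + bitdepth)) - 1;
        printf("bitdepth %2d: %6u %s %6u\n", bitdepth, a, a == b ? "==" : "!=", b);
    }
    return 0;
}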
@ -1544,7 +1630,7 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
const ptrdiff_t uvdstoff =
4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
if (!(f->frame_hdr->frame_type & 1)) {
if (IS_KEY_OR_INTRA(f->frame_hdr)) {
// intrabc
assert(!f->frame_hdr->super_res.enabled);
res = mc(t, dst, NULL, f->cur.stride[0], bw4, bh4, t->bx, t->by, 0,
@ -1965,74 +2051,107 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
return 0;
}
void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
const int sbsz = f->sb_step, sbh = f->sbh;
if (f->frame_hdr->loopfilter.level_y[0] ||
f->frame_hdr->loopfilter.level_y[1])
{
void bytefn(dav1d_filter_sbrow_deblock)(Dav1dFrameContext*const f, const int sby) {
const int y = sby * f->sb_step * 4;
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
pixel *const p[3] = {
f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
};
Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
if (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1]) {
int start_of_tile_row = 0;
if (f->frame_hdr->tiling.row_start_sb[f->lf.tile_row] == sby)
start_of_tile_row = f->lf.tile_row++;
bytefn(dav1d_loopfilter_sbrow)(f, f->lf.p, f->lf.mask_ptr, sby,
start_of_tile_row);
bytefn(dav1d_loopfilter_sbrow)(f, p, mask, sby, start_of_tile_row);
}
if (f->lf.restore_planes) {
// Store loop filtered pixels required by loop restoration
bytefn(dav1d_lr_copy_lpf)(f, f->lf.p, sby);
}
if (f->seq_hdr->cdef) {
if (sby) {
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
pixel *p_up[3] = {
f->lf.p[0] - 8 * PXSTRIDE(f->cur.stride[0]),
f->lf.p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
f->lf.p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
};
bytefn(dav1d_cdef_brow)(f, p_up, f->lf.prev_mask_ptr,
sby * sbsz - 2, sby * sbsz);
}
const int n_blks = sbsz - 2 * (sby + 1 < sbh);
bytefn(dav1d_cdef_brow)(f, f->lf.p, f->lf.mask_ptr, sby * sbsz,
imin(sby * sbsz + n_blks, f->bh));
}
if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) {
const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int h_start = 8 * !!sby >> ss_ver;
const ptrdiff_t dst_stride = f->sr_cur.p.stride[!!pl];
pixel *dst = f->lf.sr_p[pl] - h_start * PXSTRIDE(dst_stride);
const ptrdiff_t src_stride = f->cur.stride[!!pl];
const pixel *src = f->lf.p[pl] - h_start * PXSTRIDE(src_stride);
const int h_end = 4 * (sbsz - 2 * (sby + 1 < sbh)) >> ss_ver;
const int ss_hor = pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
const int src_w = (4 * f->bw + ss_hor) >> ss_hor;
const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver;
f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w,
imin(img_h, h_end) + h_start, src_w,
f->resize_step[!!pl], f->resize_start[!!pl]
HIGHBD_CALL_SUFFIX);
}
}
if (f->lf.restore_planes) {
bytefn(dav1d_lr_sbrow)(f, f->lf.sr_p, sby);
bytefn(dav1d_lr_copy_lpf)(f, p, sby);
}
}
void bytefn(dav1d_filter_sbrow_cdef)(Dav1dFrameContext *const f, const int sby) {
const int sbsz = f->sb_step;
const int y = sby * sbsz * 4;
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
f->lf.p[0] += sbsz * 4 * PXSTRIDE(f->cur.stride[0]);
f->lf.p[1] += sbsz * 4 * PXSTRIDE(f->cur.stride[1]) >> ss_ver;
f->lf.p[2] += sbsz * 4 * PXSTRIDE(f->cur.stride[1]) >> ss_ver;
f->lf.sr_p[0] += sbsz * 4 * PXSTRIDE(f->sr_cur.p.stride[0]);
f->lf.sr_p[1] += sbsz * 4 * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver;
f->lf.sr_p[2] += sbsz * 4 * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver;
f->lf.prev_mask_ptr = f->lf.mask_ptr;
if ((sby & 1) || f->seq_hdr->sb128) {
f->lf.mask_ptr += f->sb128w;
pixel *const p[3] = {
f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
};
Av1Filter *prev_mask = f->lf.mask + ((sby - 1) >> !f->seq_hdr->sb128) * f->sb128w;
Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
const int start = sby * sbsz;
if (sby) {
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
pixel *p_up[3] = {
p[0] - 8 * PXSTRIDE(f->cur.stride[0]),
p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
};
bytefn(dav1d_cdef_brow)(f, p_up, prev_mask, start - 2, start);
}
const int n_blks = sbsz - 2 * (sby + 1 < f->sbh);
const int end = imin(start + n_blks, f->bh);
bytefn(dav1d_cdef_brow)(f, p, mask, start, end);
}
void bytefn(dav1d_filter_sbrow_resize)(Dav1dFrameContext *const f, const int sby) {
const int sbsz = f->sb_step;
const int y = sby * sbsz * 4;
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const pixel *const p[3] = {
f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
};
pixel *const sr_p[3] = {
f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
};
const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) {
const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int h_start = 8 * !!sby >> ss_ver;
const ptrdiff_t dst_stride = f->sr_cur.p.stride[!!pl];
pixel *dst = sr_p[pl] - h_start * PXSTRIDE(dst_stride);
const ptrdiff_t src_stride = f->cur.stride[!!pl];
const pixel *src = p[pl] - h_start * PXSTRIDE(src_stride);
const int h_end = 4 * (sbsz - 2 * (sby + 1 < f->sbh)) >> ss_ver;
const int ss_hor = pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
const int src_w = (4 * f->bw + ss_hor) >> ss_hor;
const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver;
f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w,
imin(img_h, h_end) + h_start, src_w,
f->resize_step[!!pl], f->resize_start[!!pl]
HIGHBD_CALL_SUFFIX);
}
}
void bytefn(dav1d_filter_sbrow_lr)(Dav1dFrameContext *const f, const int sby) {
const int y = sby * f->sb_step * 4;
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
pixel *const sr_p[3] = {
f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
};
bytefn(dav1d_lr_sbrow)(f, sr_p, sby);
}
void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
bytefn(dav1d_filter_sbrow_deblock)(f, sby);
if (f->seq_hdr->cdef)
bytefn(dav1d_filter_sbrow_cdef)(f, sby);
if (f->frame_hdr->width[0] != f->frame_hdr->width[1])
bytefn(dav1d_filter_sbrow_resize)(f, sby);
if (f->lf.restore_planes)
bytefn(dav1d_filter_sbrow_lr)(f, sby);
}
void bytefn(dav1d_backup_ipred_edge)(Dav1dTileContext *const t) {
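(Not part of the patch, added for orientation.) The monolithic sbrow filter above is now a thin wrapper over four per-stage entry points, which is what lets the new post-filter thread in thread_task.c schedule deblock/cdef/resize/loop-restoration as separate tasks. In the single-threaded path the pipeline is still driven one superblock row at a time, roughly:

    /* Illustrative sketch only; mirrors how the wrapper above is used.
     * f->bd_fn.filter_sbrow is the bitdepth-dispatched pointer that the
     * task-creation code later in this patch also references. */
    for (int sby = 0; sby < f->sbh; sby++)
        f->bd_fn.filter_sbrow(f, sby);   /* deblock -> cdef -> resize -> lr */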

third_party/dav1d/src/refmvs.c

@ -51,12 +51,13 @@ static void add_spatial_candidate(refmvs_candidate *const mvstack, int *const cn
const mv cand_mv = ((b->mf & 1) && gmv[0].n != INVALID_MV) ?
gmv[0] : b->mv.mv[n];
*have_refmv_match = 1;
*have_newmv_match |= b->mf >> 1;
const int last = *cnt;
for (int m = 0; m < last; m++)
if (mvstack[m].mv.mv[0].n == cand_mv.n) {
mvstack[m].weight += weight;
*have_refmv_match = 1;
*have_newmv_match |= b->mf >> 1;
return;
}
@ -65,8 +66,6 @@ static void add_spatial_candidate(refmvs_candidate *const mvstack, int *const cn
mvstack[last].weight = weight;
*cnt = last + 1;
}
*have_refmv_match = 1;
*have_newmv_match |= b->mf >> 1;
return;
}
}
@ -76,12 +75,13 @@ static void add_spatial_candidate(refmvs_candidate *const mvstack, int *const cn
[1] = ((b->mf & 1) && gmv[1].n != INVALID_MV) ? gmv[1] : b->mv.mv[1],
}};
*have_refmv_match = 1;
*have_newmv_match |= b->mf >> 1;
const int last = *cnt;
for (int n = 0; n < last; n++)
if (mvstack[n].mv.n == cand_mv.n) {
mvstack[n].weight += weight;
*have_refmv_match = 1;
*have_newmv_match |= b->mf >> 1;
return;
}
@ -90,8 +90,6 @@ static void add_spatial_candidate(refmvs_candidate *const mvstack, int *const cn
mvstack[last].weight = weight;
*cnt = last + 1;
}
*have_refmv_match = 1;
*have_newmv_match |= b->mf >> 1;
}
}

third_party/dav1d/src/scan.c

@ -30,19 +30,14 @@
#include "common/attributes.h"
#include "src/scan.h"
static const uint16_t ALIGN(av1_default_scan_4x4[], 32) = {
static const uint16_t ALIGN(scan_4x4[], 32) = {
0, 4, 1, 2,
5, 8, 12, 9,
6, 3, 7, 10,
13, 14, 11, 15,
};
static const uint16_t ALIGN(av1_mrow_scan_4x4[], 32) = {
0, 4, 8, 12,
1, 5, 9, 13,
2, 6, 10, 14,
3, 7, 11, 15,
};
static const uint16_t ALIGN(av1_default_scan_4x8[], 32) = {
static const uint16_t ALIGN(scan_4x8[], 32) = {
0, 8, 1, 16,
9, 2, 24, 17,
10, 3, 25, 18,
@ -52,17 +47,8 @@ static const uint16_t ALIGN(av1_default_scan_4x8[], 32) = {
14, 7, 29, 22,
15, 30, 23, 31,
};
static const uint16_t ALIGN(av1_mrow_scan_4x8[], 32) = {
0, 8, 16, 24,
1, 9, 17, 25,
2, 10, 18, 26,
3, 11, 19, 27,
4, 12, 20, 28,
5, 13, 21, 29,
6, 14, 22, 30,
7, 15, 23, 31,
};
static const uint16_t ALIGN(av1_default_scan_4x16[], 32) = {
static const uint16_t ALIGN(scan_4x16[], 32) = {
0, 16, 1, 32,
17, 2, 48, 33,
18, 3, 49, 34,
@ -80,37 +66,15 @@ static const uint16_t ALIGN(av1_default_scan_4x16[], 32) = {
30, 15, 61, 46,
31, 62, 47, 63,
};
static const uint16_t ALIGN(av1_mrow_scan_4x16[], 32) = {
0, 16, 32, 48,
1, 17, 33, 49,
2, 18, 34, 50,
3, 19, 35, 51,
4, 20, 36, 52,
5, 21, 37, 53,
6, 22, 38, 54,
7, 23, 39, 55,
8, 24, 40, 56,
9, 25, 41, 57,
10, 26, 42, 58,
11, 27, 43, 59,
12, 28, 44, 60,
13, 29, 45, 61,
14, 30, 46, 62,
15, 31, 47, 63,
};
static const uint16_t ALIGN(av1_default_scan_8x4[], 32) = {
static const uint16_t ALIGN(scan_8x4[], 32) = {
0, 1, 4, 2, 5, 8, 3, 6,
9, 12, 7, 10, 13, 16, 11, 14,
17, 20, 15, 18, 21, 24, 19, 22,
25, 28, 23, 26, 29, 27, 30, 31,
};
static const uint16_t ALIGN(av1_mrow_scan_8x4[], 32) = {
0, 4, 8, 12, 16, 20, 24, 28,
1, 5, 9, 13, 17, 21, 25, 29,
2, 6, 10, 14, 18, 22, 26, 30,
3, 7, 11, 15, 19, 23, 27, 31,
};
static const uint16_t ALIGN(av1_default_scan_8x8[], 32) = {
static const uint16_t ALIGN(scan_8x8[], 32) = {
0, 8, 1, 2, 9, 16, 24, 17,
10, 3, 4, 11, 18, 25, 32, 40,
33, 26, 19, 12, 5, 6, 13, 20,
@ -120,17 +84,8 @@ static const uint16_t ALIGN(av1_default_scan_8x8[], 32) = {
23, 31, 38, 45, 52, 59, 60, 53,
46, 39, 47, 54, 61, 62, 55, 63,
};
static const uint16_t ALIGN(av1_mrow_scan_8x8[], 32) = {
0, 8, 16, 24, 32, 40, 48, 56,
1, 9, 17, 25, 33, 41, 49, 57,
2, 10, 18, 26, 34, 42, 50, 58,
3, 11, 19, 27, 35, 43, 51, 59,
4, 12, 20, 28, 36, 44, 52, 60,
5, 13, 21, 29, 37, 45, 53, 61,
6, 14, 22, 30, 38, 46, 54, 62,
7, 15, 23, 31, 39, 47, 55, 63,
};
static const uint16_t ALIGN(av1_default_scan_8x16[], 32) = {
static const uint16_t ALIGN(scan_8x16[], 32) = {
0, 16, 1, 32, 17, 2, 48, 33,
18, 3, 64, 49, 34, 19, 4, 80,
65, 50, 35, 20, 5, 96, 81, 66,
@ -148,25 +103,8 @@ static const uint16_t ALIGN(av1_default_scan_8x16[], 32) = {
47, 123, 108, 93, 78, 63, 124, 109,
94, 79, 125, 110, 95, 126, 111, 127,
};
static const uint16_t ALIGN(av1_mrow_scan_8x16[], 32) = {
0, 16, 32, 48, 64, 80, 96, 112,
1, 17, 33, 49, 65, 81, 97, 113,
2, 18, 34, 50, 66, 82, 98, 114,
3, 19, 35, 51, 67, 83, 99, 115,
4, 20, 36, 52, 68, 84, 100, 116,
5, 21, 37, 53, 69, 85, 101, 117,
6, 22, 38, 54, 70, 86, 102, 118,
7, 23, 39, 55, 71, 87, 103, 119,
8, 24, 40, 56, 72, 88, 104, 120,
9, 25, 41, 57, 73, 89, 105, 121,
10, 26, 42, 58, 74, 90, 106, 122,
11, 27, 43, 59, 75, 91, 107, 123,
12, 28, 44, 60, 76, 92, 108, 124,
13, 29, 45, 61, 77, 93, 109, 125,
14, 30, 46, 62, 78, 94, 110, 126,
15, 31, 47, 63, 79, 95, 111, 127,
};
static const uint16_t ALIGN(av1_default_scan_8x32[], 32) = {
static const uint16_t ALIGN(scan_8x32[], 32) = {
0, 32, 1, 64, 33, 2, 96, 65,
34, 3, 128, 97, 66, 35, 4, 160,
129, 98, 67, 36, 5, 192, 161, 130,
@ -200,19 +138,15 @@ static const uint16_t ALIGN(av1_default_scan_8x32[], 32) = {
95, 251, 220, 189, 158, 127, 252, 221,
190, 159, 253, 222, 191, 254, 223, 255,
};
static const uint16_t ALIGN(av1_default_scan_16x4[], 32) = {
static const uint16_t ALIGN(scan_16x4[], 32) = {
0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14,
17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 32, 27, 30,
33, 36, 31, 34, 37, 40, 35, 38, 41, 44, 39, 42, 45, 48, 43, 46,
49, 52, 47, 50, 53, 56, 51, 54, 57, 60, 55, 58, 61, 59, 62, 63,
};
static const uint16_t ALIGN(av1_mrow_scan_16x4[], 32) = {
0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61,
2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62,
3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63,
};
static const uint16_t ALIGN(av1_default_scan_16x8[], 32) = {
static const uint16_t ALIGN(scan_16x8[], 32) = {
0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, 5,
12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, 21, 28,
35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, 30, 37, 44,
@ -222,17 +156,8 @@ static const uint16_t ALIGN(av1_default_scan_16x8[], 32) = {
99, 106, 113, 120, 79, 86, 93, 100, 107, 114, 121, 87, 94, 101, 108, 115,
122, 95, 102, 109, 116, 123, 103, 110, 117, 124, 111, 118, 125, 119, 126, 127,
};
static const uint16_t ALIGN(av1_mrow_scan_16x8[], 32) = {
0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120,
1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123,
4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124,
5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125,
6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126,
7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127,
};
static const uint16_t ALIGN(av1_default_scan_16x16[], 32) = {
static const uint16_t ALIGN(scan_16x16[], 32) = {
0, 16, 1, 2, 17, 32, 48, 33, 18, 3, 4, 19, 34, 49, 64, 80,
65, 50, 35, 20, 5, 6, 21, 36, 51, 66, 81, 96, 112, 97, 82, 67,
52, 37, 22, 7, 8, 23, 38, 53, 68, 83, 98, 113, 128, 144, 129, 114,
@ -250,43 +175,8 @@ static const uint16_t ALIGN(av1_default_scan_16x16[], 32) = {
188, 173, 158, 143, 159, 174, 189, 204, 219, 234, 249, 250, 235, 220, 205, 190,
175, 191, 206, 221, 236, 251, 252, 237, 222, 207, 223, 238, 253, 254, 239, 255,
};
static const uint16_t ALIGN(av1_mrow_scan_16x16[], 32) = {
0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240,
1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241,
2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242,
3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243,
4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244,
5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245,
6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246,
7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248,
9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249,
10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250,
11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251,
12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252,
13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253,
14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254,
15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255,
};
static const uint16_t ALIGN(av1_mcol_scan_16x16[], 32) = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
};
static const uint16_t ALIGN(av1_default_scan_16x32[], 32) = {
static const uint16_t ALIGN(scan_16x32[], 32) = {
0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, 160,
129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, 162, 131,
100, 69, 38, 7, 256, 225, 194, 163, 132, 101, 70, 39, 8, 288, 257, 226,
@ -320,7 +210,8 @@ static const uint16_t ALIGN(av1_default_scan_16x32[], 32) = {
380, 349, 318, 287, 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413, 382,
351, 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510, 479, 511,
};
static const uint16_t ALIGN(av1_default_scan_32x8[], 32) = {
static const uint16_t ALIGN(scan_32x8[], 32) = {
0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, 21, 28,
35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80, 39, 46, 53, 60,
67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89, 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98, 105, 112, 71, 78, 85, 92,
@ -330,7 +221,8 @@ static const uint16_t ALIGN(av1_default_scan_32x8[], 32) = {
195, 202, 209, 216, 175, 182, 189, 196, 203, 210, 217, 224, 183, 190, 197, 204, 211, 218, 225, 232, 191, 198, 205, 212, 219, 226, 233, 240, 199, 206, 213, 220,
227, 234, 241, 248, 207, 214, 221, 228, 235, 242, 249, 215, 222, 229, 236, 243, 250, 223, 230, 237, 244, 251, 231, 238, 245, 252, 239, 246, 253, 247, 254, 255,
};
static const uint16_t ALIGN(av1_default_scan_32x16[], 32) = {
static const uint16_t ALIGN(scan_32x16[], 32) = {
0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64, 5, 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22, 37, 52,
67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 128, 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 10, 25, 40, 55, 70, 85, 100, 115, 130,
145, 160, 11, 26, 41, 56, 71, 86, 101, 116, 131, 146, 161, 176, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177, 192, 13, 28, 43, 58, 73,
@ -348,7 +240,8 @@ static const uint16_t ALIGN(av1_default_scan_32x16[], 32) = {
381, 396, 411, 426, 441, 456, 471, 486, 501, 367, 382, 397, 412, 427, 442, 457, 472, 487, 502, 383, 398, 413, 428, 443, 458, 473, 488, 503, 399, 414, 429, 444,
459, 474, 489, 504, 415, 430, 445, 460, 475, 490, 505, 431, 446, 461, 476, 491, 506, 447, 462, 477, 492, 507, 463, 478, 493, 508, 479, 494, 509, 495, 510, 511,
};
static const uint16_t ALIGN(av1_default_scan_32x32[], 32) = {
static const uint16_t ALIGN(scan_32x32[], 32) = {
0, 32, 1, 2, 33, 64, 96, 65, 34, 3, 4, 35, 66, 97, 128, 160, 129, 98, 67, 36, 5, 6, 37, 68, 99, 130, 161, 192, 224, 193, 162, 131,
100, 69, 38, 7, 8, 39, 70, 101, 132, 163, 194, 225, 256, 288, 257, 226, 195, 164, 133, 102, 71, 40, 9, 10, 41, 72, 103, 134, 165, 196, 227, 258,
289, 320, 352, 321, 290, 259, 228, 197, 166, 135, 104, 73, 42, 11, 12, 43, 74, 105, 136, 167, 198, 229, 260, 291, 322, 353, 384, 416, 385, 354, 323, 292,
@ -383,62 +276,24 @@ static const uint16_t ALIGN(av1_default_scan_32x32[], 32) = {
892, 861, 830, 799, 831, 862, 893, 924, 955, 986, 1017, 1018, 987, 956, 925, 894, 863, 895, 926, 957, 988, 1019, 1020, 989, 958, 927, 959, 990, 1021, 1022, 991, 1023,
};
const uint16_t *const dav1d_scans[N_RECT_TX_SIZES][3] = {
[TX_4X4] = {
[TX_CLASS_2D] = av1_default_scan_4x4,
[TX_CLASS_V] = av1_mrow_scan_4x4,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [TX_8X8] = {
[TX_CLASS_2D] = av1_default_scan_8x8,
[TX_CLASS_V] = av1_mrow_scan_8x8,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [TX_16X16] = {
[TX_CLASS_2D] = av1_default_scan_16x16,
[TX_CLASS_V] = av1_mrow_scan_16x16,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [TX_32X32] = {
[TX_CLASS_2D] = av1_default_scan_32x32,
}, [TX_64X64] = {
[TX_CLASS_2D] = av1_default_scan_32x32,
}, [RTX_4X8] = {
[TX_CLASS_2D] = av1_default_scan_4x8,
[TX_CLASS_V] = av1_mrow_scan_4x8,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [RTX_8X4] = {
[TX_CLASS_2D] = av1_default_scan_8x4,
[TX_CLASS_V] = av1_mrow_scan_8x4,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [RTX_8X16] = {
[TX_CLASS_2D] = av1_default_scan_8x16,
[TX_CLASS_V] = av1_mrow_scan_8x16,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [RTX_16X8] = {
[TX_CLASS_2D] = av1_default_scan_16x8,
[TX_CLASS_V] = av1_mrow_scan_16x8,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [RTX_16X32] = {
[TX_CLASS_2D] = av1_default_scan_16x32,
}, [RTX_32X16] = {
[TX_CLASS_2D] = av1_default_scan_32x16,
}, [RTX_32X64] = {
[TX_CLASS_2D] = av1_default_scan_32x32,
}, [RTX_64X32] = {
[TX_CLASS_2D] = av1_default_scan_32x32,
}, [RTX_4X16] = {
[TX_CLASS_2D] = av1_default_scan_4x16,
[TX_CLASS_V] = av1_mrow_scan_4x16,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [RTX_16X4] = {
[TX_CLASS_2D] = av1_default_scan_16x4,
[TX_CLASS_V] = av1_mrow_scan_16x4,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [RTX_8X32] = {
[TX_CLASS_2D] = av1_default_scan_8x32,
}, [RTX_32X8] = {
[TX_CLASS_2D] = av1_default_scan_32x8,
}, [RTX_16X64] = {
[TX_CLASS_2D] = av1_default_scan_16x32,
}, [RTX_64X16] = {
[TX_CLASS_2D] = av1_default_scan_32x16,
},
const uint16_t *const dav1d_scans[N_RECT_TX_SIZES] = {
[ TX_4X4 ] = scan_4x4,
[ TX_8X8 ] = scan_8x8,
[ TX_16X16] = scan_16x16,
[ TX_32X32] = scan_32x32,
[ TX_64X64] = scan_32x32,
[RTX_4X8 ] = scan_4x8,
[RTX_8X4 ] = scan_8x4,
[RTX_8X16 ] = scan_8x16,
[RTX_16X8 ] = scan_16x8,
[RTX_16X32] = scan_16x32,
[RTX_32X16] = scan_32x16,
[RTX_32X64] = scan_32x32,
[RTX_64X32] = scan_32x32,
[RTX_4X16 ] = scan_4x16,
[RTX_16X4 ] = scan_16x4,
[RTX_8X32 ] = scan_8x32,
[RTX_32X8 ] = scan_32x8,
[RTX_16X64] = scan_16x32,
[RTX_64X16] = scan_32x16,
};

third_party/dav1d/src/scan.h

@ -32,6 +32,6 @@
#include "src/levels.h"
extern const uint16_t *const dav1d_scans[N_RECT_TX_SIZES][3];
extern const uint16_t *const dav1d_scans[N_RECT_TX_SIZES];
#endif /* DAV1D_SRC_SCAN_H */
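(Side note, not from the patch.) With the mrow/mcol tables gone, dav1d_scans is indexed by transform size alone; the 1-D horizontal/vertical coefficient classes are expected to be handled arithmetically in the coefficient decoder rather than through a scan table. The call-site change is essentially:

    /* Sketch of the lookup before/after; `tx` is the rectangular TX size. */
    const uint16_t *scan;
    scan = dav1d_scans[tx];              /* new: one scan order per TX size */
    /* was: dav1d_scans[tx][TX_CLASS_2D] -- V/H classes used mrow/mcol tables */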

third_party/dav1d/src/tables.c

@ -412,13 +412,11 @@ const int8_t dav1d_cdef_directions[2 + 8 + 2 /* dir */][2 /* pass */] = {
{ 0 * 12 + 1, -1 * 12 + 2 }, // 1
};
const int16_t dav1d_sgr_params[16][4] = { // r0, r1, e0, e1
{ 2, 1, 140, 3236 }, { 2, 1, 112, 2158 }, { 2, 1, 93, 1618 },
{ 2, 1, 80, 1438 }, { 2, 1, 70, 1295 }, { 2, 1, 58, 1177 },
{ 2, 1, 47, 1079 }, { 2, 1, 37, 996 }, { 2, 1, 30, 925 },
{ 2, 1, 25, 863 }, { 0, 1, -1, 2589 }, { 0, 1, -1, 1618 },
{ 0, 1, -1, 1177 }, { 0, 1, -1, 925 }, { 2, 0, 56, -1 },
{ 2, 0, 22, -1 },
const uint16_t ALIGN(dav1d_sgr_params[16][2], 4) = {
{ 140, 3236 }, { 112, 2158 }, { 93, 1618 }, { 80, 1438 },
{ 70, 1295 }, { 58, 1177 }, { 47, 1079 }, { 37, 996 },
{ 30, 925 }, { 25, 863 }, { 0, 2589 }, { 0, 1618 },
{ 0, 1177 }, { 0, 925 }, { 56, 0 }, { 22, 0 },
};
const uint8_t ALIGN(dav1d_sgr_x_by_x[256], 16) = {

third_party/dav1d/src/tables.h

@ -107,7 +107,7 @@ extern const Dav1dWarpedMotionParams dav1d_default_wm_params;
extern const int8_t dav1d_cdef_directions[12][2];
extern const int16_t dav1d_sgr_params[16][4];
extern const uint16_t dav1d_sgr_params[16][2];
extern const uint8_t dav1d_sgr_x_by_x[256];
extern const int8_t dav1d_mc_subpel_filters[5+ARCH_X86_64][15][8];
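(Assumption, for context.) The radii r0/r1 were dropped from dav1d_sgr_params, leaving only the two strength values per self-guided set; the radii are implied by which filter variant is selected (see the new dav1d_sgr_filter_5x5/_3x3/_mix entry points further down in this patch). Reading one parameter set then looks roughly like:

    /* sgr_idx is the self-guided parameter set index (0..15); names are
     * illustrative, not necessarily the ones used at the call sites. */
    const uint16_t s0 = dav1d_sgr_params[sgr_idx][0]; /* strength, 5x5 pass */
    const uint16_t s1 = dav1d_sgr_params[sgr_idx][1]; /* strength, 3x3 pass */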

third_party/dav1d/src/thread.h

@ -169,6 +169,14 @@ static inline void dav1d_set_thread_name(const char *const name) {
pthread_setname_np(pthread_self(), "%s", (void*)name);
}
#elif defined(__HAIKU__)
#include <os/kernel/OS.h>
static inline void dav1d_set_thread_name(const char *const name) {
rename_thread(find_thread(NULL), name);
}
#else
#define dav1d_set_thread_name(name) do {} while (0)

third_party/dav1d/src/thread_task.c

@ -29,6 +29,140 @@
#include "src/thread_task.h"
int dav1d_task_create_filter_sbrow(Dav1dFrameContext *const f) {
struct PostFilterThreadData *const pftd = f->lf.thread.pftd;
const int frame_idx = (int)(f - f->c->fc);
const int has_deblock = f->frame_hdr->loopfilter.level_y[0] ||
f->frame_hdr->loopfilter.level_y[1] ||
f->lf.restore_planes;
const int has_cdef = f->seq_hdr->cdef;
const int has_resize = f->frame_hdr->width[0] != f->frame_hdr->width[1];
const int has_lr = !!f->lf.restore_planes;
f->lf.thread.npf = has_deblock + has_cdef + has_resize + has_lr;
if (f->lf.thread.npf == 0) return 0;
pthread_mutex_lock(&pftd->lock);
Dav1dTask *tasks = f->lf.thread.tasks;
int num_tasks = f->sbh * f->lf.thread.npf;
if (num_tasks > f->lf.thread.num_tasks) {
const size_t size = sizeof(Dav1dTask) * num_tasks;
tasks = realloc(f->lf.thread.tasks, size);
if (!tasks) {
pthread_mutex_unlock(&pftd->lock);
return -1;
}
memset(tasks, 0, size);
f->lf.thread.tasks = tasks;
f->lf.thread.num_tasks = num_tasks;
}
#define create_task(task, ready_cond, start_cond) \
do { \
t = &tasks[num_tasks++]; \
t->status = ready_cond ? DAV1D_TASK_READY : DAV1D_TASK_DEFAULT; \
t->start = start_cond; \
t->frame_id = frame_cnt; \
t->frame_idx = frame_idx; \
t->sby = sby; \
t->fn = f->bd_fn.filter_sbrow_##task; \
t->last_deps[0] = NULL; \
t->last_deps[1] = NULL; \
t->next_deps[0] = NULL; \
t->next_deps[1] = NULL; \
t->next_exec = NULL; \
} while (0)
Dav1dTask *last_sbrow_deblock = NULL;
Dav1dTask *last_sbrow_cdef = NULL;
Dav1dTask *last_sbrow_resize = NULL;
Dav1dTask *last_sbrow_lr = NULL;
num_tasks = 0;
const int frame_cnt = pftd->frame_cnt++;
for (int sby = 0; sby < f->sbh; ++sby) {
Dav1dTask *t;
Dav1dTask *last = NULL;
if (has_deblock) {
create_task(deblock, sby == 0, 0);
if (sby) {
t->last_deps[1] = last_sbrow_deblock;
last_sbrow_deblock->next_deps[1] = t;
}
last = t;
last_sbrow_deblock = t;
}
if (has_cdef) {
create_task(cdef, sby == 0 && !has_deblock, has_deblock);
if (has_deblock) {
t->last_deps[0] = last;
last->next_deps[0] = t;
}
if (sby) {
t->last_deps[1] = last_sbrow_cdef;
last_sbrow_cdef->next_deps[1] = t;
}
last = t;
last_sbrow_cdef = t;
};
if (has_resize) {
create_task(resize, sby == 0 && !last, !!last);
if (last) {
t->last_deps[0] = last;
last->next_deps[0] = t;
}
if (sby) {
t->last_deps[1] = last_sbrow_resize;
last_sbrow_resize->next_deps[1] = t;
}
last = t;
last_sbrow_resize = t;
}
if (has_lr) {
create_task(lr, sby == 0 && !last, !!last);
if (last) {
t->last_deps[0] = last;
last->next_deps[0] = t;
}
if (sby) {
t->last_deps[1] = last_sbrow_lr;
last_sbrow_lr->next_deps[1] = t;
}
last_sbrow_lr = t;
}
}
f->lf.thread.done = 0;
pthread_mutex_unlock(&pftd->lock);
return 0;
}
void dav1d_task_schedule(struct PostFilterThreadData *const pftd,
Dav1dTask *const t)
{
Dav1dTask **pt = &pftd->tasks;
while (*pt &&
((*pt)->sby < t->sby ||
((*pt)->sby == t->sby && (*pt)->frame_id <= t->frame_id)))
pt = &(*pt)->next_exec;
t->next_exec = *pt;
*pt = t;
pthread_cond_signal(&pftd->cond);
}
static inline void update_task(Dav1dTask *const t, const int dep_type,
Dav1dFrameContext *const f)
{
if (!t->last_deps[!dep_type] ||
t->last_deps[!dep_type]->status == DAV1D_TASK_DONE)
{
t->status = DAV1D_TASK_READY;
if (t->start)
dav1d_task_schedule(f->lf.thread.pftd, t);
}
}
void *dav1d_frame_task(void *const data) {
Dav1dFrameContext *const f = data;
@ -140,3 +274,98 @@ void *dav1d_tile_task(void *const data) {
return NULL;
}
static inline int handle_abortion(Dav1dPostFilterContext *const pf,
Dav1dContext *const c,
struct PostFilterThreadData *const pftd)
{
const int flush = atomic_load_explicit(c->flush, memory_order_acquire);
if (flush) {
pthread_mutex_lock(&pf->td.lock);
pf->flushed = 0;
pthread_mutex_unlock(&pf->td.lock);
}
for (unsigned i = 0; i < c->n_fc; i++) {
Dav1dFrameContext *const f = &c->fc[i];
int send_signal;
if (flush) // TODO before merge, see if this can be safely merged
send_signal = f->lf.thread.done != 1 && f->lf.thread.num_tasks != 0;
else
send_signal = f->lf.thread.done == -1;
for (int j = 0; send_signal && j < f->lf.thread.num_tasks; j++) {
Dav1dTask *const t = &f->lf.thread.tasks[j];
if (t->status == DAV1D_TASK_RUNNING ||
(t->status == DAV1D_TASK_DONE && t->start != -1))
send_signal = 0;
}
if (send_signal) {
if (!flush) {
Dav1dTask **pt = &pftd->tasks;
while (*pt) {
if ((*pt)->frame_idx == i)
*pt = (*pt)->next_exec;
else
pt = &(*pt)->next_exec;
}
}
f->lf.thread.done = 1;
pthread_cond_signal(&f->lf.thread.cond);
}
}
if (flush) {
pthread_mutex_lock(&pf->td.lock);
pf->flushed = 1;
pthread_cond_signal(&pf->td.cond);
pthread_mutex_unlock(&pf->td.lock);
}
return !flush;
}
void *dav1d_postfilter_task(void *data) {
Dav1dPostFilterContext *const pf = data;
Dav1dContext *const c = pf->c;
struct PostFilterThreadData *pftd = &c->postfilter_thread;
dav1d_set_thread_name("dav1d-postfilter");
int exec = 1;
pthread_mutex_lock(&pftd->lock);
for (;;) {
if (!exec && !pf->die)
pthread_cond_wait(&pftd->cond, &pftd->lock);
if (!(exec = handle_abortion(pf, c, pftd))) continue;
if (pf->die) break;
Dav1dTask *const t = pftd->tasks;
if (!t) { exec = 0; continue; }
pftd->tasks = t->next_exec;
t->status = DAV1D_TASK_RUNNING;
pthread_mutex_unlock(&pftd->lock);
Dav1dFrameContext *const f = &c->fc[t->frame_idx];
t->fn(f, t->sby);
exec = 1;
pthread_mutex_lock(&pftd->lock);
if (t->next_deps[0])
update_task(t->next_deps[0], 0, f);
if (t->next_deps[1])
update_task(t->next_deps[1], 1, f);
t->status = DAV1D_TASK_DONE;
if (!t->next_deps[0]) {
const enum PlaneType progress_plane_type =
c->n_fc > 1 && f->frame_hdr->refresh_context ?
PLANE_TYPE_Y : PLANE_TYPE_ALL;
const int y = (t->sby + 1) * f->sb_step * 4;
dav1d_thread_picture_signal(&f->sr_cur, y, progress_plane_type);
if (t->sby + 1 == f->sbh) {
f->lf.thread.done = 1;
pthread_cond_signal(&f->lf.thread.cond);
}
}
t->start = -1;
}
pthread_mutex_unlock(&pftd->lock);
return NULL;
}
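(Sketch under assumptions, not taken from the patch.) f->lf.thread.done is only written while pftd->lock is held, and f->lf.thread.cond is signalled once the last superblock row of a frame (or an abort) completes, so the frame-side wait for post-filter completion would look roughly like:

    /* Hypothetical consumer; assumes pftd->lock guards f->lf.thread.done,
     * exactly as in dav1d_postfilter_task() above. */
    pthread_mutex_lock(&pftd->lock);
    while (!f->lf.thread.done)
        pthread_cond_wait(&f->lf.thread.cond, &pftd->lock);
    pthread_mutex_unlock(&pftd->lock);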

third_party/dav1d/src/thread_task.h

@ -35,10 +35,33 @@
#define FRAME_ERROR (UINT_MAX - 1)
#define TILE_ERROR (INT_MAX - 1)
int dav1d_decode_frame(Dav1dFrameContext *f);
void *dav1d_frame_task(void *data);
enum TaskStatus {
DAV1D_TASK_DEFAULT,
DAV1D_TASK_READY,
DAV1D_TASK_RUNNING,
DAV1D_TASK_DONE,
};
int dav1d_decode_tile_sbrow(Dav1dTileContext *t);
struct Dav1dTask {
enum TaskStatus status; // task status
int start; // frame thread start flag
unsigned frame_idx; // frame thread id
int frame_id; // frame ordering
int sby; // sbrow
filter_sbrow_fn fn; // task work
Dav1dTask *last_deps[2]; // dependencies
Dav1dTask *next_deps[2]; // dependant tasks
Dav1dTask *next_exec; // tasks scheduling
};
int dav1d_task_create_filter_sbrow(Dav1dFrameContext *f);
void dav1d_task_schedule(struct PostFilterThreadData *pftd, Dav1dTask *t);
void *dav1d_frame_task(void *data);
void *dav1d_tile_task(void *data);
void *dav1d_postfilter_task(void *data);
int dav1d_decode_frame(Dav1dFrameContext *f);
int dav1d_decode_tile_sbrow(Dav1dTileContext *t);
#endif /* DAV1D_SRC_THREAD_TASK_H */
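(Added commentary, not in the patch.) Each Dav1dTask carries two dependency links: slot 0 chains the filter stages within one superblock row and slot 1 chains the same stage across consecutive rows, matching the wiring done in dav1d_task_create_filter_sbrow(). In diagram form:

    /* Dependency sketch for one frame (stage names are the filter_sbrow_*
     * functions; the layout is inferred from the create/update code above).
     *
     *   deblock[sby-1]  cdef[sby-1]  resize[sby-1]  lr[sby-1]
     *        |deps[1]      |deps[1]      |deps[1]      |deps[1]
     *        v             v             v             v
     *   deblock[sby] -> cdef[sby] -> resize[sby] -> lr[sby]
     *               deps[0]      deps[0]       deps[0]
     *
     * update_task() marks a task READY once the relevant last_deps entry is
     * DONE, and dav1d_task_schedule() inserts it into pftd->tasks ordered by
     * sby first, then frame_id. */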

third_party/dav1d/src/wedge.c

@ -45,41 +45,41 @@ enum WedgeDirectionType {
};
typedef struct {
enum WedgeDirectionType direction;
int x_offset;
int y_offset;
uint8_t /* enum WedgeDirectionType */ direction;
uint8_t x_offset;
uint8_t y_offset;
} wedge_code_type;
static const wedge_code_type wedge_codebook_16_hgtw[16] = {
{ WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
{ WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
{ WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
{ WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 4 },
{ WEDGE_HORIZONTAL, 4, 6 }, { WEDGE_VERTICAL, 4, 4 },
{ WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
{ WEDGE_HORIZONTAL, 4, 6 }, { WEDGE_VERTICAL, 4, 4 },
{ WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
{ WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
{ WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
{ WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
{ WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
};
static const wedge_code_type wedge_codebook_16_hltw[16] = {
{ WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
{ WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
{ WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
{ WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 4, 4 },
{ WEDGE_VERTICAL, 6, 4 }, { WEDGE_HORIZONTAL, 4, 4 },
{ WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
{ WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 4, 4 },
{ WEDGE_VERTICAL, 6, 4 }, { WEDGE_HORIZONTAL, 4, 4 },
{ WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
{ WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
{ WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
{ WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
{ WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
};
static const wedge_code_type wedge_codebook_16_heqw[16] = {
{ WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
{ WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
{ WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
{ WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 6 },
{ WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 6, 4 },
{ WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
{ WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 6, 4 },
{ WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
{ WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
{ WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
{ WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
{ WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
};

third_party/dav1d/src/x86/cdef16_avx2.asm (new file)

@ -0,0 +1,65 @@
; Copyright (c) 2017-2021, The rav1e contributors
; Copyright (c) 2021, Nathan Egge
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
SECTION .text
cextern cdef_dir_8bpc_avx2
INIT_YMM avx2
cglobal cdef_dir_16bpc, 4, 4, 3, 32 + 8*8, src, ss, var, bdmax
popcnt bdmaxd, bdmaxd
movzx bdmaxq, bdmaxw
sub bdmaxq, 8
movq xm2, bdmaxq
DEFINE_ARGS src, ss, var, ss3
lea ss3q, [ssq*3]
mova xm0, [srcq + ssq*0]
mova xm1, [srcq + ssq*1]
vinserti128 m0, [srcq + ssq*2], 1
vinserti128 m1, [srcq + ss3q], 1
psraw m0, xm2
psraw m1, xm2
vpackuswb m0, m1
mova [rsp + 32 + 0*8], m0
lea srcq, [srcq + ssq*4]
mova xm0, [srcq + ssq*0]
mova xm1, [srcq + ssq*1]
vinserti128 m0, [srcq + ssq*2], 1
vinserti128 m1, [srcq + ss3q], 1
psraw m0, xm2
psraw m1, xm2
vpackuswb m0, m1
mova [rsp + 32 + 4*8], m0
lea srcq, [rsp + 32] ; WIN64 shadow space
mov ssq, 8
call mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX)
RET
%endif ; ARCH_X86_64
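(Rough C equivalent, not part of the patch.) The 16 bpc direction kernel is a shim: it shifts the 8x8 block down to an 8-bit range in a stack buffer and then calls the existing 8 bpc cdef_dir, so only the filter kernels need a true high-bit-depth path. In C terms, approximately:

    /* Sketch only; the real code does this with psraw/packuswb in registers.
     * dav1d_cdef_dir_8bpc_avx2 is the symbol imported via cextern above;
     * strides are byte strides, as elsewhere in dav1d. */
    extern int dav1d_cdef_dir_8bpc_avx2(const uint8_t *src, ptrdiff_t stride,
                                        unsigned *var);

    static int cdef_dir_16bpc_sketch(const uint16_t *src, ptrdiff_t stride,
                                     unsigned *var, int bitdepth_max)
    {
        uint8_t tmp[8 * 8];
        const int shift = __builtin_popcount(bitdepth_max) - 8; /* 2 or 4 */
        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 8; x++)
                tmp[y * 8 + x] = (uint8_t)(src[y * (stride >> 1) + x] >> shift);
        return dav1d_cdef_dir_8bpc_avx2(tmp, 8, var);
    }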

third_party/dav1d/src/x86/cdef16_sse.asm (new file)

@ -0,0 +1,93 @@
; Copyright (c) 2017-2021, The rav1e contributors
; Copyright (c) 2021, Nathan Egge
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%ifn ARCH_X86_64
SECTION_RODATA 16
pq_dir_shr: dq 2, 4
%endif
SECTION .text
cextern cdef_dir_8bpc_ssse3
INIT_XMM ssse3
cglobal cdef_dir_16bpc, 2, 4, 4, 32 + 8*8, src, ss, var, bdmax
bsr bdmaxd, bdmaxm
%if ARCH_X86_64
movzx bdmaxq, bdmaxw
sub bdmaxq, 7
movq m4, bdmaxq
%else
push r4
sub bdmaxd, 9
LEA r4, pq_dir_shr
movq m4, [r4 + bdmaxd*4]
pop r4
%endif
DEFINE_ARGS src, ss, var, ss3
lea ss3q, [ssq*3]
mova m0, [srcq + ssq*0]
mova m1, [srcq + ssq*1]
mova m2, [srcq + ssq*2]
mova m3, [srcq + ss3q]
psraw m0, m4
psraw m1, m4
psraw m2, m4
psraw m3, m4
packuswb m0, m1
packuswb m2, m3
mova [rsp + 32 + 0*8], m0
mova [rsp + 32 + 2*8], m2
lea srcq, [srcq + ssq*4]
mova m0, [srcq + ssq*0]
mova m1, [srcq + ssq*1]
mova m2, [srcq + ssq*2]
mova m3, [srcq + ss3q]
psraw m0, m4
psraw m1, m4
psraw m2, m4
psraw m3, m4
packuswb m0, m1
packuswb m2, m3
mova [rsp + 32 + 4*8], m0
mova [rsp + 32 + 6*8], m2
lea srcq, [rsp + 32] ; WIN64 shadow space
mov ssq, 8
%if ARCH_X86_64
call mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX)
%else
movifnidn vard, varm
push eax ; align stack
push vard
push ssd
push srcd
call mangle(private_prefix %+ _cdef_dir_8bpc)
add esp, 0x10
%endif
RET

third_party/dav1d/src/x86/cdef_avx2.asm

@ -39,7 +39,7 @@
%endmacro
%macro CDEF_FILTER_JMP_TABLE 1
JMP_TABLE cdef_filter_%1, \
JMP_TABLE cdef_filter_%1_8bpc, \
d6k0, d6k1, d7k0, d7k1, \
d0k0, d0k1, d1k0, d1k1, d2k0, d2k1, d3k0, d3k1, \
d4k0, d4k1, d5k0, d5k1, d6k0, d6k1, d7k0, d7k1, \
@ -94,7 +94,7 @@ SECTION .text
%macro PREP_REGS 2 ; w, h
; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
mov dird, r6m
lea tableq, [cdef_filter_%1x%2_jmptable]
lea tableq, [cdef_filter_%1x%2_8bpc_jmptable]
lea dirq, [tableq+dirq*2*4]
%if %1 == 4
%if %2 == 4
@ -397,7 +397,7 @@ SECTION .text
%macro CDEF_FILTER 2 ; w, h
INIT_YMM avx2
cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \
cglobal cdef_filter_%1x%2_8bpc, 4, 9, 0, dst, stride, left, top, \
pri, sec, dir, damping, edge
%assign stack_offset_entry stack_offset
mov edged, edgem
@ -1592,7 +1592,7 @@ CDEF_FILTER 4, 8
CDEF_FILTER 4, 4
INIT_YMM avx2
cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
cglobal cdef_dir_8bpc, 3, 4, 15, src, stride, var, stride3
lea stride3q, [strideq*3]
movq xm0, [srcq+strideq*0]
movq xm1, [srcq+strideq*1]
@ -1622,10 +1622,10 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
psubw m3, m8
; shuffle registers to generate partial_sum_diag[0-1] together
vpermq m7, m0, q1032
vpermq m6, m1, q1032
vpermq m5, m2, q1032
vpermq m4, m3, q1032
vperm2i128 m7, m0, m0, 0x01
vperm2i128 m6, m1, m1, 0x01
vperm2i128 m5, m2, m2, 0x01
vperm2i128 m4, m3, m3, 0x01
; start with partial_sum_hv[0-1]
paddw m8, m0, m1

third_party/dav1d/src/x86/cdef_avx512.asm

@ -109,7 +109,8 @@ DECLARE_REG_TMP 8, 5
; 5e 5f 50 51 52 53 54 55
INIT_ZMM avx512icl
cglobal cdef_filter_4x4, 4, 8, 13, dst, stride, left, top, pri, sec, dir, damping, edge
cglobal cdef_filter_4x4_8bpc, 4, 8, 13, dst, stride, left, top, \
pri, sec, dir, damping, edge
%define base r7-edge_mask
movq xmm0, [dstq+strideq*0]
movhps xmm0, [dstq+strideq*1]
@ -269,8 +270,7 @@ DECLARE_REG_TMP 2, 7
; L8 L9 40 41 42 43 44 45 8e 8f 80 81 82 83 84 85
; La Lb 50 51 52 53 54 55 9e 9f 90 91 92 93 94 95
cglobal cdef_filter_4x8, 4, 9, 22, dst, stride, left, top, \
pri, sec, dir, damping, edge
cglobal cdef_filter_4x8_8bpc, 4, 9, 22, dst, stride, left, top, pri, sec, dir, damping, edge
%define base r8-edge_mask
vpbroadcastd ym21, strided
mov r6d, edgem
@ -504,8 +504,8 @@ ALIGN function_align
; 8e 8f 80 81 82 83 84 85 84 85 86 87 88 89 8a 8b
; 9e 9f 90 91 92 93 94 95 94 95 96 97 98 99 9a 9b
cglobal cdef_filter_8x8, 4, 11, 32, 4*64, dst, stride, left, top, \
pri, sec, dir, damping, edge
cglobal cdef_filter_8x8_8bpc, 4, 11, 32, 4*64, dst, stride, left, top, \
pri, sec, dir, damping, edge
%define base r8-edge_mask
mov r6d, edgem
lea r10, [dstq+strideq*4-2]

third_party/dav1d/src/x86/cdef_init_tmpl.c

@ -28,20 +28,23 @@
#include "src/cpu.h"
#include "src/cdef.h"
#define decl_cdef_size_fn(sz) \
decl_cdef_fn(dav1d_cdef_filter_##sz##_avx512icl); \
decl_cdef_fn(dav1d_cdef_filter_##sz##_avx2); \
decl_cdef_fn(dav1d_cdef_filter_##sz##_sse4); \
decl_cdef_fn(dav1d_cdef_filter_##sz##_ssse3); \
decl_cdef_fn(dav1d_cdef_filter_##sz##_sse2)
#define decl_cdef_fns(ext) \
decl_cdef_fn(BF(dav1d_cdef_filter_4x4, ext)); \
decl_cdef_fn(BF(dav1d_cdef_filter_4x8, ext)); \
decl_cdef_fn(BF(dav1d_cdef_filter_8x8, ext))
decl_cdef_size_fn(4x4);
decl_cdef_size_fn(4x8);
decl_cdef_size_fn(8x8);
#if BITDEPTH == 8
decl_cdef_fns(avx512icl);
decl_cdef_fns(avx2);
decl_cdef_fns(sse4);
decl_cdef_fns(ssse3);
decl_cdef_fns(sse2);
decl_cdef_dir_fn(dav1d_cdef_dir_avx2);
decl_cdef_dir_fn(dav1d_cdef_dir_sse4);
decl_cdef_dir_fn(dav1d_cdef_dir_ssse3);
decl_cdef_dir_fn(BF(dav1d_cdef_dir, sse4));
#endif
decl_cdef_dir_fn(BF(dav1d_cdef_dir, avx2));
decl_cdef_dir_fn(BF(dav1d_cdef_dir, ssse3));
COLD void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
@ -49,45 +52,47 @@ COLD void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
#if BITDEPTH == 8
c->fb[0] = dav1d_cdef_filter_8x8_sse2;
c->fb[1] = dav1d_cdef_filter_4x8_sse2;
c->fb[2] = dav1d_cdef_filter_4x4_sse2;
c->fb[0] = BF(dav1d_cdef_filter_8x8, sse2);
c->fb[1] = BF(dav1d_cdef_filter_4x8, sse2);
c->fb[2] = BF(dav1d_cdef_filter_4x4, sse2);
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
c->dir = BF(dav1d_cdef_dir, ssse3);
#if BITDEPTH == 8
c->dir = dav1d_cdef_dir_ssse3;
c->fb[0] = dav1d_cdef_filter_8x8_ssse3;
c->fb[1] = dav1d_cdef_filter_4x8_ssse3;
c->fb[2] = dav1d_cdef_filter_4x4_ssse3;
c->fb[0] = BF(dav1d_cdef_filter_8x8, ssse3);
c->fb[1] = BF(dav1d_cdef_filter_4x8, ssse3);
c->fb[2] = BF(dav1d_cdef_filter_4x4, ssse3);
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
#if BITDEPTH == 8
c->dir = dav1d_cdef_dir_sse4;
c->fb[0] = dav1d_cdef_filter_8x8_sse4;
c->fb[1] = dav1d_cdef_filter_4x8_sse4;
c->fb[2] = dav1d_cdef_filter_4x4_sse4;
c->dir = BF(dav1d_cdef_dir, sse4);
c->fb[0] = BF(dav1d_cdef_filter_8x8, sse4);
c->fb[1] = BF(dav1d_cdef_filter_4x8, sse4);
c->fb[2] = BF(dav1d_cdef_filter_4x4, sse4);
#endif
#if ARCH_X86_64
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
c->dir = BF(dav1d_cdef_dir, avx2);
#if BITDEPTH == 8
c->dir = dav1d_cdef_dir_avx2;
c->fb[0] = dav1d_cdef_filter_8x8_avx2;
c->fb[1] = dav1d_cdef_filter_4x8_avx2;
c->fb[2] = dav1d_cdef_filter_4x4_avx2;
c->fb[0] = BF(dav1d_cdef_filter_8x8, avx2);
c->fb[1] = BF(dav1d_cdef_filter_4x8, avx2);
c->fb[2] = BF(dav1d_cdef_filter_4x4, avx2);
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
#if HAVE_AVX512ICL && BITDEPTH == 8
c->fb[0] = dav1d_cdef_filter_8x8_avx512icl;
c->fb[1] = dav1d_cdef_filter_4x8_avx512icl;
c->fb[2] = dav1d_cdef_filter_4x4_avx512icl;
c->fb[0] = BF(dav1d_cdef_filter_8x8, avx512icl);
c->fb[1] = BF(dav1d_cdef_filter_4x8, avx512icl);
c->fb[2] = BF(dav1d_cdef_filter_4x4, avx512icl);
#endif
#endif
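(Explanatory note, not part of the patch.) The BF() macro used in these renames is dav1d's bitdepth-suffix helper, defined alongside bitfn() in the common bitdepth header; to a first approximation it expands as below, which is why the 8 bpc symbols gain an explicit _8bpc_ infix here:

    /* Approximate shape, shown only to make the renames above readable; see
     * include/common/bitdepth.h in the dav1d tree for the real definition. */
    #if BITDEPTH == 8
    #define BF(name, suffix) name##_8bpc_##suffix   /* e.g. dav1d_cdef_dir_8bpc_avx2  */
    #else
    #define BF(name, suffix) name##_16bpc_##suffix  /* e.g. dav1d_cdef_dir_16bpc_avx2 */
    #endif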

third_party/dav1d/src/x86/cdef_sse.asm

@ -249,13 +249,13 @@ SECTION .text
%macro CDEF_FILTER 2 ; w, h
%if ARCH_X86_64
cglobal cdef_filter_%1x%2, 4, 9, 16, 3 * 16 + (%2+4)*32, \
dst, stride, left, top, pri, sec, edge, stride3, dst4
cglobal cdef_filter_%1x%2_8bpc, 4, 9, 16, 3 * 16 + (%2+4)*32, \
dst, stride, left, top, pri, sec, edge, stride3, dst4
%define px rsp+3*16+2*32
%define base 0
%else
cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*32, \
dst, stride, left, edge, stride3
cglobal cdef_filter_%1x%2_8bpc, 2, 7, 8, - 7 * 16 - (%2+4)*32, \
dst, stride, left, edge, stride3
%define topq r2
%define dst4q r2
LEA r5, tap_table
@ -758,7 +758,7 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*32, \
%macro CDEF_DIR 0
%if ARCH_X86_64
cglobal cdef_dir, 3, 5, 16, 32, src, stride, var, stride3
cglobal cdef_dir_8bpc, 3, 5, 16, 32, src, stride, var, stride3
lea stride3q, [strideq*3]
movq m1, [srcq+strideq*0]
movhps m1, [srcq+strideq*1]
@ -1030,7 +1030,7 @@ cglobal cdef_dir, 3, 5, 16, 32, src, stride, var, stride3
shr r1d, 10
mov [varq], r1d
%else
cglobal cdef_dir, 2, 4, 8, 96, src, stride, var, stride3
cglobal cdef_dir_8bpc, 2, 4, 8, 96, src, stride, var, stride3
%define base r2-shufw_6543210x
LEA r2, shufw_6543210x
pxor m0, m0

third_party/dav1d/src/x86/ipred.asm

@ -1170,7 +1170,7 @@ ALIGN function_align
mova m9, [base+ipred_v_shuf]
vbroadcasti128 m6, [base+smooth_weights+16*2]
vbroadcasti128 m7, [base+smooth_weights+16*3]
vpermq m8, m9, q1032
vperm2i128 m8, m9, m9, 0x01
paddw m0, m10, m3
paddw m3, m11
paddw m12, m0
@ -4197,7 +4197,7 @@ ALIGN function_align
pmaddubsw m%3, m5
paddw m%1, m%3
psraw m%1, 4
vpermq m%3, m%1, q1032
vperm2i128 m%3, m%1, m%1, 0x01
packuswb m%1, m%3
%endmacro

third_party/dav1d/src/x86/looprestoration.asm (diff not shown because of its size)

480
third_party/dav1d/src/x86/looprestoration16_avx2.asm поставляемый Normal file
Просмотреть файл

@ -0,0 +1,480 @@
; Copyright (c) 2017-2021, The rav1e contributors
; Copyright (c) 2021, Nathan Egge
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
SECTION_RODATA 32
wiener5_shufB: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
wiener5_shufC: db 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11, 14, 15, 12, 13
wiener5_shufD: db 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1, 10, 11, -1, -1
wiener5_l_shuf: db 4, 5, 4, 5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
wiener7_shufC: db 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9
wiener7_shufD: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
wiener7_shufE: db 8, 9, -1, -1, 10, 11, -1, -1, 12, 13, -1, -1, 14, 15, -1, -1
rev_w: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
rev_d: db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
wiener7_l_shuf: db 6, 7, 6, 7, 6, 7, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
pq_3: dq (6 - 4) + 1
pq_5: dq (6 - 2) + 1
pd_65540: dd (1 << (8 + (6 - 4) + 6)) + (1 << (6 - 4))
pd_262160: dd (1 << (8 + (6 - 2) + 6)) + (1 << (6 - 2))
pq_11: dq 12 - (6 - 4) + 1
pq_9: dq 12 - (6 - 2) + 1
nd_1047552: dd (1 << (12 - (6 - 4))) - (1 << (12 + 8))
nd_1048320: dd (1 << (12 - (6 - 2))) - (1 << (12 + 8))
pb_wiener5_l: times 2 db 2, 3
pb_wiener5_r: times 2 db -6, -5
pb_wiener7_l: times 2 db 4, 5
pb_wiener7_m: times 2 db -4, -3
pb_wiener7_r: times 2 db -8, -7
SECTION .text
INIT_YMM avx2
cglobal wiener_filter5_h_16bpc, 6, 9, 14, dst, left, src, ss, f, w, h, edge, bdmax
movifnidn wd, wm
movifnidn hd, hm
movifnidn edgeb, edgem
vbroadcasti128 m6, [wiener5_shufB]
vpbroadcastd m12, [fq + 2]
vbroadcasti128 m7, [wiener5_shufC]
vpbroadcastw m13, [fq + 6]
vbroadcasti128 m8, [wiener5_shufD]
popcnt bdmaxd, bdmaxm
vpbroadcastd m9, [pd_65540]
movq xm10, [pq_3]
cmp bdmaxd, 10
je .bits10
vpbroadcastd m9, [pd_262160]
movq xm10, [pq_5]
.bits10:
pxor m11, m11
add wq, wq
add srcq, wq
add dstq, wq
neg wq
DEFINE_ARGS dst, left, src, ss, f, w, h, edge, x
.v_loop:
mov xq, wq
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
test leftq, leftq
jz .h_loop
movd xm4, [leftq + 4]
vpblendd m4, [srcq + xq - 4], 0xfe
add leftq, 8
jmp .h_main
.h_extend_left:
vbroadcasti128 m5, [srcq + xq]
mova m4, [srcq + xq]
palignr m4, m5, 12
pshufb m4, [wiener5_l_shuf]
jmp .h_main
.h_loop:
movu m4, [srcq + xq - 4]
.h_main:
movu m5, [srcq + xq + 4]
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .h_have_right
cmp xd, -36
jl .h_have_right
movd xm2, xd
vpbroadcastd m0, [pb_wiener5_l]
vpbroadcastd m1, [pb_wiener5_r]
vpbroadcastb m2, xm2
movu m3, [pb_0to31]
psubb m0, m2
psubb m1, m2
pminub m0, m3
pminub m1, m3
pshufb m4, m0
pshufb m5, m1
.h_have_right:
pshufb m0, m4, m6
pshufb m2, m4, m7
paddw m0, m2
pmaddwd m0, m12
pshufb m1, m5, m6
pshufb m3, m5, m7
paddw m1, m3
pmaddwd m1, m12
pshufb m4, m8
pmaddwd m4, m13
pshufb m5, m8
pmaddwd m5, m13
paddd m0, m4
paddd m1, m5
paddd m0, m9
paddd m1, m9
psrad m0, xm10
psrad m1, xm10
packssdw m0, m1
pmaxsw m0, m11
mova [dstq + xq], m0
add xq, 32
jl .h_loop
add srcq, ssq
add dstq, 384*2
dec hd
jg .v_loop
RET
DECLARE_REG_TMP 8, 9, 10, 11, 12, 13, 14
INIT_YMM avx2
cglobal wiener_filter5_v_16bpc, 6, 13, 12, dst, ds, mid, f, w, h, edge, bdmax
movifnidn wd, wm
movifnidn hd, hm
movifnidn edgeb, edgem
pxor m6, m6
vpbroadcastd m7, [fq + 2]
vpbroadcastd m8, [fq + 6]
popcnt bdmaxd, bdmaxm
vpbroadcastd m9, [nd_1047552]
movq xm10, [pq_11]
cmp bdmaxd, 10
je .bits10
vpbroadcastd m9, [nd_1048320]
movq xm10, [pq_9]
.bits10:
vpbroadcastw m11, bdmaxm
add wq, wq
add midq, wq
add dstq, wq
neg wq
DEFINE_ARGS dst, ds, mid, ms, w, h, edge, x
mov msq, 2*384
mov t0, midq
lea t1, [t0 + msq]
lea t2, [t1 + msq]
lea t3, [t2 + msq]
lea t4, [t3 + msq]
test edgeb, 4 ; LR_HAVE_TOP
jnz .have_top
mov t0, t2
mov t1, t2
.have_top:
test edgeb, 8 ; LR_HAVE_BOTTOM
jnz .v_loop
cmp hd, 2
jg .v_loop
cmp hd, 1
jne .limit_v
mov t3, t2
.limit_v:
mov t4, t3
.v_loop:
mov xq, wq
.h_loop:
mova m1, [t0 + xq]
mova m2, [t1 + xq]
mova m3, [t2 + xq]
mova m4, [t3 + xq]
mova m5, [t4 + xq]
punpcklwd m0, m1, m2
pmaddwd m0, m7
punpckhwd m1, m2
pmaddwd m1, m7
punpcklwd m2, m5, m4
pmaddwd m2, m7
punpckhwd m5, m4
pmaddwd m5, m7
paddd m0, m2
paddd m1, m5
punpcklwd m2, m3, m6
pmaddwd m2, m8
punpckhwd m3, m6
pmaddwd m3, m8
paddd m0, m2
paddd m1, m3
paddd m0, m9
paddd m1, m9
psrad m0, xm10
psrad m1, xm10
packusdw m0, m1
pminuw m0, m11
mova [dstq + xq], m0
add xq, 32
jl .h_loop
add dstq, dsq
mov t0, t1
mov t1, t2
mov t2, t3
mov t3, t4
add t4, msq
test edgeb, 8 ; LR_HAVE_BOTTOM
jnz .have_bottom
cmp hd, 3
jg .have_bottom
mov t4, t3
.have_bottom:
dec hd
jg .v_loop
RET
INIT_YMM avx2
cglobal wiener_filter7_h_16bpc, 6, 10, 16, dst, left, src, ss, f, w, h, edge, bdmax, rh
movifnidn wd, wm
movifnidn hd, hm
movifnidn edgeb, edgem
vpbroadcastd m7, [fq]
vpbroadcastd m8, [fq + 4]
vbroadcasti128 m10, [rev_w]
vbroadcasti128 m11, [wiener5_shufB]
vbroadcasti128 m12, [wiener7_shufC]
vbroadcasti128 m13, [wiener7_shufD]
vbroadcasti128 m14, [wiener7_shufE]
vbroadcasti128 m15, [rev_d]
popcnt bdmaxd, bdmaxm
vpbroadcastd m9, [pd_65540]
mov rhq, [pq_3]
cmp bdmaxd, 10
je .bits10
vpbroadcastd m9, [pd_262160]
mov rhq, [pq_5]
.bits10:
add wq, wq
add srcq, wq
add dstq, wq
neg wq
DEFINE_ARGS dst, left, src, ss, f, w, h, edge, x, rh
.v_loop:
mov xq, wq
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
test leftq, leftq
jz .h_loop
movq xm4, [leftq + 2]
vpblendw xm4, [srcq + xq - 6], 0xf8
vinserti128 m4, [srcq + xq + 10], 1
add leftq, 8
jmp .h_main
.h_extend_left:
vbroadcasti128 m5, [srcq + xq]
mova m4, [srcq + xq]
palignr m4, m5, 10
pshufb m4, [wiener7_l_shuf]
jmp .h_main
.h_loop:
movu m4, [srcq + xq - 6]
.h_main:
movu m5, [srcq + xq + 2]
movu m6, [srcq + xq + 6]
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .h_have_right
cmp xd, -38
jl .h_have_right
movd xm3, xd
vpbroadcastd m0, [pb_wiener7_l]
vpbroadcastd m1, [pb_wiener7_m]
vpbroadcastd m2, [pb_wiener7_r]
vpbroadcastb m3, xm3
psubb m0, m3
psubb m1, m3
psubb m2, m3
movu m3, [pb_0to31]
pminub m0, m3
pminub m1, m3
pminub m2, m3
pshufb m4, m0
pshufb m5, m1
pshufb m6, m2
cmp xd, -9*2
jne .hack
vpbroadcastw xm3, [srcq + xq + 16]
vinserti128 m5, xm3, 1
jmp .h_have_right
.hack:
cmp xd, -1*2
jne .h_have_right
vpbroadcastw xm5, [srcq + xq]
.h_have_right:
pshufb m6, m10
pshufb m0, m4, m11
pshufb m2, m5, m12
paddw m0, m2
pmaddwd m0, m7
pshufb m2, m4, m13
pshufb m4, m14
paddw m2, m4
pmaddwd m2, m8
pshufb m1, m6, m11
pshufb m5, m11
pmaddwd m1, m7
pmaddwd m5, m7
pshufb m3, m6, m13
pshufb m6, m14
paddw m3, m6
pmaddwd m3, m8
paddd m0, m2
paddd m1, m3
pshufb m1, m15
paddd m1, m5
movq xm4, rhq
pxor m5, m5
paddd m0, m9
paddd m1, m9
psrad m0, xm4
psrad m1, xm4
packssdw m0, m1
pmaxsw m0, m5
mova [dstq + xq], m0
add xq, 32
jl .h_loop
add srcq, ssq
add dstq, 384*2
dec hd
jg .v_loop
RET
INIT_YMM avx2
cglobal wiener_filter7_v_16bpc, 6, 15, 13, dst, ds, mid, f, w, h, edge, bdmax
movifnidn wd, wm
movifnidn hd, hm
movifnidn edgeb, edgem
pxor m6, m6
vpbroadcastd m7, [fq]
vpbroadcastw m8, [fq + 4]
vpbroadcastd m9, [fq + 6]
popcnt bdmaxd, bdmaxm
vpbroadcastd m10, [nd_1047552]
movq xm11, [pq_11]
cmp bdmaxd, 10
je .bits10
vpbroadcastd m10, [nd_1048320]
movq xm11, [pq_9]
.bits10:
vpbroadcastw m12, bdmaxm
add wq, wq
add midq, wq
add dstq, wq
neg wq
DEFINE_ARGS dst, ds, mid, ms, w, h, edge, x
mov msq, 2*384
mov t0, midq
mov t1, t0
lea t2, [t1 + msq]
lea t3, [t2 + msq]
lea t4, [t3 + msq]
lea t5, [t4 + msq]
lea t6, [t5 + msq]
test edgeb, 4 ; LR_HAVE_TOP
jnz .have_top
mov t0, t3
mov t1, t3
mov t2, t3
.have_top:
cmp hd, 3
jg .v_loop
test edgeb, 8 ; LR_HAVE_BOTTOM
jz .no_bottom0
cmp hd, 1
jg .v_loop
jmp .h3
.no_bottom0:
cmp hd, 2
je .h2
jns .h3
.h1:
mov t4, t3
.h2:
mov t5, t4
.h3:
mov t6, t5
.v_loop:
mov xq, wq
.h_loop:
mova m1, [t0 + xq]
mova m2, [t1 + xq]
mova m3, [t5 + xq]
mova m4, [t6 + xq]
punpcklwd m0, m1, m2
pmaddwd m0, m7
punpckhwd m1, m2
pmaddwd m1, m7
punpcklwd m2, m4, m3
pmaddwd m2, m7
punpckhwd m4, m3
pmaddwd m4, m7
paddd m0, m2
paddd m1, m4
mova m3, [t2 + xq]
mova m4, [t4 + xq]
punpcklwd m2, m3, m4
pmaddwd m2, m8
punpckhwd m3, m4
pmaddwd m3, m8
paddd m0, m2
paddd m1, m3
mova m3, [t3 + xq]
punpcklwd m2, m3, m6
pmaddwd m2, m9
punpckhwd m3, m6
pmaddwd m3, m9
paddd m0, m2
paddd m1, m3
paddd m0, m10
paddd m1, m10
psrad m0, xm11
psrad m1, xm11
packusdw m0, m1
pminuw m0, m12
mova [dstq + xq], m0
add xq, 32
jl .h_loop
add dstq, dsq
mov t0, t1
mov t1, t2
mov t2, t3
mov t3, t4
mov t4, t5
mov t5, t6
add t6, msq
cmp hd, 4
jg .next_row
test edgeb, 8 ; LR_HAVE_BOTTOM
jz .no_bottom
cmp hd, 2
jg .next_row
.no_bottom:
mov t6, t5
.next_row:
dec hd
jg .v_loop
RET
%endif ; ARCH_X86_64


@ -29,173 +29,235 @@
#include "src/looprestoration.h"
#include "common/intops.h"
#include "src/tables.h"
#define WIENER_FILTER(ext) \
void dav1d_wiener_filter7_##ext(pixel *const dst, ptrdiff_t dst_stride, \
const pixel (*left)[4], const pixel *lpf, \
ptrdiff_t lpf_stride, int w, int h, \
const int16_t filter[2][8], \
enum LrEdgeFlags edges); \
void dav1d_wiener_filter5_##ext(pixel *const dst, ptrdiff_t dst_stride, \
const pixel (*left)[4], const pixel *lpf, \
ptrdiff_t lpf_stride, int w, int h, \
const int16_t filter[2][8], \
enum LrEdgeFlags edges);
#if BITDEPTH != 8
#define decl_wiener_filter_fn(name, ext) \
void BF(name##_h, ext)(int16_t *dst, const pixel (*left)[4], const pixel *src, \
ptrdiff_t stride, const int16_t fh[7], const intptr_t w, \
int h, enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); \
void BF(name##_v, ext)(pixel *dst, ptrdiff_t stride, const int16_t *mid, \
const int16_t fv[7], int w, int h, \
enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); \
static void BF(name, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
const pixel (*const left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, const LooprestorationParams *params, \
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) { \
ALIGN_STK_64(int16_t, mid, 68 * 384,); \
BF(name##_h, ext)(&mid[2*384], left, dst, dst_stride, params->filter[0], w, h, \
edges HIGHBD_TAIL_SUFFIX); \
if (edges & LR_HAVE_TOP) { \
BF(name##_h, ext)(mid, NULL, lpf, lpf_stride, params->filter[0], w, 2, \
edges HIGHBD_TAIL_SUFFIX); \
} \
if (edges & LR_HAVE_BOTTOM) { \
BF(name##_h, ext)(&mid[(2 + h)*384], NULL, lpf + 6*PXSTRIDE(lpf_stride), \
lpf_stride, params->filter[0], w, 2, edges HIGHBD_TAIL_SUFFIX); \
} \
BF(name##_v, ext)(dst, dst_stride, mid, params->filter[1], w, h, edges HIGHBD_TAIL_SUFFIX); \
}
#define decl_wiener_filter_fns(ext) \
decl_wiener_filter_fn(dav1d_wiener_filter7, ext); \
decl_wiener_filter_fn(dav1d_wiener_filter5, ext)
#else
#define decl_wiener_filter_fns(ext) \
decl_lr_filter_fn(BF(dav1d_wiener_filter7, ext)); \
decl_lr_filter_fn(BF(dav1d_wiener_filter5, ext))
#endif
#define SGR_FILTER(ext) \
void dav1d_sgr_box3_h_##ext(int32_t *sumsq, int16_t *sum, \
const pixel (*left)[4], \
const pixel *src, const ptrdiff_t stride, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void dav1d_sgr_box3_v_##ext(int32_t *sumsq, int16_t *sum, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void dav1d_sgr_calc_ab1_##ext(int32_t *a, int16_t *b, \
const int w, const int h, const int strength); \
void dav1d_sgr_finish_filter1_##ext(coef *tmp, \
const pixel *src, const ptrdiff_t stride, \
const int32_t *a, const int16_t *b, \
const int w, const int h); \
#define decl_sgr_filter_fns(ext) \
void BF(dav1d_sgr_filter_5x5, ext)(pixel *dst, ptrdiff_t dst_stride, \
const pixel (*left)[4], const pixel *lpf, \
ptrdiff_t lpf_stride, int w, int h, \
const LooprestorationParams *params, \
enum LrEdgeFlags edges); \
void BF(dav1d_sgr_filter_3x3, ext)(pixel *dst, ptrdiff_t dst_stride, \
const pixel (*left)[4], const pixel *lpf, \
ptrdiff_t lpf_stride, int w, int h, \
const LooprestorationParams *params, \
enum LrEdgeFlags edges); \
void BF(dav1d_sgr_filter_mix, ext)(pixel *dst, ptrdiff_t dst_stride, \
const pixel (*left)[4], const pixel *lpf, \
ptrdiff_t lpf_stride, int w, int h, \
const LooprestorationParams *params, \
enum LrEdgeFlags edges);
/* FIXME: Replace with a port of the AVX2 code */
#define SGR_FILTER_OLD(ext) \
void BF(dav1d_sgr_box3_h, ext)(int32_t *sumsq, int16_t *sum, \
const pixel (*left)[4], \
const pixel *src, const ptrdiff_t stride, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void BF(dav1d_sgr_box3_v, ext)(int32_t *sumsq, int16_t *sum, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void BF(dav1d_sgr_calc_ab1, ext)(int32_t *a, int16_t *b, \
const int w, const int h, const unsigned s); \
void BF(dav1d_sgr_finish_filter1, ext)(coef *tmp, \
const pixel *src, const ptrdiff_t stride, \
const int32_t *a, const int16_t *b, \
const int w, const int h); \
\
/* filter with a 3x3 box (radius=1) */ \
static void dav1d_sgr_filter1_##ext(coef *tmp, \
const pixel *src, const ptrdiff_t stride, \
const pixel (*left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, const int strength, \
const enum LrEdgeFlags edges) \
static void BF(dav1d_sgr_filter1, ext)(coef *tmp, \
const pixel *src, const ptrdiff_t stride, \
const pixel (*left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, const int strength, \
const enum LrEdgeFlags edges) \
{ \
ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \
int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \
ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \
int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \
\
dav1d_sgr_box3_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \
BF(dav1d_sgr_box3_h, ext)(sumsq, sum, left, src, stride, w, h, edges); \
if (edges & LR_HAVE_TOP) \
dav1d_sgr_box3_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
NULL, lpf, lpf_stride, w, 2, edges); \
BF(dav1d_sgr_box3_h, ext)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
NULL, lpf, lpf_stride, w, 2, edges); \
\
if (edges & LR_HAVE_BOTTOM) \
dav1d_sgr_box3_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
lpf_stride, w, 2, edges); \
BF(dav1d_sgr_box3_h, ext)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
lpf_stride, w, 2, edges); \
\
dav1d_sgr_box3_v_##ext(sumsq, sum, w, h, edges); \
dav1d_sgr_calc_ab1_##ext(a, b, w, h, strength); \
dav1d_sgr_finish_filter1_##ext(tmp, src, stride, a, b, w, h); \
BF(dav1d_sgr_box3_v, ext)(sumsq, sum, w, h, edges); \
BF(dav1d_sgr_calc_ab1, ext)(a, b, w, h, strength); \
BF(dav1d_sgr_finish_filter1, ext)(tmp, src, stride, a, b, w, h); \
} \
\
void dav1d_sgr_box5_h_##ext(int32_t *sumsq, int16_t *sum, \
const pixel (*left)[4], \
const pixel *src, const ptrdiff_t stride, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void dav1d_sgr_box5_v_##ext(int32_t *sumsq, int16_t *sum, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void dav1d_sgr_calc_ab2_##ext(int32_t *a, int16_t *b, \
const int w, const int h, const int strength); \
void dav1d_sgr_finish_filter2_##ext(coef *tmp, \
const pixel *src, const ptrdiff_t stride, \
const int32_t *a, const int16_t *b, \
const int w, const int h); \
void BF(dav1d_sgr_box5_h, ext)(int32_t *sumsq, int16_t *sum, \
const pixel (*left)[4], \
const pixel *src, const ptrdiff_t stride, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void BF(dav1d_sgr_box5_v, ext)(int32_t *sumsq, int16_t *sum, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void BF(dav1d_sgr_calc_ab2, ext)(int32_t *a, int16_t *b, \
const int w, const int h, const int strength); \
void BF(dav1d_sgr_finish_filter2, ext)(coef *tmp, \
const pixel *src, const ptrdiff_t stride, \
const int32_t *a, const int16_t *b, \
const int w, const int h); \
\
/* filter with a 5x5 box (radius=2) */ \
static void dav1d_sgr_filter2_##ext(coef *tmp, \
const pixel *src, const ptrdiff_t stride, \
const pixel (*left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, const int strength, \
const enum LrEdgeFlags edges) \
static void BF(dav1d_sgr_filter2, ext)(coef *tmp, \
const pixel *src, const ptrdiff_t stride, \
const pixel (*left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, const int strength, \
const enum LrEdgeFlags edges) \
{ \
ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \
int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \
ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \
int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \
\
dav1d_sgr_box5_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \
BF(dav1d_sgr_box5_h, ext)(sumsq, sum, left, src, stride, w, h, edges); \
if (edges & LR_HAVE_TOP) \
dav1d_sgr_box5_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
NULL, lpf, lpf_stride, w, 2, edges); \
BF(dav1d_sgr_box5_h, ext)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
NULL, lpf, lpf_stride, w, 2, edges); \
\
if (edges & LR_HAVE_BOTTOM) \
dav1d_sgr_box5_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
lpf_stride, w, 2, edges); \
BF(dav1d_sgr_box5_h, ext)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
lpf_stride, w, 2, edges); \
\
dav1d_sgr_box5_v_##ext(sumsq, sum, w, h, edges); \
dav1d_sgr_calc_ab2_##ext(a, b, w, h, strength); \
dav1d_sgr_finish_filter2_##ext(tmp, src, stride, a, b, w, h); \
BF(dav1d_sgr_box5_v, ext)(sumsq, sum, w, h, edges); \
BF(dav1d_sgr_calc_ab2, ext)(a, b, w, h, strength); \
BF(dav1d_sgr_finish_filter2, ext)(tmp, src, stride, a, b, w, h); \
} \
\
void dav1d_sgr_weighted1_##ext(pixel *dst, const ptrdiff_t stride, \
const coef *t1, const int w, const int h, \
const int wt); \
void dav1d_sgr_weighted2_##ext(pixel *dst, const ptrdiff_t stride, \
const coef *t1, const coef *t2, \
const int w, const int h, \
const uint32_t wt); \
void BF(dav1d_sgr_weighted1, ext)(pixel *dst, const ptrdiff_t stride, \
const coef *t1, const int w, const int h, \
const int wt); \
void BF(dav1d_sgr_weighted2, ext)(pixel *dst, const ptrdiff_t stride, \
const coef *t1, const coef *t2, \
const int w, const int h, \
const uint32_t wt); \
\
static void sgr_filter_##ext(pixel *const dst, const ptrdiff_t dst_stride, \
const pixel (*const left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, const int sgr_idx, \
const int16_t sgr_wt[7], const enum LrEdgeFlags edges) \
static void BF(sgr_filter_5x5, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
const pixel (*const left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, \
const LooprestorationParams *const params, \
const enum LrEdgeFlags edges) \
{ \
if (!dav1d_sgr_params[sgr_idx][0]) { \
ALIGN_STK_32(coef, tmp, 64 * 384,); \
dav1d_sgr_filter1_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \
w, h, dav1d_sgr_params[sgr_idx][3], edges); \
dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, (1 << 7) - sgr_wt[1]); \
} else if (!dav1d_sgr_params[sgr_idx][1]) { \
ALIGN_STK_32(coef, tmp, 64 * 384,); \
dav1d_sgr_filter2_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \
w, h, dav1d_sgr_params[sgr_idx][2], edges); \
dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, sgr_wt[0]); \
} else { \
ALIGN_STK_32(coef, tmp1, 64 * 384,); \
ALIGN_STK_32(coef, tmp2, 64 * 384,); \
dav1d_sgr_filter2_##ext(tmp1, dst, dst_stride, left, lpf, lpf_stride, \
w, h, dav1d_sgr_params[sgr_idx][2], edges); \
dav1d_sgr_filter1_##ext(tmp2, dst, dst_stride, left, lpf, lpf_stride, \
w, h, dav1d_sgr_params[sgr_idx][3], edges); \
const uint32_t wt = ((128 - sgr_wt[0] - sgr_wt[1]) << 16) | (uint16_t) sgr_wt[0]; \
dav1d_sgr_weighted2_##ext(dst, dst_stride, tmp1, tmp2, w, h, wt); \
} \
ALIGN_STK_32(coef, tmp, 64 * 384,); \
BF(dav1d_sgr_filter2, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \
w, h, params->sgr.s0, edges); \
BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, params->sgr.w0); \
} \
static void BF(sgr_filter_3x3, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
const pixel (*const left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, \
const LooprestorationParams *const params, \
const enum LrEdgeFlags edges) \
{ \
ALIGN_STK_32(coef, tmp, 64 * 384,); \
BF(dav1d_sgr_filter1, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \
w, h, params->sgr.s1, edges); \
BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, params->sgr.w1); \
} \
static void BF(sgr_filter_mix, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
const pixel (*const left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, \
const LooprestorationParams *const params, \
const enum LrEdgeFlags edges) \
{ \
ALIGN_STK_32(coef, tmp1, 64 * 384,); \
ALIGN_STK_32(coef, tmp2, 64 * 384,); \
BF(dav1d_sgr_filter2, ext)(tmp1, dst, dst_stride, left, lpf, lpf_stride, \
w, h, params->sgr.s0, edges); \
BF(dav1d_sgr_filter1, ext)(tmp2, dst, dst_stride, left, lpf, lpf_stride, \
w, h, params->sgr.s1, edges); \
const uint32_t wt = (params->sgr.w1 << 16) | (uint16_t) params->sgr.w0; \
BF(dav1d_sgr_weighted2, ext)(dst, dst_stride, tmp1, tmp2, w, h, wt); \
}
#if BITDEPTH == 8
WIENER_FILTER(sse2)
WIENER_FILTER(ssse3)
SGR_FILTER(ssse3)
decl_wiener_filter_fns(sse2);
decl_wiener_filter_fns(ssse3);
SGR_FILTER_OLD(ssse3)
# if ARCH_X86_64
WIENER_FILTER(avx2)
SGR_FILTER(avx2)
decl_sgr_filter_fns(avx2)
# endif
#endif
#if ARCH_X86_64
decl_wiener_filter_fns(avx2);
#endif
COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
#if BITDEPTH == 8
c->wiener[0] = dav1d_wiener_filter7_sse2;
c->wiener[1] = dav1d_wiener_filter5_sse2;
c->wiener[0] = BF(dav1d_wiener_filter7, sse2);
c->wiener[1] = BF(dav1d_wiener_filter5, sse2);
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
#if BITDEPTH == 8
c->wiener[0] = dav1d_wiener_filter7_ssse3;
c->wiener[1] = dav1d_wiener_filter5_ssse3;
c->selfguided = sgr_filter_ssse3;
c->wiener[0] = BF(dav1d_wiener_filter7, ssse3);
c->wiener[1] = BF(dav1d_wiener_filter5, ssse3);
c->sgr[0] = BF(sgr_filter_5x5, ssse3);
c->sgr[1] = BF(sgr_filter_3x3, ssse3);
c->sgr[2] = BF(sgr_filter_mix, ssse3);
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
#if BITDEPTH == 8 && ARCH_X86_64
c->wiener[0] = dav1d_wiener_filter7_avx2;
c->wiener[1] = dav1d_wiener_filter5_avx2;
c->selfguided = sgr_filter_avx2;
#if ARCH_X86_64
c->wiener[0] = BF(dav1d_wiener_filter7, avx2);
c->wiener[1] = BF(dav1d_wiener_filter5, avx2);
# if BITDEPTH == 8
c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx2);
c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx2);
c->sgr[2] = BF(dav1d_sgr_filter_mix, avx2);
# endif
#endif
}
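For reference, not part of the upstream patch: the refactor above replaces the single c->selfguided entry point with a three-entry c->sgr[] table and moves the SGR strengths and weights into LooprestorationParams. A minimal stand-alone sketch of the resulting dispatch rule, using stand-in types and made-up parameter values rather than dav1d's own, could look like this:

#include <stdio.h>

/* Stand-in for the .sgr member of dav1d's LooprestorationParams. */
typedef struct { struct { unsigned s0, s1; int w0, w1; } sgr; } Params;

static void sgr_5x5(const Params *p) { printf("5x5: s0=%u w0=%d\n", p->sgr.s0, p->sgr.w0); }
static void sgr_3x3(const Params *p) { printf("3x3: s1=%u w1=%d\n", p->sgr.s1, p->sgr.w1); }
static void sgr_mix(const Params *p) { printf("mix: s0=%u s1=%u\n", p->sgr.s0, p->sgr.s1); }

int main(void) {
    /* Same ordering as c->sgr[]: 0 = 5x5 only, 1 = 3x3 only, 2 = mix. */
    void (*const sgr[3])(const Params *) = { sgr_5x5, sgr_3x3, sgr_mix };
    const Params p = { .sgr = { .s0 = 25, .s1 = 0, .w0 = 47, .w1 = 81 } };
    /* A zero strength disables that pass, mirroring the branch the removed
     * sgr_filter_##ext wrapper did on dav1d_sgr_params[sgr_idx][0]/[1]. */
    const int fn = p.sgr.s0 ? (p.sgr.s1 ? 2 : 0) : 1;
    sgr[fn](&p);
    return 0;
}

With the example values it selects sgr[0] and prints "5x5: s0=25 w0=47".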


@ -97,8 +97,8 @@ SECTION .text
%macro WIENER 0
%if ARCH_X86_64
DECLARE_REG_TMP 4, 10, 7, 11, 12, 13, 14 ; ring buffer pointers
cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
lpf_stride, w, edge, flt, h, x
cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
lpf_stride, w, edge, flt, h, x
%define base 0
mov fltq, fltmp
mov edged, r8m
@ -139,7 +139,7 @@ DECLARE_REG_TMP 4, 0, _, 5
%define m11 [stk+96]
%define stk_off 112
%endif
cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
%define base r6-pb_right_ext_mask-21
%define stk esp
%define dstq leftq
@ -245,7 +245,7 @@ cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
add lpfq, [rsp+gprsize*1]
call .hv_bottom
.v1:
call mangle(private_prefix %+ _wiener_filter7_ssse3).v
call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
RET
.no_top:
lea t3, [lpfq+lpf_strideq*4]
@ -281,9 +281,9 @@ cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
dec hd
jnz .main
.v3:
call mangle(private_prefix %+ _wiener_filter7_ssse3).v
call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
.v2:
call mangle(private_prefix %+ _wiener_filter7_ssse3).v
call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
jmp .v1
.extend_right:
movd m2, [lpfq-4]
@ -685,8 +685,8 @@ ALIGN function_align
%endif
%if ARCH_X86_64
cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
lpf_stride, w, edge, flt, h, x
cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
lpf_stride, w, edge, flt, h, x
mov fltq, fltmp
mov edged, r8m
mov wd, wm
@ -720,7 +720,7 @@ cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
%define m11 [stk+80]
%define stk_off 96
%endif
cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
cglobal wiener_filter5_8bpc, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
%define stk esp
%define leftmp [stk+28]
%define m8 [base+pw_m16380]
@ -827,14 +827,14 @@ cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
dec hd
jnz .main
.v2:
call mangle(private_prefix %+ _wiener_filter5_ssse3).v
call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
add dstq, dst_strideq
mov t4, t3
mov t3, t2
mov t2, t1
movifnidn dstmp, dstq
.v1:
call mangle(private_prefix %+ _wiener_filter5_ssse3).v
call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
jmp .end
.h:
%define stk esp+4
@ -873,7 +873,7 @@ cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
jnz .h_have_right
cmp xd, -17
jl .h_have_right
call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right
call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
.h_have_right:
%macro %%h5 0
%if cpuflag(ssse3)
@ -991,7 +991,7 @@ ALIGN function_align
jnz .hv_have_right
cmp xd, -17
jl .hv_have_right
call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right
call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
.hv_have_right:
%%h5
mova m2, [t3+xq*2]
@ -1161,7 +1161,7 @@ WIENER
%endmacro
%if ARCH_X86_64
cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
cglobal sgr_box3_h_8bpc, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
mov xlimd, edgem
movifnidn xd, xm
mov hd, hm
@ -1170,7 +1170,7 @@ cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
add xd, xlimd
xor xlimd, 2 ; 2*!have_right
%else
cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
cglobal sgr_box3_h_8bpc, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
%define wq r0m
%define xlimd r1m
%define hd hmp
@ -1287,10 +1287,10 @@ cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
RET
%if ARCH_X86_64
cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim
cglobal sgr_box3_v_8bpc, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim
movifnidn edged, edgem
%else
cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
cglobal sgr_box3_v_8bpc, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
%define sumsq_baseq dword [esp+0]
%define sum_baseq dword [esp+4]
%define ylimd dword [esp+8]
@ -1383,7 +1383,7 @@ cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
jl .loop_x
RET
cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s
cglobal sgr_calc_ab1_8bpc, 4, 7, 12, a, b, w, h, s
movifnidn sd, sm
sub aq, (384+16-1)*4
sub bq, (384+16-1)*2
@ -1463,8 +1463,8 @@ cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s
RET
%if ARCH_X86_64
cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
tmp_base, src_base, a_base, b_base, x, y
cglobal sgr_finish_filter1_8bpc, 5, 13, 16, t, src, stride, a, b, w, h, \
tmp_base, src_base, a_base, b_base, x, y
movifnidn wd, wm
mov hd, hm
mova m15, [pw_16]
@ -1474,7 +1474,7 @@ cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
mov b_baseq, bq
xor xd, xd
%else
cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y
cglobal sgr_finish_filter1_8bpc, 7, 7, 8, -144, t, src, stride, a, b, x, y
%define tmp_baseq [esp+8]
%define src_baseq [esp+12]
%define a_baseq [esp+16]
@ -1688,7 +1688,7 @@ cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y
jl .loop_x
RET
cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt
cglobal sgr_weighted1_8bpc, 4, 7, 8, dst, stride, t, w, h, wt
movifnidn hd, hm
%if ARCH_X86_32
SETUP_PIC r6, 0
@ -1726,14 +1726,14 @@ cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt
RET
%if ARCH_X86_64
cglobal sgr_box5_h, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim
cglobal sgr_box5_h_8bpc, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim
mov edged, edgem
movifnidn wd, wm
mov hd, hm
mova m10, [pb_0]
mova m11, [pb_0_1]
%else
cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
cglobal sgr_box5_h_8bpc, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
%define edgeb byte edgem
%define wd xd
%define wq wd
@ -1909,11 +1909,11 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
RET
%if ARCH_X86_64
cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
cglobal sgr_box5_v_8bpc, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
movifnidn edged, edgem
mov ylimd, edged
%else
cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
cglobal sgr_box5_v_8bpc, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
%define wm [esp+0]
%define hm [esp+4]
%define edgem [esp+8]
@ -2127,7 +2127,7 @@ cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
jmp .sum_loop_y_noload
%endif
cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s
cglobal sgr_calc_ab2_8bpc, 4, 7, 11, a, b, w, h, s
movifnidn sd, sm
sub aq, (384+16-1)*4
sub bq, (384+16-1)*2
@ -2205,7 +2205,7 @@ cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s
RET
%if ARCH_X86_64
cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \
cglobal sgr_finish_filter2_8bpc, 5, 13, 14, t, src, stride, a, b, w, h, \
tmp_base, src_base, a_base, b_base, x, y
movifnidn wd, wm
mov hd, hm
@ -2219,7 +2219,7 @@ cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \
psrlw m11, m12, 1 ; pw_128
pxor m13, m13
%else
cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y
cglobal sgr_finish_filter2_8bpc, 6, 7, 8, t, src, stride, a, b, x, y
%define tmp_baseq r0m
%define src_baseq r1m
%define a_baseq r3m
@ -2378,7 +2378,7 @@ cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y
RET
%undef t2
cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt
cglobal sgr_weighted2_8bpc, 4, 7, 12, dst, stride, t1, t2, w, h, wt
movifnidn wd, wm
movd m0, wtm
%if ARCH_X86_64

third_party/dav1d/src/x86/mc_avx2.asm (vendored)

@ -3825,9 +3825,8 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
pblendw m6, m7, 0xaa ; 67 89
pmulhrsw m6, m12
paddd m4, m5
vpblendd m0, m1, m6, 0x0f
vperm2i128 m0, m1, m6, 0x21 ; 45 67
mova m1, m6
vpermq m0, m0, q1032 ; 45 67
pmaddwd m6, m0, m10
pmaddwd m7, m1, m11
paddd m4, m13

third_party/dav1d/src/x86/msac.asm (vendored)

@ -153,6 +153,7 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6
.renorm4:
bsr ecx, t2d
xor ecx, 15 ; d
.renorm5:
shl t2d, cl
shl t4, cl
mov [t7+msac.rng], t2d
@ -413,13 +414,20 @@ cglobal msac_decode_bool_equi, 0, 6, 0
sub t2d, t1d ; r - v
sub t4, rax ; dif - vw
cmovb t2d, t1d
mov t1d, [t0+msac.cnt]
cmovb t4, t3
movifnidn t7, t0
mov ecx, 0xbfff
setb al ; the upper 32 bits contain garbage but that's OK
sub ecx, t2d
not t4
; In the case of this function, (d =) 16 - clz(v) = 2 - (v >> 14)
; i.e. (0 <= d <= 2) and v < (3 << 14)
shr ecx, 14 ; d
%if ARCH_X86_64 == 0
movzx eax, al
%endif
jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3
jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm5
cglobal msac_decode_bool, 0, 6, 0
movifnidn t0, r0mp
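A side note, not part of the patch: the shortcut above for msac_decode_bool_equi leans on a small arithmetic identity. Because v here stays below 3 << 14, the renormalization shift d can be computed as (0xbfff - v) >> 14, which is the 2 - (v >> 14) form mentioned in the comment, avoiding the bsr/xor of the generic renorm path. A throwaway C check of that identity over the stated range:

#include <assert.h>

int main(void) {
    /* The asm computes d as: mov ecx, 0xbfff; sub ecx, v; shr ecx, 14.
     * Confirm it equals 2 - (v >> 14) for every v below 3 << 14. */
    for (unsigned v = 0; v < (3u << 14); v++)
        assert(((0xbfffu - v) >> 14) == (2u - (v >> 14)));
    return 0;
}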

third_party/dav1d/tests/checkasm/checkasm.h (vendored)

@ -115,7 +115,7 @@ int float_near_abs_eps_array_ulp(const float *a, const float *b, float eps,
#if HAVE_ASM
#if ARCH_X86
#ifdef _MSC_VER
#if defined(_MSC_VER) && !defined(__clang__)
#include <intrin.h>
#define readtime() (_mm_lfence(), __rdtsc())
#else

third_party/dav1d/tests/checkasm/itx.c (vendored)

@ -138,14 +138,21 @@ static int copy_subcoefs(coef *coeff,
* dimensions are non-zero. This leads to branching to specific optimized
* simd versions (e.g. dc-only) so that we get full asm coverage in this
* test */
const uint16_t *const scan = dav1d_scans[tx][dav1d_tx_type_class[txtp]];
const enum TxClass tx_class = dav1d_tx_type_class[txtp];
const uint16_t *const scan = dav1d_scans[tx];
const int sub_high = subsh > 0 ? subsh * 8 - 1 : 0;
const int sub_low = subsh > 1 ? sub_high - 8 : 0;
int n, eob;
for (n = 0, eob = 0; n < sw * sh; n++) {
const int rc = scan[n];
const int rcx = rc % sh, rcy = rc / sh;
int rc, rcx, rcy;
if (tx_class == TX_CLASS_2D)
rc = scan[n], rcx = rc % sh, rcy = rc / sh;
else if (tx_class == TX_CLASS_H)
rcx = n % sh, rcy = n / sh, rc = n;
else /* tx_class == TX_CLASS_V */
rcx = n / sw, rcy = n % sw, rc = rcy * sh + rcx;
/* Pick a random eob within this sub-itx */
if (rcx > sub_high || rcy > sub_high) {
@ -156,8 +163,18 @@ static int copy_subcoefs(coef *coeff,
if (eob)
eob += rnd() % (n - eob - 1);
for (n = eob + 1; n < sw * sh; n++)
coeff[scan[n]] = 0;
if (tx_class == TX_CLASS_2D)
for (n = eob + 1; n < sw * sh; n++)
coeff[scan[n]] = 0;
else if (tx_class == TX_CLASS_H)
for (n = eob + 1; n < sw * sh; n++)
coeff[n] = 0;
else /* tx_class == TX_CLASS_V */ {
for (int rcx = eob / sw, rcy = eob % sw; rcx < sh; rcx++, rcy = -1)
while (++rcy < sw)
coeff[rcy * sh + rcx] = 0;
n = sw * sh;
}
for (; n < 32 * 32; n++)
coeff[n] = rnd();
return eob;


@ -41,24 +41,30 @@ static int to_binary(int x) { /* 0-15 -> 0000-1111 */
static void init_tmp(pixel *buf, const ptrdiff_t stride,
const int w, const int h, const int bitdepth_max)
{
const int noise_mask = bitdepth_max >> 4;
const int x_off = rnd() & 7, y_off = rnd() & 7;
for (int y = 0; y < h; y++) {
for (int x = 0; x < w; x++)
buf[x] = rnd() & bitdepth_max;
for (int x = 0; x < w; x++) {
buf[x] = (((x + x_off) ^ (y + y_off)) & 8 ? bitdepth_max : 0) ^
(rnd() & noise_mask);
}
buf += PXSTRIDE(stride);
}
}
static void check_wiener(Dav1dLoopRestorationDSPContext *const c, const int bpc) {
ALIGN_STK_64(pixel, c_dst, 448 * 64,);
ALIGN_STK_64(pixel, a_dst, 448 * 64,);
ALIGN_STK_64(pixel, h_edge, 448 * 8,);
ALIGN_STK_16(int16_t, filter, 2, [8]);
ALIGN_STK_64(pixel, c_src, 448 * 64,), *const c_dst = c_src + 32;
ALIGN_STK_64(pixel, a_src, 448 * 64,), *const a_dst = a_src + 32;
ALIGN_STK_64(pixel, edge_buf, 448 * 8,), *const h_edge = edge_buf + 32;
pixel left[64][4];
LooprestorationParams params;
int16_t (*const filter)[8] = params.filter;
declare_func(void, pixel *dst, ptrdiff_t dst_stride,
const pixel (*const left)[4],
const pixel *lpf, ptrdiff_t lpf_stride,
int w, int h, const int16_t filter[2][8],
int w, int h, const LooprestorationParams *params,
enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);
for (int t = 0; t < 2; t++) {
@ -80,24 +86,24 @@ static void check_wiener(Dav1dLoopRestorationDSPContext *const c, const int bpc)
const int base_h = 1 + (rnd() & 63);
const int bitdepth_max = (1 << bpc) - 1;
init_tmp(c_dst, 448 * sizeof(pixel), 448, 64, bitdepth_max);
init_tmp(h_edge, 448 * sizeof(pixel), 448, 8, bitdepth_max);
init_tmp(c_src, 448 * sizeof(pixel), 448, 64, bitdepth_max);
init_tmp(edge_buf, 448 * sizeof(pixel), 448, 8, bitdepth_max);
init_tmp((pixel *) left, 4 * sizeof(pixel), 4, 64, bitdepth_max);
for (enum LrEdgeFlags edges = 0; edges <= 0xf; edges++) {
const int w = edges & LR_HAVE_RIGHT ? 256 : base_w;
const int h = edges & LR_HAVE_BOTTOM ? 64 : base_h;
memcpy(a_dst, c_dst, 448 * 64 * sizeof(pixel));
memcpy(a_src, c_src, 448 * 64 * sizeof(pixel));
call_ref(c_dst + 32, 448 * sizeof(pixel), left,
h_edge + 32, 448 * sizeof(pixel),
w, h, filter, edges HIGHBD_TAIL_SUFFIX);
call_new(a_dst + 32, 448 * sizeof(pixel), left,
h_edge + 32, 448 * sizeof(pixel),
w, h, filter, edges HIGHBD_TAIL_SUFFIX);
if (checkasm_check_pixel(c_dst + 32, 448 * sizeof(pixel),
a_dst + 32, 448 * sizeof(pixel),
call_ref(c_dst, 448 * sizeof(pixel), left,
h_edge, 448 * sizeof(pixel),
w, h, &params, edges HIGHBD_TAIL_SUFFIX);
call_new(a_dst, 448 * sizeof(pixel), left,
h_edge, 448 * sizeof(pixel),
w, h, &params, edges HIGHBD_TAIL_SUFFIX);
if (checkasm_check_pixel(c_dst, 448 * sizeof(pixel),
a_dst, 448 * sizeof(pixel),
w, h, "dst"))
{
fprintf(stderr, "size = %dx%d, edges = %04d\n",
@ -105,63 +111,72 @@ static void check_wiener(Dav1dLoopRestorationDSPContext *const c, const int bpc)
break;
}
}
bench_new(a_dst + 32, 448 * sizeof(pixel), left,
h_edge + 32, 448 * sizeof(pixel),
256, 64, filter, 0xf HIGHBD_TAIL_SUFFIX);
bench_new(a_dst, 448 * sizeof(pixel), left,
h_edge, 448 * sizeof(pixel),
256, 64, &params, 0xf HIGHBD_TAIL_SUFFIX);
}
}
}
static void check_sgr(Dav1dLoopRestorationDSPContext *const c, const int bpc) {
ALIGN_STK_64(pixel, c_dst, 448 * 64,);
ALIGN_STK_64(pixel, a_dst, 448 * 64,);
ALIGN_STK_64(pixel, h_edge, 448 * 8,);
ALIGN_STK_64(pixel, c_src, 448 * 64,), *const c_dst = c_src + 32;
ALIGN_STK_64(pixel, a_src, 448 * 64,), *const a_dst = a_src + 32;
ALIGN_STK_64(pixel, edge_buf, 448 * 8,), *const h_edge = edge_buf + 32;
pixel left[64][4];
LooprestorationParams params;
declare_func(void, pixel *dst, ptrdiff_t dst_stride,
const pixel (*const left)[4],
const pixel *lpf, ptrdiff_t lpf_stride,
int w, int h, int sgr_idx,
const int16_t sgr_wt[7], enum LrEdgeFlags edges
HIGHBD_DECL_SUFFIX);
int w, int h, const LooprestorationParams *params,
enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);
for (int sgr_idx = 14; sgr_idx >= 6; sgr_idx -= 4) {
if (check_func(c->selfguided, "selfguided_%s_%dbpc",
sgr_idx == 6 ? "mix" : sgr_idx == 10 ? "3x3" : "5x5", bpc))
{
int16_t sgr_wt[2];
static const struct { char name[4]; uint8_t idx; } sgr_data[3] = {
{ "5x5", 14 },
{ "3x3", 10 },
{ "mix", 0 },
};
sgr_wt[0] = dav1d_sgr_params[sgr_idx][0] ? (rnd() & 127) - 96 : 0;
sgr_wt[1] = dav1d_sgr_params[sgr_idx][1] ? (rnd() & 127) - 32 :
iclip(128 - sgr_wt[0], -32, 95);
for (int i = 0; i < 3; i++) {
if (check_func(c->sgr[i], "sgr_%s_%dbpc", sgr_data[i].name, bpc)) {
const uint16_t *const sgr_params = dav1d_sgr_params[sgr_data[i].idx];
params.sgr.s0 = sgr_params[0];
params.sgr.s1 = sgr_params[1];
params.sgr.w0 = sgr_params[0] ? (rnd() & 127) - 96 : 0;
params.sgr.w1 = (sgr_params[1] ? 160 - (rnd() & 127) : 33) - params.sgr.w0;
const int base_w = 1 + (rnd() % 384);
const int base_h = 1 + (rnd() & 63);
const int bitdepth_max = (1 << bpc) - 1;
init_tmp(c_dst, 448 * sizeof(pixel), 448, 64, bitdepth_max);
init_tmp(h_edge, 448 * sizeof(pixel), 448, 8, bitdepth_max);
init_tmp(c_src, 448 * sizeof(pixel), 448, 64, bitdepth_max);
init_tmp(edge_buf, 448 * sizeof(pixel), 448, 8, bitdepth_max);
init_tmp((pixel *) left, 4 * sizeof(pixel), 4, 64, bitdepth_max);
for (enum LrEdgeFlags edges = 0; edges <= 0xf; edges++) {
const int w = edges & LR_HAVE_RIGHT ? 256 : base_w;
const int h = edges & LR_HAVE_BOTTOM ? 64 : base_h;
memcpy(a_dst, c_dst, 448 * 64 * sizeof(pixel));
memcpy(a_src, c_src, 448 * 64 * sizeof(pixel));
call_ref(c_dst + 32, 448 * sizeof(pixel), left,
h_edge + 32, 448 * sizeof(pixel),
w, h, sgr_idx, sgr_wt, edges HIGHBD_TAIL_SUFFIX);
call_new(a_dst + 32, 448 * sizeof(pixel), left,
h_edge + 32, 448 * sizeof(pixel),
w, h, sgr_idx, sgr_wt, edges HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel(c_dst + 32, 448 * sizeof(pixel),
a_dst + 32, 448 * sizeof(pixel),
w, h, "dst");
call_ref(c_dst, 448 * sizeof(pixel), left,
h_edge, 448 * sizeof(pixel),
w, h, &params, edges HIGHBD_TAIL_SUFFIX);
call_new(a_dst, 448 * sizeof(pixel), left,
h_edge, 448 * sizeof(pixel),
w, h, &params, edges HIGHBD_TAIL_SUFFIX);
if (checkasm_check_pixel(c_dst, 448 * sizeof(pixel),
a_dst, 448 * sizeof(pixel),
w, h, "dst"))
{
fprintf(stderr, "size = %dx%d, edges = %04d\n",
w, h, to_binary(edges));
break;
}
}
bench_new(a_dst + 32, 448 * sizeof(pixel), left,
h_edge + 32, 448 * sizeof(pixel),
256, 64, sgr_idx, sgr_wt, 0xf HIGHBD_TAIL_SUFFIX);
bench_new(a_dst, 448 * sizeof(pixel), left,
h_edge, 448 * sizeof(pixel),
256, 64, &params, 0xf HIGHBD_TAIL_SUFFIX);
}
}
}


@ -193,7 +193,6 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
}
cleanup:
dav1d_flush(ctx);
dav1d_close(&ctx);
end:
return 0;

third_party/dav1d/tests/meson.build (vendored)

@ -76,8 +76,6 @@ if is_asm_enabled
checkasm_sources += checkasm_asm_sources
endif
m_lib = cc.find_library('m', required: false)
checkasm = executable('checkasm',
checkasm_sources,
checkasm_asm_objs,
@ -94,7 +92,7 @@ if is_asm_enabled
thread_dependency,
rt_dependency,
libdl_dependency,
m_lib,
libm_dependency,
],
)
@ -127,6 +125,26 @@ endforeach
# fuzzing binaries
subdir('libfuzzer')
# seek stress test binary, depends on dav1d cli tool
if get_option('enable_tools')
seek_stress_sources = files('seek_stress.c')
seek_stress = executable('seek_stress',
seek_stress_sources, rev_target,
objects: [
dav1d.extract_objects('dav1d_cli_parse.c'),
dav1d_input_objs.extract_objects('input/input.c', 'input/ivf.c'),
],
include_directories: [dav1d_inc_dirs, include_directories('../tools')],
link_with: libdav1d,
dependencies: [
thread_dependency,
rt_dependency,
getopt_dependency,
libm_dependency,
],
)
endif
# Include dav1d test data repository with additional tests
if get_option('testdata_tests')
subdir('dav1d-test-data')

third_party/dav1d/tests/seek_stress.c (vendored, new file)

@ -0,0 +1,243 @@
/*
* Copyright © 2020, VideoLAN and dav1d authors
* Copyright © 2020, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "vcs_version.h"
#include "cli_config.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "dav1d/dav1d.h"
#include "input/input.h"
#include "input/demuxer.h"
#include "dav1d_cli_parse.h"
#define NUM_RAND_SEEK 3
#define NUM_REL_SEEK 4
#define NUM_END_SEEK 2
const Demuxer annexb_demuxer = { .name = "" };
const Demuxer section5_demuxer = { .name = "" };
#ifdef _WIN32
#include <windows.h>
static unsigned get_seed(void) {
return GetTickCount();
}
#else
#ifdef __APPLE__
#include <mach/mach_time.h>
#else
#include <time.h>
#endif
static unsigned get_seed(void) {
#ifdef __APPLE__
return (unsigned) mach_absolute_time();
#elif defined(HAVE_CLOCK_GETTIME)
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return (unsigned) (1000000000ULL * ts.tv_sec + ts.tv_nsec);
#endif
}
#endif
static uint32_t xs_state[4];
static void xor128_srand(unsigned seed) {
xs_state[0] = seed;
xs_state[1] = ( seed & 0xffff0000) | (~seed & 0x0000ffff);
xs_state[2] = (~seed & 0xffff0000) | ( seed & 0x0000ffff);
xs_state[3] = ~seed;
}
// xor128 from Marsaglia, George (July 2003). "Xorshift RNGs".
// Journal of Statistical Software. 8 (14).
// doi:10.18637/jss.v008.i14.
static int xor128_rand(void) {
const uint32_t x = xs_state[0];
const uint32_t t = x ^ (x << 11);
xs_state[0] = xs_state[1];
xs_state[1] = xs_state[2];
xs_state[2] = xs_state[3];
uint32_t w = xs_state[3];
w = (w ^ (w >> 19)) ^ (t ^ (t >> 8));
xs_state[3] = w;
return w >> 1;
}
static inline int decode_frame(Dav1dPicture *const p,
Dav1dContext *const c, Dav1dData *const data)
{
int res;
memset(p, 0, sizeof(*p));
if ((res = dav1d_send_data(c, data)) < 0) {
if (res != DAV1D_ERR(EAGAIN)) {
fprintf(stderr, "Error decoding frame: %s\n",
strerror(DAV1D_ERR(res)));
return res;
}
}
if ((res = dav1d_get_picture(c, p)) < 0) {
if (res != DAV1D_ERR(EAGAIN)) {
fprintf(stderr, "Error decoding frame: %s\n",
strerror(DAV1D_ERR(res)));
return res;
}
} else dav1d_picture_unref(p);
return 0;
}
static int decode_rand(DemuxerContext *const in, Dav1dContext *const c,
Dav1dData *const data, const double fps)
{
int res = 0;
Dav1dPicture p;
const int num_frames = xor128_rand() % (int)(fps * 5);
for (int i = 0; i < num_frames; i++) {
if ((res = decode_frame(&p, c, data))) break;
if (input_read(in, data) || data->sz == 0) break;
}
return res;
}
static int decode_all(DemuxerContext *const in,
Dav1dContext *const c, Dav1dData *const data)
{
int res = 0;
Dav1dPicture p;
do { if ((res = decode_frame(&p, c, data))) break;
} while (!input_read(in, data) && data->sz > 0);
return res;
}
static int seek(DemuxerContext *const in, Dav1dContext *const c,
const uint64_t pts, Dav1dData *const data)
{
int res;
if ((res = input_seek(in, pts))) return res;
Dav1dSequenceHeader seq;
do { if ((res = input_read(in, data))) break;
} while (dav1d_parse_sequence_header(&seq, data->data, data->sz));
dav1d_flush(c);
return res;
}
int main(const int argc, char *const *const argv) {
const char *version = dav1d_version();
if (strcmp(version, DAV1D_VERSION)) {
fprintf(stderr, "Version mismatch (library: %s, executable: %s)\n",
version, DAV1D_VERSION);
return EXIT_FAILURE;
}
CLISettings cli_settings;
Dav1dSettings lib_settings;
DemuxerContext *in;
Dav1dContext *c;
Dav1dData data;
unsigned total, i_fps[2], i_timebase[2];
double timebase, spf, fps;
uint64_t pts;
xor128_srand(get_seed());
parse(argc, argv, &cli_settings, &lib_settings);
if (input_open(&in, "ivf", cli_settings.inputfile,
i_fps, &total, i_timebase) < 0 ||
!i_timebase[0] || !i_timebase[1] || !i_fps[0] || !i_fps[1])
{
return EXIT_SUCCESS;
}
if (dav1d_open(&c, &lib_settings))
return EXIT_FAILURE;
timebase = (double)i_timebase[1] / i_timebase[0];
spf = (double)i_fps[1] / i_fps[0];
fps = (double)i_fps[0] / i_fps[1];
if (fps < 1) goto end;
#define FRAME_OFFSET_TO_PTS(foff) \
(uint64_t)llround(((foff) * spf) * 1000000000.0)
#define TS_TO_PTS(ts) \
(uint64_t)llround(((ts) * timebase) * 1000000000.0)
// seek at random pts
for (int i = 0; i < NUM_RAND_SEEK; i++) {
pts = FRAME_OFFSET_TO_PTS(xor128_rand() % total);
if (seek(in, c, pts, &data)) continue;
if (decode_rand(in, c, &data, fps)) goto end;
}
pts = TS_TO_PTS(data.m.timestamp);
// seek left / right randomly with random intervals within 1s
for (int i = 0, tries = 0;
i - tries < NUM_REL_SEEK && tries < NUM_REL_SEEK / 2;
i++)
{
const int sign = xor128_rand() & 1 ? -1 : +1;
const float diff = (xor128_rand() % 100) / 100.f;
int64_t new_pts = pts + sign * FRAME_OFFSET_TO_PTS(diff * fps);
const int64_t new_ts = llround(new_pts / (timebase * 1000000000.0));
new_pts = TS_TO_PTS(new_ts);
if (new_pts < 0 || (uint64_t)new_pts >= FRAME_OFFSET_TO_PTS(total)) {
if (seek(in, c, FRAME_OFFSET_TO_PTS(total / 2), &data)) break;
pts = TS_TO_PTS(data.m.timestamp);
tries++;
continue;
}
if (seek(in, c, new_pts, &data))
if (seek(in, c, 0, &data)) goto end;
if (decode_rand(in, c, &data, fps)) goto end;
pts = TS_TO_PTS(data.m.timestamp);
}
unsigned shift = 0;
do {
shift += 5;
if (shift > total)
shift = total;
} while (seek(in, c, FRAME_OFFSET_TO_PTS(total - shift), &data));
// simulate seeking after the end of the file
for (int i = 0; i < NUM_END_SEEK; i++) {
if (seek(in, c, FRAME_OFFSET_TO_PTS(total - shift), &data)) goto end;
if (decode_all(in, c, &data)) goto end;
int num_flush = 1 + 64 + xor128_rand() % 64;
while (num_flush--) dav1d_flush(c);
}
end:
input_close(in);
dav1d_close(&c);
return EXIT_SUCCESS;
}
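For orientation, with assumed numbers that are not taken from the patch: FRAME_OFFSET_TO_PTS and TS_TO_PTS above convert a frame offset or a container timestamp into a nanosecond pts. For a 24 fps stream (i_fps = {24, 1}) with a 90 kHz timebase (i_timebase = {90000, 1}), frame offset 48 and timestamp 180000 both land on 2000000000 ns, i.e. two seconds in:

#include <inttypes.h>
#include <math.h>
#include <stdio.h>

int main(void) {
    const double spf      = 1.0 / 24.0;    /* i_fps[1] / i_fps[0] */
    const double timebase = 1.0 / 90000.0; /* i_timebase[1] / i_timebase[0] */
    const uint64_t pts_from_frame = (uint64_t)llround((48 * spf) * 1000000000.0);
    const uint64_t pts_from_ts    = (uint64_t)llround((180000 * timebase) * 1000000000.0);
    /* Both print 2000000000. */
    printf("%" PRIu64 " %" PRIu64 "\n", pts_from_frame, pts_from_ts);
    return 0;
}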

third_party/dav1d/tools/dav1d.c (vendored)

@ -197,7 +197,6 @@ int main(const int argc, char *const *const argv) {
seq_skip);
}
//getc(stdin);
if (cli_settings.limit != 0 && cli_settings.limit < total)
total = cli_settings.limit;

third_party/dav1d/tools/dav1d_cli_parse.c (vendored)

@ -26,6 +26,7 @@
*/
#include "config.h"
#include "cli_config.h"
#include <getopt.h>
#include <limits.h>
@ -51,6 +52,7 @@ enum {
ARG_REALTIME_CACHE,
ARG_FRAME_THREADS,
ARG_TILE_THREADS,
ARG_POSTFILTER_THREADS,
ARG_VERIFY,
ARG_FILM_GRAIN,
ARG_OPPOINT,
@ -73,6 +75,7 @@ static const struct option long_opts[] = {
{ "realtimecache", 1, NULL, ARG_REALTIME_CACHE },
{ "framethreads", 1, NULL, ARG_FRAME_THREADS },
{ "tilethreads", 1, NULL, ARG_TILE_THREADS },
{ "pfthreads", 1, NULL, ARG_POSTFILTER_THREADS },
{ "verify", 1, NULL, ARG_VERIFY },
{ "filmgrain", 1, NULL, ARG_FILM_GRAIN },
{ "oppoint", 1, NULL, ARG_OPPOINT },
@ -82,6 +85,12 @@ static const struct option long_opts[] = {
{ NULL, 0, NULL, 0 },
};
#if HAVE_XXHASH_H
#define AVAILABLE_MUXERS "'md5', 'xxh3', 'yuv', 'yuv4mpeg2' or 'null'"
#else
#define AVAILABLE_MUXERS "'md5', 'yuv', 'yuv4mpeg2' or 'null'"
#endif
#if ARCH_AARCH64 || ARCH_ARM
#define ALLOWED_CPU_MASKS " or 'neon'"
#elif ARCH_PPC64LE
@ -107,7 +116,7 @@ static void usage(const char *const app, const char *const reason, ...) {
" --input/-i $file: input file\n"
" --output/-o $file: output file\n"
" --demuxer $name: force demuxer type ('ivf', 'section5' or 'annexb'; default: detect from content)\n"
" --muxer $name: force muxer type ('md5', 'yuv', 'yuv4mpeg2' or 'null'; default: detect from extension)\n"
" --muxer $name: force muxer type (" AVAILABLE_MUXERS "; default: detect from extension)\n"
" --quiet/-q: disable status messages\n"
" --frametimes $file: dump frame times to file\n"
" --limit/-l $num: stop decoding after $num frames\n"
@ -117,7 +126,8 @@ static void usage(const char *const app, const char *const reason, ...) {
" --version/-v: print version and exit\n"
" --framethreads $num: number of frame threads (default: 1)\n"
" --tilethreads $num: number of tile threads (default: 1)\n"
" --filmgrain $num: enable film grain application (default: 1, except if muxer is md5)\n"
" --pfthreads $num: number of postfilter threads (default: 1)\n"
" --filmgrain $num: enable film grain application (default: 1, except if muxer is md5 or xxh3)\n"
" --oppoint $num: select an operating point of a scalable AV1 bitstream (0 - 31)\n"
" --alllayers $num: output all spatial layers of a scalable AV1 bitstream (default: 1)\n"
" --sizelimit $num: stop decoding if the frame size exceeds the specified limit\n"
@ -198,24 +208,26 @@ static const EnumParseTable cpu_mask_tbl[] = {
{ "avx2", X86_CPU_MASK_AVX2 },
{ "avx512icl", X86_CPU_MASK_AVX512ICL },
#endif
{ 0 },
{ "none", 0 },
};
#define ARRAY_SIZE(n) (sizeof(n)/sizeof(*(n)))
static unsigned parse_enum(char *optarg, const EnumParseTable *const tbl,
const int option, const char *app)
const int tbl_sz, const int option, const char *app)
{
char str[1024];
strcpy(str, "any of ");
for (int n = 0; tbl[n].str; n++) {
for (int n = 0; n < tbl_sz; n++) {
if (!strcmp(tbl[n].str, optarg))
return tbl[n].val;
if (n) {
if (!tbl[n + 1].str)
strcat(str, " or ");
else
if (n < tbl_sz - 1)
strcat(str, ", ");
else
strcat(str, " or ");
}
strcat(str, tbl[n].str);
}
@ -295,6 +307,10 @@ void parse(const int argc, char *const *const argv,
lib_settings->n_tile_threads =
parse_unsigned(optarg, ARG_TILE_THREADS, argv[0]);
break;
case ARG_POSTFILTER_THREADS:
lib_settings->n_postfilter_threads =
parse_unsigned(optarg, ARG_POSTFILTER_THREADS, argv[0]);
break;
case ARG_VERIFY:
cli_settings->verify = optarg;
break;
@ -325,7 +341,7 @@ void parse(const int argc, char *const *const argv,
fprintf(stderr, "%s\n", dav1d_version());
exit(0);
case ARG_CPU_MASK:
dav1d_set_cpu_flags_mask(parse_enum(optarg, cpu_mask_tbl,
dav1d_set_cpu_flags_mask(parse_enum(optarg, cpu_mask_tbl, ARRAY_SIZE(cpu_mask_tbl),
ARG_CPU_MASK, argv[0]));
break;
default:
@ -338,8 +354,11 @@ void parse(const int argc, char *const *const argv,
if (cli_settings->verify) {
if (cli_settings->outputfile)
usage(argv[0], "Verification (--verify) requires output file (-o/--output) to not set");
if (cli_settings->muxer && !strcmp(cli_settings->muxer, "md5"))
usage(argv[0], "Verification (--verify) requires the md5 muxer (--muxer md5)");
if (cli_settings->muxer && strcmp(cli_settings->muxer, "md5") &&
strcmp(cli_settings->muxer, "xxh3"))
{
usage(argv[0], "Verification (--verify) requires a checksum muxer (md5 or xxh3)");
}
cli_settings->outputfile = "-";
if (!cli_settings->muxer)
@ -347,7 +366,8 @@ void parse(const int argc, char *const *const argv,
}
if (!grain_specified && cli_settings->muxer &&
!strcmp(cli_settings->muxer, "md5"))
(!strcmp(cli_settings->muxer, "md5") ||
!strcmp(cli_settings->muxer, "xxh3")))
{
lib_settings->apply_grain = 0;
}

third_party/dav1d/tools/input/annexb.c (vendored)

@ -191,5 +191,6 @@ const Demuxer annexb_demuxer = {
.probe_sz = PROBE_SIZE,
.open = annexb_open,
.read = annexb_read,
.seek = NULL,
.close = annexb_close,
};

Some files were not shown because too many files changed in this diff.