Bug 1688992 - Update libdav1d to 0.8.2 for Firefox 88. r=dminor

Differential Revision: https://phabricator.services.mozilla.com/D106197
Jon Bauman 2021-02-24 23:05:38 +00:00
Parent a34308d565
Commit 7c5470c9ff
109 changed files: 14300 additions and 5741 deletions

View file

@ -25,6 +25,10 @@ The rough steps are:
- Update ./moz.build and ./asm/moz.build to add new files and remove deleted ones using
third_party/dav1d/src/meson.build as a guide (confirm with the diff) (note the
empty .asm file in x86_64)
- Some files will be automatically added to the various autovendored_sources.mozbuild files.
  In the case of the asm dir, these may cause build failures on particular platforms, which
  can be resolved by moving those files out of autovendored_sources.mozbuild and into the
  regular moz.build, which has a condition on CONFIG['CPU_ARCH'] (see the sketch after this list).
- Clone the tag from the dav1d repo and build a stand-alone libdav1d following the steps here:
https://code.videolan.org/videolan/dav1d#compile
- Copy vcs_version.h from the local build/include/vcs_version.h
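
As an illustration of the autovendored note above, this is roughly what the arch-guarded list in asm/moz.build looks like after such a move; the snippet is a sketch assembled from the hunks shown below, not the complete build file:

# Sketch only: an .asm file that breaks the build on some platforms is removed
# from autovendored_sources.mozbuild and listed under the CPU_ARCH guard instead.
if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
    SOURCES += [
        '../../../third_party/dav1d/src/x86/cdef16_avx2.asm',  # moved from autovendored
    ]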

View file

@ -1,4 +1,5 @@
sources = [
'../../../third_party/dav1d/src/x86/cdef16_sse.asm',
'../../../third_party/dav1d/src/x86/cdef_sse.asm',
'../../../third_party/dav1d/src/x86/cpuid.asm',
'../../../third_party/dav1d/src/x86/film_grain_ssse3.asm',

View file

@ -83,6 +83,7 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
# Empty file on all other archs. Nasm produces
# an error when it compiles empty files.
SOURCES += [
'../../../third_party/dav1d/src/x86/cdef16_avx2.asm', # moved from autovendored
'../../../third_party/dav1d/src/x86/cdef_avx2.asm',
'../../../third_party/dav1d/src/x86/cdef_avx512.asm',
'../../../third_party/dav1d/src/x86/film_grain.asm',
@ -90,6 +91,7 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
'../../../third_party/dav1d/src/x86/itx.asm',
'../../../third_party/dav1d/src/x86/loopfilter.asm',
'../../../third_party/dav1d/src/x86/looprestoration.asm',
'../../../third_party/dav1d/src/x86/looprestoration16_avx2.asm', # moved from autovendored
'../../../third_party/dav1d/src/x86/mc_avx2.asm',
'../../../third_party/dav1d/src/x86/mc_avx512.asm',
]
@ -185,7 +187,9 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64':
'../../../third_party/dav1d/src/arm/32/cdef16.S',
'../../../third_party/dav1d/src/arm/32/cdef_tmpl.S',
'../../../third_party/dav1d/src/arm/32/ipred.S',
'../../../third_party/dav1d/src/arm/32/ipred16.S',
'../../../third_party/dav1d/src/arm/32/itx.S',
'../../../third_party/dav1d/src/arm/32/itx16.S',
'../../../third_party/dav1d/src/arm/32/loopfilter.S',
'../../../third_party/dav1d/src/arm/32/loopfilter16.S',
'../../../third_party/dav1d/src/arm/32/looprestoration.S',

View file

@ -163,6 +163,7 @@ EXPORTS.dav1d += [
'../../third_party/dav1d/include/common/attributes.h',
'../../third_party/dav1d/include/common/bitdepth.h',
'../../third_party/dav1d/include/common/dump.h',
'../../third_party/dav1d/include/common/frame.h',
'../../third_party/dav1d/include/common/intops.h',
'../../third_party/dav1d/include/common/validate.h',
]

View file

@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: commit 6ed5fafb42c651c24b6a65fd4f50ed426fd72d65 (2021-01-01T21:36:25.000+01:00).
release: commit f06148e7c755098666b9c0ed97a672a51785413a (2021-02-21T21:40:09.000+01:00).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: 6ed5fafb42c651c24b6a65fd4f50ed426fd72d65
revision: f06148e7c755098666b9c0ed97a672a51785413a
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/

View file

@ -1,2 +1,2 @@
/* auto-generated, do not edit */
#define DAV1D_VERSION "0.8.1-0-g6ed5faf"
#define DAV1D_VERSION "0.8.2-0-gf06148e"

View file

@ -29,6 +29,6 @@
#define DAV1D_API_VERSION_MAJOR 5
#define DAV1D_API_VERSION_MINOR 0
#define DAV1D_API_VERSION_PATCH 0
#define DAV1D_API_VERSION_PATCH 1
#endif /* DAV1D_VERSION_H */

25
third_party/dav1d/NEWS vendored
View file

@ -1,4 +1,25 @@
Changes for 0.8.1 'Eurasian hobby":
Changes for 0.8.2 'Eurasian hobby':
-----------------------------------
0.8.2 is a middle-size update of the 0.8.0 branch:
- ARM32 optimizations for ipred and itx in 10/12bits,
completing the 10b/12b work on ARM64 and ARM32
- Give the post-filters their own threads
- ARM64: rewrite the wiener functions
- Speed up coefficient decoding, 0.5%-3% global decoding gain
- x86 optimizations for CDEF_filter and wiener in 10/12bit
- x86: rewrite the SGR AVX2 asm
- x86: improve msac speed on SSE2+ machines
- ARM32: improve speed of ipred and warp
- ARM64: improve speed of ipred, cdef_dir, cdef_filter, warp_motion and itx16
- ARM32/64: improve speed of looprestoration
- Add seeking, pausing to the player
- Update the player for rendering of 10b/12b
- Misc speed improvements and fixes on all platforms
- Add a xxh3 muxer in the dav1d application
Changes for 0.8.1 'Eurasian hobby':
-----------------------------------
0.8.1 is a minor update on 0.8.0:
@ -10,7 +31,7 @@ Changes for 0.8.1 'Eurasian hobby":
- x86 optimizations for wiener in SSE2/SSSE3/AVX2
Changes for 0.8.0 'Eurasian hobby":
Changes for 0.8.0 'Eurasian hobby':
-----------------------------------
0.8.0 is a major update for dav1d:

427
third_party/dav1d/examples/dav1dplay.c vendored
View file

@ -39,6 +39,11 @@
#include "dp_fifo.h"
#include "dp_renderer.h"
#define FRAME_OFFSET_TO_PTS(foff) \
(uint64_t)(((foff) * rd_ctx->spf) * 1000000000.0 + .5)
#define TS_TO_PTS(ts) \
(uint64_t)(((ts) * rd_ctx->timebase) * 1000000000.0 + .5)
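
/* Illustrative note, not part of the patch: with a 24/1 fps stream, spf = 1/24 s,
 * so FRAME_OFFSET_TO_PTS(48) = (uint64_t)(48 * (1.0/24) * 1e9 + .5) = 2000000000 ns (2 s);
 * with millisecond container timestamps, timebase = 1/1000 s, so
 * TS_TO_PTS(500) = (uint64_t)(500 * 0.001 * 1e9 + .5) = 500000000 ns (0.5 s). */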
// Selected renderer callbacks and cookie
static const Dav1dPlayRenderInfo *renderer_info = { NULL };
@ -59,27 +64,43 @@ typedef struct render_context
// Lock to protect access to the context structure
SDL_mutex *lock;
// Timestamp of previous decoded frame
int64_t last_pts;
// Timestamp of current decoded frame
int64_t current_pts;
// Timestamp of last displayed frame (in timebase unit)
int64_t last_ts;
// Timestamp of last decoded frame (in timebase unit)
int64_t current_ts;
// Ticks when last frame was received
uint32_t last_ticks;
// PTS time base
double timebase;
// Seconds per frame
double spf;
// Number of frames
uint32_t total;
// Fifo
Dav1dPlayPtrFifo *fifo;
// Custom SDL2 event type
uint32_t renderer_event_type;
// Custom SDL2 event types
uint32_t event_types;
// User pause state
uint8_t user_paused;
// Internal pause state
uint8_t paused;
// Start of internal pause state
uint32_t pause_start;
// Duration of internal pause state
uint32_t pause_time;
// Seek accumulator
int seek;
// Indicates if termination of the decoder thread was requested
uint8_t dec_should_terminate;
} Dav1dPlayRenderContext;
static void dp_settings_print_usage(const char *const app,
const char *const reason, ...)
const char *const reason, ...)
{
if (reason) {
va_list args;
@ -95,6 +116,7 @@ static void dp_settings_print_usage(const char *const app,
" --untimed/-u: ignore PTS, render as fast as possible\n"
" --framethreads $num: number of frame threads (default: 1)\n"
" --tilethreads $num: number of tile threads (default: 1)\n"
" --pfthreads $num: number of postfilter threads(default: 1)\n"
" --highquality: enable high quality rendering\n"
" --zerocopy/-z: enable zero copy upload path\n"
" --gpugrain/-g: enable GPU grain synthesis\n"
@ -115,7 +137,7 @@ static unsigned parse_unsigned(const char *const optarg, const int option,
}
static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
const int argc, char *const *const argv)
const int argc, char *const *const argv)
{
int o;
Dav1dPlaySettings *settings = &rd_ctx->settings;
@ -127,6 +149,7 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
enum {
ARG_FRAME_THREADS = 256,
ARG_TILE_THREADS,
ARG_POSTFILTER_THREADS,
ARG_HIGH_QUALITY,
};
@ -137,6 +160,7 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
{ "untimed", 0, NULL, 'u' },
{ "framethreads", 1, NULL, ARG_FRAME_THREADS },
{ "tilethreads", 1, NULL, ARG_TILE_THREADS },
{ "pfthreads", 1, NULL, ARG_POSTFILTER_THREADS },
{ "highquality", 0, NULL, ARG_HIGH_QUALITY },
{ "zerocopy", 0, NULL, 'z' },
{ "gpugrain", 0, NULL, 'g' },
@ -175,6 +199,10 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
lib_settings->n_tile_threads =
parse_unsigned(optarg, ARG_TILE_THREADS, argv[0]);
break;
case ARG_POSTFILTER_THREADS:
lib_settings->n_postfilter_threads =
parse_unsigned(optarg, ARG_POSTFILTER_THREADS, argv[0]);
break;
default:
dp_settings_print_usage(argv[0], NULL);
}
@ -213,16 +241,16 @@ static Dav1dPlayRenderContext *dp_rd_ctx_create(int argc, char **argv)
Dav1dPlayRenderContext *rd_ctx;
// Alloc
rd_ctx = malloc(sizeof(Dav1dPlayRenderContext));
rd_ctx = calloc(1, sizeof(Dav1dPlayRenderContext));
if (rd_ctx == NULL) {
return NULL;
}
// Register a custom event to notify our SDL main thread
// about new frames
rd_ctx->renderer_event_type = SDL_RegisterEvents(1);
if (rd_ctx->renderer_event_type == UINT32_MAX) {
fprintf(stderr, "Failure to create custom SDL event type!\n");
rd_ctx->event_types = SDL_RegisterEvents(3);
if (rd_ctx->event_types == UINT32_MAX) {
fprintf(stderr, "Failure to create custom SDL event types!\n");
free(rd_ctx);
return NULL;
}
@ -265,24 +293,17 @@ static Dav1dPlayRenderContext *dp_rd_ctx_create(int argc, char **argv)
return NULL;
}
rd_ctx->last_pts = 0;
rd_ctx->last_ticks = 0;
rd_ctx->current_pts = 0;
rd_ctx->timebase = 0;
rd_ctx->dec_should_terminate = 0;
return rd_ctx;
}
/**
* Notify about new available frame
* Notify about new event
*/
static void dp_rd_ctx_post_event(Dav1dPlayRenderContext *rd_ctx, uint32_t code)
static void dp_rd_ctx_post_event(Dav1dPlayRenderContext *rd_ctx, uint32_t type)
{
SDL_Event event;
SDL_zero(event);
event.type = rd_ctx->renderer_event_type;
event.user.code = code;
event.type = type;
SDL_PushEvent(&event);
}
@ -294,10 +315,137 @@ static void dp_rd_ctx_post_event(Dav1dPlayRenderContext *rd_ctx, uint32_t code)
* new picture.
*/
static void dp_rd_ctx_update_with_dav1d_picture(Dav1dPlayRenderContext *rd_ctx,
Dav1dPicture *dav1d_pic)
Dav1dPicture *dav1d_pic)
{
rd_ctx->current_ts = dav1d_pic->m.timestamp;
renderer_info->update_frame(rd_ctx->rd_priv, dav1d_pic, &rd_ctx->settings);
rd_ctx->current_pts = dav1d_pic->m.timestamp;
}
/**
* Toggle pause state
*/
static void dp_rd_ctx_toggle_pause(Dav1dPlayRenderContext *rd_ctx)
{
SDL_LockMutex(rd_ctx->lock);
rd_ctx->user_paused = !rd_ctx->user_paused;
if (rd_ctx->seek)
goto out;
rd_ctx->paused = rd_ctx->user_paused;
uint32_t now = SDL_GetTicks();
if (rd_ctx->paused)
rd_ctx->pause_start = now;
else {
rd_ctx->pause_time += now - rd_ctx->pause_start;
rd_ctx->pause_start = 0;
rd_ctx->last_ticks = now;
}
out:
SDL_UnlockMutex(rd_ctx->lock);
}
/**
* Query pause state
*/
static int dp_rd_ctx_is_paused(Dav1dPlayRenderContext *rd_ctx)
{
int ret;
SDL_LockMutex(rd_ctx->lock);
ret = rd_ctx->paused;
SDL_UnlockMutex(rd_ctx->lock);
return ret;
}
/**
* Request seeking, in seconds
*/
static void dp_rd_ctx_seek(Dav1dPlayRenderContext *rd_ctx, int sec)
{
SDL_LockMutex(rd_ctx->lock);
rd_ctx->seek += sec;
if (!rd_ctx->paused)
rd_ctx->pause_start = SDL_GetTicks();
rd_ctx->paused = 1;
SDL_UnlockMutex(rd_ctx->lock);
}
static int decode_frame(Dav1dPicture **p, Dav1dContext *c,
Dav1dData *data, DemuxerContext *in_ctx);
static inline void destroy_pic(void *a);
/**
* Seek the stream, if requested
*/
static int dp_rd_ctx_handle_seek(Dav1dPlayRenderContext *rd_ctx,
DemuxerContext *in_ctx,
Dav1dContext *c, Dav1dData *data)
{
int res = 0;
SDL_LockMutex(rd_ctx->lock);
if (!rd_ctx->seek)
goto out;
int64_t seek = rd_ctx->seek * 1000000000ULL;
uint64_t pts = TS_TO_PTS(rd_ctx->current_ts);
pts = ((int64_t)pts > -seek) ? pts + seek : 0;
int end = pts >= FRAME_OFFSET_TO_PTS(rd_ctx->total);
if (end)
pts = FRAME_OFFSET_TO_PTS(rd_ctx->total - 1);
uint64_t target_pts = pts;
dav1d_flush(c);
uint64_t shift = FRAME_OFFSET_TO_PTS(5);
while (1) {
if (shift > pts)
shift = pts;
if ((res = input_seek(in_ctx, pts - shift)))
goto out;
Dav1dSequenceHeader seq;
uint64_t cur_pts;
do {
if ((res = input_read(in_ctx, data)))
break;
cur_pts = TS_TO_PTS(data->m.timestamp);
res = dav1d_parse_sequence_header(&seq, data->data, data->sz);
} while (res && cur_pts < pts);
if (!res && cur_pts <= pts)
break;
if (shift > pts)
shift = pts;
pts -= shift;
}
if (!res) {
pts = TS_TO_PTS(data->m.timestamp);
while (pts < target_pts) {
Dav1dPicture *p;
if ((res = decode_frame(&p, c, data, in_ctx)))
break;
if (p) {
pts = TS_TO_PTS(p->m.timestamp);
if (pts < target_pts)
destroy_pic(p);
else {
dp_fifo_push(rd_ctx->fifo, p);
uint32_t type = rd_ctx->event_types + DAV1D_EVENT_SEEK_FRAME;
dp_rd_ctx_post_event(rd_ctx, type);
}
}
}
if (!res) {
rd_ctx->last_ts = data->m.timestamp - rd_ctx->spf / rd_ctx->timebase;
rd_ctx->current_ts = data->m.timestamp;
}
}
out:
rd_ctx->paused = rd_ctx->user_paused;
if (!rd_ctx->paused && rd_ctx->seek) {
uint32_t now = SDL_GetTicks();
rd_ctx->pause_time += now - rd_ctx->pause_start;
rd_ctx->pause_start = 0;
rd_ctx->last_ticks = now;
}
rd_ctx->seek = 0;
SDL_UnlockMutex(rd_ctx->lock);
if (res)
fprintf(stderr, "Error seeking, aborting\n");
return res;
}
/**
@ -329,14 +477,15 @@ static int dp_rd_ctx_should_terminate(Dav1dPlayRenderContext *rd_ctx)
*/
static void dp_rd_ctx_render(Dav1dPlayRenderContext *rd_ctx)
{
SDL_LockMutex(rd_ctx->lock);
// Calculate time since last frame was received
uint32_t ticks_now = SDL_GetTicks();
uint32_t ticks_diff = (rd_ctx->last_ticks != 0) ? ticks_now - rd_ctx->last_ticks : 0;
// Calculate when to display the frame
int64_t pts_diff = rd_ctx->current_pts - rd_ctx->last_pts;
int32_t wait_time = (pts_diff * rd_ctx->timebase) * 1000 - ticks_diff;
rd_ctx->last_pts = rd_ctx->current_pts;
int64_t ts_diff = rd_ctx->current_ts - rd_ctx->last_ts;
int32_t pts_diff = (ts_diff * rd_ctx->timebase) * 1000.0 + .5;
int32_t wait_time = pts_diff - ticks_diff;
// In untimed mode, simply don't wait
if (rd_ctx->settings.untimed)
@ -347,13 +496,59 @@ static void dp_rd_ctx_render(Dav1dPlayRenderContext *rd_ctx)
// accurate player this would need to be done in a better way.
if (wait_time > 0) {
SDL_Delay(wait_time);
} else if (wait_time < -10) { // Do not warn for minor time drifts
fprintf(stderr, "Frame displayed %f seconds too late\n", wait_time/(float)1000);
} else if (wait_time < -10 && !rd_ctx->paused) { // Do not warn for minor time drifts
fprintf(stderr, "Frame displayed %f seconds too late\n", wait_time / 1000.0);
}
renderer_info->render(rd_ctx->rd_priv, &rd_ctx->settings);
rd_ctx->last_ts = rd_ctx->current_ts;
rd_ctx->last_ticks = SDL_GetTicks();
SDL_UnlockMutex(rd_ctx->lock);
}
static int decode_frame(Dav1dPicture **p, Dav1dContext *c,
Dav1dData *data, DemuxerContext *in_ctx)
{
int res;
// Send data packets we got from the demuxer to dav1d
if ((res = dav1d_send_data(c, data)) < 0) {
// On EAGAIN, dav1d can not consume more data and
// dav1d_get_picture needs to be called first, which
// will happen below, so just keep going in that case
// and do not error out.
if (res != DAV1D_ERR(EAGAIN)) {
dav1d_data_unref(data);
goto err;
}
}
*p = calloc(1, sizeof(**p));
// Try to get a decoded frame
if ((res = dav1d_get_picture(c, *p)) < 0) {
// In all error cases, even EAGAIN, p needs to be freed as
// it is never added to the queue and would leak.
free(*p);
*p = NULL;
// On EAGAIN, it means dav1d has not enough data to decode
// therefore this is not a decoding error but just means
// we need to feed it more data, which happens in the next
// run of the decoder loop.
if (res != DAV1D_ERR(EAGAIN))
goto err;
}
return data->sz == 0 ? input_read(in_ctx, data) : 0;
err:
fprintf(stderr, "Error decoding frame: %s\n",
strerror(-res));
return res;
}
static inline void destroy_pic(void *a)
{
Dav1dPicture *p = (Dav1dPicture *)a;
dav1d_picture_unref(p);
free(p);
}
/* Decoder thread "main" function */
@ -366,10 +561,7 @@ static int decoder_thread_main(void *cookie)
Dav1dData data;
DemuxerContext *in_ctx = NULL;
int res = 0;
unsigned n_out = 0, total, timebase[2], fps[2];
// Store current ticks for stats calculation
uint32_t decoder_start = SDL_GetTicks();
unsigned total, timebase[2], fps[2];
Dav1dPlaySettings settings = rd_ctx->settings;
@ -382,8 +574,9 @@ static int decoder_thread_main(void *cookie)
goto cleanup;
}
double timebase_d = timebase[1]/(double)timebase[0];
rd_ctx->timebase = timebase_d;
rd_ctx->timebase = (double)timebase[1] / timebase[0];
rd_ctx->spf = (double)fps[1] / fps[0];
rd_ctx->total = total;
if ((res = dav1d_open(&c, &rd_ctx->lib_settings))) {
fprintf(stderr, "Failed opening dav1d decoder\n");
@ -398,55 +591,29 @@ static int decoder_thread_main(void *cookie)
}
// Decoder loop
do {
if (dp_rd_ctx_should_terminate(rd_ctx))
while (1) {
if (dp_rd_ctx_should_terminate(rd_ctx) ||
(res = dp_rd_ctx_handle_seek(rd_ctx, in_ctx, c, &data)) ||
(res = decode_frame(&p, c, &data, in_ctx)))
{
break;
// Send data packets we got from the demuxer to dav1d
if ((res = dav1d_send_data(c, &data)) < 0) {
// On EAGAIN, dav1d can not consume more data and
// dav1d_get_picture needs to be called first, which
// will happen below, so just keep going in that case
// and do not error out.
if (res != DAV1D_ERR(EAGAIN)) {
dav1d_data_unref(&data);
fprintf(stderr, "Error decoding frame: %s\n",
strerror(-res));
break;
}
}
p = calloc(1, sizeof(*p));
// Try to get a decoded frame
if ((res = dav1d_get_picture(c, p)) < 0) {
// In all error cases, even EAGAIN, p needs to be freed as
// it is never added to the queue and would leak.
free(p);
// On EAGAIN, it means dav1d has not enough data to decode
// therefore this is not a decoding error but just means
// we need to feed it more data, which happens in the next
// run of this decoder loop.
if (res != DAV1D_ERR(EAGAIN)) {
fprintf(stderr, "Error decoding frame: %s\n",
strerror(-res));
break;
}
res = 0;
} else {
else if (p) {
// Queue frame
dp_fifo_push(rd_ctx->fifo, p);
dp_rd_ctx_post_event(rd_ctx, DAV1D_EVENT_NEW_FRAME);
n_out++;
SDL_LockMutex(rd_ctx->lock);
int seek = rd_ctx->seek;
SDL_UnlockMutex(rd_ctx->lock);
if (!seek) {
dp_fifo_push(rd_ctx->fifo, p);
uint32_t type = rd_ctx->event_types + DAV1D_EVENT_NEW_FRAME;
dp_rd_ctx_post_event(rd_ctx, type);
}
}
} while ((data.sz > 0 || !input_read(in_ctx, &data)));
}
// Release remaining data
if (data.sz > 0) dav1d_data_unref(&data);
if (data.sz > 0)
dav1d_data_unref(&data);
// Do not drain in case an error occured and caused us to leave the
// decoding loop early.
if (res < 0)
@ -461,7 +628,6 @@ static int decoder_thread_main(void *cookie)
do {
if (dp_rd_ctx_should_terminate(rd_ctx))
break;
p = calloc(1, sizeof(*p));
res = dav1d_get_picture(c, p);
if (res < 0) {
@ -474,19 +640,13 @@ static int decoder_thread_main(void *cookie)
} else {
// Queue frame
dp_fifo_push(rd_ctx->fifo, p);
dp_rd_ctx_post_event(rd_ctx, DAV1D_EVENT_NEW_FRAME);
n_out++;
uint32_t type = rd_ctx->event_types + DAV1D_EVENT_NEW_FRAME;
dp_rd_ctx_post_event(rd_ctx, type);
}
} while (res != DAV1D_ERR(EAGAIN));
// Print stats
uint32_t decoding_time_ms = SDL_GetTicks() - decoder_start;
printf("Decoded %u frames in %d seconds, avg %.02f fps\n",
n_out, decoding_time_ms/1000, n_out / (decoding_time_ms / 1000.0));
cleanup:
dp_rd_ctx_post_event(rd_ctx, DAV1D_EVENT_DEC_QUIT);
dp_rd_ctx_post_event(rd_ctx, rd_ctx->event_types + DAV1D_EVENT_DEC_QUIT);
if (in_ctx)
input_close(in_ctx);
@ -543,41 +703,84 @@ int main(int argc, char **argv)
decoder_thread = SDL_CreateThread(decoder_thread_main, "Decoder thread", rd_ctx);
// Main loop
#define NUM_MAX_EVENTS 8
SDL_Event events[NUM_MAX_EVENTS];
int num_frame_events = 0;
uint32_t start_time = 0, n_out = 0;
while (1) {
SDL_Event e;
if (SDL_WaitEvent(&e)) {
if (e.type == SDL_QUIT) {
int num_events = 0;
SDL_WaitEvent(NULL);
while (num_events < NUM_MAX_EVENTS && SDL_PollEvent(&events[num_events++]))
break;
for (int i = 0; i < num_events; ++i) {
SDL_Event *e = &events[i];
if (e->type == SDL_QUIT) {
dp_rd_ctx_request_shutdown(rd_ctx);
} else if (e.type == SDL_WINDOWEVENT) {
if (e.window.event == SDL_WINDOWEVENT_SIZE_CHANGED) {
dp_fifo_flush(rd_ctx->fifo, destroy_pic);
SDL_FlushEvent(rd_ctx->event_types + DAV1D_EVENT_NEW_FRAME);
SDL_FlushEvent(rd_ctx->event_types + DAV1D_EVENT_SEEK_FRAME);
num_frame_events = 0;
} else if (e->type == SDL_WINDOWEVENT) {
if (e->window.event == SDL_WINDOWEVENT_SIZE_CHANGED) {
// TODO: Handle window resizes
} else if(e->window.event == SDL_WINDOWEVENT_EXPOSED) {
dp_rd_ctx_render(rd_ctx);
}
} else if (e.type == rd_ctx->renderer_event_type) {
if (e.user.code == DAV1D_EVENT_NEW_FRAME) {
// Dequeue frame and update the render context with it
Dav1dPicture *p = dp_fifo_shift(rd_ctx->fifo);
// Do not update textures during termination
if (!dp_rd_ctx_should_terminate(rd_ctx))
dp_rd_ctx_update_with_dav1d_picture(rd_ctx, p);
dav1d_picture_unref(p);
free(p);
} else if (e.user.code == DAV1D_EVENT_DEC_QUIT) {
break;
} else if (e->type == SDL_KEYDOWN) {
SDL_KeyboardEvent *kbde = (SDL_KeyboardEvent *)e;
if (kbde->keysym.sym == SDLK_SPACE) {
dp_rd_ctx_toggle_pause(rd_ctx);
} else if (kbde->keysym.sym == SDLK_LEFT ||
kbde->keysym.sym == SDLK_RIGHT)
{
if (kbde->keysym.sym == SDLK_LEFT)
dp_rd_ctx_seek(rd_ctx, -5);
else if (kbde->keysym.sym == SDLK_RIGHT)
dp_rd_ctx_seek(rd_ctx, +5);
dp_fifo_flush(rd_ctx->fifo, destroy_pic);
SDL_FlushEvent(rd_ctx->event_types + DAV1D_EVENT_NEW_FRAME);
num_frame_events = 0;
}
} else if (e->type == rd_ctx->event_types + DAV1D_EVENT_NEW_FRAME) {
num_frame_events++;
// Store current ticks for stats calculation
if (start_time == 0)
start_time = SDL_GetTicks();
} else if (e->type == rd_ctx->event_types + DAV1D_EVENT_SEEK_FRAME) {
// Dequeue frame and update the render context with it
Dav1dPicture *p = dp_fifo_shift(rd_ctx->fifo);
// Do not update textures during termination
if (!dp_rd_ctx_should_terminate(rd_ctx)) {
dp_rd_ctx_update_with_dav1d_picture(rd_ctx, p);
n_out++;
}
destroy_pic(p);
} else if (e->type == rd_ctx->event_types + DAV1D_EVENT_DEC_QUIT) {
goto out;
}
}
// Do not render during termination
if (!dp_rd_ctx_should_terminate(rd_ctx))
dp_rd_ctx_render(rd_ctx);
if (num_frame_events && !dp_rd_ctx_is_paused(rd_ctx)) {
// Dequeue frame and update the render context with it
Dav1dPicture *p = dp_fifo_shift(rd_ctx->fifo);
// Do not update textures during termination
if (!dp_rd_ctx_should_terminate(rd_ctx)) {
dp_rd_ctx_update_with_dav1d_picture(rd_ctx, p);
dp_rd_ctx_render(rd_ctx);
n_out++;
}
destroy_pic(p);
num_frame_events--;
}
}
out:;
// Print stats
uint32_t time_ms = SDL_GetTicks() - start_time - rd_ctx->pause_time;
printf("Decoded %u frames in %d seconds, avg %.02f fps\n",
n_out, time_ms / 1000, n_out/ (time_ms / 1000.0));
int decoder_ret = 0;
SDL_WaitThread(decoder_thread, &decoder_ret);
dp_rd_ctx_destroy(rd_ctx);
return decoder_ret;
}

28
third_party/dav1d/examples/dp_fifo.c vendored
View file

@ -37,6 +37,8 @@ struct dp_fifo
size_t capacity;
size_t count;
void **entries;
int push_wait;
int flush;
};
@ -54,6 +56,8 @@ Dav1dPlayPtrFifo *dp_fifo_create(size_t capacity)
fifo->capacity = capacity;
fifo->count = 0;
fifo->push_wait = 0;
fifo->flush = 0;
fifo->lock = SDL_CreateMutex();
if (fifo->lock == NULL) {
@ -90,8 +94,16 @@ void dp_fifo_destroy(Dav1dPlayPtrFifo *fifo)
void dp_fifo_push(Dav1dPlayPtrFifo *fifo, void *element)
{
SDL_LockMutex(fifo->lock);
while (fifo->count == fifo->capacity)
while (fifo->count == fifo->capacity) {
fifo->push_wait = 1;
SDL_CondWait(fifo->cond_change, fifo->lock);
fifo->push_wait = 0;
if (fifo->flush) {
SDL_CondSignal(fifo->cond_change);
SDL_UnlockMutex(fifo->lock);
return;
}
}
fifo->entries[fifo->count++] = element;
if (fifo->count == 1)
SDL_CondSignal(fifo->cond_change);
@ -120,4 +132,16 @@ void *dp_fifo_shift(Dav1dPlayPtrFifo *fifo)
return res;
}
void dp_fifo_flush(Dav1dPlayPtrFifo *fifo, void (*destroy_elem)(void *))
{
SDL_LockMutex(fifo->lock);
fifo->flush = 1;
if (fifo->push_wait) {
SDL_CondSignal(fifo->cond_change);
SDL_CondWait(fifo->cond_change, fifo->lock);
}
while (fifo->count)
destroy_elem(fifo->entries[--fifo->count]);
fifo->flush = 0;
SDL_UnlockMutex(fifo->lock);
}

2
third_party/dav1d/examples/dp_fifo.h vendored
View file

@ -59,3 +59,5 @@ void *dp_fifo_shift(Dav1dPlayPtrFifo *fifo);
* other thread will call dp_fifo_shift will lead to a deadlock.
*/
void dp_fifo_push(Dav1dPlayPtrFifo *fifo, void *element);
void dp_fifo_flush(Dav1dPlayPtrFifo *fifo, void (*destroy_elem)(void *));

9
third_party/dav1d/examples/dp_renderer.h vendored
View file

@ -66,8 +66,11 @@ typedef struct {
#define WINDOW_WIDTH 910
#define WINDOW_HEIGHT 512
#define DAV1D_EVENT_NEW_FRAME 1
#define DAV1D_EVENT_DEC_QUIT 2
enum {
DAV1D_EVENT_NEW_FRAME,
DAV1D_EVENT_SEEK_FRAME,
DAV1D_EVENT_DEC_QUIT
};
/**
* Renderer info
@ -84,7 +87,7 @@ typedef struct rdr_info
void (*destroy_renderer)(void *cookie);
// Callback to the render function that renders a prevously sent frame
void (*render)(void *cookie, const Dav1dPlaySettings *settings);
// Callback to the send frame function
// Callback to the send frame function, _may_ also unref dav1d_pic!
int (*update_frame)(void *cookie, Dav1dPicture *dav1d_pic,
const Dav1dPlaySettings *settings);
// Callback for alloc/release pictures (optional)

View file

@ -30,7 +30,7 @@
#include <assert.h>
#include <libplacebo/renderer.h>
#include <libplacebo/utils/upload.h>
#include <libplacebo/utils/dav1d.h>
#ifdef HAVE_PLACEBO_VULKAN
# include <libplacebo/vulkan.h>
@ -72,7 +72,7 @@ typedef struct renderer_priv_ctx
// Lock protecting access to the texture
SDL_mutex *lock;
// Image to render, and planes backing them
struct pl_image image;
struct pl_frame image;
const struct pl_tex *plane_tex[3];
} Dav1dPlayRendererPrivateContext;
@ -319,22 +319,15 @@ static void placebo_render(void *cookie, const Dav1dPlaySettings *settings)
if (settings->highquality)
render_params = pl_render_default_params;
struct pl_render_target target;
pl_render_target_from_swapchain(&target, &frame);
target.profile = (struct pl_icc_profile) {
.data = NULL,
.len = 0,
};
#if PL_API_VER >= 66
pl_rect2df_aspect_copy(&target.dst_rect, &rd_priv_ctx->image.src_rect, 0.0);
if (pl_render_target_partial(&target))
pl_tex_clear(rd_priv_ctx->gpu, target.fbo, (float[4]){ 0.0 });
#endif
struct pl_frame target;
pl_frame_from_swapchain(&target, &frame);
pl_rect2df_aspect_copy(&target.crop, &rd_priv_ctx->image.crop, 0.0);
if (pl_frame_is_cropped(&target))
pl_tex_clear(rd_priv_ctx->gpu, frame.fbo, (float[4]){ 0.0 });
if (!pl_render_image(rd_priv_ctx->renderer, &rd_priv_ctx->image, &target, &render_params)) {
fprintf(stderr, "Failed rendering frame!\n");
pl_tex_clear(rd_priv_ctx->gpu, target.fbo, (float[4]){ 1.0 });
pl_tex_clear(rd_priv_ctx->gpu, frame.fbo, (float[4]){ 1.0 });
}
ok = pl_swapchain_submit_frame(rd_priv_ctx->swapchain);
@ -351,320 +344,37 @@ static void placebo_render(void *cookie, const Dav1dPlaySettings *settings)
static int placebo_upload_image(void *cookie, Dav1dPicture *dav1d_pic,
const Dav1dPlaySettings *settings)
{
Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
assert(rd_priv_ctx != NULL);
Dav1dPlayRendererPrivateContext *p = cookie;
assert(p != NULL);
int ret = 0;
SDL_LockMutex(rd_priv_ctx->lock);
if (!dav1d_pic)
return ret;
if (dav1d_pic == NULL) {
SDL_UnlockMutex(rd_priv_ctx->lock);
return 0;
}
int width = dav1d_pic->p.w;
int height = dav1d_pic->p.h;
int sub_x = 0, sub_y = 0;
int bytes = (dav1d_pic->p.bpc + 7) / 8; // rounded up
enum pl_chroma_location chroma_loc = PL_CHROMA_UNKNOWN;
struct pl_image *image = &rd_priv_ctx->image;
*image = (struct pl_image) {
.num_planes = 3,
.width = width,
.height = height,
.src_rect = {0, 0, width, height},
.repr = {
.bits = {
.sample_depth = bytes * 8,
.color_depth = dav1d_pic->p.bpc,
},
},
struct pl_dav1d_upload_params params = {
.picture = dav1d_pic,
.film_grain = settings->gpugrain,
.gpu_allocated = settings->zerocopy,
.asynchronous = true,
};
// Figure out the correct plane dimensions/count
switch (dav1d_pic->p.layout) {
case DAV1D_PIXEL_LAYOUT_I400:
image->num_planes = 1;
break;
case DAV1D_PIXEL_LAYOUT_I420:
sub_x = sub_y = 1;
break;
case DAV1D_PIXEL_LAYOUT_I422:
sub_x = 1;
break;
case DAV1D_PIXEL_LAYOUT_I444:
break;
}
// Set the right colorspace metadata etc.
switch (dav1d_pic->seq_hdr->pri) {
case DAV1D_COLOR_PRI_UNKNOWN: image->color.primaries = PL_COLOR_PRIM_UNKNOWN; break;
case DAV1D_COLOR_PRI_BT709: image->color.primaries = PL_COLOR_PRIM_BT_709; break;
case DAV1D_COLOR_PRI_BT470M: image->color.primaries = PL_COLOR_PRIM_BT_470M; break;
case DAV1D_COLOR_PRI_BT470BG: image->color.primaries = PL_COLOR_PRIM_BT_601_625; break;
case DAV1D_COLOR_PRI_BT601: image->color.primaries = PL_COLOR_PRIM_BT_601_625; break;
case DAV1D_COLOR_PRI_BT2020: image->color.primaries = PL_COLOR_PRIM_BT_2020; break;
case DAV1D_COLOR_PRI_XYZ:
// Handled below
assert(dav1d_pic->seq_hdr->mtrx == DAV1D_MC_IDENTITY);
break;
default:
printf("warning: unknown dav1d color primaries %d.. ignoring, picture "
"may be very incorrect\n", dav1d_pic->seq_hdr->pri);
break;
}
switch (dav1d_pic->seq_hdr->trc) {
case DAV1D_TRC_BT709:
case DAV1D_TRC_BT470M:
case DAV1D_TRC_BT470BG:
case DAV1D_TRC_BT601:
case DAV1D_TRC_SMPTE240:
case DAV1D_TRC_BT2020_10BIT:
case DAV1D_TRC_BT2020_12BIT:
// These all map to the effective "SDR" CRT-based EOTF, BT.1886
image->color.transfer = PL_COLOR_TRC_BT_1886;
break;
case DAV1D_TRC_UNKNOWN: image->color.transfer = PL_COLOR_TRC_UNKNOWN; break;
case DAV1D_TRC_LINEAR: image->color.transfer = PL_COLOR_TRC_LINEAR; break;
case DAV1D_TRC_SRGB: image->color.transfer = PL_COLOR_TRC_SRGB; break;
case DAV1D_TRC_SMPTE2084: image->color.transfer = PL_COLOR_TRC_PQ; break;
case DAV1D_TRC_HLG: image->color.transfer = PL_COLOR_TRC_HLG; break;
default:
printf("warning: unknown dav1d color transfer %d.. ignoring, picture "
"may be very incorrect\n", dav1d_pic->seq_hdr->trc);
break;
}
switch (dav1d_pic->seq_hdr->mtrx) {
case DAV1D_MC_IDENTITY:
// This is going to be either RGB or XYZ
if (dav1d_pic->seq_hdr->pri == DAV1D_COLOR_PRI_XYZ) {
image->repr.sys = PL_COLOR_SYSTEM_XYZ;
} else {
image->repr.sys = PL_COLOR_SYSTEM_RGB;
}
break;
case DAV1D_MC_UNKNOWN:
// PL_COLOR_SYSTEM_UNKNOWN maps to RGB, so hard-code this one
image->repr.sys = pl_color_system_guess_ycbcr(width, height);
break;
case DAV1D_MC_BT709: image->repr.sys = PL_COLOR_SYSTEM_BT_709; break;
case DAV1D_MC_BT601: image->repr.sys = PL_COLOR_SYSTEM_BT_601; break;
case DAV1D_MC_SMPTE240: image->repr.sys = PL_COLOR_SYSTEM_SMPTE_240M; break;
case DAV1D_MC_SMPTE_YCGCO: image->repr.sys = PL_COLOR_SYSTEM_YCGCO; break;
case DAV1D_MC_BT2020_NCL: image->repr.sys = PL_COLOR_SYSTEM_BT_2020_NC; break;
case DAV1D_MC_BT2020_CL: image->repr.sys = PL_COLOR_SYSTEM_BT_2020_C; break;
case DAV1D_MC_ICTCP:
// This one is split up based on the actual HDR curve in use
if (dav1d_pic->seq_hdr->trc == DAV1D_TRC_HLG) {
image->repr.sys = PL_COLOR_SYSTEM_BT_2100_HLG;
} else {
image->repr.sys = PL_COLOR_SYSTEM_BT_2100_PQ;
}
break;
default:
printf("warning: unknown dav1d color matrix %d.. ignoring, picture "
"may be very incorrect\n", dav1d_pic->seq_hdr->mtrx);
break;
}
if (dav1d_pic->seq_hdr->color_range) {
image->repr.levels = PL_COLOR_LEVELS_PC;
} else {
image->repr.levels = PL_COLOR_LEVELS_TV;
}
switch (dav1d_pic->seq_hdr->chr) {
case DAV1D_CHR_UNKNOWN: chroma_loc = PL_CHROMA_UNKNOWN; break;
case DAV1D_CHR_VERTICAL: chroma_loc = PL_CHROMA_LEFT; break;
case DAV1D_CHR_COLOCATED: chroma_loc = PL_CHROMA_TOP_LEFT; break;
}
#if PL_API_VER >= 63
if (settings->gpugrain && dav1d_pic->frame_hdr->film_grain.present) {
Dav1dFilmGrainData *src = &dav1d_pic->frame_hdr->film_grain.data;
struct pl_av1_grain_data *dst = &image->av1_grain;
*dst = (struct pl_av1_grain_data) {
.grain_seed = src->seed,
.num_points_y = src->num_y_points,
.chroma_scaling_from_luma = src->chroma_scaling_from_luma,
.num_points_uv = { src->num_uv_points[0], src->num_uv_points[1] },
.scaling_shift = src->scaling_shift,
.ar_coeff_lag = src->ar_coeff_lag,
.ar_coeff_shift = (int)src->ar_coeff_shift,
.grain_scale_shift = src->grain_scale_shift,
.uv_mult = { src->uv_mult[0], src->uv_mult[1] },
.uv_mult_luma = { src->uv_luma_mult[0], src->uv_luma_mult[1] },
.uv_offset = { src->uv_offset[0], src->uv_offset[1] },
.overlap = src->overlap_flag,
};
assert(sizeof(dst->points_y) == sizeof(src->y_points));
assert(sizeof(dst->points_uv) == sizeof(src->uv_points));
assert(sizeof(dst->ar_coeffs_y) == sizeof(src->ar_coeffs_y));
memcpy(dst->points_y, src->y_points, sizeof(src->y_points));
memcpy(dst->points_uv, src->uv_points, sizeof(src->uv_points));
memcpy(dst->ar_coeffs_y, src->ar_coeffs_y, sizeof(src->ar_coeffs_y));
// this one has different row sizes for alignment
for (int c = 0; c < 2; c++) {
for (int i = 0; i < 25; i++)
dst->ar_coeffs_uv[c][i] = src->ar_coeffs_uv[c][i];
}
}
#endif
// Upload the actual planes
struct pl_plane_data data[3] = {
{
// Y plane
.type = PL_FMT_UNORM,
.width = width,
.height = height,
.pixel_stride = bytes,
.row_stride = dav1d_pic->stride[0],
.component_size = {bytes * 8},
.component_map = {0},
}, {
// U plane
.type = PL_FMT_UNORM,
.width = width >> sub_x,
.height = height >> sub_y,
.pixel_stride = bytes,
.row_stride = dav1d_pic->stride[1],
.component_size = {bytes * 8},
.component_map = {1},
}, {
// V plane
.type = PL_FMT_UNORM,
.width = width >> sub_x,
.height = height >> sub_y,
.pixel_stride = bytes,
.row_stride = dav1d_pic->stride[1],
.component_size = {bytes * 8},
.component_map = {2},
},
};
bool ok = true;
for (int i = 0; i < image->num_planes; i++) {
if (settings->zerocopy) {
const struct pl_buf *buf = dav1d_pic->allocator_data;
assert(buf);
data[i].buf = buf;
data[i].buf_offset = (uintptr_t) dav1d_pic->data[i] - (uintptr_t) buf->data;
} else {
data[i].pixels = dav1d_pic->data[i];
}
ok &= pl_upload_plane(rd_priv_ctx->gpu, &image->planes[i], &rd_priv_ctx->plane_tex[i], &data[i]);
}
// Apply the correct chroma plane shift. This has to be done after pl_upload_plane
#if PL_API_VER >= 67
pl_image_set_chroma_location(image, chroma_loc);
#else
pl_chroma_location_offset(chroma_loc, &image->planes[1].shift_x, &image->planes[1].shift_y);
pl_chroma_location_offset(chroma_loc, &image->planes[2].shift_x, &image->planes[2].shift_y);
#endif
if (!ok) {
SDL_LockMutex(p->lock);
if (!pl_upload_dav1dpicture(p->gpu, &p->image, p->plane_tex, &params)) {
fprintf(stderr, "Failed uploading planes!\n");
*image = (struct pl_image) {0};
p->image = (struct pl_frame) {0};
ret = -1;
}
SDL_UnlockMutex(rd_priv_ctx->lock);
return !ok;
SDL_UnlockMutex(p->lock);
return ret;
}
// Align to power of 2
#define ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1))
static int placebo_alloc_pic(Dav1dPicture *const p, void *cookie)
static int placebo_alloc_pic(Dav1dPicture *const pic, void *cookie)
{
Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
assert(rd_priv_ctx != NULL);
SDL_LockMutex(rd_priv_ctx->lock);
const struct pl_gpu *gpu = rd_priv_ctx->gpu;
int ret = DAV1D_ERR(ENOMEM);
// Copied from dav1d_default_picture_alloc
const int hbd = p->p.bpc > 8;
const int aligned_w = ALIGN2(p->p.w, 128);
const int aligned_h = ALIGN2(p->p.h, 128);
const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400;
const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444;
p->stride[0] = aligned_w << hbd;
p->stride[1] = has_chroma ? (aligned_w >> ss_hor) << hbd : 0;
// Align strides up to multiples of the GPU performance hints
p->stride[0] = ALIGN2(p->stride[0], gpu->limits.align_tex_xfer_stride);
p->stride[1] = ALIGN2(p->stride[1], gpu->limits.align_tex_xfer_stride);
// Aligning offsets to 4 also implicity aligns to the texel size (1 or 2)
size_t off_align = ALIGN2(gpu->limits.align_tex_xfer_offset, 4);
const size_t y_sz = ALIGN2(p->stride[0] * aligned_h, off_align);
const size_t uv_sz = ALIGN2(p->stride[1] * (aligned_h >> ss_ver), off_align);
// The extra DAV1D_PICTURE_ALIGNMENTs are to brute force plane alignment,
// even in the case that the driver gives us insane alignments
const size_t pic_size = y_sz + 2 * uv_sz;
const size_t total_size = pic_size + DAV1D_PICTURE_ALIGNMENT * 4;
// Validate size limitations
if (total_size > gpu->limits.max_xfer_size) {
printf("alloc of %zu bytes exceeds limits\n", total_size);
goto err;
}
const struct pl_buf *buf = pl_buf_create(gpu, &(struct pl_buf_params) {
.type = PL_BUF_TEX_TRANSFER,
.host_mapped = true,
.size = total_size,
.memory_type = PL_BUF_MEM_HOST,
.user_data = p,
});
if (!buf) {
printf("alloc of GPU mapped buffer failed\n");
goto err;
}
assert(buf->data);
uintptr_t base = (uintptr_t) buf->data, data[3];
data[0] = ALIGN2(base, DAV1D_PICTURE_ALIGNMENT);
data[1] = ALIGN2(data[0] + y_sz, DAV1D_PICTURE_ALIGNMENT);
data[2] = ALIGN2(data[1] + uv_sz, DAV1D_PICTURE_ALIGNMENT);
// Sanity check offset alignment for the sake of debugging
if (data[0] - base != ALIGN2(data[0] - base, off_align) ||
data[1] - base != ALIGN2(data[1] - base, off_align) ||
data[2] - base != ALIGN2(data[2] - base, off_align))
{
printf("GPU buffer horribly misaligned, expect slowdown!\n");
}
p->allocator_data = (void *) buf;
p->data[0] = (void *) data[0];
p->data[1] = (void *) data[1];
p->data[2] = (void *) data[2];
ret = 0;
// fall through
err:
int ret = pl_allocate_dav1dpicture(pic, rd_priv_ctx->gpu);
SDL_UnlockMutex(rd_priv_ctx->lock);
return ret;
}
@ -673,11 +383,9 @@ static void placebo_release_pic(Dav1dPicture *pic, void *cookie)
{
Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
assert(rd_priv_ctx != NULL);
assert(pic->allocator_data);
SDL_LockMutex(rd_priv_ctx->lock);
const struct pl_gpu *gpu = rd_priv_ctx->gpu;
pl_buf_destroy(gpu, (const struct pl_buf **) &pic->allocator_data);
pl_release_dav1dpicture(pic, rd_priv_ctx->gpu);
SDL_UnlockMutex(rd_priv_ctx->lock);
}
@ -690,10 +398,7 @@ const Dav1dPlayRenderInfo rdr_placebo_vk = {
.update_frame = placebo_upload_image,
.alloc_pic = placebo_alloc_pic,
.release_pic = placebo_release_pic,
# if PL_API_VER >= 63
.supports_gpu_grain = 1,
# endif
};
#else
const Dav1dPlayRenderInfo rdr_placebo_vk = { NULL };
@ -706,12 +411,7 @@ const Dav1dPlayRenderInfo rdr_placebo_gl = {
.destroy_renderer = placebo_renderer_destroy,
.render = placebo_render,
.update_frame = placebo_upload_image,
.alloc_pic = placebo_alloc_pic,
.release_pic = placebo_release_pic,
# if PL_API_VER >= 63
.supports_gpu_grain = 1,
# endif
};
#else
const Dav1dPlayRenderInfo rdr_placebo_gl = { NULL };

4
third_party/dav1d/examples/meson.build vendored
View file

@ -43,10 +43,10 @@ dav1dplay_sources = files(
sdl2_dependency = dependency('sdl2', version: '>= 2.0.1', required: true)
if sdl2_dependency.found()
dav1dplay_deps = [sdl2_dependency]
dav1dplay_deps = [sdl2_dependency, libm_dependency]
dav1dplay_cflags = []
placebo_dependency = dependency('libplacebo', version: '>= 1.18.0', required: false)
placebo_dependency = dependency('libplacebo', version: '>= 3.110.0', required: false)
if placebo_dependency.found()
dav1dplay_deps += placebo_dependency

View file

@ -116,8 +116,8 @@
# define dav1d_uninit(x) x
#endif
#ifdef _MSC_VER
#include <intrin.h>
#if defined(_MSC_VER) && !defined(__clang__)
#include <intrin.h>
static inline int ctz(const unsigned int mask) {
unsigned long idx;

45
third_party/dav1d/include/common/frame.h vendored (new file)
View file

@ -0,0 +1,45 @@
/*
* Copyright © 2021, VideoLAN and dav1d authors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef DAV1D_COMMON_FRAME_H
#define DAV1D_COMMON_FRAME_H
/*
* Checks whether Dav1dFrameType == INTER || == SWITCH
* Both are defined as odd numbers {1, 3} and therefore have the LSB set.
* See also: AV1 spec 6.8.2
*/
#define IS_INTER_OR_SWITCH(frame_header) \
((frame_header)->frame_type & 1)
/*
* Checks whether Dav1dFrameType == KEY || == INTRA
* See also: AV1 spec 6.8.2
*/
#define IS_KEY_OR_INTRA(frame_header) \
(!IS_INTER_OR_SWITCH(frame_header))
#endif /* DAV1D_COMMON_FRAME_H */
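
A brief usage sketch (hypothetical caller, not taken from this commit): Dav1dFrameHeader and the DAV1D_FRAME_TYPE_* values come from dav1d/headers.h, where KEY = 0, INTER = 1, INTRA = 2 and SWITCH = 3, so the single-bit test above cleanly splits the two groups.

#include "common/frame.h"
#include "dav1d/headers.h"

/* Hypothetical helper: only INTER and SWITCH frames can reference other
 * frames, so they are the only ones that need reference-MV setup. */
static int needs_ref_mvs(const Dav1dFrameHeader *hdr) {
    return IS_INTER_OR_SWITCH(hdr); /* same as (hdr->frame_type & 1) */
}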

4
third_party/dav1d/include/dav1d/dav1d.h vendored
View file

@ -45,6 +45,7 @@ typedef struct Dav1dRef Dav1dRef;
#define DAV1D_MAX_FRAME_THREADS 256
#define DAV1D_MAX_TILE_THREADS 64
#define DAV1D_MAX_POSTFILTER_THREADS 256
typedef struct Dav1dLogger {
void *cookie; ///< Custom data to pass to the callback.
@ -67,7 +68,8 @@ typedef struct Dav1dSettings {
unsigned frame_size_limit; ///< maximum frame size, in pixels (0 = unlimited)
Dav1dPicAllocator allocator; ///< Picture allocator callback.
Dav1dLogger logger; ///< Logger callback.
uint8_t reserved[32]; ///< reserved for future use
int n_postfilter_threads;
uint8_t reserved[28]; ///< reserved for future use
} Dav1dSettings;
/**
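
For context, a minimal caller-side sketch (not part of this patch) of how the new field is used; dav1d_default_settings() and dav1d_open() are the existing public entry points, and dav1dplay's new --pfthreads option feeds the same field:

#include <dav1d/dav1d.h>

/* Sketch: open a decoder with the new post-filter thread pool sized explicitly. */
static Dav1dContext *open_with_pf_threads(const int pf_threads) {
    Dav1dSettings s;
    dav1d_default_settings(&s);
    s.n_postfilter_threads = pf_threads; /* new field in this 0.8.2 update */
    Dav1dContext *c = NULL;
    return dav1d_open(&c, &s) < 0 ? NULL : c;
}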

4
third_party/dav1d/include/meson.build vendored
View file

@ -25,9 +25,7 @@
# Revision file (vcs_version.h) generation
dav1d_git_dir = join_paths(dav1d_src_root, '.git')
rev_target = vcs_tag(command: [
'git', '--git-dir', dav1d_git_dir,
'describe', '--tags', '--long',
'--match', '?.*.*', '--always'
'git', '--git-dir', dav1d_git_dir, 'describe', '--long', '--always'
],
input: 'vcs_version.h.in',
output: 'vcs_version.h'

11
third_party/dav1d/meson.build vendored
View file

@ -1,4 +1,4 @@
# Copyright © 2018-2020, VideoLAN and dav1d authors
# Copyright © 2018-2021, VideoLAN and dav1d authors
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@ -23,14 +23,14 @@
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
project('dav1d', ['c'],
version: '0.8.1',
version: '0.8.2',
default_options: ['c_std=c99',
'warning_level=2',
'buildtype=release',
'b_ndebug=if-release'],
meson_version: '>= 0.49.0')
dav1d_soname_version = '5.0.0'
dav1d_soname_version = '5.0.1'
dav1d_api_version_array = dav1d_soname_version.split('.')
dav1d_api_version_major = dav1d_api_version_array[0]
dav1d_api_version_minor = dav1d_api_version_array[1]
@ -128,7 +128,7 @@ if host_machine.system() == 'windows'
rc_data.set('API_VERSION_MAJOR', dav1d_api_version_major)
rc_data.set('API_VERSION_MINOR', dav1d_api_version_minor)
rc_data.set('API_VERSION_REVISION', dav1d_api_version_revision)
rc_data.set('COPYRIGHT_YEARS', '2020')
rc_data.set('COPYRIGHT_YEARS', '2021')
else
thread_dependency = dependency('threads')
thread_compat_dep = []
@ -168,6 +168,8 @@ if host_machine.system() == 'linux'
endif
endif
libm_dependency = cc.find_library('m', required: false)
# Header checks
@ -257,6 +259,7 @@ if cc.get_argument_syntax() != 'msvc'
else
optional_arguments += [
'-wd4028', # parameter different from declaration
'-wd4090', # broken with arrays of pointers
'-wd4996' # use of POSIX functions
]
endif

4
third_party/dav1d/meson_options.txt vendored
View file

@ -53,3 +53,7 @@ option('fuzzer_ldflags',
option('stack_alignment',
type: 'integer',
value: 0)
option('xxhash_muxer',
type : 'feature',
value : 'auto')

87
third_party/dav1d/src/arm/32/ipred.S vendored
View file

@ -40,8 +40,7 @@ function ipred_dc_128_8bpc_neon, export=1
adr r2, L(ipred_dc_128_tbl)
sub r3, r3, #25
ldr r3, [r2, r3, lsl #2]
mov lr, #128
vdup.8 q0, lr
vmov.i8 q0, #128
add r2, r2, r3
add r12, r0, r1
lsl r1, r1, #1
@ -79,7 +78,7 @@ L(ipred_dc_128_tbl):
bgt 16b
pop {r4, pc}
320:
vdup.8 q1, lr
vmov.i8 q1, #128
32:
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
@ -89,20 +88,18 @@ L(ipred_dc_128_tbl):
bgt 32b
pop {r4, pc}
640:
vdup.8 q1, lr
vdup.8 q2, lr
vdup.8 q3, lr
vmov.i8 q1, #128
sub r1, r1, #32
64:
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
bgt 64b
pop {r4, pc}
endfunc
@ -401,19 +398,17 @@ L(ipred_dc_top_tbl):
vrshrn.u16 d18, q0, #6
vdup.8 q0, d18[0]
vdup.8 q1, d18[0]
vdup.8 q2, d18[0]
vdup.8 q3, d18[0]
sub r1, r1, #32
64:
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
bgt 64b
pop {r4-r5, pc}
endfunc
@ -538,20 +533,18 @@ L(ipred_dc_left_h64):
vdup.8 q0, d0[0]
bx r3
L(ipred_dc_left_w64):
sub r1, r1, #32
vmov.8 q1, q0
vmov.8 q2, q0
vmov.8 q3, q0
sub r1, r1, #32
1:
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
bgt 1b
pop {r4-r5, pc}
endfunc
@ -600,10 +593,10 @@ L(ipred_dc_tbl):
L(ipred_dc_h4):
vld1.32 {d0[]}, [r2, :32]!
vpaddl.u8 d0, d0
add r2, r2, #1
vpadd.u16 d0, d0
bx r3
L(ipred_dc_w4):
add r2, r2, #1
vld1.32 {d1[]}, [r2]
vadd.s16 d0, d0, d30
vpaddl.u8 d1, d1
@ -635,10 +628,10 @@ L(ipred_dc_h8):
vld1.8 {d0}, [r2, :64]!
vpaddl.u8 d0, d0
vpadd.u16 d0, d0
add r2, r2, #1
vpadd.u16 d0, d0
bx r3
L(ipred_dc_w8):
add r2, r2, #1
vld1.8 {d2}, [r2]
vadd.s16 d0, d0, d30
vpaddl.u8 d2, d2
@ -672,10 +665,10 @@ L(ipred_dc_h16):
vaddl.u8 q0, d0, d1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
add r2, r2, #1
vpadd.u16 d0, d0
bx r3
L(ipred_dc_w16):
add r2, r2, #1
vld1.8 {d2, d3}, [r2]
vadd.s16 d0, d0, d30
vaddl.u8 q1, d2, d3
@ -712,10 +705,10 @@ L(ipred_dc_h32):
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
add r2, r2, #1
vpadd.u16 d0, d0
bx r3
L(ipred_dc_w32):
add r2, r2, #1
vld1.8 {d2, d3, d4, d5}, [r2]
vadd.s16 d0, d0, d30
vaddl.u8 q1, d2, d3
@ -760,10 +753,10 @@ L(ipred_dc_h64):
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
add r2, r2, #1
vpadd.u16 d0, d0
bx r3
L(ipred_dc_w64):
add r2, r2, #1
vld1.8 {d2, d3, d4, d5}, [r2]!
vadd.s16 d0, d0, d30
vaddl.u8 q2, d4, d5
@ -789,11 +782,11 @@ L(ipred_dc_w64):
vadd.s16 d0, d0, d2
vadd.s16 d0, d0, d3
vshl.u16 d18, d0, d28
beq 1f // h = 16/32
beq 1f
// h = 16/32
movw lr, #(0x5556/2)
movt lr, #(0x3334/2)
mov r5, r4
and r5, r5, #31
and r5, r4, #31
lsr lr, lr, r5
vdup.16 d30, lr
vqdmulh.s16 d18, d18, d30
@ -801,18 +794,16 @@ L(ipred_dc_w64):
sub r1, r1, #32
vdup.8 q0, d18[0]
vdup.8 q1, d18[0]
vdup.8 q2, d18[0]
vdup.8 q3, d18[0]
2:
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
bgt 2b
pop {r4-r6, pc}
endfunc
@ -1444,6 +1435,8 @@ function ipred_filter_8bpc_neon, export=1
vmovl.s8 q13, d28
vmovl.s8 q14, d29
add r8, r2, #1
sub r2, r2, #2
mov r7, #-2
bx r5
.align 2
@ -1455,8 +1448,6 @@ L(ipred_filter_tbl):
40:
vld1.32 {d0[]}, [r8] // top (0-3)
sub r2, r2, #2
mov r7, #-2
vmovl.u8 q0, d0 // top (0-3)
4:
vld1.32 {d2[]}, [r2], r7 // left (0-1) + topleft (2)
@ -1473,13 +1464,11 @@ L(ipred_filter_tbl):
vst1.32 {d4[0]}, [r0, :32], r1
vmovl.u8 q0, d4
vst1.32 {d4[1]}, [r6, :32], r1
vext.8 q0, q0, q0, #8 // move top from [4-7] to [0-3]
vmov d0, d1 // move top from [4-7] to [0-3]
bgt 4b
pop {r4-r8, pc}
80:
vld1.8 {d0}, [r8] // top (0-7)
sub r2, r2, #2
mov r7, #-2
vmovl.u8 q0, d0 // top (0-7)
8:
vld1.32 {d2[]}, [r2], r7 // left (0-1) + topleft (2)
@ -1503,16 +1492,14 @@ L(ipred_filter_tbl):
vqrshrun.s16 d5, q3, #4
vzip.32 d4, d5
subs r4, r4, #2
vst1.64 {d4}, [r0, :64], r1
vst1.8 {d4}, [r0, :64], r1
vmovl.u8 q0, d5
vst1.64 {d5}, [r6, :64], r1
vst1.8 {d5}, [r6, :64], r1
bgt 8b
pop {r4-r8, pc}
160:
320:
vpush {q4-q5}
sub r2, r2, #2
mov r7, #-2
sub r1, r1, r3
mov lr, r3
@ -2003,10 +1990,10 @@ L(ipred_cfl_tbl):
L(ipred_cfl_h4):
vld1.32 {d0[]}, [r2, :32]!
vpaddl.u8 d0, d0
add r2, r2, #1
vpadd.i16 d0, d0
bx r12
L(ipred_cfl_w4):
add r2, r2, #1
vld1.32 {d1[]}, [r2]
vadd.i16 d0, d0, d16
vpaddl.u8 d1, d1
@ -2031,10 +2018,10 @@ L(ipred_cfl_h8):
vld1.8 {d0}, [r2, :64]!
vpaddl.u8 d0, d0
vpadd.i16 d0, d0
add r2, r2, #1
vpadd.i16 d0, d0
bx r12
L(ipred_cfl_w8):
add r2, r2, #1
vld1.8 {d1}, [r2]
vadd.i16 d0, d0, d16
vpaddl.u8 d1, d1
@ -2061,10 +2048,10 @@ L(ipred_cfl_h16):
vaddl.u8 q0, d0, d1
vadd.i16 d0, d0, d1
vpadd.i16 d0, d0
add r2, r2, #1
vpadd.i16 d0, d0
bx r12
L(ipred_cfl_w16):
add r2, r2, #1
vld1.8 {q2}, [r2]
vadd.i16 d0, d0, d16
vaddl.u8 q2, d4, d5
@ -2094,10 +2081,10 @@ L(ipred_cfl_h32):
vadd.i16 q0, q2, q3
vadd.i16 d0, d0, d1
vpadd.i16 d0, d0
add r2, r2, #1
vpadd.i16 d0, d0
bx r12
L(ipred_cfl_w32):
add r2, r2, #1
vld1.8 {q2, q3}, [r2]
vadd.i16 d0, d0, d16
vaddl.u8 q2, d4, d5

3254
third_party/dav1d/src/arm/32/ipred16.S vendored (new file)

Diff not shown because of its large size.

57
third_party/dav1d/src/arm/32/itx.S vendored
View file

@ -706,7 +706,7 @@ def_fn_4x4 identity, flipadst
vrshrn_8h \r14, \r15, q4, q5, #12 // t7a
vmull_vmlal_8h q2, q3, \r10, \r11, \r6, \r7, d1[3], d1[2] // -> t6a
vrshrn_8h \r6, \r7, q6, q7, #12 // t5a
vrshrn_8h \r10, \r11, q2, q3, #12 // taa
vrshrn_8h \r10, \r11, q2, q3, #12 // t6a
vqadd.s16 q2, \q1, \q3 // t4
vqsub.s16 \q1, \q1, \q3 // t5a
@ -1173,7 +1173,7 @@ function inv_dct_4h_x16_neon, export=1
vrshrn.i32 d6, q3, #12 // t11
vrshrn.i32 d7, q4, #12 // t12
vmull_vmlal q4, d25, d21, d0[0], d0[0] // -> t10a
vmull_vmlal q4, d25, d21, d0[0], d0[0] // -> t13a
vrshrn.i32 d4, q2, #12 // t10a
vrshrn.i32 d5, q4, #12 // t13a
@ -1480,53 +1480,6 @@ function inv_txfm_add_vert_4x16_neon
pop {pc}
endfunc
.macro sub_sp_align space
#if CONFIG_THUMB
mov r7, sp
and r7, r7, #15
#else
and r7, sp, #15
#endif
sub sp, sp, r7
// Now the stack is aligned, store the amount of adjustment back
// on the stack, as we don't want to waste a register as frame
// pointer.
str r7, [sp, #-16]!
#ifdef _WIN32
.if \space > 8192
// Here, we'd need to touch two (or more) pages while decrementing
// the stack pointer.
.error "sub_sp_align doesn't support values over 8K at the moment"
.elseif \space > 4096
sub r7, sp, #4096
ldr r12, [r7]
sub r7, r7, #(\space - 4096)
mov sp, r7
.else
sub sp, sp, #\space
.endif
#else
.if \space >= 4096
sub sp, sp, #(\space)/4096*4096
.endif
.if (\space % 4096) != 0
sub sp, sp, #(\space)%4096
.endif
#endif
.endm
.macro add_sp_align space
.if \space >= 4096
add sp, sp, #(\space)/4096*4096
.endif
.if (\space % 4096) != 0
add sp, sp, #(\space)%4096
.endif
ldr r7, [sp], #16
// Add back the original stack adjustment
add sp, sp, r7
.endm
function inv_txfm_add_16x16_neon
sub_sp_align 512
ldrh r11, [r10], #2
@ -3248,7 +3201,9 @@ function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1
mov r8, #(32 - \i)
cmp r3, r11
blt 1f
.if \i < 28
ldrh r11, [r10], #2
.endif
.endif
add r7, r2, #(\i*2)
mov r8, #32*2
@ -3304,7 +3259,7 @@ function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1
add r6, r4, #(\i*64*2)
mov r9, #-2 // shift
bl inv_txfm_horz_dct_64x4_neon
.if \i < 8
.if \i < 12
ldrh r11, [r10], #2
.endif
.endr
@ -3353,7 +3308,9 @@ function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1
mov r8, #(32 - \i)
cmp r3, r11
blt 1f
.if \i < 28
ldrh r11, [r10], #2
.endif
.endif
add r7, r2, #(\i*2)
mov r8, #32*2

3428
third_party/dav1d/src/arm/32/itx16.S vendored (new file)

Diff not shown because of its large size.

7
third_party/dav1d/src/arm/32/loopfilter16.S vendored
View file

@ -141,13 +141,12 @@ function lpf_4_wd\wd\()_neon
vmov.i16 d6, #3
vbic d0, d1, d0 // (fm && wd >= 4 && !hev)
vmul.i16 d2, d2, d6
vmov.i16 d6, #4
vmov.i16 d7, #4
vadd.i16 d2, d2, d4
vmin.s16 d2, d2, d3 // f = iclip_diff()
vmov.i16 d7, #3
vmax.s16 d2, d2, d9 // f = iclip_diff()
vqadd.s16 d4, d6, d2 // f + 4
vqadd.s16 d5, d7, d2 // f + 3
vqadd.s16 d4, d7, d2 // f + 4
vqadd.s16 d5, d6, d2 // f + 3
vmin.s16 d4, d4, d3 // imin(f + 4, 128 << bitdepth_min_8 - 1)
vmin.s16 d5, d5, d3 // imin(f + 3, 128 << bitdepth_min_8 - 1)
vshr.s16 d4, d4, #3 // f1

748
third_party/dav1d/src/arm/32/looprestoration.S vendored
View file

@ -28,15 +28,27 @@
#include "src/arm/asm.S"
#include "util.S"
const right_ext_mask_buf
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
right_ext_mask:
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst
// void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4],
// const pixel *src, ptrdiff_t stride,
// const int16_t fh[8], intptr_t w,
// int h, enum LrEdgeFlags edges);
function wiener_filter_h_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4}
ldrd r4, r5, [sp, #52]
ldrd r6, r7, [sp, #60]
vpush {q4-q7}
ldrd r4, r5, [sp, #100]
ldrd r6, r7, [sp, #108]
mov r8, r5
vld1.16 {q0}, [r4, :128]
movw r9, #(1 << 14) - (1 << 2)
@ -47,27 +59,19 @@ function wiener_filter_h_8bpc_neon, export=1
bic r10, r10, #7
lsl r10, r10, #1
// Clear the last unused element of q0, to allow filtering a single
// pixel with one plain vmul+vpadd.
mov r12, #0
vmov.16 d1[3], r12
// Set up pointers for reading/writing alternate rows
add r12, r0, r10
lsl r10, r10, #1
add lr, r2, r3
lsl r3, r3, #1
// Subtract the width from mid_stride
sub r10, r10, r5, lsl #1
// For w >= 8, we read (w+5)&~7+8 pixels, for w < 8 we read 16 pixels.
cmp r5, #8
add r11, r5, #13
// Subtract the aligned width from mid_stride
add r11, r5, #7
bic r11, r11, #7
bge 1f
mov r11, #16
1:
sub r10, r10, r11, lsl #1
// Subtract the number of pixels read from the source stride
add r11, r11, #8
sub r3, r3, r11
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
@ -131,47 +135,56 @@ function wiener_filter_h_8bpc_neon, export=1
ldrb r11, [r2, r9]
ldrb r9, [lr, r9]
// Fill q12/q13 with the right padding pixel
vdup.8 d24, r11
vdup.8 d26, r9
vmovl.u8 q12, d24
vmovl.u8 q13, d26
vdup.16 q12, r11
vdup.16 q13, r9
3: // !LR_HAVE_RIGHT
// If we'll have to pad the right edge we need to quit early here.
// Check whether we need to pad the right edge
cmp r5, #11
bge 4f // If w >= 11, all used input pixels are valid
cmp r5, #7
bge 5f // If w >= 7, we can filter 4 pixels
b 6f
// 1 <= w < 11, w+3 pixels valid in q1-q2. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// Insert padding in q1/2.h[w+3] onwards; fuse the +3 (*2) into the
// buffer pointer.
movrel_local r4, right_ext_mask, -6
sub r4, r4, r5, lsl #1
vld1.8 {q10, q11}, [r4]
vbit q1, q12, q10
vbit q2, q12, q11
vbit q8, q13, q10
vbit q9, q13, q11
4: // Loop horizontally
// This is tuned as some sort of compromise between Cortex A7, A8,
// A9 and A53.
vmul.s16 q3, q1, d0[0]
vext.8 q10, q1, q2, #2
vext.8 q11, q1, q2, #4
vmla.s16 q3, q10, d0[1]
vmla.s16 q3, q11, d0[2]
vext.8 q10, q1, q2, #6
vext.8 q11, q1, q2, #8
vmla.s16 q3, q10, d0[3]
vmla.s16 q3, q11, d1[0]
vext.8 q10, q1, q2, #10
vext.8 q11, q1, q2, #12
vmla.s16 q3, q10, d1[1]
vmla.s16 q3, q11, d1[2]
vext.8 q5, q1, q2, #8
vext.8 q10, q1, q2, #2
vext.8 q6, q1, q2, #10
vext.8 q7, q1, q2, #12
vext.8 q4, q1, q2, #6
vadd.i16 q5, q5, q11
vadd.i16 q6, q6, q10
vadd.i16 q7, q7, q1
vmul.s16 q3, q4, d0[3]
vmla.s16 q3, q5, d1[0]
vmla.s16 q3, q6, d1[1]
vmla.s16 q3, q7, d1[2]
vmul.s16 q10, q8, d0[0]
vext.8 q11, q8, q9, #2
vext.8 q4, q8, q9, #4
vmla.s16 q10, q11, d0[1]
vmla.s16 q10, q4, d0[2]
vext.8 q11, q8, q9, #6
vext.8 q4, q8, q9, #8
vmla.s16 q10, q11, d0[3]
vmla.s16 q10, q4, d1[0]
vext.8 q11, q8, q9, #10
vext.8 q6, q8, q9, #8
vext.8 q11, q8, q9, #2
vext.8 q7, q8, q9, #10
vadd.i16 q6, q6, q4
vext.8 q4, q8, q9, #12
vmla.s16 q10, q11, d1[1]
vext.8 q5, q8, q9, #6
vadd.i16 q7, q7, q11
vadd.i16 q4, q4, q8
vmul.s16 q10, q5, d0[3]
vmla.s16 q10, q6, d1[0]
vmla.s16 q10, q7, d1[1]
vmla.s16 q10, q4, d1[2]
vext.8 q1, q1, q2, #6
@ -186,10 +199,10 @@ function wiener_filter_h_8bpc_neon, export=1
vshr.s16 q10, q10, #3
vadd.s16 q3, q3, q15
vadd.s16 q10, q10, q15
subs r5, r5, #8
vst1.16 {q3}, [r0, :128]!
vst1.16 {q10}, [r12, :128]!
subs r5, r5, #8
ble 9f
tst r7, #2 // LR_HAVE_RIGHT
vmov q1, q2
@ -201,145 +214,6 @@ function wiener_filter_h_8bpc_neon, export=1
bne 4b // If we don't need to pad, just keep filtering.
b 3b // If we need to pad, check how many pixels we have left.
5: // Filter 4 pixels, 7 <= w < 11
.macro filter_4
vext.8 d20, d2, d3, #2
vext.8 d21, d2, d3, #4
vext.8 d22, d2, d3, #6
vext.8 d23, d3, d4, #2
vext.8 d8, d3, d4, #4
vmul.s16 d6, d2, d0[0]
vmla.s16 d6, d20, d0[1]
vmla.s16 d6, d21, d0[2]
vmla.s16 d6, d22, d0[3]
vmla.s16 d6, d3, d1[0]
vmla.s16 d6, d23, d1[1]
vmla.s16 d6, d8, d1[2]
vext.8 d20, d16, d17, #2
vext.8 d21, d16, d17, #4
vext.8 d22, d16, d17, #6
vext.8 d23, d17, d18, #2
vext.8 d8, d17, d18, #4
vmul.s16 d7, d16, d0[0]
vmla.s16 d7, d20, d0[1]
vmla.s16 d7, d21, d0[2]
vmla.s16 d7, d22, d0[3]
vmla.s16 d7, d17, d1[0]
vmla.s16 d7, d23, d1[1]
vmla.s16 d7, d8, d1[2]
vext.8 d22, d2, d3, #6
vext.8 d23, d16, d17, #6
vshl.s16 q11, q11, #7
vsub.s16 q11, q11, q14
vqadd.s16 q3, q3, q11
vshr.s16 q3, q3, #3
vadd.s16 q3, q3, q15
.endm
filter_4
vst1.16 {d6}, [r0, :64]!
vst1.16 {d7}, [r12, :64]!
subs r5, r5, #4 // 3 <= w < 7
vext.8 q1, q1, q2, #8
vext.8 q2, q2, q2, #8
vext.8 q8, q8, q9, #8
vext.8 q9, q9, q9, #8
6: // Pad the right edge and filter the last few pixels.
// w < 7, w+3 pixels valid in q1-q2
cmp r5, #5
blt 7f
bgt 8f
// w == 5, 8 pixels valid in q1, q2 invalid
vmov q2, q12
vmov q9, q13
b 88f
7: // 1 <= w < 5, 4-7 pixels valid in q1
sub r9, r5, #1
// r9 = (pixels valid - 4)
adr r11, L(variable_shift_tbl)
ldr r9, [r11, r9, lsl #2]
add r11, r11, r9
vmov q2, q12
vmov q9, q13
bx r11
.align 2
L(variable_shift_tbl):
.word 44f - L(variable_shift_tbl) + CONFIG_THUMB
.word 55f - L(variable_shift_tbl) + CONFIG_THUMB
.word 66f - L(variable_shift_tbl) + CONFIG_THUMB
.word 77f - L(variable_shift_tbl) + CONFIG_THUMB
44: // 4 pixels valid in d2/d16, fill d3/d17 with padding.
vmov d3, d4
vmov d17, d18
b 88f
// Shift q1 right, shifting out invalid pixels,
// shift q1 left to the original offset, shifting in padding pixels.
55: // 5 pixels valid
vext.8 q1, q1, q1, #10
vext.8 q1, q1, q2, #6
vext.8 q8, q8, q8, #10
vext.8 q8, q8, q9, #6
b 88f
66: // 6 pixels valid
vext.8 q1, q1, q1, #12
vext.8 q1, q1, q2, #4
vext.8 q8, q8, q8, #12
vext.8 q8, q8, q9, #4
b 88f
77: // 7 pixels valid
vext.8 q1, q1, q1, #14
vext.8 q1, q1, q2, #2
vext.8 q8, q8, q8, #14
vext.8 q8, q8, q9, #2
b 88f
8: // w > 5, w == 6, 9 pixels valid in q1-q2, 1 pixel valid in q2
vext.8 q2, q2, q2, #2
vext.8 q2, q2, q12, #14
vext.8 q9, q9, q9, #2
vext.8 q9, q9, q13, #14
88:
// w < 7, q1-q2 padded properly
cmp r5, #4
blt 888f
// w >= 4, filter 4 pixels
filter_4
vst1.16 {d6}, [r0, :64]!
vst1.16 {d7}, [r12, :64]!
subs r5, r5, #4 // 0 <= w < 4
vext.8 q1, q1, q2, #8
vext.8 q8, q8, q9, #8
beq 9f
888: // 1 <= w < 4, filter 1 pixel at a time
vmul.s16 q3, q1, q0
vmul.s16 q10, q8, q0
vpadd.s16 d6, d6, d7
vpadd.s16 d7, d20, d21
vdup.16 d24, d2[3]
vpadd.s16 d6, d6, d7
vdup.16 d25, d16[3]
vpadd.s16 d6, d6, d6
vtrn.16 d24, d25
vshl.s16 d24, d24, #7
vsub.s16 d24, d24, d28
vqadd.s16 d6, d6, d24
vshr.s16 d6, d6, #3
vadd.s16 d6, d6, d30
vst1.s16 {d6[0]}, [r0, :16]!
vst1.s16 {d6[1]}, [r12, :16]!
subs r5, r5, #1
vext.8 q1, q1, q2, #2
vext.8 q8, q8, q9, #2
bgt 888b
9:
subs r6, r6, #2
ble 0f
@ -351,9 +225,8 @@ L(variable_shift_tbl):
mov r5, r8
b 1b
0:
vpop {q4}
vpop {q4-q7}
pop {r4-r11,pc}
.purgem filter_4
endfunc
// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride,
@ -362,8 +235,9 @@ endfunc
// ptrdiff_t mid_stride);
function wiener_filter_v_8bpc_neon, export=1
push {r4-r7,lr}
ldrd r4, r5, [sp, #20]
ldrd r6, r7, [sp, #28]
vpush {q4-q6}
ldrd r4, r5, [sp, #68]
ldrd r6, r7, [sp, #76]
mov lr, r4
vld1.16 {q0}, [r5, :128]
@ -407,24 +281,21 @@ function wiener_filter_v_8bpc_neon, export=1
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
vmull.s16 q2, d16, d0[0]
vmlal.s16 q2, d18, d0[1]
vmlal.s16 q2, d20, d0[2]
vmlal.s16 q2, d22, d0[3]
vmlal.s16 q2, d24, d1[0]
vmlal.s16 q2, d26, d1[1]
vmlal.s16 q2, d28, d1[2]
vmull.s16 q3, d17, d0[0]
vmlal.s16 q3, d19, d0[1]
vmlal.s16 q3, d21, d0[2]
vmlal.s16 q3, d23, d0[3]
vmlal.s16 q3, d25, d1[0]
vmlal.s16 q3, d27, d1[1]
vmlal.s16 q3, d29, d1[2]
vadd.i16 q4, q10, q12
vadd.i16 q5, q9, q13
vadd.i16 q6, q8, q14
vmull.s16 q2, d22, d0[3]
vmlal.s16 q2, d8, d1[0]
vmlal.s16 q2, d10, d1[1]
vmlal.s16 q2, d12, d1[2]
vmull.s16 q3, d23, d0[3]
vmlal.s16 q3, d9, d1[0]
vmlal.s16 q3, d11, d1[1]
vmlal.s16 q3, d13, d1[2]
vqrshrun.s32 d4, q2, #11
vqrshrun.s32 d5, q3, #11
vqmovun.s16 d4, q2
vst1.8 {d4}, [r0], r1
vst1.8 {d4}, [r0, :64], r1
.if \compare
cmp r4, #4
.else
@ -529,147 +400,11 @@ function wiener_filter_v_8bpc_neon, export=1
b 1b
0:
vpop {q4-q6}
pop {r4-r7,pc}
.purgem filter
endfunc
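The rewritten horizontal and vertical Wiener filters above appear to rely on the filter taps being symmetric (fv[k] == fv[6 - k]): the symmetric row (or pixel) pairs are pre-added with vadd.i16 so each output needs four multiplies instead of seven. A scalar sketch of that identity, illustration only and not part of the patch (wiener_v_sym is a hypothetical name):
#include <stdint.h>
/* Assuming a symmetric 7-tap filter (fv[0]==fv[6], fv[1]==fv[5], fv[2]==fv[4]),
 * the 7-term dot product collapses to 4 multiplies on pre-added pairs, which
 * is what the vadd.i16 q4/q5/q6 + vmull/vmlal sequence above computes. */
static int wiener_v_sym(const int16_t fv[8], const int16_t row[7])
{
    return fv[3] * row[3] +
           fv[4] * (row[2] + row[4]) +
           fv[5] * (row[1] + row[5]) +
           fv[6] * (row[0] + row[6]);
}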
// void dav1d_copy_narrow_8bpc_neon(pixel *dst, ptrdiff_t stride,
// const pixel *src, int w, int h);
function copy_narrow_8bpc_neon, export=1
push {r4,lr}
ldr r4, [sp, #8]
adr r12, L(copy_narrow_tbl)
ldr r3, [r12, r3, lsl #2]
add r12, r12, r3
bx r12
.align 2
L(copy_narrow_tbl):
.word 0
.word 10f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 20f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 30f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 40f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 50f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 60f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 70f - L(copy_narrow_tbl) + CONFIG_THUMB
10:
add r3, r0, r1
lsl r1, r1, #1
18:
subs r4, r4, #8
blt 110f
vld1.8 {d0}, [r2, :64]!
vst1.8 {d0[0]}, [r0], r1
vst1.8 {d0[1]}, [r3], r1
vst1.8 {d0[2]}, [r0], r1
vst1.8 {d0[3]}, [r3], r1
vst1.8 {d0[4]}, [r0], r1
vst1.8 {d0[5]}, [r3], r1
vst1.8 {d0[6]}, [r0], r1
vst1.8 {d0[7]}, [r3], r1
ble 0f
b 18b
110:
add r4, r4, #8
asr r1, r1, #1
11:
subs r4, r4, #1
vld1.8 {d0[]}, [r2]!
vst1.8 {d0[0]}, [r0], r1
bgt 11b
0:
pop {r4,pc}
20:
add r3, r0, r1
lsl r1, r1, #1
24:
subs r4, r4, #4
blt 210f
vld1.16 {d0}, [r2, :64]!
vst1.16 {d0[0]}, [r0, :16], r1
vst1.16 {d0[1]}, [r3, :16], r1
vst1.16 {d0[2]}, [r0, :16], r1
vst1.16 {d0[3]}, [r3, :16], r1
ble 0f
b 24b
210:
add r4, r4, #4
asr r1, r1, #1
22:
subs r4, r4, #1
vld1.16 {d0[]}, [r2, :16]!
vst1.16 {d0[0]}, [r0, :16], r1
bgt 22b
0:
pop {r4,pc}
30:
ldrh r3, [r2]
ldrb r12, [r2, #2]
add r2, r2, #3
subs r4, r4, #1
strh r3, [r0]
strb r12, [r0, #2]
add r0, r0, r1
bgt 30b
pop {r4,pc}
40:
add r3, r0, r1
lsl r1, r1, #1
42:
subs r4, r4, #2
blt 41f
vld1.8 {d0}, [r2, :64]!
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[1]}, [r3, :32], r1
ble 0f
b 42b
41:
vld1.32 {d0[]}, [r2, :32]
vst1.32 {d0[0]}, [r0, :32]
0:
pop {r4,pc}
50:
ldr r3, [r2]
ldrb r12, [r2, #4]
add r2, r2, #5
subs r4, r4, #1
str r3, [r0]
strb r12, [r0, #4]
add r0, r0, r1
bgt 50b
pop {r4,pc}
60:
ldr r3, [r2]
ldrh r12, [r2, #4]
add r2, r2, #6
subs r4, r4, #1
str r3, [r0]
strh r12, [r0, #4]
add r0, r0, r1
bgt 60b
pop {r4,pc}
70:
ldr r3, [r2]
ldrh r12, [r2, #4]
ldrb lr, [r2, #6]
add r2, r2, #7
subs r4, r4, #1
str r3, [r0]
strh r12, [r0, #4]
strb lr, [r0, #6]
add r0, r0, r1
bgt 70b
pop {r4,pc}
endfunc
#define SUM_STRIDE (384+16)
#include "looprestoration_tmpl.S"
@ -694,25 +429,15 @@ function sgr_box3_h_8bpc_neon, export=1
mov r9, #(2*2*SUM_STRIDE) // double sum stride
// Subtract the aligned width from the output stride.
// With LR_HAVE_RIGHT, align to 8, without it, align to 4.
tst r7, #2 // LR_HAVE_RIGHT
bne 0f
// !LR_HAVE_RIGHT
add lr, r5, #3
bic lr, lr, #3
b 1f
0:
add lr, r5, #7
bic lr, lr, #7
1:
sub r9, r9, lr, lsl #1
// Store the width for the vertical loop
mov r8, r5
// Subtract the number of pixels read from the input from the stride
add lr, r5, #14
bic lr, lr, #7
add lr, lr, #8
sub r4, r4, lr
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
@ -781,34 +506,30 @@ function sgr_box3_h_8bpc_neon, export=1
// Restore r11 after using it for a temporary value
add r11, r1, #(2*SUM_STRIDE)
3: // !LR_HAVE_RIGHT
// If we'll have to pad the right edge we need to quit early here.
// Check whether we need to pad the right edge
cmp r5, #10
bge 4f // If w >= 10, all used input pixels are valid
cmp r5, #6
bge 5f // If w >= 6, we can filter 4 pixels
b 6f
// 1 <= w < 10, w pixels valid in q0. For w=9, this ends up called
// again; it's not strictly needed in those cases (we pad enough here),
// but keeping the code as simple as possible.
// Insert padding in q0/4.b[w] onwards
movrel_local lr, right_ext_mask
sub lr, lr, r5
vld1.8 {q13}, [lr]
vbit q0, q14, q13
vbit q4, q15, q13
// Update the precalculated squares
vmull.u8 q1, d0, d0
vmull.u8 q2, d1, d1
vmull.u8 q5, d8, d8
vmull.u8 q6, d9, d9
4: // Loop horizontally
.macro vaddl_u16_n dst1, dst2, src1, src2, src3, src4, w
vaddl.u16 \dst1, \src1, \src3
.if \w > 4
vaddl.u16 \dst2, \src2, \src4
.endif
.endm
.macro vaddw_u16_n dst1, dst2, src1, src2, w
vaddw.u16 \dst1, \dst1, \src1
.if \w > 4
vaddw.u16 \dst2, \dst2, \src2
.endif
.endm
.macro vadd_i32_n dst1, dst2, src1, src2, w
vadd.i32 \dst1, \dst1, \src1
.if \w > 4
vadd.i32 \dst2, \dst2, \src2
.endif
.endm
.macro add3 w
vext.8 d16, d0, d1, #1
vext.8 d17, d0, d1, #2
vext.8 d18, d8, d9, #1
@ -823,19 +544,22 @@ function sgr_box3_h_8bpc_neon, export=1
vext.8 q10, q5, q6, #2
vext.8 q11, q5, q6, #4
vaddl_u16_n q12, q13, d2, d3, d16, d17, \w
vaddw_u16_n q12, q13, d18, d19, \w
vaddl.u16 q12, d2, d16
vaddl.u16 q13, d3, d17
vaddw.u16 q12, q12, d18
vaddw.u16 q13, q13, d19
vaddl_u16_n q8, q9, d10, d11, d20, d21, \w
vaddw_u16_n q8, q9, d22, d23, \w
.endm
add3 8
vaddl.u16 q8, d10, d20
vaddl.u16 q9, d11, d21
vaddw.u16 q8, q8, d22
vaddw.u16 q9, q9, d23
subs r5, r5, #8
vst1.16 {q3}, [r1, :128]!
vst1.16 {q7}, [r11, :128]!
vst1.32 {q12, q13}, [r0, :128]!
vst1.32 {q8, q9}, [r10, :128]!
subs r5, r5, #8
ble 9f
tst r7, #2 // LR_HAVE_RIGHT
vld1.8 {d6}, [r3]!
@ -850,86 +574,6 @@ function sgr_box3_h_8bpc_neon, export=1
bne 4b // If we don't need to pad, just keep summing.
b 3b // If we need to pad, check how many pixels we have left.
5: // Produce 4 pixels, 6 <= w < 10
add3 4
vst1.16 {d6}, [r1, :64]!
vst1.16 {d14}, [r11, :64]!
vst1.32 {q12}, [r0, :128]!
vst1.32 {q8}, [r10, :128]!
subs r5, r5, #4 // 2 <= w < 6
vext.8 q0, q0, q0, #4
vext.8 q4, q4, q4, #4
6: // Pad the right edge and produce the last few pixels.
// 2 <= w < 6, 2-5 pixels valid in q0
sub lr, r5, #2
// lr = (pixels valid - 2)
adr r11, L(box3_variable_shift_tbl)
ldr lr, [r11, lr, lsl #2]
add r11, r11, lr
bx r11
.align 2
L(box3_variable_shift_tbl):
.word 22f - L(box3_variable_shift_tbl) + CONFIG_THUMB
.word 33f - L(box3_variable_shift_tbl) + CONFIG_THUMB
.word 44f - L(box3_variable_shift_tbl) + CONFIG_THUMB
.word 55f - L(box3_variable_shift_tbl) + CONFIG_THUMB
// Shift q0 right, shifting out invalid pixels,
// shift q0 left to the original offset, shifting in padding pixels.
22: // 2 pixels valid
vext.8 q0, q0, q0, #2
vext.8 q4, q4, q4, #2
vext.8 q0, q0, q14, #14
vext.8 q4, q4, q15, #14
b 88f
33: // 3 pixels valid
vext.8 q0, q0, q0, #3
vext.8 q4, q4, q4, #3
vext.8 q0, q0, q14, #13
vext.8 q4, q4, q15, #13
b 88f
44: // 4 pixels valid
vext.8 q0, q0, q0, #4
vext.8 q4, q4, q4, #4
vext.8 q0, q0, q14, #12
vext.8 q4, q4, q15, #12
b 88f
55: // 5 pixels valid
vext.8 q0, q0, q0, #5
vext.8 q4, q4, q4, #5
vext.8 q0, q0, q14, #11
vext.8 q4, q4, q15, #11
88:
// Restore r11 after using it for a temporary value above
add r11, r1, #(2*SUM_STRIDE)
vmull.u8 q1, d0, d0
vmull.u8 q2, d1, d1
vmull.u8 q5, d8, d8
vmull.u8 q6, d9, d9
add3 4
subs r5, r5, #4
vst1.16 {d6}, [r1, :64]!
vst1.16 {d14}, [r11, :64]!
vst1.32 {q12}, [r0, :128]!
vst1.32 {q8}, [r10, :128]!
ble 9f
vext.8 q0, q0, q0, #4
vext.8 q1, q1, q2, #8
vext.8 q4, q4, q4, #4
vext.8 q5, q5, q6, #8
// Only one needed pixel left, but do a normal 4 pixel
// addition anyway
add3 4
vst1.16 {d6}, [r1, :64]!
vst1.16 {d14}, [r11, :64]!
vst1.32 {q12}, [r0, :128]!
vst1.32 {q8}, [r10, :128]!
9:
subs r6, r6, #2
ble 0f
@ -945,7 +589,6 @@ L(box3_variable_shift_tbl):
0:
vpop {q4-q7}
pop {r4-r11,pc}
.purgem add3
endfunc
// void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
@ -968,23 +611,11 @@ function sgr_box5_h_8bpc_neon, export=1
mov r9, #(2*2*SUM_STRIDE) // double sum stride
// Subtract the aligned width from the output stride.
// With LR_HAVE_RIGHT, align to 8, without it, align to 4.
// Subtract the number of pixels read from the input from the stride.
tst r7, #2 // LR_HAVE_RIGHT
bne 0f
// !LR_HAVE_RIGHT
add lr, r5, #3
bic lr, lr, #3
add r8, r5, #13
b 1f
0:
add lr, r5, #7
bic lr, lr, #7
add r8, r5, #15
1:
sub r9, r9, lr, lsl #1
bic r8, r8, #7
sub r4, r4, r8
add lr, lr, #8
sub r4, r4, lr
// Store the width for the vertical loop
mov r8, r5
@ -1054,15 +685,31 @@ function sgr_box5_h_8bpc_neon, export=1
// Restore r11 after using it for a temporary value
add r11, r1, #(2*SUM_STRIDE)
3: // !LR_HAVE_RIGHT
// If we'll have to pad the right edge we need to quit early here.
// Check whether we need to pad the right edge
cmp r5, #11
bge 4f // If w >= 11, all used input pixels are valid
cmp r5, #7
bge 5f // If w >= 7, we can produce 4 pixels
b 6f
// 1 <= w < 11, w+1 pixels valid in q0. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// Insert padding in q0/4.b[w+1] onwards; fuse the +1 into the
// buffer pointer.
movrel_local lr, right_ext_mask, -1
sub lr, lr, r5
vld1.8 {q13}, [lr]
vbit q0, q14, q13
vbit q4, q15, q13
// Update the precalculated squares
vmull.u8 q1, d0, d0
vmull.u8 q2, d1, d1
vmull.u8 q5, d8, d8
vmull.u8 q6, d9, d9
4: // Loop horizontally
.macro add5 w
vext.8 d16, d0, d1, #1
vext.8 d17, d0, d1, #2
vext.8 d18, d0, d1, #3
@ -1084,35 +731,33 @@ function sgr_box5_h_8bpc_neon, export=1
vext.8 q9, q1, q2, #4
vext.8 q10, q1, q2, #6
vext.8 q11, q1, q2, #8
vaddl_u16_n q12, q13, d2, d3, d16, d17, \w
vaddl_u16_n q8, q9, d18, d19, d20, d21, \w
vaddw_u16_n q12, q13, d22, d23, \w
vadd_i32_n q12, q13, q8, q9, \w
vaddl.u16 q12, d2, d16
vaddl.u16 q13, d3, d17
vaddl.u16 q8, d18, d20
vaddl.u16 q9, d19, d21
vaddw.u16 q12, q12, d22
vaddw.u16 q13, q13, d23
vadd.i32 q12, q12, q8
vadd.i32 q13, q13, q9
vext.8 q8, q5, q6, #2
vext.8 q9, q5, q6, #4
vext.8 q10, q5, q6, #6
vext.8 q11, q5, q6, #8
.if \w > 4
vaddl_u16_n q1, q5, d10, d11, d16, d17, 8
vaddl_u16_n q8, q9, d18, d19, d20, d21, 8
vaddw_u16_n q1, q5, d22, d23, 8
vaddl.u16 q1, d10, d16
vaddl.u16 q5, d11, d17
vaddl.u16 q8, d18, d20
vaddl.u16 q9, d19, d21
vaddw.u16 q1, q1, d22
vaddw.u16 q5, q5, d23
vadd.i32 q10, q1, q8
vadd.i32 q11, q5, q9
.else
// Can't clobber q1/q5 if only doing 4 pixels
vaddl.u16 q8, d10, d16
vaddl.u16 q9, d18, d20
vaddw.u16 q8, q8, d22
vadd.i32 q10, q8, q9
.endif
.endm
add5 8
subs r5, r5, #8
vst1.16 {q3}, [r1, :128]!
vst1.16 {q7}, [r11, :128]!
vst1.32 {q12, q13}, [r0, :128]!
vst1.32 {q10, q11}, [r10, :128]!
subs r5, r5, #8
ble 9f
tst r7, #2 // LR_HAVE_RIGHT
vld1.8 {d6}, [r3]!
@ -1126,98 +771,6 @@ function sgr_box5_h_8bpc_neon, export=1
bne 4b // If we don't need to pad, just keep summing.
b 3b // If we need to pad, check how many pixels we have left.
5: // Produce 4 pixels, 7 <= w < 11
add5 4
vst1.16 {d6}, [r1, :64]!
vst1.16 {d14}, [r11, :64]!
vst1.32 {q12}, [r0, :128]!
vst1.32 {q10}, [r10, :128]!
subs r5, r5, #4 // 3 <= w < 7
vext.8 q0, q0, q0, #4
vext.8 q4, q4, q4, #4
6: // Pad the right edge and produce the last few pixels.
// w < 7, w+1 pixels valid in q0/q4
sub lr, r5, #1
// lr = pixels valid - 2
adr r11, L(box5_variable_shift_tbl)
ldr lr, [r11, lr, lsl #2]
add r11, r11, lr
bx r11
.align 2
L(box5_variable_shift_tbl):
.word 22f - L(box5_variable_shift_tbl) + CONFIG_THUMB
.word 33f - L(box5_variable_shift_tbl) + CONFIG_THUMB
.word 44f - L(box5_variable_shift_tbl) + CONFIG_THUMB
.word 55f - L(box5_variable_shift_tbl) + CONFIG_THUMB
.word 66f - L(box5_variable_shift_tbl) + CONFIG_THUMB
.word 77f - L(box5_variable_shift_tbl) + CONFIG_THUMB
// Shift q0 right, shifting out invalid pixels,
// shift q0 left to the original offset, shifting in padding pixels.
22: // 2 pixels valid
vext.8 q0, q0, q0, #2
vext.8 q4, q4, q4, #2
vext.8 q0, q0, q14, #14
vext.8 q4, q4, q15, #14
b 88f
33: // 3 pixels valid
vext.8 q0, q0, q0, #3
vext.8 q4, q4, q4, #3
vext.8 q0, q0, q14, #13
vext.8 q4, q4, q15, #13
b 88f
44: // 4 pixels valid
vext.8 q0, q0, q0, #4
vext.8 q4, q4, q4, #4
vext.8 q0, q0, q14, #12
vext.8 q4, q4, q15, #12
b 88f
55: // 5 pixels valid
vext.8 q0, q0, q0, #5
vext.8 q4, q4, q4, #5
vext.8 q0, q0, q14, #11
vext.8 q4, q4, q15, #11
b 88f
66: // 6 pixels valid
vext.8 q0, q0, q0, #6
vext.8 q4, q4, q4, #6
vext.8 q0, q0, q14, #10
vext.8 q4, q4, q15, #10
b 88f
77: // 7 pixels valid
vext.8 q0, q0, q0, #7
vext.8 q4, q4, q4, #7
vext.8 q0, q0, q14, #9
vext.8 q4, q4, q15, #9
88:
// Restore r11 after using it for a temporary value above
add r11, r1, #(2*SUM_STRIDE)
vmull.u8 q1, d0, d0
vmull.u8 q2, d1, d1
vmull.u8 q5, d8, d8
vmull.u8 q6, d9, d9
add5 4
subs r5, r5, #4
vst1.16 {d6}, [r1, :64]!
vst1.16 {d14}, [r11, :64]!
vst1.32 {q12}, [r0, :128]!
vst1.32 {q10}, [r10, :128]!
ble 9f
vext.8 q0, q0, q0, #4
vext.8 q1, q1, q2, #8
vext.8 q4, q4, q4, #4
vext.8 q5, q5, q6, #8
add5 4
vst1.16 {d6}, [r1, :64]!
vst1.16 {d14}, [r11, :64]!
vst1.32 {q12}, [r0, :128]!
vst1.32 {q10}, [r10, :128]!
9:
subs r6, r6, #2
ble 0f
@ -1233,7 +786,6 @@ L(box5_variable_shift_tbl):
0:
vpop {q4-q7}
pop {r4-r11,pc}
.purgem add5
endfunc
sgr_funcs 8

third_party/dav1d/src/arm/32/looprestoration16.S (vendored)

@ -28,6 +28,18 @@
#include "src/arm/asm.S"
#include "util.S"
const right_ext_mask_buf
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
right_ext_mask:
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst
// void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4],
// const pixel *src, ptrdiff_t stride,
// const int16_t fh[7], const intptr_t w,
@ -55,27 +67,19 @@ function wiener_filter_h_16bpc_neon, export=1
bic r10, r10, #7
lsl r10, r10, #1
// Clear the last unused element of q0, to allow filtering a single
// pixel with one plain vmul+vpadd.
mov r12, #0
vmov.16 d1[3], r12
// Set up pointers for reading/writing alternate rows
add r12, r0, r10
lsl r10, r10, #1
add lr, r2, r3
lsl r3, r3, #1
// Subtract the width from mid_stride
sub r10, r10, r5, lsl #1
// For w >= 8, we read (w+5)&~7+8 pixels, for w < 8 we read 16 pixels.
cmp r5, #8
add r11, r5, #13
// Subtract the aligned width from mid_stride
add r11, r5, #7
bic r11, r11, #7
bge 1f
mov r11, #16
1:
sub r10, r10, r11, lsl #1
// Subtract the number of pixels read from the source stride
add r11, r11, #8
sub r3, r3, r11, lsl #1
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
@ -143,54 +147,62 @@ function wiener_filter_h_16bpc_neon, export=1
vdup.16 q11, r11
vdup.16 q12, r9
3: // !LR_HAVE_RIGHT
// If we'll have to pad the right edge we need to quit early here.
// Check whether we need to pad the right edge
cmp r5, #11
bge 4f // If w >= 11, all used input pixels are valid
cmp r5, #7
bge 5f // If w >= 7, we can filter 4 pixels
b 6f
// 1 <= w < 11, w+3 pixels valid in q2-q3. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// Insert padding in q2/3.h[w+3] onwards; fuse the +3 (*2) into the
// buffer pointer.
movrel_local r4, right_ext_mask, -6
sub r4, r4, r5, lsl #1
vld1.8 {q9, q10}, [r4]
vbit q2, q11, q9
vbit q3, q11, q10
vbit q4, q12, q9
vbit q5, q12, q10
4: // Loop horizontally
vext.8 q8, q2, q3, #2
vext.8 q9, q2, q3, #4
vext.8 q10, q2, q3, #6
vmull.s16 q6, d4, d0[0]
vmlal.s16 q6, d16, d0[1]
vmlal.s16 q6, d18, d0[2]
vmlal.s16 q6, d20, d0[3]
vmull.s16 q7, d5, d0[0]
vmlal.s16 q7, d17, d0[1]
vmlal.s16 q7, d19, d0[2]
vmlal.s16 q7, d21, d0[3]
vext.8 q7, q2, q3, #4
vext.8 q8, q2, q3, #8
vext.8 q6, q2, q3, #2
vext.8 q9, q2, q3, #10
vext.8 q10, q2, q3, #12
vadd.i16 q8, q8, q7
vadd.i16 q9, q9, q6
vext.8 q6, q2, q3, #12
vext.8 q7, q2, q3, #6
vadd.i16 q2, q2, q6
vmull.s16 q6, d14, d0[3]
vmlal.s16 q6, d16, d1[0]
vmlal.s16 q6, d18, d1[1]
vmlal.s16 q6, d20, d1[2]
vmlal.s16 q6, d4, d1[2]
vmull.s16 q7, d15, d0[3]
vmlal.s16 q7, d17, d1[0]
vmlal.s16 q7, d19, d1[1]
vmlal.s16 q7, d21, d1[2]
vext.8 q2, q4, q5, #2
vext.8 q10, q4, q5, #6
vmull.s16 q8, d8, d0[0]
vmlal.s16 q8, d4, d0[1]
vmlal.s16 q8, d20, d0[3]
vmull.s16 q9, d9, d0[0]
vmlal.s16 q9, d5, d0[1]
vmlal.s16 q9, d21, d0[3]
vext.8 q2, q4, q5, #4
vmlal.s16 q7, d5, d1[2]
vext.8 q8, q4, q5, #4
vext.8 q10, q4, q5, #8
vmlal.s16 q8, d4, d0[2]
vmlal.s16 q8, d20, d1[0]
vmlal.s16 q9, d5, d0[2]
vmlal.s16 q9, d21, d1[0]
vext.8 q9, q4, q5, #2
vext.8 q2, q4, q5, #10
vext.8 q10, q4, q5, #12
vadd.i16 q10, q10, q8
vadd.i16 q2, q2, q9
vext.8 q8, q4, q5, #12
vext.8 q9, q4, q5, #6
vadd.i16 q4, q4, q8
vmull.s16 q8, d18, d0[3]
vmlal.s16 q8, d20, d1[0]
vmlal.s16 q8, d4, d1[1]
vmlal.s16 q8, d20, d1[2]
vmlal.s16 q8, d8, d1[2]
vmull.s16 q9, d19, d0[3]
vmlal.s16 q9, d21, d1[0]
vmlal.s16 q9, d5, d1[1]
vmlal.s16 q9, d21, d1[2]
vmlal.s16 q9, d9, d1[2]
vmvn.i16 q10, #0x8000 // 0x7fff = (1 << 15) - 1
vadd.i32 q6, q6, q14
@ -209,10 +221,10 @@ function wiener_filter_h_16bpc_neon, export=1
vmin.u16 q7, q7, q10
vsub.i16 q6, q6, q15
vsub.i16 q7, q7, q15
subs r5, r5, #8
vst1.16 {q6}, [r0, :128]!
vst1.16 {q7}, [r12, :128]!
subs r5, r5, #8
ble 9f
tst r7, #2 // LR_HAVE_RIGHT
vmov q2, q3
@ -222,148 +234,6 @@ function wiener_filter_h_16bpc_neon, export=1
bne 4b // If we don't need to pad, just keep filtering.
b 3b // If we need to pad, check how many pixels we have left.
5: // Filter 4 pixels, 7 <= w < 11
.macro filter_4
vext.8 d18, d4, d5, #6
vext.8 d16, d4, d5, #2
vext.8 d17, d4, d5, #4
vext.8 d19, d5, d6, #2
vext.8 d20, d5, d6, #4
vmull.s16 q6, d4, d0[0]
vmlal.s16 q6, d16, d0[1]
vmlal.s16 q6, d17, d0[2]
vmlal.s16 q6, d18, d0[3]
vmlal.s16 q6, d5, d1[0]
vmlal.s16 q6, d19, d1[1]
vmlal.s16 q6, d20, d1[2]
vext.8 d18, d8, d9, #6
vext.8 d16, d8, d9, #2
vext.8 d17, d8, d9, #4
vext.8 d19, d9, d10, #2
vext.8 d20, d9, d10, #4
vmull.s16 q7, d8, d0[0]
vmlal.s16 q7, d16, d0[1]
vmlal.s16 q7, d17, d0[2]
vmlal.s16 q7, d18, d0[3]
vmlal.s16 q7, d9, d1[0]
vmlal.s16 q7, d19, d1[1]
vmlal.s16 q7, d20, d1[2]
vmvn.i16 q10, #0x8000 // 0x7fff = (1 << 15) - 1
vadd.i32 q6, q6, q14
vadd.i32 q7, q7, q14
vrshl.s32 q6, q6, q13
vrshl.s32 q7, q7, q13
vqmovun.s32 d12, q6
vqmovun.s32 d13, q7
vmin.u16 q6, q6, q10
vsub.i16 q6, q6, q15
.endm
filter_4
vst1.16 {d12}, [r0, :64]!
vst1.16 {d13}, [r12, :64]!
subs r5, r5, #4 // 3 <= w < 7
vext.8 q2, q2, q3, #8
vext.8 q3, q3, q3, #8
vext.8 q4, q4, q5, #8
vext.8 q5, q5, q5, #8
6: // Pad the right edge and filter the last few pixels.
// w < 7, w+3 pixels valid in q2-q3
cmp r5, #5
blt 7f
bgt 8f
// w == 5, 8 pixels valid in q2, q3 invalid
vmov q3, q11
vmov q5, q12
b 88f
7: // 1 <= w < 5, 4-7 pixels valid in q2
sub r9, r5, #1
// r9 = (pixels valid - 4)
adr r11, L(variable_shift_tbl)
ldr r9, [r11, r9, lsl #2]
add r11, r11, r9
vmov q3, q11
vmov q5, q12
bx r11
.align 2
L(variable_shift_tbl):
.word 44f - L(variable_shift_tbl) + CONFIG_THUMB
.word 55f - L(variable_shift_tbl) + CONFIG_THUMB
.word 66f - L(variable_shift_tbl) + CONFIG_THUMB
.word 77f - L(variable_shift_tbl) + CONFIG_THUMB
44: // 4 pixels valid in q2/q4, fill the high half with padding.
vmov d5, d6
vmov d9, d10
b 88f
// Shift q2 right, shifting out invalid pixels,
// shift q2 left to the original offset, shifting in padding pixels.
55: // 5 pixels valid
vext.8 q2, q2, q2, #10
vext.8 q2, q2, q3, #6
vext.8 q4, q4, q4, #10
vext.8 q4, q4, q5, #6
b 88f
66: // 6 pixels valid
vext.8 q2, q2, q2, #12
vext.8 q2, q2, q3, #4
vext.8 q4, q4, q4, #12
vext.8 q4, q4, q5, #4
b 88f
77: // 7 pixels valid
vext.8 q2, q2, q2, #14
vext.8 q2, q2, q3, #2
vext.8 q4, q4, q4, #14
vext.8 q4, q4, q5, #2
b 88f
8: // w > 5, w == 6, 9 pixels valid in q2-q3, 1 pixel valid in q3
vext.8 q3, q3, q3, #2
vext.8 q3, q3, q11, #14
vext.8 q5, q5, q5, #2
vext.8 q5, q5, q12, #14
88:
// w < 7, q2-q3 padded properly
cmp r5, #4
blt 888f
// w >= 4, filter 4 pixels
filter_4
vst1.16 {d12}, [r0, :64]!
vst1.16 {d13}, [r12, :64]!
subs r5, r5, #4 // 0 <= w < 4
vext.8 q2, q2, q3, #8
vext.8 q4, q4, q5, #8
beq 9f
888: // 1 <= w < 4, filter 1 pixel at a time
vmull.s16 q6, d4, d0
vmull.s16 q7, d5, d1
vmull.s16 q8, d8, d0
vmull.s16 q9, d9, d1
vadd.i32 q6, q7
vadd.i32 q8, q9
vpadd.i32 d12, d12, d13
vpadd.i32 d13, d16, d17
vpadd.i32 d12, d12, d13
vadd.i32 d12, d12, d28
vmvn.i16 d20, #0x8000 // 0x7fff = (1 << 15) - 1
vrshl.s32 d12, d12, d26
vqmovun.s32 d12, q6
vmin.u16 d12, d12, d20
vsub.i16 d12, d12, d30
vst1.16 {d12[0]}, [r0, :16]!
vst1.16 {d12[1]}, [r12, :16]!
subs r5, r5, #1
vext.8 q2, q2, q3, #2
vext.8 q4, q4, q5, #2
bgt 888b
9:
subs r6, r6, #2
ble 0f
@ -377,7 +247,6 @@ L(variable_shift_tbl):
0:
vpop {q4-q7}
pop {r4-r11,pc}
.purgem filter_4
endfunc
// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, ptrdiff_t stride,
@ -457,7 +326,7 @@ function wiener_filter_v_16bpc_neon, export=1
vqmovun.s32 d4, q2
vqmovun.s32 d5, q3
vmin.u16 q2, q2, q5 // bitdepth_max
vst1.16 {q2}, [r0], r1
vst1.16 {q2}, [r0, :128], r1
.if \compare
cmp r4, #4
.else
@ -567,143 +436,6 @@ function wiener_filter_v_16bpc_neon, export=1
.purgem filter
endfunc
// void dav1d_copy_narrow_16bpc_neon(pixel *dst, ptrdiff_t stride,
// const pixel *src, int w, int h);
function copy_narrow_16bpc_neon, export=1
push {r4,lr}
ldr r4, [sp, #8]
adr r12, L(copy_narrow_tbl)
ldr r3, [r12, r3, lsl #2]
add r12, r12, r3
bx r12
.align 2
L(copy_narrow_tbl):
.word 0
.word 10f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 20f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 30f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 40f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 50f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 60f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 70f - L(copy_narrow_tbl) + CONFIG_THUMB
10:
add r3, r0, r1
lsl r1, r1, #1
18:
subs r4, r4, #8
blt 110f
vld1.16 {q0}, [r2, :128]!
vst1.16 {d0[0]}, [r0, :16], r1
vst1.16 {d0[1]}, [r3, :16], r1
vst1.16 {d0[2]}, [r0, :16], r1
vst1.16 {d0[3]}, [r3, :16], r1
vst1.16 {d1[0]}, [r0, :16], r1
vst1.16 {d1[1]}, [r3, :16], r1
vst1.16 {d1[2]}, [r0, :16], r1
vst1.16 {d1[3]}, [r3, :16], r1
ble 0f
b 18b
110:
add r4, r4, #8
asr r1, r1, #1
11:
subs r4, r4, #1
vld1.16 {d0[]}, [r2]!
vst1.16 {d0[0]}, [r0], r1
bgt 11b
0:
pop {r4,pc}
20:
add r3, r0, r1
lsl r1, r1, #1
24:
subs r4, r4, #4
blt 210f
vld1.32 {q0}, [r2, :128]!
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[1]}, [r3, :32], r1
vst1.32 {d1[0]}, [r0, :32], r1
vst1.32 {d1[1]}, [r3, :32], r1
ble 0f
b 24b
210:
add r4, r4, #4
asr r1, r1, #1
22:
subs r4, r4, #1
vld1.32 {d0[]}, [r2, :32]!
vst1.32 {d0[0]}, [r0, :32], r1
bgt 22b
0:
pop {r4,pc}
30:
ldr r3, [r2]
ldrh r12, [r2, #4]
add r2, r2, #6
subs r4, r4, #1
str r3, [r0]
strh r12, [r0, #4]
add r0, r0, r1
bgt 30b
pop {r4,pc}
40:
add r3, r0, r1
lsl r1, r1, #1
42:
subs r4, r4, #2
blt 41f
vld1.16 {q0}, [r2, :128]!
vst1.16 {d0}, [r0, :64], r1
vst1.16 {d1}, [r3, :64], r1
ble 0f
b 42b
41:
vld1.16 {d0}, [r2, :64]
vst1.16 {d0}, [r0, :64]
0:
pop {r4,pc}
50:
vld1.16 {d0}, [r2]
ldrh r12, [r2, #8]
add r2, r2, #10
subs r4, r4, #1
vst1.16 {d0}, [r0]
strh r12, [r0, #8]
add r0, r0, r1
bgt 50b
pop {r4,pc}
60:
vld1.16 {d0}, [r2]
ldr r12, [r2, #8]
add r2, r2, #12
subs r4, r4, #1
vst1.16 {d0}, [r0]
str r12, [r0, #8]
add r0, r0, r1
bgt 60b
pop {r4,pc}
70:
vld1.16 {d0}, [r2]
ldr r12, [r2, #8]
ldrh lr, [r2, #12]
add r2, r2, #14
subs r4, r4, #1
vst1.16 {d0}, [r0]
str r12, [r0, #8]
strh lr, [r0, #12]
add r0, r0, r1
bgt 70b
pop {r4,pc}
endfunc
#define SUM_STRIDE (384+16)
#include "looprestoration_tmpl.S"
@ -728,25 +460,15 @@ function sgr_box3_h_16bpc_neon, export=1
mov r9, #(2*2*SUM_STRIDE) // double sum stride
// Subtract the aligned width from the output stride.
// With LR_HAVE_RIGHT, align to 8, without it, align to 4.
tst r7, #2 // LR_HAVE_RIGHT
bne 0f
// !LR_HAVE_RIGHT
add lr, r5, #3
bic lr, lr, #3
b 1f
0:
add lr, r5, #7
bic lr, lr, #7
1:
sub r9, r9, lr, lsl #1
// Store the width for the vertical loop
mov r8, r5
// Subtract the number of pixels read from the input from the stride
add lr, r5, #14
bic lr, lr, #7
add lr, lr, #8
sub r4, r4, lr, lsl #1
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
@ -815,16 +537,26 @@ function sgr_box3_h_16bpc_neon, export=1
// Restore r11 after using it for a temporary value
add r11, r1, #(2*SUM_STRIDE)
3: // !LR_HAVE_RIGHT
// If we'll have to pad the right edge we need to quit early here.
// Check whether we need to pad the right edge
cmp r5, #10
bge 4f // If w >= 10, all used input pixels are valid
cmp r5, #6
bge 5f // If w >= 6, we can filter 4 pixels
b 6f
// 1 <= w < 10, w pixels valid in q0-q1. For w=9, this ends up called
// again; it's not strictly needed in those cases (we pad enough here),
// but keeping the code as simple as possible.
// Insert padding in q0/1.h[w] onwards
movrel_local lr, right_ext_mask
sub lr, lr, r5, lsl #1
vld1.8 {q12, q13}, [lr]
vbit q0, q14, q12
vbit q1, q14, q13
vbit q4, q15, q12
vbit q5, q15, q13
4: // Loop horizontally
.macro add3 w
.if \w > 4
vext.8 q8, q0, q1, #2
vext.8 q10, q4, q5, #2
vext.8 q9, q0, q1, #4
@ -833,16 +565,6 @@ function sgr_box3_h_16bpc_neon, export=1
vadd.i16 q3, q4, q10
vadd.i16 q2, q2, q9
vadd.i16 q3, q3, q11
.else
vext.8 d16, d0, d1, #2
vext.8 d20, d8, d9, #2
vext.8 d18, d0, d1, #4
vext.8 d22, d8, d9, #4
vadd.i16 d4, d0, d16
vadd.i16 d6, d8, d20
vadd.i16 d4, d4, d18
vadd.i16 d6, d6, d22
.endif
vmull.u16 q6, d0, d0
vmlal.u16 q6, d16, d16
@ -850,22 +572,18 @@ function sgr_box3_h_16bpc_neon, export=1
vmull.u16 q12, d8, d8
vmlal.u16 q12, d20, d20
vmlal.u16 q12, d22, d22
.if \w > 4
vmull.u16 q7, d1, d1
vmlal.u16 q7, d17, d17
vmlal.u16 q7, d19, d19
vmull.u16 q13, d9, d9
vmlal.u16 q13, d21, d21
vmlal.u16 q13, d23, d23
.endif
.endm
add3 8
subs r5, r5, #8
vst1.16 {q2}, [r1, :128]!
vst1.16 {q3}, [r11, :128]!
vst1.32 {q6, q7}, [r0, :128]!
vst1.32 {q12, q13}, [r10, :128]!
subs r5, r5, #8
ble 9f
tst r7, #2 // LR_HAVE_RIGHT
vmov q0, q1
@ -876,78 +594,6 @@ function sgr_box3_h_16bpc_neon, export=1
bne 4b // If we don't need to pad, just keep summing.
b 3b // If we need to pad, check how many pixels we have left.
5: // Produce 4 pixels, 6 <= w < 10
add3 4
vst1.16 {d4}, [r1, :64]!
vst1.16 {d6}, [r11, :64]!
vst1.32 {q6}, [r0, :128]!
vst1.32 {q12}, [r10, :128]!
subs r5, r5, #4 // 2 <= w < 6
vext.8 q0, q0, q1, #8
vext.8 q4, q4, q5, #8
6: // Pad the right edge and produce the last few pixels.
// 2 <= w < 6, 2-5 pixels valid in q0
sub lr, r5, #2
// lr = (pixels valid - 2)
adr r11, L(box3_variable_shift_tbl)
ldr lr, [r11, lr, lsl #2]
add r11, r11, lr
bx r11
.align 2
L(box3_variable_shift_tbl):
.word 22f - L(box3_variable_shift_tbl) + CONFIG_THUMB
.word 33f - L(box3_variable_shift_tbl) + CONFIG_THUMB
.word 44f - L(box3_variable_shift_tbl) + CONFIG_THUMB
.word 55f - L(box3_variable_shift_tbl) + CONFIG_THUMB
// Shift q0 right, shifting out invalid pixels,
// shift q0 left to the original offset, shifting in padding pixels.
22: // 2 pixels valid
vext.8 q0, q0, q0, #4
vext.8 q4, q4, q4, #4
vext.8 q0, q0, q14, #12
vext.8 q4, q4, q15, #12
b 88f
33: // 3 pixels valid
vext.8 q0, q0, q0, #6
vext.8 q4, q4, q4, #6
vext.8 q0, q0, q14, #10
vext.8 q4, q4, q15, #10
b 88f
44: // 4 pixels valid
vmov d1, d28
vmov d9, d30
b 88f
55: // 5 pixels valid
vext.8 q0, q0, q0, #10
vext.8 q4, q4, q4, #10
vext.8 q0, q0, q14, #6
vext.8 q4, q4, q15, #6
88:
// Restore r11 after using it for a temporary value above
add r11, r1, #(2*SUM_STRIDE)
add3 4
subs r5, r5, #4
vst1.16 {d4}, [r1, :64]!
vst1.16 {d6}, [r11, :64]!
vst1.32 {q6}, [r0, :128]!
vst1.32 {q12}, [r10, :128]!
ble 9f
vext.8 q0, q0, q0, #8
vext.8 q4, q4, q4, #8
// Only one needed pixel left, but do a normal 4 pixel
// addition anyway
add3 4
vst1.16 {d4}, [r1, :64]!
vst1.16 {d6}, [r11, :64]!
vst1.32 {q6}, [r0, :128]!
vst1.32 {q12}, [r10, :128]!
9:
subs r6, r6, #2
ble 0f
@ -963,7 +609,6 @@ L(box3_variable_shift_tbl):
0:
vpop {q4-q7}
pop {r4-r11,pc}
.purgem add3
endfunc
// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
@ -986,23 +631,11 @@ function sgr_box5_h_16bpc_neon, export=1
mov r9, #(2*2*SUM_STRIDE) // double sum stride
// Subtract the aligned width from the output stride.
// With LR_HAVE_RIGHT, align to 8, without it, align to 4.
// Subtract the number of pixels read from the input from the stride.
tst r7, #2 // LR_HAVE_RIGHT
bne 0f
// !LR_HAVE_RIGHT
add lr, r5, #3
bic lr, lr, #3
add r8, r5, #13
b 1f
0:
add lr, r5, #7
bic lr, lr, #7
add r8, r5, #15
1:
sub r9, r9, lr, lsl #1
bic r8, r8, #7
sub r4, r4, r8, lsl #1
add lr, lr, #8
sub r4, r4, lr, lsl #1
// Store the width for the vertical loop
mov r8, r5
@ -1072,16 +705,27 @@ function sgr_box5_h_16bpc_neon, export=1
// Restore r11 after using it for a temporary value
add r11, r1, #(2*SUM_STRIDE)
3: // !LR_HAVE_RIGHT
// If we'll have to pad the right edge we need to quit early here.
// Check whether we need to pad the right edge
cmp r5, #11
bge 4f // If w >= 11, all used input pixels are valid
cmp r5, #7
bge 5f // If w >= 7, we can produce 4 pixels
b 6f
// 1 <= w < 11, w+1 pixels valid in q0-q1. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// Insert padding in q0/1.h[w+1] onwards; fuse the +1 into the
// buffer pointer.
movrel_local lr, right_ext_mask, -2
sub lr, lr, r5, lsl #1
vld1.8 {q12, q13}, [lr]
vbit q0, q14, q12
vbit q1, q14, q13
vbit q4, q15, q12
vbit q5, q15, q13
4: // Loop horizontally
.macro add5 w
.if \w > 4
vext.8 q8, q0, q1, #2
vext.8 q10, q4, q5, #2
vext.8 q9, q0, q1, #4
@ -1090,16 +734,6 @@ function sgr_box5_h_16bpc_neon, export=1
vadd.i16 q3, q4, q10
vadd.i16 q2, q2, q9
vadd.i16 q3, q3, q11
.else
vext.8 d16, d0, d1, #2
vext.8 d20, d8, d9, #2
vext.8 d18, d0, d1, #4
vext.8 d22, d8, d9, #4
vadd.i16 d4, d0, d16
vadd.i16 d6, d8, d20
vadd.i16 d4, d4, d18
vadd.i16 d6, d6, d22
.endif
vmull.u16 q6, d0, d0
vmlal.u16 q6, d16, d16
@ -1107,16 +741,13 @@ function sgr_box5_h_16bpc_neon, export=1
vmull.u16 q12, d8, d8
vmlal.u16 q12, d20, d20
vmlal.u16 q12, d22, d22
.if \w > 4
vmull.u16 q7, d1, d1
vmlal.u16 q7, d17, d17
vmlal.u16 q7, d19, d19
vmull.u16 q13, d9, d9
vmlal.u16 q13, d21, d21
vmlal.u16 q13, d23, d23
.endif
.if \w > 4
vext.8 q8, q0, q1, #6
vext.8 q10, q4, q5, #6
vext.8 q9, q0, q1, #8
@ -1125,35 +756,22 @@ function sgr_box5_h_16bpc_neon, export=1
vadd.i16 q3, q3, q10
vadd.i16 q2, q2, q9
vadd.i16 q3, q3, q11
.else
vext.8 d16, d0, d1, #6
// d18 would be equal to d1; using d1 instead
vext.8 d20, d8, d9, #6
// d22 would be equal to d9; using d9 instead
vadd.i16 d4, d4, d16
vadd.i16 d6, d6, d20
vadd.i16 d4, d4, d1
vadd.i16 d6, d6, d9
.endif
vmlal.u16 q6, d16, d16
vmlal.u16 q6, d1, d1
vmlal.u16 q12, d20, d20
vmlal.u16 q12, d9, d9
.if \w > 4
vmlal.u16 q7, d17, d17
vmlal.u16 q7, d19, d19
vmlal.u16 q13, d21, d21
vmlal.u16 q13, d23, d23
.endif
.endm
add5 8
subs r5, r5, #8
vst1.16 {q2}, [r1, :128]!
vst1.16 {q3}, [r11, :128]!
vst1.32 {q6, q7}, [r0, :128]!
vst1.32 {q12, q13}, [r10, :128]!
subs r5, r5, #8
ble 9f
tst r7, #2 // LR_HAVE_RIGHT
vmov q0, q1
@ -1163,92 +781,6 @@ function sgr_box5_h_16bpc_neon, export=1
bne 4b // If we don't need to pad, just keep summing.
b 3b // If we need to pad, check how many pixels we have left.
5: // Produce 4 pixels, 7 <= w < 11
add5 4
vst1.16 {d4}, [r1, :64]!
vst1.16 {d6}, [r11, :64]!
vst1.32 {q6}, [r0, :128]!
vst1.32 {q12}, [r10, :128]!
subs r5, r5, #4 // 3 <= w < 7
vext.8 q0, q0, q1, #8
vext.8 q4, q4, q5, #8
6: // Pad the right edge and produce the last few pixels.
// w < 7, w+1 pixels valid in q0/q4
sub lr, r5, #1
// lr = pixels valid - 2
adr r11, L(box5_variable_shift_tbl)
ldr lr, [r11, lr, lsl #2]
vmov q1, q14
vmov q5, q15
add r11, r11, lr
bx r11
.align 2
L(box5_variable_shift_tbl):
.word 22f - L(box5_variable_shift_tbl) + CONFIG_THUMB
.word 33f - L(box5_variable_shift_tbl) + CONFIG_THUMB
.word 44f - L(box5_variable_shift_tbl) + CONFIG_THUMB
.word 55f - L(box5_variable_shift_tbl) + CONFIG_THUMB
.word 66f - L(box5_variable_shift_tbl) + CONFIG_THUMB
.word 77f - L(box5_variable_shift_tbl) + CONFIG_THUMB
// Shift q0 right, shifting out invalid pixels,
// shift q0 left to the original offset, shifting in padding pixels.
22: // 2 pixels valid
vext.8 q0, q0, q0, #4
vext.8 q4, q4, q4, #4
vext.8 q0, q0, q14, #12
vext.8 q4, q4, q15, #12
b 88f
33: // 3 pixels valid
vext.8 q0, q0, q0, #6
vext.8 q4, q4, q4, #6
vext.8 q0, q0, q14, #10
vext.8 q4, q4, q15, #10
b 88f
44: // 4 pixels valid
vmov d1, d28
vmov d9, d30
b 88f
55: // 5 pixels valid
vext.8 q0, q0, q0, #10
vext.8 q4, q4, q4, #10
vext.8 q0, q0, q14, #6
vext.8 q4, q4, q15, #6
b 88f
66: // 6 pixels valid
vext.8 q0, q0, q0, #12
vext.8 q4, q4, q4, #12
vext.8 q0, q0, q14, #4
vext.8 q4, q4, q15, #4
b 88f
77: // 7 pixels valid
vext.8 q0, q0, q0, #14
vext.8 q4, q4, q4, #14
vext.8 q0, q0, q14, #2
vext.8 q4, q4, q15, #2
88:
// Restore r11 after using it for a temporary value above
add r11, r1, #(2*SUM_STRIDE)
add5 4
subs r5, r5, #4
vst1.16 {d4}, [r1, :64]!
vst1.16 {d6}, [r11, :64]!
vst1.32 {q6}, [r0, :128]!
vst1.32 {q12}, [r10, :128]!
ble 9f
vext.8 q0, q0, q1, #8
vext.8 q4, q4, q5, #8
add5 4
vst1.16 {d4}, [r1, :64]!
vst1.16 {d6}, [r11, :64]!
vst1.32 {q6}, [r0, :128]!
vst1.32 {q12}, [r10, :128]!
9:
subs r6, r6, #2
ble 0f
@ -1264,7 +796,6 @@ L(box5_variable_shift_tbl):
0:
vpop {q4-q7}
pop {r4-r11,pc}
.purgem add5
endfunc
sgr_funcs 16

third_party/dav1d/src/arm/32/looprestoration_tmpl.S (vendored)

@ -389,8 +389,8 @@ function sgr_weighted1_\bpc\()bpc_neon, export=1
vrshrn.i32 d21, q11, #11
vqmovun.s16 d4, q2
vqmovun.s16 d20, q10
vst1.8 {d4}, [r0]!
vst1.8 {d20}, [r9]!
vst1.8 {d4}, [r0, :64]!
vst1.8 {d20}, [r9, :64]!
.else
vqrshrun.s32 d4, q2, #11
vqrshrun.s32 d5, q3, #11
@ -398,8 +398,8 @@ function sgr_weighted1_\bpc\()bpc_neon, export=1
vqrshrun.s32 d21, q11, #11
vmin.u16 q2, q2, q14
vmin.u16 q10, q10, q14
vst1.16 {q2}, [r0]!
vst1.16 {q10}, [r9]!
vst1.16 {q2}, [r0, :128]!
vst1.16 {q10}, [r9, :128]!
.endif
bgt 1b
@ -438,12 +438,12 @@ function sgr_weighted1_\bpc\()bpc_neon, export=1
vrshrn.i32 d4, q2, #11
vrshrn.i32 d5, q3, #11
vqmovun.s16 d2, q2
vst1.8 {d2}, [r0]!
vst1.8 {d2}, [r0, :64]!
.else
vqrshrun.s32 d4, q2, #11
vqrshrun.s32 d5, q3, #11
vmin.u16 q2, q2, q14
vst1.16 {q2}, [r0]!
vst1.16 {q2}, [r0, :128]!
.endif
bgt 2b
0:
@ -531,8 +531,8 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1
vrshrn.i32 d23, q8, #11
vqmovun.s16 d6, q3
vqmovun.s16 d22, q11
vst1.8 {d6}, [r0]!
vst1.8 {d22}, [r10]!
vst1.8 {d6}, [r0, :64]!
vst1.8 {d22}, [r10, :64]!
.else
vqrshrun.s32 d6, q3, #11
vqrshrun.s32 d7, q0, #11
@ -540,8 +540,8 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1
vqrshrun.s32 d23, q8, #11
vmin.u16 q3, q3, q14
vmin.u16 q11, q11, q14
vst1.16 {q3}, [r0]!
vst1.16 {q11}, [r10]!
vst1.16 {q3}, [r0, :128]!
vst1.16 {q11}, [r10, :128]!
.endif
bgt 1b
@ -586,12 +586,12 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1
vrshrn.i32 d6, q3, #11
vrshrn.i32 d7, q0, #11
vqmovun.s16 d6, q3
vst1.8 {d6}, [r0]!
vst1.8 {d6}, [r0, :64]!
.else
vqrshrun.s32 d6, q3, #11
vqrshrun.s32 d7, q0, #11
vmin.u16 q3, q3, q14
vst1.16 {q3}, [r0]!
vst1.16 {q3}, [r0, :128]!
.endif
bgt 1b
0:

118 third_party/dav1d/src/arm/32/mc.S (vendored)

@ -2966,8 +2966,8 @@ filter_fn prep, r0, r7, r1, r2, r3, r4, r5, r6, r8, r9, 6
.endm
.macro load_filter_coef dst, src, inc
vld1.8 {\dst}, [r12, :64]
add \src, \src, \inc
vld1.8 {\dst}, [r12, :64]
.endm
.macro load_filter_row dst, src, inc
@ -2978,72 +2978,57 @@ filter_fn prep, r0, r7, r1, r2, r3, r4, r5, r6, r8, r9, 6
function warp_filter_horz_neon
load_filter_ptr r5 // filter 0
vld1.16 {q7}, [r2], r3
vmov.i8 q6, #128
load_filter_coef d0, r5, r7 // filter 0
vmovl.u8 q6, d14 // original pixels
load_filter_row d2, r5, r7 // filter 1
vmovl.u8 q7, d15 // original pixels
load_filter_row d4, r5, r7 // filter 2
vmovl.s8 q0, d0 // filter 0
vext.8 q3, q6, q7, #2*1 // filter 1 pixels
load_filter_row d1, r5, r7 // filter 1
load_filter_row d2, r5, r7 // filter 2
load_filter_ptr r5 // filter 3
vmovl.s8 q1, d2 // filter 1
vmul.i16 q5, q6, q0 // filter 0 output
load_filter_coef d0, r5, r7 // filter 3
vmovl.s8 q2, d4 // filter 2
veor q7, q7, q6 // subtract by 128 to allow using vmull
load_filter_coef d3, r5, r7 // filter 3
vext.8 d12, d14, d15, #1 // filter 1 pixels
vext.8 d13, d14, d15, #2 // filter 2 pixels
load_filter_ptr r5 // filter 4
vext.8 q4, q6, q7, #2*2 // filter 2 pixels
vmul.i16 q3, q3, q1 // filter 1 output
load_filter_coef d2, r5, r7 // filter 4
vmul.i16 q4, q4, q2 // filter 2 output
vext.8 q2, q6, q7, #2*3 // filter 3 pixels
vmovl.s8 q0, d0 // filter 3
vpaddl.s16 q5, q5 // pixel 0 (4x32)
vpaddl.s16 q3, q3 // pixel 1 (4x32)
vmul.i16 q0, q2, q0 // filter 3 output
vmull.s8 q2, d14, d0 // filter 0 output
vmull.s8 q3, d12, d1 // filter 1 output
load_filter_coef d0, r5, r7 // filter 4
load_filter_ptr r5 // filter 5
vext.8 q2, q6, q7, #2*4 // filter 4 pixels
vmovl.s8 q1, d2 // filter 4
vpaddl.s16 q4, q4 // pixel 2 (4x32)
vpadd.s32 d10, d10, d11 // pixel 0 (2x32)
vpadd.s32 d11, d6, d7 // pixel 1 (2x32)
load_filter_coef d6, r5, r7 // filter 5
vmul.i16 q1, q2, q1 // filter 4 output
vpadd.s32 d8, d8, d9 // pixel 2 (2x32)
vext.8 d12, d14, d15, #3 // filter 3 pixels
vmull.s8 q4, d13, d2 // filter 2 output
vext.8 d13, d14, d15, #4 // filter 4 pixels
vpadd.i16 d4, d4, d5 // pixel 0 (4x16)
vpadd.i16 d5, d6, d7 // pixel 1 (4x16)
load_filter_coef d1, r5, r7 // filter 5
load_filter_ptr r5 // filter 6
vpaddl.s16 q0, q0 // pixel 3 (4x32)
vpadd.s32 d10, d10, d11 // pixel 0,1
vext.8 q2, q6, q7, #2*5 // filter 5 pixels
vmovl.s8 q3, d6 // filter 5
vpaddl.s16 q1, q1 // pixel 4 (4x32)
vpadd.s32 d9, d0, d1 // pixel 3 (2x32)
vmull.s8 q5, d12, d3 // filter 3 output
vext.8 d12, d14, d15, #5 // filter 5 pixels
vmull.s8 q3, d13, d0 // filter 4 output
load_filter_coef d0, r5, r7 // filter 6
vmul.i16 q2, q2, q3 // filter 5 output
vpadd.s32 d11, d8, d9 // pixel 2,3
vext.8 d13, d14, d15, #6 // filter 6 pixels
load_filter_ptr r5 // filter 7
vpaddl.s16 q2, q2 // pixel 5 (4x32)
vpadd.s32 d8, d2, d3 // pixel 4 (2x32)
vext.8 q3, q6, q7, #2*6 // filter 6 pixels
vmovl.s8 q0, d0 // filter 6
vpadd.s32 d9, d4, d5 // pixel 5 (2x32)
load_filter_coef d4, r5, r7 // filter 7
vpadd.s32 d8, d8, d9 // pixel 4,5
vext.8 q1, q6, q7, #2*7 // filter 7 pixels
vmovl.s8 q2, d4 // filter 7
vmul.i16 q3, q3, q0 // filter 6 output
vmul.i16 q1, q1, q2 // filter 7 output
vpadd.i16 d8, d8, d9 // pixel 2 (4x16)
vpadd.i16 d9, d10, d11 // pixel 3 (4x16)
vmull.s8 q5, d12, d1 // filter 5 output
load_filter_coef d1, r5, r7 // filter 7
vext.8 d14, d14, d15, #7 // filter 7 pixels
vpadd.i16 d6, d6, d7 // pixel 4 (4x16)
vpadd.i16 d10, d10, d11 // pixel 5 (4x16)
vmull.s8 q6, d13, d0 // filter 6 output
vmull.s8 q7, d14, d1 // filter 7 output
sub r5, r5, r7, lsl #3
vpaddl.s16 q3, q3 // pixel 6 (4x32)
vpaddl.s16 q1, q1 // pixel 7 (4x32)
vpadd.s32 d6, d6, d7 // pixel 6 (2x32)
vpadd.s32 d2, d2, d3 // pixel 7 (2x32)
vpadd.s32 d9, d6, d2 // pixel 6,7
vpadd.i16 d4, d4, d5 // pixel 0,1 (2x16)
vpadd.i16 d5, d8, d9 // pixel 2,3 (2x16)
vpadd.i16 d12, d12, d13 // pixel 6 (4x16)
vpadd.i16 d14, d14, d15 // pixel 7 (4x16)
vpadd.i16 d6, d6, d10 // pixel 4,5 (2x16)
vpadd.i16 d10, d12, d14 // pixel 6,7 (2x16)
vpadd.i16 d4, d4, d5 // pixel 0-3
vpadd.i16 d5, d6, d10 // pixel 4-7
add r5, r5, r8
vrshrn.s32 d10, q5, #3
vrshrn.s32 d11, q4, #3
bx lr
endfunc
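The reworked horizontal warp filter above recentres the pixels before multiplying: veor with 128 turns the unsigned bytes into signed values in [-128, 127] so vmull.s8 can be used, and the constant bias this introduces is removed later by the vmov.i16 q7, #128 (or #0x800 on the prep path) plus vadd in warp_affine_8x8. A scalar sketch of the recentred multiply, illustration only and assuming the warp filter taps sum to 128 as in AV1 (warp_h_recentred is a hypothetical name):
#include <stdint.h>
/* (px ^ 0x80) equals px - 128 for a uint8_t, so the recentred dot product is
 * the true dot product minus 128 * sum(taps); with taps summing to 128 this
 * is a fixed bias that a single add can undo after the final shifts. */
static int warp_h_recentred(const int8_t taps[8], const uint8_t px[8])
{
    int sum = 0;
    for (int i = 0; i < 8; i++)
        sum += taps[i] * (int8_t)(px[i] ^ 0x80); /* fits vmull.s8 */
    return sum; /* == true sum - 128 * (sum of taps) */
}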
@ -3074,23 +3059,23 @@ function warp_affine_8x8\t\()_8bpc_neon, export=1
add r6, r6, #512
bl warp_filter_horz_neon
vmov q8, q5
vrshr.s16 q8, q2, #3
bl warp_filter_horz_neon
vmov q9, q5
vrshr.s16 q9, q2, #3
bl warp_filter_horz_neon
vmov q10, q5
vrshr.s16 q10, q2, #3
bl warp_filter_horz_neon
vmov q11, q5
vrshr.s16 q11, q2, #3
bl warp_filter_horz_neon
vmov q12, q5
vrshr.s16 q12, q2, #3
bl warp_filter_horz_neon
vmov q13, q5
vrshr.s16 q13, q2, #3
bl warp_filter_horz_neon
vmov q14, q5
vrshr.s16 q14, q2, #3
1:
bl warp_filter_horz_neon
vmov q15, q5
vrshr.s16 q15, q2, #3
load_filter_row d8, r6, r9
load_filter_row d9, r6, r9
@ -3133,12 +3118,19 @@ function warp_affine_8x8\t\()_8bpc_neon, export=1
vmlal.s16 q1, d29, d5
vmlal.s16 q1, d31, d7
.ifb \t
vmov.i16 q7, #128
.else
vmov.i16 q7, #0x800
.endif
vmov q8, q9
vmov q9, q10
vqrshrn.s32 d0, q0, #\shift
vmov q10, q11
vqrshrn.s32 d1, q1, #\shift
vmov q11, q12
vadd.i16 q0, q0, q7
vmov q12, q13
.ifb \t
vqmovun.s16 d0, q0

2 third_party/dav1d/src/arm/32/mc16.S (vendored)

@ -3154,8 +3154,8 @@ filter_fn prep, r0, r8, r1, r2, r3, r4, r5, r6, r7, r9, r10
.endm
.macro load_filter_coef dst, src, inc
vld1.8 {\dst}, [r12, :64]
add \src, \src, \inc
vld1.8 {\dst}, [r12, :64]
.endm
.macro load_filter_row dst, src, inc

58 third_party/dav1d/src/arm/32/util.S (vendored)

@ -69,6 +69,56 @@
#endif
.endm
// This macro clobbers r7 (and r12 on windows) and stores data at the
// bottom of the stack; sp is the start of the space allocated that
// the caller can use.
.macro sub_sp_align space
#if CONFIG_THUMB
mov r7, sp
and r7, r7, #15
#else
and r7, sp, #15
#endif
sub sp, sp, r7
// Now the stack is aligned, store the amount of adjustment back
// on the stack, as we don't want to waste a register as frame
// pointer.
str r7, [sp, #-16]!
#ifdef _WIN32
.if \space > 8192
// Here, we'd need to touch two (or more) pages while decrementing
// the stack pointer.
.error "sub_sp_align doesn't support values over 8K at the moment"
.elseif \space > 4096
sub r7, sp, #4096
ldr r12, [r7]
sub r7, r7, #(\space - 4096)
mov sp, r7
.else
sub sp, sp, #\space
.endif
#else
.if \space >= 4096
sub sp, sp, #(\space)/4096*4096
.endif
.if (\space % 4096) != 0
sub sp, sp, #(\space)%4096
.endif
#endif
.endm
.macro add_sp_align space
.if \space >= 4096
add sp, sp, #(\space)/4096*4096
.endif
.if (\space % 4096) != 0
add sp, sp, #(\space)%4096
.endif
ldr r7, [sp], #16
// Add back the original stack adjustment
add sp, sp, r7
.endm
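In rough C terms, sub_sp_align aligns the stack pointer down to 16 bytes, stashes the alignment amount in the 16-byte slot it reserves just below, and then allocates the requested space (touching the intermediate page first on Windows); add_sp_align releases the space and re-applies the stored adjustment. A hypothetical model, illustration only and not part of the patch:
#include <stdint.h>
/* Model of sub_sp_align (not the real implementation): returns the new sp
 * after aligning, saving the pad amount, and reserving `space` bytes. */
static unsigned char *sub_sp_align_model(unsigned char *sp, unsigned space)
{
    unsigned pad = (unsigned)((uintptr_t)sp & 15); /* and r7, sp, #15     */
    sp -= pad;                                     /* sub sp, sp, r7      */
    sp -= 16;
    *(unsigned *)sp = pad;                         /* str r7, [sp, #-16]! */
    return sp - space;                             /* sub sp, sp, #space  */
}
The caller then uses the returned pointer as the start of its scratch area, and add_sp_align reads the stored pad back to restore the original sp.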
.macro transpose_8x8b q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7
vtrn.32 \q0, \q2
vtrn.32 \q1, \q3
@ -108,6 +158,14 @@
vtrn.8 \r2, \r3
.endm
.macro transpose_4x4s q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7
vswp \r1, \r4 // vtrn.64 \q0, \q2
vswp \r3, \r6 // vtrn.64 \q1, \q3
vtrn.32 \q0, \q1
vtrn.32 \q2, \q3
.endm
.macro transpose_4x4h q0, q1, r0, r1, r2, r3
vtrn.32 \q0, \q1

33 third_party/dav1d/src/arm/64/cdef.S (vendored)

@ -363,10 +363,8 @@ find_dir 8
neg v20.16b, v21.16b // -imin()
bsl v18.16b, v16.16b, v17.16b // constrain() = apply_sign()
bsl v22.16b, v20.16b, v21.16b // constrain() = apply_sign()
smlal v1.8h, v18.8b, v19.8b // sum += taps[k] * constrain()
smlal v1.8h, v22.8b, v19.8b // sum += taps[k] * constrain()
smlal2 v2.8h, v18.16b, v19.16b // sum += taps[k] * constrain()
smlal2 v2.8h, v22.16b, v19.16b // sum += taps[k] * constrain()
mla v1.16b, v18.16b, v19.16b // sum += taps[k] * constrain()
mla v2.16b, v22.16b, v19.16b // sum += taps[k] * constrain()
.endm
// void cdef_filterX_edged_8bpc_neon(pixel *dst, ptrdiff_t dst_stride,
@ -418,8 +416,11 @@ function cdef_filter\w\suffix\()_edged_8bpc_neon
ld1 {v0.s}[3], [x14] // px
.endif
movi v1.8h, #0 // sum
movi v2.8h, #0 // sum
// We need 9 bits or two 8-bit accumulators to fit the sum.
// Max of |sum| > 15*2*6(pri) + 4*4*3(sec) = 228.
// Start sum at -1 instead of 0 to help handle rounding later.
movi v1.16b, #255 // sum
movi v2.16b, #0 // sum
.if \min
mov v3.16b, v0.16b // min
mov v4.16b, v0.16b // max
@ -468,16 +469,16 @@ function cdef_filter\w\suffix\()_edged_8bpc_neon
.endif
b.ne 2b
sshr v5.8h, v1.8h, #15 // -(sum < 0)
sshr v6.8h, v2.8h, #15 // -(sum < 0)
add v1.8h, v1.8h, v5.8h // sum - (sum < 0)
add v2.8h, v2.8h, v6.8h // sum - (sum < 0)
srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4
srshr v2.8h, v2.8h, #4 // (8 + sum - (sum < 0)) >> 4
uaddw v1.8h, v1.8h, v0.8b // px + (8 + sum ...) >> 4
uaddw2 v2.8h, v2.8h, v0.16b // px + (8 + sum ...) >> 4
sqxtun v0.8b, v1.8h
sqxtun2 v0.16b, v2.8h
// Perform halving adds since the value won't fit otherwise.
// To handle the offset for negative values, use both halving w/ and w/o rounding.
srhadd v5.16b, v1.16b, v2.16b // sum >> 1
shadd v6.16b, v1.16b, v2.16b // (sum - 1) >> 1
sshr v1.16b, v5.16b, #7 // sum < 0
bsl v1.16b, v6.16b, v5.16b // (sum - (sum < 0)) >> 1
srshr v1.16b, v1.16b, #3 // (8 + sum - (sum < 0)) >> 4
usqadd v0.16b, v1.16b // px + (8 + sum ...) >> 4
.if \min
umin v0.16b, v0.16b, v4.16b
umax v0.16b, v0.16b, v3.16b // iclip(px + .., min, max)
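The per-pixel result the edged CDEF filter computes is unchanged: px + ((8 + sum - (sum < 0)) >> 4), clipped to the running min/max; only the arithmetic moved from a 16-bit accumulator to two 8-bit ones (|sum| can reach 15*2*6 + 4*4*3 = 180 + 48 = 228, hence the pair) combined with halving adds. A scalar sketch of the rounding, illustration only and not part of the patch (cdef_round is a hypothetical name):
#include <stdint.h>
/* The value the NEON code reconstructs with srhadd/shadd/srshr/usqadd:
 * round sum toward zero by 16, add to the centre pixel, clip to [lo, hi]. */
static uint8_t cdef_round(uint8_t px, int sum, uint8_t lo, uint8_t hi)
{
    int v = px + ((8 + sum - (sum < 0)) >> 4);
    if (v < lo) v = lo;
    if (v > hi) v = hi;
    return (uint8_t)v;
}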

95 third_party/dav1d/src/arm/64/cdef_tmpl.S (vendored)

@ -311,6 +311,30 @@ endconst
.endif
.endm
// Steps for loading and preparing each row
.macro dir_load_step1 s1, bpc
.if \bpc == 8
ld1 {\s1\().8b}, [x0], x1
.else
ld1 {\s1\().8h}, [x0], x1
.endif
.endm
.macro dir_load_step2 s1, bpc
.if \bpc == 8
usubl \s1\().8h, \s1\().8b, v31.8b
.else
ushl \s1\().8h, \s1\().8h, v8.8h
.endif
.endm
.macro dir_load_step3 s1, bpc
// Nothing for \bpc == 8
.if \bpc != 8
sub \s1\().8h, \s1\().8h, v31.8h
.endif
.endm
// int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride,
// unsigned *const var)
.macro find_dir bpc
@ -333,21 +357,15 @@ function cdef_find_dir_\bpc\()bpc_neon, export=1
movi v3.8h, #0 // v2-v3 sum_diag[1]
movi v5.8h, #0 // v4-v5 sum_hv[0-1]
movi v7.8h, #0 // v6-v7 sum_alt[0]
dir_load_step1 v26, \bpc // Setup first row early
movi v17.8h, #0 // v16-v17 sum_alt[1]
movi v18.8h, #0 // v18-v19 sum_alt[2]
dir_load_step2 v26, \bpc
movi v19.8h, #0
dir_load_step3 v26, \bpc
movi v21.8h, #0 // v20-v21 sum_alt[3]
.irpc i, 01234567
.if \bpc == 8
ld1 {v26.8b}, [x0], x1
usubl v26.8h, v26.8b, v31.8b
.else
ld1 {v26.8h}, [x0], x1
ushl v26.8h, v26.8h, v8.8h
sub v26.8h, v26.8h, v31.8h
.endif
addv h25, v26.8h // [y]
rev64 v27.8h, v26.8h
addp v28.8h, v26.8h, v30.8h // [(x >> 1)]
@ -355,30 +373,6 @@ function cdef_find_dir_\bpc\()bpc_neon, export=1
ext v27.16b, v27.16b, v27.16b, #8 // [-x]
rev64 v29.4h, v28.4h // [-(x >> 1)]
ins v4.h[\i], v25.h[0] // sum_hv[0]
.if \i == 0
mov v0.16b, v26.16b // sum_diag[0]
mov v2.16b, v27.16b // sum_diag[1]
mov v6.16b, v28.16b // sum_alt[0]
mov v16.16b, v29.16b // sum_alt[1]
.else
ext v22.16b, v30.16b, v26.16b, #(16-2*\i)
ext v23.16b, v26.16b, v30.16b, #(16-2*\i)
ext v24.16b, v30.16b, v27.16b, #(16-2*\i)
ext v25.16b, v27.16b, v30.16b, #(16-2*\i)
add v0.8h, v0.8h, v22.8h // sum_diag[0]
add v1.8h, v1.8h, v23.8h // sum_diag[0]
add v2.8h, v2.8h, v24.8h // sum_diag[1]
add v3.8h, v3.8h, v25.8h // sum_diag[1]
ext v22.16b, v30.16b, v28.16b, #(16-2*\i)
ext v23.16b, v28.16b, v30.16b, #(16-2*\i)
ext v24.16b, v30.16b, v29.16b, #(16-2*\i)
ext v25.16b, v29.16b, v30.16b, #(16-2*\i)
add v6.8h, v6.8h, v22.8h // sum_alt[0]
add v7.4h, v7.4h, v23.4h // sum_alt[0]
add v16.8h, v16.8h, v24.8h // sum_alt[1]
add v17.4h, v17.4h, v25.4h // sum_alt[1]
.endif
.if \i < 6
ext v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2)))
ext v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2)))
@ -397,6 +391,41 @@ function cdef_find_dir_\bpc\()bpc_neon, export=1
add v20.8h, v20.8h, v24.8h // sum_alt[3]
add v21.4h, v21.4h, v25.4h // sum_alt[3]
.endif
.if \i == 0
mov v0.16b, v26.16b // sum_diag[0]
dir_load_step1 v26, \bpc
mov v2.16b, v27.16b // sum_diag[1]
dir_load_step2 v26, \bpc
mov v6.16b, v28.16b // sum_alt[0]
dir_load_step3 v26, \bpc
mov v16.16b, v29.16b // sum_alt[1]
.else
ext v22.16b, v30.16b, v26.16b, #(16-2*\i)
ext v23.16b, v26.16b, v30.16b, #(16-2*\i)
ext v24.16b, v30.16b, v27.16b, #(16-2*\i)
ext v25.16b, v27.16b, v30.16b, #(16-2*\i)
.if \i != 7 // Nothing to load for the final row
dir_load_step1 v26, \bpc // Start setting up the next row early.
.endif
add v0.8h, v0.8h, v22.8h // sum_diag[0]
add v1.8h, v1.8h, v23.8h // sum_diag[0]
add v2.8h, v2.8h, v24.8h // sum_diag[1]
add v3.8h, v3.8h, v25.8h // sum_diag[1]
.if \i != 7
dir_load_step2 v26, \bpc
.endif
ext v22.16b, v30.16b, v28.16b, #(16-2*\i)
ext v23.16b, v28.16b, v30.16b, #(16-2*\i)
ext v24.16b, v30.16b, v29.16b, #(16-2*\i)
ext v25.16b, v29.16b, v30.16b, #(16-2*\i)
.if \i != 7
dir_load_step3 v26, \bpc
.endif
add v6.8h, v6.8h, v22.8h // sum_alt[0]
add v7.4h, v7.4h, v23.4h // sum_alt[0]
add v16.8h, v16.8h, v24.8h // sum_alt[1]
add v17.4h, v17.4h, v25.4h // sum_alt[1]
.endif
.endr
movi v31.4s, #105

18 third_party/dav1d/src/arm/64/ipred.S (vendored)

@ -502,9 +502,9 @@ L(ipred_dc_h4):
ld1 {v0.s}[0], [x2], #4
ins v0.s[1], wzr
uaddlv h0, v0.8b
add x2, x2, #1
br x3
L(ipred_dc_w4):
add x2, x2, #1
ld1 {v1.s}[0], [x2]
ins v1.s[1], wzr
add v0.4h, v0.4h, v16.4h
@ -534,9 +534,9 @@ L(ipred_dc_w4):
L(ipred_dc_h8):
ld1 {v0.8b}, [x2], #8
uaddlv h0, v0.8b
add x2, x2, #1
br x3
L(ipred_dc_w8):
add x2, x2, #1
ld1 {v1.8b}, [x2]
add v0.4h, v0.4h, v16.4h
uaddlv h1, v1.8b
@ -565,9 +565,9 @@ L(ipred_dc_w8):
L(ipred_dc_h16):
ld1 {v0.16b}, [x2], #16
uaddlv h0, v0.16b
add x2, x2, #1
br x3
L(ipred_dc_w16):
add x2, x2, #1
ld1 {v1.16b}, [x2]
add v0.4h, v0.4h, v16.4h
uaddlv h1, v1.16b
@ -597,10 +597,10 @@ L(ipred_dc_h32):
ld1 {v0.16b, v1.16b}, [x2], #32
uaddlv h0, v0.16b
uaddlv h1, v1.16b
add x2, x2, #1
add v0.4h, v0.4h, v1.4h
br x3
L(ipred_dc_w32):
add x2, x2, #1
ld1 {v1.16b, v2.16b}, [x2]
add v0.4h, v0.4h, v16.4h
uaddlv h1, v1.16b
@ -637,10 +637,10 @@ L(ipred_dc_h64):
uaddlv h3, v3.16b
add v0.4h, v0.4h, v1.4h
add v2.4h, v2.4h, v3.4h
add x2, x2, #1
add v0.4h, v0.4h, v2.4h
br x3
L(ipred_dc_w64):
add x2, x2, #1
ld1 {v1.16b, v2.16b, v3.16b, v4.16b}, [x2]
add v0.4h, v0.4h, v16.4h
uaddlv h1, v1.16b
@ -1834,10 +1834,10 @@ function ipred_cfl_8bpc_neon, export=1
L(ipred_cfl_h4):
ld1 {v0.s}[0], [x2], #4
ins v0.s[1], wzr
add x2, x2, #1
uaddlv h0, v0.8b
br x9
L(ipred_cfl_w4):
add x2, x2, #1
ld1 {v2.s}[0], [x2]
ins v2.s[1], wzr
add v0.4h, v0.4h, v16.4h
@ -1860,9 +1860,9 @@ L(ipred_cfl_w4):
L(ipred_cfl_h8):
ld1 {v0.8b}, [x2], #8
uaddlv h0, v0.8b
add x2, x2, #1
br x9
L(ipred_cfl_w8):
add x2, x2, #1
ld1 {v2.8b}, [x2]
add v0.4h, v0.4h, v16.4h
uaddlv h2, v2.8b
@ -1884,9 +1884,9 @@ L(ipred_cfl_w8):
L(ipred_cfl_h16):
ld1 {v0.16b}, [x2], #16
uaddlv h0, v0.16b
add x2, x2, #1
br x9
L(ipred_cfl_w16):
add x2, x2, #1
ld1 {v2.16b}, [x2]
add v0.4h, v0.4h, v16.4h
uaddlv h2, v2.16b
@ -1909,10 +1909,10 @@ L(ipred_cfl_h32):
ld1 {v2.16b, v3.16b}, [x2], #32
uaddlv h2, v2.16b
uaddlv h3, v3.16b
add x2, x2, #1
add v0.4h, v2.4h, v3.4h
br x9
L(ipred_cfl_w32):
add x2, x2, #1
ld1 {v2.16b, v3.16b}, [x2]
add v0.4h, v0.4h, v16.4h
uaddlv h2, v2.16b

21 third_party/dav1d/src/arm/64/ipred16.S (vendored)

@ -562,9 +562,9 @@ function ipred_dc_16bpc_neon, export=1
L(ipred_dc_h4):
ld1 {v0.4h}, [x2], #8
uaddlv s0, v0.4h
add x2, x2, #2
br x3
L(ipred_dc_w4):
add x2, x2, #2
ld1 {v1.4h}, [x2]
add v0.2s, v0.2s, v16.2s
uaddlv s1, v1.4h
@ -594,9 +594,9 @@ L(ipred_dc_w4):
L(ipred_dc_h8):
ld1 {v0.8h}, [x2], #16
uaddlv s0, v0.8h
add x2, x2, #2
br x3
L(ipred_dc_w8):
add x2, x2, #2
ld1 {v1.8h}, [x2]
add v0.2s, v0.2s, v16.2s
uaddlv s1, v1.8h
@ -626,10 +626,10 @@ L(ipred_dc_w8):
L(ipred_dc_h16):
ld1 {v0.8h, v1.8h}, [x2], #32
addp v0.8h, v0.8h, v1.8h
add x2, x2, #2
uaddlv s0, v0.8h
br x3
L(ipred_dc_w16):
add x2, x2, #2
ld1 {v1.8h, v2.8h}, [x2]
add v0.2s, v0.2s, v16.2s
addp v1.8h, v1.8h, v2.8h
@ -663,10 +663,10 @@ L(ipred_dc_h32):
addp v0.8h, v0.8h, v1.8h
addp v2.8h, v2.8h, v3.8h
addp v0.8h, v0.8h, v2.8h
add x2, x2, #2
uaddlv s0, v0.8h
br x3
L(ipred_dc_w32):
add x2, x2, #2
ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2]
add v0.2s, v0.2s, v16.2s
addp v1.8h, v1.8h, v2.8h
@ -709,10 +709,10 @@ L(ipred_dc_h64):
addp v0.8h, v0.8h, v2.8h
addp v4.8h, v4.8h, v6.8h
addp v0.8h, v0.8h, v4.8h
add x2, x2, #2
uaddlv s0, v0.8h
br x3
L(ipred_dc_w64):
add x2, x2, #2
ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64
add v0.2s, v0.2s, v16.2s
addp v1.8h, v1.8h, v2.8h
@ -1382,7 +1382,9 @@ function ipred_filter_\bpc\()bpc_neon
sxtl v21.8h, v21.8b
sxtl v22.8h, v22.8b
dup v31.8h, w8
.if \bpc == 10
movi v30.8h, #0
.endif
br x5
40:
ldur d0, [x2, #2] // top (0-3)
@ -1421,7 +1423,6 @@ function ipred_filter_\bpc\()bpc_neon
smin v2.8h, v2.8h, v31.8h
subs w4, w4, #2
st1 {v2.d}[0], [x0], x1
uxtl v0.8h, v2.8b
ext v0.16b, v2.16b, v2.16b, #8 // move top from [4-7] to [0-3]
st1 {v2.d}[1], [x6], x1
b.gt 4b
@ -2143,9 +2144,9 @@ function ipred_cfl_16bpc_neon, export=1
L(ipred_cfl_h4):
ld1 {v0.4h}, [x2], #8
uaddlv s0, v0.4h
add x2, x2, #2
br x9
L(ipred_cfl_w4):
add x2, x2, #2
ld1 {v2.4h}, [x2]
add v0.2s, v0.2s, v16.2s
uaddlv s2, v2.4h
@ -2168,9 +2169,9 @@ L(ipred_cfl_w4):
L(ipred_cfl_h8):
ld1 {v0.8h}, [x2], #16
uaddlv s0, v0.8h
add x2, x2, #2
br x9
L(ipred_cfl_w8):
add x2, x2, #2
ld1 {v2.8h}, [x2]
add v0.2s, v0.2s, v16.2s
uaddlv s2, v2.8h
@ -2193,10 +2194,10 @@ L(ipred_cfl_w8):
L(ipred_cfl_h16):
ld1 {v2.8h, v3.8h}, [x2], #32
addp v0.8h, v2.8h, v3.8h
add x2, x2, #2
uaddlv s0, v0.8h
br x9
L(ipred_cfl_w16):
add x2, x2, #2
ld1 {v2.8h, v3.8h}, [x2]
add v0.2s, v0.2s, v16.2s
addp v2.8h, v2.8h, v3.8h
@ -2222,10 +2223,10 @@ L(ipred_cfl_h32):
addp v2.8h, v2.8h, v3.8h
addp v4.8h, v4.8h, v5.8h
addp v0.8h, v2.8h, v4.8h
add x2, x2, #2
uaddlv s0, v0.8h
br x9
L(ipred_cfl_w32):
add x2, x2, #2
ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
add v0.4s, v0.4s, v16.4s
addp v2.8h, v2.8h, v3.8h

31
third_party/dav1d/src/arm/64/itx.S (vendored)

@ -718,7 +718,7 @@ def_fn_4x4 identity, flipadst
rshrn_sz \r7, v4, v5, #12, \sz // t7a
smull_smlal v2, v3, \r5, \r3, v0.h[7], v0.h[6], \sz // -> t6a
rshrn_sz \r3, v6, v7, #12, \sz // t5a
rshrn_sz \r5, v2, v3, #12, \sz // taa
rshrn_sz \r5, v2, v3, #12, \sz // t6a
sqadd v2\sz, \r1\sz, \r3\sz // t4
sqsub \r1\sz, \r1\sz, \r3\sz // t5a
@ -1085,7 +1085,7 @@ def_fns_48 8, 4
rshrn_sz v4, v4, v5, #12, \sz // t11
rshrn_sz v5, v6, v7, #12, \sz // t12
smull_smlal v6, v7, v25, v21, v0.h[0], v0.h[0], \sz // -> t10a
smull_smlal v6, v7, v25, v21, v0.h[0], v0.h[0], \sz // -> t13a
rshrn_sz v2, v2, v3, #12, \sz // t10a
rshrn_sz v3, v6, v7, #12, \sz // t13a
@ -3002,29 +3002,6 @@ function inv_txfm_add_vert_dct_8x64_neon
br x14
endfunc
.macro sub_sp space
#ifdef _WIN32
.if \space > 8192
// Here, we'd need to touch two (or more) pages while decrementing
// the stack pointer.
.error "sub_sp_align doesn't support values over 8K at the moment"
.elseif \space > 4096
sub x16, sp, #4096
ldr xzr, [x16]
sub sp, x16, #(\space - 4096)
.else
sub sp, sp, #\space
.endif
#else
.if \space >= 4096
sub sp, sp, #(\space)/4096*4096
.endif
.if (\space % 4096) != 0
sub sp, sp, #(\space)%4096
.endif
#endif
.endm
function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1
idct_dc 64, 64, 2
@ -3149,7 +3126,9 @@ function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1
mov w8, #(32 - \i)
cmp w3, w12
b.lt 1f
.if \i < 24
ldrh w12, [x13], #2
.endif
.endif
add x7, x2, #(\i*2)
mov x8, #32*2
@ -3254,7 +3233,9 @@ function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1
mov w8, #(32 - \i)
cmp w3, w12
b.lt 1f
.if \i < 24
ldrh w12, [x13], #2
.endif
.endif
add x7, x2, #(\i*2)
mov x8, #32*2

330
third_party/dav1d/src/arm/64/itx16.S (vendored)

@ -124,7 +124,7 @@ endconst
.endif
.endm
.macro load_add_store load, shift, addsrc, adddst, max, min, store, dst, src, shiftbits=4
.macro load_add_store load, shift, addsrc, adddst, min, store, dst, src, shiftbits=4
.ifnb \load
ld1 {\load}, [\src], x1
.endif
@ -132,10 +132,7 @@ endconst
srshr \shift, \shift, #\shiftbits
.endif
.ifnb \addsrc
sqadd \adddst, \adddst, \addsrc
.endif
.ifnb \max
smax \max, \max, v6.8h
usqadd \adddst, \addsrc
.endif
.ifnb \min
smin \min, \min, v7.8h
@ -146,63 +143,57 @@ endconst
.endm
.macro load_add_store_8x16 dst, src
mov \src, \dst
movi v6.8h, #0
mvni v7.8h, #0xfc, lsl #8 // 0x3ff
load_add_store v2.8h, v16.8h, , , , , , \dst, \src
load_add_store v3.8h, v17.8h, , , , , , \dst, \src
load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src
load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src
load_add_store v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src
load_add_store v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src
load_add_store v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src
load_add_store v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src
load_add_store v2.8h, v24.8h, v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src
load_add_store v3.8h, v25.8h, v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src
load_add_store v4.8h, v26.8h, v2.8h, v24.8h, v23.8h, v22.8h, v21.8h, \dst, \src
load_add_store v5.8h, v27.8h, v3.8h, v25.8h, v24.8h, v23.8h, v22.8h, \dst, \src
load_add_store v2.8h, v28.8h, v4.8h, v26.8h, v25.8h, v24.8h, v23.8h, \dst, \src
load_add_store v3.8h, v29.8h, v5.8h, v27.8h, v26.8h, v25.8h, v24.8h, \dst, \src
load_add_store v4.8h, v30.8h, v2.8h, v28.8h, v27.8h, v26.8h, v25.8h, \dst, \src
load_add_store v5.8h, v31.8h, v3.8h, v29.8h, v28.8h, v27.8h, v26.8h, \dst, \src
load_add_store , , v4.8h, v30.8h, v29.8h, v28.8h, v27.8h, \dst, \src
load_add_store , , v5.8h, v31.8h, v30.8h, v29.8h, v28.8h, \dst, \src
load_add_store , , , , v31.8h, v30.8h, v29.8h, \dst, \src
load_add_store , , , , , v31.8h, v30.8h, \dst, \src
load_add_store , , , , , , v31.8h, \dst, \src
load_add_store v2.8h, v16.8h, , , , , \dst, \src
load_add_store v3.8h, v17.8h, , , , , \dst, \src
load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src
load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src
load_add_store v16.8h, v20.8h, v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src
load_add_store v17.8h, v21.8h, v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src
load_add_store v18.8h, v22.8h, v20.8h, v16.8h, v5.8h, v4.8h, \dst, \src
load_add_store v19.8h, v23.8h, v21.8h, v17.8h, v16.8h, v5.8h, \dst, \src
load_add_store v20.8h, v24.8h, v22.8h, v18.8h, v17.8h, v16.8h, \dst, \src
load_add_store v21.8h, v25.8h, v23.8h, v19.8h, v18.8h, v17.8h, \dst, \src
load_add_store v22.8h, v26.8h, v24.8h, v20.8h, v19.8h, v18.8h, \dst, \src
load_add_store v23.8h, v27.8h, v25.8h, v21.8h, v20.8h, v19.8h, \dst, \src
load_add_store v24.8h, v28.8h, v26.8h, v22.8h, v21.8h, v20.8h, \dst, \src
load_add_store v25.8h, v29.8h, v27.8h, v23.8h, v22.8h, v21.8h, \dst, \src
load_add_store v26.8h, v30.8h, v28.8h, v24.8h, v23.8h, v22.8h, \dst, \src
load_add_store v27.8h, v31.8h, v29.8h, v25.8h, v24.8h, v23.8h, \dst, \src
load_add_store , , v30.8h, v26.8h, v25.8h, v24.8h, \dst, \src
load_add_store , , v31.8h, v27.8h, v26.8h, v25.8h, \dst, \src
load_add_store , , , , v27.8h, v26.8h, \dst, \src
load_add_store , , , , , v27.8h, \dst, \src
.endm
.macro load_add_store_8x8 dst, src, shiftbits=4
mov \src, \dst
movi v6.8h, #0
mvni v7.8h, #0xfc, lsl #8 // 0x3ff
load_add_store v2.8h, v16.8h, , , , , , \dst, \src, \shiftbits
load_add_store v3.8h, v17.8h, , , , , , \dst, \src, \shiftbits
load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src, \shiftbits
load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src, \shiftbits
load_add_store v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src, \shiftbits
load_add_store v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits
load_add_store v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
load_add_store v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src, \shiftbits
load_add_store , , v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src, \shiftbits
load_add_store , , v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src, \shiftbits
load_add_store , , , , v23.8h, v22.8h, v21.8h, \dst, \src, \shiftbits
load_add_store , , , , , v23.8h, v22.8h, \dst, \src, \shiftbits
load_add_store , , , , , , v23.8h, \dst, \src, \shiftbits
load_add_store v2.8h, v16.8h, , , , , \dst, \src, \shiftbits
load_add_store v3.8h, v17.8h, , , , , \dst, \src, \shiftbits
load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src, \shiftbits
load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src, \shiftbits
load_add_store v16.8h, v20.8h, v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src, \shiftbits
load_add_store v17.8h, v21.8h, v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src, \shiftbits
load_add_store v18.8h, v22.8h, v20.8h, v16.8h, v5.8h, v4.8h, \dst, \src, \shiftbits
load_add_store v19.8h, v23.8h, v21.8h, v17.8h, v16.8h, v5.8h, \dst, \src, \shiftbits
load_add_store , , v22.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits
load_add_store , , v23.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
load_add_store , , , , v19.8h, v18.8h, \dst, \src, \shiftbits
load_add_store , , , , , v19.8h, \dst, \src, \shiftbits
.endm
.macro load_add_store_8x4 dst, src, shiftbits=4
mov \src, \dst
movi v6.8h, #0
mvni v7.8h, #0xfc, lsl #8 // 0x3ff
load_add_store v2.8h, v16.8h, , , , , , \dst, \src, \shiftbits
load_add_store v3.8h, v17.8h, , , , , , \dst, \src, \shiftbits
load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src, \shiftbits
load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src, \shiftbits
load_add_store , , v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src, \shiftbits
load_add_store , , v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits
load_add_store , , , , v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
load_add_store , , , , , v19.8h, v18.8h, \dst, \src, \shiftbits
load_add_store , , , , , , v19.8h, \dst, \src, \shiftbits
load_add_store v2.8h, v16.8h, , , , , \dst, \src, \shiftbits
load_add_store v3.8h, v17.8h, , , , , \dst, \src, \shiftbits
load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src, \shiftbits
load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src, \shiftbits
load_add_store , , v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src, \shiftbits
load_add_store , , v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src, \shiftbits
load_add_store , , , , v5.8h, v4.8h, \dst, \src, \shiftbits
load_add_store , , , , , v5.8h, \dst, \src, \shiftbits
.endm
.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, max, min, store, dst, src
.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, min, store, dst, src
.ifnb \load
ld1 {\load}[0], [\src], x1
.endif
@ -216,14 +207,11 @@ endconst
ld1 {\load}[1], [\src], x1
.endif
.ifnb \addsrc
sqadd \adddst, \adddst, \addsrc
usqadd \adddst, \addsrc
.endif
.ifnb \store
st1 {\store}[0], [\dst], x1
.endif
.ifnb \max
smax \max, \max, v6.8h
.endif
.ifnb \min
smin \min, \min, v7.8h
.endif
@ -233,37 +221,33 @@ endconst
.endm
.macro load_add_store_4x16 dst, src
mov \src, \dst
movi v6.8h, #0
mvni v7.8h, #0xfc, lsl #8 // 0x3ff
load_add_store4 v0.d, v17, v16, , , , , , , \dst, \src
load_add_store4 v1.d, v19, v18, , , , , , , \dst, \src
load_add_store4 v2.d, v21, v20, v16.8h, , , , , , \dst, \src
load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h, , , , \dst, \src
load_add_store4 v0.d, v25, v24, v20.8h, v1.8h, v18.8h, v16.8h, , , \dst, \src
load_add_store4 v1.d, v27, v26, v22.8h, v2.8h, v20.8h, v18.8h, v16.8h, , \dst, \src
load_add_store4 v2.d, v29, v28, v24.8h, v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src
load_add_store4 v3.d, v31, v30, v26.8h, v0.8h, v24.8h, v22.8h, v20.8h, v18.d, \dst, \src
load_add_store4 , , , v28.8h, v1.8h, v26.8h, v24.8h, v22.8h, v20.d, \dst, \src
load_add_store4 , , , v30.8h, v2.8h, v28.8h, v26.8h, v24.8h, v22.d, \dst, \src
load_add_store4 , , , , v3.8h, v30.8h, v28.8h, v26.8h, v24.d, \dst, \src
load_add_store4 , , , , , , v30.8h, v28.8h, v26.d, \dst, \src
load_add_store4 , , , , , , , v30.8h, v28.d, \dst, \src
load_add_store4 , , , , , , , , v30.d, \dst, \src
load_add_store4 v0.d, v17, v16, , , , , , \dst, \src
load_add_store4 v1.d, v19, v18, , , , , , \dst, \src
load_add_store4 v2.d, v21, v20, v16.8h, , , , , \dst, \src
load_add_store4 v3.d, v23, v22, v18.8h, v16.8h, v0.8h, , , \dst, \src
load_add_store4 v17.d, v25, v24, v20.8h, v18.8h, v1.8h, v0.8h, , \dst, \src
load_add_store4 v19.d, v27, v26, v22.8h, v20.8h, v2.8h, v1.8h, v0.d, \dst, \src
load_add_store4 v21.d, v29, v28, v24.8h, v22.8h, v3.8h, v2.8h, v1.d, \dst, \src
load_add_store4 v23.d, v31, v30, v26.8h, v24.8h, v17.8h, v3.8h, v2.d, \dst, \src
load_add_store4 , , , v28.8h, v26.8h, v19.8h, v17.8h, v3.d, \dst, \src
load_add_store4 , , , v30.8h, v28.8h, v21.8h, v19.8h, v17.d, \dst, \src
load_add_store4 , , , , v30.8h, v23.8h, v21.8h, v19.d, \dst, \src
load_add_store4 , , , , , , v23.8h, v21.d, \dst, \src
load_add_store4 , , , , , , , v23.d, \dst, \src
.endm
.macro load_add_store_4x8 dst, src
mov \src, \dst
movi v6.8h, #0
mvni v7.8h, #0xfc, lsl #8 // 0x3ff
load_add_store4 v0.d, v17, v16, , , , , , , \dst, \src
load_add_store4 v1.d, v19, v18, , , , , , , \dst, \src
load_add_store4 v2.d, v21, v20, v16.8h, , , , , , \dst, \src
load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h, , , , \dst, \src
load_add_store4 , , , v20.8h, v1.8h, v18.8h, v16.8h, , , \dst, \src
load_add_store4 , , , v22.8h, v2.8h, v20.8h, v18.8h, v16.8h, , \dst, \src
load_add_store4 , , , , v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src
load_add_store4 , , , , , , v22.8h, v20.8h, v18.d, \dst, \src
load_add_store4 , , , , , , , v22.8h, v20.d, \dst, \src
load_add_store4 , , , , , , , , v22.d, \dst, \src
load_add_store4 v0.d, v17, v16, , , , , , \dst, \src
load_add_store4 v1.d, v19, v18, , , , , , \dst, \src
load_add_store4 v2.d, v21, v20, v16.8h, , , , , \dst, \src
load_add_store4 v3.d, v23, v22, v18.8h, v16.8h, v0.8h, , , \dst, \src
load_add_store4 , , , v20.8h, v18.8h, v1.8h, v0.8h, , \dst, \src
load_add_store4 , , , v22.8h, v20.8h, v2.8h, v1.8h, v0.d, \dst, \src
load_add_store4 , , , , v22.8h, v3.8h, v2.8h, v1.d, \dst, \src
load_add_store4 , , , , , , v3.8h, v2.d, \dst, \src
load_add_store4 , , , , , , , v3.d, \dst, \src
.endm
.macro idct_dc w, h, shift
@ -291,7 +275,6 @@ endconst
.endm
function idct_dc_w4_neon
movi v30.8h, #0
mvni v31.8h, #0xfc, lsl #8 // 0x3ff
1:
ld1 {v0.d}[0], [x0], x1
@ -299,11 +282,9 @@ function idct_dc_w4_neon
ld1 {v1.d}[0], [x0], x1
subs w4, w4, #4
ld1 {v1.d}[1], [x0], x1
sqadd v0.8h, v0.8h, v16.8h
usqadd v0.8h, v16.8h
sub x0, x0, x1, lsl #2
sqadd v1.8h, v1.8h, v16.8h
smax v0.8h, v0.8h, v30.8h
smax v1.8h, v1.8h, v30.8h
usqadd v1.8h, v16.8h
smin v0.8h, v0.8h, v31.8h
st1 {v0.d}[0], [x0], x1
smin v1.8h, v1.8h, v31.8h
@ -315,23 +296,18 @@ function idct_dc_w4_neon
endfunc
function idct_dc_w8_neon
movi v30.8h, #0
mvni v31.8h, #0xfc, lsl #8 // 0x3ff
1:
ld1 {v0.8h}, [x0], x1
subs w4, w4, #4
ld1 {v1.8h}, [x0], x1
sqadd v0.8h, v0.8h, v16.8h
usqadd v0.8h, v16.8h
ld1 {v2.8h}, [x0], x1
sqadd v1.8h, v1.8h, v16.8h
usqadd v1.8h, v16.8h
ld1 {v3.8h}, [x0], x1
sqadd v2.8h, v2.8h, v16.8h
sqadd v3.8h, v3.8h, v16.8h
usqadd v2.8h, v16.8h
usqadd v3.8h, v16.8h
sub x0, x0, x1, lsl #2
smax v0.8h, v0.8h, v30.8h
smax v1.8h, v1.8h, v30.8h
smax v2.8h, v2.8h, v30.8h
smax v3.8h, v3.8h, v30.8h
smin v0.8h, v0.8h, v31.8h
smin v1.8h, v1.8h, v31.8h
st1 {v0.8h}, [x0], x1
@ -345,21 +321,16 @@ function idct_dc_w8_neon
endfunc
function idct_dc_w16_neon
movi v30.8h, #0
mvni v31.8h, #0xfc, lsl #8 // 0x3ff
1:
ld1 {v0.8h, v1.8h}, [x0], x1
subs w4, w4, #2
ld1 {v2.8h, v3.8h}, [x0], x1
sqadd v0.8h, v0.8h, v16.8h
sqadd v1.8h, v1.8h, v16.8h
usqadd v0.8h, v16.8h
usqadd v1.8h, v16.8h
sub x0, x0, x1, lsl #1
sqadd v2.8h, v2.8h, v16.8h
sqadd v3.8h, v3.8h, v16.8h
smax v0.8h, v0.8h, v30.8h
smax v1.8h, v1.8h, v30.8h
smax v2.8h, v2.8h, v30.8h
smax v3.8h, v3.8h, v30.8h
usqadd v2.8h, v16.8h
usqadd v3.8h, v16.8h
smin v0.8h, v0.8h, v31.8h
smin v1.8h, v1.8h, v31.8h
smin v2.8h, v2.8h, v31.8h
@ -371,19 +342,14 @@ function idct_dc_w16_neon
endfunc
function idct_dc_w32_neon
movi v30.8h, #0
mvni v31.8h, #0xfc, lsl #8 // 0x3ff
1:
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
subs w4, w4, #1
sqadd v0.8h, v0.8h, v16.8h
sqadd v1.8h, v1.8h, v16.8h
sqadd v2.8h, v2.8h, v16.8h
sqadd v3.8h, v3.8h, v16.8h
smax v0.8h, v0.8h, v30.8h
smax v1.8h, v1.8h, v30.8h
smax v2.8h, v2.8h, v30.8h
smax v3.8h, v3.8h, v30.8h
usqadd v0.8h, v16.8h
usqadd v1.8h, v16.8h
usqadd v2.8h, v16.8h
usqadd v3.8h, v16.8h
smin v0.8h, v0.8h, v31.8h
smin v1.8h, v1.8h, v31.8h
smin v2.8h, v2.8h, v31.8h
@ -394,30 +360,21 @@ function idct_dc_w32_neon
endfunc
function idct_dc_w64_neon
movi v30.8h, #0
mvni v31.8h, #0xfc, lsl #8 // 0x3ff
sub x1, x1, #64
1:
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
subs w4, w4, #1
sqadd v0.8h, v0.8h, v16.8h
usqadd v0.8h, v16.8h
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0]
sqadd v1.8h, v1.8h, v16.8h
usqadd v1.8h, v16.8h
sub x0, x0, #64
sqadd v2.8h, v2.8h, v16.8h
sqadd v3.8h, v3.8h, v16.8h
sqadd v4.8h, v4.8h, v16.8h
sqadd v5.8h, v5.8h, v16.8h
sqadd v6.8h, v6.8h, v16.8h
sqadd v7.8h, v7.8h, v16.8h
smax v0.8h, v0.8h, v30.8h
smax v1.8h, v1.8h, v30.8h
smax v2.8h, v2.8h, v30.8h
smax v3.8h, v3.8h, v30.8h
smax v4.8h, v4.8h, v30.8h
smax v5.8h, v5.8h, v30.8h
smax v6.8h, v6.8h, v30.8h
smax v7.8h, v7.8h, v30.8h
usqadd v2.8h, v16.8h
usqadd v3.8h, v16.8h
usqadd v4.8h, v16.8h
usqadd v5.8h, v16.8h
usqadd v6.8h, v16.8h
usqadd v7.8h, v16.8h
smin v0.8h, v0.8h, v31.8h
smin v1.8h, v1.8h, v31.8h
smin v2.8h, v2.8h, v31.8h
@ -445,12 +402,12 @@ endfunc
.macro idct_4 r0, r1, r2, r3
mul_mla v6, \r1, \r3, v0.s[3], v0.s[2]
mul_mls v4, \r1, \r3, v0.s[2], v0.s[3]
mul_mla v2, \r0, \r2, v0.s[0], v0.s[0]
mul_mls v4, \r1, \r3, v0.s[2], v0.s[3]
mul_mls v3, \r0, \r2, v0.s[0], v0.s[0]
srshr v6.4s, v6.4s, #12
srshr v7.4s, v4.4s, #12
srshr v2.4s, v2.4s, #12
srshr v7.4s, v4.4s, #12
srshr v3.4s, v3.4s, #12
sqadd \r0\().4s, v2.4s, v6.4s
sqsub \r3\().4s, v2.4s, v6.4s
@ -575,16 +532,14 @@ function inv_txfm_add_4x4_neon
L(itx_4x4_end):
mvni v31.8h, #0xfc, lsl #8 // 0x3ff
sub x0, x0, x1, lsl #2
sqadd v16.8h, v16.8h, v0.8h
sqadd v18.8h, v18.8h, v1.8h
smax v16.8h, v16.8h, v30.8h
smax v18.8h, v18.8h, v30.8h
smin v16.8h, v16.8h, v31.8h
st1 {v16.d}[0], [x0], x1
smin v18.8h, v18.8h, v31.8h
st1 {v16.d}[1], [x0], x1
st1 {v18.d}[0], [x0], x1
st1 {v18.d}[1], [x0], x1
usqadd v0.8h, v16.8h
usqadd v1.8h, v18.8h
smin v0.8h, v0.8h, v31.8h
st1 {v0.d}[0], [x0], x1
smin v1.8h, v1.8h, v31.8h
st1 {v0.d}[1], [x0], x1
st1 {v1.d}[0], [x0], x1
st1 {v1.d}[1], [x0], x1
br x15
endfunc
@ -647,7 +602,7 @@ def_fn_4x4 identity, flipadst
srshr \r1\().4s, v2.4s, #12 // t4a
srshr \r7\().4s, v4.4s, #12 // t7a
srshr \r3\().4s, v6.4s, #12 // t5a
srshr \r5\().4s, v7.4s, #12 // taa
srshr \r5\().4s, v7.4s, #12 // t6a
sqadd v2.4s, \r1\().4s, \r3\().4s // t4
sqsub \r1\().4s, \r1\().4s, \r3\().4s // t5a
@ -1052,7 +1007,7 @@ function inv_dct_4s_x16_neon
srshr v4.4s, v4.4s, #12 // t11
srshr v5.4s, v6.4s, #12 // t12
mul_mla v6, v25, v21, v0.s[0], v0.s[0] // -> t10a
mul_mla v6, v25, v21, v0.s[0], v0.s[0] // -> t13a
srshr v2.4s, v2.4s, #12 // t10a
srshr v3.4s, v6.4s, #12 // t13a
@ -1488,10 +1443,10 @@ function inv_txfm_add_4x16_neon
st1 {v2.4s}, [x6], x11
.endr
blr x4
rshrn v28.4h, v16.4s, #1
rshrn v29.4h, v17.4s, #1
rshrn v30.4h, v18.4s, #1
rshrn v31.4h, v19.4s, #1
sqrshrn v28.4h, v16.4s, #1
sqrshrn v29.4h, v17.4s, #1
sqrshrn v30.4h, v18.4s, #1
sqrshrn v31.4h, v19.4s, #1
transpose_4x4h v28, v29, v30, v31, v4, v5, v6, v7
b 2f
@ -1511,10 +1466,10 @@ function inv_txfm_add_4x16_neon
st1 {v2.4s}, [x6], x11
.endr
blr x4
rshrn v24.4h, v16.4s, #1
rshrn v25.4h, v17.4s, #1
rshrn v26.4h, v18.4s, #1
rshrn v27.4h, v19.4s, #1
sqrshrn v24.4h, v16.4s, #1
sqrshrn v25.4h, v17.4s, #1
sqrshrn v26.4h, v18.4s, #1
sqrshrn v27.4h, v19.4s, #1
transpose_4x4h v24, v25, v26, v27, v4, v5, v6, v7
b 2f
@ -1533,10 +1488,10 @@ function inv_txfm_add_4x16_neon
st1 {v2.4s}, [x6], x11
.endr
blr x4
rshrn v20.4h, v16.4s, #1
rshrn v21.4h, v17.4s, #1
rshrn v22.4h, v18.4s, #1
rshrn v23.4h, v19.4s, #1
sqrshrn v20.4h, v16.4s, #1
sqrshrn v21.4h, v17.4s, #1
sqrshrn v22.4h, v18.4s, #1
sqrshrn v23.4h, v19.4s, #1
transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7
b 2f
@ -1552,10 +1507,10 @@ function inv_txfm_add_4x16_neon
st1 {v2.4s}, [x2], x11
.endr
blr x4
rshrn v16.4h, v16.4s, #1
rshrn v17.4h, v17.4s, #1
rshrn v18.4h, v18.4s, #1
rshrn v19.4h, v19.4s, #1
sqrshrn v16.4h, v16.4s, #1
sqrshrn v17.4h, v17.4s, #1
sqrshrn v18.4h, v18.4s, #1
sqrshrn v19.4h, v19.4s, #1
transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
blr x5
@ -2219,7 +2174,6 @@ function inv_txfm_add_vert_dct_8x32_neon
neg x9, x8
mov x10, x6
movi v0.8h, #0
mvni v1.8h, #0xfc, lsl #8 // 0x3ff
.macro combine r0, r1, r2, r3, op, stride
ld1 {v5.8h}, [x7], \stride
@ -2231,27 +2185,23 @@ function inv_txfm_add_vert_dct_8x32_neon
ld1 {v4.8h}, [x10], x1
srshr v5.8h, v5.8h, #4
\op v6.8h, v6.8h, \r1
sqadd v5.8h, v5.8h, v2.8h
usqadd v2.8h, v5.8h
srshr v6.8h, v6.8h, #4
\op v7.8h, v7.8h, \r2
smax v2.8h, v5.8h, v0.8h
ld1 {v5.8h}, [x7], \stride
sqadd v6.8h, v6.8h, v3.8h
usqadd v3.8h, v6.8h
smin v2.8h, v2.8h, v1.8h
srshr v7.8h, v7.8h, #4
\op v5.8h, v5.8h, \r3
st1 {v2.8h}, [x6], x1
ld1 {v2.8h}, [x10], x1
smax v3.8h, v6.8h, v0.8h
sqadd v7.8h, v7.8h, v4.8h
usqadd v4.8h, v7.8h
smin v3.8h, v3.8h, v1.8h
srshr v5.8h, v5.8h, #4
st1 {v3.8h}, [x6], x1
smax v4.8h, v7.8h, v0.8h
sqadd v5.8h, v5.8h, v2.8h
usqadd v2.8h, v5.8h
smin v4.8h, v4.8h, v1.8h
st1 {v4.8h}, [x6], x1
smax v2.8h, v5.8h, v0.8h
smin v2.8h, v2.8h, v1.8h
st1 {v2.8h}, [x6], x1
.endm
@ -2652,7 +2602,9 @@ function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1
mov w8, #(16 - \i)
cmp w3, w12
b.lt 1f
.if \i < 12
ldrh w12, [x13], #2
.endif
.endif
mov x8, #4*16
bl inv_txfm_horz_scale_dct_32x4_neon
@ -3195,7 +3147,6 @@ function inv_txfm_add_vert_dct_8x64_neon
ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11
movi v6.8h, #0
mvni v7.8h, #0xfc, lsl #8 // 0x3ff
.macro add_dest_addsub src0, src1, src2, src3
ld1 {v0.8h}, [x6], x1
@ -3211,18 +3162,14 @@ function inv_txfm_add_vert_dct_8x64_neon
srshr v4.8h, v4.8h, #4
srshr v5.8h, v5.8h, #4
srshr \src0, \src0, #4
sqadd v0.8h, v0.8h, v4.8h
usqadd v0.8h, v4.8h
srshr \src2, \src2, #4
sqadd v1.8h, v1.8h, \src0
sqadd v2.8h, v2.8h, v5.8h
smax v0.8h, v0.8h, v6.8h
sqadd v3.8h, v3.8h, \src2
smax v1.8h, v1.8h, v6.8h
usqadd v1.8h, \src0
usqadd v2.8h, v5.8h
smin v0.8h, v0.8h, v7.8h
smax v2.8h, v2.8h, v6.8h
usqadd v3.8h, \src2
smin v1.8h, v1.8h, v7.8h
st1 {v0.8h}, [x6], x1
smax v3.8h, v3.8h, v6.8h
smin v2.8h, v2.8h, v7.8h
st1 {v1.8h}, [x9], x10
smin v3.8h, v3.8h, v7.8h
@ -3240,29 +3187,6 @@ function inv_txfm_add_vert_dct_8x64_neon
br x14
endfunc
.macro sub_sp space
#ifdef _WIN32
.if \space > 8192
// Here, we'd need to touch two (or more) pages while decrementing
// the stack pointer.
.error "sub_sp_align doesn't support values over 8K at the moment"
.elseif \space > 4096
sub x16, sp, #4096
ldr xzr, [x16]
sub sp, x16, #(\space - 4096)
.else
sub sp, sp, #\space
.endif
#else
.if \space >= 4096
sub sp, sp, #(\space)/4096*4096
.endif
.if (\space % 4096) != 0
sub sp, sp, #(\space)%4096
.endif
#endif
.endm
function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1
idct_dc 64, 64, 2
@ -3492,7 +3416,9 @@ function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1
mov w8, #(32 - \i)
cmp w3, w12
b.lt 1f
.if \i < 28
ldrh w12, [x13], #2
.endif
.endif
add x7, x2, #(\i*4)
mov x8, #32*4

49
third_party/dav1d/src/arm/64/loopfilter.S (vendored)

@ -132,12 +132,11 @@ function lpf_16_wd\wd\()_neon
.endif
b.eq 1f // skip wd == 4 case
.endif
usubl v2.8h, v22.8b, v25.8b // p1 - q1
usubl2 v3.8h, v22.16b, v25.16b
movi v3.16b, #128
eor v2.16b, v22.16b, v3.16b // p1 - 128
eor v3.16b, v25.16b, v3.16b // q1 - 128
cmhi v0.16b, v0.16b, v12.16b // hev
sqxtn v2.8b, v2.8h // iclip_diff(p1 - q1)
sqxtn2 v2.16b, v3.8h
sqsub v2.16b, v2.16b, v3.16b // iclip_diff(p1 - q1)
and v4.16b, v2.16b, v0.16b // if (hev) iclip_diff(p1 - q1)
bic v0.16b, v1.16b, v0.16b // (fm && wd >= 4 && !hev)
usubl v2.8h, v24.8b, v23.8b
@ -155,35 +154,23 @@ function lpf_16_wd\wd\()_neon
sqadd v5.16b, v7.16b, v2.16b // imin(f + 3, 127)
sshr v4.16b, v4.16b, #3 // f1
sshr v5.16b, v5.16b, #3 // f2
uxtl v2.8h, v23.8b // p0
uxtl2 v3.8h, v23.16b
uxtl v6.8h, v24.8b // q0
uxtl2 v7.8h, v24.16b
saddw v2.8h, v2.8h, v5.8b
saddw2 v3.8h, v3.8h, v5.16b
ssubw v6.8h, v6.8h, v4.8b
ssubw2 v7.8h, v7.8h, v4.16b
mov v2.16b, v23.16b // p0
mov v3.16b, v24.16b // q0
neg v6.16b, v4.16b // -f1
srshr v4.16b, v4.16b, #1 // (f1 + 1) >> 1
sqxtun v2.8b, v2.8h // out p0
sqxtun2 v2.16b, v3.8h
sqxtun v6.8b, v6.8h // out q0
sqxtun2 v6.16b, v7.8h
// p0 + f2, q0 - f1
usqadd v2.16b, v5.16b // out p0
usqadd v3.16b, v6.16b // out q0
neg v6.16b, v4.16b // -((f1 + 1) >> 1)
bit v23.16b, v2.16b, v1.16b // if (fm && wd >= 4)
uxtl v2.8h, v22.8b // p1
uxtl2 v3.8h, v22.16b
bit v24.16b, v6.16b, v1.16b // if (fm && wd >= 4)
uxtl v6.8h, v25.8b // q1
uxtl2 v7.8h, v25.16b
saddw v2.8h, v2.8h, v4.8b
saddw2 v3.8h, v3.8h, v4.16b
ssubw v6.8h, v6.8h, v4.8b
ssubw2 v7.8h, v7.8h, v4.16b
sqxtun v2.8b, v2.8h // out p1
sqxtun2 v2.16b, v3.8h
sqxtun v6.8b, v6.8h // out q1
sqxtun2 v6.16b, v7.8h
bit v24.16b, v3.16b, v1.16b // if (fm && wd >= 4)
mov v2.16b, v22.16b // p1
mov v3.16b, v25.16b // q1
// p1 + ((f1 + 1) >> 1), q1 - ((f1 + 1) >> 1)
usqadd v2.16b, v4.16b // out p1
usqadd v3.16b, v6.16b // out q1
bit v22.16b, v2.16b, v0.16b // if (fm && wd >= 4 && !hev)
bit v25.16b, v6.16b, v0.16b // if (fm && wd >= 4 && !hev)
bit v25.16b, v3.16b, v0.16b // if (fm && wd >= 4 && !hev)
1:
.if \wd == 6

3
third_party/dav1d/src/arm/64/loopfilter16.S (vendored)

@ -150,10 +150,9 @@ function lpf_8_wd\wd\()_neon
movi v6.8h, #4
add v2.8h, v2.8h, v4.8h
smin v2.8h, v2.8h, v3.8h // f = iclip_diff()
movi v7.8h, #3
smax v2.8h, v2.8h, v9.8h // f = iclip_diff()
sqadd v4.8h, v6.8h, v2.8h // f + 4
sqadd v5.8h, v7.8h, v2.8h // f + 3
sqadd v5.8h, v5.8h, v2.8h // f + 3
smin v4.8h, v4.8h, v3.8h // imin(f + 4, 128 << bitdepth_min_8 - 1)
smin v5.8h, v5.8h, v3.8h // imin(f + 3, 128 << bitdepth_min_8 - 1)
sshr v4.8h, v4.8h, #3 // f1

1750
third_party/dav1d/src/arm/64/looprestoration.S (vendored)

Diff not shown because of its large size.

1870
third_party/dav1d/src/arm/64/looprestoration16.S (vendored)

Diff not shown because of its large size.

151
third_party/dav1d/src/arm/64/mc.S (vendored)

@ -2180,16 +2180,7 @@ L(\type\()_8tap_filter_4):
lsl \d_strd, \d_strd, #1
lsl \s_strd, \s_strd, #1
ld1 {v28.8b, v29.8b}, [\src], \s_strd
uxtl v28.8h, v28.8b
uxtl v29.8h, v29.8b
mul v24.8h, v28.8h, v0.h[0]
.irpc i, 1234567
ext v26.16b, v28.16b, v29.16b, #(2*\i)
mla v24.8h, v26.8h, v0.h[\i]
.endr
srshr v16.8h, v24.8h, #2
bl L(\type\()_8tap_filter_8_first)
bl L(\type\()_8tap_filter_8)
mov v17.16b, v24.16b
mov v18.16b, v25.16b
@ -2267,16 +2258,7 @@ L(\type\()_8tap_filter_4):
lsl \d_strd, \d_strd, #1
lsl \s_strd, \s_strd, #1
ld1 {v28.8b, v29.8b}, [\src], \s_strd
uxtl v28.8h, v28.8b
uxtl v29.8h, v29.8b
mul v24.8h, v28.8h, v0.h[0]
.irpc i, 1234567
ext v26.16b, v28.16b, v29.16b, #(2*\i)
mla v24.8h, v26.8h, v0.h[\i]
.endr
srshr v16.8h, v24.8h, #2
bl L(\type\()_8tap_filter_8_first)
bl L(\type\()_8tap_filter_8)
mov v17.16b, v24.16b
mov v18.16b, v25.16b
@ -2363,6 +2345,28 @@ L(\type\()_8tap_filter_4):
0:
br x15
L(\type\()_8tap_filter_8_first):
ld1 {v28.8b, v29.8b}, [\src], \s_strd
uxtl v28.8h, v28.8b
uxtl v29.8h, v29.8b
mul v16.8h, v28.8h, v0.h[0]
ext v24.16b, v28.16b, v29.16b, #(2*1)
ext v25.16b, v28.16b, v29.16b, #(2*2)
ext v26.16b, v28.16b, v29.16b, #(2*3)
ext v27.16b, v28.16b, v29.16b, #(2*4)
mla v16.8h, v24.8h, v0.h[1]
mla v16.8h, v25.8h, v0.h[2]
mla v16.8h, v26.8h, v0.h[3]
mla v16.8h, v27.8h, v0.h[4]
ext v24.16b, v28.16b, v29.16b, #(2*5)
ext v25.16b, v28.16b, v29.16b, #(2*6)
ext v26.16b, v28.16b, v29.16b, #(2*7)
mla v16.8h, v24.8h, v0.h[5]
mla v16.8h, v25.8h, v0.h[6]
mla v16.8h, v26.8h, v0.h[7]
srshr v16.8h, v16.8h, #2
ret
L(\type\()_8tap_filter_8):
ld1 {v28.8b, v29.8b}, [\sr2], \s_strd
ld1 {v30.8b, v31.8b}, [\src], \s_strd
@ -2916,8 +2920,8 @@ filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6
.macro load_filter_row dst, src, inc
asr w13, \src, #10
ldr \dst, [x11, w13, sxtw #3]
add \src, \src, \inc
ldr \dst, [x11, w13, sxtw #3]
.endm
function warp_filter_horz_neon
@ -2926,58 +2930,45 @@ function warp_filter_horz_neon
ld1 {v16.8b, v17.8b}, [x2], x3
load_filter_row d0, w12, w7
uxtl v16.8h, v16.8b
load_filter_row d1, w12, w7
uxtl v17.8h, v17.8b
load_filter_row d2, w12, w7
sxtl v0.8h, v0.8b
load_filter_row d3, w12, w7
sxtl v1.8h, v1.8b
load_filter_row d4, w12, w7
sxtl v2.8h, v2.8b
load_filter_row d5, w12, w7
sxtl v3.8h, v3.8b
load_filter_row d6, w12, w7
sxtl v4.8h, v4.8b
// subtract by 128 to allow using smull
eor v16.8b, v16.8b, v22.8b
eor v17.8b, v17.8b, v22.8b
load_filter_row d7, w12, w7
sxtl v5.8h, v5.8b
ext v18.16b, v16.16b, v17.16b, #2*1
mul v23.8h, v16.8h, v0.8h
sxtl v6.8h, v6.8b
ext v19.16b, v16.16b, v17.16b, #2*2
mul v18.8h, v18.8h, v1.8h
sxtl v7.8h, v7.8b
ext v20.16b, v16.16b, v17.16b, #2*3
mul v19.8h, v19.8h, v2.8h
ext v21.16b, v16.16b, v17.16b, #2*4
saddlp v23.4s, v23.8h
mul v20.8h, v20.8h, v3.8h
ext v22.16b, v16.16b, v17.16b, #2*5
saddlp v18.4s, v18.8h
mul v21.8h, v21.8h, v4.8h
saddlp v19.4s, v19.8h
mul v22.8h, v22.8h, v5.8h
saddlp v20.4s, v20.8h
saddlp v21.4s, v21.8h
saddlp v22.4s, v22.8h
addp v18.4s, v23.4s, v18.4s
ext v23.16b, v16.16b, v17.16b, #2*6
addp v19.4s, v19.4s, v20.4s
mul v23.8h, v23.8h, v6.8h
ext v20.16b, v16.16b, v17.16b, #2*7
mul v20.8h, v20.8h, v7.8h
saddlp v23.4s, v23.8h
addp v21.4s, v21.4s, v22.4s
saddlp v20.4s, v20.8h
addp v20.4s, v23.4s, v20.4s
addp v18.4s, v18.4s, v19.4s
addp v20.4s, v21.4s, v20.4s
ext v18.8b, v16.8b, v17.8b, #1
ext v19.8b, v16.8b, v17.8b, #2
smull v0.8h, v0.8b, v16.8b
smull v1.8h, v1.8b, v18.8b
ext v18.8b, v16.8b, v17.8b, #3
ext v20.8b, v16.8b, v17.8b, #4
smull v2.8h, v2.8b, v19.8b
smull v3.8h, v3.8b, v18.8b
ext v18.8b, v16.8b, v17.8b, #5
ext v19.8b, v16.8b, v17.8b, #6
smull v4.8h, v4.8b, v20.8b
smull v5.8h, v5.8b, v18.8b
ext v18.8b, v16.8b, v17.8b, #7
smull v6.8h, v6.8b, v19.8b
smull v7.8h, v7.8b, v18.8b
addp v0.8h, v0.8h, v1.8h
addp v2.8h, v2.8h, v3.8h
addp v4.8h, v4.8h, v5.8h
addp v6.8h, v6.8h, v7.8h
addp v0.8h, v0.8h, v2.8h
addp v4.8h, v4.8h, v6.8h
addp v0.8h, v0.8h, v4.8h
add w5, w5, w8
rshrn v16.4h, v18.4s, #3
rshrn2 v16.8h, v20.4s, #3
ret
endfunc
@ -3002,25 +2993,32 @@ function warp_affine_8x8\t\()_8bpc_neon, export=1
lsl x1, x1, #1
.endif
movi v22.8b, #128
.ifb \t
movi v23.8h, #128
.else
movi v23.8h, #8, lsl #8
.endif
bl warp_filter_horz_neon
mov v24.16b, v16.16b
srshr v24.8h, v0.8h, #3
bl warp_filter_horz_neon
mov v25.16b, v16.16b
srshr v25.8h, v0.8h, #3
bl warp_filter_horz_neon
mov v26.16b, v16.16b
srshr v26.8h, v0.8h, #3
bl warp_filter_horz_neon
mov v27.16b, v16.16b
srshr v27.8h, v0.8h, #3
bl warp_filter_horz_neon
mov v28.16b, v16.16b
srshr v28.8h, v0.8h, #3
bl warp_filter_horz_neon
mov v29.16b, v16.16b
srshr v29.8h, v0.8h, #3
bl warp_filter_horz_neon
mov v30.16b, v16.16b
srshr v30.8h, v0.8h, #3
1:
add w14, w6, #512
bl warp_filter_horz_neon
mov v31.16b, v16.16b
srshr v31.8h, v0.8h, #3
load_filter_row d0, w14, w9
load_filter_row d1, w14, w9
@ -3030,15 +3028,7 @@ function warp_affine_8x8\t\()_8bpc_neon, export=1
load_filter_row d5, w14, w9
load_filter_row d6, w14, w9
load_filter_row d7, w14, w9
transpose_8x8b v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
sxtl v0.8h, v0.8b
sxtl v1.8h, v1.8b
sxtl v2.8h, v2.8b
sxtl v3.8h, v3.8b
sxtl v4.8h, v4.8b
sxtl v5.8h, v5.8b
sxtl v6.8h, v6.8b
sxtl v7.8h, v7.8b
transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl
// This ordering of smull/smlal/smull2/smlal2 is highly
// beneficial for Cortex A53 here.
@ -3066,6 +3056,7 @@ function warp_affine_8x8\t\()_8bpc_neon, export=1
sqrshrn2 v16.8h, v17.4s, #\shift
mov v27.16b, v28.16b
mov v28.16b, v29.16b
add v16.8h, v16.8h, v23.8h
.ifb \t
sqxtun v16.8b, v16.8h
.endif

12
third_party/dav1d/src/arm/64/mc16.S (vendored)

@ -3188,8 +3188,8 @@ filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10
.macro load_filter_row dst, src, inc
asr w13, \src, #10
ldr \dst, [x11, w13, sxtw #3]
add \src, \src, \inc
ldr \dst, [x11, w13, sxtw #3]
.endm
function warp_filter_horz_neon
@ -3343,15 +3343,7 @@ function warp_affine_8x8\t\()_16bpc_neon, export=1
load_filter_row d5, w14, w9
load_filter_row d6, w14, w9
load_filter_row d7, w14, w9
transpose_8x8b v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
sxtl v0.8h, v0.8b
sxtl v1.8h, v1.8b
sxtl v2.8h, v2.8b
sxtl v3.8h, v3.8b
sxtl v4.8h, v4.8b
sxtl v5.8h, v5.8b
sxtl v6.8h, v6.8b
sxtl v7.8h, v7.8b
transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl
// This ordering of smull/smlal/smull2/smlal2 is highly
// beneficial for Cortex A53 here.

82
third_party/dav1d/src/arm/64/util.S (vendored)

@ -59,33 +59,65 @@
#endif
.endm
.macro transpose_8x8b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
trn1 \t8\().8b, \r0\().8b, \r1\().8b
trn2 \t9\().8b, \r0\().8b, \r1\().8b
trn1 \r1\().8b, \r2\().8b, \r3\().8b
trn2 \r3\().8b, \r2\().8b, \r3\().8b
trn1 \r0\().8b, \r4\().8b, \r5\().8b
trn2 \r5\().8b, \r4\().8b, \r5\().8b
trn1 \r2\().8b, \r6\().8b, \r7\().8b
trn2 \r7\().8b, \r6\().8b, \r7\().8b
.macro sub_sp space
#ifdef _WIN32
.if \space > 8192
// Here, we'd need to touch two (or more) pages while decrementing
// the stack pointer.
.error "sub_sp_align doesn't support values over 8K at the moment"
.elseif \space > 4096
sub x16, sp, #4096
ldr xzr, [x16]
sub sp, x16, #(\space - 4096)
.else
sub sp, sp, #\space
.endif
#else
.if \space >= 4096
sub sp, sp, #(\space)/4096*4096
.endif
.if (\space % 4096) != 0
sub sp, sp, #(\space)%4096
.endif
#endif
.endm
trn1 \r4\().4h, \r0\().4h, \r2\().4h
trn2 \r2\().4h, \r0\().4h, \r2\().4h
trn1 \r6\().4h, \r5\().4h, \r7\().4h
trn2 \r7\().4h, \r5\().4h, \r7\().4h
trn1 \r5\().4h, \t9\().4h, \r3\().4h
trn2 \t9\().4h, \t9\().4h, \r3\().4h
trn1 \r3\().4h, \t8\().4h, \r1\().4h
trn2 \t8\().4h, \t8\().4h, \r1\().4h
.macro transpose_8x8b_xtl r0, r1, r2, r3, r4, r5, r6, r7, xtl
// a0 b0 a1 b1 a2 b2 a3 b3 a4 b4 a5 b5 a6 b6 a7 b7
zip1 \r0\().16b, \r0\().16b, \r1\().16b
// c0 d0 c1 d1 c2 d2 d3 d3 c4 d4 c5 d5 c6 d6 d7 d7
zip1 \r2\().16b, \r2\().16b, \r3\().16b
// e0 f0 e1 f1 e2 f2 e3 f3 e4 f4 e5 f5 e6 f6 e7 f7
zip1 \r4\().16b, \r4\().16b, \r5\().16b
// g0 h0 g1 h1 g2 h2 h3 h3 g4 h4 g5 h5 g6 h6 h7 h7
zip1 \r6\().16b, \r6\().16b, \r7\().16b
trn1 \r0\().2s, \r3\().2s, \r4\().2s
trn2 \r4\().2s, \r3\().2s, \r4\().2s
trn1 \r1\().2s, \r5\().2s, \r6\().2s
trn2 \r5\().2s, \r5\().2s, \r6\().2s
trn2 \r6\().2s, \t8\().2s, \r2\().2s
trn1 \r2\().2s, \t8\().2s, \r2\().2s
trn1 \r3\().2s, \t9\().2s, \r7\().2s
trn2 \r7\().2s, \t9\().2s, \r7\().2s
// a0 b0 c0 d0 a2 b2 c2 d2 a4 b4 c4 d4 a6 b6 c6 d6
trn1 \r1\().8h, \r0\().8h, \r2\().8h
// a1 b1 c1 d1 a3 b3 c3 d3 a5 b5 c5 d5 a7 b7 c7 d7
trn2 \r3\().8h, \r0\().8h, \r2\().8h
// e0 f0 g0 h0 e2 f2 g2 h2 e4 f4 g4 h4 e6 f6 g6 h6
trn1 \r5\().8h, \r4\().8h, \r6\().8h
// e1 f1 g1 h1 e3 f3 g3 h3 e5 f5 g5 h5 e7 f7 g7 h7
trn2 \r7\().8h, \r4\().8h, \r6\().8h
// a0 b0 c0 d0 e0 f0 g0 h0 a4 b4 c4 d4 e4 f4 g4 h4
trn1 \r0\().4s, \r1\().4s, \r5\().4s
// a2 b2 c2 d2 e2 f2 g2 h2 a6 b6 c6 d6 e6 f6 g6 h6
trn2 \r2\().4s, \r1\().4s, \r5\().4s
// a1 b1 c1 d1 e1 f1 g1 h1 a5 b5 c5 d5 e5 f5 g5 h5
trn1 \r1\().4s, \r3\().4s, \r7\().4s
// a3 b3 c3 d3 e3 f3 g3 h3 a7 b7 c7 d7 e7 f7 g7 h7
trn2 \r3\().4s, \r3\().4s, \r7\().4s
\xtl\()2 \r4\().8h, \r0\().16b
\xtl \r0\().8h, \r0\().8b
\xtl\()2 \r6\().8h, \r2\().16b
\xtl \r2\().8h, \r2\().8b
\xtl\()2 \r5\().8h, \r1\().16b
\xtl \r1\().8h, \r1\().8b
\xtl\()2 \r7\().8h, \r3\().16b
\xtl \r3\().8h, \r3\().8b
.endm
.macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, t8, t9

2
third_party/dav1d/src/arm/ipred_init_tmpl.c (vendored)

@ -55,7 +55,6 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_arm)(Dav1dIntraPredDSPContext *const c
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
#if BITDEPTH == 8 || ARCH_AARCH64
c->intra_pred[DC_PRED] = BF(dav1d_ipred_dc, neon);
c->intra_pred[DC_128_PRED] = BF(dav1d_ipred_dc_128, neon);
c->intra_pred[TOP_DC_PRED] = BF(dav1d_ipred_dc_top, neon);
@ -78,5 +77,4 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_arm)(Dav1dIntraPredDSPContext *const c
c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_ipred_cfl_ac_444, neon);
c->pal_pred = BF(dav1d_pal_pred, neon);
#endif
}

2
third_party/dav1d/src/arm/itx_init_tmpl.c (vendored)

@ -119,7 +119,6 @@ COLD void bitfn(dav1d_itx_dsp_init_arm)(Dav1dInvTxfmDSPContext *const c, int bpc
if (bpc > 10) return;
#if ARCH_AARCH64 || BITDEPTH == 8
assign_itx17_fn( , 4, 4, neon);
assign_itx16_fn(R, 4, 8, neon);
assign_itx16_fn(R, 4, 16, neon);
@ -139,5 +138,4 @@ COLD void bitfn(dav1d_itx_dsp_init_arm)(Dav1dInvTxfmDSPContext *const c, int bpc
assign_itx1_fn (R, 64, 16, neon);
assign_itx1_fn (R, 64, 32, neon);
assign_itx1_fn ( , 64, 64, neon);
#endif
}

third_party/dav1d/src/arm/looprestoration_init_tmpl.c (vendored)

@ -27,7 +27,23 @@
#include "src/cpu.h"
#include "src/looprestoration.h"
#include "src/tables.h"
#if ARCH_AARCH64
void BF(dav1d_wiener_filter7, neon)(pixel *p, const ptrdiff_t p_stride,
const pixel (*left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges
HIGHBD_DECL_SUFFIX);
void BF(dav1d_wiener_filter5, neon)(pixel *p, const ptrdiff_t p_stride,
const pixel (*left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges
HIGHBD_DECL_SUFFIX);
#else
// The 8bpc version calculates things slightly differently than the reference
// C version. That version calculates roughly this:
@ -59,16 +75,15 @@ void BF(dav1d_wiener_filter_v, neon)(pixel *dst, ptrdiff_t stride,
const int16_t *mid, int w, int h,
const int16_t fv[8], enum LrEdgeFlags edges,
ptrdiff_t mid_stride HIGHBD_DECL_SUFFIX);
void BF(dav1d_copy_narrow, neon)(pixel *dst, ptrdiff_t stride,
const pixel *src, int w, int h);
static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
const pixel (*const left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h,
const int16_t filter[2][8],
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
const int16_t (*const filter)[8] = params->filter;
ALIGN_STK_16(int16_t, mid, 68 * 384,);
int mid_stride = (w + 7) & ~7;
@ -86,23 +101,12 @@ static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
HIGHBD_TAIL_SUFFIX);
// Vertical filter
if (w >= 8)
BF(dav1d_wiener_filter_v, neon)(dst, dst_stride, &mid[2*mid_stride],
w & ~7, h, filter[1], edges,
mid_stride * sizeof(*mid)
HIGHBD_TAIL_SUFFIX);
if (w & 7) {
// For uneven widths, do a full 8 pixel wide filtering into a temp
// buffer and copy out the narrow slice of pixels separately into dest.
ALIGN_STK_16(pixel, tmp, 64 * 8,);
BF(dav1d_wiener_filter_v, neon)(tmp, (w & 7) * sizeof(pixel),
&mid[2*mid_stride + (w & ~7)],
w & 7, h, filter[1], edges,
mid_stride * sizeof(*mid)
HIGHBD_TAIL_SUFFIX);
BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, tmp, w & 7, h);
}
BF(dav1d_wiener_filter_v, neon)(dst, dst_stride, &mid[2*mid_stride],
w, h, filter[1], edges,
mid_stride * sizeof(*mid)
HIGHBD_TAIL_SUFFIX);
}
#endif
void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum,
const pixel (*left)[4],
@ -204,83 +208,50 @@ void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride,
const int w, const int h,
const int16_t wt[2] HIGHBD_DECL_SUFFIX);
static void sgr_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
const pixel (*const left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h, const int sgr_idx,
const int16_t sgr_wt[7], const enum LrEdgeFlags edges
HIGHBD_DECL_SUFFIX)
static void sgr_filter_5x5_neon(pixel *const dst, const ptrdiff_t dst_stride,
const pixel (*const left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
if (!dav1d_sgr_params[sgr_idx][0]) {
ALIGN_STK_16(int16_t, tmp, 64 * 384,);
dav1d_sgr_filter1_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
w, h, dav1d_sgr_params[sgr_idx][3], edges
HIGHBD_TAIL_SUFFIX);
if (w >= 8)
BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride,
tmp, w & ~7, h, (1 << 7) - sgr_wt[1]
HIGHBD_TAIL_SUFFIX);
if (w & 7) {
// For uneven widths, do a full 8 pixel wide filtering into a temp
// buffer and copy out the narrow slice of pixels separately into
// dest.
ALIGN_STK_16(pixel, stripe, 64 * 8,);
BF(dav1d_sgr_weighted1, neon)(stripe, (w & 7) * sizeof(pixel),
dst + (w & ~7), dst_stride,
tmp + (w & ~7), w & 7, h,
(1 << 7) - sgr_wt[1]
HIGHBD_TAIL_SUFFIX);
BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, stripe,
w & 7, h);
}
} else if (!dav1d_sgr_params[sgr_idx][1]) {
ALIGN_STK_16(int16_t, tmp, 64 * 384,);
dav1d_sgr_filter2_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
w, h, dav1d_sgr_params[sgr_idx][2], edges
HIGHBD_TAIL_SUFFIX);
if (w >= 8)
BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride,
tmp, w & ~7, h, sgr_wt[0]
HIGHBD_TAIL_SUFFIX);
if (w & 7) {
// For uneven widths, do a full 8 pixel wide filtering into a temp
// buffer and copy out the narrow slice of pixels separately into
// dest.
ALIGN_STK_16(pixel, stripe, 64 * 8,);
BF(dav1d_sgr_weighted1, neon)(stripe, (w & 7) * sizeof(pixel),
dst + (w & ~7), dst_stride,
tmp + (w & ~7), w & 7, h, sgr_wt[0]
HIGHBD_TAIL_SUFFIX);
BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, stripe,
w & 7, h);
}
} else {
ALIGN_STK_16(int16_t, tmp1, 64 * 384,);
ALIGN_STK_16(int16_t, tmp2, 64 * 384,);
dav1d_sgr_filter2_neon(tmp1, dst, dst_stride, left, lpf, lpf_stride,
w, h, dav1d_sgr_params[sgr_idx][2], edges
HIGHBD_TAIL_SUFFIX);
dav1d_sgr_filter1_neon(tmp2, dst, dst_stride, left, lpf, lpf_stride,
w, h, dav1d_sgr_params[sgr_idx][3], edges
HIGHBD_TAIL_SUFFIX);
const int16_t wt[2] = { sgr_wt[0], 128 - sgr_wt[0] - sgr_wt[1] };
if (w >= 8)
BF(dav1d_sgr_weighted2, neon)(dst, dst_stride, dst, dst_stride,
tmp1, tmp2, w & ~7, h, wt
HIGHBD_TAIL_SUFFIX);
if (w & 7) {
// For uneven widths, do a full 8 pixel wide filtering into a temp
// buffer and copy out the narrow slice of pixels separately into
// dest.
ALIGN_STK_16(pixel, stripe, 64 * 8,);
BF(dav1d_sgr_weighted2, neon)(stripe, (w & 7) * sizeof(pixel),
dst + (w & ~7), dst_stride,
tmp1 + (w & ~7), tmp2 + (w & ~7),
w & 7, h, wt HIGHBD_TAIL_SUFFIX);
BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, stripe,
w & 7, h);
}
}
ALIGN_STK_16(int16_t, tmp, 64 * 384,);
dav1d_sgr_filter2_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX);
BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride,
tmp, w, h, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
}
static void sgr_filter_3x3_neon(pixel *const dst, const ptrdiff_t dst_stride,
const pixel (*const left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
ALIGN_STK_16(int16_t, tmp, 64 * 384,);
dav1d_sgr_filter1_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX);
BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride,
tmp, w, h, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
}
static void sgr_filter_mix_neon(pixel *const dst, const ptrdiff_t dst_stride,
const pixel (*const left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
ALIGN_STK_16(int16_t, tmp1, 64 * 384,);
ALIGN_STK_16(int16_t, tmp2, 64 * 384,);
dav1d_sgr_filter2_neon(tmp1, dst, dst_stride, left, lpf, lpf_stride,
w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX);
dav1d_sgr_filter1_neon(tmp2, dst, dst_stride, left, lpf, lpf_stride,
w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX);
const int16_t wt[2] = { params->sgr.w0, params->sgr.w1 };
BF(dav1d_sgr_weighted2, neon)(dst, dst_stride, dst, dst_stride,
tmp1, tmp2, w, h, wt HIGHBD_TAIL_SUFFIX);
}
COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c, int bpc) {
@ -288,7 +259,15 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPCont
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
#if ARCH_AARCH64
c->wiener[0] = BF(dav1d_wiener_filter7, neon);
c->wiener[1] = BF(dav1d_wiener_filter5, neon);
#else
c->wiener[0] = c->wiener[1] = wiener_filter_neon;
if (bpc <= 10)
c->selfguided = sgr_filter_neon;
#endif
if (bpc <= 10) {
c->sgr[0] = sgr_filter_5x5_neon;
c->sgr[1] = sgr_filter_3x3_neon;
c->sgr[2] = sgr_filter_mix_neon;
}
}

14
third_party/dav1d/src/cdef_apply_tmpl.c (vendored)

@ -117,7 +117,7 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
for (int bit = 0, by = by_start; by < by_end; by += 2, edges |= CDEF_HAVE_TOP) {
const int tf = f->lf.top_pre_cdef_toggle;
const int by_idx = by & 30;
const int by_idx = (by & 30) >> 1;
if (by + 2 >= f->bh) edges &= ~CDEF_HAVE_BOTTOM;
if (edges & CDEF_HAVE_BOTTOM) // backup pre-filter data for next iteration
@ -140,6 +140,11 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
goto next_sb;
}
// Create a complete 32-bit mask for the sb row ahead of time.
const uint16_t (*noskip_row)[2] = &lflvl[sb128x].noskip_mask[by_idx];
const unsigned noskip_mask = (unsigned) noskip_row[0][1] << 16 |
noskip_row[0][0];
const int y_lvl = f->frame_hdr->cdef.y_strength[cdef_idx];
const int uv_lvl = f->frame_hdr->cdef.uv_strength[cdef_idx];
const enum Backup2x8Flags flag = !!y_lvl + (!!uv_lvl << 1);
@ -162,11 +167,8 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
// check if this 8x8 block had any coded coefficients; if not,
// go to the next block
const unsigned bx_mask = 3U << (bx & 14);
const int bx_idx = (bx & 16) >> 4;
if (!((lflvl[sb128x].noskip_mask[by_idx + 0][bx_idx] |
lflvl[sb128x].noskip_mask[by_idx + 1][bx_idx]) & bx_mask))
{
const uint32_t bx_mask = 3U << (bx & 30);
if (!(noskip_mask & bx_mask)) {
last_skip = 1;
goto next_b;
}

6
third_party/dav1d/src/cdf.c (vendored)

@ -1,5 +1,5 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018-2021, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
@ -29,6 +29,8 @@
#include <string.h>
#include "common/frame.h"
#include "src/internal.h"
#include "src/tables.h"
@ -4012,7 +4014,7 @@ void dav1d_cdf_thread_update(const Dav1dFrameHeader *const hdr,
update_cdf_1d(11, m.txtp_inter2);
update_bit_1d(4, m.txtp_inter3);
if (!(hdr->frame_type & 1)) {
if (IS_KEY_OR_INTRA(hdr)) {
update_bit_0d(m.intrabc);
update_cdf_1d(N_MV_JOINTS - 1, dmv.joint);

12
third_party/dav1d/src/data.c (vendored)

@ -102,18 +102,6 @@ void dav1d_data_ref(Dav1dData *const dst, const Dav1dData *const src) {
*dst = *src;
}
void dav1d_data_move_ref(Dav1dData *const dst, Dav1dData *const src) {
validate_input(dst != NULL);
validate_input(dst->data == NULL);
validate_input(src != NULL);
if (src->ref)
validate_input(src->data != NULL);
*dst = *src;
memset(src, 0, sizeof(*src));
}
void dav1d_data_props_copy(Dav1dDataProps *const dst,
const Dav1dDataProps *const src)
{

5
third_party/dav1d/src/data.h поставляемый
Просмотреть файл

@ -32,11 +32,6 @@
void dav1d_data_ref(Dav1dData *dst, const Dav1dData *src);
/**
* Move a data reference.
*/
void dav1d_data_move_ref(Dav1dData *dst, Dav1dData *src);
/**
* Copy the source properties to the destitionatin and increase the
* user_data's reference count (if it's not NULL).

145
third_party/dav1d/src/decode.c (vendored)

@ -1,5 +1,5 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018-2021, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
@ -35,6 +35,7 @@
#include "dav1d/data.h"
#include "common/frame.h"
#include "common/intops.h"
#include "src/ctx.h"
@ -727,7 +728,7 @@ static int decode_b(Dav1dTileContext *const t,
case_set(bh4, l., 1, by4);
case_set(bw4, a->, 0, bx4);
#undef set_ctx
if (f->frame_hdr->frame_type & 1) {
if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
refmvs_block *const r = &t->rt.r[(t->by & 31) + 5 + bh4 - 1][t->bx];
for (int x = 0; x < bw4; x++) {
r[x].ref.ref[0] = 0;
@ -748,7 +749,7 @@ static int decode_b(Dav1dTileContext *const t,
#undef set_ctx
}
} else {
if (f->frame_hdr->frame_type & 1 /* not intrabc */ &&
if (IS_INTER_OR_SWITCH(f->frame_hdr) /* not intrabc */ &&
b->comp_type == COMP_INTER_NONE && b->motion_mode == MM_WARP)
{
if (b->matrix[0] == SHRT_MIN) {
@ -791,7 +792,7 @@ static int decode_b(Dav1dTileContext *const t,
case_set(bw4, a->, 0, bx4);
#undef set_ctx
if (f->frame_hdr->frame_type & 1) {
if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
refmvs_block *const r = &t->rt.r[(t->by & 31) + 5 + bh4 - 1][t->bx];
for (int x = 0; x < bw4; x++) {
r[x].ref.ref[0] = b->ref[0] + 1;
@ -1043,7 +1044,7 @@ static int decode_b(Dav1dTileContext *const t,
if (b->skip_mode) {
b->intra = 0;
} else if (f->frame_hdr->frame_type & 1) {
} else if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
if (seg && (seg->ref >= 0 || seg->globalmv)) {
b->intra = !seg->ref;
} else {
@ -1064,7 +1065,7 @@ static int decode_b(Dav1dTileContext *const t,
// intra/inter-specific stuff
if (b->intra) {
uint16_t *const ymode_cdf = f->frame_hdr->frame_type & 1 ?
uint16_t *const ymode_cdf = IS_INTER_OR_SWITCH(f->frame_hdr) ?
ts->cdf.m.y_mode[dav1d_ymode_size_context[bs]] :
ts->cdf.kfym[dav1d_intra_mode_context[t->a->mode[bx4]]]
[dav1d_intra_mode_context[t->l.mode[by4]]];
@ -1252,7 +1253,7 @@ static int decode_b(Dav1dTileContext *const t,
rep_macro(type, t->dir skip, off, mul * b->skip); \
/* see aomedia bug 2183 for why we use luma coordinates here */ \
rep_macro(type, t->pal_sz_uv[diridx], off, mul * (has_chroma ? b->pal_sz[1] : 0)); \
if (f->frame_hdr->frame_type & 1) { \
if (IS_INTER_OR_SWITCH(f->frame_hdr)) { \
rep_macro(type, t->dir comp_type, off, mul * COMP_INTER_NONE); \
rep_macro(type, t->dir ref[0], off, mul * ((uint8_t) -1)); \
rep_macro(type, t->dir ref[1], off, mul * ((uint8_t) -1)); \
@ -1293,10 +1294,10 @@ static int decode_b(Dav1dTileContext *const t,
}
}
}
if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) {
if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) {
splat_intraref(&t->rt, t->by, t->bx, bs);
}
} else if (!(f->frame_hdr->frame_type & 1)) {
} else if (IS_KEY_OR_INTRA(f->frame_hdr)) {
// intra block copy
refmvs_candidate mvstack[8];
int n_mvs, ctx;
@ -1984,10 +1985,10 @@ static int decode_b(Dav1dTileContext *const t,
#undef set_ctx
}
if (!b->skip) {
uint16_t (*noskip_mask)[2] = &t->lf_mask->noskip_mask[by4];
uint16_t (*noskip_mask)[2] = &t->lf_mask->noskip_mask[by4 >> 1];
const unsigned mask = (~0U >> (32 - bw4)) << (bx4 & 15);
const int bx_idx = (bx4 & 16) >> 4;
for (int y = 0; y < bh4; y++, noskip_mask++) {
for (int y = 0; y < bh4; y += 2, noskip_mask++) {
(*noskip_mask)[bx_idx] |= mask;
if (bw4 == 32) // this should be mask >> 16, but it's 0xffffffff anyway
(*noskip_mask)[1] |= mask;
@ -2484,15 +2485,12 @@ static void read_restoration_info(Dav1dTileContext *const t,
lr->filter_h[1], lr->filter_h[2], ts->msac.rng);
} else if (lr->type == DAV1D_RESTORATION_SGRPROJ) {
const unsigned idx = dav1d_msac_decode_bools(&ts->msac, 4);
const uint16_t *const sgr_params = dav1d_sgr_params[idx];
lr->sgr_idx = idx;
lr->sgr_weights[0] = dav1d_sgr_params[idx][0] ?
dav1d_msac_decode_subexp(&ts->msac,
ts->lr_ref[p]->sgr_weights[0] + 96, 128, 4) - 96 :
0;
lr->sgr_weights[1] = dav1d_sgr_params[idx][1] ?
dav1d_msac_decode_subexp(&ts->msac,
ts->lr_ref[p]->sgr_weights[1] + 32, 128, 4) - 32 :
95;
lr->sgr_weights[0] = sgr_params[0] ? dav1d_msac_decode_subexp(&ts->msac,
ts->lr_ref[p]->sgr_weights[0] + 96, 128, 4) - 96 : 0;
lr->sgr_weights[1] = sgr_params[1] ? dav1d_msac_decode_subexp(&ts->msac,
ts->lr_ref[p]->sgr_weights[1] + 32, 128, 4) - 32 : 95;
memcpy(lr->filter_v, ts->lr_ref[p]->filter_v, sizeof(lr->filter_v));
memcpy(lr->filter_h, ts->lr_ref[p]->filter_h, sizeof(lr->filter_h));
ts->lr_ref[p] = lr;
@ -2513,20 +2511,20 @@ int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) {
const int col_sb_start = f->frame_hdr->tiling.col_start_sb[tile_col];
const int col_sb128_start = col_sb_start >> !f->seq_hdr->sb128;
if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) {
if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) {
dav1d_refmvs_tile_sbrow_init(&t->rt, &f->rf, ts->tiling.col_start,
ts->tiling.col_end, ts->tiling.row_start,
ts->tiling.row_end, t->by >> f->sb_shift,
ts->tiling.row);
}
reset_context(&t->l, !(f->frame_hdr->frame_type & 1), f->frame_thread.pass);
reset_context(&t->l, IS_KEY_OR_INTRA(f->frame_hdr), f->frame_thread.pass);
if (f->frame_thread.pass == 2) {
for (t->bx = ts->tiling.col_start,
t->a = f->a + col_sb128_start + tile_row * f->sb128w;
t->bx < ts->tiling.col_end; t->bx += sb_step)
{
if (atomic_load_explicit(c->frame_thread.flush, memory_order_acquire))
if (atomic_load_explicit(c->flush, memory_order_acquire))
return 1;
if (decode_sb(t, root_bl, c->intra_edge.root[root_bl]))
return 1;
@ -2557,7 +2555,7 @@ int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) {
t->lf_mask = f->lf.mask + sb128y * f->sb128w + col_sb128_start;
t->bx < ts->tiling.col_end; t->bx += sb_step)
{
if (atomic_load_explicit(c->frame_thread.flush, memory_order_acquire))
if (atomic_load_explicit(c->flush, memory_order_acquire))
return 1;
if (root_bl == BL_128X128) {
t->cur_sb_cdef_idx_ptr = t->lf_mask->cdef_idx;
@ -2631,7 +2629,7 @@ int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) {
}
}
if (f->n_tc > 1 && f->frame_hdr->frame_type & 1) {
if (f->n_tc > 1 && IS_INTER_OR_SWITCH(f->frame_hdr)) {
dav1d_refmvs_save_tmvs(&t->rt,
ts->tiling.col_start >> 1, ts->tiling.col_end >> 1,
t->by >> 1, (t->by + sb_step) >> 1);
@ -2859,7 +2857,9 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
const int lr_line_sz = ((f->sr_cur.p.p.w + 31) & ~31) << hbd;
if (lr_line_sz != f->lf.lr_line_sz) {
dav1d_freep_aligned(&f->lf.lr_lpf_line[0]);
uint8_t *lr_ptr = dav1d_alloc_aligned(lr_line_sz * 3 * 12, 32);
const int num_lines = c->n_pfc > 1 ? f->sbh * (4 << f->seq_hdr->sb128) : 12;
// lr simd may overread the input, so slightly over-allocate the lpf buffer
uint8_t *lr_ptr = dav1d_alloc_aligned(lr_line_sz * num_lines * 3 + 64, 32);
if (!lr_ptr) {
f->lf.lr_line_sz = 0;
goto error;
@ -2867,7 +2867,7 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
for (int pl = 0; pl <= 2; pl++) {
f->lf.lr_lpf_line[pl] = lr_ptr;
lr_ptr += lr_line_sz * 12;
lr_ptr += lr_line_sz * num_lines;
}
f->lf.lr_line_sz = lr_line_sz;
@ -2949,26 +2949,30 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
}
// init ref mvs
if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) {
if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) {
const int ret =
dav1d_refmvs_init_frame(&f->rf, f->seq_hdr, f->frame_hdr,
f->refpoc, f->mvs, f->refrefpoc, f->ref_mvs, f->n_tc);
if (ret < 0) goto error;
}
// create post-filtering tasks
if (c->n_pfc > 1)
if (dav1d_task_create_filter_sbrow(f))
goto error;
retval = DAV1D_ERR(EINVAL);
// setup dequant tables
init_quant_tables(f->seq_hdr, f->frame_hdr, f->frame_hdr->quant.yac, f->dq);
if (f->frame_hdr->quant.qm)
for (int j = 0; j < N_RECT_TX_SIZES; j++) {
f->qm[0][j][0] = dav1d_qm_tbl[f->frame_hdr->quant.qm_y][0][j];
f->qm[0][j][1] = dav1d_qm_tbl[f->frame_hdr->quant.qm_u][1][j];
f->qm[0][j][2] = dav1d_qm_tbl[f->frame_hdr->quant.qm_v][1][j];
for (int i = 0; i < N_RECT_TX_SIZES; i++) {
f->qm[i][0] = dav1d_qm_tbl[f->frame_hdr->quant.qm_y][0][i];
f->qm[i][1] = dav1d_qm_tbl[f->frame_hdr->quant.qm_u][1][i];
f->qm[i][2] = dav1d_qm_tbl[f->frame_hdr->quant.qm_v][1][i];
}
for (int i = f->frame_hdr->quant.qm; i < 2; i++)
for (int tx = 0; tx < N_RECT_TX_SIZES; tx++)
for (int pl = 0; pl < 3; pl++)
f->qm[i][tx][pl] = dav1d_qm_tbl[15][!!pl][tx];
else
memset(f->qm, 0, sizeof(f->qm));
// setup jnt_comp weights
if (f->frame_hdr->switchable_comp_refs) {
@ -3079,9 +3083,9 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
f->frame_thread.pass == 1 ? PLANE_TYPE_BLOCK : PLANE_TYPE_Y;
for (int n = 0; n < f->sb128w * f->frame_hdr->tiling.rows; n++)
reset_context(&f->a[n], !(f->frame_hdr->frame_type & 1), f->frame_thread.pass);
reset_context(&f->a[n], IS_KEY_OR_INTRA(f->frame_hdr), f->frame_thread.pass);
if (f->n_tc == 1) {
if (f->n_tc == 1 || (c->n_pfc > 1 && f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows == 1)) {
Dav1dTileContext *const t = f->tc;
// no tile threading - we explicitly interleave tile/sbrow decoding
@ -3108,18 +3112,31 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
}
for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) {
t->ts = &f->ts[tile_row * f->frame_hdr->tiling.cols + tile_col];
if (dav1d_decode_tile_sbrow(t)) goto error;
}
if (f->frame_thread.pass <= 1 && f->frame_hdr->frame_type & 1) {
if (f->frame_thread.pass <= 1 && IS_INTER_OR_SWITCH(f->frame_hdr)) {
dav1d_refmvs_save_tmvs(&t->rt, 0, f->bw >> 1, t->by >> 1, by_end);
}
// loopfilter + cdef + restoration
if (f->frame_thread.pass != 1)
f->bd_fn.filter_sbrow(f, sby);
dav1d_thread_picture_signal(&f->sr_cur, (sby + 1) * f->sb_step * 4,
progress_plane_type);
if (f->frame_thread.pass != 1) {
if (c->n_pfc == 1)
f->bd_fn.filter_sbrow(f, sby);
else {
pthread_mutex_lock(&f->lf.thread.pftd->lock);
if (f->lf.thread.npf != 0 && !f->lf.thread.done) {
Dav1dTask *const t = &f->lf.thread.tasks[sby * f->lf.thread.npf];
t->start = 1;
if (t->status == DAV1D_TASK_READY)
dav1d_task_schedule(f->lf.thread.pftd, t);
}
pthread_mutex_unlock(&f->lf.thread.pftd->lock);
}
}
if (c->n_pfc == 1 || f->frame_thread.pass == 1 || f->lf.thread.npf == 0)
dav1d_thread_picture_signal(&f->sr_cur,
(sby + 1) * f->sb_step * 4,
progress_plane_type);
}
}
} else {
@ -3142,7 +3159,6 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
pthread_cond_broadcast(&f->tile_thread.cond);
pthread_mutex_unlock(&f->tile_thread.lock);
// loopfilter + cdef + restoration
for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) {
for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row];
sby < f->frame_hdr->tiling.row_start_sb[tile_row + 1]; sby++)
@ -3174,10 +3190,24 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
}
// loopfilter + cdef + restoration
if (f->frame_thread.pass != 1)
f->bd_fn.filter_sbrow(f, sby);
dav1d_thread_picture_signal(&f->sr_cur, (sby + 1) * f->sb_step * 4,
progress_plane_type);
if (f->frame_thread.pass != 1) {
if (c->n_pfc == 1)
f->bd_fn.filter_sbrow(f, sby);
else {
pthread_mutex_lock(&f->lf.thread.pftd->lock);
if (f->lf.thread.npf != 0 && !f->lf.thread.done) {
Dav1dTask *const t = &f->lf.thread.tasks[sby * f->lf.thread.npf];
t->start = 1;
if (t->status == DAV1D_TASK_READY)
dav1d_task_schedule(f->lf.thread.pftd, t);
}
pthread_mutex_unlock(&f->lf.thread.pftd->lock);
}
}
if (c->n_pfc == 1 || f->frame_thread.pass == 1 || f->lf.thread.npf == 0)
dav1d_thread_picture_signal(&f->sr_cur,
(sby + 1) * f->sb_step * 4,
progress_plane_type);
}
}
@ -3222,6 +3252,17 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
retval = 0;
error:
if (c->n_pfc > 1) {
pthread_mutex_lock(&f->lf.thread.pftd->lock);
if (!f->lf.thread.done) {
if (retval != 0) {
f->lf.thread.done = -1;
pthread_cond_signal(&f->lf.thread.pftd->cond);
}
pthread_cond_wait(&f->lf.thread.cond, &f->lf.thread.pftd->lock);
}
pthread_mutex_unlock(&f->lf.thread.pftd->lock);
}
dav1d_thread_picture_signal(&f->sr_cur, retval == 0 ? UINT_MAX : FRAME_ERROR,
PLANE_TYPE_ALL);
for (int i = 0; i < 7; i++) {
@ -3329,6 +3370,10 @@ int dav1d_submit_frame(Dav1dContext *const c) {
f->bd_fn.recon_b_inter = dav1d_recon_b_inter_##bd##bpc; \
f->bd_fn.recon_b_intra = dav1d_recon_b_intra_##bd##bpc; \
f->bd_fn.filter_sbrow = dav1d_filter_sbrow_##bd##bpc; \
f->bd_fn.filter_sbrow_deblock = dav1d_filter_sbrow_deblock_##bd##bpc; \
f->bd_fn.filter_sbrow_cdef = dav1d_filter_sbrow_cdef_##bd##bpc; \
f->bd_fn.filter_sbrow_resize = dav1d_filter_sbrow_resize_##bd##bpc; \
f->bd_fn.filter_sbrow_lr = dav1d_filter_sbrow_lr_##bd##bpc; \
f->bd_fn.backup_ipred_edge = dav1d_backup_ipred_edge_##bd##bpc; \
f->bd_fn.read_coef_blocks = dav1d_read_coef_blocks_##bd##bpc
if (!f->seq_hdr->hbd) {
@ -3343,7 +3388,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
#undef assign_bitdepth_case
int ref_coded_width[7];
if (f->frame_hdr->frame_type & 1) {
if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
if (f->frame_hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE) {
const int pri_ref = f->frame_hdr->refidx[f->frame_hdr->primary_ref_frame];
if (!c->refs[pri_ref].p.p.data[0]) {
@ -3461,7 +3506,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
f->bitdepth_max = (1 << f->cur.p.bpc) - 1;
// ref_mvs
if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) {
if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) {
f->mvs_ref = dav1d_ref_create_using_pool(c->refmvs_pool,
sizeof(*f->mvs) * f->sb128h * 16 * (f->b4_stride >> 1));
if (!f->mvs_ref) {

third_party/dav1d/src/dequant_tables.c

@ -29,7 +29,7 @@
#include "src/dequant_tables.h"
const uint16_t dav1d_dq_tbl[][QINDEX_RANGE][2] = {
const uint16_t dav1d_dq_tbl[3][QINDEX_RANGE][2] = {
{
{ 4, 4, }, { 8, 8, }, { 8, 9, }, { 9, 10, },
{ 10, 11, }, { 11, 12, }, { 12, 13, }, { 12, 14, },

third_party/dav1d/src/dequant_tables.h

@ -32,6 +32,6 @@
#include "src/levels.h"
extern const uint16_t dav1d_dq_tbl[][QINDEX_RANGE][2];
extern const uint16_t dav1d_dq_tbl[3][QINDEX_RANGE][2];
#endif /* DAV1D_SRC_DEQUANT_TABLES_H */

third_party/dav1d/src/ext/x86/x86inc.asm

@ -1,7 +1,7 @@
;*****************************************************************************
;* x86inc.asm: x86 abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2020 x264 project
;* Copyright (C) 2005-2021 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Henrik Gramner <henrik@gramner.com>
@ -349,6 +349,28 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
%define high_mm_regs (16*cpuflag(avx512))
; Large stack allocations on Windows need to use stack probing in order
; to guarantee that all stack memory is committed before accessing it.
; This is done by ensuring that the guard page(s) at the end of the
; currently committed pages are touched prior to any pages beyond that.
%if WIN64
%assign STACK_PROBE_SIZE 8192
%elifidn __OUTPUT_FORMAT__, win32
%assign STACK_PROBE_SIZE 4096
%else
%assign STACK_PROBE_SIZE 0
%endif
%macro PROBE_STACK 1 ; stack_size
%if STACK_PROBE_SIZE
%assign %%i STACK_PROBE_SIZE
%rep %1 / STACK_PROBE_SIZE
mov eax, [rsp-%%i]
%assign %%i %%i+STACK_PROBE_SIZE
%endrep
%endif
%endmacro
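A minimal C analog of the probing idea (illustrative only, assuming a 4096-byte probe granularity): touch one byte per step so each guard page is committed in order before anything deeper is accessed, which is what the mov eax, [rsp-i] probes above do for the stack.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

enum { PROBE_STEP = 4096 };   /* assumed probe/page granularity */

static void probe_region(volatile uint8_t *const base, const size_t size) {
    for (size_t off = 0; off < size; off += PROBE_STEP)
        base[off] = 0;        /* commits the page containing base + off */
    if (size)
        base[size - 1] = 0;
}

int main(void) {
    const size_t size = 3 * PROBE_STEP + 123;
    uint8_t *const buf = malloc(size);
    if (!buf) return 1;
    probe_region(buf, size);
    printf("probed %zu bytes in %d-byte steps\n", size, PROBE_STEP);
    free(buf);
    return 0;
}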
%macro ALLOC_STACK 0-2 0, 0 ; stack_size, n_xmm_regs (for win64 only)
%ifnum %1
%if %1 != 0
@ -369,6 +391,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%if required_stack_alignment <= STACK_ALIGNMENT
; maintain the current stack alignment
%assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
PROBE_STACK stack_size_padded
SUB rsp, stack_size_padded
%else
%assign %%reg_num (regs_used - 1)
@ -384,6 +407,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%xdefine rstkm rstk
%endif
%assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1))
PROBE_STACK stack_size_padded
mov rstk, rsp
and rsp, ~(required_stack_alignment-1)
sub rsp, stack_size_padded
@ -1139,8 +1163,7 @@ INIT_XMM
%endif
%xdefine %%tmp %%f %+ 0
%ifnum %%tmp
RESET_MM_PERMUTATION
AVX512_MM_PERMUTATION
DEFINE_MMREGS mmtype
%assign %%i 0
%rep num_mmregs
%xdefine %%tmp %%f %+ %%i

third_party/dav1d/src/internal.h

@ -35,6 +35,8 @@
typedef struct Dav1dFrameContext Dav1dFrameContext;
typedef struct Dav1dTileState Dav1dTileState;
typedef struct Dav1dTileContext Dav1dTileContext;
typedef struct Dav1dPostFilterContext Dav1dPostFilterContext;
typedef struct Dav1dTask Dav1dTask;
#include "common/attributes.h"
@ -76,6 +78,9 @@ struct Dav1dContext {
Dav1dFrameContext *fc;
unsigned n_fc;
Dav1dPostFilterContext *pfc;
unsigned n_pfc;
// cache of OBUs that make up a single frame before we submit them
// to a frame worker to be decoded
struct Dav1dTileGroup *tile;
@ -99,15 +104,23 @@ struct Dav1dContext {
// decoded output picture queue
Dav1dData in;
Dav1dPicture out;
// dummy is a pointer to prevent compiler errors about atomic_load()
// not taking const arguments
atomic_int flush_mem, *flush;
struct {
Dav1dThreadPicture *out_delayed;
unsigned next;
// dummy is a pointer to prevent compiler errors about atomic_load()
// not taking const arguments; the const attribute is not taken
// from pointers
atomic_int flush_mem, *flush;
} frame_thread;
// postfilter threading (refer to pfc[] for per_thread thingies)
struct PostFilterThreadData {
pthread_mutex_t lock;
pthread_cond_t cond;
struct Dav1dTask *tasks;
int frame_cnt;
int inited;
} postfilter_thread;
// reference/entropy state
Dav1dMemPool *segmap_pool;
Dav1dMemPool *refmvs_pool;
@ -182,6 +195,10 @@ struct Dav1dFrameContext {
recon_b_intra_fn recon_b_intra;
recon_b_inter_fn recon_b_inter;
filter_sbrow_fn filter_sbrow;
filter_sbrow_fn filter_sbrow_deblock;
filter_sbrow_fn filter_sbrow_cdef;
filter_sbrow_fn filter_sbrow_resize;
filter_sbrow_fn filter_sbrow_lr;
backup_ipred_edge_fn backup_ipred_edge;
read_coef_blocks_fn read_coef_blocks;
} bd_fn;
@ -191,7 +208,7 @@ struct Dav1dFrameContext {
ptrdiff_t b4_stride;
int w4, h4, bw, bh, sb128w, sb128h, sbh, sb_shift, sb_step, sr_sb128w;
uint16_t dq[DAV1D_MAX_SEGMENTS][3 /* plane */][2 /* dc/ac */];
const uint8_t *qm[2 /* is_1d */][N_RECT_TX_SIZES][3 /* plane */];
const uint8_t *qm[N_RECT_TX_SIZES][3 /* plane */];
BlockContext *a;
int a_sz /* w*tile_rows */;
refmvs_frame rf;
@ -238,6 +255,16 @@ struct Dav1dFrameContext {
pixel *p[3], *sr_p[3];
Av1Filter *mask_ptr, *prev_mask_ptr;
int restore_planes; // enum LrRestorePlanes
struct {
pthread_cond_t cond;
struct PostFilterThreadData *pftd;
struct Dav1dTask *tasks;
int num_tasks;
int npf;
int done;
int inited;
} thread;
} lf;
// threading (refer to tc[] for per-thread things)
@ -353,4 +380,11 @@ struct Dav1dTileContext {
} tile_thread;
};
struct Dav1dPostFilterContext {
Dav1dContext *c;
struct thread_data td;
int flushed;
int die;
};
#endif /* DAV1D_SRC_INTERNAL_H */
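A deliberately simplified sketch (not dav1d's actual scheduler) of the producer/consumer pattern these structs support: decode threads publish per-sbrow post-filter work under postfilter_thread.lock and signal its cond, while Dav1dPostFilterContext workers wait on that condition variable and drain the tasks.

#include <pthread.h>
#include <stdio.h>

typedef struct Queue {
    pthread_mutex_t lock;
    pthread_cond_t cond;
    int pending;   /* sbrows waiting for post-filtering */
    int die;       /* no more work will be posted */
} Queue;

static void *worker(void *const arg) {
    Queue *const q = arg;
    pthread_mutex_lock(&q->lock);
    while (q->pending || !q->die) {
        if (!q->pending) {
            pthread_cond_wait(&q->cond, &q->lock);
            continue;
        }
        q->pending--;
        pthread_mutex_unlock(&q->lock);
        puts("filter one sbrow");   /* deblock/cdef/resize/lr would run here */
        pthread_mutex_lock(&q->lock);
    }
    pthread_mutex_unlock(&q->lock);
    return NULL;
}

int main(void) {
    Queue q = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0, 0 };
    pthread_t th;
    if (pthread_create(&th, NULL, worker, &q)) return 1;
    pthread_mutex_lock(&q.lock);
    q.pending = 3;                  /* three sbrows become ready */
    q.die = 1;                      /* and nothing after them */
    pthread_cond_broadcast(&q.cond);
    pthread_mutex_unlock(&q.lock);
    pthread_join(th, NULL);
    return 0;
}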

third_party/dav1d/src/lf_mask.c

@ -89,7 +89,7 @@ static inline void mask_edges_inter(uint16_t (*const masks)[32][3][2],
const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[max_tx];
int y, x;
uint8_t txa[2 /* edge */][2 /* txsz, step */][32 /* y */][32 /* x */];
ALIGN_STK_16(uint8_t, txa, 2 /* edge */, [2 /* txsz, step */][32 /* y */][32 /* x */]);
for (int y_off = 0, y = 0; y < h4; y += t_dim->h, y_off++)
for (int x_off = 0, x = 0; x < w4; x += t_dim->w, x_off++)
decomp_tx((uint8_t(*)[2][32][32]) &txa[0][0][y][x],

third_party/dav1d/src/lf_mask.h

@ -40,11 +40,11 @@ typedef struct Av1FilterLUT {
} Av1FilterLUT;
typedef struct Av1RestorationUnit {
enum Dav1dRestorationType type;
uint8_t /* enum Dav1dRestorationType */ type;
int8_t filter_h[3];
int8_t filter_v[3];
uint8_t sgr_idx;
int16_t sgr_weights[2];
int8_t sgr_weights[2];
} Av1RestorationUnit;
// each struct describes one 128x128 area (1 or 4 SBs), pre-superres-scaling
@ -53,7 +53,7 @@ typedef struct Av1Filter {
uint16_t filter_y[2 /* 0=col, 1=row */][32][3][2];
uint16_t filter_uv[2 /* 0=col, 1=row */][32][2][2];
int8_t cdef_idx[4]; // -1 means "unset"
uint16_t noskip_mask[32][2];
uint16_t noskip_mask[16][2]; // for 8x8 blocks, but stored on a 4x8 basis
} Av1Filter;
// each struct describes one 128x128 area (1 or 4 SBs), post-superres-scaling

third_party/dav1d/src/lib.c

@ -65,6 +65,7 @@ COLD const char *dav1d_version(void) {
COLD void dav1d_default_settings(Dav1dSettings *const s) {
s->n_frame_threads = 1;
s->n_tile_threads = 1;
s->n_postfilter_threads = 1;
s->apply_grain = 1;
s->allocator.cookie = NULL;
s->allocator.alloc_picture_callback = dav1d_default_picture_alloc;
@ -100,6 +101,8 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
validate_input_or_ret(c_out != NULL, DAV1D_ERR(EINVAL));
validate_input_or_ret(s != NULL, DAV1D_ERR(EINVAL));
validate_input_or_ret(s->n_postfilter_threads >= 1 &&
s->n_postfilter_threads <= DAV1D_MAX_POSTFILTER_THREADS, DAV1D_ERR(EINVAL));
validate_input_or_ret(s->n_tile_threads >= 1 &&
s->n_tile_threads <= DAV1D_MAX_TILE_THREADS, DAV1D_ERR(EINVAL));
validate_input_or_ret(s->n_frame_threads >= 1 &&
@ -136,9 +139,17 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
{
goto error;
}
if (c->allocator.alloc_picture_callback == dav1d_default_picture_alloc) {
if (c->allocator.alloc_picture_callback == dav1d_default_picture_alloc &&
c->allocator.release_picture_callback == dav1d_default_picture_release)
{
if (c->allocator.cookie) goto error;
if (dav1d_mem_pool_init(&c->picture_pool)) goto error;
c->allocator.cookie = c->picture_pool;
} else if (c->allocator.alloc_picture_callback == dav1d_default_picture_alloc ||
c->allocator.release_picture_callback == dav1d_default_picture_release)
{
goto error;
}
/* On 32-bit systems extremely large frame sizes can cause overflows in
@ -152,12 +163,49 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
s->frame_size_limit, c->frame_size_limit);
}
c->frame_thread.flush = &c->frame_thread.flush_mem;
atomic_init(c->frame_thread.flush, 0);
c->flush = &c->flush_mem;
atomic_init(c->flush, 0);
c->n_pfc = s->n_postfilter_threads;
c->n_fc = s->n_frame_threads;
c->fc = dav1d_alloc_aligned(sizeof(*c->fc) * s->n_frame_threads, 32);
if (!c->fc) goto error;
memset(c->fc, 0, sizeof(*c->fc) * s->n_frame_threads);
if (c->n_pfc > 1) {
c->pfc = dav1d_alloc_aligned(sizeof(*c->pfc) * s->n_postfilter_threads, 32);
if (!c->pfc) goto error;
memset(c->pfc, 0, sizeof(*c->pfc) * s->n_postfilter_threads);
if (pthread_mutex_init(&c->postfilter_thread.lock, NULL)) goto error;
if (pthread_cond_init(&c->postfilter_thread.cond, NULL)) {
pthread_mutex_destroy(&c->postfilter_thread.lock);
goto error;
}
c->postfilter_thread.inited = 1;
for (int n = 0; n < s->n_frame_threads; n++) {
Dav1dFrameContext *const f = &c->fc[n];
if (pthread_cond_init(&f->lf.thread.cond, NULL)) goto error;
f->lf.thread.pftd = &c->postfilter_thread;
f->lf.thread.done = 1;
f->lf.thread.inited = 1;
}
for (int n = 0; n < s->n_postfilter_threads; ++n) {
Dav1dPostFilterContext *const pf = &c->pfc[n];
pf->c = c;
if (pthread_mutex_init(&pf->td.lock, NULL)) goto error;
if (pthread_cond_init(&pf->td.cond, NULL)) {
pthread_mutex_destroy(&pf->td.lock);
goto error;
}
if (pthread_create(&pf->td.thread, &thread_attr, dav1d_postfilter_task, pf)) {
pthread_cond_destroy(&c->postfilter_thread.cond);
pthread_mutex_destroy(&c->postfilter_thread.lock);
goto error;
}
pf->td.inited = 1;
}
}
if (c->n_fc > 1) {
c->frame_thread.out_delayed =
calloc(c->n_fc, sizeof(*c->frame_thread.out_delayed));
@ -459,11 +507,17 @@ void dav1d_flush(Dav1dContext *const c) {
dav1d_ref_dec(&c->content_light_ref);
dav1d_ref_dec(&c->itut_t35_ref);
if (c->n_fc == 1) return;
if (c->n_fc == 1 && c->n_pfc == 1) return;
// mark each currently-running frame as flushing, so that we
// exit out as quickly as the running thread checks this flag
atomic_store(c->frame_thread.flush, 1);
// wait for threads to complete flushing
if (c->n_pfc > 1)
pthread_mutex_lock(&c->postfilter_thread.lock);
atomic_store(c->flush, 1);
if (c->n_pfc > 1) {
pthread_cond_broadcast(&c->postfilter_thread.cond);
pthread_mutex_unlock(&c->postfilter_thread.lock);
}
if (c->n_fc == 1) goto skip_ft_flush;
for (unsigned n = 0, next = c->frame_thread.next; n < c->n_fc; n++, next++) {
if (next == c->n_fc) next = 0;
Dav1dFrameContext *const f = &c->fc[next];
@ -475,13 +529,31 @@ void dav1d_flush(Dav1dContext *const c) {
assert(!f->cur.data[0]);
}
pthread_mutex_unlock(&f->frame_thread.td.lock);
Dav1dThreadPicture *const out_delayed = &c->frame_thread.out_delayed[next];
Dav1dThreadPicture *const out_delayed =
&c->frame_thread.out_delayed[next];
if (out_delayed->p.data[0])
dav1d_thread_picture_unref(out_delayed);
}
atomic_store(c->frame_thread.flush, 0);
c->frame_thread.next = 0;
skip_ft_flush:
if (c->n_pfc > 1) {
for (unsigned i = 0; i < c->n_pfc; ++i) {
Dav1dPostFilterContext *const pf = &c->pfc[i];
pthread_mutex_lock(&pf->td.lock);
if (!pf->flushed)
pthread_cond_wait(&pf->td.cond, &pf->td.lock);
pf->flushed = 0;
pthread_mutex_unlock(&pf->td.lock);
}
pthread_mutex_lock(&c->postfilter_thread.lock);
c->postfilter_thread.tasks = NULL;
pthread_mutex_unlock(&c->postfilter_thread.lock);
for (unsigned i = 0; i < c->n_fc; ++i) {
freep(&c->fc[i].lf.thread.tasks);
c->fc[i].lf.thread.num_tasks = 0;
}
}
atomic_store(c->flush, 0);
}
COLD void dav1d_close(Dav1dContext **const c_out) {
@ -495,6 +567,25 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
if (flush) dav1d_flush(c);
if (c->pfc) {
struct PostFilterThreadData *pftd = &c->postfilter_thread;
if (pftd->inited) {
pthread_mutex_lock(&pftd->lock);
for (unsigned n = 0; n < c->n_pfc && c->pfc[n].td.inited; n++)
c->pfc[n].die = 1;
pthread_cond_broadcast(&pftd->cond);
pthread_mutex_unlock(&pftd->lock);
for (unsigned n = 0; n < c->n_pfc && c->pfc[n].td.inited; n++) {
pthread_join(c->pfc[n].td.thread, NULL);
pthread_cond_destroy(&c->pfc[n].td.cond);
pthread_mutex_destroy(&c->pfc[n].td.lock);
}
pthread_cond_destroy(&pftd->cond);
pthread_mutex_destroy(&pftd->lock);
}
dav1d_free_aligned(c->pfc);
}
for (unsigned n = 0; c->fc && n < c->n_fc; n++) {
Dav1dFrameContext *const f = &c->fc[n];
@ -546,6 +637,10 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
pthread_cond_destroy(&ts->tile_thread.cond);
pthread_mutex_destroy(&ts->tile_thread.lock);
}
if (f->lf.thread.inited) {
freep(&f->lf.thread.tasks);
pthread_cond_destroy(&f->lf.thread.cond);
}
dav1d_free_aligned(f->ts);
dav1d_free_aligned(f->tc);
dav1d_free_aligned(f->ipred_edge[0]);

third_party/dav1d/src/looprestoration.h

@ -46,29 +46,32 @@ typedef const pixel (*const_left_pixel_row)[4];
typedef const void *const_left_pixel_row;
#endif
// Although the spec applies restoration filters over 4x4 blocks, the wiener
// filter can be applied to a bigger surface.
typedef union LooprestorationParams {
ALIGN(int16_t filter[2][8], 16);
struct {
uint32_t s0, s1;
int16_t w0, w1;
} sgr;
} LooprestorationParams;
// Although the spec applies restoration filters over 4x4 blocks,
// they can be applied to a bigger surface.
// * w is constrained by the restoration unit size (w <= 256)
// * h is constrained by the stripe height (h <= 64)
#define decl_wiener_filter_fn(name) \
// The filter functions are allowed to do aligned writes past the right
// edge of the buffer, aligned up to the minimum loop restoration unit size
// (which is 32 pixels for subsampled chroma and 64 pixels for luma).
#define decl_lr_filter_fn(name) \
void (name)(pixel *dst, ptrdiff_t dst_stride, \
const_left_pixel_row left, \
const pixel *lpf, ptrdiff_t lpf_stride, \
int w, int h, const int16_t filter[2][8], \
int w, int h, const LooprestorationParams *params, \
enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
typedef decl_wiener_filter_fn(*wienerfilter_fn);
#define decl_selfguided_filter_fn(name) \
void (name)(pixel *dst, ptrdiff_t dst_stride, \
const_left_pixel_row left, \
const pixel *lpf, ptrdiff_t lpf_stride, \
int w, int h, int sgr_idx, const int16_t sgr_w[2], \
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
typedef decl_selfguided_filter_fn(*selfguided_fn);
typedef decl_lr_filter_fn(*looprestorationfilter_fn);
typedef struct Dav1dLoopRestorationDSPContext {
wienerfilter_fn wiener[2]; /* 7-tap, 5-tap */
selfguided_fn selfguided;
looprestorationfilter_fn wiener[2]; /* 7-tap, 5-tap */
looprestorationfilter_fn sgr[3]; /* 5x5, 3x3, mix */
} Dav1dLoopRestorationDSPContext;
bitfn_decls(void dav1d_loop_restoration_dsp_init, Dav1dLoopRestorationDSPContext *c, int bpc);
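The two old entry-point shapes (a 2x8 Wiener tap table vs. sgr_idx plus weights) are folded into one LooprestorationParams argument, so every loop restoration function shares a signature and can live in the wiener[]/sgr[] tables above. A self-contained sketch of the union idea (simplified: the 16-byte alignment attribute is dropped and the values are made up):

#include <stdint.h>
#include <stdio.h>

typedef union Params {                /* simplified stand-in for LooprestorationParams */
    int16_t filter[2][8];             /* Wiener taps: [0] horizontal, [1] vertical */
    struct { uint32_t s0, s1; int16_t w0, w1; } sgr;  /* SGR strengths and weights */
} Params;

int main(void) {
    const Params p = { .sgr = { .s0 = 25, .s1 = 0, .w0 = 31, .w1 = 97 } };
    printf("sizeof(Params) = %zu bytes (the 2x8 int16_t table dominates)\n", sizeof(p));
    printf("sgr: s0=%u s1=%u w0=%d w1=%d\n",
           (unsigned)p.sgr.s0, (unsigned)p.sgr.s1, p.sgr.w0, p.sgr.w1);
    return 0;
}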

third_party/dav1d/src/looprestoration_tmpl.c

@ -39,10 +39,10 @@
// TODO Reuse p when no padding is needed (add and remove lpf pixels in p)
// TODO Chroma only requires 2 rows of padding.
static void padding(pixel *dst, const pixel *p, const ptrdiff_t p_stride,
const pixel (*left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
int unit_w, const int stripe_h, const enum LrEdgeFlags edges)
static NOINLINE void
padding(pixel *dst, const pixel *p, const ptrdiff_t p_stride,
const pixel (*left)[4], const pixel *lpf, const ptrdiff_t lpf_stride,
int unit_w, const int stripe_h, const enum LrEdgeFlags edges)
{
const int have_left = !!(edges & LR_HAVE_LEFT);
const int have_right = !!(edges & LR_HAVE_RIGHT);
@ -135,7 +135,7 @@ static void wiener_c(pixel *p, const ptrdiff_t p_stride,
const pixel (*const left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h,
const int16_t filter[2][8],
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
// Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
@ -150,6 +150,7 @@ static void wiener_c(pixel *p, const ptrdiff_t p_stride,
uint16_t hor[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
uint16_t *hor_ptr = hor;
const int16_t (*const filter)[8] = params->filter;
const int bitdepth = bitdepth_from_max(bitdepth_max);
const int round_bits_h = 3 + (bitdepth == 12) * 2;
const int rounding_off_h = 1 << (round_bits_h - 1);
@ -347,12 +348,12 @@ static void boxsum5(int32_t *sumsq, coef *sum, const pixel *const src,
}
}
static void selfguided_filter(coef *dst, const pixel *src,
const ptrdiff_t src_stride, const int w,
const int h, const int n, const int s
HIGHBD_DECL_SUFFIX)
static NOINLINE void
selfguided_filter(coef *dst, const pixel *src, const ptrdiff_t src_stride,
const int w, const int h, const int n, const unsigned s
HIGHBD_DECL_SUFFIX)
{
const int sgr_one_by_x = n == 25 ? 164 : 455;
const unsigned sgr_one_by_x = n == 25 ? 164 : 455;
// Selfguided filter is applied to a maximum stripe height of 64 + 3 pixels
// of padding above and below
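The 164 and 455 constants above are the rounded Q12 reciprocals of the box area n used by the two self-guided passes (25 for the 5x5 pass, 9 for the 3x3 pass); a quick check:

#include <stdio.h>
int main(void) {
    printf("n=25: (4096 + 12) / 25 = %d\n", (4096 + 25 / 2) / 25);  /* 164 */
    printf("n= 9: (4096 +  4) /  9 = %d\n", (4096 +  9 / 2) /  9);  /* 455 */
    return 0;
}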
@ -446,71 +447,93 @@ static void selfguided_filter(coef *dst, const pixel *src,
#undef EIGHT_NEIGHBORS
}
static void selfguided_c(pixel *p, const ptrdiff_t p_stride,
const pixel (*const left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h, const int sgr_idx,
const int16_t sgr_w[2], const enum LrEdgeFlags edges
HIGHBD_DECL_SUFFIX)
static void sgr_5x5_c(pixel *p, const ptrdiff_t p_stride,
const pixel (*const left)[4], const pixel *lpf,
const ptrdiff_t lpf_stride, const int w, const int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
// Selfguided filter is applied to a maximum stripe height of 64 + 3 pixels
// of padding above and below
pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
// Selfguided filter outputs to a maximum stripe height of 64 and a
// maximum restoration width of 384 (256 * 1.5)
coef dst[64 * 384];
// both r1 and r0 can't be zero
if (!dav1d_sgr_params[sgr_idx][0]) {
const int s1 = dav1d_sgr_params[sgr_idx][3];
selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 9, s1 HIGHBD_TAIL_SUFFIX);
const int w1 = (1 << 7) - sgr_w[1];
for (int j = 0; j < h; j++) {
for (int i = 0; i < w; i++) {
const int u = (p[i] << 4);
const int v = (u << 7) + w1 * (dst[j * 384 + i] - u);
p[i] = iclip_pixel((v + (1 << 10)) >> 11);
}
p += PXSTRIDE(p_stride);
padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25,
params->sgr.s0 HIGHBD_TAIL_SUFFIX);
const int w0 = params->sgr.w0;
for (int j = 0; j < h; j++) {
for (int i = 0; i < w; i++) {
const int u = (p[i] << 4);
const int v = (u << 7) + w0 * (dst[j * 384 + i] - u);
p[i] = iclip_pixel((v + (1 << 10)) >> 11);
}
} else if (!dav1d_sgr_params[sgr_idx][1]) {
const int s0 = dav1d_sgr_params[sgr_idx][2];
selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25, s0 HIGHBD_TAIL_SUFFIX);
const int w0 = sgr_w[0];
for (int j = 0; j < h; j++) {
for (int i = 0; i < w; i++) {
const int u = (p[i] << 4);
const int v = (u << 7) + w0 * (dst[j * 384 + i] - u);
p[i] = iclip_pixel((v + (1 << 10)) >> 11);
}
p += PXSTRIDE(p_stride);
p += PXSTRIDE(p_stride);
}
}
static void sgr_3x3_c(pixel *p, const ptrdiff_t p_stride,
const pixel (*const left)[4], const pixel *lpf,
const ptrdiff_t lpf_stride, const int w, const int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
coef dst[64 * 384];
padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 9,
params->sgr.s1 HIGHBD_TAIL_SUFFIX);
const int w1 = params->sgr.w1;
for (int j = 0; j < h; j++) {
for (int i = 0; i < w; i++) {
const int u = (p[i] << 4);
const int v = (u << 7) + w1 * (dst[j * 384 + i] - u);
p[i] = iclip_pixel((v + (1 << 10)) >> 11);
}
} else {
coef dst1[64 * 384];
const int s0 = dav1d_sgr_params[sgr_idx][2];
const int s1 = dav1d_sgr_params[sgr_idx][3];
const int w0 = sgr_w[0];
const int w1 = (1 << 7) - w0 - sgr_w[1];
selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25, s0 HIGHBD_TAIL_SUFFIX);
selfguided_filter(dst1, tmp, REST_UNIT_STRIDE, w, h, 9, s1 HIGHBD_TAIL_SUFFIX);
for (int j = 0; j < h; j++) {
for (int i = 0; i < w; i++) {
const int u = (p[i] << 4);
const int v = (u << 7) + w0 * (dst[j * 384 + i] - u) +
w1 * (dst1[j * 384 + i] - u);
p[i] = iclip_pixel((v + (1 << 10)) >> 11);
}
p += PXSTRIDE(p_stride);
p += PXSTRIDE(p_stride);
}
}
static void sgr_mix_c(pixel *p, const ptrdiff_t p_stride,
const pixel (*const left)[4], const pixel *lpf,
const ptrdiff_t lpf_stride, const int w, const int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
coef dst0[64 * 384];
coef dst1[64 * 384];
padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
selfguided_filter(dst0, tmp, REST_UNIT_STRIDE, w, h, 25,
params->sgr.s0 HIGHBD_TAIL_SUFFIX);
selfguided_filter(dst1, tmp, REST_UNIT_STRIDE, w, h, 9,
params->sgr.s1 HIGHBD_TAIL_SUFFIX);
const int w0 = params->sgr.w0;
const int w1 = params->sgr.w1;
for (int j = 0; j < h; j++) {
for (int i = 0; i < w; i++) {
const int u = (p[i] << 4);
const int v = (u << 7) + w0 * (dst0[j * 384 + i] - u) +
w1 * (dst1[j * 384 + i] - u);
p[i] = iclip_pixel((v + (1 << 10)) >> 11);
}
p += PXSTRIDE(p_stride);
}
}
COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c, int bpc) {
c->wiener[0] = c->wiener[1] = wiener_c;
c->selfguided = selfguided_c;
c->sgr[0] = sgr_5x5_c;
c->sgr[1] = sgr_3x3_c;
c->sgr[2] = sgr_mix_c;
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM

third_party/dav1d/src/lr_apply_tmpl.c

@ -48,31 +48,32 @@ static void backup_lpf(const Dav1dFrameContext *const f,
const pixel *src, const ptrdiff_t src_stride,
const int ss_ver, const int sb128,
int row, const int row_h, const int src_w,
const int h, const int ss_hor)
const int h, const int ss_hor, const int pft)
{
const int dst_w = f->frame_hdr->super_res.enabled ?
(f->frame_hdr->width[1] + ss_hor) >> ss_hor : src_w;
// The first stripe of the frame is shorter by 8 luma pixel rows.
int stripe_h = (64 - 8 * !row) >> ss_ver;
if (row) {
const int top = 4 << sb128;
// Copy the top part of the stored loop filtered pixels from the
// previous sb row needed above the first stripe of this sb row.
pixel_copy(&dst[PXSTRIDE(dst_stride) * 0],
&dst[PXSTRIDE(dst_stride) * top], dst_w);
pixel_copy(&dst[PXSTRIDE(dst_stride) * 1],
&dst[PXSTRIDE(dst_stride) * (top + 1)], dst_w);
pixel_copy(&dst[PXSTRIDE(dst_stride) * 2],
&dst[PXSTRIDE(dst_stride) * (top + 2)], dst_w);
pixel_copy(&dst[PXSTRIDE(dst_stride) * 3],
&dst[PXSTRIDE(dst_stride) * (top + 3)], dst_w);
}
dst += 4 * PXSTRIDE(dst_stride);
src += (stripe_h - 2) * PXSTRIDE(src_stride);
if (!pft) {
if (row) {
const int top = 4 << sb128;
// Copy the top part of the stored loop filtered pixels from the
// previous sb row needed above the first stripe of this sb row.
pixel_copy(&dst[PXSTRIDE(dst_stride) * 0],
&dst[PXSTRIDE(dst_stride) * top], dst_w);
pixel_copy(&dst[PXSTRIDE(dst_stride) * 1],
&dst[PXSTRIDE(dst_stride) * (top + 1)], dst_w);
pixel_copy(&dst[PXSTRIDE(dst_stride) * 2],
&dst[PXSTRIDE(dst_stride) * (top + 2)], dst_w);
pixel_copy(&dst[PXSTRIDE(dst_stride) * 3],
&dst[PXSTRIDE(dst_stride) * (top + 3)], dst_w);
}
dst += 4 * PXSTRIDE(dst_stride);
}
if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
while (row + stripe_h <= row_h) {
const int n_lines = 4 - (row + stripe_h + 1 == h);
@ -107,9 +108,15 @@ static void backup_lpf(const Dav1dFrameContext *const f,
void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
/*const*/ pixel *const src[3], const int sby)
{
const int pft = f->c->n_pfc > 1;
const int offset = 8 * !!sby;
const ptrdiff_t *const src_stride = f->cur.stride;
const ptrdiff_t lr_stride = ((f->sr_cur.p.p.w + 31) & ~31) * sizeof(pixel);
pixel *const dst[3] = {
f->lf.lr_lpf_line[0] + pft * sby * (4 << f->seq_hdr->sb128) * PXSTRIDE(lr_stride),
f->lf.lr_lpf_line[1] + pft * sby * (4 << f->seq_hdr->sb128) * PXSTRIDE(lr_stride),
f->lf.lr_lpf_line[2] + pft * sby * (4 << f->seq_hdr->sb128) * PXSTRIDE(lr_stride)
};
// TODO Also check block level restore type to reduce copying.
const int restore_planes = f->lf.restore_planes;
@ -119,9 +126,9 @@ void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
const int w = f->bw << 2;
const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h - 1);
const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset;
backup_lpf(f, f->lf.lr_lpf_line[0], lr_stride,
backup_lpf(f, dst[0], lr_stride,
src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0],
0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0);
0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0, pft);
}
if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) {
const int ss_ver = f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
@ -130,18 +137,16 @@ void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
const int w = f->bw << (2 - ss_hor);
const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h - 1);
const int offset_uv = offset >> ss_ver;
const int y_stripe =
(sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
const int y_stripe = (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
if (restore_planes & LR_RESTORE_U) {
backup_lpf(f, f->lf.lr_lpf_line[1], lr_stride,
backup_lpf(f, dst[1], lr_stride,
src[1] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor);
ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor, pft);
}
if (restore_planes & LR_RESTORE_V) {
backup_lpf(f, f->lf.lr_lpf_line[2], lr_stride,
backup_lpf(f, dst[2], lr_stride,
src[2] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor);
ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor, pft);
}
}
}
@ -154,17 +159,18 @@ static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
const Dav1dDSPContext *const dsp = f->dsp;
const int chroma = !!plane;
const int ss_ver = chroma & (f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420);
const int sbrow_has_bottom = (edges & LR_HAVE_BOTTOM);
const pixel *lpf = f->lf.lr_lpf_line[plane] + x;
const ptrdiff_t p_stride = f->sr_cur.p.stride[chroma];
const ptrdiff_t lpf_stride = sizeof(pixel) * ((f->sr_cur.p.p.w + 31) & ~31);
const int sby = (y + (y ? 8 << ss_ver : 0)) >> (6 - ss_ver + f->seq_hdr->sb128);
const pixel *lpf = f->lf.lr_lpf_line[plane] + (f->c->n_pfc > 1) * (sby * (4 << f->seq_hdr->sb128) - 4) * PXSTRIDE(lpf_stride) + x;
// The first stripe of the frame is shorter by 8 luma pixel rows.
int stripe_h = imin((64 - 8 * !y) >> ss_ver, row_h - y);
ALIGN_STK_16(int16_t, filter, 2, [8]);
wienerfilter_fn wiener_fn = NULL;
looprestorationfilter_fn lr_fn;
LooprestorationParams params;
if (lr->type == DAV1D_RESTORATION_WIENER) {
int16_t (*const filter)[8] = params.filter;
filter[0][0] = filter[0][6] = lr->filter_h[0];
filter[0][1] = filter[0][5] = lr->filter_h[1];
filter[0][2] = filter[0][4] = lr->filter_h[2];
@ -180,25 +186,26 @@ static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
filter[1][2] = filter[1][4] = lr->filter_v[2];
filter[1][3] = 128 - (filter[1][0] + filter[1][1] + filter[1][2]) * 2;
wiener_fn = dsp->lr.wiener[!(filter[0][0] | filter[1][0])];
lr_fn = dsp->lr.wiener[!(filter[0][0] | filter[1][0])];
} else {
assert(lr->type == DAV1D_RESTORATION_SGRPROJ);
const uint16_t *const sgr_params = dav1d_sgr_params[lr->sgr_idx];
params.sgr.s0 = sgr_params[0];
params.sgr.s1 = sgr_params[1];
params.sgr.w0 = lr->sgr_weights[0];
params.sgr.w1 = 128 - (lr->sgr_weights[0] + lr->sgr_weights[1]);
lr_fn = dsp->lr.sgr[!!sgr_params[0] + !!sgr_params[1] * 2 - 1];
}
while (y + stripe_h <= row_h) {
// Change HAVE_BOTTOM bit in edges to (y + stripe_h != row_h)
edges ^= (-(y + stripe_h != row_h) ^ edges) & LR_HAVE_BOTTOM;
if (wiener_fn) {
wiener_fn(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
filter, edges HIGHBD_CALL_SUFFIX);
} else {
dsp->lr.selfguided(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
lr->sgr_idx, lr->sgr_weights, edges HIGHBD_CALL_SUFFIX);
}
// Change the HAVE_BOTTOM bit in edges to (sby + 1 != f->sbh || y + stripe_h != row_h)
edges ^= (-(sby + 1 != f->sbh || y + stripe_h != row_h) ^ edges) & LR_HAVE_BOTTOM;
lr_fn(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
&params, edges HIGHBD_CALL_SUFFIX);
left += stripe_h;
y += stripe_h;
if (y + stripe_h > row_h && sbrow_has_bottom) break;
p += stripe_h * PXSTRIDE(p_stride);
edges |= LR_HAVE_TOP;
stripe_h = imin(64 >> ss_ver, row_h - y);
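The sgr[] index expression above, !!sgr_params[0] + !!sgr_params[1] * 2 - 1, maps which passes have a non-zero strength onto the three new entry points, and relies on the AV1 SGR presets never disabling both passes at once. A small check with made-up strength values:

#include <stdio.h>
int main(void) {
    const unsigned s[3][2] = { { 70, 0 }, { 0, 3236 }, { 140, 3236 } };  /* made-up s0/s1 */
    const char *const name[3] = { "sgr_5x5", "sgr_3x3", "sgr_mix" };
    for (int i = 0; i < 3; i++) {
        const int idx = !!s[i][0] + !!s[i][1] * 2 - 1;
        printf("s0=%-4u s1=%-4u -> sgr[%d] (%s)\n", s[i][0], s[i][1], idx, name[idx]);
    }
    return 0;
}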
@ -242,8 +249,7 @@ static void lr_sbrow(const Dav1dFrameContext *const f, pixel *p, const int y,
pixel pre_lr_border[2][128 + 8 /* maximum sbrow height is 128 + 8 rows offset */][4];
const Av1RestorationUnit *lr[2];
enum LrEdgeFlags edges = (y > 0 ? LR_HAVE_TOP : 0) | LR_HAVE_RIGHT |
(row_h < h ? LR_HAVE_BOTTOM : 0);
enum LrEdgeFlags edges = (y > 0 ? LR_HAVE_TOP : 0) | LR_HAVE_RIGHT;
int aligned_unit_pos = row_y & ~(unit_size - 1);
if (aligned_unit_pos && aligned_unit_pos + half_unit_size > h)
@ -281,11 +287,13 @@ void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3],
const int offset_y = 8 * !!sby;
const ptrdiff_t *const dst_stride = f->sr_cur.p.stride;
const int restore_planes = f->lf.restore_planes;
const int not_last = sby + 1 < f->sbh;
if (restore_planes & LR_RESTORE_Y) {
const int h = f->sr_cur.p.p.h;
const int w = f->sr_cur.p.p.w;
const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h);
const int next_row_y = (sby + 1) << (6 + f->seq_hdr->sb128);
const int row_h = imin(next_row_y - 8 * not_last, h);
const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset_y;
lr_sbrow(f, dst[0] - offset_y * PXSTRIDE(dst_stride[0]), y_stripe, w,
h, row_h, 0);
@ -295,10 +303,10 @@ void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3],
const int ss_hor = f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int h = (f->sr_cur.p.p.h + ss_ver) >> ss_ver;
const int w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h);
const int next_row_y = (sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128);
const int row_h = imin(next_row_y - (8 >> ss_ver) * not_last, h);
const int offset_uv = offset_y >> ss_ver;
const int y_stripe =
(sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
const int y_stripe = (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
if (restore_planes & LR_RESTORE_U)
lr_sbrow(f, dst[1] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe,
w, h, row_h, 1);

third_party/dav1d/src/mc_tmpl.c

@ -87,9 +87,15 @@ prep_c(int16_t *tmp, const pixel *src, const ptrdiff_t src_stride,
#define DAV1D_FILTER_8TAP_RND(src, x, F, stride, sh) \
((FILTER_8TAP(src, x, F, stride) + ((1 << (sh)) >> 1)) >> (sh))
#define DAV1D_FILTER_8TAP_RND2(src, x, F, stride, rnd, sh) \
((FILTER_8TAP(src, x, F, stride) + (rnd)) >> (sh))
#define DAV1D_FILTER_8TAP_CLIP(src, x, F, stride, sh) \
iclip_pixel(DAV1D_FILTER_8TAP_RND(src, x, F, stride, sh))
#define DAV1D_FILTER_8TAP_CLIP2(src, x, F, stride, rnd, sh) \
iclip_pixel(DAV1D_FILTER_8TAP_RND2(src, x, F, stride, rnd, sh))
#define GET_H_FILTER(mx) \
const int8_t *const fh = !(mx) ? NULL : w > 4 ? \
dav1d_mc_subpel_filters[filter_type & 3][(mx) - 1] : \
@ -111,7 +117,7 @@ put_8tap_c(pixel *dst, ptrdiff_t dst_stride,
const int filter_type HIGHBD_DECL_SUFFIX)
{
const int intermediate_bits = get_intermediate_bits(bitdepth_max);
const int intermediate_rnd = (1 << intermediate_bits) >> 1;
const int intermediate_rnd = 32 + ((1 << (6 - intermediate_bits)) >> 1);
GET_FILTERS();
dst_stride = PXSTRIDE(dst_stride);
@ -144,9 +150,8 @@ put_8tap_c(pixel *dst, ptrdiff_t dst_stride,
} else {
do {
for (int x = 0; x < w; x++) {
const int px = DAV1D_FILTER_8TAP_RND(src, x, fh, 1,
6 - intermediate_bits);
dst[x] = iclip_pixel((px + intermediate_rnd) >> intermediate_bits);
dst[x] = DAV1D_FILTER_8TAP_CLIP2(src, x, fh, 1,
intermediate_rnd, 6);
}
dst += dst_stride;
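The new intermediate_rnd folds the old two rounding steps (round down to intermediate precision, then round again when shifting back to pixel range) into a single add-and-shift by 6 for the horizontal-only path. A numeric check under the 8 bpc assumption (intermediate_bits == 4):

#include <stdio.h>
int main(void) {
    const int intermediate_bits = 4;                       /* 8 bpc */
    const int intermediate_rnd = 32 + ((1 << (6 - intermediate_bits)) >> 1);
    int mismatches = 0;
    for (int f = 0; f < (1 << 14); f++) {                  /* f stands in for FILTER_8TAP() */
        const int px      = (f + ((1 << (6 - intermediate_bits)) >> 1)) >> (6 - intermediate_bits);
        const int old_out = (px + ((1 << intermediate_bits) >> 1)) >> intermediate_bits;
        const int new_out = (f + intermediate_rnd) >> 6;
        mismatches += old_out != new_out;
    }
    printf("intermediate_rnd = %d, mismatches = %d\n", intermediate_rnd, mismatches);
    return 0;
}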

third_party/dav1d/src/meson.build

@ -132,6 +132,8 @@ if is_asm_enabled
endif
elif host_machine.cpu_family().startswith('arm')
libdav1d_sources_asm = files(
# itx.S is used for both 8 and 16 bpc.
'arm/32/itx.S',
'arm/32/looprestoration_common.S',
'arm/32/msac.S',
)
@ -140,7 +142,6 @@ if is_asm_enabled
libdav1d_sources_asm += files(
'arm/32/cdef.S',
'arm/32/ipred.S',
'arm/32/itx.S',
'arm/32/loopfilter.S',
'arm/32/looprestoration.S',
'arm/32/mc.S',
@ -150,6 +151,8 @@ if is_asm_enabled
if dav1d_bitdepths.contains('16')
libdav1d_sources_asm += files(
'arm/32/cdef16.S',
'arm/32/ipred16.S',
'arm/32/itx16.S',
'arm/32/loopfilter16.S',
'arm/32/looprestoration16.S',
'arm/32/mc16.S',
@ -183,20 +186,20 @@ if is_asm_enabled
libdav1d_sources_asm = files(
'x86/cpuid.asm',
'x86/msac.asm',
'x86/cdef_avx2.asm',
'x86/cdef_sse.asm',
)
if dav1d_bitdepths.contains('8')
libdav1d_sources_asm += files(
'x86/cdef_avx512.asm',
'x86/mc_avx512.asm',
'x86/cdef_avx2.asm',
'x86/mc_avx2.asm',
'x86/film_grain.asm',
'x86/ipred.asm',
'x86/itx.asm',
'x86/loopfilter.asm',
'x86/looprestoration.asm',
'x86/cdef_sse.asm',
'x86/film_grain_ssse3.asm',
'x86/ipred_ssse3.asm',
'x86/itx_ssse3.asm',
@ -208,6 +211,9 @@ if is_asm_enabled
if dav1d_bitdepths.contains('16')
libdav1d_sources_asm += files(
'x86/cdef16_avx2.asm',
'x86/cdef16_sse.asm',
'x86/looprestoration16_avx2.asm',
)
endif

third_party/dav1d/src/obu.c

@ -1,5 +1,5 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018-2021, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
@ -33,6 +33,7 @@
#include "dav1d/data.h"
#include "common/frame.h"
#include "common/intops.h"
#include "src/decode.h"
@ -406,7 +407,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
else
hdr->force_integer_mv = 0;
if (!(hdr->frame_type & 1))
if (IS_KEY_OR_INTRA(hdr))
hdr->force_integer_mv = 1;
if (seqhdr->frame_id_numbers_present)
@ -420,7 +421,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
#endif
hdr->frame_offset = seqhdr->order_hint ?
dav1d_get_bits(gb, seqhdr->order_hint_n_bits) : 0;
hdr->primary_ref_frame = !hdr->error_resilient_mode && hdr->frame_type & 1 ?
hdr->primary_ref_frame = !hdr->error_resilient_mode && IS_INTER_OR_SWITCH(hdr) ?
dav1d_get_bits(gb, 3) : DAV1D_PRIMARY_REF_NONE;
if (seqhdr->decoder_model_info_present) {
@ -439,9 +440,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
}
}
if (hdr->frame_type == DAV1D_FRAME_TYPE_KEY ||
hdr->frame_type == DAV1D_FRAME_TYPE_INTRA)
{
if (IS_KEY_OR_INTRA(hdr)) {
hdr->refresh_frame_flags = (hdr->frame_type == DAV1D_FRAME_TYPE_KEY &&
hdr->show_frame) ? 0xff : dav1d_get_bits(gb, 8);
if (hdr->refresh_frame_flags != 0xff && hdr->error_resilient_mode && seqhdr->order_hint)
@ -569,7 +568,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
hdr->switchable_motion_mode = dav1d_get_bits(gb, 1);
hdr->use_ref_frame_mvs = !hdr->error_resilient_mode &&
seqhdr->ref_frame_mvs && seqhdr->order_hint &&
hdr->frame_type & 1 && dav1d_get_bits(gb, 1);
IS_INTER_OR_SWITCH(hdr) && dav1d_get_bits(gb, 1);
}
#if DEBUG_FRAME_HDR
printf("HDR: post-frametype-specific-bits: off=%td\n",
@ -916,13 +915,13 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
printf("HDR: post-txfmmode: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
hdr->switchable_comp_refs = hdr->frame_type & 1 ? dav1d_get_bits(gb, 1) : 0;
hdr->switchable_comp_refs = IS_INTER_OR_SWITCH(hdr) ? dav1d_get_bits(gb, 1) : 0;
#if DEBUG_FRAME_HDR
printf("HDR: post-refmode: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
hdr->skip_mode_allowed = 0;
if (hdr->switchable_comp_refs && hdr->frame_type & 1 && seqhdr->order_hint) {
if (hdr->switchable_comp_refs && IS_INTER_OR_SWITCH(hdr) && seqhdr->order_hint) {
const unsigned poc = hdr->frame_offset;
unsigned off_before = 0xFFFFFFFFU;
int off_after = -1;
@ -982,7 +981,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
printf("HDR: post-extskip: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
hdr->warp_motion = !hdr->error_resilient_mode && hdr->frame_type & 1 &&
hdr->warp_motion = !hdr->error_resilient_mode && IS_INTER_OR_SWITCH(hdr) &&
seqhdr->warped_motion && dav1d_get_bits(gb, 1);
#if DEBUG_FRAME_HDR
printf("HDR: post-warpmotionbit: off=%td\n",
@ -997,7 +996,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
for (int i = 0; i < 7; i++)
hdr->gmv[i] = dav1d_default_wm_params;
if (hdr->frame_type & 1) {
if (IS_INTER_OR_SWITCH(hdr)) {
for (int i = 0; i < 7; i++) {
hdr->gmv[i].type = !dav1d_get_bits(gb, 1) ? DAV1D_WM_TYPE_IDENTITY :
dav1d_get_bits(gb, 1) ? DAV1D_WM_TYPE_ROT_ZOOM :


@ -299,7 +299,6 @@ static inline void padding(uint8_t *dst, const uint8_t *p,
}
}
// FIXME Could split into luma and chroma specific functions,
// (since first and last tops are always 0 for chroma)
// FIXME Could implement a version that requires less temporary memory
@ -309,9 +308,11 @@ static void wiener_filter_vsx(uint8_t *p, const ptrdiff_t p_stride,
const uint8_t *lpf,
const ptrdiff_t lpf_stride,
const int w, const int h,
const int16_t filter[2][8],
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
const int16_t (*const filter)[8] = params->filter;
// Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
// of padding above and below
ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
@ -320,7 +321,6 @@ static void wiener_filter_vsx(uint8_t *p, const ptrdiff_t p_stride,
wiener_filter_h_vsx(hor, tmp, filter[0], w, h);
wiener_filter_v_vsx(p, p_stride, hor, filter[1], w, h);
}
#endif

third_party/dav1d/src/qm.c

@ -3066,7 +3066,6 @@ static const uint8_t qm_tbl_32x32_t[][2][528] = {
};
const uint8_t *dav1d_qm_tbl[16][2][N_RECT_TX_SIZES];
static uint8_t pb_32x32[32 * 32];
static uint8_t qm_tbl_4x4[15][2][16];
static uint8_t qm_tbl_4x8[15][2][32];
static uint8_t qm_tbl_4x16[15][2][64];
@ -3145,8 +3144,5 @@ COLD void dav1d_init_qm_tables(void) {
dav1d_qm_tbl[i][j][RTX_16X64] = dav1d_qm_tbl[i][j][RTX_16X32];
}
memset(pb_32x32, 32, sizeof(pb_32x32));
for (int j = 0; j < 2; j++)
for (int k = 0; k < N_RECT_TX_SIZES; k++)
dav1d_qm_tbl[15][j][k] = pb_32x32;
// dav1d_qm_tbl[15][*][*] == NULL
}

third_party/dav1d/src/recon.h

@ -65,6 +65,14 @@ decl_recon_b_inter_fn(dav1d_recon_b_inter_16bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_8bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_16bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_8bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_16bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_cdef_8bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_cdef_16bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_resize_8bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_resize_16bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_lr_8bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_lr_16bpc);
decl_backup_ipred_edge_fn(dav1d_backup_ipred_edge_8bpc);
decl_backup_ipred_edge_fn(dav1d_backup_ipred_edge_16bpc);

third_party/dav1d/src/recon_tmpl.c

@ -1,5 +1,5 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018-2021, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
@ -33,6 +33,7 @@
#include "common/attributes.h"
#include "common/bitdepth.h"
#include "common/dump.h"
#include "common/frame.h"
#include "common/intops.h"
#include "src/cdef_apply.h"
@ -438,34 +439,39 @@ static int decode_coefs(Dav1dTileContext *const t,
} else {
eob = eob_bin;
}
assert(eob >= 0);
// base tokens
uint16_t (*const eob_cdf)[4] = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma];
uint16_t (*const hi_cdf)[4] = ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma];
const uint16_t *const scan = dav1d_scans[tx][tx_class];
int dc_tok;
unsigned rc, dc_tok;
if (eob) {
uint16_t (*const lo_cdf)[4] = ts->cdf.coef.base_tok[t_dim->ctx][chroma];
uint8_t *const levels = t->scratch.levels; // bits 0-5: tok, 6-7: lo_tok
const int sw = imin(t_dim->w, 8), sh = imin(t_dim->h, 8);
const unsigned shift = 2 + imin(t_dim->lh, 3), mask = 4 * sh - 1;
/* eob */
unsigned rc = scan[eob], x = rc >> shift, y = rc & mask;
unsigned ctx = 1 + (eob > sw * sh * 2) + (eob > sw * sh * 4);
int eob_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[ctx], 2);
int tok = eob_tok + 1;
int level_tok = tok * 0x41;
unsigned mag;
if (dbg)
printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n",
t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng);
#define DECODE_COEFS_CLASS(tx_class) \
unsigned x, y; \
if (tx_class == TX_CLASS_2D) \
rc = scan[eob], x = rc >> shift, y = rc & mask; \
else if (tx_class == TX_CLASS_H) \
/* Transposing reduces the stride and padding requirements */ \
x = eob & mask, y = eob >> shift, rc = eob; \
else /* tx_class == TX_CLASS_V */ \
x = eob & mask, y = eob >> shift, rc = (x << shift2) | y; \
if (dbg) \
printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng); \
if (eob_tok == 2) { \
ctx = (tx_class == TX_CLASS_2D ? (x | y) > 1 : \
tx_class == TX_CLASS_H ? x != 0 : y != 0) ? 14 : 7; \
ctx = (tx_class == TX_CLASS_2D ? (x | y) > 1 : y != 0) ? 14 : 7; \
tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
level_tok = tok + (3 << 6); \
if (dbg) \
@ -473,40 +479,46 @@ static int decode_coefs(Dav1dTileContext *const t,
imin(t_dim->ctx, 3), chroma, ctx, eob, rc, tok, \
ts->msac.rng); \
} \
cf[rc] = tok; \
if (tx_class == TX_CLASS_H) \
/* Transposing reduces the stride and padding requirements */ \
levels[y * stride + x] = (uint8_t) level_tok; \
else \
levels[x * stride + y] = (uint8_t) level_tok; \
cf[rc] = tok << 11; \
levels[x * stride + y] = (uint8_t) level_tok; \
for (int i = eob - 1; i > 0; i--) { /* ac */ \
if (tx_class == TX_CLASS_H) \
rc = i, x = rc & mask, y = rc >> shift; \
else \
rc = scan[i], x = rc >> shift, y = rc & mask; \
unsigned rc_i; \
if (tx_class == TX_CLASS_2D) \
rc_i = scan[i], x = rc_i >> shift, y = rc_i & mask; \
else if (tx_class == TX_CLASS_H) \
x = i & mask, y = i >> shift, rc_i = i; \
else /* tx_class == TX_CLASS_V */ \
x = i & mask, y = i >> shift, rc_i = (x << shift2) | y; \
assert(x < 32 && y < 32); \
uint8_t *const level = levels + x * stride + y; \
ctx = get_lo_ctx(level, tx_class, &mag, lo_ctx_offsets, x, y, stride); \
if (tx_class == TX_CLASS_2D) \
y |= x; \
tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
level_tok = tok * 0x41; \
if (dbg) \
printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
t_dim->ctx, chroma, ctx, i, rc, tok, ts->msac.rng); \
t_dim->ctx, chroma, ctx, i, rc_i, tok, ts->msac.rng); \
if (tok == 3) { \
mag &= 63; \
ctx = (y > (tx_class == TX_CLASS_2D) ? 14 : 7) + \
(mag > 12 ? 6 : (mag + 1) >> 1); \
tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
level_tok = tok + (3 << 6); \
if (dbg) \
printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
imin(t_dim->ctx, 3), chroma, ctx, i, rc, tok, \
imin(t_dim->ctx, 3), chroma, ctx, i, rc_i, tok, \
ts->msac.rng); \
*level = (uint8_t) (tok + (3 << 6)); \
cf[rc_i] = (tok << 11) | rc; \
rc = rc_i; \
} else { \
/* 0x1 for tok, 0x7ff as bitmask for rc, 0x41 for level_tok */ \
tok *= 0x17ff41; \
*level = (uint8_t) tok; \
/* tok ? (tok << 11) | rc : 0 */ \
tok = (tok >> 9) & (rc + ~0x7ffu); \
if (tok) rc = rc_i; \
cf[rc_i] = tok; \
} \
cf[rc] = tok; \
*level = (uint8_t) level_tok; \
} \
/* dc */ \
ctx = (tx_class == TX_CLASS_2D) ? 0 : \
@ -528,27 +540,35 @@ static int decode_coefs(Dav1dTileContext *const t,
} \
break
const uint16_t *scan;
switch (tx_class) {
case TX_CLASS_2D: {
const unsigned nonsquare_tx = tx >= RTX_4X8;
const uint8_t (*const lo_ctx_offsets)[5] =
dav1d_lo_ctx_offsets[nonsquare_tx + (tx & nonsquare_tx)];
scan = dav1d_scans[tx];
const ptrdiff_t stride = 4 * sh;
const unsigned shift = t_dim->lh < 4 ? t_dim->lh + 2 : 5, shift2 = 0;
const unsigned mask = 4 * sh - 1;
memset(levels, 0, stride * (4 * sw + 2));
DECODE_COEFS_CLASS(TX_CLASS_2D);
}
case TX_CLASS_H: {
#define lo_ctx_offsets NULL
const uint8_t (*const lo_ctx_offsets)[5] = NULL;
const ptrdiff_t stride = 16;
const unsigned shift = t_dim->lh + 2, shift2 = 0;
const unsigned mask = 4 * sh - 1;
memset(levels, 0, stride * (4 * sh + 2));
DECODE_COEFS_CLASS(TX_CLASS_H);
}
case TX_CLASS_V: {
const uint8_t (*const lo_ctx_offsets)[5] = NULL;
const ptrdiff_t stride = 16;
const unsigned shift = t_dim->lw + 2, shift2 = t_dim->lh + 2;
const unsigned mask = 4 * sw - 1;
memset(levels, 0, stride * (4 * sw + 2));
DECODE_COEFS_CLASS(TX_CLASS_V);
}
#undef lo_ctx_offsets
#undef DECODE_COEFS_CLASS
default: assert(0);
}
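During this first pass each nonzero coefficient slot now carries the token in its high bits and, in the low bits, the scan position of the next nonzero coefficient, so the dequant loop further down can hop from one nonzero entry to the next instead of re-walking the scan. A self-contained toy version of that packing (simplified masks, made-up positions and tokens):

#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint32_t cf[32] = { 0 };
    /* hypothetical nonzero AC coefficients, visited from the eob backwards */
    const unsigned pos[4] = { 17, 9, 5, 1 };
    const unsigned tok[4] = {  1, 3, 2, 4 };
    unsigned rc = 0;                       /* link carried over from the previous write */
    for (int i = 0; i < 4; i++) {
        cf[pos[i]] = (tok[i] << 11) | rc;  /* token in the high bits, link below */
        rc = pos[i];
    }
    /* dequant-style walk: follow the links through the nonzero positions only */
    while (rc) {
        const uint32_t packed = cf[rc];
        printf("rc=%2u tok=%u\n", rc, (unsigned)(packed >> 11));
        rc = packed & 0x3ff;               /* 0 terminates the chain */
    }
    return 0;
}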
@ -564,71 +584,137 @@ static int decode_coefs(Dav1dTileContext *const t,
printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n",
imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng);
}
rc = 0;
}
// residual and sign
int dc_sign = 1 << 6;
const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
const uint8_t *const qm_tbl = f->qm[lossless || is_1d || *txtp == IDTX][tx][plane];
const uint8_t *const qm_tbl = *txtp < IDTX ? f->qm[tx][plane] : NULL;
const int dq_shift = imax(0, t_dim->ctx - 2);
const int bitdepth = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
const int cf_max = (1 << (7 + bitdepth)) - 1;
unsigned cul_level = 0;
const unsigned cf_max = ~(~127U << (BITDEPTH == 8 ? 8 : f->cur.p.bpc));
unsigned cul_level, dc_sign_level;
if (dc_tok) { // dc
const int dc_sign_ctx = get_dc_sign_ctx(tx, a, l);
uint16_t *const dc_sign_cdf =
ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
const unsigned dq = (dq_tbl[0] * qm_tbl[0] + 16) >> 5;
if (dbg)
printf("Post-dc_sign[%d][%d][%d]: r=%d\n",
chroma, dc_sign_ctx, sign, ts->msac.rng);
dc_sign = (sign - 1) & (2 << 6);
if (!dc_tok) {
cul_level = 0;
dc_sign_level = 1 << 6;
if (qm_tbl) goto ac_qm;
goto ac_noqm;
}
const int dc_sign_ctx = get_dc_sign_ctx(tx, a, l);
uint16_t *const dc_sign_cdf = ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
const int dc_sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
if (dbg)
printf("Post-dc_sign[%d][%d][%d]: r=%d\n",
chroma, dc_sign_ctx, dc_sign, ts->msac.rng);
unsigned dc_dq = dq_tbl[0];
dc_sign_level = (dc_sign - 1) & (2 << 6);
if (qm_tbl) {
dc_dq = (dc_dq * qm_tbl[0] + 16) >> 5;
if (dc_tok == 15) {
dc_tok += read_golomb(&ts->msac);
dc_tok = read_golomb(&ts->msac) + 15;
if (dbg)
printf("Post-dc_residual[%d->%d]: r=%d\n",
dc_tok - 15, dc_tok, ts->msac.rng);
dc_tok &= 0xfffff;
dc_dq = (dc_dq * dc_tok) & 0xffffff;
} else {
dc_dq *= dc_tok;
assert(dc_dq <= 0xffffff);
}
cul_level = dc_tok;
dc_dq >>= dq_shift;
cf[0] = (coef) (umin(dc_dq - dc_sign, cf_max) ^ -dc_sign);
cul_level += dc_tok;
dc_tok = ((dq * dc_tok) & 0xffffff) >> dq_shift;
cf[0] = imin(dc_tok - sign, cf_max) ^ -sign;
}
for (int i = 1; i <= eob; i++) { // ac
const int rc = scan[i];
int tok = cf[rc];
if (!tok) continue;
if (rc) ac_qm: {
const unsigned ac_dq = dq_tbl[1];
do {
const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
if (dbg)
printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
const unsigned rc_tok = cf[rc];
unsigned tok, dq = (ac_dq * qm_tbl[rc] + 16) >> 5;
// sign
const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
const unsigned dq = (dq_tbl[1] * qm_tbl[rc] + 16) >> 5;
if (dbg)
printf("Post-sign[%d=%d=%d]: r=%d\n", i, rc, sign, ts->msac.rng);
if (rc_tok >= (15 << 11)) {
tok = read_golomb(&ts->msac) + 15;
if (dbg)
printf("Post-residual[%d=%d->%d]: r=%d\n",
rc, tok - 15, tok, ts->msac.rng);
// residual
if (tok == 15) {
tok += read_golomb(&ts->msac);
tok &= 0xfffff;
dq = (dq * tok) & 0xffffff;
} else {
tok = rc_tok >> 11;
dq *= tok;
assert(dq <= 0xffffff);
}
cul_level += tok;
dq >>= dq_shift;
cf[rc] = (coef) (umin(dq - sign, cf_max) ^ -sign);
rc = rc_tok & 0x3ff;
} while (rc);
}
} else {
// non-qmatrix is the common case and allows for additional optimizations
if (dc_tok == 15) {
dc_tok = read_golomb(&ts->msac) + 15;
if (dbg)
printf("Post-residual[%d=%d=%d->%d]: r=%d\n",
i, rc, tok - 15, tok, ts->msac.rng);
printf("Post-dc_residual[%d->%d]: r=%d\n",
dc_tok - 15, dc_tok, ts->msac.rng);
// coefficient parsing, see 5.11.39
tok &= 0xfffff;
dc_tok &= 0xfffff;
dc_dq = ((dc_dq * dc_tok) & 0xffffff) >> dq_shift;
dc_dq = umin(dc_dq - dc_sign, cf_max);
} else {
dc_dq = ((dc_dq * dc_tok) >> dq_shift) - dc_sign;
assert(dc_dq <= cf_max);
}
cul_level = dc_tok;
cf[0] = (coef) (dc_dq ^ -dc_sign);
// dequant, see 7.12.3
cul_level += tok;
tok = ((dq * tok) & 0xffffff) >> dq_shift;
cf[rc] = imin(tok - sign, cf_max) ^ -sign;
if (rc) ac_noqm: {
const unsigned ac_dq = dq_tbl[1];
do {
const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
if (dbg)
printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
const unsigned rc_tok = cf[rc];
unsigned tok, dq;
// residual
if (rc_tok >= (15 << 11)) {
tok = read_golomb(&ts->msac) + 15;
if (dbg)
printf("Post-residual[%d=%d->%d]: r=%d\n",
rc, tok - 15, tok, ts->msac.rng);
// coefficient parsing, see 5.11.39
tok &= 0xfffff;
// dequant, see 7.12.3
dq = ((ac_dq * tok) & 0xffffff) >> dq_shift;
dq = umin(dq - sign, cf_max);
} else {
// cannot exceed cf_max, so we can avoid the clipping
tok = rc_tok >> 11;
dq = ((ac_dq * tok) >> dq_shift) - sign;
assert(dq <= cf_max);
}
cul_level += tok;
cf[rc] = (coef) (dq ^ -sign);
rc = rc_tok & 0x3ff; // next non-zero rc, zero if eob
} while (rc);
}
}
// context
*res_ctx = umin(cul_level, 63) | dc_sign;
*res_ctx = umin(cul_level, 63) | dc_sign_level;
return eob;
}
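A quick check (illustration only) that the bit-trick form of cf_max above matches the old (1 << (7 + bitdepth)) - 1 for the supported bitdepths:

#include <stdio.h>
int main(void) {
    for (int bitdepth = 8; bitdepth <= 12; bitdepth += 2) {
        const unsigned a = ~(~127U << bitdepth);
        const unsigned b = (1u << (7 + bitdepth)) - 1;
        printf("bitdepth %2d: %6u %s %6u\n", bitdepth, a, a == b ? "==" : "!=", b);
    }
    return 0;
}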
@ -1544,7 +1630,7 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
const ptrdiff_t uvdstoff =
4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
if (!(f->frame_hdr->frame_type & 1)) {
if (IS_KEY_OR_INTRA(f->frame_hdr)) {
// intrabc
assert(!f->frame_hdr->super_res.enabled);
res = mc(t, dst, NULL, f->cur.stride[0], bw4, bh4, t->bx, t->by, 0,
@ -1965,74 +2051,107 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
return 0;
}
void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
const int sbsz = f->sb_step, sbh = f->sbh;
if (f->frame_hdr->loopfilter.level_y[0] ||
f->frame_hdr->loopfilter.level_y[1])
{
void bytefn(dav1d_filter_sbrow_deblock)(Dav1dFrameContext*const f, const int sby) {
const int y = sby * f->sb_step * 4;
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
pixel *const p[3] = {
f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
};
Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
if (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1]) {
int start_of_tile_row = 0;
if (f->frame_hdr->tiling.row_start_sb[f->lf.tile_row] == sby)
start_of_tile_row = f->lf.tile_row++;
bytefn(dav1d_loopfilter_sbrow)(f, f->lf.p, f->lf.mask_ptr, sby,
start_of_tile_row);
bytefn(dav1d_loopfilter_sbrow)(f, p, mask, sby, start_of_tile_row);
}
if (f->lf.restore_planes) {
// Store loop filtered pixels required by loop restoration
bytefn(dav1d_lr_copy_lpf)(f, f->lf.p, sby);
}
if (f->seq_hdr->cdef) {
if (sby) {
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
pixel *p_up[3] = {
f->lf.p[0] - 8 * PXSTRIDE(f->cur.stride[0]),
f->lf.p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
f->lf.p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
};
bytefn(dav1d_cdef_brow)(f, p_up, f->lf.prev_mask_ptr,
sby * sbsz - 2, sby * sbsz);
}
const int n_blks = sbsz - 2 * (sby + 1 < sbh);
bytefn(dav1d_cdef_brow)(f, f->lf.p, f->lf.mask_ptr, sby * sbsz,
imin(sby * sbsz + n_blks, f->bh));
}
if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) {
const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int h_start = 8 * !!sby >> ss_ver;
const ptrdiff_t dst_stride = f->sr_cur.p.stride[!!pl];
pixel *dst = f->lf.sr_p[pl] - h_start * PXSTRIDE(dst_stride);
const ptrdiff_t src_stride = f->cur.stride[!!pl];
const pixel *src = f->lf.p[pl] - h_start * PXSTRIDE(src_stride);
const int h_end = 4 * (sbsz - 2 * (sby + 1 < sbh)) >> ss_ver;
const int ss_hor = pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
const int src_w = (4 * f->bw + ss_hor) >> ss_hor;
const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver;
f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w,
imin(img_h, h_end) + h_start, src_w,
f->resize_step[!!pl], f->resize_start[!!pl]
HIGHBD_CALL_SUFFIX);
}
}
if (f->lf.restore_planes) {
bytefn(dav1d_lr_sbrow)(f, f->lf.sr_p, sby);
bytefn(dav1d_lr_copy_lpf)(f, p, sby);
}
}
void bytefn(dav1d_filter_sbrow_cdef)(Dav1dFrameContext *const f, const int sby) {
const int sbsz = f->sb_step;
const int y = sby * sbsz * 4;
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
f->lf.p[0] += sbsz * 4 * PXSTRIDE(f->cur.stride[0]);
f->lf.p[1] += sbsz * 4 * PXSTRIDE(f->cur.stride[1]) >> ss_ver;
f->lf.p[2] += sbsz * 4 * PXSTRIDE(f->cur.stride[1]) >> ss_ver;
f->lf.sr_p[0] += sbsz * 4 * PXSTRIDE(f->sr_cur.p.stride[0]);
f->lf.sr_p[1] += sbsz * 4 * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver;
f->lf.sr_p[2] += sbsz * 4 * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver;
f->lf.prev_mask_ptr = f->lf.mask_ptr;
if ((sby & 1) || f->seq_hdr->sb128) {
f->lf.mask_ptr += f->sb128w;
pixel *const p[3] = {
f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
};
Av1Filter *prev_mask = f->lf.mask + ((sby - 1) >> !f->seq_hdr->sb128) * f->sb128w;
Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
const int start = sby * sbsz;
if (sby) {
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
pixel *p_up[3] = {
p[0] - 8 * PXSTRIDE(f->cur.stride[0]),
p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
};
bytefn(dav1d_cdef_brow)(f, p_up, prev_mask, start - 2, start);
}
const int n_blks = sbsz - 2 * (sby + 1 < f->sbh);
const int end = imin(start + n_blks, f->bh);
bytefn(dav1d_cdef_brow)(f, p, mask, start, end);
}
void bytefn(dav1d_filter_sbrow_resize)(Dav1dFrameContext *const f, const int sby) {
const int sbsz = f->sb_step;
const int y = sby * sbsz * 4;
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const pixel *const p[3] = {
f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
};
pixel *const sr_p[3] = {
f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
};
const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) {
const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int h_start = 8 * !!sby >> ss_ver;
const ptrdiff_t dst_stride = f->sr_cur.p.stride[!!pl];
pixel *dst = sr_p[pl] - h_start * PXSTRIDE(dst_stride);
const ptrdiff_t src_stride = f->cur.stride[!!pl];
const pixel *src = p[pl] - h_start * PXSTRIDE(src_stride);
const int h_end = 4 * (sbsz - 2 * (sby + 1 < f->sbh)) >> ss_ver;
const int ss_hor = pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
const int src_w = (4 * f->bw + ss_hor) >> ss_hor;
const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver;
f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w,
imin(img_h, h_end) + h_start, src_w,
f->resize_step[!!pl], f->resize_start[!!pl]
HIGHBD_CALL_SUFFIX);
}
}
void bytefn(dav1d_filter_sbrow_lr)(Dav1dFrameContext *const f, const int sby) {
const int y = sby * f->sb_step * 4;
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
pixel *const sr_p[3] = {
f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
};
bytefn(dav1d_lr_sbrow)(f, sr_p, sby);
}
void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
bytefn(dav1d_filter_sbrow_deblock)(f, sby);
if (f->seq_hdr->cdef)
bytefn(dav1d_filter_sbrow_cdef)(f, sby);
if (f->frame_hdr->width[0] != f->frame_hdr->width[1])
bytefn(dav1d_filter_sbrow_resize)(f, sby);
if (f->lf.restore_planes)
bytefn(dav1d_filter_sbrow_lr)(f, sby);
}
void bytefn(dav1d_backup_ipred_edge)(Dav1dTileContext *const t) {
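(Not part of the patch, added for orientation.) The monolithic sbrow filter above is now a thin wrapper over four per-stage entry points, which is what lets the new post-filter thread in thread_task.c schedule deblock/cdef/resize/loop-restoration as separate tasks. In the single-threaded path the pipeline is still driven one superblock row at a time, roughly:

    /* Illustrative sketch only; mirrors how the wrapper above is used.
     * f->bd_fn.filter_sbrow is the bitdepth-dispatched pointer that the
     * task-creation code later in this patch also references. */
    for (int sby = 0; sby < f->sbh; sby++)
        f->bd_fn.filter_sbrow(f, sby);   /* deblock -> cdef -> resize -> lr */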

third_party/dav1d/src/refmvs.c

@ -51,12 +51,13 @@ static void add_spatial_candidate(refmvs_candidate *const mvstack, int *const cn
const mv cand_mv = ((b->mf & 1) && gmv[0].n != INVALID_MV) ?
gmv[0] : b->mv.mv[n];
*have_refmv_match = 1;
*have_newmv_match |= b->mf >> 1;
const int last = *cnt;
for (int m = 0; m < last; m++)
if (mvstack[m].mv.mv[0].n == cand_mv.n) {
mvstack[m].weight += weight;
*have_refmv_match = 1;
*have_newmv_match |= b->mf >> 1;
return;
}
@ -65,8 +66,6 @@ static void add_spatial_candidate(refmvs_candidate *const mvstack, int *const cn
mvstack[last].weight = weight;
*cnt = last + 1;
}
*have_refmv_match = 1;
*have_newmv_match |= b->mf >> 1;
return;
}
}
@ -76,12 +75,13 @@ static void add_spatial_candidate(refmvs_candidate *const mvstack, int *const cn
[1] = ((b->mf & 1) && gmv[1].n != INVALID_MV) ? gmv[1] : b->mv.mv[1],
}};
*have_refmv_match = 1;
*have_newmv_match |= b->mf >> 1;
const int last = *cnt;
for (int n = 0; n < last; n++)
if (mvstack[n].mv.n == cand_mv.n) {
mvstack[n].weight += weight;
*have_refmv_match = 1;
*have_newmv_match |= b->mf >> 1;
return;
}
@ -90,8 +90,6 @@ static void add_spatial_candidate(refmvs_candidate *const mvstack, int *const cn
mvstack[last].weight = weight;
*cnt = last + 1;
}
*have_refmv_match = 1;
*have_newmv_match |= b->mf >> 1;
}
}

third_party/dav1d/src/scan.c

@ -30,19 +30,14 @@
#include "common/attributes.h"
#include "src/scan.h"
static const uint16_t ALIGN(av1_default_scan_4x4[], 32) = {
static const uint16_t ALIGN(scan_4x4[], 32) = {
0, 4, 1, 2,
5, 8, 12, 9,
6, 3, 7, 10,
13, 14, 11, 15,
};
static const uint16_t ALIGN(av1_mrow_scan_4x4[], 32) = {
0, 4, 8, 12,
1, 5, 9, 13,
2, 6, 10, 14,
3, 7, 11, 15,
};
static const uint16_t ALIGN(av1_default_scan_4x8[], 32) = {
static const uint16_t ALIGN(scan_4x8[], 32) = {
0, 8, 1, 16,
9, 2, 24, 17,
10, 3, 25, 18,
@ -52,17 +47,8 @@ static const uint16_t ALIGN(av1_default_scan_4x8[], 32) = {
14, 7, 29, 22,
15, 30, 23, 31,
};
static const uint16_t ALIGN(av1_mrow_scan_4x8[], 32) = {
0, 8, 16, 24,
1, 9, 17, 25,
2, 10, 18, 26,
3, 11, 19, 27,
4, 12, 20, 28,
5, 13, 21, 29,
6, 14, 22, 30,
7, 15, 23, 31,
};
static const uint16_t ALIGN(av1_default_scan_4x16[], 32) = {
static const uint16_t ALIGN(scan_4x16[], 32) = {
0, 16, 1, 32,
17, 2, 48, 33,
18, 3, 49, 34,
@ -80,37 +66,15 @@ static const uint16_t ALIGN(av1_default_scan_4x16[], 32) = {
30, 15, 61, 46,
31, 62, 47, 63,
};
static const uint16_t ALIGN(av1_mrow_scan_4x16[], 32) = {
0, 16, 32, 48,
1, 17, 33, 49,
2, 18, 34, 50,
3, 19, 35, 51,
4, 20, 36, 52,
5, 21, 37, 53,
6, 22, 38, 54,
7, 23, 39, 55,
8, 24, 40, 56,
9, 25, 41, 57,
10, 26, 42, 58,
11, 27, 43, 59,
12, 28, 44, 60,
13, 29, 45, 61,
14, 30, 46, 62,
15, 31, 47, 63,
};
static const uint16_t ALIGN(av1_default_scan_8x4[], 32) = {
static const uint16_t ALIGN(scan_8x4[], 32) = {
0, 1, 4, 2, 5, 8, 3, 6,
9, 12, 7, 10, 13, 16, 11, 14,
17, 20, 15, 18, 21, 24, 19, 22,
25, 28, 23, 26, 29, 27, 30, 31,
};
static const uint16_t ALIGN(av1_mrow_scan_8x4[], 32) = {
0, 4, 8, 12, 16, 20, 24, 28,
1, 5, 9, 13, 17, 21, 25, 29,
2, 6, 10, 14, 18, 22, 26, 30,
3, 7, 11, 15, 19, 23, 27, 31,
};
static const uint16_t ALIGN(av1_default_scan_8x8[], 32) = {
static const uint16_t ALIGN(scan_8x8[], 32) = {
0, 8, 1, 2, 9, 16, 24, 17,
10, 3, 4, 11, 18, 25, 32, 40,
33, 26, 19, 12, 5, 6, 13, 20,
@ -120,17 +84,8 @@ static const uint16_t ALIGN(av1_default_scan_8x8[], 32) = {
23, 31, 38, 45, 52, 59, 60, 53,
46, 39, 47, 54, 61, 62, 55, 63,
};
static const uint16_t ALIGN(av1_mrow_scan_8x8[], 32) = {
0, 8, 16, 24, 32, 40, 48, 56,
1, 9, 17, 25, 33, 41, 49, 57,
2, 10, 18, 26, 34, 42, 50, 58,
3, 11, 19, 27, 35, 43, 51, 59,
4, 12, 20, 28, 36, 44, 52, 60,
5, 13, 21, 29, 37, 45, 53, 61,
6, 14, 22, 30, 38, 46, 54, 62,
7, 15, 23, 31, 39, 47, 55, 63,
};
static const uint16_t ALIGN(av1_default_scan_8x16[], 32) = {
static const uint16_t ALIGN(scan_8x16[], 32) = {
0, 16, 1, 32, 17, 2, 48, 33,
18, 3, 64, 49, 34, 19, 4, 80,
65, 50, 35, 20, 5, 96, 81, 66,
@ -148,25 +103,8 @@ static const uint16_t ALIGN(av1_default_scan_8x16[], 32) = {
47, 123, 108, 93, 78, 63, 124, 109,
94, 79, 125, 110, 95, 126, 111, 127,
};
static const uint16_t ALIGN(av1_mrow_scan_8x16[], 32) = {
0, 16, 32, 48, 64, 80, 96, 112,
1, 17, 33, 49, 65, 81, 97, 113,
2, 18, 34, 50, 66, 82, 98, 114,
3, 19, 35, 51, 67, 83, 99, 115,
4, 20, 36, 52, 68, 84, 100, 116,
5, 21, 37, 53, 69, 85, 101, 117,
6, 22, 38, 54, 70, 86, 102, 118,
7, 23, 39, 55, 71, 87, 103, 119,
8, 24, 40, 56, 72, 88, 104, 120,
9, 25, 41, 57, 73, 89, 105, 121,
10, 26, 42, 58, 74, 90, 106, 122,
11, 27, 43, 59, 75, 91, 107, 123,
12, 28, 44, 60, 76, 92, 108, 124,
13, 29, 45, 61, 77, 93, 109, 125,
14, 30, 46, 62, 78, 94, 110, 126,
15, 31, 47, 63, 79, 95, 111, 127,
};
static const uint16_t ALIGN(av1_default_scan_8x32[], 32) = {
static const uint16_t ALIGN(scan_8x32[], 32) = {
0, 32, 1, 64, 33, 2, 96, 65,
34, 3, 128, 97, 66, 35, 4, 160,
129, 98, 67, 36, 5, 192, 161, 130,
@ -200,19 +138,15 @@ static const uint16_t ALIGN(av1_default_scan_8x32[], 32) = {
95, 251, 220, 189, 158, 127, 252, 221,
190, 159, 253, 222, 191, 254, 223, 255,
};
static const uint16_t ALIGN(av1_default_scan_16x4[], 32) = {
static const uint16_t ALIGN(scan_16x4[], 32) = {
0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14,
17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 32, 27, 30,
33, 36, 31, 34, 37, 40, 35, 38, 41, 44, 39, 42, 45, 48, 43, 46,
49, 52, 47, 50, 53, 56, 51, 54, 57, 60, 55, 58, 61, 59, 62, 63,
};
static const uint16_t ALIGN(av1_mrow_scan_16x4[], 32) = {
0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61,
2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62,
3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63,
};
static const uint16_t ALIGN(av1_default_scan_16x8[], 32) = {
static const uint16_t ALIGN(scan_16x8[], 32) = {
0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, 5,
12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, 21, 28,
35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, 30, 37, 44,
@ -222,17 +156,8 @@ static const uint16_t ALIGN(av1_default_scan_16x8[], 32) = {
99, 106, 113, 120, 79, 86, 93, 100, 107, 114, 121, 87, 94, 101, 108, 115,
122, 95, 102, 109, 116, 123, 103, 110, 117, 124, 111, 118, 125, 119, 126, 127,
};
static const uint16_t ALIGN(av1_mrow_scan_16x8[], 32) = {
0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120,
1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123,
4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124,
5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125,
6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126,
7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127,
};
static const uint16_t ALIGN(av1_default_scan_16x16[], 32) = {
static const uint16_t ALIGN(scan_16x16[], 32) = {
0, 16, 1, 2, 17, 32, 48, 33, 18, 3, 4, 19, 34, 49, 64, 80,
65, 50, 35, 20, 5, 6, 21, 36, 51, 66, 81, 96, 112, 97, 82, 67,
52, 37, 22, 7, 8, 23, 38, 53, 68, 83, 98, 113, 128, 144, 129, 114,
@ -250,43 +175,8 @@ static const uint16_t ALIGN(av1_default_scan_16x16[], 32) = {
188, 173, 158, 143, 159, 174, 189, 204, 219, 234, 249, 250, 235, 220, 205, 190,
175, 191, 206, 221, 236, 251, 252, 237, 222, 207, 223, 238, 253, 254, 239, 255,
};
static const uint16_t ALIGN(av1_mrow_scan_16x16[], 32) = {
0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240,
1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241,
2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242,
3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243,
4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244,
5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245,
6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246,
7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248,
9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249,
10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250,
11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251,
12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252,
13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253,
14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254,
15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255,
};
static const uint16_t ALIGN(av1_mcol_scan_16x16[], 32) = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
};
static const uint16_t ALIGN(av1_default_scan_16x32[], 32) = {
static const uint16_t ALIGN(scan_16x32[], 32) = {
0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, 160,
129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, 162, 131,
100, 69, 38, 7, 256, 225, 194, 163, 132, 101, 70, 39, 8, 288, 257, 226,
@ -320,7 +210,8 @@ static const uint16_t ALIGN(av1_default_scan_16x32[], 32) = {
380, 349, 318, 287, 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413, 382,
351, 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510, 479, 511,
};
static const uint16_t ALIGN(av1_default_scan_32x8[], 32) = {
static const uint16_t ALIGN(scan_32x8[], 32) = {
0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, 21, 28,
35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80, 39, 46, 53, 60,
67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89, 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98, 105, 112, 71, 78, 85, 92,
@ -330,7 +221,8 @@ static const uint16_t ALIGN(av1_default_scan_32x8[], 32) = {
195, 202, 209, 216, 175, 182, 189, 196, 203, 210, 217, 224, 183, 190, 197, 204, 211, 218, 225, 232, 191, 198, 205, 212, 219, 226, 233, 240, 199, 206, 213, 220,
227, 234, 241, 248, 207, 214, 221, 228, 235, 242, 249, 215, 222, 229, 236, 243, 250, 223, 230, 237, 244, 251, 231, 238, 245, 252, 239, 246, 253, 247, 254, 255,
};
static const uint16_t ALIGN(av1_default_scan_32x16[], 32) = {
static const uint16_t ALIGN(scan_32x16[], 32) = {
0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64, 5, 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22, 37, 52,
67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 128, 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 10, 25, 40, 55, 70, 85, 100, 115, 130,
145, 160, 11, 26, 41, 56, 71, 86, 101, 116, 131, 146, 161, 176, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177, 192, 13, 28, 43, 58, 73,
@ -348,7 +240,8 @@ static const uint16_t ALIGN(av1_default_scan_32x16[], 32) = {
381, 396, 411, 426, 441, 456, 471, 486, 501, 367, 382, 397, 412, 427, 442, 457, 472, 487, 502, 383, 398, 413, 428, 443, 458, 473, 488, 503, 399, 414, 429, 444,
459, 474, 489, 504, 415, 430, 445, 460, 475, 490, 505, 431, 446, 461, 476, 491, 506, 447, 462, 477, 492, 507, 463, 478, 493, 508, 479, 494, 509, 495, 510, 511,
};
static const uint16_t ALIGN(av1_default_scan_32x32[], 32) = {
static const uint16_t ALIGN(scan_32x32[], 32) = {
0, 32, 1, 2, 33, 64, 96, 65, 34, 3, 4, 35, 66, 97, 128, 160, 129, 98, 67, 36, 5, 6, 37, 68, 99, 130, 161, 192, 224, 193, 162, 131,
100, 69, 38, 7, 8, 39, 70, 101, 132, 163, 194, 225, 256, 288, 257, 226, 195, 164, 133, 102, 71, 40, 9, 10, 41, 72, 103, 134, 165, 196, 227, 258,
289, 320, 352, 321, 290, 259, 228, 197, 166, 135, 104, 73, 42, 11, 12, 43, 74, 105, 136, 167, 198, 229, 260, 291, 322, 353, 384, 416, 385, 354, 323, 292,
@ -383,62 +276,24 @@ static const uint16_t ALIGN(av1_default_scan_32x32[], 32) = {
892, 861, 830, 799, 831, 862, 893, 924, 955, 986, 1017, 1018, 987, 956, 925, 894, 863, 895, 926, 957, 988, 1019, 1020, 989, 958, 927, 959, 990, 1021, 1022, 991, 1023,
};
const uint16_t *const dav1d_scans[N_RECT_TX_SIZES][3] = {
[TX_4X4] = {
[TX_CLASS_2D] = av1_default_scan_4x4,
[TX_CLASS_V] = av1_mrow_scan_4x4,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [TX_8X8] = {
[TX_CLASS_2D] = av1_default_scan_8x8,
[TX_CLASS_V] = av1_mrow_scan_8x8,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [TX_16X16] = {
[TX_CLASS_2D] = av1_default_scan_16x16,
[TX_CLASS_V] = av1_mrow_scan_16x16,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [TX_32X32] = {
[TX_CLASS_2D] = av1_default_scan_32x32,
}, [TX_64X64] = {
[TX_CLASS_2D] = av1_default_scan_32x32,
}, [RTX_4X8] = {
[TX_CLASS_2D] = av1_default_scan_4x8,
[TX_CLASS_V] = av1_mrow_scan_4x8,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [RTX_8X4] = {
[TX_CLASS_2D] = av1_default_scan_8x4,
[TX_CLASS_V] = av1_mrow_scan_8x4,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [RTX_8X16] = {
[TX_CLASS_2D] = av1_default_scan_8x16,
[TX_CLASS_V] = av1_mrow_scan_8x16,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [RTX_16X8] = {
[TX_CLASS_2D] = av1_default_scan_16x8,
[TX_CLASS_V] = av1_mrow_scan_16x8,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [RTX_16X32] = {
[TX_CLASS_2D] = av1_default_scan_16x32,
}, [RTX_32X16] = {
[TX_CLASS_2D] = av1_default_scan_32x16,
}, [RTX_32X64] = {
[TX_CLASS_2D] = av1_default_scan_32x32,
}, [RTX_64X32] = {
[TX_CLASS_2D] = av1_default_scan_32x32,
}, [RTX_4X16] = {
[TX_CLASS_2D] = av1_default_scan_4x16,
[TX_CLASS_V] = av1_mrow_scan_4x16,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [RTX_16X4] = {
[TX_CLASS_2D] = av1_default_scan_16x4,
[TX_CLASS_V] = av1_mrow_scan_16x4,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [RTX_8X32] = {
[TX_CLASS_2D] = av1_default_scan_8x32,
}, [RTX_32X8] = {
[TX_CLASS_2D] = av1_default_scan_32x8,
}, [RTX_16X64] = {
[TX_CLASS_2D] = av1_default_scan_16x32,
}, [RTX_64X16] = {
[TX_CLASS_2D] = av1_default_scan_32x16,
},
const uint16_t *const dav1d_scans[N_RECT_TX_SIZES] = {
[ TX_4X4 ] = scan_4x4,
[ TX_8X8 ] = scan_8x8,
[ TX_16X16] = scan_16x16,
[ TX_32X32] = scan_32x32,
[ TX_64X64] = scan_32x32,
[RTX_4X8 ] = scan_4x8,
[RTX_8X4 ] = scan_8x4,
[RTX_8X16 ] = scan_8x16,
[RTX_16X8 ] = scan_16x8,
[RTX_16X32] = scan_16x32,
[RTX_32X16] = scan_32x16,
[RTX_32X64] = scan_32x32,
[RTX_64X32] = scan_32x32,
[RTX_4X16 ] = scan_4x16,
[RTX_16X4 ] = scan_16x4,
[RTX_8X32 ] = scan_8x32,
[RTX_32X8 ] = scan_32x8,
[RTX_16X64] = scan_16x32,
[RTX_64X16] = scan_32x16,
};

third_party/dav1d/src/scan.h

@ -32,6 +32,6 @@
#include "src/levels.h"
extern const uint16_t *const dav1d_scans[N_RECT_TX_SIZES][3];
extern const uint16_t *const dav1d_scans[N_RECT_TX_SIZES];
#endif /* DAV1D_SRC_SCAN_H */
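(Side note, not from the patch.) With the mrow/mcol tables gone, dav1d_scans is indexed by transform size alone; the 1-D horizontal/vertical coefficient classes are expected to be handled arithmetically in the coefficient decoder rather than through a scan table. The call-site change is essentially:

    /* Sketch of the lookup before/after; `tx` is the rectangular TX size. */
    const uint16_t *scan;
    scan = dav1d_scans[tx];              /* new: one scan order per TX size */
    /* was: dav1d_scans[tx][TX_CLASS_2D] -- V/H classes used mrow/mcol tables */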

third_party/dav1d/src/tables.c

@ -412,13 +412,11 @@ const int8_t dav1d_cdef_directions[2 + 8 + 2 /* dir */][2 /* pass */] = {
{ 0 * 12 + 1, -1 * 12 + 2 }, // 1
};
const int16_t dav1d_sgr_params[16][4] = { // r0, r1, e0, e1
{ 2, 1, 140, 3236 }, { 2, 1, 112, 2158 }, { 2, 1, 93, 1618 },
{ 2, 1, 80, 1438 }, { 2, 1, 70, 1295 }, { 2, 1, 58, 1177 },
{ 2, 1, 47, 1079 }, { 2, 1, 37, 996 }, { 2, 1, 30, 925 },
{ 2, 1, 25, 863 }, { 0, 1, -1, 2589 }, { 0, 1, -1, 1618 },
{ 0, 1, -1, 1177 }, { 0, 1, -1, 925 }, { 2, 0, 56, -1 },
{ 2, 0, 22, -1 },
const uint16_t ALIGN(dav1d_sgr_params[16][2], 4) = {
{ 140, 3236 }, { 112, 2158 }, { 93, 1618 }, { 80, 1438 },
{ 70, 1295 }, { 58, 1177 }, { 47, 1079 }, { 37, 996 },
{ 30, 925 }, { 25, 863 }, { 0, 2589 }, { 0, 1618 },
{ 0, 1177 }, { 0, 925 }, { 56, 0 }, { 22, 0 },
};
const uint8_t ALIGN(dav1d_sgr_x_by_x[256], 16) = {

third_party/dav1d/src/tables.h

@ -107,7 +107,7 @@ extern const Dav1dWarpedMotionParams dav1d_default_wm_params;
extern const int8_t dav1d_cdef_directions[12][2];
extern const int16_t dav1d_sgr_params[16][4];
extern const uint16_t dav1d_sgr_params[16][2];
extern const uint8_t dav1d_sgr_x_by_x[256];
extern const int8_t dav1d_mc_subpel_filters[5+ARCH_X86_64][15][8];
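(Assumption, for context.) The radii r0/r1 were dropped from dav1d_sgr_params, leaving only the two strength values per self-guided set; the radii are implied by which filter variant is selected (see the new dav1d_sgr_filter_5x5/_3x3/_mix entry points further down in this patch). Reading one parameter set then looks roughly like:

    /* sgr_idx is the self-guided parameter set index (0..15); names are
     * illustrative, not necessarily the ones used at the call sites. */
    const uint16_t s0 = dav1d_sgr_params[sgr_idx][0]; /* strength, 5x5 pass */
    const uint16_t s1 = dav1d_sgr_params[sgr_idx][1]; /* strength, 3x3 pass */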

third_party/dav1d/src/thread.h

@ -169,6 +169,14 @@ static inline void dav1d_set_thread_name(const char *const name) {
pthread_setname_np(pthread_self(), "%s", (void*)name);
}
#elif defined(__HAIKU__)
#include <os/kernel/OS.h>
static inline void dav1d_set_thread_name(const char *const name) {
rename_thread(find_thread(NULL), name);
}
#else
#define dav1d_set_thread_name(name) do {} while (0)

third_party/dav1d/src/thread_task.c

@ -29,6 +29,140 @@
#include "src/thread_task.h"
int dav1d_task_create_filter_sbrow(Dav1dFrameContext *const f) {
struct PostFilterThreadData *const pftd = f->lf.thread.pftd;
const int frame_idx = (int)(f - f->c->fc);
const int has_deblock = f->frame_hdr->loopfilter.level_y[0] ||
f->frame_hdr->loopfilter.level_y[1] ||
f->lf.restore_planes;
const int has_cdef = f->seq_hdr->cdef;
const int has_resize = f->frame_hdr->width[0] != f->frame_hdr->width[1];
const int has_lr = !!f->lf.restore_planes;
f->lf.thread.npf = has_deblock + has_cdef + has_resize + has_lr;
if (f->lf.thread.npf == 0) return 0;
pthread_mutex_lock(&pftd->lock);
Dav1dTask *tasks = f->lf.thread.tasks;
int num_tasks = f->sbh * f->lf.thread.npf;
if (num_tasks > f->lf.thread.num_tasks) {
const size_t size = sizeof(Dav1dTask) * num_tasks;
tasks = realloc(f->lf.thread.tasks, size);
if (!tasks) {
pthread_mutex_unlock(&pftd->lock);
return -1;
}
memset(tasks, 0, size);
f->lf.thread.tasks = tasks;
f->lf.thread.num_tasks = num_tasks;
}
#define create_task(task, ready_cond, start_cond) \
do { \
t = &tasks[num_tasks++]; \
t->status = ready_cond ? DAV1D_TASK_READY : DAV1D_TASK_DEFAULT; \
t->start = start_cond; \
t->frame_id = frame_cnt; \
t->frame_idx = frame_idx; \
t->sby = sby; \
t->fn = f->bd_fn.filter_sbrow_##task; \
t->last_deps[0] = NULL; \
t->last_deps[1] = NULL; \
t->next_deps[0] = NULL; \
t->next_deps[1] = NULL; \
t->next_exec = NULL; \
} while (0)
Dav1dTask *last_sbrow_deblock = NULL;
Dav1dTask *last_sbrow_cdef = NULL;
Dav1dTask *last_sbrow_resize = NULL;
Dav1dTask *last_sbrow_lr = NULL;
num_tasks = 0;
const int frame_cnt = pftd->frame_cnt++;
for (int sby = 0; sby < f->sbh; ++sby) {
Dav1dTask *t;
Dav1dTask *last = NULL;
if (has_deblock) {
create_task(deblock, sby == 0, 0);
if (sby) {
t->last_deps[1] = last_sbrow_deblock;
last_sbrow_deblock->next_deps[1] = t;
}
last = t;
last_sbrow_deblock = t;
}
if (has_cdef) {
create_task(cdef, sby == 0 && !has_deblock, has_deblock);
if (has_deblock) {
t->last_deps[0] = last;
last->next_deps[0] = t;
}
if (sby) {
t->last_deps[1] = last_sbrow_cdef;
last_sbrow_cdef->next_deps[1] = t;
}
last = t;
last_sbrow_cdef = t;
};
if (has_resize) {
create_task(resize, sby == 0 && !last, !!last);
if (last) {
t->last_deps[0] = last;
last->next_deps[0] = t;
}
if (sby) {
t->last_deps[1] = last_sbrow_resize;
last_sbrow_resize->next_deps[1] = t;
}
last = t;
last_sbrow_resize = t;
}
if (has_lr) {
create_task(lr, sby == 0 && !last, !!last);
if (last) {
t->last_deps[0] = last;
last->next_deps[0] = t;
}
if (sby) {
t->last_deps[1] = last_sbrow_lr;
last_sbrow_lr->next_deps[1] = t;
}
last_sbrow_lr = t;
}
}
f->lf.thread.done = 0;
pthread_mutex_unlock(&pftd->lock);
return 0;
}
void dav1d_task_schedule(struct PostFilterThreadData *const pftd,
Dav1dTask *const t)
{
Dav1dTask **pt = &pftd->tasks;
while (*pt &&
((*pt)->sby < t->sby ||
((*pt)->sby == t->sby && (*pt)->frame_id <= t->frame_id)))
pt = &(*pt)->next_exec;
t->next_exec = *pt;
*pt = t;
pthread_cond_signal(&pftd->cond);
}
static inline void update_task(Dav1dTask *const t, const int dep_type,
Dav1dFrameContext *const f)
{
if (!t->last_deps[!dep_type] ||
t->last_deps[!dep_type]->status == DAV1D_TASK_DONE)
{
t->status = DAV1D_TASK_READY;
if (t->start)
dav1d_task_schedule(f->lf.thread.pftd, t);
}
}
void *dav1d_frame_task(void *const data) {
Dav1dFrameContext *const f = data;
@ -140,3 +274,98 @@ void *dav1d_tile_task(void *const data) {
return NULL;
}
static inline int handle_abortion(Dav1dPostFilterContext *const pf,
Dav1dContext *const c,
struct PostFilterThreadData *const pftd)
{
const int flush = atomic_load_explicit(c->flush, memory_order_acquire);
if (flush) {
pthread_mutex_lock(&pf->td.lock);
pf->flushed = 0;
pthread_mutex_unlock(&pf->td.lock);
}
for (unsigned i = 0; i < c->n_fc; i++) {
Dav1dFrameContext *const f = &c->fc[i];
int send_signal;
if (flush) // TODO before merge, see if this can be safely merged
send_signal = f->lf.thread.done != 1 && f->lf.thread.num_tasks != 0;
else
send_signal = f->lf.thread.done == -1;
for (int j = 0; send_signal && j < f->lf.thread.num_tasks; j++) {
Dav1dTask *const t = &f->lf.thread.tasks[j];
if (t->status == DAV1D_TASK_RUNNING ||
(t->status == DAV1D_TASK_DONE && t->start != -1))
send_signal = 0;
}
if (send_signal) {
if (!flush) {
Dav1dTask **pt = &pftd->tasks;
while (*pt) {
if ((*pt)->frame_idx == i)
*pt = (*pt)->next_exec;
else
pt = &(*pt)->next_exec;
}
}
f->lf.thread.done = 1;
pthread_cond_signal(&f->lf.thread.cond);
}
}
if (flush) {
pthread_mutex_lock(&pf->td.lock);
pf->flushed = 1;
pthread_cond_signal(&pf->td.cond);
pthread_mutex_unlock(&pf->td.lock);
}
return !flush;
}
void *dav1d_postfilter_task(void *data) {
Dav1dPostFilterContext *const pf = data;
Dav1dContext *const c = pf->c;
struct PostFilterThreadData *pftd = &c->postfilter_thread;
dav1d_set_thread_name("dav1d-postfilter");
int exec = 1;
pthread_mutex_lock(&pftd->lock);
for (;;) {
if (!exec && !pf->die)
pthread_cond_wait(&pftd->cond, &pftd->lock);
if (!(exec = handle_abortion(pf, c, pftd))) continue;
if (pf->die) break;
Dav1dTask *const t = pftd->tasks;
if (!t) { exec = 0; continue; }
pftd->tasks = t->next_exec;
t->status = DAV1D_TASK_RUNNING;
pthread_mutex_unlock(&pftd->lock);
Dav1dFrameContext *const f = &c->fc[t->frame_idx];
t->fn(f, t->sby);
exec = 1;
pthread_mutex_lock(&pftd->lock);
if (t->next_deps[0])
update_task(t->next_deps[0], 0, f);
if (t->next_deps[1])
update_task(t->next_deps[1], 1, f);
t->status = DAV1D_TASK_DONE;
if (!t->next_deps[0]) {
const enum PlaneType progress_plane_type =
c->n_fc > 1 && f->frame_hdr->refresh_context ?
PLANE_TYPE_Y : PLANE_TYPE_ALL;
const int y = (t->sby + 1) * f->sb_step * 4;
dav1d_thread_picture_signal(&f->sr_cur, y, progress_plane_type);
if (t->sby + 1 == f->sbh) {
f->lf.thread.done = 1;
pthread_cond_signal(&f->lf.thread.cond);
}
}
t->start = -1;
}
pthread_mutex_unlock(&pftd->lock);
return NULL;
}
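(Sketch under assumptions, not taken from the patch.) f->lf.thread.done is only written while pftd->lock is held, and f->lf.thread.cond is signalled once the last superblock row of a frame (or an abort) completes, so the frame-side wait for post-filter completion would look roughly like:

    /* Hypothetical consumer; assumes pftd->lock guards f->lf.thread.done,
     * exactly as in dav1d_postfilter_task() above. */
    pthread_mutex_lock(&pftd->lock);
    while (!f->lf.thread.done)
        pthread_cond_wait(&f->lf.thread.cond, &pftd->lock);
    pthread_mutex_unlock(&pftd->lock);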

third_party/dav1d/src/thread_task.h

@ -35,10 +35,33 @@
#define FRAME_ERROR (UINT_MAX - 1)
#define TILE_ERROR (INT_MAX - 1)
int dav1d_decode_frame(Dav1dFrameContext *f);
void *dav1d_frame_task(void *data);
enum TaskStatus {
DAV1D_TASK_DEFAULT,
DAV1D_TASK_READY,
DAV1D_TASK_RUNNING,
DAV1D_TASK_DONE,
};
int dav1d_decode_tile_sbrow(Dav1dTileContext *t);
struct Dav1dTask {
enum TaskStatus status; // task status
int start; // frame thread start flag
unsigned frame_idx; // frame thread id
int frame_id; // frame ordering
int sby; // sbrow
filter_sbrow_fn fn; // task work
Dav1dTask *last_deps[2]; // dependencies
Dav1dTask *next_deps[2]; // dependant tasks
Dav1dTask *next_exec; // tasks scheduling
};
int dav1d_task_create_filter_sbrow(Dav1dFrameContext *f);
void dav1d_task_schedule(struct PostFilterThreadData *pftd, Dav1dTask *t);
void *dav1d_frame_task(void *data);
void *dav1d_tile_task(void *data);
void *dav1d_postfilter_task(void *data);
int dav1d_decode_frame(Dav1dFrameContext *f);
int dav1d_decode_tile_sbrow(Dav1dTileContext *t);
#endif /* DAV1D_SRC_THREAD_TASK_H */
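(Added commentary, not in the patch.) Each Dav1dTask carries two dependency links: slot 0 chains the filter stages within one superblock row and slot 1 chains the same stage across consecutive rows, matching the wiring done in dav1d_task_create_filter_sbrow(). In diagram form:

    /* Dependency sketch for one frame (stage names are the filter_sbrow_*
     * functions; the layout is inferred from the create/update code above).
     *
     *   deblock[sby-1]  cdef[sby-1]  resize[sby-1]  lr[sby-1]
     *        |deps[1]      |deps[1]      |deps[1]      |deps[1]
     *        v             v             v             v
     *   deblock[sby] -> cdef[sby] -> resize[sby] -> lr[sby]
     *               deps[0]      deps[0]       deps[0]
     *
     * update_task() marks a task READY once the relevant last_deps entry is
     * DONE, and dav1d_task_schedule() inserts it into pftd->tasks ordered by
     * sby first, then frame_id. */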

third_party/dav1d/src/wedge.c

@ -45,41 +45,41 @@ enum WedgeDirectionType {
};
typedef struct {
enum WedgeDirectionType direction;
int x_offset;
int y_offset;
uint8_t /* enum WedgeDirectionType */ direction;
uint8_t x_offset;
uint8_t y_offset;
} wedge_code_type;
static const wedge_code_type wedge_codebook_16_hgtw[16] = {
{ WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
{ WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
{ WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
{ WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 4 },
{ WEDGE_HORIZONTAL, 4, 6 }, { WEDGE_VERTICAL, 4, 4 },
{ WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
{ WEDGE_HORIZONTAL, 4, 6 }, { WEDGE_VERTICAL, 4, 4 },
{ WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
{ WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
{ WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
{ WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
{ WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
};
static const wedge_code_type wedge_codebook_16_hltw[16] = {
{ WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
{ WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
{ WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
{ WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 4, 4 },
{ WEDGE_VERTICAL, 6, 4 }, { WEDGE_HORIZONTAL, 4, 4 },
{ WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
{ WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 4, 4 },
{ WEDGE_VERTICAL, 6, 4 }, { WEDGE_HORIZONTAL, 4, 4 },
{ WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
{ WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
{ WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
{ WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
{ WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
};
static const wedge_code_type wedge_codebook_16_heqw[16] = {
{ WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
{ WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
{ WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
{ WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 6 },
{ WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 6, 4 },
{ WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
{ WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 6, 4 },
{ WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
{ WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
{ WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
{ WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
{ WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
};

third_party/dav1d/src/x86/cdef16_avx2.asm (new file)

@ -0,0 +1,65 @@
; Copyright (c) 2017-2021, The rav1e contributors
; Copyright (c) 2021, Nathan Egge
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
SECTION .text
cextern cdef_dir_8bpc_avx2
INIT_YMM avx2
cglobal cdef_dir_16bpc, 4, 4, 3, 32 + 8*8, src, ss, var, bdmax
popcnt bdmaxd, bdmaxd
movzx bdmaxq, bdmaxw
sub bdmaxq, 8
movq xm2, bdmaxq
DEFINE_ARGS src, ss, var, ss3
lea ss3q, [ssq*3]
mova xm0, [srcq + ssq*0]
mova xm1, [srcq + ssq*1]
vinserti128 m0, [srcq + ssq*2], 1
vinserti128 m1, [srcq + ss3q], 1
psraw m0, xm2
psraw m1, xm2
vpackuswb m0, m1
mova [rsp + 32 + 0*8], m0
lea srcq, [srcq + ssq*4]
mova xm0, [srcq + ssq*0]
mova xm1, [srcq + ssq*1]
vinserti128 m0, [srcq + ssq*2], 1
vinserti128 m1, [srcq + ss3q], 1
psraw m0, xm2
psraw m1, xm2
vpackuswb m0, m1
mova [rsp + 32 + 4*8], m0
lea srcq, [rsp + 32] ; WIN64 shadow space
mov ssq, 8
call mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX)
RET
%endif ; ARCH_X86_64
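(Rough C equivalent, not part of the patch.) The 16 bpc direction kernel is a shim: it shifts the 8x8 block down to an 8-bit range in a stack buffer and then calls the existing 8 bpc cdef_dir, so only the filter kernels need a true high-bit-depth path. In C terms, approximately:

    /* Sketch only; the real code does this with psraw/packuswb in registers.
     * dav1d_cdef_dir_8bpc_avx2 is the symbol imported via cextern above;
     * strides are byte strides, as elsewhere in dav1d. */
    extern int dav1d_cdef_dir_8bpc_avx2(const uint8_t *src, ptrdiff_t stride,
                                        unsigned *var);

    static int cdef_dir_16bpc_sketch(const uint16_t *src, ptrdiff_t stride,
                                     unsigned *var, int bitdepth_max)
    {
        uint8_t tmp[8 * 8];
        const int shift = __builtin_popcount(bitdepth_max) - 8; /* 2 or 4 */
        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 8; x++)
                tmp[y * 8 + x] = (uint8_t)(src[y * (stride >> 1) + x] >> shift);
        return dav1d_cdef_dir_8bpc_avx2(tmp, 8, var);
    }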

third_party/dav1d/src/x86/cdef16_sse.asm (new file)

@ -0,0 +1,93 @@
; Copyright (c) 2017-2021, The rav1e contributors
; Copyright (c) 2021, Nathan Egge
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%ifn ARCH_X86_64
SECTION_RODATA 16
pq_dir_shr: dq 2, 4
%endif
SECTION .text
cextern cdef_dir_8bpc_ssse3
INIT_XMM ssse3
cglobal cdef_dir_16bpc, 2, 4, 4, 32 + 8*8, src, ss, var, bdmax
bsr bdmaxd, bdmaxm
%if ARCH_X86_64
movzx bdmaxq, bdmaxw
sub bdmaxq, 7
movq m4, bdmaxq
%else
push r4
sub bdmaxd, 9
LEA r4, pq_dir_shr
movq m4, [r4 + bdmaxd*4]
pop r4
%endif
DEFINE_ARGS src, ss, var, ss3
lea ss3q, [ssq*3]
mova m0, [srcq + ssq*0]
mova m1, [srcq + ssq*1]
mova m2, [srcq + ssq*2]
mova m3, [srcq + ss3q]
psraw m0, m4
psraw m1, m4
psraw m2, m4
psraw m3, m4
packuswb m0, m1
packuswb m2, m3
mova [rsp + 32 + 0*8], m0
mova [rsp + 32 + 2*8], m2
lea srcq, [srcq + ssq*4]
mova m0, [srcq + ssq*0]
mova m1, [srcq + ssq*1]
mova m2, [srcq + ssq*2]
mova m3, [srcq + ss3q]
psraw m0, m4
psraw m1, m4
psraw m2, m4
psraw m3, m4
packuswb m0, m1
packuswb m2, m3
mova [rsp + 32 + 4*8], m0
mova [rsp + 32 + 6*8], m2
lea srcq, [rsp + 32] ; WIN64 shadow space
mov ssq, 8
%if ARCH_X86_64
call mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX)
%else
movifnidn vard, varm
push eax ; align stack
push vard
push ssd
push srcd
call mangle(private_prefix %+ _cdef_dir_8bpc)
add esp, 0x10
%endif
RET

third_party/dav1d/src/x86/cdef_avx2.asm

@ -39,7 +39,7 @@
%endmacro
%macro CDEF_FILTER_JMP_TABLE 1
JMP_TABLE cdef_filter_%1, \
JMP_TABLE cdef_filter_%1_8bpc, \
d6k0, d6k1, d7k0, d7k1, \
d0k0, d0k1, d1k0, d1k1, d2k0, d2k1, d3k0, d3k1, \
d4k0, d4k1, d5k0, d5k1, d6k0, d6k1, d7k0, d7k1, \
@ -94,7 +94,7 @@ SECTION .text
%macro PREP_REGS 2 ; w, h
; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
mov dird, r6m
lea tableq, [cdef_filter_%1x%2_jmptable]
lea tableq, [cdef_filter_%1x%2_8bpc_jmptable]
lea dirq, [tableq+dirq*2*4]
%if %1 == 4
%if %2 == 4
@ -397,7 +397,7 @@ SECTION .text
%macro CDEF_FILTER 2 ; w, h
INIT_YMM avx2
cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \
cglobal cdef_filter_%1x%2_8bpc, 4, 9, 0, dst, stride, left, top, \
pri, sec, dir, damping, edge
%assign stack_offset_entry stack_offset
mov edged, edgem
@ -1592,7 +1592,7 @@ CDEF_FILTER 4, 8
CDEF_FILTER 4, 4
INIT_YMM avx2
cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
cglobal cdef_dir_8bpc, 3, 4, 15, src, stride, var, stride3
lea stride3q, [strideq*3]
movq xm0, [srcq+strideq*0]
movq xm1, [srcq+strideq*1]
@ -1622,10 +1622,10 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
psubw m3, m8
; shuffle registers to generate partial_sum_diag[0-1] together
vpermq m7, m0, q1032
vpermq m6, m1, q1032
vpermq m5, m2, q1032
vpermq m4, m3, q1032
vperm2i128 m7, m0, m0, 0x01
vperm2i128 m6, m1, m1, 0x01
vperm2i128 m5, m2, m2, 0x01
vperm2i128 m4, m3, m3, 0x01
; start with partial_sum_hv[0-1]
paddw m8, m0, m1

third_party/dav1d/src/x86/cdef_avx512.asm

@ -109,7 +109,8 @@ DECLARE_REG_TMP 8, 5
; 5e 5f 50 51 52 53 54 55
INIT_ZMM avx512icl
cglobal cdef_filter_4x4, 4, 8, 13, dst, stride, left, top, pri, sec, dir, damping, edge
cglobal cdef_filter_4x4_8bpc, 4, 8, 13, dst, stride, left, top, \
pri, sec, dir, damping, edge
%define base r7-edge_mask
movq xmm0, [dstq+strideq*0]
movhps xmm0, [dstq+strideq*1]
@ -269,8 +270,7 @@ DECLARE_REG_TMP 2, 7
; L8 L9 40 41 42 43 44 45 8e 8f 80 81 82 83 84 85
; La Lb 50 51 52 53 54 55 9e 9f 90 91 92 93 94 95
cglobal cdef_filter_4x8, 4, 9, 22, dst, stride, left, top, \
pri, sec, dir, damping, edge
cglobal cdef_filter_4x8_8bpc, 4, 9, 22, dst, stride, left, top, pri, sec, dir, damping, edge
%define base r8-edge_mask
vpbroadcastd ym21, strided
mov r6d, edgem
@ -504,8 +504,8 @@ ALIGN function_align
; 8e 8f 80 81 82 83 84 85 84 85 86 87 88 89 8a 8b
; 9e 9f 90 91 92 93 94 95 94 95 96 97 98 99 9a 9b
cglobal cdef_filter_8x8, 4, 11, 32, 4*64, dst, stride, left, top, \
pri, sec, dir, damping, edge
cglobal cdef_filter_8x8_8bpc, 4, 11, 32, 4*64, dst, stride, left, top, \
pri, sec, dir, damping, edge
%define base r8-edge_mask
mov r6d, edgem
lea r10, [dstq+strideq*4-2]

third_party/dav1d/src/x86/cdef_init_tmpl.c

@ -28,20 +28,23 @@
#include "src/cpu.h"
#include "src/cdef.h"
#define decl_cdef_size_fn(sz) \
decl_cdef_fn(dav1d_cdef_filter_##sz##_avx512icl); \
decl_cdef_fn(dav1d_cdef_filter_##sz##_avx2); \
decl_cdef_fn(dav1d_cdef_filter_##sz##_sse4); \
decl_cdef_fn(dav1d_cdef_filter_##sz##_ssse3); \
decl_cdef_fn(dav1d_cdef_filter_##sz##_sse2)
#define decl_cdef_fns(ext) \
decl_cdef_fn(BF(dav1d_cdef_filter_4x4, ext)); \
decl_cdef_fn(BF(dav1d_cdef_filter_4x8, ext)); \
decl_cdef_fn(BF(dav1d_cdef_filter_8x8, ext))
decl_cdef_size_fn(4x4);
decl_cdef_size_fn(4x8);
decl_cdef_size_fn(8x8);
#if BITDEPTH == 8
decl_cdef_fns(avx512icl);
decl_cdef_fns(avx2);
decl_cdef_fns(sse4);
decl_cdef_fns(ssse3);
decl_cdef_fns(sse2);
decl_cdef_dir_fn(dav1d_cdef_dir_avx2);
decl_cdef_dir_fn(dav1d_cdef_dir_sse4);
decl_cdef_dir_fn(dav1d_cdef_dir_ssse3);
decl_cdef_dir_fn(BF(dav1d_cdef_dir, sse4));
#endif
decl_cdef_dir_fn(BF(dav1d_cdef_dir, avx2));
decl_cdef_dir_fn(BF(dav1d_cdef_dir, ssse3));
COLD void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
@ -49,45 +52,47 @@ COLD void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
#if BITDEPTH == 8
c->fb[0] = dav1d_cdef_filter_8x8_sse2;
c->fb[1] = dav1d_cdef_filter_4x8_sse2;
c->fb[2] = dav1d_cdef_filter_4x4_sse2;
c->fb[0] = BF(dav1d_cdef_filter_8x8, sse2);
c->fb[1] = BF(dav1d_cdef_filter_4x8, sse2);
c->fb[2] = BF(dav1d_cdef_filter_4x4, sse2);
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
c->dir = BF(dav1d_cdef_dir, ssse3);
#if BITDEPTH == 8
c->dir = dav1d_cdef_dir_ssse3;
c->fb[0] = dav1d_cdef_filter_8x8_ssse3;
c->fb[1] = dav1d_cdef_filter_4x8_ssse3;
c->fb[2] = dav1d_cdef_filter_4x4_ssse3;
c->fb[0] = BF(dav1d_cdef_filter_8x8, ssse3);
c->fb[1] = BF(dav1d_cdef_filter_4x8, ssse3);
c->fb[2] = BF(dav1d_cdef_filter_4x4, ssse3);
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
#if BITDEPTH == 8
c->dir = dav1d_cdef_dir_sse4;
c->fb[0] = dav1d_cdef_filter_8x8_sse4;
c->fb[1] = dav1d_cdef_filter_4x8_sse4;
c->fb[2] = dav1d_cdef_filter_4x4_sse4;
c->dir = BF(dav1d_cdef_dir, sse4);
c->fb[0] = BF(dav1d_cdef_filter_8x8, sse4);
c->fb[1] = BF(dav1d_cdef_filter_4x8, sse4);
c->fb[2] = BF(dav1d_cdef_filter_4x4, sse4);
#endif
#if ARCH_X86_64
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
c->dir = BF(dav1d_cdef_dir, avx2);
#if BITDEPTH == 8
c->dir = dav1d_cdef_dir_avx2;
c->fb[0] = dav1d_cdef_filter_8x8_avx2;
c->fb[1] = dav1d_cdef_filter_4x8_avx2;
c->fb[2] = dav1d_cdef_filter_4x4_avx2;
c->fb[0] = BF(dav1d_cdef_filter_8x8, avx2);
c->fb[1] = BF(dav1d_cdef_filter_4x8, avx2);
c->fb[2] = BF(dav1d_cdef_filter_4x4, avx2);
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
#if HAVE_AVX512ICL && BITDEPTH == 8
c->fb[0] = dav1d_cdef_filter_8x8_avx512icl;
c->fb[1] = dav1d_cdef_filter_4x8_avx512icl;
c->fb[2] = dav1d_cdef_filter_4x4_avx512icl;
c->fb[0] = BF(dav1d_cdef_filter_8x8, avx512icl);
c->fb[1] = BF(dav1d_cdef_filter_4x8, avx512icl);
c->fb[2] = BF(dav1d_cdef_filter_4x4, avx512icl);
#endif
#endif
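(Explanatory note, not part of the patch.) The BF() macro used in these renames is dav1d's bitdepth-suffix helper, defined alongside bitfn() in the common bitdepth header; to a first approximation it expands as below, which is why the 8 bpc symbols gain an explicit _8bpc_ infix here:

    /* Approximate shape, shown only to make the renames above readable; see
     * include/common/bitdepth.h in the dav1d tree for the real definition. */
    #if BITDEPTH == 8
    #define BF(name, suffix) name##_8bpc_##suffix   /* e.g. dav1d_cdef_dir_8bpc_avx2  */
    #else
    #define BF(name, suffix) name##_16bpc_##suffix  /* e.g. dav1d_cdef_dir_16bpc_avx2 */
    #endif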

third_party/dav1d/src/x86/cdef_sse.asm

@ -249,13 +249,13 @@ SECTION .text
%macro CDEF_FILTER 2 ; w, h
%if ARCH_X86_64
cglobal cdef_filter_%1x%2, 4, 9, 16, 3 * 16 + (%2+4)*32, \
dst, stride, left, top, pri, sec, edge, stride3, dst4
cglobal cdef_filter_%1x%2_8bpc, 4, 9, 16, 3 * 16 + (%2+4)*32, \
dst, stride, left, top, pri, sec, edge, stride3, dst4
%define px rsp+3*16+2*32
%define base 0
%else
cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*32, \
dst, stride, left, edge, stride3
cglobal cdef_filter_%1x%2_8bpc, 2, 7, 8, - 7 * 16 - (%2+4)*32, \
dst, stride, left, edge, stride3
%define topq r2
%define dst4q r2
LEA r5, tap_table
@ -758,7 +758,7 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*32, \
%macro CDEF_DIR 0
%if ARCH_X86_64
cglobal cdef_dir, 3, 5, 16, 32, src, stride, var, stride3
cglobal cdef_dir_8bpc, 3, 5, 16, 32, src, stride, var, stride3
lea stride3q, [strideq*3]
movq m1, [srcq+strideq*0]
movhps m1, [srcq+strideq*1]
@ -1030,7 +1030,7 @@ cglobal cdef_dir, 3, 5, 16, 32, src, stride, var, stride3
shr r1d, 10
mov [varq], r1d
%else
cglobal cdef_dir, 2, 4, 8, 96, src, stride, var, stride3
cglobal cdef_dir_8bpc, 2, 4, 8, 96, src, stride, var, stride3
%define base r2-shufw_6543210x
LEA r2, shufw_6543210x
pxor m0, m0

third_party/dav1d/src/x86/ipred.asm

@ -1170,7 +1170,7 @@ ALIGN function_align
mova m9, [base+ipred_v_shuf]
vbroadcasti128 m6, [base+smooth_weights+16*2]
vbroadcasti128 m7, [base+smooth_weights+16*3]
vpermq m8, m9, q1032
vperm2i128 m8, m9, m9, 0x01
paddw m0, m10, m3
paddw m3, m11
paddw m12, m0
@ -4197,7 +4197,7 @@ ALIGN function_align
pmaddubsw m%3, m5
paddw m%1, m%3
psraw m%1, 4
vpermq m%3, m%1, q1032
vperm2i128 m%3, m%1, m%1, 0x01
packuswb m%1, m%3
%endmacro

third_party/dav1d/src/x86/looprestoration.asm (diff not shown because of its size)

480
third_party/dav1d/src/x86/looprestoration16_avx2.asm поставляемый Normal file
Просмотреть файл

@ -0,0 +1,480 @@
; Copyright (c) 2017-2021, The rav1e contributors
; Copyright (c) 2021, Nathan Egge
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
SECTION_RODATA 32
wiener5_shufB: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
wiener5_shufC: db 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11, 14, 15, 12, 13
wiener5_shufD: db 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1, 10, 11, -1, -1
wiener5_l_shuf: db 4, 5, 4, 5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
wiener7_shufC: db 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9
wiener7_shufD: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
wiener7_shufE: db 8, 9, -1, -1, 10, 11, -1, -1, 12, 13, -1, -1, 14, 15, -1, -1
rev_w: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
rev_d: db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
wiener7_l_shuf: db 6, 7, 6, 7, 6, 7, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
pq_3: dq (6 - 4) + 1
pq_5: dq (6 - 2) + 1
pd_65540: dd (1 << (8 + (6 - 4) + 6)) + (1 << (6 - 4))
pd_262160: dd (1 << (8 + (6 - 2) + 6)) + (1 << (6 - 2))
pq_11: dq 12 - (6 - 4) + 1
pq_9: dq 12 - (6 - 2) + 1
nd_1047552: dd (1 << (12 - (6 - 4))) - (1 << (12 + 8))
nd_1048320: dd (1 << (12 - (6 - 2))) - (1 << (12 + 8))
pb_wiener5_l: times 2 db 2, 3
pb_wiener5_r: times 2 db -6, -5
pb_wiener7_l: times 2 db 4, 5
pb_wiener7_m: times 2 db -4, -3
pb_wiener7_r: times 2 db -8, -7
SECTION .text
INIT_YMM avx2
cglobal wiener_filter5_h_16bpc, 6, 9, 14, dst, left, src, ss, f, w, h, edge, bdmax
movifnidn wd, wm
movifnidn hd, hm
movifnidn edgeb, edgem
vbroadcasti128 m6, [wiener5_shufB]
vpbroadcastd m12, [fq + 2]
vbroadcasti128 m7, [wiener5_shufC]
vpbroadcastw m13, [fq + 6]
vbroadcasti128 m8, [wiener5_shufD]
popcnt bdmaxd, bdmaxm
vpbroadcastd m9, [pd_65540]
movq xm10, [pq_3]
cmp bdmaxd, 10
je .bits10
vpbroadcastd m9, [pd_262160]
movq xm10, [pq_5]
.bits10:
pxor m11, m11
add wq, wq
add srcq, wq
add dstq, wq
neg wq
DEFINE_ARGS dst, left, src, ss, f, w, h, edge, x
.v_loop:
mov xq, wq
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
test leftq, leftq
jz .h_loop
movd xm4, [leftq + 4]
vpblendd m4, [srcq + xq - 4], 0xfe
add leftq, 8
jmp .h_main
.h_extend_left:
vbroadcasti128 m5, [srcq + xq]
mova m4, [srcq + xq]
palignr m4, m5, 12
pshufb m4, [wiener5_l_shuf]
jmp .h_main
.h_loop:
movu m4, [srcq + xq - 4]
.h_main:
movu m5, [srcq + xq + 4]
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .h_have_right
cmp xd, -36
jl .h_have_right
movd xm2, xd
vpbroadcastd m0, [pb_wiener5_l]
vpbroadcastd m1, [pb_wiener5_r]
vpbroadcastb m2, xm2
movu m3, [pb_0to31]
psubb m0, m2
psubb m1, m2
pminub m0, m3
pminub m1, m3
pshufb m4, m0
pshufb m5, m1
.h_have_right:
pshufb m0, m4, m6
pshufb m2, m4, m7
paddw m0, m2
pmaddwd m0, m12
pshufb m1, m5, m6
pshufb m3, m5, m7
paddw m1, m3
pmaddwd m1, m12
pshufb m4, m8
pmaddwd m4, m13
pshufb m5, m8
pmaddwd m5, m13
paddd m0, m4
paddd m1, m5
paddd m0, m9
paddd m1, m9
psrad m0, xm10
psrad m1, xm10
packssdw m0, m1
pmaxsw m0, m11
mova [dstq + xq], m0
add xq, 32
jl .h_loop
add srcq, ssq
add dstq, 384*2
dec hd
jg .v_loop
RET
DECLARE_REG_TMP 8, 9, 10, 11, 12, 13, 14
INIT_YMM avx2
cglobal wiener_filter5_v_16bpc, 6, 13, 12, dst, ds, mid, f, w, h, edge, bdmax
movifnidn wd, wm
movifnidn hd, hm
movifnidn edgeb, edgem
pxor m6, m6
vpbroadcastd m7, [fq + 2]
vpbroadcastd m8, [fq + 6]
popcnt bdmaxd, bdmaxm
vpbroadcastd m9, [nd_1047552]
movq xm10, [pq_11]
cmp bdmaxd, 10
je .bits10
vpbroadcastd m9, [nd_1048320]
movq xm10, [pq_9]
.bits10:
vpbroadcastw m11, bdmaxm
add wq, wq
add midq, wq
add dstq, wq
neg wq
DEFINE_ARGS dst, ds, mid, ms, w, h, edge, x
mov msq, 2*384
mov t0, midq
lea t1, [t0 + msq]
lea t2, [t1 + msq]
lea t3, [t2 + msq]
lea t4, [t3 + msq]
test edgeb, 4 ; LR_HAVE_TOP
jnz .have_top
mov t0, t2
mov t1, t2
.have_top:
test edgeb, 8 ; LR_HAVE_BOTTOM
jnz .v_loop
cmp hd, 2
jg .v_loop
cmp hd, 1
jne .limit_v
mov t3, t2
.limit_v:
mov t4, t3
.v_loop:
mov xq, wq
.h_loop:
mova m1, [t0 + xq]
mova m2, [t1 + xq]
mova m3, [t2 + xq]
mova m4, [t3 + xq]
mova m5, [t4 + xq]
punpcklwd m0, m1, m2
pmaddwd m0, m7
punpckhwd m1, m2
pmaddwd m1, m7
punpcklwd m2, m5, m4
pmaddwd m2, m7
punpckhwd m5, m4
pmaddwd m5, m7
paddd m0, m2
paddd m1, m5
punpcklwd m2, m3, m6
pmaddwd m2, m8
punpckhwd m3, m6
pmaddwd m3, m8
paddd m0, m2
paddd m1, m3
paddd m0, m9
paddd m1, m9
psrad m0, xm10
psrad m1, xm10
packusdw m0, m1
pminuw m0, m11
mova [dstq + xq], m0
add xq, 32
jl .h_loop
add dstq, dsq
mov t0, t1
mov t1, t2
mov t2, t3
mov t3, t4
add t4, msq
test edgeb, 8 ; LR_HAVE_BOTTOM
jnz .have_bottom
cmp hd, 3
jg .have_bottom
mov t4, t3
.have_bottom:
dec hd
jg .v_loop
RET
INIT_YMM avx2
cglobal wiener_filter7_h_16bpc, 6, 10, 16, dst, left, src, ss, f, w, h, edge, bdmax, rh
movifnidn wd, wm
movifnidn hd, hm
movifnidn edgeb, edgem
vpbroadcastd m7, [fq]
vpbroadcastd m8, [fq + 4]
vbroadcasti128 m10, [rev_w]
vbroadcasti128 m11, [wiener5_shufB]
vbroadcasti128 m12, [wiener7_shufC]
vbroadcasti128 m13, [wiener7_shufD]
vbroadcasti128 m14, [wiener7_shufE]
vbroadcasti128 m15, [rev_d]
popcnt bdmaxd, bdmaxm
vpbroadcastd m9, [pd_65540]
mov rhq, [pq_3]
cmp bdmaxd, 10
je .bits10
vpbroadcastd m9, [pd_262160]
mov rhq, [pq_5]
.bits10:
add wq, wq
add srcq, wq
add dstq, wq
neg wq
DEFINE_ARGS dst, left, src, ss, f, w, h, edge, x, rh
.v_loop:
mov xq, wq
test edgeb, 1 ; LR_HAVE_LEFT
jz .h_extend_left
test leftq, leftq
jz .h_loop
movq xm4, [leftq + 2]
vpblendw xm4, [srcq + xq - 6], 0xf8
vinserti128 m4, [srcq + xq + 10], 1
add leftq, 8
jmp .h_main
.h_extend_left:
vbroadcasti128 m5, [srcq + xq]
mova m4, [srcq + xq]
palignr m4, m5, 10
pshufb m4, [wiener7_l_shuf]
jmp .h_main
.h_loop:
movu m4, [srcq + xq - 6]
.h_main:
movu m5, [srcq + xq + 2]
movu m6, [srcq + xq + 6]
test edgeb, 2 ; LR_HAVE_RIGHT
jnz .h_have_right
cmp xd, -38
jl .h_have_right
movd xm3, xd
vpbroadcastd m0, [pb_wiener7_l]
vpbroadcastd m1, [pb_wiener7_m]
vpbroadcastd m2, [pb_wiener7_r]
vpbroadcastb m3, xm3
psubb m0, m3
psubb m1, m3
psubb m2, m3
movu m3, [pb_0to31]
pminub m0, m3
pminub m1, m3
pminub m2, m3
pshufb m4, m0
pshufb m5, m1
pshufb m6, m2
cmp xd, -9*2
jne .hack
vpbroadcastw xm3, [srcq + xq + 16]
vinserti128 m5, xm3, 1
jmp .h_have_right
.hack:
cmp xd, -1*2
jne .h_have_right
vpbroadcastw xm5, [srcq + xq]
.h_have_right:
pshufb m6, m10
pshufb m0, m4, m11
pshufb m2, m5, m12
paddw m0, m2
pmaddwd m0, m7
pshufb m2, m4, m13
pshufb m4, m14
paddw m2, m4
pmaddwd m2, m8
pshufb m1, m6, m11
pshufb m5, m11
pmaddwd m1, m7
pmaddwd m5, m7
pshufb m3, m6, m13
pshufb m6, m14
paddw m3, m6
pmaddwd m3, m8
paddd m0, m2
paddd m1, m3
pshufb m1, m15
paddd m1, m5
movq xm4, rhq
pxor m5, m5
paddd m0, m9
paddd m1, m9
psrad m0, xm4
psrad m1, xm4
packssdw m0, m1
pmaxsw m0, m5
mova [dstq + xq], m0
add xq, 32
jl .h_loop
add srcq, ssq
add dstq, 384*2
dec hd
jg .v_loop
RET
INIT_YMM avx2
cglobal wiener_filter7_v_16bpc, 6, 15, 13, dst, ds, mid, f, w, h, edge, bdmax
movifnidn wd, wm
movifnidn hd, hm
movifnidn edgeb, edgem
pxor m6, m6
vpbroadcastd m7, [fq]
vpbroadcastw m8, [fq + 4]
vpbroadcastd m9, [fq + 6]
popcnt bdmaxd, bdmaxm
vpbroadcastd m10, [nd_1047552]
movq xm11, [pq_11]
cmp bdmaxd, 10
je .bits10
vpbroadcastd m10, [nd_1048320]
movq xm11, [pq_9]
.bits10:
vpbroadcastw m12, bdmaxm
add wq, wq
add midq, wq
add dstq, wq
neg wq
DEFINE_ARGS dst, ds, mid, ms, w, h, edge, x
mov msq, 2*384
mov t0, midq
mov t1, t0
lea t2, [t1 + msq]
lea t3, [t2 + msq]
lea t4, [t3 + msq]
lea t5, [t4 + msq]
lea t6, [t5 + msq]
test edgeb, 4 ; LR_HAVE_TOP
jnz .have_top
mov t0, t3
mov t1, t3
mov t2, t3
.have_top:
cmp hd, 3
jg .v_loop
test edgeb, 8 ; LR_HAVE_BOTTOM
jz .no_bottom0
cmp hd, 1
jg .v_loop
jmp .h3
.no_bottom0:
cmp hd, 2
je .h2
jns .h3
.h1:
mov t4, t3
.h2:
mov t5, t4
.h3:
mov t6, t5
.v_loop:
mov xq, wq
.h_loop:
mova m1, [t0 + xq]
mova m2, [t1 + xq]
mova m3, [t5 + xq]
mova m4, [t6 + xq]
punpcklwd m0, m1, m2
pmaddwd m0, m7
punpckhwd m1, m2
pmaddwd m1, m7
punpcklwd m2, m4, m3
pmaddwd m2, m7
punpckhwd m4, m3
pmaddwd m4, m7
paddd m0, m2
paddd m1, m4
mova m3, [t2 + xq]
mova m4, [t4 + xq]
punpcklwd m2, m3, m4
pmaddwd m2, m8
punpckhwd m3, m4
pmaddwd m3, m8
paddd m0, m2
paddd m1, m3
mova m3, [t3 + xq]
punpcklwd m2, m3, m6
pmaddwd m2, m9
punpckhwd m3, m6
pmaddwd m3, m9
paddd m0, m2
paddd m1, m3
paddd m0, m10
paddd m1, m10
psrad m0, xm11
psrad m1, xm11
packusdw m0, m1
pminuw m0, m12
mova [dstq + xq], m0
add xq, 32
jl .h_loop
add dstq, dsq
mov t0, t1
mov t1, t2
mov t2, t3
mov t3, t4
mov t4, t5
mov t5, t6
add t6, msq
cmp hd, 4
jg .next_row
test edgeb, 8 ; LR_HAVE_BOTTOM
jz .no_bottom
cmp hd, 2
jg .next_row
.no_bottom:
mov t6, t5
.next_row:
dec hd
jg .v_loop
RET
%endif ; ARCH_X86_64


@ -29,173 +29,235 @@
#include "src/looprestoration.h"
#include "common/intops.h"
#include "src/tables.h"
#define WIENER_FILTER(ext) \
void dav1d_wiener_filter7_##ext(pixel *const dst, ptrdiff_t dst_stride, \
const pixel (*left)[4], const pixel *lpf, \
ptrdiff_t lpf_stride, int w, int h, \
const int16_t filter[2][8], \
enum LrEdgeFlags edges); \
void dav1d_wiener_filter5_##ext(pixel *const dst, ptrdiff_t dst_stride, \
const pixel (*left)[4], const pixel *lpf, \
ptrdiff_t lpf_stride, int w, int h, \
const int16_t filter[2][8], \
enum LrEdgeFlags edges);
#if BITDEPTH != 8
#define decl_wiener_filter_fn(name, ext) \
void BF(name##_h, ext)(int16_t *dst, const pixel (*left)[4], const pixel *src, \
ptrdiff_t stride, const int16_t fh[7], const intptr_t w, \
int h, enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); \
void BF(name##_v, ext)(pixel *dst, ptrdiff_t stride, const int16_t *mid, \
const int16_t fv[7], int w, int h, \
enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); \
static void BF(name, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
const pixel (*const left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, const LooprestorationParams *params, \
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) { \
ALIGN_STK_64(int16_t, mid, 68 * 384,); \
BF(name##_h, ext)(&mid[2*384], left, dst, dst_stride, params->filter[0], w, h, \
edges HIGHBD_TAIL_SUFFIX); \
if (edges & LR_HAVE_TOP) { \
BF(name##_h, ext)(mid, NULL, lpf, lpf_stride, params->filter[0], w, 2, \
edges HIGHBD_TAIL_SUFFIX); \
} \
if (edges & LR_HAVE_BOTTOM) { \
BF(name##_h, ext)(&mid[(2 + h)*384], NULL, lpf + 6*PXSTRIDE(lpf_stride), \
lpf_stride, params->filter[0], w, 2, edges HIGHBD_TAIL_SUFFIX); \
} \
BF(name##_v, ext)(dst, dst_stride, mid, params->filter[1], w, h, edges HIGHBD_TAIL_SUFFIX); \
}
#define decl_wiener_filter_fns(ext) \
decl_wiener_filter_fn(dav1d_wiener_filter7, ext); \
decl_wiener_filter_fn(dav1d_wiener_filter5, ext)
#else
#define decl_wiener_filter_fns(ext) \
decl_lr_filter_fn(BF(dav1d_wiener_filter7, ext)); \
decl_lr_filter_fn(BF(dav1d_wiener_filter5, ext))
#endif
#define SGR_FILTER(ext) \
void dav1d_sgr_box3_h_##ext(int32_t *sumsq, int16_t *sum, \
const pixel (*left)[4], \
const pixel *src, const ptrdiff_t stride, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void dav1d_sgr_box3_v_##ext(int32_t *sumsq, int16_t *sum, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void dav1d_sgr_calc_ab1_##ext(int32_t *a, int16_t *b, \
const int w, const int h, const int strength); \
void dav1d_sgr_finish_filter1_##ext(coef *tmp, \
const pixel *src, const ptrdiff_t stride, \
const int32_t *a, const int16_t *b, \
const int w, const int h); \
#define decl_sgr_filter_fns(ext) \
void BF(dav1d_sgr_filter_5x5, ext)(pixel *dst, ptrdiff_t dst_stride, \
const pixel (*left)[4], const pixel *lpf, \
ptrdiff_t lpf_stride, int w, int h, \
const LooprestorationParams *params, \
enum LrEdgeFlags edges); \
void BF(dav1d_sgr_filter_3x3, ext)(pixel *dst, ptrdiff_t dst_stride, \
const pixel (*left)[4], const pixel *lpf, \
ptrdiff_t lpf_stride, int w, int h, \
const LooprestorationParams *params, \
enum LrEdgeFlags edges); \
void BF(dav1d_sgr_filter_mix, ext)(pixel *dst, ptrdiff_t dst_stride, \
const pixel (*left)[4], const pixel *lpf, \
ptrdiff_t lpf_stride, int w, int h, \
const LooprestorationParams *params, \
enum LrEdgeFlags edges);
/* FIXME: Replace with a port of the AVX2 code */
#define SGR_FILTER_OLD(ext) \
void BF(dav1d_sgr_box3_h, ext)(int32_t *sumsq, int16_t *sum, \
const pixel (*left)[4], \
const pixel *src, const ptrdiff_t stride, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void BF(dav1d_sgr_box3_v, ext)(int32_t *sumsq, int16_t *sum, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void BF(dav1d_sgr_calc_ab1, ext)(int32_t *a, int16_t *b, \
const int w, const int h, const unsigned s); \
void BF(dav1d_sgr_finish_filter1, ext)(coef *tmp, \
const pixel *src, const ptrdiff_t stride, \
const int32_t *a, const int16_t *b, \
const int w, const int h); \
\
/* filter with a 3x3 box (radius=1) */ \
static void dav1d_sgr_filter1_##ext(coef *tmp, \
const pixel *src, const ptrdiff_t stride, \
const pixel (*left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, const int strength, \
const enum LrEdgeFlags edges) \
static void BF(dav1d_sgr_filter1, ext)(coef *tmp, \
const pixel *src, const ptrdiff_t stride, \
const pixel (*left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, const int strength, \
const enum LrEdgeFlags edges) \
{ \
ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \
int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \
ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \
int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \
\
dav1d_sgr_box3_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \
BF(dav1d_sgr_box3_h, ext)(sumsq, sum, left, src, stride, w, h, edges); \
if (edges & LR_HAVE_TOP) \
dav1d_sgr_box3_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
NULL, lpf, lpf_stride, w, 2, edges); \
BF(dav1d_sgr_box3_h, ext)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
NULL, lpf, lpf_stride, w, 2, edges); \
\
if (edges & LR_HAVE_BOTTOM) \
dav1d_sgr_box3_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
lpf_stride, w, 2, edges); \
BF(dav1d_sgr_box3_h, ext)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
lpf_stride, w, 2, edges); \
\
dav1d_sgr_box3_v_##ext(sumsq, sum, w, h, edges); \
dav1d_sgr_calc_ab1_##ext(a, b, w, h, strength); \
dav1d_sgr_finish_filter1_##ext(tmp, src, stride, a, b, w, h); \
BF(dav1d_sgr_box3_v, ext)(sumsq, sum, w, h, edges); \
BF(dav1d_sgr_calc_ab1, ext)(a, b, w, h, strength); \
BF(dav1d_sgr_finish_filter1, ext)(tmp, src, stride, a, b, w, h); \
} \
\
void dav1d_sgr_box5_h_##ext(int32_t *sumsq, int16_t *sum, \
const pixel (*left)[4], \
const pixel *src, const ptrdiff_t stride, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void dav1d_sgr_box5_v_##ext(int32_t *sumsq, int16_t *sum, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void dav1d_sgr_calc_ab2_##ext(int32_t *a, int16_t *b, \
const int w, const int h, const int strength); \
void dav1d_sgr_finish_filter2_##ext(coef *tmp, \
const pixel *src, const ptrdiff_t stride, \
const int32_t *a, const int16_t *b, \
const int w, const int h); \
void BF(dav1d_sgr_box5_h, ext)(int32_t *sumsq, int16_t *sum, \
const pixel (*left)[4], \
const pixel *src, const ptrdiff_t stride, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void BF(dav1d_sgr_box5_v, ext)(int32_t *sumsq, int16_t *sum, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void BF(dav1d_sgr_calc_ab2, ext)(int32_t *a, int16_t *b, \
const int w, const int h, const int strength); \
void BF(dav1d_sgr_finish_filter2, ext)(coef *tmp, \
const pixel *src, const ptrdiff_t stride, \
const int32_t *a, const int16_t *b, \
const int w, const int h); \
\
/* filter with a 5x5 box (radius=2) */ \
static void dav1d_sgr_filter2_##ext(coef *tmp, \
const pixel *src, const ptrdiff_t stride, \
const pixel (*left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, const int strength, \
const enum LrEdgeFlags edges) \
static void BF(dav1d_sgr_filter2, ext)(coef *tmp, \
const pixel *src, const ptrdiff_t stride, \
const pixel (*left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, const int strength, \
const enum LrEdgeFlags edges) \
{ \
ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \
int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \
ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \
int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \
\
dav1d_sgr_box5_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \
BF(dav1d_sgr_box5_h, ext)(sumsq, sum, left, src, stride, w, h, edges); \
if (edges & LR_HAVE_TOP) \
dav1d_sgr_box5_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
NULL, lpf, lpf_stride, w, 2, edges); \
BF(dav1d_sgr_box5_h, ext)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
NULL, lpf, lpf_stride, w, 2, edges); \
\
if (edges & LR_HAVE_BOTTOM) \
dav1d_sgr_box5_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
lpf_stride, w, 2, edges); \
BF(dav1d_sgr_box5_h, ext)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
lpf_stride, w, 2, edges); \
\
dav1d_sgr_box5_v_##ext(sumsq, sum, w, h, edges); \
dav1d_sgr_calc_ab2_##ext(a, b, w, h, strength); \
dav1d_sgr_finish_filter2_##ext(tmp, src, stride, a, b, w, h); \
BF(dav1d_sgr_box5_v, ext)(sumsq, sum, w, h, edges); \
BF(dav1d_sgr_calc_ab2, ext)(a, b, w, h, strength); \
BF(dav1d_sgr_finish_filter2, ext)(tmp, src, stride, a, b, w, h); \
} \
\
void dav1d_sgr_weighted1_##ext(pixel *dst, const ptrdiff_t stride, \
const coef *t1, const int w, const int h, \
const int wt); \
void dav1d_sgr_weighted2_##ext(pixel *dst, const ptrdiff_t stride, \
const coef *t1, const coef *t2, \
const int w, const int h, \
const uint32_t wt); \
void BF(dav1d_sgr_weighted1, ext)(pixel *dst, const ptrdiff_t stride, \
const coef *t1, const int w, const int h, \
const int wt); \
void BF(dav1d_sgr_weighted2, ext)(pixel *dst, const ptrdiff_t stride, \
const coef *t1, const coef *t2, \
const int w, const int h, \
const uint32_t wt); \
\
static void sgr_filter_##ext(pixel *const dst, const ptrdiff_t dst_stride, \
const pixel (*const left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, const int sgr_idx, \
const int16_t sgr_wt[7], const enum LrEdgeFlags edges) \
static void BF(sgr_filter_5x5, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
const pixel (*const left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, \
const LooprestorationParams *const params, \
const enum LrEdgeFlags edges) \
{ \
if (!dav1d_sgr_params[sgr_idx][0]) { \
ALIGN_STK_32(coef, tmp, 64 * 384,); \
dav1d_sgr_filter1_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \
w, h, dav1d_sgr_params[sgr_idx][3], edges); \
dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, (1 << 7) - sgr_wt[1]); \
} else if (!dav1d_sgr_params[sgr_idx][1]) { \
ALIGN_STK_32(coef, tmp, 64 * 384,); \
dav1d_sgr_filter2_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \
w, h, dav1d_sgr_params[sgr_idx][2], edges); \
dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, sgr_wt[0]); \
} else { \
ALIGN_STK_32(coef, tmp1, 64 * 384,); \
ALIGN_STK_32(coef, tmp2, 64 * 384,); \
dav1d_sgr_filter2_##ext(tmp1, dst, dst_stride, left, lpf, lpf_stride, \
w, h, dav1d_sgr_params[sgr_idx][2], edges); \
dav1d_sgr_filter1_##ext(tmp2, dst, dst_stride, left, lpf, lpf_stride, \
w, h, dav1d_sgr_params[sgr_idx][3], edges); \
const uint32_t wt = ((128 - sgr_wt[0] - sgr_wt[1]) << 16) | (uint16_t) sgr_wt[0]; \
dav1d_sgr_weighted2_##ext(dst, dst_stride, tmp1, tmp2, w, h, wt); \
} \
ALIGN_STK_32(coef, tmp, 64 * 384,); \
BF(dav1d_sgr_filter2, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \
w, h, params->sgr.s0, edges); \
BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, params->sgr.w0); \
} \
static void BF(sgr_filter_3x3, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
const pixel (*const left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, \
const LooprestorationParams *const params, \
const enum LrEdgeFlags edges) \
{ \
ALIGN_STK_32(coef, tmp, 64 * 384,); \
BF(dav1d_sgr_filter1, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \
w, h, params->sgr.s1, edges); \
BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, params->sgr.w1); \
} \
static void BF(sgr_filter_mix, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
const pixel (*const left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, \
const LooprestorationParams *const params, \
const enum LrEdgeFlags edges) \
{ \
ALIGN_STK_32(coef, tmp1, 64 * 384,); \
ALIGN_STK_32(coef, tmp2, 64 * 384,); \
BF(dav1d_sgr_filter2, ext)(tmp1, dst, dst_stride, left, lpf, lpf_stride, \
w, h, params->sgr.s0, edges); \
BF(dav1d_sgr_filter1, ext)(tmp2, dst, dst_stride, left, lpf, lpf_stride, \
w, h, params->sgr.s1, edges); \
const uint32_t wt = (params->sgr.w1 << 16) | (uint16_t) params->sgr.w0; \
BF(dav1d_sgr_weighted2, ext)(dst, dst_stride, tmp1, tmp2, w, h, wt); \
}
#if BITDEPTH == 8
WIENER_FILTER(sse2)
WIENER_FILTER(ssse3)
SGR_FILTER(ssse3)
decl_wiener_filter_fns(sse2);
decl_wiener_filter_fns(ssse3);
SGR_FILTER_OLD(ssse3)
# if ARCH_X86_64
WIENER_FILTER(avx2)
SGR_FILTER(avx2)
decl_sgr_filter_fns(avx2)
# endif
#endif
#if ARCH_X86_64
decl_wiener_filter_fns(avx2);
#endif
COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
#if BITDEPTH == 8
c->wiener[0] = dav1d_wiener_filter7_sse2;
c->wiener[1] = dav1d_wiener_filter5_sse2;
c->wiener[0] = BF(dav1d_wiener_filter7, sse2);
c->wiener[1] = BF(dav1d_wiener_filter5, sse2);
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
#if BITDEPTH == 8
c->wiener[0] = dav1d_wiener_filter7_ssse3;
c->wiener[1] = dav1d_wiener_filter5_ssse3;
c->selfguided = sgr_filter_ssse3;
c->wiener[0] = BF(dav1d_wiener_filter7, ssse3);
c->wiener[1] = BF(dav1d_wiener_filter5, ssse3);
c->sgr[0] = BF(sgr_filter_5x5, ssse3);
c->sgr[1] = BF(sgr_filter_3x3, ssse3);
c->sgr[2] = BF(sgr_filter_mix, ssse3);
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
#if BITDEPTH == 8 && ARCH_X86_64
c->wiener[0] = dav1d_wiener_filter7_avx2;
c->wiener[1] = dav1d_wiener_filter5_avx2;
c->selfguided = sgr_filter_avx2;
#if ARCH_X86_64
c->wiener[0] = BF(dav1d_wiener_filter7, avx2);
c->wiener[1] = BF(dav1d_wiener_filter5, avx2);
# if BITDEPTH == 8
c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx2);
c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx2);
c->sgr[2] = BF(dav1d_sgr_filter_mix, avx2);
# endif
#endif
}
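For reference, not part of the upstream patch: the refactor above replaces the single c->selfguided entry point with a three-entry c->sgr[] table and moves the SGR strengths and weights into LooprestorationParams. A minimal stand-alone sketch of the resulting dispatch rule, using stand-in types and made-up parameter values rather than dav1d's own, could look like this:

#include <stdio.h>

/* Stand-in for the .sgr member of dav1d's LooprestorationParams. */
typedef struct { struct { unsigned s0, s1; int w0, w1; } sgr; } Params;

static void sgr_5x5(const Params *p) { printf("5x5: s0=%u w0=%d\n", p->sgr.s0, p->sgr.w0); }
static void sgr_3x3(const Params *p) { printf("3x3: s1=%u w1=%d\n", p->sgr.s1, p->sgr.w1); }
static void sgr_mix(const Params *p) { printf("mix: s0=%u s1=%u\n", p->sgr.s0, p->sgr.s1); }

int main(void) {
    /* Same ordering as c->sgr[]: 0 = 5x5 only, 1 = 3x3 only, 2 = mix. */
    void (*const sgr[3])(const Params *) = { sgr_5x5, sgr_3x3, sgr_mix };
    const Params p = { .sgr = { .s0 = 25, .s1 = 0, .w0 = 47, .w1 = 81 } };
    /* A zero strength disables that pass, mirroring the branch the removed
     * sgr_filter_##ext wrapper did on dav1d_sgr_params[sgr_idx][0]/[1]. */
    const int fn = p.sgr.s0 ? (p.sgr.s1 ? 2 : 0) : 1;
    sgr[fn](&p);
    return 0;
}

With the example values it selects sgr[0] and prints "5x5: s0=25 w0=47".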


@ -97,8 +97,8 @@ SECTION .text
%macro WIENER 0
%if ARCH_X86_64
DECLARE_REG_TMP 4, 10, 7, 11, 12, 13, 14 ; ring buffer pointers
cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
lpf_stride, w, edge, flt, h, x
cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
lpf_stride, w, edge, flt, h, x
%define base 0
mov fltq, fltmp
mov edged, r8m
@ -139,7 +139,7 @@ DECLARE_REG_TMP 4, 0, _, 5
%define m11 [stk+96]
%define stk_off 112
%endif
cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
%define base r6-pb_right_ext_mask-21
%define stk esp
%define dstq leftq
@ -245,7 +245,7 @@ cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
add lpfq, [rsp+gprsize*1]
call .hv_bottom
.v1:
call mangle(private_prefix %+ _wiener_filter7_ssse3).v
call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
RET
.no_top:
lea t3, [lpfq+lpf_strideq*4]
@ -281,9 +281,9 @@ cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
dec hd
jnz .main
.v3:
call mangle(private_prefix %+ _wiener_filter7_ssse3).v
call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
.v2:
call mangle(private_prefix %+ _wiener_filter7_ssse3).v
call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
jmp .v1
.extend_right:
movd m2, [lpfq-4]
@ -685,8 +685,8 @@ ALIGN function_align
%endif
%if ARCH_X86_64
cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
lpf_stride, w, edge, flt, h, x
cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
lpf_stride, w, edge, flt, h, x
mov fltq, fltmp
mov edged, r8m
mov wd, wm
@ -720,7 +720,7 @@ cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
%define m11 [stk+80]
%define stk_off 96
%endif
cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
cglobal wiener_filter5_8bpc, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
%define stk esp
%define leftmp [stk+28]
%define m8 [base+pw_m16380]
@ -827,14 +827,14 @@ cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
dec hd
jnz .main
.v2:
call mangle(private_prefix %+ _wiener_filter5_ssse3).v
call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
add dstq, dst_strideq
mov t4, t3
mov t3, t2
mov t2, t1
movifnidn dstmp, dstq
.v1:
call mangle(private_prefix %+ _wiener_filter5_ssse3).v
call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
jmp .end
.h:
%define stk esp+4
@ -873,7 +873,7 @@ cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
jnz .h_have_right
cmp xd, -17
jl .h_have_right
call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right
call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
.h_have_right:
%macro %%h5 0
%if cpuflag(ssse3)
@ -991,7 +991,7 @@ ALIGN function_align
jnz .hv_have_right
cmp xd, -17
jl .hv_have_right
call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right
call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
.hv_have_right:
%%h5
mova m2, [t3+xq*2]
@ -1161,7 +1161,7 @@ WIENER
%endmacro
%if ARCH_X86_64
cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
cglobal sgr_box3_h_8bpc, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
mov xlimd, edgem
movifnidn xd, xm
mov hd, hm
@ -1170,7 +1170,7 @@ cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
add xd, xlimd
xor xlimd, 2 ; 2*!have_right
%else
cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
cglobal sgr_box3_h_8bpc, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
%define wq r0m
%define xlimd r1m
%define hd hmp
@ -1287,10 +1287,10 @@ cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
RET
%if ARCH_X86_64
cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim
cglobal sgr_box3_v_8bpc, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim
movifnidn edged, edgem
%else
cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
cglobal sgr_box3_v_8bpc, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
%define sumsq_baseq dword [esp+0]
%define sum_baseq dword [esp+4]
%define ylimd dword [esp+8]
@ -1383,7 +1383,7 @@ cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
jl .loop_x
RET
cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s
cglobal sgr_calc_ab1_8bpc, 4, 7, 12, a, b, w, h, s
movifnidn sd, sm
sub aq, (384+16-1)*4
sub bq, (384+16-1)*2
@ -1463,8 +1463,8 @@ cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s
RET
%if ARCH_X86_64
cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
tmp_base, src_base, a_base, b_base, x, y
cglobal sgr_finish_filter1_8bpc, 5, 13, 16, t, src, stride, a, b, w, h, \
tmp_base, src_base, a_base, b_base, x, y
movifnidn wd, wm
mov hd, hm
mova m15, [pw_16]
@ -1474,7 +1474,7 @@ cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
mov b_baseq, bq
xor xd, xd
%else
cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y
cglobal sgr_finish_filter1_8bpc, 7, 7, 8, -144, t, src, stride, a, b, x, y
%define tmp_baseq [esp+8]
%define src_baseq [esp+12]
%define a_baseq [esp+16]
@ -1688,7 +1688,7 @@ cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y
jl .loop_x
RET
cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt
cglobal sgr_weighted1_8bpc, 4, 7, 8, dst, stride, t, w, h, wt
movifnidn hd, hm
%if ARCH_X86_32
SETUP_PIC r6, 0
@ -1726,14 +1726,14 @@ cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt
RET
%if ARCH_X86_64
cglobal sgr_box5_h, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim
cglobal sgr_box5_h_8bpc, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim
mov edged, edgem
movifnidn wd, wm
mov hd, hm
mova m10, [pb_0]
mova m11, [pb_0_1]
%else
cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
cglobal sgr_box5_h_8bpc, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
%define edgeb byte edgem
%define wd xd
%define wq wd
@ -1909,11 +1909,11 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
RET
%if ARCH_X86_64
cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
cglobal sgr_box5_v_8bpc, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
movifnidn edged, edgem
mov ylimd, edged
%else
cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
cglobal sgr_box5_v_8bpc, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
%define wm [esp+0]
%define hm [esp+4]
%define edgem [esp+8]
@ -2127,7 +2127,7 @@ cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
jmp .sum_loop_y_noload
%endif
cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s
cglobal sgr_calc_ab2_8bpc, 4, 7, 11, a, b, w, h, s
movifnidn sd, sm
sub aq, (384+16-1)*4
sub bq, (384+16-1)*2
@ -2205,7 +2205,7 @@ cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s
RET
%if ARCH_X86_64
cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \
cglobal sgr_finish_filter2_8bpc, 5, 13, 14, t, src, stride, a, b, w, h, \
tmp_base, src_base, a_base, b_base, x, y
movifnidn wd, wm
mov hd, hm
@ -2219,7 +2219,7 @@ cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \
psrlw m11, m12, 1 ; pw_128
pxor m13, m13
%else
cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y
cglobal sgr_finish_filter2_8bpc, 6, 7, 8, t, src, stride, a, b, x, y
%define tmp_baseq r0m
%define src_baseq r1m
%define a_baseq r3m
@ -2378,7 +2378,7 @@ cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y
RET
%undef t2
cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt
cglobal sgr_weighted2_8bpc, 4, 7, 12, dst, stride, t1, t2, w, h, wt
movifnidn wd, wm
movd m0, wtm
%if ARCH_X86_64

third_party/dav1d/src/x86/mc_avx2.asm (vendored)

@ -3825,9 +3825,8 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
pblendw m6, m7, 0xaa ; 67 89
pmulhrsw m6, m12
paddd m4, m5
vpblendd m0, m1, m6, 0x0f
vperm2i128 m0, m1, m6, 0x21 ; 45 67
mova m1, m6
vpermq m0, m0, q1032 ; 45 67
pmaddwd m6, m0, m10
pmaddwd m7, m1, m11
paddd m4, m13

third_party/dav1d/src/x86/msac.asm (vendored)

@ -153,6 +153,7 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6
.renorm4:
bsr ecx, t2d
xor ecx, 15 ; d
.renorm5:
shl t2d, cl
shl t4, cl
mov [t7+msac.rng], t2d
@ -413,13 +414,20 @@ cglobal msac_decode_bool_equi, 0, 6, 0
sub t2d, t1d ; r - v
sub t4, rax ; dif - vw
cmovb t2d, t1d
mov t1d, [t0+msac.cnt]
cmovb t4, t3
movifnidn t7, t0
mov ecx, 0xbfff
setb al ; the upper 32 bits contain garbage but that's OK
sub ecx, t2d
not t4
; In the case of this function, (d =) 16 - clz(v) = 2 - (v >> 14)
; i.e. (0 <= d <= 2) and v < (3 << 14)
shr ecx, 14 ; d
%if ARCH_X86_64 == 0
movzx eax, al
%endif
jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3
jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm5
cglobal msac_decode_bool, 0, 6, 0
movifnidn t0, r0mp
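A side note, not part of the patch: the shortcut above for msac_decode_bool_equi leans on a small arithmetic identity. Because v here stays below 3 << 14, the renormalization shift d can be computed as (0xbfff - v) >> 14, which is the 2 - (v >> 14) form mentioned in the comment, avoiding the bsr/xor of the generic renorm path. A throwaway C check of that identity over the stated range:

#include <assert.h>

int main(void) {
    /* The asm computes d as: mov ecx, 0xbfff; sub ecx, v; shr ecx, 14.
     * Confirm it equals 2 - (v >> 14) for every v below 3 << 14. */
    for (unsigned v = 0; v < (3u << 14); v++)
        assert(((0xbfffu - v) >> 14) == (2u - (v >> 14)));
    return 0;
}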

third_party/dav1d/tests/checkasm/checkasm.h (vendored)

@ -115,7 +115,7 @@ int float_near_abs_eps_array_ulp(const float *a, const float *b, float eps,
#if HAVE_ASM
#if ARCH_X86
#ifdef _MSC_VER
#if defined(_MSC_VER) && !defined(__clang__)
#include <intrin.h>
#define readtime() (_mm_lfence(), __rdtsc())
#else

third_party/dav1d/tests/checkasm/itx.c (vendored)

@ -138,14 +138,21 @@ static int copy_subcoefs(coef *coeff,
* dimensions are non-zero. This leads to branching to specific optimized
* simd versions (e.g. dc-only) so that we get full asm coverage in this
* test */
const uint16_t *const scan = dav1d_scans[tx][dav1d_tx_type_class[txtp]];
const enum TxClass tx_class = dav1d_tx_type_class[txtp];
const uint16_t *const scan = dav1d_scans[tx];
const int sub_high = subsh > 0 ? subsh * 8 - 1 : 0;
const int sub_low = subsh > 1 ? sub_high - 8 : 0;
int n, eob;
for (n = 0, eob = 0; n < sw * sh; n++) {
const int rc = scan[n];
const int rcx = rc % sh, rcy = rc / sh;
int rc, rcx, rcy;
if (tx_class == TX_CLASS_2D)
rc = scan[n], rcx = rc % sh, rcy = rc / sh;
else if (tx_class == TX_CLASS_H)
rcx = n % sh, rcy = n / sh, rc = n;
else /* tx_class == TX_CLASS_V */
rcx = n / sw, rcy = n % sw, rc = rcy * sh + rcx;
/* Pick a random eob within this sub-itx */
if (rcx > sub_high || rcy > sub_high) {
@ -156,8 +163,18 @@ static int copy_subcoefs(coef *coeff,
if (eob)
eob += rnd() % (n - eob - 1);
for (n = eob + 1; n < sw * sh; n++)
coeff[scan[n]] = 0;
if (tx_class == TX_CLASS_2D)
for (n = eob + 1; n < sw * sh; n++)
coeff[scan[n]] = 0;
else if (tx_class == TX_CLASS_H)
for (n = eob + 1; n < sw * sh; n++)
coeff[n] = 0;
else /* tx_class == TX_CLASS_V */ {
for (int rcx = eob / sw, rcy = eob % sw; rcx < sh; rcx++, rcy = -1)
while (++rcy < sw)
coeff[rcy * sh + rcx] = 0;
n = sw * sh;
}
for (; n < 32 * 32; n++)
coeff[n] = rnd();
return eob;


@ -41,24 +41,30 @@ static int to_binary(int x) { /* 0-15 -> 0000-1111 */
static void init_tmp(pixel *buf, const ptrdiff_t stride,
const int w, const int h, const int bitdepth_max)
{
const int noise_mask = bitdepth_max >> 4;
const int x_off = rnd() & 7, y_off = rnd() & 7;
for (int y = 0; y < h; y++) {
for (int x = 0; x < w; x++)
buf[x] = rnd() & bitdepth_max;
for (int x = 0; x < w; x++) {
buf[x] = (((x + x_off) ^ (y + y_off)) & 8 ? bitdepth_max : 0) ^
(rnd() & noise_mask);
}
buf += PXSTRIDE(stride);
}
}
static void check_wiener(Dav1dLoopRestorationDSPContext *const c, const int bpc) {
ALIGN_STK_64(pixel, c_dst, 448 * 64,);
ALIGN_STK_64(pixel, a_dst, 448 * 64,);
ALIGN_STK_64(pixel, h_edge, 448 * 8,);
ALIGN_STK_16(int16_t, filter, 2, [8]);
ALIGN_STK_64(pixel, c_src, 448 * 64,), *const c_dst = c_src + 32;
ALIGN_STK_64(pixel, a_src, 448 * 64,), *const a_dst = a_src + 32;
ALIGN_STK_64(pixel, edge_buf, 448 * 8,), *const h_edge = edge_buf + 32;
pixel left[64][4];
LooprestorationParams params;
int16_t (*const filter)[8] = params.filter;
declare_func(void, pixel *dst, ptrdiff_t dst_stride,
const pixel (*const left)[4],
const pixel *lpf, ptrdiff_t lpf_stride,
int w, int h, const int16_t filter[2][8],
int w, int h, const LooprestorationParams *params,
enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);
for (int t = 0; t < 2; t++) {
@ -80,24 +86,24 @@ static void check_wiener(Dav1dLoopRestorationDSPContext *const c, const int bpc)
const int base_h = 1 + (rnd() & 63);
const int bitdepth_max = (1 << bpc) - 1;
init_tmp(c_dst, 448 * sizeof(pixel), 448, 64, bitdepth_max);
init_tmp(h_edge, 448 * sizeof(pixel), 448, 8, bitdepth_max);
init_tmp(c_src, 448 * sizeof(pixel), 448, 64, bitdepth_max);
init_tmp(edge_buf, 448 * sizeof(pixel), 448, 8, bitdepth_max);
init_tmp((pixel *) left, 4 * sizeof(pixel), 4, 64, bitdepth_max);
for (enum LrEdgeFlags edges = 0; edges <= 0xf; edges++) {
const int w = edges & LR_HAVE_RIGHT ? 256 : base_w;
const int h = edges & LR_HAVE_BOTTOM ? 64 : base_h;
memcpy(a_dst, c_dst, 448 * 64 * sizeof(pixel));
memcpy(a_src, c_src, 448 * 64 * sizeof(pixel));
call_ref(c_dst + 32, 448 * sizeof(pixel), left,
h_edge + 32, 448 * sizeof(pixel),
w, h, filter, edges HIGHBD_TAIL_SUFFIX);
call_new(a_dst + 32, 448 * sizeof(pixel), left,
h_edge + 32, 448 * sizeof(pixel),
w, h, filter, edges HIGHBD_TAIL_SUFFIX);
if (checkasm_check_pixel(c_dst + 32, 448 * sizeof(pixel),
a_dst + 32, 448 * sizeof(pixel),
call_ref(c_dst, 448 * sizeof(pixel), left,
h_edge, 448 * sizeof(pixel),
w, h, &params, edges HIGHBD_TAIL_SUFFIX);
call_new(a_dst, 448 * sizeof(pixel), left,
h_edge, 448 * sizeof(pixel),
w, h, &params, edges HIGHBD_TAIL_SUFFIX);
if (checkasm_check_pixel(c_dst, 448 * sizeof(pixel),
a_dst, 448 * sizeof(pixel),
w, h, "dst"))
{
fprintf(stderr, "size = %dx%d, edges = %04d\n",
@ -105,63 +111,72 @@ static void check_wiener(Dav1dLoopRestorationDSPContext *const c, const int bpc)
break;
}
}
bench_new(a_dst + 32, 448 * sizeof(pixel), left,
h_edge + 32, 448 * sizeof(pixel),
256, 64, filter, 0xf HIGHBD_TAIL_SUFFIX);
bench_new(a_dst, 448 * sizeof(pixel), left,
h_edge, 448 * sizeof(pixel),
256, 64, &params, 0xf HIGHBD_TAIL_SUFFIX);
}
}
}
static void check_sgr(Dav1dLoopRestorationDSPContext *const c, const int bpc) {
ALIGN_STK_64(pixel, c_dst, 448 * 64,);
ALIGN_STK_64(pixel, a_dst, 448 * 64,);
ALIGN_STK_64(pixel, h_edge, 448 * 8,);
ALIGN_STK_64(pixel, c_src, 448 * 64,), *const c_dst = c_src + 32;
ALIGN_STK_64(pixel, a_src, 448 * 64,), *const a_dst = a_src + 32;
ALIGN_STK_64(pixel, edge_buf, 448 * 8,), *const h_edge = edge_buf + 32;
pixel left[64][4];
LooprestorationParams params;
declare_func(void, pixel *dst, ptrdiff_t dst_stride,
const pixel (*const left)[4],
const pixel *lpf, ptrdiff_t lpf_stride,
int w, int h, int sgr_idx,
const int16_t sgr_wt[7], enum LrEdgeFlags edges
HIGHBD_DECL_SUFFIX);
int w, int h, const LooprestorationParams *params,
enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);
for (int sgr_idx = 14; sgr_idx >= 6; sgr_idx -= 4) {
if (check_func(c->selfguided, "selfguided_%s_%dbpc",
sgr_idx == 6 ? "mix" : sgr_idx == 10 ? "3x3" : "5x5", bpc))
{
int16_t sgr_wt[2];
static const struct { char name[4]; uint8_t idx; } sgr_data[3] = {
{ "5x5", 14 },
{ "3x3", 10 },
{ "mix", 0 },
};
sgr_wt[0] = dav1d_sgr_params[sgr_idx][0] ? (rnd() & 127) - 96 : 0;
sgr_wt[1] = dav1d_sgr_params[sgr_idx][1] ? (rnd() & 127) - 32 :
iclip(128 - sgr_wt[0], -32, 95);
for (int i = 0; i < 3; i++) {
if (check_func(c->sgr[i], "sgr_%s_%dbpc", sgr_data[i].name, bpc)) {
const uint16_t *const sgr_params = dav1d_sgr_params[sgr_data[i].idx];
params.sgr.s0 = sgr_params[0];
params.sgr.s1 = sgr_params[1];
params.sgr.w0 = sgr_params[0] ? (rnd() & 127) - 96 : 0;
params.sgr.w1 = (sgr_params[1] ? 160 - (rnd() & 127) : 33) - params.sgr.w0;
const int base_w = 1 + (rnd() % 384);
const int base_h = 1 + (rnd() & 63);
const int bitdepth_max = (1 << bpc) - 1;
init_tmp(c_dst, 448 * sizeof(pixel), 448, 64, bitdepth_max);
init_tmp(h_edge, 448 * sizeof(pixel), 448, 8, bitdepth_max);
init_tmp(c_src, 448 * sizeof(pixel), 448, 64, bitdepth_max);
init_tmp(edge_buf, 448 * sizeof(pixel), 448, 8, bitdepth_max);
init_tmp((pixel *) left, 4 * sizeof(pixel), 4, 64, bitdepth_max);
for (enum LrEdgeFlags edges = 0; edges <= 0xf; edges++) {
const int w = edges & LR_HAVE_RIGHT ? 256 : base_w;
const int h = edges & LR_HAVE_BOTTOM ? 64 : base_h;
memcpy(a_dst, c_dst, 448 * 64 * sizeof(pixel));
memcpy(a_src, c_src, 448 * 64 * sizeof(pixel));
call_ref(c_dst + 32, 448 * sizeof(pixel), left,
h_edge + 32, 448 * sizeof(pixel),
w, h, sgr_idx, sgr_wt, edges HIGHBD_TAIL_SUFFIX);
call_new(a_dst + 32, 448 * sizeof(pixel), left,
h_edge + 32, 448 * sizeof(pixel),
w, h, sgr_idx, sgr_wt, edges HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel(c_dst + 32, 448 * sizeof(pixel),
a_dst + 32, 448 * sizeof(pixel),
w, h, "dst");
call_ref(c_dst, 448 * sizeof(pixel), left,
h_edge, 448 * sizeof(pixel),
w, h, &params, edges HIGHBD_TAIL_SUFFIX);
call_new(a_dst, 448 * sizeof(pixel), left,
h_edge, 448 * sizeof(pixel),
w, h, &params, edges HIGHBD_TAIL_SUFFIX);
if (checkasm_check_pixel(c_dst, 448 * sizeof(pixel),
a_dst, 448 * sizeof(pixel),
w, h, "dst"))
{
fprintf(stderr, "size = %dx%d, edges = %04d\n",
w, h, to_binary(edges));
break;
}
}
bench_new(a_dst + 32, 448 * sizeof(pixel), left,
h_edge + 32, 448 * sizeof(pixel),
256, 64, sgr_idx, sgr_wt, 0xf HIGHBD_TAIL_SUFFIX);
bench_new(a_dst, 448 * sizeof(pixel), left,
h_edge, 448 * sizeof(pixel),
256, 64, &params, 0xf HIGHBD_TAIL_SUFFIX);
}
}
}


@ -193,7 +193,6 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
}
cleanup:
dav1d_flush(ctx);
dav1d_close(&ctx);
end:
return 0;

third_party/dav1d/tests/meson.build (vendored)

@ -76,8 +76,6 @@ if is_asm_enabled
checkasm_sources += checkasm_asm_sources
endif
m_lib = cc.find_library('m', required: false)
checkasm = executable('checkasm',
checkasm_sources,
checkasm_asm_objs,
@ -94,7 +92,7 @@ if is_asm_enabled
thread_dependency,
rt_dependency,
libdl_dependency,
m_lib,
libm_dependency,
],
)
@ -127,6 +125,26 @@ endforeach
# fuzzing binaries
subdir('libfuzzer')
# seek stress test binary, depends on dav1d cli tool
if get_option('enable_tools')
seek_stress_sources = files('seek_stress.c')
seek_stress = executable('seek_stress',
seek_stress_sources, rev_target,
objects: [
dav1d.extract_objects('dav1d_cli_parse.c'),
dav1d_input_objs.extract_objects('input/input.c', 'input/ivf.c'),
],
include_directories: [dav1d_inc_dirs, include_directories('../tools')],
link_with: libdav1d,
dependencies: [
thread_dependency,
rt_dependency,
getopt_dependency,
libm_dependency,
],
)
endif
# Include dav1d test data repository with additional tests
if get_option('testdata_tests')
subdir('dav1d-test-data')

third_party/dav1d/tests/seek_stress.c (vendored, new file)

@ -0,0 +1,243 @@
/*
* Copyright © 2020, VideoLAN and dav1d authors
* Copyright © 2020, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "vcs_version.h"
#include "cli_config.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "dav1d/dav1d.h"
#include "input/input.h"
#include "input/demuxer.h"
#include "dav1d_cli_parse.h"
#define NUM_RAND_SEEK 3
#define NUM_REL_SEEK 4
#define NUM_END_SEEK 2
const Demuxer annexb_demuxer = { .name = "" };
const Demuxer section5_demuxer = { .name = "" };
#ifdef _WIN32
#include <windows.h>
static unsigned get_seed(void) {
return GetTickCount();
}
#else
#ifdef __APPLE__
#include <mach/mach_time.h>
#else
#include <time.h>
#endif
static unsigned get_seed(void) {
#ifdef __APPLE__
return (unsigned) mach_absolute_time();
#elif defined(HAVE_CLOCK_GETTIME)
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return (unsigned) (1000000000ULL * ts.tv_sec + ts.tv_nsec);
#endif
}
#endif
static uint32_t xs_state[4];
static void xor128_srand(unsigned seed) {
xs_state[0] = seed;
xs_state[1] = ( seed & 0xffff0000) | (~seed & 0x0000ffff);
xs_state[2] = (~seed & 0xffff0000) | ( seed & 0x0000ffff);
xs_state[3] = ~seed;
}
// xor128 from Marsaglia, George (July 2003). "Xorshift RNGs".
// Journal of Statistical Software. 8 (14).
// doi:10.18637/jss.v008.i14.
static int xor128_rand(void) {
const uint32_t x = xs_state[0];
const uint32_t t = x ^ (x << 11);
xs_state[0] = xs_state[1];
xs_state[1] = xs_state[2];
xs_state[2] = xs_state[3];
uint32_t w = xs_state[3];
w = (w ^ (w >> 19)) ^ (t ^ (t >> 8));
xs_state[3] = w;
return w >> 1;
}
static inline int decode_frame(Dav1dPicture *const p,
Dav1dContext *const c, Dav1dData *const data)
{
int res;
memset(p, 0, sizeof(*p));
if ((res = dav1d_send_data(c, data)) < 0) {
if (res != DAV1D_ERR(EAGAIN)) {
fprintf(stderr, "Error decoding frame: %s\n",
strerror(DAV1D_ERR(res)));
return res;
}
}
if ((res = dav1d_get_picture(c, p)) < 0) {
if (res != DAV1D_ERR(EAGAIN)) {
fprintf(stderr, "Error decoding frame: %s\n",
strerror(DAV1D_ERR(res)));
return res;
}
} else dav1d_picture_unref(p);
return 0;
}
static int decode_rand(DemuxerContext *const in, Dav1dContext *const c,
Dav1dData *const data, const double fps)
{
int res = 0;
Dav1dPicture p;
const int num_frames = xor128_rand() % (int)(fps * 5);
for (int i = 0; i < num_frames; i++) {
if ((res = decode_frame(&p, c, data))) break;
if (input_read(in, data) || data->sz == 0) break;
}
return res;
}
static int decode_all(DemuxerContext *const in,
Dav1dContext *const c, Dav1dData *const data)
{
int res = 0;
Dav1dPicture p;
do { if ((res = decode_frame(&p, c, data))) break;
} while (!input_read(in, data) && data->sz > 0);
return res;
}
static int seek(DemuxerContext *const in, Dav1dContext *const c,
const uint64_t pts, Dav1dData *const data)
{
int res;
if ((res = input_seek(in, pts))) return res;
Dav1dSequenceHeader seq;
do { if ((res = input_read(in, data))) break;
} while (dav1d_parse_sequence_header(&seq, data->data, data->sz));
dav1d_flush(c);
return res;
}
int main(const int argc, char *const *const argv) {
const char *version = dav1d_version();
if (strcmp(version, DAV1D_VERSION)) {
fprintf(stderr, "Version mismatch (library: %s, executable: %s)\n",
version, DAV1D_VERSION);
return EXIT_FAILURE;
}
CLISettings cli_settings;
Dav1dSettings lib_settings;
DemuxerContext *in;
Dav1dContext *c;
Dav1dData data;
unsigned total, i_fps[2], i_timebase[2];
double timebase, spf, fps;
uint64_t pts;
xor128_srand(get_seed());
parse(argc, argv, &cli_settings, &lib_settings);
if (input_open(&in, "ivf", cli_settings.inputfile,
i_fps, &total, i_timebase) < 0 ||
!i_timebase[0] || !i_timebase[1] || !i_fps[0] || !i_fps[1])
{
return EXIT_SUCCESS;
}
if (dav1d_open(&c, &lib_settings))
return EXIT_FAILURE;
timebase = (double)i_timebase[1] / i_timebase[0];
spf = (double)i_fps[1] / i_fps[0];
fps = (double)i_fps[0] / i_fps[1];
if (fps < 1) goto end;
#define FRAME_OFFSET_TO_PTS(foff) \
(uint64_t)llround(((foff) * spf) * 1000000000.0)
#define TS_TO_PTS(ts) \
(uint64_t)llround(((ts) * timebase) * 1000000000.0)
// seek at random pts
for (int i = 0; i < NUM_RAND_SEEK; i++) {
pts = FRAME_OFFSET_TO_PTS(xor128_rand() % total);
if (seek(in, c, pts, &data)) continue;
if (decode_rand(in, c, &data, fps)) goto end;
}
pts = TS_TO_PTS(data.m.timestamp);
// seek left / right randomly with random intervals within 1s
for (int i = 0, tries = 0;
i - tries < NUM_REL_SEEK && tries < NUM_REL_SEEK / 2;
i++)
{
const int sign = xor128_rand() & 1 ? -1 : +1;
const float diff = (xor128_rand() % 100) / 100.f;
int64_t new_pts = pts + sign * FRAME_OFFSET_TO_PTS(diff * fps);
const int64_t new_ts = llround(new_pts / (timebase * 1000000000.0));
new_pts = TS_TO_PTS(new_ts);
if (new_pts < 0 || (uint64_t)new_pts >= FRAME_OFFSET_TO_PTS(total)) {
if (seek(in, c, FRAME_OFFSET_TO_PTS(total / 2), &data)) break;
pts = TS_TO_PTS(data.m.timestamp);
tries++;
continue;
}
if (seek(in, c, new_pts, &data))
if (seek(in, c, 0, &data)) goto end;
if (decode_rand(in, c, &data, fps)) goto end;
pts = TS_TO_PTS(data.m.timestamp);
}
unsigned shift = 0;
do {
shift += 5;
if (shift > total)
shift = total;
} while (seek(in, c, FRAME_OFFSET_TO_PTS(total - shift), &data));
// simulate seeking after the end of the file
for (int i = 0; i < NUM_END_SEEK; i++) {
if (seek(in, c, FRAME_OFFSET_TO_PTS(total - shift), &data)) goto end;
if (decode_all(in, c, &data)) goto end;
int num_flush = 1 + 64 + xor128_rand() % 64;
while (num_flush--) dav1d_flush(c);
}
end:
input_close(in);
dav1d_close(&c);
return EXIT_SUCCESS;
}
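For orientation, with assumed numbers that are not taken from the patch: FRAME_OFFSET_TO_PTS and TS_TO_PTS above convert a frame offset or a container timestamp into a nanosecond pts. For a 24 fps stream (i_fps = {24, 1}) with a 90 kHz timebase (i_timebase = {90000, 1}), frame offset 48 and timestamp 180000 both land on 2000000000 ns, i.e. two seconds in:

#include <inttypes.h>
#include <math.h>
#include <stdio.h>

int main(void) {
    const double spf      = 1.0 / 24.0;    /* i_fps[1] / i_fps[0] */
    const double timebase = 1.0 / 90000.0; /* i_timebase[1] / i_timebase[0] */
    const uint64_t pts_from_frame = (uint64_t)llround((48 * spf) * 1000000000.0);
    const uint64_t pts_from_ts    = (uint64_t)llround((180000 * timebase) * 1000000000.0);
    /* Both print 2000000000. */
    printf("%" PRIu64 " %" PRIu64 "\n", pts_from_frame, pts_from_ts);
    return 0;
}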

third_party/dav1d/tools/dav1d.c (vendored)

@ -197,7 +197,6 @@ int main(const int argc, char *const *const argv) {
seq_skip);
}
//getc(stdin);
if (cli_settings.limit != 0 && cli_settings.limit < total)
total = cli_settings.limit;

third_party/dav1d/tools/dav1d_cli_parse.c (vendored)

@ -26,6 +26,7 @@
*/
#include "config.h"
#include "cli_config.h"
#include <getopt.h>
#include <limits.h>
@ -51,6 +52,7 @@ enum {
ARG_REALTIME_CACHE,
ARG_FRAME_THREADS,
ARG_TILE_THREADS,
ARG_POSTFILTER_THREADS,
ARG_VERIFY,
ARG_FILM_GRAIN,
ARG_OPPOINT,
@ -73,6 +75,7 @@ static const struct option long_opts[] = {
{ "realtimecache", 1, NULL, ARG_REALTIME_CACHE },
{ "framethreads", 1, NULL, ARG_FRAME_THREADS },
{ "tilethreads", 1, NULL, ARG_TILE_THREADS },
{ "pfthreads", 1, NULL, ARG_POSTFILTER_THREADS },
{ "verify", 1, NULL, ARG_VERIFY },
{ "filmgrain", 1, NULL, ARG_FILM_GRAIN },
{ "oppoint", 1, NULL, ARG_OPPOINT },
@ -82,6 +85,12 @@ static const struct option long_opts[] = {
{ NULL, 0, NULL, 0 },
};
#if HAVE_XXHASH_H
#define AVAILABLE_MUXERS "'md5', 'xxh3', 'yuv', 'yuv4mpeg2' or 'null'"
#else
#define AVAILABLE_MUXERS "'md5', 'yuv', 'yuv4mpeg2' or 'null'"
#endif
#if ARCH_AARCH64 || ARCH_ARM
#define ALLOWED_CPU_MASKS " or 'neon'"
#elif ARCH_PPC64LE
@ -107,7 +116,7 @@ static void usage(const char *const app, const char *const reason, ...) {
" --input/-i $file: input file\n"
" --output/-o $file: output file\n"
" --demuxer $name: force demuxer type ('ivf', 'section5' or 'annexb'; default: detect from content)\n"
" --muxer $name: force muxer type ('md5', 'yuv', 'yuv4mpeg2' or 'null'; default: detect from extension)\n"
" --muxer $name: force muxer type (" AVAILABLE_MUXERS "; default: detect from extension)\n"
" --quiet/-q: disable status messages\n"
" --frametimes $file: dump frame times to file\n"
" --limit/-l $num: stop decoding after $num frames\n"
@ -117,7 +126,8 @@ static void usage(const char *const app, const char *const reason, ...) {
" --version/-v: print version and exit\n"
" --framethreads $num: number of frame threads (default: 1)\n"
" --tilethreads $num: number of tile threads (default: 1)\n"
" --filmgrain $num: enable film grain application (default: 1, except if muxer is md5)\n"
" --pfthreads $num: number of postfilter threads (default: 1)\n"
" --filmgrain $num: enable film grain application (default: 1, except if muxer is md5 or xxh3)\n"
" --oppoint $num: select an operating point of a scalable AV1 bitstream (0 - 31)\n"
" --alllayers $num: output all spatial layers of a scalable AV1 bitstream (default: 1)\n"
" --sizelimit $num: stop decoding if the frame size exceeds the specified limit\n"
@ -198,24 +208,26 @@ static const EnumParseTable cpu_mask_tbl[] = {
{ "avx2", X86_CPU_MASK_AVX2 },
{ "avx512icl", X86_CPU_MASK_AVX512ICL },
#endif
{ 0 },
{ "none", 0 },
};
#define ARRAY_SIZE(n) (sizeof(n)/sizeof(*(n)))
static unsigned parse_enum(char *optarg, const EnumParseTable *const tbl,
const int option, const char *app)
const int tbl_sz, const int option, const char *app)
{
char str[1024];
strcpy(str, "any of ");
for (int n = 0; tbl[n].str; n++) {
for (int n = 0; n < tbl_sz; n++) {
if (!strcmp(tbl[n].str, optarg))
return tbl[n].val;
if (n) {
if (!tbl[n + 1].str)
strcat(str, " or ");
else
if (n < tbl_sz - 1)
strcat(str, ", ");
else
strcat(str, " or ");
}
strcat(str, tbl[n].str);
}
@ -295,6 +307,10 @@ void parse(const int argc, char *const *const argv,
lib_settings->n_tile_threads =
parse_unsigned(optarg, ARG_TILE_THREADS, argv[0]);
break;
case ARG_POSTFILTER_THREADS:
lib_settings->n_postfilter_threads =
parse_unsigned(optarg, ARG_POSTFILTER_THREADS, argv[0]);
break;
case ARG_VERIFY:
cli_settings->verify = optarg;
break;
@ -325,7 +341,7 @@ void parse(const int argc, char *const *const argv,
fprintf(stderr, "%s\n", dav1d_version());
exit(0);
case ARG_CPU_MASK:
dav1d_set_cpu_flags_mask(parse_enum(optarg, cpu_mask_tbl,
dav1d_set_cpu_flags_mask(parse_enum(optarg, cpu_mask_tbl, ARRAY_SIZE(cpu_mask_tbl),
ARG_CPU_MASK, argv[0]));
break;
default:
@ -338,8 +354,11 @@ void parse(const int argc, char *const *const argv,
if (cli_settings->verify) {
if (cli_settings->outputfile)
usage(argv[0], "Verification (--verify) requires output file (-o/--output) to not set");
if (cli_settings->muxer && !strcmp(cli_settings->muxer, "md5"))
usage(argv[0], "Verification (--verify) requires the md5 muxer (--muxer md5)");
if (cli_settings->muxer && strcmp(cli_settings->muxer, "md5") &&
strcmp(cli_settings->muxer, "xxh3"))
{
usage(argv[0], "Verification (--verify) requires a checksum muxer (md5 or xxh3)");
}
cli_settings->outputfile = "-";
if (!cli_settings->muxer)
@ -347,7 +366,8 @@ void parse(const int argc, char *const *const argv,
}
if (!grain_specified && cli_settings->muxer &&
!strcmp(cli_settings->muxer, "md5"))
(!strcmp(cli_settings->muxer, "md5") ||
!strcmp(cli_settings->muxer, "xxh3")))
{
lib_settings->apply_grain = 0;
}

third_party/dav1d/tools/input/annexb.c (vendored)

@ -191,5 +191,6 @@ const Demuxer annexb_demuxer = {
.probe_sz = PROBE_SIZE,
.open = annexb_open,
.read = annexb_read,
.seek = NULL,
.close = annexb_close,
};

Some files were not shown because too many files changed in this diff.