Bug 1588123 - Update libdav1d to head; r=TD-Linux

This updates libdav1d to commit 5595102721d3c298d7cee64e64878486a3b8bdad. Differential Revision: https://phabricator.services.mozilla.com/D50205 --HG-- rename : third_party/dav1d/snap/snapcraft.yaml => third_party/dav1d/package/snap/snapcraft.yaml extra : moz-landing-system : lando
2019-10-24 18:58:15 +00:00 · 2019-10-24 18:58:15 +00:00 · 8f20970320
--- a/media/libdav1d/asm/moz.build
+++ b/media/libdav1d/asm/moz.build
@ -153,6 +153,7 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64':
    relative_path = '../../../third_party/dav1d/src/arm/'
    bitdepth_basenames = [
        'cdef_init_tmpl.c',
+        'ipred_init_tmpl.c',
        'itx_init_tmpl.c',
        'loopfilter_init_tmpl.c',
        'looprestoration_init_tmpl.c',
@ -191,6 +192,7 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64':
    if CONFIG['CPU_ARCH'] == 'aarch64':
        SOURCES += [
            '../../../third_party/dav1d/src/arm/64/cdef.S',
+            '../../../third_party/dav1d/src/arm/64/ipred.S',
            '../../../third_party/dav1d/src/arm/64/itx.S',
            '../../../third_party/dav1d/src/arm/64/loopfilter.S',
            '../../../third_party/dav1d/src/arm/64/looprestoration.S',
@ -199,6 +201,7 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64':
        ]
    elif CONFIG['CPU_ARCH'] == 'arm':
        SOURCES += [
+            '../../../third_party/dav1d/src/arm/32/cdef.S',
            '../../../third_party/dav1d/src/arm/32/looprestoration.S',
            '../../../third_party/dav1d/src/arm/32/mc.S',
        ]
--- a/media/libdav1d/moz.yaml
+++ b/media/libdav1d/moz.yaml
@ -20,7 +20,7 @@ origin:

  # Human-readable identifier for this version/release
  # Generally "version NNN", "tag SSS", "bookmark SSS"
-  release: commit c0865f35c74bdcc71021630f64dca2db35d2bc8c (2019-09-19T12:07:23.000+02:00).
+  release: commit 5595102721d3c298d7cee64e64878486a3b8bdad (2019-10-22T19:50:25.000+02:00).

  # The package's license, where possible using the mnemonic from
  # https://spdx.org/licenses/
--- a/media/libdav1d/vcs_version.h
+++ b/media/libdav1d/vcs_version.h
@ -1,2 +1,2 @@
 /* auto-generated, do not edit */
-#define DAV1D_VERSION "0.4.0-49-gc0865f3"
+#define DAV1D_VERSION "0.5.0-6-g5595102"
--- a/third_party/dav1d/.gitlab-ci.yml
+++ b/third_party/dav1d/.gitlab-ci.yml
@ -178,7 +178,7 @@ build-debian-aarch64:
        - aarch64
        - debian
    script:
-        - meson build --buildtype release --werror
+        - meson build --buildtype debugoptimized --werror
        - ninja -C build
        - cd build && meson test -v

@ -219,7 +219,7 @@ build-debian-armv7:
        - armv7
        - debian
    script:
-        - meson build --buildtype release --werror
+        - meson build --buildtype debugoptimized --werror
        - ninja -C build
        - cd build && meson test -v

@ -241,7 +241,7 @@ build-ubuntu-snap:
        - debian
        - amd64
    script:
-        - snapcraft snap
+        - cd package/snap && snapcraft snap
        - |
           if [ "$CI_PROJECT_NAMESPACE" = "videolan" ]; then
            echo $SNAP_LOGIN | base64 --decode | snapcraft login --with -
@ -251,7 +251,7 @@ build-ubuntu-snap:
    artifacts:
        name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
        paths:
-            - dav1d_*.snap
+            - package/snap/dav1d_*.snap
        expire_in: 1 week
    allow_failure: true

--- a/third_party/dav1d/NEWS
+++ b/third_party/dav1d/NEWS
@ -1,3 +1,25 @@
+Changes for 0.5.0 'Asiatic Cheetah':
+----------------------------
+
+0.5.0 is a medium release fixing regressions and minor issues,
+and improving speed significantly:
+ - Export ITU T.35 metadata
+ - Speed improvements on blend_ on ARM
+ - Speed improvements on decode_coef and MSAC
+ - NEON optimizations for blend*, w_mask_, ipred functions for ARM64
+ - NEON optimizations for CDEF and warp on ARM32
+ - SSE2 optimizations for MSAC hi_tok decoding
+ - SSSE3 optimizations for deblocking loopfilters and warp_affine
+ - AVX-2 optimizations for film grain and ipred_z2
+ - SSE4 optimizations for warp_affine
+ - VSX optimizations for wiener
+ - Fix inverse transform overflows in x86 and NEON asm
+ - Fix integer overflows with large frames
+ - Improve film grain generation to match reference code
+ - Improve compatibility with older binutils for ARM
+ - More advanced Player example in tools
+
+
 Changes for 0.4.0 'Cheetah':
 ----------------------------

@ -11,6 +33,7 @@ Changes for 0.4.0 'Cheetah':
 - NEON optimizations for blend functions on ARM
 - NEON optimizations for w_mask functions on ARM
 - NEON optimizations for inverse transforms on ARM64
+ - VSX optimizations for CDEF filter
 - Improve handling of malloc failures
 - Simple Player example in tools

@ -38,7 +61,7 @@ Changes for 0.2.2 (0.3.0-rc) 'Antelope':
 - Large improvement on MSAC decoding with SSE, bringing 4-6% speed increase
   The impact is important on SSSE3, SSE4 and AVX-2 cpus
 - SSSE3 optimizations for all blocks size in itx
- - SSSE3 optimizations for ipred_paeth and ipref_cfl (420, 422 and 444)
+ - SSSE3 optimizations for ipred_paeth and ipred_cfl (420, 422 and 444)
 - Speed improvements on CDEF for SSE4 CPUs
 - NEON optimizations for SGR and loop filter
 - Minor crashes, improvements and build changes
--- a/third_party/dav1d/README.md
+++ b/third_party/dav1d/README.md
@ -73,28 +73,15 @@ The [VideoLAN Code of Conduct](https://wiki.videolan.org/CoC) applies to this pr
 # Compile

 1. Install [Meson](https://mesonbuild.com/) (0.47 or higher), [Ninja](https://ninja-build.org/), and, for x86\* targets, [nasm](https://nasm.us/) (2.13.02 or higher)
-2. Run `meson build --buildtype release`
-3. Build with `ninja -C build`
+2. Run `mkdir build && cd build` to create a build directory and enter it
+3. Run `meson ..` to configure meson, add `--default-library=static` if static linking is desired
+4. Run `ninja` to compile

 # Run tests

-1. During initial build dir setup or `meson configure` specify `-Denable_tests=true`
-2. In the build directory run `meson test` optionally with `-v` for more verbose output, especially useful
-   for checkasm
-
-# Run testdata based tests
-
-1. Checkout the test data repository
-
-   ```
-   git clone https://code.videolan.org/videolan/dav1d-test-data.git tests/dav1d-test-data
-   ```
-2. During initial build dir setup or `meson configure` specify `-Denable_tests=true` and `-Dtestdata_tests=true`
-
-   ```
-   meson .test -Denable_tests=true -Dtestdata_tests=true
-   ```
-3. In the build directory run `meson test` optionally with `-v` for more verbose output
+1. In the root directory, run `git clone https://code.videolan.org/videolan/dav1d-test-data.git tests/dav1d-test-data` to fetch the test data repository
+2. During meson configuration, specify `-Dtestdata_tests=true`
+3. Run `meson test -v` after compiling

 # Support

--- a/third_party/dav1d/dav1d_logo.png
+++ b/third_party/dav1d/dav1d_logo.png
--- a/third_party/dav1d/examples/dav1dplay.c
+++ b/third_party/dav1d/examples/dav1dplay.c
@ -28,6 +28,7 @@
 #include "vcs_version.h"

 #include <getopt.h>
+#include <stdbool.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <string.h>
@ -48,6 +49,9 @@
 */
 typedef struct {
    const char *inputfile;
+    int highquality; // set by --highquality; placebo_render then uses pl_render_default_params instead of zeroed params
+    int untimed;     // set by --untimed/-u; dp_rd_ctx_render then forces the wait time to 0
+    int zerocopy;    // set by --zerocopy/-z; placebo_upload_planes then uploads from the picture's pl_buf
 } Dav1dPlaySettings;

 #define WINDOW_WIDTH  910
@ -156,9 +160,13 @@ typedef struct rdr_info
    // Callback to destroy the renderer
    void (*destroy_renderer)(void *cookie);
    // Callback to the render function that renders a prevously sent frame
-    void (*render)(void *cookie);
+    void (*render)(void *cookie, const Dav1dPlaySettings *settings);
    // Callback to the send frame function
-    int (*update_frame)(void *cookie, Dav1dPicture *dav1d_pic);
+    int (*update_frame)(void *cookie, Dav1dPicture *dav1d_pic,
+                        const Dav1dPlaySettings *settings);
+    // Callback for alloc/release pictures (optional)
+    int (*alloc_pic)(Dav1dPicture *pic, void *cookie);
+    void (*release_pic)(Dav1dPicture *pic, void *cookie);
 } Dav1dPlayRenderInfo;

 #ifdef HAVE_PLACEBO_VULKAN
@ -325,7 +333,7 @@ static void placebo_renderer_destroy(void *cookie)
    pl_context_destroy(&(rd_priv_ctx->ctx));
 }

-static void placebo_render(void *cookie)
+static void placebo_render(void *cookie, const Dav1dPlaySettings *settings)
 {
    Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
    assert(rd_priv_ctx != NULL);
@ -358,8 +366,9 @@ static void placebo_render(void *cookie)
        .height     = img->params.h,
    };

-    struct pl_render_params render_params = pl_render_default_params;
-    //render_params.upscaler = &pl_filter_ewa_lanczos;
+    struct pl_render_params render_params = {0};
+    if (settings->highquality)
+        render_params = pl_render_default_params;

    struct pl_render_target target;
    pl_render_target_from_swapchain(&target, &frame);
@ -385,7 +394,8 @@ static void placebo_render(void *cookie)
    SDL_UnlockMutex(rd_priv_ctx->lock);
 }

-static int placebo_upload_planes(void *cookie, Dav1dPicture *dav1d_pic)
+static int placebo_upload_planes(void *cookie, Dav1dPicture *dav1d_pic,
+                                 const Dav1dPlaySettings *settings)
 {
    Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
    assert(rd_priv_ctx != NULL);
@ -413,7 +423,6 @@ static int placebo_upload_planes(void *cookie, Dav1dPicture *dav1d_pic)
        .height         = height,
        .pixel_stride   = 1,
        .row_stride     = dav1d_pic->stride[0],
-        .pixels         = dav1d_pic->data[0],
        .component_size = {8},
        .component_map  = {0},
    };
@ -424,7 +433,6 @@ static int placebo_upload_planes(void *cookie, Dav1dPicture *dav1d_pic)
        .height         = height/2,
        .pixel_stride   = 1,
        .row_stride     = dav1d_pic->stride[1],
-        .pixels         = dav1d_pic->data[1],
        .component_size = {8},
        .component_map  = {1},
    };
@ -435,11 +443,23 @@ static int placebo_upload_planes(void *cookie, Dav1dPicture *dav1d_pic)
        .height         = height/2,
        .pixel_stride   = 1,
        .row_stride     = dav1d_pic->stride[1],
-        .pixels         = dav1d_pic->data[2],
        .component_size = {8},
        .component_map  = {2},
    };

+    if (settings->zerocopy) {
+        const struct pl_buf *buf = dav1d_pic->allocator_data;
+        assert(buf);
+        data_y.buf = data_u.buf = data_v.buf = buf;
+        data_y.buf_offset = (uintptr_t) dav1d_pic->data[0] - (uintptr_t) buf->data;
+        data_u.buf_offset = (uintptr_t) dav1d_pic->data[1] - (uintptr_t) buf->data;
+        data_v.buf_offset = (uintptr_t) dav1d_pic->data[2] - (uintptr_t) buf->data;
+    } else {
+        data_y.pixels = dav1d_pic->data[0];
+        data_u.pixels = dav1d_pic->data[1];
+        data_v.pixels = dav1d_pic->data[2];
+    }
+
    bool ok = true;
    ok &= pl_upload_plane(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->y_plane), &(rd_priv_ctx->y_tex), &data_y);
    ok &= pl_upload_plane(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->u_plane), &(rd_priv_ctx->u_tex), &data_u);
@ -456,11 +476,106 @@ static int placebo_upload_planes(void *cookie, Dav1dPicture *dav1d_pic)
    return !ok;
 }

+// Round x up to the next multiple of align. Only correct when align is a power of 2; note that align is evaluated twice.
+#define ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1))
+// Picture allocator callback (renderer_info.alloc_pic): backs all three planes of p with a single host-mapped libplacebo buffer. Returns 0 on success and DAV1D_ERR(ENOMEM) on failure.
+static int placebo_alloc_pic(Dav1dPicture *const p, void *cookie)
+{
+    Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
+    assert(rd_priv_ctx != NULL);
+    SDL_LockMutex(rd_priv_ctx->lock); // held until the common exit at the err: label
+
+    const struct pl_gpu *gpu = rd_priv_ctx->vk->gpu;
+    int ret = DAV1D_ERR(ENOMEM); // pessimistic default, only cleared once everything succeeded
+
+    // Copied from dav1d_default_picture_alloc
+    const int hbd = p->p.bpc > 8; // 1 when bpc > 8; used as a shift so such pictures get twice the stride
+    const int aligned_w = ALIGN2(p->p.w, 128);
+    const int aligned_h = ALIGN2(p->p.h, 128);
+    const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400;
+    const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    p->stride[0] = aligned_w << hbd;
+    p->stride[1] = has_chroma ? (aligned_w >> ss_hor) << hbd : 0;
+
+    // Align strides up to multiples of the GPU performance hints
+    p->stride[0] = ALIGN2(p->stride[0], gpu->limits.align_tex_xfer_stride);
+    p->stride[1] = ALIGN2(p->stride[1], gpu->limits.align_tex_xfer_stride);
+
+    // Aligning offsets to 4 also implicitly aligns to the texel size (1 or 2)
+    size_t off_align = ALIGN2(gpu->limits.align_tex_xfer_offset, 4);
+    const size_t y_sz = ALIGN2(p->stride[0] * aligned_h, off_align);
+    const size_t uv_sz = ALIGN2(p->stride[1] * (aligned_h >> ss_ver), off_align);
+
+    // The extra DAV1D_PICTURE_ALIGNMENTs are to brute force plane alignment,
+    // even in the case that the driver gives us insane alignments
+    const size_t pic_size = y_sz + 2 * uv_sz; // one luma plane plus two chroma planes
+    const size_t total_size = pic_size + DAV1D_PICTURE_ALIGNMENT * 4;
+
+    // Validate size limitations
+    if (total_size > gpu->limits.max_xfer_size) {
+        printf("alloc of %zu bytes exceeds limits\n", total_size); // NOTE(review): goes to stdout, unlike the fprintf(stderr, ...) diagnostics elsewhere in this file
+        goto err;
+    }
+
+    const struct pl_buf *buf = pl_buf_create(gpu, &(struct pl_buf_params) {
+        .type = PL_BUF_TEX_TRANSFER,
+        .host_mapped = true,
+        .size = total_size,
+        .memory_type = PL_BUF_MEM_HOST,
+        .user_data = p,
+    });
+
+    if (!buf) {
+        printf("alloc of GPU mapped buffer failed\n");
+        goto err;
+    }
+
+    assert(buf->data);
+    uintptr_t base = (uintptr_t) buf->data, data[3]; // data[] = aligned start address of each plane inside the mapping
+    data[0] = ALIGN2(base, DAV1D_PICTURE_ALIGNMENT);
+    data[1] = ALIGN2(data[0] + y_sz, DAV1D_PICTURE_ALIGNMENT);
+    data[2] = ALIGN2(data[1] + uv_sz, DAV1D_PICTURE_ALIGNMENT);
+
+    // Sanity check offset alignment for the sake of debugging (only warns, the allocation still succeeds)
+    if (data[0] - base != ALIGN2(data[0] - base, off_align) ||
+        data[1] - base != ALIGN2(data[1] - base, off_align) ||
+        data[2] - base != ALIGN2(data[2] - base, off_align))
+    {
+        printf("GPU buffer horribly misaligned, expect slowdown!\n");
+    }
+
+    p->allocator_data = (void *) buf; // read back by placebo_upload_planes and placebo_release_pic
+    p->data[0] = (void *) data[0];
+    p->data[1] = (void *) data[1];
+    p->data[2] = (void *) data[2];
+    ret = 0;
+
+    // fall through: success shares the unlock/return path with the error exits
+err:
+    SDL_UnlockMutex(rd_priv_ctx->lock);
+    return ret;
+}
+// Picture release callback (renderer_info.release_pic): destroys the pl_buf that placebo_alloc_pic stored in pic->allocator_data.
+static void placebo_release_pic(Dav1dPicture *pic, void *cookie)
+{
+    Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
+    assert(rd_priv_ctx != NULL);
+    assert(pic->allocator_data);
+
+    SDL_LockMutex(rd_priv_ctx->lock); // same lock that placebo_alloc_pic takes around buffer creation
+    const struct pl_gpu *gpu = rd_priv_ctx->vk->gpu;
+    pl_buf_destroy(gpu, (const struct pl_buf **) &pic->allocator_data); // takes the address of the stored pointer, not the pointer itself
+    SDL_UnlockMutex(rd_priv_ctx->lock);
+}
+
 static const Dav1dPlayRenderInfo renderer_info = {
    .create_renderer = placebo_renderer_create,
    .destroy_renderer = placebo_renderer_destroy,
    .render = placebo_render,
-    .update_frame = placebo_upload_planes
+    .update_frame = placebo_upload_planes,
+    .alloc_pic = placebo_alloc_pic,     // optional callback; main() installs it as the dav1d picture allocator when --zerocopy is set
+    .release_pic = placebo_release_pic, // counterpart of alloc_pic, installed together with it
 };

 #else
@ -516,7 +631,7 @@ static void sdl_renderer_destroy(void *cookie)
    free(rd_priv_ctx);
 }

-static void sdl_render(void *cookie)
+static void sdl_render(void *cookie, const Dav1dPlaySettings *settings)
 {
    Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
    assert(rd_priv_ctx != NULL);
@ -536,7 +651,8 @@ static void sdl_render(void *cookie)
    SDL_UnlockMutex(rd_priv_ctx->lock);
 }

-static int sdl_update_texture(void *cookie, Dav1dPicture *dav1d_pic)
+static int sdl_update_texture(void *cookie, Dav1dPicture *dav1d_pic,
+                              const Dav1dPlaySettings *settings)
 {
    Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
    assert(rd_priv_ctx != NULL);
@ -647,8 +763,11 @@ static void dp_settings_print_usage(const char *const app,
    fprintf(stderr, "Usage: %s [options]\n\n", app);
    fprintf(stderr, "Supported options:\n"
            " --input/-i  $file:    input file\n"
+            " --untimed/-u:         ignore PTS, render as fast as possible\n"
            " --framethreads $num:  number of frame threads (default: 1)\n"
            " --tilethreads $num:   number of tile threads (default: 1)\n"
+            " --highquality:        enable high quality rendering\n"
+            " --zerocopy/-z:        enable zero copy upload path\n"
            " --version/-v:         print version and exit\n");
    exit(1);
 }
@ -672,19 +791,23 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
    Dav1dSettings *lib_settings = &rd_ctx->lib_settings;

    // Short options
-    static const char short_opts[] = "i:v";
+    static const char short_opts[] = "i:vuz";

    enum {
        ARG_FRAME_THREADS = 256,
        ARG_TILE_THREADS,
+        ARG_HIGH_QUALITY,
    };

    // Long options
    static const struct option long_opts[] = {
        { "input",          1, NULL, 'i' },
        { "version",        0, NULL, 'v' },
+        { "untimed",        0, NULL, 'u' },
        { "framethreads",   1, NULL, ARG_FRAME_THREADS },
        { "tilethreads",    1, NULL, ARG_TILE_THREADS },
+        { "highquality",    0, NULL, ARG_HIGH_QUALITY },
+        { "zerocopy",       0, NULL, 'z' },
        { NULL,             0, NULL, 0 },
    };

@ -696,6 +819,21 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
            case 'v':
                fprintf(stderr, "%s\n", dav1d_version());
                exit(0);
+            case 'u':
+                settings->untimed = true;
+                break;
+            case ARG_HIGH_QUALITY:
+                settings->highquality = true;
+#ifndef HAVE_PLACEBO_VULKAN
+                fprintf(stderr, "warning: --highquality requires libplacebo\n");
+#endif
+                break;
+            case 'z':
+                settings->zerocopy = true;
+#ifndef HAVE_PLACEBO_VULKAN
+                fprintf(stderr, "warning: --zerocopy requires libplacebo\n");
+#endif
+                break;
            case ARG_FRAME_THREADS:
                lib_settings->n_frame_threads =
                    parse_unsigned(optarg, ARG_FRAME_THREADS, argv[0]);
@ -811,7 +949,7 @@ static void dp_rd_ctx_post_event(Dav1dPlayRenderContext *rd_ctx, uint32_t code)
 static void dp_rd_ctx_update_with_dav1d_picture(Dav1dPlayRenderContext *rd_ctx,
    Dav1dPicture *dav1d_pic)
 {
-    renderer_info.update_frame(rd_ctx->rd_priv, dav1d_pic);
+    renderer_info.update_frame(rd_ctx->rd_priv, dav1d_pic, &rd_ctx->settings); // NOTE(review): update_frame returns int, but the result is ignored here
    rd_ctx->current_pts = dav1d_pic->m.timestamp;
 }

@ -853,16 +991,20 @@ static void dp_rd_ctx_render(Dav1dPlayRenderContext *rd_ctx)
    int32_t wait_time = (pts_diff * rd_ctx->timebase) * 1000 - ticks_diff;
    rd_ctx->last_pts = rd_ctx->current_pts;

+    // In untimed mode, simply don't wait
+    if (rd_ctx->settings.untimed)
+        wait_time = 0;
+
    // This way of timing the playback is not accurate, as there is no guarantee
    // that SDL_Delay will wait for exactly the requested amount of time so in a
    // accurate player this would need to be done in a better way.
-    if (wait_time >= 0) {
+    if (wait_time > 0) {
        SDL_Delay(wait_time);
    } else if (wait_time < -10) { // Do not warn for minor time drifts
        fprintf(stderr, "Frame displayed %f seconds too late\n", wait_time/(float)1000);
    }

-    renderer_info.render(rd_ctx->rd_priv);
+    renderer_info.render(rd_ctx->rd_priv, &rd_ctx->settings);

    rd_ctx->last_ticks = SDL_GetTicks();
 }
@ -1046,6 +1188,18 @@ int main(int argc, char **argv)
    // Parse and validate arguments
    dp_rd_ctx_parse_args(rd_ctx, argc, argv);

+    if (rd_ctx->settings.zerocopy) {
+        if (renderer_info.alloc_pic) {
+            rd_ctx->lib_settings.allocator = (Dav1dPicAllocator) {
+                .cookie = rd_ctx->rd_priv,
+                .alloc_picture_callback = renderer_info.alloc_pic,
+                .release_picture_callback = renderer_info.release_pic,
+            };
+        } else {
+            fprintf(stderr, "--zerocopy unsupported by compiled renderer\n");
+        }
+    }
+
    // Start decoder thread
    decoder_thread = SDL_CreateThread(decoder_thread_main, "Decoder thread", rd_ctx);

--- a/third_party/dav1d/meson.build
+++ b/third_party/dav1d/meson.build
@ -23,7 +23,7 @@
 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 project('dav1d', ['c'],
-    version: '0.4.0',
+    version: '0.5.0',
    default_options: ['c_std=c99',
                      'warning_level=2',
                      'buildtype=release',
--- a/third_party/dav1d/package/snap/snapcraft.yaml
+++ b/third_party/dav1d/package/snap/snapcraft.yaml
@ -17,7 +17,7 @@ apps:
 parts:
  dav1d:
    plugin: meson
-    source: .
+    source: ../../
    build-packages: [ 'nasm' ]
    meson-parameters:
      - --prefix=/usr
--- a/third_party/dav1d/src/arm/32/cdef.S
+++ b/third_party/dav1d/src/arm/32/cdef.S
@ -0,0 +1,660 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// pad_top_bottom: widen the two 8-bit rows at \s1 and \s2 to 16 bit and store them, with 2 px of left/right padding each, at r0 and at r0 + 2*\stride bytes.
+// Edge flags are read from r6; a missing left/right edge is filled from s12 (low word of q3, which padding_func presets to 0x8000 per lane). \ret != 0 ends with pop {r4-r7,pc}.
+// n1/w1: register holding row 1 before/after widening (s0/d0 for \w == 4, d0/q0 for \w == 8)
+// n2/w2: register holding row 2 before/after widening (s4/d2 for \w == 4, d2/q1 for \w == 8)
+.macro pad_top_bottom s1, s2, w, stride, n1, w1, n2, w2, align, ret
+        tst             r6,  #1 // CDEF_HAVE_LEFT
+        beq             2f
+        // CDEF_HAVE_LEFT
+        tst             r6,  #2 // CDEF_HAVE_RIGHT
+        beq             1f
+        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+        ldrh            r12, [\s1, #-2] // 2 px left of row 1
+        vldr            \n1, [\s1]
+        vdup.16         d4,  r12
+        ldrh            r12, [\s1, #\w] // 2 px right of row 1
+        vmov.16         d4[1], r12
+        ldrh            r12, [\s2, #-2] // 2 px left of row 2
+        vldr            \n2, [\s2]
+        vmov.16         d4[2], r12
+        ldrh            r12, [\s2, #\w] // 2 px right of row 2
+        vmovl.u8        q0,  d0
+        vmov.16         d4[3], r12
+        vmovl.u8        q1,  d2
+        vmovl.u8        q2,  d4 // after widening: s8/s9 = row 1 left/right, s10/s11 = row 2 left/right
+        vstr            s8,  [r0, #-4]
+        vst1.16         {\w1}, [r0, :\align]
+        vstr            s9,  [r0, #2*\w]
+        add             r0,  r0,  #2*\stride // next output row
+        vstr            s10, [r0, #-4]
+        vst1.16         {\w2}, [r0, :\align]
+        vstr            s11, [r0, #2*\w]
+.if \ret
+        pop             {r4-r7,pc}
+.else
+        add             r0,  r0,  #2*\stride
+        b               3f
+.endif
+
+1:
+        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+        ldrh            r12, [\s1, #-2] // 2 px left of row 1
+        vldr            \n1, [\s1]
+        vdup.16         d4,  r12
+        ldrh            r12, [\s2, #-2] // 2 px left of row 2
+        vldr            \n2, [\s2]
+        vmovl.u8        q0,  d0
+        vmov.16         d4[1], r12
+        vmovl.u8        q1,  d2
+        vmovl.u8        q2,  d4 // after widening: s8 = row 1 left, s9 = row 2 left
+        vstr            s8,  [r0, #-4]
+        vst1.16         {\w1}, [r0, :\align]
+        vstr            s12, [r0, #2*\w] // no right edge: fill value
+        add             r0,  r0,  #2*\stride
+        vstr            s9,  [r0, #-4]
+        vst1.16         {\w2}, [r0, :\align]
+        vstr            s12, [r0, #2*\w]
+.if \ret
+        pop             {r4-r7,pc}
+.else
+        add             r0,  r0,  #2*\stride
+        b               3f
+.endif
+
+2:
+        // !CDEF_HAVE_LEFT
+        tst             r6,  #2 // CDEF_HAVE_RIGHT
+        beq             1f
+        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+        vldr            \n1, [\s1]
+        ldrh            r12, [\s1, #\w] // 2 px right of row 1
+        vldr            \n2, [\s2]
+        vdup.16         d4,  r12
+        ldrh            r12, [\s2, #\w] // 2 px right of row 2
+        vmovl.u8        q0,  d0
+        vmov.16         d4[1], r12
+        vmovl.u8        q1,  d2
+        vmovl.u8        q2,  d4 // after widening: s8 = row 1 right, s9 = row 2 right
+        vstr            s12, [r0, #-4] // no left edge: fill value
+        vst1.16         {\w1}, [r0, :\align]
+        vstr            s8,  [r0, #2*\w]
+        add             r0,  r0,  #2*\stride
+        vstr            s12, [r0, #-4]
+        vst1.16         {\w2}, [r0, :\align]
+        vstr            s9,  [r0, #2*\w]
+.if \ret
+        pop             {r4-r7,pc}
+.else
+        add             r0,  r0,  #2*\stride
+        b               3f
+.endif
+
+1:
+        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+        vldr            \n1, [\s1]
+        vldr            \n2, [\s2]
+        vmovl.u8        q0,  d0
+        vmovl.u8        q1,  d2
+        vstr            s12, [r0, #-4] // fill value on both sides of both rows
+        vst1.16         {\w1}, [r0, :\align]
+        vstr            s12, [r0, #2*\w]
+        add             r0,  r0,  #2*\stride
+        vstr            s12, [r0, #-4]
+        vst1.16         {\w2}, [r0, :\align]
+        vstr            s12, [r0, #2*\w]
+.if \ret
+        pop             {r4-r7,pc}
+.else
+        add             r0,  r0,  #2*\stride
+.endif
+3:
+.endm
+// load_n_incr: load one row of \w 8-bit pixels from \src into \dst, then post-increment \src by \incr.
+.macro load_n_incr dst, src, incr, w
+.if \w == 4
+        vld1.32         {\dst\()[0]}, [\src, :32], \incr // 4 px: only lane 0 of \dst is written
+.else
+        vld1.8          {\dst\()},    [\src, :64], \incr // otherwise the whole of \dst is loaded
+.endif
+.endm
+
+// void dav1d_cdef_paddingX_neon(uint16_t *tmp, const pixel *src,
+//                               ptrdiff_t src_stride, const pixel (*left)[2],
+//                               /*const*/ pixel *const top[2], int h,
+//                               enum CdefEdgeFlags edges);
+// Register use below: r0 = tmp, r1 = src, r2 = src_stride, r3 = left, r4 = top, r5 = h, r6 = edges (the last three are loaded from the stack).
+// n1 = s0/d0   (row register before widening, for \w == 4 / \w == 8)
+// w1 = d0/q0   (row register after widening to 16 bit)
+// n2 = s4/d2   (second row, before widening; only used by pad_top_bottom)
+// w2 = d2/q1   (second row, after widening; only used by pad_top_bottom)
+.macro padding_func w, stride, n1, w1, n2, w2, align
+function cdef_padding\w\()_neon, export=1
+        push            {r4-r7,lr}
+        ldrd            r4,  r5,  [sp, #20] // r4 = top, r5 = h: first stack arguments, above the 5 pushed registers
+        ldr             r6,  [sp, #28] // r6 = edges
+        vmov.i16        q3,  #0x8000 // fill value for unavailable pixels; s12 (low word of q3) is stored wherever an edge is missing
+        tst             r6,  #4 // CDEF_HAVE_TOP
+        bne             1f
+        // !CDEF_HAVE_TOP
+        sub             r12, r0,  #2*(2*\stride+2) // two rows up and two pixels left of tmp
+        vmov.i16        q2,  #0x8000
+        vst1.16         {q2,q3}, [r12]! // 16 fill values per store
+.if \w == 8
+        vst1.16         {q2,q3}, [r12]!
+.endif
+        b               3f
+1:
+        // CDEF_HAVE_TOP
+        ldr             r7,  [r4] // top[0]
+        ldr             lr,  [r4, #4] // top[1]
+        sub             r0,  r0,  #2*(2*\stride) // back up two rows; pad_top_bottom advances r0 again by two rows
+        pad_top_bottom  r7,  lr,  \w, \stride, \n1, \w1, \n2, \w2, \align, 0
+
+        // Middle section
+3:
+        tst             r6,  #1 // CDEF_HAVE_LEFT
+        beq             2f
+        // CDEF_HAVE_LEFT
+        tst             r6,  #2 // CDEF_HAVE_RIGHT
+        beq             1f
+        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+        ldrh            r12, [r3], #2 // next 2 px of left[], pointer post-incremented
+        vldr            \n1, [r1]
+        vdup.16         d2,  r12
+        ldrh            r12, [r1, #\w] // 2 px right of the row
+        add             r1,  r1,  r2 // next source row
+        subs            r5,  r5,  #1 // --h; flags are consumed by the bgt that closes the loop
+        vmov.16         d2[1], r12
+        vmovl.u8        q0,  d0
+        vmovl.u8        q1,  d2 // after widening: s4 = left px, s5 = right px
+        vstr            s4,  [r0, #-4]
+        vst1.16         {\w1}, [r0, :\align]
+        vstr            s5,  [r0, #2*\w]
+        add             r0,  r0,  #2*\stride
+        bgt             0b
+        b               3f
+1:
+        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+        ldrh            r12, [r3], #2
+        load_n_incr     d0,  r1,  r2,  \w
+        vdup.16         d2,  r12
+        subs            r5,  r5,  #1
+        vmovl.u8        q0,  d0
+        vmovl.u8        q1,  d2
+        vstr            s4,  [r0, #-4]
+        vst1.16         {\w1}, [r0, :\align]
+        vstr            s12, [r0, #2*\w] // no right edge: fill value
+        add             r0,  r0,  #2*\stride
+        bgt             1b
+        b               3f
+2:
+        tst             r6,  #2 // CDEF_HAVE_RIGHT
+        beq             1f
+        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+        ldrh            r12, [r1, #\w] // read the right px before load_n_incr advances r1
+        load_n_incr     d0,  r1,  r2,  \w
+        vdup.16         d2,  r12
+        subs            r5,  r5,  #1
+        vmovl.u8        q0,  d0
+        vmovl.u8        q1,  d2
+        vstr            s12, [r0, #-4] // no left edge: fill value
+        vst1.16         {\w1}, [r0, :\align]
+        vstr            s4,  [r0, #2*\w]
+        add             r0,  r0,  #2*\stride
+        bgt             0b
+        b               3f
+1:
+        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+        load_n_incr     d0,  r1,  r2,  \w
+        subs            r5,  r5,  #1
+        vmovl.u8        q0,  d0
+        vstr            s12, [r0, #-4]
+        vst1.16         {\w1}, [r0, :\align]
+        vstr            s12, [r0, #2*\w]
+        add             r0,  r0,  #2*\stride
+        bgt             1b
+
+3:
+        tst             r6,  #8 // CDEF_HAVE_BOTTOM
+        bne             1f
+        // !CDEF_HAVE_BOTTOM
+        sub             r12, r0,  #4 // start two pixels left of the first row below the block
+        vmov.i16        q2,  #0x8000
+        vst1.16         {q2,q3}, [r12]!
+.if \w == 8
+        vst1.16         {q2,q3}, [r12]!
+.endif
+        pop             {r4-r7,pc}
+1:
+        // CDEF_HAVE_BOTTOM
+        add             r7,  r1,  r2 // r1 = first source row below the block, r7 = the one after it
+        pad_top_bottom  r1,  r7,  \w, \stride, \n1, \w1, \n2, \w2, \align, 1
+endfunc
+.endm
+
+padding_func 8, 16, d0, q0, d2, q1, 128
+padding_func 4, 8,  s0, d0, s4, d2, 64
+// dir_table: emits the table directions\w with two byte offsets per direction, each written as dy * \stride + dx.
+.macro dir_table w, stride
+const directions\w
+        .byte           -1 * \stride + 1, -2 * \stride + 2
+        .byte            0 * \stride + 1, -1 * \stride + 2
+        .byte            0 * \stride + 1,  0 * \stride + 2
+        .byte            0 * \stride + 1,  1 * \stride + 2
+        .byte            1 * \stride + 1,  2 * \stride + 2
+        .byte            1 * \stride + 0,  2 * \stride + 1
+        .byte            1 * \stride + 0,  2 * \stride + 0
+        .byte            1 * \stride + 0,  2 * \stride - 1
+// First 6 entries repeated, so that an index of up to 13 can be used without masking with & 7
+        .byte           -1 * \stride + 1, -2 * \stride + 2
+        .byte            0 * \stride + 1, -1 * \stride + 2
+        .byte            0 * \stride + 1,  0 * \stride + 2
+        .byte            0 * \stride + 1,  1 * \stride + 2
+        .byte            1 * \stride + 1,  2 * \stride + 2
+        .byte            1 * \stride + 0,  2 * \stride + 1
+endconst
+.endm
+
+dir_table 8, 16
+dir_table 4, 8
+
+const pri_taps
+        .byte           4, 2, 3, 3 // two byte pairs: {4, 2} and {3, 3}; the filter offsets into this by (r3 & 1) * 2, r3 presumably being pri_strength (4th argument)
+endconst
+// load_px: load the 16-bit pixels at x + off (p0, into \d11/\d12) and x - off (p1, into \d21/\d22); x is in r2, off (in 16-bit elements) in r9. Clobbers r6 and r9.
+.macro load_px d11, d12, d21, d22, w
+.if \w == 8
+        add             r6,  r2,  r9, lsl #1 // x + off
+        sub             r9,  r2,  r9, lsl #1 // x - off
+        vld1.16         {\d11,\d12}, [r6]    // p0
+        vld1.16         {\d21,\d22}, [r9]    // p1
+.else
+        add             r6,  r2,  r9, lsl #1 // x + off
+        sub             r9,  r2,  r9, lsl #1 // x - off
+        vld1.16         {\d11}, [r6]         // p0
+        add             r6,  r6,  #2*8       // += stride
+        vld1.16         {\d21}, [r9]         // p1
+        add             r9,  r9,  #2*8       // += stride
+        vld1.16         {\d12}, [r6]         // p0, second row
+        vld1.16         {\d22}, [r9]         // p1, second row
+.endif
+.endm
+.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap
+        cmp             \threshold, #0       // flags are consumed by the beq below; the NEON min/max in between leave them untouched
+        vmin.u16        q2,  q2,  \s1        // q2 = running minimum, compared as unsigned
+        vmax.s16        q3,  q3,  \s1        // q3 = running maximum, compared as signed (presumably so that 0x8000 padding never wins either compare)
+        vmin.u16        q2,  q2,  \s2
+        vmax.s16        q3,  q3,  \s2
+
+        beq             3f                   // threshold == 0: skip the tap, only min/max are updated
+        vabd.u16        q8,  q0,  \s1        // abs(diff), with px in q0
+        vabd.u16        q11, q0,  \s2        // abs(diff)
+        vshl.u16        q9,  q8,  \shift     // abs(diff) >> shift (\shift presumably holds a negative count)
+        vshl.u16        q12, q11, \shift     // abs(diff) >> shift
+        vqsub.u16       q9,  \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift))
+        vqsub.u16       q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift))
+        vsub.i16        q10, \s1, q0         // diff = p0 - px
+        vsub.u16        q13, \s2, q0         // diff = p1 - px
+        vneg.s16        q8,  q9              // -clip
+        vneg.s16        q11, q12             // -clip
+        vmin.s16        q10, q10, q9         // imin(diff, clip)
+        vmin.s16        q13, q13, q12        // imin(diff, clip)
+        vdup.16         q9,  \tap            // taps[k]
+        vmax.s16        q10, q10, q8         // constrain() = imax(imin(diff, clip), -clip)
+        vmax.s16        q13, q13, q11        // constrain() = imax(imin(diff, clip), -clip)
+        vmla.i16        q1,  q10, q9         // sum += taps[k] * constrain(), sum accumulates in q1
+        vmla.i16        q1,  q13, q9         // sum += taps[k] * constrain()
+3:
+.endm
+
+// void dav1d_cdef_filterX_neon(pixel *dst, ptrdiff_t dst_stride,
+//                              const uint16_t *tmp, int pri_strength,
+//                              int sec_strength, int dir, int damping, int h);
+.macro filter w
+function cdef_filter\w\()_neon, export=1
+        push            {r4-r9,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #92]  // r4 = sec_strength, r5 = dir (stack args, above the 28+64 bytes pushed)
+        ldrd            r6,  r7,  [sp, #100] // r6 = damping, r7 = h
+        movrel_local    r8,  pri_taps
+        and             r9,  r3,  #1         // pri_strength & 1
+        add             r8,  r8,  r9, lsl #1 // r8 = &pri_taps[2 * (pri_strength & 1)]
+        movrel_local    r9,  directions\w
+        add             r5,  r9,  r5, lsl #1 // r5 = &directions[dir] (2 bytes per entry)
+        vmov.u16        d17, #15             // 15, for ulog2() = 15 - clz()
+        vdup.16         d16, r6              // damping
+
+        vdup.16         q5,  r3              // pri threshold (pri_strength)
+        vdup.16         q7,  r4              // sec threshold (sec_strength)
+        vmov.16         d8[0], r3            // lane 0: pri_strength
+        vmov.16         d8[1], r4            // lane 1: sec_strength
+        vclz.i16        d8,  d8              // clz(threshold)
+        vsub.i16        d8,  d17, d8         // ulog2(threshold)
+        vqsub.u16       d8,  d16, d8         // shift = imax(0, damping - ulog2(threshold))
+        vneg.s16        d8,  d8              // -shift
+        vdup.16         q6,  d8[1]           // -shift for the secondary taps
+        vdup.16         q4,  d8[0]           // -shift for the primary taps
+
+1:
+.if \w == 8
+        vld1.16         {q0},  [r2, :128]    // px
+.else
+        add             r12, r2,  #2*8       // second row of tmp
+        vld1.16         {d0},  [r2,  :64]    // px
+        vld1.16         {d1},  [r12, :64]    // px
+.endif
+
+        vmov.u16        q1,  #0              // sum
+        vmov.u16        q2,  q0              // min
+        vmov.u16        q3,  q0              // max
+
+        // Instead of loading sec_taps 2, 1 from memory, just set it
+        // to 2 initially and decrease for the second round.
+        mov             lr,  #2              // sec_taps[0]
+
+2:
+        ldrsb           r9,  [r5]            // off1
+
+        load_px         d28, d29, d30, d31, \w
+
+        add             r5,  r5,  #4         // +2*2
+        ldrsb           r9,  [r5]            // off2
+
+        ldrb            r12, [r8]            // *pri_taps
+
+        handle_pixel    q14, q15, r3,  q5,  q4,  r12
+
+        load_px         d28, d29, d30, d31, \w
+
+        add             r5,  r5,  #8         // +2*4
+        ldrsb           r9,  [r5]            // off3
+
+        handle_pixel    q14, q15, r4,  q7,  q6,  lr
+
+        load_px         d28, d29, d30, d31, \w
+
+        handle_pixel    q14, q15, r4,  q7,  q6,  lr
+
+        sub             r5,  r5,  #11        // r5 -= 2*(2+4); r5 += 1;
+        subs            lr,  lr,  #1         // sec_tap-- (value)
+        add             r8,  r8,  #1         // pri_taps++ (pointer)
+        bne             2b                   // second round (k = 1) while sec_tap != 0
+
+        vshr.s16        q14, q1,  #15        // -(sum < 0)
+        vadd.i16        q1,  q1,  q14        // sum - (sum < 0)
+        vrshr.s16       q1,  q1,  #4         // (8 + sum - (sum < 0)) >> 4
+        vadd.i16        q0,  q0,  q1         // px + (8 + sum ...) >> 4
+        vmin.s16        q0,  q0,  q3
+        vmax.s16        q0,  q0,  q2         // iclip(px + .., min, max)
+        vmovn.u16       d0,  q0              // narrow to 8 bit pixels
+.if \w == 8
+        add             r2,  r2,  #2*16      // tmp += tmp_stride
+        subs            r7,  r7,  #1         // h--
+        vst1.8          {d0}, [r0, :64], r1
+.else
+        vst1.32         {d0[0]}, [r0, :32], r1
+        add             r2,  r2,  #2*16      // tmp += 2*tmp_stride
+        subs            r7,  r7,  #2         // h -= 2
+        vst1.32         {d0[1]}, [r0, :32], r1
+.endif
+
+        // Reset the directions (r5) and pri_taps (r8) pointers back to the original point
+        sub             r5,  r5,  #2
+        sub             r8,  r8,  #2
+
+        bgt             1b                   // loop while h > 0 (flags from the subs on h above)
+        vpop            {q4-q7}
+        pop             {r4-r9,pc}
+endfunc
+.endm
+
+filter 8
+filter 4
+
+const div_table, align=4
+        .short         840, 420, 280, 210, 168, 140, 120, 105
+endconst
+
+const alt_fact, align=4
+        .short         420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
+endconst
+
+// int dav1d_cdef_find_dir_neon(const pixel *img, const ptrdiff_t stride,
+//                              unsigned *const var)
+function cdef_find_dir_neon, export=1
+        push            {lr}
+        vpush           {q4-q7}
+        sub             sp,  sp,  #32          // cost[8] (32 bit each) on the stack
+        mov             r3,  #8                // NOTE(review): never read before r3 is set to n = 1 below
+        vmov.u16        q1,  #0                // q0-q1   sum_diag[0]
+        vmov.u16        q3,  #0                // q2-q3   sum_diag[1]
+        vmov.u16        q5,  #0                // q4-q5   sum_hv[0-1]
+        vmov.u16        q8,  #0                // q6,d16  sum_alt[0]
+                                               // q7,d17  sum_alt[1]
+        vmov.u16        q9,  #0                // q9,d22  sum_alt[2]
+        vmov.u16        q11, #0                // d22, d23: upper parts of sum_alt[2], sum_alt[3]
+        vmov.u16        q10, #0                // q10,d23 sum_alt[3]
+
+
+.irpc i, 01234567
+        vld1.8          {d30}, [r0, :64], r1   // one row of 8 pixels, img += stride
+        vmov.u8         d31, #128
+        vsubl.u8        q15, d30, d31          // img[x] - 128
+        vmov.u16        q14, #0                // zeros, shifted in by the vext below
+
+.if \i == 0
+        vmov            q0,  q15               // sum_diag[0]
+.else
+        vext.8          q12, q14, q15, #(16-2*\i) // row moved up by i lanes, zeros below
+        vext.8          q13, q15, q14, #(16-2*\i) // the i lanes that spilled past lane 7
+        vadd.i16        q0,  q0,  q12          // sum_diag[0]
+        vadd.i16        q1,  q1,  q13          // sum_diag[0]
+.endif
+        vrev64.16       q13, q15
+        vswp            d26, d27               // [-x]
+.if \i == 0
+        vmov            q2,  q13               // sum_diag[1]
+.else
+        vext.8          q12, q14, q13, #(16-2*\i)
+        vext.8          q13, q13, q14, #(16-2*\i)
+        vadd.i16        q2,  q2,  q12          // sum_diag[1]
+        vadd.i16        q3,  q3,  q13          // sum_diag[1]
+.endif
+
+        vpadd.u16       d26, d30, d31          // [(x >> 1)]
+        vmov.u16        d27, #0
+        vpadd.u16       d24, d26, d28          // d28 = 0 here
+        vpadd.u16       d24, d24, d28          // [y]
+        vmov.u16        r12, d24[0]            // sum of the whole row
+        vadd.i16        q5,  q5,  q15          // sum_hv[1]
+.if \i < 4
+        vmov.16         d8[\i],   r12          // sum_hv[0]
+.else
+        vmov.16         d9[\i-4], r12          // sum_hv[0]
+.endif
+
+.if \i == 0
+        vmov.u16        q6,  q13               // sum_alt[0]
+.else
+        vext.8          q12, q14, q13, #(16-2*\i)
+        vext.8          q14, q13, q14, #(16-2*\i)
+        vadd.i16        q6,  q6,  q12          // sum_alt[0]
+        vadd.i16        d16, d16, d28          // sum_alt[0]
+.endif
+        vrev64.16       d26, d26               // [-(x >> 1)]
+        vmov.u16        q14, #0                // restore the zero register (q14 was overwritten above)
+.if \i == 0
+        vmov            q7,  q13               // sum_alt[1]
+.else
+        vext.8          q12, q14, q13, #(16-2*\i)
+        vext.8          q13, q13, q14, #(16-2*\i)
+        vadd.i16        q7,  q7,  q12          // sum_alt[1]
+        vadd.i16        d17, d17, d26          // sum_alt[1]
+.endif
+
+.if \i < 6
+        vext.8          q12, q14, q15, #(16-2*(3-(\i/2)))
+        vext.8          q13, q15, q14, #(16-2*(3-(\i/2)))
+        vadd.i16        q9,  q9,  q12          // sum_alt[2]
+        vadd.i16        d22, d22, d26          // sum_alt[2]
+.else
+        vadd.i16        q9,  q9,  q15          // sum_alt[2]
+.endif
+.if \i == 0
+        vmov            q10, q15               // sum_alt[3]
+.elseif \i == 1
+        vadd.i16        q10, q10, q15          // sum_alt[3]
+.else
+        vext.8          q12, q14, q15, #(16-2*(\i/2))
+        vext.8          q13, q15, q14, #(16-2*(\i/2))
+        vadd.i16        q10, q10, q12          // sum_alt[3]
+        vadd.i16        d23, d23, d26          // sum_alt[3]
+.endif
+.endr
+
+        vmov.u32        q15, #105              // weight for cost[2] and cost[6]
+
+        vmull.s16       q12, d8,  d8           // sum_hv[0]*sum_hv[0]
+        vmlal.s16       q12, d9,  d9
+        vmull.s16       q13, d10, d10          // sum_hv[1]*sum_hv[1]
+        vmlal.s16       q13, d11, d11
+        vadd.s32        d8,  d24, d25
+        vadd.s32        d9,  d26, d27
+        vpadd.s32       d8,  d8,  d9           // cost[2,6] (s16, s17)
+        vmul.i32        d8,  d8,  d30          // cost[2,6] *= 105
+
+        vrev64.16       q1,  q1
+        vrev64.16       q3,  q3
+        vext.8          q1,  q1,  q1,  #10     // sum_diag[0][14-n]
+        vext.8          q3,  q3,  q3,  #10     // sum_diag[1][14-n]
+
+        vstr            s16, [sp, #2*4]        // cost[2]
+        vstr            s17, [sp, #6*4]        // cost[6]
+
+        movrel_local    r12, div_table
+        vld1.16         {q14}, [r12, :128]     // div_table[0-7]
+
+        vmull.s16       q5,  d0,  d0           // sum_diag[0]*sum_diag[0]
+        vmull.s16       q12, d1,  d1
+        vmlal.s16       q5,  d2,  d2
+        vmlal.s16       q12, d3,  d3
+        vmull.s16       q0,  d4,  d4           // sum_diag[1]*sum_diag[1]
+        vmull.s16       q1,  d5,  d5
+        vmlal.s16       q0,  d6,  d6
+        vmlal.s16       q1,  d7,  d7
+        vmovl.u16       q13, d28               // div_table
+        vmovl.u16       q14, d29
+        vmul.i32        q5,  q5,  q13          // cost[0]
+        vmla.i32        q5,  q12, q14
+        vmul.i32        q0,  q0,  q13          // cost[4]
+        vmla.i32        q0,  q1,  q14
+        vadd.i32        d10, d10, d11
+        vadd.i32        d0,  d0,  d1
+        vpadd.i32       d0,  d10, d0           // cost[0,4] = s0,s1
+
+        movrel_local    r12, alt_fact
+        vld1.16         {d29, d30, d31}, [r12, :64] // div_table[2*m+1] + 105
+
+        vstr            s0,  [sp, #0*4]        // cost[0]
+        vstr            s1,  [sp, #4*4]        // cost[4]
+
+        vmovl.u16       q13, d29               // div_table[2*m+1] + 105
+        vmovl.u16       q14, d30
+        vmovl.u16       q15, d31
+
+.macro cost_alt dest, s1, s2, s3, s4, s5, s6
+        vmull.s16       q1,  \s1, \s1          // sum_alt[n]*sum_alt[n]
+        vmull.s16       q2,  \s2, \s2
+        vmull.s16       q3,  \s3, \s3
+        vmull.s16       q5,  \s4, \s4          // sum_alt[n]*sum_alt[n]
+        vmull.s16       q12, \s5, \s5
+        vmull.s16       q6,  \s6, \s6          // q6 overlaps the first \s1-\s2 here
+        vmul.i32        q1,  q1,  q13          // sum_alt[n]^2*fact
+        vmla.i32        q1,  q2,  q14
+        vmla.i32        q1,  q3,  q15
+        vmul.i32        q5,  q5,  q13          // sum_alt[n]^2*fact
+        vmla.i32        q5,  q12, q14
+        vmla.i32        q5,  q6,  q15
+        vadd.i32        d2,  d2,  d3
+        vadd.i32        d3,  d10, d11
+        vpadd.i32       \dest, d2, d3          // *cost_ptr
+.endm
+        cost_alt        d14, d12, d13, d16, d14, d15, d17 // cost[1], cost[3]
+        cost_alt        d15, d18, d19, d22, d20, d21, d23 // cost[5], cost[7]
+        vstr            s28, [sp, #1*4]        // cost[1]
+        vstr            s29, [sp, #3*4]        // cost[3]
+
+        mov             r0,  #0                // best_dir
+        vmov.32         r1,  d0[0]             // best_cost
+        mov             r3,  #1                // n
+
+        vstr            s30, [sp, #5*4]        // cost[5]
+        vstr            s31, [sp, #7*4]        // cost[7]
+
+        vmov.32         r12, d14[0]            // cost[1], the first candidate for find_best
+
+.macro find_best s1, s2, s3
+.ifnb \s2
+        vmov.32         lr,  \s2               // prefetch cost[n+1]
+.endif
+        cmp             r12, r1                // cost[n] > best_cost
+        itt             gt
+        movgt           r0,  r3                // best_dir = n
+        movgt           r1,  r12               // best_cost = cost[n]
+.ifnb \s2
+        add             r3,  r3,  #1           // n++
+        cmp             lr,  r1                // cost[n] > best_cost
+        vmov.32         r12, \s3               // prefetch cost[n+1] for the next find_best
+        itt             gt
+        movgt           r0,  r3                // best_dir = n
+        movgt           r1,  lr                // best_cost = cost[n]
+        add             r3,  r3,  #1           // n++
+.endif
+.endm
+        find_best       d14[0], d8[0], d14[1]  // cost[1], cost[2]
+        find_best       d14[1], d0[1], d15[0]  // cost[3], cost[4]
+        find_best       d15[0], d8[1], d15[1]  // cost[5], cost[6]
+        find_best       d15[1]                 // cost[7]
+
+        eor             r3,  r0,  #4           // best_dir ^4
+        ldr             r12, [sp, r3, lsl #2]  // cost[best_dir ^ 4]
+        sub             r1,  r1,  r12          // best_cost - cost[best_dir ^ 4]
+        lsr             r1,  r1,  #10          // >> 10
+        str             r1,  [r2]              // *var
+
+        add             sp,  sp,  #32          // drop cost[]; r0 = best_dir is the return value
+        vpop            {q4-q7}
+        pop             {pc}
+endfunc
--- a/third_party/dav1d/src/arm/32/mc.S
+++ b/third_party/dav1d/src/arm/32/mc.S
@ -2971,3 +2971,206 @@ endfunc

 filter_fn put,  r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, 10
 filter_fn prep, r0, r7, r1, r2, r3, r4, r5, r6, r8, r9, 6
+
+.macro load_filter_ptr src
+        asr             r12, \src, #10           // filter index = \src >> 10
+        add             r12, r11, r12, lsl #3    // r12 = r11 + index * 8 (8 byte filters; r11 = table base set by the caller)
+.endm
+
+.macro load_filter_coef dst, src, inc
+        vld1.8          {\dst}, [r12, :64]       // 8 coefficient bytes from the pointer left in r12 by load_filter_ptr
+        add             \src, \src, \inc         // step the filter position for the next load_filter_ptr
+.endm
+
+.macro load_filter_row dst, src, inc
+        load_filter_ptr \src                     // r12 = address of the filter for position \src
+        load_filter_coef \dst, \src, \inc        // \dst = that filter, \src += \inc
+.endm
+
+function warp_filter_horz_neon
+        load_filter_ptr r5                  // filter 0
+        vld1.16         {q7}, [r2], r3      // 16 source bytes, then r2 += r3 (next source row)
+
+        load_filter_coef d0, r5,  r7        // filter 0
+        vmovl.u8        q6,  d14            // original pixels
+        load_filter_row d2,  r5,  r7        // filter 1
+        vmovl.u8        q7,  d15            // original pixels
+        load_filter_row d4,  r5,  r7        // filter 2
+        vmovl.s8        q0,  d0             // filter 0
+        vext.8          q3,  q6,  q7,  #2*1 // filter 1 pixels
+        load_filter_ptr r5                  // filter 3
+        vmovl.s8        q1,  d2             // filter 1
+        vmul.i16        q5,  q6,  q0        // filter 0 output
+        load_filter_coef d0, r5,  r7        // filter 3
+        vmovl.s8        q2,  d4             // filter 2
+        load_filter_ptr r5                  // filter 4
+        vext.8          q4,  q6,  q7,  #2*2 // filter 2 pixels
+        vmul.i16        q3,  q3,  q1        // filter 1 output
+        load_filter_coef d2, r5,  r7        // filter 4
+        vmul.i16        q4,  q4,  q2        // filter 2 output
+        vext.8          q2,  q6,  q7,  #2*3 // filter 3 pixels
+        vmovl.s8        q0,  d0             // filter 3
+        vpaddl.s16      q5,  q5             // pixel 0 (4x32)
+        vpaddl.s16      q3,  q3             // pixel 1 (4x32)
+        vmul.i16        q0,  q2,  q0        // filter 3 output
+        load_filter_ptr r5                  // filter 5
+        vext.8          q2,  q6,  q7,  #2*4 // filter 4 pixels
+        vmovl.s8        q1,  d2             // filter 4
+        vpaddl.s16      q4,  q4             // pixel 2 (4x32)
+        vpadd.s32       d10, d10, d11       // pixel 0 (2x32)
+        vpadd.s32       d11, d6,  d7        // pixel 1 (2x32)
+        load_filter_coef d6, r5,  r7        // filter 5
+        vmul.i16        q1,  q2,  q1        // filter 4 output
+        vpadd.s32       d8,  d8,  d9        // pixel 2 (2x32)
+        load_filter_ptr r5                  // filter 6
+        vpaddl.s16      q0,  q0             // pixel 3 (4x32)
+        vpadd.s32       d10, d10, d11       // pixel 0,1
+        vext.8          q2,  q6,  q7,  #2*5 // filter 5 pixels
+        vmovl.s8        q3,  d6             // filter 5
+        vpaddl.s16      q1,  q1             // pixel 4 (4x32)
+        vpadd.s32       d9,  d0,  d1        // pixel 3 (2x32)
+        load_filter_coef d0, r5,  r7        // filter 6
+        vmul.i16        q2,  q2,  q3        // filter 5 output
+        vpadd.s32       d11, d8,  d9        // pixel 2,3
+        load_filter_ptr r5                  // filter 7
+        vpaddl.s16      q2,  q2             // pixel 5 (4x32)
+        vpadd.s32       d8,  d2,  d3        // pixel 4 (2x32)
+        vext.8          q3,  q6,  q7,  #2*6 // filter 6 pixels
+        vmovl.s8        q0,  d0             // filter 6
+        vpadd.s32       d9,  d4,  d5        // pixel 5 (2x32)
+        load_filter_coef d4, r5,  r7        // filter 7
+        vpadd.s32       d8,  d8,  d9        // pixel 4,5
+        vext.8          q1,  q6,  q7,  #2*7 // filter 7 pixels
+        vmovl.s8        q2,  d4             // filter 7
+        vmul.i16        q3,  q3,  q0        // filter 6 output
+        vmul.i16        q1,  q1,  q2        // filter 7 output
+        sub             r5,  r5,  r7, lsl #3 // undo the 8 per-pixel steps of r5
+        vpaddl.s16      q3,  q3             // pixel 6 (4x32)
+        vpaddl.s16      q1,  q1             // pixel 7 (4x32)
+        vpadd.s32       d6,  d6,  d7        // pixel 6 (2x32)
+        vpadd.s32       d2,  d2,  d3        // pixel 7 (2x32)
+        vpadd.s32       d9,  d6,  d2        // pixel 6,7
+
+        add             r5,  r5,  r8        // step r5 by the per-row increment in r8
+
+        vrshrn.s32      d10, q5,  #3        // pixels 0-3: rounding >> 3, narrowed to 16 bit
+        vrshrn.s32      d11, q4,  #3        // pixels 4-7: rounding >> 3, narrowed to 16 bit
+
+        bx              lr                  // result row in q5; q0-q4, q6-q7 and r12 are clobbered
+endfunc
+
+// void dav1d_warp_affine_8x8_8bpc_neon(
+//         pixel *dst, const ptrdiff_t dst_stride,
+//         const pixel *src, const ptrdiff_t src_stride,
+//         const int16_t *const abcd, int mx, int my)
+.macro warp t, shift
+function warp_affine_8x8\t\()_8bpc_neon, export=1
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #100]     // r4 = abcd, r5 = mx (stack args, above the 36+64 bytes pushed)
+        ldr             r6,  [sp, #108]          // r6 = my
+        ldrd            r8,  r9,  [r4]           // abcd[0-3] as two 32 bit words
+        sxth            r7,  r8                  // r7 = abcd[0] (low half, little endian)
+        asr             r8,  r8, #16             // r8 = abcd[1]
+        asr             r4,  r9, #16             // r4 = abcd[3]
+        sxth            r9,  r9                  // r9 = abcd[2]
+        mov             r10, #8                  // output row counter
+        sub             r2,  r2,  r3, lsl #1
+        sub             r2,  r2,  r3             // src -= 3 * src_stride
+        sub             r2,  r2,  #3             // src -= 3
+        movrel          r11, X(mc_warp_filter), 64*8 // filter table base, offset by 64*8 bytes
+.ifnb \t
+        lsl             r1,  r1,  #1             // NOTE(review): presumably dst_stride is in int16_t units for this variant - confirm
+.endif
+        add             r5,  r5,  #512           // mx += 512, i.e. half of the 1 << 10 used by load_filter_ptr
+        add             r6,  r6,  #512           // my += 512
+
+        bl              warp_filter_horz_neon    // prime q8-q14 with the first 7 horizontally filtered rows
+        vmov            q8,  q5
+        bl              warp_filter_horz_neon
+        vmov            q9,  q5
+        bl              warp_filter_horz_neon
+        vmov            q10, q5
+        bl              warp_filter_horz_neon
+        vmov            q11, q5
+        bl              warp_filter_horz_neon
+        vmov            q12, q5
+        bl              warp_filter_horz_neon
+        vmov            q13, q5
+        bl              warp_filter_horz_neon
+        vmov            q14, q5
+
+1:
+        bl              warp_filter_horz_neon    // 8th row of the vertical window
+        vmov            q15, q5
+
+        load_filter_row d8,  r6,  r9             // one vertical filter per output column, my stepped by abcd[2]
+        load_filter_row d9,  r6,  r9
+        load_filter_row d10, r6,  r9
+        load_filter_row d11, r6,  r9
+        load_filter_row d12, r6,  r9
+        load_filter_row d13, r6,  r9
+        load_filter_row d14, r6,  r9
+        load_filter_row d15, r6,  r9
+        transpose_8x8b  q4,  q5,  q6,  q7,  d8,  d9,  d10, d11, d12, d13, d14, d15
+        vmovl.s8        q1,  d8                  // after the transpose, dN holds tap N of all 8 columns
+        vmovl.s8        q2,  d9
+        vmovl.s8        q3,  d10
+        vmovl.s8        q4,  d11
+        vmovl.s8        q5,  d12
+        vmovl.s8        q6,  d13
+
+        sub             r6,  r6,  r9, lsl #3     // undo the 8 per-column steps of my
+
+        // This ordering of vmull/vmlal is highly beneficial for
+        // Cortex A8/A9/A53 here, but harmful for Cortex A7.
+        vmull.s16       q0,  d16,  d2
+        vmlal.s16       q0,  d18,  d4
+        vmlal.s16       q0,  d20,  d6
+        vmlal.s16       q0,  d22,  d8
+        vmlal.s16       q0,  d24,  d10
+        vmlal.s16       q0,  d26,  d12
+        vmull.s16       q1,  d17,  d3
+        vmlal.s16       q1,  d19,  d5
+        vmlal.s16       q1,  d21,  d7
+        vmlal.s16       q1,  d23,  d9
+        vmlal.s16       q1,  d25,  d11
+        vmlal.s16       q1,  d27,  d13
+
+        vmovl.s8        q2,  d14                 // taps 6 and 7, widened once q2/q3 are free again
+        vmovl.s8        q3,  d15
+
+        vmlal.s16       q0,  d28,  d4
+        vmlal.s16       q0,  d30,  d6
+        vmlal.s16       q1,  d29,  d5
+        vmlal.s16       q1,  d31,  d7
+
+        vmov            q8,  q9                  // slide the 8 row window down by one row
+        vmov            q9,  q10
+        vqrshrn.s32     d0,  q0,  #\shift
+        vmov            q10, q11
+        vqrshrn.s32     d1,  q1,  #\shift
+        vmov            q11, q12
+        vmov            q12, q13
+.ifb \t
+        vqmovun.s16     d0,  q0                  // saturate to unsigned 8 bit pixels
+.endif
+        vmov            q13, q14
+        vmov            q14, q15
+        subs            r10, r10, #1             // flags consumed by the bgt below
+.ifnb \t
+        vst1.16         {q0}, [r0, :128], r1     // 8 x 16 bit intermediates
+.else
+        vst1.8          {d0}, [r0, :64], r1      // 8 x 8 bit pixels
+.endif
+
+        add             r6,  r6,  r4             // my += abcd[3] for the next row
+        bgt             1b
+
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
+.endm
+
+warp  , 11
+warp t, 7
--- a/third_party/dav1d/src/arm/32/util.S
+++ b/third_party/dav1d/src/arm/32/util.S
@ -32,6 +32,20 @@
 #include "config.h"
 #include "src/arm/asm.S"

+.macro movrel_local rd, val, offset=0
+#if defined(PIC)
+        ldr             \rd,  1f                 // pc-relative distance to \val + \offset, from the literal below
+        b               2f                       // jump over the inline literal
+1:
+        .word           \val + \offset - (2f + 8 - 4 * CONFIG_THUMB) // pc reads as 2f + 8 (ARM) or 2f + 4 (Thumb)
+2:
+        add             \rd,  \rd,  pc           // \rd = absolute address of \val + \offset
+#else
+        movw            \rd, #:lower16:\val+\offset // absolute address, low 16 bits
+        movt            \rd, #:upper16:\val+\offset // absolute address, high 16 bits
+#endif
+.endm
+
 .macro movrel rd, val, offset=0
 #if defined(PIC) && defined(__APPLE__)
        ldr             \rd,  1f
@ -50,17 +64,24 @@
        .indirect_symbol \val
        .word       0
        .text
-#elif defined(PIC)
-        ldr             \rd,  1f
-        b               2f
-1:
-        .word           \val + \offset - (2f + 8 - 4 * CONFIG_THUMB)
-2:
-        add             \rd,  \rd,  pc
 #else
-        movw            \rd, #:lower16:\val+\offset
-        movt            \rd, #:upper16:\val+\offset
+        movrel_local    \rd, \val, \offset
 #endif
 .endm

+.macro transpose_8x8b q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7
+        vtrn.32         \q0,  \q2                // NOTE(review): the visible caller passes \q0-\q3 as the q registers overlapping \r0-\r7
+        vtrn.32         \q1,  \q3                // swap 32 bit blocks
+
+        vtrn.16         \r0,  \r2                // swap 16 bit blocks
+        vtrn.16         \r1,  \r3
+        vtrn.16         \r4,  \r6
+        vtrn.16         \r5,  \r7
+
+        vtrn.8          \r0,  \r1                // swap single bytes
+        vtrn.8          \r2,  \r3
+        vtrn.8          \r4,  \r5
+        vtrn.8          \r6,  \r7
+.endm
+
 #endif /* DAV1D_SRC_ARM_32_UTIL_S */
--- a/third_party/dav1d/src/arm/64/cdef.S
+++ b/third_party/dav1d/src/arm/64/cdef.S
@ -129,6 +129,14 @@
 3:
 .endm

+.macro load_n_incr dst, src, incr, w
+.if \w == 4
+        ld1             {\dst\().s}[0], [\src], \incr // 4 bytes into lane 0, then \src += \incr
+.else
+        ld1             {\dst\().8b},   [\src], \incr // 8 bytes, then \src += \incr
+.endif
+.endm
+
 // void dav1d_cdef_paddingX_neon(uint16_t *tmp, const pixel *src,
 //                               ptrdiff_t src_stride, const pixel (*left)[2],
 //                               /*const*/ pixel *const top[2], int h,
@ -163,9 +171,8 @@ function cdef_padding\w\()_neon, export=1
        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
 0:
        ld1             {v0.h}[0], [x3], #2
-        ldr             \rn\()1, [x1]
        ldr             h2,      [x1, #\w]
-        add             x1,  x1,  x2
+        load_n_incr     v1,  x1,  x2,  \w
        subs            w5,  w5,  #1
        uxtl            v0.8h,  v0.8b
        uxtl            v1.8h,  v1.8b
@ -179,11 +186,7 @@ function cdef_padding\w\()_neon, export=1
 1:
        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        ld1             {v0.h}[0], [x3], #2
-.if \w == 8
-        ld1             {v1.8b},   [x1], x2
-.else
-        ld1             {v1.s}[0], [x1], x2
-.endif
+        load_n_incr     v1,  x1,  x2,  \w
        subs            w5,  w5,  #1
        uxtl            v0.8h,  v0.8b
        uxtl            v1.8h,  v1.8b
@ -198,9 +201,8 @@ function cdef_padding\w\()_neon, export=1
        b.eq            1f
        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
 0:
-        ldr             \rn\()0, [x1]
        ldr             h1,      [x1, #\w]
-        add             x1,  x1,  x2
+        load_n_incr     v0,  x1,  x2,  \w
        subs            w5,  w5,  #1
        uxtl            v0.8h,  v0.8b
        uxtl            v1.8h,  v1.8b
@ -212,11 +214,7 @@ function cdef_padding\w\()_neon, export=1
        b               3f
 1:
        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
-.if \w == 8
-        ld1             {v0.8b},   [x1], x2
-.else
-        ld1             {v0.s}[0], [x1], x2
-.endif
+        load_n_incr     v0,  x1,  x2,  \w
        subs            w5,  w5,  #1
        uxtl            v0.8h,  v0.8b
        str             s31,     [x0]
@ -299,17 +297,17 @@ endconst
        uabd            v20.8h, v0.8h,  \s2\().8h   // abs(diff)
        ushl            v17.8h, v16.8h, \shift      // abs(diff) >> shift
        ushl            v21.8h, v20.8h, \shift      // abs(diff) >> shift
-        uqsub           v17.8h, \thresh_vec, v17.8h // imax(0, threshold - (abs(diff) >> shift))
-        uqsub           v21.8h, \thresh_vec, v21.8h // imax(0, threshold - (abs(diff) >> shift))
-        cmhi            v18.8h, v0.8h,  \s1\().8h   // px > p0
-        cmhi            v22.8h, v0.8h,  \s2\().8h   // px > p1
-        umin            v17.8h, v17.8h, v16.8h      // imin(abs(diff), imax())
-        umin            v21.8h, v21.8h, v20.8h      // imin(abs(diff), imax())
+        uqsub           v17.8h, \thresh_vec, v17.8h // clip = imax(0, threshold - (abs(diff) >> shift))
+        uqsub           v21.8h, \thresh_vec, v21.8h // clip = imax(0, threshold - (abs(diff) >> shift))
+        sub             v18.8h, \s1\().8h,  v0.8h   // diff = p0 - px
+        sub             v22.8h, \s2\().8h,  v0.8h   // diff = p1 - px
+        neg             v16.8h, v17.8h              // -clip
+        neg             v20.8h, v21.8h              // -clip
+        smin            v18.8h, v18.8h, v17.8h      // imin(diff, clip)
+        smin            v22.8h, v22.8h, v21.8h      // imin(diff, clip)
        dup             v19.8h, \tap                // taps[k]
-        neg             v16.8h, v17.8h              // -imin()
-        neg             v20.8h, v21.8h              // -imin()
-        bsl             v18.16b, v16.16b, v17.16b   // constrain() = apply_sign()
-        bsl             v22.16b, v20.16b, v21.16b   // constrain() = apply_sign()
+        smax            v18.8h, v18.8h, v16.8h      // constrain() = imax(imin(diff, clip), -clip)
+        smax            v22.8h, v22.8h, v20.8h      // constrain() = imax(imin(diff, clip), -clip)
        mla             v1.8h,  v18.8h, v19.8h      // sum += taps[k] * constrain()
        mla             v1.8h,  v22.8h, v19.8h      // sum += taps[k] * constrain()
 3:
@ -325,19 +323,18 @@ function cdef_filter\w\()_neon, export=1
        add             x8,  x8,  w9, uxtw #1
        movrel          x9,  directions\w
        add             x5,  x9,  w5, uxtw #1
-        movi            v30.8h,   #15
-        dup             v28.8h,   w6                // damping
+        movi            v30.4h,   #15
+        dup             v28.4h,   w6                // damping

        dup             v25.8h, w3                  // threshold
        dup             v27.8h, w4                  // threshold
-        clz             v24.8h, v25.8h              // clz(threshold)
-        clz             v26.8h, v27.8h              // clz(threshold)
-        sub             v24.8h, v30.8h, v24.8h      // ulog2(threshold)
-        sub             v26.8h, v30.8h, v26.8h      // ulog2(threshold)
-        uqsub           v24.8h, v28.8h, v24.8h      // shift = imax(0, damping - ulog2(threshold))
-        uqsub           v26.8h, v28.8h, v26.8h      // shift = imax(0, damping - ulog2(threshold))
-        neg             v24.8h, v24.8h              // -shift
-        neg             v26.8h, v26.8h              // -shift
+        trn1            v24.4h, v25.4h, v27.4h
+        clz             v24.4h, v24.4h              // clz(threshold)
+        sub             v24.4h, v30.4h, v24.4h      // ulog2(threshold)
+        uqsub           v24.4h, v28.4h, v24.4h      // shift = imax(0, damping - ulog2(threshold))
+        neg             v24.4h, v24.4h              // -shift
+        dup             v26.8h, v24.h[1]
+        dup             v24.8h, v24.h[0]

 1:
 .if \w == 8
@ -467,15 +464,15 @@ function cdef_find_dir_neon, export=1
        ext             v24.16b, v30.16b, v29.16b, #(16-2*\i)
        ext             v25.16b, v29.16b, v30.16b, #(16-2*\i)
        add             v6.8h,   v6.8h,   v22.8h      // sum_alt[0]
-        add             v7.8h,   v7.8h,   v23.8h      // sum_alt[0]
+        add             v7.4h,   v7.4h,   v23.4h      // sum_alt[0]
        add             v16.8h,  v16.8h,  v24.8h      // sum_alt[1]
-        add             v17.8h,  v17.8h,  v25.8h      // sum_alt[1]
+        add             v17.4h,  v17.4h,  v25.4h      // sum_alt[1]
 .endif
 .if \i < 6
        ext             v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2)))
        ext             v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2)))
        add             v18.8h,  v18.8h,  v22.8h      // sum_alt[2]
-        add             v19.8h,  v19.8h,  v23.8h      // sum_alt[2]
+        add             v19.4h,  v19.4h,  v23.4h      // sum_alt[2]
 .else
        add             v18.8h,  v18.8h,  v26.8h      // sum_alt[2]
 .endif
@ -487,7 +484,7 @@ function cdef_find_dir_neon, export=1
        ext             v24.16b, v30.16b, v26.16b, #(16-2*(\i/2))
        ext             v25.16b, v26.16b, v30.16b, #(16-2*(\i/2))
        add             v20.8h,  v20.8h,  v24.8h      // sum_alt[3]
-        add             v21.8h,  v21.8h,  v25.8h      // sum_alt[3]
+        add             v21.4h,  v21.4h,  v25.4h      // sum_alt[3]
 .endif
 .endr

@ -504,10 +501,8 @@ function cdef_find_dir_neon, export=1

        rev64           v1.8h,   v1.8h
        rev64           v3.8h,   v3.8h
-        ext             v1.16b,  v1.16b,  v1.16b, #8  // sum_diag[0][15-n]
-        ext             v3.16b,  v3.16b,  v3.16b, #8  // sum_diag[1][15-n]
-        ext             v1.16b,  v1.16b,  v1.16b, #2  // sum_diag[0][14-n]
-        ext             v3.16b,  v3.16b,  v3.16b, #2  // sum_diag[1][14-n]
+        ext             v1.16b,  v1.16b,  v1.16b, #10 // sum_diag[0][14-n]
+        ext             v3.16b,  v3.16b,  v3.16b, #10 // sum_diag[1][14-n]

        str             s4,  [sp, #2*4]               // cost[2]
        str             s5,  [sp, #6*4]               // cost[6]
@ -559,16 +554,17 @@ function cdef_find_dir_neon, export=1
        addv            \d2, v25.4s                   // *cost_ptr
 .endm
        cost_alt        s6,  s16, v6,  v7,  v16, v17  // cost[1], cost[3]
+        cost_alt        s18, s20, v18, v19, v20, v21  // cost[5], cost[7]
        str             s6,  [sp, #1*4]               // cost[1]
        str             s16, [sp, #3*4]               // cost[3]
-        cost_alt        s18, s20, v18, v19, v20, v21  // cost[5], cost[7]
-        str             s18, [sp, #5*4]               // cost[5]
-        str             s20, [sp, #7*4]               // cost[7]

        mov             w0,  #0                       // best_dir
        mov             w1,  v0.s[0]                  // best_cost
        mov             w3,  #1                       // n

+        str             s18, [sp, #5*4]               // cost[5]
+        str             s20, [sp, #7*4]               // cost[7]
+
        mov             w4,  v6.s[0]

 .macro find_best s1, s2, s3
--- a/third_party/dav1d/src/arm/64/ipred.S
+++ b/third_party/dav1d/src/arm/64/ipred.S
--- a/third_party/dav1d/src/arm/64/itx.S
+++ b/third_party/dav1d/src/arm/64/itx.S
@ -148,27 +148,6 @@ endconst
 .endif
 .endm

-.macro saddl_sz d0, d1, s0, s1, sz
-        saddl           \d0\().4s,  \s0\().4h,  \s1\().4h
-.ifc \sz, .8h
-        saddl2          \d1\().4s,  \s0\().8h,  \s1\().8h
-.endif
-.endm
-
-.macro ssubl_sz d0, d1, s0, s1, sz
-        ssubl           \d0\().4s,  \s0\().4h,  \s1\().4h
-.ifc \sz, .8h
-        ssubl2          \d1\().4s,  \s0\().8h,  \s1\().8h
-.endif
-.endm
-
-.macro mul_4s_sz d0, d1, s0, s1, c, sz
-        mul             \d0\().4s,  \s0\().4s,  \c
-.ifc \sz, .8h
-        mul             \d1\().4s,  \s1\().4s,  \c
-.endif
-.endm
-
 .macro scale_input sz, c, r0, r1, r2 r3, r4, r5, r6, r7
        sqrdmulh        \r0\sz,  \r0\sz,  \c
        sqrdmulh        \r1\sz,  \r1\sz,  \c
@ -489,18 +468,18 @@ endfunc
 .endm

 .macro idct_4 r0, r1, r2, r3, sz
-        add             v2\sz,   \r0\sz,  \r2\sz
-        sub             v3\sz,   \r0\sz,  \r2\sz
        smull_smlal     v6,  v7,  \r1, \r3, v0.h[3], v0.h[2], \sz
        smull_smlsl     v4,  v5,  \r1, \r3, v0.h[2], v0.h[3], \sz
-        sqrdmulh        v2\sz,   v2\sz,  v0.h[1]
-        sqrdmulh        v3\sz,   v3\sz,  v0.h[1]
+        smull_smlal     v2,  v3,  \r0, \r2, v0.h[0], v0.h[0], \sz
        rshrn_sz        v6,  v6,  v7,  #12, \sz
-        rshrn_sz        v4,  v4,  v5,  #12, \sz
+        rshrn_sz        v7,  v4,  v5,  #12, \sz
+        smull_smlsl     v4,  v5,  \r0, \r2, v0.h[0], v0.h[0], \sz
+        rshrn_sz        v2,  v2,  v3,  #12, \sz
+        rshrn_sz        v3,  v4,  v5,  #12, \sz
        sqadd           \r0\sz,  v2\sz,   v6\sz
        sqsub           \r3\sz,  v2\sz,   v6\sz
-        sqadd           \r1\sz,  v3\sz,   v4\sz
-        sqsub           \r2\sz,  v3\sz,   v4\sz
+        sqadd           \r1\sz,  v3\sz,   v7\sz
+        sqsub           \r2\sz,  v3\sz,   v7\sz
 .endm

 function inv_dct_4x4_neon
@ -780,11 +759,10 @@ def_fn_4x4 identity, flipadst
        sqadd           v3\sz,   \r7\sz,  \r5\sz // t7
        sqsub           \r3\sz,  \r7\sz,  \r5\sz // t6a

-        sub             \r5\sz,  \r3\sz,  \r1\sz // -> t5
-        add             \r7\sz,  \r3\sz,  \r1\sz // -> t6
-
-        sqrdmulh        v4\sz,   \r5\sz, v0.h[1] // t5
-        sqrdmulh        v5\sz,   \r7\sz, v0.h[1] // t6
+        smull_smlsl     v4,  v5,  \r3, \r1, v0.h[0], v0.h[0], \sz // -> t5
+        smull_smlal     v6,  v7,  \r3, \r1, v0.h[0], v0.h[0], \sz // -> t6
+        rshrn_sz        v4,  v4,  v5,  #12, \sz // t5
+        rshrn_sz        v5,  v6,  v7,  #12, \sz // t6

        sqsub           \r7\sz,  \r0\sz,  v3\sz // out7
        sqadd           \r0\sz,  \r0\sz,  v3\sz // out0
@ -865,22 +843,14 @@ endfunc
        sqsub           v5\sz,     v5\sz, v19\sz // t7
        sqneg           \o1\()\sz, \o1\()\sz     // out1

-        movi            v0.4s,  #2896>>4
-
-        saddl_sz        v18, v19, v2,  v4,  \sz // -> out3 (v19 or v20)
-        ssubl_sz        v6,  v7,  v2,  v4,  \sz // -> out4 (v20 or v19)
-        ssubl_sz        v20, v21, v3,  v5,  \sz // -> out5 (v21 or v18)
-        saddl_sz        v4,  v5,  v3,  v5,  \sz // -> out2 (v18 or v21)
-
-        mul_4s_sz       v18, v19, v18, v19, v0.s[0], \sz
-        mul_4s_sz       v6,  v7,  v6,  v7,  v0.s[0], \sz
-        mul_4s_sz       v20, v21, v20, v21, v0.s[0], \sz
-        mul_4s_sz       v4,  v5,  v4,  v5,  v0.s[0], \sz
-
-        rshrn_sz        v2,  v18, v19, #8,  \sz // out3
-        rshrn_sz        v3,  v20, v21, #8,  \sz // out5
-        rshrn_sz        \o2, v4,  v5,  #8,  \sz // out2 (v18 or v21)
-        rshrn_sz        \o4, v6,  v7,  #8,  \sz // out4 (v20 or v19)
+        smull_smlal     v18, v19, v2,  v4,  v1.h[0], v1.h[0], \sz // -> out3 (v19 or v20)
+        smull_smlsl     v6,  v7,  v2,  v4,  v1.h[0], v1.h[0], \sz // -> out4 (v20 or v19)
+        smull_smlsl     v20, v21, v3,  v5,  v1.h[0], v1.h[0], \sz // -> out5 (v21 or v18)
+        rshrn_sz        v2,  v18, v19, #12, \sz // out3
+        smull_smlal     v18, v19, v3,  v5,  v1.h[0], v1.h[0], \sz // -> out2 (v18 or v21)
+        rshrn_sz        v3,  v20, v21, #12, \sz // out5
+        rshrn_sz        \o2, v18, v19, #12, \sz // out2 (v18 or v21)
+        rshrn_sz        \o4, v6,  v7,  #12, \sz // out4 (v20 or v19)

        sqneg           \o3\()\sz, v2\sz     // out3
        sqneg           \o5\()\sz, v3\sz     // out5
@ -1127,14 +1097,15 @@ def_fns_48 8, 4
        sqsub           v25\sz,  v27\sz,  v29\sz  // t13
        sqadd           v27\sz,  v27\sz,  v29\sz  // t14

-        sub             v23\sz,  v3\sz,   v2\sz     // -> t11
-        add             v29\sz,  v3\sz,   v2\sz     // -> t12
-        sub             v6\sz,   v25\sz,  v21\sz    // -> t10a
-        add             v7\sz,   v25\sz,  v21\sz    // -> t13a
-        sqrdmulh        v2\sz,   v23\sz,  v0.h[1]   // t11
-        sqrdmulh        v3\sz,   v29\sz,  v0.h[1]   // t12
-        sqrdmulh        v4\sz,   v6\sz,   v0.h[1]   // t10a
-        sqrdmulh        v5\sz,   v7\sz,   v0.h[1]   // t13a
+        smull_smlsl     v4,  v5,  v3,  v2,  v0.h[0], v0.h[0], \sz // -> t11
+        smull_smlal     v6,  v7,  v3,  v2,  v0.h[0], v0.h[0], \sz // -> t12
+        smull_smlsl     v2,  v3,  v25, v21, v0.h[0], v0.h[0], \sz // -> t10a
+
+        rshrn_sz        v4,  v4,  v5,  #12, \sz   // t11
+        rshrn_sz        v5,  v6,  v7,  #12, \sz   // t12
+        smull_smlal     v6,  v7,  v25, v21, v0.h[0], v0.h[0], \sz // -> t13a
+        rshrn_sz        v2,  v2,  v3,  #12, \sz   // t10a
+        rshrn_sz        v3,  v6,  v7,  #12, \sz   // t13a

        sqadd           v6\sz,   v16\sz,  v31\sz  // out0
        sqsub           v31\sz,  v16\sz,  v31\sz  // out15
@ -1143,18 +1114,18 @@ def_fns_48 8, 4
        sqsub           v7\sz,   v30\sz,  v17\sz  // out8
        sqadd           v17\sz,  v18\sz,  v27\sz  // out1
        sqsub           v30\sz,  v18\sz,  v27\sz  // out14
-        sqadd           v18\sz,  v20\sz,  v5\sz   // out2
-        sqsub           v29\sz,  v20\sz,  v5\sz   // out13
-        sqadd           v5\sz,   v28\sz,  v19\sz  // out6
+        sqadd           v18\sz,  v20\sz,  v3\sz   // out2
+        sqsub           v29\sz,  v20\sz,  v3\sz   // out13
+        sqadd           v3\sz,   v28\sz,  v19\sz  // out6
        sqsub           v25\sz,  v28\sz,  v19\sz  // out9
-        sqadd           v19\sz,  v22\sz,  v3\sz   // out3
-        sqsub           v28\sz,  v22\sz,  v3\sz   // out12
-        sqadd           v20\sz,  v24\sz,  v2\sz   // out4
-        sqsub           v27\sz,  v24\sz,  v2\sz   // out11
-        sqadd           v21\sz,  v26\sz,  v4\sz   // out5
-        sqsub           v26\sz,  v26\sz,  v4\sz   // out10
+        sqadd           v19\sz,  v22\sz,  v5\sz   // out3
+        sqsub           v28\sz,  v22\sz,  v5\sz   // out12
+        sqadd           v20\sz,  v24\sz,  v4\sz   // out4
+        sqsub           v27\sz,  v24\sz,  v4\sz   // out11
+        sqadd           v21\sz,  v26\sz,  v2\sz   // out5
+        sqsub           v26\sz,  v26\sz,  v2\sz   // out10
        mov             v24\szb, v7\szb
-        mov             v22\szb, v5\szb
+        mov             v22\szb, v3\szb
 .endm

 function inv_dct_8x16_neon
@ -1310,37 +1281,25 @@ endfunc
        sqsub           v23\sz,  v25\sz,  v23\sz // t7
        sqneg           \o3\sz,  \o3\sz          // out3

-        movi            v0.4s,  #2896>>4
+        smull_smlsl     v24, v25, v2,  v21, v0.h[0], v0.h[0], \sz // -> out8 (v24 or v23)
+        smull_smlal     v4,  v5,  v2,  v21, v0.h[0], v0.h[0], \sz // -> out7 (v23 or v24)
+        smull_smlal     v6,  v7,  v26, v3,  v0.h[0], v0.h[0], \sz // -> out5 (v21 or v26)

-        ssubl_sz        v24, v25, v2,  v21, \sz // -> out8 (v24 or v23)
-        saddl_sz        v4,  v5,  v2,  v21, \sz // -> out7 (v23 or v24)
-        saddl_sz        v6,  v7,  v26, v3,  \sz // -> out5 (v21 or v26)
-        ssubl_sz        v2,  v3,  v26, v3,  \sz // -> out10 (v26 or v21)
+        rshrn_sz        v24, v24, v25, #12, \sz // out8
+        rshrn_sz        v4,  v4,  v5,  #12, \sz // out7
+        rshrn_sz        v5,  v6,  v7,  #12, \sz // out5
+        smull_smlsl     v6,  v7,  v26, v3,  v0.h[0], v0.h[0], \sz // -> out10 (v26 or v21)
+        smull_smlal     v2,  v3,  v22, v23, v0.h[0], v0.h[0], \sz // -> out4 (v20 or v27)
+        rshrn_sz        v26, v6,  v7,  #12, \sz // out10

-        mul_4s_sz       v24, v25, v24, v25, v0.s[0], \sz
-        mul_4s_sz       v4,  v5,  v4,  v5,  v0.s[0], \sz
-        mul_4s_sz       v6,  v7,  v6,  v7,  v0.s[0], \sz
-        mul_4s_sz       v2,  v3,  v2,  v3,  v0.s[0], \sz
+        smull_smlsl     v6,  v7,  v22, v23, v0.h[0], v0.h[0], \sz // -> out11 (v27 or v20)
+        smull_smlal     v22, v23, v27, v20, v0.h[0], v0.h[0], \sz // -> out6 (v22 or v25)
+        smull_smlsl     v21, v25, v27, v20, v0.h[0], v0.h[0], \sz // -> out9 (v25 or v22)

-        rshrn_sz        v24, v24, v25, #8,  \sz // out8
-        rshrn_sz        v4,  v4,  v5,  #8,  \sz // out7
-        rshrn_sz        v5,  v6,  v7,  #8,  \sz // out5
-        rshrn_sz        v26, v2,  v3,  #8,  \sz // out10
-
-        saddl_sz        v2,  v3,  v22, v23, \sz // -> out4 (v20 or v27)
-        ssubl_sz        v6,  v7,  v22, v23, \sz // -> out11 (v27 or v20)
-        saddl_sz        v22, v23, v27, v20, \sz // -> out6 (v22 or v25)
-        ssubl_sz        v21, v25, v27, v20, \sz // -> out9 (v25 or v22)
-
-        mul_4s_sz       v2,  v3,  v2,  v3,  v0.s[0], \sz
-        mul_4s_sz       v6,  v7,  v6,  v7,  v0.s[0], \sz
-        mul_4s_sz       v22, v23, v22, v23, v0.s[0], \sz
-        mul_4s_sz       v21, v25, v21, v25, v0.s[0], \sz
-
-        rshrn_sz        \o4, v2,  v3,  #8,  \sz // out4
-        rshrn_sz        v6,  v6,  v7,  #8,  \sz // out11
-        rshrn_sz        v7,  v21, v25, #8,  \sz // out9
-        rshrn_sz        \o6, v22, v23, #8,  \sz // out6
+        rshrn_sz        \o4, v2,  v3,  #12, \sz // out4
+        rshrn_sz        v6,  v6,  v7,  #12, \sz // out11
+        rshrn_sz        v7,  v21, v25, #12, \sz // out9
+        rshrn_sz        \o6, v22, v23, #12, \sz // out6

 .ifc \o8, v23
        mov             \o8\szb,  v24\szb
@ -1915,22 +1874,26 @@ function inv_dct32_odd_8x16_neon
        sqsub           v24.8h,  v24.8h,  v19.8h // t27a
        mov             v19.16b, v4.16b          // out19

-        sub             v20.8h,  v24.8h,  v26.8h    // -> t20
-        add             v4.8h,   v24.8h,  v26.8h    // -> t27
-        sub             v5.8h,   v25.8h,  v27.8h    // -> t21a
-        add             v26.8h,  v25.8h,  v27.8h    // -> t26a
-        sqrdmulh        v20.8h,  v20.8h,  v0.h[1]   // t20 = out20
-        sqrdmulh        v27.8h,  v4.8h,   v0.h[1]   // t27 = out27
-        sub             v22.8h,  v21.8h,  v23.8h    // -> t22
-        add             v25.8h,  v21.8h,  v23.8h    // -> t25
-        sqrdmulh        v21.8h,  v5.8h,   v0.h[1]   // t21a = out21
-        sqrdmulh        v26.8h,  v26.8h,  v0.h[1]   // t26a = out26
-        sub             v23.8h,  v3.8h,   v2.8h     // -> t23a
-        add             v24.8h,  v3.8h,   v2.8h     // -> t24a
-        sqrdmulh        v22.8h,  v22.8h,  v0.h[1]   // t22 = out22
-        sqrdmulh        v25.8h,  v25.8h,  v0.h[1]   // t25 = out25
-        sqrdmulh        v23.8h,  v23.8h,  v0.h[1]   // t23a = out23
-        sqrdmulh        v24.8h,  v24.8h,  v0.h[1]   // t24a = out24
+        smull_smlsl     v4,  v5,  v24, v26, v0.h[0], v0.h[0], .8h // -> t20
+        smull_smlal     v6,  v7,  v24, v26, v0.h[0], v0.h[0], .8h // -> t27
+        rshrn_sz        v20, v4,  v5,  #12, .8h   // t20
+        rshrn_sz        v22, v6,  v7,  #12, .8h   // t27
+
+        smull_smlal     v4,  v5,  v25, v27, v0.h[0], v0.h[0], .8h // -> t26a
+        smull_smlsl     v6,  v7,  v25, v27, v0.h[0], v0.h[0], .8h // -> t21a
+        mov             v27.16b,  v22.16b         // t27
+        rshrn_sz        v26, v4,  v5,  #12, .8h   // t26a
+
+        smull_smlsl     v24, v25, v21, v23, v0.h[0], v0.h[0], .8h // -> t22
+        smull_smlal     v4,  v5,  v21, v23, v0.h[0], v0.h[0], .8h // -> t25
+        rshrn_sz        v21, v6,  v7,  #12, .8h   // t21a
+        rshrn_sz        v22, v24, v25, #12, .8h   // t22
+        rshrn_sz        v25, v4,  v5,  #12, .8h   // t25
+
+        smull_smlsl     v4,  v5,  v3,  v2,  v0.h[0], v0.h[0], .8h // -> t23a
+        smull_smlal     v6,  v7,  v3,  v2,  v0.h[0], v0.h[0], .8h // -> t24a
+        rshrn_sz        v23, v4,  v5,  #12, .8h   // t23a
+        rshrn_sz        v24, v6,  v7,  #12, .8h   // t24a

        ret
 endfunc
--- a/third_party/dav1d/src/arm/64/mc.S
+++ b/third_party/dav1d/src/arm/64/mc.S
@ -2975,7 +2975,9 @@ function warp_filter_horz_neon
        ld1             {v16.8b, v17.8b}, [x2], x3

        load_filter_row d0, w12, w7
+        uxtl            v16.8h,  v16.8b
        load_filter_row d1, w12, w7
+        uxtl            v17.8h,  v17.8b
        load_filter_row d2, w12, w7
        sxtl            v0.8h,   v0.8b
        load_filter_row d3, w12, w7
@ -2988,16 +2990,12 @@ function warp_filter_horz_neon
        sxtl            v4.8h,   v4.8b
        load_filter_row d7, w12, w7
        sxtl            v5.8h,   v5.8b
-        sxtl            v6.8h,   v6.8b
-        sxtl            v7.8h,   v7.8b
-
-        uxtl            v16.8h,  v16.8b
-        uxtl            v17.8h,  v17.8b
-
        ext             v18.16b, v16.16b, v17.16b, #2*1
        mul             v23.8h,  v16.8h,  v0.8h
+        sxtl            v6.8h,   v6.8b
        ext             v19.16b, v16.16b, v17.16b, #2*2
        mul             v18.8h,  v18.8h,  v1.8h
+        sxtl            v7.8h,   v7.8b
        ext             v20.16b, v16.16b, v17.16b, #2*3
        mul             v19.8h,  v19.8h,  v2.8h
        ext             v21.16b, v16.16b, v17.16b, #2*4
@ -3009,28 +3007,20 @@ function warp_filter_horz_neon
        saddlp          v19.4s,  v19.8h
        mul             v22.8h,  v22.8h,  v5.8h
        saddlp          v20.4s,  v20.8h
-        addv            s23,     v23.4s
        saddlp          v21.4s,  v21.8h
-        addv            s18,     v18.4s
        saddlp          v22.4s,  v22.8h
-        addv            s19,     v19.4s
-        trn1            v18.2s,  v23.2s,  v18.2s
-        addv            s20,     v20.4s
+        addp            v18.4s,  v23.4s,  v18.4s
        ext             v23.16b, v16.16b, v17.16b, #2*6
-        trn1            v19.2s,  v19.2s,  v20.2s
-        addv            s21,     v21.4s
+        addp            v19.4s,  v19.4s,  v20.4s
        mul             v23.8h,  v23.8h,  v6.8h
        ext             v20.16b, v16.16b, v17.16b, #2*7
-        addv            s22,     v22.4s
        mul             v20.8h,  v20.8h,  v7.8h
        saddlp          v23.4s,  v23.8h
-        trn1            v21.2s,  v21.2s,  v22.2s
+        addp            v21.4s,  v21.4s,  v22.4s
        saddlp          v20.4s,  v20.8h
-        addv            s23,     v23.4s
-        addv            s20,     v20.4s
-        trn1            v20.2s,  v23.2s,  v20.2s
-        trn1            v18.2d,  v18.2d,  v19.2d
-        trn1            v20.2d,  v21.2d,  v20.2d
+        addp            v20.4s,  v23.4s,  v20.4s
+        addp            v18.4s,  v18.4s,  v19.4s
+        addp            v20.4s,  v21.4s,  v20.4s

        add             w5,  w5,  w8

@ -3047,14 +3037,10 @@ endfunc
 .macro warp t, shift
 function warp_affine_8x8\t\()_8bpc_neon, export=1
        ldr             x4,  [x4]
-        ubfx            x7,  x4, #0,  #16
-        ubfx            x8,  x4, #16, #16
-        ubfx            x9,  x4, #32, #16
-        ubfx            x4,  x4, #48, #16
-        sxth            w7,  w7
-        sxth            w8,  w8
-        sxth            w9,  w9
-        sxth            w4,  w4
+        sbfx            x7,  x4, #0,  #16
+        sbfx            x8,  x4, #16, #16
+        sbfx            x9,  x4, #32, #16
+        sbfx            x4,  x4, #48, #16
        mov             w10, #8
        sub             x2,  x2,  x3, lsl #1
        sub             x2,  x2,  x3
--- a/third_party/dav1d/src/arm/cdef_init_tmpl.c
+++ b/third_party/dav1d/src/arm/cdef_init_tmpl.c
@ -27,7 +27,7 @@
 #include "src/cpu.h"
 #include "src/cdef.h"

-#if BITDEPTH == 8 && ARCH_AARCH64
+#if BITDEPTH == 8
 decl_cdef_dir_fn(dav1d_cdef_find_dir_neon);

 void dav1d_cdef_padding4_neon(uint16_t *tmp, const pixel *src,
@ -58,8 +58,8 @@ cdef_filter_##w##x##h##_neon(pixel *dst,                                     \
                             const int damping,                              \
                             const enum CdefEdgeFlags edges)                 \
 {                                                                            \
-    ALIGN_STK_16(uint16_t, tmp_buf, 12*tmp_stride,);                         \
-    uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2;                            \
+    ALIGN_STK_16(uint16_t, tmp_buf, 12*tmp_stride + 8,);                     \
+    uint16_t *tmp = tmp_buf + 2 * tmp_stride + 8;                            \
    dav1d_cdef_padding##w##_neon(tmp, dst, stride, left, top, h, edges);     \
    dav1d_cdef_filter##w##_neon(dst, stride, tmp, pri_strength,              \
                                sec_strength, dir, damping, h);              \
@ -76,7 +76,7 @@ COLD void bitfn(dav1d_cdef_dsp_init_arm)(Dav1dCdefDSPContext *const c) {

    if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;

-#if BITDEPTH == 8 && ARCH_AARCH64
+#if BITDEPTH == 8
    c->dir = dav1d_cdef_find_dir_neon;
    c->fb[0] = cdef_filter_8x8_neon;
    c->fb[1] = cdef_filter_4x8_neon;
--- a/third_party/dav1d/src/arm/ipred_init_tmpl.c
+++ b/third_party/dav1d/src/arm/ipred_init_tmpl.c
@ -0,0 +1,80 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/ipred.h"
+
+decl_angular_ipred_fn(dav1d_ipred_dc_neon);
+decl_angular_ipred_fn(dav1d_ipred_dc_128_neon);
+decl_angular_ipred_fn(dav1d_ipred_dc_top_neon);
+decl_angular_ipred_fn(dav1d_ipred_dc_left_neon);
+decl_angular_ipred_fn(dav1d_ipred_h_neon);
+decl_angular_ipred_fn(dav1d_ipred_v_neon);
+decl_angular_ipred_fn(dav1d_ipred_paeth_neon);
+decl_angular_ipred_fn(dav1d_ipred_smooth_neon);
+decl_angular_ipred_fn(dav1d_ipred_smooth_v_neon);
+decl_angular_ipred_fn(dav1d_ipred_smooth_h_neon);
+decl_angular_ipred_fn(dav1d_ipred_filter_neon);
+
+decl_cfl_pred_fn(dav1d_ipred_cfl_neon);
+decl_cfl_pred_fn(dav1d_ipred_cfl_128_neon);
+decl_cfl_pred_fn(dav1d_ipred_cfl_top_neon);
+decl_cfl_pred_fn(dav1d_ipred_cfl_left_neon);
+
+decl_cfl_ac_fn(dav1d_ipred_cfl_ac_420_neon);
+decl_cfl_ac_fn(dav1d_ipred_cfl_ac_422_neon);
+
+decl_pal_pred_fn(dav1d_pal_pred_neon);
+
+COLD void bitfn(dav1d_intra_pred_dsp_init_arm)(Dav1dIntraPredDSPContext *const c) {
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; // NEON flag not set: c is left as passed in
+
+#if BITDEPTH == 8 && ARCH_AARCH64 // in any other configuration nothing below is assigned
+    c->intra_pred[DC_PRED]       = dav1d_ipred_dc_neon;
+    c->intra_pred[DC_128_PRED]   = dav1d_ipred_dc_128_neon;
+    c->intra_pred[TOP_DC_PRED]   = dav1d_ipred_dc_top_neon;
+    c->intra_pred[LEFT_DC_PRED]  = dav1d_ipred_dc_left_neon;
+    c->intra_pred[HOR_PRED]      = dav1d_ipred_h_neon;
+    c->intra_pred[VERT_PRED]     = dav1d_ipred_v_neon;
+    c->intra_pred[PAETH_PRED]    = dav1d_ipred_paeth_neon;
+    c->intra_pred[SMOOTH_PRED]   = dav1d_ipred_smooth_neon;
+    c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_neon;
+    c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_neon;
+    c->intra_pred[FILTER_PRED]   = dav1d_ipred_filter_neon;
+
+    c->cfl_pred[DC_PRED]         = dav1d_ipred_cfl_neon;
+    c->cfl_pred[DC_128_PRED]     = dav1d_ipred_cfl_128_neon;
+    c->cfl_pred[TOP_DC_PRED]     = dav1d_ipred_cfl_top_neon;
+    c->cfl_pred[LEFT_DC_PRED]    = dav1d_ipred_cfl_left_neon;
+
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_ipred_cfl_ac_420_neon;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_ipred_cfl_ac_422_neon;
+
+    c->pal_pred                  = dav1d_pal_pred_neon;
+#endif // BITDEPTH == 8 && ARCH_AARCH64
+}
--- a/third_party/dav1d/src/arm/mc_init_tmpl.c
+++ b/third_party/dav1d/src/arm/mc_init_tmpl.c
@ -107,9 +107,7 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
    c->w_mask[0] = dav1d_w_mask_444_8bpc_neon;
    c->w_mask[1] = dav1d_w_mask_422_8bpc_neon;
    c->w_mask[2] = dav1d_w_mask_420_8bpc_neon;
-#if ARCH_AARCH64
    c->warp8x8 = dav1d_warp_affine_8x8_8bpc_neon;
    c->warp8x8t = dav1d_warp_affine_8x8t_8bpc_neon;
 #endif
-#endif
 }
--- a/third_party/dav1d/src/cdef_apply_tmpl.c
+++ b/third_party/dav1d/src/cdef_apply_tmpl.c
@ -33,6 +33,12 @@

 #include "src/cdef_apply.h"

+
+enum Backup2x8Flags {
+    BACKUP_2X8_Y = 1 << 0, // when set, backup2x8() copies the 2x8 columns of src[0]
+    BACKUP_2X8_UV = 1 << 1, // when clear, backup2x8() returns right after the src[0] step
+};
+
 static void backup2lines(pixel *const dst[3][2],
                         /*const*/ pixel *const src[3],
                         const ptrdiff_t src_stride[2], int y_off, int w,
@ -56,13 +62,18 @@ static void backup2lines(pixel *const dst[3][2],
 static void backup2x8(pixel dst[3][8][2],
                      /*const*/ pixel *const src[3],
                      const ptrdiff_t src_stride[2], int x_off,
-                      const enum Dav1dPixelLayout layout)
+                      const enum Dav1dPixelLayout layout,
+                      const enum Backup2x8Flags flag)
 {
    ptrdiff_t y_off = 0;
-    for (int y = 0; y < 8; y++, y_off += PXSTRIDE(src_stride[0]))
-        pixel_copy(dst[0][y], &src[0][y_off + x_off - 2], 2);
+    if (flag & BACKUP_2X8_Y) {
+        for (int y = 0; y < 8; y++, y_off += PXSTRIDE(src_stride[0]))
+            pixel_copy(dst[0][y], &src[0][y_off + x_off - 2], 2);
+    }
+
+    if (layout == DAV1D_PIXEL_LAYOUT_I400 || !(flag & BACKUP_2X8_UV))
+        return;

-    if (layout == DAV1D_PIXEL_LAYOUT_I400) return;
    const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;

@ -98,13 +109,9 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
    const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;

-    // FIXME a design improvement that could be made here is to keep a set of
-    // flags for each block position on whether the block was filtered; if not,
-    // the backup of pre-filter data is empty, and the restore is therefore
-    // unnecessary as well.
-
    for (int bit = 0, by = by_start; by < by_end; by += 2, edges |= CDEF_HAVE_TOP) {
        const int tf = f->lf.top_pre_cdef_toggle;
+        const int by_idx = by & 30;
        if (by + 2 >= f->bh) edges &= ~CDEF_HAVE_BOTTOM;

        if (edges & CDEF_HAVE_BOTTOM) {
@ -117,6 +124,7 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
        pixel *iptrs[3] = { ptrs[0], ptrs[1], ptrs[2] };
        edges &= ~CDEF_HAVE_LEFT;
        edges |= CDEF_HAVE_RIGHT;
+        enum Backup2x8Flags prev_flag = 0;
        for (int sbx = 0, last_skip = 1; sbx < sb64w; sbx++, edges |= CDEF_HAVE_LEFT) {
            const int sb128x = sbx >>1;
            const int sb64_idx = ((by & sbsz) >> 3) + (sbx & 1);
@ -131,6 +139,8 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,

            const int y_lvl = f->frame_hdr->cdef.y_strength[cdef_idx];
            const int uv_lvl = f->frame_hdr->cdef.uv_strength[cdef_idx];
+            const enum Backup2x8Flags flag = !!y_lvl + (!!uv_lvl << 1);
+
            pixel *bptrs[3] = { iptrs[0], iptrs[1], iptrs[2] };
            for (int bx = sbx * sbsz; bx < imin((sbx + 1) * sbsz, f->bw);
                 bx += 2, edges |= CDEF_HAVE_LEFT)
@ -140,22 +150,23 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
                // check if this 8x8 block had any coded coefficients; if not,
                // go to the next block
                const unsigned bx_mask = 3U << (bx & 14);
-                const int by_idx = by & 30, bx_idx = (bx & 16) >> 4;
+                const int bx_idx = (bx & 16) >> 4;
                if (!((lflvl[sb128x].noskip_mask[by_idx + 0][bx_idx] |
                       lflvl[sb128x].noskip_mask[by_idx + 1][bx_idx]) & bx_mask))
                {
                    last_skip = 1;
                    goto next_b;
                }
-
-                if (last_skip && edges & CDEF_HAVE_LEFT) {
+                const int do_left = last_skip ? flag : (prev_flag ^ flag) & flag;
+                prev_flag = flag;
+                if (do_left && edges & CDEF_HAVE_LEFT) {
                    // we didn't backup the prefilter data because it wasn't
                    // there, so do it here instead
-                    backup2x8(lr_bak[bit], bptrs, f->cur.stride, 0, layout);
+                    backup2x8(lr_bak[bit], bptrs, f->cur.stride, 0, layout, do_left);
                }
                if (edges & CDEF_HAVE_RIGHT) {
                    // backup pre-filter data for next iteration
-                    backup2x8(lr_bak[!bit], bptrs, f->cur.stride, 8, layout);
+                    backup2x8(lr_bak[!bit], bptrs, f->cur.stride, 8, layout, flag);
                }

                // the actual filter
--- a/third_party/dav1d/src/decode.c
+++ b/third_party/dav1d/src/decode.c
@ -1176,14 +1176,18 @@ static int decode_b(Dav1dTileContext *const t,
            f->bd_fn.recon_b_intra(t, bs, intra_edge_flags, b);
        }

-        dav1d_create_lf_mask_intra(t->lf_mask, f->lf.level, f->b4_stride,
-                                   f->frame_hdr, (const uint8_t (*)[8][2])
-                                   &ts->lflvl[b->seg_id][0][0][0],
-                                   t->bx, t->by, f->w4, f->h4, bs,
-                                   b->tx, b->uvtx, f->cur.p.layout,
-                                   &t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
-                                   has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
-                                   has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
+        if (f->frame_hdr->loopfilter.level_y[0] ||
+            f->frame_hdr->loopfilter.level_y[1])
+        {
+            dav1d_create_lf_mask_intra(t->lf_mask, f->lf.level, f->b4_stride,
+                                       (const uint8_t (*)[8][2])
+                                       &ts->lflvl[b->seg_id][0][0][0],
+                                       t->bx, t->by, f->w4, f->h4, bs,
+                                       b->tx, b->uvtx, f->cur.p.layout,
+                                       &t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
+                                       has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
+                                       has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
+        }

        // update contexts
 #define set_ctx(type, dir, diridx, off, mul, rep_macro) \
@ -1859,17 +1863,21 @@ static int decode_b(Dav1dTileContext *const t,
            if (f->bd_fn.recon_b_inter(t, bs, b)) return -1;
        }

-        const int is_globalmv =
-            b->inter_mode == (is_comp ? GLOBALMV_GLOBALMV : GLOBALMV);
-        const uint8_t (*const lf_lvls)[8][2] = (const uint8_t (*)[8][2])
-            &ts->lflvl[b->seg_id][0][b->ref[0] + 1][!is_globalmv];
-        dav1d_create_lf_mask_inter(t->lf_mask, f->lf.level, f->b4_stride,
-                                   f->frame_hdr, lf_lvls, t->bx, t->by,
-                                   f->w4, f->h4, b->skip, bs, b->tx_split,
-                                   b->uvtx, f->cur.p.layout,
-                                   &t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
-                                   has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
-                                   has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
+        if (f->frame_hdr->loopfilter.level_y[0] ||
+            f->frame_hdr->loopfilter.level_y[1])
+        {
+            const int is_globalmv =
+                b->inter_mode == (is_comp ? GLOBALMV_GLOBALMV : GLOBALMV);
+            const uint8_t (*const lf_lvls)[8][2] = (const uint8_t (*)[8][2])
+                &ts->lflvl[b->seg_id][0][b->ref[0] + 1][!is_globalmv];
+            dav1d_create_lf_mask_inter(t->lf_mask, f->lf.level, f->b4_stride,
+                                       lf_lvls, t->bx, t->by, f->w4, f->h4,
+                                       b->skip, bs, b->tx_split, b->uvtx,
+                                       f->cur.p.layout,
+                                       &t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
+                                       has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
+                                       has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
+        }

        // context updates
        if (is_comp) {
@ -2339,7 +2347,7 @@ static void setup_tile(Dav1dTileState *const ts,
                   ((ts->tiling.col_start & 16) >> 4);
    }
    for (int p = 0; p < 3; p++) {
-        if (f->frame_hdr->restoration.type[p] == DAV1D_RESTORATION_NONE)
+        if (!((f->lf.restore_planes >> p) & 1U))
            continue;

        if (f->frame_hdr->super_res.enabled) {
@ -2503,7 +2511,7 @@ int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) {
        }
        // Restoration filter
        for (int p = 0; p < 3; p++) {
-            if (f->frame_hdr->restoration.type[p] == DAV1D_RESTORATION_NONE)
+            if (!((f->lf.restore_planes >> p) & 1U))
                continue;

            const int ss_ver = p && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
@ -2817,6 +2825,10 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
        }
        f->lf.lr_mask_sz = lr_mask_sz;
    }
+    f->lf.restore_planes =
+        ((f->frame_hdr->restoration.type[0] != DAV1D_RESTORATION_NONE) << 0) +
+        ((f->frame_hdr->restoration.type[1] != DAV1D_RESTORATION_NONE) << 1) +
+        ((f->frame_hdr->restoration.type[2] != DAV1D_RESTORATION_NONE) << 2);
    if (f->frame_hdr->loopfilter.sharpness != f->lf.last_sharpness) {
        dav1d_calc_eih(&f->lf.lim_lut, f->frame_hdr->loopfilter.sharpness);
        f->lf.last_sharpness = f->frame_hdr->loopfilter.sharpness;
--- a/third_party/dav1d/src/ext/x86/x86inc.asm
+++ b/third_party/dav1d/src/ext/x86/x86inc.asm
@ -1126,6 +1126,7 @@ INIT_XMM
    %xdefine %%tmp %%f %+ 0
    %ifnum %%tmp
        RESET_MM_PERMUTATION
+        AVX512_MM_PERMUTATION
        %assign %%i 0
        %rep num_mmregs
            %xdefine %%tmp %%f %+ %%i
--- a/third_party/dav1d/src/fg_apply_tmpl.c
+++ b/third_party/dav1d/src/fg_apply_tmpl.c
@ -42,9 +42,12 @@ static void generate_scaling(const int bitdepth,
                             const uint8_t points[][2], const int num,
                             uint8_t scaling[SCALING_SIZE])
 {
+#if BITDEPTH == 8
+    const int shift_x = 0;
+#else
    const int shift_x = bitdepth - 8;
+#endif
    const int scaling_size = 1 << bitdepth;
-    const int pad = 1 << shift_x;

    // Fill up the preceding entries with the initial value
    for (int i = 0; i < points[0][0] << shift_x; i++)
@ -69,9 +72,8 @@ static void generate_scaling(const int bitdepth,
    for (int i = points[num - 1][0] << shift_x; i < scaling_size; i++)
        scaling[i] = points[num - 1][1];

-    if (pad <= 1) return;
-
-    const int rnd = pad >> 1;
+#if BITDEPTH != 8
+    const int pad = 1 << shift_x, rnd = pad >> 1;
    for (int i = 0; i < num - 1; i++) {
        const int bx = points[i][0] << shift_x;
        const int ex = points[i+1][0] << shift_x;
@ -83,6 +85,7 @@ static void generate_scaling(const int bitdepth,
            }
        }
    }
+#endif
 }

 #ifndef UNIT_TEST
--- a/third_party/dav1d/src/film_grain.h
+++ b/third_party/dav1d/src/film_grain.h
@ -51,7 +51,7 @@ typedef decl_generate_grain_y_fn(*generate_grain_y_fn);
 #define decl_generate_grain_uv_fn(name) \
 void (name)(entry buf[][GRAIN_WIDTH], \
            const entry buf_y[][GRAIN_WIDTH], \
-            const Dav1dFilmGrainData *const data, const int uv HIGHBD_DECL_SUFFIX)
+            const Dav1dFilmGrainData *const data, const intptr_t uv HIGHBD_DECL_SUFFIX)
 typedef decl_generate_grain_uv_fn(*generate_grain_uv_fn);

 #define decl_fgy_32x32xn_fn(name) \
--- a/third_party/dav1d/src/film_grain_tmpl.c
+++ b/third_party/dav1d/src/film_grain_tmpl.c
@ -88,7 +88,7 @@ static void generate_grain_y_c(entry buf[][GRAIN_WIDTH],
 static NOINLINE void
 generate_grain_uv_c(entry buf[][GRAIN_WIDTH],
                    const entry buf_y[][GRAIN_WIDTH],
-                    const Dav1dFilmGrainData *const data, const int uv,
+                    const Dav1dFilmGrainData *const data, const intptr_t uv,
                    const int subx, const int suby HIGHBD_DECL_SUFFIX)
 {
    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
@ -156,8 +156,8 @@ gnuv_ss_fn(444, 0, 0);
 // samples from the correct block of a grain LUT, while taking into account the
 // offsets provided by the offsets cache
 static inline entry sample_lut(const entry grain_lut[][GRAIN_WIDTH],
-                               int offsets[2][2], int subx, int suby,
-                               int bx, int by, int x, int y)
+                               const int offsets[2][2], const int subx, const int suby,
+                               const int bx, const int by, const int x, const int y)
 {
    const int randval = offsets[bx][by];
    const int offx = 3 + (2 >> subx) * (3 + (randval >> 4));
--- a/third_party/dav1d/src/internal.h
+++ b/third_party/dav1d/src/internal.h
@ -228,6 +228,7 @@ struct Dav1dFrameContext {
        int tile_row; // for carry-over at tile row edges
        pixel *p[3], *sr_p[3];
        Av1Filter *mask_ptr, *prev_mask_ptr;
+        int restore_planes; // enum LrRestorePlanes
    } lf;

    // threading (refer to tc[] for per-thread things)
--- a/third_party/dav1d/src/ipred.h
+++ b/third_party/dav1d/src/ipred.h
@ -89,6 +89,7 @@ typedef struct Dav1dIntraPredDSPContext {
 } Dav1dIntraPredDSPContext;

 bitfn_decls(void dav1d_intra_pred_dsp_init, Dav1dIntraPredDSPContext *c);
+bitfn_decls(void dav1d_intra_pred_dsp_init_arm, Dav1dIntraPredDSPContext *c);
 bitfn_decls(void dav1d_intra_pred_dsp_init_x86, Dav1dIntraPredDSPContext *c);

 #endif /* DAV1D_SRC_IPRED_H */
--- a/third_party/dav1d/src/ipred_tmpl.c
+++ b/third_party/dav1d/src/ipred_tmpl.c
@ -324,44 +324,37 @@ static void ipred_smooth_h_c(pixel *dst, const ptrdiff_t stride,
    }
 }

-static int get_filter_strength(const unsigned blk_wh, const unsigned d,
-                               const int type)
-{
-    int strength = 0;
-
-    if (type == 0) {
-        if (blk_wh <= 8) {
-            if (d >= 56) strength = 1;
-        } else if (blk_wh <= 12) {
-            if (d >= 40) strength = 1;
-        } else if (blk_wh <= 16) {
-            if (d >= 40) strength = 1;
-        } else if (blk_wh <= 24) {
-            if (d >= 8) strength = 1;
-            if (d >= 16) strength = 2;
-            if (d >= 32) strength = 3;
-        } else if (blk_wh <= 32) {
-            if (d >= 1) strength = 1;
-            if (d >= 4) strength = 2;
-            if (d >= 32) strength = 3;
+static int get_filter_strength(const int wh, const int angle, const int is_sm) {
+    if (is_sm) { // result is a strength in 0..3; this branch holds the is_sm thresholds
+        if (wh <= 8) {
+            if (angle >= 64) return 2; // thresholds are tested from high to low: first hit wins
+            if (angle >= 40) return 1;
+        } else if (wh <= 16) {
+            if (angle >= 48) return 2;
+            if (angle >= 20) return 1;
+        } else if (wh <= 24) {
+            if (angle >=  4) return 3;
        } else {
-            if (d >= 1) strength = 3;
+            return 3; // wh > 24: always 3, even for angle == 0
        }
    } else {
-        if (blk_wh <= 8) {
-            if (d >= 40) strength = 1;
-            if (d >= 64) strength = 2;
-        } else if (blk_wh <= 16) {
-            if (d >= 20) strength = 1;
-            if (d >= 48) strength = 2;
-        } else if (blk_wh <= 24) {
-            if (d >= 4) strength = 3;
+        if (wh <= 8) {
+            if (angle >= 56) return 1;
+        } else if (wh <= 16) {
+            if (angle >= 40) return 1;
+        } else if (wh <= 24) {
+            if (angle >= 32) return 3;
+            if (angle >= 16) return 2;
+            if (angle >=  8) return 1;
+        } else if (wh <= 32) {
+            if (angle >= 32) return 3;
+            if (angle >=  4) return 2;
+            return 1; // angle < 4, including 0 -- NOTE(review): confirm callers never pass angle == 0
        } else {
-            if (d >= 1) strength = 3;
+            return 3; // wh > 32: always 3, even for angle == 0
        }
    }
-
-    return strength;
+    return 0; // no threshold reached: strength 0
 }

 static void filter_edge(pixel *const out, const int sz,
@ -451,12 +444,12 @@ static void ipred_z1_c(pixel *dst, const ptrdiff_t stride,
    for (int y = 0, xpos = dx; y < height;
         y++, dst += PXSTRIDE(stride), xpos += dx)
    {
-        const int frac = (xpos >> 1) & 0x1F;
+        const int frac = xpos & 0x3E;

        for (int x = 0, base = xpos >> 6; x < width; x++, base += base_inc) {
            if (base < max_base_x) {
-                const int v = top[base] * (32 - frac) + top[base + 1] * frac;
-                dst[x] = iclip_pixel((v + 16) >> 5);
+                const int v = top[base] * (64 - frac) + top[base + 1] * frac;
+                dst[x] = (v + 32) >> 6;
            } else {
                pixel_set(&dst[x], top[max_base_x], width - x);
                break;
@ -518,30 +511,29 @@ static void ipred_z2_c(pixel *dst, const ptrdiff_t stride,
    }
    *topleft = *topleft_in;

-    const int min_base_x = -(1 + upsample_above);
    const int base_inc_x = 1 + upsample_above;
    const pixel *const left = &topleft[-(1 + upsample_left)];
-    const pixel *const top = &topleft[1 + upsample_above];
-    for (int y = 0, xpos = -dx; y < height;
+    for (int y = 0, xpos = ((1 + upsample_above) << 6) - dx; y < height;
         y++, xpos -= dx, dst += PXSTRIDE(stride))
    {
        int base_x = xpos >> 6;
-        const int frac_x = (xpos >> 1) & 0x1F;
+        const int frac_x = xpos & 0x3E;

        for (int x = 0, ypos = (y << (6 + upsample_left)) - dy; x < width;
             x++, base_x += base_inc_x, ypos -= dy)
        {
            int v;
-
-            if (base_x >= min_base_x) {
-                v = top[base_x] * (32 - frac_x) + top[base_x + 1] * frac_x;
+            if (base_x >= 0) {
+                v = topleft[base_x] * (64 - frac_x) +
+                    topleft[base_x + 1] * frac_x;
            } else {
                const int base_y = ypos >> 6;
                assert(base_y >= -(1 + upsample_left));
-                const int frac_y = (ypos >> 1) & 0x1F;
-                v = left[-base_y] * (32 - frac_y) + left[-(base_y + 1)] * frac_y;
+                const int frac_y = ypos & 0x3E;
+                v = left[-base_y] * (64 - frac_y) +
+                    left[-(base_y + 1)] * frac_y;
            }
-            dst[x] = iclip_pixel((v + 16) >> 5);
+            dst[x] = (v + 32) >> 6;
        }
    }
 }
@ -588,13 +580,13 @@ static void ipred_z3_c(pixel *dst, const ptrdiff_t stride,
    }
    const int base_inc = 1 + upsample_left;
    for (int x = 0, ypos = dy; x < width; x++, ypos += dy) {
-        const int frac = (ypos >> 1) & 0x1F;
+        const int frac = ypos & 0x3E;

        for (int y = 0, base = ypos >> 6; y < height; y++, base += base_inc) {
            if (base < max_base_y) {
-                const int v = left[-base] * (32 - frac) +
+                const int v = left[-base] * (64 - frac) +
                              left[-(base + 1)] * frac;
-                dst[y * PXSTRIDE(stride) + x] = iclip_pixel((v + 16) >> 5);
+                dst[y * PXSTRIDE(stride) + x] = (v + 32) >> 6;
            } else {
                do {
                    dst[y * PXSTRIDE(stride) + x] = left[-max_base_y];
@ -605,6 +597,22 @@ static void ipred_z3_c(pixel *dst, const ptrdiff_t stride,
    }
 }

+/* Weighted sum of p0..p6 for one ipred_filter_c() output pixel. Tap offsets
+ * and FLT_INCR differ per arch, presumably to match the table layout. */
+#if ARCH_X86
+#define FILTER(flt_ptr, p0, p1, p2, p3, p4, p5, p6)                        \
+    ((flt_ptr)[ 0] * (p0) + (flt_ptr)[ 1] * (p1) + (flt_ptr)[16] * (p2) + \
+     (flt_ptr)[17] * (p3) + (flt_ptr)[32] * (p4) + (flt_ptr)[33] * (p5) + \
+     (flt_ptr)[48] * (p6))
+#define FLT_INCR 2
+#else
+#define FILTER(flt_ptr, p0, p1, p2, p3, p4, p5, p6)                        \
+    ((flt_ptr)[ 0] * (p0) + (flt_ptr)[ 8] * (p1) + (flt_ptr)[16] * (p2) + \
+     (flt_ptr)[24] * (p3) + (flt_ptr)[32] * (p4) + (flt_ptr)[40] * (p5) + \
+     (flt_ptr)[48] * (p6))
+#define FLT_INCR 1
+#endif
+
 /* Up to 32x32 only */
 static void ipred_filter_c(pixel *dst, const ptrdiff_t stride,
                           const pixel *const topleft_in,
@ -633,11 +641,8 @@ static void ipred_filter_c(pixel *dst, const ptrdiff_t stride,
            const int8_t *flt_ptr = filter;

            for (int yy = 0; yy < 2; yy++) {
-                for (int xx = 0; xx < 4; xx++, flt_ptr += 2) {
-                    int acc = flt_ptr[ 0] * p0 + flt_ptr[ 1] * p1 +
-                              flt_ptr[16] * p2 + flt_ptr[17] * p3 +
-                              flt_ptr[32] * p4 + flt_ptr[33] * p5 +
-                              flt_ptr[48] * p6;
+                for (int xx = 0; xx < 4; xx++, flt_ptr += FLT_INCR) {
+                    int acc = FILTER(flt_ptr, p0, p1, p2, p3, p4, p5, p6);
                    ptr[xx] = iclip_pixel((acc + 8) >> 4);
                }
                ptr += PXSTRIDE(stride);
@ -751,7 +756,11 @@ COLD void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) {

    c->pal_pred = pal_pred_c;

-#if HAVE_ASM && ARCH_X86
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+    bitfn(dav1d_intra_pred_dsp_init_arm)(c);
+#elif ARCH_X86
    bitfn(dav1d_intra_pred_dsp_init_x86)(c);
 #endif
+#endif
 }
--- a/third_party/dav1d/src/lf_mask.c
+++ b/third_party/dav1d/src/lf_mask.c
@ -286,7 +286,6 @@ static inline void mask_edges_chroma(uint16_t (*const masks)[32][2][2],
 void dav1d_create_lf_mask_intra(Av1Filter *const lflvl,
                                uint8_t (*const level_cache)[4],
                                const ptrdiff_t b4_stride,
-                                const Dav1dFrameHeader *const hdr,
                                const uint8_t (*filter_level)[8][2],
                                const int bx, const int by,
                                const int iw, const int ih,
@ -297,9 +296,6 @@ void dav1d_create_lf_mask_intra(Av1Filter *const lflvl,
                                uint8_t *const ay, uint8_t *const ly,
                                uint8_t *const auv, uint8_t *const luv)
 {
-    if (!hdr->loopfilter.level_y[0] && !hdr->loopfilter.level_y[1])
-        return;
-
    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
    const int bw4 = imin(iw - bx, b_dim[0]);
    const int bh4 = imin(ih - by, b_dim[1]);
@ -350,7 +346,6 @@ void dav1d_create_lf_mask_intra(Av1Filter *const lflvl,
 void dav1d_create_lf_mask_inter(Av1Filter *const lflvl,
                                uint8_t (*const level_cache)[4],
                                const ptrdiff_t b4_stride,
-                                const Dav1dFrameHeader *const hdr,
                                const uint8_t (*filter_level)[8][2],
                                const int bx, const int by,
                                const int iw, const int ih,
@ -361,9 +356,6 @@ void dav1d_create_lf_mask_inter(Av1Filter *const lflvl,
                                uint8_t *const ay, uint8_t *const ly,
                                uint8_t *const auv, uint8_t *const luv)
 {
-    if (!hdr->loopfilter.level_y[0] && !hdr->loopfilter.level_y[1])
-        return;
-
    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
    const int bw4 = imin(iw - bx, b_dim[0]);
    const int bh4 = imin(ih - by, b_dim[1]);
--- a/third_party/dav1d/src/lf_mask.h
+++ b/third_party/dav1d/src/lf_mask.h
@ -63,7 +63,6 @@ typedef struct Av1Restoration {

 void dav1d_create_lf_mask_intra(Av1Filter *lflvl, uint8_t (*level_cache)[4],
                                const ptrdiff_t b4_stride,
-                                const Dav1dFrameHeader *hdr,
                                const uint8_t (*level)[8][2], int bx, int by,
                                int iw, int ih, enum BlockSize bs,
                                enum RectTxfmSize ytx, enum RectTxfmSize uvtx,
@ -71,7 +70,6 @@ void dav1d_create_lf_mask_intra(Av1Filter *lflvl, uint8_t (*level_cache)[4],
                                uint8_t *ly, uint8_t *auv, uint8_t *luv);
 void dav1d_create_lf_mask_inter(Av1Filter *lflvl, uint8_t (*level_cache)[4],
                                const ptrdiff_t b4_stride,
-                                const Dav1dFrameHeader *hdr,
                                const uint8_t (*level)[8][2], int bx, int by,
                                int iw, int ih, int skip_inter,
                                enum BlockSize bs, const uint16_t *tx_mask,
--- a/third_party/dav1d/src/looprestoration.h
+++ b/third_party/dav1d/src/looprestoration.h
@ -75,5 +75,6 @@ typedef struct Dav1dLoopRestorationDSPContext {
 bitfn_decls(void dav1d_loop_restoration_dsp_init, Dav1dLoopRestorationDSPContext *c);
 bitfn_decls(void dav1d_loop_restoration_dsp_init_arm, Dav1dLoopRestorationDSPContext *c);
 bitfn_decls(void dav1d_loop_restoration_dsp_init_x86, Dav1dLoopRestorationDSPContext *c);
+bitfn_decls(void dav1d_loop_restoration_dsp_init_ppc, Dav1dLoopRestorationDSPContext *c);

 #endif /* DAV1D_SRC_LOOPRESTORATION_H */
--- a/third_party/dav1d/src/looprestoration_tmpl.c
+++ b/third_party/dav1d/src/looprestoration_tmpl.c
@ -580,6 +580,8 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext
 #if HAVE_ASM
 #if ARCH_AARCH64 || ARCH_ARM
    bitfn(dav1d_loop_restoration_dsp_init_arm)(c);
+#elif ARCH_PPC64LE
+    bitfn(dav1d_loop_restoration_dsp_init_ppc)(c);
 #elif ARCH_X86
    bitfn(dav1d_loop_restoration_dsp_init_x86)(c);
 #endif
--- a/third_party/dav1d/src/lr_apply_tmpl.c
+++ b/third_party/dav1d/src/lr_apply_tmpl.c
@ -112,10 +112,7 @@ void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
    const ptrdiff_t lr_stride = ((f->sr_cur.p.p.w + 31) & ~31) * sizeof(pixel);

    // TODO Also check block level restore type to reduce copying.
-    const int restore_planes =
-        ((f->frame_hdr->restoration.type[0] != DAV1D_RESTORATION_NONE) << 0) +
-        ((f->frame_hdr->restoration.type[1] != DAV1D_RESTORATION_NONE) << 1) +
-        ((f->frame_hdr->restoration.type[2] != DAV1D_RESTORATION_NONE) << 2);
+    const int restore_planes = f->lf.restore_planes;

    if (restore_planes & LR_RESTORE_Y) {
        const int h = f->cur.p.h;
@ -180,12 +177,8 @@ static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
    }

    while (y + stripe_h <= row_h) {
-        // TODO Look into getting rid of the this if
-        if (y + stripe_h == row_h) {
-            edges &= ~LR_HAVE_BOTTOM;
-        } else {
-            edges |= LR_HAVE_BOTTOM;
-        }
+        // Change HAVE_BOTTOM bit in edges to (y + stripe_h != row_h)
+        edges ^= (-(y + stripe_h != row_h) ^ edges) & LR_HAVE_BOTTOM;
        if (lr->type == DAV1D_RESTORATION_WIENER) {
            dsp->lr.wiener(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
                           filterh, filterv, edges HIGHBD_CALL_SUFFIX);
@ -239,8 +232,7 @@ static void lr_sbrow(const Dav1dFrameContext *const f, pixel *p, const int y,
    const int shift_hor = 7 - ss_hor;

    pixel pre_lr_border[2][128 + 8 /* maximum sbrow height is 128 + 8 rows offset */][4];
-
-    int unit_w = unit_size, bit = 0;
+    const Av1RestorationUnit *lr[2];

    enum LrEdgeFlags edges = (y > 0 ? LR_HAVE_TOP : 0) | LR_HAVE_RIGHT |
                             (row_h < h ? LR_HAVE_BOTTOM : 0);
@ -251,26 +243,27 @@ static void lr_sbrow(const Dav1dFrameContext *const f, pixel *p, const int y,
    aligned_unit_pos <<= ss_ver;
    const int sb_idx = (aligned_unit_pos >> 7) * f->sr_sb128w;
    const int unit_idx = ((aligned_unit_pos >> 6) & 1) << 1;
-    for (int x = 0; x < w; x += unit_w, edges |= LR_HAVE_LEFT, bit ^= 1) {
-        if (x + max_unit_size > w) {
-            unit_w = w - x;
-            edges &= ~LR_HAVE_RIGHT;
-        }
-
-        // Based on the position of the restoration unit, find the corresponding
-        // AV1Filter unit.
-        const int u_idx = unit_idx + ((x >> (shift_hor - 1)) & 1);
-        const Av1RestorationUnit *const lr =
-            &f->lf.lr_mask[sb_idx + (x >> shift_hor)].lr[plane][u_idx];
-
-        // FIXME Don't backup if the next restoration unit is RESTORE_NONE
-        if (edges & LR_HAVE_RIGHT) {
-            backup4xU(pre_lr_border[bit], p + unit_w - 4, p_stride, row_h - y);
-        }
-        if (lr->type != DAV1D_RESTORATION_NONE) {
-            lr_stripe(f, p, pre_lr_border[!bit], x, y, plane, unit_w, row_h, lr, edges);
-        }
-        p += unit_w;
+    lr[0] = &f->lf.lr_mask[sb_idx].lr[plane][unit_idx];
+    int restore = lr[0]->type != DAV1D_RESTORATION_NONE;
+    int x = 0, bit = 0;
+    for (; x + max_unit_size <= w; p += unit_size, edges |= LR_HAVE_LEFT, bit ^= 1) {
+        const int next_x = x + unit_size;
+        const int next_u_idx = unit_idx + ((next_x >> (shift_hor - 1)) & 1);
+        lr[!bit] =
+            &f->lf.lr_mask[sb_idx + (next_x >> shift_hor)].lr[plane][next_u_idx];
+        const int restore_next = lr[!bit]->type != DAV1D_RESTORATION_NONE;
+        if (restore_next)
+            backup4xU(pre_lr_border[bit], p + unit_size - 4, p_stride, row_h - y);
+        if (restore)
+            lr_stripe(f, p, pre_lr_border[!bit], x, y, plane, unit_size, row_h,
+                      lr[bit], edges);
+        x = next_x;
+        restore = restore_next;
+    }
+    if (restore) {
+        edges &= ~LR_HAVE_RIGHT;
+        const int unit_w = w - x;
+        lr_stripe(f, p, pre_lr_border[!bit], x, y, plane, unit_w, row_h, lr[bit], edges);
    }
 }

@ -279,11 +272,7 @@ void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3],
 {
    const int offset_y = 8 * !!sby;
    const ptrdiff_t *const dst_stride = f->sr_cur.p.stride;
-
-    const int restore_planes =
-        ((f->frame_hdr->restoration.type[0] != DAV1D_RESTORATION_NONE) << 0) +
-        ((f->frame_hdr->restoration.type[1] != DAV1D_RESTORATION_NONE) << 1) +
-        ((f->frame_hdr->restoration.type[2] != DAV1D_RESTORATION_NONE) << 2);
+    const int restore_planes = f->lf.restore_planes;

    if (restore_planes & LR_RESTORE_Y) {
        const int h = f->sr_cur.p.p.h;
--- a/third_party/dav1d/src/meson.build
+++ b/third_party/dav1d/src/meson.build
@ -93,6 +93,7 @@ if is_asm_enabled
        )
        libdav1d_tmpl_sources += files(
            'arm/cdef_init_tmpl.c',
+            'arm/ipred_init_tmpl.c',
            'arm/itx_init_tmpl.c',
            'arm/loopfilter_init_tmpl.c',
            'arm/looprestoration_init_tmpl.c',
@ -101,6 +102,7 @@ if is_asm_enabled
        if host_machine.cpu_family() == 'aarch64'
            libdav1d_sources += files(
                'arm/64/cdef.S',
+                'arm/64/ipred.S',
                'arm/64/itx.S',
                'arm/64/loopfilter.S',
                'arm/64/looprestoration.S',
@ -109,6 +111,7 @@ if is_asm_enabled
            )
        elif host_machine.cpu_family().startswith('arm')
            libdav1d_sources += files(
+                'arm/32/cdef.S',
                'arm/32/looprestoration.S',
                'arm/32/mc.S',
            )
@ -167,6 +170,7 @@ if is_asm_enabled
        )
        libdav1d_arch_tmpl_sources += files(
            'ppc/cdef_init_tmpl.c',
+            'ppc/looprestoration_init_tmpl.c',
        )
    endif
 endif
--- a/third_party/dav1d/src/obu.c
+++ b/third_party/dav1d/src/obu.c
@ -1098,6 +1098,8 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
                    const int num_uv_pos = num_y_pos + !!fgd->num_y_points;
                    for (int i = 0; i < num_uv_pos; i++)
                        fgd->ar_coeffs_uv[pl][i] = dav1d_get_bits(gb, 8) - 128;
+                    if (!fgd->num_y_points)
+                        fgd->ar_coeffs_uv[pl][num_uv_pos] = 0;
                }
            fgd->ar_coeff_shift = dav1d_get_bits(gb, 2) + 6;
            fgd->grain_scale_shift = dav1d_get_bits(gb, 2);
--- a/third_party/dav1d/src/ppc/looprestoration_init_tmpl.c
+++ b/third_party/dav1d/src/ppc/looprestoration_init_tmpl.c
@ -0,0 +1,350 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Michail Alvanos
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "common/intops.h"
+#include "src/ppc/types.h"
+#include "src/cpu.h"
+#include "src/looprestoration.h"
+
+#if BITDEPTH == 8
+
+#define REST_UNIT_STRIDE (400)
+
+// Clamps every lane of v to the range [minv, maxv] (lane-wise max, then min).
+static inline i32x4 iclip_vec(i32x4 v, const i32x4 minv, const i32x4 maxv) {
+    const i32x4 floored = vec_max(minv, v);
+    return vec_min(maxv, floored);
+}
+
+/* Accumulates v * f into ssum1/ssum2 as 16-bit lanes. u8h_to_u16/u8l_to_u16
+ * presumably widen the two halves of v (see src/ppc/types.h). */
+#define APPLY_FILTER_H(v, f, ssum1, ssum2) do {          \
+    ssum1 = vec_madd((i16x8) u8h_to_u16(v), f, ssum1);   \
+    ssum2 = vec_madd((i16x8) u8l_to_u16(v), f, ssum2);   \
+} while (0)
+
+static void wiener_filter_h_vsx(int32_t *hor_ptr,          // out: horizontally filtered rows
+                                uint8_t *tmp_ptr,          // in: source pixel rows
+                                const int16_t filterh[7],  // the seven horizontal taps
+                                const int w, const int h)  // h + 6 rows are processed
+{
+    static const i32x4 zerov = vec_splats(0);
+    static const i32x4 seven_vec = vec_splats(7);
+    static const i32x4 bitdepth_added_vec = vec_splats(1 << 14);
+    static const i32x4 round_bits_vec = vec_splats(3);
+    static const i32x4 rounding_off_vec = vec_splats(1<<2);
+    static const i32x4 clip_limit_v = vec_splats((1 << 13) - 1);
+    // NOTE(review): loads 8 lanes from filterh[7]; lane 7 is never used below.
+    i16x8 filterhvall = vec_vsx_ld(0, filterh);
+    i16x8 filterhv0 =  vec_splat( filterhvall, 0);
+    i16x8 filterhv1 =  vec_splat( filterhvall, 1);
+    i16x8 filterhv2 =  vec_splat( filterhvall, 2);
+    i16x8 filterhv3 =  vec_splat( filterhvall, 3);
+    i16x8 filterhv4 =  vec_splat( filterhvall, 4);
+    i16x8 filterhv5 =  vec_splat( filterhvall, 5);
+    i16x8 filterhv6 =  vec_splat( filterhvall, 6);
+    // Both buffers advance by REST_UNIT_STRIDE elements per row.
+    for (int j = 0; j < h + 6; j++) { // presumably 3 padding rows above and below for the vertical pass
+        for (int i = 0; i < w; i+=16) { // 16 pixels per step; the last step may run past w
+            i32x4 sum1 = bitdepth_added_vec;
+            i32x4 sum2 = bitdepth_added_vec;
+            i32x4 sum3 = bitdepth_added_vec;
+            i32x4 sum4 = bitdepth_added_vec;
+            // Two adjacent 16-byte loads; tmp_v1..6 are byte-shifted windows of them.
+            u8x16 tmp_v0 = vec_ld(0, &tmp_ptr[i]);
+            u8x16 tmp_v7 = vec_ld(0, &tmp_ptr[i+16]);
+
+            u8x16 tmp_v1 = vec_sld( tmp_v7, tmp_v0, 15);
+            u8x16 tmp_v2 = vec_sld( tmp_v7, tmp_v0, 14);
+            u8x16 tmp_v3 = vec_sld( tmp_v7, tmp_v0, 13);
+            u8x16 tmp_v4 = vec_sld( tmp_v7, tmp_v0, 12);
+            u8x16 tmp_v5 = vec_sld( tmp_v7, tmp_v0, 11);
+            u8x16 tmp_v6 = vec_sld( tmp_v7, tmp_v0, 10);
+            // tmp_v3 (input of the middle tap, filterhv3) is also converted to four i32x4.
+            u16x8 tmp_u16_high = u8h_to_u16(tmp_v3);
+            u16x8 tmp_u16_low  = u8l_to_u16(tmp_v3);
+
+            i32x4 tmp_expanded1 = i16h_to_i32(tmp_u16_high);
+            i32x4 tmp_expanded2 = i16l_to_i32(tmp_u16_high);
+            i32x4 tmp_expanded3 = i16h_to_i32(tmp_u16_low);
+            i32x4 tmp_expanded4 = i16l_to_i32(tmp_u16_low);
+
+            i16x8 ssum1 = (i16x8) zerov;
+            i16x8 ssum2 = (i16x8) zerov;
+            // Multiply-accumulate the seven taps in 16-bit lanes.
+            APPLY_FILTER_H(tmp_v0, filterhv0, ssum1, ssum2);
+            APPLY_FILTER_H(tmp_v1, filterhv1, ssum1, ssum2);
+            APPLY_FILTER_H(tmp_v2, filterhv2, ssum1, ssum2);
+            APPLY_FILTER_H(tmp_v3, filterhv3, ssum1, ssum2);
+            APPLY_FILTER_H(tmp_v4, filterhv4, ssum1, ssum2);
+            APPLY_FILTER_H(tmp_v5, filterhv5, ssum1, ssum2);
+            APPLY_FILTER_H(tmp_v6, filterhv6, ssum1, ssum2);
+            // Add the 16-bit sums and (middle input << 7) to the 1 << 14 bias.
+            sum1 += i16h_to_i32(ssum1) + (tmp_expanded1 << seven_vec);
+            sum2 += i16l_to_i32(ssum1) + (tmp_expanded2 << seven_vec);
+            sum3 += i16h_to_i32(ssum2) + (tmp_expanded3 << seven_vec);
+            sum4 += i16l_to_i32(ssum2) + (tmp_expanded4 << seven_vec);
+            // Round: (sum + 4) >> 3.
+            sum1 = (sum1 + rounding_off_vec) >> round_bits_vec;
+            sum2 = (sum2 + rounding_off_vec) >> round_bits_vec;
+            sum3 = (sum3 + rounding_off_vec) >> round_bits_vec;
+            sum4 = (sum4 + rounding_off_vec) >> round_bits_vec;
+            // Clamp to [0, (1 << 13) - 1].
+            sum1 = iclip_vec(sum1, zerov, clip_limit_v);
+            sum2 = iclip_vec(sum2, zerov, clip_limit_v);
+            sum3 = iclip_vec(sum3, zerov, clip_limit_v);
+            sum4 = iclip_vec(sum4, zerov, clip_limit_v);
+            // Store the 16 int32 results at byte offsets 0, 16, 32 and 48.
+            vec_st(sum1,  0, &hor_ptr[i]);
+            vec_st(sum2, 16, &hor_ptr[i]);
+            vec_st(sum3, 32, &hor_ptr[i]);
+            vec_st(sum4, 48, &hor_ptr[i]);
+        }
+        tmp_ptr += REST_UNIT_STRIDE;
+        hor_ptr += REST_UNIT_STRIDE;
+    }
+}
+
+// Clamps every 16-bit lane of v to the range [0, 255].
+static inline i16x8 iclip_u8_vec(i16x8 v) {
+    static const i16x8 zerov = vec_splats((int16_t)0);
+    static const i16x8 maxv = vec_splats((int16_t)255);
+    const i16x8 non_negative = vec_max(zerov, v);
+    return vec_min(maxv, non_negative);
+}
+
+/* Adds the 16 int32 values at column i of row (j + index) of hor, each
+ * multiplied by f, to sum1..sum4. Uses hor, i, j and sum1..sum4 from the
+ * expansion site. Both arguments are parenthesised so that expressions can be
+ * passed safely. */
+#define APPLY_FILTER_V(index, f) do { \
+    sum1 += vec_ld( 0, &hor[(j + (index)) * REST_UNIT_STRIDE + i]) * (f); \
+    sum2 += vec_ld(16, &hor[(j + (index)) * REST_UNIT_STRIDE + i]) * (f); \
+    sum3 += vec_ld(32, &hor[(j + (index)) * REST_UNIT_STRIDE + i]) * (f); \
+    sum4 += vec_ld(48, &hor[(j + (index)) * REST_UNIT_STRIDE + i]) * (f); \
+} while (0)
+
+/* Vertical 7-tap filter for the 16 pixels at column i of output row j; the
+ * clamped u8 result is assigned to sumpixelv. Uses i, j, filterv0..6 and the
+ * *_vec constants from the expansion site; note that APPLY_FILTER_V reads the
+ * enclosing variable named hor, so pass exactly that as the hor argument. */
+#define LOAD_AND_APPLY_FILTER_V(sumpixelv, hor) do { \
+    i32x4 v_1 = (i32x4) vec_ld( 0, &(hor)[(j + 3) * REST_UNIT_STRIDE + i]); \
+    i32x4 v_2 = (i32x4) vec_ld(16, &(hor)[(j + 3) * REST_UNIT_STRIDE + i]); \
+    i32x4 v_3 = (i32x4) vec_ld(32, &(hor)[(j + 3) * REST_UNIT_STRIDE + i]); \
+    i32x4 v_4 = (i32x4) vec_ld(48, &(hor)[(j + 3) * REST_UNIT_STRIDE + i]); \
+    i32x4 sum1 = -round_offset_vec; \
+    i32x4 sum2 = -round_offset_vec; \
+    i32x4 sum3 = -round_offset_vec; \
+    i32x4 sum4 = -round_offset_vec; \
+    APPLY_FILTER_V(0, filterv0); \
+    APPLY_FILTER_V(1, filterv1); \
+    APPLY_FILTER_V(2, filterv2); \
+    APPLY_FILTER_V(3, filterv3); \
+    APPLY_FILTER_V(4, filterv4); \
+    APPLY_FILTER_V(5, filterv5); \
+    APPLY_FILTER_V(6, filterv6); \
+    sum1 = ((v_1 << seven_vec) + sum1 + rounding_off_vec) >> round_bits_vec; \
+    sum2 = ((v_2 << seven_vec) + sum2 + rounding_off_vec) >> round_bits_vec; \
+    sum3 = ((v_3 << seven_vec) + sum3 + rounding_off_vec) >> round_bits_vec; \
+    sum4 = ((v_4 << seven_vec) + sum4 + rounding_off_vec) >> round_bits_vec; \
+    i16x8 sum_short_packed_1 = (i16x8) vec_pack( sum1, sum2 ); \
+    i16x8 sum_short_packed_2 = (i16x8) vec_pack( sum3, sum4 ); \
+    sum_short_packed_1 = iclip_u8_vec(sum_short_packed_1); \
+    sum_short_packed_2 = iclip_u8_vec(sum_short_packed_2); \
+    (sumpixelv) = (u8x16) vec_pack(sum_short_packed_1, sum_short_packed_2 ); \
+} while (0)
+
+static inline void wiener_filter_v_vsx(uint8_t *p,
+                                       const ptrdiff_t p_stride,
+                                       const int32_t *hor,
+                                       const int16_t filterv[7],
+                                       const int w, const int h)
+{
+    // Vertical pass: filters the rows of hor and writes w x h pixels to p.
+    // The constants and filterv0..6 below are read by LOAD_AND_APPLY_FILTER_V.
+    static const i32x4 round_bits_vec = vec_splats(11);
+    static const i32x4 rounding_off_vec = vec_splats(1 << 10);
+    static const i32x4 round_offset_vec = vec_splats(1 << 18);
+    static const i32x4 seven_vec = vec_splats(7);
+
+    const i32x4 filterv0 = vec_splats((int32_t) filterv[0]);
+    const i32x4 filterv1 = vec_splats((int32_t) filterv[1]);
+    const i32x4 filterv2 = vec_splats((int32_t) filterv[2]);
+    const i32x4 filterv3 = vec_splats((int32_t) filterv[3]);
+    const i32x4 filterv4 = vec_splats((int32_t) filterv[4]);
+    const i32x4 filterv5 = vec_splats((int32_t) filterv[5]);
+    const i32x4 filterv6 = vec_splats((int32_t) filterv[6]);
+    const int vec_w = w - w % 16; // columns covered by whole 16-pixel vectors
+
+    for (int j = 0; j < h; j++) {
+        uint8_t *const dst_row = &p[j * PXSTRIDE(p_stride)];
+        u8x16 sum_pixel;
+        for (int i = 0; i < vec_w; i += 16) {
+            LOAD_AND_APPLY_FILTER_V(sum_pixel, hor);
+            vec_vsx_st(sum_pixel, 0, &dst_row[i]);
+        }
+        if (!(w & 0xf)) continue;
+
+        // Tail: filter one more full vector into scratch, copy w % 16 pixels.
+        int i = vec_w;
+        ALIGN_STK_16(uint8_t, tmp_out, 16,);
+        LOAD_AND_APPLY_FILTER_V(sum_pixel, hor);
+        vec_vsx_st(sum_pixel, 0, tmp_out);
+        for (int k = 0; i < w; i++, k++)
+            dst_row[i] = tmp_out[k];
+    }
+}
+// Fill dst (rows REST_UNIT_STRIDE apart) with the stripe from p plus a 3-pixel border on every side; border pixels come from lpf/left where the edge flags say they exist and are replicated from the nearest copied pixel otherwise.
+static inline void padding(uint8_t *dst, const uint8_t *p,  // dst: padded output, p: source pixels of the unit
+                           const ptrdiff_t p_stride, const uint8_t (*left)[4],  // left: one entry per row, elements [1..3] are read
+                           const uint8_t *lpf, const ptrdiff_t lpf_stride,  // lpf: rows read when LR_HAVE_TOP / LR_HAVE_BOTTOM is set
+                           int unit_w, const int stripe_h,  // unit size; unit_w is widened locally
+                           const enum LrEdgeFlags edges)  // LR_HAVE_* bit mask
+{
+    const int have_left = !!(edges & LR_HAVE_LEFT);
+    const int have_right = !!(edges & LR_HAVE_RIGHT);
+
+    // Copy more pixels if we don't have to pad them: 3 extra per existing side, starting 3 pixels earlier in p/lpf when the left side exists
+    unit_w += 3 * have_left + 3 * have_right;
+    uint8_t *dst_l = dst + 3 * !have_left;  // first column the row copies write; skips the 3 left columns when they must be synthesized
+    p -= 3 * have_left;
+    lpf -= 3 * have_left;
+
+    if (edges & LR_HAVE_TOP) {
+        // Copy previous loop filtered rows
+        const uint8_t *const above_1 = lpf;
+        const uint8_t *const above_2 = above_1 + PXSTRIDE(lpf_stride);
+        pixel_copy(dst_l, above_1, unit_w);
+        pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w);  // above_1 fills both of the first two rows
+        pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w);
+    } else {
+        // Pad with first row
+        pixel_copy(dst_l, p, unit_w);
+        pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w);
+        pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w);
+        if (have_left) {  // the 3 leftmost pixels of the top rows are then overwritten from left[0]
+            pixel_copy(dst_l, &left[0][1], 3);
+            pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3);
+            pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3);
+        }
+    }
+
+    uint8_t *dst_tl = dst_l + 3 * REST_UNIT_STRIDE;  // first row of the unit itself, below the 3 top rows
+    if (edges & LR_HAVE_BOTTOM) {
+        // Copy next loop filtered rows
+        const uint8_t *const below_1 = lpf + 6 * PXSTRIDE(lpf_stride);  // the "below" rows are lpf rows 6 and 7
+        const uint8_t *const below_2 = below_1 + PXSTRIDE(lpf_stride);
+        pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w);
+        pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w);
+        pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w);  // below_2 fills both of the last two rows
+    } else {
+        // Pad with last row
+        const uint8_t *const src = p + (stripe_h - 1) * PXSTRIDE(p_stride);
+        pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w);
+        pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w);
+        pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w);
+        if (have_left) {  // the 3 leftmost pixels of the bottom rows are then overwritten from the last left[] entry
+            pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+            pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+            pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+        }
+    }
+
+    // Inner UNIT_WxSTRIPE_H; with have_left the 3 leading columns are skipped here and filled from left[] at the end
+    for (int j = 0; j < stripe_h; j++) {
+        pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left);
+        dst_tl += REST_UNIT_STRIDE;
+        p += PXSTRIDE(p_stride);
+    }
+
+    if (!have_right) {
+        uint8_t *pad = dst_l + unit_w;  // first column past the copied pixels
+        uint8_t *row_last = &dst_l[unit_w - 1];  // last copied pixel of the current row
+        // Pad 3x(STRIPE_H+6) with last column
+        for (int j = 0; j < stripe_h + 6; j++) {
+            pixel_set(pad, *row_last, 3);
+            pad += REST_UNIT_STRIDE;
+            row_last += REST_UNIT_STRIDE;
+        }
+    }
+
+    if (!have_left) {
+        // Pad 3x(STRIPE_H+6) with first column
+        for (int j = 0; j < stripe_h + 6; j++) {
+            pixel_set(dst, *dst_l, 3);
+            dst += REST_UNIT_STRIDE;
+            dst_l += REST_UNIT_STRIDE;
+        }
+    } else {
+        dst += 3 * REST_UNIT_STRIDE;  // skip the 3 top rows; their left columns were written by the top-edge code
+        for (int j = 0; j < stripe_h; j++) {
+            pixel_copy(dst, &left[j][1], 3);
+            dst += REST_UNIT_STRIDE;
+        }
+    }
+}
+
+// Wiener-filter one stripe in place (uint8_t pixels, VSX): build a padded copy in tmp, run the horizontal pass into hor, then the vertical pass back into p.
+// FIXME Could split into luma and chroma specific functions,
+// (since first and last taps are always 0 for chroma)
+// FIXME Could implement a version that requires less temporary memory
+// (should be possible to implement with only 6 rows of temp storage)
+static void wiener_filter_vsx(uint8_t *p, const ptrdiff_t p_stride,
+                              const uint8_t (*const left)[4],
+                              const uint8_t *lpf,
+                              const ptrdiff_t lpf_stride,
+                              const int w, const int h,
+                              const int16_t filterh[7],
+                              const int16_t filterv[7],
+                              const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+    // Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
+    // of padding above and below
+    ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
+    padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);  // w/h are passed as unit_w/stripe_h
+    ALIGN_STK_16(int32_t, hor, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE + 64,);  // 32-bit intermediates; the extra 64 entries are presumably slack for whole-vector accesses — TODO confirm
+
+    wiener_filter_h_vsx(hor, tmp, filterh, w, h);
+    wiener_filter_v_vsx(p, p_stride, hor, filterv, w, h);
+
+}
+#endif
+// Hook the VSX loop-restoration code into c; c is left untouched when the CPU flags do not report VSX.
+COLD void bitfn(dav1d_loop_restoration_dsp_init_ppc)
+    (Dav1dLoopRestorationDSPContext *const c)
+{
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_PPC_CPU_FLAG_VSX)) return;
+
+#if BITDEPTH == 8
+    c->wiener = wiener_filter_vsx;  // the only override installed here, and only in the 8-bit build
+#endif
+}
+
+
--- a/third_party/dav1d/src/ppc/types.h
+++ b/third_party/dav1d/src/ppc/types.h
@ -47,6 +47,8 @@
 #define u8h_to_u16(v) ((u16x8) vec_mergeh((u8x16) v, vec_splat_u8(0)))
 #define u8l_to_u16(v) ((u16x8) vec_mergel((u8x16) v, vec_splat_u8(0)))
 #define u16h_to_i32(v) ((i32x4) vec_mergeh((u16x8) v, vec_splat_u16(0)))
+#define i16h_to_i32(v) ((i32x4) vec_unpackh((i16x8)v)) /* signed counterpart of u16h_to_i32: widens with vec_unpackh instead of merging with zeros (presumably sign-extending - confirm against the AltiVec docs) */
 #define u16l_to_i32(v) ((i32x4) vec_mergel((u16x8) v, vec_splat_u16(0)))
+#define i16l_to_i32(v) ((i32x4) vec_unpackl((i16x8)v)) /* signed counterpart of u16l_to_i32, using vec_unpackl */

 #endif /* DAV1D_SRC_PPC_TYPES_H */
--- a/third_party/dav1d/src/recon_tmpl.c
+++ b/third_party/dav1d/src/recon_tmpl.c
@ -1971,7 +1971,7 @@ void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
                                       start_of_tile_row);
    }

-    if (f->seq_hdr->restoration) {
+    if (f->lf.restore_planes) {
        // Store loop filtered pixels required by loop restoration
        bytefn(dav1d_lr_copy_lpf)(f, f->lf.p, sby);
    }
@ -2010,7 +2010,7 @@ void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
                              f->resize_start[!!pl] HIGHBD_CALL_SUFFIX);
        }
    }
-    if (f->seq_hdr->restoration) {
+    if (f->lf.restore_planes) {
        bytefn(dav1d_lr_sbrow)(f, f->lf.sr_p, sby);
    }

--- a/third_party/dav1d/src/tables.c
+++ b/third_party/dav1d/src/tables.c
@ -716,52 +716,65 @@ const uint16_t dav1d_dr_intra_derivative[44] = {
       3        // 87, 177, 267
 };

+#if ARCH_X86 // F(idx, f0..f6): designated initializers scattering the 7 taps of entry idx into one 64-entry row. x86 layout: pairs (f0,f1) (f2,f3) (f4,f5) (f6,unset=0) at 2*idx + 0/16/32/48
+#define F(idx, f0, f1, f2, f3, f4, f5, f6) \
+    [2*idx+0]  = f0, [2*idx+1]  = f1,      \
+    [2*idx+16] = f2, [2*idx+17] = f3,      \
+    [2*idx+32] = f4, [2*idx+33] = f5,      \
+    [2*idx+48] = f6
+#else // other targets: tap k of entry idx is stored at idx + 8*k; slots past idx+48 are never named and stay zero
+#define F(idx, f0, f1, f2, f3, f4, f5, f6) \
+    [1*idx+0]  = f0, [1*idx+8]  = f1,      \
+    [1*idx+16] = f2, [1*idx+24] = f3,      \
+    [1*idx+32] = f4, [1*idx+40] = f5,      \
+    [1*idx+48] = f6
+#endif // ARCH_X86 layout selection for F
 const int8_t ALIGN(dav1d_filter_intra_taps[5][64], 16) = {
    {
-         -6,  10,  -5,   2,  -3,   1,  -3,   1,
-         -4,   6,  -3,   2,  -3,   2,  -3,   1,
-          0,   0,  10,   0,   1,  10,   1,   2,
-          0,   0,   6,   0,   2,   6,   2,   2,
-          0,  12,   0,   9,   0,   7,  10,   5,
-          0,   2,   0,   2,   0,   2,   6,   3,
-          0,   0,   0,   0,   0,   0,   0,   0,
-         12,   0,   9,   0,   7,   0,   5,   0
+        F( 0,  -6, 10,  0,  0,  0, 12,  0 ),
+        F( 1,  -5,  2, 10,  0,  0,  9,  0 ),
+        F( 2,  -3,  1,  1, 10,  0,  7,  0 ),
+        F( 3,  -3,  1,  1,  2, 10,  5,  0 ),
+        F( 4,  -4,  6,  0,  0,  0,  2, 12 ),
+        F( 5,  -3,  2,  6,  0,  0,  2,  9 ),
+        F( 6,  -3,  2,  2,  6,  0,  2,  7 ),
+        F( 7,  -3,  1,  2,  2,  6,  3,  5 ),
    }, {
-        -10,  16,  -6,   0,  -4,   0,  -2,   0,
-        -10,  16,  -6,   0,  -4,   0,  -2,   0,
-          0,   0,  16,   0,   0,  16,   0,   0,
-          0,   0,  16,   0,   0,  16,   0,   0,
-          0,  10,   0,   6,   0,   4,  16,   2,
-          0,   0,   0,   0,   0,   0,  16,   0,
-          0,   0,   0,   0,   0,   0,   0,   0,
-         10,   0,   6,   0,   4,   0,   2,   0
+        F( 0, -10, 16,  0,  0,  0, 10,  0 ),
+        F( 1,  -6,  0, 16,  0,  0,  6,  0 ),
+        F( 2,  -4,  0,  0, 16,  0,  4,  0 ),
+        F( 3,  -2,  0,  0,  0, 16,  2,  0 ),
+        F( 4, -10, 16,  0,  0,  0,  0, 10 ),
+        F( 5,  -6,  0, 16,  0,  0,  0,  6 ),
+        F( 6,  -4,  0,  0, 16,  0,  0,  4 ),
+        F( 7,  -2,  0,  0,  0, 16,  0,  2 ),
    }, {
-         -8,   8,  -8,   0,  -8,   0,  -8,   0,
-         -4,   4,  -4,   0,  -4,   0,  -4,   0,
-          0,   0,   8,   0,   0,   8,   0,   0,
-          0,   0,   4,   0,   0,   4,   0,   0,
-          0,  16,   0,  16,   0,  16,   8,  16,
-          0,   0,   0,   0,   0,   0,   4,   0,
-          0,   0,   0,   0,   0,   0,   0,   0,
-         16,   0,  16,   0,  16,   0,  16,   0
+        F( 0,  -8,  8,  0,  0,  0, 16,  0 ),
+        F( 1,  -8,  0,  8,  0,  0, 16,  0 ),
+        F( 2,  -8,  0,  0,  8,  0, 16,  0 ),
+        F( 3,  -8,  0,  0,  0,  8, 16,  0 ),
+        F( 4,  -4,  4,  0,  0,  0,  0, 16 ),
+        F( 5,  -4,  0,  4,  0,  0,  0, 16 ),
+        F( 6,  -4,  0,  0,  4,  0,  0, 16 ),
+        F( 7,  -4,  0,  0,  0,  4,  0, 16 ),
    }, {
-         -2,   8,  -1,   3,  -1,   2,   0,   1,
-         -1,   4,  -1,   3,  -1,   2,  -1,   2,
-          0,   0,   8,   0,   3,   8,   2,   3,
-          0,   0,   4,   0,   3,   4,   2,   3,
-          0,  10,   0,   6,   0,   4,   8,   2,
-          0,   3,   0,   4,   0,   4,   4,   3,
-          0,   0,   0,   0,   0,   0,   0,   0,
-         10,   0,   6,   0,   4,   0,   3,   0
+        F( 0,  -2,  8,  0,  0,  0, 10,  0 ),
+        F( 1,  -1,  3,  8,  0,  0,  6,  0 ),
+        F( 2,  -1,  2,  3,  8,  0,  4,  0 ),
+        F( 3,   0,  1,  2,  3,  8,  2,  0 ),
+        F( 4,  -1,  4,  0,  0,  0,  3, 10 ),
+        F( 5,  -1,  3,  4,  0,  0,  4,  6 ),
+        F( 6,  -1,  2,  3,  4,  0,  4,  4 ),
+        F( 7,  -1,  2,  2,  3,  4,  3,  3 ),
    }, {
-        -12,  14, -10,   0,  -9,   0,  -8,   0,
-        -10,  12,  -9,   1,  -8,   0,  -7,   0,
-          0,   0,  14,   0,   0,  14,   0,   0,
-          0,   0,  12,   0,   0,  12,   0,   1,
-          0,  14,   0,  12,   0,  11,  14,  10,
-          0,   0,   0,   0,   0,   1,  12,   1,
-          0,   0,   0,   0,   0,   0,   0,   0,
-         14,   0,  12,   0,  11,   0,   9,   0
+        F( 0, -12, 14,  0,  0,  0, 14,  0 ),
+        F( 1, -10,  0, 14,  0,  0, 12,  0 ),
+        F( 2,  -9,  0,  0, 14,  0, 11,  0 ),
+        F( 3,  -8,  0,  0,  0, 14, 10,  0 ),
+        F( 4, -10, 12,  0,  0,  0,  0, 14 ),
+        F( 5,  -9,  1, 12,  0,  0,  0, 12 ),
+        F( 6,  -8,  0,  0, 12,  0,  1, 11 ),
+        F( 7,  -7,  0,  0,  1, 12,  1,  9 ),
    }
 };

--- a/third_party/dav1d/src/x86/cdef_init_tmpl.c
+++ b/third_party/dav1d/src/x86/cdef_init_tmpl.c
@ -31,14 +31,17 @@
 decl_cdef_fn(dav1d_cdef_filter_8x8_avx2);
 decl_cdef_fn(dav1d_cdef_filter_8x8_sse4);
 decl_cdef_fn(dav1d_cdef_filter_8x8_ssse3);
+decl_cdef_fn(dav1d_cdef_filter_8x8_sse2);

 decl_cdef_fn(dav1d_cdef_filter_4x8_avx2);
 decl_cdef_fn(dav1d_cdef_filter_4x8_sse4);
 decl_cdef_fn(dav1d_cdef_filter_4x8_ssse3);
+decl_cdef_fn(dav1d_cdef_filter_4x8_sse2);

 decl_cdef_fn(dav1d_cdef_filter_4x4_avx2);
 decl_cdef_fn(dav1d_cdef_filter_4x4_sse4);
 decl_cdef_fn(dav1d_cdef_filter_4x4_ssse3);
+decl_cdef_fn(dav1d_cdef_filter_4x4_sse2);

 decl_cdef_dir_fn(dav1d_cdef_dir_avx2);
 decl_cdef_dir_fn(dav1d_cdef_dir_sse4);
@ -47,6 +50,14 @@ decl_cdef_dir_fn(dav1d_cdef_dir_ssse3);
 COLD void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
    const unsigned flags = dav1d_get_cpu_flags();

+    if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
+
+#if BITDEPTH == 8
+    c->fb[0] = dav1d_cdef_filter_8x8_sse2;
+    c->fb[1] = dav1d_cdef_filter_4x8_sse2;
+    c->fb[2] = dav1d_cdef_filter_4x4_sse2;
+#endif
+
    if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;

 #if BITDEPTH == 8
--- a/third_party/dav1d/src/x86/cdef_sse.asm
+++ b/third_party/dav1d/src/x86/cdef_sse.asm
@ -32,6 +32,7 @@ SECTION_RODATA 16
 pb_0: times 16 db 0
 pb_0xFF: times 16 db 0xFF
 %endif
+pw_8: times 8 dw 8
 pw_128: times 8 dw 128
 pw_256: times 8 dw 256
 pw_2048: times 8 dw 2048
@ -118,6 +119,36 @@ SECTION .text
 %endif
 %endmacro

+%macro PMOVZXBW 2-3 0 ; dst, src, %3 = half: load 8 bytes (4 when %3 == 1) and interleave them with the bytes of m15
+ %if %3 == 1
+    movd            %1, %2
+ %else
+    movq            %1, %2
+ %endif
+    punpcklbw       %1, m15                     ; byte -> word; a zero-extension only if m15 is all zero (assumed - confirm at callers)
+%endmacro
+
+%macro PSHUFB_0 2 ; dst, mask: pshufb on SSSE3; otherwise broadcast byte 0 of dst (mask ignored), i.e. what pshufb does with an all-zero mask
+ %if cpuflag(ssse3)
+    pshufb          %1, %2
+ %else
+    punpcklbw       %1, %1                      ; b0 b0 b1 b1 ...
+    pshuflw         %1, %1, q0000               ; low 8 bytes = b0
+    punpcklqdq      %1, %1                      ; all 16 bytes = b0
+ %endif
+%endmacro
+
+%macro LOAD_SEC_TAP 0 ; load the dword at [secq+kq] and run PSHUFB_0 on it: result in m3 on x86-64, in m2 (m3 zeroed as mask) otherwise
+ %if ARCH_X86_64
+    movd            m3, [secq+kq]
+    PSHUFB_0        m3, m15
+ %else
+    movd            m2, [secq+kq]             ; sec_taps
+    pxor            m3, m3                    ; zero mask for PSHUFB_0
+    PSHUFB_0        m2, m3
+ %endif
+%endmacro
+
 %macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, stride
    ; load p0/p1
    movsx         offq, byte [dirq+kq+%1]       ; off1
@ -153,13 +184,13 @@ SECTION .text
    pmaxsw          m7, m10                     ; max after p1
    pminsw          m8, m6                      ; min after p1
  %else
-    pcmpeqw         m3, m5, OUT_OF_BOUNDS_MEM
-    pandn           m3, m5
-    pmaxsw          m7, m3                      ; max after p0
+    pcmpeqw         m9, m5, OUT_OF_BOUNDS_MEM
+    pandn           m9, m5
+    pmaxsw          m7, m9                      ; max after p0
    pminsw          m8, m5                      ; min after p0
-    pcmpeqw         m3, m6, OUT_OF_BOUNDS_MEM
-    pandn           m3, m6
-    pmaxsw          m7, m3                      ; max after p1
+    pcmpeqw         m9, m6, OUT_OF_BOUNDS_MEM
+    pandn           m9, m6
+    pmaxsw          m7, m9                      ; max after p1
    pminsw          m8, m6                      ; min after p1
  %endif
 %endif
@ -168,13 +199,24 @@ SECTION .text
    psubw           m5, m4          ; diff_p0(p0 - px)
    psubw           m6, m4          ; diff_p1(p1 - px)
    packsswb        m5, m6          ; convert pixel diff to 8-bit
- %if ARCH_X86_64 && cpuflag(sse4)
+ %if cpuflag(ssse3)
+  %if ARCH_X86_64 && cpuflag(sse4)
    pshufb          m5, m14         ; group diffs p0 and p1 into pairs
- %else
+  %else
    pshufb          m5, [PIC_sym(shufb_lohi)]
- %endif
+  %endif
    pabsb           m6, m5
    psignb          m9, %5, m5
+ %else
+    movlhps         m6, m5
+    punpckhbw       m6, m5
+    pxor            m5, m5
+    pcmpgtb         m5, m6
+    paddb           m6, m5
+    pxor            m6, m5
+    paddb           m9, %5, m5
+    pxor            m9, m5
+ %endif
 %if ARCH_X86_64
    psrlw          m10, m6, %2      ; emulate 8-bit shift
    pand           m10, %3
@ -186,17 +228,18 @@ SECTION .text
    pxor            m5, [PIC_sym(pb_0xFF)]
 %endif
    pminub          m5, m6          ; constrain(diff_p)
+ %if cpuflag(ssse3)
    pmaddubsw       m5, m9          ; constrain(diff_p) * taps
-    paddw          m13, m5
-%endmacro
-
-%macro PMOVZXBW 2-3 0 ; %3 = half
- %if %3 == 1
-    movd            %1, %2
 %else
-    movq            %1, %2
+    psrlw           m2, m5, 8
+    psraw           m6, m9, 8
+    psllw           m5, 8
+    psllw           m9, 8
+    pmullw          m2, m6
+    pmulhw          m5, m9
+    paddw           m5, m2
 %endif
-    punpcklbw       %1, m15
+    paddw          m13, m5
 %endmacro

 %macro LOAD_BODY 4  ; dst, src, block_width, tmp_stride
@ -610,8 +653,8 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*%3, \
 %endif
    movd            m2, [tableq+pridmpq]
    movd            m3, [tableq+secdmpq]
-    pshufb          m2, m15                     ; pri_shift_mask
-    pshufb          m3, m15                     ; sec_shift_mask
+    PSHUFB_0        m2, m15                     ; pri_shift_mask
+    PSHUFB_0        m3, m15                     ; sec_shift_mask
 %if ARCH_X86_64
    SWAP            m2, m11
    SWAP            m3, m12
@ -630,13 +673,15 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*%3, \
    movd            m0, prid
    movd            m1, secd
 %if ARCH_X86_64
-    pshufb          m0, m15
-    pshufb          m1, m15
+    PSHUFB_0        m0, m15
+    PSHUFB_0        m1, m15
 %else
-    mova            m2, m15
+  %if cpuflag(ssse3)
+    pxor            m2, m2
+  %endif
    mova            m3, [PIC_sym(pb_0xFF)]
-    pshufb          m0, m2
-    pshufb          m1, m2
+    PSHUFB_0        m0, m2
+    PSHUFB_0        m1, m2
    pxor            m0, m3
    pxor            m1, m3
    mova    [esp+0x20], m0
@ -687,36 +732,44 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*%3, \
    mova            m7, m4                      ; max
    mova            m8, m4                      ; min
 .k_loop:
- %if ARCH_X86_64
    movd            m2, [priq+kq]               ; pri_taps
-    movd            m3, [secq+kq]               ; sec_taps
-    pshufb          m2, m15
-    pshufb          m3, m15
+ %if ARCH_X86_64
+    PSHUFB_0        m2, m15
+  %if cpuflag(ssse3)
+    LOAD_SEC_TAP                                ; sec_taps
+  %endif
    ACCUMULATE_TAP 0*2, [rsp+ 0], m11, m0, m2, %1, %3
+  %if notcpuflag(ssse3)
+    LOAD_SEC_TAP                                ; sec_taps
+  %endif
    ACCUMULATE_TAP 2*2, [rsp+16], m12, m1, m3, %1, %3
    ACCUMULATE_TAP 6*2, [rsp+16], m12, m1, m3, %1, %3
 %else
-    movd            m2, [priq+kq]             ; pri_taps
-    pshufb          m2, m15
+  %if cpuflag(ssse3)
+    pxor            m3, m3
+  %endif
+    PSHUFB_0        m2, m3
    ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, %3
-
-    movd            m2, [secq+kq]             ; sec_taps
-    pshufb          m2, m15
+    LOAD_SEC_TAP                                ; sec_taps
    ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, %3
+  %if notcpuflag(ssse3)
+    LOAD_SEC_TAP                                ; sec_taps
+  %endif
    ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, %3
 %endif

    dec             kq
    jge .k_loop

- %if cpuflag(sse4)
-    pcmpgtw         m6, m15, m13
- %else
    pxor            m6, m6
    pcmpgtw         m6, m13
- %endif
    paddw          m13, m6
+ %if cpuflag(ssse3)
    pmulhrsw       m13, [PIC_sym(pw_2048)]
+ %else
+    paddw          m13, [PIC_sym(pw_8)]
+    psraw          m13, 4
+ %endif
    paddw           m4, m13
    pminsw          m4, m7
    pmaxsw          m4, m8
@ -1352,3 +1405,8 @@ CDEF_FILTER 8, 8, 32
 CDEF_FILTER 4, 8, 32
 CDEF_FILTER 4, 4, 32
 CDEF_DIR
+
+INIT_XMM sse2
+CDEF_FILTER 8, 8, 32
+CDEF_FILTER 4, 8, 32
+CDEF_FILTER 4, 4, 32
--- a/third_party/dav1d/src/x86/film_grain.asm
+++ b/third_party/dav1d/src/x86/film_grain.asm
@ -32,6 +32,8 @@ pw_1024: times 16 dw 1024
 pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
 rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
 byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
+pw_seed_xor: times 2 dw 0xb524
+             times 2 dw 0x49d8
 pd_m65536: dd ~0xffff
 pb_23_22: times 2 db 23, 22
 pb_1: times 4 db 1
@ -55,6 +57,7 @@ pb_27_17_17_27: db 27, 17, 17, 27
 %endmacro

 JMP_TABLE generate_grain_y_avx2, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_420_avx2, 0, 1, 2, 3

 struc FGData
    .seed:                      resd 1
@ -409,6 +412,443 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
    jg .y_loop_ar3
    RET

+INIT_XMM avx2
+cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
+    lea              r4, [pb_mask]
+%define base r4-pb_mask
+    movq            xm1, [base+rnd_next_upperbit_mask]
+    movq            xm4, [base+mul_bits]
+    movq            xm7, [base+hmul_bits]
+    mov             r5d, [fg_dataq+FGData.grain_scale_shift]
+    vpbroadcastw    xm8, [base+round+r5*2]
+    mova            xm5, [base+pb_mask]
+    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
+    vpbroadcastw    xm9, [base+pw_seed_xor+uvq*4]
+    pxor            xm0, xm9
+    vpbroadcastd    xm9, [base+pd_m65536]
+    lea              r6, [gaussian_sequence]
+    mov             r7d, 38
+    add            bufq, 44
+.loop_y:
+    mov              r5, -44
+.loop_x:
+    pand            xm2, xm0, xm1
+    psrlw           xm3, xm2, 10
+    por             xm2, xm3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+    pmullw          xm2, xm4            ; bits 0x0f00 are set
+    pshufb          xm2, xm5, xm2       ; set 15th bit for next 4 seeds
+    psllq           xm6, xm2, 30
+    por             xm2, xm6
+    psllq           xm6, xm2, 15
+    por             xm2, xm6            ; aggregate each bit into next seed's high bit
+    pmulhuw         xm3, xm0, xm7
+    por             xm2, xm3            ; 4 next output seeds
+    pshuflw         xm0, xm2, q3333
+    psrlw           xm2, 5
+    pmovzxwd        xm3, xm2
+    mova            xm6, xm9
+    vpgatherdd      xm2, [r6+xm3*2], xm6
+    pandn           xm2, xm9, xm2
+    packusdw        xm2, xm2
+    pmulhrsw        xm2, xm8
+    packsswb        xm2, xm2
+    movd      [bufq+r5], xm2
+    add              r5, 4
+    jl .loop_x
+    add            bufq, 82
+    dec             r7d
+    jg .loop_y
+
+    ; auto-regression code
+    movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]
+    movsxd           r5, [base+generate_grain_uv_420_avx2_table+r5*4]
+    lea              r5, [r5+base+generate_grain_uv_420_avx2_table]
+    jmp              r5
+
+.ar0:
+    INIT_YMM avx2
+    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+    imul            uvd, 25
+    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
+    movd            xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
+    movd            xm3, [base+hmul_bits+shiftq*2]
+    DEFINE_ARGS buf, bufy, h
+    pmovsxbw        xm4, xm4
+    vpbroadcastd     m7, [pb_1]
+    vpbroadcastw     m6, [hmul_bits+4]
+    vpbroadcastw     m4, xm4
+    vpbroadcastw     m3, xm3
+    sub            bufq, 82*38+82-(82*3+41)
+    add           bufyq, 3+82*3
+    mov              hd, 35
+.y_loop_ar0:
+    ; first 32 pixels
+    movu            xm8, [bufyq]
+    movu            xm9, [bufyq+82]
+    movu           xm10, [bufyq+16]
+    movu           xm11, [bufyq+82+16]
+    vinserti128      m8, [bufyq+32], 1
+    vinserti128      m9, [bufyq+82+32], 1
+    vinserti128     m10, [bufyq+48], 1
+    vinserti128     m11, [bufyq+82+48], 1
+    pmaddubsw        m8, m7, m8
+    pmaddubsw        m9, m7, m9
+    pmaddubsw       m10, m7, m10
+    pmaddubsw       m11, m7, m11
+    paddw            m8, m9
+    paddw           m10, m11
+    pmulhrsw         m8, m6
+    pmulhrsw        m10, m6
+    pmullw           m8, m4
+    pmullw          m10, m4
+    pmulhrsw         m8, m3
+    pmulhrsw        m10, m3
+    packsswb         m8, m10
+    movu             m0, [bufq]
+    punpckhbw        m1, m0, m8
+    punpcklbw        m0, m8
+    pmaddubsw        m1, m7, m1
+    pmaddubsw        m0, m7, m0
+    packsswb         m0, m1
+    movu         [bufq], m0
+
+    ; last 6 pixels
+    movu            xm8, [bufyq+32*2]
+    movu            xm9, [bufyq+32*2+82]
+    pmaddubsw       xm8, xm7, xm8
+    pmaddubsw       xm9, xm7, xm9
+    paddw           xm8, xm9
+    pmulhrsw        xm8, xm6
+    pmullw          xm8, xm4
+    pmulhrsw        xm8, xm3
+    packsswb        xm8, xm8
+    movq            xm0, [bufq+32]
+    punpcklbw       xm8, xm0
+    pmaddubsw       xm8, xm7, xm8
+    packsswb        xm8, xm8
+    vpblendw        xm0, xm8, xm0, 1000b
+    movq      [bufq+32], xm0
+
+    add            bufq, 82
+    add           bufyq, 82*2
+    dec              hd
+    jg .y_loop_ar0
+    RET
+
+.ar1:
+    INIT_XMM avx2
+    DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift
+    imul            uvd, 25
+    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
+    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
+    movd            xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
+    pinsrb          xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3
+    DEFINE_ARGS buf, bufy, h, val0, val3, cf3, min, max, x, shift
+    pmovsxbw        xm4, xm4
+    pshufd          xm5, xm4, q1111
+    pshufd          xm4, xm4, q0000
+    pmovsxwd        xm3, [base+round_vals+shiftq*2-12]    ; rnd
+    vpbroadcastd    xm7, [pb_1]
+    vpbroadcastw    xm6, [hmul_bits+4]
+    vpbroadcastd    xm3, xm3
+    sub            bufq, 82*38+44-(82*3+41)
+    add           bufyq, 79+82*3
+    mov              hd, 35
+    mov            mind, -128
+    mov            maxd, 127
+.y_loop_ar1:
+    mov              xq, -38
+    movsx         val3d, byte [bufq+xq-1]
+.x_loop_ar1:
+    pmovsxbw        xm0, [bufq+xq-82-1]     ; top/left
+    movq            xm8, [bufyq+xq*2]
+    movq            xm9, [bufyq+xq*2+82]
+    psrldq          xm2, xm0, 2             ; top
+    psrldq          xm1, xm0, 4             ; top/right
+    pmaddubsw       xm8, xm7, xm8
+    pmaddubsw       xm9, xm7, xm9
+    paddw           xm8, xm9
+    pmulhrsw        xm8, xm6
+    punpcklwd       xm0, xm2
+    punpcklwd       xm1, xm8
+    pmaddwd         xm0, xm4
+    pmaddwd         xm1, xm5
+    paddd           xm0, xm1
+    paddd           xm0, xm3
+.x_loop_ar1_inner:
+    movd          val0d, xm0
+    psrldq          xm0, 4
+    imul          val3d, cf3d
+    add           val3d, val0d
+    sarx          val3d, val3d, shiftd
+    movsx         val0d, byte [bufq+xq]
+    add           val3d, val0d
+    cmp           val3d, maxd
+    cmovg         val3d, maxd
+    cmp           val3d, mind
+    cmovl         val3d, mind
+    mov  byte [bufq+xq], val3b
+    ; keep val3d in-place as left for next x iteration
+    inc              xq
+    jz .x_loop_ar1_end
+    test             xq, 3
+    jnz .x_loop_ar1_inner
+    jmp .x_loop_ar1
+
+.x_loop_ar1_end:
+    add            bufq, 82
+    add           bufyq, 82*2
+    dec              hd
+    jg .y_loop_ar1
+    RET
+
+.ar2:
+    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
+    imul            uvd, 25
+    movd           xm15, [base+hmul_bits-10+shiftq*2]
+    pmovsxbw        xm8, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]   ; cf0-7
+    pmovsxbw        xm9, [fg_dataq+FGData.ar_coeffs_uv+uvq+8]   ; cf8-12
+    DEFINE_ARGS buf, bufy, h, x
+    pshufd         xm12, xm9, q0000
+    pshufd         xm13, xm9, q1111
+    pshufd         xm14, xm9, q2222
+    pxor           xm10, xm10
+    vpblendw       xm14, xm10, 10101010b
+    pshufd         xm11, xm8, q3333
+    pshufd         xm10, xm8, q2222
+    pshufd          xm9, xm8, q1111
+    pshufd          xm8, xm8, q0000
+    sub            bufq, 82*38+44-(82*3+41)
+    add           bufyq, 79+82*3
+    mov              hd, 35
+.y_loop_ar2:
+    mov              xq, -38
+
+.x_loop_ar2:
+    pmovsxbw        xm0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
+    pmovsxbw        xm1, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
+    psrldq          xm2, xm0, 2             ; y=-2,x=[-1,+5]
+    psrldq          xm3, xm1, 2             ; y=-1,x=[-1,+5]
+    psrldq          xm4, xm1, 4             ; y=-1,x=[+0,+5]
+    punpcklwd       xm2, xm0, xm2
+    punpcklwd       xm3, xm4
+    pmaddwd         xm2, xm8
+    pmaddwd         xm3, xm11
+    paddd           xm2, xm3
+
+    psrldq          xm4, xm0, 4             ; y=-2,x=[+0,+5]
+    psrldq          xm5, xm0, 6             ; y=-2,x=[+1,+5]
+    psrldq          xm6, xm0, 8             ; y=-2,x=[+2,+5]
+    punpcklwd       xm4, xm5
+    punpcklwd       xm6, xm1
+    psrldq          xm7, xm1, 6             ; y=-1,x=[+1,+5]
+    psrldq          xm1, xm1, 8             ; y=-1,x=[+2,+5]
+    punpcklwd       xm7, xm1
+    pmaddwd         xm4, xm9
+    pmaddwd         xm6, xm10
+    pmaddwd         xm7, xm12
+    paddd           xm4, xm6
+    paddd           xm2, xm7
+    paddd           xm2, xm4
+
+    vpbroadcastd    xm4, [base+pb_1]
+    movq            xm6, [bufyq+xq*2]
+    movq            xm7, [bufyq+xq*2+82]
+    pmaddubsw       xm6, xm4, xm6
+    pmaddubsw       xm7, xm4, xm7
+    vpbroadcastw    xm4, [base+hmul_bits+4]
+    paddw           xm6, xm7
+    pmulhrsw        xm6, xm4
+    pxor            xm7, xm7
+    punpcklwd       xm6, xm7
+    pmaddwd         xm6, xm14
+    paddd           xm2, xm6
+
+    movq            xm0, [bufq+xq-2]        ; y=0,x=[-2,+5]
+.x_loop_ar2_inner:
+    pmovsxbw        xm0, xm0
+    pmaddwd         xm3, xm0, xm13
+    paddd           xm3, xm2
+    psrldq          xm2, 4                  ; shift top to next pixel
+    psrad           xm3, 5
+    packssdw        xm3, xm3
+    pmulhrsw        xm3, xm15
+    pslldq          xm3, 2
+    psrldq          xm0, 2
+    paddw           xm3, xm0
+    vpblendw        xm0, xm3, 00000010b
+    packsswb        xm0, xm0
+    pextrb    [bufq+xq], xm0, 1
+    inc              xq
+    jz .x_loop_ar2_end
+    test             xq, 3
+    jnz .x_loop_ar2_inner
+    jmp .x_loop_ar2
+
+.x_loop_ar2_end:
+    add            bufq, 82
+    add           bufyq, 82*2
+    dec              hd
+    jg .y_loop_ar2
+    RET
+
+.ar3:
+    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+    SUB             rsp, 16*12
+%assign stack_size_padded (stack_size_padded+16*12)
+%assign stack_size (stack_size+16*12)
+    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
+    imul            uvd, 25
+    movd           xm14, [base+hmul_bits-10+shiftq*2]
+    pmovsxbw        xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]   ; cf0-7
+    pmovsxbw        xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 8]   ; cf8-15
+    pmovsxbw        xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16]   ; cf16-23
+    pmovsxbw        xm5, [fg_dataq+FGData.ar_coeffs_uv+uvq+24]   ; cf24 [luma]
+    pshufd          xm9, xm0, q1111
+    pshufd         xm10, xm0, q2222
+    pshufd         xm11, xm0, q3333
+    pshufd          xm0, xm0, q0000
+    pshufd          xm6, xm1, q1111
+    pshufd          xm7, xm1, q2222
+    pshufd          xm8, xm1, q3333
+    pshufd          xm1, xm1, q0000
+    pshufd          xm3, xm2, q1111
+    pshufd          xm4, xm2, q2222
+    vpbroadcastw    xm5, xm5
+    vpblendw        xm4, xm5, 10101010b                     ; interleave luma cf
+    psrldq          xm5, xm2, 10
+    pshufd          xm2, xm2, q0000
+    pinsrw          xm5, [base+round_vals+shiftq*2-10], 3
+    mova    [rsp+ 0*16], xm0
+    mova    [rsp+ 1*16], xm9
+    mova    [rsp+ 2*16], xm10
+    mova    [rsp+ 3*16], xm11
+    mova    [rsp+ 4*16], xm1
+    mova    [rsp+ 5*16], xm6
+    mova    [rsp+ 6*16], xm7
+    mova    [rsp+ 7*16], xm8
+    mova    [rsp+ 8*16], xm2
+    mova    [rsp+ 9*16], xm3
+    mova    [rsp+10*16], xm4
+    mova    [rsp+11*16], xm5
+    vpbroadcastd   xm13, [base+pb_1]
+    vpbroadcastw   xm15, [base+hmul_bits+4]
+    DEFINE_ARGS buf, bufy, h, x
+    sub            bufq, 82*38+44-(82*3+41)
+    add           bufyq, 79+82*3
+    mov              hd, 35
+.y_loop_ar3:
+    mov              xq, -38
+
+.x_loop_ar3:
+    movu            xm0, [bufq+xq-82*3-3]   ; y=-3,x=[-3,+12]
+    movu            xm1, [bufq+xq-82*2-3]   ; y=-2,x=[-3,+12]
+    movu            xm2, [bufq+xq-82*1-3]   ; y=-1,x=[-3,+12]
+    pxor            xm3, xm3
+    pcmpgtb         xm6, xm3, xm2
+    pcmpgtb         xm5, xm3, xm1
+    pcmpgtb         xm4, xm3, xm0
+    punpckhbw       xm3, xm0, xm4
+    punpcklbw       xm0, xm4
+    punpckhbw       xm4, xm1, xm5
+    punpcklbw       xm1, xm5
+    punpckhbw       xm5, xm2, xm6
+    punpcklbw       xm2, xm6
+
+    psrldq          xm6, xm0, 2
+    psrldq          xm7, xm0, 4
+    psrldq          xm8, xm0, 6
+    psrldq          xm9, xm0, 8
+    palignr        xm10, xm3, xm0, 10
+    palignr        xm11, xm3, xm0, 12
+
+    punpcklwd       xm0, xm6
+    punpcklwd       xm7, xm8
+    punpcklwd       xm9, xm10
+    punpcklwd      xm11, xm1
+    pmaddwd         xm0, [rsp+ 0*16]
+    pmaddwd         xm7, [rsp+ 1*16]
+    pmaddwd         xm9, [rsp+ 2*16]
+    pmaddwd        xm11, [rsp+ 3*16]
+    paddd           xm0, xm7
+    paddd           xm9, xm11
+    paddd           xm0, xm9
+
+    psrldq          xm6, xm1, 2
+    psrldq          xm7, xm1, 4
+    psrldq          xm8, xm1, 6
+    psrldq          xm9, xm1, 8
+    palignr        xm10, xm4, xm1, 10
+    palignr        xm11, xm4, xm1, 12
+    psrldq         xm12, xm2, 2
+
+    punpcklwd       xm6, xm7
+    punpcklwd       xm8, xm9
+    punpcklwd      xm10, xm11
+    punpcklwd      xm12, xm2, xm12
+    pmaddwd         xm6, [rsp+ 4*16]
+    pmaddwd         xm8, [rsp+ 5*16]
+    pmaddwd        xm10, [rsp+ 6*16]
+    pmaddwd        xm12, [rsp+ 7*16]
+    paddd           xm6, xm8
+    paddd          xm10, xm12
+    paddd           xm6, xm10
+    paddd           xm0, xm6
+
+    psrldq          xm6, xm2, 4
+    psrldq          xm7, xm2, 6
+    psrldq          xm8, xm2, 8
+    palignr         xm9, xm5, xm2, 10
+    palignr         xm5, xm5, xm2, 12
+
+    movq            xm1, [bufyq+xq*2]
+    movq            xm2, [bufyq+xq*2+82]
+    pmaddubsw       xm1, xm13, xm1
+    pmaddubsw       xm2, xm13, xm2
+    paddw           xm1, xm2
+    vpbroadcastw    xm3, xm15
+    pmulhrsw        xm1, xm3
+
+    punpcklwd       xm6, xm7
+    punpcklwd       xm8, xm9
+    punpcklwd       xm5, xm1
+    pmaddwd         xm6, [rsp+ 8*16]
+    pmaddwd         xm8, [rsp+ 9*16]
+    pmaddwd         xm5, [rsp+10*16]
+    paddd           xm0, xm6
+    paddd           xm8, xm5
+    paddd           xm0, xm8
+
+    movq            xm1, [bufq+xq-3]        ; y=0,x=[-3,+4]
+.x_loop_ar3_inner:
+    pmovsxbw        xm1, xm1
+    pmaddwd         xm2, xm1, [rsp+16*11]
+    pshufd          xm3, xm2, q1111
+    paddd           xm2, xm3                ; left+cur
+    paddd           xm2, xm0                ; add top
+    psrldq          xm0, 4
+    psrad           xm2, 5
+    packssdw        xm2, xm2
+    pmulhrsw        xm2, xm14
+    pslldq          xm2, 6
+    vpblendw        xm1, xm2, 1000b
+    packsswb        xm1, xm1
+    pextrb    [bufq+xq], xm1, 3
+    psrldq          xm1, 1
+    inc              xq
+    jz .x_loop_ar3_end
+    test             xq, 3
+    jnz .x_loop_ar3_inner
+    jmp .x_loop_ar3
+
+.x_loop_ar3_end:
+    add            bufq, 82
+    add           bufyq, 82*2
+    dec              hd
+    jg .y_loop_ar3
+    RET
+
 INIT_YMM avx2
 cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut
    pcmpeqw         m10, m10
--- a/third_party/dav1d/src/x86/film_grain_init_tmpl.c
+++ b/third_party/dav1d/src/x86/film_grain_init_tmpl.c
@ -29,6 +29,7 @@
 #include "src/film_grain.h"

 decl_generate_grain_y_fn(dav1d_generate_grain_y_avx2);
+decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_avx2);
 decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_avx2);
 decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_avx2);

@ -39,6 +40,7 @@ COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c

 #if BITDEPTH == 8 && ARCH_X86_64
    c->generate_grain_y = dav1d_generate_grain_y_avx2;
+    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_generate_grain_uv_420_avx2;
    c->fgy_32x32xn = dav1d_fgy_32x32xn_avx2;
    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_avx2;
 #endif
--- a/third_party/dav1d/src/x86/ipred.asm
+++ b/third_party/dav1d/src/x86/ipred.asm
--- a/third_party/dav1d/src/x86/ipred_init_tmpl.c
+++ b/third_party/dav1d/src/x86/ipred_init_tmpl.c
@ -39,6 +39,7 @@ decl_angular_ipred_fn(dav1d_ipred_smooth_avx2);
 decl_angular_ipred_fn(dav1d_ipred_smooth_v_avx2);
 decl_angular_ipred_fn(dav1d_ipred_smooth_h_avx2);
 decl_angular_ipred_fn(dav1d_ipred_z1_avx2);
+decl_angular_ipred_fn(dav1d_ipred_z2_avx2);
 decl_angular_ipred_fn(dav1d_ipred_z3_avx2);
 decl_angular_ipred_fn(dav1d_ipred_filter_avx2);

@ -119,6 +120,7 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c
    c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_avx2;
    c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_avx2;
    c->intra_pred[Z1_PRED]       = dav1d_ipred_z1_avx2;
+    c->intra_pred[Z2_PRED]       = dav1d_ipred_z2_avx2;
    c->intra_pred[Z3_PRED]       = dav1d_ipred_z3_avx2;
    c->intra_pred[FILTER_PRED]   = dav1d_ipred_filter_avx2;

--- a/third_party/dav1d/src/x86/itx.asm
+++ b/third_party/dav1d/src/x86/itx.asm
@ -50,7 +50,6 @@ pw_2482_3344:   dw  2482,  3344
 pw_m3344_3344:  dw -3344,  3344
 pw_m3803_3344:  dw -3803,  3344
 pw_m3803_m6688: dw -3803, -6688
-COEF_PAIR           2896,  2896
 pw_2896_m2896:  dw  2896, -2896

 pw_5:      times 2 dw 5
@ -63,6 +62,7 @@ pw_5793x4: times 2 dw 5793*4

 pd_2048: dd 2048

+COEF_PAIR 2896, 2896
 COEF_PAIR 1567, 3784
 COEF_PAIR 3784, 1567
 COEF_PAIR  201, 4091
@ -194,7 +194,7 @@ SECTION .text

 ; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
 ; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
-%macro ITX_MULSUB_2W 7 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2]
+%macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2
    punpckhwd           m%3, m%2, m%1
    punpcklwd           m%2, m%1
 %if %7 < 32
@ -222,20 +222,20 @@ SECTION .text
    paddd               m%2, m%5
    psrad               m%3, 12
    psrad               m%2, 12
+%if %0 == 8
+    packssdw            m%8, m%2, m%3
+%else
    packssdw            m%2, m%3
+%endif
 %endmacro

 %macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
-    ITX_MULSUB_2W        %2, %4, %5, %6, %7, 1567, 3784 ; t2, t3
-    vpbroadcastd        m%6, [o(pw_2896x8)]
-    paddw               m%5, m%1, m%3
-    psubw               m%1, m%3
-    pmulhrsw            m%1, m%6 ; t1
-    pmulhrsw            m%5, m%6 ; t0
+    ITX_MULSUB_2W        %2, %4, %5, %6, %7, 1567, 3784, %5 ; t2, t3 (8th arg: t3 is packed into m%5)
+    ITX_MULSUB_2W        %1, %3, %4, %6, %7, 2896, 2896, %4 ; t1, t0 (8th arg: t0 is packed into m%4)
     psubsw              m%3, m%1, m%2
     paddsw              m%2, m%1
-    paddsw              m%1, m%5, m%4
-    psubsw              m%4, m%5, m%4
+    paddsw              m%1, m%4, m%5 ; out0 = t0 + t3
+    psubsw              m%4, m%5      ; out3 = t0 - t3
 %endmacro

 %macro IDCT8_1D 11 ; src[1-8], tmp[1-2], pd_2048
@ -246,27 +246,20 @@ SECTION .text
    psubsw              m%2, m%6       ; t5a
    paddsw             m%10, m%8, m%4  ; t7
    psubsw              m%8, m%4       ; t6a
-    vpbroadcastd        m%4, [o(pw_2896x8)]
-    psubw               m%6, m%1, m%5
-    paddw               m%1, m%5
-    psubw               m%5, m%8, m%2
-    paddw               m%8, m%2
-    pmulhrsw            m%1, m%4       ; t0
-    pmulhrsw            m%6, m%4       ; t1
-    pmulhrsw            m%8, m%4       ; t6
-    pmulhrsw            m%5, m%4       ; t5
-    psubsw              m%4, m%1, m%7  ; dct4 out3
-    paddsw              m%1, m%7       ; dct4 out0
-    paddsw              m%7, m%6, m%3  ; dct4 out1
-    psubsw              m%6, m%3       ; dct4 out2
-    paddsw              m%2, m%7, m%8  ; out1
-    psubsw              m%7, m%8       ; out6
+    ITX_MULSUB_2W        %1, %5, %4, %6, %11, 2896, 2896 ; t1, t0
+    ITX_MULSUB_2W        %8, %2, %4, %6, %11, 2896, 2896 ; t5, t6
+    psubsw              m%6, m%1, m%3  ; dct4 out2
+    paddsw              m%3, m%1       ; dct4 out1
+    paddsw              m%1, m%5, m%7  ; dct4 out0
+    psubsw              m%5, m%7       ; dct4 out3
+    psubsw              m%7, m%3, m%2  ; out6
+    paddsw              m%2, m%3       ; out1
+    paddsw              m%3, m%6, m%8  ; out2
+    psubsw              m%6, m%8       ; out5
    psubsw              m%8, m%1, m%10 ; out7
    paddsw              m%1, m%10      ; out0
-    paddsw              m%3, m%6, m%5  ; out2
-    psubsw              m%6, m%5       ; out5
-    psubsw              m%5, m%4, m%9  ; out4
-    paddsw              m%4, m%9       ; out3
+    paddsw              m%4, m%5, m%9  ; out3
+    psubsw              m%5, m%9       ; out4
 %endmacro

 ; in1 = %1, in3  = %2, in5  = %3, in7  = %4
@ -286,20 +279,16 @@ SECTION .text
    paddsw              m%1, m%5      ; t8
    ITX_MULSUB_2W        %2, %7, %5, %10, %11,  1567, 3784 ; t9a,  t14a
    ITX_MULSUB_2W        %9, %4, %5, %10, %11, m3784, 1567 ; t10a, t13a
-    vpbroadcastd       m%10, [o(pw_2896x8)]
-    psubsw              m%5, m%2, m%9 ; t10
-    paddsw              m%2, m%9      ; t9
-    psubsw              m%9, m%1, m%3 ; t11a
+    psubsw              m%5, m%1, m%3 ; t11a
    paddsw              m%1, m%3      ; t8a
    psubsw              m%3, m%7, m%4 ; t13
    paddsw              m%7, m%4      ; t14
    psubsw              m%4, m%8, m%6 ; t12a
    paddsw              m%8, m%6      ; t15a
-    paddw               m%6, m%3, m%5 ; t13a
-    psubw               m%3, m%5      ; t10a
-    paddw               m%5, m%4, m%9 ; t12
-    psubw               m%4, m%9      ; t11
-    REPX {pmulhrsw x, m%10}, m%6, m%3, m%5, m%4
+    psubsw              m%6, m%2, m%9 ; t10
+    paddsw              m%2, m%9      ; t9
+    ITX_MULSUB_2W        %3, %6, %9, %10, %11, 2896, 2896 ; t10a, t13a
+    ITX_MULSUB_2W        %4, %5, %9, %10, %11, 2896, 2896 ; t11,  t12
 %endmacro

 %macro WRAP_XMM 1+
@ -446,21 +435,14 @@ ALIGN function_align
 %endif
 %endmacro

-%macro IDCT4_1D_PACKED 0-1 ; pw_2896x8
+%macro IDCT4_1D_PACKED 0 ; takes no arguments; loads pd_2048 into m4 as the rounding constant
     vpbroadcastd         m4, [o(pd_2048)]
     punpckhwd            m2, m1, m0
-    psubw                m3, m0, m1
-    paddw                m0, m1
-    punpcklqdq           m0, m3
-    ITX_MUL2X_PACK        2, 1, 3, 4, 1567, 3784
-%if %0 == 1
-    pmulhrsw             m0, m%1
-%else
-    vpbroadcastd         m4, [o(pw_2896x8)]
-    pmulhrsw             m0, m4     ; t0 t1
-%endif
-    psubsw               m1, m0, m2 ; out3 out2
-    paddsw               m0, m2     ; out0 out1
+    punpcklwd            m1, m0
+    ITX_MUL2X_PACK        2, 0, 3, 4, 1567, 3784 ; t3 t2
+    ITX_MUL2X_PACK        1, 0, 3, 4, 2896, 2896 ; t0 t1
+    paddsw               m0, m1, m2 ; out0 out1
+    psubsw               m1, m2     ; out3 out2
 %endmacro

 %macro IADST4_1D_PACKED 0
@ -683,30 +665,30 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
    vpbroadcastd         m6, [o(pd_2048)]
    punpckhwd            m5, m3, m0 ; in7 in1
    punpckhwd            m4, m1, m2 ; in3 in5
-    punpcklwd            m3, m1     ; in2 in6
-    psubw                m1, m0, m2
-    paddw                m0, m2
-    punpcklqdq           m0, m1     ; in0+in4 in0-in4
-    ITX_MUL2X_PACK        5, 1, 2, 6,  799, 4017, 1 ; t4a t7a
-    ITX_MUL2X_PACK        4, 1, 2, 6, 3406, 2276, 1 ; t5a t6a
-    ITX_MUL2X_PACK        3, 1, 2, 6, 1567, 3784    ; t3 t2
-    vpbroadcastd         m6, [o(pw_2896x8)]
-    psubsw               m2, m5, m4 ; t4 t7
-    paddsw               m5, m4     ; t5a t6a
-    pshufd               m4, m2, q1032
-    psubw                m1, m2, m4
-    paddw                m4, m2
-    vpblendd             m4, m4, m1, 0xcc
-    pmulhrsw             m0, m6     ; t0 t1
-    pmulhrsw             m4, m6     ; t6 t5
-    psubsw               m1, m0, m3 ; tmp3 tmp2
-    paddsw               m0, m3     ; tmp0 tmp1
-    shufps               m2, m5, m4, q1032 ; t7 t6
-    vpblendd             m5, m5, m4, 0xcc  ; t4 t5
-    psubsw               m3, m0, m2 ; out7 out6
-    paddsw               m0, m2     ; out0 out1
-    psubsw               m2, m1, m5 ; out4 out5
-    paddsw               m1, m5     ; out3 out2
+    punpcklwd            m3, m1     ; in6 in2
+    punpcklwd            m2, m0     ; in4 in0
+    ITX_MUL2X_PACK        5, 0, 1, 6,  799, 4017, 3 ; t4a t7a
+    ITX_MUL2X_PACK        4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a
+    ITX_MUL2X_PACK        3, 0, 1, 6, 1567, 3784    ; t3 t2
+    psubsw               m0, m5, m4 ; t5a t6a (interleaved)
+    paddsw               m4, m5     ; t4  t7  (interleaved)
+    ITX_MUL2X_PACK        2, 1, 5, 6, 2896, 2896    ; t0 t1
+    vpbroadcastd         m1, [o(pw_m2896_2896)]
+    ITX_MUL2X_PACK        0, 1, _, 6, 1, 5, 4 ; t6 t5
+%if mmsize > 16
+    vbroadcasti128       m1, [o(deint_shuf)]
+    pshufb               m4, m1
+%else
+    pshufb               m4, [o(deint_shuf)]
+%endif
+    psubsw               m1, m2, m3 ; tmp3 tmp2
+    paddsw               m3, m2     ; tmp0 tmp1
+    shufps               m2, m4, m0, q1032 ; t7 t6
+    vpblendd             m4, m0, 0xcc      ; t4 t5
+    paddsw               m0, m3, m2 ; out0 out1
+    psubsw               m3, m2     ; out7 out6
+    psubsw               m2, m1, m4 ; out4 out5
+    paddsw               m1, m4     ; out3 out2
 %endmacro

 %macro IADST8_1D_PACKED 1 ; pass
@ -797,10 +779,10 @@ INV_TXFM_4X8_FN dct, flipadst
 cglobal idct_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
    vpermq               m0, [cq+32*0], q3120
    vpermq               m1, [cq+32*1], q3120
-    vpbroadcastd         m5, [o(pw_2896x8)]
-    pmulhrsw             m0, m5
-    pmulhrsw             m1, m5
-    IDCT4_1D_PACKED       5
+    vpbroadcastd         m2, [o(pw_2896x8)]
+    pmulhrsw             m0, m2
+    pmulhrsw             m1, m2
+    IDCT4_1D_PACKED
    vbroadcasti128       m2, [o(deint_shuf)]
    shufps               m3, m0, m1, q1331
    shufps               m0, m0, m1, q0220
@ -1011,9 +993,7 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
    vpbroadcastd        m10, [o(pd_2048)]
 .main2:
    punpckhwd            m8, m7, m0 ; dct16 in15 in1
-    paddw                m9, m0, m4
-    psubw                m0, m4
-    punpcklqdq           m9, m0     ; dct4  in0+in2 in0-in2
+    punpcklwd            m9, m4, m0 ; dct4  in2  in0
    punpckhwd            m0, m3, m4 ; dct16 in7  in9
    punpcklwd            m7, m1     ; dct8  in7  in1
    punpckhwd            m1, m6     ; dct16 in3  in13
@ -1024,47 +1004,44 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
    ITX_MUL2X_PACK        0, 2, 4, 10, 3166, 2598, 3 ; t9a  t14a
    ITX_MUL2X_PACK        1, 2, 4, 10, 3920, 1189, 3 ; t11a t12a
    ITX_MUL2X_PACK        5, 2, 4, 10, 1931, 3612, 3 ; t10a t13a
-    ITX_MUL2X_PACK        7, 2, 4, 10,  799, 4017, 1 ; t4a  t7a
-    ITX_MUL2X_PACK        3, 2, 4, 10, 3406, 2276, 1 ; t5a  t6a
+    ITX_MUL2X_PACK        7, 2, 4, 10,  799, 4017, 3 ; t4a  t7a
+    ITX_MUL2X_PACK        3, 2, 4, 10, 3406, 2276, 3 ; t5a  t6a
    ITX_MUL2X_PACK        6, 2, 4, 10, 1567, 3784    ; t3   t2
    psubsw               m2, m8, m0 ; t9  t14
    paddsw               m8, m0     ; t8  t15
    psubsw               m0, m1, m5 ; t10 t13
    paddsw               m1, m5     ; t11 t12
-%if mmsize > 16
-    vbroadcasti128       m5, [o(deint_shuf)]
-%else
-    mova                 m5, [o(deint_shuf)]
-%endif
-    pshufb               m8, m5
-    pshufb               m1, m5
    vpbroadcastd         m5, [o(pw_m3784_1567)]  ; reuse pw_1567_3784
-    ITX_MUL2X_PACK        2, 4, _, 10, 4, 5, 4   ; t9a  t14a
+    ITX_MUL2X_PACK        2, 4, _, 10, 4, 5, 6   ; t9a  t14a
    vpbroadcastd         m4, [o(pw_m1567_m3784)] ; reuse pw_m3784_1567
-    ITX_MUL2X_PACK        0, 5, _, 10, 5, 4, 4   ; t10a t13a
-    psubsw               m5, m7, m3 ; t5a t6a
-    paddsw               m7, m3     ; t4  t7
+    ITX_MUL2X_PACK        0, 5, _, 10, 5, 4, 6   ; t10a t13a
    psubsw               m4, m8, m1 ; t11a t12a
    paddsw               m8, m1     ; t8a  t15a
-    paddsw               m1, m2, m0 ; t9   t14
+    psubsw               m1, m7, m3 ; t5a  t6a
+    paddsw               m7, m3     ; t4   t7
+    paddsw               m3, m2, m0 ; t9   t14
    psubsw               m2, m0     ; t10  t13
-    punpckhqdq           m0, m8, m1 ; t15a t14
-    punpcklqdq           m8, m1     ; t8a  t9
-    pshufd               m3, m5, q1032
-    psubw                m1, m5, m3
-    paddw                m3, m5
-    vpblendd             m3, m3, m1, 0xcc ; t6 t5
-    vpbroadcastd         m1, [o(pw_2896x8)]
-    punpckhqdq           m5, m4, m2 ; t12a t13
-    punpcklqdq           m2, m4, m2 ; t11a t10
-    psubw                m4, m5, m2
-    paddw                m5, m2
-    pmulhrsw             m9, m1     ; t0   t1
-    pmulhrsw             m3, m1     ; t6   t5
-    pmulhrsw             m4, m1     ; t11  t10a
-    pmulhrsw             m5, m1     ; t12  t13a
-    shufps               m2, m7, m3, q1032 ; t7 t6
-    vpblendd             m7, m7, m3, 0xcc  ; t4 t5
+%if mmsize > 16
+    vbroadcasti128       m0, [o(deint_shuf)]
+%else
+    mova                 m0, [o(deint_shuf)]
+%endif
+    pshufb               m8, m0
+    pshufb               m7, m0
+    pshufb               m3, m0
+    ITX_MUL2X_PACK        9, 0, 5, 10, 2896, 2896 ; t0   t1
+    vpbroadcastd         m0, [o(pw_m2896_2896)]  ; NOTE(review): coefs are reloaded before each flag-4 call; the macro appears to use its 2nd arg as scratch - confirm
+    ITX_MUL2X_PACK        4, 5, _, 10, 5, 0, 4    ; t11  t12
+    vpbroadcastd         m5, [o(pw_2896_2896)]
+    ITX_MUL2X_PACK        1, 0, _, 10, 0, 5, 4    ; t6   t5
+    vpbroadcastd         m0, [o(pw_m2896_2896)]
+    ITX_MUL2X_PACK        2, 0, _, 10, 0, 5, 4    ; t13a t10a
+    punpckhqdq           m0, m8, m3        ; t15a t14
+    punpcklqdq           m8, m3            ; t8a  t9
+    shufps               m5, m4, m2, q1032 ; t12  t13a
+    vpblendd             m4, m2, 0xcc      ; t11  t10a
+    shufps               m2, m7, m1, q1032 ; t7 t6
+    vpblendd             m7, m1, 0xcc      ; t4 t5
    psubsw               m1, m9, m6 ; dct4 out3 out2
    paddsw               m9, m6     ; dct4 out0 out1
    psubsw               m3, m9, m2 ; dct8 out7 out6
@ -3699,12 +3676,11 @@ ALIGN function_align
    paddsw               m6, m11      ; t17  t30
    psubsw              m11, m0, m14  ; t21  t26
    paddsw               m0, m14      ; t22  t25
-    ITX_MUL2X_PACK       15, 12, 14, 10,  1567, 3784, 1 ; t18a t29a
-    ITX_MUL2X_PACK       13, 12, 14, 10,  1567, 3784, 1 ; t19  t28
-    ITX_MUL2X_PACK        9, 12, 14, 10, m3784, 1567, 1 ; t20  t27
-    ITX_MUL2X_PACK       11, 12, 14, 10, m3784, 1567, 1 ; t21a t26a
+    ITX_MUL2X_PACK       15, 12, 14, 10,  1567, 3784, 3 ; t18a t29a
+    ITX_MUL2X_PACK       13, 12, 14, 10,  1567, 3784, 3 ; t19  t28
+    ITX_MUL2X_PACK        9, 12, 14, 10, m3784, 1567, 3 ; t20  t27
+    ITX_MUL2X_PACK       11, 12, 14, 10, m3784, 1567, 3 ; t21a t26a
    vbroadcasti128      m12, [o(deint_shuf)]
-    REPX    {pshufb x, m12}, m0, m1, m6, m8
    psubsw              m14, m1, m8   ; t23  t24
    paddsw               m1, m8       ; t16  t31
    psubsw               m8, m6, m0   ; t22a t25a
@ -3713,16 +3689,18 @@ ALIGN function_align
    paddsw              m15, m11      ; t18  t29
    psubsw              m11, m13, m9  ; t20a t27a
    paddsw              m13, m9       ; t19a t28a
-    vpbroadcastd        m12, [o(pw_2896x8)]
-    punpcklqdq           m9, m11, m0  ; t20a t21
-    punpckhqdq          m11, m0       ; t27a t26
-    punpcklqdq           m0, m14, m8  ; t23  t22a
-    punpckhqdq          m14, m8       ; t24  t25a
-    psubw                m8, m11, m9  ; t20  t21a
-    paddw               m11, m9       ; t27  t26a
-    psubw                m9, m14, m0  ; t23a t22
-    paddw               m14, m0       ; t24a t25
-    REPX  {pmulhrsw x, m12}, m8, m9, m14, m11
+    REPX    {pshufb x, m12}, m1, m6, m15, m13
+    ITX_MUL2X_PACK       14,  9, 12, 10, 2896, 2896 ; t24a t23a
+    vpbroadcastd         m9, [o(pw_m2896_2896)]
+    ITX_MUL2X_PACK        8, 12,  _, 10, 12,  9, 4  ; t22  t25
+    vpbroadcastd        m12, [o(pw_2896_2896)]
+    ITX_MUL2X_PACK        0, 12,  _, 10, 12,  9, 4  ; t21a t26a
+    vpbroadcastd        m12, [o(pw_2896_2896)]
+    ITX_MUL2X_PACK       11,  9,  _, 10,  9, 12, 4  ; t27  t20
+    shufps               m9, m14, m8, q1032 ; t23a t22
+    vpblendd            m14, m8, 0xcc       ; t24a t25
+    shufps               m8, m11, m0, q1032 ; t20  t21a
+    vpblendd            m11, m0, 0xcc       ; t27  t26a
    punpcklqdq           m0, m1, m6   ; t16  t17a
    punpckhqdq           m1, m6       ; t31  t30a
    psubsw              m10, m5, m8   ; out20 out21
@ -4327,33 +4305,29 @@ ALIGN function_align
    mova                 m5, [rsp+gprsize+32*0] ; t22
    mova                 m6, [rsp+gprsize+32*1] ; t23
    mova                 m3, [rsp+gprsize+32*2] ; t24a
-    vpbroadcastd         m8, [o(pw_2896x8)]
    psubsw               m1, m14, m5  ; t22a
    paddsw              m14, m5       ; t17a
    psubsw               m5, m0, m6   ; t23
    paddsw               m0, m6       ; t16
    psubsw               m6, m4, m3   ; t24
    paddsw               m4, m3       ; t31
+    vpbroadcastd         m8, [o(pw_m2896_2896)]
+    vpbroadcastd         m3, [o(pw_2896_2896)]
    mova       [tmp1q-32*4], m0
    mova       [tmp1q-32*3], m14
    mova       [tmp2q+32*3], m4
-    psubw                m3, m13, m9  ; t20
-    paddw               m13, m9       ; t27
-    psubw                m9, m2, m10  ; t21a
-    paddw                m2, m10      ; t26a
-    psubw               m10, m7, m1   ; t22
-    paddw                m7, m1       ; t25
-    psubw                m1, m6, m5   ; t23a
-    paddw                m6, m5       ; t24a
-    REPX   {pmulhrsw x, m8}, m3, m13, m9, m2, m10, m7, m1, m6
-    mova       [tmp1q+32*0], m3
-    mova       [tmp1q+32*1], m9
-    mova       [tmp1q+32*2], m10
-    mova       [tmp1q+32*3], m1
-    mova       [tmp2q-32*4], m6
-    mova       [tmp2q-32*3], m7
-    mova       [tmp2q-32*2], m2
-    mova       [tmp2q-32*1], m13
+    ITX_MULSUB_2W        13,  9,  0,  4, 15,  3,  8 ; t20,  t27
+    ITX_MULSUB_2W         2, 10,  0,  4, 15,  3,  8 ; t21a, t26a
+    ITX_MULSUB_2W         7,  1,  0,  4, 15,  3,  8 ; t22,  t25
+    ITX_MULSUB_2W         6,  5,  0,  4, 15,  3,  8 ; t23a, t24a
+    mova       [tmp1q+32*0], m13
+    mova       [tmp1q+32*1], m2
+    mova       [tmp1q+32*2], m7
+    mova       [tmp1q+32*3], m6
+    mova       [tmp2q-32*4], m5
+    mova       [tmp2q-32*3], m1
+    mova       [tmp2q-32*2], m10
+    mova       [tmp2q-32*1], m9
    ret
 ALIGN function_align
 .transpose_2x8x8_round:
@ -5237,11 +5211,10 @@ ALIGN function_align
    sub                 rax, o_idct64_offset + 8
    vpbroadcastd        m11, [o(pw_1567_3784)]
    vpbroadcastd        m12, [o(pw_m3784_1567)]
-    vpbroadcastd        m13, [o(pw_m1567_m3784)]
-    vpbroadcastd        m14, [o(pw_2896x8)]
+    vpbroadcastd        m13, [o(pw_2896_2896)]
+    vpbroadcastd        m14, [o(pw_m2896_2896)]
 .main_part2_pass1_loop:
    call .main_part2_internal
-    REPX  {pmulhrsw x, m14}, m1, m2, m4, m3
    IDCT64_PART2_END      0,  7,  0,  6,  9, 10
    IDCT64_PART2_END      7,  8,  5,  0,  6,  7
    IDCT64_PART2_END      8,  2,  1,  0,  6,  7
@ -5251,53 +5224,51 @@ ALIGN function_align
    ret
 .main_part2_internal:
    mova                 m0, [tmp1q-32*12] ; t32a
-    mova                 m1, [tmp2q-32*13] ; t39a
-    mova                 m2, [tmp1q-32* 4] ; t40a
+    mova                 m6, [tmp2q-32*13] ; t39a
+    mova                 m1, [tmp1q-32* 4] ; t40a
    mova                 m5, [tmp2q+32* 3] ; t55a
    add               tmp1q, 32
    sub               tmp2q, 32
-    mova                 m4, [tmp1q+32* 3] ; t48a
-    mova                 m3, [tmp2q-32* 4] ; t47a
-    mova                 m6, [tmp1q+32*11] ; t56a
+    mova                 m2, [tmp1q+32* 3] ; t48a
+    mova                 m4, [tmp2q-32* 4] ; t47a
+    mova                 m3, [tmp1q+32*11] ; t56a
    mova                 m7, [tmp2q+32*12] ; t63a
-    psubsw               m8, m0, m1 ; t39
-    paddsw               m0, m1     ; t32
-    psubsw               m1, m3, m2 ; t40
-    paddsw               m3, m2     ; t47
-    psubsw               m2, m4, m5 ; t55
-    paddsw               m4, m5     ; t48
-    psubsw               m5, m7, m6 ; t56
-    paddsw               m7, m6     ; t63
-    ITX_MULSUB_2W         5,  8,  6,  9, 15, 11, 12 ; t39a, t56a
-    ITX_MULSUB_2W         2,  1,  6,  9, 15, 12, 13 ; t40a, t55a
-    psubsw               m6, m0, m3 ; t47a
-    paddsw               m0, m3     ; t32a
-    psubsw               m3, m7, m4 ; t48a
-    paddsw               m7, m4     ; t63a
-    psubsw               m4, m5, m2 ; t40
-    paddsw               m5, m2     ; t39
-    psubsw               m2, m8, m1 ; t55
-    paddsw               m8, m1     ; t56
-    psubw                m1, m2, m4 ; t40a
-    paddw                m2, m4     ; t55a
-    psubw                m4, m3, m6 ; t47
-    paddw                m3, m6     ; t48
+    psubsw               m8, m0, m6 ; t39
+    paddsw               m0, m6     ; t32
+    psubsw               m6, m4, m1 ; t40
+    paddsw               m4, m1     ; t47
+    psubsw               m1, m2, m5 ; t55
+    paddsw               m2, m5     ; t48
+    psubsw               m5, m7, m3 ; t56
+    paddsw               m7, m3     ; t63
+    ITX_MULSUB_2W         5,  8,  3,  9, 15, 11, 12 ; t39a, t56a
+    vpbroadcastd         m9, [o(pw_m1567_m3784)]
+    ITX_MULSUB_2W         1,  6,  3,  9, 15, 12,  9 ; t40a, t55a
+    psubsw               m3, m0, m4 ; t47a
+    paddsw               m0, m4     ; t32a
+    psubsw               m4, m7, m2 ; t48a
+    paddsw               m7, m2     ; t63a
+    psubsw               m2, m5, m1 ; t40
+    paddsw               m5, m1     ; t39
+    psubsw               m1, m8, m6 ; t55
+    paddsw               m8, m6     ; t56
+    ITX_MULSUB_2W         4,  3,  6,  9, 15, 13, 14 ; t47,  t48
+    ITX_MULSUB_2W         1,  2,  6,  9, 15, 13, 14 ; t40a, t55a
    ret
 .main_part2_pass2:
    sub                 rax, o_idct64_offset + 8
    vpbroadcastd        m11, [o(pw_1567_3784)]
    vpbroadcastd        m12, [o(pw_m3784_1567)]
-    vpbroadcastd        m13, [o(pw_m1567_m3784)]
-    vpbroadcastd        m14, [o(pw_2048)]
+    vpbroadcastd        m13, [o(pw_2896_2896)]
    lea                  r9, [strideq*5]    ; stride*5
    lea                  r3, [r9+strideq*1] ; stride*6
    lea                  r7, [r9+strideq*2] ; stride*7
    lea                  r8, [r3+strideq*2] ; stride*8
    lea                  r2, [dstq+r7]
 .main_part2_pass2_loop:
+    vpbroadcastd        m14, [o(pw_m2896_2896)]
    call .main_part2_internal
-    vpbroadcastd        m10, [o(pw_2896x8)]
-    REPX  {pmulhrsw x, m10}, m1, m2, m4, m3
+    vpbroadcastd        m14, [o(pw_2048)]
    IDCT64_PART2_END      0,  7,  0,  6,  9, 10, strideq*0, r3*4, r8*4, r7*8
    IDCT64_PART2_END      7,  8,  5,  0,  6,  7, strideq*0, r3*4, r8*4, r7*8
    IDCT64_PART2_END      8,  2,  1,  0,  6,  7, strideq*8, r8*2, r9*8, r3*8
--- a/third_party/dav1d/src/x86/itx_ssse3.asm
+++ b/third_party/dav1d/src/x86/itx_ssse3.asm
@ -202,7 +202,7 @@ SECTION .text
    ret
 %endmacro

-; flags: 1 = swap, 2: coef_regs
+; flags: 1 = swap, 2 = coef_regs, 4 = no_pack
 %macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags
 %if %6 & 2
    pmaddwd              m%2, m%4, m%1
@ -218,24 +218,17 @@ SECTION .text
    paddd                m%1, m%3
    psrad                m%2, 12
    psrad                m%1, 12
+%if %6 & 4 == 0
    packssdw             m%1, m%2
+%endif
 %endmacro

 %macro IDCT4_1D_PACKED 0-1   ;pw_2896x8
-    punpckhwd            m2, m0, m1            ;unpacked in1 in3
-    psubw                m3, m0, m1
-    paddw                m0, m1
-    punpcklqdq           m0, m3                ;high: in0-in2 ;low: in0+in2
-
    mova                 m3, [o(pd_2048)]
+    punpckhwd            m2, m0, m1            ;unpacked in1 in3
+    punpcklwd            m0, m1                ;unpacked in0 in2
    ITX_MUL2X_PACK        2, 1, 3, 1567, 3784
-
-%if %0 == 1
-    pmulhrsw             m0, m%1
-%else
-    pmulhrsw             m0, [o(pw_2896x8)]    ;high: t1 ;low: t0
-%endif
-
+    ITX_MUL2X_PACK        0, 1, 3, 2896, 2896   ;high: t1 ;low: t0
    psubsw               m1, m0, m2            ;high: out2 ;low: out3
    paddsw               m0, m2                ;high: out1 ;low: out0
 %endmacro
@ -499,79 +492,81 @@ cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, coeff

 %macro IDCT8_1D_PACKED 0
    mova                 m6, [o(pd_2048)]
-    punpckhwd            m5, m0, m3                 ;unpacked in1 in7
-    punpckhwd            m4, m2, m1                 ;unpacked in5 in3
+    punpckhwd            m4, m0, m3                 ;unpacked in1 in7
+    punpcklwd            m0, m2                     ;unpacked in0 in4
+    punpckhwd            m2, m1                     ;unpacked in5 in3
    punpcklwd            m1, m3                     ;unpacked in2 in6
-    psubw                m3, m0, m2
-    paddw                m0, m2
-    punpcklqdq           m0, m3                     ;low: in0+in4 high: in0-in4
-    ITX_MUL2X_PACK        5, 2, 6,  799, 4017, 1    ;low: t4a high: t7a
-    ITX_MUL2X_PACK        4, 2, 6, 3406, 2276, 1    ;low: t5a high: t6a
-    ITX_MUL2X_PACK        1, 2, 6, 1567, 3784       ;low: t3  high: t2
-    mova                 m6, [o(pw_2896x8)]
-    psubsw               m2, m5, m4                 ;low: t5a high: t6a
-    paddsw               m5, m4                     ;low: t4  high: t7
-    punpckhqdq           m4, m2, m2                 ;low: t6a high: t6a
-    psubw                m3, m4, m2                 ;low: t6a - t5a
-    paddw                m4, m2                     ;low: t6a + t5a
-    punpcklqdq           m4, m3                     ;low: t6a + t5a high: t6a - t5a
-    pmulhrsw             m0, m6                     ;low: t0   high: t1
-    pmulhrsw             m4, m6                     ;low: t6   high: t5
-    shufps               m2, m5, m4, q1032          ;low: t7   high: t6
-    shufps               m5, m4, q3210              ;low: t4   high: t5
-    psubsw               m4, m0, m1                 ;low: tmp3 high: tmp2
+    ITX_MUL2X_PACK        4, 3, 6,  799, 4017       ;low: t7a high: t4a
+    ITX_MUL2X_PACK        2, 3, 6, 3406, 2276       ;low: t6a high: t5a
+    ITX_MUL2X_PACK        1, 3, 6, 1567, 3784       ;low: t3  high: t2
+    psubsw               m3, m4, m2                 ;low: t6a high: t5a
+    paddsw               m4, m2                     ;low: t7  high: t4
+    pshufb               m3, [o(deint_shuf1)]       ;NOTE(review): mask is defined outside this hunk; presumably pairs the t6a/t5a words for the 2896 rotation below - confirm
+    ITX_MUL2X_PACK        0, 2, 6, 2896, 2896       ;low: t0  high: t1
+    ITX_MUL2X_PACK        3, 2, 6, 2896, 2896       ;low: t6  high: t5
+    psubsw               m2, m0, m1                 ;low: tmp3 high: tmp2
    paddsw               m0, m1                     ;low: tmp0 high: tmp1
-    psubsw               m3, m0, m2                 ;low: out7 high: out6
-    paddsw               m0, m2                     ;low: out0 high: out1
-    psubsw               m2, m4, m5                 ;low: out4 high: out5
-    paddsw               m1, m4, m5                 ;low: out3 high: out2
+    punpcklqdq           m1, m4, m3                 ;low: t7   high: t6
+    punpckhqdq           m4, m3                     ;low: t4   high: t5
+    psubsw               m3, m0, m1                 ;low: out7 high: out6
+    paddsw               m0, m1                     ;low: out0 high: out1
+    paddsw               m1, m2, m4                 ;low: out3 high: out2
+    psubsw               m2, m4                     ;low: out4 high: out5
 %endmacro

 ;dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
 ;dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
-%macro ITX_MULSUB_2W 7 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2]
-    punpckhwd           m%3, m%1, m%2
+%macro ITX_MULSUB_2W 7-8 0 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2_in_tmp1 (NOTE(review): dst2_in_tmp1 looks valid only with constant coefs, %7 >= 8; on the register-coef path dst2 would be packed with its halves swapped - confirm no caller combines them)
+    punpckhwd           m%4, m%1, m%2
    punpcklwd           m%1, m%2
 %if %7 < 8
    pmaddwd             m%2, m%7, m%1
-    pmaddwd             m%4, m%7, m%3
+    pmaddwd             m%3, m%7, m%4
 %else
    mova                m%2, [o(pw_%7_%6)]
-    pmaddwd             m%4, m%3, m%2
+%if %8
+    pmaddwd             m%3, m%1, m%2
+    pmaddwd             m%2, m%4
+%else
+    pmaddwd             m%3, m%4, m%2
    pmaddwd             m%2, m%1
 %endif
-    paddd               m%4, m%5
-    paddd               m%2, m%5
-    psrad               m%4, 12
-    psrad               m%2, 12
-    packssdw            m%2, m%4                 ;dst2
-%if %7 < 8
-    pmaddwd             m%3, m%6
-    pmaddwd             m%1, m%6
-%else
-    mova                m%4, [o(pw_%6_m%7)]
-    pmaddwd             m%3, m%4
-    pmaddwd             m%1, m%4
 %endif
    paddd               m%3, m%5
-    paddd               m%1, m%5
+    paddd               m%2, m%5
    psrad               m%3, 12
+    psrad               m%2, 12
+%if %8
+    packssdw            m%3, m%2                 ;dst2 (kept in tmp1)
+%else
+    packssdw            m%2, m%3                 ;dst2
+%endif
+%if %7 < 8
+    pmaddwd             m%4, m%6
+    pmaddwd             m%1, m%6
+%elif %8
+    mova                m%2, [o(pw_%6_m%7)]
+    pmaddwd             m%4, m%2
+    pmaddwd             m%1, m%2
+%else
+    mova                m%3, [o(pw_%6_m%7)]
+    pmaddwd             m%4, m%3
+    pmaddwd             m%1, m%3
+%endif
+    paddd               m%4, m%5
+    paddd               m%1, m%5
+    psrad               m%4, 12
    psrad               m%1, 12
-    packssdw            m%1, m%3                 ;dst1
+    packssdw            m%1, m%4                 ;dst1
 %endmacro

 %macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
-    ITX_MULSUB_2W        %2, %4, %5, %6, %7, 1567, 3784   ;t2, t3
-    mova                m%6, [o(pw_2896x8)]
-    paddw               m%5, m%1, m%3
-    psubw               m%1, m%3
-    pmulhrsw            m%1, m%6                          ;t1
-    pmulhrsw            m%5, m%6                          ;t0
-    psubsw              m%3, m%1, m%2                     ;out2
-    paddsw              m%2, m%1                          ;out1
-    paddsw              m%1, m%5, m%4                     ;out0
-    psubsw              m%5, m%4                          ;out3
-    mova                m%4, m%5
+    ITX_MULSUB_2W        %2, %4, %5, %6, %7, 1567, 3784, 1 ;t2, t3 (t3 is left in tmp1 = m%5)
+    ITX_MULSUB_2W        %1, %3, %4, %6, %7, 2896, 2896, 1 ;t1, t0 (t0 is left in tmp1 = m%4)
+    psubsw              m%3, m%1, m%2                      ;out2 = t1 - t2
+    paddsw              m%2, m%1                           ;out1 = t1 + t2
+    paddsw              m%1, m%5, m%4                      ;out0 = t3 + t0
+    psubsw              m%4, m%5                           ;out3 = t0 - t3
 %endmacro

 %macro WRITE_4X8 4 ;row[1-4]
@ -1286,17 +1281,13 @@ cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
 %endmacro

 %macro IDCT8_1D_ODDHALF 7 ; src[1-4], tmp[1-2], pd_2048
-    ITX_MULSUB_2W        %1, %4, %5, %6, %7,  799, 4017   ;t4a, t7a
-    ITX_MULSUB_2W        %3, %2, %5, %6, %7, 3406, 2276   ;t5a, t6a
-    psubsw               m%5, m%1, m%3                    ;t5a
-    paddsw               m%1, m%3                         ;t4
-    psubsw               m%6, m%4, m%2                    ;t6a
-    paddsw               m%4, m%2                         ;t7
-    mova                 m%3, [o(pw_2896x8)]
-    psubw                m%2, m%6, m%5                    ;t6a - t5a
-    paddw                m%6, m%5                         ;t6a + t5a
-    pmulhrsw             m%2, m%3                         ;t5
-    pmulhrsw             m%3, m%6                         ;t6
+    ITX_MULSUB_2W         %1, %4, %5, %6, %7,  799, 4017    ;t4a -> m%1, t7a -> m%4
+    ITX_MULSUB_2W         %3, %2, %5, %6, %7, 3406, 2276, 1 ;t5a -> m%3, t6a -> m%5 (tmp1)
+    psubsw               m%2, m%4, m%5                      ;t6a = t7a - t6a
+    paddsw               m%4, m%5                           ;t7  = t7a + t6a
+    psubsw               m%5, m%1, m%3                      ;t5a = t4a - t5a
+    paddsw               m%1, m%3                           ;t4  = t4a + t5a
+    ITX_MULSUB_2W         %2, %5, %3, %6, %7, 2896, 2896, 1 ;t5 -> m%2, t6 -> m%3 (tmp1)
 %endmacro

 INV_TXFM_8X8_FN dct, dct,      0
@ -2063,37 +2054,34 @@ cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
 %macro IDCT16_1D_PACKED_ODDHALF 7  ;src[1-4], tmp[1-3]
    punpckhwd            m%5, m%4, m%1                ;packed in13 in3
    punpcklwd            m%1, m%4                     ;packed in1  in15
-    punpcklwd            m%6, m%3, m%2                ;packed in9  in7
+    punpcklwd            m%4, m%3, m%2                ;packed in9  in7
    punpckhwd            m%2, m%3                     ;packed in5  in11
-
    mova                 m%7, [o(pd_2048)]
-    ITX_MUL2X_PACK        %1, %4, %7,  401, 4076, 1    ;low: t8a   high: t15a
-    ITX_MUL2X_PACK        %6, %4, %7, 3166, 2598, 1    ;low: t9a   high: t14a
-    ITX_MUL2X_PACK        %2, %4, %7, 1931, 3612, 1    ;low: t10a  high: t13a
-    ITX_MUL2X_PACK        %5, %4, %7, 3920, 1189, 1    ;low: t11a  high: t12a
-    psubsw               m%4, m%1, m%6                 ;low: t9    high: t14
-    paddsw               m%1, m%6                      ;low: t8    high: t15
+    ITX_MUL2X_PACK        %1, %6, %7,  401, 4076, 1    ;low: t8a   high: t15a
+    ITX_MUL2X_PACK        %4, %6, %7, 3166, 2598, 1    ;low: t9a   high: t14a
+    ITX_MUL2X_PACK        %2, %6, %7, 1931, 3612, 1    ;low: t10a  high: t13a
+    ITX_MUL2X_PACK        %5, %6, %7, 3920, 1189, 1    ;low: t11a  high: t12a
+    psubsw               m%6, m%1, m%4                 ;low: t9    high: t14
+    paddsw               m%1, m%4                      ;low: t8    high: t15
    psubsw               m%3, m%5, m%2                 ;low: t10   high: t13
-    paddsw               m%2, m%5                      ;low: t11   high: t12
-    punpcklqdq           m%5, m%4, m%3                 ;low: t9    high: t10
-    punpckhqdq           m%4, m%3                      ;low: t14   high: t13
-    punpcklwd            m%6, m%4, m%5                 ;packed t14 t9
-    punpckhwd            m%5, m%4                      ;packed t10 t13
+    paddsw               m%5, m%2                      ;low: t11   high: t12
+    mova                 m%2, [o(deint_shuf2)]         ;shuffle mask, reused by the pshufbs below
+    pshufb               m%6, m%2                      ;reorder t9/t14 for ITX_MUL2X_PACK
+    pshufb               m%3, [o(deint_shuf1)]         ;reorder to packed t10 t13
    pxor                 m%4, m%4
-    psubw                m%4, m%5                      ;packed -t10 -t13
+    psubw                m%4, m%3                      ;packed -t10 -t13
    ITX_MUL2X_PACK        %6, %3, %7, 1567, 3784, 1    ;low: t9a   high: t14a
    ITX_MUL2X_PACK        %4, %3, %7, 3784, 1567       ;low: t10a  high: t13a
-    psubsw               m%3, m%1, m%2                 ;low: t11a  high: t12a
-    paddsw               m%1, m%2                      ;low: t8a   high: t15a
+    psubsw               m%3, m%1, m%5                 ;low: t11a  high: t12a
+    paddsw               m%1, m%5                      ;low: t8a   high: t15a
    psubsw               m%5, m%6, m%4                 ;low: t10   high: t13
    paddsw               m%6, m%4                      ;low: t9    high: t14
-    mova                 m%7, [o(pw_2896x8)]
-    punpckhqdq           m%4, m%3, m%5                 ;low: t12a  high: t13
-    punpcklqdq           m%3, m%5                      ;low: t11a  high: t10
-    psubw                m%2, m%4, m%3
-    paddw                m%3, m%4
-    pmulhrsw             m%2, m%7                      ;low: t11   high: t10a
-    pmulhrsw             m%3, m%7                      ;low: t12   high: t13a
+    pshufb               m%3, m%2                      ;reorder t11a/t12a with the deint_shuf2 mask
+    pshufb               m%5, m%2                      ;reorder t10/t13 with the deint_shuf2 mask
+    ITX_MUL2X_PACK        %3, %2, %7, 2896, 2896, 4    ;t12,  t11
+    ITX_MUL2X_PACK        %5, %4, %7, 2896, 2896, 4    ;t13a, t10a
+    packssdw             m%2, m%4                      ;low: t11   high: t10a
+    packssdw             m%3, m%5                      ;low: t12   high: t13a
    punpckhqdq           m%4, m%1, m%6                 ;low: t15a  high: t14
    punpcklqdq           m%1, m%6                      ;low: t8a   high: t9
 %endmacro
@ -2918,19 +2906,14 @@ ALIGN function_align
    mova                   m0, [rsp+gprsize*2+16*1]
    mova                   m2, [rsp+gprsize*2+16*2]
    mova [rsp+gprsize*2+16*1], m4
-    psubsw                 m4, m0, m3                   ;t13
+    psubsw                 m5, m0, m3                   ;t13
    paddsw                 m0, m3                       ;t14
-    psubsw                 m3, m2, m1                   ;t12a
+    mova                   m3, [o(pd_2048)]
+    psubsw                 m4, m2, m1                   ;t12a
    paddsw                 m1, m2                       ;t15a
-    mova                   m5, [o(pw_2896x8)]
-    psubw                  m2, m4, m7                   ;t13-t10
-    paddw                  m7, m4                       ;t13+t10
-    psubw                  m4, m3, m6                   ;t12a-t11a
-    paddw                  m6, m3                       ;t12a+t11a
-    pmulhrsw               m7, m5                       ;t13a
-    pmulhrsw               m4, m5                       ;t11
-    pmulhrsw               m6, m5                       ;t12
-    pmulhrsw               m5, m2                       ;t10a
+    mova [rsp+gprsize*2+16*2], m1
+    ITX_MULSUB_2W           5, 7, 1, 2, 3, 2896, 2896   ;t10a, t13a
+    ITX_MULSUB_2W           4, 6, 1, 2, 3, 2896, 2896   ;t11,  t12
    mova                   m3, [rsp+gprsize*2+16*8]
    psubsw                 m2, m3, m5                   ;out10
    paddsw                 m3, m5                       ;out5
@ -2950,6 +2933,7 @@ ALIGN function_align
    mova [rsp+gprsize*2+16*5], m6
    psubsw                 m6, m7, m0                   ;out14
    paddsw                 m7, m0                       ;out1
+    mova                   m1, [rsp+gprsize*2+16*2]
    mova                   m0, [rsp+gprsize*2+16*3]
    mova [rsp+gprsize*2+16*4], m7
    psubsw                 m7, m0, m1                   ;out15
@ -4211,35 +4195,30 @@ ALIGN function_align
    psubsw                  m5, m3, m2                    ;t28a
    paddsw                  m3, m2                        ;t31a
    ITX_MULSUB_2W            5, 4, 1, 2, 7, 1567, 3784    ;t19, t28
-
    mova                    m2, [rsp+gprsize*2+16*15]     ;tmp12
    psubsw                  m1, m5, m6                    ;t20a
    paddsw                  m5, m6                        ;t19a
    psubsw                  m6, m2, m5                    ;out19
    paddsw                  m2, m5                        ;out12
+    mova                    m5, [rsp+gprsize*2+16*30]     ;t27
    mova [rsp+gprsize*2+16*22], m6                        ;out19
    mova [rsp+gprsize*2+16*15], m2                        ;out12
-    mova                    m5, [rsp+gprsize*2+16*30]     ;t27
    psubsw                  m6, m4, m5                    ;t27a
    paddsw                  m4, m5                        ;t28a
+    ITX_MULSUB_2W            6, 1, 2, 5, 7, 2896, 2896    ;t20, t27
    mova                    m2, [rsp+gprsize*2+16*6 ]     ;tmp3
-    mova                    m7, [o(pw_2896x8)]
-    psubw                   m5, m6, m1                    ;t27a - t20a
-    paddw                   m6, m1                        ;t27a + t20a
-    psubsw                  m1, m2, m4                    ;out28
+    psubsw                  m5, m2, m4                    ;out28
    paddsw                  m2, m4                        ;out3
-    pmulhrsw                m5, m7                        ;t20
-    pmulhrsw                m6, m7                        ;t27
    mova                    m4, [rsp+gprsize*2+16*14]     ;tmp11
-    mova [rsp+gprsize*2+16*31], m1                        ;out28
+    mova [rsp+gprsize*2+16*31], m5                        ;out28
    mova [rsp+gprsize*2+16*6 ], m2                        ;out3
-    psubsw                  m1, m4, m5                    ;out20
-    paddsw                  m4, m5                        ;out11
+    psubsw                  m5, m4, m6                    ;out20
+    paddsw                  m4, m6                        ;out11
    mova                    m2, [rsp+gprsize*2+16*7 ]     ;tmp4
-    mova [rsp+gprsize*2+16*23], m1                        ;out20
+    mova [rsp+gprsize*2+16*23], m5                        ;out20
    mova [rsp+gprsize*2+16*14], m4                        ;out11
-    psubsw                  m5, m2, m6                    ;out27
-    paddsw                  m2, m6                        ;out4
+    psubsw                  m5, m2, m1                    ;out27
+    paddsw                  m2, m1                        ;out4
    mova                    m1, [rsp+gprsize*2+16*26]     ;t23a
    mova                    m4, [rsp+gprsize*2+16*27]     ;t24a
    mova [rsp+gprsize*2+16*30], m5                        ;out27
@ -4248,27 +4227,24 @@ ALIGN function_align
    paddsw                  m0, m1                        ;t16
    psubsw                  m2, m3, m4                    ;t24
    paddsw                  m3, m4                        ;t31
+    ITX_MULSUB_2W            2, 5, 4, 6, 7, 2896, 2896    ;t23a, t24a
    mova                    m6, [rsp+gprsize*2+16*18]     ;tmp15
-    psubw                   m1, m2, m5                    ;t24  - t23
-    paddw                   m2, m5                        ;t24  + t23
    psubsw                  m4, m6, m0                    ;out16
    paddsw                  m6, m0                        ;out15
-    pmulhrsw                m1, m7                        ;t23a
-    pmulhrsw                m2, m7                        ;t24a
    mova                    m0, [rsp+gprsize*2+16*3 ]     ;tmp0
-    mova                    m5, [rsp+gprsize*2+16*11]     ;tmp8
+    mova                    m1, [rsp+gprsize*2+16*11]     ;tmp8
    mova [rsp+gprsize*2+16*18], m6                        ;out15
    mova [rsp+gprsize*2+16*19], m4                        ;out16
    psubsw                  m6, m0, m3                    ;out31
    paddsw                  m0, m3                        ;out0
-    psubsw                  m4, m5, m1                    ;out23
-    paddsw                  m5, m1                        ;out8
+    psubsw                  m4, m1, m2                    ;out23
+    paddsw                  m1, m2                        ;out8
    mova                    m3, [rsp+gprsize*2+16*10]     ;tmp7
    mova [rsp+gprsize*2+16*34], m6                        ;out31
-    mova [rsp+gprsize*2+16*11], m5                        ;out8
+    mova [rsp+gprsize*2+16*11], m1                        ;out8
    mova [rsp+gprsize*2+16*26], m4                        ;out23
-    paddsw                  m6, m3, m2                    ;out7
-    psubsw                  m3, m2                        ;out24
+    paddsw                  m6, m3, m5                    ;out7
+    psubsw                  m3, m5                        ;out24
    mova                    m1, [rsp+gprsize*2+16*20]     ;t17
    mova                    m5, [rsp+gprsize*2+16*25]     ;t22
    mova                    m2, [rsp+gprsize*2+16*17]     ;tmp14
@ -4283,23 +4259,20 @@ ALIGN function_align
    mova [rsp+gprsize*2+16*20], m3                        ;out17
    psubsw                  m2, m1, m5                    ;t25a
    paddsw                  m1, m5                        ;t30a
-    psubw                   m3, m2, m4                    ;t25a - t22a
-    paddw                   m2, m4                        ;t25a + t22a
+    ITX_MULSUB_2W            2, 4, 3, 5, 7, 2896, 2896    ;t22, t25
    mova                    m5, [rsp+gprsize*2+16*4 ]     ;tmp1
-    pmulhrsw                m3, m7                        ;t22
-    pmulhrsw                m2, m7                        ;t25
-    psubsw                  m4, m5, m1                    ;out30
+    psubsw                  m3, m5, m1                    ;out30
    paddsw                  m5, m1                        ;out1
    mova                    m1, [rsp+gprsize*2+16*12]     ;tmp9
-    mova [rsp+gprsize*2+16*33], m4                        ;out30
+    mova [rsp+gprsize*2+16*33], m3                        ;out30
    mova [rsp+gprsize*2+16*4 ], m5                        ;out1
-    psubsw                  m4, m1, m3                    ;out22
-    paddsw                  m1, m3                        ;out9
+    psubsw                  m3, m1, m2                    ;out22
+    paddsw                  m1, m2                        ;out9
    mova                    m5, [rsp+gprsize*2+16*9 ]     ;tmp6
-    mova [rsp+gprsize*2+16*25], m4                        ;out22
+    mova [rsp+gprsize*2+16*25], m3                        ;out22
    mova [rsp+gprsize*2+16*12], m1                        ;out9
-    psubsw                  m3, m5, m2                    ;out25
-    paddsw                  m5, m2                        ;out6
+    psubsw                  m3, m5, m4                    ;out25
+    paddsw                  m5, m4                        ;out6
    mova                    m4, [rsp+gprsize*2+16*21]     ;t18a
    mova                    m1, [rsp+gprsize*2+16*24]     ;t21a
    mova                    m2, [rsp+gprsize*2+16*16]     ;tmp13
@ -4315,17 +4288,14 @@ ALIGN function_align
    mova [rsp+gprsize*2+16*16], m2                        ;out13
    psubsw                  m5, m3, m1                    ;t26
    paddsw                  m3, m1                        ;t29
+    ITX_MULSUB_2W            5, 4, 1, 2, 7, 2896, 2896    ;t21a, t26a
    mova                    m2, [rsp+gprsize*2+16*5 ]     ;tmp2
-    psubw                   m1, m5, m4                    ;t26 - t21
-    paddw                   m4, m5                        ;t26 + t21
-    psubsw                  m5, m2, m3                    ;out29
+    psubsw                  m1, m2, m3                    ;out29
    paddsw                  m2, m3                        ;out2
-    pmulhrsw                m1, m7                        ;t21a
-    pmulhrsw                m4, m7                        ;t26a
    mova                    m3, [rsp+gprsize*2+16*13]     ;tmp10
-    mova [rsp+gprsize*2+16*32], m5                        ;out29
-    psubsw                  m7, m3, m1                    ;out21
-    paddsw                  m3, m1                        ;out10
+    mova [rsp+gprsize*2+16*32], m1                        ;out29
+    psubsw                  m7, m3, m5                    ;out21
+    paddsw                  m3, m5                        ;out10
    mova                    m5, [rsp+gprsize*2+16*8 ]     ;tmp5
    mova [rsp+gprsize*2+16*24], m7                        ;out21
    mova [rsp+gprsize*2+16*13], m3                        ;out10
@ -6010,262 +5980,237 @@ ALIGN function_align
    psubw                   m5, m6, m3
    ITX_MULSUB_2W            5, 4, 2, 3, 7, 1567, 3784    ;t43, t52

-    mova                    m7, [o(pw_2896x8)]
    mova                    m2, [rsp+gprsize*2+16*38]     ;t35a
    mova                    m3, [rsp+gprsize*2+16*31]     ;tmp[28]
    psubsw                  m6, m2, m0                    ;t44
    paddsw                  m2, m0                        ;t35
    psubsw                  m0, m3, m2                    ;out35
    paddsw                  m2, m3                        ;out28
+    mova                    m3, [rsp+gprsize*2+16*63]     ;t60a
    mova [rsp+gprsize*2+16*38], m0                        ;out35
    mova [rsp+gprsize*2+16*31], m2                        ;out28
-    mova                    m3, [rsp+gprsize*2+16*63]     ;t60a
-    mova                    m2, [rsp+gprsize*2+16*6 ]     ;tmp[3]
    psubsw                  m0, m3, m1                    ;t51
    paddsw                  m3, m1                        ;t60
-    psubw                   m1, m0, m6                    ;t44a
-    paddw                   m0, m6                        ;t51a
-    psubsw                  m6, m2, m3                    ;out60
+    ITX_MULSUB_2W            0, 6, 1, 2, 7, 2896, 2896    ;t44a, t51a
+    mova                    m2, [rsp+gprsize*2+16*6 ]     ;tmp[3]
+    psubsw                  m1, m2, m3                    ;out60
    paddsw                  m2, m3                        ;out3
-    pmulhrsw                m1, m7                        ;t44a
-    pmulhrsw                m0, m7                        ;t51a
    mova                    m3, [rsp+gprsize*2+16*22]     ;tmp[19]
-    mova [rsp+gprsize*2+16*63], m6                        ;out60
+    mova [rsp+gprsize*2+16*63], m1                        ;out60
    mova [rsp+gprsize*2+16*6 ], m2                        ;out3
-    psubsw                  m6, m3, m1                    ;out44
-    paddsw                  m3, m1                        ;out19
+    psubsw                  m1, m3, m0                    ;out44
+    paddsw                  m3, m0                        ;out19
    mova                    m2, [rsp+gprsize*2+16*15]     ;tmp[12]
-    mova [rsp+gprsize*2+16*47], m6                        ;out44
-    mova [rsp+gprsize*2+16*22], m3                        ;out19
-    psubsw                  m1, m2, m0                    ;out51
-    paddsw                  m2, m0                        ;out12
-    mova [rsp+gprsize*2+16*54], m1                        ;out51
-    mova [rsp+gprsize*2+16*15], m2                        ;out12

    mova                    m0, [rsp+gprsize*2+16*39]     ;t36
+    mova [rsp+gprsize*2+16*47], m1                        ;out44
+    mova [rsp+gprsize*2+16*22], m3                        ;out19
    mova                    m1, [rsp+gprsize*2+16*62]     ;t59
+    psubsw                  m3, m2, m6                    ;out51
+    paddsw                  m2, m6                        ;out12
+    mova [rsp+gprsize*2+16*54], m3                        ;out51
+    mova [rsp+gprsize*2+16*15], m2                        ;out12
    psubsw                  m2, m0, m5                    ;t43a
    paddsw                  m0, m5                        ;t36a
+    mova                    m5, [rsp+gprsize*2+16*30]     ;tmp[27]
    psubsw                  m3, m1, m4                    ;t52a
    paddsw                  m1, m4                        ;t59a
-    psubw                   m5, m3, m2                    ;t43
-    paddw                   m3, m2                        ;t52
-    mova                    m2, [rsp+gprsize*2+16*30]     ;tmp[27]
+    ITX_MULSUB_2W            3, 2, 4, 6, 7, 2896, 2896    ;t43, t52
    mova                    m4, [rsp+gprsize*2+16*7 ]     ;tmp[4 ]
-    pmulhrsw                m5, m7                        ;t43
-    pmulhrsw                m3, m7                        ;t52
-    psubsw                  m6, m2, m0                    ;out36
-    paddsw                  m2, m0                        ;out27
+    psubsw                  m6, m5, m0                    ;out36
+    paddsw                  m5, m0                        ;out27
    psubsw                  m0, m4, m1                    ;out59
    paddsw                  m4, m1                        ;out4
    mova [rsp+gprsize*2+16*39], m6                        ;out36
-    mova [rsp+gprsize*2+16*30], m2                        ;out27
+    mova [rsp+gprsize*2+16*30], m5                        ;out27
    mova [rsp+gprsize*2+16*62], m0                        ;out59
    mova [rsp+gprsize*2+16*7 ], m4                        ;out4
    mova                    m0, [rsp+gprsize*2+16*23]     ;tmp[20]
-    mova                    m2, [rsp+gprsize*2+16*14]     ;tmp[11]
-    psubsw                  m4, m0, m5                    ;out43
-    paddsw                  m0, m5                        ;out20
-    psubsw                  m6, m2, m3                    ;out52
-    paddsw                  m2, m3                        ;out11
+    mova                    m5, [rsp+gprsize*2+16*14]     ;tmp[11]
+    psubsw                  m4, m0, m3                    ;out43
+    paddsw                  m0, m3                        ;out20
+    psubsw                  m6, m5, m2                    ;out52
+    paddsw                  m5, m2                        ;out11
    mova [rsp+gprsize*2+16*46], m4                        ;out43
    mova [rsp+gprsize*2+16*23], m0                        ;out20
    mova [rsp+gprsize*2+16*55], m6                        ;out52
-    mova [rsp+gprsize*2+16*14], m2                        ;out11
+    mova [rsp+gprsize*2+16*14], m5                        ;out11

    mova                    m0, [rsp+gprsize*2+16*40]     ;t37a
-    mova                    m2, [rsp+gprsize*2+16*45]     ;t42a
+    mova                    m5, [rsp+gprsize*2+16*45]     ;t42a
    mova                    m3, [rsp+gprsize*2+16*56]     ;t53a
    mova                    m1, [rsp+gprsize*2+16*61]     ;t58a
-    psubsw                  m4, m0, m2                    ;t42
-    paddsw                  m0, m2                        ;t37
+    mova                    m2, [rsp+gprsize*2+16*29]     ;tmp[26]
+    psubsw                  m4, m0, m5                    ;t42
+    paddsw                  m0, m5                        ;t37
    psubsw                  m5, m1, m3                    ;t53
    paddsw                  m1, m3                        ;t58
-    psubw                   m6, m5, m4                    ;t42a
-    paddw                   m5, m4                        ;t53a
-    mova                    m2, [rsp+gprsize*2+16*29]     ;tmp[26]
+    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t42a, t53a
    mova                    m3, [rsp+gprsize*2+16*8 ]     ;tmp[5 ]
-    pmulhrsw                m6, m7                        ;t42a
-    pmulhrsw                m5, m7                        ;t53a
-    psubsw                  m4, m2, m0                    ;out37
+    psubsw                  m6, m2, m0                    ;out37
    paddsw                  m2, m0                        ;out26
    psubsw                  m0, m3, m1                    ;out58
    paddsw                  m3, m1                        ;out5
-    mova [rsp+gprsize*2+16*40], m4                        ;out37
+    mova [rsp+gprsize*2+16*40], m6                        ;out37
    mova [rsp+gprsize*2+16*29], m2                        ;out26
    mova [rsp+gprsize*2+16*61], m0                        ;out58
    mova [rsp+gprsize*2+16*8 ], m3                        ;out5
    mova                    m0, [rsp+gprsize*2+16*24]     ;tmp[21]
    mova                    m1, [rsp+gprsize*2+16*13]     ;tmp[10]
-    psubsw                  m2, m0, m6                    ;out42
-    paddsw                  m0, m6                        ;out21
-    psubsw                  m3, m1, m5                    ;out53
-    paddsw                  m1, m5                        ;out10
+    psubsw                  m2, m0, m5                    ;out42
+    paddsw                  m0, m5                        ;out21
+    psubsw                  m3, m1, m4                    ;out53
+    paddsw                  m1, m4                        ;out10
    mova [rsp+gprsize*2+16*45], m2                        ;out42
    mova [rsp+gprsize*2+16*24], m0                        ;out21
    mova [rsp+gprsize*2+16*56], m3                        ;out53
    mova [rsp+gprsize*2+16*13], m1                        ;out10

    mova                    m0, [rsp+gprsize*2+16*41]     ;t38
-    mova                    m2, [rsp+gprsize*2+16*44]     ;t41
+    mova                    m5, [rsp+gprsize*2+16*44]     ;t41
    mova                    m3, [rsp+gprsize*2+16*57]     ;t54
    mova                    m1, [rsp+gprsize*2+16*60]     ;t57
-    psubsw                  m4, m0, m2                    ;t41a
-    paddsw                  m0, m2                        ;t38a
+    mova                    m2, [rsp+gprsize*2+16*28]     ;tmp[25]
+    psubsw                  m4, m0, m5                    ;t41a
+    paddsw                  m0, m5                        ;t38a
    psubsw                  m5, m1, m3                    ;t54a
    paddsw                  m1, m3                        ;t57a
-    psubw                   m6, m5, m4                    ;t41
-    paddw                   m5, m4                        ;t54
-    mova                    m2, [rsp+gprsize*2+16*28]     ;tmp[25]
+    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t41a, t54a
    mova                    m3, [rsp+gprsize*2+16*9 ]     ;tmp[6 ]
-    pmulhrsw                m6, m7                        ;t41a
-    pmulhrsw                m5, m7                        ;t54a
-    psubsw                  m4, m2, m0                    ;out38
+    psubsw                  m6, m2, m0                    ;out38
    paddsw                  m2, m0                        ;out25
    psubsw                  m0, m3, m1                    ;out57
    paddsw                  m3, m1                        ;out6
-    mova [rsp+gprsize*2+16*41], m4                        ;out38
+    mova [rsp+gprsize*2+16*41], m6                        ;out38
    mova [rsp+gprsize*2+16*28], m2                        ;out25
    mova [rsp+gprsize*2+16*60], m0                        ;out57
    mova [rsp+gprsize*2+16*9 ], m3                        ;out6
    mova                    m0, [rsp+gprsize*2+16*25]     ;tmp[22]
    mova                    m1, [rsp+gprsize*2+16*12]     ;tmp[9 ]
-    psubsw                  m2, m0, m6                    ;out41
-    paddsw                  m0, m6                        ;out22
-    psubsw                  m3, m1, m5                    ;out54
-    paddsw                  m1, m5                        ;out9
+    psubsw                  m2, m0, m5                    ;out41
+    paddsw                  m0, m5                        ;out22
+    psubsw                  m3, m1, m4                    ;out54
+    paddsw                  m1, m4                        ;out9
    mova [rsp+gprsize*2+16*44], m2                        ;out41
    mova [rsp+gprsize*2+16*25], m0                        ;out22
    mova [rsp+gprsize*2+16*57], m3                        ;out54
    mova [rsp+gprsize*2+16*12], m1                        ;out9

    mova                    m0, [rsp+gprsize*2+16*42]     ;t39a
-    mova                    m2, [rsp+gprsize*2+16*43]     ;t40a
+    mova                    m5, [rsp+gprsize*2+16*43]     ;t40a
    mova                    m3, [rsp+gprsize*2+16*58]     ;t55a
    mova                    m1, [rsp+gprsize*2+16*59]     ;t56a
-    psubsw                  m4, m0, m2                    ;t40
-    paddsw                  m0, m2                        ;t39
+    mova                    m2, [rsp+gprsize*2+16*27]     ;tmp[24]
+    psubsw                  m4, m0, m5                    ;t40
+    paddsw                  m0, m5                        ;t39
    psubsw                  m5, m1, m3                    ;t55
    paddsw                  m1, m3                        ;t56
-    psubw                   m6, m5, m4                    ;t40a
-    paddw                   m5, m4                        ;t55a
-    mova                    m2, [rsp+gprsize*2+16*27]     ;tmp[24]
+    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t40a, t55a
    mova                    m3, [rsp+gprsize*2+16*10]     ;tmp[7 ]
-    pmulhrsw                m6, m7                        ;t40a
-    pmulhrsw                m5, m7                        ;t55a
-    psubsw                  m4, m2, m0                    ;out39
+    psubsw                  m6, m2, m0                    ;out39
    paddsw                  m2, m0                        ;out24
    psubsw                  m0, m3, m1                    ;out56
    paddsw                  m3, m1                        ;out7
-    mova [rsp+gprsize*2+16*42], m4                        ;out39
+    mova [rsp+gprsize*2+16*42], m6                        ;out39
    mova [rsp+gprsize*2+16*27], m2                        ;out24
    mova [rsp+gprsize*2+16*59], m0                        ;out56
    mova [rsp+gprsize*2+16*10], m3                        ;out7
    mova                    m0, [rsp+gprsize*2+16*26]     ;tmp[23]
    mova                    m1, [rsp+gprsize*2+16*11]     ;tmp[8 ]
-    psubsw                  m2, m0, m6                    ;out40
-    paddsw                  m0, m6                        ;out23
-    psubsw                  m3, m1, m5                    ;out55
-    paddsw                  m1, m5                        ;out8
+    psubsw                  m2, m0, m5                    ;out40
+    paddsw                  m0, m5                        ;out23
+    psubsw                  m3, m1, m4                    ;out55
+    paddsw                  m1, m4                        ;out8
    mova [rsp+gprsize*2+16*43], m2                        ;out40
    mova [rsp+gprsize*2+16*26], m0                        ;out23
    mova [rsp+gprsize*2+16*58], m3                        ;out55
    mova [rsp+gprsize*2+16*11], m1                        ;out8

    mova                    m0, [rsp+gprsize*2+16*37]     ;t34
-    mova                    m2, [rsp+gprsize*2+16*48]     ;t45
+    mova                    m5, [rsp+gprsize*2+16*48]     ;t45
    mova                    m3, [rsp+gprsize*2+16*53]     ;t50
    mova                    m1, [rsp+gprsize*2+16*64]     ;t61
-    psubsw                  m4, m0, m2                    ;t45a
-    paddsw                  m0, m2                        ;t34a
+    mova                    m2, [rsp+gprsize*2+16*32]     ;tmp[29]
+    psubsw                  m4, m0, m5                    ;t45a
+    paddsw                  m0, m5                        ;t34a
    psubsw                  m5, m1, m3                    ;t50a
    paddsw                  m1, m3                        ;t61a
-    psubw                   m6, m5, m4                    ;t45
-    paddw                   m5, m4                        ;t50
-    mova                    m2, [rsp+gprsize*2+16*32]     ;tmp[29]
+    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t45, t50
    mova                    m3, [rsp+gprsize*2+16*5 ]     ;tmp[2 ]
-    pmulhrsw                m6, m7                        ;t45
-    pmulhrsw                m5, m7                        ;t50
-    psubsw                  m4, m2, m0                    ;out34
+    psubsw                  m6, m2, m0                    ;out34
    paddsw                  m2, m0                        ;out29
    psubsw                  m0, m3, m1                    ;out61
    paddsw                  m3, m1                        ;out2
-    mova [rsp+gprsize*2+16*37], m4                        ;out34
+    mova [rsp+gprsize*2+16*37], m6                        ;out34
    mova [rsp+gprsize*2+16*32], m2                        ;out29
    mova [rsp+gprsize*2+16*64], m0                        ;out61
    mova [rsp+gprsize*2+16*5 ], m3                        ;out2
    mova                    m0, [rsp+gprsize*2+16*21]     ;tmp[18]
    mova                    m1, [rsp+gprsize*2+16*16]     ;tmp[13]
-    psubsw                  m2, m0, m6                    ;out45
-    paddsw                  m0, m6                        ;out18
-    psubsw                  m3, m1, m5                    ;out50
-    paddsw                  m1, m5                        ;out13
+    psubsw                  m2, m0, m5                    ;out45
+    paddsw                  m0, m5                        ;out18
+    psubsw                  m3, m1, m4                    ;out50
+    paddsw                  m1, m4                        ;out13
    mova [rsp+gprsize*2+16*48], m2                        ;out45
    mova [rsp+gprsize*2+16*21], m0                        ;out18
    mova [rsp+gprsize*2+16*53], m3                        ;out50
    mova [rsp+gprsize*2+16*16], m1                        ;out13

    mova                    m0, [rsp+gprsize*2+16*36]     ;t33a
-    mova                    m2, [rsp+gprsize*2+16*49]     ;t46a
+    mova                    m5, [rsp+gprsize*2+16*49]     ;t46a
    mova                    m3, [rsp+gprsize*2+16*52]     ;t49a
    mova                    m1, [rsp+gprsize*2+16*65]     ;t62a
-    psubsw                  m4, m0, m2                    ;t46
-    paddsw                  m0, m2                        ;t33
+    mova                    m2, [rsp+gprsize*2+16*33]     ;tmp[30]
+    psubsw                  m4, m0, m5                    ;t46
+    paddsw                  m0, m5                        ;t33
    psubsw                  m5, m1, m3                    ;t49
    paddsw                  m1, m3                        ;t62
-    psubw                   m6, m5, m4                    ;t46a
-    paddw                   m5, m4                        ;t49a
-    mova                    m2, [rsp+gprsize*2+16*33]     ;tmp[30]
+    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t46a, t49a
    mova                    m3, [rsp+gprsize*2+16*4 ]     ;tmp[1 ]
-    pmulhrsw                m6, m7                        ;t46a
-    pmulhrsw                m5, m7                        ;t49a
-    psubsw                  m4, m2, m0                    ;out33
+    psubsw                  m6, m2, m0                    ;out33
    paddsw                  m2, m0                        ;out30
    psubsw                  m0, m3, m1                    ;out62
    paddsw                  m3, m1                        ;out1
-    mova [rsp+gprsize*2+16*36], m4                        ;out33
+    mova [rsp+gprsize*2+16*36], m6                        ;out33
    mova [rsp+gprsize*2+16*33], m2                        ;out30
    mova [rsp+gprsize*2+16*65], m0                        ;out62
    mova [rsp+gprsize*2+16*4 ], m3                        ;out1
    mova                    m0, [rsp+gprsize*2+16*20]     ;tmp[17]
    mova                    m1, [rsp+gprsize*2+16*17]     ;tmp[14]
-    psubsw                  m2, m0, m6                    ;out46
-    paddsw                  m0, m6                        ;out17
-    psubsw                  m3, m1, m5                    ;out49
-    paddsw                  m1, m5                        ;out14
+    psubsw                  m2, m0, m5                    ;out46
+    paddsw                  m0, m5                        ;out17
+    psubsw                  m3, m1, m4                    ;out49
+    paddsw                  m1, m4                        ;out14
    mova [rsp+gprsize*2+16*49], m2                        ;out46
    mova [rsp+gprsize*2+16*20], m0                        ;out17
    mova [rsp+gprsize*2+16*52], m3                        ;out49
    mova [rsp+gprsize*2+16*17], m1                        ;out14

    mova                    m0, [rsp+gprsize*2+16*35]     ;t32
-    mova                    m2, [rsp+gprsize*2+16*50]     ;t47
+    mova                    m5, [rsp+gprsize*2+16*50]     ;t47
    mova                    m3, [rsp+gprsize*2+16*51]     ;t48
    mova                    m1, [rsp+gprsize*2+16*66]     ;t63
-    psubsw                  m4, m0, m2                    ;t47a
-    paddsw                  m0, m2                        ;t32a
+    mova                    m2, [rsp+gprsize*2+16*34]     ;tmp[31]
+    psubsw                  m4, m0, m5                    ;t47a
+    paddsw                  m0, m5                        ;t32a
    psubsw                  m5, m1, m3                    ;t48a
    paddsw                  m1, m3                        ;t63a
-    psubw                   m6, m5, m4                    ;t47
-    paddw                   m5, m4                        ;t48
-    mova                    m2, [rsp+gprsize*2+16*34]     ;tmp[31]
+    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t47, t48
    mova                    m3, [rsp+gprsize*2+16*3 ]     ;tmp[0 ]
-    pmulhrsw                m6, m7                        ;t47
-    pmulhrsw                m5, m7                        ;t48
-    psubsw                  m4, m2, m0                    ;out32
+    psubsw                  m6, m2, m0                    ;out32
    paddsw                  m2, m0                        ;out31
    psubsw                  m0, m3, m1                    ;out63
    paddsw                  m3, m1                        ;out0
-    mova [rsp+gprsize*2+16*35], m4                        ;out32
+    mova [rsp+gprsize*2+16*35], m6                        ;out32
    mova [rsp+gprsize*2+16*34], m2                        ;out31
    mova [rsp+gprsize*2+16*66], m0                        ;out63
    mova [rsp+gprsize*2+16*3 ], m3                        ;out0
    mova                    m0, [rsp+gprsize*2+16*19]     ;tmp[16]
    mova                    m1, [rsp+gprsize*2+16*18]     ;tmp[15]
-    psubsw                  m2, m0, m6                    ;out47
-    paddsw                  m0, m6                        ;out16
-    psubsw                  m3, m1, m5                    ;out48
-    paddsw                  m1, m5                        ;out15
+    psubsw                  m2, m0, m5                    ;out47
+    paddsw                  m0, m5                        ;out16
+    psubsw                  m3, m1, m4                    ;out48
+    paddsw                  m1, m4                        ;out15
    mova [rsp+gprsize*2+16*50], m2                        ;out47
    mova [rsp+gprsize*2+16*19], m0                        ;out16
    mova [rsp+gprsize*2+16*51], m3                        ;out48
@ -6273,7 +6218,6 @@ ALIGN function_align
    ret


-
 cglobal inv_txfm_add_dct_dct_64x16, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2
 %if ARCH_X86_32
    LEA                     r5, $$
--- a/third_party/dav1d/src/x86/mc_init_tmpl.c
+++ b/third_party/dav1d/src/x86/mc_init_tmpl.c
@ -88,7 +88,11 @@ decl_blend_dir_fn(dav1d_blend_h_avx2);
 decl_blend_dir_fn(dav1d_blend_h_ssse3);

 decl_warp8x8_fn(dav1d_warp_affine_8x8_avx2);
+decl_warp8x8_fn(dav1d_warp_affine_8x8_sse4);
+decl_warp8x8_fn(dav1d_warp_affine_8x8_ssse3);
 decl_warp8x8t_fn(dav1d_warp_affine_8x8t_avx2);
+decl_warp8x8t_fn(dav1d_warp_affine_8x8t_sse4);
+decl_warp8x8t_fn(dav1d_warp_affine_8x8t_ssse3);

 decl_emu_edge_fn(dav1d_emu_edge_avx2);
 decl_emu_edge_fn(dav1d_emu_edge_ssse3);
@ -134,9 +138,21 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
    c->blend = dav1d_blend_ssse3;
    c->blend_v = dav1d_blend_v_ssse3;
    c->blend_h = dav1d_blend_h_ssse3;
+
+    c->warp8x8  = dav1d_warp_affine_8x8_ssse3;
+    c->warp8x8t = dav1d_warp_affine_8x8t_ssse3;
+
    c->emu_edge = dav1d_emu_edge_ssse3;
 #endif

+    if(!(flags & DAV1D_X86_CPU_FLAG_SSE41))
+        return;
+
+#if BITDEPTH == 8
+    c->warp8x8  = dav1d_warp_affine_8x8_sse4;
+    c->warp8x8t = dav1d_warp_affine_8x8t_sse4;
+#endif
+
    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2))
        return;

--- a/third_party/dav1d/src/x86/mc_ssse3.asm
+++ b/third_party/dav1d/src/x86/mc_ssse3.asm
@ -44,6 +44,10 @@ obmc_masks: db  0,  0,  0,  0
            db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55,  9
            db 56,  8, 57,  7, 58,  6, 59,  5, 60,  4, 60,  4, 61,  3, 62,  2

+warp_8x8_shufA: db 0,  2,  4,  6,  1,  3,  5,  7,  1,  3,  5,  7,  2,  4,  6,  8
+warp_8x8_shufB: db 4,  6,  8, 10,  5,  7,  9, 11,  5,  7,  9, 11,  6,  8, 10, 12
+warp_8x8_shufC: db 2,  4,  6,  8,  3,  5,  7,  9,  3,  5,  7,  9,  4,  6,  8, 10
+warp_8x8_shufD: db 6,  8, 10, 12,  7,  9, 11, 13,  7,  9, 11, 13,  8, 10, 12, 14
 blend_shuf:     db 0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
 subpel_h_shuf4: db 0,  1,  2,  3,  1,  2,  3,  4,  8,  9, 10, 11,  9, 10, 11, 12
                db 2,  3,  4,  5,  3,  4,  5,  6, 10, 11, 12, 13, 11, 12, 13, 14
@ -53,17 +57,18 @@ subpel_h_shufC: db 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
 bilin_h_shuf4:  db 1,  0,  2,  1,  3,  2,  4,  3,  9,  8, 10,  9, 11, 10, 12, 11
 bilin_h_shuf8:  db 1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7

-pb_64:   times 16 db 64
-pw_8:    times 8 dw 8
-pw_26:   times 8 dw 26
-pw_34:   times 8 dw 34
-pw_512:  times 8 dw 512
-pw_1024: times 8 dw 1024
-pw_2048: times 8 dw 2048
-pw_6903: times 8 dw 6903
-pw_8192: times 8 dw 8192
-pd_32:   times 4 dd 32
-pd_512:  times 4 dd 512
+pb_64:    times 16 db 64
+pw_8:     times 8 dw 8
+pw_26:    times 8 dw 26
+pw_34:    times 8 dw 34
+pw_512:   times 8 dw 512
+pw_1024:  times 8 dw 1024
+pw_2048:  times 8 dw 2048
+pw_6903:  times 8 dw 6903
+pw_8192:  times 8 dw 8192
+pd_32:    times 4 dd 32
+pd_512:   times 4 dd 512
+pd_32768: times 4 dd 32768

 pw_258:  times 2 dw 258

@ -146,6 +151,8 @@ HV_JMP_TABLE prep, bilin, ssse3, 7,    4, 8, 16, 32, 64, 128

 %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX

+cextern mc_warp_filter
+
 SECTION .text

 INIT_XMM ssse3
@ -3302,6 +3309,580 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
    jg .hv_w8_loop0
    RET

+%if ARCH_X86_32 ; alpha/delta and beta/gamma share registers here (see the %defines in WARP_AFFINE_8X8), so they are swapped via memory
+ %macro SAVE_ALPHA_BETA 0 ; store alpha/beta to their memory slots
+    mov              alpham, alphad
+    mov               betam, betad
+ %endmacro
+
+ %macro SAVE_DELTA_GAMMA 0 ; store delta/gamma to their memory slots
+    mov              deltam, deltad
+    mov              gammam, gammad
+ %endmacro
+
+ %macro LOAD_ALPHA_BETA_MX 0 ; save my, then reload alpha, beta and mx from memory
+    mov                 mym, myd
+    mov              alphad, alpham
+    mov               betad, betam
+    mov                 mxd, mxm
+ %endmacro
+
+ %macro LOAD_DELTA_GAMMA_MY 0 ; save mx, then reload delta, gamma and my from memory
+    mov                 mxm, mxd
+    mov              deltad, deltam
+    mov              gammad, gammam
+    mov                 myd, mym
+ %endmacro
+
+ %define PIC_reg r2
+ %define PIC_base_offset $$
+ %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
+%else ; x86-64: the SAVE_* macros expand to nothing and PIC_sym(sym) is just sym
+ %define SAVE_ALPHA_BETA
+ %define SAVE_DELTA_GAMMA
+ %define PIC_sym(sym) sym
+%endif
+
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < required_stack_alignment
+  %assign copy_args 8*4
+ %else
+  %assign copy_args 0
+ %endif
+%endif
+
+%macro RELOC_ARGS 0
+ %if copy_args
+    mov                  r0, r0m
+    mov                  r1, r1m
+    mov                  r2, r2m
+    mov                  r3, r3m
+    mov                  r5, r5m
+    mov                dstm, r0
+    mov                 dsm, r1
+    mov                srcm, r2
+    mov                 ssm, r3
+    mov                 mxm, r5
+    mov                  r0, r6m
+    mov                 mym, r0
+ %endif
+%endmacro
+
+%macro BLENDHWDW 2 ; blend high words from dwords, src1, src2
+ %if cpuflag(sse4)
+    pblendw              %1, %2, 0xAA ; 0xAA selects the odd words (upper half of each dword) from src2
+ %else
+    pand                 %2, m10 ; m10 must hold blendmask (0xFFFF0000 per dword); src2 is clobbered
+    por                  %1, %2 ; callers clear the upper words of src1 with psrld 16 beforehand
+ %endif
+%endmacro
+
+%macro WARP_V 10 ; dst0, dst1, 0, 2, 4, 6, 1, 3, 5, 7
+    ; Can be done using gathers, but that's terribly slow on many CPUs
+ %if ARCH_X86_32
+  %define m8  m4
+  %define m9  m5
+  %define m14 m6
+  %define m15 m7
+  %define m11 m7
+    pxor                m11, m11
+ %endif
+    lea               tmp1d, [myq+deltaq*4]
+    lea               tmp2d, [myq+deltaq*1]
+    shr                 myd, 10
+    shr               tmp1d, 10
+    movq                 m2, [filterq+myq  *8] ; a
+    movq                 m8, [filterq+tmp1q*8] ; e
+    lea               tmp1d, [tmp2q+deltaq*4]
+    lea                 myd, [tmp2q+deltaq*1]
+    shr               tmp2d, 10
+    shr               tmp1d, 10
+    movq                 m3, [filterq+tmp2q*8] ; b
+    movq                 m0, [filterq+tmp1q*8] ; f
+    punpcklwd            m2, m3
+    punpcklwd            m8, m0
+    lea               tmp1d, [myq+deltaq*4]
+    lea               tmp2d, [myq+deltaq*1]
+    shr                 myd, 10
+    shr               tmp1d, 10
+    movq                 m0, [filterq+myq  *8] ; c
+    movq                 m9, [filterq+tmp1q*8] ; g
+    lea               tmp1d, [tmp2q+deltaq*4]
+    lea                 myd, [tmp2q+gammaq]       ; my += gamma
+    shr               tmp2d, 10
+    shr               tmp1d, 10
+    movq                 m3, [filterq+tmp2q*8] ; d
+    movq                 m1, [filterq+tmp1q*8] ; h
+    punpcklwd            m0, m3
+    punpcklwd            m9, m1
+    punpckldq            m1, m2, m0
+    punpckhdq            m2, m0
+    punpcklbw            m0, m11, m1 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
+    punpckhbw            m3, m11, m1 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
+    punpcklbw            m1, m11, m2 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
+    punpckhbw           m14, m11, m2 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
+    pmaddwd              m0, %3
+    pmaddwd              m3, %5
+    pmaddwd              m1, %7
+    pmaddwd             m14, %9
+    paddd                m0, m3
+    paddd                m1, m14
+    paddd                m0, m1
+    mova                 %1, m0
+ %if ARCH_X86_64
+    SWAP                 m3, m14
+ %endif
+    punpckldq            m0, m8, m9
+    punpckhdq            m8, m9
+    punpcklbw            m1, m11, m0 ; e0 e2 f0 f2 g0 g2 h0 h2 << 8
+    punpckhbw           m14, m11, m0 ; e4 e6 f4 f6 g4 g6 h4 h6 << 8
+    punpcklbw            m2, m11, m8 ; e1 e3 f1 f3 g1 g3 h1 h3 << 8
+    punpckhbw           m15, m11, m8 ; e5 e7 f5 f7 g5 g7 h5 h7 << 8
+    pmaddwd              m1, %4
+    pmaddwd             m14, %6
+    pmaddwd              m2, %8
+    pmaddwd             m15, %10
+    paddd                m1, m14
+    paddd                m2, m15
+    paddd                m1, m2
+    mova                 %2, m1
+ %if ARCH_X86_64
+    SWAP                m14, m3
+ %endif
+%endmacro
+
+%if ARCH_X86_64
+ %define counterd r4d
+%else
+ %if copy_args == 0
+  %define counterd dword r4m
+ %else
+  %define counterd dword [esp+stack_size-4*7]
+ %endif
+%endif
+
+%macro WARP_AFFINE_8X8T 0
+%if ARCH_X86_64
+cglobal warp_affine_8x8t, 6, 14, 16, 0x90, tmp, ts
+%else
+cglobal warp_affine_8x8t, 0, 7, 16, -0x130-copy_args, tmp, ts
+ %if copy_args
+  %define tmpm [esp+stack_size-4*1]
+  %define tsm  [esp+stack_size-4*2]
+ %endif
+%endif
+    call mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).main
+.loop:
+%if ARCH_X86_32
+ %define m12 m4
+ %define m13 m5
+ %define m14 m6
+ %define m15 m7
+    mova                m12, [esp+0xC0]
+    mova                m13, [esp+0xD0]
+    mova                m14, [esp+0xE0]
+    mova                m15, [esp+0xF0]
+%endif
+    psrad               m12, 13
+    psrad               m13, 13
+    psrad               m14, 13
+    psrad               m15, 13
+    packssdw            m12, m13
+    packssdw            m14, m15
+    mova                m13, [PIC_sym(pw_8192)]
+    pmulhrsw            m12, m13 ; (x + (1 << 6)) >> 7
+    pmulhrsw            m14, m13
+    mova       [tmpq+tsq*0], m12
+    mova       [tmpq+tsq*2], m14
+    dec            counterd
+    jz   mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).end
+%if ARCH_X86_32
+    mov                tmpm, tmpd
+    mov                  r0, [esp+0x100]
+    mov                  r1, [esp+0x104]
+%endif
+    call mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).main2
+    lea                tmpq, [tmpq+tsq*4]
+    jmp .loop
+%endmacro
+
+%macro WARP_AFFINE_8X8 0
+%if ARCH_X86_64
+cglobal warp_affine_8x8, 6, 14, 16, 0x90, \
+                         dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
+                         filter, tmp1, delta, my, gamma
+%else
+cglobal warp_affine_8x8, 0, 7, 16, -0x130-copy_args, \
+                         dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
+                         filter, tmp1, delta, my, gamma
+ %define alphaq     r0
+ %define alphad     r0
+ %define alpham     [esp+gprsize+0x100]
+ %define betaq      r1
+ %define betad      r1
+ %define betam      [esp+gprsize+0x104]
+ %define deltaq     r0
+ %define deltad     r0
+ %define deltam     [esp+gprsize+0x108]
+ %define gammaq     r1
+ %define gammad     r1
+ %define gammam     [esp+gprsize+0x10C]
+ %define filterq    r3
+ %define tmp1q      r4
+ %define tmp1d      r4
+ %define tmp1m      [esp+gprsize+0x110]
+ %define myq        r5
+ %define myd        r5
+ %define mym        r6m
+ %if copy_args
+  %define dstm [esp+stack_size-4*1]
+  %define dsm  [esp+stack_size-4*2]
+  %define srcm [esp+stack_size-4*3]
+  %define ssm  [esp+stack_size-4*4]
+  %define mxm  [esp+stack_size-4*5]
+  %define mym  [esp+stack_size-4*6]
+ %endif
+%endif
+    call .main
+    jmp .start
+.loop:
+%if ARCH_X86_32
+    mov                dstm, dstd
+    mov              alphad, [esp+0x100]
+    mov               betad, [esp+0x104]
+%endif
+    call .main2
+    lea                dstq, [dstq+dsq*2]
+.start:
+%if cpuflag(ssse3)
+ %if ARCH_X86_64
+    mova                m10, [PIC_sym(pw_8192)]
+ %else
+  %define m10 [PIC_sym(pw_8192)]
+ %endif
+%endif
+%if ARCH_X86_32
+ %define m12 m5
+ %define m13 m6
+    mova                m12, [esp+0xC0]
+    mova                m13, [esp+0xD0]
+%endif
+%if cpuflag(sse4)
+ %if ARCH_X86_32
+  %define m11 m4
+    pxor                m11, m11
+ %endif
+    psrad               m12, 18
+    psrad               m13, 18
+    packusdw            m12, m13
+    pavgw               m12, m11 ; (x + (1 << 10)) >> 11
+%else
+    psrad               m12, 17
+    psrad               m13, 17
+    packssdw            m12, m13
+    pmulhrsw            m12, m10 ; (x + (1 << 10)) >> 11
+%endif
+%if ARCH_X86_32
+ %define m14 m6
+ %define m15 m7
+    mova                m14, [esp+0xE0]
+    mova                m15, [esp+0xF0]
+%endif
+%if cpuflag(sse4)
+    psrad               m14, 18
+    psrad               m15, 18
+    packusdw            m14, m15
+    pavgw               m14, m11 ; (x + (1 << 10)) >> 11
+%else
+    psrad               m14, 17
+    psrad               m15, 17
+    packssdw            m14, m15
+    pmulhrsw            m14, m10 ; (x + (1 << 10)) >> 11
+%endif
+    packuswb            m12, m14
+    movq       [dstq+dsq*0], m12
+    movhps     [dstq+dsq*1], m12
+    dec            counterd
+    jg .loop
+.end:
+    RET
+ALIGN function_align
+.main:
+%assign stack_offset stack_offset+gprsize
+%if ARCH_X86_32
+ %assign stack_size stack_size+4
+ %if copy_args
+  %assign stack_offset stack_offset-4
+ %endif
+    RELOC_ARGS
+    LEA             PIC_reg, $$
+ %define PIC_mem [esp+gprsize+0x114]
+    mov               abcdd, abcdm
+ %if copy_args == 0
+    mov                 ssd, ssm
+    mov                 mxd, mxm
+ %endif
+    mov             PIC_mem, PIC_reg
+    mov                srcd, srcm
+%endif
+    movsx            deltad, word [abcdq+2*2]
+    movsx            gammad, word [abcdq+2*3]
+    lea               tmp1d, [deltaq*3]
+    sub              gammad, tmp1d    ; gamma -= delta*3
+    SAVE_DELTA_GAMMA
+%if ARCH_X86_32
+    mov               abcdd, abcdm
+%endif
+    movsx            alphad, word [abcdq+2*0]
+    movsx             betad, word [abcdq+2*1]
+    lea               tmp1q, [ssq*3+3]
+    add                 mxd, 512+(64<<10)
+    lea               tmp2d, [alphaq*3]
+    sub                srcq, tmp1q    ; src -= src_stride*3 + 3
+%if ARCH_X86_32
+    mov                srcm, srcd
+    mov             PIC_reg, PIC_mem
+%endif
+    sub               betad, tmp2d    ; beta -= alpha*3
+    lea             filterq, [PIC_sym(mc_warp_filter)]
+%if ARCH_X86_64
+    mov                 myd, r6m
+    pxor                m11, m11
+%endif
+    call .h
+    psrld                m2, m0, 16
+    psrld                m3, m1, 16
+%if ARCH_X86_32
+    mova [esp+gprsize+0x10], m3
+%endif
+    call .h
+    psrld                m4, m0, 16
+    psrld                m5, m1, 16
+%if ARCH_X86_32
+    mova [esp+gprsize+0x20], m4
+    mova [esp+gprsize+0x30], m5
+%endif
+    call .h
+%if ARCH_X86_64
+ %define blendmask [rsp+gprsize+0x80]
+%else
+    mova                 m3, [esp+gprsize+0x10]
+ %define blendmask [esp+gprsize+0x120]
+ %define m10 m7
+%endif
+    pcmpeqd             m10, m10
+    pslld               m10, 16
+    mova          blendmask, m10
+    BLENDHWDW            m2, m0 ; 0
+    BLENDHWDW            m3, m1 ; 2
+    mova [rsp+gprsize+0x00], m2
+    mova [rsp+gprsize+0x10], m3
+    call .h
+%if ARCH_X86_32
+    mova                 m4, [esp+gprsize+0x20]
+    mova                 m5, [esp+gprsize+0x30]
+%endif
+    mova                m10, blendmask
+    BLENDHWDW            m4, m0 ; 1
+    BLENDHWDW            m5, m1 ; 3
+    mova [rsp+gprsize+0x20], m4
+    mova [rsp+gprsize+0x30], m5
+    call .h
+%if ARCH_X86_32
+    mova                 m3, [esp+gprsize+0x10]
+ %define m10 m5
+%endif
+    psrld                m6, m2, 16
+    psrld                m7, m3, 16
+    mova                m10, blendmask
+    BLENDHWDW            m6, m0 ; 2
+    BLENDHWDW            m7, m1 ; 4
+    mova [rsp+gprsize+0x40], m6
+    mova [rsp+gprsize+0x50], m7
+    call .h
+%if ARCH_X86_32
+    mova                m4, [esp+gprsize+0x20]
+    mova                m5, [esp+gprsize+0x30]
+%endif
+    psrld               m2, m4, 16
+    psrld               m3, m5, 16
+    mova                m10, blendmask
+    BLENDHWDW           m2, m0 ; 3
+    BLENDHWDW           m3, m1 ; 5
+    mova [rsp+gprsize+0x60], m2
+    mova [rsp+gprsize+0x70], m3
+    call .h
+%if ARCH_X86_32
+    mova                 m6, [esp+gprsize+0x40]
+    mova                 m7, [esp+gprsize+0x50]
+ %define m10 m7
+%endif
+    psrld                m4, m6, 16
+    psrld                m5, m7, 16
+    mova                m10, blendmask
+    BLENDHWDW            m4, m0 ; 4
+    BLENDHWDW            m5, m1 ; 6
+%if ARCH_X86_64
+    add                 myd, 512+(64<<10)
+    mova                 m6, m2
+    mova                 m7, m3
+%else
+    mova [esp+gprsize+0x80], m4
+    mova [esp+gprsize+0x90], m5
+    add           dword mym, 512+(64<<10)
+%endif
+    mov            counterd, 4
+    SAVE_ALPHA_BETA
+.main2:
+    call .h
+%if ARCH_X86_32
+    mova                 m6, [esp+gprsize+0x60]
+    mova                 m7, [esp+gprsize+0x70]
+ %define m10 m5
+%endif
+    psrld                m6, 16
+    psrld                m7, 16
+    mova                m10, blendmask
+    BLENDHWDW            m6, m0 ; 5
+    BLENDHWDW            m7, m1 ; 7
+%if ARCH_X86_64
+    WARP_V              m12, m13, [rsp+gprsize+0x00], [rsp+gprsize+0x10], \
+                                  m4, m5, \
+                                  [rsp+gprsize+0x20], [rsp+gprsize+0x30], \
+                                  m6, m7
+%else
+    mova [esp+gprsize+0xA0], m6
+    mova [esp+gprsize+0xB0], m7
+    LOAD_DELTA_GAMMA_MY
+    WARP_V [esp+gprsize+0xC0], [esp+gprsize+0xD0], \
+           [esp+gprsize+0x00], [esp+gprsize+0x10], \
+           [esp+gprsize+0x80], [esp+gprsize+0x90], \
+           [esp+gprsize+0x20], [esp+gprsize+0x30], \
+           [esp+gprsize+0xA0], [esp+gprsize+0xB0]
+    LOAD_ALPHA_BETA_MX
+%endif
+    call .h
+    mova                 m2, [rsp+gprsize+0x40]
+    mova                 m3, [rsp+gprsize+0x50]
+%if ARCH_X86_32
+    mova                 m4, [rsp+gprsize+0x80]
+    mova                 m5, [rsp+gprsize+0x90]
+ %define m10 m7
+%endif
+    mova [rsp+gprsize+0x00], m2
+    mova [rsp+gprsize+0x10], m3
+    mova [rsp+gprsize+0x40], m4
+    mova [rsp+gprsize+0x50], m5
+    psrld                m4, 16
+    psrld                m5, 16
+    mova                m10, blendmask
+    BLENDHWDW            m4, m0 ; 6
+    BLENDHWDW            m5, m1 ; 8
+%if ARCH_X86_64
+    WARP_V              m14, m15, [rsp+gprsize+0x20], [rsp+gprsize+0x30], \
+                                  m6, m7, \
+                                  [rsp+gprsize+0x00], [rsp+gprsize+0x10], \
+                                  m4, m5
+%else
+    mova [esp+gprsize+0x80], m4
+    mova [esp+gprsize+0x90], m5
+    LOAD_DELTA_GAMMA_MY
+    WARP_V [esp+gprsize+0xE0], [esp+gprsize+0xF0], \
+           [esp+gprsize+0x20], [esp+gprsize+0x30], \
+           [esp+gprsize+0xA0], [esp+gprsize+0xB0], \
+           [esp+gprsize+0x00], [esp+gprsize+0x10], \
+           [esp+gprsize+0x80], [esp+gprsize+0x90]
+    mov                 mym, myd
+    mov                dstd, dstm
+    mov                 dsd, dsm
+    mov                 mxd, mxm
+%endif
+    mova                 m2, [rsp+gprsize+0x60]
+    mova                 m3, [rsp+gprsize+0x70]
+%if ARCH_X86_32
+    mova                 m6, [esp+gprsize+0xA0]
+    mova                 m7, [esp+gprsize+0xB0]
+%endif
+    mova [rsp+gprsize+0x20], m2
+    mova [rsp+gprsize+0x30], m3
+    mova [rsp+gprsize+0x60], m6
+    mova [rsp+gprsize+0x70], m7
+    ret
+ALIGN function_align
+.h:
+%if ARCH_X86_32
+ %define m8  m3
+ %define m9  m4
+ %define m10 m5
+ %define m14 m6
+ %define m15 m7
+%endif
+    lea               tmp1d, [mxq+alphaq*4]
+    lea               tmp2d, [mxq+alphaq*1]
+%if ARCH_X86_32
+ %assign stack_offset stack_offset+4
+ %assign stack_size stack_size+4
+ %define PIC_mem [esp+gprsize*2+0x114]
+    mov             PIC_mem, PIC_reg
+    mov                srcd, srcm
+%endif
+    movu                m10, [srcq]
+%if ARCH_X86_32
+    add                srcd, ssm
+    mov                srcm, srcd
+    mov             PIC_reg, PIC_mem
+%else
+    add                srcq, ssq
+%endif
+    shr                 mxd, 10
+    shr               tmp1d, 10
+    movq                 m1, [filterq+mxq  *8]  ; 0 X
+    movq                 m8, [filterq+tmp1q*8]  ; 4 X
+    lea               tmp1d, [tmp2q+alphaq*4]
+    lea                 mxd, [tmp2q+alphaq*1]
+    shr               tmp2d, 10
+    shr               tmp1d, 10
+    movhps               m1, [filterq+tmp2q*8]  ; 0 1
+    movhps               m8, [filterq+tmp1q*8]  ; 4 5
+    lea               tmp1d, [mxq+alphaq*4]
+    lea               tmp2d, [mxq+alphaq*1]
+    shr                 mxd, 10
+    shr               tmp1d, 10
+    movq                m14, [filterq+mxq  *8]  ; 2 X
+    movq                 m9, [filterq+tmp1q*8]  ; 6 X
+    lea               tmp1d, [tmp2q+alphaq*4]
+    lea                 mxd, [tmp2q+betaq]  ; mx += beta
+    shr               tmp2d, 10
+    shr               tmp1d, 10
+    movhps              m14, [filterq+tmp2q*8]  ; 2 3
+    movhps               m9, [filterq+tmp1q*8]  ; 6 7
+    pshufb               m0, m10, [PIC_sym(warp_8x8_shufA)]
+    pmaddubsw            m0, m1
+    pshufb               m1, m10, [PIC_sym(warp_8x8_shufB)]
+    pmaddubsw            m1, m8
+    pshufb              m15, m10, [PIC_sym(warp_8x8_shufC)]
+    pmaddubsw           m15, m14
+    pshufb              m10, m10, [PIC_sym(warp_8x8_shufD)]
+    pmaddubsw           m10, m9
+    mova                m14, [PIC_sym(pw_8192)]
+    mova                 m9, [PIC_sym(pd_32768)]
+    phaddw               m0, m15
+    phaddw               m1, m10
+    pmaddwd              m0, m14 ; 17-bit intermediate, upshifted by 13
+    pmaddwd              m1, m14
+    paddd                m0, m9  ; rounded 14-bit result in upper 16 bits of dword
+    paddd                m1, m9
+    ret
+%endmacro
+
+INIT_XMM sse4
+WARP_AFFINE_8X8
+WARP_AFFINE_8X8T
+
+INIT_XMM ssse3
+WARP_AFFINE_8X8
+WARP_AFFINE_8X8T
+
 %if WIN64
 DECLARE_REG_TMP 6, 4
 %else
--- a/third_party/dav1d/tests/checkasm/checkasm.c
+++ b/third_party/dav1d/tests/checkasm/checkasm.c
@ -142,6 +142,7 @@ static struct {
    unsigned int seed;
    int bench_c;
    int verbose;
+    int function_listing;
 } state;

 /* float compare support code */
@ -365,6 +366,14 @@ static void print_benchs(const CheckasmFunc *const f) {
 }
 #endif

+static void print_functions(const CheckasmFunc *const f) { /* print every function name in the tree, one per line */
+    if (f) { /* a NULL node ends the recursion */
+        print_functions(f->child[0]); /* in-order walk: child[0], this node, child[1] */
+        printf("%s\n", f->name);
+        print_functions(f->child[1]);
+    }
+}
+
 #define is_digit(x) ((x) >= '0' && (x) <= '9')

 /* ASCIIbetical sort except preserving natural order for numbers */
@ -515,7 +524,8 @@ int main(int argc, char *argv[]) {
                    "Options:\n"
                    "    --test=<test_name>  Test only <test_name>\n"
                    "    --bench=<pattern>   Test and benchmark the functions matching <pattern>\n"
-                    "    --list              List the available tests\n"
+                    "    --list-functions    List available functions\n"
+                    "    --list-tests        List available tests\n"
                    "    --bench-c           Benchmark the C-only functions\n"
                    "    --verbose -v        Print failures verbosely\n");
            return 0;
@ -534,11 +544,11 @@ int main(int argc, char *argv[]) {
                state.bench_pattern = "";
        } else if (!strncmp(argv[1], "--test=", 7)) {
            state.test_name = argv[1] + 7;
-        } else if (!strcmp(argv[1], "--list")) {
-            fprintf(stderr, "checkasm: available tests [");
-            for (int i = 0; tests[i].func; i++)
-                fprintf(stderr, "%s%s", i ? ", ": "", tests[i].name);
-            fprintf(stderr, "]\n");
+        } else if (!strcmp(argv[1], "--list-functions")) {
+            state.function_listing = 1;
+        } else if (!strcmp(argv[1], "--list-tests")) {
+            for (int i = 0; tests[i].name; i++)
+                printf("%s\n", tests[i].name);
            return 0;
        } else if (!strcmp(argv[1], "--verbose") || !strcmp(argv[1], "-v")) {
            state.verbose = 1;
@ -553,24 +563,28 @@ int main(int argc, char *argv[]) {
    fprintf(stderr, "checkasm: using random seed %u\n", state.seed);

    check_cpu_flag(NULL, 0);
-    for (int i = 0; cpus[i].flag; i++)
-        check_cpu_flag(cpus[i].name, cpus[i].flag);
-
-    if (!state.num_checked) {
-        fprintf(stderr, "checkasm: no tests to perform\n");
-    } else if (state.num_failed) {
-        fprintf(stderr, "checkasm: %d of %d tests have failed\n",
-                state.num_failed, state.num_checked);
-        ret = 1;
+    if (state.function_listing) {
+        print_functions(state.funcs);
    } else {
-        fprintf(stderr, "checkasm: all %d tests passed\n", state.num_checked);
+        for (int i = 0; cpus[i].flag; i++)
+            check_cpu_flag(cpus[i].name, cpus[i].flag);
+
+        if (!state.num_checked) {
+            fprintf(stderr, "checkasm: no tests to perform\n");
+        } else if (state.num_failed) {
+            fprintf(stderr, "checkasm: %d of %d tests have failed\n",
+                    state.num_failed, state.num_checked);
+            ret = 1;
+        } else {
+            fprintf(stderr, "checkasm: all %d tests passed\n", state.num_checked);
 #ifdef readtime
-        if (state.bench_pattern) {
-            state.nop_time = measure_nop_time();
-            printf("nop: %d.%d\n", state.nop_time/10, state.nop_time%10);
-            print_benchs(state.funcs);
-        }
+            if (state.bench_pattern) {
+                state.nop_time = measure_nop_time();
+                printf("nop: %d.%d\n", state.nop_time/10, state.nop_time%10);
+                print_benchs(state.funcs);
+            }
 #endif
+        }
    }

    destroy_func_tree(state.funcs);
@ -592,6 +606,10 @@ void *checkasm_check_func(void *const func, const char *const name, ...) {
        return NULL;

    state.current_func = get_func(&state.funcs, name_buf);
+
+    if (state.function_listing) /* Save function names without running tests */
+        return NULL;
+
    state.funcs->color = 1;
    CheckasmFuncVersion *v = &state.current_func->versions;
    void *ref = func;
--- a/third_party/dav1d/tests/checkasm/filmgrain.c
+++ b/third_party/dav1d/tests/checkasm/filmgrain.c
@ -34,6 +34,12 @@
 #define UNIT_TEST 1
 #include "src/fg_apply_tmpl.c"

+static const char ss_name[][4] = { /* 3-character test-name suffix per layout, indexed by (Dav1dPixelLayout value - 1) */
+    [DAV1D_PIXEL_LAYOUT_I420 - 1] = "420",
+    [DAV1D_PIXEL_LAYOUT_I422 - 1] = "422",
+    [DAV1D_PIXEL_LAYOUT_I444 - 1] = "444",
+};
+
 static void check_gen_grny(const Dav1dFilmGrainDSPContext *const dsp) {
    entry grain_lut_c[GRAIN_HEIGHT][GRAIN_WIDTH];
    entry grain_lut_a[GRAIN_HEIGHT + 1][GRAIN_WIDTH];
@ -72,6 +78,64 @@ static void check_gen_grny(const Dav1dFilmGrainDSPContext *const dsp) {
    report("gen_grain_y");
 }

+static void check_gen_grnuv(const Dav1dFilmGrainDSPContext *const dsp) { /* Tests dsp->generate_grain_uv[]: call_ref vs. call_new output for each layout and ar_coeff_lag 0..3 */
+    entry grain_lut_y[GRAIN_HEIGHT + 1][GRAIN_WIDTH];
+    entry grain_lut_c[GRAIN_HEIGHT][GRAIN_WIDTH]; /* output buffer passed to call_ref */
+    entry grain_lut_a[GRAIN_HEIGHT + 1][GRAIN_WIDTH]; /* output buffer passed to call_new / bench_new */
+    /* Signature (void return, then parameters) of the functions exercised through call_ref/call_new/bench_new. */
+    declare_func(void, entry grain_lut[][GRAIN_WIDTH],
+                 const entry grain_lut_y[][GRAIN_WIDTH],
+                 const Dav1dFilmGrainData *data, intptr_t uv HIGHBD_DECL_SUFFIX);
+    /* layout_idx is the Dav1dPixelLayout value minus one, the same indexing ss_name[] uses. */
+    for (int layout_idx = 0; layout_idx < 3; layout_idx++) {
+        const enum Dav1dPixelLayout layout = layout_idx + 1;
+        const int ss_x = layout != DAV1D_PIXEL_LAYOUT_I444;
+        const int ss_y = layout == DAV1D_PIXEL_LAYOUT_I420;
+        /* i becomes fg_data.ar_coeff_lag and is part of the test name given to check_func. */
+        for (int i = 0; i < 4; i++) {
+            if (check_func(dsp->generate_grain_uv[layout_idx],
+                           "gen_grain_uv_ar%d_%dbpc_%s",
+                           i, BITDEPTH, ss_name[layout_idx]))
+            {
+                Dav1dFilmGrainData fg_data;
+                fg_data.seed = rnd() & 0xFFFF;
+                /* 16 bpc builds only: randomly pick 0x3ff or 0xfff as bitdepth_max. */
+#if BITDEPTH == 16
+                const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#endif
+                /* fg_data has no initializer, so only the fields assigned in this block hold defined values. */
+                fg_data.num_y_points = rnd() & 1;
+                fg_data.grain_scale_shift = rnd() & 3;
+                fg_data.ar_coeff_shift = (rnd() & 3) + 6;
+                fg_data.ar_coeff_lag = i;
+                const int num_y_pos = 2 * fg_data.ar_coeff_lag * (fg_data.ar_coeff_lag + 1);
+                for (int n = 0; n < num_y_pos; n++)
+                    fg_data.ar_coeffs_y[n] = (rnd() & 0xff) - 128; /* random value in [-128, 127] */
+                dsp->generate_grain_y(grain_lut_y, &fg_data HIGHBD_TAIL_SUFFIX); /* grain_lut_y is then passed as input to call_ref/call_new */
+                /* Fill row uv of ar_coeffs_uv: num_y_pos coefficients plus one more when num_y_points is nonzero; otherwise that extra slot is zeroed. */
+                const int uv = rnd() & 1;
+                const int num_uv_pos = num_y_pos + !!fg_data.num_y_points;
+                for (int n = 0; n < num_uv_pos; n++)
+                    fg_data.ar_coeffs_uv[uv][n] = (rnd() & 0xff) - 128;
+                if (!fg_data.num_y_points)
+                    fg_data.ar_coeffs_uv[uv][num_uv_pos] = 0;
+                memset(grain_lut_c, 0xff, sizeof(grain_lut_c)); /* both output buffers start with the same 0xff fill */
+                memset(grain_lut_a, 0xff, sizeof(grain_lut_a));
+                call_ref(grain_lut_c, grain_lut_y, &fg_data, uv HIGHBD_TAIL_SUFFIX);
+                call_new(grain_lut_a, grain_lut_y, &fg_data, uv HIGHBD_TAIL_SUFFIX);
+                int diff = 0, w = ss_x ? 44 : GRAIN_WIDTH; /* only the first w entries of each row are compared */
+                for (int y = 0; y < (ss_y ? 38 : GRAIN_HEIGHT); y++) /* 38 rows when ss_y is set, else GRAIN_HEIGHT */
+                    diff |= memcmp(grain_lut_a[y], grain_lut_c[y], w * sizeof(entry));
+                if (diff) fail();
+                /* bench_new receives the same arguments that call_new was given above. */
+                bench_new(grain_lut_a, grain_lut_y, &fg_data, uv HIGHBD_TAIL_SUFFIX);
+            }
+        }
+    }
+    /* report() is called once, after all layouts and lag values have been processed. */
+    report("gen_grain_uv");
+}
+
 static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
    ALIGN_STK_32(pixel, c_dst, 128 * 32,);
    ALIGN_STK_32(pixel, a_dst, 128 * 32,);
@ -157,11 +221,6 @@ static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
                 int is_identity HIGHBD_DECL_SUFFIX);

    for (int layout_idx = 0; layout_idx < 3; layout_idx++) {
-        const char ss_name[][4] = {
-            [DAV1D_PIXEL_LAYOUT_I420 - 1] = "420",
-            [DAV1D_PIXEL_LAYOUT_I422 - 1] = "422",
-            [DAV1D_PIXEL_LAYOUT_I444 - 1] = "444",
-        };
        const enum Dav1dPixelLayout layout = layout_idx + 1;
        const int ss_x = layout != DAV1D_PIXEL_LAYOUT_I444;
        const int ss_y = layout == DAV1D_PIXEL_LAYOUT_I420;
@ -264,6 +323,7 @@ void bitfn(checkasm_check_filmgrain)(void) {
    bitfn(dav1d_film_grain_dsp_init)(&c);

    check_gen_grny(&c);
+    check_gen_grnuv(&c);
    check_fgy_sbrow(&c);
    check_fguv_sbrow(&c);
 }
--- a/third_party/dav1d/tests/checkasm/ipred.c
+++ b/third_party/dav1d/tests/checkasm/ipred.c
@ -29,6 +29,8 @@
 #include "src/ipred.h"
 #include "src/levels.h"

+#include <stdio.h>
+
 static const char *const intra_pred_mode_names[N_IMPL_INTRA_PRED_MODES] = {
    [DC_PRED]       = "dc",
    [DC_128_PRED]   = "dc_128",
@ -83,11 +85,16 @@ static void check_intra_pred(Dav1dIntraPredDSPContext *const c) {
                {
                    const ptrdiff_t stride = w * sizeof(pixel);

-                    int a = 0;
-                    if (mode >= Z1_PRED && mode <= Z3_PRED) /* angle */
+                    int a = 0, maxw = 0, maxh = 0;
+                    if (mode >= Z1_PRED && mode <= Z3_PRED) { /* angle */
                        a = (90 * (mode - Z1_PRED) + z_angles[rnd() % 27]) |
                            (rnd() & 0x600);
-                    else if (mode == FILTER_PRED) /* filter_idx */
+                        if (mode == Z2_PRED) {
+                            maxw = rnd(), maxh = rnd();
+                            maxw = 1 + (maxw & (maxw & 4096 ? 4095 : w - 1));
+                            maxh = 1 + (maxh & (maxh & 4096 ? 4095 : h - 1));
+                        }
+                    } else if (mode == FILTER_PRED) /* filter_idx */
                        a = (rnd() % 5) | (rnd() & ~511);

 #if BITDEPTH == 16
@ -99,13 +106,23 @@ static void check_intra_pred(Dav1dIntraPredDSPContext *const c) {
                    for (int i = -h * 2; i <= w * 2; i++)
                        topleft[i] = rnd() & bitdepth_max;

-                    const int maxw = 1 + (rnd() % 128), maxh = 1 + (rnd() % 128);
                    call_ref(c_dst, stride, topleft, w, h, a, maxw, maxh
                             HIGHBD_TAIL_SUFFIX);
                    call_new(a_dst, stride, topleft, w, h, a, maxw, maxh
                             HIGHBD_TAIL_SUFFIX);
-                    checkasm_check_pixel(c_dst, stride, a_dst, stride,
-                                         w, h, "dst");
+                    if (checkasm_check_pixel(c_dst, stride, a_dst, stride,
+                                             w, h, "dst"))
+                    {
+                        if (mode == Z1_PRED || mode == Z3_PRED)
+                            fprintf(stderr, "angle = %d (0x%03x)\n",
+                                    a & 0x1ff, a & 0x600);
+                        else if (mode == Z2_PRED)
+                            fprintf(stderr, "angle = %d (0x%03x), "
+                                    "max_width = %d, max_height = %d\n",
+                                    a & 0x1ff, a & 0x600, maxw, maxh);
+                        else if (mode == FILTER_PRED)
+                            fprintf(stderr, "filter_idx = %d\n", a & 0x1ff);
+                    }

                    bench_new(a_dst, stride, topleft, w, h, a, 128, 128
                              HIGHBD_TAIL_SUFFIX);
--- a/third_party/dav1d/tools/dav1d_cli_parse.c
+++ b/third_party/dav1d/tools/dav1d_cli_parse.c
@ -192,7 +192,7 @@ static const EnumParseTable cpu_mask_tbl[] = {
 #if ARCH_AARCH64 || ARCH_ARM
    { "neon", DAV1D_ARM_CPU_FLAG_NEON },
 #elif ARCH_X86
-    { "sse2",   X86_CPU_MASK_SSE },
+    { "sse2",   X86_CPU_MASK_SSE2 },
    { "ssse3",  X86_CPU_MASK_SSSE3 },
    { "sse41",  X86_CPU_MASK_SSE41 },
    { "avx2",   X86_CPU_MASK_AVX2 },